Adding Application-Level Metrics
Ray provides a convenient API in ray.util.metrics for defining and exporting custom metrics that give you visibility into your applications. Three metric types are supported: Counter, Gauge, and Histogram. They correspond to the Prometheus metric types of the same names. Below is a simple example of an actor that exports metrics using these APIs:
import time
import ray
from ray.util.metrics import Counter, Gauge, Histogram
# Start Ray with the metrics export port set to 8080; the example below
# points the reader at http://localhost:8080 to view the exported metrics.
ray.init(_metrics_export_port=8080)
@ray.remote
class MyActor:
    """Example actor that exports a Counter, a Gauge, and a Histogram.

    Each metric declares an ``actor_name`` tag key and has that tag set by
    default to the name passed to the constructor.
    """

    def __init__(self, name):
        """Create the three metrics and tag each one with ``name``.

        Args:
            name: Value used for the ``actor_name`` tag on every metric.
        """
        # Running total that the gauge reports; it can go up or down.
        self._curr_count = 0

        self.counter = Counter(
            "num_requests",
            description="Number of requests processed by the actor.",
            tag_keys=("actor_name",),
        )
        self.counter.set_default_tags({"actor_name": name})

        self.gauge = Gauge(
            "curr_count",
            description="Current count held by the actor. Goes up and down.",
            tag_keys=("actor_name",),
        )
        self.gauge.set_default_tags({"actor_name": name})

        self.histogram = Histogram(
            "request_latency",
            description="Latencies of requests in ms.",
            boundaries=[0.1, 1],
            tag_keys=("actor_name",),
        )
        self.histogram.set_default_tags({"actor_name": name})

    def process_request(self, num):
        """Add ``num`` to the current count and record metrics for the call.

        Args:
            num: Amount to add to the current count (may be negative).

        Returns:
            The current count after ``num`` has been added.
        """
        # Use a monotonic clock for the elapsed-time measurement: wall-clock
        # time (time.time()) can jump when the system clock is adjusted,
        # which would record a wrong or even negative latency.
        start = time.perf_counter()
        self._curr_count += num
        # Increment the total request count.
        self.counter.inc()
        # Update the gauge to the new value.
        self.gauge.set(self._curr_count)
        # Record the latency for this request in ms (seconds * 1000).
        self.histogram.observe(1000 * (time.perf_counter() - start))
        return self._curr_count
print("Starting actor.")
my_actor = MyActor.remote("my_actor")

# Send two requests to the actor; the values returned by the calls are not used.
for delta in (-10, 5):
    print("Calling actor.")
    my_actor.process_request.remote(delta)

print("Metrics should be exported.")
print("See http://localhost:8080 (this may take a few seconds to load).")

# Keep the script alive so the metrics can be inspected before it exits.
time.sleep(30)
print("Exiting!")
While the script is running, the metrics are exported to localhost:8080
(this is the endpoint that Prometheus would be configured to scrape).
Open this address in a browser. You should see output similar to the following (the latency values vary from run to run):
# HELP ray_request_latency Latencies of requests in ms.
# TYPE ray_request_latency histogram
ray_request_latency_bucket{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor",le="0.1"} 2.0
ray_request_latency_bucket{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor",le="1.0"} 2.0
ray_request_latency_bucket{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor",le="+Inf"} 2.0
ray_request_latency_count{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor"} 2.0
ray_request_latency_sum{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor"} 0.11992454528808594
# HELP ray_curr_count Current count held by the actor. Goes up and down.
# TYPE ray_curr_count gauge
ray_curr_count{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor"} -5.0
# HELP ray_num_requests_total Number of requests processed by the actor.
# TYPE ray_num_requests_total counter
ray_num_requests_total{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor"} 2.0
Please see ray.util.metrics for more details.