Adding Application-Level Metrics

Ray provides a convenient API in ray.util.metrics for defining and exporting custom metrics that give visibility into your applications. Three metric types are supported: Counter, Gauge, and Histogram, which correspond to the Prometheus metric types of the same names. Below is a simple example of an actor that exports metrics using these APIs:

import time

import ray
from ray.util.metrics import Counter, Gauge, Histogram

# Export metrics on port 8080; this is the port Prometheus will scrape.
ray.init(_metrics_export_port=8080)


@ray.remote
class MyActor:
    def __init__(self, name):
        self._curr_count = 0

        # Counter: a cumulative metric whose value only increases.
        self.counter = Counter(
            "num_requests",
            description="Number of requests processed by the actor.",
            tag_keys=("actor_name",),
        )
        self.counter.set_default_tags({"actor_name": name})

        # Gauge: a point-in-time value that can go up or down.
        self.gauge = Gauge(
            "curr_count",
            description="Current count held by the actor. Goes up and down.",
            tag_keys=("actor_name",),
        )
        self.gauge.set_default_tags({"actor_name": name})

        # Histogram: observations are bucketed by the given boundaries.
        self.histogram = Histogram(
            "request_latency",
            description="Latencies of requests in ms.",
            boundaries=[0.1, 1],
            tag_keys=("actor_name",),
        )
        self.histogram.set_default_tags({"actor_name": name})

    def process_request(self, num):
        start = time.time()
        self._curr_count += num

        # Increment the total request count.
        self.counter.inc()
        # Update the gauge to the new value.
        self.gauge.set(self._curr_count)
        # Record the latency for this request in ms.
        self.histogram.observe(1000 * (time.time() - start))

        return self._curr_count


print("Starting actor.")
my_actor = MyActor.remote("my_actor")
print("Calling actor.")
my_actor.process_request.remote(-10)
print("Calling actor.")
my_actor.process_request.remote(5)
print("Metrics should be exported.")
print("See http://localhost:8080 (this may take a few seconds to load).")

# Sleep so we can look at the metrics before exiting.
time.sleep(30)
print("Exiting!")

While the script is running, the metrics are exported to localhost:8080 (this is the endpoint that Prometheus would be configured to scrape; a sample scrape config follows the output below). Open this address in your browser. You should see the following output:

# HELP ray_request_latency Latencies of requests in ms.
# TYPE ray_request_latency histogram
ray_request_latency_bucket{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor",le="0.1"} 2.0
ray_request_latency_bucket{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor",le="1.0"} 2.0
ray_request_latency_bucket{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor",le="+Inf"} 2.0
ray_request_latency_count{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor"} 2.0
ray_request_latency_sum{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor"} 0.11992454528808594
# HELP ray_curr_count Current count held by the actor. Goes up and down.
# TYPE ray_curr_count gauge
ray_curr_count{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor"} -15.0
# HELP ray_num_requests_total Number of requests processed by the actor.
# TYPE ray_num_requests_total counter
ray_num_requests_total{Component="core_worker",Version="3.0.0.dev0",actor_name="my_actor"} 2.0
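Note that the exported names are prefixed with ray_, and the counter is exported with the standard Prometheus _total suffix.

To have Prometheus collect these metrics, add a scrape job that targets the export endpoint. Below is a minimal prometheus.yml sketch (the job name is arbitrary; the target must match the _metrics_export_port passed to ray.init above):

scrape_configs:
  - job_name: ray
    static_configs:
      - targets: ["localhost:8080"]

Start Prometheus with --config.file pointing at this file, and the ray_* metrics will appear in its expression browser.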

Tags can also be supplied per call to override the defaults set with set_default_tags. A minimal sketch, showing how the example above could attribute an increment to a different (hypothetical) actor_name:
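# Inside MyActor.process_request, tags passed at call time override the
# default tag; "other_actor" here is purely illustrative.
self.counter.inc(1.0, tags={"actor_name": "other_actor"})

See ray.util.metrics for more details.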