Source code for ray.tune.logger.aim

import logging
from typing import TYPE_CHECKING, Dict, List, Optional, Union

import numpy as np

from ray.air.constants import TRAINING_ITERATION
from ray.tune.logger.logger import LoggerCallback
from ray.tune.result import TIME_TOTAL_S, TIMESTEPS_TOTAL
from ray.tune.utils import flatten_dict
from ray.util.annotations import PublicAPI

if TYPE_CHECKING:
    from ray.tune.experiment.trial import Trial

try:
    from aim.sdk import Repo, Run
except ImportError:
    Repo, Run = None, None

logger = logging.getLogger(__name__)

VALID_SUMMARY_TYPES = [int, float, np.float32, np.float64, np.int32, np.int64]


@PublicAPI
class AimLoggerCallback(LoggerCallback):
    """Aim Logger: logs metrics in Aim format.

    Aim is an open-source, self-hosted ML experiment tracking tool.
    It's good at tracking lots (thousands) of training runs, and it allows
    you to compare them with a performant and well-designed UI.

    Source: https://github.com/aimhubio/aim

    Args:
        repo: Aim repository directory or a `Repo` object that the Run object
            will log results to. If not provided, a default repo will be set up
            in the experiment directory (one level above trial directories).
        experiment_name: Sets the `experiment` property of each Run object,
            which is the experiment name associated with it. Can be used later
            to query runs/sequences.
            If not provided, the default will be the Tune experiment name set
            by `RunConfig(name=...)`.
        metrics: List of metric names (out of the metrics reported by Tune) to
            track in Aim. If no metrics are specified, log everything that
            is reported.
        aim_run_kwargs: Additional arguments that will be passed when creating the
            individual `Run` objects for each trial. For the full list of arguments,
            please see the Aim documentation:
            https://aimstack.readthedocs.io/en/latest/refs/sdk.html
    """

    VALID_HPARAMS = (str, bool, int, float, list, type(None))
    VALID_NP_HPARAMS = (np.bool_, np.float32, np.float64, np.int32, np.int64)
    def __init__(
        self,
        repo: Optional[Union[str, "Repo"]] = None,
        experiment_name: Optional[str] = None,
        metrics: Optional[List[str]] = None,
        **aim_run_kwargs,
    ):
        """
        See help(AimLoggerCallback) for more information about parameters.
        """
        assert Run is not None, (
            "aim must be installed! You can install aim with"
            " the command: `pip install aim`."
        )
        self._repo_path = repo
        self._experiment_name = experiment_name
        if not (bool(metrics) or metrics is None):
            raise ValueError(
                "`metrics` must either contain at least one metric name, or be None, "
                "in which case all reported metrics will be logged to the aim repo."
            )
        self._metrics = metrics
        self._aim_run_kwargs = aim_run_kwargs
        self._trial_to_run: Dict["Trial", Run] = {}
    def _create_run(self, trial: "Trial") -> Run:
        """Initializes an Aim Run object for a given trial.

        Args:
            trial: The Tune trial that aim will track as a Run.

        Returns:
            Run: The created aim run for a specific trial.
        """
        experiment_dir = trial.local_experiment_path
        run = Run(
            repo=self._repo_path or experiment_dir,
            experiment=self._experiment_name or trial.experiment_dir_name,
            **self._aim_run_kwargs,
        )
        # Attach a few useful trial properties
        run["trial_id"] = trial.trial_id
        run["trial_log_dir"] = trial.path

        trial_ip = trial.get_ray_actor_ip()
        if trial_ip:
            run["trial_ip"] = trial_ip

        return run

    def log_trial_start(self, trial: "Trial"):
        if trial in self._trial_to_run:
            # Cleanup an existing run if the trial has been restarted
            self._trial_to_run[trial].close()

        trial.init_local_path()
        self._trial_to_run[trial] = self._create_run(trial)

        if trial.evaluated_params:
            self._log_trial_hparams(trial)

    def log_trial_result(self, iteration: int, trial: "Trial", result: Dict):
        tmp_result = result.copy()

        step = result.get(TIMESTEPS_TOTAL, None) or result[TRAINING_ITERATION]
        for k in ["config", "pid", "timestamp", TIME_TOTAL_S, TRAINING_ITERATION]:
            tmp_result.pop(k, None)  # not useful to log these

        # `context` and `epoch` are special keys that users can report,
        # which are treated as special aim metrics/configurations.
        context = tmp_result.pop("context", None)
        epoch = tmp_result.pop("epoch", None)

        trial_run = self._trial_to_run[trial]
        path = ["ray", "tune"]

        flat_result = flatten_dict(tmp_result, delimiter="/")
        valid_result = {}

        for attr, value in flat_result.items():
            if self._metrics and attr not in self._metrics:
                continue

            full_attr = "/".join(path + [attr])
            if isinstance(value, tuple(VALID_SUMMARY_TYPES)) and not (
                np.isnan(value) or np.isinf(value)
            ):
                valid_result[attr] = value
                trial_run.track(
                    value=value,
                    name=full_attr,
                    epoch=epoch,
                    step=step,
                    context=context,
                )
            elif (isinstance(value, (list, tuple, set)) and len(value) > 0) or (
                isinstance(value, np.ndarray) and value.size > 0
            ):
                valid_result[attr] = value

    def log_trial_end(self, trial: "Trial", failed: bool = False):
        trial_run = self._trial_to_run.pop(trial)
        trial_run.close()

    def _log_trial_hparams(self, trial: "Trial"):
        params = flatten_dict(trial.evaluated_params, delimiter="/")
        flat_params = flatten_dict(params)

        scrubbed_params = {
            k: v for k, v in flat_params.items() if isinstance(v, self.VALID_HPARAMS)
        }

        np_params = {
            k: v.tolist()
            for k, v in flat_params.items()
            if isinstance(v, self.VALID_NP_HPARAMS)
        }

        scrubbed_params.update(np_params)

        removed = {
            k: v
            for k, v in flat_params.items()
            if not isinstance(v, self.VALID_HPARAMS + self.VALID_NP_HPARAMS)
        }
        if removed:
            logger.info(
                "Removed the following hyperparameter values when "
                "logging to aim: %s",
                str(removed),
            )

        run = self._trial_to_run[trial]
        run["hparams"] = scrubbed_params
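
The callback is intended to be passed to Tune via the `callbacks` field of the run configuration. Below is a minimal usage sketch, not part of the module above: the trainable, the `lr` search space, the `loss` metric name, and the `aim_example` run name are illustrative only, and depending on your Ray version `RunConfig` may need to be imported from `ray.air` or `ray.train` instead of `ray.tune`.

    from ray import tune
    from ray.tune.logger.aim import AimLoggerCallback

    def objective(config):
        # A trivial function trainable that reports one metric when it returns.
        return {"loss": (config["lr"] - 0.1) ** 2}

    tuner = tune.Tuner(
        objective,
        param_space={"lr": tune.uniform(0.001, 0.2)},
        run_config=tune.RunConfig(
            name="aim_example",
            # Track only the "loss" metric; omit `metrics` to log everything.
            callbacks=[AimLoggerCallback(metrics=["loss"])],
        ),
    )
    tuner.fit()

With no `repo` argument, the Aim repository is created in the experiment directory, so the results can then be browsed by pointing `aim up` at that directory.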