Source code for ray.tune.callback

import glob
import warnings
from abc import ABCMeta
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

from ray.tune.utils.util import _atomic_save, _load_newest_checkpoint
from ray.util.annotations import DeveloperAPI, PublicAPI

if TYPE_CHECKING:
    from ray.train import Checkpoint
    from ray.tune.experiment import Trial
    from ray.tune.stopper import Stopper


class _CallbackMeta(ABCMeta):
    """A helper metaclass to ensure container classes (e.g. CallbackList) have
    implemented all the callback methods (e.g. `on_*`).
    """

    def __new__(mcs, name: str, bases: Tuple[type], attrs: Dict[str, Any]) -> type:
        cls = super().__new__(mcs, name, bases, attrs)

        if mcs.need_check(cls, name, bases, attrs):
            mcs.check(cls, name, bases, attrs)

        return cls

    @classmethod
    def need_check(
        mcs, cls: type, name: str, bases: Tuple[type], attrs: Dict[str, Any]
    ) -> bool:
        return attrs.get("IS_CALLBACK_CONTAINER", False)

    @classmethod
    def check(
        mcs, cls: type, name: str, bases: Tuple[type], attrs: Dict[str, Any]
    ) -> None:
        methods = set()
        for base in bases:
            methods.update(
                attr_name
                for attr_name, attr in vars(base).items()
                if mcs.need_override_by_subclass(attr_name, attr)
            )
        overridden = {
            attr_name
            for attr_name, attr in attrs.items()
            if mcs.need_override_by_subclass(attr_name, attr)
        }
        missing = methods.difference(overridden)
        if missing:
            raise TypeError(
                f"Found missing callback method: {missing} "
                f"in class {cls.__module__}.{cls.__qualname__}."
            )

    @classmethod
    def need_override_by_subclass(mcs, attr_name: str, attr: Any) -> bool:
        return (
            (
                attr_name.startswith("on_")
                and not attr_name.startswith("on_trainer_init")
            )
            or attr_name == "setup"
        ) and callable(attr)

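# A minimal sketch (illustration only, not part of this module) of the
# contract `_CallbackMeta` enforces: a class that sets
# `IS_CALLBACK_CONTAINER = True` must override `setup` and every inherited
# `on_*` hook, or class creation itself fails.
#
#     class IncompleteList(Callback):
#         IS_CALLBACK_CONTAINER = True
#         # No `on_*`/`setup` overrides, so `_CallbackMeta.check()` raises
#         # TypeError("Found missing callback method: ...") at class
#         # definition time, before any instance is created.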

@PublicAPI(stability="beta")
class Callback(metaclass=_CallbackMeta):
    """Tune base callback that can be extended and passed to a ``TrialRunner``.

    Tune callbacks are called from within the ``TrialRunner`` class. There are
    several hooks that can be used, all of which are found in the submethod
    definitions of this base class.

    The parameters passed to the ``**info`` dict vary between hooks. The
    parameters passed are described in the docstrings of the methods.

    This example will print a metric each time a result is received:

    .. testcode::

        from ray import train, tune
        from ray.tune import Callback


        class MyCallback(Callback):
            def on_trial_result(self, iteration, trials, trial, result, **info):
                print(f"Got result: {result['metric']}")


        def train_func(config):
            for i in range(10):
                tune.report(metric=i)


        tuner = tune.Tuner(
            train_func,
            run_config=train.RunConfig(
                callbacks=[MyCallback()]
            )
        )
        tuner.fit()

    .. testoutput::
        :hide:

        ...
    """

    # File templates for any artifacts written by this callback.
    # These files should live in the `trial.local_path` for each trial.
    # TODO(ml-team): Make this more visible to users to override. Internal use for now.
    _SAVED_FILE_TEMPLATES = []

    # arguments here match Experiment.public_spec
    def setup(
        self,
        stop: Optional["Stopper"] = None,
        num_samples: Optional[int] = None,
        total_num_samples: Optional[int] = None,
        **info,
    ):
        """Called once at the very beginning of training.

        Any Callback setup should be added here
        (setting environment variables, etc.)

        Arguments:
            stop: Stopping criteria.
                If ``time_budget_s`` was passed to ``train.RunConfig``,
                a ``TimeoutStopper`` will be passed here,
                either by itself or as a part of a ``CombinedStopper``.
            num_samples: Number of times to sample from the
                hyperparameter space. Defaults to 1. If `grid_search` is
                provided as an argument, the grid will be repeated
                `num_samples` times. If this is -1, (virtually) infinite
                samples are generated until a stopping condition is met.
            total_num_samples: Total number of samples factoring
                in grid search samplers.
            **info: Kwargs dict for forward compatibility.
        """
        pass

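    # A minimal sketch (illustration only, not part of this module) of a
    # subclass consuming the documented `setup` arguments while keeping
    # `**info` for forward compatibility:
    #
    #     class PrintSetupCallback(Callback):
    #         def setup(self, stop=None, num_samples=None,
    #                   total_num_samples=None, **info):
    #             print(f"Sampling {total_num_samples} configurations in total")
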
    def on_step_begin(self, iteration: int, trials: List["Trial"], **info):
        """Called at the start of each tuning loop step.

        Arguments:
            iteration: Number of iterations of the tuning loop.
            trials: List of trials.
            **info: Kwargs dict for forward compatibility.
        """
        pass

    def on_step_end(self, iteration: int, trials: List["Trial"], **info):
        """Called at the end of each tuning loop step.

        The iteration counter is increased before this hook is called.

        Arguments:
            iteration: Number of iterations of the tuning loop.
            trials: List of trials.
            **info: Kwargs dict for forward compatibility.
        """
        pass

    def on_trial_start(
        self, iteration: int, trials: List["Trial"], trial: "Trial", **info
    ):
        """Called after starting a trial instance.

        Arguments:
            iteration: Number of iterations of the tuning loop.
            trials: List of trials.
            trial: Trial that has just been started.
            **info: Kwargs dict for forward compatibility.
        """
        pass

    def on_trial_restore(
        self, iteration: int, trials: List["Trial"], trial: "Trial", **info
    ):
        """Called after restoring a trial instance.

        Arguments:
            iteration: Number of iterations of the tuning loop.
            trials: List of trials.
            trial: Trial that has just been restored.
            **info: Kwargs dict for forward compatibility.
        """
        pass

    def on_trial_save(
        self, iteration: int, trials: List["Trial"], trial: "Trial", **info
    ):
        """Called after receiving a checkpoint from a trial.

        Arguments:
            iteration: Number of iterations of the tuning loop.
            trials: List of trials.
            trial: Trial that just saved a checkpoint.
            **info: Kwargs dict for forward compatibility.
        """
        pass

    def on_trial_result(
        self,
        iteration: int,
        trials: List["Trial"],
        trial: "Trial",
        result: Dict,
        **info,
    ):
        """Called after receiving a result from a trial.

        The search algorithm and scheduler are notified before this
        hook is called.

        Arguments:
            iteration: Number of iterations of the tuning loop.
            trials: List of trials.
            trial: Trial that just sent a result.
            result: Result that the trial sent.
            **info: Kwargs dict for forward compatibility.
        """
        pass

    def on_trial_complete(
        self, iteration: int, trials: List["Trial"], trial: "Trial", **info
    ):
        """Called after a trial instance has completed.

        The search algorithm and scheduler are notified before this
        hook is called.

        Arguments:
            iteration: Number of iterations of the tuning loop.
            trials: List of trials.
            trial: Trial that has just completed.
            **info: Kwargs dict for forward compatibility.
        """
        pass

    def on_trial_recover(
        self, iteration: int, trials: List["Trial"], trial: "Trial", **info
    ):
        """Called after a trial instance failed (errored) but the trial is
        scheduled for retry.

        The search algorithm and scheduler are not notified.

        Arguments:
            iteration: Number of iterations of the tuning loop.
            trials: List of trials.
            trial: Trial that has just errored.
            **info: Kwargs dict for forward compatibility.
        """
        pass

    def on_trial_error(
        self, iteration: int, trials: List["Trial"], trial: "Trial", **info
    ):
        """Called after a trial instance failed (errored).

        The search algorithm and scheduler are notified before this
        hook is called.

        Arguments:
            iteration: Number of iterations of the tuning loop.
            trials: List of trials.
            trial: Trial that has just errored.
            **info: Kwargs dict for forward compatibility.
        """
        pass

    def on_checkpoint(
        self,
        iteration: int,
        trials: List["Trial"],
        trial: "Trial",
        checkpoint: "Checkpoint",
        **info,
    ):
        """Called after a trial saved a checkpoint with Tune.

        Arguments:
            iteration: Number of iterations of the tuning loop.
            trials: List of trials.
            trial: Trial that just saved a checkpoint.
            checkpoint: Checkpoint object that has been saved by the trial.
            **info: Kwargs dict for forward compatibility.
        """
        pass

    def on_experiment_end(self, trials: List["Trial"], **info):
        """Called after the experiment is over and all trials have concluded.

        Arguments:
            trials: List of trials.
            **info: Kwargs dict for forward compatibility.
        """
        pass

    def get_state(self) -> Optional[Dict]:
        """Get the state of the callback.

        This method should be implemented by subclasses to return a dictionary
        representation of the object's current state.

        This is called automatically by Tune to periodically checkpoint
        callback state. Upon :ref:`Tune experiment restoration
        <tune-experiment-level-fault-tolerance>`, callback state will be
        restored via :meth:`~ray.tune.Callback.set_state`.

        .. testcode::

            from typing import Dict, List, Optional

            from ray.tune import Callback
            from ray.tune.experiment import Trial


            class MyCallback(Callback):
                def __init__(self):
                    self._trial_ids = set()

                def on_trial_start(
                    self, iteration: int, trials: List["Trial"],
                    trial: "Trial", **info
                ):
                    self._trial_ids.add(trial.trial_id)

                def get_state(self) -> Optional[Dict]:
                    return {"trial_ids": self._trial_ids.copy()}

                def set_state(self, state: Dict):
                    self._trial_ids = state["trial_ids"]

        Returns:
            dict: State of the callback. Should be `None` if the callback
                does not have any state to save (this is the default).
        """
        return None

    def set_state(self, state: Dict):
        """Set the state of the callback.

        This method should be implemented by subclasses to restore the
        callback's state based on the given dict state.

        This is used automatically by Tune to restore checkpoint callback
        state on :ref:`Tune experiment restoration
        <tune-experiment-level-fault-tolerance>`.

        See :meth:`~ray.tune.Callback.get_state` for an example implementation.

        Args:
            state: State of the callback.
        """
        pass


@DeveloperAPI
class CallbackList(Callback):
    """Call multiple callbacks at once."""

    IS_CALLBACK_CONTAINER = True
    CKPT_FILE_TMPL = "callback-states-{}.pkl"

    def __init__(self, callbacks: List[Callback]):
        self._callbacks = callbacks

    def setup(self, **info):
        for callback in self._callbacks:
            try:
                callback.setup(**info)
            except TypeError as e:
                if "argument" in str(e):
                    warnings.warn(
                        "Please update `setup` method in callback "
                        f"`{callback.__class__}` to match the method signature"
                        " in `ray.tune.callback.Callback`.",
                        FutureWarning,
                    )
                    callback.setup()
                else:
                    raise e

    def on_step_begin(self, **info):
        for callback in self._callbacks:
            callback.on_step_begin(**info)

    def on_step_end(self, **info):
        for callback in self._callbacks:
            callback.on_step_end(**info)

    def on_trial_start(self, **info):
        for callback in self._callbacks:
            callback.on_trial_start(**info)

    def on_trial_restore(self, **info):
        for callback in self._callbacks:
            callback.on_trial_restore(**info)

    def on_trial_save(self, **info):
        for callback in self._callbacks:
            callback.on_trial_save(**info)

    def on_trial_result(self, **info):
        for callback in self._callbacks:
            callback.on_trial_result(**info)

    def on_trial_complete(self, **info):
        for callback in self._callbacks:
            callback.on_trial_complete(**info)

    def on_trial_recover(self, **info):
        for callback in self._callbacks:
            callback.on_trial_recover(**info)

    def on_trial_error(self, **info):
        for callback in self._callbacks:
            callback.on_trial_error(**info)

    def on_checkpoint(self, **info):
        for callback in self._callbacks:
            callback.on_checkpoint(**info)

    def on_experiment_end(self, **info):
        for callback in self._callbacks:
            callback.on_experiment_end(**info)

    def get_state(self) -> Optional[Dict]:
        """Gets the state of all callbacks contained within this list.

        If there are no stateful callbacks, then None will be returned in
        order to avoid saving an unnecessary callback checkpoint file."""
        state = {}
        any_stateful_callbacks = False
        for i, callback in enumerate(self._callbacks):
            callback_state = callback.get_state()
            if callback_state:
                any_stateful_callbacks = True
            state[i] = callback_state
        if not any_stateful_callbacks:
            return None
        return state

    def set_state(self, state: Dict):
        """Sets the state for all callbacks contained within this list.

        Skips setting state for all stateless callbacks where `get_state`
        returned None."""
        for i, callback in enumerate(self._callbacks):
            callback_state = state.get(i, None)
            if callback_state:
                callback.set_state(callback_state)

    def save_to_dir(self, checkpoint_dir: str, session_str: str = "default"):
        """Save the state of the callback list to the checkpoint_dir.

        Args:
            checkpoint_dir: directory where the checkpoint is stored.
            session_str: Unique identifier of the current run session
                (ex: timestamp).
        """
        state_dict = self.get_state()
        if state_dict:
            file_name = self.CKPT_FILE_TMPL.format(session_str)
            tmp_file_name = f".tmp-{file_name}"
            _atomic_save(
                state=state_dict,
                checkpoint_dir=checkpoint_dir,
                file_name=file_name,
                tmp_file_name=tmp_file_name,
            )

    def restore_from_dir(self, checkpoint_dir: str):
        """Restore the state of the list of callbacks from the checkpoint_dir.

        You should check if it's possible to restore with `can_restore`
        before calling this method.

        Args:
            checkpoint_dir: directory where the checkpoint is stored.

        Raises:
            RuntimeError: if unable to find checkpoint.
            NotImplementedError: if the `set_state` method is not implemented.
        """
        state_dict = _load_newest_checkpoint(
            checkpoint_dir, self.CKPT_FILE_TMPL.format("*")
        )
        if not state_dict:
            raise RuntimeError(
                "Unable to find checkpoint in {}.".format(checkpoint_dir)
            )
        self.set_state(state_dict)

    def can_restore(self, checkpoint_dir: str) -> bool:
        """Check if the checkpoint_dir contains the saved state for this
        callback list.

        Returns:
            can_restore: True if the checkpoint_dir contains a file of the
                format `CKPT_FILE_TMPL`. False otherwise.
        """
        return any(
            glob.iglob(
                Path(checkpoint_dir, self.CKPT_FILE_TMPL.format("*")).as_posix()
            )
        )

    def __len__(self) -> int:
        return len(self._callbacks)

    def __getitem__(self, i: int) -> "Callback":
        return self._callbacks[i]