Using RLlib with Tune

Example

Example of using a Tune scheduler (Population Based Training) with RLlib.

This example sets num_workers=4, num_cpus=1, and num_gpus=0, so each PPO trial uses 5 CPUs: 1 for training plus 4 for sample collection. The example runs 2 trials, so at least 10 CPUs must be available in the cluster to run both trials concurrently. Otherwise, the PBT scheduler round-robins between the trials, which is less efficient.

If you want to run this example with GPUs, you can set num_gpus accordingly.
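Before launching, you can also verify that the cluster has enough CPUs for both trials. A minimal sketch of such a check (separate from the full example that follows):

import ray

ray.init()
# Each trial needs 1 CPU for training plus 4 for its rollout workers.
cpus_needed = 2 * (1 + 4)
cpus_available = ray.cluster_resources().get("CPU", 0)
print(f"CPUs available: {cpus_available}, needed for 2 concurrent trials: {cpus_needed}")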

import random

import ray
from ray import train, tune
from ray.tune.schedulers import PopulationBasedTraining

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke-test", action="store_true", help="Finish quickly for testing"
    )
    args, _ = parser.parse_known_args()

    # Postprocess the perturbed config to ensure it's still valid
    def explore(config):
        # ensure we collect enough timesteps to do sgd
        if config["train_batch_size"] < config["sgd_minibatch_size"] * 2:
            config["train_batch_size"] = config["sgd_minibatch_size"] * 2
        # ensure we run at least one sgd iter
        if config["num_sgd_iter"] < 1:
            config["num_sgd_iter"] = 1
        return config

    hyperparam_mutations = {
        "lambda": lambda: random.uniform(0.9, 1.0),
        "clip_param": lambda: random.uniform(0.01, 0.5),
        "lr": [1e-3, 5e-4, 1e-4, 5e-5, 1e-5],
        "num_sgd_iter": lambda: random.randint(1, 30),
        "sgd_minibatch_size": lambda: random.randint(128, 16384),
        "train_batch_size": lambda: random.randint(2000, 160000),
    }

    pbt = PopulationBasedTraining(
        time_attr="time_total_s",
        perturbation_interval=120,
        resample_probability=0.25,
        # Specifies the mutations of these hyperparams
        hyperparam_mutations=hyperparam_mutations,
        custom_explore_fn=explore,
    )

    # Stop when we've either reached 100 training iterations or reward=300
    stopping_criteria = {"training_iteration": 100, "episode_reward_mean": 300}

    tuner = tune.Tuner(
        "PPO",
        tune_config=tune.TuneConfig(
            metric="episode_reward_mean",
            mode="max",
            scheduler=pbt,
            num_samples=1 if args.smoke_test else 2,
        ),
        param_space={
            "env": "Humanoid-v2",
            "kl_coeff": 1.0,
            "num_workers": 4,
            "num_cpus": 1,  # number of CPUs to use per trial
            "num_gpus": 0,  # number of GPUs to use per trial
            "model": {"free_log_std": True},
            # These params are tuned from a fixed starting value.
            "lambda": 0.95,
            "clip_param": 0.2,
            "lr": 1e-4,
            # These params start off randomly drawn from a set.
            "num_sgd_iter": tune.choice([10, 20, 30]),
            "sgd_minibatch_size": tune.choice([128, 512, 2048]),
            "train_batch_size": tune.choice([10000, 20000, 40000]),
        },
        run_config=train.RunConfig(stop=stopping_criteria),
    )
    results = tuner.fit()
import pprint

best_result = results.get_best_result()

print("Best performing trial's final set of hyperparameters:\n")
pprint.pprint(
    {k: v for k, v in best_result.config.items() if k in hyperparam_mutations}
)

print("\nBest performing trial's final reported metrics:\n")

metrics_to_print = [
    "episode_reward_mean",
    "episode_reward_max",
    "episode_reward_min",
    "episode_len_mean",
]
pprint.pprint({k: v for k, v in best_result.metrics.items() if k in metrics_to_print})
Best performing trial's final set of hyperparameters:

{'clip_param': 0.2,
 'lambda': 0.95,
 'lr': 0.0001,
 'num_sgd_iter': 30,
 'sgd_minibatch_size': 2048,
 'train_batch_size': 20000}

Best performing trial's final reported metrics:

{'episode_len_mean': 61.09146341463415,
 'episode_reward_max': 567.4424113245353,
 'episode_reward_mean': 310.36948184391935,
 'episode_reward_min': 87.74736189944105}
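The best result also references a checkpoint of the trained algorithm, which can be restored with Algorithm.from_checkpoint: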
from ray.rllib.algorithms.algorithm import Algorithm

loaded_ppo = Algorithm.from_checkpoint(best_result.checkpoint)
loaded_policy = loaded_ppo.get_policy()

# See your trained policy in action
# loaded_policy.compute_single_action(...)
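For example, a minimal sketch of querying the restored policy, using a zero-filled dummy observation in place of a real one from the Humanoid-v2 environment:

import numpy as np

# Dummy observation with the policy's expected shape; in practice this would
# come from the environment the policy was trained on ("Humanoid-v2").
dummy_obs = np.zeros(loaded_policy.observation_space.shape, dtype=np.float32)
# Policy.compute_single_action returns (action, rnn_state_out, extra_fetches).
action, _, _ = loaded_policy.compute_single_action(dummy_obs)
print(action)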

More RLlib Examples

  • PB2 PPO Example: Example of optimizing a distributed RLlib algorithm (PPO) with the PB2 scheduler. Uses a small population size of 4, so it can train on a laptop. A rough sketch of swapping PBT for PB2 follows below.
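As a rough sketch (assuming the same PPO setup as in the example above, and not the linked example itself), switching from PBT to PB2 only changes the scheduler definition; PB2 takes continuous hyperparameter bounds instead of per-parameter mutation functions:

from ray.tune.schedulers.pb2 import PB2

pb2 = PB2(
    time_attr="time_total_s",
    perturbation_interval=120,
    # Continuous [min, max] bounds; the values here are illustrative, not tuned.
    hyperparam_bounds={
        "lr": [1e-5, 1e-3],
        "clip_param": [0.1, 0.5],
    },
)
# Pass scheduler=pb2 to tune.TuneConfig in place of the PBT scheduler above.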