Cluster YAML Configuration Options#

The cluster configuration is defined within a YAML file that will be used by the Cluster Launcher to launch the head node, and by the Autoscaler to launch worker nodes. Once the cluster configuration is defined, you will need to use the Ray CLI to perform any operations such as starting and stopping the cluster.

Syntax#

cluster_name: str
max_workers: int
upscaling_speed: float
idle_timeout_minutes: int
docker:
    docker
provider:
    provider
auth:
    auth
available_node_types:
    node_types
head_node_type: str
file_mounts:
    file_mounts
cluster_synced_files:
    - str
rsync_exclude:
    - str
rsync_filter:
    - str
initialization_commands:
    - str
setup_commands:
    - str
head_setup_commands:
    - str
worker_setup_commands:
    - str
head_start_ray_commands:
    - str
worker_start_ray_commands:
    - str

Examples#

Minimal configuration#

AWS

# An unique identifier for the head node and workers of this cluster.
cluster_name: aws-example-minimal

# Cloud-provider specific configuration.
provider:
    type: aws
    region: us-west-2

# The maximum number of workers nodes to launch in addition to the head
# node.
max_workers: 3

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
        resources: {}
        # Provider-specific config for this node type, e.g., instance type. By default
        # Ray auto-configures unspecified fields such as SubnetId and KeyName.
        # For more documentation on available fields, see
        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
        node_config:
            InstanceType: m5.large
    ray.worker.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 3
        # The maximum number of worker nodes of this type to launch.
        # This parameter takes precedence over min_workers.
        max_workers: 3
        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
        resources: {}
        # Provider-specific config for this node type, e.g., instance type. By default
        # Ray auto-configures unspecified fields such as SubnetId and KeyName.
        # For more documentation on available fields, see
        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
        node_config:
            InstanceType: m5.large

Azure

# An unique identifier for the head node and workers of this cluster.
cluster_name: minimal

# The maximum number of workers nodes to launch in addition to the head
# node. min_workers default to 0.
max_workers: 1

# Cloud-provider specific configuration.
provider:
    type: azure
    location: westus2
    resource_group: ray-cluster

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ubuntu
    # you must specify paths to matching private and public key pair files
    # use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
    ssh_private_key: ~/.ssh/id_rsa
    # changes to this should match what is specified in file_mounts
    ssh_public_key: ~/.ssh/id_rsa.pub

GCP

auth:
  ssh_user: ubuntu
cluster_name: minimal
provider:
  availability_zone: us-west1-a
  project_id: null # TODO: set your GCP project ID here
  region: us-west1
  type: gcp

vSphere

# An unique identifier for the head node and workers of this cluster.
cluster_name: default

# The maximum number of workers nodes to launch in addition to the head
# node.
max_workers: 5

# Cloud-provider specific configuration.
provider:
    type: vsphere

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ray
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
    ssh_private_key: ~/ray-bootstrap-key.pem

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # You can override the resources here. Adding GPU to the head node is not recommended.
        # resources: { "CPU": 2, "Memory": 4096}
        resources: {}
    ray.worker.default:
        # The minimum number of nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 1
        max_workers: 3
        # You can override the resources here. For GPU, currently only NVIDIA GPU is supported. If no ESXi host can
        # fulfill the requirement, the Ray node creation will fail. The number of created nodes may not meet the desired
        # minimum number. The vSphere node provider will not distinguish the GPU type. It will just count the quantity:
        # mount the first k random available NVIDIA GPU to the VM, if the user set {"GPU": k}.
        # resources: {"CPU": 2, "Memory": 4096, "GPU": 1}
        resources: {}

# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default

Full configuration#

AWS

# An unique identifier for the head node and workers of this cluster.
cluster_name: default

# The maximum number of workers nodes to launch in addition to the head
# node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0

# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
    image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
    # image: rayproject/ray:latest-cpu   # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_container"
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
    pull_before_run: True
    run_options:   # Extra options to pass into "docker run"
        - --ulimit nofile=65536:65536

    # Example of running a GPU head with CPU workers
    # head_image: "rayproject/ray-ml:latest-gpu"
    # Allow Ray to automatically detect GPUs

    # worker_image: "rayproject/ray-ml:latest-cpu"
    # worker_run_options: []

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Cloud-provider specific configuration.
provider:
    type: aws
    region: us-west-2
    # Availability zone(s), comma-separated, that nodes may be launched in.
    # Nodes will be launched in the first listed availability zone and will
    # be tried in the subsequent availability zones if launching fails.
    availability_zone: us-west-2a,us-west-2b
    # Whether to allow node reuse. If set to False, nodes will be terminated
    # instead of stopped.
    cache_stopped_nodes: True # If not present, the default is True.

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
#    ssh_private_key: /path/to/your/key.pem

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
        resources: {}
        # Provider-specific config for this node type, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
        # For more documentation on available fields, see:
        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
        node_config:
            InstanceType: m5.large
            # Default AMI for us-west-2.
            # Check https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/aws/config.py
            # for default images for other zones.
            ImageId: ami-0387d929287ab193e
            # You can provision additional disk space with a conf as follows
            BlockDeviceMappings:
                - DeviceName: /dev/sda1
                  Ebs:
                      VolumeSize: 140
                      VolumeType: gp3
            # Additional options in the boto docs.
    ray.worker.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 1
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 2
        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
        resources: {}
        # Provider-specific config for this node type, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
        # For more documentation on available fields, see:
        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
        node_config:
            InstanceType: m5.large
            # Default AMI for us-west-2.
            # Check https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/aws/config.py
            # for default images for other zones.
            ImageId: ami-0387d929287ab193e
            # Run workers on spot by default. Comment this out to use on-demand.
            # NOTE: If relying on spot instances, it is best to specify multiple different instance
            # types to avoid interruption when one instance type is experiencing heightened demand.
            # Demand information can be found at https://aws.amazon.com/ec2/spot/instance-advisor/
            InstanceMarketOptions:
                MarketType: spot
                # Additional options can be found in the boto docs, e.g.
                #   SpotOptions:
                #       MaxPrice: MAX_HOURLY_PRICE
            # Additional options in the boto docs.

# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
#    "/path1/on/remote/machine": "/path1/on/local/machine",
#    "/path2/on/remote/machine": "/path2/on/local/machine",
}

# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []

# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False

# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
    - "**/.git"
    - "**/.git/**"

# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
    - ".gitignore"

# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []

# List of shell commands to run to set up nodes.
setup_commands: []
    # Note: if you're developing Ray, you probably want to create a Docker image that
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
    # To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image
    # that has the "nightly" (e.g. "rayproject/ray-ml:nightly-gpu") or uncomment the following line:
    # - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"

# Custom commands that will be run on the head node after common setup.
head_setup_commands: []

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    - ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076

Azure

# An unique identifier for the head node and workers of this cluster.
cluster_name: default

# The maximum number of workers nodes to launch in addition to the head
# node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0

# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty object means disabled.
docker:
    image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
    # image: rayproject/ray:latest-gpu   # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_container"
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
    pull_before_run: True
    run_options: # Extra options to pass into "docker run"
        - --ulimit nofile=65536:65536

    # Example of running a GPU head with CPU workers
    # head_image: "rayproject/ray-ml:latest-gpu"
    # Allow Ray to automatically detect GPUs

    # worker_image: "rayproject/ray-ml:latest-cpu"
    # worker_run_options: []

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Cloud-provider specific configuration.
provider:
    type: azure
    # https://azure.microsoft.com/en-us/global-infrastructure/locations
    location: westus2
    resource_group: ray-cluster
    # set subscription id otherwise the default from az cli will be used
    # subscription_id: 00000000-0000-0000-0000-000000000000
    # set unique subnet mask or a random mask will be used
    # subnet_mask: 10.0.0.0/16
    # set unique id for resources in this cluster
    # if not set a default id will be generated based on the resource group and cluster name
    # unique_id: RAY1
    # set managed identity name and resource group
    # if not set, a default user-assigned identity will be generated in the resource group specified above
    # msi_name: ray-cluster-msi
    # msi_resource_group: other-rg
    # Set provisioning and use of public/private IPs for head and worker nodes. If both options below are true,
    # only the head node will have a public IP address provisioned.
    # use_internal_ips: True
    # use_external_head_ip: True

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ubuntu
    # you must specify paths to matching private and public key pair files
    # use `ssh-keygen -t rsa -b 4096` to generate a new ssh key pair
    ssh_private_key: ~/.ssh/id_rsa
    # changes to this should match what is specified in file_mounts
    ssh_public_key: ~/.ssh/id_rsa.pub

# More specific customization to node configurations can be made using the ARM template azure-vm-template.json file
# See documentation here: https://docs.microsoft.com/en-us/azure/templates/microsoft.compute/2019-03-01/virtualmachines
# Changes to the local file will be used during deployment of the head node, however worker nodes deployment occurs
# on the head node, so changes to the template must be included in the wheel file used in setup_commands section below

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config, e.g. instance type.
        node_config:
            azure_arm_parameters:
                vmSize: Standard_D2s_v3
                # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
                imagePublisher: microsoft-dsvm
                imageOffer: ubuntu-1804
                imageSku: 1804-gen2
                imageVersion: latest

                # OR use a custom image from Azure Compute Gallery
                # note if you use a custom image, imagePublisher,
                # imageOffer, imageSku, and imageVersion are ignored
                # imageId: /subscriptions/[subscription-id]/resourceGroups/[resource-group-id]/providers/Microsoft.Compute/galleries/[azure-compute-gallery-id]/images/[image-id]/versions/[image-version]

                # optionally set osDiskSize if you want to use a
                # custom disk size
                # osDiskSize: 128

    ray.worker.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 2
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config, e.g. instance type.
        node_config:
            azure_arm_parameters:
                vmSize: Standard_D2s_v3
                # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
                imagePublisher: microsoft-dsvm
                imageOffer: ubuntu-1804
                imageSku: 1804-gen2
                imageVersion: latest
                # optionally set priority to use Spot instances
                priority: Spot
                # set a maximum price for spot instances if desired
                # billingProfile:
                #     maxPrice: -1

# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
    #    "/path1/on/remote/machine": "/path1/on/local/machine",
    #    "/path2/on/remote/machine": "/path2/on/local/machine",
    "~/.ssh/id_rsa.pub": "~/.ssh/id_rsa.pub"}

# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []

# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False

# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
    - "**/.git"
    - "**/.git/**"

# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
    - ".gitignore"

# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands:
    # enable docker setup
    - sudo usermod -aG docker $USER || true
    - sleep 10 # delay to avoid docker permission denied errors
    # get rid of annoying Ubuntu message
    - touch ~/.sudo_as_admin_successful

# List of shell commands to run to set up nodes.
# NOTE: rayproject/ray-ml:latest has ray latest bundled
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image
# that has the "nightly" (e.g. "rayproject/ray-ml:nightly-gpu") or uncomment the following line:
# - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl"

# Custom commands that will be run on the head node after common setup.
head_setup_commands:
    - pip install -U azure-cli-core==2.29.1 azure-identity==1.7.0 azure-mgmt-compute==23.1.0 azure-mgmt-network==19.0.0 azure-mgmt-resource==20.0.0 msrestazure==0.6.4

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    - ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076

GCP

# An unique identifier for the head node and workers of this cluster.
cluster_name: default

# The maximum number of workers nodes to launch in addition to the head
# node.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0

# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
  image: "rayproject/ray-ml:latest-gpu" # You can change this to latest-cpu if you don't need GPU support and want a faster startup
    # image: rayproject/ray:latest-gpu   # use this one if you don't need ML dependencies, it's faster to pull
  container_name: "ray_container"
  # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
  # if no cached version is present.
  pull_before_run: True
  run_options:  # Extra options to pass into "docker run"
    - --ulimit nofile=65536:65536

  # Example of running a GPU head with CPU workers
  # head_image: "rayproject/ray-ml:latest-gpu"
  # Allow Ray to automatically detect GPUs

  # worker_image: "rayproject/ray-ml:latest-cpu"
  # worker_run_options: []

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Cloud-provider specific configuration.
provider:
    type: gcp
    region: us-west1
    availability_zone: us-west1-a
    project_id: null # Globally unique project id

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below. This requires that you have added the key into the
# project wide meta-data.
#    ssh_private_key: /path/to/your/key.pem

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray_head_default:
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config for the head node, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
        # For more documentation on available fields, see:
        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
        node_config:
            machineType: n1-standard-2
            disks:
              - boot: true
                autoDelete: true
                type: PERSISTENT
                initializeParams:
                  diskSizeGb: 50
                  # See https://cloud.google.com/compute/docs/images for more images
                  sourceImage: projects/deeplearning-platform-release/global/images/common-cpu-v20240922

            # Additional options can be found in in the compute docs at
            # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert

            # If the network interface is specified as below in both head and worker
            # nodes, the manual network config is used.  Otherwise an existing subnet is
            # used.  To use a shared subnet, ask the subnet owner to grant permission
            # for 'compute.subnetworks.use' to the ray autoscaler account...
            # networkInterfaces:
            #   - kind: compute#networkInterface
            #     subnetwork: path/to/subnet
            #     aliasIpRanges: []
    ray_worker_small:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 1
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 2
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config for the head node, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as subnets and ssh-keys.
        # For more documentation on available fields, see:
        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
        node_config:
            machineType: n1-standard-2
            disks:
              - boot: true
                autoDelete: true
                type: PERSISTENT
                initializeParams:
                  diskSizeGb: 50
                  # See https://cloud.google.com/compute/docs/images for more images
                  sourceImage: projects/deeplearning-platform-release/global/images/common-cpu-v20240922
            # Run workers on preemtible instance by default.
            # Comment this out to use on-demand.
            scheduling:
              - preemptible: true
            # Un-Comment this to launch workers with the Service Account of the Head Node
            # serviceAccounts:
            # - email: ray-autoscaler-sa-v1@<project_id>.iam.gserviceaccount.com
            #   scopes:
            #   - https://www.googleapis.com/auth/cloud-platform

    # Additional options can be found in in the compute docs at
    # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert

# Specify the node type of the head node (as configured above).
head_node_type: ray_head_default

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
#    "/path1/on/remote/machine": "/path1/on/local/machine",
#    "/path2/on/remote/machine": "/path2/on/local/machine",
}

# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []

# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False

# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude:
    - "**/.git"
    - "**/.git/**"

# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter:
    - ".gitignore"

# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []

# List of shell commands to run to set up nodes.
setup_commands: []
    # Note: if you're developing Ray, you probably want to create a Docker image that
    # has your Ray repo pre-cloned. Then, you can replace the pip installs
    # below with a git checkout <your_sha> (and possibly a recompile).
    # To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image
    # that has the "nightly" (e.g. "rayproject/ray-ml:nightly-gpu") or uncomment the following line:
    # - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"


# Custom commands that will be run on the head node after common setup.
head_setup_commands:
  - pip install google-api-python-client==1.7.8

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    - >-
      ray start
      --head
      --port=6379
      --object-manager-port=8076
      --autoscaling-config=~/ray_bootstrap_config.yaml

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - >-
      ray start
      --address=$RAY_HEAD_IP:6379
      --object-manager-port=8076

vSphere

# An unique identifier for the head node and workers of this cluster.
cluster_name: default

# The maximum number of workers nodes to launch in addition to the head
# node.
max_workers: 5

# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if the task requires adding more nodes then autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0

# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
    image: "rayproject/ray-ml:latest"
    # image: rayproject/ray:latest   # use this one if you don't need ML dependencies, it's faster to pull
    container_name: "ray_container"
    # If true, pulls latest version of image. Otherwise, `docker run` will only pull the image
    # if no cached version is present.
    pull_before_run: True
    run_options:   # Extra options to pass into "docker run"
        - --ulimit nofile=65536:65536

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Cloud-provider specific configuration.
provider:
    type: vsphere

# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ray
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
    ssh_private_key: ~/ray-bootstrap-key.pem

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # You can override the resources here. Adding GPU to the head node is not recommended.
        # resources: { "CPU": 2, "Memory": 4096}
        resources: {}
        node_config: {"vm_class": "best-effort-xlarge"}
    worker:
        # The minimum number of nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 1
        max_workers: 3
        # You can override the resources here. For GPU, currently only NVIDIA GPU is supported. If no ESXi host can
        # fulfill the requirement, the Ray node creation will fail. The number of created nodes may not meet the desired
        # minimum number. The vSphere node provider will not distinguish the GPU type. It will just count the quantity:
        # mount the first k random available NVIDIA GPU to the VM, if the user set {"GPU": k}.
        # resources: {"CPU": 2, "Memory": 4096, "GPU": 1}
        resources: {}
        node_config: {"vm_class": "best-effort-xlarge"}
    worker_2:
         # The minimum number of nodes of this type to launch.
         # This number should be >= 0.
         min_workers: 1
         max_workers: 2
         # You can override the resources here. For GPU, currently only NVIDIA GPU is supported. If no ESXi host can
         # fulfill the requirement, the Ray node creation will fail. The number of created nodes may not meet the desired
         # minimum number. The vSphere node provider will not distinguish the GPU type. It will just count the quantity:
         # mount the first k random available NVIDIA GPU to the VM, if the user set {"GPU": k}.
         # resources: {"CPU": 2, "Memory": 4096, "GPU": 1}
         resources: {}
         node_config: {"vm_class": "best-effort-xlarge"}

# Specify the node type of the head node (as configured above).
head_node_type: ray.head.default

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
#    "/path1/on/remote/machine": "/path1/on/local/machine",
#    "/path2/on/remote/machine": "/path2/on/local/machine",
}

# Files or directories to copy from the head node to the worker nodes. The format is a
# list of paths. The same path on the head node will be copied to the worker node.
# This behavior is a subset of the file_mounts behavior. In the vast majority of cases
# you should just use file_mounts. Only use this if you know what you're doing!
cluster_synced_files: []

# Whether changes to directories in file_mounts or cluster_synced_files in the head node
# should sync to the worker node continuously
file_mounts_sync_continuously: False

# Patterns for files to exclude when running rsync up or rsync down
rsync_exclude: []

# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
rsync_filter: []

# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []

# List of shell commands to run to set up nodes.
setup_commands: []

# Custom commands that will be run on the head node after common setup.
head_setup_commands:
    - pip install 'git+https://github.com/vmware/vsphere-automation-sdk-python.git'

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --head --port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379

TPU Configuration#

It is possible to use TPU VMs on GCP. Currently, TPU pods (TPUs other than v2-8, v3-8 and v4-8) are not supported.

Before using a config with TPUs, ensure that the TPU API is enabled for your GCP project.

GCP

# A unique identifier for the head node and workers of this cluster.
cluster_name: tputest

# The maximum number of worker nodes to launch in addition to the head node.
max_workers: 7

available_node_types:
    ray_head_default:
        resources: {"TPU": 1}  # use TPU custom resource in your code
        node_config:
            # Only v2-8, v3-8 and v4-8 accelerator types are currently supported.
            # Support for TPU pods will be added in the future.
            acceleratorType: v2-8
            runtimeVersion: v2-alpha
            schedulingConfig:
                # Set to false to use non-preemptible TPUs
                preemptible: false
    ray_tpu:
        min_workers: 1
        resources: {"TPU": 1}  # use TPU custom resource in your code
        node_config:
            acceleratorType: v2-8
            runtimeVersion: v2-alpha
            schedulingConfig:
                preemptible: true

provider:
    type: gcp
    region: us-central1
    availability_zone: us-central1-b
    project_id: null # Replace this with your GCP project ID.

setup_commands:
  - sudo apt install python-is-python3 -y
  - pip3 install --upgrade pip
  - pip3 install -U "ray[default]"

# Specify the node type of the head node (as configured above).
head_node_type: ray_head_default

Cluster YAML Configuration Options#

Syntax#

Custom types#

Docker#

Auth#

Provider#

Security Group#

vSphere Config#

vSphere Credentials#

vSphere Frozen VM Configs#

vSphere GPU Configs#

Node types#

Node config#

Node Docker#

Resources#

File mounts#

Properties and Definitions#

cluster_name#

max_workers#

upscaling_speed#

idle_timeout_minutes#

docker#

provider#

auth#

available_node_types#

head_node_type#

file_mounts#

cluster_synced_files#

rsync_exclude#

rsync_filter#

initialization_commands#

setup_commands#

head_setup_commands#

worker_setup_commands#

head_start_ray_commands#

worker_start_ray_commands#

docker.image#

docker.head_image#

docker.worker_image#

docker.container_name#

docker.pull_before_run#

docker.run_options#

docker.head_run_options#

docker.worker_run_options#

docker.disable_automatic_runtime_detection#

docker.disable_shm_size_detection#

auth.ssh_user#

auth.ssh_private_key#

auth.ssh_public_key#

provider.type#

provider.region#

provider.availability_zone#

provider.location#

provider.resource_group#

provider.subscription_id#

provider.msi_name#

provider.msi_resource_group#

provider.project_id#

provider.cache_stopped_nodes#

provider.use_internal_ips#

provider.use_external_head_ip#

provider.security_group#

provider.vsphere_config#

security_group.GroupName#

security_group.IpPermissions#

vsphere_config.credentials#

vsphere_config.credentials.user#

vsphere_config.credentials.password#

vsphere_config.credentials.server#

vsphere_config.frozen_vm#

vsphere_config.frozen_vm.name#

vsphere_config.frozen_vm.library_item#

vsphere_config.frozen_vm.resource_pool#

vsphere_config.frozen_vm.cluster#

vsphere_config.frozen_vm.datastore#

vsphere_config.gpu_config#

vsphere_config.gpu_config.dynamic_pci_passthrough#

available_node_types.<node_type_name>.node_type.node_config#

available_node_types.<node_type_name>.node_type.resources#

available_node_types.<node_type_name>.node_type.min_workers#

`cluster_name`#

`max_workers`#

`upscaling_speed`#

`idle_timeout_minutes`#

`docker`#

`provider`#

`auth`#

`available_node_types`#

`head_node_type`#

`file_mounts`#

`cluster_synced_files`#

`rsync_exclude`#

`rsync_filter`#

`initialization_commands`#

`setup_commands`#

`head_setup_commands`#

`worker_setup_commands`#

`head_start_ray_commands`#

`worker_start_ray_commands`#

`docker.image`#

`docker.head_image`#

`docker.worker_image`#

`docker.container_name`#

`docker.pull_before_run`#

`docker.run_options`#

`docker.head_run_options`#

`docker.worker_run_options`#

`docker.disable_automatic_runtime_detection`#

`docker.disable_shm_size_detection`#

`auth.ssh_user`#

`auth.ssh_private_key`#

`auth.ssh_public_key`#

`provider.type`#

`provider.region`#

`provider.availability_zone`#

`provider.location`#

`provider.resource_group`#

`provider.subscription_id`#

`provider.msi_name`#

`provider.msi_resource_group`#

`provider.project_id`#

`provider.cache_stopped_nodes`#

`provider.use_internal_ips`#

`provider.use_external_head_ip`#

`provider.security_group`#

`provider.vsphere_config`#

`security_group.GroupName`#

`security_group.IpPermissions`#

`vsphere_config.credentials`#

`vsphere_config.credentials.user`#

`vsphere_config.credentials.password`#

`vsphere_config.credentials.server`#

`vsphere_config.frozen_vm`#

`vsphere_config.frozen_vm.name`#

`vsphere_config.frozen_vm.library_item`#

`vsphere_config.frozen_vm.resource_pool`#

`vsphere_config.frozen_vm.cluster`#

`vsphere_config.frozen_vm.datastore`#

`vsphere_config.gpu_config`#

`vsphere_config.gpu_config.dynamic_pci_passthrough`#

`available_node_types.<node_type_name>.node_type.node_config`#

`available_node_types.<node_type_name>.node_type.resources`#

`available_node_types.<node_type_name>.node_type.min_workers`#

`available_node_types.<node_type_name>.node_type.max_workers`#

`available_node_types.<node_type_name>.node_type.worker_setup_commands`#

`available_node_types.<node_type_name>.node_type.resources.CPU`#

`available_node_types.<node_type_name>.node_type.resources.GPU`#

`available_node_types.<node_type_name>.node_type.resources.memory`#

`available_node_types.<node_type_name>.node_type.resources.object-store-memory`#

`available_node_types.<node_type_name>.docker`#