Source code for ray.util.spark.cluster_init

import os
import socket
import sys
import time
import threading
import logging
import uuid
from packaging.version import Version
from typing import Optional, Dict, Type

import ray
from ray.util.annotations import PublicAPI
from ray._private.storage import _load_class

from .utils import (
    exec_cmd,
    check_port_open,
    get_random_unused_port,
    get_spark_session,
    get_spark_application_driver_host,
    is_in_databricks_runtime,
    get_spark_task_assigned_physical_gpus,
    get_avail_mem_per_ray_worker_node,
    get_max_num_concurrent_tasks,
    gen_cmd_exec_failure_msg,
    setup_sigterm_on_parent_death,
)
from .start_hook_base import RayOnSparkStartHook
from .databricks_hook import DefaultDatabricksRayOnSparkStartHook


_logger = logging.getLogger("ray.util.spark")
_logger.setLevel(logging.INFO)


RAY_ON_SPARK_START_HOOK = "RAY_ON_SPARK_START_HOOK"

MAX_NUM_WORKER_NODES = -1

RAY_ON_SPARK_COLLECT_LOG_TO_PATH = "RAY_ON_SPARK_COLLECT_LOG_TO_PATH"


def _check_system_environment():
    if not sys.platform.startswith("linux"):
        raise RuntimeError("Ray on spark only supports running on Linux.")

    spark_dependency_error = "ray.util.spark module requires pyspark >= 3.3"
    try:
        import pyspark

        if Version(pyspark.__version__).release < (3, 3, 0):
            raise RuntimeError(spark_dependency_error)
    except ImportError:
        raise RuntimeError(spark_dependency_error)


class RayClusterOnSpark:
    """
    This class is the type of the instance returned by the `_setup_ray_cluster`
    interface. It is used to connect to, disconnect from, and shut down the Ray
    cluster running on Apache Spark, and it also serves as a Python context
    manager: when the managed scope exits, the cluster is shut down.

    Args:
        address: The URL of the ray head node (i.e. the hostname and an unused
            port on the Spark driver node).
        head_proc: The ray head node process.
        spark_job_group_id: The Spark job group id of the background spark job
            that launches the ray worker nodes.
        num_workers_node: The number of worker nodes in the ray cluster.
        temp_dir: The ray temporary directory used by this cluster.
        cluster_unique_id: A unique id (hex string) identifying this cluster.
        start_hook: The `RayOnSparkStartHook` instance used during cluster
            startup.
        ray_dashboard_port: The port used by the ray dashboard, if one is
            configured.
    """

    def __init__(
        self,
        address,
        head_proc,
        spark_job_group_id,
        num_workers_node,
        temp_dir,
        cluster_unique_id,
        start_hook,
        ray_dashboard_port,
    ):
        self.address = address
        self.head_proc = head_proc
        self.spark_job_group_id = spark_job_group_id
        self.num_worker_nodes = num_workers_node
        self.temp_dir = temp_dir
        self.cluster_unique_id = cluster_unique_id
        self.start_hook = start_hook
        self.ray_dashboard_port = ray_dashboard_port

        self.is_shutdown = False
        self.spark_job_is_canceled = False
        self.background_job_exception = None

    def _cancel_background_spark_job(self):
        self.spark_job_is_canceled = True
        get_spark_session().sparkContext.cancelJobGroup(self.spark_job_group_id)

    def wait_until_ready(self):
        import ray

        if self.is_shutdown:
            raise RuntimeError(
                "The ray cluster has been shut down or it failed to start."
            )
        try:
            # connect to the ray cluster.
            ray_ctx = ray.init(address=self.address)
            webui_url = ray_ctx.address_info.get("webui_url", None)
            if webui_url:
                self.start_hook.on_ray_dashboard_created(self.ray_dashboard_port)

        except Exception:
            self.shutdown()
            raise

        try:
            last_alive_worker_count = 0
            last_progress_move_time = time.time()
            while True:
                time.sleep(_RAY_CLUSTER_STARTUP_PROGRESS_CHECKING_INTERVAL)

                # Inside the wait-until-ready loop, check
                # `self.background_job_exception`; if it is not None, the
                # background spark job has failed, so raise the error directly.
                if self.background_job_exception is not None:
                    raise RuntimeError(
                        "Ray workers failed to start."
                    ) from self.background_job_exception

                cur_alive_worker_count = (
                    len([node for node in ray.nodes() if node["Alive"]]) - 1
                )  # Minus 1 means excluding the head node.

                if cur_alive_worker_count >= self.num_worker_nodes:
                    return

                if cur_alive_worker_count > last_alive_worker_count:
                    last_alive_worker_count = cur_alive_worker_count
                    last_progress_move_time = time.time()
                    _logger.info(
                        "Ray worker nodes are starting. Progress: "
                        f"({cur_alive_worker_count} / {self.num_worker_nodes})"
                    )
                else:
                    if (
                        time.time() - last_progress_move_time
                        > _RAY_CONNECT_CLUSTER_POLL_PROGRESS_TIMEOUT
                    ):
                        if cur_alive_worker_count == 0:
                            raise RuntimeError(
                                "Current spark cluster has no resources to launch "
                                "Ray worker nodes."
                            )
                        _logger.warning(
                            "Timeout in waiting for all ray workers to start. "
                            "Started / Total requested: "
                            f"({cur_alive_worker_count} / {self.num_worker_nodes}). "
                            "Current spark cluster does not have sufficient resources "
                            "to launch requested number of Ray worker nodes."
                        )
                        return
        finally:
            ray.shutdown()

    def connect(self):
        if ray.is_initialized():
            raise RuntimeError("Already connected to Ray cluster.")
        ray.init(address=self.address)

    def disconnect(self):
        ray.shutdown()

    def shutdown(self, cancel_background_job=True):
        """
        Shut down the ray cluster created by the `setup_ray_cluster` API.
        NB: If the background spark job raises an unexpected error, the exception
        handler in the background thread also calls this method, passing
        cancel_background_job=False to avoid a recursive call.
        """
        if not self.is_shutdown:
            self.disconnect()
            os.environ.pop("RAY_ADDRESS", None)
            if cancel_background_job:
                try:
                    self._cancel_background_spark_job()
                except Exception as e:
                    # swallow exception.
                    _logger.warning(
                        f"An error occurred while cancelling the ray cluster "
                        f"background spark job: {repr(e)}"
                    )
            try:
                self.head_proc.terminate()
            except Exception as e:
                # swallow exception.
                _logger.warning(
                    f"An error occurred during shutdown of ray head node: {repr(e)}"
                )
            self.is_shutdown = True

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.shutdown()
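

# Editorial sketch, not part of the original module: a minimal example of how a
# `RayClusterOnSpark` handle is driven, following the lifecycle described in the
# class docstring above. The helper name `_example_drive_cluster_handle` is
# hypothetical and only calls methods defined on the class.
def _example_drive_cluster_handle(cluster: RayClusterOnSpark) -> None:
    # Block until the requested number of ray worker nodes are alive (or the
    # startup-progress timeout is hit).
    cluster.wait_until_ready()
    # Attach the current python process to the cluster.
    cluster.connect()
    try:
        # Run Ray workloads here, e.g. submit remote tasks / actors.
        pass
    finally:
        # Detach from the cluster and tear down the head node plus the
        # background spark job that hosts the worker nodes.
        cluster.disconnect()
        cluster.shutdown()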


def _convert_ray_node_option_key(key):
    return f"--{key.replace('_', '-')}"


def _convert_ray_node_options(options):
    return [
        f"{_convert_ray_node_option_key(k)}"
        if v is None
        else f"{_convert_ray_node_option_key(k)}={str(v)}"
        for k, v in options.items()
    ]
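

# Editorial example, not in the original source: a small, hypothetical helper
# demonstrating how the converters above turn a node-options dict into
# `ray start` CLI flags (flag-style options with value None become bare flags,
# valued options become `--key=value`).
def _example_convert_ray_node_options() -> None:
    assert _convert_ray_node_options(
        {"disable_usage_stats": None, "dashboard_port": 8266}
    ) == ["--disable-usage-stats", "--dashboard-port=8266"]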


_RAY_HEAD_STARTUP_TIMEOUT = 5
_BACKGROUND_JOB_STARTUP_WAIT = int(
    os.environ.get("RAY_ON_SPARK_BACKGROUND_JOB_STARTUP_WAIT", "30")
)
_RAY_CLUSTER_STARTUP_PROGRESS_CHECKING_INTERVAL = 3
_RAY_WORKER_NODE_STARTUP_INTERVAL = int(
    os.environ.get("RAY_ON_SPARK_RAY_WORKER_NODE_STARTUP_INTERVAL", "10")
)
_RAY_CONNECT_CLUSTER_POLL_PROGRESS_TIMEOUT = 120
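
# Editorial note, not in the original source: the two values above that read from
# `os.environ` can be tuned by setting the environment variables before this
# module is imported, e.g. (the values below are illustrative only):
#   os.environ["RAY_ON_SPARK_BACKGROUND_JOB_STARTUP_WAIT"] = "60"
#   os.environ["RAY_ON_SPARK_RAY_WORKER_NODE_STARTUP_INTERVAL"] = "20"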


def _prepare_for_ray_worker_node_startup():
    """
    If we start multiple ray worker nodes on a machine concurrently, some ray
    worker processes might fail to start due to ray port conflicts, caused by a
    race condition between finding a free port and actually binding it.
    To address the issue, this function uses an exclusive file lock to delay the
    worker processes, ensuring that port acquisition does not run into resource
    contention caused by that race condition.

    After acquiring the lock, it allocates a port range for the worker ports
    (the ray node config options --min-worker-port and --max-worker-port).
    Because multiple ray clusters might be created on one spark cluster, a single
    spark worker machine might run multiple ray worker nodes belonging to
    different ray clusters, and the ray nodes on the same machine must use
    non-overlapping worker port ranges. To achieve this, this function maintains
    a file `/tmp/ray_on_spark_worker_port_allocation.txt` composed of multiple
    lines, where each line contains 2 numbers: `pid` and `port_range_slot_index`.
    Each port range slot allocates 1000 ports, and the corresponding port range
    is:
     - range_begin (inclusive): 20000 + port_range_slot_index * 1000
     - range_end (exclusive): range_begin + 1000
    This function first scans `/tmp/ray_on_spark_worker_port_allocation.txt`,
    removes lines containing the pid of a dead process, finds the first unused
    port_range_slot_index, regenerates the file, and returns the allocated port
    range.

    Returns: The allocated port range for the current worker node's worker ports.
    """
    import psutil
    import fcntl

    def acquire_lock(file_path):
        mode = os.O_RDWR | os.O_CREAT | os.O_TRUNC
        fd = os.open(file_path, mode)
        try:
            # The lock file must be readable / writable to all users.
            os.chmod(file_path, 0o0777)
            # Allow retrying to get the file lock for a maximum number of
            # iterations.
            max_lock_iter = 600
            for _ in range(max_lock_iter):
                try:
                    fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
                except BlockingIOError:
                    # The lock is held by another process; continue the loop and
                    # wait for the lock to become available.
                    pass
                else:
                    # Acquired the lock successfully.
                    return fd
                time.sleep(10)
            raise TimeoutError(f"Acquiring lock on file {file_path} timed out.")
        except Exception:
            # Close the fd on any failure (including timeout) and re-raise so
            # the caller can handle it.
            os.close(fd)
            raise

    lock_file_path = "/tmp/ray_on_spark_worker_startup_barrier_lock.lock"
    try:
        lock_fd = acquire_lock(lock_file_path)
    except TimeoutError:
        # If a timeout happens, the file lock might be held by another process
        # that, for some unexpected reason, does not release it in time.
        # In this case, remove the existing lock file, create the file again, and
        # then acquire a file lock on the new file.
        try:
            os.remove(lock_file_path)
        except Exception:
            pass
        lock_fd = acquire_lock(lock_file_path)

    def release_lock():
        fcntl.flock(lock_fd, fcntl.LOCK_UN)
        os.close(lock_fd)

    try:
        port_alloc_file = "/tmp/ray_on_spark_worker_port_allocation.txt"

        # NB: reading / writing `port_alloc_file` is protected by exclusive lock
        # on file `lock_file_path`
        if os.path.exists(port_alloc_file):
            with open(port_alloc_file, mode="r") as fp:
                port_alloc_data = fp.read()
            port_alloc_table = [
                line.split(" ") for line in port_alloc_data.strip().split("\n")
            ]
            port_alloc_table = [
                (int(pid_str), int(slot_index_str))
                for pid_str, slot_index_str in port_alloc_table
            ]
        else:
            port_alloc_table = []
            with open(port_alloc_file, mode="w"):
                pass
            # The port range allocation file must be readable / writable to all users.
            os.chmod(port_alloc_file, 0o0777)

        port_alloc_map = {
            pid: slot_index
            for pid, slot_index in port_alloc_table
            if psutil.pid_exists(pid)  # remove slot used by dead process
        }

        allocated_slot_set = set(port_alloc_map.values())

        if len(allocated_slot_set) == 0:
            new_slot_index = 0
        else:
            new_slot_index = max(allocated_slot_set) + 1
            for index in range(new_slot_index):
                if index not in allocated_slot_set:
                    new_slot_index = index
                    break

        port_alloc_map[os.getpid()] = new_slot_index

        with open(port_alloc_file, mode="w") as fp:
            for pid, slot_index in port_alloc_map.items():
                fp.write(f"{pid} {slot_index}\n")

        worker_port_range_begin = 20000 + new_slot_index * 1000
        worker_port_range_end = worker_port_range_begin + 1000

        if worker_port_range_end > 65536:
            raise RuntimeError(
                "Too many ray worker nodes are running on this machine, cannot "
                "allocate worker port range for new ray worker node."
            )
    except Exception:
        release_lock()
        raise

    def hold_lock():
        time.sleep(_RAY_WORKER_NODE_STARTUP_INTERVAL)
        release_lock()

    threading.Thread(target=hold_lock, args=()).start()

    return worker_port_range_begin, worker_port_range_end
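

# Editorial sketch, not part of the original module: the slot-to-port-range
# arithmetic documented above, extracted for illustration. Slot 0 maps to
# [20000, 21000), slot 1 to [21000, 22000), and so on. The helper name is
# hypothetical.
def _example_port_range_for_slot(port_range_slot_index: int):
    range_begin = 20000 + port_range_slot_index * 1000  # inclusive
    range_end = range_begin + 1000  # exclusive
    return range_begin, range_end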


def _setup_ray_cluster(
    *,
    num_worker_nodes: int,
    num_cpus_per_node: int,
    num_gpus_per_node: int,
    using_stage_scheduling: bool,
    heap_memory_per_node: int,
    object_store_memory_per_node: int,
    head_node_options: Dict,
    worker_node_options: Dict,
    ray_temp_root_dir: str,
    collect_log_to_path: str,
) -> Type[RayClusterOnSpark]:
    """
    The public API `ray.util.spark.setup_ray_cluster` does some argument
    validation and then passes the validated arguments to this interface,
    which returns a `RayClusterOnSpark` instance.

    The returned instance can be used to connect to, disconnect from, and shut
    down the ray cluster. It can also be used as a context manager (by
    encapsulating operations within `with _setup_ray_cluster(...):`): the ray
    cluster is created and `RAY_ADDRESS` is set when this function is called,
    and when the managed scope exits, the ray cluster is shut down.

    Note: This function interface is stable and can be used for
    instrumentation logging patching.
    """
    from pyspark.util import inheritable_thread_target

    if RAY_ON_SPARK_START_HOOK in os.environ:
        start_hook = _load_class(os.environ[RAY_ON_SPARK_START_HOOK])()
    elif is_in_databricks_runtime():
        start_hook = DefaultDatabricksRayOnSparkStartHook()
    else:
        start_hook = RayOnSparkStartHook()

    spark = get_spark_session()

    ray_head_ip = socket.gethostbyname(get_spark_application_driver_host(spark))
    ray_head_port = get_random_unused_port(ray_head_ip, min_port=9000, max_port=10000)

    # Make a copy of head_node_options to avoid changing the original dict in user code.
    head_node_options = head_node_options.copy()
    include_dashboard = head_node_options.pop("include_dashboard", None)
    ray_dashboard_port = head_node_options.pop("dashboard_port", None)

    if include_dashboard is None or include_dashboard is True:
        if ray_dashboard_port is None:
            ray_dashboard_port = get_random_unused_port(
                ray_head_ip, min_port=9000, max_port=10000, exclude_list=[ray_head_port]
            )
        ray_dashboard_agent_port = get_random_unused_port(
            ray_head_ip,
            min_port=9000,
            max_port=10000,
            exclude_list=[ray_head_port, ray_dashboard_port],
        )

        dashboard_options = [
            "--dashboard-host=0.0.0.0",
            f"--dashboard-port={ray_dashboard_port}",
            f"--dashboard-agent-listen-port={ray_dashboard_agent_port}",
        ]
        # If include_dashboard is None, we don't set `--include-dashboard` option,
        # in this case Ray will decide whether dashboard can be started
        # (e.g. checking any missing dependencies).
        if include_dashboard is True:
            dashboard_options += ["--include-dashboard=true"]
    else:
        dashboard_options = [
            "--include-dashboard=false",
        ]

    _logger.info(f"Ray head hostname {ray_head_ip}, port {ray_head_port}")

    cluster_unique_id = uuid.uuid4().hex[:8]

    if ray_temp_root_dir is None:
        ray_temp_root_dir = start_hook.get_default_temp_dir()
    ray_temp_dir = os.path.join(
        ray_temp_root_dir, f"ray-{ray_head_port}-{cluster_unique_id}"
    )
    os.makedirs(ray_temp_dir, exist_ok=True)

    ray_head_node_cmd = [
        sys.executable,
        "-m",
        "ray.util.spark.start_ray_node",
        f"--temp-dir={ray_temp_dir}",
        "--block",
        "--head",
        f"--node-ip-address={ray_head_ip}",
        f"--port={ray_head_port}",
        # disallow ray tasks with cpu/gpu requirements from being scheduled on the head
        # node.
        "--num-cpus=0",
        "--num-gpus=0",
        # limit the memory allocation to the head node (actual usage may increase
        # beyond this for processing of tasks and actors).
        f"--memory={128 * 1024 * 1024}",
        # limit the object store memory allocation to the head node (actual usage
        # may increase beyond this for processing of tasks and actors).
        f"--object-store-memory={128 * 1024 * 1024}",
        *dashboard_options,
        *_convert_ray_node_options(head_node_options),
    ]

    _logger.info(f"Starting Ray head, command: {' '.join(ray_head_node_cmd)}")

    # `preexec_fn=setup_sigterm_on_parent_death` ensures the ray head node is
    # killed if the parent process dies unexpectedly.
    ray_head_proc, tail_output_deque = exec_cmd(
        ray_head_node_cmd,
        synchronous=False,
        preexec_fn=setup_sigterm_on_parent_death,
        extra_env={RAY_ON_SPARK_COLLECT_LOG_TO_PATH: collect_log_to_path or ""},
    )

    # Wait for the ray head node to spin up.
    time.sleep(_RAY_HEAD_STARTUP_TIMEOUT)

    if not check_port_open(ray_head_ip, ray_head_port):
        if ray_head_proc.poll() is None:
            # Ray head GCS service is down. Kill ray head node.
            ray_head_proc.terminate()
            # wait killing complete.
            time.sleep(0.5)

        cmd_exec_failure_msg = gen_cmd_exec_failure_msg(
            ray_head_node_cmd, ray_head_proc.returncode, tail_output_deque
        )
        raise RuntimeError("Start Ray head node failed!\n" + cmd_exec_failure_msg)

    _logger.info("Ray head node started.")

    # NB:
    # In order to start ray worker nodes on spark cluster worker machines,
    # we launch a background spark job:
    #  1. Each spark task launches one ray worker node. This design ensures all ray
    #     worker nodes have the same shape (same cpus / gpus / memory configuration).
    #     If ray worker nodes had a non-uniform shape, the Ray cluster setup would
    #     be non-deterministic and could create issues with node sizing.
    #  2. A ray worker node is started via the `ray start` CLI. In each spark task,
    #     a child process is started and will execute a `ray start ...` command in
    #     blocking mode.
    #  3. Each task will acquire a file lock for 10s to ensure that the ray worker
    #     init will acquire a port connection to the ray head node that does not
    #     contend with other worker processes on the same Spark worker node.
    #  4. When the ray cluster is shut down, killing the ray worker nodes works as
    #     follows: a parent-death signal (PR_SET_PDEATHSIG) is installed for the
    #     `ray start ...` child processes so that when the parent process (the
    #     pyspark task) is killed, the child processes (`ray start ...` processes)
    #     receive a SIGTERM signal, killing them.
    #     Shutting down the ray cluster is performed by calling
    #     `sparkContext.cancelJobGroup` to cancel the background spark job, sending
    #     a SIGKILL signal to all spark tasks. Once the spark tasks are killed,
    #     this triggers the sending of a SIGTERM to the child processes spawned by
    #     the `ray start ...` command.

    def ray_cluster_job_mapper(_):
        from pyspark.taskcontext import TaskContext

        _worker_logger = logging.getLogger("ray.util.spark.worker")

        context = TaskContext.get()

        (
            worker_port_range_begin,
            worker_port_range_end,
        ) = _prepare_for_ray_worker_node_startup()

        # A ray worker might run on a machine different from the head node, so
        # create the local log dir and temp dir again.
        os.makedirs(ray_temp_dir, exist_ok=True)

        ray_worker_node_dashboard_agent_port = get_random_unused_port(
            ray_head_ip, min_port=10000, max_port=20000
        )
        ray_worker_node_cmd = [
            sys.executable,
            "-m",
            "ray.util.spark.start_ray_node",
            f"--temp-dir={ray_temp_dir}",
            f"--num-cpus={num_cpus_per_node}",
            "--block",
            f"--address={ray_head_ip}:{ray_head_port}",
            f"--memory={heap_memory_per_node}",
            f"--object-store-memory={object_store_memory_per_node}",
            f"--min-worker-port={worker_port_range_begin}",
            f"--max-worker-port={worker_port_range_end - 1}",
            f"--dashboard-agent-listen-port={ray_worker_node_dashboard_agent_port}",
            *_convert_ray_node_options(worker_node_options),
        ]

        ray_worker_node_extra_envs = {
            RAY_ON_SPARK_COLLECT_LOG_TO_PATH: collect_log_to_path or ""
        }

        if num_gpus_per_node > 0:
            task_resources = context.resources()

            if "gpu" not in task_resources:
                raise RuntimeError(
                    "Couldn't get the gpu id. Please check the GPU resource "
                    "configuration."
                )
            gpu_addr_list = [
                int(addr.strip()) for addr in task_resources["gpu"].addresses
            ]

            available_physical_gpus = get_spark_task_assigned_physical_gpus(
                gpu_addr_list
            )
            ray_worker_node_cmd.append(
                f"--num-gpus={len(available_physical_gpus)}",
            )
            ray_worker_node_extra_envs["CUDA_VISIBLE_DEVICES"] = ",".join(
                [str(gpu_id) for gpu_id in available_physical_gpus]
            )

        _worker_logger.info(
            f"Start Ray worker, command: {' '.join(ray_worker_node_cmd)}"
        )

        # `preexec_fn=setup_sigterm_on_parent_death` handles the case where a user
        # cancels the PySpark job: the worker process gets killed, regardless of
        # PySpark daemon and worker reuse settings.
        # We use prctl to ensure the command process receives SIGTERM after spark
        # job cancellation.
        # Note:
        # When a pyspark job is cancelled, the UDF python processes are killed by
        # SIGKILL, and neither "atexit" nor a signal handler can react to SIGKILL.
        # A prctl parent-death signal is the only way to have the child process
        # terminated in that case.
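        # Editorial sketch, not in the original source: a parent-death SIGTERM
        # hook of this kind is conventionally implemented with
        # prctl(PR_SET_PDEATHSIG, SIGTERM) in the child process, roughly:
        #
        #     import ctypes, signal
        #     libc = ctypes.CDLL("libc.so.6")
        #     PR_SET_PDEATHSIG = 1
        #     libc.prctl(PR_SET_PDEATHSIG, signal.SIGTERM)
        #
        # The actual implementation used here lives in
        # `setup_sigterm_on_parent_death` (imported from `.utils`).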
        exec_cmd(
            ray_worker_node_cmd,
            synchronous=True,
            extra_env=ray_worker_node_extra_envs,
            preexec_fn=setup_sigterm_on_parent_death,
        )

        # NB: Not reachable.
        yield 0

    spark_job_group_id = f"ray-cluster-{ray_head_port}-{cluster_unique_id}"

    cluster_address = f"{ray_head_ip}:{ray_head_port}"
    # Set RAY_ADDRESS environment variable to the cluster address.
    os.environ["RAY_ADDRESS"] = cluster_address

    ray_cluster_handler = RayClusterOnSpark(
        address=cluster_address,
        head_proc=ray_head_proc,
        spark_job_group_id=spark_job_group_id,
        num_workers_node=num_worker_nodes,
        temp_dir=ray_temp_dir,
        cluster_unique_id=cluster_unique_id,
        start_hook=start_hook,
        ray_dashboard_port=ray_dashboard_port,
    )

    def background_job_thread_fn():
        try:
            spark.sparkContext.setJobGroup(
                spark_job_group_id,
                "This job group is for spark job which runs the Ray cluster with ray "
                f"head node {ray_head_ip}:{ray_head_port}",
            )

            # We start a normal spark job (not a barrier spark job) to run the ray
            # worker nodes, for the following reasons:
            # 1. With a normal spark job, spark tasks can automatically retry
            # individually, so we don't need to write additional retry logic. In
            # barrier mode, if one spark task fails, all other spark tasks are
            # killed.
            # 2. With a normal spark job, we can support failover when a spark
            # worker physical machine crashes (spark will try to re-schedule the
            # spark task on other spark worker nodes).
            # 3. With a barrier mode job, if the cluster resources do not satisfy
            # "idle spark task slots >= argument num_spark_task", the barrier job
            # gets stuck and waits until enough idle task slots are available.
            # This behavior is not user-friendly: on a shared spark cluster, it is
            # hard for the user to estimate how many idle task slots are available
            # at a given time. With a normal spark job, the job can launch with
            # fewer spark tasks (i.e. the user initially sees a ray cluster with
            # fewer workers), and as more task slots become available, it
            # continues to launch tasks on the newly available slots, so the user
            # sees the ray cluster worker count increase.
            job_rdd = spark.sparkContext.parallelize(
                list(range(num_worker_nodes)), num_worker_nodes
            )

            if using_stage_scheduling:
                resource_profile = _create_resource_profile(
                    num_cpus_per_node,
                    num_gpus_per_node,
                )
                job_rdd = job_rdd.withResources(resource_profile)

            job_rdd.mapPartitions(ray_cluster_job_mapper).collect()
        except Exception as e:
            # NB:
            # The background spark job is designed to run forever until it is
            # killed. The exception might be raised in the following cases:
            #  1. The background job raises an unexpected exception (i.e. the ray
            #     cluster dies unexpectedly).
            #  2. The user explicitly orders shutting down the ray cluster.
            #  3. On Databricks runtime, when a notebook is detached, it triggers a
            #     python REPL `onCancel` event, cancelling the background running
            #     spark job.
            #  For cases 1 and 3, only the ray workers are killed, but the driver
            #  side ray head might still be running and the ray context might be in
            #  connected status. In order to disconnect and kill the ray head node,
            #  a call to `ray_cluster_handler.shutdown()` is performed.
            if not ray_cluster_handler.spark_job_is_canceled:
                # Set `background_job_exception` attribute before calling `shutdown`
                # so inside `shutdown` we can get exception information easily.
                ray_cluster_handler.background_job_exception = e
                ray_cluster_handler.shutdown(cancel_background_job=False)

    try:
        threading.Thread(
            target=inheritable_thread_target(background_job_thread_fn), args=()
        ).start()

        # Call hook immediately after spark job started.
        start_hook.on_cluster_created(ray_cluster_handler)

        # Wait for the background spark tasks to start.
        for _ in range(_BACKGROUND_JOB_STARTUP_WAIT):
            time.sleep(1)
            if ray_cluster_handler.background_job_exception is not None:
                raise RuntimeError(
                    "Ray workers failed to start."
                ) from ray_cluster_handler.background_job_exception

        return ray_cluster_handler
    except Exception:
        # If the driver-side ray cluster setup routine raises an exception, some
        # ray processes might already have been launched (e.g. the ray head or
        # some ray workers), so call `ray_cluster_handler.shutdown()` to kill
        # them and clean up the state.
        ray_cluster_handler.shutdown()
        raise
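

# Editorial example, not part of the original module: the context-manager form
# described in the `_setup_ray_cluster` docstring. All argument values below are
# placeholders.
#
#   with _setup_ray_cluster(
#       num_worker_nodes=2,
#       num_cpus_per_node=4,
#       num_gpus_per_node=0,
#       using_stage_scheduling=False,
#       heap_memory_per_node=10 * 1024 * 1024 * 1024,
#       object_store_memory_per_node=4 * 1024 * 1024 * 1024,
#       head_node_options={},
#       worker_node_options={},
#       ray_temp_root_dir=None,
#       collect_log_to_path=None,
#   ) as cluster:
#       cluster.wait_until_ready()
#       # The cluster (head node and background spark job) is shut down when
#       # this block exits.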


_active_ray_cluster = None
_active_ray_cluster_rwlock = threading.RLock()


def _create_resource_profile(num_cpus_per_node, num_gpus_per_node):
    from pyspark.resource.profile import ResourceProfileBuilder
    from pyspark.resource.requests import TaskResourceRequests

    task_res_req = TaskResourceRequests().cpus(num_cpus_per_node)
    if num_gpus_per_node > 0:
        task_res_req = task_res_req.resource("gpu", num_gpus_per_node)
    return ResourceProfileBuilder().require(task_res_req).build


# Dicts mapping a blocked option key to the replacement argument that should be
# used instead (None if the option is controlled by Ray on Spark and has no
# replacement).
_head_node_option_block_keys = {
    "temp_dir": "ray_temp_root_dir",
    "block": None,
    "head": None,
    "node_ip_address": None,
    "port": None,
    "num_cpus": None,
    "num_gpus": None,
    "dashboard_host": None,
    "dashboard_agent_listen_port": None,
}

_worker_node_option_block_keys = {
    "temp_dir": "ray_temp_root_dir",
    "block": None,
    "head": None,
    "address": None,
    "num_cpus": "num_cpus_per_node",
    "num_gpus": "num_gpus_per_node",
    "memory": None,
    "object_store_memory": "object_store_memory_per_node",
    "dashboard_agent_listen_port": None,
    "min_worker_port": None,
    "max_worker_port": None,
}


def _verify_node_options(node_options, block_keys, node_type):
    for key in node_options:
        if key.startswith("--") or "-" in key:
            raise ValueError(
                "For a ray node option like '--foo-bar', you should convert it to "
                "the format 'foo_bar' in the 'head_node_options' / "
                "'worker_node_options' arguments."
            )

        if key in block_keys:
            common_err_msg = (
                f"Setting the option '{key}' for {node_type} nodes is not allowed."
            )
            replacement_arg = block_keys[key]
            if replacement_arg:
                raise ValueError(
                    f"{common_err_msg} You should set the '{replacement_arg}' option "
                    "instead."
                )
            else:
                raise ValueError(
                    f"{common_err_msg} This option is controlled by Ray on Spark."
                )
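
# Editorial example, not in the original source: passing a blocked option is
# rejected with a pointer to the replacement argument, e.g.
#   _verify_node_options(
#       {"num_cpus": 4}, _worker_node_option_block_keys, "Ray worker node on spark"
#   )
# raises ValueError("Setting the option 'num_cpus' for Ray worker node on spark "
#                   "nodes is not allowed. You should set the 'num_cpus_per_node' "
#                   "option instead.")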


@PublicAPI(stability="alpha")
def setup_ray_cluster(
    num_worker_nodes: int,
    num_cpus_per_node: Optional[int] = None,
    num_gpus_per_node: Optional[int] = None,
    object_store_memory_per_node: Optional[int] = None,
    head_node_options: Optional[Dict] = None,
    worker_node_options: Optional[Dict] = None,
    ray_temp_root_dir: Optional[str] = None,
    strict_mode: bool = False,
    collect_log_to_path: Optional[str] = None,
) -> str:
    """
    Set up a ray cluster on the spark cluster by starting a ray head node in the
    spark application's driver side node.
    After creating the head node, a background spark job is created that
    generates an instance of `RayClusterOnSpark` that contains configuration for
    the ray cluster that will run on the Spark cluster's worker nodes.
    After a ray cluster is set up, the "RAY_ADDRESS" environment variable is set
    to the cluster address, so you can call `ray.init()` without specifying the
    ray cluster address to connect to the cluster. To shut down the cluster you
    can call `ray.util.spark.shutdown_ray_cluster()`.
    Note: If the active ray cluster has not been shut down, you cannot create a
    new ray cluster.

    Args:
        num_worker_nodes: This argument represents how many ray worker nodes to
            start for the ray cluster.
            Specifying `num_worker_nodes` as `ray.util.spark.MAX_NUM_WORKER_NODES`
            represents a ray cluster configuration that will use all available
            resources configured for the spark application.
            To create a spark application that is intended to exclusively run a
            shared ray cluster, it is recommended to set this argument to
            `ray.util.spark.MAX_NUM_WORKER_NODES`.
        num_cpus_per_node: Number of cpus available to each ray worker node; if
            not provided, the spark application configuration 'spark.task.cpus'
            is used instead.
            **Limitation** Only spark version >= 3.4 or Databricks Runtime 12.x
            supports setting this argument.
        num_gpus_per_node: Number of gpus available to each ray worker node; if
            not provided, the spark application configuration
            'spark.task.resource.gpu.amount' is used instead.
            This argument is only available on spark clusters that are configured
            with 'gpu' resources.
            **Limitation** Only spark version >= 3.4 or Databricks Runtime 12.x
            supports setting this argument.
        object_store_memory_per_node: Object store memory available to each ray
            worker node, capped by
            "dev_shm_available_size * 0.8 / num_tasks_per_spark_worker".
            The default value equals
            "0.3 * spark_worker_physical_memory * 0.8 / num_tasks_per_spark_worker".
        head_node_options: A dict representing Ray head node extra options; these
            options will be passed to the `ray start` script. Note you need to
            convert `ray start` option keys from `--foo-bar` format to `foo_bar`
            format.
            For flag options (e.g. '--disable-usage-stats'), you should set the
            value to None in the option dict, like `{"disable_usage_stats": None}`.
            Note: Short name options (e.g. '-v') are not supported.
        worker_node_options: A dict representing Ray worker node extra options;
            these options will be passed to the `ray start` script. Note you need
            to convert `ray start` option keys from `--foo-bar` format to
            `foo_bar` format.
            For flag options (e.g. '--disable-usage-stats'), you should set the
            value to None in the option dict, like `{"disable_usage_stats": None}`.
            Note: Short name options (e.g. '-v') are not supported.
        ray_temp_root_dir: A local disk path to store the ray temporary data. The
            created cluster will create a subdirectory
            "ray-{head_port}-{random_suffix}" beneath this path.
        strict_mode: Boolean flag to fast-fail initialization of the ray cluster
            if the available spark cluster does not have sufficient resources to
            fulfill the resource allocation for memory, cpu and gpu. When set to
            true, if the requested resources are not available for minimum
            recommended functionality, an exception will be raised that details
            the inadequate spark cluster configuration settings. If overridden as
            `False`, a warning is raised.
        collect_log_to_path: If specified, after the ray head / worker nodes
            terminate, collect their logs to the specified path. On Databricks
            Runtime, we recommend specifying a local path starting with '/dbfs/',
            because the path mounts to a centralized storage device and data
            stored there is persisted after the Databricks spark cluster
            terminates.

    Returns:
        The address of the initiated Ray cluster on spark.
    """
    global _active_ray_cluster

    _check_system_environment()

    head_node_options = head_node_options or {}
    worker_node_options = worker_node_options or {}

    _verify_node_options(
        head_node_options,
        _head_node_option_block_keys,
        "Ray head node on spark",
    )
    _verify_node_options(
        worker_node_options,
        _worker_node_option_block_keys,
        "Ray worker node on spark",
    )

    if _active_ray_cluster is not None:
        raise RuntimeError(
            "The current active ray cluster on spark has not been shut down. "
            "Please call `ray.util.spark.shutdown_ray_cluster()` before "
            "initiating a new Ray cluster on spark."
        )

    if ray.is_initialized():
        raise RuntimeError(
            "The current python process has already initialized Ray. Please shut "
            "it down by calling `ray.shutdown()` before initiating a Ray cluster "
            "on spark."
        )

    spark = get_spark_session()

    spark_master = spark.sparkContext.master
    is_spark_local_mode = spark_master == "local" or spark_master.startswith("local[")

    if not (
        spark_master.startswith("spark://")
        or spark_master.startswith("local-cluster[")
        or is_spark_local_mode
    ):
        raise RuntimeError(
            "Ray on Spark only supports spark clusters in standalone mode, "
            "local-cluster mode or spark local mode."
        )

    if is_spark_local_mode:
        support_stage_scheduling = False
    elif (
        is_in_databricks_runtime()
        and Version(os.environ["DATABRICKS_RUNTIME_VERSION"]).major >= 12
    ):
        support_stage_scheduling = True
    else:
        import pyspark

        if Version(pyspark.__version__).release >= (3, 4, 0):
            support_stage_scheduling = True
        else:
            support_stage_scheduling = False

    # Environment configurations within the Spark Session that dictate how many cpus
    # and gpus to use for each submitted spark task.
    num_spark_task_cpus = int(spark.sparkContext.getConf().get("spark.task.cpus", "1"))

    if num_cpus_per_node is not None and num_cpus_per_node <= 0:
        raise ValueError("Argument `num_cpus_per_node` value must be > 0.")

    num_spark_task_gpus = int(
        spark.sparkContext.getConf().get("spark.task.resource.gpu.amount", "0")
    )

    if num_gpus_per_node is not None and num_spark_task_gpus == 0:
        raise ValueError(
            "The spark cluster is not configured with 'gpu' resources, so "
            "you cannot specify the `num_gpus_per_node` argument."
        )

    if num_gpus_per_node is not None and num_gpus_per_node < 0:
        raise ValueError("Argument `num_gpus_per_node` value must be >= 0.")

    if num_cpus_per_node is not None or num_gpus_per_node is not None:
        if support_stage_scheduling:
            num_cpus_per_node = num_cpus_per_node or num_spark_task_cpus
            num_gpus_per_node = num_gpus_per_node or num_spark_task_gpus

            using_stage_scheduling = True
            res_profile = _create_resource_profile(
                num_cpus_per_node, num_gpus_per_node
            )
        else:
            raise ValueError(
                "The current spark version does not support stage scheduling, so "
                "you cannot set the arguments `num_cpus_per_node` and "
                "`num_gpus_per_node`. Without setting these 2 arguments, "
                "each Ray worker node is assigned the number of "
                f"'spark.task.cpus' (equal to {num_spark_task_cpus}) cpu cores "
                "and the number of 'spark.task.resource.gpu.amount' "
                f"(equal to {num_spark_task_gpus}) GPUs. To enable spark stage "
                "scheduling, you need to upgrade spark to version 3.4 or use "
                "Databricks Runtime 12.x, and you cannot use spark local mode."
            )
    else:
        using_stage_scheduling = False
        res_profile = None

        num_cpus_per_node = num_spark_task_cpus
        num_gpus_per_node = num_spark_task_gpus

    (
        ray_worker_node_heap_mem_bytes,
        ray_worker_node_object_store_mem_bytes,
    ) = get_avail_mem_per_ray_worker_node(
        spark,
        object_store_memory_per_node,
        num_cpus_per_node,
        num_gpus_per_node,
    )

    if num_worker_nodes == MAX_NUM_WORKER_NODES:
        # num_worker_nodes=MAX_NUM_WORKER_NODES represents using all available
        # spark task slots
        num_worker_nodes = get_max_num_concurrent_tasks(spark.sparkContext, res_profile)
    elif num_worker_nodes <= 0:
        raise ValueError(
            "The value of 'num_worker_nodes' argument must be either a positive "
            "integer or 'ray.util.spark.MAX_NUM_WORKER_NODES'."
        )

    insufficient_resources = []

    if num_cpus_per_node < 4:
        insufficient_resources.append(
            "The provided CPU resources for each ray worker are inadequate to start "
            "a ray cluster. Based on the total cpu resources available and the "
            "configured task sizing, each ray worker node would start with "
            f"{num_cpus_per_node} CPU cores. This is less than the recommended "
            "value of `4` CPUs per worker. On spark version >= 3.4 or Databricks "
            "Runtime 12.x, you can set the argument `num_cpus_per_node` to "
            "a value >= 4 to address it, otherwise you need to increase the spark "
            "application configuration 'spark.task.cpus' to a minimum of `4` to "
            "address it."
        )

    if ray_worker_node_heap_mem_bytes < 10 * 1024 * 1024 * 1024:
        insufficient_resources.append(
            "The provided memory resources for each ray worker node are inadequate. "
            "Based on the total memory available on the spark cluster and the "
            "configured task sizing, each ray worker would start with "
            f"{ray_worker_node_heap_mem_bytes} bytes heap memory. This is less than "
            "the recommended value of 10GB. The ray worker node heap memory size is "
            "calculated by "
            "(SPARK_WORKER_NODE_PHYSICAL_MEMORY / num_local_spark_task_slots * 0.8) - "
            "object_store_memory_per_node. To increase the heap space available, "
            "increase the memory in the spark cluster by changing instance types or "
            "worker count, reduce the target `num_worker_nodes`, or apply a lower "
            "`object_store_memory_per_node`."
        )

    if insufficient_resources:
        if strict_mode:
            raise ValueError(
                "You are creating a ray cluster on spark with strict mode (it can "
                "be disabled by setting the argument 'strict_mode=False' when "
                "calling the API 'setup_ray_cluster'). Strict mode requires the "
                "spark cluster config to satisfy the following criteria:\n"
                + "\n".join(insufficient_resources)
            )
        else:
            _logger.warning("\n".join(insufficient_resources))

    with _active_ray_cluster_rwlock:
        cluster = _setup_ray_cluster(
            num_worker_nodes=num_worker_nodes,
            num_cpus_per_node=num_cpus_per_node,
            num_gpus_per_node=num_gpus_per_node,
            using_stage_scheduling=using_stage_scheduling,
            heap_memory_per_node=ray_worker_node_heap_mem_bytes,
            object_store_memory_per_node=ray_worker_node_object_store_mem_bytes,
            head_node_options=head_node_options,
            worker_node_options=worker_node_options,
            ray_temp_root_dir=ray_temp_root_dir,
            collect_log_to_path=collect_log_to_path,
        )
        cluster.wait_until_ready()  # NB: this line might raise an error.

        # If the cluster is connected successfully, set the global
        # `_active_ray_cluster` to the started cluster.
        _active_ray_cluster = cluster
    return cluster.address


@PublicAPI(stability="alpha")
def shutdown_ray_cluster() -> None:
    """
    Shut down the active ray cluster.
    """
    global _active_ray_cluster

    with _active_ray_cluster_rwlock:
        if _active_ray_cluster is None:
            raise RuntimeError("No active ray cluster to shut down.")

        _active_ray_cluster.shutdown()
        _active_ray_cluster = None
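

# Editorial example, not part of the original module: a minimal end-to-end use of
# the public API from a spark driver. The argument value is a placeholder.
#
#   import ray
#   from ray.util.spark import setup_ray_cluster, shutdown_ray_cluster
#
#   setup_ray_cluster(num_worker_nodes=2)
#   ray.init()  # Picks up the RAY_ADDRESS set by `setup_ray_cluster`.
#   ...         # Run Ray workloads.
#   ray.shutdown()
#   shutdown_ray_cluster()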