#!/bin/bash
# slurm-basic.sh
# shellcheck disable=SC2206
#SBATCH --job-name=test
#SBATCH --cpus-per-task=5
#SBATCH --mem-per-cpu=1GB
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=1
#SBATCH --time=00:30:00
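# To also request GPUs, add a directive such as the following (assumes your
# cluster supports Slurm's --gpus-per-task option; adjust the count as needed):
# #SBATCH --gpus-per-task=1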

# Print each command before it runs, to make the Slurm job log easier to debug.
set -x

# __doc_head_address_start__

# Get the list of nodes allocated to this job (one hostname per line).
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
# Intentional word splitting turns the list into an array; the SC2206
# shellcheck directive above silences the warning about this.
nodes_array=($nodes)

# The first allocated node acts as the Ray head node.
head_node=${nodes_array[0]}

port=6379  # default Ray head node port
ip_head=$head_node:$port
export ip_head
echo "IP Head: $ip_head"
# __doc_head_address_end__
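
# If worker nodes cannot resolve the head node's hostname, you can resolve it
# to an IP address and rebuild ip_head (a sketch, assuming `hostname
# --ip-address` prints a single routable address on your cluster):
#   head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
#   ip_head=$head_node_ip:$port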

# __doc_symmetric_run_start__
# Start the Ray cluster using `ray symmetric-run` on all nodes.
# Symmetric run automatically starts Ray on every node and runs the
# entrypoint script ONLY on the head node.
# Use the '--' separator to separate Ray arguments from the entrypoint command.
# The --min-nodes argument ensures all nodes join before the script runs.

# All nodes (including head and workers) will execute this block.
# The entrypoint (simple-trainer.py) will only run on the head node.
# SLURM_GPUS_PER_TASK is unset when the job requests no GPUs, so default
# it to 0 below to avoid passing an empty value to --num-gpus.
srun --nodes="$SLURM_JOB_NUM_NODES" --ntasks="$SLURM_JOB_NUM_NODES" \
    ray symmetric-run \
    --address "$ip_head" \
    --min-nodes "$SLURM_JOB_NUM_NODES" \
    --num-cpus="${SLURM_CPUS_PER_TASK}" \
    --num-gpus="${SLURM_GPUS_PER_TASK:-0}" \
    -- \
    python -u simple-trainer.py "$SLURM_CPUS_PER_TASK"
# __doc_symmetric_run_end__
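
# For reference, `ray symmetric-run` replaces the older manual pattern of
# starting the head and each worker yourself (a sketch, not meant to run here):
#   srun --nodes=1 --ntasks=1 -w "$head_node" \
#       ray start --head --port=$port --block &
#   # ...then, for every remaining node in nodes_array:
#   srun --nodes=1 --ntasks=1 -w "$node" ray start --address "$ip_head" --block &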

# __doc_script_start__
# The entrypoint script (simple-trainer.py) is run on the head node by `ray symmetric-run`.
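
# Submit this job script with sbatch, e.g.:
#   sbatch slurm-basic.sh
# The trainer receives the allocated CPU count ($SLURM_CPUS_PER_TASK) as its
# first command-line argument.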