From 0a0c50ff00b58795e4544d4e4ce00d38e7126822 Mon Sep 17 00:00:00 2001
From: Wei Du
Date: Fri, 17 Apr 2026 07:40:18 -0700
Subject: [PATCH 1/2] add support for sandbox

Signed-off-by: Wei Du
---
 nemo_run/run/ray/templates/ray.sub.j2        | 27 ++++++++++++++++++++
 nemo_run/run/ray/templates/ray_enroot.sub.j2 | 27 ++++++++++++++++++++
 2 files changed, 54 insertions(+)

diff --git a/nemo_run/run/ray/templates/ray.sub.j2 b/nemo_run/run/ray/templates/ray.sub.j2
index 81a25c2c..e3e922c3 100644
--- a/nemo_run/run/ray/templates/ray.sub.j2
+++ b/nemo_run/run/ray/templates/ray.sub.j2
@@ -303,6 +303,33 @@ while ! (srun {% if heterogeneous %}--het-group=0 {% endif %}--overlap --nodes=1
     elapsed_time=$((elapsed_time + 2))
 done
 
+# Run sandbox in parallel across all allocated nodes when explicitly configured.
+if [[ -n "${SANDBOX_CONTAINER:-}" ]] && [[ -n "${SANDBOX_COMMAND:-}" ]]; then
+    SANDBOX_PORTS_DIR="${SANDBOX_PORTS_DIR:-$LOG_DIR/sandbox}"
+    mkdir -p "$SANDBOX_PORTS_DIR"
+    echo "[INFO] Starting sandbox across all nodes in parallel (ports_dir=$SANDBOX_PORTS_DIR)..."
+    srun {% if heterogeneous %}--het-group=0 {% endif %}--output "$LOG_DIR/sandbox.log" \
+        --error "$LOG_DIR/sandbox.log" \
+        --container-image="$SANDBOX_CONTAINER" \
+        --container-mounts="$SANDBOX_PORTS_DIR:$SANDBOX_PORTS_DIR" \
+        --container-env=SANDBOX_PORTS_DIR,LISTEN_PORT,NGINX_PORT \
+        --no-container-mount-home \
+        --mpi=pmix \
+        -A "$SLURM_JOB_ACCOUNT" \
+        -p "$SLURM_JOB_PARTITION" \
+        --wait=60 \
+        --kill-on-bad-exit=1 \
+        --overlap \
+        --nodes="$SLURM_JOB_NUM_NODES" \
+        --ntasks-per-node=1 \
+        --export=ALL,SANDBOX_PORTS_DIR=$SANDBOX_PORTS_DIR,LISTEN_PORT=${SANDBOX_PORT:-6000},NGINX_PORT=${SANDBOX_PORT:-6000} \
+        bash -c "$SANDBOX_COMMAND" &
+    SRUN_PIDS["sandbox"]=$!
+    echo "[INFO] Sandbox started in background (PID: ${SRUN_PIDS["sandbox"]})"
+else
+    echo "[INFO] SANDBOX_CONTAINER or SANDBOX_COMMAND not defined, skipping sandbox startup"
+fi
+
 NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))
 
 # Start Ray worker nodes
diff --git a/nemo_run/run/ray/templates/ray_enroot.sub.j2 b/nemo_run/run/ray/templates/ray_enroot.sub.j2
index 80f62057..90b1505b 100644
--- a/nemo_run/run/ray/templates/ray_enroot.sub.j2
+++ b/nemo_run/run/ray/templates/ray_enroot.sub.j2
@@ -312,6 +312,33 @@ while true; do
     elapsed_time=$((elapsed_time + 2))
 done
 
+# Run sandbox in parallel on the head node (overlap) when explicitly configured.
+if [[ -n "${SANDBOX_CONTAINER:-}" ]] && [[ -n "${SANDBOX_COMMAND:-}" ]]; then
+    SANDBOX_PORTS_DIR="${SANDBOX_PORTS_DIR:-$LOG_DIR/sandbox}"
+    mkdir -p "$SANDBOX_PORTS_DIR"
+    echo "[INFO] Starting sandbox on head node in parallel (ports_dir=$SANDBOX_PORTS_DIR)..."
+    srun --output "$LOG_DIR/sandbox.log" \
+        --error "$LOG_DIR/sandbox.log" \
+        --container-image="$SANDBOX_CONTAINER" \
+        --container-mounts="$SANDBOX_PORTS_DIR:$SANDBOX_PORTS_DIR" \
+        --container-env=SANDBOX_PORTS_DIR,LISTEN_PORT,NGINX_PORT \
+        --no-container-mount-home \
+        --mpi=pmix \
+        -A "$SLURM_JOB_ACCOUNT" \
+        -p "$SLURM_JOB_PARTITION" \
+        --wait=60 \
+        --kill-on-bad-exit=1 \
+        --overlap \
+        --nodes="$SLURM_JOB_NUM_NODES" \
+        --ntasks-per-node=1 \
+        --export=ALL,SANDBOX_PORTS_DIR=$SANDBOX_PORTS_DIR,LISTEN_PORT=${SANDBOX_PORT:-6000},NGINX_PORT=${SANDBOX_PORT:-6000} \
+        bash -c "$SANDBOX_COMMAND" &
+    SRUN_PIDS["sandbox"]=$!
+    echo "[INFO] Sandbox started in background (PID: ${SRUN_PIDS["sandbox"]})"
+else
+    echo "[INFO] SANDBOX_CONTAINER or SANDBOX_COMMAND not defined, skipping sandbox startup"
+fi
+
 NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))
 
 # Start Ray worker nodes

From 4db09942860da81cf3c3dd67774fcdab0c37c224 Mon Sep 17 00:00:00 2001
From: Wei Du
Date: Fri, 17 Apr 2026 11:36:30 -0700
Subject: [PATCH 2/2] Add sandbox support to Ray templates and update tests

Signed-off-by: Wei Du
---
 .../artifacts/expected_ray_cluster.sub        | 27 +++++++++++++++++
 .../artifacts/expected_ray_cluster_enroot.sub | 29 ++++++++++++++++++-
 .../artifacts/expected_ray_cluster_ssh.sub    | 27 +++++++++++++++++
 .../artifacts/expected_ray_het_cluster.sub    | 27 +++++++++++++++++
 test/run/ray/test_slurm_ray_request.py        |  7 +++--
 5 files changed, 114 insertions(+), 3 deletions(-)

diff --git a/test/core/execution/artifacts/expected_ray_cluster.sub b/test/core/execution/artifacts/expected_ray_cluster.sub
index 981640bc..6149136c 100644
--- a/test/core/execution/artifacts/expected_ray_cluster.sub
+++ b/test/core/execution/artifacts/expected_ray_cluster.sub
@@ -290,6 +290,33 @@ while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STAR
     elapsed_time=$((elapsed_time + 2))
 done
 
+# Run sandbox in parallel across all allocated nodes when explicitly configured.
+if [[ -n "${SANDBOX_CONTAINER:-}" ]] && [[ -n "${SANDBOX_COMMAND:-}" ]]; then
+    SANDBOX_PORTS_DIR="${SANDBOX_PORTS_DIR:-$LOG_DIR/sandbox}"
+    mkdir -p "$SANDBOX_PORTS_DIR"
+    echo "[INFO] Starting sandbox across all nodes in parallel (ports_dir=$SANDBOX_PORTS_DIR)..."
+    srun --output "$LOG_DIR/sandbox.log" \
+        --error "$LOG_DIR/sandbox.log" \
+        --container-image="$SANDBOX_CONTAINER" \
+        --container-mounts="$SANDBOX_PORTS_DIR:$SANDBOX_PORTS_DIR" \
+        --container-env=SANDBOX_PORTS_DIR,LISTEN_PORT,NGINX_PORT \
+        --no-container-mount-home \
+        --mpi=pmix \
+        -A "$SLURM_JOB_ACCOUNT" \
+        -p "$SLURM_JOB_PARTITION" \
+        --wait=60 \
+        --kill-on-bad-exit=1 \
+        --overlap \
+        --nodes="$SLURM_JOB_NUM_NODES" \
+        --ntasks-per-node=1 \
+        --export=ALL,SANDBOX_PORTS_DIR=$SANDBOX_PORTS_DIR,LISTEN_PORT=${SANDBOX_PORT:-6000},NGINX_PORT=${SANDBOX_PORT:-6000} \
+        bash -c "$SANDBOX_COMMAND" &
+    SRUN_PIDS["sandbox"]=$!
+    echo "[INFO] Sandbox started in background (PID: ${SRUN_PIDS["sandbox"]})"
+else
+    echo "[INFO] SANDBOX_CONTAINER or SANDBOX_COMMAND not defined, skipping sandbox startup"
+fi
+
 NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))
 
 # Start Ray worker nodes
diff --git a/test/core/execution/artifacts/expected_ray_cluster_enroot.sub b/test/core/execution/artifacts/expected_ray_cluster_enroot.sub
index 227afe8e..9e04ac7f 100644
--- a/test/core/execution/artifacts/expected_ray_cluster_enroot.sub
+++ b/test/core/execution/artifacts/expected_ray_cluster_enroot.sub
@@ -306,6 +306,33 @@ while true; do
     elapsed_time=$((elapsed_time + 2))
 done
 
+# Run sandbox in parallel on the head node (overlap) when explicitly configured.
+if [[ -n "${SANDBOX_CONTAINER:-}" ]] && [[ -n "${SANDBOX_COMMAND:-}" ]]; then
+    SANDBOX_PORTS_DIR="${SANDBOX_PORTS_DIR:-$LOG_DIR/sandbox}"
+    mkdir -p "$SANDBOX_PORTS_DIR"
+    echo "[INFO] Starting sandbox on head node in parallel (ports_dir=$SANDBOX_PORTS_DIR)..."
+    srun --output "$LOG_DIR/sandbox.log" \
+        --error "$LOG_DIR/sandbox.log" \
+        --container-image="$SANDBOX_CONTAINER" \
+        --container-mounts="$SANDBOX_PORTS_DIR:$SANDBOX_PORTS_DIR" \
+        --container-env=SANDBOX_PORTS_DIR,LISTEN_PORT,NGINX_PORT \
+        --no-container-mount-home \
+        --mpi=pmix \
+        -A "$SLURM_JOB_ACCOUNT" \
+        -p "$SLURM_JOB_PARTITION" \
+        --wait=60 \
+        --kill-on-bad-exit=1 \
+        --overlap \
+        --nodes="$SLURM_JOB_NUM_NODES" \
+        --ntasks-per-node=1 \
+        --export=ALL,SANDBOX_PORTS_DIR=$SANDBOX_PORTS_DIR,LISTEN_PORT=${SANDBOX_PORT:-6000},NGINX_PORT=${SANDBOX_PORT:-6000} \
+        bash -c "$SANDBOX_COMMAND" &
+    SRUN_PIDS["sandbox"]=$!
+    echo "[INFO] Sandbox started in background (PID: ${SRUN_PIDS["sandbox"]})"
+else
+    echo "[INFO] SANDBOX_CONTAINER or SANDBOX_COMMAND not defined, skipping sandbox startup"
+fi
+
 NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))
 
 # Start Ray worker nodes
@@ -495,4 +522,4 @@ EOF
 echo " bash $CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh 1 # to attach to worker 1"
 echo " bash $CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh 2 # to attach to worker 2, etc."
 sleep infinity
-fi
+fi
\ No newline at end of file
diff --git a/test/core/execution/artifacts/expected_ray_cluster_ssh.sub b/test/core/execution/artifacts/expected_ray_cluster_ssh.sub
index e0d21e39..d8278537 100644
--- a/test/core/execution/artifacts/expected_ray_cluster_ssh.sub
+++ b/test/core/execution/artifacts/expected_ray_cluster_ssh.sub
@@ -295,6 +295,33 @@ while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STAR
     elapsed_time=$((elapsed_time + 2))
 done
 
+# Run sandbox in parallel across all allocated nodes when explicitly configured.
+if [[ -n "${SANDBOX_CONTAINER:-}" ]] && [[ -n "${SANDBOX_COMMAND:-}" ]]; then
+    SANDBOX_PORTS_DIR="${SANDBOX_PORTS_DIR:-$LOG_DIR/sandbox}"
+    mkdir -p "$SANDBOX_PORTS_DIR"
+    echo "[INFO] Starting sandbox across all nodes in parallel (ports_dir=$SANDBOX_PORTS_DIR)..."
+    srun --output "$LOG_DIR/sandbox.log" \
+        --error "$LOG_DIR/sandbox.log" \
+        --container-image="$SANDBOX_CONTAINER" \
+        --container-mounts="$SANDBOX_PORTS_DIR:$SANDBOX_PORTS_DIR" \
+        --container-env=SANDBOX_PORTS_DIR,LISTEN_PORT,NGINX_PORT \
+        --no-container-mount-home \
+        --mpi=pmix \
+        -A "$SLURM_JOB_ACCOUNT" \
+        -p "$SLURM_JOB_PARTITION" \
+        --wait=60 \
+        --kill-on-bad-exit=1 \
+        --overlap \
+        --nodes="$SLURM_JOB_NUM_NODES" \
+        --ntasks-per-node=1 \
+        --export=ALL,SANDBOX_PORTS_DIR=$SANDBOX_PORTS_DIR,LISTEN_PORT=${SANDBOX_PORT:-6000},NGINX_PORT=${SANDBOX_PORT:-6000} \
+        bash -c "$SANDBOX_COMMAND" &
+    SRUN_PIDS["sandbox"]=$!
+    echo "[INFO] Sandbox started in background (PID: ${SRUN_PIDS["sandbox"]})"
+else
+    echo "[INFO] SANDBOX_CONTAINER or SANDBOX_COMMAND not defined, skipping sandbox startup"
+fi
+
 NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))
 
 # Start Ray worker nodes
diff --git a/test/core/execution/artifacts/expected_ray_het_cluster.sub b/test/core/execution/artifacts/expected_ray_het_cluster.sub
index 9a1a0cd0..51bdd485 100644
--- a/test/core/execution/artifacts/expected_ray_het_cluster.sub
+++ b/test/core/execution/artifacts/expected_ray_het_cluster.sub
@@ -319,6 +319,33 @@ while ! (srun --het-group=0 --overlap --nodes=1 --ntasks=1 -w $head_node test -f
     elapsed_time=$((elapsed_time + 2))
 done
 
+# Run sandbox in parallel across all allocated nodes when explicitly configured.
+if [[ -n "${SANDBOX_CONTAINER:-}" ]] && [[ -n "${SANDBOX_COMMAND:-}" ]]; then
+    SANDBOX_PORTS_DIR="${SANDBOX_PORTS_DIR:-$LOG_DIR/sandbox}"
+    mkdir -p "$SANDBOX_PORTS_DIR"
+    echo "[INFO] Starting sandbox across all nodes in parallel (ports_dir=$SANDBOX_PORTS_DIR)..."
+    srun --het-group=0 --output "$LOG_DIR/sandbox.log" \
+        --error "$LOG_DIR/sandbox.log" \
+        --container-image="$SANDBOX_CONTAINER" \
+        --container-mounts="$SANDBOX_PORTS_DIR:$SANDBOX_PORTS_DIR" \
+        --container-env=SANDBOX_PORTS_DIR,LISTEN_PORT,NGINX_PORT \
+        --no-container-mount-home \
+        --mpi=pmix \
+        -A "$SLURM_JOB_ACCOUNT" \
+        -p "$SLURM_JOB_PARTITION" \
+        --wait=60 \
+        --kill-on-bad-exit=1 \
+        --overlap \
+        --nodes="$SLURM_JOB_NUM_NODES" \
+        --ntasks-per-node=1 \
+        --export=ALL,SANDBOX_PORTS_DIR=$SANDBOX_PORTS_DIR,LISTEN_PORT=${SANDBOX_PORT:-6000},NGINX_PORT=${SANDBOX_PORT:-6000} \
+        bash -c "$SANDBOX_COMMAND" &
+    SRUN_PIDS["sandbox"]=$!
+    echo "[INFO] Sandbox started in background (PID: ${SRUN_PIDS["sandbox"]})"
+else
+    echo "[INFO] SANDBOX_CONTAINER or SANDBOX_COMMAND not defined, skipping sandbox startup"
+fi
+
 NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))
 
 # Start Ray worker nodes
diff --git a/test/run/ray/test_slurm_ray_request.py b/test/run/ray/test_slurm_ray_request.py
index 652d283b..1f8d0cc8 100644
--- a/test/run/ray/test_slurm_ray_request.py
+++ b/test/run/ray/test_slurm_ray_request.py
@@ -285,8 +285,11 @@ def test_container_configurations(self):
         # Should use cluster_dir as default workdir
         assert "--container-workdir=/tmp/test_jobs/test-ray-cluster" in script
 
-        # Should not contain container-image flag when none specified
-        assert "--container-image" not in script
+        # The main Ray cluster script should not contain a container-image flag
+        # when none is specified on the executor. Ignore the optional sandbox
+        # stanza, which may carry its own container-image placeholder.
+        pre_sandbox_script = script.split("# Run sandbox", 1)[0]
+        assert "--container-image" not in pre_sandbox_script
 
     def test_special_mount_handling(self):
         """Test materialize handles special RUNDIR_SPECIAL_NAME mounts."""