Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions nemo_run/run/ray/templates/ray.sub.j2
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,33 @@ while ! (srun {% if heterogeneous %}--het-group=0 {% endif %}--overlap --nodes=1
elapsed_time=$((elapsed_time + 2))
done

# Run sandbox in parallel across all allocated nodes when explicitly configured.
# Requires both SANDBOX_CONTAINER (container image) and SANDBOX_COMMAND (command
# run inside the container); if either is unset/empty, startup is skipped.
if [[ -n "${SANDBOX_CONTAINER:-}" ]] && [[ -n "${SANDBOX_COMMAND:-}" ]]; then
  SANDBOX_PORTS_DIR="${SANDBOX_PORTS_DIR:-$LOG_DIR/sandbox}"
  mkdir -p "$SANDBOX_PORTS_DIR"
  echo "[INFO] Starting sandbox across all nodes in parallel (ports_dir=$SANDBOX_PORTS_DIR)..."
  # One task per node, overlapping the Ray job's allocation. Values in
  # --export are quoted so paths containing spaces or glob characters do
  # not undergo word splitting before srun parses the KEY=VAL list.
  srun {% if heterogeneous %}--het-group=0 {% endif %}--output "$LOG_DIR/sandbox.log" \
    --error "$LOG_DIR/sandbox.log" \
    --container-image="$SANDBOX_CONTAINER" \
    --container-mounts="$SANDBOX_PORTS_DIR:$SANDBOX_PORTS_DIR" \
    --container-env=SANDBOX_PORTS_DIR,LISTEN_PORT,NGINX_PORT \
    --no-container-mount-home \
    --mpi=pmix \
    -A "$SLURM_JOB_ACCOUNT" \
    -p "$SLURM_JOB_PARTITION" \
    --wait=60 \
    --kill-on-bad-exit=1 \
    --overlap \
    --nodes="$SLURM_JOB_NUM_NODES" \
    --ntasks-per-node=1 \
    --export=ALL,SANDBOX_PORTS_DIR="$SANDBOX_PORTS_DIR",LISTEN_PORT="${SANDBOX_PORT:-6000}",NGINX_PORT="${SANDBOX_PORT:-6000}" \
    bash -c "$SANDBOX_COMMAND" &
  # Track the background srun so the cleanup/wait logic elsewhere can reap it.
  SRUN_PIDS["sandbox"]=$!
  echo "[INFO] Sandbox started in background (PID: ${SRUN_PIDS["sandbox"]})"
else
  echo "[INFO] SANDBOX_CONTAINER or SANDBOX_COMMAND not defined, skipping sandbox startup"
fi

NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))

# Start Ray worker nodes
Expand Down
27 changes: 27 additions & 0 deletions nemo_run/run/ray/templates/ray_enroot.sub.j2
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,33 @@ while true; do
elapsed_time=$((elapsed_time + 2))
done

# Run sandbox in parallel (overlap) when explicitly configured.
# NOTE(review): an earlier comment said "on the head node", but the srun below
# launches one task on EVERY allocated node (--nodes=$SLURM_JOB_NUM_NODES) —
# confirm which behavior is intended.
# Requires both SANDBOX_CONTAINER (container image) and SANDBOX_COMMAND (command
# run inside the container); if either is unset/empty, startup is skipped.
if [[ -n "${SANDBOX_CONTAINER:-}" ]] && [[ -n "${SANDBOX_COMMAND:-}" ]]; then
  SANDBOX_PORTS_DIR="${SANDBOX_PORTS_DIR:-$LOG_DIR/sandbox}"
  mkdir -p "$SANDBOX_PORTS_DIR"
  echo "[INFO] Starting sandbox on head node in parallel (ports_dir=$SANDBOX_PORTS_DIR)..."
  # Values in --export are quoted so paths containing spaces or glob
  # characters do not undergo word splitting before srun parses them.
  srun --output "$LOG_DIR/sandbox.log" \
    --error "$LOG_DIR/sandbox.log" \
    --container-image="$SANDBOX_CONTAINER" \
    --container-mounts="$SANDBOX_PORTS_DIR:$SANDBOX_PORTS_DIR" \
    --container-env=SANDBOX_PORTS_DIR,LISTEN_PORT,NGINX_PORT \
    --no-container-mount-home \
    --mpi=pmix \
    -A "$SLURM_JOB_ACCOUNT" \
    -p "$SLURM_JOB_PARTITION" \
    --wait=60 \
    --kill-on-bad-exit=1 \
    --overlap \
    --nodes="$SLURM_JOB_NUM_NODES" \
    --ntasks-per-node=1 \
    --export=ALL,SANDBOX_PORTS_DIR="$SANDBOX_PORTS_DIR",LISTEN_PORT="${SANDBOX_PORT:-6000}",NGINX_PORT="${SANDBOX_PORT:-6000}" \
    bash -c "$SANDBOX_COMMAND" &
  # Track the background srun so the cleanup/wait logic elsewhere can reap it.
  SRUN_PIDS["sandbox"]=$!
  echo "[INFO] Sandbox started in background (PID: ${SRUN_PIDS["sandbox"]})"
else
  echo "[INFO] SANDBOX_CONTAINER or SANDBOX_COMMAND not defined, skipping sandbox startup"
fi

NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))

# Start Ray worker nodes
Expand Down
27 changes: 27 additions & 0 deletions test/core/execution/artifacts/expected_ray_cluster.sub
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,33 @@ while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STAR
elapsed_time=$((elapsed_time + 2))
done

# Run sandbox in parallel across all allocated nodes when explicitly configured.
# Requires both SANDBOX_CONTAINER (container image) and SANDBOX_COMMAND (command
# run inside the container); if either is unset/empty, startup is skipped.
if [[ -n "${SANDBOX_CONTAINER:-}" ]] && [[ -n "${SANDBOX_COMMAND:-}" ]]; then
  SANDBOX_PORTS_DIR="${SANDBOX_PORTS_DIR:-$LOG_DIR/sandbox}"
  mkdir -p "$SANDBOX_PORTS_DIR"
  echo "[INFO] Starting sandbox across all nodes in parallel (ports_dir=$SANDBOX_PORTS_DIR)..."
  # One task per node, overlapping the Ray job's allocation. Values in
  # --export are quoted so paths containing spaces or glob characters do
  # not undergo word splitting before srun parses the KEY=VAL list.
  srun --output "$LOG_DIR/sandbox.log" \
    --error "$LOG_DIR/sandbox.log" \
    --container-image="$SANDBOX_CONTAINER" \
    --container-mounts="$SANDBOX_PORTS_DIR:$SANDBOX_PORTS_DIR" \
    --container-env=SANDBOX_PORTS_DIR,LISTEN_PORT,NGINX_PORT \
    --no-container-mount-home \
    --mpi=pmix \
    -A "$SLURM_JOB_ACCOUNT" \
    -p "$SLURM_JOB_PARTITION" \
    --wait=60 \
    --kill-on-bad-exit=1 \
    --overlap \
    --nodes="$SLURM_JOB_NUM_NODES" \
    --ntasks-per-node=1 \
    --export=ALL,SANDBOX_PORTS_DIR="$SANDBOX_PORTS_DIR",LISTEN_PORT="${SANDBOX_PORT:-6000}",NGINX_PORT="${SANDBOX_PORT:-6000}" \
    bash -c "$SANDBOX_COMMAND" &
  # Track the background srun so the cleanup/wait logic elsewhere can reap it.
  SRUN_PIDS["sandbox"]=$!
  echo "[INFO] Sandbox started in background (PID: ${SRUN_PIDS["sandbox"]})"
else
  echo "[INFO] SANDBOX_CONTAINER or SANDBOX_COMMAND not defined, skipping sandbox startup"
fi

NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))

# Start Ray worker nodes
Expand Down
29 changes: 28 additions & 1 deletion test/core/execution/artifacts/expected_ray_cluster_enroot.sub
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,33 @@ while true; do
elapsed_time=$((elapsed_time + 2))
done

# Run sandbox in parallel (overlap) when explicitly configured.
# NOTE(review): an earlier comment said "on the head node", but the srun below
# launches one task on EVERY allocated node (--nodes=$SLURM_JOB_NUM_NODES) —
# confirm which behavior is intended.
# Requires both SANDBOX_CONTAINER (container image) and SANDBOX_COMMAND (command
# run inside the container); if either is unset/empty, startup is skipped.
if [[ -n "${SANDBOX_CONTAINER:-}" ]] && [[ -n "${SANDBOX_COMMAND:-}" ]]; then
  SANDBOX_PORTS_DIR="${SANDBOX_PORTS_DIR:-$LOG_DIR/sandbox}"
  mkdir -p "$SANDBOX_PORTS_DIR"
  echo "[INFO] Starting sandbox on head node in parallel (ports_dir=$SANDBOX_PORTS_DIR)..."
  # Values in --export are quoted so paths containing spaces or glob
  # characters do not undergo word splitting before srun parses them.
  srun --output "$LOG_DIR/sandbox.log" \
    --error "$LOG_DIR/sandbox.log" \
    --container-image="$SANDBOX_CONTAINER" \
    --container-mounts="$SANDBOX_PORTS_DIR:$SANDBOX_PORTS_DIR" \
    --container-env=SANDBOX_PORTS_DIR,LISTEN_PORT,NGINX_PORT \
    --no-container-mount-home \
    --mpi=pmix \
    -A "$SLURM_JOB_ACCOUNT" \
    -p "$SLURM_JOB_PARTITION" \
    --wait=60 \
    --kill-on-bad-exit=1 \
    --overlap \
    --nodes="$SLURM_JOB_NUM_NODES" \
    --ntasks-per-node=1 \
    --export=ALL,SANDBOX_PORTS_DIR="$SANDBOX_PORTS_DIR",LISTEN_PORT="${SANDBOX_PORT:-6000}",NGINX_PORT="${SANDBOX_PORT:-6000}" \
    bash -c "$SANDBOX_COMMAND" &
  # Track the background srun so the cleanup/wait logic elsewhere can reap it.
  SRUN_PIDS["sandbox"]=$!
  echo "[INFO] Sandbox started in background (PID: ${SRUN_PIDS["sandbox"]})"
else
  echo "[INFO] SANDBOX_CONTAINER or SANDBOX_COMMAND not defined, skipping sandbox startup"
fi

NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))

# Start Ray worker nodes
Expand Down Expand Up @@ -495,4 +522,4 @@ EOF
echo " bash $CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh 1 # to attach to worker 1"
echo " bash $CLUSTER_DIR/scripts/${SLURM_JOB_ID}-attach.sh 2 # to attach to worker 2, etc."
sleep infinity
fi
fi
27 changes: 27 additions & 0 deletions test/core/execution/artifacts/expected_ray_cluster_ssh.sub
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,33 @@ while ! (srun --overlap --nodes=1 --ntasks=1 -w $head_node test -f $LOG_DIR/STAR
elapsed_time=$((elapsed_time + 2))
done

# Run sandbox in parallel across all allocated nodes when explicitly configured.
# Requires both SANDBOX_CONTAINER (container image) and SANDBOX_COMMAND (command
# run inside the container); if either is unset/empty, startup is skipped.
if [[ -n "${SANDBOX_CONTAINER:-}" ]] && [[ -n "${SANDBOX_COMMAND:-}" ]]; then
  SANDBOX_PORTS_DIR="${SANDBOX_PORTS_DIR:-$LOG_DIR/sandbox}"
  mkdir -p "$SANDBOX_PORTS_DIR"
  echo "[INFO] Starting sandbox across all nodes in parallel (ports_dir=$SANDBOX_PORTS_DIR)..."
  # One task per node, overlapping the Ray job's allocation. Values in
  # --export are quoted so paths containing spaces or glob characters do
  # not undergo word splitting before srun parses the KEY=VAL list.
  srun --output "$LOG_DIR/sandbox.log" \
    --error "$LOG_DIR/sandbox.log" \
    --container-image="$SANDBOX_CONTAINER" \
    --container-mounts="$SANDBOX_PORTS_DIR:$SANDBOX_PORTS_DIR" \
    --container-env=SANDBOX_PORTS_DIR,LISTEN_PORT,NGINX_PORT \
    --no-container-mount-home \
    --mpi=pmix \
    -A "$SLURM_JOB_ACCOUNT" \
    -p "$SLURM_JOB_PARTITION" \
    --wait=60 \
    --kill-on-bad-exit=1 \
    --overlap \
    --nodes="$SLURM_JOB_NUM_NODES" \
    --ntasks-per-node=1 \
    --export=ALL,SANDBOX_PORTS_DIR="$SANDBOX_PORTS_DIR",LISTEN_PORT="${SANDBOX_PORT:-6000}",NGINX_PORT="${SANDBOX_PORT:-6000}" \
    bash -c "$SANDBOX_COMMAND" &
  # Track the background srun so the cleanup/wait logic elsewhere can reap it.
  SRUN_PIDS["sandbox"]=$!
  echo "[INFO] Sandbox started in background (PID: ${SRUN_PIDS["sandbox"]})"
else
  echo "[INFO] SANDBOX_CONTAINER or SANDBOX_COMMAND not defined, skipping sandbox startup"
fi

NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))

# Start Ray worker nodes
Expand Down
27 changes: 27 additions & 0 deletions test/core/execution/artifacts/expected_ray_het_cluster.sub
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,33 @@ while ! (srun --het-group=0 --overlap --nodes=1 --ntasks=1 -w $head_node test -f
elapsed_time=$((elapsed_time + 2))
done

# Run sandbox in parallel across all allocated nodes when explicitly configured.
# Requires both SANDBOX_CONTAINER (container image) and SANDBOX_COMMAND (command
# run inside the container); if either is unset/empty, startup is skipped.
if [[ -n "${SANDBOX_CONTAINER:-}" ]] && [[ -n "${SANDBOX_COMMAND:-}" ]]; then
  SANDBOX_PORTS_DIR="${SANDBOX_PORTS_DIR:-$LOG_DIR/sandbox}"
  mkdir -p "$SANDBOX_PORTS_DIR"
  echo "[INFO] Starting sandbox across all nodes in parallel (ports_dir=$SANDBOX_PORTS_DIR)..."
  # Heterogeneous job: target het-group 0 only. One task per node,
  # overlapping the Ray job's allocation. Values in --export are quoted so
  # paths containing spaces or glob characters do not undergo word splitting.
  srun --het-group=0 --output "$LOG_DIR/sandbox.log" \
    --error "$LOG_DIR/sandbox.log" \
    --container-image="$SANDBOX_CONTAINER" \
    --container-mounts="$SANDBOX_PORTS_DIR:$SANDBOX_PORTS_DIR" \
    --container-env=SANDBOX_PORTS_DIR,LISTEN_PORT,NGINX_PORT \
    --no-container-mount-home \
    --mpi=pmix \
    -A "$SLURM_JOB_ACCOUNT" \
    -p "$SLURM_JOB_PARTITION" \
    --wait=60 \
    --kill-on-bad-exit=1 \
    --overlap \
    --nodes="$SLURM_JOB_NUM_NODES" \
    --ntasks-per-node=1 \
    --export=ALL,SANDBOX_PORTS_DIR="$SANDBOX_PORTS_DIR",LISTEN_PORT="${SANDBOX_PORT:-6000}",NGINX_PORT="${SANDBOX_PORT:-6000}" \
    bash -c "$SANDBOX_COMMAND" &
  # Track the background srun so the cleanup/wait logic elsewhere can reap it.
  SRUN_PIDS["sandbox"]=$!
  echo "[INFO] Sandbox started in background (PID: ${SRUN_PIDS["sandbox"]})"
else
  echo "[INFO] SANDBOX_CONTAINER or SANDBOX_COMMAND not defined, skipping sandbox startup"
fi

NUM_ACTORS=$((gpus_per_node * SLURM_JOB_NUM_NODES))

# Start Ray worker nodes
Expand Down
7 changes: 5 additions & 2 deletions test/run/ray/test_slurm_ray_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,8 +285,11 @@ def test_container_configurations(self):

# Should use cluster_dir as default workdir
assert "--container-workdir=/tmp/test_jobs/test-ray-cluster" in script
# Should not contain container-image flag when none specified
assert "--container-image" not in script
# The main Ray cluster script should not contain a container-image flag
# when none is specified on the executor. Ignore the optional sandbox
# stanza, which may carry its own container-image placeholder.
pre_sandbox_script = script.split("# Run sandbox", 1)[0]
assert "--container-image" not in pre_sandbox_script

def test_special_mount_handling(self):
"""Test materialize handles special RUNDIR_SPECIAL_NAME mounts."""
Expand Down
Loading