Closed

Commits (76)
a28da92
iter
ishandhanani Jul 7, 2025
be7e2b6
deleted in here
ishandhanani Jul 7, 2025
e67abd2
add
ishandhanani Jul 7, 2025
78047a9
shell script
ishandhanani Jul 7, 2025
4a3c140
updated jinja
ishandhanani Jul 7, 2025
647e7b7
readme and submitter
ishandhanani Jul 7, 2025
e8c3b46
go
ishandhanani Jul 7, 2025
7d33f1e
executable
ishandhanani Jul 7, 2025
e56a0f4
bool check
ishandhanani Jul 8, 2025
ba0cc3c
Added env vars
ishandhanani Jul 8, 2025
d0316c2
go
ishandhanani Jul 8, 2025
b0cfbd1
go
ishandhanani Jul 8, 2025
bcaf9f5
pc
ishandhanani Jul 8, 2025
1265ac8
mypy
ishandhanani Jul 8, 2025
bf8d103
bump
ishandhanani Jul 8, 2025
51ca695
pc
ishandhanani Jul 8, 2025
40c5d33
cpy
ishandhanani Jul 8, 2025
bbb4dc6
Merge branch 'main' into ishan/more-slurm-targets
ishandhanani Jul 8, 2025
c1a8e5a
combined echo and print based on pr comment
ishandhanani Jul 8, 2025
8654c27
Merge branch 'main' into ishan/more-slurm-targets
ishandhanani Jul 8, 2025
2033b1f
option1
ishandhanani Jul 8, 2025
4433064
option1
ishandhanani Jul 8, 2025
d827ac7
works on h100 and gb200 cluster
ishandhanani Jul 8, 2025
c2fe7b0
added nats etcd ingress setup function
ishandhanani Jul 8, 2025
30b9d8e
precommit
ishandhanani Jul 8, 2025
3a66d66
add server
ishandhanani Jul 8, 2025
93dbf19
Merge branch 'ishan/more-slurm-targets' of github.com:ai-dynamo/dynam…
ishandhanani Jul 8, 2025
62f2ea5
bump time limit
ishandhanani Jul 9, 2025
c7d763f
Merge branch 'main' into ishan/more-slurm-targets
ishandhanani Jul 10, 2025
ec9aa13
fix mooncake env vars on gb200
ishandhanani Jul 10, 2025
75d2a25
bump
ishandhanani Jul 10, 2025
d2ae39f
use sglang rust balancer
ishandhanani Jul 10, 2025
4f22a17
fix
ishandhanani Jul 10, 2025
7cac87e
another
ishandhanani Jul 10, 2025
c9c5e26
gb 200 but nixl
ishandhanani Jul 11, 2025
1afee3c
bump
ishandhanani Jul 11, 2025
3da7468
bump
ishandhanani Jul 11, 2025
d32c09a
pc
ishandhanani Jul 11, 2025
d9cc3ff
go
ishandhanani Jul 11, 2025
dc82cab
bump
ishandhanani Jul 13, 2025
efb01b1
bump
ishandhanani Jul 13, 2025
ccc9ff7
Merge branch 'main' into ishan/more-slurm-targets
ishandhanani Jul 14, 2025
2db0ebc
tot dynamo
ishandhanani Jul 14, 2025
2dc41bd
Merge branch 'main' into ishan/more-slurm-targets
ishandhanani Jul 15, 2025
f275433
bump
ishandhanani Jul 15, 2025
1b272af
update gb200 deployment instructions
ishandhanani Jul 15, 2025
de7bc22
untested gb200 + dynamo command
ishandhanani Jul 15, 2025
2d9e621
bump
ishandhanani Jul 15, 2025
74f6ffa
path swap
ishandhanani Jul 15, 2025
22a227f
ok
ishandhanani Jul 15, 2025
f32a422
cmd
ishandhanani Jul 15, 2025
0a02c8a
try something else
ishandhanani Jul 15, 2025
a24763d
keep us as root to install mooncake deps as it needs sudo
ishandhanani Jul 15, 2025
d6a6a3e
try
ishandhanani Jul 15, 2025
8757832
revert to sgl balancer and fix scripts and add MC MNNVL flag
ishandhanani Jul 16, 2025
462c4a8
bump
ishandhanani Jul 16, 2025
3f5361e
init instructions for others
ishandhanani Jul 16, 2025
b258074
atempt
ishandhanani Jul 16, 2025
b195bbf
bump
ishandhanani Jul 16, 2025
1ebf122
Merge branch 'main' into ishan/more-slurm-targets
ishandhanani Jul 16, 2025
da162bc
bump
ishandhanani Jul 16, 2025
4266fe0
lel
ishandhanani Jul 16, 2025
f6ab522
bump
ishandhanani Jul 17, 2025
2c3f085
bump
ishandhanani Jul 17, 2025
aa123a1
bump
ishandhanani Jul 17, 2025
58f4d33
sadness
ishandhanani Jul 17, 2025
d4fc6be
so close to crash out
ishandhanani Jul 17, 2025
4394938
bump
ishandhanani Jul 17, 2025
dbb6ba4
Merge branch 'main' into ishan/more-slurm-targets
ishandhanani Jul 17, 2025
11d68c2
update the gb200 dockerfile
ishandhanani Jul 17, 2025
c155f1a
Merge branch 'main' into ishan/more-slurm-targets
ishandhanani Jul 22, 2025
b40e60e
nixl
ishandhanani Jul 29, 2025
e3b3237
Merge branch 'main' into ishan/more-slurm-targets
ishandhanani Jul 31, 2025
a4decf8
feat(docs): update DeepSeek-R1 instructions for GB200 and WideEP cont…
ishandhanani Jul 31, 2025
a454a80
Merge branch 'main' into ishan/more-slurm-targets
ishandhanani Jul 31, 2025
ca6040d
docs(sglang): update deployment instructions for GB200 and H100 models
ishandhanani Jul 31, 2025
172 changes: 172 additions & 0 deletions components/backends/sglang/docs/dsr1-wideep-gb200.md
@@ -0,0 +1,172 @@
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Running DeepSeek-R1 Disaggregated with WideEP on GB200s

Dynamo supports SGLang's GB200 implementation of wide expert parallelism and large-scale P/D disaggregation for DeepSeek-R1! You can read their blog post [here](https://lmsys.org/blog/2025-06-16-gb200-part-1/) for more details. Full end-to-end optimization is still a work in progress, but you can get this up and running with the following steps. In this example, we run 1 prefill worker across 2 GB200 nodes (4 GPUs each, 8 GPUs total) and 1 decode worker across 12 GB200 nodes (48 GPUs), for 56 GPUs overall.

## Instructions

1. Build the Dynamo container

```bash
cd $DYNAMO_ROOT
docker build \
-f container/Dockerfile.sglang-wideep \
-t dynamo-wideep-gb200 \
--build-arg MODE=blackwell \
--build-arg SGLANG_IMAGE_TAG=v0.4.9.post6-cu128-gb200 \
--build-arg ARCH=arm64 \
--build-arg ARCH_ALT=aarch64 \
. \
--no-cache
```

2. You can run this container on each 4xGB200 node using the following command.

> [!IMPORTANT]
> We recommend downloading DeepSeek-R1 and then mounting it to the container. You can find the model [here](https://huggingface.co/deepseek-ai/DeepSeek-R1)

```bash
docker run \
--gpus all \
-it \
--rm \
--network host \
--volume /PATH_TO_DSR1_MODEL/:/model/ \
--shm-size=10G \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
--ulimit nofile=65536:65536 \
--cap-add CAP_SYS_PTRACE \
--ipc host \
dynamo-wideep-gb200:latest
```
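
If you don't already have the weights downloaded, one way to fetch them ahead of time is with the Hugging Face CLI. This is a sketch, not part of the official instructions; the target directory is whatever path you mount as `/model/` above, and the full checkpoint is several hundred GB:

```bash
# Download the DeepSeek-R1 weights locally before starting the container.
pip install -U "huggingface_hub[cli]"
huggingface-cli download deepseek-ai/DeepSeek-R1 --local-dir /PATH_TO_DSR1_MODEL
```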

3. On the head prefill node, run the provided helper script. It generates the commands that start `nats-server` and `etcd`, and prints the environment variables to export on each node to make deployment easier.

```bash
./utils/gen_env_vars.sh
```
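
The exact output depends on your cluster, but the generated exports look roughly like the sketch below. The values are placeholders; `HEAD_PREFILL_NODE_IP` and `HEAD_DECODE_NODE_IP` are referenced by the worker commands in the following steps, while the NATS/etcd variable names are assumptions for illustration, not confirmed script output:

```bash
# Illustrative sketch only — the real values come from gen_env_vars.sh.
export HEAD_PREFILL_NODE_IP=10.0.0.1   # IP of the head prefill node
export HEAD_DECODE_NODE_IP=10.0.0.2    # IP of the head decode node
# Point every node at the nats-server and etcd started on the head prefill
# node (variable names assumed for illustration):
export NATS_SERVER=nats://${HEAD_PREFILL_NODE_IP}:4222
export ETCD_ENDPOINTS=http://${HEAD_PREFILL_NODE_IP}:2379
```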

4. Run the ingress and prefill worker

```bash
# run ingress
python3 -m dynamo.frontend --http-port=8000 &
# optionally run the http server that lets you flush the kv cache on all workers (useful when benchmarking)
python3 utils/sgl_http_server.py --ns dynamo &
# run prefill worker
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \
MC_TE_METRIC=true \
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
MC_FORCE_MNNVL=1 \
NCCL_MNNVL_ENABLE=1 \
NCCL_CUMEM_ENABLE=1 \
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
PYTHONUNBUFFERED=1 \
python3 components/worker.py \
--served-model-name deepseek-ai/DeepSeek-R1 \
--model-path /model/ \
--skip-tokenizer-init \
--trust-remote-code \
--disaggregation-mode prefill \
--dist-init-addr ${HEAD_PREFILL_NODE_IP}:29500 \
--disaggregation-bootstrap-port 30001 \
--disaggregation-transfer-backend nixl \
--nnodes 2 \
--node-rank 0 \
--tp-size 8 \
--dp-size 8 \
--enable-dp-attention \
--host 0.0.0.0 \
--decode-log-interval 1 \
--max-running-requests 6144 \
--context-length 2716 \
--disable-radix-cache \
--enable-deepep-moe \
--deepep-mode low_latency \
--moe-dense-tp-size 1 \
--enable-dp-lm-head \
--disable-shared-experts-fusion \
--ep-num-redundant-experts 32 \
--ep-dispatch-algorithm static \
--eplb-algorithm deepseek \
--attention-backend cutlass_mla \
--watchdog-timeout 1000000 \
--disable-cuda-graph \
--chunked-prefill-size 16384 \
--max-total-tokens 32768 \
--mem-fraction-static 0.8 \
--log-level debug
```

On the other prefill node, run the same command but change `--node-rank` to 1.

5. Run the decode worker on the head decode node

```bash
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=768 \
MC_TE_METRIC=true \
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \
SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \
SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \
NCCL_MNNVL_ENABLE=1 \
MC_FORCE_MNNVL=1 \
NCCL_CUMEM_ENABLE=1 \
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \
SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \
PYTHONUNBUFFERED=1 \
python3 components/decode_worker.py \
--served-model-name deepseek-ai/DeepSeek-R1 \
--model-path /model/ \
--skip-tokenizer-init \
--trust-remote-code \
--disaggregation-mode decode \
--dist-init-addr ${HEAD_DECODE_NODE_IP}:29500 \
--disaggregation-bootstrap-port 30001 \
--nnodes 12 \
--node-rank 0 \
--tp-size 48 \
--dp-size 48 \
--enable-dp-attention \
--host 0.0.0.0 \
--decode-log-interval 1 \
--max-running-requests 36864 \
--context-length 2716 \
--disable-radix-cache \
--enable-deepep-moe \
--deepep-mode low_latency \
--moe-dense-tp-size 1 \
--enable-dp-lm-head \
--cuda-graph-bs 768 \
--disable-shared-experts-fusion \
--ep-num-redundant-experts 32 \
--ep-dispatch-algorithm static \
--eplb-algorithm deepseek \
--attention-backend cutlass_mla \
--watchdog-timeout 1000000 \
--chunked-prefill-size 36864 \
--mem-fraction-static 0.82 \
--log-level debug
```

On the other decode nodes (this example has 12 decode nodes in total), run the same command but change `--node-rank` to 1 through 11.
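
Once all workers are up, you can smoke-test the deployment from any node that can reach the frontend. This sketch assumes the standard OpenAI-compatible route on the port used in step 4; the model name must match `--served-model-name`:

```bash
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "deepseek-ai/DeepSeek-R1",
    "messages": [{"role": "user", "content": "Hello from GB200!"}],
    "max_tokens": 32
  }'
```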
4 changes: 2 additions & 2 deletions components/backends/sglang/docs/dsr1-wideep-h100.md
@@ -57,7 +57,7 @@ In each container, you should be in the `/sgl-workspace/dynamo/components/backen

```bash
# run ingress
dynamo run in=http out=dyn &
python3 -m dynamo.frontend --http-port=8000 &
# optionally run the http server that allows you to flush the kv cache for all workers (see benchmarking section below)
python3 utils/sgl_http_server.py --ns dynamo &
# run prefill worker
@@ -93,7 +93,7 @@ python3 -m dynamo.sglang.worker \

On the other prefill nodes (since this example has 4 total prefill nodes), run the same command but change `--node-rank` to 1, 2, and 3

7. Run the decode worker on the head decode node
6. Run the decode worker on the head decode node

```bash
python3 -m dynamo.sglang.decode_worker \
43 changes: 40 additions & 3 deletions components/backends/sglang/slurm_jobs/README.md
@@ -61,6 +61,9 @@ For simplicity of the example, we will make some assumptions about your SLURM cl

## Usage

> [!NOTE]
> The logic for finding prefill and decode node IPs in [`job_script_template.j2`](job_script_template.j2) is still a work in progress. You may need to tweak the `srun`/`ip route`/`getent`/`awk` bits for your cluster, especially if your networking or hostname conventions differ. PRs and suggestions welcome.
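
For reference, the IP-detection approach in the template boils down to resolving a node's hostname and asking the routing table which source address would be used to reach it. A standalone sketch you can run on a compute node to check what it reports for itself (the hostname is a placeholder):

```bash
# Resolve this node's hostname to an address, then read the `src` field of
# the route toward it — i.e., the IP the node would use for itself.
node=gpu-node-001   # placeholder: substitute a hostname from your job's node list
addr=$(getent ahosts "$node" | grep STREAM | head -1 | awk '{print $1}')
ip route get "$addr" | awk '{for (i = 1; i <= NF; i++) if ($i == "src") print $(i + 1)}'
```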

1. **Submit a benchmark job**:
```bash
python submit_job_script.py \
@@ -85,20 +88,54 @@ For simplicity of the example, we will make some assumptions about your SLURM cl
- `--network-interface`: Network interface to use (default: `eth3`)
- `--job-name`: SLURM job name (default: `dynamo_setup`)
- `--time-limit`: Time limit in HH:MM:SS format (default: `01:00:00`)
- `--gpu-type`: GPU type to use, choices: `h100`, `gb200` (default: `h100`)
- `--use-sglang-commands`: Use SGLang commands instead of Dynamo (default: `false`)

**Note**: The script automatically calculates the total number of nodes needed based on `--prefill-nodes` and `--decode-nodes` parameters.

2. **Monitor job progress**:
2. **Example with different GPU types**:
```bash
# For H100 with Dynamo (default)
python submit_job_script.py \
--template job_script_template.j2 \
--model-dir /path/to/model \
--config-dir /path/to/configs \
--container-image container-image-uri \
--account your-slurm-account \
--gpu-type h100

# For GB200 with SGLang
python submit_job_script.py \
--template job_script_template.j2 \
--model-dir /path/to/model \
--config-dir /path/to/configs \
--container-image container-image-uri \
--account your-slurm-account \
--gpu-type gb200 \
--use-sglang-commands \
--gpus-per-node 4
```

3. **Monitor job progress**:
```bash
squeue -u $USER
```

3. **Check logs in real-time**:
4. **Check logs in real-time**:
```bash
tail -f logs/{JOB_ID}/log.out
```

4. **Monitor GPU utilization**:
You can view logs of all prefill or decode workers simultaneously by running:
```bash
# prefill worker stderr (use .out for stdout)
tail -f logs/{JOB_ID}/*_prefill.err

# decode worker stderr (use .out for stdout)
tail -f logs/{JOB_ID}/*_decode.err
```

5. **Monitor GPU utilization**:
```bash
tail -f logs/{JOB_ID}/{node}_prefill_gpu_utilization.log
```
34 changes: 20 additions & 14 deletions components/backends/sglang/slurm_jobs/job_script_template.j2
@@ -7,6 +7,7 @@
#SBATCH --time={{ time_limit }}
#SBATCH --output=logs/%j/log.out
#SBATCH --error=logs/%j/log.err
#SBATCH --partition=36x2-a01r

# Constants
PREFILL_NODES={{ prefill_nodes }}
@@ -20,6 +21,8 @@ MODEL_DIR="{{ model_dir }}"
CONFIG_DIR="{{ config_dir }}"
CONTAINER_IMAGE="{{ container_image }}"
NETWORK_INTERFACE="{{ network_interface }}"
GPU_TYPE="{{ gpu_type | default('h100') }}"
USE_SGLANG_COMMANDS="{{ use_sglang_commands | default(false) }}"

{% raw %}

@@ -36,14 +39,14 @@ for i in "${!nodes[@]}"; do
echo "Node $i: ${nodes[$i]}"
done

PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ifconfig $NETWORK_INTERFACE | grep -oP 'inet \K[0-9.]+')
PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ip route get $(getent ahosts ${nodes[0]} | grep STREAM | head -1 | awk '{print $1}') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}')
if [ -z "$PREFILL_HOST_IP" ]; then
echo "Error: Could not retrieve IP address for prefill host ${nodes[0]} on interface $NETWORK_INTERFACE"
exit 1
fi
echo "Prefill host IP address: $PREFILL_HOST_IP"

DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} ifconfig $NETWORK_INTERFACE | grep -oP 'inet \K[0-9.]+')
DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} ip route get $(getent ahosts ${nodes[$PREFILL_NODES]} | grep STREAM | head -1 | awk '{print $1}') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}')
if [ -z "$DECODE_HOST_IP" ]; then
echo "Error: Could not retrieve IP address for decode host ${nodes[$PREFILL_NODES]} on interface $NETWORK_INTERFACE"
exit 1
@@ -54,33 +57,36 @@ echo "Decode host IP address: $DECODE_HOST_IP"
ENROOT_ARGS="\
--container-image=${CONTAINER_IMAGE} \
--no-container-entrypoint \
--container-mount-home \
--no-container-remap-root \
--no-container-mount-home \
--container-mounts=${MODEL_DIR}:/model/,${CONFIG_DIR}:/configs/,${SCRIPT_DIR}:/scripts/,${OUTPUT_DIR}:/outputs/,${LOG_DIR}:/logs/ \
"

# Build common worker arguments
WORKER_ARGS="--gpu_type ${GPU_TYPE} --gpus_per_node ${GPUS_PER_NODE}"
if [ "$USE_SGLANG_COMMANDS" = "True" ]; then
WORKER_ARGS="${WORKER_ARGS} --use-sglang-commands"
fi

# Launch prefill tasks on the first PREFILL_NODES nodes
for i in $(seq 0 $((PREFILL_NODES - 1))); do
node=${nodes[$i]}
rank=$i
echo "Launching prefill task on node ${i} (rank ${rank}): $node"
echo "Srun args: $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err"
echo "Command: python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpus_per_node ${GPUS_PER_NODE} --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log &"
srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node \
--output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err \
python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpus_per_node ${GPUS_PER_NODE} --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log &

cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log ${WORKER_ARGS}"
echo "$cmd"
$cmd &
done

# Launch decode tasks on the next DECODE_NODES nodes
for i in $(seq $PREFILL_NODES $((PREFILL_NODES + DECODE_NODES - 1))); do
node=${nodes[$i]}
rank=$((i - PREFILL_NODES))
echo "Launching decode task on node ${i} (rank ${rank}): $node"
echo "Srun args: $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err"
echo "Command: python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpus_per_node ${GPUS_PER_NODE} --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log &"
srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node \
--output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err \
python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpus_per_node ${GPUS_PER_NODE} --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log &

cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log ${WORKER_ARGS}"
echo "$cmd"
$cmd &
done

echo ""