From a28da9235be2b3f963ea5f36dab7dbefca519c57 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 22:39:51 +0000 Subject: [PATCH 01/65] iter --- examples/sglang/slurm_jobs/scripts/gen_cmd.py | 250 ++++++++++++++++++ .../sglang/slurm_jobs/scripts/worker_setup.py | 101 ++++--- 2 files changed, 315 insertions(+), 36 deletions(-) create mode 100644 examples/sglang/slurm_jobs/scripts/gen_cmd.py diff --git a/examples/sglang/slurm_jobs/scripts/gen_cmd.py b/examples/sglang/slurm_jobs/scripts/gen_cmd.py new file mode 100644 index 0000000000..c7096bd3a2 --- /dev/null +++ b/examples/sglang/slurm_jobs/scripts/gen_cmd.py @@ -0,0 +1,250 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Use this script to generate dynamo/sglang flags for h100 or gb200 disagg +""" + +def get_prefill_command_args(config_flag: str, host_ip: str, port: int, total_nodes: int, rank: int, total_gpus: int) -> dict: + """ + Get the command arguments for a specific config and worker type. 
+ + Args: + config_flag: One of "h100_dynamo", "h100_sglang", "gb200_dynamo", "gb200_sglang" + worker_type: "prefill" or "decode" + + Returns: + Dictionary with 'script' and 'args' keys + """ + + # TODO: validate if config_flag is valid and exists in support matrix + + if config_flag == "h100_dynamo": + return { + "script": "python3 components/worker.py", + "args": [ + "--model-path /model/ " + "--served-model-name deepseek-ai/DeepSeek-R1 " + "--skip-tokenizer-init " + "--disaggregation-mode prefill " + "--disaggregation-transfer-backend nixl " + "--disaggregation-bootstrap-port 30001 " + f"--dist-init-addr {host_ip}:{port} " + f"--nnodes {total_nodes} " + f"--node-rank {rank} " + f"--tp-size {total_gpus} " + f"--dp-size {total_gpus} " + "--enable-dp-attention " + "--decode-log-interval 1 " + "--enable-deepep-moe " + "--page-size 1 " + "--trust-remote-code " + "--moe-dense-tp-size 1 " + "--enable-dp-lm-head " + "--disable-radix-cache " + "--watchdog-timeout 1000000 " + "--enable-two-batch-overlap " + "--deepep-mode normal " + "--mem-fraction-static 0.85 " + "--deepep-config /configs/deepep.json " + "--ep-num-redundant-experts 32 " + "--ep-dispatch-algorithm dynamic " + "--eplb-algorithm deepseek " + ] + } + elif config_flag == "h100_sglang": + return { + "script": "python3 -m sglang.launch_server", + "args": [ + "--model-path /model/ ", + "--served-model-name deepseek-ai/DeepSeek-R1 ", + "--disaggregation-transfer-backend nixl ", + "--disaggregation-mode prefill ", + f"--dist-init-addr {host_ip}:{port} ", + f"--nnodes {total_nodes} ", + f"--node-rank {rank} ", + f"--tp-size {total_gpus} ", + f"--dp-size {total_gpus} ", + "--enable-dp-attention ", + "--decode-log-interval 1 ", + "--enable-deepep-moe ", + "--page-size 1 ", + "--host 0.0.0.0 ", + "--trust-remote-code ", + "--moe-dense-tp-size 1 ", + "--enable-dp-lm-head ", + "--disable-radix-cache ", + "--watchdog-timeout 1000000 ", + "--enable-two-batch-overlap ", + "--deepep-mode normal ", + "--mem-fraction-static 
0.85 ", + "--chunked-prefill-size 524288 ", + "--max-running-requests 8192 ", + "--max-total-tokens 131072 ", + "--context-length 8192 ", + "--init-expert-location /configs/prefill_in4096.json ", + "--ep-num-redundant-experts 32 ", + "--ep-dispatch-algorithm dynamic ", + "--eplb-algorithm deepseek ", + "--deepep-config /configs/deepep.json " + ] + } + elif config_flag == "gb200_sglang": + return { + "script": "python3 -m sglang.launch_server", + "args": [ + "--served-model-name deepseek-ai/DeepSeek-R1 ", + "--model-path /model/ ", + "--trust-remote-code ", + "--disaggregation-mode prefill ", + f"--dist-init-addr {host_ip}:{port} ", + f"--nnodes {total_nodes} ", + f"--node-rank {rank} ", + f"--tp-size {total_gpus} ", + f"--dp-size {total_gpus} ", + "--enable-dp-attention ", + "--host 0.0.0.0 ", + "--decode-log-interval 1 ", + "--max-running-requests 6144 ", + "--context-length 2176 ", + "--disable-radix-cache ", + "--enable-deepep-moe ", + "--deepep-mode low_latency ", + "--moe-dense-tp-size 1 ", + "--enable-dp-lm-head ", + "--disable-shared-experts-fusion ", + "--ep-num-redundant-experts 32 ", + "--ep-dispatch-algorithm static ", + "--eplb-algorithm deepseek ", + "--attention-backend cutlass_mla ", + "--watchdog-timeout 1000000 ", + "--disable-cuda-graph ", + "--chunked-prefill-size 16384 ", + "--max-total-tokens 32768 ", + "--mem-fraction-static 0.9 " + ] + } + else: + raise ValueError(f"Invalid config flag: {config_flag}") + +def get_decode_command_args(config_flag: str, host_ip: str, port: int, total_nodes: int, rank: int, total_gpus: int) -> dict: + """ + Get the command arguments for a specific config and worker type. 
+ + Args: + config_flag: One of "h100_dynamo", "h100_sglang", "gb200_dynamo", "gb200_sglang" + worker_type: "prefill" or "decode" + + Returns: + Dictionary with 'script' and 'args' keys + """ + + if config_flag == "h100_dynamo": + return { + "script": "python3 components/decode_worker.py", + "args": [ + "--model-path /model/ ", + "--served-model-name deepseek-ai/DeepSeek-R1 ", + "--skip-tokenizer-init ", + "--disaggregation-mode decode ", + "--disaggregation-transfer-backend nixl ", + "--disaggregation-bootstrap-port 30001 ", + f"--dist-init-addr {host_ip}:{port} ", + f"--nnodes {total_nodes} ", + f"--node-rank {rank} ", + f"--tp-size {total_gpus} ", + f"--dp-size {total_gpus} ", + "--enable-dp-attention ", + "--decode-log-interval 1 " + "--enable-deepep-moe " + "--page-size 1 " + "--trust-remote-code " + "--moe-dense-tp-size 1 " + "--enable-dp-lm-head " + "--disable-radix-cache " + "--watchdog-timeout 1000000 " + "--enable-two-batch-overlap " + "--deepep-mode low_latency " + "--mem-fraction-static 0.835 " + "--ep-num-redundant-experts 32 " + "--cuda-graph-bs 256 " + ] + } + elif config_flag == "h100_sglang": + return { + "script": "python3 -m sglang.launch_server", + "args": [ + "--model-path /model/ ", + "--disaggregation-transfer-backend nixl ", + "--disaggregation-mode decode ", + f"--dist-init-addr {host_ip}:{port} ", + f"--nnodes {total_nodes} ", + f"--node-rank {rank} ", + f"--tp-size {total_gpus} ", + f"--dp-size {total_gpus} ", + "--enable-dp-attention ", + "--decode-log-interval 1 ", + "--enable-deepep-moe ", + "--page-size 1 ", + "--host 0.0.0.0 ", + "--trust-remote-code ", + "--moe-dense-tp-size 1 ", + "--enable-dp-lm-head ", + "--disable-radix-cache ", + "--watchdog-timeout 1000000 ", + "--enable-two-batch-overlap ", + "--deepep-mode low_latency ", + "--mem-fraction-static 0.835 ", + "--max-running-requests 18432 ", + "--context-length 4500 ", + "--ep-num-redundant-experts 32 ", + "--cuda-graph-bs 256 " + ] + } + elif config_flag == "gb200_sglang": + 
return { + "script": "python3 -m sglang.launch_server", + "args": [ + "--model-path /model/ ", + "--trust-remote-code ", + "--disaggregation-transfer-backend nixl ", + "--disaggregation-mode decode ", + f"--dist-init-addr {host_ip}:{port} ", + f"--nnodes {total_nodes} ", + f"--node-rank {rank} ", + f"--tp-size {total_gpus} ", + f"--dp-size {total_gpus} ", + "--enable-dp-attention ", + "--host 0.0.0.0 ", + "--decode-log-interval 1 ", + "--max-running-requests 36864 ", + "--context-length 2176 ", + "--disable-radix-cache ", + "--enable-deepep-moe ", + "--deepep-mode low_latency ", + "--moe-dense-tp-size 1 ", + "--enable-dp-lm-head ", + "--cuda-graph-bs 768 ", + "--disable-shared-experts-fusion ", + "--ep-num-redundant-experts 32 ", + "--ep-dispatch-algorithm static ", + "--eplb-algorithm deepseek ", + "--attention-backend cutlass_mla ", + "--watchdog-timeout 1000000 ", + "--chunked-prefill-size 36864 ", + "--mem-fraction-static 0.82 " + ] + } + \ No newline at end of file diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index adda2b6407..9546e8c3b8 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -20,8 +20,8 @@ The script will: - Setup the environment -- Update the YAML config file -- Start Dynamo graphs.disagg service +- Generate the python3 command to run the prefill or decode worker +- Start dynamo (or sglang) - Monitor the GPU utilization """ @@ -177,6 +177,11 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac default=None, help="File to log GPU utilization (default: None)", ) + parser.add_argument( + "--use-sglang-commands", + action="store_true", + help="Helper to spin up SGLang servers instead of dynamo. 
This is helpful for benchmarking SGLang as well", + ) return parser.parse_args(args) @@ -194,43 +199,50 @@ def _validate_args(args: argparse.Namespace) -> None: def setup_prefill_node( - rank: int, prefill_host_ip: str, total_nodes: int, total_gpus: int + rank: int, prefill_host_ip: str, total_nodes: int, total_gpus: int, use_sglang_commands: bool ) -> int: """ Setup the prefill node. """ - if rank == 0: - logging.info(f"Setting up host prefill node: {rank}") - logging.info(f"Starting nats server on node {rank} with IP {prefill_host_ip}") - - nats_process = run_command("nats-server -js", background=True) - if not nats_process: - raise RuntimeError("Failed to start nats-server") - - etcd_cmd = ( - f"etcd --listen-client-urls {ETCD_LISTEN_ADDR}:{ETCD_CLIENT_PORT} " - f"--advertise-client-urls {ETCD_LISTEN_ADDR}:{ETCD_CLIENT_PORT} " - f"--listen-peer-urls {ETCD_LISTEN_ADDR}:{ETCD_PEER_PORT} " - f"--initial-cluster default=http://{prefill_host_ip}:{ETCD_PEER_PORT}" - ) - - etcd_process = run_command(etcd_cmd, background=True) - if not etcd_process: - raise RuntimeError("Failed to start etcd") - - ingress_process = run_command("dynamo run in=http out=dyn", background=True) - if not ingress_process: - raise RuntimeError("Failed to start ingress") + if not use_sglang_commands: + python_cmd = "python3 components/worker.py " + if rank == 0: + logging.info(f"Setting up host prefill node: {rank}") + logging.info(f"Starting nats server on node {rank} with IP {prefill_host_ip}") + + nats_process = run_command("nats-server -js", background=True) + if not nats_process: + raise RuntimeError("Failed to start nats-server") + + etcd_cmd = ( + f"etcd --listen-client-urls {ETCD_LISTEN_ADDR}:{ETCD_CLIENT_PORT} " + f"--advertise-client-urls {ETCD_LISTEN_ADDR}:{ETCD_CLIENT_PORT} " + f"--listen-peer-urls {ETCD_LISTEN_ADDR}:{ETCD_PEER_PORT} " + f"--initial-cluster default=http://{prefill_host_ip}:{ETCD_PEER_PORT}" + ) + + etcd_process = run_command(etcd_cmd, background=True) + if not 
etcd_process: + raise RuntimeError("Failed to start etcd") + + ingress_process = run_command("dynamo run in=http out=dyn", background=True) + if not ingress_process: + raise RuntimeError("Failed to start ingress") + + else: + logging.info(f"Setting up child prefill node: {rank}") + if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): + raise RuntimeError("Failed to connect to etcd") else: - logging.info(f"Setting up child prefill node: {rank}") - if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): - raise RuntimeError("Failed to connect to etcd") + python_cmd = "python3 -m sglang.launch_server " + logging.info("Using SGLang servers. No need to setup etcd or nats") # NOTE: This implements the example in examples/sglang/dsr1-wideep.md # For other examples, the command might have to be modified. - dynamo_cmd = ( - f"python3 components/worker.py " + # Because we use the sgl arg parser, we can use the same flags for both dynamo and sglang + cmd_to_run = ( + f"{python_cmd} " "--model-path /model/ " "--served-model-name deepseek-ai/DeepSeek-R1 " "--skip-tokenizer-init " @@ -259,7 +271,7 @@ def setup_prefill_node( "--ep-dispatch-algorithm dynamic " "--eplb-algorithm deepseek " ) - return run_command(dynamo_cmd) + return run_command(cmd_to_run) def setup_decode_node( @@ -268,17 +280,31 @@ def setup_decode_node( prefill_host_ip: str, total_nodes: int, total_gpus: int, + use_sglang_commands: bool ) -> int: """ Setup the decode node. 
""" logging.info(f"Setting up child decode node: {rank}") - if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): - raise RuntimeError("Failed to connect to etcd") + if use_sglang_commands: + python_cmd = "python3 -m sglang.launch_server " + sgl_mini_lb_cmd = ( + "python3 -m sglang.srt.disaggregation.launch_lb " + f"--prefill http://{prefill_host_ip}:30000 " + f"--decode http://{decode_host_ip}:30000 " + "--host 0.0.0.0 " + "--port 8000 " + "--timeout 3600" + ) + run_command(sgl_mini_lb_cmd, background=True) + else: + python_cmd = "python3 components/decode_worker.py " + if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): + raise RuntimeError("Failed to connect to etcd") - dynamo_cmd = ( - "python3 components/decode_worker.py " + cmd_to_run = ( + f"{python_cmd} " "--model-path /model/ " "--served-model-name deepseek-ai/DeepSeek-R1 " "--skip-tokenizer-init " @@ -306,7 +332,7 @@ def setup_decode_node( "--cuda-graph-bs 256 " ) - return run_command(dynamo_cmd) + return run_command(cmd_to_run) def setup_env(prefill_host_ip: str): @@ -333,6 +359,7 @@ def main(input_args: list[str] | None = None): logging.info(f"Prefill host IP: {args.prefill_host_ip}") logging.info(f"Decode host IP: {args.decode_host_ip}") logging.info(f"Rank: {args.rank}") + logging.info(f"Use SGLang commands: {args.use_sglang_commands}") setup_env(args.prefill_host_ip) if args.worker_type == "prefill": @@ -341,6 +368,7 @@ def main(input_args: list[str] | None = None): args.prefill_host_ip, args.total_nodes, args.total_nodes * args.gpus_per_node, + args.use_sglang_commands ) else: setup_decode_node( @@ -349,6 +377,7 @@ def main(input_args: list[str] | None = None): args.prefill_host_ip, args.total_nodes, args.total_nodes * args.gpus_per_node, + args.use_sglang_commands ) logging.info(f"{args.worker_type.capitalize()} node setup complete") From be7e2b626842bccef4670e6749589d3f60e5f335 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 22:40:13 +0000 
Subject: [PATCH 02/65] deleted in here --- examples/sglang/slurm_jobs/scripts/gen_cmd.py | 250 ------------------ 1 file changed, 250 deletions(-) delete mode 100644 examples/sglang/slurm_jobs/scripts/gen_cmd.py diff --git a/examples/sglang/slurm_jobs/scripts/gen_cmd.py b/examples/sglang/slurm_jobs/scripts/gen_cmd.py deleted file mode 100644 index c7096bd3a2..0000000000 --- a/examples/sglang/slurm_jobs/scripts/gen_cmd.py +++ /dev/null @@ -1,250 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Use this script to generate dynamo/sglang flags for h100 or gb200 disagg -""" - -def get_prefill_command_args(config_flag: str, host_ip: str, port: int, total_nodes: int, rank: int, total_gpus: int) -> dict: - """ - Get the command arguments for a specific config and worker type. 
- - Args: - config_flag: One of "h100_dynamo", "h100_sglang", "gb200_dynamo", "gb200_sglang" - worker_type: "prefill" or "decode" - - Returns: - Dictionary with 'script' and 'args' keys - """ - - # TODO: validate if config_flag is valid and exists in support matrix - - if config_flag == "h100_dynamo": - return { - "script": "python3 components/worker.py", - "args": [ - "--model-path /model/ " - "--served-model-name deepseek-ai/DeepSeek-R1 " - "--skip-tokenizer-init " - "--disaggregation-mode prefill " - "--disaggregation-transfer-backend nixl " - "--disaggregation-bootstrap-port 30001 " - f"--dist-init-addr {host_ip}:{port} " - f"--nnodes {total_nodes} " - f"--node-rank {rank} " - f"--tp-size {total_gpus} " - f"--dp-size {total_gpus} " - "--enable-dp-attention " - "--decode-log-interval 1 " - "--enable-deepep-moe " - "--page-size 1 " - "--trust-remote-code " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--disable-radix-cache " - "--watchdog-timeout 1000000 " - "--enable-two-batch-overlap " - "--deepep-mode normal " - "--mem-fraction-static 0.85 " - "--deepep-config /configs/deepep.json " - "--ep-num-redundant-experts 32 " - "--ep-dispatch-algorithm dynamic " - "--eplb-algorithm deepseek " - ] - } - elif config_flag == "h100_sglang": - return { - "script": "python3 -m sglang.launch_server", - "args": [ - "--model-path /model/ ", - "--served-model-name deepseek-ai/DeepSeek-R1 ", - "--disaggregation-transfer-backend nixl ", - "--disaggregation-mode prefill ", - f"--dist-init-addr {host_ip}:{port} ", - f"--nnodes {total_nodes} ", - f"--node-rank {rank} ", - f"--tp-size {total_gpus} ", - f"--dp-size {total_gpus} ", - "--enable-dp-attention ", - "--decode-log-interval 1 ", - "--enable-deepep-moe ", - "--page-size 1 ", - "--host 0.0.0.0 ", - "--trust-remote-code ", - "--moe-dense-tp-size 1 ", - "--enable-dp-lm-head ", - "--disable-radix-cache ", - "--watchdog-timeout 1000000 ", - "--enable-two-batch-overlap ", - "--deepep-mode normal ", - "--mem-fraction-static 
0.85 ", - "--chunked-prefill-size 524288 ", - "--max-running-requests 8192 ", - "--max-total-tokens 131072 ", - "--context-length 8192 ", - "--init-expert-location /configs/prefill_in4096.json ", - "--ep-num-redundant-experts 32 ", - "--ep-dispatch-algorithm dynamic ", - "--eplb-algorithm deepseek ", - "--deepep-config /configs/deepep.json " - ] - } - elif config_flag == "gb200_sglang": - return { - "script": "python3 -m sglang.launch_server", - "args": [ - "--served-model-name deepseek-ai/DeepSeek-R1 ", - "--model-path /model/ ", - "--trust-remote-code ", - "--disaggregation-mode prefill ", - f"--dist-init-addr {host_ip}:{port} ", - f"--nnodes {total_nodes} ", - f"--node-rank {rank} ", - f"--tp-size {total_gpus} ", - f"--dp-size {total_gpus} ", - "--enable-dp-attention ", - "--host 0.0.0.0 ", - "--decode-log-interval 1 ", - "--max-running-requests 6144 ", - "--context-length 2176 ", - "--disable-radix-cache ", - "--enable-deepep-moe ", - "--deepep-mode low_latency ", - "--moe-dense-tp-size 1 ", - "--enable-dp-lm-head ", - "--disable-shared-experts-fusion ", - "--ep-num-redundant-experts 32 ", - "--ep-dispatch-algorithm static ", - "--eplb-algorithm deepseek ", - "--attention-backend cutlass_mla ", - "--watchdog-timeout 1000000 ", - "--disable-cuda-graph ", - "--chunked-prefill-size 16384 ", - "--max-total-tokens 32768 ", - "--mem-fraction-static 0.9 " - ] - } - else: - raise ValueError(f"Invalid config flag: {config_flag}") - -def get_decode_command_args(config_flag: str, host_ip: str, port: int, total_nodes: int, rank: int, total_gpus: int) -> dict: - """ - Get the command arguments for a specific config and worker type. 
- - Args: - config_flag: One of "h100_dynamo", "h100_sglang", "gb200_dynamo", "gb200_sglang" - worker_type: "prefill" or "decode" - - Returns: - Dictionary with 'script' and 'args' keys - """ - - if config_flag == "h100_dynamo": - return { - "script": "python3 components/decode_worker.py", - "args": [ - "--model-path /model/ ", - "--served-model-name deepseek-ai/DeepSeek-R1 ", - "--skip-tokenizer-init ", - "--disaggregation-mode decode ", - "--disaggregation-transfer-backend nixl ", - "--disaggregation-bootstrap-port 30001 ", - f"--dist-init-addr {host_ip}:{port} ", - f"--nnodes {total_nodes} ", - f"--node-rank {rank} ", - f"--tp-size {total_gpus} ", - f"--dp-size {total_gpus} ", - "--enable-dp-attention ", - "--decode-log-interval 1 " - "--enable-deepep-moe " - "--page-size 1 " - "--trust-remote-code " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--disable-radix-cache " - "--watchdog-timeout 1000000 " - "--enable-two-batch-overlap " - "--deepep-mode low_latency " - "--mem-fraction-static 0.835 " - "--ep-num-redundant-experts 32 " - "--cuda-graph-bs 256 " - ] - } - elif config_flag == "h100_sglang": - return { - "script": "python3 -m sglang.launch_server", - "args": [ - "--model-path /model/ ", - "--disaggregation-transfer-backend nixl ", - "--disaggregation-mode decode ", - f"--dist-init-addr {host_ip}:{port} ", - f"--nnodes {total_nodes} ", - f"--node-rank {rank} ", - f"--tp-size {total_gpus} ", - f"--dp-size {total_gpus} ", - "--enable-dp-attention ", - "--decode-log-interval 1 ", - "--enable-deepep-moe ", - "--page-size 1 ", - "--host 0.0.0.0 ", - "--trust-remote-code ", - "--moe-dense-tp-size 1 ", - "--enable-dp-lm-head ", - "--disable-radix-cache ", - "--watchdog-timeout 1000000 ", - "--enable-two-batch-overlap ", - "--deepep-mode low_latency ", - "--mem-fraction-static 0.835 ", - "--max-running-requests 18432 ", - "--context-length 4500 ", - "--ep-num-redundant-experts 32 ", - "--cuda-graph-bs 256 " - ] - } - elif config_flag == "gb200_sglang": - 
return { - "script": "python3 -m sglang.launch_server", - "args": [ - "--model-path /model/ ", - "--trust-remote-code ", - "--disaggregation-transfer-backend nixl ", - "--disaggregation-mode decode ", - f"--dist-init-addr {host_ip}:{port} ", - f"--nnodes {total_nodes} ", - f"--node-rank {rank} ", - f"--tp-size {total_gpus} ", - f"--dp-size {total_gpus} ", - "--enable-dp-attention ", - "--host 0.0.0.0 ", - "--decode-log-interval 1 ", - "--max-running-requests 36864 ", - "--context-length 2176 ", - "--disable-radix-cache ", - "--enable-deepep-moe ", - "--deepep-mode low_latency ", - "--moe-dense-tp-size 1 ", - "--enable-dp-lm-head ", - "--cuda-graph-bs 768 ", - "--disable-shared-experts-fusion ", - "--ep-num-redundant-experts 32 ", - "--ep-dispatch-algorithm static ", - "--eplb-algorithm deepseek ", - "--attention-backend cutlass_mla ", - "--watchdog-timeout 1000000 ", - "--chunked-prefill-size 36864 ", - "--mem-fraction-static 0.82 " - ] - } - \ No newline at end of file From e67abd21c6606101cb4917d520e698086330897e Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 23:06:27 +0000 Subject: [PATCH 03/65] add --- examples/sglang/slurm_jobs/scripts/gen_cmd.py | 231 ++++++++++++++++++ .../sglang/slurm_jobs/scripts/worker_setup.py | 99 ++------ 2 files changed, 251 insertions(+), 79 deletions(-) create mode 100644 examples/sglang/slurm_jobs/scripts/gen_cmd.py diff --git a/examples/sglang/slurm_jobs/scripts/gen_cmd.py b/examples/sglang/slurm_jobs/scripts/gen_cmd.py new file mode 100644 index 0000000000..b2e61960da --- /dev/null +++ b/examples/sglang/slurm_jobs/scripts/gen_cmd.py @@ -0,0 +1,231 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Use this script to generate dynamo/sglang flags for h100 or gb200 disagg +""" + +def get_prefill_command_args(gpu_type: str, use_sglang_commands: bool, host_ip: str, port: int, total_nodes: int, rank: int, total_gpus: int) -> dict: + if gpu_type == "h100" and not use_sglang_commands: + cmd = ( + f"python3 components/worker.py " + "--model-path /model/ " + "--served-model-name deepseek-ai/DeepSeek-R1 " + "--skip-tokenizer-init " + "--disaggregation-mode prefill " + "--disaggregation-transfer-backend nixl " + "--disaggregation-bootstrap-port 30001 " + f"--dist-init-addr {host_ip}:{port} " + f"--nnodes {total_nodes} " + f"--node-rank {rank} " + f"--tp-size {total_gpus} " + f"--dp-size {total_gpus} " + "--enable-dp-attention " + "--decode-log-interval 1 " + "--enable-deepep-moe " + "--page-size 1 " + "--trust-remote-code " + "--moe-dense-tp-size 1 " + "--enable-dp-lm-head " + "--disable-radix-cache " + "--watchdog-timeout 1000000 " + "--enable-two-batch-overlap " + "--deepep-mode normal " + "--mem-fraction-static 0.85 " + "--deepep-config /configs/deepep.json " + "--ep-num-redundant-experts 32 " + "--ep-dispatch-algorithm dynamic " + "--eplb-algorithm deepseek " + ) + elif gpu_type == "h100" and use_sglang_commands: + cmd = ( + f"python3 -m sglang.launch_server " + "--model-path /model/ " + "--served-model-name deepseek-ai/DeepSeek-R1 " + "--disaggregation-transfer-backend nixl " + "--disaggregation-mode prefill " + f"--dist-init-addr {host_ip}:{port} " + f"--nnodes {total_nodes} " + f"--node-rank {rank} " + f"--tp-size {total_gpus} " + f"--dp-size 
{total_gpus} " + "--enable-dp-attention " + "--decode-log-interval 1 " + "--enable-deepep-moe " + "--page-size 1 " + "--host 0.0.0.0 " + "--trust-remote-code " + "--moe-dense-tp-size 1 " + "--enable-dp-lm-head " + "--disable-radix-cache " + "--watchdog-timeout 1000000 " + "--enable-two-batch-overlap " + "--deepep-mode normal " + "--mem-fraction-static 0.85 " + "--chunked-prefill-size 524288 " + "--max-running-requests 8192 " + "--max-total-tokens 131072 " + "--context-length 8192 " + "--init-expert-location /configs/prefill_in4096.json " + "--ep-num-redundant-experts 32 " + "--ep-dispatch-algorithm dynamic " + "--eplb-algorithm deepseek " + "--deepep-config /configs/deepep.json " + + ) + elif gpu_type == "gb200" and use_sglang_commands: + cmd = ( + f"python3 -m sglang.launch_server " + "--served-model-name deepseek-ai/DeepSeek-R1 " + "--model-path /model/ " + "--trust-remote-code " + "--disaggregation-mode prefill " + f"--dist-init-addr {host_ip}:{port} " + f"--nnodes {total_nodes} " + f"--node-rank {rank} " + f"--tp-size {total_gpus} " + f"--dp-size {total_gpus} " + "--enable-dp-attention " + "--host 0.0.0.0 " + "--decode-log-interval 1 " + "--max-running-requests 6144 " + "--context-length 2176 " + "--disable-radix-cache " + "--enable-deepep-moe " + "--deepep-mode low_latency " + "--moe-dense-tp-size 1 " + "--enable-dp-lm-head " + "--disable-shared-experts-fusion " + "--ep-num-redundant-experts 32 " + "--ep-dispatch-algorithm static " + "--eplb-algorithm deepseek " + "--attention-backend cutlass_mla " + "--watchdog-timeout 1000000 " + "--disable-cuda-graph " + "--chunked-prefill-size 16384 " + "--max-total-tokens 32768 " + "--mem-fraction-static 0.9 " + ) + else: + raise ValueError(f"Unsupported: {gpu_type} and use_sglang_commands={use_sglang_commands}") + + return cmd + +def get_decode_command_args(gpu_type: str, use_sglang_commands: bool, host_ip: str, port: int, total_nodes: int, rank: int, total_gpus: int) -> dict: + if gpu_type == "h100" and not 
use_sglang_commands: + cmd = ( + f"python3 components/decode_worker.py " + "--model-path /model/ " + "--served-model-name deepseek-ai/DeepSeek-R1 " + "--skip-tokenizer-init " + "--disaggregation-mode decode " + "--disaggregation-transfer-backend nixl " + "--disaggregation-bootstrap-port 30001 " + f"--dist-init-addr {host_ip}:{port} " + f"--nnodes {total_nodes} " + f"--node-rank {rank} " + f"--tp-size {total_gpus} " + f"--dp-size {total_gpus} " + "--enable-dp-attention " + "--decode-log-interval 1 " + "--enable-deepep-moe " + "--page-size 1 " + "--trust-remote-code " + "--moe-dense-tp-size 1 " + "--enable-dp-lm-head " + "--disable-radix-cache " + "--watchdog-timeout 1000000 " + "--enable-two-batch-overlap " + "--deepep-mode low_latency " + "--mem-fraction-static 0.835 " + "--ep-num-redundant-experts 32 " + "--cuda-graph-bs 256 " + ) + elif gpu_type == "h100" and use_sglang_commands: + cmd = ( + f"python3 -m sglang.launch_server " + "--model-path /model/ " + "--disaggregation-transfer-backend nixl " + "--disaggregation-mode decode " + f"--dist-init-addr {host_ip}:{port} " + f"--nnodes {total_nodes} " + f"--node-rank {rank} " + f"--tp-size {total_gpus} " + f"--dp-size {total_gpus} " + "--enable-dp-attention " + "--decode-log-interval 1 " + "--enable-deepep-moe " + "--page-size 1 " + "--host 0.0.0.0 " + "--trust-remote-code " + "--moe-dense-tp-size 1 " + "--enable-dp-lm-head " + "--disable-radix-cache " + "--watchdog-timeout 1000000 " + "--enable-two-batch-overlap " + "--deepep-mode low_latency " + "--mem-fraction-static 0.835 " + "--max-running-requests 18432 " + "--context-length 4500 " + "--ep-num-redundant-experts 32 " + "--cuda-graph-bs 256 " + ) + elif gpu_type == "gb200" and use_sglang_commands: + cmd = ( + f"python3 -m sglang.launch_server " + "--model-path /model/ " + "--trust-remote-code " + "--disaggregation-transfer-backend nixl " + "--disaggregation-mode decode " + f"--dist-init-addr {host_ip}:{port} " + f"--nnodes {total_nodes} " + f"--node-rank {rank} " 
+ f"--tp-size {total_gpus} " + f"--dp-size {total_gpus} " + "--enable-dp-attention " + "--host 0.0.0.0 " + "--decode-log-interval 1 " + "--max-running-requests 36864 " + "--context-length 2176 " + "--disable-radix-cache " + "--enable-deepep-moe " + "--deepep-mode low_latency " + "--moe-dense-tp-size 1 " + "--enable-dp-lm-head " + "--cuda-graph-bs 768 " + "--disable-shared-experts-fusion " + "--ep-num-redundant-experts 32 " + "--ep-dispatch-algorithm static " + "--eplb-algorithm deepseek " + "--attention-backend cutlass_mla " + "--watchdog-timeout 1000000 " + "--chunked-prefill-size 36864 " + "--mem-fraction-static 0.82 " + ) + else: + raise ValueError(f"Unsupported: {gpu_type} and use_sglang_commands={use_sglang_commands}") + + return cmd + +def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> dict: + cmd = ( + f"python3 -m sglang.srt.disaggregation.launch_lb " + f"--prefill http://{prefill_host_ip}:30000 " + f"--decode http://{decode_host_ip}:30000 " + "--host 0.0.0.0 " + "--port 8000 " + "--timeout 3600" + ) + return cmd \ No newline at end of file diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index 9546e8c3b8..90911127e8 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -33,6 +33,7 @@ import time from pathlib import Path +from .gen_cmd import get_prefill_command_args, get_decode_command_args, get_sglang_mini_lb_command_args import requests # Network configurations @@ -180,8 +181,16 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac parser.add_argument( "--use-sglang-commands", action="store_true", + default=False, help="Helper to spin up SGLang servers instead of dynamo. 
This is helpful for benchmarking SGLang as well", ) + parser.add_argument( + "--gpu_type", + type=str, + choices=["h100", "gb200"], + default="h100", + help="Type of GPU to use", + ) return parser.parse_args(args) @@ -199,13 +208,12 @@ def _validate_args(args: argparse.Namespace) -> None: def setup_prefill_node( - rank: int, prefill_host_ip: str, total_nodes: int, total_gpus: int, use_sglang_commands: bool + rank: int, prefill_host_ip: str, total_nodes: int, total_gpus: int, use_sglang_commands: bool, gpu_type: str ) -> int: """ Setup the prefill node. """ if not use_sglang_commands: - python_cmd = "python3 components/worker.py " if rank == 0: logging.info(f"Setting up host prefill node: {rank}") logging.info(f"Starting nats server on node {rank} with IP {prefill_host_ip}") @@ -233,44 +241,11 @@ def setup_prefill_node( logging.info(f"Setting up child prefill node: {rank}") if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): raise RuntimeError("Failed to connect to etcd") - else: - python_cmd = "python3 -m sglang.launch_server " logging.info("Using SGLang servers. No need to setup etcd or nats") - # NOTE: This implements the example in examples/sglang/dsr1-wideep.md - # For other examples, the command might have to be modified. 
- # Because we use the sgl arg parser, we can use the same flags for both dynamo and sglang - cmd_to_run = ( - f"{python_cmd} " - "--model-path /model/ " - "--served-model-name deepseek-ai/DeepSeek-R1 " - "--skip-tokenizer-init " - "--disaggregation-mode prefill " - "--disaggregation-transfer-backend nixl " - "--disaggregation-bootstrap-port 30001 " - f"--dist-init-addr {prefill_host_ip}:{DIST_INIT_PORT} " - f"--nnodes {total_nodes} " - f"--node-rank {rank} " - f"--tp-size {total_gpus} " - f"--dp-size {total_gpus} " - "--enable-dp-attention " - "--decode-log-interval 1 " - "--enable-deepep-moe " - "--page-size 1 " - "--trust-remote-code " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--disable-radix-cache " - "--watchdog-timeout 1000000 " - "--enable-two-batch-overlap " - "--deepep-mode normal " - "--mem-fraction-static 0.85 " - "--deepep-config /configs/deepep.json " - "--ep-num-redundant-experts 32 " - "--ep-dispatch-algorithm dynamic " - "--eplb-algorithm deepseek " - ) + # NOTE: Default command for h100 and dynamo implements the example in examples/sglang/dsr1-wideep.md + cmd_to_run = get_prefill_command_args(gpu_type, use_sglang_commands, prefill_host_ip, total_nodes, rank, total_gpus) return run_command(cmd_to_run) @@ -280,7 +255,8 @@ def setup_decode_node( prefill_host_ip: str, total_nodes: int, total_gpus: int, - use_sglang_commands: bool + use_sglang_commands: bool, + gpu_type: str ) -> int: """ Setup the decode node. 
@@ -288,50 +264,13 @@ def setup_decode_node( logging.info(f"Setting up child decode node: {rank}") if use_sglang_commands: - python_cmd = "python3 -m sglang.launch_server " - sgl_mini_lb_cmd = ( - "python3 -m sglang.srt.disaggregation.launch_lb " - f"--prefill http://{prefill_host_ip}:30000 " - f"--decode http://{decode_host_ip}:30000 " - "--host 0.0.0.0 " - "--port 8000 " - "--timeout 3600" - ) + sgl_mini_lb_cmd = get_sglang_mini_lb_command_args(prefill_host_ip, decode_host_ip) run_command(sgl_mini_lb_cmd, background=True) else: - python_cmd = "python3 components/decode_worker.py " if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): raise RuntimeError("Failed to connect to etcd") - cmd_to_run = ( - f"{python_cmd} " - "--model-path /model/ " - "--served-model-name deepseek-ai/DeepSeek-R1 " - "--skip-tokenizer-init " - "--disaggregation-mode decode " - "--disaggregation-transfer-backend nixl " - "--disaggregation-bootstrap-port 30001 " - f"--dist-init-addr {decode_host_ip}:{DIST_INIT_PORT} " - f"--nnodes {total_nodes} " - f"--node-rank {rank} " - f"--tp-size {total_gpus} " - f"--dp-size {total_gpus} " - "--enable-dp-attention " - "--decode-log-interval 1 " - "--enable-deepep-moe " - "--page-size 1 " - "--trust-remote-code " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--disable-radix-cache " - "--watchdog-timeout 1000000 " - "--enable-two-batch-overlap " - "--deepep-mode low_latency " - "--mem-fraction-static 0.835 " - "--ep-num-redundant-experts 32 " - "--cuda-graph-bs 256 " - ) - + cmd_to_run = get_decode_command_args(gpu_type, use_sglang_commands, decode_host_ip, total_nodes, rank, total_gpus) return run_command(cmd_to_run) @@ -368,7 +307,8 @@ def main(input_args: list[str] | None = None): args.prefill_host_ip, args.total_nodes, args.total_nodes * args.gpus_per_node, - args.use_sglang_commands + args.use_sglang_commands, + args.gpu_type ) else: setup_decode_node( @@ -377,7 +317,8 @@ def main(input_args: list[str] | None = None): 
args.prefill_host_ip, args.total_nodes, args.total_nodes * args.gpus_per_node, - args.use_sglang_commands + args.use_sglang_commands, + args.gpu_type ) logging.info(f"{args.worker_type.capitalize()} node setup complete") From 78047a91ab2771c117169a0b77b074077419651b Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 23:29:48 +0000 Subject: [PATCH 04/65] shell script --- examples/sglang/slurm_jobs/scripts/gb200.sh | 142 +++++++++++ examples/sglang/slurm_jobs/scripts/gen_cmd.py | 231 ------------------ examples/sglang/slurm_jobs/scripts/h100.sh | 187 ++++++++++++++ .../sglang/slurm_jobs/scripts/worker_setup.py | 48 +++- 4 files changed, 372 insertions(+), 236 deletions(-) create mode 100644 examples/sglang/slurm_jobs/scripts/gb200.sh delete mode 100644 examples/sglang/slurm_jobs/scripts/gen_cmd.py create mode 100644 examples/sglang/slurm_jobs/scripts/h100.sh diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh new file mode 100644 index 0000000000..698c1df0ae --- /dev/null +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -0,0 +1,142 @@ +#!/bin/bash + +# Function to print usage +print_usage() { + echo "Usage: $0 " + echo " mode: prefill or decode" + echo " cmd: dynamo or sglang" + echo "" + echo "Examples:" + echo " $0 prefill dynamo" + echo " $0 decode sglang" + exit 1 +} + +# Check if correct number of arguments provided +if [ $# -ne 2 ]; then + echo "Error: Expected 2 arguments, got $#" + print_usage +fi + +# Parse arguments +mode=$1 +cmd=$2 + +# Validate mode argument +if [ "$mode" != "prefill" ] && [ "$mode" != "decode" ]; then + echo "Error: mode must be 'prefill' or 'decode', got '$mode'" + print_usage +fi + +# Validate cmd argument +if [ "$cmd" != "dynamo" ] && [ "$cmd" != "sglang" ]; then + echo "Error: cmd must be 'dynamo' or 'sglang', got '$cmd'" + print_usage +fi + +echo "Mode: $mode" +echo "Command: $cmd" + + +# Check if required environment variables are set +if [ -z "$HOST_IP" ]; then 
+ echo "Error: HOST_IP environment variable is not set" + exit 1 +fi + +if [ -z "$PORT" ]; then + echo "Error: PORT environment variable is not set" + exit 1 +fi + +if [ -z "$TOTAL_GPUS" ]; then + echo "Error: TOTAL_GPUS environment variable is not set" + exit 1 +fi + +if [ -z "$RANK" ]; then + echo "Error: RANK environment variable is not set" + exit 1 +fi + +if [ -z "$TOTAL_NODES" ]; then + echo "Error: TOTAL_NODES environment variable is not set" + exit 1 +fi + + +# Construct command based on mode and cmd +if [ "$mode" = "prefill" ]; then + if [ "$cmd" = "dynamo" ]; then + echo "Error: dynamo command not implemented for GB200" + exit 1 + elif [ "$cmd" = "sglang" ]; then + # GB200 sglang prefill command + python3 -m sglang.launch_server \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --model-path /model/ \ + --trust-remote-code \ + --disaggregation-mode prefill \ + --dist-init-addr "$HOST_IP:$PORT" \ + --nnodes "$TOTAL_NODES" \ + --node-rank "$RANK" \ + --tp-size "$TOTAL_GPUS" \ + --dp-size "$TOTAL_GPUS" \ + --enable-dp-attention \ + --host 0.0.0.0 \ + --decode-log-interval 1 \ + --max-running-requests 6144 \ + --context-length 2176 \ + --disable-radix-cache \ + --enable-deepep-moe \ + --deepep-mode low_latency \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-shared-experts-fusion \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm static \ + --eplb-algorithm deepseek \ + --attention-backend cutlass_mla \ + --watchdog-timeout 1000000 \ + --disable-cuda-graph \ + --chunked-prefill-size 16384 \ + --max-total-tokens 32768 \ + --mem-fraction-static 0.9 + fi +elif [ "$mode" = "decode" ]; then + if [ "$cmd" = "dynamo" ]; then + echo "Error: dynamo command not implemented for GB200" + exit 1 + elif [ "$cmd" = "sglang" ]; then + # GB200 sglang decode command + python3 -m sglang.launch_server \ + --model-path /model/ \ + --trust-remote-code \ + --disaggregation-transfer-backend nixl \ + --disaggregation-mode decode \ + --dist-init-addr 
"$HOST_IP:$PORT" \ + --nnodes "$TOTAL_NODES" \ + --node-rank "$RANK" \ + --tp-size "$TOTAL_GPUS" \ + --dp-size "$TOTAL_GPUS" \ + --enable-dp-attention \ + --host 0.0.0.0 \ + --decode-log-interval 1 \ + --max-running-requests 36864 \ + --context-length 2176 \ + --disable-radix-cache \ + --enable-deepep-moe \ + --deepep-mode low_latency \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --cuda-graph-bs 768 \ + --disable-shared-experts-fusion \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm static \ + --eplb-algorithm deepseek \ + --attention-backend cutlass_mla \ + --watchdog-timeout 1000000 \ + --chunked-prefill-size 36864 \ + --mem-fraction-static 0.82 + fi +fi diff --git a/examples/sglang/slurm_jobs/scripts/gen_cmd.py b/examples/sglang/slurm_jobs/scripts/gen_cmd.py deleted file mode 100644 index b2e61960da..0000000000 --- a/examples/sglang/slurm_jobs/scripts/gen_cmd.py +++ /dev/null @@ -1,231 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -Use this script to generate dynamo/sglang flags for h100 or gb200 disagg -""" - -def get_prefill_command_args(gpu_type: str, use_sglang_commands: bool, host_ip: str, port: int, total_nodes: int, rank: int, total_gpus: int) -> dict: - if gpu_type == "h100" and not use_sglang_commands: - cmd = ( - f"python3 components/worker.py " - "--model-path /model/ " - "--served-model-name deepseek-ai/DeepSeek-R1 " - "--skip-tokenizer-init " - "--disaggregation-mode prefill " - "--disaggregation-transfer-backend nixl " - "--disaggregation-bootstrap-port 30001 " - f"--dist-init-addr {host_ip}:{port} " - f"--nnodes {total_nodes} " - f"--node-rank {rank} " - f"--tp-size {total_gpus} " - f"--dp-size {total_gpus} " - "--enable-dp-attention " - "--decode-log-interval 1 " - "--enable-deepep-moe " - "--page-size 1 " - "--trust-remote-code " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--disable-radix-cache " - "--watchdog-timeout 1000000 " - "--enable-two-batch-overlap " - "--deepep-mode normal " - "--mem-fraction-static 0.85 " - "--deepep-config /configs/deepep.json " - "--ep-num-redundant-experts 32 " - "--ep-dispatch-algorithm dynamic " - "--eplb-algorithm deepseek " - ) - elif gpu_type == "h100" and use_sglang_commands: - cmd = ( - f"python3 -m sglang.launch_server " - "--model-path /model/ " - "--served-model-name deepseek-ai/DeepSeek-R1 " - "--disaggregation-transfer-backend nixl " - "--disaggregation-mode prefill " - f"--dist-init-addr {host_ip}:{port} " - f"--nnodes {total_nodes} " - f"--node-rank {rank} " - f"--tp-size {total_gpus} " - f"--dp-size {total_gpus} " - "--enable-dp-attention " - "--decode-log-interval 1 " - "--enable-deepep-moe " - "--page-size 1 " - "--host 0.0.0.0 " - "--trust-remote-code " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--disable-radix-cache " - "--watchdog-timeout 1000000 " - "--enable-two-batch-overlap " - "--deepep-mode normal " - "--mem-fraction-static 0.85 " - "--chunked-prefill-size 524288 " - 
"--max-running-requests 8192 " - "--max-total-tokens 131072 " - "--context-length 8192 " - "--init-expert-location /configs/prefill_in4096.json " - "--ep-num-redundant-experts 32 " - "--ep-dispatch-algorithm dynamic " - "--eplb-algorithm deepseek " - "--deepep-config /configs/deepep.json " - - ) - elif gpu_type == "gb200" and use_sglang_commands: - cmd = ( - f"python3 -m sglang.launch_server " - "--served-model-name deepseek-ai/DeepSeek-R1 " - "--model-path /model/ " - "--trust-remote-code " - "--disaggregation-mode prefill " - f"--dist-init-addr {host_ip}:{port} " - f"--nnodes {total_nodes} " - f"--node-rank {rank} " - f"--tp-size {total_gpus} " - f"--dp-size {total_gpus} " - "--enable-dp-attention " - "--host 0.0.0.0 " - "--decode-log-interval 1 " - "--max-running-requests 6144 " - "--context-length 2176 " - "--disable-radix-cache " - "--enable-deepep-moe " - "--deepep-mode low_latency " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--disable-shared-experts-fusion " - "--ep-num-redundant-experts 32 " - "--ep-dispatch-algorithm static " - "--eplb-algorithm deepseek " - "--attention-backend cutlass_mla " - "--watchdog-timeout 1000000 " - "--disable-cuda-graph " - "--chunked-prefill-size 16384 " - "--max-total-tokens 32768 " - "--mem-fraction-static 0.9 " - ) - else: - raise ValueError(f"Unsupported: {gpu_type} and use_sglang_commands={use_sglang_commands}") - - return cmd - -def get_decode_command_args(gpu_type: str, use_sglang_commands: bool, host_ip: str, port: int, total_nodes: int, rank: int, total_gpus: int) -> dict: - if gpu_type == "h100" and not use_sglang_commands: - cmd = ( - f"python3 components/decode_worker.py " - "--model-path /model/ " - "--served-model-name deepseek-ai/DeepSeek-R1 " - "--skip-tokenizer-init " - "--disaggregation-mode decode " - "--disaggregation-transfer-backend nixl " - "--disaggregation-bootstrap-port 30001 " - f"--dist-init-addr {host_ip}:{port} " - f"--nnodes {total_nodes} " - f"--node-rank {rank} " - f"--tp-size 
{total_gpus} " - f"--dp-size {total_gpus} " - "--enable-dp-attention " - "--decode-log-interval 1 " - "--enable-deepep-moe " - "--page-size 1 " - "--trust-remote-code " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--disable-radix-cache " - "--watchdog-timeout 1000000 " - "--enable-two-batch-overlap " - "--deepep-mode low_latency " - "--mem-fraction-static 0.835 " - "--ep-num-redundant-experts 32 " - "--cuda-graph-bs 256 " - ) - elif gpu_type == "h100" and use_sglang_commands: - cmd = ( - f"python3 -m sglang.launch_server " - "--model-path /model/ " - "--disaggregation-transfer-backend nixl " - "--disaggregation-mode decode " - f"--dist-init-addr {host_ip}:{port} " - f"--nnodes {total_nodes} " - f"--node-rank {rank} " - f"--tp-size {total_gpus} " - f"--dp-size {total_gpus} " - "--enable-dp-attention " - "--decode-log-interval 1 " - "--enable-deepep-moe " - "--page-size 1 " - "--host 0.0.0.0 " - "--trust-remote-code " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--disable-radix-cache " - "--watchdog-timeout 1000000 " - "--enable-two-batch-overlap " - "--deepep-mode low_latency " - "--mem-fraction-static 0.835 " - "--max-running-requests 18432 " - "--context-length 4500 " - "--ep-num-redundant-experts 32 " - "--cuda-graph-bs 256 " - ) - elif gpu_type == "gb200" and use_sglang_commands: - cmd = ( - f"python3 -m sglang.launch_server " - "--model-path /model/ " - "--trust-remote-code " - "--disaggregation-transfer-backend nixl " - "--disaggregation-mode decode " - f"--dist-init-addr {host_ip}:{port} " - f"--nnodes {total_nodes} " - f"--node-rank {rank} " - f"--tp-size {total_gpus} " - f"--dp-size {total_gpus} " - "--enable-dp-attention " - "--host 0.0.0.0 " - "--decode-log-interval 1 " - "--max-running-requests 36864 " - "--context-length 2176 " - "--disable-radix-cache " - "--enable-deepep-moe " - "--deepep-mode low_latency " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--cuda-graph-bs 768 " - "--disable-shared-experts-fusion " - 
"--ep-num-redundant-experts 32 " - "--ep-dispatch-algorithm static " - "--eplb-algorithm deepseek " - "--attention-backend cutlass_mla " - "--watchdog-timeout 1000000 " - "--chunked-prefill-size 36864 " - "--mem-fraction-static 0.82 " - ) - else: - raise ValueError(f"Unsupported: {gpu_type} and use_sglang_commands={use_sglang_commands}") - - return cmd - -def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> dict: - cmd = ( - f"python3 -m sglang.srt.disaggregation.launch_lb " - f"--prefill http://{prefill_host_ip}:30000 " - f"--decode http://{decode_host_ip}:30000 " - "--host 0.0.0.0 " - "--port 8000 " - "--timeout 3600" - ) - return cmd \ No newline at end of file diff --git a/examples/sglang/slurm_jobs/scripts/h100.sh b/examples/sglang/slurm_jobs/scripts/h100.sh new file mode 100644 index 0000000000..a00b63f554 --- /dev/null +++ b/examples/sglang/slurm_jobs/scripts/h100.sh @@ -0,0 +1,187 @@ +#!/bin/bash + +# Function to print usage +print_usage() { + echo "Usage: $0 " + echo " mode: prefill or decode" + echo " cmd: dynamo or sglang" + echo "" + echo "Examples:" + echo " $0 prefill dynamo" + echo " $0 decode sglang" + exit 1 +} + +# Check if correct number of arguments provided +if [ $# -ne 2 ]; then + echo "Error: Expected 2 arguments, got $#" + print_usage +fi + +# Parse arguments +mode=$1 +cmd=$2 + +# Validate mode argument +if [ "$mode" != "prefill" ] && [ "$mode" != "decode" ]; then + echo "Error: mode must be 'prefill' or 'decode', got '$mode'" + print_usage +fi + +# Validate cmd argument +if [ "$cmd" != "dynamo" ] && [ "$cmd" != "sglang" ]; then + echo "Error: cmd must be 'dynamo' or 'sglang', got '$cmd'" + print_usage +fi + +echo "Mode: $mode" +echo "Command: $cmd" + + +# Check if required environment variables are set +if [ -z "$HOST_IP" ]; then + echo "Error: HOST_IP environment variable is not set" + exit 1 +fi + +if [ -z "$PORT" ]; then + echo "Error: PORT environment variable is not set" + exit 1 +fi + +if [ -z 
"$TOTAL_GPUS" ]; then + echo "Error: TOTAL_GPUS environment variable is not set" + exit 1 +fi + +if [ -z "$RANK" ]; then + echo "Error: RANK environment variable is not set" + exit 1 +fi + +if [ -z "$TOTAL_NODES" ]; then + echo "Error: TOTAL_NODES environment variable is not set" + exit 1 +fi + +# Construct command based on mode and cmd +if [ "$mode" = "prefill" ]; then + if [ "$cmd" = "dynamo" ]; then + # H100 dynamo prefill command + python3 components/worker.py \ + --model-path /model/ \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --skip-tokenizer-init \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend nixl \ + --disaggregation-bootstrap-port 30001 \ + --dist-init-addr "$HOST_IP:$PORT" \ + --nnodes "$TOTAL_NODES" \ + --node-rank "$RANK" \ + --tp-size "$TOTAL_GPUS" \ + --dp-size "$TOTAL_GPUS" \ + --enable-dp-attention \ + --decode-log-interval 1 \ + --enable-deepep-moe \ + --page-size 1 \ + --trust-remote-code \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-radix-cache \ + --watchdog-timeout 1000000 \ + --enable-two-batch-overlap \ + --deepep-mode normal \ + --mem-fraction-static 0.85 \ + --deepep-config /configs/deepep.json \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm dynamic \ + --eplb-algorithm deepseek + elif [ "$cmd" = "sglang" ]; then + # H100 sglang prefill command + python3 -m sglang.launch_server \ + --model-path /model/ \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --disaggregation-transfer-backend nixl \ + --disaggregation-mode prefill \ + --dist-init-addr "$HOST_IP:$PORT" \ + --nnodes "$TOTAL_NODES" \ + --node-rank "$RANK" \ + --tp-size "$TOTAL_GPUS" \ + --dp-size "$TOTAL_GPUS" \ + --enable-dp-attention \ + --decode-log-interval 1 \ + --enable-deepep-moe \ + --page-size 1 \ + --host 0.0.0.0 \ + --trust-remote-code \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-radix-cache \ + --watchdog-timeout 1000000 \ + --enable-two-batch-overlap \ + --deepep-mode normal \ + 
--mem-fraction-static 0.85 \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm dynamic \ + --eplb-algorithm deepseek \ + --deepep-config /configs/deepep.json + fi +elif [ "$mode" = "decode" ]; then + if [ "$cmd" = "dynamo" ]; then + # H100 dynamo decode command + python3 components/decode_worker.py \ + --model-path /model/ \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --skip-tokenizer-init \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend nixl \ + --disaggregation-bootstrap-port 30001 \ + --dist-init-addr "$HOST_IP:$PORT" \ + --nnodes "$TOTAL_NODES" \ + --node-rank "$RANK" \ + --tp-size "$TOTAL_GPUS" \ + --dp-size "$TOTAL_GPUS" \ + --enable-dp-attention \ + --decode-log-interval 1 \ + --enable-deepep-moe \ + --page-size 1 \ + --trust-remote-code \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-radix-cache \ + --watchdog-timeout 1000000 \ + --enable-two-batch-overlap \ + --deepep-mode low_latency \ + --mem-fraction-static 0.835 \ + --ep-num-redundant-experts 32 \ + --cuda-graph-bs 256 + elif [ "$cmd" = "sglang" ]; then + # H100 sglang decode command + python3 -m sglang.launch_server \ + --model-path /model/ \ + --disaggregation-transfer-backend nixl \ + --disaggregation-mode decode \ + --dist-init-addr "$HOST_IP:$PORT" \ + --nnodes "$TOTAL_NODES" \ + --node-rank "$RANK" \ + --tp-size "$TOTAL_GPUS" \ + --dp-size "$TOTAL_GPUS" \ + --enable-dp-attention \ + --decode-log-interval 1 \ + --enable-deepep-moe \ + --page-size 1 \ + --host 0.0.0.0 \ + --trust-remote-code \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-radix-cache \ + --watchdog-timeout 1000000 \ + --enable-two-batch-overlap \ + --deepep-mode low_latency \ + --mem-fraction-static 0.835 \ + --ep-num-redundant-experts 32 \ + --cuda-graph-bs 256 + fi +fi + + diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index 90911127e8..8b69f9c937 100644 --- 
a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -33,7 +33,6 @@ import time from pathlib import Path -from .gen_cmd import get_prefill_command_args, get_decode_command_args, get_sglang_mini_lb_command_args import requests # Network configurations @@ -206,6 +205,39 @@ def _validate_args(args: argparse.Namespace) -> None: if args.gpus_per_node < 1: raise ValueError("GPUs per node must be at least 1") +def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> dict: + cmd = ( + f"python3 -m sglang.srt.disaggregation.launch_lb " + f"--prefill http://{prefill_host_ip}:30000 " + f"--decode http://{decode_host_ip}:30000 " + "--host 0.0.0.0 " + "--port 8000 " + "--timeout 3600" + ) + return cmd + +def setup_env_vars_for_gpu_script(host_ip: str, rank: int, total_gpus: int, total_nodes: int, port: int = DIST_INIT_PORT): + """Setup environment variables required by GPU scripts (h100.sh, gb200.sh)""" + os.environ["HOST_IP"] = host_ip + os.environ["PORT"] = str(port) + os.environ["TOTAL_GPUS"] = str(total_gpus) + os.environ["RANK"] = str(rank) + os.environ["TOTAL_NODES"] = str(total_nodes) + + logging.info(f"Set HOST_IP: {host_ip}") + logging.info(f"Set PORT: {port}") + logging.info(f"Set TOTAL_GPUS: {total_gpus}") + logging.info(f"Set RANK: {rank}") + logging.info(f"Set TOTAL_NODES: {total_nodes}") + +def get_gpu_command(worker_type: str, use_sglang_commands: bool, gpu_type: str) -> str: + """Generate command to run the appropriate GPU script""" + script_name = f"{gpu_type}.sh" + script_path = Path(__file__).parent / script_name + mode = worker_type # "prefill" or "decode" + cmd = "sglang" if use_sglang_commands else "dynamo" + + return f"bash {script_path} {mode} {cmd}" def setup_prefill_node( rank: int, prefill_host_ip: str, total_nodes: int, total_gpus: int, use_sglang_commands: bool, gpu_type: str @@ -244,11 +276,13 @@ def setup_prefill_node( else: logging.info("Using SGLang servers. 
No need to setup etcd or nats") - # NOTE: Default command for h100 and dynamo implements the example in examples/sglang/dsr1-wideep.md - cmd_to_run = get_prefill_command_args(gpu_type, use_sglang_commands, prefill_host_ip, total_nodes, rank, total_gpus) + # Setup environment variables for GPU script + setup_env_vars_for_gpu_script(prefill_host_ip, rank, total_gpus, total_nodes) + + # Use appropriate GPU script instead of generating command directly + cmd_to_run = get_gpu_command("prefill", use_sglang_commands, gpu_type) return run_command(cmd_to_run) - def setup_decode_node( rank: int, decode_host_ip: str, @@ -270,7 +304,11 @@ def setup_decode_node( if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): raise RuntimeError("Failed to connect to etcd") - cmd_to_run = get_decode_command_args(gpu_type, use_sglang_commands, decode_host_ip, total_nodes, rank, total_gpus) + # Setup environment variables for GPU script + setup_env_vars_for_gpu_script(decode_host_ip, rank, total_gpus, total_nodes) + + # Use appropriate GPU script instead of generating command directly + cmd_to_run = get_gpu_command("decode", use_sglang_commands, gpu_type) return run_command(cmd_to_run) From 4a3c140a8f90459f60d0f13c711d28a502930dde Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 23:41:40 +0000 Subject: [PATCH 05/65] updated jinja --- .../sglang/slurm_jobs/job_script_template.j2 | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index 84e0e33396..7939977538 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -20,6 +20,8 @@ MODEL_DIR="{{ model_dir }}" CONFIG_DIR="{{ config_dir }}" CONTAINER_IMAGE="{{ container_image }}" NETWORK_INTERFACE="{{ network_interface }}" +GPU_TYPE="{{ gpu_type | default('h100') }}" +USE_SGLANG_COMMANDS="{{ use_sglang_commands | 
default(false) }}" {% raw %} @@ -59,16 +61,22 @@ ENROOT_ARGS="\ --container-mounts=${MODEL_DIR}:/model/,${CONFIG_DIR}:/configs/,${SCRIPT_DIR}:/scripts/,${OUTPUT_DIR}:/outputs/,${LOG_DIR}:/logs/ \ " +# Build common worker arguments +WORKER_ARGS="--gpu_type ${GPU_TYPE} --gpus_per_node ${GPUS_PER_NODE}" +if [ "$USE_SGLANG_COMMANDS" = "true" ]; then + WORKER_ARGS="${WORKER_ARGS} --use-sglang-commands" +fi + # Launch prefill tasks on the first PREFILL_NODES nodes for i in $(seq 0 $((PREFILL_NODES - 1))); do node=${nodes[$i]} rank=$i echo "Launching prefill task on node ${i} (rank ${rank}): $node" echo "Srun args: $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err" - echo "Command: python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpus_per_node ${GPUS_PER_NODE} --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log &" + echo "Command: python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log ${WORKER_ARGS} &" srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node \ --output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err \ - python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpus_per_node ${GPUS_PER_NODE} --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log & + python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log ${WORKER_ARGS} & done # Launch decode 
tasks on the next DECODE_NODES nodes @@ -77,10 +85,10 @@ for i in $(seq $PREFILL_NODES $((PREFILL_NODES + DECODE_NODES - 1))); do rank=$((i - PREFILL_NODES)) echo "Launching decode task on node ${i} (rank ${rank}): $node" echo "Srun args: $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err" - echo "Command: python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpus_per_node ${GPUS_PER_NODE} --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log &" + echo "Command: python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log ${WORKER_ARGS} &" srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node \ --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err \ - python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpus_per_node ${GPUS_PER_NODE} --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log & + python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log ${WORKER_ARGS} & done echo "" From 647e7b7f87df751170d301a94916868d8ce8c971 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 23:43:36 +0000 Subject: [PATCH 06/65] readme and submitter --- examples/sglang/slurm_jobs/README.md | 31 +++++++++++++++++-- .../sglang/slurm_jobs/submit_job_script.py | 9 ++++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/examples/sglang/slurm_jobs/README.md 
b/examples/sglang/slurm_jobs/README.md index 19f7c27ada..d7367de965 100644 --- a/examples/sglang/slurm_jobs/README.md +++ b/examples/sglang/slurm_jobs/README.md @@ -85,20 +85,45 @@ For simplicity of the example, we will make some assumptions about your SLURM cl - `--network-interface`: Network interface to use (default: `eth3`) - `--job-name`: SLURM job name (default: `dynamo_setup`) - `--time-limit`: Time limit in HH:MM:SS format (default: `01:00:00`) + - `--gpu-type`: GPU type to use, choices: `h100`, `gb200` (default: `h100`) + - `--use-sglang-commands`: Use SGLang commands instead of Dynamo (default: `false`) **Note**: The script automatically calculates the total number of nodes needed based on `--prefill-nodes` and `--decode-nodes` parameters. -2. **Monitor job progress**: +2. **Example with different GPU types**: + ```bash + # For H100 with Dynamo (default) + python submit_job_script.py \ + --template job_script_template.j2 \ + --model-dir /path/to/model \ + --config-dir /path/to/configs \ + --container-image container-image-uri \ + --account your-slurm-account \ + --gpu-type h100 + + # For GB200 with SGLang + python submit_job_script.py \ + --template job_script_template.j2 \ + --model-dir /path/to/model \ + --config-dir /path/to/configs \ + --container-image container-image-uri \ + --account your-slurm-account \ + --gpu-type gb200 \ + --use-sglang-commands + --gpus-per-node 4 + ``` + +3. **Monitor job progress**: ```bash squeue -u $USER ``` -3. **Check logs in real-time**: +4. **Check logs in real-time**: ```bash tail -f logs/{JOB_ID}/log.out ``` -4. **Monitor GPU utilization**: +5. 
**Monitor GPU utilization**: ```bash tail -f logs/{JOB_ID}/{node}_prefill_gpu_utilization.log ``` diff --git a/examples/sglang/slurm_jobs/submit_job_script.py b/examples/sglang/slurm_jobs/submit_job_script.py index 64f492224e..510aa40fb2 100644 --- a/examples/sglang/slurm_jobs/submit_job_script.py +++ b/examples/sglang/slurm_jobs/submit_job_script.py @@ -100,6 +100,13 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac parser.add_argument( "--network-interface", default="eth3", help="Network interface to use" ) + parser.add_argument( + "--gpu-type", choices=["h100", "gb200"], default="h100", help="GPU type to use" + ) + parser.add_argument( + "--use-sglang-commands", action="store_true", default=False, + help="Use SGLang commands instead of Dynamo" + ) return parser.parse_args(args) @@ -120,6 +127,8 @@ def main(input_args: list[str] | None = None): "container_image": args.container_image, "gpus_per_node": args.gpus_per_node, "network_interface": args.network_interface, + "gpu_type": args.gpu_type, + "use_sglang_commands": args.use_sglang_commands, } with tempfile.NamedTemporaryFile(mode="w", suffix=".sh") as temp_file: From e8c3b4663a3819a2da3ac7de98f15db4361acdc5 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 23:51:35 +0000 Subject: [PATCH 07/65] go --- examples/sglang/slurm_jobs/job_script_template.j2 | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index 7939977538..9706a9a41c 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -38,14 +38,22 @@ for i in "${!nodes[@]}"; do echo "Node $i: ${nodes[$i]}" done -PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ifconfig $NETWORK_INTERFACE | grep -oP 'inet \K[0-9.]+') +if [ "$GPU_TYPE" = "gb200" ]; then + PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 
--nodelist=${nodes[0]} hostname -I | awk '{print $1}') +else + PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ifconfig $NETWORK_INTERFACE | grep -oP 'inet \K[0-9.]+') +fi if [ -z "$PREFILL_HOST_IP" ]; then echo "Error: Could not retrieve IP address for prefill host ${nodes[0]} on interface $NETWORK_INTERFACE" exit 1 fi echo "Prefill host IP address: $PREFILL_HOST_IP" -DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} ifconfig $NETWORK_INTERFACE | grep -oP 'inet \K[0-9.]+') +if [ "$GPU_TYPE" = "gb200" ]; then + DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} hostname -I | awk '{print $1}') +else + DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} ifconfig $NETWORK_INTERFACE | grep -oP 'inet \K[0-9.]+') +fi if [ -z "$DECODE_HOST_IP" ]; then echo "Error: Could not retrieve IP address for decode host ${nodes[$PREFILL_NODES]} on interface $NETWORK_INTERFACE" exit 1 From 7d33f1eb61fc775a0cb9a84e2d5d562f3f92075a Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 23:57:54 +0000 Subject: [PATCH 08/65] executable --- examples/sglang/slurm_jobs/scripts/gb200.sh | 0 examples/sglang/slurm_jobs/scripts/h100.sh | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 examples/sglang/slurm_jobs/scripts/gb200.sh mode change 100644 => 100755 examples/sglang/slurm_jobs/scripts/h100.sh diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh old mode 100644 new mode 100755 diff --git a/examples/sglang/slurm_jobs/scripts/h100.sh b/examples/sglang/slurm_jobs/scripts/h100.sh old mode 100644 new mode 100755 From e56a0f416c144a7cf9c311f6d90f6c263dc58d61 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 00:06:22 +0000 Subject: [PATCH 09/65] bool check --- examples/sglang/slurm_jobs/job_script_template.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index 9706a9a41c..f6abc0343f 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -71,7 +71,7 @@ ENROOT_ARGS="\ # Build common worker arguments WORKER_ARGS="--gpu_type ${GPU_TYPE} --gpus_per_node ${GPUS_PER_NODE}" -if [ "$USE_SGLANG_COMMANDS" = "true" ]; then +if [ "$USE_SGLANG_COMMANDS" = "True" ]; then WORKER_ARGS="${WORKER_ARGS} --use-sglang-commands" fi From ba0cc3cc2b4487805b5a78d120d4b6d7cd779f0c Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 00:13:35 +0000 Subject: [PATCH 10/65] Added env vars --- examples/sglang/slurm_jobs/scripts/gb200.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 698c1df0ae..1fe254b42c 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -72,6 +72,12 @@ if [ "$mode" = "prefill" ]; then exit 1 elif [ "$cmd" = "sglang" ]; then # GB200 sglang prefill command + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \ + NCCL_MNNVL_ENABLE=1 \ + NCCL_CUMEM_ENABLE=1 \ + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ + SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ + PYTHONUNBUFFERED=1 \ python3 -m sglang.launch_server \ --served-model-name deepseek-ai/DeepSeek-R1 \ --model-path /model/ \ @@ -109,6 +115,8 @@ elif [ "$mode" = "decode" ]; then exit 1 elif [ "$cmd" = "sglang" ]; then # GB200 sglang decode command + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=768 \ + SGLANG_NUM_RESERVED_DECODE_TOKENS=176 \ python3 -m sglang.launch_server \ --model-path /model/ \ --trust-remote-code \ From d0316c24f4b913dbbec8fe1c90dd94e5ae97694f Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 00:19:47 +0000 Subject: [PATCH 11/65] go --- examples/sglang/slurm_jobs/scripts/gb200.sh | 1 + 1 file changed, 1 
insertion(+) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 1fe254b42c..518c4af671 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -83,6 +83,7 @@ if [ "$mode" = "prefill" ]; then --model-path /model/ \ --trust-remote-code \ --disaggregation-mode prefill \ + --disaggregation-transfer-backend nixl \ --dist-init-addr "$HOST_IP:$PORT" \ --nnodes "$TOTAL_NODES" \ --node-rank "$RANK" \ From b0cfbd1b139ee8c2cc4ebac883ffbb53e0820460 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 01:58:54 +0000 Subject: [PATCH 12/65] go --- examples/sglang/slurm_jobs/job_script_template.j2 | 2 +- examples/sglang/slurm_jobs/scripts/worker_setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index f6abc0343f..afe05c5662 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -64,7 +64,7 @@ echo "Decode host IP address: $DECODE_HOST_IP" ENROOT_ARGS="\ --container-image=${CONTAINER_IMAGE} \ --no-container-entrypoint \ - --container-mount-home \ + --no-container-mount-home \ --no-container-remap-root \ --container-mounts=${MODEL_DIR}:/model/,${CONFIG_DIR}:/configs/,${SCRIPT_DIR}:/scripts/,${OUTPUT_DIR}:/outputs/,${LOG_DIR}:/logs/ \ " diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index 8b69f9c937..2b69700401 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -265,7 +265,7 @@ def setup_prefill_node( if not etcd_process: raise RuntimeError("Failed to start etcd") - ingress_process = run_command("dynamo run in=http out=dyn", background=True) + ingress_process = run_command("dynamo run in=http out=dyn --http-port=8000", 
background=True) if not ingress_process: raise RuntimeError("Failed to start ingress") From bcaf9f5c9852fbc22cd655b8910d338ccbb581a1 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 19:53:23 -0700 Subject: [PATCH 13/65] pc --- .../sglang/slurm_jobs/scripts/worker_setup.py | 46 ++++++++++++++----- .../sglang/slurm_jobs/submit_job_script.py | 6 ++- 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index 2b69700401..8a039c7b40 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -205,6 +205,7 @@ def _validate_args(args: argparse.Namespace) -> None: if args.gpus_per_node < 1: raise ValueError("GPUs per node must be at least 1") + def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> dict: cmd = ( f"python3 -m sglang.srt.disaggregation.launch_lb " @@ -216,31 +217,45 @@ def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) - ) return cmd -def setup_env_vars_for_gpu_script(host_ip: str, rank: int, total_gpus: int, total_nodes: int, port: int = DIST_INIT_PORT): + +def setup_env_vars_for_gpu_script( + host_ip: str, + rank: int, + total_gpus: int, + total_nodes: int, + port: int = DIST_INIT_PORT, +): """Setup environment variables required by GPU scripts (h100.sh, gb200.sh)""" os.environ["HOST_IP"] = host_ip os.environ["PORT"] = str(port) os.environ["TOTAL_GPUS"] = str(total_gpus) os.environ["RANK"] = str(rank) os.environ["TOTAL_NODES"] = str(total_nodes) - + logging.info(f"Set HOST_IP: {host_ip}") logging.info(f"Set PORT: {port}") logging.info(f"Set TOTAL_GPUS: {total_gpus}") logging.info(f"Set RANK: {rank}") logging.info(f"Set TOTAL_NODES: {total_nodes}") + def get_gpu_command(worker_type: str, use_sglang_commands: bool, gpu_type: str) -> str: """Generate command to run the appropriate GPU script""" script_name = 
f"{gpu_type}.sh" script_path = Path(__file__).parent / script_name mode = worker_type # "prefill" or "decode" cmd = "sglang" if use_sglang_commands else "dynamo" - + return f"bash {script_path} {mode} {cmd}" + def setup_prefill_node( - rank: int, prefill_host_ip: str, total_nodes: int, total_gpus: int, use_sglang_commands: bool, gpu_type: str + rank: int, + prefill_host_ip: str, + total_nodes: int, + total_gpus: int, + use_sglang_commands: bool, + gpu_type: str, ) -> int: """ Setup the prefill node. @@ -248,7 +263,9 @@ def setup_prefill_node( if not use_sglang_commands: if rank == 0: logging.info(f"Setting up host prefill node: {rank}") - logging.info(f"Starting nats server on node {rank} with IP {prefill_host_ip}") + logging.info( + f"Starting nats server on node {rank} with IP {prefill_host_ip}" + ) nats_process = run_command("nats-server -js", background=True) if not nats_process: @@ -265,7 +282,9 @@ def setup_prefill_node( if not etcd_process: raise RuntimeError("Failed to start etcd") - ingress_process = run_command("dynamo run in=http out=dyn --http-port=8000", background=True) + ingress_process = run_command( + "dynamo run in=http out=dyn --http-port=8000", background=True + ) if not ingress_process: raise RuntimeError("Failed to start ingress") @@ -278,11 +297,12 @@ def setup_prefill_node( # Setup environment variables for GPU script setup_env_vars_for_gpu_script(prefill_host_ip, rank, total_gpus, total_nodes) - + # Use appropriate GPU script instead of generating command directly cmd_to_run = get_gpu_command("prefill", use_sglang_commands, gpu_type) return run_command(cmd_to_run) + def setup_decode_node( rank: int, decode_host_ip: str, @@ -290,7 +310,7 @@ def setup_decode_node( total_nodes: int, total_gpus: int, use_sglang_commands: bool, - gpu_type: str + gpu_type: str, ) -> int: """ Setup the decode node. 
@@ -298,7 +318,9 @@ def setup_decode_node( logging.info(f"Setting up child decode node: {rank}") if use_sglang_commands: - sgl_mini_lb_cmd = get_sglang_mini_lb_command_args(prefill_host_ip, decode_host_ip) + sgl_mini_lb_cmd = get_sglang_mini_lb_command_args( + prefill_host_ip, decode_host_ip + ) run_command(sgl_mini_lb_cmd, background=True) else: if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): @@ -306,7 +328,7 @@ def setup_decode_node( # Setup environment variables for GPU script setup_env_vars_for_gpu_script(decode_host_ip, rank, total_gpus, total_nodes) - + # Use appropriate GPU script instead of generating command directly cmd_to_run = get_gpu_command("decode", use_sglang_commands, gpu_type) return run_command(cmd_to_run) @@ -346,7 +368,7 @@ def main(input_args: list[str] | None = None): args.total_nodes, args.total_nodes * args.gpus_per_node, args.use_sglang_commands, - args.gpu_type + args.gpu_type, ) else: setup_decode_node( @@ -356,7 +378,7 @@ def main(input_args: list[str] | None = None): args.total_nodes, args.total_nodes * args.gpus_per_node, args.use_sglang_commands, - args.gpu_type + args.gpu_type, ) logging.info(f"{args.worker_type.capitalize()} node setup complete") diff --git a/examples/sglang/slurm_jobs/submit_job_script.py b/examples/sglang/slurm_jobs/submit_job_script.py index 510aa40fb2..3b08c26827 100644 --- a/examples/sglang/slurm_jobs/submit_job_script.py +++ b/examples/sglang/slurm_jobs/submit_job_script.py @@ -104,8 +104,10 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac "--gpu-type", choices=["h100", "gb200"], default="h100", help="GPU type to use" ) parser.add_argument( - "--use-sglang-commands", action="store_true", default=False, - help="Use SGLang commands instead of Dynamo" + "--use-sglang-commands", + action="store_true", + default=False, + help="Use SGLang commands instead of Dynamo", ) return parser.parse_args(args) From 1265ac8495587a511e55ca96f6184a917436dfdb Mon Sep 17 
00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 02:55:37 +0000 Subject: [PATCH 14/65] mypy --- examples/sglang/slurm_jobs/scripts/worker_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index 8a039c7b40..f0fdea75ff 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -206,7 +206,7 @@ def _validate_args(args: argparse.Namespace) -> None: raise ValueError("GPUs per node must be at least 1") -def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> dict: +def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> str: cmd = ( f"python3 -m sglang.srt.disaggregation.launch_lb " f"--prefill http://{prefill_host_ip}:30000 " From bf8d1035511dd699dceef4ad038bbda1b337ab59 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 03:10:11 +0000 Subject: [PATCH 15/65] bump --- examples/sglang/slurm_jobs/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/sglang/slurm_jobs/README.md b/examples/sglang/slurm_jobs/README.md index d7367de965..a15ab59ca6 100644 --- a/examples/sglang/slurm_jobs/README.md +++ b/examples/sglang/slurm_jobs/README.md @@ -123,6 +123,15 @@ For simplicity of the example, we will make some assumptions about your SLURM cl tail -f logs/{JOB_ID}/log.out ``` + You can view logs of all prefill or decode workers simultaneously by running: + ```bash + # prefill workers err (or .out) + tail -f logs/{JOB_ID}/*_prefill.err + + # decode workers err (or .out) + tail -f logs/{JOB_ID}/*_decode.err + ``` + 5. 
**Monitor GPU utilization**: ```bash tail -f logs/{JOB_ID}/{node}_prefill_gpu_utilization.log From 51ca695a1dd2cb1dbdebc5e54cec6bfb6016cd11 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 20:30:03 -0700 Subject: [PATCH 16/65] pc --- examples/sglang/slurm_jobs/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/sglang/slurm_jobs/README.md b/examples/sglang/slurm_jobs/README.md index a15ab59ca6..da0f00dd41 100644 --- a/examples/sglang/slurm_jobs/README.md +++ b/examples/sglang/slurm_jobs/README.md @@ -126,10 +126,10 @@ For simplicity of the example, we will make some assumptions about your SLURM cl You can view logs of all prefill or decode workers simultaneously by running: ```bash # prefill workers err (or .out) - tail -f logs/{JOB_ID}/*_prefill.err - + tail -f logs/{JOB_ID}/*_prefill.err + # decode workers err (or .out) - tail -f logs/{JOB_ID}/*_decode.err + tail -f logs/{JOB_ID}/*_decode.err ``` 5. **Monitor GPU utilization**: From 40c5d33015099c88f569d4b3b99b9fb777705b78 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 03:30:49 +0000 Subject: [PATCH 17/65] cpy --- examples/sglang/slurm_jobs/scripts/gb200.sh | 2 ++ examples/sglang/slurm_jobs/scripts/h100.sh | 2 ++ 2 files changed, 4 insertions(+) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 518c4af671..af4d3aa549 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -1,4 +1,6 @@ #!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 # Function to print usage print_usage() { diff --git a/examples/sglang/slurm_jobs/scripts/h100.sh b/examples/sglang/slurm_jobs/scripts/h100.sh index a00b63f554..b457484e3a 100755 --- a/examples/sglang/slurm_jobs/scripts/h100.sh +++ b/examples/sglang/slurm_jobs/scripts/h100.sh @@ -1,4 +1,6 @@ #!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # Function to print usage print_usage() { From c1a8e5abfc23b149b1ea840dc7b72e8494aa53a5 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 18:35:52 +0000 Subject: [PATCH 18/65] combined echo and print based on pr comment --- .../sglang/slurm_jobs/job_script_template.j2 | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index afe05c5662..b6fe92416d 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -80,11 +80,10 @@ for i in $(seq 0 $((PREFILL_NODES - 1))); do node=${nodes[$i]} rank=$i echo "Launching prefill task on node ${i} (rank ${rank}): $node" - echo "Srun args: $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err" - echo "Command: python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log ${WORKER_ARGS} &" - srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node \ - --output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err \ - python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill 
--gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log ${WORKER_ARGS} & + + cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log ${WORKER_ARGS} &" + echo "$cmd" + $cmd done # Launch decode tasks on the next DECODE_NODES nodes @@ -92,11 +91,10 @@ for i in $(seq $PREFILL_NODES $((PREFILL_NODES + DECODE_NODES - 1))); do node=${nodes[$i]} rank=$((i - PREFILL_NODES)) echo "Launching decode task on node ${i} (rank ${rank}): $node" - echo "Srun args: $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err" - echo "Command: python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log ${WORKER_ARGS} &" - srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node \ - --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err \ - python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log ${WORKER_ARGS} & + + cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log ${WORKER_ARGS} &" + echo "$cmd" + $cmd done echo "" 
From 2033b1f5be4e3e6c5c7f9d9f7cb8d6ce86e91bfb Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 22:44:11 +0000 Subject: [PATCH 19/65] option1 --- examples/sglang/slurm_jobs/job_script_template.j2 | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index b6fe92416d..fca83b6726 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -41,7 +41,7 @@ done if [ "$GPU_TYPE" = "gb200" ]; then PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} hostname -I | awk '{print $1}') else - PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ifconfig $NETWORK_INTERFACE | grep -oP 'inet \K[0-9.]+') + PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ip route get $(getent hosts ${nodes[0]} | awk '{ print $1 }') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') fi if [ -z "$PREFILL_HOST_IP" ]; then echo "Error: Could not retrieve IP address for prefill host ${nodes[0]} on interface $NETWORK_INTERFACE" @@ -52,7 +52,7 @@ echo "Prefill host IP address: $PREFILL_HOST_IP" if [ "$GPU_TYPE" = "gb200" ]; then DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} hostname -I | awk '{print $1}') else - DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} ifconfig $NETWORK_INTERFACE | grep -oP 'inet \K[0-9.]+') + DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} ip route get $(getent hosts ${nodes[$PREFILL_NODES]} | awk '{ print $1 }') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') fi if [ -z "$DECODE_HOST_IP" ]; then echo "Error: Could not retrieve IP address for decode host ${nodes[$PREFILL_NODES]} on interface $NETWORK_INTERFACE" @@ -81,9 +81,9 @@ for i in $(seq 0 $((PREFILL_NODES - 1))); do rank=$i echo "Launching prefill task on node ${i} 
(rank ${rank}): $node" - cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log ${WORKER_ARGS} &" + cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log ${WORKER_ARGS}" echo "$cmd" - $cmd + $cmd & done # Launch decode tasks on the next DECODE_NODES nodes @@ -92,9 +92,9 @@ for i in $(seq $PREFILL_NODES $((PREFILL_NODES + DECODE_NODES - 1))); do rank=$((i - PREFILL_NODES)) echo "Launching decode task on node ${i} (rank ${rank}): $node" - cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log ${WORKER_ARGS} &" + cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log ${WORKER_ARGS}" echo "$cmd" - $cmd + $cmd & done echo "" From 44330642ed9b6114d66f06352af4ca5f1fcd8997 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: 
Tue, 8 Jul 2025 22:55:31 +0000 Subject: [PATCH 20/65] option1 --- examples/sglang/slurm_jobs/job_script_template.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index fca83b6726..6460a7a23e 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -41,7 +41,7 @@ done if [ "$GPU_TYPE" = "gb200" ]; then PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} hostname -I | awk '{print $1}') else - PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ip route get $(getent hosts ${nodes[0]} | awk '{ print $1 }') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') + PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ip route get $(getent ahosts ${nodes[0]} | grep STREAM | head -1 | awk '{print $1}') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') fi if [ -z "$PREFILL_HOST_IP" ]; then echo "Error: Could not retrieve IP address for prefill host ${nodes[0]} on interface $NETWORK_INTERFACE" @@ -52,7 +52,7 @@ echo "Prefill host IP address: $PREFILL_HOST_IP" if [ "$GPU_TYPE" = "gb200" ]; then DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} hostname -I | awk '{print $1}') else - DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} ip route get $(getent hosts ${nodes[$PREFILL_NODES]} | awk '{ print $1 }') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') + DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} ip route get $(getent ahosts ${nodes[$PREFILL_NODES]} | grep STREAM | head -1 | awk '{print $1}') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') fi if [ -z "$DECODE_HOST_IP" ]; then echo "Error: Could not retrieve IP address for decode host ${nodes[$PREFILL_NODES]} on interface $NETWORK_INTERFACE" From 
d827ac7bdd3120af3c63cacb554003ae34a7e16c Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 23:01:39 +0000 Subject: [PATCH 21/65] works on h100 and gb200 cluster --- examples/sglang/slurm_jobs/job_script_template.j2 | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index 6460a7a23e..90da04c7ad 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -38,22 +38,14 @@ for i in "${!nodes[@]}"; do echo "Node $i: ${nodes[$i]}" done -if [ "$GPU_TYPE" = "gb200" ]; then - PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} hostname -I | awk '{print $1}') -else - PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ip route get $(getent ahosts ${nodes[0]} | grep STREAM | head -1 | awk '{print $1}') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') -fi +PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ip route get $(getent ahosts ${nodes[0]} | grep STREAM | head -1 | awk '{print $1}') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') if [ -z "$PREFILL_HOST_IP" ]; then echo "Error: Could not retrieve IP address for prefill host ${nodes[0]} on interface $NETWORK_INTERFACE" exit 1 fi echo "Prefill host IP address: $PREFILL_HOST_IP" -if [ "$GPU_TYPE" = "gb200" ]; then - DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} hostname -I | awk '{print $1}') -else - DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} ip route get $(getent ahosts ${nodes[$PREFILL_NODES]} | grep STREAM | head -1 | awk '{print $1}') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') -fi +DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} ip route get $(getent ahosts ${nodes[$PREFILL_NODES]} | grep STREAM | head -1 | awk '{print $1}') | awk 
'{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') if [ -z "$DECODE_HOST_IP" ]; then echo "Error: Could not retrieve IP address for decode host ${nodes[$PREFILL_NODES]} on interface $NETWORK_INTERFACE" exit 1 From c2fe7b0ff750be45ec51d4bc76a53b1694c36639 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 23:21:31 +0000 Subject: [PATCH 22/65] added nats etcd ingress setup function --- .../sglang/slurm_jobs/scripts/worker_setup.py | 53 ++++++++++--------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index f0fdea75ff..e097ab8d2d 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -249,6 +249,32 @@ def get_gpu_command(worker_type: str, use_sglang_commands: bool, gpu_type: str) return f"bash {script_path} {mode} {cmd}" +def setup_nats_etcd_and_ingress(prefill_host_ip: str) -> None: + logging.info(f"Starting nats server on node {prefill_host_ip}") + + nats_process = run_command("nats-server -js", background=True) + if not nats_process: + raise RuntimeError("Failed to start nats-server") + + logging.info(f"Starting etcd server on node {prefill_host_ip}") + etcd_cmd = ( + f"etcd --listen-client-urls {ETCD_LISTEN_ADDR}:{ETCD_CLIENT_PORT} " + f"--advertise-client-urls {ETCD_LISTEN_ADDR}:{ETCD_CLIENT_PORT} " + f"--listen-peer-urls {ETCD_LISTEN_ADDR}:{ETCD_PEER_PORT} " + f"--initial-cluster default=http://{prefill_host_ip}:{ETCD_PEER_PORT}" + ) + + etcd_process = run_command(etcd_cmd, background=True) + if not etcd_process: + raise RuntimeError("Failed to start etcd") + + logging.info(f"Starting ingress server on node {prefill_host_ip}") + ingress_process = run_command( + "dynamo run in=http out=dyn --http-port=8000", background=True + ) + if not ingress_process: + raise RuntimeError("Failed to start ingress") + def setup_prefill_node( rank: int, prefill_host_ip: str, @@ 
-262,32 +288,7 @@ def setup_prefill_node( """ if not use_sglang_commands: if rank == 0: - logging.info(f"Setting up host prefill node: {rank}") - logging.info( - f"Starting nats server on node {rank} with IP {prefill_host_ip}" - ) - - nats_process = run_command("nats-server -js", background=True) - if not nats_process: - raise RuntimeError("Failed to start nats-server") - - etcd_cmd = ( - f"etcd --listen-client-urls {ETCD_LISTEN_ADDR}:{ETCD_CLIENT_PORT} " - f"--advertise-client-urls {ETCD_LISTEN_ADDR}:{ETCD_CLIENT_PORT} " - f"--listen-peer-urls {ETCD_LISTEN_ADDR}:{ETCD_PEER_PORT} " - f"--initial-cluster default=http://{prefill_host_ip}:{ETCD_PEER_PORT}" - ) - - etcd_process = run_command(etcd_cmd, background=True) - if not etcd_process: - raise RuntimeError("Failed to start etcd") - - ingress_process = run_command( - "dynamo run in=http out=dyn --http-port=8000", background=True - ) - if not ingress_process: - raise RuntimeError("Failed to start ingress") - + setup_nats_etcd_and_ingress(prefill_host_ip) else: logging.info(f"Setting up child prefill node: {rank}") if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): From 30b9d8ec879f346e4d21d1ca1d4ec44ae5de29b3 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 16:22:37 -0700 Subject: [PATCH 23/65] precommit --- examples/sglang/slurm_jobs/job_script_template.j2 | 4 ++-- examples/sglang/slurm_jobs/scripts/worker_setup.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index 90da04c7ad..2e873c42fa 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -72,7 +72,7 @@ for i in $(seq 0 $((PREFILL_NODES - 1))); do node=${nodes[$i]} rank=$i echo "Launching prefill task on node ${i} (rank ${rank}): $node" - + cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node 
--output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log ${WORKER_ARGS}" echo "$cmd" $cmd & @@ -83,7 +83,7 @@ for i in $(seq $PREFILL_NODES $((PREFILL_NODES + DECODE_NODES - 1))); do node=${nodes[$i]} rank=$((i - PREFILL_NODES)) echo "Launching decode task on node ${i} (rank ${rank}): $node" - + cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log ${WORKER_ARGS}" echo "$cmd" $cmd & diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index e097ab8d2d..a00e2a1b95 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -275,6 +275,7 @@ def setup_nats_etcd_and_ingress(prefill_host_ip: str) -> None: if not ingress_process: raise RuntimeError("Failed to start ingress") + def setup_prefill_node( rank: int, prefill_host_ip: str, From 3a66d66e66bb23cc9a1b13ee85550f17ab324a1e Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 23:42:02 +0000 Subject: [PATCH 24/65] add server --- .../sglang/slurm_jobs/scripts/worker_setup.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index e097ab8d2d..a806c37643 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -249,7 
+249,10 @@ def get_gpu_command(worker_type: str, use_sglang_commands: bool, gpu_type: str) return f"bash {script_path} {mode} {cmd}" -def setup_nats_etcd_and_ingress(prefill_host_ip: str) -> None: +def setup_head_prefill_node(prefill_host_ip: str) -> None: + """ + Setup NATS, etcd, ingress, and http servers on the prefill host node. + """ logging.info(f"Starting nats server on node {prefill_host_ip}") nats_process = run_command("nats-server -js", background=True) @@ -274,6 +277,14 @@ def setup_nats_etcd_and_ingress(prefill_host_ip: str) -> None: ) if not ingress_process: raise RuntimeError("Failed to start ingress") + + logging.info(f"Starting http server on port 9001for flush_cache endpoint on node {prefill_host_ip}") + cache_flush_server_cmd = ( + f"python3 utils/sgl_http_server.py --ns dynamo" + ) + cache_flush_server_process = run_command(cache_flush_server_cmd, background=True) + if not cache_flush_server_process: + raise RuntimeError("Failed to start cache flush server") def setup_prefill_node( rank: int, @@ -288,7 +299,7 @@ def setup_prefill_node( """ if not use_sglang_commands: if rank == 0: - setup_nats_etcd_and_ingress(prefill_host_ip) + setup_head_prefill_node(prefill_host_ip) else: logging.info(f"Setting up child prefill node: {rank}") if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): From 62f2ea52fe1a04b7fd492a8cdf6ec3f1257de1a1 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 9 Jul 2025 18:34:21 +0000 Subject: [PATCH 25/65] bump time limit --- examples/sglang/slurm_jobs/submit_job_script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sglang/slurm_jobs/submit_job_script.py b/examples/sglang/slurm_jobs/submit_job_script.py index 3b08c26827..196de92a0d 100644 --- a/examples/sglang/slurm_jobs/submit_job_script.py +++ b/examples/sglang/slurm_jobs/submit_job_script.py @@ -86,7 +86,7 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac parser.add_argument("--config-dir", 
required=True, help="Config directory path") parser.add_argument("--container-image", required=True, help="Container image") parser.add_argument( - "--time-limit", default="01:00:00", help="Time limit (HH:MM:SS)" + "--time-limit", default="04:00:00", help="Time limit (HH:MM:SS)" ) parser.add_argument( "--prefill-nodes", type=int, default=2, help="Number of prefill nodes" From ec9aa138569891dd3085f38edb19a4eb22aaa9ba Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 10 Jul 2025 00:29:52 +0000 Subject: [PATCH 26/65] fix mooncake env vars on gb200 --- examples/sglang/slurm_jobs/scripts/gb200.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index af4d3aa549..51443c4a46 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -75,11 +75,8 @@ if [ "$mode" = "prefill" ]; then elif [ "$cmd" = "sglang" ]; then # GB200 sglang prefill command SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \ - NCCL_MNNVL_ENABLE=1 \ - NCCL_CUMEM_ENABLE=1 \ - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ - SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ - PYTHONUNBUFFERED=1 \ + SGLANG_MOONCAKE_ALLOCATOR_SO_PATH=/configs/hook.so \ + SGLANG_MOONCAKE_CUSTOM_POOL=True \ python3 -m sglang.launch_server \ --served-model-name deepseek-ai/DeepSeek-R1 \ --model-path /model/ \ @@ -120,6 +117,8 @@ elif [ "$mode" = "decode" ]; then # GB200 sglang decode command SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=768 \ SGLANG_NUM_RESERVED_DECODE_TOKENS=176 \ + SGLANG_MOONCAKE_ALLOCATOR_SO_PATH=/configs/hook.so \ + SGLANG_MOONCAKE_CUSTOM_POOL=True \ python3 -m sglang.launch_server \ --model-path /model/ \ --trust-remote-code \ From 75d2a25e1c662483de327cf3ef5bb521b1a9c64c Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 10 Jul 2025 00:47:33 +0000 Subject: [PATCH 27/65] bump --- examples/sglang/slurm_jobs/scripts/gb200.sh | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 51443c4a46..a431cbecdb 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -82,7 +82,7 @@ if [ "$mode" = "prefill" ]; then --model-path /model/ \ --trust-remote-code \ --disaggregation-mode prefill \ - --disaggregation-transfer-backend nixl \ + --disaggregation-transfer-backend mooncake \ --dist-init-addr "$HOST_IP:$PORT" \ --nnodes "$TOTAL_NODES" \ --node-rank "$RANK" \ @@ -122,7 +122,7 @@ elif [ "$mode" = "decode" ]; then python3 -m sglang.launch_server \ --model-path /model/ \ --trust-remote-code \ - --disaggregation-transfer-backend nixl \ + --disaggregation-transfer-backend mooncake \ --disaggregation-mode decode \ --dist-init-addr "$HOST_IP:$PORT" \ --nnodes "$TOTAL_NODES" \ From d2ae39fc7621efc5c53806f4e20a639c539f7d45 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 10 Jul 2025 21:43:37 +0000 Subject: [PATCH 28/65] use sglang rust balancer --- examples/sglang/slurm_jobs/scripts/worker_setup.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index eea8489296..5e638f819e 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -208,12 +208,14 @@ def _validate_args(args: argparse.Namespace) -> None: def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> str: cmd = ( - f"python3 -m sglang.srt.disaggregation.launch_lb " - f"--prefill http://{prefill_host_ip}:30000 " - f"--decode http://{decode_host_ip}:30000 " - "--host 0.0.0.0 " - "--port 8000 " - "--timeout 3600" + f"python3 -m sglang_router.launch_router " + f"--policy prefill_decode " + f"--prefill-urls http://{prefill_host_ip}:30000:8998 " + f"--decode-urls 
http://{decode_host_ip}:30000 " + f"--pd-policy random " + f"--host 0.0.0.0 " + f"--port 8000 " + f"--timeout 3600" ) return cmd From 4f22a17a472e9c9839a437f8ee1a2e7ed54f0963 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 10 Jul 2025 21:53:08 +0000 Subject: [PATCH 29/65] fix --- examples/sglang/slurm_jobs/scripts/worker_setup.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index 5e638f819e..0b48087403 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -208,14 +208,14 @@ def _validate_args(args: argparse.Namespace) -> None: def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> str: cmd = ( + "pip install sglang-router; " f"python3 -m sglang_router.launch_router " - f"--policy prefill_decode " - f"--prefill-urls http://{prefill_host_ip}:30000:8998 " - f"--decode-urls http://{decode_host_ip}:30000 " - f"--pd-policy random " + f"--policy random " + f"--pd-disaggregation " + f"--prefill http://{prefill_host_ip}:30000 30001 " + f"--decode http://{decode_host_ip}:30000 " f"--host 0.0.0.0 " f"--port 8000 " - f"--timeout 3600" ) return cmd From 7cac87ed3e88a7d0b833b7eb33b124450504bf8d Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 10 Jul 2025 22:06:33 +0000 Subject: [PATCH 30/65] another --- examples/sglang/slurm_jobs/scripts/worker_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index 0b48087403..c3e95c65ca 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -208,7 +208,7 @@ def _validate_args(args: argparse.Namespace) -> None: def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> str: cmd = ( - 
"pip install sglang-router; " + "pip install sglang-router && " f"python3 -m sglang_router.launch_router " f"--policy random " f"--pd-disaggregation " From c9c5e26da70ac9401b6071958ff3ffadf133fd35 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Fri, 11 Jul 2025 20:24:44 +0000 Subject: [PATCH 31/65] gb 200 but nixl --- examples/sglang/slurm_jobs/scripts/gb200.sh | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index a431cbecdb..38799c2547 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -77,12 +77,15 @@ if [ "$mode" = "prefill" ]; then SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \ SGLANG_MOONCAKE_ALLOCATOR_SO_PATH=/configs/hook.so \ SGLANG_MOONCAKE_CUSTOM_POOL=True \ + NIXL_LOG_LEVEL=TRACE \ + UCX_LOG_LEVEL=debug \ + MC_FORCE_MNNVL=1 \ python3 -m sglang.launch_server \ --served-model-name deepseek-ai/DeepSeek-R1 \ --model-path /model/ \ --trust-remote-code \ --disaggregation-mode prefill \ - --disaggregation-transfer-backend mooncake \ + --disaggregation-transfer-backend nixl \ --dist-init-addr "$HOST_IP:$PORT" \ --nnodes "$TOTAL_NODES" \ --node-rank "$RANK" \ @@ -107,7 +110,8 @@ if [ "$mode" = "prefill" ]; then --disable-cuda-graph \ --chunked-prefill-size 16384 \ --max-total-tokens 32768 \ - --mem-fraction-static 0.9 + --mem-fraction-static 0.9 \ + --log-level debug fi elif [ "$mode" = "decode" ]; then if [ "$cmd" = "dynamo" ]; then @@ -119,6 +123,9 @@ elif [ "$mode" = "decode" ]; then SGLANG_NUM_RESERVED_DECODE_TOKENS=176 \ SGLANG_MOONCAKE_ALLOCATOR_SO_PATH=/configs/hook.so \ SGLANG_MOONCAKE_CUSTOM_POOL=True \ + NIXL_LOG_LEVEL=TRACE \ + UCX_LOG_LEVEL=debug \ + MC_FORCE_MNNVL=1 \ python3 -m sglang.launch_server \ --model-path /model/ \ --trust-remote-code \ @@ -147,6 +154,7 @@ elif [ "$mode" = "decode" ]; then --attention-backend cutlass_mla \ --watchdog-timeout 1000000 \ 
--chunked-prefill-size 36864 \ - --mem-fraction-static 0.82 + --mem-fraction-static 0.82 \ + --log-level debug fi fi From 1afee3c82dd167c127a2861fe81e6684b614e710 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Fri, 11 Jul 2025 20:28:09 +0000 Subject: [PATCH 32/65] bump --- examples/sglang/slurm_jobs/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/sglang/slurm_jobs/README.md b/examples/sglang/slurm_jobs/README.md index da0f00dd41..661fb62278 100644 --- a/examples/sglang/slurm_jobs/README.md +++ b/examples/sglang/slurm_jobs/README.md @@ -62,6 +62,9 @@ For simplicity of the example, we will make some assumptions about your SLURM cl ## Usage 1. **Submit a benchmark job**: + + > **Note:** The logic for finding the prefill and decode node IP addresses in [`job_script_template.j2`](job_script_template.j2) is still a work in progress. You may need to tweak the `srun`/`ip route`/`getent`/`awk` commands for your cluster setup, especially if your networking or hostname conventions differ. PRs and suggestions welcome. + ```bash python submit_job_script.py \ --template job_script_template.j2 \ From 3da74682408fbc5e8c516eb3ed4a1df5b6613604 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Fri, 11 Jul 2025 20:29:40 +0000 Subject: [PATCH 33/65] bump --- examples/sglang/slurm_jobs/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/sglang/slurm_jobs/README.md b/examples/sglang/slurm_jobs/README.md index 661fb62278..e9389d41cd 100644 --- a/examples/sglang/slurm_jobs/README.md +++ b/examples/sglang/slurm_jobs/README.md @@ -63,7 +63,8 @@ For simplicity of the example, we will make some assumptions about your SLURM cl 1. **Submit a benchmark job**: - > **Note:** The logic for finding the prefill and decode node IP addresses in [`job_script_template.j2`](job_script_template.j2) is still a work in progress. 
You may need to tweak the `srun`/`ip route`/`getent`/`awk` commands for your cluster setup, especially if your networking or hostname conventions differ. PRs and suggestions welcome. + > [!NOTE] + > The logic for finding prefill and decode node IPs in [`job_script_template.j2`](job_script_template.j2) is still a work in progress. You may need to tweak the `srun`/`ip route`/`getent`/`awk` bits for your cluster, especially if your networking or hostname conventions differ. PRs and suggestions welcome. ```bash python submit_job_script.py \ From d32c09a5de0429dfa0551a1aaa261e3ef69b0d02 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Fri, 11 Jul 2025 20:30:29 +0000 Subject: [PATCH 34/65] pc --- examples/sglang/slurm_jobs/README.md | 2 +- examples/sglang/slurm_jobs/scripts/worker_setup.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/sglang/slurm_jobs/README.md b/examples/sglang/slurm_jobs/README.md index e9389d41cd..de91444cae 100644 --- a/examples/sglang/slurm_jobs/README.md +++ b/examples/sglang/slurm_jobs/README.md @@ -63,7 +63,7 @@ For simplicity of the example, we will make some assumptions about your SLURM cl 1. **Submit a benchmark job**: - > [!NOTE] + > [!NOTE] > The logic for finding prefill and decode node IPs in [`job_script_template.j2`](job_script_template.j2) is still a work in progress. You may need to tweak the `srun`/`ip route`/`getent`/`awk` bits for your cluster, especially if your networking or hostname conventions differ. PRs and suggestions welcome. 
```bash diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index c3e95c65ca..5e6aa9309a 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -279,11 +279,11 @@ def setup_head_prefill_node(prefill_host_ip: str) -> None: ) if not ingress_process: raise RuntimeError("Failed to start ingress") - - logging.info(f"Starting http server on port 9001for flush_cache endpoint on node {prefill_host_ip}") - cache_flush_server_cmd = ( - f"python3 utils/sgl_http_server.py --ns dynamo" + + logging.info( + f"Starting http server on port 9001 for flush_cache endpoint on node {prefill_host_ip}" ) + cache_flush_server_cmd = "python3 utils/sgl_http_server.py --ns dynamo" cache_flush_server_process = run_command(cache_flush_server_cmd, background=True) if not cache_flush_server_process: raise RuntimeError("Failed to start cache flush server") From d9cc3ff6148de2a32a0da5ab9f26df06bc00daaf Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Fri, 11 Jul 2025 20:31:38 +0000 Subject: [PATCH 35/65] go --- examples/sglang/slurm_jobs/README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/sglang/slurm_jobs/README.md b/examples/sglang/slurm_jobs/README.md index de91444cae..ba8539a41a 100644 --- a/examples/sglang/slurm_jobs/README.md +++ b/examples/sglang/slurm_jobs/README.md @@ -61,11 +61,10 @@ For simplicity of the example, we will make some assumptions about your SLURM cl ## Usage -1. **Submit a benchmark job**: - - > [!NOTE] - > The logic for finding prefill and decode node IPs in [`job_script_template.j2`](job_script_template.j2) is still a work in progress. You may need to tweak the `srun`/`ip route`/`getent`/`awk` bits for your cluster, especially if your networking or hostname conventions differ. PRs and suggestions welcome. 
+> [!NOTE] +> The logic for finding prefill and decode node IPs in [`job_script_template.j2`](job_script_template.j2) is still a work in progress. You may need to tweak the `srun`/`ip route`/`getent`/`awk` bits for your cluster, especially if your networking or hostname conventions differ. PRs and suggestions welcome. +1. **Submit a benchmark job**: ```bash python submit_job_script.py \ --template job_script_template.j2 \ From dc82cabee29bce3c07cbb143f0d02a17ff68576f Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Sun, 13 Jul 2025 23:40:47 +0000 Subject: [PATCH 36/65] bump --- examples/sglang/slurm_jobs/scripts/gb200.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 38799c2547..c914d31c73 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -95,7 +95,7 @@ if [ "$mode" = "prefill" ]; then --host 0.0.0.0 \ --decode-log-interval 1 \ --max-running-requests 6144 \ - --context-length 2176 \ + --context-length 10000 \ --disable-radix-cache \ --enable-deepep-moe \ --deepep-mode low_latency \ @@ -140,7 +140,7 @@ elif [ "$mode" = "decode" ]; then --host 0.0.0.0 \ --decode-log-interval 1 \ --max-running-requests 36864 \ - --context-length 2176 \ + --context-length 10000 \ --disable-radix-cache \ --enable-deepep-moe \ --deepep-mode low_latency \ From efb01b12ccb01932fd4175acf9a292cb7b78c2fd Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Sun, 13 Jul 2025 23:56:42 +0000 Subject: [PATCH 37/65] bump --- examples/sglang/slurm_jobs/scripts/gb200.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index c914d31c73..9879f0c567 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -76,7 +76,7 @@ if [ "$mode" = "prefill" ]; then # GB200 sglang 
prefill command SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \ SGLANG_MOONCAKE_ALLOCATOR_SO_PATH=/configs/hook.so \ - SGLANG_MOONCAKE_CUSTOM_POOL=True \ + SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ NIXL_LOG_LEVEL=TRACE \ UCX_LOG_LEVEL=debug \ MC_FORCE_MNNVL=1 \ @@ -122,7 +122,7 @@ elif [ "$mode" = "decode" ]; then SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=768 \ SGLANG_NUM_RESERVED_DECODE_TOKENS=176 \ SGLANG_MOONCAKE_ALLOCATOR_SO_PATH=/configs/hook.so \ - SGLANG_MOONCAKE_CUSTOM_POOL=True \ + SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ NIXL_LOG_LEVEL=TRACE \ UCX_LOG_LEVEL=debug \ MC_FORCE_MNNVL=1 \ From 2db0ebccf7e19c2fa8d791a6c8d09cd8cd10d628 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 14 Jul 2025 04:46:18 +0000 Subject: [PATCH 38/65] tot dynamo --- container/Dockerfile.sglang-deepep | 11 +--- container/Dockerfile.sglang-gb200 | 88 ++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 8 deletions(-) create mode 100644 container/Dockerfile.sglang-gb200 diff --git a/container/Dockerfile.sglang-deepep b/container/Dockerfile.sglang-deepep index 53e001f82e..4a807fd6b2 100644 --- a/container/Dockerfile.sglang-deepep +++ b/container/Dockerfile.sglang-deepep @@ -71,10 +71,8 @@ RUN rm -rf /opt/hpcx/ucx && \ ENV LD_LIBRARY_PATH=/usr/lib:/usr/local/ucx/lib:$LD_LIBRARY_PATH -# Pinning to NIXL 0.2.1 right now -# There is a fix that was merged into SGLang after 0.4.8.post1 -# TODO: Investigate perf hit of that change before we bump to up to date NIXL -ARG NIXL_COMMIT="5e4c179ee850d482a83cb2a211e0947e46281060" +# Pinnning to NIXL 0.3.1 +ARG NIXL_COMMIT="3503658e71143b56f9d5b1b440d84a94b9c41af8" RUN git clone https://github.com/ai-dynamo/nixl.git && cd nixl && git checkout ${NIXL_COMMIT} && pip install --break-system-packages . 
--config-settings=setup-args="-Ducx_path=/usr/local/ucx" WORKDIR /sgl-workspace @@ -89,10 +87,7 @@ RUN pip install --break-system-packages "sglang==0.4.8.post1" ENV SGL_FORCE_SHUTDOWN=1 WORKDIR /sgl-workspace -# include flush cache endpoint and server support -# https://github.com/ai-dynamo/dynamo/pull/1769 -ARG DYNAMO_COMMIT="bd91dca6141e05bcfbe9bd4dea54cc58b9e37d75" -RUN git clone https://github.com/ai-dynamo/dynamo.git && cd dynamo && git checkout ${DYNAMO_COMMIT} +RUN git clone https://github.com/ai-dynamo/dynamo.git # install dynamo in editable mode WORKDIR /sgl-workspace/dynamo diff --git a/container/Dockerfile.sglang-gb200 b/container/Dockerfile.sglang-gb200 new file mode 100644 index 0000000000..84c413a1ef --- /dev/null +++ b/container/Dockerfile.sglang-gb200 @@ -0,0 +1,88 @@ +FROM sglarm:latest + +# Define architecture variables for ARM64 +ARG ARCH=arm64 +ARG ARCH_ALT=aarch64 + +WORKDIR /sgl-workspace +# include flush cache endpoint and server support +# https://github.com/ai-dynamo/dynamo/pull/1769 +ARG DYNAMO_COMMIT="bd91dca6141e05bcfbe9bd4dea54cc58b9e37d75" +RUN git clone https://github.com/ai-dynamo/dynamo.git && cd dynamo && git checkout ${DYNAMO_COMMIT} + +# install dynamo in editable mode +WORKDIR /sgl-workspace/dynamo +# Rust build/dev dependencies +RUN apt update -y && \ + apt install --no-install-recommends -y \ + build-essential \ + protobuf-compiler \ + cmake \ + libssl-dev \ + pkg-config \ + clang \ + libclang-dev \ + git + +# Define Rust target based on ARCH_ALT ARG +ARG RUSTARCH=${ARCH_ALT}-unknown-linux-gnu + +ENV RUSTUP_HOME=/usr/local/rustup \ + CARGO_HOME=/usr/local/cargo \ + PATH=/usr/local/cargo/bin:$PATH \ + RUST_VERSION=1.86.0 + +# Install Rust using RUSTARCH derived from ARCH_ALT +RUN wget --tries=3 --waitretry=5 "https://static.rust-lang.org/rustup/archive/1.28.1/${RUSTARCH}/rustup-init" && \ + # TODO: Add SHA check back based on RUSTARCH + chmod +x rustup-init && \ + ./rustup-init -y --no-modify-path --profile minimal 
--default-toolchain $RUST_VERSION --default-host ${RUSTARCH} && \ + rm rustup-init && \ + chmod -R a+w $RUSTUP_HOME $CARGO_HOME + +ARG CARGO_BUILD_JOBS +# Set CARGO_BUILD_JOBS to 16 if not provided +# This is to prevent cargo from building $(nproc) jobs in parallel, +# which might exceed the number of opened files limit. +ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} + +RUN cargo build --release +RUN mkdir -p deploy/sdk/src/dynamo/sdk/cli/bin +RUN cp target/release/http deploy/sdk/src/dynamo/sdk/cli/bin +RUN cp target/release/llmctl deploy/sdk/src/dynamo/sdk/cli/bin +RUN cp target/release/dynamo-run deploy/sdk/src/dynamo/sdk/cli/bin + +RUN cd lib/bindings/python && pip install --break-system-packages -e . && cd ../../.. +RUN pip install --break-system-packages -e . + +ENV PYTHONPATH=/sgl-workspace/dynamo/components/planner/src:/sgl-workspace/dynamo/examples/sglang:$PYTHONPATH + +RUN wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/v2.10.28/nats-server-v2.10.28-${ARCH}.deb && \ + dpkg -i nats-server-v2.10.28-${ARCH}.deb && rm nats-server-v2.10.28-${ARCH}.deb + +ENV ETCD_VERSION="v3.5.21" +RUN wget --tries=3 --waitretry=5 https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-${ARCH}.tar.gz -O /tmp/etcd.tar.gz && \ + mkdir -p /usr/local/bin/etcd && \ + tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 && \ + rm /tmp/etcd.tar.gz +ENV PATH=/usr/local/bin/etcd/:$PATH + +# Install perf_analyzer and genai-perf +RUN apt-get update -y && \ + apt-get install -y --no-install-recommends \ + rapidjson-dev \ + zlib1g-dev + +RUN git clone --depth=1 https://github.com/triton-inference-server/perf_analyzer.git && \ + mkdir perf_analyzer/build && \ + cmake -B perf_analyzer/build -S perf_analyzer && \ + cmake --build perf_analyzer/build -- -j8 + +ENV PATH=/sgl-workspace/perf_analyzer/build/perf_analyzer/src/perf-analyzer-build:$PATH + +RUN pip install --break-system-packages genai-perf + +COPY 
examples/sglang/configs/deepseek_r1/wideep/* /sgl-workspace/dynamo/examples/sglang/configs/ +COPY examples/sglang/utils/benchmarking/* /sgl-workspace/dynamo/examples/sglang/utils/ + +WORKDIR /sgl-workspace/dynamo/examples/sglang \ No newline at end of file From f275433f6e1fa5a124e3085288b19bc0eb53ec5c Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 17:45:27 +0000 Subject: [PATCH 39/65] bump --- container/Dockerfile.sglang-gb200 | 7 ++-- .../wideep/install_mooncake_from_src.sh | 20 +++++++++ examples/sglang/docs/dsr1-wideep-gb200.md | 42 +++++++++++++++++++ 3 files changed, 65 insertions(+), 4 deletions(-) create mode 100644 examples/sglang/configs/deepseek_r1/wideep/install_mooncake_from_src.sh create mode 100644 examples/sglang/docs/dsr1-wideep-gb200.md diff --git a/container/Dockerfile.sglang-gb200 b/container/Dockerfile.sglang-gb200 index 84c413a1ef..f1ba0ab348 100644 --- a/container/Dockerfile.sglang-gb200 +++ b/container/Dockerfile.sglang-gb200 @@ -1,13 +1,12 @@ -FROM sglarm:latest +FROM sgl-blackwell-wideep:latest # Define architecture variables for ARM64 ARG ARCH=arm64 ARG ARCH_ALT=aarch64 WORKDIR /sgl-workspace -# include flush cache endpoint and server support -# https://github.com/ai-dynamo/dynamo/pull/1769 -ARG DYNAMO_COMMIT="bd91dca6141e05bcfbe9bd4dea54cc58b9e37d75" +# https://github.com/ai-dynamo/dynamo/pull/1938 +ARG DYNAMO_COMMIT="3c6fc6fdaf61397813cc58f4c1de7ece4c0203f0" RUN git clone https://github.com/ai-dynamo/dynamo.git && cd dynamo && git checkout ${DYNAMO_COMMIT} # install dynamo in editable mode diff --git a/examples/sglang/configs/deepseek_r1/wideep/install_mooncake_from_src.sh b/examples/sglang/configs/deepseek_r1/wideep/install_mooncake_from_src.sh new file mode 100644 index 0000000000..2f4729bdb2 --- /dev/null +++ b/examples/sglang/configs/deepseek_r1/wideep/install_mooncake_from_src.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# We've been having some trouble with the mooncake installation when we build +# the container. 
This script is ran before SGL starts up and allows us to use +# the mnnvl capabilites from mooncake main + +set -ex + +cd /sgl-workspace + +pip uninstall mooncake-transfer-engine + +git clone https://github.com/kvcache-ai/Mooncake.git +cd Mooncake +bash dependencies.sh +mkdir build +cd build +cmake .. -DUSE_MNNVL=ON +make -j +sudo make install \ No newline at end of file diff --git a/examples/sglang/docs/dsr1-wideep-gb200.md b/examples/sglang/docs/dsr1-wideep-gb200.md new file mode 100644 index 0000000000..3f4dd03477 --- /dev/null +++ b/examples/sglang/docs/dsr1-wideep-gb200.md @@ -0,0 +1,42 @@ + + +# Running DeepSeek-R1 Disaggregated with WideEP on GB200s + +Dynamo supports SGLang's GB200 implementation of wide expert parallelism and large scale P/D for DeepSeek-R1! You can read their blog post [here](https://lmsys.org/blog/2025-06-16-gb200-part-1/) for more details. Full end to end optimization is still a work in progress but you can get this up and running with the following steps. + +## Instructions + +1. Build the SGLang DeepEP container on an ARM64 machine. + +```bash +git clone https://github.com/kyleliang-nv/sglang.git +git checkout sglang_gb200_wideep_docker +cd sglang/docker +docker build -f docker/Dockerfile -t sgl-blackwell-wideep --build-arg BUILD_TYPE=blackwell --build-arg CUDA_VERSION=12.8.1 . +``` + +2. Build the Dynamo container + +```bash +cd $DYNAMO_ROOT +docker build -f container/Dockerfile.gb200 . 
-t dynamo-wideep-gb200 --no-cache +``` + + + + From 1b272af6ab6db1f17f4048e810cd7ecb7ab20fd2 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 17:58:29 +0000 Subject: [PATCH 40/65] update gb200 deployment instructions --- examples/sglang/slurm_jobs/scripts/gb200.sh | 46 ++++++++++++++------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 9879f0c567..a311d23683 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -69,23 +69,32 @@ fi # Construct command based on mode and cmd if [ "$mode" = "prefill" ]; then + # We need to install Mooncake from source inside of the container for now + bash /configs/install_mooncake_from_src.sh if [ "$cmd" = "dynamo" ]; then echo "Error: dynamo command not implemented for GB200" exit 1 elif [ "$cmd" = "sglang" ]; then # GB200 sglang prefill command + # We are not using a init-expert-location file for e2e benchmarking + # We also don't currently have a --deepep-config file for GB200 + # Need to increase --context-length to 10k for 8k1k benchmarking SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \ - SGLANG_MOONCAKE_ALLOCATOR_SO_PATH=/configs/hook.so \ + MC_TE_METRIC=true \ + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \ + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \ + SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \ SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ - NIXL_LOG_LEVEL=TRACE \ - UCX_LOG_LEVEL=debug \ - MC_FORCE_MNNVL=1 \ + NCCL_MNNVL_ENABLE=1 \ + NCCL_CUMEM_ENABLE=1 \ + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ + SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ + PYTHONUNBUFFERED=1 \ python3 -m sglang.launch_server \ --served-model-name deepseek-ai/DeepSeek-R1 \ --model-path /model/ \ --trust-remote-code \ --disaggregation-mode prefill \ - --disaggregation-transfer-backend nixl \ --dist-init-addr "$HOST_IP:$PORT" \ --nnodes "$TOTAL_NODES" \ 
--node-rank "$RANK" \ @@ -95,7 +104,7 @@ if [ "$mode" = "prefill" ]; then --host 0.0.0.0 \ --decode-log-interval 1 \ --max-running-requests 6144 \ - --context-length 10000 \ + --context-length 2716 \ --disable-radix-cache \ --enable-deepep-moe \ --deepep-mode low_latency \ @@ -107,29 +116,38 @@ if [ "$mode" = "prefill" ]; then --eplb-algorithm deepseek \ --attention-backend cutlass_mla \ --watchdog-timeout 1000000 \ + --init-export-location --disable-cuda-graph \ --chunked-prefill-size 16384 \ --max-total-tokens 32768 \ - --mem-fraction-static 0.9 \ + --mem-fraction-static 0.8 \ --log-level debug fi elif [ "$mode" = "decode" ]; then + # We need to install Mooncake from source inside of the container for now + bash /configs/install_mooncake_from_src.sh if [ "$cmd" = "dynamo" ]; then echo "Error: dynamo command not implemented for GB200" exit 1 elif [ "$cmd" = "sglang" ]; then # GB200 sglang decode command + # Need to increase --context-length to 10k for 8k1k benchmarking + # We are not using a init-expert-location file for e2e benchmarking SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=768 \ - SGLANG_NUM_RESERVED_DECODE_TOKENS=176 \ - SGLANG_MOONCAKE_ALLOCATOR_SO_PATH=/configs/hook.so \ + MC_TE_METRIC=true \ + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \ + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \ + SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \ + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \ SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ - NIXL_LOG_LEVEL=TRACE \ - UCX_LOG_LEVEL=debug \ - MC_FORCE_MNNVL=1 \ + NCCL_MNNVL_ENABLE=1 \ + NCCL_CUMEM_ENABLE=1 \ + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ + SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ + PYTHONUNBUFFERED=1 \ python3 -m sglang.launch_server \ --model-path /model/ \ --trust-remote-code \ - --disaggregation-transfer-backend mooncake \ --disaggregation-mode decode \ --dist-init-addr "$HOST_IP:$PORT" \ --nnodes "$TOTAL_NODES" \ @@ -140,7 +158,7 @@ elif [ "$mode" = "decode" ]; then --host 0.0.0.0 \ --decode-log-interval 1 
\ --max-running-requests 36864 \ - --context-length 10000 \ + --context-length 2716 \ --disable-radix-cache \ --enable-deepep-moe \ --deepep-mode low_latency \ From de7bc220224c7a5a42aed3292ec7b420d621625e Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 18:14:45 +0000 Subject: [PATCH 41/65] untested gb200 + dynamo command --- examples/sglang/slurm_jobs/scripts/gb200.sh | 99 +++++++++++++++++-- .../install_mooncake_from_src.sh | 0 2 files changed, 93 insertions(+), 6 deletions(-) rename examples/sglang/{configs/deepseek_r1/wideep => utils}/install_mooncake_from_src.sh (100%) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index a311d23683..b9cbde58e6 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -70,10 +70,55 @@ fi # Construct command based on mode and cmd if [ "$mode" = "prefill" ]; then # We need to install Mooncake from source inside of the container for now - bash /configs/install_mooncake_from_src.sh + bash /sgl-workspace/dynamo/examples/sglang/utils/install_mooncake_from_src.sh if [ "$cmd" = "dynamo" ]; then - echo "Error: dynamo command not implemented for GB200" - exit 1 + # We are not using a init-expert-location file for e2e benchmarking + # We also don't currently have a --deepep-config file for GB200 + # Need to increase --context-length to 10k for 8k1k benchmarking + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \ + MC_TE_METRIC=true \ + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \ + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \ + SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \ + SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ + NCCL_MNNVL_ENABLE=1 \ + NCCL_CUMEM_ENABLE=1 \ + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ + SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ + PYTHONUNBUFFERED=1 \ + python3 components/worker.py \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --model-path /model/ \ + --trust-remote-code \ + 
--disaggregation-mode prefill \ + --dist-init-addr "$HOST_IP:$PORT" \ + --nnodes "$TOTAL_NODES" \ + --node-rank "$RANK" \ + --tp-size "$TOTAL_GPUS" \ + --dp-size "$TOTAL_GPUS" \ + --enable-dp-attention \ + --host 0.0.0.0 \ + --decode-log-interval 1 \ + --max-running-requests 6144 \ + --context-length 2716 \ + --disable-radix-cache \ + --enable-deepep-moe \ + --deepep-mode low_latency \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-shared-experts-fusion \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm static \ + --eplb-algorithm deepseek \ + --attention-backend cutlass_mla \ + --watchdog-timeout 1000000 \ + --init-export-location + --disable-cuda-graph \ + --chunked-prefill-size 16384 \ + --max-total-tokens 32768 \ + --mem-fraction-static 0.8 \ + --log-level debug + elif [ "$cmd" = "sglang" ]; then # GB200 sglang prefill command # We are not using a init-expert-location file for e2e benchmarking @@ -125,10 +170,52 @@ if [ "$mode" = "prefill" ]; then fi elif [ "$mode" = "decode" ]; then # We need to install Mooncake from source inside of the container for now - bash /configs/install_mooncake_from_src.sh + bash /sgl-workspace/dynamo/examples/sglang/utils/install_mooncake_from_src.sh if [ "$cmd" = "dynamo" ]; then - echo "Error: dynamo command not implemented for GB200" - exit 1 + # Need to increase --context-length to 10k for 8k1k benchmarking + # We are not using a init-expert-location file for e2e benchmarking + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=768 \ + MC_TE_METRIC=true \ + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \ + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \ + SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \ + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \ + SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ + NCCL_MNNVL_ENABLE=1 \ + NCCL_CUMEM_ENABLE=1 \ + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ + SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ + PYTHONUNBUFFERED=1 \ + python3 components/worker.py \ + --model-path /model/ \ + 
--trust-remote-code \ + --disaggregation-mode decode \ + --dist-init-addr "$HOST_IP:$PORT" \ + --nnodes "$TOTAL_NODES" \ + --node-rank "$RANK" \ + --tp-size "$TOTAL_GPUS" \ + --dp-size "$TOTAL_GPUS" \ + --enable-dp-attention \ + --host 0.0.0.0 \ + --decode-log-interval 1 \ + --max-running-requests 36864 \ + --context-length 2716 \ + --disable-radix-cache \ + --enable-deepep-moe \ + --deepep-mode low_latency \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --cuda-graph-bs 768 \ + --disable-shared-experts-fusion \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm static \ + --eplb-algorithm deepseek \ + --attention-backend cutlass_mla \ + --watchdog-timeout 1000000 \ + --chunked-prefill-size 36864 \ + --mem-fraction-static 0.82 \ + --log-level debug + elif [ "$cmd" = "sglang" ]; then # GB200 sglang decode command # Need to increase --context-length to 10k for 8k1k benchmarking diff --git a/examples/sglang/configs/deepseek_r1/wideep/install_mooncake_from_src.sh b/examples/sglang/utils/install_mooncake_from_src.sh similarity index 100% rename from examples/sglang/configs/deepseek_r1/wideep/install_mooncake_from_src.sh rename to examples/sglang/utils/install_mooncake_from_src.sh From 2d9e621454e9fd1aa07c1b97b82f4476c455eb8e Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 19:16:24 +0000 Subject: [PATCH 42/65] bump --- container/Dockerfile.sglang-gb200 | 2 -- examples/sglang/docs/dsr1-wideep-gb200.md | 9 +++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/container/Dockerfile.sglang-gb200 b/container/Dockerfile.sglang-gb200 index f1ba0ab348..542b55af7e 100644 --- a/container/Dockerfile.sglang-gb200 +++ b/container/Dockerfile.sglang-gb200 @@ -47,8 +47,6 @@ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} RUN cargo build --release RUN mkdir -p deploy/sdk/src/dynamo/sdk/cli/bin -RUN cp target/release/http deploy/sdk/src/dynamo/sdk/cli/bin -RUN cp target/release/llmctl deploy/sdk/src/dynamo/sdk/cli/bin RUN cp 
target/release/dynamo-run deploy/sdk/src/dynamo/sdk/cli/bin RUN cd lib/bindings/python && pip install --break-system-packages -e . && cd ../../.. diff --git a/examples/sglang/docs/dsr1-wideep-gb200.md b/examples/sglang/docs/dsr1-wideep-gb200.md index 3f4dd03477..d6423cc945 100644 --- a/examples/sglang/docs/dsr1-wideep-gb200.md +++ b/examples/sglang/docs/dsr1-wideep-gb200.md @@ -24,9 +24,9 @@ Dynamo supports SGLang's GB200 implementation of wide expert parallelism and lar 1. Build the SGLang DeepEP container on an ARM64 machine. ```bash -git clone https://github.com/kyleliang-nv/sglang.git -git checkout sglang_gb200_wideep_docker -cd sglang/docker +git clone https://github.com/kyleliang-nv/sglang.git # temporary +cd sglang +git checkout sglang_gb200_wideep_docker # temporary docker build -f docker/Dockerfile -t sgl-blackwell-wideep --build-arg BUILD_TYPE=blackwell --build-arg CUDA_VERSION=12.8.1 . ``` @@ -34,7 +34,8 @@ docker build -f docker/Dockerfile -t sgl-blackwell-wideep --build-arg BUILD_TYPE ```bash cd $DYNAMO_ROOT -docker build -f container/Dockerfile.gb200 . -t dynamo-wideep-gb200 --no-cache +git checkout ishan/more-slurm-targets # temporary +docker build -f container/Dockerfile.sglang-gb200 . 
-t dynamo-wideep-gb200 --no-cache ``` From 74f6ffaf4b317d20b1aff3281af4d7d75d085b3f Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 19:58:38 +0000 Subject: [PATCH 43/65] path swap --- examples/sglang/slurm_jobs/scripts/gb200.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index b9cbde58e6..9539df573d 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -70,7 +70,7 @@ fi # Construct command based on mode and cmd if [ "$mode" = "prefill" ]; then # We need to install Mooncake from source inside of the container for now - bash /sgl-workspace/dynamo/examples/sglang/utils/install_mooncake_from_src.sh + bash /configs/install_mooncake_from_src.sh if [ "$cmd" = "dynamo" ]; then # We are not using a init-expert-location file for e2e benchmarking # We also don't currently have a --deepep-config file for GB200 @@ -170,7 +170,7 @@ if [ "$mode" = "prefill" ]; then fi elif [ "$mode" = "decode" ]; then # We need to install Mooncake from source inside of the container for now - bash /sgl-workspace/dynamo/examples/sglang/utils/install_mooncake_from_src.sh + bash /configs/install_mooncake_from_src.sh if [ "$cmd" = "dynamo" ]; then # Need to increase --context-length to 10k for 8k1k benchmarking # We are not using a init-expert-location file for e2e benchmarking From 22a227f3590febbbdaa97936faac8b604dff27d5 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 20:29:04 +0000 Subject: [PATCH 44/65] ok --- examples/sglang/slurm_jobs/scripts/gb200.sh | 2 -- .../sglang/utils/install_mooncake_from_src.sh | 36 ++++++++++++++++--- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 9539df573d..ab974aaf13 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ 
b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -112,7 +112,6 @@ if [ "$mode" = "prefill" ]; then --eplb-algorithm deepseek \ --attention-backend cutlass_mla \ --watchdog-timeout 1000000 \ - --init-export-location --disable-cuda-graph \ --chunked-prefill-size 16384 \ --max-total-tokens 32768 \ @@ -161,7 +160,6 @@ if [ "$mode" = "prefill" ]; then --eplb-algorithm deepseek \ --attention-backend cutlass_mla \ --watchdog-timeout 1000000 \ - --init-export-location --disable-cuda-graph \ --chunked-prefill-size 16384 \ --max-total-tokens 32768 \ diff --git a/examples/sglang/utils/install_mooncake_from_src.sh b/examples/sglang/utils/install_mooncake_from_src.sh index 2f4729bdb2..15f757c3f4 100644 --- a/examples/sglang/utils/install_mooncake_from_src.sh +++ b/examples/sglang/utils/install_mooncake_from_src.sh @@ -3,18 +3,46 @@ # We've been having some trouble with the mooncake installation when we build # the container. This script is ran before SGL starts up and allows us to use # the mnnvl capabilites from mooncake main +# +# Usage: ./install_mooncake.sh +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi -set -ex +MODE="$1" +case "$MODE" in + dynamo) + SUDO="" + ;; + sglang) + SUDO="sudo" + ;; + *) + echo "Error: invalid mode '$MODE'. Use 'dynamo' or 'sglang'." + exit 1 + ;; +esac cd /sgl-workspace -pip uninstall mooncake-transfer-engine +# Clean up previous build +$SUDO rm -rf Mooncake/ +# Uninstall any existing package +pip uninstall -y mooncake-transfer-engine + +# Clone & build git clone https://github.com/kvcache-ai/Mooncake.git cd Mooncake bash dependencies.sh -mkdir build + +mkdir -p build cd build cmake .. -DUSE_MNNVL=ON make -j -sudo make install \ No newline at end of file + +# Install (with sudo if in sglang mode) +$SUDO make install + +echo "Mooncake built and installed in '$MODE' mode." 
\ No newline at end of file From f32a4228d3f44cecad5357afc38fb6290fba9d91 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 20:47:36 +0000 Subject: [PATCH 45/65] cmd --- examples/sglang/slurm_jobs/scripts/gb200.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index ab974aaf13..73ac48c612 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -70,7 +70,7 @@ fi # Construct command based on mode and cmd if [ "$mode" = "prefill" ]; then # We need to install Mooncake from source inside of the container for now - bash /configs/install_mooncake_from_src.sh + bash /configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # We are not using a init-expert-location file for e2e benchmarking # We also don't currently have a --deepep-config file for GB200 @@ -168,7 +168,7 @@ if [ "$mode" = "prefill" ]; then fi elif [ "$mode" = "decode" ]; then # We need to install Mooncake from source inside of the container for now - bash /configs/install_mooncake_from_src.sh + bash /configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # Need to increase --context-length to 10k for 8k1k benchmarking # We are not using a init-expert-location file for e2e benchmarking From 0a02c8aae5d6bf9c906dd4aa9da93161c1831731 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 20:57:48 +0000 Subject: [PATCH 46/65] try something else --- examples/sglang/slurm_jobs/scripts/gb200.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 73ac48c612..79f439fdfb 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -70,7 +70,7 @@ fi # Construct command based on mode and cmd if [ "$mode" = "prefill" ]; then # We need 
to install Mooncake from source inside of the container for now - bash /configs/install_mooncake_from_src.sh $cmd + ./configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # We are not using a init-expert-location file for e2e benchmarking # We also don't currently have a --deepep-config file for GB200 @@ -168,7 +168,7 @@ if [ "$mode" = "prefill" ]; then fi elif [ "$mode" = "decode" ]; then # We need to install Mooncake from source inside of the container for now - bash /configs/install_mooncake_from_src.sh $cmd + ./configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # Need to increase --context-length to 10k for 8k1k benchmarking # We are not using a init-expert-location file for e2e benchmarking From a24763da54e8f751a09045046e0f8e2b8fdfbd30 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 21:08:24 +0000 Subject: [PATCH 47/65] keep us as root to install mooncake deps as it needs sudo --- examples/sglang/slurm_jobs/job_script_template.j2 | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index 2e873c42fa..a9d7388ea2 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -57,7 +57,6 @@ ENROOT_ARGS="\ --container-image=${CONTAINER_IMAGE} \ --no-container-entrypoint \ --no-container-mount-home \ - --no-container-remap-root \ --container-mounts=${MODEL_DIR}:/model/,${CONFIG_DIR}:/configs/,${SCRIPT_DIR}:/scripts/,${OUTPUT_DIR}:/outputs/,${LOG_DIR}:/logs/ \ " From d6a6a3ebdd8a2f7063979083c0ac44ffc1d8d9e0 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 21:09:07 +0000 Subject: [PATCH 48/65] try --- examples/sglang/slurm_jobs/scripts/gb200.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 79f439fdfb..73ac48c612 100755 
--- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -70,7 +70,7 @@ fi # Construct command based on mode and cmd if [ "$mode" = "prefill" ]; then # We need to install Mooncake from source inside of the container for now - ./configs/install_mooncake_from_src.sh $cmd + bash /configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # We are not using a init-expert-location file for e2e benchmarking # We also don't currently have a --deepep-config file for GB200 @@ -168,7 +168,7 @@ if [ "$mode" = "prefill" ]; then fi elif [ "$mode" = "decode" ]; then # We need to install Mooncake from source inside of the container for now - ./configs/install_mooncake_from_src.sh $cmd + bash /configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # Need to increase --context-length to 10k for 8k1k benchmarking # We are not using a init-expert-location file for e2e benchmarking From 87578320b1ea917de0a56261ab0441bec9029f0c Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 16 Jul 2025 00:56:55 +0000 Subject: [PATCH 49/65] revert to sgl balancer and fix scripts and add MC MNNVL flag --- examples/sglang/slurm_jobs/scripts/gb200.sh | 4 ++ .../sglang/slurm_jobs/scripts/worker_setup.py | 12 +++--- .../sglang/utils/install_mooncake_from_src.sh | 41 ++++++------------- 3 files changed, 21 insertions(+), 36 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 73ac48c612..0a7ae5d25b 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -81,6 +81,7 @@ if [ "$mode" = "prefill" ]; then SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \ SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \ SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ + MC_FORCE_MNNVL=1 \ NCCL_MNNVL_ENABLE=1 \ NCCL_CUMEM_ENABLE=1 \ SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ @@ -130,6 +131,7 @@ if [ "$mode" = "prefill" ]; then 
SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \ SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ NCCL_MNNVL_ENABLE=1 \ + MC_FORCE_MNNVL=1 \ NCCL_CUMEM_ENABLE=1 \ SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ @@ -180,6 +182,7 @@ elif [ "$mode" = "decode" ]; then SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \ SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ NCCL_MNNVL_ENABLE=1 \ + MC_FORCE_MNNVL=1 \ NCCL_CUMEM_ENABLE=1 \ SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ @@ -226,6 +229,7 @@ elif [ "$mode" = "decode" ]; then SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \ SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ NCCL_MNNVL_ENABLE=1 \ + MC_FORCE_MNNVL=1 \ NCCL_CUMEM_ENABLE=1 \ SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index 5e6aa9309a..5071c6f25b 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -208,14 +208,12 @@ def _validate_args(args: argparse.Namespace) -> None: def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> str: cmd = ( - "pip install sglang-router && " - f"python3 -m sglang_router.launch_router " - f"--policy random " - f"--pd-disaggregation " - f"--prefill http://{prefill_host_ip}:30000 30001 " + f"python3 -m sglang.srt.disaggregation.launch_lb " + f"--prefill http://{prefill_host_ip}:30000 " f"--decode http://{decode_host_ip}:30000 " - f"--host 0.0.0.0 " - f"--port 8000 " + "--host 0.0.0.0 " + "--port 8000 " + "--timeout 3600" ) return cmd diff --git a/examples/sglang/utils/install_mooncake_from_src.sh b/examples/sglang/utils/install_mooncake_from_src.sh index 15f757c3f4..c3284d09b0 100644 --- a/examples/sglang/utils/install_mooncake_from_src.sh +++ b/examples/sglang/utils/install_mooncake_from_src.sh @@ -3,46 +3,29 @@ # We've been having some trouble with the mooncake 
installation when we build # the container. This script is ran before SGL starts up and allows us to use # the mnnvl capabilites from mooncake main -# -# Usage: ./install_mooncake.sh -if [ "$#" -ne 1 ]; then - echo "Usage: $0 " - exit 1 -fi - -MODE="$1" -case "$MODE" in - dynamo) - SUDO="" - ;; - sglang) - SUDO="sudo" - ;; - *) - echo "Error: invalid mode '$MODE'. Use 'dynamo' or 'sglang'." - exit 1 - ;; -esac cd /sgl-workspace -# Clean up previous build -$SUDO rm -rf Mooncake/ +# Try to set this +export TORCH_CUDA_ARCH_LIST=10.0 + +echo $LD_LIBRARY_PATH # Uninstall any existing package -pip uninstall -y mooncake-transfer-engine +#pip install --break-system-packages mooncake-transfer-engine # Clone & build -git clone https://github.com/kvcache-ai/Mooncake.git +git clone https://github.com/ishandhanani/Mooncake.git cd Mooncake -bash dependencies.sh - +git checkout ishan/manual-nvl-installation +bash dependencies.sh -y mkdir -p build cd build cmake .. -DUSE_MNNVL=ON make -j -# Install (with sudo if in sglang mode) -$SUDO make install +make install + +chmod +x /usr/local/lib/python3.10/dist-packages/mooncake/nvlink_allocator.so -echo "Mooncake built and installed in '$MODE' mode." 
\ No newline at end of file +echo "Mooncake built and installed" From 462c4a8dea1cba0f0e7ad75b1e0a288ad4eab9ae Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 16 Jul 2025 02:39:04 +0000 Subject: [PATCH 50/65] bump --- examples/sglang/slurm_jobs/job_script_template.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index a9d7388ea2..a0959fbb91 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -7,6 +7,7 @@ #SBATCH --time={{ time_limit }} #SBATCH --output=logs/%j/log.out #SBATCH --error=logs/%j/log.err +#SBATCH --partition=36x2-a01r # Constants PREFILL_NODES={{ prefill_nodes }} From 3f5361ef3e5566c40643572c983dc432871ca84e Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 16 Jul 2025 18:59:05 +0000 Subject: [PATCH 51/65] init instructions for others --- container/Dockerfile.sglang-gb200 | 17 +-------- examples/sglang/docs/dsr1-wideep-gb200.md | 38 ++++++++++++++++++- examples/sglang/slurm_jobs/scripts/gb200.sh | 1 + .../sglang/utils/install_mooncake_from_src.sh | 6 ++- 4 files changed, 43 insertions(+), 19 deletions(-) diff --git a/container/Dockerfile.sglang-gb200 b/container/Dockerfile.sglang-gb200 index 542b55af7e..9bc3fef5da 100644 --- a/container/Dockerfile.sglang-gb200 +++ b/container/Dockerfile.sglang-gb200 @@ -6,7 +6,7 @@ ARG ARCH_ALT=aarch64 WORKDIR /sgl-workspace # https://github.com/ai-dynamo/dynamo/pull/1938 -ARG DYNAMO_COMMIT="3c6fc6fdaf61397813cc58f4c1de7ece4c0203f0" +ARG DYNAMO_COMMIT="aba60996f225038b691d9f255da515b27695b179" RUN git clone https://github.com/ai-dynamo/dynamo.git && cd dynamo && git checkout ${DYNAMO_COMMIT} # install dynamo in editable mode @@ -64,21 +64,6 @@ RUN wget --tries=3 --waitretry=5 https://github.com/etcd-io/etcd/releases/downlo rm /tmp/etcd.tar.gz ENV PATH=/usr/local/bin/etcd/:$PATH -# Install perf_analyzer and genai-perf -RUN apt-get 
update -y && \ - apt-get install -y --no-install-recommends \ - rapidjson-dev \ - zlib1g-dev - -RUN git clone --depth=1 https://github.com/triton-inference-server/perf_analyzer.git && \ - mkdir perf_analyzer/build && \ - cmake -B perf_analyzer/build -S perf_analyzer && \ - cmake --build perf_analyzer/build -- -j8 - -ENV PATH=/sgl-workspace/perf_analyzer/build/perf_analyzer/src/perf-analyzer-build:$PATH - -RUN pip install --break-system-packages genai-perf - COPY examples/sglang/configs/deepseek_r1/wideep/* /sgl-workspace/dynamo/examples/sglang/configs/ COPY examples/sglang/utils/benchmarking/* /sgl-workspace/dynamo/examples/sglang/utils/ diff --git a/examples/sglang/docs/dsr1-wideep-gb200.md b/examples/sglang/docs/dsr1-wideep-gb200.md index d6423cc945..8c712cb7dd 100644 --- a/examples/sglang/docs/dsr1-wideep-gb200.md +++ b/examples/sglang/docs/dsr1-wideep-gb200.md @@ -23,21 +23,55 @@ Dynamo supports SGLang's GB200 implementation of wide expert parallelism and lar 1. Build the SGLang DeepEP container on an ARM64 machine. +> [!NOTE] +> This sglang side branch is based on an open [PR](https://github.com/sgl-project/sglang/pull/7721/files) to SGLang that allows their main dockerfile to be built for aarch64. Once that PR is merged in, we can add the gb200 dockerfile to the main sglang repo. + ```bash -git clone https://github.com/kyleliang-nv/sglang.git # temporary +git clone https://github.com/kyleliang-nv/sglang.git cd sglang -git checkout sglang_gb200_wideep_docker # temporary +git checkout sglang_gb200_wideep_docker docker build -f docker/Dockerfile -t sgl-blackwell-wideep --build-arg BUILD_TYPE=blackwell --build-arg CUDA_VERSION=12.8.1 . ``` 2. Build the Dynamo container +> [!NOTE] +> This is a side branch that contains all of the scripts to run on GB200s. Once the PR is merged in, we can switch to the main branch. + ```bash cd $DYNAMO_ROOT git checkout ishan/more-slurm-targets # temporary docker build -f container/Dockerfile.sglang-gb200 . 
-t dynamo-wideep-gb200 --no-cache ``` +3. In your SLURM cluster, clone dynamo and switch to this side branch. + +```bash +git clone https://github.com/ai-dynamo/dynamo.git +git checkout ishan/more-slurm-targets +cd examples/sglang/slurm_jobs +``` +4. Ensure you have the proper paths that you can use to mount things to the container +- The path to the DSR1 model which should be mounted to the `--model-dir` flag +- The path to the `install_mooncake_from_src.sh` which will be mounted to the `--config-dir` flag + +5. Run the following command to submit the job + +```bash +python3 submit_job_script.py \ + --template job_script_template.j2 \ + --model-dir \ + --container-image \ + --account \ + --gpus-per-node 4 \ + --config-dir \ + --network-interface enp138s0f0np0 \ + --gpu-type gb200 \ + --use-sglang-commands \ + --prefill-nodes 2 \ + --decode-nodes 12 +``` +6. This will create a logs directory in the `examples/sglang/slurm_jobs` directory. You can `cd` into the directory, cd into your job id, and then run `tail -f *_prefill.err *_decode.err` or `tail -f *_prefill.out *_decode.out` to see the logs. 
\ No newline at end of file diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 0a7ae5d25b..1030424e57 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -66,6 +66,7 @@ if [ -z "$TOTAL_NODES" ]; then exit 1 fi +# TODO: since the args for sglang and dynamo are the same, we can be a bit cleaner here # Construct command based on mode and cmd if [ "$mode" = "prefill" ]; then diff --git a/examples/sglang/utils/install_mooncake_from_src.sh b/examples/sglang/utils/install_mooncake_from_src.sh index c3284d09b0..9dc918f756 100644 --- a/examples/sglang/utils/install_mooncake_from_src.sh +++ b/examples/sglang/utils/install_mooncake_from_src.sh @@ -15,9 +15,13 @@ echo $LD_LIBRARY_PATH #pip install --break-system-packages mooncake-transfer-engine # Clone & build +# Once Mooncake main branch has fixed +# 1. proper g++ compilation +# 2. solved std::function call issue - we can swap back to ToT +# As of 7/16 10:20AM PST - I've been told its was solved but I have not been able to test it E2E git clone https://github.com/ishandhanani/Mooncake.git cd Mooncake -git checkout ishan/manual-nvl-installation +git checkout ishan/pr-571-diff-build bash dependencies.sh -y mkdir -p build cd build From b258074cafbda678064a045822907b9aafb00777 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 16 Jul 2025 19:02:24 +0000 Subject: [PATCH 52/65] atempt --- examples/sglang/docs/dsr1-wideep-gb200.md | 2 ++ examples/sglang/utils/install_mooncake_from_src.sh | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/sglang/docs/dsr1-wideep-gb200.md b/examples/sglang/docs/dsr1-wideep-gb200.md index 8c712cb7dd..d2e33a10b2 100644 --- a/examples/sglang/docs/dsr1-wideep-gb200.md +++ b/examples/sglang/docs/dsr1-wideep-gb200.md @@ -74,4 +74,6 @@ python3 submit_job_script.py \ --decode-nodes 12 ``` +**UNTESTED**: if you want to spin up dynamo, you can remove the 
`--use-sglang-commands` flag. + 6. This will create a logs directory in the `examples/sglang/slurm_jobs` directory. You can `cd` into the directory, cd into your job id, and then run `tail -f *_prefill.err *_decode.err` or `tail -f *_prefill.out *_decode.out` to see the logs. \ No newline at end of file diff --git a/examples/sglang/utils/install_mooncake_from_src.sh b/examples/sglang/utils/install_mooncake_from_src.sh index 9dc918f756..85e2c48379 100644 --- a/examples/sglang/utils/install_mooncake_from_src.sh +++ b/examples/sglang/utils/install_mooncake_from_src.sh @@ -11,14 +11,15 @@ export TORCH_CUDA_ARCH_LIST=10.0 echo $LD_LIBRARY_PATH -# Uninstall any existing package -#pip install --break-system-packages mooncake-transfer-engine +# Uninstall any existing mooncake package +pip install --break-system-packages mooncake-transfer-engine # Clone & build # Once Mooncake main branch has fixed # 1. proper g++ compilation # 2. solved std::function call issue - we can swap back to ToT # As of 7/16 10:20AM PST - I've been told its was solved but I have not been able to test it E2E +# So for now we will stay on my side branch git clone https://github.com/ishandhanani/Mooncake.git cd Mooncake git checkout ishan/pr-571-diff-build From b195bbfcaa757b288b53edd3cdec5cf54274aa40 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 16 Jul 2025 19:35:30 +0000 Subject: [PATCH 53/65] bump --- examples/sglang/docs/dsr1-wideep-gb200.md | 4 ++-- examples/sglang/slurm_jobs/scripts/gb200.sh | 8 ++++---- examples/sglang/utils/install_mooncake_from_src.sh | 9 ++++----- 3 files changed, 10 insertions(+), 11 deletions(-) mode change 100644 => 100755 examples/sglang/utils/install_mooncake_from_src.sh diff --git a/examples/sglang/docs/dsr1-wideep-gb200.md b/examples/sglang/docs/dsr1-wideep-gb200.md index d2e33a10b2..c2ff937789 100644 --- a/examples/sglang/docs/dsr1-wideep-gb200.md +++ b/examples/sglang/docs/dsr1-wideep-gb200.md @@ -23,7 +23,7 @@ Dynamo supports SGLang's GB200 implementation 
of wide expert parallelism and lar 1. Build the SGLang DeepEP container on an ARM64 machine. -> [!NOTE] +> [!NOTE] > This sglang side branch is based on an open [PR](https://github.com/sgl-project/sglang/pull/7721/files) to SGLang that allows their main dockerfile to be built for aarch64. Once that PR is merged in, we can add the gb200 dockerfile to the main sglang repo. ```bash @@ -35,7 +35,7 @@ docker build -f docker/Dockerfile -t sgl-blackwell-wideep --build-arg BUILD_TYPE 2. Build the Dynamo container -> [!NOTE] +> [!NOTE] > This is a side branch that contains all of the scripts to run on GB200s. Once the PR is merged in, we can switch to the main branch. ```bash diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 1030424e57..e454e7c55a 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -70,7 +70,7 @@ fi # Construct command based on mode and cmd if [ "$mode" = "prefill" ]; then - # We need to install Mooncake from source inside of the container for now + # We need to install Mooncake from source inside of the container for now bash /configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # We are not using a init-expert-location file for e2e benchmarking @@ -119,7 +119,7 @@ if [ "$mode" = "prefill" ]; then --max-total-tokens 32768 \ --mem-fraction-static 0.8 \ --log-level debug - + elif [ "$cmd" = "sglang" ]; then # GB200 sglang prefill command # We are not using a init-expert-location file for e2e benchmarking @@ -170,7 +170,7 @@ if [ "$mode" = "prefill" ]; then --log-level debug fi elif [ "$mode" = "decode" ]; then - # We need to install Mooncake from source inside of the container for now + # We need to install Mooncake from source inside of the container for now bash /configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # Need to increase --context-length to 10k for 8k1k benchmarking @@ -217,7 +217,7 @@ 
elif [ "$mode" = "decode" ]; then --chunked-prefill-size 36864 \ --mem-fraction-static 0.82 \ --log-level debug - + elif [ "$cmd" = "sglang" ]; then # GB200 sglang decode command # Need to increase --context-length to 10k for 8k1k benchmarking diff --git a/examples/sglang/utils/install_mooncake_from_src.sh b/examples/sglang/utils/install_mooncake_from_src.sh old mode 100644 new mode 100755 index 85e2c48379..9281a599f4 --- a/examples/sglang/utils/install_mooncake_from_src.sh +++ b/examples/sglang/utils/install_mooncake_from_src.sh @@ -1,4 +1,6 @@ #!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # We've been having some trouble with the mooncake installation when we build # the container. This script is ran before SGL starts up and allows us to use @@ -11,15 +13,12 @@ export TORCH_CUDA_ARCH_LIST=10.0 echo $LD_LIBRARY_PATH -# Uninstall any existing mooncake package -pip install --break-system-packages mooncake-transfer-engine - # Clone & build -# Once Mooncake main branch has fixed +# Once Mooncake main branch has fixed # 1. proper g++ compilation # 2. 
solved std::function call issue - we can swap back to ToT # As of 7/16 10:20AM PST - I've been told its was solved but I have not been able to test it E2E -# So for now we will stay on my side branch +# So for now we will stay on my side branch git clone https://github.com/ishandhanani/Mooncake.git cd Mooncake git checkout ishan/pr-571-diff-build From da162bc6f7bf9fbe6b4ef366a470f618c5fc742c Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 16 Jul 2025 23:23:45 +0000 Subject: [PATCH 54/65] bump --- container/Dockerfile.sglang-gb200 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/container/Dockerfile.sglang-gb200 b/container/Dockerfile.sglang-gb200 index 9bc3fef5da..ceaa1c51c5 100644 --- a/container/Dockerfile.sglang-gb200 +++ b/container/Dockerfile.sglang-gb200 @@ -67,4 +67,6 @@ ENV PATH=/usr/local/bin/etcd/:$PATH COPY examples/sglang/configs/deepseek_r1/wideep/* /sgl-workspace/dynamo/examples/sglang/configs/ COPY examples/sglang/utils/benchmarking/* /sgl-workspace/dynamo/examples/sglang/utils/ +ENV PYTHONPATH=/workspace/dynamo/deploy/sdk/src:/workspace/dynamo/components/planner/src:/workspace/examples/sglang:$PYTHONPATH + WORKDIR /sgl-workspace/dynamo/examples/sglang \ No newline at end of file From 4266fe030e41ed79ddcfe7b3d2b10f6224d2f168 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 16 Jul 2025 23:49:42 +0000 Subject: [PATCH 55/65] lel --- container/Dockerfile.sglang-gb200 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/container/Dockerfile.sglang-gb200 b/container/Dockerfile.sglang-gb200 index ceaa1c51c5..48f7e34e62 100644 --- a/container/Dockerfile.sglang-gb200 +++ b/container/Dockerfile.sglang-gb200 @@ -67,6 +67,6 @@ ENV PATH=/usr/local/bin/etcd/:$PATH COPY examples/sglang/configs/deepseek_r1/wideep/* /sgl-workspace/dynamo/examples/sglang/configs/ COPY examples/sglang/utils/benchmarking/* /sgl-workspace/dynamo/examples/sglang/utils/ -ENV 
PYTHONPATH=/workspace/dynamo/deploy/sdk/src:/workspace/dynamo/components/planner/src:/workspace/examples/sglang:$PYTHONPATH +ENV PYTHONPATH=/sgl-workspace/dynamo/deploy/sdk/src:/workspace/dynamo/components/planner/src:/workspace/examples/sglang:$PYTHONPATH WORKDIR /sgl-workspace/dynamo/examples/sglang \ No newline at end of file From f6ab522b95c22b879c0e0f6a1bad1ef007540490 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 17 Jul 2025 00:32:16 +0000 Subject: [PATCH 56/65] bump --- examples/sglang/docs/dsr1-wideep-gb200.md | 5 ++- examples/sglang/slurm_jobs/scripts/gb200.sh | 4 --- .../sglang/utils/install_mooncake_from_src.sh | 35 ------------------- 3 files changed, 2 insertions(+), 42 deletions(-) delete mode 100755 examples/sglang/utils/install_mooncake_from_src.sh diff --git a/examples/sglang/docs/dsr1-wideep-gb200.md b/examples/sglang/docs/dsr1-wideep-gb200.md index c2ff937789..e7874f64fb 100644 --- a/examples/sglang/docs/dsr1-wideep-gb200.md +++ b/examples/sglang/docs/dsr1-wideep-gb200.md @@ -54,8 +54,7 @@ cd examples/sglang/slurm_jobs 4. Ensure you have the proper paths that you can use to mount things to the container -- The path to the DSR1 model which should be mounted to the `--model-dir` flag -- The path to the `install_mooncake_from_src.sh` which will be mounted to the `--config-dir` flag +- The path to the DSR1 model which should be mounted to the `--model-dir` flag and `--config-dir` flag 5. 
Run the following command to submit the job @@ -66,7 +65,7 @@ python3 submit_job_script.py \ --container-image \ --account \ --gpus-per-node 4 \ - --config-dir \ + --config-dir \ --network-interface enp138s0f0np0 \ --gpu-type gb200 \ --use-sglang-commands \ diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index e454e7c55a..486c2a6c7e 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -70,8 +70,6 @@ fi # Construct command based on mode and cmd if [ "$mode" = "prefill" ]; then - # We need to install Mooncake from source inside of the container for now - bash /configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # We are not using a init-expert-location file for e2e benchmarking # We also don't currently have a --deepep-config file for GB200 @@ -170,8 +168,6 @@ if [ "$mode" = "prefill" ]; then --log-level debug fi elif [ "$mode" = "decode" ]; then - # We need to install Mooncake from source inside of the container for now - bash /configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # Need to increase --context-length to 10k for 8k1k benchmarking # We are not using a init-expert-location file for e2e benchmarking diff --git a/examples/sglang/utils/install_mooncake_from_src.sh b/examples/sglang/utils/install_mooncake_from_src.sh deleted file mode 100755 index 9281a599f4..0000000000 --- a/examples/sglang/utils/install_mooncake_from_src.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -# We've been having some trouble with the mooncake installation when we build -# the container. 
This script is ran before SGL starts up and allows us to use -# the mnnvl capabilites from mooncake main - -cd /sgl-workspace - -# Try to set this -export TORCH_CUDA_ARCH_LIST=10.0 - -echo $LD_LIBRARY_PATH - -# Clone & build -# Once Mooncake main branch has fixed -# 1. proper g++ compilation -# 2. solved std::function call issue - we can swap back to ToT -# As of 7/16 10:20AM PST - I've been told its was solved but I have not been able to test it E2E -# So for now we will stay on my side branch -git clone https://github.com/ishandhanani/Mooncake.git -cd Mooncake -git checkout ishan/pr-571-diff-build -bash dependencies.sh -y -mkdir -p build -cd build -cmake .. -DUSE_MNNVL=ON -make -j - -make install - -chmod +x /usr/local/lib/python3.10/dist-packages/mooncake/nvlink_allocator.so - -echo "Mooncake built and installed" From 2c3f0855c84817788bdf978f8a5729e6173b1f71 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 17 Jul 2025 00:56:11 +0000 Subject: [PATCH 57/65] bump --- examples/sglang/slurm_jobs/scripts/gb200.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 486c2a6c7e..aee53c3407 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -92,6 +92,7 @@ if [ "$mode" = "prefill" ]; then --trust-remote-code \ --disaggregation-mode prefill \ --dist-init-addr "$HOST_IP:$PORT" \ + --disaggregation-bootstrap-port 30001 \ --nnodes "$TOTAL_NODES" \ --node-rank "$RANK" \ --tp-size "$TOTAL_GPUS" \ @@ -141,6 +142,7 @@ if [ "$mode" = "prefill" ]; then --trust-remote-code \ --disaggregation-mode prefill \ --dist-init-addr "$HOST_IP:$PORT" \ + --disaggregation-bootstrap-port 30001 \ --nnodes "$TOTAL_NODES" \ --node-rank "$RANK" \ --tp-size "$TOTAL_GPUS" \ From aa123a1ee0c5a01ffa9e5c5a50279c6351e98b3e Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 17 Jul 2025 01:34:02 +0000 Subject: [PATCH 58/65] bump --- 
examples/sglang/slurm_jobs/scripts/gb200.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index aee53c3407..439e977982 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -191,6 +191,7 @@ elif [ "$mode" = "decode" ]; then --trust-remote-code \ --disaggregation-mode decode \ --dist-init-addr "$HOST_IP:$PORT" \ + --disaggregation-bootstrap-port 30001 \ --nnodes "$TOTAL_NODES" \ --node-rank "$RANK" \ --tp-size "$TOTAL_GPUS" \ @@ -238,6 +239,7 @@ elif [ "$mode" = "decode" ]; then --trust-remote-code \ --disaggregation-mode decode \ --dist-init-addr "$HOST_IP:$PORT" \ + --disaggregation-bootstrap-port 30001 \ --nnodes "$TOTAL_NODES" \ --node-rank "$RANK" \ --tp-size "$TOTAL_GPUS" \ From 58f4d330aa1167725bd8c5cf271d01eced13925d Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 17 Jul 2025 01:51:26 +0000 Subject: [PATCH 59/65] sadness --- examples/sglang/slurm_jobs/scripts/gb200.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 439e977982..88e5d954ce 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -186,7 +186,7 @@ elif [ "$mode" = "decode" ]; then SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ PYTHONUNBUFFERED=1 \ - python3 components/worker.py \ + python3 components/decode_worker.py \ --model-path /model/ \ --trust-remote-code \ --disaggregation-mode decode \ From d4fc6be1f3b675aaa3b2c37ed771ad5ca2a4c997 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 17 Jul 2025 02:12:12 +0000 Subject: [PATCH 60/65] so close to crash out --- examples/sglang/slurm_jobs/scripts/gb200.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh 
b/examples/sglang/slurm_jobs/scripts/gb200.sh index 88e5d954ce..2b46f346a0 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -89,6 +89,7 @@ if [ "$mode" = "prefill" ]; then python3 components/worker.py \ --served-model-name deepseek-ai/DeepSeek-R1 \ --model-path /model/ \ + --skip-tokenizer-init \ --trust-remote-code \ --disaggregation-mode prefill \ --dist-init-addr "$HOST_IP:$PORT" \ @@ -187,7 +188,9 @@ elif [ "$mode" = "decode" ]; then SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ PYTHONUNBUFFERED=1 \ python3 components/decode_worker.py \ + --served-model-name deepseek-ai/DeepSeek-R1 \ --model-path /model/ \ + --skip-tokenizer-init \ --trust-remote-code \ --disaggregation-mode decode \ --dist-init-addr "$HOST_IP:$PORT" \ From 4394938815e9202e77e3b65bf6a331e6621f3e2c Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 17 Jul 2025 02:25:45 +0000 Subject: [PATCH 61/65] bump --- examples/sglang/docs/dsr1-wideep-gb200.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sglang/docs/dsr1-wideep-gb200.md b/examples/sglang/docs/dsr1-wideep-gb200.md index e7874f64fb..bb07ccb424 100644 --- a/examples/sglang/docs/dsr1-wideep-gb200.md +++ b/examples/sglang/docs/dsr1-wideep-gb200.md @@ -73,6 +73,6 @@ python3 submit_job_script.py \ --decode-nodes 12 ``` -**UNTESTED**: if you want to spin up dynamo, you can remove the `--use-sglang-commands` flag. +**Note**: if you want to spin up dynamo, you can remove the `--use-sglang-commands` flag. 6. This will create a logs directory in the `examples/sglang/slurm_jobs` directory. You can `cd` into the directory, cd into your job id, and then run `tail -f *_prefill.err *_decode.err` or `tail -f *_prefill.out *_decode.out` to see the logs. 
\ No newline at end of file From 11d68c26102c4f0a447867b9bbdee3816ba5db26 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 17 Jul 2025 19:49:27 +0000 Subject: [PATCH 62/65] update the gb200 dockerfile --- container/Dockerfile.sglang-gb200 | 33 +++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/container/Dockerfile.sglang-gb200 b/container/Dockerfile.sglang-gb200 index 48f7e34e62..5af553feb1 100644 --- a/container/Dockerfile.sglang-gb200 +++ b/container/Dockerfile.sglang-gb200 @@ -69,4 +69,37 @@ COPY examples/sglang/utils/benchmarking/* /sgl-workspace/dynamo/examples/sglang/ ENV PYTHONPATH=/sgl-workspace/dynamo/deploy/sdk/src:/workspace/dynamo/components/planner/src:/workspace/examples/sglang:$PYTHONPATH +# properly install cmake so that gap can be installed +RUN cmake --version + +ARG CMAKE_VERSION=3.31.8 +RUN mkdir /sgl-workspace/cmake_build +WORKDIR /sgl-workspace/cmake_build + +# uninstall CMake +RUN apt-get purge -y cmake +# download newer version of CMake +RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz && \ + tar -xvzf cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz && \ + mv cmake-${CMAKE_VERSION}-linux-$(uname -m) custom_cmake +ENV PATH=/sgl-workspace/cmake_build/custom_cmake/bin:$PATH + +# should be 3.31.8 +RUN cmake --version + +# Install perf_analyzer and genai-perf +RUN apt-get update -y && \ + apt-get install -y --no-install-recommends \ + rapidjson-dev \ + zlib1g-dev + +RUN git clone --depth=1 https://github.com/triton-inference-server/perf_analyzer.git && \ + mkdir perf_analyzer/build && \ + cmake -B perf_analyzer/build -S perf_analyzer && \ + cmake --build perf_analyzer/build -- -j8 + +ENV PATH=/sgl-workspace/perf_analyzer/build/perf_analyzer/src/perf-analyzer-build:$PATH + +RUN pip install --break-system-packages genai-perf + WORKDIR /sgl-workspace/dynamo/examples/sglang \ No newline at end of file From 
b40e60ea1b031b012830f91e929cff12da1d9cdc Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 29 Jul 2025 15:55:54 +0000 Subject: [PATCH 63/65] nixl --- examples/sglang/slurm_jobs/scripts/gb200.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 2b46f346a0..cbc8cbce88 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -94,6 +94,7 @@ if [ "$mode" = "prefill" ]; then --disaggregation-mode prefill \ --dist-init-addr "$HOST_IP:$PORT" \ --disaggregation-bootstrap-port 30001 \ + --disaggregation-transfer-backend nixl \ --nnodes "$TOTAL_NODES" \ --node-rank "$RANK" \ --tp-size "$TOTAL_GPUS" \ From a4decf81db825f1f91e102fe8e0313ebd3a16b69 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 31 Jul 2025 18:57:27 +0000 Subject: [PATCH 64/65] feat(docs): update DeepSeek-R1 instructions for GB200 and WideEP container configuration --- .../backends/sglang/docs/dsr1-wideep-gb200.md | 50 +++-- .../backends/sglang/docs/dsr1-wideep-h100.md | 2 +- container/Dockerfile.sglang-gb200 | 105 --------- container/Dockerfile.sglang-wideep | 207 ++++++++---------- 4 files changed, 125 insertions(+), 239 deletions(-) delete mode 100644 container/Dockerfile.sglang-gb200 diff --git a/components/backends/sglang/docs/dsr1-wideep-gb200.md b/components/backends/sglang/docs/dsr1-wideep-gb200.md index bb07ccb424..757dbc0e6b 100644 --- a/components/backends/sglang/docs/dsr1-wideep-gb200.md +++ b/components/backends/sglang/docs/dsr1-wideep-gb200.md @@ -17,34 +17,54 @@ limitations under the License. # Running DeepSeek-R1 Disaggregated with WideEP on GB200s -Dynamo supports SGLang's GB200 implementation of wide expert parallelism and large scale P/D for DeepSeek-R1! You can read their blog post [here](https://lmsys.org/blog/2025-06-16-gb200-part-1/) for more details. 
Full end to end optimization is still a work in progress but you can get this up and running with the following steps. +Dynamo supports SGLang's GB200 implementation of wide expert parallelism and large scale P/D for DeepSeek-R1! You can read their blog post [here](https://lmsys.org/blog/2025-06-16-gb200-part-1/) for more details. Full end to end optimization is still a work in progress but you can get this up and running with the following steps. In this example, we will run 1 prefill worker on 2 GB200 nodes (4 GPUs each) and 1 decode worker on 12 GB200 nodes (total 56 GPUs). ## Instructions -1. Build the SGLang DeepEP container on an ARM64 machine. -> [!NOTE] -> This sglang side branch is based on an open [PR](https://github.com/sgl-project/sglang/pull/7721/files) to SGLang that allows their main dockerfile to be built for aarch64. Once that PR is merged in, we can add the gb200 dockerfile to the main sglang repo. +1. Build the Dynamo container ```bash -git clone https://github.com/kyleliang-nv/sglang.git -cd sglang -git checkout sglang_gb200_wideep_docker -docker build -f docker/Dockerfile -t sgl-blackwell-wideep --build-arg BUILD_TYPE=blackwell --build-arg CUDA_VERSION=12.8.1 . +cd $DYNAMO_ROOT +docker build \ + -f container/Dockerfile.sglang-wideep \ + -t dynamo-wideep-gb200 \ + --build-arg MODE=blackwell \ + --build-arg SGLANG_IMAGE_TAG=v0.4.9.post6-cu128-gb200 \ + --build-arg ARCH=arm64 \ + --build-arg ARCH_ALT=aarch64 \ + . \ + --no-cache ``` -2. Build the Dynamo container +2. You can run this container on each 4xGB200 node using the following command. -> [!NOTE] -> This is a side branch that contains all of the scripts to run on GB200s. Once the PR is merged in, we can switch to the main branch. +> [!IMPORTANT] +> We recommend downloading DeepSeek-R1 and then mounting it to the container. 
You can find the model [here](https://huggingface.co/deepseek-ai/DeepSeek-R1) ```bash -cd $DYNAMO_ROOT -git checkout ishan/more-slurm-targets # temporary -docker build -f container/Dockerfile.sglang-gb200 . -t dynamo-wideep-gb200 --no-cache +docker run \ + --gpus all \ + -it \ + --rm \ + --network host \ + --volume /PATH_TO_DSR1_MODEL/:/model/ \ + --shm-size=10G \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --ulimit nofile=65536:65536 \ + --cap-add CAP_SYS_PTRACE \ + --ipc host \ + dynamo-wideep-gb200:latest +``` + +4. On the head prefill node, run the helper script provided to generate commands to start the `nats-server`, `etcd`. This script will also tell you which environment variables to export on each node to make deployment easier. + +```bash +./utils/gen_env_vars.sh ``` -3. In your SLURM cluster, clone dynamo and switch to this side branch. +In each container, you should be in the `/sgl-workspace/dynamo/components/backends/sglang` directory. ```bash git clone https://github.com/ai-dynamo/dynamo.git diff --git a/components/backends/sglang/docs/dsr1-wideep-h100.md b/components/backends/sglang/docs/dsr1-wideep-h100.md index a23a3ada13..e1dc372146 100644 --- a/components/backends/sglang/docs/dsr1-wideep-h100.md +++ b/components/backends/sglang/docs/dsr1-wideep-h100.md @@ -57,7 +57,7 @@ In each container, you should be in the `/sgl-workspace/dynamo/components/backen ```bash # run ingress -dynamo run in=http out=dyn & +python3 -m dynamo.frontend --http-port=8000 & # optionally run the http server that allows you to flush the kv cache for all workers (see benchmarking section below) python3 utils/sgl_http_server.py --ns dynamo & # run prefill worker diff --git a/container/Dockerfile.sglang-gb200 b/container/Dockerfile.sglang-gb200 deleted file mode 100644 index 5af553feb1..0000000000 --- a/container/Dockerfile.sglang-gb200 +++ /dev/null @@ -1,105 +0,0 @@ -FROM sgl-blackwell-wideep:latest - -# Define architecture variables for ARM64 -ARG ARCH=arm64 -ARG 
ARCH_ALT=aarch64 - -WORKDIR /sgl-workspace -# https://github.com/ai-dynamo/dynamo/pull/1938 -ARG DYNAMO_COMMIT="aba60996f225038b691d9f255da515b27695b179" -RUN git clone https://github.com/ai-dynamo/dynamo.git && cd dynamo && git checkout ${DYNAMO_COMMIT} - -# install dynamo in editable mode -WORKDIR /sgl-workspace/dynamo -# Rust build/dev dependencies -RUN apt update -y && \ - apt install --no-install-recommends -y \ - build-essential \ - protobuf-compiler \ - cmake \ - libssl-dev \ - pkg-config \ - clang \ - libclang-dev \ - git - -# Define Rust target based on ARCH_ALT ARG -ARG RUSTARCH=${ARCH_ALT}-unknown-linux-gnu - -ENV RUSTUP_HOME=/usr/local/rustup \ - CARGO_HOME=/usr/local/cargo \ - PATH=/usr/local/cargo/bin:$PATH \ - RUST_VERSION=1.86.0 - -# Install Rust using RUSTARCH derived from ARCH_ALT -RUN wget --tries=3 --waitretry=5 "https://static.rust-lang.org/rustup/archive/1.28.1/${RUSTARCH}/rustup-init" && \ - # TODO: Add SHA check back based on RUSTARCH - chmod +x rustup-init && \ - ./rustup-init -y --no-modify-path --profile minimal --default-toolchain $RUST_VERSION --default-host ${RUSTARCH} && \ - rm rustup-init && \ - chmod -R a+w $RUSTUP_HOME $CARGO_HOME - -ARG CARGO_BUILD_JOBS -# Set CARGO_BUILD_JOBS to 16 if not provided -# This is to prevent cargo from building $(nproc) jobs in parallel, -# which might exceed the number of opened files limit. -ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} - -RUN cargo build --release -RUN mkdir -p deploy/sdk/src/dynamo/sdk/cli/bin -RUN cp target/release/dynamo-run deploy/sdk/src/dynamo/sdk/cli/bin - -RUN cd lib/bindings/python && pip install --break-system-packages -e . && cd ../../.. -RUN pip install --break-system-packages -e . 
- -ENV PYTHONPATH=/sgl-workspace/dynamo/components/planner/src:/sgl-workspace/dynamo/examples/sglang:$PYTHONPATH - -RUN wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/v2.10.28/nats-server-v2.10.28-${ARCH}.deb && \ - dpkg -i nats-server-v2.10.28-${ARCH}.deb && rm nats-server-v2.10.28-${ARCH}.deb - -ENV ETCD_VERSION="v3.5.21" -RUN wget --tries=3 --waitretry=5 https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-${ARCH}.tar.gz -O /tmp/etcd.tar.gz && \ - mkdir -p /usr/local/bin/etcd && \ - tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 && \ - rm /tmp/etcd.tar.gz -ENV PATH=/usr/local/bin/etcd/:$PATH - -COPY examples/sglang/configs/deepseek_r1/wideep/* /sgl-workspace/dynamo/examples/sglang/configs/ -COPY examples/sglang/utils/benchmarking/* /sgl-workspace/dynamo/examples/sglang/utils/ - -ENV PYTHONPATH=/sgl-workspace/dynamo/deploy/sdk/src:/workspace/dynamo/components/planner/src:/workspace/examples/sglang:$PYTHONPATH - -# properly install cmake so that gap can be installed -RUN cmake --version - -ARG CMAKE_VERSION=3.31.8 -RUN mkdir /sgl-workspace/cmake_build -WORKDIR /sgl-workspace/cmake_build - -# uninstall CMake -RUN apt-get purge -y cmake -# download newer version of CMake -RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz && \ - tar -xvzf cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz && \ - mv cmake-${CMAKE_VERSION}-linux-$(uname -m) custom_cmake -ENV PATH=/sgl-workspace/cmake_build/custom_cmake/bin:$PATH - -# should be 3.31.8 -RUN cmake --version - -# Install perf_analyzer and genai-perf -RUN apt-get update -y && \ - apt-get install -y --no-install-recommends \ - rapidjson-dev \ - zlib1g-dev - -RUN git clone --depth=1 https://github.com/triton-inference-server/perf_analyzer.git && \ - mkdir perf_analyzer/build && \ - cmake -B perf_analyzer/build -S perf_analyzer && \ - cmake --build 
perf_analyzer/build -- -j8 - -ENV PATH=/sgl-workspace/perf_analyzer/build/perf_analyzer/src/perf-analyzer-build:$PATH - -RUN pip install --break-system-packages genai-perf - -WORKDIR /sgl-workspace/dynamo/examples/sglang \ No newline at end of file diff --git a/container/Dockerfile.sglang-wideep b/container/Dockerfile.sglang-wideep index e6aa11092f..dfcc0090ac 100644 --- a/container/Dockerfile.sglang-wideep +++ b/container/Dockerfile.sglang-wideep @@ -13,160 +13,131 @@ # See the License for the specific language governing permissions and # limitations under the License. -# This should be pinned to the sglang version that is installed with Dynamo -# in the pyproject.toml -FROM lmsysorg/sglang:v0.4.8.post1-cu126 +FROM lmsysorg/sglang:${SGLANG_IMAGE_TAG} + +ARG MODE="hopper" +ARG SGLANG_IMAGE_TAG="v0.4.8.post1-cu126" +ARG ARCH="amd64" +ARG ARCH_ALT="x86_64" +ARG NIXL_UCX_REF="v1.19.x" +ARG NIXL_TAG="0.4.1" +ARG CMAKE_VERSION="3.31.8" +ARG RUST_VERSION="1.87.0" +ARG CARGO_BUILD_JOBS="16" -# Add NIXL build dependencies RUN apt-get update -y && \ apt-get install -y \ - cmake \ - meson \ - ninja-build \ - pybind11-dev \ - patchelf \ - net-tools - -# Install Python build dependencies -RUN pip install --break-system-packages meson-python wheel build - -# Add architecture args for NIXL build -ARG ARCH=amd64 -ARG ARCH_ALT=x86_64 - -WORKDIR /sgl-workspace - -# Install UCX dependencies -RUN apt-get update -y && \ - apt-get install -y --no-install-recommends \ - --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev \ - libnuma-dev librdmacm-dev ibverbs-providers \ - autoconf libtool - -# Build UCX from source -ARG NIXL_UCX_REF=v1.19.x -RUN rm -rf /opt/hpcx/ucx && \ - rm -rf /usr/local/ucx && \ - cd /usr/local/src && \ - git clone https://github.com/openucx/ucx.git && \ - cd ucx && \ - git checkout $NIXL_UCX_REF && \ - ./autogen.sh && ./configure \ - --prefix=/usr/local/ucx \ - --enable-shared \ - --disable-static \ - --disable-doxygen-doc \ - --enable-optimizations \ 
- --enable-cma \ - --enable-devel-headers \ - --with-cuda=/usr/local/cuda \ - --with-verbs \ - --with-efa \ - --with-dm \ - --with-gdrcopy=/usr/local \ - --enable-mt && \ - make -j && \ - make -j install-strip && \ - ldconfig + cmake meson ninja-build pybind11-dev patchelf net-tools \ + build-essential protobuf-compiler libssl-dev pkg-config \ + clang libclang-dev git rapidjson-dev zlib1g-dev && \ + pip install --break-system-packages meson-python wheel build + +# Build UCX + NIXL for x86/hopper until its fully tested on GB200 +RUN if [ "$MODE" = "hopper" ]; then \ + apt-get install -y --no-install-recommends \ + libibverbs-dev rdma-core ibverbs-utils libibumad-dev \ + libnuma-dev librdmacm-dev ibverbs-providers autoconf libtool && \ + # UCX from source + rm -rf /opt/hpcx/ucx /usr/local/ucx && \ + cd /usr/local/src && \ + git clone https://github.com/openucx/ucx.git && \ + cd ucx && git checkout $NIXL_UCX_REF && \ + ./autogen.sh && \ + ./configure \ + --prefix=/usr/local/ucx \ + --enable-shared \ + --disable-static \ + --disable-doxygen-doc \ + --enable-optimizations \ + --enable-cma \ + --enable-devel-headers \ + --with-cuda=/usr/local/cuda \ + --with-verbs \ + --with-efa \ + --with-dm \ + --with-gdrcopy=/usr/local \ + --enable-mt && \ + make -j && make install-strip && ldconfig && \ + # NIXL + git clone https://github.com/ai-dynamo/nixl.git /opt/nixl && \ + cd /opt/nixl && git checkout $NIXL_TAG && \ + pip install --break-system-packages . \ + --config-settings="setup-args=-Ducx_path=/usr/local/ucx"; \ + fi ENV LD_LIBRARY_PATH=/usr/lib:/usr/local/ucx/lib:$LD_LIBRARY_PATH -ARG NIXL_TAG=0.3.1 -RUN git clone https://github.com/ai-dynamo/nixl.git && cd nixl && git checkout ${NIXL_TAG} && pip install --break-system-packages . 
--config-settings=setup-args="-Ducx_path=/usr/local/ucx" - -WORKDIR /sgl-workspace - -# Allow forceful shutdown of inflight requests -ENV SGL_FORCE_SHUTDOWN=1 - +# Dynamo WORKDIR /sgl-workspace RUN git clone https://github.com/ai-dynamo/dynamo.git -# install dynamo in editable mode -WORKDIR /sgl-workspace/dynamo -# Rust build/dev dependencies -RUN apt update -y && \ - apt install --no-install-recommends -y \ - build-essential \ - protobuf-compiler \ - cmake \ - libssl-dev \ - pkg-config \ - clang \ - libclang-dev \ - git - -# Define Rust target based on ARCH_ALT ARG -ARG RUSTARCH=${ARCH_ALT}-unknown-linux-gnu - ENV RUSTUP_HOME=/usr/local/rustup \ CARGO_HOME=/usr/local/cargo \ - PATH=/usr/local/cargo/bin:$PATH \ - RUST_VERSION=1.86.0 + PATH=/usr/local/cargo/bin:$PATH -# Install Rust using RUSTARCH derived from ARCH_ALT -RUN wget --tries=3 --waitretry=5 "https://static.rust-lang.org/rustup/archive/1.28.1/${RUSTARCH}/rustup-init" && \ - # TODO: Add SHA check back based on RUSTARCH +RUN wget --tries=3 --waitretry=5 \ + "https://static.rust-lang.org/rustup/archive/1.28.1/${ARCH_ALT}-unknown-linux-gnu/rustup-init" && \ chmod +x rustup-init && \ - ./rustup-init -y --no-modify-path --profile minimal --default-toolchain $RUST_VERSION --default-host ${RUSTARCH} && \ + ./rustup-init -y \ + --no-modify-path \ + --profile minimal \ + --default-toolchain $RUST_VERSION \ + --default-host ${ARCH_ALT}-unknown-linux-gnu && \ rm rustup-init && \ chmod -R a+w $RUSTUP_HOME $CARGO_HOME ARG CARGO_BUILD_JOBS -# Set CARGO_BUILD_JOBS to 16 if not provided -# This is to prevent cargo from building $(nproc) jobs in parallel, -# which might exceed the number of opened files limit. -ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} +ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS} + +RUN cd dynamo && cargo build --release -RUN cargo build --release +RUN cd dynamo/lib/bindings/python && \ + pip install --break-system-packages -e . && \ + cd /sgl-workspace/dynamo && \ + pip install --break-system-packages . 
-RUN cd lib/bindings/python && pip install --break-system-packages -e . && cd ../../.. -RUN pip install --break-system-packages . +RUN pip install --break-system-packages sglang-router==0.1.5 -RUN wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/v2.10.28/nats-server-v2.10.28-${ARCH}.deb && \ +RUN wget --tries=3 --waitretry=5 \ + https://github.com/nats-io/nats-server/releases/download/v2.10.28/\ +nats-server-v2.10.28-${ARCH}.deb && \ dpkg -i nats-server-v2.10.28-${ARCH}.deb && rm nats-server-v2.10.28-${ARCH}.deb ENV ETCD_VERSION="v3.5.21" -RUN wget --tries=3 --waitretry=5 https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-${ARCH}.tar.gz -O /tmp/etcd.tar.gz && \ +RUN wget --tries=3 --waitretry=5 \ + https://github.com/etcd-io/etcd/releases/download/${ETCD_VERSION}/\ +etcd-${ETCD_VERSION}-linux-${ARCH}.tar.gz -O /tmp/etcd.tar.gz && \ mkdir -p /usr/local/bin/etcd && \ - tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 && \ + tar -xzf /tmp/etcd.tar.gz \ + -C /usr/local/bin/etcd --strip-components=1 && \ rm /tmp/etcd.tar.gz -ENV PATH=/usr/local/bin/etcd/:$PATH -ARG CMAKE_VERSION=3.31.8 -RUN mkdir /sgl-workspace/cmake_build -WORKDIR /sgl-workspace/cmake_build +ENV PATH=/usr/local/bin/etcd:$PATH -# uninstall CMake +# GenAI Perf RUN apt-get purge -y cmake -# download newer version of CMake -RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz && \ - tar -xvzf cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz && \ - mv cmake-${CMAKE_VERSION}-linux-$(uname -m) custom_cmake -ENV PATH=/sgl-workspace/cmake_build/custom_cmake/bin:$PATH -# should be 3.31.8 +RUN mkdir /sgl-workspace/cmake_build && \ + cd /sgl-workspace/cmake_build && \ + wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/\ +cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz && \ + tar -xzf cmake-${CMAKE_VERSION}-linux-$(uname 
-m).tar.gz && \ + mv cmake-${CMAKE_VERSION}-linux-$(uname -m) custom_cmake && \ + rm cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz + +ENV PATH=/sgl-workspace/cmake_build/custom_cmake/bin:$PATH RUN cmake --version -# Install perf_analyzer and genai-perf -RUN apt-get update -y && \ - apt-get install -y --no-install-recommends \ - rapidjson-dev \ - # jq and curl for polling various endpoints and health checks - jq \ - curl \ - zlib1g-dev - -RUN git clone --depth=1 https://github.com/triton-inference-server/perf_analyzer.git && \ +RUN git clone --depth=1 \ + https://github.com/triton-inference-server/perf_analyzer.git && \ mkdir perf_analyzer/build && \ cmake -B perf_analyzer/build -S perf_analyzer && \ - cmake --build perf_analyzer/build -- -j8 + cmake --build perf_analyzer/build -- -j$(nproc) ENV PATH=/sgl-workspace/perf_analyzer/build/perf_analyzer/src/perf-analyzer-build:$PATH - RUN pip install --break-system-packages genai-perf -# https://pypi.org/project/sglang-router/0.1.5 is latest -RUN pip install sglang-router==0.1.5 +# Enable forceful shutdown of inflight requests +ENV SGL_FORCE_SHUTDOWN=1 WORKDIR /sgl-workspace/dynamo/components/backends/sglang From ca6040d3f19bbe8d8860dcaa2282bde13566923b Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 31 Jul 2025 19:05:19 +0000 Subject: [PATCH 65/65] docs(sglang): update deployment instructions for GB200 and H100 models --- .../backends/sglang/docs/dsr1-wideep-gb200.md | 126 ++++++++++++++---- .../backends/sglang/docs/dsr1-wideep-h100.md | 2 +- 2 files changed, 101 insertions(+), 27 deletions(-) diff --git a/components/backends/sglang/docs/dsr1-wideep-gb200.md b/components/backends/sglang/docs/dsr1-wideep-gb200.md index 757dbc0e6b..ea987fae0f 100644 --- a/components/backends/sglang/docs/dsr1-wideep-gb200.md +++ b/components/backends/sglang/docs/dsr1-wideep-gb200.md @@ -21,7 +21,6 @@ Dynamo supports SGLang's GB200 implementation of wide expert parallelism and lar ## Instructions - 1. 
Build the Dynamo container ```bash @@ -58,41 +57,116 @@ docker run \ dynamo-wideep-gb200:latest ``` -4. On the head prefill node, run the helper script provided to generate commands to start the `nats-server`, `etcd`. This script will also tell you which environment variables to export on each node to make deployment easier. +3. On the head prefill node, run the helper script provided to generate commands to start the `nats-server`, `etcd`. This script will also tell you which environment variables to export on each node to make deployment easier. ```bash ./utils/gen_env_vars.sh ``` -In each container, you should be in the `/sgl-workspace/dynamo/components/backends/sglang` directory. +4. Run the ingress and prefill worker ```bash -git clone https://github.com/ai-dynamo/dynamo.git -git checkout ishan/more-slurm-targets -cd examples/sglang/slurm_jobs +# run ingress +python3 -m dynamo.frontend --http-port=8000 & +# optionally run the http server that allows you to flush the kv cache for all workers (see benchmarking section below) +python3 utils/sgl_http_server.py --ns dynamo & +# run prefill worker +SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \ +MC_TE_METRIC=true \ +SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \ +SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \ +SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \ +SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ +MC_FORCE_MNNVL=1 \ +NCCL_MNNVL_ENABLE=1 \ +NCCL_CUMEM_ENABLE=1 \ +SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ +SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ +PYTHONUNBUFFERED=1 \ +python3 components/worker.py \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --model-path /model/ \ + --skip-tokenizer-init \ + --trust-remote-code \ + --disaggregation-mode prefill \ + --dist-init-addr ${HEAD_PREFILL_NODE_IP}:29500 \ + --disaggregation-bootstrap-port 30001 \ + --disaggregation-transfer-backend nixl \ + --nnodes 2 \ + --node-rank 0 \ + --tp-size 8 \ + --dp-size 8 \ + --enable-dp-attention \ + --host 0.0.0.0 \ + 
--decode-log-interval 1 \ + --max-running-requests 6144 \ + --context-length 2716 \ + --disable-radix-cache \ + --enable-deepep-moe \ + --deepep-mode low_latency \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-shared-experts-fusion \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm static \ + --eplb-algorithm deepseek \ + --attention-backend cutlass_mla \ + --watchdog-timeout 1000000 \ + --disable-cuda-graph \ + --chunked-prefill-size 16384 \ + --max-total-tokens 32768 \ + --mem-fraction-static 0.8 \ + --log-level debug ``` -4. Ensure you have the proper paths that you can use to mount things to the container - -- The path to the DSR1 model which should be mounted to the `--model-dir` flag and `--config-dir` flag - -5. Run the following command to submit the job +5. Run the decode worker on the head decode node ```bash -python3 submit_job_script.py \ - --template job_script_template.j2 \ - --model-dir \ - --container-image \ - --account \ - --gpus-per-node 4 \ - --config-dir \ - --network-interface enp138s0f0np0 \ - --gpu-type gb200 \ - --use-sglang-commands \ - --prefill-nodes 2 \ - --decode-nodes 12 +SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=768 \ +MC_TE_METRIC=true \ +SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \ +SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \ +SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \ +SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \ +SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ +NCCL_MNNVL_ENABLE=1 \ +MC_FORCE_MNNVL=1 \ +NCCL_CUMEM_ENABLE=1 \ +SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ +SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ +PYTHONUNBUFFERED=1 \ +python3 components/decode_worker.py \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --model-path /model/ \ + --skip-tokenizer-init \ + --trust-remote-code \ + --disaggregation-mode decode \ + --dist-init-addr ${HEAD_DECODE_NODE_IP}:29500 \ + --disaggregation-bootstrap-port 30001 \ + --nnodes 12 \ + --node-rank 0 \ + --tp-size 48 \ + --dp-size 48 \ + 
--enable-dp-attention \ + --host 0.0.0.0 \ + --decode-log-interval 1 \ + --max-running-requests 36864 \ + --context-length 2716 \ + --disable-radix-cache \ + --enable-deepep-moe \ + --deepep-mode low_latency \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --cuda-graph-bs 768 \ + --disable-shared-experts-fusion \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm static \ + --eplb-algorithm deepseek \ + --attention-backend cutlass_mla \ + --watchdog-timeout 1000000 \ + --chunked-prefill-size 36864 \ + --mem-fraction-static 0.82 \ + --log-level debug ``` -**Note**: if you want to spin up dynamo, you can remove the `--use-sglang-commands` flag. - -6. This will create a logs directory in the `examples/sglang/slurm_jobs` directory. You can `cd` into the directory, cd into your job id, and then run `tail -f *_prefill.err *_decode.err` or `tail -f *_prefill.out *_decode.out` to see the logs. \ No newline at end of file +On the other decode nodes (this example has 12 total decode nodes), run the same command but change `--node-rank` to 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 diff --git a/components/backends/sglang/docs/dsr1-wideep-h100.md b/components/backends/sglang/docs/dsr1-wideep-h100.md index e1dc372146..57f0b6ba3b 100644 --- a/components/backends/sglang/docs/dsr1-wideep-h100.md +++ b/components/backends/sglang/docs/dsr1-wideep-h100.md @@ -93,7 +93,7 @@ python3 -m dynamo.sglang.worker \ On the other prefill node (since this example has 4 total prefill nodes), run the same command but change `--node-rank` to 1,2, and 3 -7. Run the decode worker on the head decode node +6. Run the decode worker on the head decode node ```bash python3 -m dynamo.sglang.decode_worker \