From a28da9235be2b3f963ea5f36dab7dbefca519c57 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 22:39:51 +0000 Subject: [PATCH 01/65] iter --- examples/sglang/slurm_jobs/scripts/gen_cmd.py | 250 ++++++++++++++++++ .../sglang/slurm_jobs/scripts/worker_setup.py | 101 ++++--- 2 files changed, 315 insertions(+), 36 deletions(-) create mode 100644 examples/sglang/slurm_jobs/scripts/gen_cmd.py diff --git a/examples/sglang/slurm_jobs/scripts/gen_cmd.py b/examples/sglang/slurm_jobs/scripts/gen_cmd.py new file mode 100644 index 0000000000..c7096bd3a2 --- /dev/null +++ b/examples/sglang/slurm_jobs/scripts/gen_cmd.py @@ -0,0 +1,250 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Use this script to generate dynamo/sglang flags for h100 or gb200 disagg +""" + +def get_prefill_command_args(config_flag: str, host_ip: str, port: int, total_nodes: int, rank: int, total_gpus: int) -> dict: + """ + Get the command arguments for a specific config and worker type. 
+ + Args: + config_flag: One of "h100_dynamo", "h100_sglang", "gb200_dynamo", "gb200_sglang" + worker_type: "prefill" or "decode" + + Returns: + Dictionary with 'script' and 'args' keys + """ + + # TODO: validate if config_flag is valid and exists in support matrix + + if config_flag == "h100_dynamo": + return { + "script": "python3 components/worker.py", + "args": [ + "--model-path /model/ " + "--served-model-name deepseek-ai/DeepSeek-R1 " + "--skip-tokenizer-init " + "--disaggregation-mode prefill " + "--disaggregation-transfer-backend nixl " + "--disaggregation-bootstrap-port 30001 " + f"--dist-init-addr {host_ip}:{port} " + f"--nnodes {total_nodes} " + f"--node-rank {rank} " + f"--tp-size {total_gpus} " + f"--dp-size {total_gpus} " + "--enable-dp-attention " + "--decode-log-interval 1 " + "--enable-deepep-moe " + "--page-size 1 " + "--trust-remote-code " + "--moe-dense-tp-size 1 " + "--enable-dp-lm-head " + "--disable-radix-cache " + "--watchdog-timeout 1000000 " + "--enable-two-batch-overlap " + "--deepep-mode normal " + "--mem-fraction-static 0.85 " + "--deepep-config /configs/deepep.json " + "--ep-num-redundant-experts 32 " + "--ep-dispatch-algorithm dynamic " + "--eplb-algorithm deepseek " + ] + } + elif config_flag == "h100_sglang": + return { + "script": "python3 -m sglang.launch_server", + "args": [ + "--model-path /model/ ", + "--served-model-name deepseek-ai/DeepSeek-R1 ", + "--disaggregation-transfer-backend nixl ", + "--disaggregation-mode prefill ", + f"--dist-init-addr {host_ip}:{port} ", + f"--nnodes {total_nodes} ", + f"--node-rank {rank} ", + f"--tp-size {total_gpus} ", + f"--dp-size {total_gpus} ", + "--enable-dp-attention ", + "--decode-log-interval 1 ", + "--enable-deepep-moe ", + "--page-size 1 ", + "--host 0.0.0.0 ", + "--trust-remote-code ", + "--moe-dense-tp-size 1 ", + "--enable-dp-lm-head ", + "--disable-radix-cache ", + "--watchdog-timeout 1000000 ", + "--enable-two-batch-overlap ", + "--deepep-mode normal ", + "--mem-fraction-static 
0.85 ", + "--chunked-prefill-size 524288 ", + "--max-running-requests 8192 ", + "--max-total-tokens 131072 ", + "--context-length 8192 ", + "--init-expert-location /configs/prefill_in4096.json ", + "--ep-num-redundant-experts 32 ", + "--ep-dispatch-algorithm dynamic ", + "--eplb-algorithm deepseek ", + "--deepep-config /configs/deepep.json " + ] + } + elif config_flag == "gb200_sglang": + return { + "script": "python3 -m sglang.launch_server", + "args": [ + "--served-model-name deepseek-ai/DeepSeek-R1 ", + "--model-path /model/ ", + "--trust-remote-code ", + "--disaggregation-mode prefill ", + f"--dist-init-addr {host_ip}:{port} ", + f"--nnodes {total_nodes} ", + f"--node-rank {rank} ", + f"--tp-size {total_gpus} ", + f"--dp-size {total_gpus} ", + "--enable-dp-attention ", + "--host 0.0.0.0 ", + "--decode-log-interval 1 ", + "--max-running-requests 6144 ", + "--context-length 2176 ", + "--disable-radix-cache ", + "--enable-deepep-moe ", + "--deepep-mode low_latency ", + "--moe-dense-tp-size 1 ", + "--enable-dp-lm-head ", + "--disable-shared-experts-fusion ", + "--ep-num-redundant-experts 32 ", + "--ep-dispatch-algorithm static ", + "--eplb-algorithm deepseek ", + "--attention-backend cutlass_mla ", + "--watchdog-timeout 1000000 ", + "--disable-cuda-graph ", + "--chunked-prefill-size 16384 ", + "--max-total-tokens 32768 ", + "--mem-fraction-static 0.9 " + ] + } + else: + raise ValueError(f"Invalid config flag: {config_flag}") + +def get_decode_command_args(config_flag: str, host_ip: str, port: int, total_nodes: int, rank: int, total_gpus: int) -> dict: + """ + Get the command arguments for a specific config and worker type. 
+ + Args: + config_flag: One of "h100_dynamo", "h100_sglang", "gb200_dynamo", "gb200_sglang" + worker_type: "prefill" or "decode" + + Returns: + Dictionary with 'script' and 'args' keys + """ + + if config_flag == "h100_dynamo": + return { + "script": "python3 components/decode_worker.py", + "args": [ + "--model-path /model/ ", + "--served-model-name deepseek-ai/DeepSeek-R1 ", + "--skip-tokenizer-init ", + "--disaggregation-mode decode ", + "--disaggregation-transfer-backend nixl ", + "--disaggregation-bootstrap-port 30001 ", + f"--dist-init-addr {host_ip}:{port} ", + f"--nnodes {total_nodes} ", + f"--node-rank {rank} ", + f"--tp-size {total_gpus} ", + f"--dp-size {total_gpus} ", + "--enable-dp-attention ", + "--decode-log-interval 1 " + "--enable-deepep-moe " + "--page-size 1 " + "--trust-remote-code " + "--moe-dense-tp-size 1 " + "--enable-dp-lm-head " + "--disable-radix-cache " + "--watchdog-timeout 1000000 " + "--enable-two-batch-overlap " + "--deepep-mode low_latency " + "--mem-fraction-static 0.835 " + "--ep-num-redundant-experts 32 " + "--cuda-graph-bs 256 " + ] + } + elif config_flag == "h100_sglang": + return { + "script": "python3 -m sglang.launch_server", + "args": [ + "--model-path /model/ ", + "--disaggregation-transfer-backend nixl ", + "--disaggregation-mode decode ", + f"--dist-init-addr {host_ip}:{port} ", + f"--nnodes {total_nodes} ", + f"--node-rank {rank} ", + f"--tp-size {total_gpus} ", + f"--dp-size {total_gpus} ", + "--enable-dp-attention ", + "--decode-log-interval 1 ", + "--enable-deepep-moe ", + "--page-size 1 ", + "--host 0.0.0.0 ", + "--trust-remote-code ", + "--moe-dense-tp-size 1 ", + "--enable-dp-lm-head ", + "--disable-radix-cache ", + "--watchdog-timeout 1000000 ", + "--enable-two-batch-overlap ", + "--deepep-mode low_latency ", + "--mem-fraction-static 0.835 ", + "--max-running-requests 18432 ", + "--context-length 4500 ", + "--ep-num-redundant-experts 32 ", + "--cuda-graph-bs 256 " + ] + } + elif config_flag == "gb200_sglang": + 
return { + "script": "python3 -m sglang.launch_server", + "args": [ + "--model-path /model/ ", + "--trust-remote-code ", + "--disaggregation-transfer-backend nixl ", + "--disaggregation-mode decode ", + f"--dist-init-addr {host_ip}:{port} ", + f"--nnodes {total_nodes} ", + f"--node-rank {rank} ", + f"--tp-size {total_gpus} ", + f"--dp-size {total_gpus} ", + "--enable-dp-attention ", + "--host 0.0.0.0 ", + "--decode-log-interval 1 ", + "--max-running-requests 36864 ", + "--context-length 2176 ", + "--disable-radix-cache ", + "--enable-deepep-moe ", + "--deepep-mode low_latency ", + "--moe-dense-tp-size 1 ", + "--enable-dp-lm-head ", + "--cuda-graph-bs 768 ", + "--disable-shared-experts-fusion ", + "--ep-num-redundant-experts 32 ", + "--ep-dispatch-algorithm static ", + "--eplb-algorithm deepseek ", + "--attention-backend cutlass_mla ", + "--watchdog-timeout 1000000 ", + "--chunked-prefill-size 36864 ", + "--mem-fraction-static 0.82 " + ] + } + \ No newline at end of file diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index adda2b6407..9546e8c3b8 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -20,8 +20,8 @@ The script will: - Setup the environment -- Update the YAML config file -- Start Dynamo graphs.disagg service +- Generate the python3 command to run the prefill or decode worker +- Start dynamo (or sglang) - Monitor the GPU utilization """ @@ -177,6 +177,11 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac default=None, help="File to log GPU utilization (default: None)", ) + parser.add_argument( + "--use-sglang-commands", + action="store_true", + help="Helper to spin up SGLang servers instead of dynamo. 
This is helpful for benchmarking SGLang as well", + ) return parser.parse_args(args) @@ -194,43 +199,50 @@ def _validate_args(args: argparse.Namespace) -> None: def setup_prefill_node( - rank: int, prefill_host_ip: str, total_nodes: int, total_gpus: int + rank: int, prefill_host_ip: str, total_nodes: int, total_gpus: int, use_sglang_commands: bool ) -> int: """ Setup the prefill node. """ - if rank == 0: - logging.info(f"Setting up host prefill node: {rank}") - logging.info(f"Starting nats server on node {rank} with IP {prefill_host_ip}") - - nats_process = run_command("nats-server -js", background=True) - if not nats_process: - raise RuntimeError("Failed to start nats-server") - - etcd_cmd = ( - f"etcd --listen-client-urls {ETCD_LISTEN_ADDR}:{ETCD_CLIENT_PORT} " - f"--advertise-client-urls {ETCD_LISTEN_ADDR}:{ETCD_CLIENT_PORT} " - f"--listen-peer-urls {ETCD_LISTEN_ADDR}:{ETCD_PEER_PORT} " - f"--initial-cluster default=http://{prefill_host_ip}:{ETCD_PEER_PORT}" - ) - - etcd_process = run_command(etcd_cmd, background=True) - if not etcd_process: - raise RuntimeError("Failed to start etcd") - - ingress_process = run_command("dynamo run in=http out=dyn", background=True) - if not ingress_process: - raise RuntimeError("Failed to start ingress") + if not use_sglang_commands: + python_cmd = "python3 components/worker.py " + if rank == 0: + logging.info(f"Setting up host prefill node: {rank}") + logging.info(f"Starting nats server on node {rank} with IP {prefill_host_ip}") + + nats_process = run_command("nats-server -js", background=True) + if not nats_process: + raise RuntimeError("Failed to start nats-server") + + etcd_cmd = ( + f"etcd --listen-client-urls {ETCD_LISTEN_ADDR}:{ETCD_CLIENT_PORT} " + f"--advertise-client-urls {ETCD_LISTEN_ADDR}:{ETCD_CLIENT_PORT} " + f"--listen-peer-urls {ETCD_LISTEN_ADDR}:{ETCD_PEER_PORT} " + f"--initial-cluster default=http://{prefill_host_ip}:{ETCD_PEER_PORT}" + ) + + etcd_process = run_command(etcd_cmd, background=True) + if not 
etcd_process: + raise RuntimeError("Failed to start etcd") + + ingress_process = run_command("dynamo run in=http out=dyn", background=True) + if not ingress_process: + raise RuntimeError("Failed to start ingress") + + else: + logging.info(f"Setting up child prefill node: {rank}") + if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): + raise RuntimeError("Failed to connect to etcd") else: - logging.info(f"Setting up child prefill node: {rank}") - if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): - raise RuntimeError("Failed to connect to etcd") + python_cmd = "python3 -m sglang.launch_server " + logging.info("Using SGLang servers. No need to setup etcd or nats") # NOTE: This implements the example in examples/sglang/dsr1-wideep.md # For other examples, the command might have to be modified. - dynamo_cmd = ( - f"python3 components/worker.py " + # Because we use the sgl arg parser, we can use the same flags for both dynamo and sglang + cmd_to_run = ( + f"{python_cmd} " "--model-path /model/ " "--served-model-name deepseek-ai/DeepSeek-R1 " "--skip-tokenizer-init " @@ -259,7 +271,7 @@ def setup_prefill_node( "--ep-dispatch-algorithm dynamic " "--eplb-algorithm deepseek " ) - return run_command(dynamo_cmd) + return run_command(cmd_to_run) def setup_decode_node( @@ -268,17 +280,31 @@ def setup_decode_node( prefill_host_ip: str, total_nodes: int, total_gpus: int, + use_sglang_commands: bool ) -> int: """ Setup the decode node. 
""" logging.info(f"Setting up child decode node: {rank}") - if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): - raise RuntimeError("Failed to connect to etcd") + if use_sglang_commands: + python_cmd = "python3 -m sglang.launch_server " + sgl_mini_lb_cmd = ( + "python3 -m sglang.srt.disaggregation.launch_lb " + f"--prefill http://{prefill_host_ip}:30000 " + f"--decode http://{decode_host_ip}:30000 " + "--host 0.0.0.0 " + "--port 8000 " + "--timeout 3600" + ) + run_command(sgl_mini_lb_cmd, background=True) + else: + python_cmd = "python3 components/decode_worker.py " + if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): + raise RuntimeError("Failed to connect to etcd") - dynamo_cmd = ( - "python3 components/decode_worker.py " + cmd_to_run = ( + f"{python_cmd} " "--model-path /model/ " "--served-model-name deepseek-ai/DeepSeek-R1 " "--skip-tokenizer-init " @@ -306,7 +332,7 @@ def setup_decode_node( "--cuda-graph-bs 256 " ) - return run_command(dynamo_cmd) + return run_command(cmd_to_run) def setup_env(prefill_host_ip: str): @@ -333,6 +359,7 @@ def main(input_args: list[str] | None = None): logging.info(f"Prefill host IP: {args.prefill_host_ip}") logging.info(f"Decode host IP: {args.decode_host_ip}") logging.info(f"Rank: {args.rank}") + logging.info(f"Use SGLang commands: {args.use_sglang_commands}") setup_env(args.prefill_host_ip) if args.worker_type == "prefill": @@ -341,6 +368,7 @@ def main(input_args: list[str] | None = None): args.prefill_host_ip, args.total_nodes, args.total_nodes * args.gpus_per_node, + args.use_sglang_commands ) else: setup_decode_node( @@ -349,6 +377,7 @@ def main(input_args: list[str] | None = None): args.prefill_host_ip, args.total_nodes, args.total_nodes * args.gpus_per_node, + args.use_sglang_commands ) logging.info(f"{args.worker_type.capitalize()} node setup complete") From be7e2b626842bccef4670e6749589d3f60e5f335 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 22:40:13 +0000 
Subject: [PATCH 02/65] deleted in here --- examples/sglang/slurm_jobs/scripts/gen_cmd.py | 250 ------------------ 1 file changed, 250 deletions(-) delete mode 100644 examples/sglang/slurm_jobs/scripts/gen_cmd.py diff --git a/examples/sglang/slurm_jobs/scripts/gen_cmd.py b/examples/sglang/slurm_jobs/scripts/gen_cmd.py deleted file mode 100644 index c7096bd3a2..0000000000 --- a/examples/sglang/slurm_jobs/scripts/gen_cmd.py +++ /dev/null @@ -1,250 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Use this script to generate dynamo/sglang flags for h100 or gb200 disagg -""" - -def get_prefill_command_args(config_flag: str, host_ip: str, port: int, total_nodes: int, rank: int, total_gpus: int) -> dict: - """ - Get the command arguments for a specific config and worker type. 
- - Args: - config_flag: One of "h100_dynamo", "h100_sglang", "gb200_dynamo", "gb200_sglang" - worker_type: "prefill" or "decode" - - Returns: - Dictionary with 'script' and 'args' keys - """ - - # TODO: validate if config_flag is valid and exists in support matrix - - if config_flag == "h100_dynamo": - return { - "script": "python3 components/worker.py", - "args": [ - "--model-path /model/ " - "--served-model-name deepseek-ai/DeepSeek-R1 " - "--skip-tokenizer-init " - "--disaggregation-mode prefill " - "--disaggregation-transfer-backend nixl " - "--disaggregation-bootstrap-port 30001 " - f"--dist-init-addr {host_ip}:{port} " - f"--nnodes {total_nodes} " - f"--node-rank {rank} " - f"--tp-size {total_gpus} " - f"--dp-size {total_gpus} " - "--enable-dp-attention " - "--decode-log-interval 1 " - "--enable-deepep-moe " - "--page-size 1 " - "--trust-remote-code " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--disable-radix-cache " - "--watchdog-timeout 1000000 " - "--enable-two-batch-overlap " - "--deepep-mode normal " - "--mem-fraction-static 0.85 " - "--deepep-config /configs/deepep.json " - "--ep-num-redundant-experts 32 " - "--ep-dispatch-algorithm dynamic " - "--eplb-algorithm deepseek " - ] - } - elif config_flag == "h100_sglang": - return { - "script": "python3 -m sglang.launch_server", - "args": [ - "--model-path /model/ ", - "--served-model-name deepseek-ai/DeepSeek-R1 ", - "--disaggregation-transfer-backend nixl ", - "--disaggregation-mode prefill ", - f"--dist-init-addr {host_ip}:{port} ", - f"--nnodes {total_nodes} ", - f"--node-rank {rank} ", - f"--tp-size {total_gpus} ", - f"--dp-size {total_gpus} ", - "--enable-dp-attention ", - "--decode-log-interval 1 ", - "--enable-deepep-moe ", - "--page-size 1 ", - "--host 0.0.0.0 ", - "--trust-remote-code ", - "--moe-dense-tp-size 1 ", - "--enable-dp-lm-head ", - "--disable-radix-cache ", - "--watchdog-timeout 1000000 ", - "--enable-two-batch-overlap ", - "--deepep-mode normal ", - "--mem-fraction-static 
0.85 ", - "--chunked-prefill-size 524288 ", - "--max-running-requests 8192 ", - "--max-total-tokens 131072 ", - "--context-length 8192 ", - "--init-expert-location /configs/prefill_in4096.json ", - "--ep-num-redundant-experts 32 ", - "--ep-dispatch-algorithm dynamic ", - "--eplb-algorithm deepseek ", - "--deepep-config /configs/deepep.json " - ] - } - elif config_flag == "gb200_sglang": - return { - "script": "python3 -m sglang.launch_server", - "args": [ - "--served-model-name deepseek-ai/DeepSeek-R1 ", - "--model-path /model/ ", - "--trust-remote-code ", - "--disaggregation-mode prefill ", - f"--dist-init-addr {host_ip}:{port} ", - f"--nnodes {total_nodes} ", - f"--node-rank {rank} ", - f"--tp-size {total_gpus} ", - f"--dp-size {total_gpus} ", - "--enable-dp-attention ", - "--host 0.0.0.0 ", - "--decode-log-interval 1 ", - "--max-running-requests 6144 ", - "--context-length 2176 ", - "--disable-radix-cache ", - "--enable-deepep-moe ", - "--deepep-mode low_latency ", - "--moe-dense-tp-size 1 ", - "--enable-dp-lm-head ", - "--disable-shared-experts-fusion ", - "--ep-num-redundant-experts 32 ", - "--ep-dispatch-algorithm static ", - "--eplb-algorithm deepseek ", - "--attention-backend cutlass_mla ", - "--watchdog-timeout 1000000 ", - "--disable-cuda-graph ", - "--chunked-prefill-size 16384 ", - "--max-total-tokens 32768 ", - "--mem-fraction-static 0.9 " - ] - } - else: - raise ValueError(f"Invalid config flag: {config_flag}") - -def get_decode_command_args(config_flag: str, host_ip: str, port: int, total_nodes: int, rank: int, total_gpus: int) -> dict: - """ - Get the command arguments for a specific config and worker type. 
- - Args: - config_flag: One of "h100_dynamo", "h100_sglang", "gb200_dynamo", "gb200_sglang" - worker_type: "prefill" or "decode" - - Returns: - Dictionary with 'script' and 'args' keys - """ - - if config_flag == "h100_dynamo": - return { - "script": "python3 components/decode_worker.py", - "args": [ - "--model-path /model/ ", - "--served-model-name deepseek-ai/DeepSeek-R1 ", - "--skip-tokenizer-init ", - "--disaggregation-mode decode ", - "--disaggregation-transfer-backend nixl ", - "--disaggregation-bootstrap-port 30001 ", - f"--dist-init-addr {host_ip}:{port} ", - f"--nnodes {total_nodes} ", - f"--node-rank {rank} ", - f"--tp-size {total_gpus} ", - f"--dp-size {total_gpus} ", - "--enable-dp-attention ", - "--decode-log-interval 1 " - "--enable-deepep-moe " - "--page-size 1 " - "--trust-remote-code " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--disable-radix-cache " - "--watchdog-timeout 1000000 " - "--enable-two-batch-overlap " - "--deepep-mode low_latency " - "--mem-fraction-static 0.835 " - "--ep-num-redundant-experts 32 " - "--cuda-graph-bs 256 " - ] - } - elif config_flag == "h100_sglang": - return { - "script": "python3 -m sglang.launch_server", - "args": [ - "--model-path /model/ ", - "--disaggregation-transfer-backend nixl ", - "--disaggregation-mode decode ", - f"--dist-init-addr {host_ip}:{port} ", - f"--nnodes {total_nodes} ", - f"--node-rank {rank} ", - f"--tp-size {total_gpus} ", - f"--dp-size {total_gpus} ", - "--enable-dp-attention ", - "--decode-log-interval 1 ", - "--enable-deepep-moe ", - "--page-size 1 ", - "--host 0.0.0.0 ", - "--trust-remote-code ", - "--moe-dense-tp-size 1 ", - "--enable-dp-lm-head ", - "--disable-radix-cache ", - "--watchdog-timeout 1000000 ", - "--enable-two-batch-overlap ", - "--deepep-mode low_latency ", - "--mem-fraction-static 0.835 ", - "--max-running-requests 18432 ", - "--context-length 4500 ", - "--ep-num-redundant-experts 32 ", - "--cuda-graph-bs 256 " - ] - } - elif config_flag == "gb200_sglang": - 
return { - "script": "python3 -m sglang.launch_server", - "args": [ - "--model-path /model/ ", - "--trust-remote-code ", - "--disaggregation-transfer-backend nixl ", - "--disaggregation-mode decode ", - f"--dist-init-addr {host_ip}:{port} ", - f"--nnodes {total_nodes} ", - f"--node-rank {rank} ", - f"--tp-size {total_gpus} ", - f"--dp-size {total_gpus} ", - "--enable-dp-attention ", - "--host 0.0.0.0 ", - "--decode-log-interval 1 ", - "--max-running-requests 36864 ", - "--context-length 2176 ", - "--disable-radix-cache ", - "--enable-deepep-moe ", - "--deepep-mode low_latency ", - "--moe-dense-tp-size 1 ", - "--enable-dp-lm-head ", - "--cuda-graph-bs 768 ", - "--disable-shared-experts-fusion ", - "--ep-num-redundant-experts 32 ", - "--ep-dispatch-algorithm static ", - "--eplb-algorithm deepseek ", - "--attention-backend cutlass_mla ", - "--watchdog-timeout 1000000 ", - "--chunked-prefill-size 36864 ", - "--mem-fraction-static 0.82 " - ] - } - \ No newline at end of file From e67abd21c6606101cb4917d520e698086330897e Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 23:06:27 +0000 Subject: [PATCH 03/65] add --- examples/sglang/slurm_jobs/scripts/gen_cmd.py | 231 ++++++++++++++++++ .../sglang/slurm_jobs/scripts/worker_setup.py | 99 ++------ 2 files changed, 251 insertions(+), 79 deletions(-) create mode 100644 examples/sglang/slurm_jobs/scripts/gen_cmd.py diff --git a/examples/sglang/slurm_jobs/scripts/gen_cmd.py b/examples/sglang/slurm_jobs/scripts/gen_cmd.py new file mode 100644 index 0000000000..b2e61960da --- /dev/null +++ b/examples/sglang/slurm_jobs/scripts/gen_cmd.py @@ -0,0 +1,231 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Use this script to generate dynamo/sglang flags for h100 or gb200 disagg +""" + +def get_prefill_command_args(gpu_type: str, use_sglang_commands: bool, host_ip: str, port: int, total_nodes: int, rank: int, total_gpus: int) -> dict: + if gpu_type == "h100" and not use_sglang_commands: + cmd = ( + f"python3 components/worker.py " + "--model-path /model/ " + "--served-model-name deepseek-ai/DeepSeek-R1 " + "--skip-tokenizer-init " + "--disaggregation-mode prefill " + "--disaggregation-transfer-backend nixl " + "--disaggregation-bootstrap-port 30001 " + f"--dist-init-addr {host_ip}:{port} " + f"--nnodes {total_nodes} " + f"--node-rank {rank} " + f"--tp-size {total_gpus} " + f"--dp-size {total_gpus} " + "--enable-dp-attention " + "--decode-log-interval 1 " + "--enable-deepep-moe " + "--page-size 1 " + "--trust-remote-code " + "--moe-dense-tp-size 1 " + "--enable-dp-lm-head " + "--disable-radix-cache " + "--watchdog-timeout 1000000 " + "--enable-two-batch-overlap " + "--deepep-mode normal " + "--mem-fraction-static 0.85 " + "--deepep-config /configs/deepep.json " + "--ep-num-redundant-experts 32 " + "--ep-dispatch-algorithm dynamic " + "--eplb-algorithm deepseek " + ) + elif gpu_type == "h100" and use_sglang_commands: + cmd = ( + f"python3 -m sglang.launch_server " + "--model-path /model/ " + "--served-model-name deepseek-ai/DeepSeek-R1 " + "--disaggregation-transfer-backend nixl " + "--disaggregation-mode prefill " + f"--dist-init-addr {host_ip}:{port} " + f"--nnodes {total_nodes} " + f"--node-rank {rank} " + f"--tp-size {total_gpus} " + f"--dp-size 
{total_gpus} " + "--enable-dp-attention " + "--decode-log-interval 1 " + "--enable-deepep-moe " + "--page-size 1 " + "--host 0.0.0.0 " + "--trust-remote-code " + "--moe-dense-tp-size 1 " + "--enable-dp-lm-head " + "--disable-radix-cache " + "--watchdog-timeout 1000000 " + "--enable-two-batch-overlap " + "--deepep-mode normal " + "--mem-fraction-static 0.85 " + "--chunked-prefill-size 524288 " + "--max-running-requests 8192 " + "--max-total-tokens 131072 " + "--context-length 8192 " + "--init-expert-location /configs/prefill_in4096.json " + "--ep-num-redundant-experts 32 " + "--ep-dispatch-algorithm dynamic " + "--eplb-algorithm deepseek " + "--deepep-config /configs/deepep.json " + + ) + elif gpu_type == "gb200" and use_sglang_commands: + cmd = ( + f"python3 -m sglang.launch_server " + "--served-model-name deepseek-ai/DeepSeek-R1 " + "--model-path /model/ " + "--trust-remote-code " + "--disaggregation-mode prefill " + f"--dist-init-addr {host_ip}:{port} " + f"--nnodes {total_nodes} " + f"--node-rank {rank} " + f"--tp-size {total_gpus} " + f"--dp-size {total_gpus} " + "--enable-dp-attention " + "--host 0.0.0.0 " + "--decode-log-interval 1 " + "--max-running-requests 6144 " + "--context-length 2176 " + "--disable-radix-cache " + "--enable-deepep-moe " + "--deepep-mode low_latency " + "--moe-dense-tp-size 1 " + "--enable-dp-lm-head " + "--disable-shared-experts-fusion " + "--ep-num-redundant-experts 32 " + "--ep-dispatch-algorithm static " + "--eplb-algorithm deepseek " + "--attention-backend cutlass_mla " + "--watchdog-timeout 1000000 " + "--disable-cuda-graph " + "--chunked-prefill-size 16384 " + "--max-total-tokens 32768 " + "--mem-fraction-static 0.9 " + ) + else: + raise ValueError(f"Unsupported: {gpu_type} and use_sglang_commands={use_sglang_commands}") + + return cmd + +def get_decode_command_args(gpu_type: str, use_sglang_commands: bool, host_ip: str, port: int, total_nodes: int, rank: int, total_gpus: int) -> dict: + if gpu_type == "h100" and not 
use_sglang_commands: + cmd = ( + f"python3 components/decode_worker.py " + "--model-path /model/ " + "--served-model-name deepseek-ai/DeepSeek-R1 " + "--skip-tokenizer-init " + "--disaggregation-mode decode " + "--disaggregation-transfer-backend nixl " + "--disaggregation-bootstrap-port 30001 " + f"--dist-init-addr {host_ip}:{port} " + f"--nnodes {total_nodes} " + f"--node-rank {rank} " + f"--tp-size {total_gpus} " + f"--dp-size {total_gpus} " + "--enable-dp-attention " + "--decode-log-interval 1 " + "--enable-deepep-moe " + "--page-size 1 " + "--trust-remote-code " + "--moe-dense-tp-size 1 " + "--enable-dp-lm-head " + "--disable-radix-cache " + "--watchdog-timeout 1000000 " + "--enable-two-batch-overlap " + "--deepep-mode low_latency " + "--mem-fraction-static 0.835 " + "--ep-num-redundant-experts 32 " + "--cuda-graph-bs 256 " + ) + elif gpu_type == "h100" and use_sglang_commands: + cmd = ( + f"python3 -m sglang.launch_server " + "--model-path /model/ " + "--disaggregation-transfer-backend nixl " + "--disaggregation-mode decode " + f"--dist-init-addr {host_ip}:{port} " + f"--nnodes {total_nodes} " + f"--node-rank {rank} " + f"--tp-size {total_gpus} " + f"--dp-size {total_gpus} " + "--enable-dp-attention " + "--decode-log-interval 1 " + "--enable-deepep-moe " + "--page-size 1 " + "--host 0.0.0.0 " + "--trust-remote-code " + "--moe-dense-tp-size 1 " + "--enable-dp-lm-head " + "--disable-radix-cache " + "--watchdog-timeout 1000000 " + "--enable-two-batch-overlap " + "--deepep-mode low_latency " + "--mem-fraction-static 0.835 " + "--max-running-requests 18432 " + "--context-length 4500 " + "--ep-num-redundant-experts 32 " + "--cuda-graph-bs 256 " + ) + elif gpu_type == "gb200" and use_sglang_commands: + cmd = ( + f"python3 -m sglang.launch_server " + "--model-path /model/ " + "--trust-remote-code " + "--disaggregation-transfer-backend nixl " + "--disaggregation-mode decode " + f"--dist-init-addr {host_ip}:{port} " + f"--nnodes {total_nodes} " + f"--node-rank {rank} " 
+ f"--tp-size {total_gpus} " + f"--dp-size {total_gpus} " + "--enable-dp-attention " + "--host 0.0.0.0 " + "--decode-log-interval 1 " + "--max-running-requests 36864 " + "--context-length 2176 " + "--disable-radix-cache " + "--enable-deepep-moe " + "--deepep-mode low_latency " + "--moe-dense-tp-size 1 " + "--enable-dp-lm-head " + "--cuda-graph-bs 768 " + "--disable-shared-experts-fusion " + "--ep-num-redundant-experts 32 " + "--ep-dispatch-algorithm static " + "--eplb-algorithm deepseek " + "--attention-backend cutlass_mla " + "--watchdog-timeout 1000000 " + "--chunked-prefill-size 36864 " + "--mem-fraction-static 0.82 " + ) + else: + raise ValueError(f"Unsupported: {gpu_type} and use_sglang_commands={use_sglang_commands}") + + return cmd + +def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> dict: + cmd = ( + f"python3 -m sglang.srt.disaggregation.launch_lb " + f"--prefill http://{prefill_host_ip}:30000 " + f"--decode http://{decode_host_ip}:30000 " + "--host 0.0.0.0 " + "--port 8000 " + "--timeout 3600" + ) + return cmd \ No newline at end of file diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index 9546e8c3b8..90911127e8 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -33,6 +33,7 @@ import time from pathlib import Path +from .gen_cmd import get_prefill_command_args, get_decode_command_args, get_sglang_mini_lb_command_args import requests # Network configurations @@ -180,8 +181,16 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac parser.add_argument( "--use-sglang-commands", action="store_true", + default=False, help="Helper to spin up SGLang servers instead of dynamo. 
This is helpful for benchmarking SGLang as well", ) + parser.add_argument( + "--gpu_type", + type=str, + choices=["h100", "gb200"], + default="h100", + help="Type of GPU to use", + ) return parser.parse_args(args) @@ -199,13 +208,12 @@ def _validate_args(args: argparse.Namespace) -> None: def setup_prefill_node( - rank: int, prefill_host_ip: str, total_nodes: int, total_gpus: int, use_sglang_commands: bool + rank: int, prefill_host_ip: str, total_nodes: int, total_gpus: int, use_sglang_commands: bool, gpu_type: str ) -> int: """ Setup the prefill node. """ if not use_sglang_commands: - python_cmd = "python3 components/worker.py " if rank == 0: logging.info(f"Setting up host prefill node: {rank}") logging.info(f"Starting nats server on node {rank} with IP {prefill_host_ip}") @@ -233,44 +241,11 @@ def setup_prefill_node( logging.info(f"Setting up child prefill node: {rank}") if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): raise RuntimeError("Failed to connect to etcd") - else: - python_cmd = "python3 -m sglang.launch_server " logging.info("Using SGLang servers. No need to setup etcd or nats") - # NOTE: This implements the example in examples/sglang/dsr1-wideep.md - # For other examples, the command might have to be modified. 
- # Because we use the sgl arg parser, we can use the same flags for both dynamo and sglang - cmd_to_run = ( - f"{python_cmd} " - "--model-path /model/ " - "--served-model-name deepseek-ai/DeepSeek-R1 " - "--skip-tokenizer-init " - "--disaggregation-mode prefill " - "--disaggregation-transfer-backend nixl " - "--disaggregation-bootstrap-port 30001 " - f"--dist-init-addr {prefill_host_ip}:{DIST_INIT_PORT} " - f"--nnodes {total_nodes} " - f"--node-rank {rank} " - f"--tp-size {total_gpus} " - f"--dp-size {total_gpus} " - "--enable-dp-attention " - "--decode-log-interval 1 " - "--enable-deepep-moe " - "--page-size 1 " - "--trust-remote-code " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--disable-radix-cache " - "--watchdog-timeout 1000000 " - "--enable-two-batch-overlap " - "--deepep-mode normal " - "--mem-fraction-static 0.85 " - "--deepep-config /configs/deepep.json " - "--ep-num-redundant-experts 32 " - "--ep-dispatch-algorithm dynamic " - "--eplb-algorithm deepseek " - ) + # NOTE: Default command for h100 and dynamo implements the example in examples/sglang/dsr1-wideep.md + cmd_to_run = get_prefill_command_args(gpu_type, use_sglang_commands, prefill_host_ip, total_nodes, rank, total_gpus) return run_command(cmd_to_run) @@ -280,7 +255,8 @@ def setup_decode_node( prefill_host_ip: str, total_nodes: int, total_gpus: int, - use_sglang_commands: bool + use_sglang_commands: bool, + gpu_type: str ) -> int: """ Setup the decode node. 
@@ -288,50 +264,13 @@ def setup_decode_node( logging.info(f"Setting up child decode node: {rank}") if use_sglang_commands: - python_cmd = "python3 -m sglang.launch_server " - sgl_mini_lb_cmd = ( - "python3 -m sglang.srt.disaggregation.launch_lb " - f"--prefill http://{prefill_host_ip}:30000 " - f"--decode http://{decode_host_ip}:30000 " - "--host 0.0.0.0 " - "--port 8000 " - "--timeout 3600" - ) + sgl_mini_lb_cmd = get_sglang_mini_lb_command_args(prefill_host_ip, decode_host_ip) run_command(sgl_mini_lb_cmd, background=True) else: - python_cmd = "python3 components/decode_worker.py " if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): raise RuntimeError("Failed to connect to etcd") - cmd_to_run = ( - f"{python_cmd} " - "--model-path /model/ " - "--served-model-name deepseek-ai/DeepSeek-R1 " - "--skip-tokenizer-init " - "--disaggregation-mode decode " - "--disaggregation-transfer-backend nixl " - "--disaggregation-bootstrap-port 30001 " - f"--dist-init-addr {decode_host_ip}:{DIST_INIT_PORT} " - f"--nnodes {total_nodes} " - f"--node-rank {rank} " - f"--tp-size {total_gpus} " - f"--dp-size {total_gpus} " - "--enable-dp-attention " - "--decode-log-interval 1 " - "--enable-deepep-moe " - "--page-size 1 " - "--trust-remote-code " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--disable-radix-cache " - "--watchdog-timeout 1000000 " - "--enable-two-batch-overlap " - "--deepep-mode low_latency " - "--mem-fraction-static 0.835 " - "--ep-num-redundant-experts 32 " - "--cuda-graph-bs 256 " - ) - + cmd_to_run = get_decode_command_args(gpu_type, use_sglang_commands, decode_host_ip, total_nodes, rank, total_gpus) return run_command(cmd_to_run) @@ -368,7 +307,8 @@ def main(input_args: list[str] | None = None): args.prefill_host_ip, args.total_nodes, args.total_nodes * args.gpus_per_node, - args.use_sglang_commands + args.use_sglang_commands, + args.gpu_type ) else: setup_decode_node( @@ -377,7 +317,8 @@ def main(input_args: list[str] | None = None): 
args.prefill_host_ip, args.total_nodes, args.total_nodes * args.gpus_per_node, - args.use_sglang_commands + args.use_sglang_commands, + args.gpu_type ) logging.info(f"{args.worker_type.capitalize()} node setup complete") From 78047a91ab2771c117169a0b77b074077419651b Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 23:29:48 +0000 Subject: [PATCH 04/65] shell script --- examples/sglang/slurm_jobs/scripts/gb200.sh | 142 +++++++++++ examples/sglang/slurm_jobs/scripts/gen_cmd.py | 231 ------------------ examples/sglang/slurm_jobs/scripts/h100.sh | 187 ++++++++++++++ .../sglang/slurm_jobs/scripts/worker_setup.py | 48 +++- 4 files changed, 372 insertions(+), 236 deletions(-) create mode 100644 examples/sglang/slurm_jobs/scripts/gb200.sh delete mode 100644 examples/sglang/slurm_jobs/scripts/gen_cmd.py create mode 100644 examples/sglang/slurm_jobs/scripts/h100.sh diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh new file mode 100644 index 0000000000..698c1df0ae --- /dev/null +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -0,0 +1,142 @@ +#!/bin/bash + +# Function to print usage +print_usage() { + echo "Usage: $0 " + echo " mode: prefill or decode" + echo " cmd: dynamo or sglang" + echo "" + echo "Examples:" + echo " $0 prefill dynamo" + echo " $0 decode sglang" + exit 1 +} + +# Check if correct number of arguments provided +if [ $# -ne 2 ]; then + echo "Error: Expected 2 arguments, got $#" + print_usage +fi + +# Parse arguments +mode=$1 +cmd=$2 + +# Validate mode argument +if [ "$mode" != "prefill" ] && [ "$mode" != "decode" ]; then + echo "Error: mode must be 'prefill' or 'decode', got '$mode'" + print_usage +fi + +# Validate cmd argument +if [ "$cmd" != "dynamo" ] && [ "$cmd" != "sglang" ]; then + echo "Error: cmd must be 'dynamo' or 'sglang', got '$cmd'" + print_usage +fi + +echo "Mode: $mode" +echo "Command: $cmd" + + +# Check if required environment variables are set +if [ -z "$HOST_IP" ]; then 
+ echo "Error: HOST_IP environment variable is not set" + exit 1 +fi + +if [ -z "$PORT" ]; then + echo "Error: PORT environment variable is not set" + exit 1 +fi + +if [ -z "$TOTAL_GPUS" ]; then + echo "Error: TOTAL_GPUS environment variable is not set" + exit 1 +fi + +if [ -z "$RANK" ]; then + echo "Error: RANK environment variable is not set" + exit 1 +fi + +if [ -z "$TOTAL_NODES" ]; then + echo "Error: TOTAL_NODES environment variable is not set" + exit 1 +fi + + +# Construct command based on mode and cmd +if [ "$mode" = "prefill" ]; then + if [ "$cmd" = "dynamo" ]; then + echo "Error: dynamo command not implemented for GB200" + exit 1 + elif [ "$cmd" = "sglang" ]; then + # GB200 sglang prefill command + python3 -m sglang.launch_server \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --model-path /model/ \ + --trust-remote-code \ + --disaggregation-mode prefill \ + --dist-init-addr "$HOST_IP:$PORT" \ + --nnodes "$TOTAL_NODES" \ + --node-rank "$RANK" \ + --tp-size "$TOTAL_GPUS" \ + --dp-size "$TOTAL_GPUS" \ + --enable-dp-attention \ + --host 0.0.0.0 \ + --decode-log-interval 1 \ + --max-running-requests 6144 \ + --context-length 2176 \ + --disable-radix-cache \ + --enable-deepep-moe \ + --deepep-mode low_latency \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-shared-experts-fusion \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm static \ + --eplb-algorithm deepseek \ + --attention-backend cutlass_mla \ + --watchdog-timeout 1000000 \ + --disable-cuda-graph \ + --chunked-prefill-size 16384 \ + --max-total-tokens 32768 \ + --mem-fraction-static 0.9 + fi +elif [ "$mode" = "decode" ]; then + if [ "$cmd" = "dynamo" ]; then + echo "Error: dynamo command not implemented for GB200" + exit 1 + elif [ "$cmd" = "sglang" ]; then + # GB200 sglang decode command + python3 -m sglang.launch_server \ + --model-path /model/ \ + --trust-remote-code \ + --disaggregation-transfer-backend nixl \ + --disaggregation-mode decode \ + --dist-init-addr 
"$HOST_IP:$PORT" \ + --nnodes "$TOTAL_NODES" \ + --node-rank "$RANK" \ + --tp-size "$TOTAL_GPUS" \ + --dp-size "$TOTAL_GPUS" \ + --enable-dp-attention \ + --host 0.0.0.0 \ + --decode-log-interval 1 \ + --max-running-requests 36864 \ + --context-length 2176 \ + --disable-radix-cache \ + --enable-deepep-moe \ + --deepep-mode low_latency \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --cuda-graph-bs 768 \ + --disable-shared-experts-fusion \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm static \ + --eplb-algorithm deepseek \ + --attention-backend cutlass_mla \ + --watchdog-timeout 1000000 \ + --chunked-prefill-size 36864 \ + --mem-fraction-static 0.82 + fi +fi diff --git a/examples/sglang/slurm_jobs/scripts/gen_cmd.py b/examples/sglang/slurm_jobs/scripts/gen_cmd.py deleted file mode 100644 index b2e61960da..0000000000 --- a/examples/sglang/slurm_jobs/scripts/gen_cmd.py +++ /dev/null @@ -1,231 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -Use this script to generate dynamo/sglang flags for h100 or gb200 disagg -""" - -def get_prefill_command_args(gpu_type: str, use_sglang_commands: bool, host_ip: str, port: int, total_nodes: int, rank: int, total_gpus: int) -> dict: - if gpu_type == "h100" and not use_sglang_commands: - cmd = ( - f"python3 components/worker.py " - "--model-path /model/ " - "--served-model-name deepseek-ai/DeepSeek-R1 " - "--skip-tokenizer-init " - "--disaggregation-mode prefill " - "--disaggregation-transfer-backend nixl " - "--disaggregation-bootstrap-port 30001 " - f"--dist-init-addr {host_ip}:{port} " - f"--nnodes {total_nodes} " - f"--node-rank {rank} " - f"--tp-size {total_gpus} " - f"--dp-size {total_gpus} " - "--enable-dp-attention " - "--decode-log-interval 1 " - "--enable-deepep-moe " - "--page-size 1 " - "--trust-remote-code " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--disable-radix-cache " - "--watchdog-timeout 1000000 " - "--enable-two-batch-overlap " - "--deepep-mode normal " - "--mem-fraction-static 0.85 " - "--deepep-config /configs/deepep.json " - "--ep-num-redundant-experts 32 " - "--ep-dispatch-algorithm dynamic " - "--eplb-algorithm deepseek " - ) - elif gpu_type == "h100" and use_sglang_commands: - cmd = ( - f"python3 -m sglang.launch_server " - "--model-path /model/ " - "--served-model-name deepseek-ai/DeepSeek-R1 " - "--disaggregation-transfer-backend nixl " - "--disaggregation-mode prefill " - f"--dist-init-addr {host_ip}:{port} " - f"--nnodes {total_nodes} " - f"--node-rank {rank} " - f"--tp-size {total_gpus} " - f"--dp-size {total_gpus} " - "--enable-dp-attention " - "--decode-log-interval 1 " - "--enable-deepep-moe " - "--page-size 1 " - "--host 0.0.0.0 " - "--trust-remote-code " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--disable-radix-cache " - "--watchdog-timeout 1000000 " - "--enable-two-batch-overlap " - "--deepep-mode normal " - "--mem-fraction-static 0.85 " - "--chunked-prefill-size 524288 " - 
"--max-running-requests 8192 " - "--max-total-tokens 131072 " - "--context-length 8192 " - "--init-expert-location /configs/prefill_in4096.json " - "--ep-num-redundant-experts 32 " - "--ep-dispatch-algorithm dynamic " - "--eplb-algorithm deepseek " - "--deepep-config /configs/deepep.json " - - ) - elif gpu_type == "gb200" and use_sglang_commands: - cmd = ( - f"python3 -m sglang.launch_server " - "--served-model-name deepseek-ai/DeepSeek-R1 " - "--model-path /model/ " - "--trust-remote-code " - "--disaggregation-mode prefill " - f"--dist-init-addr {host_ip}:{port} " - f"--nnodes {total_nodes} " - f"--node-rank {rank} " - f"--tp-size {total_gpus} " - f"--dp-size {total_gpus} " - "--enable-dp-attention " - "--host 0.0.0.0 " - "--decode-log-interval 1 " - "--max-running-requests 6144 " - "--context-length 2176 " - "--disable-radix-cache " - "--enable-deepep-moe " - "--deepep-mode low_latency " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--disable-shared-experts-fusion " - "--ep-num-redundant-experts 32 " - "--ep-dispatch-algorithm static " - "--eplb-algorithm deepseek " - "--attention-backend cutlass_mla " - "--watchdog-timeout 1000000 " - "--disable-cuda-graph " - "--chunked-prefill-size 16384 " - "--max-total-tokens 32768 " - "--mem-fraction-static 0.9 " - ) - else: - raise ValueError(f"Unsupported: {gpu_type} and use_sglang_commands={use_sglang_commands}") - - return cmd - -def get_decode_command_args(gpu_type: str, use_sglang_commands: bool, host_ip: str, port: int, total_nodes: int, rank: int, total_gpus: int) -> dict: - if gpu_type == "h100" and not use_sglang_commands: - cmd = ( - f"python3 components/decode_worker.py " - "--model-path /model/ " - "--served-model-name deepseek-ai/DeepSeek-R1 " - "--skip-tokenizer-init " - "--disaggregation-mode decode " - "--disaggregation-transfer-backend nixl " - "--disaggregation-bootstrap-port 30001 " - f"--dist-init-addr {host_ip}:{port} " - f"--nnodes {total_nodes} " - f"--node-rank {rank} " - f"--tp-size 
{total_gpus} " - f"--dp-size {total_gpus} " - "--enable-dp-attention " - "--decode-log-interval 1 " - "--enable-deepep-moe " - "--page-size 1 " - "--trust-remote-code " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--disable-radix-cache " - "--watchdog-timeout 1000000 " - "--enable-two-batch-overlap " - "--deepep-mode low_latency " - "--mem-fraction-static 0.835 " - "--ep-num-redundant-experts 32 " - "--cuda-graph-bs 256 " - ) - elif gpu_type == "h100" and use_sglang_commands: - cmd = ( - f"python3 -m sglang.launch_server " - "--model-path /model/ " - "--disaggregation-transfer-backend nixl " - "--disaggregation-mode decode " - f"--dist-init-addr {host_ip}:{port} " - f"--nnodes {total_nodes} " - f"--node-rank {rank} " - f"--tp-size {total_gpus} " - f"--dp-size {total_gpus} " - "--enable-dp-attention " - "--decode-log-interval 1 " - "--enable-deepep-moe " - "--page-size 1 " - "--host 0.0.0.0 " - "--trust-remote-code " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--disable-radix-cache " - "--watchdog-timeout 1000000 " - "--enable-two-batch-overlap " - "--deepep-mode low_latency " - "--mem-fraction-static 0.835 " - "--max-running-requests 18432 " - "--context-length 4500 " - "--ep-num-redundant-experts 32 " - "--cuda-graph-bs 256 " - ) - elif gpu_type == "gb200" and use_sglang_commands: - cmd = ( - f"python3 -m sglang.launch_server " - "--model-path /model/ " - "--trust-remote-code " - "--disaggregation-transfer-backend nixl " - "--disaggregation-mode decode " - f"--dist-init-addr {host_ip}:{port} " - f"--nnodes {total_nodes} " - f"--node-rank {rank} " - f"--tp-size {total_gpus} " - f"--dp-size {total_gpus} " - "--enable-dp-attention " - "--host 0.0.0.0 " - "--decode-log-interval 1 " - "--max-running-requests 36864 " - "--context-length 2176 " - "--disable-radix-cache " - "--enable-deepep-moe " - "--deepep-mode low_latency " - "--moe-dense-tp-size 1 " - "--enable-dp-lm-head " - "--cuda-graph-bs 768 " - "--disable-shared-experts-fusion " - 
"--ep-num-redundant-experts 32 " - "--ep-dispatch-algorithm static " - "--eplb-algorithm deepseek " - "--attention-backend cutlass_mla " - "--watchdog-timeout 1000000 " - "--chunked-prefill-size 36864 " - "--mem-fraction-static 0.82 " - ) - else: - raise ValueError(f"Unsupported: {gpu_type} and use_sglang_commands={use_sglang_commands}") - - return cmd - -def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> dict: - cmd = ( - f"python3 -m sglang.srt.disaggregation.launch_lb " - f"--prefill http://{prefill_host_ip}:30000 " - f"--decode http://{decode_host_ip}:30000 " - "--host 0.0.0.0 " - "--port 8000 " - "--timeout 3600" - ) - return cmd \ No newline at end of file diff --git a/examples/sglang/slurm_jobs/scripts/h100.sh b/examples/sglang/slurm_jobs/scripts/h100.sh new file mode 100644 index 0000000000..a00b63f554 --- /dev/null +++ b/examples/sglang/slurm_jobs/scripts/h100.sh @@ -0,0 +1,187 @@ +#!/bin/bash + +# Function to print usage +print_usage() { + echo "Usage: $0 " + echo " mode: prefill or decode" + echo " cmd: dynamo or sglang" + echo "" + echo "Examples:" + echo " $0 prefill dynamo" + echo " $0 decode sglang" + exit 1 +} + +# Check if correct number of arguments provided +if [ $# -ne 2 ]; then + echo "Error: Expected 2 arguments, got $#" + print_usage +fi + +# Parse arguments +mode=$1 +cmd=$2 + +# Validate mode argument +if [ "$mode" != "prefill" ] && [ "$mode" != "decode" ]; then + echo "Error: mode must be 'prefill' or 'decode', got '$mode'" + print_usage +fi + +# Validate cmd argument +if [ "$cmd" != "dynamo" ] && [ "$cmd" != "sglang" ]; then + echo "Error: cmd must be 'dynamo' or 'sglang', got '$cmd'" + print_usage +fi + +echo "Mode: $mode" +echo "Command: $cmd" + + +# Check if required environment variables are set +if [ -z "$HOST_IP" ]; then + echo "Error: HOST_IP environment variable is not set" + exit 1 +fi + +if [ -z "$PORT" ]; then + echo "Error: PORT environment variable is not set" + exit 1 +fi + +if [ -z 
"$TOTAL_GPUS" ]; then + echo "Error: TOTAL_GPUS environment variable is not set" + exit 1 +fi + +if [ -z "$RANK" ]; then + echo "Error: RANK environment variable is not set" + exit 1 +fi + +if [ -z "$TOTAL_NODES" ]; then + echo "Error: TOTAL_NODES environment variable is not set" + exit 1 +fi + +# Construct command based on mode and cmd +if [ "$mode" = "prefill" ]; then + if [ "$cmd" = "dynamo" ]; then + # H100 dynamo prefill command + python3 components/worker.py \ + --model-path /model/ \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --skip-tokenizer-init \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend nixl \ + --disaggregation-bootstrap-port 30001 \ + --dist-init-addr "$HOST_IP:$PORT" \ + --nnodes "$TOTAL_NODES" \ + --node-rank "$RANK" \ + --tp-size "$TOTAL_GPUS" \ + --dp-size "$TOTAL_GPUS" \ + --enable-dp-attention \ + --decode-log-interval 1 \ + --enable-deepep-moe \ + --page-size 1 \ + --trust-remote-code \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-radix-cache \ + --watchdog-timeout 1000000 \ + --enable-two-batch-overlap \ + --deepep-mode normal \ + --mem-fraction-static 0.85 \ + --deepep-config /configs/deepep.json \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm dynamic \ + --eplb-algorithm deepseek + elif [ "$cmd" = "sglang" ]; then + # H100 sglang prefill command + python3 -m sglang.launch_server \ + --model-path /model/ \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --disaggregation-transfer-backend nixl \ + --disaggregation-mode prefill \ + --dist-init-addr "$HOST_IP:$PORT" \ + --nnodes "$TOTAL_NODES" \ + --node-rank "$RANK" \ + --tp-size "$TOTAL_GPUS" \ + --dp-size "$TOTAL_GPUS" \ + --enable-dp-attention \ + --decode-log-interval 1 \ + --enable-deepep-moe \ + --page-size 1 \ + --host 0.0.0.0 \ + --trust-remote-code \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-radix-cache \ + --watchdog-timeout 1000000 \ + --enable-two-batch-overlap \ + --deepep-mode normal \ + 
--mem-fraction-static 0.85 \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm dynamic \ + --eplb-algorithm deepseek \ + --deepep-config /configs/deepep.json + fi +elif [ "$mode" = "decode" ]; then + if [ "$cmd" = "dynamo" ]; then + # H100 dynamo decode command + python3 components/decode_worker.py \ + --model-path /model/ \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --skip-tokenizer-init \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend nixl \ + --disaggregation-bootstrap-port 30001 \ + --dist-init-addr "$HOST_IP:$PORT" \ + --nnodes "$TOTAL_NODES" \ + --node-rank "$RANK" \ + --tp-size "$TOTAL_GPUS" \ + --dp-size "$TOTAL_GPUS" \ + --enable-dp-attention \ + --decode-log-interval 1 \ + --enable-deepep-moe \ + --page-size 1 \ + --trust-remote-code \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-radix-cache \ + --watchdog-timeout 1000000 \ + --enable-two-batch-overlap \ + --deepep-mode low_latency \ + --mem-fraction-static 0.835 \ + --ep-num-redundant-experts 32 \ + --cuda-graph-bs 256 + elif [ "$cmd" = "sglang" ]; then + # H100 sglang decode command + python3 -m sglang.launch_server \ + --model-path /model/ \ + --disaggregation-transfer-backend nixl \ + --disaggregation-mode decode \ + --dist-init-addr "$HOST_IP:$PORT" \ + --nnodes "$TOTAL_NODES" \ + --node-rank "$RANK" \ + --tp-size "$TOTAL_GPUS" \ + --dp-size "$TOTAL_GPUS" \ + --enable-dp-attention \ + --decode-log-interval 1 \ + --enable-deepep-moe \ + --page-size 1 \ + --host 0.0.0.0 \ + --trust-remote-code \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-radix-cache \ + --watchdog-timeout 1000000 \ + --enable-two-batch-overlap \ + --deepep-mode low_latency \ + --mem-fraction-static 0.835 \ + --ep-num-redundant-experts 32 \ + --cuda-graph-bs 256 + fi +fi + + diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index 90911127e8..8b69f9c937 100644 --- 
a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -33,7 +33,6 @@ import time from pathlib import Path -from .gen_cmd import get_prefill_command_args, get_decode_command_args, get_sglang_mini_lb_command_args import requests # Network configurations @@ -206,6 +205,39 @@ def _validate_args(args: argparse.Namespace) -> None: if args.gpus_per_node < 1: raise ValueError("GPUs per node must be at least 1") +def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> dict: + cmd = ( + f"python3 -m sglang.srt.disaggregation.launch_lb " + f"--prefill http://{prefill_host_ip}:30000 " + f"--decode http://{decode_host_ip}:30000 " + "--host 0.0.0.0 " + "--port 8000 " + "--timeout 3600" + ) + return cmd + +def setup_env_vars_for_gpu_script(host_ip: str, rank: int, total_gpus: int, total_nodes: int, port: int = DIST_INIT_PORT): + """Setup environment variables required by GPU scripts (h100.sh, gb200.sh)""" + os.environ["HOST_IP"] = host_ip + os.environ["PORT"] = str(port) + os.environ["TOTAL_GPUS"] = str(total_gpus) + os.environ["RANK"] = str(rank) + os.environ["TOTAL_NODES"] = str(total_nodes) + + logging.info(f"Set HOST_IP: {host_ip}") + logging.info(f"Set PORT: {port}") + logging.info(f"Set TOTAL_GPUS: {total_gpus}") + logging.info(f"Set RANK: {rank}") + logging.info(f"Set TOTAL_NODES: {total_nodes}") + +def get_gpu_command(worker_type: str, use_sglang_commands: bool, gpu_type: str) -> str: + """Generate command to run the appropriate GPU script""" + script_name = f"{gpu_type}.sh" + script_path = Path(__file__).parent / script_name + mode = worker_type # "prefill" or "decode" + cmd = "sglang" if use_sglang_commands else "dynamo" + + return f"bash {script_path} {mode} {cmd}" def setup_prefill_node( rank: int, prefill_host_ip: str, total_nodes: int, total_gpus: int, use_sglang_commands: bool, gpu_type: str @@ -244,11 +276,13 @@ def setup_prefill_node( else: logging.info("Using SGLang servers. 
No need to setup etcd or nats") - # NOTE: Default command for h100 and dynamo implements the example in examples/sglang/dsr1-wideep.md - cmd_to_run = get_prefill_command_args(gpu_type, use_sglang_commands, prefill_host_ip, total_nodes, rank, total_gpus) + # Setup environment variables for GPU script + setup_env_vars_for_gpu_script(prefill_host_ip, rank, total_gpus, total_nodes) + + # Use appropriate GPU script instead of generating command directly + cmd_to_run = get_gpu_command("prefill", use_sglang_commands, gpu_type) return run_command(cmd_to_run) - def setup_decode_node( rank: int, decode_host_ip: str, @@ -270,7 +304,11 @@ def setup_decode_node( if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): raise RuntimeError("Failed to connect to etcd") - cmd_to_run = get_decode_command_args(gpu_type, use_sglang_commands, decode_host_ip, total_nodes, rank, total_gpus) + # Setup environment variables for GPU script + setup_env_vars_for_gpu_script(decode_host_ip, rank, total_gpus, total_nodes) + + # Use appropriate GPU script instead of generating command directly + cmd_to_run = get_gpu_command("decode", use_sglang_commands, gpu_type) return run_command(cmd_to_run) From 4a3c140a8f90459f60d0f13c711d28a502930dde Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 23:41:40 +0000 Subject: [PATCH 05/65] updated jinja --- .../sglang/slurm_jobs/job_script_template.j2 | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index 84e0e33396..7939977538 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -20,6 +20,8 @@ MODEL_DIR="{{ model_dir }}" CONFIG_DIR="{{ config_dir }}" CONTAINER_IMAGE="{{ container_image }}" NETWORK_INTERFACE="{{ network_interface }}" +GPU_TYPE="{{ gpu_type | default('h100') }}" +USE_SGLANG_COMMANDS="{{ use_sglang_commands | 
default(false) }}" {% raw %} @@ -59,16 +61,22 @@ ENROOT_ARGS="\ --container-mounts=${MODEL_DIR}:/model/,${CONFIG_DIR}:/configs/,${SCRIPT_DIR}:/scripts/,${OUTPUT_DIR}:/outputs/,${LOG_DIR}:/logs/ \ " +# Build common worker arguments +WORKER_ARGS="--gpu_type ${GPU_TYPE} --gpus_per_node ${GPUS_PER_NODE}" +if [ "$USE_SGLANG_COMMANDS" = "true" ]; then + WORKER_ARGS="${WORKER_ARGS} --use-sglang-commands" +fi + # Launch prefill tasks on the first PREFILL_NODES nodes for i in $(seq 0 $((PREFILL_NODES - 1))); do node=${nodes[$i]} rank=$i echo "Launching prefill task on node ${i} (rank ${rank}): $node" echo "Srun args: $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err" - echo "Command: python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpus_per_node ${GPUS_PER_NODE} --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log &" + echo "Command: python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log ${WORKER_ARGS} &" srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node \ --output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err \ - python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpus_per_node ${GPUS_PER_NODE} --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log & + python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log ${WORKER_ARGS} & done # Launch decode 
tasks on the next DECODE_NODES nodes @@ -77,10 +85,10 @@ for i in $(seq $PREFILL_NODES $((PREFILL_NODES + DECODE_NODES - 1))); do rank=$((i - PREFILL_NODES)) echo "Launching decode task on node ${i} (rank ${rank}): $node" echo "Srun args: $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err" - echo "Command: python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpus_per_node ${GPUS_PER_NODE} --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log &" + echo "Command: python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log ${WORKER_ARGS} &" srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node \ --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err \ - python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpus_per_node ${GPUS_PER_NODE} --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log & + python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log ${WORKER_ARGS} & done echo "" From 647e7b7f87df751170d301a94916868d8ce8c971 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 23:43:36 +0000 Subject: [PATCH 06/65] readme and submitter --- examples/sglang/slurm_jobs/README.md | 31 +++++++++++++++++-- .../sglang/slurm_jobs/submit_job_script.py | 9 ++++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/examples/sglang/slurm_jobs/README.md 
b/examples/sglang/slurm_jobs/README.md index 19f7c27ada..d7367de965 100644 --- a/examples/sglang/slurm_jobs/README.md +++ b/examples/sglang/slurm_jobs/README.md @@ -85,20 +85,45 @@ For simplicity of the example, we will make some assumptions about your SLURM cl - `--network-interface`: Network interface to use (default: `eth3`) - `--job-name`: SLURM job name (default: `dynamo_setup`) - `--time-limit`: Time limit in HH:MM:SS format (default: `01:00:00`) + - `--gpu-type`: GPU type to use, choices: `h100`, `gb200` (default: `h100`) + - `--use-sglang-commands`: Use SGLang commands instead of Dynamo (default: `false`) **Note**: The script automatically calculates the total number of nodes needed based on `--prefill-nodes` and `--decode-nodes` parameters. -2. **Monitor job progress**: +2. **Example with different GPU types**: + ```bash + # For H100 with Dynamo (default) + python submit_job_script.py \ + --template job_script_template.j2 \ + --model-dir /path/to/model \ + --config-dir /path/to/configs \ + --container-image container-image-uri \ + --account your-slurm-account \ + --gpu-type h100 + + # For GB200 with SGLang + python submit_job_script.py \ + --template job_script_template.j2 \ + --model-dir /path/to/model \ + --config-dir /path/to/configs \ + --container-image container-image-uri \ + --account your-slurm-account \ + --gpu-type gb200 \ + --use-sglang-commands + --gpus-per-node 4 + ``` + +3. **Monitor job progress**: ```bash squeue -u $USER ``` -3. **Check logs in real-time**: +4. **Check logs in real-time**: ```bash tail -f logs/{JOB_ID}/log.out ``` -4. **Monitor GPU utilization**: +5. 
**Monitor GPU utilization**: ```bash tail -f logs/{JOB_ID}/{node}_prefill_gpu_utilization.log ``` diff --git a/examples/sglang/slurm_jobs/submit_job_script.py b/examples/sglang/slurm_jobs/submit_job_script.py index 64f492224e..510aa40fb2 100644 --- a/examples/sglang/slurm_jobs/submit_job_script.py +++ b/examples/sglang/slurm_jobs/submit_job_script.py @@ -100,6 +100,13 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac parser.add_argument( "--network-interface", default="eth3", help="Network interface to use" ) + parser.add_argument( + "--gpu-type", choices=["h100", "gb200"], default="h100", help="GPU type to use" + ) + parser.add_argument( + "--use-sglang-commands", action="store_true", default=False, + help="Use SGLang commands instead of Dynamo" + ) return parser.parse_args(args) @@ -120,6 +127,8 @@ def main(input_args: list[str] | None = None): "container_image": args.container_image, "gpus_per_node": args.gpus_per_node, "network_interface": args.network_interface, + "gpu_type": args.gpu_type, + "use_sglang_commands": args.use_sglang_commands, } with tempfile.NamedTemporaryFile(mode="w", suffix=".sh") as temp_file: From e8c3b4663a3819a2da3ac7de98f15db4361acdc5 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 23:51:35 +0000 Subject: [PATCH 07/65] go --- examples/sglang/slurm_jobs/job_script_template.j2 | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index 7939977538..9706a9a41c 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -38,14 +38,22 @@ for i in "${!nodes[@]}"; do echo "Node $i: ${nodes[$i]}" done -PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ifconfig $NETWORK_INTERFACE | grep -oP 'inet \K[0-9.]+') +if [ "$GPU_TYPE" = "gb200" ]; then + PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 
--nodelist=${nodes[0]} hostname -I | awk '{print $1}') +else + PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ifconfig $NETWORK_INTERFACE | grep -oP 'inet \K[0-9.]+') +fi if [ -z "$PREFILL_HOST_IP" ]; then echo "Error: Could not retrieve IP address for prefill host ${nodes[0]} on interface $NETWORK_INTERFACE" exit 1 fi echo "Prefill host IP address: $PREFILL_HOST_IP" -DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} ifconfig $NETWORK_INTERFACE | grep -oP 'inet \K[0-9.]+') +if [ "$GPU_TYPE" = "gb200" ]; then + DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} hostname -I | awk '{print $1}') +else + DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} ifconfig $NETWORK_INTERFACE | grep -oP 'inet \K[0-9.]+') +fi if [ -z "$DECODE_HOST_IP" ]; then echo "Error: Could not retrieve IP address for decode host ${nodes[$PREFILL_NODES]} on interface $NETWORK_INTERFACE" exit 1 From 7d33f1eb61fc775a0cb9a84e2d5d562f3f92075a Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 23:57:54 +0000 Subject: [PATCH 08/65] executable --- examples/sglang/slurm_jobs/scripts/gb200.sh | 0 examples/sglang/slurm_jobs/scripts/h100.sh | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 examples/sglang/slurm_jobs/scripts/gb200.sh mode change 100644 => 100755 examples/sglang/slurm_jobs/scripts/h100.sh diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh old mode 100644 new mode 100755 diff --git a/examples/sglang/slurm_jobs/scripts/h100.sh b/examples/sglang/slurm_jobs/scripts/h100.sh old mode 100644 new mode 100755 From e56a0f416c144a7cf9c311f6d90f6c263dc58d61 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 00:06:22 +0000 Subject: [PATCH 09/65] bool check --- examples/sglang/slurm_jobs/job_script_template.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index 9706a9a41c..f6abc0343f 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -71,7 +71,7 @@ ENROOT_ARGS="\ # Build common worker arguments WORKER_ARGS="--gpu_type ${GPU_TYPE} --gpus_per_node ${GPUS_PER_NODE}" -if [ "$USE_SGLANG_COMMANDS" = "true" ]; then +if [ "$USE_SGLANG_COMMANDS" = "True" ]; then WORKER_ARGS="${WORKER_ARGS} --use-sglang-commands" fi From ba0cc3cc2b4487805b5a78d120d4b6d7cd779f0c Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 00:13:35 +0000 Subject: [PATCH 10/65] Added env vars --- examples/sglang/slurm_jobs/scripts/gb200.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 698c1df0ae..1fe254b42c 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -72,6 +72,12 @@ if [ "$mode" = "prefill" ]; then exit 1 elif [ "$cmd" = "sglang" ]; then # GB200 sglang prefill command + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \ + NCCL_MNNVL_ENABLE=1 \ + NCCL_CUMEM_ENABLE=1 \ + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ + SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ + PYTHONUNBUFFERED=1 \ python3 -m sglang.launch_server \ --served-model-name deepseek-ai/DeepSeek-R1 \ --model-path /model/ \ @@ -109,6 +115,8 @@ elif [ "$mode" = "decode" ]; then exit 1 elif [ "$cmd" = "sglang" ]; then # GB200 sglang decode command + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=768 \ + SGLANG_NUM_RESERVED_DECODE_TOKENS=176 \ python3 -m sglang.launch_server \ --model-path /model/ \ --trust-remote-code \ From d0316c24f4b913dbbec8fe1c90dd94e5ae97694f Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 00:19:47 +0000 Subject: [PATCH 11/65] go --- examples/sglang/slurm_jobs/scripts/gb200.sh | 1 + 1 file changed, 1 
insertion(+) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 1fe254b42c..518c4af671 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -83,6 +83,7 @@ if [ "$mode" = "prefill" ]; then --model-path /model/ \ --trust-remote-code \ --disaggregation-mode prefill \ + --disaggregation-transfer-backend nixl \ --dist-init-addr "$HOST_IP:$PORT" \ --nnodes "$TOTAL_NODES" \ --node-rank "$RANK" \ From b0cfbd1b139ee8c2cc4ebac883ffbb53e0820460 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 01:58:54 +0000 Subject: [PATCH 12/65] go --- examples/sglang/slurm_jobs/job_script_template.j2 | 2 +- examples/sglang/slurm_jobs/scripts/worker_setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index f6abc0343f..afe05c5662 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -64,7 +64,7 @@ echo "Decode host IP address: $DECODE_HOST_IP" ENROOT_ARGS="\ --container-image=${CONTAINER_IMAGE} \ --no-container-entrypoint \ - --container-mount-home \ + --no-container-mount-home \ --no-container-remap-root \ --container-mounts=${MODEL_DIR}:/model/,${CONFIG_DIR}:/configs/,${SCRIPT_DIR}:/scripts/,${OUTPUT_DIR}:/outputs/,${LOG_DIR}:/logs/ \ " diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index 8b69f9c937..2b69700401 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -265,7 +265,7 @@ def setup_prefill_node( if not etcd_process: raise RuntimeError("Failed to start etcd") - ingress_process = run_command("dynamo run in=http out=dyn", background=True) + ingress_process = run_command("dynamo run in=http out=dyn --http-port=8000", 
background=True) if not ingress_process: raise RuntimeError("Failed to start ingress") From bcaf9f5c9852fbc22cd655b8910d338ccbb581a1 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 19:53:23 -0700 Subject: [PATCH 13/65] pc --- .../sglang/slurm_jobs/scripts/worker_setup.py | 46 ++++++++++++++----- .../sglang/slurm_jobs/submit_job_script.py | 6 ++- 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index 2b69700401..8a039c7b40 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -205,6 +205,7 @@ def _validate_args(args: argparse.Namespace) -> None: if args.gpus_per_node < 1: raise ValueError("GPUs per node must be at least 1") + def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> dict: cmd = ( f"python3 -m sglang.srt.disaggregation.launch_lb " @@ -216,31 +217,45 @@ def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) - ) return cmd -def setup_env_vars_for_gpu_script(host_ip: str, rank: int, total_gpus: int, total_nodes: int, port: int = DIST_INIT_PORT): + +def setup_env_vars_for_gpu_script( + host_ip: str, + rank: int, + total_gpus: int, + total_nodes: int, + port: int = DIST_INIT_PORT, +): """Setup environment variables required by GPU scripts (h100.sh, gb200.sh)""" os.environ["HOST_IP"] = host_ip os.environ["PORT"] = str(port) os.environ["TOTAL_GPUS"] = str(total_gpus) os.environ["RANK"] = str(rank) os.environ["TOTAL_NODES"] = str(total_nodes) - + logging.info(f"Set HOST_IP: {host_ip}") logging.info(f"Set PORT: {port}") logging.info(f"Set TOTAL_GPUS: {total_gpus}") logging.info(f"Set RANK: {rank}") logging.info(f"Set TOTAL_NODES: {total_nodes}") + def get_gpu_command(worker_type: str, use_sglang_commands: bool, gpu_type: str) -> str: """Generate command to run the appropriate GPU script""" script_name = 
f"{gpu_type}.sh" script_path = Path(__file__).parent / script_name mode = worker_type # "prefill" or "decode" cmd = "sglang" if use_sglang_commands else "dynamo" - + return f"bash {script_path} {mode} {cmd}" + def setup_prefill_node( - rank: int, prefill_host_ip: str, total_nodes: int, total_gpus: int, use_sglang_commands: bool, gpu_type: str + rank: int, + prefill_host_ip: str, + total_nodes: int, + total_gpus: int, + use_sglang_commands: bool, + gpu_type: str, ) -> int: """ Setup the prefill node. @@ -248,7 +263,9 @@ def setup_prefill_node( if not use_sglang_commands: if rank == 0: logging.info(f"Setting up host prefill node: {rank}") - logging.info(f"Starting nats server on node {rank} with IP {prefill_host_ip}") + logging.info( + f"Starting nats server on node {rank} with IP {prefill_host_ip}" + ) nats_process = run_command("nats-server -js", background=True) if not nats_process: @@ -265,7 +282,9 @@ def setup_prefill_node( if not etcd_process: raise RuntimeError("Failed to start etcd") - ingress_process = run_command("dynamo run in=http out=dyn --http-port=8000", background=True) + ingress_process = run_command( + "dynamo run in=http out=dyn --http-port=8000", background=True + ) if not ingress_process: raise RuntimeError("Failed to start ingress") @@ -278,11 +297,12 @@ def setup_prefill_node( # Setup environment variables for GPU script setup_env_vars_for_gpu_script(prefill_host_ip, rank, total_gpus, total_nodes) - + # Use appropriate GPU script instead of generating command directly cmd_to_run = get_gpu_command("prefill", use_sglang_commands, gpu_type) return run_command(cmd_to_run) + def setup_decode_node( rank: int, decode_host_ip: str, @@ -290,7 +310,7 @@ def setup_decode_node( total_nodes: int, total_gpus: int, use_sglang_commands: bool, - gpu_type: str + gpu_type: str, ) -> int: """ Setup the decode node. 
@@ -298,7 +318,9 @@ def setup_decode_node( logging.info(f"Setting up child decode node: {rank}") if use_sglang_commands: - sgl_mini_lb_cmd = get_sglang_mini_lb_command_args(prefill_host_ip, decode_host_ip) + sgl_mini_lb_cmd = get_sglang_mini_lb_command_args( + prefill_host_ip, decode_host_ip + ) run_command(sgl_mini_lb_cmd, background=True) else: if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): @@ -306,7 +328,7 @@ def setup_decode_node( # Setup environment variables for GPU script setup_env_vars_for_gpu_script(decode_host_ip, rank, total_gpus, total_nodes) - + # Use appropriate GPU script instead of generating command directly cmd_to_run = get_gpu_command("decode", use_sglang_commands, gpu_type) return run_command(cmd_to_run) @@ -346,7 +368,7 @@ def main(input_args: list[str] | None = None): args.total_nodes, args.total_nodes * args.gpus_per_node, args.use_sglang_commands, - args.gpu_type + args.gpu_type, ) else: setup_decode_node( @@ -356,7 +378,7 @@ def main(input_args: list[str] | None = None): args.total_nodes, args.total_nodes * args.gpus_per_node, args.use_sglang_commands, - args.gpu_type + args.gpu_type, ) logging.info(f"{args.worker_type.capitalize()} node setup complete") diff --git a/examples/sglang/slurm_jobs/submit_job_script.py b/examples/sglang/slurm_jobs/submit_job_script.py index 510aa40fb2..3b08c26827 100644 --- a/examples/sglang/slurm_jobs/submit_job_script.py +++ b/examples/sglang/slurm_jobs/submit_job_script.py @@ -104,8 +104,10 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac "--gpu-type", choices=["h100", "gb200"], default="h100", help="GPU type to use" ) parser.add_argument( - "--use-sglang-commands", action="store_true", default=False, - help="Use SGLang commands instead of Dynamo" + "--use-sglang-commands", + action="store_true", + default=False, + help="Use SGLang commands instead of Dynamo", ) return parser.parse_args(args) From 1265ac8495587a511e55ca96f6184a917436dfdb Mon Sep 17 
00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 02:55:37 +0000 Subject: [PATCH 14/65] mypy --- examples/sglang/slurm_jobs/scripts/worker_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index 8a039c7b40..f0fdea75ff 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -206,7 +206,7 @@ def _validate_args(args: argparse.Namespace) -> None: raise ValueError("GPUs per node must be at least 1") -def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> dict: +def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> str: cmd = ( f"python3 -m sglang.srt.disaggregation.launch_lb " f"--prefill http://{prefill_host_ip}:30000 " From bf8d1035511dd699dceef4ad038bbda1b337ab59 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 03:10:11 +0000 Subject: [PATCH 15/65] bump --- examples/sglang/slurm_jobs/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/sglang/slurm_jobs/README.md b/examples/sglang/slurm_jobs/README.md index d7367de965..a15ab59ca6 100644 --- a/examples/sglang/slurm_jobs/README.md +++ b/examples/sglang/slurm_jobs/README.md @@ -123,6 +123,15 @@ For simplicity of the example, we will make some assumptions about your SLURM cl tail -f logs/{JOB_ID}/log.out ``` + You can view logs of all prefill or decode workers simultaneously by running: + ```bash + # prefill workers err (or .out) + tail -f logs/{JOB_ID}/*_prefill.err + + # decode workers err (or .out) + tail -f logs/{JOB_ID}/*_decode.err + ``` + 5. 
**Monitor GPU utilization**: ```bash tail -f logs/{JOB_ID}/{node}_prefill_gpu_utilization.log From 51ca695a1dd2cb1dbdebc5e54cec6bfb6016cd11 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 7 Jul 2025 20:30:03 -0700 Subject: [PATCH 16/65] pc --- examples/sglang/slurm_jobs/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/sglang/slurm_jobs/README.md b/examples/sglang/slurm_jobs/README.md index a15ab59ca6..da0f00dd41 100644 --- a/examples/sglang/slurm_jobs/README.md +++ b/examples/sglang/slurm_jobs/README.md @@ -126,10 +126,10 @@ For simplicity of the example, we will make some assumptions about your SLURM cl You can view logs of all prefill or decode workers simultaneously by running: ```bash # prefill workers err (or .out) - tail -f logs/{JOB_ID}/*_prefill.err - + tail -f logs/{JOB_ID}/*_prefill.err + # decode workers err (or .out) - tail -f logs/{JOB_ID}/*_decode.err + tail -f logs/{JOB_ID}/*_decode.err ``` 5. **Monitor GPU utilization**: From 40c5d33015099c88f569d4b3b99b9fb777705b78 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 03:30:49 +0000 Subject: [PATCH 17/65] cpy --- examples/sglang/slurm_jobs/scripts/gb200.sh | 2 ++ examples/sglang/slurm_jobs/scripts/h100.sh | 2 ++ 2 files changed, 4 insertions(+) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 518c4af671..af4d3aa549 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -1,4 +1,6 @@ #!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 # Function to print usage print_usage() { diff --git a/examples/sglang/slurm_jobs/scripts/h100.sh b/examples/sglang/slurm_jobs/scripts/h100.sh index a00b63f554..b457484e3a 100755 --- a/examples/sglang/slurm_jobs/scripts/h100.sh +++ b/examples/sglang/slurm_jobs/scripts/h100.sh @@ -1,4 +1,6 @@ #!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # Function to print usage print_usage() { From c1a8e5abfc23b149b1ea840dc7b72e8494aa53a5 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 18:35:52 +0000 Subject: [PATCH 18/65] combined echo and print based on pr comment --- .../sglang/slurm_jobs/job_script_template.j2 | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index afe05c5662..b6fe92416d 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -80,11 +80,10 @@ for i in $(seq 0 $((PREFILL_NODES - 1))); do node=${nodes[$i]} rank=$i echo "Launching prefill task on node ${i} (rank ${rank}): $node" - echo "Srun args: $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err" - echo "Command: python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log ${WORKER_ARGS} &" - srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node \ - --output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err \ - python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill 
--gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log ${WORKER_ARGS} & + + cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log ${WORKER_ARGS} &" + echo "$cmd" + $cmd done # Launch decode tasks on the next DECODE_NODES nodes @@ -92,11 +91,10 @@ for i in $(seq $PREFILL_NODES $((PREFILL_NODES + DECODE_NODES - 1))); do node=${nodes[$i]} rank=$((i - PREFILL_NODES)) echo "Launching decode task on node ${i} (rank ${rank}): $node" - echo "Srun args: $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err" - echo "Command: python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log ${WORKER_ARGS} &" - srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node \ - --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err \ - python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log ${WORKER_ARGS} & + + cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log ${WORKER_ARGS} &" + echo "$cmd" + $cmd done echo "" 
From 2033b1f5be4e3e6c5c7f9d9f7cb8d6ce86e91bfb Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 22:44:11 +0000 Subject: [PATCH 19/65] option1 --- examples/sglang/slurm_jobs/job_script_template.j2 | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index b6fe92416d..fca83b6726 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -41,7 +41,7 @@ done if [ "$GPU_TYPE" = "gb200" ]; then PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} hostname -I | awk '{print $1}') else - PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ifconfig $NETWORK_INTERFACE | grep -oP 'inet \K[0-9.]+') + PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ip route get $(getent hosts ${nodes[0]} | awk '{ print $1 }') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') fi if [ -z "$PREFILL_HOST_IP" ]; then echo "Error: Could not retrieve IP address for prefill host ${nodes[0]} on interface $NETWORK_INTERFACE" @@ -52,7 +52,7 @@ echo "Prefill host IP address: $PREFILL_HOST_IP" if [ "$GPU_TYPE" = "gb200" ]; then DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} hostname -I | awk '{print $1}') else - DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} ifconfig $NETWORK_INTERFACE | grep -oP 'inet \K[0-9.]+') + DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} ip route get $(getent hosts ${nodes[$PREFILL_NODES]} | awk '{ print $1 }') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') fi if [ -z "$DECODE_HOST_IP" ]; then echo "Error: Could not retrieve IP address for decode host ${nodes[$PREFILL_NODES]} on interface $NETWORK_INTERFACE" @@ -81,9 +81,9 @@ for i in $(seq 0 $((PREFILL_NODES - 1))); do rank=$i echo "Launching prefill task on node ${i} 
(rank ${rank}): $node" - cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log ${WORKER_ARGS} &" + cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log ${WORKER_ARGS}" echo "$cmd" - $cmd + $cmd & done # Launch decode tasks on the next DECODE_NODES nodes @@ -92,9 +92,9 @@ for i in $(seq $PREFILL_NODES $((PREFILL_NODES + DECODE_NODES - 1))); do rank=$((i - PREFILL_NODES)) echo "Launching decode task on node ${i} (rank ${rank}): $node" - cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log ${WORKER_ARGS} &" + cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log ${WORKER_ARGS}" echo "$cmd" - $cmd + $cmd & done echo "" From 44330642ed9b6114d66f06352af4ca5f1fcd8997 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: 
Tue, 8 Jul 2025 22:55:31 +0000 Subject: [PATCH 20/65] option1 --- examples/sglang/slurm_jobs/job_script_template.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index fca83b6726..6460a7a23e 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -41,7 +41,7 @@ done if [ "$GPU_TYPE" = "gb200" ]; then PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} hostname -I | awk '{print $1}') else - PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ip route get $(getent hosts ${nodes[0]} | awk '{ print $1 }') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') + PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ip route get $(getent ahosts ${nodes[0]} | grep STREAM | head -1 | awk '{print $1}') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') fi if [ -z "$PREFILL_HOST_IP" ]; then echo "Error: Could not retrieve IP address for prefill host ${nodes[0]} on interface $NETWORK_INTERFACE" @@ -52,7 +52,7 @@ echo "Prefill host IP address: $PREFILL_HOST_IP" if [ "$GPU_TYPE" = "gb200" ]; then DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} hostname -I | awk '{print $1}') else - DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} ip route get $(getent hosts ${nodes[$PREFILL_NODES]} | awk '{ print $1 }') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') + DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} ip route get $(getent ahosts ${nodes[$PREFILL_NODES]} | grep STREAM | head -1 | awk '{print $1}') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') fi if [ -z "$DECODE_HOST_IP" ]; then echo "Error: Could not retrieve IP address for decode host ${nodes[$PREFILL_NODES]} on interface $NETWORK_INTERFACE" From 
d827ac7bdd3120af3c63cacb554003ae34a7e16c Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 23:01:39 +0000 Subject: [PATCH 21/65] works on h100 and gb200 cluster --- examples/sglang/slurm_jobs/job_script_template.j2 | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index 6460a7a23e..90da04c7ad 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -38,22 +38,14 @@ for i in "${!nodes[@]}"; do echo "Node $i: ${nodes[$i]}" done -if [ "$GPU_TYPE" = "gb200" ]; then - PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} hostname -I | awk '{print $1}') -else - PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ip route get $(getent ahosts ${nodes[0]} | grep STREAM | head -1 | awk '{print $1}') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') -fi +PREFILL_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ip route get $(getent ahosts ${nodes[0]} | grep STREAM | head -1 | awk '{print $1}') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') if [ -z "$PREFILL_HOST_IP" ]; then echo "Error: Could not retrieve IP address for prefill host ${nodes[0]} on interface $NETWORK_INTERFACE" exit 1 fi echo "Prefill host IP address: $PREFILL_HOST_IP" -if [ "$GPU_TYPE" = "gb200" ]; then - DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} hostname -I | awk '{print $1}') -else - DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} ip route get $(getent ahosts ${nodes[$PREFILL_NODES]} | grep STREAM | head -1 | awk '{print $1}') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') -fi +DECODE_HOST_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[$PREFILL_NODES]} ip route get $(getent ahosts ${nodes[$PREFILL_NODES]} | grep STREAM | head -1 | awk '{print $1}') | awk 
'{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}') if [ -z "$DECODE_HOST_IP" ]; then echo "Error: Could not retrieve IP address for decode host ${nodes[$PREFILL_NODES]} on interface $NETWORK_INTERFACE" exit 1 From c2fe7b0ff750be45ec51d4bc76a53b1694c36639 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 23:21:31 +0000 Subject: [PATCH 22/65] added nats etcd ingress setup function --- .../sglang/slurm_jobs/scripts/worker_setup.py | 53 ++++++++++--------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index f0fdea75ff..e097ab8d2d 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -249,6 +249,32 @@ def get_gpu_command(worker_type: str, use_sglang_commands: bool, gpu_type: str) return f"bash {script_path} {mode} {cmd}" +def setup_nats_etcd_and_ingress(prefill_host_ip: str) -> None: + logging.info(f"Starting nats server on node {prefill_host_ip}") + + nats_process = run_command("nats-server -js", background=True) + if not nats_process: + raise RuntimeError("Failed to start nats-server") + + logging.info(f"Starting etcd server on node {prefill_host_ip}") + etcd_cmd = ( + f"etcd --listen-client-urls {ETCD_LISTEN_ADDR}:{ETCD_CLIENT_PORT} " + f"--advertise-client-urls {ETCD_LISTEN_ADDR}:{ETCD_CLIENT_PORT} " + f"--listen-peer-urls {ETCD_LISTEN_ADDR}:{ETCD_PEER_PORT} " + f"--initial-cluster default=http://{prefill_host_ip}:{ETCD_PEER_PORT}" + ) + + etcd_process = run_command(etcd_cmd, background=True) + if not etcd_process: + raise RuntimeError("Failed to start etcd") + + logging.info(f"Starting ingress server on node {prefill_host_ip}") + ingress_process = run_command( + "dynamo run in=http out=dyn --http-port=8000", background=True + ) + if not ingress_process: + raise RuntimeError("Failed to start ingress") + def setup_prefill_node( rank: int, prefill_host_ip: str, @@ 
-262,32 +288,7 @@ def setup_prefill_node( """ if not use_sglang_commands: if rank == 0: - logging.info(f"Setting up host prefill node: {rank}") - logging.info( - f"Starting nats server on node {rank} with IP {prefill_host_ip}" - ) - - nats_process = run_command("nats-server -js", background=True) - if not nats_process: - raise RuntimeError("Failed to start nats-server") - - etcd_cmd = ( - f"etcd --listen-client-urls {ETCD_LISTEN_ADDR}:{ETCD_CLIENT_PORT} " - f"--advertise-client-urls {ETCD_LISTEN_ADDR}:{ETCD_CLIENT_PORT} " - f"--listen-peer-urls {ETCD_LISTEN_ADDR}:{ETCD_PEER_PORT} " - f"--initial-cluster default=http://{prefill_host_ip}:{ETCD_PEER_PORT}" - ) - - etcd_process = run_command(etcd_cmd, background=True) - if not etcd_process: - raise RuntimeError("Failed to start etcd") - - ingress_process = run_command( - "dynamo run in=http out=dyn --http-port=8000", background=True - ) - if not ingress_process: - raise RuntimeError("Failed to start ingress") - + setup_nats_etcd_and_ingress(prefill_host_ip) else: logging.info(f"Setting up child prefill node: {rank}") if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): From 30b9d8ec879f346e4d21d1ca1d4ec44ae5de29b3 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 16:22:37 -0700 Subject: [PATCH 23/65] precommit --- examples/sglang/slurm_jobs/job_script_template.j2 | 4 ++-- examples/sglang/slurm_jobs/scripts/worker_setup.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index 90da04c7ad..2e873c42fa 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -72,7 +72,7 @@ for i in $(seq 0 $((PREFILL_NODES - 1))); do node=${nodes[$i]} rank=$i echo "Launching prefill task on node ${i} (rank ${rank}): $node" - + cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node 
--output=${LOG_DIR}/${node}_prefill.out --error=${LOG_DIR}/${node}_prefill.err python /scripts/worker_setup.py --prefill_host_ip ${PREFILL_HOST_IP} --decode_host_ip ${DECODE_HOST_IP} --rank ${rank} --total_nodes ${PREFILL_NODES} --worker_type prefill --gpu_utilization_log /logs/${node}_prefill_gpu_utilization.log ${WORKER_ARGS}" echo "$cmd" $cmd & @@ -83,7 +83,7 @@ for i in $(seq $PREFILL_NODES $((PREFILL_NODES + DECODE_NODES - 1))); do node=${nodes[$i]} rank=$((i - PREFILL_NODES)) echo "Launching decode task on node ${i} (rank ${rank}): $node" - + cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode.out --error=${LOG_DIR}/${node}_decode.err python /scripts/worker_setup.py --decode_host_ip ${DECODE_HOST_IP} --prefill_host_ip ${PREFILL_HOST_IP} --rank ${rank} --total_nodes ${DECODE_NODES} --worker_type decode --gpu_utilization_log /logs/${node}_decode_gpu_utilization.log ${WORKER_ARGS}" echo "$cmd" $cmd & diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index e097ab8d2d..a00e2a1b95 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -275,6 +275,7 @@ def setup_nats_etcd_and_ingress(prefill_host_ip: str) -> None: if not ingress_process: raise RuntimeError("Failed to start ingress") + def setup_prefill_node( rank: int, prefill_host_ip: str, From 3a66d66e66bb23cc9a1b13ee85550f17ab324a1e Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 8 Jul 2025 23:42:02 +0000 Subject: [PATCH 24/65] add server --- .../sglang/slurm_jobs/scripts/worker_setup.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index e097ab8d2d..a806c37643 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -249,7 
+249,10 @@ def get_gpu_command(worker_type: str, use_sglang_commands: bool, gpu_type: str) return f"bash {script_path} {mode} {cmd}" -def setup_nats_etcd_and_ingress(prefill_host_ip: str) -> None: +def setup_head_prefill_node(prefill_host_ip: str) -> None: + """ + Setup NATS, etcd, ingress, and http servers on the prefill host node. + """ logging.info(f"Starting nats server on node {prefill_host_ip}") nats_process = run_command("nats-server -js", background=True) @@ -274,6 +277,14 @@ def setup_nats_etcd_and_ingress(prefill_host_ip: str) -> None: ) if not ingress_process: raise RuntimeError("Failed to start ingress") + + logging.info(f"Starting http server on port 9001for flush_cache endpoint on node {prefill_host_ip}") + cache_flush_server_cmd = ( + f"python3 utils/sgl_http_server.py --ns dynamo" + ) + cache_flush_server_process = run_command(cache_flush_server_cmd, background=True) + if not cache_flush_server_process: + raise RuntimeError("Failed to start cache flush server") def setup_prefill_node( rank: int, @@ -288,7 +299,7 @@ def setup_prefill_node( """ if not use_sglang_commands: if rank == 0: - setup_nats_etcd_and_ingress(prefill_host_ip) + setup_head_prefill_node(prefill_host_ip) else: logging.info(f"Setting up child prefill node: {rank}") if not wait_for_etcd(f"http://{prefill_host_ip}:{ETCD_CLIENT_PORT}"): From 62f2ea52fe1a04b7fd492a8cdf6ec3f1257de1a1 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 9 Jul 2025 18:34:21 +0000 Subject: [PATCH 25/65] bump time limit --- examples/sglang/slurm_jobs/submit_job_script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sglang/slurm_jobs/submit_job_script.py b/examples/sglang/slurm_jobs/submit_job_script.py index 3b08c26827..196de92a0d 100644 --- a/examples/sglang/slurm_jobs/submit_job_script.py +++ b/examples/sglang/slurm_jobs/submit_job_script.py @@ -86,7 +86,7 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac parser.add_argument("--config-dir", 
required=True, help="Config directory path") parser.add_argument("--container-image", required=True, help="Container image") parser.add_argument( - "--time-limit", default="01:00:00", help="Time limit (HH:MM:SS)" + "--time-limit", default="04:00:00", help="Time limit (HH:MM:SS)" ) parser.add_argument( "--prefill-nodes", type=int, default=2, help="Number of prefill nodes" From ec9aa138569891dd3085f38edb19a4eb22aaa9ba Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 10 Jul 2025 00:29:52 +0000 Subject: [PATCH 26/65] fix mooncake env vars on gb200 --- examples/sglang/slurm_jobs/scripts/gb200.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index af4d3aa549..51443c4a46 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -75,11 +75,8 @@ if [ "$mode" = "prefill" ]; then elif [ "$cmd" = "sglang" ]; then # GB200 sglang prefill command SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \ - NCCL_MNNVL_ENABLE=1 \ - NCCL_CUMEM_ENABLE=1 \ - SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ - SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ - PYTHONUNBUFFERED=1 \ + SGLANG_MOONCAKE_ALLOCATOR_SO_PATH=/configs/hook.so \ + SGLANG_MOONCAKE_CUSTOM_POOL=True \ python3 -m sglang.launch_server \ --served-model-name deepseek-ai/DeepSeek-R1 \ --model-path /model/ \ @@ -120,6 +117,8 @@ elif [ "$mode" = "decode" ]; then # GB200 sglang decode command SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=768 \ SGLANG_NUM_RESERVED_DECODE_TOKENS=176 \ + SGLANG_MOONCAKE_ALLOCATOR_SO_PATH=/configs/hook.so \ + SGLANG_MOONCAKE_CUSTOM_POOL=True \ python3 -m sglang.launch_server \ --model-path /model/ \ --trust-remote-code \ From 75d2a25e1c662483de327cf3ef5bb521b1a9c64c Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 10 Jul 2025 00:47:33 +0000 Subject: [PATCH 27/65] bump --- examples/sglang/slurm_jobs/scripts/gb200.sh | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 51443c4a46..a431cbecdb 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -82,7 +82,7 @@ if [ "$mode" = "prefill" ]; then --model-path /model/ \ --trust-remote-code \ --disaggregation-mode prefill \ - --disaggregation-transfer-backend nixl \ + --disaggregation-transfer-backend mooncake \ --dist-init-addr "$HOST_IP:$PORT" \ --nnodes "$TOTAL_NODES" \ --node-rank "$RANK" \ @@ -122,7 +122,7 @@ elif [ "$mode" = "decode" ]; then python3 -m sglang.launch_server \ --model-path /model/ \ --trust-remote-code \ - --disaggregation-transfer-backend nixl \ + --disaggregation-transfer-backend mooncake \ --disaggregation-mode decode \ --dist-init-addr "$HOST_IP:$PORT" \ --nnodes "$TOTAL_NODES" \ From d2ae39fc7621efc5c53806f4e20a639c539f7d45 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 10 Jul 2025 21:43:37 +0000 Subject: [PATCH 28/65] use sglang rust balancer --- examples/sglang/slurm_jobs/scripts/worker_setup.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index eea8489296..5e638f819e 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -208,12 +208,14 @@ def _validate_args(args: argparse.Namespace) -> None: def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> str: cmd = ( - f"python3 -m sglang.srt.disaggregation.launch_lb " - f"--prefill http://{prefill_host_ip}:30000 " - f"--decode http://{decode_host_ip}:30000 " - "--host 0.0.0.0 " - "--port 8000 " - "--timeout 3600" + f"python3 -m sglang_router.launch_router " + f"--policy prefill_decode " + f"--prefill-urls http://{prefill_host_ip}:30000:8998 " + f"--decode-urls 
http://{decode_host_ip}:30000 " + f"--pd-policy random " + f"--host 0.0.0.0 " + f"--port 8000 " + f"--timeout 3600" ) return cmd From 4f22a17a472e9c9839a437f8ee1a2e7ed54f0963 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 10 Jul 2025 21:53:08 +0000 Subject: [PATCH 29/65] fix --- examples/sglang/slurm_jobs/scripts/worker_setup.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index 5e638f819e..0b48087403 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -208,14 +208,14 @@ def _validate_args(args: argparse.Namespace) -> None: def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> str: cmd = ( + "pip install sglang-router; " f"python3 -m sglang_router.launch_router " - f"--policy prefill_decode " - f"--prefill-urls http://{prefill_host_ip}:30000:8998 " - f"--decode-urls http://{decode_host_ip}:30000 " - f"--pd-policy random " + f"--policy random " + f"--pd-disaggregation " + f"--prefill http://{prefill_host_ip}:30000 30001 " + f"--decode http://{decode_host_ip}:30000 " f"--host 0.0.0.0 " f"--port 8000 " - f"--timeout 3600" ) return cmd From 7cac87ed3e88a7d0b833b7eb33b124450504bf8d Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 10 Jul 2025 22:06:33 +0000 Subject: [PATCH 30/65] another --- examples/sglang/slurm_jobs/scripts/worker_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index 0b48087403..c3e95c65ca 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -208,7 +208,7 @@ def _validate_args(args: argparse.Namespace) -> None: def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> str: cmd = ( - 
"pip install sglang-router; " + "pip install sglang-router && " f"python3 -m sglang_router.launch_router " f"--policy random " f"--pd-disaggregation " From c9c5e26da70ac9401b6071958ff3ffadf133fd35 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Fri, 11 Jul 2025 20:24:44 +0000 Subject: [PATCH 31/65] gb 200 but nixl --- examples/sglang/slurm_jobs/scripts/gb200.sh | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index a431cbecdb..38799c2547 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -77,12 +77,15 @@ if [ "$mode" = "prefill" ]; then SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \ SGLANG_MOONCAKE_ALLOCATOR_SO_PATH=/configs/hook.so \ SGLANG_MOONCAKE_CUSTOM_POOL=True \ + NIXL_LOG_LEVEL=TRACE \ + UCX_LOG_LEVEL=debug \ + MC_FORCE_MNNVL=1 \ python3 -m sglang.launch_server \ --served-model-name deepseek-ai/DeepSeek-R1 \ --model-path /model/ \ --trust-remote-code \ --disaggregation-mode prefill \ - --disaggregation-transfer-backend mooncake \ + --disaggregation-transfer-backend nixl \ --dist-init-addr "$HOST_IP:$PORT" \ --nnodes "$TOTAL_NODES" \ --node-rank "$RANK" \ @@ -107,7 +110,8 @@ if [ "$mode" = "prefill" ]; then --disable-cuda-graph \ --chunked-prefill-size 16384 \ --max-total-tokens 32768 \ - --mem-fraction-static 0.9 + --mem-fraction-static 0.9 \ + --log-level debug fi elif [ "$mode" = "decode" ]; then if [ "$cmd" = "dynamo" ]; then @@ -119,6 +123,9 @@ elif [ "$mode" = "decode" ]; then SGLANG_NUM_RESERVED_DECODE_TOKENS=176 \ SGLANG_MOONCAKE_ALLOCATOR_SO_PATH=/configs/hook.so \ SGLANG_MOONCAKE_CUSTOM_POOL=True \ + NIXL_LOG_LEVEL=TRACE \ + UCX_LOG_LEVEL=debug \ + MC_FORCE_MNNVL=1 \ python3 -m sglang.launch_server \ --model-path /model/ \ --trust-remote-code \ @@ -147,6 +154,7 @@ elif [ "$mode" = "decode" ]; then --attention-backend cutlass_mla \ --watchdog-timeout 1000000 \ 
--chunked-prefill-size 36864 \ - --mem-fraction-static 0.82 + --mem-fraction-static 0.82 \ + --log-level debug fi fi From 1afee3c82dd167c127a2861fe81e6684b614e710 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Fri, 11 Jul 2025 20:28:09 +0000 Subject: [PATCH 32/65] bump --- examples/sglang/slurm_jobs/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/sglang/slurm_jobs/README.md b/examples/sglang/slurm_jobs/README.md index da0f00dd41..661fb62278 100644 --- a/examples/sglang/slurm_jobs/README.md +++ b/examples/sglang/slurm_jobs/README.md @@ -62,6 +62,9 @@ For simplicity of the example, we will make some assumptions about your SLURM cl ## Usage 1. **Submit a benchmark job**: + + > **Note:** The logic for finding the prefill and decode node IP addresses in [`job_script_template.j2`](job_script_template.j2) is still a work in progress. You may need to tweak the `srun`/`ip route`/`getent`/`awk` commands for your cluster setup, especially if your networking or hostname conventions differ. PRs and suggestions welcome. + ```bash python submit_job_script.py \ --template job_script_template.j2 \ From 3da74682408fbc5e8c516eb3ed4a1df5b6613604 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Fri, 11 Jul 2025 20:29:40 +0000 Subject: [PATCH 33/65] bump --- examples/sglang/slurm_jobs/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/sglang/slurm_jobs/README.md b/examples/sglang/slurm_jobs/README.md index 661fb62278..e9389d41cd 100644 --- a/examples/sglang/slurm_jobs/README.md +++ b/examples/sglang/slurm_jobs/README.md @@ -63,7 +63,8 @@ For simplicity of the example, we will make some assumptions about your SLURM cl 1. **Submit a benchmark job**: - > **Note:** The logic for finding the prefill and decode node IP addresses in [`job_script_template.j2`](job_script_template.j2) is still a work in progress. 
You may need to tweak the `srun`/`ip route`/`getent`/`awk` commands for your cluster setup, especially if your networking or hostname conventions differ. PRs and suggestions welcome. + > [!NOTE] + > The logic for finding prefill and decode node IPs in [`job_script_template.j2`](job_script_template.j2) is still a work in progress. You may need to tweak the `srun`/`ip route`/`getent`/`awk` bits for your cluster, especially if your networking or hostname conventions differ. PRs and suggestions welcome. ```bash python submit_job_script.py \ From d32c09a5de0429dfa0551a1aaa261e3ef69b0d02 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Fri, 11 Jul 2025 20:30:29 +0000 Subject: [PATCH 34/65] pc --- examples/sglang/slurm_jobs/README.md | 2 +- examples/sglang/slurm_jobs/scripts/worker_setup.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/sglang/slurm_jobs/README.md b/examples/sglang/slurm_jobs/README.md index e9389d41cd..de91444cae 100644 --- a/examples/sglang/slurm_jobs/README.md +++ b/examples/sglang/slurm_jobs/README.md @@ -63,7 +63,7 @@ For simplicity of the example, we will make some assumptions about your SLURM cl 1. **Submit a benchmark job**: - > [!NOTE] + > [!NOTE] > The logic for finding prefill and decode node IPs in [`job_script_template.j2`](job_script_template.j2) is still a work in progress. You may need to tweak the `srun`/`ip route`/`getent`/`awk` bits for your cluster, especially if your networking or hostname conventions differ. PRs and suggestions welcome. 
```bash diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index c3e95c65ca..5e6aa9309a 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -279,11 +279,11 @@ def setup_head_prefill_node(prefill_host_ip: str) -> None: ) if not ingress_process: raise RuntimeError("Failed to start ingress") - - logging.info(f"Starting http server on port 9001for flush_cache endpoint on node {prefill_host_ip}") - cache_flush_server_cmd = ( - f"python3 utils/sgl_http_server.py --ns dynamo" + + logging.info( + f"Starting http server on port 9001 for flush_cache endpoint on node {prefill_host_ip}" ) + cache_flush_server_cmd = "python3 utils/sgl_http_server.py --ns dynamo" cache_flush_server_process = run_command(cache_flush_server_cmd, background=True) if not cache_flush_server_process: raise RuntimeError("Failed to start cache flush server") From d9cc3ff6148de2a32a0da5ab9f26df06bc00daaf Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Fri, 11 Jul 2025 20:31:38 +0000 Subject: [PATCH 35/65] go --- examples/sglang/slurm_jobs/README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/sglang/slurm_jobs/README.md b/examples/sglang/slurm_jobs/README.md index de91444cae..ba8539a41a 100644 --- a/examples/sglang/slurm_jobs/README.md +++ b/examples/sglang/slurm_jobs/README.md @@ -61,11 +61,10 @@ For simplicity of the example, we will make some assumptions about your SLURM cl ## Usage -1. **Submit a benchmark job**: - - > [!NOTE] - > The logic for finding prefill and decode node IPs in [`job_script_template.j2`](job_script_template.j2) is still a work in progress. You may need to tweak the `srun`/`ip route`/`getent`/`awk` bits for your cluster, especially if your networking or hostname conventions differ. PRs and suggestions welcome. 
+> [!NOTE] +> The logic for finding prefill and decode node IPs in [`job_script_template.j2`](job_script_template.j2) is still a work in progress. You may need to tweak the `srun`/`ip route`/`getent`/`awk` bits for your cluster, especially if your networking or hostname conventions differ. PRs and suggestions welcome. +1. **Submit a benchmark job**: ```bash python submit_job_script.py \ --template job_script_template.j2 \ From dc82cabee29bce3c07cbb143f0d02a17ff68576f Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Sun, 13 Jul 2025 23:40:47 +0000 Subject: [PATCH 36/65] bump --- examples/sglang/slurm_jobs/scripts/gb200.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 38799c2547..c914d31c73 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -95,7 +95,7 @@ if [ "$mode" = "prefill" ]; then --host 0.0.0.0 \ --decode-log-interval 1 \ --max-running-requests 6144 \ - --context-length 2176 \ + --context-length 10000 \ --disable-radix-cache \ --enable-deepep-moe \ --deepep-mode low_latency \ @@ -140,7 +140,7 @@ elif [ "$mode" = "decode" ]; then --host 0.0.0.0 \ --decode-log-interval 1 \ --max-running-requests 36864 \ - --context-length 2176 \ + --context-length 10000 \ --disable-radix-cache \ --enable-deepep-moe \ --deepep-mode low_latency \ From efb01b12ccb01932fd4175acf9a292cb7b78c2fd Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Sun, 13 Jul 2025 23:56:42 +0000 Subject: [PATCH 37/65] bump --- examples/sglang/slurm_jobs/scripts/gb200.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index c914d31c73..9879f0c567 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -76,7 +76,7 @@ if [ "$mode" = "prefill" ]; then # GB200 sglang 
prefill command SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \ SGLANG_MOONCAKE_ALLOCATOR_SO_PATH=/configs/hook.so \ - SGLANG_MOONCAKE_CUSTOM_POOL=True \ + SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ NIXL_LOG_LEVEL=TRACE \ UCX_LOG_LEVEL=debug \ MC_FORCE_MNNVL=1 \ @@ -122,7 +122,7 @@ elif [ "$mode" = "decode" ]; then SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=768 \ SGLANG_NUM_RESERVED_DECODE_TOKENS=176 \ SGLANG_MOONCAKE_ALLOCATOR_SO_PATH=/configs/hook.so \ - SGLANG_MOONCAKE_CUSTOM_POOL=True \ + SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ NIXL_LOG_LEVEL=TRACE \ UCX_LOG_LEVEL=debug \ MC_FORCE_MNNVL=1 \ From 2db0ebccf7e19c2fa8d791a6c8d09cd8cd10d628 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 14 Jul 2025 04:46:18 +0000 Subject: [PATCH 38/65] tot dynamo --- container/Dockerfile.sglang-deepep | 11 +--- container/Dockerfile.sglang-gb200 | 88 ++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 8 deletions(-) create mode 100644 container/Dockerfile.sglang-gb200 diff --git a/container/Dockerfile.sglang-deepep b/container/Dockerfile.sglang-deepep index 53e001f82e..4a807fd6b2 100644 --- a/container/Dockerfile.sglang-deepep +++ b/container/Dockerfile.sglang-deepep @@ -71,10 +71,8 @@ RUN rm -rf /opt/hpcx/ucx && \ ENV LD_LIBRARY_PATH=/usr/lib:/usr/local/ucx/lib:$LD_LIBRARY_PATH -# Pinning to NIXL 0.2.1 right now -# There is a fix that was merged into SGLang after 0.4.8.post1 -# TODO: Investigate perf hit of that change before we bump to up to date NIXL -ARG NIXL_COMMIT="5e4c179ee850d482a83cb2a211e0947e46281060" +# Pinnning to NIXL 0.3.1 +ARG NIXL_COMMIT="3503658e71143b56f9d5b1b440d84a94b9c41af8" RUN git clone https://github.com/ai-dynamo/nixl.git && cd nixl && git checkout ${NIXL_COMMIT} && pip install --break-system-packages . 
--config-settings=setup-args="-Ducx_path=/usr/local/ucx" WORKDIR /sgl-workspace @@ -89,10 +87,7 @@ RUN pip install --break-system-packages "sglang==0.4.8.post1" ENV SGL_FORCE_SHUTDOWN=1 WORKDIR /sgl-workspace -# include flush cache endpoint and server support -# https://github.com/ai-dynamo/dynamo/pull/1769 -ARG DYNAMO_COMMIT="bd91dca6141e05bcfbe9bd4dea54cc58b9e37d75" -RUN git clone https://github.com/ai-dynamo/dynamo.git && cd dynamo && git checkout ${DYNAMO_COMMIT} +RUN git clone https://github.com/ai-dynamo/dynamo.git # install dynamo in editable mode WORKDIR /sgl-workspace/dynamo diff --git a/container/Dockerfile.sglang-gb200 b/container/Dockerfile.sglang-gb200 new file mode 100644 index 0000000000..84c413a1ef --- /dev/null +++ b/container/Dockerfile.sglang-gb200 @@ -0,0 +1,88 @@ +FROM sglarm:latest + +# Define architecture variables for ARM64 +ARG ARCH=arm64 +ARG ARCH_ALT=aarch64 + +WORKDIR /sgl-workspace +# include flush cache endpoint and server support +# https://github.com/ai-dynamo/dynamo/pull/1769 +ARG DYNAMO_COMMIT="bd91dca6141e05bcfbe9bd4dea54cc58b9e37d75" +RUN git clone https://github.com/ai-dynamo/dynamo.git && cd dynamo && git checkout ${DYNAMO_COMMIT} + +# install dynamo in editable mode +WORKDIR /sgl-workspace/dynamo +# Rust build/dev dependencies +RUN apt update -y && \ + apt install --no-install-recommends -y \ + build-essential \ + protobuf-compiler \ + cmake \ + libssl-dev \ + pkg-config \ + clang \ + libclang-dev \ + git + +# Define Rust target based on ARCH_ALT ARG +ARG RUSTARCH=${ARCH_ALT}-unknown-linux-gnu + +ENV RUSTUP_HOME=/usr/local/rustup \ + CARGO_HOME=/usr/local/cargo \ + PATH=/usr/local/cargo/bin:$PATH \ + RUST_VERSION=1.86.0 + +# Install Rust using RUSTARCH derived from ARCH_ALT +RUN wget --tries=3 --waitretry=5 "https://static.rust-lang.org/rustup/archive/1.28.1/${RUSTARCH}/rustup-init" && \ + # TODO: Add SHA check back based on RUSTARCH + chmod +x rustup-init && \ + ./rustup-init -y --no-modify-path --profile minimal 
--default-toolchain $RUST_VERSION --default-host ${RUSTARCH} && \ + rm rustup-init && \ + chmod -R a+w $RUSTUP_HOME $CARGO_HOME + +ARG CARGO_BUILD_JOBS +# Set CARGO_BUILD_JOBS to 16 if not provided +# This is to prevent cargo from building $(nproc) jobs in parallel, +# which might exceed the number of opened files limit. +ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} + +RUN cargo build --release +RUN mkdir -p deploy/sdk/src/dynamo/sdk/cli/bin +RUN cp target/release/http deploy/sdk/src/dynamo/sdk/cli/bin +RUN cp target/release/llmctl deploy/sdk/src/dynamo/sdk/cli/bin +RUN cp target/release/dynamo-run deploy/sdk/src/dynamo/sdk/cli/bin + +RUN cd lib/bindings/python && pip install --break-system-packages -e . && cd ../../.. +RUN pip install --break-system-packages -e . + +ENV PYTHONPATH=/sgl-workspace/dynamo/components/planner/src:/sgl-workspace/dynamo/examples/sglang:$PYTHONPATH + +RUN wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/v2.10.28/nats-server-v2.10.28-${ARCH}.deb && \ + dpkg -i nats-server-v2.10.28-${ARCH}.deb && rm nats-server-v2.10.28-${ARCH}.deb + +ENV ETCD_VERSION="v3.5.21" +RUN wget --tries=3 --waitretry=5 https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-${ARCH}.tar.gz -O /tmp/etcd.tar.gz && \ + mkdir -p /usr/local/bin/etcd && \ + tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 && \ + rm /tmp/etcd.tar.gz +ENV PATH=/usr/local/bin/etcd/:$PATH + +# Install perf_analyzer and genai-perf +RUN apt-get update -y && \ + apt-get install -y --no-install-recommends \ + rapidjson-dev \ + zlib1g-dev + +RUN git clone --depth=1 https://github.com/triton-inference-server/perf_analyzer.git && \ + mkdir perf_analyzer/build && \ + cmake -B perf_analyzer/build -S perf_analyzer && \ + cmake --build perf_analyzer/build -- -j8 + +ENV PATH=/sgl-workspace/perf_analyzer/build/perf_analyzer/src/perf-analyzer-build:$PATH + +RUN pip install --break-system-packages genai-perf + +COPY 
examples/sglang/configs/deepseek_r1/wideep/* /sgl-workspace/dynamo/examples/sglang/configs/ +COPY examples/sglang/utils/benchmarking/* /sgl-workspace/dynamo/examples/sglang/utils/ + +WORKDIR /sgl-workspace/dynamo/examples/sglang \ No newline at end of file From f275433f6e1fa5a124e3085288b19bc0eb53ec5c Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 17:45:27 +0000 Subject: [PATCH 39/65] bump --- container/Dockerfile.sglang-gb200 | 7 ++-- .../wideep/install_mooncake_from_src.sh | 20 +++++++++ examples/sglang/docs/dsr1-wideep-gb200.md | 42 +++++++++++++++++++ 3 files changed, 65 insertions(+), 4 deletions(-) create mode 100644 examples/sglang/configs/deepseek_r1/wideep/install_mooncake_from_src.sh create mode 100644 examples/sglang/docs/dsr1-wideep-gb200.md diff --git a/container/Dockerfile.sglang-gb200 b/container/Dockerfile.sglang-gb200 index 84c413a1ef..f1ba0ab348 100644 --- a/container/Dockerfile.sglang-gb200 +++ b/container/Dockerfile.sglang-gb200 @@ -1,13 +1,12 @@ -FROM sglarm:latest +FROM sgl-blackwell-wideep:latest # Define architecture variables for ARM64 ARG ARCH=arm64 ARG ARCH_ALT=aarch64 WORKDIR /sgl-workspace -# include flush cache endpoint and server support -# https://github.com/ai-dynamo/dynamo/pull/1769 -ARG DYNAMO_COMMIT="bd91dca6141e05bcfbe9bd4dea54cc58b9e37d75" +# https://github.com/ai-dynamo/dynamo/pull/1938 +ARG DYNAMO_COMMIT="3c6fc6fdaf61397813cc58f4c1de7ece4c0203f0" RUN git clone https://github.com/ai-dynamo/dynamo.git && cd dynamo && git checkout ${DYNAMO_COMMIT} # install dynamo in editable mode diff --git a/examples/sglang/configs/deepseek_r1/wideep/install_mooncake_from_src.sh b/examples/sglang/configs/deepseek_r1/wideep/install_mooncake_from_src.sh new file mode 100644 index 0000000000..2f4729bdb2 --- /dev/null +++ b/examples/sglang/configs/deepseek_r1/wideep/install_mooncake_from_src.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# We've been having some trouble with the mooncake installation when we build +# the container. 
This script is ran before SGL starts up and allows us to use +# the mnnvl capabilites from mooncake main + +set -ex + +cd /sgl-workspace + +pip uninstall mooncake-transfer-engine + +git clone https://github.com/kvcache-ai/Mooncake.git +cd Mooncake +bash dependencies.sh +mkdir build +cd build +cmake .. -DUSE_MNNVL=ON +make -j +sudo make install \ No newline at end of file diff --git a/examples/sglang/docs/dsr1-wideep-gb200.md b/examples/sglang/docs/dsr1-wideep-gb200.md new file mode 100644 index 0000000000..3f4dd03477 --- /dev/null +++ b/examples/sglang/docs/dsr1-wideep-gb200.md @@ -0,0 +1,42 @@ + + +# Running DeepSeek-R1 Disaggregated with WideEP on GB200s + +Dynamo supports SGLang's GB200 implementation of wide expert parallelism and large scale P/D for DeepSeek-R1! You can read their blog post [here](https://lmsys.org/blog/2025-06-16-gb200-part-1/) for more details. Full end to end optimization is still a work in progress but you can get this up and running with the following steps. + +## Instructions + +1. Build the SGLang DeepEP container on an ARM64 machine. + +```bash +git clone https://github.com/kyleliang-nv/sglang.git +git checkout sglang_gb200_wideep_docker +cd sglang/docker +docker build -f docker/Dockerfile -t sgl-blackwell-wideep --build-arg BUILD_TYPE=blackwell --build-arg CUDA_VERSION=12.8.1 . +``` + +2. Build the Dynamo container + +```bash +cd $DYNAMO_ROOT +docker build -f container/Dockerfile.gb200 . 
-t dynamo-wideep-gb200 --no-cache +``` + + + + From 1b272af6ab6db1f17f4048e810cd7ecb7ab20fd2 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 17:58:29 +0000 Subject: [PATCH 40/65] update gb200 deployment instructions --- examples/sglang/slurm_jobs/scripts/gb200.sh | 46 ++++++++++++++------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 9879f0c567..a311d23683 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -69,23 +69,32 @@ fi # Construct command based on mode and cmd if [ "$mode" = "prefill" ]; then + # We need to install Mooncake from source inside of the container for now + bash /configs/install_mooncake_from_src.sh if [ "$cmd" = "dynamo" ]; then echo "Error: dynamo command not implemented for GB200" exit 1 elif [ "$cmd" = "sglang" ]; then # GB200 sglang prefill command + # We are not using a init-expert-location file for e2e benchmarking + # We also don't currently have a --deepep-config file for GB200 + # Need to increase --context-length to 10k for 8k1k benchmarking SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \ - SGLANG_MOONCAKE_ALLOCATOR_SO_PATH=/configs/hook.so \ + MC_TE_METRIC=true \ + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \ + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \ + SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \ SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ - NIXL_LOG_LEVEL=TRACE \ - UCX_LOG_LEVEL=debug \ - MC_FORCE_MNNVL=1 \ + NCCL_MNNVL_ENABLE=1 \ + NCCL_CUMEM_ENABLE=1 \ + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ + SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ + PYTHONUNBUFFERED=1 \ python3 -m sglang.launch_server \ --served-model-name deepseek-ai/DeepSeek-R1 \ --model-path /model/ \ --trust-remote-code \ --disaggregation-mode prefill \ - --disaggregation-transfer-backend nixl \ --dist-init-addr "$HOST_IP:$PORT" \ --nnodes "$TOTAL_NODES" \ 
--node-rank "$RANK" \ @@ -95,7 +104,7 @@ if [ "$mode" = "prefill" ]; then --host 0.0.0.0 \ --decode-log-interval 1 \ --max-running-requests 6144 \ - --context-length 10000 \ + --context-length 2716 \ --disable-radix-cache \ --enable-deepep-moe \ --deepep-mode low_latency \ @@ -107,29 +116,38 @@ if [ "$mode" = "prefill" ]; then --eplb-algorithm deepseek \ --attention-backend cutlass_mla \ --watchdog-timeout 1000000 \ + --init-export-location --disable-cuda-graph \ --chunked-prefill-size 16384 \ --max-total-tokens 32768 \ - --mem-fraction-static 0.9 \ + --mem-fraction-static 0.8 \ --log-level debug fi elif [ "$mode" = "decode" ]; then + # We need to install Mooncake from source inside of the container for now + bash /configs/install_mooncake_from_src.sh if [ "$cmd" = "dynamo" ]; then echo "Error: dynamo command not implemented for GB200" exit 1 elif [ "$cmd" = "sglang" ]; then # GB200 sglang decode command + # Need to increase --context-length to 10k for 8k1k benchmarking + # We are not using a init-expert-location file for e2e benchmarking SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=768 \ - SGLANG_NUM_RESERVED_DECODE_TOKENS=176 \ - SGLANG_MOONCAKE_ALLOCATOR_SO_PATH=/configs/hook.so \ + MC_TE_METRIC=true \ + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \ + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \ + SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \ + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \ SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ - NIXL_LOG_LEVEL=TRACE \ - UCX_LOG_LEVEL=debug \ - MC_FORCE_MNNVL=1 \ + NCCL_MNNVL_ENABLE=1 \ + NCCL_CUMEM_ENABLE=1 \ + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ + SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ + PYTHONUNBUFFERED=1 \ python3 -m sglang.launch_server \ --model-path /model/ \ --trust-remote-code \ - --disaggregation-transfer-backend mooncake \ --disaggregation-mode decode \ --dist-init-addr "$HOST_IP:$PORT" \ --nnodes "$TOTAL_NODES" \ @@ -140,7 +158,7 @@ elif [ "$mode" = "decode" ]; then --host 0.0.0.0 \ --decode-log-interval 1 
\ --max-running-requests 36864 \ - --context-length 10000 \ + --context-length 2716 \ --disable-radix-cache \ --enable-deepep-moe \ --deepep-mode low_latency \ From de7bc220224c7a5a42aed3292ec7b420d621625e Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 18:14:45 +0000 Subject: [PATCH 41/65] untested gb200 + dynamo command --- examples/sglang/slurm_jobs/scripts/gb200.sh | 99 +++++++++++++++++-- .../install_mooncake_from_src.sh | 0 2 files changed, 93 insertions(+), 6 deletions(-) rename examples/sglang/{configs/deepseek_r1/wideep => utils}/install_mooncake_from_src.sh (100%) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index a311d23683..b9cbde58e6 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -70,10 +70,55 @@ fi # Construct command based on mode and cmd if [ "$mode" = "prefill" ]; then # We need to install Mooncake from source inside of the container for now - bash /configs/install_mooncake_from_src.sh + bash /sgl-workspace/dynamo/examples/sglang/utils/install_mooncake_from_src.sh if [ "$cmd" = "dynamo" ]; then - echo "Error: dynamo command not implemented for GB200" - exit 1 + # We are not using a init-expert-location file for e2e benchmarking + # We also don't currently have a --deepep-config file for GB200 + # Need to increase --context-length to 10k for 8k1k benchmarking + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \ + MC_TE_METRIC=true \ + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \ + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \ + SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \ + SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ + NCCL_MNNVL_ENABLE=1 \ + NCCL_CUMEM_ENABLE=1 \ + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ + SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ + PYTHONUNBUFFERED=1 \ + python3 components/worker.py \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --model-path /model/ \ + --trust-remote-code \ + 
--disaggregation-mode prefill \ + --dist-init-addr "$HOST_IP:$PORT" \ + --nnodes "$TOTAL_NODES" \ + --node-rank "$RANK" \ + --tp-size "$TOTAL_GPUS" \ + --dp-size "$TOTAL_GPUS" \ + --enable-dp-attention \ + --host 0.0.0.0 \ + --decode-log-interval 1 \ + --max-running-requests 6144 \ + --context-length 2716 \ + --disable-radix-cache \ + --enable-deepep-moe \ + --deepep-mode low_latency \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-shared-experts-fusion \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm static \ + --eplb-algorithm deepseek \ + --attention-backend cutlass_mla \ + --watchdog-timeout 1000000 \ + --init-export-location + --disable-cuda-graph \ + --chunked-prefill-size 16384 \ + --max-total-tokens 32768 \ + --mem-fraction-static 0.8 \ + --log-level debug + elif [ "$cmd" = "sglang" ]; then # GB200 sglang prefill command # We are not using a init-expert-location file for e2e benchmarking @@ -125,10 +170,52 @@ if [ "$mode" = "prefill" ]; then fi elif [ "$mode" = "decode" ]; then # We need to install Mooncake from source inside of the container for now - bash /configs/install_mooncake_from_src.sh + bash /sgl-workspace/dynamo/examples/sglang/utils/install_mooncake_from_src.sh if [ "$cmd" = "dynamo" ]; then - echo "Error: dynamo command not implemented for GB200" - exit 1 + # Need to increase --context-length to 10k for 8k1k benchmarking + # We are not using a init-expert-location file for e2e benchmarking + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=768 \ + MC_TE_METRIC=true \ + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \ + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \ + SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \ + SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \ + SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ + NCCL_MNNVL_ENABLE=1 \ + NCCL_CUMEM_ENABLE=1 \ + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ + SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ + PYTHONUNBUFFERED=1 \ + python3 components/worker.py \ + --model-path /model/ \ + 
--trust-remote-code \ + --disaggregation-mode decode \ + --dist-init-addr "$HOST_IP:$PORT" \ + --nnodes "$TOTAL_NODES" \ + --node-rank "$RANK" \ + --tp-size "$TOTAL_GPUS" \ + --dp-size "$TOTAL_GPUS" \ + --enable-dp-attention \ + --host 0.0.0.0 \ + --decode-log-interval 1 \ + --max-running-requests 36864 \ + --context-length 2716 \ + --disable-radix-cache \ + --enable-deepep-moe \ + --deepep-mode low_latency \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --cuda-graph-bs 768 \ + --disable-shared-experts-fusion \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm static \ + --eplb-algorithm deepseek \ + --attention-backend cutlass_mla \ + --watchdog-timeout 1000000 \ + --chunked-prefill-size 36864 \ + --mem-fraction-static 0.82 \ + --log-level debug + elif [ "$cmd" = "sglang" ]; then # GB200 sglang decode command # Need to increase --context-length to 10k for 8k1k benchmarking diff --git a/examples/sglang/configs/deepseek_r1/wideep/install_mooncake_from_src.sh b/examples/sglang/utils/install_mooncake_from_src.sh similarity index 100% rename from examples/sglang/configs/deepseek_r1/wideep/install_mooncake_from_src.sh rename to examples/sglang/utils/install_mooncake_from_src.sh From 2d9e621454e9fd1aa07c1b97b82f4476c455eb8e Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 19:16:24 +0000 Subject: [PATCH 42/65] bump --- container/Dockerfile.sglang-gb200 | 2 -- examples/sglang/docs/dsr1-wideep-gb200.md | 9 +++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/container/Dockerfile.sglang-gb200 b/container/Dockerfile.sglang-gb200 index f1ba0ab348..542b55af7e 100644 --- a/container/Dockerfile.sglang-gb200 +++ b/container/Dockerfile.sglang-gb200 @@ -47,8 +47,6 @@ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} RUN cargo build --release RUN mkdir -p deploy/sdk/src/dynamo/sdk/cli/bin -RUN cp target/release/http deploy/sdk/src/dynamo/sdk/cli/bin -RUN cp target/release/llmctl deploy/sdk/src/dynamo/sdk/cli/bin RUN cp 
target/release/dynamo-run deploy/sdk/src/dynamo/sdk/cli/bin RUN cd lib/bindings/python && pip install --break-system-packages -e . && cd ../../.. diff --git a/examples/sglang/docs/dsr1-wideep-gb200.md b/examples/sglang/docs/dsr1-wideep-gb200.md index 3f4dd03477..d6423cc945 100644 --- a/examples/sglang/docs/dsr1-wideep-gb200.md +++ b/examples/sglang/docs/dsr1-wideep-gb200.md @@ -24,9 +24,9 @@ Dynamo supports SGLang's GB200 implementation of wide expert parallelism and lar 1. Build the SGLang DeepEP container on an ARM64 machine. ```bash -git clone https://github.com/kyleliang-nv/sglang.git -git checkout sglang_gb200_wideep_docker -cd sglang/docker +git clone https://github.com/kyleliang-nv/sglang.git # temporary +cd sglang +git checkout sglang_gb200_wideep_docker # temporary docker build -f docker/Dockerfile -t sgl-blackwell-wideep --build-arg BUILD_TYPE=blackwell --build-arg CUDA_VERSION=12.8.1 . ``` @@ -34,7 +34,8 @@ docker build -f docker/Dockerfile -t sgl-blackwell-wideep --build-arg BUILD_TYPE ```bash cd $DYNAMO_ROOT -docker build -f container/Dockerfile.gb200 . -t dynamo-wideep-gb200 --no-cache +git checkout ishan/more-slurm-targets # temporary +docker build -f container/Dockerfile.sglang-gb200 . 
-t dynamo-wideep-gb200 --no-cache ``` From 74f6ffaf4b317d20b1aff3281af4d7d75d085b3f Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 19:58:38 +0000 Subject: [PATCH 43/65] path swap --- examples/sglang/slurm_jobs/scripts/gb200.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index b9cbde58e6..9539df573d 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -70,7 +70,7 @@ fi # Construct command based on mode and cmd if [ "$mode" = "prefill" ]; then # We need to install Mooncake from source inside of the container for now - bash /sgl-workspace/dynamo/examples/sglang/utils/install_mooncake_from_src.sh + bash /configs/install_mooncake_from_src.sh if [ "$cmd" = "dynamo" ]; then # We are not using a init-expert-location file for e2e benchmarking # We also don't currently have a --deepep-config file for GB200 @@ -170,7 +170,7 @@ if [ "$mode" = "prefill" ]; then fi elif [ "$mode" = "decode" ]; then # We need to install Mooncake from source inside of the container for now - bash /sgl-workspace/dynamo/examples/sglang/utils/install_mooncake_from_src.sh + bash /configs/install_mooncake_from_src.sh if [ "$cmd" = "dynamo" ]; then # Need to increase --context-length to 10k for 8k1k benchmarking # We are not using a init-expert-location file for e2e benchmarking From 22a227f3590febbbdaa97936faac8b604dff27d5 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 20:29:04 +0000 Subject: [PATCH 44/65] ok --- examples/sglang/slurm_jobs/scripts/gb200.sh | 2 -- .../sglang/utils/install_mooncake_from_src.sh | 36 ++++++++++++++++--- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 9539df573d..ab974aaf13 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ 
b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -112,7 +112,6 @@ if [ "$mode" = "prefill" ]; then --eplb-algorithm deepseek \ --attention-backend cutlass_mla \ --watchdog-timeout 1000000 \ - --init-export-location --disable-cuda-graph \ --chunked-prefill-size 16384 \ --max-total-tokens 32768 \ @@ -161,7 +160,6 @@ if [ "$mode" = "prefill" ]; then --eplb-algorithm deepseek \ --attention-backend cutlass_mla \ --watchdog-timeout 1000000 \ - --init-export-location --disable-cuda-graph \ --chunked-prefill-size 16384 \ --max-total-tokens 32768 \ diff --git a/examples/sglang/utils/install_mooncake_from_src.sh b/examples/sglang/utils/install_mooncake_from_src.sh index 2f4729bdb2..15f757c3f4 100644 --- a/examples/sglang/utils/install_mooncake_from_src.sh +++ b/examples/sglang/utils/install_mooncake_from_src.sh @@ -3,18 +3,46 @@ # We've been having some trouble with the mooncake installation when we build # the container. This script is ran before SGL starts up and allows us to use # the mnnvl capabilites from mooncake main +# +# Usage: ./install_mooncake.sh +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi -set -ex +MODE="$1" +case "$MODE" in + dynamo) + SUDO="" + ;; + sglang) + SUDO="sudo" + ;; + *) + echo "Error: invalid mode '$MODE'. Use 'dynamo' or 'sglang'." + exit 1 + ;; +esac cd /sgl-workspace -pip uninstall mooncake-transfer-engine +# Clean up previous build +$SUDO rm -rf Mooncake/ +# Uninstall any existing package +pip uninstall -y mooncake-transfer-engine + +# Clone & build git clone https://github.com/kvcache-ai/Mooncake.git cd Mooncake bash dependencies.sh -mkdir build + +mkdir -p build cd build cmake .. -DUSE_MNNVL=ON make -j -sudo make install \ No newline at end of file + +# Install (with sudo if in sglang mode) +$SUDO make install + +echo "Mooncake built and installed in '$MODE' mode." 
\ No newline at end of file From f32a4228d3f44cecad5357afc38fb6290fba9d91 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 20:47:36 +0000 Subject: [PATCH 45/65] cmd --- examples/sglang/slurm_jobs/scripts/gb200.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index ab974aaf13..73ac48c612 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -70,7 +70,7 @@ fi # Construct command based on mode and cmd if [ "$mode" = "prefill" ]; then # We need to install Mooncake from source inside of the container for now - bash /configs/install_mooncake_from_src.sh + bash /configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # We are not using a init-expert-location file for e2e benchmarking # We also don't currently have a --deepep-config file for GB200 @@ -168,7 +168,7 @@ if [ "$mode" = "prefill" ]; then fi elif [ "$mode" = "decode" ]; then # We need to install Mooncake from source inside of the container for now - bash /configs/install_mooncake_from_src.sh + bash /configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # Need to increase --context-length to 10k for 8k1k benchmarking # We are not using a init-expert-location file for e2e benchmarking From 0a02c8aae5d6bf9c906dd4aa9da93161c1831731 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 20:57:48 +0000 Subject: [PATCH 46/65] try something else --- examples/sglang/slurm_jobs/scripts/gb200.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 73ac48c612..79f439fdfb 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -70,7 +70,7 @@ fi # Construct command based on mode and cmd if [ "$mode" = "prefill" ]; then # We need 
to install Mooncake from source inside of the container for now - bash /configs/install_mooncake_from_src.sh $cmd + ./configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # We are not using a init-expert-location file for e2e benchmarking # We also don't currently have a --deepep-config file for GB200 @@ -168,7 +168,7 @@ if [ "$mode" = "prefill" ]; then fi elif [ "$mode" = "decode" ]; then # We need to install Mooncake from source inside of the container for now - bash /configs/install_mooncake_from_src.sh $cmd + ./configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # Need to increase --context-length to 10k for 8k1k benchmarking # We are not using a init-expert-location file for e2e benchmarking From a24763da54e8f751a09045046e0f8e2b8fdfbd30 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 21:08:24 +0000 Subject: [PATCH 47/65] keep us as root to install mooncake deps as it needs sudo --- examples/sglang/slurm_jobs/job_script_template.j2 | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index 2e873c42fa..a9d7388ea2 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -57,7 +57,6 @@ ENROOT_ARGS="\ --container-image=${CONTAINER_IMAGE} \ --no-container-entrypoint \ --no-container-mount-home \ - --no-container-remap-root \ --container-mounts=${MODEL_DIR}:/model/,${CONFIG_DIR}:/configs/,${SCRIPT_DIR}:/scripts/,${OUTPUT_DIR}:/outputs/,${LOG_DIR}:/logs/ \ " From d6a6a3ebdd8a2f7063979083c0ac44ffc1d8d9e0 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 15 Jul 2025 21:09:07 +0000 Subject: [PATCH 48/65] try --- examples/sglang/slurm_jobs/scripts/gb200.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 79f439fdfb..73ac48c612 100755 
--- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -70,7 +70,7 @@ fi # Construct command based on mode and cmd if [ "$mode" = "prefill" ]; then # We need to install Mooncake from source inside of the container for now - ./configs/install_mooncake_from_src.sh $cmd + bash /configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # We are not using a init-expert-location file for e2e benchmarking # We also don't currently have a --deepep-config file for GB200 @@ -168,7 +168,7 @@ if [ "$mode" = "prefill" ]; then fi elif [ "$mode" = "decode" ]; then # We need to install Mooncake from source inside of the container for now - ./configs/install_mooncake_from_src.sh $cmd + bash /configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # Need to increase --context-length to 10k for 8k1k benchmarking # We are not using a init-expert-location file for e2e benchmarking From 87578320b1ea917de0a56261ab0441bec9029f0c Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 16 Jul 2025 00:56:55 +0000 Subject: [PATCH 49/65] revert to sgl balancer and fix scripts and add MC MNNVL flag --- examples/sglang/slurm_jobs/scripts/gb200.sh | 4 ++ .../sglang/slurm_jobs/scripts/worker_setup.py | 12 +++--- .../sglang/utils/install_mooncake_from_src.sh | 41 ++++++------------- 3 files changed, 21 insertions(+), 36 deletions(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 73ac48c612..0a7ae5d25b 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -81,6 +81,7 @@ if [ "$mode" = "prefill" ]; then SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \ SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \ SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ + MC_FORCE_MNNVL=1 \ NCCL_MNNVL_ENABLE=1 \ NCCL_CUMEM_ENABLE=1 \ SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ @@ -130,6 +131,7 @@ if [ "$mode" = "prefill" ]; then 
SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \ SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ NCCL_MNNVL_ENABLE=1 \ + MC_FORCE_MNNVL=1 \ NCCL_CUMEM_ENABLE=1 \ SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ @@ -180,6 +182,7 @@ elif [ "$mode" = "decode" ]; then SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \ SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ NCCL_MNNVL_ENABLE=1 \ + MC_FORCE_MNNVL=1 \ NCCL_CUMEM_ENABLE=1 \ SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ @@ -226,6 +229,7 @@ elif [ "$mode" = "decode" ]; then SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \ SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ NCCL_MNNVL_ENABLE=1 \ + MC_FORCE_MNNVL=1 \ NCCL_CUMEM_ENABLE=1 \ SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ diff --git a/examples/sglang/slurm_jobs/scripts/worker_setup.py b/examples/sglang/slurm_jobs/scripts/worker_setup.py index 5e6aa9309a..5071c6f25b 100644 --- a/examples/sglang/slurm_jobs/scripts/worker_setup.py +++ b/examples/sglang/slurm_jobs/scripts/worker_setup.py @@ -208,14 +208,12 @@ def _validate_args(args: argparse.Namespace) -> None: def get_sglang_mini_lb_command_args(prefill_host_ip: str, decode_host_ip: str) -> str: cmd = ( - "pip install sglang-router && " - f"python3 -m sglang_router.launch_router " - f"--policy random " - f"--pd-disaggregation " - f"--prefill http://{prefill_host_ip}:30000 30001 " + f"python3 -m sglang.srt.disaggregation.launch_lb " + f"--prefill http://{prefill_host_ip}:30000 " f"--decode http://{decode_host_ip}:30000 " - f"--host 0.0.0.0 " - f"--port 8000 " + "--host 0.0.0.0 " + "--port 8000 " + "--timeout 3600" ) return cmd diff --git a/examples/sglang/utils/install_mooncake_from_src.sh b/examples/sglang/utils/install_mooncake_from_src.sh index 15f757c3f4..c3284d09b0 100644 --- a/examples/sglang/utils/install_mooncake_from_src.sh +++ b/examples/sglang/utils/install_mooncake_from_src.sh @@ -3,46 +3,29 @@ # We've been having some trouble with the mooncake 
installation when we build # the container. This script is ran before SGL starts up and allows us to use # the mnnvl capabilites from mooncake main -# -# Usage: ./install_mooncake.sh -if [ "$#" -ne 1 ]; then - echo "Usage: $0 " - exit 1 -fi - -MODE="$1" -case "$MODE" in - dynamo) - SUDO="" - ;; - sglang) - SUDO="sudo" - ;; - *) - echo "Error: invalid mode '$MODE'. Use 'dynamo' or 'sglang'." - exit 1 - ;; -esac cd /sgl-workspace -# Clean up previous build -$SUDO rm -rf Mooncake/ +# Try to set this +export TORCH_CUDA_ARCH_LIST=10.0 + +echo $LD_LIBRARY_PATH # Uninstall any existing package -pip uninstall -y mooncake-transfer-engine +#pip install --break-system-packages mooncake-transfer-engine # Clone & build -git clone https://github.com/kvcache-ai/Mooncake.git +git clone https://github.com/ishandhanani/Mooncake.git cd Mooncake -bash dependencies.sh - +git checkout ishan/manual-nvl-installation +bash dependencies.sh -y mkdir -p build cd build cmake .. -DUSE_MNNVL=ON make -j -# Install (with sudo if in sglang mode) -$SUDO make install +make install + +chmod +x /usr/local/lib/python3.10/dist-packages/mooncake/nvlink_allocator.so -echo "Mooncake built and installed in '$MODE' mode." 
\ No newline at end of file +echo "Mooncake built and installed" From 462c4a8dea1cba0f0e7ad75b1e0a288ad4eab9ae Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 16 Jul 2025 02:39:04 +0000 Subject: [PATCH 50/65] bump --- examples/sglang/slurm_jobs/job_script_template.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/sglang/slurm_jobs/job_script_template.j2 b/examples/sglang/slurm_jobs/job_script_template.j2 index a9d7388ea2..a0959fbb91 100755 --- a/examples/sglang/slurm_jobs/job_script_template.j2 +++ b/examples/sglang/slurm_jobs/job_script_template.j2 @@ -7,6 +7,7 @@ #SBATCH --time={{ time_limit }} #SBATCH --output=logs/%j/log.out #SBATCH --error=logs/%j/log.err +#SBATCH --partition=36x2-a01r # Constants PREFILL_NODES={{ prefill_nodes }} From 3f5361ef3e5566c40643572c983dc432871ca84e Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 16 Jul 2025 18:59:05 +0000 Subject: [PATCH 51/65] init instructions for others --- container/Dockerfile.sglang-gb200 | 17 +-------- examples/sglang/docs/dsr1-wideep-gb200.md | 38 ++++++++++++++++++- examples/sglang/slurm_jobs/scripts/gb200.sh | 1 + .../sglang/utils/install_mooncake_from_src.sh | 6 ++- 4 files changed, 43 insertions(+), 19 deletions(-) diff --git a/container/Dockerfile.sglang-gb200 b/container/Dockerfile.sglang-gb200 index 542b55af7e..9bc3fef5da 100644 --- a/container/Dockerfile.sglang-gb200 +++ b/container/Dockerfile.sglang-gb200 @@ -6,7 +6,7 @@ ARG ARCH_ALT=aarch64 WORKDIR /sgl-workspace # https://github.com/ai-dynamo/dynamo/pull/1938 -ARG DYNAMO_COMMIT="3c6fc6fdaf61397813cc58f4c1de7ece4c0203f0" +ARG DYNAMO_COMMIT="aba60996f225038b691d9f255da515b27695b179" RUN git clone https://github.com/ai-dynamo/dynamo.git && cd dynamo && git checkout ${DYNAMO_COMMIT} # install dynamo in editable mode @@ -64,21 +64,6 @@ RUN wget --tries=3 --waitretry=5 https://github.com/etcd-io/etcd/releases/downlo rm /tmp/etcd.tar.gz ENV PATH=/usr/local/bin/etcd/:$PATH -# Install perf_analyzer and genai-perf -RUN apt-get 
update -y && \ - apt-get install -y --no-install-recommends \ - rapidjson-dev \ - zlib1g-dev - -RUN git clone --depth=1 https://github.com/triton-inference-server/perf_analyzer.git && \ - mkdir perf_analyzer/build && \ - cmake -B perf_analyzer/build -S perf_analyzer && \ - cmake --build perf_analyzer/build -- -j8 - -ENV PATH=/sgl-workspace/perf_analyzer/build/perf_analyzer/src/perf-analyzer-build:$PATH - -RUN pip install --break-system-packages genai-perf - COPY examples/sglang/configs/deepseek_r1/wideep/* /sgl-workspace/dynamo/examples/sglang/configs/ COPY examples/sglang/utils/benchmarking/* /sgl-workspace/dynamo/examples/sglang/utils/ diff --git a/examples/sglang/docs/dsr1-wideep-gb200.md b/examples/sglang/docs/dsr1-wideep-gb200.md index d6423cc945..8c712cb7dd 100644 --- a/examples/sglang/docs/dsr1-wideep-gb200.md +++ b/examples/sglang/docs/dsr1-wideep-gb200.md @@ -23,21 +23,55 @@ Dynamo supports SGLang's GB200 implementation of wide expert parallelism and lar 1. Build the SGLang DeepEP container on an ARM64 machine. +> [!NOTE] +> This sglang side branch is based on an open [PR](https://github.com/sgl-project/sglang/pull/7721/files) to SGLang that allows their main dockerfile to be built for aarch64. Once that PR is merged in, we can add the gb200 dockerfile to the main sglang repo. + ```bash -git clone https://github.com/kyleliang-nv/sglang.git # temporary +git clone https://github.com/kyleliang-nv/sglang.git cd sglang -git checkout sglang_gb200_wideep_docker # temporary +git checkout sglang_gb200_wideep_docker docker build -f docker/Dockerfile -t sgl-blackwell-wideep --build-arg BUILD_TYPE=blackwell --build-arg CUDA_VERSION=12.8.1 . ``` 2. Build the Dynamo container +> [!NOTE] +> This is a side branch that contains all of the scripts to run on GB200s. Once the PR is merged in, we can switch to the main branch. + ```bash cd $DYNAMO_ROOT git checkout ishan/more-slurm-targets # temporary docker build -f container/Dockerfile.sglang-gb200 . 
-t dynamo-wideep-gb200 --no-cache ``` +3. In your SLURM cluster, clone dynamo and switch to this side branch. + +```bash +git clone https://github.com/ai-dynamo/dynamo.git +git checkout ishan/more-slurm-targets +cd examples/sglang/slurm_jobs +``` +4. Ensure you have the proper paths that you can use to mount things to the container +- The path to the DSR1 model which should be mounted to the `--model-dir` flag +- The path to the `install_mooncake_from_src.sh` which will be mounted to the `--config-dir` flag + +5. Run the following command to submit the job + +```bash +python3 submit_job_script.py \ + --template job_script_template.j2 \ + --model-dir \ + --container-image \ + --account \ + --gpus-per-node 4 \ + --config-dir \ + --network-interface enp138s0f0np0 \ + --gpu-type gb200 \ + --use-sglang-commands \ + --prefill-nodes 2 \ + --decode-nodes 12 +``` +6. This will create a logs directory in the `examples/sglang/slurm_jobs` directory. You can `cd` into the directory, cd into your job id, and then run `tail -f *_prefill.err *_decode.err` or `tail -f *_prefill.out *_decode.out` to see the logs. 
\ No newline at end of file diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 0a7ae5d25b..1030424e57 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -66,6 +66,7 @@ if [ -z "$TOTAL_NODES" ]; then exit 1 fi +# TODO: since the args for sglang and dynamo are the same, we can be a bit cleaner here # Construct command based on mode and cmd if [ "$mode" = "prefill" ]; then diff --git a/examples/sglang/utils/install_mooncake_from_src.sh b/examples/sglang/utils/install_mooncake_from_src.sh index c3284d09b0..9dc918f756 100644 --- a/examples/sglang/utils/install_mooncake_from_src.sh +++ b/examples/sglang/utils/install_mooncake_from_src.sh @@ -15,9 +15,13 @@ echo $LD_LIBRARY_PATH #pip install --break-system-packages mooncake-transfer-engine # Clone & build +# Once Mooncake main branch has fixed +# 1. proper g++ compilation +# 2. solved std::function call issue - we can swap back to ToT +# As of 7/16 10:20AM PST - I've been told its was solved but I have not been able to test it E2E git clone https://github.com/ishandhanani/Mooncake.git cd Mooncake -git checkout ishan/manual-nvl-installation +git checkout ishan/pr-571-diff-build bash dependencies.sh -y mkdir -p build cd build From b258074cafbda678064a045822907b9aafb00777 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 16 Jul 2025 19:02:24 +0000 Subject: [PATCH 52/65] atempt --- examples/sglang/docs/dsr1-wideep-gb200.md | 2 ++ examples/sglang/utils/install_mooncake_from_src.sh | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/sglang/docs/dsr1-wideep-gb200.md b/examples/sglang/docs/dsr1-wideep-gb200.md index 8c712cb7dd..d2e33a10b2 100644 --- a/examples/sglang/docs/dsr1-wideep-gb200.md +++ b/examples/sglang/docs/dsr1-wideep-gb200.md @@ -74,4 +74,6 @@ python3 submit_job_script.py \ --decode-nodes 12 ``` +**UNTESTED**: if you want to spin up dynamo, you can remove the 
`--use-sglang-commands` flag. + 6. This will create a logs directory in the `examples/sglang/slurm_jobs` directory. You can `cd` into the directory, cd into your job id, and then run `tail -f *_prefill.err *_decode.err` or `tail -f *_prefill.out *_decode.out` to see the logs. \ No newline at end of file diff --git a/examples/sglang/utils/install_mooncake_from_src.sh b/examples/sglang/utils/install_mooncake_from_src.sh index 9dc918f756..85e2c48379 100644 --- a/examples/sglang/utils/install_mooncake_from_src.sh +++ b/examples/sglang/utils/install_mooncake_from_src.sh @@ -11,14 +11,15 @@ export TORCH_CUDA_ARCH_LIST=10.0 echo $LD_LIBRARY_PATH -# Uninstall any existing package -#pip install --break-system-packages mooncake-transfer-engine +# Uninstall any existing mooncake package +pip install --break-system-packages mooncake-transfer-engine # Clone & build # Once Mooncake main branch has fixed # 1. proper g++ compilation # 2. solved std::function call issue - we can swap back to ToT # As of 7/16 10:20AM PST - I've been told its was solved but I have not been able to test it E2E +# So for now we will stay on my side branch git clone https://github.com/ishandhanani/Mooncake.git cd Mooncake git checkout ishan/pr-571-diff-build From b195bbfcaa757b288b53edd3cdec5cf54274aa40 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 16 Jul 2025 19:35:30 +0000 Subject: [PATCH 53/65] bump --- examples/sglang/docs/dsr1-wideep-gb200.md | 4 ++-- examples/sglang/slurm_jobs/scripts/gb200.sh | 8 ++++---- examples/sglang/utils/install_mooncake_from_src.sh | 9 ++++----- 3 files changed, 10 insertions(+), 11 deletions(-) mode change 100644 => 100755 examples/sglang/utils/install_mooncake_from_src.sh diff --git a/examples/sglang/docs/dsr1-wideep-gb200.md b/examples/sglang/docs/dsr1-wideep-gb200.md index d2e33a10b2..c2ff937789 100644 --- a/examples/sglang/docs/dsr1-wideep-gb200.md +++ b/examples/sglang/docs/dsr1-wideep-gb200.md @@ -23,7 +23,7 @@ Dynamo supports SGLang's GB200 implementation 
of wide expert parallelism and lar 1. Build the SGLang DeepEP container on an ARM64 machine. -> [!NOTE] +> [!NOTE] > This sglang side branch is based on an open [PR](https://github.com/sgl-project/sglang/pull/7721/files) to SGLang that allows their main dockerfile to be built for aarch64. Once that PR is merged in, we can add the gb200 dockerfile to the main sglang repo. ```bash @@ -35,7 +35,7 @@ docker build -f docker/Dockerfile -t sgl-blackwell-wideep --build-arg BUILD_TYPE 2. Build the Dynamo container -> [!NOTE] +> [!NOTE] > This is a side branch that contains all of the scripts to run on GB200s. Once the PR is merged in, we can switch to the main branch. ```bash diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 1030424e57..e454e7c55a 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -70,7 +70,7 @@ fi # Construct command based on mode and cmd if [ "$mode" = "prefill" ]; then - # We need to install Mooncake from source inside of the container for now + # We need to install Mooncake from source inside of the container for now bash /configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # We are not using a init-expert-location file for e2e benchmarking @@ -119,7 +119,7 @@ if [ "$mode" = "prefill" ]; then --max-total-tokens 32768 \ --mem-fraction-static 0.8 \ --log-level debug - + elif [ "$cmd" = "sglang" ]; then # GB200 sglang prefill command # We are not using a init-expert-location file for e2e benchmarking @@ -170,7 +170,7 @@ if [ "$mode" = "prefill" ]; then --log-level debug fi elif [ "$mode" = "decode" ]; then - # We need to install Mooncake from source inside of the container for now + # We need to install Mooncake from source inside of the container for now bash /configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # Need to increase --context-length to 10k for 8k1k benchmarking @@ -217,7 +217,7 @@ 
elif [ "$mode" = "decode" ]; then --chunked-prefill-size 36864 \ --mem-fraction-static 0.82 \ --log-level debug - + elif [ "$cmd" = "sglang" ]; then # GB200 sglang decode command # Need to increase --context-length to 10k for 8k1k benchmarking diff --git a/examples/sglang/utils/install_mooncake_from_src.sh b/examples/sglang/utils/install_mooncake_from_src.sh old mode 100644 new mode 100755 index 85e2c48379..9281a599f4 --- a/examples/sglang/utils/install_mooncake_from_src.sh +++ b/examples/sglang/utils/install_mooncake_from_src.sh @@ -1,4 +1,6 @@ #!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # We've been having some trouble with the mooncake installation when we build # the container. This script is ran before SGL starts up and allows us to use @@ -11,15 +13,12 @@ export TORCH_CUDA_ARCH_LIST=10.0 echo $LD_LIBRARY_PATH -# Uninstall any existing mooncake package -pip install --break-system-packages mooncake-transfer-engine - # Clone & build -# Once Mooncake main branch has fixed +# Once Mooncake main branch has fixed # 1. proper g++ compilation # 2. 
solved std::function call issue - we can swap back to ToT # As of 7/16 10:20AM PST - I've been told its was solved but I have not been able to test it E2E -# So for now we will stay on my side branch +# So for now we will stay on my side branch git clone https://github.com/ishandhanani/Mooncake.git cd Mooncake git checkout ishan/pr-571-diff-build From da162bc6f7bf9fbe6b4ef366a470f618c5fc742c Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 16 Jul 2025 23:23:45 +0000 Subject: [PATCH 54/65] bump --- container/Dockerfile.sglang-gb200 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/container/Dockerfile.sglang-gb200 b/container/Dockerfile.sglang-gb200 index 9bc3fef5da..ceaa1c51c5 100644 --- a/container/Dockerfile.sglang-gb200 +++ b/container/Dockerfile.sglang-gb200 @@ -67,4 +67,6 @@ ENV PATH=/usr/local/bin/etcd/:$PATH COPY examples/sglang/configs/deepseek_r1/wideep/* /sgl-workspace/dynamo/examples/sglang/configs/ COPY examples/sglang/utils/benchmarking/* /sgl-workspace/dynamo/examples/sglang/utils/ +ENV PYTHONPATH=/workspace/dynamo/deploy/sdk/src:/workspace/dynamo/components/planner/src:/workspace/examples/sglang:$PYTHONPATH + WORKDIR /sgl-workspace/dynamo/examples/sglang \ No newline at end of file From 4266fe030e41ed79ddcfe7b3d2b10f6224d2f168 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Wed, 16 Jul 2025 23:49:42 +0000 Subject: [PATCH 55/65] lel --- container/Dockerfile.sglang-gb200 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/container/Dockerfile.sglang-gb200 b/container/Dockerfile.sglang-gb200 index ceaa1c51c5..48f7e34e62 100644 --- a/container/Dockerfile.sglang-gb200 +++ b/container/Dockerfile.sglang-gb200 @@ -67,6 +67,6 @@ ENV PATH=/usr/local/bin/etcd/:$PATH COPY examples/sglang/configs/deepseek_r1/wideep/* /sgl-workspace/dynamo/examples/sglang/configs/ COPY examples/sglang/utils/benchmarking/* /sgl-workspace/dynamo/examples/sglang/utils/ -ENV 
PYTHONPATH=/workspace/dynamo/deploy/sdk/src:/workspace/dynamo/components/planner/src:/workspace/examples/sglang:$PYTHONPATH +ENV PYTHONPATH=/sgl-workspace/dynamo/deploy/sdk/src:/workspace/dynamo/components/planner/src:/workspace/examples/sglang:$PYTHONPATH WORKDIR /sgl-workspace/dynamo/examples/sglang \ No newline at end of file From f6ab522b95c22b879c0e0f6a1bad1ef007540490 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 17 Jul 2025 00:32:16 +0000 Subject: [PATCH 56/65] bump --- examples/sglang/docs/dsr1-wideep-gb200.md | 5 ++- examples/sglang/slurm_jobs/scripts/gb200.sh | 4 --- .../sglang/utils/install_mooncake_from_src.sh | 35 ------------------- 3 files changed, 2 insertions(+), 42 deletions(-) delete mode 100755 examples/sglang/utils/install_mooncake_from_src.sh diff --git a/examples/sglang/docs/dsr1-wideep-gb200.md b/examples/sglang/docs/dsr1-wideep-gb200.md index c2ff937789..e7874f64fb 100644 --- a/examples/sglang/docs/dsr1-wideep-gb200.md +++ b/examples/sglang/docs/dsr1-wideep-gb200.md @@ -54,8 +54,7 @@ cd examples/sglang/slurm_jobs 4. Ensure you have the proper paths that you can use to mount things to the container -- The path to the DSR1 model which should be mounted to the `--model-dir` flag -- The path to the `install_mooncake_from_src.sh` which will be mounted to the `--config-dir` flag +- The path to the DSR1 model which should be mounted to the `--model-dir` flag and `--config-dir` flag 5. 
Run the following command to submit the job @@ -66,7 +65,7 @@ python3 submit_job_script.py \ --container-image \ --account \ --gpus-per-node 4 \ - --config-dir \ + --config-dir \ --network-interface enp138s0f0np0 \ --gpu-type gb200 \ --use-sglang-commands \ diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index e454e7c55a..486c2a6c7e 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -70,8 +70,6 @@ fi # Construct command based on mode and cmd if [ "$mode" = "prefill" ]; then - # We need to install Mooncake from source inside of the container for now - bash /configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # We are not using a init-expert-location file for e2e benchmarking # We also don't currently have a --deepep-config file for GB200 @@ -170,8 +168,6 @@ if [ "$mode" = "prefill" ]; then --log-level debug fi elif [ "$mode" = "decode" ]; then - # We need to install Mooncake from source inside of the container for now - bash /configs/install_mooncake_from_src.sh $cmd if [ "$cmd" = "dynamo" ]; then # Need to increase --context-length to 10k for 8k1k benchmarking # We are not using a init-expert-location file for e2e benchmarking diff --git a/examples/sglang/utils/install_mooncake_from_src.sh b/examples/sglang/utils/install_mooncake_from_src.sh deleted file mode 100755 index 9281a599f4..0000000000 --- a/examples/sglang/utils/install_mooncake_from_src.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -# We've been having some trouble with the mooncake installation when we build -# the container. 
This script is ran before SGL starts up and allows us to use -# the mnnvl capabilites from mooncake main - -cd /sgl-workspace - -# Try to set this -export TORCH_CUDA_ARCH_LIST=10.0 - -echo $LD_LIBRARY_PATH - -# Clone & build -# Once Mooncake main branch has fixed -# 1. proper g++ compilation -# 2. solved std::function call issue - we can swap back to ToT -# As of 7/16 10:20AM PST - I've been told its was solved but I have not been able to test it E2E -# So for now we will stay on my side branch -git clone https://github.com/ishandhanani/Mooncake.git -cd Mooncake -git checkout ishan/pr-571-diff-build -bash dependencies.sh -y -mkdir -p build -cd build -cmake .. -DUSE_MNNVL=ON -make -j - -make install - -chmod +x /usr/local/lib/python3.10/dist-packages/mooncake/nvlink_allocator.so - -echo "Mooncake built and installed" From 2c3f0855c84817788bdf978f8a5729e6173b1f71 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 17 Jul 2025 00:56:11 +0000 Subject: [PATCH 57/65] bump --- examples/sglang/slurm_jobs/scripts/gb200.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 486c2a6c7e..aee53c3407 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -92,6 +92,7 @@ if [ "$mode" = "prefill" ]; then --trust-remote-code \ --disaggregation-mode prefill \ --dist-init-addr "$HOST_IP:$PORT" \ + --disaggregation-bootstrap-port 30001 \ --nnodes "$TOTAL_NODES" \ --node-rank "$RANK" \ --tp-size "$TOTAL_GPUS" \ @@ -141,6 +142,7 @@ if [ "$mode" = "prefill" ]; then --trust-remote-code \ --disaggregation-mode prefill \ --dist-init-addr "$HOST_IP:$PORT" \ + --disaggregation-bootstrap-port 30001 \ --nnodes "$TOTAL_NODES" \ --node-rank "$RANK" \ --tp-size "$TOTAL_GPUS" \ From aa123a1ee0c5a01ffa9e5c5a50279c6351e98b3e Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 17 Jul 2025 01:34:02 +0000 Subject: [PATCH 58/65] bump --- 
examples/sglang/slurm_jobs/scripts/gb200.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index aee53c3407..439e977982 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -191,6 +191,7 @@ elif [ "$mode" = "decode" ]; then --trust-remote-code \ --disaggregation-mode decode \ --dist-init-addr "$HOST_IP:$PORT" \ + --disaggregation-bootstrap-port 30001 \ --nnodes "$TOTAL_NODES" \ --node-rank "$RANK" \ --tp-size "$TOTAL_GPUS" \ @@ -238,6 +239,7 @@ elif [ "$mode" = "decode" ]; then --trust-remote-code \ --disaggregation-mode decode \ --dist-init-addr "$HOST_IP:$PORT" \ + --disaggregation-bootstrap-port 30001 \ --nnodes "$TOTAL_NODES" \ --node-rank "$RANK" \ --tp-size "$TOTAL_GPUS" \ From 58f4d330aa1167725bd8c5cf271d01eced13925d Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 17 Jul 2025 01:51:26 +0000 Subject: [PATCH 59/65] sadness --- examples/sglang/slurm_jobs/scripts/gb200.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 439e977982..88e5d954ce 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -186,7 +186,7 @@ elif [ "$mode" = "decode" ]; then SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ PYTHONUNBUFFERED=1 \ - python3 components/worker.py \ + python3 components/decode_worker.py \ --model-path /model/ \ --trust-remote-code \ --disaggregation-mode decode \ From d4fc6be1f3b675aaa3b2c37ed771ad5ca2a4c997 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 17 Jul 2025 02:12:12 +0000 Subject: [PATCH 60/65] so close to crash out --- examples/sglang/slurm_jobs/scripts/gb200.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh 
b/examples/sglang/slurm_jobs/scripts/gb200.sh index 88e5d954ce..2b46f346a0 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -89,6 +89,7 @@ if [ "$mode" = "prefill" ]; then python3 components/worker.py \ --served-model-name deepseek-ai/DeepSeek-R1 \ --model-path /model/ \ + --skip-tokenizer-init \ --trust-remote-code \ --disaggregation-mode prefill \ --dist-init-addr "$HOST_IP:$PORT" \ @@ -187,7 +188,9 @@ elif [ "$mode" = "decode" ]; then SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ PYTHONUNBUFFERED=1 \ python3 components/decode_worker.py \ + --served-model-name deepseek-ai/DeepSeek-R1 \ --model-path /model/ \ + --skip-tokenizer-init \ --trust-remote-code \ --disaggregation-mode decode \ --dist-init-addr "$HOST_IP:$PORT" \ From 4394938815e9202e77e3b65bf6a331e6621f3e2c Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 17 Jul 2025 02:25:45 +0000 Subject: [PATCH 61/65] bump --- examples/sglang/docs/dsr1-wideep-gb200.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sglang/docs/dsr1-wideep-gb200.md b/examples/sglang/docs/dsr1-wideep-gb200.md index e7874f64fb..bb07ccb424 100644 --- a/examples/sglang/docs/dsr1-wideep-gb200.md +++ b/examples/sglang/docs/dsr1-wideep-gb200.md @@ -73,6 +73,6 @@ python3 submit_job_script.py \ --decode-nodes 12 ``` -**UNTESTED**: if you want to spin up dynamo, you can remove the `--use-sglang-commands` flag. +**Note**: if you want to spin up dynamo, you can remove the `--use-sglang-commands` flag. 6. This will create a logs directory in the `examples/sglang/slurm_jobs` directory. You can `cd` into the directory, cd into your job id, and then run `tail -f *_prefill.err *_decode.err` or `tail -f *_prefill.out *_decode.out` to see the logs. 
\ No newline at end of file From 11d68c26102c4f0a447867b9bbdee3816ba5db26 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 17 Jul 2025 19:49:27 +0000 Subject: [PATCH 62/65] update the gb200 dockerfile --- container/Dockerfile.sglang-gb200 | 33 +++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/container/Dockerfile.sglang-gb200 b/container/Dockerfile.sglang-gb200 index 48f7e34e62..5af553feb1 100644 --- a/container/Dockerfile.sglang-gb200 +++ b/container/Dockerfile.sglang-gb200 @@ -69,4 +69,37 @@ COPY examples/sglang/utils/benchmarking/* /sgl-workspace/dynamo/examples/sglang/ ENV PYTHONPATH=/sgl-workspace/dynamo/deploy/sdk/src:/workspace/dynamo/components/planner/src:/workspace/examples/sglang:$PYTHONPATH +# properly install cmake so that gap can be installed +RUN cmake --version + +ARG CMAKE_VERSION=3.31.8 +RUN mkdir /sgl-workspace/cmake_build +WORKDIR /sgl-workspace/cmake_build + +# uninstall CMake +RUN apt-get purge -y cmake +# download newer version of CMake +RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz && \ + tar -xvzf cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz && \ + mv cmake-${CMAKE_VERSION}-linux-$(uname -m) custom_cmake +ENV PATH=/sgl-workspace/cmake_build/custom_cmake/bin:$PATH + +# should be 3.31.8 +RUN cmake --version + +# Install perf_analyzer and genai-perf +RUN apt-get update -y && \ + apt-get install -y --no-install-recommends \ + rapidjson-dev \ + zlib1g-dev + +RUN git clone --depth=1 https://github.com/triton-inference-server/perf_analyzer.git && \ + mkdir perf_analyzer/build && \ + cmake -B perf_analyzer/build -S perf_analyzer && \ + cmake --build perf_analyzer/build -- -j8 + +ENV PATH=/sgl-workspace/perf_analyzer/build/perf_analyzer/src/perf-analyzer-build:$PATH + +RUN pip install --break-system-packages genai-perf + WORKDIR /sgl-workspace/dynamo/examples/sglang \ No newline at end of file From 
b40e60ea1b031b012830f91e929cff12da1d9cdc Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 29 Jul 2025 15:55:54 +0000 Subject: [PATCH 63/65] nixl --- examples/sglang/slurm_jobs/scripts/gb200.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/sglang/slurm_jobs/scripts/gb200.sh b/examples/sglang/slurm_jobs/scripts/gb200.sh index 2b46f346a0..cbc8cbce88 100755 --- a/examples/sglang/slurm_jobs/scripts/gb200.sh +++ b/examples/sglang/slurm_jobs/scripts/gb200.sh @@ -94,6 +94,7 @@ if [ "$mode" = "prefill" ]; then --disaggregation-mode prefill \ --dist-init-addr "$HOST_IP:$PORT" \ --disaggregation-bootstrap-port 30001 \ + --disaggregation-transfer-backend nixl \ --nnodes "$TOTAL_NODES" \ --node-rank "$RANK" \ --tp-size "$TOTAL_GPUS" \ From a4decf81db825f1f91e102fe8e0313ebd3a16b69 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 31 Jul 2025 18:57:27 +0000 Subject: [PATCH 64/65] feat(docs): update DeepSeek-R1 instructions for GB200 and WideEP container configuration --- .../backends/sglang/docs/dsr1-wideep-gb200.md | 50 +++-- .../backends/sglang/docs/dsr1-wideep-h100.md | 2 +- container/Dockerfile.sglang-gb200 | 105 --------- container/Dockerfile.sglang-wideep | 207 ++++++++---------- 4 files changed, 125 insertions(+), 239 deletions(-) delete mode 100644 container/Dockerfile.sglang-gb200 diff --git a/components/backends/sglang/docs/dsr1-wideep-gb200.md b/components/backends/sglang/docs/dsr1-wideep-gb200.md index bb07ccb424..757dbc0e6b 100644 --- a/components/backends/sglang/docs/dsr1-wideep-gb200.md +++ b/components/backends/sglang/docs/dsr1-wideep-gb200.md @@ -17,34 +17,54 @@ limitations under the License. # Running DeepSeek-R1 Disaggregated with WideEP on GB200s -Dynamo supports SGLang's GB200 implementation of wide expert parallelism and large scale P/D for DeepSeek-R1! You can read their blog post [here](https://lmsys.org/blog/2025-06-16-gb200-part-1/) for more details. 
Full end to end optimization is still a work in progress but you can get this up and running with the following steps. +Dynamo supports SGLang's GB200 implementation of wide expert parallelism and large scale P/D for DeepSeek-R1! You can read their blog post [here](https://lmsys.org/blog/2025-06-16-gb200-part-1/) for more details. Full end to end optimization is still a work in progress but you can get this up and running with the following steps. In this example, we will run 1 prefill worker on 2 GB200 nodes (4 GPUs each) and 1 decode worker on 12 GB200 nodes (total 56 GPUs). ## Instructions -1. Build the SGLang DeepEP container on an ARM64 machine. -> [!NOTE] -> This sglang side branch is based on an open [PR](https://github.com/sgl-project/sglang/pull/7721/files) to SGLang that allows their main dockerfile to be built for aarch64. Once that PR is merged in, we can add the gb200 dockerfile to the main sglang repo. +1. Build the Dynamo container ```bash -git clone https://github.com/kyleliang-nv/sglang.git -cd sglang -git checkout sglang_gb200_wideep_docker -docker build -f docker/Dockerfile -t sgl-blackwell-wideep --build-arg BUILD_TYPE=blackwell --build-arg CUDA_VERSION=12.8.1 . +cd $DYNAMO_ROOT +docker build \ + -f container/Dockerfile.sglang-wideep \ + -t dynamo-wideep-gb200 \ + --build-arg MODE=blackwell \ + --build-arg SGLANG_IMAGE_TAG=v0.4.9.post6-cu128-gb200 \ + --build-arg ARCH=arm64 \ + --build-arg ARCH_ALT=aarch64 \ + . \ + --no-cache ``` -2. Build the Dynamo container +2. You can run this container on each 4xGB200 node using the following command. -> [!NOTE] -> This is a side branch that contains all of the scripts to run on GB200s. Once the PR is merged in, we can switch to the main branch. +> [!IMPORTANT] +> We recommend downloading DeepSeek-R1 and then mounting it to the container. 
You can find the model [here](https://huggingface.co/deepseek-ai/DeepSeek-R1) ```bash -cd $DYNAMO_ROOT -git checkout ishan/more-slurm-targets # temporary -docker build -f container/Dockerfile.sglang-gb200 . -t dynamo-wideep-gb200 --no-cache +docker run \ + --gpus all \ + -it \ + --rm \ + --network host \ + --volume /PATH_TO_DSR1_MODEL/:/model/ \ + --shm-size=10G \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --ulimit nofile=65536:65536 \ + --cap-add CAP_SYS_PTRACE \ + --ipc host \ + dynamo-wideep-gb200:latest +``` + +4. On the head prefill node, run the helper script provided to generate commands to start the `nats-server`, `etcd`. This script will also tell you which environment variables to export on each node to make deployment easier. + +```bash +./utils/gen_env_vars.sh ``` -3. In your SLURM cluster, clone dynamo and switch to this side branch. +In each container, you should be in the `/sgl-workspace/dynamo/components/backends/sglang` directory. ```bash git clone https://github.com/ai-dynamo/dynamo.git diff --git a/components/backends/sglang/docs/dsr1-wideep-h100.md b/components/backends/sglang/docs/dsr1-wideep-h100.md index a23a3ada13..e1dc372146 100644 --- a/components/backends/sglang/docs/dsr1-wideep-h100.md +++ b/components/backends/sglang/docs/dsr1-wideep-h100.md @@ -57,7 +57,7 @@ In each container, you should be in the `/sgl-workspace/dynamo/components/backen ```bash # run ingress -dynamo run in=http out=dyn & +python3 -m dynamo.frontend --http-port=8000 & # optionally run the http server that allows you to flush the kv cache for all workers (see benchmarking section below) python3 utils/sgl_http_server.py --ns dynamo & # run prefill worker diff --git a/container/Dockerfile.sglang-gb200 b/container/Dockerfile.sglang-gb200 deleted file mode 100644 index 5af553feb1..0000000000 --- a/container/Dockerfile.sglang-gb200 +++ /dev/null @@ -1,105 +0,0 @@ -FROM sgl-blackwell-wideep:latest - -# Define architecture variables for ARM64 -ARG ARCH=arm64 -ARG 
ARCH_ALT=aarch64 - -WORKDIR /sgl-workspace -# https://github.com/ai-dynamo/dynamo/pull/1938 -ARG DYNAMO_COMMIT="aba60996f225038b691d9f255da515b27695b179" -RUN git clone https://github.com/ai-dynamo/dynamo.git && cd dynamo && git checkout ${DYNAMO_COMMIT} - -# install dynamo in editable mode -WORKDIR /sgl-workspace/dynamo -# Rust build/dev dependencies -RUN apt update -y && \ - apt install --no-install-recommends -y \ - build-essential \ - protobuf-compiler \ - cmake \ - libssl-dev \ - pkg-config \ - clang \ - libclang-dev \ - git - -# Define Rust target based on ARCH_ALT ARG -ARG RUSTARCH=${ARCH_ALT}-unknown-linux-gnu - -ENV RUSTUP_HOME=/usr/local/rustup \ - CARGO_HOME=/usr/local/cargo \ - PATH=/usr/local/cargo/bin:$PATH \ - RUST_VERSION=1.86.0 - -# Install Rust using RUSTARCH derived from ARCH_ALT -RUN wget --tries=3 --waitretry=5 "https://static.rust-lang.org/rustup/archive/1.28.1/${RUSTARCH}/rustup-init" && \ - # TODO: Add SHA check back based on RUSTARCH - chmod +x rustup-init && \ - ./rustup-init -y --no-modify-path --profile minimal --default-toolchain $RUST_VERSION --default-host ${RUSTARCH} && \ - rm rustup-init && \ - chmod -R a+w $RUSTUP_HOME $CARGO_HOME - -ARG CARGO_BUILD_JOBS -# Set CARGO_BUILD_JOBS to 16 if not provided -# This is to prevent cargo from building $(nproc) jobs in parallel, -# which might exceed the number of opened files limit. -ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} - -RUN cargo build --release -RUN mkdir -p deploy/sdk/src/dynamo/sdk/cli/bin -RUN cp target/release/dynamo-run deploy/sdk/src/dynamo/sdk/cli/bin - -RUN cd lib/bindings/python && pip install --break-system-packages -e . && cd ../../.. -RUN pip install --break-system-packages -e . 
- -ENV PYTHONPATH=/sgl-workspace/dynamo/components/planner/src:/sgl-workspace/dynamo/examples/sglang:$PYTHONPATH - -RUN wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/v2.10.28/nats-server-v2.10.28-${ARCH}.deb && \ - dpkg -i nats-server-v2.10.28-${ARCH}.deb && rm nats-server-v2.10.28-${ARCH}.deb - -ENV ETCD_VERSION="v3.5.21" -RUN wget --tries=3 --waitretry=5 https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-${ARCH}.tar.gz -O /tmp/etcd.tar.gz && \ - mkdir -p /usr/local/bin/etcd && \ - tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 && \ - rm /tmp/etcd.tar.gz -ENV PATH=/usr/local/bin/etcd/:$PATH - -COPY examples/sglang/configs/deepseek_r1/wideep/* /sgl-workspace/dynamo/examples/sglang/configs/ -COPY examples/sglang/utils/benchmarking/* /sgl-workspace/dynamo/examples/sglang/utils/ - -ENV PYTHONPATH=/sgl-workspace/dynamo/deploy/sdk/src:/workspace/dynamo/components/planner/src:/workspace/examples/sglang:$PYTHONPATH - -# properly install cmake so that gap can be installed -RUN cmake --version - -ARG CMAKE_VERSION=3.31.8 -RUN mkdir /sgl-workspace/cmake_build -WORKDIR /sgl-workspace/cmake_build - -# uninstall CMake -RUN apt-get purge -y cmake -# download newer version of CMake -RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz && \ - tar -xvzf cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz && \ - mv cmake-${CMAKE_VERSION}-linux-$(uname -m) custom_cmake -ENV PATH=/sgl-workspace/cmake_build/custom_cmake/bin:$PATH - -# should be 3.31.8 -RUN cmake --version - -# Install perf_analyzer and genai-perf -RUN apt-get update -y && \ - apt-get install -y --no-install-recommends \ - rapidjson-dev \ - zlib1g-dev - -RUN git clone --depth=1 https://github.com/triton-inference-server/perf_analyzer.git && \ - mkdir perf_analyzer/build && \ - cmake -B perf_analyzer/build -S perf_analyzer && \ - cmake --build 
perf_analyzer/build -- -j8 - -ENV PATH=/sgl-workspace/perf_analyzer/build/perf_analyzer/src/perf-analyzer-build:$PATH - -RUN pip install --break-system-packages genai-perf - -WORKDIR /sgl-workspace/dynamo/examples/sglang \ No newline at end of file diff --git a/container/Dockerfile.sglang-wideep b/container/Dockerfile.sglang-wideep index e6aa11092f..dfcc0090ac 100644 --- a/container/Dockerfile.sglang-wideep +++ b/container/Dockerfile.sglang-wideep @@ -13,160 +13,131 @@ # See the License for the specific language governing permissions and # limitations under the License. -# This should be pinned to the sglang version that is installed with Dynamo -# in the pyproject.toml -FROM lmsysorg/sglang:v0.4.8.post1-cu126 +FROM lmsysorg/sglang:${SGLANG_IMAGE_TAG} + +ARG MODE="hopper" +ARG SGLANG_IMAGE_TAG="v0.4.8.post1-cu126" +ARG ARCH="amd64" +ARG ARCH_ALT="x86_64" +ARG NIXL_UCX_REF="v1.19.x" +ARG NIXL_TAG="0.4.1" +ARG CMAKE_VERSION="3.31.8" +ARG RUST_VERSION="1.87.0" +ARG CARGO_BUILD_JOBS="16" -# Add NIXL build dependencies RUN apt-get update -y && \ apt-get install -y \ - cmake \ - meson \ - ninja-build \ - pybind11-dev \ - patchelf \ - net-tools - -# Install Python build dependencies -RUN pip install --break-system-packages meson-python wheel build - -# Add architecture args for NIXL build -ARG ARCH=amd64 -ARG ARCH_ALT=x86_64 - -WORKDIR /sgl-workspace - -# Install UCX dependencies -RUN apt-get update -y && \ - apt-get install -y --no-install-recommends \ - --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev \ - libnuma-dev librdmacm-dev ibverbs-providers \ - autoconf libtool - -# Build UCX from source -ARG NIXL_UCX_REF=v1.19.x -RUN rm -rf /opt/hpcx/ucx && \ - rm -rf /usr/local/ucx && \ - cd /usr/local/src && \ - git clone https://github.com/openucx/ucx.git && \ - cd ucx && \ - git checkout $NIXL_UCX_REF && \ - ./autogen.sh && ./configure \ - --prefix=/usr/local/ucx \ - --enable-shared \ - --disable-static \ - --disable-doxygen-doc \ - --enable-optimizations \ 
- --enable-cma \ - --enable-devel-headers \ - --with-cuda=/usr/local/cuda \ - --with-verbs \ - --with-efa \ - --with-dm \ - --with-gdrcopy=/usr/local \ - --enable-mt && \ - make -j && \ - make -j install-strip && \ - ldconfig + cmake meson ninja-build pybind11-dev patchelf net-tools \ + build-essential protobuf-compiler libssl-dev pkg-config \ + clang libclang-dev git rapidjson-dev zlib1g-dev && \ + pip install --break-system-packages meson-python wheel build + +# Build UCX + NIXL for x86/hopper until its fully tested on GB200 +RUN if [ "$MODE" = "hopper" ]; then \ + apt-get install -y --no-install-recommends \ + libibverbs-dev rdma-core ibverbs-utils libibumad-dev \ + libnuma-dev librdmacm-dev ibverbs-providers autoconf libtool && \ + # UCX from source + rm -rf /opt/hpcx/ucx /usr/local/ucx && \ + cd /usr/local/src && \ + git clone https://github.com/openucx/ucx.git && \ + cd ucx && git checkout $NIXL_UCX_REF && \ + ./autogen.sh && \ + ./configure \ + --prefix=/usr/local/ucx \ + --enable-shared \ + --disable-static \ + --disable-doxygen-doc \ + --enable-optimizations \ + --enable-cma \ + --enable-devel-headers \ + --with-cuda=/usr/local/cuda \ + --with-verbs \ + --with-efa \ + --with-dm \ + --with-gdrcopy=/usr/local \ + --enable-mt && \ + make -j && make install-strip && ldconfig && \ + # NIXL + git clone https://github.com/ai-dynamo/nixl.git /opt/nixl && \ + cd /opt/nixl && git checkout $NIXL_TAG && \ + pip install --break-system-packages . \ + --config-settings="setup-args=-Ducx_path=/usr/local/ucx"; \ + fi ENV LD_LIBRARY_PATH=/usr/lib:/usr/local/ucx/lib:$LD_LIBRARY_PATH -ARG NIXL_TAG=0.3.1 -RUN git clone https://github.com/ai-dynamo/nixl.git && cd nixl && git checkout ${NIXL_TAG} && pip install --break-system-packages . 
--config-settings=setup-args="-Ducx_path=/usr/local/ucx" - -WORKDIR /sgl-workspace - -# Allow forceful shutdown of inflight requests -ENV SGL_FORCE_SHUTDOWN=1 - +# Dynamo WORKDIR /sgl-workspace RUN git clone https://github.com/ai-dynamo/dynamo.git -# install dynamo in editable mode -WORKDIR /sgl-workspace/dynamo -# Rust build/dev dependencies -RUN apt update -y && \ - apt install --no-install-recommends -y \ - build-essential \ - protobuf-compiler \ - cmake \ - libssl-dev \ - pkg-config \ - clang \ - libclang-dev \ - git - -# Define Rust target based on ARCH_ALT ARG -ARG RUSTARCH=${ARCH_ALT}-unknown-linux-gnu - ENV RUSTUP_HOME=/usr/local/rustup \ CARGO_HOME=/usr/local/cargo \ - PATH=/usr/local/cargo/bin:$PATH \ - RUST_VERSION=1.86.0 + PATH=/usr/local/cargo/bin:$PATH -# Install Rust using RUSTARCH derived from ARCH_ALT -RUN wget --tries=3 --waitretry=5 "https://static.rust-lang.org/rustup/archive/1.28.1/${RUSTARCH}/rustup-init" && \ - # TODO: Add SHA check back based on RUSTARCH +RUN wget --tries=3 --waitretry=5 \ + "https://static.rust-lang.org/rustup/archive/1.28.1/${ARCH_ALT}-unknown-linux-gnu/rustup-init" && \ chmod +x rustup-init && \ - ./rustup-init -y --no-modify-path --profile minimal --default-toolchain $RUST_VERSION --default-host ${RUSTARCH} && \ + ./rustup-init -y \ + --no-modify-path \ + --profile minimal \ + --default-toolchain $RUST_VERSION \ + --default-host ${ARCH_ALT}-unknown-linux-gnu && \ rm rustup-init && \ chmod -R a+w $RUSTUP_HOME $CARGO_HOME ARG CARGO_BUILD_JOBS -# Set CARGO_BUILD_JOBS to 16 if not provided -# This is to prevent cargo from building $(nproc) jobs in parallel, -# which might exceed the number of opened files limit. -ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16} +ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS} + +RUN cd dynamo && cargo build --release -RUN cargo build --release +RUN cd dynamo/lib/bindings/python && \ + pip install --break-system-packages -e . && \ + cd /sgl-workspace/dynamo && \ + pip install --break-system-packages . 
-RUN cd lib/bindings/python && pip install --break-system-packages -e . && cd ../../.. -RUN pip install --break-system-packages . +RUN pip install --break-system-packages sglang-router==0.1.5 -RUN wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/v2.10.28/nats-server-v2.10.28-${ARCH}.deb && \ +RUN wget --tries=3 --waitretry=5 \ + https://github.com/nats-io/nats-server/releases/download/v2.10.28/\ +nats-server-v2.10.28-${ARCH}.deb && \ dpkg -i nats-server-v2.10.28-${ARCH}.deb && rm nats-server-v2.10.28-${ARCH}.deb ENV ETCD_VERSION="v3.5.21" -RUN wget --tries=3 --waitretry=5 https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-${ARCH}.tar.gz -O /tmp/etcd.tar.gz && \ +RUN wget --tries=3 --waitretry=5 \ + https://github.com/etcd-io/etcd/releases/download/${ETCD_VERSION}/\ +etcd-${ETCD_VERSION}-linux-${ARCH}.tar.gz -O /tmp/etcd.tar.gz && \ mkdir -p /usr/local/bin/etcd && \ - tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 && \ + tar -xzf /tmp/etcd.tar.gz \ + -C /usr/local/bin/etcd --strip-components=1 && \ rm /tmp/etcd.tar.gz -ENV PATH=/usr/local/bin/etcd/:$PATH -ARG CMAKE_VERSION=3.31.8 -RUN mkdir /sgl-workspace/cmake_build -WORKDIR /sgl-workspace/cmake_build +ENV PATH=/usr/local/bin/etcd:$PATH -# uninstall CMake +# GenAI Perf RUN apt-get purge -y cmake -# download newer version of CMake -RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz && \ - tar -xvzf cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz && \ - mv cmake-${CMAKE_VERSION}-linux-$(uname -m) custom_cmake -ENV PATH=/sgl-workspace/cmake_build/custom_cmake/bin:$PATH -# should be 3.31.8 +RUN mkdir /sgl-workspace/cmake_build && \ + cd /sgl-workspace/cmake_build && \ + wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/\ +cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz && \ + tar -xzf cmake-${CMAKE_VERSION}-linux-$(uname 
-m).tar.gz && \ + mv cmake-${CMAKE_VERSION}-linux-$(uname -m) custom_cmake && \ + rm cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz + +ENV PATH=/sgl-workspace/cmake_build/custom_cmake/bin:$PATH RUN cmake --version -# Install perf_analyzer and genai-perf -RUN apt-get update -y && \ - apt-get install -y --no-install-recommends \ - rapidjson-dev \ - # jq and curl for polling various endpoints and health checks - jq \ - curl \ - zlib1g-dev - -RUN git clone --depth=1 https://github.com/triton-inference-server/perf_analyzer.git && \ +RUN git clone --depth=1 \ + https://github.com/triton-inference-server/perf_analyzer.git && \ mkdir perf_analyzer/build && \ cmake -B perf_analyzer/build -S perf_analyzer && \ - cmake --build perf_analyzer/build -- -j8 + cmake --build perf_analyzer/build -- -j$(nproc) ENV PATH=/sgl-workspace/perf_analyzer/build/perf_analyzer/src/perf-analyzer-build:$PATH - RUN pip install --break-system-packages genai-perf -# https://pypi.org/project/sglang-router/0.1.5 is latest -RUN pip install sglang-router==0.1.5 +# Enable forceful shutdown of inflight requests +ENV SGL_FORCE_SHUTDOWN=1 WORKDIR /sgl-workspace/dynamo/components/backends/sglang From ca6040d3f19bbe8d8860dcaa2282bde13566923b Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Thu, 31 Jul 2025 19:05:19 +0000 Subject: [PATCH 65/65] docs(sglang): update deployment instructions for GB200 and H100 models --- .../backends/sglang/docs/dsr1-wideep-gb200.md | 126 ++++++++++++++---- .../backends/sglang/docs/dsr1-wideep-h100.md | 2 +- 2 files changed, 101 insertions(+), 27 deletions(-) diff --git a/components/backends/sglang/docs/dsr1-wideep-gb200.md b/components/backends/sglang/docs/dsr1-wideep-gb200.md index 757dbc0e6b..ea987fae0f 100644 --- a/components/backends/sglang/docs/dsr1-wideep-gb200.md +++ b/components/backends/sglang/docs/dsr1-wideep-gb200.md @@ -21,7 +21,6 @@ Dynamo supports SGLang's GB200 implementation of wide expert parallelism and lar ## Instructions - 1. 
Build the Dynamo container ```bash @@ -58,41 +57,116 @@ docker run \ dynamo-wideep-gb200:latest ``` -4. On the head prefill node, run the helper script provided to generate commands to start the `nats-server`, `etcd`. This script will also tell you which environment variables to export on each node to make deployment easier. +3. On the head prefill node, run the helper script provided to generate commands to start the `nats-server`, `etcd`. This script will also tell you which environment variables to export on each node to make deployment easier. ```bash ./utils/gen_env_vars.sh ``` -In each container, you should be in the `/sgl-workspace/dynamo/components/backends/sglang` directory. +4. Run the ingress and prefill worker ```bash -git clone https://github.com/ai-dynamo/dynamo.git -git checkout ishan/more-slurm-targets -cd examples/sglang/slurm_jobs +# run ingress +python3 -m dynamo.frontend --http-port=8000 & +# optionally run the http server that allows you to flush the kv cache for all workers (see benchmarking section below) +python3 utils/sgl_http_server.py --ns dynamo & +# run prefill worker +SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \ +MC_TE_METRIC=true \ +SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \ +SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \ +SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \ +SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ +MC_FORCE_MNNVL=1 \ +NCCL_MNNVL_ENABLE=1 \ +NCCL_CUMEM_ENABLE=1 \ +SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ +SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ +PYTHONUNBUFFERED=1 \ +python3 components/worker.py \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --model-path /model/ \ + --skip-tokenizer-init \ + --trust-remote-code \ + --disaggregation-mode prefill \ + --dist-init-addr ${HEAD_PREFILL_NODE_IP}:29500 \ + --disaggregation-bootstrap-port 30001 \ + --disaggregation-transfer-backend nixl \ + --nnodes 2 \ + --node-rank 0 \ + --tp-size 8 \ + --dp-size 8 \ + --enable-dp-attention \ + --host 0.0.0.0 \ + 
--decode-log-interval 1 \ + --max-running-requests 6144 \ + --context-length 2716 \ + --disable-radix-cache \ + --enable-deepep-moe \ + --deepep-mode low_latency \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-shared-experts-fusion \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm static \ + --eplb-algorithm deepseek \ + --attention-backend cutlass_mla \ + --watchdog-timeout 1000000 \ + --disable-cuda-graph \ + --chunked-prefill-size 16384 \ + --max-total-tokens 32768 \ + --mem-fraction-static 0.8 \ + --log-level debug ``` -4. Ensure you have the proper paths that you can use to mount things to the container - -- The path to the DSR1 model which should be mounted to the `--model-dir` flag and `--config-dir` flag - -5. Run the following command to submit the job +5. Run the decode worker on the head decode node ```bash -python3 submit_job_script.py \ - --template job_script_template.j2 \ - --model-dir \ - --container-image \ - --account \ - --gpus-per-node 4 \ - --config-dir \ - --network-interface enp138s0f0np0 \ - --gpu-type gb200 \ - --use-sglang-commands \ - --prefill-nodes 2 \ - --decode-nodes 12 +SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=768 \ +MC_TE_METRIC=true \ +SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \ +SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \ +SGLANG_DISAGGREGATION_WAITING_TIMEOUT=100000 \ +SGLANG_HACK_SEQ_BOOTSTRAP_ROOM=1 \ +SGLANG_MOONCAKE_CUSTOM_MEM_POOL=True \ +NCCL_MNNVL_ENABLE=1 \ +MC_FORCE_MNNVL=1 \ +NCCL_CUMEM_ENABLE=1 \ +SGLANG_USE_MESSAGE_QUEUE_BROADCASTER=0 \ +SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=1 \ +PYTHONUNBUFFERED=1 \ +python3 components/decode_worker.py \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --model-path /model/ \ + --skip-tokenizer-init \ + --trust-remote-code \ + --disaggregation-mode decode \ + --dist-init-addr ${HEAD_DECODE_NODE_IP}:29500 \ + --disaggregation-bootstrap-port 30001 \ + --nnodes 12 \ + --node-rank 0 \ + --tp-size 48 \ + --dp-size 48 \ + 
--enable-dp-attention \ + --host 0.0.0.0 \ + --decode-log-interval 1 \ + --max-running-requests 36864 \ + --context-length 2716 \ + --disable-radix-cache \ + --enable-deepep-moe \ + --deepep-mode low_latency \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --cuda-graph-bs 768 \ + --disable-shared-experts-fusion \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm static \ + --eplb-algorithm deepseek \ + --attention-backend cutlass_mla \ + --watchdog-timeout 1000000 \ + --chunked-prefill-size 36864 \ + --mem-fraction-static 0.82 \ + --log-level debug ``` -**Note**: if you want to spin up dynamo, you can remove the `--use-sglang-commands` flag. - -6. This will create a logs directory in the `examples/sglang/slurm_jobs` directory. You can `cd` into the directory, cd into your job id, and then run `tail -f *_prefill.err *_decode.err` or `tail -f *_prefill.out *_decode.out` to see the logs. \ No newline at end of file +On the other decode nodes (this example has 12 total decode nodes), run the same command but change `--node-rank` to 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 diff --git a/components/backends/sglang/docs/dsr1-wideep-h100.md b/components/backends/sglang/docs/dsr1-wideep-h100.md index e1dc372146..57f0b6ba3b 100644 --- a/components/backends/sglang/docs/dsr1-wideep-h100.md +++ b/components/backends/sglang/docs/dsr1-wideep-h100.md @@ -93,7 +93,7 @@ python3 -m dynamo.sglang.worker \ On the other prefill node (since this example has 4 total prefill nodes), run the same command but change `--node-rank` to 1,2, and 3 -7. Run the decode worker on the head decode node +6. Run the decode worker on the head decode node ```bash python3 -m dynamo.sglang.decode_worker \