From 5710d0386852662f1699b866cf4103a7aab5a73f Mon Sep 17 00:00:00 2001
From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Date: Thu, 31 Jul 2025 02:12:54 -0700
Subject: [PATCH 1/3] Update SLURM scripts

Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
---
 .../disaggregated/slurm/disaggr_torch.slurm  | 80 +++++++++++--------
 examples/disaggregated/slurm/gen_yaml.py     | 61 +++++++++++---
 examples/disaggregated/slurm/start_server.sh |  6 +-
 examples/disaggregated/slurm/start_worker.sh | 14 ++--
 4 files changed, 102 insertions(+), 59 deletions(-)

diff --git a/examples/disaggregated/slurm/disaggr_torch.slurm b/examples/disaggregated/slurm/disaggr_torch.slurm
index 941978a5656..f2afc1ffaa0 100644
--- a/examples/disaggregated/slurm/disaggr_torch.slurm
+++ b/examples/disaggregated/slurm/disaggr_torch.slurm
@@ -7,38 +7,49 @@
 #SBATCH --time=02:00:00
 #SBATCH --job-name=${job_name} # add your job name here

-isl=1024
-osl=1024
-multi_round=10
-gen_yaml_file=gen_yaml.py
-streaming=true
-container_image=${container_image} # add your container image here
-mount_dir=${mount_dir} # add your mount directory here
-workdir=${workdir} # add your path to the slurm scripts here
-model_dir=${model_dir} # add your model directory here
-
-mounts=${mount_dir}:${mount_dir}
-logdir=${workdir}/benchmark-${isl}-${osl}/
-mkdir -p ${logdir}
-
-container_name=disaggr-test
-
-num_ctx_servers=$1
-ctx_tp_size=$2
-ctx_batch_size=$3
-ctx_max_num_tokens=$4
-ctx_enable_attention_dp=$5
-num_gen_servers=$6
-gen_tp_size=$7
-gen_batch_size=$8
-gen_max_num_tokens=$9
+# Context servers arguments
+num_ctx_servers=${1}
+ctx_tp_size=${2}
+ctx_batch_size=${3}
+ctx_max_num_tokens=${4}
+ctx_enable_attention_dp=${5}
+
+# Generation servers arguments
+num_gen_servers=${6}
+gen_tp_size=${7}
+gen_batch_size=${8}
+gen_max_num_tokens=${9}
 gen_enable_attention_dp=${10}
 gen_gpu_memory_fraction=${11}
+
+# Other arguments
 eplb_num_slots=${12}
 mtp_size=${13}
+
+# Benchmarking arguments
 concurrency=${14}
+isl=${15}
+osl=${16}
+multi_round=${17}
+streaming=${18}
+
+# User specific arguments
+container_image=${19}
+mounts=${20}
+workdir=${21}
+model_dir=${22}
+
+ctx_max_seq_len=$((isl + 1))
+gen_max_seq_len=$((isl + osl))
+ctx_gpu_frac=0.75
+cache_transceiver_max_num_tokens=8448
+
+container_name=disaggr
+logdir=${workdir}/benchmark-${isl}-${osl}/
+mkdir -p ${logdir}
+full_logdir=${logdir}/ctx${num_ctx_servers}_gen${num_gen_servers}_dep${gen_tp_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}

-full_logdir=${logdir}/dep${gen_tp_size}_concurrency${concurrency}_eplb${eplb_num_slots}_mtp${mtp_size}
+echo "concurrency: ${concurrency}"

 ctx_gpus=$((num_ctx_servers * ctx_tp_size))
 gen_gpus=$((num_gen_servers * gen_tp_size))
@@ -49,9 +60,10 @@ enable_pdl=false
 if [ "${gen_enable_attention_dp}" = "false" ]; then
     enable_pdl=true
     echo "enable_pdl: ${enable_pdl}"
-    full_logdir=${logdir}/tep${gen_tp_size}_concurrency${concurrency}_eplb${eplb_num_slots}_mtp${mtp_size}
+    full_logdir=${logdir}/ctx${num_ctx_servers}_gen${num_gen_servers}_tep${gen_tp_size}_batch${gen_batch_size}_eplb${eplb_num_slots}_mtp${mtp_size}
 fi
 mkdir -p ${full_logdir}
+echo "Log will be saved to: ${full_logdir}"

 nsys_on=""
 # nsys_on=${full_logdir} # Uncomment this line to enable Nsys profiling
@@ -67,16 +79,20 @@ srun -l --container-image=${container_image} \
 srun -l --container-name=${container_name} \
     --container-mounts=${mounts} \
     --mpi=pmix --overlap \
-    python3 ${workdir}/${gen_yaml_file} --config ${full_logdir}/config.yaml \
+    python3 ${workdir}/gen_yaml.py --config ${full_logdir}/config.yaml \
         --model ${model_dir} \
         --num_ctx_servers ${num_ctx_servers} \
         --ctx_tp_size ${ctx_tp_size} \
         --ctx_batch_size ${ctx_batch_size} \
         --ctx_max_num_tokens ${ctx_max_num_tokens} \
+        --ctx_max_seq_len ${ctx_max_seq_len} \
+        --ctx_free_gpu_memory_fraction ${ctx_gpu_frac} \
+        --cache_transceiver_max_num_tokens ${cache_transceiver_max_num_tokens} \
         --num_gen_servers ${num_gen_servers} \
         --gen_tp_size ${gen_tp_size} \
        --gen_batch_size ${gen_batch_size} \
         --gen_max_num_tokens ${gen_max_num_tokens} \
+        --gen_max_seq_len ${gen_max_seq_len} \
         --gen_gpu_memory_fraction ${gen_gpu_memory_fraction} \
         --eplb_num_slots ${eplb_num_slots} \
         $(if [ "${gen_enable_attention_dp}" = "true" ]; then echo "--gen_enable_attention_dp"; fi) \
@@ -88,17 +104,11 @@ echo "YAML file generated."
 hostname_value=$(grep '^hostname:' ${full_logdir}/config.yaml | awk -F': ' '{print $2}')
 echo "server host name: $hostname_value"

-# try to kill the server and workers
-srun -l --container-name=${container_name} \
-    --container-mounts=${mounts} \
-    --mpi=pmix --overlap \
-    pkill -f "trtllm-serve" || true
-
 # start the workers
 srun -l --container-name=${container_name} \
     --container-mounts=${mounts} \
     --mpi=pmix --overlap \
-    bash ${workdir}/start_worker.sh ${full_logdir}/config.yaml "${concurrency}" "${enable_pdl}" ${ctx_gpus} ${nsys_on} &> ${full_logdir}/output_workers.log &
+    bash ${workdir}/start_worker.sh ${full_logdir}/config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} &> ${full_logdir}/output_workers.log &

 # start the server
 srun -l --container-name=${container_name} \
diff --git a/examples/disaggregated/slurm/gen_yaml.py b/examples/disaggregated/slurm/gen_yaml.py
index e11d2419d03..a3f8ad32ac4 100644
--- a/examples/disaggregated/slurm/gen_yaml.py
+++ b/examples/disaggregated/slurm/gen_yaml.py
@@ -125,17 +125,21 @@ def gen_config_file(config_path: str,
                     ctx_tp_size: int,
                     ctx_batch_size: int,
                     ctx_max_num_tokens: int,
+                    ctx_max_seq_len: int,
+                    ctx_free_gpu_memory_fraction: float,
                     ctx_enable_attention_dp: bool,
                     num_gen_servers: int,
                     gen_tp_size: int,
                     gen_batch_size: int,
                     gen_max_num_tokens: int,
+                    gen_max_seq_len: int,
                     gen_enable_attention_dp: bool,
                     gen_gpu_memory_fraction: float,
                     eplb_num_slots: int,
                     mtp_size: int = 0,
                     worker_start_port: int = 8001,
-                    server_port: int = 8000) -> None:
+                    server_port: int = 8000,
+                    cache_transceiver_max_num_tokens: int = 4608) -> None:
     """
     Generate configuration YAML file for disaggregated inference.

@@ -146,6 +150,8 @@ def gen_config_file(config_path: str,
         ctx_tp_size: Tensor parallel size for context servers
         ctx_batch_size: Batch size for context servers
         ctx_max_num_tokens: Max number of tokens for context servers
+        ctx_max_seq_len: Max sequence length for context servers
+        ctx_free_gpu_memory_fraction: Free GPU memory fraction for context servers
         ctx_enable_attention_dp: Enable attention DP for context servers
         num_gen_servers: Number of generation servers
         gen_tp_size: Tensor parallel size for generation servers
@@ -161,7 +167,11 @@ def gen_config_file(config_path: str,
         1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 768, 1024, 2048, gen_batch_size
     ]

-    gen_moe_backend = "WIDEEP"
+    gen_moe_backend = "CUTLASS"
+    if gen_tp_size >= 16 and gen_enable_attention_dp:
+        gen_moe_backend = "WIDEEP"
+    if not gen_enable_attention_dp:
+        gen_moe_backend = "TRTLLM"

     config = {
         'model': model_path,
@@ -172,7 +182,8 @@ def gen_config_file(config_path: str,
             'num_instances': num_ctx_servers,
             'max_batch_size': ctx_batch_size,
             'max_num_tokens': ctx_max_num_tokens,
-            'max_seq_len': 1152,
+            'max_seq_len': ctx_max_seq_len,
+            'free_gpu_memory_fraction': ctx_free_gpu_memory_fraction,
             'tensor_parallel_size': ctx_tp_size,
             'moe_expert_parallel_size': ctx_tp_size,
             'enable_attention_dp': ctx_enable_attention_dp,
@@ -180,12 +191,13 @@ def gen_config_file(config_path: str,
             'print_iter_log': True,
             'disable_overlap_scheduler': True,
             'kv_cache_config': {
-                'free_gpu_memory_fraction': 0.85,
+                'enable_block_reuse': False,
+                'free_gpu_memory_fraction': ctx_free_gpu_memory_fraction,
                 'dtype': 'fp8',
             },
             'cache_transceiver_config': {
+                'max_tokens_in_buffer': cache_transceiver_max_num_tokens,
                 'backend': 'default',
-                'max_tokens_in_buffer': 8320,
             },
         },
         'generation_servers': {
@@ -196,13 +208,15 @@ def gen_config_file(config_path: str,
             'pipeline_parallel_size': 1,
             'max_batch_size': gen_batch_size,
             'max_num_tokens': gen_max_num_tokens,
-            'max_seq_len': 2176,
+            'max_seq_len': gen_max_seq_len,
+            'free_gpu_memory_fraction': gen_gpu_memory_fraction,
             'cuda_graph_config': {
                 'enable_padding': True,
                 'batch_sizes': gen_cuda_graph_batch_sizes,
             },
             'print_iter_log': True,
             'kv_cache_config': {
+                'enable_block_reuse': False,
                 'free_gpu_memory_fraction': gen_gpu_memory_fraction,
                 'dtype': 'fp8',
             },
@@ -210,9 +224,10 @@ def gen_config_file(config_path: str,
                 'backend': gen_moe_backend,
             },
             'cache_transceiver_config': {
+                'max_tokens_in_buffer': cache_transceiver_max_num_tokens,
                 'backend': 'default',
-                'max_tokens_in_buffer': 8320,
             },
+            'stream_interval': 20,
         }
     }

@@ -236,6 +251,9 @@ def gen_config_file(config_path: str,
     # set the hostname to the first node
     config['hostname'] = nodes[0]

+    if gen_tp_size == 8 and not gen_enable_attention_dp:
+        config['generation_servers']['allreduce_strategy'] = "MNNVL"
+
     if eplb_num_slots > 0:
         moe_load_balancer_file = os.path.join(os.path.dirname(config_path),
                                               "moe_load_balancer.yaml")
@@ -290,6 +308,14 @@ def gen_config_file(config_path: str,
                         type=int,
                         required=True,
                         help="Max number of tokens for context servers")
+    parser.add_argument("--ctx_max_seq_len",
+                        type=int,
+                        required=True,
+                        help="Max sequence length for context servers")
+    parser.add_argument("--ctx_free_gpu_memory_fraction",
+                        type=float,
+                        required=True,
+                        help="Free GPU memory fraction for context servers")
     parser.add_argument("--ctx_enable_attention_dp",
                         dest='ctx_enable_attention_dp',
                         action='store_true',
@@ -310,6 +336,10 @@ def gen_config_file(config_path: str,
                         type=int,
                         required=True,
                         help="Max number of tokens for generation servers")
+    parser.add_argument("--gen_max_seq_len",
+                        type=int,
+                        required=True,
+                        help="Max sequence length for generation servers")
     parser.add_argument("--gen_enable_attention_dp",
                         dest='gen_enable_attention_dp',
                         action='store_true',
@@ -334,13 +364,20 @@ def gen_config_file(config_path: str,
                         type=int,
                         default=8333,
                         help="Server port")
+    parser.add_argument("--cache_transceiver_max_num_tokens",
+                        type=int,
+                        default=4608,
+                        help="Max number of tokens for cache transceiver")

     args = parser.parse_args()

     gen_config_file(args.config, args.model, args.num_ctx_servers,
                     args.ctx_tp_size, args.ctx_batch_size,
-                    args.ctx_max_num_tokens, args.ctx_enable_attention_dp,
-                    args.num_gen_servers, args.gen_tp_size, args.gen_batch_size,
-                    args.gen_max_num_tokens, args.gen_enable_attention_dp,
-                    args.gen_gpu_memory_fraction, args.eplb_num_slots,
-                    args.mtp_size, args.worker_start_port, args.server_port)
+                    args.ctx_max_num_tokens, args.ctx_max_seq_len,
+                    args.ctx_free_gpu_memory_fraction,
+                    args.ctx_enable_attention_dp, args.num_gen_servers,
+                    args.gen_tp_size, args.gen_batch_size,
+                    args.gen_max_num_tokens, args.gen_max_seq_len,
+                    args.gen_enable_attention_dp, args.gen_gpu_memory_fraction,
+                    args.eplb_num_slots, args.mtp_size, args.worker_start_port,
+                    args.server_port, args.cache_transceiver_max_num_tokens)
diff --git a/examples/disaggregated/slurm/start_server.sh b/examples/disaggregated/slurm/start_server.sh
index f91aa26653e..beb4b2f18c8 100644
--- a/examples/disaggregated/slurm/start_server.sh
+++ b/examples/disaggregated/slurm/start_server.sh
@@ -13,9 +13,9 @@ config_file=$1
 # Check and replace hostname settings in config_file
 if [ -f "$config_file" ]; then
     # Use sed to find hostname line and check if replacement is needed
-    if grep -q "^hostname:" "$config_file"; then
+    if grep -q "hostname:" "$config_file"; then
         # Extract current hostname value from config
-        current_hostname=$(grep "^hostname:" "$config_file" | sed 's/^hostname:[ ]*//' | awk '{print $1}')
+        current_hostname=$(grep "hostname:" "$config_file" | sed 's/.*hostname:[ ]*//' | awk '{print $1}')

         if [ "$current_hostname" != "$short_hostname" ]; then
             echo "Replacing hostname '$current_hostname' with '$short_hostname' in $config_file"
@@ -31,4 +31,4 @@ else
     echo "Config file $config_file not found"
 fi

-trtllm-serve disaggregated -c ${config_file} -t 1800 -r 1800
+trtllm-serve disaggregated -c ${config_file} -t 1800 -r 7200
diff --git a/examples/disaggregated/slurm/start_worker.sh b/examples/disaggregated/slurm/start_worker.sh
index 6110fc160ab..0c4a3ce70d6 100644
--- a/examples/disaggregated/slurm/start_worker.sh
+++ b/examples/disaggregated/slurm/start_worker.sh
@@ -1,17 +1,13 @@
 #! /bin/bash
 config_file=$1
-concurrency=$2
-enable_pdl=$3
-ctx_gpus=$4
-work_dir=$5
+enable_pdl=$2
+ctx_gpus=$3
+work_dir=$4
 unset UCX_TLS

-echo "config_file: ${config_file}, concurrency: ${concurrency}, enable_pdl: ${enable_pdl}, ctx_gpus: ${ctx_gpus}, work_dir: ${work_dir}"
+echo "config_file: ${config_file}, enable_pdl: ${enable_pdl}, ctx_gpus: ${ctx_gpus}, work_dir: ${work_dir}"

 export TLLM_LOG_LEVEL=INFO
-export TRTLLM_USE_UCX_KVCACHE=1
-export TLLM_BENCHMARK_REQ_QUEUES_SIZE=${concurrency}
-export TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP=1
 export TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER=1

 if [ "${enable_pdl}" = "true" ]; then
@@ -27,7 +23,7 @@ else
     nsys_file=${work_dir}/nsys_worker_proc_${SLURM_PROCID}
     export TLLM_PROFILE_RECORD_GC=1
     export TLLM_NVTX_DEBUG=1
-    if [ "${SLURM_PROCID}" -ge "${ctx_gpus}" ]; then
+    if [ ${SLURM_PROCID} -ge ${ctx_gpus} ]; then
         export TLLM_PROFILE_START_STOP=200-250
         nsys_prefix="nsys profile -e \"NSYS_MPI_STORE_TEAMS_PER_RANK=1\" -o ${nsys_file} -f true -t cuda,nvtx,python-gil -c cudaProfilerApi --cuda-graph-trace node --capture-range-end=stop --gpu-metrics-devices=none"
         echo "nsys_prefix: ${nsys_prefix}"

From 301bde626b7303747a1aabed5b8c82c28c330569 Mon Sep 17 00:00:00 2001
From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Date: Thu, 31 Jul 2025 09:14:57 -0700
Subject: [PATCH 2/3] Update

Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
---
 .../disaggregated/slurm/disaggr_torch.slurm |  6 +-
 examples/disaggregated/slurm/submit.sh      | 52 ++++++++-------
 examples/wide_ep/slurm_scripts/submit.sh    | 66 ++++++++++++++++---
 3 files changed, 88 insertions(+), 36 deletions(-)

diff --git a/examples/disaggregated/slurm/disaggr_torch.slurm b/examples/disaggregated/slurm/disaggr_torch.slurm
index f2afc1ffaa0..f0ae69b743b 100644
--- a/examples/disaggregated/slurm/disaggr_torch.slurm
+++ b/examples/disaggregated/slurm/disaggr_torch.slurm
@@ -2,10 +2,10 @@
 #SBATCH --nodes=2
 #SBATCH --ntasks=8
 #SBATCH --ntasks-per-node=4
-#SBATCH --partition=${partition} # add your partition here
-#SBATCH --account=${account} # add your account here
+#SBATCH --partition=${partition} # add your partition here or specify in the sbatch command
+#SBATCH --account=${account} # add your account here or specify in the sbatch command
+#SBATCH --job-name=${job_name} # add your job name here or specify in the sbatch command
 #SBATCH --time=02:00:00
-#SBATCH --job-name=${job_name} # add your job name here

 # Context servers arguments
 num_ctx_servers=${1}
diff --git a/examples/disaggregated/slurm/submit.sh b/examples/disaggregated/slurm/submit.sh
index 635bcb5f382..69f506efafc 100644
--- a/examples/disaggregated/slurm/submit.sh
+++ b/examples/disaggregated/slurm/submit.sh
@@ -1,37 +1,41 @@
 #!/bin/bash
-echo "Make sure that SLURM parameters are correctly set in \`disaggr_torch.slurm\` before executing this script."
-
-# concurrency 8
-concurrency=8
-ctx_num=1
 total_node_num=8
-ntasks_per_node=4 # 4 GPUs per GB200 node
 ntasks=$((total_node_num * ntasks_per_node))
+ntasks_per_node=4 # 4 GPUs per GB200 node

-# `--segment` makes sure that all nodes are in the same NVLink domain
-# disaggr_torch.slurm arguments:
-# num_ctx_servers=$1
-# ctx_tp_size=$2
-# ctx_batch_size=$3
-# ctx_max_num_tokens=$4
-# ctx_enable_attention_dp=$5
-# num_gen_servers=$6
-# gen_tp_size=$7
-# gen_batch_size=$8
-# gen_max_num_tokens=$9
-# gen_enable_attention_dp=${10}
-# gen_gpu_memory_fraction=${11}
-# eplb_num_slots=${12}
-# mtp_size=${13}
-# concurrency=${14}
+ctx_num=1
+concurrency=8
+isl=1024
+osl=1024
+multi_round=10
+streaming=true
+container_image=
+mounts= # e.g. /mnt/data:/mnt/data
+workdir= # Path to disaggr_torch.slurm
+model_dir= # Path to the model checkpoint

 # This command starts a job with 8 nodes, 32 GPUs in total.
 # The server will include 4 context workers with DEP4, and 1 generation worker with DEP8.
+# `--segment` makes sure that all nodes are in the same NVLink domain
 sbatch --nodes=${total_node_num} \
     --ntasks=${ntasks} \
     --ntasks-per-node=${ntasks_per_node} \
+    --partition=${partition} \
+    --account=${account} \
+    --job-name=${job_name} \
     --gres=gpu:${ntasks_per_node} \
     --segment=${total_node_num} \
-    disaggr_torch.slurm \
-    ${ctx_num} 4 4 4480 true 1 8 1024 1024 true "0.8" 0 0 "$concurrency"
+    ${workdir}/disaggr_torch.slurm \
+    ${ctx_num} 4 4 4480 true \ # Context servers arguments
+    1 8 1024 1024 true "0.8" \ # Generation servers arguments
+    0 0 \ # Other arguments
+    $concurrency \ # Benchmarking arguments
+    $isl \
+    $osl \
+    $multi_round \
+    $streaming \
+    $container_image \ # User specific arguments
+    $mounts \
+    $workdir \
+    $model_dir
diff --git a/examples/wide_ep/slurm_scripts/submit.sh b/examples/wide_ep/slurm_scripts/submit.sh
index a1f5553a310..1b4049d9372 100644
--- a/examples/wide_ep/slurm_scripts/submit.sh
+++ b/examples/wide_ep/slurm_scripts/submit.sh
@@ -1,20 +1,48 @@
 #!/bin/bash

 echo "Please find the \`disaggr_torch.slurm\` script in the \`examples/disaggregated/slurm/\` directory."
-echo "Make sure that SLURM parameters are correctly set in \`disaggr_torch.slurm\` before executing this script."

 mtp_size=0
 ntasks_per_node=4 # 4 GPUs per GB200 node

+isl=1024
+osl=1024
+multi_round=10
+streaming=true
+container_image=
+mounts= # e.g. /mnt/data:/mnt/data
+workdir= # Path to disaggr_torch.slurm
+model_dir= # Path to the model checkpoint
+
 # dep16 eplb0, 256, 288
 for b in 1 64 1024; do
-    concurrency=$((b * 16))
-    ctx_num=$(((concurrency + 5499)/5500))
-    total_node_num=$((ctx_num + 4))
-    ntasks=$((total_node_num * ntasks_per_node))
-    # sbatch --nodes=${total_node_num} --ntasks=${ntasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_node_num} disaggr_torch.slurm ${ctx_num} 4 4 4480 true 1 16 1024 1024 true "0.7" 0 "$mtp_size" "$concurrency"
-    # sbatch --nodes=${total_node_num} --ntasks=${ntasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_node_num} disaggr_torch.slurm ${ctx_num} 4 4 4480 true 1 16 1024 1024 true "0.7" 256 "$mtp_size" "$concurrency"
-    sbatch --nodes=${total_node_num} --ntasks=${ntasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_node_num} disaggr_torch.slurm ${ctx_num} 4 4 4480 true 1 16 1024 1024 true "0.7" 288 "$mtp_size" "$concurrency"
+    for eplb_num_slots in 0 256 288; do
+        concurrency=$((b * 16))
+        ctx_num=$(((concurrency + 5499)/5500))
+        total_node_num=$((ctx_num + 4))
+        ntasks=$((total_node_num * ntasks_per_node))
+        sbatch --nodes=${total_node_num} \
+            --ntasks=${ntasks} \
+            --ntasks-per-node=${ntasks_per_node} \
+            --partition=${partition} \
+            --account=${account} \
+            --job-name=${job_name} \
+            --gres=gpu:${ntasks_per_node} \
+            --segment=${total_node_num} \
+            ${workdir}/disaggr_torch.slurm \
+            ${ctx_num} 4 4 4480 true \ # Context servers arguments
+            1 16 1024 1024 "0.7" \ # Generation servers arguments
+            $eplb_num_slots $mtp_size \ # Other arguments
+            $concurrency \ # Benchmarking arguments
+            $isl \
+            $osl \
+            $multi_round \
+            $streaming \
+            $container_image \ # User specific arguments
+            $mounts \
+            $workdir \
+            $model_dir
+    done
 done

 # dep32 eplb288
@@ -23,5 +51,25 @@ for b in 512; do
     ctx_num=$(((concurrency + 5499)/5500))
     total_node_num=$((ctx_num + 8))
     ntasks=$((total_node_num * ntasks_per_node))
-    sbatch --nodes=${total_node_num} --ntasks=${ntasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_node_num} disaggr_torch.slurm ${ctx_num} 4 4 4480 true 1 32 1024 1024 true "0.7" 288 "$mtp_size" "$concurrency"
+    sbatch --nodes=${total_node_num} \
+           --ntasks=${ntasks} \
+           --ntasks-per-node=${ntasks_per_node} \
+           --partition=${partition} \
+           --account=${account} \
+           --job-name=${job_name} \
+           --gres=gpu:${ntasks_per_node} \
+           --segment=${total_node_num} \
+           ${workdir}/disaggr_torch.slurm \
+           ${ctx_num} 4 4 4480 true \ # Context servers arguments
+           1 32 1024 1024 true "0.7" \ # Generation servers arguments
+           $eplb_num_slots $mtp_size \ # Other arguments
+           $concurrency \ # Benchmarking arguments
+           $isl \
+           $osl \
+           $multi_round \
+           $streaming \
+           $container_image \ # User specific arguments
+           $mounts \
+           $workdir \
+           $model_dir
 done

From f62d4495d0f0bf9fd70e99da2f65acbbdbcd72d8 Mon Sep 17 00:00:00 2001
From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Date: Thu, 31 Jul 2025 09:41:31 -0700
Subject: [PATCH 3/3] Polish

Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
---
 examples/disaggregated/slurm/submit.sh   | 44 ++++++------
 examples/wide_ep/slurm_scripts/submit.sh | 87 ++++++++++++++----------
 2 files changed, 75 insertions(+), 56 deletions(-)

diff --git a/examples/disaggregated/slurm/submit.sh b/examples/disaggregated/slurm/submit.sh
index 69f506efafc..0498910ce12 100644
--- a/examples/disaggregated/slurm/submit.sh
+++ b/examples/disaggregated/slurm/submit.sh
@@ -1,19 +1,37 @@
 #!/bin/bash

+partition=
+account=
+job_name=
+container_image=
+mounts= # e.g. /mnt/data:/mnt/data
+workdir= # Path to disaggr_torch.slurm
+model_dir= # Path to the model checkpoint
+
+ntasks_per_node=4 # 4 GPUs per GB200 node
 total_node_num=8
 ntasks=$((total_node_num * ntasks_per_node))
-ntasks_per_node=4 # 4 GPUs per GB200 node

-ctx_num=1
 concurrency=8
 isl=1024
 osl=1024
 multi_round=10
 streaming=true
-container_image=
-mounts= # e.g. /mnt/data:/mnt/data
-workdir= # Path to disaggr_torch.slurm
-model_dir= # Path to the model checkpoint
+
+args=(
+    1 4 4 4480 true # Context servers arguments
+    1 8 1024 1024 true "0.8" # Generation servers arguments
+    0 0 # Other arguments
+    $concurrency # Benchmarking arguments
+    $isl
+    $osl
+    $multi_round
+    $streaming
+    $container_image # User specific arguments
+    $mounts
+    $workdir
+    $model_dir
+)

 # This command starts a job with 8 nodes, 32 GPUs in total.
 # The server will include 4 context workers with DEP4, and 1 generation worker with DEP8.
@@ -26,16 +44,4 @@ sbatch --nodes=${total_node_num} \
     --job-name=${job_name} \
     --gres=gpu:${ntasks_per_node} \
     --segment=${total_node_num} \
-    ${workdir}/disaggr_torch.slurm \
-    ${ctx_num} 4 4 4480 true \ # Context servers arguments
-    1 8 1024 1024 true "0.8" \ # Generation servers arguments
-    0 0 \ # Other arguments
-    $concurrency \ # Benchmarking arguments
-    $isl \
-    $osl \
-    $multi_round \
-    $streaming \
-    $container_image \ # User specific arguments
-    $mounts \
-    $workdir \
-    $model_dir
+    ${workdir}/disaggr_torch.slurm "${args[@]}"
diff --git a/examples/wide_ep/slurm_scripts/submit.sh b/examples/wide_ep/slurm_scripts/submit.sh
index 1b4049d9372..f83fa53b55c 100644
--- a/examples/wide_ep/slurm_scripts/submit.sh
+++ b/examples/wide_ep/slurm_scripts/submit.sh
@@ -2,6 +2,14 @@

 echo "Please find the \`disaggr_torch.slurm\` script in the \`examples/disaggregated/slurm/\` directory."

+partition=
+account=
+job_name=
+container_image=
+mounts= # e.g. /mnt/data:/mnt/data
+workdir= # Path to disaggr_torch.slurm
+model_dir= # Path to the model checkpoint
+
 mtp_size=0
 ntasks_per_node=4 # 4 GPUs per GB200 node

@@ -9,10 +17,6 @@ isl=1024
 osl=1024
 multi_round=10
 streaming=true
-container_image=
-mounts= # e.g. /mnt/data:/mnt/data
-workdir= # Path to disaggr_torch.slurm
-model_dir= # Path to the model checkpoint

 # dep16 eplb0, 256, 288
 for b in 1 64 1024; do
@@ -21,6 +25,22 @@ for b in 1 64 1024; do
         ctx_num=$(((concurrency + 5499)/5500))
         total_node_num=$((ctx_num + 4))
         ntasks=$((total_node_num * ntasks_per_node))
+
+        args=(
+            ${ctx_num} 4 4 4480 true # Context servers arguments
+            1 16 1024 1024 "0.7" # Generation servers arguments
+            $eplb_num_slots $mtp_size # Other arguments
+            $concurrency # Benchmarking arguments
+            $isl
+            $osl
+            $multi_round
+            $streaming
+            $container_image # User specific arguments
+            $mounts
+            $workdir
+            $model_dir
+        )
+
         sbatch --nodes=${total_node_num} \
             --ntasks=${ntasks} \
             --ntasks-per-node=${ntasks_per_node} \
@@ -29,19 +49,7 @@ for b in 1 64 1024; do
             --job-name=${job_name} \
             --gres=gpu:${ntasks_per_node} \
             --segment=${total_node_num} \
-            ${workdir}/disaggr_torch.slurm \
-            ${ctx_num} 4 4 4480 true \ # Context servers arguments
-            1 16 1024 1024 "0.7" \ # Generation servers arguments
-            $eplb_num_slots $mtp_size \ # Other arguments
-            $concurrency \ # Benchmarking arguments
-            $isl \
-            $osl \
-            $multi_round \
-            $streaming \
-            $container_image \ # User specific arguments
-            $mounts \
-            $workdir \
-            $model_dir
+            ${workdir}/disaggr_torch.slurm "${args[@]}"
     done
 done

@@ -51,25 +59,30 @@ for b in 512; do
     ctx_num=$(((concurrency + 5499)/5500))
     total_node_num=$((ctx_num + 8))
     ntasks=$((total_node_num * ntasks_per_node))
+    eplb_num_slots=288
+
+    args=(
+        ${ctx_num} 4 4 4480 true # Context servers arguments
+        1 32 1024 1024 true "0.7" # Generation servers arguments
+        $eplb_num_slots $mtp_size # Other arguments
+        $concurrency # Benchmarking arguments
+        $isl
+        $osl
+        $multi_round
+        $streaming
+        $container_image # User specific arguments
+        $mounts
+        $workdir
+        $model_dir
+    )
+
     sbatch --nodes=${total_node_num} \
-           --ntasks=${ntasks} \
-           --ntasks-per-node=${ntasks_per_node} \
-           --partition=${partition} \
-           --account=${account} \
-           --job-name=${job_name} \
-           --gres=gpu:${ntasks_per_node} \
-           --segment=${total_node_num} \
-           ${workdir}/disaggr_torch.slurm \
-           ${ctx_num} 4 4 4480 true \ # Context servers arguments
-           1 32 1024 1024 true "0.7" \ # Generation servers arguments
-           $eplb_num_slots $mtp_size \ # Other arguments
-           $concurrency \ # Benchmarking arguments
-           $isl \
-           $osl \
-           $multi_round \
-           $streaming \
-           $container_image \ # User specific arguments
-           $mounts \
-           $workdir \
-           $model_dir
+        --ntasks=${ntasks} \
+        --ntasks-per-node=${ntasks_per_node} \
+        --partition=${partition} \
+        --account=${account} \
+        --job-name=${job_name} \
+        --gres=gpu:${ntasks_per_node} \
+        --segment=${total_node_num} \
+        ${workdir}/disaggr_torch.slurm "${args[@]}"
 done