diff --git a/examples/disaggregated/slurm/benchmark/README.md b/examples/disaggregated/slurm/benchmark/README.md index a81607b8bd4..7875d693ce1 100644 --- a/examples/disaggregated/slurm/benchmark/README.md +++ b/examples/disaggregated/slurm/benchmark/README.md @@ -34,17 +34,31 @@ It takes the following arguments in order: 1. `num_ctx_servers`: Number of context servers. 2. `ctx_tp_size`: Tensor parallel size for context servers. -3. `ctx_batch_size`: Max batch size for context servers. -4. `ctx_max_num_tokens`: Max number of tokens for context servers. -5. `ctx_enable_attention_dp`: `true` or `false` to enable attention DP for context servers. -6. `num_gen_servers`: Number of generation servers. -7. `gen_tp_size`: Tensor parallel size for generation servers. -8. `gen_batch_size`: Max batch size for generation servers. -9. `gen_max_num_tokens`: Max number of tokens for generation servers. -10. `gen_enable_attention_dp`: `true` or `false` to enable attention DP for generation servers. -11. `gen_gpu_memory_fraction`: GPU memory fraction for generation servers. -12. `concurrency_list`: A space-separated list of concurrencies to test (e.g., "1 2 4 8"). -13. `sub_file`: A subdirectory name for logs. +3. `ctx_pp_size`: Pipeline parallel size for context servers. +4. `ctx_batch_size`: Max batch size for context servers. +5. `ctx_max_num_tokens`: Max number of tokens for context servers. +6. `ctx_enable_attention_dp`: `true` or `false` to enable attention DP for context servers. +7. `ctx_gpu_memory_fraction`: GPU memory fraction for context servers. +8. `num_gen_servers`: Number of generation servers. +9. `gen_tp_size`: Tensor parallel size for generation servers. +10. `gen_pp_size`: Pipeline parallel size for generation servers. +11. `gen_batch_size`: Max batch size for generation servers. +12. `gen_max_num_tokens`: Max number of tokens for generation servers. +13. `gen_enable_attention_dp`: `true` or `false` to enable attention DP for generation servers. +14. `gen_gpu_memory_fraction`: GPU memory fraction for generation servers. +15. 
`eplb_num_slots`: Number of slots for eplb. +16. `mtp_size`: Number of nextn layers for MTP. +17. `concurrency`: Concurrency level for benchmarking. +18. `isl`: Input sequence length. +19. `osl`: Output sequence length. +20. `multi_round`: Number of rounds for the benchmark. +21. `streaming`: `true` or `false` for streaming mode. +22. `container_image`: Container image to use. +23. `mounts`: Container mounts. +24. `workdir`: Working directory. +25. `model_dir`: Model directory path. +26. `benchmark_mode`: Benchmark mode (e.g., `e2e` or `gen_only`). +27. `trtllm_repo`: TensorRT-LLM repository path. ### `gen_yaml.py` @@ -90,5 +104,5 @@ This script orchestrates the execution of the benchmark client. It waits for the 7. `disaggr_torch.slurm` starts the main `trtllm-serve` process. 8. `disaggr_torch.slurm` runs `run_benchmark.sh` which waits for the server to be ready. 9. `run_benchmark.sh` executes the benchmark for each concurrency level specified. -10. After the benchmark, `run_benchmark.sh` and `disaggr_torch.slurm` attempt to kill the server and worker processes. -11. Logs for each run are stored in a subdirectory specified by the `sub_file` parameter. +10. After the benchmark, `run_benchmark.sh` and `disaggr_torch.slurm` attempt to kill the server and worker processes. +11. Logs for each run are stored in a subdirectory under the working directory. 
diff --git a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm index b1e02c32fe1..377544ab23d 100644 --- a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm +++ b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm @@ -10,47 +10,51 @@ # Context servers arguments num_ctx_servers=${1} ctx_tp_size=${2} -ctx_batch_size=${3} -ctx_max_num_tokens=${4} -ctx_enable_attention_dp=${5} -ctx_gpu_memory_fraction=${6} +ctx_pp_size=${3} +ctx_batch_size=${4} +ctx_max_num_tokens=${5} +ctx_enable_attention_dp=${6} +ctx_gpu_memory_fraction=${7} # Generation servers arguments -num_gen_servers=${7} -gen_tp_size=${8} -gen_batch_size=${9} -gen_max_num_tokens=${10} -gen_enable_attention_dp=${11} -gen_gpu_memory_fraction=${12} +num_gen_servers=${8} +gen_tp_size=${9} +gen_pp_size=${10} +gen_batch_size=${11} +gen_max_num_tokens=${12} +gen_enable_attention_dp=${13} +gen_gpu_memory_fraction=${14} # Other arguments -eplb_num_slots=${13} -mtp_size=${14} +eplb_num_slots=${15} +mtp_size=${16} # Benchmarking arguments -concurrency=${15} -isl=${16} -osl=${17} -multi_round=${18} -streaming=${19} +concurrency=${17} +isl=${18} +osl=${19} +multi_round=${20} +streaming=${21} # User specific arguments -container_image=${20} -mounts=${21} -workdir=${22} -model_dir=${23} -benchmark_mode=${24} -trtllm_repo=${25} +container_image=${22} +mounts=${23} +workdir=${24} +model_dir=${25} +benchmark_mode=${26} +trtllm_repo=${27} echo "================= parameters =================" echo "num_ctx_servers: ${num_ctx_servers}" echo "ctx_tp_size: ${ctx_tp_size}" +echo "ctx_pp_size: ${ctx_pp_size}" echo "ctx_batch_size: ${ctx_batch_size}" echo "ctx_max_num_tokens: ${ctx_max_num_tokens}" echo "ctx_enable_attention_dp: ${ctx_enable_attention_dp}" echo "ctx_gpu_memory_fraction: ${ctx_gpu_memory_fraction}" echo "num_gen_servers: ${num_gen_servers}" echo "gen_tp_size: ${gen_tp_size}" +echo "gen_pp_size: ${gen_pp_size}" echo "gen_batch_size: 
${gen_batch_size}" echo "gen_max_num_tokens: ${gen_max_num_tokens}" echo "gen_enable_attention_dp: ${gen_enable_attention_dp}" @@ -83,8 +87,8 @@ full_logdir=${logdir}/ctx${num_ctx_servers}_gen${num_gen_servers}_dep${gen_tp_si echo "concurrency: ${concurrency}" -ctx_gpus=$((num_ctx_servers * ctx_tp_size)) -gen_gpus=$((num_gen_servers * gen_tp_size)) +ctx_gpus=$((num_ctx_servers * ctx_tp_size * ctx_pp_size)) +gen_gpus=$((num_gen_servers * gen_tp_size * gen_pp_size)) echo "enable_attention_dp: ${ctx_enable_attention_dp}, ${gen_enable_attention_dp}, gpu_memory_fraction: ${gen_gpu_memory_fraction}" @@ -132,6 +136,7 @@ srun -l --container-name=${container_name} \ --model ${model_dir} \ --num_ctx_servers ${num_ctx_servers} \ --ctx_tp_size ${ctx_tp_size} \ + --ctx_pp_size ${ctx_pp_size} \ --ctx_batch_size ${ctx_batch_size} \ --ctx_max_num_tokens ${ctx_max_num_tokens} \ --ctx_max_seq_len ${ctx_max_seq_len} \ @@ -139,6 +144,7 @@ srun -l --container-name=${container_name} \ --cache_transceiver_max_num_tokens ${cache_transceiver_max_num_tokens} \ --num_gen_servers ${num_gen_servers} \ --gen_tp_size ${gen_tp_size} \ + --gen_pp_size ${gen_pp_size} \ --gen_batch_size ${gen_batch_size} \ --gen_max_num_tokens ${gen_max_num_tokens} \ --gen_max_seq_len ${gen_max_seq_len} \ diff --git a/examples/disaggregated/slurm/benchmark/gen_yaml.py b/examples/disaggregated/slurm/benchmark/gen_yaml.py index b3865fd700a..e0ea7dd4369 100644 --- a/examples/disaggregated/slurm/benchmark/gen_yaml.py +++ b/examples/disaggregated/slurm/benchmark/gen_yaml.py @@ -123,6 +123,7 @@ def gen_config_file(config_path: str, model_path: str, num_ctx_servers: int, ctx_tp_size: int, + ctx_pp_size: int, ctx_batch_size: int, ctx_max_num_tokens: int, ctx_max_seq_len: int, @@ -130,6 +131,7 @@ def gen_config_file(config_path: str, ctx_enable_attention_dp: bool, num_gen_servers: int, gen_tp_size: int, + gen_pp_size: int, gen_batch_size: int, gen_max_num_tokens: int, gen_max_seq_len: int, @@ -148,6 +150,7 @@ def 
gen_config_file(config_path: str, model_path: Path to the model num_ctx_servers: Number of context servers ctx_tp_size: Tensor parallel size for context servers + ctx_pp_size: Pipeline parallel size for context servers ctx_batch_size: Batch size for context servers ctx_max_num_tokens: Max number of tokens for context servers ctx_max_seq_len: Max sequence length for context servers @@ -155,6 +158,7 @@ def gen_config_file(config_path: str, ctx_enable_attention_dp: Enable attention DP for context servers num_gen_servers: Number of generation servers gen_tp_size: Tensor parallel size for generation servers + gen_pp_size: Pipeline parallel size for generation servers gen_batch_size: Batch size for generation servers gen_max_num_tokens: Max number of tokens for generation servers gen_enable_attention_dp: Enable attention DP for generation servers @@ -187,7 +191,7 @@ def gen_config_file(config_path: str, 'tensor_parallel_size': ctx_tp_size, 'moe_expert_parallel_size': ctx_tp_size, 'enable_attention_dp': ctx_enable_attention_dp, - 'pipeline_parallel_size': 1, + 'pipeline_parallel_size': ctx_pp_size, 'print_iter_log': True, 'disable_overlap_scheduler': True, 'kv_cache_config': { @@ -205,7 +209,7 @@ def gen_config_file(config_path: str, 'tensor_parallel_size': gen_tp_size, 'moe_expert_parallel_size': gen_tp_size, 'enable_attention_dp': gen_enable_attention_dp, - 'pipeline_parallel_size': 1, + 'pipeline_parallel_size': gen_pp_size, 'max_batch_size': gen_batch_size, 'max_num_tokens': gen_max_num_tokens, 'max_seq_len': gen_max_seq_len, @@ -237,15 +241,15 @@ def gen_config_file(config_path: str, # Generate URLs for context and generation servers ctx_urls, task_nodes_offset = generate_urls("ctx", num_ctx_servers, - ctx_tp_size, 1, + ctx_tp_size, ctx_pp_size, max_tasks_per_node, nodes, task_nodes, node_ports) if num_ctx_servers > 0: config['context_servers']['urls'] = ctx_urls - gen_urls, _ = generate_urls("gen", num_gen_servers, gen_tp_size, 1, - max_tasks_per_node, nodes, 
task_nodes, - node_ports, task_nodes_offset) + gen_urls, _ = generate_urls("gen", num_gen_servers, gen_tp_size, + gen_pp_size, max_tasks_per_node, nodes, + task_nodes, node_ports, task_nodes_offset) config['generation_servers']['urls'] = gen_urls # set the hostname to the first node @@ -300,6 +304,10 @@ def gen_config_file(config_path: str, type=int, required=True, help="Tensor parallel size for context servers") + parser.add_argument("--ctx_pp_size", + type=int, + default=1, + help="Pipeline parallel size for context servers") parser.add_argument("--ctx_batch_size", type=int, required=True, @@ -328,6 +336,10 @@ def gen_config_file(config_path: str, type=int, required=True, help="Tensor parallel size for generation servers") + parser.add_argument("--gen_pp_size", + type=int, + default=1, + help="Pipeline parallel size for generation servers") parser.add_argument("--gen_batch_size", type=int, required=True, @@ -372,11 +384,11 @@ def gen_config_file(config_path: str, args = parser.parse_args() gen_config_file(args.config, args.model, args.num_ctx_servers, - args.ctx_tp_size, args.ctx_batch_size, + args.ctx_tp_size, args.ctx_pp_size, args.ctx_batch_size, args.ctx_max_num_tokens, args.ctx_max_seq_len, args.ctx_free_gpu_memory_fraction, args.ctx_enable_attention_dp, args.num_gen_servers, - args.gen_tp_size, args.gen_batch_size, + args.gen_tp_size, args.gen_pp_size, args.gen_batch_size, args.gen_max_num_tokens, args.gen_max_seq_len, args.gen_enable_attention_dp, args.gen_gpu_memory_fraction, args.eplb_num_slots, args.mtp_size, args.worker_start_port, diff --git a/examples/disaggregated/slurm/benchmark/submit.sh b/examples/disaggregated/slurm/benchmark/submit.sh index 512c10d72b4..6ecef39c18e 100644 --- a/examples/disaggregated/slurm/benchmark/submit.sh +++ b/examples/disaggregated/slurm/benchmark/submit.sh @@ -21,15 +21,15 @@ streaming=true benchmark_mode=e2e args=( - 1 4 4 4480 true "0.75" # Context servers arguments - 1 8 1024 1024 true "0.8" # Generation servers 
arguments - 0 0 # Other arguments - $concurrency # Benchmarking arguments + 1 4 1 4 4480 true "0.75" # Context servers arguments + 1 8 1 1024 1024 true "0.8" # Generation servers arguments + 0 0 # Other arguments + $concurrency # Benchmarking arguments $isl $osl $multi_round $streaming - $container_image # User specific arguments + $container_image # User specific arguments $mounts $workdir $model_dir diff --git a/examples/wide_ep/slurm_scripts/submit_e2e.sh b/examples/wide_ep/slurm_scripts/submit_e2e.sh index 4a4a305fd18..a649144286f 100644 --- a/examples/wide_ep/slurm_scripts/submit_e2e.sh +++ b/examples/wide_ep/slurm_scripts/submit_e2e.sh @@ -30,8 +30,8 @@ for b in 1 64 1024; do ntasks=$((total_node_num * ntasks_per_node)) args=( - ${ctx_num} 4 4 4480 true "0.85" # Context servers arguments - 1 16 1024 1024 true "0.7" # Generation servers arguments + ${ctx_num} 4 1 4 4480 true "0.85" # Context servers arguments + 1 16 1 1024 1024 true "0.7" # Generation servers arguments $eplb_num_slots $mtp_size # Other arguments $concurrency # Benchmarking arguments $isl @@ -68,8 +68,8 @@ for b in 512; do eplb_num_slots=288 args=( - ${ctx_num} 4 4 4480 true "0.85" # Context servers arguments - 1 32 1024 1024 true "0.7" # Generation servers arguments + ${ctx_num} 4 1 4 4480 true "0.85" # Context servers arguments + 1 32 1 1024 1024 true "0.7" # Generation servers arguments $eplb_num_slots $mtp_size # Other arguments $concurrency # Benchmarking arguments $isl diff --git a/examples/wide_ep/slurm_scripts/submit_gen_only.sh b/examples/wide_ep/slurm_scripts/submit_gen_only.sh index 29c0027d220..f146fb1ba14 100644 --- a/examples/wide_ep/slurm_scripts/submit_gen_only.sh +++ b/examples/wide_ep/slurm_scripts/submit_gen_only.sh @@ -30,8 +30,8 @@ for b in 1 64 1024; do ntasks=$((total_node_num * ntasks_per_node)) args=( - ${ctx_num} 4 4 4480 true "0.85" # Context servers arguments - 1 16 1024 1024 true "0.7" # Generation servers arguments + ${ctx_num} 4 1 4 4480 true "0.85" # Context 
servers arguments + 1 16 1 1024 1024 true "0.7" # Generation servers arguments $eplb_num_slots $mtp_size # Other arguments $concurrency # Benchmarking arguments $isl @@ -68,8 +68,8 @@ for b in 512; do eplb_num_slots=288 args=( - ${ctx_num} 4 4 4480 true "0.85" # Context servers arguments - 1 32 1024 1024 true "0.7" # Generation servers arguments + ${ctx_num} 4 1 4 4480 true "0.85" # Context servers arguments + 1 32 1 1024 1024 true "0.7" # Generation servers arguments $eplb_num_slots $mtp_size # Other arguments $concurrency # Benchmarking arguments $isl