From 3bbcceb3b1bfcebe234afe7326e025cf195e6adb Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Wed, 20 Aug 2025 04:11:49 +0000 Subject: [PATCH] Add MNNVL to unittest and speed up weight creation Signed-off-by: Tailing Yuan --- .../_torch/modules/fused_moe/fused_moe_wide_ep.py | 2 +- tests/integration/test_lists/test-db/l0_dgx_b200.yml | 1 + tests/integration/test_lists/test-db/l0_dgx_h100.yml | 1 + tests/unittest/_torch/modules/test_fused_moe.py | 9 ++++++--- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py index 9fee27e6c93..b2699c5ff6e 100755 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py @@ -464,7 +464,7 @@ def forward_chunk( self.dummy_allreduce() token_count = x.shape[0] alltoall_info = None - if is_last_call: + if self.layer_load_balancer and is_last_call: loadbalancer_local_statistic_info = self.layer_load_balancer.get_local_statistic_tensor( ) else: diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index fb3f518a686..0ca60357479 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -15,6 +15,7 @@ l0_dgx_b200: backend: pytorch tests: - unittest/_torch/multi_gpu_modeling -k "deepseek" + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[MNNVL] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[DeepEP] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[DeepEPLowLatency] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=FLASHINFER-torch_compile=False] diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index 9b6d5b6f1fc..90c67d68114 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -73,6 +73,7 @@ l0_dgx_h100: auto_trigger: deepseek tests: - unittest/_torch/multi_gpu_modeling -k "deepseek" + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[MNNVL] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEP] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEPLowLatency] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4afp8[MoEWeightLoadingMode.VANILLA-dtype0] diff --git a/tests/unittest/_torch/modules/test_fused_moe.py b/tests/unittest/_torch/modules/test_fused_moe.py index 2d11971d99e..5df59b39e9c 100644 --- a/tests/unittest/_torch/modules/test_fused_moe.py +++ b/tests/unittest/_torch/modules/test_fused_moe.py @@ -212,11 +212,14 @@ def per_rank_test_fused_moe_alltoall(job_id): weights = {} for expert_id in range(NUM_EXPERTS): w1_weight = torch.empty((INTERMEDIATE_SIZE, HIDDEN_SIZE), - dtype=dtype) + dtype=dtype, + device="cuda") w2_weight = torch.empty((HIDDEN_SIZE, INTERMEDIATE_SIZE), - dtype=dtype) + dtype=dtype, + device="cuda") w3_weight = torch.empty((INTERMEDIATE_SIZE, HIDDEN_SIZE), - dtype=dtype) + dtype=dtype, + device="cuda") torch.nn.init.xavier_uniform_(w1_weight) torch.nn.init.xavier_uniform_(w2_weight) torch.nn.init.xavier_uniform_(w3_weight)