NVIDIA · litaotju · Aug 29, 2025 · Aug 27, 2025
diff --git a/cpp/include/tensorrt_llm/deep_gemm/scheduler.cuh b/cpp/include/tensorrt_llm/deep_gemm/scheduler.cuh
@@ -379,7 +379,7 @@ struct GroupedMaskedScheduler
     }
 };
 
-// Need to keep the same as the one in tests/unittest/_torch/thop/deep_gemm_tests.py
+// Must be kept identical to its counterpart in tests/unittest/_torch/thop/parallel/deep_gemm_tests.py
 template <typename T_offset, typename T_index>
 __host__ __device__ __forceinline__ T_offset compute_padded_offset(T_offset offset, T_index problem_idx)
 {

diff --git a/tests/integration/defs/.test_durations b/tests/integration/defs/.test_durations
@@ -146,7 +146,8 @@
    "test_unittests.py::test_unittests_v2[unittest/_torch/multimodal]": 23.54,
    "test_unittests.py::test_unittests_v2[unittest/_torch/sampler]": 107.66,
    "test_unittests.py::test_unittests_v2[unittest/_torch/speculative]": 1850.16,
-   "test_unittests.py::test_unittests_v2[unittest/_torch/thop]": 852.56,
+   "test_unittests.py::test_unittests_v2[unittest/_torch/thop/parallel]": 311.58,
+   "test_unittests.py::test_unittests_v2[unittest/_torch/thop/serial]": 18.96,
    "test_unittests.py::test_unittests_v2[unittest/_torch/modeling -k \"modeling_mixtral\"]": 208.1838396479725,
    "test_unittests.py::test_unittests_v2[unittest/_torch/multi_gpu_modeling -k \"deepseek\"]": 393.0210295501165,
    "test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]": 21.019993914989755,

diff --git a/tests/integration/defs/agg_unit_mem_df.csv b/tests/integration/defs/agg_unit_mem_df.csv
@@ -101,9 +101,17 @@ unittest/trt/model/test_mamba.py,NVIDIA H100,10,
 "unittest/trt/attention/test_gpt_attention.py -k ""partition2""",NVIDIA L40,6,
 "unittest/trt/attention/test_gpt_attention.py -k ""partition3""",NVIDIA L40,6,
 "unittest/trt/attention/test_gpt_attention.py -k ""xqa_generic""",NVIDIA L40,3,
+unittest/_torch/attention,NVIDIA Graphics Device,4,B200 Bring Up Board
+unittest/_torch/misc,NVIDIA Graphics Device,4,B200 Bring Up Board
 unittest/_torch/speculative,NVIDIA Graphics Device,4,B200 Bring Up Board
-unittest/_torch/thop,NVIDIA Graphics Device,32,B200 Bring Up Board
+unittest/_torch/thop/parallel,NVIDIA Graphics Device,4,B200 Bring Up Board
 "unittest/_torch/auto_deploy/unit/singlegpu -k ""not test_trtllm_bench_backend_comparison""",NVIDIA Graphics Device,4,B200 Bring Up Board
+unittest/_torch/attention,NVIDIA B200,4,
+unittest/_torch/misc,NVIDIA B200,4,
 unittest/_torch/speculative,NVIDIA B200,4,
-unittest/_torch/thop,NVIDIA B200,32,
+unittest/_torch/thop/parallel,NVIDIA B200,4,
 "unittest/_torch/auto_deploy/unit/singlegpu -k ""not test_trtllm_bench_backend_comparison""",NVIDIA B200,4,
+unittest/_torch/attention,NVIDIA H100,4,
+unittest/_torch/misc,NVIDIA H100,4,
+unittest/_torch/speculative,NVIDIA H100,2,
+unittest/_torch/thop/parallel,NVIDIA H100,4,
diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml
@@ -51,7 +51,8 @@ l0_b200:
   - unittest/_torch/multimodal
   - unittest/_torch/sampler
   - unittest/_torch/speculative
-  - unittest/_torch/thop
+  - unittest/_torch/thop/parallel
+  - unittest/_torch/thop/serial
   - unittest/_torch/modeling -k "modeling_llama"
   - unittest/_torch/modeling -k "modeling_mixtral"
   - unittest/_torch/modeling -k "modeling_deepseek"

diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -23,7 +23,8 @@ l0_h100:
   - unittest/_torch/multimodal
   - unittest/_torch/sampler
   - unittest/_torch/speculative
-  - unittest/_torch/thop
+  - unittest/_torch/thop/parallel
+  - unittest/_torch/thop/serial
   # Only key models in H100: llama/mixtral/nemotron/deepseek
   - unittest/_torch/modeling -k "modeling_llama"
   - unittest/_torch/modeling -k "modeling_mixtral"

diff --git a/...s/unittest/_torch/thop/deep_gemm_tests.py → ...t/_torch/thop/parallel/deep_gemm_tests.py b/...s/unittest/_torch/thop/deep_gemm_tests.py → ...t/_torch/thop/parallel/deep_gemm_tests.py
diff --git a/...test/_torch/thop/test_causal_conv1d_op.py → ...ch/thop/parallel/test_causal_conv1d_op.py b/...test/_torch/thop/test_causal_conv1d_op.py → ...ch/thop/parallel/test_causal_conv1d_op.py
diff --git a/tests/unittest/_torch/thop/test_cublas_mm.py → ...st/_torch/thop/parallel/test_cublas_mm.py b/tests/unittest/_torch/thop/test_cublas_mm.py → ...st/_torch/thop/parallel/test_cublas_mm.py
diff --git a/...s/unittest/_torch/thop/test_custom_ops.py → ...t/_torch/thop/parallel/test_custom_ops.py b/...s/unittest/_torch/thop/test_custom_ops.py → ...t/_torch/thop/parallel/test_custom_ops.py
diff --git a/...est/_torch/thop/test_dsv3_fused_a_gemm.py → ...h/thop/parallel/test_dsv3_fused_a_gemm.py b/...est/_torch/thop/test_dsv3_fused_a_gemm.py → ...h/thop/parallel/test_dsv3_fused_a_gemm.py
diff --git a/...test/_torch/thop/test_dsv3_router_gemm.py → ...ch/thop/parallel/test_dsv3_router_gemm.py b/...test/_torch/thop/test_dsv3_router_gemm.py → ...ch/thop/parallel/test_dsv3_router_gemm.py
diff --git a/...thop/test_finegrained_mixed_dtype_gemm.py → ...llel/test_finegrained_mixed_dtype_gemm.py b/...thop/test_finegrained_mixed_dtype_gemm.py → ...llel/test_finegrained_mixed_dtype_gemm.py
diff --git a/...test/_torch/thop/test_fp4_bmm_quantize.py → ...ch/thop/parallel/test_fp4_bmm_quantize.py b/...test/_torch/thop/test_fp4_bmm_quantize.py → ...ch/thop/parallel/test_fp4_bmm_quantize.py
diff --git a/...h/thop/test_fp4_calculate_global_scale.py → ...rallel/test_fp4_calculate_global_scale.py b/...h/thop/test_fp4_calculate_global_scale.py → ...rallel/test_fp4_calculate_global_scale.py
diff --git a/...est/_torch/thop/test_fp4_gemm_quantize.py → ...h/thop/parallel/test_fp4_gemm_quantize.py b/...est/_torch/thop/test_fp4_gemm_quantize.py → ...h/thop/parallel/test_fp4_gemm_quantize.py
diff --git a/...s/unittest/_torch/thop/test_fp4_linear.py → ...t/_torch/thop/parallel/test_fp4_linear.py b/...s/unittest/_torch/thop/test_fp4_linear.py → ...t/_torch/thop/parallel/test_fp4_linear.py
diff --git a/.../unittest/_torch/thop/test_fp4_swizzle.py → .../_torch/thop/parallel/test_fp4_swizzle.py b/.../unittest/_torch/thop/test_fp4_swizzle.py → .../_torch/thop/parallel/test_fp4_swizzle.py
diff --git a/.../_torch/thop/test_fp8_block_scale_gemm.py → ...hop/parallel/test_fp8_block_scale_gemm.py b/.../_torch/thop/test_fp8_block_scale_gemm.py → ...hop/parallel/test_fp8_block_scale_gemm.py
diff --git a/...s/unittest/_torch/thop/test_fp8_linear.py → ...t/_torch/thop/parallel/test_fp8_linear.py b/...s/unittest/_torch/thop/test_fp8_linear.py → ...t/_torch/thop/parallel/test_fp8_linear.py
diff --git a/...p/test_fp8_per_tensor_scale_tllmg_gemm.py → ...l/test_fp8_per_tensor_scale_tllmg_gemm.py b/...p/test_fp8_per_tensor_scale_tllmg_gemm.py → ...l/test_fp8_per_tensor_scale_tllmg_gemm.py
diff --git a/...unittest/_torch/thop/test_fp8_quantize.py → ..._torch/thop/parallel/test_fp8_quantize.py b/...unittest/_torch/thop/test_fp8_quantize.py → ..._torch/thop/parallel/test_fp8_quantize.py
diff --git a/...st/_torch/thop/test_fp8_rowwise_linear.py → .../thop/parallel/test_fp8_rowwise_linear.py b/...st/_torch/thop/test_fp8_rowwise_linear.py → .../thop/parallel/test_fp8_rowwise_linear.py
diff --git a/...st/_torch/thop/test_fused_qk_norm_rope.py → .../thop/parallel/test_fused_qk_norm_rope.py b/...st/_torch/thop/test_fused_qk_norm_rope.py → .../thop/parallel/test_fused_qk_norm_rope.py
diff --git a/...est/_torch/thop/test_logits_bitmask_op.py → ...h/thop/parallel/test_logits_bitmask_op.py b/...est/_torch/thop/test_logits_bitmask_op.py → ...h/thop/parallel/test_logits_bitmask_op.py
diff --git a/...torch/thop/test_mamba2_chunk_ss_update.py → ...p/parallel/test_mamba2_chunk_ss_update.py b/...torch/thop/test_mamba2_chunk_ss_update.py → ...p/parallel/test_mamba2_chunk_ss_update.py
diff --git a/...ttest/_torch/thop/test_mamba_conv1d_op.py → ...rch/thop/parallel/test_mamba_conv1d_op.py b/...ttest/_torch/thop/test_mamba_conv1d_op.py → ...rch/thop/parallel/test_mamba_conv1d_op.py
diff --git a/tests/unittest/_torch/thop/test_moe.py → ...unittest/_torch/thop/parallel/test_moe.py b/tests/unittest/_torch/thop/test_moe.py → ...unittest/_torch/thop/parallel/test_moe.py
diff --git a/tests/unittest/_torch/thop/test_noaux_tc.py → ...est/_torch/thop/parallel/test_noaux_tc.py b/tests/unittest/_torch/thop/test_noaux_tc.py → ...est/_torch/thop/parallel/test_noaux_tc.py
diff --git a/tests/unittest/_torch/thop/test_scaled_mm.py → ...st/_torch/thop/parallel/test_scaled_mm.py b/tests/unittest/_torch/thop/test_scaled_mm.py → ...st/_torch/thop/parallel/test_scaled_mm.py
diff --git a/...est/_torch/thop/test_selective_scan_op.py → ...h/thop/parallel/test_selective_scan_op.py b/...est/_torch/thop/test_selective_scan_op.py → ...h/thop/parallel/test_selective_scan_op.py
diff --git a/tests/unittest/_torch/thop/test_tllmg_bmm.py → ...st/_torch/thop/parallel/test_tllmg_bmm.py b/tests/unittest/_torch/thop/test_tllmg_bmm.py → ...st/_torch/thop/parallel/test_tllmg_bmm.py
diff --git a/...unittest/_torch/thop/test_w4a16_linear.py → ..._torch/thop/parallel/test_w4a16_linear.py b/...unittest/_torch/thop/test_w4a16_linear.py → ..._torch/thop/parallel/test_w4a16_linear.py
diff --git a/.../unittest/_torch/thop/test_w4a8_linear.py → .../_torch/thop/parallel/test_w4a8_linear.py b/.../unittest/_torch/thop/test_w4a8_linear.py → .../_torch/thop/parallel/test_w4a8_linear.py
diff --git a/..._torch/thop/test_w4a8_mxfp4_mxfp8_gemm.py → ...op/parallel/test_w4a8_mxfp4_mxfp8_gemm.py b/..._torch/thop/test_w4a8_mxfp4_mxfp8_gemm.py → ...op/parallel/test_w4a8_mxfp4_mxfp8_gemm.py
diff --git a/...torch/thop/test_weight_only_quant_gemm.py → ...p/parallel/test_weight_only_quant_gemm.py b/...torch/thop/test_weight_only_quant_gemm.py → ...p/parallel/test_weight_only_quant_gemm.py
diff --git a/...rch/thop/test_weight_only_quant_linear.py → ...parallel/test_weight_only_quant_linear.py b/...rch/thop/test_weight_only_quant_linear.py → ...parallel/test_weight_only_quant_linear.py
diff --git a/...unittest/_torch/thop/test_moe_alltoall.py → ...t/_torch/thop/serial/test_moe_alltoall.py b/...unittest/_torch/thop/test_moe_alltoall.py → ...t/_torch/thop/serial/test_moe_alltoall.py