
Commit 4d8daea

more benchmark and test extension

1 parent: 2efe1b6

2 files changed: +113, -10 lines

benchmarks/cpp/nvfuser/matmul.cpp

Lines changed: 60 additions & 10 deletions
@@ -241,7 +241,7 @@ MatmulParam getMatmulParams(
   return params;
 }
 
-static void Nvfuser_Matmul_4warp(
+static void Nvfuser_Matmul_4warp3stage(
     benchmark::State& benchmark_state,
     MatmulLayout layout) {
   auto cta_tile = GemmTile(128, 128, 32);
@@ -256,7 +256,7 @@ static void Nvfuser_Matmul_4warp(
   SingleMatmulBase(benchmark_state, layout, params);
 }
 
-static void Nvfuser_Matmul_8warp(
+static void Nvfuser_Matmul_8warp3stage(
     benchmark::State& benchmark_state,
     MatmulLayout layout) {
   auto cta_tile = GemmTile(256, 128, 32);
@@ -271,6 +271,36 @@ static void Nvfuser_Matmul_8warp(
   SingleMatmulBase(benchmark_state, layout, params);
 }
 
+static void Nvfuser_Matmul_4warp4stage(
+    benchmark::State& benchmark_state,
+    MatmulLayout layout) {
+  auto cta_tile = GemmTile(128, 128, 32);
+  int number_of_stage = 4;
+
+  auto params = getMatmulParams(cta_tile, number_of_stage, layout);
+
+  NVFUSER_BENCHMARK_ARCH_SMEM_GUARD(
+      8, 0, getSmemSize(cta_tile, number_of_stage), benchmark_state);
+
+  // Run benchmark:
+  SingleMatmulBase(benchmark_state, layout, params);
+}
+
+static void Nvfuser_Matmul_8warp4stage(
+    benchmark::State& benchmark_state,
+    MatmulLayout layout) {
+  auto cta_tile = GemmTile(256, 128, 32);
+  int number_of_stage = 4;
+
+  auto params = getMatmulParams(cta_tile, number_of_stage, layout);
+
+  NVFUSER_BENCHMARK_ARCH_SMEM_GUARD(
+      8, 0, getSmemSize(cta_tile, number_of_stage), benchmark_state);
+
+  // Run benchmark:
+  SingleMatmulBase(benchmark_state, layout, params);
+}
+
 // ----------------------------- Benchmark Instantiation-------
 
 // Common utils:
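An aside on the NVFUSER_BENCHMARK_ARCH_SMEM_GUARD(8, 0, ...) calls above: they skip the benchmark unless the device is at least sm_80 and offers enough shared memory for the requested pipeline depth. A back-of-the-envelope sketch of why the 4-stage variants need the guard, assuming getSmemSize (whose definition is not part of this diff) counts just the two circularly buffered half-precision operand tiles:

#include <cstdio>

// Hedged sketch: approximate smem footprint of a multi-stage matmul
// pipeline, assuming only the A and B operand tiles are staged in
// shared memory as fp16 (2 bytes per element). This is NOT the actual
// getSmemSize from the benchmark, just the dominant term.
int approxSmemBytes(int m, int n, int k, int stages) {
  int elems_per_stage = m * k + n * k; // one A tile + one B tile
  return stages * elems_per_stage * 2; // 2 bytes per half
}

int main() {
  // 4-warp tile (128x128x32), 4 stages: 65536 bytes = 64 KB.
  std::printf("4warp4stage: %d bytes\n", approxSmemBytes(128, 128, 32, 4));
  // 8-warp tile (256x128x32), 4 stages: 98304 bytes = 96 KB.
  std::printf("8warp4stage: %d bytes\n", approxSmemBytes(256, 128, 32, 4));
}

Under that assumption the 8-warp, 4-stage configuration needs roughly 96 KB of shared memory per CTA, well beyond the 48 KB default limit and only reachable with the dynamic shared-memory opt-in on sm_80-class devices, which is what the guard checks for.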
@@ -286,21 +316,41 @@ static void Nvfuser_Matmul_8warp(
   run(NT, MatmulLayout::NT)
 
 // Instantiations:
-#define Nvfuser_4warp_test(layout_label, layout)                           \
-  BENCHMARK_CAPTURE(                                                       \
-      Nvfuser_Matmul_4warp, no_quant_nvfuser_4warp_##layout_label, layout) \
+#define Nvfuser_4warp3stage_test(layout_label, layout) \
+  BENCHMARK_CAPTURE(                                   \
+      Nvfuser_Matmul_4warp3stage,                      \
+      no_quant_nvfuser_4warp_##layout_label,           \
+      layout)                                          \
+      ->NO_TILE_QUANTIZATION_ARGS
+
+#define Nvfuser_8warp3stage_test(layout_label, layout) \
+  BENCHMARK_CAPTURE(                                   \
+      Nvfuser_Matmul_8warp3stage,                      \
+      no_quant_nvfuser_8warp_##layout_label,           \
+      layout)                                          \
+      ->NO_TILE_QUANTIZATION_ARGS
+
+#define Nvfuser_4warp4stage_test(layout_label, layout) \
+  BENCHMARK_CAPTURE(                                   \
+      Nvfuser_Matmul_4warp4stage,                      \
+      no_quant_nvfuser_4warp_##layout_label,           \
+      layout)                                          \
       ->NO_TILE_QUANTIZATION_ARGS
 
-#define Nvfuser_8warp_test(layout_label, layout)                           \
-  BENCHMARK_CAPTURE(                                                       \
-      Nvfuser_Matmul_8warp, no_quant_nvfuser_8warp_##layout_label, layout) \
+#define Nvfuser_8warp4stage_test(layout_label, layout) \
+  BENCHMARK_CAPTURE(                                   \
+      Nvfuser_Matmul_8warp4stage,                      \
+      no_quant_nvfuser_8warp_##layout_label,           \
+      layout)                                          \
       ->NO_TILE_QUANTIZATION_ARGS
 
 #define Eagermode_test(layout_label, layout)                      \
   BENCHMARK_CAPTURE(                                              \
       EagerModeMatmul, no_quant_eagermode_##layout_label, layout) \
       ->NO_TILE_QUANTIZATION_ARGS
 
-ForAllLayouts(Nvfuser_4warp_test);
-ForAllLayouts(Nvfuser_8warp_test);
+ForAllLayouts(Nvfuser_4warp3stage_test);
+ForAllLayouts(Nvfuser_4warp4stage_test);
+ForAllLayouts(Nvfuser_8warp3stage_test);
+ForAllLayouts(Nvfuser_8warp4stage_test);
 ForAllLayouts(Eagermode_test);
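For reference, a sketch of how these instantiation macros compose, assuming ForAllLayouts is the usual stamp-out helper in this file (only its run(NT, MatmulLayout::NT) arm is visible in the hunk context above; the TT and TN arms are an assumption):

// Assumed shape of the helper, extrapolated from the visible NT arm:
#define ForAllLayouts(run)   \
  run(TT, MatmulLayout::TT); \
  run(TN, MatmulLayout::TN); \
  run(NT, MatmulLayout::NT)

// So ForAllLayouts(Nvfuser_4warp4stage_test) expands into three
// google-benchmark registrations, e.g. for the TT layout:
//   BENCHMARK_CAPTURE(
//       Nvfuser_Matmul_4warp4stage,
//       no_quant_nvfuser_4warp_TT,
//       MatmulLayout::TT)
//       ->NO_TILE_QUANTIZATION_ARGS;

Each new pipeline-depth variant therefore adds one benchmark per operand layout, which is why the four ForAllLayouts lines at the end of the hunk cover the full 4/8-warp by 3/4-stage grid.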

torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp

Lines changed: 53 additions & 0 deletions
@@ -3056,6 +3056,59 @@ TEST_F(NVFuserTest, FusionTuringMatmulLargeLoad_CUDA) {
   }
 }
 
+// Matmul test on Ampere using ldmatrix.x4 to load operands
+TEST_F(NVFuserTest, FusionAmpereMatmulLargeLoadLargeK_CUDA) {
+  // Keep multiples of 8 to keep vectorizable.
+  int M = 504, N = 136, K = 2048;
+  for (auto layout : kAllSupportedLayout) {
+    Fusion fusion;
+    FusionGuard fg(&fusion);
+    auto tv0 = makeContigTensor(2, DataType::Half);
+    auto tv1 = makeContigTensor(2, DataType::Half);
+
+    fusion.addInput(tv0);
+    fusion.addInput(tv1);
+
+    auto tv2 = matmul(tv0, tv1, layout);
+
+    fusion.addOutput(tv2);
+
+    MatMulTileOptions gemm_tile;
+    gemm_tile.cta_tile = GemmTile(128, 128, 64);
+    gemm_tile.warp_tile = GemmTile(64, 64, 64);
+    gemm_tile.instruction_tile = GemmTile(16, 16, 16);
+
+    auto mma_builder =
+        MmaBuilder(MmaOptions::MacroType::Ampere_16_16_16, gemm_tile)
+            .layout(layout);
+
+    MatmulParam params(mma_builder);
+    params.tile_sizes = gemm_tile;
+    params.async_gmem_load_operands = true;
+    params.double_buffer_options.double_buffer_smem_write = true;
+    params.double_buffer_options.double_buffer_smem_read = true;
+    params.double_buffer_options.smem_double_buffer_stage = 3;
+    scheduleMatmul(tv2, tv0, tv1, params);
+
+    at::manual_seed(0);
+    auto inputs = fp16MatmulAtInput(M, N, K, layout);
+
+    CompileOptions co;
+    co.index_mode = KernelIndexMode::INT32;
+
+    FusionExecutor fe;
+    NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
+        8,
+        0,
+        fe.compileFusion(
+            &fusion, {inputs.first, inputs.second}, LaunchParams(), co));
+    auto cg_outputs = fe.runFusion({inputs.first, inputs.second});
+    auto tref = atMatmul(
+        inputs.first.to(at::kFloat), inputs.second.to(at::kFloat), layout);
+    TORCH_CHECK(cg_outputs[0].allclose(tref, 0.001, 0.001));
+  }
+}
+
 #undef NVFUSER_TEST_CUDA_ARCH_GUARD
 
 } // namespace jit
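The double_buffer_options set in this test (async gmem loads, smem double buffering on both write and read, smem_double_buffer_stage = 3) ask the scheduler to keep several K tiles in flight at once. The code nvfuser actually generates is far more involved, but a minimal hand-written CUDA sketch of the same multi-stage circular-buffering pattern, with illustrative names and tile sizes and a plain float payload standing in for the fp16 operands, looks like this:

#include <cuda_pipeline.h>

constexpr int kStages = 3; // matches smem_double_buffer_stage = 3
constexpr int kTileK = 32; // illustrative tile width, one element per thread

// Launch as: stagedReduceK<<<1, kTileK>>>(gmem, out, k_tiles);
__global__ void stagedReduceK(const float* gmem, float* out, int k_tiles) {
  __shared__ float smem[kStages][kTileK];
  const int tid = threadIdx.x;

  // Prologue: asynchronously prefetch the first kStages - 1 tiles,
  // committing one (possibly empty) batch per stage so the batch
  // count stays uniform even for tiny k_tiles.
  for (int s = 0; s < kStages - 1; ++s) {
    if (s < k_tiles) {
      __pipeline_memcpy_async(
          &smem[s][tid], &gmem[s * kTileK + tid], sizeof(float));
    }
    __pipeline_commit();
  }

  float acc = 0.f;
  for (int k = 0; k < k_tiles; ++k) {
    // Block until tile k has landed; up to kStages - 2 younger
    // copies stay in flight behind the compute.
    __pipeline_wait_prior(kStages - 2);
    __syncthreads();

    acc += smem[k % kStages][tid]; // stand-in for the tensor-core math

    // Refill the slot freed at the end of the previous iteration with
    // tile k + kStages - 1; the barrier above guarantees no thread is
    // still reading it.
    const int next = k + kStages - 1;
    if (next < k_tiles) {
      __pipeline_memcpy_async(
          &smem[next % kStages][tid],
          &gmem[next * kTileK + tid],
          sizeof(float));
    }
    __pipeline_commit(); // one batch per iteration, even when empty
  }
  out[tid] = acc;
}

Each iteration consumes the oldest staged tile while the younger cp.async copies are still outstanding, which is what hides global-memory latency behind the math; bumping the benchmark variants from 3 to 4 stages deepens exactly this queue at the cost of the extra shared memory estimated earlier.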
