csarofeen · zasdfgbnm · Oct 4, 2022 · Oct 4, 2022 · Oct 4, 2022
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp
@@ -7853,6 +7853,74 @@ TEST_F(NVFuserTest, FusionReductionScheduler_CUDA) {
       lparams);
 }
 
+// This test checks that our system correctly handles the case where both a
+// reduction and a trivial reduction exist in the fusion. Trivial reductions
+// deserve testing because they are handled more like a broadcast than like a
+// reduction.
+TEST_F(NVFuserTest, FusionReductionWithTrivialReduction_CUDA) {
+  constexpr int bid_x = 80;
+  constexpr int tid_x = 4096;
+
+  // Each shape has exactly one size-1 axis; -1 marks an axis whose extent is
+  // filled in below from {bid_x, tid_x}, in order.
+  const std::vector<std::vector<int64_t>> shapes = {
+      {-1, -1, 1}, {-1, 1, -1}, {1, -1, -1}};
+
+  // Every non-empty subset of the three axes. Loop-invariant, so it is built
+  // once instead of once per shape.
+  const std::vector<std::vector<int64_t>> reduction_dims = {
+      {0}, {1}, {2}, {0, 1}, {0, 2}, {1, 2}, {0, 1, 2}};
+
+  for (const auto& shape : shapes) {
+    std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+    Fusion& fusion = *fusion_ptr;
+    FusionGuard fg(&fusion);
+
+    // Set up your input tensor views
+    TensorView* tv0 = makeConcreteTensor(shape);
+    fusion.addInput(tv0);
+
+    // One fusion output per axis combination; sum() is given int axes here.
+    for (const auto& rdims : reduction_dims) {
+      std::vector<int> rdims_(rdims.begin(), rdims.end());
+      auto tv = sum(tv0, rdims_);
+      fusion.addOutput(tv);
+    }
+
+    const auto options =
+        at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+
+    // Replace each -1 placeholder with the next concrete extent.
+    auto concrete_shape = shape;
+    std::deque<int64_t> concrete_values = {bid_x, tid_x};
+    for (auto& s : concrete_shape) {
+      if (s == -1) {
+        s = concrete_values.front();
+        concrete_values.pop_front();
+      }
+    }
+
+    // ATen reference results, in the same order as the fusion outputs.
+    at::Tensor aten_input = at::randn(concrete_shape, options);
+    std::vector<at::Tensor> aten_outputs;
+    for (const auto& rdims : reduction_dims) {
+      aten_outputs.push_back(aten_input.sum(rdims));
+    }
+
+    FusionExecutorCache executor_cache(std::move(fusion_ptr));
+    auto cg_outputs = executor_cache.runFusionWithInputs({aten_input});
+
+    testValidate(
+        &fusion,
+        cg_outputs,
+        {aten_input},
+        aten_outputs,
+        __LINE__,
+        __FILE__,
+        "");
+  }
+}
+
 // Simple reduction parallelized on a symbolic size.
 TEST_F(NVFuserTest, FusionSymbolicReduction_CUDA) {
   Fusion fusion;