
Commit f10afcd

issue 1189 repro and fix (pytorch#1193)
1 parent bee312c commit f10afcd
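
In short (as read from the diff below): the commit adds a C++ regression test reproducing issue 1189 and fixes it by passing the GpuLower into executor_utils::getParallelBindingsIterDomains, so that parallelized broadcast IterDomains are no longer skipped unconditionally; only those the compute-at parallel map resolves to another domain are dropped, and unresolved parallel broadcast axes are still included when binding launch parameters.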

File tree

4 files changed: +62 -6 lines changed


test/cpp/jit/test_gpu.cpp

Lines changed: 41 additions & 0 deletions
@@ -17250,6 +17250,47 @@ TEST(NVFuserTest, FusionUnswitchPredicate_CUDA) {
   testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__);
 }
 
+TEST(NVFuserTest, FusionIssue1189_CUDA) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  auto tv0 = makeConcreteTensor({16, 16});
+  auto tv1 = makeConcreteTensor({16, 16});
+
+  auto tv0b = broadcast(tv0, {false, false, true});
+  auto tv1b = broadcast(tv1, {false, false, true});
+
+  fusion.addInput(tv0b);
+  fusion.addInput(tv1b);
+
+  auto tv2 = add(tv0b, tv1b);
+  auto tv3 = sum(tv2, {1});
+  fusion.addOutput(tv3);
+
+  auto parallelize = [](auto tv) {
+    tv->axis(0)->parallelize(ParallelType::TIDx);
+    tv->axis(1)->parallelize(ParallelType::BIDx);
+    tv->axis(2)->parallelize(ParallelType::BIDy);
+  };
+
+  parallelize(tv0b);
+  parallelize(tv1b);
+  parallelize(tv2);
+  parallelize(tv3);
+
+  FusionExecutor fe;
+  fe.compileFusion(&fusion);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({16, 16, 1}, options);
+  at::Tensor t1 = at::randn({16, 16, 1}, options);
+  auto outputs = fe.runFusion({t0, t1});
+
+  auto ref = (t0 + t1).sum({1});
+
+  testValidate(&fusion, outputs, {t0, t1}, {ref}, __LINE__, __FILE__);
+}
+
 TEST(NVFuserTest, FusionIssue1052_CUDA) {
   Fusion fusion;
   FusionGuard fg(&fusion);
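
Note that each input in this repro carries an extent-1 broadcast axis that the parallelize lambda binds to BIDy and that nothing in the fusion resolves against a concrete domain; with the pre-fix filtering shown further below, that axis would have been excluded from the parallel binding IterDomains.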

torch/csrc/jit/codegen/cuda/executor.cpp

Lines changed: 5 additions & 4 deletions
@@ -339,18 +339,19 @@ LaunchParams FusionExecutor::computeLaunchParams(
 
   auto data_cache = compileTimeDataCache();
 
+  auto& lower = lowered_;
+
   auto& used_tvs = getUsedTVs();
   auto parallel_binding_ids_entry =
       executor_utils::caching::ExecutorCompileTimeEntry<
           executor_utils::caching::ParallelBindingIterDomains>(
-          data_cache, [&used_tvs]() {
+          data_cache, [&used_tvs, &lower]() {
             return std::make_unique<std::vector<IterDomain*>>(
-                executor_utils::getParallelBindingsIterDomains(used_tvs));
+                executor_utils::getParallelBindingsIterDomains(
+                    lower, used_tvs));
           });
   auto& parallel_binding_ids = parallel_binding_ids_entry.get();
 
-  auto& lower = lowered_;
-
   auto parallel_iter_extent_entry =
       executor_utils::caching::ExecutorCompileTimeEntry<
           executor_utils::caching::ParallelIterExtentMap>(
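
Here the `lowered_` alias is hoisted above the compile-time cache entry and captured by the lambda because getParallelBindingsIterDomains now takes the GpuLower (it consults the compute-at parallel map, as shown in the next file) to decide which parallel IterDomains to keep.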

torch/csrc/jit/codegen/cuda/executor_utils.cpp

Lines changed: 15 additions & 2 deletions
@@ -989,12 +989,25 @@ template class ExecutorCompileTimeEntry<OutputAliasIndices>;
 } // namespace caching
 
 std::vector<IterDomain*> getParallelBindingsIterDomains(
+    GpuLower& lower,
     const std::vector<TensorView*>& used_tvs) {
   std::vector<IterDomain*> parallel_ids;
   for (auto tv : used_tvs) {
     for (auto id : tv->domain()->domain()) {
-      if (id->isThread() && !id->isBroadcast()) {
-        parallel_ids.push_back(id);
+      if (id->isThread()) {
+        if (id->isBroadcast()) {
+          // Want to keep the broadcast dimensions if they are not resolved
+          // TODO: piping down the parallel dimension map here would
+          // be helpful
+          auto& parallel_map = lower.caParallelMap();
+          if (parallel_map.getConcreteMappedID(id) == id) {
+            parallel_ids.push_back(id);
+          }
+        } else {
+          // Non broadcast ids are directly added to the binding
+          // ids.
+          parallel_ids.push_back(id);
+        }
       }
     }
   }
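
The broadcast branch above is the substance of the fix: a parallelized broadcast IterDomain is kept as a binding id only when lower.caParallelMap() maps it to itself, i.e. when no other domain resolves it. A minimal, self-contained sketch of that rule follows; FakeIterDomain, FakeParallelMap, and collectParallelBindingIds are invented stand-ins for this example, not nvfuser types.

#include <cassert>
#include <unordered_map>
#include <vector>

// Stand-in for IterDomain: only the two flags the filter looks at.
struct FakeIterDomain {
  bool is_thread;    // parallelized with a TIDx/TIDy/BIDx/... type
  bool is_broadcast; // extent-1 broadcast axis
};

// Stand-in for the compute-at parallel map: a resolved broadcast maps to the
// concrete domain that resolves it; anything unmapped maps back to itself.
struct FakeParallelMap {
  std::unordered_map<const FakeIterDomain*, const FakeIterDomain*> concrete;
  const FakeIterDomain* getConcreteMappedID(const FakeIterDomain* id) const {
    auto it = concrete.find(id);
    return it == concrete.end() ? id : it->second;
  }
};

// Mirrors the post-fix filtering: keep every non-broadcast parallel id, and
// keep a broadcast parallel id only when the map leaves it unresolved.
std::vector<const FakeIterDomain*> collectParallelBindingIds(
    const std::vector<const FakeIterDomain*>& ids,
    const FakeParallelMap& map) {
  std::vector<const FakeIterDomain*> out;
  for (auto id : ids) {
    if (!id->is_thread) {
      continue; // serial axes never bind launch parameters
    }
    if (!id->is_broadcast || map.getConcreteMappedID(id) == id) {
      out.push_back(id);
    }
  }
  return out;
}

int main() {
  FakeIterDomain concrete_axis{true, false};
  FakeIterDomain resolved_bcast{true, true};
  FakeIterDomain unresolved_bcast{true, true};

  FakeParallelMap map;
  map.concrete[&resolved_bcast] = &concrete_axis; // resolved against a real axis

  auto ids = collectParallelBindingIds(
      {&concrete_axis, &resolved_bcast, &unresolved_bcast}, map);

  // The concrete axis and the unresolved broadcast survive the filter; the
  // pre-fix code would also have dropped the unresolved one (issue 1189).
  assert(ids.size() == 2);
  return 0;
}

Compiled as a plain C++ program this just asserts that the unresolved broadcast survives the filter while the resolved one is dropped.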

torch/csrc/jit/codegen/cuda/executor_utils.h

Lines changed: 1 addition & 0 deletions
@@ -282,6 +282,7 @@ class ExecutorCompileTimeEntry {
 //! Returns the vector of tensorviews that will be used to bind parallel
 //! dimensions.
 std::vector<IterDomain*> getParallelBindingsIterDomains(
+    GpuLower& lower,
     const std::vector<TensorView*>& used_tvs);
 
 using ParallelExtentMap =
