Skip to content

Commit b34e3b9

Browse files
authored
Fix ir_utils::hasBlockSync + misc fixes in transpose scheduler (#1924)
1 parent 14a53e6 commit b34e3b9

File tree

5 files changed

+92
-5
lines changed

5 files changed

+92
-5
lines changed

torch/csrc/jit/codegen/cuda/codegen.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -754,10 +754,12 @@ class CudaKernelGenerator : private OptOutConstDispatch {
754754
auto out_tv = rop->output(0)->as<kir::TensorIndex>()->view();
755755
auto index = genTensorIndex(rop->getPhiloxIndex()->as<kir::TensorIndex>());
756756
int multiple = out_tv->getDataType() == DataType::Double ? 2 : 4;
757-
indent() << "nvfuser_index_t rng_subseq" << rop->name() << " = (" << index
758-
<< ") / " << multiple << ";\n";
759-
indent() << "nvfuser_index_t rng_component" << rop->name() << " = ("
760-
<< index << ") % " << multiple << ";\n";
757+
indent() << "nvfuser_index_t linear_index" << rop->name() << " = " << index
758+
<< ";\n";
759+
indent() << "nvfuser_index_t rng_subseq" << rop->name() << " = linear_index"
760+
<< rop->name() << " / " << multiple << ";\n";
761+
indent() << "nvfuser_index_t rng_component" << rop->name()
762+
<< " = linear_index" << rop->name() << " % " << multiple << ";\n";
761763
indent() << "nvfuser_index_t rng_offset" << rop->name() << " = "
762764
<< rop->getRNGOffset() << ";\n";
763765
indent() << "if (rng_subseq != rng_subseq" << rop->name()

torch/csrc/jit/codegen/cuda/lower_utils.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,10 @@ bool isScalarOp(const Expr* expr) {
204204
}
205205

206206
bool hasBlockSync(const Expr* expr, const ThreadPredicateMap& pred_map) {
207+
if (expr->isA<kir::BlockSync>()) {
208+
return true;
209+
}
210+
207211
if (!isTvOp(expr)) {
208212
return false;
209213
}

torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,9 @@ class DomainMap : public pointwise_utils::DomainMap {
109109
decltype(input_tvs)* tv_filtered_groups[2] = {&output_tvs, &input_tvs};
110110
for (auto tv_filtered_group : tv_filtered_groups) {
111111
for (auto tv : *tv_filtered_group) {
112+
if (tv->isFusionInput() && tv->uses().empty()) {
113+
continue;
114+
}
112115
if (grouped.count(tv) > 0) {
113116
continue;
114117
}
@@ -653,7 +656,7 @@ void scheduleTranspose(Fusion* fusion, TransposeParams params) {
653656
if (inner_most_pos2_in_ref1 > inner_most_pos1_in_ref1) {
654657
inner_most_pos2_in_ref1--;
655658
}
656-
if (!merged2.has_value() && *merged2 > inner_most_pos1_in_ref1) {
659+
if (merged2.has_value() && *merged2 > inner_most_pos1_in_ref1) {
657660
(*merged2)--;
658661
}
659662
reference1->merge(*merged1, inner_most_pos1_in_ref1);

torch/csrc/jit/codegen/cuda/test/test_gpu.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25512,6 +25512,49 @@ TEST_F(NVFuserTest, FusionSizeDependentData_CUDA) {
2551225512
executor_cache.fusion(), cg_outputs, {a}, {a + 123}, __LINE__, __FILE__);
2551325513
}
2551425514

25515+
TEST_F(NVFuserTest, FusionPredicateUnshare_CUDA) {
25516+
// https://github.com/csarofeen/pytorch/issues/1926
25517+
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
25518+
auto fusion = fusion_ptr.get();
25519+
FusionGuard fg(fusion);
25520+
25521+
TensorView* tv0 = makeSymbolicTensor(2);
25522+
fusion->addInput(tv0);
25523+
auto tv1 = set(tv0);
25524+
auto tv2 = set(tv1);
25525+
fusion->addOutput(tv2);
25526+
25527+
tv1->setMemoryType(MemoryType::Shared);
25528+
for (auto tv : {tv1, tv2}) {
25529+
tv->split(0, 4);
25530+
tv->reorder({{1, -1}});
25531+
tv->split(1, 8);
25532+
tv->merge(0);
25533+
tv->split(0, 1);
25534+
tv->axis(0)->parallelize(ParallelType::BIDx);
25535+
tv->axis(1)->parallelize(ParallelType::Unswitch);
25536+
}
25537+
tv1->merge(2);
25538+
tv2->reorder({{2, 3}});
25539+
tv2->merge(2);
25540+
for (auto tv : {tv1, tv2}) {
25541+
tv->axis(-1)->parallelize(ParallelType::TIDx);
25542+
}
25543+
25544+
InlinePropagator propagator(tv2, -1, ComputeAtMode::MostInlined);
25545+
MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);
25546+
25547+
auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
25548+
at::Tensor t0 = at::randn({5, 5}, options);
25549+
25550+
FusionExecutor fe;
25551+
fe.compileFusion(fusion, {t0});
25552+
auto cg_outputs = fe.runFusion({t0});
25553+
auto out = cg_outputs[0];
25554+
25555+
testValidate(fusion, {out}, {t0}, {t0}, __LINE__, __FILE__);
25556+
}
25557+
2551525558
} // namespace jit
2551625559
} // namespace torch
2551725560
#endif // #if defined(USE_CUDA)

torch/csrc/jit/codegen/cuda/test/test_gpu_rng.cu

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,5 +264,40 @@ TEST_F(NVFuserTest, FusionBroadcastingRNGSmem_CUDA) {
264264
}
265265
}
266266
267+
TEST_F(NVFuserTest, FusionBroadcastingRNGSmemNonSquareTile_CUDA) {
268+
// https://github.com/csarofeen/pytorch/issues/1926
269+
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
270+
auto fusion = fusion_ptr.get();
271+
FusionGuard fg(fusion);
272+
273+
TensorView* tv0 = makeConcreteTensor({5, 1});
274+
TensorView* tv1 = makeConcreteTensor({5, 5});
275+
fusion->addInput(tv0);
276+
fusion->addInput(tv1);
277+
auto tv2 = randlike(tv0);
278+
auto tv3 = add(tv1, tv2);
279+
auto tv4 = add(tv0, tv3);
280+
fusion->addOutput(tv4);
281+
282+
auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
283+
at::Tensor t0 = at::zeros({5, 1}, options);
284+
at::Tensor t1 = at::zeros({5, 5}, options);
285+
286+
TransposeParams heuristics;
287+
heuristics.tile_size1 = 8;
288+
heuristics.tile_size2 = 4;
289+
scheduleTranspose(fusion, heuristics);
290+
291+
FusionExecutor fe;
292+
fe.compileFusion(fusion, {t0, t1});
293+
auto cg_outputs = fe.runFusion({t0, t1});
294+
auto out = cg_outputs[0];
295+
296+
TORCH_CHECK((out.select(1, 0) == out.select(1, 1)).all().item<bool>());
297+
TORCH_CHECK((out.select(1, 0) == out.select(1, 2)).all().item<bool>());
298+
TORCH_CHECK((out.select(1, 0) == out.select(1, 3)).all().item<bool>());
299+
TORCH_CHECK((out.select(1, 0) == out.select(1, 4)).all().item<bool>());
300+
}
301+
267302
} // namespace jit
268303
} // namespace torch

0 commit comments

Comments (0)