Make FrontendBasic test match auto schedule
Again I used the fusion_debug dump from #2326 to trace what the reduction scheduler is doing. This time I learned about multiReductionInliner, which makes two calls to parallelizeAllLike with different sets of ParallelTypes, followed by undoing unrolling and vectorization on the reference tensor. The need for that last step is still a little unclear to me; the pattern is sketched below.
1 parent f09055e commit e667b74
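
For orientation, here is the core of that pattern as it shows up in this test: a condensed sketch of the calls from the diff below, not multiReductionInliner itself. In this schedule, tv8 is the rFactor reference tensor, tv7 is the cache in front of the output, and tv4/tv5/tv6 are the output and cache tensors that should actually be unrolled or vectorized.

    // Pass 1: propagate every parallel type except unroll/vectorize from
    // the reference tensor to all tensors in the fusion.
    scheduler_utils::parallelizeAllLike(
        tv8,
        {},
        allParallelTypesExcept(
            {ParallelType::Unroll,
             ParallelType::Vectorize,
             ParallelType::MisalignedVectorize}));

    // Pass 2: propagate unroll/vectorize only to the selected tensors.
    scheduler_utils::parallelizeAllLike(
        tv8,
        {tv4, tv5, tv6},
        {ParallelType::Unroll,
         ParallelType::Vectorize,
         ParallelType::MisalignedVectorize});

    // Finally, undo unrolling on the reference and the cached output;
    // the axis positions (3 and 2) are specific to this schedule.
    tv8->axis(3)->parallelize(ParallelType::Serial);
    tv7->axis(2)->parallelize(ParallelType::Serial);

The two-pass structure appears to give every tensor the same grid/thread bindings while restricting unrolling and vectorization to the tensors that benefit from them; as noted above, why the reference tensor then has those types cleared again is not fully clear to me.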

1 file changed: +70 -29 lines

third_party/nvfuser/test/test_gpu_match_frontend.cpp

@@ -447,7 +447,32 @@ TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) {
 
   std::vector<IValue> inputs = {t0, t1};
 
-  // Define fusion
+  Fusion fauto;
+  { // Do automatic scheduling on fauto
+    FusionGuard fg(&fauto);
+
+    auto tv0 = makeSymbolicTensor(3);
+    auto tv1 = makeSymbolicTensor(3);
+    auto c0 = IrBuilder::create<Double>(3.0);
+
+    fauto.addInput(tv0);
+    fauto.addInput(tv1);
+
+    auto tv2 = add(tv0, tv1);
+    auto tv3 = mul(tv2, c0);
+    auto tv4 = sum(tv3, {-1}, false, DataType::Float);
+
+    fauto.addOutput(tv4);
+
+    // Run automatic scheduler
+    auto reduction_params = getReductionHeuristics(&fauto, inputs);
+    TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
+    scheduleReduction(&fauto, *reduction_params);
+  }
+
+  // Re-define the fusion exactly for manual scheduling
+  // This is necessary in order to catch all the constructors inside each
+  // Fusion independently.
   Fusion fusion;
   FusionGuard fg(&fusion);
 
@@ -464,39 +489,55 @@ TEST_F(NVFuserTest, FusionFrontendBasic_CUDA) {
 
   fusion.addOutput(tv4);
 
-  // Run automatic scheduler
-  auto fauto = Fusion(fusion); // unique_ptr to copy of fusion
-  auto reduction_params = getReductionHeuristics(&fauto, inputs);
-  TORCH_CHECK(reduction_params, "Reduction schedule was not generated!");
-  scheduleReduction(&fauto, *reduction_params);
-
   // Perform manual scheduling
-  tv4->merge(0, 1); // {i0*i1, i2}
-  tv4->split(
-      1,
-      NamedScalar::getParallelDim(
-          ParallelType::TIDx)); // {i0*i1, r2 / bDx, bDx}
-  tv4->split(-2, 1);
-  tv4->reorder({{-2, -1}, {-1, -2}});
-  tv4->split(0, 2);
-  tv4->reorder({{1, 2}, {2, 1}});
-  tv4->split(0, 1);
-  tv4->reorder({{1, 2}, {2, 1}});
-  tv4->axis(0)->parallelize(ParallelType::BIDx);
-  tv4->axis(2)->parallelize(ParallelType::Unswitch);
-  tv4->axis(3)->parallelize(ParallelType::Unroll);
-  tv4->axis(4)->parallelize(ParallelType::TIDx);
-  tv4->axis(5)->parallelize(ParallelType::Unswitch);
 
-  auto tv5 = tv0->cacheAfter();
-  auto tv6 = tv1->cacheAfter();
-  auto tv7 = tv4->cacheBefore();
+  auto tv5 = tv0->cacheAfter(); // tv5
+  auto tv6 = tv1->cacheAfter(); // tv6
+  auto tv7 = tv4->cacheBefore(); // tv7
+
+  tv7->reorder({{2, 0}});
+  tv7->merge(1, 2);
+  tv7->reorder({{1, 0}});
+  tv7->split(1, NamedScalar::getParallelDim(ParallelType::TIDx));
+  tv7->axis(2)->parallelize(ParallelType::TIDx);
+  tv7->split(1, 1);
+  tv7->axis(2)->parallelize(ParallelType::Unswitch);
+  tv7->split(0, 2);
+  tv7->axis(1)->parallelize(ParallelType::Unroll);
+  tv7->split(0, 1);
+  tv7->axis(1)->parallelize(ParallelType::Unswitch);
+  tv7->axis(0)->parallelize(ParallelType::BIDx);
+
+  tv7->reorder({{0, 0}, {1, 2}, {2, 3}, {3, 1}, {4, 5}, {5, 4}});
+
   auto tv8 = tv7->rFactor({1, 5});
 
+  // NOTE: see multiReductionInliner for more info on how propagation and
+  // inlining works in the reduction scheduler
+
   // propagate the mapping to other tensors
-  TransformPropagatorWithCheck propagator(tv7);
-  MaxRootDomainInfoSpanningTree(tv7).traverse(&propagator);
-  scheduler_utils::parallelizeAllLike(tv7, {tv2, tv3, tv4, tv5, tv6, tv8});
+  TransformPropagatorWithCheck propagator(tv8);
+  MaxRootDomainInfoSpanningTree(tv8).traverse(&propagator);
+  // Propagate parallelization except vectorization and unrolling
+  scheduler_utils::parallelizeAllLike(
+      tv8,
+      {},
+      allParallelTypesExcept(
+          {ParallelType::Unroll,
+           ParallelType::Vectorize,
+           ParallelType::MisalignedVectorize}));
+  // Propagate vectorization/unrolling to those tensors that need it
+  scheduler_utils::parallelizeAllLike(
+      tv8,
+      {tv4, tv6, tv5},
+      {
+          ParallelType::Unroll,
+          ParallelType::Vectorize,
+          ParallelType::MisalignedVectorize,
+      });
+  // If reference shouldn't be unrolled, clear that parallel type.
+  tv8->axis(3)->parallelize(ParallelType::Serial);
+  tv7->axis(2)->parallelize(ParallelType::Serial);
 
   inlineMost();
 