
Commit cd899f6

jjsjann123 and naoyam authored
persistent_use_of_buffer is accumulated over all the resolution points. (#4)
Cherry-picking from: csarofeen/pytorch#2576

Author: Naoya Maruyama [email protected]
Date: Mon Mar 13 17:50:01 2023 -0700

persistent_use_of_buffer is accumulated over all the resolution points. (#2576)

Recomputation for each persistent use should be done after the accumulation is done. Currently, recomputation and replaceVal can be done redundantly. For example, on A100, that happens with NvFuserScheduler_BatchNorm_fp32/64/32/256.

Co-authored-by: Naoya Maruyama <[email protected]>
1 parent 48b0cb4 commit cd899f6
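
The change only reorders control flow. Below is a minimal before/after sketch of that restructuring, using identifiers that appear in the diff; `resolution_points`, `isUseOfBufferForResolution`, and the omitted outer loop over persistent buffers are simplified placeholders, not the actual scheduler code.

// Sketch only: resolution_points and isUseOfBufferForResolution are
// hypothetical stand-ins for the real bookkeeping in the scheduler.

// Before: the recompute/replace loop ran inside the resolution-point loop,
// so RecomputeTv::recompute() and ir_utils::replaceValInExpr() could be
// applied redundantly to uses accumulated at earlier resolution points.
for (auto resolution_point : resolution_points) {
  for (auto use : buffer->uses()) {
    if (!isUseOfBufferForResolution(use, resolution_point)) {
      continue;
    }
    persistent_use_of_buffer.emplace_back(use);
  }

  for (auto use : persistent_use_of_buffer) { // nested one level too deep
    auto buffer_replicate = RecomputeTv::recompute(buffer);
    dummy_outputs.emplace_back(add(buffer_replicate, buffer));
    ir_utils::replaceValInExpr(use->definition(), buffer, buffer_replicate);
  }
}

// After: accumulate persistent_use_of_buffer over all resolution points
// first, then recompute and replace exactly once per accumulated use.
for (auto resolution_point : resolution_points) {
  for (auto use : buffer->uses()) {
    if (!isUseOfBufferForResolution(use, resolution_point)) {
      continue;
    }
    persistent_use_of_buffer.emplace_back(use);
  }
}

for (auto use : persistent_use_of_buffer) {
  auto buffer_replicate = RecomputeTv::recompute(buffer);
  dummy_outputs.emplace_back(add(buffer_replicate, buffer));
  ir_utils::replaceValInExpr(use->definition(), buffer, buffer_replicate);
}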

File tree: 1 file changed (+35, -35 lines)


csrc/scheduler/reduction_utils.cpp

Lines changed: 35 additions & 35 deletions
@@ -728,42 +728,42 @@ std::vector<TensorView*> projectPersistentBuffers(Fusion* fusion) {
         }
         persistent_use_of_buffer.emplace_back(use);
       }
+    }

-      // For all uses that do not go towards the reduction operations in the
-      // persistent section of the graph, recompute the persistent buffer.
-      for (auto use : persistent_use_of_buffer) {
-        TORCH_INTERNAL_ASSERT(use->definition() != nullptr);
-        auto buffer_replicate = RecomputeTv::recompute(buffer);
-        // Create a shortcut buffer <--> buffer_replicate for propagation.
-        // Why is this needed?
-        // Consider that we have a fusion
-        //
-        // T0[I]
-        // T1[b b I] = broadcast(T0)
-        // T2[b b r] = reduction(T1)
-        // T3[b b b] = broadcast(T2)
-        // T4[b, b, I] = T1 + T3
-        // T5[b, b, r] = reduction(T4)
-        //
-        // After projection, it becomes
-        //
-        // T0[I]
-        // T1[b b I] = broadcast(T0)
-        // T2[b b r] = reduction(T1)
-        // T3[b b b] = broadcast(T2)
-        // T6[b b I] = broadcast(T0)
-        // T4[b, b, I] = T6 + T3
-        // T5[b, b, r] = reduction(T4)
-        //
-        // During schedule, we need to propagate from T2 to T5. However, in the
-        // resulting DAG, neither the propagation path T2->T3->T4->T5 nor
-        // T2->T1->T0->T6->T4->T5 works because they both have missing root
-        // domain. But adding `T7 = T1 + T6` creates a new propagation path
-        // `T2->T1->T7->T6->T4->T5` which has all root domain information.
-        // See FusionBroadcastPersistentReduction_CUDA for an example
-        dummy_outputs.emplace_back(add(buffer_replicate, buffer));
-        ir_utils::replaceValInExpr(use->definition(), buffer, buffer_replicate);
-      }
+    // For all uses that do not go towards the reduction operations in the
+    // persistent section of the graph, recompute the persistent buffer.
+    for (auto use : persistent_use_of_buffer) {
+      TORCH_INTERNAL_ASSERT(use->definition() != nullptr);
+      auto buffer_replicate = RecomputeTv::recompute(buffer);
+      // Create a shortcut buffer <--> buffer_replicate for propagation.
+      // Why is this needed?
+      // Consider that we have a fusion
+      //
+      // T0[I]
+      // T1[b b I] = broadcast(T0)
+      // T2[b b r] = reduction(T1)
+      // T3[b b b] = broadcast(T2)
+      // T4[b, b, I] = T1 + T3
+      // T5[b, b, r] = reduction(T4)
+      //
+      // After projection, it becomes
+      //
+      // T0[I]
+      // T1[b b I] = broadcast(T0)
+      // T2[b b r] = reduction(T1)
+      // T3[b b b] = broadcast(T2)
+      // T6[b b I] = broadcast(T0)
+      // T4[b, b, I] = T6 + T3
+      // T5[b, b, r] = reduction(T4)
+      //
+      // During schedule, we need to propagate from T2 to T5. However, in the
+      // resulting DAG, neither the propagation path T2->T3->T4->T5 nor
+      // T2->T1->T0->T6->T4->T5 works because they both have missing root
+      // domain. But adding `T7 = T1 + T6` creates a new propagation path
+      // `T2->T1->T7->T6->T4->T5` which has all root domain information.
+      // See FusionBroadcastPersistentReduction_CUDA for an example
+      dummy_outputs.emplace_back(add(buffer_replicate, buffer));
+      ir_utils::replaceValInExpr(use->definition(), buffer, buffer_replicate);
+    }
     }
   }
   return dummy_outputs;
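
For context on the comment block in the diff above, here is a hedged sketch of the fusion it describes (T0 through T5), written against the nvFuser C++ arithmetic API. The header paths and the builder usage are assumptions, not an excerpt of FusionBroadcastPersistentReduction_CUDA.

// Sketch, not test code: header paths assume the standalone nvFuser
// layout with csrc/ on the include path.
#include <fusion.h>
#include <ops/all_ops.h>

using namespace nvfuser;

void defineBroadcastPersistentReduction(Fusion* fusion) {
  FusionGuard fg(fusion);

  // T0[I]
  TensorView* tv0 = TensorViewBuilder().ndims(1).build();
  fusion->addInput(tv0);

  // T1[b b I] = broadcast(T0): T1 is the persistent buffer.
  auto tv1 = broadcast(tv0, {true, true, false});
  // T2[b b r] = reduction(T1)
  auto tv2 = sum(tv1, {2});
  // T3[b b b] = broadcast(T2)
  auto tv3 = broadcast(tv2, {false, false, true});
  // T4[b, b, I] = T1 + T3: this use of T1 is what projection replaces
  // with a recomputed copy (T6 = broadcast(T0) in the comment above).
  auto tv4 = add(tv1, tv3);
  // T5[b, b, r] = reduction(T4)
  auto tv5 = sum(tv4, {2});
  fusion->addOutput(tv5);
}

After projectPersistentBuffers runs on such a fusion, the dummy output produced by add(buffer_replicate, buffer) corresponds to the `T7 = T1 + T6` shortcut in the comment, which restores a propagation path from T2 to T5 with full root domain information.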
