Minor update on cp.async code generation. #1901

Merged
54 commits merged on Oct 6, 2022
Changes from all commits
Commits (54)
6f5ba21
use custom propagator in ampere TN
shmsong Jul 12, 2022
2329caf
add tile ordering utilities
shmsong Jul 12, 2022
121af43
initial matmul scheduler implementation
shmsong Jul 13, 2022
f958c53
use matmul scheduler prototype on ampere and turing test cases
shmsong Jul 13, 2022
397f74c
extend to support Volta
shmsong Jul 13, 2022
00d9a57
minor cleanup
shmsong Jul 13, 2022
d7035aa
comment cleanup
shmsong Jul 13, 2022
9ffc61d
minor fix
shmsong Jul 13, 2022
ed0f525
add fragment iteration and use it in matmul scheduler
shmsong Jun 5, 2022
c972116
use scheduler params for tests
shmsong Jul 14, 2022
d12a90f
fragment support in double buffer
shmsong Jul 14, 2022
c306b9b
add register double buffering test cases
shmsong Jul 14, 2022
63f561f
clean up custom transform propagator
shmsong Jul 19, 2022
3d47c1f
Merge remote-tracking branch 'origin/devel' into matmul_propagator
shmsong Jul 19, 2022
29f88c7
rebase fix
shmsong Jul 19, 2022
d029b9f
comment
shmsong Jul 19, 2022
5ac053f
move bounded selector to common area
shmsong Jul 20, 2022
b51d247
Add logic to handle fake boundary tensors in selection.
shmsong Jul 20, 2022
aba5087
naming and comment
shmsong Jul 20, 2022
426c381
remove unused parameters from mma node
shmsong Jul 20, 2022
6d4f377
remove unnecessary parameters from mma ir node
shmsong Jul 20, 2022
5e1f41f
rename scheduling variables
shmsong Jul 20, 2022
1960da9
change accumulator tv interface
shmsong Jul 20, 2022
3a411c2
Update torch/csrc/jit/codegen/cuda/scheduler/utils.h
shmsong Jul 20, 2022
8f2e4da
PR feedback
shmsong Jul 20, 2022
eef3a97
Merge branch 'matmul_propagator' of https://github.com/csarofeen/pyto…
shmsong Jul 20, 2022
6ad2967
pipe through parallel type position
shmsong Jul 20, 2022
65c8f0a
Merge remote-tracking branch 'origin/devel' into matmul_propagator
shmsong Jul 20, 2022
cd03b00
Revert "fragment support in double buffer"
shmsong Jul 20, 2022
380dd66
Merge branch 'matmul_propagator' into fragment_iter
shmsong Jul 20, 2022
6ce6ff6
use cache op to handle double buffer input
shmsong Jul 20, 2022
62f09fc
add more comment in matmul scheduler
shmsong Jul 21, 2022
538aa8b
more comments
shmsong Jul 21, 2022
91f44fd
comment fix
shmsong Jul 21, 2022
75d51a5
Merge remote-tracking branch 'origin/devel' into fragment_iter
shmsong Jul 25, 2022
546844a
rebase fix
shmsong Jul 25, 2022
ca55194
add inline pred for cpasync
shmsong Jul 12, 2022
2b6f447
Merge remote-tracking branch 'origin/devel' into speculative_index
shmsong Aug 1, 2022
41c221a
minor cleanup
shmsong Aug 1, 2022
214f2a2
add inlining test in unit
shmsong Aug 1, 2022
99e4d4c
add option to dump ptx
shmsong Aug 1, 2022
da45d51
Merge remote-tracking branch 'origin/devel' into speculative_index
shmsong Aug 10, 2022
c4a8739
rebase fix
shmsong Aug 10, 2022
7f42537
Fix missing thread predicates
naoyam Sep 27, 2022
93124a3
Merge branch 'devel' of github.com:csarofeen/pytorch into speculative…
zasdfgbnm Sep 28, 2022
ebeb201
fix merge
zasdfgbnm Sep 28, 2022
cde6e4d
fix merge
zasdfgbnm Sep 28, 2022
022c443
format
zasdfgbnm Sep 28, 2022
c90b90f
Merge branch 'devel' of github.com:csarofeen/pytorch into speculative…
zasdfgbnm Sep 29, 2022
3417e8e
Merge branch 'speculative_index' of github.com:csarofeen/pytorch into…
zasdfgbnm Sep 29, 2022
52099e0
cleanup
zasdfgbnm Oct 3, 2022
7d6e28d
Merge branch 'devel' of github.com:csarofeen/pytorch into speculative…
zasdfgbnm Oct 6, 2022
1f8ecba
cleanup clone
zasdfgbnm Oct 6, 2022
0742e7e
fix
zasdfgbnm Oct 6, 2022
15 changes: 12 additions & 3 deletions torch/csrc/jit/codegen/cuda/codegen.cpp
@@ -543,9 +543,18 @@ class CudaKernelGenerator : private OptOutConstDispatch {
void genCpAsync(const LoadStoreOp* ldst, int vec_size) {
auto dtype = ldst->in()->getDataType().value();

indent() << "Ampere::cpAsync("
<< genVectorPointer(ldst->out(), dtype, vec_size) << ","
<< genVectorPointer(ldst->in(), dtype, vec_size) << ");\n";
if (ldst->predicate() == nullptr) {
// Out of line predicate variant
indent() << "Ampere::cpAsync("
<< genVectorPointer(ldst->out(), dtype, vec_size) << ","
<< genVectorPointer(ldst->in(), dtype, vec_size) << ");\n";
} else {
// Inline predicate variant
indent() << "Ampere::cpAsync("
<< genVectorPointer(ldst->out(), dtype, vec_size) << ","
<< genVectorPointer(ldst->in(), dtype, vec_size) << ","
<< genInline(ldst->predicate()) << ");\n";
}
}

void genLdMatrix(const LoadStoreOp* ldst, int vector_word_size) {
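Note: to make the two branches above concrete, the following is a minimal sketch of the code they emit, assuming the nvfuser runtime helpers from runtime/memory.cu are in scope. The function and variable names are hypothetical, not copied from a real generated kernel.

// Sketch of the two emitted call forms (illustrative names only).
__device__ void cp_async_codegen_sketch(
    Array<float, 4, 4>* T1_smem, // shared-memory destination tile
    const float* T0_gmem, // global-memory source
    bool pred) { // lowered thread/bounds predicate
  // (a) Out-of-line predicate: the copy is wrapped in an if-then-else.
  if (pred) {
    Ampere::cpAsync(T1_smem, T0_gmem);
  }
  // (b) Inline predicate: the guard is passed as a third argument and is
  // lowered to a predicated "@p cp.async" instruction, so no branch is
  // emitted around the copy.
  Ampere::cpAsync(T1_smem, T0_gmem, pred);
}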
6 changes: 6 additions & 0 deletions torch/csrc/jit/codegen/cuda/executor_utils.cpp
@@ -1019,6 +1019,12 @@ std::pair<NvrtcFunction, std::string> nvrtcCompile(
// compile to sass is not allowed prior to CUDA 11.1
compile_to_sass = false;
#endif

if (isOptionDisabled(DisableOption::CompileToSass)) {
// Allows manually disabling compilation to sass
// so the intermediate ptx could be checked.
compile_to_sass = false;
}
// CUDA 11.1 allows going directly to SASS (sm_) instead of PTX (compute_)
// which gives better backwards compatibility to work on older driver,
// (since older driver doesn't necessarily recognize PTX emitted by new
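Usage note: assuming this fork's usual disable-option mechanism, adding compile_to_sass to the PYTORCH_NVFUSER_DISABLE environment variable enables this path (the token is parsed in the utils.cpp change further down), so NVRTC stops at PTX and the intermediate code can be inspected.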
4 changes: 3 additions & 1 deletion torch/csrc/jit/codegen/cuda/lower_index.cpp
@@ -982,7 +982,9 @@ void IndexLowering::handleGroupedGridWelford(
void IndexLowering::handle(const LoadStoreOp* ldst) {
const auto in = lowerSrcIndex(ldst->in(), ldst->out());
const auto out = lowerDstIndex(ldst->out());
pushBack(IrBuilder::create<LoadStoreOp>(ldst->opType(), out, in));
auto new_ldst = IrBuilder::create<LoadStoreOp>(ldst->opType(), out, in)
->withPredicate(ldst->predicate());
pushBack(new_ldst);
GpuLower::current()->propagateExprInfo(ldst, back());
}

51 changes: 31 additions & 20 deletions torch/csrc/jit/codegen/cuda/lower_predicate.cpp
@@ -43,29 +43,40 @@ class ConditionalFromPredicateModifier : public kir::ExprMutator {
// Replace expr predicate with bool conditional
auto conditional = generateConditional(expr->predicate());
if (expr->predicate()->predicate_type() == PredicateType::Vectorize) {
// TODO: This logic doesn't seem to fit well here, for unswitch the
// logic is in the unroll loop to set the thread predicate to the expr.
// I didn't have a quick way to do that so placing this here for now.
TORCH_INTERNAL_ASSERT(
expr->isA<kir::IfThenElse>(),
"Predicate handling expects ITE statement.");
auto ite = expr->as<kir::IfThenElse>();

TORCH_INTERNAL_ASSERT(
ite->thenBody().size() == 1,
"Expecting predicated body to only have one vectorized expression.");
auto vec_expr = ite->thenBody()[0];
TORCH_INTERNAL_ASSERT(
vec_expr->isA<UnaryOp>() || vec_expr->isA<LoadStoreOp>(),
"Vectorize predicate exprs only supported on set operations.");
TORCH_INTERNAL_ASSERT(
ir_utils::isTvOp(vec_expr),
"Vectorize predicate exprs only supported on tensor view operations.");
if (!vec_expr->inputs()[0]->isConstScalar()) {
if (expr->isA<kir::IfThenElse>()) {
// TODO: This logic doesn't seem to fit well here, for unswitch the
// logic is in the unroll loop to set the thread predicate to the
// expr. I didn't have a quick way to do that so placing this here for
// now.
auto ite = expr->as<kir::IfThenElse>();

TORCH_INTERNAL_ASSERT(
ite->thenBody().size() == 1,
"Expecting predicated body to only have one vectorized expression.");
auto vec_expr = ite->thenBody()[0];
TORCH_INTERNAL_ASSERT(
vec_expr->isA<UnaryOp>() || vec_expr->isA<LoadStoreOp>(),
"Vectorize predicate exprs only supported on set operations.");
TORCH_INTERNAL_ASSERT(
ir_utils::isTvOp(vec_expr),
"Vectorize predicate exprs only supported on tensor view operations.");
if (!vec_expr->inputs()[0]->isConstScalar()) {
conditional = SimplifyingIrBuilder::andExpr(
conditional,
GpuLower::current()->threadPredMap().getPredicate(
ir_utils::getTvOutput(vec_expr)))
->as<Bool>();
}
} else {
TORCH_INTERNAL_ASSERT(lower_utils::supportInlinePredicate(expr));
auto thread_pred = GpuLower::current()->threadPredMap().getPredicate(
ir_utils::getTvOutput(expr));
TORCH_INTERNAL_ASSERT(
thread_pred->isConst() && thread_pred->value().value());
conditional = SimplifyingIrBuilder::andExpr(
conditional,
GpuLower::current()->threadPredMap().getPredicate(
ir_utils::getTvOutput(vec_expr)))
ir_utils::getTvOutput(expr)))
->as<Bool>();
}
}
6 changes: 6 additions & 0 deletions torch/csrc/jit/codegen/cuda/lower_unroll.cpp
@@ -138,6 +138,12 @@ void UnrollPass::handle(Expr* expr) {
PredicateType::Inline, expr, thread_pred);
}

if (lower_utils::supportInlinePredicate(expr)) {
expr_with_predicate = expr_with_predicate->withPredicate(pred);
registerReplace(expr, expr_with_predicate, &for_loops_.back()->body());
return;
}

// If we need a predicate, put expr inside an if then else
kir::IfThenElse* inline_ite = IrBuilder::create<kir::IfThenElse>(pred);
if (for_loops_.empty()) {
Expand Down
11 changes: 11 additions & 0 deletions torch/csrc/jit/codegen/cuda/lower_utils.cpp
@@ -727,6 +727,17 @@ BasicAllocInfo getAllocInformation(
return info;
}

//! Implementing this in here to avoid including too many headers
//! in type.cpp. Conceptually this should be a generic definition
//! rather than a util.
bool supportInlinePredicate(Expr* expr) {
if (ir_utils::isCpAsyncOp(expr)) {
return true;
}
// TODO: build out support.
return false;
}

} // namespace lower_utils

} // namespace cuda
5 changes: 5 additions & 0 deletions torch/csrc/jit/codegen/cuda/lower_utils.h
@@ -263,6 +263,11 @@ BasicAllocInfo getAllocInformation(
const std::vector<kir::ForLoop*>& loops,
const std::unordered_map<IterDomain*, IterDomain*>& id_map = {},
bool use_id_map = false);

//! Returns true if the expression has a variant that takes a predicate
//! as an inline argument.
bool supportInlinePredicate(Expr* expr);

} // namespace lower_utils

} // namespace cuda
25 changes: 25 additions & 0 deletions torch/csrc/jit/codegen/cuda/runtime/memory.cu
@@ -152,6 +152,31 @@ DEVICE_INLINE void cpAsync(
"n"(byte_size));
}

// Global to SMEM load that is asynchronous,
// not guaranteed to be completed until cpAsyncBarrier() is called.
template <typename dtype, int len>
DEVICE_INLINE void cpAsync(
Array<dtype, len, len>* smem_ptr,
void const* gmem_ptr,
bool predicate) {
unsigned smem_addr = util::toSmem(&(smem_ptr->array[0]));
constexpr int byte_size = sizeof(dtype) * len;

static_assert(
byte_size == 4 || byte_size == 8 || byte_size == 16,
"cp_async : unsupported byte size");

asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %3, 0;\n"
"@p cp.async.ca.shared.global [%0], [%1], %2;\n"
"}\n" ::"r"(smem_addr),
"l"(gmem_ptr),
"n"(byte_size),
"r"((int)predicate));
}

// TODO: Might have a different category of sync if we want to build out this:
DEVICE_INLINE void cpAsyncBarrier() {
asm volatile("cp.async.wait_all;");
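A minimal usage sketch of the predicated overload added above, assuming both helpers live in the Ampere namespace that the code generator targets; the buffer names and the bounds check are hypothetical.

// Hypothetical device-side usage: each thread issues a guarded 16-byte
// cp.async copy, then waits for all outstanding async copies.
__device__ void guarded_tile_load(
    Array<float, 4, 4>* smem_tile, // shared-memory destination, one Array per thread
    const float* gmem_src, // global-memory source
    int valid_elements) {
  // Threads whose lane would read past the end get predicate = false and
  // skip the copy through the predicated cp.async form instead of branching.
  bool pred = static_cast<int>(threadIdx.x) * 4 < valid_elements;
  Ampere::cpAsync(smem_tile + threadIdx.x, gmem_src + threadIdx.x * 4, pred);
  Ampere::cpAsyncBarrier(); // cp.async.wait_all: block until all copies complete
}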
36 changes: 0 additions & 36 deletions torch/csrc/jit/codegen/cuda/test/test_gpu1.cpp
@@ -1434,42 +1434,6 @@ TEST_F(NVFuserTest, FusionSimplePWise_CUDA) {
TORCH_CHECK(output_ref.equal(output));
}

TEST_F(NVFuserTest, FusionSimpleAmperePipeline_CUDA) {
(Review comment, collaborator: "moved to the new file to keep this file below 10k")

Fusion fusion;
FusionGuard fg(&fusion);

// requires ampere+ GPU
if (!deviceMajorMinorCheck(8)) {
GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
return;
}

auto tv0 = makeContigTensor(1);

fusion.addInput(tv0);

auto tv1 = set(tv0);

fusion.addOutput(tv1);

auto tv_cache = tv0->cacheAfter(LoadStoreOpType::CpAsync);
tv_cache->setMemoryType(MemoryType::Shared);

tv1->split(0, 16);
tv0->computeAt(tv1, 1);

tv_cache->circularBuffer(10);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor input1 = at::randn({255}, options);

FusionExecutor fe;
fe.compileFusion(&fusion, {input1});
auto cg_outputs = fe.runFusion({input1});

testValidate(&fusion, cg_outputs, {input1}, {input1}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionSimplePWiseDtypeComplex_CUDA) {
Fusion fusion;
FusionGuard fg(&fusion);
69 changes: 68 additions & 1 deletion torch/csrc/jit/codegen/cuda/test/test_gpu3.cpp
@@ -4073,7 +4073,7 @@ TEST_F(NVFuserTest, FusionLoopSwizzleCheck1_CUDA) {
// Swizzle inner tile of tv2
tv2->swizzle(Swizzle2DType::ZShape, -2, -1, SwizzleMode::Loop);

// Make tv2 swizzled and half-inlined (unsupported).
// Make tv2 swizzled and partially-inlined (unsupported).
tv0->computeAt(tv3, -2);

FusionExecutor fe;
@@ -6440,6 +6440,73 @@ TEST_F(NVFuserTest, FusionVectorizeStrideContiguitySelfOverlapping_CUDA) {
}
}

TEST_F(NVFuserTest, FusionSimpleAmperePipeline_CUDA) {
Fusion fusion;
FusionGuard fg(&fusion);

// requires ampere+ GPU
if (!deviceMajorMinorCheck(8)) {
GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
return;
}

auto tv0 = makeContigTensor(1);

fusion.addInput(tv0);

auto tv1 = set(tv0);

fusion.addOutput(tv1);

auto tv_cache = tv0->cacheAfter(LoadStoreOpType::CpAsync);
tv_cache->setMemoryType(MemoryType::Shared);

tv1->split(0, 16);
tv0->computeAt(tv1, 1);

tv_cache->circularBuffer(10);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor input1 = at::randn({255}, options);

// Add check that the cp async op has an inlined predicate.
class InlinedCpAsyncPredChecker : public kir::IrVisitor {
public:
using kir::IrVisitor::handle;

private:
void handle(kir::IfThenElse* ite) final {
auto prev_within_ite = within_ite_;
within_ite_ = true;
kir::IrVisitor::handle(ite);
within_ite_ = prev_within_ite;
}

void handle(LoadStoreOp* ldst) final {
if (ldst->opType() == LoadStoreOpType::CpAsync) {
TORCH_INTERNAL_ASSERT(!within_ite_, "CPASYNC predicate not inlined");
TORCH_INTERNAL_ASSERT(
ldst->predicate()->hasValue() &&
!ldst->predicate()->value()->isConst(),
"CPASYNC predicate is not generated");
}
}

private:
bool within_ite_ = false;
} pred_checker;

// Check that cp async is inlined:
GpuLower gpulw(&fusion);
pred_checker.handle(gpulw.kernel()->topLevelExprs());

FusionExecutor fe;
fe.compileFusion(&fusion, {input1});
auto cg_outputs = fe.runFusion({input1});

testValidate(&fusion, cg_outputs, {input1}, {input1}, __LINE__, __FILE__);
}

// Test file size should be up to 10K LoC. Create a new file for more tests.

} // namespace jit
3 changes: 3 additions & 0 deletions torch/csrc/jit/codegen/cuda/utils.cpp
@@ -132,6 +132,7 @@ auto parseDebugDumpOptions() {
auto parseDisableOptions() {
std::unordered_map<DisableOption, bool> options_map = {
{DisableOption::ArchCheck, false},
{DisableOption::CompileToSass, false},
{DisableOption::Fallback, false},
{DisableOption::Fma, false},
{DisableOption::IndexHoist, false},
@@ -145,6 +146,8 @@ auto parseDisableOptions() {
const auto token = options_view.substr(0, end_pos);
if (token == "arch_check") {
options_map[DisableOption::ArchCheck] = true;
} else if (token == "compile_to_sass") {
options_map[DisableOption::CompileToSass] = true;
} else if (token == "fallback") {
options_map[DisableOption::Fallback] = true;
} else if (token == "fma") {
2 changes: 2 additions & 0 deletions torch/csrc/jit/codegen/cuda/utils.h
@@ -70,6 +70,8 @@ TORCH_CUDA_CU_API bool isDebugDumpEnabled(DebugDumpOption option);
//!
enum class DisableOption {
ArchCheck, //! Disable hardware-specific checks to enable cross arch debug
CompileToSass, //! Disable direct compilation to sass so the ptx can be
//! examined
Fallback, //! Disable fallback
Fma, //! Disable FMA instructions
IndexHoist, //! Disable index hoisting