
Commit 9bb4cf7

fragment iteration to support fully unrolled mma ops (#1823)
1 parent a48270a commit 9bb4cf7

File tree

6 files changed: +645 -48 lines

torch/csrc/jit/codegen/cuda/codegen.cpp

Lines changed: 18 additions & 6 deletions
@@ -474,7 +474,11 @@ class CudaKernelGenerator : private OptOutConstDispatch {
     }
   }
 
-  void handle(const kir::TensorIndex* ti) final {
+  //! Returns the sum of all indices in a TensorIndex,
+  //!  or 0 if the indices vector is empty.
+  //! Used when lowering generic tensor indices and
+  //!  mma fragment indices.
+  std::string genTensorIndex(const kir::TensorIndex* ti) {
     bool first = true;
     std::stringstream index;
     for (auto* ind : ti->indices()) {
@@ -490,12 +494,17 @@ class CudaKernelGenerator : private OptOutConstDispatch {
     if (first) {
       index << "0";
     }
+
+    return index.str();
+  }
+
+  void handle(const kir::TensorIndex* ti) final {
     bool is_volatile = ti->view()->getMemoryType() == MemoryType::Global &&
         kernel_->summary().sync_map.needsRawSync(ti->view()).hasBID();
     if (is_volatile) {
       code_ << "*(volatile " << ti->getDataType().value() << "*)&";
     }
-    code_ << varName(ti->view()) << "[" << index.str() << "]";
+    code_ << varName(ti->view()) << "[" << genTensorIndex(ti) << "]";
   }
 
   void handle(const ViewAsScalar* sv) final {
@@ -1013,14 +1022,17 @@ class CudaKernelGenerator : private OptOutConstDispatch {
     auto options = mma->options();
     auto in_a = mma->inA()->as<kir::TensorIndex>()->view();
     auto dtype = in_a->getDataType().value();
-    indent() << kTab << "reinterpret_cast<Array<" << dtype << ","
+    indent() << kTab << "&(reinterpret_cast<Array<" << dtype << ","
              << getInputARegisterSize(options.macro) << ","
             << getInputARegisterSize(options.macro) << ">*>(&"
-             << gen(mma->inA()) << "),\n";
-    indent() << kTab << "reinterpret_cast<Array<" << dtype << ","
+             << varName(mma->inA()->as<kir::TensorIndex>()->view()) << ")["
+             << genTensorIndex(mma->inA()->as<kir::TensorIndex>()) << "])"
+             << ",\n";
+    indent() << kTab << "&(reinterpret_cast<Array<" << dtype << ","
              << getInputBRegisterSize(options.macro) << ","
              << getInputBRegisterSize(options.macro) << ">*>(&"
-             << gen(mma->inB()) << ")";
+             << varName(mma->inB()->as<kir::TensorIndex>()->view()) << ")["
+             << genTensorIndex(mma->inB()->as<kir::TensorIndex>()) << "])";
   }
 
   void genMmaInitialization(const MmaOp* mma, const UnaryOp* uop) {
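For reference, each operand argument emitted by the new path has roughly this shape (an illustrative fragment of generated code; the tensor name T7, the __half data type, the register size 4, and the loop index ki are hypothetical placeholders):

  &(reinterpret_cast<Array<__half, 4, 4>*>(&T7)[ki])

The previous path wrapped the element-wise index from gen(mma->inA()) inside the cast; the new form casts the register buffer itself and indexes it in whole-fragment units, which is what lets a loop over fragments select the operand for each mma issue.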

torch/csrc/jit/codegen/cuda/index_compute.cpp

Lines changed: 10 additions & 1 deletion
@@ -1125,7 +1125,16 @@ indexMapFromTV(
         // Similarly for local memory tensors, zero replacement can be
         //  only done when there's a matching domain with the same
         //  parallel type
-        (loop->iter_domain()->isThread() && is_local && same_parallel_type)) {
+        (loop->iter_domain()->isThread() && is_local && same_parallel_type) ||
+        // MMA operands are currently indexed in units of "fragments",
+        //  so each mma tensor domain would be zero-ed and the tensor index
+        //  calculated here would be the fragment index.
+        // TODO: This is a quick WAR to enable iterating over a register array
+        //  of MMA fragments, so we could generate unrolled mma loops.
+        //  Eventually we still want IdGraph to be able to analyze the
+        //  in-register layout of mma fragments for more unified indexing math
+        //  as well as more flexibility in swizzling loops.
+        (loop->iter_domain()->isMma() && !as_consumer)) {
       idx = GpuLower::current()->kernel()->zeroVal();
       zero_loops.insert(loop);
     } else {
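Together with the codegen change above, this is what lets the main loop be emitted as a fully unrolled iteration over a register array of fragments. A purely illustrative sketch of the generated-code shape this enables (not actual emitted code; tensor names, the fragment count, and the mma wrapper name are hypothetical):

  // hypothetical unrolled mma main loop, ki being the fragment index
  #pragma unroll
  for (int ki = 0; ki < 4; ++ki) {
    hypothetical_mma(
        reinterpret_cast<Array<float, 4, 4>*>(&T9),
        &(reinterpret_cast<Array<__half, 4, 4>*>(&T7)[ki]),
        &(reinterpret_cast<Array<__half, 4, 4>*>(&T8)[ki]));
  }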

torch/csrc/jit/codegen/cuda/lower_validation.cpp

Lines changed: 24 additions & 4 deletions
@@ -899,22 +899,42 @@ void validateMmaTensors(MmaOp* mma) {
   }
 
   // Note: this check will be relaxed in a follow up.
-  auto validate_operand_ids = [](const TensorView* tv) {
+  auto validate_operand = [](const TensorView* tv) {
+    TORCH_INTERNAL_ASSERT(
+        tv->getMemoryType() == MemoryType::Local,
+        "Only supporting register input for mma ops, up to sm80 all mma ops have to take register inputs.");
+
     TORCH_INTERNAL_ASSERT(
         std::all_of(
             tv->domain()->domain().begin() + tv->getComputeAtPosition(),
             tv->domain()->domain().end(),
             [](IterDomain* id) {
               return id->isMmaSwizzled() ||
-                  (id->isBroadcast() &&
+                  // MMA instructions can only take inputs from registers,
+                  //  so we always assume mma op inputs are located on
+                  //  registers.
+                  // Currently requiring that serial ids on the right of the
+                  //  CA axis are constant sized to ensure early detection of
+                  //  invalid mma schedules.
+                  ((id->isBroadcast() || id->extent()->isConstInt()) &&
                       id->getParallelType() == ParallelType::Serial);
             }),
         "All id's on the right of CA pos needs to be mma-swizzled by WarpMmaSwizzler\n",
         tv);
   };
 
-  validate_operand_ids(mma->inA()->as<TensorView>());
-  validate_operand_ids(mma->inB()->as<TensorView>());
+  validate_operand(mma->inA()->as<TensorView>());
+  validate_operand(mma->inB()->as<TensorView>());
+
+  // Additionally validate that mma is not directly taking a double buffered
+  //  register input as the double buffer indexing is currently not compatible
+  //  with fragment iteration. Would need to require a cache stage in this case.
+  TORCH_INTERNAL_ASSERT(
+      !mma->inA()->as<TensorView>()->isDoubleBuffered(),
+      "MMA op cannot directly take double buffered register input, put a set stage before.");
+  TORCH_INTERNAL_ASSERT(
+      !mma->inB()->as<TensorView>()->isDoubleBuffered(),
+      "MMA op cannot directly take double buffered register input, put a set stage before.");
 }
 
 //! Note and TODO:
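The two new asserts push the fix to the scheduler: when a double buffered register tensor feeds the mma op, an extra register copy (a "set" stage) has to sit in between, which is what scheduleMatmul below does when smem-read double buffering is enabled. A minimal sketch using the acr/bcr and ab/bb names from the matmul scheduler's naming convention:

  // acr/bcr: double buffered registers loaded from shared memory.
  // ab/bb: the tensors actually consumed by the mma op, produced by an
  //  extra cacheAfter() copy so the mma op never reads the double buffered
  //  tensors directly.
  TensorView* ab = acr->cacheAfter();
  TensorView* bb = bcr->cacheAfter();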

torch/csrc/jit/codegen/cuda/scheduler/matmul.cpp

Lines changed: 72 additions & 26 deletions
@@ -46,8 +46,37 @@ void scheduleMatmul(
     TensorView* c,
     TensorView* a,
     TensorView* b,
-    MmaBuilder& mma_builder,
-    MatMulTileOptions& gemm_tile) {
+    MatmulParam& params) {
+  // Unpack from params.
+  auto& mma_builder = params.mma_builder;
+  auto& gemm_tile = params.tile_sizes;
+
+  // Including current tensor naming convention for reference,
+  //  this is very temporary and will change over time and
+  //  in fact the whole body of this function will
+  //  eventually be a set of utility functions for different
+  //  sections of matmul(fusion) kernels, with
+  //  each having its own build out to do.
+  //
+  // Current naming convention:
+  //
+  //  operands assumed in global memory : a, b
+  //
+  //  registers staging global load : ar, br (short for a/b read)
+  //
+  //  shared mem cache of operands : acw_smem, bcw_smem (short for a/b
+  //   cache_write smem)
+  //
+  //  registers at shared memory load output : acr, bcr (short for a/b cache
+  //   read)
+  //
+  //  register tensor input to the actual mma op: ab, bb (short for a/b
+  //   broadcasted)
+  //
+  //  accumulator register: cc (short for c cache)
+  //
+  //  result in global memory: c
+
   // Currently only support a, b, c as fusion inputs/outputs
   //  aka. no prolog and epilog fusion yet.
   TORCH_CHECK(
@@ -112,6 +141,17 @@ void scheduleMatmul(
 
     acr = acw_smem->cacheAfter();
     bcr = bcw_smem->cacheAfter();
+    if (params.double_buffer_options.double_buffer_smem_read) {
+      // Provide another copy op between the double buffered
+      //  smem load register and the actual mma ops to avoid
+      //  complication in double buffered fragment iteration.
+      ab = acr->cacheAfter();
+      bb = bcr->cacheAfter();
+    } else {
+      ab = acr;
+      bb = bcr;
+    }
+
   } else {
     acw_smem = ar->cacheAfter();
     bcw_smem = br->cacheAfter();
@@ -182,8 +222,8 @@ void scheduleMatmul(
   b->computeAt(cc, 3);
 
   // Main Loop:
-  acr->computeAt(cc, -4);
-  bcr->computeAt(cc, -4);
+  acr->computeAt(cc, -6);
+  bcr->computeAt(cc, -6);
 
   // Add mma swizzle:
   //   TODO: this section goes to a separate matmul util,
@@ -192,30 +232,26 @@ void scheduleMatmul(
   if (isTuring(mma_options.macro) || isAmpere(mma_options.macro)) {
     moveInnerBroadcastLeft(ab);
     moveInnerBroadcastLeft(bb);
-    ab->applyMmaSwizzle(mma_builder.operand(MmaOptions::Operand::A).build());
-    bb->applyMmaSwizzle(mma_builder.operand(MmaOptions::Operand::B).build());
-
-    // Propagate mma input swizzle up the DAG
-    //  to all the tensors before mma op and after shared mem read.
-    scheduler_utils::BoundedDirectionalTransformPropagator::backward(
-        ab,
-        -1,
-        {acw_smem},
-        scheduler_utils::BoundedDirectionalTransformPropagator::Options()
-            .propagateParallelType());
-    scheduler_utils::BoundedDirectionalTransformPropagator::backward(
-        bb,
-        -1,
-        {bcw_smem},
-        scheduler_utils::BoundedDirectionalTransformPropagator::Options()
-            .propagateParallelType());
-  } else {
-    // TODO:
-    //  Need to build out this to support balanced prolog fusion on Volta.
-    acr->applyMmaSwizzle(mma_builder.operand(MmaOptions::Operand::A).build());
-    bcr->applyMmaSwizzle(mma_builder.operand(MmaOptions::Operand::B).build());
   }
 
+  ab->applyMmaSwizzle(mma_builder.operand(MmaOptions::Operand::A).build());
+  bb->applyMmaSwizzle(mma_builder.operand(MmaOptions::Operand::B).build());
+
+  // Propagate mma input swizzle up the DAG
+  //  to all the tensors before mma op and after shared mem read.
+  scheduler_utils::BoundedDirectionalTransformPropagator::backward(
+      ab,
+      -1,
+      {acw_smem},
+      scheduler_utils::BoundedDirectionalTransformPropagator::Options()
+          .propagateParallelType());
+  scheduler_utils::BoundedDirectionalTransformPropagator::backward(
+      bb,
+      -1,
+      {bcw_smem},
+      scheduler_utils::BoundedDirectionalTransformPropagator::Options()
+          .propagateParallelType());
+
   cc->applyMmaSwizzle(
       mma_builder.operand(MmaOptions::Operand::Accumulator).build());
 
@@ -243,6 +279,16 @@ void scheduleMatmul(
   cc->axis(4)->parallelize(ParallelType::TIDy);
 
   // Propagate mma output swizzle and parallelization down the DAG
+  if (params.double_buffer_options.double_buffer_smem_write) {
+    acw_smem->doubleBuffer();
+    bcw_smem->doubleBuffer();
+  }
+
+  if (params.double_buffer_options.double_buffer_smem_read) {
+    acr->doubleBuffer();
+    bcr->doubleBuffer();
+  }
+
   scheduler_utils::BoundedDirectionalTransformPropagator::forward(
       cc,
       -1,

torch/csrc/jit/codegen/cuda/scheduler/matmul.h

Lines changed: 22 additions & 2 deletions
@@ -10,6 +10,27 @@ namespace jit {
 namespace fuser {
 namespace cuda {
 
+//! Starting point for matmul scheduler parameters:
+class MatmulParam {
+ public:
+  MatmulParam(MmaBuilder builder) : mma_builder(builder) {}
+
+  struct DoubleBufferOptions {
+    bool double_buffer_smem_write = false;
+    bool double_buffer_smem_read = false;
+  };
+
+  //! Specifies the tiling hierarchy on block,
+  //!  warp, and instruction levels.
+  MatMulTileOptions tile_sizes;
+
+  //! Parameters for configuring mma ops.
+  MmaBuilder mma_builder;
+
+  //! Specifies which tensors we double buffer.
+  DoubleBufferOptions double_buffer_options;
+};
+
 //! Prototype auto scheduling function.
 //! Currently only support a pure matmul with no
 //!  fused prolog or epilog.
@@ -22,8 +43,7 @@ TORCH_CUDA_CU_API void scheduleMatmul(
     TensorView* c_tv,
     TensorView* a_tv,
     TensorView* b_tv,
-    MmaBuilder& mma_builder,
-    MatMulTileOptions& gemm_tile);
+    MatmulParam& params);
 
 } // namespace cuda
 } // namespace fuser
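A minimal caller-side sketch of the new entry point, assuming an MmaBuilder and tile sizes configured the way the existing matmul tests do (the specific macro, layout, and tile values below are illustrative and not part of this commit; c_tv, a_tv, b_tv are the fusion output and inputs):

  // Hypothetical configuration values; only the MatmulParam plumbing is new.
  MatMulTileOptions gemm_tile;
  gemm_tile.cta_tile = GemmTile(128, 128, 32);
  gemm_tile.warp_tile = GemmTile(64, 64, 32);
  gemm_tile.instruction_tile = GemmTile(16, 8, 16);

  MatmulParam params(
      MmaBuilder(MmaOptions::MacroType::Ampere_16_8_16, gemm_tile)
          .layout(MmaOptions::MmaInputLayout::TN));
  params.tile_sizes = gemm_tile;
  params.double_buffer_options.double_buffer_smem_write = true;
  params.double_buffer_options.double_buffer_smem_read = true;

  scheduleMatmul(c_tv, a_tv, b_tv, params);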
