csarofeen · shmsong · Jun 27, 2022 · Jun 13, 2022 · Jun 13, 2022 · Jun 13, 2022
diff --git a/torch/csrc/jit/codegen/cuda/mma_type.cpp b/torch/csrc/jit/codegen/cuda/mma_type.cpp
@@ -40,6 +40,9 @@ MmaBuilder& MmaBuilder::operand(MmaOptions::Operand a_or_b) {
 
 // TODO: validate op config
 MmaOptions MmaBuilder::build() const {
+  // option_.mma_op is only populated by accumulatorTv(); refuse to hand out
+  //  a copy of the options while it is still unset.
+  TORCH_CHECK(
+      option_.mma_op != nullptr,
+      "Please configure accumulator tv before using swizzle options.");
   return option_;
 }
 
@@ -53,6 +56,15 @@ void MmaBuilder::configureMma(TensorView* mma_output) const {
   mma->configureOptions(option_);
 }
 
+// Record in this builder's options the mma op that defines `tv`.
+//  Raises a TORCH_CHECK error if `tv` is null, is not in MemoryType::Local,
+//  has no definition, or is defined by something other than an MmaOp.
+void MmaBuilder::accumulatorTv(TensorView* tv) {
+  // Fail with a readable error instead of dereferencing a null pointer below.
+  TORCH_CHECK(tv != nullptr, "Accumulator tv cannot be null");
+  TORCH_CHECK(
+      tv->getMemoryType() == MemoryType::Local, "Mma only outputs to register");
+  TORCH_CHECK(tv->definition(), "Input cannot be accumulator tv");
+  auto mma = dynamic_cast<MmaOp*>(tv->definition());
+  TORCH_CHECK(mma, "Requires mma op output for reduction tv");
+  // Stored so that build() can verify an mma op has been selected.
+  option_.mma_op = mma;
+}
+
 namespace {
 
 // Utility to get ldmatrix direction a mma layout and operand

diff --git a/torch/csrc/jit/codegen/cuda/mma_type.h b/torch/csrc/jit/codegen/cuda/mma_type.h
@@ -94,6 +94,9 @@ struct MmaOptions {
         operand == other.operand &&
         accumulator_stride == other.accumulator_stride;
   }
+
+  // To be inferred by mma builder interface.
+  MmaOp* mma_op = nullptr;
 };
 
 //! User interface for configuring the mma and mma related
@@ -127,6 +130,10 @@ class TORCH_CUDA_CU_API MmaBuilder {
   //!  specified mma option.
   LoadStoreOpType ldMatrix() const;
 
+  //! Record the mma op that defines the given accumulator tv in the mma
+  //!  builder, so that the mma op does not have to be matched automatically.
+  void accumulatorTv(TensorView* tv);
+
   //! Fill in mma options in scheduling time.
   //!  Each mma op in Fusion IR must be configured once before lowering.
   //!  Mma options are configuration parameters used in lowering to mma