1 change: 1 addition & 0 deletions caffe2/CMakeLists.txt
@@ -958,6 +958,7 @@ if(USE_CUDA OR USE_ROCM)

# The list of NVFUSER runtime files
list(APPEND NVFUSER_RUNTIME_FILES
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/array.cu
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_reduction.cu
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_sync_default.cu
2 changes: 1 addition & 1 deletion test/cpp/jit/test_gpu.cpp
@@ -20414,7 +20414,7 @@ TEST_F(NVFuserTest, FusionSmemBlockGemmCacheDoubleBuffer_CUDA) {
}

TEST_F(NVFuserTest, FusionIntermediateTensorVectorize_CUDA) {
auto mem_types = {MemoryType::Shared, MemoryType::Local};
std::vector<MemoryType> mem_types = {MemoryType::Shared, MemoryType::Local};

for (auto mem_type : mem_types) {
Fusion fusion;
1 change: 1 addition & 0 deletions tools/build_variables.bzl
@@ -33,6 +33,7 @@ GENERATED_CPP = [

# NVFuser runtime library
libtorch_nvfuser_runtime_sources = [
"torch/csrc/jit/codegen/cuda/runtime/array.cu",
"torch/csrc/jit/codegen/cuda/runtime/bf16_support.cu",
"torch/csrc/jit/codegen/cuda/runtime/block_reduction.cu",
"torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu",
77 changes: 53 additions & 24 deletions torch/csrc/jit/codegen/cuda/codegen.cpp
@@ -375,26 +375,48 @@ class CudaKernelGenerator : private OptOutConstDispatch {
uop->out()->dtype() == uop->in()->dtype(),
"Vectorized store/load requires input and output datatypes match.");
}
}

if (is_vector_op) {
if (uop->in()->isScalar()) {
indent() << "reinterpret_cast<"
<< "Array<" << uop->out()->dtype() << ", " << vector_word_size
<< ">*>"
<< "(&" << gen(uop->out()) << ")->set(" << gen(uop->in())
<< ");\n";
} else {
indent() << "*reinterpret_cast<"
<< "Array<" << uop->out()->dtype() << ", " << vector_word_size
<< ">*>"
<< "(&" << gen(uop->out()) << ")"
<< " = *reinterpret_cast<"
<< "Array<" << uop->in()->dtype() << ", " << vector_word_size
<< ">*>"
<< "(&" << gen(uop->in()) << ");\n";
if (is_vector_op) {
auto out_tv = uop->out()->as<kir::TensorIndex>()->view();
if (uop->in()->isScalar()) {
if (out_tv->getMemoryType() == MemoryType::Local) {
// Vectorized initialization
indent() << varName(out_tv) << ".set(" << gen(uop->in()) << ");\n";
} else {
indent() << "arraySet<" << out_tv->getMemoryType() << ", "
<< vector_word_size << ">(" << gen(uop->out()) << ", "
<< gen(uop->in()) << ");\n";
}
} else {
// Vectorized load
TORCH_INTERNAL_ASSERT(
uop->in()->isA<kir::TensorIndex>(),
"Invalid input to unary op with tensor output, found: ",
uop->in()->toString());

auto in_tv = uop->in()->as<kir::TensorIndex>()->view();
bool localToGlobal = out_tv->getMemoryType() == MemoryType::Global &&
in_tv->getMemoryType() == MemoryType::Local;

bool globalToLocal = out_tv->getMemoryType() == MemoryType::Local &&
in_tv->getMemoryType() == MemoryType::Global;

if (localToGlobal) {
indent() << "loadLocalToGlobal<" << uop->out()->dtype() << ", "
<< vector_word_size << ">(&" << gen(uop->out()) << ", &"
<< gen(uop->in()) << ");\n";
} else if (globalToLocal) {
indent() << "loadGlobalToLocal<" << uop->out()->dtype() << ", "
<< vector_word_size << ">(&" << gen(uop->out()) << ", &"
<< gen(uop->in()) << ");\n";
} else {
indent() << "loadGeneric<" << uop->out()->dtype() << ", "
<< vector_word_size << ">(&" << gen(uop->out()) << ", &"
<< gen(uop->in()) << ");\n";
}
}
return;
}
return;
}

if (uop->out()->isA<NamedScalar>()) {
@@ -1281,8 +1303,9 @@ class CudaKernelGenerator : private OptOutConstDispatch {
// Allocate alias another Allocate stmt
const auto alias_tv = alloc->alias()->buffer()->as<TensorView>();
indent() << "// Alias Allocation - " << alloc->memoryType() << "\n";
indent() << buffer_dtype << "* " << varName(tv) << " = "
<< varName(alias_tv) << ";\n";
indent() << "auto& " << varName(tv) << " = " << varName(alias_tv)
<< ";\n";

} else {
// Standard Memory Allocation
switch (tv->getMemoryType()) {
@@ -1307,10 +1330,16 @@
<< buffer_dtype << "));\n";
}
break;
case MemoryType::Local:
indent() << buffer_dtype << " " << varName(tv) << "["
<< genInline(size) << "];\n";
break;
case MemoryType::Local: {
auto va = kernel_->summary().vectorized_accesses;
if (va.find(tv) != va.end()) {
indent() << "Array<" << buffer_dtype << ", " << genInline(size)
<< ", " << va.at(tv) << "> " << varName(tv) << ";\n";
} else {
indent() << buffer_dtype << " " << varName(tv) << "["
<< genInline(size) << "];\n";
}
} break;
default:
TORCH_INTERNAL_ASSERT(false, "Unexpected memory type");
}
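Note: the generated code above calls into helpers that this PR adds in the new runtime/array.cu file, whose contents are not part of this diff. The following is a rough sketch of what such helpers could look like; the struct layout, alignment, and signatures are inferred purely from the template arguments and address-of expressions the codegen emits above (Array<dtype, size, word_size> with a set() method, and loadGeneric / loadGlobalToLocal / loadLocalToGlobal taking element pointers), so every detail is an assumption rather than the actual runtime source. arraySet is omitted.

// Hypothetical sketch only -- not the real runtime/array.cu.
// Register- or shared-memory-backed array whose alignment matches the widest
// vectorized access (Align is assumed to be a power of two, as vector widths are).
template <typename T, int N, int Align>
struct alignas(sizeof(T) * Align) Array {
  T array[N];

  // Broadcast a scalar into all N elements (vectorized initialization).
  __device__ void set(T value) {
#pragma unroll
    for (int i = 0; i < N; ++i) {
      array[i] = value;
    }
  }

  __device__ T& operator[](int i) {
    return array[i];
  }
};

// Copy N contiguous elements as a single aligned, vector-sized chunk.
// Both pointers are assumed to be aligned to sizeof(T) * N bytes.
template <typename T, int N>
__device__ void loadGeneric(T* to, T* from) {
  *reinterpret_cast<Array<T, N, N>*>(to) =
      *reinterpret_cast<Array<T, N, N>*>(from);
}

// In the real runtime these two presumably differ from loadGeneric (e.g. via
// cache hints on the global-memory side); in this sketch they just forward.
template <typename T, int N>
__device__ void loadGlobalToLocal(T* to, T* from) {
  loadGeneric<T, N>(to, from);
}

template <typename T, int N>
__device__ void loadLocalToGlobal(T* to, T* from) {
  loadGeneric<T, N>(to, from);
}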
3 changes: 3 additions & 0 deletions torch/csrc/jit/codegen/cuda/executor_utils.cpp
@@ -13,6 +13,7 @@
#include <torch/csrc/jit/resource_guard.h>

#include <nvfuser_resources/PhiloxCudaStateRaw.h>
#include <nvfuser_resources/array.h>
#include <nvfuser_resources/bf16_support.h>
#include <nvfuser_resources/block_reduction.h>
#include <nvfuser_resources/block_sync_atomic.h>
@@ -44,6 +45,8 @@ namespace executor_utils {
std::string kernelPreamble() {
std::stringstream ss;

ss << nvfuser_resources::array_cu;

#ifndef __HIP_PLATFORM_HCC__
ss << nvfuser_resources::fp16_support_cu;
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
6 changes: 5 additions & 1 deletion torch/csrc/jit/codegen/cuda/kernel.cpp
@@ -270,12 +270,16 @@ class ValidateAllocation : private OptOutConstDispatch {
} // namespace

// TODO(kir): Kernel IR validation
void Kernel::finalize(std::vector<Expr*> top_level_exprs) {
void Kernel::finalize(
std::vector<Expr*> top_level_exprs,
const std::unordered_map<TensorView*, int>& vectorized_info) {
TORCH_INTERNAL_ASSERT(top_level_exprs_.empty());
top_level_exprs_ = std::move(top_level_exprs);
warp_padded_parallel_info_ = GpuLower::current()->getWarpPaddedParallelInfo();
ValidateAllocation::validate(this);
analyze();
// Make sure this is after analyze as it sets summary_
summary_.vectorized_accesses = vectorized_info;
}

void Kernel::analyze() {
12 changes: 10 additions & 2 deletions torch/csrc/jit/codegen/cuda/kernel.h
@@ -78,6 +78,10 @@ struct KernelSummary {
//! Effective ParallelTypes of broadcast ops
std::unordered_map<const BroadcastOp*, ParallelTypeBitmap>
broadcast_parallel_types;

// Track which tensor views are inputs or outputs of a vectorized operation
// and their maximum vectorized access size
std::unordered_map<TensorView*, int> vectorized_accesses;
};

class KernelInternalProxy;
@@ -108,9 +112,13 @@ class TORCH_CUDA_CU_API Kernel final : public Fusion {
//! Finalize a kernel definition
//!
//! At this point we have a complete kernel definition and we can
//! run analysis passes to build a KernelSummary
//! run analysis passes to build a KernelSummary. Manually send in vectorized
//! info so it doesn't have to be rebuilt.
//!
void finalize(std::vector<Expr*> top_level_exprs);

void finalize(
std::vector<Expr*> top_level_exprs,
const std::unordered_map<TensorView*, int>& vectorized_info);

const std::vector<Expr*>& topLevelExprs() const {
return top_level_exprs_;
3 changes: 2 additions & 1 deletion torch/csrc/jit/codegen/cuda/kernel_ir.cpp
@@ -206,7 +206,8 @@ ForLoop::ForLoop(IrBuilderPasskey passkey, IterDomain* iter_domain)
nullptr,
nullptr,
nullptr,
isParallelTypeVectorize(iter_domain->getParallelType()),
!iter_domain->isBroadcast() &&
isParallelTypeVectorize(iter_domain->getParallelType()),
nullptr,
false) {
TORCH_INTERNAL_ASSERT(
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/kernel_ir.h
@@ -143,7 +143,7 @@ class TORCH_CUDA_CU_API TensorIndex final : public Val {
public:
TensorIndex(
IrBuilderPasskey,
const fuser::cuda::TensorView* view,
const TensorView* view,
std::vector<Val*> indices);

std::vector<Val*>::size_type nDims() const {
69 changes: 67 additions & 2 deletions torch/csrc/jit/codegen/cuda/lower2device.cpp
@@ -258,6 +258,10 @@ void GpuLower::lower(Fusion* fusion, DataType index_type) {
// Want to run this after parallel map is created
validateVectorize(fusion_);

// Extract TensorViews that are accessed in a vectorized way and track their
// word size.
fillVectorizeInfo();

// Compute thread predicates. Depends on parallel_dimension_map_
thread_pred_map_.build(fusion_);

@@ -340,8 +344,9 @@ void GpuLower::lower(Fusion* fusion, DataType index_type) {
const auto exprs_cleaned_up_loops =
KIRCleaner::cleanUp(exprs_register_adjusted);

// We now have the lowered expressions, finalize the kernel IR
kernel_->finalize(exprs_cleaned_up_loops);
// We now have the lowered expressions, finalize the kernel IR, add the
// vectorized entry to it manually as it's already populated in GpuLower
kernel_->finalize(exprs_cleaned_up_loops, vectorized_accesses_);
}

kir::Kernel* GpuLower::kernel() const {
@@ -355,6 +360,66 @@ GpuLower* GpuLower::current() {
return active_gpu_lower;
}

// This was primarily copied from codegen.cpp::CudaKernelGenerator::handle(const
// UnaryOp*)
void GpuLower::fillVectorizeInfo() {
for (auto expr : fusion_->exprs()) {
if (expr->isA<UnaryOp>()) {
if (ir_utils::isTvOp(expr)) {
auto uop = expr->as<UnaryOp>();
auto out_tv = ir_utils::getTvOutput(expr);
auto out_domain = out_tv->domain()->domain();

bool is_vector_op = false;
int vector_word_size = 1;
bool vectorize_op = false;
bool misaligned_op = false;

for (auto id : out_domain) {
if (!isParallelTypeVectorize(id->getParallelType())) {
continue;
}

ExpressionEvaluator expr_eval(id->fusion());
auto vector_size_optional = expr_eval.evaluate(id->extent());

TORCH_INTERNAL_ASSERT(
vector_size_optional.has_value(),
"Could not evaluate constant value bound to vectorized dim.");

vector_word_size = (int)vector_size_optional.value();

vectorize_op = isParallelTypeVectorize(id->getParallelType());
break;
}
if (!vectorize_op) {
continue;
}

if (vectorized_accesses_.find(out_tv) != vectorized_accesses_.end()) {
vectorized_accesses_[out_tv] =
std::max(vectorized_accesses_[out_tv], vector_word_size);
} else {
vectorized_accesses_[out_tv] = vector_word_size;
}

TORCH_INTERNAL_ASSERT(
uop->in()->isA<TensorView>(),
"Input of vectorized uop must be a tensorview but found input: ",
uop->in()->toString());

TensorView* in_tv = uop->in()->as<TensorView>();
if (vectorized_accesses_.find(in_tv) != vectorized_accesses_.end()) {
vectorized_accesses_[in_tv] =
std::max(vectorized_accesses_[in_tv], vector_word_size);
} else {
vectorized_accesses_[in_tv] = vector_word_size;
}
}
}
}
}

} // namespace cuda
} // namespace fuser
} // namespace jit
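To illustrate how the pieces fit together: fillVectorizeInfo() feeds summary().vectorized_accesses, which the codegen change above uses to allocate vectorized local buffers as aligned Arrays and to route copies through the dedicated load helpers. Below is a hand-written analogue (not compiler output) of what a vectorize-by-4 copy could lower to; tensor names, indexing, and bounds handling are made up for the example, and it reuses the hypothetical helpers sketched earlier.

// Illustrative only: hypothetical lowered kernel for a vectorized copy.
__global__ void vectorized_copy(float* T0, float* T2, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i * 4 + 3 < n) {
    Array<float, 4, 4> T1;                            // aligned local buffer
    loadGlobalToLocal<float, 4>(&T1[0], &T0[i * 4]);  // one 16-byte load
    loadLocalToGlobal<float, 4>(&T2[i * 4], &T1[0]);  // one 16-byte store
  }
}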
11 changes: 11 additions & 0 deletions torch/csrc/jit/codegen/cuda/lower2device.h
@@ -22,6 +22,7 @@

#include <memory>
#include <ostream>
#include <unordered_map>

namespace torch {
namespace jit {
@@ -133,6 +134,10 @@ class TORCH_CUDA_CU_API GpuLower : public NonCopyable {
return common_index_map_;
}

const auto& vectorizedAccesses() const {
return vectorized_accesses_;
}

private:
void lower(Fusion* fusion, DataType index_type);

@@ -141,6 +146,8 @@
// warp size.
void collectPaddedParallelDims();

void fillVectorizeInfo();

private:
// Lowered Kernel IR
std::unique_ptr<kir::Kernel> kernel_;
@@ -162,6 +169,10 @@
DoubleBufferInfo double_buffer_info_;
CommonIndexMap common_index_map_;

// Track which tensor views are inputs or outputs of a vectorized operation
// and their maximum vectorized access size
std::unordered_map<TensorView*, int> vectorized_accesses_;

Fusion* fusion_ = nullptr;
};

25 changes: 25 additions & 0 deletions torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp
@@ -920,6 +920,31 @@ class AllocateReuseModifier {
continue;
}

if (alloc_info->alloc_expr->buffer()->isA<TensorView>()) {
if (!alloc_info->alloc_expr->buffer()->isA<TensorView>()) {
continue;
}
auto this_tv = alloc_info->alloc_expr->buffer()->as<TensorView>();
auto reuse_tv = alloc_info->alloc_expr->buffer()->as<TensorView>();
// Check that either both TVs are vectorized accesses, or neither is.
// Vectorized allocations require correct alignment so they can only
// alias with other allocations with the right alignment
const auto& va = GpuLower::current()->vectorizedAccesses();
if ((va.find(this_tv) == va.end()) !=
(va.find(reuse_tv) == va.end())) {
return false;
}

// Shared memory is all aligned to 128 bits, local memory might not be
if (this_tv->getMemoryType() == MemoryType::Local &&
va.find(this_tv) != va.end()) {
// Make sure alignment matches
if (va.at(this_tv) != va.at(reuse_tv)) {
return false;
}
}
}

// TODO:
// Outer interval based sharing supports arbitrary re-indexing into
// the same buffer and would require additional syncs if fully
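The constraint added here can be read as: two buffers may only share storage if they agree on whether they are vectorized, and, for local memory (which, unlike shared memory, is not automatically 128-bit aligned), on the vector width that determines their alignment. A minimal standalone restatement of that rule follows; the helper name and signature are hypothetical and not part of the PR.

// Hypothetical helper, not part of this PR: may `candidate` alias `existing`
// under the vectorization/alignment constraints introduced above?
bool vectorizeCompatibleForAliasing(
    TensorView* candidate,
    TensorView* existing,
    const std::unordered_map<TensorView*, int>& vectorized_accesses) {
  const bool candidate_vec = vectorized_accesses.count(candidate) != 0;
  const bool existing_vec = vectorized_accesses.count(existing) != 0;

  // Either both buffers are vectorized or neither is.
  if (candidate_vec != existing_vec) {
    return false;
  }

  // Shared memory is always 128-bit aligned; a local buffer's alignment comes
  // from its Array<> word size, so vectorized local buffers must match.
  if (candidate_vec && candidate->getMemoryType() == MemoryType::Local) {
    return vectorized_accesses.at(candidate) ==
        vectorized_accesses.at(existing);
  }
  return true;
}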
6 changes: 4 additions & 2 deletions torch/csrc/jit/codegen/cuda/lower_validation.cpp
@@ -284,8 +284,10 @@ class VectorizeValidator : public OptInDispatch {
}
}

// If no vectorized id's found simply return;
if (v_id == nullptr) {
// If no vectorized ids are found, simply return. If the vectorized access is
// a broadcast, it won't generate an actual vector instruction, so it can
// safely be ignored.
if (v_id == nullptr || v_id->isBroadcast()) {
return;
}
