1 change: 1 addition & 0 deletions caffe2/CMakeLists.txt
@@ -958,6 +958,7 @@ if(USE_CUDA OR USE_ROCM)

# The list of NVFUSER runtime files
list(APPEND NVFUSER_RUNTIME_FILES
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/array.cu
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_reduction.cu
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/runtime/block_sync_default.cu
2 changes: 1 addition & 1 deletion test/cpp/jit/test_gpu.cpp
@@ -20414,7 +20414,7 @@ TEST_F(NVFuserTest, FusionSmemBlockGemmCacheDoubleBuffer_CUDA) {
}

TEST_F(NVFuserTest, FusionIntermediateTensorVectorize_CUDA) {
auto mem_types = {MemoryType::Shared, MemoryType::Local};
std::vector<MemoryType> mem_types = {MemoryType::Shared, MemoryType::Local};

for (auto mem_type : mem_types) {
Fusion fusion;
1 change: 1 addition & 0 deletions tools/build_variables.bzl
@@ -33,6 +33,7 @@ GENERATED_CPP = [

# NVFuser runtime library
libtorch_nvfuser_runtime_sources = [
"torch/csrc/jit/codegen/cuda/runtime/array.cu",
"torch/csrc/jit/codegen/cuda/runtime/bf16_support.cu",
"torch/csrc/jit/codegen/cuda/runtime/block_reduction.cu",
"torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu",
77 changes: 53 additions & 24 deletions torch/csrc/jit/codegen/cuda/codegen.cpp
@@ -375,26 +375,48 @@ class CudaKernelGenerator : private OptOutConstDispatch {
uop->out()->dtype() == uop->in()->dtype(),
"Vectorized store/load requires input and output datatypes match.");
}
}

if (is_vector_op) {
if (uop->in()->isScalar()) {
indent() << "reinterpret_cast<"
<< "Array<" << uop->out()->dtype() << ", " << vector_word_size
<< ">*>"
<< "(&" << gen(uop->out()) << ")->set(" << gen(uop->in())
<< ");\n";
} else {
indent() << "*reinterpret_cast<"
<< "Array<" << uop->out()->dtype() << ", " << vector_word_size
<< ">*>"
<< "(&" << gen(uop->out()) << ")"
<< " = *reinterpret_cast<"
<< "Array<" << uop->in()->dtype() << ", " << vector_word_size
<< ">*>"
<< "(&" << gen(uop->in()) << ");\n";
if (is_vector_op) {
auto out_tv = uop->out()->as<kir::TensorIndex>()->view();
if (uop->in()->isScalar()) {
if (out_tv->getMemoryType() == MemoryType::Local) {
// Vectorized initialization
indent() << varName(out_tv) << ".set(" << gen(uop->in()) << ");\n";
} else {
indent() << "arraySet<" << out_tv->getMemoryType() << ", "
<< vector_word_size << ">(" << gen(uop->out()) << ", "
<< gen(uop->in()) << ");\n";
}
} else {
// Vectorized load
TORCH_INTERNAL_ASSERT(
uop->in()->isA<kir::TensorIndex>(),
"Invalid input to unary op with tensor output, found: ",
uop->in()->toString());

auto in_tv = uop->in()->as<kir::TensorIndex>()->view();
bool localToGlobal = out_tv->getMemoryType() == MemoryType::Global &&
in_tv->getMemoryType() == MemoryType::Local;

bool globalToLocal = out_tv->getMemoryType() == MemoryType::Local &&
in_tv->getMemoryType() == MemoryType::Global;

if (localToGlobal) {
indent() << "loadLocalToGlobal<" << uop->out()->dtype() << ", "
<< vector_word_size << ">(&" << gen(uop->out()) << ", &"
<< gen(uop->in()) << ");\n";
} else if (globalToLocal) {
indent() << "loadGlobalToLocal<" << uop->out()->dtype() << ", "
<< vector_word_size << ">(&" << gen(uop->out()) << ", &"
<< gen(uop->in()) << ");\n";
} else {
indent() << "loadGeneric<" << uop->out()->dtype() << ", "
<< vector_word_size << ">(&" << gen(uop->out()) << ", &"
<< gen(uop->in()) << ");\n";
}
}
return;
}
return;
}

if (uop->out()->isA<NamedScalar>()) {
@@ -1281,8 +1303,9 @@ class CudaKernelGenerator : private OptOutConstDispatch {
// Allocate alias another Allocate stmt
const auto alias_tv = alloc->alias()->buffer()->as<TensorView>();
indent() << "// Alias Allocation - " << alloc->memoryType() << "\n";
indent() << buffer_dtype << "* " << varName(tv) << " = "
<< varName(alias_tv) << ";\n";
indent() << "auto& " << varName(tv) << " = " << varName(alias_tv)
<< ";\n";

} else {
// Standard Memory Allocation
switch (tv->getMemoryType()) {
@@ -1307,10 +1330,16 @@
<< buffer_dtype << "));\n";
}
break;
case MemoryType::Local:
indent() << buffer_dtype << " " << varName(tv) << "["
<< genInline(size) << "];\n";
break;
case MemoryType::Local: {
auto va = kernel_->summary().vectorized_accesses;
if (va.find(tv) != va.end()) {
indent() << "Array<" << buffer_dtype << ", " << genInline(size)
<< ", " << va.at(tv) << "> " << varName(tv) << ";\n";
} else {
indent() << buffer_dtype << " " << varName(tv) << "["
<< genInline(size) << "];\n";
}
} break;
default:
TORCH_INTERNAL_ASSERT(false, "Unexpected memory type");
}
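Note: the generated code above calls into helpers that this PR adds in the new runtime/array.cu file, whose contents are not part of this diff. The following is a rough sketch of what such helpers could look like; the struct layout, alignment, and signatures are inferred purely from the template arguments and address-of expressions the codegen emits above (Array<dtype, size, word_size> with a set() method, and loadGeneric / loadGlobalToLocal / loadLocalToGlobal taking element pointers), so every detail is an assumption rather than the actual runtime source. arraySet is omitted.

// Hypothetical sketch only -- not the real runtime/array.cu.
// Register- or shared-memory-backed array whose alignment matches the widest
// vectorized access (Align is assumed to be a power of two, as vector widths are).
template <typename T, int N, int Align>
struct alignas(sizeof(T) * Align) Array {
  T array[N];

  // Broadcast a scalar into all N elements (vectorized initialization).
  __device__ void set(T value) {
#pragma unroll
    for (int i = 0; i < N; ++i) {
      array[i] = value;
    }
  }

  __device__ T& operator[](int i) {
    return array[i];
  }
};

// Copy N contiguous elements as a single aligned, vector-sized chunk.
// Both pointers are assumed to be aligned to sizeof(T) * N bytes.
template <typename T, int N>
__device__ void loadGeneric(T* to, T* from) {
  *reinterpret_cast<Array<T, N, N>*>(to) =
      *reinterpret_cast<Array<T, N, N>*>(from);
}

// In the real runtime these two presumably differ from loadGeneric (e.g. via
// cache hints on the global-memory side); in this sketch they just forward.
template <typename T, int N>
__device__ void loadGlobalToLocal(T* to, T* from) {
  loadGeneric<T, N>(to, from);
}

template <typename T, int N>
__device__ void loadLocalToGlobal(T* to, T* from) {
  loadGeneric<T, N>(to, from);
}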
3 changes: 3 additions & 0 deletions torch/csrc/jit/codegen/cuda/executor_utils.cpp
@@ -13,6 +13,7 @@
#include <torch/csrc/jit/resource_guard.h>

#include <nvfuser_resources/PhiloxCudaStateRaw.h>
#include <nvfuser_resources/array.h>
#include <nvfuser_resources/bf16_support.h>
#include <nvfuser_resources/block_reduction.h>
#include <nvfuser_resources/block_sync_atomic.h>
@@ -44,6 +45,8 @@ namespace executor_utils {
std::string kernelPreamble() {
std::stringstream ss;

ss << nvfuser_resources::array_cu;

#ifndef __HIP_PLATFORM_HCC__
ss << nvfuser_resources::fp16_support_cu;
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
6 changes: 5 additions & 1 deletion torch/csrc/jit/codegen/cuda/kernel.cpp
@@ -270,12 +270,16 @@ class ValidateAllocation : private OptOutConstDispatch {
} // namespace

// TODO(kir): Kernel IR validation
void Kernel::finalize(std::vector<Expr*> top_level_exprs) {
void Kernel::finalize(
std::vector<Expr*> top_level_exprs,
const std::unordered_map<TensorView*, int>& vectorized_info) {
TORCH_INTERNAL_ASSERT(top_level_exprs_.empty());
top_level_exprs_ = std::move(top_level_exprs);
warp_padded_parallel_info_ = GpuLower::current()->getWarpPaddedParallelInfo();
ValidateAllocation::validate(this);
analyze();
// Make sure this is after analyze as it sets summary_
summary_.vectorized_accesses = vectorized_info;
}

void Kernel::analyze() {
12 changes: 10 additions & 2 deletions torch/csrc/jit/codegen/cuda/kernel.h
@@ -78,6 +78,10 @@ struct KernelSummary {
//! Effective ParallelTypes of broadcast ops
std::unordered_map<const BroadcastOp*, ParallelTypeBitmap>
broadcast_parallel_types;

// Track which tensor views are inputs or outputs of a vectorized operation
// and their maximum vectorized access size
std::unordered_map<TensorView*, int> vectorized_accesses;
};

class KernelInternalProxy;
@@ -108,9 +112,13 @@ class TORCH_CUDA_CU_API Kernel final : public Fusion {
//! Finalize a kernel definition
//!
//! At this point we have a complete kernel definition and we can
//! run analysis passes to build a KernelSummary
//! run analysis passes to build a KernelSummary. Manually send in vectorized
//! info so it doesn't have to be rebuilt.
//!
void finalize(std::vector<Expr*> top_level_exprs);

void finalize(
std::vector<Expr*> top_level_exprs,
const std::unordered_map<TensorView*, int>& vectorized_info);

const std::vector<Expr*>& topLevelExprs() const {
return top_level_exprs_;
3 changes: 2 additions & 1 deletion torch/csrc/jit/codegen/cuda/kernel_ir.cpp
@@ -206,7 +206,8 @@ ForLoop::ForLoop(IrBuilderPasskey passkey, IterDomain* iter_domain)
nullptr,
nullptr,
nullptr,
isParallelTypeVectorize(iter_domain->getParallelType()),
!iter_domain->isBroadcast() &&
isParallelTypeVectorize(iter_domain->getParallelType()),
nullptr,
false) {
TORCH_INTERNAL_ASSERT(
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/kernel_ir.h
@@ -143,7 +143,7 @@ class TORCH_CUDA_CU_API TensorIndex final : public Val {
public:
TensorIndex(
IrBuilderPasskey,
const fuser::cuda::TensorView* view,
const TensorView* view,
std::vector<Val*> indices);

std::vector<Val*>::size_type nDims() const {
69 changes: 67 additions & 2 deletions torch/csrc/jit/codegen/cuda/lower2device.cpp
@@ -258,6 +258,10 @@ void GpuLower::lower(Fusion* fusion, DataType index_type) {
// Want to run this after parallel map is created
validateVectorize(fusion_);

// Extract TensorViews that are accessed in a vectorized way and track their
// word size.
fillVectorizeInfo();

// Compute thread predicates. Depends on parallel_dimension_map_
thread_pred_map_.build(fusion_);

@@ -340,8 +344,9 @@ void GpuLower::lower(Fusion* fusion, DataType index_type) {
const auto exprs_cleaned_up_loops =
KIRCleaner::cleanUp(exprs_register_adjusted);

// We now have the lowered expressions, finalize the kernel IR
kernel_->finalize(exprs_cleaned_up_loops);
// We now have the lowered expressions, finalize the kernel IR, add the
// vectorized entry to it manually as it's already populated in GpuLower
kernel_->finalize(exprs_cleaned_up_loops, vectorized_accesses_);
}

kir::Kernel* GpuLower::kernel() const {
@@ -355,6 +360,66 @@ GpuLower* GpuLower::current() {
return active_gpu_lower;
}

// This was primarily copied from codegen.cpp::CudaKernelGenerator::handle(const
// UnaryOp*)
void GpuLower::fillVectorizeInfo() {
for (auto expr : fusion_->exprs()) {
if (expr->isA<UnaryOp>()) {
if (ir_utils::isTvOp(expr)) {
auto uop = expr->as<UnaryOp>();
auto out_tv = ir_utils::getTvOutput(expr);
auto out_domain = out_tv->domain()->domain();

bool is_vector_op = false;
int vector_word_size = 1;
bool vectorize_op = false;
bool misaligned_op = false;

for (auto id : out_domain) {
if (!isParallelTypeVectorize(id->getParallelType())) {
continue;
}

ExpressionEvaluator expr_eval(id->fusion());
auto vector_size_optional = expr_eval.evaluate(id->extent());

TORCH_INTERNAL_ASSERT(
vector_size_optional.has_value(),
"Could not evaluate constant value bound to vectorized dim.");

vector_word_size = (int)vector_size_optional.value();

vectorize_op = isParallelTypeVectorize(id->getParallelType());
break;
}
if (!vectorize_op) {
continue;
}

if (vectorized_accesses_.find(out_tv) != vectorized_accesses_.end()) {
vectorized_accesses_[out_tv] =
std::max(vectorized_accesses_[out_tv], vector_word_size);
} else {
vectorized_accesses_[out_tv] = vector_word_size;
}

TORCH_INTERNAL_ASSERT(
uop->in()->isA<TensorView>(),
"Input of vectorized uop must be a tensorview but found input: ",
uop->in()->toString());

TensorView* in_tv = uop->in()->as<TensorView>();
if (vectorized_accesses_.find(in_tv) != vectorized_accesses_.end()) {
vectorized_accesses_[in_tv] =
std::max(vectorized_accesses_[in_tv], vector_word_size);
} else {
vectorized_accesses_[in_tv] = vector_word_size;
}
}
}
}
}

} // namespace cuda
} // namespace fuser
} // namespace jit
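To illustrate how the pieces fit together: fillVectorizeInfo() feeds summary().vectorized_accesses, which the codegen change above uses to allocate vectorized local buffers as aligned Arrays and to route copies through the dedicated load helpers. Below is a hand-written analogue (not compiler output) of what a vectorize-by-4 copy could lower to; tensor names, indexing, and bounds handling are made up for the example, and it reuses the hypothetical helpers sketched earlier.

// Illustrative only: hypothetical lowered kernel for a vectorized copy.
__global__ void vectorized_copy(float* T0, float* T2, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i * 4 + 3 < n) {
    Array<float, 4, 4> T1;                            // aligned local buffer
    loadGlobalToLocal<float, 4>(&T1[0], &T0[i * 4]);  // one 16-byte load
    loadLocalToGlobal<float, 4>(&T2[i * 4], &T1[0]);  // one 16-byte store
  }
}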
11 changes: 11 additions & 0 deletions torch/csrc/jit/codegen/cuda/lower2device.h
@@ -22,6 +22,7 @@

#include <memory>
#include <ostream>
#include <unordered_map>

namespace torch {
namespace jit {
@@ -133,6 +134,10 @@ class TORCH_CUDA_CU_API GpuLower : public NonCopyable {
return common_index_map_;
}

const auto& vectorizedAccesses() const {
return vectorized_accesses_;
}

private:
void lower(Fusion* fusion, DataType index_type);

@@ -141,6 +146,8 @@
// warp size.
void collectPaddedParallelDims();

void fillVectorizeInfo();

private:
// Lowered Kernel IR
std::unique_ptr<kir::Kernel> kernel_;
@@ -162,6 +169,10 @@
DoubleBufferInfo double_buffer_info_;
CommonIndexMap common_index_map_;

// Track which tensor views are inputs or outputs of a vectorized operation
// and their maximum vectorized access size
std::unordered_map<TensorView*, int> vectorized_accesses_;

Fusion* fusion_ = nullptr;
};

25 changes: 25 additions & 0 deletions torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp
@@ -920,6 +920,31 @@ class AllocateReuseModifier {
continue;
}

if (alloc_info->alloc_expr->buffer()->isA<TensorView>()) {
if (!alloc_info->alloc_expr->buffer()->isA<TensorView>()) {
continue;
}
auto this_tv = alloc_info->alloc_expr->buffer()->as<TensorView>();
auto reuse_tv = alloc_info->alloc_expr->buffer()->as<TensorView>();
// Check that either both TVs are vectorized accesses, or neither is.
// Vectorized allocations require correct alignment so they can only
// alias with other allocations with the right alignment
const auto& va = GpuLower::current()->vectorizedAccesses();
if ((va.find(this_tv) == va.end()) !=
(va.find(reuse_tv) == va.end())) {
return false;
}

// Shared memory is all aligned to 128 bits, local memory might not be
if (this_tv->getMemoryType() == MemoryType::Local &&
va.find(this_tv) != va.end()) {
// Make sure alignment matches
if (va.at(this_tv) != va.at(reuse_tv)) {
return false;
}
}
}

// TODO:
// Outer interval based sharing supports arbitrary re-indexing into
// the same buffer and would require additional syncs if fully
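The constraint added here can be read as: two buffers may only share storage if they agree on whether they are vectorized, and, for local memory (which, unlike shared memory, is not automatically 128-bit aligned), on the vector width that determines their alignment. A minimal standalone restatement of that rule follows; the helper name and signature are hypothetical and not part of the PR.

// Hypothetical helper, not part of this PR: may `candidate` alias `existing`
// under the vectorization/alignment constraints introduced above?
bool vectorizeCompatibleForAliasing(
    TensorView* candidate,
    TensorView* existing,
    const std::unordered_map<TensorView*, int>& vectorized_accesses) {
  const bool candidate_vec = vectorized_accesses.count(candidate) != 0;
  const bool existing_vec = vectorized_accesses.count(existing) != 0;

  // Either both buffers are vectorized or neither is.
  if (candidate_vec != existing_vec) {
    return false;
  }

  // Shared memory is always 128-bit aligned; a local buffer's alignment comes
  // from its Array<> word size, so vectorized local buffers must match.
  if (candidate_vec && candidate->getMemoryType() == MemoryType::Local) {
    return vectorized_accesses.at(candidate) ==
        vectorized_accesses.at(existing);
  }
  return true;
}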
6 changes: 4 additions & 2 deletions torch/csrc/jit/codegen/cuda/lower_validation.cpp
@@ -284,8 +284,10 @@ class VectorizeValidator : public OptInDispatch {
}
}

// If no vectorized id's found simply return;
if (v_id == nullptr) {
// If no vectorized ids are found, simply return. If the vectorized access is
// a broadcast, it won't generate an actual vector instruction, so it can
// safely be ignored.
if (v_id == nullptr || v_id->isBroadcast()) {
return;
}
