28 commits
a598d85  Add test_gpu_match_frontend.cpp (jacobhinkle, Dec 21, 2022)
8290482  Properly parallelize SuperBasic and Basic tests (jacobhinkle, Dec 28, 2022)
a10fe6a  Add FP16 Basic and SuperBasic frontend tests (jacobhinkle, Dec 28, 2022)
d7cc442  Add parallelize tv4 in FusionFrontendBasicFP16_CUDA (jacobhinkle, Dec 29, 2022)
aa6b378  Add tv1->computeAt() to BasicFP16 test (jacobhinkle, Dec 29, 2022)
2e614f9  Add test: FusionFrontendCastDoubleToHalf_CUDA (jacobhinkle, Dec 29, 2022)
fb4fa85  Remove griddim split in CastDoubleToHalf test (jacobhinkle, Dec 29, 2022)
c0f211c  Add test FusionFrontendPromoteToDouble_CUDA (jacobhinkle, Dec 29, 2022)
9175568  Replace computeAt(..MostInlined) with inlineMost() (jacobhinkle, Jan 3, 2023)
5c43ccc  Add stream argument to printMath, add ir_math_check (jacobhinkle, Jan 3, 2023)
6d3d34f  Make SuperBasic manual schedule match automatic (jacobhinkle, Jan 3, 2023)
b1c51c9  Enable dumping kernel to string, fix FP16 SuperBasic test (jacobhinkle, Jan 3, 2023)
5ffd5a5  Add python defs to docstrings of frontend tests (jacobhinkle, Jan 4, 2023)
cd5fc9f  Add stream arg to Fusion::print{,Transforms}() (jacobhinkle, Jan 4, 2023)
e041b9f  Add compare_ir() which compares math, fusions & kernels (jacobhinkle, Jan 4, 2023)
acbdbc9  Add pointwise Add() example to frontend and C++ tests (jacobhinkle, Jan 4, 2023)
30ed719  Add FusionFrontendImplicitBroadcastInput_CUDA test (jacobhinkle, Jan 4, 2023)
a75c509  Update {super,}basic tests to closer match auto sched (jacobhinkle, Jan 4, 2023)
cff9b5e  Make ImplicitBroadcastInput manual IR match auto (jacobhinkle, Jan 4, 2023)
95c0e08  Add IR comparisons to Cast and Promote tests (jacobhinkle, Jan 4, 2023)
35b872d  Modify CMakeLists.txt and headers following rebase. (jacobhinkle, Jan 5, 2023)
bb1d5ab  Fix Frontend{Add,SuperBasic,SuperBasicFP16} tests (jacobhinkle, Jan 17, 2023)
dc6e5e0  Make FrontendBasic test match auto schedule (jacobhinkle, Jan 17, 2023)
dd44075  Make FrontendBasicFP16 test match auto schedule (jacobhinkle, Jan 17, 2023)
4519bd1  Make FrontendCastDoubleToHalf match auto schedule (jacobhinkle, Jan 18, 2023)
4697c30  Make FrontendPromoteToDouble test match auto sched (jacobhinkle, Jan 18, 2023)
98e5259  Make FrontendImplicitBroadcastInput match auto sched (jacobhinkle, Jan 18, 2023)
aaf5713  Add two broadcasting frontend tests (jacobhinkle, Jan 19, 2023)
102 changes: 98 additions & 4 deletions test/test_nvfuser_frontend.py
@@ -61,13 +61,107 @@ def exec_nvfuser(self, fusion_func, inputs, new_fusion_expected=True) :
         self.assertEqual(fc.num_fusions() - before_fusions, int(new_fusion_expected))
         return out, fs

-    def test_basic(self) :
+    def test_add(self):
         inputs = [
-            torch.ones(2, 4, 8, device='cuda'),
-            torch.ones(2, 4, 8, device='cuda'),
+            torch.ones(2, 4, 8, device="cuda"),
+            torch.ones(2, 4, 8, device="cuda"),
         ]

-        def fusion_func(fd: FusionDefinition) :
+        def fusion_func(fd: FusionDefinition):
             t0 = fd.define_tensor(3)
             t1 = fd.define_tensor(3)
+
+            t2 = fd.ops.add(t0, t1)
+
+            fd.add_output(t2)
+
+        # Expected Output is a tensor of 2's
+        nvf_out1, _ = self.exec_nvfuser(fusion_func, inputs)
+
+        # Create a new fusion with the same definition, it should hit the cache!
+        nvf_out2, fs2 = self.exec_nvfuser(
+            fusion_func, inputs, new_fusion_expected=False
+        )
+
+        # Create a fusion from a fusion id and make sure it executes!
+        fs3 = Fusion(fs2.id())
+        nvf_out3 = fs3.execute(inputs)[0]
+
+        eager_out = inputs[0] + inputs[1]
+        self.assertEqual(eager_out, nvf_out1)
+        self.assertEqual(eager_out, nvf_out2)
+        self.assertEqual(eager_out, nvf_out3)
+
+    def test_super_basic(self):
+        inputs = [
+            torch.ones(4, 8, device="cuda"),
+        ]
+
+        def fusion_func(fd: FusionDefinition):
+            t0 = fd.define_tensor(2)
+            c0 = fd.define_constant(3.0)
+
+            t1 = fd.ops.mul(t0, c0)
+            t2 = fd.ops.sum(t1, [-1], False, DataType.Float)
+
+            fd.add_output(t2)
+
+        # Expected Output is a tensor of 24's
+        nvf_out1, _ = self.exec_nvfuser(fusion_func, inputs)
+
+        # Create a new fusion with the same definition, it should hit the cache!
+        nvf_out2, fs2 = self.exec_nvfuser(
+            fusion_func, inputs, new_fusion_expected=False
+        )
+
+        # Create a fusion from a fusion id and make sure it executes!
+        fs3 = Fusion(fs2.id())
+        nvf_out3 = fs3.execute(inputs)[0]
+
+        eager_out = torch.sum(inputs[0] * 3.0, dim=-1)
+        self.assertEqual(eager_out, nvf_out1)
+        self.assertEqual(eager_out, nvf_out2)
+        self.assertEqual(eager_out, nvf_out3)
+
+    def test_super_basic_fp16(self):
+        inputs = [
+            torch.ones(4, 8, device="cuda", dtype=torch.float16),
+        ]
+
+        def fusion_func(fd: FusionDefinition):
+            t0 = fd.define_tensor(2, DataType.Half)
+            c0 = fd.define_constant(3.0)
+
+            t1 = fd.ops.mul(t0, c0)
+            t2 = fd.ops.sum(t1, [-1], False, DataType.Float)
+
+            t3 = fd.ops.cast(t2, DataType.Half)
+            fd.add_output(t3)
+
+        # Expected Output is a tensor of 24's
+        nvf_out1, _ = self.exec_nvfuser(fusion_func, inputs)
+
+        # Create a new fusion with the same definition, it should hit the cache!
+        nvf_out2, fs2 = self.exec_nvfuser(
+            fusion_func, inputs, new_fusion_expected=False
+        )
+
+        # Create a fusion from a fusion id and make sure it executes!
+        fs3 = Fusion(fs2.id())
+        nvf_out3 = fs3.execute(inputs)[0]
+
+        eager_out = torch.sum(inputs[0] * 3.0, dim=-1)
+        self.assertEqual(eager_out, nvf_out1)
+        self.assertEqual(eager_out, nvf_out2)
+        self.assertEqual(eager_out, nvf_out3)
+
+    def test_basic(self):
+        inputs = [
+            torch.ones(2, 4, 8, device="cuda"),
+            torch.ones(2, 4, 8, device="cuda"),
+        ]
+
+        def fusion_func(fd: FusionDefinition):
+            t0 = fd.define_tensor(3)
+            t1 = fd.define_tensor(3)
             c0 = fd.define_constant(3.0)
1 change: 1 addition & 0 deletions third_party/nvfuser/CMakeLists.txt
@@ -332,6 +332,7 @@ if(BUILD_TEST)
   list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_indexing_ops.cpp)
   list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_indexing.cpp)
   list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_gather_ops.cpp)
+  list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_match_frontend.cpp)

   set(JIT_TEST_CU_SRCS)
   list(APPEND JIT_TEST_CU_SRCS ${NVFUSER_ROOT}/test/test_gpu_rng.cu)
38 changes: 20 additions & 18 deletions third_party/nvfuser/csrc/fusion.cpp
@@ -13,6 +13,8 @@
 #include <lower2device.h>
 #include <lower_bank_conflict.h>

+#include <ostream>
+
 namespace torch {
 namespace jit {
 namespace fuser {
@@ -344,26 +346,26 @@ void Fusion::validateInputs() {
   }
 }

-void Fusion::print() {
+void Fusion::print(std::ostream& stream) {
   FUSER_PERF_SCOPE("Fusion::print");

   FusionGuard fg(this);
-  std::cout << "\n%kernel {\n";
-  IrMathPrinter op_exprs(std::cout);
+  stream << "\n%kernel {\n";
+  IrMathPrinter op_exprs(stream);
   op_exprs.handle(this);
-  std::cout << "\nTransformPrinter : \n";
-  IrTransformPrinter t_exprs(std::cout);
+  stream << "\nTransformPrinter : \n";
+  IrTransformPrinter t_exprs(stream);
   t_exprs.handle(this);
-  std::cout << "}\n\n";
+  stream << "}\n\n";
 }

-void Fusion::printKernel(DataType index_type) {
+void Fusion::printKernel(DataType index_type, std::ostream& stream) {
   FUSER_PERF_SCOPE("Fusion::printKernel");
   TORCH_INTERNAL_ASSERT(
       !this->isA<kir::Kernel>(),
       "Cannot \"print kernel\" of a kernel container. ",
       "This would require lowering during lowering.");
-  std::cout << codegen::generateCudaKernel(GpuLower(this, index_type).kernel());
+  stream << codegen::generateCudaKernel(GpuLower(this, index_type).kernel());
 }

 std::unordered_map<std::string, std::pair<int, int>> Fusion::bankConflictInfo(
@@ -380,19 +382,19 @@ std::unordered_map<std::string, std::pair<int, int>> Fusion::bankConflictInfo(
   return result;
 }

-void Fusion::printMath(bool from_outputs_only) {
+void Fusion::printMath(bool from_outputs_only, std::ostream& stream) {
   FUSER_PERF_SCOPE("Fusion::printMath");

   FusionGuard fg(this);
   auto exprs_for_print = exprs();
-  std::cout << "Inputs:" << std::endl;
+  stream << "Inputs:" << std::endl;
   for (auto inp : inputs()) {
-    std::cout << " " << inp << ", " << inp->getDataType().value() << std::endl;
+    stream << " " << inp << ", " << inp->getDataType().value() << std::endl;
   }

-  std::cout << "Outputs:" << std::endl;
+  stream << "Outputs:" << std::endl;
   for (auto out : outputs()) {
-    std::cout << " " << out << ", " << out->getDataType().value() << std::endl;
+    stream << " " << out << ", " << out->getDataType().value() << std::endl;
   }

   // If we want everything in the fusion, grab all values without uses to
@@ -407,11 +409,11 @@ void Fusion::printMath(bool from_outputs_only) {
     exprs_for_print = StmtSort::getExprs(this, leaf_vals);
   }

-  std::cout << "\n%kernel_math {\n";
+  stream << "\n%kernel_math {\n";
   for (auto expr : exprs_for_print) {
-    std::cout << expr;
+    stream << expr;
   }
-  std::cout << "}\n\n";
+  stream << "}\n\n";
 }

 std::vector<Val*> Fusion::inputsAndCreated() {
@@ -427,11 +429,11 @@ std::vector<Val*> Fusion::inputsAndCreated() {
   return result;
 }

-void Fusion::printTransforms() {
+void Fusion::printTransforms(std::ostream& stream) {
   FUSER_PERF_SCOPE("Fusion::printTransforms");

   FusionGuard fg(this);
-  IrTransformPrinter t_exprs(std::cout);
+  IrTransformPrinter t_exprs(stream);
   t_exprs.handle(this);
 }

13 changes: 9 additions & 4 deletions third_party/nvfuser/csrc/fusion.h
@@ -124,17 +124,22 @@ class TORCH_CUDA_CU_API Fusion : public IrContainer {
   void validateInputs();

   //! Print this fusion to the console
-  void print();
+  void print(std::ostream& stream = std::cout);

   //! Print Arith exprs
   //! \param from_outputs_only Only print exprs reachable from outputs
-  void printMath(bool from_outputs_only = true);
+  //! \param stream Where to print output (defaults to std::cout)
+  void printMath(
+      bool from_outputs_only = true,
+      std::ostream& stream = std::cout);

   //! Print transformations used in fusion (can be very verbose)
-  void printTransforms();
+  void printTransforms(std::ostream& stream = std::cout);

   //! Lower the fusion and print a kernel
-  void printKernel(DataType index_type = DataType::Int);
+  void printKernel(
+      DataType index_type = DataType::Int,
+      std::ostream& stream = std::cout);

   //! Returns if this fusion is noop, for example, trivially forwarding inputs,
   //! or all outputs are size-0 tensors, etc.
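
The std::ostream& parameters threaded through print(), printMath(), printTransforms(), and printKernel() let callers capture the fusion IR and the generated CUDA kernel as strings instead of writing to std::cout; since the stream arguments default to std::cout, existing call sites are unaffected. This is the mechanism behind the compare_ir() helper added in e041b9f and the kernel-to-string dumping from b1c51c9. A minimal sketch of the capture pattern, assuming a fully defined Fusion object named `fusion` inside a gtest test body; `expected_math` is a hypothetical reference string, not code from this PR:

#include <sstream>

// Capture the IR math and the lowered CUDA kernel into strings
// instead of printing them to stdout.
std::stringstream math_ss;
std::stringstream kernel_ss;
fusion.printMath(/*from_outputs_only=*/true, math_ss);
fusion.printKernel(DataType::Int, kernel_ss);

// The captured text can then be compared, e.g. a manually scheduled
// fusion against the automatically scheduled one, as the new
// frontend-matching tests do.
ASSERT_EQ(math_ss.str(), expected_math);  // expected_math is illustrative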