Merged
26 commits
ea115b1
added trivial forward support
jjsjann123 Sep 26, 2022
0ea8c19
fixing build
jjsjann123 Sep 26, 2022
fcb128e
added tests
jjsjann123 Sep 26, 2022
fc463c8
fixing tests
jjsjann123 Sep 26, 2022
7fb63e1
WAR added for segmented kernel with trivial input forward
jjsjann123 Sep 26, 2022
9436c2b
removing redundant code
jjsjann123 Sep 27, 2022
e839f7e
trivial forwarding: remove allocation
jjsjann123 Sep 27, 2022
20d1566
patching cache id on kernel argument holder
jjsjann123 Sep 27, 2022
1ca3058
debug
jjsjann123 Sep 27, 2022
4466651
group cache id reused for segmented fusion
jjsjann123 Sep 27, 2022
4411118
removing debug prints
jjsjann123 Sep 27, 2022
d44d0dd
patching output mismatch
jjsjann123 Sep 27, 2022
a7c4bf9
Disable cache
jjsjann123 Sep 27, 2022
f1bf406
removing commented code. fixing trivial forwarding for non-segmented …
jjsjann123 Sep 27, 2022
6aa9c90
putting comment: WIP
jjsjann123 Sep 27, 2022
a3e16a2
adding trivial forwarding documentation
jjsjann123 Sep 27, 2022
219b948
lintrunner
jjsjann123 Sep 27, 2022
9063a87
Merge remote-tracking branch 'origin/devel' into HEAD
jjsjann123 Sep 27, 2022
3e6be64
lintrunner
jjsjann123 Sep 28, 2022
64e5736
code cleaning per review comments
jjsjann123 Sep 28, 2022
8e4841e
more review comments addressed
jjsjann123 Sep 28, 2022
4fa5fbf
code cleaning; test cleaning
jjsjann123 Sep 28, 2022
ff6c29e
Merge remote-tracking branch 'csarofeen/devel' into HEAD
jjsjann123 Sep 28, 2022
4b6583e
lintrunner
jjsjann123 Sep 28, 2022
0517d0a
Merge remote-tracking branch 'csarofeen/devel' into HEAD
jjsjann123 Sep 30, 2022
7a3ddb0
removing set-copy as a WAR of lack of trivial forwarding support
jjsjann123 Sep 30, 2022
30 changes: 13 additions & 17 deletions torch/csrc/jit/codegen/cuda/executor.cpp
@@ -741,29 +741,24 @@ FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals(
}

std::vector<at::Tensor> FusionExecutor::allocOutputs(
const KernelArgumentHolder& args,
kir::ExpressionEvaluator& expr_eval,
const std::unordered_set<int>& alias_indices) {
FUSER_PERF_SCOPE("FusionExecutor::AllocOutputs");
const auto kernel = lowered_->kernel();
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
std::vector<at::Tensor> outputs;
TORCH_INTERNAL_ASSERT(
args.size() == kernel->inputs().size(),
"kernel arguments length does not match runtime arguments.");
for (const auto out_i : c10::irange(kernel->outputs().size())) {
// TODO: FIX this short-cut where we trivially forward inputs to outputs
if (kernel->outputs()[out_i]->isFusionInput()) {
TORCH_INTERNAL_ASSERT(false, "trivial input forwarding NOT IMPLEMENTED");
// for (auto inp_i : c10::irange(kernel->inputs().size())) {
// if (kernel->inputs()[inp_i] == kernel->outputs()[out_i]) {
// TORCH_INTERNAL_ASSERT(
// inp_i < inputs.size(),
// "Issue with an input showing up as output, couldn't find
// input.");
// TORCH_INTERNAL_ASSERT(
// inputs[inp_i].isTensor(),
// "Cannot register a scalar as an output in a fusion.");
// outputs.push_back(inputs[inp_i].toTensor());
// break;
// }
// }
// Push an empty tensor for trivial forwarding; the actual forwarding is
// handled in integration, see step 1 - note [trivial forwarding]
c10::Device device(c10::DeviceType::CUDA, args.getDeviceIndex());
const auto tensor_options =
at::TensorOptions().dtype(at::kFloat).device(device);
outputs.emplace_back(at::empty({0}, tensor_options));
} else {
TORCH_INTERNAL_ASSERT(
kernel->outputs()[out_i]->isA<TensorView>(),
@@ -803,7 +798,8 @@ KernelArgumentHolder FusionExecutor::evaluateOutputSizes(
meta_options.device = c10::Device(DeviceType::Meta, 0);

for (const auto out_i : c10::irange(kernel->outputs().size())) {
// If the output is just trivially the input, just "copy" it over.
// If the output is just trivially the input, just "copy" it over, see note
// [trivial forwarding]
if (kernel->outputs()[out_i]->isFusionInput()) {
for (auto inp_i : c10::irange(kernel->inputs().size())) {
if (kernel->inputs()[inp_i] == kernel->outputs()[out_i]) {
@@ -1124,7 +1120,7 @@ std::vector<at::Tensor> FusionExecutor::runFusion(

auto& output_alias_indices = output_alias_indices_entry.get();

allocated_outputs = allocOutputs(expr_eval, output_alias_indices);
allocated_outputs = allocOutputs(args, expr_eval, output_alias_indices);

for (const auto& entry : alias_indices) {
auto aliased_output_index = entry.first;
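Taken together with note [ trivial forwarding ] in kernel_cache.cpp further down, the allocOutputs change boils down to the control flow below. This is only a condensed sketch of the hunk above, not new behavior; the real-output branch is elided since it is unchanged by this PR.

```cpp
// Condensed sketch of FusionExecutor::allocOutputs after this change.
std::vector<at::Tensor> outputs;
for (const auto out_i : c10::irange(kernel->outputs().size())) {
  if (kernel->outputs()[out_i]->isFusionInput()) {
    // Step 1 of note [ trivial forwarding ]: keep the output slot so the
    // kernel arguments still line up, but back it with an empty tensor,
    // since the kernel never writes to a trivially forwarded output.
    c10::Device device(c10::DeviceType::CUDA, args.getDeviceIndex());
    const auto options = at::TensorOptions().dtype(at::kFloat).device(device);
    outputs.emplace_back(at::empty({0}, options));
  } else {
    // Real outputs are allocated from the expression evaluator as before
    // (unchanged by this PR, elided here).
  }
}
```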
1 change: 1 addition & 0 deletions torch/csrc/jit/codegen/cuda/executor.h
@@ -220,6 +220,7 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
// skip allocating real storage for those, but still maintain its spot to
// maintain the indexing from output aliases to inputs
std::vector<at::Tensor> allocOutputs(
const KernelArgumentHolder& args,
kir::ExpressionEvaluator& expr_eval,
const std::unordered_set<int>& alias_indices = {});

33 changes: 29 additions & 4 deletions torch/csrc/jit/codegen/cuda/kernel_cache.cpp
@@ -649,11 +649,16 @@ std::vector<at::Tensor> FusionKernelRuntime::runWithInput(
group_outputs.size() == group_runtime_outputs.size(),
"output size does not match");
for (const size_t group_out_i : c10::irange(group_outputs.size())) {
output_holder[group_outputs[group_out_i]] =
group_runtime_outputs[group_out_i];
// Trivial forwarding outputs an empty tensor to save bandwidth; skip the
// tensor_map update for those, since all future uses should refer to the
// original input tensor. See note [trivial forwarding]
if (!group_outputs[group_out_i]->isFusionInput()) {
output_holder[group_outputs[group_out_i]] =
group_runtime_outputs[group_out_i];

args.push(group_runtime_outputs[group_out_i]);
tensor_map.emplace(group_outputs[group_out_i], args.back());
args.push(group_runtime_outputs[group_out_i]);
tensor_map.emplace(group_outputs[group_out_i], args.back());
}
}
}

@@ -669,10 +674,30 @@ std::vector<at::Tensor> FusionKernelRuntime::runWithInput(
if (iter != output_holder.end()) {
fusion_outputs.push_back(iter->second);
} else if (output->isFusionInput()) {
// Note [ trivial forwarding ]
//
// Background:
// nvfuser codegen doesn't handle aliases at all. When a fusion forwards
// an input to an output without any operation on it, this is a no-op for
// codegen and the output tensor is never written to. However, codegen
// cannot "forward" an input to an output either, since all outputs are
// allocated in integration. If we do not special case it, we end up with
// a "fresh" tensor allocated for the forwarded input.
//
// Approach:
// There are two aspects to the support:
// step 1. Codegen handles forwarding implicitly. Forwarded inputs don't
// have any producer in the IR, hence the output argument is not used in
// the generated code. The kernel still requires an argument as a
// place-holder so that all arguments map correctly.
// step 2. Integration handles the trivial forwarding of inputs. When we
// put together `fusion_outputs` for a given fusion, outputs that are just
// fusion inputs directly return the corresponding input tensor.
const auto iter = tensor_map.find(output);
TORCH_INTERNAL_ASSERT(
iter != tensor_map.end(), "Cannot find output as aliased input");
auto arg = dynamic_cast<const TensorArgAbstract*>(iter->second);
// See step 2 - note [ trivial forwarding ]
fusion_outputs.push_back(arg->getTensor());
} else {
bool empty_type_check = output->getDataType().has_value() &&
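For reference, step 2 of the note reduces to the pattern below when integration assembles the fusion outputs. A minimal sketch of the hunk above, assuming a loop over the fusion's outputs with `tensor_map` and `fusion_outputs` as in the surrounding code:

```cpp
// Sketch of the integration-side handling (step 2 of note [ trivial forwarding ]).
if (output->isFusionInput()) {
  // The kernel only produced an empty place-holder for this output, so
  // return the original input tensor mapped for this Val instead.
  const auto iter = tensor_map.find(output);
  TORCH_INTERNAL_ASSERT(
      iter != tensor_map.end(), "Cannot find output as aliased input");
  auto arg = dynamic_cast<const TensorArgAbstract*>(iter->second);
  fusion_outputs.push_back(arg->getTensor());
}
```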
4 changes: 2 additions & 2 deletions torch/csrc/jit/codegen/cuda/ops/normalization.cpp
@@ -589,7 +589,7 @@ ForwardNormResult batch_norm(
// During inference, mean/invstd output are empty tensors
// on CPU, but not on CUDA. We need to make sure we have the same
// behavior as with eager mode on CUDA.
mean = set(running_mean);
mean = running_mean;
invstd = unbiased_invstd;
y = mul(x_sub_mean, invstd_bcast);
}
@@ -844,7 +844,7 @@ ForwardNormResult instance_norm(
// During inference, mean/invstd output are empty tensors
// on CPU, but not on CUDA. We need to make sure we have the same
// behavior as with eager mode on CUDA.
mean = set(running_mean);
mean = running_mean;
invstd = unbiased_invstd;
y = mul(x_sub_mean, invstd_bcast);
}
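The two normalization changes above drop the `set()` copy that only existed as a workaround for the missing trivial-forwarding support (see the last commit in the list); with this PR the running stats can be used as fusion outputs directly. A minimal sketch of the pattern this relies on, using a hypothetical standalone fusion rather than the actual batch_norm code; the new test below exercises the same pattern end to end:

```cpp
// Minimal sketch, assuming the usual nvFuser namespaces and test helpers
// (Fusion, FusionGuard, makeSymbolicTensor) are in scope; not code from this PR.
Fusion fusion;
FusionGuard fg(&fusion);
TensorView* running_mean = makeSymbolicTensor(1);
fusion.addInput(running_mean);
// Before: fusion.addOutput(set(running_mean));  // explicit copy as a WAR
fusion.addOutput(running_mean);  // now trivially forwarded, no copy needed
```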
28 changes: 28 additions & 0 deletions torch/csrc/jit/codegen/cuda/test/test_gpu.cpp
@@ -26026,6 +26026,34 @@ TEST_F(NVFuserTest, FusionInlineAt_CUDA) {
testValidate(fusion, {out}, {t0}, {t0.sin().cos()}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTrivialInputForwarding_CUDA) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
FusionGuard fg(fusion);

TensorView* tv0 = makeConcreteTensor({-1, -1});
TensorView* tv1 = makeConcreteTensor({-1, -1});
fusion->addInput(tv0);
fusion->addInput(tv1);
// Note: tv2 is not needed. Kept it here since previously there was an
// assertion from sorting in codegen.
auto tv2 = add(tv1, IrBuilder::create<Double>(3.141));
fusion->addOutput(tv0);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({10, 4}, options);
at::Tensor t1 = at::randn({10, 4}, options);

FusionExecutorCache fec(std::move(fusion_ptr));
auto cg_outputs = fec.runFusionWithInputs({t0, t1});

testValidate(fusion, cg_outputs, {t0, t1}, {t0}, __LINE__, __FILE__);

// Second run to ensure cache hit handles trivial forwarding properly
auto cg_outputs2 = fec.runFusionWithInputs({t0, t1});
testValidate(fusion, cg_outputs2, {t0, t1}, {t0}, __LINE__, __FILE__);
}

} // namespace jit
} // namespace torch
#endif // #if defined(USE_CUDA)