-
Notifications
You must be signed in to change notification settings - Fork 7
Trivial forwarding #1995
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Trivial forwarding #1995
Changes from 20 commits
ea115b1
0ea8c19
fcb128e
fc463c8
7fb63e1
9436c2b
e839f7e
20d1566
1ca3058
4466651
4411118
d44d0dd
a7c4bf9
f1bf406
6aa9c90
a3e16a2
219b948
9063a87
3e6be64
64e5736
8e4841e
4fa5fbf
ff6c29e
4b6583e
0517d0a
7a3ddb0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -712,29 +712,32 @@ FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals( | |
} | ||
|
||
std::vector<at::Tensor> FusionExecutor::allocOutputs( | ||
const KernelArgumentHolder& args, | ||
kir::ExpressionEvaluator& expr_eval, | ||
const std::unordered_set<int>& alias_indices) { | ||
FUSER_PERF_SCOPE("FusionExecutor::AllocOutputs"); | ||
const auto kernel = lowered_->kernel(); | ||
// NOLINTNEXTLINE(cppcoreguidelines-init-variables) | ||
std::vector<at::Tensor> outputs; | ||
for (const auto out_i : c10::irange(kernel->outputs().size())) { | ||
// TODO: FIX this short-cut where we trivially forward inputs to outputs | ||
if (kernel->outputs()[out_i]->isFusionInput()) { | ||
TORCH_INTERNAL_ASSERT(false, "trivial input forwarding NOT IMPLEMENTED"); | ||
// for (auto inp_i : c10::irange(kernel->inputs().size())) { | ||
// if (kernel->inputs()[inp_i] == kernel->outputs()[out_i]) { | ||
// TORCH_INTERNAL_ASSERT( | ||
// inp_i < inputs.size(), | ||
// "Issue with an input showing up as output, couldn't find | ||
// input."); | ||
// TORCH_INTERNAL_ASSERT( | ||
// inputs[inp_i].isTensor(), | ||
// "Cannot register a scalar as an output in a fusion."); | ||
// outputs.push_back(inputs[inp_i].toTensor()); | ||
// break; | ||
// } | ||
// } | ||
for (auto inp_i : c10::irange(kernel->inputs().size())) { | ||
if (kernel->inputs()[inp_i] == kernel->outputs()[out_i]) { | ||
TORCH_INTERNAL_ASSERT( | ||
inp_i < args.size(), | ||
|
||
"Issue with an input showing up as output, couldn't find input."); | ||
TORCH_INTERNAL_ASSERT( | ||
args[inp_i]->isType(ArgType::Tensor), | ||
"Cannot register a scalar as an output in a fusion."); | ||
// pushing empty tensor for trivial forwarding. Since we handle this | ||
// in integration, see step 1 - note [trivial forwarding] | ||
c10::Device device(c10::DeviceType::CUDA, args.getDeviceIndex()); | ||
const auto tensor_options = | ||
at::TensorOptions().dtype(at::kFloat).device(device); | ||
outputs.emplace_back(at::empty({0}, tensor_options)); | ||
zasdfgbnm marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
break; | ||
} | ||
} | ||
} else { | ||
jjsjann123 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
TORCH_INTERNAL_ASSERT( | ||
kernel->outputs()[out_i]->isA<TensorView>(), | ||
|
@@ -774,7 +777,8 @@ KernelArgumentHolder FusionExecutor::evaluateOutputSizes( | |
meta_options.device = c10::Device(DeviceType::Meta, 0); | ||
|
||
for (const auto out_i : c10::irange(kernel->outputs().size())) { | ||
// If the output is just trivially the input, just "copy" it over. | ||
// If the output is just trivially the input, just "copy" it over, see note | ||
// [trivial forwarding] | ||
if (kernel->outputs()[out_i]->isFusionInput()) { | ||
for (auto inp_i : c10::irange(kernel->inputs().size())) { | ||
if (kernel->inputs()[inp_i] == kernel->outputs()[out_i]) { | ||
|
@@ -1095,7 +1099,7 @@ std::vector<at::Tensor> FusionExecutor::runFusion( | |
|
||
auto& output_alias_indices = output_alias_indices_entry.get(); | ||
|
||
allocated_outputs = allocOutputs(expr_eval, output_alias_indices); | ||
allocated_outputs = allocOutputs(args, expr_eval, output_alias_indices); | ||
|
||
for (const auto& entry : alias_indices) { | ||
auto aliased_output_index = entry.first; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25986,6 +25986,36 @@ TEST_F(NVFuserTest, FusionMappingRelation_CUDA) { | |
fusion, {out}, {t0, t1}, {t1 + t0.squeeze(0)}, __LINE__, __FILE__); | ||
} | ||
|
||
TEST_F(NVFuserTest, FusionTrivialInputForwarding_CUDA) { | ||
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>(); | ||
auto fusion = fusion_ptr.get(); | ||
FusionGuard fg(fusion); | ||
|
||
TensorView* tv0 = makeConcreteTensor({-1, -1}); | ||
TensorView* tv1 = makeConcreteTensor({-1, -1}); | ||
fusion->addInput(tv0); | ||
fusion->addInput(tv1); | ||
auto tv2 = add(tv1, IrBuilder::create<Double>(3.141)); | ||
// note, removing this line seems to give error in expression sorting. | ||
fusion->addOutput(tv2); | ||
fusion->addOutput(tv0); | ||
|
||
auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0); | ||
at::Tensor t0 = at::randn({10, 4}, options); | ||
at::Tensor t1 = at::randn({10, 4}, options); | ||
|
||
// Note | ||
|
||
FusionExecutorCache fec(std::move(fusion_ptr)); | ||
auto cg_outputs = fec.runFusionWithInputs({t0, t1}); | ||
|
||
testValidate( | ||
fusion, cg_outputs, {t0, t1}, {t1.add(3.141), t0}, __LINE__, __FILE__); | ||
|
||
auto cg_outputs2 = fec.runFusionWithInputs({t0, t1}); | ||
testValidate( | ||
fusion, cg_outputs2, {t0, t1}, {t1.add(3.141), t0}, __LINE__, __FILE__); | ||
} | ||
|
||
} // namespace jit | ||
} // namespace torch | ||
#endif // #if defined(USE_CUDA) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need this for loop here? Could we simplify the code into:
Doing so, we won't be checking that an output is not a scalar, but isn't this already checked somewhere else? For example, does
Fusion::addOutput
check it?