From 7325643262b64d47b1b97ce93b1f094799077acd Mon Sep 17 00:00:00 2001 From: jjsjann123 Date: Tue, 18 Aug 2020 11:53:54 -0700 Subject: [PATCH 001/167] CI, to our fork. (#145) (#303) Co-authored-by: Christian Sarofeen --- .github/workflows/clang_format.yml | 2 +- .github/workflows/lint.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/clang_format.yml b/.github/workflows/clang_format.yml index 4b5fc19cdf045..b09b2d0f40384 100644 --- a/.github/workflows/clang_format.yml +++ b/.github/workflows/clang_format.yml @@ -29,7 +29,7 @@ jobs: set -eu # This is necessary to get the same results regardless of whether the # PR was opened directly or from a forked repo. See: `9f890a92` for more info. - git remote add upstream https://github.com/pytorch/pytorch + git remote add upstream https://github.com/csarofeen/pytorch git fetch upstream "$GITHUB_BASE_REF" BASE_SHA=${{ github.event.pull_request.base.sha }} HEAD_SHA=${{ github.event.pull_request.head.sha }} diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 087397bcca6dd..b9db2a1c8c1c6 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -117,7 +117,7 @@ jobs: - name: Run clang-tidy run: | set -eux - git remote add upstream https://github.com/pytorch/pytorch + git remote add upstream https://github.com/csarofeen/pytorch git fetch upstream "$GITHUB_BASE_REF" BASE_SHA=${{ github.event.pull_request.base.sha }} HEAD_SHA=${{ github.event.pull_request.head.sha }} From 47f6a57bea521d7855b267ef4095b04d05c5ce44 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Thu, 20 Aug 2020 12:40:28 -0400 Subject: [PATCH 002/167] Fix for issue #306 and #296 (#307) * Fix https://github.com/csarofeen/pytorch/issues/306 * Reenable smem block gemm cache test. 
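The functional change is in index_compute.cpp: getConsumerIndex_impl now indexes against the consumer's root domain (getRootDomain() instead of getMaybeRFactorDomain()), and generateIndexAndExtentMap backfills its extent map so every mapped IterDomain carries an extent. The new test block added to testGPU_FusionAdvancedIndexing boils down to the broadcast-then-add pattern sketched below, reusing the makeConcreteTensor helper from test_gpu.cpp; executor setup and the allclose check are elided.

    // Sketch: a rank-2 tensor is broadcast to rank 3 and added to a rank-3
    // input, which exercises the consumer-side broadcast indexing fixed here.
    Fusion fusion;
    FusionGuard fg(&fusion);

    TensorView* tv0 = makeConcreteTensor({10, 20});
    TensorView* tv1 = makeConcreteTensor({10, 10, 20});
    fusion.addInput(tv0);
    fusion.addInput(tv1);

    TensorView* tv2 = add(tv0, new Float(1));                // t0 + 1
    TensorView* tv3 = broadcast(tv2, {true, false, false});  // [10, 20] -> [1, 10, 20]
    TensorView* tv4 = add(tv3, tv1);                         // broadcast add
    fusion.addOutput(tv4);
    // Reference result in ATen: t0.add(1.0).add(t1)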
--- test/cpp/jit/test_gpu.cpp | 40 +++++++++++++++---- torch/csrc/jit/codegen/cuda/index_compute.cpp | 20 ++++++++-- 2 files changed, 49 insertions(+), 11 deletions(-) diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index e6670ac5bdfb7..1da88e13f6a7b 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -3454,10 +3454,6 @@ void testGPU_FusionAdvancedIndexing() { FusionGuard fg(&fusion); int w = 3, x = 4, y = 7, z = 8; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({x, y, z}, options); - at::Tensor t1 = at::randn({w, x, y, z}, options); auto tv0 = makeDummyTensor(3); auto tv1 = makeDummyTensor(4); @@ -3466,9 +3462,12 @@ void testGPU_FusionAdvancedIndexing() { auto tv2 = add(tv0, new Float(1.0)); auto tv3 = add(tv2, tv1); - fusion.addOutput(tv3); + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({x, y, z}, options); + at::Tensor t1 = at::randn({w, x, y, z}, options); + fuser::cuda::scheduleFusion(&fusion, {t0, t1}); torch::jit::fuser::cuda::FusionExecutor fe; @@ -3480,6 +3479,35 @@ void testGPU_FusionAdvancedIndexing() { TORCH_CHECK(t3.allclose(outputs[0])); } + + { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeConcreteTensor({10, 20}); + fusion.addInput(tv0); + TensorView* tv1 = makeConcreteTensor({10, 10, 20}); + fusion.addInput(tv1); + + TensorView* tv2 = add(tv0, new Float(1)); + TensorView* tv3 = broadcast(tv2, {true, false, false}); + TensorView* tv4 = add(tv3, tv1); + fusion.addOutput(tv4); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({10, 20}, options); + at::Tensor t1 = at::randn({10, 10, 20}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({t0, t1}); + + auto t2 = t0.add(1.0); + auto t3 = t2.add(t1); + + TORCH_CHECK(t3.allclose(outputs[0])); + } } // Test a simple Gemm but also play around with fusion executor features @@ -5318,7 +5346,6 @@ void testGPU_FusionSmemBlockGemm() { } void testGPU_FusionSmemBlockGemmCache() { -#if 0 Fusion fusion; FusionGuard fg(&fusion); @@ -5401,7 +5428,6 @@ void testGPU_FusionSmemBlockGemmCache() { aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); -#endif } void testGPU_FusionConstCheck() { diff --git a/torch/csrc/jit/codegen/cuda/index_compute.cpp b/torch/csrc/jit/codegen/cuda/index_compute.cpp index ba5976dc53c89..c9cdd38a3c301 100644 --- a/torch/csrc/jit/codegen/cuda/index_compute.cpp +++ b/torch/csrc/jit/codegen/cuda/index_compute.cpp @@ -705,7 +705,19 @@ generateIndexAndExtentMap( // PROPAGATE CONSUMER -> PRODUCER END - return std::make_pair(index_compute.indexMap(), index_compute.extentMap()); + // Fill in extent map as some mapped indices may not have their extent filled + // in it, but consumers of this function expect it to be there + + std::unordered_map extent_map( + index_compute.extentMap()); + for (auto ind_entry : index_compute.indexMap()) { + auto id = ind_entry.first; + if (extent_map.find(id) == extent_map.end()) { + extent_map[id] = id->extent(); + } + } + + return std::make_pair(index_compute.indexMap(), extent_map); } } // namespace @@ -1011,7 +1023,7 @@ kir::TensorIndex* Index::getGlobalConsumerIndex( kir::TensorIndex* Index::getConsumerIndex_impl( TensorView* consumer_tv, const std::vector& loops) { - // grab all tensor views from 
producer_tv <- computeAtRoot + // grab all tensor views from consumer_tv <- computeAtRoot std::deque tv_stack = getComputeAtTVStackFrom(consumer_tv); std::unordered_map loop_to_ind_map = @@ -1026,9 +1038,9 @@ kir::TensorIndex* Index::getConsumerIndex_impl( auto index_map = index_and_extent_map.first; auto extent_map = index_and_extent_map.second; - // Indices should now be mapped onto IterDomains in producer, so just grab + // Indices should now be mapped onto IterDomains in consumer, so just grab // and use them. - auto root_dom = consumer_tv->getMaybeRFactorDomain(); + auto root_dom = consumer_tv->getRootDomain(); std::vector strided_inds; for (size_t i = 0; i < root_dom.size(); i++) { From 17935338119fc70235e01d5920b621cfcdb4e472 Mon Sep 17 00:00:00 2001 From: jjsjann123 Date: Thu, 20 Aug 2020 11:15:53 -0700 Subject: [PATCH 003/167] removing WAR of contig flag for broadcasting (#301) Fixes #230 removing WAR of contig flag for broadcasting removing unnecessary tests for the WAR --- test/cpp/jit/test_gpu.cpp | 42 --------------------- torch/csrc/jit/codegen/cuda/tensor_view.cpp | 9 ----- 2 files changed, 51 deletions(-) diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index 1da88e13f6a7b..568f88c4b4c45 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -800,48 +800,6 @@ void testGPU_FusionTensor() { } } - { - auto tensor = at::randn({2, 1, 4}, options); - auto tensor_type = TensorType::create(tensor); - auto fuser_tensor = new TensorView(tensor_type); - TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); - TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); - TORCH_CHECK(fuser_tensor->domain() != nullptr); - for (int i = 0; i < static_cast(fuser_tensor->nDims()); i++) { - // size 1 dimension are makred as broadcast - TORCH_CHECK( - fuser_tensor->axis(i)->isBroadcast() == (tensor.sizes()[i] == 1)); - } - TORCH_CHECK(fuser_tensor->domain()->contiguity()[2]); - - // temporary WAR to disable contig & bcast; issue # 230 - // TODO: insert the check where broadcast & contiguous cannot be marked - // together - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[0]); - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]); - } - - { - auto tensor = at::randn({2, 3, 1}, options); - auto tensor_type = TensorType::create(tensor); - auto fuser_tensor = new TensorView(tensor_type); - TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); - TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); - TORCH_CHECK(fuser_tensor->domain() != nullptr); - for (int i = 0; i < static_cast(fuser_tensor->nDims()); i++) { - // size 1 dimension are makred as broadcast - TORCH_CHECK( - fuser_tensor->axis(i)->isBroadcast() == (tensor.sizes()[i] == 1)); - } - TORCH_CHECK(fuser_tensor->domain()->contiguity()[0]); - - // temporary WAR to disable contig & bcast; issue # 230 - // TODO: insert the check where broadcast & contiguous cannot be marked - // together - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]); - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[2]); - } - // TensorType::create fills stride_properties, which helps us to mark // IterDomain properly // Note: implementation could change, depending on how much we want to invest diff --git a/torch/csrc/jit/codegen/cuda/tensor_view.cpp b/torch/csrc/jit/codegen/cuda/tensor_view.cpp index e8032c51925a2..66b202531fea1 100644 --- a/torch/csrc/jit/codegen/cuda/tensor_view.cpp +++ b/torch/csrc/jit/codegen/cuda/tensor_view.cpp @@ -67,11 +67,6 @@ TensorView::TensorView(const 
std::shared_ptr& tensor_type) stride_property_i->contiguous_.has_value() && stride_property_i->contiguous_.value() == true) { const size_t index = stride_property_i->stride_index_.value(); - // TODO: this is a temporary WAR to avoid contiguous_ flag on broadcasted - // dim, which results in wrong indexing math. issue #230 - if (sizes[index]->isBroadcast()) { - continue; - } if (i == 0) { // mark fastest changing dimension collapsible only when it's the last // dim; @@ -81,10 +76,6 @@ TensorView::TensorView(const std::shared_ptr& tensor_type) if (auto left_index_opt = tensor_type->stride_properties()[static_cast(i) - 1] ->stride_index_) { - // TODO: `isBroadcast` -> issue #230 - if (sizes[left_index_opt.value()]->isBroadcast()) { - continue; - } // collapse if two axes are neighboring in both sizes & stride_index; contig_info[index] = (left_index_opt.value() == (index + 1)); } From f12ab01af1292bafd1fffa2913ad376be34a3440 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Fri, 21 Aug 2020 11:56:26 -0400 Subject: [PATCH 004/167] LSTM cell C++ test (#310) Add an lstm cell c++ test for convenience. --- test/cpp/jit/test_gpu.cpp | 80 +++++++++++++++++++++++++++++++++++++++ test/cpp/jit/tests.h | 3 +- 2 files changed, 82 insertions(+), 1 deletion(-) diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index 568f88c4b4c45..898d12b8ff5c0 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -5975,6 +5975,86 @@ void testGPU_FusionThreadPredicate() { TORCH_CHECK(aten_output_tv3.allclose(cg_output_tv3)); } +void testGPU_FusionLSTMCell() { + const int hidden_features = 512; + const int batch_size = 64; + + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tvs[16]; + for (size_t i = 0; i < 16; i++) { + tvs[i] = makeDummyTensor(2); + fusion.addInput(tvs[i]); + } + + auto ingate = unaryOp( + UnaryOpType::Sigmoid, add(add(add(tvs[0], tvs[1]), tvs[2]), tvs[3])); + + auto forgetgate = unaryOp( + UnaryOpType::Sigmoid, add(add(add(tvs[4], tvs[5]), tvs[6]), tvs[7])); + + auto cellgate = unaryOp( + UnaryOpType::Tanh, add(add(add(tvs[8], tvs[9]), tvs[10]), tvs[11])); + + auto outgate = unaryOp( + UnaryOpType::Sigmoid, add(add(add(tvs[12], tvs[13]), tvs[14]), tvs[15])); + + auto cx = makeContigTensor(2); + fusion.addInput(cx); + + auto cy = add(mul(forgetgate, cx), mul(ingate, cellgate)); + + auto hy = mul(outgate, unaryOp(UnaryOpType::Tanh, cy)); + + fusion.addOutput(cy); + fusion.addOutput(hy); + + std::vector inputs; + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor large_tensor0 = + at::randn({batch_size, hidden_features * 4}, options); + at::Tensor large_tensor1 = + at::randn({batch_size, hidden_features * 4}, options); + at::Tensor large_tensor2 = + at::randn({batch_size, hidden_features * 4}, options); + at::Tensor large_tensor3 = + at::randn({batch_size, hidden_features * 4}, options); + + auto chunked0 = large_tensor0.chunk(4, 1); + auto chunked1 = large_tensor1.chunk(4, 1); + auto chunked2 = large_tensor2.chunk(4, 1); + auto chunked3 = large_tensor3.chunk(4, 1); + + inputs.insert(inputs.end(), chunked0.begin(), chunked0.end()); + inputs.insert(inputs.end(), chunked1.begin(), chunked1.end()); + inputs.insert(inputs.end(), chunked2.begin(), chunked2.end()); + inputs.insert(inputs.end(), chunked3.begin(), chunked3.end()); + + auto at_ingate = + chunked0[0].add(chunked0[1]).add(chunked0[2]).add(chunked0[3]).sigmoid(); + auto at_forgetgate = + chunked1[0].add(chunked1[1]).add(chunked1[2]).add(chunked1[3]).sigmoid(); + auto 
at_cellgate = + chunked2[0].add(chunked2[1]).add(chunked2[2]).add(chunked2[3]).tanh(); + auto at_outgate = + chunked3[0].add(chunked3[1]).add(chunked3[2]).add(chunked3[3]).sigmoid(); + + auto at_cx = at::randn({batch_size, hidden_features}, options); + inputs.push_back(at_cx); + auto at_cy = at_forgetgate.mul(at_cx).add(at_ingate.mul(at_cellgate)); + auto at_hy = at_outgate.mul(at_cy.tanh()); + + fuser::cuda::scheduleFusion(&fusion, c10::ArrayRef(inputs)); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion(c10::ArrayRef(inputs)); + + TORCH_CHECK(at_cy.allclose(outputs[0], 1e-4, 1e-7)); + TORCH_CHECK(at_hy.allclose(outputs[1], 1e-4, 1e-7)); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 773bf8dd71be3..bd21781a2b8b4 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -215,7 +215,8 @@ namespace jit { _(GPU_FusionTraversalOrder6) \ _(GPU_FusionTraversalOrder7) \ _(GPU_FusionBranches) \ - _(GPU_FusionThreadPredicate) + _(GPU_FusionThreadPredicate) \ + _(GPU_FusionLSTMCell) #else #define TH_FORALL_TESTS_CUDA(_) \ _(ArgumentSpec) \ From 4ab41103e8fea96ba4d2982eeaf0666011c4699a Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Fri, 21 Aug 2020 12:03:14 -0400 Subject: [PATCH 005/167] Fix predicate generation, there was a broken root map. (#311) --- torch/csrc/jit/codegen/cuda/lower_unroll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/jit/codegen/cuda/lower_unroll.h b/torch/csrc/jit/codegen/cuda/lower_unroll.h index 238f4de30f603..8898637925869 100644 --- a/torch/csrc/jit/codegen/cuda/lower_unroll.h +++ b/torch/csrc/jit/codegen/cuda/lower_unroll.h @@ -92,7 +92,7 @@ class TORCH_CUDA_API UnrollPass : public OptOutDispatch { : fusion_(_fusion), incoming_exprs_(_incoming_exprs), thread_predicates_(_thread_predicates) { - auto p2c_root_map = loop_utils::p2cRootMap(_fusion->exprs(true)); + p2c_root_map = loop_utils::p2cRootMap(_fusion->exprs(true)); } // Generate the for Expr replacement map From ce9ac6e69b9b40c9f083d4b740399c9d6f459099 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Fri, 21 Aug 2020 10:41:05 -0700 Subject: [PATCH 006/167] Reorder expressions in a breadth-first order (#312) --- torch/csrc/jit/codegen/cuda/lower_loops.cpp | 77 +++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/torch/csrc/jit/codegen/cuda/lower_loops.cpp b/torch/csrc/jit/codegen/cuda/lower_loops.cpp index fd7033e500166..59e10656dece3 100644 --- a/torch/csrc/jit/codegen/cuda/lower_loops.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_loops.cpp @@ -1,9 +1,11 @@ #include #include #include +#include #include #include +#include #include namespace torch { @@ -466,6 +468,79 @@ void sortGroup(TensorView* target, ExprListT& exprs, ExprScoreMapT& scores) { }); } +// Reorder expressions that are computed at the same position in a +// breadth-first order. +void reorderSegmentBreadthFirst( + ExprListT::iterator seg_begin, + ExprListT::const_iterator seg_end) { + // mapping of each expression to a bool flag indicating if it's + // already been visited + std::unordered_map expr_status; + for (auto it = seg_begin; it != seg_end; ++it) { + expr_status.insert({*it, false}); + } + + while (seg_begin != seg_end) { + std::vector visited_exprs; + for (auto it = seg_begin; it != seg_end; ++it) { + const auto expr = *it; + const auto& expr_inputs = + ir_utils::filterByType(expr->inputs()); + // expr can be visited if all input expressions are already + // visited. 
If an input expression is not found in expr_status, + // that should be safe to ignore. + const bool ready_to_visit = std::all_of( + expr_inputs.begin(), + expr_inputs.end(), + [&expr_status](const TensorView* input) { + const Expr* input_origin = input->getOrigin(); + return input_origin == nullptr || + expr_status.find(input_origin) == expr_status.end() || + expr_status.at(input_origin); + }); + if (ready_to_visit) { + std::iter_swap(seg_begin, it); + TORCH_INTERNAL_ASSERT(*seg_begin == expr); + ++seg_begin; + visited_exprs.push_back(expr); + } + } + for (const auto& visited_expr : visited_exprs) { + expr_status.at(visited_expr) = true; + } + } +} + +// Reorder expressions in a group in a breadth-first order. Reordering +// is done within a subset of expressions that have the same score +// (i.e., computeAt position). For each subset, +// reorderSegmentBreadthFirst is called. +void reorderGroupBreadthFirst(ExprListT& exprs, const ExprScoreMapT& scores) { + auto seg_begin = exprs.begin(); + auto seg_end = exprs.begin(); + ScoreT seg_score = scores.at(*seg_begin); + while (seg_end != exprs.end()) { + const auto expr = *seg_end; + const auto cur_score = scores.at(expr); + if (seg_score == cur_score) { + // advance further + ++seg_end; + continue; + } else if (seg_score < cur_score) { + // segment ended + reorderSegmentBreadthFirst(seg_begin, seg_end); + seg_begin = seg_end; + seg_score = cur_score; + } else { + // expre list is assumed to be sorted in the order of scores, so + // this should never be reachable + TORCH_INTERNAL_ASSERT( + false, "Unexpected expression: ", expr, ", score: ", cur_score); + } + } + reorderSegmentBreadthFirst(seg_begin, seg_end); +} + void mergeNonRootGroupsIntoRootGroups( TargetGroupMapT& computed_at_exprs, ExprTargetMapT& target_map) { @@ -549,6 +624,8 @@ void reorderExprsForComputeAt(std::vector& exprs) { // 2. Sort each loop-nest group based on axis (i.e., score) for (auto& group : computed_at_exprs) { sortGroup(group.first, group.second, scores); + // Reorder expressions in a breadth-first order + reorderGroupBreadthFirst(group.second, scores); } // 3. Merge non-root loop-nests into root loop-nests From 9766713c1a9bdd6ac4c3d021266d4e884b491e08 Mon Sep 17 00:00:00 2001 From: jjsjann123 Date: Fri, 21 Aug 2020 10:51:26 -0700 Subject: [PATCH 007/167] Runtime overhead reduction pr (#309) removing graph copy from critical code path; cache hasReduction result --- torch/csrc/jit/codegen/cuda/kernel_cache.cpp | 7 ++- torch/csrc/jit/codegen/cuda/kernel_cache.h | 10 ++++ torch/csrc/jit/codegen/cuda/manager.cpp | 49 ++++++++++---------- 3 files changed, 39 insertions(+), 27 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp index 6277a8103c797..d7e62cb386b61 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp @@ -188,13 +188,16 @@ at::DimVector inversePermutation( FusionExecutorCache::FusionExecutorCache( std::unique_ptr&& fusion, at::Device device) - : device_(device), fusion_(std::move(fusion)) {} + : device_(device), fusion_(std::move(fusion)) { + // avoid putting `has_reduction_` in the initializer list + has_reduction_ = fusion_->hasReduction(); +} // TODO: dummy cache std::vector FusionExecutorCache::runFusionWithInputs( const at::ArrayRef& inputs) { // caching strategy is different for pw-fusion and reduction-fusion. 
- if (fusion_->hasReduction()) { + if (has_reduction_) { // copy the fusion, since each FusionExecutor needs to manipulate the fusion // in order to generate kernel. Fusion fusion = *fusion_; diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.h b/torch/csrc/jit/codegen/cuda/kernel_cache.h index 1b8233846dda0..a59fbc38f1bfa 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.h +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.h @@ -75,6 +75,16 @@ class FusionExecutorCache { // original un-scheduled `Fusion`; std::unique_ptr fusion_; + // I'm trading the const model in favor of assigning `has_reduction_` in the + // body of constructor, instead of the initializer list; + // Because of the move statement used in the constructor, it's tricky to + // maintain the code if we have `has_reduction_` as a const member and + // initizlize it in the initializer list, where the order of initialization + // is controled by the order of declaration instead of their order in the list + // + // cache fusion->hasReduction() because it's expensive; + bool has_reduction_; + // TODO: ugly logic for now. We should integrate the hashing of cache for // different kernels. (alternatively we could do so in scheduler). // ugly bits now: diff --git a/torch/csrc/jit/codegen/cuda/manager.cpp b/torch/csrc/jit/codegen/cuda/manager.cpp index 076803dce2fa5..f9f7b7655806f 100644 --- a/torch/csrc/jit/codegen/cuda/manager.cpp +++ b/torch/csrc/jit/codegen/cuda/manager.cpp @@ -99,7 +99,6 @@ class CudaFusionManager { std::vector runFusionNode( int32_t kernel_id, - std::shared_ptr& graph, const at::ArrayRef inputs) { std::lock_guard guard(mutex_); return graph_cache_[kernel_id]->runGraphWithInputs(inputs); @@ -222,9 +221,24 @@ void compileCudaFusionGroup(Node* fusion_node) { if (fusion_node->hasAttribute(attr::cache_id)) { TORCH_WARN("Double registration of CudaFusionGroup on CudaFusionManager"); } + // This is not a critical code path, it's OK to do graph copy here; + auto graph = fusion_node->g(attr::Subgraph)->copy(); + + if (!IsNewExecutorEnabled()) { + // TODO: this doesn't cover the case where input types are missing. If we do + // the graph construction at run-time, it's expensive to copy graph + // at critical path. We take the trade-off here as profiling executor + // is the future; + // + // Type propagation that's here just to cover corner case, incase type + // propagation failed in the original subgraph. We currently need output + // types in order to support fp16, where we cast input to fp32 and output + // back to fp16. 
+ TypePropagate(graph); + } + int32_t fusion_cache_id = - CudaFusionManager::getManager().registerOrGetCacheId( - fusion_node->g(attr::Subgraph)); + CudaFusionManager::getManager().registerOrGetCacheId(graph); fusion_node->i_(attr::cache_id, fusion_cache_id); } @@ -240,31 +254,14 @@ void runCudaFusionGroup(const Node* fusion_node, Stack& stack) { int32_t kernel_id = fusion_node->i(attr::cache_id); // Currently we just construct I/O tensors for static graph; - std::shared_ptr graph = fusion_node->g(attr::Subgraph)->copy(); + + const auto nInputs = fusion_node->g(attr::Subgraph)->inputs().size(); auto execute_lambda = [&]() { - const auto nInputs = graph->inputs().size(); at::ArrayRef inputs = last(stack, nInputs); - // TODO: we would/could want an extra layer of graph cache in order to - // handle varying contiguity/broadcast; - // Only needed if we are doing codegen - // if no shape information available, we feed current shape into the kernel; - // This is needed because our current broadcast on size-1 dimension - if (!IsNewExecutorEnabled()) { - EraseShapeInformation(graph); - for (size_t i = 0; i < nInputs; i++) { - graph->inputs()[i]->setType(inputs[i].type()); - } - // Type propagation that's here just to cover corner case, incase type - // propagation failed in the original subgraph. We currently need output - // types in order to support fp16, where we cast input to fp32 and output - // back to fp16. - TypePropagate(graph); - } - auto outputs = - CudaFusionManager::getManager().runFusionNode(kernel_id, graph, inputs); + CudaFusionManager::getManager().runFusionNode(kernel_id, inputs); drop(stack, inputs.size()); stack.insert( @@ -286,8 +283,10 @@ void runCudaFusionGroup(const Node* fusion_node, Stack& stack) { "Failed for some reason. To debug try disable codegen fallback path" "via setting the env variable" "`export PYTORCH_CUDA_FUSER_DISABLE_FALLBACK=1`"); - EraseShapeInformation(graph); - InterpreterState{Code(graph, "fallback_cuda_fuser")}.run(stack); + // copying graph here since we are eliminating shape information; + auto copied_graph = fusion_node->g(attr::Subgraph)->copy(); + EraseShapeInformation(copied_graph); + InterpreterState{Code(copied_graph, "fallback_cuda_fuser")}.run(stack); } } } From 1bf4028d9ed838614be8bcb4e3f8ea9048772cfd Mon Sep 17 00:00:00 2001 From: Lemo Date: Fri, 21 Aug 2020 11:31:02 -0700 Subject: [PATCH 008/167] Split the origin (def) links between Fusion IR and Kernel IR --- torch/csrc/jit/codegen/cuda/fusion.cpp | 32 +++++++------------ torch/csrc/jit/codegen/cuda/fusion.h | 8 ++--- torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp | 20 ++++++++++++ torch/csrc/jit/codegen/cuda/kernel_cache.cpp | 4 +-- 4 files changed, 37 insertions(+), 27 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/fusion.cpp b/torch/csrc/jit/codegen/cuda/fusion.cpp index 2f6f06c6359cc..6842464f31265 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.cpp +++ b/torch/csrc/jit/codegen/cuda/fusion.cpp @@ -335,9 +335,6 @@ void Fusion::assertInFusion(const Statement* stmt, const std::string& msg) if (inFusion(stmt)) { return; } - if (inKernelIr(stmt)) { - return; - } TORCH_CHECK(false, msg, " it was not found in the active fusion."); } @@ -478,13 +475,11 @@ StmtNameType Fusion::registerLoweredExpr(Expr* expr) { for (Val* input : expr->inputs()) { TORCH_CHECK(inKernelIr(input)); - assertInFusion(input); } for (Val* output : expr->outputs()) { TORCH_CHECK(inKernelIr(output)); - assertInFusion(output); - TORCH_CHECK(origin_.insert({output, expr}).second); + 
TORCH_CHECK(lowered_origin_.insert({output, expr}).second); } lowered_expr_set_.insert(expr); @@ -518,20 +513,17 @@ std::unordered_set Fusion::unordered_uses(Val* val) const { return std::unordered_set(); } -Expr* Fusion::origin(Val* val) const { - assertInFusion(val, "Cannot detect the origin of val, "); - auto it = origin_.find(val); - if (it == origin_.end()) - return nullptr; - return it->second; -} - -const Expr* Fusion::origin(const Val* val) const { - assertInFusion(val, "Cannot dettect the origin of val, "); - auto it = origin_.find(const_cast(val)); // NOLINT - if (it == origin_.end()) - return nullptr; - return it->second; +Expr* Fusion::origin(const Val* val) const { + // TODO(kir): remove the lowered branch + if (kir::isLoweredVal(val)) { + TORCH_INTERNAL_ASSERT(inKernelIr(val)); + auto it = lowered_origin_.find(val); + return it != lowered_origin_.end() ? it->second : nullptr; + } else { + assertInFusion(val, "Cannot detect the origin of val, "); + auto it = origin_.find(val); + return it != origin_.end() ? it->second : nullptr; + } } bool Fusion::hasInput(const Val* val) const { diff --git a/torch/csrc/jit/codegen/cuda/fusion.h b/torch/csrc/jit/codegen/cuda/fusion.h index 4d0d50b78dc91..1bf844119980b 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.h +++ b/torch/csrc/jit/codegen/cuda/fusion.h @@ -198,10 +198,7 @@ class TORCH_CUDA_API Fusion final { std::unordered_set unordered_uses(Val* val) const; // Return the Expr that produces val - Expr* origin(Val* val) const; - - // Return the Expr that produces val (const version) - const Expr* origin(const Val* val) const; + Expr* origin(const Val* val) const; // Indicate to kernel to set itself up to generate random numbers bool hasRNG(); @@ -247,7 +244,7 @@ class TORCH_CUDA_API Fusion final { StmtNameType expr_name_counter_ = 0; // Dependency tracking for Vals. Where did it come from? Where is it used? 
- std::unordered_map origin_; + std::unordered_map origin_; std::unordered_map> uses_; // Fusion inputs and outputs @@ -257,6 +254,7 @@ class TORCH_CUDA_API Fusion final { // Lowered IR std::unordered_set lowered_val_set_; std::unordered_set lowered_expr_set_; + std::unordered_map lowered_origin_; }; } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp b/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp index f743db767d9aa..67c337afa1963 100644 --- a/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp @@ -121,6 +121,26 @@ class ConstCheck : OptOutConstDispatch { is_const_ = is_const_ && false; } + void handle(const kir::Bool* b) override { + is_const_ = is_const_ && b->isConst(); + } + + void handle(const kir::Float* f) override { + is_const_ = is_const_ && f->isConst(); + } + + void handle(const kir::Half* h) override { + is_const_ = is_const_ && h->isConst(); + } + + void handle(const kir::Int* i) override { + is_const_ = is_const_ && i->isConst(); + } + + void handle(const kir::NamedScalar* ns) override { + is_const_ = is_const_ && false; + } + void handle(const Expr* expr) override { for (auto inp : expr->inputs()) { handle(inp); diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp index 6277a8103c797..6bac6fda9c31d 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp @@ -157,9 +157,9 @@ at::DimVector inversePermutation( for (const auto& dim : permuted) { int adjusted_offset = 0; for (const auto& red_dim : reduction_axes) { - if (red_dim < (const unsigned long)dim) { + if (red_dim < (unsigned long)dim) { adjusted_offset++; // 1.b - } else if (red_dim == (const unsigned long)dim) { + } else if (red_dim == (unsigned long)dim) { adjusted_offset = -1; // 1.a break; } From 907782b1f9a2694867295002e6c1bdb8fd09aee7 Mon Sep 17 00:00:00 2001 From: Leonard Mosescu Date: Fri, 21 Aug 2020 13:41:47 -0700 Subject: [PATCH 009/167] Kernel IR refactoring: part 6 (#314) Splits the origin (definition) links between Fusion IR and Kernel IR. This will allow moving the nodes into different containers (as well as cleaning up parts which are not really needed for the Kernel IR, ex. 
cloning) Also fixing isConstScalar() and a couple of build warnings in kernel_cache.cpp --- torch/csrc/jit/codegen/cuda/fusion.cpp | 32 +++++++------------ torch/csrc/jit/codegen/cuda/fusion.h | 10 +++--- torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp | 20 ++++++++++++ torch/csrc/jit/codegen/cuda/kernel_cache.cpp | 4 +-- 4 files changed, 39 insertions(+), 27 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/fusion.cpp b/torch/csrc/jit/codegen/cuda/fusion.cpp index 2f6f06c6359cc..6842464f31265 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.cpp +++ b/torch/csrc/jit/codegen/cuda/fusion.cpp @@ -335,9 +335,6 @@ void Fusion::assertInFusion(const Statement* stmt, const std::string& msg) if (inFusion(stmt)) { return; } - if (inKernelIr(stmt)) { - return; - } TORCH_CHECK(false, msg, " it was not found in the active fusion."); } @@ -478,13 +475,11 @@ StmtNameType Fusion::registerLoweredExpr(Expr* expr) { for (Val* input : expr->inputs()) { TORCH_CHECK(inKernelIr(input)); - assertInFusion(input); } for (Val* output : expr->outputs()) { TORCH_CHECK(inKernelIr(output)); - assertInFusion(output); - TORCH_CHECK(origin_.insert({output, expr}).second); + TORCH_CHECK(lowered_origin_.insert({output, expr}).second); } lowered_expr_set_.insert(expr); @@ -518,20 +513,17 @@ std::unordered_set Fusion::unordered_uses(Val* val) const { return std::unordered_set(); } -Expr* Fusion::origin(Val* val) const { - assertInFusion(val, "Cannot detect the origin of val, "); - auto it = origin_.find(val); - if (it == origin_.end()) - return nullptr; - return it->second; -} - -const Expr* Fusion::origin(const Val* val) const { - assertInFusion(val, "Cannot dettect the origin of val, "); - auto it = origin_.find(const_cast(val)); // NOLINT - if (it == origin_.end()) - return nullptr; - return it->second; +Expr* Fusion::origin(const Val* val) const { + // TODO(kir): remove the lowered branch + if (kir::isLoweredVal(val)) { + TORCH_INTERNAL_ASSERT(inKernelIr(val)); + auto it = lowered_origin_.find(val); + return it != lowered_origin_.end() ? it->second : nullptr; + } else { + assertInFusion(val, "Cannot detect the origin of val, "); + auto it = origin_.find(val); + return it != origin_.end() ? it->second : nullptr; + } } bool Fusion::hasInput(const Val* val) const { diff --git a/torch/csrc/jit/codegen/cuda/fusion.h b/torch/csrc/jit/codegen/cuda/fusion.h index 4d0d50b78dc91..e1ee80e369baa 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.h +++ b/torch/csrc/jit/codegen/cuda/fusion.h @@ -162,8 +162,10 @@ class TORCH_CUDA_API Fusion final { // Print transformations used in fusion (can be very verbose) void printTransforms(); + // Lower the fusion and print a kernel void printKernel(); + // Register the Val with this fusion StmtNameType registerVal(Val* val); @@ -198,10 +200,7 @@ class TORCH_CUDA_API Fusion final { std::unordered_set unordered_uses(Val* val) const; // Return the Expr that produces val - Expr* origin(Val* val) const; - - // Return the Expr that produces val (const version) - const Expr* origin(const Val* val) const; + Expr* origin(const Val* val) const; // Indicate to kernel to set itself up to generate random numbers bool hasRNG(); @@ -247,7 +246,7 @@ class TORCH_CUDA_API Fusion final { StmtNameType expr_name_counter_ = 0; // Dependency tracking for Vals. Where did it come from? Where is it used? 
- std::unordered_map origin_; + std::unordered_map origin_; std::unordered_map> uses_; // Fusion inputs and outputs @@ -257,6 +256,7 @@ class TORCH_CUDA_API Fusion final { // Lowered IR std::unordered_set lowered_val_set_; std::unordered_set lowered_expr_set_; + std::unordered_map lowered_origin_; }; } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp b/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp index f743db767d9aa..67c337afa1963 100644 --- a/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp @@ -121,6 +121,26 @@ class ConstCheck : OptOutConstDispatch { is_const_ = is_const_ && false; } + void handle(const kir::Bool* b) override { + is_const_ = is_const_ && b->isConst(); + } + + void handle(const kir::Float* f) override { + is_const_ = is_const_ && f->isConst(); + } + + void handle(const kir::Half* h) override { + is_const_ = is_const_ && h->isConst(); + } + + void handle(const kir::Int* i) override { + is_const_ = is_const_ && i->isConst(); + } + + void handle(const kir::NamedScalar* ns) override { + is_const_ = is_const_ && false; + } + void handle(const Expr* expr) override { for (auto inp : expr->inputs()) { handle(inp); diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp index d7e62cb386b61..6b370b57b1470 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp @@ -157,9 +157,9 @@ at::DimVector inversePermutation( for (const auto& dim : permuted) { int adjusted_offset = 0; for (const auto& red_dim : reduction_axes) { - if (red_dim < (const unsigned long)dim) { + if (red_dim < (unsigned long)dim) { adjusted_offset++; // 1.b - } else if (red_dim == (const unsigned long)dim) { + } else if (red_dim == (unsigned long)dim) { adjusted_offset = -1; // 1.a break; } From 3cc7ab748954dd93c2b0cec2a8cf241e0f378375 Mon Sep 17 00:00:00 2001 From: jjsjann123 Date: Fri, 21 Aug 2020 15:44:03 -0700 Subject: [PATCH 010/167] Debug env disable fma (#315) Fixes #305 sys env to disabling fma and specify optimization level for jit compilation --- aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h | 1 + .../csrc/jit/codegen/cuda/executor_utils.cpp | 43 ++++++++++++++++--- 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h index 4630465115c7c..00e57ca635203 100644 --- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h @@ -42,6 +42,7 @@ namespace at { namespace cuda { _(nvrtcGetProgramLog) \ _(nvrtcGetLoweredName) \ _(cuModuleLoadData) \ + _(cuModuleLoadDataEx) \ _(cuModuleGetFunction) \ _(cuOccupancyMaxActiveBlocksPerMultiprocessor) \ _(cuGetErrorString) \ diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp index c32538070a609..228a7723e1c4a 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp @@ -267,9 +267,36 @@ NvrtcFunction nvrtcCompile( const std::string compute = "--gpu-architecture=compute_" + std::to_string(major) + std::to_string(minor); - const std::vector args = { + std::vector args = { "--std=c++14", compute.c_str(), "-default-device"}; + const char* disable_fma = getenv("PYTORCH_CUDA_FUSER_DISABLE_FMA"); + // int disable_fma_flag = disable_fma ? 
atoi(disable_fma) : 0; + if (disable_fma && atoi(disable_fma)) { + printf("disabling fmad\n"); + args.push_back("--fmad=false"); + } + + const char* ptxas_opt_level = getenv("PYTORCH_CUDA_FUSER_JIT_OPT_LEVEL"); + uint32_t jit_opt_level; + + std::vector options; + std::vector option_vals; + + if (ptxas_opt_level) { + int val = atoi(ptxas_opt_level); + if (val <= 4 && val >= 0) { + jit_opt_level = static_cast(val); + options.push_back(CU_JIT_OPTIMIZATION_LEVEL); + option_vals.emplace_back(&jit_opt_level); + } else { + TORCH_WARN_ONCE( + "acceptable range for PYTORCH_CUDA_FUSER_JIT_OPT_LEVEL is between 0 and 4, but received ", + jit_opt_level, + ", ignoring the option"); + } + } + at::globalContext().getNVRTC().nvrtcAddNameExpression( program, func_name.c_str()); const auto result = at::globalContext().getNVRTC().nvrtcCompileProgram( @@ -323,9 +350,9 @@ NvrtcFunction nvrtcCompile( ptx.data(), ptx_size, "compiling PTX", - 0, - nullptr, - nullptr)); + options.size(), + options.data(), + option_vals.data())); size_t cubinSize; void* cubin; @@ -348,8 +375,12 @@ NvrtcFunction nvrtcCompile( &(compiled_kernel_.module), cubin)); } else { // load ptx directly - AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuModuleLoadData( - &(compiled_kernel_.module), ptx.data())); + AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuModuleLoadDataEx( + &(compiled_kernel_.module), + ptx.data(), + options.size(), + options.data(), + option_vals.data())); } AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuModuleGetFunction( &(compiled_kernel_.function), From e40aacaa7414d14f2e3baaff375ee907bbeb9a6f Mon Sep 17 00:00:00 2001 From: Leonard Mosescu Date: Fri, 21 Aug 2020 15:49:42 -0700 Subject: [PATCH 011/167] Kernel IR refactoring: part 6.1 (#316) Removing support for cloning Kernel IR nodes, which is not needed today. 
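With the cloning constructors gone, copying a Fusion clones only the unlowered Fusion IR: the lowered (Kernel IR) sets and the lowered_origin_ map start out empty in the copy and are rebuilt when the copy is lowered. A sketch of the consumer of that behavior, simplified from the FusionExecutorCache path in kernel_cache.cpp shown earlier in this series (scheduling details elided; `inputs` stands in for the runtime tensor arguments):

    // The cache copies the unscheduled fusion, schedules the copy, and lets
    // compileFusion() lower it, regenerating all Kernel IR for the copy.
    Fusion fusion = *fusion_;    // deep copy of the Fusion IR only
    // ... schedule `fusion` for the given inputs ...
    torch::jit::fuser::cuda::FusionExecutor fe;
    fe.compileFusion(&fusion);   // lowering builds fresh Kernel IR here
    auto outputs = fe.runFusion(inputs);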
--- torch/csrc/jit/codegen/cuda/fusion.cpp | 11 +-- torch/csrc/jit/codegen/cuda/ir_cloner.cpp | 76 ---------------- torch/csrc/jit/codegen/cuda/ir_cloner.h | 23 ----- torch/csrc/jit/codegen/cuda/kernel_ir.cpp | 101 +--------------------- torch/csrc/jit/codegen/cuda/kernel_ir.h | 43 --------- 5 files changed, 5 insertions(+), 249 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/fusion.cpp b/torch/csrc/jit/codegen/cuda/fusion.cpp index 6842464f31265..82bf7847d59b5 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.cpp +++ b/torch/csrc/jit/codegen/cuda/fusion.cpp @@ -90,6 +90,7 @@ void swap(Fusion& a, Fusion& b) noexcept { // Lowered IR nodes swap(a.lowered_val_set_, b.lowered_val_set_); swap(a.lowered_expr_set_, b.lowered_expr_set_); + swap(a.lowered_origin_, b.lowered_origin_); for (auto val : a.lowered_val_set_) { val->fusion_ = &a; @@ -140,15 +141,6 @@ Fusion::Fusion(const Fusion& other) { inputs_ = ir_cloner.clone(other.inputs_); outputs_ = ir_cloner.clone(other.outputs_); - - // Lowered nodes - for (auto val : other.lowered_val_set_) { - lowered_val_set_.insert(ir_cloner.clone(val)); - } - - for (auto expr : other.lowered_expr_set_) { - lowered_expr_set_.insert(ir_cloner.clone(expr)); - } } Fusion::Fusion(Fusion&& other) noexcept { @@ -208,6 +200,7 @@ void Fusion::clear() noexcept { } lowered_val_set_.clear(); lowered_expr_set_.clear(); + lowered_origin_.clear(); } void Fusion::removeExpr(Expr* expr) { diff --git a/torch/csrc/jit/codegen/cuda/ir_cloner.cpp b/torch/csrc/jit/codegen/cuda/ir_cloner.cpp index ad85dc4642abc..17efc3e692e7a 100644 --- a/torch/csrc/jit/codegen/cuda/ir_cloner.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_cloner.cpp @@ -114,82 +114,6 @@ void IrCloner::handle(const Merge* merge) { clone_ = new Merge(merge, this); } -void IrCloner::handle(const kir::Bool* node) { - clone_ = new kir::Bool(node, this); -} - -void IrCloner::handle(const kir::Float* node) { - clone_ = new kir::Float(node, this); -} - -void IrCloner::handle(const kir::Half* node) { - clone_ = new kir::Half(node, this); -} - -void IrCloner::handle(const kir::Int* node) { - clone_ = new kir::Int(node, this); -} - -void IrCloner::handle(const kir::NamedScalar* node) { - clone_ = new kir::NamedScalar(node, this); -} - -void IrCloner::handle(const kir::IterDomain* node) { - clone_ = new kir::IterDomain(node, this); -} - -void IrCloner::handle(const kir::TensorDomain* node) { - clone_ = new kir::TensorDomain(node, this); -} - -void IrCloner::handle(const kir::TensorView* node) { - clone_ = new kir::TensorView(node, this); -} - -void IrCloner::handle(const kir::UnaryOp* node) { - clone_ = new kir::UnaryOp(node, this); -} - -void IrCloner::handle(const kir::BinaryOp* node) { - clone_ = new kir::BinaryOp(node, this); -} - -void IrCloner::handle(const kir::TernaryOp* node) { - clone_ = new kir::TernaryOp(node, this); -} - -void IrCloner::handle(const kir::ReductionOp* node) { - clone_ = new kir::ReductionOp(node, this); -} - -void IrCloner::handle(const kir::BroadcastOp* node) { - clone_ = new kir::BroadcastOp(node, this); -} - -void IrCloner::handle(const kir::TensorIndex* node) { - clone_ = new kir::TensorIndex(node, this); -} - -void IrCloner::handle(const kir::Allocate* node) { - clone_ = new kir::Allocate(node, this); -} - -void IrCloner::handle(const kir::Sync* node) { - clone_ = new kir::Sync(node, this); -} - -void IrCloner::handle(const kir::ForLoop* node) { - clone_ = new kir::ForLoop(node, this); -} - -void IrCloner::handle(const kir::IfThenElse* node) { - clone_ = new kir::IfThenElse(node, this); -} - 
-void IrCloner::handle(const kir::GridReduction* node) { - clone_ = new kir::GridReduction(node, this); -} - } // namespace fuser } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/ir_cloner.h b/torch/csrc/jit/codegen/cuda/ir_cloner.h index 25b101d612c88..39435aab4e657 100644 --- a/torch/csrc/jit/codegen/cuda/ir_cloner.h +++ b/torch/csrc/jit/codegen/cuda/ir_cloner.h @@ -67,29 +67,6 @@ class TORCH_CUDA_API IrCloner : private OptInConstDispatch { void handle(const Split*) override; void handle(const Merge*) override; - void handle(const kir::Bool*) override; - void handle(const kir::Float*) override; - void handle(const kir::Half*) override; - void handle(const kir::Int*) override; - void handle(const kir::NamedScalar*) override; - - void handle(const kir::IterDomain*) override; - void handle(const kir::TensorDomain*) override; - void handle(const kir::TensorView*) override; - - void handle(const kir::UnaryOp*) override; - void handle(const kir::BinaryOp*) override; - void handle(const kir::TernaryOp*) override; - void handle(const kir::ReductionOp*) override; - void handle(const kir::BroadcastOp*) override; - - void handle(const kir::TensorIndex*) override; - void handle(const kir::Allocate*) override; - void handle(const kir::Sync*) override; - void handle(const kir::ForLoop*) override; - void handle(const kir::IfThenElse*) override; - void handle(const kir::GridReduction*) override; - private: // The destination Fusion container Fusion* fusion_ = nullptr; diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir.cpp b/torch/csrc/jit/codegen/cuda/kernel_ir.cpp index c7c6d0ec39f0d..4da8dba26dd88 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_ir.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_ir.cpp @@ -4,9 +4,6 @@ #include #include -// TODO(kir): remove -#include - namespace torch { namespace jit { namespace fuser { @@ -69,14 +66,6 @@ IterDomain::IterDomain(const fuser::IterDomain* iter_domain) iter_type_(iter_domain->getIterType()), is_rfactor_domain_(iter_domain->isRFactorProduct()) {} -IterDomain::IterDomain(const IterDomain* src, IrCloner* ir_cloner) - : Val(src, ir_cloner), - start_(ir_cloner->clone(src->start_)), - extent_(ir_cloner->clone(src->extent_)), - parallel_type_(src->parallel_type_), - iter_type_(src->iter_type_), - is_rfactor_domain_(src->is_rfactor_domain_) {} - Val* IterDomain::extent() const { TORCH_CHECK(isLoweredVal(extent_)); if (isThread()) { @@ -115,15 +104,6 @@ TensorDomain::TensorDomain(const fuser::TensorDomain* tensor_domain) rfactor_domain_ = lowerIterDomains(tensor_domain->getRFactorDomain()); } -TensorDomain::TensorDomain(const TensorDomain* src, IrCloner* ir_cloner) - : Val(src, ir_cloner), - root_domain_(ir_cloner->clone(src->root_domain_)), - domain_(ir_cloner->clone(src->domain_)), - no_bcast_domain_(ir_cloner->clone(src->no_bcast_domain_)), - no_reduction_domain_(ir_cloner->clone(src->no_reduction_domain_)), - rfactor_domain_(ir_cloner->clone(src->rfactor_domain_)), - contiguity_(src->contiguity()) {} - bool TensorDomain::hasReduction() const { return no_reduction_domain_.size() != domain_.size(); } @@ -180,12 +160,6 @@ TensorView::TensorView(const fuser::TensorView* tv) : Val(tv), fuser_tv_(tv) { memory_type_ = tv->getMemoryType(); } -TensorView::TensorView(const TensorView* src, IrCloner* ir_cloner) - : Val(src, ir_cloner), - domain_(ir_cloner->clone(src->domain_)), - memory_type_(src->memory_type_), - fuser_tv_(src->fuser_tv_) {} - UnaryOp::UnaryOp(UnaryOpType type, Val* out, Val* in) : Expr(ExprType::KirUnaryOp), 
unary_op_type_{type}, out_{out}, in_{in} { addOutput(out); @@ -193,12 +167,6 @@ UnaryOp::UnaryOp(UnaryOpType type, Val* out, Val* in) name_ = FusionGuard::getCurFusion()->registerLoweredExpr(this); } -UnaryOp::UnaryOp(const UnaryOp* src, IrCloner* ir_cloner) - : Expr(src, ir_cloner), - unary_op_type_(src->unary_op_type_), - out_(ir_cloner->clone(src->out_)), - in_(ir_cloner->clone(src->in_)) {} - BinaryOp::BinaryOp(BinaryOpType type, Val* out, Val* lhs, Val* rhs) : Expr(ExprType::KirBinaryOp), binary_op_type_{type}, @@ -211,13 +179,6 @@ BinaryOp::BinaryOp(BinaryOpType type, Val* out, Val* lhs, Val* rhs) name_ = FusionGuard::getCurFusion()->registerLoweredExpr(this); } -BinaryOp::BinaryOp(const BinaryOp* src, IrCloner* ir_cloner) - : Expr(src, ir_cloner), - binary_op_type_(src->binary_op_type_), - out_(ir_cloner->clone(src->out_)), - lhs_(ir_cloner->clone(src->lhs_)), - rhs_(ir_cloner->clone(src->rhs_)) {} - TernaryOp::TernaryOp(TernaryOpType type, Val* out, Val* in1, Val* in2, Val* in3) : Expr(ExprType::KirTernaryOp), ternary_op_type_{type}, @@ -232,14 +193,6 @@ TernaryOp::TernaryOp(TernaryOpType type, Val* out, Val* in1, Val* in2, Val* in3) name_ = FusionGuard::getCurFusion()->registerLoweredExpr(this); } -TernaryOp::TernaryOp(const TernaryOp* src, IrCloner* ir_cloner) - : Expr(src, ir_cloner), - ternary_op_type_(src->ternary_op_type_), - out_(ir_cloner->clone(src->out_)), - in1_(ir_cloner->clone(src->in1_)), - in2_(ir_cloner->clone(src->in2_)), - in3_(ir_cloner->clone(src->in3_)) {} - ReductionOp::ReductionOp( BinaryOpType reduction_op_type, Val* init, @@ -255,13 +208,6 @@ ReductionOp::ReductionOp( name_ = FusionGuard::getCurFusion()->registerLoweredExpr(this); } -ReductionOp::ReductionOp(const ReductionOp* src, IrCloner* ir_cloner) - : Expr(src, ir_cloner), - reduction_op_type_(src->reduction_op_type_), - init_(ir_cloner->clone(src->init_)), - out_(ir_cloner->clone(src->out_)), - in_(ir_cloner->clone(src->in_)) {} - std::vector ReductionOp::getReductionDomains() const { // out is a TensorIndex after lowering const auto out_val = out()->as()->view(); @@ -297,11 +243,6 @@ BroadcastOp::BroadcastOp(Val* out, Val* in) name_ = FusionGuard::getCurFusion()->registerLoweredExpr(this); } -BroadcastOp::BroadcastOp(const BroadcastOp* src, IrCloner* ir_cloner) - : Expr(src, ir_cloner), - out_(ir_cloner->clone(src->out_)), - in_(ir_cloner->clone(src->in_)) {} - TensorIndex::TensorIndex( const fuser::TensorView* view, std::vector indices) @@ -320,13 +261,9 @@ TensorIndex::TensorIndex( "Cannot index with a value other than an int."); } -TensorIndex::TensorIndex(const TensorIndex* src, IrCloner* ir_cloner) - : Val(src, ir_cloner), - view_(ir_cloner->clone(src->view_)), - indices_(ir_cloner->clone(src->indices_)) {} - -Scope::Scope(const Scope* src, IrCloner* ir_cloner) - : exprs_(ir_cloner->clone(src->exprs_)) {} +Sync::Sync() : Expr(ExprType::Sync) { + name_ = FusionGuard::getCurFusion()->registerExpr(this); +} void Scope::insert_before(Expr* ref, Expr* expr) { auto it = exprs_.begin(); @@ -391,13 +328,6 @@ ForLoop::ForLoop( } } -ForLoop::ForLoop(const ForLoop* src, IrCloner* ir_cloner) - : Expr(src, ir_cloner), - index_(ir_cloner->clone(src->index_)), - iter_domain_(ir_cloner->clone(src->iter_domain_)), - body_(&src->body_, ir_cloner), - parent_scope_(ir_cloner->clone(src->parent_scope_)) {} - void ForLoop::setParentScope(Expr* scope) { TORCH_INTERNAL_ASSERT( !scope_utils::exprInScope(parentScope(), this), @@ -420,13 +350,6 @@ IfThenElse::IfThenElse( else_body_.push_back(expr); } 
-IfThenElse::IfThenElse(const IfThenElse* src, IrCloner* ir_cloner) - : Expr(src, ir_cloner), - cond_(src->cond_), - body_(&src->body_, ir_cloner), - else_body_(&src->else_body_, ir_cloner), - parent_scope_(ir_cloner->clone(src->parent_scope_)) {} - void IfThenElse::setParentScope(Expr* scope) { TORCH_INTERNAL_ASSERT( !scope_utils::exprInScope(parentScope(), this), @@ -480,18 +403,6 @@ Allocate::Allocate(Val* buffer, MemoryType memory_type, Val* size) name_ = FusionGuard::getCurFusion()->registerLoweredExpr(this); } -Allocate::Allocate(const Allocate* src, IrCloner* ir_cloner) - : Expr(src, ir_cloner), - buffer_(ir_cloner->clone(src->buffer_)), - memory_type_(src->memory_type_), - size_(ir_cloner->clone(src->size_)) {} - -Sync::Sync() : Expr(ExprType::Sync) { - name_ = FusionGuard::getCurFusion()->registerExpr(this); -} - -Sync::Sync(const Sync* src, IrCloner* ir_cloner) : Expr(src, ir_cloner) {} - GridReduction::GridReduction(ReductionOp* reduction_op) : Expr(ExprType::GridReduction), reduction_op_(reduction_op) { TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); @@ -506,12 +417,6 @@ GridReduction::GridReduction( reduction_buffer_(reduction_buffer), sync_buffer_(sync_buffer) {} -GridReduction::GridReduction(const GridReduction* src, IrCloner* ir_cloner) - : Expr(src, ir_cloner), - reduction_op_(ir_cloner->clone(src->reduction_op_)), - reduction_buffer_(ir_cloner->clone(src->reduction_buffer_)), - sync_buffer_(ir_cloner->clone(src->sync_buffer_)) {} - std::string GridReduction::getPredicateFlagName(const TensorView* val) { std::stringstream ss; ss << "T" << val->name() << "pred"; diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir.h b/torch/csrc/jit/codegen/cuda/kernel_ir.h index ef7c455ef8fbc..67b493fe62455 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_ir.h +++ b/torch/csrc/jit/codegen/cuda/kernel_ir.h @@ -30,9 +30,6 @@ class TORCH_CUDA_API NamedScalar : public Val { explicit NamedScalar(const fuser::NamedScalar* node) : Val(node), name_(node->name()) {} - NamedScalar(const NamedScalar* src, IrCloner* ir_cloner) - : Val(src, ir_cloner), name_(src->name_) {} - const std::string& name() const { return name_; } @@ -64,9 +61,6 @@ class TORCH_CUDA_API Bool : public Val { explicit Bool(const fuser::Bool* node) : Val(node), maybe_value_(node->value()) {} - Bool(const Bool* src, IrCloner* ir_cloner) - : Val(src, ir_cloner), maybe_value_(src->maybe_value_) {} - bool isSymbolic() const { return !(maybe_value_.has_value()); } @@ -92,9 +86,6 @@ class TORCH_CUDA_API Float : public Val { explicit Float(const fuser::Float* node) : Val(node), maybe_value_(node->value()) {} - Float(const Float* src, IrCloner* ir_cloner) - : Val(src, ir_cloner), maybe_value_(src->maybe_value_) {} - bool isSymbolic() const { return !(maybe_value_.has_value()); } @@ -118,9 +109,6 @@ class TORCH_CUDA_API Half : public Val { explicit Half(const fuser::Half* node) : Val(node), maybe_value_(node->value()) {} - Half(const Half* src, IrCloner* ir_cloner) - : Val(src, ir_cloner), maybe_value_(src->maybe_value_) {} - bool isSymbolic() const { return !(maybe_value_.has_value()); } @@ -146,9 +134,6 @@ class TORCH_CUDA_API Int : public Val { explicit Int(const fuser::Int* node, bool /*avoid_zero_ambiguity*/) : Val(node), maybe_value_(node->value()) {} - Int(const Int* src, IrCloner* ir_cloner) - : Val(src, ir_cloner), maybe_value_(src->maybe_value_) {} - bool isSymbolic() const { return !(maybe_value_.has_value()); } @@ -169,8 +154,6 @@ class TORCH_CUDA_API IterDomain : public Val { explicit IterDomain(const fuser::IterDomain* 
iter_domain); - IterDomain(const IterDomain* src, IrCloner* ir_cloner); - bool isReduction() const { return getIterType() == IterType::Reduction; } @@ -237,8 +220,6 @@ class TORCH_CUDA_API TensorDomain : public Val { explicit TensorDomain(const fuser::TensorDomain* tensor_domain); - TensorDomain(const TensorDomain* src, IrCloner* ir_cloner); - std::vector::size_type nDims() const { return domain_.size(); } @@ -304,8 +285,6 @@ class TORCH_CUDA_API TensorView : public Val { public: explicit TensorView(const fuser::TensorView* tv); - TensorView(const TensorView* src, IrCloner* ir_cloner); - TensorDomain* domain() const { return domain_; } @@ -331,8 +310,6 @@ class TORCH_CUDA_API UnaryOp : public Expr { public: UnaryOp(UnaryOpType type, Val* out, Val* in); - UnaryOp(const UnaryOp* src, IrCloner* ir_cloner); - Val* out() const { return out_; } @@ -355,8 +332,6 @@ class TORCH_CUDA_API BinaryOp : public Expr { public: BinaryOp(BinaryOpType type, Val* out, Val* lhs, Val* rhs); - BinaryOp(const BinaryOp* src, IrCloner* ir_cloner); - Val* out() const { return out_; } @@ -384,8 +359,6 @@ class TORCH_CUDA_API TernaryOp : public Expr { public: TernaryOp(TernaryOpType type, Val* out, Val* in1, Val* in2, Val* in3); - TernaryOp(const TernaryOp* src, IrCloner* ir_cloner); - Val* out() const { return out_; } @@ -418,8 +391,6 @@ class TORCH_CUDA_API ReductionOp : public Expr { public: ReductionOp(BinaryOpType reduction_op_type, Val* init, Val* out, Val* in); - ReductionOp(const ReductionOp* src, IrCloner* ir_cloner); - Val* out() const { return out_; } @@ -453,8 +424,6 @@ class TORCH_CUDA_API TensorIndex : public Val { public: TensorIndex(const fuser::TensorView* view, std::vector indices); - TensorIndex(const TensorIndex* src, IrCloner* ir_cloner); - std::vector::size_type nDims() const { return indices_.size(); } @@ -480,8 +449,6 @@ class TORCH_CUDA_API BroadcastOp : public Expr { public: BroadcastOp(Val* out, Val* in); - BroadcastOp(const BroadcastOp* src, IrCloner* ir_cloner); - Val* out() const { return out_; } @@ -509,8 +476,6 @@ class TORCH_CUDA_API Allocate : public Expr { MemoryType memory_type = MemoryType::Local, Val* size = nullptr); - Allocate(const Allocate* src, IrCloner* ir_cloner); - Val* buffer() const { return buffer_; } @@ -537,13 +502,11 @@ class TORCH_CUDA_API Allocate : public Expr { class TORCH_CUDA_API Sync : public Expr { public: Sync(); - Sync(const Sync* src, IrCloner* ir_cloner); }; class TORCH_CUDA_API Scope { public: Scope() = default; - Scope(const Scope* src, IrCloner* ir_cloner); const std::vector& exprs() const { return exprs_; @@ -605,8 +568,6 @@ class TORCH_CUDA_API ForLoop : public Expr { const std::vector& body = {}, Expr* parent_scope = nullptr); - ForLoop(const ForLoop* src, IrCloner* ir_cloner); - Val* index() const { return index_; } @@ -648,8 +609,6 @@ class TORCH_CUDA_API IfThenElse : public Expr { const std::vector& else_body = {}, Expr* parent_scope = nullptr); - IfThenElse(const IfThenElse* src, IrCloner* ir_cloner); - Bool* cond() const { return cond_; } @@ -700,8 +659,6 @@ class TORCH_CUDA_API GridReduction : public Expr { Allocate* reduction_buffer, Allocate* sync_buffer); - GridReduction(const GridReduction* src, IrCloner* ir_cloner); - ReductionOp* reduction_op() const { return reduction_op_; } From ffd7ba3071bac5a52a9a4da4f6df94d3f7b45cca Mon Sep 17 00:00:00 2001 From: Leonard Mosescu Date: Fri, 21 Aug 2020 16:25:17 -0700 Subject: [PATCH 012/167] Fix kir::Sync::Sync() registration (#317) Kernel IR expressions must call Fusion::registerLoweredExpr() 
instead of Fusion::registerExpr() --- torch/csrc/jit/codegen/cuda/kernel_ir.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir.cpp b/torch/csrc/jit/codegen/cuda/kernel_ir.cpp index 4da8dba26dd88..01a099db8ad16 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_ir.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_ir.cpp @@ -262,7 +262,7 @@ TensorIndex::TensorIndex( } Sync::Sync() : Expr(ExprType::Sync) { - name_ = FusionGuard::getCurFusion()->registerExpr(this); + name_ = FusionGuard::getCurFusion()->registerLoweredExpr(this); } void Scope::insert_before(Expr* ref, Expr* expr) { From 6f947249b092455919060c7df97e6b93508107d2 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Fri, 21 Aug 2020 18:14:44 -0700 Subject: [PATCH 013/167] Add an IRPrinter handler for kir::TensorView (#318) * Add an IRPrinter handler for kir::TensorView This is considered a temporary workaround as IRPrinter is meant to be exclusive to the fusion IR. * Add a comment --- torch/csrc/jit/codegen/cuda/ir_iostream.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp index 66aeec2c5bd17..112bcc8827a3c 100644 --- a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp @@ -352,8 +352,10 @@ void IRPrinter::handle(const kir::TensorDomain*) { TORCH_INTERNAL_ASSERT(false, "Unreachable"); } -void IRPrinter::handle(const kir::TensorView*) { - TORCH_INTERNAL_ASSERT(false, "Unreachable"); +void IRPrinter::handle(const kir::TensorView* tv) { + // This should never be reachable, but the current codebase assumes + // kir::TensorView can be printable for debugging messages. + os << "KT" << tv->name(); } static bool isTV(const Val* val) { From 3136899c81780dd5e0f461606db42b1278966da3 Mon Sep 17 00:00:00 2001 From: Ryan Spring Date: Mon, 24 Aug 2020 16:01:25 -0700 Subject: [PATCH 014/167] Dynamic Shared Memory (#304) * Initial Dynamic Shared Memory Check if shared memory usage is within limits for current GPU Gather buffers in a single pass Use single dynamic shared memory for reduction/broadcast workspace Align dynamic shared memory by data type Co-authored-by: Ryan Spring --- test/cpp/jit/test_gpu.cpp | 163 ++++++++++++++++++ test/cpp/jit/tests.h | 3 + torch/csrc/jit/codegen/cuda/executor.cpp | 88 ++++++++-- torch/csrc/jit/codegen/cuda/executor.h | 6 + .../jit/codegen/cuda/executor_kernel_arg.cpp | 2 +- .../jit/codegen/cuda/executor_kernel_arg.h | 8 + .../jit/codegen/cuda/executor_launch_params.h | 5 + .../csrc/jit/codegen/cuda/executor_utils.cpp | 42 +++++ torch/csrc/jit/codegen/cuda/executor_utils.h | 8 + .../csrc/jit/codegen/cuda/expr_evaluator.cpp | 4 +- torch/csrc/jit/codegen/cuda/fusion.cpp | 39 +++++ torch/csrc/jit/codegen/cuda/fusion.h | 2 + torch/csrc/jit/codegen/cuda/index_compute.cpp | 2 +- torch/csrc/jit/codegen/cuda/ir_iostream.cpp | 80 +++++++-- torch/csrc/jit/codegen/cuda/ir_iostream.h | 6 +- torch/csrc/jit/codegen/cuda/kernel_ir.cpp | 3 +- .../codegen/cuda/kernel_resource_strings.h | 8 +- torch/csrc/jit/codegen/cuda/lower2device.cpp | 80 +++++++-- torch/csrc/jit/codegen/cuda/lower2device.h | 23 +++ torch/csrc/jit/codegen/cuda/lower_loops.cpp | 29 +++- torch/csrc/jit/codegen/cuda/lower_loops.h | 4 + torch/csrc/jit/codegen/cuda/scheduler.cpp | 4 - torch/csrc/jit/codegen/cuda/utils.h | 5 + 23 files changed, 548 insertions(+), 66 deletions(-) diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index 
898d12b8ff5c0..da53698983667 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -5388,6 +5388,169 @@ void testGPU_FusionSmemBlockGemmCache() { aten_output.sub(outputs[0]).abs().max()); } +void testGPU_FusionSmemDynamicReductionSymbolic() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Float(0), tv0); + fusion.addInput(tv0); + fusion.addOutput(tv1); + // tv1[I0, R1] = tv0[I0, I1] + + // Interface should just be a direct split with a Parallel type. We can + // include the parallelize call if we do this. + tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1] + + TensorView* tv2 = tv1->rFactor({2}); + tv2->setMemoryType(MemoryType::Shared); + // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1] + // tv1[I0, R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}] + + tv0->computeAt(tv1, 1); + + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv1->axis(0)->parallelize(ParallelType::BIDx); + + constexpr int numel_x = 65000, numel_y = 1024; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::rand({numel_x, numel_y}, options); + + // How many threads to use for the block reduction + constexpr int runtime_threadIdx_dim = 128; + + torch::jit::fuser::cuda::FusionExecutor executor; + executor.compileFusion(&fusion); + auto outputs = executor.runFusion( + {input}, + torch::jit::fuser::cuda::LaunchParams( + -1, -1, -1, runtime_threadIdx_dim, -1, -1)); + + auto aten_output = input.sum({1}); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); +} + +void testGPU_FusionSmemDynamicReductionSymbolicArg() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Algorithm + Int* sym_bsx = new Int(); + TensorView* tv0 = makeDummyTensor(3); // M, K, N + fusion.addInput(tv0); + fusion.addInput(sym_bsx); + + TensorView* tv1 = sum(tv0, {1}); // M, R, N + fusion.addOutput(tv1); + + TensorView* tv2 = tv0->cache_after(); + tv2->setMemoryType(MemoryType::Shared); + + // Schedule + constexpr int BSX = 32; + tv1->split(2, BSX); + tv1->split(1, sym_bsx); + tv1->split(0, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}}); + TensorView* tv3 = tv1->rFactor({-2}); + + tv0->computeAt(tv1, -2); + tv0->computeAt(tv3, -2); + + // Thread and Block binding + tv1->axis(0)->parallelize(ParallelType::BIDx); + tv1->axis(1)->parallelize(ParallelType::BIDy); + tv1->axis(-1)->parallelize(ParallelType::TIDx); + // Manual Binding + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + + constexpr int M = 154, K = 45, N = 1524; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({M, K, N}, options); + + // How many threads to use for the block reduction + constexpr int runtime_threadIdx_dim = 128; + + torch::jit::fuser::cuda::FusionExecutor executor; + executor.compileFusion(&fusion); + auto outputs = executor.runFusion( + {t0, runtime_threadIdx_dim}, + torch::jit::fuser::cuda::LaunchParams( + -1, -1, -1, runtime_threadIdx_dim, -1, -1)); + + at::Tensor aten_output = sum(t0, {1}); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); +} + +void testGPU_FusionSmemDynamicPwiseMulSymbolicArg() { + Fusion fusion; + 
FusionGuard fg(&fusion); + + Int* sym_bsx = new Int(); + TensorView* tv0 = makeDummyTensor(2); // (M, K) + TensorView* tv1 = makeDummyTensor(2); // (K, N) + TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) + TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) + TensorView* tv4 = mul(tv2, tv3); // M, K, N + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addInput(sym_bsx); + fusion.addOutput(tv4); + // Algorithm + + tv2->setMemoryType(MemoryType::Shared); + tv3->setMemoryType(MemoryType::Shared); + + constexpr int BSX = 32; + tv4->split(2, BSX); + tv4->split(1, sym_bsx); + tv4->split(0, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv4->reorder({{0, 0}, {1, 3}, {2, 1}, {3, 4}, {4, 2}, {5, 5}}); + // M/BSX, K/BSX, N/BSX, MSX, KSX, NSX + + tv0->computeAt(tv4, 3); + tv1->computeAt(tv4, 3); + // Schedule + + tv4->axis(0)->parallelize(ParallelType::BIDx); + tv4->axis(2)->parallelize(ParallelType::BIDy); + // Manual Binding + tv2->axis(-2)->parallelize(ParallelType::TIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + // Thread and Block binding + + constexpr int M = 128, K = 457, N = 1024; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({M, K}, options); + at::Tensor t1 = at::randn({K, N}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion( + {t0, t1, BSX}, + torch::jit::fuser::cuda::LaunchParams(-1, -1, -1, BSX, -1, -1)); + + at::Tensor aten_output = mul(t0.unsqueeze(2), t1.unsqueeze(0)); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); +} + void testGPU_FusionConstCheck() { Fusion fusion; FusionGuard fg(&fusion); diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index bd21781a2b8b4..a2b1cdc49f2f3 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -201,6 +201,9 @@ namespace jit { _(GPU_FusionSmemReduce) \ _(GPU_FusionSmemBlockGemm) \ _(GPU_FusionSmemBlockGemmCache) \ + _(GPU_FusionSmemDynamicReductionSymbolic) \ + _(GPU_FusionSmemDynamicReductionSymbolicArg) \ + _(GPU_FusionSmemDynamicPwiseMulSymbolicArg) \ _(GPU_FusionConstCheck) \ _(GPU_FusionSymbolicReduction) \ _(GPU_FusionUnrollWithAlloc) \ diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp index a546ee5cf2f6f..f2582b48e4f96 100644 --- a/torch/csrc/jit/codegen/cuda/executor.cpp +++ b/torch/csrc/jit/codegen/cuda/executor.cpp @@ -6,6 +6,7 @@ #include +#include #include #include #include @@ -55,6 +56,16 @@ void FusionExecutor::compileFusion(Fusion* fusion, CompileOptions options) { const auto kernel = lowered_.getKernel(kernelName()); const auto structured_code = getStructuredCode(kernel); + if (lowered_.static_allocations().size() > 0) { + EvaluationContext evaluation_context(&fusion_); + unsigned static_smem_size = + computeSharedMemory(evaluation_context, lowered_.static_allocations()); + TORCH_INTERNAL_ASSERT( + static_smem_size < + at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock, + "The static shared memory allocation is larger than available memory."); + } + compiled_kernel_ = executor_utils::nvrtcCompile( structured_code, (kernelNamespace() + "::" + kernelName()).c_str(), @@ -71,14 +82,14 @@ at::Tensor inferAndAlloc( bool zero_init = false) { std::vector sizes; for (auto id : TensorDomain::noReductions(tv->getRootDomain())) { - auto infered_val = ExpressionEvaluator::evaluate(id->rawExtent(), &ec); + auto 
inferred_val = ExpressionEvaluator::evaluate(id->rawExtent(), &ec); TORCH_INTERNAL_ASSERT( - infered_val.has_value(), + inferred_val.has_value(), "Could not launch kernel as program could not infer ", id->rawExtent(), " for the buffer ", tv); - sizes.push_back(infered_val.value()); + sizes.push_back(inferred_val.value()); } auto at_type = data_type_to_aten(tv->getDataType().value()); @@ -96,6 +107,32 @@ at::Tensor inferAndAlloc( } // namespace +uint64_t FusionExecutor::computeSharedMemory( + EvaluationContext& ec, + const std::vector& buffers, + bool align_padding, + uint64_t total) { + for (auto smem_alloc : buffers) { + auto inferred_size = ExpressionEvaluator::evaluate(smem_alloc->size(), &ec); + if (inferred_size.has_value()) { + const uint64_t data_size = dataTypeSize(smem_alloc->buffer_type()); + // Add padding to align dynamic shared memory + if (align_padding) { + total = ceilDiv(total, data_size) * data_size; + } + total += inferred_size.value() * data_size; + } else { + TORCH_INTERNAL_ASSERT( + false, + "Failed to evaluate the size ", + smem_alloc->size(), + " of shared memory buffer - T", + smem_alloc->buffer()->name()); + } + } + return total; +} + LaunchParams FusionExecutor::computeLaunchParams( const at::ArrayRef& aten_inputs, const LaunchParams& launch_constraints, @@ -129,24 +166,24 @@ LaunchParams FusionExecutor::computeLaunchParams( // If any dimension was set in launch constraints we need to run through // IterDomains that have been parallelized, and bind those values. Or make - // sure if they could be infered the inference matches what was set. + // sure if they could be inferred the inference matches what was set. if (launch_constraints.nBlocks() * launch_constraints.nThreads() != -1) { for (auto& entry : parallel_iter_domains) { auto p_type = entry.first; if (launch_constraints.hasDim(p_type)) { auto parallel_ids = entry.second; for (auto parallel_id : parallel_ids) { - auto infered_val = + auto inferred_val = ExpressionEvaluator::evaluate(parallel_id->rawExtent(), &ec); - if (infered_val.has_value()) { - // This value could have been infered, make sure it was set right. + if (inferred_val.has_value()) { + // This value could have been inferred, make sure it was set right. 
TORCH_CHECK( - infered_val.value() == launch_constraints.getDim(p_type) || + inferred_val.value() == launch_constraints.getDim(p_type) || launch_constraints.getRawVal(p_type) == -1, - "Infered that ", + "inferred that ", p_type, " should be set to ", - infered_val.value(), + inferred_val.value(), " but launch constraints specified ", launch_constraints.getDim(p_type)); } else { @@ -155,6 +192,10 @@ LaunchParams FusionExecutor::computeLaunchParams( ec, parallel_id->rawExtent(), launch_constraints.getDim(entry.first)); + executor_utils::safeBind( + ec, + lowered_.getLowerValue(parallel_id->rawExtent()), + launch_constraints.getDim(entry.first)); launch_params.bind(launch_constraints.getDim(p_type), p_type); } } @@ -177,6 +218,29 @@ LaunchParams FusionExecutor::computeLaunchParams( } } + // Calculate Dynamic Shared Memory Size + // Add workspace for reduction and broadcast + uint64_t reduction_broadcast_workspace = 0; + if (fusion_.hasBlockReduction() || fusion_.hasGridReduction() || + lowered_.hasBlockBroadcast()) { + // Not using nThreads here since it does not handle uninitialized value + reduction_broadcast_workspace = + dataTypeSize(fusion_.getMaximumSmemDataType()) * launch_params.bdimx() * + launch_params.bdimy() * launch_params.bdimz(); + } + + uint64_t dynamic_smem_size = computeSharedMemory( + ec, lowered_.dynamic_allocations(), true, reduction_broadcast_workspace); + + uint64_t static_smem_size = + computeSharedMemory(ec, lowered_.static_allocations()); + + TORCH_INTERNAL_ASSERT( + (dynamic_smem_size + static_smem_size) < + at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock, + "The total shared memory allocation is larger than available memory."); + launch_params.setSmem(dynamic_smem_size); + return launch_params; } @@ -231,7 +295,7 @@ std::vector FusionExecutor::runFusion( auto stream = at::cuda::getCurrentCUDAStream(); EvaluationContext evaluation_context = - executor_utils::bindInputs(inputs, &fusion_); + executor_utils::bindInputs(inputs, &fusion_, &lowered_); LaunchParams launch_params = computeLaunchParams(inputs, launch_constraints, evaluation_context); @@ -266,7 +330,7 @@ std::vector FusionExecutor::runFusion( launch_params.bdimx(), launch_params.bdimy(), launch_params.bdimz(), - 0, // smem + launch_params.smem(), stream, kernel_arguments.getBuffer(), nullptr)); diff --git a/torch/csrc/jit/codegen/cuda/executor.h b/torch/csrc/jit/codegen/cuda/executor.h index 10e71827a37b1..86a70fc27f73e 100644 --- a/torch/csrc/jit/codegen/cuda/executor.h +++ b/torch/csrc/jit/codegen/cuda/executor.h @@ -61,6 +61,12 @@ class TORCH_CUDA_API FusionExecutor : public NonCopyable { const LaunchParams& launch_constraints, EvaluationContext& ec); + uint64_t computeSharedMemory( + EvaluationContext& ec, + const std::vector& buffers, + bool align_padding = false, + uint64_t total = 0); + std::vector allocGlobalVals(EvaluationContext& ec); std::vector allocOutputs(EvaluationContext& ec); diff --git a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp b/torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp index 1f3f44dbf5511..76358eb7868f4 100644 --- a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp +++ b/torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp @@ -58,7 +58,7 @@ void KernelArgumentHolder::push(const IValue& val) { arguments_.push_back(std::make_unique((float)val.toDouble())); return; case c10::ScalarType::Long: - arguments_.push_back(std::make_unique((int)val.toInt())); + arguments_.push_back(std::make_unique(val.toInt())); return; default: TORCH_INTERNAL_ASSERT( diff --git 
a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h b/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h index ca9a83c60a56c..44d0eeacc7dfe 100644 --- a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h +++ b/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h @@ -61,6 +61,14 @@ struct ULongArg : public ArgAbstract { } }; +struct LongArg : public ArgAbstract { + int64_t val_; + LongArg(int64_t _val) : val_(_val){}; + void* arg() { + return &val_; + } +}; + struct IntArg : public ArgAbstract { int val_; IntArg(int _val) : val_(_val){}; diff --git a/torch/csrc/jit/codegen/cuda/executor_launch_params.h b/torch/csrc/jit/codegen/cuda/executor_launch_params.h index 872fa2d06b868..981352e4839bf 100644 --- a/torch/csrc/jit/codegen/cuda/executor_launch_params.h +++ b/torch/csrc/jit/codegen/cuda/executor_launch_params.h @@ -24,9 +24,14 @@ class TORCH_CUDA_API LaunchParams { bdimy_(bdimy), bdimz_(bdimz) {} + void setSmem(int64_t smem) { + smem_ = smem; + } + int64_t smem() const { return smem_; } + int64_t nBlocks() const { return gdimx_ * gdimy_ * gdimz_; } diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp index 228a7723e1c4a..97113fb4232c6 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp @@ -232,6 +232,48 @@ EvaluationContext bindInputs( return eval_context; } +EvaluationContext bindInputs( + const at::ArrayRef& aten_inputs, + Fusion* fusion, + GpuLower* lowered) { + TORCH_INTERNAL_ASSERT( + fusion->inputs().size() == aten_inputs.size(), + "Something went wrong configuring launch. Inputs no longer match."); + + auto fusion_inputs = fusion->inputs(); + EvaluationContext eval_context(fusion); + + // This should probably move to EvaluationContext as we may want to bind + // input values frequently. Bind fusion input values to runtime values. + for (size_t i = 0; i < fusion->inputs().size(); i++) { + if (fusion->inputs()[i]->getValType() == ValType::TensorView) { + TensorView* cg_tensor = fusion->inputs()[i]->as(); + + TORCH_INTERNAL_ASSERT( + aten_inputs[i].isTensor(), + "Something went wrong configuring launch. Inputs no longer match."); + + auto aten_tensor = aten_inputs[i].toTensor(); + auto root_dom = TensorDomain::noReductions(cg_tensor->getRootDomain()); + TORCH_INTERNAL_ASSERT( + aten_tensor.ndimension() == root_dom.size(), + "Something went wrong configuring launch. 
Inputs no longer match."); + + for (size_t dim = 0; dim < root_dom.size(); dim++) { + auto extent = root_dom[dim]->extent(); + safeBind(eval_context, extent, aten_tensor.sizes()[dim]); + if (!extent->isConstScalar()) { + safeBind( + eval_context, + lowered->getLowerValue(extent), + aten_tensor.sizes()[dim]); + } + } + } + } + return eval_context; +} + NvrtcFunction nvrtcCompile( const std::string& code, const std::string& func_name, diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.h b/torch/csrc/jit/codegen/cuda/executor_utils.h index d7f50ff7813b1..f105c9b88f82c 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.h +++ b/torch/csrc/jit/codegen/cuda/executor_utils.h @@ -11,6 +11,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -38,10 +39,17 @@ void safeBind( const Val* value, Int::ScalarType concrete_value); +// Bind Inputs to Fusion IR EvaluationContext bindInputs( const at::ArrayRef& aten_inputs, Fusion* fusion); +// Bind Inputs to Fusion and Kernel IR +EvaluationContext bindInputs( + const at::ArrayRef& aten_inputs, + Fusion* fusion, + GpuLower* lowered); + struct NvrtcFunction { CUmodule module = CUmodule(); CUfunction function = CUfunction(); diff --git a/torch/csrc/jit/codegen/cuda/expr_evaluator.cpp b/torch/csrc/jit/codegen/cuda/expr_evaluator.cpp index b82813748a0bf..78aeab910e33e 100644 --- a/torch/csrc/jit/codegen/cuda/expr_evaluator.cpp +++ b/torch/csrc/jit/codegen/cuda/expr_evaluator.cpp @@ -13,7 +13,7 @@ namespace fuser { void EvaluationContext::bind(const Val* value, Int::ScalarType concrete_value) { TORCH_INTERNAL_ASSERT( value->isAnInt(), - "Expressoin Evaluation does not support values other than integers at this time."); + "Expression Evaluation does not support values other than integers at this time."); if (value->isConstScalar()) { auto const_value = value->as()->value().value(); @@ -53,7 +53,7 @@ void EvaluationContext::print() const { std::cout << " ; original value = " << kv.first->as()->value().value(); } - std::cout << "\n"; + std::cout << " ; " << *kv.first->getValType() << "\n"; } std::cout << "--------------------\n\n"; } diff --git a/torch/csrc/jit/codegen/cuda/fusion.cpp b/torch/csrc/jit/codegen/cuda/fusion.cpp index 82bf7847d59b5..381695cd27ab9 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.cpp +++ b/torch/csrc/jit/codegen/cuda/fusion.cpp @@ -328,6 +328,9 @@ void Fusion::assertInFusion(const Statement* stmt, const std::string& msg) if (inFusion(stmt)) { return; } + if (inKernelIr(stmt)) { + return; + } TORCH_CHECK(false, msg, " it was not found in the active fusion."); } @@ -583,6 +586,42 @@ bool Fusion::hasGridReduction() { return false; } +bool Fusion::hasBroadcast() { + for (auto expr : exprs(true)) + for (auto out : expr->outputs()) + if (out->getValType() == ValType::TensorView) + if (out->as()->hasBroadcast()) + return true; + + return false; +} + +DataType Fusion::getMaximumSmemDataType() { + DataType result = DataType::Null; + unsigned max_size = 0; + for (auto expr : exprs(true)) { + for (auto out : expr->outputs()) { + if (out->getValType() == ValType::TensorView) { + auto tv = out->as(); + bool hasWorkspace = tv->hasBlockReduction() || tv->hasGridReduction(); + bool hasDynamic = tv->getMemoryType() == MemoryType::Shared; + if (hasWorkspace || hasDynamic) { + auto data_type = tv->getDataType(); + if (data_type.has_value()) { + unsigned size = dataTypeSize(data_type.value()); + if (size > max_size) { + max_size = size; + result = data_type.value(); + } + } + } + } + } + } + + return result; +} + std::vector 
Fusion::getTerminatingOutputs() { FusionGuard fg(this); diff --git a/torch/csrc/jit/codegen/cuda/fusion.h b/torch/csrc/jit/codegen/cuda/fusion.h index e1ee80e369baa..d7dd74070ca99 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.h +++ b/torch/csrc/jit/codegen/cuda/fusion.h @@ -208,6 +208,8 @@ class TORCH_CUDA_API Fusion final { bool hasReduction(); bool hasBlockReduction(); bool hasGridReduction(); + bool hasBroadcast(); + DataType getMaximumSmemDataType(); size_t gridReductionTempBufferSize(); const auto& inputs() const { diff --git a/torch/csrc/jit/codegen/cuda/index_compute.cpp b/torch/csrc/jit/codegen/cuda/index_compute.cpp index c9cdd38a3c301..e75440c48a185 100644 --- a/torch/csrc/jit/codegen/cuda/index_compute.cpp +++ b/torch/csrc/jit/codegen/cuda/index_compute.cpp @@ -1040,7 +1040,7 @@ kir::TensorIndex* Index::getConsumerIndex_impl( // Indices should now be mapped onto IterDomains in consumer, so just grab // and use them. - auto root_dom = consumer_tv->getRootDomain(); + auto root_dom = consumer_tv->getMaybeRFactorDomain(); std::vector strided_inds; for (size_t i = 0; i < root_dom.size(); i++) { diff --git a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp index 112bcc8827a3c..81178139e450c 100644 --- a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp @@ -40,7 +40,8 @@ void IRPrinter::handle(const Expr* e) { void IRPrinter::printHeader( Fusion* fusion, const std::string& kernel_name_, - const std::vector& global_buffers) { + const std::vector& global_buffers, + bool hasDynamicSmem) { os << "__global__ void " << kernel_name_ << "("; std::vector vals; @@ -89,17 +90,38 @@ void IRPrinter::printHeader( os << "){\n"; indent_size++; + if (fusion->hasRNG()) { indent(); os << "int idx = blockIdx.x*blockDim.x + threadIdx.x;\n"; indent(); os << "Philox rnd(seed, idx, offset);\n"; } - if (fusion->hasBlockReduction() || fusion->hasGridReduction()) { + + // Dynamic Shared Memory + const bool hasWorkspace = + fusion->hasBlockReduction() || fusion->hasGridReduction(); + if (hasDynamicSmem || hasWorkspace) { + indent(); + os << "alignas("; + os << dataTypeSize(fusion->getMaximumSmemDataType()); + os << ") extern __shared__ char array[];\n"; + } + + if (hasDynamicSmem) { + indent(); + os << "unsigned offset = 0;\n"; + } + + if (hasWorkspace) { indent(); - // TODO: Dynamic sizing possible? 
blockReduce originally used 1024 - // values of a given type - os << "__shared__ float shared_mem[1024];\n"; + os << "void* shared_mem = array;\n"; + if (hasDynamicSmem) { + indent(); + os << "offset += ((blockDim.x * blockDim.y * blockDim.z) * sizeof("; + os << fusion->getMaximumSmemDataType(); + os << "));\n"; + } } } @@ -675,7 +697,7 @@ void IRPrinter::handle(const kir::ReductionOp* rop) { os << ", "; os << "reduction_" << op_type << "_" << d_type; os << ", threadIdx, blockDim"; - os << ", reinterpret_cast<" << d_type << "*>(shared_mem)"; + os << ", static_cast<" << d_type << "*>(shared_mem)"; os << ");\n"; } } @@ -730,7 +752,7 @@ void IRPrinter::handle(const kir::GridReduction* gr) { os << "reduction_" << op_type << "_" << d_type; os << ", &T" << work_buffer->name() << "[0]"; os << ", T" << sync_buffer->name() << ""; - os << ", reinterpret_cast<" << d_type << "*>(shared_mem)"; + os << ", static_cast<" << d_type << "*>(shared_mem)"; os << ");\n"; } @@ -760,6 +782,7 @@ void IRPrinter::handle(const kir::BroadcastOp* bop) { !grid_broadcast_needed, "Parallel broadcast across blocks not supported"); if (block_broadcast_needed) { + auto d_type = bop->out()->getDataType().value(); indent(); os << "broadcast::blockBroadcast<"; os << (thread_x ? "true" : "false") << ", "; @@ -769,6 +792,7 @@ void IRPrinter::handle(const kir::BroadcastOp* bop) { handle(bop->out()); os << ", "; handle(bop->in()); + os << ", static_cast<" << d_type << "*>(shared_mem)"; os << ");\n"; } else { indent(); @@ -850,15 +874,42 @@ void IRPrinter::handle(const kir::Allocate* a) { os << "// Allocate global tensor "; break; case MemoryType::Shared: - os << "__shared__ "; + if (a->size()->isConstScalar()) { + // Static Shared Memory + os << "__shared__ "; + } break; case MemoryType::Local: break; } - os << a->buffer_type(); - os << " T" << tv->name() << "["; - print_inline(a->size()); - os << "];\n"; + + // Dynamic Shared Memory + if (tv->getMemoryType() == MemoryType::Shared && + !a->size()->isConstScalar()) { + // Align Offset Position + os << "offset = alignBufferSize(offset,"; + os << dataTypeSize(a->buffer_type()); + os << ");\n"; + // Shared Memory Pointer + indent(); + os << a->buffer_type() << "* "; + os << "T" << tv->name(); + os << " = reinterpret_cast<" << a->buffer_type() << "*>"; + os << "(array + offset);\n"; + // Increment Offset Position + indent(); + os << "offset += ("; + print_inline(a->size()); + os << " * sizeof("; + os << a->buffer_type(); + os << "));\n"; + } else { + os << a->buffer_type(); + os << " T" << tv->name() << "["; + print_inline(a->size()); + os << "];\n"; + } + } else { os << a->buffer_type() << " "; handle(a->buffer()); @@ -938,7 +989,8 @@ void IRPrinter::printReductionOps(Fusion* fusion) { void IRPrinter::printKernel( const std::vector& exprs, const std::string& kernel_name, - const std::vector& global_buffers) { + const std::vector& global_buffers, + bool hasDynamicSmem) { Fusion* fusion = FusionGuard::getCurFusion(); if (exprs.empty()) return; @@ -947,7 +999,7 @@ void IRPrinter::printKernel( "Incorrect fusion set during printKernel."); printReductionOps(fusion); - printHeader(fusion, kernel_name, global_buffers); + printHeader(fusion, kernel_name, global_buffers, hasDynamicSmem); for (auto* expr : exprs) { handle(expr); diff --git a/torch/csrc/jit/codegen/cuda/ir_iostream.h b/torch/csrc/jit/codegen/cuda/ir_iostream.h index e6d4b473a758f..eb07f86c5aead 100644 --- a/torch/csrc/jit/codegen/cuda/ir_iostream.h +++ b/torch/csrc/jit/codegen/cuda/ir_iostream.h @@ -94,7 +94,8 @@ class 
TORCH_CUDA_API IRPrinter : public OptInConstDispatch { void printHeader( Fusion* fusion, const std::string& kernel_name_, - const std::vector& global_buffers); + const std::vector& global_buffers, + bool hasDynamicSmem); IRPrinter(std::ostream& _os) : os(_os) {} @@ -169,7 +170,8 @@ class TORCH_CUDA_API IRPrinter : public OptInConstDispatch { void printKernel( const std::vector& exprs, const std::string& kernel_name, - const std::vector& global_buffers); + const std::vector& global_buffers, + bool hasDynamicSmem); private: std::unique_ptr thread_predicates_; diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir.cpp b/torch/csrc/jit/codegen/cuda/kernel_ir.cpp index 01a099db8ad16..8f8fd95fb0d4a 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_ir.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_ir.cpp @@ -386,8 +386,7 @@ Allocate::Allocate(Val* buffer, MemoryType memory_type, Val* size) } } - if ((memory_type_ == MemoryType::Local || - memory_type_ == MemoryType::Shared)) { + if (memory_type_ == MemoryType::Local) { if (!size_->isConstScalar()) { TORCH_INTERNAL_ASSERT( false, diff --git a/torch/csrc/jit/codegen/cuda/kernel_resource_strings.h b/torch/csrc/jit/codegen/cuda/kernel_resource_strings.h index cdc41ddab51c3..a099b1a7698ea 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_resource_strings.h +++ b/torch/csrc/jit/codegen/cuda/kernel_resource_strings.h @@ -149,6 +149,9 @@ static auto code_helper_funcs = R"( __device__ constexpr int ceilDiv(const int a, const int b) { return (a + b - 1) / b; } +__device__ constexpr int alignBufferSize(const int buffer, const int size) { + return (buffer + (size-1)) & ~(size-1); +} __device__ float clamp(const float x, const float minv, const float maxv) { return x < minv ? minv : (x > maxv ? maxv : x); } @@ -595,10 +598,7 @@ __host__ __device__ unsigned offset_of_source(const dim3& block_dim, const dim3& out: Per-thread output location */ template -__device__ void blockBroadcast(T& out, T inp_val) { - - // Use worst case for memory. 
- __shared__ T shared_mem[1024]; + __device__ void blockBroadcast(T& out, T inp_val, T* shared_mem) { const bool has_valid_data = (!X_THREAD || threadIdx.x == 0) && diff --git a/torch/csrc/jit/codegen/cuda/lower2device.cpp b/torch/csrc/jit/codegen/cuda/lower2device.cpp index 94ac287722bb3..424ed4ae13386 100644 --- a/torch/csrc/jit/codegen/cuda/lower2device.cpp +++ b/torch/csrc/jit/codegen/cuda/lower2device.cpp @@ -19,31 +19,44 @@ namespace { // TODO(kir): revisit this thread_local GpuLower* active_gpu_lower = nullptr; -class GridReductionBuffers : OptOutDispatch { +class BuffersExtractor : OptOutDispatch { public: - static std::vector getGlobalAllocs( - const std::vector& exprs) { - GridReductionBuffers fgr; + BuffersExtractor( + const std::vector& exprs, + ThreadPredicateMap& _thread_predicates) + : thread_predicates_(_thread_predicates), has_block_broadcast_(false) { for (auto expr : exprs) { - fgr.handle(expr); + handle(expr); } - return fgr.global_allocations_; } - static std::vector getSyncAllocs( - const std::vector& exprs) { - GridReductionBuffers fgr; - for (auto expr : exprs) { - fgr.handle(expr); - } - return fgr.sync_allocations_; + std::vector getGlobalAllocs() { + return global_allocations_; + } + + std::vector getSyncAllocs() { + return sync_allocations_; + } + + std::vector getDynamicAllocs() { + return dynamic_allocations_; + } + + std::vector getStaticAllocs() { + return static_allocations_; + } + + bool hasBlockBroadcast() { + return has_block_broadcast_; } private: + ThreadPredicateMap& thread_predicates_; + bool has_block_broadcast_; std::vector global_allocations_; std::vector sync_allocations_; - - GridReductionBuffers() = default; + std::vector dynamic_allocations_; + std::vector static_allocations_; void handle(Expr* expr) final { OptOutDispatch::handle(expr); @@ -65,10 +78,30 @@ class GridReductionBuffers : OptOutDispatch { } } + void handle(kir::BroadcastOp* bop) final { + const ir_utils::ParallelTypeBitmap domains = + ir_utils::getParallelBroadcastDomains(bop->out(), thread_predicates_); + const bool thread_x = domains.get(ParallelType::TIDx); + const bool thread_y = domains.get(ParallelType::TIDy); + const bool thread_z = domains.get(ParallelType::TIDz); + const bool block_broadcast_needed = thread_x || thread_y || thread_z; + has_block_broadcast_ |= block_broadcast_needed; + } + void handle(kir::GridReduction* gr) final { global_allocations_.push_back(gr->reduction_buffer()); sync_allocations_.push_back(gr->sync_buffer()); } + + void handle(kir::Allocate* a) final { + if (a->getMemoryType() == MemoryType::Shared) { + if (a->size()->isConstScalar()) { + static_allocations_.push_back(a); + } else { + dynamic_allocations_.push_back(a); + } + } + } }; } // namespace @@ -181,8 +214,12 @@ void GpuLower::lower() { lowered_exprs_ = indexed_loops; // Get allocations - global_allocations_ = GridReductionBuffers::getGlobalAllocs(lowered_exprs_); - sync_allocations_ = GridReductionBuffers::getSyncAllocs(lowered_exprs_); + BuffersExtractor be(lowered_exprs_, preds); + global_allocations_ = be.getGlobalAllocs(); + sync_allocations_ = be.getSyncAllocs(); + dynamic_smem_allocations_ = be.getDynamicAllocs(); + static_smem_allocations_ = be.getStaticAllocs(); + has_block_broadcast_ = be.hasBlockBroadcast(); } // Traverse through the fusion and print CUDA code associated with it @@ -204,8 +241,10 @@ std::ostream& GpuLower::printKernel( global_tensors.begin(), [](kir::Allocate* alloc) { return alloc->buffer(); }); + bool hasDynamicSmem = dynamic_smem_allocations_.size() > 0; + 
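To make the dynamic shared memory codegen above concrete, here is a rough hand-written sketch of the kernel preamble the updated printHeader and Allocate handlers would emit for a fusion with a float block reduction plus one dynamically sized float shared buffer; the tensor name T2 and the extent n are placeholders, not values taken from this patch:

  // single dynamic smem blob; reduction/broadcast workspace sits at its start
  alignas(4) extern __shared__ char array[];
  unsigned offset = 0;
  void* shared_mem = array;
  offset += ((blockDim.x * blockDim.y * blockDim.z) * sizeof(float));
  // each dynamically sized shared buffer is then carved out, aligned to its element size
  offset = alignBufferSize(offset, 4);
  float* T2 = reinterpret_cast<float*>(array + offset);
  offset += (n * sizeof(float));

All buffers share the one extern array, which is why computeLaunchParams now sums the reduction/broadcast workspace and the dynamic allocations before calling launch_params.setSmem().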
IRPrinter irp(os); - irp.printKernel(lowered_exprs_, kernel_name, global_tensors); + irp.printKernel(lowered_exprs_, kernel_name, global_tensors, hasDynamicSmem); return os; } @@ -338,6 +377,11 @@ Val* GpuLower::lowerValue(const Val* val) { return kir_mapper.lower(val); } +Val* GpuLower::getLowerValue(const Val* val) { + KernelIrMapper kir_mapper(this); + return kir_mapper.lower(val); +} + } // namespace fuser } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower2device.h b/torch/csrc/jit/codegen/cuda/lower2device.h index 4ffccba33339c..c9a8a283b0916 100644 --- a/torch/csrc/jit/codegen/cuda/lower2device.h +++ b/torch/csrc/jit/codegen/cuda/lower2device.h @@ -37,12 +37,26 @@ class TORCH_CUDA_API GpuLower { return sync_allocations_; } + std::vector dynamic_allocations() { + return dynamic_smem_allocations_; + } + + std::vector static_allocations() { + return static_smem_allocations_; + } + + bool hasBlockBroadcast() { + return has_block_broadcast_; + } + // Converts a Fusion IR value into the Kernel IR equivalent // // TODO(kir): revisit this interface // static Val* lowerValue(const Val* val); + Val* getLowerValue(const Val* val); + private: void lower(); @@ -65,6 +79,15 @@ class TORCH_CUDA_API GpuLower { // the fusion std::vector sync_allocations_; + // List of dynamic shared memory buffers + std::vector dynamic_smem_allocations_; + + // List of static shared memory buffers + std::vector static_smem_allocations_; + + // Check if kernel has shared memory broadcast op + bool has_block_broadcast_; + // Lowered IR std::vector lowered_exprs_; diff --git a/torch/csrc/jit/codegen/cuda/lower_loops.cpp b/torch/csrc/jit/codegen/cuda/lower_loops.cpp index 59e10656dece3..b27ef32c2207c 100644 --- a/torch/csrc/jit/codegen/cuda/lower_loops.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_loops.cpp @@ -42,7 +42,7 @@ Expr* LoopNestGenerator::pushAlloc(TensorView* tv) { local_dim->isBroadcast()) { continue; } - alloc_dims.push_back(compute_at_dim->extent()); + alloc_dims.push_back(compute_at_dim->rawExtent()); } // Multiply all the dimensions we're going to use for the allocation together @@ -62,10 +62,22 @@ Expr* LoopNestGenerator::pushAlloc(TensorView* tv) { const auto alloc = new kir::Allocate(lowered_tv, lowered_tv->getMemoryType(), size); - if (alloc_loop != nullptr) { - alloc_loop->body().insert(0, alloc); - } else { - lowered_exprs.insert(lowered_exprs.begin(), alloc); + // Track Shared Memory Allocation Nodes + bool hasDynamicSmemAlloc = false; + if (tv->getMemoryType() == MemoryType::Shared) { + if (!size->isConstScalar()) { + hasDynamicSmemAlloc = true; + dynamic_smem_.push_front(alloc); + } + } + + // Place the allocation + if (!hasDynamicSmemAlloc) { + if (alloc_loop != nullptr) { + alloc_loop->body().insert(0, alloc); + } else { + lowered_exprs.insert(lowered_exprs.begin(), alloc); + } } return alloc; @@ -656,7 +668,7 @@ void LoopNestGenerator::generate(const std::vector& exprs) { FusionGuard fg(fusion_); // Identify all shared memory TensorViews - // Initialize Modified status + // Insert into shared_memory map for (auto v : fusion_->vals()) { if (v->getValType().value() == ValType::TensorView) { if (v->as()->getMemoryType() == MemoryType::Shared) { @@ -674,6 +686,11 @@ void LoopNestGenerator::generate(const std::vector& exprs) { for (auto* expr : reordered) { handle(expr); } + + // Insert Dynamic Shared Memory at beginning of kernel + for (auto smem_alloc : dynamic_smem_) { + lowered_exprs.insert(lowered_exprs.begin(), smem_alloc); + } } void 
LoopNestGenerator::cleanSharedMemory() { diff --git a/torch/csrc/jit/codegen/cuda/lower_loops.h b/torch/csrc/jit/codegen/cuda/lower_loops.h index f15ea29d218fe..2da3548de4a69 100644 --- a/torch/csrc/jit/codegen/cuda/lower_loops.h +++ b/torch/csrc/jit/codegen/cuda/lower_loops.h @@ -52,6 +52,10 @@ class TORCH_CUDA_API LoopNestGenerator : public OptOutDispatch { // Tracks if shared memory is modified std::unordered_map smem_; + // Track dynamic shared memory buffer + // Insert allocation at the beginning of the kernel + std::deque dynamic_smem_; + // Clear the modify status for all shared memory buffers void cleanSharedMemory(); diff --git a/torch/csrc/jit/codegen/cuda/scheduler.cpp b/torch/csrc/jit/codegen/cuda/scheduler.cpp index c3e2f10c0f625..5a1611d157785 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler.cpp @@ -208,10 +208,6 @@ bool scheduleFusion(Fusion* fusion, const at::ArrayRef inputs) { } namespace { -constexpr int ceilDiv(int a, int b) { - return (a + b - 1) / b; -} - // Largest Power of 2 less-than n constexpr int lastPow2(int n) { n |= (n >> 1); diff --git a/torch/csrc/jit/codegen/cuda/utils.h b/torch/csrc/jit/codegen/cuda/utils.h index 08be561aad0df..e286cc09ed3ad 100644 --- a/torch/csrc/jit/codegen/cuda/utils.h +++ b/torch/csrc/jit/codegen/cuda/utils.h @@ -7,6 +7,11 @@ namespace torch { namespace jit { namespace fuser { +// Common Functions +constexpr int64_t ceilDiv(int64_t a, int64_t b) { + return (a + b - 1) / b; +} + // Simple mixin for suppressing copy & move operations, ex: // // class Foo : public NonCopyable { From 930cfe04ebdbcc620fdd90dc49cf638659f27fc4 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 26 Aug 2020 11:36:41 -0700 Subject: [PATCH 015/167] Detect computeAt causing mismatched TensorDomain (#327) An example of this error happens with tv4 of testGPU_FusionComputeAtMultiBCast. --- test/cpp/jit/test_gpu.cpp | 18 ++++++++++++++++++ test/cpp/jit/tests.h | 3 ++- torch/csrc/jit/codegen/cuda/compute_at.cpp | 14 ++++++++++++++ torch/csrc/jit/codegen/cuda/compute_at.h | 4 +--- .../csrc/jit/codegen/cuda/ir_internal_nodes.h | 5 +++++ torch/csrc/jit/codegen/cuda/ir_nodes.cpp | 9 +++++++++ 6 files changed, 49 insertions(+), 4 deletions(-) diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index da53698983667..2a0cf272865d2 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -6218,6 +6218,24 @@ void testGPU_FusionLSTMCell() { TORCH_CHECK(at_hy.allclose(outputs[1], 1e-4, 1e-7)); } +void testGPU_FusionComputeAtMultiBCast() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(1); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = broadcast(tv1, {true, false}); + TensorView* tv3 = broadcast(tv1, {false, true}); + TensorView* tv4 = add(tv2, tv3); + fusion.addOutput(tv4); + + // This is not supported and should throw an exception. 
+ ASSERT_ANY_THROW(tv1->computeAt(tv3, -1)); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index a2b1cdc49f2f3..0f3fac2077409 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -219,7 +219,8 @@ namespace jit { _(GPU_FusionTraversalOrder7) \ _(GPU_FusionBranches) \ _(GPU_FusionThreadPredicate) \ - _(GPU_FusionLSTMCell) + _(GPU_FusionLSTMCell) \ + _(GPU_FusionComputeAtMultiBCast) #else #define TH_FORALL_TESTS_CUDA(_) \ _(ArgumentSpec) \ diff --git a/torch/csrc/jit/codegen/cuda/compute_at.cpp b/torch/csrc/jit/codegen/cuda/compute_at.cpp index 3e0f5303b9669..4780d699d5546 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at.cpp +++ b/torch/csrc/jit/codegen/cuda/compute_at.cpp @@ -82,6 +82,20 @@ void ComputeAtData::validateNewComputeAt() const { "."); } +void ComputeAtData::setComputeAtDomain(TensorDomain* td) { + if (new_compute_at_domain_ != original_domain_) { + TORCH_INTERNAL_ASSERT( + *new_compute_at_domain_ == *td, + "TensorDomain, ", + td, + ", does not match with the previously set domain of ", + tv_ref_, + ", which is ", + new_compute_at_domain_); + } + new_compute_at_domain_ = td; +} + namespace { // Wrapper around set_intersection template diff --git a/torch/csrc/jit/codegen/cuda/compute_at.h b/torch/csrc/jit/codegen/cuda/compute_at.h index 84677ae994480..a9112a6225ca6 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at.h +++ b/torch/csrc/jit/codegen/cuda/compute_at.h @@ -56,9 +56,7 @@ class ComputeAtData { // If we set computeAt, save the domain so we can reset it after traversal. // Traversal state can deviate from the domain we will want to save after the // entire computeAt pass. - void setComputeAtDomain(TensorDomain* td) { - new_compute_at_domain_ = td; - } + void setComputeAtDomain(TensorDomain* td); // Return domain set in setComputeAtDomain TensorDomain* getComputeAtDomain() const { diff --git a/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h b/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h index 10446b1235329..7fd760bc60dfa 100644 --- a/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h +++ b/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h @@ -384,6 +384,11 @@ class TORCH_CUDA_API TensorDomain : public Val { TensorDomain(const TensorDomain* src, IrCloner* ir_cloner); + bool operator==(const TensorDomain& other) const; + bool operator!=(const TensorDomain& other) const { + return !(*this == other); + } + std::vector::size_type nDims() const { return domain_.size(); } diff --git a/torch/csrc/jit/codegen/cuda/ir_nodes.cpp b/torch/csrc/jit/codegen/cuda/ir_nodes.cpp index d63d2ce681834..27756751814e0 100644 --- a/torch/csrc/jit/codegen/cuda/ir_nodes.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_nodes.cpp @@ -571,6 +571,15 @@ TensorDomain::TensorDomain(const TensorDomain* src, IrCloner* ir_cloner) rfactor_domain_(ir_cloner->clone(src->rfactor_domain_)), contiguity_(src->contiguity()) {} +bool TensorDomain::operator==(const TensorDomain& other) const { + // Checks equality of each class field. Should not be necessary to + // check no_bcast_domain_ and no_reduction_domain_ as they are just + // derived from domain_. 
+ return root_domain_ == other.root_domain_ && domain_ == other.domain_ && + rfactor_domain_ == other.rfactor_domain_ && + contiguity_ == other.contiguity_; +} + bool TensorDomain::sameAs(const TensorDomain* const other) const { if (nDims() != other->nDims()) return false; From b7a1060e14299ecf67823d8f200739f9ffe113dd Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 26 Aug 2020 17:15:30 -0700 Subject: [PATCH 016/167] Additional tests on computeAt with minor refactoring (#331) * Add computeAt tests with minor cleanup * Print names of IterDomains for better debugging experience --- test/cpp/jit/test_gpu.cpp | 397 +++++++++++++++++++ test/cpp/jit/tests.h | 5 + torch/csrc/jit/codegen/cuda/compute_at.cpp | 30 +- torch/csrc/jit/codegen/cuda/ir_iostream.cpp | 2 + torch/csrc/jit/codegen/cuda/transform_iter.h | 2 + 5 files changed, 421 insertions(+), 15 deletions(-) diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index 2a0cf272865d2..fcf62c718f814 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -1711,6 +1711,403 @@ void testGPU_FusionAdvancedComputeAt() { } } +void testGPU_FusionComputeAtMultiConsumers() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -2 + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(1); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv1, new Float(-2.0)); + fusion.addOutput(tv2); + fusion.addOutput(tv3); + + // This computeAt will affect tv2 as well, even though tv2 is not in + // the data-flow path between tv1 and tv3. The reason is that tv1 is + // now computed at tv3, so tv2 must also be computed at the same + // location. Overall, what will happen is basically we merge + // expressions of all tensors and compute them in a single loop + // nest. + TensorView* computeAtTarget = tv3; + computeAtTarget->split(0, 128); + tv1->computeAt(computeAtTarget, 1); + + TensorView* affected_tensors[] = {tv1, tv2, tv3}; + for (auto tv : affected_tensors) { + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + // Note that tv2 is also computed at tv3. + TORCH_CHECK(tv1->getComputeAtView() == computeAtTarget); + TORCH_CHECK(tv2->getComputeAtView() == tv3); + TORCH_CHECK(!tv3->hasComputeAt()); + + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + for (auto tv : affected_tensors) { + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({1000}, options); + + auto t1 = t0 * 0.5; + auto t2 = t1 * -1.0; + auto t3 = t1 * -2.0; + + at::Tensor kernel_tv2 = at::empty_like(t0, options); + at::Tensor kernel_tv3 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv2, kernel_tv3}); + + TORCH_CHECK(at::allclose(kernel_tv2, t2)); + TORCH_CHECK(at::allclose(kernel_tv3, t3)); +} + +// Similar to ComputeAtMultiConsumers, but with a common consumer. 
+void testGPU_FusionComputeAtCommonConsumer1() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -2 + // tv4 = tv2 + tv3 + // tv5 = tv4 * 5 + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(1); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv1, new Float(-2.0)); + TensorView* tv4 = add(tv2, tv3); + TensorView* tv5 = mul(tv4, new Float(5.0)); + fusion.addOutput(tv3); + fusion.addOutput(tv4); + fusion.addOutput(tv5); + + // Computing tv1 at tv3. This will affect tv2 as discussed in + // ComplexComputeAt1. Additionally, in this case, notice that tv4 is + // the common consumer of tv2 and tv3, so they are computed at + // tv4. The indirect propagation of the computeAt should stop at the + // common consumer, and no further change should occur. More + // specifically, tv4 and tv5 should not have a computeAt tensor. + TensorView* computeAtTarget = tv3; + computeAtTarget->split(0, 128); + tv1->computeAt(computeAtTarget, 1); + + fusion.printMath(); + fusion.printKernel(); + + TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4}; + for (auto tv : affected_tensors) { + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + TORCH_CHECK(tv1->getComputeAtView() == computeAtTarget); + TORCH_CHECK(tv2->getComputeAtView() == tv4); + TORCH_CHECK(tv3->getComputeAtView() == tv4); + TORCH_CHECK(!tv4->hasComputeAt()); + TORCH_CHECK(!tv5->hasComputeAt()); + + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + + for (auto tv : affected_tensors) { + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({1000}, options); + + auto t1 = t0 * 0.5; + auto t2 = t1 * -1.0; + auto t3 = t1 * -2.0; + auto t4 = t2 + t3; + auto t5 = t4 * 5.0; + + at::Tensor kernel_tv3 = at::empty_like(t0, options); + at::Tensor kernel_tv4 = at::empty_like(t0, options); + at::Tensor kernel_tv5 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv3, kernel_tv4, kernel_tv5}); + + TORCH_CHECK(at::allclose(kernel_tv3, t3)); + TORCH_CHECK(at::allclose(kernel_tv4, t4)); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); +} + +void testGPU_FusionComputeAtCommonConsumer2() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -1 + // tv4 = tv1 + 4 + // tv5 = tv3 + tv4 + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv2, new Float(-1.0)); + TensorView* tv4 = add(tv1, new Float(4.0)); + TensorView* tv5 = add(tv3, tv4); + + fusion.addOutput(tv5); + + TensorView* computeAtTarget = tv3; + + computeAtTarget->merge(0); + computeAtTarget->split(0, 128); + computeAtTarget->split(0, 4); + + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + + // This computeAt will affect all tensors including tv3, tv4 and + // tv5, even though it appears to impact only tv1 and tv2. The + // reason is that tv1 is now computed at tv3, so tv4 must also be + // computed at the same location. Similarly, the consumer of tv4, + // tv5, must also be computed at the same location. Overall, what + // will happen is basically we merge expressions of all tensors and + // compute them in a single loop nest. 
Internally, this will be + // realized by making all tensors, except for those in the path + // between tv1 and tv3, computed at tv5, which we call the common + // consumer. + tv1->computeAt(computeAtTarget, 1); + + fusion.printKernel(); + + // All tensors should have the same dimenionality as the target + for (Val* val : fusion.vals()) { + if (fusion.hasInput(val) || + val->getValType().value() != ValType::TensorView) { + continue; + } + TensorView* tv = val->as(); + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + TORCH_CHECK(tv1->getComputeAtView() == tv2); + TORCH_CHECK(tv2->getComputeAtView() == tv3); + // tv3 and tv4 are computed at tv5 + TORCH_CHECK(tv3->getComputeAtView() == tv5); + TORCH_CHECK(tv4->getComputeAtView() == tv5); + TORCH_CHECK(!tv5->hasComputeAt()); + + for (Val* val : fusion.vals()) { + if (!fusion.hasInput(val) && + val->getValType().value() == ValType::TensorView) { + TensorView* tv = val->as(); + tv->axis(1)->parallelize(ParallelType::Unroll); + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({129, 127}, options); + + auto t1 = t0.mul({0.5}); + auto t2 = t1.mul({-1.0}); + auto t3 = t2.mul({-1.0}); + auto t4 = t1.add({4.0}); + auto t5 = t3 + t4; + + at::Tensor kernel_tv5 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv5}); + + TORCH_CHECK(at::allclose(kernel_tv5, t5)); +} + +// Similar to the above common consumer test but adds an additional +// tensor that has no common consumer with the other tensors. +void testGPU_FusionComputeAtCommonConsumer3() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -1 + // tv4 = tv1 + 4 + // tv5 = tv2 + tv3 + // tv6 = tv1 + 6 + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv2, new Float(-1.0)); + TensorView* tv4 = add(tv1, new Float(4.0)); + TensorView* tv5 = add(tv3, tv4); + TensorView* tv6 = add(tv1, new Float(6.0)); + + fusion.addOutput(tv5); + fusion.addOutput(tv6); + + TensorView* computeAtTarget = tv3; + + computeAtTarget->merge(0); + computeAtTarget->split(0, 128); + computeAtTarget->split(0, 4); + + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + + // This will have the same impact on the tensors except for tv5 and + // tv6. tv6 does not have any common consumer with the computeAt + // target, but since it uses tv1, it must be also computed at the + // same location as the other impacted tensors. We can either make + // tv5 computed at tv6 or tv6 computed at tv5. In this case, tv5 + // should be computed at tv6 just because the current implementation + // orders the computeAt relationship based on the order in which + // tensors are specified as outputs. 
+ + tv1->computeAt(computeAtTarget, 1); + + fusion.printKernel(); + + // All tensors should have the same dimenionality as the target + for (Val* val : fusion.vals()) { + if (fusion.hasInput(val) || + val->getValType().value() != ValType::TensorView) { + continue; + } + TensorView* tv = val->as(); + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + TORCH_CHECK(tv1->getComputeAtView() == tv2); + TORCH_CHECK(tv2->getComputeAtView() == tv3); + + // tv3 and tv4 are computed at tv5 + TORCH_CHECK(tv3->getComputeAtView() == tv5); + TORCH_CHECK(tv4->getComputeAtView() == tv5); + + // tv5 should be computed at tv6 since tv5 is added as an output + // before tv6. If we call fusion.addOutput(tv6) first, tv6 should be + // computed at tv5. + TORCH_CHECK(tv5->getComputeAtView() == tv6); + TORCH_CHECK(!tv6->hasComputeAt()); + + for (Val* val : fusion.vals()) { + if (!fusion.hasInput(val) && + val->getValType().value() == ValType::TensorView) { + TensorView* tv = val->as(); + tv->axis(1)->parallelize(ParallelType::Unroll); + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({129, 127}, options); + + auto t1 = t0.mul({0.5}); + auto t2 = t1.mul({-1.0}); + auto t3 = t2.mul({-1.0}); + auto t4 = t1.add({4.0}); + auto t5 = t3 + t4; + auto t6 = t1.add({6.0}); + + at::Tensor kernel_tv5 = at::empty_like(t0, options); + at::Tensor kernel_tv6 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv5, kernel_tv6}); + + TORCH_CHECK(at::allclose(kernel_tv5, t5)); + TORCH_CHECK(at::allclose(kernel_tv6, t6)); +} + +// Similar to ComputeAtCommonConsumer1 but with an addtiona ltensor +// that does not have data dependency with the consumer. +void testGPU_FusionComputeAtNoCommonConsumer() { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv1 * -2 + // tv4 = tv2 + tv3 + // tv5 = tv4 * 5 + // tv6 = tv1 * 6 + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(1); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv1, new Float(-2.0)); + TensorView* tv4 = add(tv2, tv3); + TensorView* tv5 = mul(tv4, new Float(5.0)); + // Notice that tv6 is not a consumer of tv4. 
+ TensorView* tv6 = mul(tv1, new Float(6.0)); + fusion.addOutput(tv3); + fusion.addOutput(tv4); + fusion.addOutput(tv5); + fusion.addOutput(tv6); + + TensorView* computeAtTarget = tv3; + computeAtTarget->split(0, 128); + tv1->computeAt(computeAtTarget, 1); + + TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4, tv6}; + for (auto tv : affected_tensors) { + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + TORCH_CHECK(tv1->getComputeAtView() == computeAtTarget); + TORCH_CHECK(tv2->getComputeAtView() == tv4); + TORCH_CHECK(tv3->getComputeAtView() == tv4); + TORCH_CHECK(tv4->getComputeAtView() == tv5); + TORCH_CHECK(tv5->getComputeAtView() == tv6); + TORCH_CHECK(!tv6->hasComputeAt()); + + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + + for (auto tv : affected_tensors) { + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({1000}, options); + + auto t1 = t0 * 0.5; + auto t2 = t1 * -1.0; + auto t3 = t1 * -2.0; + auto t4 = t2 + t3; + auto t5 = t4 * 5.0; + auto t6 = t1 * 6.0; + + at::Tensor kernel_tv3 = at::empty_like(t0, options); + at::Tensor kernel_tv4 = at::empty_like(t0, options); + at::Tensor kernel_tv5 = at::empty_like(t0, options); + at::Tensor kernel_tv6 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv3, kernel_tv4, kernel_tv5, kernel_tv6}); + + TORCH_CHECK(at::allclose(kernel_tv3, t3)); + TORCH_CHECK(at::allclose(kernel_tv4, t4)); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); + TORCH_CHECK(at::allclose(kernel_tv6, t6)); +} + void testGPU_FusionScalarInputs() { Fusion fusion; FusionGuard fg(&fusion); diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 0f3fac2077409..63d8006c172ff 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -153,6 +153,11 @@ namespace jit { _(GPU_FusionCompoundOps) \ _(GPU_FusionCastOps) \ _(GPU_FusionAdvancedComputeAt) \ + _(GPU_FusionComputeAtMultiConsumers) \ + _(GPU_FusionComputeAtCommonConsumer1) \ + _(GPU_FusionComputeAtCommonConsumer2) \ + _(GPU_FusionComputeAtCommonConsumer3) \ + _(GPU_FusionComputeAtNoCommonConsumer) \ _(GPU_FusionScalarInputs) \ _(GPU_FusionRFactorReplay) \ _(GPU_FusionReduction) \ diff --git a/torch/csrc/jit/codegen/cuda/compute_at.cpp b/torch/csrc/jit/codegen/cuda/compute_at.cpp index 4780d699d5546..d0ee8f10e04c8 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at.cpp +++ b/torch/csrc/jit/codegen/cuda/compute_at.cpp @@ -20,11 +20,10 @@ ComputeAtData::ComputeAtData(TensorView* tv) void ComputeAtData::clearPass() { // If the last pass set a position, update the new_compute_at_position if // latest position would be greater than previously set. - auto pass_pos = current_traversal_position_set ? current_traversal_position - : new_compute_at_position; - - new_compute_at_position = - pass_pos > new_compute_at_position ? pass_pos : new_compute_at_position; + if (current_traversal_position_set && + current_traversal_position > new_compute_at_position) { + new_compute_at_position = current_traversal_position; + } current_traversal_position_set = false; current_traversal_position = 0; @@ -52,13 +51,14 @@ void ComputeAtData::setPassPosition(unsigned int pos) { } unsigned int ComputeAtData::getNewPosition() const { - // If the last pass set a position, update the new_compute_at_position if - // latest position would be greater than previously set. - auto pass_pos = current_traversal_position_set ? 
current_traversal_position - : new_compute_at_position; - - return pass_pos > new_compute_at_position ? pass_pos - : new_compute_at_position; + // If the last pass set a position, return the latest position if + // it would be greater than previously set. + if (current_traversal_position_set && + current_traversal_position > new_compute_at_position) { + return current_traversal_position; + } else { + return new_compute_at_position; + } } void ComputeAtData::validateNewComputeAt() const { @@ -174,6 +174,9 @@ void ComputeAt::run( // Check all dependency chains, select the next TV after producer towards // consumer. These are the TVs we're going to actually call computeAt on. for (const auto& tv_chain : all_chains) { + // When a chain only has two tensors, they must be the producer, + // which is an input, and the consumer. There is nothing we need + // to do for such chains. if (tv_chain.size() > 2) { // Make sure we only add once, but we want to add in a determinsitic // order @@ -435,9 +438,6 @@ ComputeAt::ComputeAt( : producer_(_producer), consumer_(_consumer), consumer_position_(_consumer_position) { - if (consumer_position_ < 0) - consumer_position_ += consumer_->nDims(); - TORCH_INTERNAL_ASSERT( consumer_position_ >= 0 && consumer_position_ <= consumer_->nDims(), "Invalid computeAt axis, received ", diff --git a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp index 81178139e450c..11482113e0f9f 100644 --- a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp @@ -182,6 +182,7 @@ void IRPrinter::handle(const TensorView* tv) { void IRPrinter::handle(const IterDomain* id) { os << id->getIterType(); os << id->getParallelType(); + os << id->name(); os << "{"; if (!id->start()->isZeroInt()) { print_inline(id->start()); @@ -359,6 +360,7 @@ void IRPrinter::handle(const kir::NamedScalar* i) { void IRPrinter::handle(const kir::IterDomain* id) { os << id->getIterType(); os << id->getParallelType(); + os << id->name(); os << "{"; if (!id->start()->isZeroInt()) { print_inline(id->start()); diff --git a/torch/csrc/jit/codegen/cuda/transform_iter.h b/torch/csrc/jit/codegen/cuda/transform_iter.h index e3cdab856366c..161fa547680e4 100644 --- a/torch/csrc/jit/codegen/cuda/transform_iter.h +++ b/torch/csrc/jit/codegen/cuda/transform_iter.h @@ -154,6 +154,8 @@ class TORCH_CUDA_API BestEffortReplay { size_t counter = 0; public: + // replay_map: mapping of target root domains to corresponding + // replay root domains BestEffortReplay( const std::vector& replay_domain, const std::vector& target_domain, From 0fbfa908e50c9a569de2a98ef1279b15dd3d5327 Mon Sep 17 00:00:00 2001 From: Kevin Stephano Date: Fri, 28 Aug 2020 14:33:57 -0700 Subject: [PATCH 017/167] Fix Inner Dimension Reductions for FP16 to perform just as well as TI. (#333) Add Executor method to compile from a string for debug usage. Fix Reduction Scheduler to have TI level perf for FP16 inner dimension reductions. Fix tests to use randn() so large reductions aren't matching on inf. 
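To go with the new compile-from-string entry point, a minimal usage sketch follows (not part of the patch; the kernel name string and the id are placeholders, and includes are elided as in the surrounding tests):

  // Compile a hand-edited CUDA source string instead of the generated kernel,
  // then run it through the normal executor path.
  void debugRun(
      Fusion* fusion,
      const std::string& edited_cuda_src,
      const std::vector<at::IValue>& aten_inputs) {
    torch::jit::fuser::cuda::FusionExecutor fe;
    // The id is an arbitrary tag; the name must match the kernel entry point
    // declared inside the source string.
    fe.compileFusionFromStr(fusion, edited_cuda_src, "CudaCodeGen::kernel1", 1);
    auto outputs = fe.runFusion(aten_inputs); // launch params are still inferred
  }

Per the diff, setting PYTORCH_CUDA_FUSER_DEBUG=1 makes the executor echo the source it actually compiles, which is presumably how one would grab a kernel to edit by hand before feeding it back in.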
--- test/cpp/jit/test_gpu.cpp | 18 ++--- torch/csrc/jit/codegen/cuda/executor.cpp | 25 +++++++ torch/csrc/jit/codegen/cuda/executor.h | 6 ++ torch/csrc/jit/codegen/cuda/scheduler.cpp | 83 ++++++++--------------- torch/csrc/jit/codegen/cuda/scheduler.h | 7 +- 5 files changed, 73 insertions(+), 66 deletions(-) diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index fcf62c718f814..334c458d07c10 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -5008,7 +5008,7 @@ void testGPU_FusionReductionScheduler() { const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::rand({bid_x, tid_x}, options); + at::Tensor input = at::randn({bid_x, tid_x}, options); // Apply reduction heuristic const at::ArrayRef inputs({input}); @@ -5024,7 +5024,7 @@ void testGPU_FusionReductionScheduler() { auto aten_output = input.sum({red_dim}); TORCH_CHECK( - aten_output.allclose(outputs[0]), + aten_output.allclose(outputs[0], 1e-04, 1e-04), "Error of: ", aten_output.sub(outputs[0]).abs().max()); } @@ -5100,7 +5100,7 @@ void testGPU_FusionReductionSchedulerMultiDimNonFastest() { const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::rand(tensor_dims_in, options); + at::Tensor input = at::randn(tensor_dims_in, options); at::Tensor cg_output = at::empty(tensor_dims_out, options); // Apply reduction heuristic @@ -5117,7 +5117,7 @@ void testGPU_FusionReductionSchedulerMultiDimNonFastest() { auto aten_output = input.sum(red_dims64); TORCH_CHECK( - aten_output.allclose(outputs[0]), + aten_output.allclose(outputs[0], 1e-04, 1e-04), "Error of: ", aten_output.sub(outputs[0]).abs().max()); } @@ -5142,7 +5142,7 @@ void testGPU_FusionReductionSchedulerMultiDimFastest() { const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::rand(tensor_dims_in, options); + at::Tensor input = at::randn(tensor_dims_in, options); TORCH_CHECK( cuda::scheduleReduction(&fusion, {input}, tv1), @@ -5155,7 +5155,7 @@ void testGPU_FusionReductionSchedulerMultiDimFastest() { auto aten_output = input.sum(red_dims64); TORCH_CHECK( - aten_output.allclose(outputs[0]), + aten_output.allclose(outputs[0], 1e-05, 1e-05), "Error of: ", aten_output.sub(outputs[0]).abs().max()); } @@ -5205,8 +5205,8 @@ void testGPU_FusionReductionSchedulerDimShmoo() { .dtype((fp16 ? at::kHalf : at::kFloat)) .device(at::kCUDA, 0); at::Tensor input = - (axis ? at::rand({odim, rdim}, options) - : at::rand({rdim, odim}, options)); + (axis ? 
at::randn({odim, rdim}, options) + : at::randn({rdim, odim}, options)); const at::ArrayRef inputs({input}); @@ -5236,7 +5236,7 @@ void testGPU_FusionReductionSchedulerDimShmoo() { auto aten_output = input.sum({axis}); TORCH_CHECK( - aten_output.allclose(cg_output[0]), + aten_output.allclose(cg_output[0], 1e-03, 1e-03), "Error of: ", aten_output.sub(cg_output[0]).abs().max()); } diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp index f2582b48e4f96..584b770b05b22 100644 --- a/torch/csrc/jit/codegen/cuda/executor.cpp +++ b/torch/csrc/jit/codegen/cuda/executor.cpp @@ -36,6 +36,31 @@ std::string FusionExecutor::getStructuredCode(const std::string& kernel) { return code; } +void FusionExecutor::compileFusionFromStr( + Fusion* fusion, + const std::string& code, + const std::string& name, + int id, + CompileOptions options) { + fusion_ = *fusion; + FusionGuard fg(&fusion_); + options_ = options; + + const char* debug_env = getenv("PYTORCH_CUDA_FUSER_DEBUG"); + if (debug_env && atoi(debug_env)) { + std::cout << "\n==== codegen output for kernel: " << kernelName() + << " ====" << std::endl + << code << std::endl + << "=====*===============================" << std::endl; + } + + fusion_id_ = id; + has_random_ = fusion->hasRNG(); + lowered_ = GpuLower(&fusion_); + compiled_kernel_ = executor_utils::nvrtcCompile(code, name, fusion_id_); + compiled_ = true; +} + void FusionExecutor::compileFusion(Fusion* fusion, CompileOptions options) { TORCH_INTERNAL_ASSERT( !fusion->outputs().empty(), "No output found for this kernel, aborting."); diff --git a/torch/csrc/jit/codegen/cuda/executor.h b/torch/csrc/jit/codegen/cuda/executor.h index 86a70fc27f73e..3b621d2338794 100644 --- a/torch/csrc/jit/codegen/cuda/executor.h +++ b/torch/csrc/jit/codegen/cuda/executor.h @@ -23,6 +23,12 @@ struct TORCH_CUDA_API CompileOptions { class TORCH_CUDA_API FusionExecutor : public NonCopyable { public: + void compileFusionFromStr( + Fusion* fusion, + const std::string& code, + const std::string& name, + int id, + CompileOptions options = CompileOptions()); void compileFusion(Fusion* fusion, CompileOptions options = CompileOptions()); std::vector runFusion( diff --git a/torch/csrc/jit/codegen/cuda/scheduler.cpp b/torch/csrc/jit/codegen/cuda/scheduler.cpp index 5a1611d157785..b284c7b1a9832 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler.cpp @@ -239,7 +239,10 @@ ReductionParams reductionHeuristic( // Is fastest dimension a reduction dimension? if (rparams.fastest_dim) { - bdimx = red_elems; + if (red_elems < rparams.loop_unroll) { + rparams.loop_unroll = 1; + } + bdimx = ceilDiv(red_elems, rparams.loop_unroll); bdimy = red_outputs; } else { bdimx = red_outputs; @@ -426,22 +429,12 @@ c10::optional scheduleReduction( // Do multiple reductions per block if (rparams.mul_reds_per_blk) { // Reduction Splits - // [outputs, |rF-Leftover, rf-Unroll, X-Warp|] - // Idx: 0 | 1(-1) 2(-2) 3(-1) | + // [outputs, |rF-Leftover, X-Warp, rf-Unroll|] + // Idx: 0 | 1(-1) 2(-2) 3(-1) | // -------------------------------- // Reduction Dimensions + red_tv->split(1, rparams.loop_unroll); red_tv->split(1, rparams.lparams.bdimx()); - red_tv->split(1, kLoopUnrollSplit); - - // Reordering the Unroll dimension eases applying computeAt() - // for preceeding operations and the rFactored Tensor. 
- // |- Reordered -| - // V V - // [outputs, |rF-Leftover, X-Warp, rF-Unroll|] - // Idx: 0 | 1(-3) 2(-2) 3(-1) | - // -------------------------------- - // Reduction Dimensions - red_tv->reorder({{-1, -2}, {-2, -1}}); // Output Splits // [|Out-Leftover, Out-PerBlock|, ] @@ -454,8 +447,8 @@ c10::optional scheduleReduction( // WARNING: computeAt will coalesce the rFactored dimensions // rFactored Reduction Tensor after computeAt(): - // [, |X-Warp, rF-Leftover, rF-Unroll|] - // Idx: 0 -- 1 | 2(-3) 3(-2) 4(-1) | + // [, | rF-Leftover, X-Warp, rF-Unroll|] + // Idx: 0 -- 1 | 2(-3) 3(-2) 4(-1) | // --------------------------------- // Reduction Dimensions red_tv_rf->computeAt(red_tv, -1); @@ -481,47 +474,37 @@ c10::optional scheduleReduction( } else { if (rparams.cross_grid) { // Reduction Splits - // [outputs, |rF-Leftover, rf-Unroll, X-Grid, X-Block, X-Warp|] - // Idx: 0 | 1(-5) 2(-4) 3(-3) 4(-2) 5(-1) | + // [outputs, |rF-Leftover, X-Grid, X-Block, X-Warp, rf-Unroll|] + // Idx: 0 | 1(-5) 2(-4) 3(-3) 4(-2) 5(-1) | // ------------------------------------------------- // Reduction Dimensions + red_tv->split(1, rparams.loop_unroll); red_tv->split(1, rparams.lparams.bdimx()); red_tv->split(1, rparams.lparams.bdimy()); red_tv->split(1, rparams.lparams.gdimy()); - red_tv->split(1, kLoopUnrollSplit); - - // Reordering the Unroll dimension eases applying computeAt() - // for preceeding operations and the rFactored Tensor. - // |------ Reordered --------| - // V V - // [outputs, |rF-Leftover, X-Warp, X-Grid, X-Block, rf-Unroll|] - // Idx: 0 | 1(-5) 2(-4) 3(-3) 4(-2) 5(-1) | - // ------------------------------------------------- - // Reduction Dimensions - red_tv->reorder({{-1, -4}, {-4, -1}}); auto red_tv_rf = red_tv->rFactor( {-5, -1}); // NOLINT(cppcoreguidelines-avoid-magic-numbers) // WARNING: computeAt will coalesce the rFactored dimensions // rFactored Reduction Tensor after computeAt(): - // [Outputs, |X-Warp, X-Grid, X-Block, rF-Leftover, rF-Unroll|] - // Idx: 0 | 1(-5) 2(-4) 3(-3) 4(-2) 5(-1) | + // [Outputs, |X-Grid, X-Block, X-Warp, rF-Leftover, rF-Unroll|] + // Idx: 0 | 1(-5) 2(-4) 3(-3) 4(-2) 5(-1) | // ------------------------------------------------- // Reduction Dimensions red_tv_rf->computeAt(red_tv, -1); // After the Reduction Tensor has rFactoring applied // Reduction Output Tensor: - // [Outputs, X-Warp, X-Grid, X-Block] - // Idx: 0 1(-3) 2(-2) 3(-1) + // [Outputs, X-Grid, X-Block, X-Warp] + // Idx: 0 1(-3) 2(-2) 3(-1) red_tv_rf->axis(-1)->parallelize(ParallelType::Unroll); red_tv->axis(0)->parallelize(ParallelType::BIDx); - red_tv->axis(-3)->parallelize(ParallelType::TIDx); + red_tv->axis(-1)->parallelize(ParallelType::TIDx); red_tv->axis(-2)->parallelize(ParallelType::BIDy); - red_tv->axis(-1)->parallelize(ParallelType::TIDy); + red_tv->axis(-3)->parallelize(ParallelType::TIDy); // Bind Inputs to Reduction for (auto input : fusion->inputsOf(red_tv_rf)) { @@ -531,29 +514,19 @@ c10::optional scheduleReduction( } } else { // Reduction Splits - // [outputs, |rF-Leftover, rf-Unroll, X-Block, X-Warp|] - // Idx: 0 | 1(-4) 2(-3) 3(-2) 4(-1) | + // [outputs, |rF-Leftover, X-Block, X-Warp, rf-Unroll|] + // Idx: 0 | 1(-4) 2(-3) 3(-2) 4(-1) | // ----------------------------------------- // Reduction Dimensions + red_tv->split(1, rparams.loop_unroll); red_tv->split(1, rparams.lparams.bdimx()); red_tv->split(1, rparams.lparams.bdimy()); - red_tv->split(1, kLoopUnrollSplit); - - // Reordering the Unroll dimension eases applying computeAt() - // for preceeding operations and the rFactored Tensor. 
- // |--- Reordered ----| - // V V - // [outputs, |rF-Leftover, X-Warp, X-Block, rF-Unroll|] - // Idx: 0 | 1(-4) 2(-3) 3(-2) 4(-1) | - // ----------------------------------------- - // Reduction Dimensions - red_tv->reorder({{-1, -3}, {-3, -1}}); auto red_tv_rf = red_tv->rFactor({-4, -1}); // WARNING: computeAt will coalesce the rFactored dimensions // rFactored Reduction Tensor after computeAt(): - // [Outputs, |X-Warp, X-Block, rF-Leftover, rF-Unroll|] + // [Outputs, |X-Block, X-Warp, rF-Leftover, rF-Unroll|] // Idx: 0 | 1(-4) 2(-3) 3(-2) 4(-1) | // ----------------------------------------- // Reduction Dimensions @@ -561,14 +534,14 @@ c10::optional scheduleReduction( // After the Reduction Tensor has rFactoring applied // Reduction Output Tensor: - // [Outputs, X-Warp, X-Block] - // Idx: 0 1(-2) 2(-1) + // [Outputs, X-Block, X-Warp] + // Idx: 0 1(-2) 2(-1) red_tv_rf->axis(-1)->parallelize(ParallelType::Unroll); red_tv->axis(0)->parallelize(ParallelType::BIDx); - red_tv->axis(-2)->parallelize(ParallelType::TIDx); - red_tv->axis(-1)->parallelize(ParallelType::TIDy); + red_tv->axis(-1)->parallelize(ParallelType::TIDx); + red_tv->axis(-2)->parallelize(ParallelType::TIDy); // Bind Inputs to Reduction for (auto input : fusion->inputsOf(red_tv_rf)) { @@ -625,7 +598,7 @@ c10::optional scheduleReduction( red_tv_rf->axis(-1)->parallelize(ParallelType::Unroll); red_tv->axis(0)->parallelize(ParallelType::BIDx); - red_tv->axis(1)->parallelize(ParallelType::TIDx); + red_tv->axis(-3)->parallelize(ParallelType::TIDx); red_tv->axis(-2)->parallelize(ParallelType::TIDy); red_tv->axis(-1)->parallelize(ParallelType::BIDy); @@ -679,7 +652,7 @@ c10::optional scheduleReduction( red_tv_rf->axis(-1)->parallelize(ParallelType::Unroll); red_tv->axis(0)->parallelize(ParallelType::BIDx); - red_tv->axis(1)->parallelize(ParallelType::TIDx); + red_tv->axis(-2)->parallelize(ParallelType::TIDx); red_tv->axis(-1)->parallelize(ParallelType::TIDy); // Bind Inputs to Reduction diff --git a/torch/csrc/jit/codegen/cuda/scheduler.h b/torch/csrc/jit/codegen/cuda/scheduler.h index 2b35b6586f305..ce732391b543f 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler.h +++ b/torch/csrc/jit/codegen/cuda/scheduler.h @@ -24,12 +24,15 @@ struct ReductionParams { bool cross_grid = false; bool mul_reds_per_blk = false; + int loop_unroll = 4; + LaunchParams lparams; bool operator==(const ReductionParams& other) const { bool attr_equal = other.fastest_dim == fastest_dim && other.cross_block == cross_block && other.cross_grid == cross_grid && - other.mul_reds_per_blk == mul_reds_per_blk; + other.mul_reds_per_blk == mul_reds_per_blk && + other.loop_unroll == loop_unroll; return attr_equal && lparams == other.lparams; } }; @@ -38,7 +41,7 @@ class ReductionParamsHash { public: size_t operator()(const ReductionParams& rp) const { size_t lp_hash = rp.lparams.gdimx() ^ rp.lparams.gdimy() ^ - rp.lparams.bdimx() ^ rp.lparams.bdimy(); + rp.lparams.bdimx() ^ rp.lparams.bdimy() ^ rp.loop_unroll; constexpr size_t bits = sizeof(std::size_t) * 8; size_t attr_hash = static_cast(rp.fastest_dim) << (bits - 1) | static_cast(rp.cross_block) << (bits - 2) | From c68fba885fd0b346c304b918655124ffdae06b31 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 31 Aug 2020 18:07:34 -0400 Subject: [PATCH 018/167] Change pointwise scheduling to not generate multiple unrolled loops. 
(#338) --- torch/csrc/jit/codegen/cuda/scheduler.cpp | 28 ++++++++++++++--------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler.cpp b/torch/csrc/jit/codegen/cuda/scheduler.cpp index b284c7b1a9832..3dac8e65f7e41 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler.cpp @@ -15,7 +15,7 @@ namespace jit { namespace fuser { namespace cuda { -constexpr int kUnrollFactor = 4; +constexpr int kUnrollFactor = 1; namespace { @@ -173,9 +173,11 @@ bool scheduleFusion(Fusion* fusion, const at::ArrayRef inputs) { TensorView* out_tv = output->as(); for (Val* inp : fusion->inputsOf(output)) { if (inp->getValType().value() == ValType::TensorView) - inp->as()->computeAt(out_tv, 1); + inp->as()->computeAt(out_tv, -1); } out_tv->axis(0)->parallelize(ParallelType::BIDx); + out_tv->axis(1)->parallelize(ParallelType::Unroll); + out_tv->axis(2)->parallelize(ParallelType::TIDx); } // Run through all values, unroll, and bind their axes @@ -185,15 +187,19 @@ bool scheduleFusion(Fusion* fusion, const at::ArrayRef inputs) { continue; TensorView* tv = val->as(); - // Should be true for all intermediates, but if one isn't hooked - // up right, skip it and hope for the best for now - if (!disable_unroll && tv->nDims() == 3) { - tv->axis(-2)->parallelize(ParallelType::Unroll); - tv->axis(-1)->parallelize(ParallelType::TIDx); - } else { - if (tv->nDims() == 2) - tv->axis(-1)->parallelize(ParallelType::TIDx); - } + // Disabling below as currently unrolling doesn't make a lot of sense as + // we don't extract global loads/reads out of intermediate logic. + // + // Below check should be true for all intermediates, but if one isn't + // hooked up right, skip it and hope for the best for now + // + // if (!disable_unroll && tv->nDims() == 3) { + // tv->axis(-2)->parallelize(ParallelType::Unroll); + // tv->axis(-1)->parallelize(ParallelType::TIDx); + // } else { + // if (tv->nDims() == 2) + // tv->axis(-1)->parallelize(ParallelType::TIDx); + // } } TensorView* out0 = fusion->outputs()[0]->as(); int ndim = (int)out0->nDims(); From 4194f49486b4efafbf0dcbfd3282724c195071dc Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 31 Aug 2020 19:03:00 -0400 Subject: [PATCH 019/167] Move IterVisitor derived classes from fusion.h to iter_visitor.h (#339) Move IterVisitor derived classes from fusion.h to iter_visitor.h --- torch/csrc/jit/codegen/cuda/fusion.cpp | 30 +-------- torch/csrc/jit/codegen/cuda/fusion.h | 35 ---------- .../jit/codegen/cuda/ir_interface_nodes.h | 1 + torch/csrc/jit/codegen/cuda/iter_visitor.cpp | 55 ++++++++------- torch/csrc/jit/codegen/cuda/iter_visitor.h | 67 ++++++++++++++++--- torch/csrc/jit/codegen/cuda/lower_loops.cpp | 1 + torch/csrc/jit/codegen/cuda/lower_utils.cpp | 1 + .../jit/codegen/cuda/lower_validation.cpp | 2 +- torch/csrc/jit/codegen/cuda/type.h | 8 +++ 9 files changed, 98 insertions(+), 102 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/fusion.cpp b/torch/csrc/jit/codegen/cuda/fusion.cpp index 381695cd27ab9..1b71e6a168d60 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.cpp +++ b/torch/csrc/jit/codegen/cuda/fusion.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -25,35 +26,6 @@ Fusion* FusionGuard::getCurFusion() { return ACTIVE_FUSION; } -void ExprSort::handle(Expr* expr) { - exprs.push_back(expr); -} - -std::vector ExprSort::getExprs(Fusion* fusion, bool from_outputs_only) { - ExprSort es; - es.traverse(fusion, from_outputs_only); - return es.exprs; -} - 
-std::vector ExprSort::getExprs( - Fusion* fusion, - const std::vector& from) { - ExprSort es; - es.traverseFrom(fusion, from, false); - return es.exprs; -} - -void InputsOf::handle(Val* v) { - if (FusionGuard::getCurFusion()->origin(v) == nullptr) - inputs.emplace(v); -} - -std::unordered_set InputsOf::output(Fusion* fusion, Val* output_) { - InputsOf io; - io.traverseFrom(FusionGuard::getCurFusion(), {output_}, false); - return io.inputs; -} - void swap(Fusion& a, Fusion& b) noexcept { using std::swap; diff --git a/torch/csrc/jit/codegen/cuda/fusion.h b/torch/csrc/jit/codegen/cuda/fusion.h index d7dd74070ca99..efd957ec2ecd2 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.h +++ b/torch/csrc/jit/codegen/cuda/fusion.h @@ -4,7 +4,6 @@ #include #include -#include #include #include @@ -14,14 +13,6 @@ namespace torch { namespace jit { namespace fuser { -// https://stackoverflow.com/questions/18837857/cant-use-enum-class-as-unordered-map-key -struct TypeHash { - template - std::size_t operator()(T t) const { - return static_cast(t); - } -}; - /* * Usage: FusionGuard and Fusion are required user interfaces for any operation * underlying the code generator. In order to create values, expressions, and @@ -65,32 +56,6 @@ class TORCH_CUDA_API FusionGuard { static Fusion* getCurFusion(); }; -// Expr sort will take a fusion and return a topologically sorted list of -// expressions. -class ExprSort : public IterVisitor { - private: - std::vector exprs; - - void handle(Expr* expr) override; - - public: - static std::vector getExprs(Fusion* fusion, bool from_outputs_only); - - static std::vector getExprs( - Fusion* fusion, - const std::vector& from); -}; - -class InputsOf : public IterVisitor { - private: - std::unordered_set inputs; - - void handle(Val* v) final; - - public: - static std::unordered_set output(Fusion* fusion, Val* output_); -}; - /* * Fusion is mutable but unique. Nodes cannot be copied in any way from one * Fusion to another. 
If anything like that is desired, it would require diff --git a/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h b/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h index 5e1ebf3f5bfe3..d7701ef75e125 100644 --- a/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h +++ b/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h @@ -4,6 +4,7 @@ #include #include +#include #include diff --git a/torch/csrc/jit/codegen/cuda/iter_visitor.cpp b/torch/csrc/jit/codegen/cuda/iter_visitor.cpp index 198643414a09e..3aeffc96fa330 100644 --- a/torch/csrc/jit/codegen/cuda/iter_visitor.cpp +++ b/torch/csrc/jit/codegen/cuda/iter_visitor.cpp @@ -10,32 +10,6 @@ namespace fuser { /* ITER VISITOR */ -std::vector IterVisitor::next(Statement* stmt) { - if (stmt->isVal()) { - return next(stmt->as()); - } else if (stmt->isExpr()) { - return next(stmt->as()); - } else { - TORCH_INTERNAL_ASSERT( - false, "IterVisitor could not detect type in next_dispatch."); - } -} - -std::vector IterVisitor::next(Val* v) { - FusionGuard::getCurFusion()->assertInFusion(v, "Cannot traverse val, "); - if (FusionGuard::getCurFusion()->origin(v) != nullptr) { - return {FusionGuard::getCurFusion()->origin(v)}; - } - return {}; -} - -std::vector IterVisitor::next(Expr* expr) { - FusionGuard::getCurFusion()->assertInFusion(expr, "Cannot traverse expr, "); - std::vector next_stmts{expr->inputs().begin(), - expr->inputs().end()}; - return next_stmts; -} - namespace { // Remove any stmt in stmts that is in visited @@ -496,6 +470,35 @@ std::unordered_set DependencyCheck::getAllValsBetween( return Dependencies::getAllVals(dependencies, of); } +void ExprSort::handle(Expr* expr) { + exprs.push_back(expr); +} + +std::vector ExprSort::getExprs(Fusion* fusion, bool from_outputs_only) { + ExprSort es; + es.traverse(fusion, from_outputs_only); + return es.exprs; +} + +std::vector ExprSort::getExprs( + Fusion* fusion, + const std::vector& from) { + ExprSort es; + es.traverseFrom(fusion, from, false); + return es.exprs; +} + +void InputsOf::handle(Val* v) { + if (FusionGuard::getCurFusion()->origin(v) == nullptr) + inputs.emplace(v); +} + +std::unordered_set InputsOf::output(Fusion* fusion, Val* output_) { + InputsOf io; + io.traverseFrom(FusionGuard::getCurFusion(), {output_}, false); + return io.inputs; +} + } // namespace fuser } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/iter_visitor.h b/torch/csrc/jit/codegen/cuda/iter_visitor.h index ec08df28a89f4..a51eae88d243f 100644 --- a/torch/csrc/jit/codegen/cuda/iter_visitor.h +++ b/torch/csrc/jit/codegen/cuda/iter_visitor.h @@ -4,6 +4,11 @@ #include +#include +#include +#include +#include + #include #include #include @@ -12,14 +17,6 @@ namespace torch { namespace jit { namespace fuser { -class Statement; -class Val; -class Expr; - -class Fusion; - -enum class ValType; - /* * IterVisitor starts from leaf nodes, fusion outputs, or the provided values. * It walks the DAG bacwkards from the starting nodes, to roots. Each node in @@ -49,9 +46,31 @@ class TORCH_CUDA_API IterVisitor : public OptOutDispatch { // These functions will start at outputs and propagate up through the DAG // to inputs based on depth first traversal. Next could be called on a node // multiple times. 
- virtual std::vector next(Statement* stmt); - virtual std::vector next(Expr* expr); - virtual std::vector next(Val* v); + virtual std::vector next(Statement* stmt) { + if (stmt->isVal()) { + return next(stmt->as()); + } else if (stmt->isExpr()) { + return next(stmt->as()); + } else { + TORCH_INTERNAL_ASSERT( + false, "IterVisitor could not detect type in next_dispatch."); + } + } + + virtual std::vector next(Val* v) { + FusionGuard::getCurFusion()->assertInFusion(v, "Cannot traverse val, "); + if (FusionGuard::getCurFusion()->origin(v) != nullptr) { + return {FusionGuard::getCurFusion()->origin(v)}; + } + return {}; + } + + virtual std::vector next(Expr* expr) { + FusionGuard::getCurFusion()->assertInFusion(expr, "Cannot traverse expr, "); + std::vector next_stmts{expr->inputs().begin(), + expr->inputs().end()}; + return next_stmts; + } // This handle functions is called on every Statement* in topological order, // starting from outputs to inputs. @@ -214,6 +233,32 @@ class TORCH_CUDA_API DependencyCheck { const std::vector& of); }; +// Expr sort will take a fusion and return a topologically sorted list of +// expressions. +class ExprSort : public IterVisitor { + private: + std::vector exprs; + + void handle(Expr* expr) override; + + public: + static std::vector getExprs(Fusion* fusion, bool from_outputs_only); + + static std::vector getExprs( + Fusion* fusion, + const std::vector& from); +}; + +class InputsOf : public IterVisitor { + private: + std::unordered_set inputs; + + void handle(Val* v) final; + + public: + static std::unordered_set output(Fusion* fusion, Val* output_); +}; + } // namespace fuser } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_loops.cpp b/torch/csrc/jit/codegen/cuda/lower_loops.cpp index b27ef32c2207c..761c51d95b39e 100644 --- a/torch/csrc/jit/codegen/cuda/lower_loops.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_loops.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/lower_utils.cpp b/torch/csrc/jit/codegen/cuda/lower_utils.cpp index 1393d2ffb5bef..a24aaa77a7f5c 100644 --- a/torch/csrc/jit/codegen/cuda/lower_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_utils.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/lower_validation.cpp b/torch/csrc/jit/codegen/cuda/lower_validation.cpp index 593d6172c9887..7bb867100285a 100644 --- a/torch/csrc/jit/codegen/cuda/lower_validation.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_validation.cpp @@ -1,5 +1,5 @@ - #include +#include #include #include #include diff --git a/torch/csrc/jit/codegen/cuda/type.h b/torch/csrc/jit/codegen/cuda/type.h index a1f2e412a5001..bb60fb2e0d15d 100644 --- a/torch/csrc/jit/codegen/cuda/type.h +++ b/torch/csrc/jit/codegen/cuda/type.h @@ -13,6 +13,14 @@ namespace torch { namespace jit { namespace fuser { +// https://stackoverflow.com/questions/18837857/cant-use-enum-class-as-unordered-map-key +struct TypeHash { + template + std::size_t operator()(T t) const { + return static_cast(t); + } +}; + // Order of strength enum class ValType { TensorDomain, From 339e629b7f5c774cb73598acff0c846f669895a9 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 31 Aug 2020 19:27:58 -0400 Subject: [PATCH 020/167] Update fusion parser test, remove printing from common consumer tests. 
(#341) --- test/cpp/jit/test_gpu.cpp | 46 ++++++++++++++------------------------- 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index 334c458d07c10..8829ea249d748 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -1116,34 +1116,27 @@ void testGPU_FusionParser() { // 2. use a fuzzy compare (ignore non-significant whitespaces for example) const std::string expected_kernel = R"( __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Tensor T3){ - float T2[4]; - if ( ( ( ( ( ( blockIdx.x * 4 ) + ( 4 - 1 ) ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - for(size_t i6 = 0; i6 < 4; ++i6 ) { + float T2[1]; + if ( ( ( ( ( ( blockIdx.x * 1 ) + ( 1 - 1 ) ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { + for(size_t i6 = 0; i6 < 1; ++i6 ) { T2[ i6 ] - = T0[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ] - * T1[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ]; + = T0[ ( ( ( ( blockIdx.x * 1 ) + i6 ) * 128 ) + threadIdx.x ) ] + * T1[ ( ( ( ( blockIdx.x * 1 ) + i6 ) * 128 ) + threadIdx.x ) ]; + T3[ ( ( ( ( blockIdx.x * 1 ) + i6 ) * 128 ) + threadIdx.x ) ] + = T2[ i6 ] + * T0[ ( ( ( ( blockIdx.x * 1 ) + i6 ) * 128 ) + threadIdx.x ) ]; } } else { - for(size_t i6 = 0; i6 < 4; ++i6 ) { - if ( ( ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { + for(size_t i6 = 0; i6 < 1; ++i6 ) { + if ( ( ( ( ( ( blockIdx.x * 1 ) + i6 ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { T2[ i6 ] - = T0[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ] - * T1[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ]; + = T0[ ( ( ( ( blockIdx.x * 1 ) + i6 ) * 128 ) + threadIdx.x ) ] + * T1[ ( ( ( ( blockIdx.x * 1 ) + i6 ) * 128 ) + threadIdx.x ) ]; } - } - } - if ( ( ( ( ( ( blockIdx.x * 4 ) + ( 4 - 1 ) ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - for(size_t i13 = 0; i13 < 4; ++i13 ) { - T3[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ] - = T2[ i13 ] - * T0[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ]; - } - } else { - for(size_t i13 = 0; i13 < 4; ++i13 ) { - if ( ( ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - T3[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ] - = T2[ i13 ] - * T0[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ]; + if ( ( ( ( ( ( blockIdx.x * 1 ) + i6 ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { + T3[ ( ( ( ( blockIdx.x * 1 ) + i6 ) * 128 ) + threadIdx.x ) ] + = T2[ i6 ] + * T0[ ( ( ( ( blockIdx.x * 1 ) + i6 ) * 128 ) + threadIdx.x ) ]; } } } @@ -1803,9 +1796,6 @@ void testGPU_FusionComputeAtCommonConsumer1() { computeAtTarget->split(0, 128); tv1->computeAt(computeAtTarget, 1); - fusion.printMath(); - fusion.printKernel(); - TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4}; for (auto tv : affected_tensors) { TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); @@ -1886,8 +1876,6 @@ void testGPU_FusionComputeAtCommonConsumer2() { // consumer. 
tv1->computeAt(computeAtTarget, 1); - fusion.printKernel(); - // All tensors should have the same dimenionality as the target for (Val* val : fusion.vals()) { if (fusion.hasInput(val) || @@ -1977,8 +1965,6 @@ void testGPU_FusionComputeAtCommonConsumer3() { tv1->computeAt(computeAtTarget, 1); - fusion.printKernel(); - // All tensors should have the same dimenionality as the target for (Val* val : fusion.vals()) { if (fusion.hasInput(val) || From 2c1060ad117c54d2454aca99a4800b2d454933a7 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Mon, 31 Aug 2020 19:28:24 -0400 Subject: [PATCH 021/167] Cleanup of hasBlockBroadcast (#340) Implement hasBlockBroadcast like hasGrid/BlockReduction, cache results of these functions in executor during compilation. Improves average latency on LSTMCell 77.5us -> 20.5us. --- torch/csrc/jit/codegen/cuda/executor.cpp | 20 ++++++++++++------- torch/csrc/jit/codegen/cuda/executor.h | 10 ++++++++-- torch/csrc/jit/codegen/cuda/fusion.cpp | 13 ++++++++++++ torch/csrc/jit/codegen/cuda/fusion.h | 1 + .../jit/codegen/cuda/ir_interface_nodes.h | 1 + .../csrc/jit/codegen/cuda/ir_internal_nodes.h | 1 + torch/csrc/jit/codegen/cuda/ir_nodes.cpp | 6 ++++++ torch/csrc/jit/codegen/cuda/lower2device.cpp | 1 - torch/csrc/jit/codegen/cuda/lower2device.h | 7 ------- torch/csrc/jit/codegen/cuda/tensor_view.cpp | 4 ++++ 10 files changed, 47 insertions(+), 17 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp index 584b770b05b22..dca2cde534c3c 100644 --- a/torch/csrc/jit/codegen/cuda/executor.cpp +++ b/torch/csrc/jit/codegen/cuda/executor.cpp @@ -6,6 +6,7 @@ #include +#include #include #include #include @@ -36,7 +37,7 @@ std::string FusionExecutor::getStructuredCode(const std::string& kernel) { return code; } -void FusionExecutor::compileFusionFromStr( +void FusionExecutor::debugCompileFusionFromStr( Fusion* fusion, const std::string& code, const std::string& name, @@ -75,8 +76,16 @@ void FusionExecutor::compileFusion(Fusion* fusion, CompileOptions options) { FusionGuard fg(&fusion_); options_ = options; + TORCH_INTERNAL_ASSERT( + options.device.is_cuda(), "Provided device to CUDA fuser is the CPU."); + max_device_smem = + at::cuda::getDeviceProperties(options.device.index())->sharedMemPerBlock; + fusion_id_ = ++fusion_id_counter_; has_random_ = fusion->hasRNG(); + has_block_reductions = fusion_.hasBlockReduction(); + has_grid_reductions = fusion_.hasGridReduction(); + has_block_broadcasts = fusion_.hasBlockBroadcast(); lowered_ = GpuLower(&fusion_); const auto kernel = lowered_.getKernel(kernelName()); const auto structured_code = getStructuredCode(kernel); @@ -86,8 +95,7 @@ void FusionExecutor::compileFusion(Fusion* fusion, CompileOptions options) { unsigned static_smem_size = computeSharedMemory(evaluation_context, lowered_.static_allocations()); TORCH_INTERNAL_ASSERT( - static_smem_size < - at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock, + static_smem_size < max_device_smem, "The static shared memory allocation is larger than available memory."); } @@ -246,8 +254,7 @@ LaunchParams FusionExecutor::computeLaunchParams( // Calculate Dynamic Shared Memory Size // Add workspace for reduction and broadcast uint64_t reduction_broadcast_workspace = 0; - if (fusion_.hasBlockReduction() || fusion_.hasGridReduction() || - lowered_.hasBlockBroadcast()) { + if (has_block_reductions || has_grid_reductions || has_block_broadcasts) { // Not using nThreads here since it does not handle uninitialized value 
reduction_broadcast_workspace = dataTypeSize(fusion_.getMaximumSmemDataType()) * launch_params.bdimx() * @@ -261,8 +268,7 @@ LaunchParams FusionExecutor::computeLaunchParams( computeSharedMemory(ec, lowered_.static_allocations()); TORCH_INTERNAL_ASSERT( - (dynamic_smem_size + static_smem_size) < - at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock, + (dynamic_smem_size + static_smem_size) < max_device_smem, "The total shared memory allocation is larger than available memory."); launch_params.setSmem(dynamic_smem_size); diff --git a/torch/csrc/jit/codegen/cuda/executor.h b/torch/csrc/jit/codegen/cuda/executor.h index 3b621d2338794..e134f2869fd8d 100644 --- a/torch/csrc/jit/codegen/cuda/executor.h +++ b/torch/csrc/jit/codegen/cuda/executor.h @@ -23,7 +23,9 @@ struct TORCH_CUDA_API CompileOptions { class TORCH_CUDA_API FusionExecutor : public NonCopyable { public: - void compileFusionFromStr( + // Unsafe compilation that's useful for debugging kernels, iterating over + // slight modifications of a generated kernel + void debugCompileFusionFromStr( Fusion* fusion, const std::string& code, const std::string& name, @@ -82,8 +84,12 @@ class TORCH_CUDA_API FusionExecutor : public NonCopyable { Fusion fusion_; - CompileOptions options_; + bool has_block_reductions = false; + bool has_grid_reductions = false; + bool has_block_broadcasts = false; + CompileOptions options_; + size_t max_device_smem = std::numeric_limits().max(); executor_utils::NvrtcFunction compiled_kernel_; // State of the fusion that's important diff --git a/torch/csrc/jit/codegen/cuda/fusion.cpp b/torch/csrc/jit/codegen/cuda/fusion.cpp index 1b71e6a168d60..d26feb0772a5c 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.cpp +++ b/torch/csrc/jit/codegen/cuda/fusion.cpp @@ -558,6 +558,19 @@ bool Fusion::hasGridReduction() { return false; } +bool Fusion::hasBlockBroadcast() { + for (auto expr : exprs(true)) { + for (auto out : expr->outputs()) { + if (out->getValType() == ValType::TensorView) { + if (out->as()->hasBlockBroadcast()) { + return true; + } + } + } + } + return false; +} + bool Fusion::hasBroadcast() { for (auto expr : exprs(true)) for (auto out : expr->outputs()) diff --git a/torch/csrc/jit/codegen/cuda/fusion.h b/torch/csrc/jit/codegen/cuda/fusion.h index efd957ec2ecd2..52c12763f0e7c 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.h +++ b/torch/csrc/jit/codegen/cuda/fusion.h @@ -173,6 +173,7 @@ class TORCH_CUDA_API Fusion final { bool hasReduction(); bool hasBlockReduction(); bool hasGridReduction(); + bool hasBlockBroadcast(); bool hasBroadcast(); DataType getMaximumSmemDataType(); size_t gridReductionTempBufferSize(); diff --git a/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h b/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h index d7701ef75e125..737869a39fd65 100644 --- a/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h +++ b/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h @@ -225,6 +225,7 @@ class TORCH_CUDA_API TensorView : public Val { bool hasReduction() const; bool hasBlockReduction() const; bool hasGridReduction() const; + bool hasBlockBroadcast() const; bool hasBroadcast() const; bool hasRFactor() const; diff --git a/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h b/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h index 7fd760bc60dfa..7409430068eea 100644 --- a/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h +++ b/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h @@ -418,6 +418,7 @@ class TORCH_CUDA_API TensorDomain : public Val { bool hasReduction() const; bool hasBlockReduction() const; bool 
hasGridReduction() const; + bool hasBlockBroadcast() const; bool hasBroadcast() const; bool hasRFactor() const; diff --git a/torch/csrc/jit/codegen/cuda/ir_nodes.cpp b/torch/csrc/jit/codegen/cuda/ir_nodes.cpp index 27756751814e0..43d91c82534e5 100644 --- a/torch/csrc/jit/codegen/cuda/ir_nodes.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_nodes.cpp @@ -632,6 +632,12 @@ bool TensorDomain::hasGridReduction() const { }); } +bool TensorDomain::hasBlockBroadcast() const { + return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { + return id->isBroadcast() && id->isThreadDim(); + }); +} + bool TensorDomain::hasBroadcast() const { return no_bcast_domain_.size() != domain_.size(); } diff --git a/torch/csrc/jit/codegen/cuda/lower2device.cpp b/torch/csrc/jit/codegen/cuda/lower2device.cpp index 424ed4ae13386..6a8c4115ff048 100644 --- a/torch/csrc/jit/codegen/cuda/lower2device.cpp +++ b/torch/csrc/jit/codegen/cuda/lower2device.cpp @@ -219,7 +219,6 @@ void GpuLower::lower() { sync_allocations_ = be.getSyncAllocs(); dynamic_smem_allocations_ = be.getDynamicAllocs(); static_smem_allocations_ = be.getStaticAllocs(); - has_block_broadcast_ = be.hasBlockBroadcast(); } // Traverse through the fusion and print CUDA code associated with it diff --git a/torch/csrc/jit/codegen/cuda/lower2device.h b/torch/csrc/jit/codegen/cuda/lower2device.h index c9a8a283b0916..39630a334c69b 100644 --- a/torch/csrc/jit/codegen/cuda/lower2device.h +++ b/torch/csrc/jit/codegen/cuda/lower2device.h @@ -45,10 +45,6 @@ class TORCH_CUDA_API GpuLower { return static_smem_allocations_; } - bool hasBlockBroadcast() { - return has_block_broadcast_; - } - // Converts a Fusion IR value into the Kernel IR equivalent // // TODO(kir): revisit this interface @@ -85,9 +81,6 @@ class TORCH_CUDA_API GpuLower { // List of static shared memory buffers std::vector static_smem_allocations_; - // Check if kernel has shared memory broadcast op - bool has_block_broadcast_; - // Lowered IR std::vector lowered_exprs_; diff --git a/torch/csrc/jit/codegen/cuda/tensor_view.cpp b/torch/csrc/jit/codegen/cuda/tensor_view.cpp index 66b202531fea1..b1f6f731d96c5 100644 --- a/torch/csrc/jit/codegen/cuda/tensor_view.cpp +++ b/torch/csrc/jit/codegen/cuda/tensor_view.cpp @@ -107,6 +107,10 @@ bool TensorView::hasGridReduction() const { return domain()->hasGridReduction(); } +bool TensorView::hasBlockBroadcast() const { + return domain()->hasBlockBroadcast(); +} + bool TensorView::hasBroadcast() const { return domain()->hasBroadcast(); } From 65b6469efb0371e0d6213a27c3a30d140dda9810 Mon Sep 17 00:00:00 2001 From: Lemo Date: Tue, 1 Sep 2020 12:03:13 -0700 Subject: [PATCH 022/167] Minor cleanup --- torch/csrc/jit/codegen/cuda/ir_graphviz.cpp | 50 --------------------- torch/csrc/jit/codegen/cuda/ir_graphviz.h | 7 --- 2 files changed, 57 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp b/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp index bb3335fa1b890..488e626299ad4 100644 --- a/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp @@ -333,17 +333,6 @@ void IrGraphGenerator::handle(const IterDomain* id) { } } -void IrGraphGenerator::handle(const kir::TensorIndex* ti) { - graph_def_ << " " << getid(ti) << " [label=\"TensorIndex\", " - << "shape=rarrow, color=gray, fontsize=10];\n"; - - addArc(ti, ti->view()); - - for (const auto index : ti->indices()) { - addArc(index, ti); - } -} - void IrGraphGenerator::handle(const Bool* b) { printValue(b, IrNodeLabel::gen(b, detail_level_)); } @@ -453,45 +442,6 @@ void 
IrGraphGenerator::handle(const ReductionOp* op) { addArc(op, op->out()); } -void IrGraphGenerator::handle(const kir::GridReduction* op) { - printExpr(op, "Grid Reduction"); - - // inputs & outputs - addArc(op, op->reduction_op()); - addArc(op->reduction_buffer(), op); - addArc(op->sync_buffer(), op); -} - -void IrGraphGenerator::handle(const kir::ForLoop* for_loop) { - printExpr(for_loop, "ForLoop"); - addArc(for_loop->index(), for_loop); - addArc(for_loop->iter_domain(), for_loop); - if (for_loop->parentScope()) { - addArc(for_loop, for_loop->parentScope()); - } -} - -void IrGraphGenerator::handle(const kir::IfThenElse* if_then_else) { - printExpr(if_then_else, "IfThenElse"); - addArc(if_then_else->cond(), if_then_else); - if (if_then_else->parentScope()) { - addArc(if_then_else, if_then_else->parentScope()); - } -} - -void IrGraphGenerator::handle(const kir::Allocate* allocate) { - std::stringstream msg; - msg << "Allocate( memory type = " << allocate->getMemoryType() << ")"; - - printExpr(allocate, msg.str()); - addArc(allocate->size(), allocate); - addArc(allocate->buffer(), allocate); -} - -void IrGraphGenerator::handle(const kir::Sync* sync) { - printExpr(sync, "SyncThreads"); -} - void IrGraphGenerator::handle(const Split* split) { printExpr(split, IrNodeLabel::gen(split)); addArc(split->in(), split); diff --git a/torch/csrc/jit/codegen/cuda/ir_graphviz.h b/torch/csrc/jit/codegen/cuda/ir_graphviz.h index 1940ea0a2a5b6..e3c41fb525ff0 100644 --- a/torch/csrc/jit/codegen/cuda/ir_graphviz.h +++ b/torch/csrc/jit/codegen/cuda/ir_graphviz.h @@ -66,7 +66,6 @@ class TORCH_CUDA_API IrGraphGenerator : private OptInConstDispatch { void handle(const TensorDomain*) override; void handle(const TensorView*) override; void handle(const IterDomain*) override; - void handle(const kir::TensorIndex*) override; void handle(const Bool*) override; void handle(const Float*) override; @@ -79,12 +78,6 @@ class TORCH_CUDA_API IrGraphGenerator : private OptInConstDispatch { void handle(const TernaryOp*) override; void handle(const BroadcastOp*) override; void handle(const ReductionOp*) override; - void handle(const kir::GridReduction*) override; - - void handle(const kir::ForLoop*) override; - void handle(const kir::IfThenElse*) override; - void handle(const kir::Allocate*) override; - void handle(const kir::Sync*) override; void handle(const Split*) override; void handle(const Merge*) override; From f8f506264d884df72982ea9b6cb107390f9f0c25 Mon Sep 17 00:00:00 2001 From: Leonard Mosescu Date: Tue, 1 Sep 2020 13:16:13 -0700 Subject: [PATCH 023/167] Kernel IR: minor cleanup (#351) Removing support for Kernel IR nodes from IrGraphGenerator --- torch/csrc/jit/codegen/cuda/ir_graphviz.cpp | 50 --------------------- torch/csrc/jit/codegen/cuda/ir_graphviz.h | 7 --- 2 files changed, 57 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp b/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp index bb3335fa1b890..488e626299ad4 100644 --- a/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp @@ -333,17 +333,6 @@ void IrGraphGenerator::handle(const IterDomain* id) { } } -void IrGraphGenerator::handle(const kir::TensorIndex* ti) { - graph_def_ << " " << getid(ti) << " [label=\"TensorIndex\", " - << "shape=rarrow, color=gray, fontsize=10];\n"; - - addArc(ti, ti->view()); - - for (const auto index : ti->indices()) { - addArc(index, ti); - } -} - void IrGraphGenerator::handle(const Bool* b) { printValue(b, IrNodeLabel::gen(b, detail_level_)); } @@ -453,45 +442,6 @@ void 
IrGraphGenerator::handle(const ReductionOp* op) { addArc(op, op->out()); } -void IrGraphGenerator::handle(const kir::GridReduction* op) { - printExpr(op, "Grid Reduction"); - - // inputs & outputs - addArc(op, op->reduction_op()); - addArc(op->reduction_buffer(), op); - addArc(op->sync_buffer(), op); -} - -void IrGraphGenerator::handle(const kir::ForLoop* for_loop) { - printExpr(for_loop, "ForLoop"); - addArc(for_loop->index(), for_loop); - addArc(for_loop->iter_domain(), for_loop); - if (for_loop->parentScope()) { - addArc(for_loop, for_loop->parentScope()); - } -} - -void IrGraphGenerator::handle(const kir::IfThenElse* if_then_else) { - printExpr(if_then_else, "IfThenElse"); - addArc(if_then_else->cond(), if_then_else); - if (if_then_else->parentScope()) { - addArc(if_then_else, if_then_else->parentScope()); - } -} - -void IrGraphGenerator::handle(const kir::Allocate* allocate) { - std::stringstream msg; - msg << "Allocate( memory type = " << allocate->getMemoryType() << ")"; - - printExpr(allocate, msg.str()); - addArc(allocate->size(), allocate); - addArc(allocate->buffer(), allocate); -} - -void IrGraphGenerator::handle(const kir::Sync* sync) { - printExpr(sync, "SyncThreads"); -} - void IrGraphGenerator::handle(const Split* split) { printExpr(split, IrNodeLabel::gen(split)); addArc(split->in(), split); diff --git a/torch/csrc/jit/codegen/cuda/ir_graphviz.h b/torch/csrc/jit/codegen/cuda/ir_graphviz.h index 1940ea0a2a5b6..e3c41fb525ff0 100644 --- a/torch/csrc/jit/codegen/cuda/ir_graphviz.h +++ b/torch/csrc/jit/codegen/cuda/ir_graphviz.h @@ -66,7 +66,6 @@ class TORCH_CUDA_API IrGraphGenerator : private OptInConstDispatch { void handle(const TensorDomain*) override; void handle(const TensorView*) override; void handle(const IterDomain*) override; - void handle(const kir::TensorIndex*) override; void handle(const Bool*) override; void handle(const Float*) override; @@ -79,12 +78,6 @@ class TORCH_CUDA_API IrGraphGenerator : private OptInConstDispatch { void handle(const TernaryOp*) override; void handle(const BroadcastOp*) override; void handle(const ReductionOp*) override; - void handle(const kir::GridReduction*) override; - - void handle(const kir::ForLoop*) override; - void handle(const kir::IfThenElse*) override; - void handle(const kir::Allocate*) override; - void handle(const kir::Sync*) override; void handle(const Split*) override; void handle(const Merge*) override; From d7540b67834e0b846caadc78e0433adeb3d6ed36 Mon Sep 17 00:00:00 2001 From: jjsjann123 Date: Tue, 1 Sep 2020 14:16:01 -0700 Subject: [PATCH 024/167] cache on static input size/stride pr_0 (#326) While our kernels handle dynamic input sizes, we are now caching kernel selection and launch parameters on static sizes. This improves kernel launch latency for repeated input sizes. The encoding from input array to a unique_id is done at `GraphCache` level, where we record and encode every seen inputs. We plumb the unique_id through the `FusionExecutorCache` and `FusionExecutor`, so we do not repeatedly infer launch parameters / cache entry selections. 
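A minimal standalone sketch of the size/stride encoding idea (illustrative
only; the actual key construction, including the ";s" marker for scalar
inputs, lives in InputsIdLookup::getCode below):

    #include <ATen/ATen.h>
    #include <sstream>
    #include <string>

    // Encode one tensor as ";<sizes>@<strides>" so that two input sets with
    // identical shapes and strides produce the same string key; the key is
    // then mapped to a small integer unique_id that short-cuts kernel launch.
    std::string encodeTensor(const at::Tensor& t) {
      std::stringstream ss;
      ss << ";";
      const char* sep = "";
      for (auto size : t.sizes()) {
        ss << sep << size;
        sep = ",";
      }
      ss << "@";
      sep = "";
      for (auto stride : t.strides()) {
        ss << sep << stride;
        sep = ",";
      }
      return ss.str();
    }
    // e.g. a contiguous {2, 3} float tensor encodes to ";2,3@3,1".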
--- test/test_jit_cuda_fuser.py | 24 +-- torch/csrc/jit/codegen/cuda/executor.cpp | 132 +++++++++++--- torch/csrc/jit/codegen/cuda/executor.h | 41 ++++- .../csrc/jit/codegen/cuda/executor_utils.cpp | 3 - torch/csrc/jit/codegen/cuda/kernel_cache.cpp | 172 ++++++++++++------ torch/csrc/jit/codegen/cuda/kernel_cache.h | 30 ++- 6 files changed, 291 insertions(+), 111 deletions(-) diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py index f6bc2740a140c..39353d41336a8 100644 --- a/test/test_jit_cuda_fuser.py +++ b/test/test_jit_cuda_fuser.py @@ -554,9 +554,8 @@ def t(x: torch.Tensor, y: torch.Tensor): jit_o = t_jit(x, y) jit_o = t_jit(x, y) o = t(x, y) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - self.assertEqual(oo, jit_oo) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GROUP) # end-2-end test of permutation & contiguity handling in integration. @@ -599,11 +598,10 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): jit_o = t_jit(x, y) jit_o = t_jit(x, y) o = t(x, y) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - # numerical issues here due to our scheduling. - # can't use `self.assertEqual(oo, jit_oo)` - self.assertTrue(self._compare("comparing output failed", oo, jit_oo, 1e-4)) + self.assertEqual(o.dtype, jit_o.dtype) + # numerical issues here due to our scheduling. + # can't use `self.assertEqual(o, jit_o)` + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GROUP) @unittest.skipIf(not RUN_CUDA, "requires CUDA") @@ -655,9 +653,8 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): jit_o = t_jit(x, y, z) jit_o = t_jit(x, y, z) o = t(x, y, z) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - self.assertEqual(oo, jit_oo) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GROUP) @unittest.skipIf(not RUN_CUDA, "requires CUDA") @@ -680,9 +677,8 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): jit_o = t_jit(x, y, z) jit_o = t_jit(x, y, z) o = t(x, y, z) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - self.assertEqual(oo, jit_oo) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GROUP) diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp index dca2cde534c3c..ffee2c92c0069 100644 --- a/torch/csrc/jit/codegen/cuda/executor.cpp +++ b/torch/csrc/jit/codegen/cuda/executor.cpp @@ -103,7 +103,8 @@ void FusionExecutor::compileFusion(Fusion* fusion, CompileOptions options) { structured_code, (kernelNamespace() + "::" + kernelName()).c_str(), fusion_id_); - compiled_ = true; + TORCH_INTERNAL_ASSERT( + fusion_id_ > 0, "failed to assign a fusion_id_ after compilation."); } namespace { @@ -275,13 +276,14 @@ LaunchParams FusionExecutor::computeLaunchParams( return launch_params; } -std::vector FusionExecutor::allocGlobalVals(EvaluationContext& ec) { - std::vector global_buffers; +FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals( + EvaluationContext& ec) { + GlobalBuffers global_buffers; for (auto alloc : lowered_.global_allocations()) { TORCH_INTERNAL_ASSERT( alloc->buffer()->getValType() == ValType::KirTensorView, "Cannot allocate global buffers that are not tensors."); - 
global_buffers.push_back(inferAndAlloc( + global_buffers.empty_buffers.push_back(inferAndAlloc( alloc->buffer()->as()->fuserTv(), ec, options_, @@ -292,7 +294,7 @@ std::vector FusionExecutor::allocGlobalVals(EvaluationContext& ec) { TORCH_INTERNAL_ASSERT( alloc->buffer()->getValType() == ValType::KirTensorView, "Cannot allocate global buffers that are not tensors."); - global_buffers.push_back(inferAndAlloc( + global_buffers.zero_buffers.push_back(inferAndAlloc( alloc->buffer()->as()->fuserTv(), ec, options_, true)); } @@ -314,42 +316,120 @@ std::vector FusionExecutor::allocOutputs(EvaluationContext& ec) { std::vector FusionExecutor::runFusion( const at::ArrayRef& inputs, const std::vector& outputs, - const LaunchParams& launch_constraints) { + const LaunchParams& launch_constraints, + const c10::optional& opt_code) { TORCH_INTERNAL_ASSERT( fusion_id_ > 0, "Cannot run fusion, it was not compiled."); + TORCH_INTERNAL_ASSERT( + !opt_code.has_value() || outputs.empty(), + "short cut input cache is not compatible with pre-allocated output"); - FusionGuard fg(&fusion_); - - executor_utils::validateKernelInputs(&fusion_, inputs, options_.device); + ExecutorEntry* executor_entry = nullptr; + if (opt_code.has_value()) { + executor_entry = &executor_entry_lookup_[*opt_code]; + } + FusionGuard fg(&fusion_); c10::DeviceGuard dg(options_.device); auto stream = at::cuda::getCurrentCUDAStream(); - EvaluationContext evaluation_context = - executor_utils::bindInputs(inputs, &fusion_, &lowered_); + LaunchParams launch_params; + std::vector alloced_outputs = outputs; + GlobalBuffers global_buffers; + uint64_t rand_offset = 0; + + if (executor_entry && executor_entry->init) { + { + // context manager to disable auto grad for `empty_cuda` calls later; + at::AutoNonVariableTypeMode non_variable_type_mode; + // take the short-cut for launch if we see a recorded input set again; + launch_params = executor_entry->launch_params; + for (size_t i = 0; i < executor_entry->output_sizes.size(); i++) { + auto tensor_options = at::TensorOptions() + .dtype(executor_entry->output_types[i]) + .device(options_.device); + alloced_outputs.push_back(at::native::empty_cuda( + executor_entry->output_sizes[i], tensor_options)); + } + for (size_t i = 0; i < executor_entry->empty_buffer_sizes.size(); i++) { + auto tensor_options = at::TensorOptions() + .dtype(executor_entry->empty_buffer_types[i]) + .device(options_.device); + global_buffers.empty_buffers.push_back(at::native::empty_cuda( + executor_entry->empty_buffer_sizes[i], tensor_options)); + } + } + for (size_t i = 0; i < executor_entry->zero_buffer_sizes.size(); i++) { + auto tensor_options = at::TensorOptions() + .dtype(executor_entry->zero_buffer_types[i]) + .device(options_.device); + global_buffers.zero_buffers.push_back( + at::zeros(executor_entry->zero_buffer_sizes[i], tensor_options)); + } + rand_offset = executor_entry->rand_offset; + } else { + // code path to take when either: + // 1. no opt_code is provided or; + // 2. 
`executor_entry` is not initialized + executor_utils::validateKernelInputs(&fusion_, inputs, options_.device); + + EvaluationContext evaluation_context = + executor_utils::bindInputs(inputs, &fusion_, &lowered_); - LaunchParams launch_params = - computeLaunchParams(inputs, launch_constraints, evaluation_context); + launch_params = + computeLaunchParams(inputs, launch_constraints, evaluation_context); - std::vector alloced_outputs = outputs; - if (outputs.empty() || outputs.size() != fusion_.outputs().size()) { - alloced_outputs = allocOutputs(evaluation_context); - } + if (outputs.empty() || outputs.size() != fusion_.outputs().size()) { + alloced_outputs = allocOutputs(evaluation_context); + } + + executor_utils::validateKernelOutputs( + &fusion_, alloced_outputs, options_.device); + + global_buffers = allocGlobalVals(evaluation_context); + + if (has_random_) { + // NOTE: this is how we map offset to PW kernels in order to have + // identical random number generator to match native PyTorch results. + // But it doesn't really work as it takes assumption how threads are + // binded but is not generally how we handle that in scheduler. + // Refer to `Philox` in generated kernel to understand how the mapping + // works. + rand_offset = 4 * + (std::ceil( + alloced_outputs[0].numel() / + (4.0 * 128 * launch_params.gdimx())) + // NOLINT + 1); + } - executor_utils::validateKernelOutputs( - &fusion_, alloced_outputs, options_.device); + // This is the entry when we have provided `opt_code` but the entry has not + // been initialized yet. + if (executor_entry) { + // record the the short-cut executor entry for the given input set; + executor_entry->launch_params = launch_params; + for (const auto& output : alloced_outputs) { + executor_entry->output_sizes.push_back(output.sizes().vec()); + executor_entry->output_types.push_back(output.scalar_type()); + } + for (const auto& buffer : global_buffers.empty_buffers) { + executor_entry->empty_buffer_sizes.push_back(buffer.sizes().vec()); + executor_entry->empty_buffer_types.push_back(buffer.scalar_type()); + } + for (const auto& buffer : global_buffers.zero_buffers) { + executor_entry->zero_buffer_sizes.push_back(buffer.sizes().vec()); + executor_entry->zero_buffer_types.push_back(buffer.scalar_type()); + } + executor_entry->rand_offset = rand_offset; + executor_entry->init = true; + } + } KernelArgumentHolder kernel_arguments; kernel_arguments.push(inputs); kernel_arguments.push(alloced_outputs); - auto buffers = allocGlobalVals(evaluation_context); - kernel_arguments.push(buffers); - + kernel_arguments.push(global_buffers.empty_buffers); + kernel_arguments.push(global_buffers.zero_buffers); if (has_random_) { - const auto rand_offset = 4 * - (std::ceil( - alloced_outputs[0].numel() / (4.0 * 128 * launch_params.gdimx())) + - 1); kernel_arguments.appendPhiloxRNGSeed(rand_offset); } diff --git a/torch/csrc/jit/codegen/cuda/executor.h b/torch/csrc/jit/codegen/cuda/executor.h index e134f2869fd8d..8164b25bb80b6 100644 --- a/torch/csrc/jit/codegen/cuda/executor.h +++ b/torch/csrc/jit/codegen/cuda/executor.h @@ -36,21 +36,44 @@ class TORCH_CUDA_API FusionExecutor : public NonCopyable { std::vector runFusion( const at::ArrayRef& inputs, const std::vector& outputs, - const LaunchParams& launch_constraints = LaunchParams()); + const LaunchParams& launch_constraints = LaunchParams(), + const c10::optional& opt_code = c10::nullopt); std::vector runFusion( const at::ArrayRef& inputs, - const LaunchParams& launch_constraints = LaunchParams()) { - return 
runFusion(inputs, {}, launch_constraints); + const LaunchParams& launch_constraints = LaunchParams(), + const c10::optional& opt_code = c10::nullopt) { + return runFusion(inputs, {}, launch_constraints, opt_code); } // function to query whether a `FusionExecutor` has a compiled kernel to // execute bool compiled() const { - return compiled_; + return fusion_id_ != -1; + }; + + // TODO: strides would also be important when we handle permutations in + // codegen. + // struct used to hold necessary information to launch compiled kernel on a + // given input set. + struct ExecutorEntry { + bool init = false; + LaunchParams launch_params; + std::vector> output_sizes; + std::vector output_types; + std::vector> empty_buffer_sizes; + std::vector empty_buffer_types; + std::vector> zero_buffer_sizes; + std::vector zero_buffer_types; + uint64_t rand_offset; }; private: + struct GlobalBuffers { + std::vector empty_buffers; + std::vector zero_buffers; + }; + std::string kernelName() const { std::stringstream ss; ss << "kernel" << fusion_id_; @@ -75,13 +98,13 @@ class TORCH_CUDA_API FusionExecutor : public NonCopyable { bool align_padding = false, uint64_t total = 0); - std::vector allocGlobalVals(EvaluationContext& ec); + // return a pair of vector of tensors, where tensors in the first vector are + // not initialized, while the second vector contains zero-initiliazed tensors + GlobalBuffers allocGlobalVals(EvaluationContext& ec); std::vector allocOutputs(EvaluationContext& ec); private: - bool compiled_ = false; - Fusion fusion_; bool has_block_reductions = false; @@ -100,6 +123,10 @@ class TORCH_CUDA_API FusionExecutor : public NonCopyable { static int fusion_id_counter_; GpuLower lowered_; + + // lookup table to take short cut to retrieve recorded information in order to + // launch kernels without re-inference parameters. + std::unordered_map executor_entry_lookup_; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp index 97113fb4232c6..e549a3608e3a1 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp @@ -135,8 +135,6 @@ void validateKernelInputs( Fusion* fusion, const at::ArrayRef& inputs, c10::Device device) { - // This is necessary as we were traversing the fusion graph later in the check - FusionGuard fg(fusion); // Check inputs TORCH_INTERNAL_ASSERT( inputs.size() == fusion->inputs().size(), @@ -315,7 +313,6 @@ NvrtcFunction nvrtcCompile( const char* disable_fma = getenv("PYTORCH_CUDA_FUSER_DISABLE_FMA"); // int disable_fma_flag = disable_fma ? atoi(disable_fma) : 0; if (disable_fma && atoi(disable_fma)) { - printf("disabling fmad\n"); args.push_back("--fmad=false"); } diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp index 6b370b57b1470..ee58eaa9245e8 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp @@ -3,9 +3,6 @@ #include #include -// TODO: This class is dead at the moment, but we need to figure out a generic -// cacheing system that will suite our needs. 
- namespace torch { namespace jit { namespace fuser { @@ -185,6 +182,36 @@ at::DimVector inversePermutation( } // namespace +size_t InputsIdLookup::getCode(const at::ArrayRef& inputs) { + std::stringstream encoded_inputs; + for (const auto& input : inputs) { + if (input.isTensor()) { + auto input_tensor = input.toTensor(); + + encoded_inputs << ";"; + auto sep = ""; + for (auto size : input_tensor.sizes()) { + encoded_inputs << sep << size; + sep = ","; + } + encoded_inputs << "@"; + sep = ""; + for (auto stride : input_tensor.strides()) { + encoded_inputs << sep << stride; + sep = ","; + } + } else { + // encode s for scalar; + encoded_inputs << ";s"; + } + } + auto& iter = encoding_lookup_[encoded_inputs.str()]; + if (iter == 0) { + iter = current_id_++; + } + return iter; +} + FusionExecutorCache::FusionExecutorCache( std::unique_ptr&& fusion, at::Device device) @@ -193,49 +220,57 @@ FusionExecutorCache::FusionExecutorCache( has_reduction_ = fusion_->hasReduction(); } -// TODO: dummy cache std::vector FusionExecutorCache::runFusionWithInputs( - const at::ArrayRef& inputs) { - // caching strategy is different for pw-fusion and reduction-fusion. - if (has_reduction_) { - // copy the fusion, since each FusionExecutor needs to manipulate the fusion - // in order to generate kernel. - Fusion fusion = *fusion_; - FusionGuard fg(&fusion); - TensorView* red_tv = nullptr; - for (auto expr : fusion.exprs()) { - if (expr->getExprType().has_value() && - expr->getExprType().value() == ExprType::ReductionOp) { - red_tv = expr->outputs()[0]->as(); - break; + const at::ArrayRef& inputs, + size_t unique_id) { + if (code_to_fe_lookup_.count(unique_id) == 0) { + // enter when we get a new input set. We need to search for compatible + // entries in cached `FusionExecutor` or compile new one as needed. + + // caching strategy is different for pw-fusion and reduction-fusion. + if (has_reduction_) { + // copy the fusion, since each FusionExecutor needs to manipulate the + // fusion in order to generate kernel. + Fusion fusion = *fusion_; + TensorView* red_tv = nullptr; + for (auto expr : fusion.exprs()) { + if (expr->getExprType().has_value() && + expr->getExprType().value() == ExprType::ReductionOp) { + red_tv = expr->outputs()[0]->as(); + break; + } } + auto reduction_params = scheduleReduction(&fusion, inputs, red_tv); + TORCH_INTERNAL_ASSERT( + reduction_params.has_value(), + "reduction schedule failed in `scheduleReduction`"); + auto fusion_executor = + &red_fusion_executor_cache_[reduction_params.value()]; + if (!fusion_executor->compiled()) { + // This means we have not found a previously generated kernel that's + // compatible with the new reduction params. We need to finish codegen. + CompileOptions options; + options.device = device_; + fusion_executor->compileFusion(&fusion, options); + } + // record new short cut to `FusionExecutor` + code_to_fe_lookup_[unique_id] = fusion_executor; + } else { + if (!pw_fusion_executor_cache_) { + pw_fusion_executor_cache_ = std::make_unique(); + CompileOptions options; + options.device = device_; + // no need to copy fusion_, as we are not generating more than 1 kernel + // for PW. 
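   // In contrast to the reduction branch above, which keeps one compiled
   // FusionExecutor per distinct ReductionParams and therefore works on a
   // copy of the fusion, the pointwise path schedules fusion_ in place and
   // reuses a single executor; every new unique_id simply gets another
   // short-cut entry in code_to_fe_lookup_ pointing at it.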
+ scheduleFusion(fusion_.get(), inputs); + pw_fusion_executor_cache_->compileFusion(fusion_.get(), options); + } + // record new short cut to `FusionExecutor` + code_to_fe_lookup_[unique_id] = pw_fusion_executor_cache_.get(); } - auto reduction_params = scheduleReduction(&fusion, inputs, red_tv); - TORCH_INTERNAL_ASSERT( - reduction_params.has_value(), - "reduction schedule failed in `scheduleReduction`"); - auto& fusion_executor = - red_fusion_executor_cache_[reduction_params.value()]; - if (!fusion_executor.compiled()) { - // This means we have not found a previously generated kernel that's - // compatible with the new reduction params. We need to finish codegen. - CompileOptions options; - options.device = device_; - fusion_executor.compileFusion(&fusion, options); - } - return fusion_executor.runFusion(inputs); - } else { - if (!pw_fusion_executor_cache_) { - pw_fusion_executor_cache_ = std::make_unique(); - CompileOptions options; - options.device = device_; - // no need to copy fusion_, as we are not generating more than 1 kernel - // for PW. - scheduleFusion(fusion_.get(), inputs); - pw_fusion_executor_cache_->compileFusion(fusion_.get(), options); - } - return pw_fusion_executor_cache_->runFusion(inputs); } + return code_to_fe_lookup_[unique_id]->runFusion( + inputs, LaunchParams(), unique_id); } GraphCache::InputsRequirement::InputsRequirement( @@ -384,7 +419,7 @@ bool GraphCache::InputsRequirement::complyWith( return true; } -FusionExecutorCache* GraphCache::createFusionExecutorCache( +FusionExecutorCache* GraphCache::appendFusionExecutorCache( const InputsRequirement& input_stack) { input_stacks_.emplace_back(input_stack); std::shared_ptr parsing_graph = graph_->copy(); @@ -514,50 +549,71 @@ GraphCache::GraphCache(std::shared_ptr graph) // compile a kernel if we have enough information from graph (profiling // record) if (IsNewExecutorEnabled()) { - createFusionExecutorCache( + appendFusionExecutorCache( InputsRequirement(graph_, toVector(reduction_axes_))); } } std::vector GraphCache::runGraphWithInputs( const at::ArrayRef& inputs) { - InputsRequirement input_stack(inputs, toVector(reduction_axes_)); + // get unique id `unique_id` for given input set `inputs`; + const size_t unique_id = inputs_id_lookup_.getCode(inputs); + FusionExecutorCache* fusion_executor_cache = nullptr; - // TODO: hash indexing; - for (size_t i = 0; i < fe_cache_.size(); i++) { - if (input_stack.complyWith(input_stacks_[i])) { - fusion_executor_cache = fe_cache_[i].get(); - break; + if (code_to_index_lookup_.count(unique_id) == 0) { + InputsRequirement input_stack(inputs, toVector(reduction_axes_)); + for (size_t i = 0; i < fe_cache_.size(); i++) { + if (input_stack.complyWith(input_stacks_[i])) { + // found compliable fe_cache_ entry + fusion_executor_cache = fe_cache_[i].get(); + // record short cut to designated fusion executor + code_to_index_lookup_[unique_id] = i; + break; + } } + if (!fusion_executor_cache) { + // This is the ugly bit, each level of cache has their own entry. At this + // point, we are creating an instance of FusionExecutorCache as well as a + // cache entry for GraphCache; + // But we are not creating any cache entry for nested structures. 
We only + // create cache entry below when we later call + // `fusion_executor_cache->runFusionWithInputs` + fusion_executor_cache = appendFusionExecutorCache(input_stack); + // record short cut to designated fusion executor + code_to_index_lookup_[unique_id] = fe_cache_.size() - 1; + } + } else { + // take short cut to designated fusion executor + fusion_executor_cache = fe_cache_[code_to_index_lookup_[unique_id]].get(); } - if (!fusion_executor_cache) { - fusion_executor_cache = createFusionExecutorCache(input_stack); - } + InputsRequirement* input_requirement = + &input_stacks_[code_to_index_lookup_[unique_id]]; // GraphCache need to permute inputs/outputs to accommodate dimension // coalescing - if (input_stack.requiresPermutation()) { + if (input_requirement->requiresPermutation()) { std::vector permuted_inputs; permuted_inputs.reserve(inputs.size()); for (const auto& input : inputs) { if (input.isTensor()) { permuted_inputs.emplace_back( - input.toTensor().permute(input_stack.input_permutation_)); + input.toTensor().permute(input_requirement->input_permutation_)); } else { permuted_inputs.emplace_back(input); } } - auto outputs = fusion_executor_cache->runFusionWithInputs(permuted_inputs); + auto outputs = + fusion_executor_cache->runFusionWithInputs(permuted_inputs, unique_id); std::vector permuted_outputs; permuted_outputs.reserve(outputs.size()); for (const auto& output : outputs) { permuted_outputs.emplace_back( - output.permute(input_stack.output_permutation_)); + output.permute(input_requirement->output_permutation_)); } return permuted_outputs; } else { - return fusion_executor_cache->runFusionWithInputs(inputs); + return fusion_executor_cache->runFusionWithInputs(inputs, unique_id); } } diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.h b/torch/csrc/jit/codegen/cuda/kernel_cache.h index a59fbc38f1bfa..02d0c9c8b1d73 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.h +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.h @@ -8,12 +8,27 @@ #include #include +#include namespace torch { namespace jit { namespace fuser { namespace cuda { +// Note, the uniqueness of the ide generated for a given input set is only local +// to the instance of `InputsIdLookup`. 
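// As an illustration (not part of the patch): for two contiguous float
// tensors of shape {10, 20} and {10, 10, 20} plus one scalar input,
// getCode() would build the key ";10,20@20,1;10,10,20@200,20,1;s"
// (per tensor: sizes, then '@', then strides; 's' per scalar) and hand out
// ids starting from 1, since encoding_lookup_ default-initializes missing
// entries to 0.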
+class InputsIdLookup { + public: + // encode each unique input sets to an unique id; + size_t getCode(const at::ArrayRef& inputs); + + private: + size_t current_id_ = 1; + + // TODO: change this to a trie for efficiency; + std::unordered_map encoding_lookup_; +}; + // [ Note -- 2 level cache implementation ] // // 2 level hierarchically nested cache is to handle the code generation and @@ -65,7 +80,8 @@ class FusionExecutorCache { // Execute fusion graph with given inputs, create `FusionExecutor` as needed; std::vector runFusionWithInputs( - const at::ArrayRef& inputs); + const at::ArrayRef& inputs, + size_t unique_id); private: // device_ where compiled binaries are loaded on & inputs are expected to @@ -102,6 +118,9 @@ class FusionExecutorCache { std::unique_ptr pw_fusion_executor_cache_; std::unordered_map red_fusion_executor_cache_; + + // short cut to FusionExecutor for input set encoded with id; + std::unordered_map code_to_fe_lookup_; }; class GraphCache { @@ -146,7 +165,6 @@ class GraphCache { const at::ArrayRef& inputs, const std::vector& reduction_axes); - // bool operator==(const InputsRequirement& other); bool complyWith(const InputsRequirement& expect); // helper function used at run-time to check whether a common permutation is @@ -157,7 +175,7 @@ class GraphCache { // construct FusionExecutorCache per InputsRequirement. // This function makes sure that we properly insert both `input_stacks_` and // `fe_cache_` at the same time. - FusionExecutorCache* createFusionExecutorCache( + FusionExecutorCache* appendFusionExecutorCache( const InputsRequirement& input_stack); private: @@ -166,10 +184,16 @@ class GraphCache { // TODO: poor name, we should use `eliminated_axes_` instead; at::DimVector reduction_axes_; + // short cut to index of stack for input set encoded with id; + std::unordered_map code_to_index_lookup_; + // TODO: we should really hash instead of iterative check. Optimize later... // unordered_map; std::vector input_stacks_; std::vector> fe_cache_; + + // inputs to unique_id lookup table; + InputsIdLookup inputs_id_lookup_; }; } // namespace cuda From 82248bb392dc07106190777cf32db6b1a4af2cfe Mon Sep 17 00:00:00 2001 From: jjsjann123 Date: Tue, 1 Sep 2020 14:32:34 -0700 Subject: [PATCH 025/167] oops, resolving auto merge issue (#354) --- torch/csrc/jit/codegen/cuda/executor.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp index ffee2c92c0069..d25a6675511ee 100644 --- a/torch/csrc/jit/codegen/cuda/executor.cpp +++ b/torch/csrc/jit/codegen/cuda/executor.cpp @@ -59,7 +59,8 @@ void FusionExecutor::debugCompileFusionFromStr( has_random_ = fusion->hasRNG(); lowered_ = GpuLower(&fusion_); compiled_kernel_ = executor_utils::nvrtcCompile(code, name, fusion_id_); - compiled_ = true; + TORCH_INTERNAL_ASSERT( + fusion_id_ > 0, "assign a fusion_id_ <= 0 is not accepted."); } void FusionExecutor::compileFusion(Fusion* fusion, CompileOptions options) { From ada5150d624abf1d17f6225298636351243ecb6c Mon Sep 17 00:00:00 2001 From: jjsjann123 Date: Tue, 1 Sep 2020 15:04:12 -0700 Subject: [PATCH 026/167] Fixing CUDA fuser ci flag (#355) Adding environment variable to: 1. disable fma lower jit optimization level for robust python end-2-end tests 2. 
disable fallback path --- test/test_jit_cuda_fuser_legacy.py | 6 ++++++ test/test_jit_cuda_fuser_profiling.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/test/test_jit_cuda_fuser_legacy.py b/test/test_jit_cuda_fuser_legacy.py index 4b9959c1231e8..41e16df7d6869 100644 --- a/test/test_jit_cuda_fuser_legacy.py +++ b/test/test_jit_cuda_fuser_legacy.py @@ -1,5 +1,11 @@ import sys sys.argv.append("--ge_config=legacy") + +import os +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FALLBACK'] = '1' +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FMA'] = '1' +os.environ['PYTORCH_CUDA_FUSER_JIT_OPT_LEVEL'] = '0' + from test_jit_cuda_fuser import * if __name__ == '__main__': diff --git a/test/test_jit_cuda_fuser_profiling.py b/test/test_jit_cuda_fuser_profiling.py index e2869eca7b5ff..7559b85519c45 100644 --- a/test/test_jit_cuda_fuser_profiling.py +++ b/test/test_jit_cuda_fuser_profiling.py @@ -1,5 +1,11 @@ import sys sys.argv.append("--ge_config=profiling") + +import os +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FALLBACK'] = '1' +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FMA'] = '1' +os.environ['PYTORCH_CUDA_FUSER_JIT_OPT_LEVEL'] = '0' + from test_jit_cuda_fuser import * if __name__ == '__main__': From 4ec6d5a1f886d2b197c2cedbde89ec1f1c9b424f Mon Sep 17 00:00:00 2001 From: Ryan Spring Date: Tue, 1 Sep 2020 19:08:27 -0700 Subject: [PATCH 027/167] Enable Global Intermediate Buffers (#325) * Enable Global Intermediate Buffers * Set the default MemoryType to Local * Merge Sync_Allocations into Global_Allocations * Check that all inputs/outputs are in global memory Co-authored-by: Ryan Spring --- test/cpp/jit/test_gpu.cpp | 88 +++++++++++++++++++ test/cpp/jit/tests.h | 2 + torch/csrc/jit/codegen/cuda/executor.cpp | 12 +-- torch/csrc/jit/codegen/cuda/fusion.cpp | 13 +-- .../jit/codegen/cuda/ir_interface_nodes.h | 7 +- torch/csrc/jit/codegen/cuda/ir_iostream.cpp | 5 +- torch/csrc/jit/codegen/cuda/kernel_ir.cpp | 14 ++- torch/csrc/jit/codegen/cuda/kernel_ir.h | 10 ++- torch/csrc/jit/codegen/cuda/lower2device.cpp | 46 +++------- torch/csrc/jit/codegen/cuda/lower2device.h | 14 +-- torch/csrc/jit/codegen/cuda/lower_index.cpp | 17 ++-- torch/csrc/jit/codegen/cuda/tensor_view.cpp | 8 +- 12 files changed, 155 insertions(+), 81 deletions(-) diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index 8829ea249d748..e96c8925c8079 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -5934,6 +5934,94 @@ void testGPU_FusionSmemDynamicPwiseMulSymbolicArg() { aten_output.sub(outputs[0]).abs().max()); } +void testGPU_FusionGlobalIntermediate() { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Float(0), tv0); + fusion.addInput(tv0); + fusion.addOutput(tv1); + // tv1[I0, R1] = tv0[I0, I1] + + // Interface should just be a direct split with a Parallel type. We can + // include the parallelize call if we do this. 
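  // NamedScalar::getParallelDim(ParallelType::TIDx) is the symbolic
  // blockDim.x, so the split below leaves the inner reduction extent unbound
  // at compile time; it is only fixed by the LaunchParams passed to
  // runFusion further down, where runtime_threadIdx_dim = 128.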
+ tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1] + + TensorView* tv2 = tv1->rFactor({2}); + tv2->setMemoryType(MemoryType::Global); + // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1] + // tv1[I0, R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}] + + tv0->computeAt(tv1, 1); + + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv1->axis(0)->parallelize(ParallelType::BIDx); + + constexpr int numel_x = 65000, numel_y = 1024; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::rand({numel_x, numel_y}, options); + + // How many threads to use for the block reduction + constexpr int runtime_threadIdx_dim = 128; + + torch::jit::fuser::cuda::FusionExecutor executor; + executor.compileFusion(&fusion); + auto outputs = executor.runFusion( + {input}, + torch::jit::fuser::cuda::LaunchParams( + -1, -1, -1, runtime_threadIdx_dim, -1, -1)); + + auto aten_output = input.sum({1}); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); +} + +void testGPU_FusionGlobalIntermediateDefaultSchedule() { + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + TensorView* tv2 = makeDummyTensor(2); + TensorView* tv3 = makeDummyTensor(2); + TensorView* tv4 = sub(tv2, tv3); + TensorView* tv5 = add(tv1, tv4); + TensorView* tv6 = sub(tv5, tv0); + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addInput(tv2); + fusion.addInput(tv3); + fusion.addOutput(tv6); + // t6 = ((t1 + (t2 - t3)) - t0) + + tv4->setMemoryType(MemoryType::Global); + tv5->setMemoryType(MemoryType::Global); + tv6->setMemoryType(MemoryType::Global); + + constexpr int M = 32, N = 810; + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor in0 = at::rand({M, N}, options); + at::Tensor in1 = at::rand({M, N}, options); + at::Tensor in2 = at::rand({M, N}, options); + at::Tensor in3 = at::rand({M, N}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({in0, in1, in2, in3}); + + at::Tensor aten_output = (in1 + (in2 - in3)) - in0; + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().sum()); +} + void testGPU_FusionConstCheck() { Fusion fusion; FusionGuard fg(&fusion); diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 63d8006c172ff..62f3f20f9af7c 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -209,6 +209,8 @@ namespace jit { _(GPU_FusionSmemDynamicReductionSymbolic) \ _(GPU_FusionSmemDynamicReductionSymbolicArg) \ _(GPU_FusionSmemDynamicPwiseMulSymbolicArg) \ + _(GPU_FusionGlobalIntermediate) \ + _(GPU_FusionGlobalIntermediateDefaultSchedule) \ _(GPU_FusionConstCheck) \ _(GPU_FusionSymbolicReduction) \ _(GPU_FusionUnrollWithAlloc) \ diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp index d25a6675511ee..1f46a3a1ee172 100644 --- a/torch/csrc/jit/codegen/cuda/executor.cpp +++ b/torch/csrc/jit/codegen/cuda/executor.cpp @@ -116,7 +116,7 @@ at::Tensor inferAndAlloc( const CompileOptions& options, bool zero_init = false) { std::vector sizes; - for (auto id : TensorDomain::noReductions(tv->getRootDomain())) { + for (auto id : TensorDomain::noReductions(tv->getMaybeRFactorDomain())) { auto inferred_val = ExpressionEvaluator::evaluate(id->rawExtent(), &ec); TORCH_INTERNAL_ASSERT( 
inferred_val.has_value(), @@ -288,15 +288,7 @@ FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals( alloc->buffer()->as()->fuserTv(), ec, options_, - false)); - } - - for (auto alloc : lowered_.sync_allocations()) { - TORCH_INTERNAL_ASSERT( - alloc->buffer()->getValType() == ValType::KirTensorView, - "Cannot allocate global buffers that are not tensors."); - global_buffers.zero_buffers.push_back(inferAndAlloc( - alloc->buffer()->as()->fuserTv(), ec, options_, true)); + alloc->zeroInit())); } return global_buffers; diff --git a/torch/csrc/jit/codegen/cuda/fusion.cpp b/torch/csrc/jit/codegen/cuda/fusion.cpp index d26feb0772a5c..3ac4c95584d13 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.cpp +++ b/torch/csrc/jit/codegen/cuda/fusion.cpp @@ -233,20 +233,21 @@ void Fusion::addInput(Val* const input) { if (input->getValType().value() == ValType::TensorView) { auto tv = input->as(); - if (tv->hasReduction()) + if (tv->hasReduction()) { TORCH_WARN_ONCE( "Registered input ", input, " has a reduction axis, but this does nothing in the fusion."); + } + tv->setMemoryType(MemoryType::Global); } - TORCH_CHECK( + TORCH_INTERNAL_ASSERT( input->getOrigin() == nullptr, input, " cannot be registered as an input as it is used as an output of an expression (", input->getOrigin(), ")."); - inputs_.push_back(input); } @@ -254,13 +255,15 @@ void Fusion::addOutput(Val* const output) { assertInFusion(output, "Cannot register output "); if (output->getValType().value() == ValType::TensorView) { auto tv = output->as(); - if (TensorDomain::hasBroadcast(tv->getRootDomain())) + if (TensorDomain::hasBroadcast(tv->getRootDomain())) { // Go to the root as we can merge bcast and // non-bcast dims, making a non-bcast dim. - TORCH_CHECK( // Should we warn instead? + TORCH_INTERNAL_ASSERT( // Should we warn instead? 
false, output, " cannot be registered as an output as it has a broadcast axis."); + } + tv->setMemoryType(MemoryType::Global); } outputs_.push_back(output); } diff --git a/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h b/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h index 737869a39fd65..4186f7dfcd885 100644 --- a/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h +++ b/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h @@ -209,7 +209,10 @@ class TORCH_CUDA_API TensorView : public Val { TensorView(TensorView&& other) = delete; TensorView& operator=(TensorView&& other) = delete; - TensorView(TensorDomain* _domain, DataType dtype); + TensorView( + TensorDomain* _domain, + DataType dtype, + MemoryType mtype = MemoryType::Local); TensorView(const std::shared_ptr& tensor_type); @@ -407,7 +410,7 @@ class TORCH_CUDA_API TensorView : public Val { // compute at axis in compute at view unsigned int relative_compute_at_axis_ = 0; unsigned int this_compute_at_axis_ = 0; - MemoryType memory_type_ = MemoryType::Global; + MemoryType memory_type_ = MemoryType::Local; }; } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp index 11482113e0f9f..d3d7f1099fd4c 100644 --- a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp @@ -67,8 +67,9 @@ void IRPrinter::printHeader( break; case ValType::KirTensorView: os << "Tensor<" << val->getDataType().value() << ", " - << kir::TensorDomain::noReductions( - val->as()->domain()->rootDomain()) + << TensorDomain::noReductions(val->as() + ->fuserTv() + ->getMaybeRFactorDomain()) .size() << "> T" << val->name(); break; diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir.cpp b/torch/csrc/jit/codegen/cuda/kernel_ir.cpp index 8f8fd95fb0d4a..e41fd66138ec6 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_ir.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_ir.cpp @@ -366,11 +366,16 @@ Val* TensorIndex::index(int i) const { return indices_[i]; } -Allocate::Allocate(Val* buffer, MemoryType memory_type, Val* size) +Allocate::Allocate( + Val* buffer, + MemoryType memory_type, + Val* size, + bool zero_init) : Expr(ExprType::Allocate), buffer_(buffer), memory_type_(memory_type), - size_(size) { + size_(size), + zero_init_(zero_init) { if (size_ != nullptr) { TORCH_INTERNAL_ASSERT( size_->isOneInt() || @@ -378,7 +383,10 @@ Allocate::Allocate(Val* buffer, MemoryType memory_type, Val* size) "Cannot allocate a non-TensorView buffer with a size != 1, received buffer: ", buffer_); } else { - TORCH_CHECK(buffer_->getValType().value() == ValType::KirTensorView); + TORCH_INTERNAL_ASSERT( + buffer_->getValType().value() == ValType::KirTensorView); + TORCH_INTERNAL_ASSERT( + buffer_->as()->getMemoryType() == memory_type_); const auto domain = buffer_->as()->domain(); size_ = domain->nDims() == 0 ? 
new Int(1) : domain->axis(0)->extent(); for (size_t i = 1; i < domain->nDims(); i++) { diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir.h b/torch/csrc/jit/codegen/cuda/kernel_ir.h index 67b493fe62455..9afb8fef30f58 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_ir.h +++ b/torch/csrc/jit/codegen/cuda/kernel_ir.h @@ -300,7 +300,7 @@ class TORCH_CUDA_API TensorView : public Val { private: TensorDomain* domain_ = nullptr; - MemoryType memory_type_ = MemoryType::Global; + MemoryType memory_type_ = MemoryType::Local; // TODO(kir): remove temporary hack const fuser::TensorView* fuser_tv_ = nullptr; @@ -474,7 +474,8 @@ class TORCH_CUDA_API Allocate : public Expr { explicit Allocate( Val* buffer, MemoryType memory_type = MemoryType::Local, - Val* size = nullptr); + Val* size = nullptr, + bool zero_init = false); Val* buffer() const { return buffer_; @@ -488,6 +489,10 @@ class TORCH_CUDA_API Allocate : public Expr { return size_; } + bool zeroInit() const { + return zero_init_; + } + DataType buffer_type() const { return buffer_->getDataType().value(); } @@ -496,6 +501,7 @@ class TORCH_CUDA_API Allocate : public Expr { Val* buffer_ = nullptr; MemoryType memory_type_ = MemoryType::Local; Val* size_ = nullptr; + bool zero_init_ = false; }; // Sync represents __syncthreads barrier for block level coordination. diff --git a/torch/csrc/jit/codegen/cuda/lower2device.cpp b/torch/csrc/jit/codegen/cuda/lower2device.cpp index 6a8c4115ff048..99de992b31ff7 100644 --- a/torch/csrc/jit/codegen/cuda/lower2device.cpp +++ b/torch/csrc/jit/codegen/cuda/lower2device.cpp @@ -34,10 +34,6 @@ class BuffersExtractor : OptOutDispatch { return global_allocations_; } - std::vector getSyncAllocs() { - return sync_allocations_; - } - std::vector getDynamicAllocs() { return dynamic_allocations_; } @@ -54,7 +50,6 @@ class BuffersExtractor : OptOutDispatch { ThreadPredicateMap& thread_predicates_; bool has_block_broadcast_; std::vector global_allocations_; - std::vector sync_allocations_; std::vector dynamic_allocations_; std::vector static_allocations_; @@ -88,18 +83,20 @@ class BuffersExtractor : OptOutDispatch { has_block_broadcast_ |= block_broadcast_needed; } - void handle(kir::GridReduction* gr) final { - global_allocations_.push_back(gr->reduction_buffer()); - sync_allocations_.push_back(gr->sync_buffer()); - } - void handle(kir::Allocate* a) final { - if (a->getMemoryType() == MemoryType::Shared) { - if (a->size()->isConstScalar()) { - static_allocations_.push_back(a); - } else { - dynamic_allocations_.push_back(a); - } + switch (a->getMemoryType()) { + case MemoryType::Global: + global_allocations_.push_back(a); + break; + case MemoryType::Shared: + if (a->size()->isConstScalar()) { + static_allocations_.push_back(a); + } else { + dynamic_allocations_.push_back(a); + } + break; + case MemoryType::Local: + break; } } }; @@ -161,19 +158,6 @@ void GpuLower::buildSizesMap() { } } -void GpuLower::adjustMemoryTypes() { - for (auto val : fusion_->deterministic_vals()) { - if (ir_utils::isTV(val)) { - auto tv = val->as(); - if (fusion_->hasInput(tv) || fusion_->hasOutput(tv)) { - tv->setMemoryType(MemoryType::Global); - } else if (tv->getMemoryType() == MemoryType::Global) { - tv->setMemoryType(MemoryType::Local); - } - } - } -} - void GpuLower::lower() { TORCH_INTERNAL_ASSERT(fusion_ != nullptr); TORCH_INTERNAL_ASSERT( @@ -194,7 +178,6 @@ void GpuLower::lower() { // prepare for lowering validateIr(fusion_); buildSizesMap(); - adjustMemoryTypes(); // Compute thread predicates ThreadPredicateMap preds(fusion_); @@ -216,7 +199,6 
@@ void GpuLower::lower() { // Get allocations BuffersExtractor be(lowered_exprs_, preds); global_allocations_ = be.getGlobalAllocs(); - sync_allocations_ = be.getSyncAllocs(); dynamic_smem_allocations_ = be.getDynamicAllocs(); static_smem_allocations_ = be.getStaticAllocs(); } @@ -230,8 +212,6 @@ std::ostream& GpuLower::printKernel( std::vector allocs; allocs.insert( allocs.end(), global_allocations_.begin(), global_allocations_.end()); - allocs.insert( - allocs.end(), sync_allocations_.begin(), sync_allocations_.end()); std::vector global_tensors(allocs.size(), nullptr); std::transform( diff --git a/torch/csrc/jit/codegen/cuda/lower2device.h b/torch/csrc/jit/codegen/cuda/lower2device.h index 39630a334c69b..e0908f26d74c2 100644 --- a/torch/csrc/jit/codegen/cuda/lower2device.h +++ b/torch/csrc/jit/codegen/cuda/lower2device.h @@ -33,10 +33,6 @@ class TORCH_CUDA_API GpuLower { return global_allocations_; } - std::vector sync_allocations() { - return sync_allocations_; - } - std::vector dynamic_allocations() { return dynamic_smem_allocations_; } @@ -64,17 +60,11 @@ class TORCH_CUDA_API GpuLower { // tensors to reference the runtime structure containing sizes. void buildSizesMap(); - // Adjust memory types to make sure they are valid - void adjustMemoryTypes(); - private: - // List of global buffers (not including buffers for grid syncronization) + // List of global buffers + // Allocate nodes track if it needs to be initialized to 0 std::vector global_allocations_; - // List of syncronization buffers that must be initialized to 0 when running - // the fusion - std::vector sync_allocations_; - // List of dynamic shared memory buffers std::vector dynamic_smem_allocations_; diff --git a/torch/csrc/jit/codegen/cuda/lower_index.cpp b/torch/csrc/jit/codegen/cuda/lower_index.cpp index 443a718cb0140..dbae6e3388643 100644 --- a/torch/csrc/jit/codegen/cuda/lower_index.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_index.cpp @@ -217,16 +217,21 @@ void IndexLowering::handle(ReductionOp* rop) { IterDomain* buffer_id = new IterDomain(new Int(0), buffer_size); TensorView* reduce_buffer_tv = new TensorView( - new TensorDomain({buffer_id}), out->getDataType().value()); + new TensorDomain({buffer_id}), + out->getDataType().value(), + MemoryType::Global); IterDomain* sync_id = new IterDomain(new Int(0), sync_size); - TensorView* reduce_sync_tv = - new TensorView(new TensorDomain({sync_id}), DataType::Int); + TensorView* reduce_sync_tv = new TensorView( + new TensorDomain({sync_id}), DataType::Int, MemoryType::Global); const auto reduce_buffer = new kir::Allocate( - kir::lowerValue(reduce_buffer_tv), MemoryType::Global); - const auto sync_buffer = - new kir::Allocate(kir::lowerValue(reduce_sync_tv), MemoryType::Global); + kir::lowerValue(reduce_buffer_tv), reduce_sync_tv->getMemoryType()); + const auto sync_buffer = new kir::Allocate( + kir::lowerValue(reduce_sync_tv), + reduce_sync_tv->getMemoryType(), + nullptr, + true); const auto grid_reduction_op = block_reduction_op == nullptr ? 
new kir::ReductionOp( diff --git a/torch/csrc/jit/codegen/cuda/tensor_view.cpp b/torch/csrc/jit/codegen/cuda/tensor_view.cpp index b1f6f731d96c5..86ff7263af248 100644 --- a/torch/csrc/jit/codegen/cuda/tensor_view.cpp +++ b/torch/csrc/jit/codegen/cuda/tensor_view.cpp @@ -24,8 +24,8 @@ DataType aten_opt_type_map(const c10::optional& scalar_type) { } } // namespace -TensorView::TensorView(TensorDomain* _domain, DataType dtype) - : Val(ValType::TensorView, dtype), domain_(_domain) {} +TensorView::TensorView(TensorDomain* _domain, DataType dtype, MemoryType mtype) + : Val(ValType::TensorView, dtype), domain_(_domain), memory_type_(mtype) {} TensorView::TensorView(const std::shared_ptr& tensor_type) : Val(ValType::TensorView, @@ -557,10 +557,6 @@ void TensorView::setMemoryType(MemoryType mt) { TORCH_INTERNAL_ASSERT( mt == MemoryType::Global, "Tried to set an input or output to the fusion to a non-global memory type."); - } else { - TORCH_INTERNAL_ASSERT( - mt != MemoryType::Global, - "Tried to set an intermediate tensor in the fusion to the global memory type."); } } From 23f00e122ae96963a0729689c0420f92fdcc6998 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Wed, 2 Sep 2020 15:48:20 -0400 Subject: [PATCH 028/167] Stateful evaluation (#347) Stateful evaluation in runFusion reduces pointwise latency by ~20us on new shapes. --- test/cpp/jit/test_gpu.cpp | 134 ++++++------ torch/csrc/jit/codegen/cuda/executor.cpp | 81 +++---- torch/csrc/jit/codegen/cuda/executor.h | 10 +- .../csrc/jit/codegen/cuda/executor_utils.cpp | 78 +------ torch/csrc/jit/codegen/cuda/executor_utils.h | 17 +- .../csrc/jit/codegen/cuda/expr_evaluator.cpp | 206 ++++++++---------- torch/csrc/jit/codegen/cuda/expr_evaluator.h | 78 +++---- torch/csrc/jit/codegen/cuda/scheduler.cpp | 10 +- 8 files changed, 266 insertions(+), 348 deletions(-) diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index e96c8925c8079..a56c7166a5dcb 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -74,11 +74,11 @@ TensorView* makeTensorWithContig( } void checkIntValue( - const EvaluationContext* eval_context, + StatefulExpressionEvaluator& evaluator, Val* val, Int::ScalarType expected_value) { TORCH_CHECK(val->isAnInt()); - const auto actual_value = ExpressionEvaluator::evaluate(val, eval_context); + const auto actual_value = evaluator.inferValue(val); TORCH_CHECK(actual_value.has_value()); TORCH_CHECK(actual_value.value() == expected_value); } @@ -163,16 +163,16 @@ void testGPU_FusionExprEvalConstants() { Fusion fusion; FusionGuard fg(&fusion); - EvaluationContext eval_context(&fusion); + StatefulExpressionEvaluator evaluator(&fusion); auto* a = new Int(7); auto* b = new Int(3); - checkIntValue(&eval_context, neg(a), -7); - checkIntValue(&eval_context, add(a, b), 10); - checkIntValue(&eval_context, neg(mul(sub(a, b), div(a, b))), -8); - checkIntValue(&eval_context, mod(a, b), 1); - checkIntValue(&eval_context, ceilDiv(a, b), 3); + checkIntValue(evaluator, neg(a), -7); + checkIntValue(evaluator, add(a, b), 10); + checkIntValue(evaluator, neg(mul(sub(a, b), div(a, b))), -8); + checkIntValue(evaluator, mod(a, b), 1); + checkIntValue(evaluator, ceilDiv(a, b), 3); } // Evaluate basic scalar operations with bound values @@ -180,7 +180,7 @@ void testGPU_FusionExprEvalBindings() { Fusion fusion; FusionGuard fg(&fusion); - EvaluationContext eval_context(&fusion); + StatefulExpressionEvaluator evaluator(&fusion); auto* a = new Int(); auto* b = new Int(); @@ -189,35 +189,35 @@ void testGPU_FusionExprEvalBindings() { 
auto* e = new Int(0); // trying to evaluate before binding should give empty results - TORCH_CHECK(!ExpressionEvaluator::evaluate(a, &eval_context).has_value()); - TORCH_CHECK(!ExpressionEvaluator::evaluate(d, &eval_context).has_value()); + TORCH_CHECK(!evaluator.inferValue(a).has_value()); + TORCH_CHECK(!evaluator.inferValue(d).has_value()); - eval_context.bind(a, 7); - eval_context.bind(b, 3); + evaluator.safeBind(a, 7); + evaluator.safeBind(b, 3); // can't bind to the results of expressions - ASSERT_ANY_THROW(eval_context.bind(c, 100)); + ASSERT_ANY_THROW(evaluator.safeBind(c, 100)); // can't bind to concrete values - ASSERT_ANY_THROW(eval_context.bind(e, 100)); + ASSERT_ANY_THROW(evaluator.safeBind(e, 100)); - checkIntValue(&eval_context, c, 10); - checkIntValue(&eval_context, sub(a, b), 4); - checkIntValue(&eval_context, mod(a, b), 1); - checkIntValue(&eval_context, ceilDiv(a, b), 3); - checkIntValue(&eval_context, d, -4); + checkIntValue(evaluator, c, 10); + checkIntValue(evaluator, sub(a, b), 4); + checkIntValue(evaluator, mod(a, b), 1); + checkIntValue(evaluator, ceilDiv(a, b), 3); + checkIntValue(evaluator, d, -4); // Reset evaluation context - eval_context = EvaluationContext(&fusion); + evaluator = StatefulExpressionEvaluator(&fusion); - eval_context.bind(a, 2); - eval_context.bind(b, 5); + evaluator.safeBind(a, 2); + evaluator.safeBind(b, 5); - checkIntValue(&eval_context, c, 7); - checkIntValue(&eval_context, sub(a, b), -3); - checkIntValue(&eval_context, mod(a, b), 2); - checkIntValue(&eval_context, ceilDiv(a, b), 1); - checkIntValue(&eval_context, d, -2); + checkIntValue(evaluator, c, 7); + checkIntValue(evaluator, sub(a, b), -3); + checkIntValue(evaluator, mod(a, b), 2); + checkIntValue(evaluator, ceilDiv(a, b), 1); + checkIntValue(evaluator, d, -2); } // Evaluate expressions in a simple IR @@ -248,8 +248,8 @@ void testGPU_FusionExprEvalBasic() { tv2->axis(-1)->parallelize(ParallelType::TIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); - // 1. Create an evaluation context - EvaluationContext eval_context(&fusion); + // 1. Create an evaluator + StatefulExpressionEvaluator evaluator(&fusion); // 2. Bind values // @@ -259,21 +259,21 @@ void testGPU_FusionExprEvalBasic() { // (ex. `tv0->getRootDomain()[0]->extent()` // instead of `tv0->axis(0)->extent()`) // - eval_context.bind(tv0->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv0->getRootDomain()[1]->extent(), 128); - eval_context.bind(tv1->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv1->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv0->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv0->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv1->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv1->getRootDomain()[1]->extent(), 128); // 3. 
Evaluate and check result values TORCH_CHECK(tv2->domain()->nDims() == 3); - checkIntValue(&eval_context, tv2->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv2->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv2->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv2->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv2->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv2->axis(2)->rawExtent(), 128); TORCH_CHECK(tv3->domain()->nDims() == 3); - checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv3->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv3->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv3->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv3->axis(2)->rawExtent(), 128); } // Evaluate expressions in a more complex IR @@ -299,33 +299,33 @@ void testGPU_FusionExprEvalComplex() { tv6->split(0, 5); tv5->merge(0); - // 1. Create an evaluation context - EvaluationContext eval_context(&fusion); + // 1. Create an evaluator + StatefulExpressionEvaluator evaluator(&fusion); // 2. Bind values - eval_context.bind(tv0->getRootDomain()[0]->extent(), 129); - eval_context.bind(tv0->getRootDomain()[1]->extent(), 127); + evaluator.safeBind(tv0->getRootDomain()[0]->extent(), 129); + evaluator.safeBind(tv0->getRootDomain()[1]->extent(), 127); // Evaluate and check extent values TORCH_CHECK(tv0->domain()->nDims() == 2); - checkIntValue(&eval_context, tv0->axis(0)->rawExtent(), 129); - checkIntValue(&eval_context, tv0->axis(1)->rawExtent(), 127); + checkIntValue(evaluator, tv0->axis(0)->rawExtent(), 129); + checkIntValue(evaluator, tv0->axis(1)->rawExtent(), 127); TORCH_CHECK(tv3->domain()->nDims() == 2); - checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 129); - checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 127); + checkIntValue(evaluator, tv3->axis(0)->rawExtent(), 129); + checkIntValue(evaluator, tv3->axis(1)->rawExtent(), 127); TORCH_CHECK(tv4->domain()->nDims() == 2); - checkIntValue(&eval_context, tv4->axis(0)->rawExtent(), 129); - checkIntValue(&eval_context, tv4->axis(1)->rawExtent(), 127); + checkIntValue(evaluator, tv4->axis(0)->rawExtent(), 129); + checkIntValue(evaluator, tv4->axis(1)->rawExtent(), 127); TORCH_CHECK(tv5->domain()->nDims() == 1); - checkIntValue(&eval_context, tv5->axis(0)->rawExtent(), 16383); + checkIntValue(evaluator, tv5->axis(0)->rawExtent(), 16383); TORCH_CHECK(tv6->domain()->nDims() == 3); - checkIntValue(&eval_context, tv6->axis(0)->rawExtent(), 26); - checkIntValue(&eval_context, tv6->axis(1)->rawExtent(), 5); - checkIntValue(&eval_context, tv6->axis(2)->rawExtent(), 127); + checkIntValue(evaluator, tv6->axis(0)->rawExtent(), 26); + checkIntValue(evaluator, tv6->axis(1)->rawExtent(), 5); + checkIntValue(evaluator, tv6->axis(2)->rawExtent(), 127); } // Evaluate expressions post lowering @@ -365,27 +365,27 @@ void testGPU_FusionExprEvalPostLower() { gpulw.printKernel(kernel); // 1. Create an evaluation context - EvaluationContext eval_context(&fusion); + StatefulExpressionEvaluator evaluator(&fusion); // 2. 
Bind values - eval_context.bind(tv0->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv0->getRootDomain()[1]->extent(), 128); - eval_context.bind(tv1->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv1->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv0->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv0->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv1->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv1->getRootDomain()[1]->extent(), 128); // 3. Evaluate and check result values TORCH_CHECK(tv2->domain()->nDims() == 3); - checkIntValue(&eval_context, tv2->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv2->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv2->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv2->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv2->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv2->axis(2)->rawExtent(), 128); TORCH_CHECK(tv3->domain()->nDims() == 3); - checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv3->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv3->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv3->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv3->axis(2)->rawExtent(), 128); - checkIntValue(&eval_context, bid_x, 2); - checkIntValue(&eval_context, tid_x, 128); + checkIntValue(evaluator, bid_x, 2); + checkIntValue(evaluator, tid_x, 128); } void testGPU_FusionClear() { diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp index 1f46a3a1ee172..f671e772a9371 100644 --- a/torch/csrc/jit/codegen/cuda/executor.cpp +++ b/torch/csrc/jit/codegen/cuda/executor.cpp @@ -73,6 +73,7 @@ void FusionExecutor::compileFusion(Fusion* fusion, CompileOptions options) { "Output types from fusions that are not tensors are not supported at this point."); } + // Clone the fusion so we can store it fusion_ = *fusion; FusionGuard fg(&fusion_); options_ = options; @@ -92,9 +93,9 @@ void FusionExecutor::compileFusion(Fusion* fusion, CompileOptions options) { const auto structured_code = getStructuredCode(kernel); if (lowered_.static_allocations().size() > 0) { - EvaluationContext evaluation_context(&fusion_); + StatefulExpressionEvaluator static_evaluator(&fusion_); unsigned static_smem_size = - computeSharedMemory(evaluation_context, lowered_.static_allocations()); + computeSharedMemory(static_evaluator, lowered_.static_allocations()); TORCH_INTERNAL_ASSERT( static_smem_size < max_device_smem, "The static shared memory allocation is larger than available memory."); @@ -112,12 +113,12 @@ namespace { at::Tensor inferAndAlloc( const TensorView* tv, - EvaluationContext& ec, + StatefulExpressionEvaluator& see, const CompileOptions& options, bool zero_init = false) { std::vector sizes; for (auto id : TensorDomain::noReductions(tv->getMaybeRFactorDomain())) { - auto inferred_val = ExpressionEvaluator::evaluate(id->rawExtent(), &ec); + auto inferred_val = see.inferValue(id->rawExtent()); TORCH_INTERNAL_ASSERT( inferred_val.has_value(), "Could not launch kernel as program could not infer ", @@ -143,19 +144,19 @@ at::Tensor inferAndAlloc( } // namespace uint64_t FusionExecutor::computeSharedMemory( - EvaluationContext& ec, + StatefulExpressionEvaluator& see, const std::vector& buffers, bool align_padding, uint64_t total) { for (auto smem_alloc : buffers) { - auto inferred_size = ExpressionEvaluator::evaluate(smem_alloc->size(), &ec); - 
if (inferred_size.has_value()) { + auto inferred_val = see.inferValue(smem_alloc->size()); + if (inferred_val.has_value()) { const uint64_t data_size = dataTypeSize(smem_alloc->buffer_type()); // Add padding to align dynamic shared memory if (align_padding) { total = ceilDiv(total, data_size) * data_size; } - total += inferred_size.value() * data_size; + total += inferred_val.value() * data_size; } else { TORCH_INTERNAL_ASSERT( false, @@ -169,9 +170,8 @@ uint64_t FusionExecutor::computeSharedMemory( } LaunchParams FusionExecutor::computeLaunchParams( - const at::ArrayRef& aten_inputs, const LaunchParams& launch_constraints, - EvaluationContext& ec) { + StatefulExpressionEvaluator& see) { LaunchParams launch_params; // Grab all values that are actually used in the fusion @@ -208,8 +208,7 @@ LaunchParams FusionExecutor::computeLaunchParams( if (launch_constraints.hasDim(p_type)) { auto parallel_ids = entry.second; for (auto parallel_id : parallel_ids) { - auto inferred_val = - ExpressionEvaluator::evaluate(parallel_id->rawExtent(), &ec); + auto inferred_val = see.inferValue(parallel_id->rawExtent()); if (inferred_val.has_value()) { // This value could have been inferred, make sure it was set right. TORCH_CHECK( @@ -223,14 +222,10 @@ LaunchParams FusionExecutor::computeLaunchParams( launch_constraints.getDim(p_type)); } else { // Bind the launch constraint into our evaluation context - executor_utils::safeBind( - ec, + see.safeBind( parallel_id->rawExtent(), - launch_constraints.getDim(entry.first)); - executor_utils::safeBind( - ec, - lowered_.getLowerValue(parallel_id->rawExtent()), - launch_constraints.getDim(entry.first)); + launch_constraints.getDim(entry.first), + &lowered_); launch_params.bind(launch_constraints.getDim(p_type), p_type); } } @@ -243,7 +238,7 @@ LaunchParams FusionExecutor::computeLaunchParams( auto p_type = entry.first; auto parallel_ids = entry.second; for (auto parallel_id : parallel_ids) { - auto val = ExpressionEvaluator::evaluate(parallel_id->rawExtent(), &ec); + auto val = see.inferValue(parallel_id->rawExtent()); TORCH_INTERNAL_ASSERT( val, "Tried to evaluate the extent of ", @@ -264,10 +259,10 @@ LaunchParams FusionExecutor::computeLaunchParams( } uint64_t dynamic_smem_size = computeSharedMemory( - ec, lowered_.dynamic_allocations(), true, reduction_broadcast_workspace); + see, lowered_.dynamic_allocations(), true, reduction_broadcast_workspace); uint64_t static_smem_size = - computeSharedMemory(ec, lowered_.static_allocations()); + computeSharedMemory(see, lowered_.static_allocations()); TORCH_INTERNAL_ASSERT( (dynamic_smem_size + static_smem_size) < max_device_smem, @@ -278,30 +273,39 @@ LaunchParams FusionExecutor::computeLaunchParams( } FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals( - EvaluationContext& ec) { + StatefulExpressionEvaluator& see) { GlobalBuffers global_buffers; for (auto alloc : lowered_.global_allocations()) { TORCH_INTERNAL_ASSERT( alloc->buffer()->getValType() == ValType::KirTensorView, "Cannot allocate global buffers that are not tensors."); - global_buffers.empty_buffers.push_back(inferAndAlloc( - alloc->buffer()->as()->fuserTv(), - ec, - options_, - alloc->zeroInit())); + if (!alloc->zeroInit()) { + global_buffers.empty_buffers.push_back(inferAndAlloc( + alloc->buffer()->as()->fuserTv(), + see, + options_, + false)); + } else { + global_buffers.zero_buffers.push_back(inferAndAlloc( + alloc->buffer()->as()->fuserTv(), + see, + options_, + true)); + } } return global_buffers; } -std::vector 
FusionExecutor::allocOutputs(EvaluationContext& ec) { +std::vector FusionExecutor::allocOutputs( + StatefulExpressionEvaluator& see) { std::vector outputs; for (auto output : fusion_.outputs()) { TORCH_INTERNAL_ASSERT( output->getValType() == ValType::TensorView, "Cannot allocate outputs that are not tensors."); outputs.push_back( - inferAndAlloc(output->as(), ec, options_, false)); + inferAndAlloc(output->as(), see, options_, false)); } return outputs; } @@ -366,20 +370,19 @@ std::vector FusionExecutor::runFusion( // 2. `executor_entry` is not initialized executor_utils::validateKernelInputs(&fusion_, inputs, options_.device); - EvaluationContext evaluation_context = - executor_utils::bindInputs(inputs, &fusion_, &lowered_); + StatefulExpressionEvaluator evaluator = + executor_utils::statefulBindInputs(inputs, &fusion_, &lowered_); - launch_params = - computeLaunchParams(inputs, launch_constraints, evaluation_context); + launch_params = computeLaunchParams(launch_constraints, evaluator); if (outputs.empty() || outputs.size() != fusion_.outputs().size()) { - alloced_outputs = allocOutputs(evaluation_context); + alloced_outputs = allocOutputs(evaluator); + } else { + executor_utils::validateKernelOutputs( + &fusion_, alloced_outputs, options_.device); } - executor_utils::validateKernelOutputs( - &fusion_, alloced_outputs, options_.device); - - global_buffers = allocGlobalVals(evaluation_context); + global_buffers = allocGlobalVals(evaluator); if (has_random_) { // NOTE: this is how we map offset to PW kernels in order to have diff --git a/torch/csrc/jit/codegen/cuda/executor.h b/torch/csrc/jit/codegen/cuda/executor.h index 8164b25bb80b6..2de938bf09820 100644 --- a/torch/csrc/jit/codegen/cuda/executor.h +++ b/torch/csrc/jit/codegen/cuda/executor.h @@ -88,21 +88,20 @@ class TORCH_CUDA_API FusionExecutor : public NonCopyable { std::string getStructuredCode(const std::string& kernel); LaunchParams computeLaunchParams( - const at::ArrayRef& aten_inputs, const LaunchParams& launch_constraints, - EvaluationContext& ec); + StatefulExpressionEvaluator& see); uint64_t computeSharedMemory( - EvaluationContext& ec, + StatefulExpressionEvaluator& see, const std::vector& buffers, bool align_padding = false, uint64_t total = 0); // return a pair of vector of tensors, where tensors in the first vector are // not initialized, while the second vector contains zero-initiliazed tensors - GlobalBuffers allocGlobalVals(EvaluationContext& ec); + GlobalBuffers allocGlobalVals(StatefulExpressionEvaluator& see); - std::vector allocOutputs(EvaluationContext& ec); + std::vector allocOutputs(StatefulExpressionEvaluator& see); private: Fusion fusion_; @@ -113,6 +112,7 @@ class TORCH_CUDA_API FusionExecutor : public NonCopyable { CompileOptions options_; size_t max_device_smem = std::numeric_limits().max(); + size_t static_smem_size = 0; executor_utils::NvrtcFunction compiled_kernel_; // State of the fusion that's important diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp index e549a3608e3a1..a7349efe62e2d 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp @@ -176,34 +176,16 @@ void validateKernelOutputs( } } -void safeBind( - EvaluationContext& ec, - const Val* value, - Int::ScalarType concrete_value) { - auto already_concrete_val = ec.concreteValue(value); - - if (already_concrete_val.has_value()) { - TORCH_INTERNAL_ASSERT( - concrete_value == already_concrete_val.value(), - "Tried to bind ", - 
value, - " to ", - " concrete value, but it's already set to ", - already_concrete_val.value()); - } else { - ec.bind(value, concrete_value); - } -} - -EvaluationContext bindInputs( +StatefulExpressionEvaluator statefulBindInputs( const at::ArrayRef& aten_inputs, - Fusion* fusion) { + Fusion* fusion, + GpuLower* lower) { TORCH_INTERNAL_ASSERT( fusion->inputs().size() == aten_inputs.size(), "Something went wrong configuring launch. Inputs no longer match."); auto fusion_inputs = fusion->inputs(); - EvaluationContext eval_context(fusion); + StatefulExpressionEvaluator evaluator(fusion); // This should probably move to EvaluationContext as we may want to bind // input values frequently. Bind fusion input values to runtime values. @@ -222,54 +204,18 @@ EvaluationContext bindInputs( "Something went wrong configuring launch. Inputs no longer match."); for (size_t dim = 0; dim < root_dom.size(); dim++) { - safeBind( - eval_context, root_dom[dim]->extent(), aten_tensor.sizes()[dim]); + evaluator.safeBind( + root_dom[dim]->extent(), aten_tensor.sizes()[dim], lower); } - } - } - return eval_context; -} - -EvaluationContext bindInputs( - const at::ArrayRef& aten_inputs, - Fusion* fusion, - GpuLower* lowered) { - TORCH_INTERNAL_ASSERT( - fusion->inputs().size() == aten_inputs.size(), - "Something went wrong configuring launch. Inputs no longer match."); - - auto fusion_inputs = fusion->inputs(); - EvaluationContext eval_context(fusion); - - // This should probably move to EvaluationContext as we may want to bind - // input values frequently. Bind fusion input values to runtime values. - for (size_t i = 0; i < fusion->inputs().size(); i++) { - if (fusion->inputs()[i]->getValType() == ValType::TensorView) { - TensorView* cg_tensor = fusion->inputs()[i]->as(); - + } else if ( + fusion->inputs()[i]->getValType().value() == ValType::Scalar && + fusion->inputs()[i]->getDataType().value() == DataType::Int) { TORCH_INTERNAL_ASSERT( - aten_inputs[i].isTensor(), - "Something went wrong configuring launch. Inputs no longer match."); - - auto aten_tensor = aten_inputs[i].toTensor(); - auto root_dom = TensorDomain::noReductions(cg_tensor->getRootDomain()); - TORCH_INTERNAL_ASSERT( - aten_tensor.ndimension() == root_dom.size(), - "Something went wrong configuring launch. 
Inputs no longer match."); - - for (size_t dim = 0; dim < root_dom.size(); dim++) { - auto extent = root_dom[dim]->extent(); - safeBind(eval_context, extent, aten_tensor.sizes()[dim]); - if (!extent->isConstScalar()) { - safeBind( - eval_context, - lowered->getLowerValue(extent), - aten_tensor.sizes()[dim]); - } - } + aten_inputs[i].type()->kind() == c10::TypeKind::IntType); + evaluator.safeBind(fusion->inputs()[i], aten_inputs[i].toInt(), lower); } } - return eval_context; + return evaluator; } NvrtcFunction nvrtcCompile( diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.h b/torch/csrc/jit/codegen/cuda/executor_utils.h index f105c9b88f82c..7a01bfa5d8f3c 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.h +++ b/torch/csrc/jit/codegen/cuda/executor_utils.h @@ -32,23 +32,10 @@ void validateKernelOutputs( const std::vector& outputs, c10::Device device); -// Check if a value is already bound, if so validate we're trying to bind to the -// same value -void safeBind( - EvaluationContext& ec, - const Val* value, - Int::ScalarType concrete_value); - -// Bind Inputs to Fusion IR -EvaluationContext bindInputs( - const at::ArrayRef& aten_inputs, - Fusion* fusion); - -// Bind Inputs to Fusion and Kernel IR -EvaluationContext bindInputs( +StatefulExpressionEvaluator statefulBindInputs( const at::ArrayRef& aten_inputs, Fusion* fusion, - GpuLower* lowered); + GpuLower* lower = nullptr); struct NvrtcFunction { CUmodule module = CUmodule(); diff --git a/torch/csrc/jit/codegen/cuda/expr_evaluator.cpp b/torch/csrc/jit/codegen/cuda/expr_evaluator.cpp index 78aeab910e33e..04aeabab75a7c 100644 --- a/torch/csrc/jit/codegen/cuda/expr_evaluator.cpp +++ b/torch/csrc/jit/codegen/cuda/expr_evaluator.cpp @@ -1,4 +1,3 @@ - #include #include #include @@ -10,41 +9,58 @@ namespace torch { namespace jit { namespace fuser { -void EvaluationContext::bind(const Val* value, Int::ScalarType concrete_value) { - TORCH_INTERNAL_ASSERT( - value->isAnInt(), - "Expression Evaluation does not support values other than integers at this time."); +void StatefulExpressionEvaluator::safeBind( + Val* value, + Int::ScalarType concrete_value, + GpuLower* lower) { + auto already_concrete_val = getValue(value); - if (value->isConstScalar()) { - auto const_value = value->as()->value().value(); + if (already_concrete_val.has_value()) { TORCH_INTERNAL_ASSERT( - concrete_value == const_value, + concrete_value == already_concrete_val.value(), "Tried to bind ", - concrete_value, - " to ", value, - " however ", - value, - " is set to a constant ", - const_value); - } + " to ", + " concrete value, but it's already set to ", + already_concrete_val.value()); + } else { + TORCH_INTERNAL_ASSERT( + value->getOrigin() == nullptr, + "Tried to bind to a value that is computed in the fusion IR. ", + "Can only bind to symbolic values to the fusion that do not have an origin expr."); - TORCH_INTERNAL_ASSERT( - fusion_->origin(value) == nullptr, - "Tried to bind to a value that is computed in the fusion IR. 
", - "Can only bind to symbolic values to the fusion that do not have an origin expr."); + bindings_[value] = concrete_value; + } - bindings_[value] = concrete_value; + if (lower != nullptr) { + auto lowered_val = lower->getLowerValue(value); + already_concrete_val = getValue(lowered_val); + + if (already_concrete_val.has_value()) { + TORCH_INTERNAL_ASSERT( + concrete_value == already_concrete_val.value(), + "Tried to bind ", + lowered_val, + " to ", + " concrete value, but it's already set to ", + already_concrete_val.value()); + } else { + TORCH_INTERNAL_ASSERT( + lowered_val->getOrigin() == nullptr, + "Tried to bind to a value that is computed in the fusion IR. ", + "Can only bind to symbolic values to the fusion that do not have an origin expr."); + + bindings_[lowered_val] = concrete_value; + } + } } -c10::optional EvaluationContext::concreteValue( - const Val* value) const { - const auto it = bindings_.find(value); - return (it != bindings_.end()) ? c10::optional(it->second) - : c10::nullopt; +c10::optional StatefulExpressionEvaluator::inferValue( + Val* value) { + return maybeHandle(value); } -void EvaluationContext::print() const { +void StatefulExpressionEvaluator::print() const { std::cout << "\nEvaluation context\n"; std::cout << "--------------------\n"; for (const auto& kv : bindings_) { @@ -58,56 +74,49 @@ void EvaluationContext::print() const { std::cout << "--------------------\n\n"; } -c10::optional ExpressionEvaluator::evaluate( - Val* val, - const EvaluationContext* context) { - TORCH_CHECK(context != nullptr); - ExpressionEvaluator evaluator(context); - evaluator.traverseFrom(context->fusion(), {val}, false); - return evaluator.value(val); -} +inline c10::optional StatefulExpressionEvaluator::getValue( + Val* value) { + TORCH_INTERNAL_ASSERT( + value->isAnInt(), + "Expressoin Evaluation does not support values other than integers at this time."); -c10::optional ExpressionEvaluator::value( - const Statement* stmt) const { - const auto it = values_.find(stmt); - return (it != values_.end()) ? 
c10::optional(it->second) - : c10::nullopt; -} + auto v_type = value->getValType().value(); + bool is_named_scalar = + v_type == ValType::NamedScalar || v_type == ValType::KirNamedScalar; -void ExpressionEvaluator::handle(NamedScalar* i) { - if (i->isAnInt()) { - const auto& bound_value = context_->concreteValue(i); - if (bound_value.has_value()) { - values_[i] = *bound_value; - } + if (!is_named_scalar && value->as()->value().has_value()) { + return value->as()->value(); + } + + auto it = bindings_.find(value); + if (it != bindings_.end()) { + return c10::optional(it->second); } + return c10::nullopt; } -void ExpressionEvaluator::handle(Int* i) { - if (i->value().has_value()) { - values_[i] = *i->value(); - } else if (const auto* def = context_->fusion()->origin(i)) { - const auto& def_result = value(def); - if (def_result.has_value()) { - values_[i] = *def_result; - } - } else { - const auto& bound_value = context_->concreteValue(i); - if (bound_value.has_value()) { - values_[i] = *bound_value; +c10::optional StatefulExpressionEvaluator::maybeHandle( + Val* val) { + auto maybe_concrete_value = getValue(val); + if (!maybe_concrete_value.has_value()) { + auto origin = val->getOrigin(); + if (origin != nullptr) { + handle(origin); + maybe_concrete_value = getValue(val); } } + return maybe_concrete_value; } -void ExpressionEvaluator::handle(UnaryOp* uop) { - const auto in = value(uop->in()); +void StatefulExpressionEvaluator::handle(UnaryOp* uop) { + const auto in = maybeHandle(uop->in()); if (in.has_value()) { switch (uop->getUnaryOpType()) { case UnaryOpType::Neg: - values_[uop] = -*in; + bindings_[uop->out()] = -*in; break; case UnaryOpType::Cast: - values_[uop] = *in; + bindings_[uop->out()] = *in; break; default: TORCH_CHECK(!"Unexpected operator type"); @@ -115,34 +124,34 @@ void ExpressionEvaluator::handle(UnaryOp* uop) { } } -void ExpressionEvaluator::handle(BinaryOp* bop) { - const auto lhs = value(bop->lhs()); - const auto rhs = value(bop->rhs()); +void StatefulExpressionEvaluator::handle(BinaryOp* bop) { + const auto lhs = maybeHandle(bop->lhs()); + const auto rhs = maybeHandle(bop->rhs()); if (lhs.has_value() && rhs.has_value()) { switch (bop->getBinaryOpType()) { case BinaryOpType::Add: - values_[bop] = *lhs + *rhs; + bindings_[bop->out()] = *lhs + *rhs; break; case BinaryOpType::Sub: - values_[bop] = *lhs - *rhs; + bindings_[bop->out()] = *lhs - *rhs; break; case BinaryOpType::Mul: - values_[bop] = *lhs * *rhs; + bindings_[bop->out()] = *lhs * *rhs; break; case BinaryOpType::Div: TORCH_CHECK(*rhs != 0); - values_[bop] = *lhs / *rhs; + bindings_[bop->out()] = *lhs / *rhs; break; case BinaryOpType::Mod: TORCH_CHECK(*rhs != 0); - values_[bop] = *lhs % *rhs; + bindings_[bop->out()] = *lhs % *rhs; break; case BinaryOpType::CeilDiv: TORCH_CHECK(*rhs != 0); - values_[bop] = (*lhs + *rhs - 1) / *rhs; + bindings_[bop->out()] = (*lhs + *rhs - 1) / *rhs; break; case BinaryOpType::And: - values_[bop] = Int::ScalarType(*lhs && *rhs); + bindings_[bop->out()] = Int::ScalarType(*lhs && *rhs); break; default: TORCH_CHECK(!"Unexpected operator type"); @@ -150,40 +159,15 @@ void ExpressionEvaluator::handle(BinaryOp* bop) { } } -void ExpressionEvaluator::handle(kir::NamedScalar* i) { - if (i->isAnInt()) { - const auto& bound_value = context_->concreteValue(i); - if (bound_value.has_value()) { - values_[i] = *bound_value; - } - } -} - -void ExpressionEvaluator::handle(kir::Int* i) { - if (i->value().has_value()) { - values_[i] = *i->value(); - } else if (const auto* def = 
context_->fusion()->origin(i)) { - const auto& def_result = value(def); - if (def_result.has_value()) { - values_[i] = *def_result; - } - } else { - const auto& bound_value = context_->concreteValue(i); - if (bound_value.has_value()) { - values_[i] = *bound_value; - } - } -} - -void ExpressionEvaluator::handle(kir::UnaryOp* uop) { - const auto in = value(uop->in()); +void StatefulExpressionEvaluator::handle(kir::UnaryOp* uop) { + const auto in = maybeHandle(uop->in()); if (in.has_value()) { switch (uop->getUnaryOpType()) { case UnaryOpType::Neg: - values_[uop] = -*in; + bindings_[uop->out()] = -*in; break; case UnaryOpType::Cast: - values_[uop] = *in; + bindings_[uop->out()] = *in; break; default: TORCH_CHECK(!"Unexpected operator type"); @@ -191,34 +175,34 @@ void ExpressionEvaluator::handle(kir::UnaryOp* uop) { } } -void ExpressionEvaluator::handle(kir::BinaryOp* bop) { - const auto lhs = value(bop->lhs()); - const auto rhs = value(bop->rhs()); +void StatefulExpressionEvaluator::handle(kir::BinaryOp* bop) { + const auto lhs = maybeHandle(bop->lhs()); + const auto rhs = maybeHandle(bop->rhs()); if (lhs.has_value() && rhs.has_value()) { switch (bop->getBinaryOpType()) { case BinaryOpType::Add: - values_[bop] = *lhs + *rhs; + bindings_[bop->out()] = *lhs + *rhs; break; case BinaryOpType::Sub: - values_[bop] = *lhs - *rhs; + bindings_[bop->out()] = *lhs - *rhs; break; case BinaryOpType::Mul: - values_[bop] = *lhs * *rhs; + bindings_[bop->out()] = *lhs * *rhs; break; case BinaryOpType::Div: TORCH_CHECK(*rhs != 0); - values_[bop] = *lhs / *rhs; + bindings_[bop->out()] = *lhs / *rhs; break; case BinaryOpType::Mod: TORCH_CHECK(*rhs != 0); - values_[bop] = *lhs % *rhs; + bindings_[bop->out()] = *lhs % *rhs; break; case BinaryOpType::CeilDiv: TORCH_CHECK(*rhs != 0); - values_[bop] = (*lhs + *rhs - 1) / *rhs; + bindings_[bop->out()] = (*lhs + *rhs - 1) / *rhs; break; case BinaryOpType::And: - values_[bop] = Int::ScalarType(*lhs && *rhs); + bindings_[bop->out()] = Int::ScalarType(*lhs && *rhs); break; default: TORCH_CHECK(!"Unexpected operator type"); diff --git a/torch/csrc/jit/codegen/cuda/expr_evaluator.h b/torch/csrc/jit/codegen/cuda/expr_evaluator.h index 1e107ff129b2d..57264d816d78f 100644 --- a/torch/csrc/jit/codegen/cuda/expr_evaluator.h +++ b/torch/csrc/jit/codegen/cuda/expr_evaluator.h @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -13,68 +14,67 @@ namespace torch { namespace jit { namespace fuser { -// Encapsulates a set of value bindings on top of a Fusion IR -// (used to provide known values to ExpressionEvaluator) -// -// NOTE: currently it only supports Int values -// -class TORCH_CUDA_API EvaluationContext { +class TORCH_CUDA_API StatefulExpressionEvaluator : private OptOutDispatch { public: - explicit EvaluationContext(Fusion* fusion) : fusion_(fusion) {} - - // Set the concrete value for a Int* - void bind(const Val* value, Int::ScalarType concrete_value); - - // Retrieves the concrete value, or nullopt if not set - c10::optional concreteValue(const Val* value) const; + explicit StatefulExpressionEvaluator(Fusion* fusion) : fusion_(fusion) {} Fusion* fusion() const { return fusion_; } + void safeBind( + Val* value, + Int::ScalarType concrete_value, + GpuLower* lower = nullptr); + + // Returns value if found in mapping, otherwise returns c10::nullopt + c10::optional getValue(Val* value); + + // Checks if value is already infered, returns infered value if so, otherwise + // runs traversal on value. Warning: should not be called in traversal. 
+ c10::optional inferValue(Val* value); + // Debugging helper, prints all the currently set values void print() const; private: std::unordered_map bindings_; Fusion* fusion_ = nullptr; -}; -// Evaluates expressions in a Fusion IR, using the passed in -// context (EvaluationContext) to query for concrete_values. The -// evaluation context may override concrete values in the IR as well. -class TORCH_CUDA_API ExpressionEvaluator : private IterVisitor { - public: - // Returns the result of the specified expression, or nullopt if - // the result cannot be evaluated - static c10::optional evaluate( - Val* val, - const EvaluationContext* context); + using OptOutDispatch::handle; private: - explicit ExpressionEvaluator(const EvaluationContext* context) - : context_(context) {} - - ~ExpressionEvaluator() override = default; - - c10::optional value(const Statement* stmt) const; - - using IterVisitor::handle; + void handle(Expr* expr) override { + switch (expr->getExprType().value()) { + case ExprType::UnaryOp: + handle(expr->as()); + break; + case ExprType::BinaryOp: + handle(expr->as()); + break; + case ExprType::KirUnaryOp: + handle(expr->as()); + break; + case ExprType::KirBinaryOp: + handle(expr->as()); + break; + default: + TORCH_INTERNAL_ASSERT( + false, + "Cannot handle Expr type: ", + expr->getExprType().value(), + " in stateful expression evaluator."); + } + } - void handle(NamedScalar*) override; - void handle(Int*) override; void handle(UnaryOp*) override; void handle(BinaryOp*) override; // TODO(kir): remove this - void handle(kir::NamedScalar*) override; - void handle(kir::Int*) override; void handle(kir::UnaryOp*) override; void handle(kir::BinaryOp*) override; - private: - const EvaluationContext* context_ = nullptr; - std::unordered_map values_; + c10::optional maybeHandle(Val*); }; } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/scheduler.cpp b/torch/csrc/jit/codegen/cuda/scheduler.cpp index 3dac8e65f7e41..b8c04118add26 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler.cpp @@ -407,17 +407,15 @@ c10::optional scheduleReduction( red_tv->merge(-2, -1); } - EvaluationContext eval_context( - executor_utils::bindInputs(fusion_inputs, fusion)); + StatefulExpressionEvaluator evaluator( + executor_utils::statefulBindInputs(fusion_inputs, fusion)); // Evaluate Dimensions of Reduction TensorView auto red_ids = red_tv->domain()->domain(); TORCH_INTERNAL_ASSERT( red_ids.size() == 2, "We coalesced all dimensions into 2 previously."); - const auto red_outputs = - ExpressionEvaluator::evaluate(red_ids[0]->extent(), &eval_context); - const auto red_elems = - ExpressionEvaluator::evaluate(red_ids[1]->extent(), &eval_context); + const auto red_outputs = evaluator.inferValue(red_ids[0]->extent()); + const auto red_elems = evaluator.inferValue(red_ids[1]->extent()); TORCH_INTERNAL_ASSERT( red_outputs != c10::nullopt, "The number of reduction outputs is expected."); From c522c1f0ccc8bec28bab9f7bbd26a9502ef800f4 Mon Sep 17 00:00:00 2001 From: Christian Sarofeen Date: Wed, 2 Sep 2020 16:23:03 -0400 Subject: [PATCH 029/167] Simple executor changes (#348) * Fusion executor, hold onto used TVs in the fusion. Reduces avg latency on LSTM Cell 75us -> 56us. * Arg validation. Improves average latency on LSTMCell 56us -> 51us. * Don't validate outputs that were allocated by fusion executor. Improves average latency on LSTMCell 46us -> 44us. * Replace IValue::type() for isTensor as type() can be relatively slow. 
Improves average latency on LSTMCell 42us -> 29us. * Use empty_cuda instead of empty. Improves average latency on LSTMCell 22us -> 20.5us. --- torch/csrc/jit/codegen/cuda/executor.cpp | 43 +++++++----- torch/csrc/jit/codegen/cuda/executor.h | 9 +++ .../csrc/jit/codegen/cuda/executor_utils.cpp | 65 ++++++++++--------- torch/csrc/jit/codegen/cuda/executor_utils.h | 4 +- 4 files changed, 72 insertions(+), 49 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp index f671e772a9371..7c713b3640d25 100644 --- a/torch/csrc/jit/codegen/cuda/executor.cpp +++ b/torch/csrc/jit/codegen/cuda/executor.cpp @@ -83,6 +83,8 @@ void FusionExecutor::compileFusion(Fusion* fusion, CompileOptions options) { max_device_smem = at::cuda::getDeviceProperties(options.device.index())->sharedMemPerBlock; + setUsedTVs(); + fusion_id_ = ++fusion_id_counter_; has_random_ = fusion->hasRNG(); has_block_reductions = fusion_.hasBlockReduction(); @@ -137,7 +139,9 @@ at::Tensor inferAndAlloc( return at::zeros(isizes, tensor_options); } else { c10::IntArrayRef isizes(sizes); - return at::empty(isizes, tensor_options); + // Non Variable type guard for empty_cuda call + at::AutoNonVariableTypeMode non_variable_type_mode; + return at::native::empty_cuda(isizes, tensor_options); } } @@ -174,26 +178,18 @@ LaunchParams FusionExecutor::computeLaunchParams( StatefulExpressionEvaluator& see) { LaunchParams launch_params; - // Grab all values that are actually used in the fusion - auto unordered_vals = DependencyCheck::getAllValsBetween( - {fusion_.inputs().begin(), fusion_.inputs().end()}, fusion_.outputs()); - // Lets collect all IterDomains that are bound to a thread binding std::unordered_map, TypeHash> parallel_iter_domains; - - for (auto val : unordered_vals) { - if (val->getValType().value() == ValType::TensorView) { - TensorView* tv = val->as(); - for (auto id : tv->domain()->domain()) { - if (id->isThread() && !id->isBroadcast()) { - if (parallel_iter_domains.find(id->getParallelType()) != - parallel_iter_domains.end()) { - parallel_iter_domains.at(id->getParallelType()).push_back(id); - } else { - parallel_iter_domains[id->getParallelType()] = - std::vector({id}); - } + for (auto tv : getUsedTVs()) { + for (auto id : tv->domain()->domain()) { + if (id->isThread() && !id->isBroadcast()) { + if (parallel_iter_domains.find(id->getParallelType()) != + parallel_iter_domains.end()) { + parallel_iter_domains.at(id->getParallelType()).push_back(id); + } else { + parallel_iter_domains[id->getParallelType()] = + std::vector({id}); } } } @@ -310,6 +306,17 @@ std::vector FusionExecutor::allocOutputs( return outputs; } +void FusionExecutor::setUsedTVs() { + used_tvs_.clear(); + auto used_vals = DependencyCheck::getAllValsBetween( + {fusion_.inputs().begin(), fusion_.inputs().end()}, fusion_.outputs()); + for (auto val : used_vals) { + if (val->getValType().value() == ValType::TensorView) { + used_tvs_.push_back(val->as()); + } + } +} + std::vector FusionExecutor::runFusion( const at::ArrayRef& inputs, const std::vector& outputs, diff --git a/torch/csrc/jit/codegen/cuda/executor.h b/torch/csrc/jit/codegen/cuda/executor.h index 2de938bf09820..7f1915789caaf 100644 --- a/torch/csrc/jit/codegen/cuda/executor.h +++ b/torch/csrc/jit/codegen/cuda/executor.h @@ -103,6 +103,12 @@ class TORCH_CUDA_API FusionExecutor : public NonCopyable { std::vector allocOutputs(StatefulExpressionEvaluator& see); + void setUsedTVs(); + + const std::vector& getUsedTVs() const { + return used_tvs_; + }; + private: 
Fusion fusion_; @@ -115,6 +121,9 @@ class TORCH_CUDA_API FusionExecutor : public NonCopyable { size_t static_smem_size = 0; executor_utils::NvrtcFunction compiled_kernel_; + // TensorViews actually used in the kernel. + std::vector used_tvs_; + // State of the fusion that's important bool has_random_ = false; diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp index a7349efe62e2d..22ea7bc660e61 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp @@ -32,14 +32,16 @@ std::string kernelPreamble() { namespace { +// return false if arg's type, number of dimensions, and device, doesn't match +// param and provided c10:device bool validateKernelArgTensor( const at::Tensor& arg, const Val* param, - c10::Device device, + const c10::Device& device, std::stringstream& msg) { // Arg is a tensor. Param must be a tensor too. if (*param->getValType() != ValType::TensorView) { - msg << "Argument is a tensor, but the parameter is not."; + msg << "Argument is a tensor, but the parameter is not.\n"; return false; } @@ -54,12 +56,13 @@ bool validateKernelArgTensor( // check as necessary. if (arg_dim > param_dim) { msg << "Argument tensor's rank is " << arg_dim << ", but the parameter is " - << param_dim; + << param_dim << "\n"; return false; } if (arg.device() != device) { - msg << "Argument is on device that is not compiled for"; + msg << "Argument is on device that is not compiled for." + << "\n"; return false; } // Check element type @@ -77,22 +80,24 @@ bool validateKernelArgTensor( match = param_data_type == DataType::Bool; break; default: - msg << "Argument element type, " << arg_data_type - << ", is not supported."; + msg << "Argument element type, " << arg_data_type << ", is not supported." + << "\n"; return false; } if (!match) msg << "Argument element type is " << arg_data_type - << ", but the parameter is " << param_data_type; + << ", but the parameter is " << param_data_type << "\n"; return match; } +// Return false if arg_type doesn't match the type in param bool validateKernelArgScalar( const c10::TypePtr& arg_type, const Val* param, std::stringstream& msg) { if (!param->isScalar()) { - msg << "Argument is a scalar, but the parameter is not."; + msg << "Argument is a scalar, but the parameter is not." 
+ << "\n"; return false; } DataType param_type = *param->getDataType(); @@ -112,20 +117,22 @@ bool validateKernelArgScalar( } if (!match) { msg << "Argument type is " << *arg_type << ", but the parameter is " - << param_type; + << param_type << "\n"; } return match; } +// Return false if arg and param don't match up and if arg's device (if a +// tensor) doesn't match provided device bool validateKernelArg( const c10::IValue& arg, const Val* param, - c10::Device device, + const c10::Device& device, std::stringstream& msg) { - if (arg.type()->kind() != c10::TypeKind::TensorType) { - return validateKernelArgScalar(arg.type(), param, msg); - } else { + if (arg.isTensor()) { return validateKernelArgTensor(arg.toTensor(), param, device, msg); + } else { + return validateKernelArgScalar(arg.type(), param, msg); } } @@ -134,28 +141,29 @@ bool validateKernelArg( void validateKernelInputs( Fusion* fusion, const at::ArrayRef& inputs, - c10::Device device) { + const c10::Device& device) { + // This is necessary as we were traversing the fusion graph later in the check + FusionGuard fg(fusion); // Check inputs TORCH_INTERNAL_ASSERT( inputs.size() == fusion->inputs().size(), "Wrong number of kernel inputs."); + + std::stringstream msg; + bool mismatch = false; for (size_t i = 0; i < inputs.size(); ++i) { const IValue& arg = inputs[i]; const Val* param = fusion->inputs()[i]; - std::stringstream msg; - TORCH_INTERNAL_ASSERT( - validateKernelArg(arg, param, device, msg), - "Input argument at position ", - i, - " is invalid; ", - msg.str()); + mismatch = !validateKernelArg(arg, param, device, msg) || mismatch; } + TORCH_INTERNAL_ASSERT( + !mismatch, "Found one or more invalid arguments: ", msg.str()); } void validateKernelOutputs( Fusion* fusion, const std::vector& outputs, - c10::Device device) { + const c10::Device& device) { TORCH_INTERNAL_ASSERT( fusion->outputs().size() != 0, "Kernel should have at least one output tensor."); @@ -163,17 +171,16 @@ void validateKernelOutputs( TORCH_INTERNAL_ASSERT( outputs.size() == fusion->outputs().size(), "Wrong number of kernel outputs."); + + std::stringstream msg; + bool mismatch = false; for (size_t i = 0; i < outputs.size(); ++i) { const at::Tensor& arg = outputs[i]; const Val* param = fusion->outputs()[i]; - std::stringstream msg; - TORCH_INTERNAL_ASSERT( - validateKernelArgTensor(arg, param, device, msg), - "Output argument at position ", - i, - " is invalid; ", - msg.str()); + mismatch = !validateKernelArg(arg, param, device, msg) || mismatch; } + TORCH_INTERNAL_ASSERT( + !mismatch, "Found one or more invalid arguments: ", msg.str()); } StatefulExpressionEvaluator statefulBindInputs( diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.h b/torch/csrc/jit/codegen/cuda/executor_utils.h index 7a01bfa5d8f3c..76b8a9a145f19 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.h +++ b/torch/csrc/jit/codegen/cuda/executor_utils.h @@ -25,12 +25,12 @@ std::string kernelPreamble(); void validateKernelInputs( Fusion* fusion, const at::ArrayRef& inputs, - c10::Device device); + const c10::Device& device); void validateKernelOutputs( Fusion* fusion, const std::vector& outputs, - c10::Device device); + const c10::Device& device); StatefulExpressionEvaluator statefulBindInputs( const at::ArrayRef& aten_inputs, From 5f988ab36012542bcbb5d8df6110598096d1ba06 Mon Sep 17 00:00:00 2001 From: Leonard Mosescu Date: Thu, 3 Sep 2020 11:41:03 -0700 Subject: [PATCH 030/167] Fix for an invalid downcast in the Expression Evaluator (#358) Fixes #359 --- 
.../csrc/jit/codegen/cuda/expr_evaluator.cpp | 29 +++++++++++-------- torch/csrc/jit/codegen/cuda/expr_evaluator.h | 8 ++--- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/expr_evaluator.cpp b/torch/csrc/jit/codegen/cuda/expr_evaluator.cpp index 04aeabab75a7c..2bba5cd774d4e 100644 --- a/torch/csrc/jit/codegen/cuda/expr_evaluator.cpp +++ b/torch/csrc/jit/codegen/cuda/expr_evaluator.cpp @@ -74,25 +74,30 @@ void StatefulExpressionEvaluator::print() const { std::cout << "--------------------\n\n"; } -inline c10::optional StatefulExpressionEvaluator::getValue( +c10::optional StatefulExpressionEvaluator::getValue( Val* value) { TORCH_INTERNAL_ASSERT( value->isAnInt(), "Expressoin Evaluation does not support values other than integers at this time."); - auto v_type = value->getValType().value(); - bool is_named_scalar = - v_type == ValType::NamedScalar || v_type == ValType::KirNamedScalar; - - if (!is_named_scalar && value->as()->value().has_value()) { - return value->as()->value(); + switch (value->getValType().value()) { + case ValType::Scalar: + if (value->as()->value().has_value()) { + return value->as()->value(); + } + break; + case ValType::KirScalar: + if (value->as()->value().has_value()) { + return value->as()->value(); + } + break; + default: + break; } - auto it = bindings_.find(value); - if (it != bindings_.end()) { - return c10::optional(it->second); - } - return c10::nullopt; + const auto it = bindings_.find(value); + return it != bindings_.end() ? c10::optional(it->second) + : c10::nullopt; } c10::optional StatefulExpressionEvaluator::maybeHandle( diff --git a/torch/csrc/jit/codegen/cuda/expr_evaluator.h b/torch/csrc/jit/codegen/cuda/expr_evaluator.h index 57264d816d78f..40ba53380fae0 100644 --- a/torch/csrc/jit/codegen/cuda/expr_evaluator.h +++ b/torch/csrc/jit/codegen/cuda/expr_evaluator.h @@ -38,12 +38,8 @@ class TORCH_CUDA_API StatefulExpressionEvaluator : private OptOutDispatch { void print() const; private: - std::unordered_map bindings_; - Fusion* fusion_ = nullptr; - using OptOutDispatch::handle; - private: void handle(Expr* expr) override { switch (expr->getExprType().value()) { case ExprType::UnaryOp: @@ -75,6 +71,10 @@ class TORCH_CUDA_API StatefulExpressionEvaluator : private OptOutDispatch { void handle(kir::BinaryOp*) override; c10::optional maybeHandle(Val*); + + private: + std::unordered_map bindings_; + Fusion* fusion_ = nullptr; }; } // namespace fuser From a375394c9c732ec0464ab2f1383455056bde7f77 Mon Sep 17 00:00:00 2001 From: Lemo Date: Thu, 3 Sep 2020 15:36:35 -0700 Subject: [PATCH 031/167] Minor comment --- torch/csrc/jit/codegen/cuda/utils.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/torch/csrc/jit/codegen/cuda/utils.h b/torch/csrc/jit/codegen/cuda/utils.h index e286cc09ed3ad..fdc1e7c3d2fdb 100644 --- a/torch/csrc/jit/codegen/cuda/utils.h +++ b/torch/csrc/jit/codegen/cuda/utils.h @@ -58,6 +58,16 @@ class PolymorphicBase { return downcast_ptr; } + // Check if the runtime time is T (or derived from T) + // + // NOTE: Don't use this for conditional casts. Use: + // + // if (auto t = dynamic_cast(p)) { ... } + // + // instead of: + // + // if (p->isA()) { auto t = p->as(); ... 
} + // template bool isA() const { return dynamic_cast(this) != nullptr; From 92875d70979fcaaa146e8d9aa5048a29dda2d5b4 Mon Sep 17 00:00:00 2001 From: jjsjann123 Date: Thu, 3 Sep 2020 16:35:29 -0700 Subject: [PATCH 032/167] Multiple output reduction (#337) lift the requirement on reduction fusion can only have single output; added a quick WAR to have proper permutation for multiple output with different rank in integration. --- test/test_jit_cuda_fuser.py | 37 ++++++++++ torch/csrc/jit/codegen/cuda/kernel_cache.cpp | 73 ++++++++++++-------- torch/csrc/jit/codegen/cuda/kernel_cache.h | 9 ++- 3 files changed, 88 insertions(+), 31 deletions(-) diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py index 39353d41336a8..d22867ee96979 100644 --- a/test/test_jit_cuda_fuser.py +++ b/test/test_jit_cuda_fuser.py @@ -632,6 +632,43 @@ def test_reduction_permutation(self): for perm1 in itertools.permutations(range(len(x))): self._reduction_helper(x, axes, torch.float32, "cuda", perm0, perm1) + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != + ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") + def test_reduction_multiple_output(self): + torch._C._jit_set_bailout_depth(2) + + def t(x: torch.Tensor, y: torch.Tensor, scale: float, z: torch.Tensor): + o = torch.mul(x, y) + o = torch.mul(o, scale) + out1 = torch.mul(o, z) + out2 = torch.sum(out1, dim=[2]) + return out1, out2 + + t_jit = torch.jit.script(t) + x = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda") + y = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda") + z = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda") + scale = 0.5 + jit_o = t_jit(x, y, scale, z) + jit_o = t_jit(x, y, scale, z) + o = t(x, y, scale, z) + for oo, jit_oo in zip(o, jit_o): + self.assertEqual(oo.dtype, jit_oo.dtype) + self.assertEqual(oo, jit_oo) + self.assertGraphContains(t_jit.graph_for(x, y, scale, z), FUSION_GROUP) + + x = x.to(memory_format=torch.channels_last) + y = y.to(memory_format=torch.channels_last) + z = z.to(memory_format=torch.channels_last) + jit_o = t_jit(x, y, scale, z) + jit_o = t_jit(x, y, scale, z) + o = t(x, y, scale, z) + for oo, jit_oo in zip(o, jit_o): + self.assertEqual(oo.dtype, jit_oo.dtype) + self.assertEqual(oo, jit_oo) + self.assertGraphContains(t_jit.graph_for(x, y, scale, z), FUSION_GROUP) + @unittest.skipIf(not RUN_CUDA, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp index ee58eaa9245e8..720ea588b0dd8 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp @@ -67,15 +67,9 @@ void debugPrint(const TensorTypePtr& type) { at::DimVector graphReductionAxes(const std::shared_ptr& graph) { at::DimVector reduction_axes; + // TODO: let check that we have only single reduction node in the graph. for (const auto& n : graph->nodes()) { if (isReductionNode(n)) { - // TODO: I think this is enough to detect reduction that's not the output - // as well. Since we go in topological order, we would run into - // intermediate reduction, if there's any. 
- TORCH_INTERNAL_ASSERT( - graph->outputs().size() == 1 && graph->outputs()[0] == n->output(), - "support for graph with reduction is limited to single output from reduction node"); - // TODO: we should return empty when `keepdim` is True? auto dims_list = constant_as>(n->input(1)); TORCH_INTERNAL_ASSERT( @@ -294,11 +288,7 @@ GraphCache::InputsRequirement::InputsRequirement( vec_optional_ttp.emplace_back(c10::nullopt); } } - input_permutation_ = getPermutationPerSortedStride(acc_type); - output_permutation_ = inversePermutation(input_permutation_, reduction_axes); - TORCH_CHECK( - acc_type->device().has_value(), "requires fixed device for all inputs"); - device_ = acc_type->device(); + extractPermutation(acc_type, reduction_axes); } GraphCache::InputsRequirement::InputsRequirement( @@ -325,11 +315,7 @@ GraphCache::InputsRequirement::InputsRequirement( vec_optional_ttp.emplace_back(c10::nullopt); } } - input_permutation_ = getPermutationPerSortedStride(acc_type); - output_permutation_ = inversePermutation(input_permutation_, reduction_axes); - TORCH_CHECK( - acc_type->device().has_value(), "requires fixed device for all inputs"); - device_ = acc_type->device(); + extractPermutation(acc_type, reduction_axes); } bool GraphCache::InputsRequirement::requiresPermutation() { @@ -340,10 +326,16 @@ bool GraphCache::InputsRequirement::requiresPermutation() { } } // Check if output agrees - const size_t output_rank = output_permutation_.size(); - for (size_t i = 0; i < output_rank; i++) { + const size_t pw_output_rank = pw_output_permutation_.size(); + for (size_t i = 0; i < pw_output_rank; i++) { TORCH_INTERNAL_ASSERT( - output_permutation_[i] == (long)i, + pw_output_permutation_[i] == (long)i, + "permutation of output and input is not consistent"); + } + const size_t reduction_output_rank = reduction_output_permutation_.size(); + for (size_t i = 0; i < reduction_output_rank; i++) { + TORCH_INTERNAL_ASSERT( + reduction_output_permutation_[i] == (long)i, "permutation of output and input is not consistent"); } return false; @@ -354,7 +346,8 @@ bool GraphCache::InputsRequirement::complyWith( const InputsRequirement& expect) { if (device_ != expect.device_ || input_permutation_ != expect.input_permutation_ || - output_permutation_ != expect.output_permutation_ || + pw_output_permutation_ != expect.pw_output_permutation_ || + reduction_output_permutation_ != expect.reduction_output_permutation_ || vec_optional_ttp.size() != expect.vec_optional_ttp.size()) { return false; } @@ -419,6 +412,18 @@ bool GraphCache::InputsRequirement::complyWith( return true; } +void GraphCache::InputsRequirement::extractPermutation( + const TensorTypePtr& acc_type, + const std::vector& reduction_axes) { + input_permutation_ = getPermutationPerSortedStride(acc_type); + reduction_output_permutation_ = + inversePermutation(input_permutation_, reduction_axes); + pw_output_permutation_ = inversePermutation(input_permutation_, {}); + TORCH_CHECK( + acc_type->device().has_value(), "requires fixed device for all inputs"); + device_ = acc_type->device(); +} + FusionExecutorCache* GraphCache::appendFusionExecutorCache( const InputsRequirement& input_stack) { input_stacks_.emplace_back(input_stack); @@ -495,12 +500,6 @@ FusionExecutorCache* GraphCache::appendFusionExecutorCache( // see [ NOTE - reduction in graph ] part 2. 
for (auto n : parsing_graph->nodes()) { if (isReductionNode(n)) { - // TODO: this is mostly redundant check, but it's compile time, we - // leave it here to be safe; - TORCH_INTERNAL_ASSERT( - parsing_graph->outputs().size() == 1 && - parsing_graph->outputs()[0] == n->output(), - "supporfor graph with reduction is limited to single output from reduction node"); auto dims_list = constant_as>(n->input(1)); TORCH_INTERNAL_ASSERT( dims_list.has_value(), "reduction axes should be constant"); @@ -537,7 +536,7 @@ GraphCache::GraphCache(std::shared_ptr graph) // [ NOTE - reduction in graph ] // // reduction complicates our permutation in integration, it addes two things: - // 1. we need to adjust output_permutation_; + // 1. we need to adjust xxx_output_permutation_; // because of dimension elimination during permutation (not necessarily, // given the `keepdim` argument.) this needs to be accommodated later when // we added the support. @@ -608,8 +607,22 @@ std::vector GraphCache::runGraphWithInputs( std::vector permuted_outputs; permuted_outputs.reserve(outputs.size()); for (const auto& output : outputs) { - permuted_outputs.emplace_back( - output.permute(input_requirement->output_permutation_)); + // This is to address the issue that not all outputs from a reduction + // fusion are reduced tensor; We support intermediate tensors to be output + if (output.dim() == input_requirement->pw_output_permutation_.size()) { + permuted_outputs.emplace_back( + output.permute(input_requirement->pw_output_permutation_)); + } else if ( + output.dim() == + input_requirement->reduction_output_permutation_.size()) { + permuted_outputs.emplace_back( + output.permute(input_requirement->reduction_output_permutation_)); + } else { + TORCH_INTERNAL_ASSERT( + false, + "Something went wrong with integration permutation, can't find a consistent permutation for output in fusion", + *graph_); + } } return permuted_outputs; } else { diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.h b/torch/csrc/jit/codegen/cuda/kernel_cache.h index 02d0c9c8b1d73..5bf35333856a0 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.h +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.h @@ -150,7 +150,8 @@ class GraphCache { // common permutation order used for dimension coalescing; at::DimVector input_permutation_; - at::DimVector output_permutation_; + at::DimVector pw_output_permutation_; + at::DimVector reduction_output_permutation_; // construct InputsRequirement from `Graph`, this is used for constructing // `GraphCache` entry using profiling record @@ -170,6 +171,12 @@ class GraphCache { // helper function used at run-time to check whether a common permutation is // present, this is used to take the short-cut to skip permutation logic. bool requiresPermutation(); + + // extract permutation for input output tensor from accumulcated tensor type + // pointer on all inputs; + void extractPermutation( + const TensorTypePtr& acc_type, + const std::vector& reduction_axes); }; // construct FusionExecutorCache per InputsRequirement. 
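
The permutation handling added in the patch above can be summarized with a small sketch: given the permutation that was applied to the inputs and the set of reduced axes, compute the permutation that restores the original dimension order for an output. This is only an illustration of the idea; the function name and the convention that reduction axes index the original dimensions are assumptions made here, and the real inversePermutation in kernel_cache.cpp may differ in detail.

#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch only: compute the permutation that undoes input_permutation for an
// output tensor, dropping any dimensions removed by a reduction. The
// reduction axes are assumed to index the *original* dimensions.
std::vector<int64_t> inversePermutationSketch(
    const std::vector<int64_t>& input_permutation,
    const std::vector<int64_t>& reduction_axes) {
  const size_t rank = input_permutation.size();

  // Which original dimensions survive the reduction.
  std::vector<bool> kept(rank, true);
  for (auto axis : reduction_axes) {
    kept[axis] = false;
  }

  // Position of each surviving original dimension in the permuted (and
  // possibly reduced) output, walking the permuted order.
  std::vector<int64_t> position_in_output(rank, -1);
  int64_t pos = 0;
  for (auto original_dim : input_permutation) {
    if (kept[original_dim]) {
      position_in_output[original_dim] = pos++;
    }
  }

  // For each surviving original dimension, in original order, record where it
  // currently sits; output.permute(result) then restores the original order.
  std::vector<int64_t> result;
  for (size_t original_dim = 0; original_dim < rank; ++original_dim) {
    if (kept[original_dim]) {
      result.push_back(position_in_output[original_dim]);
    }
  }
  return result;
}

Computed once with the reduction axes and once with an empty axis list, this yields the two permutations (reduction_output_permutation_ and pw_output_permutation_) that the integration chooses between at runtime based on each output's rank.
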
From 151cdb48aad59cbc72e6cf2b9e74914b8762d1a0 Mon Sep 17 00:00:00 2001 From: Lemo Date: Fri, 4 Sep 2020 14:59:57 -0700 Subject: [PATCH 033/167] Minor cleanup --- torch/csrc/jit/codegen/cuda/fusion.cpp | 4 ++-- torch/csrc/jit/codegen/cuda/fusion.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/fusion.cpp b/torch/csrc/jit/codegen/cuda/fusion.cpp index 3ac4c95584d13..f6b64d32326e9 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.cpp +++ b/torch/csrc/jit/codegen/cuda/fusion.cpp @@ -228,7 +228,7 @@ void Fusion::removeVal(Val* val) { delete val; } -void Fusion::addInput(Val* const input) { +void Fusion::addInput(Val* input) { assertInFusion(input, "Cannot register input "); if (input->getValType().value() == ValType::TensorView) { @@ -251,7 +251,7 @@ void Fusion::addInput(Val* const input) { inputs_.push_back(input); } -void Fusion::addOutput(Val* const output) { +void Fusion::addOutput(Val* output) { assertInFusion(output, "Cannot register output "); if (output->getValType().value() == ValType::TensorView) { auto tv = output->as(); diff --git a/torch/csrc/jit/codegen/cuda/fusion.h b/torch/csrc/jit/codegen/cuda/fusion.h index 52c12763f0e7c..0f1dd20a9cac5 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.h +++ b/torch/csrc/jit/codegen/cuda/fusion.h @@ -90,10 +90,10 @@ class TORCH_CUDA_API Fusion final { void removeVal(Val* val); // Register input as an input of the fusion - void addInput(Val* const input); + void addInput(Val* input); // Register output as an output of the fusion - void addOutput(Val* const output); + void addOutput(Val* output); // Check if stmt is properly registered with this fusion bool inFusion(const Statement* stmt) const; From de78fd8bacc9a30c0ba9bc36c25b092c37219375 Mon Sep 17 00:00:00 2001 From: jjsjann123 Date: Tue, 8 Sep 2020 10:40:12 -0700 Subject: [PATCH 034/167] Cache eviction pr (#343) Simple implementation on LRU cache eviction. Something to note: We only evict the entries of short cut lookup table, but not the compiled kernel. Because compiled kernels for a computation graph is a very limited number. In the contrary, lookup table is bound to a given input set and could grow indefinitely with input size / stride, hence the need for lookup eviction. 
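
To make the scheme described above concrete, here is a minimal, self-contained sketch of an LRU id lookup of this kind. It is an illustration under assumptions, not the fuser's InputsIdLookup (that implementation appears in the diff below): the class name, the string "signature" key, and the fixed capacity are placeholders, and the eviction notice mirrors the idea that only lookup entries are dropped while compiled kernels stay cached.

#include <list>
#include <string>
#include <unordered_map>

// Sketch only: map an input "signature" string to a stable id, keep a
// most-recently-used list, and report which id was evicted once the cache
// exceeds a fixed capacity so that nested caches can drop their entries too.
class LruIdLookupSketch {
 public:
  explicit LruIdLookupSketch(size_t max_size) : max_size_(max_size) {}

  struct Result {
    size_t id = 0;
    bool evicted = false;
    size_t evicted_id = 0;
  };

  Result lookup(const std::string& signature) {
    Result result;
    auto it = entries_.find(signature);
    if (it != entries_.end()) {
      // Cache hit: move this signature to the front of the LRU list.
      lru_.splice(lru_.begin(), lru_, it->second.lru_pos);
      result.id = it->second.id;
      return result;
    }
    if (entries_.size() == max_size_) {
      // Capacity reached: drop the least recently used signature and tell the
      // caller which id it owned, so dependent caches can be cleaned up.
      auto victim_it = entries_.find(lru_.back());
      result.evicted = true;
      result.evicted_id = victim_it->second.id;
      entries_.erase(victim_it);
      lru_.pop_back();
    }
    // Cache miss: assign a fresh id and mark the signature most recently used.
    lru_.push_front(signature);
    Entry entry{next_id_++, lru_.begin()};
    entries_.emplace(signature, entry);
    result.id = entry.id;
    return result;
  }

 private:
  struct Entry {
    size_t id;
    std::list<std::string>::iterator lru_pos;
  };

  const size_t max_size_;
  size_t next_id_ = 1;
  std::list<std::string> lru_;
  std::unordered_map<std::string, Entry> entries_;
};

Returning the evicted id is what lets the nested caches (GraphCache, FusionExecutorCache, FusionExecutor) erase their own per-id shortcut entries without discarding any compiled kernel, matching the note above that only the lookup-table entries are evicted.
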
--- test/cpp/jit/test_gpu.cpp | 39 ++++++++++++ test/cpp/jit/tests.h | 3 +- torch/csrc/jit/codegen/cuda/executor.h | 4 ++ torch/csrc/jit/codegen/cuda/kernel_cache.cpp | 48 +++++++++++++-- torch/csrc/jit/codegen/cuda/kernel_cache.h | 65 ++++++++++++++++++-- 5 files changed, 147 insertions(+), 12 deletions(-) diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index a56c7166a5dcb..00296512af076 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -6707,6 +6708,44 @@ void testGPU_FusionComputeAtMultiBCast() { ASSERT_ANY_THROW(tv1->computeAt(tv3, -1)); } +void testGPU_FusionInputsIdLookup() { + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({16, 8, 8}, options); + at::Tensor t1 = at::randn({8, 8}, options); + at::Tensor t2 = at::randn({6, 4}, options); + + // create a cache with max size 2; + auto inputs_id_lookup = torch::jit::fuser::cuda::InputsIdLookup(2); + + // testing basic function, same encoding for identical inputs + auto id_0 = inputs_id_lookup.lookupId({t0, t1, 5.0}); + auto id_0_lookup = inputs_id_lookup.lookupId({t0, t1, 2.5}); + TORCH_CHECK(id_0.id == id_0_lookup.id); + TORCH_CHECK(inputs_id_lookup.size() == 1); + TORCH_CHECK(id_0.eviction == false); + + // new input (even tho same shape, but we have different signature because of + // missing scalar input + auto id_1 = inputs_id_lookup.lookupId({t0, t1}); + auto id_1_lookup = inputs_id_lookup.lookupId({t0, t1}); + TORCH_CHECK(id_1.id == id_1_lookup.id); + TORCH_CHECK(inputs_id_lookup.size() == 2); + TORCH_CHECK(id_1.eviction == false); + + // eviction should happen at this point + auto id_2 = inputs_id_lookup.lookupId({t2, t1}); + TORCH_CHECK(id_2.id != id_0.id); + TORCH_CHECK(id_2.id != id_1.id); + TORCH_CHECK(inputs_id_lookup.size() == 2); + TORCH_CHECK(id_2.eviction == true); + TORCH_CHECK(id_2.evict_id == id_0.id); + + // look at input 1 again + auto id_1_relook = inputs_id_lookup.lookupId({t0, t1}); + TORCH_CHECK(id_1_relook.id == id_1.id); + TORCH_CHECK(id_1_relook.eviction == false); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 62f3f20f9af7c..8e2e0a50e5ebc 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -227,7 +227,8 @@ namespace jit { _(GPU_FusionBranches) \ _(GPU_FusionThreadPredicate) \ _(GPU_FusionLSTMCell) \ - _(GPU_FusionComputeAtMultiBCast) + _(GPU_FusionComputeAtMultiBCast) \ + _(GPU_FusionInputsIdLookup) #else #define TH_FORALL_TESTS_CUDA(_) \ _(ArgumentSpec) \ diff --git a/torch/csrc/jit/codegen/cuda/executor.h b/torch/csrc/jit/codegen/cuda/executor.h index 7f1915789caaf..dc2972457489e 100644 --- a/torch/csrc/jit/codegen/cuda/executor.h +++ b/torch/csrc/jit/codegen/cuda/executor.h @@ -52,6 +52,10 @@ class TORCH_CUDA_API FusionExecutor : public NonCopyable { return fusion_id_ != -1; }; + void evictCache(size_t cache_id) { + executor_entry_lookup_.erase(cache_id); + } + // TODO: strides would also be important when we handle permutations in // codegen. 
// struct used to hold necessary information to launch compiled kernel on a diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp index 720ea588b0dd8..94389c47970c2 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp @@ -176,7 +176,9 @@ at::DimVector inversePermutation( } // namespace -size_t InputsIdLookup::getCode(const at::ArrayRef& inputs) { +InputsIdLookup::IdLookupReturn InputsIdLookup::lookupId( + const at::ArrayRef& inputs) { + IdLookupReturn ret; std::stringstream encoded_inputs; for (const auto& input : inputs) { if (input.isTensor()) { @@ -199,11 +201,33 @@ size_t InputsIdLookup::getCode(const at::ArrayRef& inputs) { encoded_inputs << ";s"; } } - auto& iter = encoding_lookup_[encoded_inputs.str()]; - if (iter == 0) { - iter = current_id_++; + auto& id_iter_pair = encoding_lookup_[encoded_inputs.str()]; + + // short-cut to leave LRU entry as is; + if (id_iter_pair.lru_iter == used_entry_.begin()) { + ret.id = id_iter_pair.id; + return ret; + } + + if (id_iter_pair.id == 0) { + // no entry existed for given input set, set id for given entry + id_iter_pair.id = current_id_++; + if (used_entry_.size() == max_cache_size_) { + // pop least recently used cache; + const auto& remove_iter = encoding_lookup_.find(used_entry_.back()); + used_entry_.pop_back(); + ret.evict_id = remove_iter->second.id; + ret.eviction = true; + encoding_lookup_.erase(remove_iter); + } + } else { + used_entry_.erase(id_iter_pair.lru_iter); } - return iter; + + ret.id = id_iter_pair.id; + id_iter_pair.lru_iter = + used_entry_.insert(used_entry_.begin(), encoded_inputs.str()); + return ret; } FusionExecutorCache::FusionExecutorCache( @@ -556,7 +580,19 @@ GraphCache::GraphCache(std::shared_ptr graph) std::vector GraphCache::runGraphWithInputs( const at::ArrayRef& inputs) { // get unique id `unique_id` for given input set `inputs`; - const size_t unique_id = inputs_id_lookup_.getCode(inputs); + auto id_lookup_ret = inputs_id_lookup_.lookupId(inputs); + const size_t unique_id = id_lookup_ret.id; + + // if we went over the cache size for short-cut, we evict entries using LRU; + if (id_lookup_ret.eviction) { + auto index_lookup_iter = code_to_index_lookup_.find(id_lookup_ret.evict_id); + TORCH_INTERNAL_ASSERT( + index_lookup_iter != code_to_index_lookup_.end(), + "evicting cache entry not found in lookup table"); + // evict nested cache in FusionExecutorCache + fe_cache_[index_lookup_iter->second]->evictCache(index_lookup_iter->first); + code_to_index_lookup_.erase(index_lookup_iter); + } FusionExecutorCache* fusion_executor_cache = nullptr; diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.h b/torch/csrc/jit/codegen/cuda/kernel_cache.h index 5bf35333856a0..ff787688f6eb3 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.h +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.h @@ -15,18 +15,62 @@ namespace jit { namespace fuser { namespace cuda { +// encoding an input set to unique id, which is used to short-cut cache entry +// selection in our nested cache implementation to cut off overhead. +// +// We have implemented naive LRU cache eviction policy here, since each entry in +// `InputsIdLookup` is attached to a static input shape/stride, and could grow +// gigantic when we have input shapes that does not stabalize to a finite set. +// // Note, the uniqueness of the ide generated for a given input set is only local // to the instance of `InputsIdLookup`. 
-class InputsIdLookup { +TORCH_CUDA_API class InputsIdLookup { public: - // encode each unique input sets to an unique id; - size_t getCode(const at::ArrayRef& inputs); + // constructor where maximum cache size is fixed during init + explicit InputsIdLookup(size_t max_cache_size = 10) + : max_cache_size_(max_cache_size){}; + + // struct to hold return value for lookupId. + struct IdLookupReturn { + size_t id = 0; + size_t evict_id = 0; + bool eviction = false; + }; + + // encode each input sets to with an unique id; + // Returned data structure also indicates whether eviction has happened within + // the lookup cache. This is needed because lookup shortcut is also cached in + // nested `GraphCache`, `FusionExecutorCache` and `FusionExecutor`. + // see [ Note -- 2 level cache implementation ] + TORCH_CUDA_API IdLookupReturn lookupId(const at::ArrayRef& inputs); + + // debugging API + size_t size() const { + return encoding_lookup_.size(); + } private: + // entry stored in `encoding_lookup_` to implement LRU + struct EncodingEntry { + size_t id; + std::list::iterator lru_iter; + }; + + // maximum cache size for LRU + const size_t max_cache_size_; + + // next available unique id, we monotonically increase `current_id_` avoid + // conflicts size_t current_id_ = 1; - // TODO: change this to a trie for efficiency; - std::unordered_map encoding_lookup_; + // entry in the cache, This is used to implement LRU cache, where entries in + // the list is ordered by their recent usage (freshly used entry is placed at + // the beginning) + std::list used_entry_; + + // map from `std::string` to a unique id `size_t` (packaged in `EncodingEntry` + // ). We store an iterator to `used_entry_` to implement LRU + std::unordered_map encoding_lookup_; }; // [ Note -- 2 level cache implementation ] @@ -83,6 +127,17 @@ class FusionExecutorCache { const at::ArrayRef& inputs, size_t unique_id); + // evict cached short cut entry in `code_to_fe_lookup_`; + inline void evictCache(size_t cache_id) { + auto iter = code_to_fe_lookup_.find(cache_id); + TORCH_INTERNAL_ASSERT( + iter != code_to_fe_lookup_.end(), + "evict cache failed to find an entry"); + // evict nested lookup entry in nested FusionExecutor + (iter->second)->evictCache(cache_id); + code_to_fe_lookup_.erase(iter); + }; + private: // device_ where compiled binaries are loaded on & inputs are expected to // reside; From 255e52ed61549195ad495a41945a671271366bb1 Mon Sep 17 00:00:00 2001 From: Lemo Date: Tue, 8 Sep 2020 15:14:43 -0700 Subject: [PATCH 035/167] Factor out the code generation and kernel state --- .gitignore | 1 + caffe2/CMakeLists.txt | 1 + test/cpp/jit/test_gpu.cpp | 52 +++---- tools/build_variables.bzl | 1 + torch/csrc/jit/codegen/cuda/codegen.cpp | 39 ++++++ torch/csrc/jit/codegen/cuda/codegen.h | 22 +++ torch/csrc/jit/codegen/cuda/executor.cpp | 22 +-- torch/csrc/jit/codegen/cuda/executor.h | 1 + torch/csrc/jit/codegen/cuda/fusion.cpp | 4 +- torch/csrc/jit/codegen/cuda/ir_iostream.cpp | 2 + torch/csrc/jit/codegen/cuda/kernel.cpp | 75 +++++++++- torch/csrc/jit/codegen/cuda/kernel.h | 43 +++++- torch/csrc/jit/codegen/cuda/lower2device.cpp | 137 ++----------------- torch/csrc/jit/codegen/cuda/lower2device.h | 37 +---- torch/csrc/jit/codegen/cuda/lower_index.h | 1 + 15 files changed, 229 insertions(+), 209 deletions(-) create mode 100644 torch/csrc/jit/codegen/cuda/codegen.cpp create mode 100644 torch/csrc/jit/codegen/cuda/codegen.h diff --git a/.gitignore b/.gitignore index 01739b3d92dd6..1f4b83dd7439d 100644 --- a/.gitignore +++ b/.gitignore @@ 
-187,6 +187,7 @@ build_android build_ios /build_* .build_debug/* +.build_profile/* .build_release/* distribute/* *.testbin diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 2f189614b2ea3..9a39a85ccf596 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -478,6 +478,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/arith.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/compute_at.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/codegen.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/dispatch.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/expr_evaluator.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/executor.cpp diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index a56c7166a5dcb..852988bdbba7a 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -30,7 +31,6 @@ namespace torch { namespace jit { -using namespace torch::jit::fuser; using namespace torch::jit::fuser; namespace { @@ -361,8 +361,6 @@ void testGPU_FusionExprEvalPostLower() { // Lower GpuLower gpulw(&fusion); - std::stringstream kernel; - gpulw.printKernel(kernel); // 1. Create an evaluation context StatefulExpressionEvaluator evaluator(&fusion); @@ -506,10 +504,12 @@ void testGPU_FusionCopy() { ASSERT_EQ(original_ir.str(), clone_ir.str()); // Lower original fusion - std::stringstream original_kernel; + std::string original_kernel; { - GpuLower lower(&original_fusion); - lower.printKernel(original_kernel); + // TODO(kir): remove this guard once we implement the cuda codegen visitor + FusionGuard fg(&original_fusion); + original_kernel = + codegen::generateCudaKernel(GpuLower(&original_fusion).kernel()); } // Make sure the "before lowering" clone was not mutated @@ -530,12 +530,14 @@ void testGPU_FusionCopy() { ASSERT_EQ(original_lowered_ir.str(), clone_lowered_ir.str()); // Lower the "before lowering" and compare kernels - std::stringstream clone_kernel; + std::string clone_kernel; { - GpuLower lower(&before_lowering); - lower.printKernel(clone_kernel); + // TODO(kir): remove this guard once we implement the cuda codegen visitor + FusionGuard fg(&before_lowering); + clone_kernel = + codegen::generateCudaKernel(GpuLower(&before_lowering).kernel()); } - ASSERT_EQ(original_kernel.str(), clone_kernel.str()); + ASSERT_EQ(original_kernel, clone_kernel); } void testGPU_FusionMove() { @@ -594,9 +596,7 @@ void testGPU_FusionMove() { ASSERT_EQ(original_ir.str(), another_ir.str()); // Lower the fusion IR - std::stringstream kernel; GpuLower lower(&another_fusion); - lower.printKernel(kernel); std::stringstream lowered_ir; lowered_ir << another_fusion; @@ -1143,8 +1143,8 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Te } )"; - std::string actual_kernel = GpuLower(fusion.get()).getKernel(); - actual_kernel = "\n" + actual_kernel; + const std::string actual_kernel = "\n" + + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel()); if (expected_kernel.size() != actual_kernel.size() || expected_kernel.compare(actual_kernel) != 0) { std::cerr @@ -1528,11 +1528,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); auto outputs = fe.runFusion({t0}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(outputs[0], t5), actual_kernel.str()); + TORCH_CHECK(at::allclose(outputs[0], t5)); TORCH_CHECK(at::allclose(outputs[1], t6)); } @@ -1588,11 
+1584,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); fe.runFusion({t0, t1}, {kernel_tv3}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(kernel_tv3, t3), actual_kernel.str()); + TORCH_CHECK(at::allclose(kernel_tv3, t3)); } // Case 4 @@ -1658,11 +1650,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); auto outputs = fe.runFusion({t0, t1, t2, t3}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(outputs[0], t6), actual_kernel.str()); + TORCH_CHECK(at::allclose(outputs[0], t6)); } // Case 5 @@ -2178,11 +2166,7 @@ void testGPU_FusionScalarInputs() { at::Scalar(fl3)}, {kernel_tv4}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(kernel_tv4, t4), actual_kernel.str()); + TORCH_CHECK(at::allclose(kernel_tv4, t4)); } void testGPU_FusionLoopUnroll() { diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 8b6c6fdeb26ac..7649fe93bf325 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -337,6 +337,7 @@ libtorch_cuda_sources = [ "torch/csrc/autograd/functions/comm.cpp", "torch/csrc/jit/codegen/cuda/arith.cpp", "torch/csrc/jit/codegen/cuda/compute_at.cpp", + "torch/csrc/jit/codegen/cuda/codegen.cpp", "torch/csrc/jit/codegen/cuda/dispatch.cpp", "torch/csrc/jit/codegen/cuda/expr_evaluator.cpp", "torch/csrc/jit/codegen/cuda/executor.cpp", diff --git a/torch/csrc/jit/codegen/cuda/codegen.cpp b/torch/csrc/jit/codegen/cuda/codegen.cpp new file mode 100644 index 0000000000000..db15f42f22e31 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/codegen.cpp @@ -0,0 +1,39 @@ + +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace codegen { + +std::string generateCudaKernel( + const Kernel* kernel, + const std::string& kernel_name) { + const auto& allocations = kernel->globalAllocations(); + std::vector global_tensors(allocations.size()); + std::transform( + allocations.begin(), + allocations.end(), + global_tensors.begin(), + [](kir::Allocate* alloc) { return alloc->buffer(); }); + + std::stringstream ss; + + IRPrinter ir_printer(ss); + ir_printer.printKernel( + kernel->exprs(), + kernel_name, + global_tensors, + !kernel->dynamicAllocations().empty()); + + return ss.str(); +} + +} // namespace codegen +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/codegen.h b/torch/csrc/jit/codegen/cuda/codegen.h new file mode 100644 index 0000000000000..0e5f2cc2ebf56 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/codegen.h @@ -0,0 +1,22 @@ + +#pragma once + +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace codegen { + +TORCH_CUDA_API std::string generateCudaKernel( + const Kernel* kernel, + const std::string& kernel_name = "CUDAGeneratedKernel"); + +} // namespace codegen +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp index 7c713b3640d25..42fa6373749ba 100644 --- a/torch/csrc/jit/codegen/cuda/executor.cpp +++ b/torch/csrc/jit/codegen/cuda/executor.cpp @@ -1,4 +1,5 @@ +#include #include #include #include @@ -91,13 +92,14 @@ void FusionExecutor::compileFusion(Fusion* fusion, CompileOptions options) { has_grid_reductions = 
fusion_.hasGridReduction(); has_block_broadcasts = fusion_.hasBlockBroadcast(); lowered_ = GpuLower(&fusion_); - const auto kernel = lowered_.getKernel(kernelName()); - const auto structured_code = getStructuredCode(kernel); + const auto kernel = lowered_.kernel(); + const auto kernel_code = codegen::generateCudaKernel(kernel, kernelName()); + const auto structured_code = getStructuredCode(kernel_code); - if (lowered_.static_allocations().size() > 0) { + if (kernel->staticAllocations().size() > 0) { StatefulExpressionEvaluator static_evaluator(&fusion_); unsigned static_smem_size = - computeSharedMemory(static_evaluator, lowered_.static_allocations()); + computeSharedMemory(static_evaluator, kernel->staticAllocations()); TORCH_INTERNAL_ASSERT( static_smem_size < max_device_smem, "The static shared memory allocation is larger than available memory."); @@ -254,11 +256,13 @@ LaunchParams FusionExecutor::computeLaunchParams( launch_params.bdimy() * launch_params.bdimz(); } - uint64_t dynamic_smem_size = computeSharedMemory( - see, lowered_.dynamic_allocations(), true, reduction_broadcast_workspace); + const auto kernel = lowered_.kernel(); - uint64_t static_smem_size = - computeSharedMemory(see, lowered_.static_allocations()); + const uint64_t dynamic_smem_size = computeSharedMemory( + see, kernel->dynamicAllocations(), true, reduction_broadcast_workspace); + + const uint64_t static_smem_size = + computeSharedMemory(see, kernel->staticAllocations()); TORCH_INTERNAL_ASSERT( (dynamic_smem_size + static_smem_size) < max_device_smem, @@ -271,7 +275,7 @@ LaunchParams FusionExecutor::computeLaunchParams( FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals( StatefulExpressionEvaluator& see) { GlobalBuffers global_buffers; - for (auto alloc : lowered_.global_allocations()) { + for (auto alloc : lowered_.kernel()->globalAllocations()) { TORCH_INTERNAL_ASSERT( alloc->buffer()->getValType() == ValType::KirTensorView, "Cannot allocate global buffers that are not tensors."); diff --git a/torch/csrc/jit/codegen/cuda/executor.h b/torch/csrc/jit/codegen/cuda/executor.h index 7f1915789caaf..6c9e29c3e875a 100644 --- a/torch/csrc/jit/codegen/cuda/executor.h +++ b/torch/csrc/jit/codegen/cuda/executor.h @@ -31,6 +31,7 @@ class TORCH_CUDA_API FusionExecutor : public NonCopyable { const std::string& name, int id, CompileOptions options = CompileOptions()); + void compileFusion(Fusion* fusion, CompileOptions options = CompileOptions()); std::vector runFusion( diff --git a/torch/csrc/jit/codegen/cuda/fusion.cpp b/torch/csrc/jit/codegen/cuda/fusion.cpp index f6b64d32326e9..4ed72d477e7e2 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.cpp +++ b/torch/csrc/jit/codegen/cuda/fusion.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -345,8 +346,7 @@ void Fusion::print() { } void Fusion::printKernel() { - GpuLower lower(this); - lower.printKernel(std::cout); + std::cout << codegen::generateCudaKernel(GpuLower(this).kernel()); } void Fusion::printMath() { diff --git a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp index d3d7f1099fd4c..d739b91c76ba1 100644 --- a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp @@ -970,6 +970,8 @@ class ReductionOps : OptOutDispatch { void IRPrinter::printReductionOps(Fusion* fusion) { FusionGuard fg(fusion); + + // TODO(kir): we shouldn't be creating new nodes during printing auto a = new NamedScalar("a", DataType::Null); auto b = new NamedScalar("b", DataType::Null); for (auto 
rop_pair : ReductionOps::get(fusion)) { diff --git a/torch/csrc/jit/codegen/cuda/kernel.cpp b/torch/csrc/jit/codegen/cuda/kernel.cpp index 284bcffda7fb6..971d011cca0de 100644 --- a/torch/csrc/jit/codegen/cuda/kernel.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel.cpp @@ -1,11 +1,84 @@ +#include #include namespace torch { namespace jit { namespace fuser { -void Kernel::print() const {} +namespace { + +class BuffersExtractor final : OptOutDispatch { + public: + explicit BuffersExtractor(const std::vector& exprs) { + for (auto expr : exprs) { + handle(expr); + } + } + + const auto& globalAllocs() const { + return global_allocations_; + } + + const auto& dynamicAllocs() const { + return dynamic_allocations_; + } + + const auto& staticAllocs() const { + return static_allocations_; + } + + private: + void handle(Expr* expr) final { + OptOutDispatch::handle(expr); + } + + void handle(kir::ForLoop* fl) final { + for (auto expr : fl->body().exprs()) { + OptOutDispatch::handle(expr); + } + } + + void handle(kir::IfThenElse* ite) final { + for (auto expr : ite->body().exprs()) { + OptOutDispatch::handle(expr); + } + for (auto expr : ite->elseBody().exprs()) { + OptOutDispatch::handle(expr); + } + } + + void handle(kir::Allocate* a) final { + switch (a->getMemoryType()) { + case MemoryType::Global: + global_allocations_.push_back(a); + break; + case MemoryType::Shared: + if (a->size()->isConstScalar()) { + static_allocations_.push_back(a); + } else { + dynamic_allocations_.push_back(a); + } + break; + case MemoryType::Local: + break; + } + } + + private: + std::vector global_allocations_; + std::vector dynamic_allocations_; + std::vector static_allocations_; +}; + +} // namespace + +Kernel::Kernel(const std::vector& exprs) : exprs_(exprs) { + BuffersExtractor buffers_extractor(exprs); + global_allocations_ = buffers_extractor.globalAllocs(); + dynamic_smem_allocations_ = buffers_extractor.dynamicAllocs(); + static_smem_allocations_ = buffers_extractor.staticAllocs(); +} } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/kernel.h b/torch/csrc/jit/codegen/cuda/kernel.h index 73774e6f85fb8..6ce65f6138b8e 100644 --- a/torch/csrc/jit/codegen/cuda/kernel.h +++ b/torch/csrc/jit/codegen/cuda/kernel.h @@ -3,21 +3,54 @@ #include #include +#include +#include +#include #include namespace torch { namespace jit { namespace fuser { -class TORCH_CUDA_API Kernel final { +// Container for a lowered Kernel IR +// +// TODO(kir): currently, it is just pointing to nodes owned +// by a Fusion object. 
The goal is to have the Kernel object +// own the Kernel IR nodes +// +class TORCH_CUDA_API Kernel final : public NonCopyable { public: - void print() const; + explicit Kernel(const std::vector& exprs); + + const auto& globalAllocations() const { + return global_allocations_; + } + + const auto& dynamicAllocations() const { + return dynamic_smem_allocations_; + } + + const auto& staticAllocations() const { + return static_smem_allocations_; + } + + const auto& exprs() const { + return exprs_; + } private: - // Lowered IR - std::unordered_set lowered_val_set_; - std::unordered_set lowered_expr_set_; + // List of global buffers + std::vector global_allocations_; + + // List of dynamic shared memory buffers + std::vector dynamic_smem_allocations_; + + // List of static shared memory buffers + std::vector static_smem_allocations_; + + // Lowered expressions + std::vector exprs_; }; } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower2device.cpp b/torch/csrc/jit/codegen/cuda/lower2device.cpp index 99de992b31ff7..092aa9e1a18ff 100644 --- a/torch/csrc/jit/codegen/cuda/lower2device.cpp +++ b/torch/csrc/jit/codegen/cuda/lower2device.cpp @@ -1,8 +1,6 @@ #include -#include #include -#include #include #include #include @@ -14,96 +12,10 @@ namespace torch { namespace jit { namespace fuser { -namespace { - // TODO(kir): revisit this thread_local GpuLower* active_gpu_lower = nullptr; -class BuffersExtractor : OptOutDispatch { - public: - BuffersExtractor( - const std::vector& exprs, - ThreadPredicateMap& _thread_predicates) - : thread_predicates_(_thread_predicates), has_block_broadcast_(false) { - for (auto expr : exprs) { - handle(expr); - } - } - - std::vector getGlobalAllocs() { - return global_allocations_; - } - - std::vector getDynamicAllocs() { - return dynamic_allocations_; - } - - std::vector getStaticAllocs() { - return static_allocations_; - } - - bool hasBlockBroadcast() { - return has_block_broadcast_; - } - - private: - ThreadPredicateMap& thread_predicates_; - bool has_block_broadcast_; - std::vector global_allocations_; - std::vector dynamic_allocations_; - std::vector static_allocations_; - - void handle(Expr* expr) final { - OptOutDispatch::handle(expr); - } - - void handle(kir::ForLoop* fl) final { - for (auto expr : fl->body().exprs()) { - OptOutDispatch::handle(expr); - } - } - - void handle(kir::IfThenElse* ite) final { - for (auto expr : ite->body().exprs()) { - OptOutDispatch::handle(expr); - } - - for (auto expr : ite->elseBody().exprs()) { - OptOutDispatch::handle(expr); - } - } - - void handle(kir::BroadcastOp* bop) final { - const ir_utils::ParallelTypeBitmap domains = - ir_utils::getParallelBroadcastDomains(bop->out(), thread_predicates_); - const bool thread_x = domains.get(ParallelType::TIDx); - const bool thread_y = domains.get(ParallelType::TIDy); - const bool thread_z = domains.get(ParallelType::TIDz); - const bool block_broadcast_needed = thread_x || thread_y || thread_z; - has_block_broadcast_ |= block_broadcast_needed; - } - - void handle(kir::Allocate* a) final { - switch (a->getMemoryType()) { - case MemoryType::Global: - global_allocations_.push_back(a); - break; - case MemoryType::Shared: - if (a->size()->isConstScalar()) { - static_allocations_.push_back(a); - } else { - dynamic_allocations_.push_back(a); - } - break; - case MemoryType::Local: - break; - } - } -}; - -} // namespace - -void GpuLower::buildSizesMap() { +void GpuLower::replaceSymbolicSizes() { // Grab inputs and outputs // TODO: Only run through inputs for the size map, outputs don't 
actually set // any sizes of the problem. @@ -177,7 +89,7 @@ void GpuLower::lower() { // prepare for lowering validateIr(fusion_); - buildSizesMap(); + replaceSymbolicSizes(); // Compute thread predicates ThreadPredicateMap preds(fusion_); @@ -193,48 +105,19 @@ void GpuLower::lower() { const auto indexed_loops = IndexLowering::getIndexedExprs(fusion_, unrolled_loops); - // Store the final lowered IR - lowered_exprs_ = indexed_loops; - - // Get allocations - BuffersExtractor be(lowered_exprs_, preds); - global_allocations_ = be.getGlobalAllocs(); - dynamic_smem_allocations_ = be.getDynamicAllocs(); - static_smem_allocations_ = be.getStaticAllocs(); -} - -// Traverse through the fusion and print CUDA code associated with it -std::ostream& GpuLower::printKernel( - std::ostream& os, - const std::string& kernel_name) { - FusionGuard fg(fusion_); - - std::vector allocs; - allocs.insert( - allocs.end(), global_allocations_.begin(), global_allocations_.end()); - - std::vector global_tensors(allocs.size(), nullptr); - std::transform( - allocs.begin(), - allocs.end(), - global_tensors.begin(), - [](kir::Allocate* alloc) { return alloc->buffer(); }); - - bool hasDynamicSmem = dynamic_smem_allocations_.size() > 0; - - IRPrinter irp(os); - irp.printKernel(lowered_exprs_, kernel_name, global_tensors, hasDynamicSmem); - return os; + // We now have the lowered expressions, store the final lowered Kernel IR + kernel_ = std::make_unique(indexed_loops); } -std::string GpuLower::getKernel(const std::string& kernel_name) { - std::stringstream ss; - printKernel(ss, kernel_name); - return ss.str(); +Kernel* GpuLower::kernel() const { + TORCH_CHECK(kernel_); + return kernel_.get(); } // Maps Fusion IR nodes to the Kernel IR counterparts -// (this is a interim solution for easing the Kernel IR splitting) +// +// TODO(kir): this is a interim solution for easing the Kernel IR splitting +// class TORCH_CUDA_API GpuLower::KernelIrMapper : private OptInConstDispatch { public: explicit KernelIrMapper(GpuLower* gpu_lower) : gpu_lower_(gpu_lower) {} diff --git a/torch/csrc/jit/codegen/cuda/lower2device.h b/torch/csrc/jit/codegen/cuda/lower2device.h index e0908f26d74c2..f7d65c8c7ba9a 100644 --- a/torch/csrc/jit/codegen/cuda/lower2device.h +++ b/torch/csrc/jit/codegen/cuda/lower2device.h @@ -1,3 +1,4 @@ + #pragma once #include @@ -6,6 +7,7 @@ #include #include +#include #include namespace torch { @@ -22,24 +24,7 @@ class TORCH_CUDA_API GpuLower { lower(); } - // print generated code to ostream - std::ostream& printKernel( - std::ostream& _os, - const std::string& kernel_name = "CUDAGeneratedKernel"); - - std::string getKernel(const std::string& kernel_name = "CUDAGeneratedKernel"); - - std::vector global_allocations() { - return global_allocations_; - } - - std::vector dynamic_allocations() { - return dynamic_smem_allocations_; - } - - std::vector static_allocations() { - return static_smem_allocations_; - } + Kernel* kernel() const; // Converts a Fusion IR value into the Kernel IR equivalent // @@ -58,21 +43,11 @@ class TORCH_CUDA_API GpuLower { // not have this information. Since we need to have the correct information in // the kernel being fetched for shapes, we want to replace input and output // tensors to reference the runtime structure containing sizes. 
- void buildSizesMap(); + void replaceSymbolicSizes(); private: - // List of global buffers - // Allocate nodes track if it needs to be initialized to 0 - std::vector global_allocations_; - - // List of dynamic shared memory buffers - std::vector dynamic_smem_allocations_; - - // List of static shared memory buffers - std::vector static_smem_allocations_; - - // Lowered IR - std::vector lowered_exprs_; + // Lowered Kernel IR + std::unique_ptr kernel_; // Fusion IR node to Kernel IR node mapping std::unordered_map kir_map_; diff --git a/torch/csrc/jit/codegen/cuda/lower_index.h b/torch/csrc/jit/codegen/cuda/lower_index.h index ea420abdf3590..dd3e5a11c2767 100644 --- a/torch/csrc/jit/codegen/cuda/lower_index.h +++ b/torch/csrc/jit/codegen/cuda/lower_index.h @@ -2,6 +2,7 @@ #include +#include #include #include From 737a2734af080bca7117d4a46e9cb9b3cad518e0 Mon Sep 17 00:00:00 2001 From: Lemo Date: Tue, 8 Sep 2020 15:17:11 -0700 Subject: [PATCH 036/167] clang-format --- test/cpp/jit/test_gpu.cpp | 4 ++-- torch/csrc/jit/codegen/cuda/executor.h | 2 +- torch/csrc/jit/codegen/cuda/kernel.cpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index c077747dbab90..1139524aabdd1 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -1144,8 +1144,8 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Te } )"; - const std::string actual_kernel = "\n" + - codegen::generateCudaKernel(GpuLower(fusion.get()).kernel()); + const std::string actual_kernel = + "\n" + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel()); if (expected_kernel.size() != actual_kernel.size() || expected_kernel.compare(actual_kernel) != 0) { std::cerr diff --git a/torch/csrc/jit/codegen/cuda/executor.h b/torch/csrc/jit/codegen/cuda/executor.h index 7a89ff7fa7a4f..0e2d88c958b47 100644 --- a/torch/csrc/jit/codegen/cuda/executor.h +++ b/torch/csrc/jit/codegen/cuda/executor.h @@ -31,7 +31,7 @@ class TORCH_CUDA_API FusionExecutor : public NonCopyable { const std::string& name, int id, CompileOptions options = CompileOptions()); - + void compileFusion(Fusion* fusion, CompileOptions options = CompileOptions()); std::vector runFusion( diff --git a/torch/csrc/jit/codegen/cuda/kernel.cpp b/torch/csrc/jit/codegen/cuda/kernel.cpp index 971d011cca0de..88955086dc5b9 100644 --- a/torch/csrc/jit/codegen/cuda/kernel.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel.cpp @@ -1,6 +1,6 @@ -#include #include +#include namespace torch { namespace jit { From d21d78f8d444c374b1eaae8f72094fd465220a53 Mon Sep 17 00:00:00 2001 From: Naoya Maruyama Date: Wed, 9 Sep 2020 05:41:22 -0700 Subject: [PATCH 037/167] Remove a false-positive assertion. 
(#372) Fixes #364 --- torch/csrc/jit/codegen/cuda/lower_loops.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/lower_loops.cpp b/torch/csrc/jit/codegen/cuda/lower_loops.cpp index 761c51d95b39e..f240a20150644 100644 --- a/torch/csrc/jit/codegen/cuda/lower_loops.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_loops.cpp @@ -409,9 +409,9 @@ void findTargetTensor(Expr* expr, TensorView*& target, unsigned& score) { auto axis = out_tv->getRelativeComputeAtAxis(); target = out_tv->getComputeAtView(); while (target->hasComputeAt()) { - if (target->getThisComputeAtAxis() < axis) + if (target->getThisComputeAtAxis() < axis) { break; - TORCH_INTERNAL_ASSERT(target->getThisComputeAtAxis() == axis); + } axis = target->getComputeAtRelPos(axis); target = target->getComputeAtView(); } From 5a08221b4f18377794ba321bd513d8d822e4f868 Mon Sep 17 00:00:00 2001 From: Leonard Mosescu Date: Wed, 9 Sep 2020 09:54:55 -0700 Subject: [PATCH 038/167] Kernel IR: part 7 (#371) This iteration accomplishes two main things: it uses the new Kernel class to track the lowered expressions (the IR nodes are still owned by the Fusion class, the goal being to switch completely to Kernel ownership soon), and it starts to factor out the actual CUDA kernel code generation (codegen.h/.cpp). --- caffe2/CMakeLists.txt | 1 + test/cpp/jit/test_gpu.cpp | 52 +++---- tools/build_variables.bzl | 1 + torch/csrc/jit/codegen/cuda/codegen.cpp | 39 ++++++ torch/csrc/jit/codegen/cuda/codegen.h | 22 +++ torch/csrc/jit/codegen/cuda/executor.cpp | 22 +-- torch/csrc/jit/codegen/cuda/executor.h | 1 + torch/csrc/jit/codegen/cuda/fusion.cpp | 8 +- torch/csrc/jit/codegen/cuda/fusion.h | 4 +- torch/csrc/jit/codegen/cuda/ir_iostream.cpp | 2 + torch/csrc/jit/codegen/cuda/kernel.cpp | 75 +++++++++- torch/csrc/jit/codegen/cuda/kernel.h | 43 +++++- torch/csrc/jit/codegen/cuda/lower2device.cpp | 137 ++----------------- torch/csrc/jit/codegen/cuda/lower2device.h | 37 +---- torch/csrc/jit/codegen/cuda/lower_index.h | 1 + torch/csrc/jit/codegen/cuda/utils.h | 10 ++ 16 files changed, 242 insertions(+), 213 deletions(-) create mode 100644 torch/csrc/jit/codegen/cuda/codegen.cpp create mode 100644 torch/csrc/jit/codegen/cuda/codegen.h diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 2f189614b2ea3..9a39a85ccf596 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -478,6 +478,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/arith.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/compute_at.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/codegen.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/dispatch.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/expr_evaluator.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/executor.cpp diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index 00296512af076..1139524aabdd1 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -31,7 +32,6 @@ namespace torch { namespace jit { -using namespace torch::jit::fuser; using namespace torch::jit::fuser; namespace { @@ -362,8 +362,6 @@ void testGPU_FusionExprEvalPostLower() { // Lower GpuLower gpulw(&fusion); - std::stringstream kernel; - gpulw.printKernel(kernel); // 1.
Create an evaluation context StatefulExpressionEvaluator evaluator(&fusion); @@ -507,10 +505,12 @@ void testGPU_FusionCopy() { ASSERT_EQ(original_ir.str(), clone_ir.str()); // Lower original fusion - std::stringstream original_kernel; + std::string original_kernel; { - GpuLower lower(&original_fusion); - lower.printKernel(original_kernel); + // TODO(kir): remove this guard once we implement the cuda codegen visitor + FusionGuard fg(&original_fusion); + original_kernel = + codegen::generateCudaKernel(GpuLower(&original_fusion).kernel()); } // Make sure the "before lowering" clone was not mutated @@ -531,12 +531,14 @@ void testGPU_FusionCopy() { ASSERT_EQ(original_lowered_ir.str(), clone_lowered_ir.str()); // Lower the "before lowering" and compare kernels - std::stringstream clone_kernel; + std::string clone_kernel; { - GpuLower lower(&before_lowering); - lower.printKernel(clone_kernel); + // TODO(kir): remove this guard once we implement the cuda codegen visitor + FusionGuard fg(&before_lowering); + clone_kernel = + codegen::generateCudaKernel(GpuLower(&before_lowering).kernel()); } - ASSERT_EQ(original_kernel.str(), clone_kernel.str()); + ASSERT_EQ(original_kernel, clone_kernel); } void testGPU_FusionMove() { @@ -595,9 +597,7 @@ void testGPU_FusionMove() { ASSERT_EQ(original_ir.str(), another_ir.str()); // Lower the fusion IR - std::stringstream kernel; GpuLower lower(&another_fusion); - lower.printKernel(kernel); std::stringstream lowered_ir; lowered_ir << another_fusion; @@ -1144,8 +1144,8 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Te } )"; - std::string actual_kernel = GpuLower(fusion.get()).getKernel(); - actual_kernel = "\n" + actual_kernel; + const std::string actual_kernel = + "\n" + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel()); if (expected_kernel.size() != actual_kernel.size() || expected_kernel.compare(actual_kernel) != 0) { std::cerr @@ -1529,11 +1529,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); auto outputs = fe.runFusion({t0}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(outputs[0], t5), actual_kernel.str()); + TORCH_CHECK(at::allclose(outputs[0], t5)); TORCH_CHECK(at::allclose(outputs[1], t6)); } @@ -1589,11 +1585,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); fe.runFusion({t0, t1}, {kernel_tv3}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(kernel_tv3, t3), actual_kernel.str()); + TORCH_CHECK(at::allclose(kernel_tv3, t3)); } // Case 4 @@ -1659,11 +1651,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); auto outputs = fe.runFusion({t0, t1, t2, t3}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(outputs[0], t6), actual_kernel.str()); + TORCH_CHECK(at::allclose(outputs[0], t6)); } // Case 5 @@ -2179,11 +2167,7 @@ void testGPU_FusionScalarInputs() { at::Scalar(fl3)}, {kernel_tv4}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(kernel_tv4, t4), actual_kernel.str()); + TORCH_CHECK(at::allclose(kernel_tv4, t4)); } void testGPU_FusionLoopUnroll() { diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 8b6c6fdeb26ac..7649fe93bf325 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -337,6 +337,7 @@ 
libtorch_cuda_sources = [ "torch/csrc/autograd/functions/comm.cpp", "torch/csrc/jit/codegen/cuda/arith.cpp", "torch/csrc/jit/codegen/cuda/compute_at.cpp", + "torch/csrc/jit/codegen/cuda/codegen.cpp", "torch/csrc/jit/codegen/cuda/dispatch.cpp", "torch/csrc/jit/codegen/cuda/expr_evaluator.cpp", "torch/csrc/jit/codegen/cuda/executor.cpp", diff --git a/torch/csrc/jit/codegen/cuda/codegen.cpp b/torch/csrc/jit/codegen/cuda/codegen.cpp new file mode 100644 index 0000000000000..db15f42f22e31 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/codegen.cpp @@ -0,0 +1,39 @@ + +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace codegen { + +std::string generateCudaKernel( + const Kernel* kernel, + const std::string& kernel_name) { + const auto& allocations = kernel->globalAllocations(); + std::vector global_tensors(allocations.size()); + std::transform( + allocations.begin(), + allocations.end(), + global_tensors.begin(), + [](kir::Allocate* alloc) { return alloc->buffer(); }); + + std::stringstream ss; + + IRPrinter ir_printer(ss); + ir_printer.printKernel( + kernel->exprs(), + kernel_name, + global_tensors, + !kernel->dynamicAllocations().empty()); + + return ss.str(); +} + +} // namespace codegen +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/codegen.h b/torch/csrc/jit/codegen/cuda/codegen.h new file mode 100644 index 0000000000000..0e5f2cc2ebf56 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/codegen.h @@ -0,0 +1,22 @@ + +#pragma once + +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace codegen { + +TORCH_CUDA_API std::string generateCudaKernel( + const Kernel* kernel, + const std::string& kernel_name = "CUDAGeneratedKernel"); + +} // namespace codegen +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp index 7c713b3640d25..42fa6373749ba 100644 --- a/torch/csrc/jit/codegen/cuda/executor.cpp +++ b/torch/csrc/jit/codegen/cuda/executor.cpp @@ -1,4 +1,5 @@ +#include #include #include #include @@ -91,13 +92,14 @@ void FusionExecutor::compileFusion(Fusion* fusion, CompileOptions options) { has_grid_reductions = fusion_.hasGridReduction(); has_block_broadcasts = fusion_.hasBlockBroadcast(); lowered_ = GpuLower(&fusion_); - const auto kernel = lowered_.getKernel(kernelName()); - const auto structured_code = getStructuredCode(kernel); + const auto kernel = lowered_.kernel(); + const auto kernel_code = codegen::generateCudaKernel(kernel, kernelName()); + const auto structured_code = getStructuredCode(kernel_code); - if (lowered_.static_allocations().size() > 0) { + if (kernel->staticAllocations().size() > 0) { StatefulExpressionEvaluator static_evaluator(&fusion_); unsigned static_smem_size = - computeSharedMemory(static_evaluator, lowered_.static_allocations()); + computeSharedMemory(static_evaluator, kernel->staticAllocations()); TORCH_INTERNAL_ASSERT( static_smem_size < max_device_smem, "The static shared memory allocation is larger than available memory."); @@ -254,11 +256,13 @@ LaunchParams FusionExecutor::computeLaunchParams( launch_params.bdimy() * launch_params.bdimz(); } - uint64_t dynamic_smem_size = computeSharedMemory( - see, lowered_.dynamic_allocations(), true, reduction_broadcast_workspace); + const auto kernel = lowered_.kernel(); - uint64_t static_smem_size = - computeSharedMemory(see, 
lowered_.static_allocations()); + const uint64_t dynamic_smem_size = computeSharedMemory( + see, kernel->dynamicAllocations(), true, reduction_broadcast_workspace); + + const uint64_t static_smem_size = + computeSharedMemory(see, kernel->staticAllocations()); TORCH_INTERNAL_ASSERT( (dynamic_smem_size + static_smem_size) < max_device_smem, @@ -271,7 +275,7 @@ LaunchParams FusionExecutor::computeLaunchParams( FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals( StatefulExpressionEvaluator& see) { GlobalBuffers global_buffers; - for (auto alloc : lowered_.global_allocations()) { + for (auto alloc : lowered_.kernel()->globalAllocations()) { TORCH_INTERNAL_ASSERT( alloc->buffer()->getValType() == ValType::KirTensorView, "Cannot allocate global buffers that are not tensors."); diff --git a/torch/csrc/jit/codegen/cuda/executor.h b/torch/csrc/jit/codegen/cuda/executor.h index dc2972457489e..0e2d88c958b47 100644 --- a/torch/csrc/jit/codegen/cuda/executor.h +++ b/torch/csrc/jit/codegen/cuda/executor.h @@ -31,6 +31,7 @@ class TORCH_CUDA_API FusionExecutor : public NonCopyable { const std::string& name, int id, CompileOptions options = CompileOptions()); + void compileFusion(Fusion* fusion, CompileOptions options = CompileOptions()); std::vector runFusion( diff --git a/torch/csrc/jit/codegen/cuda/fusion.cpp b/torch/csrc/jit/codegen/cuda/fusion.cpp index 3ac4c95584d13..4ed72d477e7e2 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.cpp +++ b/torch/csrc/jit/codegen/cuda/fusion.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -228,7 +229,7 @@ void Fusion::removeVal(Val* val) { delete val; } -void Fusion::addInput(Val* const input) { +void Fusion::addInput(Val* input) { assertInFusion(input, "Cannot register input "); if (input->getValType().value() == ValType::TensorView) { @@ -251,7 +252,7 @@ void Fusion::addInput(Val* const input) { inputs_.push_back(input); } -void Fusion::addOutput(Val* const output) { +void Fusion::addOutput(Val* output) { assertInFusion(output, "Cannot register output "); if (output->getValType().value() == ValType::TensorView) { auto tv = output->as(); @@ -345,8 +346,7 @@ void Fusion::print() { } void Fusion::printKernel() { - GpuLower lower(this); - lower.printKernel(std::cout); + std::cout << codegen::generateCudaKernel(GpuLower(this).kernel()); } void Fusion::printMath() { diff --git a/torch/csrc/jit/codegen/cuda/fusion.h b/torch/csrc/jit/codegen/cuda/fusion.h index 52c12763f0e7c..0f1dd20a9cac5 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.h +++ b/torch/csrc/jit/codegen/cuda/fusion.h @@ -90,10 +90,10 @@ class TORCH_CUDA_API Fusion final { void removeVal(Val* val); // Register input as an input of the fusion - void addInput(Val* const input); + void addInput(Val* input); // Register output as an output of the fusion - void addOutput(Val* const output); + void addOutput(Val* output); // Check if stmt is properly registered with this fusion bool inFusion(const Statement* stmt) const; diff --git a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp index d3d7f1099fd4c..d739b91c76ba1 100644 --- a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp @@ -970,6 +970,8 @@ class ReductionOps : OptOutDispatch { void IRPrinter::printReductionOps(Fusion* fusion) { FusionGuard fg(fusion); + + // TODO(kir): we shouldn't be creating new nodes during printing auto a = new NamedScalar("a", DataType::Null); auto b = new NamedScalar("b", DataType::Null); for (auto rop_pair : 
ReductionOps::get(fusion)) { diff --git a/torch/csrc/jit/codegen/cuda/kernel.cpp b/torch/csrc/jit/codegen/cuda/kernel.cpp index 284bcffda7fb6..88955086dc5b9 100644 --- a/torch/csrc/jit/codegen/cuda/kernel.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel.cpp @@ -1,11 +1,84 @@ #include +#include namespace torch { namespace jit { namespace fuser { -void Kernel::print() const {} +namespace { + +class BuffersExtractor final : OptOutDispatch { + public: + explicit BuffersExtractor(const std::vector& exprs) { + for (auto expr : exprs) { + handle(expr); + } + } + + const auto& globalAllocs() const { + return global_allocations_; + } + + const auto& dynamicAllocs() const { + return dynamic_allocations_; + } + + const auto& staticAllocs() const { + return static_allocations_; + } + + private: + void handle(Expr* expr) final { + OptOutDispatch::handle(expr); + } + + void handle(kir::ForLoop* fl) final { + for (auto expr : fl->body().exprs()) { + OptOutDispatch::handle(expr); + } + } + + void handle(kir::IfThenElse* ite) final { + for (auto expr : ite->body().exprs()) { + OptOutDispatch::handle(expr); + } + for (auto expr : ite->elseBody().exprs()) { + OptOutDispatch::handle(expr); + } + } + + void handle(kir::Allocate* a) final { + switch (a->getMemoryType()) { + case MemoryType::Global: + global_allocations_.push_back(a); + break; + case MemoryType::Shared: + if (a->size()->isConstScalar()) { + static_allocations_.push_back(a); + } else { + dynamic_allocations_.push_back(a); + } + break; + case MemoryType::Local: + break; + } + } + + private: + std::vector global_allocations_; + std::vector dynamic_allocations_; + std::vector static_allocations_; +}; + +} // namespace + +Kernel::Kernel(const std::vector& exprs) : exprs_(exprs) { + BuffersExtractor buffers_extractor(exprs); + global_allocations_ = buffers_extractor.globalAllocs(); + dynamic_smem_allocations_ = buffers_extractor.dynamicAllocs(); + static_smem_allocations_ = buffers_extractor.staticAllocs(); +} } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/kernel.h b/torch/csrc/jit/codegen/cuda/kernel.h index 73774e6f85fb8..6ce65f6138b8e 100644 --- a/torch/csrc/jit/codegen/cuda/kernel.h +++ b/torch/csrc/jit/codegen/cuda/kernel.h @@ -3,21 +3,54 @@ #include #include +#include +#include +#include #include namespace torch { namespace jit { namespace fuser { -class TORCH_CUDA_API Kernel final { +// Container for a lowered Kernel IR +// +// TODO(kir): currently, it is just pointing to nodes owned +// by a Fusion object. 
The goal is to have the Kernel object +// own the Kernel IR nodes +// +class TORCH_CUDA_API Kernel final : public NonCopyable { public: - void print() const; + explicit Kernel(const std::vector& exprs); + + const auto& globalAllocations() const { + return global_allocations_; + } + + const auto& dynamicAllocations() const { + return dynamic_smem_allocations_; + } + + const auto& staticAllocations() const { + return static_smem_allocations_; + } + + const auto& exprs() const { + return exprs_; + } private: - // Lowered IR - std::unordered_set lowered_val_set_; - std::unordered_set lowered_expr_set_; + // List of global buffers + std::vector global_allocations_; + + // List of dynamic shared memory buffers + std::vector dynamic_smem_allocations_; + + // List of static shared memory buffers + std::vector static_smem_allocations_; + + // Lowered expressions + std::vector exprs_; }; } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower2device.cpp b/torch/csrc/jit/codegen/cuda/lower2device.cpp index 99de992b31ff7..092aa9e1a18ff 100644 --- a/torch/csrc/jit/codegen/cuda/lower2device.cpp +++ b/torch/csrc/jit/codegen/cuda/lower2device.cpp @@ -1,8 +1,6 @@ #include -#include #include -#include #include #include #include @@ -14,96 +12,10 @@ namespace torch { namespace jit { namespace fuser { -namespace { - // TODO(kir): revisit this thread_local GpuLower* active_gpu_lower = nullptr; -class BuffersExtractor : OptOutDispatch { - public: - BuffersExtractor( - const std::vector& exprs, - ThreadPredicateMap& _thread_predicates) - : thread_predicates_(_thread_predicates), has_block_broadcast_(false) { - for (auto expr : exprs) { - handle(expr); - } - } - - std::vector getGlobalAllocs() { - return global_allocations_; - } - - std::vector getDynamicAllocs() { - return dynamic_allocations_; - } - - std::vector getStaticAllocs() { - return static_allocations_; - } - - bool hasBlockBroadcast() { - return has_block_broadcast_; - } - - private: - ThreadPredicateMap& thread_predicates_; - bool has_block_broadcast_; - std::vector global_allocations_; - std::vector dynamic_allocations_; - std::vector static_allocations_; - - void handle(Expr* expr) final { - OptOutDispatch::handle(expr); - } - - void handle(kir::ForLoop* fl) final { - for (auto expr : fl->body().exprs()) { - OptOutDispatch::handle(expr); - } - } - - void handle(kir::IfThenElse* ite) final { - for (auto expr : ite->body().exprs()) { - OptOutDispatch::handle(expr); - } - - for (auto expr : ite->elseBody().exprs()) { - OptOutDispatch::handle(expr); - } - } - - void handle(kir::BroadcastOp* bop) final { - const ir_utils::ParallelTypeBitmap domains = - ir_utils::getParallelBroadcastDomains(bop->out(), thread_predicates_); - const bool thread_x = domains.get(ParallelType::TIDx); - const bool thread_y = domains.get(ParallelType::TIDy); - const bool thread_z = domains.get(ParallelType::TIDz); - const bool block_broadcast_needed = thread_x || thread_y || thread_z; - has_block_broadcast_ |= block_broadcast_needed; - } - - void handle(kir::Allocate* a) final { - switch (a->getMemoryType()) { - case MemoryType::Global: - global_allocations_.push_back(a); - break; - case MemoryType::Shared: - if (a->size()->isConstScalar()) { - static_allocations_.push_back(a); - } else { - dynamic_allocations_.push_back(a); - } - break; - case MemoryType::Local: - break; - } - } -}; - -} // namespace - -void GpuLower::buildSizesMap() { +void GpuLower::replaceSymbolicSizes() { // Grab inputs and outputs // TODO: Only run through inputs for the size map, outputs don't 
actually set // any sizes of the problem. @@ -177,7 +89,7 @@ void GpuLower::lower() { // prepare for lowering validateIr(fusion_); - buildSizesMap(); + replaceSymbolicSizes(); // Compute thread predicates ThreadPredicateMap preds(fusion_); @@ -193,48 +105,19 @@ void GpuLower::lower() { const auto indexed_loops = IndexLowering::getIndexedExprs(fusion_, unrolled_loops); - // Store the final lowered IR - lowered_exprs_ = indexed_loops; - - // Get allocations - BuffersExtractor be(lowered_exprs_, preds); - global_allocations_ = be.getGlobalAllocs(); - dynamic_smem_allocations_ = be.getDynamicAllocs(); - static_smem_allocations_ = be.getStaticAllocs(); -} - -// Traverse through the fusion and print CUDA code associated with it -std::ostream& GpuLower::printKernel( - std::ostream& os, - const std::string& kernel_name) { - FusionGuard fg(fusion_); - - std::vector allocs; - allocs.insert( - allocs.end(), global_allocations_.begin(), global_allocations_.end()); - - std::vector global_tensors(allocs.size(), nullptr); - std::transform( - allocs.begin(), - allocs.end(), - global_tensors.begin(), - [](kir::Allocate* alloc) { return alloc->buffer(); }); - - bool hasDynamicSmem = dynamic_smem_allocations_.size() > 0; - - IRPrinter irp(os); - irp.printKernel(lowered_exprs_, kernel_name, global_tensors, hasDynamicSmem); - return os; + // We now have the lowered expressions, store the final lowered Kernel IR + kernel_ = std::make_unique(indexed_loops); } -std::string GpuLower::getKernel(const std::string& kernel_name) { - std::stringstream ss; - printKernel(ss, kernel_name); - return ss.str(); +Kernel* GpuLower::kernel() const { + TORCH_CHECK(kernel_); + return kernel_.get(); } // Maps Fusion IR nodes to the Kernel IR counterparts -// (this is a interim solution for easing the Kernel IR splitting) +// +// TODO(kir): this is a interim solution for easing the Kernel IR splitting +// class TORCH_CUDA_API GpuLower::KernelIrMapper : private OptInConstDispatch { public: explicit KernelIrMapper(GpuLower* gpu_lower) : gpu_lower_(gpu_lower) {} diff --git a/torch/csrc/jit/codegen/cuda/lower2device.h b/torch/csrc/jit/codegen/cuda/lower2device.h index e0908f26d74c2..f7d65c8c7ba9a 100644 --- a/torch/csrc/jit/codegen/cuda/lower2device.h +++ b/torch/csrc/jit/codegen/cuda/lower2device.h @@ -1,3 +1,4 @@ + #pragma once #include @@ -6,6 +7,7 @@ #include #include +#include #include namespace torch { @@ -22,24 +24,7 @@ class TORCH_CUDA_API GpuLower { lower(); } - // print generated code to ostream - std::ostream& printKernel( - std::ostream& _os, - const std::string& kernel_name = "CUDAGeneratedKernel"); - - std::string getKernel(const std::string& kernel_name = "CUDAGeneratedKernel"); - - std::vector global_allocations() { - return global_allocations_; - } - - std::vector dynamic_allocations() { - return dynamic_smem_allocations_; - } - - std::vector static_allocations() { - return static_smem_allocations_; - } + Kernel* kernel() const; // Converts a Fusion IR value into the Kernel IR equivalent // @@ -58,21 +43,11 @@ class TORCH_CUDA_API GpuLower { // not have this information. Since we need to have the correct information in // the kernel being fetched for shapes, we want to replace input and output // tensors to reference the runtime structure containing sizes. 
- void buildSizesMap(); + void replaceSymbolicSizes(); private: - // List of global buffers - Allocate nodes track if it needs to be initialized to 0 - std::vector global_allocations_; - - // List of dynamic shared memory buffers - std::vector dynamic_smem_allocations_; - - // List of static shared memory buffers - std::vector static_smem_allocations_; - - // Lowered IR - std::vector lowered_exprs_; + // Lowered Kernel IR + std::unique_ptr kernel_; // Fusion IR node to Kernel IR node mapping std::unordered_map kir_map_; diff --git a/torch/csrc/jit/codegen/cuda/lower_index.h b/torch/csrc/jit/codegen/cuda/lower_index.h index ea420abdf3590..dd3e5a11c2767 100644 --- a/torch/csrc/jit/codegen/cuda/lower_index.h +++ b/torch/csrc/jit/codegen/cuda/lower_index.h @@ -2,6 +2,7 @@ #include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/utils.h b/torch/csrc/jit/codegen/cuda/utils.h index e286cc09ed3ad..fdc1e7c3d2fdb 100644 --- a/torch/csrc/jit/codegen/cuda/utils.h +++ b/torch/csrc/jit/codegen/cuda/utils.h @@ -58,6 +58,16 @@ class PolymorphicBase { return downcast_ptr; } + // Check if the runtime type is T (or derived from T) + // + // NOTE: Don't use this for conditional casts. Use: + // + // if (auto t = dynamic_cast(p)) { ... } + // + // instead of: + // + // if (p->isA()) { auto t = p->as(); ... } + // template bool isA() const { return dynamic_cast(this) != nullptr; From 2f0c75122c63b3a8a9d7a9fd8544904957b3ed55 Mon Sep 17 00:00:00 2001 From: Lemo Date: Wed, 9 Sep 2020 09:56:49 -0700 Subject: [PATCH 039/167] revert .build_profile addition --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 1f4b83dd7439d..01739b3d92dd6 100644 --- a/.gitignore +++ b/.gitignore @@ -187,7 +187,6 @@ build_android build_ios /build_* .build_debug/* -.build_profile/* .build_release/* distribute/* *.testbin From 6a60779519c7b47bc76f81b37d89c4d5243103a1 Mon Sep 17 00:00:00 2001 From: Leonard Mosescu Date: Wed, 9 Sep 2020 10:09:14 -0700 Subject: [PATCH 040/167] Experimental doxygen support (#350) This is the basic Doxygen scaffolding. To build the html documentation, first install doxygen, then: cd torch/csrc/jit/codegen/cuda/docs && doxygen fuser.doxygen --- torch/csrc/jit/codegen/cuda/docs/.gitignore | 1 + .../jit/codegen/cuda/docs/documentation.h | 23 + .../csrc/jit/codegen/cuda/docs/fuser.doxygen | 2515 +++++++++++++++++ .../cuda/docs/images/ir_architecture.png | Bin 0 -> 96754 bytes torch/csrc/jit/codegen/cuda/docs/main_page.md | 8 + 5 files changed, 2547 insertions(+) create mode 100644 torch/csrc/jit/codegen/cuda/docs/.gitignore create mode 100644 torch/csrc/jit/codegen/cuda/docs/documentation.h create mode 100644 torch/csrc/jit/codegen/cuda/docs/fuser.doxygen create mode 100644 torch/csrc/jit/codegen/cuda/docs/images/ir_architecture.png create mode 100644 torch/csrc/jit/codegen/cuda/docs/main_page.md diff --git a/torch/csrc/jit/codegen/cuda/docs/.gitignore b/torch/csrc/jit/codegen/cuda/docs/.gitignore new file mode 100644 index 0000000000000..1936cc1d441e4 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/docs/.gitignore @@ -0,0 +1 @@ +html diff --git a/torch/csrc/jit/codegen/cuda/docs/documentation.h b/torch/csrc/jit/codegen/cuda/docs/documentation.h new file mode 100644 index 0000000000000..cfd4435461b97 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/docs/documentation.h @@ -0,0 +1,23 @@ + +#error This is used exclusively for generating the documentation (not a real header) + +//! \namespace torch::jit::fuser +//!
\brief Main PyTorch JIT Fuser namespace + +//! \namespace torch::jit::fuser::cuda +//! \brief CUDA specific components + +//! \namespace torch::jit::fuser::cuda::executor_utils +//! \brief Fuser executor related utilities + +//! \namespace torch::jit::fuser::kir +//! \brief Kernel IR + +//! \namespace torch::jit::fuser::ir_utils +//! \brief IR manipulation utilities + +//! \namespace torch::jit::fuser::loop_utils +//! \brief Loop utilities + +//! \namespace torch::jit::fuser::scope_utils +//! \brief Scope utilities diff --git a/torch/csrc/jit/codegen/cuda/docs/fuser.doxygen b/torch/csrc/jit/codegen/cuda/docs/fuser.doxygen new file mode 100644 index 0000000000000..b9a51b187aa5d --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/docs/fuser.doxygen @@ -0,0 +1,2515 @@ +# Doxyfile 1.8.14 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. + +PROJECT_NAME = "PyTorch JIT Fuser" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. 
+ +OUTPUT_DIRECTORY = + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. 
Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = YES + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = YES + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. 
Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. In the later case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. 
Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 0. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 0 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = YES + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. 
+ +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. + +# TODO: switch to NO once key concepts are documented +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. 
+ +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = YES + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = YES + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. 
+ +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. 
+# The default value is: YES. + +GENERATE_TODOLIST = NO + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. 
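+#
+# Illustrative example (not part of the stock template): to start from doxygen's
+# default layout and customize it, one could run
+#
+#   doxygen -l DoxygenLayout.xml
+#
+# and then point this tag at the edited file, e.g. LAYOUT_FILE = DoxygenLayout.xml.
+# DoxygenLayout.xml is just the default name mentioned above; any file name works
+# as long as this tag references it.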
+ +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = NO + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. 
Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT += .. +INPUT += documentation.h +INPUT += main_page.md + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: https://www.gnu.org/software/libiconv/) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.pyw \ + *.f90 \ + *.f95 \ + *.f03 \ + *.f08 \ + *.f \ + *.for \ + *.tcl \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE += + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. 
Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS += Ui
+EXCLUDE_SYMBOLS += internal
+EXCLUDE_SYMBOLS += __*
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH = images
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+#   <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
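+#
+# Illustrative example with hypothetical filter names (not part of the stock
+# template): run a custom filter only for Python sources shown in the source
+# browser, and disable source filtering for markdown files:
+#
+#   FILTER_SOURCE_PATTERNS = *.py=my_py_filter \
+#                            *.md=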
+ +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = main_page.md + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see https://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. 
+ +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse-libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = --std=c++1z + +# If clang assisted parsing is enabled you can provide the clang parser with the +# path to the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files +# were built. This is equivalent to specifying the "-p" option to a clang tool, +# such as clang-check. These options will then be passed to the parser. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse-libclang=ON option for CMake. +# The default value is: 0. + +CLANG_COMPILATION_DATABASE_PATH = 0 + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. 
In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via Javascript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have Javascript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. 
+# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: https://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See https://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
+ +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://doc.qt.io/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://doc.qt.io/qt-4.8/qthelpproject.html#virtual-folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://doc.qt.io/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. 
+# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 1 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# https://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want to formulas look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = NO + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/. +# This tag requires that the tag USE_MATHJAX is set to YES. 
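+#
+# Illustrative example (assuming a local MathJax copy sits next to the HTML
+# output directory, as recommended above): MATHJAX_RELPATH = ../mathjax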
+
+MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/