From bb8cef111c4828a9690ba68752dca84acd83fd08 Mon Sep 17 00:00:00 2001 From: shmsong Date: Wed, 24 Aug 2022 10:52:49 -0700 Subject: [PATCH 1/4] add lower index resolution --- torch/csrc/jit/codegen/cuda/index_compute.cpp | 33 +++++++++++ .../jit/codegen/cuda/lower_index_compute.cpp | 57 +++++++++++++++++++ .../jit/codegen/cuda/lower_index_compute.h | 12 ++++ 3 files changed, 102 insertions(+) diff --git a/torch/csrc/jit/codegen/cuda/index_compute.cpp b/torch/csrc/jit/codegen/cuda/index_compute.cpp index 895c3e4bd96e..e7feff88fb10 100644 --- a/torch/csrc/jit/codegen/cuda/index_compute.cpp +++ b/torch/csrc/jit/codegen/cuda/index_compute.cpp @@ -650,6 +650,8 @@ IndexCompute::IndexCompute( } void IndexCompute::run(const LoopIndexing& loop_indexing) { + TORCH_INTERNAL_ASSERT( + concrete_id_pass_, "concrete pass only for this option"); // Apply loop swizzles if there are any that outputs to // the loop domains. // Currently only support loop swizzles that directly output @@ -669,9 +671,40 @@ void IndexCompute::run(const LoopIndexing& loop_indexing) { } } + // Resolve the out of line expressions first: + std::unordered_map permissive_index_map; + + for (auto expr : loop_indexing.getBackwardOutOfLineExprList()) { + handle(expr); + + // Collect backward results from this expression if they are + // made available in by this expression. + auto id_inputs = ir_utils::filterByType(expr->inputs()); + for (auto id : id_inputs) { + auto idx_it = index_map_.find(ir_utils::caMapExactConcreteId(id)); + if (idx_it != index_map_.end()) { + permissive_index_map[GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::PERMISSIVE)] = idx_it->second; + } + } + } + // Run through the loop indexing expressions and generate // the indexing integer math for the concrete ids. 
for (auto expr : loop_indexing.getBackwardExprList()) { + auto id_outputs = ir_utils::filterByType(expr->outputs()); + + for (auto id : id_outputs) { + auto concrete_id = ir_utils::caMapExactConcreteId(id); + if (!index_map_.count(concrete_id)) { + auto permissive_id = GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::PERMISSIVE); + auto permissive_it = permissive_index_map.find(permissive_id); + if (permissive_it != permissive_index_map.end()) { + index_map_[concrete_id] = permissive_it->second; + } + } + } handle(expr); } } diff --git a/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp b/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp index 70b019a4cc48..2d4444d34090 100644 --- a/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp @@ -438,6 +438,7 @@ class LoopIndexingAnalysis { indexing.loop_root_ = loop_root_domains_; indexing.loop_domains_ = loop_domains_.vector(); indexing.index_exprs_ = replayed_exprs_; + indexing.out_of_line_exprs_ = out_of_line_exprs_; return indexing; } @@ -481,6 +482,12 @@ class LoopIndexingAnalysis { //! loop_domains_ with all of these iter domains. void constructLoopDomains(); + //! Fills out_of_line_exprs_ by traversing the selected list of + //! expressions in reverse topological order and collect iterdomains + //! on the indexing paths that only involves leaf id's on the right + //! of consumer's ca axis. + void collectOutOfLineExprs(); + private: //! Original loop nest input to derive info from. const std::vector& loops_; @@ -521,6 +528,10 @@ class LoopIndexingAnalysis { //! Selected list of exprs that will produce and consume each //! of the exact concrete ids from the loop nest exactly once. std::vector replayed_exprs_; + + //! Set of expressions from the selected list that can be + //! resolved from axes on the right of ca axes. 
+ std::vector out_of_line_exprs_; }; LoopIndexingAnalysis::LoopIndexingAnalysis( @@ -559,6 +570,10 @@ LoopIndexingAnalysis::LoopIndexingAnalysis( // Reconstruct the iterdomain view of the original loopnest after resolving // the exact definition of each index. constructLoopDomains(); + + //! Collect the set of indexing expressions that can be + //! resolved out of line. + collectOutOfLineExprs(); } void LoopIndexingAnalysis::validateLoopStructure( @@ -1088,6 +1103,48 @@ std::vector LoopIndexingTraversal::getExprList() { } // namespace +void LoopIndexingAnalysis::collectOutOfLineExprs() { + // Keep track of all the id's that can be resolved without + // iterdomains on the left of ca axes. + std::unordered_set out_of_line_ids; + + // Start the set with all the leaf ids. + std::transform( + consumer_tv_->domain()->domain().begin() + + consumer_tv_->getComputeAtPosition(), + consumer_tv_->domain()->domain().end(), + std::inserter(out_of_line_ids, out_of_line_ids.end()), + ir_utils::caMapExactConcreteId); + + // Get the original selected list of index expressions + // in reverse topological order. + auto backward_expr_list = + LoopIndexingTraversal::backwardTopologicalOrder(replayed_exprs_); + + for (auto expr : backward_expr_list) { + auto id_outputs = ir_utils::filterByType(expr->outputs()); + if ( + // Check that all of the outputs are out of line + std::all_of( + id_outputs.begin(), + id_outputs.end(), + [&out_of_line_ids](IterDomain* id) { + return out_of_line_ids.count(ir_utils::caMapExactConcreteId(id)); + })) { + // Record out of line expression + out_of_line_exprs_.push_back(expr); + + // Add all of the expression inputs as out of line id's. 
+ auto id_inputs = ir_utils::filterByType(expr->inputs()); + std::transform( + id_inputs.begin(), + id_inputs.end(), + std::inserter(out_of_line_ids, out_of_line_ids.end()), + ir_utils::caMapExactConcreteId); + } + } +} + std::vector LoopIndexing::getForwardExprList() const { return LoopIndexingTraversal::forwardTopologicalOrder(index_exprs_); } diff --git a/torch/csrc/jit/codegen/cuda/lower_index_compute.h b/torch/csrc/jit/codegen/cuda/lower_index_compute.h index d8d4dd7103b3..4b81fd0dec0c 100644 --- a/torch/csrc/jit/codegen/cuda/lower_index_compute.h +++ b/torch/csrc/jit/codegen/cuda/lower_index_compute.h @@ -127,6 +127,12 @@ class LoopIndexing { //! topological order. std::vector getBackwardExprList() const; + //! Returns the set of out of line expressions in + //! reverse topological order. + const std::vector& getBackwardOutOfLineExprList() const { + return out_of_line_exprs_; + } + //! Returns all exact concrete id's that were produced //! or consumed in the selected indexing expressions std::unordered_set getAllExactConcreteIdSet() const; @@ -152,6 +158,12 @@ class LoopIndexing { //! The selected sequence of expressions that should represent //! the correct indexing math from the given loop nest. std::vector index_exprs_; + + //! The subset of sequence of expressions that can be resolved + //! with only the iterdomains on the right of consumer tv's ca + //! axis. + //! Expressions are ordered in reverse topological order. 
+ std::vector out_of_line_exprs_; }; // When indexing there are sometimes an option to propagate an index down From 799230907888fa218b300c9156979f38a54d4de1 Mon Sep 17 00:00:00 2001 From: shmsong Date: Wed, 24 Aug 2022 13:52:24 -0700 Subject: [PATCH 2/4] add repro --- .../codegen/cuda/test/test_gpu_tensorcore.cpp | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp index c00d02c8a40d..fa89f9e91a79 100644 --- a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp +++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp @@ -2857,6 +2857,67 @@ TEST_F(NVFuserTest, FusionTuringMatmulLargeLoad_CUDA) { } } +TEST_F(NVFuserTest, FusionScheduleTransposeRepro1_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(4); + auto tv1 = makeConcreteTensor({-1, -1, -1, 1}); + fusion.addInput(tv0); + fusion.addInput(tv1); + auto tv2 = add(tv0, tv1); + fusion.addOutput(tv2); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input0 = at::randn({1, 1, 333, 1}, options); + at::Tensor input1 = at::randn({1, 1, 333, 1}, options); + + auto lparams = scheduleTranspose(&fusion, {input0, input1}); + + FusionExecutor fe; + fe.compileFusion(&fusion, {input0, input1}, lparams); + auto outputs = fe.runFusion({input0, input1}, lparams); + + auto tv_ref = input0 + input1; + + testValidate( + &fusion, outputs, {input0, input1}, {tv_ref}, __LINE__, __FILE__); +} + +TEST_F(NVFuserTest, FusionInlineBroadcastIndexing0_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + auto tv1 = makeContigTensor(2); + fusion.addInput(tv0); + fusion.addInput(tv1); + auto tv2 = set(tv0); + auto tv3 = broadcast(tv2, {true, false}); + auto tv4 = add(tv3, tv1); + fusion.addOutput(tv4); + + tv4->merge(0); + tv4->split(0, 32); + + tv0->computeAt(tv4, 1); + + tv2->split(-1, 8); 
+ + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({123}, options); + at::Tensor t1 = at::randn({3, 123}, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0, t1}); + + auto outputs = fe.runFusion({t0, t1}); + + auto tv_ref = t0 + t1; + + testValidate(&fusion, outputs, {t0, t1}, {tv_ref}, __LINE__, __FILE__); +} + #undef NVFUSER_TEST_CUDA_ARCH_GUARD } // namespace jit From e9d09fe57264443cbe0ca22fd1c0b90d2f50175e Mon Sep 17 00:00:00 2001 From: shmsong Date: Wed, 24 Aug 2022 15:39:31 -0700 Subject: [PATCH 3/4] clear GPU memory after test --- torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h b/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h index 0247c33c8a72..05a4fd600b65 100644 --- a/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h +++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h @@ -6,6 +6,7 @@ #include #include +#include #include @@ -36,6 +37,10 @@ class NVFuserTest : public ::testing::Test { GTEST_SKIP() << "skipping tests on pre-PASCAL GPUs"; } } + + void TearDown() override{ + c10::cuda::CUDACachingAllocator::emptyCache(); + } }; struct ValidationConstants { From f406e23bb49c755a2d97c04f839d5dc8bbc0384b Mon Sep 17 00:00:00 2001 From: shmsong Date: Wed, 24 Aug 2022 19:33:35 -0700 Subject: [PATCH 4/4] cleanup and comment --- torch/csrc/jit/codegen/cuda/index_compute.cpp | 88 +++++++++++++------ torch/csrc/jit/codegen/cuda/index_compute.h | 22 +++++ torch/csrc/jit/codegen/cuda/test/test_gpu.cpp | 63 +++++++++++++ .../codegen/cuda/test/test_gpu_tensorcore.cpp | 61 ------------- .../codegen/cuda/test/test_gpu_validator.h | 4 +- 5 files changed, 149 insertions(+), 89 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/index_compute.cpp b/torch/csrc/jit/codegen/cuda/index_compute.cpp index e7feff88fb10..edd4fcf2c1ac 100644 --- 
a/torch/csrc/jit/codegen/cuda/index_compute.cpp +++ b/torch/csrc/jit/codegen/cuda/index_compute.cpp @@ -671,41 +671,77 @@ void IndexCompute::run(const LoopIndexing& loop_indexing) { } } - // Resolve the out of line expressions first: - std::unordered_map permissive_index_map; + // Resolve the index vals that could be resolved with only + // the loops that consumer_tv doesn't share with any of its + // consumers, i.e. the not-inlined loops that define consumer_tv + // values. + collectIndexIntoPermissiveMap(loop_indexing); + + // Run through the loop indexing expressions and generate + // the indexing integer math for the concrete ids. + for (auto expr : loop_indexing.getBackwardExprList()) { + // Resolve missing values from permissive map. + updateIndexMapFromPermissiveMap(expr); - for (auto expr : loop_indexing.getBackwardOutOfLineExprList()) { handle(expr); + } +} - // Collect backward results from this expression if they are - // made available in by this expression. - auto id_inputs = ir_utils::filterByType(expr->inputs()); - for (auto id : id_inputs) { - auto idx_it = index_map_.find(ir_utils::caMapExactConcreteId(id)); - if (idx_it != index_map_.end()) { - permissive_index_map[GpuLower::current()->caMap()->getConcreteMappedID( - id, IdMappingMode::PERMISSIVE)] = idx_it->second; +void IndexCompute::collectIndexIntoPermissiveMap( + const LoopIndexing& loop_indexing) { + // Visit the expressions that only produces un-inlined iterdomains, + // in reverse topological order. + for (auto expr : loop_indexing.getBackwardOutOfLineExprList()) { + // Compute indexing vals for the expression inputs. + // + // This stage should run before any indexing computation so it could be + // made sure that all index values computed at this stage are + // the ones that can be resolved only with the not-inlined + // iterdomains. 
+ // + auto id_outputs = ir_utils::filterByType(expr->outputs()); + if (std::all_of( + id_outputs.begin(), id_outputs.end(), [this](IterDomain* id) { + return index_map_.count(ir_utils::caMapExactConcreteId(id)); + })) { + // Visit this expression: + // LoopIndexingAnalysis::traverseFromDomainVals made sure that each + // concrete index is bound exactly once so computing these expressions + // early should still be consistent. + handle(expr); + + auto id_inputs = ir_utils::filterByType(expr->inputs()); + for (auto id : id_inputs) { + // Collect backward pass results from this expression if they are + // made available by this expression. + auto idx_it = index_map_.find(ir_utils::caMapExactConcreteId(id)); + + if (idx_it != index_map_.end()) { + permissive_index_map_ + [GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::PERMISSIVE)] = idx_it->second; + } } } } +} - // Run through the loop indexing expressions and generate - // the indexing integer math for the concrete ids. - for (auto expr : loop_indexing.getBackwardExprList()) { - auto id_outputs = ir_utils::filterByType(expr->outputs()); - - for (auto id : id_outputs) { - auto concrete_id = ir_utils::caMapExactConcreteId(id); - if (!index_map_.count(concrete_id)) { - auto permissive_id = GpuLower::current()->caMap()->getConcreteMappedID( - id, IdMappingMode::PERMISSIVE); - auto permissive_it = permissive_index_map.find(permissive_id); - if (permissive_it != permissive_index_map.end()) { - index_map_[concrete_id] = permissive_it->second; - } +void IndexCompute::updateIndexMapFromPermissiveMap(const Expr* id_expr) { + auto id_outputs = ir_utils::filterByType(id_expr->outputs()); + for (auto id : id_outputs) { + auto concrete_id = ir_utils::caMapExactConcreteId(id); + // Only try to copy index val from permissive map when + // the index is missing. 
+ if (!index_map_.count(concrete_id)) { + auto permissive_id = GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::PERMISSIVE); + // Write the permissive index val into index_map_ if the + // missing value is found here. + auto permissive_it = permissive_index_map_.find(permissive_id); + if (permissive_it != permissive_index_map_.end()) { + index_map_[concrete_id] = permissive_it->second; } } - handle(expr); } } diff --git a/torch/csrc/jit/codegen/cuda/index_compute.h b/torch/csrc/jit/codegen/cuda/index_compute.h index f064ebba293c..3d865b4a8ceb 100644 --- a/torch/csrc/jit/codegen/cuda/index_compute.h +++ b/torch/csrc/jit/codegen/cuda/index_compute.h @@ -86,6 +86,18 @@ class IndexCompute : public BackwardVisitor { //! based traversal. IterDomain* maybeGetExactMapConcreteID(IterDomain* id); + //! (Concrete indexing pass only) + //! Collect permissive index bindings from the out-of-line expressions of the given loop indexing. + //! See also permissive_index_map_ and LoopIndexing::getBackwardOutOfLineExprList. + void collectIndexIntoPermissiveMap(const LoopIndexing& loop_indexing); + + //! (Concrete indexing pass only) + //! Iterate through id_expr's outputs and pull index vals from permissive + //! map, when both of the following are true: + //! 1. the output id is missing in index_map_. + //! 2. the output id is found in permissive map. + void updateIndexMapFromPermissiveMap(const Expr* id_expr); + // Tensor domain we're mapping back to root const TensorDomain* td_; // NOLINT @@ -137,6 +149,16 @@ class IndexCompute : public BackwardVisitor { // pass. See also [Note on swizzle mode] SwizzleMode swizzle_mode_ = SwizzleMode::NoSwizzle; + // (Concrete id pass only) + // Contains the indexing math that could be resolved with only the + // iterdomains on the right of the consumer_tv's ca axis, i.e. the + // ones corresponding to the loops that consumer_tv would not + // share with any of its consumers. 
+ // These indexing vals should be kept separate from index_map_ and + // should only be used when the indexing traversal follows the + // order defined in LoopIndexingAnalysis::traverseFromDomainVals. + std::unordered_map permissive_index_map_; + public: const std::unordered_map& indexMap() const { return index_map_; diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp index 4f72bf93ba36..8f2d3927eb1c 100644 --- a/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp +++ b/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp @@ -25512,6 +25512,69 @@ TEST_F(NVFuserTest, FusionSizeDependentData_CUDA) { executor_cache.fusion(), cg_outputs, {a}, {a + 123}, __LINE__, __FILE__); } +// Repro for issue #1925 +TEST_F(NVFuserTest, FusionScheduleTransposeRepro1_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(4); + auto tv1 = makeConcreteTensor({-1, -1, -1, 1}); + fusion.addInput(tv0); + fusion.addInput(tv1); + auto tv2 = add(tv0, tv1); + fusion.addOutput(tv2); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input0 = at::randn({1, 1, 333, 1}, options); + at::Tensor input1 = at::randn({1, 1, 333, 1}, options); + + auto lparams = scheduleTranspose(&fusion, {input0, input1}); + + FusionExecutor fe; + fe.compileFusion(&fusion, {input0, input1}, lparams); + auto outputs = fe.runFusion({input0, input1}, lparams); + + auto tv_ref = input0 + input1; + + testValidate( + &fusion, outputs, {input0, input1}, {tv_ref}, __LINE__, __FILE__); +} + +// Repro for issue #1873 +TEST_F(NVFuserTest, FusionInlineBroadcastIndexing0_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + auto tv1 = makeContigTensor(2); + fusion.addInput(tv0); + fusion.addInput(tv1); + auto tv2 = set(tv0); + auto tv3 = broadcast(tv2, {true, false}); + auto tv4 = add(tv3, tv1); + fusion.addOutput(tv4); + + tv4->merge(0); + tv4->split(0, 32); + + tv0->computeAt(tv4, 
1); + + tv2->split(-1, 8); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({123}, options); + at::Tensor t1 = at::randn({3, 123}, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0, t1}); + + auto outputs = fe.runFusion({t0, t1}); + + auto tv_ref = t0 + t1; + + testValidate(&fusion, outputs, {t0, t1}, {tv_ref}, __LINE__, __FILE__); +} + TEST_F(NVFuserTest, FusionPredicateUnshare_CUDA) { // https://github.com/csarofeen/pytorch/issues/1926 std::unique_ptr fusion_ptr = std::make_unique(); diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp index fa89f9e91a79..c00d02c8a40d 100644 --- a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp +++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp @@ -2857,67 +2857,6 @@ TEST_F(NVFuserTest, FusionTuringMatmulLargeLoad_CUDA) { } } -TEST_F(NVFuserTest, FusionScheduleTransposeRepro1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(4); - auto tv1 = makeConcreteTensor({-1, -1, -1, 1}); - fusion.addInput(tv0); - fusion.addInput(tv1); - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input0 = at::randn({1, 1, 333, 1}, options); - at::Tensor input1 = at::randn({1, 1, 333, 1}, options); - - auto lparams = scheduleTranspose(&fusion, {input0, input1}); - - FusionExecutor fe; - fe.compileFusion(&fusion, {input0, input1}, lparams); - auto outputs = fe.runFusion({input0, input1}, lparams); - - auto tv_ref = input0 + input1; - - testValidate( - &fusion, outputs, {input0, input1}, {tv_ref}, __LINE__, __FILE__); -} - -TEST_F(NVFuserTest, FusionInlineBroadcastIndexing0_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(1); - auto tv1 = makeContigTensor(2); - fusion.addInput(tv0); - fusion.addInput(tv1); - auto tv2 = 
set(tv0); - auto tv3 = broadcast(tv2, {true, false}); - auto tv4 = add(tv3, tv1); - fusion.addOutput(tv4); - - tv4->merge(0); - tv4->split(0, 32); - - tv0->computeAt(tv4, 1); - - tv2->split(-1, 8); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({123}, options); - at::Tensor t1 = at::randn({3, 123}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - - auto outputs = fe.runFusion({t0, t1}); - - auto tv_ref = t0 + t1; - - testValidate(&fusion, outputs, {t0, t1}, {tv_ref}, __LINE__, __FILE__); -} - #undef NVFUSER_TEST_CUDA_ARCH_GUARD } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h b/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h index 05a4fd600b65..2d0bada1c091 100644 --- a/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h +++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h @@ -5,8 +5,8 @@ #include #include -#include #include +#include #include @@ -38,7 +38,7 @@ class NVFuserTest : public ::testing::Test { } } - void TearDown() override{ + void TearDown() override { c10::cuda::CUDACachingAllocator::emptyCache(); } };