6 changes: 3 additions & 3 deletions third_party/nvfuser/benchmark/bert.cpp
@@ -349,13 +349,13 @@ static void setupBiasDropoutAddLayernormBwd1(Fusion* fusion, DataType dtype) {
TensorView* tv3 = TensorViewBuilder()
.ndims(3)
.dtype(dtype)
- .contiguity({true, true})
+ .contiguity({true, true, c10::nullopt})
.shape({-1, -1, 1})
.build();
TensorView* tv4 = TensorViewBuilder()
.ndims(3)
.dtype(dtype)
- .contiguity({true, true})
+ .contiguity({true, true, c10::nullopt})
.shape({-1, -1, 1})
.build();

@@ -457,7 +457,7 @@ static void setupBiasDropoutAddLayernormBwd2(Fusion* fusion, DataType dtype) {
TensorView* tv4 = TensorViewBuilder()
.ndims(3)
.dtype(dtype)
- .contiguity({true, true})
+ .contiguity({true, true, c10::nullopt})
.shape({-1, -1, 1})
.build();
TensorView* tv5 = makeContigTensor(1, dtype);
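
Note: the pattern repeated across these benchmark hunks is the new contiguity convention: the vector passed to TensorViewBuilder::contiguity() now carries one entry per dimension, and the entry for a broadcast (size-1) dimension must be c10::nullopt rather than a bool. Below is a minimal standalone sketch of the fill rule, using std::optional in place of c10::optional; the helper name fillContiguity is illustrative only (the real helper appears later in this diff as TensorDomain::getContiguityFilledWith).

#include <cstdint>
#include <optional>
#include <vector>

// One contiguity entry per dimension; broadcast (size-1) dimensions carry no
// flag at all, so they are represented as nullopt.
std::vector<std::optional<bool>> fillContiguity(
    const std::vector<int64_t>& shape,
    bool contig) {
  std::vector<std::optional<bool>> result;
  result.reserve(shape.size());
  for (int64_t extent : shape) {
    result.push_back(extent == 1 ? std::nullopt : std::optional<bool>(contig));
  }
  return result;
}

// fillContiguity({-1, -1, 1}, true) yields {true, true, nullopt}, matching the
// .contiguity({true, true, c10::nullopt}) calls in the hunks above.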
4 changes: 2 additions & 2 deletions third_party/nvfuser/benchmark/layer_norm_backward.cpp
@@ -27,12 +27,12 @@ static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) {
auto bias = makeContigTensor(1, dtype);

auto mean = TensorViewBuilder()
- .contiguity({false})
+ .contiguity({false, c10::nullopt})
.shape({-1, 1})
.dtype(DataType::Float)
.build();
auto rstd = TensorViewBuilder()
- .contiguity({false})
+ .contiguity({false, c10::nullopt})
.shape({-1, 1})
.dtype(DataType::Float)
.build();
2 changes: 1 addition & 1 deletion third_party/nvfuser/benchmark/rms_norm_backward.cpp
@@ -27,7 +27,7 @@ static void setupRMSNorm_BWD(Fusion* fusion, DataType dtype) {
auto input = makeContigTensor(3, dtype);
auto weight = makeContigTensor(1, dtype);
auto rstd = TensorViewBuilder()
- .contiguity({false, false})
+ .contiguity({false, false, c10::nullopt})
.shape({-1, -1, 1})
.dtype(dtype)
.build();
6 changes: 2 additions & 4 deletions third_party/nvfuser/benchmark/scale_bias_relu.cpp
@@ -20,19 +20,17 @@ static void setupSBR(Fusion* fusion, DataType dtype) {
std::vector<int64_t> bcast_shape(kNumberOfDims, 1);
bcast_shape[bcast_shape.size() - 1] = -1;

- std::vector<bool> bcast_contig(1, true);
-
auto x = makeContigTensor(kNumberOfDims, dtype);

auto scale = TensorViewBuilder()
- .contiguity(bcast_contig)
.shape(bcast_shape)
+ .contiguity(true)
.dtype(dtype)
.build();

auto bias = TensorViewBuilder()
- .contiguity(bcast_contig)
.shape(bcast_shape)
+ .contiguity(true)
.dtype(dtype)
.build();

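
Note: unlike the other benchmark files, this one switches to the new bool overload of contiguity() (declared later in this diff, in ir_interface_nodes.h) instead of spelling out the per-dimension vector. The old one-element bcast_contig vector reflected the old convention of one flag per non-broadcast dimension; since every dimension of bcast_shape except the innermost is a broadcast, passing true presumably expands to {nullopt, ..., nullopt, true} at build time, so the hand-built vector is no longer needed. Moving .shape() ahead of .contiguity() reads as stylistic; per the builder comment later in the diff, the uniform flag is only expanded where the shape does not force nullopt.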
8 changes: 4 additions & 4 deletions third_party/nvfuser/benchmark/timm.cpp
@@ -16,12 +16,12 @@ static void setup_vit_base_patch16_224_bcast7(Fusion* fusion, void* null) {
auto t3 = TensorViewBuilder()
.shape({-1, -1, 1})
.dtype(DataType::Float)
- .contiguity({true, true})
+ .contiguity({true, true, c10::nullopt})
.build();
auto t4 = TensorViewBuilder()
.shape({-1, -1, 1})
.dtype(DataType::Float)
- .contiguity({true, true})
+ .contiguity({true, true, c10::nullopt})
.build();
auto t7 = makeContigTensor(3, DataType::Half);

@@ -538,14 +538,14 @@ static void setup_vit_base_patch16_224_LN_BWD(Fusion* fusion, void* null) {
auto t5 = TensorViewBuilder()
.shape({-1, -1, 1})
.dtype(DataType::Float)
- .contiguity({true, true})
+ .contiguity({true, true, c10::nullopt})
.build();
fusion->addInput(t5);

auto t6 = TensorViewBuilder()
.shape({-1, -1, 1})
.dtype(DataType::Float)
- .contiguity({true, true})
+ .contiguity({true, true, c10::nullopt})
.build();
fusion->addInput(t6);

19 changes: 2 additions & 17 deletions third_party/nvfuser/benchmark/utils.cpp
@@ -145,11 +145,7 @@ TensorView* makeSymbolicTensor(size_t ndims, DataType dtype) {
}

TensorView* makeContigTensor(size_t ndims, DataType dtype) {
- return TensorViewBuilder()
- .ndims(ndims)
- .dtype(dtype)
- .contiguity(std::vector<bool>(ndims, true))
- .build();
+ return TensorViewBuilder().ndims(ndims).dtype(dtype).contiguity(true).build();
}

TensorView* makeConcreteTensor(std::vector<int64_t> shape, DataType dtype) {
@@ -159,18 +155,7 @@ TensorView* makeConcreteTensor(std::vector<int64_t> shape, DataType dtype) {
TensorView* makeContigConcreteTensor(
std::vector<int64_t> shape,
DataType dtype) {
- std::vector<bool> contiguity;
- for (auto s : shape) {
- if (s == 1) {
- continue;
- }
- contiguity.push_back(true);
- }
- return TensorViewBuilder()
- .shape(shape)
- .dtype(dtype)
- .contiguity(contiguity)
- .build();
+ return TensorViewBuilder().shape(shape).dtype(dtype).contiguity(true).build();
}

void runBenchmarkIterations(
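
Note: both helpers collapse to one-liners because the new bool overload reproduces what the removed loops did by hand. makeContigTensor used to build std::vector<bool>(ndims, true), and makeContigConcreteTensor used to skip size-1 dimensions entirely, matching the old one-flag-per-non-broadcast-dimension convention. Under the new convention those skipped slots become explicit nullopt entries; for example (hypothetical sizes), makeContigConcreteTensor({128, 1, 64}, DataType::Float) should now end up with contiguity {true, nullopt, true}.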
19 changes: 8 additions & 11 deletions third_party/nvfuser/csrc/contiguity.cpp
@@ -386,7 +386,7 @@ NonDivisibleSplitDependencies::NonDivisibleSplitDependencies(
ContigIDs::ContigIDs(
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
- const std::vector<bool>& root_contiguity,
+ const std::vector<c10::optional<bool>>& root_contiguity,
const std::unordered_set<IterDomain*>& final_ids,
const std::unordered_map<IterDomain*, Val*>& index_map,
const std::unordered_set<Split*>& divisible_splits,
@@ -419,7 +419,7 @@ ContigIDs::ContigIDs(
ContigIDs::ContigIDs(
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
- const std::vector<bool>& root_contiguity,
+ const std::vector<c10::optional<bool>>& root_contiguity,
const std::unordered_set<IterDomain*>& final_ids,
const std::unordered_map<IterDomain*, Val*>& index_map,
const std::unordered_set<Split*>& divisible_splits,
@@ -458,17 +458,16 @@ void ContigIDs::build(const std::vector<IterDomain*>& ids) {
}

TORCH_INTERNAL_ASSERT(
- TensorDomain::noBroadcasts(root_domain_).size() ==
- root_contiguity_.size(),
+ root_domain_.size() == root_contiguity_.size(),
"Arguments don't match ",
- TensorDomain::noBroadcasts(root_domain_).size(),
+ root_domain_.size(),
" != ",
root_contiguity_.size());

- int no_broadcast_i = 0;
for (const auto root_domain_i : c10::irange(root_domain_.size())) {
auto root_domain_id = root_domain_.at(root_domain_i)->as<IterDomain>();
if (root_domain_id->isBroadcast()) {
+ TORCH_INTERNAL_ASSERT(!root_contiguity_.at(root_domain_i).has_value());
continue;
}
root_to_indexed_id_[root_domain_id] = root_domain_id;
@@ -479,14 +478,13 @@ void ContigIDs::build(const std::vector<IterDomain*>& ids) {
// rfactor root domains, which should just return "zero"
// RootAxisInfo. This should be safe as no rfactor tensor should
// need halo.
- if (root_contiguity_.at(no_broadcast_i) &&
+ if (*root_contiguity_.at(root_domain_i) &&
!halo_info_->getRootAxisInfo(root_domain_id).hasHalo() &&
root_domain_id->getIterType() != IterType::GatherScatter) {
contig_ids_.emplace(root_domain_id);
is_contig_root_.at(root_domain_id) = true;
within_contig_ids_[root_domain_id] = std::unordered_set<IterDomain*>();
}
- no_broadcast_i++;
}

if (!contig_ids_.empty()) {
@@ -540,10 +538,10 @@ void ContigIDs::handle(Merge* merge) {
bool is_indexing_pass = !ignore_consistent_ordering_;

IterDomain* last_root = nullptr;
- int no_broadcast_i = 0;
for (auto root_id_i : c10::irange(root_domain_.size())) {
auto root_id = root_domain_[root_id_i];
if (root_id->isBroadcast()) {
+ TORCH_INTERNAL_ASSERT(!root_contiguity_.at(root_id_i).has_value());
continue;
}
if (root_ids.has(root_id)) {
@@ -556,14 +554,13 @@ void ContigIDs::handle(Merge* merge) {
// If we're computing predicates (ignore_consistent_ordering_==true),
// then we don't have this same constraint, we can just ignore
// contiguity of the roots all together.
- if (!root_contiguity_.at(no_broadcast_i) && is_indexing_pass) {
+ if (!*root_contiguity_.at(root_id_i) && is_indexing_pass) {
if (!root_ids.empty()) {
return;
}
}
last_root = root_id;
}
- no_broadcast_i++;
}

// If there's a non_divisible split in the history of merge->out then it can't
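
Note: the bookkeeping change in this file is that root_contiguity is now indexed directly by root-domain position instead of through a separate no_broadcast_i counter, with nullopt entries standing in for the broadcast positions the old vector simply omitted. Below is a standalone sketch of the invariant the updated loops assert, with std::optional standing in for c10::optional and plain bools standing in for IterDomain::isBroadcast().

#include <cassert>
#include <cstddef>
#include <optional>
#include <vector>

void checkRootContiguityAlignment(
    const std::vector<bool>& is_broadcast, // one entry per root-domain axis
    const std::vector<std::optional<bool>>& root_contiguity) {
  // The two vectors now have equal length; no filtered view is needed.
  assert(is_broadcast.size() == root_contiguity.size());
  for (size_t i = 0; i < is_broadcast.size(); ++i) {
    if (is_broadcast[i]) {
      // Broadcast positions must not carry a contiguity flag.
      assert(!root_contiguity[i].has_value());
    } else {
      // Non-broadcast positions always do, so dereferencing is safe.
      bool contig = *root_contiguity[i];
      (void)contig;
    }
  }
}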
6 changes: 3 additions & 3 deletions third_party/nvfuser/csrc/contiguity.h
@@ -157,7 +157,7 @@ class ContigIDs : public OptInDispatch {
ContigIDs(
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
- const std::vector<bool>& root_contiguity,
+ const std::vector<c10::optional<bool>>& root_contiguity,
const std::unordered_set<IterDomain*>& final_ids,
const std::unordered_map<IterDomain*, Val*>& index_map,
const std::unordered_set<Split*>& divisible_splits,
@@ -188,7 +188,7 @@ ContigIDs(
ContigIDs(
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
- const std::vector<bool>& root_contiguity,
+ const std::vector<c10::optional<bool>>& root_contiguity,
const std::unordered_set<IterDomain*>& final_ids,
const std::unordered_map<IterDomain*, Val*>& index_map,
const std::unordered_set<Split*>& divisible_splits,
@@ -264,7 +264,7 @@ class ContigIDs : public OptInDispatch {
//! Root domains to analyze contiguity
const std::vector<IterDomain*>& root_domain_;
//! Contiguity of root_domain_
- const std::vector<bool>& root_contiguity_;
+ const std::vector<c10::optional<bool>>& root_contiguity_;
//! Domains where indexing/predicates cannot be done with their
//! consumers domains
const std::unordered_set<IterDomain*>& final_ids_;
1 change: 1 addition & 0 deletions third_party/nvfuser/csrc/executor_kernel_arg.h
@@ -6,6 +6,7 @@
#include <torch/csrc/jit/ir/ir.h>
#include <type.h>
#include <array>
+ #include <optional>

Review comment from the PR author:
@naoyam I fixed a compilation error:

/home/gaoxiang/nvfuser7/third_party/nvfuser/csrc/executor_kernel_arg.h:282:18: error: ‘optional’ in namespace ‘std’ does not name a template type
  282 |       const std::optional<KernelIndexMode>& index_mode = std::nullopt);


namespace nvfuser {

2 changes: 1 addition & 1 deletion third_party/nvfuser/csrc/fusion_segmenter.cpp
@@ -780,7 +780,7 @@ TensorView* castIntermediateValueInCompleteFusion(
return IrBuilder::create<TensorView>(
IrBuilder::create<TensorDomain>(
new_root_domain,
- TensorDomain::getContiguousContiguity(new_root_domain)),
+ TensorDomain::getContiguityFilledWith(new_root_domain, true)),
data_type);
};

23 changes: 8 additions & 15 deletions third_party/nvfuser/csrc/index_compute.cpp
@@ -1470,11 +1470,8 @@ std::vector<Val*> Index::getGlobalProducerStridedIndices(
}
}

- auto no_broadcast_root_dom = TensorDomain::noBroadcasts(root_dom);
TORCH_INTERNAL_ASSERT(
- no_broadcast_root_dom.size() ==
- producer_tv->domain()->contiguity().size());
- auto full2nob_map = ir_utils::fullToNoBroadcastMap(root_dom);
+ root_dom.size() == producer_tv->domain()->contiguity().size());
Val* cur_contig_stride = GpuLower::current()->kernel()->oneVal();
for (const auto i : c10::irange(root_dom.size())) {
auto dim = root_dom.size() - i - 1;
@@ -1484,7 +1481,9 @@ std::vector<Val*> Index::getGlobalProducerStridedIndices(

if (root_dom[dim]->isBroadcast()) {
strides[dim] = cur_contig_stride->fusion()->zeroVal();
- } else if (producer_tv->domain()->contiguity().at(full2nob_map.at(dim))) {
+ TORCH_INTERNAL_ASSERT(
+ !producer_tv->domain()->contiguity().at(dim).has_value());
+ } else if (*producer_tv->domain()->contiguity().at(dim)) {
// If contig, used the stored stride which may be the previous
// dimensions stride * previous dimensions size
strides[dim] = cur_contig_stride;
@@ -1881,10 +1880,7 @@ std::vector<Val*> Index::getStrides(const TensorView* tv) {
}
}

- auto no_broadcast_root_dom = TensorDomain::noBroadcasts(root_dom);
- TORCH_INTERNAL_ASSERT(
- no_broadcast_root_dom.size() == tv->domain()->contiguity().size());
- auto full2nob_map = ir_utils::fullToNoBroadcastMap(root_dom);
+ TORCH_INTERNAL_ASSERT(root_dom.size() == tv->domain()->contiguity().size());
Val* cur_contig_stride = GpuLower::current()->kernel()->oneVal();
for (const auto i : c10::irange(root_dom.size())) {
auto dim = root_dom.size() - i - 1;
@@ -1894,7 +1890,8 @@ std::vector<Val*> Index::getStrides(const TensorView* tv) {

if (root_dom[dim]->isBroadcast()) {
strides[dim] = cur_contig_stride->fusion()->zeroVal();
- } else if (tv->domain()->contiguity().at(full2nob_map.at(dim))) {
+ TORCH_INTERNAL_ASSERT(!tv->domain()->contiguity().at(dim).has_value());
+ } else if (*tv->domain()->contiguity().at(dim)) {
// If contig, used the stored stride which may be the previous
// dimensions stride * previous dimensions size
strides[dim] = cur_contig_stride;
@@ -2312,12 +2309,8 @@ std::vector<PredicateDomainInfo> getPredicateContigIds(
}

std::unordered_set<IterDomain*> final_ids;
- int no_broadcast_count = 0;
for (auto root_i : c10::irange(consumer_root_domain.size())) {
auto root_id = consumer_root_domain[root_i];
- if (!root_id->isBroadcast()) {
- no_broadcast_count++;
- }
if (root_id->maybePartial()) {
final_ids.insert(root_id);
continue;
@@ -2335,7 +2328,7 @@ std::vector<PredicateDomainInfo> getPredicateContigIds(
ContigIDs contig_finder(
consumer_tv->domain()->domain(),
consumer_root_domain,
- std::vector<bool>(no_broadcast_count, true),
+ TensorDomain::getContiguityFilledWith(consumer_root_domain, true),
final_ids,
concrete_index_map,
GpuLower::current()->divisibleSplitSet(),
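
Note: for the stride computations above, the upshot is that the contiguity vector now lines up with the full root domain, so the full2nob_map indirection disappears. Below is a standalone model of the stride walk under the new convention, with concrete sizes in place of symbolic extents and an explicit runtime_strides input standing in for the strides the real code reads from the kernel arguments.

#include <cstddef>
#include <cstdint>
#include <optional>
#include <vector>

std::vector<int64_t> computeStrides(
    const std::vector<int64_t>& sizes,
    const std::vector<std::optional<bool>>& contiguity,
    const std::vector<int64_t>& runtime_strides) {
  std::vector<int64_t> strides(sizes.size(), 0);
  int64_t cur_contig_stride = 1;
  // Walk innermost to outermost, mirroring the loops above.
  for (size_t i = 0; i < sizes.size(); ++i) {
    size_t dim = sizes.size() - 1 - i;
    if (!contiguity[dim].has_value()) {
      // Broadcast dimension: contributes nothing to the index.
      strides[dim] = 0;
    } else if (*contiguity[dim]) {
      // Contiguous: reuse the running product of inner extents.
      strides[dim] = cur_contig_stride;
      cur_contig_stride *= sizes[dim];
    } else {
      // Discontiguous: fall back to the externally supplied stride and
      // restart the running product from it.
      strides[dim] = runtime_strides[dim];
      cur_contig_stride = runtime_strides[dim] * sizes[dim];
    }
  }
  return strides;
}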
22 changes: 18 additions & 4 deletions third_party/nvfuser/csrc/ir_interface_nodes.h
@@ -243,12 +243,13 @@ class TORCH_CUDA_CU_API TensorView : public Val {
//! expressions that use this TensorView are also updated.
void convertRfactorToRootDomain();

- void setContiguity(const std::vector<bool>& contig) {
+ void setContiguity(const std::vector<c10::optional<bool>>& contig) {
domain()->setContiguity(contig);
}

void setContiguity(bool contig) {
- setContiguity(std::vector<bool>(domain()->contiguity().size(), contig));
+ setContiguity(
+ TensorDomain::getContiguityFilledWith(getMaybeRFactorDomain(), contig));
}

bool hasReduction() const;
@@ -640,7 +641,8 @@ class TORCH_CUDA_CU_API TensorViewBuilder {
TensorViewBuilder& dtype(DataType dtype);

//! Set the contiguity information (default non-contiguous)
- TensorViewBuilder& contiguity(std::vector<bool> contiguity);
+ TensorViewBuilder& contiguity(std::vector<c10::optional<bool>> contiguity);
+ TensorViewBuilder& contiguity(bool contiguity);

//! Set the shape (default 0 dimensional, ie. scalar)
TensorViewBuilder& shape(std::vector<Val*> shape);
@@ -655,7 +657,19 @@
private:
size_t ndims_ = 0;
DataType dtype_ = DataType::Float;
- std::vector<bool> contiguity_;
+
+ // contiguity_ is the vector that you will pass to the constructor of
+ // TensorDomain. However, constructing this vector can be non-trivial,
+ // because it is required to be nullopt for broadcast dimensions. We often
+ // want a contiguity vector that is uniformly all-contiguous or all-
+ // discontiguous; uniform_contiguity_ makes that use case more convenient.
+ // If set, TensorViewBuilder automatically fills the contiguity vector with
+ // the value of uniform_contiguity_ wherever an entry is not required to be
+ // nullopt. Note that you can only set one of contiguity_ or
+ // uniform_contiguity_.
+ std::vector<c10::optional<bool>> contiguity_;
+ c10::optional<bool> uniform_contiguity_ = c10::nullopt;

std::vector<Val*> shape_;
std::vector<bool> expanded_;
};
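
Note: to make the comment above concrete, here is a minimal sketch of how the two-member scheme can behave; everything except the contiguity_/uniform_contiguity_ names is illustrative, not the actual TensorViewBuilder implementation.

#include <cassert>
#include <cstdint>
#include <optional>
#include <utility>
#include <vector>

struct BuilderSketch {
  std::vector<int64_t> shape_; // -1 for symbolic extents, 1 for broadcast
  std::vector<std::optional<bool>> contiguity_;
  std::optional<bool> uniform_contiguity_;

  BuilderSketch& contiguity(std::vector<std::optional<bool>> c) {
    assert(!uniform_contiguity_.has_value()); // only one of the two may be set
    contiguity_ = std::move(c);
    return *this;
  }

  BuilderSketch& contiguity(bool c) {
    assert(contiguity_.empty()); // only one of the two may be set
    uniform_contiguity_ = c;
    return *this;
  }

  // At build() time the uniform flag is expanded per dimension, with nullopt
  // in every slot that belongs to a broadcast dimension.
  std::vector<std::optional<bool>> finalContiguity() const {
    if (!uniform_contiguity_.has_value()) {
      return contiguity_;
    }
    std::vector<std::optional<bool>> out;
    out.reserve(shape_.size());
    for (int64_t extent : shape_) {
      out.push_back(
          extent == 1 ? std::nullopt
                      : std::optional<bool>(*uniform_contiguity_));
    }
    return out;
  }
};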