pytorch · swolchok · Mar 6, 2025 · Mar 1, 2025 · Mar 1, 2025 · Mar 1, 2025
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+#include <iostream>
+
+namespace torch {
+namespace executor {
+namespace native {
+
+// Optimized kernel for where.self_out: every element of `out` is taken from
+// `a` when the matching element of `cond` is true and from `b` otherwise.
+// Returns `out`.
+Tensor& opt_where_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& cond,
+    const Tensor& a,
+    const Tensor& b,
+    Tensor& out) {
+  // `out` is required to have the dtype that a and b promote to.
+  ScalarType common_type = promoteTypes(a.scalar_type(), b.scalar_type());
+  ET_KERNEL_CHECK(ctx, common_type == out.scalar_type(), InvalidArgument, out);
+
+  // All four tensors have to agree on dim order.
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(cond, a, b, out), InvalidArgument, out);
+
+  // Resize `out` to the broadcast target of a, b and cond.
+  ET_KERNEL_CHECK(
+      ctx,
+      resize_to_broadcast_target_size(a, b, cond, out) == Error::Ok,
+      InvalidArgument,
+      out);
+
+  // Dtype used for the element-wise selection.
+  ScalarType compute_type = utils::get_compute_type(common_type);
+
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "where.self_out";
+
+  // The fast path needs a, b and out to all use the compute dtype directly.
+  const bool dtypes_match_compute = a.scalar_type() == compute_type &&
+      b.scalar_type() == compute_type && out.scalar_type() == compute_type;
+  // Using a Byte tensor for cond has been deprecated for a long time.
+  const bool cond_is_bool = cond.scalar_type() == ScalarType::Bool;
+
+  if (!(dtypes_match_compute && cond_is_bool)) {
+    // Fall back for mixed dtype to keep code size and compile time
+    // reasonable.
+    ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+      utils::apply_tritensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
+          [](const CTYPE_COMPUTE lhs,
+             const CTYPE_COMPUTE rhs,
+             const CTYPE_COMPUTE pick_lhs) { return pick_lhs ? lhs : rhs; },
+          ctx,
+          a,
+          utils::SupportedTensorDtypes::REALHBBF16,
+          b,
+          utils::SupportedTensorDtypes::REALHBBF16,
+          cond,
+          utils::SupportedTensorDtypes::BOOL_OR_BYTE,
+          out,
+          utils::SupportedTensorDtypes::SAME_AS_COMMON);
+    });
+    return out;
+  }
+
+  const auto total_elements = out.numel();
+  ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
+    const CTYPE_COMPUTE* const a_data = a.const_data_ptr<CTYPE_COMPUTE>();
+    const CTYPE_COMPUTE* const b_data = b.const_data_ptr<CTYPE_COMPUTE>();
+    const bool* const cond_data = cond.const_data_ptr<bool>();
+    CTYPE_COMPUTE* const out_data = out.data_ptr<CTYPE_COMPUTE>();
+
+    // An input is broadcast exactly when its sizes differ from out's sizes.
+    const bool shapes_all_match = out.sizes().equals(a.sizes()) &&
+        out.sizes().equals(b.sizes()) && out.sizes().equals(cond.sizes());
+
+    if (shapes_all_match) {
+      // Nothing is broadcast: the same index is used for every buffer.
+      for (const auto idx : c10::irange(total_elements)) {
+        out_data[idx] = cond_data[idx] ? a_data[idx] : b_data[idx];
+      }
+    } else {
+      // At least one input is broadcast: take per-tensor indexes from
+      // BroadcastIndexesRange.
+      for (const auto [out_idx, a_idx, b_idx, cond_idx] :
+           BroadcastIndexesRange<3>(out, a, b, cond)) {
+        out_data[out_idx] = cond_data[cond_idx] ? a_data[a_idx] : b_data[b_idx];
+      }
+    }
+  });
+
+  return out;
+}
+
+} // namespace native
+} // namespace executor
+} // namespace torch
@@ -95,6 +95,12 @@ _OPTIMIZED_ATEN_OPS = (
             "//executorch/kernels/portable/cpu/util:broadcast_util",
         ],
     ),
+    op_target(
+        name = "op_where",
+        deps = [
+            "//executorch/kernels/portable/cpu/util:elementwise_util",
+        ],
+    ),
 )
 
 

@@ -101,3 +101,8 @@
   kernels:
     - arg_meta: null
       kernel_name: torch::executor::opt_sub_scalar_out
+
+- op: where.self_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_where_out
diff --git a/kernels/portable/cpu/op_argmax.cpp b/kernels/portable/cpu/op_argmax.cpp
@@ -50,7 +50,10 @@ Tensor& argmax_out(
     for (const auto out_ix : c10::irange(out.numel())) {
       std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>(
           [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) {
-            if (!std::isnan(acc_val) && (std::isnan(v) || v > acc_val)) {
+            // The condition below is equivalent to
+            // !isnan(acc_val) && (isnan(v) || v > acc_val). See the
+            // argument in op_argmin.cpp.
+            if (!std::isnan(acc_val) && !(v <= acc_val)) {
               acc_val = v;
               acc_ix = ix;
             }

diff --git a/kernels/portable/cpu/op_argmin.cpp b/kernels/portable/cpu/op_argmin.cpp
@@ -50,7 +50,17 @@ Tensor& argmin_out(
     for (const auto out_ix : c10::irange(out.numel())) {
       std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>(
           [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) {
-            if (!std::isnan(acc_val) && (std::isnan(v) || v < acc_val)) {
+            // The condition below is equivalent to
+            // !isnan(acc_val) && (isnan(v) || v < acc_val). Cases:
+            // - If neither acc_val nor v is NaN, !(v >= acc_val) is
+            //   trivially equivalent to v < acc_val.
+            // - If acc_val is NaN, both conditions are trivially false.
+            // - If acc_val is not NaN and v is NaN, then v >= acc_val
+            //   is false because every ordered comparison involving NaN
+            //   is false, so !(v >= acc_val) is true. The original
+            //   condition is also true in this case because isnan(v)
+            //   holds.
+            if (!std::isnan(acc_val) && !(v >= acc_val)) {
               acc_val = v;
               acc_ix = ix;
             }

diff --git a/kernels/portable/cpu/util/broadcast_indexes_range.h b/kernels/portable/cpu/util/broadcast_indexes_range.h
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <iterator>
+#include <tuple>
+
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/util/tensor_dimension_limit.h>
+
+namespace torch::executor {
+
+namespace internal {
+// Iterator over the linear index space of an output tensor together with
+// the matching linear indexes into kNumInputs possibly-broadcast input
+// tensors. Dereferencing yields an array whose entry 0 is the output index
+// and whose entries 1..kNumInputs are the input indexes, in the order the
+// inputs were passed to the constructor.
+//
+// Two iterators compare equal when their output indexes are equal, so an
+// end iterator only has to carry output.numel().
+template <std::size_t kNumInputs>
+class BroadcastIndexesIterator {
+ public:
+  using difference_type = ssize_t;
+  using value_type = std::array<ssize_t, kNumInputs + 1>;
+  using reference = const value_type&;
+  using pointer = const value_type*;
+  using iterator_category = std::forward_iterator_tag;
+
+  BroadcastIndexesIterator() = default;
+
+  // Builds an iterator positioned at output index 0. Exactly kNumInputs
+  // input tensors must be supplied after `output`.
+  template <typename... Args>
+  explicit BroadcastIndexesIterator(const Tensor& output, const Args&... args)
+      : output_dim_(output.dim()),
+        output_shape_(output.sizes()),
+        effective_input_broadcast_strides_{
+            effective_input_broadcast_stride(output, args)...} {
+    static_assert(
+        sizeof...(args) == kNumInputs && (std::is_same_v<Args, Tensor> && ...),
+        "BroadcastIndexesIterator constructor requires kNumInputs input tensor"
+        " arguments!");
+  }
+
+  // Tag type selecting the end-iterator constructor.
+  struct make_end_t {
+    explicit constexpr make_end_t() = default;
+  };
+
+  // Builds the end iterator for output tensor `t`. Only the output index is
+  // meaningful: it is set to t.numel(), and the remaining indexes are
+  // value-initialized to 0. The input tensors are accepted but unused.
+  template <typename... Args>
+  BroadcastIndexesIterator(
+      make_end_t,
+      const Tensor& t,
+      const Args&... /*unused_inputs*/)
+      : current_indexes_{t.numel()} {}
+
+  bool operator==(const BroadcastIndexesIterator& rhs) const {
+    return output_index() == rhs.output_index();
+  }
+
+  bool operator!=(const BroadcastIndexesIterator& rhs) const {
+    return !operator==(rhs);
+  }
+
+  reference operator*() const {
+    return current_indexes_;
+  }
+
+  pointer operator->() const {
+    return &current_indexes_;
+  }
+
+  // Advances to the next output element and updates every input index
+  // incrementally. The innermost dimension is bumped first; a dimension
+  // that is already at its last coordinate wraps to 0 and the carry moves
+  // on to the next-outer dimension.
+  BroadcastIndexesIterator& operator++() {
+    output_index()++;
+    // TODO: add optimization for particular input tensors not being
+    // broadcasted?
+    for (auto ii = output_dim_ - 1; ii >= 0; --ii) {
+      // You might wonder what happens if output_shape_[ii] == 0. In that case,
+      // output.numel() would be 0, and thus the iterator would be the end()
+      // iterator, which is not legal to increment.
+      if ET_UNLIKELY (delinearized_output_index_[ii] == output_shape_[ii] - 1) {
+        // Wrap this dimension: undo the contribution its coordinate made to
+        // each input index, then keep carrying.
+        const auto old_delinearized_output_index_item =
+            delinearized_output_index_[ii];
+        delinearized_output_index_[ii] = 0;
+        for (const auto jj : c10::irange(1, kNumInputs + 1)) {
+          current_indexes_[jj] -= old_delinearized_output_index_item *
+              effective_input_broadcast_strides_[jj - 1][ii];
+        }
+      } else {
+        // No wrap: step one coordinate along this dimension and stop.
+        delinearized_output_index_[ii]++;
+        for (const auto jj : c10::irange(1, kNumInputs + 1)) {
+          // jj is always within [1, kNumInputs], so unchecked indexing is
+          // used here, matching the wrap branch above.
+          current_indexes_[jj] +=
+              effective_input_broadcast_strides_[jj - 1][ii];
+        }
+        break;
+      }
+    }
+    return *this;
+  }
+
+  BroadcastIndexesIterator operator++(int) {
+    auto it = *this;
+    operator++();
+    return it;
+  }
+
+  // Distance between two iterators, measured in output elements.
+  difference_type operator-(const BroadcastIndexesIterator& rhs) const {
+    return difference_type(output_index() - rhs.output_index());
+  }
+
+ private:
+  ssize_t output_index() const {
+    return current_indexes_[0];
+  }
+
+  ssize_t& output_index() {
+    return current_indexes_[0];
+  }
+
+  // Returns the strides of `t` aligned to the dimensions of `output`:
+  // leading dimensions that `t` lacks, and dimensions where `t` has size 1,
+  // get stride 0; every other dimension keeps the stride `t` reports.
+  static std::
+      array<exec_aten::SizesType, executorch::runtime::kTensorDimensionLimit>
+      effective_input_broadcast_stride(const Tensor& output, const Tensor& t) {
+    // Zero-initialized, so the leading (missing) dimensions already have
+    // stride 0 and need no further work.
+    std::array<exec_aten::SizesType, executorch::runtime::kTensorDimensionLimit>
+        result = {0};
+    ET_CHECK_MSG(
+        t.dim() <= output.dim(),
+        "input to broadcasting op should have dim at most output dim, but %d > %d!",
+        static_cast<int>(t.dim()),
+        static_cast<int>(output.dim()));
+
+    const auto num_leading_ones = output.dim() - t.dim();
+    const auto t_sizes = t.sizes();
+    const auto t_strides = t.strides();
+    for (const auto idx :
+         c10::irange(num_leading_ones, num_leading_ones + t.dim())) {
+      result[idx] = t_sizes[idx - num_leading_ones] == 1
+          ? 0
+          : t_strides[idx - num_leading_ones];
+    }
+    return result;
+  }
+
+  // The 0th entry is the current linear index into the output,
+  // followed by kNumInputs input indexes.
+  std::array<ssize_t, kNumInputs + 1> current_indexes_ = {0};
+  using ShapeType = std::
+      array<exec_aten::SizesType, executorch::runtime::kTensorDimensionLimit>;
+  // Per-dimension coordinates of the current output element.
+  ShapeType delinearized_output_index_ = {0};
+  // Defaults to 0 so that default-constructed and end iterators never hold
+  // (or copy) an indeterminate value.
+  ssize_t output_dim_ = 0;
+  ArrayRef<exec_aten::SizesType> output_shape_;
+  // The linear index for a broadcast tensor is
+  // sum(delinearized_output_index_[i] * input_stride_[i] if
+  // padded_input_shape_[i] != 1 else 0), where padded_input_shape is
+  // input.sizes() with leading 1s added to make its size equal to
+  // output_dim. This is straightforwardly implementable with an
+  // adjusted stride array that contains 0s where the padded input
+  // shape would contain 1s.
+  std::array<ShapeType, kNumInputs> effective_input_broadcast_strides_ = {{0}};
+};
+} // namespace internal
+
+// Efficient mechanism for looping over the index space for an output
+// tensor and kNumInputs possibly-broadcasted input tensors. Use as follows:
+//
+// auto* output_data = output.mutable_data_ptr<OutputType>();
+// const auto* a_data = a.const_data_ptr<AType>();
+// const auto* b_data = b.const_data_ptr<BType>();
+// for (const auto [output_index, a_index, b_index] :
+//      BroadcastIndexesRange<2>(output, a, b)) {
+//   // Access output_data[output_index], a_data[a_index], and b_data[b_index].
+// }
+//
+// (where OutputType, AType, and BType are known concrete types.)
+//
+// Unlike looping using delinearize_index() and
+// linearize_access_indexes(), BroadcastIndexesRange avoids expensive
+// division and modulo operations on each iteration.
+//
+// The range stores pointers to the tensors it is given, so those tensors
+// must outlive the range and any iterators obtained from it.
+template <std::size_t kNumInputs>
+class BroadcastIndexesRange {
+ public:
+  using iterator = internal::BroadcastIndexesIterator<kNumInputs>;
+
+  // `args` must be exactly kNumInputs input tensors.
+  template <typename... Args>
+  BroadcastIndexesRange(const Tensor& output, const Args&... args)
+      : tensors_{&output, (&args)...} {
+    // Passing too few inputs would otherwise compile: the missing slots of
+    // tensors_ would be null and begin()/end() would dereference them.
+    static_assert(
+        sizeof...(Args) == kNumInputs,
+        "BroadcastIndexesRange constructor requires kNumInputs input tensor "
+        "arguments!");
+  }
+
+  // Iterator positioned at the first output element.
+  iterator begin() const {
+    return std::apply(
+        [](const auto&... args) { return iterator((*args)...); }, tensors_);
+  }
+
+  // Iterator whose output index equals the output tensor's numel().
+  iterator end() const {
+    return std::apply(
+        [](const auto&... args) {
+          return iterator(typename iterator::make_end_t(), (*args)...);
+        },
+        tensors_);
+  }
+
+ private:
+  // Entry 0 is the output tensor; entries 1..kNumInputs are the inputs.
+  std::array<const Tensor*, kNumInputs + 1> tensors_;
+};
+} // namespace torch::executor