Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 711282e

Browse files
committedApr 7, 2025
Update base for Update on "[ET-VK][ez] Make squeeze insertion requirements more strict"
## Context Refactor the `SqueezeUnsqueezeInputs` pass to be more clear about its intention. For Llama models, input shapes to 4 bit linear will oftentimes have the shape `[1, seq_len, dim]`; under the current implementation of the pass, the input would be squeezed to `[seq_len, dim]` even though the squeeze is not necessary. The original intention of thispass was to squeeze inputs with shape `[batch_size, 1, dim]` to `[batch_size, dim]` before calling the 4-bit linear operator. ## Changes To avoid inserting unnecessary squeeze/unsqueezes, be more specific about when squeeze/unsqueeze should be added. I would like to consider refactoring this pass in the future, since the logic is currently a bit uninttuitive. Squeeze/unsqueeze is also inserted for gelu and relu, but this is to create a chain of unsqueeze/squeeze that will be eliminated by a later pass (see #8601 / D69673068). I think eventually it will be good to rewrite the pass to make shape management more explicit and self contained within the pass rather than inserting ops which are expected to be removed later on. Differential Revision: [D72480178](https://our.internmc.facebook.com/intern/diff/D72480178/) [ghstack-poisoned]
2 parents f1e2f1a + 6adff9c commit 711282e

File tree

39 files changed

+2195
-740
lines changed

39 files changed

+2195
-740
lines changed
 
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
7ae0ce6360b6e4f944906502d20da24c04debee5
1+
59d5cf083b4f860dea76fe8936076177f9367f10

‎backends/arm/test/models/test_conformer.py‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ class TestConformer(unittest.TestCase):
3131
# .to_executorch step, i.e. after Arm partitioner.
3232
ops_after_partitioner = {
3333
"executorch_exir_dialects_edge__ops_aten_max_default": 1,
34-
"torch.ops.aten._assert_scalar.default": 10,
34+
"torch.ops.aten._assert_scalar.default": 7,
3535
"torch.ops.aten._local_scalar_dense.default": 1,
3636
}
3737

‎backends/arm/test/models/test_llama.py‎

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import sys
1212
import unittest
1313

14+
import pytest
1415
import torch
1516

1617
from executorch.backends.arm.test import common, conftest
@@ -102,7 +103,7 @@ def test_llama_tosa_MI(self):
102103
llama_model, llama_inputs, llama_meta = self.prepare_model()
103104

104105
if llama_model is None and llama_inputs is None and llama_meta is None:
105-
return
106+
pytest.skip("Missing model and/or input files")
106107

107108
with torch.no_grad():
108109
(

‎backends/xnnpack/operators/op_slice_copy.py‎

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,9 @@ def define_node(
6969
output_shape = [output_shape[i] for i in PERM_NCHW_TO_NHWC]
7070
dim_of_slice = PERM_NHWC_TO_NCHW[dim_of_slice]
7171

72-
slice_begin_index = cast(int, node.args[2])
72+
slice_begin_index = 0
73+
if len(node.args) > 2 and node.args[2]:
74+
slice_begin_index = cast(int, node.args[2])
7375
if slice_begin_index < 0:
7476
slice_begin_index = input_shape[dim_of_slice] + slice_begin_index
7577

‎backends/xnnpack/test/ops/test_slice_copy.py‎

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,18 @@ def forward(self, x):
6969
# Note that two of the slices are optimized away as they are identity.
7070
self._test_slice_copy(ConvSlice(), inputs, 4, 2)
7171

72+
def test_fp32_slice_copy_default_start(self):
73+
"""
74+
XNNPACK supports default start in slice op.
75+
"""
76+
77+
class Slice(torch.nn.Module):
78+
def forward(self, x):
79+
return torch.ops.aten.slice.Tensor(x, 0, None, 2)
80+
81+
inputs = (torch.randn(5, 5),)
82+
self._test_slice_copy(Slice(), inputs, 1, 1)
83+
7284
def test_fp32_slice_copy_stride_non_1(self):
7385
"""
7486
XNNPACK does not support strided slicing.

‎devtools/etdump/etdump_filter.cpp‎

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <executorch/devtools/etdump/etdump_filter.h>
10+
11+
#include <executorch/runtime/core/error.h>
12+
13+
using ::executorch::runtime::DelegateDebugIntId;
14+
using ::executorch::runtime::Error;
15+
using ::executorch::runtime::kUnsetDelegateDebugIntId;
16+
17+
namespace executorch {
18+
namespace etdump {
19+
20+
ETDumpFilter::ETDumpFilter() = default;
21+
22+
Result<bool> ETDumpFilter::add_regex(string_view pattern) {
23+
auto regex = std::make_unique<re2::RE2>(pattern.data());
24+
if (!regex->ok()) {
25+
return Error::InvalidArgument; // Error during regex compilation
26+
}
27+
regex_patterns_.emplace_back(std::move(regex));
28+
return true;
29+
}
30+
31+
Result<bool> ETDumpFilter::set_debug_handle_range(size_t start, size_t end) {
32+
if (start >= end) {
33+
return Error::InvalidArgument; // Start is greater than end
34+
}
35+
if (start < 0 || end < 0) {
36+
return Error::InvalidArgument; // Start or end is negative
37+
}
38+
range_start_ = start;
39+
range_end_ = end;
40+
return true;
41+
}
42+
43+
Result<bool> ETDumpFilter::filter_name_(const char* name) {
44+
if (name == nullptr) {
45+
return Error::InvalidArgument;
46+
}
47+
if (regex_patterns_.empty()) {
48+
return true;
49+
}
50+
for (const auto& regex : regex_patterns_) {
51+
if (RE2::FullMatch(name, *regex)) {
52+
return true;
53+
}
54+
}
55+
return false;
56+
}
57+
58+
Result<bool> ETDumpFilter::filter_delegate_debug_index_(
59+
DelegateDebugIntId debug_handle) {
60+
if (debug_handle == kUnsetDelegateDebugIntId) {
61+
return Error::InvalidArgument; // Delegate debug index is unset
62+
}
63+
64+
if (range_start_ == 0 && range_end_ == 0) {
65+
return true;
66+
}
67+
68+
if (debug_handle < range_start_ || debug_handle >= range_end_) {
69+
return false;
70+
}
71+
72+
return true;
73+
}
74+
75+
Result<bool> ETDumpFilter::filter(
76+
const char* name,
77+
DelegateDebugIntId delegate_debug_index) {
78+
if ((name == nullptr) == (delegate_debug_index == kUnsetDelegateDebugIntId)) {
79+
return Error::InvalidArgument; // Name and delegate debug index should be
80+
// both set or unset
81+
}
82+
83+
if (name) {
84+
return filter_name_(name);
85+
} else {
86+
return filter_delegate_debug_index_(delegate_debug_index);
87+
}
88+
}
89+
90+
size_t ETDumpFilter::get_n_regex() const {
91+
return regex_patterns_.size();
92+
}
93+
94+
} // namespace etdump
95+
} // namespace executorch

‎devtools/etdump/etdump_filter.h‎

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#pragma once
10+
11+
#include <re2/re2.h>
12+
#include <memory>
13+
14+
#include <executorch/runtime/core/event_tracer.h>
15+
#include <executorch/runtime/core/result.h>
16+
#include <executorch/runtime/platform/platform.h>
17+
18+
namespace executorch::etdump {
19+
20+
using ::executorch::aten::string_view;
21+
using ::executorch::runtime::Result;
22+
23+
/**
24+
* ETDumpFilter is a class that filters intermediate output based on output's
25+
* name by full regex filtering, or delegate debug indices by range-based
26+
* filtering.
27+
*
28+
* Note that this filter supports up to MAX_REGEX_PATTERNS regex patterns with a
29+
* maximum length of MAX_PATTERN_LENGTH characters each.
30+
*/
31+
32+
class ETDumpFilter : public ::executorch::runtime::EventTracerFilterBase {
33+
public:
34+
ETDumpFilter();
35+
~ETDumpFilter() override = default;
36+
/**
37+
* Adds a regex pattern to the filter.
38+
*
39+
* @param[in] pattern A c string representing the regex pattern to be added.
40+
*
41+
* @return A Result<bool> indicating the success or failure of adding the
42+
* regex pattern.
43+
* - True if the pattern is successfully added.
44+
* - False if the pattern could not be added or if the maximum number
45+
* of patterns is exceeded.
46+
* - An error code if the number of patterns has reached the cap, or any
47+
* error occurs during regex compilation.
48+
*/
49+
Result<bool> add_regex(string_view pattern);
50+
/**
51+
* Sets the range for the delegate debug index filtering as [start, end).
52+
* Note that this function will flush the existing range.
53+
*
54+
* @param[in] start The start of the range for filtering.
55+
* @param[in] end The end of the range for filtering.
56+
*
57+
* @return A Result<bool> indicating the success or failure of setting the
58+
* range.
59+
* - True if the range is successfully set.
60+
* - An error code if an error occurs.
61+
*/
62+
Result<bool> set_debug_handle_range(size_t start, size_t end);
63+
64+
/**
65+
* Filters events based on the given name or delegate debug index.
66+
*
67+
* Note that every time only one of either the name or delegate_debug_index
68+
* should be passed in.
69+
*
70+
* @param[in] name A pointer to a string representing the `name` of the
71+
* event. If `delegate_debug_index` is not set to kUnsetDebugHandle, `name`
72+
* should be set to nullptr.
73+
*
74+
* @param[in] delegate_debug_index A DebugHandle representing the debug index
75+
* of the delegate. If `name` is not nullptr, this should be set to
76+
* kUnsetDebugHandle.
77+
*
78+
* @return A Result<bool> indicating whether the event matches the filter
79+
* criteria.
80+
* - True if the event matches the filter, or filter is unset.
81+
* - False if the event does not match or is unknown.
82+
* - An error code if an error occurs during filtering.
83+
*/
84+
Result<bool> filter(
85+
const char* name,
86+
::executorch::runtime::DelegateDebugIntId delegate_debug_index) override;
87+
88+
/**
89+
* Returns the number of regex patterns in the filter.
90+
*/
91+
size_t get_n_regex() const;
92+
93+
private:
94+
std::vector<std::unique_ptr<re2::RE2>> regex_patterns_;
95+
size_t range_start_ = 0;
96+
size_t range_end_ = 0;
97+
Result<bool> filter_name_(const char* name);
98+
Result<bool> filter_delegate_debug_index_(
99+
::executorch::runtime::DelegateDebugIntId delegate_debug_index);
100+
};
101+
102+
} // namespace executorch::etdump

‎devtools/etdump/etdump_flatcc.cpp‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <executorch/devtools/etdump/etdump_schema_flatcc_builder.h>
1616
#include <executorch/devtools/etdump/etdump_schema_flatcc_reader.h>
1717
#include <executorch/devtools/etdump/utils.h>
18+
#include <executorch/runtime/core/error.h>
1819
#include <executorch/runtime/core/exec_aten/exec_aten.h>
1920
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
2021
#include <executorch/runtime/platform/assert.h>
@@ -28,6 +29,7 @@ using ::executorch::runtime::ChainID;
2829
using ::executorch::runtime::DebugHandle;
2930
using ::executorch::runtime::DelegateDebugIdType;
3031
using ::executorch::runtime::DelegateDebugIntId;
32+
using ::executorch::runtime::Error;
3133
using ::executorch::runtime::EValue;
3234
using ::executorch::runtime::EventTracerEntry;
3335
using ::executorch::runtime::kUnsetDelegateDebugIntId;

‎devtools/etdump/etdump_flatcc.h‎

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
#pragma once
1010

1111
#include <cstdint>
12-
#include <memory>
1312

1413
#include <executorch/devtools/etdump/data_sinks/buffer_data_sink.h>
1514
#include <executorch/devtools/etdump/data_sinks/data_sink_base.h>

‎devtools/etdump/targets.bzl‎

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,27 @@ def define_common_targets():
101101
for aten_mode in get_aten_mode_options():
102102
aten_suffix = "_aten" if aten_mode else ""
103103

104+
runtime.cxx_library(
105+
name = "etdump_filter" + aten_suffix,
106+
srcs = [
107+
"etdump_filter.cpp",
108+
],
109+
exported_headers = [
110+
"etdump_filter.h",
111+
],
112+
deps = [
113+
"//executorch/runtime/platform:platform",
114+
],
115+
exported_deps = [
116+
"fbsource//third-party/re2:re2",
117+
"//executorch/runtime/core:event_tracer" + aten_suffix,
118+
],
119+
visibility = [
120+
"//executorch/...",
121+
"@EXECUTORCH_CLIENTS",
122+
],
123+
)
124+
104125
runtime.cxx_library(
105126
name = "etdump_flatcc" + aten_suffix,
106127
srcs = [
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <gtest/gtest.h>
10+
11+
#include <executorch/devtools/etdump/etdump_filter.h>
12+
#include <executorch/runtime/platform/runtime.h>
13+
14+
#include <cstring>
15+
16+
using ::executorch::etdump::ETDumpFilter;
17+
using ::executorch::runtime::Error;
18+
using ::executorch::runtime::kUnsetDelegateDebugIntId;
19+
using ::executorch::runtime::Result;
20+
21+
class ETDumpFilterTest : public ::testing::Test {
22+
protected:
23+
ETDumpFilter filter;
24+
25+
void SetUp() override {
26+
torch::executor::runtime_init();
27+
}
28+
29+
void TearDown() override {}
30+
};
31+
32+
TEST_F(ETDumpFilterTest, AddRegexPatternSuccess) {
33+
Result<bool> result = filter.add_regex("test.*");
34+
EXPECT_TRUE(result.ok());
35+
EXPECT_TRUE(result.get());
36+
}
37+
38+
TEST_F(ETDumpFilterTest, SetDebugHandleRangeSuccess) {
39+
Result<bool> result = filter.set_debug_handle_range(10, 20);
40+
EXPECT_TRUE(result.ok());
41+
EXPECT_TRUE(result.get());
42+
}
43+
44+
TEST_F(ETDumpFilterTest, SetDebugHandleRangeFailure) {
45+
Result<bool> result = filter.set_debug_handle_range(20, 10);
46+
EXPECT_EQ(result.error(), Error::InvalidArgument);
47+
}
48+
49+
TEST_F(ETDumpFilterTest, FilterByNameSuccess) {
50+
filter.add_regex("event.*");
51+
Result<bool> result = filter.filter("event_name", kUnsetDelegateDebugIntId);
52+
EXPECT_TRUE(result.ok());
53+
EXPECT_TRUE(result.get());
54+
}
55+
56+
TEST_F(ETDumpFilterTest, PartialMatchingFailed) {
57+
filter.add_regex("event.*");
58+
Result<bool> result =
59+
filter.filter("non_matching_event", kUnsetDelegateDebugIntId);
60+
EXPECT_TRUE(result.ok());
61+
EXPECT_FALSE(result.get());
62+
}
63+
64+
TEST_F(ETDumpFilterTest, FilterByDelegateDebugIndexSuccess) {
65+
filter.set_debug_handle_range(10, 20);
66+
Result<bool> result = filter.filter(nullptr, 15);
67+
EXPECT_TRUE(result.ok());
68+
EXPECT_TRUE(result.get());
69+
}
70+
71+
TEST_F(ETDumpFilterTest, FilterByDelegateDebugIndexFailure) {
72+
filter.set_debug_handle_range(10, 20);
73+
Result<bool> result = filter.filter(nullptr, 25);
74+
EXPECT_TRUE(result.ok());
75+
EXPECT_FALSE(result.get());
76+
}
77+
78+
TEST_F(ETDumpFilterTest, NaiveFilterNameInputCanSucceed) {
79+
Result<bool> result = filter.filter("any_input", kUnsetDelegateDebugIntId);
80+
EXPECT_TRUE(result.ok());
81+
EXPECT_TRUE(result.get());
82+
}
83+
84+
TEST_F(ETDumpFilterTest, NaiveFilterDebugHandleInputCanSucceed) {
85+
Result<bool> result = filter.filter(nullptr, 12345);
86+
EXPECT_TRUE(result.ok());
87+
EXPECT_TRUE(result.get());
88+
}
89+
90+
TEST_F(ETDumpFilterTest, IllegalInput) {
91+
filter.add_regex("pattern");
92+
Result<bool> result = filter.filter("matching_event", 1);
93+
EXPECT_EQ(result.error(), Error::InvalidArgument);
94+
}
95+
96+
TEST_F(ETDumpFilterTest, NoMatchFirstThenMatch) {
97+
filter.add_regex("non_matching_pattern");
98+
Result<bool> result_1 =
99+
filter.filter("matching_event", kUnsetDelegateDebugIntId);
100+
EXPECT_TRUE(result_1.ok());
101+
EXPECT_FALSE(result_1.get());
102+
filter.add_regex("matching_.*");
103+
Result<bool> result_2 =
104+
filter.filter("matching_event", kUnsetDelegateDebugIntId);
105+
EXPECT_TRUE(result_2.ok());
106+
EXPECT_TRUE(result_2.get());
107+
}
108+
109+
TEST_F(ETDumpFilterTest, MatchRegexFirstThen) {
110+
filter.add_regex("matching.*");
111+
Result<bool> result_1 =
112+
filter.filter("matching_event", kUnsetDelegateDebugIntId);
113+
EXPECT_TRUE(result_1.ok());
114+
EXPECT_TRUE(result_1.get());
115+
filter.add_regex("non_matching_pattern");
116+
Result<bool> result_2 =
117+
filter.filter("matching_event", kUnsetDelegateDebugIntId);
118+
EXPECT_TRUE(result_2.ok());
119+
EXPECT_TRUE(result_2.get());
120+
}

‎devtools/etdump/tests/targets.bzl‎

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,14 @@ def define_common_targets():
2121
"//executorch/runtime/core/exec_aten/testing_util:tensor_util",
2222
],
2323
)
24+
25+
runtime.cxx_test(
26+
name = "etdump_filter_test",
27+
srcs = [
28+
"etdump_filter_test.cpp",
29+
],
30+
deps = [
31+
"//executorch/devtools/etdump:etdump_filter",
32+
"//executorch/runtime/platform:platform",
33+
],
34+
)

‎examples/arm/README.md‎

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,16 @@ $ source executorch/examples/arm/ethos-u-scratch/setup_path.sh
3232
$ executorch/examples/arm/run.sh --model_name=mv2 --target=ethos-u85-128 [--scratch-dir=same-optional-scratch-dir-as-before]
3333
```
3434

35+
### Ethos-U minimal example
36+
37+
See the jupyter notebook `ethos_u_minimal_example.ipynb` for an explained minimal example of the full flow for running a
38+
PyTorch module on the EthosUDelegate. The notebook runs directly in some IDEs such as VS Code, otherwise it can be run in
39+
your browser using
40+
```
41+
pip install jupyter
42+
jupyter notebook ethos_u_minimal_example.ipynb
43+
```
44+
3545
### Online Tutorial
3646

3747
We also have a [tutorial](https://pytorch.org/executorch/stable/executorch-arm-delegate-tutorial.html) explaining the steps performed in these
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
From 23712ff626db16793d428dddcb530f9e5faaa073 Mon Sep 17 00:00:00 2001
2+
From: Adrian Lundell <adrian.lundell@arm.com>
3+
Date: Thu, 3 Apr 2025 14:25:52 +0200
4+
Subject: [PATCH] Move input_data_sec to NOLOAD area
5+
6+
---
7+
targets/corstone-300/platform.ld | 10 ++++++++--
8+
targets/corstone-320/platform.ld | 8 ++++++--
9+
2 files changed, 14 insertions(+), 4 deletions(-)
10+
11+
diff --git a/targets/corstone-300/platform.ld b/targets/corstone-300/platform.ld
12+
index 1733509..3ccce64 100644
13+
--- a/targets/corstone-300/platform.ld
14+
+++ b/targets/corstone-300/platform.ld
15+
@@ -272,13 +272,12 @@ SECTIONS
16+
*(.bss.tensor_arena)
17+
#endif
18+
19+
- . = ALIGN(4);
20+
- *(input_data_sec)
21+
. = ALIGN(16);
22+
#if (ETHOSU_MODEL == 1)
23+
*(network_model_sec)
24+
#endif
25+
* (expected_output_data_sec)
26+
+ . = ALIGN(16);
27+
* (sec_command_stream, sec_weight_data, sec_input_data)
28+
*(.got*)
29+
*(.rodata*)
30+
@@ -287,6 +286,13 @@ SECTIONS
31+
. = ALIGN(4);
32+
} > DDR :rom_dram
33+
34+
+ .ddr_noload (NOLOAD) :
35+
+ {
36+
+ . = ALIGN(16);
37+
+ *(input_data_sec)
38+
+ . = ALIGN(16);
39+
+ } > DDR :null
40+
+
41+
__eddr_data = ALIGN (4) ;
42+
.sram.data : {
43+
__sram_data_start__ = .;
44+
diff --git a/targets/corstone-320/platform.ld b/targets/corstone-320/platform.ld
45+
index c8261c0..9b7e071 100644
46+
--- a/targets/corstone-320/platform.ld
47+
+++ b/targets/corstone-320/platform.ld
48+
@@ -268,8 +268,6 @@ SECTIONS
49+
*(network_model_sec)
50+
#endif
51+
52+
- . = ALIGN(4);
53+
- *(input_data_sec)
54+
*(expected_output_data_sec)
55+
*(output_data_sec)
56+
57+
@@ -279,6 +277,12 @@ SECTIONS
58+
__etext = .;
59+
} > DDR :rom_dram
60+
61+
+ .ddr_noload (NOLOAD) :
62+
+ {
63+
+ . = ALIGN(16);
64+
+ *(input_data_sec)
65+
+ } > DDR :null
66+
+
67+
.bss :
68+
{
69+
. = ALIGN(4);
70+
--
71+
2.43.0
72+
Lines changed: 284 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,284 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"# Copyright 2025 Arm Limited and/or its affiliates.\n",
10+
"#\n",
11+
"# This source code is licensed under the BSD-style license found in the\n",
12+
"# LICENSE file in the root directory of this source tree."
13+
]
14+
},
15+
{
16+
"cell_type": "markdown",
17+
"metadata": {},
18+
"source": [
19+
"# Ethos-U delegate flow example\n",
20+
"\n",
21+
"This guide demonstrates the full flow for running a module on Arm Ethos-U using ExecuTorch. \n",
22+
"Tested on Linux x86_64 and macOS aarch64. If something is not working for you, please raise a GitHub issue and tag Arm.\n",
23+
"\n",
24+
"Before you begin:\n",
25+
"1. (In a clean virtual environment with a compatible Python version) Install executorch using `./install_executorch.sh`\n",
26+
"2. Install Arm cross-compilation toolchain and simulators using `examples/arm/setup.sh --i-agree-to-the-contained-eula`\n",
27+
"3. Add Arm cross-compilation toolchain and simulators to PATH using `examples/arm/ethos-u-scratch/setup_path.sh` \n",
28+
"\n",
29+
"With all commands executed from the base `executorch` folder.\n",
30+
"\n",
31+
"\n",
32+
"\n",
33+
"*Some scripts in this notebook produces long output logs: Configuring the 'Customizing Notebook Layout' settings to enable 'Output:scrolling' and setting 'Output:Text Line Limit' makes this more manageable*"
34+
]
35+
},
36+
{
37+
"cell_type": "markdown",
38+
"metadata": {},
39+
"source": [
40+
"## AOT Flow\n",
41+
"\n",
42+
"The first step is creating the PyTorch module and exporting it. Exporting converts the python code in the module into a graph structure. The result is still runnable python code, which can be displayed by printing the `graph_module` of the exported program. "
43+
]
44+
},
45+
{
46+
"cell_type": "code",
47+
"execution_count": null,
48+
"metadata": {},
49+
"outputs": [],
50+
"source": [
51+
"import torch\n",
52+
"\n",
53+
"class Add(torch.nn.Module):\n",
54+
" def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:\n",
55+
" return x + y\n",
56+
"\n",
57+
"example_inputs = (torch.ones(1,1,1,1),torch.ones(1,1,1,1))\n",
58+
"\n",
59+
"model = Add()\n",
60+
"model = model.eval()\n",
61+
"exported_program = torch.export.export_for_training(model, example_inputs)\n",
62+
"graph_module = exported_program.module()\n",
63+
"\n",
64+
"_ = graph_module.print_readable()"
65+
]
66+
},
67+
{
68+
"cell_type": "markdown",
69+
"metadata": {},
70+
"source": [
71+
"To run on Ethos-U the `graph_module` must be quantized using the `arm_quantizer`. Quantization can be done in multiple ways and it can be customized for different parts of the graph; shown here is the recommended path for the EthosUBackend. Quantization also requires calibrating the module with example inputs.\n",
72+
"\n",
73+
"Again printing the module, it can be seen that the quantization wraps the node in quantization/dequantization nodes which contain the computed quantization parameters."
74+
]
75+
},
76+
{
77+
"cell_type": "code",
78+
"execution_count": null,
79+
"metadata": {},
80+
"outputs": [],
81+
"source": [
82+
"from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder\n",
83+
"from executorch.backends.arm.quantizer.arm_quantizer import (\n",
84+
" EthosUQuantizer,\n",
85+
" get_symmetric_quantization_config,\n",
86+
")\n",
87+
"from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e\n",
88+
"\n",
89+
"target = \"ethos-u55-128\"\n",
90+
"\n",
91+
"# Create a compilation spec describing the target for configuring the quantizer\n",
92+
"# Some args are used by the Arm Vela graph compiler later in the example. Refer to Arm Vela documentation for an \n",
93+
"# explanation of its flags: https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela/-/blob/main/OPTIONS.md\n",
94+
"spec_builder = ArmCompileSpecBuilder().ethosu_compile_spec(\n",
95+
" target,\n",
96+
" system_config=\"Ethos_U55_High_End_Embedded\",\n",
97+
" memory_mode=\"Shared_Sram\",\n",
98+
" extra_flags=\"--output-format=raw --debug-force-regor\"\n",
99+
" )\n",
100+
"compile_spec = spec_builder.build()\n",
101+
"\n",
102+
"# Create and configure quantizer to use a symmetric quantization config globally on all nodes\n",
103+
"quantizer = EthosUQuantizer(compile_spec) \n",
104+
"operator_config = get_symmetric_quantization_config(is_per_channel=False)\n",
105+
"quantizer.set_global(operator_config)\n",
106+
"\n",
107+
"# Post training quantization\n",
108+
"quantized_graph_module = prepare_pt2e(graph_module, quantizer) \n",
109+
"quantized_graph_module(*example_inputs) # Calibrate the graph module with the example input\n",
110+
"quantized_graph_module = convert_pt2e(quantized_graph_module)\n",
111+
"\n",
112+
"_ = quantized_graph_module.print_readable()\n",
113+
"\n",
114+
"# Create a new exported program using the quantized_graph_module\n",
115+
"quantized_exported_program = torch.export.export_for_training(quantized_graph_module, example_inputs)"
116+
]
117+
},
118+
{
119+
"cell_type": "markdown",
120+
"metadata": {},
121+
"source": [
122+
"The quantization nodes created in the previous cell are not built by default with ExecuTorch but must be included in the .pte-file, and so they need to be built separately. `backends/arm/scripts/build_quantized_ops_aot_lib.sh` is a utility script which does this. "
123+
]
124+
},
125+
{
126+
"cell_type": "code",
127+
"execution_count": null,
128+
"metadata": {},
129+
"outputs": [],
130+
"source": [
131+
"import subprocess \n",
132+
"import os \n",
133+
"\n",
134+
"# Setup paths\n",
135+
"cwd_dir = os.getcwd()\n",
136+
"et_dir = os.path.join(cwd_dir, \"..\", \"..\")\n",
137+
"et_dir = os.path.abspath(et_dir)\n",
138+
"script_dir = os.path.join(et_dir, \"backends\", \"arm\", \"scripts\")\n",
139+
"\n",
140+
"# Run build_quantized_ops_aot_lib.sh\n",
141+
"subprocess.run(os.path.join(script_dir, \"build_quantized_ops_aot_lib.sh\"), shell=True, cwd=et_dir)"
142+
]
143+
},
144+
{
145+
"cell_type": "markdown",
146+
"metadata": {},
147+
"source": [
148+
"The lowering in the EthosUBackend happens in five steps:\n",
149+
"\n",
150+
"1. **Lowering to core Aten operator set**: Transform module to use a subset of operators applicable to edge devices. \n",
151+
"2. **Partitioning**: Find subgraphs which are supported for running on Ethos-U\n",
152+
"3. **Lowering to TOSA compatible operator set**: Perform transforms to make the Ethos-U subgraph(s) compatible with TOSA \n",
153+
"4. **Serialization to TOSA**: Compiles the graph module into a TOSA graph \n",
154+
"5. **Compilation to NPU**: Compiles the TOSA graph into an EthosU command stream using the Arm Vela graph compiler. This makes use of the `compile_spec` created earlier.\n",
155+
"Step 5 also prints a Network summary for each processed subgraph.\n",
156+
"\n",
157+
"All of this happens behind the scenes in `to_edge_transform_and_lower`. Printing the graph module shows that what is left in the graph is two quantization nodes for `x` and `y` going into an `executorch_call_delegate` node, followed by a dequantization node."
158+
]
159+
},
160+
{
161+
"cell_type": "code",
162+
"execution_count": null,
163+
"metadata": {},
164+
"outputs": [],
165+
"source": [
166+
"from executorch.backends.arm.ethosu_partitioner import EthosUPartitioner\n",
167+
"from executorch.exir import (\n",
168+
" EdgeCompileConfig,\n",
169+
" ExecutorchBackendConfig,\n",
170+
" to_edge_transform_and_lower,\n",
171+
")\n",
172+
"from executorch.extension.export_util.utils import save_pte_program\n",
173+
"import platform \n",
174+
"\n",
175+
"# Create partitioner from compile spec \n",
176+
"partitioner = EthosUPartitioner(compile_spec)\n",
177+
"\n",
178+
"# Lower the exported program to the Ethos-U backend\n",
179+
"edge_program_manager = to_edge_transform_and_lower(\n",
180+
" quantized_exported_program,\n",
181+
" partitioner=[partitioner],\n",
182+
" compile_config=EdgeCompileConfig(\n",
183+
" _check_ir_validity=False,\n",
184+
" ),\n",
185+
" )\n",
186+
"\n",
187+
"# Load quantization ops library\n",
188+
"os_aot_lib_names = {\"Darwin\" : \"libquantized_ops_aot_lib.dylib\", \n",
189+
" \"Linux\" : \"libquantized_ops_aot_lib.so\", \n",
190+
" \"Windows\": \"libquantized_ops_aot_lib.dll\"}\n",
191+
"aot_lib_name = os_aot_lib_names[platform.system()]\n",
192+
"\n",
193+
"libquantized_ops_aot_lib_path = os.path.join(et_dir, \"cmake-out-aot-lib\", \"kernels\", \"quantized\", aot_lib_name)\n",
194+
"torch.ops.load_library(libquantized_ops_aot_lib_path)\n",
195+
"\n",
196+
"# Convert edge program to executorch\n",
197+
"executorch_program_manager = edge_program_manager.to_executorch(\n",
198+
" config=ExecutorchBackendConfig(extract_delegate_segments=False)\n",
199+
" )\n",
200+
"\n",
201+
"executorch_program_manager.exported_program().module().print_readable()\n",
202+
"\n",
203+
"# Save pte file\n",
204+
"pte_base_name = \"simple_example\"\n",
205+
"pte_name = pte_base_name + \".pte\"\n",
206+
"pte_path = os.path.join(cwd_dir, pte_name)\n",
207+
"save_pte_program(executorch_program_manager, pte_name)\n",
208+
"assert os.path.exists(pte_path), \"Build failed; no .pte-file found\""
209+
]
210+
},
211+
{
212+
"cell_type": "markdown",
213+
"metadata": {},
214+
"source": [
215+
"## Build executor runtime\n",
216+
"\n",
217+
"After the AOT compilation flow is done, the runtime can be cross compiled and linked to the produced .pte-file using the Arm cross-compilation toolchain. This is done in three steps:\n",
218+
"1. Build the executorch library and EthosUDelegate.\n",
219+
"2. Build any external kernels required. In this example this is not needed as the graph is fully delegated, but it's included for completeness.\n",
220+
"3. Build and link the `arm_executor_runner`."
221+
]
222+
},
223+
{
224+
"cell_type": "code",
225+
"execution_count": null,
226+
"metadata": {},
227+
"outputs": [],
228+
"source": [
229+
"# Build executorch \n",
230+
"subprocess.run(os.path.join(script_dir, \"build_executorch.sh\"), shell=True, cwd=et_dir)\n",
231+
"\n",
232+
"# Build portable kernels\n",
233+
"subprocess.run(os.path.join(script_dir, \"build_portable_kernels.sh\"), shell=True, cwd=et_dir)\n",
234+
"\n",
235+
"# Build executorch runner\n",
236+
"args = f\"--pte={pte_path} --target={target}\"\n",
237+
"subprocess.run(os.path.join(script_dir, \"build_executorch_runner.sh\") + \" \" + args, shell=True, cwd=et_dir)\n",
238+
"\n",
239+
"elf_path = os.path.join(cwd_dir, pte_base_name, \"cmake-out\", \"arm_executor_runner\")\n",
240+
"assert os.path.exists(elf_path), \"Build failed; no .elf-file found\""
241+
]
242+
},
243+
{
244+
"cell_type": "markdown",
245+
"metadata": {},
246+
"source": [
247+
"# Run on simulated model\n",
248+
"\n",
249+
"We can finally use the `backends/arm/scripts/run_fvp.sh` utility script to run the .elf-file on simulated Arm hardware. This script runs the model with an input of ones, so the expected result of the addition should be close to 2."
250+
]
251+
},
252+
{
253+
"cell_type": "code",
254+
"execution_count": null,
255+
"metadata": {},
256+
"outputs": [],
257+
"source": [
258+
"args = f\"--elf={elf_path} --target={target}\"\n",
259+
"subprocess.run(os.path.join(script_dir, \"run_fvp.sh\") + \" \" + args, shell=True, cwd=et_dir)"
260+
]
261+
}
262+
],
263+
"metadata": {
264+
"kernelspec": {
265+
"display_name": "venv",
266+
"language": "python",
267+
"name": "python3"
268+
},
269+
"language_info": {
270+
"codemirror_mode": {
271+
"name": "ipython",
272+
"version": 3
273+
},
274+
"file_extension": ".py",
275+
"mimetype": "text/x-python",
276+
"name": "python",
277+
"nbconvert_exporter": "python",
278+
"pygments_lexer": "ipython3",
279+
"version": "3.10.15"
280+
}
281+
},
282+
"nbformat": 4,
283+
"nbformat_minor": 4
284+
}

‎exir/backend/canonical_partitioners/TARGETS‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ runtime.python_library(
77
srcs = [
88
"duplicate_dequant_node_pass.py",
99
"pattern_op_partitioner.py",
10+
"all_node_partitioner.py",
1011
],
1112
visibility = [
1213
"//executorch/...",
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
from typing import Dict, List
8+
9+
import torch
10+
from executorch.exir.backend.backend_details import ExportedProgram
11+
from executorch.exir.backend.compile_spec_schema import CompileSpec
12+
from executorch.exir.backend.partitioner import (
13+
DelegationSpec,
14+
Partitioner,
15+
PartitionResult,
16+
)
17+
from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param
18+
19+
20+
def is_non_tensor_placeholder(node: torch.fx.Node, ep: ExportedProgram) -> bool:
    """
    Return True when ``node`` is a placeholder that does not carry tensor data.

    A placeholder is considered tensor-backed when the exported program maps it
    to a parameter, a buffer, or a lifted tensor constant; every other
    placeholder (i.e. a genuine user input) is "non-tensor" for partitioning
    purposes.
    """
    if node.op != "placeholder":
        return False
    is_tensor_backed = (
        is_param(ep, node)
        or is_buffer(ep, node)
        or is_lifted_tensor_constant(ep, node)
    )
    return not is_tensor_backed
27+
28+
29+
class AllNodePartitioner(Partitioner):
    """Partitioner that tags every node in the graph for a single backend."""

    def __init__(
        self,
        backend_id: str,
        compile_specs: List[CompileSpec],
    ):
        """
        Partitioner that lowers every single node in the graph module unconditionally
        to the specified backend_id
        """
        super().__init__()
        self.delegation_spec = DelegationSpec(backend_id, compile_specs)

    def partition(self, exported_program: ExportedProgram) -> PartitionResult:
        # Tag every node except non-tensor placeholders (user inputs) and the
        # output node, so the entire graph is delegated to the backend.
        partition_tags: Dict[str, DelegationSpec] = {}
        tag = self.delegation_spec.backend_id
        for node in exported_program.graph_module.graph.nodes:
            should_skip = (
                is_non_tensor_placeholder(node, exported_program)
                or node.op == "output"
            )
            if should_skip:
                continue
            node.meta["delegation_tag"] = tag
            partition_tags[tag] = self.delegation_spec

        return PartitionResult(
            tagged_exported_program=exported_program,
            partition_tags=partition_tags,
        )

‎exir/backend/test/test_backends.py‎

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,11 @@
1010

1111
import executorch.exir as exir
1212
import torch
13+
from executorch.exir import to_edge
1314
from executorch.exir.backend.backend_api import LoweredBackendModule, to_backend
15+
from executorch.exir.backend.canonical_partitioners.all_node_partitioner import (
16+
AllNodePartitioner,
17+
)
1418
from executorch.exir.backend.compile_spec_schema import CompileSpec
1519
from executorch.exir.backend.partitioner import (
1620
DelegationSpec,
@@ -1266,3 +1270,178 @@ def forward(self, x: List[torch.Tensor]):
12661270

12671271
gm = exir.capture(ComposedM(), inputs, exir.CaptureConfig()).to_edge()
12681272
gm(*inputs)
1273+
1274+
def test_to_backend_delegation_spec(self):
    # End-to-end check of the EdgeProgramManager.to_backend(partitioner) flow:
    # lower a single-op (sin) module to the demo backend with
    # AllNodePartitioner, then serialize, load and execute the result.

    class SinModule(torch.nn.Module):
        def __init__(self):
            super().__init__()

        def forward(self, x):
            return [torch.sin(x)]

    sin_module = SinModule()
    model_inputs = (torch.ones(1),)
    # The demo backend takes a "max_value" compile spec; here it is the
    # input's leading dimension (1).
    max_value = model_inputs[0].shape[0]

    partitioner = AllNodePartitioner(
        "BackendWithCompilerDemo", [CompileSpec("max_value", bytes([max_value]))]
    )

    edgeir_m = to_edge(torch.export.export(sin_module, model_inputs))
    edgeir_m = edgeir_m.to_backend(partitioner)
    exec_prog = edgeir_m.to_executorch()
    graph_module = exec_prog.exported_program().graph_module
    # Check that there is not an aten.sin node.
    self.assertTrue(
        exir_ops.edge.aten.sin
        not in {node.target for node in graph_module.graph.nodes}
    )

    # Check that there exists a call_delegate, representing the call to the
    # delegated function
    FileCheck().check("torch.ops.higher_order.executorch_call_delegate").run(
        graph_module.code
    )
    lowered_submodules = get_lowered_submodules(graph_module)
    self.assertEqual(len(lowered_submodules), 1)

    for node in graph_module.graph.nodes:
        if node.op == "call_function" and node.target == executorch_call_delegate:
            # Check that first arg is lowered_module_{unique_id}
            self.assertEqual(node.args[0].target, "lowered_module_0")

    program = exec_prog.executorch_program

    # Check the program can be printed
    print_program(program)

    # Check the backend delegate
    self.check_backend_delegate(
        program=program,
        delegate=program.execution_plan[0].delegates[0],
        expected_id=BackendWithCompilerDemo.__name__,
        expected_processed=b"1version:0#op:demo::aten.sin.default, numel:1, dtype:torch.float32<debug_handle>2#",
    )

    # Check the delegate instruction
    self.assertTrue(
        isinstance(
            program.execution_plan[0].chains[0].instructions[0].instr_args,
            DelegateCall,
        )
    )
    buff = exec_prog.buffer

    executorch_module = _load_for_executorch_from_buffer(buff)
    model_inputs = torch.ones(1)
    model_outputs = executorch_module.forward([model_inputs])
    # Input must not be mutated by execution.
    self.assertEqual(
        model_inputs,
        torch.ones(1),
    )
    # The demo backend computes sin via an approximation, so compare against
    # its expected approximate value (~0.8333 for input 1.0), not torch.sin.
    expected_output = 0.8333 * torch.ones(1)

    self.assertTrue(
        torch.allclose(model_outputs[0], expected_output, atol=1e-03, rtol=1e-03)
    )
1347+
1348+
def test_to_backend_multimethod_delegation_spec(self):
    # Multi-method variant: export two modules as methods "sin" and
    # "add_mul", lower each with its own AllNodePartitioner, then run both
    # methods from the serialized program.

    class SinModule(torch.nn.Module):
        def __init__(self):
            super().__init__()

        def forward(self, x):
            return torch.sin(x)

        def inputs(self):
            # Example inputs for export and for the runtime comparison below.
            return (torch.ones(1),)

    class AddMulModule(torch.nn.Module):
        def __init__(self):
            super().__init__()

        def forward(self, a, x, b):
            y = torch.mm(a, x)
            z = torch.add(y, b)
            return z

        def inputs(self):
            return (torch.ones(2, 2), 2 * torch.ones(2, 2), 3 * torch.ones(2, 2))

    sin_module = SinModule()
    max_value_sin = sin_module.inputs()[0].shape[0]
    sin_partitioner = AllNodePartitioner(
        "BackendWithCompilerDemo",
        [CompileSpec("max_value", bytes([max_value_sin]))],
    )

    add_mul_module = AddMulModule()
    max_value_add_mul = add_mul_module.inputs()[0].shape[0]
    add_mul_partitioner = AllNodePartitioner(
        "BackendWithCompilerDemo",
        [CompileSpec("max_value", bytes([max_value_add_mul]))],
    )

    # to_edge / to_backend accept dicts keyed by method name.
    edgeir_m = to_edge(
        {
            "sin": torch.export.export(sin_module, sin_module.inputs()),
            "add_mul": torch.export.export(add_mul_module, add_mul_module.inputs()),
        }
    )
    edgeir_m = edgeir_m.to_backend(
        {
            "sin": sin_partitioner,
            "add_mul": add_mul_partitioner,
        }
    )
    exec_prog = edgeir_m.to_executorch()

    for method_name in ["sin", "add_mul"]:
        graph_module = exec_prog.exported_program(method_name).graph_module
        # Check delegated nodes are gone
        self.assertTrue(
            exir_ops.edge.aten.sin
            not in {node.target for node in graph_module.graph.nodes}
        )
        self.assertTrue(
            exir_ops.edge.aten.add
            not in {node.target for node in graph_module.graph.nodes}
        )
        self.assertTrue(
            exir_ops.edge.aten.mm
            not in {node.target for node in graph_module.graph.nodes}
        )
        # Check that there exists a call_delegate, representing the call to the
        # delegated function
        FileCheck().check("torch.ops.higher_order.executorch_call_delegate").run(
            graph_module.code
        )
        lowered_submodules = get_lowered_submodules(graph_module)
        self.assertEqual(len(lowered_submodules), 1)

    program = exec_prog.executorch_program

    # Check the program can be printed
    print_program(program)

    buff = exec_prog.buffer

    executorch_module = _load_for_executorch_from_buffer(buff)

    # Run each method at runtime and compare against a reference.
    for method_name, module in {
        "sin": sin_module,
        "add_mul": add_mul_module,
    }.items():
        inputs_flattened, _ = tree_flatten(module.inputs())
        model_outputs = executorch_module.run_method(
            method_name, tuple(inputs_flattened)
        )

        if method_name == "sin":
            # backend with compiler demo does a taylor approximation of sin
            ref_output = 0.8333 * torch.ones(1)
        else:
            ref_output = module(*module.inputs())
        self.assertTrue(
            torch.allclose(model_outputs[0], ref_output, atol=1e-03, rtol=1e-03)
        )

‎exir/backend/test/test_backends_lifted.py‎

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@
1111
import torch
1212
from executorch.exir import to_edge
1313
from executorch.exir.backend.backend_api import LoweredBackendModule, to_backend
14+
from executorch.exir.backend.canonical_partitioners.all_node_partitioner import (
15+
AllNodePartitioner,
16+
)
1417
from executorch.exir.backend.compile_spec_schema import CompileSpec
1518
from executorch.exir.backend.partitioner import (
1619
DelegationSpec,
@@ -138,6 +141,18 @@ def forward(self, x):
138141

139142
self.assertTrue(torch.allclose(new_res, expected_res))
140143

144+
# Test same flow but through edge_program_manager
145+
edgeir_m = to_edge(export(sin_module, model_inputs, strict=True))
146+
loweredir_m = edgeir_m.to_backend(
147+
AllNodePartitioner(BackendWithCompilerDemo.__name__, [])
148+
)
149+
lowered_sin_module = get_lowered_submodules(
150+
loweredir_m.exported_program().graph_module
151+
)[0][1]
152+
153+
new_res = lowered_sin_module(*model_inputs)[0]
154+
155+
self.assertTrue(torch.allclose(new_res, expected_res))
141156
# TODO(tkaruturi): emitting single LoweredBackendModule
142157
# program = to_edge(export(graph_module)).to_exectorch()._emitter_output.program
143158

‎exir/backend/test/test_compatibility.py‎

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
from executorch.exir import to_edge
1111
from executorch.exir._serialize import _serialize_pte_binary
1212
from executorch.exir.backend.backend_api import to_backend
13+
from executorch.exir.backend.canonical_partitioners.all_node_partitioner import (
14+
AllNodePartitioner,
15+
)
1316
from executorch.exir.backend.compile_spec_schema import CompileSpec
1417
from executorch.exir.backend.test.backend_with_compiler_demo import (
1518
BackendWithCompilerDemo,
@@ -65,3 +68,49 @@ def forward(self, x):
6568
"loading method forward failed with error 0x30",
6669
):
6770
executorch_module = _load_for_executorch_from_buffer(buff)
71+
72+
def test_compatibility_in_runtime_edge_program_manager(self):
    # Verify the runtime rejects a delegate blob whose version does not match:
    # lower via AllNodePartitioner, corrupt the serialized delegate's version
    # field, and expect load failure with error 0x30 (delegate incompatible).

    class SinModule(torch.nn.Module):
        def __init__(self):
            super().__init__()

        def forward(self, x):
            return torch.sin(x)

    sin_module = SinModule()
    model_inputs = (torch.ones(1),)
    edgeir_m = to_edge(export(sin_module, model_inputs, strict=True))
    max_value = model_inputs[0].shape[0]
    compile_specs = [CompileSpec("max_value", bytes([max_value]))]
    lowered_edge_irm = edgeir_m.to_backend(
        AllNodePartitioner("BackendWithCompilerDemo", compile_specs)
    )
    exec_prog = lowered_edge_irm.to_executorch()

    buff = exec_prog.buffer

    # The demo backend works well
    executorch_module = _load_for_executorch_from_buffer(buff)
    model_inputs = torch.ones(1)
    _ = executorch_module.forward([model_inputs])

    prog = exec_prog.executorch_program
    # Rewrite the delegate version number from 0 to 1.
    prog.backend_delegate_data[0].data = bytes(
        "1version:1#op:demo::aten.sin.default, numel:1, dtype:torch.float32<debug_handle>1#",
        encoding="utf8",
    )

    # Generate the .pte file with the wrong version.
    buff = bytes(
        _serialize_pte_binary(
            program=prog,
        )
    )

    # Throw runtime error with error code 0x30, meaning delegate is incompatible.
    with self.assertRaisesRegex(
        RuntimeError,
        "loading method forward failed with error 0x30",
    ):
        executorch_module = _load_for_executorch_from_buffer(buff)

‎exir/program/TARGETS‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ python_library(
3131
"//executorch/exir/_serialize:lib",
3232
"//executorch/exir/backend:backend_api",
3333
"//executorch/exir/backend:partitioner",
34+
"//executorch/exir/backend/canonical_partitioners:canonical_partitioner_lib",
3435
"//executorch/exir/capture:config",
3536
"//executorch/exir/emit:emit",
3637
"//executorch/exir/emit:lib",

‎extension/llm/custom_ops/op_sdpa.cpp‎

Lines changed: 215 additions & 715 deletions
Large diffs are not rendered by default.

‎extension/llm/custom_ops/op_sdpa_impl.h‎

Lines changed: 772 additions & 0 deletions
Large diffs are not rendered by default.

‎extension/llm/custom_ops/targets.bzl‎

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ def define_common_targets():
3030
"op_sdpa.h",
3131
"op_update_cache.h",
3232
],
33+
headers = [
34+
"op_sdpa_impl.h",
35+
],
3336
preprocessor_flags = get_vec_preprocessor_flags(),
3437
exported_deps = [
3538
"//executorch/runtime/kernel:kernel_includes",

‎extension/parallel/targets.bzl‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,6 @@ def define_common_targets():
1717
"@EXECUTORCH_CLIENTS",
1818
],
1919
deps = [
20-
"//executorch/runtime/kernel:thread_parallel_interface",
20+
"//executorch/extension/threadpool:threadpool",
2121
],
2222
)

‎extension/threadpool/targets.bzl‎

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def define_common_targets():
2020
] + (["fb/threadpool_use_n_threads.h"] if not runtime.is_oss else [])
2121

2222
runtime.cxx_library(
23-
name = "threadpool",
23+
name = "threadpool_lib",
2424
srcs = _THREADPOOL_SRCS,
2525
deps = [
2626
"//executorch/runtime/core:core",
@@ -45,6 +45,38 @@ def define_common_targets():
4545
],
4646
)
4747

48+
runtime.cxx_library(
49+
name = "threadpool",
50+
# TODO: OSS doesn't have os:iphoneos. Sync buck2 prelude
51+
# update to add it and remove duplication.
52+
exported_deps = (select({
53+
# Major operating systems should be able to use threadpool.
54+
"ovr_config//os:linux": [":threadpool_lib"],
55+
"ovr_config//os:macos": [":threadpool_lib"],
56+
"ovr_config//os:windows": [":threadpool_lib"],
57+
"ovr_config//os:android": [":threadpool_lib"],
58+
"ovr_config//os:iphoneos": [":threadpool_lib"],
59+
# Machines without an operating system shouldn't.
60+
"ovr_config//os:none": ["//executorch/runtime/kernel:thread_parallel_interface"],
61+
# If we don't know what it is, disable threadpool out of caution.
62+
"DEFAULT": ["//executorch/runtime/kernel:thread_parallel_interface"],
63+
}) if not runtime.is_oss else select({
64+
# Major operating systems should be able to use threadpool.
65+
"ovr_config//os:linux": [":threadpool_lib"],
66+
"ovr_config//os:macos": [":threadpool_lib"],
67+
"ovr_config//os:windows": [":threadpool_lib"],
68+
"ovr_config//os:android": [":threadpool_lib"],
69+
# Machines without an operating system shouldn't.
70+
"ovr_config//os:none": ["//executorch/runtime/kernel:thread_parallel_interface"],
71+
# If we don't know what it is, disable threadpool out of caution.
72+
"DEFAULT": ["//executorch/runtime/kernel:thread_parallel_interface"],
73+
})),
74+
visibility = [
75+
"//executorch/...",
76+
"@EXECUTORCH_CLIENTS",
77+
],
78+
)
79+
4880
runtime.cxx_library(
4981
name = "cpuinfo_utils",
5082
srcs = [

‎install_requirements.py‎

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def python_is_compatible():
7171
#
7272
# NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt
7373
# by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/
74-
NIGHTLY_VERSION = "dev20250310"
74+
NIGHTLY_VERSION = "dev20250325"
7575

7676

7777
def install_requirements(use_pytorch_nightly):
@@ -80,7 +80,7 @@ def install_requirements(use_pytorch_nightly):
8080
# Setting use_pytorch_nightly to false to test the pinned PyTorch commit. Note
8181
# that we don't need to set any version number there because they have already
8282
# been installed on CI before this step, so pip won't reinstall them
83-
f"torch==2.7.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch",
83+
f"torch==2.8.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch",
8484
(
8585
f"torchvision==0.22.0.{NIGHTLY_VERSION}"
8686
if use_pytorch_nightly

‎kernels/optimized/cpu/op_elu.cpp‎

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <ATen/native/cpu/Elu.h>
10+
11+
#include <executorch/kernels/portable/cpu/scalar_utils.h>
12+
#include <executorch/runtime/kernel/kernel_includes.h>
13+
#include <executorch/runtime/kernel/thread_parallel_interface.h>
14+
#include <executorch/runtime/platform/assert.h>
15+
16+
namespace torch::executor::native {
17+
18+
namespace {
// Elementwise ELU applied over the flattened tensor, parallelized across
// element ranges. The per-element math is delegated to ATen's scalar and
// vectorized ELU helpers, so results match ATen's CPU implementation.
template <typename CTYPE>
void elu(
    KernelRuntimeContext& context,
    const Tensor& input,
    const Scalar& alpha,
    const Scalar& scale,
    const Scalar& input_scale,
    Tensor& out) {
  const CTYPE* in_data = input.const_data_ptr<CTYPE>();
  CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
  // For reduced-precision float element types (half/bfloat16) the scalar
  // parameters are computed in float; otherwise in CTYPE itself.
  using MathT =
      std::conditional_t<c10::is_reduced_floating_point_v<CTYPE>, float, CTYPE>;
  MathT math_alpha = 0;
  MathT math_scale = 0;
  MathT math_input_scale = 0;
  // Extract alpha/scale/input_scale from their Scalar wrappers.
  ET_EXTRACT_SCALAR(alpha, math_alpha);
  ET_EXTRACT_SCALAR(scale, math_scale);
  ET_EXTRACT_SCALAR(input_scale, math_input_scale);
  // One functor for single elements, one for Vectorized<CTYPE> lanes.
  const auto scalar_func =
      at::native::get_scalar_elu_elementwise_func<CTYPE, MathT>(
          math_alpha, math_scale, math_input_scale);
  const auto vec_func = at::native::get_vectorized_elu_elementwise_func<CTYPE>(
      math_alpha, math_scale, math_input_scale);

  ::executorch::extension::parallel_for(
      0,
      out.numel(),
      ::executorch::extension::internal::GRAIN_SIZE,
      [&](const auto begin, const auto end) {
        using Vec = at::vec::Vectorized<CTYPE>;
        // Round `begin` up and `end` down to multiples of Vec::size() so the
        // main loop always does full-width vector loads/stores; the ragged
        // edges of the [begin, end) range are handled scalar-wise.
        const auto vectorized_begin =
            begin + (Vec::size() - begin % Vec::size()) % Vec::size();
        const auto vectorized_end = end - (end % Vec::size());
        // Scalar prologue.
        for (const auto idx : c10::irange(begin, vectorized_begin)) {
          out_data[idx] = scalar_func(in_data[idx]);
        }

        // Main vectorized loop.
        for (auto idx = vectorized_begin; idx < vectorized_end;
             idx += Vec::size()) {
          auto result_vec = vec_func(Vec::loadu(&in_data[idx]));
          result_vec.store(&out_data[idx]);
        }

        // Scalar epilogue.
        for (const auto idx : c10::irange(vectorized_end, end)) {
          out_data[idx] = scalar_func(in_data[idx]);
        }
      });
}
} // namespace
71+
72+
// elu.out entry point: validates arguments, resizes `out` to `in`'s shape,
// and dispatches to the parallel/vectorized elu<CTYPE>() kernel above based
// on the input's (floating-point) dtype. Returns `out`.
//
// Fix: the original body checked tensors_have_same_dtype(in, out) twice
// (once before resize, once after the floating-type check); the redundant
// second check is removed.
Tensor& opt_elu_out(
    KernelRuntimeContext& ctx,
    const Tensor& in,
    const Scalar& alpha,
    const Scalar& scale,
    const Scalar& input_scale,
    Tensor& out) {
  // Input and output must share a dtype, and `out` must be resizable to
  // `in`'s shape.
  ET_KERNEL_CHECK(ctx, tensors_have_same_dtype(in, out), InvalidArgument, out);
  ET_KERNEL_CHECK(
      ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out);

  ET_KERNEL_CHECK(
      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);

  // ELU is only defined for floating-point types (incl. half/bfloat16,
  // covered by the FLOATHBF16 dispatch below).
  ET_KERNEL_CHECK(ctx, tensor_is_floating_type(in), InvalidArgument, out);

  ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, "elu.out", CTYPE, [&]() {
    elu<CTYPE>(ctx, in, alpha, scale, input_scale, out);
  });
  return out;
}
95+
96+
} // namespace torch::executor::native

‎kernels/optimized/cpu/targets.bzl‎

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,14 @@ _OPTIMIZED_ATEN_OPS = (
2525
"//executorch/kernels/portable/cpu/util:broadcast_util",
2626
],
2727
),
28+
op_target(
29+
name = "op_elu",
30+
deps = [
31+
"//executorch/extension/threadpool:threadpool",
32+
"//executorch/kernels/portable/cpu:scalar_utils",
33+
"//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch",
34+
],
35+
),
2836
op_target(name = "op_exp"),
2937
op_target(
3038
name = "op_fft_r2c",
@@ -99,8 +107,8 @@ _OPTIMIZED_ATEN_OPS = (
99107
op_target(
100108
name = "op_where",
101109
deps = [
110+
"//executorch/extension/threadpool:threadpool",
102111
"//executorch/kernels/portable/cpu/util:elementwise_util",
103-
"//executorch/runtime/kernel:thread_parallel_interface",
104112
],
105113
),
106114
)

‎kernels/optimized/lib_defs.bzl‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,9 +232,9 @@ def define_libs(is_fbcode=False):
232232
"DEFAULT": [],
233233
}) + LIBBLAS_DEPS,
234234
exported_deps = [
235+
"//executorch/extension/threadpool:threadpool",
235236
"//executorch/kernels/optimized:libutils",
236237
"//executorch/runtime/core/exec_aten:lib",
237-
"//executorch/runtime/kernel:thread_parallel_interface",
238238
],
239239
**get_apple_framework_deps_kwargs(is_fbcode),
240240
)

‎kernels/optimized/optimized.yaml‎

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@
3737
- arg_meta: null
3838
kernel_name: torch::executor::opt_div_scalar_out
3939

40+
- op: elu.out
41+
kernels:
42+
- arg_meta: null
43+
kernel_name: torch::executor::opt_elu_out
44+
4045
- op: exp.out
4146
kernels:
4247
- arg_meta: null

‎kernels/portable/CMakeLists.txt‎

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,7 @@ gen_operators_lib(
6666
# Portable kernels support optional parallelization (and, in the
6767
# future, perhaps other performance features). If support is present,
6868
# produce an optimized version.
69-
set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL)
70-
71-
if(BUILD_OPTIMIZED_PORTABLE_KERNELS)
69+
if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
7270
add_library(optimized_portable_kernels ${_portable_kernels__srcs})
7371
target_link_libraries(optimized_portable_kernels PRIVATE executorch)
7472
target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool)

‎kernels/portable/cpu/util/targets.bzl‎

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ def define_common_targets():
1212
runtime.cxx_library(
1313
name = "all_deps",
1414
deps = [
15+
"//executorch/extension/threadpool:threadpool",
1516
"//executorch/kernels/portable/cpu/util:functional_util",
1617
"//executorch/kernels/portable/cpu/util:broadcast_util",
1718
"//executorch/kernels/portable/cpu/util:kernel_ops_util",
@@ -32,7 +33,6 @@ def define_common_targets():
3233
"//executorch/kernels/portable/cpu/util:slice_util",
3334
"//executorch/kernels/portable/cpu/util:elementwise_util",
3435
"//executorch/kernels/portable/cpu/util:upsample_util",
35-
"//executorch/runtime/kernel:thread_parallel_interface",
3636
],
3737
visibility = ["//executorch/...", "@EXECUTORCH_CLIENTS"],
3838
)
@@ -111,7 +111,7 @@ def define_common_targets():
111111
":broadcast_util",
112112
":dtype_util",
113113
"//executorch/runtime/kernel:kernel_runtime_context",
114-
"//executorch/runtime/kernel:thread_parallel_interface",
114+
"//executorch/extension/threadpool:threadpool",
115115
],
116116
deps = [
117117
"//executorch/kernels/portable/cpu:scalar_utils",
@@ -245,7 +245,7 @@ def define_common_targets():
245245
srcs = [],
246246
exported_headers = ["functional_util.h"],
247247
exported_deps = [
248-
"//executorch/runtime/kernel:thread_parallel_interface",
248+
"//executorch/extension/threadpool:threadpool",
249249
],
250250
deps = [
251251
"//executorch/runtime/kernel:kernel_includes",
@@ -319,7 +319,7 @@ def define_common_targets():
319319
"//executorch/runtime/core/exec_aten/util:tensor_util{}".format(suffix),
320320
],
321321
exported_deps = [
322-
"//executorch/runtime/kernel:thread_parallel_interface",
322+
"//executorch/extension/threadpool:threadpool",
323323
],
324324
exported_preprocessor_flags = ["-DUSE_ATEN_LIB"] if aten_mode else [],
325325
visibility = [

‎kernels/test/CMakeLists.txt‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,7 @@ set(_optimized_kernels_test_sources
274274
"op_add_test.cpp"
275275
"op_bmm_test.cpp"
276276
"op_div_test.cpp"
277+
"op_elu_test.cpp"
277278
"op_exp_test.cpp"
278279
"op_fft_r2c_test.cpp"
279280
"op_gelu_test.cpp"

‎kernels/test/targets.bzl‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ def define_common_targets():
215215
_common_op_test("op_detach_copy_test", ["aten", "portable"])
216216
_common_op_test("op_diagonal_copy_test", ["aten", "portable"])
217217
_common_op_test("op_div_test", ["aten", "portable", "optimized"])
218-
_common_op_test("op_elu_test", ["aten", "portable"])
218+
_common_op_test("op_elu_test", ["aten", "portable", "optimized"])
219219
_common_op_test("op_embedding_test", ["aten", "portable"])
220220
_common_op_test("op_empty_test", ["aten", "portable"])
221221
_common_op_test("op_eq_test", ["aten", "portable"])

‎runtime/core/event_tracer.h‎

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,14 +101,14 @@ class EventTracerFilterBase {
101101
* - An error code if an error occurs during filtering.
102102
*/
103103
virtual Result<bool> filter(
104-
char* name,
105-
DelegateDebugIntId delegate_debug_index);
104+
const char* name,
105+
DelegateDebugIntId delegate_debug_index) = 0;
106106

107107
/**
108108
* Virtual destructor for the EventTracerFilterBase class.
109109
* Ensures proper cleanup of derived class objects.
110110
*/
111-
virtual ~EventTracerFilterBase();
111+
virtual ~EventTracerFilterBase() = default;
112112
};
113113

114114
/**

‎runtime/kernel/targets.bzl‎

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,9 @@ def define_common_targets():
5959
"//executorch/runtime/core/portable_type/c10/c10:c10",
6060
"//executorch/runtime/platform:platform",
6161
],
62+
# Don't depend on this target, depend on //executorch/extension/threadpool:threadpool.
6263
visibility = [
63-
"//executorch/...",
64-
"@EXECUTORCH_CLIENTS",
64+
"//executorch/extension/threadpool/...",
6565
],
6666
)
6767

‎shim_et/xplat/executorch/build/runtime_wrapper.bzl‎

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,11 @@ def _patch_build_mode_flags(kwargs):
112112
# @oss-disable: "ovr_config//build_mode:code-coverage": ["-D__ET_BUILD_MODE_COV=1"],
113113
})
114114

115+
kwargs["compiler_flags"] = kwargs["compiler_flags"] + select({
116+
"DEFAULT": [],
117+
"ovr_config//os:macos": ["-fvisibility=default"],
118+
})
119+
115120
return kwargs
116121

117122
def _patch_test_compiler_flags(kwargs):

‎tools/cmake/executorch-config.cmake‎

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ endif()
149149
if(TARGET coremldelegate)
150150
set_target_properties(
151151
coremldelegate PROPERTIES INTERFACE_LINK_LIBRARIES
152-
"coreml_inmemoryfs;coreml_util"
152+
"coreml_inmemoryfs;coreml_util"
153153
)
154154
endif()
155155

@@ -167,4 +167,8 @@ if(TARGET optimized_native_cpu_ops_lib)
167167
endif()
168168
if(TARGET extension_threadpool)
169169
target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL)
170+
set_target_properties(
171+
extension_threadpool PROPERTIES INTERFACE_LINK_LIBRARIES
172+
"cpuinfo;pthreadpool"
173+
)
170174
endif()

0 commit comments

Comments
 (0)
Please sign in to comment.