
Commit 55aeae8

Merge branch 'main' of https://github.com/pytorch/executorch into change-1078286
2 parents b1248b6 + 4197fc1

File tree: 332 files changed, +10376 -9559 lines changed


.github/workflows/build-presets.yml

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        preset: [macos, ios, ios-simulator, pybind, llm]
+        preset: [macos, ios, ios-simulator, pybind, profiling, llm]
     with:
       job-name: build
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}

CMakeLists.txt

Lines changed: 24 additions & 0 deletions
@@ -278,6 +278,30 @@ if(EXECUTORCH_BUILD_PTHREADPOOL)
   )
 endif()
 
+if(EXECUTORCH_BUILD_KERNELS_TORCHAO)
+  set(TORCHAO_BUILD_ATEN_OPS OFF)
+  set(TORCHAO_BUILD_EXECUTORCH_OPS ON)
+  set(TORCHAO_BUILD_CPU_AARCH64 ON)
+  set(TORCHAO_ENABLE_ARM_NEON_DOT ON)
+
+  list(APPEND TORCHAO_INCLUDE_DIRS
+    ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
+    ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
+    ${EXECUTORCH_ROOT}/third-party/ao
+  )
+
+  set(EXECUTORCH_INCLUDE_DIRS ${TORCHAO_INCLUDE_DIRS})
+
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/ao/torchao/experimental)
+  executorch_target_link_options_shared_lib(torchao_ops_executorch)
+  list(APPEND _executorch_kernels torchao_ops_executorch)
+endif()
+
+if(EXECUTORCH_BUILD_TESTS)
+  set(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
+  include(CTest)
+endif()
+
 # TODO(dbort): Fix these warnings and remove this flag.
 set(_common_compile_options -Wno-deprecated-declarations -fPIC)

CMakePresets.json

Lines changed: 20 additions & 0 deletions
@@ -100,6 +100,26 @@
         "list": ["Darwin", "Linux", "Windows"]
       }
     },
+    {
+      "name": "profiling",
+      "displayName": "Build ExecuTorch with Profiling Enabled",
+      "inherits": [
+        "common"
+      ],
+      "cacheVariables": {
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/profiling.cmake",
+        "CMAKE_OSX_DEPLOYMENT_TARGET": "12.0"
+      },
+      "condition": {
+        "type": "inList",
+        "string": "${hostSystemName}",
+        "list": [
+          "Darwin",
+          "Linux",
+          "Windows"
+        ]
+      }
+    },
     {
       "name": "zephyr",
       "displayName": "Build ExecuTorch for Zephyr RTOS",

backends/apple/coreml/TARGETS

Lines changed: 2 additions & 0 deletions
@@ -17,6 +17,7 @@ runtime.python_library(
     name = "backend",
     srcs = glob([
         "compiler/*.py",
+        "logging.py",
     ]),
     visibility = [
         "@EXECUTORCH_CLIENTS",
@@ -33,6 +34,7 @@ runtime.python_library(
     name = "partitioner",
     srcs = glob([
         "partition/*.py",
+        "logging.py",
     ]),
     visibility = [
         "@EXECUTORCH_CLIENTS",

backends/apple/coreml/compiler/coreml_preprocess.py

Lines changed: 5 additions & 4 deletions
@@ -16,20 +16,20 @@
 
 import coremltools as ct
 import coremltools.optimize as cto
-
 from executorch.backends.apple.coreml import executorchcoreml
+from executorch.backends.apple.coreml.logging import get_coreml_log_level
 from executorch.exir.backend.backend_details import (
     BackendDetails,
     ExportedProgram,
     PreprocessResult,
 )
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.WARNING)
-
 from executorch.backends.apple.coreml.compiler.torch_ops import *  # noqa: F401, F403
 
+logger = logging.getLogger(__name__)
+logger.setLevel(get_coreml_log_level(default_level=logging.WARNING))
+
 
 class COMPILE_SPEC_KEYS(Enum):
     COMPUTE_UNITS = "compute_units"
@@ -409,6 +409,7 @@ def preprocess(
         edge_program: ExportedProgram,
         compile_specs: List[CompileSpec],
     ) -> PreprocessResult:
+        logger.info(f"Edge program: {edge_program}")
         model_type: CoreMLBackend.MODEL_TYPE = (
             CoreMLBackend.model_type_from_compile_specs(
                 compile_specs,

backends/apple/coreml/compiler/torch_ops.py

Lines changed: 25 additions & 2 deletions
@@ -9,13 +9,15 @@
 # the op to the coremltools library.
 
 import torch as _torch
-from coremltools import _logger as logger
+from coremltools import _logger
 from coremltools.converters.mil.frontend import _utils
 from coremltools.converters.mil.frontend.torch.ops import (
     _get_inputs,
+    _get_kwinputs,
     NUM_TO_NUMPY_DTYPE,
     NUM_TO_TORCH_DTYPE,
     split,
+    to,
     transpose,
     unbind,
 )
@@ -24,6 +26,7 @@
     register_torch_op,
 )
 from coremltools.converters.mil.mil import types
+from executorch.exir.dim_order_utils import get_memory_format
 
 
 # https://github.com/apple/coremltools/pull/2556
@@ -44,6 +47,26 @@ def split_copy(context, node):
     split(context, node)
 
 
+@register_torch_op(
+    torch_alias=[
+        "dim_order_ops::_to_dim_order_copy",
+        "dim_order_ops._to_dim_order_copy",
+    ],
+    override=False,
+)
+def _to_dim_order_copy(context, node):
+    dim_order = _get_kwinputs(context, node, "dim_order", default=[None])[0]
+    node.kwinputs.pop("dim_order")
+
+    # In CoreML, dim_order.val will be an ndarray, so we convert it to a list
+    dim_order = [int(d) for d in dim_order.val]
+    memory_format = get_memory_format(dim_order)
+    assert (
+        memory_format == _torch.contiguous_format
+    ), "Only contiguous memory format is supported in CoreML"
+    to(context, node)
+
+
 # https://github.com/apple/coremltools/pull/2558
 @register_torch_op(
     torch_alias=["torchao::dequantize_affine", "torchao.dequantize_affine"],
@@ -88,7 +111,7 @@ def dequantize_affine(context, node):
     out_np_dtype = None
     if len(inputs) > 7:
         out_np_dtype = NUM_TO_NUMPY_DTYPE[inputs[7].val]
-        logger.warning(
+        _logger.warning(
             f"Core ML ignores output_dtype {out_np_dtype} on torchao.dequantize_affine and instead uses the native precision."
         )

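Note: the new _to_dim_order_copy handler maps the incoming dim_order to a torch memory format via get_memory_format and asserts that only contiguous layouts reach Core ML. Below is a minimal sketch of that check in isolation; the example dim orders are assumptions for illustration, not part of the commit.

# Minimal sketch of the dim_order -> memory format check used by the handler.
import torch
from executorch.exir.dim_order_utils import get_memory_format

contiguous_dim_order = [0, 1, 2, 3]     # assumed to map to torch.contiguous_format
channels_last_dim_order = [0, 2, 3, 1]  # assumed to map to torch.channels_last

# The handler only accepts dim orders that correspond to contiguous tensors.
assert get_memory_format(contiguous_dim_order) == torch.contiguous_format
assert get_memory_format(channels_last_dim_order) != torch.contiguous_format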
backends/apple/coreml/logging.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+# Copyright © 2023 Apple Inc. All rights reserved.
+#
+# Please refer to the license found in the LICENSE file in the root directory of the source tree.
+
+import logging
+import os
+from typing import Optional
+
+
+def get_coreml_log_level(default_level: int) -> Optional[str]:
+    level_str = os.environ.get("ET_COREML_LOG_LEVEL", "").upper()
+    if level_str == "":
+        return default_level
+
+    level_map = {
+        "DEBUG": logging.DEBUG,
+        "INFO": logging.INFO,
+        "WARNING": logging.WARNING,
+        "ERROR": logging.ERROR,
+        "CRITICAL": logging.CRITICAL,
+    }
+    if level_str not in level_map:
+        raise ValueError(f"Invalid ET_COREML_LOG_LEVEL: {level_str}")
+    return level_map[level_str]

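The new get_coreml_log_level helper reads the ET_COREML_LOG_LEVEL environment variable and falls back to the caller-supplied default when it is unset. A minimal sketch of how the commit wires it into the Core ML backend loggers; the DEBUG value below is just an example.

import logging
import os

from executorch.backends.apple.coreml.logging import get_coreml_log_level

# Opt in to more verbose Core ML backend logging. Accepted values are
# DEBUG/INFO/WARNING/ERROR/CRITICAL; anything else raises ValueError.
os.environ["ET_COREML_LOG_LEVEL"] = "DEBUG"

logger = logging.getLogger(__name__)
logger.setLevel(get_coreml_log_level(default_level=logging.WARNING))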
backends/apple/coreml/partition/coreml_partitioner.py

Lines changed: 83 additions & 28 deletions
@@ -10,6 +10,8 @@
 import torch
 
 from executorch.backends.apple.coreml.compiler import CoreMLBackend
+
+from executorch.backends.apple.coreml.logging import get_coreml_log_level
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 
 from executorch.exir.backend.partitioner import (
@@ -18,12 +20,13 @@
     PartitionResult,
 )
 from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
+from executorch.exir.dialects._ops import ops as exir_ops
 from torch.export.exported_program import ExportedProgram
 from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
 from torch.fx.passes.operator_support import OperatorSupportBase
 
 logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
+logger.setLevel(get_coreml_log_level(default_level=logging.INFO))
 
 
 def _is_view_op(op: torch._ops.OpOverload) -> bool:
@@ -54,6 +57,80 @@ def log_once(self, msg: str) -> None:
             logger.info(msg)
         self._logged_msgs.add(msg)
 
+    def should_skip_op_for_delegation(self, node_target_name: str) -> bool:
+        skipped_ops = self.skip_ops_for_coreml_delegation or []
+        if node_target_name in skipped_ops:
+            assert (
+                not self.lower_full_graph
+            ), f"Cannot skip {node_target_name} because lower_full_graph is True. Please set skip_ops_for_coreml_delegation=None or lower_full_graph=False in the CoreMLPartitioner"
+            self.log_once(
+                "Skipping op for CoreML delegation because it is in skip_ops_for_coreml_delegation: "
+                + node_target_name
+            )
+            return True
+        return False
+
+    def should_override_support(self, node) -> bool:
+        # https://github.com/apple/coremltools/issues/2573
+        if (
+            node.target
+            in [
+                torch.ops.aten.sub.Tensor,
+                exir_ops.edge.aten.sub.Tensor,
+                torch.ops.aten.add.Tensor,
+                exir_ops.edge.aten.add.Tensor,
+            ]
+            and "alpha" in node.kwargs
+            and node.kwargs["alpha"] != 1
+        ):
+            self.log_once(
+                "torch.ops.aten.{sub, add}.Tensor with alpha != 1 is not supported by CoreML. Overriding support."
+            )
+            return True
+
+        # https://github.com/apple/coremltools/issues/2565
+        if node.target in [
+            torch.ops.aten.diagonal.default,
+            torch.ops.aten.diagonal_copy.default,
+            exir_ops.edge.aten.diagonal.default,
+            exir_ops.edge.aten.diagonal_copy.default,
+        ]:
+            self.log_once(
+                "torch.ops.aten.diagonal.default has a bug in CoreML. Overriding op support."
+            )
+            return True
+
+        # https://github.com/apple/coremltools/issues/2569
+        if node.target in [
+            torch.ops.aten.acosh.default,
+            exir_ops.edge.aten.acosh.default,
+            torch.ops.aten.asinh.default,
+            exir_ops.edge.aten.asinh.default,
+        ]:
+            self.log_once(
+                "torch.ops.aten.{acosh, asinh}.default is not supported by CoreML. Overriding op support."
+            )
+            return True
+
+        # TODO: enable this after bugs in ExecuTorch's partitioner are fixed
+        # # If lower_full_graph=False, do not partition nodes with symbolic args because it can result in symbolic args
+        # # in the placeholders due to partitioning, which CoreML does not support
+        # if not self.lower_full_graph and any(
+        #     isinstance(arg, torch.fx.Node)
+        #     and isinstance(
+        #         arg.meta.get("val", None),
+        #         (torch.SymInt, torch.SymBool, torch.SymFloat),
+        #     )
+        #     for arg in node.args
+        # ):
+        #     self.log_once(
+        #         "Skipping op for CoreML delegation because it contains symbolic args: "
+        #         + node_target_name
+        #     )
+        #     return True
+
+        return False
+
     def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
         # get_attr node can always be supported on any backend
         if node.op == "get_attr":
@@ -62,38 +139,17 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
         elif node.op == "call_function":
             # skip ops if specified by user
             node_target_name = getattr(node.target, "__name__", "").lower()
-            if node_target_name in (self.skip_ops_for_coreml_delegation or []):
-                self.log_once(
-                    "Skipping op for CoreML delegation because it is in skip_ops_for_coreml_delegation: "
-                    + node_target_name
-                )
-                assert (
-                    not self.lower_full_graph
-                ), "Cannot have skip_ops_for_coreml_delegation when lower_full_graph is True"
-                return False
 
-            # TODO: enable this after bugs in ExecuTorch's partitioner are fixed
-            # # If lower_full_graph=False, do not partition nodes with symbolic args because it can result in symbolic args
-            # # in the placeholders due to partitioning, which CoreML does not support
-            # if not self.lower_full_graph and any(
-            #     isinstance(arg, torch.fx.Node)
-            #     and isinstance(
-            #         arg.meta.get("val", None),
-            #         (torch.SymInt, torch.SymBool, torch.SymFloat),
-            #     )
-            #     for arg in node.args
-            # ):
-            #     self.log_once(
-            #         "Skipping op for CoreML delegation because it contains symbolic args: "
-            #         + node_target_name
-            #     )
-            #     assert not self.lower_full_graph
-            #     return False
+            if self.should_skip_op_for_delegation(node_target_name):
+                return False
 
             # query coremltools to see if node is supported
             is_supported = ct.converters.mil.frontend.torch.is_torch_fx_node_supported(
                 node
             )
+            if self.should_override_support(node):
+                is_supported = False
+
             if not is_supported:
                 if self.lower_full_graph:
                     raise NotImplementedError(
@@ -124,7 +180,6 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
 
 
 class CoreMLPartitioner(Partitioner):
-
     def __init__(
         self,
         *,

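The partitioner change folds the user skip-list and the coremltools support overrides into should_skip_op_for_delegation and should_override_support, and the skip-list check now asserts against lower_full_graph=True with a message naming the offending op. A minimal usage sketch, under the assumption that both settings are keyword-only constructor options of CoreMLPartitioner as the assertion message suggests; the op name below is hypothetical.

from executorch.backends.apple.coreml.partition.coreml_partitioner import (
    CoreMLPartitioner,
)

# Skipping specific ops is allowed when partial delegation is acceptable.
partitioner = CoreMLPartitioner(
    skip_ops_for_coreml_delegation=["aten.add.tensor"],  # hypothetical op name
    lower_full_graph=False,
)

# Combining a skip list with lower_full_graph=True now trips the assertion in
# should_skip_op_for_delegation as soon as a skipped op is encountered.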
backends/apple/coreml/runtime/delegate/multiarray.mm

Lines changed: 3 additions & 0 deletions
@@ -123,6 +123,9 @@ bool init_bnns_descriptor(BNNSNDArrayDescriptor& bnns_descriptor, const MultiArr
 }
 
 bool copy_using_bnns(const MultiArray& src, MultiArray& dst) {
+    if (src.layout().dataType() != dst.layout().dataType()) {
+        return false;
+    }
     if (dst.layout().num_bytes() < src.layout().num_bytes()) {
         return false;
     }

backends/arm/README.md

Lines changed: 2 additions & 2 deletions
@@ -181,8 +181,8 @@ The Arm EthosU Backend should be considered a prototype quality at this point, l
 ## Current flows
 
 The EthosUBackend has a two stage process,
-- Compile to TOSA to rationalise the graph into known hardware support profiles. Currently this is to v0.80 TOSA BI with specific concern to a subset which gives support on Ethos-U55 and Ethos-U85, the target of the initial prototype efforts. This calls into the TOSABackend.
-- Lower via the ethos-u-vela compilation flow which takes TOSA v0.80 as an input and produces a low level commandstream for the hardware which is then passed via the delegate to the ethos-u-core-driver for direct execution.
+- Compile to TOSA to rationalise the graph into known hardware support profiles. Currently this is to v1.0 TOSA INT with specific concern to a subset which gives support on Ethos-U55 and Ethos-U85, the target of the initial prototype efforts. This calls into the TOSABackend.
+- Lower via the ethos-u-vela compilation flow which takes TOSA v1.0 as an input and produces a low level commandstream for the hardware which is then passed via the delegate to the ethos-u-core-driver for direct execution.
 
 The EthosUPartitioner is currenly used to ensure the operations converted are Ethos-U compatible, but will be extended to offer spec-correct TOSA Base inference and TOSA Main Inference generation in future.
