
Commit c28053b

Authored on Apr 10, 2025
Merge branch 'main' into fix_logging
2 parents: 99fd467 + 4bd7798

File tree: 26 files changed (+205 / -74 lines)
 

‎.ci/scripts/build_android_instrumentation.sh

Lines changed: 3 additions & 3 deletions

@@ -12,10 +12,10 @@ if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
 fi
 which "${PYTHON_EXECUTABLE}"
 
-mkdir -p "${BUILD_AAR_DIR}"/executorch_android/src/androidTest/resources
-cp extension/module/test/resources/add.pte "${BUILD_AAR_DIR}"/executorch_android/src/androidTest/resources
+mkdir -p extension/android/executorch_android/src/androidTest/resources
+cp extension/module/test/resources/add.pte extension/android/executorch_android/src/androidTest/resources
 
-pushd "${BUILD_AAR_DIR}"
+pushd extension/android
 ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:testDebugUnitTest
 ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:assembleAndroidTest
 popd

‎.github/workflows/_android.yml

Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ jobs:
 
 mkdir -p ${ARTIFACTS_DIR_NAME}/library_test_dir
 bash .ci/scripts/build_android_instrumentation.sh
-cp ${BUILD_AAR_DIR}/executorch_android/build/outputs/apk/androidTest/debug/executorch_android-debug-androidTest.apk "${ARTIFACTS_DIR_NAME}/library_test_dir"
+cp extension/android/executorch_android/build/outputs/apk/androidTest/debug/executorch_android-debug-androidTest.apk "${ARTIFACTS_DIR_NAME}/library_test_dir"
 
 mkdir -p ${ARTIFACTS_DIR_NAME}/fp32-xnnpack-custom
 bash examples/models/llama/install_requirements.sh

‎CONTRIBUTING.md

Lines changed: 6 additions & 10 deletions

@@ -1,7 +1,6 @@
 Thank you for your interest in contributing to ExecuTorch! We want to make
 it easy to contribute to this project.
 
-
 
 ## Dev Install
 
@@ -91,7 +90,7 @@ executorch
 │ └── <a href="runtime/platform">platform</a> - Layer between architecture specific code and portable C++.
 ├── <a href="schema">schema</a> - ExecuTorch PTE file format flatbuffer schemas.
 ├── <a href="scripts">scripts</a> - Utility scripts for building libs, size management, dependency management, etc.
-├── <a href="shim">shim</a> - Compatibility layer between OSS and Internal builds.
+├── <a href="shim_et">shim_et</a> - Compatibility layer between OSS and Internal builds.
 ├── <a href="test">test</a> - Broad scoped end-to-end tests.
 ├── <a href="third-party">third-party</a> - Third-party dependencies.
 ├── <a href="tools">tools</a> - Tools for building ExecuTorch from source, for different built tools (CMake, Buck).
@@ -192,9 +191,6 @@ in the Github repo.
 
 ## Coding Style
 
-Goal: Encourage standards that make it easier to read, edit, maintain, and debug
-the ExecuTorch code.
-
 ### lintrunner
 
 We use [`lintrunner`](https://pypi.org/project/lintrunner/) to help make sure the
@@ -259,7 +255,7 @@ toolchains, and having access to relatively modern C++ features.
 
 #### C/C++ standard library usage
 
-**Restricted usage of the C++ standard library.**
+**Restricted usage of the C++ standard library**
 
 Rationale: ExecuTorch is intended to be portable to bare-metal systems that lack
 certain features, like dynamic memory, threading, and locking, required by parts
@@ -280,7 +276,7 @@ careful to also manually destroy objects initialized in this way.
 
 #### C++ language features
 
-**Exceptions: Do not use.**
+**Exceptions: Do not use**
 - Rationale: Exceptions are not widely supported on some classes of
   microcontrollers and DSPs, and they can significantly increase binary size.
 
@@ -289,12 +285,12 @@ must work with threading**
 - Rationale: The core runtime must work on systems that do not have threading
   support.
 
-**RTTI, dynamic_cast, and `<typeid>`: Do not use.**
+**RTTI, dynamic_cast, and `<typeid>`: Do not use**
 - Rationale: RTTI adds extra data to every virtual class. ExecuTorch doesn't
   have a strong need for `dynamic_cast` and friends, so it's better to reduce
   the binary size.
 
-**Templates and template metaprogramming: Be careful and avoid if possible.**
+**Templates and template metaprogramming: Be careful and avoid if possible**
 - Rationale: Most templating results in code generation, and is one of the most
   common sources of binary bloat. Some use of templates is fine (e.g. an
   `ArrayRef<T>`, or code that handles multiple `ScalarType` types), but for the
@@ -359,7 +355,7 @@ docs](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/
 for basics.
 
 1. Push your branch to your fork of `pytorch/executorch`. Most people do not
-   have permission to push a branch directoy to the upstream repo.
+   have permission to push a branch directory to the upstream repo.
 1. Create your PR
    - Use the `main` branch as the base.
    - Give the PR a clear and descriptive title. It will become the title of the

‎README.md

Lines changed: 2 additions & 2 deletions

@@ -49,9 +49,9 @@ Key value propositions of ExecuTorch are:
 ## Getting Started
 To get started you can:
 
-- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/main/index.html) on getting things running locally and deploy a model to a device
+- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/main/index.html) to get things running locally and deploy a model to a device
 - Use this [Colab Notebook](https://pytorch.org/executorch/stable/getting-started-setup.html#quick-setup-colab-jupyter-notebook-prototype) to start playing around right away
-- Jump straight into LLMs use cases by following specific instructions for [Llama](./examples/models/llama/README.md) and [Llava](./examples/models/llava/README.md)
+- Jump straight into LLM use cases by following specific instructions for [Llama](./examples/models/llama/README.md) and [Llava](./examples/models/llava/README.md)
 
 ## Feedback and Engagement
 

‎backends/qualcomm/_passes/layout_transform.py

Lines changed: 1 addition & 0 deletions

@@ -47,6 +47,7 @@ class LayoutTransform(ExportPass):
     layout_agnostic_ops = {
         exir_ops.edge.aten.abs.default,
         exir_ops.edge.aten.add.Tensor,
+        exir_ops.edge.aten.amax.default,
         exir_ops.edge.aten.bitwise_or.Tensor,
         exir_ops.edge.aten.bmm.default,
         exir_ops.edge.aten.bitwise_and.Tensor,

‎backends/qualcomm/builders/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -9,6 +9,7 @@
     op_abs,
     op_adaptive_avg_pool2d,
     op_add,
+    op_amax,
     op_and,
     op_arange,
     op_argmin,
@@ -95,6 +96,7 @@
     op_abs,
     op_adaptive_avg_pool2d,
     op_add,
+    op_amax,
     op_and,
     op_arange,
     op_argmin,

‎backends/qualcomm/builders/op_amax.py

Lines changed: 84 additions & 0 deletions

@@ -0,0 +1,84 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import cast, Dict, List
+
+import executorch.backends.qualcomm.python.PyQnnWrapperAdaptor as PyQnnWrapper
+
+import numpy as np
+
+import torch
+from executorch.backends.qualcomm.utils.constants import QCOM_AXIS_ORDER, QCOM_DATA
+
+from .node_visitor import NodeVisitor, register_node_visitor
+from .qnn_constants import OpAmax, QNN_OP_PACKAGE_NAME_QTI_AISW
+
+
+@register_node_visitor
+class AMax(NodeVisitor):
+    target = ["aten.amax.default"]
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        nodes_to_wrappers: Dict[torch.fx.Node, PyQnnWrapper.TensorWrapper],
+    ) -> PyQnnWrapper.PyQnnOpWrapper:
+        input_node = node.args[0]
+        input_tensor = self.get_tensor(input_node, node)
+        input_tensor_wrapper = self.define_tensor(
+            input_node,
+            node,
+            input_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        # mean dims and keep dims
+        mean_dims = cast(List[int], node.args[1])
+        mean_dims = [
+            mean_dim % len(input_node.meta["val"].shape) for mean_dim in mean_dims
+        ]
+        if QCOM_AXIS_ORDER in node.meta:
+            mean_dims = [
+                node.meta[QCOM_AXIS_ORDER].index(mean_dim) for mean_dim in mean_dims
+            ]
+        mean_dims_shape = [len(mean_dims)]
+
+        output_tensor = self.get_tensor(node, node)
+        output_tensor_wrapper = self.define_tensor(
+            node,
+            node,
+            output_tensor,
+            PyQnnWrapper.Qnn_TensorType_t.QNN_TENSOR_TYPE_NATIVE,
+            nodes_to_wrappers,
+        )
+
+        reduce_max_op = PyQnnWrapper.PyQnnOpWrapper(
+            node.name,
+            QNN_OP_PACKAGE_NAME_QTI_AISW,
+            OpAmax.op_name,
+        )
+        reduce_max_op.AddInputTensors([input_tensor_wrapper])
+        reduce_max_op.AddOutputTensors([output_tensor_wrapper])
+        reduce_max_op.AddTensorParam(
+            OpAmax.param_axes,
+            PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_UINT_32,
+            len(mean_dims_shape),
+            mean_dims_shape,
+            np.array(mean_dims, dtype=np.uint32),
+            True,
+        )
+        if len(node.args) > 2:
+            keep_dims = cast(bool, node.args[2])
+            reduce_max_op.AddScalarParam(
+                OpAmax.param_keep_dims,
+                PyQnnWrapper.Qnn_DataType_t.QNN_DATATYPE_BOOL_8,
+                {QCOM_DATA: keep_dims},
+            )
+
+        return reduce_max_op
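For context, the `mean_dims` handling in `define_node` first wraps any negative reduction dims into the positive range and then remaps them through the recorded axis order. A minimal standalone sketch of that arithmetic (the NHWC permutation below is an assumed example for illustration, not taken from this commit):

```python
# Sketch of the dim normalization performed in AMax.define_node above.
rank = 4                                     # rank of the input tensor
dims = [-1, 1]                               # dims as they might be passed to torch.amax
dims = [d % rank for d in dims]              # -> [3, 1]: negative dims wrapped to positive
axis_order = (0, 2, 3, 1)                    # assumed NCHW -> NHWC permutation
dims = [axis_order.index(d) for d in dims]   # -> [2, 3]: positions of those dims in the new layout
```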

‎backends/qualcomm/builders/qnn_constants.py

Lines changed: 7 additions & 0 deletions

@@ -14,6 +14,13 @@
 # instead of replicating them here.
 
 
+@dataclass(init=False, frozen=True)
+class OpAmax:
+    op_name: str = "ReduceMax"
+    param_axes: str = "axes"
+    param_keep_dims: str = "keep_dims"
+
+
 @dataclass(init=False, frozen=True)
 class OpBatchnorm:
     op_name: str = "Batchnorm"

‎backends/qualcomm/quantizer/annotators.py

Lines changed: 5 additions & 0 deletions

@@ -182,6 +182,11 @@ def annotate_add(node: Node, quantization_config: QuantizationConfig) -> None:
     annotate_binary(node, quantization_config)
 
 
+@register_annotator([torch.ops.aten.amax.default])
+def annotate_amax(node: Node, quantization_config: QuantizationConfig) -> None:
+    annotate_binary(node, quantization_config)
+
+
 @register_annotator([torch.ops.aten.argmin.default])
 def annotate_argmin(node: Node, quantization_config: QuantizationConfig) -> None:
     if _is_annotated([node]):

‎backends/qualcomm/tests/models.py

Lines changed: 10 additions & 0 deletions

@@ -72,6 +72,16 @@ def forward(self, x):
         return torch.any(x, dim=self.dim, keepdim=self.keepdim)
 
 
+class AMax(torch.nn.Module):
+    def __init__(self, dim=None, keepdim=False):
+        super().__init__()
+        self.dim = dim
+        self.keepdim = keepdim
+
+    def forward(self, x):
+        return torch.amax(x, dim=self.dim, keepdim=self.keepdim)
+
+
 class Arange(torch.nn.Module):
     def __init__(self, start, end, step, dtype):
         super().__init__()

‎backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 15 additions & 0 deletions

@@ -113,6 +113,13 @@ def test_qnn_backend_adaptive_avg_pool2d(self):
         sample_input = (torch.randn(1, 512, 7, 7),)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_amax(self):
+        modules = [AMax(dim=1, keepdim=False), AMax(dim=1, keepdim=True)]  # noqa: F405
+        sample_input = (torch.randn(4, 4),)
+        for i, module in enumerate(modules):
+            with self.subTest(i=i):
+                self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_any(self):
         modules = [Any(), Any(dim=[0, 1]), Any(dim=1, keepdim=True)]  # noqa: F405
         sample_input = (torch.randn(3, 3, 3) > 0,)
@@ -1111,6 +1118,14 @@ def test_qnn_backend_adaptive_avg_pool2d(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_amax(self):
+        modules = [AMax(dim=1, keepdim=False), AMax(dim=1, keepdim=True)]  # noqa: F405
+        sample_input = (torch.randn(4, 4),)
+        for i, module in enumerate(modules):
+            with self.subTest(i=i):
+                module = self.get_qdq_module(module, sample_input)
+                self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_any(self):
         modules = [Any(), Any(dim=[0, 1]), Any(dim=1, keepdim=True)]  # noqa: F405
         sample_input = (torch.randn(3, 3, 3) > 0,)
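As a quick reference for what these tests exercise: `torch.amax` reduces over the given dims and either drops or keeps them depending on `keepdim`. A small eager-mode sketch with the same `(4, 4)` input shape used above:

```python
import torch

x = torch.randn(4, 4)
print(torch.amax(x, dim=1, keepdim=False).shape)  # torch.Size([4])
print(torch.amax(x, dim=1, keepdim=True).shape)   # torch.Size([4, 1])
```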

‎docs/source/backends-coreml.md

Lines changed: 2 additions & 2 deletions

@@ -172,8 +172,8 @@ add_subdirectory("executorch")
 target_link_libraries(
     my_target
     PRIVATE executorch
-    executorch_module_static
-    executorch_tensor
+    extension_module_static
+    extension_tensor
     optimized_native_cpu_ops_lib
     coremldelegate)
 ```

‎docs/source/backends-xnnpack.md

Lines changed: 2 additions & 2 deletions

@@ -128,8 +128,8 @@ add_subdirectory("executorch")
 target_link_libraries(
     my_target
     PRIVATE executorch
-    executorch_module_static
-    executorch_tensor
+    extension_module_static
+    extension_tensor
     optimized_native_cpu_ops_lib
     xnnpack_backend)
 ```

‎docs/source/getting-started-architecture.md

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@ There are three phases to deploy a PyTorch model to on-device: program preparation
 
 ExecuTorch extends the flexibility and usability of PyTorch to edge devices. It
 leverages PyTorch 2 compiler and export functionality
-([TorchDynamo](https://pytorch.org/docs/stable/dynamo/index.html),
+([TorchDynamo](https://pytorch.org/docs/stable/torch.compiler_dynamo_overview.html),
 [AOTAutograd](https://pytorch.org/functorch/stable/notebooks/aot_autograd_optimizations.html),
 [Quantization](https://pytorch.org/docs/main/quantization.html),
 [dynamic shapes](https://pytorch.org/get-started/pytorch-2.0/#pytorch-2x-faster-more-pythonic-and-as-dynamic-as-ever),

‎docs/source/getting-started.md

Lines changed: 4 additions & 2 deletions

@@ -121,6 +121,8 @@ To add the library to your app, add the following dependency to gradle build rul
 dependencies {
   implementation("org.pytorch:executorch-android:0.5.1")
 }
+
+# See latest available versions in https://mvnrepository.com/artifact/org.pytorch/executorch-android
 ```
 
 #### Runtime APIs
@@ -170,8 +172,8 @@ add_subdirectory("executorch")
 target_link_libraries(
     my_target
     PRIVATE executorch
-    executorch_module_static
-    executorch_tensor
+    extension_module_static
+    extension_tensor
     optimized_native_cpu_ops_lib
     xnnpack_backend)
 ```

‎docs/source/using-executorch-android.md

Lines changed: 13 additions & 9 deletions

@@ -172,18 +172,22 @@ public class MainActivity extends Activity {
     protected void onCreate(Bundle savedInstanceState) {
         super.onCreate(savedInstanceState);
         // Load the ExecuTorch module
-        module = Module.load("/path/to/module.pte");
-    }
-    public void runInference(View view) {
-        // Prepare input data
-        Tensor input = Tensor.fromBlob(getInputData());
-        // Run inference
-        Tensor output = module.forward(EValue.from(input))[0].toTensor();
-        // Process output data
-        processOutput(output);
+        Module module = Module.load("/data/local/tmp/add.pte");
+        Tensor tensor1 = Tensor.fromBlob(new float[] {1.0f}, new long[] {1});
+        Tensor tensor2 = Tensor.fromBlob(new float[] {20.0f}, new long[] {1});
+
+        EValue eValue1 = EValue.from(tensor1);
+        EValue eValue2 = EValue.from(tensor2);
+        float result = module.forward(eValue1, eValue2)[0].toTensor().getDataAsFloatArray()[0];
     }
 }
 ```
+
+Push the corresponding pte file to the phone:
+```sh
+adb push extension/module/test/resources/add.pte /data/local/tmp/
+```
+
 This example loads an ExecuTorch module, prepares input data, runs inference, and processes the output data.
 
 Please use [DeepLabV3AndroidDemo](https://github.com/pytorch-labs/executorch-examples/tree/main/dl3/android/DeepLabV3Demo)

‎docs/source/using-executorch-cpp.md

Lines changed: 3 additions & 3 deletions

@@ -38,7 +38,7 @@ Running a model using the low-level runtime APIs allows for a high-degree of control
 
 ## Building with CMake
 
-ExecuTorch uses CMake as the primary build system. Inclusion of the module and tensor APIs are controlled by the `EXECUTORCH_BUILD_EXTENSION_MODULE` and `EXECUTORCH_BUILD_EXTENSION_TENSOR` CMake options. As these APIs may not be supported on embedded systems, they are disabled by default when building from source. The low-level API surface is always included. To link, add the `executorch` target as a CMake dependency, along with `executorch_module_static` and `executorch_tensor`, if desired.
+ExecuTorch uses CMake as the primary build system. Inclusion of the module and tensor APIs are controlled by the `EXECUTORCH_BUILD_EXTENSION_MODULE` and `EXECUTORCH_BUILD_EXTENSION_TENSOR` CMake options. As these APIs may not be supported on embedded systems, they are disabled by default when building from source. The low-level API surface is always included. To link, add the `executorch` target as a CMake dependency, along with `extension_module_static` and `extension_tensor`, if desired.
 
 ```
 # CMakeLists.txt
@@ -47,8 +47,8 @@ add_subdirectory("executorch")
 target_link_libraries(
     my_target
     PRIVATE executorch
-    executorch_module_static
-    executorch_tensor
+    extension_module_static
+    extension_tensor
     optimized_native_cpu_ops_lib
     xnnpack_backend)
 ```
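For readers following along, a minimal consumer `CMakeLists.txt` consistent with the corrected target names might look like the sketch below; the option toggles and the XNNPACK backend choice are assumptions for illustration, not part of this commit:

```cmake
# Hypothetical consumer project; adjust paths and options to your setup.
cmake_minimum_required(VERSION 3.19)
project(my_app)

set(EXECUTORCH_BUILD_EXTENSION_MODULE ON)   # Module API (assumed off by default)
set(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)   # Tensor API (assumed off by default)
set(EXECUTORCH_BUILD_XNNPACK ON)            # assumed backend selection

add_subdirectory("executorch")

add_executable(my_target main.cpp)
target_link_libraries(
    my_target
    PRIVATE executorch
            extension_module_static
            extension_tensor
            optimized_native_cpu_ops_lib
            xnnpack_backend)
```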

‎examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md

Lines changed: 1 addition & 1 deletion

@@ -135,7 +135,7 @@ You may also wonder what the "--metadata" flag is doing. This flag helps export
 
 Convert tokenizer for Llama 2
 ```
-python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
+python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
 ```
 Rename tokenizer for Llama 3 with command: `mv tokenizer.model tokenizer.bin`. We are updating the demo app to support tokenizer in original format directly.
 

‎examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md

Lines changed: 1 addition & 1 deletion

@@ -103,7 +103,7 @@ You may wonder what the ‘--metadata’ flag is doing. This flag helps export t
 
 * Convert tokenizer for Llama 2 and Llava (skip this for Llama 3.x)
 ```
-python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
+python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
 ```
 
 ### For LLaVA model

‎examples/models/llama2/README.md

Lines changed: 1 addition & 1 deletion

@@ -41,7 +41,7 @@ You can export and run the original Llama 2 7B model.
 ```
 4. Create tokenizer.bin.
 ```
-python -m extension.llm.tokenizer.tokenizer -t <tokenizer.model> -o tokenizer.bin
+python -m pytorch_tokenizers.tools.llama2c.convert -t <tokenizer.model> -o tokenizer.bin
 ```
 
 Pass the converted `tokenizer.bin` file instead of `tokenizer.model` for subsequent steps.

‎examples/models/phi-3-mini/README.md

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ pip uninstall -y transformers ; pip install transformers==4.44.2
 ```
 cd executorch
 wget -O tokenizer.model "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/tokenizer.model?download=true"
-python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
+python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
 ```
 2. Export the model. This step will take a few minutes to finish.
 ```

‎examples/qualcomm/oss_scripts/llama/README.md

Lines changed: 1 addition & 1 deletion

@@ -41,7 +41,7 @@ wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt"
 wget "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model"
 
 # tokenizer.bin:
-python -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
+python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin
 
 # params.json:
 echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json

‎exir/program/test/test_program.py

Lines changed: 5 additions & 7 deletions

@@ -725,17 +725,17 @@ def count_nodes(graph_module, target):
         )
 
     def test_edge_dialect_non_core_aten_ops(self):
-        class LinalgNorm(torch.nn.Module):
+        class LinalgRank(torch.nn.Module):
             def __init__(self):
                 super().__init__()
 
             def forward(self, x: torch.Tensor) -> torch.Tensor:
-                return torch.linalg.norm(x)
+                return torch.linalg.matrix_rank(x)
 
         from torch._export.verifier import SpecViolationError
 
-        input = torch.arange(9, dtype=torch.float) - 4
-        ep = torch.export.export(LinalgNorm(), (input,), strict=True)
+        input = torch.ones((9, 9, 9), dtype=torch.float)
+        ep = torch.export.export(LinalgRank(), (input,), strict=True)
 
         # aten::linalg_norm is not a core op, so it should error out
         with self.assertRaises(SpecViolationError):
@@ -748,9 +748,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
                 ep,
                 compile_config=EdgeCompileConfig(
                     _check_ir_validity=True,
-                    _core_aten_ops_exception_list=[
-                        torch.ops.aten.linalg_vector_norm.default
-                    ],
+                    _core_aten_ops_exception_list=[torch.ops.aten._linalg_svd.default],
                 ),
             )
         except SpecViolationError:

‎exir/tracer.py

Lines changed: 11 additions & 1 deletion

@@ -631,8 +631,18 @@ def _default_decomposition_table(
         ]
         # pyre-fixme[7]: Expected `Dict[OpOverload, typing.Callable[..., executorch.e...
         return get_decompositions(decomp_opset)
+
+    decomps = default_decompositions()
+    # Add edge specific decompositions
+    additional_decomp_ops = [
+        # TODO: Eventually this op should be added to the core decompo table, and will not
+        # need to be added here.
+        torch.ops.aten.linalg_vector_norm.default,
+    ]
+    additional_decomps = get_decompositions(additional_decomp_ops)
+    decomps.update(additional_decomps)
     # pyre-fixme[7]: Expected `Dict[OpOverload, typing.Callable[..., executorch.exir....
-    return default_decompositions()
+    return decomps
 
 
 def dynamo_trace(
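The practical effect of registering `aten.linalg_vector_norm.default` here is that the op is rewritten into core ATen primitives during lowering instead of being flagged as a non-core op. A rough standalone sketch of that behavior using the upstream decomposition machinery (API usage below is illustrative, not this commit's exact code path):

```python
import torch
from torch._decomp import get_decompositions


class VectorNorm(torch.nn.Module):
    def forward(self, x):
        return torch.linalg.vector_norm(x)


ep = torch.export.export(VectorNorm(), (torch.randn(4),), strict=True)
table = get_decompositions([torch.ops.aten.linalg_vector_norm.default])
ep = ep.run_decompositions(table)
# The decomposed graph now uses primitive ops instead of aten.linalg_vector_norm,
# so an edge-dialect verifier would no longer see a non-core op here.
print(ep.graph_module.code)
```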

‎extension/android/executorch_android/build.gradle

Lines changed: 3 additions & 0 deletions

@@ -27,6 +27,9 @@ android {
     }
 
     sourceSets {
+        main {
+            jniLibs.srcDirs = ['../../../cmake-out-android-so/']
+        }
         androidTest {
             resources.srcDirs += [ 'src/androidTest/resources' ]
         }

‎scripts/build_android_library.sh

Lines changed: 20 additions & 26 deletions

@@ -12,11 +12,6 @@ if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
 fi
 which "${PYTHON_EXECUTABLE}"
 
-copy_src() {
-  cp -r extension/android/build.gradle extension/android/settings.gradle extension/android/gradlew extension/android/gradle extension/android/gradlew.bat extension/android/gradle.properties "${BUILD_AAR_DIR}"
-  cp -r extension/android/executorch_android "${BUILD_AAR_DIR}/executorch_android"
-}
-
 build_android_native_library() {
   ANDROID_ABI="$1"
   ANDROID_NDK="${ANDROID_NDK:-/opt/ndk}"
@@ -93,54 +88,53 @@ build_android_native_library() {
   cmake --build "${CMAKE_OUT}"/extension/android -j "${CMAKE_JOBS}" --config "${EXECUTORCH_CMAKE_BUILD_TYPE}"
 
   # Copy artifacts to ABI specific directory
-  mkdir -p "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}"
-  cp "${CMAKE_OUT}"/extension/android/*.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/"
+  local SO_STAGE_DIR="cmake-out-android-so/${ANDROID_ABI}"
+  mkdir -p ${SO_STAGE_DIR}
+  cp "${CMAKE_OUT}"/extension/android/*.so "${SO_STAGE_DIR}/libexecutorch.so"
 
   # Copy QNN related so library
   if [ -n "$QNN_SDK_ROOT" ] && [ "$ANDROID_ABI" == "arm64-v8a" ]; then
-    cp "${CMAKE_OUT}"/lib/libqnn_executorch_backend.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/"
-    cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtp.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/"
-    cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnSystem.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/"
-    cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV69Stub.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/"
-    cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV73Stub.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/"
-    cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV75Stub.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/"
-    cp "${QNN_SDK_ROOT}"/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/"
-    cp "${QNN_SDK_ROOT}"/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/"
-    cp "${QNN_SDK_ROOT}"/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so "${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/"
+    cp "${CMAKE_OUT}"/lib/libqnn_executorch_backend.so ${SO_STAGE_DIR}
+    cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtp.so ${SO_STAGE_DIR}
+    cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnSystem.so ${SO_STAGE_DIR}
+    cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV69Stub.so ${SO_STAGE_DIR}
+    cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV73Stub.so ${SO_STAGE_DIR}
+    cp "${QNN_SDK_ROOT}"/lib/aarch64-android/libQnnHtpV75Stub.so ${SO_STAGE_DIR}
+    cp "${QNN_SDK_ROOT}"/lib/hexagon-v69/unsigned/libQnnHtpV69Skel.so ${SO_STAGE_DIR}
+    cp "${QNN_SDK_ROOT}"/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so ${SO_STAGE_DIR}
+    cp "${QNN_SDK_ROOT}"/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${SO_STAGE_DIR}
   fi
 
   # Copy MTK related so library
   if [ -n "$NEURON_BUFFER_ALLOCATOR_LIB" ] && [ -n "$NEURON_USDK_ADAPTER_LIB" ] && [ "$ANDROID_ABI" == "arm64-v8a" ]; then
-    cp "${CMAKE_OUT}"/backends/mediatek/libneuron_backend.so ${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/
-    cp "${NEURON_BUFFER_ALLOCATOR_LIB}" ${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/
-    cp "${NEURON_USDK_ADAPTER_LIB}" ${BUILD_AAR_DIR}/executorch_android/src/main/jniLibs/${ANDROID_ABI}/
+    cp "${CMAKE_OUT}"/backends/mediatek/libneuron_backend.so ${SO_STAGE_DIR}
+    cp "${NEURON_BUFFER_ALLOCATOR_LIB}" ${SO_STAGE_DIR}
+    cp "${NEURON_USDK_ADAPTER_LIB}" ${SO_STAGE_DIR}
  fi
 }
 
 build_aar() {
-  pushd "${BUILD_AAR_DIR}"
-  # Rename libexecutorch_jni.so to libexecutorch.so for soname consistency
-  # between Java and JNI
-  find . -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \;
   if [ "$EXECUTORCH_CMAKE_BUILD_TYPE" == "Release" ]; then
-    find . -type f -name "*.so" -exec "$ANDROID_NDK"/toolchains/llvm/prebuilt/*/bin/llvm-strip {} \;
+    find cmake-out-android-so -type f -name "*.so" -exec "$ANDROID_NDK"/toolchains/llvm/prebuilt/*/bin/llvm-strip {} \;
   fi
+  pushd extension/android/
   ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build
-  cp executorch_android/build/outputs/aar/executorch_android-debug.aar executorch.aar
   popd
+  cp extension/android/executorch_android/build/outputs/aar/executorch_android-debug.aar "${BUILD_AAR_DIR}/executorch.aar"
 }
 
 main() {
   if [[ -z "${BUILD_AAR_DIR:-}" ]]; then
     BUILD_AAR_DIR="$(mktemp -d)"
   fi
   export BUILD_AAR_DIR
+  mkdir -p $BUILD_AAR_DIR
   if [ -z "$ANDROID_ABIS" ]; then
     ANDROID_ABIS=("arm64-v8a" "x86_64")
   fi
   export ANDROID_ABIS
 
-  copy_src
+  mkdir -p cmake-out-android-so/
   for ANDROID_ABI in "${ANDROID_ABIS[@]}"; do
     build_android_native_library ${ANDROID_ABI}

0 commit comments
