From 8a76d445ccf4004703283c8f48ae0acd2cf74ae6 Mon Sep 17 00:00:00 2001 From: haowhsu-quic Date: Tue, 22 Apr 2025 10:04:43 +0800 Subject: [PATCH 1/5] Qualcomm AI Engine Direct - multi-method support Summary - refactor to adopt multi-method change - framwork change to meet use case --- .../aot/python/PyQnnManagerAdaptor.cpp | 7 +- .../qualcomm/aot/python/PyQnnManagerAdaptor.h | 257 +----------------- backends/qualcomm/qnn_preprocess.py | 91 ++++++- backends/qualcomm/runtime/QnnManager.cpp | 11 +- backends/qualcomm/runtime/QnnManager.h | 4 - .../runtime/backends/QnnBackendCache.cpp | 5 +- .../runtime/backends/QnnBackendCache.h | 10 +- .../runtime/backends/QnnBackendFactory.cpp | 3 +- .../backends/htpbackend/HtpBackendCache.h | 6 +- .../serialization/qc_compiler_spec.fbs | 6 +- backends/qualcomm/serialization/qc_schema.py | 4 +- backends/qualcomm/tests/test_qnn_delegate.py | 80 +++--- backends/qualcomm/utils/utils.py | 253 +---------------- .../executor_runner/qnn_executor_runner.cpp | 6 - examples/qualcomm/oss_scripts/llama/llama.py | 158 +++-------- examples/qualcomm/utils.py | 2 - exir/backend/backend_api.py | 43 ++- exir/lowered_backend_module.py | 2 - 18 files changed, 257 insertions(+), 691 deletions(-) diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp index e8261000bb5..67e6775f451 100644 --- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp +++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp @@ -30,15 +30,14 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) { py::class_>(m, "QnnManager") .def(py::init()) .def(py::init()) - .def(py::init()) .def("Init", &PyQnnManager::Init) .def("IsNodeSupportedByBackend", &PyQnnManager::IsNodeSupportedByBackend) - .def("Compile", py::overload_cast<>(&PyQnnManager::Compile)) .def( "Compile", py::overload_cast< - const std::string&, - std::vector>&>(&PyQnnManager::Compile)) + const std::vector&, + std::vector>>&>( + &PyQnnManager::Compile)) .def("Destroy", &PyQnnManager::Destroy) .def("IsAvailable", &PyQnnManager::IsAvailable) .def("IsTensorDump", &PyQnnManager::IsTensorDump) diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h index 67abadd6731..66fd41721c6 100644 --- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h +++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h @@ -50,119 +50,6 @@ class PyQnnManager { qnn_executorch_options, qnn_executorch_context_binary_); } - // used during stage 2 of multi-graph mode - explicit PyQnnManager(const py::bytes& buffer, const py::list& qcirs) - : qnn_executorch_option_ptr_(buffer) { - auto qnn_executorch_options = GetQnnExecuTorchOptions( - qnn_executorch_option_ptr_.cast().data()); - - // merge multiple qcirs into one context with multiple graphs - - // We start retrieving tensor from offsets = 0. 
- std::vector offsets(1, 0); - std::vector tensor_data; - std::vector tensor_ptr; - std::vector tensor_size; - uint64_t total_tensor_size = 0; - for (size_t i = 0; i < qcirs.size(); ++i) { - py::buffer_info info(py::buffer(qcirs[i].cast()).request()); - - uint8_t* qcir_custom_buffer_ptr = static_cast(info.ptr); - QnnQcirCustomProtocol qnn_qcir_custom_protocol; - auto [status, _, qcir_tensor_size, __, qcir_tensor_ptr] = - qnn_qcir_custom_protocol.DeserializeQcirCustomBuffer( - qcir_custom_buffer_ptr); - - if (status != Error::Ok) { - QNN_EXECUTORCH_LOG_ERROR("Fail to verify QnnQcirCustomProtocol"); - return; - } - - tensor_ptr.push_back(static_cast(qcir_tensor_ptr)); - tensor_size.push_back(qcir_tensor_size); - total_tensor_size += qcir_tensor_size; - offsets.push_back(offsets.back() + qcir_tensor_size); - } - - tensor_data.resize(total_tensor_size); - - // store multiple graphs tensor in a contiguous memory space - for (size_t i = 0; i < tensor_ptr.size(); ++i) { - std::memcpy( - tensor_data.data() + offsets[i], tensor_ptr[i], tensor_size[i]); - } - - std::vector> graphs; - for (size_t i = 0; i < qcirs.size(); ++i) { - py::buffer_info info(py::buffer(qcirs[i].cast()).request()); - - uint8_t* qcir_custom_buffer_ptr = static_cast(info.ptr); - QnnQcirCustomProtocol qnn_qcir_custom_protocol; - auto [status, qcir_fbs_size, _, qcir_fbs_ptr, __] = - qnn_qcir_custom_protocol.DeserializeQcirCustomBuffer( - qcir_custom_buffer_ptr); - - if (status != Error::Ok) { - QNN_EXECUTORCH_LOG_ERROR("Fail to verify QnnQcirCustomProtocol"); - return; - } - - auto context = qcir::GetContext(qcir_fbs_ptr); - for (const auto& graph : *context->graphs()) { - std::vector> tensors; - for (const auto tensor : *graph->tensors()) { - // here we need to take a detour to merge multiple qcir flatbuffers - // outer ToTensor - // return: flatbuffers::Offset - // consume: QnnTensor, data_offset, flatbuffers::FlatBufferBuilder* - // inner ToTensor - // return: QnnTensor - // consume: - // flatbuffers::Vector<::flatbuffers::Offset>, - // data_ptr - tensors.emplace_back(ToTensor( - ToTensor(tensor, nullptr), - offsets[i] + tensor->offset(), - &builder_)); - } - std::vector> nodes; - for (const auto& node : *graph->nodes()) { - uint32_t* inputs_ptr = const_cast(node->inputs()->data()); - uint32_t* outputs_ptr = - const_cast(node->outputs()->data()); - uint32_t* params_ptr = const_cast(node->params()->data()); - std::vector inputs( - inputs_ptr, inputs_ptr + node->inputs()->size()); - std::vector outputs( - outputs_ptr, outputs_ptr + node->outputs()->size()); - std::vector params( - params_ptr, params_ptr + node->params()->size()); - nodes.emplace_back(qcir::CreateOperatorDirect( - builder_, - node->name()->str().c_str(), - node->package_name()->str().c_str(), - node->type_name()->str().c_str(), - &inputs, - &outputs, - ¶ms)); - } - graphs.emplace_back(qcir::CreateGraphDirect( - builder_, graph->name()->str().c_str(), &nodes, &tensors)); - } - } - - auto context = qcir::CreateContextDirect(builder_, &graphs); - builder_.Finish(context); - QnnExecuTorchContextBinary qcir_bin( - {builder_.GetBufferPointer(), builder_.GetSize()}); - - // Init QnnQcirCustomProtocol binary - qnn_executorch_context_binary_ = - MakeQcirCustomBinaryInfo(qcir_bin, tensor_data); - qnn_manager_ = std::make_shared( - qnn_executorch_options, qnn_executorch_context_binary_); - } - executorch::runtime::Error Init() { return qnn_manager_->Init(); } @@ -172,146 +59,24 @@ class PyQnnManager { return qnn_manager_->IsNodeSupportedByBackend(op_wrappers); } - // this 
method is specific for stage 2 of compiling multi-graphs - py::array_t Compile() { - if (qnn_manager_->CompileQcir() != Error::Ok) { - QNN_EXECUTORCH_LOG_ERROR("Fail to compile qcir"); - return py::array_t(0); - } - - // generate context binary if compilation succeded - QnnExecuTorchContextBinary binary_info; - qnn_manager_->GetContextBinary(binary_info); - // allocate py::array (to pass the result of the C++ function to Python) - auto result = py::array_t(binary_info.nbytes); - auto result_buffer = result.request(); - char* result_ptr = (char*)result_buffer.ptr; - std::memcpy(result_ptr, binary_info.buffer, binary_info.nbytes); - return result; - } - py::array_t Compile( - const std::string& graph_name, - std::vector>& op_wrappers) { + const std::vector& graph_names, + std::vector>>& op_wrappers) { QnnExecuTorchContextBinary binary_info; - if (qnn_manager_->IsMultipleGraphs()) { - builder_.Reset(); - std::vector tensor_data; - std::vector offsets; - std::unordered_map tensor_map; - std::vector> fb_tensors; - std::vector> fb_ops; - - auto set_tensor = [&](const std::shared_ptr& wrapper, - std::vector& index) { - auto it = tensor_map.find(wrapper.get()); - if (it != tensor_map.end()) { - index.push_back(it->second); - } else { - tensor_map[wrapper.get()] = fb_tensors.size(); - index.push_back(fb_tensors.size()); - offsets.push_back(tensor_data.size()); - Qnn_Tensor_t qnn_tensor = wrapper->CloneTensorStruct(); - fb_tensors.emplace_back( - ToTensor(qnn_tensor, offsets.back(), &builder_)); - uint8_t* data_ptr = static_cast( - QNN_TENSOR_VER_PTR(qnn_tensor)->clientBuf.data); - if (data_ptr != nullptr) { - tensor_data.insert( - tensor_data.end(), - data_ptr, - data_ptr + QNN_TENSOR_VER_PTR(qnn_tensor)->clientBuf.dataSize); - } - } - }; - - for (std::shared_ptr& op_wrapper : op_wrappers) { - std::vector inputs, outputs, params; - - for (const auto& tensor_wrapper : op_wrapper->GetInputTensors()) { - set_tensor(tensor_wrapper, inputs); - } - - for (const auto& tensor_wrapper : op_wrapper->GetOutputTensors()) { - set_tensor(tensor_wrapper, outputs); - } - - for (const auto& param : op_wrapper->GetParams()) { - auto* p_tensor_param = dynamic_cast(param.get()); - if (p_tensor_param != nullptr) { - auto wrapper = p_tensor_param->GetTensorWrapper(); - wrapper->SetName(param->GetName()); - set_tensor(wrapper, params); - } else { - executorch::runtime::Error err = param->PopulateQnnParam(); - if (err != executorch::runtime::Error::Ok) { - QNN_EXECUTORCH_LOG_ERROR( - "Fail to get scalar parameter in online prepare stage"); - return py::array_t(0); - } - Qnn_Param_t p = param->GetQnnParam(); - Qnn_Tensor_t t( - {.version = QNN_TENSOR_VERSION_2, .v2 = QNN_TENSOR_V2_INIT}); - QNN_TENSOR_VER_PTR(t)->name = p.name; - QNN_TENSOR_VER_PTR(t)->dataType = p.scalarParam.dataType; - QNN_TENSOR_VER_PTR(t)->clientBuf.data = - static_cast(&p.scalarParam.uint8Value); - QNN_TENSOR_VER_PTR(t)->clientBuf.dataSize = - GetDataTypeSize(QNN_TENSOR_VER_PTR(t)->dataType); - - // collect tensor data - offsets.push_back(tensor_data.size()); - const uint8_t* data_ptr = - static_cast(QNN_TENSOR_VER_PTR(t)->clientBuf.data); - tensor_data.insert( - tensor_data.end(), - data_ptr, - data_ptr + QNN_TENSOR_VER_PTR(t)->clientBuf.dataSize); - params.push_back(fb_tensors.size()); - fb_tensors.emplace_back(ToTensor(t, offsets.back(), &builder_)); - } - } - - Qnn_OpConfig_t op_config = op_wrapper->GetOpConfig(); - fb_ops.emplace_back(qcir::CreateOperatorDirect( - builder_, - QNN_OP_VER_PTR(op_config)->name, - 
QNN_OP_VER_PTR(op_config)->packageName, - QNN_OP_VER_PTR(op_config)->typeName, - &inputs, - &outputs, - ¶ms)); - } - - std::vector> fb_graphs( - {qcir::CreateGraphDirect( - builder_, graph_name.c_str(), &fb_ops, &fb_tensors)}); - auto context = qcir::CreateContextDirect(builder_, &fb_graphs); - builder_.Finish(context); - - QnnExecuTorchContextBinary qcir_binary( - {builder_.GetBufferPointer(), builder_.GetSize()}); - - custom_qcir_protocol_buffer_ = - QnnQcirCustomProtocol(qcir_binary.nbytes, tensor_data.size()); - custom_qcir_protocol_buffer_.BuildQcirCustomBuffer( - qcir_binary, tensor_data); - std::tie(binary_info.buffer, binary_info.nbytes) = - custom_qcir_protocol_buffer_.GetCustomProtocolBuffer(); - } else { - if (qnn_manager_->Compile(graph_name, op_wrappers) != + for (int i = 0; i < graph_names.size(); ++i) { + if (qnn_manager_->Compile(graph_names[i], op_wrappers[i]) != executorch::runtime::Error::Ok) { QNN_EXECUTORCH_LOG_ERROR("Fail to compile QNN graph"); return py::array_t(0); } - auto qnn_executorch_options = GetQnnExecuTorchOptions( - qnn_executorch_option_ptr_.cast().data()); - if (qnn_executorch_options->saver() || - qnn_manager_->GetContextBinary(binary_info) != - executorch::runtime::Error::Ok) { - return py::array_t(0); - } + } + auto qnn_executorch_options = GetQnnExecuTorchOptions( + qnn_executorch_option_ptr_.cast().data()); + if (qnn_executorch_options->saver() || + qnn_manager_->GetContextBinary(binary_info) != + executorch::runtime::Error::Ok) { + return py::array_t(0); } // allocate py::array (to pass the result of the C++ function to Python) diff --git a/backends/qualcomm/qnn_preprocess.py b/backends/qualcomm/qnn_preprocess.py index 63c1795c117..e7048f6b577 100644 --- a/backends/qualcomm/qnn_preprocess.py +++ b/backends/qualcomm/qnn_preprocess.py @@ -6,7 +6,7 @@ import logging from collections import defaultdict -from typing import final, List +from typing import Dict, final, List import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManager @@ -17,6 +17,7 @@ from executorch.backends.qualcomm.partition.utils import generate_qnn_executorch_option from executorch.backends.qualcomm.serialization.qc_schema_serialize import ( flatbuffer_to_option, + option_to_flatbuffer, ) from executorch.exir.backend.backend_details import ( BackendDetails, @@ -34,19 +35,11 @@ @final class QnnBackend(BackendDetails): @staticmethod - def preprocess( - edge_program: ExportedProgram, - compile_specs: List[CompileSpec], - ) -> PreprocessResult: - option = generate_qnn_executorch_option(compile_specs) - qnn_manager = PyQnnManager.QnnManager(option) - qnn_manager.Init() - + def _build_op_wrappers(edge_program: ExportedProgram, enable_tensor_dump: bool): # QNN Delegate Specific Passes graph_module = QnnPassManager().transform_for_preprocess_pipeline(edge_program) assert graph_module is not None - enable_tensor_dump = qnn_manager.IsTensorDump() nodes_to_wrappers = defaultdict(dict) node_visitors = get_node_visitors( edge_program, enable_tensor_dump=enable_tensor_dump @@ -91,9 +84,24 @@ def preprocess( continue else: raise RuntimeError(f"{node.op} is not supported in Qnn") + + return py_op_wrapper_list + + @staticmethod + def preprocess( + edge_program: ExportedProgram, + compile_specs: List[CompileSpec], + ) -> PreprocessResult: + option = generate_qnn_executorch_option(compile_specs) + qnn_manager = PyQnnManager.QnnManager(option) + qnn_manager.Init() + py_op_wrapper_list = QnnBackend._build_op_wrappers( + edge_program, qnn_manager.IsTensorDump() + ) + qnn_context_binary = 
qnn_manager.Compile( - qnn_manager.GetGraphNames()[0], - [py_op_wrapper.GetOpWrapper() for py_op_wrapper in py_op_wrapper_list], + qnn_manager.GetGraphNames(), + [[py_op_wrapper.GetOpWrapper() for py_op_wrapper in py_op_wrapper_list]], ) obj_options = flatbuffer_to_option(option) @@ -108,3 +116,62 @@ def preprocess( processed_bytes=bytes(qnn_context_binary), debug_handle_map={}, ) + + @staticmethod + def preprocess_multimethod( + edge_programs: Dict[str, List[ExportedProgram]], + compile_specs: Dict[str, List[List[CompileSpec]]], + ) -> PreprocessResult: + # TODO: refactor QnnManager to consume multiple compile_spec + # take first compile_specs here for the same partitions + graph_name = list(edge_programs.keys()) + compile_spec = list(compile_specs.values())[0][0] + # gather all graph names + option = flatbuffer_to_option(compile_spec[0].value) + option.graph_name = graph_name + compile_spec[0].value = option_to_flatbuffer(option) + # check if each graph has equal number of partitions + num_sub_graphs = set() + for edge_program in edge_programs.values(): + num_sub_graphs.add(len(edge_program)) + # this constraint is dedicated to weight-sharing scenario + assert ( + len(num_sub_graphs) == 1 + ), "Only graphs with the same number of partitions could be used" + + all_processed_results = {key: [] for key in edge_programs.keys()} + num_sub_graphs = next(iter(num_sub_graphs)) + for i in range(num_sub_graphs): + # e.g. 2 methods (x, y) with 3 partitions + # > context_binary_0: [x.subgraph_0, y.subgraph_0] + # > context_binary_1: [x.subgraph_1, y.subgraph_1] + # > context_binary_2: [x.subgraph_2, y.subgraph_2] + qnn_manager = PyQnnManager.QnnManager( + generate_qnn_executorch_option(compile_spec) + ) + qnn_manager.Init() + py_op_wrapper_list = [] + for j, programs in enumerate(edge_programs.values()): + logger.info(f"Processing Method({j}): ({i+1}/{num_sub_graphs})") + py_op_wrappers = QnnBackend._build_op_wrappers( + programs[i], qnn_manager.IsTensorDump() + ) + py_op_wrapper_list.append( + [py_op_wrapper.GetOpWrapper() for py_op_wrapper in py_op_wrappers] + ) + + qnn_context_binary = qnn_manager.Compile(graph_name, py_op_wrapper_list) + assert ( + len(qnn_context_binary) != 0 + ), "Failed to generate Qnn context binary." 
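The partition-to-context-binary mapping sketched in the comments above can be pictured with a small standalone example; the method names and placeholder strings below are illustrative only and stand in for real exported programs and context binaries:

from collections import defaultdict

# one entry per method; the i-th element is that method's i-th partitioned subgraph
edge_programs = {
    "kv_forward": ["kv_part_0", "kv_part_1"],
    "prefill_forward": ["prefill_part_0", "prefill_part_1"],
}

# weight sharing requires every method to contribute the same number of partitions
assert len({len(parts) for parts in edge_programs.values()}) == 1

# the i-th partition of every method is compiled into the same context binary
context_binaries = defaultdict(list)
num_sub_graphs = len(next(iter(edge_programs.values())))
for i in range(num_sub_graphs):
    for method, partitions in edge_programs.items():
        context_binaries[i].append(partitions[i])

# context_binaries -> {0: ['kv_part_0', 'prefill_part_0'],
#                      1: ['kv_part_1', 'prefill_part_1']}

Each context binary produced this way is then recorded once per method, which is what the loop over edge_programs.keys() just below does.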
+ qnn_manager.Destroy() + # methods should share the same context binary for current partition + for key in edge_programs.keys(): + all_processed_results[key].append( + PreprocessResult( + processed_bytes=bytes(qnn_context_binary), + debug_handle_map={}, + ) + ) + + return all_processed_results diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index 13718b0891a..6850a92fdc6 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -59,7 +59,9 @@ QnnManager::QnnManager( EnumNameQcomChipset(options_->soc_info()->soc_model())); QNN_EXECUTORCH_LOG_INFO( "backend_type: %s", EnumNameQnnExecuTorchBackendType(backend_type)); - QNN_EXECUTORCH_LOG_INFO("graph_name: %s", options_->graph_name()->c_str()); + for (auto name : *options_->graph_name()) { + QNN_EXECUTORCH_LOG_INFO("graph_name: %s", name->c_str()); + } QNN_EXECUTORCH_LOG_INFO("library_path: %s", library_path.c_str()); QNN_EXECUTORCH_LOG_INFO("dump intermediate outputs: %s", IsTensorDump()); QNN_EXECUTORCH_LOG_INFO( @@ -281,6 +283,10 @@ Error QnnManager::Init() { LoadQnnLibrary() == Error::Ok, Internal, "Fail to load Qnn library"); logger_ = std::make_unique( qnn_loaded_backend_, LoggingCallback, options_->log_level()); + std::vector graph_names; + for (auto name : *options_->graph_name()) { + graph_names.emplace_back(name->str()); + } if (backend_params_ptr_->backend_init_state_ == BackendInitializeState::UNINITIALIZED) { QNN_EXECUTORCH_LOG_INFO( @@ -298,7 +304,8 @@ Error QnnManager::Init() { Internal, "Failed to load Qnn backend."); ET_CHECK_OR_RETURN_ERROR( - backend_params_ptr_->qnn_backend_cache_ptr_->Configure() == Error::Ok, + backend_params_ptr_->qnn_backend_cache_ptr_->Configure(graph_names) == + Error::Ok, Internal, "Fail to configure Qnn backend cache"); ET_CHECK_OR_RETURN_ERROR( diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h index ee9c4337532..77412a184ff 100644 --- a/backends/qualcomm/runtime/QnnManager.h +++ b/backends/qualcomm/runtime/QnnManager.h @@ -57,10 +57,6 @@ class QnnManager { return options_->online_prepare(); } - bool IsMultipleGraphs() { - return options_->multiple_graphs(); - } - bool IsTensorDump() { return options_->dump_intermediate_outputs(); } diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp index 1e6b1262c3a..5cfe783c6f0 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp @@ -81,11 +81,10 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary( return Error::Ok; } -Error QnnBackendCache::Configure() { +Error QnnBackendCache::Configure(const std::vector& graph_names) { if (qnn_context_blob_.buffer == nullptr) { + graph_names_ = graph_names; state_ = SERIALIZE; - // use aot_graph_name if we're lowering graph on host side - graph_names_.push_back(aot_graph_name_); QNN_EXECUTORCH_LOG_INFO("Caching: Caching is in SAVE MODE."); return Error::Ok; } diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.h b/backends/qualcomm/runtime/backends/QnnBackendCache.h index 9abec186c3a..f51fd5679a1 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.h +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.h @@ -26,10 +26,8 @@ class QnnBackendCache { ONLINE_PREPARE = 3, MULTI_GRAPH = 4, }; - explicit QnnBackendCache( - const QnnExecuTorchContextBinary& qnn_context_blob, - const std::string& aot_graph_name) - : 
qnn_context_blob_(qnn_context_blob), aot_graph_name_(aot_graph_name) {} + explicit QnnBackendCache(const QnnExecuTorchContextBinary& qnn_context_blob) + : qnn_context_blob_(qnn_context_blob) {} virtual ~QnnBackendCache(); QnnBackendCache(const QnnBackendCache&) = delete; QnnBackendCache(QnnBackendCache&&) = delete; @@ -60,7 +58,8 @@ class QnnBackendCache { graph_names_.emplace_back(graph_name); } - executorch::runtime::Error Configure(); + executorch::runtime::Error Configure( + const std::vector& graph_names); protected: virtual executorch::runtime::Error RetrieveBackendBinaryInfo( @@ -82,7 +81,6 @@ class QnnBackendCache { QnnSystemContext_Handle_t sys_context_handle_{nullptr}; QnnSystemImplementation qnn_sys_impl_{"libQnnSystem.so"}; std::vector graph_names_; - std::string aot_graph_name_; std::unordered_map> input_tensor_structs_; std::unordered_map> diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp index 1f251aeaffa..e646a3add5e 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp @@ -62,8 +62,7 @@ std::unique_ptr QnnBackendFactory::Create( implementation, logger, options->soc_info(), htp_options); backend_params->qnn_backend_cache_ptr_ = - std::make_unique( - qnn_context_blob, options->graph_name()->str()); + std::make_unique(qnn_context_blob); backend_params->qnn_context_ptr_ = std::make_unique( implementation, diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h index 4dd6897f74a..faad456aed4 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h @@ -13,10 +13,8 @@ namespace backends { namespace qnn { class HtpBackendCache : public QnnBackendCache { public: - explicit HtpBackendCache( - const QnnExecuTorchContextBinary& qnn_context_blob, - const std::string& aot_graph_name) - : QnnBackendCache(qnn_context_blob, aot_graph_name), spill_fill_buf_(0) {} + explicit HtpBackendCache(const QnnExecuTorchContextBinary& qnn_context_blob) + : QnnBackendCache(qnn_context_blob), spill_fill_buf_(0) {} ~HtpBackendCache() override = default; uint64_t GetSpillFillBufferSize() { diff --git a/backends/qualcomm/serialization/qc_compiler_spec.fbs b/backends/qualcomm/serialization/qc_compiler_spec.fbs index d8809231a9f..656bb5c76af 100644 --- a/backends/qualcomm/serialization/qc_compiler_spec.fbs +++ b/backends/qualcomm/serialization/qc_compiler_spec.fbs @@ -168,7 +168,8 @@ table QnnExecuTorchOptions { backend_options:QnnExecuTorchBackendOptions; /// Optional parameter to create qnn graph if QNN context blob is not given - graph_name:string; + /// It could be a list of names only when doing weight-sharing lowering + graph_name:[string]; /// Optional parameter to override the QNN backend library. library_path:string; @@ -192,9 +193,6 @@ table QnnExecuTorchOptions { /// Is model from qnn context binary is_from_context_binary:bool; - /// True if there exists multiple graphs in one .pte file. 
- multiple_graphs:bool; - // Enable this option to record all QNN API calls for debugging purpose saver:bool; diff --git a/backends/qualcomm/serialization/qc_schema.py b/backends/qualcomm/serialization/qc_schema.py index 93305b1dbb5..84ce23701ef 100644 --- a/backends/qualcomm/serialization/qc_schema.py +++ b/backends/qualcomm/serialization/qc_schema.py @@ -10,6 +10,7 @@ from dataclasses import dataclass, field from enum import IntEnum, unique +from typing import List @dataclass @@ -148,7 +149,7 @@ class QnnExecuTorchBackendOptions: class QnnExecuTorchOptions: soc_info: SocInfo backend_options: QnnExecuTorchBackendOptions - graph_name: str = "" + graph_name: List[str] = field(default_factory=lambda: ["forward"]) library_path: str = "" log_level: QnnExecuTorchLogLevel = QnnExecuTorchLogLevel.kLogOff online_prepare: bool = False @@ -156,6 +157,5 @@ class QnnExecuTorchOptions: profile_level: QnnExecuTorchProfileLevel = QnnExecuTorchProfileLevel.kProfileOff shared_buffer: bool = False is_from_context_binary: bool = False - multiple_graphs: bool = False saver: bool = False saver_output_dir: str = "saver_output" diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 338209fcd4a..e1dfe3295dd 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -23,8 +23,10 @@ ) from executorch.backends.qualcomm.tests.utils import ( + convert_pt2e, generate_context_binary, ModuleQConfig, + prepare_pt2e, QnnTool, QuantDtype, TestQNN, @@ -44,9 +46,9 @@ dump_context_from_pte, from_context_binary, generate_htp_compiler_spec, - generate_multi_graph_program, generate_qnn_executorch_compiler_spec, PyQnnManagerAdaptor, + QnnPartitioner, skip_annotation, to_edge_transform_and_lower_to_qnn, update_spill_fill_size, @@ -87,8 +89,12 @@ from executorch.examples.models.torchvision_vit.model import TorchVisionViTModel from executorch.examples.models.wav2letter import Wav2LetterModel -from executorch.exir import to_edge -from executorch.exir.backend.backend_api import disable_validation +from executorch.exir import EdgeProgramManager, to_edge +from executorch.exir.backend.backend_api import ( + disable_validation, + MethodProgramsPartitionerSpec, + to_backend, +) class TestQNNFloatingPointOperator(TestQNN): @@ -2459,35 +2465,37 @@ def test_qnn_backend_multi_graphs(self): graph_names = ["seq_conv", "single_conv"] backend_options = generate_htp_compiler_spec( use_fp16=True, + use_weight_sharing=True, ) compiler_specs = [ generate_qnn_executorch_compiler_spec( soc_model=self.chipset_table[TestQNN.model], backend_options=backend_options, - multiple_graphs=True, - weight_sharing=True, graph_name=graph_name, ) for graph_name in graph_names ] - edge_progs = [ - to_edge_transform_and_lower_to_qnn(module, sample_input, compiler_spec) - for module, sample_input, compiler_spec in zip( - modules, sample_inputs, compiler_specs + # TODO: retire capture_program once we figure out how to extract + # intermediate graph from official lowering API + edge_progs = { + graph_name: capture_program(module, sample_input).exported_program + for graph_name, module, sample_input in zip( + graph_names, modules, sample_inputs ) - ] - prog_mgr, _ = generate_multi_graph_program( - compiler_specs=compiler_specs[0], - processed_bytes=[ - edge_prog.exported_program().graph_module.lowered_module_0.processed_bytes - for edge_prog in edge_progs - ], + } + partitioners = { + graph_name: QnnPartitioner(compiler_spec) + for graph_name, compiler_spec in 
zip(graph_names, compiler_specs) + } + lowered_ep_dict = to_backend( + MethodProgramsPartitionerSpec(edge_progs, partitioners) ) + executorch_prog = EdgeProgramManager(lowered_ep_dict).to_executorch() for index, module in enumerate(modules): self.verify_output( module=module, sample_inputs=sample_inputs[index], - executorch_prog=prog_mgr, + executorch_prog=executorch_prog, method_index=index, ) @@ -3101,37 +3109,43 @@ def test_qnn_backend_multi_graphs(self): graph_names = ["seq_conv", "single_conv"] backend_options = generate_htp_compiler_spec( use_fp16=False, + use_weight_sharing=True, ) compiler_specs = [ generate_qnn_executorch_compiler_spec( soc_model=self.chipset_table[TestQNN.model], backend_options=backend_options, - multiple_graphs=True, - weight_sharing=True, graph_name=graph_name, ) for graph_name in graph_names ] - edge_progs = [ - to_edge_transform_and_lower_to_qnn( - self.get_qdq_module(module, sample_input), sample_input, compiler_spec - ) - for module, sample_input, compiler_spec in zip( - modules, sample_inputs, compiler_specs + # TODO: retire capture_program once we figure out how to extract + # intermediate graph from official lowering API + for i, module in enumerate(modules): + module_exported = torch.export.export(module, sample_inputs[i]).module() + module_prepared = prepare_pt2e(module_exported, make_quantizer()) + module_prepared(*sample_inputs[i]) + modules[i] = convert_pt2e(module_prepared) + + edge_progs = { + graph_name: capture_program(module, sample_input).exported_program + for graph_name, module, sample_input in zip( + graph_names, modules, sample_inputs ) - ] - prog_mgr, _ = generate_multi_graph_program( - compiler_specs=compiler_specs[0], - processed_bytes=[ - edge_prog.exported_program().graph_module.lowered_module_0.processed_bytes - for edge_prog in edge_progs - ], + } + partitioners = { + graph_name: QnnPartitioner(compiler_spec) + for graph_name, compiler_spec in zip(graph_names, compiler_specs) + } + lowered_ep_dict = to_backend( + MethodProgramsPartitionerSpec(edge_progs, partitioners) ) + executorch_prog = EdgeProgramManager(lowered_ep_dict).to_executorch() for index, module in enumerate(modules): self.verify_output( module=module, sample_inputs=sample_inputs[index], - executorch_prog=prog_mgr, + executorch_prog=executorch_prog, method_index=index, ) diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 3653cd3176f..fec67833077 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -4,8 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
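Taken together, the revised multi-graph tests above reduce to the following end-to-end flow. This is a hedged sketch rather than the exact test code: the modules, inputs, method names, and target SoC are placeholders/assumptions.

import torch

from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner
from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset
from executorch.backends.qualcomm.utils.utils import (
    capture_program,
    generate_htp_compiler_spec,
    generate_qnn_executorch_compiler_spec,
)
from executorch.exir import EdgeProgramManager
from executorch.exir.backend.backend_api import (
    MethodProgramsPartitionerSpec,
    to_backend,
)

graph_names = ["method_a", "method_b"]  # illustrative method names
modules = {name: torch.nn.Conv2d(3, 3, 3) for name in graph_names}  # placeholder modules
sample_input = (torch.randn(1, 3, 16, 16),)

# weight sharing is now requested once on the HTP backend options,
# while each method keeps its own compile spec and graph name
backend_options = generate_htp_compiler_spec(use_fp16=True, use_weight_sharing=True)
compiler_specs = {
    name: generate_qnn_executorch_compiler_spec(
        soc_model=QcomChipset.SM8650,  # assumed target SoC
        backend_options=backend_options,
        graph_name=name,
    )
    for name in graph_names
}

# one edge program and one partitioner per method
edge_progs = {
    name: capture_program(modules[name], sample_input).exported_program
    for name in graph_names
}
partitioners = {name: QnnPartitioner(compiler_specs[name]) for name in graph_names}

# lower all methods in one pass, then pack the lowered programs into a single .pte
lowered_ep_dict = to_backend(MethodProgramsPartitionerSpec(edge_progs, partitioners))
executorch_prog = EdgeProgramManager(lowered_ep_dict).to_executorch()

The tests then call verify_output with method_index to pick the method under test, and the llama script later in this patch follows the same pattern for its kv_forward and prefill_forward graphs.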
import operator -import re -import time import warnings from collections import OrderedDict from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -51,14 +49,8 @@ QCOM_QUANTIZED_IO, ) -from executorch.exir import ( - EdgeCompileConfig, - ExecutorchProgramManager, - ExirExportedProgram, - to_edge, -) +from executorch.exir import EdgeCompileConfig, ExirExportedProgram, to_edge from executorch.exir.backend.compile_spec_schema import CompileSpec -from executorch.exir.capture import ExecutorchBackendConfig from executorch.exir.lowered_backend_module import LoweredBackendModule from executorch.exir.program._program import ( EdgeProgramManager, @@ -382,6 +374,7 @@ def to_edge_transform_and_lower_to_qnn( def capture_program( module: Union[torch.nn.Module, torch.fx.GraphModule], inputs: Tuple[torch.Tensor], + dep_table: Optional[Dict] = None, passes_job: OrderedDict = None, dynamic_shapes: Dict = None, ) -> exir.ExirExportedProgram: @@ -393,6 +386,7 @@ def capture_program( Args: module (Union[torch.nn.Module, torch.fx.GraphModule]): The PyTorch module or fx.GraphModule to be captured. inputs (Tuple[torch.Tensor]): The input tensors for the module. + dep_table (Optional[Dict]): Dependency table for the transformation passes. passes_job (OrderedDict, optional): Ordered dictionary of transformation passes. dynamic_shapes (Dict, optional): Information about dynamic shapes. @@ -413,7 +407,9 @@ def capture_program( core_ep = ExirExportedProgram(decomposed_ep, False) edge_ep = core_ep.to_edge(qnn_edge_config()) transform_passes = QnnPassManager().get_to_edge_transform_passes( - edge_ep.exported_program, passes_job=passes_job + edge_ep.exported_program, + passes_job=passes_job, + dep_table=dep_table, ) edge_ep.transform(*transform_passes) return edge_ep @@ -855,219 +851,11 @@ def draw_graph(title, path, graph_module: torch.fx.GraphModule): f.write(graph.get_dot_graph().create_svg()) -def generate_multi_graph_program( - compiler_specs: List[CompileSpec], - processed_bytes: List[bytes], - input_nodes_dict: List[torch.fx.Node] = None, - output_nodes_dict: List[torch.fx.Node] = None, - backend_config: ExecutorchBackendConfig = None, - constant_methods: Optional[Dict[str, Any]] = None, -) -> ExecutorchProgramManager: - # compile multiple graphs in qcir into single context binary - ( - graph_inputs, - graph_outputs, - qnn_in_order, - executorch_in_order, - executorch_out_order, - ) = ({}, {}, {}, {}, {}) - qnn_mgr = PyQnnManagerAdaptor.QnnManager( - generate_qnn_executorch_option(compiler_specs), processed_bytes - ) - assert qnn_mgr.Init().value == 0, "failed to load processed bytes" - binary_info = bytes(qnn_mgr.Compile()) - assert len(binary_info) != 0, "failed to generate QNN context binary" - graph_names = qnn_mgr.GetGraphNames() - for graph_name in graph_names: - graph_inputs[graph_name] = qnn_mgr.GetGraphInputs(graph_name) - graph_outputs[graph_name] = qnn_mgr.GetGraphOutputs(graph_name) - - # We need to obtain the order of the IOs to correctly map QNN with nn.module - for graph_name in graph_names: - if input_nodes_dict: - # input - input_names = [node.name for node in input_nodes_dict[graph_name]] - qnn_input_names = [ - wrapper.GetName() for wrapper in graph_inputs[graph_name] - ] - # The input of intermideate module including call_function node - # could not be reorder by node name - if len(input_names) == len(qnn_input_names): - input_order_list = [] - for input_name in input_names: - # e.g., input_0_tokens_0 - pattern = rf"^input_(\d+)_({input_name})_(\d+)$" - for j in 
range(len(qnn_input_names)): - if re.match(pattern, qnn_input_names[j]): - input_order_list.append(j) - break - assert len(input_order_list) == len( - input_names - ), "Order list length is different from names" - executorch_in_order[graph_name] = input_order_list - qnn_in_order[graph_name] = sorted( - range(len(input_order_list)), key=lambda k: input_order_list[k] - ) - if output_nodes_dict: - # output - get_item_list = output_nodes_dict[graph_name][0].args[0] - output_order_list = [item.args[1] for item in get_item_list] - executorch_out_order[graph_name] = output_order_list - - qnn_mgr.Destroy() - - # build custom ops with different graph signatures - compiler_options = flatbuffer_to_option(compiler_specs[0].value) - bundle_progs = [ - from_context_binary( - ctx_path=binary_info, - op_name=f"loader_{graph_name}_{int(time.time())}", - soc_model=compiler_options.soc_info.soc_model, - custom_info={ - "graph_inputs": graph_inputs[graph_name], - "graph_outputs": graph_outputs[graph_name], - "graph_name": graph_name, - "qnn_in_order": qnn_in_order.get(graph_name, None), - "executorch_in_order": executorch_in_order.get(graph_name, None), - "executorch_out_order": executorch_out_order.get(graph_name, None), - }, - ) - for graph_name in graph_names - ] - # leverage ExecutorchProgramManager for generating pte with multi-methods - edge_prog_mgr = to_edge( - { - graph_name: bundle_prog["exported_program"] - for graph_name, bundle_prog in zip(graph_names, bundle_progs) - }, - constant_methods=constant_methods, - # do not alter name for custom op - compile_config=EdgeCompileConfig(_use_edge_ops=False), - ) - # restore meta losed in generating EdgeProgramManager - for graph_name in graph_names: - for n in edge_prog_mgr._edge_programs[graph_name].graph.nodes: - if graph_name in n.name: - n.meta[OpContextLoader.meta_ctx_bin] = binary_info - break - - edge_prog_mgr = edge_prog_mgr.to_backend(QnnPartitioner(compiler_specs)) - exec_prog = edge_prog_mgr.to_executorch( - config=backend_config or ExecutorchBackendConfig() - ) - return exec_prog, bundle_progs - - -def generate_composite_llama_program( - llama_model: torch.nn.Module, - graph_names: List[str], - sample_inputs_list: List[Tuple[Any]], - lower_module_dict: Dict[str, List[LoweredBackendModule]], - call_delegate_node_name_dict: Dict[str, List[str]], - call_delegate_inputs_dict: Dict[str, List[Tuple[str, int | None]]], - outputs_dict: Dict[str, List[Tuple[str, int]]], - embedding_quantize: str, - backend_config: ExecutorchBackendConfig = None, - constant_methods: Optional[Dict[str, Any]] = None, -) -> ExecutorchProgramManager: - class CompositeLlamaModule(torch.nn.Module): - def __init__( - self, - llama_model, - lower_module_list, - call_delegate_node_name_list, - call_delegate_inputs_list, - outputs_list, - embedding_quantize, - ) -> None: - super().__init__() - self.llama_model = llama_model - self.lower_module_list = lower_module_list - self.call_delegate_node_name_list = call_delegate_node_name_list - self.call_delegate_inputs_list = call_delegate_inputs_list - self.outputs_list = outputs_list - self.embedding_quantize = embedding_quantize - - def reorder( - self, - call_delegate_inputs: List[Tuple[str, int | None]], - module_inputs: dict[str, torch.Tensor], - all_ret: dict[str, torch.Tensor], - ) -> Tuple[torch.Tensor]: - ret = [] - for name, index in call_delegate_inputs: - if index is not None: - # Get tensor from previous results - ret.append(all_ret[name][index]) - else: - # Get tensor from the inputs of module - 
ret.append(module_inputs[name]) - return tuple(ret) - - def forward( - self, - tokens: torch.Tensor, - atten_mask: torch.Tensor, - input_pos: Optional[torch.Tensor] = None, - *args, - ) -> Tuple[torch.Tensor]: - all_ret = {} - module_input_dict = { - "tokens": tokens, - "atten_mask": atten_mask, - "input_pos": input_pos, - } - for num, arg in enumerate(args): - module_input_dict[f"args_{num}"] = arg - - if self.embedding_quantize: - hidden_states = self.llama_model.tok_embeddings(tokens) - module_input_dict["quantized_decomposed_embedding_4bit_dtype"] = ( - hidden_states - ) - - for lower_module, call_delegate_node_name, call_delegate_inputs in zip( - self.lower_module_list, - self.call_delegate_node_name_list, - self.call_delegate_inputs_list, - ): - inp = self.reorder(call_delegate_inputs, module_input_dict, all_ret) - ret = lower_module(*inp) - all_ret[call_delegate_node_name] = ret - llama_outputs = [] - for output_src_name, index in self.outputs_list: - llama_outputs.append(all_ret[output_src_name][index]) - return tuple(llama_outputs) - - progs_dict = {} - for graph_name, sample_inputs in zip(graph_names, sample_inputs_list): - composite_llama_module = CompositeLlamaModule( - llama_model, - lower_module_dict[graph_name], - call_delegate_node_name_dict[graph_name], - call_delegate_inputs_dict[graph_name], - outputs_dict[graph_name], - embedding_quantize, - ) - prog = torch.export.export(composite_llama_module, sample_inputs, strict=True) - progs_dict[graph_name] = prog - # leverage ExecutorchProgramManager for generating pte with multi-methods - edge_prog_mgr = to_edge( - progs_dict, - constant_methods=constant_methods, - # do not alter name for custom op - compile_config=EdgeCompileConfig(_check_ir_validity=False, _use_edge_ops=False), - ) - exec_prog = edge_prog_mgr.to_executorch( - config=backend_config or ExecutorchBackendConfig() - ) - return exec_prog - - def generate_htp_compiler_spec( use_fp16: bool, use_dlbc: bool = False, use_multi_contexts: bool = False, + use_weight_sharing: bool = False, ) -> QnnExecuTorchBackendOptions: """ Helper function generating backend options for QNN HTP @@ -1081,6 +869,8 @@ def generate_htp_compiler_spec( use_multi_contexts: When multiple contexts are generated inside the same pte, it is possible to reserve a single spill-fill allocation that could be re-used across all the splits. + use_weight_sharing: Used with multiple_graphs, where model size will be + reduced when operations have the same weights across multiple graphs. Returns: QnnExecuTorchHtpBackendOptions: backend options for QNN HTP. @@ -1096,6 +886,7 @@ def generate_htp_compiler_spec( # TODO: enable voting mechanism in runtime and make this as an option htp_options.performance_mode = QnnExecuTorchHtpPerformanceMode.kHtpBurst htp_options.use_multi_contexts = use_multi_contexts + htp_options.use_weight_sharing = use_weight_sharing htp_options.use_dlbc = use_dlbc return QnnExecuTorchBackendOptions( backend_type=QnnExecuTorchBackendType.kHtpBackend, @@ -1114,8 +905,6 @@ def generate_qnn_executorch_compiler_spec( optrace: bool = False, shared_buffer: bool = False, is_from_context_binary: bool = False, - multiple_graphs: bool = False, - weight_sharing: bool = False, graph_name: str = "forward", ) -> List[CompileSpec]: """ @@ -1144,10 +933,7 @@ def generate_qnn_executorch_compiler_spec( shared_buffer: Enables usage of shared buffer between application and backend for graph I/O. is_from_context_binary: True if current graph comes from pre-built context binary. 
- multiple_graphs: True if multiple methods are expected to have in single .pte file. - Please see test cases for post-processing example. - weight_sharing: Used with multiple_graphs, where model size will be reduced when operations have the same weights across multiple graphs. - graph_name: Assign unique graph name if 'multiple_graphs' is used. + graph_name: Assign unique graph name if lowering multiple methods. Returns: List[CompileSpec]: Compiler specs for Qualcomm AI Engine Direct. @@ -1167,16 +953,10 @@ def generate_qnn_executorch_compiler_spec( stacklevel=1, ) - if weight_sharing and not multiple_graphs: - warnings.warn( - "Weight sharing is intended for multiple graphs scenario, please ensure if there are multiple graphs", - stacklevel=1, - ) - qnn_executorch_options = QnnExecuTorchOptions( _soc_info_table[soc_model], backend_options ) - qnn_executorch_options.graph_name = graph_name + qnn_executorch_options.graph_name = [graph_name] qnn_executorch_options.log_level = ( QnnExecuTorchLogLevel.kLogLevelDebug if debug @@ -1212,15 +992,6 @@ def generate_qnn_executorch_compiler_spec( qnn_executorch_options.shared_buffer = shared_buffer qnn_executorch_options.online_prepare = online_prepare qnn_executorch_options.is_from_context_binary = is_from_context_binary - qnn_executorch_options.multiple_graphs = multiple_graphs - - if multiple_graphs: - # enable weight sharing mechanism if multiple graphs appear - if ( - backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend - and weight_sharing - ): - backend_options.htp_options.use_weight_sharing = True return [ CompileSpec(QCOM_QNN_COMPILE_SPEC, option_to_flatbuffer(qnn_executorch_options)) diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp index 83478bd8e68..27ccd9fe3e9 100644 --- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp +++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp @@ -481,12 +481,6 @@ int main(int argc, char** argv) { ++inference_index; } - ET_LOG( - Info, - "%d inference took %f ms, avg %f ms", - inference_index, - elapsed_time, - elapsed_time / inference_index); } else { // if no input is provided, fill the inputs with default values auto inputs = prepare_input_tensors(*method); diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index 375edf9fb6c..47b2eebf518 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -29,7 +29,7 @@ ) from executorch.backends.qualcomm.builders.utils import is_graph_output - +from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner from executorch.backends.qualcomm.quantizer.custom_annotation import ( annotate_linear_16a8w_in_affine_layer, annotate_matmul_16a8w, @@ -39,20 +39,15 @@ from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset -from executorch.backends.qualcomm.serialization.qc_schema_serialize import ( - flatbuffer_to_option, - option_to_flatbuffer, -) from executorch.backends.qualcomm.utils.constants import ( QCOM_PASS_ACTIVATE_KEY, QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY, QCOM_QUANT_ATTRS_MAP, ) from executorch.backends.qualcomm.utils.utils import ( + capture_program, convert_linear_to_conv2d, - generate_composite_llama_program, generate_htp_compiler_spec, - generate_multi_graph_program, generate_qnn_executorch_compiler_spec, get_soc_to_chipset_map, 
to_edge_transform_and_lower_to_qnn, @@ -73,6 +68,11 @@ setup_common_args_and_variables, SimpleADB, ) +from executorch.exir import EdgeProgramManager +from executorch.exir.backend.backend_api import ( + MethodProgramsPartitionerSpec, + to_backend, +) from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass @@ -678,7 +678,9 @@ def permute(w, heads): llama_instace.inputs for llama_instace in llama_instance_list ] backend_options = generate_htp_compiler_spec( - use_fp16=use_fp16, use_multi_contexts=args.num_sharding > 1 + use_fp16=use_fp16, + use_multi_contexts=args.num_sharding > 1, + use_weight_sharing=not args.enable_x86_64, # x86 emulator does not support weight sharing ) graph_names = ["kv_forward", "prefill_forward"] compiler_specs = [ @@ -686,45 +688,49 @@ def permute(w, heads): soc_model=get_soc_to_chipset_map()[args.model], backend_options=backend_options, shared_buffer=args.shared_buffer, - multiple_graphs=True, - weight_sharing=not args.enable_x86_64, # x86 emulator does not support weight sharing graph_name=graph_name, ) for graph_name in graph_names ] - skip_node_op_set = {"llama.fallback.default"} - edge_prog_mgrs = [ - to_edge_transform_and_lower_to_qnn( - llama_instance.llama_graph_module, - sample_input, - compile_spec, + + # TODO: retire capture_program once we figure out how to extract + # intermediate graph from official lowering API + edge_progs = { + graph_name: capture_program( + module=llama_instance.llama_graph_module, + inputs=sample_input, dep_table=llama_instance.dep_table, passes_job=llama_instance.passes_job, - skip_node_op_set=skip_node_op_set, - ) - for llama_instance, sample_input, compile_spec in zip( - llama_instance_list, sample_inputs_list, compiler_specs + ).exported_program + for graph_name, llama_instance, sample_input in zip( + graph_names, llama_instance_list, sample_inputs_list ) - ] - for n in edge_prog_mgrs[0].exported_program().graph.nodes: + } + for n in edge_progs[graph_names[0]].graph.nodes: if n.op == "output": for node, output_encoding in n.meta[QCOM_QUANT_ATTRS_MAP].items(): if node.meta["val"].size() in llama_instance_list[0].io_shape: quant_attrs = output_encoding - if args.num_sharding > 1: - max_sf_size = update_spill_fill_size( - [edge_prog_mgr.exported_program() for edge_prog_mgr in edge_prog_mgrs] + partitioners = { + graph_name: QnnPartitioner( + compiler_spec, skip_node_op_set={"llama.fallback.default"} ) - qnn_executorch_options = flatbuffer_to_option(compiler_specs[0][0].value) - qnn_executorch_options.backend_options.htp_options.max_sf_buf_size = ( - max_sf_size - ) - compiler_specs[0][0].value = option_to_flatbuffer(qnn_executorch_options) + for graph_name, compiler_spec in zip(graph_names, compiler_specs) + } + + lowered_ep_dict = to_backend( + MethodProgramsPartitionerSpec(edge_progs, partitioners) + ) + + if args.num_sharding > 1: + # TODO: add arg parser of spill_fill_size since weight-sharing based + # context binaries cannot be opened in x86 host + pass if args.verbose: - for edge_prog_mgr in edge_prog_mgrs: - print_delegation_info(edge_prog_mgr.exported_program().graph_module) + for ep in lowered_ep_dict.values(): + print_delegation_info(ep.graph_module) executorch_config = ExecutorchBackendConfig( # For shared buffer, user must pass the memory address @@ -737,91 +743,13 @@ def permute(w, heads): ), extract_delegate_segments=True, ) + exec_prog_mgr = EdgeProgramManager( + 
edge_programs=lowered_ep_dict, + constant_methods=llama_instance_list[1].llama_meta, + ).to_executorch(executorch_config) - bundle_progs_list = [] - lower_module_dict = {name: [] for name in graph_names} - call_delegate_inputs_dict = {name: [] for name in graph_names} - call_delegate_node_name_dict = {name: [] for name in graph_names} - outputs_dict = {name: [] for name in graph_names} - input_nodes_dict = {name: [] for name in graph_names} - for prog, graph_name in zip(edge_prog_mgrs, graph_names): - for node in prog.exported_program().graph_module.graph.nodes: - if ( - node.op == "call_function" - and "executorch_call_delegate" in node.name - ): - call_delegate_node_name_dict[graph_name].append(node.name) - call_delegate_inputs_list = [] - for arg in node.args: - if arg.op == "call_function": - if ( - arg.target - == exir_ops.edge.quantized_decomposed.embedding_4bit.dtype - ): - call_delegate_inputs_list.append((arg.name, None)) - else: - while "getitem" not in arg.name: - arg = arg.args[0] - call_delegate_inputs_list.append( - (arg.args[0].name, arg.args[1]) - ) - elif arg.op == "placeholder": - call_delegate_inputs_list.append((arg.name, None)) - # No extra needs to do for get_attr node - call_delegate_inputs_dict[graph_name].append( - call_delegate_inputs_list - ) - elif node.op == "output": - for arg in node.args[0]: - outputs_dict[graph_name].append((arg.args[0].name, arg.args[1])) - for num in range(args.num_sharding - 1, -1, -1): - processed_bytes = [] - for prog, graph_name in zip(edge_prog_mgrs, graph_names): - processed_bytes.append( - getattr( - prog.exported_program().graph_module, f"lowered_module_{num}" - ).processed_bytes - ) - call_delegate_node = [ - list(node.users.keys())[0] - for node in prog.exported_program().graph_module.graph.nodes - if node.op == "get_attr" and node.name == f"lowered_module_{num}" - ] - input_nodes_dict[graph_name] = [ - node - for node in call_delegate_node[0].args - if node.op == "placeholder" - or node.target - == exir_ops.edge.quantized_decomposed.embedding_4bit.dtype - ] - prog_mgr, bundle_progs = generate_multi_graph_program( - compiler_specs=compiler_specs[0], - processed_bytes=processed_bytes, - input_nodes_dict=input_nodes_dict, - backend_config=executorch_config, - constant_methods=llama_instance_list[0].llama_meta, # kv method meta - ) - bundle_progs_list.append(bundle_progs) - for graph_name in graph_names: - lower_module_dict[graph_name].append( - prog_mgr.exported_program(graph_name).graph_module._modules.get( - "lowered_module_0" - ) - ) - exec_prog = generate_composite_llama_program( - llama_model=llama_instance_list[1].llama_model, - graph_names=graph_names, - sample_inputs_list=sample_inputs_list, - lower_module_dict=lower_module_dict, - call_delegate_node_name_dict=call_delegate_node_name_dict, - call_delegate_inputs_dict=call_delegate_inputs_dict, - outputs_dict=outputs_dict, - embedding_quantize=args.embedding_quantize, - backend_config=executorch_config, - constant_methods=llama_instance_list[1].llama_meta, # kv method meta - ) with open(f"{args.artifact}/{pte_filename}.pte", "wb") as file: - exec_prog.write_to_file(file) + exec_prog_mgr.write_to_file(file) end_lowering_ts = time.time() logging.info(f"Time for compiling: {end_lowering_ts - start_lowering_ts}") diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py index 542739a2898..670cdde12ad 100755 --- a/examples/qualcomm/utils.py +++ b/examples/qualcomm/utils.py @@ -58,8 +58,6 @@ class SimpleADB: runner (str): Runtime executor binary expected_input_shape 
(Tuple[torch.Size]): input shape of dynamic graph expected_output_shape (Tuple[torch.Size]): output shape of dynamic graph - expected_input_dtype (Tuple[torch.dtype]): input dtype - expected_output_sdtype (Tuple[torch.dtype]): output dtype """ def __init__( diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py index 310e5ea9379..0b8dac741a5 100644 --- a/exir/backend/backend_api.py +++ b/exir/backend/backend_api.py @@ -7,6 +7,7 @@ import copy import logging +import operator from contextlib import contextmanager, nullcontext from dataclasses import dataclass from functools import singledispatch @@ -204,11 +205,36 @@ def _insert_lowered_submodule( owning_graph_module = call_submodule_node.graph.owning_module # call delegate args should only use user_inputs call_delegate_args = [] + # handle getitem node in multi-method scenario + call_submodule_inputs = [] + for inp_node in call_submodule_node.all_input_nodes: + if inp_node.target == operator.getitem: + # it could be an executorch_call_delegate node or a submodule to be replaced + subgraph = ( + # get owning_module of lowered_module node + getattr( + inp_node.args[0].all_input_nodes[0].graph.owning_module, + inp_node.args[0].all_input_nodes[0].name, + ).original_module + if inp_node.args[0].target + == torch._higher_order_ops.executorch_call_delegate + # get owning_module of submodule node + else getattr( + inp_node.args[0].graph.owning_module, + inp_node.args[0].all_input_nodes[0].name, + ) + ) + output_node = [ + node for node in subgraph.graph.nodes if node.name == "output" + ][0] + call_submodule_inputs.append(output_node.all_input_nodes[inp_node.args[1]]) + else: + call_submodule_inputs.append(inp_node) # Preserve input order as user_inputs for inp_name in submodule_program.graph_signature.user_inputs: - for inp_node in call_submodule_node.all_input_nodes: + for i, inp_node in enumerate(call_submodule_inputs): if inp_node.name == inp_name: - call_delegate_args.append(inp_node) + call_delegate_args.append(call_submodule_node.all_input_nodes[i]) break def generate_debug_handle(ep: ExportedProgram) -> int: @@ -325,6 +351,9 @@ def _partition_and_lower_one_graph_module( toplevel_output_specs_to_delete, ) + # perform validation here to make sure all the delegated submodules are gone + # validate inside _insert_lowered_submodule will break multi-method scenario + owning_program._validate() return tagged_graph_module @@ -569,7 +598,11 @@ def lower_all_submodules_to_backend( # The created exported program for the submodules are in the call_module node's meta data # We just map the method_to_submodule_nodes directly to the method_to_partitioned_exported_programs method_to_partitioned_program = { - method_name: [node.meta["submodule_program"] for node in call_submodule_nodes] + method_name: [ + # perform deep copy here in case backends change graph inside preprocess method + copy.deepcopy(node.meta["submodule_program"]) + for node in call_submodule_nodes + ] for method_name, call_submodule_nodes in method_to_submodules_nodes.items() } method_to_compile_specs = { @@ -627,6 +660,10 @@ def lower_all_submodules_to_backend( toplevel_output_specs_to_delete, ) + # perform validation here to make sure all the delegated submodules are gone + # validate inside _insert_lowered_submodule will break multi-method scenario + owning_program._validate() + @dataclass class MethodProgramsPartitionerSpec: diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py index 78b031a238e..8bbf5a7a960 100644 --- 
a/exir/lowered_backend_module.py +++ b/exir/lowered_backend_module.py @@ -958,5 +958,3 @@ def _unsafe_adjust_original_program( # noqa: C901 if user_idx > idx: user.args = (user.args[0], user_idx - (len(getitem_idxs) - i)) break - - original_program._validate() From 5f976652ba9e3430e3f283f2951495eb3f4868fd Mon Sep 17 00:00:00 2001 From: haowhsu-quic Date: Thu, 1 May 2025 00:08:05 +0800 Subject: [PATCH 2/5] add call_module op check --- exir/backend/backend_api.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py index 0b8dac741a5..3720087a113 100644 --- a/exir/backend/backend_api.py +++ b/exir/backend/backend_api.py @@ -208,7 +208,10 @@ def _insert_lowered_submodule( # handle getitem node in multi-method scenario call_submodule_inputs = [] for inp_node in call_submodule_node.all_input_nodes: - if inp_node.target == operator.getitem: + if inp_node.target == operator.getitem and ( + inp_node.args[0].target == torch._higher_order_ops.executorch_call_delegate + or inp_node.args[0].op == "call_module" + ): # it could be an executorch_call_delegate node or a submodule to be replaced subgraph = ( # get owning_module of lowered_module node From 3624778c8f0a67ed30746d0e67a46c31beba19cd Mon Sep 17 00:00:00 2001 From: haowhsu-quic Date: Thu, 1 May 2025 08:43:36 +0800 Subject: [PATCH 3/5] add is_submodule check --- exir/backend/backend_api.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py index 3720087a113..06d0c566e06 100644 --- a/exir/backend/backend_api.py +++ b/exir/backend/backend_api.py @@ -227,9 +227,8 @@ def _insert_lowered_submodule( inp_node.args[0].all_input_nodes[0].name, ) ) - output_node = [ - node for node in subgraph.graph.nodes if node.name == "output" - ][0] + output_node = list(subgraph.graph.nodes)[-1] + assert output_node.op == "output" call_submodule_inputs.append(output_node.all_input_nodes[inp_node.args[1]]) else: call_submodule_inputs.append(inp_node) @@ -356,7 +355,8 @@ def _partition_and_lower_one_graph_module( # perform validation here to make sure all the delegated submodules are gone # validate inside _insert_lowered_submodule will break multi-method scenario - owning_program._validate() + if not is_submodule: + owning_program._validate() return tagged_graph_module @@ -665,7 +665,8 @@ def lower_all_submodules_to_backend( # perform validation here to make sure all the delegated submodules are gone # validate inside _insert_lowered_submodule will break multi-method scenario - owning_program._validate() + if not is_submodule: + owning_program._validate() @dataclass From 8468fa73bda7b9f7c20c1fd3ddef80a1b3d0e82c Mon Sep 17 00:00:00 2001 From: haowhsu-quic Date: Thu, 1 May 2025 10:15:53 +0800 Subject: [PATCH 4/5] rebase QNN IR PR --- backends/qualcomm/CMakeLists.txt | 7 +- .../qualcomm/aot/python/PyQnnManagerAdaptor.h | 1 - backends/qualcomm/runtime/QnnManager.cpp | 121 ------------------ backends/qualcomm/runtime/QnnManager.h | 1 - .../runtime/backends/QnnBackendCache.cpp | 13 -- .../runtime/backends/QnnCustomProtocol.cpp | 2 +- .../irbackend/x86_64/QnnDlcManager.cpp | 10 +- backends/qualcomm/tests/test_qnn_delegate.py | 6 + 8 files changed, 17 insertions(+), 144 deletions(-) diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index 1b7c8891a4e..37e814d0679 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -153,12 +153,12 @@ 
target_link_libraries(qnn_executorch_logging PRIVATE qnn_schema) target_link_libraries(qnn_profiler PRIVATE qnn_executorch_logging) target_link_libraries(qnn_logger PRIVATE qnn_implementation ${android_log}) target_link_libraries(qnn_backend PRIVATE qnn_implementation qnn_logger) -target_link_libraries(qnn_custom_protocol PRIVATE qcir_utils) +target_link_libraries(qnn_custom_protocol PRIVATE qnn_logger) target_link_libraries( qnn_device PRIVATE qnn_executorch_logging qnn_implementation qnn_logger ) target_link_libraries( - qnn_backend_cache PRIVATE qnn_sys_implementation qcir_utils + qnn_backend_cache PRIVATE qnn_sys_implementation ) target_link_libraries( qnn_context PRIVATE qnn_implementation qnn_logger qnn_backend qnn_device @@ -184,7 +184,7 @@ target_link_libraries( ) target_link_libraries( qnn_executorch_backend PRIVATE qnn_executorch_header qnn_schema qnn_manager - executorch_core qcir_utils extension_tensor + executorch_core extension_tensor ) set_target_properties( qnn_executorch_backend PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" @@ -243,7 +243,6 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") qnn_manager qnn_executorch_header executorch - qcir_utils extension_tensor ) target_link_libraries( diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h index 66fd41721c6..409ec1a4294 100644 --- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h +++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h @@ -6,7 +6,6 @@ * LICENSE file in the root directory of this source tree. */ #pragma once -#include #include #include #include diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index 6850a92fdc6..600bc072b06 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -6,7 +6,6 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include #include #include #include @@ -572,126 +571,6 @@ Error QnnManager::CompileDlc() { return Error::Ok; } -Error QnnManager::CompileQcir() { - QnnQcirCustomProtocol qnn_qcir_custom_protocol; - auto [status, qcir_fbs_size, tensor_size, qcir_fbs_ptr, tensor_ptr] = - qnn_qcir_custom_protocol.DeserializeQcirCustomBuffer( - qnn_context_blob_.buffer); - - if (status != Error::Ok) { - QNN_EXECUTORCH_LOG_ERROR("Failed to verify QnnQcirCustomProtocol"); - return Error::Internal; - } - - auto context = qcir::GetContext(qcir_fbs_ptr); - for (const auto& graph : *context->graphs()) { - // qcir tensors to TensorWrapper - std::vector> graph_inputs, graph_outputs, - tensors; - for (const auto& tensor : *graph->tensors()) { - tensors.emplace_back(CreateTensorWrapper(ToTensor( - tensor, static_cast(tensor_ptr) + tensor->offset()))); - if (tensor->type() == qcir::TensorType::WRITE) { - graph_inputs.push_back(tensors.back()); - } else if (tensor->type() == qcir::TensorType::READ) { - graph_outputs.push_back(tensors.back()); - } - } - std::vector> op_wrappers; - // qcir graph node to OpWrapper - for (const auto& node : *graph->nodes()) { - std::shared_ptr op = std::make_shared( - node->name()->str(), - node->package_name()->str(), - node->type_name()->str()); - - // qcir input tensors to OpWrapper input tensors - std::vector> inputs; - for (uint32_t index : *node->inputs()) { - inputs.push_back(tensors[index]); - } - op->AddInputTensors(inputs); - - // qcir output tensors to OpWrapper output tensors - std::vector> outputs; - for (uint32_t index : *node->outputs()) { - outputs.push_back(tensors[index]); - } - op->AddOutputTensors(outputs); - - // qcir operator param to OpWrapper param - for (uint32_t index : *node->params()) { - const auto& tensor = graph->tensors()->Get(index); - std::string name = tensor->name()->str(); - Qnn_DataType_t dtype = ToDataType(tensor->dtype()); - const uint8_t* data_ptr = - static_cast(tensor_ptr) + tensor->offset(); - if (tensor->shape()->size() != 0) { - // add tensor param - op->AddTensorParam( - name, - dtype, - tensor->shape()->size(), - tensor->shape()->data(), - data_ptr); - } else { - // add scalar param - switch (dtype) { - case Qnn_DataType_t::QNN_DATATYPE_INT_32: - op->AddScalarParam( - name, dtype, *reinterpret_cast(data_ptr)); - break; - case Qnn_DataType_t::QNN_DATATYPE_INT_16: - op->AddScalarParam( - name, dtype, *reinterpret_cast(data_ptr)); - break; - case Qnn_DataType_t::QNN_DATATYPE_INT_8: - op->AddScalarParam(name, dtype, static_cast(*data_ptr)); - break; - case Qnn_DataType_t::QNN_DATATYPE_UINT_32: - op->AddScalarParam( - name, dtype, *reinterpret_cast(data_ptr)); - break; - case Qnn_DataType_t::QNN_DATATYPE_UINT_16: - op->AddScalarParam( - name, dtype, *reinterpret_cast(data_ptr)); - break; - case Qnn_DataType_t::QNN_DATATYPE_UINT_8: - op->AddScalarParam(name, dtype, *data_ptr); - break; - case Qnn_DataType_t::QNN_DATATYPE_FLOAT_32: - case Qnn_DataType_t::QNN_DATATYPE_FLOAT_16: - op->AddScalarParam( - name, dtype, *reinterpret_cast(data_ptr)); - break; - case Qnn_DataType_t::QNN_DATATYPE_BOOL_8: - op->AddScalarParam(name, dtype, *data_ptr); - break; - default: - QNN_EXECUTORCH_LOG_ERROR( - "Invalid scalar type: %s", tensor->name()->c_str()); - break; - } - } - } - op_wrappers.emplace_back(std::move(op)); - } - ET_CHECK_OR_RETURN_ERROR( - Compile(graph->name()->str(), op_wrappers) == Error::Ok, - Internal, - "Fail to compile graph from qcir with graph_name: %s", - graph->name()->str().c_str()); - ET_CHECK_OR_RETURN_ERROR( - 
AllocateTensor(graph->name()->str(), graph_inputs, graph_outputs) == - Error::Ok, - Internal, - "Fail to allocate tensor for qcir with graph_name: %s", - graph->name()->str().c_str()); - } - - return Error::Ok; -} - Error QnnManager::Compile( const std::string& graph_name, std::vector>& op_wrappers) { diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h index 77412a184ff..c01a537f7bd 100644 --- a/backends/qualcomm/runtime/QnnManager.h +++ b/backends/qualcomm/runtime/QnnManager.h @@ -67,7 +67,6 @@ class QnnManager { executorch::runtime::Error GetContextBinary( QnnExecuTorchContextBinary& qnn_executorch_context_binary); - executorch::runtime::Error CompileQcir(); executorch::runtime::Error CompileDlc(); executorch::runtime::Error Compile( const std::string& graph_name, diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp index 5cfe783c6f0..4387d61ab7c 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp @@ -6,7 +6,6 @@ * LICENSE file in the root directory of this source tree. */ -#include #include #include namespace executorch { @@ -129,18 +128,6 @@ Error QnnBackendCache::Configure(const std::vector& graph_names) { qnn_context_blob_.nbytes); if (status == Error::Internal) { - auto [status, qcir_fbs_size, _, qcir_fbs_ptr, __] = - QnnQcirCustomProtocol().DeserializeQcirCustomBuffer( - qnn_context_blob_.buffer); - if (status == Error::Ok) { - // first stage of multi graph - state_ = MULTI_GRAPH; - auto context = qcir::GetContext(qcir_fbs_ptr); - for (const auto& graph : *context->graphs()) { - graph_names_.emplace_back(graph->name()->str()); - } - return Error::Ok; - } // online prepare state_ = ONLINE_PREPARE; } diff --git a/backends/qualcomm/runtime/backends/QnnCustomProtocol.cpp b/backends/qualcomm/runtime/backends/QnnCustomProtocol.cpp index 6bf65f59286..12de1b3e705 100644 --- a/backends/qualcomm/runtime/backends/QnnCustomProtocol.cpp +++ b/backends/qualcomm/runtime/backends/QnnCustomProtocol.cpp @@ -6,13 +6,13 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include #include namespace executorch { namespace backends { namespace qnn { +// we still need this for on-device op validation of other backends void QnnQcirCustomProtocol::BuildQcirCustomBuffer( const QnnExecuTorchContextBinary& qcir_binary, const std::vector& tensor_data) { diff --git a/backends/qualcomm/runtime/backends/irbackend/x86_64/QnnDlcManager.cpp b/backends/qualcomm/runtime/backends/irbackend/x86_64/QnnDlcManager.cpp index 14b9aeadf3a..bd54a078ef7 100644 --- a/backends/qualcomm/runtime/backends/irbackend/x86_64/QnnDlcManager.cpp +++ b/backends/qualcomm/runtime/backends/irbackend/x86_64/QnnDlcManager.cpp @@ -41,8 +41,7 @@ Error QnnDlcManager::Create() { std::make_unique(qnn_loaded_backend_, logger_.get()); backend_params_ptr_->qnn_backend_cache_ptr_ = - std::make_unique( - qnn_context_blob_, options_->graph_name()->str()); + std::make_unique(qnn_context_blob_); backend_params_ptr_->qnn_context_ptr_ = std::make_unique( qnn_loaded_backend_, @@ -64,8 +63,13 @@ Error QnnDlcManager::Create() { Error QnnDlcManager::Configure() { ET_CHECK_OR_RETURN_ERROR( backend_params_ptr_ != nullptr, Internal, "Failed to load Qnn backend."); + std::vector graph_names; + for (auto name : *options_->graph_name()) { + graph_names.emplace_back(name->str()); + } ET_CHECK_OR_RETURN_ERROR( - backend_params_ptr_->qnn_backend_cache_ptr_->Configure() == Error::Ok, + backend_params_ptr_->qnn_backend_cache_ptr_->Configure(graph_names) == + Error::Ok, Internal, "Fail to configure Qnn backend cache"); ET_CHECK_OR_RETURN_ERROR( diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index e1dfe3295dd..7d0a360d298 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -2535,6 +2535,9 @@ def test_qnn_backend_shared_buffer(self): ) def test_qnn_backend_online_prepare(self): + if self.enable_x86_64: + self.skipTest("TODO: add online_prepare support on host platform") + backend_options = generate_htp_compiler_spec(use_fp16=True) TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( soc_model=self.chipset_table[TestQNN.model], @@ -3187,6 +3190,9 @@ def test_qnn_backend_shared_buffer(self): ) def test_qnn_backend_online_prepare(self): + if self.enable_x86_64: + self.skipTest("TODO: add online_prepare support on host platform") + backend_options = generate_htp_compiler_spec(use_fp16=False) TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( soc_model=self.chipset_table[TestQNN.model], From 30883fde362e7951391f930b7eb181017e88dbab Mon Sep 17 00:00:00 2001 From: haowhsu-quic Date: Thu, 1 May 2025 18:17:03 +0800 Subject: [PATCH 5/5] chenge validation logic --- exir/backend/backend_api.py | 14 +++++--------- exir/lowered_backend_module.py | 4 ++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py index 06d0c566e06..54592478198 100644 --- a/exir/backend/backend_api.py +++ b/exir/backend/backend_api.py @@ -201,6 +201,7 @@ def _insert_lowered_submodule( is_submodule: bool, toplevel_input_specs_to_delete: Dict[str, InputSpec], toplevel_output_specs_to_delete: Dict[str, OutputSpec], + validate_program: bool = True, ): owning_graph_module = call_submodule_node.graph.owning_module # call delegate args should only use user_inputs @@ -275,6 +276,7 @@ def generate_debug_handle(ep: ExportedProgram) -> int: call_delegate_node, toplevel_input_specs_to_delete, toplevel_output_specs_to_delete, + validate_program, ) @@ -353,10 +355,6 @@ def 
_partition_and_lower_one_graph_module( toplevel_output_specs_to_delete, ) - # perform validation here to make sure all the delegated submodules are gone - # validate inside _insert_lowered_submodule will break multi-method scenario - if not is_submodule: - owning_program._validate() return tagged_graph_module @@ -661,13 +659,11 @@ def lower_all_submodules_to_backend( is_submodule, toplevel_input_specs_to_delete, toplevel_output_specs_to_delete, + # validate only when all submodules are processed + validate_program=call_submodule_node + == list_of_call_submodule_nodes[-1], ) - # perform validation here to make sure all the delegated submodules are gone - # validate inside _insert_lowered_submodule will break multi-method scenario - if not is_submodule: - owning_program._validate() - @dataclass class MethodProgramsPartitionerSpec: diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py index 8bbf5a7a960..b1bd1b3164d 100644 --- a/exir/lowered_backend_module.py +++ b/exir/lowered_backend_module.py @@ -862,6 +862,7 @@ def _unsafe_adjust_original_program( # noqa: C901 call_delegate_node: torch.fx.Node, input_specs_to_delete: Dict[str, InputSpec], output_specs_to_delete: Dict[str, OutputSpec], + validate_program: bool, ) -> None: """ Directly modify the original exported program's signature and state dict @@ -958,3 +959,6 @@ def _unsafe_adjust_original_program( # noqa: C901 if user_idx > idx: user.args = (user.args[0], user_idx - (len(getitem_idxs) - i)) break + + if validate_program: + original_program._validate()
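
Note for reviewers (not part of the patch series): the final commit defers ExportedProgram._validate() until the last delegated submodule has been folded back into its owning program, because the intermediate states of a multi-method program are intentionally inconsistent while earlier submodules are still being replaced. The sketch below is a minimal, self-contained illustration of that deferred-validation pattern only; Program, fold_submodule, and the node names are hypothetical stand-ins, not the ExecuTorch API.

    # Sketch only: Program.validate() and fold_submodule() stand in for
    # ExportedProgram._validate() and _unsafe_adjust_original_program().
    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class Program:
        folded: List[str] = field(default_factory=list)

        def validate(self) -> None:
            # In the real flow this would be _validate(); calling it between
            # folds would fail while the graph signature is mid-rewrite.
            print(f"validated after folding {self.folded}")

    def fold_submodule(program: Program, node_name: str, validate_program: bool) -> None:
        # Mirrors _insert_lowered_submodule(..., validate_program=...):
        # mutate first, validate only when the caller marks the program final.
        program.folded.append(node_name)
        if validate_program:
            program.validate()

    call_submodule_nodes = ["submodule_0", "submodule_1", "submodule_2"]
    program = Program()
    for node in call_submodule_nodes:
        # Validate only when all submodules are processed, matching
        # lower_all_submodules_to_backend() in PATCH 5/5.
        fold_submodule(program, node, validate_program=node == call_submodule_nodes[-1])

Compared with the earlier "if not is_submodule: owning_program._validate()" guard from PATCH 3/5, passing validate_program down into _unsafe_adjust_original_program keeps the decision at the call site and ensures the owning program is validated exactly once per lowering pass.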