From 8a76d445ccf4004703283c8f48ae0acd2cf74ae6 Mon Sep 17 00:00:00 2001 From: haowhsu-quic Date: Tue, 22 Apr 2025 10:04:43 +0800 Subject: [PATCH 1/5] Qualcomm AI Engine Direct - multi-method support Summary - refactor to adopt multi-method change - framwork change to meet use case --- .../aot/python/PyQnnManagerAdaptor.cpp | 7 +- .../qualcomm/aot/python/PyQnnManagerAdaptor.h | 257 +----------------- backends/qualcomm/qnn_preprocess.py | 91 ++++++- backends/qualcomm/runtime/QnnManager.cpp | 11 +- backends/qualcomm/runtime/QnnManager.h | 4 - .../runtime/backends/QnnBackendCache.cpp | 5 +- .../runtime/backends/QnnBackendCache.h | 10 +- .../runtime/backends/QnnBackendFactory.cpp | 3 +- .../backends/htpbackend/HtpBackendCache.h | 6 +- .../serialization/qc_compiler_spec.fbs | 6 +- backends/qualcomm/serialization/qc_schema.py | 4 +- backends/qualcomm/tests/test_qnn_delegate.py | 80 +++--- backends/qualcomm/utils/utils.py | 253 +---------------- .../executor_runner/qnn_executor_runner.cpp | 6 - examples/qualcomm/oss_scripts/llama/llama.py | 158 +++-------- examples/qualcomm/utils.py | 2 - exir/backend/backend_api.py | 43 ++- exir/lowered_backend_module.py | 2 - 18 files changed, 257 insertions(+), 691 deletions(-) diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp index e8261000bb5..67e6775f451 100644 --- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp +++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp @@ -30,15 +30,14 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) { py::class_>(m, "QnnManager") .def(py::init()) .def(py::init()) - .def(py::init()) .def("Init", &PyQnnManager::Init) .def("IsNodeSupportedByBackend", &PyQnnManager::IsNodeSupportedByBackend) - .def("Compile", py::overload_cast<>(&PyQnnManager::Compile)) .def( "Compile", py::overload_cast< - const std::string&, - std::vector>&>(&PyQnnManager::Compile)) + const std::vector&, + std::vector>>&>( + &PyQnnManager::Compile)) .def("Destroy", &PyQnnManager::Destroy) .def("IsAvailable", &PyQnnManager::IsAvailable) .def("IsTensorDump", &PyQnnManager::IsTensorDump) diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h index 67abadd6731..66fd41721c6 100644 --- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h +++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h @@ -50,119 +50,6 @@ class PyQnnManager { qnn_executorch_options, qnn_executorch_context_binary_); } - // used during stage 2 of multi-graph mode - explicit PyQnnManager(const py::bytes& buffer, const py::list& qcirs) - : qnn_executorch_option_ptr_(buffer) { - auto qnn_executorch_options = GetQnnExecuTorchOptions( - qnn_executorch_option_ptr_.cast().data()); - - // merge multiple qcirs into one context with multiple graphs - - // We start retrieving tensor from offsets = 0. 
- std::vector offsets(1, 0); - std::vector tensor_data; - std::vector tensor_ptr; - std::vector tensor_size; - uint64_t total_tensor_size = 0; - for (size_t i = 0; i < qcirs.size(); ++i) { - py::buffer_info info(py::buffer(qcirs[i].cast()).request()); - - uint8_t* qcir_custom_buffer_ptr = static_cast(info.ptr); - QnnQcirCustomProtocol qnn_qcir_custom_protocol; - auto [status, _, qcir_tensor_size, __, qcir_tensor_ptr] = - qnn_qcir_custom_protocol.DeserializeQcirCustomBuffer( - qcir_custom_buffer_ptr); - - if (status != Error::Ok) { - QNN_EXECUTORCH_LOG_ERROR("Fail to verify QnnQcirCustomProtocol"); - return; - } - - tensor_ptr.push_back(static_cast(qcir_tensor_ptr)); - tensor_size.push_back(qcir_tensor_size); - total_tensor_size += qcir_tensor_size; - offsets.push_back(offsets.back() + qcir_tensor_size); - } - - tensor_data.resize(total_tensor_size); - - // store multiple graphs tensor in a contiguous memory space - for (size_t i = 0; i < tensor_ptr.size(); ++i) { - std::memcpy( - tensor_data.data() + offsets[i], tensor_ptr[i], tensor_size[i]); - } - - std::vector> graphs; - for (size_t i = 0; i < qcirs.size(); ++i) { - py::buffer_info info(py::buffer(qcirs[i].cast()).request()); - - uint8_t* qcir_custom_buffer_ptr = static_cast(info.ptr); - QnnQcirCustomProtocol qnn_qcir_custom_protocol; - auto [status, qcir_fbs_size, _, qcir_fbs_ptr, __] = - qnn_qcir_custom_protocol.DeserializeQcirCustomBuffer( - qcir_custom_buffer_ptr); - - if (status != Error::Ok) { - QNN_EXECUTORCH_LOG_ERROR("Fail to verify QnnQcirCustomProtocol"); - return; - } - - auto context = qcir::GetContext(qcir_fbs_ptr); - for (const auto& graph : *context->graphs()) { - std::vector> tensors; - for (const auto tensor : *graph->tensors()) { - // here we need to take a detour to merge multiple qcir flatbuffers - // outer ToTensor - // return: flatbuffers::Offset - // consume: QnnTensor, data_offset, flatbuffers::FlatBufferBuilder* - // inner ToTensor - // return: QnnTensor - // consume: - // flatbuffers::Vector<::flatbuffers::Offset>, - // data_ptr - tensors.emplace_back(ToTensor( - ToTensor(tensor, nullptr), - offsets[i] + tensor->offset(), - &builder_)); - } - std::vector> nodes; - for (const auto& node : *graph->nodes()) { - uint32_t* inputs_ptr = const_cast(node->inputs()->data()); - uint32_t* outputs_ptr = - const_cast(node->outputs()->data()); - uint32_t* params_ptr = const_cast(node->params()->data()); - std::vector inputs( - inputs_ptr, inputs_ptr + node->inputs()->size()); - std::vector outputs( - outputs_ptr, outputs_ptr + node->outputs()->size()); - std::vector params( - params_ptr, params_ptr + node->params()->size()); - nodes.emplace_back(qcir::CreateOperatorDirect( - builder_, - node->name()->str().c_str(), - node->package_name()->str().c_str(), - node->type_name()->str().c_str(), - &inputs, - &outputs, - ¶ms)); - } - graphs.emplace_back(qcir::CreateGraphDirect( - builder_, graph->name()->str().c_str(), &nodes, &tensors)); - } - } - - auto context = qcir::CreateContextDirect(builder_, &graphs); - builder_.Finish(context); - QnnExecuTorchContextBinary qcir_bin( - {builder_.GetBufferPointer(), builder_.GetSize()}); - - // Init QnnQcirCustomProtocol binary - qnn_executorch_context_binary_ = - MakeQcirCustomBinaryInfo(qcir_bin, tensor_data); - qnn_manager_ = std::make_shared( - qnn_executorch_options, qnn_executorch_context_binary_); - } - executorch::runtime::Error Init() { return qnn_manager_->Init(); } @@ -172,146 +59,24 @@ class PyQnnManager { return qnn_manager_->IsNodeSupportedByBackend(op_wrappers); } - // this 
method is specific for stage 2 of compiling multi-graphs - py::array_t Compile() { - if (qnn_manager_->CompileQcir() != Error::Ok) { - QNN_EXECUTORCH_LOG_ERROR("Fail to compile qcir"); - return py::array_t(0); - } - - // generate context binary if compilation succeded - QnnExecuTorchContextBinary binary_info; - qnn_manager_->GetContextBinary(binary_info); - // allocate py::array (to pass the result of the C++ function to Python) - auto result = py::array_t(binary_info.nbytes); - auto result_buffer = result.request(); - char* result_ptr = (char*)result_buffer.ptr; - std::memcpy(result_ptr, binary_info.buffer, binary_info.nbytes); - return result; - } - py::array_t Compile( - const std::string& graph_name, - std::vector>& op_wrappers) { + const std::vector& graph_names, + std::vector>>& op_wrappers) { QnnExecuTorchContextBinary binary_info; - if (qnn_manager_->IsMultipleGraphs()) { - builder_.Reset(); - std::vector tensor_data; - std::vector offsets; - std::unordered_map tensor_map; - std::vector> fb_tensors; - std::vector> fb_ops; - - auto set_tensor = [&](const std::shared_ptr& wrapper, - std::vector& index) { - auto it = tensor_map.find(wrapper.get()); - if (it != tensor_map.end()) { - index.push_back(it->second); - } else { - tensor_map[wrapper.get()] = fb_tensors.size(); - index.push_back(fb_tensors.size()); - offsets.push_back(tensor_data.size()); - Qnn_Tensor_t qnn_tensor = wrapper->CloneTensorStruct(); - fb_tensors.emplace_back( - ToTensor(qnn_tensor, offsets.back(), &builder_)); - uint8_t* data_ptr = static_cast( - QNN_TENSOR_VER_PTR(qnn_tensor)->clientBuf.data); - if (data_ptr != nullptr) { - tensor_data.insert( - tensor_data.end(), - data_ptr, - data_ptr + QNN_TENSOR_VER_PTR(qnn_tensor)->clientBuf.dataSize); - } - } - }; - - for (std::shared_ptr& op_wrapper : op_wrappers) { - std::vector inputs, outputs, params; - - for (const auto& tensor_wrapper : op_wrapper->GetInputTensors()) { - set_tensor(tensor_wrapper, inputs); - } - - for (const auto& tensor_wrapper : op_wrapper->GetOutputTensors()) { - set_tensor(tensor_wrapper, outputs); - } - - for (const auto& param : op_wrapper->GetParams()) { - auto* p_tensor_param = dynamic_cast(param.get()); - if (p_tensor_param != nullptr) { - auto wrapper = p_tensor_param->GetTensorWrapper(); - wrapper->SetName(param->GetName()); - set_tensor(wrapper, params); - } else { - executorch::runtime::Error err = param->PopulateQnnParam(); - if (err != executorch::runtime::Error::Ok) { - QNN_EXECUTORCH_LOG_ERROR( - "Fail to get scalar parameter in online prepare stage"); - return py::array_t(0); - } - Qnn_Param_t p = param->GetQnnParam(); - Qnn_Tensor_t t( - {.version = QNN_TENSOR_VERSION_2, .v2 = QNN_TENSOR_V2_INIT}); - QNN_TENSOR_VER_PTR(t)->name = p.name; - QNN_TENSOR_VER_PTR(t)->dataType = p.scalarParam.dataType; - QNN_TENSOR_VER_PTR(t)->clientBuf.data = - static_cast(&p.scalarParam.uint8Value); - QNN_TENSOR_VER_PTR(t)->clientBuf.dataSize = - GetDataTypeSize(QNN_TENSOR_VER_PTR(t)->dataType); - - // collect tensor data - offsets.push_back(tensor_data.size()); - const uint8_t* data_ptr = - static_cast(QNN_TENSOR_VER_PTR(t)->clientBuf.data); - tensor_data.insert( - tensor_data.end(), - data_ptr, - data_ptr + QNN_TENSOR_VER_PTR(t)->clientBuf.dataSize); - params.push_back(fb_tensors.size()); - fb_tensors.emplace_back(ToTensor(t, offsets.back(), &builder_)); - } - } - - Qnn_OpConfig_t op_config = op_wrapper->GetOpConfig(); - fb_ops.emplace_back(qcir::CreateOperatorDirect( - builder_, - QNN_OP_VER_PTR(op_config)->name, - 
QNN_OP_VER_PTR(op_config)->packageName, - QNN_OP_VER_PTR(op_config)->typeName, - &inputs, - &outputs, - ¶ms)); - } - - std::vector> fb_graphs( - {qcir::CreateGraphDirect( - builder_, graph_name.c_str(), &fb_ops, &fb_tensors)}); - auto context = qcir::CreateContextDirect(builder_, &fb_graphs); - builder_.Finish(context); - - QnnExecuTorchContextBinary qcir_binary( - {builder_.GetBufferPointer(), builder_.GetSize()}); - - custom_qcir_protocol_buffer_ = - QnnQcirCustomProtocol(qcir_binary.nbytes, tensor_data.size()); - custom_qcir_protocol_buffer_.BuildQcirCustomBuffer( - qcir_binary, tensor_data); - std::tie(binary_info.buffer, binary_info.nbytes) = - custom_qcir_protocol_buffer_.GetCustomProtocolBuffer(); - } else { - if (qnn_manager_->Compile(graph_name, op_wrappers) != + for (int i = 0; i < graph_names.size(); ++i) { + if (qnn_manager_->Compile(graph_names[i], op_wrappers[i]) != executorch::runtime::Error::Ok) { QNN_EXECUTORCH_LOG_ERROR("Fail to compile QNN graph"); return py::array_t(0); } - auto qnn_executorch_options = GetQnnExecuTorchOptions( - qnn_executorch_option_ptr_.cast().data()); - if (qnn_executorch_options->saver() || - qnn_manager_->GetContextBinary(binary_info) != - executorch::runtime::Error::Ok) { - return py::array_t(0); - } + } + auto qnn_executorch_options = GetQnnExecuTorchOptions( + qnn_executorch_option_ptr_.cast().data()); + if (qnn_executorch_options->saver() || + qnn_manager_->GetContextBinary(binary_info) != + executorch::runtime::Error::Ok) { + return py::array_t(0); } // allocate py::array (to pass the result of the C++ function to Python) diff --git a/backends/qualcomm/qnn_preprocess.py b/backends/qualcomm/qnn_preprocess.py index 63c1795c117..e7048f6b577 100644 --- a/backends/qualcomm/qnn_preprocess.py +++ b/backends/qualcomm/qnn_preprocess.py @@ -6,7 +6,7 @@ import logging from collections import defaultdict -from typing import final, List +from typing import Dict, final, List import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManager @@ -17,6 +17,7 @@ from executorch.backends.qualcomm.partition.utils import generate_qnn_executorch_option from executorch.backends.qualcomm.serialization.qc_schema_serialize import ( flatbuffer_to_option, + option_to_flatbuffer, ) from executorch.exir.backend.backend_details import ( BackendDetails, @@ -34,19 +35,11 @@ @final class QnnBackend(BackendDetails): @staticmethod - def preprocess( - edge_program: ExportedProgram, - compile_specs: List[CompileSpec], - ) -> PreprocessResult: - option = generate_qnn_executorch_option(compile_specs) - qnn_manager = PyQnnManager.QnnManager(option) - qnn_manager.Init() - + def _build_op_wrappers(edge_program: ExportedProgram, enable_tensor_dump: bool): # QNN Delegate Specific Passes graph_module = QnnPassManager().transform_for_preprocess_pipeline(edge_program) assert graph_module is not None - enable_tensor_dump = qnn_manager.IsTensorDump() nodes_to_wrappers = defaultdict(dict) node_visitors = get_node_visitors( edge_program, enable_tensor_dump=enable_tensor_dump @@ -91,9 +84,24 @@ def preprocess( continue else: raise RuntimeError(f"{node.op} is not supported in Qnn") + + return py_op_wrapper_list + + @staticmethod + def preprocess( + edge_program: ExportedProgram, + compile_specs: List[CompileSpec], + ) -> PreprocessResult: + option = generate_qnn_executorch_option(compile_specs) + qnn_manager = PyQnnManager.QnnManager(option) + qnn_manager.Init() + py_op_wrapper_list = QnnBackend._build_op_wrappers( + edge_program, qnn_manager.IsTensorDump() + ) + qnn_context_binary = 
qnn_manager.Compile( - qnn_manager.GetGraphNames()[0], - [py_op_wrapper.GetOpWrapper() for py_op_wrapper in py_op_wrapper_list], + qnn_manager.GetGraphNames(), + [[py_op_wrapper.GetOpWrapper() for py_op_wrapper in py_op_wrapper_list]], ) obj_options = flatbuffer_to_option(option) @@ -108,3 +116,62 @@ def preprocess( processed_bytes=bytes(qnn_context_binary), debug_handle_map={}, ) + + @staticmethod + def preprocess_multimethod( + edge_programs: Dict[str, List[ExportedProgram]], + compile_specs: Dict[str, List[List[CompileSpec]]], + ) -> PreprocessResult: + # TODO: refactor QnnManager to consume multiple compile_spec + # take first compile_specs here for the same partitions + graph_name = list(edge_programs.keys()) + compile_spec = list(compile_specs.values())[0][0] + # gather all graph names + option = flatbuffer_to_option(compile_spec[0].value) + option.graph_name = graph_name + compile_spec[0].value = option_to_flatbuffer(option) + # check if each graph has equal number of partitions + num_sub_graphs = set() + for edge_program in edge_programs.values(): + num_sub_graphs.add(len(edge_program)) + # this constraint is dedicated to weight-sharing scenario + assert ( + len(num_sub_graphs) == 1 + ), "Only graphs with the same number of partitions could be used" + + all_processed_results = {key: [] for key in edge_programs.keys()} + num_sub_graphs = next(iter(num_sub_graphs)) + for i in range(num_sub_graphs): + # e.g. 2 methods (x, y) with 3 partitions + # > context_binary_0: [x.subgraph_0, y.subgraph_0] + # > context_binary_1: [x.subgraph_1, y.subgraph_1] + # > context_binary_2: [x.subgraph_2, y.subgraph_2] + qnn_manager = PyQnnManager.QnnManager( + generate_qnn_executorch_option(compile_spec) + ) + qnn_manager.Init() + py_op_wrapper_list = [] + for j, programs in enumerate(edge_programs.values()): + logger.info(f"Processing Method({j}): ({i+1}/{num_sub_graphs})") + py_op_wrappers = QnnBackend._build_op_wrappers( + programs[i], qnn_manager.IsTensorDump() + ) + py_op_wrapper_list.append( + [py_op_wrapper.GetOpWrapper() for py_op_wrapper in py_op_wrappers] + ) + + qnn_context_binary = qnn_manager.Compile(graph_name, py_op_wrapper_list) + assert ( + len(qnn_context_binary) != 0 + ), "Failed to generate Qnn context binary." 
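The partition-to-context-binary mapping sketched in the comments above can be pictured with a small standalone example; the method names and placeholder strings below are illustrative only and stand in for real exported programs and context binaries:

from collections import defaultdict

# one entry per method; the i-th element is that method's i-th partitioned subgraph
edge_programs = {
    "kv_forward": ["kv_part_0", "kv_part_1"],
    "prefill_forward": ["prefill_part_0", "prefill_part_1"],
}

# weight sharing requires every method to contribute the same number of partitions
assert len({len(parts) for parts in edge_programs.values()}) == 1

# the i-th partition of every method is compiled into the same context binary
context_binaries = defaultdict(list)
num_sub_graphs = len(next(iter(edge_programs.values())))
for i in range(num_sub_graphs):
    for method, partitions in edge_programs.items():
        context_binaries[i].append(partitions[i])

# context_binaries -> {0: ['kv_part_0', 'prefill_part_0'],
#                      1: ['kv_part_1', 'prefill_part_1']}

Each context binary produced this way is then recorded once per method, which is what the loop over edge_programs.keys() just below does.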
+ qnn_manager.Destroy() + # methods should share the same context binary for current partition + for key in edge_programs.keys(): + all_processed_results[key].append( + PreprocessResult( + processed_bytes=bytes(qnn_context_binary), + debug_handle_map={}, + ) + ) + + return all_processed_results diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index 13718b0891a..6850a92fdc6 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -59,7 +59,9 @@ QnnManager::QnnManager( EnumNameQcomChipset(options_->soc_info()->soc_model())); QNN_EXECUTORCH_LOG_INFO( "backend_type: %s", EnumNameQnnExecuTorchBackendType(backend_type)); - QNN_EXECUTORCH_LOG_INFO("graph_name: %s", options_->graph_name()->c_str()); + for (auto name : *options_->graph_name()) { + QNN_EXECUTORCH_LOG_INFO("graph_name: %s", name->c_str()); + } QNN_EXECUTORCH_LOG_INFO("library_path: %s", library_path.c_str()); QNN_EXECUTORCH_LOG_INFO("dump intermediate outputs: %s", IsTensorDump()); QNN_EXECUTORCH_LOG_INFO( @@ -281,6 +283,10 @@ Error QnnManager::Init() { LoadQnnLibrary() == Error::Ok, Internal, "Fail to load Qnn library"); logger_ = std::make_unique( qnn_loaded_backend_, LoggingCallback, options_->log_level()); + std::vector graph_names; + for (auto name : *options_->graph_name()) { + graph_names.emplace_back(name->str()); + } if (backend_params_ptr_->backend_init_state_ == BackendInitializeState::UNINITIALIZED) { QNN_EXECUTORCH_LOG_INFO( @@ -298,7 +304,8 @@ Error QnnManager::Init() { Internal, "Failed to load Qnn backend."); ET_CHECK_OR_RETURN_ERROR( - backend_params_ptr_->qnn_backend_cache_ptr_->Configure() == Error::Ok, + backend_params_ptr_->qnn_backend_cache_ptr_->Configure(graph_names) == + Error::Ok, Internal, "Fail to configure Qnn backend cache"); ET_CHECK_OR_RETURN_ERROR( diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h index ee9c4337532..77412a184ff 100644 --- a/backends/qualcomm/runtime/QnnManager.h +++ b/backends/qualcomm/runtime/QnnManager.h @@ -57,10 +57,6 @@ class QnnManager { return options_->online_prepare(); } - bool IsMultipleGraphs() { - return options_->multiple_graphs(); - } - bool IsTensorDump() { return options_->dump_intermediate_outputs(); } diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp index 1e6b1262c3a..5cfe783c6f0 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp @@ -81,11 +81,10 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary( return Error::Ok; } -Error QnnBackendCache::Configure() { +Error QnnBackendCache::Configure(const std::vector& graph_names) { if (qnn_context_blob_.buffer == nullptr) { + graph_names_ = graph_names; state_ = SERIALIZE; - // use aot_graph_name if we're lowering graph on host side - graph_names_.push_back(aot_graph_name_); QNN_EXECUTORCH_LOG_INFO("Caching: Caching is in SAVE MODE."); return Error::Ok; } diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.h b/backends/qualcomm/runtime/backends/QnnBackendCache.h index 9abec186c3a..f51fd5679a1 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.h +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.h @@ -26,10 +26,8 @@ class QnnBackendCache { ONLINE_PREPARE = 3, MULTI_GRAPH = 4, }; - explicit QnnBackendCache( - const QnnExecuTorchContextBinary& qnn_context_blob, - const std::string& aot_graph_name) - : 
qnn_context_blob_(qnn_context_blob), aot_graph_name_(aot_graph_name) {} + explicit QnnBackendCache(const QnnExecuTorchContextBinary& qnn_context_blob) + : qnn_context_blob_(qnn_context_blob) {} virtual ~QnnBackendCache(); QnnBackendCache(const QnnBackendCache&) = delete; QnnBackendCache(QnnBackendCache&&) = delete; @@ -60,7 +58,8 @@ class QnnBackendCache { graph_names_.emplace_back(graph_name); } - executorch::runtime::Error Configure(); + executorch::runtime::Error Configure( + const std::vector& graph_names); protected: virtual executorch::runtime::Error RetrieveBackendBinaryInfo( @@ -82,7 +81,6 @@ class QnnBackendCache { QnnSystemContext_Handle_t sys_context_handle_{nullptr}; QnnSystemImplementation qnn_sys_impl_{"libQnnSystem.so"}; std::vector graph_names_; - std::string aot_graph_name_; std::unordered_map> input_tensor_structs_; std::unordered_map> diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp index 1f251aeaffa..e646a3add5e 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp @@ -62,8 +62,7 @@ std::unique_ptr QnnBackendFactory::Create( implementation, logger, options->soc_info(), htp_options); backend_params->qnn_backend_cache_ptr_ = - std::make_unique( - qnn_context_blob, options->graph_name()->str()); + std::make_unique(qnn_context_blob); backend_params->qnn_context_ptr_ = std::make_unique( implementation, diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h index 4dd6897f74a..faad456aed4 100644 --- a/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h +++ b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h @@ -13,10 +13,8 @@ namespace backends { namespace qnn { class HtpBackendCache : public QnnBackendCache { public: - explicit HtpBackendCache( - const QnnExecuTorchContextBinary& qnn_context_blob, - const std::string& aot_graph_name) - : QnnBackendCache(qnn_context_blob, aot_graph_name), spill_fill_buf_(0) {} + explicit HtpBackendCache(const QnnExecuTorchContextBinary& qnn_context_blob) + : QnnBackendCache(qnn_context_blob), spill_fill_buf_(0) {} ~HtpBackendCache() override = default; uint64_t GetSpillFillBufferSize() { diff --git a/backends/qualcomm/serialization/qc_compiler_spec.fbs b/backends/qualcomm/serialization/qc_compiler_spec.fbs index d8809231a9f..656bb5c76af 100644 --- a/backends/qualcomm/serialization/qc_compiler_spec.fbs +++ b/backends/qualcomm/serialization/qc_compiler_spec.fbs @@ -168,7 +168,8 @@ table QnnExecuTorchOptions { backend_options:QnnExecuTorchBackendOptions; /// Optional parameter to create qnn graph if QNN context blob is not given - graph_name:string; + /// It could be a list of names only when doing weight-sharing lowering + graph_name:[string]; /// Optional parameter to override the QNN backend library. library_path:string; @@ -192,9 +193,6 @@ table QnnExecuTorchOptions { /// Is model from qnn context binary is_from_context_binary:bool; - /// True if there exists multiple graphs in one .pte file. 
- multiple_graphs:bool; - // Enable this option to record all QNN API calls for debugging purpose saver:bool; diff --git a/backends/qualcomm/serialization/qc_schema.py b/backends/qualcomm/serialization/qc_schema.py index 93305b1dbb5..84ce23701ef 100644 --- a/backends/qualcomm/serialization/qc_schema.py +++ b/backends/qualcomm/serialization/qc_schema.py @@ -10,6 +10,7 @@ from dataclasses import dataclass, field from enum import IntEnum, unique +from typing import List @dataclass @@ -148,7 +149,7 @@ class QnnExecuTorchBackendOptions: class QnnExecuTorchOptions: soc_info: SocInfo backend_options: QnnExecuTorchBackendOptions - graph_name: str = "" + graph_name: List[str] = field(default_factory=lambda: ["forward"]) library_path: str = "" log_level: QnnExecuTorchLogLevel = QnnExecuTorchLogLevel.kLogOff online_prepare: bool = False @@ -156,6 +157,5 @@ class QnnExecuTorchOptions: profile_level: QnnExecuTorchProfileLevel = QnnExecuTorchProfileLevel.kProfileOff shared_buffer: bool = False is_from_context_binary: bool = False - multiple_graphs: bool = False saver: bool = False saver_output_dir: str = "saver_output" diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 338209fcd4a..e1dfe3295dd 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -23,8 +23,10 @@ ) from executorch.backends.qualcomm.tests.utils import ( + convert_pt2e, generate_context_binary, ModuleQConfig, + prepare_pt2e, QnnTool, QuantDtype, TestQNN, @@ -44,9 +46,9 @@ dump_context_from_pte, from_context_binary, generate_htp_compiler_spec, - generate_multi_graph_program, generate_qnn_executorch_compiler_spec, PyQnnManagerAdaptor, + QnnPartitioner, skip_annotation, to_edge_transform_and_lower_to_qnn, update_spill_fill_size, @@ -87,8 +89,12 @@ from executorch.examples.models.torchvision_vit.model import TorchVisionViTModel from executorch.examples.models.wav2letter import Wav2LetterModel -from executorch.exir import to_edge -from executorch.exir.backend.backend_api import disable_validation +from executorch.exir import EdgeProgramManager, to_edge +from executorch.exir.backend.backend_api import ( + disable_validation, + MethodProgramsPartitionerSpec, + to_backend, +) class TestQNNFloatingPointOperator(TestQNN): @@ -2459,35 +2465,37 @@ def test_qnn_backend_multi_graphs(self): graph_names = ["seq_conv", "single_conv"] backend_options = generate_htp_compiler_spec( use_fp16=True, + use_weight_sharing=True, ) compiler_specs = [ generate_qnn_executorch_compiler_spec( soc_model=self.chipset_table[TestQNN.model], backend_options=backend_options, - multiple_graphs=True, - weight_sharing=True, graph_name=graph_name, ) for graph_name in graph_names ] - edge_progs = [ - to_edge_transform_and_lower_to_qnn(module, sample_input, compiler_spec) - for module, sample_input, compiler_spec in zip( - modules, sample_inputs, compiler_specs + # TODO: retire capture_program once we figure out how to extract + # intermediate graph from official lowering API + edge_progs = { + graph_name: capture_program(module, sample_input).exported_program + for graph_name, module, sample_input in zip( + graph_names, modules, sample_inputs ) - ] - prog_mgr, _ = generate_multi_graph_program( - compiler_specs=compiler_specs[0], - processed_bytes=[ - edge_prog.exported_program().graph_module.lowered_module_0.processed_bytes - for edge_prog in edge_progs - ], + } + partitioners = { + graph_name: QnnPartitioner(compiler_spec) + for graph_name, compiler_spec in 
zip(graph_names, compiler_specs) + } + lowered_ep_dict = to_backend( + MethodProgramsPartitionerSpec(edge_progs, partitioners) ) + executorch_prog = EdgeProgramManager(lowered_ep_dict).to_executorch() for index, module in enumerate(modules): self.verify_output( module=module, sample_inputs=sample_inputs[index], - executorch_prog=prog_mgr, + executorch_prog=executorch_prog, method_index=index, ) @@ -3101,37 +3109,43 @@ def test_qnn_backend_multi_graphs(self): graph_names = ["seq_conv", "single_conv"] backend_options = generate_htp_compiler_spec( use_fp16=False, + use_weight_sharing=True, ) compiler_specs = [ generate_qnn_executorch_compiler_spec( soc_model=self.chipset_table[TestQNN.model], backend_options=backend_options, - multiple_graphs=True, - weight_sharing=True, graph_name=graph_name, ) for graph_name in graph_names ] - edge_progs = [ - to_edge_transform_and_lower_to_qnn( - self.get_qdq_module(module, sample_input), sample_input, compiler_spec - ) - for module, sample_input, compiler_spec in zip( - modules, sample_inputs, compiler_specs + # TODO: retire capture_program once we figure out how to extract + # intermediate graph from official lowering API + for i, module in enumerate(modules): + module_exported = torch.export.export(module, sample_inputs[i]).module() + module_prepared = prepare_pt2e(module_exported, make_quantizer()) + module_prepared(*sample_inputs[i]) + modules[i] = convert_pt2e(module_prepared) + + edge_progs = { + graph_name: capture_program(module, sample_input).exported_program + for graph_name, module, sample_input in zip( + graph_names, modules, sample_inputs ) - ] - prog_mgr, _ = generate_multi_graph_program( - compiler_specs=compiler_specs[0], - processed_bytes=[ - edge_prog.exported_program().graph_module.lowered_module_0.processed_bytes - for edge_prog in edge_progs - ], + } + partitioners = { + graph_name: QnnPartitioner(compiler_spec) + for graph_name, compiler_spec in zip(graph_names, compiler_specs) + } + lowered_ep_dict = to_backend( + MethodProgramsPartitionerSpec(edge_progs, partitioners) ) + executorch_prog = EdgeProgramManager(lowered_ep_dict).to_executorch() for index, module in enumerate(modules): self.verify_output( module=module, sample_inputs=sample_inputs[index], - executorch_prog=prog_mgr, + executorch_prog=executorch_prog, method_index=index, ) diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index 3653cd3176f..fec67833077 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -4,8 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
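Taken together, the revised multi-graph tests above reduce to the following end-to-end flow. This is a hedged sketch rather than the exact test code: the modules, inputs, method names, and target SoC are placeholders/assumptions.

import torch

from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner
from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset
from executorch.backends.qualcomm.utils.utils import (
    capture_program,
    generate_htp_compiler_spec,
    generate_qnn_executorch_compiler_spec,
)
from executorch.exir import EdgeProgramManager
from executorch.exir.backend.backend_api import (
    MethodProgramsPartitionerSpec,
    to_backend,
)

graph_names = ["method_a", "method_b"]  # illustrative method names
modules = {name: torch.nn.Conv2d(3, 3, 3) for name in graph_names}  # placeholder modules
sample_input = (torch.randn(1, 3, 16, 16),)

# weight sharing is now requested once on the HTP backend options,
# while each method keeps its own compile spec and graph name
backend_options = generate_htp_compiler_spec(use_fp16=True, use_weight_sharing=True)
compiler_specs = {
    name: generate_qnn_executorch_compiler_spec(
        soc_model=QcomChipset.SM8650,  # assumed target SoC
        backend_options=backend_options,
        graph_name=name,
    )
    for name in graph_names
}

# one edge program and one partitioner per method
edge_progs = {
    name: capture_program(modules[name], sample_input).exported_program
    for name in graph_names
}
partitioners = {name: QnnPartitioner(compiler_specs[name]) for name in graph_names}

# lower all methods in one pass, then pack the lowered programs into a single .pte
lowered_ep_dict = to_backend(MethodProgramsPartitionerSpec(edge_progs, partitioners))
executorch_prog = EdgeProgramManager(lowered_ep_dict).to_executorch()

The tests then call verify_output with method_index to pick the method under test, and the llama script later in this patch follows the same pattern for its kv_forward and prefill_forward graphs.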
import operator -import re -import time import warnings from collections import OrderedDict from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -51,14 +49,8 @@ QCOM_QUANTIZED_IO, ) -from executorch.exir import ( - EdgeCompileConfig, - ExecutorchProgramManager, - ExirExportedProgram, - to_edge, -) +from executorch.exir import EdgeCompileConfig, ExirExportedProgram, to_edge from executorch.exir.backend.compile_spec_schema import CompileSpec -from executorch.exir.capture import ExecutorchBackendConfig from executorch.exir.lowered_backend_module import LoweredBackendModule from executorch.exir.program._program import ( EdgeProgramManager, @@ -382,6 +374,7 @@ def to_edge_transform_and_lower_to_qnn( def capture_program( module: Union[torch.nn.Module, torch.fx.GraphModule], inputs: Tuple[torch.Tensor], + dep_table: Optional[Dict] = None, passes_job: OrderedDict = None, dynamic_shapes: Dict = None, ) -> exir.ExirExportedProgram: @@ -393,6 +386,7 @@ def capture_program( Args: module (Union[torch.nn.Module, torch.fx.GraphModule]): The PyTorch module or fx.GraphModule to be captured. inputs (Tuple[torch.Tensor]): The input tensors for the module. + dep_table (Optional[Dict]): Dependency table for the transformation passes. passes_job (OrderedDict, optional): Ordered dictionary of transformation passes. dynamic_shapes (Dict, optional): Information about dynamic shapes. @@ -413,7 +407,9 @@ def capture_program( core_ep = ExirExportedProgram(decomposed_ep, False) edge_ep = core_ep.to_edge(qnn_edge_config()) transform_passes = QnnPassManager().get_to_edge_transform_passes( - edge_ep.exported_program, passes_job=passes_job + edge_ep.exported_program, + passes_job=passes_job, + dep_table=dep_table, ) edge_ep.transform(*transform_passes) return edge_ep @@ -855,219 +851,11 @@ def draw_graph(title, path, graph_module: torch.fx.GraphModule): f.write(graph.get_dot_graph().create_svg()) -def generate_multi_graph_program( - compiler_specs: List[CompileSpec], - processed_bytes: List[bytes], - input_nodes_dict: List[torch.fx.Node] = None, - output_nodes_dict: List[torch.fx.Node] = None, - backend_config: ExecutorchBackendConfig = None, - constant_methods: Optional[Dict[str, Any]] = None, -) -> ExecutorchProgramManager: - # compile multiple graphs in qcir into single context binary - ( - graph_inputs, - graph_outputs, - qnn_in_order, - executorch_in_order, - executorch_out_order, - ) = ({}, {}, {}, {}, {}) - qnn_mgr = PyQnnManagerAdaptor.QnnManager( - generate_qnn_executorch_option(compiler_specs), processed_bytes - ) - assert qnn_mgr.Init().value == 0, "failed to load processed bytes" - binary_info = bytes(qnn_mgr.Compile()) - assert len(binary_info) != 0, "failed to generate QNN context binary" - graph_names = qnn_mgr.GetGraphNames() - for graph_name in graph_names: - graph_inputs[graph_name] = qnn_mgr.GetGraphInputs(graph_name) - graph_outputs[graph_name] = qnn_mgr.GetGraphOutputs(graph_name) - - # We need to obtain the order of the IOs to correctly map QNN with nn.module - for graph_name in graph_names: - if input_nodes_dict: - # input - input_names = [node.name for node in input_nodes_dict[graph_name]] - qnn_input_names = [ - wrapper.GetName() for wrapper in graph_inputs[graph_name] - ] - # The input of intermideate module including call_function node - # could not be reorder by node name - if len(input_names) == len(qnn_input_names): - input_order_list = [] - for input_name in input_names: - # e.g., input_0_tokens_0 - pattern = rf"^input_(\d+)_({input_name})_(\d+)$" - for j in 
range(len(qnn_input_names)): - if re.match(pattern, qnn_input_names[j]): - input_order_list.append(j) - break - assert len(input_order_list) == len( - input_names - ), "Order list length is different from names" - executorch_in_order[graph_name] = input_order_list - qnn_in_order[graph_name] = sorted( - range(len(input_order_list)), key=lambda k: input_order_list[k] - ) - if output_nodes_dict: - # output - get_item_list = output_nodes_dict[graph_name][0].args[0] - output_order_list = [item.args[1] for item in get_item_list] - executorch_out_order[graph_name] = output_order_list - - qnn_mgr.Destroy() - - # build custom ops with different graph signatures - compiler_options = flatbuffer_to_option(compiler_specs[0].value) - bundle_progs = [ - from_context_binary( - ctx_path=binary_info, - op_name=f"loader_{graph_name}_{int(time.time())}", - soc_model=compiler_options.soc_info.soc_model, - custom_info={ - "graph_inputs": graph_inputs[graph_name], - "graph_outputs": graph_outputs[graph_name], - "graph_name": graph_name, - "qnn_in_order": qnn_in_order.get(graph_name, None), - "executorch_in_order": executorch_in_order.get(graph_name, None), - "executorch_out_order": executorch_out_order.get(graph_name, None), - }, - ) - for graph_name in graph_names - ] - # leverage ExecutorchProgramManager for generating pte with multi-methods - edge_prog_mgr = to_edge( - { - graph_name: bundle_prog["exported_program"] - for graph_name, bundle_prog in zip(graph_names, bundle_progs) - }, - constant_methods=constant_methods, - # do not alter name for custom op - compile_config=EdgeCompileConfig(_use_edge_ops=False), - ) - # restore meta losed in generating EdgeProgramManager - for graph_name in graph_names: - for n in edge_prog_mgr._edge_programs[graph_name].graph.nodes: - if graph_name in n.name: - n.meta[OpContextLoader.meta_ctx_bin] = binary_info - break - - edge_prog_mgr = edge_prog_mgr.to_backend(QnnPartitioner(compiler_specs)) - exec_prog = edge_prog_mgr.to_executorch( - config=backend_config or ExecutorchBackendConfig() - ) - return exec_prog, bundle_progs - - -def generate_composite_llama_program( - llama_model: torch.nn.Module, - graph_names: List[str], - sample_inputs_list: List[Tuple[Any]], - lower_module_dict: Dict[str, List[LoweredBackendModule]], - call_delegate_node_name_dict: Dict[str, List[str]], - call_delegate_inputs_dict: Dict[str, List[Tuple[str, int | None]]], - outputs_dict: Dict[str, List[Tuple[str, int]]], - embedding_quantize: str, - backend_config: ExecutorchBackendConfig = None, - constant_methods: Optional[Dict[str, Any]] = None, -) -> ExecutorchProgramManager: - class CompositeLlamaModule(torch.nn.Module): - def __init__( - self, - llama_model, - lower_module_list, - call_delegate_node_name_list, - call_delegate_inputs_list, - outputs_list, - embedding_quantize, - ) -> None: - super().__init__() - self.llama_model = llama_model - self.lower_module_list = lower_module_list - self.call_delegate_node_name_list = call_delegate_node_name_list - self.call_delegate_inputs_list = call_delegate_inputs_list - self.outputs_list = outputs_list - self.embedding_quantize = embedding_quantize - - def reorder( - self, - call_delegate_inputs: List[Tuple[str, int | None]], - module_inputs: dict[str, torch.Tensor], - all_ret: dict[str, torch.Tensor], - ) -> Tuple[torch.Tensor]: - ret = [] - for name, index in call_delegate_inputs: - if index is not None: - # Get tensor from previous results - ret.append(all_ret[name][index]) - else: - # Get tensor from the inputs of module - 
ret.append(module_inputs[name]) - return tuple(ret) - - def forward( - self, - tokens: torch.Tensor, - atten_mask: torch.Tensor, - input_pos: Optional[torch.Tensor] = None, - *args, - ) -> Tuple[torch.Tensor]: - all_ret = {} - module_input_dict = { - "tokens": tokens, - "atten_mask": atten_mask, - "input_pos": input_pos, - } - for num, arg in enumerate(args): - module_input_dict[f"args_{num}"] = arg - - if self.embedding_quantize: - hidden_states = self.llama_model.tok_embeddings(tokens) - module_input_dict["quantized_decomposed_embedding_4bit_dtype"] = ( - hidden_states - ) - - for lower_module, call_delegate_node_name, call_delegate_inputs in zip( - self.lower_module_list, - self.call_delegate_node_name_list, - self.call_delegate_inputs_list, - ): - inp = self.reorder(call_delegate_inputs, module_input_dict, all_ret) - ret = lower_module(*inp) - all_ret[call_delegate_node_name] = ret - llama_outputs = [] - for output_src_name, index in self.outputs_list: - llama_outputs.append(all_ret[output_src_name][index]) - return tuple(llama_outputs) - - progs_dict = {} - for graph_name, sample_inputs in zip(graph_names, sample_inputs_list): - composite_llama_module = CompositeLlamaModule( - llama_model, - lower_module_dict[graph_name], - call_delegate_node_name_dict[graph_name], - call_delegate_inputs_dict[graph_name], - outputs_dict[graph_name], - embedding_quantize, - ) - prog = torch.export.export(composite_llama_module, sample_inputs, strict=True) - progs_dict[graph_name] = prog - # leverage ExecutorchProgramManager for generating pte with multi-methods - edge_prog_mgr = to_edge( - progs_dict, - constant_methods=constant_methods, - # do not alter name for custom op - compile_config=EdgeCompileConfig(_check_ir_validity=False, _use_edge_ops=False), - ) - exec_prog = edge_prog_mgr.to_executorch( - config=backend_config or ExecutorchBackendConfig() - ) - return exec_prog - - def generate_htp_compiler_spec( use_fp16: bool, use_dlbc: bool = False, use_multi_contexts: bool = False, + use_weight_sharing: bool = False, ) -> QnnExecuTorchBackendOptions: """ Helper function generating backend options for QNN HTP @@ -1081,6 +869,8 @@ def generate_htp_compiler_spec( use_multi_contexts: When multiple contexts are generated inside the same pte, it is possible to reserve a single spill-fill allocation that could be re-used across all the splits. + use_weight_sharing: Used with multiple_graphs, where model size will be + reduced when operations have the same weights across multiple graphs. Returns: QnnExecuTorchHtpBackendOptions: backend options for QNN HTP. @@ -1096,6 +886,7 @@ def generate_htp_compiler_spec( # TODO: enable voting mechanism in runtime and make this as an option htp_options.performance_mode = QnnExecuTorchHtpPerformanceMode.kHtpBurst htp_options.use_multi_contexts = use_multi_contexts + htp_options.use_weight_sharing = use_weight_sharing htp_options.use_dlbc = use_dlbc return QnnExecuTorchBackendOptions( backend_type=QnnExecuTorchBackendType.kHtpBackend, @@ -1114,8 +905,6 @@ def generate_qnn_executorch_compiler_spec( optrace: bool = False, shared_buffer: bool = False, is_from_context_binary: bool = False, - multiple_graphs: bool = False, - weight_sharing: bool = False, graph_name: str = "forward", ) -> List[CompileSpec]: """ @@ -1144,10 +933,7 @@ def generate_qnn_executorch_compiler_spec( shared_buffer: Enables usage of shared buffer between application and backend for graph I/O. is_from_context_binary: True if current graph comes from pre-built context binary. 
- multiple_graphs: True if multiple methods are expected to have in single .pte file. - Please see test cases for post-processing example. - weight_sharing: Used with multiple_graphs, where model size will be reduced when operations have the same weights across multiple graphs. - graph_name: Assign unique graph name if 'multiple_graphs' is used. + graph_name: Assign unique graph name if lowering multiple methods. Returns: List[CompileSpec]: Compiler specs for Qualcomm AI Engine Direct. @@ -1167,16 +953,10 @@ def generate_qnn_executorch_compiler_spec( stacklevel=1, ) - if weight_sharing and not multiple_graphs: - warnings.warn( - "Weight sharing is intended for multiple graphs scenario, please ensure if there are multiple graphs", - stacklevel=1, - ) - qnn_executorch_options = QnnExecuTorchOptions( _soc_info_table[soc_model], backend_options ) - qnn_executorch_options.graph_name = graph_name + qnn_executorch_options.graph_name = [graph_name] qnn_executorch_options.log_level = ( QnnExecuTorchLogLevel.kLogLevelDebug if debug @@ -1212,15 +992,6 @@ def generate_qnn_executorch_compiler_spec( qnn_executorch_options.shared_buffer = shared_buffer qnn_executorch_options.online_prepare = online_prepare qnn_executorch_options.is_from_context_binary = is_from_context_binary - qnn_executorch_options.multiple_graphs = multiple_graphs - - if multiple_graphs: - # enable weight sharing mechanism if multiple graphs appear - if ( - backend_options.backend_type == QnnExecuTorchBackendType.kHtpBackend - and weight_sharing - ): - backend_options.htp_options.use_weight_sharing = True return [ CompileSpec(QCOM_QNN_COMPILE_SPEC, option_to_flatbuffer(qnn_executorch_options)) diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp index 83478bd8e68..27ccd9fe3e9 100644 --- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp +++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp @@ -481,12 +481,6 @@ int main(int argc, char** argv) { ++inference_index; } - ET_LOG( - Info, - "%d inference took %f ms, avg %f ms", - inference_index, - elapsed_time, - elapsed_time / inference_index); } else { // if no input is provided, fill the inputs with default values auto inputs = prepare_input_tensors(*method); diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index 375edf9fb6c..47b2eebf518 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -29,7 +29,7 @@ ) from executorch.backends.qualcomm.builders.utils import is_graph_output - +from executorch.backends.qualcomm.partition.qnn_partitioner import QnnPartitioner from executorch.backends.qualcomm.quantizer.custom_annotation import ( annotate_linear_16a8w_in_affine_layer, annotate_matmul_16a8w, @@ -39,20 +39,15 @@ from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype from executorch.backends.qualcomm.serialization.qc_schema import QcomChipset -from executorch.backends.qualcomm.serialization.qc_schema_serialize import ( - flatbuffer_to_option, - option_to_flatbuffer, -) from executorch.backends.qualcomm.utils.constants import ( QCOM_PASS_ACTIVATE_KEY, QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY, QCOM_QUANT_ATTRS_MAP, ) from executorch.backends.qualcomm.utils.utils import ( + capture_program, convert_linear_to_conv2d, - generate_composite_llama_program, generate_htp_compiler_spec, - generate_multi_graph_program, generate_qnn_executorch_compiler_spec, get_soc_to_chipset_map, 
to_edge_transform_and_lower_to_qnn, @@ -73,6 +68,11 @@ setup_common_args_and_variables, SimpleADB, ) +from executorch.exir import EdgeProgramManager +from executorch.exir.backend.backend_api import ( + MethodProgramsPartitionerSpec, + to_backend, +) from executorch.exir.capture._config import ExecutorchBackendConfig from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass @@ -678,7 +678,9 @@ def permute(w, heads): llama_instace.inputs for llama_instace in llama_instance_list ] backend_options = generate_htp_compiler_spec( - use_fp16=use_fp16, use_multi_contexts=args.num_sharding > 1 + use_fp16=use_fp16, + use_multi_contexts=args.num_sharding > 1, + use_weight_sharing=not args.enable_x86_64, # x86 emulator does not support weight sharing ) graph_names = ["kv_forward", "prefill_forward"] compiler_specs = [ @@ -686,45 +688,49 @@ def permute(w, heads): soc_model=get_soc_to_chipset_map()[args.model], backend_options=backend_options, shared_buffer=args.shared_buffer, - multiple_graphs=True, - weight_sharing=not args.enable_x86_64, # x86 emulator does not support weight sharing graph_name=graph_name, ) for graph_name in graph_names ] - skip_node_op_set = {"llama.fallback.default"} - edge_prog_mgrs = [ - to_edge_transform_and_lower_to_qnn( - llama_instance.llama_graph_module, - sample_input, - compile_spec, + + # TODO: retire capture_program once we figure out how to extract + # intermediate graph from official lowering API + edge_progs = { + graph_name: capture_program( + module=llama_instance.llama_graph_module, + inputs=sample_input, dep_table=llama_instance.dep_table, passes_job=llama_instance.passes_job, - skip_node_op_set=skip_node_op_set, - ) - for llama_instance, sample_input, compile_spec in zip( - llama_instance_list, sample_inputs_list, compiler_specs + ).exported_program + for graph_name, llama_instance, sample_input in zip( + graph_names, llama_instance_list, sample_inputs_list ) - ] - for n in edge_prog_mgrs[0].exported_program().graph.nodes: + } + for n in edge_progs[graph_names[0]].graph.nodes: if n.op == "output": for node, output_encoding in n.meta[QCOM_QUANT_ATTRS_MAP].items(): if node.meta["val"].size() in llama_instance_list[0].io_shape: quant_attrs = output_encoding - if args.num_sharding > 1: - max_sf_size = update_spill_fill_size( - [edge_prog_mgr.exported_program() for edge_prog_mgr in edge_prog_mgrs] + partitioners = { + graph_name: QnnPartitioner( + compiler_spec, skip_node_op_set={"llama.fallback.default"} ) - qnn_executorch_options = flatbuffer_to_option(compiler_specs[0][0].value) - qnn_executorch_options.backend_options.htp_options.max_sf_buf_size = ( - max_sf_size - ) - compiler_specs[0][0].value = option_to_flatbuffer(qnn_executorch_options) + for graph_name, compiler_spec in zip(graph_names, compiler_specs) + } + + lowered_ep_dict = to_backend( + MethodProgramsPartitionerSpec(edge_progs, partitioners) + ) + + if args.num_sharding > 1: + # TODO: add arg parser of spill_fill_size since weight-sharing based + # context binaries cannot be opened in x86 host + pass if args.verbose: - for edge_prog_mgr in edge_prog_mgrs: - print_delegation_info(edge_prog_mgr.exported_program().graph_module) + for ep in lowered_ep_dict.values(): + print_delegation_info(ep.graph_module) executorch_config = ExecutorchBackendConfig( # For shared buffer, user must pass the memory address @@ -737,91 +743,13 @@ def permute(w, heads): ), extract_delegate_segments=True, ) + exec_prog_mgr = EdgeProgramManager( + 
edge_programs=lowered_ep_dict, + constant_methods=llama_instance_list[1].llama_meta, + ).to_executorch(executorch_config) - bundle_progs_list = [] - lower_module_dict = {name: [] for name in graph_names} - call_delegate_inputs_dict = {name: [] for name in graph_names} - call_delegate_node_name_dict = {name: [] for name in graph_names} - outputs_dict = {name: [] for name in graph_names} - input_nodes_dict = {name: [] for name in graph_names} - for prog, graph_name in zip(edge_prog_mgrs, graph_names): - for node in prog.exported_program().graph_module.graph.nodes: - if ( - node.op == "call_function" - and "executorch_call_delegate" in node.name - ): - call_delegate_node_name_dict[graph_name].append(node.name) - call_delegate_inputs_list = [] - for arg in node.args: - if arg.op == "call_function": - if ( - arg.target - == exir_ops.edge.quantized_decomposed.embedding_4bit.dtype - ): - call_delegate_inputs_list.append((arg.name, None)) - else: - while "getitem" not in arg.name: - arg = arg.args[0] - call_delegate_inputs_list.append( - (arg.args[0].name, arg.args[1]) - ) - elif arg.op == "placeholder": - call_delegate_inputs_list.append((arg.name, None)) - # No extra needs to do for get_attr node - call_delegate_inputs_dict[graph_name].append( - call_delegate_inputs_list - ) - elif node.op == "output": - for arg in node.args[0]: - outputs_dict[graph_name].append((arg.args[0].name, arg.args[1])) - for num in range(args.num_sharding - 1, -1, -1): - processed_bytes = [] - for prog, graph_name in zip(edge_prog_mgrs, graph_names): - processed_bytes.append( - getattr( - prog.exported_program().graph_module, f"lowered_module_{num}" - ).processed_bytes - ) - call_delegate_node = [ - list(node.users.keys())[0] - for node in prog.exported_program().graph_module.graph.nodes - if node.op == "get_attr" and node.name == f"lowered_module_{num}" - ] - input_nodes_dict[graph_name] = [ - node - for node in call_delegate_node[0].args - if node.op == "placeholder" - or node.target - == exir_ops.edge.quantized_decomposed.embedding_4bit.dtype - ] - prog_mgr, bundle_progs = generate_multi_graph_program( - compiler_specs=compiler_specs[0], - processed_bytes=processed_bytes, - input_nodes_dict=input_nodes_dict, - backend_config=executorch_config, - constant_methods=llama_instance_list[0].llama_meta, # kv method meta - ) - bundle_progs_list.append(bundle_progs) - for graph_name in graph_names: - lower_module_dict[graph_name].append( - prog_mgr.exported_program(graph_name).graph_module._modules.get( - "lowered_module_0" - ) - ) - exec_prog = generate_composite_llama_program( - llama_model=llama_instance_list[1].llama_model, - graph_names=graph_names, - sample_inputs_list=sample_inputs_list, - lower_module_dict=lower_module_dict, - call_delegate_node_name_dict=call_delegate_node_name_dict, - call_delegate_inputs_dict=call_delegate_inputs_dict, - outputs_dict=outputs_dict, - embedding_quantize=args.embedding_quantize, - backend_config=executorch_config, - constant_methods=llama_instance_list[1].llama_meta, # kv method meta - ) with open(f"{args.artifact}/{pte_filename}.pte", "wb") as file: - exec_prog.write_to_file(file) + exec_prog_mgr.write_to_file(file) end_lowering_ts = time.time() logging.info(f"Time for compiling: {end_lowering_ts - start_lowering_ts}") diff --git a/examples/qualcomm/utils.py b/examples/qualcomm/utils.py index 542739a2898..670cdde12ad 100755 --- a/examples/qualcomm/utils.py +++ b/examples/qualcomm/utils.py @@ -58,8 +58,6 @@ class SimpleADB: runner (str): Runtime executor binary expected_input_shape 
(Tuple[torch.Size]): input shape of dynamic graph expected_output_shape (Tuple[torch.Size]): output shape of dynamic graph - expected_input_dtype (Tuple[torch.dtype]): input dtype - expected_output_sdtype (Tuple[torch.dtype]): output dtype """ def __init__( diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py index 310e5ea9379..0b8dac741a5 100644 --- a/exir/backend/backend_api.py +++ b/exir/backend/backend_api.py @@ -7,6 +7,7 @@ import copy import logging +import operator from contextlib import contextmanager, nullcontext from dataclasses import dataclass from functools import singledispatch @@ -204,11 +205,36 @@ def _insert_lowered_submodule( owning_graph_module = call_submodule_node.graph.owning_module # call delegate args should only use user_inputs call_delegate_args = [] + # handle getitem node in multi-method scenario + call_submodule_inputs = [] + for inp_node in call_submodule_node.all_input_nodes: + if inp_node.target == operator.getitem: + # it could be an executorch_call_delegate node or a submodule to be replaced + subgraph = ( + # get owning_module of lowered_module node + getattr( + inp_node.args[0].all_input_nodes[0].graph.owning_module, + inp_node.args[0].all_input_nodes[0].name, + ).original_module + if inp_node.args[0].target + == torch._higher_order_ops.executorch_call_delegate + # get owning_module of submodule node + else getattr( + inp_node.args[0].graph.owning_module, + inp_node.args[0].all_input_nodes[0].name, + ) + ) + output_node = [ + node for node in subgraph.graph.nodes if node.name == "output" + ][0] + call_submodule_inputs.append(output_node.all_input_nodes[inp_node.args[1]]) + else: + call_submodule_inputs.append(inp_node) # Preserve input order as user_inputs for inp_name in submodule_program.graph_signature.user_inputs: - for inp_node in call_submodule_node.all_input_nodes: + for i, inp_node in enumerate(call_submodule_inputs): if inp_node.name == inp_name: - call_delegate_args.append(inp_node) + call_delegate_args.append(call_submodule_node.all_input_nodes[i]) break def generate_debug_handle(ep: ExportedProgram) -> int: @@ -325,6 +351,9 @@ def _partition_and_lower_one_graph_module( toplevel_output_specs_to_delete, ) + # perform validation here to make sure all the delegated submodules are gone + # validate inside _insert_lowered_submodule will break multi-method scenario + owning_program._validate() return tagged_graph_module @@ -569,7 +598,11 @@ def lower_all_submodules_to_backend( # The created exported program for the submodules are in the call_module node's meta data # We just map the method_to_submodule_nodes directly to the method_to_partitioned_exported_programs method_to_partitioned_program = { - method_name: [node.meta["submodule_program"] for node in call_submodule_nodes] + method_name: [ + # perform deep copy here in case backends change graph inside preprocess method + copy.deepcopy(node.meta["submodule_program"]) + for node in call_submodule_nodes + ] for method_name, call_submodule_nodes in method_to_submodules_nodes.items() } method_to_compile_specs = { @@ -627,6 +660,10 @@ def lower_all_submodules_to_backend( toplevel_output_specs_to_delete, ) + # perform validation here to make sure all the delegated submodules are gone + # validate inside _insert_lowered_submodule will break multi-method scenario + owning_program._validate() + @dataclass class MethodProgramsPartitionerSpec: diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py index 78b031a238e..8bbf5a7a960 100644 --- 
a/exir/lowered_backend_module.py +++ b/exir/lowered_backend_module.py @@ -958,5 +958,3 @@ def _unsafe_adjust_original_program( # noqa: C901 if user_idx > idx: user.args = (user.args[0], user_idx - (len(getitem_idxs) - i)) break - - original_program._validate() From 5f976652ba9e3430e3f283f2951495eb3f4868fd Mon Sep 17 00:00:00 2001 From: haowhsu-quic Date: Thu, 1 May 2025 00:08:05 +0800 Subject: [PATCH 2/5] add call_module op check --- exir/backend/backend_api.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py index 0b8dac741a5..3720087a113 100644 --- a/exir/backend/backend_api.py +++ b/exir/backend/backend_api.py @@ -208,7 +208,10 @@ def _insert_lowered_submodule( # handle getitem node in multi-method scenario call_submodule_inputs = [] for inp_node in call_submodule_node.all_input_nodes: - if inp_node.target == operator.getitem: + if inp_node.target == operator.getitem and ( + inp_node.args[0].target == torch._higher_order_ops.executorch_call_delegate + or inp_node.args[0].op == "call_module" + ): # it could be an executorch_call_delegate node or a submodule to be replaced subgraph = ( # get owning_module of lowered_module node From 3624778c8f0a67ed30746d0e67a46c31beba19cd Mon Sep 17 00:00:00 2001 From: haowhsu-quic Date: Thu, 1 May 2025 08:43:36 +0800 Subject: [PATCH 3/5] add is_submodule check --- exir/backend/backend_api.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py index 3720087a113..06d0c566e06 100644 --- a/exir/backend/backend_api.py +++ b/exir/backend/backend_api.py @@ -227,9 +227,8 @@ def _insert_lowered_submodule( inp_node.args[0].all_input_nodes[0].name, ) ) - output_node = [ - node for node in subgraph.graph.nodes if node.name == "output" - ][0] + output_node = list(subgraph.graph.nodes)[-1] + assert output_node.op == "output" call_submodule_inputs.append(output_node.all_input_nodes[inp_node.args[1]]) else: call_submodule_inputs.append(inp_node) @@ -356,7 +355,8 @@ def _partition_and_lower_one_graph_module( # perform validation here to make sure all the delegated submodules are gone # validate inside _insert_lowered_submodule will break multi-method scenario - owning_program._validate() + if not is_submodule: + owning_program._validate() return tagged_graph_module @@ -665,7 +665,8 @@ def lower_all_submodules_to_backend( # perform validation here to make sure all the delegated submodules are gone # validate inside _insert_lowered_submodule will break multi-method scenario - owning_program._validate() + if not is_submodule: + owning_program._validate() @dataclass From 8468fa73bda7b9f7c20c1fd3ddef80a1b3d0e82c Mon Sep 17 00:00:00 2001 From: haowhsu-quic Date: Thu, 1 May 2025 10:15:53 +0800 Subject: [PATCH 4/5] rebase QNN IR PR --- backends/qualcomm/CMakeLists.txt | 7 +- .../qualcomm/aot/python/PyQnnManagerAdaptor.h | 1 - backends/qualcomm/runtime/QnnManager.cpp | 121 ------------------ backends/qualcomm/runtime/QnnManager.h | 1 - .../runtime/backends/QnnBackendCache.cpp | 13 -- .../runtime/backends/QnnCustomProtocol.cpp | 2 +- .../irbackend/x86_64/QnnDlcManager.cpp | 10 +- backends/qualcomm/tests/test_qnn_delegate.py | 6 + 8 files changed, 17 insertions(+), 144 deletions(-) diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index 1b7c8891a4e..37e814d0679 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -153,12 +153,12 @@ 
target_link_libraries(qnn_executorch_logging PRIVATE qnn_schema) target_link_libraries(qnn_profiler PRIVATE qnn_executorch_logging) target_link_libraries(qnn_logger PRIVATE qnn_implementation ${android_log}) target_link_libraries(qnn_backend PRIVATE qnn_implementation qnn_logger) -target_link_libraries(qnn_custom_protocol PRIVATE qcir_utils) +target_link_libraries(qnn_custom_protocol PRIVATE qnn_logger) target_link_libraries( qnn_device PRIVATE qnn_executorch_logging qnn_implementation qnn_logger ) target_link_libraries( - qnn_backend_cache PRIVATE qnn_sys_implementation qcir_utils + qnn_backend_cache PRIVATE qnn_sys_implementation ) target_link_libraries( qnn_context PRIVATE qnn_implementation qnn_logger qnn_backend qnn_device @@ -184,7 +184,7 @@ target_link_libraries( ) target_link_libraries( qnn_executorch_backend PRIVATE qnn_executorch_header qnn_schema qnn_manager - executorch_core qcir_utils extension_tensor + executorch_core extension_tensor ) set_target_properties( qnn_executorch_backend PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" @@ -243,7 +243,6 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") qnn_manager qnn_executorch_header executorch - qcir_utils extension_tensor ) target_link_libraries( diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h index 66fd41721c6..409ec1a4294 100644 --- a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h +++ b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h @@ -6,7 +6,6 @@ * LICENSE file in the root directory of this source tree. */ #pragma once -#include #include #include #include diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp index 6850a92fdc6..600bc072b06 100644 --- a/backends/qualcomm/runtime/QnnManager.cpp +++ b/backends/qualcomm/runtime/QnnManager.cpp @@ -6,7 +6,6 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include #include #include #include @@ -572,126 +571,6 @@ Error QnnManager::CompileDlc() { return Error::Ok; } -Error QnnManager::CompileQcir() { - QnnQcirCustomProtocol qnn_qcir_custom_protocol; - auto [status, qcir_fbs_size, tensor_size, qcir_fbs_ptr, tensor_ptr] = - qnn_qcir_custom_protocol.DeserializeQcirCustomBuffer( - qnn_context_blob_.buffer); - - if (status != Error::Ok) { - QNN_EXECUTORCH_LOG_ERROR("Failed to verify QnnQcirCustomProtocol"); - return Error::Internal; - } - - auto context = qcir::GetContext(qcir_fbs_ptr); - for (const auto& graph : *context->graphs()) { - // qcir tensors to TensorWrapper - std::vector> graph_inputs, graph_outputs, - tensors; - for (const auto& tensor : *graph->tensors()) { - tensors.emplace_back(CreateTensorWrapper(ToTensor( - tensor, static_cast(tensor_ptr) + tensor->offset()))); - if (tensor->type() == qcir::TensorType::WRITE) { - graph_inputs.push_back(tensors.back()); - } else if (tensor->type() == qcir::TensorType::READ) { - graph_outputs.push_back(tensors.back()); - } - } - std::vector> op_wrappers; - // qcir graph node to OpWrapper - for (const auto& node : *graph->nodes()) { - std::shared_ptr op = std::make_shared( - node->name()->str(), - node->package_name()->str(), - node->type_name()->str()); - - // qcir input tensors to OpWrapper input tensors - std::vector> inputs; - for (uint32_t index : *node->inputs()) { - inputs.push_back(tensors[index]); - } - op->AddInputTensors(inputs); - - // qcir output tensors to OpWrapper output tensors - std::vector> outputs; - for (uint32_t index : *node->outputs()) { - outputs.push_back(tensors[index]); - } - op->AddOutputTensors(outputs); - - // qcir operator param to OpWrapper param - for (uint32_t index : *node->params()) { - const auto& tensor = graph->tensors()->Get(index); - std::string name = tensor->name()->str(); - Qnn_DataType_t dtype = ToDataType(tensor->dtype()); - const uint8_t* data_ptr = - static_cast(tensor_ptr) + tensor->offset(); - if (tensor->shape()->size() != 0) { - // add tensor param - op->AddTensorParam( - name, - dtype, - tensor->shape()->size(), - tensor->shape()->data(), - data_ptr); - } else { - // add scalar param - switch (dtype) { - case Qnn_DataType_t::QNN_DATATYPE_INT_32: - op->AddScalarParam( - name, dtype, *reinterpret_cast(data_ptr)); - break; - case Qnn_DataType_t::QNN_DATATYPE_INT_16: - op->AddScalarParam( - name, dtype, *reinterpret_cast(data_ptr)); - break; - case Qnn_DataType_t::QNN_DATATYPE_INT_8: - op->AddScalarParam(name, dtype, static_cast(*data_ptr)); - break; - case Qnn_DataType_t::QNN_DATATYPE_UINT_32: - op->AddScalarParam( - name, dtype, *reinterpret_cast(data_ptr)); - break; - case Qnn_DataType_t::QNN_DATATYPE_UINT_16: - op->AddScalarParam( - name, dtype, *reinterpret_cast(data_ptr)); - break; - case Qnn_DataType_t::QNN_DATATYPE_UINT_8: - op->AddScalarParam(name, dtype, *data_ptr); - break; - case Qnn_DataType_t::QNN_DATATYPE_FLOAT_32: - case Qnn_DataType_t::QNN_DATATYPE_FLOAT_16: - op->AddScalarParam( - name, dtype, *reinterpret_cast(data_ptr)); - break; - case Qnn_DataType_t::QNN_DATATYPE_BOOL_8: - op->AddScalarParam(name, dtype, *data_ptr); - break; - default: - QNN_EXECUTORCH_LOG_ERROR( - "Invalid scalar type: %s", tensor->name()->c_str()); - break; - } - } - } - op_wrappers.emplace_back(std::move(op)); - } - ET_CHECK_OR_RETURN_ERROR( - Compile(graph->name()->str(), op_wrappers) == Error::Ok, - Internal, - "Fail to compile graph from qcir with graph_name: %s", - graph->name()->str().c_str()); - ET_CHECK_OR_RETURN_ERROR( - 
AllocateTensor(graph->name()->str(), graph_inputs, graph_outputs) == - Error::Ok, - Internal, - "Fail to allocate tensor for qcir with graph_name: %s", - graph->name()->str().c_str()); - } - - return Error::Ok; -} - Error QnnManager::Compile( const std::string& graph_name, std::vector>& op_wrappers) { diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h index 77412a184ff..c01a537f7bd 100644 --- a/backends/qualcomm/runtime/QnnManager.h +++ b/backends/qualcomm/runtime/QnnManager.h @@ -67,7 +67,6 @@ class QnnManager { executorch::runtime::Error GetContextBinary( QnnExecuTorchContextBinary& qnn_executorch_context_binary); - executorch::runtime::Error CompileQcir(); executorch::runtime::Error CompileDlc(); executorch::runtime::Error Compile( const std::string& graph_name, diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp index 5cfe783c6f0..4387d61ab7c 100644 --- a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp +++ b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp @@ -6,7 +6,6 @@ * LICENSE file in the root directory of this source tree. */ -#include #include #include namespace executorch { @@ -129,18 +128,6 @@ Error QnnBackendCache::Configure(const std::vector& graph_names) { qnn_context_blob_.nbytes); if (status == Error::Internal) { - auto [status, qcir_fbs_size, _, qcir_fbs_ptr, __] = - QnnQcirCustomProtocol().DeserializeQcirCustomBuffer( - qnn_context_blob_.buffer); - if (status == Error::Ok) { - // first stage of multi graph - state_ = MULTI_GRAPH; - auto context = qcir::GetContext(qcir_fbs_ptr); - for (const auto& graph : *context->graphs()) { - graph_names_.emplace_back(graph->name()->str()); - } - return Error::Ok; - } // online prepare state_ = ONLINE_PREPARE; } diff --git a/backends/qualcomm/runtime/backends/QnnCustomProtocol.cpp b/backends/qualcomm/runtime/backends/QnnCustomProtocol.cpp index 6bf65f59286..12de1b3e705 100644 --- a/backends/qualcomm/runtime/backends/QnnCustomProtocol.cpp +++ b/backends/qualcomm/runtime/backends/QnnCustomProtocol.cpp @@ -6,13 +6,13 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include #include namespace executorch { namespace backends { namespace qnn { +// we still need this for on-device op validation of other backends void QnnQcirCustomProtocol::BuildQcirCustomBuffer( const QnnExecuTorchContextBinary& qcir_binary, const std::vector& tensor_data) { diff --git a/backends/qualcomm/runtime/backends/irbackend/x86_64/QnnDlcManager.cpp b/backends/qualcomm/runtime/backends/irbackend/x86_64/QnnDlcManager.cpp index 14b9aeadf3a..bd54a078ef7 100644 --- a/backends/qualcomm/runtime/backends/irbackend/x86_64/QnnDlcManager.cpp +++ b/backends/qualcomm/runtime/backends/irbackend/x86_64/QnnDlcManager.cpp @@ -41,8 +41,7 @@ Error QnnDlcManager::Create() { std::make_unique(qnn_loaded_backend_, logger_.get()); backend_params_ptr_->qnn_backend_cache_ptr_ = - std::make_unique( - qnn_context_blob_, options_->graph_name()->str()); + std::make_unique(qnn_context_blob_); backend_params_ptr_->qnn_context_ptr_ = std::make_unique( qnn_loaded_backend_, @@ -64,8 +63,13 @@ Error QnnDlcManager::Create() { Error QnnDlcManager::Configure() { ET_CHECK_OR_RETURN_ERROR( backend_params_ptr_ != nullptr, Internal, "Failed to load Qnn backend."); + std::vector graph_names; + for (auto name : *options_->graph_name()) { + graph_names.emplace_back(name->str()); + } ET_CHECK_OR_RETURN_ERROR( - backend_params_ptr_->qnn_backend_cache_ptr_->Configure() == Error::Ok, + backend_params_ptr_->qnn_backend_cache_ptr_->Configure(graph_names) == + Error::Ok, Internal, "Fail to configure Qnn backend cache"); ET_CHECK_OR_RETURN_ERROR( diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index e1dfe3295dd..7d0a360d298 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -2535,6 +2535,9 @@ def test_qnn_backend_shared_buffer(self): ) def test_qnn_backend_online_prepare(self): + if self.enable_x86_64: + self.skipTest("TODO: add online_prepare support on host platform") + backend_options = generate_htp_compiler_spec(use_fp16=True) TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( soc_model=self.chipset_table[TestQNN.model], @@ -3187,6 +3190,9 @@ def test_qnn_backend_shared_buffer(self): ) def test_qnn_backend_online_prepare(self): + if self.enable_x86_64: + self.skipTest("TODO: add online_prepare support on host platform") + backend_options = generate_htp_compiler_spec(use_fp16=False) TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( soc_model=self.chipset_table[TestQNN.model], From 30883fde362e7951391f930b7eb181017e88dbab Mon Sep 17 00:00:00 2001 From: haowhsu-quic Date: Thu, 1 May 2025 18:17:03 +0800 Subject: [PATCH 5/5] chenge validation logic --- exir/backend/backend_api.py | 14 +++++--------- exir/lowered_backend_module.py | 4 ++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py index 06d0c566e06..54592478198 100644 --- a/exir/backend/backend_api.py +++ b/exir/backend/backend_api.py @@ -201,6 +201,7 @@ def _insert_lowered_submodule( is_submodule: bool, toplevel_input_specs_to_delete: Dict[str, InputSpec], toplevel_output_specs_to_delete: Dict[str, OutputSpec], + validate_program: bool = True, ): owning_graph_module = call_submodule_node.graph.owning_module # call delegate args should only use user_inputs @@ -275,6 +276,7 @@ def generate_debug_handle(ep: ExportedProgram) -> int: call_delegate_node, toplevel_input_specs_to_delete, toplevel_output_specs_to_delete, + validate_program, ) @@ -353,10 +355,6 @@ def 
_partition_and_lower_one_graph_module( toplevel_output_specs_to_delete, ) - # perform validation here to make sure all the delegated submodules are gone - # validate inside _insert_lowered_submodule will break multi-method scenario - if not is_submodule: - owning_program._validate() return tagged_graph_module @@ -661,13 +659,11 @@ def lower_all_submodules_to_backend( is_submodule, toplevel_input_specs_to_delete, toplevel_output_specs_to_delete, + # validate only when all submodules are processed + validate_program=call_submodule_node + == list_of_call_submodule_nodes[-1], ) - # perform validation here to make sure all the delegated submodules are gone - # validate inside _insert_lowered_submodule will break multi-method scenario - if not is_submodule: - owning_program._validate() - @dataclass class MethodProgramsPartitionerSpec: diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py index 8bbf5a7a960..b1bd1b3164d 100644 --- a/exir/lowered_backend_module.py +++ b/exir/lowered_backend_module.py @@ -862,6 +862,7 @@ def _unsafe_adjust_original_program( # noqa: C901 call_delegate_node: torch.fx.Node, input_specs_to_delete: Dict[str, InputSpec], output_specs_to_delete: Dict[str, OutputSpec], + validate_program: bool, ) -> None: """ Directly modify the original exported program's signature and state dict @@ -958,3 +959,6 @@ def _unsafe_adjust_original_program( # noqa: C901 if user_idx > idx: user.args = (user.args[0], user_idx - (len(getitem_idxs) - i)) break + + if validate_program: + original_program._validate()
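
Note for reviewers (not part of the patch series): the final commit defers ExportedProgram._validate() until the last delegated submodule has been folded back into its owning program, because the intermediate states of a multi-method program are intentionally inconsistent while earlier submodules are still being replaced. The sketch below is a minimal, self-contained illustration of that deferred-validation pattern only; Program, fold_submodule, and the node names are hypothetical stand-ins, not the ExecuTorch API.

    # Sketch only: Program.validate() and fold_submodule() stand in for
    # ExportedProgram._validate() and _unsafe_adjust_original_program().
    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class Program:
        folded: List[str] = field(default_factory=list)

        def validate(self) -> None:
            # In the real flow this would be _validate(); calling it between
            # folds would fail while the graph signature is mid-rewrite.
            print(f"validated after folding {self.folded}")

    def fold_submodule(program: Program, node_name: str, validate_program: bool) -> None:
        # Mirrors _insert_lowered_submodule(..., validate_program=...):
        # mutate first, validate only when the caller marks the program final.
        program.folded.append(node_name)
        if validate_program:
            program.validate()

    call_submodule_nodes = ["submodule_0", "submodule_1", "submodule_2"]
    program = Program()
    for node in call_submodule_nodes:
        # Validate only when all submodules are processed, matching
        # lower_all_submodules_to_backend() in PATCH 5/5.
        fold_submodule(program, node, validate_program=node == call_submodule_nodes[-1])

Compared with the earlier "if not is_submodule: owning_program._validate()" guard from PATCH 3/5, passing validate_program down into _unsafe_adjust_original_program keeps the decision at the call site and ensures the owning program is validated exactly once per lowering pass.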