Skip to content

Commit 2a903f9

Browse files
pytorchbotmcr229
andauthored
[XNNPACK][Weights Cache] Enable in XNNPACK (#9297)
This PR was created by the merge bot to help merge the original PR into the main branch. ghstack PR number: #9155 by @mcr229 ^ Please use this as the source of truth for the PR details, comments, and reviews ghstack PR base: https://github.com/pytorch/executorch/tree/gh/mcr229/11/base ghstack PR head: https://github.com/pytorch/executorch/tree/gh/mcr229/11/head Merge bot PR base: https://github.com/pytorch/executorch/tree/gh/mcr229/10/orig Merge bot PR head: https://github.com/pytorch/executorch/tree/gh/mcr229/11/orig @diff-train-skip-merge --------- Co-authored-by: Max Ren <[email protected]>
1 parent d08d938 commit 2a903f9

File tree

8 files changed

+131
-31
lines changed

8 files changed

+131
-31
lines changed

backends/xnnpack/CMakeLists.txt

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,19 @@ option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE
3737
# Keeping this OFF by default due to regressions in decode and model load with
3838
# kleidi kernels
3939
option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI "Enable Arm Kleidi kernels" OFF)
40+
41+
# Turning this on caches weights between partitions and methods. If weights
42+
# are shared across methods/partitions then this can reduce load time and
43+
# memory usage
44+
45+
# Keeping this off maintains existing behavior. Turning this on serializes
46+
# execution and initialization of delegates, to be revisited
47+
option(EXECUTORCH_XNNPACK_ENABLE_WEIGHT_CACHE
48+
"Enable weights cache to cache and manage all packed weights" OFF)
49+
50+
if(EXECUTORCH_XNNPACK_ENABLE_WEIGHT_CACHE)
51+
add_definitions(-DENABLE_XNNPACK_WEIGHTS_CACHE)
52+
endif()
4053
if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE)
4154
add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE)
4255
endif()

backends/xnnpack/runtime/XNNCompiler.cpp

Lines changed: 60 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111
#include <executorch/backends/xnnpack/serialization/schema_generated.h>
1212
#include <executorch/extension/threadpool/threadpool.h>
1313
#include <executorch/runtime/executor/pte_data_map.h>
14+
#include <string>
1415
#include <unordered_map>
16+
#include <vector>
1517

1618
#pragma clang diagnostic ignored "-Wmissing-prototypes"
1719
#pragma clang diagnostic ignored "-Wglobal-constructors"
@@ -167,7 +169,8 @@ const uint8_t* getConstantDataPtr(
167169
GraphPtr flatbuffer_graph,
168170
const uint8_t* constant_data_ptr,
169171
const NamedDataMap* named_data_map,
170-
std::vector<FreeableBuffer>& loaded_buffers_from_map) {
172+
std::vector<FreeableBuffer>& freeable_buffers,
173+
XNNWeightsCache* weights_cache) {
171174
auto buffer_idx = tensor_value->constant_buffer_idx();
172175
if (buffer_idx) {
173176
if (!constant_data_ptr) {
@@ -187,6 +190,15 @@ const uint8_t* getConstantDataPtr(
187190
return constant_data_ptr + offset;
188191
} else {
189192
const std::string& data_name = constant_data_offset->named_key()->str();
193+
#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
194+
Result<const uint8_t*> data_ptr =
195+
weights_cache->load_unpacked_data(data_name);
196+
if (!data_ptr.ok()) {
197+
ET_LOG(Error, "Failed to load weights from cache");
198+
return nullptr;
199+
}
200+
return data_ptr.get();
201+
#else
190202
Result<FreeableBuffer> buffer =
191203
named_data_map->get_data(data_name.c_str());
192204
if (!buffer.ok()) {
@@ -198,8 +210,9 @@ const uint8_t* getConstantDataPtr(
198210
}
199211
const uint8_t* data_ptr =
200212
static_cast<const uint8_t*>(buffer.get().data());
201-
loaded_buffers_from_map.push_back(std::move(buffer.get()));
213+
freeable_buffers.push_back(std::move(buffer.get()));
202214
return data_ptr;
215+
#endif
203216
}
204217
}
205218
}
@@ -222,7 +235,8 @@ Error defineTensor(
222235
std::vector<uint32_t>& output_ids,
223236
CompileAllocator& allocator,
224237
const NamedDataMap* named_data_map,
225-
std::vector<FreeableBuffer>& loaded_buffers_from_map) {
238+
std::vector<FreeableBuffer>& freeable_buffers,
239+
XNNWeightsCache* weights_cache) {
226240
const fb_xnnpack::XNNTensorValue* tensor_value = nullptr;
227241
const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr;
228242

@@ -264,7 +278,8 @@ Error defineTensor(
264278
flatbuffer_graph,
265279
constant_data_ptr,
266280
named_data_map,
267-
loaded_buffers_from_map);
281+
freeable_buffers,
282+
weights_cache);
268283

269284
xnn_status status;
270285
// The type we might have to convert to
@@ -1999,9 +2014,9 @@ ET_NODISCARD Error XNNCompiler::compileModel(
19992014
const void* buffer_pointer,
20002015
size_t num_bytes,
20012016
XNNExecutor* executor,
2002-
MemoryAllocator* runtime_allocator,
2003-
const NamedDataMap* named_data_map,
2004-
xnn_workspace_t workspace) {
2017+
XNNWeightsCache* weights_cache,
2018+
xnn_workspace_t workspace,
2019+
const NamedDataMap* named_data_map) {
20052020
Result<XNNHeader> header = XNNHeader::Parse(buffer_pointer, num_bytes);
20062021
const uint8_t* flatbuffer_data = nullptr;
20072022
const uint8_t* constant_data = nullptr;
@@ -2065,11 +2080,14 @@ ET_NODISCARD Error XNNCompiler::compileModel(
20652080
// Invalid ids do not need to be remapped
20662081
remapped_ids.emplace(XNN_INVALID_VALUE_ID, XNN_INVALID_VALUE_ID);
20672082

2083+
// If weight cache is not on we hold onto all the unpacked buffers
2084+
// and we free them at the end
2085+
std::vector<FreeableBuffer> unpacked_buffers;
2086+
20682087
// External Ids for inputs and outputs
20692088
std::vector<uint32_t> input_ids;
20702089
std::vector<uint32_t> output_ids;
20712090
Error err = Error::Ok;
2072-
std::vector<FreeableBuffer> loaded_buffers_from_map;
20732091
for (auto value : *flatbuffer_graph->xvalues()) {
20742092
err = defineTensor(
20752093
subgraph.get(),
@@ -2081,7 +2099,8 @@ ET_NODISCARD Error XNNCompiler::compileModel(
20812099
output_ids,
20822100
compile_allocator,
20832101
named_data_map,
2084-
loaded_buffers_from_map);
2102+
unpacked_buffers,
2103+
weights_cache);
20852104

20862105
if (err != Error::Ok) {
20872106
return err;
@@ -2103,20 +2122,34 @@ ET_NODISCARD Error XNNCompiler::compileModel(
21032122

21042123
xnn_runtime_t runtime_ptr = nullptr;
21052124

2125+
// XNNWeightsCache if weights cache is not enabled, then XNNWeightsCache
2126+
// just manages the unpacked weights until the runtime is created.
2127+
#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
2128+
ET_CHECK_OR_RETURN_ERROR(
2129+
unpacked_buffers.size() == 0,
2130+
Internal,
2131+
"Weight Cache is enabled, which means unpacked buffers should be owned by the cache");
2132+
xnn_weights_cache_t weights_cache_ptr =
2133+
weights_cache->get_num_unpacked_data() > 0 ? weights_cache->get()
2134+
: nullptr;
2135+
#else
2136+
xnn_weights_cache_t weights_cache_ptr = nullptr;
2137+
#endif
2138+
21062139
#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
21072140
ET_CHECK_OR_RETURN_ERROR(
21082141
workspace != nullptr, Internal, "Failed to initialize XNNPACK workspace");
21092142
status = xnn_create_runtime_v4(
21102143
subgraph.get(),
2111-
/*weight_cache=*/nullptr, // TODO - support weight cache
2144+
weights_cache_ptr,
21122145
workspace,
21132146
::executorch::extension::threadpool::get_pthreadpool(),
21142147
runtime_flags,
21152148
&runtime_ptr);
21162149
#else
21172150
status = xnn_create_runtime_v3(
21182151
subgraph.get(),
2119-
/*weight_cache=*/nullptr, // TODO - support weight cache
2152+
weights_cache_ptr,
21202153
::executorch::extension::threadpool::get_pthreadpool(),
21212154
runtime_flags,
21222155
&runtime_ptr);
@@ -2128,10 +2161,25 @@ ET_NODISCARD Error XNNCompiler::compileModel(
21282161
"XNN Runtime creation failed with code: %s",
21292162
xnn_status_to_string(status));
21302163

2164+
#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
2165+
auto packed_weights_names = weights_cache->finalize_for_runtime();
2166+
ET_CHECK_OR_RETURN_ERROR(
2167+
packed_weights_names.ok(),
2168+
Internal,
2169+
"Failed to finalize weights cache after creating the xnn runtime")
2170+
#else
2171+
for (auto& buffer : unpacked_buffers) {
2172+
buffer.Free();
2173+
}
2174+
Result<std::vector<std::string>> packed_weights_names =
2175+
std::vector<std::string>();
2176+
#endif
2177+
21312178
err = executor->initialize( // NOLINT: runtime_ptr is non-null
21322179
runtime_ptr,
21332180
std::move(input_ids),
2134-
std::move(output_ids));
2181+
std::move(output_ids),
2182+
std::move(packed_weights_names.get()));
21352183

21362184
return err;
21372185
};

backends/xnnpack/runtime/XNNCompiler.h

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,9 @@
99
#pragma once
1010

1111
#include <executorch/backends/xnnpack/runtime/XNNExecutor.h>
12+
#include <executorch/backends/xnnpack/runtime/XNNWeightsCache.h>
1213
#include <executorch/runtime/platform/compiler.h>
13-
1414
#include <xnnpack.h>
15-
#include <memory>
16-
#include <vector>
1715

1816
namespace executorch {
1917
namespace backends {
@@ -29,9 +27,9 @@ class XNNCompiler {
2927
const void* buffer_pointer,
3028
size_t num_bytes,
3129
XNNExecutor* executor,
32-
executorch::runtime::MemoryAllocator* runtime_allocator,
33-
const executorch::runtime::NamedDataMap* named_data_map,
34-
xnn_workspace_t workspace);
30+
XNNWeightsCache* weights_cache,
31+
xnn_workspace_t workspace,
32+
const NamedDataMap* named_data_map);
3533
};
3634

3735
} // namespace delegate

backends/xnnpack/runtime/XNNExecutor.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ using executorch::runtime::kTensorDimensionLimit;
3030
ET_NODISCARD Error XNNExecutor::initialize(
3131
xnn_runtime_t runtime,
3232
std::vector<uint32_t>&& input_ids,
33-
std::vector<uint32_t>&& output_ids) {
33+
std::vector<uint32_t>&& output_ids,
34+
std::vector<std::string>&& packed_data_names) {
3435
runtime_ = std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)>(
3536
runtime, xnn_delete_runtime);
3637

@@ -51,6 +52,7 @@ ET_NODISCARD Error XNNExecutor::initialize(
5152
std::sort(output_ids_.begin(), output_ids_.end());
5253

5354
externals_.resize(input_ids_.size() + output_ids_.size());
55+
packed_data_names_ = std::move(packed_data_names);
5456

5557
return Error::Ok;
5658
}

backends/xnnpack/runtime/XNNExecutor.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ class XNNExecutor {
3434
std::vector<uint32_t> input_ids_;
3535
std::vector<uint32_t> output_ids_;
3636
std::vector<xnn_external_value> externals_;
37+
std::vector<std::string> packed_data_names_;
3738

3839
public:
3940
XNNExecutor() = default;
@@ -46,6 +47,10 @@ class XNNExecutor {
4647
return output_ids_.size();
4748
}
4849

50+
inline std::vector<std::string> get_packed_data_names() {
51+
return packed_data_names_;
52+
}
53+
4954
/**
5055
* Initialize the XNNExecutor with a given runtime and input/output ids.
5156
* The input/output ids are expected to be sorted in order of their
@@ -54,7 +59,8 @@ class XNNExecutor {
5459
ET_NODISCARD executorch::runtime::Error initialize(
5560
xnn_runtime_t runtime,
5661
std::vector<uint32_t>&& input_ids,
57-
std::vector<uint32_t>&& output_ids);
62+
std::vector<uint32_t>&& output_ids,
63+
std::vector<std::string>&& packed_data_names);
5864

5965
/**
6066
* Prepares the arguments for runtime graph execution.

backends/xnnpack/runtime/XNNPACKBackend.cpp

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
*/
88

99
#include <executorch/backends/xnnpack/runtime/XNNCompiler.h>
10+
#include <executorch/backends/xnnpack/runtime/XNNWeightsCache.h>
1011
#include <executorch/runtime/backend/interface.h>
1112
#include <executorch/runtime/core/error.h>
1213
#include <executorch/runtime/core/evalue.h>
@@ -20,6 +21,7 @@
2021
namespace executorch {
2122
namespace backends {
2223

24+
using executorch::backends::xnnpack::delegate::XNNWeightsCache;
2325
using executorch::runtime::ArrayRef;
2426
using executorch::runtime::Backend;
2527
using executorch::runtime::BackendExecutionContext;
@@ -81,13 +83,18 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface {
8183
}
8284

8385
const NamedDataMap* named_data_map = context.get_named_data_map();
84-
85-
#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
86-
// This is needed to serialize access to xnn_create_runtime which is not
8786
// thread safe. This can happen when multiple threads call init() on
8887
// the same backend instance.
88+
#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
8989
const std::lock_guard<std::mutex> lock(workspace_mutex_);
9090
#endif
91+
92+
#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
93+
const std::lock_guard<std::mutex> lock_weight_cache(weights_cache_mutex_);
94+
weights_cache_->initialize_for_runtime(
95+
context.get_runtime_allocator(), named_data_map);
96+
#endif
97+
9198
// Executor has been allocated but not constructed, ensure that runtime_ is
9299
// nullptr by constructing it in place here. NOTE: Since we use placement
93100
// new and since this type is not trivially destructible, we must call the
@@ -97,9 +104,9 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface {
97104
processed->data(),
98105
processed->size(),
99106
executor,
100-
context.get_runtime_allocator(),
101-
named_data_map,
102-
workspace_.get());
107+
weights_cache_.get(),
108+
workspace_.get(),
109+
named_data_map);
103110
// This backend does not need its processed data after compiling the model.
104111
processed->Free();
105112

@@ -125,6 +132,10 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface {
125132
const std::lock_guard<std::mutex> lock(workspace_mutex_);
126133
#endif
127134

135+
#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
136+
const std::lock_guard<std::mutex> lock_weights_cache(weights_cache_mutex_);
137+
#endif
138+
128139
// Prepare Inputs/Outputs and Propagate Input Shapes
129140
Error err = executor->prepare_args(args);
130141
if (err != Error::Ok) {
@@ -145,16 +156,24 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface {
145156

146157
void destroy(DelegateHandle* handle) const override {
147158
if (handle != nullptr) {
148-
#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
149159
// This is needed to serialize access to xnn_delete_runtime which is not
150160
// thread safe. This can happen when multiple threads call destroy() on
151161
// the same backend instance.
162+
#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
152163
const std::lock_guard<std::mutex> lock(workspace_mutex_);
153164
#endif
165+
154166
auto executor = static_cast<xnnpack::delegate::XNNExecutor*>(handle);
167+
155168
#ifdef ENABLE_XNNPACK_PROFILING
156169
executor->print_avg_op_timings();
157170
#endif
171+
172+
#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
173+
const std::lock_guard<std::mutex> lock_weights_cache(
174+
weights_cache_mutex_);
175+
weights_cache_->delete_packed_data(executor->get_packed_data_names());
176+
#endif
158177
// XNNExecutor is not trivially destructible. Since this was constructed
159178
// manually in init(), we must destroy it manually here.
160179
executor->~XNNExecutor();
@@ -167,6 +186,15 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface {
167186
std::unique_ptr<xnn_workspace, decltype(&xnn_release_workspace)> workspace_{
168187
nullptr,
169188
&xnn_release_workspace};
189+
190+
// Weights cache is global to all delegate instances.
191+
mutable std::mutex weights_cache_mutex_;
192+
std::unique_ptr<XNNWeightsCache> weights_cache_ =
193+
std::make_unique<XNNWeightsCache>();
194+
195+
// Lock Hierarchy for Mutexes:
196+
// workspace_mutex_
197+
// weights_cache_mutex_
170198
};
171199

172200
namespace {

backends/xnnpack/targets.bzl

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,15 @@ def _get_preprocessor_flags():
66
Disable if someone explicitly specified a config option,
77
else Enable otherwise
88
"""
9-
if native.read_config("executorch", "xnnpack_workspace_sharing", "0") == "0":
10-
return []
9+
preprocessor_flags = []
10+
if native.read_config("executorch", "xnnpack_workspace_sharing", "0") != "0":
11+
preprocessor_flags.append("-DENABLE_XNNPACK_SHARED_WORKSPACE")
12+
13+
if native.read_config("executorch", "xnnpack_weights_cache", "0") != "0":
14+
preprocessor_flags.append("-DENABLE_XNNPACK_WEIGHTS_CACHE")
1115

1216
# Enable if not disabled through config
13-
return ["-DENABLE_XNNPACK_SHARED_WORKSPACE"]
17+
return preprocessor_flags
1418

1519
def define_common_targets():
1620
runtime.cxx_library(

backends/xnnpack/test/runtime/test_xnnexecutor.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@ TEST(XNNExecutorTest, ArgumentWithTooManyDimensions) {
7474
},
7575
{
7676
1,
77-
}),
77+
},
78+
{}),
7879
Error::Ok);
7980
TensorFactory<executorch::aten::ScalarType::Int> tf;
8081
auto input_tensor = tf.make({1, 1, 1, 1, 1, 1, 1, 1, 1}, {42});

0 commit comments

Comments
 (0)