Qualcomm AI Engine Direct - Refactor llama runner #10578

Open · wants to merge 1 commit into base: main
2 changes: 1 addition & 1 deletion backends/qualcomm/runtime/SharedBuffer.cpp
@@ -22,7 +22,7 @@ std::size_t std::hash<CustomMemTensorInfo>::operator()(
hash_val ^= std::hash<size_t>()(info.pos);
hash_val ^= std::hash<size_t>()(info.tensor_bytes);
for (int i = 0; i < info.rank; ++i) {
hash_val ^= info.shape[i];
hash_val ^= std::hash<uint32_t>()(info.shape[i]);
}
hash_val ^= std::hash<uint32_t>()(info.rank);
hash_val ^= std::hash<executorch::aten::ScalarType>()(info.dtype);
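Side note on the hunk above (not part of this PR): folding with plain XOR is commutative, so permuted shapes such as {2, 3, 1} and {3, 2, 1} still produce the same hash even after each element is routed through std::hash. If that ever matters, a boost-style combiner makes the fold order-sensitive. A minimal sketch of such a hypothetical helper:

#include <cstddef>

// Hypothetical boost-style combiner, shown for illustration only: the shift
// and add terms make the result depend on the order in which values are mixed.
inline void hash_combine(std::size_t& seed, std::size_t value) {
  seed ^= value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}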
4 changes: 3 additions & 1 deletion backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
@@ -78,7 +78,9 @@ std::unique_ptr<BackendConfigParameters> QnnBackendFactory::Create(
options->soc_info(),
htp_options);
backend_params->qnn_mem_manager_ptr_ = std::make_unique<QnnMemManager>(
implementation, backend_params->qnn_context_ptr_.get());
implementation,
backend_params->qnn_context_ptr_.get(),
options->log_level());
backend_params->backend_init_state_ = BackendInitializeState::INITIALIZED;
} break;
case QnnExecuTorchBackendType::kGpuBackend:
17 changes: 11 additions & 6 deletions backends/qualcomm/runtime/backends/QnnMemManager.cpp
@@ -47,9 +47,12 @@ Error QnnMemManager::RegisterIonMem(
}
tensor_wrapper->SetMemHandle(handle);
registered_map_.insert({handle, mem_ptr});
QNN_EXECUTORCH_LOG_INFO(
"Tensor %s is successfully registered to ION shared memory.",
tensor_wrapper->GetName().c_str());
if (log_level_ >= QnnExecuTorchLogLevel::kLogLevelInfo) {
QNN_EXECUTORCH_LOG_INFO(
"Tensor %s is successfully registered to ION shared memory.",
tensor_wrapper->GetName().c_str());
}

return Error::Ok;
}

@@ -92,9 +95,11 @@ Error QnnMemManager::RegisterCustomMem(
}
tensor_wrapper->SetMemHandle(handle);
registered_map_.insert({handle, mem_ptr});
QNN_EXECUTORCH_LOG_INFO(
"Tensor %s is successfully registered to custom shared memory.",
tensor_wrapper->GetName().c_str());
if (log_level_ >= QnnExecuTorchLogLevel::kLogLevelInfo) {
QNN_EXECUTORCH_LOG_INFO(
"Tensor %s is successfully registered to custom shared memory.",
tensor_wrapper->GetName().c_str());
}
return Error::Ok;
}
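A possible follow-up to the two gated log sites above (a sketch only, not part of this PR): fold the repeated level check into one helper so RegisterIonMem and RegisterCustomMem stay terse.

// Hypothetical helper; assumes the QnnExecuTorchLogLevel enum and the
// QNN_EXECUTORCH_LOG_INFO macro already visible in this file.
inline void LogRegistrationIfVerbose(
    QnnExecuTorchLogLevel log_level,
    const char* tensor_name,
    const char* memory_kind) {
  if (log_level >= QnnExecuTorchLogLevel::kLogLevelInfo) {
    QNN_EXECUTORCH_LOG_INFO(
        "Tensor %s is successfully registered to %s shared memory.",
        tensor_name,
        memory_kind);
  }
}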

8 changes: 6 additions & 2 deletions backends/qualcomm/runtime/backends/QnnMemManager.h
@@ -21,8 +21,11 @@ class QnnMemManager {
public:
explicit QnnMemManager(
const QnnImplementation& implementation,
QnnContext* context)
: implementation_(implementation), context_(context) {}
QnnContext* context,
QnnExecuTorchLogLevel log_level)
: implementation_(implementation),
context_(context),
log_level_(log_level) {}
~QnnMemManager() {
DeRegisterMem();
}
@@ -63,6 +66,7 @@ class QnnMemManager {

const QnnImplementation& implementation_;
QnnContext* context_;
QnnExecuTorchLogLevel log_level_;
std::unordered_map<Qnn_MemHandle_t, void*> registered_map_;
std::unordered_map<CustomMemTensorInfo, void*> pre_registered_handles_;
std::unordered_map<executorch::aten::ScalarType, Qnn_DataType_t>
2 changes: 1 addition & 1 deletion backends/qualcomm/tests/test_qnn_delegate.py
@@ -3463,7 +3463,7 @@ def test_llama3_2_1b(self):
if self.pre_gen_pte:
cmds.extend(["--pre_gen_pte", self.pre_gen_pte])

golden_start_with = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>"
golden_start_with = "<|start_header_id|>user<|end_header_id|>"
p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
with Listener((self.ip, self.port)) as listener:
conn = listener.accept()
16 changes: 13 additions & 3 deletions examples/qualcomm/oss_scripts/llama/CMakeLists.txt
@@ -28,8 +28,18 @@ list(
${CMAKE_CURRENT_LIST_DIR}/qnn_llama_runner.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/runner.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/runner.h
${CMAKE_CURRENT_LIST_DIR}/runner/io_manager.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/io_manager.h
${CMAKE_CURRENT_LIST_DIR}/runner/decoder_runner.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/decoder_runner.h
${CMAKE_CURRENT_LIST_DIR}/runner/prompt_processor.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/prompt_processor.h
${CMAKE_CURRENT_LIST_DIR}/runner/token_generator.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/token_generator.h
${CMAKE_CURRENT_LIST_DIR}/runner/imem_alloc.h
${CMAKE_CURRENT_LIST_DIR}/runner/client_mem.h
${CMAKE_CURRENT_LIST_DIR}/runner/rpc_mem.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/rpc_mem.h
${CMAKE_CURRENT_LIST_DIR}/runner/kv_manager.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/kv_manager.h
)

list(
@@ -42,7 +52,7 @@ list(
# build qnn llama runner
add_executable(qnn_llama_runner ${_llama_runner__srcs})
target_include_directories(
qnn_llama_runner PUBLIC ${_common_include_directories}
qnn_llama_runner PUBLIC ${_common_include_directories} ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/include
)

target_link_options_shared_lib(quantized_ops_lib)
70 changes: 22 additions & 48 deletions examples/qualcomm/oss_scripts/llama/llama.py
@@ -403,7 +403,7 @@ def quantize(self, quant_dtype, args, tokenizer, custom_annotations=()):
logging.info("Quantizing the model...")
calibrate(
self.get_example_inputs(self.llama_meta["get_use_kv_cache"]),
args.prompt,
args.prompt[0],
fx_graph_module,
tokenizer=tokenizer,
ar_len=self.llama_meta["get_ar_len"],
@@ -828,7 +828,7 @@ def permute(w, heads):
return quant_attrs


def inference(args, quant_attrs, pte_filename, runtime_tokenizer_path, pre_gen_pte=""):
def inference(args, pte_filename, runtime_tokenizer_path, pre_gen_pte=""):
workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/single_llama"

if args.model_mode == "kv":
@@ -854,14 +854,13 @@ def post_process():
outputs.append(f.read())

seq_len = args.max_seq_len
multi_prompts = " ".join([f'--prompt "{prompt}"' for prompt in args.prompt])
runner_args = " ".join(
[
f'--prompt "{args.prompt}"',
multi_prompts,
f"--eval_mode {eval_mode}",
f"--temperature {args.temperature}",
f"--system_prompt '{args.system_prompt}'",
f"--logits_scale {quant_attrs['scale']}",
f"--logits_offset {quant_attrs['zero_point']}",
]
)

@@ -1004,9 +1003,10 @@ def _build_parser():

parser.add_argument(
"--prompt",
help="User prompts for llama.",
help="User prompts for Llama. When multiple prompts are entered, a multi-turn conversation will be initiated. Note that this feature is currently for testing purposes only.",
required=True,
type=str,
nargs="+",
)

parser.add_argument(
@@ -1090,7 +1090,7 @@ def _build_parser():

def export_llama(args) -> None:
if args.compile_only and args.pre_gen_pte:
exit("Cannot set both compile_only and pre_gen_pte as true")
raise RuntimeError("Cannot set both compile_only and pre_gen_pte as true")

if args.model_mode == "kv":
pte_filename = "kv_llama_qnn"
@@ -1126,29 +1126,15 @@ def export_llama(args) -> None:
elif args.kv_updater == "shift_pointer":
args.kv_updater = shift_pointer_updater
else:
exit(f"Using an unkown kv update {args.kv_updater}")
raise RuntimeError(f"Using an unknown kv update {args.kv_updater}")

if args.pre_gen_pte:
quant_attrs = json.load(
open(f"{args.pre_gen_pte}/{pte_filename}_quant_attrs.txt")
)
inference(
args, quant_attrs, pte_filename, runtime_tokenizer_path, args.pre_gen_pte
)
exit(f"Finish the running pre_gen_pte from {args.pre_gen_pte}")
inference(args, pte_filename, runtime_tokenizer_path, args.pre_gen_pte)
print(f"Finish the running pre_gen_pte from {args.pre_gen_pte}")
return

if args.compile_only:
quant_attrs = compile(args, pte_filename, tokenizer)
if quant_attrs:
json.dump(
{
"scale": quant_attrs["scale"],
"zero_point": quant_attrs["zero_point"],
},
open(f"{args.artifact}/{pte_filename}_quant_attrs.txt", "w"),
)
else:
logging.warning("Quant attributes of the logit is None.")
compile(args, pte_filename, tokenizer)

if args.ip and args.port != -1:
pte_path = f"{args.artifact}/{pte_filename}.pte"
@@ -1161,24 +1147,18 @@ def export_llama(args) -> None:
}
)
)
exit(f"Finish compile_only and save to {args.artifact}")
print(f"Finish compile_only and save to {args.artifact}")
return

compile(args, pte_filename, tokenizer)
inference(args, pte_filename, runtime_tokenizer_path)


def main():
parser = _build_parser()
args = parser.parse_args()
try:
quant_attrs = compile(args, pte_filename, tokenizer)
if quant_attrs:
logging.info(
f"Logit scale: {quant_attrs['scale']}; Logit offset: {quant_attrs['zero_point']}"
)
json.dump(
{
"scale": quant_attrs["scale"],
"zero_point": quant_attrs["zero_point"],
},
open(f"{args.artifact}/{pte_filename}_quant_attrs.txt", "w"),
)
else:
logging.warning("Quant attributes of the logit is None.")
inference(args, quant_attrs, pte_filename, runtime_tokenizer_path)
export_llama(args)
except Exception as e:
if args.ip and args.port != -1:
with Client((args.ip, args.port)) as conn:
@@ -1187,12 +1167,6 @@ def export_llama(args) -> None:
raise Exception(e)


def main():
parser = _build_parser()
args = parser.parse_args()
export_llama(args)


# flake8: noqa: C901
if __name__ == "__main__":
main()
51 changes: 40 additions & 11 deletions examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
@@ -34,7 +34,10 @@ DEFINE_string(
"inference_speed.txt",
"Records inference speed. For CI purpose.");
DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff.");
DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt.");
DEFINE_string(
prompt,
"The answer to the ultimate question is",
"User prompts for Llama. When multiple prompts are entered, a multi-turn conversation will be initiated. Note that this feature is currently for testing purposes only.");
DEFINE_string(
system_prompt,
"",
@@ -49,10 +52,8 @@ DEFINE_int32(
"Total number of tokens to generate (prompt + output).");
DEFINE_int32(
eval_mode,
1,
0,
"0: TokenGenerator(kv) / 1: HybridMode (prefill+kv)");
DEFINE_double(logits_scale, 0.0, "Logits scale");
DEFINE_int32(logits_offset, 0, "Logits offset");
DEFINE_string(
kv_updater,
"How to update kv cache. Choose between SmartMask and ShiftPointer",
@@ -72,20 +73,46 @@ std::vector<std::string> CollectPrompts(int argc, char** argv) {
return prompts;
}

std::string get_formatted_prompt(
const std::string& prompt,
const std::string& system_prompt,
example::LlamaVersion llama_version) {
std::string formatted_prompt;
switch (llama_version) {
case example::LlamaVersion::kLlama2:
formatted_prompt.append(prompt);
break;
case example::LlamaVersion::kLlama3:
if (!system_prompt.empty()) {
formatted_prompt.append(
"<|start_header_id|>system<|end_header_id|>\n\n");
formatted_prompt.append(system_prompt);
formatted_prompt.append("<|eot_id|>");
}
formatted_prompt.append("<|start_header_id|>user<|end_header_id|>\n\n");
formatted_prompt.append(prompt);
formatted_prompt.append(
"<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
break;
default:
ET_CHECK_MSG(false, "unsupported llama version");
break;
}
return formatted_prompt;
}

int main(int argc, char** argv) {
std::vector<std::string> prompts = CollectPrompts(argc, argv);
gflags::ParseCommandLineFlags(&argc, &argv, true);
// create llama runner
example::Runner runner(
{FLAGS_model_path},
FLAGS_model_path.c_str(),
FLAGS_tokenizer_path.c_str(),
FLAGS_performance_output_path.c_str(),
FLAGS_logits_scale,
FLAGS_logits_offset,
FLAGS_temperature,
FLAGS_eval_mode,
FLAGS_kv_updater,
FLAGS_num_iters);
FLAGS_kv_updater);
auto llama_version = runner.get_llama_version();
std::vector<char> buf;
buf.reserve(5 * FLAGS_seq_len); // assume each token is around 5 char
std::ofstream fout(FLAGS_output_path.c_str());
@@ -97,8 +124,10 @@ int main(int argc, char** argv) {
// generate tokens & store inference output
for (int i = 0; i < FLAGS_num_iters; i++) {
for (const auto& prompt : prompts) {
runner.generate(
FLAGS_seq_len, prompt.c_str(), FLAGS_system_prompt.c_str(), callback);
std::string formatted_prompt;
formatted_prompt = get_formatted_prompt(
prompt, FLAGS_system_prompt, llama_version.get());
runner.generate(formatted_prompt.c_str(), FLAGS_seq_len, callback);
}
}
fout.write(buf.data(), buf.size());
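For reference, a standalone sketch of the string the new get_formatted_prompt() assembles for the kLlama3 case (string literals copied from the hunk above; the prompt and system prompt values are made up for illustration):

#include <cassert>
#include <string>

int main() {
  // Hypothetical inputs, used only to show the resulting chat template.
  const std::string system_prompt = "You are a helpful assistant";
  const std::string prompt = "What is the capital of France?";

  std::string formatted;
  formatted.append("<|start_header_id|>system<|end_header_id|>\n\n");
  formatted.append(system_prompt);
  formatted.append("<|eot_id|>");
  formatted.append("<|start_header_id|>user<|end_header_id|>\n\n");
  formatted.append(prompt);
  formatted.append("<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");

  // The template ends with the assistant header so generation continues there.
  const std::string tail = "<|start_header_id|>assistant<|end_header_id|>\n\n";
  assert(
      formatted.size() >= tail.size() &&
      formatted.compare(formatted.size() - tail.size(), tail.size(), tail) == 0);
  return 0;
}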
48 changes: 48 additions & 0 deletions examples/qualcomm/oss_scripts/llama/runner/client_mem.h
@@ -0,0 +1,48 @@
/*
* Copyright (c) Qualcomm Innovation Center, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <executorch/examples/qualcomm/oss_scripts/llama/runner/imem_alloc.h>
#include <vector>

namespace example {
/**
* @class ClientMem
* @brief Final class for client buffer allocation, implementing IBufferAlloc
* interface. Used for SHIFT_POINTER mode.
*/
class ClientMem final : public IMemAlloc {
public:
ClientMem(){};
// Disable copy constructors, r-value referencing, etc
ClientMem(const ClientMem&) = delete;
ClientMem& operator=(const ClientMem&) = delete;
ClientMem(ClientMem&&) = delete;
ClientMem& operator=(ClientMem&&) = delete;
virtual ~ClientMem(){};
/**
* @brief Allocate buffer of specified size with vector.
* @param data_size Size of the data to allocate.
* @return Pointer to the allocated buffer.
*/
std::byte* allocate(size_t data_size) override {
allocated_buffers_.push_back(std::vector<std::byte>(data_size));
return allocated_buffers_.back().data();
};
// Only used for SMART_MASK mode
void add_memory_info(
void* data_ptr,
size_t data_size,
executorch::runtime::TensorInfo tensor_info) override {};

private:
std::vector<std::vector<std::byte>> allocated_buffers_;
};

} // namespace example
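A minimal usage sketch for the new ClientMem allocator (buffer sizes are hypothetical; assumes the IMemAlloc interface declared in imem_alloc.h): per the class comment, ClientMem backs SHIFT_POINTER mode with plain heap buffers, while the RpcMem counterpart added in rpc_mem.cpp would hand out shared-memory buffers instead.

#include <executorch/examples/qualcomm/oss_scripts/llama/runner/client_mem.h>

#include <cstddef>

void allocate_runner_buffers() {
  example::ClientMem allocator;
  // Hypothetical sizes for a KV-cache block and an input token buffer.
  std::byte* kv_block = allocator.allocate(4096);
  std::byte* token_buf = allocator.allocate(512);
  (void)kv_block;
  (void)token_buf;
  // The vectors owned by ClientMem keep these pointers valid until the
  // allocator itself is destroyed; no explicit free is required.
}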