Qualcomm AI Engine Direct - Refactor llama runner #10578

Open · wants to merge 1 commit into base: main
2 changes: 1 addition & 1 deletion backends/qualcomm/runtime/SharedBuffer.cpp
@@ -22,7 +22,7 @@ std::size_t std::hash<CustomMemTensorInfo>::operator()(
hash_val ^= std::hash<size_t>()(info.pos);
hash_val ^= std::hash<size_t>()(info.tensor_bytes);
for (int i = 0; i < info.rank; ++i) {
hash_val ^= info.shape[i];
hash_val ^= std::hash<uint32_t>()(info.shape[i]);
}
hash_val ^= std::hash<uint32_t>()(info.rank);
hash_val ^= std::hash<executorch::aten::ScalarType>()(info.dtype);
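Side note on the hunk above (not part of this PR): folding with plain XOR is commutative, so permuted shapes such as {2, 3, 1} and {3, 2, 1} still produce the same hash even after each element is routed through std::hash. If that ever matters, a boost-style combiner makes the fold order-sensitive. A minimal sketch of such a hypothetical helper:

#include <cstddef>

// Hypothetical boost-style combiner, shown for illustration only: the shift
// and add terms make the result depend on the order in which values are mixed.
inline void hash_combine(std::size_t& seed, std::size_t value) {
  seed ^= value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}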
4 changes: 3 additions & 1 deletion backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
@@ -78,7 +78,9 @@ std::unique_ptr<BackendConfigParameters> QnnBackendFactory::Create(
options->soc_info(),
htp_options);
backend_params->qnn_mem_manager_ptr_ = std::make_unique<QnnMemManager>(
implementation, backend_params->qnn_context_ptr_.get());
implementation,
backend_params->qnn_context_ptr_.get(),
options->log_level());
backend_params->backend_init_state_ = BackendInitializeState::INITIALIZED;
} break;
case QnnExecuTorchBackendType::kGpuBackend:
17 changes: 11 additions & 6 deletions backends/qualcomm/runtime/backends/QnnMemManager.cpp
@@ -47,9 +47,12 @@ Error QnnMemManager::RegisterIonMem(
}
tensor_wrapper->SetMemHandle(handle);
registered_map_.insert({handle, mem_ptr});
QNN_EXECUTORCH_LOG_INFO(
"Tensor %s is successfully registered to ION shared memory.",
tensor_wrapper->GetName().c_str());
if (log_level_ >= QnnExecuTorchLogLevel::kLogLevelInfo) {
QNN_EXECUTORCH_LOG_INFO(
"Tensor %s is successfully registered to ION shared memory.",
tensor_wrapper->GetName().c_str());
}

return Error::Ok;
}

@@ -92,9 +95,11 @@ Error QnnMemManager::RegisterCustomMem(
}
tensor_wrapper->SetMemHandle(handle);
registered_map_.insert({handle, mem_ptr});
QNN_EXECUTORCH_LOG_INFO(
"Tensor %s is successfully registered to custom shared memory.",
tensor_wrapper->GetName().c_str());
if (log_level_ >= QnnExecuTorchLogLevel::kLogLevelInfo) {
QNN_EXECUTORCH_LOG_INFO(
"Tensor %s is successfully registered to custom shared memory.",
tensor_wrapper->GetName().c_str());
}
return Error::Ok;
}
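A possible follow-up to the two gated log sites above (a sketch only, not part of this PR): fold the repeated level check into one helper so RegisterIonMem and RegisterCustomMem stay terse.

// Hypothetical helper; assumes the QnnExecuTorchLogLevel enum and the
// QNN_EXECUTORCH_LOG_INFO macro already visible in this file.
inline void LogRegistrationIfVerbose(
    QnnExecuTorchLogLevel log_level,
    const char* tensor_name,
    const char* memory_kind) {
  if (log_level >= QnnExecuTorchLogLevel::kLogLevelInfo) {
    QNN_EXECUTORCH_LOG_INFO(
        "Tensor %s is successfully registered to %s shared memory.",
        tensor_name,
        memory_kind);
  }
}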

8 changes: 6 additions & 2 deletions backends/qualcomm/runtime/backends/QnnMemManager.h
@@ -21,8 +21,11 @@ class QnnMemManager {
public:
explicit QnnMemManager(
const QnnImplementation& implementation,
QnnContext* context)
: implementation_(implementation), context_(context) {}
QnnContext* context,
QnnExecuTorchLogLevel log_level)
: implementation_(implementation),
context_(context),
log_level_(log_level) {}
~QnnMemManager() {
DeRegisterMem();
}
@@ -63,6 +66,7 @@ class QnnMemManager {

const QnnImplementation& implementation_;
QnnContext* context_;
QnnExecuTorchLogLevel log_level_;
std::unordered_map<Qnn_MemHandle_t, void*> registered_map_;
std::unordered_map<CustomMemTensorInfo, void*> pre_registered_handles_;
std::unordered_map<executorch::aten::ScalarType, Qnn_DataType_t>
2 changes: 1 addition & 1 deletion backends/qualcomm/tests/test_qnn_delegate.py
@@ -3463,7 +3463,7 @@ def test_llama3_2_1b(self):
if self.pre_gen_pte:
cmds.extend(["--pre_gen_pte", self.pre_gen_pte])

golden_start_with = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>"
golden_start_with = "<|start_header_id|>user<|end_header_id|>"
p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
with Listener((self.ip, self.port)) as listener:
conn = listener.accept()
16 changes: 13 additions & 3 deletions examples/qualcomm/oss_scripts/llama/CMakeLists.txt
@@ -28,8 +28,18 @@ list(
${CMAKE_CURRENT_LIST_DIR}/qnn_llama_runner.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/runner.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/runner.h
${CMAKE_CURRENT_LIST_DIR}/runner/io_manager.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/io_manager.h
${CMAKE_CURRENT_LIST_DIR}/runner/decoder_runner.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/decoder_runner.h
${CMAKE_CURRENT_LIST_DIR}/runner/prompt_processor.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/prompt_processor.h
${CMAKE_CURRENT_LIST_DIR}/runner/token_generator.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/token_generator.h
${CMAKE_CURRENT_LIST_DIR}/runner/imem_alloc.h
${CMAKE_CURRENT_LIST_DIR}/runner/client_mem.h
${CMAKE_CURRENT_LIST_DIR}/runner/rpc_mem.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/rpc_mem.h
${CMAKE_CURRENT_LIST_DIR}/runner/kv_manager.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/kv_manager.h
)

list(
@@ -42,7 +52,7 @@ list(
# build qnn llama runner
add_executable(qnn_llama_runner ${_llama_runner__srcs})
target_include_directories(
qnn_llama_runner PUBLIC ${_common_include_directories}
qnn_llama_runner PUBLIC ${_common_include_directories} ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/include
)

target_link_options_shared_lib(quantized_ops_lib)
70 changes: 22 additions & 48 deletions examples/qualcomm/oss_scripts/llama/llama.py
@@ -403,7 +403,7 @@ def quantize(self, quant_dtype, args, tokenizer, custom_annotations=()):
logging.info("Quantizing the model...")
calibrate(
self.get_example_inputs(self.llama_meta["get_use_kv_cache"]),
args.prompt,
args.prompt[0],
fx_graph_module,
tokenizer=tokenizer,
ar_len=self.llama_meta["get_ar_len"],
@@ -828,7 +828,7 @@ def permute(w, heads):
return quant_attrs


def inference(args, quant_attrs, pte_filename, runtime_tokenizer_path, pre_gen_pte=""):
def inference(args, pte_filename, runtime_tokenizer_path, pre_gen_pte=""):
workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/single_llama"

if args.model_mode == "kv":
@@ -854,14 +854,13 @@ def post_process():
outputs.append(f.read())

seq_len = args.max_seq_len
multi_prompts = " ".join([f'--prompt "{prompt}"' for prompt in args.prompt])
runner_args = " ".join(
[
f'--prompt "{args.prompt}"',
multi_prompts,
f"--eval_mode {eval_mode}",
f"--temperature {args.temperature}",
f"--system_prompt '{args.system_prompt}'",
f"--logits_scale {quant_attrs['scale']}",
f"--logits_offset {quant_attrs['zero_point']}",
]
)

@@ -1004,9 +1003,10 @@ def _build_parser():

parser.add_argument(
"--prompt",
help="User prompts for llama.",
help="User prompts for Llama. When multiple prompts are entered, a multi-turn conversation will be initiated. Note that this feature is currently for testing purposes only.",
required=True,
type=str,
nargs="+",
)

parser.add_argument(
@@ -1090,7 +1090,7 @@ def _build_parser():

def export_llama(args) -> None:
if args.compile_only and args.pre_gen_pte:
exit("Cannot set both compile_only and pre_gen_pte as true")
raise RuntimeError("Cannot set both compile_only and pre_gen_pte as true")

if args.model_mode == "kv":
pte_filename = "kv_llama_qnn"
@@ -1126,29 +1126,15 @@ def export_llama(args) -> None:
elif args.kv_updater == "shift_pointer":
args.kv_updater = shift_pointer_updater
else:
exit(f"Using an unkown kv update {args.kv_updater}")
raise RuntimeError(f"Using an unknown kv update {args.kv_updater}")

if args.pre_gen_pte:
quant_attrs = json.load(
open(f"{args.pre_gen_pte}/{pte_filename}_quant_attrs.txt")
)
inference(
args, quant_attrs, pte_filename, runtime_tokenizer_path, args.pre_gen_pte
)
exit(f"Finish the running pre_gen_pte from {args.pre_gen_pte}")
inference(args, pte_filename, runtime_tokenizer_path, args.pre_gen_pte)
print(f"Finish the running pre_gen_pte from {args.pre_gen_pte}")
return

if args.compile_only:
quant_attrs = compile(args, pte_filename, tokenizer)
if quant_attrs:
json.dump(
{
"scale": quant_attrs["scale"],
"zero_point": quant_attrs["zero_point"],
},
open(f"{args.artifact}/{pte_filename}_quant_attrs.txt", "w"),
)
else:
logging.warning("Quant attributes of the logit is None.")
compile(args, pte_filename, tokenizer)

if args.ip and args.port != -1:
pte_path = f"{args.artifact}/{pte_filename}.pte"
@@ -1161,24 +1147,18 @@ def export_llama(args) -> None:
}
)
)
exit(f"Finish compile_only and save to {args.artifact}")
print(f"Finish compile_only and save to {args.artifact}")
return

compile(args, pte_filename, tokenizer)
inference(args, pte_filename, runtime_tokenizer_path)


def main():
parser = _build_parser()
args = parser.parse_args()
try:
quant_attrs = compile(args, pte_filename, tokenizer)
if quant_attrs:
logging.info(
f"Logit scale: {quant_attrs['scale']}; Logit offset: {quant_attrs['zero_point']}"
)
json.dump(
{
"scale": quant_attrs["scale"],
"zero_point": quant_attrs["zero_point"],
},
open(f"{args.artifact}/{pte_filename}_quant_attrs.txt", "w"),
)
else:
logging.warning("Quant attributes of the logit is None.")
inference(args, quant_attrs, pte_filename, runtime_tokenizer_path)
export_llama(args)
except Exception as e:
if args.ip and args.port != -1:
with Client((args.ip, args.port)) as conn:
@@ -1187,12 +1167,6 @@ def export_llama(args) -> None:
raise Exception(e)


def main():
parser = _build_parser()
args = parser.parse_args()
export_llama(args)


# flake8: noqa: C901
if __name__ == "__main__":
main()
51 changes: 40 additions & 11 deletions examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
@@ -34,7 +34,10 @@ DEFINE_string(
"inference_speed.txt",
"Records inference speed. For CI purpose.");
DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff.");
DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt.");
DEFINE_string(
prompt,
"The answer to the ultimate question is",
"User prompts for Llama. When multiple prompts are entered, a multi-turn conversation will be initiated. Note that this feature is currently for testing purposes only.");
DEFINE_string(
system_prompt,
"",
@@ -49,10 +52,8 @@ DEFINE_int32(
"Total number of tokens to generate (prompt + output).");
DEFINE_int32(
eval_mode,
1,
0,
"0: TokenGenerator(kv) / 1: HybridMode (prefill+kv)");
DEFINE_double(logits_scale, 0.0, "Logits scale");
DEFINE_int32(logits_offset, 0, "Logits offset");
DEFINE_string(
kv_updater,
"How to update kv cache. Choose between SmartMask and ShiftPointer",
@@ -72,20 +73,46 @@ std::vector<std::string> CollectPrompts(int argc, char** argv) {
return prompts;
}

std::string get_formatted_prompt(
const std::string& prompt,
const std::string& system_prompt,
example::LlamaVersion llama_version) {
std::string formatted_prompt;
switch (llama_version) {
case example::LlamaVersion::kLlama2:
formatted_prompt.append(prompt);
break;
case example::LlamaVersion::kLlama3:
if (!system_prompt.empty()) {
formatted_prompt.append(
"<|start_header_id|>system<|end_header_id|>\n\n");
formatted_prompt.append(system_prompt);
formatted_prompt.append("<|eot_id|>");
}
formatted_prompt.append("<|start_header_id|>user<|end_header_id|>\n\n");
formatted_prompt.append(prompt);
formatted_prompt.append(
"<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
break;
default:
ET_CHECK_MSG(false, "unsupported llama version");
break;
}
return formatted_prompt;
}

int main(int argc, char** argv) {
std::vector<std::string> prompts = CollectPrompts(argc, argv);
gflags::ParseCommandLineFlags(&argc, &argv, true);
// create llama runner
example::Runner runner(
{FLAGS_model_path},
FLAGS_model_path.c_str(),
FLAGS_tokenizer_path.c_str(),
FLAGS_performance_output_path.c_str(),
FLAGS_logits_scale,
FLAGS_logits_offset,
FLAGS_temperature,
FLAGS_eval_mode,
FLAGS_kv_updater,
FLAGS_num_iters);
FLAGS_kv_updater);
auto llama_version = runner.get_llama_version();
std::vector<char> buf;
buf.reserve(5 * FLAGS_seq_len); // assume each token is around 5 char
std::ofstream fout(FLAGS_output_path.c_str());
@@ -97,8 +124,10 @@ int main(int argc, char** argv) {
// generate tokens & store inference output
for (int i = 0; i < FLAGS_num_iters; i++) {
for (const auto& prompt : prompts) {
runner.generate(
FLAGS_seq_len, prompt.c_str(), FLAGS_system_prompt.c_str(), callback);
std::string formatted_prompt;
formatted_prompt = get_formatted_prompt(
prompt, FLAGS_system_prompt, llama_version.get());
runner.generate(formatted_prompt.c_str(), FLAGS_seq_len, callback);
}
}
fout.write(buf.data(), buf.size());
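For reference, a standalone sketch of the string the new get_formatted_prompt() assembles for the kLlama3 case (string literals copied from the hunk above; the prompt and system prompt values are made up for illustration):

#include <cassert>
#include <string>

int main() {
  // Hypothetical inputs, used only to show the resulting chat template.
  const std::string system_prompt = "You are a helpful assistant";
  const std::string prompt = "What is the capital of France?";

  std::string formatted;
  formatted.append("<|start_header_id|>system<|end_header_id|>\n\n");
  formatted.append(system_prompt);
  formatted.append("<|eot_id|>");
  formatted.append("<|start_header_id|>user<|end_header_id|>\n\n");
  formatted.append(prompt);
  formatted.append("<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");

  // The template ends with the assistant header so generation continues there.
  const std::string tail = "<|start_header_id|>assistant<|end_header_id|>\n\n";
  assert(
      formatted.size() >= tail.size() &&
      formatted.compare(formatted.size() - tail.size(), tail.size(), tail) == 0);
  return 0;
}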
48 changes: 48 additions & 0 deletions examples/qualcomm/oss_scripts/llama/runner/client_mem.h
@@ -0,0 +1,48 @@
/*
* Copyright (c) Qualcomm Innovation Center, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <executorch/examples/qualcomm/oss_scripts/llama/runner/imem_alloc.h>
#include <vector>

namespace example {
/**
* @class ClientMem
* @brief Final class for client buffer allocation, implementing IBufferAlloc
* interface. Used for SHIFT_POINTER mode.
*/
class ClientMem final : public IMemAlloc {
public:
ClientMem(){};
// Disable copy constructors, r-value referencing, etc
ClientMem(const ClientMem&) = delete;
ClientMem& operator=(const ClientMem&) = delete;
ClientMem(ClientMem&&) = delete;
ClientMem& operator=(ClientMem&&) = delete;
virtual ~ClientMem(){};
/**
* @brief Allocate buffer of specified size with vector.
* @param data_size Size of the data to allocate.
* @return Pointer to the allocated buffer.
*/
std::byte* allocate(size_t data_size) override {
allocated_buffers_.push_back(std::vector<std::byte>(data_size));
return allocated_buffers_.back().data();
};
// Only used for SMART_MASK mode
void add_memory_info(
void* data_ptr,
size_t data_size,
executorch::runtime::TensorInfo tensor_info) override {};

private:
std::vector<std::vector<std::byte>> allocated_buffers_;
};

} // namespace example
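A minimal usage sketch for the new ClientMem allocator (buffer sizes are hypothetical; assumes the IMemAlloc interface declared in imem_alloc.h): per the class comment, ClientMem backs SHIFT_POINTER mode with plain heap buffers, while the RpcMem counterpart added in rpc_mem.cpp would hand out shared-memory buffers instead.

#include <executorch/examples/qualcomm/oss_scripts/llama/runner/client_mem.h>

#include <cstddef>

void allocate_runner_buffers() {
  example::ClientMem allocator;
  // Hypothetical sizes for a KV-cache block and an input token buffer.
  std::byte* kv_block = allocator.allocate(4096);
  std::byte* token_buf = allocator.allocate(512);
  (void)kv_block;
  (void)token_buf;
  // The vectors owned by ClientMem keep these pointers valid until the
  // allocator itself is destroyed; no explicit free is required.
}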