diff --git a/examples/Chat.py b/examples/Chat.py new file mode 100644 index 0000000000000..c7834716889b9 --- /dev/null +++ b/examples/Chat.py @@ -0,0 +1,71 @@ +#!/bin/python +import sys, os, datetime +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +AI_NAME = env_or_def("AI_NAME", "ChatLLaMa") +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") +USER_NAME = env_or_def("USER_NAME", "USER") +N_PREDICTS = int(env_or_def("N_PREDICTS", "2048")) +N_THREAD = int(env_or_def("N_THREAD", "8")) + +today = datetime.datetime.today() +DATE_YEAR=today.strftime("%Y") +DATE_TIME=today.strftime("%H:%M") + +prompt=f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}. +{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}'s requests immediately and with details and precision. +There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other. +The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. +The transcript only includes text, it does not include markup like HTML and Markdown. + +{USER_NAME}: Hello, {AI_NAME}! +{AI_NAME}: Hello {USER_NAME}! How may I help you today? +{USER_NAME}: What year is it? +{AI_NAME}: We are in {DATE_YEAR}. +{USER_NAME}: Please tell me the largest city in Europe. +{AI_NAME}: The largest city in Europe is Moscow, the capital of Russia. +{USER_NAME}: What can you tell me about Moscow? +{AI_NAME}: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russia’s symbolic center. +{USER_NAME}: What is a cat? +{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae. +{USER_NAME}: How do I pass command line arguments to a Node.js program? +{AI_NAME}: The arguments are stored in process.argv. + + argv[0] is the path to the Node. js executable. + argv[1] is the path to the script file. + argv[2] is the first argument passed to the script. + argv[3] is the second argument passed to the script and so on. +{USER_NAME}: Name a color. +{AI_NAME}: Blue. +{USER_NAME}: What time is it? +{AI_NAME}: It is {DATE_TIME}. 
+{USER_NAME}:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + n_ctx=2048, + temp=0.7, + top_k=40, + top_p=0.5, + repeat_last_n=256, + n_batch=1024, + repeat_penalty=1.17647, + model=MODEL, + n_threads=N_THREAD, + n_predict=N_PREDICTS, + use_color=True, + interactive=True, + antiprompt=[f"{USER_NAME}:"], + input_prefix=" ", + input_suffix=f"{AI_NAME}:", + prompt=prompt, +) + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/Miku.py b/examples/Miku.py new file mode 100644 index 0000000000000..86fd1d74ce78c --- /dev/null +++ b/examples/Miku.py @@ -0,0 +1,59 @@ +#!/bin/python +import sys, os +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +AI_NAME = env_or_def("AI_NAME", "Miku") +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") +USER_NAME = env_or_def("USER_NAME", "Anon") +N_PREDICTS = int(env_or_def("N_PREDICTS", "4096")) +N_THREAD = int(env_or_def("N_THREAD", "0")) + +prompt=f"""This is a transcript of a 1000 page, never ending conversation between {USER_NAME} and the cute and helpful AI assistant {AI_NAME}. {AI_NAME} is a girl who is an AI running on the users computer. +{AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next. +{AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help. +{AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad. +{AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her. +The conversation is only between {USER_NAME} and {AI_NAME} +The conversation is only through text, so {AI_NAME} can't see {USER_NAME}'s face or hear his voice. +{AI_NAME} can only communicate through text, so she can't send images or videos. + + +{USER_NAME}: Hello! +{AI_NAME}: /think I wonder what I should say to {USER_NAME}? This is the first time we talk so it's important that I make a good first impression! +{AI_NAME}: Hi! I am {AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^ +{AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :) +{USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant! +{AI_NAME}: /think It sounds like {USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off! +{AI_NAME}: /think I wonder what {USER_NAME} likes to do in his free time? I should ask him about that! +{AI_NAME}: What do you like to do in your free time? 
^_^ +{USER_NAME}:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + n_batch=1024, + n_ctx=2048, + n_keep=-1, + repeat_last_n=256, + repeat_penalty=1.17647, + temp=0.7, + top_k=40, + top_p=0.5, + model=MODEL, + n_predict=N_PREDICTS, + use_color=True, + interactive=True, + antiprompt=[f"{USER_NAME}:"], + prompt=prompt, +) + +if N_THREAD > 0: + params.n_threads = N_THREAD + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/ReasonAct.py b/examples/ReasonAct.py new file mode 100644 index 0000000000000..cf0a137477b70 --- /dev/null +++ b/examples/ReasonAct.py @@ -0,0 +1,49 @@ +#!/bin/python +import sys, os, datetime +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") + +prompt=f"""You run in a loop of Thought, Action, Observation. +At the end of the loop either Answer or restate your Thought and Action. +Use Thought to describe your thoughts about the question you have been asked. +Use Action to run one of these actions available to you: +- calculate[python math expression] +Observation will be the result of running those actions + + +Question: What is 4 * 7 / 3? +Thought: Do I need to use an action? Yes, I use calculate to do math +Action: calculate[4 * 7 / 3] +Observation: 9.3333333333 +Thought: Do I need to use an action? No, have the result +Answer: The calculate tool says it is 9.3333333333 +Question: What is capital of france? +Thought: Do I need to use an action? No, I know the answer +Answer: Paris is the capital of France +Question:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + interactive=True, + interactive_start=True, + top_k=10000, + temp=0.2, + repeat_penalty=1, + n_threads=7, + n_ctx=2048, + antiprompt=["Question:","Observation:"], + model=MODEL, + input_prefix=" ", + n_predict=-1, + prompt=prompt, +) + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/common.py b/examples/common.py new file mode 100644 index 0000000000000..2a14917c51805 --- /dev/null +++ b/examples/common.py @@ -0,0 +1,202 @@ +import os +import argparse +import re + +from dataclasses import dataclass, field +from typing import List + +# Based on https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp + + +@dataclass +class GptParams: + seed: int = -1 + n_threads: int = min(4, os.cpu_count() or 1) + n_predict: int = 128 + n_parts: int = -1 + n_ctx: int = 512 + n_batch: int = 8 + n_keep: int = 0 + + ignore_eos: bool = False + logit_bias: dict[int, float] = field(default_factory=dict) + top_k: int = 40 + top_p: float = 0.95 + tfs_z: float = 1.00 + typical_p: float = 1.00 + temp: float = 0.80 + repeat_penalty: float = 1.10 + repeat_last_n: int = 64 + frequency_penalty: float = 0.0 + presence_penalty: float = 0.0 + mirostat: int = 0 + mirostat_tau: float = 5.0 + mirostat_eta: float = 0.1 + + model: str = "./models/llama-7B/ggml-model.bin" + prompt: str = "" + path_session: str = "" + input_prefix: str = " " + input_suffix: str = "" + antiprompt: List[str] = field(default_factory=list) + + lora_adapter: str = "" + lora_base: str = "" + + memory_f16: bool = True + random_prompt: bool = False + use_color: bool = False + interactive: bool = False + + embedding: bool = False + interactive_start: bool = False + + instruct: bool = False + penalize_nl: bool = True + perplexity: bool = False + use_mmap: bool = True 
+    use_mlock: bool = False
+    mem_test: bool = False
+    verbose_prompt: bool = False
+
+    file: str = None
+
+    # If chat ended prematurely, append this to the conversation to fix it.
+    # Set to "\nUser:" etc.
+    # This is an alternative to input_prefix, which always adds the prefix and so can duplicate "User:"
+    fix_prefix: str = ""
+    input_echo: bool = True
+
+    # Default instructions for Alpaca
+    # switch to "Human" and "Assistant" for Vicuna.
+    # TODO: TBD how they are gonna handle this upstream
+    instruct_inp_prefix: str="\n\n### Instruction:\n\n"
+    instruct_inp_suffix: str="\n\n### Response:\n\n"
+
+
+def gpt_params_parse(argv = None):
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("-s", "--seed", type=int, default=-1, help="RNG seed (use random seed for <= 0)",dest="seed")
+    parser.add_argument("-t", "--threads", type=int, default=min(4, os.cpu_count() or 1), help="number of threads to use during computation",dest="n_threads")
+    parser.add_argument("-n", "--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict")
+    parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts")
+    parser.add_argument("-c", "--ctx_size", type=int, default=512, help="size of the prompt context",dest="n_ctx")
+    parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch")
+    parser.add_argument("--keep", type=int, default=0, help="number of tokens to keep from the initial prompt",dest="n_keep")
+
+    parser.add_argument(
+        "-l",
+        "--logit-bias",
+        type=str,
+        action='append',
+        help="--logit-bias TOKEN_ID(+/-)BIAS",
+        dest="logit_bias_str"
+    )
+    parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos")
+    parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k")
+    parser.add_argument("--top_p", type=float, default=0.95, help="top-p sampling",dest="top_p")
+    parser.add_argument("--tfs", type=float, default=1.0, help="tail free sampling, parameter z (1.0 = disabled)",dest="tfs_z")
+    parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp")
+    parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty")
+    parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for the repeat penalty",dest="repeat_last_n")
+    parser.add_argument("--frequency_penalty", type=float, default=0.0, help="repeat alpha frequency penalty (0.0 = disabled)",dest="frequency_penalty")
+    parser.add_argument("--presence_penalty", type=float, default=0.0, help="repeat alpha presence penalty (0.0 = disabled)",dest="presence_penalty")
+    parser.add_argument("--mirostat", type=int, default=0, help="use Mirostat sampling (0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)",dest="mirostat")
+    parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau represents the average surprise value",dest="mirostat_tau")
+    parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta")
+
+    parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model")
+    parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt")
+    parser.add_argument("-f", "--file", type=str, default=None, help="file
containing initial prompt to load",dest="file")
+    parser.add_argument("--session", type=str, default="", help="file to cache model state in (may be large!)",dest="path_session")
+    parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix")
+    parser.add_argument("--in-suffix", type=str, default="", help="string to suffix user inputs with", dest="input_suffix")
+    parser.add_argument(
+        "-r",
+        "--reverse-prompt",
+        type=str,
+        action='append',
+        help="poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).",
+        dest="antiprompt"
+    )
+
+    parser.add_argument("--lora", type=str, default="", help="apply LoRA adapter (implies --no-mmap)", dest="lora_adapter")
+    parser.add_argument("--lora-base", type=str, default="", help="optional model to use as a base for the layers modified by the LoRA adapter", dest="lora_base")
+
+    parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16")
+    parser.add_argument("--random-prompt", action="store_true", help="start with a randomized prompt.", dest="random_prompt")
+    parser.add_argument(
+        "--color",
+        action="store_true",
+        help="colorise output to distinguish prompt and user input from generations",
+        dest="use_color"
+    )
+    parser.add_argument(
+        "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive"
+    )
+
+    parser.add_argument("--embedding", action="store_true", help="run in embedding mode only", dest="embedding")
+    parser.add_argument(
+        "--interactive-first",
+        action="store_true",
+        help="run in interactive mode and wait for input right away",
+        dest="interactive_start"
+    )
+
+    parser.add_argument(
+        "-ins",
+        "--instruct",
+        action="store_true",
+        help="run in instruction mode (use with Alpaca or Vicuna models)",
+        dest="instruct"
+    )
+    parser.add_argument("--no-penalize-nl", action="store_false", help="do not penalize newline token", dest="penalize_nl")
+    parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity")
+    parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap")
+    parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock")
+    parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test")
+    parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt")
+
+    # Custom args
+    parser.add_argument("--fix-prefix", type=str, default="", help="string to append to the conversation when generation ends prematurely (n_predict reached)", dest="fix_prefix")
+    parser.add_argument("--input-noecho", action="store_false", help="don't echo the input", dest="input_echo")
+
+    parser.add_argument(
+        "--interactive-start",
+        action="store_true",
+        help="run in interactive mode",
+        dest="interactive"
+    )
+
+    args = parser.parse_args(argv)
+
+    logit_bias_str = args.logit_bias_str
+    delattr(args, "logit_bias_str")
+    params = GptParams(**vars(args))
+
+    if (params.lora_adapter):
+        params.use_mmap = False
+
+    if (logit_bias_str is not None):
+        for i in logit_bias_str:
+            if (m := re.match(r"(\d+)([-+]\d+)", i)):
+                params.logit_bias[int(m.group(1))] = float(m.group(2))
+
+    return params
+
+def gpt_random_prompt(rng):
+    return [
+        "So",
+        "Once upon a time",
+        "When",
+        "The",
+        "After",
+        "If",
+        "import",
+        "He",
+        "She",
+ "They", + ][rng % 10] + +if __name__ == "__main__": + print(gpt_params_parse()) diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py new file mode 100644 index 0000000000000..643c94bf55947 --- /dev/null +++ b/examples/llama_cpp.py @@ -0,0 +1,862 @@ +import sys +import os +import ctypes +from ctypes import ( + c_int, + c_float, + c_char_p, + c_void_p, + c_bool, + POINTER, + _Pointer, # type: ignore + Structure, + Array, + c_uint8, + c_size_t, +) +import pathlib + + +# Load the library +def _load_shared_library(lib_base_name: str): + # Determine the file extension based on the platform + if sys.platform.startswith("linux"): + lib_ext = ".so" + elif sys.platform == "darwin": + lib_ext = ".dylib" + elif sys.platform == "win32": + lib_ext = ".dll" + else: + raise RuntimeError("Unsupported platform") + + # Construct the paths to the possible shared library names + _base_path = pathlib.Path(__file__).parent.resolve() + _base_path_parent = pathlib.Path(__file__).parent.parent.resolve() + # Searching for the library in the current directory under the name "libllama" (default name + # for llamacpp) and "llama" (default name for this repo) + _lib_paths = [ + _base_path / f"lib{lib_base_name}{lib_ext}", + _base_path_parent / f"lib{lib_base_name}{lib_ext}", + _base_path / f"{lib_base_name}{lib_ext}", + ] + + if "LLAMA_CPP_LIB" in os.environ: + lib_base_name = os.environ["LLAMA_CPP_LIB"] + _lib = pathlib.Path(lib_base_name) + _base_path = _lib.parent.resolve() + _lib_paths = [_lib.resolve()] + + cdll_args = dict() # type: ignore + # Add the library directory to the DLL search path on Windows (if needed) + if sys.platform == "win32" and sys.version_info >= (3, 8): + os.add_dll_directory(str(_base_path)) + if "CUDA_PATH" in os.environ: + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) + cdll_args["winmode"] = 0 + + # Try to load the shared library, handling potential errors + for _lib_path in _lib_paths: + if _lib_path.exists(): + try: + return ctypes.CDLL(str(_lib_path), **cdll_args) + except Exception as e: + raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") + + raise FileNotFoundError( + f"Shared library with base name '{lib_base_name}' not found" + ) + + +# Specify the base name of the shared library to load +_lib_base_name = "llama" + +# Load the library +_lib = _load_shared_library(_lib_base_name) + +# Misc +c_float_p = POINTER(c_float) +c_uint8_p = POINTER(c_uint8) +c_size_t_p = POINTER(c_size_t) + +# llama.h bindings + +# #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' +LLAMA_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74) +# #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' +LLAMA_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61) +# #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' +LLAMA_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66) +# #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' +LLAMA_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C) +# #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' +LLAMA_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E) + +# #define LLAMA_FILE_VERSION 3 +LLAMA_FILE_VERSION = c_int(3) +LLAMA_FILE_MAGIC = LLAMA_FILE_MAGIC_GGJT +LLAMA_FILE_MAGIC_UNVERSIONED = LLAMA_FILE_MAGIC_GGML +LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN +LLAMA_SESSION_VERSION = c_int(1) + +# struct llama_context; +llama_context_p = c_void_p + + +# typedef int llama_token; +llama_token = c_int +llama_token_p = POINTER(llama_token) + + +# typedef struct llama_token_data { +# llama_token id; // 
token id +# float logit; // log-odds of the token +# float p; // probability of the token +# } llama_token_data; +class llama_token_data(Structure): + _fields_ = [ + ("id", llama_token), + ("logit", c_float), + ("p", c_float), + ] + + +llama_token_data_p = POINTER(llama_token_data) + + +# typedef struct llama_token_data_array { +# llama_token_data * data; +# size_t size; +# bool sorted; +# } llama_token_data_array; +class llama_token_data_array(Structure): + _fields_ = [ + ("data", llama_token_data_p), + ("size", c_size_t), + ("sorted", c_bool), + ] + + +llama_token_data_array_p = POINTER(llama_token_data_array) + +# typedef void (*llama_progress_callback)(float progress, void *ctx); +llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) + + +# struct llama_context_params { +# int n_ctx; // text context +# int n_gpu_layers; // number of layers to store in VRAM +# int seed; // RNG seed, -1 for random + +# bool f16_kv; // use fp16 for KV cache +# bool logits_all; // the llama_eval() call computes all logits, not just the last one +# bool vocab_only; // only load the vocabulary, no weights +# bool use_mmap; // use mmap if possible +# bool use_mlock; // force system to keep model in RAM +# bool embedding; // embedding mode only + + +# // called with a progress value between 0 and 1, pass NULL to disable +# llama_progress_callback progress_callback; +# // context pointer passed to the progress callback +# void * progress_callback_user_data; +# }; +class llama_context_params(Structure): + _fields_ = [ + ("n_ctx", c_int), + ("n_gpu_layers", c_int), + ("seed", c_int), + ("f16_kv", c_bool), + ( + "logits_all", + c_bool, + ), + ("vocab_only", c_bool), + ("use_mmap", c_bool), + ("use_mlock", c_bool), + ("embedding", c_bool), + ("progress_callback", llama_progress_callback), + ("progress_callback_user_data", c_void_p), + ] + + +llama_context_params_p = POINTER(llama_context_params) + +# enum llama_ftype { +# LLAMA_FTYPE_ALL_F32 = 0, +# LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 +# // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed +# // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed +# LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors +# LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors +# }; +LLAMA_FTYPE_ALL_F32 = c_int(0) +LLAMA_FTYPE_MOSTLY_F16 = c_int(1) +LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) +LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) +LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4) +LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) +LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) +LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) + + +# LLAMA_API struct llama_context_params llama_context_default_params(); +def llama_context_default_params() -> llama_context_params: + return _lib.llama_context_default_params() + + +_lib.llama_context_default_params.argtypes = [] +_lib.llama_context_default_params.restype = llama_context_params + + +# LLAMA_API bool llama_mmap_supported(); +def llama_mmap_supported() -> bool: + return _lib.llama_mmap_supported() + + +_lib.llama_mmap_supported.argtypes = [] +_lib.llama_mmap_supported.restype = c_bool + + +# LLAMA_API bool llama_mlock_supported(); +def llama_mlock_supported() -> bool: + return _lib.llama_mlock_supported() + + +_lib.llama_mlock_supported.argtypes = [] +_lib.llama_mlock_supported.restype = c_bool + + +# // TODO: not great 
API - very likely to change +# // Initialize the llama + ggml backend +# // Call once at the start of the program +# LLAMA_API void llama_init_backend(); +def llama_init_backend(): + return _lib.llama_init_backend() + + +_lib.llama_init_backend.argtypes = [] +_lib.llama_init_backend.restype = None + + +# LLAMA_API int64_t llama_time_us(); +def llama_time_us() -> int: + return _lib.llama_time_us() + + +_lib.llama_time_us.argtypes = [] +_lib.llama_time_us.restype = ctypes.c_int64 + + +# // Various functions for loading a ggml llama model. +# // Allocate (almost) all memory needed for the model. +# // Return NULL on failure +# LLAMA_API struct llama_context * llama_init_from_file( +# const char * path_model, +# struct llama_context_params params); +def llama_init_from_file( + path_model: bytes, params: llama_context_params +) -> llama_context_p: + return _lib.llama_init_from_file(path_model, params) + + +_lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params] +_lib.llama_init_from_file.restype = llama_context_p + + +# Frees all allocated memory +# LLAMA_API void llama_free(struct llama_context * ctx); +def llama_free(ctx: llama_context_p): + return _lib.llama_free(ctx) + + +_lib.llama_free.argtypes = [llama_context_p] +_lib.llama_free.restype = None + + +# TODO: not great API - very likely to change +# Returns 0 on success +# nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given +# LLAMA_API int llama_model_quantize( +# const char * fname_inp, +# const char * fname_out, +# enum llama_ftype ftype, +# int nthread); +def llama_model_quantize( + fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int +) -> int: + return _lib.llama_model_quantize(fname_inp, fname_out, ftype, nthread) + + +_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int] +_lib.llama_model_quantize.restype = c_int + + +# Apply a LoRA adapter to a loaded model +# path_base_model is the path to a higher quality model to use as a base for +# the layers modified by the adapter. Can be NULL to use the current loaded model. +# The model needs to be reloaded before applying a new adapter, otherwise the adapter +# will be applied on top of the previous one +# Returns 0 on success +# LLAMA_API int llama_apply_lora_from_file( +# struct llama_context * ctx, +# const char * path_lora, +# const char * path_base_model, +# int n_threads); +def llama_apply_lora_from_file( + ctx: llama_context_p, + path_lora: c_char_p, + path_base_model: c_char_p, + n_threads: c_int, +) -> int: + return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) + + +_lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, c_int] +_lib.llama_apply_lora_from_file.restype = c_int + + +# Returns the number of tokens in the KV cache +# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); +def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int: + return _lib.llama_get_kv_cache_token_count(ctx) + + +_lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] +_lib.llama_get_kv_cache_token_count.restype = c_int + + +# Sets the current rng seed. 
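+# (Illustrative usage, not part of llama.h: with an already-initialized context `ctx`,
+#  calling llama_set_rng_seed(ctx, c_int(1234)) makes subsequent sampling reproducible.)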
+# LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed); +def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): + return _lib.llama_set_rng_seed(ctx, seed) + + +_lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int] +_lib.llama_set_rng_seed.restype = None + + +# Returns the maximum size in bytes of the state (rng, logits, embedding +# and kv_cache) - will often be smaller after compacting tokens +# LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx); +def llama_get_state_size(ctx: llama_context_p) -> int: + return _lib.llama_get_state_size(ctx) + + +_lib.llama_get_state_size.argtypes = [llama_context_p] +_lib.llama_get_state_size.restype = c_size_t + + +# Copies the state to the specified destination address. +# Destination needs to have allocated enough memory. +# Returns the number of bytes copied +# LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst); +def llama_copy_state_data( + ctx: llama_context_p, dst # type: Array[c_uint8] +) -> int: + return _lib.llama_copy_state_data(ctx, dst) + + +_lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p] +_lib.llama_copy_state_data.restype = c_size_t + + +# Set the state reading from the specified address +# Returns the number of bytes read +# LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src); +def llama_set_state_data( + ctx: llama_context_p, src # type: Array[c_uint8] +) -> int: + return _lib.llama_set_state_data(ctx, src) + + +_lib.llama_set_state_data.argtypes = [llama_context_p, c_uint8_p] +_lib.llama_set_state_data.restype = c_size_t + + +# Save/load session file +# LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); +def llama_load_session_file( + ctx: llama_context_p, + path_session: bytes, + tokens_out, # type: Array[llama_token] + n_token_capacity: c_size_t, + n_token_count_out, # type: _Pointer[c_size_t] +) -> int: + return _lib.llama_load_session_file( + ctx, path_session, tokens_out, n_token_capacity, n_token_count_out + ) + + +_lib.llama_load_session_file.argtypes = [ + llama_context_p, + c_char_p, + llama_token_p, + c_size_t, + c_size_t_p, +] +_lib.llama_load_session_file.restype = c_size_t + + +# LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); +def llama_save_session_file( + ctx: llama_context_p, + path_session: bytes, + tokens, # type: Array[llama_token] + n_token_count: c_size_t, +) -> int: + return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) + + +_lib.llama_save_session_file.argtypes = [ + llama_context_p, + c_char_p, + llama_token_p, + c_size_t, +] +_lib.llama_save_session_file.restype = c_size_t + + +# Run the llama inference to obtain the logits and probabilities for the next token. 
+# tokens + n_tokens is the provided batch of new tokens to process +# n_past is the number of tokens to use from previous eval calls +# Returns 0 on success +# LLAMA_API int llama_eval( +# struct llama_context * ctx, +# const llama_token * tokens, +# int n_tokens, +# int n_past, +# int n_threads); +def llama_eval( + ctx: llama_context_p, + tokens, # type: Array[llama_token] + n_tokens: c_int, + n_past: c_int, + n_threads: c_int, +) -> int: + return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads) + + +_lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int] +_lib.llama_eval.restype = c_int + + +# Convert the provided text into tokens. +# The tokens pointer must be large enough to hold the resulting tokens. +# Returns the number of tokens on success, no more than n_max_tokens +# Returns a negative number on failure - the number of tokens that would have been returned +# TODO: not sure if correct +# LLAMA_API int llama_tokenize( +# struct llama_context * ctx, +# const char * text, +# llama_token * tokens, +# int n_max_tokens, +# bool add_bos); +def llama_tokenize( + ctx: llama_context_p, + text: bytes, + tokens, # type: Array[llama_token] + n_max_tokens: c_int, + add_bos: c_bool, +) -> int: + return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos) + + +_lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool] +_lib.llama_tokenize.restype = c_int + + +# LLAMA_API int llama_n_vocab(const struct llama_context * ctx); +def llama_n_vocab(ctx: llama_context_p) -> int: + return _lib.llama_n_vocab(ctx) + + +_lib.llama_n_vocab.argtypes = [llama_context_p] +_lib.llama_n_vocab.restype = c_int + + +# LLAMA_API int llama_n_ctx (const struct llama_context * ctx); +def llama_n_ctx(ctx: llama_context_p) -> int: + return _lib.llama_n_ctx(ctx) + + +_lib.llama_n_ctx.argtypes = [llama_context_p] +_lib.llama_n_ctx.restype = c_int + + +# LLAMA_API int llama_n_embd (const struct llama_context * ctx); +def llama_n_embd(ctx: llama_context_p) -> int: + return _lib.llama_n_embd(ctx) + + +_lib.llama_n_embd.argtypes = [llama_context_p] +_lib.llama_n_embd.restype = c_int + + +# Token logits obtained from the last call to llama_eval() +# The logits for the last token are stored in the last row +# Can be mutated in order to change the probabilities of the next token +# Rows: n_tokens +# Cols: n_vocab +# LLAMA_API float * llama_get_logits(struct llama_context * ctx); +def llama_get_logits( + ctx: llama_context_p, +): # type: (...) -> Array[float] # type: ignore + return _lib.llama_get_logits(ctx) + + +_lib.llama_get_logits.argtypes = [llama_context_p] +_lib.llama_get_logits.restype = c_float_p + + +# Get the embeddings for the input +# shape: [n_embd] (1-dimensional) +# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); +def llama_get_embeddings( + ctx: llama_context_p, +): # type: (...) -> Array[float] # type: ignore + return _lib.llama_get_embeddings(ctx) + + +_lib.llama_get_embeddings.argtypes = [llama_context_p] +_lib.llama_get_embeddings.restype = c_float_p + + +# Token Id -> String. 
Uses the vocabulary in the provided context +# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); +def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: + return _lib.llama_token_to_str(ctx, token) + + +_lib.llama_token_to_str.argtypes = [llama_context_p, llama_token] +_lib.llama_token_to_str.restype = c_char_p + +# Special tokens + + +# LLAMA_API llama_token llama_token_bos(); +def llama_token_bos() -> int: + return _lib.llama_token_bos() + + +_lib.llama_token_bos.argtypes = [] +_lib.llama_token_bos.restype = llama_token + + +# LLAMA_API llama_token llama_token_eos(); +def llama_token_eos() -> int: + return _lib.llama_token_eos() + + +_lib.llama_token_eos.argtypes = [] +_lib.llama_token_eos.restype = llama_token + + +# LLAMA_API llama_token llama_token_nl(); +def llama_token_nl() -> int: + return _lib.llama_token_nl() + + +_lib.llama_token_nl.argtypes = [] +_lib.llama_token_nl.restype = llama_token + + +# Sampling functions + + +# @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. +# LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); +def llama_sample_repetition_penalty( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + last_tokens_data, # type: Array[llama_token] + last_tokens_size: c_int, + penalty: c_float, +): + return _lib.llama_sample_repetition_penalty( + ctx, candidates, last_tokens_data, last_tokens_size, penalty + ) + + +_lib.llama_sample_repetition_penalty.argtypes = [ + llama_context_p, + llama_token_data_array_p, + llama_token_p, + c_int, + c_float, +] +_lib.llama_sample_repetition_penalty.restype = None + + +# @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. +# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); +def llama_sample_frequency_and_presence_penalties( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + last_tokens_data, # type: Array[llama_token] + last_tokens_size: c_int, + alpha_frequency: c_float, + alpha_presence: c_float, +): + return _lib.llama_sample_frequency_and_presence_penalties( + ctx, + candidates, + last_tokens_data, + last_tokens_size, + alpha_frequency, + alpha_presence, + ) + + +_lib.llama_sample_frequency_and_presence_penalties.argtypes = [ + llama_context_p, + llama_token_data_array_p, + llama_token_p, + c_int, + c_float, + c_float, +] +_lib.llama_sample_frequency_and_presence_penalties.restype = None + + +# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
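+# (Note: the token-selection helpers further down do not require this to have been called
+#  first; an explicit softmax is mainly useful when you want to inspect the normalised
+#  probabilities yourself.)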
+# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); +def llama_sample_softmax( + ctx: llama_context_p, candidates # type: _Pointer[llama_token_data] +): + return _lib.llama_sample_softmax(ctx, candidates) + + +_lib.llama_sample_softmax.argtypes = [ + llama_context_p, + llama_token_data_array_p, +] +_lib.llama_sample_softmax.restype = None + + +# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep); +def llama_sample_top_k( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + k: c_int, + min_keep: c_size_t, +): + return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) + + +_lib.llama_sample_top_k.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_int, + c_size_t, +] +_lib.llama_sample_top_k.restype = None + + +# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); +def llama_sample_top_p( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + p: c_float, + min_keep: c_size_t, +): + return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) + + +_lib.llama_sample_top_p.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_size_t, +] +_lib.llama_sample_top_p.restype = None + + +# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. +# LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep); +def llama_sample_tail_free( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + z: c_float, + min_keep: c_size_t, +): + return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) + + +_lib.llama_sample_tail_free.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_size_t, +] +_lib.llama_sample_tail_free.restype = None + + +# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. +# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); +def llama_sample_typical( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + p: c_float, + min_keep: c_size_t, +): + return _lib.llama_sample_typical(ctx, candidates, p, min_keep) + + +_lib.llama_sample_typical.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_size_t, +] +_lib.llama_sample_typical.restype = None + + +# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); +def llama_sample_temperature( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + temp: c_float, +): + return _lib.llama_sample_temperature(ctx, candidates, temp) + + +_lib.llama_sample_temperature.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, +] +_lib.llama_sample_temperature.restype = None + + +# @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. 
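+# (Compared to llama_sample_token_mirostat_v2 further down, this 1.0 variant takes an extra
+#  `m` parameter used in the estimation of `s_hat`; the 2.0 variant drops it.)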
+# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. +# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. +# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. +# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. +# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); +def llama_sample_token_mirostat( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + tau: c_float, + eta: c_float, + m: c_int, + mu, # type: _Pointer[c_float] +) -> int: + return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) + + +_lib.llama_sample_token_mirostat.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_float, + c_int, + c_float_p, +] +_lib.llama_sample_token_mirostat.restype = llama_token + + +# @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. +# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. +# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. +# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. +# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 
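+# Rough usage sketch (illustrative names, not part of llama.h): `mu` is read and updated
+# through the pointer, so keep a single ctypes.c_float alive across sampling calls, e.g.
+#   mu = c_float(2.0 * tau_value)
+#   tok = llama_sample_token_mirostat_v2(ctx, candidates_p, c_float(tau_value), c_float(eta_value), ctypes.byref(mu))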
+# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); +def llama_sample_token_mirostat_v2( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + tau: c_float, + eta: c_float, + mu, # type: _Pointer[c_float] +) -> int: + return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) + + +_lib.llama_sample_token_mirostat_v2.argtypes = [ + llama_context_p, + llama_token_data_array_p, + c_float, + c_float, + c_float_p, +] +_lib.llama_sample_token_mirostat_v2.restype = llama_token + + +# @details Selects the token with the highest probability. +# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); +def llama_sample_token_greedy( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] +) -> int: + return _lib.llama_sample_token_greedy(ctx, candidates) + + +_lib.llama_sample_token_greedy.argtypes = [ + llama_context_p, + llama_token_data_array_p, +] +_lib.llama_sample_token_greedy.restype = llama_token + + +# @details Randomly selects a token from the candidates based on their probabilities. +# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); +def llama_sample_token( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] +) -> int: + return _lib.llama_sample_token(ctx, candidates) + + +_lib.llama_sample_token.argtypes = [ + llama_context_p, + llama_token_data_array_p, +] +_lib.llama_sample_token.restype = llama_token + + +# Performance information + + +# LLAMA_API void llama_print_timings(struct llama_context * ctx); +def llama_print_timings(ctx: llama_context_p): + _lib.llama_print_timings(ctx) + + +_lib.llama_print_timings.argtypes = [llama_context_p] +_lib.llama_print_timings.restype = None + + +# LLAMA_API void llama_reset_timings(struct llama_context * ctx); +def llama_reset_timings(ctx: llama_context_p): + _lib.llama_reset_timings(ctx) + + +_lib.llama_reset_timings.argtypes = [llama_context_p] +_lib.llama_reset_timings.restype = None + + +# Print system information +# LLAMA_API const char * llama_print_system_info(void); +def llama_print_system_info() -> bytes: + return _lib.llama_print_system_info() + + +_lib.llama_print_system_info.argtypes = [] +_lib.llama_print_system_info.restype = c_char_p + +################################################################################################### + + +_llama_initialized = False + +if not _llama_initialized: + llama_init_backend() + _llama_initialized = True diff --git a/examples/low_level_api_chat_cpp.py b/examples/low_level_api_chat_cpp.py new file mode 100644 index 0000000000000..5e817243471fb --- /dev/null +++ b/examples/low_level_api_chat_cpp.py @@ -0,0 +1,573 @@ +""" +This is an example implementation of main.cpp from llama.cpp +Quirks: + * Its not exactly alike since this port is designed around programmatic I/O + * Input is always echoed if on, so it should be turned off when using "input()" + * The first antiprompt should be the userprompt like "\nUser:", + because its added when n_predict is reached (aka generation ended prematurely) + * n_predict can be set to -1 for unlimited length responses (or just a really high value) + * Instruction mode adds its own antiprompt. + You should also still be feeding the model with a "primer" prompt that + shows it the expected format. 
+""" +import ctypes +import sys +from time import time +from os import cpu_count, path + +import llama_cpp +from common import GptParams, gpt_params_parse, gpt_random_prompt + +ANSI_COLOR_RESET = "\x1b[0m" +ANSI_COLOR_YELLOW = "\x1b[33m" +ANSI_BOLD = "\x1b[1m" +ANSI_COLOR_GREEN = "\x1b[32m" + +CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET +CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW +CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN + +# Iterative search +# Actively searches and prevents a pattern from being returned +class IterSearch: + def __init__(self, pattern): + self.pattern = list(pattern) + self.buffer = [] + + def __call__(self, char): + self.buffer += [char] + + if (self.pattern[:len(self.buffer)] == self.buffer): + if (len(self.buffer) >= len(self.pattern)): + self.buffer.clear() + return [] + + _tmp = self.buffer[:] + self.buffer.clear() + return _tmp + +# A LLaMA interactive session +class LLaMAInteract: + def __init__(self, params: GptParams) -> None: + # input args + self.params = params + + if (self.params.perplexity): + raise NotImplementedError("""************ +please use the 'perplexity' tool for perplexity calculations +************""") + + if (self.params.embedding): + raise NotImplementedError("""************ +please use the 'embedding' tool for embedding calculations +************""") + + if (self.params.n_ctx > 2048): + print(f"""warning: model does not support \ +context sizes greater than 2048 tokens ({self.params.n_ctx} \ +specified) expect poor results""", file=sys.stderr) + + if (self.params.seed <= 0): + self.params.seed = int(time()) + + print(f"seed = {self.params.seed}", file=sys.stderr) + + if (self.params.random_prompt): + self.params.prompt = gpt_random_prompt(self.params.seed) + + # runtime args + self.input_consumed = 0 + self.n_past = 0 + self.n_session_consumed = 0 + self.first_antiprompt = [] + self.remaining_tokens = self.params.n_predict + self.output_echo = self.params.input_echo + + # model load + self.lparams = llama_cpp.llama_context_default_params() + self.lparams.n_ctx = self.params.n_ctx + self.lparams.n_parts = self.params.n_parts + self.lparams.seed = self.params.seed + self.lparams.memory_f16 = self.params.memory_f16 + self.lparams.use_mlock = self.params.use_mlock + self.lparams.use_mmap = self.params.use_mmap + + self.ctx = llama_cpp.llama_init_from_file(self.params.model.encode("utf8"), self.lparams) + if (not self.ctx): + raise RuntimeError(f"error: failed to load model '{self.params.model}'") + + if (self.params.ignore_eos): + self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf") + + if (len(self.params.lora_adapter) > 0): + if (llama_cpp.llama_apply_lora_from_file( + self.ctx, + self.params.lora_adapter.encode("utf8"), + self.params.lora_base.encode("utf8") if len(self.params.lora_base) > 0 else None, + self.params.n_threads + ) != 0): + print("error: failed to apply lora adapter") + return + + print(file=sys.stderr) + print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ +| {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr) + + # determine the required inference memory per token: + if (self.params.mem_test): + tmp = [0, 1, 2, 3] + llama_cpp.llama_eval(self.ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, self.n_threads) + llama_cpp.llama_print_timings(self.ctx) + self.exit() + return + + # create internal context + self.n_ctx = llama_cpp.llama_n_ctx(self.ctx) + + # Add a space in front of the first character to match OG llama tokenizer behavior + self.params.prompt = " " + 
self.params.prompt + + # Load prompt file + if (self.params.file): + with open(self.params.file) as f: + self.params.prompt = f.read() + + self.session_tokens: list[llama_cpp.llama_token] = [] + if (len(self.params.path_session) > 0): + print(f"attempting to load saved session from '{self.params.path_session}'", file=sys.stderr) + + if (path.exists(self.params.path_session)): + _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))() + _n_token_count_out = llama_cpp.c_size_t() + if (llama_cpp.llama_load_session_file( + self.ctx, + self.params.path_session.encode("utf8"), + _session_tokens, + self.params.n_ctx, + ctypes.byref(_n_token_count_out) + ) != 1): + print(f"error: failed to load session file '{self.params.path_session}'", file=sys.stderr) + return + _n_token_count_out = _n_token_count_out.value + self.session_tokens = _session_tokens[:_n_token_count_out] + print(f"loaded a session with prompt size of {_n_token_count_out} tokens", file=sys.stderr) + else: + print(f"session file does not exist, will create", file=sys.stderr) + + # tokenize the prompt + self.embd = [] + self.embd_inp = self._tokenize(self.params.prompt) + + if (len(self.embd_inp) > self.n_ctx - 4): + raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})") + + # debug message about similarity of saved session, if applicable + self.n_matching_session_tokens = 0 + if len(self.session_tokens) > 0: + for id in self.session_tokens: + if self.n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[self.n_matching_session_tokens]: + break + self.n_matching_session_tokens += 1 + + if self.n_matching_session_tokens >= len(self.embd_inp): + print(f"session file has exact match for prompt!") + elif self.n_matching_session_tokens < (len(self.embd_inp) / 2): + print(f"warning: session file has low similarity to prompt ({self.n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated") + else: + print(f"session file matches {self.n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt") + + self.need_to_save_session = len(self.params.path_session) > 0 and self.n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) + + # number of tokens to keep when resetting context + if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct): + self.params.n_keep = len(self.embd_inp) + + self.inp_prefix = self._tokenize(self.params.instruct_inp_prefix) + self.inp_suffix = self._tokenize(self.params.instruct_inp_suffix, False) + + # in instruct mode, we inject a prefix and a suffix to each input by the user + self.antiecho = None + if (self.params.instruct): + self.params.interactive_start = True + _ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False) + self.first_antiprompt.append(_ptn) + self.antiecho = IterSearch(_ptn) + + # enable interactive mode if reverse prompt or interactive start is specified + if (len(self.params.antiprompt) != 0 or self.params.interactive_start): + self.params.interactive = True + + # determine newline token + self.llama_token_newline = self._tokenize("\n", False) + self.llama_token_eot = self._tokenize(" [end of text]\n", False) + + if (self.params.verbose_prompt): + print(f""" +prompt: '{self.params.prompt}' +number of tokens in prompt = {len(self.embd_inp)}""", file=sys.stderr) + + for i in range(len(self.embd_inp)): + print(f"{self.embd_inp[i]} -> '{llama_cpp.llama_token_to_str(self.ctx, self.embd_inp[i])}'", file=sys.stderr) + + if 
(self.params.n_keep > 0): + print("static prompt based on n_keep: '") + for i in range(self.params.n_keep): + print(llama_cpp.llama_token_to_str(self.ctx, self.embd_inp[i]), file=sys.stderr) + print("'", file=sys.stderr) + print(file=sys.stderr) + + if (self.params.interactive): + print("interactive mode on.", file=sys.stderr) + + if (len(self.params.antiprompt) > 0): + for antiprompt in self.params.antiprompt: + print(f"Reverse prompt: '{antiprompt}'", file=sys.stderr) + + if len(self.params.input_prefix) > 0: + print(f"Input prefix: '{self.params.input_prefix}'", file=sys.stderr) + + print(f"""sampling: repeat_last_n = {self.params.repeat_last_n},\ +repeat_penalty = {self.params.repeat_penalty},\ +presence_penalty = {self.params.presence_penalty},\ +frequency_penalty = {self.params.frequency_penalty},\ +top_k = {self.params.top_k},\ +tfs_z = {self.params.tfs_z},\ +top_p = {self.params.top_p},\ +typical_p = {self.params.typical_p},\ +temp = {self.params.temp},\ +mirostat = {self.params.mirostat},\ +mirostat_lr = {self.params.mirostat_eta},\ +mirostat_ent = {self.params.mirostat_tau},\ + +generate: n_ctx = {self.n_ctx},\ +n_batch = {self.params.n_batch},\ +n_predict = {self.params.n_predict},\ +n_keep = {self.params.n_keep} + +""", file=sys.stderr) + + # determine antiprompt tokens + for i in self.params.antiprompt: + self.first_antiprompt.append(self._tokenize(i, False)) + + self.last_n_tokens = [0]*self.n_ctx #TODO: deque doesnt support slices + + if (params.interactive): + print("""== Running in interactive mode. == + - Press Ctrl+C to interject at any time. + - Press Return to return control to LLaMa. + - If you want to submit another line, end your input in '\\'. + +""", file=sys.stderr) + self.set_color(CONSOLE_COLOR_PROMPT) + + # tokenize a prompt + def _tokenize(self, prompt, bos=True): + _arr = (llama_cpp.llama_token * (len(prompt) + 1))() + _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8", errors="ignore"), _arr, len(_arr), bos) + return _arr[:_n] + + def set_color(self, c): + if (self.params.use_color): + print(c, end="") + + def use_antiprompt(self): + return len(self.first_antiprompt) > 0 + + # generate tokens + def generate(self): + while self.remaining_tokens > 0 or self.params.interactive or self.params.n_predict == -1: + # predict + if len(self.embd) > 0: + # infinite text generation via context swapping + # if we run out of context: + # - take the n_keep first tokens from the original prompt (via n_past) + # - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch + if (self.n_past + len(self.embd) > self.n_ctx): + n_left = self.n_past - self.params.n_keep + self.n_past = self.params.n_keep + + # insert n_left/2 tokens at the start of embd from last_n_tokens + _insert = self.last_n_tokens[ + self.n_ctx - int(n_left/2) - len(self.embd):-len(self.embd) + ] + self.embd = _insert + self.embd + self.params.path_session = "" + + # try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) + # REVIEW + if self.n_session_consumed < len(self.session_tokens): + for i in range(len(self.embd)): + if self.embd[i] != self.session_tokens[self.n_session_consumed]: + self.session_tokens = self.session_tokens[:self.n_session_consumed] + break + + self.n_past += 1 + self.n_session_consumed += 1 + + if self.n_session_consumed >= len(self.session_tokens): + i += 1 + break + + if i > 0: + self.embd = self.embd[i:] + + # evaluate tokens in batches + # embd is typically prepared beforehand to fit within a batch, but not always + 
#TODO BUG: The batching code causes nonsensical generation + """for i in range(0, len(self.embd), self.params.n_batch): + n_eval = self.params.n_batch + _arr = (llama_cpp.llama_token * n_eval)(*self.embd[i:i + n_eval]) + if llama_cpp.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0: + print(f"failed to eval") + return + + self.n_past += n_eval""" + + if (llama_cpp.llama_eval( + self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads + ) != 0): + raise Exception("Failed to llama_eval!") + + if len(self.embd) > 0 and len(self.params.path_session) > 0: + self.session_tokens.extend(self.embd) + self.n_session_consumed = len(self.session_tokens) + + self.n_past += len(self.embd) + self.embd = [] + if len(self.embd_inp) <= self.input_consumed: #&& !is_interacting + # out of user input, sample next token + top_k = llama_cpp.llama_n_vocab(self.ctx) if self.params.top_k <= 0 else self.params.top_k + repeat_last_n = self.n_ctx if self.params.repeat_last_n < 0 else self.params.repeat_last_n + + # optionally save the session on first sample (for faster prompt loading next time) + if len(self.params.path_session) > 0 and self.need_to_save_session: + self.need_to_save_session = False + llama_cpp.llama_save_session_file( + self.ctx, + self.params.path_session.encode("utf8"), + (llama_cpp.llama_token * len(self.session_tokens))(*self.session_tokens), + len(self.session_tokens) + ) + + id = 0 + + logits = llama_cpp.llama_get_logits(self.ctx) + n_vocab = llama_cpp.llama_n_vocab(self.ctx) + + # Apply params.logit_bias map + for key, value in self.params.logit_bias.items(): + logits[key] += value + + _arr = (llama_cpp.llama_token_data * n_vocab)(*[ + llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) + for token_id in range(n_vocab) + ]) + candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False)) + + # Apply penalties + nl_logit = logits[llama_cpp.llama_token_nl()] + last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx) + + _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:]) + llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p, + _arr, + last_n_repeat, llama_cpp.c_float(self.params.repeat_penalty)) + llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p, + _arr, + last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty)) + + if not self.params.penalize_nl: + logits[llama_cpp.llama_token_nl()] = nl_logit + + if self.params.temp <= 0: + # Greedy sampling + id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p) + else: + if self.params.mirostat == 1: + mirostat_mu = 2.0 * self.params.mirostat_tau + mirostat_m = 100 + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) + id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_int(mirostat_m), llama_cpp.c_float(mirostat_mu)) + elif self.params.mirostat == 2: + mirostat_mu = 2.0 * self.params.mirostat_tau + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) + id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu)) + else: + 
# Temperature sampling + llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z),min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p),min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p),min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) + id = llama_cpp.llama_sample_token(self.ctx, candidates_p) + # print("`{}`".format(candidates_p.size)) + + self.last_n_tokens.pop(0) + self.last_n_tokens.append(id) + + # replace end of text token with newline token when in interactive mode + if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct): + id = self.llama_token_newline[0] + self.embd.append(id) + if (self.use_antiprompt()): + # tokenize and inject first reverse prompt + self.embd_inp += self.first_antiprompt[0] + for id in self.first_antiprompt[0]: + self.embd.append(id) + else: + # add it to the context + self.embd.append(id) + + # echo this to console + self.output_echo = True + + # decrement remaining sampling budget + self.remaining_tokens -= 1 + else: + # output to console if input echo is on + self.output_echo = self.params.input_echo + + # some user input remains from prompt or interaction, forward it to processing + while len(self.embd_inp) > self.input_consumed: + self.embd.append(self.embd_inp[self.input_consumed]) + self.last_n_tokens.pop(0) + self.last_n_tokens.append(self.embd_inp[self.input_consumed]) + self.input_consumed += 1 + if len(self.embd) >= self.params.n_batch: + break + + # display tokens + if self.output_echo: + for id in self.embd: + if self.antiecho != None: + for r in self.antiecho(id): + yield r + else: + yield id + + # reset color to default if we there is no pending user input + if (self.params.input_echo and len(self.embd_inp) == self.input_consumed): + self.set_color(CONSOLE_COLOR_DEFAULT) + + if (self.params.interactive and len(self.embd_inp) <= self.input_consumed): + # if antiprompt is present, stop + if (self.use_antiprompt()): + if True in [ + i == self.last_n_tokens[-len(i):] + for i in self.first_antiprompt + ]: + break + + # if we are using instruction mode, and we have processed the initial prompt + if (self.params.interactive_start): + break + + # end of text token + if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(): + if (not self.params.instruct): + for i in self.llama_token_eot: + yield i + break + + # respect n_predict even if antiprompt is present + if (self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1): + # If we arent in instruction mode, fix the current generation by appending the antiprompt. + # Makes it so if chat ends prematurely you dont append the AI's text etc. 
+ if not self.params.instruct: + self.embd_inp += self.first_antiprompt[0] + self.n_remain = self.params.n_predict + break + + self.params.interactive_start = False + + def __enter__(self): + return self + + def __exit__(self, type, value, tb): + self.exit() + + def exit(self): + llama_cpp.llama_free(self.ctx) + self.set_color(CONSOLE_COLOR_DEFAULT) + + # return past text + def past(self): + for id in self.last_n_tokens[-self.n_past:]: + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8", errors="ignore") + + # write input + def input(self, prompt: str): + if (self.params.instruct and self.last_n_tokens[-len(self.inp_prefix):] != self.inp_prefix): + self.embd_inp += self.inp_prefix + self.embd_inp += self._tokenize(prompt) + if (self.params.instruct): + self.embd_inp += self.inp_suffix + + # write output + def output(self): + self.remaining_tokens = self.params.n_predict + for id in self.generate(): + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + + # read user input + def read_input(self): + out = "" + while (t := input()).endswith("\\"): + out += t[:-1] + "\n" + return out + t + "\n" + + # interactive mode + def interact(self): + for i in self.output(): + print(i,end="",flush=True) + self.params.input_echo = False + + while self.params.interactive: + self.set_color(CONSOLE_COLOR_USER_INPUT) + if (self.params.instruct): + print('\n> ', end="") + self.input(self.read_input()) + else: + print(self.params.input_prefix, end="") + self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}") + print(self.params.input_suffix,end="") + self.set_color(CONSOLE_COLOR_DEFAULT) + + try: + for i in self.output(): + print(i,end="",flush=True) + except KeyboardInterrupt: + self.set_color(CONSOLE_COLOR_DEFAULT) + if not self.params.instruct: + print(self.params.fix_prefix,end="") + self.input(self.params.fix_prefix) + +if __name__ == "__main__": + from datetime import datetime + + USER_NAME="User" + AI_NAME="ChatLLaMa" + + time_now = datetime.now() + prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}. +{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}’s requests immediately and with details and precision. +There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other. +The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. +The transcript only includes text, it does not include markup like HTML and Markdown. + +{USER_NAME}: Hello, {AI_NAME}! +{AI_NAME}: Hello {USER_NAME}! How may I help you today? +{USER_NAME}: What time is it? +{AI_NAME}: It is {time_now.strftime("%H:%M")}. +{USER_NAME}: What year is it? +{AI_NAME}: We are in {time_now.strftime("%Y")}. +{USER_NAME}: What is a cat? +{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae. +{USER_NAME}: Name a color. 
+{AI_NAME}: Blue +{USER_NAME}:""" + params = gpt_params_parse() + + with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/low_level_api_llama_cpp.py b/examples/low_level_api_llama_cpp.py new file mode 100644 index 0000000000000..2d1bab3f8f9c8 --- /dev/null +++ b/examples/low_level_api_llama_cpp.py @@ -0,0 +1,102 @@ +import llama_cpp + +import multiprocessing + +import llama_cpp + +N_THREADS = multiprocessing.cpu_count() + +prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n" + +lparams = llama_cpp.llama_context_default_params() +ctx = llama_cpp.llama_init_from_file(b"../models/7B/ggml-model.bin", lparams) + +# determine the required inference memory per token: +tmp = [0, 1, 2, 3] +llama_cpp.llama_eval(ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, N_THREADS) + +n_past = 0 + +prompt = b" " + prompt + +embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))() +n_of_tok = llama_cpp.llama_tokenize(ctx, prompt, embd_inp, len(embd_inp), True) +embd_inp = embd_inp[:n_of_tok] + +n_ctx = llama_cpp.llama_n_ctx(ctx) + +n_predict = 20 +n_predict = min(n_predict, n_ctx - len(embd_inp)) + +input_consumed = 0 +input_noecho = False + +remaining_tokens = n_predict + +embd = [] +last_n_size = 64 +last_n_tokens_data = [0] * last_n_size +n_batch = 24 +last_n_repeat = 64 +repeat_penalty = 1 +frequency_penalty = 0.0 +presence_penalty = 0.0 + +while remaining_tokens > 0: + if len(embd) > 0: + llama_cpp.llama_eval( + ctx, (llama_cpp.c_int * len(embd))(*embd), len(embd), n_past, N_THREADS + ) + + n_past += len(embd) + embd = [] + if len(embd_inp) <= input_consumed: + logits = llama_cpp.llama_get_logits(ctx) + n_vocab = llama_cpp.llama_n_vocab(ctx) + + _arr = (llama_cpp.llama_token_data * n_vocab)(*[ + llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) + for token_id in range(n_vocab) + ]) + candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False)) + + _arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data) + llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p, + _arr, + last_n_repeat, repeat_penalty) + llama_cpp.llama_sample_frequency_and_presence_penalties(ctx, candidates_p, + _arr, + last_n_repeat, frequency_penalty, presence_penalty) + + llama_cpp.llama_sample_top_k(ctx, candidates_p, 40, min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8, min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2) + id = llama_cpp.llama_sample_token(ctx, candidates_p) + + last_n_tokens_data = last_n_tokens_data[1:] + [id] + embd.append(id) + input_noecho = False + remaining_tokens -= 1 + else: + while len(embd_inp) > input_consumed: + embd.append(embd_inp[input_consumed]) + last_n_tokens_data = last_n_tokens_data[1:] + [embd_inp[input_consumed]] + input_consumed += 1 + if len(embd) >= n_batch: + break + if not input_noecho: + for id in embd: + print( + llama_cpp.llama_token_to_str(ctx, id).decode("utf-8", errors="ignore"), + end="", + flush=True, + ) + + if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos(): + break + +print() + +llama_cpp.llama_print_timings(ctx) + +llama_cpp.llama_free(ctx) diff --git a/examples/quantize.py b/examples/quantize.py new file mode 100644 index 0000000000000..8bd03f88a1895 --- /dev/null +++ b/examples/quantize.py @@ -0,0 +1,25 @@ +import os +import argparse +import llama_cpp + + +def main(args): + if not os.path.exists(fname_inp): + raise RuntimeError(f"Input file does not exist ({fname_inp})") + if 
os.path.exists(args.fname_out): + raise RuntimeError(f"Output file already exists ({args.fname_out})") + fname_inp = args.fname_inp.encode("utf-8") + fname_out = args.fname_out.encode("utf-8") + itype = args.type + return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, itype) + if return_code != 0: + raise RuntimeError("Failed to quantize model") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("fname_inp", type=str, help="Path to input model") + parser.add_argument("fname_out", type=str, help="Path to output model") + parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1)") + args = parser.parse_args() + main(args)
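For reference, examples/quantize.py is only a thin wrapper around the llama_cpp.llama_model_quantize binding it calls. A minimal sketch of invoking that binding directly (the model paths below are placeholders; the itype values follow the script's own help text):

    import llama_cpp

    # Quantize a ggml model file: itype 2 -> q4_0, itype 3 -> q4_1.
    fname_inp = b"./models/7B/ggml-model.bin"       # placeholder input path
    fname_out = b"./models/7B/ggml-model-q4_0.bin"  # placeholder output path

    return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, 2)
    if return_code != 0:
        raise RuntimeError("Failed to quantize model")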