Commit ad2ea65

add mul_mat_q parameter
This also fixes a crash when loading the 70B LLaMA 2 model. The parameter was introduced upstream in ggml-org/llama.cpp#2453 (`0728c5a8`).
1 parent 91bf8fa

File tree

1 file changed (+2, -0)

llama_cpp/llama_cpp.py

Lines changed: 2 additions & 0 deletions
@@ -181,6 +181,7 @@ class llama_token_data_array(Structure):
 # // Keep the booleans together to avoid misalignment during copy-by-value.
 # bool low_vram;   // if true, reduce VRAM usage at the cost of performance
+# bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
 # bool f16_kv;     // use fp16 for KV cache
 # bool logits_all; // the llama_eval() call computes all logits, not just the last one
 # bool vocab_only; // only load the vocabulary, no weights
@@ -203,6 +204,7 @@ class llama_context_params(Structure):
         ("progress_callback", llama_progress_callback),
         ("progress_callback_user_data", c_void_p),
         ("low_vram", c_bool),
+        ("mul_mat_q", c_bool),
         ("f16_kv", c_bool),
         ("logits_all", c_bool),
         ("vocab_only", c_bool),
