
Commit ecda2ec

mtmd : Support Pixtral 12B (#13065)

* add pixtral text model (vision is wip)
* cgraph ok, just missing 2D RoPE
* fix bad rebase
* first working version
* fix problem with img_break token
* support dynamic image size
* update docs
* update test script

1 parent eb1776b commit ecda2ec

14 files changed (+644, -32 lines)

convert_hf_to_gguf.py (+64, -2)
@@ -776,6 +776,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
             # ref: https://huggingface.co/THUDM/glm-4-9b-hf
             res = "glm4"
+        if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":
+            # ref: https://huggingface.co/mistral-community/pixtral-12b
+            res = "pixtral"

         if res is None:
             logger.warning("\n")
@@ -1724,7 +1727,8 @@ def prepare_tensors(self):
     "MistralForCausalLM",
     "MixtralForCausalLM",
     "Idefics3ForConditionalGeneration",
-    "SmolVLMForConditionalGeneration")
+    "SmolVLMForConditionalGeneration",
+    "LlavaForConditionalGeneration")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
     undo_permute = True
@@ -1734,6 +1738,10 @@ def __init__(self, *args, **kwargs):
         # fix for SmolVLM2, missing `num_attention_heads` in config.json
         if self.hparams["architectures"][0] == "SmolVLMForConditionalGeneration":
             self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
+        # fix for Pixtral, missing `num_attention_heads` in config.json
+        if self.hparams["architectures"][0] == "LlavaForConditionalGeneration" \
+                and self.hparams.get("model_type") == "mistral":
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)

     def set_vocab(self):
         try:
@@ -1797,12 +1805,17 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
-        is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name
+        is_vision_tensor = "vision_tower" in name \
+            or "vision_model" in name \
+            or "model.connector" in name \
+            or "multi_modal_projector" in name

         if is_vision_tensor:
             return [] # skip vision tensors
         elif name.startswith("model.text_model"):
             name = name.replace("text_model.", "") # for SmolVLM
+        elif name.startswith("language_model."):
+            name = name.replace("language_model.", "") # for the rest

         if self.undo_permute:
             if name.endswith(("q_proj.weight", "q_proj.bias")):
@@ -1885,6 +1898,55 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("LlavaForConditionalGeneration")
+class LlavaVisionModel(VisionModel):
+    img_break_tok_id = -1
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if self.hparams["model_type"] == "pixtral":
+            # fix missing config.json values
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
+            self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 24)
+            self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 4096)
+            self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1024)
+            self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
+            self.img_break_tok_id = 12 # see tokenizer_config.json
+        else:
+            raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if hparams["model_type"] == "pixtral":
+            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL)
+            # default values below are taken from HF transformers code
+            self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
+            self.gguf_writer.add_vision_use_silu(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = n_head
+
+        if name.startswith("multi_modal_projector.") or name.startswith("vision_tower."):
+            # process vision tensors
+            if name.endswith(("q_proj.weight", "q_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+            if name.endswith(("k_proj.weight", "k_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+            return [(self.map_tensor_name(name), data_torch)]
+
+        if self.img_break_tok_id > 0 and "embed_tokens.weight" in name:
+            logger.info(f"Extracting [IMG_BREAK] token embedding from {name}")
+            # for pixtral model, we need to extract the [IMG_BREAK] token embedding
+            img_break_embd = data_torch[self.img_break_tok_id]
+            name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK]
+            return [(self.map_tensor_name(name), img_break_embd)]
+
+        return [] # skip other tensors
+
+
 @ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
 class SmolVLMModel(VisionModel):
     def __init__(self, *args, **kwargs):
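Taken together, the changes to this file give `convert_hf_to_gguf.py` two paths over the same Pixtral checkpoint: `LlamaModel` (now also registered for `LlavaForConditionalGeneration`) converts the text weights and skips the vision tensors, while the new `LlavaVisionModel` keeps only the vision encoder, the projector and the `[IMG_BREAK]` embedding. A rough sketch of how the two paths are typically invoked; the local checkpoint path and output filenames are placeholders, and exact flags may differ between llama.cpp revisions:

```sh
# sketch only: convert a local transformers-format Pixtral checkpoint twice

# 1) text model (LlamaModel path; vision tensors are skipped)
python convert_hf_to_gguf.py ./pixtral-12b --outfile pixtral-12b-f16.gguf

# 2) vision encoder + projector (LlavaVisionModel path; --mmproj produces the mmproj file)
python convert_hf_to_gguf.py ./pixtral-12b --outfile mmproj-pixtral-12b.gguf --mmproj
```

Check `python convert_hf_to_gguf.py --help` for the options available in your checkout.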

convert_hf_to_gguf_update.py (+1)
@@ -115,6 +115,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
     {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
     {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
+    {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
 ]

docs/multimodal/gemma3.md (+6, -6)
@@ -11,15 +11,15 @@ You can use pre-quantized model from [ggml-org](https://huggingface.co/ggml-org)
 ```bash
 # build
 cmake -B build
-cmake --build build --target llama-gemma3-cli
+cmake --build build --target llama-mtmd-cli

 # alternatively, install from brew (MacOS)
 brew install llama.cpp

 # run it
-llama-gemma3-cli -hf ggml-org/gemma-3-4b-it-GGUF
-llama-gemma3-cli -hf ggml-org/gemma-3-12b-it-GGUF
-llama-gemma3-cli -hf ggml-org/gemma-3-27b-it-GGUF
+llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF
+llama-mtmd-cli -hf ggml-org/gemma-3-12b-it-GGUF
+llama-mtmd-cli -hf ggml-org/gemma-3-27b-it-GGUF

 # note: 1B model does not support vision
 ```
@@ -44,8 +44,8 @@ What you need:
 ```bash
 # build
 cmake -B build
-cmake --build build --target llama-gemma3-cli
+cmake --build build --target llama-mtmd-cli

 # run it
-./build/bin/llama-gemma3-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image.jpg
+./build/bin/llama-mtmd-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image.jpg
 ```

examples/llava/README.md (+28)
@@ -14,6 +14,28 @@ The naming and structure related to multimodal support have evolved, which might
 - [#12849](https://github.com/ggml-org/llama.cpp/pull/12849): `libmtmd` was introduced as a replacement for `llava.cpp`. Its goals include providing a single, unified command-line interface, improving the user/developer experience (UX/DX), and supporting both audio and image inputs.
 - [#13012](https://github.com/ggml-org/llama.cpp/pull/13012): `mtmd-cli` was added, consolidating the various model-specific CLIs into a single tool powered by `libmtmd`.

+## Pre-quantized models
+
+These are ready-to-use models, most of which come with `Q4_K_M` quantization by default:
+
+```sh
+# Gemma 3
+llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF
+llama-mtmd-cli -hf ggml-org/gemma-3-12b-it-GGUF
+llama-mtmd-cli -hf ggml-org/gemma-3-27b-it-GGUF
+
+# SmolVLM
+llama-mtmd-cli -hf ggml-org/SmolVLM-Instruct-GGUF
+llama-mtmd-cli -hf ggml-org/SmolVLM-256M-Instruct-GGUF
+llama-mtmd-cli -hf ggml-org/SmolVLM-500M-Instruct-GGUF
+llama-mtmd-cli -hf ggml-org/SmolVLM2-2.2B-Instruct-GGUF
+llama-mtmd-cli -hf ggml-org/SmolVLM2-256M-Video-Instruct-GGUF
+llama-mtmd-cli -hf ggml-org/SmolVLM2-500M-Video-Instruct-GGUF
+
+# Pixtral 12B
+llama-mtmd-cli -hf ggml-org/pixtral-12b-GGUF
+```
+
 ## How it works and what is `mmproj`?

 Multimodal support in `llama.cpp` works by encoding images into embeddings using a separate model component, and then feeding these embeddings into the language model.
@@ -45,3 +67,9 @@ Multimodal projector (`mmproj`) files are specific to each model architecture. P
 - [MiniCPM-o 2.6](../../docs/multimodal/minicpmo2.6.md)
 - [IBM Granite Vision](../../docs/multimodal/granitevision.md)
 - [Google Gemma 3](../../docs/multimodal/gemma3.md)
+
+For the following models, you can use `convert_hf_to_gguf.py` with the `--mmproj` flag to get the `mmproj` file:
+- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) - Note: 1B variant does not have vision support
+- SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
+- SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB))
+- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with a `transformers`-compatible checkpoint
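A pair of files converted locally this way can then presumably be run the same way as in the updated Gemma 3 docs above; the filenames below are placeholders:

```sh
llama-mtmd-cli -m pixtral-12b-f16.gguf --mmproj mmproj-pixtral-12b.gguf --image your_image.jpg
```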

examples/llava/clip-impl.h (+4)
@@ -60,6 +60,7 @@
 #define TN_ATTN_V "%s.blk.%d.attn_v.%s"
 #define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
 #define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
+#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s"
 #define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
 #define TN_LN_1 "%s.blk.%d.ln1.%s"
 #define TN_LN_2 "%s.blk.%d.ln2.%s"
@@ -73,6 +74,7 @@
 #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
 #define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
+#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral

 // mimicpmv
 #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
@@ -101,6 +103,7 @@ enum projector_type {
     PROJECTOR_TYPE_MERGER,
     PROJECTOR_TYPE_GEMMA3,
     PROJECTOR_TYPE_IDEFICS3,
+    PROJECTOR_TYPE_PIXTRAL,
     PROJECTOR_TYPE_UNKNOWN,
 };

@@ -113,6 +116,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
     { PROJECTOR_TYPE_GEMMA3, "gemma3"},
     { PROJECTOR_TYPE_IDEFICS3, "idefics3"},
+    { PROJECTOR_TYPE_PIXTRAL, "pixtral"},
 };

 static projector_type clip_projector_type_from_string(const std::string & str) {
