
llama 4 wip #24


Draft · wants to merge 32 commits into master from xsn/mtmd_llama4_new
Commits (32)
c912c67
wip llama 4 conversion
ngxson May 2, 2025
a67a1be
rm redundant __init__
ngxson May 2, 2025
10db58b
fix conversion
ngxson May 2, 2025
55ad3a6
Merge branch 'master' into xsn/mtmd_llama4_new
ngxson May 2, 2025
c50e627
fix conversion
ngxson May 2, 2025
8775bc4
test impl
ngxson May 2, 2025
7341e70
try this
ngxson May 3, 2025
893ad9c
reshape patch_embeddings_0
ngxson May 3, 2025
15605e4
fix view
ngxson May 3, 2025
32a62d1
rm ffn_post_norm
ngxson May 3, 2025
97a5cd1
cgraph ok
ngxson May 3, 2025
c6c2d66
f32 for pos embd
ngxson May 3, 2025
224cbba
add image marker tokens
ngxson May 3, 2025
9d1a4d6
Llama4UnfoldConvolution
ngxson May 3, 2025
532c332
correct pixel shuffle
ngxson May 3, 2025
844a344
Merge branch 'master' into xsn/mtmd_llama4_new
ngxson May 17, 2025
8caeed5
fix merge conflicts
ngxson May 17, 2025
2ffafd5
correct
ngxson May 18, 2025
7d9d4e3
add debug_graph
ngxson May 18, 2025
919318e
logits matched, but it still perceives the image incorrectly
ngxson May 18, 2025
5b81972
fix style
ngxson May 18, 2025
b74122f
add image_grid_pinpoints
ngxson May 18, 2025
4217d42
handle llama 4 preprocessing
ngxson May 18, 2025
3645fe0
rm load_image_size
ngxson May 18, 2025
791f106
rm unused line
ngxson May 18, 2025
8646240
fix
ngxson May 18, 2025
53fb622
small fix 2
ngxson May 18, 2025
f083c19
add test & docs
ngxson May 18, 2025
b199e70
fix llava-1.6 test
ngxson May 18, 2025
e52481b
test: add notion of huge models
ngxson May 18, 2025
186d7a8
add comment
ngxson May 19, 2025
d5e50aa
add warn about degraded quality
ngxson May 19, 2025
21 changes: 21 additions & 0 deletions convert_hf_to_gguf.py
@@ -308,6 +308,7 @@ def prepare_tensors(self):
gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
gguf.MODEL_TENSOR.POSNET_NORM1,
gguf.MODEL_TENSOR.POSNET_NORM2,
gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
)
)
or not new_name.endswith(".weight")
@@ -2092,6 +2093,26 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
return super().modify_tensors(data_torch, name, bid)


@ModelBase.register("Llama4ForConditionalGeneration")
class Llama4VisionModel(VisionModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.LLAMA4)
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"])
self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"]))
assert self.hparams["hidden_act"] == "gelu"
self.gguf_writer.add_vision_use_gelu(True)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
if "multi_modal_projector" in name or "vision_model" in name:
# process vision tensors
if "positional_embedding_vlm" in name and ".weight" not in name:
name += ".weight"
return [(self.map_tensor_name(name), data_torch)]
return []


@ModelBase.register("Mistral3ForConditionalGeneration")
class Mistral3Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.LLAMA
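The new `Llama4VisionModel.modify_tensors()` hook forwards only vision-tower and projector tensors and appends the `.weight` suffix that `positional_embedding_vlm` lacks in the upstream checkpoint. A minimal standalone sketch of that filtering rule, lifted out of the converter for illustration (`filter_vision_tensor` is a hypothetical helper, not part of `convert_hf_to_gguf.py`, and the sample names are illustrative):

```python
# Sketch of the modify_tensors() filtering rule above, outside the converter.
# filter_vision_tensor() is a hypothetical helper for illustration only.

def filter_vision_tensor(name: str) -> str | None:
    """Return the (possibly fixed-up) name for vision/projector tensors,
    or None for text-model tensors that this class skips."""
    if "multi_modal_projector" not in name and "vision_model" not in name:
        return None  # text-model tensor: handled by the language-model class
    # positional_embedding_vlm is stored without a suffix upstream;
    # the GGUF-side mapping expects a ".weight" name
    if "positional_embedding_vlm" in name and ".weight" not in name:
        name += ".weight"
    return name

assert filter_vision_tensor("language_model.embed_tokens.weight") is None
assert (filter_vision_tensor("vision_model.positional_embedding_vlm")
        == "vision_model.positional_embedding_vlm.weight")
```

Note also how `set_gguf_parameters()` derives the projector scale factor from the config: a `pixel_shuffle_ratio` of 0.5, for example, yields `int(1.0 / 0.5) == 2`.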
3 changes: 3 additions & 0 deletions docs/multimodal.md
@@ -74,4 +74,7 @@ NOTE: some models may require large context window, for example: `-c 8192`
(tool_name) -hf ggml-org/InternVL3-2B-Instruct-GGUF
(tool_name) -hf ggml-org/InternVL3-8B-Instruct-GGUF
(tool_name) -hf ggml-org/InternVL3-14B-Instruct-GGUF

# Llama 4 Scout
(tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF
```
20 changes: 12 additions & 8 deletions gguf-py/gguf/constants.py
@@ -482,14 +482,15 @@ class MODEL_TENSOR(IntEnum):
V_ENC_EMBD_CLS = auto()
V_ENC_EMBD_PATCH = auto()
V_ENC_EMBD_POS = auto()
V_ENC_INPUT_NORM = auto()
V_ENC_ATTN_Q = auto()
V_ENC_ATTN_Q_NORM = auto()
V_ENC_ATTN_K = auto()
V_ENC_ATTN_K_NORM = auto()
V_ENC_ATTN_V = auto()
V_ENC_INPUT_NORM = auto()
V_ENC_OUTPUT = auto()
V_ENC_OUTPUT_NORM = auto()
V_ENC_ATTN_O = auto()
V_ENC_ATTN_O_NORM = auto()
V_ENC_POST_ATTN_NORM = auto()
V_ENC_FFN_UP = auto()
V_ENC_FFN_GATE = auto()
V_ENC_FFN_DOWN = auto()
@@ -749,8 +750,9 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.V_ENC_ATTN_K_NORM: "v.blk.{bid}.attn_k_norm",
MODEL_TENSOR.V_ENC_ATTN_V: "v.blk.{bid}.attn_v",
MODEL_TENSOR.V_ENC_INPUT_NORM: "v.blk.{bid}.ln1",
MODEL_TENSOR.V_ENC_OUTPUT: "v.blk.{bid}.attn_out",
MODEL_TENSOR.V_ENC_OUTPUT_NORM: "v.blk.{bid}.ln2",
MODEL_TENSOR.V_ENC_ATTN_O: "v.blk.{bid}.attn_out",
MODEL_TENSOR.V_ENC_ATTN_O_NORM: "v.blk.{bid}.attn_out_norm",
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: "v.blk.{bid}.ln2",
MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up",
MODEL_TENSOR.V_ENC_FFN_GATE: "v.blk.{bid}.ffn_gate",
MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down",
@@ -785,14 +787,15 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.V_ENC_EMBD_CLS,
MODEL_TENSOR.V_ENC_EMBD_PATCH,
MODEL_TENSOR.V_ENC_EMBD_POS,
MODEL_TENSOR.V_ENC_INPUT_NORM,
MODEL_TENSOR.V_ENC_ATTN_Q,
MODEL_TENSOR.V_ENC_ATTN_Q_NORM,
MODEL_TENSOR.V_ENC_ATTN_K,
MODEL_TENSOR.V_ENC_ATTN_K_NORM,
MODEL_TENSOR.V_ENC_ATTN_V,
MODEL_TENSOR.V_ENC_INPUT_NORM,
MODEL_TENSOR.V_ENC_OUTPUT,
MODEL_TENSOR.V_ENC_OUTPUT_NORM,
MODEL_TENSOR.V_ENC_ATTN_O,
MODEL_TENSOR.V_ENC_ATTN_O_NORM,
MODEL_TENSOR.V_ENC_POST_ATTN_NORM,
MODEL_TENSOR.V_ENC_FFN_UP,
MODEL_TENSOR.V_ENC_FFN_GATE,
MODEL_TENSOR.V_ENC_FFN_DOWN,
@@ -2180,6 +2183,7 @@ class VisionProjectorType:
GEMMA3 = "gemma3"
IDEFICS3 = "idefics3"
PIXTRAL = "pixtral"
LLAMA4 = "llama4"
QWEN2VL = "qwen2vl_merger"
QWEN25VL = "qwen2.5vl_merger"
INTERNVL = "internvl"
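The renamed members (`V_ENC_ATTN_O`, `V_ENC_ATTN_O_NORM`, `V_ENC_POST_ATTN_NORM`) are per-block name templates: each expands once per encoder layer via the `{bid}` placeholder. A toy sketch of that expansion, where the dict is a stand-in for gguf-py's real `TENSOR_NAMES` table:

```python
# Toy stand-in for gguf-py's TENSOR_NAMES table, showing how the per-block
# templates above expand; not the real table object.
TENSOR_NAMES = {
    "V_ENC_ATTN_O":         "v.blk.{bid}.attn_out",
    "V_ENC_ATTN_O_NORM":    "v.blk.{bid}.attn_out_norm",
    "V_ENC_POST_ATTN_NORM": "v.blk.{bid}.ln2",
}

for bid in range(2):  # first two encoder blocks
    for enum_name, template in TENSOR_NAMES.items():
        print(enum_name, "->", template.format(bid=bid))
# e.g. V_ENC_ATTN_O -> v.blk.0.attn_out
#      V_ENC_POST_ATTN_NORM -> v.blk.1.ln2
```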
19 changes: 17 additions & 2 deletions gguf-py/gguf/tensor_mapping.py
@@ -902,10 +902,12 @@ class TensorNameMap:

MODEL_TENSOR.V_MMPROJ_FC: (
"model.connector.modality_projection.proj", # SmolVLM
"multi_modal_projector.linear_1", # llama 4
),

MODEL_TENSOR.V_MMPROJ_MLP: (
"model.mm_projector.mlp.mlp.{bid}",
"vision_model.vision_adapter.mlp.fc{bid}", # llama 4
"mlp1.{bid}", # InternVL
),

@@ -915,26 +917,30 @@

MODEL_TENSOR.V_ENC_EMBD_CLS: (
"vision_tower.vision_model.embeddings.class_embedding",
"vision_model.class_embedding", # llama 4
),

MODEL_TENSOR.V_ENC_EMBD_PATCH: (
"vision_tower.vision_model.embeddings.patch_embedding",
"vpm.embeddings.patch_embedding",
"model.vision_model.embeddings.patch_embedding", # SmolVLM
"vision_tower.patch_conv", # pixtral
"vision_model.patch_embedding.linear", # llama 4
"visual.patch_embed.proj", # qwen2vl
),

MODEL_TENSOR.V_ENC_EMBD_POS: (
"vision_tower.vision_model.embeddings.position_embedding",
"vpm.embeddings.position_embedding",
"model.vision_model.embeddings.position_embedding", # SmolVLM
"vision_model.positional_embedding_vlm", # llama 4
),

MODEL_TENSOR.V_ENC_ATTN_Q: (
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
"vpm.encoder.layers.{bid}.self_attn.q_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
"vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral
"visual.blocks.{bid}.attn.q", # qwen2vl, generated
),
@@ -947,6 +953,7 @@ class TensorNameMap:
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
"vpm.encoder.layers.{bid}.self_attn.k_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
"vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral
"visual.blocks.{bid}.attn.k", # qwen2vl, generated
),
@@ -959,6 +966,7 @@ class TensorNameMap:
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
"vpm.encoder.layers.{bid}.self_attn.v_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
"vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral
"visual.blocks.{bid}.attn.v", # qwen2vl, generated
),
@@ -969,23 +977,26 @@
"vpm.encoder.layers.{bid}.layer_norm1",
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
"vision_model.model.layers.{bid}.input_layernorm", # llama4
"visual.blocks.{bid}.norm1", # qwen2vl
),

MODEL_TENSOR.V_ENC_OUTPUT: (
MODEL_TENSOR.V_ENC_ATTN_O: (
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
"vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL
"vpm.encoder.layers.{bid}.self_attn.out_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
"vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral
"visual.blocks.{bid}.attn.proj", # qwen2vl
),

MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
"vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL
"vpm.encoder.layers.{bid}.layer_norm2",
"model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
"vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
"vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral
"visual.blocks.{bid}.norm2", # qwen2vl
),
@@ -995,6 +1006,7 @@
"vpm.encoder.layers.{bid}.mlp.fc1",
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
"vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
"vision_model.model.layers.{bid}.mlp.fc1", # llama4
"visual.blocks.{bid}.mlp.fc1", # qwen2vl
"visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
),
@@ -1009,6 +1021,7 @@
"vpm.encoder.layers.{bid}.mlp.fc2",
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
"vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
"vision_model.model.layers.{bid}.mlp.fc2", # llama4
"visual.blocks.{bid}.mlp.fc2", # qwen2vl
"visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
),
@@ -1024,11 +1037,13 @@
MODEL_TENSOR.V_PRE_NORM: (
"vision_tower.vision_model.pre_layrnorm",
"vision_tower.ln_pre", # pixtral
"vision_model.layernorm_pre", # llama4
),

MODEL_TENSOR.V_POST_NORM: (
"vision_tower.vision_model.post_layernorm",
"model.vision_model.post_layernorm", # SmolVLM
"vision_model.layernorm_post", # llama4
"visual.merger.ln_q", # qwen2vl
),

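To see how these entries are consumed: the converter turns a checkpoint name such as `vision_model.model.layers.3.self_attn.o_proj.weight` into `v.blk.3.attn_out.weight` by generalizing the block index to `{bid}`, looking up the template, and substituting the index back. A simplified sketch of that lookup — the real `TensorNameMap` precomputes a reverse index and handles more prefix/suffix cases; the two-entry table here is illustrative:

```python
import re

# Simplified sketch of the HF-name -> GGUF-name lookup; illustrative only.
HF_TO_GGUF = {
    "vision_model.model.layers.{bid}.self_attn.o_proj":         "v.blk.{bid}.attn_out",
    "vision_model.model.layers.{bid}.post_attention_layernorm": "v.blk.{bid}.ln2",
}

def map_name(hf_name: str) -> str | None:
    base, _, suffix = hf_name.rpartition(".")   # split off ".weight"/".bias"
    m = re.search(r"\.(\d+)\.", base)           # find the block index
    if m is None:
        return None
    generic = base.replace(f".{m.group(1)}.", ".{bid}.")
    template = HF_TO_GGUF.get(generic)
    if template is None:
        return None
    return f"{template.format(bid=m.group(1))}.{suffix}"

assert (map_name("vision_model.model.layers.3.self_attn.o_proj.weight")
        == "v.blk.3.attn_out.weight")
```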
74 changes: 73 additions & 1 deletion tools/mtmd/clip-impl.h
@@ -4,6 +4,7 @@

#include <climits>
#include <cstdarg>
#include <cinttypes>
#include <string>
#include <map>
#include <sstream>
@@ -44,7 +45,7 @@
// tensor name constants
//

#define TN_POS_EMBD "%s.position_embd.weight"
#define TN_POS_EMBD "v.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight" // do not rename tensor with ".0" postfix, for backward compat
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
@@ -110,6 +111,7 @@ enum projector_type {
PROJECTOR_TYPE_PIXTRAL,
PROJECTOR_TYPE_QWEN25VL,
PROJECTOR_TYPE_INTERNVL,
PROJECTOR_TYPE_LLAMA4,
PROJECTOR_TYPE_UNKNOWN,
};

@@ -125,6 +127,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
{ PROJECTOR_TYPE_INTERNVL, "internvl"},
{ PROJECTOR_TYPE_LLAMA4, "llama4"},
};

static projector_type clip_projector_type_from_string(const std::string & str) {
@@ -240,6 +243,11 @@ struct clip_image_u8_batch {
struct clip_image_f32_batch {
std::vector<clip_image_f32_ptr> entries;

// for llava-uhd style models, we need to know the grid size
// note: entries.size() == grid_x * grid_y + 1 (one overview image)
int grid_x = 0;
int grid_y = 0;

clip_image_f32_batch clone() const {
clip_image_f32_batch new_batch;
new_batch.entries.reserve(entries.size());
@@ -358,6 +366,70 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
}
}

//
// debugging
//

static void print_tensor_shape(ggml_tensor * t) {
printf("%s.shape = [", t->name);
for (int i = 0; i < ggml_n_dims(t); ++i) {
printf("%" PRId64, t->ne[i]);
if (i < ggml_n_dims(t) - 1) {
printf(", ");
}
}
printf("]\n");
}

static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
ggml_type type = t->type;
int64_t * ne = t->ne;
size_t * nb = t->nb;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
printf("%s.data: [\n", t->name);
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
if (i2 == n && ne[2] > 2*n) {
printf(" ..., \n");
i2 = ne[2] - n;
}
printf(" [\n");
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
if (i1 == n && ne[1] > 2*n) {
printf(" ..., \n");
i1 = ne[1] - n;
}
printf(" [");
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
if (i0 == n && ne[0] > 2*n) {
printf("..., ");
i0 = ne[0] - n;
}
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v;
if (type == GGML_TYPE_F16) {
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) {
v = *(float *) &data[i];
} else if (type == GGML_TYPE_I32) {
v = (float) *(int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) {
v = (float) *(int16_t *) &data[i];
} else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) &data[i];
} else {
GGML_ABORT("fatal error");
}
printf("%8.4f", v);
if (i0 < ne[0] - 1) printf(", ");
}
printf("],\n");
}
printf(" ],\n");
}
printf(" ]\n");
}
}

//
// API used internally with mtmd
//
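Returning to the `print_tensor_data` helper added above: it elides interior elements NumPy-style, printing the first `n` and last `n` entries of each dimension with `...` in between. A Python rendering of that per-row elision rule, illustrative only — the C++ helper walks raw tensor bytes using the strides in `nb` and converts per element type:

```python
# Python rendering of the per-row elision rule in print_tensor_data above;
# illustrative only, not a translation of the byte/stride handling.

def format_row(row: list[float], n: int = 3) -> str:
    if len(row) <= 2 * n:  # short row: print everything
        return "[" + ", ".join(f"{v:8.4f}" for v in row) + "]"
    head = ", ".join(f"{v:8.4f}" for v in row[:n])
    tail = ", ".join(f"{v:8.4f}" for v in row[-n:])
    return f"[{head}, ..., {tail}]"

print(format_row([float(i) for i in range(10)]))
# [  0.0000,   1.0000,   2.0000, ...,   7.0000,   8.0000,   9.0000]
```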