@@ -14,6 +14,7 @@ class TensorNameMap:
             "transformer.word_embeddings",               # falcon
             "word_embeddings",                           # bloom
             "model.embed_tokens",                        # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414 plamo2 granite-hybrid
+            "model.language_model.embed_tokens",         # glm-4-thinking
             "tok_embeddings",                            # llama-pth
             "embeddings.word_embeddings",                # bert nomic-bert
             "language_model.embedding.word_embeddings",  # persimmon
@@ -94,6 +95,7 @@ class TensorNameMap:
             "model.ln_out",               # rwkv7
             "backbone.final_layer_norm",  # wavtokenizer
             "model.norm",                 # llama4
+            "model.language_model.norm",  # glm-4-thinking
         ),

         # Rope frequencies
@@ -139,6 +141,7 @@ class TensorNameMap:
             "model.layers.{bid}.input_layernorm",                 # llama4
             "transformer_encoder.{bid}.attention_norm",           # neobert
             "model.layers.{bid}.operator_norm",                   # lfm2
+            "model.language_model.layers.{bid}.input_layernorm",  # glm-4-thinking
         ),

         # Attention norm 2
@@ -183,6 +186,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.query",  # Grok
             "transformer.h.{bid}.attn.attention.q_proj",                    # exaone
             "model.layers.{bid}.self_attn.q_proj",                          # llama4
+            "model.language_model.layers.{bid}.self_attn.q_proj",           # glm-4-thinking
         ),

         # Attention key
@@ -199,6 +203,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.key",  # Grok
             "transformer.h.{bid}.attn.attention.k_proj",                  # exaone
             "model.layers.{bid}.self_attn.k_proj",                        # llama4
+            "model.language_model.layers.{bid}.self_attn.k_proj",         # glm-4-thinking
         ),

         # Attention value
@@ -214,6 +219,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.value",  # Grok
             "transformer.h.{bid}.attn.attention.v_proj",                    # exaone
             "model.layers.{bid}.self_attn.v_proj",                          # llama4
+            "model.language_model.layers.{bid}.self_attn.v_proj",           # glm-4-thinking
         ),

         # Attention output
@@ -246,6 +252,7 @@ class TensorNameMap:
             "transformer.h.{bid}.attn.attention.out_proj",         # exaone
             "model.layers.{bid}.self_attn.o_proj",                 # llama4
             "transformer_encoder.{bid}.wo",                        # neobert
+            "model.language_model.layers.{bid}.self_attn.o_proj",  # glm-4-thinking
         ),

         # Attention output norm
@@ -258,9 +265,10 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.ATTN_POST_NORM: (
-            "model.layers.{bid}.post_attention_layernorm",                # gemma2 olmo2 # ge
-            "model.layers.{bid}.post_self_attn_layernorm",                # glm-4-0414
-            "model.layers.layers.{bid}.post_mixer_norm.weight",           # plamo2
+            "model.layers.{bid}.post_attention_layernorm",                 # gemma2 olmo2 # ge
+            "model.layers.{bid}.post_self_attn_layernorm",                 # glm-4-0414
+            "model.layers.layers.{bid}.post_mixer_norm.weight",            # plamo2
+            "model.language_model.layers.{bid}.post_self_attn_layernorm",  # glm-4-thinking
         ),

         # Rotary embeddings
@@ -291,6 +299,7 @@ class TensorNameMap:
             "model.layers.{bid}.post_attention_layernorm",                 # llama4
             "transformer_encoder.{bid}.ffn_norm",                          # neobert
             "model.layers.layers.{bid}.pre_mlp_norm",                      # plamo2
+            "model.language_model.layers.{bid}.post_attention_layernorm",  # glm-4-thinking
         ),

         # Post feed-forward norm
@@ -305,6 +314,7 @@ class TensorNameMap:
             "model.layers.{bid}.post_mlp_layernorm",                 # glm-4-0414
             "model.layers.layers.{bid}.post_mlp_norm.weight",        # plamo2
             "model.layers.{bid}.feed_forward.up_proj",
+            "model.language_model.layers.{bid}.post_mlp_layernorm",  # glm-4-thinking
         ),

         MODEL_TENSOR.FFN_GATE_INP: (
@@ -362,6 +372,7 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.c_fc_1",                      # exaone
             "model.layers.{bid}.feed_forward.up_proj",             # llama4 jamba granite-hybrid
             "transformer_encoder.{bid}.ffn.w12",                   # neobert
+            "model.language_model.layers.{bid}.mlp.gate_up_proj",  # glm-4-thinking
         ),

         MODEL_TENSOR.FFN_UP_EXP: (
@@ -448,6 +459,7 @@ class TensorNameMap:
             "model.layers.h.{bid}.mlp.c_proj",                  # exaone
             "model.layers.{bid}.feed_forward.down_proj",        # llama4 jamba granite-hybrid
             "transformer_encoder.{bid}.ffn.w3",                 # neobert
+            "model.language_model.layers.{bid}.mlp.down_proj",  # glm-4-thinking
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (