 from transformers import AutoTokenizer
 
 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+
 def bytes_to_unicode():
     """
     Returns list of utf-8 byte and a corresponding list of unicode strings.
@@ -34,6 +36,7 @@ def bytes_to_unicode():
     cs = [chr(n) for n in cs]
     return dict(zip(bs, cs))
 
+
 def count_model_parts(dir_model: str) -> int:
     num_parts = 0
     for filename in os.listdir(dir_model):
@@ -44,6 +47,7 @@ def count_model_parts(dir_model: str) -> int:
     print("gguf: found " + str(num_parts) + " model parts")
     return num_parts
 
+
 if len(sys.argv) < 3:
     print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
     print(" ftype == 0 -> float32")
@@ -58,7 +62,7 @@ def count_model_parts(dir_model: str) -> int:
 # possible tensor data types
 # ftype == 0 -> float32
 # ftype == 1 -> float16
-#
+
 # map from ftype to string
 ftype_str = ["f32", "f16"]
 
@@ -67,6 +71,7 @@ def count_model_parts(dir_model: str) -> int:
     ftype = int(sys.argv[2])
     if ftype < 0 or ftype > 1:
         print("Invalid ftype: " + str(ftype))
+
         sys.exit(1)
 
 fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
@@ -77,30 +82,30 @@ def count_model_parts(dir_model: str) -> int:
     hparams = json.load(f)
 
 if hparams["architectures"][0] != "GPTNeoXForCausalLM":
-    print("Model architecture not supported: " + hparams["architectures"][0] )
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
     sys.exit()
 
 # get number of model parts
 num_parts = count_model_parts(dir_model)
 
-gguf_writer = gguf.GGUFWriter.open(fname_out)
+llm_arch = "gptneox"
+gguf_writer = gguf.GGUFWriter(fname_out, arch=llm_arch)
 
 print("gguf: get model metadata")
 
-llm_arch = "gptneox"
 block_count = hparams["num_hidden_layers"]
 
-gguf_writer.add_architecture(llm_arch)
+gguf_writer.add_architecture()
 gguf_writer.add_name(last_dir)
-gguf_writer.add_file_type("All tensors F32" if ftype == 0 else "Most tensors F16, some F32")
-gguf_writer.add_context_length(llm_arch, hparams["max_position_embeddings"])
-gguf_writer.add_embedding_length(llm_arch, hparams["hidden_size"])
-gguf_writer.add_block_count(llm_arch, block_count)
-gguf_writer.add_feed_forward_length(llm_arch, hparams["intermediate_size"])
-gguf_writer.add_rope_dimension_count(llm_arch, int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
-gguf_writer.add_head_count(llm_arch, hparams["num_attention_heads"])
-gguf_writer.add_parallel_residual(llm_arch, hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
-gguf_writer.add_layer_norm_eps(llm_arch, hparams["layer_norm_eps"])
+gguf_writer.add_context_length(hparams["max_position_embeddings"])
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
+gguf_writer.add_head_count(hparams["num_attention_heads"])
+gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
+gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
 
 # TOKENIZATION
 
@@ -124,14 +129,14 @@ def count_model_parts(dir_model: str) -> int:
 
     print("gguf: get gpt2 tokenizer vocab")
 
-    vocab_size = len( tokenizer_json["model"]["vocab"] )
+    vocab_size = len(tokenizer_json["model"]["vocab"])
 
     # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
     tokenizer = AutoTokenizer.from_pretrained(dir_model)
 
     reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
     byte_encoder = bytes_to_unicode()
-    byte_decoder = {v:k for k, v in byte_encoder.items()}
+    byte_decoder = {v: k for k, v in byte_encoder.items()}
 
     for i in range(vocab_size):
         if i in reverse_vocab:
@@ -146,8 +151,9 @@ def count_model_parts(dir_model: str) -> int:
                         text.extend(c.encode('utf-8'))
         else:
             print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
-            padding_token = f"[PAD{i}]".encode("utf8")
-            text = bytearray(padding_token)
+            pad_token = f"[PAD{i}]".encode("utf8")
+            text = bytearray(pad_token)
+
         tokens.append(text)
 
     gguf_writer.add_token_list(tokens)
@@ -201,7 +207,7 @@ def count_model_parts(dir_model: str) -> int:
     )
 
 for part_name in part_names:
-    print("gguf: loading model part '" + part_name + "'" )
+    print("gguf: loading model part '" + part_name + "'")
     model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
 
     for name in model_part.keys():
@@ -223,11 +229,12 @@ def count_model_parts(dir_model: str) -> int:
         elif name.endswith(".bias") and name[:-5] in tensor_map:
             name = tensor_map[name[:-5]] + ".bias"
         else:
-            print( "Can not map tensor '" + name + "'" )
+            print("Can not map tensor '" + name + "'")
             sys.exit()
 
         n_dims = len(data.shape)
         data_dtype = data.dtype
+        old_dtype = data_dtype
 
         # if f32 desired, convert any float16 to float32
         if ftype == 0 and data.dtype == np.float16:
@@ -241,77 +248,21 @@ def count_model_parts(dir_model: str) -> int:
         if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
             data_dtype = np.float16
 
-        data_nbytes = data.size * 2 if data_dtype == np.float16 else data.size * 4
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data_dtype))
+
+        data = data.astype(data_dtype)
 
-        gguf_writer.add_tensor_info(name, data.shape, data_dtype, data_nbytes)
+        gguf_writer.add_tensor(name, data)
 
 
 print("gguf: write header")
 gguf_writer.write_header_to_file()
 print("gguf: write metadata")
 gguf_writer.write_kv_data_to_file()
-print("gguf: write tensor metadata")
-gguf_writer.write_ti_data_to_file()
-
-# tensor data
-print("gguf: convert and write tensor data")
-
-if num_parts == 0:
-    part_names = ("pytorch_model.bin",)
-else:
-    part_names = (
-        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
-    )
-
-for part_name in part_names:
-    print("gguf: loading model part '" + part_name + "'")
-    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
-
-    for name in model_part.keys():
-        data = model_part[name]
-
-        old_dtype = data.dtype
-
-        # we don't need these
-        if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
-            continue
-
-        # convert any unsupported data types to float32
-        if data.dtype != torch.float16 and data.dtype != torch.float32:
-            data = data.to(torch.float32)
-
-        data = data.squeeze().numpy()
-
-        # map tensor names
-        if name.endswith(".weight") and name[:-7] in tensor_map:
-            name = tensor_map[name[:-7]] + ".weight"
-        elif name.endswith(".bias") and name[:-5] in tensor_map:
-            name = tensor_map[name[:-5]] + ".bias"
-        else:
-            print( "Can not map tensor '" + name + "'" )
-            sys.exit()
-
-        n_dims = len(data.shape)
-        data_dtype = data.dtype
-
-        # if f32 desired, convert any float16 to float32
-        if ftype == 0 and data.dtype == np.float16:
-            data = data.astype(np.float32)
-
-        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-            data = data.astype(np.float32)
-
-        # if f16 desired, convert any float32 2-dim weight tensors to float16
-        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-            data = data.astype(np.float16)
-
-        print(name + ", shape " + str(len(data.shape)) + ", " + str(old_dtype) + " --> " + str(data.dtype))
-
-        gguf_writer.write_tensor_to_file(data)
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
 
 gguf_writer.close()
 
-
-print("gguf: model successfully exported to '" + fname_out + "'" )
+print("gguf: model successfully exported to '" + fname_out + "'")
 print("")