@@ -2768,80 +2768,6 @@ def write_tensors(self):
2768
2768
if len (experts ) > 0 :
2769
2769
raise ValueError (f"Unprocessed experts: { experts } " )
2770
2770
2771
@Model.register("JAISLMHeadModel")
class JaisModel(Model):
    """Convert JAIS (GPT-2-like with SwiGLU + ALiBi) checkpoints to GGUF."""

    model_arch = gguf.MODEL_ARCH.JAIS

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        hp = self.hparams
        # Only the SwiGLU-activation, ALiBi-position flavour of JAIS is handled.
        assert hp["activation_function"] == "swiglu"
        assert hp["position_embedding_type"] == "alibi"

        # Scale applied to token embeddings; for muP flavours the output head
        # is tied to (same as) wte in the original model.
        self.output_is_wte = False
        self.embeddings_scale = 1.0
        if 'mup_embeddings_scale' in hp:
            self.output_is_wte = True  # Hack (?)
            self.embeddings_scale = hp['mup_embeddings_scale']
        elif 'embeddings_scale' in hp:
            self.embeddings_scale = hp['embeddings_scale']
        else:
            assert False

        # Scale applied to the output projection.
        self.width_scale = 1.0
        if 'mup_output_alpha' in hp:
            assert 'mup_width_scale' in hp
            self.width_scale = hp['mup_output_alpha'] * hp['mup_width_scale']
        elif 'width_scale' in hp:
            self.width_scale = hp['width_scale']
        else:
            assert False

    def set_vocab(self):
        # JAIS ships a GPT-2 style BPE vocabulary.
        self._set_vocab_gpt2()

    def set_gguf_parameters(self):
        hp = self.hparams
        writer = self.gguf_writer
        writer.add_name(self.dir_model.name)
        writer.add_block_count(hp["n_layer"])
        writer.add_context_length(hp["n_positions"])
        writer.add_embedding_length(hp["n_embd"])
        writer.add_feed_forward_length(hp["n_inner"])
        writer.add_head_count(hp["n_head"])
        writer.add_layer_norm_eps(hp["layer_norm_epsilon"])
        writer.add_file_type(self.ftype)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused

        # Buffers that GGUF does not need — drop them.
        if name.endswith((".attn.bias", "relative_pe.slopes")):
            return []

        # Conv1D-style weights are stored transposed relative to GGUF layout.
        if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
            data_torch = data_torch.transpose(1, 0)

        new_name = self.map_tensor_name(name)

        out: list[tuple[str, Tensor]] = []
        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
            out.append((new_name, data_torch * self.embeddings_scale))
            if self.output_is_wte:
                # Tied output head: derive it from wte with the width scale.
                out.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
        elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
            assert not self.output_is_wte
            out.append((new_name, data_torch * self.width_scale))
        else:
            out.append((new_name, data_torch))

        return out
2845
2771
@Model .register ("T5ForConditionalGeneration" )
2846
2772
@Model .register ("T5WithLMHeadModel" )
2847
2773
class T5Model (Model ):
@@ -2959,6 +2885,78 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
2959
2885
2960
2886
return [(self .map_tensor_name (name ), data_torch )]
2961
2887
2888
@Model.register("JAISLMHeadModel")
class JaisModel(Model):
    """Converter for JAIS models (GPT-2-style architecture with SwiGLU
    activation and ALiBi position embeddings).

    Reads scaling hyper-parameters (plain or muP variants) at construction
    time and applies them to the token-embedding / output tensors during
    conversion.
    """

    model_arch = gguf.MODEL_ARCH.JAIS

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Only the SwiGLU + ALiBi flavour of JAIS is supported.
        # Raise instead of `assert` so validation is not stripped under -O.
        if self.hparams["activation_function"] != "swiglu":
            raise ValueError("JAIS conversion requires activation_function == 'swiglu'")
        if self.hparams["position_embedding_type"] != "alibi":
            raise ValueError("JAIS conversion requires position_embedding_type == 'alibi'")

        # Embeddings scale
        self.embeddings_scale = 1.0
        # note: For some JAIS flavors, output is tied to (same as) wte in original model
        self.output_is_wte = False
        if 'mup_embeddings_scale' in self.hparams:
            self.output_is_wte = True  # Hack (?)
            self.embeddings_scale = self.hparams['mup_embeddings_scale']
        elif 'embeddings_scale' in self.hparams:
            self.embeddings_scale = self.hparams['embeddings_scale']
        else:
            raise KeyError("expected 'mup_embeddings_scale' or 'embeddings_scale' in hparams")

        self.width_scale = 1.0
        if 'mup_output_alpha' in self.hparams:
            if 'mup_width_scale' not in self.hparams:
                raise KeyError("'mup_output_alpha' requires 'mup_width_scale' in hparams")
            self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
        elif 'width_scale' in self.hparams:
            self.width_scale = self.hparams['width_scale']
        else:
            raise KeyError("expected 'mup_output_alpha' or 'width_scale' in hparams")

    def set_vocab(self):
        # JAIS uses a GPT-2 style BPE vocabulary.
        self._set_vocab_gpt2()

    def set_gguf_parameters(self):
        """Write the model hyper-parameters into the GGUF metadata."""
        self.gguf_writer.add_name(self.dir_model.name)
        self.gguf_writer.add_block_count(self.hparams["n_layer"])
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Map one HF tensor to zero or more (gguf_name, tensor) pairs,
        transposing Conv1D weights and applying the embedding/output scales."""
        del bid  # unused

        tensors: list[tuple[str, Tensor]] = []

        # we don't need these
        if name.endswith((".attn.bias", "relative_pe.slopes")):
            return tensors

        # Conv1D-style weights are stored transposed relative to GGUF layout.
        if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
            data_torch = data_torch.transpose(1, 0)

        new_name = self.map_tensor_name(name)

        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
            tensors.append((new_name, data_torch * self.embeddings_scale))
            if self.output_is_wte:
                # Tied output head: emit it from wte, scaled by width_scale.
                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
        elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
            # A separate output tensor must not coexist with a tied one.
            assert not self.output_is_wte
            tensors.append((new_name, data_torch * self.width_scale))
        else:
            tensors.append((new_name, data_torch))

        return tensors
+
2962
2960
2963
2961
###### CONVERSION LOGIC ######
2964
2962
0 commit comments