@@ -184,6 +184,8 @@ def from_model_architecture(model_architecture):
             return MixtralModel
         if model_architecture == "PhiForCausalLM":
            return Phi2Model
+        if model_architecture == "PlamoForCausalLM":
+            return PlamoModel
         return Model

     def _is_model_safetensors(self) -> bool:
@@ -225,6 +227,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH:
             return gguf.MODEL_ARCH.LLAMA
         if arch == "PhiForCausalLM":
             return gguf.MODEL_ARCH.PHI2
+        if arch == "PlamoForCausalLM":
+            return gguf.MODEL_ARCH.PLAMO

         raise NotImplementedError(f'Architecture "{arch}" not supported!')

@@ -1002,11 +1006,91 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_add_bos_token(False)


+class PlamoModel(Model):
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+
+        self.gguf_writer.add_name("PLaMo")
+        self.gguf_writer.add_context_length(4096)  # not in config.json
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(5)  # hparams["num_key_value_heads"] is wrong
+        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
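+    # Regroups the rows of the Q projection: 5120 rows = 40 query heads x 128
+    # dims, viewed as an (8, 5) grid of heads and transposed to (5, 8) so the
+    # head order lines up with the 5 KV heads (see "shuffle for broadcasting
+    # of gqa in ggml_mul_mat" in write_tensors below).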
+    def shuffle_attn_q_weight(self, data_torch):
+        assert data_torch.size() == (5120, 5120)
+        data_torch = data_torch.reshape(8, 5, 128, 5120)
+        data_torch = torch.permute(data_torch, (1, 0, 2, 3))
+        data_torch = torch.reshape(data_torch, (5120, 5120))
+        return data_torch
+
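+    # Applies the matching regrouping to the columns of the output projection,
+    # which consume the concatenated head outputs, so they stay consistent
+    # with the reordered Q heads.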
+    def shuffle_attn_output_weight(self, data_torch):
+        assert data_torch.size() == (5120, 5120)
+        data_torch = data_torch.reshape(5120, 8, 5, 128)
+        data_torch = torch.permute(data_torch, (0, 2, 1, 3))
+        data_torch = torch.reshape(data_torch, (5120, 5120))
+        return data_torch
+
+    def write_tensors(self):
+        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+
+        for name, data_torch in self.get_tensors():
+            if "self_attn.rotary_emb.inv_freq" in name:
+                continue
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            # shuffle for broadcasting of gqa in ggml_mul_mat
+            if new_name.endswith("attn_q.weight"):
+                data_torch = self.shuffle_attn_q_weight(data_torch)
+            elif new_name.endswith("attn_output.weight"):
+                data_torch = self.shuffle_attn_output_weight(data_torch)
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+
 ###### CONVERSION LOGIC ######


 def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file")
+    parser = argparse.ArgumentParser(
+        description="Convert a huggingface model to a GGML compatible file")
     parser.add_argument(
         "--vocab-only", action="store_true",
         help="extract only the vocab",
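
Not part of the patch: a minimal standalone sketch (assuming only torch is installed; the helper shuffle_q and the tagged dummy weight are made up here for illustration) that runs the same reshape/permute as PlamoModel.shuffle_attn_q_weight on a weight whose rows carry their query-head index, to show how the 40 heads get regrouped:

import torch

head_dim, n_head, hidden = 128, 40, 5120  # 40 query heads x 128 dims = 5120 rows

def shuffle_q(w):
    # same operations as PlamoModel.shuffle_attn_q_weight above
    w = w.reshape(8, 5, head_dim, hidden)
    w = torch.permute(w, (1, 0, 2, 3))
    return torch.reshape(w, (hidden, hidden))

# dummy Q weight: every row holds the index of the query head it belongs to
tags = torch.arange(n_head, dtype=torch.float32).repeat_interleave(head_dim)
w = tags.unsqueeze(1).repeat(1, hidden)  # shape (5120, 5120)

order = shuffle_q(w)[::head_dim, 0].long().tolist()
print(order)  # [0, 5, 10, ..., 35, 1, 6, ...] -- query head h ends up in group h % 5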