 sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf

-from convert import HfVocab
+from convert import LlamaHfVocab


 ###### MODEL DEFINITIONS ######
@@ -230,7 +230,7 @@ def _get_part_names(self):
     def _set_vocab_gpt2(self):
         dir_model = self.dir_model
         hparams = self.hparams
-        tokens: list[bytearray] = []
+        tokens: list[str] = []
         toktypes: list[int] = []

         from transformers import AutoTokenizer
@@ -243,8 +243,7 @@ def _set_vocab_gpt2(self):

         for i in range(vocab_size):
             if i not in reverse_vocab:
-                pad_token = f"[PAD{i}]".encode('utf-8')
-                tokens.append(bytearray(pad_token))
+                tokens.append(f"[PAD{i}]")
                 toktypes.append(gguf.TokenType.USER_DEFINED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
@@ -266,7 +265,7 @@ def _set_vocab_gpt2(self):
     def _set_vocab_qwen(self):
         dir_model = self.dir_model
         hparams = self.hparams
-        tokens: list[bytearray] = []
+        tokens: list[str] = []
         toktypes: list[int] = []

         from transformers import AutoTokenizer
@@ -291,8 +290,7 @@ def _set_vocab_qwen(self):

         for i in range(vocab_size):
             if i not in reverse_vocab:
-                pad_token = f"[PAD{i}]".encode("utf-8")
-                tokens.append(bytearray(pad_token))
+                tokens.append(f"[PAD{i}]")
                 toktypes.append(gguf.TokenType.USER_DEFINED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
@@ -372,12 +370,8 @@ def _set_vocab_sentencepiece(self):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)

-    def _set_vocab_hf(self):
-        path = self.dir_model
-        added_tokens_path = self.dir_model
-        vocab = HfVocab(
-            path, added_tokens_path if added_tokens_path.exists() else None
-        )
+    def _set_vocab_llama_hf(self):
+        vocab = LlamaHfVocab(self.dir_model)
         tokens = []
         scores = []
         toktypes = []
@@ -1099,7 +1093,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_file_type(self.ftype)

     def set_vocab(self):
-        self._set_vocab_hf()
+        self._set_vocab_llama_hf()

     def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
         if n_kv_head is not None and n_head != n_kv_head:
@@ -1700,11 +1694,8 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_pooling_type(pooling_type)

     def set_vocab(self):
-        path = self.dir_model
-        added_tokens_path = self.dir_model if self.dir_model.exists() else None
-
         # use huggingface vocab to get all tokens
-        vocab = HfVocab(path, added_tokens_path)
+        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
         tokens, scores, toktypes = zip(*vocab.all_tokens())
         assert len(tokens) == vocab.vocab_size
         self.vocab_size = vocab.vocab_size