@@ -366,16 +366,19 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
             added_tokens = {}
 
         vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
-        if expected_ids != actual_ids:
-            raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")
 
-        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_list = [text for (text, idx) in items]
-        self.vocab_size_base: int = vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
+        new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
+        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
+        actual_new_ids = sorted(new_tokens.keys())
+
+        if expected_new_ids != actual_new_ids:
+            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
+
+        # Token pieces that were added to the base vocabulary.
+        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
+        self.vocab_size_base = vocab_size
+        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
         self.fname_added_tokens = fname_added_tokens
 
     def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
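With this change, entries in added_tokens.json whose IDs fall inside the base SentencePiece vocabulary are simply ignored rather than tripping the sequential-ID check; only IDs at or above `vocab_size` count as new tokens. A minimal standalone sketch of that behaviour, using made-up example values for `vocab_size` and `added_tokens` (in convert.py these come from the SentencePiece model and added_tokens.json):

```python
# Sketch only: vocab_size and added_tokens below are invented example values.
vocab_size = 32000
added_tokens = {
    "<s>":          1,      # already present in the base vocab -> ignored
    "</s>":         2,      # already present in the base vocab -> ignored
    "<|im_start|>": 32000,  # genuinely new token
    "<|im_end|>":   32001,  # genuinely new token
}

# Keep only the entries whose IDs extend the base vocabulary.
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
actual_new_ids = sorted(new_tokens.keys())

if expected_new_ids != actual_new_ids:
    raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")

added_tokens_list = [new_tokens[id] for id in actual_new_ids]
print(added_tokens_list)  # ['<|im_start|>', '<|im_end|>']
```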