File tree (Expand / Collapse): 1 file changed, +9 −0 lines changed
lines changed Original file line number Diff line number Diff line change 1+ import contextlib
12import os
23import warnings
34from pathlib import Path
@@ -67,7 +68,15 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
6768 tokenizer .all_special_tokens_extended )
6869 tokenizer_all_special_tokens = set (tokenizer .all_special_tokens )
6970 tokenizer_len = len (tokenizer )
71+
7072 max_token_id = max (tokenizer .get_vocab ().values ())
73+ # Some tokenizers (e.g., QwenTokenizer) have special tokens that
74+ # are added and included in the implementation of the vocab_size
75+ # property, but not in get_vocab(); if there is an implementation
76+ # of vocab size, we should take the greater value.
77+ if hasattr (tokenizer , "vocab_size" ):
78+ with contextlib .suppress (NotImplementedError ):
79+ max_token_id = max (max_token_id , tokenizer .vocab_size )
7180
7281 class CachedTokenizer (tokenizer .__class__ ): # type: ignore
7382
You can’t perform that action at this time.
0 commit comments