@@ -555,16 +555,16 @@ def gen_inner_sequence(input_len, idx_offset, random_offset, vocab_size):
555555 # [43577] --decode-> 'swagen' --tokenize-> [43577]
556556 # [43576, 43577] --decode-> 'Ġaquiswagen'
557557 # --tokenize-> [264, 9202, 86, 8535] # seqlen changes
558- prompt = tokenizer.decode(token_ids)
559- re_encoded_token_ids = tokenizer.encode(prompt)
558+ prompt = tokenizer.decode(token_ids, skip_special_tokens=True)
559+ re_encoded_token_ids = tokenizer.encode(prompt, add_special_tokens=False)
560560 while len(re_encoded_token_ids) < total_input_len_expected:
561561 # Append a new random sequence to the existing sequence
562562 new_random_offset = np.random.randint(0, vocab_size)
563563 new_inner_seq = gen_inner_sequence(input_lens[i], i, new_random_offset, vocab_size)
564564 re_encoded_token_ids += new_inner_seq
565565 # Re-encode the prompt
566- new_prompt = tokenizer.decode(re_encoded_token_ids)
567- re_encoded_token_ids = tokenizer.encode(new_prompt)
566+ new_prompt = tokenizer.decode(re_encoded_token_ids, skip_special_tokens=True)
567+ re_encoded_token_ids = tokenizer.encode(new_prompt, add_special_tokens=False)
568568
569569 # Cut if the sequence is longer than the expected length
570570 re_encoded_token_ids = re_encoded_token_ids[:total_input_len_expected]
0 commit comments