@@ -538,36 +538,45 @@ def sample(
                 f"Only {len(requests)} requests sampled from sharegpt dataset, {num_requests} requests are needed"
             )
         else:
-            def gen_inner_sequence(input_len, idx_offset, random_offset, vocab_size):
+
+            def gen_inner_sequence(input_len, idx_offset, random_offset,
+                                   vocab_size):
                 return ((random_offset + idx_offset + np.arange(input_len)) %
-                    vocab_size).tolist()
+                        vocab_size).tolist()
 
             for i in range(num_requests):
-                inner_seq = gen_inner_sequence(input_lens[i], i, offsets[i], vocab_size)
+                inner_seq = gen_inner_sequence(input_lens[i], i, offsets[i],
+                                               vocab_size)
                 token_ids = prefix_token_ids + inner_seq
                 total_input_len_expected = prefix_len + int(input_lens[i])
-
+
                 # Here we have to re-tokenize and decode the prompt. Because the token_ids
-                # generated randomly can not guarantee a same token_id sequence after
-                # decode and re-tokenize, and it will get a longer sequence length in most cases.
-                # Take Qwen2TokenizerFast as an example:
+                # generated randomly can not guarantee a same token_id sequence after
+                # decode and re-tokenize, and it will get a longer sequence length in most cases.
+                # Take Qwen2TokenizerFast as an example:
                 # [43576] --decode-> 'Ġaqui' --tokenize-> [43576]
                 # [43577] --decode-> 'swagen' --tokenize-> [43577]
-                # [43576, 43577] --decode-> 'Ġaquiswagen'
+                # [43576, 43577] --decode-> 'Ġaquiswagen'
                 #                --tokenize-> [264, 9202, 86, 8535] # seqlen changes
                 prompt = tokenizer.decode(token_ids, skip_special_tokens=True)
-                re_encoded_token_ids = tokenizer.encode(prompt, add_special_tokens=False)
+                re_encoded_token_ids = tokenizer.encode(
+                    prompt, add_special_tokens=False)
                 while len(re_encoded_token_ids) < total_input_len_expected:
                     # Append a new random sequence to the existing sequence
                     new_random_offset = np.random.randint(0, vocab_size)
-                    new_inner_seq = gen_inner_sequence(input_lens[i], i, new_random_offset, vocab_size)
+                    new_inner_seq = gen_inner_sequence(input_lens[i], i,
+                                                       new_random_offset,
+                                                       vocab_size)
                     re_encoded_token_ids += new_inner_seq
                     # Re-encode the prompt
-                    new_prompt = tokenizer.decode(re_encoded_token_ids, skip_special_tokens=True)
-                    re_encoded_token_ids = tokenizer.encode(new_prompt, add_special_tokens=False)
+                    new_prompt = tokenizer.decode(re_encoded_token_ids,
+                                                  skip_special_tokens=True)
+                    re_encoded_token_ids = tokenizer.encode(
+                        new_prompt, add_special_tokens=False)
 
                 # Cut if the sequence is longer than the expected length
-                re_encoded_token_ids = re_encoded_token_ids[:total_input_len_expected]
+                re_encoded_token_ids = re_encoded_token_ids[:
+                                                            total_input_len_expected]
 
                 result_prompt = re_encoded_token_ids
                 if self.return_text:
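Note on the decode/re-tokenize drift: the comment in the hunk above states that concatenated random token ids need not survive a decode/encode round trip. A minimal sketch to reproduce this, assuming a checkpoint that ships Qwen2TokenizerFast ("Qwen/Qwen2-7B" is an illustrative choice; the diff itself names no checkpoint):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B")  # assumed checkpoint

for ids in ([43576], [43577], [43576, 43577]):
    text = tok.decode(ids, skip_special_tokens=True)
    round_trip = tok.encode(text, add_special_tokens=False)
    # Per the comment in the diff, [43576] and [43577] each round-trip
    # unchanged, while [43576, 43577] re-tokenizes to [264, 9202, 86, 8535],
    # so the sequence length is not preserved. That is why the sampler pads
    # and then truncates to total_input_len_expected.
    print(ids, "->", repr(text), "->", round_trip)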
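Taken as a whole, the hunk implements a pad-then-truncate loop: build a deterministic ramp of token ids, decode and re-encode it, append further random ramps while the re-encoded sequence is too short, then cut to the expected length. Below is a self-contained sketch of that logic under stated assumptions: make_prompt_token_ids, target_len, and the parameter list are illustrative names rather than symbols from the diff, and any Hugging Face tokenizer can stand in for the benchmark's own.

import numpy as np
from transformers import PreTrainedTokenizerBase


def make_prompt_token_ids(tokenizer: PreTrainedTokenizerBase, input_len: int,
                          idx_offset: int, vocab_size: int,
                          prefix_token_ids: list) -> list:

    def gen_inner_sequence(random_offset):
        # Deterministic ramp over the vocabulary, shifted by the request
        # index and a random offset, mirroring the helper in the diff.
        return ((random_offset + idx_offset + np.arange(input_len)) %
                vocab_size).tolist()

    target_len = len(prefix_token_ids) + input_len
    token_ids = prefix_token_ids + gen_inner_sequence(
        np.random.randint(0, vocab_size))
    # Decode and re-encode once; the result may be shorter or longer than
    # target_len because token boundaries can merge or split.
    prompt = tokenizer.decode(token_ids, skip_special_tokens=True)
    token_ids = tokenizer.encode(prompt, add_special_tokens=False)
    while len(token_ids) < target_len:
        # Too short: append another random ramp and round-trip again.
        token_ids += gen_inner_sequence(np.random.randint(0, vocab_size))
        prompt = tokenizer.decode(token_ids, skip_special_tokens=True)
        token_ids = tokenizer.encode(prompt, add_special_tokens=False)
    # Too long (the common case, per the comment): truncate exactly.
    return token_ids[:target_len]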