diff --git a/tensorrt_llm/serve/scripts/benchmark_dataset.py b/tensorrt_llm/serve/scripts/benchmark_dataset.py
index 02000cddbaa..0293aa2e105 100644
--- a/tensorrt_llm/serve/scripts/benchmark_dataset.py
+++ b/tensorrt_llm/serve/scripts/benchmark_dataset.py
@@ -538,17 +538,56 @@ def sample(
                     f"Only {len(requests)} requests sampled from sharegpt dataset, {num_requests} requests are needed"
                 )
         else:
+
+            def gen_inner_sequence(input_len, idx_offset, random_offset,
+                                   vocab_size):
+                return ((random_offset + idx_offset + np.arange(input_len)) %
+                        vocab_size).tolist()
+
             for i in range(num_requests):
-                inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) %
-                             vocab_size).tolist()
-                prompt = prefix_token_ids + inner_seq
+                inner_seq = gen_inner_sequence(input_lens[i], i, offsets[i],
+                                               vocab_size)
+                token_ids = prefix_token_ids + inner_seq
+                total_input_len_expected = prefix_len + int(input_lens[i])
+
+                # Here we have to decode and then re-tokenize the prompt, because
+                # randomly generated token_ids are not guaranteed to survive a
+                # decode/re-tokenize round trip, and the re-tokenized sequence
+                # usually gets longer. Take Qwen2TokenizerFast as an example:
+                # [43576] --decode-> 'Ġaqui' --tokenize-> [43576]
+                # [43577] --decode-> 'swagen' --tokenize-> [43577]
+                # [43576, 43577] --decode-> 'Ġaquiswagen'
+                #   --tokenize-> [264, 9202, 86, 8535]  # seqlen changes
+                prompt = tokenizer.decode(token_ids, skip_special_tokens=True)
+                re_encoded_token_ids = tokenizer.encode(
+                    prompt, add_special_tokens=False)
+                while len(re_encoded_token_ids) < total_input_len_expected:
+                    # Append a new random sequence to the existing sequence
+                    new_random_offset = int(
+                        torch.randint(0, vocab_size, (1, ),
+                                      generator=self.rng).item())
+                    new_inner_seq = gen_inner_sequence(input_lens[i], i,
+                                                       new_random_offset,
+                                                       vocab_size)
+                    re_encoded_token_ids += new_inner_seq
+                    # Decode and re-encode the extended sequence
+                    new_prompt = tokenizer.decode(re_encoded_token_ids,
+                                                  skip_special_tokens=True)
+                    re_encoded_token_ids = tokenizer.encode(
+                        new_prompt, add_special_tokens=False)
+
+                # Truncate if the sequence is longer than the expected length
+                re_encoded_token_ids = re_encoded_token_ids[:
+                                                            total_input_len_expected]
+
+                result_prompt = re_encoded_token_ids
                 if self.return_text:
-                    prompt = tokenizer.decode(prompt)
-                total_input_len = prefix_len + int(input_lens[i])
+                    result_prompt = tokenizer.decode(result_prompt)
+
                 requests.append(
                     SampleRequest(
-                        prompt=prompt,
-                        prompt_len=total_input_len,
+                        prompt=result_prompt,
+                        prompt_len=total_input_len_expected,
                         expected_output_len=int(output_lens[i]),
                     ))
         return requests
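
A minimal standalone sketch (not part of the patch) of the decode/re-encode drift that motivates this change; the checkpoint name is an assumption, and the token ids come from the example in the patch comment:

# Illustrative only: decoding random token ids and re-encoding the resulting
# text can change the sequence length, which is why the patch pads the
# re-encoded ids with fresh random sequences and truncates to the expected length.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")  # assumed checkpoint

token_ids = [43576, 43577]  # ids from the example in the patch comment
text = tokenizer.decode(token_ids, skip_special_tokens=True)
re_encoded = tokenizer.encode(text, add_special_tokens=False)
print(len(token_ids), len(re_encoded))  # lengths may differ after the round trip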