From 3d71b942c648c80b22f5326d38f81c09258cd6af Mon Sep 17 00:00:00 2001
From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Date: Wed, 25 Jun 2025 16:24:58 +0000
Subject: [PATCH 1/7] add seed

---
 vllm/benchmarks/datasets.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 3efbe5695711..9ce159a50c41 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -320,6 +320,8 @@ def __init__(
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
+        random.seed(self.random_seed)
+        np.random.seed(self.random_seed)
 
     def sample(
         self,
@@ -692,7 +694,8 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
             dataset_path=args.dataset_path).
         sample(tokenizer=tokenizer, num_requests=args.num_prompts),
         "random":
-        lambda: RandomDataset(dataset_path=args.dataset_path).sample(
+        lambda: RandomDataset(random_seed=args.seed,
+                              dataset_path=args.dataset_path).sample(
             tokenizer=tokenizer,
             num_requests=args.num_prompts,
             prefix_len=args.random_prefix_len,

From e1c96846b3d8bd0d272a660498c103305b917e70 Mon Sep 17 00:00:00 2001
From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Date: Wed, 25 Jun 2025 16:25:09 +0000
Subject: [PATCH 2/7] add backend

---
 vllm/benchmarks/serve.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 302f655f424a..419284cca042 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -631,6 +631,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
         help="The label (prefix) of the benchmark results. If not specified, "
         "the endpoint type will be used as the label.",
     )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="vllm",
+        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+    )
     parser.add_argument(
         "--base-url",
         type=str,
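Note: PATCH 1/7 seeds both Python's random module and NumPy's global RNG in
RandomDataset.__init__ and threads args.seed through get_samples, so two
benchmark runs with the same seed draw identical request shapes. A standalone
sketch of the idea (the class below is illustrative, not the real
RandomDataset):

    import random

    import numpy as np

    class SeededDataset:
        def __init__(self, random_seed: int = 0) -> None:
            # Seed both RNGs: the sampler below uses np.random, but other
            # helpers in the module rely on the stdlib random module.
            self.random_seed = random_seed
            random.seed(self.random_seed)
            np.random.seed(self.random_seed)

        def sample(self, num_requests: int) -> list[int]:
            # Stand-in for the real sampler: per-request input lengths.
            return np.random.randint(1, 100, size=num_requests).tolist()

    # Same seed, same lengths -- runs become comparable.
    assert SeededDataset(42).sample(8) == SeededDataset(42).sample(8)

PATCH 2/7 registers --backend with choices=list(ASYNC_REQUEST_FUNCS.keys()),
so argparse rejects an unsupported backend at startup instead of letting the
run fail later.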
From c56c2afe4c50bd6b958c3b65357d5bc8dcc0a74a Mon Sep 17 00:00:00 2001
From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Date: Wed, 25 Jun 2025 16:34:41 +0000
Subject: [PATCH 3/7] fix prefix len and input len

---
 vllm/benchmarks/datasets.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 9ce159a50c41..7be3fe63b556 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -378,10 +378,10 @@ def sample(
             # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
             # To avoid uncontrolled change of the prompt length,
             # the encoded sequence is truncated before being decode again.
+            total_input_len = prefix_len + int(input_lens[i])
             re_encoded_sequence = tokenizer.encode(
-                prompt, add_special_tokens=False)[:input_lens[i]]
+                prompt, add_special_tokens=False)[:total_input_len]
             prompt = tokenizer.decode(re_encoded_sequence)
-            total_input_len = prefix_len + int(input_lens[i])
             requests.append(
                 SampleRequest(
                     prompt=prompt,

From 864aeebcf048fa0d956ff4d78e14046e6752b365 Mon Sep 17 00:00:00 2001
From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Date: Wed, 25 Jun 2025 16:36:10 +0000
Subject: [PATCH 4/7] fix prefix len and input len

---
 benchmarks/benchmark_dataset.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 8671719bce72..39d580ee7a9e 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -349,11 +349,10 @@ def sample(
             # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
             # To avoid uncontrolled change of the prompt length,
             # the encoded sequence is truncated before being decode again.
-            re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
-                : input_lens[i]
-            ]
+            total_input_len = prefix_len + int(input_lens[i])
+            re_encoded_sequence = tokenizer.encode(
+                prompt, add_special_tokens=False)[:total_input_len]
             prompt = tokenizer.decode(re_encoded_sequence)
-            total_input_len = len(re_encoded_sequence)
             requests.append(
                 SampleRequest(
                     prompt=prompt,
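Note: PATCHES 3/7 and 4/7 fix the same truncation bug in the two copies of
this sampler: the prompt holds prefix_len + input_lens[i] tokens, but the
re-encoded sequence was cut to input_lens[i] alone, silently dropping the
prefix from every generated prompt. A standalone sketch of before vs. after
(plain integer lists stand in for tokenizer output):

    prefix_len, input_len = 4, 6
    token_ids = list(range(prefix_len + input_len))  # re-encoded prompt ids

    # Before: the slice ignores the prefix, so 4 prefix tokens are lost.
    assert len(token_ids[:input_len]) == 6

    # After: the budget covers prefix + input, keeping all 10 tokens.
    total_input_len = prefix_len + input_len
    assert len(token_ids[:total_input_len]) == 10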
args.method == "ngram": speculative_config = { @@ -74,7 +82,7 @@ def main(): "num_speculative_tokens": args.num_spec_tokens, "prompt_lookup_max": args.prompt_lookup_max, "prompt_lookup_min": args.prompt_lookup_min, - "max_model_len": max_model_len, + "max_model_len": args.max_model_len, } else: raise ValueError(f"unknown method: {args.method}") @@ -86,7 +94,7 @@ def main(): enable_chunked_prefill=args.enable_chunked_prefill, max_num_batched_tokens=args.max_num_batched_tokens, enforce_eager=args.enforce_eager, - max_model_len=max_model_len, + max_model_len=args.max_model_len, max_num_seqs=args.max_num_seqs, gpu_memory_utilization=0.8, speculative_config=speculative_config, From 92ead112ac5a01e1971d66ff14534ab438cc014e Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Wed, 25 Jun 2025 17:00:40 +0000 Subject: [PATCH 6/7] lint --- benchmarks/benchmark_dataset.py | 5 +++-- examples/offline_inference/spec_decode.py | 12 +++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 39d580ee7a9e..8774a50f6479 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -350,8 +350,9 @@ def sample( # To avoid uncontrolled change of the prompt length, # the encoded sequence is truncated before being decode again. total_input_len = prefix_len + int(input_lens[i]) - re_encoded_sequence = tokenizer.encode( - prompt, add_special_tokens=False)[:total_input_len] + re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[ + :total_input_len + ] prompt = tokenizer.decode(re_encoded_sequence) requests.append( SampleRequest( diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index 800e9a1f6f83..6fa68d2ecee1 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -49,7 +49,7 @@ def main(): args = parse_args() args.endpoint_type = "openai-chat" - model_dir = args.model_dir + model_dir = args.model_dir if args.model_dir is None: model_dir = "meta-llama/Llama-3.1-8B-Instruct" tokenizer = AutoTokenizer.from_pretrained(model_dir) @@ -62,13 +62,11 @@ def main(): if args.method == "eagle" or args.method == "eagle3": eagle_dir = args.eagle_dir - if args.method == "eagle": - if eagle_dir is None: - eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" + if args.method == "eagle" and eagle_dir is None: + eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" - elif args.method == "eagle3": - if eagle_dir is None: - eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" + elif args.method == "eagle3" and eagle_dir is None: + eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B" speculative_config = { "method": args.method, "model": eagle_dir, From 227e90aa53062af7d35f341ecdcf96fc6f54bbc8 Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Wed, 25 Jun 2025 17:04:22 +0000 Subject: [PATCH 7/7] fix gemini suggestion --- benchmarks/benchmark_dataset.py | 1 + vllm/benchmarks/datasets.py | 1 + 2 files changed, 2 insertions(+) diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 8774a50f6479..55c0cf851264 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -354,6 +354,7 @@ def sample( :total_input_len ] prompt = tokenizer.decode(re_encoded_sequence) + total_input_len = len(re_encoded_sequence) requests.append( SampleRequest( prompt=prompt, diff --git 
From 92ead112ac5a01e1971d66ff14534ab438cc014e Mon Sep 17 00:00:00 2001
From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Date: Wed, 25 Jun 2025 17:00:40 +0000
Subject: [PATCH 6/7] lint

---
 benchmarks/benchmark_dataset.py           |  5 +++--
 examples/offline_inference/spec_decode.py | 12 +++++-------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 39d580ee7a9e..8774a50f6479 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -350,8 +350,9 @@ def sample(
             # To avoid uncontrolled change of the prompt length,
             # the encoded sequence is truncated before being decode again.
             total_input_len = prefix_len + int(input_lens[i])
-            re_encoded_sequence = tokenizer.encode(
-                prompt, add_special_tokens=False)[:total_input_len]
+            re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
+                :total_input_len
+            ]
             prompt = tokenizer.decode(re_encoded_sequence)
             requests.append(
                 SampleRequest(
diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py
index 800e9a1f6f83..6fa68d2ecee1 100644
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@@ -49,7 +49,7 @@ def main():
     args = parse_args()
     args.endpoint_type = "openai-chat"
 
-    model_dir = args.model_dir 
+    model_dir = args.model_dir
     if args.model_dir is None:
         model_dir = "meta-llama/Llama-3.1-8B-Instruct"
     tokenizer = AutoTokenizer.from_pretrained(model_dir)
@@ -62,13 +62,11 @@ def main():
 
     if args.method == "eagle" or args.method == "eagle3":
         eagle_dir = args.eagle_dir
-        if args.method == "eagle":
-            if eagle_dir is None:
-                eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
+        if args.method == "eagle" and eagle_dir is None:
+            eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
 
-        elif args.method == "eagle3":
-            if eagle_dir is None:
-                eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
+        elif args.method == "eagle3" and eagle_dir is None:
+            eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
         speculative_config = {
             "method": args.method,
             "model": eagle_dir,

From 227e90aa53062af7d35f341ecdcf96fc6f54bbc8 Mon Sep 17 00:00:00 2001
From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com>
Date: Wed, 25 Jun 2025 17:04:22 +0000
Subject: [PATCH 7/7] fix gemini suggestion

---
 benchmarks/benchmark_dataset.py | 1 +
 vllm/benchmarks/datasets.py     | 1 +
 2 files changed, 2 insertions(+)

diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 8774a50f6479..55c0cf851264 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -354,6 +354,7 @@ def sample(
                 :total_input_len
             ]
             prompt = tokenizer.decode(re_encoded_sequence)
+            total_input_len = len(re_encoded_sequence)
             requests.append(
                 SampleRequest(
                     prompt=prompt,
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 7be3fe63b556..b3688d2340e4 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -382,6 +382,7 @@ def sample(
             re_encoded_sequence = tokenizer.encode(
                 prompt, add_special_tokens=False)[:total_input_len]
             prompt = tokenizer.decode(re_encoded_sequence)
+            total_input_len = len(re_encoded_sequence)
             requests.append(
                 SampleRequest(
                     prompt=prompt,
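Note: PATCH 7/7 restores total_input_len = len(re_encoded_sequence) after the
decode step. Encode -> truncate -> decode does not always round-trip to
exactly prefix_len + input_len tokens, so the recorded length should be
measured from the truncated sequence rather than assumed. A standalone sketch
(plain lists stand in for token ids; the helper is illustrative):

    def truncate_and_measure(
        re_encoded_ids: list[int], prefix_len: int, input_len: int
    ) -> tuple[list[int], int]:
        requested = prefix_len + input_len
        kept = re_encoded_ids[:requested]
        # Report what was actually kept: when the re-encoded prompt is
        # shorter than requested, the two numbers differ.
        return kept, len(kept)

    ids = list(range(8))  # a re-encoded prompt of only 8 tokens
    kept, total_input_len = truncate_and_measure(ids, prefix_len=4, input_len=6)
    assert total_input_len == 8  # the actual length, not the requested 10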