Commit e7c4f9e

[CI/Build][Doc] Move existing benchmark scripts in CI/document/example to vllm bench CLI (#21355)
Signed-off-by: Ye (Charlotte) Qi <[email protected]>
1 parent 9094d11 commit e7c4f9e
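
The change itself is mechanical: every call site that previously invoked the standalone scripts under benchmarks/ now uses the equivalent `vllm bench` subcommand. As an illustration (the example commands are taken verbatim from .buildkite/scripts/run-benchmarks.sh below; this summary is not part of the commit itself):

# benchmarks/benchmark_latency.py    ->  vllm bench latency
# benchmarks/benchmark_throughput.py ->  vllm bench throughput
# benchmarks/benchmark_serving.py    ->  vllm bench serve

# e.g. the python-based benchmarks in run-benchmarks.sh now run as:
vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt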

File tree

14 files changed: +102 -87 lines changed

.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh

Lines changed: 12 additions & 12 deletions
@@ -73,7 +73,7 @@ get_current_llm_serving_engine() {
 echo "Container: vllm"
 # move to a completely irrelevant directory, to avoid import vllm from current folder
 export CURRENT_LLM_SERVING_ENGINE=vllm
-
+
 return
 fi
 }
@@ -227,7 +227,7 @@ run_serving_tests() {
 
 if [[ "$dataset_name" = "sharegpt" ]]; then
 
-client_command="python3 benchmark_serving.py \
+client_command="vllm bench serve \
 --backend $backend \
 --tokenizer /tokenizer_cache \
 --model $model \
@@ -248,7 +248,7 @@ run_serving_tests() {
 sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
 sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
 
-client_command="python3 benchmark_serving.py \
+client_command="vllm bench serve \
 --backend $backend \
 --tokenizer /tokenizer_cache \
 --model $model \
@@ -267,13 +267,13 @@ run_serving_tests() {
 $client_args"
 
 else
-
+
 echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
 exit 1
 
 fi
 
-
+
 
 echo "Running test case $test_name with qps $qps"
 echo "Client command: $client_command"
@@ -304,7 +304,7 @@ run_serving_tests() {
 }
 
 run_genai_perf_tests() {
-# run genai-perf tests
+# run genai-perf tests
 
 # $1: a json file specifying genai-perf test cases
 local genai_perf_test_file
@@ -313,14 +313,14 @@ run_genai_perf_tests() {
 # Iterate over genai-perf tests
 jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
 # get the test name, and append the GPU type back to it.
-test_name=$(echo "$params" | jq -r '.test_name')
-
+test_name=$(echo "$params" | jq -r '.test_name')
+
 # if TEST_SELECTOR is set, only run the test cases that match the selector
 if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
 echo "Skip test case $test_name."
 continue
 fi
-
+
 # prepend the current serving engine to the test name
 test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
 
@@ -371,10 +371,10 @@ run_genai_perf_tests() {
 qps=$num_prompts
 echo "now qps is $qps"
 fi
-
+
 new_test_name=$test_name"_qps_"$qps
 backend=$CURRENT_LLM_SERVING_ENGINE
-
+
 if [[ "$backend" == *"vllm"* ]]; then
 backend="vllm"
 fi
@@ -415,7 +415,7 @@ prepare_dataset() {
 do
 cat sonnet.txt >> sonnet_4x.txt
 done
-
+
 }
 
 main() {

.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 3 additions & 3 deletions
@@ -206,7 +206,7 @@ run_latency_tests() {
 fi
 fi
 
-latency_command=" $latency_envs python3 benchmark_latency.py \
+latency_command=" $latency_envs vllm bench latency \
 --output-json $RESULTS_FOLDER/${test_name}.json \
 $latency_args"
 
@@ -273,7 +273,7 @@ run_throughput_tests() {
 fi
 fi
 
-throughput_command=" $throughput_envs python3 benchmark_throughput.py \
+throughput_command=" $throughput_envs vllm bench throughput \
 --output-json $RESULTS_FOLDER/${test_name}.json \
 $throughput_args"
 
@@ -394,7 +394,7 @@ run_serving_tests() {
 
 # pass the tensor parallel size to the client so that it can be displayed
 # on the benchmark dashboard
-client_command="python3 benchmark_serving.py \
+client_command="vllm bench serve \
 --save-result \
 --result-dir $RESULTS_FOLDER \
 --result-filename ${new_test_name}.json \

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 5 additions & 5 deletions
@@ -13,9 +13,9 @@ NUMA_NODE=${NUMA_NODE:-1}
 export CMAKE_BUILD_PARALLEL_LEVEL=32
 
 # Setup cleanup
-remove_docker_container() {
-set -e;
-docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
+remove_docker_container() {
+set -e;
+docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
 }
 trap remove_docker_container EXIT
 remove_docker_container
@@ -69,7 +69,7 @@ function cpu_tests() {
 docker exec cpu-test-"$NUMA_NODE" bash -c "
 set -e
 pytest -s -v \
-tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
+tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
 
 # Note: disable it until supports V1
 # Run AWQ test
@@ -83,7 +83,7 @@ function cpu_tests() {
 set -e
 VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
 timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --dataset-name random \
 --model meta-llama/Llama-3.2-3B-Instruct \

.buildkite/scripts/run-benchmarks.sh

Lines changed: 3 additions & 3 deletions
@@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 
 # run python-based benchmarks and upload the result to buildkite
-python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
+vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
 bench_latency_exit_code=$?
 
-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
+vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
 bench_throughput_exit_code=$?
 
 # run server-based benchmarks and upload the result to buildkite
@@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
 
 # wait for server to start, timeout after 600 seconds
 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --dataset-name sharegpt \
 --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
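
To reproduce this flow outside CI, the pattern is the same: once an OpenAI-compatible vLLM server is listening on port 8000, point `vllm bench serve` at the downloaded ShareGPT file. A minimal sketch, assuming vllm is installed and a server is already running; the `--model` value is a placeholder and not part of this diff:

# download the ShareGPT dataset used by the script above
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

# wait for the server, then run the serving benchmark against it
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
vllm bench serve \
  --backend vllm \
  --dataset-name sharegpt \
  --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
  --model <your served model>   # placeholder: must match the model the server was launched with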

.buildkite/scripts/tpu/run_bm.sh

Lines changed: 1 addition & 1 deletion
@@ -77,7 +77,7 @@ done
 echo "run benchmark test..."
 echo "logging to $BM_LOG"
 echo
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --model $MODEL \
 --dataset-name sonnet \

benchmarks/README.md

Lines changed: 33 additions & 33 deletions
@@ -98,7 +98,7 @@ Then run the benchmarking script
 ```bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --model NousResearch/Hermes-3-Llama-3.1-8B \
 --endpoint /v1/completions \
@@ -111,25 +111,25 @@ If successful, you will see the following output
 
 ```
 ============ Serving Benchmark Result ============
-Successful requests: 10
-Benchmark duration (s): 5.78
-Total input tokens: 1369
-Total generated tokens: 2212
-Request throughput (req/s): 1.73
-Output token throughput (tok/s): 382.89
-Total Token throughput (tok/s): 619.85
+Successful requests: 10
+Benchmark duration (s): 5.78
+Total input tokens: 1369
+Total generated tokens: 2212
+Request throughput (req/s): 1.73
+Output token throughput (tok/s): 382.89
+Total Token throughput (tok/s): 619.85
 ---------------Time to First Token----------------
-Mean TTFT (ms): 71.54
-Median TTFT (ms): 73.88
-P99 TTFT (ms): 79.49
+Mean TTFT (ms): 71.54
+Median TTFT (ms): 73.88
+P99 TTFT (ms): 79.49
 -----Time per Output Token (excl. 1st token)------
-Mean TPOT (ms): 7.91
-Median TPOT (ms): 7.96
-P99 TPOT (ms): 8.03
+Mean TPOT (ms): 7.91
+Median TPOT (ms): 7.96
+P99 TPOT (ms): 8.03
 ---------------Inter-token Latency----------------
-Mean ITL (ms): 7.74
-Median ITL (ms): 7.70
-P99 ITL (ms): 8.39
+Mean ITL (ms): 7.74
+Median ITL (ms): 7.70
+P99 ITL (ms): 8.39
 ==================================================
 ```
@@ -141,7 +141,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you
 {"prompt": "What is the capital of India?"}
 {"prompt": "What is the capital of Iran?"}
 {"prompt": "What is the capital of China?"}
-```
+```
 
 ```bash
 # start server
@@ -150,7 +150,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
 
 ```bash
 # run benchmarking script
-python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
+vllm bench serve --port 9001 --save-result --save-detailed \
 --backend vllm \
 --model meta-llama/Llama-3.1-8B-Instruct \
 --endpoint /v1/completions \
@@ -174,7 +174,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 ```
 
 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend openai-chat \
 --model Qwen/Qwen2-VL-7B-Instruct \
 --endpoint /v1/chat/completions \
@@ -194,7 +194,7 @@ VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
 ```
 
 ``` bash
-python3 benchmarks/benchmark_serving.py \
+vllm bench serve \
 --model meta-llama/Meta-Llama-3-8B-Instruct \
 --dataset-name hf \
 --dataset-path likaixin/InstructCoder \
@@ -210,7 +210,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 **`lmms-lab/LLaVA-OneVision-Data`**
 
 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend openai-chat \
 --model Qwen/Qwen2-VL-7B-Instruct \
 --endpoint /v1/chat/completions \
@@ -224,7 +224,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
 **`Aeala/ShareGPT_Vicuna_unfiltered`**
 
 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend openai-chat \
 --model Qwen/Qwen2-VL-7B-Instruct \
 --endpoint /v1/chat/completions \
@@ -237,7 +237,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
 **`AI-MO/aimo-validation-aime`**
 
 ``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
 --model Qwen/QwQ-32B \
 --dataset-name hf \
 --dataset-path AI-MO/aimo-validation-aime \
@@ -248,7 +248,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
 **`philschmid/mt-bench`**
 
 ``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
 --model Qwen/QwQ-32B \
 --dataset-name hf \
 --dataset-path philschmid/mt-bench \
@@ -261,7 +261,7 @@ When using OpenAI-compatible backends such as `vllm`, optional sampling
 parameters can be specified. Example client command:
 
 ```bash
-python3 vllm/benchmarks/benchmark_serving.py \
+vllm bench serve \
 --backend vllm \
 --model NousResearch/Hermes-3-Llama-3.1-8B \
 --endpoint /v1/completions \
@@ -296,7 +296,7 @@ The following arguments can be used to control the ramp-up:
 <br/>
 
 ```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
 --model NousResearch/Hermes-3-Llama-3.1-8B \
 --dataset-name sonnet \
 --dataset-path vllm/benchmarks/sonnet.txt \
@@ -314,7 +314,7 @@ Total num output tokens: 1500
 **VisionArena Benchmark for Vision Language Models**
 
 ``` bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
 --model Qwen/Qwen2-VL-7B-Instruct \
 --backend vllm-chat \
 --dataset-name hf \
@@ -336,7 +336,7 @@ Total num output tokens: 1280
 ``` bash
 VLLM_WORKER_MULTIPROC_METHOD=spawn \
 VLLM_USE_V1=1 \
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
 --dataset-name=hf \
 --dataset-path=likaixin/InstructCoder \
 --model=meta-llama/Meta-Llama-3-8B-Instruct \
@@ -360,7 +360,7 @@ Total num output tokens: 204800
 **`lmms-lab/LLaVA-OneVision-Data`**
 
 ```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
 --model Qwen/Qwen2-VL-7B-Instruct \
 --backend vllm-chat \
 --dataset-name hf \
@@ -373,7 +373,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
 **`Aeala/ShareGPT_Vicuna_unfiltered`**
 
 ```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
 --model Qwen/Qwen2-VL-7B-Instruct \
 --backend vllm-chat \
 --dataset-name hf \
@@ -385,7 +385,7 @@ python3 vllm/benchmarks/benchmark_throughput.py \
 **`AI-MO/aimo-validation-aime`**
 
 ```bash
-python3 benchmarks/benchmark_throughput.py \
+vllm bench throughput \
 --model Qwen/QwQ-32B \
 --backend vllm \
 --dataset-name hf \
@@ -399,7 +399,7 @@ python3 benchmarks/benchmark_throughput.py \
 ``` bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_throughput.py \
+vllm bench throughput \
 --model meta-llama/Llama-2-7b-hf \
 --backend vllm \
 --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
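
All README examples above now assume the `vllm bench` entry point shipped with vLLM. A quick sanity check before running any of them (assuming vllm is installed in the active environment; the exact help output is illustrative, not quoted from this commit):

vllm bench --help        # should list the serve, latency, and throughput subcommands used above
vllm bench serve --help  # prints the full flag reference for the serving benchmark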
