Commit 8f4648c

Merge branch 'main' into marlin-moe-integration
2 parents: b0c4671 + 1a36287

487 files changed · +31334 / -6885 lines changed


.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml

Lines changed: 1 addition & 0 deletions
@@ -9,3 +9,4 @@ tasks:
     value: 0.664
 limit: 1000
 num_fewshot: 5
+trust_remote_code: True
.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml

Lines changed: 4 additions & 4 deletions

@@ -1,11 +1,11 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nvidia/Minitron-4B-Base -b auto -l 1000 -f 5 -t 1
-model_name: "nvidia/Minitron-4B-Base"
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
+model_name: "mgoin/Minitron-4B-Base-FP8"
 tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.252
+    value: 0.233
   - name: "exact_match,flexible-extract"
-    value: 0.252
+    value: 0.236
 limit: 1000
 num_fewshot: 5

.buildkite/lm-eval-harness/configs/models-small.txt

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
-Minitron-4B-Base.yaml
+Minitron-4B-Base-FP8.yaml
 Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-FP8W8.yaml
 Meta-Llama-3-8B-QQQ.yaml

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 5 additions & 2 deletions
@@ -14,7 +14,7 @@
 import numpy
 import yaml
 
-RTOL = 0.02
+RTOL = 0.05
 TEST_DATA_FILE = os.environ.get(
     "LM_EVAL_TEST_DATA_FILE",
     ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
@@ -23,9 +23,12 @@
 
 
 def launch_lm_eval(eval_config):
+    trust_remote_code = eval_config.get('trust_remote_code', False)
+
     model_args = f"pretrained={eval_config['model_name']}," \
                  f"tensor_parallel_size={TP_SIZE}," \
-                 f"add_bos_token=true"
+                 f"add_bos_token=true," \
+                 f"trust_remote_code={trust_remote_code}"
 
     results = lm_eval.simple_evaluate(
         model="vllm",

.buildkite/nightly-benchmarks/benchmark-pipeline.yaml

Lines changed: 16 additions & 16 deletions
@@ -42,20 +42,20 @@ steps:
       - name: devshm
         emptyDir:
           medium: Memory
-  - label: "H100"
-    agents:
-      queue: H100
-    plugins:
-      - docker#v5.11.0:
-          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-          command:
-          - bash
-          - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
-          mount-buildkite-agent: true
-          propagate-environment: true
-          ipc: host
-          gpus: all
-          environment:
-          - VLLM_USAGE_SOURCE
-          - HF_TOKEN
+  # - label: "H100"
+  #   agents:
+  #     queue: H100
+  #   plugins:
+  #     - docker#v5.11.0:
+  #         image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+  #         command:
+  #         - bash
+  #         - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+  #         mount-buildkite-agent: true
+  #         propagate-environment: true
+  #         ipc: host
+  #         gpus: all
+  #         environment:
+  #         - VLLM_USAGE_SOURCE
+  #         - HF_TOKEN

.buildkite/nightly-benchmarks/run-benchmarks-suite.sh

Lines changed: 4 additions & 14 deletions
@@ -70,23 +70,13 @@ wait_for_server() {
 
 kill_gpu_processes() {
   # kill all processes on GPU.
-  pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
-  if [ -z "$pids" ]; then
-    echo "No GPU processes found."
-  else
-    for pid in $pids; do
-      kill -9 "$pid"
-      echo "Killed process with PID: $pid"
-    done
 
-    echo "All GPU processes have been killed."
-  fi
+  ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
+  ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9
 
-  # waiting for GPU processes to be fully killed
-  # loop while nvidia-smi returns any processes
-  while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
+  # wait until GPU memory usage smaller than 1GB
+  while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
     sleep 1
-    echo "Waiting for GPU processes to be killed"
   done
 
   # remove vllm config file
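
The rewritten cleanup targets the benchmark's own python/openai and pt_main_thread processes instead of every PID nvidia-smi reports, then polls until the first GPU shows less than 1 GB of memory in use. A rough Python rendering of that polling step, for illustration only (the suite itself stays in bash):

```python
# Sketch of the "wait until GPU memory usage smaller than 1GB" loop from
# kill_gpu_processes(), expressed in Python purely for illustration.
import subprocess
import time


def wait_for_gpu_memory_release(threshold_mib=1000):
    while True:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used",
             "--format=csv,noheader,nounits"],
            text=True,
        )
        # Mirror `head -n 1`: only the first GPU's usage is considered.
        used_mib = int(out.splitlines()[0].strip())
        if used_mib < threshold_mib:
            break
        time.sleep(1)
```

As in the shell version, only the first GPU is checked; the assumption is that once the server processes are gone, memory on the remaining GPUs drains along with it.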

.buildkite/nightly-benchmarks/tests/descriptions.md

Lines changed: 7 additions & 12 deletions
@@ -1,47 +1,42 @@
 
 ## Latency tests
 
-This test suite aims to test vllm's end-to-end latency under a controlled setup.
-
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
 
-### Latency benchmarking results
 
 {latency_tests_markdown_table}
 
-## Throughput tests
 
-This test suite aims to test vllm's throughput.
+## Throughput tests
 
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: throughput.
 
-### Throughput benchmarking results
 
 {throughput_tests_markdown_table}
 
-## Serving tests
 
-This test suite aims to test vllm's real serving metrics.
+## Serving tests
 
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- We also added a speculative decoding test for llama-3 70B, under QPS 2
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
 
-### Serving benchmarking results
 
 {serving_tests_markdown_table}
 
+
 ## json version of the benchmarking tables
 
 This section contains the data of the markdown tables above in JSON format.

.buildkite/nightly-benchmarks/tests/latency-tests.json

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@
   {
     "test_name": "latency_llama8B_tp1",
     "parameters": {
-      "model": "meta-llama/Meta-Llama-3-8B",
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
       "tensor_parallel_size": 1,
       "load_format": "dummy",
       "num_iters_warmup": 5,
@@ -12,7 +12,7 @@
   {
     "test_name": "latency_llama70B_tp4",
     "parameters": {
-      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
       "tensor_parallel_size": 4,
       "load_format": "dummy",
       "num-iters-warmup": 5,

.buildkite/nightly-benchmarks/tests/serving-tests.json

Lines changed: 6 additions & 6 deletions
@@ -3,15 +3,15 @@
     "test_name": "serving_llama8B_tp1_sharegpt",
     "qps_list": [1, 4, 16, "inf"],
     "server_parameters": {
-      "model": "meta-llama/Meta-Llama-3-8B",
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
       "tensor_parallel_size": 1,
       "swap_space": 16,
       "disable_log_stats": "",
       "disable_log_requests": "",
       "load_format": "dummy"
     },
     "client_parameters": {
-      "model": "meta-llama/Meta-Llama-3-8B",
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
       "backend": "vllm",
       "dataset_name": "sharegpt",
       "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -22,15 +22,15 @@
     "test_name": "serving_llama70B_tp4_sharegpt",
     "qps_list": [1, 4, 16, "inf"],
     "server_parameters": {
-      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
       "tensor_parallel_size": 4,
       "swap_space": 16,
       "disable_log_stats": "",
       "disable_log_requests": "",
       "load_format": "dummy"
     },
     "client_parameters": {
-      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
       "backend": "vllm",
       "dataset_name": "sharegpt",
       "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -60,7 +60,7 @@
     "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
     "qps_list": [2],
     "server_parameters": {
-      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
       "disable_log_requests": "",
       "tensor_parallel_size": 4,
       "swap_space": 16,
@@ -70,7 +70,7 @@
       "use_v2_block_manager": ""
     },
     "client_parameters": {
-      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
       "backend": "vllm",
       "dataset_name": "sharegpt",
       "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",

.buildkite/nightly-benchmarks/tests/throughput-tests.json

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@
   {
     "test_name": "throughput_llama8B_tp1",
     "parameters": {
-      "model": "meta-llama/Meta-Llama-3-8B",
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
       "tensor_parallel_size": 1,
       "load_format": "dummy",
       "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -13,7 +13,7 @@
   {
     "test_name": "throughput_llama70B_tp4",
     "parameters": {
-      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
       "tensor_parallel_size": 4,
       "load_format": "dummy",
       "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
