Commit 8f4648c

Merge branch 'main' into marlin-moe-integration
2 parents: b0c4671 + 1a36287

487 files changed · +31334 / -6885 lines changed


.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml

Lines changed: 1 addition & 0 deletions
@@ -9,3 +9,4 @@ tasks:
     value: 0.664
 limit: 1000
 num_fewshot: 5
+trust_remote_code: True
.buildkite/lm-eval-harness/configs/Minitron-4B-Base-FP8.yaml

Lines changed: 4 additions & 4 deletions

@@ -1,11 +1,11 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nvidia/Minitron-4B-Base -b auto -l 1000 -f 5 -t 1
-model_name: "nvidia/Minitron-4B-Base"
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
+model_name: "mgoin/Minitron-4B-Base-FP8"
 tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.252
+    value: 0.233
   - name: "exact_match,flexible-extract"
-    value: 0.252
+    value: 0.236
 limit: 1000
 num_fewshot: 5

.buildkite/lm-eval-harness/configs/models-small.txt

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
-Minitron-4B-Base.yaml
+Minitron-4B-Base-FP8.yaml
 Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-FP8W8.yaml
 Meta-Llama-3-8B-QQQ.yaml

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 5 additions & 2 deletions
@@ -14,7 +14,7 @@
 import numpy
 import yaml
 
-RTOL = 0.02
+RTOL = 0.05
 TEST_DATA_FILE = os.environ.get(
     "LM_EVAL_TEST_DATA_FILE",
     ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
@@ -23,9 +23,12 @@
 
 
 def launch_lm_eval(eval_config):
+    trust_remote_code = eval_config.get('trust_remote_code', False)
+
     model_args = f"pretrained={eval_config['model_name']}," \
                  f"tensor_parallel_size={TP_SIZE}," \
-                 f"add_bos_token=true"
+                 f"add_bos_token=true," \
+                 f"trust_remote_code={trust_remote_code}"
 
     results = lm_eval.simple_evaluate(
         model="vllm",

.buildkite/nightly-benchmarks/benchmark-pipeline.yaml

Lines changed: 16 additions & 16 deletions
@@ -42,20 +42,20 @@ steps:
       - name: devshm
         emptyDir:
           medium: Memory
-  - label: "H100"
-    agents:
-      queue: H100
-    plugins:
-      - docker#v5.11.0:
-          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-          command:
-          - bash
-          - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
-          mount-buildkite-agent: true
-          propagate-environment: true
-          ipc: host
-          gpus: all
-          environment:
-          - VLLM_USAGE_SOURCE
-          - HF_TOKEN
+  # - label: "H100"
+  #   agents:
+  #     queue: H100
+  #   plugins:
+  #     - docker#v5.11.0:
+  #         image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+  #         command:
+  #         - bash
+  #         - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+  #         mount-buildkite-agent: true
+  #         propagate-environment: true
+  #         ipc: host
+  #         gpus: all
+  #         environment:
+  #         - VLLM_USAGE_SOURCE
+  #         - HF_TOKEN

.buildkite/nightly-benchmarks/run-benchmarks-suite.sh

Lines changed: 4 additions & 14 deletions
@@ -70,23 +70,13 @@ wait_for_server() {
 
 kill_gpu_processes() {
   # kill all processes on GPU.
-  pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
-  if [ -z "$pids" ]; then
-    echo "No GPU processes found."
-  else
-    for pid in $pids; do
-      kill -9 "$pid"
-      echo "Killed process with PID: $pid"
-    done
 
-    echo "All GPU processes have been killed."
-  fi
+  ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
+  ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9
 
-  # waiting for GPU processes to be fully killed
-  # loop while nvidia-smi returns any processes
-  while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
+  # wait until GPU memory usage smaller than 1GB
+  while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
     sleep 1
-    echo "Waiting for GPU processes to be killed"
   done
 
   # remove vllm config file
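
The rewritten cleanup targets the benchmark's own python/openai and pt_main_thread processes instead of every PID nvidia-smi reports, then polls until the first GPU shows less than 1 GB of memory in use. A rough Python rendering of that polling step, for illustration only (the suite itself stays in bash):

```python
# Sketch of the "wait until GPU memory usage smaller than 1GB" loop from
# kill_gpu_processes(), expressed in Python purely for illustration.
import subprocess
import time


def wait_for_gpu_memory_release(threshold_mib=1000):
    while True:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used",
             "--format=csv,noheader,nounits"],
            text=True,
        )
        # Mirror `head -n 1`: only the first GPU's usage is considered.
        used_mib = int(out.splitlines()[0].strip())
        if used_mib < threshold_mib:
            break
        time.sleep(1)
```

As in the shell version, only the first GPU is checked; the assumption is that once the server processes are gone, memory on the remaining GPUs drains along with it.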

.buildkite/nightly-benchmarks/tests/descriptions.md

Lines changed: 7 additions & 12 deletions
@@ -1,47 +1,42 @@
 
 ## Latency tests
 
-This test suite aims to test vllm's end-to-end latency under a controlled setup.
-
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
 
-### Latency benchmarking results
 
 {latency_tests_markdown_table}
 
-## Throughput tests
 
-This test suite aims to test vllm's throughput.
+## Throughput tests
 
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: throughput.
 
-### Throughput benchmarking results
 
 {throughput_tests_markdown_table}
 
-## Serving tests
 
-This test suite aims to test vllm's real serving metrics.
+## Serving tests
 
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- We also added a speculative decoding test for llama-3 70B, under QPS 2
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
 
-### Serving benchmarking results
 
 {serving_tests_markdown_table}
 
+
 ## json version of the benchmarking tables
 
 This section contains the data of the markdown tables above in JSON format.

.buildkite/nightly-benchmarks/tests/latency-tests.json

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@
   {
     "test_name": "latency_llama8B_tp1",
     "parameters": {
-      "model": "meta-llama/Meta-Llama-3-8B",
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
       "tensor_parallel_size": 1,
       "load_format": "dummy",
       "num_iters_warmup": 5,
@@ -12,7 +12,7 @@
   {
     "test_name": "latency_llama70B_tp4",
     "parameters": {
-      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
       "tensor_parallel_size": 4,
       "load_format": "dummy",
       "num-iters-warmup": 5,

.buildkite/nightly-benchmarks/tests/serving-tests.json

Lines changed: 6 additions & 6 deletions
@@ -3,15 +3,15 @@
     "test_name": "serving_llama8B_tp1_sharegpt",
     "qps_list": [1, 4, 16, "inf"],
     "server_parameters": {
-      "model": "meta-llama/Meta-Llama-3-8B",
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
       "tensor_parallel_size": 1,
       "swap_space": 16,
       "disable_log_stats": "",
       "disable_log_requests": "",
       "load_format": "dummy"
     },
     "client_parameters": {
-      "model": "meta-llama/Meta-Llama-3-8B",
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
       "backend": "vllm",
       "dataset_name": "sharegpt",
       "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -22,15 +22,15 @@
     "test_name": "serving_llama70B_tp4_sharegpt",
     "qps_list": [1, 4, 16, "inf"],
     "server_parameters": {
-      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
       "tensor_parallel_size": 4,
       "swap_space": 16,
       "disable_log_stats": "",
       "disable_log_requests": "",
       "load_format": "dummy"
     },
     "client_parameters": {
-      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
       "backend": "vllm",
       "dataset_name": "sharegpt",
       "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -60,7 +60,7 @@
     "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
     "qps_list": [2],
     "server_parameters": {
-      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
       "disable_log_requests": "",
       "tensor_parallel_size": 4,
       "swap_space": 16,
@@ -70,7 +70,7 @@
       "use_v2_block_manager": ""
     },
     "client_parameters": {
-      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
       "backend": "vllm",
       "dataset_name": "sharegpt",
       "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",

.buildkite/nightly-benchmarks/tests/throughput-tests.json

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@
   {
     "test_name": "throughput_llama8B_tp1",
     "parameters": {
-      "model": "meta-llama/Meta-Llama-3-8B",
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
       "tensor_parallel_size": 1,
       "load_format": "dummy",
       "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -13,7 +13,7 @@
   {
     "test_name": "throughput_llama70B_tp4",
     "parameters": {
-      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
       "tensor_parallel_size": 4,
       "load_format": "dummy",
       "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
