From 0057b2b04202bf39a14ea6b2da4865ced77c62dc Mon Sep 17 00:00:00 2001
From: Yang Wang <elainewy@meta.com>
Date: Tue, 29 Apr 2025 16:24:47 -0700
Subject: [PATCH 01/10] Run more tests on torch nightly and add their requirements

Signed-off-by: Yang Wang <elainewy@meta.com>
---
 .buildkite/test-pipeline.yaml       | 4 ++++
 requirements/nightly_torch_test.txt | 8 +++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d3c07cdda454..84ee991f5659 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -293,6 +293,7 @@ steps:
   parallelism: 4
 
 - label: PyTorch Compilation Unit Tests
+  torch_nightly: true
   source_file_dependencies:
     - vllm/
     - tests/compile
@@ -302,6 +303,7 @@ steps:
     - pytest -v -s compile/test_sequence_parallelism.py
 
 - label: PyTorch Fullgraph Smoke Test # 9min
+  torch_nightly: true
   source_file_dependencies:
   - vllm/
   - tests/compile
@@ -312,6 +314,7 @@ steps:
   - pytest -v -s compile/piecewise/test_toy_llama.py
 
 - label: PyTorch Fullgraph Test # 18min
+  torch_nightly: true
   source_file_dependencies:
   - vllm/
   - tests/compile
@@ -436,6 +439,7 @@ steps:
 #####  models test  #####
 
 - label: Basic Models Test # 24min
+  torch_nightly: true
   source_file_dependencies:
   - vllm/
   - tests/models
diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt
index 199bcafe0bdd..e2711354ac10 100644
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@@ -23,5 +23,11 @@ runai-model-streamer-s3==0.11.0
 tensorizer>=2.9.0
 lm-eval==0.4.8
 buildkite-test-collector==0.1.9
-
 lm-eval[api]==0.4.8 # required for model evaluation test
+
+# required for quantization test
+bitsandbytes>=0.45.3
+
+# required for minicpmo_26 test
+vector_quantize_pytorch
+vocos

From 2ec84224245b8581b5db81edd4a08130b11bde14 Mon Sep 17 00:00:00 2001
From: Yang Wang <elainewy@meta.com>
Date: Tue, 29 Apr 2025 16:37:18 -0700
Subject: [PATCH 02/10] Add guided decoding backend option helpers

Signed-off-by: Yang Wang <elainewy@meta.com>
---
 vllm/sampling_params.py | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 3ac5c5c3daab..8d1a52c329c6 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -14,7 +14,7 @@
 from vllm.logits_process import LogitsProcessor
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
-
+# test
 logger = init_logger(__name__)
 
 _SAMPLING_EPS = 1e-5
@@ -73,6 +73,36 @@ def from_optional(
             structural_tag=structural_tag,
         )
 
+    @property
+    def backend_name(self) -> str:
+        """Return the backend name without any options.
+
+        For example if the backend is "xgrammar:no-fallback", returns "xgrammar"
+        """
+        return (self.backend or "").split(":")[0]
+
+    def backend_options(self) -> list[str]:
+        """Return the backend options as a list of strings."""
+        if not self.backend or ":" not in self.backend:
+            return []
+        return self.backend.split(":")[1].split(",")
+
+    def add_option(self, opt_name: str) -> None:
+        """Adds an option to the backend options."""
+        if not self.backend:
+            self.backend = f":{opt_name}"
+        elif ":" not in self.backend:
+            self.backend += f":{opt_name}"
+        else:
+            options = set(self.backend_options())
+            options.add(opt_name)
+            self.backend = f"{self.backend_name}:{','.join(sorted(options))}"
+
+    def no_fallback(self) -> bool:
+        """Returns True if the "no-fallback" option is supplied for the guided
+        decoding backend"""
+        return "no-fallback" in self.backend_options()
+
     def __post_init__(self):
         """Validate that some fields are mutually exclusive."""
         guide_count = sum([

From 0ac27fbc51125a519384f2af55ab84ec7d073601 Mon Sep 17 00:00:00 2001
From: Yang Wang <elainewy@meta.com>
Date: Tue, 29 Apr 2025 16:37:34 -0700
Subject: [PATCH 03/10] Remove "# test" placeholder from sampling_params.py

Signed-off-by: Yang Wang <elainewy@meta.com>
---
 vllm/sampling_params.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 8d1a52c329c6..33738f0e5bc8 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -14,7 +14,7 @@
 from vllm.logits_process import LogitsProcessor
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
-# test
+
 logger = init_logger(__name__)
 
 _SAMPLING_EPS = 1e-5

From d5f5684e227a37018aa5558ac7b5abc4d9e79299 Mon Sep 17 00:00:00 2001
From: Yang Wang <elainewy@meta.com>
Date: Tue, 29 Apr 2025 16:41:01 -0700
Subject: [PATCH 04/10] Re-add "# test" placeholder in sampling_params.py

Signed-off-by: Yang Wang <elainewy@meta.com>
---
 vllm/sampling_params.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 33738f0e5bc8..8d1a52c329c6 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -14,7 +14,7 @@
 from vllm.logits_process import LogitsProcessor
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
-
+# test
 logger = init_logger(__name__)
 
 _SAMPLING_EPS = 1e-5

From 5ef55ff4f0771ae032ebe239bf69775989b73105 Mon Sep 17 00:00:00 2001
From: Yang Wang <elainewy@meta.com>
Date: Tue, 29 Apr 2025 16:41:41 -0700
Subject: [PATCH 05/10] Remove "# test" placeholder from sampling_params.py

Signed-off-by: Yang Wang <elainewy@meta.com>
---
 vllm/sampling_params.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 8d1a52c329c6..33738f0e5bc8 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -14,7 +14,7 @@
 from vllm.logits_process import LogitsProcessor
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
-# test
+
 logger = init_logger(__name__)
 
 _SAMPLING_EPS = 1e-5

From 92a7f3c756e48750ea05573c0dd40b1dc65e4899 Mon Sep 17 00:00:00 2001
From: Yang Wang <elainewy@meta.com>
Date: Tue, 29 Apr 2025 16:43:24 -0700
Subject: [PATCH 06/10] Add "# comment" placeholder in sampling_params.py

Signed-off-by: Yang Wang <elainewy@meta.com>
---
 vllm/sampling_params.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 33738f0e5bc8..332f48708562 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -14,7 +14,7 @@
 from vllm.logits_process import LogitsProcessor
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
-
+# comment
 logger = init_logger(__name__)
 
 _SAMPLING_EPS = 1e-5

From 525254bb0e5838dd79e2a70471b9372b3bdc4e4f Mon Sep 17 00:00:00 2001
From: Yang Wang <elainewy@meta.com>
Date: Tue, 29 Apr 2025 16:43:37 -0700
Subject: [PATCH 07/10] Remove "# comment" placeholder from sampling_params.py

Signed-off-by: Yang Wang <elainewy@meta.com>
---
 vllm/sampling_params.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 332f48708562..33738f0e5bc8 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -14,7 +14,7 @@
 from vllm.logits_process import LogitsProcessor
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
-# comment
+
 logger = init_logger(__name__)
 
 _SAMPLING_EPS = 1e-5

From 237c8befcfa6297443ee1c8c3dfadc1829cf4fdf Mon Sep 17 00:00:00 2001
From: Yang Wang <elainewy@meta.com>
Date: Thu, 1 May 2025 10:02:17 -0700
Subject: [PATCH 08/10] Remove backend option helpers; strip docstring whitespace

Signed-off-by: Yang Wang <elainewy@meta.com>
---
 vllm/sampling_params.py | 36 +++---------------------------------
 1 file changed, 3 insertions(+), 33 deletions(-)

diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 33738f0e5bc8..66a77681be9a 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -73,36 +73,6 @@ def from_optional(
             structural_tag=structural_tag,
         )
 
-    @property
-    def backend_name(self) -> str:
-        """Return the backend name without any options.
-
-        For example if the backend is "xgrammar:no-fallback", returns "xgrammar"
-        """
-        return (self.backend or "").split(":")[0]
-
-    def backend_options(self) -> list[str]:
-        """Return the backend options as a list of strings."""
-        if not self.backend or ":" not in self.backend:
-            return []
-        return self.backend.split(":")[1].split(",")
-
-    def add_option(self, opt_name: str) -> None:
-        """Adds an option to the backend options."""
-        if not self.backend:
-            self.backend = f":{opt_name}"
-        elif ":" not in self.backend:
-            self.backend += f":{opt_name}"
-        else:
-            options = set(self.backend_options())
-            options.add(opt_name)
-            self.backend = f"{self.backend_name}:{','.join(sorted(options))}"
-
-    def no_fallback(self) -> bool:
-        """Returns True if the "no-fallback" option is supplied for the guided
-        decoding backend"""
-        return "no-fallback" in self.backend_options()
-
     def __post_init__(self):
         """Validate that some fields are mutually exclusive."""
         guide_count = sum([
@@ -216,9 +186,9 @@ class SamplingParams(
         logits_processors: list of functions that modify logits based on
             previously generated tokens, and optionally prompt tokens as
             a first argument.
-        truncate_prompt_tokens: If set to -1, will use the truncation size 
-            supported by the model. If set to an integer k, will use only 
-            the last k tokens from the prompt (i.e., left truncation). 
+        truncate_prompt_tokens: If set to -1, will use the truncation size
+            supported by the model. If set to an integer k, will use only
+            the last k tokens from the prompt (i.e., left truncation).
             Defaults to None (i.e., no truncation).
         guided_decoding: If provided, the engine will construct a guided
             decoding logits processor from these parameters. Defaults to None.

From 4e4cd0933b2a962e1986e5b5f532e87701498ced Mon Sep 17 00:00:00 2001
From: Yang Wang <elainewy@meta.com>
Date: Thu, 1 May 2025 10:07:22 -0700
Subject: [PATCH 09/10] Drop truncate_prompt_tokens default note from docstring

Signed-off-by: Yang Wang <elainewy@meta.com>
---
 vllm/sampling_params.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 66a77681be9a..6748c31dff89 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -188,8 +188,7 @@ class SamplingParams(
             a first argument.
         truncate_prompt_tokens: If set to -1, will use the truncation size
             supported by the model. If set to an integer k, will use only
-            the last k tokens from the prompt (i.e., left truncation).
-            Defaults to None (i.e., no truncation).
+            the last k tokens from the prompt (i.e., left truncation). 
         guided_decoding: If provided, the engine will construct a guided
             decoding logits processor from these parameters. Defaults to None.
         logit_bias: If provided, the engine will construct a logits processor

From fcd71d03145255302933bc6dd0ec804a3170289e Mon Sep 17 00:00:00 2001
From: Yang Wang <elainewy@meta.com>
Date: Thu, 1 May 2025 10:07:54 -0700
Subject: [PATCH 10/10] Restore truncate_prompt_tokens default note in docstring

Signed-off-by: Yang Wang <elainewy@meta.com>
---
 vllm/sampling_params.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 6748c31dff89..66a77681be9a 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -188,7 +188,8 @@ class SamplingParams(
             a first argument.
         truncate_prompt_tokens: If set to -1, will use the truncation size
             supported by the model. If set to an integer k, will use only
-            the last k tokens from the prompt (i.e., left truncation). 
+            the last k tokens from the prompt (i.e., left truncation).
+            Defaults to None (i.e., no truncation).
         guided_decoding: If provided, the engine will construct a guided
             decoding logits processor from these parameters. Defaults to None.
         logit_bias: If provided, the engine will construct a logits processor