diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d3c07cdda454..84ee991f5659 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -293,6 +293,7 @@ steps: parallelism: 4 - label: PyTorch Compilation Unit Tests + torch_nightly: true source_file_dependencies: - vllm/ - tests/compile @@ -302,6 +303,7 @@ steps: - pytest -v -s compile/test_sequence_parallelism.py - label: PyTorch Fullgraph Smoke Test # 9min + torch_nightly: true source_file_dependencies: - vllm/ - tests/compile @@ -312,6 +314,7 @@ steps: - pytest -v -s compile/piecewise/test_toy_llama.py - label: PyTorch Fullgraph Test # 18min + torch_nightly: true source_file_dependencies: - vllm/ - tests/compile @@ -436,6 +439,7 @@ steps: ##### models test ##### - label: Basic Models Test # 24min + torch_nightly: true source_file_dependencies: - vllm/ - tests/models diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index 199bcafe0bdd..e2711354ac10 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -23,5 +23,11 @@ runai-model-streamer-s3==0.11.0 tensorizer>=2.9.0 lm-eval==0.4.8 buildkite-test-collector==0.1.9 - lm-eval[api]==0.4.8 # required for model evaluation test + +# required for quantization test +bitsandbytes>=0.45.3 + +# required for minicpmo_26 test +vector_quantize_pytorch +vocos diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 3ac5c5c3daab..66a77681be9a 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -186,9 +186,9 @@ class SamplingParams( logits_processors: list of functions that modify logits based on previously generated tokens, and optionally prompt tokens as a first argument. - truncate_prompt_tokens: If set to -1, will use the truncation size - supported by the model. If set to an integer k, will use only - the last k tokens from the prompt (i.e., left truncation). + truncate_prompt_tokens: If set to -1, will use the truncation size + supported by the model. If set to an integer k, will use only + the last k tokens from the prompt (i.e., left truncation). Defaults to None (i.e., no truncation). guided_decoding: If provided, the engine will construct a guided decoding logits processor from these parameters. Defaults to None.