Commit 90ac00c

Merge remote-tracking branch 'upstream/main' into min-new-tokens

* upstream/main:
  [Misc] Bump up transformers to v4.39.0 & Remove StarCoder2Config (vllm-project#3551)
  [Misc][Log] Add log for tokenizer length not equal to vocabulary size (vllm-project#3500)
  [🚀 Ready to be merged] Added support for Jais models (vllm-project#3183)
  Fix 1D query issue from `_prune_hidden_states` (vllm-project#3539)
  [PREFIX CACHING FOLLOW UP] OrderedDict-based evictor (vllm-project#3431)
  [BugFix] Hot fix in setup.py for neuron build (vllm-project#3537)
  Migrate `logits` computation and gather to `model_runner` (vllm-project#3233)
  [1/n][Chunked Prefill] Refactor input query shapes (vllm-project#3236)
  [1/n] Triton sampling kernel (vllm-project#3186)
  [Bugfix] Fix ROCm support in CMakeLists.txt (vllm-project#3534)

2 parents: b93e18f + c188ecb

71 files changed: +2856, -687 lines

.buildkite/test-pipeline.yaml
Lines changed: 5 additions & 2 deletions

@@ -47,7 +47,10 @@ steps:
     - pytest -v -s prefix_caching
 
 - label: Samplers Test
-  command: pytest -v -s samplers --forked
+  command: pytest -v -s samplers
+
+- label: LogitsProcessor Test
+  command: pytest -v -s test_logits_processor.py
 
 - label: Worker Test
   command: pytest -v -s worker
@@ -56,7 +59,7 @@ steps:
   command: pytest -v -s spec_decode
 
 - label: LoRA Test %N
-  command: pytest -v -s lora --forked --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 4
 
 - label: Metrics Test

CMakeLists.txt
Lines changed: 1 addition & 1 deletion

@@ -51,7 +51,7 @@ append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
 
 # Ensure the 'nvcc' command is in the PATH
 find_program(NVCC_EXECUTABLE nvcc)
-if (NOT NVCC_EXECUTABLE)
+if (CUDA_FOUND AND NOT NVCC_EXECUTABLE)
   message(FATAL_ERROR "nvcc not found")
 endif()
 

README.md
Lines changed: 1 addition & 0 deletions

@@ -76,6 +76,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
 - GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
 - InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
 - InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
+- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
 - LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
 - Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
 - Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.)

docs/source/models/supported_models.rst
Lines changed: 5 additions & 1 deletion

@@ -66,7 +66,11 @@ Alongside each architecture, we include some popular models that use it.
   * - :code:`InternLM2ForCausalLM`
     - InternLM2
     - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc.
-    -
+    -
+  * - :code:`JAISLMHeadModel`
+    - Jais
+    - :code:`core42/jais-13b`, :code:`core42/jais-13b-chat`, :code:`core42/jais-30b-v3`, :code:`core42/jais-30b-chat-v3`, etc.
+    -
   * - :code:`LlamaForCausalLM`
     - LLaMA, LLaMA-2, Vicuna, Alpaca, Yi
     - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.

requirements-rocm.txt
Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@ ray >= 2.9
 sentencepiece # Required for LLaMA tokenizer.
 numpy
 tokenizers>=0.15.0
-transformers >= 4.38.0 # Required for Gemma.
+transformers >= 4.39.0 # Required for StarCoder2.
 fastapi
 uvicorn[standard]
 pydantic >= 2.0 # Required for OpenAI server.

requirements.txt
Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@ ray >= 2.9
 sentencepiece # Required for LLaMA tokenizer.
 numpy
 torch == 2.1.2
-transformers >= 4.38.0 # Required for Gemma.
+transformers >= 4.39.0 # Required for StarCoder2.
 xformers == 0.0.23.post1 # Required for CUDA 12.1.
 fastapi
 uvicorn[standard]

setup.py
Lines changed: 1 addition & 1 deletion

@@ -168,7 +168,7 @@ def build_extensions(self) -> None:
 
 
 def _is_cuda() -> bool:
-    return torch.version.cuda is not None
+    return torch.version.cuda is not None and not _is_neuron()
 
 
 def _is_hip() -> bool:

tests/basic_correctness/test_basic_correctness.py
Lines changed: 3 additions & 1 deletion

@@ -13,19 +13,21 @@
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])
+@pytest.mark.parametrize("enforce_eager", [False, True])
 def test_models(
     hf_runner,
     vllm_runner,
     example_prompts,
     model: str,
     dtype: str,
     max_tokens: int,
+    enforce_eager: bool,
 ) -> None:
     hf_model = hf_runner(model, dtype=dtype)
     hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
     del hf_model
 
-    vllm_model = vllm_runner(model, dtype=dtype)
+    vllm_model = vllm_runner(model, dtype=dtype, enforce_eager=enforce_eager)
     vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
     del vllm_model
 

tests/core/test_scheduler.py
Lines changed: 9 additions & 9 deletions

@@ -10,7 +10,7 @@
 
 def test_scheduler_add_seq_group():
     block_size = 4
-    scheduler_config = SchedulerConfig(100, 64, 1, 256)
+    scheduler_config = SchedulerConfig(100, 64, 1)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 4
     cache_config.num_gpu_blocks = 4
@@ -26,7 +26,7 @@ def test_scheduler_add_seq_group():
 
 def test_scheduler_abort_seq_group():
     block_size = 4
-    scheduler_config = SchedulerConfig(100, 64, 1, 256)
+    scheduler_config = SchedulerConfig(100, 64, 1)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 4
     cache_config.num_gpu_blocks = 4
@@ -50,7 +50,7 @@ def test_scheduler_schedule_simple():
     block_size = 4
     num_seq_group = 4
     max_model_len = 16
-    scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len, 256)
+    scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 8
     cache_config.num_gpu_blocks = 8
@@ -64,10 +64,10 @@ def test_scheduler_schedule_simple():
         running.append(seq_group)
 
     # Schedule seq groups prompts.
+    num_tokens = block_size * num_seq_group
     seq_group_meta, out = scheduler.schedule()
     assert set(out.scheduled_seq_groups) == set(running)
-    assert out.num_batched_tokens == num_seq_group * seq_group.get_seqs(
-    )[0].get_len()
+    assert out.num_batched_tokens == num_tokens
     assert (not out.blocks_to_copy and not out.blocks_to_swap_in
             and not out.blocks_to_swap_out)
     assert len(seq_group_meta) == num_seq_group
@@ -84,7 +84,7 @@ def test_scheduler_schedule_simple():
 def test_scheduler_schedule_preempt_abort():
     block_size = 4
     max_model_len = 16
-    scheduler_config = SchedulerConfig(64, 2, max_model_len, 256)
+    scheduler_config = SchedulerConfig(64, 2, max_model_len)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 2
     cache_config.num_gpu_blocks = 2
@@ -99,7 +99,7 @@ def test_scheduler_schedule_preempt_abort():
     # Schedule seq groups prompts.
     seq_group_meta, out = scheduler.schedule()
     assert out.scheduled_seq_groups == [seq_group_a, seq_group_b]
-    assert out.num_batched_tokens == seq_group_a.get_seqs()[0].get_len() * 2
+    assert out.num_batched_tokens == block_size * 2  # seq_a and seq_b
     assert (not out.blocks_to_copy and not out.blocks_to_swap_in
             and not out.blocks_to_swap_out)
     assert len(seq_group_meta) == 2
@@ -124,7 +124,7 @@ def test_scheduler_schedule_preempt_abort():
     scheduler.abort_seq_group("1")
     seq_group_meta, out = scheduler.schedule()
     assert out.scheduled_seq_groups == [seq_group_b]
-    assert out.num_batched_tokens == seq_group_b.get_seqs()[0].get_len()
+    assert out.num_batched_tokens == 5  # 4 prompt + 1 generation.
     assert (not out.blocks_to_copy and not out.blocks_to_swap_in
             and not out.blocks_to_swap_out)
     assert len(seq_group_meta) == 1
@@ -136,7 +136,7 @@ def test_scheduler_max_seqs():
     num_seq_group = 4
     max_seq_group = 2
     max_model_len = 16
-    scheduler_config = SchedulerConfig(64, max_seq_group, max_model_len, 256)
+    scheduler_config = SchedulerConfig(64, max_seq_group, max_model_len)
     cache_config = CacheConfig(block_size, 1.0, 1, "auto")
     cache_config.num_cpu_blocks = 8
     cache_config.num_gpu_blocks = 8

tests/kernels/test_rand.py
Lines changed: 51 additions & 0 deletions

@@ -0,0 +1,51 @@
+import torch
+import pytest
+import random
+
+from vllm.model_executor.layers.ops.rand import seeded_uniform
+from vllm.model_executor.utils import set_random_seed
+
+
+@pytest.mark.parametrize("dtype",
+                         [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("use_3d", [True, False])
+def test_seeded_uniform(dtype: torch.dtype, use_3d: bool):
+    device = "cuda"
+    for seed in range(512):
+        set_random_seed(seed)
+        rows = random.randint(1, 512)
+        cols = random.randint(1, 64000)
+        if use_3d:
+            third_dim = random.randint(2, 10)
+            dims = [rows, third_dim, cols]
+        else:
+            dims = [rows, cols]
+        seeds = torch.randint(torch.iinfo(torch.long).min,
+                              torch.iinfo(torch.long).max, (rows, ),
+                              device=device)
+
+        # Test that the same seed produces the same output
+        out = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device)
+        out2 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device)
+        torch.testing.assert_close(out, out2)
+        # del to save memory
+        del out2
+
+        out3 = seeded_uniform(*dims, seeds=seeds, dtype=dtype, device=device)
+        torch.testing.assert_close(out, out3)
+        # del to save memory
+        del out3
+
+        # Initialize out tensor with garbage to ensure that it is overwritten
+        out_with_tensor = seeded_uniform(
+            *dims,
+            out=torch.full(
+                (*dims, ),
+                -1,
+                dtype=dtype,
+                device=device,
+            ),
+            seeds=seeds,
+            dtype=dtype,
+        )
+        torch.testing.assert_close(out, out_with_tensor)
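
As a quick illustration of the kernel this new test covers (introduced with the Triton sampling kernel change, vllm-project#3186), the sketch below shows a minimal call to `seeded_uniform` with one 64-bit seed per row. The call signature is inferred from the test above rather than from separate documentation, so treat it as an assumption, not an API reference.

import torch
from vllm.model_executor.layers.ops.rand import seeded_uniform

# Assumed usage, mirroring the test above: one seed per output row.
seeds = torch.randint(torch.iinfo(torch.long).min,
                      torch.iinfo(torch.long).max, (4, ),
                      device="cuda")
# Two calls with identical seeds should produce matching samples,
# which is the determinism property the test checks.
a = seeded_uniform(4, 8, seeds=seeds, dtype=torch.float32, device="cuda")
b = seeded_uniform(4, 8, seeds=seeds, dtype=torch.float32, device="cuda")
torch.testing.assert_close(a, b)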
