Closed
31 commits
da3dc61
llama4 type eagle support in v1
RonaldBXu May 19, 2025
b61e6be
Merge branch 'vllm-project:main' into llama4_v1_support
RonaldBXu May 21, 2025
924be7b
Merge branch 'main' into llama4_v1_support
RonaldBXu Jun 5, 2025
f40d973
Merge branch 'vllm-project:main' into llama4_v1_support
RonaldBXu Jun 11, 2025
a4dd030
Merge branch 'vllm-project:main' into llama4_v1_support
RonaldBXu Jun 14, 2025
06bfb26
updating code to match current standards. removed redundant lm_head
RonaldBXu Jun 15, 2025
40df89d
add spdx filecopyright text
RonaldBXu Jun 15, 2025
e9d9241
fix linter
RonaldBXu Jun 15, 2025
88ecec6
Merge branch 'vllm-project:main' into llama4_v1_support
RonaldBXu Jun 21, 2025
25bf276
tests
RonaldBXu Jun 21, 2025
1868c12
fix linter
RonaldBXu Jun 21, 2025
23136c8
fix linter
RonaldBXu Jun 21, 2025
5c65200
remove whitespace
RonaldBXu Jun 21, 2025
1de5b84
split tests
RonaldBXu Jun 24, 2025
89fdd43
fix linter
RonaldBXu Jun 24, 2025
74ae303
address comments 1
RonaldBXu Jun 25, 2025
ea6cca9
address comments 2
RonaldBXu Jun 25, 2025
e667918
fix registry test
RonaldBXu Jun 26, 2025
d342950
fix linter
RonaldBXu Jun 26, 2025
2eec3e7
Merge branch 'vllm-project:main' into llama4_v1_support
RonaldBXu Jun 29, 2025
a850d1e
skip initialization test
RonaldBXu Jun 29, 2025
32edc20
ignore llama4 test
RonaldBXu Jun 30, 2025
07c5c8c
update initialization test
RonaldBXu Jun 30, 2025
815a8a2
fix linter
RonaldBXu Jun 30, 2025
96f22bd
change to scout
RonaldBXu Jun 30, 2025
963f57c
change max model len
RonaldBXu Jun 30, 2025
66d79a3
Merge branch 'vllm-project:main' into llama4_v1_support
RonaldBXu Jul 8, 2025
7ef96be
skip test
RonaldBXu Jul 8, 2025
7c85ebb
Merge branch 'vllm-project:main' into llama4_v1_support
RonaldBXu Jul 9, 2025
c07e825
Merge branch 'main' into llama4_v1_support
RonaldBXu Jul 12, 2025
929e620
Merge branch 'main' into llama4_v1_support
RonaldBXu Jul 13, 2025
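
Taken together, these commits add EAGLE speculative decoding support for Llama 4 models on the vLLM V1 engine. For orientation, the snippet below is a minimal sketch of how the new support is exercised, distilled from the e2e test added in this PR (tests/v1/e2e/test_llama4_eagle.py). It assumes VLLM_USE_V1=1, an 8-GPU node (the test uses tensor_parallel_size=8), and access to the checkpoints named in the test; it is not a definitive usage guide.

# Sketch only: mirrors tests/v1/e2e/test_llama4_eagle.py from this PR.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
    trust_remote_code=True,
    tensor_parallel_size=8,  # the e2e test runs with tp=8
    max_model_len=2048,
    speculative_config={
        "method": "eagle",
        "model": "ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct",
        "num_speculative_tokens": 3,
        "max_model_len": 2048,
    },
)
outputs = llm.chat(
    [[{"role": "user", "content": "please repeat the word 'test' 10 times."}]],
    SamplingParams(temperature=0, max_tokens=10, ignore_eos=False),
)
print(outputs[0].outputs[0].text)
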
2 changes: 1 addition & 1 deletion .buildkite/test-pipeline.yaml
@@ -262,7 +262,7 @@ steps:
- pytest -v -s v1/test_metrics_reader.py
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
- pytest -v -s v1/e2e --ignore=v1/e2e/test_llama4_eagle.py
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
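The new Llama 4 EAGLE e2e test added below is excluded from the general v1/e2e run here, presumably because it brings up Llama-4 Maverick at tensor_parallel_size=8; per the header comment in the new file, it is meant to be invoked directly with pytest -vx tests/v1/e2e/test_llama4_eagle.py.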
16 changes: 16 additions & 0 deletions tests/models/registry.py
@@ -32,6 +32,12 @@ class _HfExamplesInfo:
for speculative decoding.
"""

speculative_method: Optional[str] = None
"""
The default speculative method to use for testing this architecture, which
is only used for speculative decoding.
"""

min_transformers_version: Optional[str] = None
"""
The minimum version of HF Transformers that is required to run this model.
@@ -61,6 +67,9 @@ class _HfExamplesInfo:
v0_only: bool = False
"""The model is only available with the vLLM V0 engine."""

v1_only: bool = False
"""The model is only available with the vLLM V1 engine."""

hf_overrides: dict[str, Any] = field(default_factory=dict)
"""The ``hf_overrides`` required to load the model."""

@@ -457,6 +466,13 @@ def check_available_online(
trust_remote_code=True,
speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B",
tokenizer="meta-llama/Meta-Llama-3-8B-Instruct"), # noqa: E501
"EagleLlama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
trust_remote_code=True,
speculative_model="ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct", # noqa: E501
tokenizer="meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
speculative_method="eagle",
max_model_len=256,
v1_only=True),
"Eagle3LlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", # noqa: E501
trust_remote_code=True,
speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
14 changes: 11 additions & 3 deletions tests/models/test_initialization.py
@@ -33,7 +33,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
# FIXME: Possible memory leak in the previous tests?
if model_arch in ("Glm4vForConditionalGeneration",
"GraniteSpeechForConditionalGeneration",
"KimiVLForConditionalGeneration"):
"KimiVLForConditionalGeneration",
"EagleLlama4ForCausalLM"):
pytest.skip("Avoid OOM")

# Avoid OOM and reduce initialization time by only using 1 layer
Expand Down Expand Up @@ -103,6 +104,8 @@ def _initialize_kv_caches_v1(self, vllm_config):
_initialize_kv_caches_v1), monkeypatch.context() as m):
if model_info.v0_only:
m.setenv("VLLM_USE_V1", "0")
if model_info.v1_only:
m.setenv("VLLM_USE_V1", "1")
if model_arch == "Phi4FlashForCausalLM":
# Phi4FlashForCausalLM only supports DIFFERENTIAL_FLASH_ATTN backend
m.setenv("VLLM_ATTENTION_BACKEND", "DIFFERENTIAL_FLASH_ATTN")
@@ -112,8 +115,13 @@ def _initialize_kv_caches_v1(self, vllm_config):
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
speculative_config={
"model": model_info.speculative_model,
"num_speculative_tokens": 1,
"method":
model_info.speculative_method
if model_info.speculative_method else None,
"model":
model_info.speculative_model,
"num_speculative_tokens":
1,
} if model_info.speculative_model else None,
trust_remote_code=model_info.trust_remote_code,
max_model_len=model_info.max_model_len,
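
To make the new registry fields concrete: for an entry that sets both speculative_model and speculative_method (such as the new EagleLlama4ForCausalLM entry, which is currently skipped above to avoid OOM), the config passed to LLM(...) by this test works out to the literal dict below. The dict itself is illustrative only and is not part of the diff.

# Illustration (values taken from the EagleLlama4ForCausalLM registry entry):
speculative_config = {
    "method": "eagle",  # model_info.speculative_method
    "model": "ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct",
    "num_speculative_tokens": 1,
}
# v1_only=True additionally pins VLLM_USE_V1=1 before the LLM is constructed.
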
111 changes: 111 additions & 0 deletions tests/v1/e2e/test_llama4_eagle.py
@@ -0,0 +1,111 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# To run this file, run
# pytest -vx tests/v1/e2e/test_llama4_eagle.py

from __future__ import annotations

import random
from typing import Any

import pytest

from vllm import LLM, SamplingParams


@pytest.fixture
def test_prompts():
prompt_types = ["repeat", "sentence"]
num_prompts = 100
prompts = []

random.seed(0)
random_prompt_type_choices = random.choices(prompt_types, k=num_prompts)

# Generate a mixed batch of prompts, some of which can be easily
# predicted by n-gram matching and some which likely cannot.
for kind in random_prompt_type_choices:
word_choices = ["test", "temp", "hello", "where"]
word = random.choice(word_choices)
if kind == "repeat":
prompt = f"""
please repeat the word '{word}' 10 times.
give no other output than the word at least ten times in a row,
in lowercase with spaces between each word and without quotes.
"""
elif kind == "sentence":
prompt = f"""
please give a ten-word sentence that
uses the word {word} at least once.
give no other output than that simple sentence without quotes.
"""
else:
raise ValueError(f"Unknown prompt type: {kind}")
prompts.append([{"role": "user", "content": prompt}])

return prompts


@pytest.fixture
def sampling_config():
return SamplingParams(temperature=0, max_tokens=10, ignore_eos=False)


@pytest.mark.parametrize(
"method_model_and_draft_model",
[("eagle", "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct")],
ids=[
"llama4_eagle",
])
def test_eagle_correctness(
monkeypatch: pytest.MonkeyPatch,
test_prompts: list[list[dict[str, Any]]],
sampling_config: SamplingParams,
method_model_and_draft_model: tuple[str, str, str],
):
'''
Compare the outputs of an original LLM and a speculative LLM;
they should be the same when using EAGLE speculative decoding.
'''
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")

method, model_name, spec_model_name = method_model_and_draft_model

tp = 8

ref_llm = LLM(model=model_name,
tensor_parallel_size=tp,
max_model_len=2048)
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
del ref_llm

spec_llm = LLM(
model=model_name,
trust_remote_code=True,
tensor_parallel_size=tp,
speculative_config={
"method": method,
"model": spec_model_name,
"num_speculative_tokens": 3,
"max_model_len": 2048,
},
max_model_len=2048,
)
spec_outputs = spec_llm.chat(test_prompts, sampling_config)
matches = 0
misses = 0
for ref_output, spec_output in zip(ref_outputs, spec_outputs):
if ref_output.outputs[0].text == spec_output.outputs[0].text:
matches += 1
else:
misses += 1
print(f"ref_output: {ref_output.outputs[0].text}")
print(f"spec_output: {spec_output.outputs[0].text}")

# Heuristic: expect at least 66% of the prompts to match exactly
# Upon failure, inspect the outputs to check for inaccuracy.
assert matches > int(0.66 * len(ref_outputs))
del spec_llm
24 changes: 10 additions & 14 deletions tests/v1/e2e/test_spec_decode.py
@@ -53,14 +53,6 @@ def model_name():
return "meta-llama/Llama-3.1-8B-Instruct"


def eagle_model_name():
return "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"


def eagle3_model_name():
return "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"


def test_ngram_correctness(
monkeypatch: pytest.MonkeyPatch,
test_prompts: list[list[dict[str, Any]]],
@@ -105,13 +97,17 @@ def test_ngram_correctness(
del spec_llm


@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
@pytest.mark.parametrize("method_model_and_draft_model",
[("eagle", "meta-llama/Llama-3.1-8B-Instruct",
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B"),
("eagle3", "meta-llama/Llama-3.1-8B-Instruct",
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")],
ids=["llama3_eagle", "llama3_eagle3"])
def test_eagle_correctness(
monkeypatch: pytest.MonkeyPatch,
test_prompts: list[list[dict[str, Any]]],
sampling_config: SamplingParams,
model_name: str,
use_eagle3: bool,
method_model_and_draft_model: tuple[str, str, str],
):
'''
Compare the outputs of an original LLM and a speculative LLM
@@ -120,17 +116,17 @@ def test_eagle_correctness(
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")

method, model_name, spec_model_name = method_model_and_draft_model

ref_llm = LLM(model=model_name, max_model_len=2048)
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
del ref_llm

spec_model_name = eagle3_model_name(
) if use_eagle3 else eagle_model_name()
spec_llm = LLM(
model=model_name,
trust_remote_code=True,
speculative_config={
"method": "eagle3" if use_eagle3 else "eagle",
"method": method,
"model": spec_model_name,
"num_speculative_tokens": 3,
"max_model_len": 2048,
35 changes: 24 additions & 11 deletions tests/v1/spec_decode/test_eagle.py
@@ -13,12 +13,16 @@
from vllm.platforms import current_platform
from vllm.v1.spec_decode.eagle import EagleProposer

model_dir = "meta-llama/Llama-3.1-8B-Instruct"
eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
llama3_model_dir = "meta-llama/Llama-3.1-8B-Instruct"
llama3_eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
llama3_eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"

llama4_model_dir = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
llama4_eagle_dir = "ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct"

def _create_proposer(method: str, k: int) -> EagleProposer:

def _create_proposer(method: str, model_dir: str, draft_model_dir: str,
k: int) -> EagleProposer:
model_config = ModelConfig(model=model_dir,
task="generate",
max_model_len=100,
Expand All @@ -28,9 +32,6 @@ def _create_proposer(method: str, k: int) -> EagleProposer:
seed=None,
trust_remote_code=False)

# Choose model directory based on method
draft_model_dir = eagle_dir if method == "eagle" else eagle3_dir

speculative_config = SpeculativeConfig(
target_model_config=model_config,
target_parallel_config=ParallelConfig(),
@@ -118,8 +119,14 @@ def test_prepare_inputs():


@pytest.mark.parametrize("method,proposer_helper", [
("eagle", lambda k: _create_proposer("eagle", k)),
("eagle3", lambda k: _create_proposer("eagle3", k)),
("eagle",
lambda k: _create_proposer("eagle", llama3_model_dir, llama3_eagle_dir, k)
),
("eagle",
lambda k: _create_proposer("eagle", llama4_model_dir, llama4_eagle_dir, k)
),
("eagle3", lambda k: _create_proposer("eagle3", llama3_model_dir,
llama3_eagle3_dir, k)),
])
@pytest.mark.parametrize("pp_size", [1, 2])
@pytest.mark.parametrize("use_distinct_embed_tokens", [True, False])
@@ -199,7 +206,12 @@ class _TargetModelStub(LlamaForCausalLM):


@pytest.mark.parametrize("num_speculative_tokens", [1, 3, 8])
def test_propose(num_speculative_tokens):
@pytest.mark.parametrize("model_and_draft_model",
[(llama3_model_dir, llama3_eagle_dir),
(llama4_model_dir, llama4_eagle_dir)])
def test_propose(num_speculative_tokens, model_and_draft_model):
model_dir = model_and_draft_model[0]
draft_model_dir = model_and_draft_model[1]
# Use GPU device
device = torch.device(current_platform.device_type)

@@ -211,7 +223,8 @@ def test_propose(num_speculative_tokens):
vocab_size = 100

# Create proposer first so we can use its actual hidden_size
proposer = _create_proposer("eagle", num_speculative_tokens)
proposer = _create_proposer("eagle", model_dir, draft_model_dir,
num_speculative_tokens)
# Get the hidden_size from the proposer to ensure consistency
hidden_size = proposer.hidden_size

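
With this refactor, _create_proposer takes the target and draft checkpoints explicitly instead of deriving the draft directory from the method name. A minimal usage sketch, reusing the module-level constants defined in this test file (the k value here is arbitrary):

# Sketch: a Llama 4 EAGLE proposer with 3 speculative tokens, constructed
# the same way the parametrized tests above call the helper.
proposer = _create_proposer("eagle", llama4_model_dir, llama4_eagle_dir, k=3)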