Closed
31 commits
da3dc61
llama4 type eagle support in v1
RonaldBXu May 19, 2025
b61e6be
Merge branch 'vllm-project:main' into llama4_v1_support
RonaldBXu May 21, 2025
924be7b
Merge branch 'main' into llama4_v1_support
RonaldBXu Jun 5, 2025
f40d973
Merge branch 'vllm-project:main' into llama4_v1_support
RonaldBXu Jun 11, 2025
a4dd030
Merge branch 'vllm-project:main' into llama4_v1_support
RonaldBXu Jun 14, 2025
06bfb26
updating code to match current standards. removed redundant lm_head
RonaldBXu Jun 15, 2025
40df89d
add spdx filecopyright text
RonaldBXu Jun 15, 2025
e9d9241
fix linter
RonaldBXu Jun 15, 2025
88ecec6
Merge branch 'vllm-project:main' into llama4_v1_support
RonaldBXu Jun 21, 2025
25bf276
tests
RonaldBXu Jun 21, 2025
1868c12
fix linter
RonaldBXu Jun 21, 2025
23136c8
fix linter
RonaldBXu Jun 21, 2025
5c65200
remove whitespace
RonaldBXu Jun 21, 2025
1de5b84
split tests
RonaldBXu Jun 24, 2025
89fdd43
fix linter
RonaldBXu Jun 24, 2025
74ae303
address comments 1
RonaldBXu Jun 25, 2025
ea6cca9
address comments 2
RonaldBXu Jun 25, 2025
e667918
fix registry test
RonaldBXu Jun 26, 2025
d342950
fix linter
RonaldBXu Jun 26, 2025
2eec3e7
Merge branch 'vllm-project:main' into llama4_v1_support
RonaldBXu Jun 29, 2025
a850d1e
skip initialization test
RonaldBXu Jun 29, 2025
32edc20
ignore llama4 test
RonaldBXu Jun 30, 2025
07c5c8c
update initialization test
RonaldBXu Jun 30, 2025
815a8a2
fix linter
RonaldBXu Jun 30, 2025
96f22bd
change to scout
RonaldBXu Jun 30, 2025
963f57c
change max model len
RonaldBXu Jun 30, 2025
66d79a3
Merge branch 'vllm-project:main' into llama4_v1_support
RonaldBXu Jul 8, 2025
7ef96be
skip test
RonaldBXu Jul 8, 2025
7c85ebb
Merge branch 'vllm-project:main' into llama4_v1_support
RonaldBXu Jul 9, 2025
c07e825
Merge branch 'main' into llama4_v1_support
RonaldBXu Jul 12, 2025
929e620
Merge branch 'main' into llama4_v1_support
RonaldBXu Jul 13, 2025
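
Taken together, these commits add EAGLE speculative decoding support for Llama 4 models on the vLLM V1 engine. For orientation, the snippet below is a minimal sketch of how the new support is exercised, distilled from the e2e test added in this PR (tests/v1/e2e/test_llama4_eagle.py). It assumes VLLM_USE_V1=1, an 8-GPU node (the test uses tensor_parallel_size=8), and access to the checkpoints named in the test; it is not a definitive usage guide.

# Sketch only: mirrors tests/v1/e2e/test_llama4_eagle.py from this PR.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
    trust_remote_code=True,
    tensor_parallel_size=8,  # the e2e test runs with tp=8
    max_model_len=2048,
    speculative_config={
        "method": "eagle",
        "model": "ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct",
        "num_speculative_tokens": 3,
        "max_model_len": 2048,
    },
)
outputs = llm.chat(
    [[{"role": "user", "content": "please repeat the word 'test' 10 times."}]],
    SamplingParams(temperature=0, max_tokens=10, ignore_eos=False),
)
print(outputs[0].outputs[0].text)
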
2 changes: 1 addition & 1 deletion .buildkite/test-pipeline.yaml
@@ -262,7 +262,7 @@ steps:
- pytest -v -s v1/test_metrics_reader.py
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
- pytest -v -s v1/e2e --ignore=v1/e2e/test_llama4_eagle.py
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
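The new Llama 4 EAGLE e2e test added below is excluded from the general v1/e2e run here, presumably because it brings up Llama-4 Maverick at tensor_parallel_size=8; per the header comment in the new file, it is meant to be invoked directly with pytest -vx tests/v1/e2e/test_llama4_eagle.py.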
16 changes: 16 additions & 0 deletions tests/models/registry.py
@@ -32,6 +32,12 @@ class _HfExamplesInfo:
for speculative decoding.
"""

speculative_method: Optional[str] = None
"""
The default speculative method to use for testing this architecture, which
is only used for speculative decoding.
"""

min_transformers_version: Optional[str] = None
"""
The minimum version of HF Transformers that is required to run this model.
@@ -61,6 +67,9 @@ class _HfExamplesInfo:
v0_only: bool = False
"""The model is only available with the vLLM V0 engine."""

v1_only: bool = False
"""The model is only available with the vLLM V1 engine."""

hf_overrides: dict[str, Any] = field(default_factory=dict)
"""The ``hf_overrides`` required to load the model."""

@@ -457,6 +466,13 @@ def check_available_online(
trust_remote_code=True,
speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B",
tokenizer="meta-llama/Meta-Llama-3-8B-Instruct"), # noqa: E501
"EagleLlama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
trust_remote_code=True,
speculative_model="ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct", # noqa: E501
tokenizer="meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
speculative_method="eagle",
max_model_len=256,
v1_only=True),
"Eagle3LlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", # noqa: E501
trust_remote_code=True,
speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
14 changes: 11 additions & 3 deletions tests/models/test_initialization.py
@@ -33,7 +33,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
# FIXME: Possible memory leak in the previous tests?
if model_arch in ("Glm4vForConditionalGeneration",
"GraniteSpeechForConditionalGeneration",
"KimiVLForConditionalGeneration"):
"KimiVLForConditionalGeneration",
"EagleLlama4ForCausalLM"):
pytest.skip("Avoid OOM")

# Avoid OOM and reduce initialization time by only using 1 layer
Expand Down Expand Up @@ -103,6 +104,8 @@ def _initialize_kv_caches_v1(self, vllm_config):
_initialize_kv_caches_v1), monkeypatch.context() as m):
if model_info.v0_only:
m.setenv("VLLM_USE_V1", "0")
if model_info.v1_only:
m.setenv("VLLM_USE_V1", "1")
if model_arch == "Phi4FlashForCausalLM":
# Phi4FlashForCausalLM only supports DIFFERENTIAL_FLASH_ATTN backend
m.setenv("VLLM_ATTENTION_BACKEND", "DIFFERENTIAL_FLASH_ATTN")
@@ -112,8 +115,13 @@ def _initialize_kv_caches_v1(self, vllm_config):
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
speculative_config={
"model": model_info.speculative_model,
"num_speculative_tokens": 1,
"method":
model_info.speculative_method
if model_info.speculative_method else None,
"model":
model_info.speculative_model,
"num_speculative_tokens":
1,
} if model_info.speculative_model else None,
trust_remote_code=model_info.trust_remote_code,
max_model_len=model_info.max_model_len,
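
To make the new registry fields concrete: for an entry that sets both speculative_model and speculative_method (such as the new EagleLlama4ForCausalLM entry, which is currently skipped above to avoid OOM), the config passed to LLM(...) by this test works out to the literal dict below. The dict itself is illustrative only and is not part of the diff.

# Illustration (values taken from the EagleLlama4ForCausalLM registry entry):
speculative_config = {
    "method": "eagle",  # model_info.speculative_method
    "model": "ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct",
    "num_speculative_tokens": 1,
}
# v1_only=True additionally pins VLLM_USE_V1=1 before the LLM is constructed.
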
111 changes: 111 additions & 0 deletions tests/v1/e2e/test_llama4_eagle.py
@@ -0,0 +1,111 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# To run this file, run
# pytest -vx tests/v1/e2e/test_llama4_eagle.py

from __future__ import annotations

import random
from typing import Any

import pytest

from vllm import LLM, SamplingParams


@pytest.fixture
def test_prompts():
prompt_types = ["repeat", "sentence"]
num_prompts = 100
prompts = []

random.seed(0)
random_prompt_type_choices = random.choices(prompt_types, k=num_prompts)

# Generate a mixed batch of prompts, some of which can be easily
# predicted by n-gram matching and some which likely cannot.
for kind in random_prompt_type_choices:
word_choices = ["test", "temp", "hello", "where"]
word = random.choice(word_choices)
if kind == "repeat":
prompt = f"""
please repeat the word '{word}' 10 times.
give no other output than the word at least ten times in a row,
in lowercase with spaces between each word and without quotes.
"""
elif kind == "sentence":
prompt = f"""
please give a ten-word sentence that
uses the word {word} at least once.
give no other output than that simple sentence without quotes.
"""
else:
raise ValueError(f"Unknown prompt type: {kind}")
prompts.append([{"role": "user", "content": prompt}])

return prompts


@pytest.fixture
def sampling_config():
return SamplingParams(temperature=0, max_tokens=10, ignore_eos=False)


@pytest.mark.parametrize(
"method_model_and_draft_model",
[("eagle", "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
"ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct")],
ids=[
"llama4_eagle",
])
def test_eagle_correctness(
monkeypatch: pytest.MonkeyPatch,
test_prompts: list[list[dict[str, Any]]],
sampling_config: SamplingParams,
method_model_and_draft_model: tuple[str, str, str],
):
'''
Compare the outputs of an original LLM and a speculative LLM;
they should be the same when using EAGLE speculative decoding.
'''
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")

method, model_name, spec_model_name = method_model_and_draft_model

tp = 8

ref_llm = LLM(model=model_name,
tensor_parallel_size=tp,
max_model_len=2048)
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
del ref_llm

spec_llm = LLM(
model=model_name,
trust_remote_code=True,
tensor_parallel_size=tp,
speculative_config={
"method": method,
"model": spec_model_name,
"num_speculative_tokens": 3,
"max_model_len": 2048,
},
max_model_len=2048,
)
spec_outputs = spec_llm.chat(test_prompts, sampling_config)
matches = 0
misses = 0
for ref_output, spec_output in zip(ref_outputs, spec_outputs):
if ref_output.outputs[0].text == spec_output.outputs[0].text:
matches += 1
else:
misses += 1
print(f"ref_output: {ref_output.outputs[0].text}")
print(f"spec_output: {spec_output.outputs[0].text}")

# Heuristic: expect at least 66% of the prompts to match exactly
# Upon failure, inspect the outputs to check for inaccuracy.
assert matches > int(0.66 * len(ref_outputs))
del spec_llm
24 changes: 10 additions & 14 deletions tests/v1/e2e/test_spec_decode.py
@@ -53,14 +53,6 @@ def model_name():
return "meta-llama/Llama-3.1-8B-Instruct"


def eagle_model_name():
return "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"


def eagle3_model_name():
return "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"


def test_ngram_correctness(
monkeypatch: pytest.MonkeyPatch,
test_prompts: list[list[dict[str, Any]]],
@@ -105,13 +97,17 @@ def test_ngram_correctness(
del spec_llm


@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
@pytest.mark.parametrize("method_model_and_draft_model",
[("eagle", "meta-llama/Llama-3.1-8B-Instruct",
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B"),
("eagle3", "meta-llama/Llama-3.1-8B-Instruct",
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")],
ids=["llama3_eagle", "llama3_eagle3"])
def test_eagle_correctness(
monkeypatch: pytest.MonkeyPatch,
test_prompts: list[list[dict[str, Any]]],
sampling_config: SamplingParams,
model_name: str,
use_eagle3: bool,
method_model_and_draft_model: tuple[str, str, str],
):
'''
Compare the outputs of an original LLM and a speculative LLM
@@ -120,17 +116,17 @@ def test_eagle_correctness(
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")

method, model_name, spec_model_name = method_model_and_draft_model

ref_llm = LLM(model=model_name, max_model_len=2048)
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
del ref_llm

spec_model_name = eagle3_model_name(
) if use_eagle3 else eagle_model_name()
spec_llm = LLM(
model=model_name,
trust_remote_code=True,
speculative_config={
"method": "eagle3" if use_eagle3 else "eagle",
"method": method,
"model": spec_model_name,
"num_speculative_tokens": 3,
"max_model_len": 2048,
35 changes: 24 additions & 11 deletions tests/v1/spec_decode/test_eagle.py
@@ -13,12 +13,16 @@
from vllm.platforms import current_platform
from vllm.v1.spec_decode.eagle import EagleProposer

model_dir = "meta-llama/Llama-3.1-8B-Instruct"
eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
llama3_model_dir = "meta-llama/Llama-3.1-8B-Instruct"
llama3_eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
llama3_eagle3_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"

llama4_model_dir = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
llama4_eagle_dir = "ronaldbxu/EAGLE-Llama-4-Maverick-17B-128E-Instruct"

def _create_proposer(method: str, k: int) -> EagleProposer:

def _create_proposer(method: str, model_dir: str, draft_model_dir: str,
k: int) -> EagleProposer:
model_config = ModelConfig(model=model_dir,
task="generate",
max_model_len=100,
Expand All @@ -28,9 +32,6 @@ def _create_proposer(method: str, k: int) -> EagleProposer:
seed=None,
trust_remote_code=False)

# Choose model directory based on method
draft_model_dir = eagle_dir if method == "eagle" else eagle3_dir

speculative_config = SpeculativeConfig(
target_model_config=model_config,
target_parallel_config=ParallelConfig(),
@@ -118,8 +119,14 @@ def test_prepare_inputs():


@pytest.mark.parametrize("method,proposer_helper", [
("eagle", lambda k: _create_proposer("eagle", k)),
("eagle3", lambda k: _create_proposer("eagle3", k)),
("eagle",
lambda k: _create_proposer("eagle", llama3_model_dir, llama3_eagle_dir, k)
),
("eagle",
lambda k: _create_proposer("eagle", llama4_model_dir, llama4_eagle_dir, k)
),
("eagle3", lambda k: _create_proposer("eagle3", llama3_model_dir,
llama3_eagle3_dir, k)),
])
@pytest.mark.parametrize("pp_size", [1, 2])
@pytest.mark.parametrize("use_distinct_embed_tokens", [True, False])
@@ -199,7 +206,12 @@ class _TargetModelStub(LlamaForCausalLM):


@pytest.mark.parametrize("num_speculative_tokens", [1, 3, 8])
def test_propose(num_speculative_tokens):
@pytest.mark.parametrize("model_and_draft_model",
[(llama3_model_dir, llama3_eagle_dir),
(llama4_model_dir, llama4_eagle_dir)])
def test_propose(num_speculative_tokens, model_and_draft_model):
model_dir = model_and_draft_model[0]
draft_model_dir = model_and_draft_model[1]
# Use GPU device
device = torch.device(current_platform.device_type)

@@ -211,7 +223,8 @@ def test_propose(num_speculative_tokens):
vocab_size = 100

# Create proposer first so we can use its actual hidden_size
proposer = _create_proposer("eagle", num_speculative_tokens)
proposer = _create_proposer("eagle", model_dir, draft_model_dir,
num_speculative_tokens)
# Get the hidden_size from the proposer to ensure consistency
hidden_size = proposer.hidden_size

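
With this refactor, _create_proposer takes the target and draft checkpoints explicitly instead of deriving the draft directory from the method name. A minimal usage sketch, reusing the module-level constants defined in this test file (the k value here is arbitrary):

# Sketch: a Llama 4 EAGLE proposer with 3 speculative tokens, constructed
# the same way the parametrized tests above call the helper.
proposer = _create_proposer("eagle", llama4_model_dir, llama4_eagle_dir, k=3)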