
Commit fcadce9

[fix] Eagle-2 LLMAPI pybind argument fix. (#3967)
Signed-off-by: Jhao-Ting Chen <[email protected]>
Co-authored-by: Haohang Huang <[email protected]>
Parent: 255779a

12 files changed: +192 -31 lines

cpp/tensorrt_llm/pybind/executor/request.cpp

Lines changed: 1 addition & 1 deletion
@@ -449,7 +449,7 @@ void initRequestBindings(pybind11::module_& m)
             {
                 throw std::runtime_error("Invalid EagleConfig state!");
             }
-            return tle::EagleConfig(state[0].cast<tle::EagleChoices>(), state[1].cast<bool>(),
+            return tle::EagleConfig(state[0].cast<std::optional<tle::EagleChoices>>(), state[1].cast<bool>(),
                 state[2].cast<std::optional<float>>(), state[3].cast<bool>(), state[4].cast<std::optional<SizeType32>>());
         };
     py::class_<tle::EagleConfig>(m, "EagleConfig")
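
In effect, the old binding could not unpickle an EagleConfig whose eagle_choices was unset, which is exactly the state Eagle-2's dynamic tree produces. A minimal sketch of the round trip this one-line cast change unblocks (module path assumed from the bindings tests; pre-fix behavior inferred from the diff above):

import pickle

import tensorrt_llm.bindings.executor as trtllm

# Eagle-2 (dynamic tree) leaves eagle_choices unset, so pickled state[0] is None.
config = trtllm.EagleConfig(None, False, 0.5, True, 3)

# Before this fix, __setstate__ cast state[0] straight to tle::EagleChoices,
# which cannot represent None; casting to std::optional<tle::EagleChoices>
# lets None round-trip.
config_copy = pickle.loads(pickle.dumps(config))
assert config.eagle_choices == config_copy.eagle_choices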
examples/llm-api/llm_eagle2_decoding.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+### Generate Text Using Eagle2 Decoding
+
+from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig,
+                                 SamplingParams)
+
+
+def main():
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    # The end user can customize the sampling configuration with the SamplingParams class
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    # The end user can customize the kv cache configuration with the KVCache class
+    kv_cache_config = KvCacheConfig(enable_block_reuse=True)
+
+    llm_kwargs = {}
+
+    model = "lmsys/vicuna-7b-v1.3"
+
+    # The end user can customize the eagle decoding configuration by specifying the
+    # speculative_model, max_draft_len, num_eagle_layers, max_non_leaves_per_layer, eagle_choices,
+    # greedy_sampling, posterior_threshold, use_dynamic_tree and dynamic_tree_max_topK
+    # with the EagleDecodingConfig class
+
+    speculative_config = EagleDecodingConfig(
+        speculative_model="yuhuili/EAGLE-Vicuna-7B-v1.3",
+        max_draft_len=63,
+        num_eagle_layers=4,
+        max_non_leaves_per_layer=10,
+        use_dynamic_tree=True,
+        dynamic_tree_max_topK=10)
+
+    llm = LLM(model=model,
+              kv_cache_config=kv_cache_config,
+              speculative_config=speculative_config,
+              max_batch_size=1,
+              max_seq_len=1024,
+              **llm_kwargs)
+
+    outputs = llm.generate(prompts, sampling_params)
+
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+if __name__ == '__main__':
+    main()
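
For comparison with the pre-existing Eagle-1 example that follows, a hedged side-by-side sketch of the two configurations (parameter names and values taken from this commit's diffs; the static choices list is truncated here for brevity):

from tensorrt_llm.llmapi import EagleDecodingConfig

# Eagle-1: a static draft tree, spelled out explicitly via eagle_choices.
eagle1 = EagleDecodingConfig(
    speculative_model="yuhuili/EAGLE-Vicuna-7B-v1.3",
    max_draft_len=63,
    num_eagle_layers=4,
    max_non_leaves_per_layer=10,
    eagle_choices=[[0], [0, 0], [1], [0, 1], [2]])  # truncated; the full tree has 63 nodes

# Eagle-2: the draft tree is grown dynamically at runtime, so eagle_choices
# is left unset and branching is bounded by dynamic_tree_max_topK instead.
eagle2 = EagleDecodingConfig(
    speculative_model="yuhuili/EAGLE-Vicuna-7B-v1.3",
    max_draft_len=63,
    num_eagle_layers=4,
    max_non_leaves_per_layer=10,
    use_dynamic_tree=True,
    dynamic_tree_max_topK=10)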

examples/llm-api/llm_eagle_decoding.py

Lines changed: 4 additions & 6 deletions
@@ -1,8 +1,8 @@
 ### Generate Text Using Eagle Decoding

 from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm.llmapi import (LLM, BuildConfig, EagleDecodingConfig,
-                                 KvCacheConfig, SamplingParams)
+from tensorrt_llm.llmapi import (LLM, EagleDecodingConfig, KvCacheConfig,
+                                 SamplingParams)


 def main():
@@ -16,9 +16,6 @@ def main():
     # The end user can customize the sampling configuration with the SamplingParams class
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

-    # The end user can customize the build configuration with the BuildConfig class
-    build_config = BuildConfig(max_batch_size=1, max_seq_len=1024)
-
     # The end user can customize the kv cache configuration with the KVCache class
     kv_cache_config = KvCacheConfig(enable_block_reuse=True)

@@ -45,9 +42,10 @@ def main():
     )

     llm = LLM(model=model,
-              build_config=build_config,
               kv_cache_config=kv_cache_config,
               speculative_config=speculative_config,
+              max_batch_size=1,
+              max_seq_len=1024,
               **llm_kwargs)

     outputs = llm.generate(prompts, sampling_params)
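
The substance of this example change: engine-size limits move off BuildConfig and onto the LLM constructor. A minimal before/after sketch (keyword arguments as shown in the diff above):

from tensorrt_llm import LLM

# After this commit, build limits are plain LLM keyword arguments.
llm = LLM(model="lmsys/vicuna-7b-v1.3",
          max_batch_size=1,  # previously BuildConfig(max_batch_size=1, ...)
          max_seq_len=1024)  # previously BuildConfig(..., max_seq_len=1024)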

tests/integration/defs/accuracy/test_cli_flow.py

Lines changed: 27 additions & 0 deletions
@@ -466,6 +466,33 @@ def test_eagle(self, cuda_graph, chunked_context, typical_acceptance,
                  ],
                  extra_summarize_args=extra_summarize_args)

+    @skip_post_blackwell
+    @parametrize_with_ids("cuda_graph,chunked_context", [(False, False),
+                                                         (True, True),
+                                                         (True, False)])
+    def test_eagle_2(self, cuda_graph, chunked_context, mocker):
+        mocker.patch.object(self.__class__, "EXAMPLE_FOLDER", "eagle")
+        mocker.patch.object(CnnDailymail, "MAX_BATCH_SIZE", 8)
+
+        extra_summarize_args = [
+            "--eagle_use_dynamic_tree", "--eagle_dynamic_tree_max_top_k=10"
+        ]
+        if cuda_graph:
+            extra_summarize_args.append("--cuda_graph_mode")
+        if chunked_context:
+            extra_summarize_args.append("--enable_chunked_context")
+
+        self.run(spec_dec_algo=EagleDecodingConfig.decoding_type,
+                 extra_convert_args=[
+                     f"--eagle_model_dir={self.EAGLE_MODEL_PATH}",
+                     "--max_draft_len=63", "--num_eagle_layers=4",
+                     "--max_non_leaves_per_layer=10"
+                 ],
+                 extra_build_args=[
+                     "--speculative_decoding_mode=eagle", "--max_draft_len=63"
+                 ],
+                 extra_summarize_args=extra_summarize_args)
+

 class TestLlama7B(CliFlowAccuracyTestHarness):
     MODEL_NAME = "llama-7b-hf"

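The three parametrizations above expand into the test IDs registered in the l0_a10 and l0_h100 lists later in this commit; a small sketch of the ID shape (format inferred from those list entries; parametrize_with_ids is the repo's own helper and is not reimplemented here):

params = [(False, False), (True, True), (True, False)]
ids = [f"test_eagle_2[cuda_graph={cg}-chunked_context={cc}]" for cg, cc in params]
# e.g. "test_eagle_2[cuda_graph=True-chunked_context=True]"
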
tests/integration/defs/accuracy/test_llm_api.py

Lines changed: 48 additions & 1 deletion
@@ -14,7 +14,7 @@
 # limitations under the License.
 import pytest

-from tensorrt_llm.llmapi import LLM
+from tensorrt_llm.llmapi import LLM, EagleDecodingConfig
 from tensorrt_llm.models.modeling_utils import QuantConfig
 from tensorrt_llm.quantization import QuantAlgo

@@ -290,3 +290,50 @@ def test_fp8_kvcache(self):
             extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
         task = MMLU(self.MODEL_NAME)
         task.evaluate(llm)
+
+
+class TestEagleVicuna_7B_v1_3(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "lmsys/vicuna-7b-v1.3"
+    MODEL_PATH = f"{llm_models_root()}/vicuna-7b-v1.3"
+
+    speculative_config = EagleDecodingConfig(
+        max_draft_len=63,
+        speculative_model=f"{llm_models_root()}/EAGLE-Vicuna-7B-v1.3",
+        num_eagle_layers=4,
+        max_non_leaves_per_layer=10,
+        eagle_choices=[[0], [0, 0], [1], [0, 1], [2], [0, 0, 0], [1, 0], [0, 2], [3], [0, 3], [4], [0, 4], [2, 0], \
+            [0, 5], [0, 0, 1], [5], [0, 6], [6], [0, 7], [0, 1, 0], [1, 1], [7], [0, 8], [0, 0, 2], [3, 0], \
+            [0, 9], [8], [9], [1, 0, 0], [0, 2, 0], [1, 2], [0, 0, 3], [4, 0], [2, 1], [0, 0, 4], [0, 0, 5], \
+            [0, 0, 0, 0], [0, 1, 1], [0, 0, 6], [0, 3, 0], [5, 0], [1, 3], [0, 0, 7], [0, 0, 8], [0, 0, 9], \
+            [6, 0], [0, 4, 0], [1, 4], [7, 0], [0, 1, 2], [2, 0, 0], [3, 1], [2, 2], [8, 0], \
+            [0, 5, 0], [1, 5], [1, 0, 1], [0, 2, 1], [9, 0], [0, 6, 0], [0, 0, 0, 1], [1, 6], [0, 7, 0]]
+    )
+
+    def test_auto_dtype(self):
+        with LLM(
+                self.MODEL_PATH,
+                max_batch_size=8,  # Spec-dec use case less than bs=8
+                speculative_config=self.speculative_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
+class TestEagle2Vicuna_7B_v1_3(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "lmsys/vicuna-7b-v1.3"
+    MODEL_PATH = f"{llm_models_root()}/vicuna-7b-v1.3"
+
+    speculative_config = EagleDecodingConfig(
+        max_draft_len=63,
+        speculative_model=f"{llm_models_root()}/EAGLE-Vicuna-7B-v1.3",
+        num_eagle_layers=4,
+        max_non_leaves_per_layer=10,
+        use_dynamic_tree=True,
+        dynamic_tree_max_topK=10)
+
+    def test_auto_dtype(self):
+        with LLM(
+                self.MODEL_PATH,
+                max_batch_size=8,  # Spec-dec use case less than bs=8
+                speculative_config=self.speculative_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)

tests/integration/defs/llmapi/test_llm_examples.py

Lines changed: 5 additions & 0 deletions
@@ -141,6 +141,11 @@ def test_llmapi_example_eagle_decoding(llm_root, engine_dir, llm_venv):
     _run_llmapi_example(llm_root, engine_dir, llm_venv, "llm_eagle_decoding.py")


+def test_llmapi_example_eagle2_decoding(llm_root, engine_dir, llm_venv):
+    _run_llmapi_example(llm_root, engine_dir, llm_venv,
+                        "llm_eagle2_decoding.py")
+
+
 @pytest.mark.skip_less_device(2)
 def test_llmapi_example_distributed_tp2(llm_root, engine_dir, llm_venv):
     _run_llmapi_example(llm_root, engine_dir, llm_venv,

tests/integration/defs/test_e2e.py

Lines changed: 0 additions & 3 deletions
@@ -1442,9 +1442,6 @@ def test_build_time_benchmark_sanity(llm_root, llm_venv):
     ])


-# End of HLAPI examples
-
-
 ### Pivot-To-Python examples
 def test_ptp_quickstart(llm_root, llm_venv):
     example_root = Path(os.path.join(llm_root, "examples", "pytorch"))

tests/integration/test_lists/test-db/l0_a10.yml

Lines changed: 5 additions & 0 deletions
@@ -107,6 +107,9 @@ l0_a10:
   - accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=False-typical_acceptance=False] # 5 mins
   - accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=True-typical_acceptance=False] # 5 mins
   - accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=False-typical_acceptance=True] # 5 mins
+  - accuracy/test_cli_flow.py::TestVicuna7B::test_eagle_2[cuda_graph=False-chunked_context=False] # 5 mins
+  - accuracy/test_cli_flow.py::TestVicuna7B::test_eagle_2[cuda_graph=True-chunked_context=False] # 5 mins
+  - accuracy/test_cli_flow.py::TestVicuna7B::test_eagle_2[cuda_graph=True-chunked_context=True] # 5 mins
   - accuracy/test_cli_flow.py::TestLlama2_7B::test_auto_dtype
   - examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-disable_weight_only]
   - unittest/trt/attention/test_gpt_attention_IFB.py
@@ -165,3 +168,5 @@ l0_a10:
   - test_e2e.py::test_build_time_benchmark_sanity
   - examples/test_whisper.py::test_llm_whisper_general[large-v3-enable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime]
   - examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_python_runtime] # 4 mins
+  - accuracy/test_llm_api.py::TestEagleVicuna_7B_v1_3::test_auto_dtype
+  - accuracy/test_llm_api.py::TestEagle2Vicuna_7B_v1_3::test_auto_dtype

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 5 additions & 0 deletions
@@ -268,6 +268,9 @@ l0_h100:
   - accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=False-typical_acceptance=False] # 5 mins
   - accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=True-typical_acceptance=False] # 5 mins
   - accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=False-typical_acceptance=True] # 5 mins
+  - accuracy/test_cli_flow.py::TestVicuna7B::test_eagle_2[cuda_graph=False-chunked_context=False] # 5 mins
+  - accuracy/test_cli_flow.py::TestVicuna7B::test_eagle_2[cuda_graph=True-chunked_context=False] # 5 mins
+  - accuracy/test_cli_flow.py::TestVicuna7B::test_eagle_2[cuda_graph=True-chunked_context=True] # 5 mins
   - accuracy/test_cli_flow.py::TestPhi2::test_auto_dtype # 2 mins
   - accuracy/test_cli_flow.py::TestGpt2Medium::test_fp8
   - accuracy/test_cli_flow.py::TestGpt2Medium::test_fp8_lm_head
@@ -289,3 +292,5 @@ l0_h100:
   - unittest/trt/model_api/test_model_quantization.py # 20 mins on H100
   - unittest/bindings # 8 mins on H100
   - test_e2e.py::test_build_time_benchmark_sanity
+  - accuracy/test_llm_api.py::TestEagleVicuna_7B_v1_3::test_auto_dtype
+  - accuracy/test_llm_api.py::TestEagle2Vicuna_7B_v1_3::test_auto_dtype

tests/unittest/bindings/test_executor_bindings.py

Lines changed: 8 additions & 0 deletions
@@ -1444,6 +1444,14 @@ def test_eagle_config_pickle():
     assert config.use_dynamic_tree == config_copy.use_dynamic_tree
     assert config.greedy_sampling == config_copy.greedy_sampling

+    config = trtllm.EagleConfig(None, False, 0.5, True, 3)
+    config_copy = pickle.loads(pickle.dumps(config))
+    assert config.eagle_choices == config_copy.eagle_choices
+    assert config.greedy_sampling == config_copy.greedy_sampling
+    assert config.posterior_threshold == config_copy.posterior_threshold
+    assert config.use_dynamic_tree == config_copy.use_dynamic_tree
+    assert config.dynamic_tree_max_topK == config_copy.dynamic_tree_max_topK
+

 def test_decoding_mode():
     mode = trtllm.DecodingMode.Auto()
