@@ -30,12 +30,12 @@ def test_dynamic_spec_decode(enforce_single_worker,
     total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
     if total_mem_gb < 35:
         pytest.skip("Not enough memory to load target + draft model")
+
     models_path = llm_models_root()
     eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B"
     target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"
 
-    # Allow with 3 concurrent requests
-    max_batch_size = 3
+    max_batch_size = 1
     max_draft_len = 4
     kv_cache_config = KvCacheConfig(enable_block_reuse=True, max_tokens=8192)
     cuda_graph_config = CudaGraphConfig(batch_sizes=[1])
@@ -47,16 +47,18 @@ def test_dynamic_spec_decode(enforce_single_worker,
         cuda_graph_config=cuda_graph_config,
         max_batch_size=max_batch_size,
         kv_cache_config=kv_cache_config,
-        max_seq_len=4096,
+        # This max_seq_len is larger than the one specified
+        # in the Llama 3 8B EAGLE3 draft model's config. We want
+        # to make sure that the draft model won't go above its
+        # max in warmup in this test.
+        max_seq_len=8192,
     )
 
     spec_config = EagleDecodingConfig(
         max_draft_len=max_draft_len,
         speculative_model_dir=eagle_model_dir,
         # Llama 3 does not support one model eagle.
         eagle3_one_model=False,
-        # allow speculation only when <= 2 effective request
-        max_concurrency=2,
     )
 
     llm_spec = LLM(**llm_common_config, speculative_config=spec_config)
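
For context, a minimal sketch of how the test assembles its configuration once this change is applied. The class names and the final `LLM(**llm_common_config, speculative_config=spec_config)` call come from the diff itself; the import paths, the model root path, and the `model=` key are assumptions about the surrounding test module, not something this diff shows.

```python
# Sketch of the post-change test setup. Class names and values mirror the
# diff above; import paths, the model root, and the `model=` key are assumed.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import (CudaGraphConfig, EagleDecodingConfig,
                                 KvCacheConfig)

models_path = "/path/to/models"  # llm_models_root() in the real test
eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B"
target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"

llm_common_config = dict(
    model=target_model_dir,  # assumed key; not visible in the hunks above
    max_batch_size=1,  # reduced from 3 by this change
    kv_cache_config=KvCacheConfig(enable_block_reuse=True, max_tokens=8192),
    cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
    # Deliberately larger than the draft model's own limit, so warmup must
    # respect the draft model's max_seq_len.
    max_seq_len=8192,
)

spec_config = EagleDecodingConfig(
    max_draft_len=4,
    speculative_model_dir=eagle_model_dir,
    eagle3_one_model=False,  # Llama 3 does not support one-model EAGLE3
    # max_concurrency is no longer set here; this change removes it.
)

llm_spec = LLM(**llm_common_config, speculative_config=spec_config)
```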