Commit 3f4781c

Clean.

Signed-off-by: Zheyu Fu <[email protected]>

1 parent 403b461 · commit 3f4781c

File tree

1 file changed: 7 additions, 5 deletions

tests/unittest/_torch/speculative/test_dynamic_spec_decode.py

Lines changed: 7 additions & 5 deletions
@@ -30,12 +30,12 @@ def test_dynamic_spec_decode(enforce_single_worker,
     total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
     if total_mem_gb < 35:
         pytest.skip("Not enough memory to load target + draft model")
+
     models_path = llm_models_root()
     eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B"
     target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"

-    # Allow with 3 concurrent requests
-    max_batch_size = 3
+    max_batch_size = 1
     max_draft_len = 4
     kv_cache_config = KvCacheConfig(enable_block_reuse=True, max_tokens=8192)
     cuda_graph_config = CudaGraphConfig(batch_sizes=[1])
@@ -47,16 +47,18 @@ def test_dynamic_spec_decode(enforce_single_worker,
         cuda_graph_config=cuda_graph_config,
         max_batch_size=max_batch_size,
         kv_cache_config=kv_cache_config,
-        max_seq_len=4096,
+        # This max_seq_len is larger than the one specified
+        # in the llama 3 8B eagle's config. We want to make sure
+        # that the draft model won't go above its max in warmup
+        # in this test.
+        max_seq_len=8192,
     )

     spec_config = EagleDecodingConfig(
         max_draft_len=max_draft_len,
         speculative_model_dir=eagle_model_dir,
         # Llama 3 does not support one model eagle.
         eagle3_one_model=False,
-        # allow speculation only when <= 2 effective request
-        max_concurrency=2,
     )

     llm_spec = LLM(**llm_common_config, speculative_config=spec_config)
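For context, below is a sketch of the test configuration as it stands after this commit, assembled only from the identifiers visible in the diff. The import paths and the `model` keyword are assumptions about TensorRT-LLM's LLM API rather than quotes from the file, and `models_path` is a placeholder for `llm_models_root()`; consult the test itself for the authoritative version.

```python
# Sketch of the post-commit setup in tests/unittest/_torch/speculative/
# test_dynamic_spec_decode.py. Import paths and the `model` kwarg are
# assumptions; everything else mirrors identifiers shown in the diff.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import (CudaGraphConfig, EagleDecodingConfig,
                                 KvCacheConfig)

models_path = "/path/to/models"  # placeholder for llm_models_root()
eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B"
target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"

llm_common_config = dict(
    model=target_model_dir,
    cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
    max_batch_size=1,  # reduced from 3 by this commit
    kv_cache_config=KvCacheConfig(enable_block_reuse=True, max_tokens=8192),
    # Deliberately larger than the draft model's own configured max, so
    # the test exercises the warmup clamp described in the diff comment.
    max_seq_len=8192,
)

spec_config = EagleDecodingConfig(
    max_draft_len=4,
    speculative_model_dir=eagle_model_dir,
    eagle3_one_model=False,  # Llama 3 does not support one-model EAGLE
    # max_concurrency was removed by this commit, so speculation is no
    # longer gated on a fixed concurrency threshold.
)

llm_spec = LLM(**llm_common_config, speculative_config=spec_config)
```

Dropping `max_concurrency` leaves the speculation on/off decision to the runtime rather than a hard request-count threshold, which appears to be the behavior the test's name, `test_dynamic_spec_decode`, refers to.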
