@@ -30,12 +30,12 @@ def test_dynamic_spec_decode(enforce_single_worker,
     total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
     if total_mem_gb < 35:
         pytest.skip("Not enough memory to load target + draft model")
+
     models_path = llm_models_root()
     eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B"
     target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"
 
-    # Allow with 3 concurrent requests
-    max_batch_size = 3
+    max_batch_size = 1
     max_draft_len = 4
     kv_cache_config = KvCacheConfig(enable_block_reuse=True, max_tokens=8192)
     cuda_graph_config = CudaGraphConfig(batch_sizes=[1])
@@ -47,16 +47,18 @@ def test_dynamic_spec_decode(enforce_single_worker,
         cuda_graph_config=cuda_graph_config,
         max_batch_size=max_batch_size,
         kv_cache_config=kv_cache_config,
-        max_seq_len=4096,
+        # This max_seq_len is larger than the one specified
+        # in the Llama 3 8B EAGLE3 draft model's config. We want
+        # to make sure that the draft model won't go above its
+        # max in warmup in this test.
+        max_seq_len=8192,
     )
 
     spec_config = EagleDecodingConfig(
         max_draft_len=max_draft_len,
         speculative_model_dir=eagle_model_dir,
         # Llama 3 does not support one model eagle.
         eagle3_one_model=False,
-        # allow speculation only when <= 2 effective request
-        max_concurrency=2,
     )
 
     llm_spec = LLM(**llm_common_config, speculative_config=spec_config)
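
For context, a minimal sketch of how the test assembles its configuration once this change is applied. The class names and the final `LLM(**llm_common_config, speculative_config=spec_config)` call come from the diff itself; the import paths, the model root path, and the `model=` key are assumptions about the surrounding test module, not something this diff shows.

```python
# Sketch of the post-change test setup. Class names and values mirror the
# diff above; import paths, the model root, and the `model=` key are assumed.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import (CudaGraphConfig, EagleDecodingConfig,
                                 KvCacheConfig)

models_path = "/path/to/models"  # llm_models_root() in the real test
eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B"
target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"

llm_common_config = dict(
    model=target_model_dir,  # assumed key; not visible in the hunks above
    max_batch_size=1,  # reduced from 3 by this change
    kv_cache_config=KvCacheConfig(enable_block_reuse=True, max_tokens=8192),
    cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
    # Deliberately larger than the draft model's own limit, so warmup must
    # respect the draft model's max_seq_len.
    max_seq_len=8192,
)

spec_config = EagleDecodingConfig(
    max_draft_len=4,
    speculative_model_dir=eagle_model_dir,
    eagle3_one_model=False,  # Llama 3 does not support one-model EAGLE3
    # max_concurrency is no longer set here; this change removes it.
)

llm_spec = LLM(**llm_common_config, speculative_config=spec_config)
```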