 from vllm.v1.core.sched.scheduler import Scheduler
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec)
-from vllm.v1.outputs import ModelRunnerOutput
+from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager
 from vllm.v1.structured_output.request import StructuredOutputRequest
@@ -158,7 +158,6 @@ def test_schedule_partial_requests():
         # Only the first request has a sampled token id because
         # the rest requests are still being prefilled.
         sampled_token_ids=[[0], [], []],
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -209,7 +208,6 @@ def test_no_mm_input_chunking():
         req_ids=[request.request_id for request in requests],
         req_id_to_index=req_to_index,
         sampled_token_ids=[[] for _ in range(len(requests))],
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -273,7 +271,6 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
         req_ids=[request.request_id for request in requests],
         req_id_to_index=req_to_index,
         sampled_token_ids=[[] for _ in range(len(requests))],
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -298,7 +295,6 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
         req_ids=[request.request_id for request in requests],
         req_id_to_index=req_to_index,
         sampled_token_ids=[[0], [0]] + [[] for _ in range(len(requests) - 2)],
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -355,7 +351,6 @@ def test_stop_via_update_from_output():
         sampled_token_ids=[[EOS_TOKEN_ID],
                            [10,
                             11]],  # First request hits EOS, second continues
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[])
@@ -409,7 +404,6 @@ def test_stop_via_update_from_output():
         },
         sampled_token_ids=[[10, 42, 12],
                            [13, 14]],  # First request hits stop token
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[])
@@ -462,7 +456,6 @@ def test_stop_via_update_from_output():
         },
         sampled_token_ids=[[10, 11, 12],
                            [13]],  # First request exceeds max_tokens
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[])
@@ -505,7 +498,6 @@ def test_stop_via_update_from_output():
         req_ids=[requests[0].request_id],
         req_id_to_index={requests[0].request_id: 0},
         sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[])
@@ -554,7 +546,6 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool],
         req_ids=[requests[0].request_id],
         req_id_to_index={requests[0].request_id: 0},
         sampled_token_ids=[[0]],
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -572,7 +563,6 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool],
         req_ids=[requests[1].request_id],
         req_id_to_index={requests[1].request_id: 0},
         sampled_token_ids=[[0]],
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -608,7 +598,6 @@ def test_preempt_during_execution():
         req_ids=[requests[0].request_id],
         req_id_to_index={requests[0].request_id: 0},
         sampled_token_ids=[[0]],
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -626,7 +615,6 @@ def test_preempt_during_execution():
         req_ids=[requests[1].request_id],
         req_id_to_index={requests[1].request_id: 0},
         sampled_token_ids=[[42]],
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -682,13 +670,14 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
         req_ids=req_ids,
         req_id_to_index=req_to_index,
         sampled_token_ids=[[0] for _ in range(len(requests))],
-        spec_token_ids=spec_tokens,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
     )
     engine_core_outputs = scheduler.update_from_output(output,
                                                        model_runner_output)
+    draft_token_ids = DraftTokenIds(req_ids, spec_tokens)
+    scheduler.update_draft_token_ids(draft_token_ids)

     for i in range(len(requests)):
         running_req = scheduler.running[i]
@@ -722,7 +711,6 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
         req_ids=req_ids,
         req_id_to_index=req_to_index,
         sampled_token_ids=output_tokens,
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -851,7 +839,6 @@ def test_kv_connector_basic():
         req_ids=req_ids,
         req_id_to_index=req_to_index,
         sampled_token_ids=[[1000]] * len(req_ids),
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -898,7 +885,6 @@ def test_kv_connector_basic():
         req_ids=req_ids,
         req_id_to_index=req_to_index,
         sampled_token_ids=[[1000]] * len(req_ids),
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -966,7 +952,6 @@ def test_kv_connector_unable_to_allocate():
         req_ids=req_ids,
         req_id_to_index=req_to_index,
         sampled_token_ids=[[1000]] * len(req_ids),
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -1048,7 +1033,6 @@ def test_kv_connector_handles_preemption():
         req_ids=req_ids,
         req_id_to_index=req_to_index,
         sampled_token_ids=[[1000]] * len(req_ids),
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -1142,7 +1126,6 @@ def make_output(scheduler: Scheduler):
             for i, req in enumerate(scheduler.running)
         },
         sampled_token_ids=[[1000]] * len(scheduler.running),
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -1468,7 +1451,6 @@ def test_priority_scheduling_preemption():
             for i, req in enumerate(low_priority_requests)
         },
         sampled_token_ids=[[100] for _ in low_priority_requests],
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -1541,7 +1523,6 @@ def test_priority_scheduling_no_preemption_when_space_available():
             for i, req in enumerate(low_priority_requests)
         },
        sampled_token_ids=[[100] for _ in low_priority_requests],
-        spec_token_ids=None,
         logprobs=None,
         prompt_logprobs_dict={},
         pooler_output=[],
@@ -1783,7 +1764,6 @@ def test_priority_scheduling_heap_property():
                 req_ids=[req.req_id],
                 req_id_to_index={req.req_id: 0},
                 sampled_token_ids=[[100]],
-                spec_token_ids=None,
                 logprobs=None,
                 prompt_logprobs_dict={},
                 pooler_output=[],
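
Aside (not part of the diff): a minimal sketch of the call pattern these tests migrate to. It assumes `scheduler` is an already constructed v1 Scheduler and `output` is the SchedulerOutput returned by `scheduler.schedule()`; the helper name `run_step` and its parameters are hypothetical, used only to show the ordering of the calls.

from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput

def run_step(scheduler, output, req_ids, req_to_index, sampled, drafts):
    # ModelRunnerOutput no longer carries spec_token_ids.
    model_runner_output = ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index=req_to_index,
        sampled_token_ids=sampled,
        logprobs=None,
        prompt_logprobs_dict={},
        pooler_output=[],
    )
    engine_core_outputs = scheduler.update_from_output(output,
                                                       model_runner_output)
    # Draft (speculative) token ids are now reported to the scheduler
    # separately, via DraftTokenIds and update_draft_token_ids().
    scheduler.update_draft_token_ids(DraftTokenIds(req_ids, drafts))
    return engine_core_outputs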