@@ -896,6 +896,10 @@ def _executor_loop_pp(self):
896896
897897 def _executor_loop (self ):
898898 torch .cuda .set_device (self .device_id )
899+ is_ngram = hasattr (
900+ self .model_engine , "spec_config"
901+ ) and self .model_engine .spec_config is not None and self .model_engine .spec_config .spec_dec_mode .is_ngram (
902+ )
899903 with self ._profiler () as profile_step :
900904 sample_state = None
901905 iter_start_time = time .time ()
@@ -918,8 +922,7 @@ def _executor_loop(self):
918922
919923 self ._pad_attention_dp_dummy_request ()
920924
921- if self .draft_model_engine is not None or hasattr (
922- self , 'drafter' ) and self .drafter is not None :
925+ if self .draft_model_engine is not None or is_ngram or self .drafter is not None :
923926 self ._prepare_draft_requests (self .active_requests )
924927
925928 scheduled_batch , fitting_disagg_gen_init_requests , num_fitting_reqs = self ._schedule (
@@ -1652,8 +1655,13 @@ def _send_disagg_ctx_cache(self, scheduled_ctx_requests):
16521655 if req .is_context_only_request and (req .is_context_finished or
16531656 req .is_finished_due_to_length ):
16541657 self .kv_cache_transceiver .respond_and_send_async (req )
1655- self .resource_manager .resource_managers [
1656- ResourceManagerType .SEQ_SLOT_MANAGER ].free_resources (req )
1658+ for resource_mgr_type in (
1659+ ResourceManagerType .SEQ_SLOT_MANAGER ,
1660+ ResourceManagerType .SPEC_RESOURCE_MANAGER ):
1661+ if resource_mgr_type in self .resource_manager .resource_managers and self .resource_manager .resource_managers [
1662+ resource_mgr_type ] is not None :
1663+ self .resource_manager .resource_managers [
1664+ resource_mgr_type ].free_resources (req )
16571665
16581666 self .kv_cache_transceiver .check_context_transfer_status (0 )
16591667
0 commit comments