@@ -1957,31 +1957,20 @@ def _handle_cancelled_requests(self):
19571957 if len (self .canceled_req_ids ) == 0 :
19581958 return
19591959
1960- cancelled_responses = {}
1961- left_requests = []
1962- # Tracks canceled requests for proper handling in overlap mode during `sampler.update_requests`.
1963- self .canceled_requests = []
19641960 for request in self .active_requests :
19651961 req_id = request .py_request_id
19661962 if req_id in self .canceled_req_ids :
1967- self ._terminate_request (request )
1963+ # Mark the request as finished so that the existing code paths
1964+ # clean up its KV cache resources.
19681965 request .finish_by_reason (FinishReason .CANCELLED )
19691966 request .decoding_iter = request .py_decoding_iter
1970- cancelled_responses [req_id ] = request .create_response (
1971- False , self .dist .rank )
1972- self .canceled_requests .append (request )
19731967 self .canceled_req_ids .erase (req_id )
1974- else :
1975- left_requests .append (request )
1976- self .active_requests = left_requests
19771968
1978- # When enable attention dp, each rank does not have full copy of requests
1979- # so we need to remove the cancel requests not in the local rank
1980- self .canceled_req_ids .clear ()
1981-
1982- # enqueue the cancelled requests' responses as they are not
1983- # active_requests and be discarded in the sampler loop.
1984- self ._enqueue_responses (cancelled_responses )
1969+ if self .enable_attention_dp :
1970+ # TODO: revisit the cancel logic of attention dp
1971+ # When attention DP is enabled, each rank does not have a full copy of the requests,
1972+ # so we need to remove the canceled request IDs that are not on the local rank.
1973+ self .canceled_req_ids .clear ()
19851974
19861975 @nvtx_range ("_enqueue_responses" )
19871976 def _enqueue_responses (self , responses : Dict [int , LlmResponse ]):
0 commit comments