@@ -147,6 +147,7 @@ def make_request_output(
147147 finish_reason : Optional [FinishReason ],
148148 stop_reason : Union [int , str , None ],
149149 kv_transfer_params : Optional [dict [str , Any ]] = None ,
150+ num_cached_tokens : int = 0 ,
150151 ) -> Optional [RequestOutput ]:
151152
152153 finished = finish_reason is not None
@@ -169,14 +170,15 @@ def make_request_output(
169170 return None
170171
171172 return self ._new_request_output (request_id , outputs , finished ,
172- kv_transfer_params )
173+ kv_transfer_params , num_cached_tokens )
173174
174175 def _new_request_output (
175176 self ,
176177 request_id : str ,
177178 outputs : list [CompletionOutput ],
178179 finished : bool ,
179180 kv_transfer_params : Optional [dict [str , Any ]] = None ,
181+ num_cached_tokens : int = 0 ,
180182 ) -> RequestOutput :
181183
182184 if self .output_kind == RequestOutputKind .DELTA :
@@ -193,6 +195,7 @@ def _new_request_output(
193195 outputs = outputs ,
194196 finished = finished ,
195197 kv_transfer_params = kv_transfer_params ,
198+ num_cached_tokens = num_cached_tokens ,
196199 )
197200
198201 def _new_completion_output (
@@ -340,7 +343,7 @@ def process_outputs(
340343 finish_reason = engine_core_output .finish_reason
341344 stop_reason = engine_core_output .stop_reason
342345 kv_transfer_params = engine_core_output .kv_transfer_params
343-
346+ num_cached_tokens = engine_core_output . num_cached_tokens
344347 req_state .is_prefilling = False
345348
346349 # 2) Detokenize the token ids into text and perform stop checks.
@@ -356,7 +359,7 @@ def process_outputs(
356359 # 4) Create and handle RequestOutput objects.
357360 if request_output := req_state .make_request_output (
358361 new_token_ids , finish_reason , stop_reason ,
359- kv_transfer_params ):
362+ kv_transfer_params , num_cached_tokens ):
360363 if req_state .queue is not None :
361364 # AsyncLLM: put into queue for handling by generate().
362365 req_state .queue .put (request_output )
0 commit comments