@@ -316,7 +316,7 @@ def _handle_response(self,
316316 else :
317317 self ._outputs [0 ]._postprocess_result = response .res
318318 if response .metrics :
319- self .metrics_dict = response .metrics
319+ self .metrics_dict . update ( response .metrics )
320320
321321 if response .error :
322322 if self ._background_error_handler is not None and (
@@ -391,7 +391,7 @@ def record_stats(self,
391391 stats , len (output .token_ids ), self .sampling_params .n > 1 )
392392 if processed_metrics_stat :
393393 metrics_stats .update (processed_metrics_stat )
394- self .metrics_dict = metrics_stats
394+ self .metrics_dict . update ( metrics_stats )
395395
396396 def do_tracing (
397397 self ,
@@ -410,20 +410,29 @@ def do_tracing(
410410 trace_context = tracing .extract_trace_context (self .trace_headers )
411411 sampling_params = self .sampling_params
412412
413- # TODO: Add request arrival time
414- arrival_time = time .time () - metrics_dict .get (MetricNames .E2E , - 1 )
413+ # Timings in req_perf_metrics_dict (arrival_time, KV-cache transfer start/end) share a time origin
414+ # that differs from absolute timestamps, so compute an offset (time_correction) that aligns them with absolute time
415+ time_correction = 0
416+ arrival_timestamp = metrics_dict .get (MetricNames .ARRIVAL_TIMESTAMP , 0 )
417+ arrival_time = req_perf_metrics_dict .get (
418+ RequestEventTiming .ARRIVAL_TIME , 0 )
419+ if arrival_timestamp > 0 :
420+ time_correction = arrival_timestamp - arrival_time
421+ else :
422+ time_correction = time .time () - metrics_dict .get (
423+ MetricNames .E2E , - 1 ) - arrival_time
424+
415425 with tracing .global_otlp_tracer ().start_as_current_span (
416426 "llm_request" ,
417427 kind = tracing .SpanKind .SERVER ,
418428 context = trace_context ,
419- start_time = int (arrival_time * 1e9 ),
429+ start_time = int (( arrival_time + time_correction ) * 1e9 ),
420430 ) as span :
421431
422432 def safe_set_attr (span , attr , value ):
423433 if value is not None :
424434 span .set_attribute (attr , value )
425435
426- e2e_time = metrics_dict .get (MetricNames .E2E , - 1 )
427436 safe_set_attr (span ,
428437 tracing .SpanAttributes .GEN_AI_REQUEST_TEMPERATURE ,
429438 sampling_params .temperature )
@@ -451,14 +460,36 @@ def safe_set_attr(span, attr, value):
451460 span , tracing .SpanAttributes .GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN ,
452461 metrics_dict .get (MetricNames .TTFT , - 1 ))
453462 safe_set_attr (span , tracing .SpanAttributes .GEN_AI_LATENCY_E2E ,
454- e2e_time )
463+ metrics_dict . get ( MetricNames . E2E , - 1 ) )
455464 safe_set_attr (span ,
456465 tracing .SpanAttributes .GEN_AI_LATENCY_TIME_IN_QUEUE ,
457466 metrics_dict .get (MetricNames .REQUEST_QUEUE_TIME , - 1 ))
458467 safe_set_attr (
459468 span , tracing .SpanAttributes .GEN_AI_RESPONSE_FINISH_REASONS ,
460469 json .dumps ([output .finish_reason ])
461470 if output .finish_reason else None )
471+ safe_set_attr (
472+ span ,
473+ tracing .SpanAttributes .GEN_AI_LATENCY_KV_CACHE_TRANSFER_TIME ,
474+ req_perf_metrics_dict .get (
475+ RequestEventTiming .KV_CACHE_TRANSFER_END , 0.0 ) -
476+ req_perf_metrics_dict .get (
477+ RequestEventTiming .KV_CACHE_TRANSFER_START , 0.0 ))
478+
479+ if req_perf_metrics_dict .get (
480+ RequestEventTiming .KV_CACHE_TRANSFER_START ,
481+ 0 ) and req_perf_metrics_dict .get (
482+ RequestEventTiming .KV_CACHE_TRANSFER_END , 0 ):
483+ tracing .add_event (
484+ tracing .SpanEvents .KV_CACHE_TRANSFER_START ,
485+ timestamp = int ((req_perf_metrics_dict .get (
486+ RequestEventTiming .KV_CACHE_TRANSFER_START , 0.0 ) +
487+ time_correction ) * 1e9 ))
488+ tracing .add_event (
489+ tracing .SpanEvents .KV_CACHE_TRANSFER_END ,
490+ timestamp = int ((req_perf_metrics_dict .get (
491+ RequestEventTiming .KV_CACHE_TRANSFER_END , 0.0 ) +
492+ time_correction ) * 1e9 ))
462493
463494
464495class DetokenizedGenerationResultBase (GenerationResultBase ):
0 commit comments