6 changes: 6 additions & 0 deletions tests/entrypoints/openai/test_metrics.py
@@ -208,6 +208,12 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
"vllm:request_generation_tokens_sum",
"vllm:request_generation_tokens_bucket",
"vllm:request_generation_tokens_count",
"vllm:time_to_first_token_seconds_sum",
"vllm:time_to_first_token_seconds_bucket",
"vllm:time_to_first_token_seconds_count",
"vllm:time_per_output_token_seconds_sum",
"vllm:time_per_output_token_seconds_bucket",
"vllm:time_per_output_token_seconds_count",
]


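Note, for illustration only (not part of the diff): the three new entries per metric follow from how prometheus_client exposes a Histogram, which always publishes _sum, _bucket, and _count series under the metric's base name. A minimal sketch of that behaviour; the registry and the model label value here are made up for the example:

# Sketch only, not vLLM code: one Histogram yields _sum, _bucket and _count series.
from prometheus_client import CollectorRegistry, Histogram, generate_latest

registry = CollectorRegistry()
ttft = Histogram(
    "vllm:time_to_first_token_seconds",
    "Histogram of time to first token in seconds.",
    labelnames=["model_name"],
    registry=registry,
)
ttft.labels("dummy-model").observe(0.12)  # record a single observation

exposition = generate_latest(registry).decode()
for suffix in ("_sum", "_bucket", "_count"):
    # These are exactly the series names the updated test list expects on /metrics.
    assert "vllm:time_to_first_token_seconds" + suffix in exposition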
4 changes: 3 additions & 1 deletion vllm/v1/engine/output_processor.py
@@ -27,6 +27,7 @@ def __init__(
prompt: Optional[str],
prompt_token_ids: List[int],
detokenizer: IncrementalDetokenizer,
arrival_time: float,
queue: Optional[asyncio.Queue[RequestOutput]],
):
self.request_id = request_id
@@ -37,7 +38,7 @@ def __init__(
self.is_prefilling = True
self.queue = queue

self.stats = RequestStateStats()
self.stats = RequestStateStats(last_token_time=arrival_time)

@classmethod
def from_new_request(
@@ -54,6 +55,7 @@ def from_new_request(
tokenizer=tokenizer,
request=request,
),
arrival_time=request.arrival_time,
queue=queue,
)

25 changes: 25 additions & 0 deletions vllm/v1/metrics/loggers.py
@@ -121,6 +121,26 @@ def __init__(self, model_config: ModelConfig):
buckets=build_1_2_5_buckets(max_model_len),
labelnames=labelnames).labels(*labelvalues)

self.histogram_time_to_first_token = \
prometheus_client.Histogram(
name="vllm:time_to_first_token_seconds",
documentation="Histogram of time to first token in seconds.",
buckets=[
0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
0.75, 1.0, 2.5, 5.0, 7.5, 10.0
],
labelnames=labelnames).labels(*labelvalues)

self.histogram_time_per_output_token = \
prometheus_client.Histogram(
name="vllm:time_per_output_token_seconds",
documentation="Histogram of time per output token in seconds.",
buckets=[
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5,
0.75, 1.0, 2.5
],
labelnames=labelnames).labels(*labelvalues)

def log(self, scheduler_stats: SchedulerStats,
iteration_stats: IterationStats):
"""Log to prometheus."""
@@ -137,6 +157,11 @@ def log(self, scheduler_stats: SchedulerStats,
self.histogram_num_generation_tokens_request.observe(
finished_request.num_generation_tokens)

for ttft in iteration_stats.time_to_first_tokens_iter:
self.histogram_time_to_first_token.observe(ttft)
for tpot in iteration_stats.time_per_output_tokens_iter:
self.histogram_time_per_output_token.observe(tpot)

@staticmethod
def _unregister_vllm_metrics():
# Unregister any existing vLLM collectors (for CI/CD
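For illustration only (not part of the PR): once these histograms are exported, a consumer can recover the running mean TTFT from the _sum and _count series. A rough sketch, assuming the requests package and a vLLM server already serving /metrics on localhost:8000:

# Sketch only: derive the mean time-to-first-token from the scraped histogram.
import requests
from prometheus_client.parser import text_string_to_metric_families

body = requests.get("http://localhost:8000/metrics").text
ttft_sum = ttft_count = 0.0
for family in text_string_to_metric_families(body):
    if family.name == "vllm:time_to_first_token_seconds":
        for sample in family.samples:
            if sample.name.endswith("_sum"):
                ttft_sum = sample.value
            elif sample.name.endswith("_count"):
                ttft_count = sample.value
if ttft_count:
    print(f"mean TTFT so far: {ttft_sum / ttft_count:.3f}s")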
11 changes: 11 additions & 0 deletions vllm/v1/metrics/stats.py
@@ -1,3 +1,4 @@
import time
from dataclasses import dataclass
from typing import TYPE_CHECKING, List

@@ -22,6 +23,7 @@ class RequestStateStats:
"""Stats that need to be tracked across delta updates."""

num_generation_tokens: int = 0
last_token_time: float = 0.0


@dataclass
@@ -40,6 +42,8 @@ def __init__(self, log_stats: bool):
self.num_generation_tokens = 0
self.num_prompt_tokens = 0
self.finished_requests: List[FinishedRequestStats] = []
self.time_to_first_tokens_iter: List[float] = []
self.time_per_output_tokens_iter: List[float] = []

def update_from_output(self, output: "EngineCoreOutput",
is_prefilling: bool, prompt_len: int,
@@ -48,6 +52,8 @@ def update_from_output(self, output: "EngineCoreOutput",
return

num_new_generation_tokens = len(output.new_token_ids)
now = time.time()
Review comment from @robertgshaw2-redhat (Collaborator), Jan 29, 2025:

I have seen mixed use of time.time() and time.monotonic() - I think we should standardize on one or the other so we don't have a footgun down the road

Reply from the author (Member):

Yes!

I had initially changed to time.monotonic() but I backed it out because it looks like arrival_time being wall-clock time is part of the public API, so not something I wanted to rush into.

I'll make sure to do it or capture it as a TODO/FIXME before I wrap this up 👍
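For reference only, a minimal illustration (not vLLM code) of the footgun being discussed: time.time() follows the wall clock and can jump when the system clock is adjusted, while time.monotonic() only moves forward, which makes it the safer choice for measuring intervals, even though an absolute timestamp such as arrival_time still needs wall-clock time:

# Illustration of the trade-off discussed in the thread above; not part of this PR.
import time

arrival_wall = time.time()      # absolute timestamp, the kind exposed as arrival_time
start_mono = time.monotonic()   # only meaningful as a reference point for deltas

time.sleep(0.05)                # stand-in for prefill / decode work

wall_delta = time.time() - arrival_wall      # can be skewed by NTP or manual clock changes
mono_delta = time.monotonic() - start_mono   # immune to clock adjustments

print(f"wall-clock delta: {wall_delta:.3f}s, monotonic delta: {mono_delta:.3f}s")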

last_token_latency = now - request_state_stats.last_token_time

self.num_generation_tokens += num_new_generation_tokens
if is_prefilling:
@@ -58,7 +64,12 @@ def update_from_output(self, output: "EngineCoreOutput",
assert (num_new_generation_tokens > 0)
self.num_prompt_tokens += prompt_len

self.time_to_first_tokens_iter.append(last_token_latency)
else:
self.time_per_output_tokens_iter.append(last_token_latency)

request_state_stats.num_generation_tokens += num_new_generation_tokens
request_state_stats.last_token_time = now

def update_from_finished_request(self, request_output: "RequestOutput",
request_state_stats: RequestStateStats):
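Taken together, the stats changes work as follows: last_token_time is seeded with the request's arrival time, so the first latency measured after prefill is the time to first token, and every later delta is an inter-token latency. A condensed, self-contained sketch of that bookkeeping (names simplified; the real logic lives in RequestStateStats and IterationStats above):

# Simplified sketch of the TTFT / TPOT bookkeeping added by this PR; not the real classes.
import time
from dataclasses import dataclass, field
from typing import List


@dataclass
class _RequestStats:
    # Seeded with the request's arrival time, so the first measured delta is TTFT.
    last_token_time: float


@dataclass
class _IterationStats:
    time_to_first_tokens_iter: List[float] = field(default_factory=list)
    time_per_output_tokens_iter: List[float] = field(default_factory=list)

    def update(self, req: _RequestStats, is_prefilling: bool) -> None:
        now = time.time()
        latency = now - req.last_token_time
        if is_prefilling:
            # First token(s) for this request: record time to first token.
            self.time_to_first_tokens_iter.append(latency)
        else:
            # Subsequent decode step: record time per output token.
            self.time_per_output_tokens_iter.append(latency)
        req.last_token_time = now


# Usage: one request arriving now, producing its first token and then one more.
req = _RequestStats(last_token_time=time.time())
stats = _IterationStats()
stats.update(req, is_prefilling=True)   # appends a TTFT sample
stats.update(req, is_prefilling=False)  # appends a TPOT sample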