[Bugfix] Fix Prometheus Metrics With zeromq Frontend #7279
Merged
robertgshaw2-redhat merged 42 commits into vllm-project:main from neuralmagic:fix-prom-metrics on Aug 18, 2024
Changes from all commits (42 commits):
0ee81de  fix
2de4dc4  stash
ebd062e  remove __init__
c79d165  scripts fix
6da5189  cleanup
346e5fc  more cleanup
b1d945d  clean
460b621  clean
66fa98b  match nick
db86714  match nick exactly
4029167  grabbed nicks changes
c2b304a  switch to tempfile (njhill)
dea6896  add comment
1082e63  format
b26cb53  deprecate Info metrics
64ba139  fixt
2263569  format
ba5c741  add multiprocess mode to gauges
694fc12  fix typo
4032b4d  test that metrics are exported
d1fe504  run both in the ci
c65f8ea  format
e3025f7  fix test
350c66d  adding tests
2da7d13  comments in test
3d6aade  format
a76f38a  fix example
6eea97c  remove unregistering
bccc2d2  Merge branch 'main' into fix-prom-metrics
0745f7d  cleanup for prom multiprocessing
5c253d9  format
af3474a  stash
13c0444  updated
c4477c4  updated
281a26a  fix
e793498  fix naming
53a56d5  comment
59479a6  format
f74d426  fix cache_config_info
03b8895  Merge branch 'main' into fix-prom-metrics
224c987  properly pass multiprocess_mode to RayGaugeCLS
ad26ad7  ./format
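Several of the commit messages above ("switch to tempfile", "add multiprocess mode to gauges", "cleanup for prom multiprocessing") point at prometheus_client's multiprocess mode, in which each process writes its metric values to files in a shared directory and the scrape endpoint aggregates them. That appears to be the situation the zeromq frontend creates here: metrics recorded in the engine process have to be visible from the API server's /metrics endpoint. The snippet below is not taken from this PR's diff; it is a minimal, self-contained sketch of that prometheus_client pattern, with the metric name and the temporary directory chosen purely for illustration.

# Minimal sketch of prometheus_client multiprocess mode (illustration only,
# not code from this PR). The env var must be set before prometheus_client
# creates any metrics.
import os
import tempfile

prom_dir = tempfile.mkdtemp()  # shared shard directory, hypothetical location
os.environ["PROMETHEUS_MULTIPROC_DIR"] = prom_dir

from prometheus_client import CollectorRegistry, Gauge, generate_latest
from prometheus_client import multiprocess

# Gauges need an explicit multiprocess_mode so per-process shards can be
# merged; "livesum" adds up the values reported by live processes.
running = Gauge("demo_requests_running",
                "Example gauge (hypothetical name)",
                multiprocess_mode="livesum")
running.set(3)

# At scrape time, aggregate the per-process files into a fresh registry.
registry = CollectorRegistry()
multiprocess.MultiProcessCollector(registry, path=prom_dir)
print(generate_latest(registry).decode())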
New test file added in this diff (hunk @@ -0,0 +1,179 @@):
from http import HTTPStatus

import openai
import pytest
import requests
from prometheus_client.parser import text_string_to_metric_families
from transformers import AutoTokenizer

from ...utils import RemoteOpenAIServer

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"


@pytest.fixture(scope="module")
def default_server_args():
    return [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "1024",
        "--enforce-eager",
        "--max-num-seqs",
        "128",
    ]


@pytest.fixture(scope="module",
                params=[
                    "",
                    "--enable-chunked-prefill",
                    "--disable-frontend-multiprocessing",
                ])
def client(default_server_args, request):
    if request.param:
        default_server_args.append(request.param)
    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
        yield remote_server.get_async_client()


_PROMPT = "Hello my name is Robert and I love magic"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
_TOKENIZED_PROMPT = tokenizer(_PROMPT)["input_ids"]

_NUM_REQUESTS = 10
_NUM_PROMPT_TOKENS_PER_REQUEST = len(_TOKENIZED_PROMPT)
_NUM_GENERATION_TOKENS_PER_REQUEST = 10

# {metric_family: [(suffix, expected_value)]}
EXPECTED_VALUES = {
    "vllm:time_to_first_token_seconds": [("_count", _NUM_REQUESTS)],
    "vllm:time_per_output_token_seconds":
    [("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))],
    "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
    "vllm:request_prompt_tokens":
    [("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
     ("_count", _NUM_REQUESTS)],
    "vllm:request_generation_tokens":
    [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
     ("_count", _NUM_REQUESTS)],
    "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
    "vllm:request_params_best_of": [("_count", _NUM_REQUESTS)],
"vllm:prompt_tokens": [("_total", | ||
_NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], | ||
"vllm:generation_tokens": | ||
[("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], | ||
"vllm:request_success": [("_total", _NUM_REQUESTS)], | ||
} | ||
|
||
|
||
@pytest.mark.asyncio | ||
async def test_metrics_counts(client: openai.AsyncOpenAI): | ||
base_url = str(client.base_url)[:-3].strip("/") | ||
|
||
for _ in range(_NUM_REQUESTS): | ||
# sending a request triggers the metrics to be logged. | ||
await client.completions.create( | ||
model=MODEL_NAME, | ||
prompt=_TOKENIZED_PROMPT, | ||
max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST) | ||
|
||
response = requests.get(base_url + "/metrics") | ||
print(response.text) | ||
assert response.status_code == HTTPStatus.OK | ||
|
||
# Loop over all expected metric_families | ||
for metric_family, suffix_values_list in EXPECTED_VALUES.items(): | ||
found_metric = False | ||
|
||
# Check to see if the metric_family is found in the prom endpoint. | ||
for family in text_string_to_metric_families(response.text): | ||
if family.name == metric_family: | ||
found_metric = True | ||
|
||
# Check that each suffix is found in the prom endpoint. | ||
for suffix, expected_value in suffix_values_list: | ||
metric_name_w_suffix = f"{metric_family}{suffix}" | ||
found_suffix = False | ||
|
||
for sample in family.samples: | ||
if sample.name == metric_name_w_suffix: | ||
found_suffix = True | ||
|
||
                            # For each suffix, make sure the value matches
                            # what we expect.
                            assert sample.value == expected_value, (
                                f"{metric_name_w_suffix} expected value of "
                                f"{expected_value} did not match found value "
                                f"{sample.value}")
                            break
                    assert found_suffix, (
                        f"Did not find {metric_name_w_suffix} in prom endpoint"
                    )
                break

        assert found_metric, (f"Did not find {metric_family} in prom endpoint")


EXPECTED_METRICS = [
    "vllm:num_requests_running",
    "vllm:num_requests_swapped",
    "vllm:num_requests_waiting",
    "vllm:gpu_cache_usage_perc",
    "vllm:cpu_cache_usage_perc",
    "vllm:time_to_first_token_seconds_sum",
    "vllm:time_to_first_token_seconds_bucket",
    "vllm:time_to_first_token_seconds_count",
    "vllm:time_per_output_token_seconds_sum",
    "vllm:time_per_output_token_seconds_bucket",
    "vllm:time_per_output_token_seconds_count",
    "vllm:e2e_request_latency_seconds_sum",
    "vllm:e2e_request_latency_seconds_bucket",
    "vllm:e2e_request_latency_seconds_count",
    "vllm:request_prompt_tokens_sum",
    "vllm:request_prompt_tokens_bucket",
    "vllm:request_prompt_tokens_count",
    "vllm:request_generation_tokens_sum",
    "vllm:request_generation_tokens_bucket",
    "vllm:request_generation_tokens_count",
    "vllm:request_params_n_sum",
    "vllm:request_params_n_bucket",
    "vllm:request_params_n_count",
    "vllm:request_params_best_of_sum",
    "vllm:request_params_best_of_bucket",
    "vllm:request_params_best_of_count",
    "vllm:num_preemptions_total",
    "vllm:prompt_tokens_total",
    "vllm:generation_tokens_total",
    "vllm:request_success_total",
    "vllm:cache_config_info",
    # labels in cache_config_info
    "block_size",
    "cache_dtype",
    "cpu_offload_gb",
    "enable_prefix_caching",
    "gpu_memory_utilization",
    "num_cpu_blocks",
    "num_gpu_blocks",
    "num_gpu_blocks_override",
    "sliding_window",
    "swap_space_bytes",
]


@pytest.mark.asyncio
async def test_metrics_exist(client: openai.AsyncOpenAI):
    base_url = str(client.base_url)[:-3].strip("/")

    # sending a request triggers the metrics to be logged.
    await client.completions.create(model=MODEL_NAME,
                                    prompt="Hello, my name is",
                                    max_tokens=5,
                                    temperature=0.0)

    response = requests.get(base_url + "/metrics")
    assert response.status_code == HTTPStatus.OK

    for metric in EXPECTED_METRICS:
        assert metric in response.text
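For a quick manual check outside the test suite, the same parsing approach used above can be pointed at any running vLLM OpenAI-compatible server. The sketch below assumes a server already listening on localhost:8000; the host and port are assumptions for illustration, not something this diff configures.

# Standalone sketch mirroring the test's parsing logic; assumes a vLLM
# OpenAI-compatible server is already running on localhost:8000.
import requests
from prometheus_client.parser import text_string_to_metric_families

resp = requests.get("http://localhost:8000/metrics")
resp.raise_for_status()

# Print every vLLM metric sample with its labels and current value.
for family in text_string_to_metric_families(resp.text):
    if family.name.startswith("vllm:"):
        for sample in family.samples:
            print(sample.name, sample.labels, sample.value)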