Commit ddd704f

fix: Fix queued req stats for release/0.20 (#4806)
Signed-off-by: Patrice Castonguay <[email protected]>
1 parent 7a2cd25 commit ddd704f

2 files changed: 64 additions, 5 deletions

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 6 additions & 5 deletions
@@ -538,9 +538,9 @@ def get_req_stats(req: LlmRequest) -> RequestStats:
                 req_stat.dis_serving_stats.kv_cache_size = req.kv_cache_size
             return req_stat
 
-        def get_queued_req_stats(req: LlmRequest) -> RequestStats:
+        def get_queued_req_stats(request_id: int) -> RequestStats:
             req_stat = RequestStats()
-            req_stat.id = req.request_id
+            req_stat.id = request_id
             req_stat.context_prefill_position = 0
             req_stat.num_generated_tokens = 0
             req_stat.avg_num_decoded_tokens_per_iter = 0
@@ -558,9 +558,10 @@ def get_queued_req_stats(req: LlmRequest) -> RequestStats:
             req_stats.append(req_stat)
 
         for req in list(self.request_queue.queue):
-            req_stat = get_queued_req_stats(req)
-            req.stage = RequestStage.QUEUED
-            req_stats.append(req_stat)
+            if isinstance(req, Tuple):
+                req_stat = get_queued_req_stats(req[0])
+                req_stat.stage = RequestStage.QUEUED
+                req_stats.append(req_stat)
 
         for req in finished_requests:
             req_stat = get_req_stats(req)
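
What this hunk changes: the entries sitting in `self.request_queue` are tuples whose first element is the request id (hence `get_queued_req_stats(req[0])`), not `LlmRequest` objects. The helper therefore now takes `request_id: int` directly, non-tuple entries are skipped via `isinstance(req, Tuple)`, and `QUEUED` is set on the stat object rather than on the queue entry. Below is a minimal, self-contained sketch of the same pattern; the `RequestStats`/`RequestStage` classes are simplified stand-ins rather than the real tensorrt_llm bindings, and the `None` sentinel is only an assumed example of a non-tuple queue entry.

```python
# Sketch of the queued-stats pattern in this diff (stand-in types, not the real bindings).
from dataclasses import dataclass
from enum import Enum
from queue import Queue
from typing import Tuple


class RequestStage(Enum):
    QUEUED = "QUEUED"
    GENERATION_IN_PROGRESS = "GENERATION_IN_PROGRESS"


@dataclass
class RequestStats:
    id: int = -1
    stage: RequestStage = RequestStage.GENERATION_IN_PROGRESS
    num_generated_tokens: int = 0


def get_queued_req_stats(request_id: int) -> RequestStats:
    # A queued request has produced nothing yet, so only its id is known.
    req_stat = RequestStats()
    req_stat.id = request_id
    req_stat.num_generated_tokens = 0
    return req_stat


request_queue: Queue = Queue()
request_queue.put((7, "request payload"))  # normal entry: (request_id, request)
request_queue.put(None)                    # assumed example of a non-tuple entry

req_stats = []
for req in list(request_queue.queue):      # snapshot the queue without draining it
    if isinstance(req, Tuple):             # skip anything that is not a request entry
        req_stat = get_queued_req_stats(req[0])
        req_stat.stage = RequestStage.QUEUED
        req_stats.append(req_stat)

assert [s.id for s in req_stats] == [7]
```

Setting the stage on `req_stat` (instead of on the queue item, as the removed line did) reports the request as QUEUED without mutating whatever is actually stored in the queue.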

tests/unittest/llmapi/test_llm.py

Lines changed: 58 additions & 0 deletions
@@ -1912,6 +1912,64 @@ def test_llm_get_stats(return_context_logits, enable_iter_req_stats):
                                enable_iter_req_stats=enable_iter_req_stats)
 
 
+def test_llm_get_queued_stats():
+
+    enable_iter_req_stats = True
+    use_overlap = False
+    tp_size = 1
+
+    num_requests = 3000
+    repeated_prompts = ["A B C D E F G H I J K L M"] * num_requests
+
+    llm_args_extra = {}
+    sampling_args_extra = {}
+
+    from tensorrt_llm._torch import LLM as LLM_torch
+    from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
+
+    llm_args_extra["pytorch_backend_config"] = PyTorchConfig(
+        enable_iter_perf_stats=True,
+        enable_iter_req_stats=enable_iter_req_stats,
+        disable_overlap_scheduler=not use_overlap,
+    )
+
+    llm = LLM_torch(model=llama_model_path,
+                    kv_cache_config=global_kvcache_config,
+                    tensor_parallel_size=tp_size,
+                    fast_build=True,
+                    **llm_args_extra)
+
+    max_tokens = 10
+    sampling_params = SamplingParams(max_tokens=max_tokens,
+                                     **sampling_args_extra)
+
+    max_tries = 10
+    has_queue_requests = False
+
+    while not has_queue_requests and max_tries > 0:
+        max_tries -= 1
+        # Generate outputs, which will queue requests
+        outputs = llm.generate(repeated_prompts,
+                               sampling_params=sampling_params)
+
+        results = llm.get_stats(2)
+
+        for index, result in enumerate(results):
+            if "requestStats" in result:
+                for requestStat in result["requestStats"]:
+                    if requestStat["stage"] == "QUEUED":
+                        has_queue_requests = True
+                        assert requestStat["numGeneratedTokens"] == 0
+
+        if not has_queue_requests:
+            print("No queued requests found, retrying...")
+            asyncio.sleep(1)
+        else:
+            print("Found queued requests, breaking out of the loop.")
+
+    assert has_queue_requests
+
+
 def llm_get_stats_async_test_harness(tp_size: int = 1,
                                      return_context_logits: bool = False,
                                      pytorch_backend: bool = False,
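
The new test floods the executor with 3000 identical prompts so that some requests stay queued, then reads `llm.get_stats(2)` and asserts that every request reported in stage "QUEUED" has `numGeneratedTokens == 0`. The sketch below isolates that check and runs it against a hand-written stats payload; only the keys the test actually reads (`requestStats`, `stage`, `numGeneratedTokens`) are taken from the diff, while the concrete values and the non-QUEUED stage name are illustrative assumptions.

```python
# Isolated version of the assertion the new test performs on per-request stats.
from typing import Any, Dict, List


def found_valid_queued_stats(results: List[Dict[str, Any]]) -> bool:
    """Return True if any iteration reports a QUEUED request; every QUEUED
    entry must report zero generated tokens."""
    has_queued = False
    for result in results:
        for req_stat in result.get("requestStats", []):
            if req_stat["stage"] == "QUEUED":
                has_queued = True
                assert req_stat["numGeneratedTokens"] == 0
    return has_queued


# Hand-written example payload; values and the non-QUEUED stage are made up.
example_results = [
    {"requestStats": [
        {"id": 1, "stage": "GENERATION_IN_PROGRESS", "numGeneratedTokens": 4},
        {"id": 2, "stage": "QUEUED", "numGeneratedTokens": 0},
    ]},
    {"requestStats": [
        {"id": 3, "stage": "QUEUED", "numGeneratedTokens": 0},
    ]},
]

assert found_valid_queued_stats(example_results)
```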
