Skip to content

Commit 89889fb

Browse files
authored
[https://nvbugs/5369366] [fix] Report failing requests (#7060)
Signed-off-by: Rashid Kaleem <[email protected]>
1 parent 08a0e06 commit 89889fb

File tree

2 files changed

+30
-4
lines changed

2 files changed

+30
-4
lines changed

tensorrt_llm/serve/scripts/backend_request_func.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ class RequestFuncOutput:
4646
prompt_len: int = 0
4747
error: str = ""
4848
avg_decoded_tokens_per_iter: float = 0.0 # Average tokens decoded per iteration
49+
exception_type: str = None # unset
4950

5051

5152
async def async_request_trt_llm(
@@ -132,10 +133,11 @@ async def async_request_trt_llm(
132133
else:
133134
output.error = response.reason or ""
134135
output.success = False
135-
except Exception:
136+
except Exception as e:
136137
output.success = False
137138
exc_info = sys.exc_info()
138139
output.error = "".join(traceback.format_exception(*exc_info))
140+
output.exception_type = e.__class__.__name__
139141
finally:
140142
if session is None:
141143
await request_session.close()
@@ -259,12 +261,14 @@ async def async_request_openai_completions(
259261
output.avg_decoded_tokens_per_iter = choice[
260262
"avg_decoded_tokens_per_iter"]
261263
else:
264+
print(f"HTTP Error {response.status}: {response}")
262265
output.error = response.reason or ""
263266
output.success = False
264-
except Exception:
267+
except Exception as e:
265268
output.success = False
266269
exc_info = sys.exc_info()
267270
output.error = "".join(traceback.format_exception(*exc_info))
271+
output.exception_type = e.__class__.__name__
268272
finally:
269273
if session is None:
270274
await request_session.close()
@@ -392,12 +396,14 @@ async def async_request_openai_chat_completions(
392396
"avg_decoded_tokens_per_iter"]
393397

394398
else:
399+
# TODO: Need to store the status code to debug and report
395400
output.error = response.reason or ""
396401
output.success = False
397-
except Exception:
402+
except Exception as e:
398403
output.success = False
399404
exc_info = sys.exc_info()
400405
output.error = "".join(traceback.format_exception(*exc_info))
406+
output.exception_type = e.__class__.__name__
401407
finally:
402408
if session is None:
403409
await request_session.close()

tensorrt_llm/serve/scripts/benchmark_serving.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,12 @@ def calculate_metrics(
144144
e2els: list[float] = []
145145
tput_user: list[float] = []
146146
latest_avg_decoded_tokens_per_iter: float = 0.0
147+
error_counts: dict[str, int] = {}
147148
for i in range(len(outputs)):
149+
if outputs[i].exception_type:
150+
exception_type = outputs[i].exception_type
151+
error_counts[exception_type] = error_counts.get(exception_type,
152+
0) + 1
148153
if outputs[i].success:
149154
output_len = outputs[i].output_tokens
150155
if not output_len:
@@ -179,6 +184,11 @@ def calculate_metrics(
179184
else:
180185
actual_output_lens.append(0)
181186

187+
total_error_count = sum(error_counts.values())
188+
for exception_type, count in error_counts.items():
189+
print(f"Error type: {exception_type}, Count: {count} requests")
190+
print(f"Total failed requests: {total_error_count}")
191+
182192
if goodput_config_dict:
183193
valid_metrics = []
184194
slo_values = []
@@ -336,7 +346,8 @@ async def benchmark(
336346
print(f"Burstiness factor: {burstiness} ({distribution})")
337347
print(f"Maximum request concurrency: {max_concurrency}")
338348

339-
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
349+
pbar = None if disable_tqdm else tqdm(total=len(input_requests),
350+
desc="Benchmarking")
340351

341352
# This can be used once the minimum Python version is 3.10 or higher,
342353
# and it will simplify the code in limited_request_func.
@@ -433,7 +444,10 @@ async def limited_request_func(request_func_input, streaming, pbar,
433444
)
434445

435446
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
447+
print("{:<40} {:<10}".format("Total requests:", len(outputs)))
436448
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
449+
print("{:<40} {:<10}".format("Failed requests:",
450+
len(outputs) - metrics.completed))
437451
print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
438452
benchmark_duration))
439453
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
@@ -455,6 +469,12 @@ async def limited_request_func(request_func_input, streaming, pbar,
455469
if metrics.avg_decoded_tokens_per_iter > 0.0:
456470
print("{:<40} {:<10.2f}".format("Avg Decoded Tokens per Iter:",
457471
metrics.avg_decoded_tokens_per_iter))
472+
if len(outputs) - metrics.completed > 0:
473+
print(
474+
f"=======================!FAILED REQUESTS!=======================")
475+
print(f"Total failed requests: {len(outputs) - metrics.completed}")
476+
print(
477+
f"=====================!CHECK LOG FOR ERRORS!====================")
458478

459479
result = {
460480
"duration": benchmark_duration,

0 commit comments

Comments (0)