diff --git a/triton_backend/ci/L0_backend_trtllm/custom_metrics_verification_tests.py b/triton_backend/ci/L0_backend_trtllm/custom_metrics_verification_tests.py
index db3093a5b47..3523dff6819 100644
--- a/triton_backend/ci/L0_backend_trtllm/custom_metrics_verification_tests.py
+++ b/triton_backend/ci/L0_backend_trtllm/custom_metrics_verification_tests.py
@@ -82,7 +82,7 @@ def _parse_log_file(self, filename):

         return json.loads(json_string)

-    def _parse_triton_metrics(self, filename, is_v1):
+    def _parse_triton_metrics(self, filename):
         curl_counts = {}
         with open(filename) as metrics_file:
             for line in metrics_file:
@@ -91,12 +91,11 @@ def _parse_triton_metrics(self, filename, is_v1):
                 metric_output = re.sub(r"^.*?{", "{", line).split()
                 metric_key = metric_output[0]
                 metric_value = metric_output[1]
-                key = self._convert_metric_key_to_stats_key(
-                    metric_key, is_v1)
+                key = self._convert_metric_key_to_stats_key(metric_key)
                 curl_counts[key] = metric_value
         return curl_counts

-    def _convert_metric_key_to_stats_key(self, metric_output, is_v1):
+    def _convert_metric_key_to_stats_key(self, metric_output):
         # Converts:
         # '{model="tensorrt_llm",request_type="context",version="1"}'
         # to:
@@ -107,15 +106,12 @@ def _convert_metric_key_to_stats_key(self, metric_output, is_v1):
             if not i.startswith('model') and not i.startswith('version')
         ][0]
         self.assertIn(key, metric_to_stat_dict)
-        if (is_v1):
-            self.assertNotIn("inflight_batcher_specific_metric", key)
-        else:
-            self.assertNotIn("v1_specific_metric", key)
+        self.assertNotIn("v1_specific_metric", key)
         return metric_to_stat_dict[key]

-    def _base_test(self, stats_file, metrics_file, is_v1):
+    def _base_test(self, stats_file, metrics_file):
         stats = self._parse_log_file(stats_file)
-        metrics = self._parse_triton_metrics(metrics_file, is_v1)
+        metrics = self._parse_triton_metrics(metrics_file)
         self.assertEqual(len(stats.keys()), len(metrics.keys()))
         self.assertEqual(list(stats.keys()).sort(), list(metrics.keys()).sort())
         for metric_key in stats.keys():
@@ -140,45 +136,33 @@ def _base_test(self, stats_file, metrics_file, is_v1):
                 timedelta(seconds=-1) <= difference,
                 difference <= timedelta(seconds=1))

-    def test_1_gpu_v1(self):
-        self._base_test("1gpu_v1_no_streaming_server.log",
-                        "1gpu_v1_no_stream_metrics.out", True)
-
     def test_1_gpu_IFB_no_stream(self):
         self._base_test("1gpu_IFB_no_streaming_server.log",
-                        "1gpu_IFB_no_stream_metrics.out", False)
+                        "1gpu_IFB_no_stream_metrics.out")

     def test_1_gpu_IFB_stream(self):
         self._base_test("1gpu_IFB_streaming_server.log",
-                        "1gpu_IFB_stream_metrics.out", False)
+                        "1gpu_IFB_stream_metrics.out")

     if AVAILABLE_GPUS >= 2:

-        def test_2_gpu_v1(self):
-            self._base_test("2gpu_v1_no_streaming_server.log",
-                            "2gpu_v1_no_stream_metrics.out", True)
-
         def test_2_gpu_IFB_no_stream(self):
             self._base_test("2gpu_IFB_no_streaming_server.log",
-                            "2gpu_IFB_no_stream_metrics.out", False)
+                            "2gpu_IFB_no_stream_metrics.out")

         def test_2_gpu_IFB_stream(self):
             self._base_test("2gpu_IFB_streaming_server.log",
-                            "2gpu_IFB_stream_metrics.out", False)
+                            "2gpu_IFB_stream_metrics.out")

     if AVAILABLE_GPUS >= 4:

-        def test_4_gpu_v1(self):
-            self._base_test("4gpu_v1_no_streaming_server.log",
-                            "4gpu_v1_no_stream_metrics.out", True)
-
         def test_4_gpu_IFB_no_stream(self):
             self._base_test("4gpu_IFB_no_streaming_server.log",
-                            "4gpu_IFB_no_stream_metrics.out", False)
+                            "4gpu_IFB_no_stream_metrics.out")

         def test_4_gpu_IFB_stream(self):
             self._base_test("4gpu_IFB_streaming_server.log",
-                            "4gpu_IFB_stream_metrics.out", False)
+                            "4gpu_IFB_stream_metrics.out")


 if __name__ == "__main__":
diff --git a/triton_backend/ci/L0_backend_trtllm/test.sh b/triton_backend/ci/L0_backend_trtllm/test.sh
index c09e985a266..83967d1c58c 100644
--- a/triton_backend/ci/L0_backend_trtllm/test.sh
+++ b/triton_backend/ci/L0_backend_trtllm/test.sh
@@ -228,49 +228,13 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
     run_server "${SERVER_ARGS}"
     wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}
-    if [ "$WAIT_RET" != "0" ]; then
-        # Cleanup
-        kill $SERVER_PID > /dev/null 2>&1 || true
-        echo -e "\n***\n*** Failed to start $SERVER\n***"
-        cat $SERVER_LOG
-        exit 1
-    fi
-
-    set -e
-    python3 ${TOOLS_DIR}/inflight_batcher_llm/benchmark_core_model.py \
-        --max-input-len=500 \
-        dataset --dataset=${DATASET} \
-        --tokenizer-dir=${TOKENIZER_DIR}
-
-    if [ $? -ne 0 ]; then
-        cat $SERVER_LOG
-        echo -e "\n***\n*** Error executing v1 benchmark_core_model test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
-        kill_server
-        wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
-        RET=1
-    fi
-    set +e
-
-    set -e
-    python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \
-        --max-input-len=500 \
-        --dataset=${DATASET}
-    if [ $? -ne 0 ]; then
+    # Expect invalid GPT model type error to be gracefully handled
+    if [ `grep -c "Static batching type is deprecated" $SERVER_LOG` == "0" ]; then
+        echo -e "\n***\n*** GPT model type error not handled gracefully: line ${LINENO}\n***"
         cat $SERVER_LOG
-        echo -e "\n***\n*** Error executing v1 end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
-        kill_server
-        wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
-        RET=1
+        exit 1
     fi
-    set +e
-
-    # Make sure the metrics is retrieved after the server has updated the metrics internally
-    sleep ${SLEEP_DURATION}
-    curl localhost:8002/metrics -o ${NUM_GPU}gpu_v1_no_stream_metrics.out
-
-    kill_server
-    wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}

     # inflight batching ON
     # streaming OFF
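
Note: the metric-key conversion that the remaining IFB tests exercise can be seen in isolation with a small standalone sketch. This is an approximation of what `_parse_triton_metrics` and `_convert_metric_key_to_stats_key` do after the `is_v1` flag is removed; the contents of `metric_to_stat_dict`, the metric name, and the sample line below are hypothetical placeholders, not taken from the actual test file.

import re

# Hypothetical subset of the label -> stats-key mapping; the real
# metric_to_stat_dict is defined in custom_metrics_verification_tests.py.
metric_to_stat_dict = {
    'request_type="context"': "Context Requests",
    'request_type="scheduled"': "Scheduled Requests",
}


def convert_metric_key_to_stats_key(metric_key):
    # '{model="tensorrt_llm",request_type="context",version="1"}'
    # -> 'request_type="context"' -> stats key from the mapping table.
    label = [
        part for part in metric_key.replace("{", "").replace("}", "").split(",")
        if not part.startswith("model") and not part.startswith("version")
    ][0]
    return metric_to_stat_dict[label]


def parse_metric_line(line):
    # Mirrors _parse_triton_metrics: drop everything before the label block,
    # then split into the '{...}' label string and the metric value.
    metric_output = re.sub(r"^.*?{", "{", line).split()
    return convert_metric_key_to_stats_key(metric_output[0]), metric_output[1]


if __name__ == "__main__":
    # Illustrative metric line; the metric name is a placeholder.
    sample = 'example_metric{model="tensorrt_llm",request_type="context",version="1"} 5'
    print(parse_metric_line(sample))  # ('Context Requests', '5')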