@@ -82,7 +82,7 @@ def _parse_log_file(self, filename):
 
         return json.loads(json_string)
 
-    def _parse_triton_metrics(self, filename, is_v1):
+    def _parse_triton_metrics(self, filename):
         curl_counts = {}
         with open(filename) as metrics_file:
             for line in metrics_file:
@@ -91,12 +91,11 @@ def _parse_triton_metrics(self, filename, is_v1):
                 metric_output = re.sub(r"^.*?{", "{", line).split()
                 metric_key = metric_output[0]
                 metric_value = metric_output[1]
-                key = self._convert_metric_key_to_stats_key(
-                    metric_key, is_v1)
+                key = self._convert_metric_key_to_stats_key(metric_key)
                 curl_counts[key] = metric_value
         return curl_counts
 
-    def _convert_metric_key_to_stats_key(self, metric_output, is_v1):
+    def _convert_metric_key_to_stats_key(self, metric_output):
         # Converts:
         # '{model="tensorrt_llm",request_type="context",version="1"}'
         # to:
@@ -107,15 +106,12 @@ def _convert_metric_key_to_stats_key(self, metric_output, is_v1):
             if not i.startswith('model') and not i.startswith('version')
         ][0]
         self.assertIn(key, metric_to_stat_dict)
-        if (is_v1):
-            self.assertNotIn("inflight_batcher_specific_metric", key)
-        else:
-            self.assertNotIn("v1_specific_metric", key)
+        self.assertNotIn("v1_specific_metric", key)
         return metric_to_stat_dict[key]
 
-    def _base_test(self, stats_file, metrics_file, is_v1):
+    def _base_test(self, stats_file, metrics_file):
         stats = self._parse_log_file(stats_file)
-        metrics = self._parse_triton_metrics(metrics_file, is_v1)
+        metrics = self._parse_triton_metrics(metrics_file)
         self.assertEqual(len(stats.keys()), len(metrics.keys()))
         self.assertEqual(list(stats.keys()).sort(), list(metrics.keys()).sort())
         for metric_key in stats.keys():
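
For readers skimming the diff: the two helpers above reduce each scraped Prometheus metric line to a single stats key by stripping the metric name and dropping the model/version labels. A minimal sketch of that path, assuming a Prometheus-style input line (the metric name below is invented for illustration):

    import re

    line = 'nv_trt_llm_request_metrics{model="tensorrt_llm",request_type="context",version="1"} 42'

    # Drop everything before the label block, then split off the value.
    metric_output = re.sub(r"^.*?{", "{", line).split()
    metric_key = metric_output[0]    # '{model="tensorrt_llm",request_type="context",version="1"}'
    metric_value = metric_output[1]  # '42'

    # Keep the one label that is neither 'model' nor 'version', mirroring
    # the filter in _convert_metric_key_to_stats_key.
    key = [
        i for i in metric_key.strip('{}').replace('"', '').split(',')
        if not i.startswith('model') and not i.startswith('version')
    ][0]
    print(key, metric_value)  # request_type=context 42

A pre-existing nit in the unchanged context lines: list(stats.keys()).sort() returns None, so that assertEqual compares None with None and can never fail; sorted(stats.keys()) is presumably what was intended.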
@@ -140,45 +136,33 @@ def _base_test(self, stats_file, metrics_file, is_v1):
                     timedelta(seconds=-1) <= difference, difference
                     <= timedelta(seconds=1))
 
-    def test_1_gpu_v1(self):
-        self._base_test("1gpu_v1_no_streaming_server.log",
-                        "1gpu_v1_no_stream_metrics.out", True)
-
     def test_1_gpu_IFB_no_stream(self):
         self._base_test("1gpu_IFB_no_streaming_server.log",
-                        "1gpu_IFB_no_stream_metrics.out", False)
+                        "1gpu_IFB_no_stream_metrics.out")
 
     def test_1_gpu_IFB_stream(self):
         self._base_test("1gpu_IFB_streaming_server.log",
-                        "1gpu_IFB_stream_metrics.out", False)
+                        "1gpu_IFB_stream_metrics.out")
 
     if AVAILABLE_GPUS >= 2:
 
-        def test_2_gpu_v1(self):
-            self._base_test("2gpu_v1_no_streaming_server.log",
-                            "2gpu_v1_no_stream_metrics.out", True)
-
         def test_2_gpu_IFB_no_stream(self):
             self._base_test("2gpu_IFB_no_streaming_server.log",
-                            "2gpu_IFB_no_stream_metrics.out", False)
+                            "2gpu_IFB_no_stream_metrics.out")
 
         def test_2_gpu_IFB_stream(self):
             self._base_test("2gpu_IFB_streaming_server.log",
-                            "2gpu_IFB_stream_metrics.out", False)
+                            "2gpu_IFB_stream_metrics.out")
 
     if AVAILABLE_GPUS >= 4:
 
-        def test_4_gpu_v1(self):
-            self._base_test("4gpu_v1_no_streaming_server.log",
-                            "4gpu_v1_no_stream_metrics.out", True)
-
         def test_4_gpu_IFB_no_stream(self):
             self._base_test("4gpu_IFB_no_streaming_server.log",
-                            "4gpu_IFB_no_stream_metrics.out", False)
+                            "4gpu_IFB_no_stream_metrics.out")
 
         def test_4_gpu_IFB_stream(self):
             self._base_test("4gpu_IFB_streaming_server.log",
-                            "4gpu_IFB_stream_metrics.out", False)
+                            "4gpu_IFB_stream_metrics.out")
 
 
 if __name__ == "__main__":
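
The AVAILABLE_GPUS guards above exploit the fact that a method defined under a class-level if only exists when the condition held while the class body ran, so unittest never collects the multi-GPU tests on smaller machines. A self-contained sketch of the pattern (the class name and GPU count are illustrative):

    import unittest

    AVAILABLE_GPUS = 1  # the real suite derives this from the host

    class GatedTests(unittest.TestCase):

        def test_always_collected(self):
            self.assertTrue(True)

        # Defined only when the condition holds at class-creation time, so
        # on a 1-GPU host this method never exists and is never reported.
        if AVAILABLE_GPUS >= 2:

            def test_needs_two_gpus(self):
                self.assertTrue(True)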
triton_backend/ci/L0_backend_trtllm/test.sh (44 changes: 4 additions & 40 deletions)
@@ -228,49 +228,13 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
 
     run_server "${SERVER_ARGS}"
     wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}
     if [ "$WAIT_RET" != "0" ]; then
-        # Cleanup
-        kill $SERVER_PID > /dev/null 2>&1 || true
-        echo -e "\n***\n*** Failed to start $SERVER\n***"
-        cat $SERVER_LOG
-        exit 1
-    fi
-
-    set -e
-    python3 ${TOOLS_DIR}/inflight_batcher_llm/benchmark_core_model.py \
-        --max-input-len=500 \
-        dataset --dataset=${DATASET} \
-        --tokenizer-dir=${TOKENIZER_DIR}
-
-    if [ $? -ne 0 ]; then
-        cat $SERVER_LOG
-        echo -e "\n***\n*** Error executing v1 benchmark_core_model test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
-        kill_server
-        wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
-        RET=1
-    fi
-    set +e
-
-    set -e
-    python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \
-        --max-input-len=500 \
-        --dataset=${DATASET}
-
-    if [ $? -ne 0 ]; then
+        # Expect invalid GPT model type error to be gracefully handled
+        if [ `grep -c "Static batching type is deprecated" $SERVER_LOG` == "0" ]; then
+            echo -e "\n***\n*** GPT model type error not handled gracefully: line ${LINENO}\n***"
             cat $SERVER_LOG
-        echo -e "\n***\n*** Error executing v1 end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
-        kill_server
-        wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
-        RET=1
+            exit 1
         fi
-    set +e
-
-    # Make sure the metrics is retrieved after the server has updated the metrics internally
-    sleep ${SLEEP_DURATION}
-    curl localhost:8002/metrics -o ${NUM_GPU}gpu_v1_no_stream_metrics.out
-
-    kill_server
-    wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
 
     # inflight batching ON
     # streaming OFF
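
With static (v1) batching deprecated, the script no longer benchmarks a v1 server; it now expects startup to fail and only verifies that the failure is the documented deprecation error. A rough Python rendering of the new check, assuming SERVER_LOG points at the Triton server log (the file name is illustrative):

    from pathlib import Path

    def deprecation_handled_gracefully(server_log: str = "server.log") -> bool:
        # Mirrors: grep -c "Static batching type is deprecated" $SERVER_LOG
        return "Static batching type is deprecated" in Path(server_log).read_text()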