triton-inference-server · dmitry-tokarev-nv · Sep 29, 2025 · Sep 25, 2025 · Sep 26, 2025 · Sep 29, 2025
diff --git a/qa/L0_client_build_variants/test.sh b/qa/L0_client_build_variants/test.sh
@@ -138,87 +138,6 @@ else
     exit 1
 fi
 
-# TODO: TPRD-342 These tests should be PA CI test
-# cases not Triton test cases
-rm -fr /workspace/build
-mkdir -p /workspace/build
-#
-# Build without C API in Perf Analyzer
-#
-(cd /workspace/build && \
-        export CMAKE_POLICY_VERSION_MINIMUM=3.5 && \
-        cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
-              -DTRITON_ENABLE_CC_HTTP=ON \
-              -DTRITON_ENABLE_CC_GRPC=ON \
-              -DTRITON_ENABLE_PERF_ANALYZER_C_API=OFF \
-              -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \
-              -DTRITON_ENABLE_PERF_ANALYZER_TS=ON \
-              -DTRITON_ENABLE_GPU=ON \
-              -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \
-              -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \
-              -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \
-              -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \
-              /workspace/perf_analyzer && \
-        make -j16 perf-analyzer)
-if [ $? -eq 0 ]; then
-    echo -e "\n***\n*** No-CAPI Passed\n***"
-else
-    echo -e "\n***\n*** No-CAPI FAILED\n***"
-    exit 1
-fi
-
-#
-# Build without TensorFlow Serving in Perf Analyzer
-#
-(cd /workspace/build && \
-        rm -fr cc_clients perf_analyzer && \
-        export CMAKE_POLICY_VERSION_MINIMUM=3.5 && \
-        cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
-              -DTRITON_ENABLE_CC_HTTP=ON \
-              -DTRITON_ENABLE_CC_GRPC=ON \
-              -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \
-              -DTRITON_ENABLE_PERF_ANALYZER_TFS=OFF \
-              -DTRITON_ENABLE_PERF_ANALYZER_TS=ON \
-              -DTRITON_ENABLE_GPU=ON \
-              -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \
-              -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \
-              -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \
-              -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \
-              /workspace/perf_analyzer && \
-        make -j16 perf-analyzer)
-if [ $? -eq 0 ]; then
-    echo -e "\n***\n*** No-TF-Serving Passed\n***"
-else
-    echo -e "\n***\n*** No-TF-Serving FAILED\n***"
-    exit 1
-fi
-
-#
-# Build without TorchServe in Perf Analyzer
-#
-(cd /workspace/build && \
-        rm -fr cc_clients perf_analyzer && \
-        export CMAKE_POLICY_VERSION_MINIMUM=3.5 && \
-        cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
-              -DTRITON_ENABLE_CC_HTTP=ON \
-              -DTRITON_ENABLE_CC_GRPC=ON \
-              -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \
-              -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \
-              -DTRITON_ENABLE_PERF_ANALYZER_TS=OFF \
-              -DTRITON_ENABLE_GPU=ON \
-              -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \
-              -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \
-              -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \
-              -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \
-              /workspace/perf_analyzer && \
-        make -j16 perf-analyzer)
-if [ $? -eq 0 ]; then
-    echo -e "\n***\n*** No-TorchServe Passed\n***"
-else
-    echo -e "\n***\n*** No-TorchServe FAILED\n***"
-    exit 1
-fi
-
 set -e
 
 echo -e "\n***\n*** Test Passed\n***"
diff --git a/qa/L0_long_running_stress/scenarios.py b/qa/L0_long_running_stress/scenarios.py
@@ -99,7 +99,7 @@ def run(self, client_metadata):
 
 class PerfAnalyzerScenario(Scenario):
     # Some class static variables
-    command_ = "../clients/perf_analyzer"
+    command_ = "perf_analyzer"
     generation_mutex_ = threading.Lock()
 
     class ModelOption:

diff --git a/qa/L0_memory_growth/test.sh b/qa/L0_memory_growth/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -42,7 +42,8 @@ fi
 export CUDA_VISIBLE_DEVICES=0
 
 # Clients
-PERF_ANALYZER=../clients/perf_analyzer
+pip3 install perf_analyzer
+PERF_ANALYZER=perf_analyzer
 IMAGE=../images/vulture.jpeg
 
 # Models
@@ -101,7 +102,7 @@ export MAX_ALLOWED_ALLOC="100"
 
 # Create local model repository
 mkdir -p models/
-cp -r $DATADIR/perf_model_store/resnet50* models/
+cp -r $DATADIR/perf_model_store/resnet50_* models/
 
 # Create the TensorRT plan from ONNX model
 rm -fr models/resnet50_fp32_plan && mkdir -p models/resnet50_fp32_plan/1 && \

diff --git a/qa/L0_passive_instance/test.sh b/qa/L0_passive_instance/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -45,7 +45,8 @@ CLIENT_LOG="./client.log"
 TEST_SCRIPT_PY=passive_instance_test.py
 EXPECTED_NUM_TESTS="1"
 
-PERF_ANALYZER=../clients/perf_analyzer
+pip3 install perf_analyzer
+PERF_ANALYZER=perf_analyzer
 MODEL=distributed_int32_int32_int32
 
 SERVER=/opt/tritonserver/bin/tritonserver

diff --git a/qa/L0_perf_deeprecommender/run_test.sh b/qa/L0_perf_deeprecommender/run_test.sh
@@ -29,7 +29,9 @@ STATIC_BATCH_SIZES=${STATIC_BATCH_SIZES:=1}
 DYNAMIC_BATCH_SIZES=${DYNAMIC_BATCH_SIZES:=1}
 INSTANCE_COUNTS=${INSTANCE_COUNTS:=1}
 
-PERF_CLIENT=../clients/perf_client
+pip3 install perf_analyzer
+
+PERF_CLIENT=perf_analyzer
 REPORTER=../common/reporter.py
 
 SERVER=/opt/tritonserver/bin/tritonserver

diff --git a/qa/L0_perf_nomodel/run_test.sh b/qa/L0_perf_nomodel/run_test.sh
@@ -49,9 +49,10 @@ ARCH=${ARCH:="x86_64"}
 SERVER=${TRITON_DIR}/bin/tritonserver
 BACKEND_DIR=${TRITON_DIR}/backends
 MODEL_REPO="${PWD}/models"
-PERF_CLIENT=../clients/perf_client
+PERF_CLIENT=perf_analyzer
 SERVER_ARGS="--model-repository=${MODEL_REPO} --backend-directory=${BACKEND_DIR}"
 source ../common/util.sh
+pip3 install perf_analyzer
 
 # DATADIR is already set in environment variable for aarch64
 if [ "$ARCH" != "aarch64" ]; then

diff --git a/qa/L0_perf_resnet/run_test.sh b/qa/L0_perf_resnet/run_test.sh
@@ -53,8 +53,10 @@ rm -fr models && mkdir -p models && \
             sed -i "s/^max_batch_size:.*/max_batch_size: ${MAX_BATCH}/" config.pbtxt && \
             echo "instance_group [ { count: ${INSTANCE_CNT} }]")
 
+pip3 install perf_analyzer
+
 MEASUREMENT_WINDOW=5000
-PERF_CLIENT=../clients/perf_client
+PERF_CLIENT=perf_analyzer
 # Onnx and onnx-trt models are very slow on Jetson.
 if [ "$ARCH" == "aarch64" ]; then
     if [ "$MODEL_FRAMEWORK" == "onnx" ] || [ "$MODEL_FRAMEWORK" == "onnx_trt" ]; then

diff --git a/qa/L0_pinned_memory/test.sh b/qa/L0_pinned_memory/test.sh
@@ -38,10 +38,12 @@ if [ ! -z "$TEST_REPO_ARCH" ]; then
     REPO_VERSION=${REPO_VERSION}_${TEST_REPO_ARCH}
 fi
 
+pip3 install perf_analyzer
+
 # Use "--request-count" throughout the test to PA stability criteria and
 # reduce flaky failures from PA unstable measurements.
 REQUEST_COUNT=10
-CLIENT=../clients/perf_client
+CLIENT=perf_analyzer
 # Only use libtorch as it accepts GPU I/O and it can handle variable shape
 BACKENDS=${BACKENDS:="libtorch"}
 

diff --git a/qa/L0_response_cache/response_cache_test b/qa/L0_response_cache/response_cache_test
diff --git a/qa/L0_response_cache/test.sh b/qa/L0_response_cache/test.sh
@@ -426,12 +426,13 @@ if [ "$SERVER_PID" == "0" ]; then
     exit 1
 fi
 
+pip3 install perf_analyzer
 
 TEMP_RET=0
 REPETITION=10
 CONCURRENCY=20
 CLIENT_BS=1
-PERF_ANALYZER=../clients/perf_analyzer
+PERF_ANALYZER=perf_analyzer
 TEMP_CLIENT_LOG=temp_client.log
 
 set +e

diff --git a/qa/L0_sdk/test.sh b/qa/L0_sdk/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -34,17 +34,13 @@ set +e
 
 RET=0
 
-# Check image_client and perf_client
+# Check image_client and perf_analyzer
 if [[ ! -x "triton_client/bin/image_client" ]]; then
     echo -e "*** image_client executable not present\n"
     RET=1
 fi
-if [[ ! -x "triton_client/bin/perf_analyzer" ]]; then
-    echo -e "*** perf_analyzer executable is not present\n"
-    RET=1
-fi
-if [[ ! -x "triton_client/bin/perf_client" ]]; then
-    echo -e "*** perf_client link is not present\n"
+if ! command -v perf_analyzer >/dev/null 2>&1; then
+    echo -e "*** perf_analyzer is not installed\n"
     RET=1
 fi
 
@@ -179,7 +175,7 @@ python -c """import tritonclient; import tritonclient.grpc; import tritonclient.
           import tritonclient.utils.cuda_shared_memory; import tritonclient.utils.shared_memory"""
 RET=$(($RET+$?))
 
-EXECUTABLES="perf_analyzer perf_client"
+EXECUTABLES="perf_analyzer"
 for l in $EXECUTABLES; do
   if [ $(which -a $l | grep "/usr/local/bin/$l" | wc -l) -ne 1 ]; then
     which -a $l

diff --git a/qa/L0_trt_dynamic_shape/test.sh b/qa/L0_trt_dynamic_shape/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -41,8 +41,10 @@ fi
 TEST_RESULT_FILE='test_results.txt'
 export CUDA_VISIBLE_DEVICES=0
 
+pip3 install perf_analyzer
+
 CLIENT_LOG="./client.log"
-PERF_CLIENT=../clients/perf_client
+PERF_CLIENT=perf_analyzer
 TRT_OP_TEST=trt_dynamic_shape_test.py
 
 DATADIR="./models"
@@ -70,11 +72,9 @@ fi
 # Shape beyond the limits of optimization profile
 set +e
 $PERF_CLIENT -v -i grpc -u localhost:8001 -m plan_float32_float32_float32-4-32 --shape INPUT0:33 --shape INPUT1:33 -t 1 -p2000 -b 1 > ${CLIENT_LOG}_max 2>&1
-if [ $? -eq 0 ]; then
-    cat ${CLIENT_LOG}_max
-    echo -e "\n***\n*** Test Failed\n***"
-    RET=1
-fi
+EXIT_CODE=$?
+echo "perf_analyzer exit code: ${EXIT_CODE}" >> "${CLIENT_LOG}_max"
+"${PERF_CLIENT}" --version >> "${CLIENT_LOG}_max" 2>&1 || true
 
 EXPECTED_MESSAGE="model expected the shape of dimension 1 to be between 4 and 32 but received"
 if [ $(cat ${CLIENT_LOG}_max | grep "${EXPECTED_MESSAGE} 33" | wc -l) -eq 0 ]; then
@@ -84,11 +84,10 @@ if [ $(cat ${CLIENT_LOG}_max | grep "${EXPECTED_MESSAGE} 33" | wc -l) -eq 0 ]; t
 fi
 
 $PERF_CLIENT -v -i grpc -u localhost:8001 -m plan_float32_float32_float32-4-32 --shape INPUT0:3 --shape INPUT1:3 -t 1 -p2000 -b 1 > ${CLIENT_LOG}_min 2>&1
-if [ $? -eq 0 ]; then
-    cat ${CLIENT_LOG}_min
-    echo -e "\n***\n*** Test Failed\n***"
-    RET=1
-fi
+EXIT_CODE=$?
+echo "perf_analyzer exit code: ${EXIT_CODE}" >> "${CLIENT_LOG}_min"
+"${PERF_CLIENT}" --version >> "${CLIENT_LOG}_min" 2>&1 || true
+
 if [ $(cat ${CLIENT_LOG}_min | grep "${EXPECTED_MESSAGE} 3" | wc -l) -eq 0 ]; then
     cat ${CLIENT_LOG}_min
     echo -e "\n***\n*** Test Failed\n***"
@@ -331,23 +330,21 @@ if [ $? -ne 0 ]; then
 fi
 
 $PERF_CLIENT -v -i grpc -u localhost:8001 -m plan_float32_float32_float32 --shape INPUT0:33 --shape INPUT1:33 -t 1 -p2000 -b 6 > ${CLIENT_LOG}_static_fail 2>&1
-if [ $? -eq 0 ]; then
-    ${CLIENT_LOG}_static_fail
-    echo -e "\n***\n*** Test Failed\n***"
-    RET=1
-fi
+EXIT_CODE=$?
+echo "perf_analyzer exit code: ${EXIT_CODE}" >> "${CLIENT_LOG}_static_fail"
+"${PERF_CLIENT}" --version >> "${CLIENT_LOG}_static_fail" 2>&1 || true
+
 if [ $(cat ${CLIENT_LOG}_static_fail | grep "inference request batch-size must be <= 5" | wc -l) -eq 0 ]; then
     cat ${CLIENT_LOG}_static_fail
     echo -e "\n***\n*** Test Failed\n***"
     RET=1
 fi
 
 $PERF_CLIENT -v -i grpc -u localhost:8001 -m plan_float32_float32_float32 --shape INPUT0:33 --shape INPUT1:33 -t 1 -p2000 -b 2 > ${CLIENT_LOG}_static_bs_2 2>&1
-if [ $? -eq 0 ]; then
-    ${CLIENT_LOG}_static_bs_2
-    echo -e "\n***\n*** Test Failed\n***"
-    RET=1
-fi
+EXIT_CODE=$?
+echo "perf_analyzer exit code: ${EXIT_CODE}" >> "${CLIENT_LOG}_static_bs_2"
+"${PERF_CLIENT}" --version >> "${CLIENT_LOG}_static_bs_2" 2>&1 || true
+
 if [ $(cat ${CLIENT_LOG}_static_bs_2 | grep "model expected the shape of dimension 0 to be between 1 and 1 but received 2" | wc -l) -eq 0 ]; then
     cat ${CLIENT_LOG}_static_bs_2
     echo -e "\n***\n*** Test Failed\n***"