Skip to content

Commit c19f3c7

Browse files
committed
refactor pytest command generation
With this refactor, the Slurm job and the Blossom job both generate the pytest command from the same method. Signed-off-by: Yuanjing Xue <[email protected]>
1 parent dd681a7 commit c19f3c7

File tree

2 files changed

+153
-122
lines changed

2 files changed

+153
-122
lines changed

jenkins/L0_Test.groovy

Lines changed: 102 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,66 @@ def getNodeArgs(int nodeCount, int gpuCount) {
170170
]
171171
}
172172

173+
def getPytestBaseCommand(
174+
String llmSrc,
175+
String stageName,
176+
String testDBList,
177+
int splits,
178+
int split_id,
179+
Boolean perfMode,
180+
String outputPath,
181+
String trtllmWheelPath,
182+
String coverageConfigFile,
183+
String pytestUtil = ""
184+
) {
185+
def extraInternalEnv = ""
186+
def pytestTestTimeout = "3600"
187+
188+
// TRT uses half of the host logic cores for engine building which is bad for multi-GPU machines.
189+
extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\""
190+
// CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest
191+
extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}"
192+
193+
def testCmdLine = [
194+
"LLM_ROOT=${llmSrc}",
195+
"LLM_BACKEND_ROOT=${llmSrc}/triton_backend",
196+
"LLM_MODELS_ROOT=${MODEL_CACHE_DIR}",
197+
"MODEL_CACHE_DIR=${MODEL_CACHE_DIR}",
198+
extraInternalEnv,
199+
pytestUtil,
200+
"pytest",
201+
"-v",
202+
testFilter[(DETAILED_LOG)] ? "-s" : "",
203+
"--timeout-method=thread",
204+
"--apply-test-list-correction",
205+
"--splitting-algorithm least_duration",
206+
"--timeout=${pytestTestTimeout}",
207+
"--rootdir ${llmSrc}/tests/integration/defs",
208+
"--test-prefix=${stageName}",
209+
"--splits ${splits}",
210+
"--group ${split_id}",
211+
"--waives-file=${llmSrc}/tests/integration/test_lists/waives.txt",
212+
"--output-dir=${outputPath}/",
213+
"--csv=${outputPath}/report.csv",
214+
"--junit-xml ${outputPath}/results.xml",
215+
"-o junit_logging=out-err",
216+
"--cov=${llmSrc}/examples/",
217+
"--cov=${llmSrc}/tensorrt_llm/",
218+
"--cov=${trtllmWheelPath}/tensorrt_llm/",
219+
"--cov-report=",
220+
"--cov-config=${coverageConfigFile}",
221+
"--test-list=${testDBList}",
222+
]
223+
if (perfMode) {
224+
testCmdLine += [
225+
"--perf",
226+
"--perf-log-formats csv",
227+
"--perf-log-formats yaml"
228+
]
229+
}
230+
return testCmdLine.join(" ")
231+
}
232+
173233
def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=1, skipInstallWheel=false, cpver="cp312")
174234
{
175235
SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
@@ -207,9 +267,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
207267
def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
208268
def scriptLaunchPathNode = "${jobWorkspace}/slurm_launch.sh"
209269
def isAarch64 = config.contains("aarch64")
210-
def pytestTestTimeout = "7200"
211270

212-
stage('Prepare Testing') {
271+
stage("[${stageName}] Initializing Test") {
213272
// Create Job Workspace folder in Frontend Node
214273
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' ssh ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host} 'mkdir -p ${jobWorkspace}'",)
215274

@@ -228,15 +287,6 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
228287
true
229288
)
230289

231-
// Upload waives.txt to Frontend node
232-
def waivesListPathLocal = "${llmSrcLocal}/tests/integration/test_lists/waives.txt"
233-
Utils.copyScriptToRemoteHost(
234-
pipeline,
235-
remote,
236-
waivesListPathLocal,
237-
waivesListPathNode
238-
)
239-
240290
// Generate Test List and Upload to Frontend Node
241291
def makoArgs = getMakoArgsFromStageName(stageName, true)
242292
// TODO: currently the options will only be processed if the first
@@ -261,37 +311,42 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
261311
}
262312
taskArgs = [
263313
*taskArgs,
264-
"--container-image=${container}",
314+
"--container-image=$container",
265315
"--container-workdir=/home/svc_tensorrt/bloom/scripts",
266-
"--container-mounts=${mounts}",
316+
"--container-mounts=$mounts",
267317
"--container-env=NVIDIA_IMEX_CHANNELS"
268318
]
269319

320+
String pytestUtil = ""
321+
if (nodeCount > 1) {
322+
pytestUtil = "$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
323+
}
324+
325+
def pytestCommand = getPytestBaseCommand(
326+
llmSrcNode,
327+
stageName,
328+
testListPathNode,
329+
splits,
330+
splitId,
331+
perfMode,
332+
jobWorkspace,
333+
"__PLACEHOLDER_TRTLLM_WHL_PATH__",
334+
"__PLACEHOLDER_coverageConfigFile__",
335+
pytestUtil
336+
)
337+
270338
def scriptContent = """#!/bin/bash
271339
export jobWorkspace=$jobWorkspace
272340
export tarName=$tarName
273341
export llmTarfile=$llmTarfile
274342
export llmSrcNode=$llmSrcNode
275343
export stageName=$stageName
276-
export testList=$testList
277-
export testListPathNode=$testListPathNode
278-
export waivesListPathNode=$waivesListPathNode
279-
export pytestTestTimeout=$pytestTestTimeout
280-
export splits=$splits
281-
export splitId=$splitId
282344
export perfMode=$perfMode
283345
export resourcePathNode=$resourcePathNode
284346
export nodeCount=$nodeCount
285-
export MODEL_CACHE_DIR=$MODEL_CACHE_DIR
347+
export pytestCommand="${pytestCommand}"
286348
export NVIDIA_IMEX_CHANNELS=0
287-
chmod +x ${scriptRunNode}
288-
<<<<<<< HEAD
289-
${srunCmd}
290-
""".stripIndent()
291-
pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent)
292-
Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)
293-
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",)
294-
=======
349+
chmod +x $scriptRunNode
295350
"""
296351
if (nodeCount > 1) {
297352
taskArgs = [
@@ -341,9 +396,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
341396
scriptLaunchPathLocal,
342397
scriptLaunchPathNode
343398
)
344-
>>>>>>> e74931618 (Refactor L0 Test code)
345399
}
346-
stage('Run Test') {
400+
stage("[${stageName}] Run Pytest") {
347401
Utils.exec(
348402
pipeline,
349403
timeout: false,
@@ -1344,50 +1398,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
13441398

13451399
stage ("[${stageName}] Run Pytest")
13461400
{
1347-
echoNodeAndGpuInfo(pipeline, stageName)
1348-
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C; fi'
1349-
1350-
def extraInternalEnv = ""
1351-
def pytestTestTimeout = "3600"
1352-
1353-
// TRT uses half of the host logic cores for engine building which is bad for multi-GPU machines.
1354-
extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\""
1355-
// CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest
1356-
extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}"
1357-
1401+
// Test List
13581402
def testDBList = renderTestDB(testList, llmSrc, stageName)
1359-
testList = "${testList}_${splitId}"
1360-
def testCmdLine = [
1361-
"LLM_ROOT=${llmSrc}",
1362-
"LLM_BACKEND_ROOT=${llmSrc}/triton_backend",
1363-
"LLM_MODELS_ROOT=${MODEL_CACHE_DIR}",
1364-
"MODEL_CACHE_DIR=${MODEL_CACHE_DIR}",
1365-
extraInternalEnv,
1366-
"pytest",
1367-
"-v",
1368-
testFilter[(DETAILED_LOG)] ? "-s" : "",
1369-
"--timeout-method=thread",
1370-
"--apply-test-list-correction",
1371-
"--splitting-algorithm least_duration",
1372-
"--timeout=${pytestTestTimeout}",
1373-
"--rootdir ${llmSrc}/tests/integration/defs",
1374-
"--test-prefix=${stageName}",
1375-
"--splits ${splits}",
1376-
"--group ${splitId}",
1377-
"--waives-file=${llmSrc}/tests/integration/test_lists/waives.txt",
1378-
"--test-list=${testDBList}",
1379-
"--output-dir=${WORKSPACE}/${stageName}/",
1380-
"--csv=${WORKSPACE}/${stageName}/report.csv",
1381-
"--junit-xml ${WORKSPACE}/${stageName}/results.xml",
1382-
"-o junit_logging=out-err"
1383-
]
1384-
if (perfMode) {
1385-
testCmdLine += [
1386-
"--perf",
1387-
"--perf-log-formats csv",
1388-
"--perf-log-formats yaml"
1389-
]
1390-
}
1403+
13911404
// Test Coverage
13921405
def TRTLLM_WHL_PATH = sh(returnStdout: true, script: "pip3 show tensorrt_llm | grep Location | cut -d ' ' -f 2").replaceAll("\\s","")
13931406
sh "echo ${TRTLLM_WHL_PATH}"
@@ -1401,13 +1414,19 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
14011414
echo 'source =\n ${llmSrc}/tensorrt_llm/\n ${TRTLLM_WHL_PATH}/tensorrt_llm/' >> ${coverageConfigFile}
14021415
cat ${coverageConfigFile}
14031416
"""
1404-
testCmdLine += [
1405-
"--cov=${llmSrc}/examples/",
1406-
"--cov=${llmSrc}/tensorrt_llm/",
1407-
"--cov=${TRTLLM_WHL_PATH}/tensorrt_llm/",
1408-
"--cov-report=",
1409-
"--cov-config=${coverageConfigFile}"
1410-
]
1417+
echoNodeAndGpuInfo(pipeline, stageName)
1418+
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C; fi'
1419+
def pytestCommand = getPytestBaseCommand(
1420+
llmSrc,
1421+
stageName,
1422+
testDBList,
1423+
splits,
1424+
split_id,
1425+
perfMode,
1426+
"${WORKSPACE}/${stageName}",
1427+
TRTLLM_WHL_PATH,
1428+
coverageConfigFile
1429+
)
14111430

14121431
def containerPIP_LLM_LIB_PATH = sh(script: "pip3 show tensorrt_llm | grep \"Location\" | awk -F\":\" '{ gsub(/ /, \"\", \$2); print \$2\"/tensorrt_llm/libs\"}'", returnStdout: true).replaceAll("\\s","")
14131432
def containerLD_LIBRARY_PATH = sh(script: "echo \${LD_LIBRARY_PATH}", returnStdout: true).replaceAll("\\s","")
@@ -1428,9 +1447,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
14281447
sh "env | sort"
14291448
try {
14301449
sh """
1431-
rm -rf ${stageName}/ && \
1432-
cd ${llmSrc}/tests/integration/defs && \
1433-
${testCmdLine.join(" ")}
1450+
rm -rf $stageName/ && \
1451+
cd $llmSrc/tests/integration/defs && \
1452+
$pytestCommand
14341453
"""
14351454
} catch (InterruptedException e) {
14361455
throw e

jenkins/scripts/slurm_run.sh

Lines changed: 51 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,28 @@
22
cd $resourcePathNode
33
llmSrcNode=$resourcePathNode/TensorRT-LLM/src
44

5+
set_value_in_command() {
6+
# Parameters
7+
local key="$1"
8+
local value="$2"
9+
local command="$3"
10+
11+
# Transform the key
12+
local placeholder="__PLACEHOLDER_${key}__"
13+
14+
# Check if placeholder exists
15+
if [[ "$command" != *"$placeholder"* ]]; then
16+
echo "Error: placeholder '$placeholder' not found in the command" >&2
17+
return 1
18+
fi
19+
20+
# Replace all occurrences
21+
local result="${command//${placeholder}/${value}}"
22+
23+
# Return the result
24+
echo "$result"
25+
}
26+
527
resultsPath=$jobWorkspace/results
628
mkdir -p $resultsPath
729
if [ $SLURM_LOCALID -eq 0 ]; then
@@ -15,51 +37,30 @@ if [ $SLURM_LOCALID -eq 0 ]; then
1537
cd $resourcePathNode && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
1638
git config --global --add safe.directory "*"
1739
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
18-
echo "HOST_NODE_NAME = $HOST_NODE_NAME ; GPU_UUIDS = =$gpuUuids ; STAGE_NAME = $stageName"
40+
echo "HOST_NODE_NAME = $(hostname) ; GPU_UUIDS = =$gpuUuids ; STAGE_NAME = $stageName"
1941
touch install_lock.lock
2042
else
2143
while [ ! -f install_lock.lock ]; do
2244
sleep 5
2345
done
2446
fi
25-
testList="$testList_$splitId"
26-
export CPP_TEST_TIMEOUT_OVERRIDDEN=7200
27-
export LLM_ROOT=$llmSrcNode
28-
export LLM_MODELS_ROOT=$MODEL_CACHE_DIR
47+
2948
export UCX_TLS=^gdr_copy
3049
cd $llmSrcNode/tests/integration/defs
3150
testCmdLines=()
3251
if [ $nodeCount -gt 1 ]; then
3352
testCmdLines+=(
34-
"$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
35-
)
36-
fi
37-
testCmdLines+=(
38-
"pytest"
39-
"-v"
40-
"--timeout=$pytestTestTimeout"
41-
"--test-list=$testListPathNode"
42-
"--waives-file=$waivesListPathNode"
43-
"--rootdir $llmSrcNode/tests/integration/defs"
44-
"--test-prefix=$stageName"
45-
"--splits $splits"
46-
"--group $splitId"
47-
"--output-dir=$jobWorkspace/"
48-
"--csv=$resultsPath/report.csv"
49-
"--junit-xml $resultsPath/results.xml"
50-
"-o junit_logging=out-err"
51-
)
52-
if [ "$perfMode" = "true" ]; then
53-
testCmdLines+=(
54-
"--perf"
55-
"--perf-log-formats csv"
56-
"--perf-log-formats yaml"
5753
)
54+
pytestCommand=$(set_value_in_command "pytest" "$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch" "$pytestCommand")
5855
fi
56+
57+
# get trtllm wheel path and add to pytest command
5958
trtllmWhlPath=$(pip3 show tensorrt_llm | grep Location | cut -d ' ' -f 2)
6059
trtllmWhlPath=$(echo "$trtllmWhlPath" | sed 's/[[:space:]]+/_/g')
6160
echo "TRTLLM WHEEL PATH: $trtllmWhlPath"
62-
# generate .coveragerc in workspace
61+
pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytestCommand")
62+
63+
# generate .coveragerc in workspace and add file path to pytest command
6364
cat << EOF > $jobWorkspace/.coveragerc
6465
[run]
6566
branch = True
@@ -69,14 +70,8 @@ source =
6970
$llmSrcNode/tensorrt_llm/
7071
$trtllmWhlPath/tensorrt_llm/
7172
EOF
73+
pytestCommand=$(set_value_in_command "coverageConfigFile" "$jobWorkspace/.coveragerc" "$pytestCommand")
7274

73-
testCmdLines+=(
74-
"--cov=$llmSrcNode/examples/"
75-
"--cov=$llmSrcNode/tensorrt_llm/"
76-
"--cov=$trtllmWhlPath/tensorrt_llm/"
77-
"--cov-report="
78-
"--cov-config=$jobWorkspace/.coveragerc"
79-
)
8075
containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
8176
containerPipLLMLibPath=$(echo "$containerPipLLMLibPath" | sed 's/[[:space:]]+/_/g')
8277
containerLDLibPath=$LD_LIBRARY_PATH
@@ -89,7 +84,24 @@ export LD_LIBRARY_PATH=$containerLDLibPath
8984
echo "Library Path:"
9085
echo "$LD_LIBRARY_PATH"
9186
env | sort
92-
fullCmd="${testCmdLines[*]}"
9387
echo "Running: $testCase"
94-
echo "Full Command: $fullCmd"
95-
eval $fullCmd
88+
echo "Full Command: $pytestCommand"
89+
eval $pytestCommand
90+
91+
if [ "$perfMode" = "true" ]; then
92+
if [[ "$stageName" == *PyTorch* ]]; then
93+
basePerfFilename="base_perf_pytorch.csv"
94+
else
95+
basePerfFilename="base_perf.csv"
96+
fi
97+
basePerfPath="$llmSrcNode/tests/integration/defs/perf/$basePerfFilename"
98+
echo "Check perf result"
99+
python3 $llmSrcNode/tests/integration/defs/perf/sanity_perf_check.py \
100+
$stageName/perf_script_test_results.csv \
101+
$basePerfPath
102+
echo "Check perf report"
103+
python3 $llmSrcNode/tests/integration/defs/perf/create_perf_comparison_report.py \
104+
--output_path $stageName/report.pdf \
105+
--files $stageName/perf_script_test_results.csv \
106+
$basePerfPath
107+
fi

0 commit comments

Comments
 (0)