-@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _
+@Library(['bloom-jenkins-shared-lib@dev-yuanjingx-slurm_refactor', 'trtllm-jenkins-shared-lib@main']) _
 
 import java.lang.InterruptedException
 import groovy.transform.Field
@@ -133,8 +133,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
     }
 }
 
-// TODO: consolidate slurm related code for both multi nodes and single nodes
-def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jobUID) {
+def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String jobUID) {
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
         def remote = [
             ip: cluster.ip,
@@ -158,139 +157,20 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
     }
 }
 
-def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName) {
-    withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
-        def remote = [
-            ip: cluster.ip,
-            host: cluster.host,
-            user: "${pipeline.USERNAME}",
-            passwd: "${pipeline.PASSWORD}",
-            allowAnyHosts: true,
-        ]
-
-        Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
-        pipeline.stage('Clean up SLURM Agent Resources') {
-            Utils.exec(
-                pipeline,
-                timeout: false,
-                script: Utils.sshUserCmd(
-                    remote,
-                    "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
-                )
-            )
-            Utils.exec(pipeline, script: "echo done")
-        }
-    }
-}
-
-def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312", runner)
-{
-    runner {
-        // TODO: refactor the finallyRunner to reuse within slurm or nonslurm job.
-        cacheErrorAndUploadResult(stageName, {
-            runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver)
-        }, {
-            // If the execution test list is null, remove the test result xml
-            sh """
-                ls -all ${stageName}/
-                if ! grep -q '<testcase' ${stageName}/results.xml; then
-                    rm ${stageName}/results.xml || true
-                fi
-            """
-            def llmPath = sh(script: "realpath .", returnStdout: true).trim()
-            def llmSrc = "${llmPath}/${LLM_ROOT}${config}/TensorRT-LLM/src"
-            // CPP tests will generate test result in ${llmSrc}/cpp/build_backup/, move these files to job result folder
-            sh "ls -all ${llmSrc}/cpp/build_backup/ || true"
-            sh "ls -all ${llmSrc}/cpp/build/ || true"
-            // Sed for CPP test result
-            sh "cd ${llmSrc}/cpp/build_backup/ && sed -i 's/\" classname=\"/\" classname=\"${stageName}./g' *.xml || true"
-            sh "cd ${llmSrc}/cpp/build_backup/ && sed -i 's/testsuite name=\"[^\"]*\"/testsuite name=\"${stageName}\"/g' *.xml || true"
-            // Sed for Pytest result
-            sh "cd ${stageName} && sed -i 's/testsuite name=\"pytest\"/testsuite name=\"${stageName}\"/g' *.xml || true"
-            // Copy CPP test result
-            sh "cp ${llmSrc}/cpp/build_backup/*.xml ${stageName} || true"
-            sh "ls ${stageName}/ -all"
-        })
-    }
-}
-
-def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, skipInstallWheel=false, cpver="cp312")
-{
-    SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
-    SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]
-
-    def nodeName = "${cluster.host}-test-${UUID.randomUUID().toString()}"
-    def nodeSecret = CloudManager.createNode(nodeName)
-
-    try {
-        // Run ssh command to start node in desired cluster via SLURM
-        withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
-            def remote = [
-                ip: cluster.ip,
-                host: cluster.host,
-                user: "${pipeline.USERNAME}",
-                passwd: "${pipeline.PASSWORD}",
-                allowAnyHosts: true,
-            ]
-
-            Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
-            stage('Request Node via SLURM') {
-                println("Selected Cluster: ${cluster.name}")
-
-                def jenkinsSetupPath = Utils.copyLibraryResource(pipeline, "slurm_jenkins_agent_setup.sh")
-
-                Utils.exec(pipeline, script: "chmod +x ${jenkinsSetupPath}", returnStdout: true)
-
-                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh",)
-
-                Utils.exec(
-                    pipeline,
-                    timeout: false,
-                    script: Utils.sshUserCmd(
-                        remote,
-                        """${SlurmConfig.generateCommand(cluster, partition, nodeSecret, nodeName, Jenkins.instance.rootUrl)}"""
-                    )
-                )
-                Utils.exec(pipeline, script: "echo Sleeping to allow agent initialization; sleep 30")
-            }
-        }
-
-        stage('Checking if the Node is Online') {
-            def counter = 0
-            while (!CloudManager.isNodeOnline(nodeName) && counter < 12) {
-                sleep(time: 10, unit: 'MINUTES') // Wait 10 minutes to check status of the node again
-                counter++
-            }
-
-            if (CloudManager.isNodeOnline(nodeName)) {
-                def dockerArgs = "--gpus ${gpuCount} --cap-add=SYS_ADMIN --ipc=host --security-opt seccomp=unconfined -u root:root -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
-
-                if (partition.clusterName == "dlcluster") {
-                    dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
-                }
-                slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, false)
-                executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
-            } else {
-                echo "The node does not come online in 2 hours, terminating the job"
-            }
-        }
-    } finally {
-        cleanUpNodeResources(pipeline, cluster, nodeName)
-        CloudManager.destroyNode(nodeName)
-    }
-}
-
 def getNodeArgs(int nodeCount, int gpuCount) {
     int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue()
-    return [
+    return nodeCount == 1 ? [
+        "--nodes=${nodeCount}",
+        "--gpus=${gpuCount}"
+    ] : [
         "--nodes=${nodeCount}",
         "--ntasks=${gpuCount}",
         "--ntasks-per-node=${gpusPerNode}",
         "--gpus-per-node=${gpusPerNode}",
-    ].join(" ")
+    ]
 }
 
-def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=2, skipInstallWheel=false, cpver="cp312")
+def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=1, skipInstallWheel=false, cpver="cp312")
 {
     SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
     SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]
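Illustrative sketch (not part of the diff): the consolidated `getNodeArgs` now returns a list of Slurm arguments instead of a pre-joined string, and the single-node case drops the per-node task layout. A minimal, self-contained Groovy example with made-up GPU/node counts:

```groovy
// Standalone sketch, assuming the getNodeArgs shown in the diff above.
def getNodeArgs(int nodeCount, int gpuCount) {
    int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue()
    return nodeCount == 1 ? [
        "--nodes=${nodeCount}",
        "--gpus=${gpuCount}"
    ] : [
        "--nodes=${nodeCount}",
        "--ntasks=${gpuCount}",
        "--ntasks-per-node=${gpusPerNode}",
        "--gpus-per-node=${gpusPerNode}",
    ]
}

// Single node: plain --gpus request, no per-node task layout.
println getNodeArgs(1, 4)   // [--nodes=1, --gpus=4]
// Multi node: one task per GPU, spread evenly across nodes.
println getNodeArgs(2, 8)   // [--nodes=2, --ntasks=8, --ntasks-per-node=4, --gpus-per-node=4]
```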
@@ -324,6 +204,8 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
     def scriptRunNode = "${jobWorkspace}/slurm_run.sh"
     def testListPathNode = "${jobWorkspace}/${testList}.txt"
     def waivesListPathNode = "${jobWorkspace}/waives.txt"
+    def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
+    def scriptLaunchPathNode = "${jobWorkspace}/slurm_launch.sh"
     def isAarch64 = config.contains("aarch64")
     def pytestTestTimeout = "7200"
 
@@ -337,43 +219,54 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
 
                 // Upload slurm_run_sh to Frontend node
                 def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
-                Utils.exec(pipeline, script: "chmod +x ${scriptRunLocalPath}", returnStdout: true)
-                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",)
+
+                Utils.copyScriptToRemoteHost(
+                    pipeline,
+                    remote,
+                    scriptRunLocalPath,
+                    scriptRunNode,
+                    true
+                )
 
                 // Upload waives.txt to Frontend node
-                def waivesListLocalPath = "${llmSrcLocal}/tests/integration/test_lists/waives.txt"
-                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}",)
+                def waivesListPathLocal = "${llmSrcLocal}/tests/integration/test_lists/waives.txt"
+                Utils.copyScriptToRemoteHost(
+                    pipeline,
+                    remote,
+                    waivesListPathLocal,
+                    waivesListPathNode
+                )
 
                 // Generate Test List and Upload to Frontend Node
                 def makoArgs = getMakoArgsFromStageName(stageName, true)
                 // TODO: currently the options will only be processed if the first
                 // line is "Mako options:", maybe we can make it more generic, which
                 // if the line cannot be split by "=", just ignore that line.
                 def makoOptsJson = transformMakoArgsToJson(["Mako options:"] + makoArgs)
-                def testListPath = renderTestDB(testList, llmSrcLocal, stageName, makoOptsJson)
-                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${testListPath} ${remote.user}@${remote.host}:${testListPathNode}",)
+                def testListPathLocal = renderTestDB(testList, llmSrcLocal, stageName, makoOptsJson)
+                Utils.copyScriptToRemoteHost(
+                    pipeline,
+                    remote,
+                    testListPathLocal,
+                    testListPathNode
+                )
 
                 // Generate Multi Node Job Launch Script
                 def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
                 def mounts = "/home/scratch.trt_llm_data:/scratch.trt_llm_data:ro,/home/svc_tensorrt/bloom/scripts:/home/svc_tensorrt/bloom/scripts"
-                String taskArgs = getNodeArgs(nodeCount, gpuCount)
+                String[] taskArgs = getNodeArgs(nodeCount, gpuCount)
 
                 if (taskArgs == null) {
                     error "Invalid multinode task stage name is set"
                 }
-
-                taskArgs = [
-                    taskArgs,
-                    "--exclusive",
+                taskArgs = [
+                    *taskArgs,
                     "--container-image=${container}",
                     "--container-workdir=/home/svc_tensorrt/bloom/scripts",
                     "--container-mounts=${mounts}",
                     "--container-env=NVIDIA_IMEX_CHANNELS"
-                ].join(" ")
+                ]
 
-                def scriptLaunch = "/home/svc_tensorrt/bloom/scripts/${jobUID}/slurm_launch.sh"
-                def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
-                scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
                 def scriptContent = """#!/bin/bash
                     export jobWorkspace=$jobWorkspace
                     export tarName=$tarName
@@ -388,30 +281,82 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
                     export splitId=$splitId
                     export perfMode=$perfMode
                     export resourcePathNode=$resourcePathNode
+                    export nodeCount=$nodeCount
                     export MODEL_CACHE_DIR=$MODEL_CACHE_DIR
                     export NVIDIA_IMEX_CHANNELS=0
                     chmod +x ${scriptRunNode}
-                    ${srunCmd}
-                """.stripIndent()
-                pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent)
-                Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)
-                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",)
+                """
+                if (nodeCount > 1) {
+                    taskArgs = [
+                        *taskArgs,
+                        "--mpi=pmi2",
+                    ]
+                    def runTestCmd = SlurmConfig.generateTrtllmCommand("srun", partition, taskArgs.join(" "), scriptRunNode)
+                    scriptContent += """
+                        ${runTestCmd}
+                    """
+                } else {
+                    String outputPath = "${jobWorkspace}/job-output.log"
+                    taskArgs = [
+                        *taskArgs,
+                        "--output=${outputPath}",
+                    ]
+                    def runTestCmd = SlurmConfig.generateTrtllmCommand("sbatch", partition, taskArgs.join(" "), scriptRunNode)
+                    scriptContent += """
+                        touch ${outputPath}
+                        jobId=\$(${runTestCmd} | awk '{print \$4}')
+                        if [ -z "\$jobId" ]; then
+                            echo "Error: Job submission failed, no job ID returned."
+                            exit 1
+                        fi
+                        echo "Submitted job \$jobId"
+                        tail -f ${outputPath} &
+                        tailPid=\$!
+                        # Wait until the sbatch job is done
+                        while squeue -j \$jobId -o %T > /dev/null 2>&1; do
+                            sleep 300
+                        done
+                        # Kill the tail -f process
+                        kill \$tailPid
+                        # Check if the job failed or not
+                        EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')
+                        if [ "\$EXIT_CODE" -ne 0 ]; then
+                            echo "Pytest failed in Slurm job \$jobId with exit code \$EXIT_CODE"
+                            exit \$EXIT_CODE
+                        fi
+                    """
+                }
+                scriptContent = scriptContent.replaceAll('\t', '').stripIndent()
+                pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
+                Utils.copyScriptToRemoteHost(
+                    pipeline,
+                    remote,
+                    scriptLaunchPathLocal,
+                    scriptLaunchPathNode
+                )
             }
             stage('Run Test') {
-                def scriptLaunch = "${jobWorkspace}/slurm_launch.sh"
                 Utils.exec(
                     pipeline,
                     timeout: false,
                     script: Utils.sshUserCmd(
                         remote,
-                        """bash ${scriptLaunch}"""
+                        """bash ${scriptLaunchPathNode}"""
                     )
                 )
             }
         }
     } finally {
         uploadResults(pipeline, cluster, jobUID, stageName)
-        cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID)
+        // cleanUpNodeResources(pipeline, cluster, jobUID)
     }
 }
 
@@ -1934,7 +1879,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
-        runLLMTestlistOnSlurm_MultiNodes(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2)
+        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2)
     }]]}
 
     parallelJobs += parallelMultiNodesSBSAJobs
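For context, an illustrative usage sketch (not part of the diff): with the multi-node variant folded into `runLLMTestlistOnSlurm`, both kinds of stages go through the same entry point, and `nodeCount` decides whether the generated launch script submits via `sbatch` (single node, log tailed until completion) or runs via `srun --mpi=pmi2` (multi node). Stage names, platform keys, and counts below are invented for illustration; the parameter order is assumed from the signature in the diff.

```groovy
// Hypothetical call sites; all concrete values are made up.
// Assumed signature: runLLMTestlistOnSlurm(pipeline, platform, testList, config, perfMode,
//                                          stageName, splitId, splits, gpuCount, nodeCount)

// Single-node stage: nodeCount == 1, so the launch script is submitted with sbatch.
runLLMTestlistOnSlurm(pipeline, "b200-x4", "l0_b200", VANILLA_CONFIG, false,
                      "B200-4_GPUs-PyTorch-1", 1, 2, 4, 1)

// Multi-node stage: nodeCount > 1, so the test is launched with srun across the allocation.
runLLMTestlistOnSlurm(pipeline, "gb200-multi-node", "l0_gb200_multi_nodes", VANILLA_CONFIG, false,
                      "GB200-8_GPUs-2_Nodes-PyTorch-1", 1, 1, 8, 2)
```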