diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 363dcf322fa..df87ee70e41 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -154,16 +154,13 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo "-e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " + "-e 's/.*srun: job \\([0-9]\\+\\) queued.*/\\1/p' " + "-e 's/.*srun: job \\([0-9]\\+\\) has been allocated.*/\\1/p' " + - "${slurmOutputFile} | tail -n1\"" + "-e 's/.*SLURM_JOB_ID=\\([0-9]\\+\\).*/\\1/p' " + + "-e 's/.*SLURM_JOBID=\\([0-9]\\+\\).*/\\1/p' " + + "${slurmOutputFile} | tail -n1 || true\"" ), returnStdout: true ).trim() - if (!slurmJobID || !slurmJobID.isNumber()) { - Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile}\"")) - error("Slurm job did not submit successfully. No job ID found.") - } - Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}") Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30") @@ -180,10 +177,19 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo pipeline, script: Utils.sshUserCmd( remote, - "rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID}" + "\"rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID} || true\"" ) ) + if (!slurmJobID || !slurmJobID.isNumber()) { + Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile} || true\"")) + echo "Slurm job did not submit successfully. No job ID found." + } else { + // The original Slurm output file name is like "slurm-%j-*.out", we need to replace the %j with the real job ID. + def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID) + Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mv ${slurmOutputFile} ${newSlurmOutputFile} || true\"")) + } + Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID} cleaned up") } } @@ -198,6 +204,12 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St allowAnyHosts: true, ] + Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30") + + CloudManager.destroyNode(nodeName) + + Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30") + Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client") Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}") @@ -214,7 +226,7 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St pipeline, script: Utils.sshUserCmd( remote, - "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh" + "\"rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh || true\"" ) ) @@ -308,13 +320,17 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p if (m1) ids << m1[0][1] // Extract the first captured group def m2 = (line =~ /srun: job (\d+) (queued|has been allocated)/) if (m2) ids << m2[0][1] // Extract the first captured group + def m3 = (line =~ /SLURM_JOB_ID=(\d+)/) + if (m3) ids << m3[0][1] // Extract the first captured group + def m4 = (line =~ /SLURM_JOBID=(\d+)/) + if (m4) ids << m4[0][1] // Extract the first captured group return ids } slurmJobID = jobIDs ? jobIDs[-1] : null if (!slurmJobID || !slurmJobID.isNumber()) { - error("Slurm job did not submit successfully. 
No job ID found.\nSubmission output:\n${slurmSubmitOutput}")
+        echo "Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}"
     }
     Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
     Utils.exec(pipeline, script: "echo Sleeping to allow agent initialization; sleep 30")
@@ -332,6 +348,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
         }
 
         if (CloudManager.isNodeOnline(nodeName)) {
+            def dockerGPUOption = ""
+
             node(nodeName) {
                 sh """
                     env | sort
@@ -339,9 +357,28 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                     ls -alh ${env.WORKSPACE}
                     ls -alh ${env.WORKSPACE_TMP}
                 """
+
+                sh "nproc && free -g && hostname"
+                echoNodeAndGpuInfo(pipeline, stageName)
+                sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
+                // Use single quotes to avoid Jenkins variable expansion
+                sh 'echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"'
+                sh 'echo "NV_GPU: $NV_GPU"'
+
+                // Dynamically set GPU arguments based on environment variables
+                // https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
+                dockerGPUOption = sh(script: """
+                    if [ -n "\$NV_GPU" ]; then
+                        echo "--gpus '\\"device=\$NV_GPU\\"'"
+                    elif [ -n "\$CUDA_VISIBLE_DEVICES" ]; then
+                        echo "--gpus '\\"device=\$CUDA_VISIBLE_DEVICES\\"'"
+                    else
+                        echo "--gpus ${gpuCount}"
+                    fi
+                """, returnStdout: true).trim()
             }
 
-            def dockerArgs = "--gpus ${gpuCount} " +
+            def dockerArgs = "${dockerGPUOption} " +
                              "--cap-add=SYS_ADMIN " +
                              "--ipc=host " +
                              "--security-opt seccomp=unconfined " +
@@ -351,6 +388,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                              "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
                              "--cap-add syslog"
 
+            echo "Final dockerArgs: ${dockerArgs}"
+
             if (partition.clusterName == "dlcluster") {
                 dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
             }
@@ -362,11 +401,15 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                 }
             }
         } finally {
-            stage('Clean up SLURM Resources') {
-                Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")
-                CloudManager.destroyNode(nodeName)
-                Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")
-                cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
+            stage("Clean up SLURM Resources") {
+                // Workaround to handle interruptions during SLURM resource cleanup
+                retry(3) {
+                    try {
+                        cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
+                    } catch (Exception e) {
+                        error "Error during SLURM resource cleanup: ${e.getMessage()}; retrying."
+                    }
+                }
             }
         }
     }
@@ -420,7 +463,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
         def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
         def scriptRunNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
         def scriptLaunch = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
-        slurmOutputFile = "${jobWorkspace}/${jobUID}-slurm_output.log"
+        slurmOutputFile = SlurmConfig.getOutputFilePath("/home/svc_tensorrt/slurm-logs", jobUID)
         def testListPathNode = "${jobWorkspace}/${testList}.txt"
         def waivesListPathNode = "${jobWorkspace}/waives.txt"
         def isAarch64 = config.contains("aarch64")
@@ -474,6 +517,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
             def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
             scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
+            // TODO: check whether tee always returns 0
             def scriptContent = """#!/bin/bash
                 export jobWorkspace=$jobWorkspace
                 export tarName=$tarName
@@ -515,8 +559,15 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
         } finally {
             uploadResults(pipeline, cluster, jobUID, stageName)
 
-            stage('Clean up SLURM Resources') {
-                cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
+            stage("Clean up SLURM Resources") {
+                // Workaround to handle interruptions during SLURM resource cleanup
+                retry(3) {
+                    try {
+                        cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
+                    } catch (Exception e) {
+                        error "Error during SLURM resource cleanup: ${e.getMessage()}; retrying."
+                    }
+                }
             }
         }
     }
@@ -644,7 +695,7 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
         if (stageIsInterrupted) {
            echo "Stage is interrupted, skip to upload test result."
} else { - sh 'if [ "$(id -u)" -eq 0 ]; then dmesg; fi' + sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi' if (noResultIfSuccess && !stageIsFailed) { // Clean up the workspace sh """ @@ -912,7 +963,14 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod def echoNodeAndGpuInfo(pipeline, stageName) { - String hostNodeName = sh(script: 'echo $HOST_NODE_NAME', returnStdout: true) + String hostNodeName = sh(script: ''' + if [ -n "$HOST_NODE_NAME" ]; then + echo "$HOST_NODE_NAME" + else + hostname -f || hostname + fi + ''', returnStdout: true).trim() + String gpuUuids = pipeline.sh(script: "nvidia-smi -q | grep \"GPU UUID\" | awk '{print \$4}' | tr '\n' ',' || true", returnStdout: true) pipeline.echo "HOST_NODE_NAME = ${hostNodeName} ; GPU_UUIDS = ${gpuUuids} ; STAGE_NAME = ${stageName}" } @@ -986,7 +1044,7 @@ def launchTestListCheck(pipeline) trtllm_utils.llmExecStepWithRetry(pipeline, script: """apt-get update && apt-get install \ libffi-dev \ -y""") - sh "nvidia-smi -q" + sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m" // download TRT-LLM tarfile def tarName = BUILD_CONFIGS[VANILLA_CONFIG][TARNAME] def llmTarfile = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/${tarName}" @@ -1394,8 +1452,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO sh "nproc && free -g && hostname" echoNodeAndGpuInfo(pipeline, stageName) sh "cat ${MODEL_CACHE_DIR}/README" - sh "nvidia-smi -q" - sh "nvidia-smi topo -m" + sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m" sh "df -h" // setup HF_HOME to cache model and datasets @@ -1526,7 +1583,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO stage ("[${stageName}] Run Pytest") { echoNodeAndGpuInfo(pipeline, stageName) - sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C; fi' + sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi' def extraInternalEnv = "" def pytestTestTimeout = "3600" @@ -1771,7 +1828,7 @@ def runPackageSanityCheck(pipeline, wheel_path, reinstall_dependencies=false, cp sh "nproc && free -g && hostname" sh "bash -c 'pip3 show tensorrt || true'" sh "cat ${MODEL_CACHE_DIR}/README" - sh "nvidia-smi -q" + sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m" sh "pwd && ls -alh" trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget -nv ${whlUrl}") @@ -1822,33 +1879,26 @@ def checkStageName(stageNames) { } } -// TODO: Update existing functions to use runInDockerOnNodeMultiStage and get rid of runInDockerOnNode def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true) { return { runner -> node(label) { - if (needToDeleteDir) { - deleteDir() - } - stage('Pull Docker Image') { - docker.image(image).pull() - } - docker.image(image).inside(dockerArgs) { - runner() - } - } - } -} - -def runInDockerOnNode(image, label, dockerArgs) -{ - return { - stageName, runner -> stage(stageName) { - node(label) { - deleteDir() + try { + if (needToDeleteDir) { + deleteDir() + } + stage('Pull Docker Image') { + docker.image(image).pull() + } docker.image(image).inside(dockerArgs) { runner() } + } catch (Exception e) { + if (e.getMessage()?.contains("Failed to kill container")) { + echo "Known benign error ignored: ${e.getMessage()}" + } else { + throw e // Re-throw if it's a different IOException + } } } } @@ -1866,10 +1916,8 @@ def runInKubernetes(pipeline, podSpec, containerName) } } -def launchTestJobs(pipeline, testFilter, dockerNode=null) +def launchTestJobs(pipeline, testFilter) { - def dockerArgs = "-v 
/mnt/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog" - // IMPORTANT: Stage Configuration Syntax Requirement // // The test_to_stage_mapping.py script expects stage definitions in the following format: @@ -2171,12 +2219,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) def buildRunner = runInKubernetes(pipeline, buildSpec, "trt-llm") def sanityRunner = null - if (dockerNode) { - sanityRunner = runInDockerOnNode(values[0], dockerNode, dockerArgs) - } else { - def sanitySpec = createKubernetesPodConfig(values[0], gpu_type, k8s_arch) - sanityRunner = runInKubernetes(pipeline, sanitySpec, "trt-llm") - } + + def sanitySpec = createKubernetesPodConfig(values[0], gpu_type, k8s_arch) + sanityRunner = runInKubernetes(pipeline, sanitySpec, "trt-llm") def wheelPath = "${values[4]}" def wheelName = "" @@ -2420,17 +2465,10 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) stage("Skip - reused") { echo "Skip - Passed in the last pipeline." } - } else if (values instanceof List && dockerNode == null) { + } else if (values instanceof List) { trtllm_utils.launchKubernetesPod(pipeline, values[0], "trt-llm", { values[1]() }) - } else if (values instanceof List && dockerNode != null) { - node(dockerNode) { - deleteDir() - docker.image(LLM_DOCKER_IMAGE).inside(dockerArgs) { - values[1]() - } - } } else { values() } diff --git a/jenkins/scripts/slurm_run.sh b/jenkins/scripts/slurm_run.sh index 7ae3de4f961..af171ba8776 100755 --- a/jenkins/scripts/slurm_run.sh +++ b/jenkins/scripts/slurm_run.sh @@ -22,7 +22,7 @@ if [ $SLURM_LOCALID -eq 0 ]; then which python3 python3 --version apt-get install -y libffi-dev - nvidia-smi + nvidia-smi && nvidia-smi -q && nvidia-smi topo -m cd $llmSrcNode && pip3 install --retries 1 -r requirements-dev.txt cd $resourcePathNode && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl git config --global --add safe.directory "*"
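
For reference, the dynamic GPU-flag selection added in runLLMTestlistOnSlurm can be exercised outside Jenkins as a standalone script. This is a minimal sketch, assuming gpuCount mirrors the Groovy variable of the same name (the stage's requested GPU count) and that NV_GPU / CUDA_VISIBLE_DEVICES are exported the way Slurm and the NVIDIA Container Toolkit normally set them; the script and variable names here are illustrative only, not part of the pipeline.

#!/bin/bash
# Standalone sketch of the GPU-flag selection logic in runLLMTestlistOnSlurm.
# Assumption: gpuCount defaults to 1 when the caller does not export it.
gpuCount="${gpuCount:-1}"

if [ -n "$NV_GPU" ]; then
    # GPUs granted by Slurm, e.g. NV_GPU=0,1 -> --gpus '"device=0,1"'
    dockerGPUOption="--gpus '\"device=$NV_GPU\"'"
elif [ -n "$CUDA_VISIBLE_DEVICES" ]; then
    dockerGPUOption="--gpus '\"device=$CUDA_VISIBLE_DEVICES\"'"
else
    # No explicit device list exported: fall back to a plain GPU count.
    dockerGPUOption="--gpus $gpuCount"
fi

echo "$dockerGPUOption"

Running it with NV_GPU=0,1 prints --gpus '"device=0,1"', the same string the pipeline captures via sh(..., returnStdout: true).trim(); the single quotes keep the inner double quotes intact when the docker arguments are later re-expanded by a shell.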