diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 57a39141690..df87ee70e41 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -154,6 +154,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
             "-e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " +
             "-e 's/.*srun: job \\([0-9]\\+\\) queued.*/\\1/p' " +
             "-e 's/.*srun: job \\([0-9]\\+\\) has been allocated.*/\\1/p' " +
+            "-e 's/.*SLURM_JOB_ID=\\([0-9]\\+\\).*/\\1/p' " +
+            "-e 's/.*SLURM_JOBID=\\([0-9]\\+\\).*/\\1/p' " +
             "${slurmOutputFile} | tail -n1 || true\""
         ),
         returnStdout: true
@@ -183,6 +185,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
         Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile} || true\""))
         echo "Slurm job did not submit successfully. No job ID found."
     } else {
+        // The original Slurm output file name is like "slurm-%j-*.out"; replace the %j with the real job ID.
         def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID)
         Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mv ${slurmOutputFile} ${newSlurmOutputFile} || true\""))
     }
@@ -317,6 +320,10 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
         if (m1) ids << m1[0][1] // Extract the first captured group
         def m2 = (line =~ /srun: job (\d+) (queued|has been allocated)/)
         if (m2) ids << m2[0][1] // Extract the first captured group
+        def m3 = (line =~ /SLURM_JOB_ID=(\d+)/)
+        if (m3) ids << m3[0][1] // Extract the first captured group
+        def m4 = (line =~ /SLURM_JOBID=(\d+)/)
+        if (m4) ids << m4[0][1] // Extract the first captured group
         return ids
     }
 
@@ -341,6 +348,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
         }
 
         if (CloudManager.isNodeOnline(nodeName)) {
+            def dockerGPUOption = ""
+
             node(nodeName) {
                 sh """
                     env | sort
@@ -348,9 +357,28 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                     ls -alh ${env.WORKSPACE}
                     ls -alh ${env.WORKSPACE_TMP}
                 """
+
+                sh "nproc && free -g && hostname"
+                echoNodeAndGpuInfo(pipeline, stageName)
+                sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
+                // Use single quotes to avoid Jenkins variable expansion
+                sh 'echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"'
+                sh 'echo "NV_GPU: $NV_GPU"'
+
+                // Dynamically set GPU arguments based on environment variables
+                // https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
+                dockerGPUOption = sh(script: """
+                    if [ -n "\$NV_GPU" ]; then
+                        echo "--gpus '\\"device=\$NV_GPU\\"'"
+                    elif [ -n "\$CUDA_VISIBLE_DEVICES" ]; then
+                        echo "--gpus '\\"device=\$CUDA_VISIBLE_DEVICES\\"'"
+                    else
+                        echo "--gpus ${gpuCount}"
+                    fi
+                """, returnStdout: true).trim()
             }
 
-            def dockerArgs = "--gpus ${gpuCount} " +
+            def dockerArgs = "${dockerGPUOption} " +
                 "--cap-add=SYS_ADMIN " +
                 "--ipc=host " +
                 "--security-opt seccomp=unconfined " +
@@ -360,6 +388,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                 "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
                 "--cap-add syslog"
 
+            echo "Final dockerArgs: ${dockerArgs}"
+
             if (partition.clusterName == "dlcluster") {
                 dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
             }
@@ -370,12 +400,6 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                 error "The Slurm node does not come online in the waiting period. Terminating the job."
             }
         }
-    } catch (Exception e) {
-        if (e.getMessage()?.contains("Failed to kill container")) {
-            echo "Known benign error ignored: ${e.getMessage()}"
-        } else {
-            throw e // Re-throw if it's a different IOException
-        }
     } finally {
         stage("Clean up SLURM Resources") {
             // Workaround to handle the interruption during clean up SLURM resources
@@ -939,7 +963,14 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
 
 def echoNodeAndGpuInfo(pipeline, stageName)
 {
-    String hostNodeName = sh(script: 'echo $HOST_NODE_NAME', returnStdout: true)
+    String hostNodeName = sh(script: '''
+        if [ -n "$HOST_NODE_NAME" ]; then
+            echo "$HOST_NODE_NAME"
+        else
+            hostname -f || hostname
+        fi
+    ''', returnStdout: true).trim()
+
     String gpuUuids = pipeline.sh(script: "nvidia-smi -q | grep \"GPU UUID\" | awk '{print \$4}' | tr '\n' ',' || true", returnStdout: true)
     pipeline.echo "HOST_NODE_NAME = ${hostNodeName} ; GPU_UUIDS = ${gpuUuids} ; STAGE_NAME = ${stageName}"
 }
@@ -1013,7 +1044,7 @@ def launchTestListCheck(pipeline)
     trtllm_utils.llmExecStepWithRetry(pipeline, script: """apt-get update && apt-get install \
         libffi-dev \
         -y""")
-    sh "nvidia-smi -q"
+    sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
     // download TRT-LLM tarfile
     def tarName = BUILD_CONFIGS[VANILLA_CONFIG][TARNAME]
     def llmTarfile = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/${tarName}"
@@ -1421,8 +1452,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
     sh "nproc && free -g && hostname"
     echoNodeAndGpuInfo(pipeline, stageName)
     sh "cat ${MODEL_CACHE_DIR}/README"
-    sh "nvidia-smi -q"
-    sh "nvidia-smi topo -m"
+    sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
     sh "df -h"
 
     // setup HF_HOME to cache model and datasets
@@ -1798,7 +1828,7 @@ def runPackageSanityCheck(pipeline, wheel_path, reinstall_dependencies=false, cp
     sh "nproc && free -g && hostname"
     sh "bash -c 'pip3 show tensorrt || true'"
     sh "cat ${MODEL_CACHE_DIR}/README"
-    sh "nvidia-smi -q"
+    sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
    sh "pwd && ls -alh"
 
     trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget -nv ${whlUrl}")
@@ -1849,33 +1879,26 @@ def checkStageName(stageNames) {
     }
 }
 
-// TODO: Update existing functions to use runInDockerOnNodeMultiStage and get rid of runInDockerOnNode
 def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
 {
     return {
         runner ->
         node(label) {
-            if (needToDeleteDir) {
-                deleteDir()
-            }
-            stage('Pull Docker Image') {
-                docker.image(image).pull()
-            }
-            docker.image(image).inside(dockerArgs) {
-                runner()
-            }
-        }
-    }
-}
-
-def runInDockerOnNode(image, label, dockerArgs)
-{
-    return {
-        stageName, runner -> stage(stageName) {
-            node(label) {
-                deleteDir()
+            try {
+                if (needToDeleteDir) {
+                    deleteDir()
+                }
+                stage('Pull Docker Image') {
+                    docker.image(image).pull()
+                }
                 docker.image(image).inside(dockerArgs) {
                     runner()
                 }
+            } catch (Exception e) {
+                if (e.getMessage()?.contains("Failed to kill container")) {
+                    echo "Known benign error ignored: ${e.getMessage()}"
+                } else {
+                    throw e // Re-throw if it's a different IOException
+                }
             }
         }
     }
@@ -1893,10 +1916,8 @@ def runInKubernetes(pipeline, podSpec, containerName)
     }
 }
 
-def launchTestJobs(pipeline, testFilter, dockerNode=null)
+def launchTestJobs(pipeline, testFilter)
 {
-    def dockerArgs = "-v /mnt/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
-
     // IMPORTANT: Stage Configuration Syntax Requirement
     //
     // The test_to_stage_mapping.py script expects stage definitions in the following format:
@@ -2044,8 +2065,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
     fullSet += SBSATestConfigs.keySet()
 
     SBSASlurmTestConfigs = [
-        // Disable GB200-PyTorch-1 due to OOM (https://nvbugspro.nvidia.com/bug/5490507)
-        //"GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 1],
+        "GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 1],
         "GB200-4_GPUs-PyTorch-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
         "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
     ]
@@ -2199,12 +2219,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
 
         def buildRunner = runInKubernetes(pipeline, buildSpec, "trt-llm")
         def sanityRunner = null
-        if (dockerNode) {
-            sanityRunner = runInDockerOnNode(values[0], dockerNode, dockerArgs)
-        } else {
-            def sanitySpec = createKubernetesPodConfig(values[0], gpu_type, k8s_arch)
-            sanityRunner = runInKubernetes(pipeline, sanitySpec, "trt-llm")
-        }
+
+        def sanitySpec = createKubernetesPodConfig(values[0], gpu_type, k8s_arch)
+        sanityRunner = runInKubernetes(pipeline, sanitySpec, "trt-llm")
 
         def wheelPath = "${values[4]}"
         def wheelName = ""
@@ -2448,17 +2465,10 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
                 stage("Skip - reused") {
                     echo "Skip - Passed in the last pipeline."
                 }
-            } else if (values instanceof List && dockerNode == null) {
+            } else if (values instanceof List) {
                 trtllm_utils.launchKubernetesPod(pipeline, values[0], "trt-llm", {
                     values[1]()
                 })
-            } else if (values instanceof List && dockerNode != null) {
-                node(dockerNode) {
-                    deleteDir()
-                    docker.image(LLM_DOCKER_IMAGE).inside(dockerArgs) {
-                        values[1]()
-                    }
-                }
             } else {
                 values()
             }
diff --git a/jenkins/scripts/slurm_run.sh b/jenkins/scripts/slurm_run.sh
index 7ae3de4f961..af171ba8776 100755
--- a/jenkins/scripts/slurm_run.sh
+++ b/jenkins/scripts/slurm_run.sh
@@ -22,7 +22,7 @@ if [ $SLURM_LOCALID -eq 0 ]; then
     which python3
     python3 --version
     apt-get install -y libffi-dev
-    nvidia-smi
+    nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
     cd $llmSrcNode && pip3 install --retries 1 -r requirements-dev.txt
     cd $resourcePathNode && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
     git config --global --add safe.directory "*"
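
For reference, a minimal standalone bash sketch of the --gpus selection that the dockerGPUOption logic in runLLMTestlistOnSlurm performs; it only mirrors the three branches in the diff above and is not part of the pipeline. GPU_COUNT and DOCKER_GPU_OPTION are illustrative names standing in for the pipeline's gpuCount and dockerGPUOption.

#!/usr/bin/env bash
# Hypothetical sketch: pick Docker GPU arguments the same way the Groovy
# snippet above does, preferring Slurm-granted devices over a plain count.
GPU_COUNT="${GPU_COUNT:-1}"   # stand-in for the pipeline's gpuCount parameter

if [ -n "$NV_GPU" ]; then
    # Devices handed out by Slurm take precedence.
    DOCKER_GPU_OPTION="--gpus '\"device=$NV_GPU\"'"
elif [ -n "$CUDA_VISIBLE_DEVICES" ]; then
    # Otherwise honor the CUDA visibility mask.
    DOCKER_GPU_OPTION="--gpus '\"device=$CUDA_VISIBLE_DEVICES\"'"
else
    # Last resort: request a GPU count rather than explicit device IDs.
    DOCKER_GPU_OPTION="--gpus $GPU_COUNT"
fi

echo "Would run: docker run $DOCKER_GPU_OPTION ..."

The quoting mirrors the --gpus '"device=1,2"' form shown in the NVIDIA Container Toolkit page linked in the diff, so comma-separated device lists reach the Docker CLI intact.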