162 changes: 100 additions & 62 deletions jenkins/L0_Test.groovy
@@ -154,16 +154,13 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
"-e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " +
"-e 's/.*srun: job \\([0-9]\\+\\) queued.*/\\1/p' " +
"-e 's/.*srun: job \\([0-9]\\+\\) has been allocated.*/\\1/p' " +
"${slurmOutputFile} | tail -n1\""
"-e 's/.*SLURM_JOB_ID=\\([0-9]\\+\\).*/\\1/p' " +
"-e 's/.*SLURM_JOBID=\\([0-9]\\+\\).*/\\1/p' " +
"${slurmOutputFile} | tail -n1 || true\""
),
returnStdout: true
).trim()
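// Illustrative only (job ID 4242 is hypothetical): the extended sed patterns above match lines such as
//   "Submitted batch job 4242"
//   "srun: job 4242 queued and waiting for resources"
//   "srun: job 4242 has been allocated resources"
//   "SLURM_JOB_ID=4242" / "SLURM_JOBID=4242"
// "tail -n1" keeps the last match, and "|| true" lets cleanup continue when the output file is missing.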

if (!slurmJobID || !slurmJobID.isNumber()) {
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile}\""))
error("Slurm job did not submit successfully. No job ID found.")
}

Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")

Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
@@ -180,10 +177,19 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
pipeline,
script: Utils.sshUserCmd(
remote,
"rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID}"
"\"rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID} || true\""
)
)

if (!slurmJobID || !slurmJobID.isNumber()) {
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile} || true\""))
echo "Slurm job did not submit successfully. No job ID found."
} else {
// The Slurm output file name contains "%j" (e.g. "slurm-%j-*.out"); replace it with the real job ID so the log file is named after the job.
def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID)
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mv ${slurmOutputFile} ${newSlurmOutputFile} || true\""))
}

Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID} cleaned up")
}
}
@@ -198,6 +204,12 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
allowAnyHosts: true,
]

Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")

CloudManager.destroyNode(nodeName)

Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")

Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")

Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
@@ -214,7 +226,7 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
pipeline,
script: Utils.sshUserCmd(
remote,
"rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
"\"rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh || true\""
)
)

@@ -308,13 +320,17 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
if (m1) ids << m1[0][1] // Extract the first captured group
def m2 = (line =~ /srun: job (\d+) (queued|has been allocated)/)
if (m2) ids << m2[0][1] // Extract the first captured group
def m3 = (line =~ /SLURM_JOB_ID=(\d+)/)
if (m3) ids << m3[0][1] // Extract the first captured group
def m4 = (line =~ /SLURM_JOBID=(\d+)/)
if (m4) ids << m4[0][1] // Extract the first captured group
return ids
}

slurmJobID = jobIDs ? jobIDs[-1] : null

if (!slurmJobID || !slurmJobID.isNumber()) {
error("Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}")
echo "Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}"
}
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
Utils.exec(pipeline, script: "echo Sleeping to allow agent initialization; sleep 30")
@@ -332,16 +348,37 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
}

if (CloudManager.isNodeOnline(nodeName)) {
def dockerGPUOption = ""

node(nodeName) {
sh """
env | sort
pwd && ls -alh
ls -alh ${env.WORKSPACE}
ls -alh ${env.WORKSPACE_TMP}
"""

sh "nproc && free -g && hostname"
echoNodeAndGpuInfo(pipeline, stageName)
sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
// Use single quotes to avoid Jenkins variable expansion
sh 'echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"'
sh 'echo "NV_GPU: $NV_GPU"'

// Dynamically set GPU arguments based on environment variables
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
dockerGPUOption = sh(script: """
if [ -n "\$NV_GPU" ]; then
echo "--gpus '\\"device=\$NV_GPU\\"'"
elif [ -n "\$CUDA_VISIBLE_DEVICES" ]; then
echo "--gpus '\\"device=\$CUDA_VISIBLE_DEVICES\\"'"
else
echo "--gpus ${gpuCount}"
fi
""", returnStdout: true).trim()
}
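// Illustrative results of the probe above (hypothetical values):
//   NV_GPU=0,1               -> --gpus '"device=0,1"'
//   CUDA_VISIBLE_DEVICES=2,3 -> --gpus '"device=2,3"'
//   neither set              -> --gpus <gpuCount>, e.g. --gpus 4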

def dockerArgs = "--gpus ${gpuCount} " +
def dockerArgs = "${dockerGPUOption} " +
"--cap-add=SYS_ADMIN " +
"--ipc=host " +
"--security-opt seccomp=unconfined " +
@@ -351,6 +388,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
"-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
"--cap-add syslog"

echo "Final dockerArgs: ${dockerArgs}"

if (partition.clusterName == "dlcluster") {
dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
}
@@ -362,11 +401,15 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
}
}
} finally {
stage('Clean up SLURM Resources') {
Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")
CloudManager.destroyNode(nodeName)
Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")
cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
stage("Clean up SLURM Resources") {
// Workaround for interruptions during SLURM resource cleanup: re-throw via error() so retry() makes another attempt
retry(3) {
try {
cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
} catch (Exception e) {
error "Error during clean up SLURM resources: ${e.getMessage()} and retrying."
}
}
}
}
}
@@ -420,7 +463,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
def scriptRunNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
def scriptLaunch = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
slurmOutputFile = "${jobWorkspace}/${jobUID}-slurm_output.log"
slurmOutputFile = SlurmConfig.getOutputFilePath("/home/svc_tensorrt/slurm-logs", jobUID)
def testListPathNode = "${jobWorkspace}/${testList}.txt"
def waivesListPathNode = "${jobWorkspace}/waives.txt"
def isAarch64 = config.contains("aarch64")
@@ -474,6 +517,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL

def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
// TODO: check whether tee always returns 0
def scriptContent = """#!/bin/bash
export jobWorkspace=$jobWorkspace
export tarName=$tarName
@@ -515,8 +559,15 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
} finally {
uploadResults(pipeline, cluster, jobUID, stageName)

stage('Clean up SLURM Resources') {
cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
stage("Clean up SLURM Resources") {
// Workaround for interruptions during SLURM resource cleanup: re-throw via error() so retry() makes another attempt
retry(3) {
try {
cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
} catch (Exception e) {
error "Error during clean up SLURM resources: ${e.getMessage()} and retrying."
}
}
}
}
}
@@ -644,7 +695,7 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
if (stageIsInterrupted) {
echo "Stage is interrupted, skip to upload test result."
} else {
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg; fi'
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'
if (noResultIfSuccess && !stageIsFailed) {
// Clean up the workspace
sh """
@@ -912,7 +963,14 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod

def echoNodeAndGpuInfo(pipeline, stageName)
{
String hostNodeName = sh(script: 'echo $HOST_NODE_NAME', returnStdout: true)
String hostNodeName = sh(script: '''
if [ -n "$HOST_NODE_NAME" ]; then
echo "$HOST_NODE_NAME"
else
hostname -f || hostname
fi
''', returnStdout: true).trim()
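// Falls back to the node's own hostname when HOST_NODE_NAME is not set in the environment.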

String gpuUuids = pipeline.sh(script: "nvidia-smi -q | grep \"GPU UUID\" | awk '{print \$4}' | tr '\n' ',' || true", returnStdout: true)
pipeline.echo "HOST_NODE_NAME = ${hostNodeName} ; GPU_UUIDS = ${gpuUuids} ; STAGE_NAME = ${stageName}"
}
@@ -986,7 +1044,7 @@ def launchTestListCheck(pipeline)
trtllm_utils.llmExecStepWithRetry(pipeline, script: """apt-get update && apt-get install \
libffi-dev \
-y""")
sh "nvidia-smi -q"
sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
// download TRT-LLM tarfile
def tarName = BUILD_CONFIGS[VANILLA_CONFIG][TARNAME]
def llmTarfile = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/${tarName}"
@@ -1394,8 +1452,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
sh "nproc && free -g && hostname"
echoNodeAndGpuInfo(pipeline, stageName)
sh "cat ${MODEL_CACHE_DIR}/README"
sh "nvidia-smi -q"
sh "nvidia-smi topo -m"
sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
sh "df -h"

// setup HF_HOME to cache model and datasets
@@ -1526,7 +1583,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
stage ("[${stageName}] Run Pytest")
{
echoNodeAndGpuInfo(pipeline, stageName)
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C; fi'
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'

def extraInternalEnv = ""
def pytestTestTimeout = "3600"
@@ -1771,7 +1828,7 @@ def runPackageSanityCheck(pipeline, wheel_path, reinstall_dependencies=false, cp
sh "nproc && free -g && hostname"
sh "bash -c 'pip3 show tensorrt || true'"
sh "cat ${MODEL_CACHE_DIR}/README"
sh "nvidia-smi -q"
sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"

sh "pwd && ls -alh"
trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget -nv ${whlUrl}")
@@ -1822,33 +1879,26 @@ def checkStageName(stageNames) {
}
}

// TODO: Update existing functions to use runInDockerOnNodeMultiStage and get rid of runInDockerOnNode
def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
{
return {
runner -> node(label) {
if (needToDeleteDir) {
deleteDir()
}
stage('Pull Docker Image') {
docker.image(image).pull()
}
docker.image(image).inside(dockerArgs) {
runner()
}
}
}
}

def runInDockerOnNode(image, label, dockerArgs)
{
return {
stageName, runner -> stage(stageName) {
node(label) {
deleteDir()
try {
if (needToDeleteDir) {
deleteDir()
}
stage('Pull Docker Image') {
docker.image(image).pull()
}
docker.image(image).inside(dockerArgs) {
runner()
}
} catch (Exception e) {
if (e.getMessage()?.contains("Failed to kill container")) {
echo "Known benign error ignored: ${e.getMessage()}"
} else {
throw e // Re-throw anything other than the known benign teardown error
}
}
}
}
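A minimal usage sketch of the merged helper, assuming a hypothetical node label and test command; the returned closure pulls the image, runs its body inside the container, and treats "Failed to kill container" during teardown as benign:

def containerRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, 'hypothetical-gpu-node', '--gpus 1')
containerRunner({
    // Runs inside the container on the selected node.
    sh 'nvidia-smi && pytest tests/unittest -x'
})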
@@ -1866,10 +1916,8 @@ def runInKubernetes(pipeline, podSpec, containerName)
}
}

def launchTestJobs(pipeline, testFilter, dockerNode=null)
def launchTestJobs(pipeline, testFilter)
{
def dockerArgs = "-v /mnt/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"

// IMPORTANT: Stage Configuration Syntax Requirement
//
// The test_to_stage_mapping.py script expects stage definitions in the following format:
@@ -2171,12 +2219,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
def buildRunner = runInKubernetes(pipeline, buildSpec, "trt-llm")
def sanityRunner = null

if (dockerNode) {
sanityRunner = runInDockerOnNode(values[0], dockerNode, dockerArgs)
} else {
def sanitySpec = createKubernetesPodConfig(values[0], gpu_type, k8s_arch)
sanityRunner = runInKubernetes(pipeline, sanitySpec, "trt-llm")
}

def sanitySpec = createKubernetesPodConfig(values[0], gpu_type, k8s_arch)
sanityRunner = runInKubernetes(pipeline, sanitySpec, "trt-llm")

def wheelPath = "${values[4]}"
def wheelName = ""
@@ -2420,17 +2465,10 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
stage("Skip - reused") {
echo "Skip - Passed in the last pipeline."
}
} else if (values instanceof List && dockerNode == null) {
} else if (values instanceof List) {
trtllm_utils.launchKubernetesPod(pipeline, values[0], "trt-llm", {
values[1]()
})
} else if (values instanceof List && dockerNode != null) {
node(dockerNode) {
deleteDir()
docker.image(LLM_DOCKER_IMAGE).inside(dockerArgs) {
values[1]()
}
}
} else {
values()
}
2 changes: 1 addition & 1 deletion jenkins/scripts/slurm_run.sh
@@ -22,7 +22,7 @@ if [ $SLURM_LOCALID -eq 0 ]; then
which python3
python3 --version
apt-get install -y libffi-dev
nvidia-smi
nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
cd $llmSrcNode && pip3 install --retries 1 -r requirements-dev.txt
cd $resourcePathNode && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
git config --global --add safe.directory "*"