Commit 0c43f83

[None][ci] Remove some stale CI codes
Signed-off-by: Yanchao Lu <[email protected]>
1 parent 16e9d11 commit 0c43f83

File tree

3 files changed (+56, -52 lines):

jenkins/L0_Test.groovy
jenkins/scripts/slurm_run.sh
tests/integration/test_lists/test-db/l0_gb200.yml

jenkins/L0_Test.groovy

Lines changed: 54 additions & 50 deletions
@@ -154,6 +154,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
 "-e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " +
 "-e 's/.*srun: job \\([0-9]\\+\\) queued.*/\\1/p' " +
 "-e 's/.*srun: job \\([0-9]\\+\\) has been allocated.*/\\1/p' " +
+"-e 's/.*SLURM_JOB_ID=\\([0-9]\\+\\).*/\\1/p' " +
+"-e 's/.*SLURM_JOBID=\\([0-9]\\+\\).*/\\1/p' " +
 "${slurmOutputFile} | tail -n1 || true\""
 ),
 returnStdout: true
@@ -183,6 +185,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
 Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile} || true\""))
 echo "Slurm job did not submit successfully. No job ID found."
 } else {
+// The original Slurm output file name is like "slurm-%j-*.out", we need to replace the %j with the real job ID.
 def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID)
 Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mv ${slurmOutputFile} ${newSlurmOutputFile} || true\""))
 }
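The added comment documents the rename that follows: the tracked output-file name still contains Slurm's literal %j placeholder, so once the job ID is known the file is renamed. A minimal Groovy sketch of that substitution, with an illustrative file name and job ID (not taken from a real run):

// Illustrative values only; in the pipeline these come from the Slurm submission.
def slurmOutputFile = "slurm-%j-agent.out"
def slurmJobID = "123456"
def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID)
assert newSlurmOutputFile == "slurm-123456-agent.out"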
@@ -317,6 +320,10 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 if (m1) ids << m1[0][1] // Extract the first captured group
 def m2 = (line =~ /srun: job (\d+) (queued|has been allocated)/)
 if (m2) ids << m2[0][1] // Extract the first captured group
+def m3 = (line =~ /SLURM_JOB_ID=(\d+)/)
+if (m3) ids << m3[0][1] // Extract the first captured group
+def m4 = (line =~ /SLURM_JOBID=(\d+)/)
+if (m4) ids << m4[0][1] // Extract the first captured group
 return ids
 }
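The two new patterns (m3/m4) let the job ID also be recovered when the Slurm output only echoes the job environment (SLURM_JOB_ID= / SLURM_JOBID=) rather than an sbatch or srun submission message. A self-contained Groovy sketch of the extraction logic; the sample log lines are made up for illustration:

def extractJobIds(String text) {
    def ids = []
    text.readLines().each { line ->
        def m1 = (line =~ /Submitted batch job (\d+)/)
        if (m1) ids << m1[0][1]
        def m2 = (line =~ /srun: job (\d+) (queued|has been allocated)/)
        if (m2) ids << m2[0][1]
        def m3 = (line =~ /SLURM_JOB_ID=(\d+)/)
        if (m3) ids << m3[0][1]
        def m4 = (line =~ /SLURM_JOBID=(\d+)/)
        if (m4) ids << m4[0][1]
    }
    return ids
}

// Sample output containing both a queued message and an environment echo.
assert extractJobIds("srun: job 123456 queued and waiting for resources\nSLURM_JOB_ID=123456") == ["123456", "123456"]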
@@ -341,16 +348,36 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 }
 
 if (CloudManager.isNodeOnline(nodeName)) {
+def dockerGpuOption = ""
+
 node(nodeName) {
 sh """
 env | sort
 pwd && ls -alh
 ls -alh ${env.WORKSPACE}
 ls -alh ${env.WORKSPACE_TMP}
 """
+
+sh "nproc && free -g && hostname"
+echoNodeAndGpuInfo(pipeline, stageName)
+sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
+// Use single quotes to avoid Jenkins variable expansion
+sh 'echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"'
+sh 'echo "NV_GPU: $NV_GPU"'
+
+// Dynamically set GPU arguments based on environment variables
+dockerGPUOption = sh(script: """
+if [ -n "\$CUDA_VISIBLE_DEVICES" ]; then
+echo "--gpus \\"device=\$CUDA_VISIBLE_DEVICES\\""
+elif [ -n "\$NV_GPU" ]; then
+echo "--gpus \\"device=\$NV_GPU\\""
+else
+echo "--gpus ${gpuCount}"
+fi
+""", returnStdout: true).trim()
 }
 
-def dockerArgs = "--gpus ${gpuCount} " +
+def dockerArgs = "${dockerGPUOption} " +
 "--cap-add=SYS_ADMIN " +
 "--ipc=host " +
 "--security-opt seccomp=unconfined " +
@@ -360,6 +387,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
 "--cap-add syslog"
 
+echo "Final dockerArgs: ${dockerArgs}"
+
 if (partition.clusterName == "dlcluster") {
 dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
 }
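The GPU-selection step added above derives the Docker --gpus flag from whatever the Slurm allocation exposes: pinned devices via CUDA_VISIBLE_DEVICES first, then NV_GPU, and only then a plain GPU count; the resulting dockerArgs line is echoed for debugging. A minimal Groovy sketch of that fallback order (the helper name and sample values are illustrative, not part of the pipeline):

// Mirrors the three branches of the added shell snippet.
def gpuFlag(String cudaVisibleDevices, String nvGpu, int gpuCount) {
    if (cudaVisibleDevices) {
        return '--gpus "device=' + cudaVisibleDevices + '"'
    }
    if (nvGpu) {
        return '--gpus "device=' + nvGpu + '"'
    }
    return '--gpus ' + gpuCount
}

assert gpuFlag('0,1', null, 4) == '--gpus "device=0,1"'
assert gpuFlag(null, '2', 4) == '--gpus "device=2"'
assert gpuFlag(null, null, 4) == '--gpus 4'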
@@ -370,12 +399,6 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 error "The Slurm node does not come online in the waiting period. Terminating the job."
 }
 }
-} catch (Exception e) {
-if (e.getMessage()?.contains("Failed to kill container")) {
-echo "Known benign error ignored: ${e.getMessage()}"
-} else {
-throw e // Re-throw if it's a different IOException
-}
 } finally {
 stage("Clean up SLURM Resources") {
 // Workaround to handle the interruption during clean up SLURM resources
@@ -1013,7 +1036,7 @@ def launchTestListCheck(pipeline)
 trtllm_utils.llmExecStepWithRetry(pipeline, script: """apt-get update && apt-get install \
 libffi-dev \
 -y""")
-sh "nvidia-smi -q"
+sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
 // download TRT-LLM tarfile
 def tarName = BUILD_CONFIGS[VANILLA_CONFIG][TARNAME]
 def llmTarfile = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/${tarName}"
@@ -1421,8 +1444,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
 sh "nproc && free -g && hostname"
 echoNodeAndGpuInfo(pipeline, stageName)
 sh "cat ${MODEL_CACHE_DIR}/README"
-sh "nvidia-smi -q"
-sh "nvidia-smi topo -m"
+sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
 sh "df -h"
 
 // setup HF_HOME to cache model and datasets
@@ -1798,7 +1820,7 @@ def runPackageSanityCheck(pipeline, wheel_path, reinstall_dependencies=false, cp
 sh "nproc && free -g && hostname"
 sh "bash -c 'pip3 show tensorrt || true'"
 sh "cat ${MODEL_CACHE_DIR}/README"
-sh "nvidia-smi -q"
+sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
 
 sh "pwd && ls -alh"
 trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget -nv ${whlUrl}")
@@ -1849,33 +1871,26 @@ def checkStageName(stageNames) {
 }
 }
 
-// TODO: Update existing functions to use runInDockerOnNodeMultiStage and get rid of runInDockerOnNode
 def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
 {
 return {
 runner -> node(label) {
-if (needToDeleteDir) {
-deleteDir()
-}
-stage('Pull Docker Image') {
-docker.image(image).pull()
-}
-docker.image(image).inside(dockerArgs) {
-runner()
-}
-}
-}
-}
-
-def runInDockerOnNode(image, label, dockerArgs)
-{
-return {
-stageName, runner -> stage(stageName) {
-node(label) {
-deleteDir()
+try {
+if (needToDeleteDir) {
+deleteDir()
+}
+stage('Pull Docker Image') {
+docker.image(image).pull()
+}
 docker.image(image).inside(dockerArgs) {
 runner()
 }
+} catch (Exception e) {
+if (e.getMessage()?.contains("Failed to kill container")) {
+echo "Known benign error ignored: ${e.getMessage()}"
+} else {
+throw e // Re-throw if it's a different IOException
+}
 }
 }
 }
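runInDockerOnNodeMultiStage now wraps its whole body in a try/catch that ignores only the known-benign "Failed to kill container" error and re-throws everything else; this takes over the handling removed from runLLMTestlistOnSlurm above. A generic Groovy sketch of the pattern, assuming it lives in the same pipeline script where echo is available:

// Run a body, swallowing only the one error message the pipeline treats as benign.
def runIgnoringKnownBenignError(Closure body) {
    try {
        body()
    } catch (Exception e) {
        if (e.getMessage()?.contains("Failed to kill container")) {
            echo "Known benign error ignored: ${e.getMessage()}"
        } else {
            throw e
        }
    }
}

// Hypothetical usage:
// runIgnoringKnownBenignError { docker.image(image).inside(dockerArgs) { runner() } }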
@@ -1893,10 +1908,8 @@ def runInKubernetes(pipeline, podSpec, containerName)
 }
 }
 
-def launchTestJobs(pipeline, testFilter, dockerNode=null)
+def launchTestJobs(pipeline, testFilter)
 {
-def dockerArgs = "-v /mnt/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
-
 // IMPORTANT: Stage Configuration Syntax Requirement
 //
 // The test_to_stage_mapping.py script expects stage definitions in the following format:
@@ -2044,8 +2057,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
 fullSet += SBSATestConfigs.keySet()
 
 SBSASlurmTestConfigs = [
-// Disable GB200-PyTorch-1 due to OOM (https://nvbugspro.nvidia.com/bug/5490507)
-//"GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 1],
+// Should use "gb200-single" instead of "gb200-x4" for single GPU testing
+"GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 2],
+"GB200-PyTorch-2": ["gb200-x4", "l0_gb200", 2, 2],
 "GB200-4_GPUs-PyTorch-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
 "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
 ]
@@ -2199,12 +2213,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
 def buildRunner = runInKubernetes(pipeline, buildSpec, "trt-llm")
 def sanityRunner = null
 
-if (dockerNode) {
-sanityRunner = runInDockerOnNode(values[0], dockerNode, dockerArgs)
-} else {
-def sanitySpec = createKubernetesPodConfig(values[0], gpu_type, k8s_arch)
-sanityRunner = runInKubernetes(pipeline, sanitySpec, "trt-llm")
-}
+
+def sanitySpec = createKubernetesPodConfig(values[0], gpu_type, k8s_arch)
+sanityRunner = runInKubernetes(pipeline, sanitySpec, "trt-llm")
 
 def wheelPath = "${values[4]}"
 def wheelName = ""
@@ -2448,17 +2459,10 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
 stage("Skip - reused") {
 echo "Skip - Passed in the last pipeline."
 }
-} else if (values instanceof List && dockerNode == null) {
+} else if (values instanceof List) {
 trtllm_utils.launchKubernetesPod(pipeline, values[0], "trt-llm", {
 values[1]()
 })
-} else if (values instanceof List && dockerNode != null) {
-node(dockerNode) {
-deleteDir()
-docker.image(LLM_DOCKER_IMAGE).inside(dockerArgs) {
-values[1]()
-}
-}
 } else {
 values()
 }

jenkins/scripts/slurm_run.sh

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ if [ $SLURM_LOCALID -eq 0 ]; then
 which python3
 python3 --version
 apt-get install -y libffi-dev
-nvidia-smi
+nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
 cd $llmSrcNode && pip3 install --retries 1 -r requirements-dev.txt
 cd $resourcePathNode && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
 git config --global --add safe.directory "*"

tests/integration/test_lists/test-db/l0_gb200.yml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ l0_gb200:
 ranges:
 system_gpu_count:
 gte: 1
-lte: 1
+lte: 4
 wildcards:
 gpu:
 - '*gb200*'
