Commit dd681a7

Refactor L0 Test code

consolidate test code for slurm jobs and regular jobs

Signed-off-by: Yuanjing Xue <[email protected]>

1 parent 81fd468

File tree

2 files changed: 111 additions & 162 deletions

jenkins/L0_Test.groovy
jenkins/scripts/slurm_run.sh

jenkins/L0_Test.groovy

Lines changed: 92 additions & 147 deletions
@@ -1,4 +1,4 @@
-@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _
+@Library(['bloom-jenkins-shared-lib@dev-yuanjingx-slurm_refactor', 'trtllm-jenkins-shared-lib@main']) _
 
 import java.lang.InterruptedException
 import groovy.transform.Field
@@ -133,8 +133,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
     }
 }
 
-//TODO: consolidate slurm related code for both multi nodes and single nodes
-def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jobUID){
+def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String jobUID){
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
         def remote = [
             ip : cluster.ip,
@@ -158,139 +157,20 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
     }
 }
 
-def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName){
-    withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
-        def remote = [
-            ip : cluster.ip,
-            host : cluster.host,
-            user : "${pipeline.USERNAME}",
-            passwd : "${pipeline.PASSWORD}",
-            allowAnyHosts: true,
-        ]
-
-        Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
-        pipeline.stage('Clean up SLURM Agent Resources') {
-            Utils.exec(
-                pipeline,
-                timeout: false,
-                script: Utils.sshUserCmd(
-                    remote,
-                    "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
-                )
-            )
-            Utils.exec(pipeline, script: "echo done")
-        }
-    }
-}
-
-def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312", runner)
-{
-    runner {
-        // TODO: refactor the finallyRunner to reuse within slurm or nonslurm job.
-        cacheErrorAndUploadResult(stageName, {
-            runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver)
-        }, {
-            // If the execution test list is null, remove the test result xml
-            sh """
-                ls -all ${stageName}/
-                if ! grep -q '<testcase' ${stageName}/results.xml; then
-                    rm ${stageName}/results.xml || true
-                fi
-            """
-            def llmPath = sh (script: "realpath .", returnStdout: true).trim()
-            def llmSrc = "${llmPath}/${LLM_ROOT}${config}/TensorRT-LLM/src"
-            // CPP tests will generate test result in ${llmSrc}/cpp/build_backup/, move these files to job result folder
-            sh "ls -all ${llmSrc}/cpp/build_backup/ || true"
-            sh "ls -all ${llmSrc}/cpp/build/ || true"
-            // Sed for CPP test result
-            sh "cd ${llmSrc}/cpp/build_backup/ && sed -i 's/\" classname=\"/\" classname=\"${stageName}./g' *.xml || true"
-            sh "cd ${llmSrc}/cpp/build_backup/ && sed -i 's/testsuite name=\"[^\"]*\"/testsuite name=\"${stageName}\"/g' *.xml || true"
-            // Sed for Pytest result
-            sh "cd ${stageName} && sed -i 's/testsuite name=\"pytest\"/testsuite name=\"${stageName}\"/g' *.xml || true"
-            // Copy CPP test result
-            sh "cp ${llmSrc}/cpp/build_backup/*.xml ${stageName} || true"
-            sh "ls ${stageName}/ -all"
-        })
-    }
-}
-
-def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, skipInstallWheel=false, cpver="cp312")
-{
-    SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
-    SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]
-
-    def nodeName = "${cluster.host}-test-${UUID.randomUUID().toString()}"
-    def nodeSecret = CloudManager.createNode(nodeName)
-
-    try {
-        // Run ssh command to start node in desired cluster via SLURM
-        withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
-            def remote = [
-                ip : cluster.ip,
-                host : cluster.host,
-                user : "${pipeline.USERNAME}",
-                passwd : "${pipeline.PASSWORD}",
-                allowAnyHosts: true,
-            ]
-
-            Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
-            stage('Request Node via SLURM') {
-                println("Selected Cluster: ${cluster.name}")
-
-                def jenkinsSetupPath = Utils.copyLibraryResource(pipeline, "slurm_jenkins_agent_setup.sh")
-
-                Utils.exec(pipeline, script: "chmod +x ${jenkinsSetupPath}", returnStdout: true)
-
-                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh",)
-
-                Utils.exec(
-                    pipeline,
-                    timeout: false,
-                    script: Utils.sshUserCmd(
-                        remote,
-                        """${SlurmConfig.generateCommand(cluster, partition, nodeSecret, nodeName, Jenkins.instance.rootUrl)}"""
-                    )
-                )
-                Utils.exec(pipeline, script: "echo Sleeping to allow agent initialization; sleep 30")
-            }
-        }
-
-        stage('Checking if the Node is Online') {
-            def counter = 0
-            while (!CloudManager.isNodeOnline(nodeName) && counter < 12) {
-                sleep(time: 10, unit: 'MINUTES') // Wait 10 minutes to check status of the node again
-                counter++
-            }
-
-            if (CloudManager.isNodeOnline(nodeName)) {
-                def dockerArgs = "--gpus ${gpuCount} --cap-add=SYS_ADMIN --ipc=host --security-opt seccomp=unconfined -u root:root -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
-
-                if (partition.clusterName == "dlcluster") {
-                    dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
-                }
-                slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, false)
-                executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
-            } else {
-                echo "The node does not come online in 2 hours, terminating the job"
-            }
-        }
-    } finally {
-        cleanUpNodeResources(pipeline, cluster, nodeName)
-        CloudManager.destroyNode(nodeName)
-    }
-}
-
 def getNodeArgs(int nodeCount, int gpuCount) {
     int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue()
-    return [
+    return nodeCount == 1 ? [
+        "--nodes=${nodeCount}",
+        "--gpus=${gpuCount}"
+    ] : [
         "--nodes=${nodeCount}",
         "--ntasks=${gpuCount}",
         "--ntasks-per-node=${gpusPerNode}",
         "--gpus-per-node=${gpusPerNode}",
-    ].join(" ")
+    ]
 }
 
-def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=2, skipInstallWheel=false, cpver="cp312")
+def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=1, skipInstallWheel=false, cpver="cp312")
 {
     SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
     SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]
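
The refactored `getNodeArgs` now returns a list of Slurm flags instead of a pre-joined string, and emits a simpler flag set when the job fits on one node. A minimal bash sketch of the two flag sets it produces, assuming the same ceiling division as the Groovy helper (values are illustrative):

```bash
#!/bin/bash
# Illustrative only: mirrors the single-node/multi-node branch in getNodeArgs.
nodeCount=2
gpuCount=8
# Ceiling division, matching BigDecimal.ROUND_CEILING in the Groovy code.
gpusPerNode=$(( (gpuCount + nodeCount - 1) / nodeCount ))

if [ "$nodeCount" -eq 1 ]; then
    # Single node: a plain GPU count is enough.
    args="--nodes=$nodeCount --gpus=$gpuCount"
else
    # Multi node: one task per GPU, spread evenly across nodes.
    args="--nodes=$nodeCount --ntasks=$gpuCount --ntasks-per-node=$gpusPerNode --gpus-per-node=$gpusPerNode"
fi
echo "$args"   # -> --nodes=2 --ntasks=8 --ntasks-per-node=4 --gpus-per-node=4
```

Returning a list also lets callers splat extra flags onto it with `*taskArgs` before joining, which the launch-script generation in the next hunks relies on.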
@@ -324,6 +204,8 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
     def scriptRunNode = "${jobWorkspace}/slurm_run.sh"
     def testListPathNode = "${jobWorkspace}/${testList}.txt"
     def waivesListPathNode = "${jobWorkspace}/waives.txt"
+    def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
+    def scriptLaunchPathNode = "${jobWorkspace}/slurm_launch.sh"
     def isAarch64 = config.contains("aarch64")
     def pytestTestTimeout = "7200"
 
@@ -337,43 +219,54 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
 
     // Upload slurm_run_sh to Frontend node
     def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
-    Utils.exec(pipeline, script: "chmod +x ${scriptRunLocalPath}", returnStdout: true)
-    Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",)
+
+    Utils.copyScriptToRemoteHost(
+        pipeline,
+        remote,
+        scriptRunLocalPath,
+        scriptRunNode,
+        true
+    )
 
     // Upload waives.txt to Frontend node
-    def waivesListLocalPath = "${llmSrcLocal}/tests/integration/test_lists/waives.txt"
-    Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}",)
+    def waivesListPathLocal = "${llmSrcLocal}/tests/integration/test_lists/waives.txt"
+    Utils.copyScriptToRemoteHost(
+        pipeline,
+        remote,
+        waivesListPathLocal,
+        waivesListPathNode
+    )
 
     // Generate Test List and Upload to Frontend Node
     def makoArgs = getMakoArgsFromStageName(stageName, true)
     // TODO: currently the options will only be processed if the first
     // line is "Mako options:", maybe we can make it more generic, which
     // if the line cannot be split by "=", just ignore that line.
     def makoOptsJson = transformMakoArgsToJson(["Mako options:"] + makoArgs)
-    def testListPath = renderTestDB(testList, llmSrcLocal, stageName, makoOptsJson)
-    Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${testListPath} ${remote.user}@${remote.host}:${testListPathNode}",)
+    def testListPathLocal = renderTestDB(testList, llmSrcLocal, stageName, makoOptsJson)
+    Utils.copyScriptToRemoteHost(
+        pipeline,
+        remote,
+        testListPathLocal,
+        testListPathNode
+    )
 
     // Generate Multi Node Job Launch Script
     def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
     def mounts = "/home/scratch.trt_llm_data:/scratch.trt_llm_data:ro,/home/svc_tensorrt/bloom/scripts:/home/svc_tensorrt/bloom/scripts"
-    String taskArgs = getNodeArgs(nodeCount, gpuCount)
+    String[] taskArgs = getNodeArgs(nodeCount, gpuCount)
 
     if (taskArgs == null) {
         error "Invalid multinode task stage name is set"
     }
-
-    taskArgs = [
-        taskArgs,
-        "--exclusive",
+    taskArgs = [
+        *taskArgs,
         "--container-image=${container}",
         "--container-workdir=/home/svc_tensorrt/bloom/scripts",
         "--container-mounts=${mounts}",
         "--container-env=NVIDIA_IMEX_CHANNELS"
-    ].join(" ")
+    ]
 
-    def scriptLaunch = "/home/svc_tensorrt/bloom/scripts/${jobUID}/slurm_launch.sh"
-    def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
-    scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
     def scriptContent = """#!/bin/bash
         export jobWorkspace=$jobWorkspace
        export tarName=$tarName
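
The repeated `chmod +x` plus `sshpass scp` pairs above are folded into `Utils.copyScriptToRemoteHost`, whose implementation lives in the shared library and is not part of this diff. Inferred from the calls it replaces, a rough shell equivalent might look like this (function name, argument order, and environment variables are assumptions):

```bash
# Hypothetical stand-in for the shared-lib helper, inferred from the removed
# commands; the real Groovy implementation is not shown in this commit.
copy_script_to_remote_host() {
    local localPath=$1 remotePath=$2 makeExecutable=${3:-false}
    if [ "$makeExecutable" = "true" ]; then
        chmod +x "$localPath"   # only the run script needs the executable bit
    fi
    # USERNAME/PASSWORD/HOST and COMMON_SSH_OPTIONS come from the pipeline.
    sshpass -p "$PASSWORD" scp -r -p $COMMON_SSH_OPTIONS \
        "$localPath" "$USERNAME@$HOST:$remotePath"
}

copy_script_to_remote_host "./slurm_run.sh" "/home/svc_tensorrt/bloom/scripts/slurm_run.sh" true
```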
@@ -388,30 +281,82 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
         export splitId=$splitId
         export perfMode=$perfMode
         export resourcePathNode=$resourcePathNode
+        export nodeCount=$nodeCount
         export MODEL_CACHE_DIR=$MODEL_CACHE_DIR
         export NVIDIA_IMEX_CHANNELS=0
         chmod +x ${scriptRunNode}
+<<<<<<< HEAD
         ${srunCmd}
     """.stripIndent()
     pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent)
     Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)
     Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",)
+=======
+    """
+    if (nodeCount > 1) {
+        taskArgs = [
+            *taskArgs,
+            "--mpi=pmi2",
+        ]
+        def runTestCmd = SlurmConfig.generateTrtllmCommand("srun", partition, taskArgs.join(" "), scriptRunNode)
+        scriptContent += """
+        ${runTestCmd}
+        """
+    } else {
+        String outputPath = "${jobWorkspace}/job-output.log"
+        taskArgs = [
+            *taskArgs,
+            "--output=${outputPath}",
+        ]
+        def runTestCmd = SlurmConfig.generateTrtllmCommand("sbatch", partition, taskArgs.join(" "), scriptRunNode)
+        scriptContent += """
+        touch ${outputPath}
+        jobId=\$(${runTestCmd} | awk '{print \$4}')
+        if [ -z "\$jobId" ]; then
+            echo "Error: Job submission failed, no job ID returned."
+            exit 1
+        fi
+        echo "Submitted job \$jobId"
+        tail -f ${outputPath} &
+        tailPid=\$!
+        # Wait until sbatch job is done..
+        while squeue -j \$jobId -o %T >/dev/null 2>&1; do
+            sleep 300
+        done
+        # Kill tail -f process
+        kill \$tailPid
+        # Check if the job failed or not
+        EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')
+        if [ "\$EXIT_CODE" -ne 0 ]; then
+            echo "Pytest failed in Slurm job \$jobId with exit code \$EXIT_CODE"
+            exit \$EXIT_CODE
+        fi
+        """
+    }
+    scriptContent = scriptContent.replaceAll('\t','').stripIndent()
+    pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
+    Utils.copyScriptToRemoteHost(
+        pipeline,
+        remote,
+        scriptLaunchPathLocal,
+        scriptLaunchPathNode
+    )
+>>>>>>> e74931618 (Refactor L0 Test code)
 }
 stage('Run Test') {
-    def scriptLaunch = "${jobWorkspace}/slurm_launch.sh"
     Utils.exec(
         pipeline,
         timeout: false,
         script: Utils.sshUserCmd(
             remote,
-            """bash ${scriptLaunch}"""
+            """bash ${scriptLaunchPathNode}"""
         )
     )
 }
 }
 } finally {
     uploadResults(pipeline, cluster, jobUID, stageName)
-    cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID)
+    // cleanUpNodeResources(pipeline, cluster, jobUID)
 }
 }
 
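
In the new single-node branch, the generated launch script submits the run through `sbatch` and then blocks until the job drains from the queue. Stripped of the Groovy interpolation, the generated shell follows this pattern (the output path is a placeholder, and the plain `sbatch` call stands in for the command built by `SlurmConfig.generateTrtllmCommand`):

```bash
#!/bin/bash
# Pattern of the generated single-node launcher, after interpolation.
outputPath=/path/to/jobWorkspace/job-output.log   # placeholder path
touch "$outputPath"

# sbatch prints "Submitted batch job <id>"; field 4 is the job ID.
jobId=$(sbatch --output="$outputPath" slurm_run.sh | awk '{print $4}')
if [ -z "$jobId" ]; then
    echo "Error: Job submission failed, no job ID returned."
    exit 1
fi
echo "Submitted job $jobId"

tail -f "$outputPath" &   # stream the job log while waiting
tailPid=$!
# The loop assumes squeue exits non-zero once the job has left the queue.
while squeue -j "$jobId" -o %T >/dev/null 2>&1; do
    sleep 300
done
kill "$tailPid"

# Read back the allocation's exit code and propagate a failure.
EXIT_CODE=$(sacct -j "$jobId" --format=ExitCode -Pn --allocations | awk -F: '{print $1}')
if [ "$EXIT_CODE" -ne 0 ]; then
    echo "Pytest failed in Slurm job $jobId with exit code $EXIT_CODE"
    exit "$EXIT_CODE"
fi
```

The multi-node branch instead wraps the run script in `srun --mpi=pmi2`, which blocks until completion on its own, so no polling is needed there.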
@@ -1934,7 +1879,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
-        runLLMTestlistOnSlurm_MultiNodes(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2)
+        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2)
     }]]}
 
     parallelJobs += parallelMultiNodesSBSAJobs

jenkins/scripts/slurm_run.sh

Lines changed: 19 additions & 15 deletions
@@ -2,17 +2,6 @@
 cd $resourcePathNode
 llmSrcNode=$resourcePathNode/TensorRT-LLM/src
 
-# generate .coveragerc in workspace
-cat << EOF > $jobWorkspace/.coveragerc
-[run]
-branch = True
-data_file = $jobWorkspace/.coverage.$stageName
-[paths]
-source =
-    $llmSrcNode/tensorrt_llm/
-    ---wheel_path---/tensorrt_llm/
-EOF
-
 resultsPath=$jobWorkspace/results
 mkdir -p $resultsPath
 if [ $SLURM_LOCALID -eq 0 ]; then
@@ -39,8 +28,13 @@ export LLM_ROOT=$llmSrcNode
 export LLM_MODELS_ROOT=$MODEL_CACHE_DIR
 export UCX_TLS=^gdr_copy
 cd $llmSrcNode/tests/integration/defs
-testCmdLines=(
-    "$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
+testCmdLines=()
+if [ $nodeCount -gt 1 ]; then
+    testCmdLines+=(
+        "$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
+    )
+fi
+testCmdLines+=(
     "pytest"
     "-v"
     "--timeout=$pytestTestTimeout"
@@ -65,13 +59,23 @@ fi
 trtllmWhlPath=$(pip3 show tensorrt_llm | grep Location | cut -d ' ' -f 2)
 trtllmWhlPath=$(echo "$trtllmWhlPath" | sed 's/[[:space:]]+/_/g')
 echo "TRTLLM WHEEL PATH: $trtllmWhlPath"
-sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile"
+# generate .coveragerc in workspace
+cat << EOF > $jobWorkspace/.coveragerc
+[run]
+branch = True
+data_file = $jobWorkspace/.coverage.$stageName
+[paths]
+source =
+    $llmSrcNode/tensorrt_llm/
+    $trtllmWhlPath/tensorrt_llm/
+EOF
+
 testCmdLines+=(
     "--cov=$llmSrcNode/examples/"
     "--cov=$llmSrcNode/tensorrt_llm/"
     "--cov=$trtllmWhlPath/tensorrt_llm/"
     "--cov-report="
-    "--cov-config=$coverageConfigFile"
+    "--cov-config=$jobWorkspace/.coveragerc"
 )
 containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
 containerPipLLMLibPath=$(echo "$containerPipLLMLibPath" | sed 's/[[:space:]]+/_/g')
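
Moving the `.coveragerc` heredoc below the `pip3 show` lookup removes the `---wheel_path---` placeholder and the follow-up `sed` rewrite: `$trtllmWhlPath` is already resolved when the file is written, so the heredoc interpolates it directly. A minimal sketch of the before/after ordering:

```bash
# Before: write the config early with a placeholder, then patch it:
#   sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile"
# After: resolve the wheel path first, then write the config in one pass.
trtllmWhlPath=$(pip3 show tensorrt_llm | grep Location | cut -d ' ' -f 2)
cat << EOF > "$jobWorkspace/.coveragerc"
[paths]
source =
    $llmSrcNode/tensorrt_llm/
    $trtllmWhlPath/tensorrt_llm/
EOF
```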
