-@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _
+@Library(['bloom-jenkins-shared-lib@dev-yuanjingx-slurm_refactor', 'trtllm-jenkins-shared-lib@main']) _
 
 import java.lang.InterruptedException
 import groovy.transform.Field
@@ -133,8 +133,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
     }
 }
 
-// TODO: consolidate slurm related code for both multi nodes and single nodes
-def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jobUID) {
+def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String jobUID) {
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
         def remote = [
             ip: cluster.ip,
@@ -158,139 +157,20 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
     }
 }
 
-def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName) {
-    withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
-        def remote = [
-            ip: cluster.ip,
-            host: cluster.host,
-            user: "${pipeline.USERNAME}",
-            passwd: "${pipeline.PASSWORD}",
-            allowAnyHosts: true,
-        ]
-
-        Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
-        pipeline.stage('Clean up SLURM Agent Resources') {
-            Utils.exec(
-                pipeline,
-                timeout: false,
-                script: Utils.sshUserCmd(
-                    remote,
-                    "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
-                )
-            )
-            Utils.exec(pipeline, script: "echo done")
-        }
-    }
-}
-
-def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312", runner)
-{
-    runner {
-        // TODO: refactor the finallyRunner to reuse within slurm or nonslurm job.
-        cacheErrorAndUploadResult(stageName, {
-            runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver)
-        }, {
-            // If the execution test list is null, remove the test result xml
-            sh """
-                ls -all ${stageName}/
-                if ! grep -q '<testcase' ${stageName}/results.xml; then
-                    rm ${stageName}/results.xml || true
-                fi
-            """
-            def llmPath = sh(script: "realpath .", returnStdout: true).trim()
-            def llmSrc = "${llmPath}/${LLM_ROOT}${config}/TensorRT-LLM/src"
-            // CPP tests will generate test result in ${llmSrc}/cpp/build_backup/, move these files to job result folder
-            sh "ls -all ${llmSrc}/cpp/build_backup/ || true"
-            sh "ls -all ${llmSrc}/cpp/build/ || true"
-            // Sed for CPP test result
-            sh "cd ${llmSrc}/cpp/build_backup/ && sed -i 's/\" classname=\"/\" classname=\"${stageName}./g' *.xml || true"
-            sh "cd ${llmSrc}/cpp/build_backup/ && sed -i 's/testsuite name=\"[^\"]*\"/testsuite name=\"${stageName}\"/g' *.xml || true"
-            // Sed for Pytest result
-            sh "cd ${stageName} && sed -i 's/testsuite name=\"pytest\"/testsuite name=\"${stageName}\"/g' *.xml || true"
-            // Copy CPP test result
-            sh "cp ${llmSrc}/cpp/build_backup/*.xml ${stageName} || true"
-            sh "ls ${stageName}/ -all"
-        })
-    }
-}
-
-def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, skipInstallWheel=false, cpver="cp312")
-{
-    SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
-    SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]
-
-    def nodeName = "${cluster.host}-test-${UUID.randomUUID().toString()}"
-    def nodeSecret = CloudManager.createNode(nodeName)
-
-    try {
-        // Run ssh command to start node in desired cluster via SLURM
-        withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
-            def remote = [
-                ip: cluster.ip,
-                host: cluster.host,
-                user: "${pipeline.USERNAME}",
-                passwd: "${pipeline.PASSWORD}",
-                allowAnyHosts: true,
-            ]
-
-            Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
-            stage('Request Node via SLURM') {
-                println("Selected Cluster: ${cluster.name}")
-
-                def jenkinsSetupPath = Utils.copyLibraryResource(pipeline, "slurm_jenkins_agent_setup.sh")
-
-                Utils.exec(pipeline, script: "chmod +x ${jenkinsSetupPath}", returnStdout: true)
-
-                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh",)
-
-                Utils.exec(
-                    pipeline,
-                    timeout: false,
-                    script: Utils.sshUserCmd(
-                        remote,
-                        """${SlurmConfig.generateCommand(cluster, partition, nodeSecret, nodeName, Jenkins.instance.rootUrl)}"""
-                    )
-                )
-                Utils.exec(pipeline, script: "echo Sleeping to allow agent initialization; sleep 30")
-            }
-        }
-
-        stage('Checking if the Node is Online') {
-            def counter = 0
-            while (!CloudManager.isNodeOnline(nodeName) && counter < 12) {
-                sleep(time: 10, unit: 'MINUTES') // Wait 10 minutes to check status of the node again
-                counter++
-            }
-
-            if (CloudManager.isNodeOnline(nodeName)) {
-                def dockerArgs = "--gpus ${gpuCount} --cap-add=SYS_ADMIN --ipc=host --security-opt seccomp=unconfined -u root:root -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
-
-                if (partition.clusterName == "dlcluster") {
-                    dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
-                }
-                slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, false)
-                executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
-            } else {
-                echo "The node does not come online in 2 hours, terminating the job"
-            }
-        }
-    } finally {
-        cleanUpNodeResources(pipeline, cluster, nodeName)
-        CloudManager.destroyNode(nodeName)
-    }
-}
-
 def getNodeArgs(int nodeCount, int gpuCount) {
     int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue()
-    return [
+    return nodeCount == 1 ? [
+        "--nodes=${nodeCount}",
+        "--gpus=${gpuCount}"
+    ] : [
         "--nodes=${nodeCount}",
         "--ntasks=${gpuCount}",
         "--ntasks-per-node=${gpusPerNode}",
         "--gpus-per-node=${gpusPerNode}",
-    ].join(" ")
+    ]
 }
 
-def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=2, skipInstallWheel=false, cpver="cp312")
+def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=1, skipInstallWheel=false, cpver="cp312")
 {
     SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
     SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]
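Illustrative sketch (not part of the diff): the consolidated `getNodeArgs` now returns a list of Slurm arguments instead of a pre-joined string, and the single-node case drops the per-node task layout. A minimal, self-contained Groovy example with made-up GPU/node counts:

```groovy
// Standalone sketch, assuming the getNodeArgs shown in the diff above.
def getNodeArgs(int nodeCount, int gpuCount) {
    int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue()
    return nodeCount == 1 ? [
        "--nodes=${nodeCount}",
        "--gpus=${gpuCount}"
    ] : [
        "--nodes=${nodeCount}",
        "--ntasks=${gpuCount}",
        "--ntasks-per-node=${gpusPerNode}",
        "--gpus-per-node=${gpusPerNode}",
    ]
}

// Single node: plain --gpus request, no per-node task layout.
println getNodeArgs(1, 4)   // [--nodes=1, --gpus=4]
// Multi node: one task per GPU, spread evenly across nodes.
println getNodeArgs(2, 8)   // [--nodes=2, --ntasks=8, --ntasks-per-node=4, --gpus-per-node=4]
```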
@@ -324,6 +204,8 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
     def scriptRunNode = "${jobWorkspace}/slurm_run.sh"
     def testListPathNode = "${jobWorkspace}/${testList}.txt"
     def waivesListPathNode = "${jobWorkspace}/waives.txt"
+    def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
+    def scriptLaunchPathNode = "${jobWorkspace}/slurm_launch.sh"
     def isAarch64 = config.contains("aarch64")
     def pytestTestTimeout = "7200"
 
@@ -337,43 +219,54 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
 
                 // Upload slurm_run_sh to Frontend node
                 def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
-                Utils.exec(pipeline, script: "chmod +x ${scriptRunLocalPath}", returnStdout: true)
-                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",)
+
+                Utils.copyScriptToRemoteHost(
+                    pipeline,
+                    remote,
+                    scriptRunLocalPath,
+                    scriptRunNode,
+                    true
+                )
 
                 // Upload waives.txt to Frontend node
-                def waivesListLocalPath = "${llmSrcLocal}/tests/integration/test_lists/waives.txt"
-                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}",)
+                def waivesListPathLocal = "${llmSrcLocal}/tests/integration/test_lists/waives.txt"
+                Utils.copyScriptToRemoteHost(
+                    pipeline,
+                    remote,
+                    waivesListPathLocal,
+                    waivesListPathNode
+                )
 
                 // Generate Test List and Upload to Frontend Node
                 def makoArgs = getMakoArgsFromStageName(stageName, true)
                 // TODO: currently the options will only be processed if the first
                 // line is "Mako options:", maybe we can make it more generic, which
                 // if the line cannot be split by "=", just ignore that line.
                 def makoOptsJson = transformMakoArgsToJson(["Mako options:"] + makoArgs)
-                def testListPath = renderTestDB(testList, llmSrcLocal, stageName, makoOptsJson)
-                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${testListPath} ${remote.user}@${remote.host}:${testListPathNode}",)
+                def testListPathLocal = renderTestDB(testList, llmSrcLocal, stageName, makoOptsJson)
+                Utils.copyScriptToRemoteHost(
+                    pipeline,
+                    remote,
+                    testListPathLocal,
+                    testListPathNode
+                )
 
                 // Generate Multi Node Job Launch Script
                 def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
                 def mounts = "/home/scratch.trt_llm_data:/scratch.trt_llm_data:ro,/home/svc_tensorrt/bloom/scripts:/home/svc_tensorrt/bloom/scripts"
-                String taskArgs = getNodeArgs(nodeCount, gpuCount)
+                String[] taskArgs = getNodeArgs(nodeCount, gpuCount)
 
                 if (taskArgs == null) {
                     error "Invalid multinode task stage name is set"
                 }
-
-                taskArgs = [
-                    taskArgs,
-                    "--exclusive",
+                taskArgs = [
+                    *taskArgs,
                     "--container-image=${container}",
                     "--container-workdir=/home/svc_tensorrt/bloom/scripts",
                     "--container-mounts=${mounts}",
                     "--container-env=NVIDIA_IMEX_CHANNELS"
-                ].join(" ")
+                ]
 
-                def scriptLaunch = "/home/svc_tensorrt/bloom/scripts/${jobUID}/slurm_launch.sh"
-                def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
-                scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
                 def scriptContent = """#!/bin/bash
                     export jobWorkspace=$jobWorkspace
                     export tarName=$tarName
@@ -388,30 +281,82 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
                     export splitId=$splitId
                     export perfMode=$perfMode
                     export resourcePathNode=$resourcePathNode
+                    export nodeCount=$nodeCount
                     export MODEL_CACHE_DIR=$MODEL_CACHE_DIR
                     export NVIDIA_IMEX_CHANNELS=0
                     chmod +x ${scriptRunNode}
-                    ${srunCmd}
-                """.stripIndent()
-                pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent)
-                Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)
-                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",)
+                """
+                if (nodeCount > 1) {
+                    taskArgs = [
+                        *taskArgs,
+                        "--mpi=pmi2",
+                    ]
+                    def runTestCmd = SlurmConfig.generateTrtllmCommand("srun", partition, taskArgs.join(" "), scriptRunNode)
+                    scriptContent += """
+                        ${runTestCmd}
+                    """
+                } else {
+                    String outputPath = "${jobWorkspace}/job-output.log"
+                    taskArgs = [
+                        *taskArgs,
+                        "--output=${outputPath}",
+                    ]
+                    def runTestCmd = SlurmConfig.generateTrtllmCommand("sbatch", partition, taskArgs.join(" "), scriptRunNode)
+                    scriptContent += """
+                        touch ${outputPath}
+                        jobId=\$(${runTestCmd} | awk '{print \$4}')
+                        if [ -z "\$jobId" ]; then
+                            echo "Error: Job submission failed, no job ID returned."
+                            exit 1
+                        fi
+                        echo "Submitted job \$jobId"
+                        tail -f ${outputPath} &
+                        tailPid=\$!
+                        # Wait until the sbatch job is done
+                        while squeue -j \$jobId -o %T > /dev/null 2>&1; do
+                            sleep 300
+                        done
+                        # Kill the tail -f process
+                        kill \$tailPid
+                        # Check if the job failed or not
+                        EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')
+                        if [ "\$EXIT_CODE" -ne 0 ]; then
+                            echo "Pytest failed in Slurm job \$jobId with exit code \$EXIT_CODE"
+                            exit \$EXIT_CODE
+                        fi
+                    """
+                }
+                scriptContent = scriptContent.replaceAll('\t', '').stripIndent()
+                pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
+                Utils.copyScriptToRemoteHost(
+                    pipeline,
+                    remote,
+                    scriptLaunchPathLocal,
+                    scriptLaunchPathNode
+                )
             }
             stage('Run Test') {
-                def scriptLaunch = "${jobWorkspace}/slurm_launch.sh"
                 Utils.exec(
                     pipeline,
                     timeout: false,
                     script: Utils.sshUserCmd(
                         remote,
-                        """bash ${scriptLaunch}"""
+                        """bash ${scriptLaunchPathNode}"""
                     )
                 )
             }
         }
     } finally {
         uploadResults(pipeline, cluster, jobUID, stageName)
-        cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID)
+        // cleanUpNodeResources(pipeline, cluster, jobUID)
     }
 }
 
@@ -1934,7 +1879,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
-        runLLMTestlistOnSlurm_MultiNodes(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2)
+        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2)
     }]]}
 
     parallelJobs += parallelMultiNodesSBSAJobs
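For context, an illustrative usage sketch (not part of the diff): with the multi-node variant folded into `runLLMTestlistOnSlurm`, both kinds of stages go through the same entry point, and `nodeCount` decides whether the generated launch script submits via `sbatch` (single node, log tailed until completion) or runs via `srun --mpi=pmi2` (multi node). Stage names, platform keys, and counts below are invented for illustration; the parameter order is assumed from the signature in the diff.

```groovy
// Hypothetical call sites; all concrete values are made up.
// Assumed signature: runLLMTestlistOnSlurm(pipeline, platform, testList, config, perfMode,
//                                          stageName, splitId, splits, gpuCount, nodeCount)

// Single-node stage: nodeCount == 1, so the launch script is submitted with sbatch.
runLLMTestlistOnSlurm(pipeline, "b200-x4", "l0_b200", VANILLA_CONFIG, false,
                      "B200-4_GPUs-PyTorch-1", 1, 2, 4, 1)

// Multi-node stage: nodeCount > 1, so the test is launched with srun across the allocation.
runLLMTestlistOnSlurm(pipeline, "gb200-multi-node", "l0_gb200_multi_nodes", VANILLA_CONFIG, false,
                      "GB200-8_GPUs-2_Nodes-PyTorch-1", 1, 1, 8, 2)
```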