@@ -44,8 +44,9 @@ DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.06-py3"
 UBUNTU_22_04_IMAGE = "urm.nvidia.com/docker/ubuntu:22.04"
 UBUNTU_24_04_IMAGE = "urm.nvidia.com/docker/ubuntu:24.04"
 
-POD_TIMEOUT_SECONDS = env.podTimeoutSeconds ? env.podTimeoutSeconds : "21600"
-POD_TIMEOUT_SECONDS_TMP = env.podTimeoutSeconds ? env.podTimeoutSeconds : "43200"
+POD_TIMEOUT_SECONDS_TEST = env.podTimeoutSeconds ? env.podTimeoutSeconds : "21600"
+POD_TIMEOUT_SECONDS_BUILD = env.podTimeoutSeconds ? env.podTimeoutSeconds : "43200"
+POD_TIMEOUT_SECONDS_SLURM = env.podTimeoutSeconds ? env.podTimeoutSeconds : "79200" // Use 22 hours to allow for 2 hours of buffer.
 
 // Literals for easier access.
 @Field
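For reference, a minimal Groovy sketch (not part of the diff; resolveTimeout is an illustrative name) of how the env.podTimeoutSeconds override above resolves, and why 79200 seconds corresponds to the 22-hour SLURM pod lifetime:

    // Illustrative only: the pipeline reads podTimeoutSeconds from the job
    // environment when present, otherwise falls back to the hard-coded default.
    def resolveTimeout(env, String defaultSeconds) {
        return env.podTimeoutSeconds ? env.podTimeoutSeconds : defaultSeconds
    }
    assert resolveTimeout([:], "79200") == "79200"      // 79200 s == 22 h
    assert resolveTimeout([podTimeoutSeconds: "3600"], "79200") == "3600"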
@@ -133,7 +134,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
 }
 
 // TODO: consolidate slurm related code for both multi nodes and single nodes
-def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jobUID) {
+def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jobUID, String slurmOutputFile) {
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
         def remote = [
             ip            : cluster.ip,
@@ -144,20 +145,50 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
         ]
 
         Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
-        pipeline.stage('Clean up SLURM Agent Resources') {
-            Utils.exec(
-                pipeline,
-                timeout: false,
-                script: Utils.sshUserCmd(
-                    remote,
-                    "rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID}"
-                )
-            )
+
+        def slurmJobID = Utils.exec(
+            pipeline,
+            script: Utils.sshUserCmd(
+                remote,
+                "\"sed -n " +
+                "-e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " +
+                "-e 's/.*srun: job \\([0-9]\\+\\) queued.*/\\1/p' " +
+                "-e 's/.*srun: job \\([0-9]\\+\\) has been allocated.*/\\1/p' " +
+                "${slurmOutputFile}\""
+            ),
+            returnStdout: true
+        ).trim()
+
+        if (!slurmJobID || !slurmJobID.isNumber()) {
+            Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile}\""))
+            error("Slurm job did not submit successfully. No job ID found.")
         }
+
+        Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
+
+        Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
+
+        Utils.exec(
+            pipeline,
+            script: Utils.sshUserCmd(
+                remote,
+                "\"scancel ${slurmJobID} || true; sacct -j ${slurmJobID} --format=JobID,JobName%100,Partition%15,Account%15,State,ExitCode,NodeList%30 || true; scontrol show job ${slurmJobID} || true\""
+            )
+        )
+
+        Utils.exec(
+            pipeline,
+            script: Utils.sshUserCmd(
+                remote,
+                "rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID}"
+            )
+        )
+
+        Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
     }
 }
 
-def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName) {
+def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, String slurmJobID) {
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
         def remote = [
             ip            : cluster.ip,
@@ -168,17 +199,26 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName){
         ]
 
         Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
-        pipeline.stage('Clean up SLURM Agent Resources') {
-            Utils.exec(
-                pipeline,
-                timeout: false,
-                script: Utils.sshUserCmd(
-                    remote,
-                    "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
-                )
+
+        Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
+
+        Utils.exec(
+            pipeline,
+            script: Utils.sshUserCmd(
+                remote,
+                "\"scancel ${slurmJobID} || true; sacct -j ${slurmJobID} --format=JobID,JobName%100,Partition%15,Account%15,State,ExitCode,NodeList%30 || true; scontrol show job ${slurmJobID} || true\""
             )
-            Utils.exec(pipeline, script: "echo done")
-        }
+        )
+
+        Utils.exec(
+            pipeline,
+            script: Utils.sshUserCmd(
+                remote,
+                "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
+            )
+        )
+
+        Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
     }
 }
 
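For reference, a minimal Groovy sketch of the best-effort cleanup command that the reworked cleanUpNodeResources sends over SSH; buildSlurmCleanupCmd is an illustrative helper, not part of the change, and the sacct format string is abbreviated here:

    // Illustrative only: "|| true" after each command keeps the SSH call from
    // failing the stage when the Slurm job has already finished or been purged.
    def buildSlurmCleanupCmd(String slurmJobID) {
        return [
            "scancel ${slurmJobID} || true",
            "sacct -j ${slurmJobID} --format=JobID,State,ExitCode || true",
            "scontrol show job ${slurmJobID} || true",
        ].join("; ")
    }
    assert buildSlurmCleanupCmd("12345").startsWith("scancel 12345")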
@@ -224,6 +264,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     def customWorkspace = "/tmp/${nodeName}"
     def nodeSecret = CloudManager.createNode(nodeName, customWorkspace)
 
+    def slurmJobID = null
+
     try {
         // Run ssh command to start node in desired cluster via SLURM
         withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
@@ -245,22 +287,47 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 
                 Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:/home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh", numRetries: 3)
 
-                Utils.exec(
+                Utils.exec(pipeline, script: "cat ${jenkinsSetupPath}")
+
+                def slurmSubmitOutput = Utils.exec(
                     pipeline,
                     timeout: false,
                     script: Utils.sshUserCmd(
-                            remote,
-                            """${SlurmConfig.generateCommand(cluster, partition, nodeSecret, nodeName, Jenkins.instance.rootUrl)}"""
-                    )
+                        remote,
+                        "\"${SlurmConfig.generateCommand(cluster, partition, nodeSecret, nodeName, Jenkins.instance.rootUrl)}\""
+                    ),
+                    returnStdout: true
                 )
+
+                def jobIDs = slurmSubmitOutput
+                    .readLines()
+                    .collect { it.trim() }
+                    .collectMany { line ->
+                        def ids = []
+                        def m1 = (line =~ /Submitted batch job (\d+)/)
+                        if (m1) ids << m1[0][1]  // Extract the first captured group
+                        def m2 = (line =~ /srun: job (\d+) (queued|has been allocated)/)
+                        if (m2) ids << m2[0][1]  // Extract the first captured group
+                        return ids
+                    }
+
+                slurmJobID = jobIDs ? jobIDs[-1] : null
+
+                if (!slurmJobID || !slurmJobID.isNumber()) {
+                    error("Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}")
+                }
+                Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
                 Utils.exec(pipeline, script: "echo Sleeping to allow agent initialization; sleep 30")
             }
         }
 
         stage('Checking if the Node is Online') {
             def counter = 0
-            while (!CloudManager.isNodeOnline(nodeName) && counter < 12) {
-                sleep(time: 10, unit: 'MINUTES')  // Wait 10 minutes to check status of the node again
+            // We submit the Slurm job with a 5-hour timeout, and the K8S pod will be evicted after 22 hours.
+            // Use 15 hours to check whether the node comes online, leaving a 2-hour buffer.
+            while (!CloudManager.isNodeOnline(nodeName) && counter < 90) {
+                // Wait 10 minutes before checking the status of the node again
+                sleep(time: 10, unit: 'MINUTES')
                 counter++
             }
 
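For reference, a small Groovy sketch of the wait budget behind the new loop bound (illustrative variable names; the 5-hour submission timeout and 2-hour buffer come from the comments above):

    // Illustrative only: 90 polls at 10-minute intervals give a 15-hour window.
    int polls = 90
    int intervalMinutes = 10
    assert polls * intervalMinutes == 15 * 60
    // 5 h Slurm submission timeout + 15 h online wait + 2 h buffer == 22 h pod lifetime.
    assert 5 * 60 + polls * intervalMinutes + 2 * 60 == 22 * 60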
@@ -291,12 +358,16 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                 slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
                 executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
             } else {
-                echo "The node does not come online in 2 hours, terminating the job"
+                error "The Slurm node did not come online within the waiting period. Terminating the job."
             }
         }
     } finally {
-        cleanUpNodeResources(pipeline, cluster, nodeName)
-        CloudManager.destroyNode(nodeName)
+        stage('Clean up SLURM Resources') {
+            Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")
+            CloudManager.destroyNode(nodeName)
+            Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")
+            cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
+        }
     }
 }
 
@@ -315,7 +386,13 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
     SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
     SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]
 
-    def jobUID = "${cluster.host}-${UUID.randomUUID().toString()}"
+    // Create a unique suffix for the job name
+    String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
+    def jobUID = "${cluster.host}-${customSuffix}"
+
+    Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
+
+    def slurmOutputFile = null
 
     try {
         // Run ssh command to start node in desired cluster via SLURM
@@ -341,7 +418,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
             def resourcePathNode = "/tmp"
             def llmSrcNode = "${resourcePathNode}/TensorRT-LLM/src"
             def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
-            def scriptRunNode = "${jobWorkspace}/slurm_run.sh"
+            def scriptRunNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
+            def scriptLaunch = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
+            slurmOutputFile = "${jobWorkspace}/${jobUID}-slurm_output.log"
             def testListPathNode = "${jobWorkspace}/${testList}.txt"
             def waivesListPathNode = "${jobWorkspace}/waives.txt"
             def isAarch64 = config.contains("aarch64")
@@ -358,7 +437,10 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
                 // Upload slurm_run.sh to Frontend node
                 def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
                 Utils.exec(pipeline, script: "chmod +x ${scriptRunLocalPath}", returnStdout: true)
+
                 Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp ${COMMON_SSH_OPTIONS} ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}", numRetries: 3)
+                Utils.exec(pipeline, script: "cat ${scriptRunLocalPath}")
+
                 // Upload waives.txt to Frontend node
                 def waivesListLocalPath = "${llmSrcLocal}/tests/integration/test_lists/waives.txt"
                 Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp ${COMMON_SSH_OPTIONS} ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}", numRetries: 3)
@@ -390,7 +472,6 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
                     "--container-env=NVIDIA_IMEX_CHANNELS"
                 ].join(" ")
 
-                def scriptLaunch = "/home/svc_tensorrt/bloom/scripts/${jobUID}/slurm_launch.sh"
                 def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
                 scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
                 def scriptContent = """#!/bin/bash
@@ -410,27 +491,33 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
                     export MODEL_CACHE_DIR=$MODEL_CACHE_DIR
                     export NVIDIA_IMEX_CHANNELS=0
                     chmod +x ${scriptRunNode}
-                    ${srunCmd}
+                    ${srunCmd} 2>&1 | tee ${slurmOutputFile}
                 """.stripIndent()
                 pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent)
                 Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)
                 Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp ${COMMON_SSH_OPTIONS} ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}", numRetries: 3)
+                Utils.exec(pipeline, script: "cat ${scriptLaunchDestPath}")
             }
+
             stage('Run Test') {
-                def scriptLaunch = "${jobWorkspace}/slurm_launch.sh"
                 Utils.exec(
                     pipeline,
                     timeout: false,
                     script: Utils.sshUserCmd(
                         remote,
-                        """bash ${scriptLaunch}"""
+                        "\"bash ${scriptLaunch}\""
                     )
                 )
             }
+
+            echo "Finished test stage execution."
         }
     } finally {
         uploadResults(pipeline, cluster, jobUID, stageName)
-        cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID)
+
+        stage('Clean up SLURM Resources') {
+            cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
+        }
     }
 }
 
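For reference, a minimal Groovy sketch of the job-ID recovery the multi-node path now relies on: the launch script tees srun output into slurmOutputFile, and cleanup later pulls the ID back out of that file with the sed expressions shown earlier. extractSlurmJobID is an illustrative stand-in for that sed step, not part of the change:

    // Illustrative only: mirrors the sed patterns used in cleanUpNodeResourcesMultiNodes.
    def extractSlurmJobID(String slurmOutput) {
        def ids = slurmOutput.readLines().collectMany { line ->
            def found = []
            def m = (line =~ /Submitted batch job (\d+)|srun: job (\d+) (queued|has been allocated)/)
            m.each { g -> found << (g[1] ?: g[2]) }
            return found
        }
        return ids ? ids[-1] : null
    }
    assert extractSlurmJobID("srun: job 4242 queued and waiting for resources") == "4242"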
@@ -559,6 +646,14 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
         } else {
             sh 'if [ "$(id -u)" -eq 0 ]; then dmesg; fi'
             if (noResultIfSuccess && !stageIsFailed) {
+                // Clean up the workspace
+                sh """
+                    env | sort
+                    pwd && ls -alh
+                    rm -rf ./*
+                """
+
+                echo "Finished test stage execution."
                 return
             }
             echo "noResultIfSuccess: ${noResultIfSuccess}, stageIsFailed: ${stageIsFailed}"
@@ -579,14 +674,16 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
                 "${UPLOAD_PATH}"
             )
             junit(testResults: "${stageName}")
-
-            // Clean up the workspace
-            sh """
-                env | sort
-                pwd && ls -alh
-                rm -rf ./*
-            """
         }
+
+        // Clean up the workspace
+        sh """
+            env | sort
+            pwd && ls -alh
+            rm -rf ./*
+        """
+
+        echo "Finished test stage execution."
     }
 }
 
@@ -629,7 +726,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
         containerConfig = """
                   - name: trt-llm
                     image: ${image}
-                    command: ['sleep', ${POD_TIMEOUT_SECONDS}]
+                    command: ['sleep', ${POD_TIMEOUT_SECONDS_SLURM}]
                     tty: true
                     resources:
                       requests:
@@ -647,7 +744,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
         containerConfig = """
                   - name: trt-llm
                     image: ${image}
-                    command: ['sleep', ${POD_TIMEOUT_SECONDS_TMP}]
+                    command: ['sleep', ${POD_TIMEOUT_SECONDS_BUILD}]
                     volumeMounts:
                     - name: sw-tensorrt-pvc
                       mountPath: "/mnt/sw-tensorrt-pvc"
@@ -713,7 +810,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
         containerConfig = """
                   - name: trt-llm
                     image: ${image}
-                    command: ['sleep', ${POD_TIMEOUT_SECONDS}]
+                    command: ['sleep', ${POD_TIMEOUT_SECONDS_TEST}]
                     tty: true
                     resources:
                       requests:
@@ -2153,10 +2250,13 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
                         }
                         echo "###### Check pip install Start ######"
                         withEnv(libEnv) {
+                            // Retry 2 times if a timeout occurs.
                            sh "env | sort"
-                            timeout(time: 30, unit: 'MINUTES') {
-                                checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
-                            }
+                            trtllm_utils.llmRetry(1, "checkPipInstall", {
+                                timeout(time: 30, unit: 'MINUTES') {
+                                    checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
+                                }
+                            })
                         }
                         echo "###### Run LLMAPI tests Start ######"
                         def config = VANILLA_CONFIG
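For reference, a minimal Groovy sketch of the retry-around-timeout control flow introduced above. llmRetry is the project's own helper and its exact signature is not shown in this diff, so retryOnFailure below is an assumed stand-in that only illustrates re-running the 30-minute checkPipInstall window after a failure:

    // Illustrative only: re-run the body once more if the first attempt throws
    // (for example when the inner timeout() aborts the pip install check).
    def retryOnFailure(int extraAttempts, String label, Closure body) {
        int attempt = 0
        while (true) {
            try {
                return body()
            } catch (Exception err) {
                if (attempt++ >= extraAttempts) {
                    throw err
                }
                println "${label} failed, retrying (attempt ${attempt + 1})"
            }
        }
    }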