@@ -170,6 +170,66 @@ def getNodeArgs(int nodeCount, int gpuCount) {
170170    ]
171171}
172172
// Build the full pytest invocation string shared by the Slurm and single-node test paths.
//
// Params:
//   llmSrc             - root of the TensorRT-LLM source checkout on the target machine
//   stageName          - CI stage name, used as the pytest test prefix
//   testDBList         - path to the rendered test-list file (--test-list)
//   splits / split_id  - shard count and 1-based shard index for test splitting
//   perfMode           - when true, append the perf-test flags
//   outputPath         - directory receiving report.csv / results.xml / test output
//   trtllmWheelPath    - installed wheel location, added as a coverage source
//   coverageConfigFile - .coveragerc-style file passed to --cov-config
//   pytestUtil         - optional launcher prefix (e.g. trtllm-llmapi-launch for multi-node)
// Returns: the space-joined command line, ready to embed in a shell script.
def getPytestBaseCommand(
    String llmSrc,
    String stageName,
    String testDBList,
    int splits,
    int split_id,
    Boolean perfMode,
    String outputPath,
    String trtllmWheelPath,
    String coverageConfigFile,
    String pytestUtil = ""
) {
    def extraInternalEnv = ""
    def pytestTestTimeout = "3600"

    // TRT uses half of the host logic cores for engine building which is bad for multi-GPU machines.
    extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\""
    // CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest
    extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}"

    def testCmdLine = [
        "LLM_ROOT=${llmSrc}",
        "LLM_BACKEND_ROOT=${llmSrc}/triton_backend",
        "LLM_MODELS_ROOT=${MODEL_CACHE_DIR}",
        "MODEL_CACHE_DIR=${MODEL_CACHE_DIR}",
        extraInternalEnv,
        pytestUtil,
        "pytest",
        "-v",
        // Only echo captured output when the pipeline-level detailed-log filter is on.
        testFilter[(DETAILED_LOG)] ? "-s" : "",
        "--timeout-method=thread",
        "--apply-test-list-correction",
        "--splitting-algorithm least_duration",
        "--timeout=${pytestTestTimeout}",
        "--rootdir ${llmSrc}/tests/integration/defs",
        "--test-prefix=${stageName}",
        "--splits ${splits}",
        "--group ${split_id}",
        "--waives-file=${llmSrc}/tests/integration/test_lists/waives.txt",
        "--output-dir=${outputPath}/",
        "--csv=${outputPath}/report.csv",
        "--junit-xml ${outputPath}/results.xml",
        "-o junit_logging=out-err",
        "--cov=${llmSrc}/examples/",
        "--cov=${llmSrc}/tensorrt_llm/",
        "--cov=${trtllmWheelPath}/tensorrt_llm/",
        "--cov-report=",
        "--cov-config=${coverageConfigFile}",
        "--test-list=${testDBList}",
    ]
    if (perfMode) {
        testCmdLine += [
            "--perf",
            "--perf-log-formats csv",
            "--perf-log-formats yaml"
        ]
    }
    // Drop empty entries (pytestUtil default, the "-s" ternary's "" branch) so the
    // joined command contains no stray empty tokens or doubled spaces.
    return (testCmdLine - "").join(" ")
}
232+ 
173233def  runLLMTestlistOnSlurm (pipeline , platform , testList , config = VANILLA_CONFIG , perfMode = false , stageName = " Undefined"  , splitId = 1 , splits = 1 , gpuCount = 1 , nodeCount = 1 , skipInstallWheel = false , cpver = " cp312"  )
174234{
175235    SlurmPartition  partition =  SlurmConfig . partitionConfig[platform] as  SlurmPartition 
@@ -207,9 +267,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
207267            def  scriptLaunchPathLocal =  Utils . createTempLocation(pipeline, " ./slurm_launch.sh"  )
208268            def  scriptLaunchPathNode =  " ${ jobWorkspace}  /slurm_launch.sh" 
209269            def  isAarch64 =  config. contains(" aarch64"  )
210-             def  pytestTestTimeout =  " 7200" 
211270
212-             stage(' Prepare Testing '  ) {
271+             stage(" [ ${ stageName } ] Initializing Test "  ) {
213272                //  Create Job Workspace folder in Frontend Node
214273                Utils . exec(pipeline, script : " sshpass -p '${ remote.passwd}  ' ssh ${ COMMON_SSH_OPTIONS}   ${ remote.user}  @${ remote.host}   'mkdir -p ${ jobWorkspace}  '"  ,)
215274
@@ -228,15 +287,6 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
228287                    true 
229288                )
230289
231-                 //  Upload waives.txt to Frontend node
232-                 def  waivesListPathLocal =  " ${ llmSrcLocal}  /tests/integration/test_lists/waives.txt" 
233-                 Utils . copyScriptToRemoteHost(
234-                     pipeline,
235-                     remote,
236-                     waivesListPathLocal,
237-                     waivesListPathNode
238-                 )
239- 
240290                //  Generate Test List and Upload to Frontend Node
241291                def  makoArgs =  getMakoArgsFromStageName(stageName, true )
242292                //  TODO: currently the options will only be processed if the first
@@ -261,37 +311,42 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
261311                }
262312                taskArgs =  [
263313                    * taskArgs,
264-                     " --container-image=${  container} "  ,
314+                     " --container-image=$container  "  ,
265315                    " --container-workdir=/home/svc_tensorrt/bloom/scripts"  ,
266-                     " --container-mounts=${  mounts} "  ,
316+                     " --container-mounts=$mounts  "  ,
267317                    " --container-env=NVIDIA_IMEX_CHANNELS" 
268318                ]
269319
320+                 String  pytestUtil =  " " 
321+                 if  (nodeCount >  1 ) {
322+                     pytestUtil =  " $llmSrcNode  /tensorrt_llm/llmapi/trtllm-llmapi-launch" 
323+                 }
324+ 
325+                 def  pytestCommand =  getPytestBaseCommand(
326+                     llmSrcNode,
327+                     stageName,
328+                     testListPathNode,
329+                     splits,
330+                     splitId,
331+                     perfMode,
332+                     jobWorkspace,
333+                     " __PLACEHOLDER_TRTLLM_WHL_PATH__"  ,
334+                     " __PLACEHOLDER_coverageConfigFile__"  ,
335+                     pytestUtil
336+                 )
337+ 
270338                def  scriptContent =  """ #!/bin/bash
271339                    export jobWorkspace=$jobWorkspace   
272340                    export tarName=$tarName   
273341                    export llmTarfile=$llmTarfile   
274342                    export llmSrcNode=$llmSrcNode   
275343                    export stageName=$stageName   
276-                     export testList=$testList   
277-                     export testListPathNode=$testListPathNode   
278-                     export waivesListPathNode=$waivesListPathNode   
279-                     export pytestTestTimeout=$pytestTestTimeout   
280-                     export splits=$splits   
281-                     export splitId=$splitId   
282344                    export perfMode=$perfMode   
283345                    export resourcePathNode=$resourcePathNode   
284346                    export nodeCount=$nodeCount   
285-                     export MODEL_CACHE_DIR=$MODEL_CACHE_DIR
347+                     export pytestCommand=" ${ pytestCommand } "  
286348                    export NVIDIA_IMEX_CHANNELS=0 
287-                     chmod +x ${ scriptRunNode}  
288- <<<<<<< HEAD 
289-                     ${ srunCmd}  
290-                 """  . stripIndent()
291-                 pipeline. writeFile(file : scriptLaunchDestPath, text : scriptContent)
292-                 Utils . exec(pipeline, script : " chmod +x ${ scriptLaunchDestPath} "  , returnStdout : true )
293-                 Utils . exec(pipeline, script : " sshpass -p '${ remote.passwd}  ' scp -r -p ${ COMMON_SSH_OPTIONS}   ${ scriptLaunchDestPath}   ${ remote.user}  @${ remote.host}  :${ scriptLaunch} "  ,)
294- ====== = 
349+                     chmod +x $scriptRunNode   
295350                """  
296351                if  (nodeCount >  1 ) {
297352                    taskArgs =  [
@@ -341,9 +396,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
341396                    scriptLaunchPathLocal,
342397                    scriptLaunchPathNode
343398                )
344- >>>>>>> e74931618 (Refactor L0 Test code) 
345399            }
346-             stage(' Run Test' ) { 
400+             stage(" [ ${ stageName } ]  Run Pytest "  ) {
347401                Utils . exec(
348402                    pipeline,
349403                    timeout : false ,
@@ -1344,50 +1398,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
13441398
13451399    stage (" [${ stageName}  ] Run Pytest"  )
13461400    {
1347-         echoNodeAndGpuInfo(pipeline, stageName) 
1348-         sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C; fi' 
1349- 
1350-         def extraInternalEnv = "" 
1351-         def pytestTestTimeout = "3600" 
1352- 
1353-         // TRT uses half of the host logic cores for engine building which is bad for multi-GPU machines. 
1354-         extraInternalEnv = "__LUNOWUD=\" -thread_pool_size=${ TESTER_CORES} \" " 
1355-         // CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest 
1356-         extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${ pytestTestTimeout}  " 
1357- 
1401+         //  Test List
13581402        def  testDBList =  renderTestDB(testList, llmSrc, stageName)
1359-         testList = "${ testList}  _${ splitId}  " 
1360-         def testCmdLine = [ 
1361-             "LLM_ROOT=${ llmSrc}  ", 
1362-             "LLM_BACKEND_ROOT=${ llmSrc}  /triton_backend", 
1363-             "LLM_MODELS_ROOT=${ MODEL_CACHE_DIR}  ", 
1364-             "MODEL_CACHE_DIR=${ MODEL_CACHE_DIR}  ", 
1365-             extraInternalEnv, 
1366-             "pytest", 
1367-             "-v", 
1368-             testFilter[(DETAILED_LOG)] ? "-s" : "", 
1369-             "--timeout-method=thread", 
1370-             "--apply-test-list-correction", 
1371-             "--splitting-algorithm least_duration", 
1372-             "--timeout=${ pytestTestTimeout}  ", 
1373-             "--rootdir ${ llmSrc}  /tests/integration/defs", 
1374-             "--test-prefix=${ stageName}  ", 
1375-             "--splits ${ splits}  ", 
1376-             "--group ${ splitId}  ", 
1377-             "--waives-file=${ llmSrc}  /tests/integration/test_lists/waives.txt", 
1378-             "--test-list=${ testDBList}  ", 
1379-             "--output-dir=${ WORKSPACE}  /${ stageName}  /", 
1380-             "--csv=${ WORKSPACE}  /${ stageName}  /report.csv", 
1381-             "--junit-xml ${ WORKSPACE}  /${ stageName}  /results.xml", 
1382-             "-o junit_logging=out-err" 
1383-         ] 
1384-         if (perfMode) { 
1385-             testCmdLine += [ 
1386-                 "--perf", 
1387-                 "--perf-log-formats csv", 
1388-                 "--perf-log-formats yaml" 
1389-             ] 
1390-         } 
1403+ 
13911404        //  Test Coverage
13921405        def  TRTLLM_WHL_PATH  =  sh(returnStdout : true , script : " pip3 show tensorrt_llm | grep Location | cut -d ' ' -f 2"  ). replaceAll(" \\ s"  ," "  )
13931406        sh " echo ${ TRTLLM_WHL_PATH} " 
@@ -1401,13 +1414,19 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
14011414            echo 'source =\n     ${ llmSrc}  /tensorrt_llm/\n     ${ TRTLLM_WHL_PATH}  /tensorrt_llm/' >> ${ coverageConfigFile}  
14021415            cat ${ coverageConfigFile}  
14031416        """  
1404-         testCmdLine += [ 
1405-             "--cov=${ llmSrc}  /examples/", 
1406-             "--cov=${ llmSrc}  /tensorrt_llm/", 
1407-             "--cov=${ TRTLLM_WHL_PATH}  /tensorrt_llm/", 
1408-             "--cov-report=", 
1409-             "--cov-config=${ coverageConfigFile}  " 
1410-         ] 
1417+         echoNodeAndGpuInfo(pipeline, stageName)
1418+         sh ' if [ "$(id -u)" -eq 0 ]; then dmesg -C; fi' 
1419+         def  pytestCommand =  getPytestBaseCommand(
1420+             llmSrc,
1421+             stageName,
1422+             testDBList,
1423+             splits,
1424+             splitId,
1425+             perfMode,
1426+             "${WORKSPACE}/${stageName}",
1427+             TRTLLM_WHL_PATH ,
1428+             coverageConfigFile
1429+         )
14111430
14121431        def  containerPIP_LLM_LIB_PATH =  sh(script : " pip3 show tensorrt_llm | grep \" Location\"  | awk -F\" :\"  '{ gsub(/ /, \"\" , \$ 2); print \$ 2\" /tensorrt_llm/libs\" }'"  , returnStdout : true ). replaceAll(" \\ s"  ," "  )
14131432        def  containerLD_LIBRARY_PATH =  sh(script : " echo \$ {LD_LIBRARY_PATH}"  , returnStdout : true ). replaceAll(" \\ s"  ," "  )
@@ -1428,9 +1447,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
14281447                sh " env | sort" 
14291448                try  {
14301449                    sh """ 
1431-                         rm - rf ${ stageName} /  &&  \
1432-                         cd ${ llmSrc} / tests/  integration/ defs &&  \
1433-                         ${testCmdLine . join( "   " )} 
1450+                         rm -rf $stageName  / && \ 
1451+                         cd $llmSrc  /tests/integration/defs && \ 
1452+                         ${pytestCommand}
14341453                    """  
14351454                } catch  (InterruptedException  e) {
14361455                    throw  e
0 commit comments