Skip to content

Commit c19f3c7

Browse files
committed
refactor pytest command generation
With this refactor, the Slurm job and the Blossom job both generate the pytest command from the same method. Signed-off-by: Yuanjing Xue <[email protected]>
1 parent dd681a7 commit c19f3c7

File tree

2 files changed

+153
-122
lines changed

2 files changed

+153
-122
lines changed

jenkins/L0_Test.groovy

Lines changed: 102 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,66 @@ def getNodeArgs(int nodeCount, int gpuCount) {
170170
]
171171
}
172172

173+
def getPytestBaseCommand(
174+
String llmSrc,
175+
String stageName,
176+
String testDBList,
177+
int splits,
178+
int split_id,
179+
Boolean perfMode,
180+
String outputPath,
181+
String trtllmWheelPath,
182+
String coverageConfigFile,
183+
String pytestUtil = ""
184+
) {
185+
def extraInternalEnv = ""
186+
def pytestTestTimeout = "3600"
187+
188+
// TRT uses half of the host logic cores for engine building which is bad for multi-GPU machines.
189+
extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\""
190+
// CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest
191+
extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}"
192+
193+
def testCmdLine = [
194+
"LLM_ROOT=${llmSrc}",
195+
"LLM_BACKEND_ROOT=${llmSrc}/triton_backend",
196+
"LLM_MODELS_ROOT=${MODEL_CACHE_DIR}",
197+
"MODEL_CACHE_DIR=${MODEL_CACHE_DIR}",
198+
extraInternalEnv,
199+
pytestUtil,
200+
"pytest",
201+
"-v",
202+
testFilter[(DETAILED_LOG)] ? "-s" : "",
203+
"--timeout-method=thread",
204+
"--apply-test-list-correction",
205+
"--splitting-algorithm least_duration",
206+
"--timeout=${pytestTestTimeout}",
207+
"--rootdir ${llmSrc}/tests/integration/defs",
208+
"--test-prefix=${stageName}",
209+
"--splits ${splits}",
210+
"--group ${split_id}",
211+
"--waives-file=${llmSrc}/tests/integration/test_lists/waives.txt",
212+
"--output-dir=${outputPath}/",
213+
"--csv=${outputPath}/report.csv",
214+
"--junit-xml ${outputPath}/results.xml",
215+
"-o junit_logging=out-err",
216+
"--cov=${llmSrc}/examples/",
217+
"--cov=${llmSrc}/tensorrt_llm/",
218+
"--cov=${trtllmWheelPath}/tensorrt_llm/",
219+
"--cov-report=",
220+
"--cov-config=${coverageConfigFile}",
221+
"--test-list=${testDBList}",
222+
]
223+
if (perfMode) {
224+
testCmdLine += [
225+
"--perf",
226+
"--perf-log-formats csv",
227+
"--perf-log-formats yaml"
228+
]
229+
}
230+
return testCmdLine.join(" ")
231+
}
232+
173233
def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=1, skipInstallWheel=false, cpver="cp312")
174234
{
175235
SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
@@ -207,9 +267,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
207267
def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
208268
def scriptLaunchPathNode = "${jobWorkspace}/slurm_launch.sh"
209269
def isAarch64 = config.contains("aarch64")
210-
def pytestTestTimeout = "7200"
211270

212-
stage('Prepare Testing') {
271+
stage("[${stageName}] Initializing Test") {
213272
// Create Job Workspace folder in Frontend Node
214273
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' ssh ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host} 'mkdir -p ${jobWorkspace}'",)
215274

@@ -228,15 +287,6 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
228287
true
229288
)
230289

231-
// Upload waives.txt to Frontend node
232-
def waivesListPathLocal = "${llmSrcLocal}/tests/integration/test_lists/waives.txt"
233-
Utils.copyScriptToRemoteHost(
234-
pipeline,
235-
remote,
236-
waivesListPathLocal,
237-
waivesListPathNode
238-
)
239-
240290
// Generate Test List and Upload to Frontend Node
241291
def makoArgs = getMakoArgsFromStageName(stageName, true)
242292
// TODO: currently the options will only be processed if the first
@@ -261,37 +311,42 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
261311
}
262312
taskArgs = [
263313
*taskArgs,
264-
"--container-image=${container}",
314+
"--container-image=$container",
265315
"--container-workdir=/home/svc_tensorrt/bloom/scripts",
266-
"--container-mounts=${mounts}",
316+
"--container-mounts=$mounts",
267317
"--container-env=NVIDIA_IMEX_CHANNELS"
268318
]
269319

320+
String pytestUtil = ""
321+
if (nodeCount > 1) {
322+
pytestUtil = "$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
323+
}
324+
325+
def pytestCommand = getPytestBaseCommand(
326+
llmSrcNode,
327+
stageName,
328+
testListPathNode,
329+
splits,
330+
splitId,
331+
perfMode,
332+
jobWorkspace,
333+
"__PLACEHOLDER_TRTLLM_WHL_PATH__",
334+
"__PLACEHOLDER_coverageConfigFile__",
335+
pytestUtil
336+
)
337+
270338
def scriptContent = """#!/bin/bash
271339
export jobWorkspace=$jobWorkspace
272340
export tarName=$tarName
273341
export llmTarfile=$llmTarfile
274342
export llmSrcNode=$llmSrcNode
275343
export stageName=$stageName
276-
export testList=$testList
277-
export testListPathNode=$testListPathNode
278-
export waivesListPathNode=$waivesListPathNode
279-
export pytestTestTimeout=$pytestTestTimeout
280-
export splits=$splits
281-
export splitId=$splitId
282344
export perfMode=$perfMode
283345
export resourcePathNode=$resourcePathNode
284346
export nodeCount=$nodeCount
285-
export MODEL_CACHE_DIR=$MODEL_CACHE_DIR
347+
export pytestCommand="${pytestCommand}"
286348
export NVIDIA_IMEX_CHANNELS=0
287-
chmod +x ${scriptRunNode}
288-
<<<<<<< HEAD
289-
${srunCmd}
290-
""".stripIndent()
291-
pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent)
292-
Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)
293-
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",)
294-
=======
349+
chmod +x $scriptRunNode
295350
"""
296351
if (nodeCount > 1) {
297352
taskArgs = [
@@ -341,9 +396,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
341396
scriptLaunchPathLocal,
342397
scriptLaunchPathNode
343398
)
344-
>>>>>>> e74931618 (Refactor L0 Test code)
345399
}
346-
stage('Run Test') {
400+
stage("[${stageName}] Run Pytest") {
347401
Utils.exec(
348402
pipeline,
349403
timeout: false,
@@ -1344,50 +1398,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
13441398

13451399
stage ("[${stageName}] Run Pytest")
13461400
{
1347-
echoNodeAndGpuInfo(pipeline, stageName)
1348-
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C; fi'
1349-
1350-
def extraInternalEnv = ""
1351-
def pytestTestTimeout = "3600"
1352-
1353-
// TRT uses half of the host logic cores for engine building which is bad for multi-GPU machines.
1354-
extraInternalEnv = "__LUNOWUD=\"-thread_pool_size=${TESTER_CORES}\""
1355-
// CPP test execution is timing out easily, so we always override its internal timeout to the same value as pytest
1356-
extraInternalEnv += " CPP_TEST_TIMEOUT_OVERRIDDEN=${pytestTestTimeout}"
1357-
1401+
// Test List
13581402
def testDBList = renderTestDB(testList, llmSrc, stageName)
1359-
testList = "${testList}_${splitId}"
1360-
def testCmdLine = [
1361-
"LLM_ROOT=${llmSrc}",
1362-
"LLM_BACKEND_ROOT=${llmSrc}/triton_backend",
1363-
"LLM_MODELS_ROOT=${MODEL_CACHE_DIR}",
1364-
"MODEL_CACHE_DIR=${MODEL_CACHE_DIR}",
1365-
extraInternalEnv,
1366-
"pytest",
1367-
"-v",
1368-
testFilter[(DETAILED_LOG)] ? "-s" : "",
1369-
"--timeout-method=thread",
1370-
"--apply-test-list-correction",
1371-
"--splitting-algorithm least_duration",
1372-
"--timeout=${pytestTestTimeout}",
1373-
"--rootdir ${llmSrc}/tests/integration/defs",
1374-
"--test-prefix=${stageName}",
1375-
"--splits ${splits}",
1376-
"--group ${splitId}",
1377-
"--waives-file=${llmSrc}/tests/integration/test_lists/waives.txt",
1378-
"--test-list=${testDBList}",
1379-
"--output-dir=${WORKSPACE}/${stageName}/",
1380-
"--csv=${WORKSPACE}/${stageName}/report.csv",
1381-
"--junit-xml ${WORKSPACE}/${stageName}/results.xml",
1382-
"-o junit_logging=out-err"
1383-
]
1384-
if (perfMode) {
1385-
testCmdLine += [
1386-
"--perf",
1387-
"--perf-log-formats csv",
1388-
"--perf-log-formats yaml"
1389-
]
1390-
}
1403+
13911404
// Test Coverage
13921405
def TRTLLM_WHL_PATH = sh(returnStdout: true, script: "pip3 show tensorrt_llm | grep Location | cut -d ' ' -f 2").replaceAll("\\s","")
13931406
sh "echo ${TRTLLM_WHL_PATH}"
@@ -1401,13 +1414,19 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
14011414
echo 'source =\n ${llmSrc}/tensorrt_llm/\n ${TRTLLM_WHL_PATH}/tensorrt_llm/' >> ${coverageConfigFile}
14021415
cat ${coverageConfigFile}
14031416
"""
1404-
testCmdLine += [
1405-
"--cov=${llmSrc}/examples/",
1406-
"--cov=${llmSrc}/tensorrt_llm/",
1407-
"--cov=${TRTLLM_WHL_PATH}/tensorrt_llm/",
1408-
"--cov-report=",
1409-
"--cov-config=${coverageConfigFile}"
1410-
]
1417+
echoNodeAndGpuInfo(pipeline, stageName)
1418+
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C; fi'
1419+
def pytestCommand = getPytestBaseCommand(
1420+
llmSrc,
1421+
stageName,
1422+
testDBList,
1423+
splits,
1424+
split_id,
1425+
perfMode,
1426+
"${WORKSPACE}/${stageName}",
1427+
TRTLLM_WHL_PATH,
1428+
coverageConfigFile
1429+
)
14111430

14121431
def containerPIP_LLM_LIB_PATH = sh(script: "pip3 show tensorrt_llm | grep \"Location\" | awk -F\":\" '{ gsub(/ /, \"\", \$2); print \$2\"/tensorrt_llm/libs\"}'", returnStdout: true).replaceAll("\\s","")
14131432
def containerLD_LIBRARY_PATH = sh(script: "echo \${LD_LIBRARY_PATH}", returnStdout: true).replaceAll("\\s","")
@@ -1428,9 +1447,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
14281447
sh "env | sort"
14291448
try {
14301449
sh """
1431-
rm -rf ${stageName}/ && \
1432-
cd ${llmSrc}/tests/integration/defs && \
1433-
${testCmdLine.join(" ")}
1450+
rm -rf $stageName/ && \
1451+
cd $llmSrc/tests/integration/defs && \
1452+
$pytestCommand
14341453
"""
14351454
} catch (InterruptedException e) {
14361455
throw e

jenkins/scripts/slurm_run.sh

Lines changed: 51 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,28 @@
22
cd $resourcePathNode
33
llmSrcNode=$resourcePathNode/TensorRT-LLM/src
44

5+
set_value_in_command() {
6+
# Parameters
7+
local key="$1"
8+
local value="$2"
9+
local command="$3"
10+
11+
# Transform the key
12+
local placeholder="__PLACEHOLDER_${key}__"
13+
14+
# Check if placeholder exists
15+
if [[ "$command" != *"$placeholder"* ]]; then
16+
echo "Error: placeholder '$placeholder' not found in the command" >&2
17+
return 1
18+
fi
19+
20+
# Replace all occurrences
21+
local result="${command//${placeholder}/${value}}"
22+
23+
# Return the result
24+
echo "$result"
25+
}
26+
527
resultsPath=$jobWorkspace/results
628
mkdir -p $resultsPath
729
if [ $SLURM_LOCALID -eq 0 ]; then
@@ -15,51 +37,30 @@ if [ $SLURM_LOCALID -eq 0 ]; then
1537
cd $resourcePathNode && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
1638
git config --global --add safe.directory "*"
1739
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
18-
echo "HOST_NODE_NAME = $HOST_NODE_NAME ; GPU_UUIDS = =$gpuUuids ; STAGE_NAME = $stageName"
40+
echo "HOST_NODE_NAME = $(hostname) ; GPU_UUIDS = =$gpuUuids ; STAGE_NAME = $stageName"
1941
touch install_lock.lock
2042
else
2143
while [ ! -f install_lock.lock ]; do
2244
sleep 5
2345
done
2446
fi
25-
testList="$testList_$splitId"
26-
export CPP_TEST_TIMEOUT_OVERRIDDEN=7200
27-
export LLM_ROOT=$llmSrcNode
28-
export LLM_MODELS_ROOT=$MODEL_CACHE_DIR
47+
2948
export UCX_TLS=^gdr_copy
3049
cd $llmSrcNode/tests/integration/defs
3150
testCmdLines=()
3251
if [ $nodeCount -gt 1 ]; then
3352
testCmdLines+=(
34-
"$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
35-
)
36-
fi
37-
testCmdLines+=(
38-
"pytest"
39-
"-v"
40-
"--timeout=$pytestTestTimeout"
41-
"--test-list=$testListPathNode"
42-
"--waives-file=$waivesListPathNode"
43-
"--rootdir $llmSrcNode/tests/integration/defs"
44-
"--test-prefix=$stageName"
45-
"--splits $splits"
46-
"--group $splitId"
47-
"--output-dir=$jobWorkspace/"
48-
"--csv=$resultsPath/report.csv"
49-
"--junit-xml $resultsPath/results.xml"
50-
"-o junit_logging=out-err"
51-
)
52-
if [ "$perfMode" = "true" ]; then
53-
testCmdLines+=(
54-
"--perf"
55-
"--perf-log-formats csv"
56-
"--perf-log-formats yaml"
5753
)
54+
pytestCommand=$(set_value_in_command "pytest" "$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch" "$pytestCommand")
5855
fi
56+
57+
# get trtllm wheel path and add to pytest command
5958
trtllmWhlPath=$(pip3 show tensorrt_llm | grep Location | cut -d ' ' -f 2)
6059
trtllmWhlPath=$(echo "$trtllmWhlPath" | sed 's/[[:space:]]+/_/g')
6160
echo "TRTLLM WHEEL PATH: $trtllmWhlPath"
62-
# generate .coveragerc in workspace
61+
pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytestCommand")
62+
63+
# generate .coveragerc in workspace and add file path to pytest command
6364
cat << EOF > $jobWorkspace/.coveragerc
6465
[run]
6566
branch = True
@@ -69,14 +70,8 @@ source =
6970
$llmSrcNode/tensorrt_llm/
7071
$trtllmWhlPath/tensorrt_llm/
7172
EOF
73+
pytestCommand=$(set_value_in_command "coverageConfigFile" "$jobWorkspace/.coveragerc" "$pytestCommand")
7274

73-
testCmdLines+=(
74-
"--cov=$llmSrcNode/examples/"
75-
"--cov=$llmSrcNode/tensorrt_llm/"
76-
"--cov=$trtllmWhlPath/tensorrt_llm/"
77-
"--cov-report="
78-
"--cov-config=$jobWorkspace/.coveragerc"
79-
)
8075
containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
8176
containerPipLLMLibPath=$(echo "$containerPipLLMLibPath" | sed 's/[[:space:]]+/_/g')
8277
containerLDLibPath=$LD_LIBRARY_PATH
@@ -89,7 +84,24 @@ export LD_LIBRARY_PATH=$containerLDLibPath
8984
echo "Library Path:"
9085
echo "$LD_LIBRARY_PATH"
9186
env | sort
92-
fullCmd="${testCmdLines[*]}"
9387
echo "Running: $testCase"
94-
echo "Full Command: $fullCmd"
95-
eval $fullCmd
88+
echo "Full Command: $pytestCommand"
89+
eval $pytestCommand
90+
91+
if [ "$perfMode" = "true" ]; then
92+
if [[ "$stageName" == *PyTorch* ]]; then
93+
basePerfFilename="base_perf_pytorch.csv"
94+
else
95+
basePerfFilename="base_perf.csv"
96+
fi
97+
basePerfPath="$llmSrcNode/tests/integration/defs/perf/$basePerfFilename"
98+
echo "Check perf result"
99+
python3 $llmSrcNode/tests/integration/defs/perf/sanity_perf_check.py \
100+
$stageName/perf_script_test_results.csv \
101+
$basePerfPath
102+
echo "Check perf report"
103+
python3 $llmSrcNode/tests/integration/defs/perf/create_perf_comparison_report.py \
104+
--output_path $stageName/report.pdf \
105+
--files $stageName/perf_script_test_results.csv \
106+
$basePerfPath
107+
fi

0 commit comments

Comments
 (0)