
Commit ec35481

[None][infra] Prepare for single GPU GB200 test pipeline (#7073)
Signed-off-by: Yanchao Lu <[email protected]>
1 parent 48155f5 commit ec35481

6 files changed: +254 -77 lines changed


jenkins/L0_Test.groovy

Lines changed: 51 additions & 18 deletions
@@ -7,7 +7,6 @@ import groovy.json.JsonOutput
 import com.nvidia.bloom.KubernetesManager
 import com.nvidia.bloom.Constants
 import com.nvidia.bloom.CloudManager
-import com.nvidia.bloom.KubernetesManager
 import com.nvidia.bloom.SlurmConfig
 import com.nvidia.bloom.SlurmCluster
 import com.nvidia.bloom.SlurmPartition
@@ -219,8 +218,11 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
     SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]
 
-    def nodeName = "${cluster.host}-test-${UUID.randomUUID().toString()}"
-    def nodeSecret = CloudManager.createNode(nodeName)
+    // Create a unique suffix for the node name and workspace
+    String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
+    def nodeName = "${cluster.host}-test-${customSuffix}"
+    def customWorkspace = "/tmp/${nodeName}"
+    def nodeSecret = CloudManager.createNode(nodeName, customWorkspace)
 
     try {
         // Run ssh command to start node in desired cluster via SLURM
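For illustration, a minimal standalone Groovy sketch of how the unique suffix, node name, and per-node workspace in the hunk above fit together; buildTag and clusterHost are made-up stand-ins for env.BUILD_TAG and cluster.host, not values from the real pipeline:

// Standalone sketch; buildTag and clusterHost are hypothetical stand-ins.
def buildTag = "jenkins-LLM-main-L0-123"   // hypothetical env.BUILD_TAG
def clusterHost = "gb200-cluster"          // hypothetical cluster.host
String customSuffix = "${buildTag}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
def nodeName = "${clusterHost}-test-${customSuffix}"
def customWorkspace = "/tmp/${nodeName}"
println nodeName        // e.g. gb200-cluster-test-jenkins-llm-main-l0-123-3fa2bc
println customWorkspace // e.g. /tmp/gb200-cluster-test-jenkins-llm-main-l0-123-3fa2bc

The per-node workspace under /tmp appears intended to give each SLURM-provisioned agent its own directory, so concurrent jobs that land on the same host do not share one workspace.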
@@ -263,12 +265,30 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
         }
 
         if (CloudManager.isNodeOnline(nodeName)) {
-            def dockerArgs = "--gpus ${gpuCount} --cap-add=SYS_ADMIN --ipc=host --security-opt seccomp=unconfined -u root:root -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
+            node(nodeName) {
+                sh """
+                    env | sort
+                    pwd && ls -alh
+                    ls -alh ${env.WORKSPACE}
+                    ls -alh ${env.WORKSPACE_TMP}
+                """
+            }
+
+            def dockerArgs = "--gpus ${gpuCount} " +
+                "--cap-add=SYS_ADMIN " +
+                "--ipc=host " +
+                "--security-opt seccomp=unconfined " +
+                "-u root:root " +
+                "-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
+                "-v /tmp/ccache:${CCACHE_DIR}:rw " +
+                "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
+                "--cap-add syslog"
 
             if (partition.clusterName == "dlcluster") {
                 dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
             }
-            slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, false)
+
+            slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
             executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
         } else {
             echo "The node does not come online in 2 hours, terminating the job"
@@ -560,6 +580,13 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
                 "${UPLOAD_PATH}/test-results/"
             )
             junit(testResults: "${stageName}/results*.xml")
+
+            // Clean up the workspace
+            sh """
+                env | sort
+                pwd && ls -alh
+                rm -rf ./*
+            """
         }
     }
 }
@@ -796,7 +823,7 @@ def echoNodeAndGpuInfo(pipeline, stageName)
 
 def runLLMDocBuild(pipeline, config)
 {
-    // Step 1: cloning tekit source code
+    // Step 1: cloning source code
     sh "pwd && ls -alh"
     sh "env | sort"
     // allow to checkout from forked repo, svc_tensorrt needs to have access to the repo, otherwise clone will fail
@@ -1241,13 +1268,16 @@ def rerunFailedTests(stageName, llmSrc, testCmdLine) {
 
 def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312")
 {
-    // Step 1: create LLM_ROOT dir
-    sh "pwd && ls -alh"
-    // TODO: proper way to clean workspace, maybe save in a folder named with BUILD_ID.
-    // So that it can work with multiple job running in same node
-    sh "rm -rf ./*"
+    // Step 1: create LLM_ROOT dir and clean up the workspace
     def llmRootConfig = "${LLM_ROOT}${config}"
-    sh "mkdir ${llmRootConfig}"
+    sh """
+        env | sort
+        pwd && ls -alh
+        rm -rf ./*
+        mkdir ${llmRootConfig}
+        ls -alh ${env.WORKSPACE}
+        ls -alh ${env.WORKSPACE_TMP}
+    """
 
     def llmPath = sh (script: "realpath ${llmRootConfig}", returnStdout: true).trim()
     def llmSrc = "${llmPath}/TensorRT-LLM/src"
@@ -1765,7 +1795,6 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 2, 4],
         "DGX_H100-4_GPUs-PyTorch-DeepSeek-2": ["dgx-h100-x4", "l0_dgx_h100", 2, 2, 4],
         "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
-        "DGX_H100-4_GPUs-Triton-Post-Merge-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "A10-PyTorch-1": ["a10", "l0_a10", 1, 1],
         "A10-CPP-1": ["a10", "l0_a10", 1, 1],
@@ -1838,6 +1867,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "B200_PCIe-TensorRT-Post-Merge-2": ["b100-ts2", "l0_b200", 2, 2],
         "H100_PCIe-TensorRT-Perf-1": ["h100-cr", "l0_perf", 1, 1],
         "H100_PCIe-PyTorch-Perf-1": ["h100-cr", "l0_perf", 1, 1],
+        "DGX_H200-4_GPUs-Triton-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 1, 4],
         "DGX_H200-8_GPUs-PyTorch-Post-Merge-1": ["dgx-h200-x8", "l0_dgx_h200", 1, 1, 8],
         "DGX_H200-4_GPUs-PyTorch-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 1, 4],
         "DGX_H200-4_GPUs-TensorRT-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 3, 4],
@@ -1890,8 +1920,10 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
     fullSet += SBSATestConfigs.keySet()
 
     SBSASlurmTestConfigs = [
-        "GB200-4_GPUs-PyTorch-1": ["gb200-x4", "l0_gb200", 1, 1, 4],
-        "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4", "l0_gb200", 1, 1, 4],
+        // Not used in the pipeline now
+        // "GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 3],
+        "GB200-4_GPUs-PyTorch-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
+        "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
     ]
     fullSet += SBSASlurmTestConfigs.keySet()
 
@@ -1903,7 +1935,6 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 7, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-5": ["gb200-multi-node", "l0_gb200_multi_nodes", 5, 7, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-6": ["gb200-multi-node", "l0_gb200_multi_nodes", 6, 7, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-7": ["gb200-multi-node", "l0_gb200_multi_nodes", 7, 7, 8, 2],
     ]
     fullSet += multiNodesSBSAConfigs.keySet()
 
@@ -2123,7 +2154,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
             echo "###### Check pip install Start ######"
             withEnv(libEnv) {
                 sh "env | sort"
-                checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
+                timeout(time: 1, unit: 'HOURS') {
+                    checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
+                }
             }
             echo "###### Run LLMAPI tests Start ######"
             def config = VANILLA_CONFIG
@@ -2458,7 +2491,7 @@ pipeline {
 
         def testPhase2StageName = env.testPhase2StageName
         if (testPhase2StageName) {
-            def dgxSigns = ["DGX_H100", "DGX_H200", "GB200", "DGX_B200", "RTXPro6000-4_GPUs"]
+            def dgxSigns = ["2_GPUs", "4_GPUs", "8_GPUs"]
            singleGpuJobs = parallelJobs.findAll{!dgxSigns.any{sign -> it.key.contains(sign)}}
            dgxJobs = parallelJobs.findAll{dgxSigns.any{sign -> it.key.contains(sign)}}
        }
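The reworked dgxSigns list routes jobs by GPU-count markers in the stage name rather than by platform prefixes. A small self-contained sketch of that partitioning, using hypothetical stage names in place of the pipeline's real parallelJobs map:

// Illustrative only: fake stage names standing in for the real parallelJobs map.
def dgxSigns = ["2_GPUs", "4_GPUs", "8_GPUs"]
def parallelJobs = [
    "A10-PyTorch-1"                        : "single-gpu job",
    "GB200-4_GPUs-PyTorch-1"               : "multi-gpu job",
    "DGX_H200-8_GPUs-PyTorch-Post-Merge-1" : "multi-gpu job",
]
// Same findAll/any pattern as in the hunk above.
def singleGpuJobs = parallelJobs.findAll { !dgxSigns.any { sign -> it.key.contains(sign) } }
def dgxJobs = parallelJobs.findAll { dgxSigns.any { sign -> it.key.contains(sign) } }
assert singleGpuJobs.keySet() == ["A10-PyTorch-1"] as Set
assert dgxJobs.keySet() == ["GB200-4_GPUs-PyTorch-1", "DGX_H200-8_GPUs-PyTorch-Post-Merge-1"] as Set

Under this rule, a single-GPU GB200 stage such as the commented-out "GB200-PyTorch-1" carries no GPU-count marker, so it would fall into the single-GPU group rather than the multi-GPU group.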

jenkins/scripts/slurm_run.sh

Lines changed: 2 additions & 1 deletion
@@ -34,7 +34,7 @@ else
     done
 fi
 testList="$testList_$splitId"
-export CPP_TEST_TIMEOUT_OVERRIDDEN=7200
+export CPP_TEST_TIMEOUT_OVERRIDDEN=$pytestTestTimeout
 export LLM_ROOT=$llmSrcNode
 export LLM_MODELS_ROOT=$MODEL_CACHE_DIR
 export UCX_TLS=^gdr_copy
@@ -43,6 +43,7 @@ testCmdLines=(
     "$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
     "pytest"
     "-v"
+    "--timeout-method=thread"
     "--timeout=$pytestTestTimeout"
     "--test-list=$testListPathNode"
     "--waives-file=$waivesListPathNode"

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 0 additions & 15 deletions
@@ -209,18 +209,3 @@ l0_dgx_h100:
   - cpp/test_multi_gpu.py::TestDisagg::test_spawn_orchestrator[llama-ucx_kvcache-90]
   - cpp/test_multi_gpu.py::TestDisagg::test_orchestrator_params[llama-nixl_kvcache-90] TIMEOUT (90)
   - cpp/test_multi_gpu.py::TestDisagg::test_spawn_orchestrator[llama-nixl_kvcache-90]
-- condition:
-    ranges:
-      system_gpu_count:
-        gte: 4
-        lte: 4
-    wildcards:
-      gpu:
-      - '*h100*'
-      linux_distribution_name: ubuntu*
-    terms:
-      stage: post_merge
-      backend: triton
-      auto_trigger: others
-  tests:
-  - triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm]

tests/integration/test_lists/test-db/l0_dgx_h200.yml

Lines changed: 16 additions & 0 deletions
@@ -166,3 +166,19 @@ l0_dgx_h200:
   - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float16-enable_gemm_plugin-enable_attention_plugin-disable_paged_kv_cache-tp:2-pp:2-nb:1-disable_fp8]
   - examples/test_gpt.py::test_llm_gpt2_next_prompt_tuning[use_py_session-tp2]
   - unittest/llmapi/apps/_test_openai_multi_gpu.py -m "part0"
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*h200*'
+      linux_distribution_name: ubuntu*
+      cpu: x86_64
+    terms:
+      stage: post_merge
+      backend: triton
+  tests:
+  # ------------- Triton tests ---------------
+  - triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm]
