Skip to content

Commit 3cbc23f

Browse files
infra: [TRTLLM-5250] Add sanity check stage for ngc-release images (Build wheels for devel image) (#4656)
Signed-off-by: ZhanruiSunCh <[email protected]> Signed-off-by: Zhanrui Sun <[email protected]> Co-authored-by: Yanchao Lu <[email protected]>
1 parent 3efad2e commit 3cbc23f

File tree

3 files changed

+227
-12
lines changed

3 files changed

+227
-12
lines changed

jenkins/BuildDockerImage.groovy

Lines changed: 122 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ withCredentials([string(credentialsId: 'default-llm-repo', variable: 'DEFAULT_LL
1212
LLM_REPO = env.gitlabSourceRepoHttpUrl ? env.gitlabSourceRepoHttpUrl : "${DEFAULT_LLM_REPO}"
1313
}
1414

15+
ARTIFACT_PATH = env.artifactPath ? env.artifactPath : "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}"
1516
UPLOAD_PATH = env.uploadPath ? env.uploadPath : "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}"
1617

1718
LLM_ROOT = "llm"
@@ -25,6 +26,8 @@ LLM_SHORT_COMMIT = env.gitlabCommit ? env.gitlabCommit.substring(0, 7) : "undefi
2526

2627
LLM_DEFAULT_TAG = env.defaultTag ?: "${LLM_SHORT_COMMIT}-${LLM_BRANCH_TAG}-${BUILD_NUMBER}"
2728

29+
RUN_SANITY_CHECK = params.runSanityCheck ?: false
30+
2831
BUILD_JOBS = "32"
2932
BUILD_JOBS_RELEASE_X86_64 = "32"
3033
BUILD_JOBS_RELEASE_SBSA = "32"
@@ -37,10 +40,13 @@ def GITHUB_PR_API_URL = "github_pr_api_url"
3740
def CACHED_CHANGED_FILE_LIST = "cached_changed_file_list"
3841
@Field
3942
def ACTION_INFO = "action_info"
43+
@Field
44+
def IMAGE_KEY_TO_TAG = "image_key_to_tag"
4045
def globalVars = [
4146
(GITHUB_PR_API_URL): null,
4247
(CACHED_CHANGED_FILE_LIST): null,
4348
(ACTION_INFO): null,
49+
(IMAGE_KEY_TO_TAG): [:],
4450
]
4551

4652
@Field
@@ -203,15 +209,11 @@ def buildImage(config, imageKeyToTag)
203209
def dependentImageWithTag = "${IMAGE_NAME}/${dependent.dockerfileStage}:${dependentTag}"
204210
def customImageWithTag = "${IMAGE_NAME}/${dockerfileStage}:${customTag}"
205211

206-
if (target == "ngc-release") {
207-
if (params.triggerType == "post-merge") {
208-
echo "Use NGC artifacts for post merge build"
209-
dependentImageWithTag = "${NGC_IMAGE_NAME}:${dependentTag}"
210-
imageWithTag = "${NGC_IMAGE_NAME}:${tag}"
211-
customImageWithTag = "${NGC_IMAGE_NAME}:${customTag}"
212-
}
213-
imageKeyToTag["NGC Devel Image ${config.arch}"] = dependentImageWithTag
214-
imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag
212+
if (target == "ngc-release" && params.triggerType == "post-merge") {
213+
echo "Use NGC artifacts for post merge build"
214+
dependentImageWithTag = "${NGC_IMAGE_NAME}:${dependentTag}"
215+
imageWithTag = "${NGC_IMAGE_NAME}:${tag}"
216+
customImageWithTag = "${NGC_IMAGE_NAME}:${customTag}"
215217
}
216218

217219
args += " GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote"
@@ -266,6 +268,9 @@ def buildImage(config, imageKeyToTag)
266268
"""
267269
}
268270
args += " DEVEL_IMAGE=${dependentImageWithTag}"
271+
if (target == "ngc-release") {
272+
imageKeyToTag["NGC Devel Image ${config.arch}"] = dependentImageWithTag
273+
}
269274
}
270275
}
271276

@@ -290,6 +295,9 @@ def buildImage(config, imageKeyToTag)
290295
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
291296
"""
292297
}
298+
if (target == "ngc-release") {
299+
imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag
300+
}
293301
}
294302

295303
if (customTag) {
@@ -429,6 +437,17 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) {
429437
}
430438

431439

440+
// Returns the parameter map shared by downstream jobs triggered from this
// pipeline: the source repository URL, the commit under test, and the
// artifact/upload locations resolved at the top of this script.
def getCommonParameters()
{
    def commonParameters = [:]
    commonParameters['gitlabSourceRepoHttpUrl'] = LLM_REPO
    commonParameters['gitlabCommit'] = env.gitlabCommit
    commonParameters['artifactPath'] = ARTIFACT_PATH
    commonParameters['uploadPath'] = UPLOAD_PATH
    return commonParameters
}
449+
450+
432451
pipeline {
433452
agent {
434453
kubernetes createKubernetesPodConfig("agent")
@@ -494,7 +513,100 @@ pipeline {
494513
}
495514
}
496515
}
497-
stage("Register Images for Security Checks") {
516+
// Blocks the pipeline until every required build tarball is reachable under
// UPLOAD_PATH, polling once a minute and failing the stage after an hour.
// Only runs when the sanity check was requested.
stage("Wait for Build Jobs Complete") {
    when {
        expression {
            RUN_SANITY_CHECK
        }
    }
    steps {
        script {
            container("python3") {
                // wget is used below to probe the artifact URLs.
                trtllm_utils.llmExecStepWithRetry(this, script: "apt-get update && apt-get -y install wget")

                def baseUrl = "https://urm.nvidia.com/artifactory/${UPLOAD_PATH}/"
                def expectedArtifacts = [
                    "TensorRT-LLM-GH200.tar.gz",
                    "TensorRT-LLM.tar.gz"
                ]
                def timeoutMinutes = 60
                def pollSeconds = 60

                echo "Waiting for build artifacts..."
                echo "Required files: ${expectedArtifacts}"

                def pollStart = System.currentTimeMillis()
                def timeoutMs = timeoutMinutes * 60 * 1000

                while ((System.currentTimeMillis() - pollStart) < timeoutMs) {
                    def stillMissing = []

                    for (artifact in expectedArtifacts) {
                        // A non-zero exit status from the probe is treated as "not uploaded yet".
                        def probeStatus = sh(
                            script: "wget --spider --quiet --timeout=30 --tries=1 '${baseUrl}${artifact}'",
                            returnStatus: true
                        )

                        if (probeStatus != 0) {
                            stillMissing.add(artifact)
                        }
                    }

                    if (stillMissing.isEmpty()) {
                        echo "All build artifacts are ready!"
                        // Leaves the container closure, which ends the stage successfully.
                        return
                    }

                    def minutesSoFar = (System.currentTimeMillis() - pollStart) / (60 * 1000)
                    echo "Waiting... (${minutesSoFar.intValue()} minutes elapsed)"
                    echo "Missing files: ${stillMissing}"
                    sleep(pollSeconds)
                }

                // Reached only when the loop ran out of time without seeing every artifact.
                def totalMinutes = (System.currentTimeMillis() - pollStart) / (60 * 1000)
                error "Timeout waiting for build artifacts (${totalMinutes.intValue()} minutes)"
            }
        }
    }
}
575+
// Triggers the BuildDockerImageSanityTest job with the image tags collected
// in imageKeyToTag and fails this stage unless that job reports SUCCESS.
// Only runs when the sanity check was requested.
stage("Sanity Check for NGC Images") {
    when {
        expression {
            RUN_SANITY_CHECK
        }
    }
    steps {
        script {
            // The image tags travel to the downstream job inside the serialized globalVars.
            globalVars[IMAGE_KEY_TO_TAG] = imageKeyToTag
            String globalVarsJson = writeJSON returnText: true, json: globalVars

            def parameters = getCommonParameters() + [
                'enableFailFast': false,
                'globalVars': globalVarsJson,
            ]

            echo "Trigger BuildDockerImageSanityTest job, params: ${parameters}"

            // propagate: false keeps the downstream result from being applied
            // automatically; it is checked explicitly below instead.
            def handle = build(
                job: "/LLM/helpers/BuildDockerImageSanityTest",
                parameters: trtllm_utils.toBuildParameters(parameters),
                propagate: false,
            )
            echo "Triggered job: ${handle.absoluteUrl}"

            if (handle.result != "SUCCESS") {
                error "Downstream job did not succeed"
            }
        }
    }
}
609+
stage("Register NGC Images for Security Checks") {
498610
when {
499611
expression {
500612
return params.nspect_id && params.action == "push"

jenkins/L0_MergeRequest.groovy

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,10 +142,13 @@ def GITHUB_PR_API_URL = "github_pr_api_url"
142142
def CACHED_CHANGED_FILE_LIST = "cached_changed_file_list"
143143
@Field
144144
def ACTION_INFO = "action_info"
145+
@Field
146+
def IMAGE_KEY_TO_TAG = "image_key_to_tag"
145147
def globalVars = [
146148
(GITHUB_PR_API_URL): gitlabParamsFromBot.get('github_pr_api_url', null),
147149
(CACHED_CHANGED_FILE_LIST): null,
148150
(ACTION_INFO): gitlabParamsFromBot.get('action_info', null),
151+
(IMAGE_KEY_TO_TAG): [:],
149152
]
150153

151154
// If not running all test stages in the L0 pre-merge, we will not update the GitLab status at the end.
@@ -1091,6 +1094,7 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
10911094
'branch': branch,
10921095
'action': "push",
10931096
'triggerType': env.JOB_NAME ==~ /.*PostMerge.*/ ? "post-merge" : "pre-merge",
1097+
'runSanityCheck': true,
10941098
]
10951099

10961100
launchJob("/LLM/helpers/BuildDockerImages", false, enableFailFast, globalVars, "x86_64", additionalParameters)

jenkins/L0_Test.groovy

Lines changed: 101 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,10 @@ TESTER_MEMORY = "96Gi"
9595
CCACHE_DIR="/mnt/sw-tensorrt-pvc/scratch.trt_ccache/llm_ccache"
9696
MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"
9797

98+
// ENABLE_NGC_DEVEL_IMAGE_TEST is currently disabled in the Jenkins BuildDockerImageSanityTest job config
99+
ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
100+
ENABLE_NGC_RELEASE_IMAGE_TEST = params.enableNgcReleaseImageTest ?: false
101+
98102
def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String stageName){
99103
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
100104
def remote = [
@@ -474,10 +478,13 @@ def GITHUB_PR_API_URL = "github_pr_api_url"
474478
def CACHED_CHANGED_FILE_LIST = "cached_changed_file_list"
475479
@Field
476480
def ACTION_INFO = "action_info"
481+
@Field
482+
def IMAGE_KEY_TO_TAG = "image_key_to_tag"
477483
def globalVars = [
478484
(GITHUB_PR_API_URL): null,
479485
(CACHED_CHANGED_FILE_LIST): null,
480486
(ACTION_INFO): null,
487+
(IMAGE_KEY_TO_TAG): [:],
481488
]
482489

483490
String getShortenedJobName(String path)
@@ -490,6 +497,7 @@ String getShortenedJobName(String path)
490497
"L1_Custom": "l1-cus",
491498
"L1_Nightly": "l1-nt",
492499
"L1_Stable": "l1-stb",
500+
"BuildDockerImageSanityTest": "img-check",
493501
]
494502
def parts = path.split('/')
495503
// Apply nameMapping to the last part (jobName)
@@ -2264,6 +2272,90 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
22642272
return parallelJobsFiltered
22652273
}
22662274

2275+
2276+
2277+
/**
 * Builds the map of sanity-test jobs for the NGC devel and release images.
 *
 * Starts from a fixed table of four candidate configurations (devel/release x
 * amd64/arm64), drops the groups disabled through ENABLE_NGC_DEVEL_IMAGE_TEST /
 * ENABLE_NGC_RELEASE_IMAGE_TEST, and keeps only the entries for which
 * globalVars[IMAGE_KEY_TO_TAG] supplies an image.
 *
 * @param pipeline   pipeline object forwarded to the pod-launch and test helpers
 * @param globalVars map whose IMAGE_KEY_TO_TAG entry maps a config key
 *                   (e.g. "NGC Release Image amd64") to the image to test
 * @return map from stage name to a closure that runs that image's sanity test
 */
def launchTestJobsForImagesSanityCheck(pipeline, globalVars) {
    def testConfigs = [
        "NGC Devel Image amd64": [
            name: "NGC-Devel-Image-amd64-Sanity-Test",
            k8sArch: "amd64",
            wheelInstalled: false,
            config: VANILLA_CONFIG,
        ],
        "NGC Devel Image arm64": [
            name: "NGC-Devel-Image-arm64-Sanity-Test",
            k8sArch: "arm64",
            wheelInstalled: false,
            config: LINUX_AARCH64_CONFIG,
        ],
        "NGC Release Image amd64": [
            name: "NGC-Release-Image-amd64-Sanity-Test-A10",
            gpuType: "a10",
            k8sArch: "amd64",
            wheelInstalled: true,
            config: VANILLA_CONFIG,
        ],
        "NGC Release Image arm64": [
            name: "NGC-Release-Image-arm64-Sanity-Test-GH200",
            gpuType: "gh200",
            k8sArch: "arm64",
            wheelInstalled: true,
            config: LINUX_AARCH64_CONFIG,
        ],
    ]
    if (!ENABLE_NGC_DEVEL_IMAGE_TEST) {
        ["NGC Devel Image amd64", "NGC Devel Image arm64"].each { key ->
            testConfigs.remove(key)
        }
        echo "NGC Devel Image test is disabled."
    }
    if (!ENABLE_NGC_RELEASE_IMAGE_TEST) {
        ["NGC Release Image amd64", "NGC Release Image arm64"].each { key ->
            testConfigs.remove(key)
        }
        echo "NGC Release Image test is disabled."
    }

    // Attach the image to each remaining config using the map from globalVars.
    def imageKeyToTag = globalVars[IMAGE_KEY_TO_TAG]
    testConfigs.each { key, config ->
        if (imageKeyToTag && imageKeyToTag[key]) {
            config.image = imageKeyToTag[key]
        }
    }
    // Configs without an image cannot be tested; drop them.
    testConfigs = testConfigs.findAll { key, config ->
        return config.image != null
    }

    echo "Filtered test configs with images:"
    println testConfigs

    def testJobs = testConfigs.collectEntries { key, values -> [values.name, {
        stage(values.name) {
            if (values.wheelInstalled) {
                // The wheel is already installed in the image: run the sanity test list on a GPU pod.
                echo "Run ${values.name} sanity test."
                // Declared with def so the pod spec stays local to this job closure
                // instead of being written to the script binding shared by all jobs.
                def imageSanitySpec = createKubernetesPodConfig(values.image, values.gpuType, values.k8sArch)
                trtllm_utils.launchKubernetesPod(pipeline, imageSanitySpec, "trt-llm", {
                    sh "env | sort"
                    trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y git rsync curl")
                    runLLMTestlistOnPlatform(pipeline, values.gpuType, "l0_sanity_check", values.config, false, values.name , 1, 1, true, null)
                })
            } else {
                // No wheel in the image: run the build inside it on a "build" pod.
                def imageSanitySpec = createKubernetesPodConfig(values.image, "build", values.k8sArch)
                trtllm_utils.launchKubernetesPod(pipeline, imageSanitySpec, "trt-llm", {
                    sh "env | sort"
                    def cpuArch = values.k8sArch == "amd64" ? X86_64_TRIPLE : AARCH64_TRIPLE
                    runLLMBuild(pipeline, cpuArch, false, "imageTest/")
                })
            }
        }
    }]}

    return testJobs
}
2357+
2358+
22672359
pipeline {
22682360
agent {
22692361
kubernetes createKubernetesPodConfig("", "agent")
@@ -2306,7 +2398,10 @@ pipeline {
23062398
when {
23072399
expression {
23082400
// Only run the test list validation when necessary
2309-
env.targetArch == X86_64_TRIPLE && testFilter[ONLY_DOCS_FILE_CHANGED] == false && !(env.JOB_NAME ==~ /.*Multi-GPU.*/)
2401+
env.targetArch == X86_64_TRIPLE &&
2402+
testFilter[ONLY_DOCS_FILE_CHANGED] == false &&
2403+
!(env.JOB_NAME ==~ /.*Multi-GPU.*/) &&
2404+
!(env.JOB_NAME ==~ /.*BuildDockerImageSanityTest.*/)
23102405
}
23112406
}
23122407
steps
@@ -2319,7 +2414,11 @@ pipeline {
23192414
stage("Test") {
23202415
steps {
23212416
script {
2322-
parallelJobs = launchTestJobs(this, testFilter)
2417+
if (env.JOB_NAME ==~ /.*BuildDockerImageSanityTest.*/) {
2418+
parallelJobs = launchTestJobsForImagesSanityCheck(this, globalVars)
2419+
} else {
2420+
parallelJobs = launchTestJobs(this, testFilter)
2421+
}
23232422

23242423
singleGpuJobs = parallelJobs
23252424
dgxJobs = [:]

0 commit comments

Comments
 (0)