Skip to content

Commit 64ba483

Browse files
authored
infra: [TRTLLM-6499] Split L0_Test into two pipelines, single-GPU and multi-GPU (for SBSA) (#6132)
Signed-off-by: ZhanruiSunCh <[email protected]>
1 parent ee3cbb0 commit 64ba483

File tree

2 files changed: 63 additions and 10 deletions

jenkins/L0_MergeRequest.groovy

Lines changed: 61 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -950,21 +950,21 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
950950
}
951951
}
952952

953-
def requireMultiGpuTesting = currentBuild.description?.contains("Require Multi-GPU Testing") ?: false
953+
def requireMultiGpuTesting = currentBuild.description?.contains("Require x86_64 Multi-GPU Testing") ?: false
954954
echo "requireMultiGpuTesting: ${requireMultiGpuTesting}"
955955
if (!requireMultiGpuTesting) {
956956
if (singleGpuTestFailed) {
957-
error "Single-GPU test failed"
957+
error "x86_64 single-GPU test failed"
958958
}
959959
return
960960
}
961961

962962
if (singleGpuTestFailed) {
963963
if (env.JOB_NAME ==~ /.*PostMerge.*/) {
964-
echo "In the official post-merge pipeline, single-GPU test failed, whereas multi-GPU test is still kept running."
964+
echo "In the official post-merge pipeline, x86_64 single-GPU test failed, whereas multi-GPU test is still kept running."
965965
} else {
966966
stage("[Test-x86_64-Multi-GPU] Blocked") {
967-
error "This pipeline requires running multi-GPU test, but single-GPU test has failed."
967+
error "This pipeline requires running multi-GPU test, but x86_64 single-GPU test has failed."
968968
}
969969
return
970970
}
@@ -1007,10 +1007,8 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
10071007
script {
10081008
def jenkinsUrl = ""
10091009
def credentials = ""
1010-
def testStageName = "[Test-SBSA] Run"
1011-
if (env.localJobCredentials) {
1012-
testStageName = "[Test-SBSA] Remote Run"
1013-
}
1010+
def testStageName = "[Test-SBSA-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
1011+
def singleGpuTestFailed = false
10141012

10151013
if (testFilter[(ONLY_ONE_GROUP_CHANGED)] == "Docs") {
10161014
echo "SBSA build job is skipped due to Jenkins configuration or conditional pipeline run"
@@ -1023,6 +1021,60 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
10231021
]
10241022
launchJob("/LLM/helpers/Build-SBSA", reuseBuild, enableFailFast, globalVars, "SBSA", additionalParameters)
10251023
}
1024+
stage(testStageName) {
1025+
if (SBSA_TEST_CHOICE == STAGE_CHOICE_SKIP) {
1026+
echo "SBSA test job is skipped due to Jenkins configuration"
1027+
return
1028+
}
1029+
try {
1030+
String testFilterJson = writeJSON returnText: true, json: testFilter
1031+
def additionalParameters = [
1032+
'testFilter': testFilterJson,
1033+
"dockerImage": globalVars["LLM_SBSA_DOCKER_IMAGE"],
1034+
]
1035+
1036+
launchJob("L0_Test-SBSA-Single-GPU", false, enableFailFast, globalVars, "SBSA", additionalParameters)
1037+
} catch (InterruptedException e) {
1038+
throw e
1039+
} catch (Exception e) {
1040+
if (SBSA_TEST_CHOICE == STAGE_CHOICE_IGNORE) {
1041+
catchError(
1042+
buildResult: 'SUCCESS',
1043+
stageResult: 'FAILURE') {
1044+
error "SBSA test failed but ignored due to Jenkins configuration"
1045+
}
1046+
} else {
1047+
catchError(
1048+
buildResult: 'FAILURE',
1049+
stageResult: 'FAILURE') {
1050+
error "SBSA single-GPU test failed"
1051+
}
1052+
singleGpuTestFailed = true
1053+
}
1054+
}
1055+
}
1056+
1057+
def requireMultiGpuTesting = currentBuild.description?.contains("Require SBSA Multi-GPU Testing") ?: false
1058+
echo "requireMultiGpuTesting: ${requireMultiGpuTesting}"
1059+
if (!requireMultiGpuTesting) {
1060+
if (singleGpuTestFailed) {
1061+
error "SBSA single-GPU test failed"
1062+
}
1063+
return
1064+
}
1065+
1066+
if (singleGpuTestFailed) {
1067+
if (env.JOB_NAME ==~ /.*PostMerge.*/) {
1068+
echo "In the official post-merge pipeline, SBSA single-GPU test failed, whereas multi-GPU test is still kept running."
1069+
} else {
1070+
stage("[Test-SBSA-Multi-GPU] Blocked") {
1071+
error "This pipeline requires running SBSA multi-GPU test, but SBSA single-GPU test has failed."
1072+
}
1073+
return
1074+
}
1075+
}
1076+
1077+
testStageName = "[Test-SBSA-Multi-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
10261078
stage(testStageName) {
10271079
if (SBSA_TEST_CHOICE == STAGE_CHOICE_SKIP) {
10281080
echo "SBSA test job is skipped due to Jenkins configuration"
@@ -1035,7 +1087,7 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
10351087
"dockerImage": globalVars["LLM_SBSA_DOCKER_IMAGE"],
10361088
]
10371089

1038-
launchJob("L0_Test-SBSA", false, enableFailFast, globalVars, "SBSA", additionalParameters)
1090+
launchJob("L0_Test-SBSA-Multi-GPU", false, enableFailFast, globalVars, "SBSA", additionalParameters)
10391091

10401092
} catch (InterruptedException e) {
10411093
throw e

jenkins/L0_Test.groovy

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2446,7 +2446,8 @@ pipeline {
24462446
// We add a special marker to the parent job's description.
24472447
// This will be used to decide whether to run multi-GPU test stage.
24482448
def parentJob = globalVars[ACTION_INFO]['parents'][-2]
2449-
trtllm_utils.appendBuildDescription(this, parentJob['name'], parentJob['build_number'], "====Require Multi-GPU Testing====<br/>")
2449+
def archStr = (env.targetArch == X86_64_TRIPLE) ? "x86_64" : (env.targetArch == AARCH64_TRIPLE ? "SBSA" : "Unknown")
2450+
trtllm_utils.appendBuildDescription(this, parentJob['name'], parentJob['build_number'], "====Require ${archStr} Multi-GPU Testing====<br/>")
24502451
} else {
24512452
echo "No parent job found to add the special marker for executing multi-GPU test stage."
24522453
}

0 commit comments

Comments
 (0)