jenkins/L0_Test.groovy: 65 changes (46 additions, 19 deletions)
@@ -154,16 +154,11 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
"-e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " +
"-e 's/.*srun: job \\([0-9]\\+\\) queued.*/\\1/p' " +
"-e 's/.*srun: job \\([0-9]\\+\\) has been allocated.*/\\1/p' " +
"${slurmOutputFile} | tail -n1\""
"${slurmOutputFile} | tail -n1 || true\""
),
returnStdout: true
).trim()

-if (!slurmJobID || !slurmJobID.isNumber()) {
-Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile}\""))
-error("Slurm job did not submit successfully. No job ID found.")
-}
-
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")

Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
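
Note on the change above: the sed pipeline scrapes the Slurm job ID out of the submission log, and the added "|| true" keeps the ssh'd command from exiting non-zero when ${slurmOutputFile} does not exist yet, which would otherwise fail Utils.exec during cleanup. A minimal Groovy sketch of the same extraction logic (the regexes mirror the sed patterns in the diff; the closure name is made up):

    // Hypothetical Groovy equivalent of the sed | tail -n1 extraction above.
    // Returns null when no job ID appears, matching the empty-output case
    // that the new "|| true" makes survivable.
    def extractSlurmJobID = { String submissionLog ->
        def patterns = [
            /Submitted batch job (\d+)/,
            /srun: job (\d+) queued/,
            /srun: job (\d+) has been allocated/,
        ]
        def ids = patterns.collectMany { p -> (submissionLog =~ p).collect { it[1] } }
        ids ? ids[-1] : null   // tail -n1: keep the last match
    }
    assert extractSlurmJobID("Submitted batch job 12345") == "12345"
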
@@ -180,10 +175,18 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
pipeline,
script: Utils.sshUserCmd(
remote,
"rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID}"
"\"rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID} || true\""
)
)

+if (!slurmJobID || !slurmJobID.isNumber()) {
+Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile} || true\""))
+echo "Slurm job did not submit successfully. No job ID found."
+} else {
+def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID)
+Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mv ${slurmOutputFile} ${newSlurmOutputFile} || true\""))
+}
+
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID} cleaned up")
}
}
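
Note on the new else-branch: Slurm expands %j in --output file patterns to the numeric job ID, so once the ID is known the cleanup reproduces that expansion with replace("%j", slurmJobID) to locate the log sbatch actually wrote. A small illustration with made-up values:

    // Mirror Slurm's %j expansion by hand so the cleanup can find and
    // rename the real log file (path and ID here are invented).
    def slurmOutputFile = "/home/svc_tensorrt/slurm-logs/job-uid-%j.log"
    def slurmJobID = "1234567"
    assert slurmOutputFile.replace("%j", slurmJobID) ==
        "/home/svc_tensorrt/slurm-logs/job-uid-1234567.log"
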
@@ -198,6 +201,12 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
allowAnyHosts: true,
]

+Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")
+
+CloudManager.destroyNode(nodeName)
+
+Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")
+
Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")

Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
@@ -214,7 +223,7 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
pipeline,
script: Utils.sshUserCmd(
remote,
"rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
"\"rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh || true\""
)
)

@@ -314,7 +323,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
slurmJobID = jobIDs ? jobIDs[-1] : null

if (!slurmJobID || !slurmJobID.isNumber()) {
error("Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}")
echo "Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}"
}
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
Utils.exec(pipeline, script: "echo Sleeping to allow agent initialization; sleep 30")
@@ -361,12 +370,22 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
error "The Slurm node does not come online in the waiting period. Terminating the job."
}
}
+} catch (Exception e) {
+if (e.getMessage()?.contains("Failed to kill container")) {
+echo "Known benign error ignored: ${e.getMessage()}"
+} else {
+throw e // Re-throw anything that is not the known benign error
+}
} finally {
-stage('Clean up SLURM Resources') {
-Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")
-CloudManager.destroyNode(nodeName)
-Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")
-cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
stage("Clean up SLURM Resources") {
// Workaround to handle the interruption during clean up SLURM resources
retry(3) {
try {
cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
} catch (Exception e) {
error "Error during clean up SLURM resources: ${e.getMessage()} and retrying."
}
}
}
}
}
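
Two failure-handling patterns land in this hunk: the catch block treats exactly one known message, "Failed to kill container", as benign by substring match and re-throws everything else, and the cleanup stage is wrapped in retry(3) so an interruption mid-cleanup gets further attempts. A condensed sketch of the benign-filter shape (the helper name is invented, not part of the PR):

    // Invented helper illustrating the catch pattern above: swallow the one
    // known-benign message, re-throw anything else so real failures still
    // fail the stage.
    def ignoringBenignKillError = { Closure body ->
        try {
            body()
        } catch (Exception e) {
            if (e.getMessage()?.contains("Failed to kill container")) {
                echo "Known benign error ignored: ${e.getMessage()}"
            } else {
                throw e
            }
        }
    }
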
@@ -420,7 +439,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
def scriptRunNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
def scriptLaunch = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
slurmOutputFile = "${jobWorkspace}/${jobUID}-slurm_output.log"
slurmOutputFile = SlurmConfig.getOutputFilePath("/home/svc_tensorrt/slurm-logs", jobUID)
def testListPathNode = "${jobWorkspace}/${testList}.txt"
def waivesListPathNode = "${jobWorkspace}/waives.txt"
def isAarch64 = config.contains("aarch64")
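
The hard-coded "${jobWorkspace}/${jobUID}-slurm_output.log" path moves behind SlurmConfig.getOutputFilePath, centralizing where Slurm logs are written. The helper itself is not shown in this diff; a plausible shape, assuming it only joins the log directory with a %j-parameterized file name, would be:

    // Hypothetical sketch only; the real helper lives in SlurmConfig. %j is
    // left literal for Slurm to expand at runtime, which is why
    // cleanUpNodeResourcesMultiNodes later replaces "%j" by hand.
    String getOutputFilePath(String logDir, String jobUID) {
        return "${logDir}/${jobUID}-slurm_output-%j.log"
    }
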
@@ -474,6 +493,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL

def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
+// TODO: check if the tee always returns 0
def scriptContent = """#!/bin/bash
export jobWorkspace=$jobWorkspace
export tarName=$tarName
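
The TODO above points at a real bash pitfall: in "cmd | tee file" the pipeline's exit status is tee's, which is almost always 0, so a failing cmd is silently masked unless the script sets "set -o pipefail". A minimal illustration in the same bash-in-Groovy style as scriptContent (demo only, not code from the PR):

    // Demo of the tee exit-status question raised in the TODO: without
    // pipefail the pipeline exits 0 even though the first command failed.
    def pipefailDemo = """#!/bin/bash
    false | tee /tmp/demo.log; echo "no pipefail: \$?"   # prints 0
    set -o pipefail
    false | tee /tmp/demo.log; echo "pipefail: \$?"      # prints 1
    """
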
@@ -515,8 +535,15 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
} finally {
uploadResults(pipeline, cluster, jobUID, stageName)

-stage('Clean up SLURM Resources') {
-cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
stage("Clean up SLURM Resources") {
// Workaround to handle the interruption during clean up SLURM resources
retry(3) {
try {
cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
} catch (Exception e) {
error "Error during clean up SLURM resources: ${e.getMessage()} and retrying."
}
}
}
}
}
@@ -644,7 +671,7 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
if (stageIsInterrupted) {
echo "Stage is interrupted, skip to upload test result."
} else {
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg; fi'
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'
if (noResultIfSuccess && !stageIsFailed) {
// Clean up the workspace
sh """
@@ -1526,7 +1553,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
stage ("[${stageName}] Run Pytest")
{
echoNodeAndGpuInfo(pipeline, stageName)
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C; fi'
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'

def extraInternalEnv = ""
def pytestTestTimeout = "3600"