jenkins/L0_Test.groovy: 65 changes (46 additions, 19 deletions)
@@ -154,16 +154,11 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
"-e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " +
"-e 's/.*srun: job \\([0-9]\\+\\) queued.*/\\1/p' " +
"-e 's/.*srun: job \\([0-9]\\+\\) has been allocated.*/\\1/p' " +
"${slurmOutputFile} | tail -n1\""
"${slurmOutputFile} | tail -n1 || true\""
),
returnStdout: true
).trim()

-if (!slurmJobID || !slurmJobID.isNumber()) {
-Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile}\""))
-error("Slurm job did not submit successfully. No job ID found.")
-}
-
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")

Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
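
Note on the change above: the sed pipeline scrapes the Slurm job ID out of the submission log, and the added "|| true" keeps the ssh'd command from exiting non-zero when ${slurmOutputFile} does not exist yet, which would otherwise fail Utils.exec during cleanup. A minimal Groovy sketch of the same extraction logic (the regexes mirror the sed patterns in the diff; the closure name is made up):

    // Hypothetical Groovy equivalent of the sed | tail -n1 extraction above.
    // Returns null when no job ID appears, matching the empty-output case
    // that the new "|| true" makes survivable.
    def extractSlurmJobID = { String submissionLog ->
        def patterns = [
            /Submitted batch job (\d+)/,
            /srun: job (\d+) queued/,
            /srun: job (\d+) has been allocated/,
        ]
        def ids = patterns.collectMany { p -> (submissionLog =~ p).collect { it[1] } }
        ids ? ids[-1] : null   // tail -n1: keep the last match
    }
    assert extractSlurmJobID("Submitted batch job 12345") == "12345"
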
@@ -180,10 +175,18 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
pipeline,
script: Utils.sshUserCmd(
remote,
"rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID}"
"\"rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID} || true\""
)
)

+if (!slurmJobID || !slurmJobID.isNumber()) {
+Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile} || true\""))
+echo "Slurm job did not submit successfully. No job ID found."
+} else {
+def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID)
+Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mv ${slurmOutputFile} ${newSlurmOutputFile} || true\""))
+}
+
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID} cleaned up")
}
}
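
Note on the new else-branch: Slurm expands %j in --output file patterns to the numeric job ID, so once the ID is known the cleanup reproduces that expansion with replace("%j", slurmJobID) to locate the log sbatch actually wrote. A small illustration with made-up values:

    // Mirror Slurm's %j expansion by hand so the cleanup can find and
    // rename the real log file (path and ID here are invented).
    def slurmOutputFile = "/home/svc_tensorrt/slurm-logs/job-uid-%j.log"
    def slurmJobID = "1234567"
    assert slurmOutputFile.replace("%j", slurmJobID) ==
        "/home/svc_tensorrt/slurm-logs/job-uid-1234567.log"
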
@@ -198,6 +201,12 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
allowAnyHosts: true,
]

+Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")
+
+CloudManager.destroyNode(nodeName)
+
+Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")
+
Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")

Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
@@ -214,7 +223,7 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
pipeline,
script: Utils.sshUserCmd(
remote,
"rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
"\"rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh || true\""
)
)

@@ -314,7 +323,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
slurmJobID = jobIDs ? jobIDs[-1] : null

if (!slurmJobID || !slurmJobID.isNumber()) {
error("Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}")
echo "Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}"
}
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
Utils.exec(pipeline, script: "echo Sleeping to allow agent initialization; sleep 30")
@@ -361,12 +370,22 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
error "The Slurm node does not come online in the waiting period. Terminating the job."
}
}
+} catch (Exception e) {
+if (e.getMessage()?.contains("Failed to kill container")) {
+echo "Known benign error ignored: ${e.getMessage()}"
+} else {
+throw e // Re-throw anything that is not the known benign error
+}
} finally {
-stage('Clean up SLURM Resources') {
-Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")
-CloudManager.destroyNode(nodeName)
-Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")
-cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
stage("Clean up SLURM Resources") {
// Workaround to handle the interruption during clean up SLURM resources
retry(3) {
try {
cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
} catch (Exception e) {
error "Error during clean up SLURM resources: ${e.getMessage()} and retrying."
}
}
}
}
}
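
Two failure-handling patterns land in this hunk: the catch block treats exactly one known message, "Failed to kill container", as benign by substring match and re-throws everything else, and the cleanup stage is wrapped in retry(3) so an interruption mid-cleanup gets further attempts. A condensed sketch of the benign-filter shape (the helper name is invented, not part of the PR):

    // Invented helper illustrating the catch pattern above: swallow the one
    // known-benign message, re-throw anything else so real failures still
    // fail the stage.
    def ignoringBenignKillError = { Closure body ->
        try {
            body()
        } catch (Exception e) {
            if (e.getMessage()?.contains("Failed to kill container")) {
                echo "Known benign error ignored: ${e.getMessage()}"
            } else {
                throw e
            }
        }
    }
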
@@ -420,7 +439,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
def scriptRunNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
def scriptLaunch = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
slurmOutputFile = "${jobWorkspace}/${jobUID}-slurm_output.log"
slurmOutputFile = SlurmConfig.getOutputFilePath("/home/svc_tensorrt/slurm-logs", jobUID)
def testListPathNode = "${jobWorkspace}/${testList}.txt"
def waivesListPathNode = "${jobWorkspace}/waives.txt"
def isAarch64 = config.contains("aarch64")
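
The hard-coded "${jobWorkspace}/${jobUID}-slurm_output.log" path moves behind SlurmConfig.getOutputFilePath, centralizing where Slurm logs are written. The helper itself is not shown in this diff; a plausible shape, assuming it only joins the log directory with a %j-parameterized file name, would be:

    // Hypothetical sketch only; the real helper lives in SlurmConfig. %j is
    // left literal for Slurm to expand at runtime, which is why
    // cleanUpNodeResourcesMultiNodes later replaces "%j" by hand.
    String getOutputFilePath(String logDir, String jobUID) {
        return "${logDir}/${jobUID}-slurm_output-%j.log"
    }
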
@@ -474,6 +493,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL

def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
+// TODO: check if the tee always returns 0
def scriptContent = """#!/bin/bash
export jobWorkspace=$jobWorkspace
export tarName=$tarName
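
The TODO above points at a real bash pitfall: in "cmd | tee file" the pipeline's exit status is tee's, which is almost always 0, so a failing cmd is silently masked unless the script sets "set -o pipefail". A minimal illustration in the same bash-in-Groovy style as scriptContent (demo only, not code from the PR):

    // Demo of the tee exit-status question raised in the TODO: without
    // pipefail the pipeline exits 0 even though the first command failed.
    def pipefailDemo = """#!/bin/bash
    false | tee /tmp/demo.log; echo "no pipefail: \$?"   # prints 0
    set -o pipefail
    false | tee /tmp/demo.log; echo "pipefail: \$?"      # prints 1
    """
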
@@ -515,8 +535,15 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
} finally {
uploadResults(pipeline, cluster, jobUID, stageName)

-stage('Clean up SLURM Resources') {
-cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
stage("Clean up SLURM Resources") {
// Workaround to handle the interruption during clean up SLURM resources
retry(3) {
try {
cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
} catch (Exception e) {
error "Error during clean up SLURM resources: ${e.getMessage()} and retrying."
}
}
}
}
}
@@ -644,7 +671,7 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
if (stageIsInterrupted) {
echo "Stage is interrupted, skip to upload test result."
} else {
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg; fi'
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'
if (noResultIfSuccess && !stageIsFailed) {
// Clean up the workspace
sh """
@@ -1526,7 +1553,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
stage ("[${stageName}] Run Pytest")
{
echoNodeAndGpuInfo(pipeline, stageName)
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C; fi'
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'

def extraInternalEnv = ""
def pytestTestTimeout = "3600"