From 5982bba7e37e4167e74649cd3237756993355f7e Mon Sep 17 00:00:00 2001 From: Yanchao Lu Date: Sun, 7 Sep 2025 22:26:52 +0800 Subject: [PATCH] [None][ci] Block some nodes to avoid unstable network access Signed-off-by: Yanchao Lu --- jenkins/BuildDockerImage.groovy | 20 ++++++++++++++++++++ jenkins/L0_Test.groovy | 25 ++++++++++++++++++------- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/jenkins/BuildDockerImage.groovy b/jenkins/BuildDockerImage.groovy index c0e718882bd..180fcfb5f5a 100644 --- a/jenkins/BuildDockerImage.groovy +++ b/jenkins/BuildDockerImage.groovy @@ -94,6 +94,26 @@ def createKubernetesPodConfig(type, arch = "amd64", build_wheel = false) """ } + if (arch == "amd64") { + // For x86_64, we block some nodes to avoid unstable network access. + selectors += """ + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: "kubernetes.io/hostname" + operator: NotIn + values: + - "sc-ipp-blossom-prod-k8w-105" + - "sc-ipp-blossom-prod-k8w-114" + - "sc-ipp-blossom-prod-k8w-115" + - "sc-ipp-blossom-prod-k8w-121" + - "sc-ipp-blossom-prod-k8w-123" + - "sc-ipp-blossom-prod-k8w-124" + """ + } + def archSuffix = arch == "arm64" ? "arm" : "amd" def jnlpImage = "urm.nvidia.com/sw-ipp-blossom-sre-docker-local/lambda/custom_jnlp_images_${archSuffix}_linux:jdk17" diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 65c06a888d9..572bd670ae3 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -349,13 +349,24 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p } stage('Checking if the Node is Online') { - def counter = 0 - // We submit the Slurm job with 5 hours timeout, and the K8S pod will be evicted after 22 hours. - // Let's use 15 hours to check if the node is online, and with 2 hours buffer. - while (!CloudManager.isNodeOnline(nodeName) && counter < 90) { - // Wait 10 minutes to check status of the node again - sleep(time: 10, unit: 'MINUTES') - counter++ + withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) { + def remote = [ + ip : cluster.ip, + host : cluster.host, + user : "${pipeline.USERNAME}", + passwd : "${pipeline.PASSWORD}", + allowAnyHosts: true, + ] + def counter = 0 + // We submit the Slurm job with 5 hours timeout, and the K8S pod will be evicted after 22 hours. + // Let's use 15 hours to check if the node is online, and with 2 hours buffer. + while (!CloudManager.isNodeOnline(nodeName) && counter < 90) { + // Wait 10 minutes to check status of the node again + sleep(time: 10, unit: 'MINUTES') + // Avoid the node being stuck in the held state. + Utils.exec(pipeline, Utils.sshUserCmd(remote, "\"scontrol release ${slurmJobID} || true\"")) + counter++ + } } if (CloudManager.isNodeOnline(nodeName)) {