112 changes: 61 additions & 51 deletions jenkins/L0_Test.groovy
@@ -154,6 +154,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
"-e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " +
"-e 's/.*srun: job \\([0-9]\\+\\) queued.*/\\1/p' " +
"-e 's/.*srun: job \\([0-9]\\+\\) has been allocated.*/\\1/p' " +
"-e 's/.*SLURM_JOB_ID=\\([0-9]\\+\\).*/\\1/p' " +
"-e 's/.*SLURM_JOBID=\\([0-9]\\+\\).*/\\1/p' " +
"${slurmOutputFile} | tail -n1 || true\""
),
returnStdout: true
@@ -183,6 +185,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile} || true\""))
echo "Slurm job did not submit successfully. No job ID found."
} else {
// The original Slurm output file name looks like "slurm-%j-*.out"; replace the "%j" placeholder with the real job ID.
def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID)
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mv ${slurmOutputFile} ${newSlurmOutputFile} || true\""))
}
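As a quick illustration of the rename above, with a made-up path and job ID, the replacement behaves like this:

// Hypothetical values, for illustration only.
def slurmOutputFile = "/workspace/slurm-%j-agent01.out"
def slurmJobID = "123456"
assert slurmOutputFile.replace("%j", slurmJobID) == "/workspace/slurm-123456-agent01.out"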
Expand Down Expand Up @@ -317,6 +320,10 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
if (m1) ids << m1[0][1] // Extract the first captured group
def m2 = (line =~ /srun: job (\d+) (queued|has been allocated)/)
if (m2) ids << m2[0][1] // Extract the first captured group
def m3 = (line =~ /SLURM_JOB_ID=(\d+)/)
if (m3) ids << m3[0][1] // Extract the first captured group
def m4 = (line =~ /SLURM_JOBID=(\d+)/)
if (m4) ids << m4[0][1] // Extract the first captured group
return ids
}
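The lines above are only a fragment of the job-ID extraction helper; a self-contained sketch of the same logic, with an illustrative function name and sample input, could look like this:

// Minimal sketch; the function name and the sample lines are illustrative, not taken from this pipeline.
def extractSlurmJobIds(String slurmLog) {
    def ids = []
    slurmLog.readLines().each { line ->
        def m1 = (line =~ /Submitted batch job (\d+)/)
        if (m1) ids << m1[0][1]                      // sbatch submission message
        def m2 = (line =~ /srun: job (\d+) (queued|has been allocated)/)
        if (m2) ids << m2[0][1]                      // srun queue/allocation messages
        def m3 = (line =~ /SLURM_JOB_ID=(\d+)/)
        if (m3) ids << m3[0][1]                      // environment dump, long form
        def m4 = (line =~ /SLURM_JOBID=(\d+)/)
        if (m4) ids << m4[0][1]                      // environment dump, short form
    }
    return ids
}
// e.g. extractSlurmJobIds("srun: job 123456 queued and waiting\nSLURM_JOB_ID=123456") returns ["123456", "123456"]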

@@ -341,16 +348,37 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
}

if (CloudManager.isNodeOnline(nodeName)) {
def dockerGPUOption = ""

node(nodeName) {
sh """
env | sort
pwd && ls -alh
ls -alh ${env.WORKSPACE}
ls -alh ${env.WORKSPACE_TMP}
"""

sh "nproc && free -g && hostname"
echoNodeAndGpuInfo(pipeline, stageName)
sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
// Use single quotes so Groovy does not interpolate; the shell expands these variables at run time
sh 'echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"'
sh 'echo "NV_GPU: $NV_GPU"'

// Dynamically set GPU arguments based on environment variables
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
dockerGPUOption = sh(script: """
if [ -n "\$NV_GPU" ]; then
echo "--gpus '\\"device=\$NV_GPU\\"'"
elif [ -n "\$CUDA_VISIBLE_DEVICES" ]; then
echo "--gpus '\\"device=\$CUDA_VISIBLE_DEVICES\\"'"
else
echo "--gpus ${gpuCount}"
fi
""", returnStdout: true).trim()
}
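// Hedged illustration of the three possible outcomes above (device indices are hypothetical):
//   NV_GPU="0,1"              -> --gpus '"device=0,1"'
//   CUDA_VISIBLE_DEVICES="2"  -> --gpus '"device=2"'
//   neither variable set      -> --gpus ${gpuCount}, e.g. --gpus 4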

def dockerArgs = "--gpus ${gpuCount} " +
def dockerArgs = "${dockerGPUOption} " +
"--cap-add=SYS_ADMIN " +
"--ipc=host " +
"--security-opt seccomp=unconfined " +
@@ -360,6 +388,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
"-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
"--cap-add syslog"

echo "Final dockerArgs: ${dockerArgs}"

if (partition.clusterName == "dlcluster") {
dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
}
@@ -370,12 +400,6 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
error "The Slurm node does not come online in the waiting period. Terminating the job."
}
}
} catch (Exception e) {
if (e.getMessage()?.contains("Failed to kill container")) {
echo "Known benign error ignored: ${e.getMessage()}"
} else {
throw e // Re-throw if it's a different IOException
}
} finally {
stage("Clean up SLURM Resources") {
// Workaround to handle interruptions while cleaning up SLURM resources
@@ -939,7 +963,14 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod

def echoNodeAndGpuInfo(pipeline, stageName)
{
String hostNodeName = sh(script: 'echo $HOST_NODE_NAME', returnStdout: true)
String hostNodeName = sh(script: '''
if [ -n "$HOST_NODE_NAME" ]; then
echo "$HOST_NODE_NAME"
else
hostname -f || hostname
fi
''', returnStdout: true).trim()

String gpuUuids = pipeline.sh(script: "nvidia-smi -q | grep \"GPU UUID\" | awk '{print \$4}' | tr '\n' ',' || true", returnStdout: true)
pipeline.echo "HOST_NODE_NAME = ${hostNodeName} ; GPU_UUIDS = ${gpuUuids} ; STAGE_NAME = ${stageName}"
}
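For reference, a hedged example of the single log line this helper prints; the host name and GPU UUIDs are made up, and the stage name is taken from elsewhere in this diff:

// HOST_NODE_NAME = gpu-node-01.example.com ; GPU_UUIDS = GPU-1a2b3c4d...,GPU-5e6f7a8b..., ; STAGE_NAME = GB200-PyTorch-1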
@@ -1013,7 +1044,7 @@ def launchTestListCheck(pipeline)
trtllm_utils.llmExecStepWithRetry(pipeline, script: """apt-get update && apt-get install \
libffi-dev \
-y""")
sh "nvidia-smi -q"
sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
// download TRT-LLM tarfile
def tarName = BUILD_CONFIGS[VANILLA_CONFIG][TARNAME]
def llmTarfile = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/${tarName}"
@@ -1421,8 +1452,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
sh "nproc && free -g && hostname"
echoNodeAndGpuInfo(pipeline, stageName)
sh "cat ${MODEL_CACHE_DIR}/README"
sh "nvidia-smi -q"
sh "nvidia-smi topo -m"
sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"
sh "df -h"

// setup HF_HOME to cache model and datasets
@@ -1798,7 +1828,7 @@ def runPackageSanityCheck(pipeline, wheel_path, reinstall_dependencies=false, cp
sh "nproc && free -g && hostname"
sh "bash -c 'pip3 show tensorrt || true'"
sh "cat ${MODEL_CACHE_DIR}/README"
sh "nvidia-smi -q"
sh "nvidia-smi && nvidia-smi -q && nvidia-smi topo -m"

sh "pwd && ls -alh"
trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget -nv ${whlUrl}")
@@ -1849,33 +1879,26 @@ def checkStageName(stageNames) {
}
}

// TODO: Update existing functions to use runInDockerOnNodeMultiStage and get rid of runInDockerOnNode
def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
{
return {
runner -> node(label) {
if (needToDeleteDir) {
deleteDir()
}
stage('Pull Docker Image') {
docker.image(image).pull()
}
docker.image(image).inside(dockerArgs) {
runner()
}
}
}
}

def runInDockerOnNode(image, label, dockerArgs)
{
return {
stageName, runner -> stage(stageName) {
node(label) {
deleteDir()
try {
if (needToDeleteDir) {
deleteDir()
}
stage('Pull Docker Image') {
docker.image(image).pull()
}
docker.image(image).inside(dockerArgs) {
runner()
}
} catch (Exception e) {
if (e.getMessage()?.contains("Failed to kill container")) {
echo "Known benign error ignored: ${e.getMessage()}"
} else {
throw e // Re-throw any other exception
}
}
}
}
@@ -1893,10 +1916,8 @@ def runInKubernetes(pipeline, podSpec, containerName)
}
}

def launchTestJobs(pipeline, testFilter, dockerNode=null)
def launchTestJobs(pipeline, testFilter)
{
def dockerArgs = "-v /mnt/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"

// IMPORTANT: Stage Configuration Syntax Requirement
//
// The test_to_stage_mapping.py script expects stage definitions in the following format:
@@ -2044,8 +2065,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
fullSet += SBSATestConfigs.keySet()

SBSASlurmTestConfigs = [
// Disable GB200-PyTorch-1 due to OOM (https://nvbugspro.nvidia.com/bug/5490507)
//"GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 1],
"GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 1],
"GB200-4_GPUs-PyTorch-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
]
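// How to read these entries is an assumption inferred from the stage names above, not stated in this diff:
// [Slurm platform/queue label, test list name, shard index, shard count, optional GPU count].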
Expand Down Expand Up @@ -2199,12 +2219,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
def buildRunner = runInKubernetes(pipeline, buildSpec, "trt-llm")
def sanityRunner = null

if (dockerNode) {
sanityRunner = runInDockerOnNode(values[0], dockerNode, dockerArgs)
} else {
def sanitySpec = createKubernetesPodConfig(values[0], gpu_type, k8s_arch)
sanityRunner = runInKubernetes(pipeline, sanitySpec, "trt-llm")
}

def sanitySpec = createKubernetesPodConfig(values[0], gpu_type, k8s_arch)
sanityRunner = runInKubernetes(pipeline, sanitySpec, "trt-llm")

def wheelPath = "${values[4]}"
def wheelName = ""
Expand Down Expand Up @@ -2448,17 +2465,10 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
stage("Skip - reused") {
echo "Skip - Passed in the last pipeline."
}
} else if (values instanceof List && dockerNode == null) {
} else if (values instanceof List) {
trtllm_utils.launchKubernetesPod(pipeline, values[0], "trt-llm", {
values[1]()
})
} else if (values instanceof List && dockerNode != null) {
node(dockerNode) {
deleteDir()
docker.image(LLM_DOCKER_IMAGE).inside(dockerArgs) {
values[1]()
}
}
} else {
values()
}
2 changes: 1 addition & 1 deletion jenkins/scripts/slurm_run.sh
@@ -22,7 +22,7 @@ if [ $SLURM_LOCALID -eq 0 ]; then
which python3
python3 --version
apt-get install -y libffi-dev
nvidia-smi
nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
cd $llmSrcNode && pip3 install --retries 1 -r requirements-dev.txt
cd $resourcePathNode && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
git config --global --add safe.directory "*"