69 changes: 51 additions & 18 deletions jenkins/L0_Test.groovy
@@ -7,7 +7,6 @@ import groovy.json.JsonOutput
import com.nvidia.bloom.KubernetesManager
import com.nvidia.bloom.Constants
import com.nvidia.bloom.CloudManager
import com.nvidia.bloom.KubernetesManager
import com.nvidia.bloom.SlurmConfig
import com.nvidia.bloom.SlurmCluster
import com.nvidia.bloom.SlurmPartition
@@ -219,8 +218,11 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]

def nodeName = "${cluster.host}-test-${UUID.randomUUID().toString()}"
def nodeSecret = CloudManager.createNode(nodeName)
// Create a unique suffix for the node name and workspace
String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
def nodeName = "${cluster.host}-test-${customSuffix}"
def customWorkspace = "/tmp/${nodeName}"
def nodeSecret = CloudManager.createNode(nodeName, customWorkspace)

try {
// Run ssh command to start node in desired cluster via SLURM
@@ -263,12 +265,30 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
}

if (CloudManager.isNodeOnline(nodeName)) {
def dockerArgs = "--gpus ${gpuCount} --cap-add=SYS_ADMIN --ipc=host --security-opt seccomp=unconfined -u root:root -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
node(nodeName) {
sh """
env | sort
pwd && ls -alh
ls -alh ${env.WORKSPACE}
ls -alh ${env.WORKSPACE_TMP}
"""
}

def dockerArgs = "--gpus ${gpuCount} " +
"--cap-add=SYS_ADMIN " +
"--ipc=host " +
"--security-opt seccomp=unconfined " +
"-u root:root " +
"-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
"-v /tmp/ccache:${CCACHE_DIR}:rw " +
"-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
"--cap-add syslog"

if (partition.clusterName == "dlcluster") {
dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
}
slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, false)

slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
} else {
echo "The node does not come online in 2 hours, terminating the job"
@@ -560,6 +580,13 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
"${UPLOAD_PATH}/test-results/"
)
junit(testResults: "${stageName}/results*.xml")

// Clean up the workspace
sh """
env | sort
pwd && ls -alh
rm -rf ./*
"""
}
}
}
@@ -796,7 +823,7 @@ def echoNodeAndGpuInfo(pipeline, stageName)

def runLLMDocBuild(pipeline, config)
{
// Step 1: cloning tekit source code
// Step 1: cloning source code
sh "pwd && ls -alh"
sh "env | sort"
// allow to checkout from forked repo, svc_tensorrt needs to have access to the repo, otherwise clone will fail
@@ -1241,13 +1268,16 @@ def rerunFailedTests(stageName, llmSrc, testCmdLine) {

def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312")
{
// Step 1: create LLM_ROOT dir
sh "pwd && ls -alh"
// TODO: proper way to clean workspace, maybe save in a folder named with BUILD_ID.
// So that it can work with multiple job running in same node
sh "rm -rf ./*"
// Step 1: create LLM_ROOT dir and clean up the workspace
def llmRootConfig = "${LLM_ROOT}${config}"
sh "mkdir ${llmRootConfig}"
sh """
env | sort
pwd && ls -alh
rm -rf ./*
mkdir ${llmRootConfig}
ls -alh ${env.WORKSPACE}
ls -alh ${env.WORKSPACE_TMP}
"""

def llmPath = sh (script: "realpath ${llmRootConfig}", returnStdout: true).trim()
def llmSrc = "${llmPath}/TensorRT-LLM/src"
@@ -1765,7 +1795,6 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
"DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 2, 4],
"DGX_H100-4_GPUs-PyTorch-DeepSeek-2": ["dgx-h100-x4", "l0_dgx_h100", 2, 2, 4],
"DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-Triton-Post-Merge-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
"A10-PyTorch-1": ["a10", "l0_a10", 1, 1],
"A10-CPP-1": ["a10", "l0_a10", 1, 1],
@@ -1838,6 +1867,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
"B200_PCIe-TensorRT-Post-Merge-2": ["b100-ts2", "l0_b200", 2, 2],
"H100_PCIe-TensorRT-Perf-1": ["h100-cr", "l0_perf", 1, 1],
"H100_PCIe-PyTorch-Perf-1": ["h100-cr", "l0_perf", 1, 1],
"DGX_H200-4_GPUs-Triton-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 1, 4],
"DGX_H200-8_GPUs-PyTorch-Post-Merge-1": ["dgx-h200-x8", "l0_dgx_h200", 1, 1, 8],
"DGX_H200-4_GPUs-PyTorch-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 1, 4],
"DGX_H200-4_GPUs-TensorRT-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 3, 4],
@@ -1890,8 +1920,10 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
fullSet += SBSATestConfigs.keySet()

SBSASlurmTestConfigs = [
"GB200-4_GPUs-PyTorch-1": ["gb200-x4", "l0_gb200", 1, 1, 4],
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4", "l0_gb200", 1, 1, 4],
// Not used in the pipeline now
// "GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 3],
"GB200-4_GPUs-PyTorch-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
]
fullSet += SBSASlurmTestConfigs.keySet()

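Each entry in these test-config maps follows the same positional layout used by the other maps in launchTestJobs. A minimal sketch of how such an entry could be unpacked; the variable names are assumptions for illustration only:

```groovy
// Assumed positional layout of a test-config entry, inferred from the entries above:
// [nodeLabel, testDbList, splitId, splits, gpuCount (optional), nodeCount (optional)]
def entry = ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4]

def (nodeLabel, testDbList, splitId, splits) = entry
def gpuCount  = entry.size() > 4 ? entry[4] : 1
def nodeCount = entry.size() > 5 ? entry[5] : 1

println "Split ${splitId}/${splits} of ${testDbList} on ${nodeLabel}: ${gpuCount} GPU(s), ${nodeCount} node(s)"
```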
@@ -1903,7 +1935,6 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 7, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-5": ["gb200-multi-node", "l0_gb200_multi_nodes", 5, 7, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-6": ["gb200-multi-node", "l0_gb200_multi_nodes", 6, 7, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-7": ["gb200-multi-node", "l0_gb200_multi_nodes", 7, 7, 8, 2],
]
fullSet += multiNodesSBSAConfigs.keySet()

@@ -2123,7 +2154,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
echo "###### Check pip install Start ######"
withEnv(libEnv) {
sh "env | sort"
checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
timeout(time: 1, unit: 'HOURS') {
checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
}
}
echo "###### Run LLMAPI tests Start ######"
def config = VANILLA_CONFIG
@@ -2458,7 +2491,7 @@ pipeline {

def testPhase2StageName = env.testPhase2StageName
if (testPhase2StageName) {
def dgxSigns = ["DGX_H100", "DGX_H200", "GB200", "DGX_B200", "RTXPro6000-4_GPUs"]
def dgxSigns = ["2_GPUs", "4_GPUs", "8_GPUs"]
singleGpuJobs = parallelJobs.findAll{!dgxSigns.any{sign -> it.key.contains(sign)}}
dgxJobs = parallelJobs.findAll{dgxSigns.any{sign -> it.key.contains(sign)}}
}
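Since dgxSigns now matches on GPU-count markers instead of platform names, any stage whose name advertises multiple GPUs is routed to the second phase, regardless of hardware family. A minimal sketch of how the split behaves; the sample stage names and closures are assumptions for illustration:

```groovy
// Illustration only: partition a stage-name -> closure map into single-GPU and multi-GPU jobs.
def dgxSigns = ["2_GPUs", "4_GPUs", "8_GPUs"]
def parallelJobs = [
    "A10-PyTorch-1"                       : { println "single-GPU stage" },
    "DGX_H200-4_GPUs-Triton-Post-Merge-1" : { println "multi-GPU stage" },
]

def singleGpuJobs = parallelJobs.findAll { !dgxSigns.any { sign -> it.key.contains(sign) } }
def dgxJobs       = parallelJobs.findAll { dgxSigns.any { sign -> it.key.contains(sign) } }

assert singleGpuJobs.keySet() == ["A10-PyTorch-1"] as Set
assert dgxJobs.keySet()       == ["DGX_H200-4_GPUs-Triton-Post-Merge-1"] as Set
```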
3 changes: 2 additions & 1 deletion jenkins/scripts/slurm_run.sh
@@ -34,7 +34,7 @@ else
done
fi
testList="$testList_$splitId"
export CPP_TEST_TIMEOUT_OVERRIDDEN=7200
export CPP_TEST_TIMEOUT_OVERRIDDEN=$pytestTestTimeout
export LLM_ROOT=$llmSrcNode
export LLM_MODELS_ROOT=$MODEL_CACHE_DIR
export UCX_TLS=^gdr_copy
@@ -43,6 +43,7 @@ testCmdLines=(
"$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
"pytest"
"-v"
"--timeout-method=thread"
"--timeout=$pytestTestTimeout"
"--test-list=$testListPathNode"
"--waives-file=$waivesListPathNode"
15 changes: 0 additions & 15 deletions tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -209,18 +209,3 @@ l0_dgx_h100:
- cpp/test_multi_gpu.py::TestDisagg::test_spawn_orchestrator[llama-ucx_kvcache-90]
- cpp/test_multi_gpu.py::TestDisagg::test_orchestrator_params[llama-nixl_kvcache-90] TIMEOUT (90)
- cpp/test_multi_gpu.py::TestDisagg::test_spawn_orchestrator[llama-nixl_kvcache-90]
- condition:
ranges:
system_gpu_count:
gte: 4
lte: 4
wildcards:
gpu:
- '*h100*'
linux_distribution_name: ubuntu*
terms:
stage: post_merge
backend: triton
auto_trigger: others
tests:
- triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm]
16 changes: 16 additions & 0 deletions tests/integration/test_lists/test-db/l0_dgx_h200.yml
@@ -166,3 +166,19 @@ l0_dgx_h200:
- examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float16-enable_gemm_plugin-enable_attention_plugin-disable_paged_kv_cache-tp:2-pp:2-nb:1-disable_fp8]
- examples/test_gpt.py::test_llm_gpt2_next_prompt_tuning[use_py_session-tp2]
- unittest/llmapi/apps/_test_openai_multi_gpu.py -m "part0"
- condition:
ranges:
system_gpu_count:
gte: 4
lte: 4
wildcards:
gpu:
- '*h200*'
linux_distribution_name: ubuntu*
cpu: x86_64
terms:
stage: post_merge
backend: triton
tests:
# ------------- Triton tests ---------------
- triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm]