diff --git a/cpp/tests/unit_tests/executor/transferAgentTest.cpp b/cpp/tests/unit_tests/executor/transferAgentTest.cpp
index c73d9a2140b..4745e8e40b1 100644
--- a/cpp/tests/unit_tests/executor/transferAgentTest.cpp
+++ b/cpp/tests/unit_tests/executor/transferAgentTest.cpp
@@ -255,7 +255,8 @@ TEST_F(TransferAgentTest, SyncMessage)
         checked = nixlAgent0->checkRemoteDescs(agent1, regMem3.getDescs());
     } while (!checked);
     auto syncMessage = std::string("agent_sync_message");
-    TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem3.getDescs(), agent1, syncMessage};
+    nixlAgent0->notifySyncMessage(agent1, syncMessage);
+    TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem3.getDescs(), agent1};
     auto status = nixlAgent0->submitTransferRequests(writeReq);
 
     auto notif = nixlAgent1->getNotifiedSyncMessages();
@@ -302,7 +303,8 @@
     } while (!checked2);
 
     std::string syncMessage4 = "four_agent_sync_message";
-    TransferRequest writeReq1{TransferOp::kWRITE, regMem2.getDescs(), regMem1.getDescs(), agent0, syncMessage4};
+    nixlAgent1->notifySyncMessage(agent0, syncMessage4);
+    TransferRequest writeReq1{TransferOp::kWRITE, regMem2.getDescs(), regMem1.getDescs(), agent0};
     auto status1 = nixlAgent1->submitTransferRequests(writeReq1);
     auto notif4 = nixlAgent0->getNotifiedSyncMessages();
     for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif4.size() == 0; i++)
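Note for reviewers: with the NIXL 0.5.0 wrapper, the sync message is no longer passed as a trailing constructor argument of TransferRequest; it is published up front through notifySyncMessage() and the write request is then submitted without it. A minimal sketch of the new flow, restricted to the calls visible in the test above (agent construction and memory registration are omitted, so this fragment is illustrative rather than independently buildable):

    // Before (NIXL 0.3.1): the message rode along with the write request.
    //     TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem3.getDescs(), agent1, syncMessage};

    // After (NIXL 0.5.0): publish the message first, then submit a bare request.
    auto syncMessage = std::string("agent_sync_message");
    nixlAgent0->notifySyncMessage(agent1, syncMessage);
    TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem3.getDescs(), agent1};
    auto status = nixlAgent0->submitTransferRequests(writeReq);

    // The receiving agent polls for the message exactly as before.
    auto notif = nixlAgent1->getNotifiedSyncMessages();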
diff --git a/docker/common/install_nixl.sh b/docker/common/install_nixl.sh
index 18ee554f693..cecd61a7af4 100644
--- a/docker/common/install_nixl.sh
+++ b/docker/common/install_nixl.sh
@@ -4,8 +4,9 @@ set -ex
 GITHUB_URL="https://github.com"
 UCX_INSTALL_PATH="/usr/local/ucx/"
 CUDA_PATH="/usr/local/cuda"
-NIXL_VERSION="0.3.1"
+NIXL_VERSION="0.5.0"
 NIXL_REPO="https://github.com/ai-dynamo/nixl.git"
+OLD_LD_LIBRARY_PATH=$LD_LIBRARY_PATH
 
 ARCH_NAME="x86_64-linux-gnu"
 GDS_PATH="$CUDA_PATH/targets/x86_64-linux"
@@ -18,25 +19,26 @@ pip3 install --no-cache-dir meson ninja pybind11
 git clone --depth 1 -b ${NIXL_VERSION} ${NIXL_REPO}
 cd nixl
 
-cuda_path=$(find / -name "libcuda.so.1" 2>/dev/null | head -n1)
-if [[ -z "$cuda_path" ]]; then
-    echo "libcuda.so.1 not found "
+CUDA_SO_PATH=$(find "/usr/local" -name "libcuda.so.1" 2>/dev/null | head -n1)
+
+if [[ -z "$CUDA_SO_PATH" ]]; then
+    echo "libcuda.so.1 not found"
     exit 1
 fi
-ln -sf $cuda_path $CUDA_PATH/lib64/libcuda.so.1
+CUDA_SO_PATH=$(dirname $CUDA_SO_PATH)
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_SO_PATH
 
 meson setup builddir \
     -Ducx_path=$UCX_INSTALL_PATH \
     -Dcudapath_lib="$CUDA_PATH/lib64" \
     -Dcudapath_inc="$CUDA_PATH/include" \
     -Dgds_path="$GDS_PATH" \
-    -Dinstall_headers=true \
-    -Dstatic_plugins=UCX
+    -Dinstall_headers=true
 
 cd builddir && ninja install
 cd ../..
 
 rm -rf nixl* # Remove NIXL source tree to save space
-rm $CUDA_PATH/lib64/libcuda.so.1
+export LD_LIBRARY_PATH=$OLD_LD_LIBRARY_PATH
 
 echo "export LD_LIBRARY_PATH=/opt/nvidia/nvda_nixl/lib/${ARCH_NAME}:/opt/nvidia/nvda_nixl/lib64:\$LD_LIBRARY_PATH" >> "${ENV}"
diff --git a/docker/common/install_ucx.sh b/docker/common/install_ucx.sh
index 22f444d9746..ba35e82ce63 100644
--- a/docker/common/install_ucx.sh
+++ b/docker/common/install_ucx.sh
@@ -2,29 +2,28 @@
 set -ex
 
 GITHUB_URL="https://github.com"
-UCX_VERSION="v1.18.1"
+UCX_VERSION="v1.19.x"
 UCX_INSTALL_PATH="/usr/local/ucx/"
 CUDA_PATH="/usr/local/cuda"
 UCX_REPO="https://github.com/openucx/ucx.git"
 
-if [ ! -d ${UCX_INSTALL_PATH} ]; then
-  git clone --depth 1 -b ${UCX_VERSION} ${UCX_REPO}
-  cd ucx
-  ./autogen.sh
-  ./contrib/configure-release \
-    --prefix=${UCX_INSTALL_PATH} \
-    --enable-shared \
-    --disable-static \
-    --disable-doxygen-doc \
-    --enable-optimizations \
-    --enable-cma \
-    --enable-devel-headers \
-    --with-cuda=${CUDA_PATH} \
-    --with-verbs \
-    --with-dm \
-    --enable-mt
-  make install -j$(nproc)
-  cd ..
-  rm -rf ucx # Remove UCX source to save space
-  echo "export LD_LIBRARY_PATH=${UCX_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}"
-fi
+rm -rf ${UCX_INSTALL_PATH}
+git clone --depth 1 -b ${UCX_VERSION} ${UCX_REPO}
+cd ucx
+./autogen.sh
+./contrib/configure-release \
+  --prefix=${UCX_INSTALL_PATH} \
+  --enable-shared \
+  --disable-static \
+  --disable-doxygen-doc \
+  --enable-optimizations \
+  --enable-cma \
+  --enable-devel-headers \
+  --with-cuda=${CUDA_PATH} \
+  --with-verbs \
+  --with-dm \
+  --enable-mt
+make install -j$(nproc)
+cd ..
+rm -rf ucx # Remove UCX source to save space
+echo "export LD_LIBRARY_PATH=${UCX_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}"
diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties
index 751f2516358..bd46241e51d 100644
--- a/jenkins/current_image_tags.properties
+++ b/jenkins/current_image_tags.properties
@@ -11,7 +11,7 @@
 #
 # NB: Typically, the suffix indicates the PR whose CI pipeline generated the images. In case that
 # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
-LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508130930-6501
-LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508130930-6501
-LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202508130930-6501
-LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202508130930-6501
+LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test
+LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test
+LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202508201630-pre-test
+LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202508201630-pre-test