Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions cpp/tests/unit_tests/executor/transferAgentTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,8 @@ TEST_F(TransferAgentTest, SyncMessage)
checked = nixlAgent0->checkRemoteDescs(agent1, regMem3.getDescs());
} while (!checked);
auto syncMessage = std::string("agent_sync_message");
TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem3.getDescs(), agent1, syncMessage};
nixlAgent0->notifySyncMessage(agent1, syncMessage);
TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem3.getDescs(), agent1};
auto status = nixlAgent0->submitTransferRequests(writeReq);

auto notif = nixlAgent1->getNotifiedSyncMessages();
Expand Down Expand Up @@ -302,7 +303,8 @@ TEST_F(TransferAgentTest, SyncMessage)
} while (!checked2);

std::string syncMessage4 = "four_agent_sync_message";
TransferRequest writeReq1{TransferOp::kWRITE, regMem2.getDescs(), regMem1.getDescs(), agent0, syncMessage4};
nixlAgent1->notifySyncMessage(agent0, syncMessage4);
TransferRequest writeReq1{TransferOp::kWRITE, regMem2.getDescs(), regMem1.getDescs(), agent0};
auto status1 = nixlAgent1->submitTransferRequests(writeReq1);
auto notif4 = nixlAgent0->getNotifiedSyncMessages();
for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif4.size() == 0; i++)
Expand Down
18 changes: 10 additions & 8 deletions docker/common/install_nixl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ set -ex
GITHUB_URL="https://github.com"
UCX_INSTALL_PATH="/usr/local/ucx/"
CUDA_PATH="/usr/local/cuda"
NIXL_VERSION="0.3.1"
NIXL_VERSION="0.5.0"
NIXL_REPO="https://github.com/ai-dynamo/nixl.git"
OLD_LD_LIBRARY_PATH=$LD_LIBRARY_PATH

ARCH_NAME="x86_64-linux-gnu"
GDS_PATH="$CUDA_PATH/targets/x86_64-linux"
Expand All @@ -18,25 +19,26 @@ pip3 install --no-cache-dir meson ninja pybind11
git clone --depth 1 -b ${NIXL_VERSION} ${NIXL_REPO}
cd nixl

cuda_path=$(find / -name "libcuda.so.1" 2>/dev/null | head -n1)
if [[ -z "$cuda_path" ]]; then
echo "libcuda.so.1 not found "
CUDA_SO_PATH=$(find "/usr/local" -name "libcuda.so.1" 2>/dev/null | head -n1)

if [[ -z "$CUDA_SO_PATH" ]]; then
echo "libcuda.so.1 not found"
exit 1
fi

ln -sf $cuda_path $CUDA_PATH/lib64/libcuda.so.1
CUDA_SO_PATH=$(dirname $CUDA_SO_PATH)

export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_SO_PATH
meson setup builddir \
-Ducx_path=$UCX_INSTALL_PATH \
-Dcudapath_lib="$CUDA_PATH/lib64" \
-Dcudapath_inc="$CUDA_PATH/include" \
-Dgds_path="$GDS_PATH" \
-Dinstall_headers=true \
-Dstatic_plugins=UCX
-Dinstall_headers=true

cd builddir && ninja install
cd ../..
rm -rf nixl* # Remove NIXL source tree to save space
rm $CUDA_PATH/lib64/libcuda.so.1
export LD_LIBRARY_PATH=$OLD_LD_LIBRARY_PATH

echo "export LD_LIBRARY_PATH=/opt/nvidia/nvda_nixl/lib/${ARCH_NAME}:/opt/nvidia/nvda_nixl/lib64:\$LD_LIBRARY_PATH" >> "${ENV}"
43 changes: 21 additions & 22 deletions docker/common/install_ucx.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,28 @@
set -ex

GITHUB_URL="https://github.com"
UCX_VERSION="v1.18.1"
UCX_VERSION="v1.19.x"
UCX_INSTALL_PATH="/usr/local/ucx/"
CUDA_PATH="/usr/local/cuda"
UCX_REPO="https://github.com/openucx/ucx.git"

if [ ! -d ${UCX_INSTALL_PATH} ]; then
git clone --depth 1 -b ${UCX_VERSION} ${UCX_REPO}
cd ucx
./autogen.sh
./contrib/configure-release \
--prefix=${UCX_INSTALL_PATH} \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-cma \
--enable-devel-headers \
--with-cuda=${CUDA_PATH} \
--with-verbs \
--with-dm \
--enable-mt
make install -j$(nproc)
cd ..
rm -rf ucx # Remove UCX source to save space
echo "export LD_LIBRARY_PATH=${UCX_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}"
fi
rm -rf ${UCX_INSTALL_PATH}
git clone --depth 1 -b ${UCX_VERSION} ${UCX_REPO}
cd ucx
./autogen.sh
./contrib/configure-release \
--prefix=${UCX_INSTALL_PATH} \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-cma \
--enable-devel-headers \
--with-cuda=${CUDA_PATH} \
--with-verbs \
--with-dm \
--enable-mt
make install -j$(nproc)
cd ..
rm -rf ucx # Remove UCX source to save space
echo "export LD_LIBRARY_PATH=${UCX_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}"
8 changes: 4 additions & 4 deletions jenkins/current_image_tags.properties
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#
# NB: Typically, the suffix indicates the PR whose CI pipeline generated the images. If the
# images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508130930-6501
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508130930-6501
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202508130930-6501
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202508130930-6501
LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202508201630-pre-test
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202508201630-pre-test