NVIDIA · DomBrown · Jul 17, 2025 · Jul 11, 2025 · Jul 11, 2025 · Jul 11, 2025
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -198,7 +198,7 @@ set(TRT_LIB TensorRT::NvInfer)
 get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH)
 
 set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty)
-if(BINDING_TYPE STREQUAL "pybind")
+if(BINDING_TYPE STREQUAL "pybind" OR BUILD_DEEP_EP)
   add_subdirectory(${3RDPARTY_DIR}/pybind11
                    ${CMAKE_CURRENT_BINARY_DIR}/pybind11)
 endif()
@@ -217,7 +217,7 @@ include_directories(
   ${3RDPARTY_DIR}/cutlass/tools/util/include
   ${3RDPARTY_DIR}/NVTX/include
   ${3RDPARTY_DIR}/json/include)
-if(BINDING_TYPE STREQUAL "pybind")
+if(BINDING_TYPE STREQUAL "pybind" OR BUILD_DEEP_EP)
   include_directories(${3RDPARTY_DIR}/pybind11/include)
 endif()
 if(BINDING_TYPE STREQUAL "nanobind")

diff --git a/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h b/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h
@@ -168,7 +168,7 @@ class RuntimeBuffers
 
 public:
     //! Additional buffers depending on model type
-    std::unique_ptr<TransformerBuffers> transformerBuffers;
+    std::shared_ptr<TransformerBuffers> transformerBuffers;
     std::unique_ptr<RnnStateBuffers> rnnStateBuffers;
 
     //! Encoder-Decoder

diff --git a/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp b/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp
@@ -84,7 +84,7 @@ void RuntimeBuffers::create(SizeType32 maxBatchSize, SizeType32 maxBeamWidth,
 
     if (modelConfig.isTransformerBased())
     {
-        transformerBuffers = std::make_unique<TransformerBuffers>(maxBatchSize, maxBeamWidth, maxAttentionWindowVec,
+        transformerBuffers = std::make_shared<TransformerBuffers>(maxBatchSize, maxBeamWidth, maxAttentionWindowVec,
             maxAttentionWindow, sinkTokenLen, runtime, modelConfig, worldConfig);
     }
     if (modelConfig.isRnnBased())

diff --git a/cpp/tensorrt_llm/nanobind/CMakeLists.txt b/cpp/tensorrt_llm/nanobind/CMakeLists.txt
@@ -3,7 +3,23 @@ set(TRTLLM_NB_MODULE
     ${TRTLLM_NB_MODULE}
     PARENT_SCOPE)
 
-set(SRCS ../runtime/ipcNvlsMemory.cu bindings.cpp)
+set(SRCS
+    batch_manager/algorithms.cpp
+    batch_manager/bindings.cpp
+    batch_manager/buffers.cpp
+    batch_manager/cacheTransceiver.cpp
+    batch_manager/kvCacheManager.cpp
+    batch_manager/llmRequest.cpp
+    executor/bindings.cpp
+    executor/executor.cpp
+    executor/executorConfig.cpp
+    executor/request.cpp
+    runtime/bindings.cpp
+    testing/modelSpecBinding.cpp
+    runtime/moeBindings.cpp
+    userbuffers/bindings.cpp
+    ../runtime/ipcNvlsMemory.cu
+    bindings.cpp)
 
 include_directories(${PROJECT_SOURCE_DIR}/include)
 
@@ -14,20 +30,29 @@ set_property(TARGET ${TRTLLM_NB_MODULE} PROPERTY POSITION_INDEPENDENT_CODE ON)
 target_link_directories(${TRTLLM_NB_MODULE} PUBLIC
                         "${TORCH_INSTALL_PREFIX}/lib")
 
+if(ENABLE_NVSHMEM)
+  target_link_libraries(${TRTLLM_NB_MODULE} PUBLIC nvshmem::nvshmem_host
+                                                   nvshmem::nvshmem_device)
+endif()
+
 target_link_libraries(
   ${TRTLLM_NB_MODULE}
-  PUBLIC ${SHARED_TARGET} ${UNDEFINED_FLAG} ${NO_AS_NEEDED_FLAG}
-         ${Python3_LIBRARIES} ${TORCH_LIBRARIES} torch_python)
-
+  PUBLIC ${SHARED_TARGET}
+         ${UNDEFINED_FLAG}
+         ${NO_AS_NEEDED_FLAG}
+         ${Python3_LIBRARIES}
+         ${TORCH_LIBRARIES}
+         torch_python
+         ${CUDA_NVML_LIB})
 target_compile_definitions(
   ${TRTLLM_NB_MODULE} PUBLIC TRTLLM_NB_MODULE=${TRTLLM_NB_MODULE}
-                             NB_DETAILED_ERROR_MESSAGES=1)
+                             PYBIND11_DETAILED_ERROR_MESSAGES=1)
 
 if(NOT WIN32)
   set_target_properties(
     ${TRTLLM_NB_MODULE}
     PROPERTIES
       LINK_FLAGS
-      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
+      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
   )
 endif()
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp
@@ -0,0 +1,178 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "algorithms.h"
+#include "tensorrt_llm/batch_manager/allocateKvCache.h"
+#include "tensorrt_llm/batch_manager/assignReqSeqSlots.h"
+#include "tensorrt_llm/batch_manager/capacityScheduler.h"
+#include "tensorrt_llm/batch_manager/createNewDecoderRequests.h"
+#include "tensorrt_llm/batch_manager/handleContextLogits.h"
+#include "tensorrt_llm/batch_manager/handleGenerationLogits.h"
+#include "tensorrt_llm/batch_manager/kvCacheManager.h"
+#include "tensorrt_llm/batch_manager/llmRequest.h"
+#include "tensorrt_llm/batch_manager/logitsPostProcessor.h"
+#include "tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h"
+#include "tensorrt_llm/batch_manager/medusaBuffers.h"
+#include "tensorrt_llm/batch_manager/microBatchScheduler.h"
+#include "tensorrt_llm/batch_manager/pauseRequests.h"
+#include "tensorrt_llm/batch_manager/peftCacheManager.h"
+#include "tensorrt_llm/batch_manager/runtimeBuffers.h"
+#include "tensorrt_llm/batch_manager/updateDecoderBuffers.h"
+#include "tensorrt_llm/nanobind/common/customCasters.h"
+#include "tensorrt_llm/runtime/decoderState.h"
+#include "tensorrt_llm/runtime/torch.h"
+#include "tensorrt_llm/runtime/torchView.h"
+
+#include <ATen/core/TensorBody.h>
+#include <nanobind/nanobind.h>
+#include <nanobind/stl/list.h>
+#include <nanobind/stl/shared_ptr.h>
+#include <nanobind/stl/tuple.h>
+#include <nanobind/stl/vector.h>
+#include <torch/extension.h>
+
+#include <optional>
+
+namespace nb = nanobind;
+
+namespace tr = tensorrt_llm::runtime;
+using namespace tensorrt_llm::batch_manager;
+
+void tensorrt_llm::nanobind::batch_manager::algorithms::initBindings(nb::module_& m)
+{
+    nb::class_<CapacityScheduler>(m, CapacityScheduler::name)
+        .def(nb::init<SizeType32, executor::CapacitySchedulerPolicy, bool, bool, LlmRequestState, LlmRequestState>(),
+            nb::arg("max_num_requests"), nb::arg("capacity_scheduler_policy"), nb::arg("has_kv_cache_manager"),
+            nb::arg("two_step_lookahead") = false, nb::arg("no_schedule_until_state") = LlmRequestState::kCONTEXT_INIT,
+            nb::arg("no_schedule_after_state") = LlmRequestState::kGENERATION_COMPLETE)
+        .def("__call__", &CapacityScheduler::operator(), nb::arg("active_requests"),
+            nb::arg("kv_cache_manager") = nullptr, nb::arg("peft_cache_manager") = nullptr,
+            nb::arg("cross_kv_cache_manager") = nullptr)
+        .def("name", [](CapacityScheduler const&) { return CapacityScheduler::name; });
+
+    nb::class_<MicroBatchScheduler>(m, MicroBatchScheduler::name)
+        .def(nb::init<std::optional<batch_scheduler::ContextChunkingConfig>, std::optional<SizeType32>, LlmRequestState,
+                 LlmRequestState>(),
+            nb::arg("ctx_chunk_config") = std::nullopt, nb::arg("max_context_length") = std::nullopt,
+            nb::arg("no_schedule_until_state") = LlmRequestState::kCONTEXT_INIT,
+            nb::arg("no_schedule_after_state") = LlmRequestState::kGENERATION_COMPLETE)
+        .def("__call__", &MicroBatchScheduler::operator(), nb::arg("active_requests"), nb::arg("inflight_req_ids"),
+            nb::arg("max_batch_size_runtime"), nb::arg("max_num_tokens_runtime"))
+        .def("name", [](MicroBatchScheduler const&) { return MicroBatchScheduler::name; });
+
+    nb::class_<PauseRequests>(m, PauseRequests::name)
+        .def(nb::init<SizeType32>(), nb::arg("max_input_len"))
+        .def("__call__", &PauseRequests::operator(), nb::arg("requests_to_pause"), nb::arg("inflight_req_ids"),
+            nb::arg("req_ids_to_pause"), nb::arg("pause_flagged"), nb::arg("seq_slot_manager"),
+            nb::arg("kv_cache_manager") = std::nullopt, nb::arg("cross_kv_cache_manager") = std::nullopt,
+            nb::arg("peft_cache_manager") = std::nullopt)
+        .def("name", [](PauseRequests const&) { return PauseRequests::name; });
+
+    nb::class_<AssignReqSeqSlots>(m, AssignReqSeqSlots::name)
+        .def(nb::init<>())
+        .def("__call__", &AssignReqSeqSlots::operator(), nb::arg("seq_slot_manager"), nb::arg("context_requests"),
+            nb::arg("generation_requests"))
+        .def("name", [](AssignReqSeqSlots const&) { return AssignReqSeqSlots::name; });
+
+    nb::class_<AllocateKvCache>(m, AllocateKvCache::name)
+        .def(nb::init<>())
+        .def("__call__", &AllocateKvCache::operator(), nb::arg("kv_cache_manager"), nb::arg("context_requests"),
+            nb::arg("generation_requests"), nb::arg("model_config"), nb::arg("cross_kv_cache_manager") = std::nullopt)
+        .def("name", [](AllocateKvCache const&) { return AllocateKvCache::name; });
+
+    nb::class_<HandleContextLogits>(m, HandleContextLogits::name)
+        .def(nb::init<>())
+        .def(
+            "__call__",
+            [](HandleContextLogits const& self, DecoderInputBuffers& inputBuffers, RequestVector const& contextRequests,
+                at::Tensor const& logits, std::vector<tr::SizeType32> const& numContextLogitsVec,
+                tr::ModelConfig const& modelConfig, tr::BufferManager const& manager,
+                OptionalRef<MedusaBuffers> medusaBuffers = std::nullopt)
+            {
+                return self(inputBuffers, contextRequests, tr::TorchView::of(logits), numContextLogitsVec, modelConfig,
+                    manager, medusaBuffers);
+            },
+            nb::arg("decoder_input_buffers"), nb::arg("context_requests"), nb::arg("logits"),
+            nb::arg("num_context_logits"), nb::arg("model_config"), nb::arg("buffer_manager"),
+            nb::arg("medusa_buffers") = std::nullopt)
+        .def("name", [](HandleContextLogits const&) { return HandleContextLogits::name; });
+
+    nb::class_<HandleGenerationLogits>(m, HandleGenerationLogits::name)
+        .def(nb::init<>())
+        .def(
+            "__call__",
+            [](HandleGenerationLogits const& self, DecoderInputBuffers& inputBuffers,
+                RequestVector const& generationRequests, at::Tensor const& logits, tr::SizeType32 logitsIndex,
+                tr::ModelConfig const& modelConfig, tr::BufferManager const& manager,
+                OptionalRef<RuntimeBuffers> genRuntimeBuffers = std::nullopt,
+                OptionalRef<MedusaBuffers> medusaBuffers = std::nullopt)
+            {
+                self(inputBuffers, generationRequests, tr::TorchView::of(logits), logitsIndex, modelConfig, manager,
+                    genRuntimeBuffers, medusaBuffers);
+            },
+            nb::arg("decoder_input_buffers"), nb::arg("generation_requests"), nb::arg("logits"),
+            nb::arg("logits_index"), nb::arg("model_config"), nb::arg("buffer_manager"),
+            nb::arg("gen_runtime_buffers") = std::nullopt, nb::arg("medusa_buffers") = std::nullopt)
+        .def("name", [](HandleGenerationLogits const&) { return HandleGenerationLogits::name; });
+
+    nb::class_<MakeDecodingBatchInputOutput>(m, MakeDecodingBatchInputOutput::name)
+        .def(nb::init<>())
+        .def("__call__", &MakeDecodingBatchInputOutput::operator(), nb::arg("context_requests"),
+            nb::arg("generation_requests"), nb::arg("decoder_input_buffers"), nb::arg("decoder_state"),
+            nb::arg("model_config"), nb::arg("max_num_sequences"), nb::arg("fused_runtime_buffers") = std::nullopt)
+        .def("name", [](MakeDecodingBatchInputOutput const&) { return MakeDecodingBatchInputOutput::name; });
+
+    nb::class_<LogitsPostProcessor>(m, LogitsPostProcessor::name)
+        .def(nb::init<>())
+        .def("__call__", &LogitsPostProcessor::operator(), nb::arg("context_requests"), nb::arg("generation_requests"),
+            nb::arg("replicate_logits_post_processor"), nb::arg("decoder_buffers"), nb::arg("world_config"),
+            nb::arg("runtime"), nb::arg("logits_post_processor_batched") = std::nullopt)
+        .def("name", [](LogitsPostProcessor const&) { return LogitsPostProcessor::name; });
+
+    nb::class_<CreateNewDecoderRequests>(m, CreateNewDecoderRequests::name)
+        .def(nb::init<bool, bool, bool>(), nb::arg("speculative_decoding_fast_logits"),
+            nb::arg("is_leader_in_orch_mode"), nb::arg("is_normalize_log_probs"))
+        .def(
+            "__call__",
+            [](CreateNewDecoderRequests& self, tr::ModelConfig const& modelConfig, tr::WorldConfig const& worldConfig,
+                executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
+                tr::BufferManager const& bufferManager, nvinfer1::DataType logitsType,
+                DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
+                tensorrt_llm::runtime::CudaStream const& runtimeStream,
+                tensorrt_llm::runtime::CudaStream const& decoderStream, SizeType32 maxSequenceLength,
+                SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers = std::nullopt)
+            {
+                auto [batchSlots, samplingConfigs, lookaheadPrompt, lookaheadAlgoConfigs] = self(modelConfig,
+                    worldConfig, decodingConfig, contextRequests, bufferManager, logitsType, inputBuffers, decoderState,
+                    runtimeStream, decoderStream, maxSequenceLength, beamWidth, medusaBuffers);
+
+                return std::tuple{runtime::Torch::tensor(batchSlots), std::move(samplingConfigs),
+                    std::move(lookaheadPrompt), std::move(lookaheadAlgoConfigs)};
+            },
+            nb::arg("model_config"), nb::arg("world_config"), nb::arg("decoding_config"), nb::arg("context_requests"),
+            nb::arg("buffer_manager"), nb::arg("logits_type"), nb::arg("decoder_input_buffers"),
+            nb::arg("decoder_state"), nb::arg("runtime_stream"), nb::arg("decoder_stream"),
+            nb::arg("max_sequence_length"), nb::arg("beam_width"), nb::arg("medusa_buffers") = std::nullopt)
+        .def("name", [](CreateNewDecoderRequests const&) { return CreateNewDecoderRequests::name; });
+
+    nb::class_<UpdateDecoderBuffers>(m, UpdateDecoderBuffers::name)
+        .def(nb::init<>())
+        .def("__call__", &UpdateDecoderBuffers::operator(), nb::arg("model_config"), nb::arg("decoder_output_buffers"),
+            nb::arg("copy_buffer_manager"), nb::arg("decoder_state"), nb::arg("return_log_probs"),
+            nb::arg("decoder_finish_event"))
+        .def("name", [](UpdateDecoderBuffers const&) { return UpdateDecoderBuffers::name; });
+}
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.h b/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.h
@@ -0,0 +1,29 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <nanobind/nanobind.h>
+
+namespace nb = nanobind;
+
+namespace tensorrt_llm::nanobind::batch_manager::algorithms
+{
+
+void initBindings(nb::module_& m);
+
+}