diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a76b3e21558..d9e8c206f46 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -198,7 +198,7 @@ set(TRT_LIB TensorRT::NvInfer) get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH) set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty) -if(BINDING_TYPE STREQUAL "pybind") +if(BINDING_TYPE STREQUAL "pybind" OR BUILD_DEEP_EP) add_subdirectory(${3RDPARTY_DIR}/pybind11 ${CMAKE_CURRENT_BINARY_DIR}/pybind11) endif() @@ -217,7 +217,7 @@ include_directories( ${3RDPARTY_DIR}/cutlass/tools/util/include ${3RDPARTY_DIR}/NVTX/include ${3RDPARTY_DIR}/json/include) -if(BINDING_TYPE STREQUAL "pybind") +if(BINDING_TYPE STREQUAL "pybind" OR BUILD_DEEP_EP) include_directories(${3RDPARTY_DIR}/pybind11/include) endif() if(BINDING_TYPE STREQUAL "nanobind") diff --git a/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h b/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h index 13bde6d07a5..fa43d084b27 100644 --- a/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h +++ b/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h @@ -168,7 +168,7 @@ class RuntimeBuffers public: //! Additional buffers depending on model type - std::unique_ptr transformerBuffers; + std::shared_ptr transformerBuffers; std::unique_ptr rnnStateBuffers; //! Encoder-Decoder diff --git a/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp b/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp index 691fb9c7efd..e8b71d065f3 100644 --- a/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp +++ b/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp @@ -84,7 +84,7 @@ void RuntimeBuffers::create(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, if (modelConfig.isTransformerBased()) { - transformerBuffers = std::make_unique(maxBatchSize, maxBeamWidth, maxAttentionWindowVec, + transformerBuffers = std::make_shared(maxBatchSize, maxBeamWidth, maxAttentionWindowVec, maxAttentionWindow, sinkTokenLen, runtime, modelConfig, worldConfig); } if (modelConfig.isRnnBased()) diff --git a/cpp/tensorrt_llm/nanobind/CMakeLists.txt b/cpp/tensorrt_llm/nanobind/CMakeLists.txt index d2e7eac20c2..3d570f024d7 100755 --- a/cpp/tensorrt_llm/nanobind/CMakeLists.txt +++ b/cpp/tensorrt_llm/nanobind/CMakeLists.txt @@ -3,7 +3,23 @@ set(TRTLLM_NB_MODULE ${TRTLLM_NB_MODULE} PARENT_SCOPE) -set(SRCS ../runtime/ipcNvlsMemory.cu bindings.cpp) +set(SRCS + batch_manager/algorithms.cpp + batch_manager/bindings.cpp + batch_manager/buffers.cpp + batch_manager/cacheTransceiver.cpp + batch_manager/kvCacheManager.cpp + batch_manager/llmRequest.cpp + executor/bindings.cpp + executor/executor.cpp + executor/executorConfig.cpp + executor/request.cpp + runtime/bindings.cpp + testing/modelSpecBinding.cpp + runtime/moeBindings.cpp + userbuffers/bindings.cpp + ../runtime/ipcNvlsMemory.cu + bindings.cpp) include_directories(${PROJECT_SOURCE_DIR}/include) @@ -14,20 +30,29 @@ set_property(TARGET ${TRTLLM_NB_MODULE} PROPERTY POSITION_INDEPENDENT_CODE ON) target_link_directories(${TRTLLM_NB_MODULE} PUBLIC "${TORCH_INSTALL_PREFIX}/lib") +if(ENABLE_NVSHMEM) + target_link_libraries(${TRTLLM_NB_MODULE} PUBLIC nvshmem::nvshmem_host + nvshmem::nvshmem_device) +endif() + target_link_libraries( ${TRTLLM_NB_MODULE} - PUBLIC ${SHARED_TARGET} ${UNDEFINED_FLAG} ${NO_AS_NEEDED_FLAG} - ${Python3_LIBRARIES} ${TORCH_LIBRARIES} torch_python) - + PUBLIC ${SHARED_TARGET} + ${UNDEFINED_FLAG} + ${NO_AS_NEEDED_FLAG} + ${Python3_LIBRARIES} + ${TORCH_LIBRARIES} + torch_python + ${CUDA_NVML_LIB}) target_compile_definitions( ${TRTLLM_NB_MODULE} 
PUBLIC TRTLLM_NB_MODULE=${TRTLLM_NB_MODULE} - NB_DETAILED_ERROR_MESSAGES=1) + PYBIND11_DETAILED_ERROR_MESSAGES=1) if(NOT WIN32) set_target_properties( ${TRTLLM_NB_MODULE} PROPERTIES LINK_FLAGS - "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}" + "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}" ) endif() diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp new file mode 100644 index 00000000000..637401555e8 --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp @@ -0,0 +1,178 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "algorithms.h" +#include "tensorrt_llm/batch_manager/allocateKvCache.h" +#include "tensorrt_llm/batch_manager/assignReqSeqSlots.h" +#include "tensorrt_llm/batch_manager/capacityScheduler.h" +#include "tensorrt_llm/batch_manager/createNewDecoderRequests.h" +#include "tensorrt_llm/batch_manager/handleContextLogits.h" +#include "tensorrt_llm/batch_manager/handleGenerationLogits.h" +#include "tensorrt_llm/batch_manager/kvCacheManager.h" +#include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/batch_manager/logitsPostProcessor.h" +#include "tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h" +#include "tensorrt_llm/batch_manager/medusaBuffers.h" +#include "tensorrt_llm/batch_manager/microBatchScheduler.h" +#include "tensorrt_llm/batch_manager/pauseRequests.h" +#include "tensorrt_llm/batch_manager/peftCacheManager.h" +#include "tensorrt_llm/batch_manager/runtimeBuffers.h" +#include "tensorrt_llm/batch_manager/updateDecoderBuffers.h" +#include "tensorrt_llm/nanobind/common/customCasters.h" +#include "tensorrt_llm/runtime/decoderState.h" +#include "tensorrt_llm/runtime/torch.h" +#include "tensorrt_llm/runtime/torchView.h" + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace nb = nanobind; + +namespace tr = tensorrt_llm::runtime; +using namespace tensorrt_llm::batch_manager; + +void tensorrt_llm::nanobind::batch_manager::algorithms::initBindings(nb::module_& m) +{ + nb::class_(m, CapacityScheduler::name) + .def(nb::init(), + nb::arg("max_num_requests"), nb::arg("capacity_scheduler_policy"), nb::arg("has_kv_cache_manager"), + nb::arg("two_step_lookahead") = false, nb::arg("no_schedule_until_state") = LlmRequestState::kCONTEXT_INIT, + nb::arg("no_schedule_after_state") = LlmRequestState::kGENERATION_COMPLETE) + .def("__call__", &CapacityScheduler::operator(), nb::arg("active_requests"), + nb::arg("kv_cache_manager") = nullptr, nb::arg("peft_cache_manager") = nullptr, + nb::arg("cross_kv_cache_manager") = nullptr) + .def("name", [](CapacityScheduler const&) { return 
CapacityScheduler::name; }); + + nb::class_(m, MicroBatchScheduler::name) + .def(nb::init, std::optional, LlmRequestState, + LlmRequestState>(), + nb::arg("ctx_chunk_config") = std::nullopt, nb::arg("max_context_length") = std::nullopt, + nb::arg("no_schedule_until_state") = LlmRequestState::kCONTEXT_INIT, + nb::arg("no_schedule_after_state") = LlmRequestState::kGENERATION_COMPLETE) + .def("__call__", &MicroBatchScheduler::operator(), nb::arg("active_requests"), nb::arg("inflight_req_ids"), + nb::arg("max_batch_size_runtime"), nb::arg("max_num_tokens_runtime")) + .def("name", [](MicroBatchScheduler const&) { return MicroBatchScheduler::name; }); + + nb::class_(m, PauseRequests::name) + .def(nb::init(), nb::arg("max_input_len")) + .def("__call__", &PauseRequests::operator(), nb::arg("requests_to_pause"), nb::arg("inflight_req_ids"), + nb::arg("req_ids_to_pause"), nb::arg("pause_flagged"), nb::arg("seq_slot_manager"), + nb::arg("kv_cache_manager") = std::nullopt, nb::arg("cross_kv_cache_manager") = std::nullopt, + nb::arg("peft_cache_manager") = std::nullopt) + .def("name", [](PauseRequests const&) { return PauseRequests::name; }); + + nb::class_(m, AssignReqSeqSlots::name) + .def(nb::init<>()) + .def("__call__", &AssignReqSeqSlots::operator(), nb::arg("seq_slot_manager"), nb::arg("context_requests"), + nb::arg("generation_requests")) + .def("name", [](AssignReqSeqSlots const&) { return AssignReqSeqSlots::name; }); + + nb::class_(m, AllocateKvCache::name) + .def(nb::init<>()) + .def("__call__", &AllocateKvCache::operator(), nb::arg("kv_cache_manager"), nb::arg("context_requests"), + nb::arg("generation_requests"), nb::arg("model_config"), nb::arg("cross_kv_cache_manager") = std::nullopt) + .def("name", [](AllocateKvCache const&) { return AllocateKvCache::name; }); + + nb::class_(m, HandleContextLogits::name) + .def(nb::init<>()) + .def( + "__call__", + [](HandleContextLogits const& self, DecoderInputBuffers& inputBuffers, RequestVector const& contextRequests, + at::Tensor const& logits, std::vector const& numContextLogitsVec, + tr::ModelConfig const& modelConfig, tr::BufferManager const& manager, + OptionalRef medusaBuffers = std::nullopt) + { + return self(inputBuffers, contextRequests, tr::TorchView::of(logits), numContextLogitsVec, modelConfig, + manager, medusaBuffers); + }, + nb::arg("decoder_input_buffers"), nb::arg("context_requests"), nb::arg("logits"), + nb::arg("num_context_logits"), nb::arg("model_config"), nb::arg("buffer_manager"), + nb::arg("medusa_buffers") = std::nullopt) + .def("name", [](HandleContextLogits const&) { return HandleContextLogits::name; }); + + nb::class_(m, HandleGenerationLogits::name) + .def(nb::init<>()) + .def( + "__call__", + [](HandleGenerationLogits const& self, DecoderInputBuffers& inputBuffers, + RequestVector const& generationRequests, at::Tensor const& logits, tr::SizeType32 logitsIndex, + tr::ModelConfig const& modelConfig, tr::BufferManager const& manager, + OptionalRef genRuntimeBuffers = std::nullopt, + OptionalRef medusaBuffers = std::nullopt) + { + self(inputBuffers, generationRequests, tr::TorchView::of(logits), logitsIndex, modelConfig, manager, + genRuntimeBuffers, medusaBuffers); + }, + nb::arg("decoder_input_buffers"), nb::arg("generation_requests"), nb::arg("logits"), + nb::arg("logits_index"), nb::arg("model_config"), nb::arg("buffer_manager"), + nb::arg("gen_runtime_buffers") = std::nullopt, nb::arg("medusa_buffers") = std::nullopt) + .def("name", [](HandleGenerationLogits const&) { return HandleGenerationLogits::name; }); + + 
nb::class_(m, MakeDecodingBatchInputOutput::name) + .def(nb::init<>()) + .def("__call__", &MakeDecodingBatchInputOutput::operator(), nb::arg("context_requests"), + nb::arg("generation_requests"), nb::arg("decoder_input_buffers"), nb::arg("decoder_state"), + nb::arg("model_config"), nb::arg("max_num_sequences"), nb::arg("fused_runtime_buffers") = std::nullopt) + .def("name", [](MakeDecodingBatchInputOutput const&) { return MakeDecodingBatchInputOutput::name; }); + + nb::class_(m, LogitsPostProcessor::name) + .def(nb::init<>()) + .def("__call__", &LogitsPostProcessor::operator(), nb::arg("context_requests"), nb::arg("generation_requests"), + nb::arg("replicate_logits_post_processor"), nb::arg("decoder_buffers"), nb::arg("world_config"), + nb::arg("runtime"), nb::arg("logits_post_processor_batched") = std::nullopt) + .def("name", [](LogitsPostProcessor const&) { return LogitsPostProcessor::name; }); + + nb::class_(m, CreateNewDecoderRequests::name) + .def(nb::init(), nb::arg("speculative_decoding_fast_logits"), + nb::arg("is_leader_in_orch_mode"), nb::arg("is_normalize_log_probs")) + .def( + "__call__", + [](CreateNewDecoderRequests& self, tr::ModelConfig const& modelConfig, tr::WorldConfig const& worldConfig, + executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests, + tr::BufferManager const& bufferManager, nvinfer1::DataType logitsType, + DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState, + tensorrt_llm::runtime::CudaStream const& runtimeStream, + tensorrt_llm::runtime::CudaStream const& decoderStream, SizeType32 maxSequenceLength, + SizeType32 beamWidth, OptionalRef medusaBuffers = std::nullopt) + { + auto [batchSlots, samplingConfigs, lookaheadPrompt, lookaheadAlgoConfigs] = self(modelConfig, + worldConfig, decodingConfig, contextRequests, bufferManager, logitsType, inputBuffers, decoderState, + runtimeStream, decoderStream, maxSequenceLength, beamWidth, medusaBuffers); + + return std::tuple{runtime::Torch::tensor(batchSlots), std::move(samplingConfigs), + std::move(lookaheadPrompt), std::move(lookaheadAlgoConfigs)}; + }, + nb::arg("model_config"), nb::arg("world_config"), nb::arg("decoding_config"), nb::arg("context_requests"), + nb::arg("buffer_manager"), nb::arg("logits_type"), nb::arg("decoder_input_buffers"), + nb::arg("decoder_state"), nb::arg("runtime_stream"), nb::arg("decoder_stream"), + nb::arg("max_sequence_length"), nb::arg("beam_width"), nb::arg("medusa_buffers") = std::nullopt) + .def("name", [](CreateNewDecoderRequests const&) { return CreateNewDecoderRequests::name; }); + + nb::class_(m, UpdateDecoderBuffers::name) + .def(nb::init<>()) + .def("__call__", &UpdateDecoderBuffers::operator(), nb::arg("model_config"), nb::arg("decoder_output_buffers"), + nb::arg("copy_buffer_manager"), nb::arg("decoder_state"), nb::arg("return_log_probs"), + nb::arg("decoder_finish_event")) + .def("name", [](UpdateDecoderBuffers const&) { return UpdateDecoderBuffers::name; }); +} diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.h b/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.h new file mode 100644 index 00000000000..cac81d73f27 --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.h @@ -0,0 +1,29 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace nb = nanobind; + +namespace tensorrt_llm::nanobind::batch_manager::algorithms +{ + +void initBindings(nb::module_& m); + +} diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp new file mode 100644 index 00000000000..d44a957aad9 --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp @@ -0,0 +1,525 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "bindings.h" +#include "tensorrt_llm/nanobind/common/customCasters.h" + +#include "tensorrt_llm/batch_manager/common.h" +#include "tensorrt_llm/batch_manager/decoderBuffers.h" +#include "tensorrt_llm/batch_manager/medusaBuffers.h" +#include "tensorrt_llm/batch_manager/microBatchScheduler.h" +#include "tensorrt_llm/batch_manager/peftCacheManager.h" +#include "tensorrt_llm/batch_manager/rnnStateManager.h" +#include "tensorrt_llm/batch_manager/runtimeBuffers.h" +#include "tensorrt_llm/batch_manager/sequenceSlotManager.h" +#include "tensorrt_llm/nanobind/common/bindTypes.h" +#include "tensorrt_llm/runtime/gptDecoderBatched.h" +#include "tensorrt_llm/runtime/runtimeKernels.h" +#include "tensorrt_llm/runtime/torch.h" +#include "tensorrt_llm/runtime/torchView.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nb = nanobind; +namespace tb = tensorrt_llm::batch_manager; +namespace tle = tensorrt_llm::executor; +namespace tr = tensorrt_llm::runtime; + +using namespace tensorrt_llm::runtime; + +namespace tensorrt_llm::nanobind::batch_manager +{ + +void initBindings(nb::module_& m) +{ + using GenLlmReq = tb::GenericLlmRequest; + + // Create and register exceptions in module scope + nb::exception(m, "PeftTaskNotCachedException"); + nb::exception(m, "LoraCacheFullException"); + + // Register with no captures + nb::register_exception_translator( + [](std::exception_ptr const& p, void*) + { + try + { + if (p) + std::rethrow_exception(p); + } + catch (const tb::PeftTaskNotCachedException& e) + { + PyErr_SetString(nb::type().ptr(), e.what()); + } + catch (const tr::LoraCacheFullException& e) + { + PyErr_SetString(nb::type().ptr(), e.what()); + } + }); + + PybindUtils::bindSet(m, "ReqIdsSet"); + + nb::enum_(m, "LlmRequestType") + .value("LLMREQUEST_TYPE_CONTEXT_AND_GENERATION", tb::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION) + .value("LLMREQUEST_TYPE_CONTEXT_ONLY", 
tb::LLMREQUEST_TYPE_CONTEXT_ONLY) + .value("LLMREQUEST_TYPE_GENERATION_ONLY", tb::LLMREQUEST_TYPE_GENERATION_ONLY) + .export_values(); + + nb::class_(m, "ContextChunkingConfig") + .def(nb::init(), nb::arg("chunking_policy"), + nb::arg("chunk_unit_size")) + .def_rw("chunking_policy", &tb::batch_scheduler::ContextChunkingConfig::chunkingPolicy) + .def_rw("chunk_unit_size", &tb::batch_scheduler::ContextChunkingConfig::chunkUnitSize); + + nb::class_(m, "GenericLlmRequest") + .def("set_exclude_input_from_output", &GenLlmReq::setExcludeInputFromOutput, nb::arg("exclude")) + .def("get_num_tokens", &GenLlmReq::getNumTokens, nb::arg("beam")) + .def_prop_ro("max_beam_num_tokens", &GenLlmReq::getMaxBeamNumTokens) + .def("get_token", &GenLlmReq::getToken, nb::arg("beam"), nb::arg("pos")) + .def("get_tokens", nb::overload_cast(&GenLlmReq::getTokens, nb::const_), nb::arg("beam")) + .def("get_tokens", nb::overload_cast<>(&GenLlmReq::getTokens, nb::const_)) + .def("get_last_tokens", nb::overload_cast(&GenLlmReq::getLastTokens), nb::arg("beam")) + .def("get_last_tokens", nb::overload_cast<>(&GenLlmReq::getLastTokens)) + .def("get_beam_width_by_iter", &GenLlmReq::getBeamWidthByIter, nb::arg("for_next_iteration") = false) + .def_prop_ro("max_num_generated_tokens", &GenLlmReq::getMaxNumGeneratedTokens) + .def("add_new_token", &GenLlmReq::addNewToken, nb::arg("token"), nb::arg("beam")) + .def("add_new_tokens", &GenLlmReq::addNewTokens, nb::arg("beam_tokens")) + .def_prop_ro("num_draft_tokens", &GenLlmReq::getNumDraftTokens) + .def("set_generated_tokens", &GenLlmReq::setGeneratedTokens, nb::arg("generated_beam_tokens")) + .def("pause", &GenLlmReq::pause, nb::arg("max_input_len")) + .def_prop_rw("max_sent_token_len", &GenLlmReq::getMaxSentTokenLen, &GenLlmReq::setMaxSentTokenLen) + .def_prop_ro("prompt_embedding_table", &GenLlmReq::getPromptEmbeddingTable) + .def_prop_ro("multimodal_embedding", &GenLlmReq::getMultimodalEmbedding) + .def_prop_ro("mrope_rotary_cos_sin", &GenLlmReq::getMropeRotaryCosSin) + .def_prop_ro("bad_words_list", &GenLlmReq::getBadWordsList) + .def_prop_rw("draft_logits", &GenLlmReq::getDraftLogits, &GenLlmReq::setDraftLogits) + .def_prop_ro("embedding_bias", &GenLlmReq::getEmbeddingBias) + .def_prop_rw("lora_config", &GenLlmReq::getLoraConfig, &GenLlmReq::setLoraConfig) + .def_prop_rw("lora_weights", &GenLlmReq::getLoraWeights, &GenLlmReq::setLoraWeights) + .def_prop_ro("stop_words_list", &GenLlmReq::getStopWordsList) + .def_prop_ro("context_logits", &GenLlmReq::getContextLogitsHost) + .def_prop_ro("generation_logits", &GenLlmReq::getGenerationLogitsHost) + .def_prop_ro("prompt_vocab_size", &GenLlmReq::getPromptVocabSize) + .def_prop_ro("mrope_position_deltas", &GenLlmReq::getMropePositionDeltas) + .def_prop_ro("lora_task_id", &GenLlmReq::getLoraTaskId) + .def_prop_ro("lookahead_config", &GenLlmReq::getLookaheadConfig) + .def_prop_rw("context_chunk_size", &GenLlmReq::getContextChunkSize, &GenLlmReq::setContextChunkSize) + .def_prop_rw("decoding_iter", &GenLlmReq::getDecodingIter, &GenLlmReq::setDecodingIter) + .def_rw("request_id", &GenLlmReq::mRequestId) + .def_rw("prompt_len", &GenLlmReq::mPromptLen) + .def_rw("max_new_tokens", &GenLlmReq::mMaxNewTokens) + .def_rw("sampling_config", &GenLlmReq::mSamplingConfig) + .def_prop_rw("state", &GenLlmReq::getState, &GenLlmReq::setState) + .def_prop_rw("streaming", &GenLlmReq::isStreaming, &GenLlmReq::setStreaming) + .def_rw("end_id", &GenLlmReq::mEndId) + .def_rw("pad_id", &GenLlmReq::mPadId) + .def_rw("seq_slot", &GenLlmReq::mSeqSlot) + 
.def_prop_ro("return_log_probs", &GenLlmReq::returnLogProbs) + .def_prop_ro("return_context_logits", &GenLlmReq::getReturnContextLogits) + .def_prop_ro("return_generation_logits", &GenLlmReq::getReturnGenerationLogits) + .def_prop_ro("log_probs", nb::overload_cast<>(&GenLlmReq::getLogProbs, nb::const_)) + .def("get_log_probs", nb::overload_cast(&GenLlmReq::getLogProbs, nb::const_)) + .def("set_log_probs", &GenLlmReq::setLogProbs, nb::arg("log_probs"), nb::arg("beam")) + .def("set_return_encoder_output", &GenLlmReq::setReturnEncoderOutput, nb::arg("return_encoder_output")) + .def("get_return_encoder_output", &GenLlmReq::getReturnEncoderOutput) + .def("priority", nb::overload_cast<>(&GenLlmReq::priority, nb::const_)) + .def("set_priority", nb::overload_cast(&GenLlmReq::setPriority)) + .def_prop_ro("cum_log_probs", &GenLlmReq::getCumLogProbs) + .def("set_cum_log_prob", &GenLlmReq::setCumLogProb, nb::arg("cum_log_prob"), nb::arg("beam")) + .def("update_num_tokens_per_iteration", &GenLlmReq::updateNumTokensPerIteration, + nb::arg("num_tokens_per_iteration"), nb::arg("model_config")) + .def_prop_ro("orig_prompt_len", &GenLlmReq::getOrigPromptLen) + .def("has_draft_tokens", &GenLlmReq::hasDraftTokens) + .def("move_to_next_context_chunk", &GenLlmReq::moveToNextContextChunk) + .def_prop_ro("is_last_context_chunk", &GenLlmReq::isLastContextChunk) + .def_prop_ro("is_first_context_chunk", &GenLlmReq::isFirstContextChunk) + .def_prop_ro("context_remaining_length", &GenLlmReq::getContextRemainingLength) + .def_prop_ro("context_logits", &GenLlmReq::getContextLogitsHost) + .def_prop_ro("num_draft_tokens", &GenLlmReq::getNumDraftTokens) + .def("set_finished_reason", &GenLlmReq::setFinishedReason, nb::arg("finish_reason"), nb::arg("beam")) + .def_prop_ro("is_finished", &GenLlmReq::isFinished) + .def_prop_ro("is_finished_due_to_length", &GenLlmReq::isFinishedDueToLength) + .def_prop_rw( + "context_current_position", &GenLlmReq::getContextCurrentPosition, &GenLlmReq::setContextCurrentPosition) + .def_prop_ro("prepopulated_prompt_len", &GenLlmReq::getPrepopulatedPromptLen) + .def_prop_rw("guided_decoding_params", &GenLlmReq::getGuidedDecodingParams, &GenLlmReq::setGuidedDecodingParams) + .def_prop_ro("context_phase_params", &GenLlmReq::getContextPhaseParams) + .def_prop_ro("is_context_only_request", &GenLlmReq::isContextOnlyRequest) + .def_prop_ro("is_generation_only_request", &GenLlmReq::isGenerationOnlyRequest) + .def_prop_ro("is_generation_complete_state", &GenLlmReq::isGenerationCompleteState) + .def_prop_ro("is_context_finished", &GenLlmReq::isContextFinished) + .def_prop_ro("is_disagg_generation_init_state", &GenLlmReq::isDisaggGenerationInitState) + .def_prop_ro("is_disagg_generation_transmission_complete", &GenLlmReq::isDisaggGenerationTransmissionComplete) + .def_prop_ro( + "is_disagg_generation_transmission_in_progress", &GenLlmReq::isDisaggGenerationTransmissionInProgress) + .def_prop_ro("is_context_init_state", &GenLlmReq::isContextInitState) + .def_prop_ro("is_generation_in_progress_state", &GenLlmReq::isGenerationInProgressState) + .def_prop_ro("is_disagg_context_transmission_state", &GenLlmReq::isDisaggContextTransmissionState) + .def_prop_ro("is_disagg_context_complete_state", &GenLlmReq::isDisaggContextCompleteState) + .def_prop_ro("stage", &GenLlmReq::getRequestStage) + .def_prop_ro("kv_cache_transfer_time_ms", &GenLlmReq::getKvCacheTransferTimeMS) + .def_prop_ro("kv_cache_size", &GenLlmReq::getKvCacheSize) + .def_prop_ro("avg_decoded_tokens_per_iter", &GenLlmReq::getAvgDecodedTokensPerIter) 
+ .def_prop_ro("alloc_total_blocks", &GenLlmReq::getAllocTotalBlocksPerRequest) + .def_prop_ro("alloc_new_blocks", &GenLlmReq::getAllocNewBlocksPerRequest) + .def("alloc_context_logits", &GenLlmReq::allocContextLogitsHost, nb::arg("vocab_size"), nb::arg("logit_dtype")) + .def_prop_ro("reused_blocks", &GenLlmReq::getReusedBlocksPerRequest) + .def_prop_ro("missed_blocks", &GenLlmReq::getMissedBlocksPerRequest) + .def_prop_ro("kv_cache_hit_rate", &GenLlmReq::getKVCacheHitRatePerRequest) + .def_prop_ro("llm_request_type", &GenLlmReq::getLlmRequestType) + .def_prop_ro("multimodal_hashes", + [](GenLlmReq& self) + { + std::optional>> hashes = std::nullopt; + if (self.getMultimodalHashes()) + { + hashes = *self.getMultimodalHashes().value(); + } + return hashes; + }) + .def_prop_ro("multimodal_positions", + [](GenLlmReq& self) + { + std::optional> positions = std::nullopt; + if (self.getMultimodalPositions()) + { + positions = *self.getMultimodalPositions().value(); + } + return positions; + }) + .def_prop_ro("multimodal_lengths", + [](GenLlmReq& self) + { + std::optional> lengths = std::nullopt; + if (self.getMultimodalLengths()) + { + lengths = *self.getMultimodalLengths().value(); + } + return lengths; + }) + .def_prop_ro("position_ids", + [](GenLlmReq& self) + { + std::optional> positionIds = std::nullopt; + if (self.getPositionIds()) + { + positionIds = *self.getPositionIds().value(); + } + return positionIds; + }) + .def_prop_rw( + "draft_tokens", + [](GenLlmReq& self) + { + std::optional draftTokens = std::nullopt; + if (self.hasDraftTokens()) + { + draftTokens = *self.getDraftTokens(); + } + return draftTokens; + }, + [](GenLlmReq& self, std::optional const& draftTokens) + { + if (draftTokens) + { + self.setDraftTokens(std::make_shared(draftTokens.value())); + } + }) + .def_prop_rw("is_dummy_request", &GenLlmReq::isDummyRequest, &GenLlmReq::setIsDummyRequest) + .def_prop_ro("return_perf_metrics", &GenLlmReq::getReturnPerfMetrics); + + nb::class_(m, "LlmRequest", nb::dynamic_attr()) + .def( + "__init__", + [](tb::LlmRequest* self, tb::LlmRequest::RequestIdType request_id, + tb::LlmRequest::SizeType32 max_new_tokens, std::vector input_tokens, + runtime::SamplingConfig sampling_config, bool is_streaming, + std::optional end_id, std::optional pad_id, + std::optional embedding_bias, std::optional bad_words_list, + std::optional stop_words_list, + std::optional> position_ids, + std::optional prompt_embedding_table, + std::optional prompt_vocab_size, + std::optional>> multimodal_hashes, + std::optional> multimodal_positions, + std::optional> multimodal_lengths, + std::optional multimodal_embedding, std::optional mrope_rotary_cos_sin, + std::optional mrope_position_deltas, + std::optional lora_task_id, std::optional lora_weights, + std::optional lora_config, + std::optional lookahead_config, + std::optional kv_cache_retention_config, bool return_log_probs, + bool return_context_logits, bool return_generation_logits, + std::optional draft_tokens, std::optional draft_logits, + bool exclude_input_from_output, + std::optional logits_post_processor, + bool apply_logits_post_processor_batched, std::optional encoder_input_tokens, + bool return_encoder_output, std::optional client_id, + executor::PriorityType priority, std::optional encoder_input_features, + std::optional encoder_output_length, + std::optional cross_attention_mask, tb::LlmRequestType llm_request_type, + std::optional input_token_extra_ids, + tb::LlmRequest::SizeType32 num_return_sequences, std::optional eagle_config, + std::optional 
skip_cross_attn_blocks, bool return_perf_metrics, + std::optional guided_decoding_params, + std::optional language_adapter_uid, + std::optional allotted_time_ms, + std::optional context_phase_params) + { + auto makeOptionalTensor = [](std::optional const& atTensor, bool unsqueeze = false) + { + std::optional tensorPtr = std::nullopt; + if (atTensor) + { + tensorPtr = tr::TorchView::of(atTensor.value()); + if (unsqueeze) + { + (*tensorPtr)->unsqueeze(0); + } + } + return tensorPtr; + }; + + auto embedding_bias_tensor_ptr = makeOptionalTensor(embedding_bias, true); + auto bad_words_list_tensor_ptr = makeOptionalTensor(bad_words_list, true); + auto stop_words_list_tensor_ptr = makeOptionalTensor(stop_words_list, true); + auto prompt_embedding_table_tensor_ptr = makeOptionalTensor(prompt_embedding_table); + auto multimodal_embedding_tensor_ptr = makeOptionalTensor(multimodal_embedding); + auto lora_weights_tensor_ptr = makeOptionalTensor(lora_weights); + auto mrope_rotary_cos_sin_tensor_ptr = makeOptionalTensor(mrope_rotary_cos_sin); + auto lora_config_tensor_ptr = makeOptionalTensor(lora_config); + auto draft_logits_tensor_ptr = makeOptionalTensor(draft_logits); + auto encoder_input_features_tensor_ptr = makeOptionalTensor(encoder_input_features); + auto cross_attention_mask_tensor_ptr = makeOptionalTensor(cross_attention_mask); + auto skip_cross_attn_blocks_tensor_ptr = makeOptionalTensor(skip_cross_attn_blocks); + + // 49 parameters + new (self) tb::LlmRequest{request_id, max_new_tokens, input_tokens, sampling_config, is_streaming, + end_id, pad_id, embedding_bias_tensor_ptr, bad_words_list_tensor_ptr, stop_words_list_tensor_ptr, + position_ids, prompt_embedding_table_tensor_ptr, prompt_vocab_size, multimodal_hashes, + multimodal_positions, multimodal_lengths, multimodal_embedding_tensor_ptr, + mrope_rotary_cos_sin_tensor_ptr, mrope_position_deltas, lora_task_id, lora_weights_tensor_ptr, + lora_config_tensor_ptr, lookahead_config, kv_cache_retention_config, return_log_probs, + return_context_logits, return_generation_logits, draft_tokens, draft_logits_tensor_ptr, + exclude_input_from_output, logits_post_processor, apply_logits_post_processor_batched, + encoder_input_tokens, return_encoder_output, client_id, priority, encoder_input_features_tensor_ptr, + encoder_output_length, cross_attention_mask_tensor_ptr, llm_request_type, input_token_extra_ids, + num_return_sequences, eagle_config, skip_cross_attn_blocks_tensor_ptr, return_perf_metrics, + guided_decoding_params, language_adapter_uid, allotted_time_ms, context_phase_params}; + }, + nb::arg("request_id"), nb::arg("max_new_tokens"), nb::arg("input_tokens"), nb::arg("sampling_config"), + nb::arg("is_streaming"), nb::arg("end_id") = std::nullopt, nb::arg("pad_id") = std::nullopt, + nb::arg("embedding_bias") = std::nullopt, nb::arg("bad_words_list") = std::nullopt, + nb::arg("stop_words_list") = std::nullopt, nb::arg("position_ids") = std::nullopt, + nb::arg("prompt_embedding_table") = std::nullopt, nb::arg("prompt_vocab_size") = std::nullopt, + nb::arg("multimodal_hashes") = std::nullopt, nb::arg("multimodal_positions") = std::nullopt, + nb::arg("multimodal_lengths") = std::nullopt, nb::arg("multimodal_embedding") = std::nullopt, + nb::arg("mrope_rotary_cos_sin") = std::nullopt, nb::arg("mrope_position_deltas") = std::nullopt, + nb::arg("lora_task_id") = std::nullopt, nb::arg("lora_weights") = std::nullopt, + nb::arg("lora_config") = std::nullopt, nb::arg("lookahead_config") = std::nullopt, + nb::arg("kv_cache_retention_config") = 
std::nullopt, nb::arg("return_log_probs") = false, + nb::arg("return_context_logits") = false, nb::arg("return_generation_logits") = false, + nb::arg("draft_tokens") = std::nullopt, nb::arg("draft_logits") = std::nullopt, + nb::arg("exclude_input_from_output") = false, nb::arg("logits_post_processor") = std::nullopt, + nb::arg("apply_logits_post_processor_batched") = false, nb::arg("encoder_input_tokens") = std::nullopt, + nb::arg("return_encoder_output") = false, nb::arg("client_id") = std::nullopt, + nb::arg("priority") = executor::Request::kDefaultPriority, nb::arg("encoder_input_features") = std::nullopt, + nb::arg("encoder_output_len") = std::nullopt, nb::arg("cross_attention_mask") = std::nullopt, + nb::arg("llm_request_type") = tb::LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, + nb::arg("input_token_extra_ids") = std::nullopt, nb::arg("num_return_sequences") = 1, + nb::arg("eagle_config") = std::nullopt, nb::arg("skip_cross_attn_blocks") = std::nullopt, + nb::arg("return_perf_metrics") = false, nb::arg("guided_decoding_params") = std::nullopt, + nb::arg("language_adapter_uid") = std::nullopt, nb::arg("allotted_time_ms") = std::nullopt, + nb::arg("context_phase_params") = std::nullopt) + .def("validate", &tb::LlmRequest::validate, nb::arg("max_input_len"), nb::arg("max_seq_len"), + nb::arg("max_draft_len"), nb::arg("vocab_size_padded"), nb::arg("max_endocer_input_len") = std::nullopt, + nb::arg("enable_kv_cache_reuse") = false) + .def("create_response", &tb::LlmRequest::createResponse, nb::arg("use_fast_logits") = false, + nb::arg("mpi_world_rank") = 0) + .def("create_result", &tb::LlmRequest::createResult, nb::arg("use_fast_logits") = false, + nb::arg("mpi_world_rank") = 0) + .def("create_serialized_result", + [](tb::LlmRequest& self, bool use_fast_logits = false, int mpi_world_rank = 0) + { + std::vector serialized_result; + bool is_final = false; + self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); + return std::make_tuple(nb::bytes(serialized_result.data(), serialized_result.size()), is_final); + }) + .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, nb::arg("manager")) + .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, nb::arg("manager")) + .def("finish_by_reason", &tb::LlmRequest::finishByReason, nb::arg("finish_reason")) + .def("set_first_scheduled_time", &tb::LlmRequest::setFirstScheduledTime) + .def("update_perf_metrics", &tb::LlmRequest::updatePerfMetrics, nb::arg("iter_counter")); + + nb::class_(m, "SequenceSlotManager") + .def(nb::init(), nb::arg("max_num_slots"), + nb::arg("max_sequence_idle_microseconds")) + .def("get_sequence_slot", &tb::SequenceSlotManager::getSequenceSlot, nb::arg("start_flag"), + nb::arg("sequence_id")) + .def("free_sequence_slot", &tb::SequenceSlotManager::freeSequenceSlot, nb::arg("sequence_id")) + .def("free_idle_sequence_slots", &tb::SequenceSlotManager::freeIdleSequenceSlots); + + nb::class_(m, "RnnStateManager") + .def(nb::init(), + nb::arg("max_num_sequences"), nb::arg("model_config"), nb::arg("world_config"), nb::arg("buffer_manager")); + + nb::class_(m, "DecoderInputBuffers") + .def(nb::init(), + nb::arg("max_num_sequences"), nb::arg("max_batch_size"), nb::arg("max_tokens_per_engine_step"), + nb::arg("manager")) + .def_rw("setup_batch_slots", &tb::DecoderInputBuffers::setupBatchSlots) + .def_rw("setup_batch_slots_device", &tb::DecoderInputBuffers::setupBatchSlotsDevice) + .def_rw("fill_values", 
&tb::DecoderInputBuffers::fillValues) + .def_rw("fill_values_device", &tb::DecoderInputBuffers::fillValuesDevice) + .def_rw("inputs_ids", &tb::DecoderInputBuffers::inputsIds) + .def_rw("forward_batch_slots", &tb::DecoderInputBuffers::forwardBatchSlots) + .def_rw("logits", &tb::DecoderInputBuffers::logits); + + nb::class_(m, "DecoderOutputBuffers") + .def_rw("sequence_lengths_host", &tb::DecoderOutputBuffers::sequenceLengthsHost) + .def_rw("finished_sum_host", &tb::DecoderOutputBuffers::finishedSumHost) + .def_prop_ro("new_output_tokens_host", + [](tb::DecoderOutputBuffers& self) { return tr::Torch::tensor(self.newOutputTokensHost); }) + .def_rw("cum_log_probs_host", &tb::DecoderOutputBuffers::cumLogProbsHost) + .def_rw("log_probs_host", &tb::DecoderOutputBuffers::logProbsHost) + .def_rw("finish_reasons_host", &tb::DecoderOutputBuffers::finishReasonsHost); + + nb::class_(m, "SlotDecoderBuffers") + .def(nb::init(), + nb::arg("max_beam_width"), nb::arg("max_seq_len"), nb::arg("buffer_manager")) + .def_rw("output_ids", &tb::SlotDecoderBuffers::outputIds) + .def_rw("output_ids_host", &tb::SlotDecoderBuffers::outputIdsHost) + .def_rw("sequence_lengths_host", &tb::SlotDecoderBuffers::sequenceLengthsHost) + .def_rw("cum_log_probs", &tb::SlotDecoderBuffers::cumLogProbs) + .def_rw("cum_log_probs_host", &tb::SlotDecoderBuffers::cumLogProbsHost) + .def_rw("log_probs", &tb::SlotDecoderBuffers::logProbs) + .def_rw("log_probs_host", &tb::SlotDecoderBuffers::logProbsHost) + .def_rw("finish_reasons_host", &tb::SlotDecoderBuffers::finishReasonsHost); + + nb::class_(m, "MedusaBuffers") + .def(nb::init(), + nb::arg("max_beam_width"), nb::arg("max_seq_len"), nb::arg("buffer_manager"), nb::arg("model_config"), + nb::arg("world_config"), nb::arg("decoding_config"), nb::arg("runtime")); + + m.def( + "add_new_tokens_to_requests", + [](std::vector>& requests, + std::vector const& tokens, int beam_idx) + { + TLLM_CHECK_WITH_INFO(requests.size() == tokens.size(), "Expected the same number of requests and tokens."); + + for (int i = 0; i < requests.size(); ++i) + { + requests[i]->addNewToken(tokens[i], beam_idx); + } + }, + nb::arg("requests"), nb::arg("tokens"), nb::arg("beam_idx"), + "Add new tokens to multiple LLM requests. 
The tokens vector should contain tokens for beam beam_idx of all " + "requests in order."); + + m.def( + "make_decoding_batch_input", + [](std::vector>& contextRequests, + std::vector>& genRequests, tr::ITensor::SharedPtr logits, int beamWidth, + std::vector const& numContextLogitsPrefixSum, tb::DecoderInputBuffers const& decoderInputBuffers, + runtime::decoder::DecoderState& decoderState, tr::BufferManager const& manager) + { + std::vector activeSlots; + std::vector generationSteps; + std::vector> logitsVec = {{}}; + + for (int i = 0; i < contextRequests.size(); ++i) + { + if (contextRequests[i]->isLastContextChunk()) + { + activeSlots.push_back(*contextRequests[i]->mSeqSlot); + generationSteps.push_back(contextRequests[i]->getDecodingIter()); + auto contextLogitsOffset = numContextLogitsPrefixSum[i + 1] - 1; + tr::ITensor::SharedPtr logitsView = ITensor::slice(logits, contextLogitsOffset, 1); + + if (beamWidth > 1) + { + // Tile logits of context requests + auto const logitsShape = logitsView->getShape(); + auto const logitsType = logitsView->getDataType(); + auto decoderLogits = manager.gpu(ITensor::makeShape({beamWidth, logitsShape.d[1]}), logitsType); + tensorrt_llm::runtime::kernels::tileTensor( + *decoderLogits, *logitsView, beamWidth, manager.getStream()); + decoderLogits->unsqueeze(0); + logitsVec[0].push_back(std::move(decoderLogits)); + } + else + { + logitsView->unsqueeze(1); + logitsVec[0].push_back(std::move(logitsView)); + } + } + } + + auto genLogitsOffset = numContextLogitsPrefixSum.back(); + for (int i = 0; i < genRequests.size(); ++i) + { + if (genRequests[i]->isGenerationInProgressState()) + { + activeSlots.push_back(*genRequests[i]->mSeqSlot); + generationSteps.push_back(genRequests[i]->getDecodingIter()); + + auto logitsOffset = genLogitsOffset + i * beamWidth; + auto numberOfLogits = beamWidth; + tr::ITensor::SharedPtr logitsView = ITensor::slice(logits, logitsOffset, numberOfLogits); + logitsView->unsqueeze(0); + logitsVec[0].push_back(std::move(logitsView)); + } + } + + auto& batchSlots = decoderInputBuffers.forwardBatchSlots; + batchSlots[0]->resize(activeSlots.size()); + auto batchSlotsRange = tr::BufferRange(*batchSlots[0]); + for (int i = 0; i < activeSlots.size(); ++i) + { + batchSlotsRange[i] = activeSlots[i]; + } + + auto decodingInput = std::make_unique(logitsVec, 1); + decodingInput->batchSlots = batchSlots; + + auto const maxBeamWidth = decoderState.getMaxBeamWidth(); + if (maxBeamWidth > 1) + { + // For Variable-Beam-Width-Search + decoderState.getJointDecodingInput().generationSteps = generationSteps; + } + + return decodingInput; + }, + nb::arg("context_requests"), nb::arg("generation_requests"), nb::arg("logits"), nb::arg("beam_width"), + nb::arg("num_context_logits_prefix_sum"), nb::arg("decoder_input_buffers"), nb::arg("decoder_state"), + nb::arg("buffer_manager"), "Make decoding batch input."); +} + +} // namespace tensorrt_llm::nanobind::batch_manager diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.h b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.h new file mode 100644 index 00000000000..3d5a0f5d5b2 --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.h @@ -0,0 +1,28 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +namespace nb = nanobind; + +namespace tensorrt_llm::nanobind::batch_manager +{ + +void initBindings(nb::module_& m); + +} diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/buffers.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/buffers.cpp new file mode 100644 index 00000000000..b6edcca1c24 --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/batch_manager/buffers.cpp @@ -0,0 +1,108 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "buffers.h" +#include "tensorrt_llm/nanobind/common/customCasters.h" + +#include "tensorrt_llm/batch_manager/kvCacheManager.h" +#include "tensorrt_llm/batch_manager/runtimeBuffers.h" +#include "tensorrt_llm/batch_manager/transformerBuffers.h" + +#include +#include +#include +#include +#include +#include + +namespace nb = nanobind; +namespace tb = tensorrt_llm::batch_manager; +namespace tr = tensorrt_llm::runtime; + +using tr::SizeType32; + +namespace tensorrt_llm::nanobind::batch_manager +{ + +void Buffers::initBindings(nb::module_& m) +{ + nb::class_(m, "TransformerBuffers") + .def(nb::init const&, SizeType32, SizeType32, + runtime::TllmRuntime const&, runtime::ModelConfig const&, runtime::WorldConfig const&>(), + nb::arg("max_batch_size"), nb::arg("max_beam_width"), nb::arg("max_attention_window_vec"), + nb::arg("max_attention_window"), nb::arg("sink_token_len"), nb::arg("runtime"), nb::arg("model_config"), + nb::arg("world_config")) + .def("reshape", &tb::TransformerBuffers::reshape, nb::arg("num_sequences"), nb::arg("num_input_tokens")) + .def("reshape_kv_tensors", &tb::TransformerBuffers::reshapeKvTensors, nb::arg("max_batch_size"), + nb::arg("max_beam_width"), nb::arg("max_blocks_per_seq"), nb::arg("kv_cache_type"), nb::arg("num_pools"), + nb::arg("buffer_manager")) + .def("get_buffers", &tb::TransformerBuffers::getBuffers, nb::arg("input_buffers"), nb::arg("output_buffers"), + nb::arg("model_config")) + .def("copy_position_ids", &tb::TransformerBuffers::copyPositionIds, nb::arg("runtime"), + nb::arg("position_ids_host"), nb::arg("is_chat_glm"), nb::arg("decoder_position_ids")) + .def("copy_kv_block_offsets", &tb::TransformerBuffers::copyKvBlockOffsets, nb::arg("context_requests"), + nb::arg("gen_requests"), nb::arg("kv_cache_manager"), nb::arg("cross_kv_cache_manager"), + nb::arg("buffer_manager")) + .def("copy_cache_indirection", &tb::TransformerBuffers::copyCacheIndirection, nb::arg("gen_requests"), + 
nb::arg("decoder_cache_indirection_output"), nb::arg("runtime")) + .def_rw("past_key_value_lengths", &tb::TransformerBuffers::pastKeyValueLengths) + .def_rw("position_ids", &tb::TransformerBuffers::positionIds) + .def_rw("max_attention_windows", &tb::TransformerBuffers::maxAttentionWindows) + .def_rw("sink_token_lengths", &tb::TransformerBuffers::sinkTokenLengths) + .def_rw("cache_indirection", &tb::TransformerBuffers::cacheIndirection) + .def_rw("kv_cache_block_offsets_host", &tb::TransformerBuffers::kvCacheBlockOffsetsHost) + .def_rw("kv_cache_block_offsets_device", &tb::TransformerBuffers::kvCacheBlockOffsetsDevice) + .def_rw("cross_kv_cache_block_pool_pointers", &tb::TransformerBuffers::crossKvCacheBlockPoolPointers) + .def_rw("cross_kv_cache_block_offsets_host", &tb::TransformerBuffers::crossKvCacheBlockOffsetsHost) + .def_rw("cross_kv_cache_block_offsets_device", &tb::TransformerBuffers::crossKvCacheBlockOffsetsDevice) + .def_rw("cache_indir_batched_copy_src_offsets", &tb::TransformerBuffers::cacheIndirBatchedCopySrcOffsets) + .def_rw("cache_indir_batched_copy_dst_offsets", &tb::TransformerBuffers::cacheIndirBatchedCopyDstOffsets) + .def_rw("cache_indir_batched_copy_sizes", &tb::TransformerBuffers::cacheIndirBatchedCopySizes) + .def_rw("fill_values_alt", &tb::TransformerBuffers::fillValuesAlt) + .def_rw("fill_values_alt_device", &tb::TransformerBuffers::fillValuesAltDevice) + .def_rw("seq_slots_alt", &tb::TransformerBuffers::seqSlotsAlt) + .def_rw("seq_slots_alt_device", &tb::TransformerBuffers::seqSlotsAltDevice); + + nb::class_(m, "RuntimeBuffers") + .def(nb::init const&, SizeType32, SizeType32, + runtime::TllmRuntime const&, runtime::ModelConfig const&, runtime::WorldConfig const&, + executor::DecodingConfig const&, bool, std::optional>(), + nb::arg("max_batch_size"), nb::arg("max_beam_width"), nb::arg("max_attention_window_vec"), + nb::arg("max_attention_window"), nb::arg("sink_token_len"), nb::arg("runtime"), nb::arg("model_config"), + nb::arg("world_config"), nb::arg("decoding_config"), nb::arg("gather_generation_logits"), + nb::arg("max_num_tokens") = std::nullopt) + .def_prop_rw( + "transformer_buffers", [](tb::RuntimeBuffers& self) { return self.transformerBuffers; }, + [](tb::RuntimeBuffers& self, std::shared_ptr val) + { self.transformerBuffers = val; }) + .def_rw("num_context_logits", &tb::RuntimeBuffers::numContextLogits) + .def_rw("cache_indir_decoder_io_batched_copy_src_offsets", + &tb::RuntimeBuffers::cacheIndirDecoderIOBatchedCopySrcOffsets) + .def_rw("cache_indir_decoder_io_batched_copy_dst_offsets", + &tb::RuntimeBuffers::cacheIndirDecoderIOBatchedCopyDstOffsets) + .def_rw("cache_indir_decoder_io_batched_copy_sizes", &tb::RuntimeBuffers::cacheIndirDecoderIOBatchedCopySizes) + .def_rw("logits", &tb::RuntimeBuffers::logits) + .def_rw("seq_slots", &tb::RuntimeBuffers::seqSlots) + .def_rw("seq_slots_device", &tb::RuntimeBuffers::seqSlotsDevice) + .def_rw("cache_indir_decoder_io_batched_copy_src_offsets_slice_device", + &tb::RuntimeBuffers::mCacheIndirDecoderIOBatchedCopySrcOffsetsSliceDevice) + .def_rw("cache_indir_decoder_io_batched_copy_dst_offsets_slice_device", + &tb::RuntimeBuffers::mCacheIndirDecoderIOBatchedCopyDstOffsetsSliceDevice) + .def_rw("cache_indir_decoder_io_batched_copy_copy_sizes_device", + &tb::RuntimeBuffers::mCacheIndirDecoderIOBatchedCopyCopySizesDevice); +} +} // namespace tensorrt_llm::nanobind::batch_manager diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/buffers.h b/cpp/tensorrt_llm/nanobind/batch_manager/buffers.h new file mode 100644 
index 00000000000..34df07e4073 --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/batch_manager/buffers.h @@ -0,0 +1,29 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +namespace nb = nanobind; + +namespace tensorrt_llm::nanobind::batch_manager +{ +class Buffers +{ +public: + static void initBindings(nb::module_& m); +}; +} // namespace tensorrt_llm::nanobind::batch_manager diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.cpp new file mode 100644 index 00000000000..abac6d17ed8 --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.cpp @@ -0,0 +1,110 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cacheTransceiver.h" +#include "tensorrt_llm/batch_manager/cacheTransceiver.h" +#include "tensorrt_llm/batch_manager/kvCacheManager.h" +#include "tensorrt_llm/executor/executor.h" +#include "tensorrt_llm/nanobind/common/customCasters.h" +#include +#include +#include +#include +#include +#include +#include + +using SizeType32 = tensorrt_llm::runtime::SizeType32; + +namespace tb = tensorrt_llm::batch_manager; +namespace nb = nanobind; + +namespace +{ + +class PyCacheTransceiver : public tb::BaseCacheTransceiver +{ +public: + // using BaseCacheTransceiver::BaseCacheTransceiver; // Inherit constructors + NB_TRAMPOLINE(tb::BaseCacheTransceiver, 6); + + void respondAndSendAsync(tb::LlmRequest* llmRequest) override + { + NB_OVERRIDE_PURE(respondAndSendAsync, llmRequest); + } + + void requestAndReceiveSync(tb::LlmRequest* llmRequest) override + { + NB_OVERRIDE_PURE(requestAndReceiveSync, llmRequest); + } + + void requestAndReceiveAsync(tb::LlmRequest* llmRequest) override + { + NB_OVERRIDE_PURE(requestAndReceiveAsync, llmRequest); + } + + void checkContextTransferStatus(std::optional const& atLeastRequestNum = std::nullopt) override + { + NB_OVERRIDE_PURE(checkContextTransferStatus, atLeastRequestNum); + } + + void checkGenTransferStatus(std::optional const& atLeastRequestNum = std::nullopt) override + { + NB_OVERRIDE_PURE(checkGenTransferStatus, atLeastRequestNum); + } + + bool checkGenTransferComplete() const override + { + NB_OVERRIDE_PURE(checkGenTransferComplete); + } +}; +} // namespace + +void tb::CacheTransceiverBindings::initBindings(nb::module_& m) +{ + nb::class_(m, "BaseCacheTransceiver") + .def("respond_and_send_async", &BaseCacheTransceiver::respondAndSendAsync) + .def("request_and_receive_sync", &BaseCacheTransceiver::requestAndReceiveSync) + .def("request_and_receive_async", &BaseCacheTransceiver::requestAndReceiveAsync) + .def("check_context_transfer_status", &BaseCacheTransceiver::checkContextTransferStatus) + .def("check_gen_transfer_status", &BaseCacheTransceiver::checkGenTransferStatus) + .def("check_gen_transfer_complete", &BaseCacheTransceiver::checkGenTransferComplete); + + nb::enum_(m, "CommType") + .value("UNKNOWN", tb::CacheTransceiver::CommType::UNKNOWN) + .value("MPI", tb::CacheTransceiver::CommType::MPI) + .value("UCX", tb::CacheTransceiver::CommType::UCX) + .value("NIXL", tb::CacheTransceiver::CommType::NIXL); + + nb::enum_(m, "AttentionType") + .value("DEFAULT", executor::kv_cache::CacheState::AttentionType::kDEFAULT) + .value("MLA", executor::kv_cache::CacheState::AttentionType::kMLA); + + nb::class_(m, "CacheTransceiver") + .def(nb::init, SizeType32, SizeType32, runtime::WorldConfig, nvinfer1::DataType, + executor::kv_cache::CacheState::AttentionType, std::optional>(), + nb::arg("cache_manager"), nb::arg("comm_type"), nb::arg("num_kv_heads_per_layer"), nb::arg("size_per_head"), + nb::arg("tokens_per_block"), nb::arg("world_config"), nb::arg("dtype"), nb::arg("attention_type"), + nb::arg("cache_transceiver_config") = std::nullopt); + + nb::class_(m, "CacheTransBufferManager") + .def(nb::init>(), nb::arg("cache_manager"), + nb::arg("max_num_tokens") = std::nullopt) + .def_static("pre_alloc_buffer_size", &tb::kv_cache_manager::CacheTransBufferManager::preAllocBufferSize, + nb::arg("max_num_tokens") = std::nullopt); +} diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.h b/cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.h new file mode 100644 index 00000000000..90fc63d4fde --- /dev/null +++ 
b/cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.h @@ -0,0 +1,29 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +namespace nb = nanobind; + +namespace tensorrt_llm::batch_manager +{ +class CacheTransceiverBindings +{ +public: + static void initBindings(nb::module_& m); +}; +} // namespace tensorrt_llm::batch_manager diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp new file mode 100644 index 00000000000..f1c398d31f0 --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp @@ -0,0 +1,478 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kvCacheManager.h" +#include "tensorrt_llm/batch_manager/kvCacheManager.h" +#include "tensorrt_llm/batch_manager/peftCacheManager.h" +#include "tensorrt_llm/nanobind/common/bindTypes.h" +#include "tensorrt_llm/nanobind/common/customCasters.h" +#include "tensorrt_llm/runtime/torch.h" +#include "tensorrt_llm/runtime/torchView.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace tb = tensorrt_llm::batch_manager; +namespace tbk = tensorrt_llm::batch_manager::kv_cache_manager; +namespace tr = tensorrt_llm::runtime; +namespace nb = nanobind; +using BlockKey = tbk::BlockKey; +using VecUniqueTokens = tensorrt_llm::runtime::VecUniqueTokens; +using SizeType32 = tensorrt_llm::runtime::SizeType32; +using TokenIdType = tensorrt_llm::runtime::TokenIdType; +using VecTokens = std::vector; +using CudaStreamPtr = std::shared_ptr; + +namespace +{ +std::optional from_torch(std::optional torchPtr) +{ + if (torchPtr) + { + return tr::TorchView::of(torchPtr.value()); + } + return std::nullopt; +} + +class PyKvCacheManager : public tbk::BaseKVCacheManager +{ +public: + NB_TRAMPOLINE(tbk::BaseKVCacheManager, 28); + + // using BaseKVCacheManager::BaseKVCacheManager; // Inherit constructors + void allocatePools(bool useUvm = false) override + { + NB_OVERRIDE_PURE(allocatePools, useUvm); + } + + void releasePools() override + { + NB_OVERRIDE_PURE(releasePools); + } + + void startScheduling() override + { + NB_OVERRIDE_PURE(startScheduling); + } + + SizeType32 getTokensPerBlock() const override + { + NB_OVERRIDE_PURE(getTokensPerBlock); + } + + SizeType32 getMaxNumBlocks() const override + { + NB_OVERRIDE_PURE(getMaxNumBlocks); + } + + SizeType32 getNumPools() const override + { + NB_OVERRIDE_PURE(getNumPools); + } + + tbk::KvCacheStats getKvCacheStats() const override + { + NB_OVERRIDE_PURE(getKvCacheStats); + } + + void addToken(tb::LlmRequest::RequestIdType requestId) override + { + NB_OVERRIDE_PURE(addToken, requestId); + } + + void addSequence(tb::LlmRequest::RequestIdType requestId, SizeType32 inputLength, SizeType32 beamWidth, + tensorrt_llm::common::OptionalRef llmRequest = std::nullopt) override + { + NB_OVERRIDE_PURE(addSequence, requestId, inputLength, beamWidth, llmRequest); + } + + void removeSequence(tb::LlmRequest::RequestIdType requestId, + tensorrt_llm::common::OptionalRef llmRequest = std::nullopt) override + { + NB_OVERRIDE_PURE(removeSequence, requestId, llmRequest); + } + + tbk::GenerationRequest const& getSequence(tb::LlmRequest::RequestIdType requestId) const override + { + NB_OVERRIDE_PURE(getSequence, requestId); + } + + void schedulingRemoveSequence(tb::LlmRequest::RequestIdType requestId) override + { + NB_OVERRIDE_PURE(schedulingRemoveSequence, requestId); + } + + tensorrt_llm::runtime::ITensor::SharedPtr getBlockPoolPointers() const override + { + NB_OVERRIDE_PURE(getBlockPoolPointers); + } + + tensorrt_llm::runtime::ITensor::SharedPtr getLayerToPoolMapping() const override + { + NB_OVERRIDE_PURE(getLayerToPoolMapping); + } + + void getBlockOffsetsOfBatch(tensorrt_llm::runtime::ITensor& output, SizeType32 firstBatchSlotIdx, + SizeType32 batchSize, SizeType32 beamWidth) const override + { + NB_OVERRIDE_PURE(getBlockOffsetsOfBatch, output, firstBatchSlotIdx, batchSize, beamWidth); + } + + SizeType32 copyBlockOffsets(tensorrt_llm::runtime::ITensor& output, SizeType32 outputSlotOffset, + tb::LlmRequest::RequestIdType requestId) const override + { + 
NB_OVERRIDE_PURE(copyBlockOffsets, output, outputSlotOffset, requestId); + } + + bool isEnableBlockReuse() const override + { + NB_OVERRIDE_PURE(isEnableBlockReuse); + } + + void rewindKVCache(tb::LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) override + { + NB_OVERRIDE_PURE(rewindKVCache, requestId, rewindLengths); + } + + bool isCrossKv() const override + { + NB_OVERRIDE_PURE(isCrossKv); + } + + std::optional findNewContextBlock( + VecUniqueTokens const& uniqueTokens, tb::LlmRequest const& llmRequest) const override + { + NB_OVERRIDE_PURE(findNewContextBlock, uniqueTokens, llmRequest); + } + + void storeContextBlocks(tb::LlmRequest const& llmRequest) override + { + NB_OVERRIDE_PURE(storeContextBlocks, llmRequest); + } + + std::vector> const& getCacheBlockIds( + tb::LlmRequest::RequestIdType requestId, SizeType32 windowSize) const override + { + NB_OVERRIDE_PURE(getCacheBlockIds, requestId, windowSize); + } + + std::vector>> getBatchCacheBlockIds( + std::vector const& requestIds, SizeType32 windowSize) const override + { + NB_OVERRIDE_PURE(getBatchCacheBlockIds, requestIds, windowSize); + } + + std::vector getNewlyAllocatedBlockIds( + tb::LlmRequest::RequestIdType requestId, SizeType32 windowSize) const override + { + NB_OVERRIDE_PURE(getNewlyAllocatedBlockIds, requestId, windowSize); + } + + SizeType32 getUsedNumBlocks() const override + { + NB_OVERRIDE_PURE(getUsedNumBlocks); + } + + SizeType32 getNumFreeBlocks() const override + { + NB_OVERRIDE_PURE(getNumFreeBlocks); + } + + tbk::BlockManager const& getBlockManager() const override + { + NB_OVERRIDE_PURE(getBlockManager); + } + + std::deque getLatestEvents( + std::optional timeout = std::nullopt) const override + { + NB_OVERRIDE_PURE(getLatestEvents, timeout); + } + + tensorrt_llm::runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const override + { + NB_OVERRIDE_PURE(getPrimaryPool, layer_idx); + } + + SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const override + { + NB_OVERRIDE_PURE(getPoolLayerIdx, layer_idx); + } + + void refreshBlocks() override + { + NB_OVERRIDE_PURE(refreshBlocks); + } + + void flushIterationEvents() override + { + NB_OVERRIDE_PURE(flushIterationEvents); + } +}; + +// TODO: Deduplicate executor bindings KvCacheStats +class PyBasePeftCacheManager : public tb::BasePeftCacheManager +{ +public: + ~PyBasePeftCacheManager() override = default; + + NB_TRAMPOLINE(tb::BasePeftCacheManager, 8); + + void addRequestPeft(tb::BasePeftCacheManager::LlmRequestPtr llmRequest, bool tryGpuCache = true) override + { + NB_OVERRIDE_PURE(addRequestPeft, llmRequest, tryGpuCache); + } + + tb::BasePeftCacheManager::PeftTable ensureBatch(tb::RequestVector const& contextRequests, + tb::RequestVector const& generationRequests, bool resetGpuCache = false) override + { + NB_OVERRIDE_PURE(ensureBatch, contextRequests, generationRequests, resetGpuCache); + } + + void resetDeviceCache() override + { + NB_OVERRIDE_PURE(resetDeviceCache); + } + + void markRequestDone(tb::LlmRequest const& llmReq, bool pause = false) override + { + NB_OVERRIDE_PURE(markRequestDone, llmReq, pause); + } + + tr::SizeType32 getMaxDevicePages() const override + { + NB_OVERRIDE_PURE(getMaxDevicePages); + } + + tr::SizeType32 getMaxHostPages() const override + { + NB_OVERRIDE_PURE(getMaxHostPages); + } + + tr::SizeType32 determineNumPages(std::shared_ptr llmRequest) const override + { + NB_OVERRIDE_PURE(determineNumPages, llmRequest); + } + + bool enabled() const override + { + NB_OVERRIDE_PURE(enabled); + } +}; +} // namespace + +void 
tb::kv_cache_manager::KVCacheManagerBindings::initBindings(nb::module_& m) +{ + nb::class_(m, "KvCacheStats") + .def(nb::init<>()) + .def_rw("max_num_blocks", &tbk::KvCacheStats::maxNumBlocks) + .def_rw("free_num_blocks", &tbk::KvCacheStats::freeNumBlocks) + .def_rw("used_num_blocks", &tbk::KvCacheStats::usedNumBlocks) + .def_rw("tokens_per_block", &tbk::KvCacheStats::toksPerBlock) + .def_rw("alloc_total_blocks", &tbk::KvCacheStats::allocTotalBlocks) + .def_rw("alloc_new_blocks", &tbk::KvCacheStats::allocNewBlocks) + .def_rw("reused_blocks", &tbk::KvCacheStats::reusedBlocks) + .def_rw("missed_blocks", &tbk::KvCacheStats::missedBlocks) + .def_rw("cache_hit_rate", &tbk::KvCacheStats::cacheHitRate) + .def_rw("num_free_blocks_per_window_size", &tbk::KvCacheStats::numFreeBlocksPerWindowSize); + + nb::class_(m, "TempAttentionWindowInputs") + .def(nb::init<>()) + .def_rw("paged_context_fmha", &tbk::TempAttentionWindowInputs::pagedContextFMHA) + .def_rw("max_input_len", &tbk::TempAttentionWindowInputs::maxInputLen) + .def_rw("max_num_tokens", &tbk::TempAttentionWindowInputs::maxNumTokens); + + nb::class_(m, "BlockKey") + .def(nb::init<>()) + .def(nb::init>(), nb::arg("tokens"), + nb::arg("lora_task_id") = std::nullopt) + .def(nb::init, VecUniqueTokens const&>(), nb::arg("uses_extra_ids"), + nb::arg("lora_task_id"), nb::arg("unique_tokens")) + .def_ro("uses_extra_ids", &tbk::BlockKey::usesExtraIds) + .def_ro("lora_task_id", &tbk::BlockKey::loraTaskId) + .def_ro("unique_tokens", &tbk::BlockKey::uniqueTokens); + + nb::class_(m, "BlockKeyHasher") + .def_static("hash", &tbk::BlockKeyHasher::hash, nb::arg("block_key"), nb::arg("parent_hash") = 0); + + nb::class_(m, "KVCacheEventManager") + .def(nb::init(), nb::arg("max_kv_event_entries")); + + nb::class_(m, "BaseKVCacheManager") + .def_static("calculate_max_num_blocks", &tbk::BaseKVCacheManager::calculateMaxNumBlocks, nb::arg("config"), + nb::arg("is_cross_attention"), nb::arg("dtype"), nb::arg("model_config"), nb::arg("world_config"), + nb::arg("window_size_to_layers"), nb::arg("allotted_primary_mem_bytes"), + nb::arg("allotted_secondary_mem_bytes"), nb::arg("extra_cost_memory"), nb::arg("kv_factor")) + .def("allocate_pools", &BaseKVCacheManager::allocatePools) + .def("release_pools", &BaseKVCacheManager::releasePools) + .def("start_scheduling", &BaseKVCacheManager::startScheduling) + .def_prop_ro("tokens_per_block", &BaseKVCacheManager::getTokensPerBlock) + .def_prop_ro("max_num_blocks", &BaseKVCacheManager::getMaxNumBlocks) + .def_prop_ro("num_pools", &BaseKVCacheManager::getNumPools) + .def("get_kv_cache_stats", &BaseKVCacheManager::getKvCacheStats) + .def_prop_ro("max_blocks_per_seq", + [](tbk::BaseKVCacheManager& self) { return self.getOffsetTableDimensions().maxBlocksPerSeq; }) + .def("get_needed_blocks_one_step", &BaseKVCacheManager::getNeededBlocksOneStep) + .def("get_remaining_blocks_to_completion", &BaseKVCacheManager::getRemainingBlocksToCompletion) + .def("add_token", &BaseKVCacheManager::addToken) + .def("add_sequence", &BaseKVCacheManager::addSequence) + .def("remove_sequence", &BaseKVCacheManager::removeSequence) + .def("scheduling_remove_sequence", &BaseKVCacheManager::schedulingRemoveSequence) + .def("get_block_pool_pointers", + [](tbk::BaseKVCacheManager& self) + { + std::optional block_pool_pointers{std::nullopt}; + auto tensor = self.getBlockPoolPointers(); + if (tensor) + { + std::shared_ptr _tensor = std::move(tensor); + block_pool_pointers = tr::Torch::tensor(_tensor); + } + return block_pool_pointers; + }) + 
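
For orientation, a minimal Python sketch of how the KvCacheStats and BlockKey bindings above are exercised. The import path tensorrt_llm.bindings.internal.batch_manager is an assumption based on the submodule layout in bindings.cpp; everything else follows the argument names bound above.

    # Illustrative only; import path assumed.
    from tensorrt_llm.bindings.internal.batch_manager import (
        BlockKey, BlockKeyHasher, KvCacheStats)

    stats = KvCacheStats()             # plain struct, all fields are read/write
    stats.max_num_blocks = 1024
    print(stats.max_num_blocks)

    key = BlockKey([1, 2, 3])          # tokens; lora_task_id defaults to None
    print(hex(BlockKeyHasher.hash(key)))   # parent_hash defaults to 0
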
.def("get_layer_to_pool_mapping", + [](tbk::BaseKVCacheManager& self) + { + std::optional layer_to_pool_mapping{std::nullopt}; + auto tensor = self.getLayerToPoolMapping(); + if (tensor) + { + std::shared_ptr _tensor = std::move(tensor); + layer_to_pool_mapping = tr::Torch::tensor(_tensor); + } + return layer_to_pool_mapping; + }) + .def("get_primary_pool_data", + [](tbk::BaseKVCacheManager& self, SizeType32 layer_idx) -> at::Tensor + { + auto pool = tr::Torch::tensor(self.getPrimaryPool(layer_idx)); + auto pool_layer_idx = self.getPoolLayerIdx(layer_idx); + return pool.index({torch::indexing::Slice(), pool_layer_idx}); + }) + .def("get_block_offsets_of_batch", + [](tbk::BaseKVCacheManager& self, at::Tensor output, SizeType32 firstBatchSlotIdx, SizeType32 batchSize, + SizeType32 beamWidth) + { + auto _output = from_torch(output); + TLLM_CHECK_WITH_INFO(_output.has_value(), "Invalid output tensor."); + self.getBlockOffsetsOfBatch(*(_output.value()), firstBatchSlotIdx, batchSize, beamWidth); + }) + .def("copy_block_offsets", + [](tbk::BaseKVCacheManager& self, at::Tensor output, SizeType32 outputSlotOffset, + tb::LlmRequest::RequestIdType requestId) + { + auto _output = from_torch(output); + TLLM_CHECK_WITH_INFO(_output.has_value(), "Invalid output tensor."); + auto maxBlockCount = self.copyBlockOffsets(*(_output.value()), outputSlotOffset, requestId); + return maxBlockCount; + }) + .def("copy_batch_block_offsets", + [](tbk::BaseKVCacheManager& self, at::Tensor output, + std::vector const& requestIds, SizeType32 const beamWidth, + SizeType32 const offset) + { + auto _output = from_torch(output); + TLLM_CHECK_WITH_INFO(_output.has_value(), "Invalid output tensor."); + for (size_t i = 0; i < requestIds.size(); ++i) + { + self.copyBlockOffsets(*(_output.value()), i * beamWidth + offset, requestIds[i]); + } + }) + .def( + "get_latest_events", + [](tbk::BaseKVCacheManager& self, std::optional timeout_ms = std::nullopt) + { + if (timeout_ms) + { + return self.getLatestEvents(std::chrono::milliseconds(static_cast(*timeout_ms))); + } + return self.getLatestEvents(std::nullopt); + }, + nb::arg("timeout_ms") = std::nullopt) + .def_prop_ro("enable_block_reuse", &BaseKVCacheManager::isEnableBlockReuse) + .def("rewind_kv_cache", &BaseKVCacheManager::rewindKVCache) + .def_prop_ro("cross_kv", &BaseKVCacheManager::isCrossKv) + .def("store_context_blocks", &BaseKVCacheManager::storeContextBlocks) + .def("get_cache_block_ids", &BaseKVCacheManager::getCacheBlockIds) + .def("get_batch_cache_block_ids", &BaseKVCacheManager::getBatchCacheBlockIds) + .def("get_newly_allocated_block_ids", &BaseKVCacheManager::getNewlyAllocatedBlockIds) + .def("flush_iteration_events", &BaseKVCacheManager::flushIterationEvents); + + nb::bind_vector>>(m, "CacheBlockIds"); + + nb::enum_(m, "CacheType") + .value("SELF", tbk::CacheType::kSELF) + .value("CROSS", tbk::CacheType::kCROSS) + .value("SELFKONLY", tbk::CacheType::kSELFKONLY); + + nb::class_(m, "KVCacheManager") + .def(nb::init const&, SizeType32, SizeType32, + std::map> const&, SizeType32, SizeType32, + std::vector const&, std::optional const&, + nvinfer1::DataType, SizeType32, int64_t, std::optional, bool, bool, + tbk::CacheType, std::optional, + std::shared_ptr, bool, bool>(), + nb::arg("num_kv_heads_per_layer"), nb::arg("size_per_head"), nb::arg("tokens_per_block"), + nb::arg("blocks_per_window"), nb::arg("max_num_sequences"), nb::arg("max_beam_width"), + nb::arg("max_attention_window_vec"), nb::arg("temp_attention_window_inputs").none(), nb::arg("dtype"), + 
nb::arg("sink_token_length"), nb::arg("stream"), nb::arg("max_sequence_length").none(), + nb::arg("enable_block_reuse") = false, nb::arg("onboard_blocks") = true, + nb::arg("cache_type") = tbk::CacheType::kSELF, nb::arg("secondary_offload_min_priority") = std::nullopt, + nb::arg("event_manager") = nullptr, nb::arg("enable_partial_reuse") = true, + nb::arg("copy_on_partial_reuse") = true); +} + +void tb::BasePeftCacheManagerBindings::initBindings(nb::module_& m) +{ + nb::class_(m, "BasePeftCacheManager") + .def("add_request_peft", &tb::BasePeftCacheManager::addRequestPeft, nb::arg("request"), + nb::arg("try_gpu_cache") = true) + .def( + "ensure_batch", + [](tb::BasePeftCacheManager& self, tb::RequestVector const& contextRequests, + tb::RequestVector const& generationRequests, bool resetGpuCache) + { + nb::gil_scoped_release release; + return self.ensureBatch(contextRequests, generationRequests, resetGpuCache); + }, + nb::arg("context_requests"), nb::arg("generation_requests"), nb::arg("reset_gpu_cache") = false) + .def("reset_device_cache", &tb::BasePeftCacheManager::resetDeviceCache) + .def("mark_request_done", &tb::BasePeftCacheManager::markRequestDone, nb::arg("request"), + nb::arg("pause") = false) + .def_prop_ro("max_device_pages", &tb::BasePeftCacheManager::getMaxDevicePages) + .def_prop_ro("max_host_pages", &tb::BasePeftCacheManager::getMaxHostPages) + .def("determine_num_pages", &tb::BasePeftCacheManager::determineNumPages, nb::arg("request")) + .def_prop_ro("enabled", &tb::BasePeftCacheManager::enabled); + + nb::class_(m, "PeftCacheManager") + .def(nb::init(), + nb::arg("config"), nb::arg("model_config"), nb::arg("world_config"), nb::arg("buffer_manager")); + + nb::class_(m, "NoOpPeftCacheManager").def(nb::init<>()); +} diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.h b/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.h new file mode 100644 index 00000000000..786c0d391df --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.h @@ -0,0 +1,39 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +namespace nb = nanobind; + +namespace tensorrt_llm::batch_manager::kv_cache_manager +{ +class KVCacheManagerBindings +{ +public: + static void initBindings(nb::module_& m); +}; +} // namespace tensorrt_llm::batch_manager::kv_cache_manager + +namespace tensorrt_llm::batch_manager +{ +class BasePeftCacheManagerBindings +{ +public: + static void initBindings(nb::module_& m); +}; +} // namespace tensorrt_llm::batch_manager diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp new file mode 100644 index 00000000000..d8f45cb865f --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp @@ -0,0 +1,131 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "llmRequest.h" +#include "tensorrt_llm/nanobind/common/customCasters.h" + +#include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/nanobind/common/bindTypes.h" +#include "tensorrt_llm/runtime/torch.h" +#include "tensorrt_llm/runtime/torchUtils.h" +#include "tensorrt_llm/runtime/torchView.h" + +#include +#include + +#include + +namespace tb = tensorrt_llm::batch_manager; +namespace tr = tensorrt_llm::runtime; +namespace tle = tensorrt_llm::executor; + +using namespace tensorrt_llm::nanobind::batch_manager; + +using LlmRequestPtr = std::shared_ptr; +using RequestList = std::list; + +namespace +{ + +std::optional from_torch(std::optional torchPtr) +{ + if (torchPtr) + { + return tr::TorchView::of(torchPtr.value()); + } + return std::nullopt; +} + +} // namespace + +std::optional LlmRequest::callbackAdapter( + std::optional callback) +{ + if (!callback) + { + return std::nullopt; + } + + return [callback](RequestIdType reqId, tr::ITensor::SharedPtr& tensor, tb::LlmRequest::BeamTokens const& tokens, + tr::BufferManager::CudaStreamPtr stream, std::optional clientId) + { + at::Tensor atTensor = tr::Torch::tensor(tensor); + callback.value()(reqId, atTensor, tokens, runtime::TorchUtils::stream(*stream).unwrap(), clientId); + }; +} + +std::shared_ptr LlmRequest::toTrtLlm() const +{ + + auto const draftTokens = std::make_shared>(*mDraftTokens.get()); + auto const optDraftTokens = std::optional>>(draftTokens); + auto const encoderInputTokens = mEncoderTokens.has_value() + ? 
std::make_shared>(*mEncoderTokens.value().get()) + : nullptr; + auto const optEncoderInputTokens = std::optional>>(encoderInputTokens); + // 49 parameters + return std::make_shared( // + mRequestId, // + mMaxNewTokens, // + std::make_shared>(mTokens.at(0)), // + mSamplingConfig, // + mIsStreaming, // + mEndId, // + mPadId, // + from_torch(mEmbeddingBias), // + from_torch(mBadWordsList), // + from_torch(mStopWordsList), // + mPositionIds, // + from_torch(mPromptEmbeddingTable), // + mPromptVocabSize, // + mMultimodalHashes, // + mMultimodalPositions, // + mMultimodalLengths, // + from_torch(mMultimodalEmbedding), // + from_torch(mMropeRotaryCosSin), // + mMropePositionDeltas, // + mLoraTaskId, // + from_torch(mLoraWeights), // + from_torch(mLoraConfig), // + mLookaheadConfig, // + mKvCacheRetentionConfig, // + mReturnLogProbs, // + mReturnContextLogits, // + mReturnGenerationLogits, // + optDraftTokens, // + from_torch(mDraftLogits), // + mExcludeInputFromOutput, // + callbackAdapter(mLogitsPostProcessor), // + mApplyLogitsPostProcessorBatched, // + optEncoderInputTokens, // + mReturnEncoderOutput, // + mClientId, // + mPriority, // + from_torch(mEncoderInputFeatures), // + mEncoderOutputLength, // + from_torch(mCrossAttentionMask), // + getLlmRequestType(), // + std::nullopt, // inputTokenExtraIds + mNumReturnSequences, // + mEagleConfig, // + from_torch(mSkipCrossAttnBlocks), // + false, // returnPerfMetrics + mGuidedDecodingParams, // + mLanguageAdapterUid, // + mAllottedTimeMs, // + mContextPhaseParams // + ); +} diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h new file mode 100644 index 00000000000..624dc55112d --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h @@ -0,0 +1,160 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "tensorrt_llm/batch_manager/llmRequest.h" + +#include +#include +#include +#include +#include + +namespace nb = nanobind; + +namespace tensorrt_llm::nanobind::batch_manager +{ + +namespace tb = tensorrt_llm::batch_manager; + +/* Unfortunately, torch's default nanobind bindings don't know about c10::cuda::CUDAStream, + * so we have to pass the more generic c10::Stream, and convert it back to a full-fledged + * torch.cuda.Stream in python. 
See example in test/bindings/test_gpt_manager.py + */ +class LlmRequest : public tb::GenericLlmRequest +{ +public: + using Base = GenericLlmRequest; + using TensorPtr = Base::TensorPtr; + using SizeType32 = Base::SizeType32; + using TokenIdType = Base::TokenIdType; + using RequestIdType = Base::RequestIdType; + using LoraTaskIdType = Base::LoraTaskIdType; + using VecLogProbs = Base::VecLogProbs; + using BeamTokens = Base::BeamTokens; + using VecTokens = Base::VecTokens; + using VecTokenExtraIds = Base::VecTokenExtraIds; + using LogitsPostProcessor = Base::LogitsPostProcessor; + + // 49 parameters + LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::vector inputTokens, + runtime::SamplingConfig samplingConfig, bool isStreaming, std::optional endId = std::nullopt, + std::optional padId = std::nullopt, std::optional embeddingBias = std::nullopt, + std::optional badWordsList = std::nullopt, std::optional stopWordsList = std::nullopt, + std::optional> positionIds = std::nullopt, + std::optional promptEmbeddingTable = std::nullopt, + std::optional promptVocabSize = std::nullopt, + std::optional>> multimodalHashes = std::nullopt, + std::optional> multimodalPositions = std::nullopt, + std::optional> multimodalLengths = std::nullopt, + std::optional multimodalEmbedding = std::nullopt, + std::optional mropeRotaryCosSin = std::nullopt, + std::optional mropePositionDeltas = std::nullopt, + std::optional loraTaskId = std::nullopt, std::optional loraWeights = std::nullopt, + std::optional loraConfig = std::nullopt, + std::optional lookaheadConfig = std::nullopt, + std::optional kvCacheRetentionConfig = std::nullopt, + bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false, + std::optional draftTokens = std::nullopt, std::optional draftLogits = std::nullopt, + bool excludeInputFromOutput = false, std::optional logitsPostProcessor = std::nullopt, + bool applyLogitsPostProcessorBatched = false, std::optional encoderInputTokens = std::nullopt, + bool returnEncoderOutput = false, std::optional clientId = std::nullopt, + executor::PriorityType priority = executor::Request::kDefaultPriority, + std::optional encoderInputFeatures = std::nullopt, + std::optional encoderOutputLength = std::nullopt, + std::optional crossAttentionMask = std::nullopt, + tb::LlmRequestType llmRequestType = tb::LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, + std::optional inputTokenExtraIds = std::nullopt, SizeType32 numReturnSequences = 1, + std::optional eagleConfig = std::nullopt, + std::optional skipCrossAttnBlocks = std::nullopt, bool returnPerfMetrics = false, + std::optional guidedDecodingParams = std::nullopt, + std::optional languageAdapterUid = std::nullopt, + std::optional allottedTimeMs = std::nullopt, + std::optional const& contextPhaseParams = std::nullopt) + : Base(requestId, // + maxNewTokens, // + std::make_shared>(std::move(inputTokens)), // + samplingConfig, // + isStreaming, // + endId, // + padId, // + embeddingBias, // + badWordsList, // + stopWordsList, // + positionIds.has_value() ? std::make_shared>(std::move(positionIds.value())) // + : std::optional>>(std::nullopt), // + promptEmbeddingTable, // + promptVocabSize, // + multimodalHashes.has_value() + ? std::make_optional( + std::make_shared>>(std::move(multimodalHashes.value()))) // + : std::optional>>>(std::nullopt), // + multimodalPositions.has_value() + ? std::make_shared>(std::move(multimodalPositions.value())) // + : std::optional>>(std::nullopt), // + multimodalLengths.has_value() + ? 
std::make_shared>(std::move(multimodalLengths.value())) // + : std::optional>>(std::nullopt), // + multimodalEmbedding, // + mropeRotaryCosSin, // + mropePositionDeltas, // + loraTaskId, // + loraWeights, // + loraConfig, // + lookaheadConfig, // + kvCacheRetentionConfig, // + returnLogProbs, // + returnContextLogits, // + returnGenerationLogits, // + draftTokens.has_value() ? std::make_shared(std::move(draftTokens.value())) // + : std::make_shared(), // + draftLogits, // + excludeInputFromOutput, // + logitsPostProcessor, // + applyLogitsPostProcessorBatched, // + encoderInputTokens ? std::make_optional(std::make_shared(std::move(*encoderInputTokens))) // + : std::optional>(std::nullopt), // + returnEncoderOutput, // + clientId, // + priority, // + encoderInputFeatures, // + encoderOutputLength, // + crossAttentionMask, // + llmRequestType, // + inputTokenExtraIds // + ? std::make_optional(std::make_shared(std::move(*inputTokenExtraIds))) // + : std::optional>(std::nullopt), // + numReturnSequences, // + eagleConfig, // + skipCrossAttnBlocks, // + returnPerfMetrics, // + guidedDecodingParams, // + languageAdapterUid, // + allottedTimeMs, // + contextPhaseParams // + ) + { + } + + static std::optional callbackAdapter( + std::optional callback); + + [[nodiscard]] std::shared_ptr toTrtLlm() const; +}; + +} // namespace tensorrt_llm::nanobind::batch_manager diff --git a/cpp/tensorrt_llm/nanobind/bindings.cpp b/cpp/tensorrt_llm/nanobind/bindings.cpp index adc82587433..dd01d21cced 100644 --- a/cpp/tensorrt_llm/nanobind/bindings.cpp +++ b/cpp/tensorrt_llm/nanobind/bindings.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,14 +15,483 @@ * limitations under the License. 
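
The LlmRequest wrapper above keeps tensors as torch.Tensor on the Python side and converts them to ITensor views in toTrtLlm() when a request is handed to the C++ executor. A construction sketch follows; the keyword names mirror the C++ parameters and are assumed to match what batch_manager/bindings.cpp exposes, and the import paths are assumed as well.

    # Illustrative only; keyword names and import paths assumed.
    from tensorrt_llm.bindings import SamplingConfig
    from tensorrt_llm.bindings.internal.batch_manager import LlmRequest

    request = LlmRequest(
        request_id=0,
        max_new_tokens=16,
        input_tokens=[1, 2, 3],
        sampling_config=SamplingConfig(beam_width=1),
        is_streaming=False)
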
*/ +#include "tensorrt_llm/nanobind/common/customCasters.h" #include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "tensorrt_llm/batch_manager/peftCacheManagerConfig.h" +#include "tensorrt_llm/common/quantization.h" +#include "tensorrt_llm/nanobind/batch_manager/algorithms.h" +#include "tensorrt_llm/nanobind/batch_manager/bindings.h" +#include "tensorrt_llm/nanobind/batch_manager/buffers.h" +#include "tensorrt_llm/nanobind/batch_manager/cacheTransceiver.h" +#include "tensorrt_llm/nanobind/batch_manager/kvCacheManager.h" +#include "tensorrt_llm/nanobind/batch_manager/llmRequest.h" +#include "tensorrt_llm/nanobind/executor/bindings.h" +#include "tensorrt_llm/nanobind/runtime/bindings.h" +#include "tensorrt_llm/nanobind/testing/modelSpecBinding.h" +#include "tensorrt_llm/nanobind/userbuffers/bindings.h" +#include "tensorrt_llm/runtime/common.h" +#include "tensorrt_llm/runtime/cudaStream.h" +#include "tensorrt_llm/runtime/gptJsonConfig.h" +#include "tensorrt_llm/runtime/ipcNvlsMemory.h" +#include "tensorrt_llm/runtime/memoryCounters.h" +#include "tensorrt_llm/runtime/samplingConfig.h" +#include "tensorrt_llm/runtime/utils/mpiUtils.h" + +namespace nb = nanobind; +namespace tb = tensorrt_llm::batch_manager; +namespace tbk = tensorrt_llm::batch_manager::kv_cache_manager; +namespace tpb = tensorrt_llm::nanobind::batch_manager; +namespace tc = tensorrt_llm::common; +namespace tr = tensorrt_llm::runtime; +namespace tle = tensorrt_llm::executor; +using SizeType32 = tr::SizeType32; +using TokenIdType = tr::TokenIdType; +template +using OptVec = std::optional>; #if not defined(TRTLLM_NB_MODULE) #error "TRTLLM_NB_MODULE must be defined" #endif +namespace +{ +tr::SamplingConfig makeSamplingConfig(std::vector const& configs) +{ + return tr::SamplingConfig(configs); +} +} // namespace + NB_MODULE(TRTLLM_NB_MODULE, m) { m.doc() = "TensorRT-LLM Python bindings for C++ runtime"; m.attr("binding_type") = "nanobind"; + nb::set_leak_warnings(false); + + // Create MpiComm binding first since it's used in the executor bindings + nb::class_(m, "MpiComm") + .def_static("rank", + []() + { + auto& session = tensorrt_llm::mpi::MpiComm::session(); + return session.tensorrt_llm::mpi::MpiComm::getRank(); + }) + .def_static("size", + []() + { + auto& session = tensorrt_llm::mpi::MpiComm::session(); + return session.tensorrt_llm::mpi::MpiComm::getSize(); + }) + .def_static("local_size", + []() + { + auto& session = tensorrt_llm::mpi::MpiComm::localSession(); + return session.tensorrt_llm::mpi::MpiComm::getSize(); + }) + .def_static("local_init", []() { tensorrt_llm::mpi::MpiComm::localSession(); }) + .def_static("set_raw_mpi_session_by_fortran_handle", + [](int64_t fortran_handle) { tensorrt_llm::mpi::MpiComm::setRawSessionByFortran(fortran_handle); }) + .def_static("split", + [](size_t color, size_t rank) + { + auto& world = tensorrt_llm::mpi::MpiComm::world(); + tensorrt_llm::mpi::MpiComm::setSession(world.split(color, rank)); + }); + + nb::class_(m, "CudaStream") + .def( + "__init__", + [](tr::CudaStream* self, nb::object py_stream) + { + cudaStream_t stream = reinterpret_cast(nb::cast(py_stream)); + new (self) tr::CudaStream{stream}; + }, + nb::arg("stream_ptr")) + .def("get_device", &tr::CudaStream::getDevice); + + // Create submodule for executor bindings. 
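
The MpiComm and CudaStream bindings above are enough to query the local rank and hand an existing torch stream to the C++ runtime; CudaStream takes the raw cudaStream_t as an integer. The import path is assumed, and the sketch assumes a CUDA device is visible.

    # Illustrative only; import path assumed, CUDA device required.
    import torch
    from tensorrt_llm.bindings import CudaStream, MpiComm

    print(MpiComm.rank(), MpiComm.size())

    torch_stream = torch.cuda.Stream()
    stream = CudaStream(torch_stream.cuda_stream)   # wrap the raw stream pointer
    print(stream.get_device())
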
+ auto mExecutor = m.def_submodule("executor", "Executor bindings"); + auto mInternal = m.def_submodule("internal", "Internal submodule of TRTLLM runtime"); + auto mInternalRuntime = mInternal.def_submodule("runtime", "Runtime internal bindings"); + auto mInternalTesting = mInternal.def_submodule("testing", "Testing internal bindings"); + auto mInternalBatchManager = mInternal.def_submodule("batch_manager", "Batch manager internal bindings"); + + tensorrt_llm::nanobind::executor::initBindings(mExecutor); + tensorrt_llm::nanobind::runtime::initBindingsEarly(mInternalRuntime); + + auto buildInfo = m.def_submodule("BuildInfo"); + buildInfo.attr("ENABLE_MULTI_DEVICE") = nb::int_(ENABLE_MULTI_DEVICE); + + nb::class_(m, "PeftCacheManagerConfig") + .def(nb::init, std::optional, std::optional>(), + nb::arg("num_host_module_layer") = 0, nb::arg("num_device_module_layer") = 0, + nb::arg("optimal_adapter_size") = 8, nb::arg("max_adapter_size") = 64, nb::arg("num_put_workers") = 1, + nb::arg("num_ensure_workers") = 1, nb::arg("num_copy_streams") = 1, + nb::arg("max_pages_per_block_host") = 24, nb::arg("max_pages_per_block_device") = 8, + nb::arg("device_cache_percent") = std::nullopt, nb::arg("host_cache_size") = std::nullopt, + nb::arg("lora_prefetch_dir") = std::nullopt) + .def_rw("num_host_module_layer", &tb::PeftCacheManagerConfig::numHostModuleLayer) + .def_rw("num_device_module_layer", &tb::PeftCacheManagerConfig::numDeviceModuleLayer) + .def_rw("optimal_adapter_size", &tb::PeftCacheManagerConfig::optimalAdapterSize) + .def_rw("max_adapter_size", &tb::PeftCacheManagerConfig::maxAdapterSize) + .def_rw("num_put_workers", &tb::PeftCacheManagerConfig::numPutWorkers) + .def_rw("num_ensure_workers", &tb::PeftCacheManagerConfig::numEnsureWorkers) + .def_rw("num_copy_streams", &tb::PeftCacheManagerConfig::numCopyStreams) + .def_rw("max_pages_per_block_host", &tb::PeftCacheManagerConfig::maxPagesPerBlockHost) + .def_rw("max_pages_per_block_device", &tb::PeftCacheManagerConfig::maxPagesPerBlockDevice) + .def_rw("device_cache_percent", &tb::PeftCacheManagerConfig::deviceCachePercent) + .def_rw("host_cache_size", &tb::PeftCacheManagerConfig::hostCacheSize) + .def_rw("lora_prefetch_dir", &tb::PeftCacheManagerConfig::loraPrefetchDir); + + nb::enum_(m, "DataType") + .value("FLOAT", nvinfer1::DataType::kFLOAT) + .value("HALF", nvinfer1::DataType::kHALF) + .value("INT8", nvinfer1::DataType::kINT8) + .value("INT32", nvinfer1::DataType::kINT32) + .value("BOOL", nvinfer1::DataType::kBOOL) + .value("UINT8", nvinfer1::DataType::kUINT8) + .value("FP8", nvinfer1::DataType::kFP8) + .value("BF16", nvinfer1::DataType::kBF16) + .value("INT64", nvinfer1::DataType::kINT64) + .export_values(); + + nb::enum_(m, "GptModelVariant") + .value("GPT", tr::ModelConfig::ModelVariant::kGpt) + .value("GLM", tr::ModelConfig::ModelVariant::kGlm) + .value("CHATGLM", tr::ModelConfig::ModelVariant::kChatGlm) + .value("MAMBA", tr::ModelConfig::ModelVariant::kMamba) + .value("RECURRENTGEMMA", tr::ModelConfig::ModelVariant::kRecurrentGemma); + + nb::enum_(m, "KVCacheType") + .value("CONTINUOUS", tr::ModelConfig::KVCacheType::kCONTINUOUS) + .value("PAGED", tr::ModelConfig::KVCacheType::kPAGED) + .value("DISABLED", tr::ModelConfig::KVCacheType::kDISABLED) + .def("from_string", tr::ModelConfig::KVCacheTypeFromString); + + nb::enum_(m, "LayerType") + .value("ATTENTION", tr::ModelConfig::LayerType::kATTENTION) + .value("RECURRENT", tr::ModelConfig::LayerType::kRECURRENT); + + nb::enum_(m, "LoraModuleType") + .value("INVALID", 
tr::LoraModule::ModuleType::kINVALID) + .value("ATTN_QKV", tr::LoraModule::ModuleType::kATTN_QKV) + .value("ATTN_Q", tr::LoraModule::ModuleType::kATTN_Q) + .value("ATTN_K", tr::LoraModule::ModuleType::kATTN_K) + .value("ATTN_V", tr::LoraModule::ModuleType::kATTN_V) + .value("ATTN_DENSE", tr::LoraModule::ModuleType::kATTN_DENSE) + .value("MLP_H_TO_4H", tr::LoraModule::ModuleType::kMLP_H_TO_4H) + .value("MLP_4H_TO_H", tr::LoraModule::ModuleType::kMLP_4H_TO_H) + .value("MLP_GATE", tr::LoraModule::ModuleType::kMLP_GATE) + .value("CROSS_ATTN_QKV", tr::LoraModule::ModuleType::kCROSS_ATTN_QKV) + .value("CROSS_ATTN_Q", tr::LoraModule::ModuleType::kCROSS_ATTN_Q) + .value("CROSS_ATTN_K", tr::LoraModule::ModuleType::kCROSS_ATTN_K) + .value("CROSS_ATTN_V", tr::LoraModule::ModuleType::kCROSS_ATTN_V) + .value("CROSS_ATTN_DENSE", tr::LoraModule::ModuleType::kCROSS_ATTN_DENSE) + .value("MOE_H_TO_4H", tr::LoraModule::ModuleType::kMOE_H_TO_4H) + .value("MOE_4H_TO_H", tr::LoraModule::ModuleType::kMOE_4H_TO_H) + .value("MOE_GATE", tr::LoraModule::ModuleType::kMOE_GATE) + .value("MOE_ROUTER", tr::LoraModule::ModuleType::kMOE_ROUTER) + .value("MLP_ROUTER", tr::LoraModule::ModuleType::kMLP_ROUTER) + .value("MLP_GATE_UP", tr::LoraModule::ModuleType::kMLP_GATE_UP); + + nb::class_(m, "LoraModule") + .def(nb::init(), + nb::arg("module_type"), nb::arg("in_dim"), nb::arg("out_dim"), nb::arg("in_dim_first"), + nb::arg("out_dim_first"), nb::arg("in_tp_split_dim"), nb::arg("out_tp_split_dim")) + .def_prop_ro("module_type", &tr::LoraModule::name) + .def_prop_ro("in_dim", &tr::LoraModule::inDim) + .def_prop_ro("out_dim", &tr::LoraModule::outDim) + .def_prop_ro("in_dim_first", &tr::LoraModule::inDimFirst) + .def_prop_ro("out_dim_first", &tr::LoraModule::outDimFirst) + .def_prop_ro("in_tp_split_dim", &tr::LoraModule::inTpSplitDim) + .def_prop_ro("out_tp_split_dim", &tr::LoraModule::outTpSplitDim) + .def_static("create_lora_modules", &tr::LoraModule::createLoraModules, nb::arg("lora_module_names"), + nb::arg("hidden_size"), nb::arg("mlp_hidden_size"), nb::arg("num_attention_heads"), + nb::arg("num_kv_attention_heads"), nb::arg("attention_head_size"), nb::arg("tp_size") = 1, + nb::arg("num_experts") = 0); + + nb::class_(m, "QuantMode") + .def_static("none", &tc::QuantMode::none) + .def_static("int4_weights", &tc::QuantMode::int4Weights) + .def_static("int8_weights", &tc::QuantMode::int8Weights) + .def_static("activations", &tc::QuantMode::activations) + .def_static("per_channel_scaling", &tc::QuantMode::perChannelScaling) + .def_static("per_token_scaling", &tc::QuantMode::perTokenScaling) + .def_static("per_group_scaling", &tc::QuantMode::perGroupScaling) + .def_static("int8_kv_cache", &tc::QuantMode::int8KvCache) + .def_static("fp8_kv_cache", &tc::QuantMode::fp8KvCache) + .def_static("fp8_qdq", &tc::QuantMode::fp8Qdq) + .def_prop_ro("value", &tc::QuantMode::value) + .def("is_set", &tc::QuantMode::isSet, nb::arg("mode")) + .def_prop_ro("has_int4_weights", &tc::QuantMode::hasInt4Weights) + .def_prop_ro("has_int8_weights", &tc::QuantMode::hasInt8Weights) + .def_prop_ro("has_activations", &tc::QuantMode::hasActivations) + .def_prop_ro("has_per_channel_scaling", &tc::QuantMode::hasPerChannelScaling) + .def_prop_ro("has_per_token_scaling", &tc::QuantMode::hasPerTokenScaling) + .def_prop_ro("has_per_group_scaling", &tc::QuantMode::hasPerGroupScaling) + .def_prop_ro("has_static_activation_scaling", &tc::QuantMode::hasStaticActivationScaling) + .def_prop_ro("has_int8_kv_cache", &tc::QuantMode::hasInt8KvCache) + 
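
QuantMode is a flags-style value, so the static factories above compose and the has_* properties query individual bits. A short sketch with an assumed import path:

    # Illustrative only; import path assumed.
    from tensorrt_llm.bindings import QuantMode

    mode = QuantMode.int8_weights()
    print(mode.has_int8_weights)                  # True
    print(mode.is_set(QuantMode.int8_weights()))  # True
    print(hex(mode.value))
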
.def_prop_ro("has_fp8_kv_cache", &tc::QuantMode::hasFp8KvCache) + .def_prop_ro("has_fp8_qdq", &tc::QuantMode::hasFp8Qdq) + .def_prop_ro("has_nvfp4", &tc::QuantMode::hasNvfp4) + .def_prop_ro("has_w4a8_mxfp4_fp8", &tc::QuantMode::hasW4a8Mxfp4Fp8) + .def_prop_ro("has_kv_cache_quant", &tc::QuantMode::hasKvCacheQuant) + .def_static("from_description", &tc::QuantMode::fromDescription, nb::arg("quantize_weights"), + nb::arg("quantize_activations"), nb::arg("per_token"), nb::arg("per_channel"), nb::arg("per_group"), + nb::arg("use_int4_weights"), nb::arg("use_int8_kv_cache"), nb::arg("use_fp8_kv_kache"), + nb::arg("use_fp8_qdq"), nb::arg("use_fp8_rowwise"), nb::arg("use_w4a8_qserve"), nb::arg("use_nvfp4"), + nb::arg("use_fp8_block_scales"), nb::arg("use_w4a8_mxfp4_fp8")) + .def_static("use_smooth_quant", &tc::QuantMode::useSmoothQuant, nb::arg("per_token") = false, + nb::arg("per_channel") = false) + .def_static("use_weight_only", &tc::QuantMode::useWeightOnly, nb::arg("use_int4_weights") = false, + nb::arg("per_group") = false) + .def_static("from_quant_algo", &tc::QuantMode::fromQuantAlgo, nb::arg("quant_algo") = nb::none(), + nb::arg("kv_cache_quant_algo") = nb::none()) + .def(nb::self + nb::self) + .def(nb::self += nb::self) + .def(nb::self - nb::self) + .def(nb::self -= nb::self) + .def(nb::self == nb::self) + .def(nb::self != nb::self); + + nb::class_(m, "ModelConfig") + .def(nb::init(), + nb::arg("vocab_size"), nb::arg("num_layers"), nb::arg("num_attention_layers"), nb::arg("num_rnn_layers"), + nb::arg("num_heads"), nb::arg("hidden_size"), nb::arg("data_type")) + .def_prop_ro("vocab_size", &tr::ModelConfig::getVocabSize) + .def("vocab_size_padded", &tr::ModelConfig::getVocabSizePadded, nb::arg("world_size")) + .def("num_layers", &tr::ModelConfig::getNbLayers, nb::arg("pipeline_parallelism") = 1, + nb::arg("pipeline_parallelism_rank") = 0) + .def("num_attention_layers", &tr::ModelConfig::getNbAttentionLayers, nb::arg("pipeline_parallelism") = 1, + nb::arg("pipeline_parallelism_rank") = 0) + .def("num_rnn_layers", &tr::ModelConfig::getNbRnnLayers, nb::arg("pipeline_parallelism") = 1, + nb::arg("pipeline_parallelism_rank") = 0) + .def("num_kv_heads", &tr::ModelConfig::getNbKvHeads, nb::arg("layer_idx")) + .def("set_num_kv_heads", &tr::ModelConfig::setNbKvHeads, nb::arg("num_kv_heads")) + .def_prop_ro("num_heads", &tr::ModelConfig::getNbHeads) + .def_prop_ro("hidden_size", &tr::ModelConfig::getHiddenSize) + .def_prop_ro("size_per_head", &tr::ModelConfig::getSizePerHead) + .def_prop_ro("data_type", &tr::ModelConfig::getDataType) + .def_prop_ro("speculative_decoding_mode", &tr::ModelConfig::getSpeculativeDecodingMode) + .def_prop_rw("head_size", &tr::ModelConfig::getSizePerHead, &tr::ModelConfig::setSizePerHead) + .def_prop_rw( + "num_kv_heads_per_layer", &tr::ModelConfig::getNumKvHeadsPerLayer, &tr::ModelConfig::setNumKvHeadsPerLayer) + .def_prop_rw("use_gpt_attention_plugin", + nb::overload_cast<>(&tr::ModelConfig::useGptAttentionPlugin, nb::const_), + nb::overload_cast(&tr::ModelConfig::useGptAttentionPlugin)) + .def_prop_rw("use_packed_input", nb::overload_cast<>(&tr::ModelConfig::usePackedInput, nb::const_), + nb::overload_cast(&tr::ModelConfig::usePackedInput)) + .def_prop_rw("kv_cache_type", nb::overload_cast<>(&tr::ModelConfig::getKVCacheType, nb::const_), + nb::overload_cast(&tr::ModelConfig::setKVCacheType)) + .def_prop_rw("tokens_per_block", &tr::ModelConfig::getTokensPerBlock, &tr::ModelConfig::setTokensPerBlock) + .def_prop_rw("quant_mode", &tr::ModelConfig::getQuantMode, 
&tr::ModelConfig::setQuantMode) + .def_prop_ro("supports_inflight_batching", &tr::ModelConfig::supportsInflightBatching) + .def_prop_rw("max_batch_size", &tr::ModelConfig::getMaxBatchSize, &tr::ModelConfig::setMaxBatchSize) + .def_prop_rw("max_beam_width", &tr::ModelConfig::getMaxBeamWidth, &tr::ModelConfig::setMaxBeamWidth) + .def_prop_rw("max_input_len", &tr::ModelConfig::getMaxInputLen, &tr::ModelConfig::setMaxInputLen) + .def_prop_rw("max_seq_len", &tr::ModelConfig::getMaxSequenceLen, &tr::ModelConfig::setMaxSequenceLen) + .def_prop_rw("max_num_tokens", &tr::ModelConfig::getMaxNumTokens, &tr::ModelConfig::setMaxNumTokens) + .def_prop_rw("max_prompt_embedding_table_size", &tr::ModelConfig::getMaxPromptEmbeddingTableSize, + &tr::ModelConfig::setMaxPromptEmbeddingTableSize) + .def_prop_ro("use_prompt_tuning", &tr::ModelConfig::usePromptTuning) + .def_prop_ro("use_mrope", &tr::ModelConfig::useMrope) + .def_prop_rw("use_lora_plugin", nb::overload_cast<>(&tr::ModelConfig::useLoraPlugin, nb::const_), + nb::overload_cast(&tr::ModelConfig::useLoraPlugin)) + .def_prop_rw("layer_types", &tr::ModelConfig::getLayerTypes, &tr::ModelConfig::setLayerTypes) + .def_prop_rw("compute_context_logits", nb::overload_cast<>(&tr::ModelConfig::computeContextLogits, nb::const_), + nb::overload_cast(&tr::ModelConfig::computeContextLogits)) + .def_prop_rw("compute_generation_logits", + nb::overload_cast<>(&tr::ModelConfig::computeGenerationLogits, nb::const_), + nb::overload_cast(&tr::ModelConfig::computeGenerationLogits)) + .def_prop_rw("model_variant", &tr::ModelConfig::getModelVariant, &tr::ModelConfig::setModelVariant) + .def_prop_rw("use_cross_attention", &tr::ModelConfig::useCrossAttention, &tr::ModelConfig::setUseCrossAttention) + .def_prop_rw("lora_modules", &tr::ModelConfig::getLoraModules, &tr::ModelConfig::setLoraModules) + .def_prop_rw("max_lora_rank", &tr::ModelConfig::getMaxLoraRank, &tr::ModelConfig::setMaxLoraRank) + .def_prop_rw("mlp_hidden_size", &tr::ModelConfig::getMlpHiddenSize, &tr::ModelConfig::setMlpHiddenSize) + .def_prop_rw("size_per_head", &tr::ModelConfig::getSizePerHead, &tr::ModelConfig::setSizePerHead); + + nb::class_(m, "WorldConfig") + .def(nb::init> const&, bool>(), + nb::arg("tensor_parallelism") = 1, nb::arg("pipeline_parallelism") = 1, nb::arg("context_parallelism") = 1, + nb::arg("rank") = 0, nb::arg("gpus_per_node") = tr::WorldConfig::kDefaultGpusPerNode, + nb::arg("device_ids") = nb::none(), nb::arg("enable_attention_dp") = false) + .def_prop_ro("size", &tr::WorldConfig::getSize) + .def_prop_ro("tensor_parallelism", &tr::WorldConfig::getTensorParallelism) + .def_prop_ro("pipeline_parallelism", &tr::WorldConfig::getPipelineParallelism) + .def_prop_ro("context_parallelism", &tr::WorldConfig::getContextParallelism) + .def_prop_ro("is_tensor_parallel", &tr::WorldConfig::isTensorParallel) + .def_prop_ro("is_pipeline_parallel", &tr::WorldConfig::isPipelineParallel) + .def_prop_ro("is_context_parallel", &tr::WorldConfig::isContextParallel) + .def_prop_ro("rank", &tr::WorldConfig::getRank) + .def_prop_ro("local_rank", &tr::WorldConfig::getLocalRank) + .def_prop_ro("node_rank", &tr::WorldConfig::getNodeRank) + .def_prop_ro("gpus_per_node", &tr::WorldConfig::getGpusPerNode) + .def_prop_ro("gpus_per_group", &tr::WorldConfig::getGpusPerGroup) + .def_prop_ro("device", &tr::WorldConfig::getDevice) + .def_prop_ro("pipeline_parallel_rank", &tr::WorldConfig::getPipelineParallelRank) + .def_prop_ro("tensor_parallel_rank", &tr::WorldConfig::getTensorParallelRank) + 
.def_prop_ro("context_parallel_rank", &tr::WorldConfig::getContextParallelRank) + .def_prop_ro("enable_attention_dp", &tr::WorldConfig::enableAttentionDP) + .def_static("mpi", + nb::overload_cast, std::optional, + std::optional, std::optional> const&, bool>(&tr::WorldConfig::mpi), + nb::arg("gpus_per_node") = tr::WorldConfig::kDefaultGpusPerNode, nb::arg("tensor_parallelism") = nb::none(), + nb::arg("pipeline_parallelism") = nb::none(), nb::arg("context_parallelism") = nb::none(), + nb::arg("device_ids") = nb::none(), nb::arg("enable_attention_dp") = false); + + auto SamplingConfigGetState = [](tr::SamplingConfig const& config) -> nb::tuple + { + return nb::make_tuple(config.beamWidth, config.temperature, config.minLength, config.repetitionPenalty, + config.presencePenalty, config.frequencyPenalty, config.topK, config.topP, config.randomSeed, + config.topPDecay, config.topPMin, config.topPResetIds, config.beamSearchDiversityRate, config.lengthPenalty, + config.earlyStopping, config.noRepeatNgramSize, config.numReturnSequences, config.minP, + config.beamWidthArray); + }; + auto SamplingConfigSetState = [](tr::SamplingConfig& self, nb::tuple t) -> tr::SamplingConfig + { + assert(t.size() == 19); + + tr::SamplingConfig config; + config.beamWidth = nb::cast(t[0]); + config.temperature = nb::cast>(t[1]); + config.minLength = nb::cast>(t[2]); + config.repetitionPenalty = nb::cast>(t[3]); + config.presencePenalty = nb::cast>(t[4]); + config.frequencyPenalty = nb::cast>(t[5]); + config.topK = nb::cast>(t[6]); + config.topP = nb::cast>(t[7]); + config.randomSeed = nb::cast>(t[8]); + config.topPDecay = nb::cast>(t[9]); + config.topPMin = nb::cast>(t[10]); + config.topPResetIds = nb::cast>(t[11]); + config.beamSearchDiversityRate = nb::cast>(t[12]); + config.lengthPenalty = nb::cast>(t[13]); + config.earlyStopping = nb::cast>(t[14]); + config.noRepeatNgramSize = nb::cast>(t[15]); + config.numReturnSequences = nb::cast(t[16]); + config.minP = nb::cast>(t[17]); + config.beamWidthArray = nb::cast>>(t[18]); + + return config; + }; + + nb::class_(m, "SamplingConfig") + .def(nb::init(), nb::arg("beam_width") = 1) + .def(nb::init>(), + nb::arg("executor_sample_config"), nb::arg("external_draft_tokens_config") = std::nullopt) + .def_rw("beam_width", &tr::SamplingConfig::beamWidth) + .def_rw("temperature", &tr::SamplingConfig::temperature) + .def_rw("min_length", &tr::SamplingConfig::minLength) + .def_rw("repetition_penalty", &tr::SamplingConfig::repetitionPenalty) + .def_rw("presence_penalty", &tr::SamplingConfig::presencePenalty) + .def_rw("frequency_penalty", &tr::SamplingConfig::frequencyPenalty) + .def_rw("top_k", &tr::SamplingConfig::topK) + .def_rw("top_p", &tr::SamplingConfig::topP) + .def_rw("random_seed", &tr::SamplingConfig::randomSeed) + .def_rw("top_p_decay", &tr::SamplingConfig::topPDecay) + .def_rw("top_p_min", &tr::SamplingConfig::topPMin) + .def_rw("top_p_reset_ids", &tr::SamplingConfig::topPResetIds) + .def_rw("beam_search_diversity_rate", &tr::SamplingConfig::beamSearchDiversityRate) + .def_rw("length_penalty", &tr::SamplingConfig::lengthPenalty) + .def_rw("early_stopping", &tr::SamplingConfig::earlyStopping) + .def_rw("no_repeat_ngram_size", &tr::SamplingConfig::noRepeatNgramSize) + .def_rw("num_return_sequences", &tr::SamplingConfig::numReturnSequences) + .def_rw("min_p", &tr::SamplingConfig::minP) + .def_rw("beam_width_array", &tr::SamplingConfig::beamWidthArray) + .def_rw("normalize_log_probs", &tr::SamplingConfig::normalizeLogProbs) + .def("__getstate__", SamplingConfigGetState) + 
.def("__setstate__", SamplingConfigSetState) + .def("__eq__", &tr::SamplingConfig::operator==); + + nb::bind_vector>(m, "SamplingConfigVector"); + + m.def("make_sampling_config", &makeSamplingConfig, nb::arg("configs")); + + nb::class_(m, "GptJsonConfig") + .def(nb::init>(), + nb::arg("name"), nb::arg("version"), nb::arg("precision"), nb::arg("tensor_parallelism"), + nb::arg("pipeline_parallelism"), nb::arg("context_parallelism"), nb::arg("gpus_per_node"), + nb::arg("model_config"), nb::arg("runtime_defaults") = nb::none()) + .def_static("parse", nb::overload_cast(&tr::GptJsonConfig::parse), nb::arg("json")) + .def_static( + "parse_file", nb::overload_cast(&tr::GptJsonConfig::parse), nb::arg("path")) + .def_prop_ro("model_config", &tr::GptJsonConfig::getModelConfig) + .def_prop_ro("name", &tr::GptJsonConfig::getName) + .def_prop_ro("version", &tr::GptJsonConfig::getVersion) + .def_prop_ro("precision", &tr::GptJsonConfig::getPrecision) + .def_prop_ro("tensor_parallelism", &tr::GptJsonConfig::getTensorParallelism) + .def_prop_ro("pipeline_parallelism", &tr::GptJsonConfig::getPipelineParallelism) + .def_prop_ro("context_parallelism", &tr::GptJsonConfig::getContextParallelism) + .def_prop_ro("gpus_per_node", &tr::GptJsonConfig::getGpusPerNode) + .def_prop_ro("world_size", &tr::GptJsonConfig::getWorldSize) + .def_prop_ro("runtime_defaults", &tr::GptJsonConfig::getRuntimeDefaults) + .def("engine_filename", + nb::overload_cast( + &tr::GptJsonConfig::engineFilename, nb::const_), + nb::arg("world_config"), nb::arg("model")) + .def("engine_filename", + nb::overload_cast(&tr::GptJsonConfig::engineFilename, nb::const_), + nb::arg("world_config")); + + nb::enum_(m, "LlmRequestState") + .value("UNKNOWN", tb::LlmRequestState::kUNKNOWN) + .value("ENCODER_INIT", tb::LlmRequestState::kENCODER_INIT) + .value("CONTEXT_INIT", tb::LlmRequestState::kCONTEXT_INIT) + .value("GENERATION_IN_PROGRESS", tb::LlmRequestState::kGENERATION_IN_PROGRESS) + .value("GENERATION_TO_COMPLETE", tb::LlmRequestState::kGENERATION_TO_COMPLETE) + .value("GENERATION_COMPLETE", tb::LlmRequestState::kGENERATION_COMPLETE) + .value("DISAGG_GENERATION_INIT", tb::LlmRequestState::kDISAGG_GENERATION_INIT) + .value("DISAGG_CONTEXT_TRANS_IN_PROGRESS", tb::LlmRequestState::kDISAGG_CONTEXT_TRANS_IN_PROGRESS) + .value("DISAGG_CONTEXT_COMPLETE", tb::LlmRequestState::kDISAGG_CONTEXT_COMPLETE) + .value("DISAGG_GENERATION_TRANS_IN_PROGRESS", tb::LlmRequestState::kDISAGG_GENERATION_TRANS_IN_PROGRESS) + .value("DISAGG_GENERATION_TRANS_COMPLETE", tb::LlmRequestState::kDISAGG_GENERATION_TRANS_COMPLETE) + .value("DISAGG_CONTEXT_INIT_AND_TRANS", tb::LlmRequestState::kDISAGG_CONTEXT_INIT_AND_TRANS); + + nb::class_(m, "MemoryCounters") + .def_static("instance", &tr::MemoryCounters::getInstance, nb::rv_policy::reference) + .def_prop_ro("gpu", &tr::MemoryCounters::getGpu) + .def_prop_ro("cpu", &tr::MemoryCounters::getCpu) + .def_prop_ro("pinned", &tr::MemoryCounters::getPinned) + .def_prop_ro("uvm", &tr::MemoryCounters::getUVM); + + tensorrt_llm::nanobind::runtime::initBindings(mInternalRuntime); + tensorrt_llm::nanobind::testing::initBindings(mInternalTesting); + tpb::initBindings(mInternalBatchManager); + tb::kv_cache_manager::KVCacheManagerBindings::initBindings(mInternalBatchManager); + tb::BasePeftCacheManagerBindings::initBindings(mInternalBatchManager); + tb::CacheTransceiverBindings::initBindings(mInternalBatchManager); + tpb::Buffers::initBindings(mInternalBatchManager); + + auto mInternalAlgorithms = mInternal.def_submodule("algorithms", "Algorithms 
internal bindings"); + tpb::algorithms::initBindings(mInternalAlgorithms); + + auto mUserbuffers = mInternal.def_submodule("userbuffers", "User buffers internal bindings"); + tensorrt_llm::kernels::userbuffers::UserBufferBindings::initBindings(mUserbuffers); + + // NVLS allocators + nb::class_(m, "IpcNvlsHandle") + .def(nb::init<>()) + .def_rw("uc_ptr", &tr::IpcNvlsHandle::uc_ptr) + .def_rw("mc_ptr", &tr::IpcNvlsHandle::mc_ptr) + .def_rw("size", &tr::IpcNvlsHandle::size) + .def("get_ipc_ptrs", + [](tr::IpcNvlsHandle& self) { return reinterpret_cast(self.ipc_uc_ptrs.data()); }); + + m.def("ipc_nvls_allocate", &tr::ipcNvlsAllocate, nb::rv_policy::reference); + m.def("ipc_nvls_free", &tr::ipcNvlsFree); + m.def("ipc_nvls_supported", &tr::ipcNvlsSupported); } diff --git a/cpp/tensorrt_llm/nanobind/common/bindTypes.h b/cpp/tensorrt_llm/nanobind/common/bindTypes.h new file mode 100644 index 00000000000..5cd714e458a --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/common/bindTypes.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace PybindUtils +{ + +namespace nb = nanobind; + +template +void bindList(nb::module_& m, std::string const& name) +{ + nb::class_(m, name.c_str()) + .def(nb::init<>()) + .def("push_back", [](T& lst, const typename T::value_type& value) { lst.push_back(value); }) + .def("pop_back", [](T& lst) { lst.pop_back(); }) + .def("push_front", [](T& lst, const typename T::value_type& value) { lst.push_front(value); }) + .def("pop_front", [](T& lst) { lst.pop_front(); }) + .def("__len__", [](T const& lst) { return lst.size(); }) + .def( + "__iter__", [](T& lst) { return nb::make_iterator(nb::type(), "iterator", lst.begin(), lst.end()); }, + nb::keep_alive<0, 1>()) + .def("__getitem__", + [](T const& lst, size_t index) + { + if (index >= lst.size()) + throw nb::index_error(); + auto it = lst.begin(); + std::advance(it, index); + return *it; + }) + .def("__setitem__", + [](T& lst, size_t index, const typename T::value_type& value) + { + if (index >= lst.size()) + throw nb::index_error(); + auto it = lst.begin(); + std::advance(it, index); + *it = value; + }); +} + +template +void bindSet(nb::module_& m, std::string const& name) +{ + nb::class_(m, name.c_str()) + .def(nb::init<>()) + .def("clear", &T::clear) + .def("size", &T::size) + .def("insert", [](T& s, typename T::value_type const& value) { s.insert(value); }) + .def("erase", nb::overload_cast(&T::erase)) + .def("__len__", [](T const& lst) { return lst.size(); }) + .def("__contains__", [](T const& s, typename T::value_type x) { return s.find(x) != s.end(); }) + .def( + "__iter__", [](T& s) { return nb::make_iterator(nb::type(), "iterator", s.begin(), s.end()); }, + nb::keep_alive<0, 1>()) + .def("__eq__", [](T const& s, T const& other) { return s == other; }) + .def("__getstate__", + [](T const& v) + { + /* Return a tuple 
that fully encodes the state of the object */ + return nb::make_tuple(std::vector(v.begin(), v.end())); + }) + .def("__setstate__", + [](T& v, nb::tuple const& t) + { + if (t.size() != 1) + throw std::runtime_error("Invalid state!"); + /* Create a new C++ instance */ + T s; + /* Assign any additional state */ + auto state_list = nb::cast>(t[0]); + for (auto& item : state_list) + { + s.insert(item); + } + return s; + }); +} + +} // namespace PybindUtils diff --git a/cpp/tensorrt_llm/nanobind/common/customCasters.h b/cpp/tensorrt_llm/nanobind/common/customCasters.h new file mode 100644 index 00000000000..7cfa07d249a --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/common/customCasters.h @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "tensorrt_llm/batch_manager/common.h" +#include "tensorrt_llm/batch_manager/decoderBuffers.h" +#include "tensorrt_llm/common/optionalRef.h" +#include "tensorrt_llm/runtime/cudaStream.h" +#include "tensorrt_llm/runtime/request.h" +#include "tensorrt_llm/runtime/samplingConfig.h" +#include "tensorrt_llm/runtime/torch.h" +#include "tensorrt_llm/runtime/torchView.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Pybind requires to have a central include in order for type casters to work. +// Opaque bindings add a type caster, so they have the same requirement. 
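
bindList and bindSet give opaque STL containers a small Python-native interface (len, iteration, indexing or membership, and pickling via __getstate__/__setstate__). The sketch below is hypothetical: it assumes some set type, here called ReqIdsSet, is bound elsewhere through bindSet under that name.

    # Hypothetical: assumes a container bound via PybindUtils::bindSet as "ReqIdsSet".
    from tensorrt_llm.bindings.internal.batch_manager import ReqIdsSet

    ids = ReqIdsSet()
    ids.insert(7)
    ids.insert(9)
    print(len(ids), 7 in ids)
    for req_id in ids:
        print(req_id)
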
+// See the warning in https://pybind11.readthedocs.io/en/stable/advanced/cast/custom.html + +// Opaque bindings +NB_MAKE_OPAQUE(tensorrt_llm::batch_manager::ReqIdsSet) +NB_MAKE_OPAQUE(std::vector) +NB_MAKE_OPAQUE(std::vector) +NB_MAKE_OPAQUE(std::vector) +NB_MAKE_OPAQUE(std::vector>) + +namespace nb = nanobind; + +// Custom casters +namespace NB_NAMESPACE +{ + +namespace detail +{ + +template +struct type_caster> +{ + using Type = std::deque; + NB_TYPE_CASTER(Type, const_name("List")); + + bool from_python(handle src, uint8_t flags, cleanup_list* cleanup) noexcept + { + sequence seq(src, nanobind::detail::borrow_t{}); + value.clear(); + make_caster caster; + for (auto const& item : seq) + { + if (!caster.from_python(item, flags, cleanup)) + return false; + value.push_back(caster.operator T&()); + } + return true; + } + + static handle from_cpp(Type const& deque, rv_policy policy, cleanup_list* cleanup) noexcept + { + nb::list list; + + for (auto const& item : deque) + { + nb::object py_item = steal(make_caster::from_cpp(item, policy, cleanup)); + if (!py_item) + return {}; + list.append(py_item); + } + return list.release(); + } +}; + +template +struct type_caster> +{ + using value_conv = make_caster; + + NB_TYPE_CASTER(tensorrt_llm::common::OptionalRef, value_conv::Name); + + bool from_python(handle src, uint8_t flags, cleanup_list* cleanup) + { + if (src.is_none()) + { + // If the Python object is None, create an empty OptionalRef + value = tensorrt_llm::common::OptionalRef(); + return true; + } + + value_conv conv; + if (!conv.from_python(src, flags, cleanup)) + return false; + + // Create an OptionalRef with a reference to the converted value + value = tensorrt_llm::common::OptionalRef(conv); + return true; + } + + static handle from_cpp(tensorrt_llm::common::OptionalRef const& src, rv_policy policy, cleanup_list* cleanup) + { + if (!src.has_value()) + return none().release(); + + return value_conv::from_cpp(*src, policy, cleanup); + } +}; + +template +struct PathCaster +{ + +private: + static PyObject* unicode_from_fs_native(std::string const& w) + { + return PyUnicode_DecodeFSDefaultAndSize(w.c_str(), ssize_t(w.size())); + } + + static PyObject* unicode_from_fs_native(std::wstring const& w) + { + return PyUnicode_FromWideChar(w.c_str(), ssize_t(w.size())); + } + +public: + static handle from_cpp(T const& path, rv_policy, cleanup_list* cleanup) + { + if (auto py_str = unicode_from_fs_native(path.native())) + { + return module_::import_("pathlib").attr("Path")(steal(py_str), cleanup).release(); + } + return nullptr; + } + + bool from_python(handle src, uint8_t flags, cleanup_list* cleanup) + { + PyObject* native = nullptr; + if constexpr (std::is_same_v) + { + if (PyUnicode_FSConverter(src.ptr(), &native) != 0) + { + if (auto* c_str = PyBytes_AsString(native)) + { + // AsString returns a pointer to the internal buffer, which + // must not be free'd. + value = c_str; + } + } + } + else if constexpr (std::is_same_v) + { + if (PyUnicode_FSDecoder(src.ptr(), &native) != 0) + { + if (auto* c_str = PyUnicode_AsWideCharString(native, nullptr)) + { + // AsWideCharString returns a new string that must be free'd. + value = c_str; // Copies the string. 
+ PyMem_Free(c_str); + } + } + } + Py_XDECREF(native); + if (PyErr_Occurred()) + { + PyErr_Clear(); + return false; + } + return true; + } + + NB_TYPE_CASTER(T, const_name("os.PathLike")); +}; + +template <> +class type_caster +{ +public: + NB_TYPE_CASTER(tensorrt_llm::executor::StreamPtr, const_name("int")); + + bool from_python([[maybe_unused]] handle src, uint8_t flags, cleanup_list* cleanup) + { + auto stream_ptr = nanobind::cast(src); + value = std::make_shared(reinterpret_cast(stream_ptr)); + + return true; + } + + static handle from_cpp( + tensorrt_llm::executor::StreamPtr const& src, rv_policy /* policy */, cleanup_list* /* cleanup */) + { + // Return cudaStream_t as integer. + return PyLong_FromVoidPtr(src->get()); + } +}; + +template <> +struct type_caster +{ +public: + NB_TYPE_CASTER(tensorrt_llm::executor::Tensor, const_name("torch.Tensor")); + + // Convert PyObject(torch.Tensor) -> tensorrt_llm::executor::Tensor + bool from_python(handle src, uint8_t flags, cleanup_list* cleanup) + { + PyObject* obj = src.ptr(); + if (THPVariable_Check(obj)) + { + at::Tensor const& t = THPVariable_Unpack(obj); + value = tensorrt_llm::executor::detail::ofITensor(tensorrt_llm::runtime::TorchView::of(t)); + return true; + } + return false; + } + + // Convert tensorrt_llm::executor::Tensor -> PyObject(torch.Tensor) + static handle from_cpp( + tensorrt_llm::executor::Tensor const& src, rv_policy /* policy */, cleanup_list* /* cleanup */) + { + return THPVariable_Wrap(tensorrt_llm::runtime::Torch::tensor(tensorrt_llm::executor::detail::toITensor(src))); + } +}; + +template <> +struct type_caster +{ +public: + NB_TYPE_CASTER(tensorrt_llm::runtime::ITensor::SharedPtr, const_name("torch.Tensor")); + + // Convert PyObject(torch.Tensor) -> tensorrt_llm::runtime::ITensor::SharedPtr + bool from_python(handle src, uint8_t, cleanup_list*) + { + PyObject* obj = src.ptr(); + if (THPVariable_Check(obj)) + { + at::Tensor const& t = THPVariable_Unpack(obj); + value = std::move(tensorrt_llm::runtime::TorchView::of(t)); + return true; + } + return false; + } + + // Convert tensorrt_llm::runtime::ITensor::SharedPtr -> PyObject(torch.Tensor) + static handle from_cpp( + tensorrt_llm::runtime::ITensor::SharedPtr const& src, rv_policy /* policy */, cleanup_list* /* cleanup */) + { + if (src == nullptr) + { + return none().release(); + } + return THPVariable_Wrap(tensorrt_llm::runtime::Torch::tensor(src)); + } +}; + +template <> +struct type_caster +{ +public: + NB_TYPE_CASTER(tensorrt_llm::runtime::ITensor::SharedConstPtr, const_name("torch.Tensor")); + + // Convert PyObject(torch.Tensor) -> tensorrt_llm::runtime::ITensor::SharedConstPtr + bool from_python(handle src, uint8_t, cleanup_list*) + { + PyObject* obj = src.ptr(); + if (THPVariable_Check(obj)) + { + at::Tensor const& t = THPVariable_Unpack(obj); + value = std::move(tensorrt_llm::runtime::TorchView::of(t)); + return true; + } + return false; + } + + // Convert tensorrt_llm::runtime::ITensor::SharedConstPtr -> PyObject(torch.Tensor) + static handle from_cpp( + tensorrt_llm::runtime::ITensor::SharedConstPtr const& src, rv_policy /* policy */, cleanup_list* /* cleanup */) + { + if (src == nullptr) + { + return none().release(); + } + return THPVariable_Wrap(tensorrt_llm::runtime::Torch::tensor( + reinterpret_cast(src))); + } +}; + +template <> +struct type_caster +{ + NB_TYPE_CASTER(at::Tensor, const_name("torch.Tensor")); + + bool from_python(nb::handle src, uint8_t, cleanup_list*) noexcept + { + nb::object capsule = nb::getattr(src, "__dlpack__")(); + 
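+        // The __dlpack__ protocol returns a capsule owning a DLManagedTensor.
+        // at::fromDLPack takes over that ownership below, so the capsule's
+        // destructor is cleared first to avoid a double free.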
DLManagedTensor* dl_managed = static_cast(PyCapsule_GetPointer(capsule.ptr(), "dltensor")); + PyCapsule_SetDestructor(capsule.ptr(), nullptr); + value = at::fromDLPack(dl_managed).alias(); + return true; + } + + static handle from_cpp(at::Tensor tensor, rv_policy, cleanup_list*) noexcept + { + DLManagedTensor* dl_managed = at::toDLPack(tensor); + if (!dl_managed) + return nullptr; + + nanobind::object capsule = nb::steal(PyCapsule_New(dl_managed, "dltensor", + [](PyObject* obj) + { + DLManagedTensor* dl = static_cast(PyCapsule_GetPointer(obj, "dltensor")); + dl->deleter(dl); + })); + if (!capsule.is_valid()) + { + dl_managed->deleter(dl_managed); + return nullptr; + } + nanobind::module_ torch = nanobind::module_::import_("torch"); + nanobind::object result = torch.attr("from_dlpack")(capsule); + capsule.release(); + return result.release(); + } +}; +} // namespace detail +} // namespace NB_NAMESPACE diff --git a/cpp/tensorrt_llm/nanobind/executor/bindings.cpp b/cpp/tensorrt_llm/nanobind/executor/bindings.cpp new file mode 100644 index 00000000000..d3f482df899 --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/executor/bindings.cpp @@ -0,0 +1,263 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "bindings.h" +#include "executor.h" +#include "executorConfig.h" +#include "request.h" +#include "tensorrt_llm/executor/executor.h" +#include "tensorrt_llm/executor/types.h" +#include "tensorrt_llm/nanobind/common/customCasters.h" + +#include +#include +#include +#include +#include +#include + +namespace nb = nanobind; +namespace tle = tensorrt_llm::executor; +using SizeType32 = tle::SizeType32; + +namespace tensorrt_llm::nanobind::executor +{ + +template +void instantiateEventDiff(nb::module_& m, std::string const& name) +{ + nb::class_>(m, ("KVCacheEventDiff" + name).c_str()) + .def_ro("old_value", &tle::KVCacheEventDiff::oldValue) + .def_ro("new_value", &tle::KVCacheEventDiff::newValue); +} + +void initBindings(nb::module_& m) +{ + m.attr("__version__") = tle::version(); + nb::enum_(m, "ModelType") + .value("DECODER_ONLY", tle::ModelType::kDECODER_ONLY) + .value("ENCODER_ONLY", tle::ModelType::kENCODER_ONLY) + .value("ENCODER_DECODER", tle::ModelType::kENCODER_DECODER); + + auto decodingModeGetstate = [](tle::DecodingMode const& self) { return nb::make_tuple(self.getState()); }; + auto decodingModeSetstate = [](tle::DecodingMode& self, nb::tuple const& state) + { + if (state.size() != 1) + { + throw std::runtime_error("Invalid state!"); + } + new (&self) tle::DecodingMode(nb::cast(state[0])); + }; + nb::class_(m, "DecodingMode") + .def("Auto", &tle::DecodingMode::Auto) + .def("TopK", &tle::DecodingMode::TopK) + .def("TopP", &tle::DecodingMode::TopP) + .def("TopKTopP", &tle::DecodingMode::TopKTopP) + .def("BeamSearch", &tle::DecodingMode::BeamSearch) + .def("Medusa", &tle::DecodingMode::Medusa) + .def("Lookahead", &tle::DecodingMode::Lookahead) + .def("ExplicitDraftTokens", &tle::DecodingMode::ExplicitDraftTokens) + .def("Eagle", &tle::DecodingMode::Eagle) + .def("isAuto", &tle::DecodingMode::isAuto) + .def("isTopK", &tle::DecodingMode::isTopK) + .def("isTopP", &tle::DecodingMode::isTopP) + .def("isTopKorTopP", &tle::DecodingMode::isTopKorTopP) + .def("isTopKandTopP", &tle::DecodingMode::isTopKandTopP) + .def("isBeamSearch", &tle::DecodingMode::isBeamSearch) + .def("isMedusa", &tle::DecodingMode::isMedusa) + .def("isLookahead", &tle::DecodingMode::isLookahead) + .def("isExplicitDraftTokens", &tle::DecodingMode::isExplicitDraftTokens) + .def("isEagle", &tle::DecodingMode::isEagle) + .def("useVariableBeamWidthSearch", &tle::DecodingMode::useVariableBeamWidthSearch) + .def_prop_ro("name", &tle::DecodingMode::getName) + .def("__getstate__", decodingModeGetstate) + .def("__setstate__", decodingModeSetstate); + + nb::enum_(m, "CapacitySchedulerPolicy") + .value("MAX_UTILIZATION", tle::CapacitySchedulerPolicy::kMAX_UTILIZATION) + .value("GUARANTEED_NO_EVICT", tle::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT) + .value("STATIC_BATCH", tle::CapacitySchedulerPolicy::kSTATIC_BATCH); + + nb::enum_(m, "ContextChunkingPolicy") + .value("EQUAL_PROGRESS", tle::ContextChunkingPolicy::kEQUAL_PROGRESS) + .value("FIRST_COME_FIRST_SERVED", tle::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED); + + nb::enum_(m, "CommunicationType").value("MPI", tle::CommunicationType::kMPI); + + nb::enum_(m, "CommunicationMode") + .value("LEADER", tle::CommunicationMode::kLEADER) + .value("ORCHESTRATOR", tle::CommunicationMode::kORCHESTRATOR); + + nb::class_(m, "KvCacheStats") + .def(nb::init<>()) + .def_rw("max_num_blocks", &tle::KvCacheStats::maxNumBlocks) + .def_rw("free_num_blocks", &tle::KvCacheStats::freeNumBlocks) + .def_rw("used_num_blocks", &tle::KvCacheStats::usedNumBlocks) + 
.def_rw("tokens_per_block", &tle::KvCacheStats::tokensPerBlock) + .def_rw("alloc_total_blocks", &tle::KvCacheStats::allocTotalBlocks) + .def_rw("alloc_new_blocks", &tle::KvCacheStats::allocNewBlocks) + .def_rw("reused_blocks", &tle::KvCacheStats::reusedBlocks) + .def_rw("missed_blocks", &tle::KvCacheStats::missedBlocks) + .def_rw("cache_hit_rate", &tle::KvCacheStats::cacheHitRate); + + nb::class_(m, "StaticBatchingStats") + .def(nb::init<>()) + .def_rw("num_scheduled_requests", &tle::StaticBatchingStats::numScheduledRequests) + .def_rw("num_context_requests", &tle::StaticBatchingStats::numContextRequests) + .def_rw("num_ctx_tokens", &tle::StaticBatchingStats::numCtxTokens) + .def_rw("num_gen_tokens", &tle::StaticBatchingStats::numGenTokens) + .def_rw("empty_gen_slots", &tle::StaticBatchingStats::emptyGenSlots); + + nb::class_(m, "InflightBatchingStats") + .def(nb::init<>()) + .def_rw("num_scheduled_requests", &tle::InflightBatchingStats::numScheduledRequests) + .def_rw("num_context_requests", &tle::InflightBatchingStats::numContextRequests) + .def_rw("num_gen_requests", &tle::InflightBatchingStats::numGenRequests) + .def_rw("num_paused_requests", &tle::InflightBatchingStats::numPausedRequests) + .def_rw("num_ctx_tokens", &tle::InflightBatchingStats::numCtxTokens) + .def_rw("micro_batch_id", &tle::InflightBatchingStats::microBatchId) + .def_rw("avg_num_decoded_tokens_per_iter", &tle::InflightBatchingStats::avgNumDecodedTokensPerIter); + + nb::class_(m, "SpecDecodingStats") + .def(nb::init<>()) + .def_rw("num_draft_tokens", &tle::SpecDecodingStats::numDraftTokens) + .def_rw("num_accepted_tokens", &tle::SpecDecodingStats::numAcceptedTokens) + .def_rw("num_requests_with_draft_tokens", &tle::SpecDecodingStats::numRequestsWithDraftTokens) + .def_rw("acceptance_length", &tle::SpecDecodingStats::acceptanceLength) + .def_rw("iter_latency_ms", &tle::SpecDecodingStats::iterLatencyMS) + .def_rw("draft_overhead", &tle::SpecDecodingStats::draftOverhead); + + nb::class_(m, "IterationStats") + .def(nb::init<>()) + .def_rw("timestamp", &tle::IterationStats::timestamp) + .def_rw("iter", &tle::IterationStats::iter) + .def_rw("iter_latency_ms", &tle::IterationStats::iterLatencyMS) + .def_rw("new_active_requests_queue_latency_ms", &tle::IterationStats::newActiveRequestsQueueLatencyMS) + .def_rw("num_new_active_requests", &tle::IterationStats::numNewActiveRequests) + .def_rw("num_active_requests", &tle::IterationStats::numActiveRequests) + .def_rw("num_queued_requests", &tle::IterationStats::numQueuedRequests) + .def_rw("num_completed_requests", &tle::IterationStats::numCompletedRequests) + .def_rw("max_num_active_requests", &tle::IterationStats::maxNumActiveRequests) + .def_rw("gpu_mem_usage", &tle::IterationStats::gpuMemUsage) + .def_rw("cpu_mem_usage", &tle::IterationStats::cpuMemUsage) + .def_rw("pinned_mem_usage", &tle::IterationStats::pinnedMemUsage) + .def_rw("kv_cache_stats", &tle::IterationStats::kvCacheStats) + .def_rw("cross_kv_cache_stats", &tle::IterationStats::crossKvCacheStats) + .def_rw("static_batching_stats", &tle::IterationStats::staticBatchingStats) + .def_rw("inflight_batching_stats", &tle::IterationStats::inflightBatchingStats) + .def_rw("specdec_stats", &tle::IterationStats::specDecodingStats) + .def("to_json_str", + [](tle::IterationStats const& iterationStats) + { return tle::JsonSerialization::toJsonStr(iterationStats); }); + + nb::class_(m, "DebugTensorsPerIteration") + .def(nb::init<>()) + .def_rw("iter", &tle::DebugTensorsPerIteration::iter) + .def_rw("debug_tensors", 
&tle::DebugTensorsPerIteration::debugTensors); + + nb::enum_(m, "RequestStage") + .value("QUEUED", tle::RequestStage::kQUEUED) + .value("ENCODER_IN_PROGRESS", tle::RequestStage::kENCODER_IN_PROGRESS) + .value("CONTEXT_IN_PROGRESS", tle::RequestStage::kCONTEXT_IN_PROGRESS) + .value("GENERATION_IN_PROGRESS", tle::RequestStage::kGENERATION_IN_PROGRESS) + .value("GENERATION_COMPLETE", tle::RequestStage::kGENERATION_COMPLETE); + + nb::class_(m, "DisServingRequestStats") + .def(nb::init<>()) + .def_rw("kv_cache_transfer_ms", &tle::DisServingRequestStats::kvCacheTransferMS) + .def_rw("kv_cache_size", &tle::DisServingRequestStats::kvCacheSize); + + nb::class_(m, "RequestStats") + .def(nb::init<>()) + .def_rw("id", &tle::RequestStats::id) + .def_rw("stage", &tle::RequestStats::stage) + .def_rw("context_prefill_position", &tle::RequestStats::contextPrefillPosition) + .def_rw("num_generated_tokens", &tle::RequestStats::numGeneratedTokens) + .def_rw("avg_num_decoded_tokens_per_iter", &tle::RequestStats::avgNumDecodedTokensPerIter) + .def_rw("scheduled", &tle::RequestStats::scheduled) + .def_rw("paused", &tle::RequestStats::paused) + .def_rw("dis_serving_stats", &tle::RequestStats::disServingStats) + .def_rw("alloc_total_blocks_per_request", &tle::RequestStats::allocTotalBlocksPerRequest) + .def_rw("alloc_new_blocks_per_request", &tle::RequestStats::allocNewBlocksPerRequest) + .def_rw("reused_blocks_per_request", &tle::RequestStats::reusedBlocksPerRequest) + .def_rw("missed_blocks_per_request", &tle::RequestStats::missedBlocksPerRequest) + .def_rw("kv_cache_hit_rate_per_request", &tle::RequestStats::kvCacheHitRatePerRequest) + .def("to_json_str", + [](tle::RequestStats const& iterationStats) { return tle::JsonSerialization::toJsonStr(iterationStats); }); + + nb::class_(m, "RequestStatsPerIteration") + .def(nb::init<>()) + .def_rw("iter", &tle::RequestStatsPerIteration::iter) + .def_rw("request_stats", &tle::RequestStatsPerIteration::requestStats) + .def("to_json_str", + [](tle::RequestStatsPerIteration const& iterationStats) + { return tle::JsonSerialization::toJsonStr(iterationStats); }); + + nb::module_ executor_kv_cache = m.def_submodule("kv_cache", "Executor KV Cache Manager"); + + nb::class_(executor_kv_cache, "KVCacheCreatedData") + .def_ro("num_blocks_per_cache_level", &tle::KVCacheCreatedData::numBlocksPerCacheLevel); + + nb::class_(executor_kv_cache, "UniqueToken") + .def_ro("token_id", &tensorrt_llm::runtime::UniqueToken::tokenId) + .def_ro("token_extra_id", &tensorrt_llm::runtime::UniqueToken::tokenExtraId); + + nb::class_(executor_kv_cache, "KVCacheStoredBlockData") + .def_ro("block_hash", &tle::KVCacheStoredBlockData::blockHash) + .def_ro("tokens", &tle::KVCacheStoredBlockData::tokens) + .def_ro("lora_id", &tle::KVCacheStoredBlockData::loraId) + .def_ro("cache_level", &tle::KVCacheStoredBlockData::cacheLevel) + .def_ro("priority", &tle::KVCacheStoredBlockData::priority); + + nb::class_(executor_kv_cache, "KVCacheStoredData") + .def_ro("parent_hash", &tle::KVCacheStoredData::parentHash) + .def_ro("blocks", &tle::KVCacheStoredData::blocks); + + nb::class_(executor_kv_cache, "KVCacheRemovedData") + .def_ro("block_hashes", &tle::KVCacheRemovedData::blockHashes); + + instantiateEventDiff(executor_kv_cache, "Int"); + + nb::class_(executor_kv_cache, "KVCacheUpdatedData") + .def_ro("block_hash", &tle::KVCacheUpdatedData::blockHash) + .def_ro("cache_level", &tle::KVCacheUpdatedData::cacheLevel) + .def_ro("priority", &tle::KVCacheUpdatedData::priority); + + nb::class_(executor_kv_cache, 
"KVCacheEvent") + .def_ro("event_id", &tle::KVCacheEvent::eventId) + .def_ro("data", &tle::KVCacheEvent::data) + .def_ro("window_size", &tle::KVCacheEvent::windowSize); + + nb::class_(executor_kv_cache, "KVCacheEventManager") + .def( + "get_latest_events", + [](tle::KVCacheEventManager& self, std::optional timeout_ms = std::nullopt) + { + if (timeout_ms) + { + return self.getLatestEvents(std::chrono::milliseconds(static_cast(*timeout_ms))); + } + return self.getLatestEvents(std::nullopt); + }, + nb::arg("timeout_ms") = std::nullopt); + + tensorrt_llm::nanobind::executor::initRequestBindings(m); + tensorrt_llm::nanobind::executor::initConfigBindings(m); + tensorrt_llm::nanobind::executor::Executor::initBindings(m); +} + +} // namespace tensorrt_llm::nanobind::executor diff --git a/cpp/tensorrt_llm/nanobind/executor/bindings.h b/cpp/tensorrt_llm/nanobind/executor/bindings.h new file mode 100644 index 00000000000..4df52c2d34e --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/executor/bindings.h @@ -0,0 +1,29 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +namespace nb = nanobind; + +namespace tensorrt_llm::nanobind::executor +{ + +// Register bindings for executor API. +void initBindings(nb::module_& m); + +} // namespace tensorrt_llm::nanobind::executor diff --git a/cpp/tensorrt_llm/nanobind/executor/executor.cpp b/cpp/tensorrt_llm/nanobind/executor/executor.cpp new file mode 100644 index 00000000000..59c7d2a3dc1 --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/executor/executor.cpp @@ -0,0 +1,241 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "executor.h" +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/executor/tensor.h" +#include "tensorrt_llm/nanobind/common/customCasters.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nb = nanobind; +namespace tle = tensorrt_llm::executor; + +namespace nanobind::detail +{ + +template <> +struct dtype_traits +{ + static constexpr dlpack::dtype value{ + (uint8_t) dlpack::dtype_code::Float, // type code + 16, // size in bits + 1 // lanes (simd), usually set to 1 + }; + static constexpr auto name = const_name("float16"); +}; +} // namespace nanobind::detail + +namespace +{ +// todo: Properly support FP8 and BF16 and verify functionality +tle::Tensor numpyToTensor(nb::ndarray const& array) +{ + auto npDtype = array.dtype(); + char kind = '\0'; + switch (npDtype.code) + { + case static_cast(nb::dlpack::dtype_code::Int): + kind = 'i'; // signed integer + break; + case static_cast(nb::dlpack::dtype_code::UInt): + kind = 'u'; // unsigned integer + break; + case static_cast(nb::dlpack::dtype_code::Float): + kind = 'f'; // floating point + break; + case static_cast(nb::dlpack::dtype_code::Bfloat): + kind = 'f'; // brain floating point (treat as float kind) + break; + case static_cast(nb::dlpack::dtype_code::Complex): + kind = 'c'; // complex + break; + default: + kind = 'V'; // void/other + break; + } + tle::DataType dtype; + if (npDtype == nb::dtype()) + { + dtype = tle::DataType::kFP16; + } + else if (npDtype == nb::dtype()) + { + dtype = tle::DataType::kFP32; + } + else if (npDtype == nb::dtype()) + { + dtype = tle::DataType::kINT8; + } + else if (npDtype == nb::dtype()) + { + dtype = tle::DataType::kINT32; + } + else if (npDtype == nb::dtype()) + { + dtype = tle::DataType::kINT64; + } + else if (kind == 'V' && array.itemsize() == 1) + { + dtype = tle::DataType::kFP8; + } + else if (kind == 'V' && array.itemsize() == 2) + { + dtype = tle::DataType::kBF16; + } + else + { + TLLM_THROW("Unsupported numpy dtype."); + } + + // todo: improve the following code + std::vector dims; + dims.reserve(array.ndim()); + for (size_t i = 0; i < array.ndim(); ++i) + { + dims.push_back(static_cast(array.shape(i))); + } + tle::Shape shape(dims.data(), dims.size()); + + return tle::Tensor::of(dtype, const_cast(array.data()), shape); +} + +} // namespace + +namespace tensorrt_llm::nanobind::executor +{ + +Executor::Executor( + std::filesystem::path const& modelPath, tle::ModelType modelType, tle::ExecutorConfig const& executorConfig) +{ + mExecutor = std::make_unique(modelPath, modelType, executorConfig); +} + +Executor::Executor(std::filesystem::path const& encoderModelPath, std::filesystem::path const& decoderModelPath, + tle::ModelType modelType, tle::ExecutorConfig const& executorConfig) +{ + mExecutor = std::make_unique(encoderModelPath, decoderModelPath, modelType, executorConfig); +} + +Executor::Executor(nb::bytes const& engineBuffer, std::string const& jsonConfigStr, tle::ModelType modelType, + tle::ExecutorConfig const& executorConfig, std::optional managedWeights) +{ + uint8_t const* data = static_cast(engineBuffer.data()); + size_t size = engineBuffer.size(); + std::optional> managedWeightsMap = std::nullopt; + if (managedWeights.has_value() && !managedWeights.value().empty()) + { + managedWeightsMap = std::map(); + for (auto const& [rawName, rawArray] : managedWeights.value()) + { + std::string name = nb::cast(rawName); + nb::ndarray array = nb::cast>(rawArray); + 
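+            // numpyToTensor (defined above) appears to wrap the NumPy buffer in
+            // a tle::Tensor rather than copy it, so the Python-side arrays
+            // presumably need to outlive their use by the Executor.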
managedWeightsMap->emplace(name, numpyToTensor(array)); + } + } + mExecutor = std::make_unique( + tle::BufferView(data, size), jsonConfigStr, modelType, executorConfig, managedWeightsMap); +} + +Executor::Executor(std::string const& encoderEngineBuffer, std::string const& encoderJsonConfigStr, + std::string const& decoderEngineBuffer, std::string const& decoderJsonConfigStr, tle::ModelType modelType, + tle::ExecutorConfig const& executorConfig) +{ + uint8_t const* encoderData = reinterpret_cast(encoderEngineBuffer.data()); + size_t encoderSize = encoderEngineBuffer.size(); + uint8_t const* decoderData = reinterpret_cast(decoderEngineBuffer.data()); + size_t decoderSize = decoderEngineBuffer.size(); + mExecutor = std::make_unique(tle::BufferView(encoderData, encoderSize), encoderJsonConfigStr, + tle::BufferView(decoderData, decoderSize), decoderJsonConfigStr, modelType, executorConfig); +} + +nb::object Executor::enter() +{ + TLLM_CHECK(static_cast(mExecutor)); + return nb::cast(this); +} + +void Executor::exit( + [[maybe_unused]] nb::handle type, [[maybe_unused]] nb::handle value, [[maybe_unused]] nb::handle traceback) +{ + shutdown(); + mExecutor = nullptr; +} + +void Executor::shutdown() +{ + // NOTE: we must release the GIL here. Executor has spawned a thread for the execution loop. That thread must be + // able to do forward progress for the shutdown process to succeed. It takes the GIL during its callbacks, so + // we release it now. Note that we shouldn't do anything related to python objects after that. + TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); + nb::gil_scoped_release release; + mExecutor->shutdown(); + TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); +} + +void Executor::initBindings(nb::module_& m) +{ + nb::class_(m, "Executor") + .def(nb::init(), + nb::arg("model_path"), nb::arg("model_type"), nb::arg("executor_config")) + .def(nb::init(), + nb::arg("encoder_model_path"), nb::arg("decoder_model_path"), nb::arg("model_type"), + nb::arg("executor_config")) + .def(nb::init(), + nb::arg("engine_buffer"), nb::arg("json_config_str"), nb::arg("model_type"), nb::arg("executor_config"), + nb::arg("managed_weights") = nb::dict()) + .def(nb::init(), + nb::arg("encoder_engine_buffer"), nb::arg("encoder_json_config_str"), nb::arg("decoder_engine_buffer"), + nb::arg("decoder_json_config_str"), nb::arg("model_type"), nb::arg("executor_config")) + .def("shutdown", &Executor::shutdown) + .def("__enter__", &Executor::enter) + .def("__exit__", &Executor::exit) + .def("enqueue_request", &Executor::enqueueRequest, nb::arg("request")) + .def("enqueue_requests", &Executor::enqueueRequests, nb::arg("requests")) + .def("await_responses", + nb::overload_cast const&>(&Executor::awaitResponses), + nb::arg("timeout") = nb::none()) + .def("await_responses", + nb::overload_cast const&>( + &Executor::awaitResponses), + nb::arg("id"), nb::arg("timeout") = nb::none()) + .def("await_responses", + nb::overload_cast const&, std::optional const&>( + &Executor::awaitResponses), + nb::arg("ids"), nb::arg("timeout") = nb::none()) + .def("get_num_responses_ready", &Executor::getNumResponsesReady, nb::arg("id") = nb::none()) + .def("cancel_request", &Executor::cancelRequest, nb::arg("id") = nb::none()) + .def("get_latest_iteration_stats", &Executor::getLatestIterationStats) + .def("get_latest_request_stats", &Executor::getLatestRequestStats) + .def("get_latest_debug_tensors", &Executor::getLatestDebugTensors) + .def("can_enqueue_requests", &Executor::canEnqueueRequests) + .def("get_kv_cache_event_manager", 
&Executor::getKVCacheEventManager); +} + +} // namespace tensorrt_llm::nanobind::executor diff --git a/cpp/tensorrt_llm/nanobind/executor/executor.h b/cpp/tensorrt_llm/nanobind/executor/executor.h new file mode 100644 index 00000000000..22c24abb4bf --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/executor/executor.h @@ -0,0 +1,129 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "tensorrt_llm/executor/executor.h" +#include "tensorrt_llm/executor/types.h" +#include + +namespace nb = nanobind; +namespace tle = tensorrt_llm::executor; + +namespace tensorrt_llm::nanobind::executor +{ + +class Executor +{ +public: + Executor( + std::filesystem::path const& modelPath, tle::ModelType modelType, tle::ExecutorConfig const& executorConfig); + + Executor(std::filesystem::path const& encoderModelPath, std::filesystem::path const& decoderModelPath, + tle::ModelType modelType, tle::ExecutorConfig const& executorConfig); + + Executor(nb::bytes const& engineBuffer, std::string const& jsonConfigStr, tle::ModelType modelType, + tle::ExecutorConfig const& executorConfig, std::optional managedWeights); + + Executor(std::string const& encoderEngineBuffer, std::string const& encoderJsonConfigStr, + std::string const& decoderEngineBuffer, std::string const& decoderJsonConfigStr, tle::ModelType modelType, + tle::ExecutorConfig const& executorConfig); + + nb::object enter(); + void exit( + [[maybe_unused]] nb::handle type, [[maybe_unused]] nb::handle value, [[maybe_unused]] nb::handle traceback); + void shutdown(); + + [[nodiscard]] tle::IdType enqueueRequest(tle::Request const& request) + { + return mExecutor->enqueueRequest(request); + } + + [[nodiscard]] std::vector enqueueRequests(std::vector const& requests) + { + return mExecutor->enqueueRequests(requests); + } + + [[nodiscard]] std::vector awaitResponses( + std::optional const& timeout = std::nullopt) + { + // Await responses blocks until a response is received. Release GIL so that it can be ran in a background + // thread. + nb::gil_scoped_release release; + return mExecutor->awaitResponses(timeout); + } + + [[nodiscard]] std::vector awaitResponses( + tle::IdType const& requestId, std::optional const& timeout = std::nullopt) + { + // Await responses blocks until a response is received. Release GIL so that it can be ran in a background + // thread. + nb::gil_scoped_release release; + return mExecutor->awaitResponses(requestId, timeout); + } + + [[nodiscard]] std::vector> awaitResponses(std::vector const& requestIds, + std::optional const& timeout = std::nullopt) + { + // Await responses blocks until a response is received. Release GIL so that it can be ran in a background + // thread. 
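+        // A sketch of the Python-side call served by this overload (keyword
+        // names follow the bindings in executor.cpp; the timedelta-to-chrono
+        // conversion is assumed to be handled by the chrono caster):
+        //   responses = executor.await_responses(ids=[id_a, id_b],
+        //                                        timeout=datetime.timedelta(seconds=1))
+        // One list of responses is returned per requested id.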
+ nb::gil_scoped_release release; + return mExecutor->awaitResponses(requestIds, timeout); + } + + [[nodiscard]] tle::SizeType32 getNumResponsesReady(std::optional const& requestId = std::nullopt) const + { + return mExecutor->getNumResponsesReady(requestId); + } + + void cancelRequest(tle::IdType requestId) + { + mExecutor->cancelRequest(requestId); + } + + std::deque getLatestIterationStats() + { + return mExecutor->getLatestIterationStats(); + } + + std::deque getLatestRequestStats() + { + return mExecutor->getLatestRequestStats(); + } + + std::deque getLatestDebugTensors() + { + return mExecutor->getLatestDebugTensors(); + } + + [[nodiscard]] bool canEnqueueRequests() const + { + return mExecutor->canEnqueueRequests(); + } + + [[nodiscard]] std::optional> getKVCacheEventManager() const + { + return mExecutor->getKVCacheEventManager(); + } + + static void initBindings(nb::module_& m); + +private: + std::unique_ptr mExecutor; +}; + +} // namespace tensorrt_llm::nanobind::executor diff --git a/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp b/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp new file mode 100644 index 00000000000..c2d9fe25dff --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp @@ -0,0 +1,616 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "executorConfig.h" +#include "tensorrt_llm/executor/executor.h" +#include "tensorrt_llm/executor/types.h" +#include "tensorrt_llm/nanobind/common/customCasters.h" +#include "tensorrt_llm/runtime/cudaStream.h" +#include "tensorrt_llm/runtime/utils/mpiUtils.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nb = nanobind; +namespace tle = tensorrt_llm::executor; +using SizeType32 = tle::SizeType32; +using RuntimeDefaults = tensorrt_llm::runtime::RuntimeDefaults; + +namespace tensorrt_llm::nanobind::executor +{ + +void initConfigBindings(nb::module_& m) +{ + nb::enum_(m, "BatchingType") + .value("STATIC", tle::BatchingType::kSTATIC) + .value("INFLIGHT", tle::BatchingType::kINFLIGHT); + + auto dynamicBatchConfigGetstate = [](tle::DynamicBatchConfig const& self) + { + return nb::make_tuple(self.getEnableBatchSizeTuning(), self.getEnableMaxNumTokensTuning(), + self.getDynamicBatchMovingAverageWindow(), self.getBatchSizeTable()); + }; + auto dynamicBatchConfigSetstate = [](tle::DynamicBatchConfig& self, nb::tuple const& state) + { + if (state.size() != 4) + { + throw std::runtime_error("Invalid state!"); + } + new (&self) tle::DynamicBatchConfig(nb::cast(state[0]), nb::cast(state[1]), + nb::cast(state[2]), nb::cast>>(state[3])); + }; + nb::class_(m, "DynamicBatchConfig") + .def(nb::init(), nb::arg("enable_batch_size_tuning"), + nb::arg("enable_max_num_tokens_tuning"), nb::arg("dynamic_batch_moving_average_window")) + .def_prop_ro("enable_batch_size_tuning", &tle::DynamicBatchConfig::getEnableBatchSizeTuning) + .def_prop_ro("enable_max_num_tokens_tuning", &tle::DynamicBatchConfig::getEnableMaxNumTokensTuning) + .def_prop_ro( + "dynamic_batch_moving_average_window", &tle::DynamicBatchConfig::getDynamicBatchMovingAverageWindow) + .def("__getstate__", dynamicBatchConfigGetstate) + .def("__setstate__", dynamicBatchConfigSetstate); + + auto schedulerConfigSetstate = [](tle::SchedulerConfig& self, nb::tuple const& state) + { + if (state.size() != 3) + { + throw std::runtime_error("Invalid state!"); + } + new (&self) tle::SchedulerConfig(nb::cast(state[0]), + nb::cast>(state[1]), + nb::cast>(state[2])); + }; + auto schedulerConfigGetstate = [](tle::SchedulerConfig const& self) + { + return nb::make_tuple( + self.getCapacitySchedulerPolicy(), self.getContextChunkingPolicy(), self.getDynamicBatchConfig()); + }; + nb::class_(m, "SchedulerConfig") + .def(nb::init, + std::optional>(), + nb::arg("capacity_scheduler_policy") = tle::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT, + nb::arg("context_chunking_policy") = nb::none(), nb::arg("dynamic_batch_config") = nb::none()) + .def_prop_ro("capacity_scheduler_policy", &tle::SchedulerConfig::getCapacitySchedulerPolicy) + .def_prop_ro("context_chunking_policy", &tle::SchedulerConfig::getContextChunkingPolicy) + .def_prop_ro("dynamic_batch_config", &tle::SchedulerConfig::getDynamicBatchConfig) + .def("__getstate__", schedulerConfigGetstate) + .def("__setstate__", schedulerConfigSetstate); + + nb::class_(m, "RuntimeDefaults") + .def(nb::init>, std::optional>(), + nb::arg("max_attention_window") = nb::none(), nb::arg("sink_token_length") = nb::none()) + .def_ro("max_attention_window", &RuntimeDefaults::maxAttentionWindowVec) + .def_ro("sink_token_length", &RuntimeDefaults::sinkTokenLength); + + auto kvCacheConfigGetstate = [](tle::KvCacheConfig const& self) + { + return nb::make_tuple(self.getEnableBlockReuse(), self.getMaxTokens(), 
self.getMaxAttentionWindowVec(), + self.getSinkTokenLength(), self.getFreeGpuMemoryFraction(), self.getHostCacheSize(), + self.getOnboardBlocks(), self.getCrossKvCacheFraction(), self.getSecondaryOffloadMinPriority(), + self.getEventBufferMaxSize(), self.getEnablePartialReuse(), self.getCopyOnPartialReuse(), self.getUseUvm()); + }; + auto kvCacheConfigSetstate = [](tle::KvCacheConfig& self, nb::tuple const& state) + { + if (state.size() != 13) + { + throw std::runtime_error("Invalid state!"); + } + new (&self) tle::KvCacheConfig(nb::cast(state[0]), nb::cast>(state[1]), + nb::cast>>(state[2]), nb::cast>(state[3]), + nb::cast>(state[4]), nb::cast>(state[5]), + nb::cast(state[6]), nb::cast>(state[7]), + nb::cast>(state[8]), nb::cast(state[9]), + nb::cast(state[10]), nb::cast(state[11]), nb::cast(state[12])); + }; + nb::class_(m, "KvCacheConfig") + .def(nb::init const&, std::optional> const&, + std::optional const&, std::optional const&, std::optional const&, bool, + std::optional const&, std::optional, size_t const&, bool, bool, bool, + std::optional const&>(), + nb::arg("enable_block_reuse") = true, nb::arg("max_tokens") = nb::none(), + nb::arg("max_attention_window") = nb::none(), nb::arg("sink_token_length") = nb::none(), + nb::arg("free_gpu_memory_fraction") = nb::none(), nb::arg("host_cache_size") = nb::none(), + nb::arg("onboard_blocks") = true, nb::arg("cross_kv_cache_fraction") = nb::none(), + nb::arg("secondary_offload_min_priority") = nb::none(), nb::arg("event_buffer_max_size") = 0, nb::kw_only(), + nb::arg("enable_partial_reuse") = true, nb::arg("copy_on_partial_reuse") = true, nb::arg("use_uvm") = false, + nb::arg("runtime_defaults") = nb::none()) + .def_prop_rw( + "enable_block_reuse", &tle::KvCacheConfig::getEnableBlockReuse, &tle::KvCacheConfig::setEnableBlockReuse) + .def_prop_rw("max_tokens", &tle::KvCacheConfig::getMaxTokens, &tle::KvCacheConfig::setMaxTokens) + .def_prop_rw("max_attention_window", &tle::KvCacheConfig::getMaxAttentionWindowVec, + &tle::KvCacheConfig::setMaxAttentionWindowVec) + .def_prop_rw( + "sink_token_length", &tle::KvCacheConfig::getSinkTokenLength, &tle::KvCacheConfig::setSinkTokenLength) + .def_prop_rw("free_gpu_memory_fraction", &tle::KvCacheConfig::getFreeGpuMemoryFraction, + &tle::KvCacheConfig::setFreeGpuMemoryFraction) + .def_prop_rw("host_cache_size", &tle::KvCacheConfig::getHostCacheSize, &tle::KvCacheConfig::setHostCacheSize) + .def_prop_rw("onboard_blocks", &tle::KvCacheConfig::getOnboardBlocks, &tle::KvCacheConfig::setOnboardBlocks) + .def_prop_rw("cross_kv_cache_fraction", &tle::KvCacheConfig::getCrossKvCacheFraction, + &tle::KvCacheConfig::setCrossKvCacheFraction) + .def_prop_rw("secondary_offload_min_priority", &tle::KvCacheConfig::getSecondaryOffloadMinPriority, + &tle::KvCacheConfig::setSecondaryOffloadMinPriority) + .def_prop_rw("event_buffer_max_size", &tle::KvCacheConfig::getEventBufferMaxSize, + &tle::KvCacheConfig::setEventBufferMaxSize) + .def_prop_rw("enable_partial_reuse", &tle::KvCacheConfig::getEnablePartialReuse, + &tle::KvCacheConfig::setEnablePartialReuse) + .def_prop_rw("copy_on_partial_reuse", &tle::KvCacheConfig::getCopyOnPartialReuse, + &tle::KvCacheConfig::setCopyOnPartialReuse) + .def_prop_rw("use_uvm", &tle::KvCacheConfig::getUseUvm, &tle::KvCacheConfig::setUseUvm) + .def("fill_empty_fields_from_runtime_defaults", &tle::KvCacheConfig::fillEmptyFieldsFromRuntimeDefaults) + .def("__getstate__", kvCacheConfigGetstate) + .def("__setstate__", kvCacheConfigSetstate); + + nb::class_(m, "OrchestratorConfig") + 
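+        // Thin wrapper over tle::OrchestratorConfig: the properties below expose
+        // whether this process acts as the orchestrator, the worker executable to
+        // launch, an optional orchestrator-leader communicator, and whether the
+        // orchestrator spawns the worker processes itself.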
.def(nb::init, bool>(), nb::arg("is_orchestrator") = true, + nb::arg("worker_executable_path") = "", nb::arg("orch_leader_comm").none() = nullptr, + nb::arg("spawn_processes") = true) + .def_prop_rw( + "is_orchestrator", &tle::OrchestratorConfig::getIsOrchestrator, &tle::OrchestratorConfig::setIsOrchestrator) + .def_prop_rw("worker_executable_path", &tle::OrchestratorConfig::getWorkerExecutablePath, + &tle::OrchestratorConfig::setWorkerExecutablePath) + .def_prop_rw("orch_leader_comm", &tle::OrchestratorConfig::getOrchLeaderComm, + &tle::OrchestratorConfig::setOrchLeaderComm) + .def_prop_rw("spawn_processes", &tle::OrchestratorConfig::getSpawnProcesses, + &tle::OrchestratorConfig::setSpawnProcesses); + + auto parallelConfigGetstate = [](tle::ParallelConfig const& self) + { + return nb::make_tuple(self.getCommunicationType(), self.getCommunicationMode(), self.getDeviceIds(), + self.getParticipantIds(), self.getOrchestratorConfig(), self.getNumNodes()); + }; + auto parallelConfigSetstate = [](tle::ParallelConfig& self, nb::tuple const& state) + { + if (state.size() != 6) + { + throw std::runtime_error("Invalid state!"); + } + new (&self) tle::ParallelConfig(nb::cast(state[0]), + nb::cast(state[1]), nb::cast>>(state[2]), + nb::cast>>(state[3]), + nb::cast>(state[4]), nb::cast>(state[5])); + }; + nb::class_(m, "ParallelConfig") + .def(nb::init> const&, + std::optional> const&, std::optional const&, + std::optional const&>(), + nb::arg("communication_type") = tle::CommunicationType::kMPI, + nb::arg("communication_mode") = tle::CommunicationMode::kLEADER, nb::arg("device_ids") = nb::none(), + nb::arg("participant_ids") = nb::none(), nb::arg("orchestrator_config") = nb::none(), + nb::arg("num_nodes") = nb::none()) + .def_prop_rw("communication_type", &tle::ParallelConfig::getCommunicationType, + &tle::ParallelConfig::setCommunicationType) + .def_prop_rw("communication_mode", &tle::ParallelConfig::getCommunicationMode, + &tle::ParallelConfig::setCommunicationMode) + .def_prop_rw("device_ids", &tle::ParallelConfig::getDeviceIds, &tle::ParallelConfig::setDeviceIds) + .def_prop_rw( + "participant_ids", &tle::ParallelConfig::getParticipantIds, &tle::ParallelConfig::setParticipantIds) + .def_prop_rw("orchestrator_config", &tle::ParallelConfig::getOrchestratorConfig, + &tle::ParallelConfig::setOrchestratorConfig) + .def_prop_rw("num_nodes", &tle::ParallelConfig::getNumNodes, &tle::ParallelConfig::setNumNodes) + .def("__getstate__", parallelConfigGetstate) + .def("__setstate__", parallelConfigSetstate); + + auto peftCacheConfigSetstate = [](tle::PeftCacheConfig& self, nb::tuple const& state) + { + if (state.size() != 11) + { + throw std::runtime_error("Invalid state!"); + } + new (&self) tle::PeftCacheConfig(nb::cast(state[0]), nb::cast(state[1]), + nb::cast(state[2]), nb::cast(state[3]), nb::cast(state[4]), + nb::cast(state[5]), nb::cast(state[6]), nb::cast(state[7]), + nb::cast(state[8]), nb::cast>(state[9]), + nb::cast>(state[10])); + }; + auto peftCacheConfigGetstate = [](tle::PeftCacheConfig const& self) + { + return nb::make_tuple(self.getNumHostModuleLayer(), self.getNumDeviceModuleLayer(), + self.getOptimalAdapterSize(), self.getMaxAdapterSize(), self.getNumPutWorkers(), self.getNumEnsureWorkers(), + self.getNumCopyStreams(), self.getMaxPagesPerBlockHost(), self.getMaxPagesPerBlockDevice(), + self.getDeviceCachePercent(), self.getHostCacheSize()); + }; + nb::class_(m, "PeftCacheConfig") + .def(nb::init const&, std::optional const&, + std::optional const&>(), + nb::arg("num_host_module_layer") = 
0, nb::arg("num_device_module_layer") = 0, + nb::arg("optimal_adapter_size") = 8, nb::arg("max_adapter_size") = 64, nb::arg("num_put_workers") = 1, + nb::arg("num_ensure_workers") = 1, nb::arg("num_copy_streams") = 1, + nb::arg("max_pages_per_block_host") = 24, nb::arg("max_pages_per_block_device") = 8, + nb::arg("device_cache_percent") = nb::none(), nb::arg("host_cache_size") = nb::none(), + nb::arg("lora_prefetch_dir") = nb::none()) + .def_prop_ro("num_host_module_layer", &tle::PeftCacheConfig::getNumHostModuleLayer) + .def_prop_ro("num_device_module_layer", &tle::PeftCacheConfig::getNumDeviceModuleLayer) + .def_prop_ro("optimal_adapter_size", &tle::PeftCacheConfig::getOptimalAdapterSize) + .def_prop_ro("max_adapter_size", &tle::PeftCacheConfig::getMaxAdapterSize) + .def_prop_ro("num_put_workers", &tle::PeftCacheConfig::getNumPutWorkers) + .def_prop_ro("num_ensure_workers", &tle::PeftCacheConfig::getNumEnsureWorkers) + .def_prop_ro("num_copy_streams", &tle::PeftCacheConfig::getNumCopyStreams) + .def_prop_ro("max_pages_per_block_host", &tle::PeftCacheConfig::getMaxPagesPerBlockHost) + .def_prop_ro("max_pages_per_block_device", &tle::PeftCacheConfig::getMaxPagesPerBlockDevice) + .def_prop_ro("device_cache_percent", &tle::PeftCacheConfig::getDeviceCachePercent) + .def_prop_ro("host_cache_size", &tle::PeftCacheConfig::getHostCacheSize) + .def_prop_ro("lora_prefetch_dir", &tle::PeftCacheConfig::getLoraPrefetchDir) + .def("__getstate__", peftCacheConfigGetstate) + .def("__setstate__", peftCacheConfigSetstate); + + auto decodingConfigGetstate = [](tle::DecodingConfig const& self) + { + return nb::make_tuple( + self.getDecodingMode(), self.getLookaheadDecodingConfig(), self.getMedusaChoices(), self.getEagleConfig()); + }; + auto decodingConfigSetstate = [](tle::DecodingConfig& self, nb::tuple const& state) + { + if (state.size() != 4) + { + throw std::runtime_error("Invalid state!"); + } + new (&self) tle::DecodingConfig(nb::cast>(state[0]), // DecodingMode + nb::cast>(state[1]), // LookaheadDecodingConfig + nb::cast>(state[2]), // MedusaChoices + nb::cast>(state[3]) // EagleConfig + ); + }; + nb::class_(m, "DecodingConfig") + .def(nb::init, std::optional, + std::optional, std::optional>(), + nb::arg("decoding_mode") = nb::none(), nb::arg("lookahead_decoding_config") = nb::none(), + nb::arg("medusa_choices") = nb::none(), nb::arg("eagle_config") = nb::none()) + .def_prop_rw("decoding_mode", &tle::DecodingConfig::getDecodingMode, &tle::DecodingConfig::setDecodingMode) + .def_prop_rw("lookahead_decoding_config", &tle::DecodingConfig::getLookaheadDecodingConfig, + &tle::DecodingConfig::setLookaheadDecodingConfig) + .def_prop_rw("medusa_choices", &tle::DecodingConfig::getMedusaChoices, &tle::DecodingConfig::setMedusaChoices) + .def_prop_rw("eagle_config", &tle::DecodingConfig::getEagleConfig, &tle::DecodingConfig::setEagleConfig) + .def("__getstate__", decodingConfigGetstate) + .def("__setstate__", decodingConfigSetstate); + + auto debugConfigGetstate = [](tle::DebugConfig const& self) + { + return nb::make_tuple(self.getDebugInputTensors(), self.getDebugOutputTensors(), self.getDebugTensorNames(), + self.getDebugTensorsMaxIterations()); + }; + auto debugConfigSetstate = [](tle::DebugConfig& self, nb::tuple const& state) + { + if (state.size() != 4) + { + throw std::runtime_error("Invalid state!"); + } + new (&self) tle::DebugConfig(nb::cast(state[0]), nb::cast(state[1]), + nb::cast>(state[2]), nb::cast(state[3])); + }; + nb::class_(m, "DebugConfig") + .def(nb::init, SizeType32>(), 
nb::arg("debug_input_tensors") = false, + nb::arg("debug_output_tensors") = false, nb::arg("debug_tensor_names") = nb::none(), + nb::arg("debug_tensors_max_iterations") = false) + .def_prop_rw( + "debug_input_tensors", &tle::DebugConfig::getDebugInputTensors, &tle::DebugConfig::setDebugInputTensors) + .def_prop_rw( + "debug_output_tensors", &tle::DebugConfig::getDebugOutputTensors, &tle::DebugConfig::setDebugOutputTensors) + .def_prop_rw( + "debug_tensor_names", &tle::DebugConfig::getDebugTensorNames, &tle::DebugConfig::setDebugTensorNames) + .def_prop_rw("debug_tensors_max_iterations", &tle::DebugConfig::getDebugTensorsMaxIterations, + &tle::DebugConfig::setDebugTensorsMaxIterations) + .def("__getstate__", debugConfigGetstate) + .def("__setstate__", debugConfigSetstate); + + auto logitsPostProcessorConfigGetstate = [](tle::LogitsPostProcessorConfig const& self) + { return nb::make_tuple(self.getProcessorMap(), self.getProcessorBatched(), self.getReplicate()); }; + + auto logitsPostProcessorConfigSetstate = [](tle::LogitsPostProcessorConfig& self, nb::tuple const& state) + { + if (state.size() != 3) + { + throw std::runtime_error("Invalid LogitsPostProcessorConfig state!"); + } + new (&self) tle::LogitsPostProcessorConfig(nb::cast>(state[0]), + nb::cast>(state[1]), nb::cast(state[2])); + }; + + nb::class_(m, "LogitsPostProcessorConfig") + .def(nb::init, std::optional, + bool>(), + nb::arg("processor_map") = nb::none(), nb::arg("processor_batched") = nb::none(), + nb::arg("replicate") = true) + .def_prop_rw("processor_map", &tle::LogitsPostProcessorConfig::getProcessorMap, + &tle::LogitsPostProcessorConfig::setProcessorMap) + .def_prop_rw("processor_batched", &tle::LogitsPostProcessorConfig::getProcessorBatched, + &tle::LogitsPostProcessorConfig::setProcessorBatched) + .def_prop_rw( + "replicate", &tle::LogitsPostProcessorConfig::getReplicate, &tle::LogitsPostProcessorConfig::setReplicate) + .def("__getstate__", logitsPostProcessorConfigGetstate) + .def("__setstate__", logitsPostProcessorConfigSetstate); + + auto extendedRuntimePerfKnobConfigSetstate = [](tle::ExtendedRuntimePerfKnobConfig& self, nb::tuple const& state) + { + if (state.size() != 4) + { + throw std::runtime_error("Invalid extendedRuntimePerfKnobConfig state!"); + } + new (&self) tle::ExtendedRuntimePerfKnobConfig(nb::cast(state[0]), nb::cast(state[1]), + nb::cast(state[2]), nb::cast(state[2])); + }; + auto extendedRuntimePerfKnobConfigGetstate = [](tle::ExtendedRuntimePerfKnobConfig const& self) + { + return nb::make_tuple(self.getMultiBlockMode(), self.getEnableContextFMHAFP32Acc(), self.getCudaGraphMode(), + self.getCudaGraphCacheSize()); + }; + nb::class_(m, "ExtendedRuntimePerfKnobConfig") + .def( + nb::init(), nb::arg("multi_block_mode") = true, nb::arg("enable_context_fmha_fp32_acc") = false) + .def_prop_rw("multi_block_mode", &tle::ExtendedRuntimePerfKnobConfig::getMultiBlockMode, + &tle::ExtendedRuntimePerfKnobConfig::setMultiBlockMode) + .def_prop_rw("enable_context_fmha_fp32_acc", &tle::ExtendedRuntimePerfKnobConfig::getEnableContextFMHAFP32Acc, + &tle::ExtendedRuntimePerfKnobConfig::setEnableContextFMHAFP32Acc) + .def_prop_rw("cuda_graph_mode", &tle::ExtendedRuntimePerfKnobConfig::getCudaGraphMode, + &tle::ExtendedRuntimePerfKnobConfig::setCudaGraphMode) + .def_prop_rw("cuda_graph_cache_size", &tle::ExtendedRuntimePerfKnobConfig::getCudaGraphCacheSize, + &tle::ExtendedRuntimePerfKnobConfig::setCudaGraphCacheSize) + .def("__getstate__", extendedRuntimePerfKnobConfigGetstate) + .def("__setstate__", 
extendedRuntimePerfKnobConfigSetstate); + + auto SpeculativeDecodingConfigGetState + = [](tle::SpeculativeDecodingConfig const& self) { return nb::make_tuple(self.fastLogits); }; + auto SpeculativeDecodingConfigSetState = [](tle::SpeculativeDecodingConfig& self, nb::tuple const& state) + { + if (state.size() != 1) + { + throw std::runtime_error("Invalid SpeculativeDecodingConfig state!"); + } + new (&self) tle::SpeculativeDecodingConfig(nb::cast(state[0])); + }; + nb::class_(m, "SpeculativeDecodingConfig") + .def(nb::init(), nb::arg("fast_logits") = false) + .def_rw("fast_logits", &tle::SpeculativeDecodingConfig::fastLogits) + .def("__getstate__", SpeculativeDecodingConfigGetState) + .def("__setstate__", SpeculativeDecodingConfigSetState); + + // Guided decoding config + auto pyGuidedDecodingConfig = nb::class_(m, "GuidedDecodingConfig"); + + nb::enum_(pyGuidedDecodingConfig, "GuidedDecodingBackend") + .value("XGRAMMAR", tle::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR) + .value("LLGUIDANCE", tle::GuidedDecodingConfig::GuidedDecodingBackend::kLLGUIDANCE); + + auto guidedDecodingConfigGetstate = [](tle::GuidedDecodingConfig const& self) { + return nb::make_tuple( + self.getBackend(), self.getEncodedVocab(), self.getTokenizerStr(), self.getStopTokenIds()); + }; + auto guidedDecodingConfigSetstate = [](tle::GuidedDecodingConfig& self, nb::tuple state) + { + if (state.size() != 4) + { + throw std::runtime_error("Invalid GuidedDecodingConfig state!"); + } + new (&self) tle::GuidedDecodingConfig(nb::cast(state[0]), + nb::cast>>(state[1]), nb::cast>(state[2]), + nb::cast>>(state[3])); + }; + + pyGuidedDecodingConfig + .def(nb::init>, + std::optional, std::optional>>(), + nb::arg("backend"), nb::arg("encoded_vocab") = nb::none(), nb::arg("tokenizer_str") = nb::none(), + nb::arg("stop_token_ids") = nb::none()) + .def_prop_rw("backend", &tle::GuidedDecodingConfig::getBackend, &tle::GuidedDecodingConfig::setBackend) + .def_prop_rw( + "encoded_vocab", &tle::GuidedDecodingConfig::getEncodedVocab, &tle::GuidedDecodingConfig::setEncodedVocab) + .def_prop_rw( + "tokenizer_str", &tle::GuidedDecodingConfig::getTokenizerStr, &tle::GuidedDecodingConfig::setTokenizerStr) + .def_prop_rw( + "stop_token_ids", &tle::GuidedDecodingConfig::getStopTokenIds, &tle::GuidedDecodingConfig::setStopTokenIds) + .def("__getstate__", guidedDecodingConfigGetstate) + .def("__setstate__", guidedDecodingConfigSetstate); + + auto cacheTransceiverConfigGetstate + = [](tle::CacheTransceiverConfig const& self) { return nb::make_tuple(self.getMaxNumTokens()); }; + auto cacheTransceiverConfigSetstate = [](tle::CacheTransceiverConfig& self, nb::tuple const& state) + { + if (state.size() != 1) + { + throw std::runtime_error("Invalid CacheTransceiverConfig state!"); + } + new (&self) tle::CacheTransceiverConfig(nb::cast>(state[0])); + }; + + nb::class_(m, "CacheTransceiverConfig") + .def(nb::init>(), nb::arg("max_num_tokens") = nb::none()) + .def_prop_rw("max_num_tokens", &tle::CacheTransceiverConfig::getMaxNumTokens, + &tle::CacheTransceiverConfig::setMaxNumTokens) + .def("__getstate__", cacheTransceiverConfigGetstate) + .def("__setstate__", cacheTransceiverConfigSetstate); + + auto executorConfigGetState = [](nb::object const& self) + { + auto& c = nb::cast(self); + // Return a tuple containing C++ data and the Python __dict__ + auto cpp_states = nb::make_tuple(c.getMaxBeamWidth(), c.getSchedulerConfig(), c.getKvCacheConfig(), + c.getEnableChunkedContext(), c.getNormalizeLogProbs(), c.getIterStatsMaxIterations(), + 
c.getRequestStatsMaxIterations(), c.getBatchingType(), c.getMaxBatchSize(), c.getMaxNumTokens(), + c.getParallelConfig(), c.getPeftCacheConfig(), c.getLogitsPostProcessorConfig(), c.getDecodingConfig(), + c.getUseGpuDirectStorage(), c.getGpuWeightsPercent(), c.getMaxQueueSize(), + c.getExtendedRuntimePerfKnobConfig(), c.getDebugConfig(), c.getRecvPollPeriodMs(), + c.getMaxSeqIdleMicroseconds(), c.getSpecDecConfig(), c.getGuidedDecodingConfig(), + c.getAdditionalModelOutputs(), c.getCacheTransceiverConfig(), c.getGatherGenerationLogits(), + c.getPromptTableOffloading(), c.getEnableTrtOverlap()); + auto pickle_tuple = nb::make_tuple(cpp_states, nb::getattr(self, "__dict__")); + return pickle_tuple; + }; + + auto executorConfigSetState = [](nb::object self, nb::tuple const& state) + { + if (state.size() != 2) + { + throw std::runtime_error("Invalid state!"); + } + + auto cpp_states = nb::cast(state[0]); + if (cpp_states.size() != 28) + { + throw std::runtime_error("Invalid cpp_states!"); + } + + // Restore C++ data + tle::ExecutorConfig* cpp_self = nb::inst_ptr(self); + new (cpp_self) tle::ExecutorConfig( // + nb::cast(cpp_states[0]), // MaxBeamWidth + nb::cast(cpp_states[1]), // SchedulerConfig + nb::cast(cpp_states[2]), // KvCacheConfig + nb::cast(cpp_states[3]), // EnableChunkedContext + nb::cast(cpp_states[4]), // NormalizeLogProbs + nb::cast(cpp_states[5]), // IterStatsMaxIterations + nb::cast(cpp_states[6]), // RequestStatsMaxIterations + nb::cast(cpp_states[7]), // BatchingType + nb::cast>(cpp_states[8]), // MaxBatchSize + nb::cast>(cpp_states[9]), // MaxNumTokens + nb::cast>(cpp_states[10]), // ParallelConfig + nb::cast>(cpp_states[11]), // PeftCacheConfig + nb::cast>(cpp_states[12]), // LogitsPostProcessorConfig + nb::cast>(cpp_states[13]), // DecodingConfig + nb::cast(cpp_states[14]), // UseGpuDirectStorage + nb::cast(cpp_states[15]), // GpuWeightsPercent + nb::cast>(cpp_states[16]), // MaxQueueSize + nb::cast(cpp_states[17]), // ExtendedRuntimePerfKnobConfig + nb::cast>(cpp_states[18]), // DebugConfig + nb::cast(cpp_states[19]), // RecvPollPeriodMs + nb::cast(cpp_states[20]), // MaxSeqIdleMicroseconds + nb::cast>(cpp_states[21]), // SpecDecConfig + nb::cast>(cpp_states[22]), // GuidedDecodingConfig + nb::cast>>(cpp_states[23]), // AdditionalModelOutputs + nb::cast>(cpp_states[24]), // CacheTransceiverConfig + nb::cast(cpp_states[25]), // GatherGenerationLogits + nb::cast(cpp_states[26]), // PromptTableOffloading + nb::cast(cpp_states[27]) // EnableTrtOverlap + ); + + // Restore Python data + auto py_state = nb::cast(state[1]); + self.attr("__dict__").attr("update")(py_state); + + nb::inst_mark_ready(self); + }; + + nb::class_(m, "ExecutorConfig", nb::dynamic_attr()) + .def(nb::init< // + SizeType32, // MaxBeamWidth + tle::SchedulerConfig const&, // SchedulerConfig + tle::KvCacheConfig const&, // KvCacheConfig + bool, // EnableChunkedContext + bool, // NormalizeLogProbs + SizeType32, // IterStatsMaxIterations + SizeType32, // RequestStatsMaxIterations + tle::BatchingType, // BatchingType + std::optional, // MaxBatchSize + std::optional, // MaxNumTokens + std::optional, // ParallelConfig + tle::PeftCacheConfig const&, // PeftCacheConfig + std::optional, // LogitsPostProcessorConfig + std::optional, // DecodingConfig + bool, // UseGpuDirectStorage + float, // GpuWeightsPercent + std::optional, // MaxQueueSize + tle::ExtendedRuntimePerfKnobConfig const&, // ExtendedRuntimePerfKnobConfig + std::optional, // DebugConfig + SizeType32, // RecvPollPeriodMs + uint64_t, // 
MaxSeqIdleMicroseconds + std::optional, // SpecDecConfig + std::optional, // GuidedDecodingConfig + std::optional>, // AdditionalModelOutputs + std::optional, // CacheTransceiverConfig + bool, // GatherGenerationLogits + bool, // PromptTableOffloading + bool // EnableTrtOverlap + >(), + nb::arg("max_beam_width") = 1, nb::arg("scheduler_config") = tle::SchedulerConfig(), + nb::arg("kv_cache_config") = tle::KvCacheConfig(), nb::arg("enable_chunked_context") = false, + nb::arg("normalize_log_probs") = true, + nb::arg("iter_stats_max_iterations") = tle::ExecutorConfig::kDefaultIterStatsMaxIterations, + nb::arg("request_stats_max_iterations") = tle::ExecutorConfig::kDefaultRequestStatsMaxIterations, + nb::arg("batching_type") = tle::BatchingType::kINFLIGHT, nb::arg("max_batch_size") = nb::none(), + nb::arg("max_num_tokens") = nb::none(), nb::arg("parallel_config") = nb::none(), + nb::arg("peft_cache_config") = tle::PeftCacheConfig(), nb::arg("logits_post_processor_config") = nb::none(), + nb::arg("decoding_config") = nb::none(), nb::arg("use_gpu_direct_storage") = false, + nb::arg("gpu_weights_percent") = 1.0, nb::arg("max_queue_size") = nb::none(), + nb::arg("extended_runtime_perf_knob_config") = tle::ExtendedRuntimePerfKnobConfig(), + nb::arg("debug_config") = nb::none(), nb::arg("recv_poll_period_ms") = 0, + nb::arg("max_seq_idle_microseconds") = tle::ExecutorConfig::kDefaultMaxSeqIdleMicroseconds, + nb::arg("spec_dec_config") = nb::none(), nb::arg("guided_decoding_config") = nb::none(), + nb::arg("additional_model_outputs") = nb::none(), nb::arg("cache_transceiver_config") = nb::none(), + nb::arg("gather_generation_logits") = false, nb::arg("mm_embedding_offloading") = false, + nb::arg("enable_trt_overlap") = false) + .def_prop_rw("max_beam_width", &tle::ExecutorConfig::getMaxBeamWidth, &tle::ExecutorConfig::setMaxBeamWidth) + .def_prop_rw("max_batch_size", &tle::ExecutorConfig::getMaxBatchSize, &tle::ExecutorConfig::setMaxBatchSize) + .def_prop_rw("max_num_tokens", &tle::ExecutorConfig::getMaxNumTokens, &tle::ExecutorConfig::setMaxNumTokens) + .def_prop_rw( + "scheduler_config", &tle::ExecutorConfig::getSchedulerConfigRef, &tle::ExecutorConfig::setSchedulerConfig) + .def_prop_rw( + "kv_cache_config", &tle::ExecutorConfig::getKvCacheConfigRef, &tle::ExecutorConfig::setKvCacheConfig) + .def_prop_rw("enable_chunked_context", &tle::ExecutorConfig::getEnableChunkedContext, + &tle::ExecutorConfig::setEnableChunkedContext) + .def_prop_rw("normalize_log_probs", &tle::ExecutorConfig::getNormalizeLogProbs, + &tle::ExecutorConfig::setNormalizeLogProbs) + .def_prop_rw("iter_stats_max_iterations", &tle::ExecutorConfig::getIterStatsMaxIterations, + &tle::ExecutorConfig::setIterStatsMaxIterations) + .def_prop_rw("request_stats_max_iterations", &tle::ExecutorConfig::getRequestStatsMaxIterations, + &tle::ExecutorConfig::setRequestStatsMaxIterations) + .def_prop_rw("batching_type", &tle::ExecutorConfig::getBatchingType, &tle::ExecutorConfig::setBatchingType) + .def_prop_rw( + "parallel_config", &tle::ExecutorConfig::getParallelConfig, &tle::ExecutorConfig::setParallelConfig) + .def_prop_rw( + "peft_cache_config", &tle::ExecutorConfig::getPeftCacheConfig, &tle::ExecutorConfig::setPeftCacheConfig) + .def_prop_rw("logits_post_processor_config", &tle::ExecutorConfig::getLogitsPostProcessorConfig, + &tle::ExecutorConfig::setLogitsPostProcessorConfig) + .def_prop_rw( + "decoding_config", &tle::ExecutorConfig::getDecodingConfig, &tle::ExecutorConfig::setDecodingConfig) + .def_prop_rw("use_gpu_direct_storage", 
&tle::ExecutorConfig::getUseGpuDirectStorage, + &tle::ExecutorConfig::setUseGpuDirectStorage) + .def_prop_rw("gpu_weights_percent", &tle::ExecutorConfig::getGpuWeightsPercent, + &tle::ExecutorConfig::setGpuWeightsPercent) + .def_prop_rw("max_queue_size", &tle::ExecutorConfig::getMaxQueueSize, &tle::ExecutorConfig::setMaxQueueSize) + .def_prop_rw("extended_runtime_perf_knob_config", &tle::ExecutorConfig::getExtendedRuntimePerfKnobConfig, + &tle::ExecutorConfig::setExtendedRuntimePerfKnobConfig) + .def_prop_rw("debug_config", &tle::ExecutorConfig::getDebugConfig, &tle::ExecutorConfig::setDebugConfig) + .def_prop_rw( + "recv_poll_period_ms", &tle::ExecutorConfig::getRecvPollPeriodMs, &tle::ExecutorConfig::setRecvPollPeriodMs) + .def_prop_rw("max_seq_idle_microseconds", &tle::ExecutorConfig::getMaxSeqIdleMicroseconds, + &tle::ExecutorConfig::setMaxSeqIdleMicroseconds) + .def_prop_rw("spec_dec_config", &tle::ExecutorConfig::getSpecDecConfig, &tle::ExecutorConfig::setSpecDecConfig) + .def_prop_rw("guided_decoding_config", &tle::ExecutorConfig::getGuidedDecodingConfig, + &tle::ExecutorConfig::setGuidedDecodingConfig) + .def_prop_rw("additional_model_outputs", &tle::ExecutorConfig::getAdditionalModelOutputs, + &tle::ExecutorConfig::setAdditionalModelOutputs) + .def_prop_rw("cache_transceiver_config", &tle::ExecutorConfig::getCacheTransceiverConfig, + &tle::ExecutorConfig::setCacheTransceiverConfig) + .def_prop_rw("gather_generation_logits", &tle::ExecutorConfig::getGatherGenerationLogits, + &tle::ExecutorConfig::setGatherGenerationLogits) + .def_prop_rw("mm_embedding_offloading", &tle::ExecutorConfig::getPromptTableOffloading, + &tle::ExecutorConfig::setPromptTableOffloading) + .def_prop_rw( + "enable_trt_overlap", &tle::ExecutorConfig::getEnableTrtOverlap, &tle::ExecutorConfig::setEnableTrtOverlap) + .def("__getstate__", executorConfigGetState) + .def("__setstate__", executorConfigSetState); +} + +} // namespace tensorrt_llm::nanobind::executor diff --git a/cpp/tensorrt_llm/nanobind/executor/executorConfig.h b/cpp/tensorrt_llm/nanobind/executor/executorConfig.h new file mode 100644 index 00000000000..5b63e7c5a3e --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/executor/executorConfig.h @@ -0,0 +1,30 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace nb = nanobind; + +namespace tensorrt_llm::nanobind::executor +{ + +// Register bindings for executor API. +void initConfigBindings(nb::module_& m); + +} // namespace tensorrt_llm::nanobind::executor diff --git a/cpp/tensorrt_llm/nanobind/executor/request.cpp b/cpp/tensorrt_llm/nanobind/executor/request.cpp new file mode 100644 index 00000000000..9c3d34aa8fd --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/executor/request.cpp @@ -0,0 +1,935 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "request.h" +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/executor/executor.h" +#include "tensorrt_llm/executor/serializeUtils.h" +#include "tensorrt_llm/executor/tensor.h" +#include "tensorrt_llm/executor/types.h" +#include "tensorrt_llm/nanobind/common/customCasters.h" +#include "tensorrt_llm/runtime/cudaStream.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace nb = nanobind; +namespace tle = tensorrt_llm::executor; +using Tensor = tle::Tensor; +using SizeType32 = tle::SizeType32; +using FloatType = tle::FloatType; +using VecTokens = tle::VecTokens; +using IdType = tle::IdType; +using VecTokenExtraIds = tle::VecTokenExtraIds; + +namespace tensorrt_llm::nanobind::executor +{ + +void initRequestBindings(nb::module_& m) +{ + nb::enum_(m, "RequestType") + .value("REQUEST_TYPE_CONTEXT_AND_GENERATION", tle::RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION) + .value("REQUEST_TYPE_CONTEXT_ONLY", tle::RequestType::REQUEST_TYPE_CONTEXT_ONLY) + .value("REQUEST_TYPE_GENERATION_ONLY", tle::RequestType::REQUEST_TYPE_GENERATION_ONLY); + + nb::enum_(m, "FinishReason") + .value("NOT_FINISHED", tle::FinishReason::kNOT_FINISHED) + .value("END_ID", tle::FinishReason::kEND_ID) + .value("STOP_WORDS", tle::FinishReason::kSTOP_WORDS) + .value("LENGTH", tle::FinishReason::kLENGTH) + .value("TIMED_OUT", tle::FinishReason::kTIMED_OUT) + .value("CANCELLED", tle::FinishReason::kCANCELLED); + + nb::enum_(m, "KvCacheTransferMode") + .value("DRAM", tle::KvCacheTransferMode::DRAM) + .value("GDS", tle::KvCacheTransferMode::GDS) + .value("POSIX_DEBUG_FALLBACK", tle::KvCacheTransferMode::POSIX_DEBUG_FALLBACK); + + auto samplingConfigGetstate = [](tle::SamplingConfig const& self) + { + return nb::make_tuple(self.getBeamWidth(), self.getTopK(), self.getTopP(), self.getTopPMin(), + self.getTopPResetIds(), self.getTopPDecay(), self.getSeed(), self.getTemperature(), self.getMinTokens(), + self.getBeamSearchDiversityRate(), self.getRepetitionPenalty(), self.getPresencePenalty(), + self.getFrequencyPenalty(), self.getLengthPenalty(), self.getEarlyStopping(), self.getNoRepeatNgramSize(), + self.getNumReturnSequences(), self.getMinP(), self.getBeamWidthArray()); + }; + auto samplingConfigSetstate = [](tle::SamplingConfig& samplingConfig, nb::tuple const& state) + { + if (state.size() != 19) + { + throw std::runtime_error("Invalid SamplingConfig state!"); + } + new (&samplingConfig) tle::SamplingConfig(nb::cast(state[0]), // BeamWidth + nb::cast>(state[1]), // TopK + nb::cast>(state[2]), // TopP + nb::cast>(state[3]), // TopPMin + nb::cast>(state[4]), // TopPResetIds + nb::cast>(state[5]), // TopPDecay + nb::cast>(state[6]), // Seed + nb::cast>(state[7]), // Temperature + nb::cast>(state[8]), // MinTokens + nb::cast>(state[9]), // BeamSearchDiversityRate + nb::cast>(state[10]), // 
RepetitionPenalty + nb::cast>(state[11]), // PresencePenalty + nb::cast>(state[12]), // FrequencyPenalty + nb::cast>(state[13]), // LengthPenalty + nb::cast>(state[14]), // EarlyStopping + nb::cast>(state[15]), // NoRepeatNgramSize + nb::cast>(state[16]), // NumReturnSequences + nb::cast>(state[17]), // MinP + nb::cast>>(state[18]) // BeamWidthArray + ); + }; + nb::class_(m, "SamplingConfig") + .def(nb::init const&, // beamWidth + std::optional const&, // topP + std::optional const&, // topPMin + std::optional const&, // topPResetIds + std::optional const&, // topPDecay + std::optional const&, // seed + std::optional const&, // temperature + std::optional const&, // minTokens + std::optional const&, // beamSearchDiversityRate + std::optional const&, // repetitionPenalty + std::optional const&, // presencePenalty + std::optional const&, // frequencyPenalty + std::optional const&, // lengthPenalty + std::optional const&, // earlyStopping + std::optional const&, // noRepeatNgramSize + std::optional const&, // numReturnSequences + std::optional const&, // minP + std::optional> const& // beamWidthArray + >(), + // clang-format off + nb::arg("beam_width") = 1, + nb::kw_only(), + nb::arg("top_k") = nb::none(), + nb::arg("top_p") = nb::none(), + nb::arg("top_p_min") = nb::none(), + nb::arg("top_p_reset_ids") = nb::none(), + nb::arg("top_p_decay") = nb::none(), + nb::arg("seed") = nb::none(), + nb::arg("temperature") = nb::none(), + nb::arg("min_tokens") = nb::none(), + nb::arg("beam_search_diversity_rate") = nb::none(), + nb::arg("repetition_penalty") = nb::none(), + nb::arg("presence_penalty") = nb::none(), + nb::arg("frequency_penalty") = nb::none(), + nb::arg("length_penalty") = nb::none(), + nb::arg("early_stopping") = nb::none(), + nb::arg("no_repeat_ngram_size") = nb::none(), + nb::arg("num_return_sequences") = nb::none(), + nb::arg("min_p") = nb::none(), + nb::arg("beam_width_array") = nb::none()) // clang-format on + .def_prop_rw("beam_width", &tle::SamplingConfig::getBeamWidth, &tle::SamplingConfig::setBeamWidth) + .def_prop_rw("top_k", &tle::SamplingConfig::getTopK, &tle::SamplingConfig::setTopK) + .def_prop_rw("top_p", &tle::SamplingConfig::getTopP, &tle::SamplingConfig::setTopP) + .def_prop_rw("top_p_min", &tle::SamplingConfig::getTopPMin, &tle::SamplingConfig::setTopPMin) + .def_prop_rw("top_p_reset_ids", &tle::SamplingConfig::getTopPResetIds, &tle::SamplingConfig::setTopPResetIds) + .def_prop_rw("top_p_decay", &tle::SamplingConfig::getTopPDecay, &tle::SamplingConfig::setTopPDecay) + .def_prop_rw("seed", &tle::SamplingConfig::getSeed, &tle::SamplingConfig::setSeed) + .def_prop_rw("temperature", &tle::SamplingConfig::getTemperature, &tle::SamplingConfig::setTemperature) + .def_prop_rw("min_tokens", &tle::SamplingConfig::getMinTokens, &tle::SamplingConfig::setMinTokens) + .def_prop_rw("beam_search_diversity_rate", &tle::SamplingConfig::getBeamSearchDiversityRate, + &tle::SamplingConfig::setBeamSearchDiversityRate) + .def_prop_rw("repetition_penalty", &tle::SamplingConfig::getRepetitionPenalty, + &tle::SamplingConfig::setRepetitionPenalty) + .def_prop_rw("presence_penalty", &tle::SamplingConfig::getPresencePenalty, + [](tle::SamplingConfig& self, std::optional v) { self.setPresencePenalty(v); }) + .def_prop_rw( + "frequency_penalty", &tle::SamplingConfig::getFrequencyPenalty, &tle::SamplingConfig::setFrequencyPenalty) + .def_prop_rw("length_penalty", &tle::SamplingConfig::getLengthPenalty, &tle::SamplingConfig::setLengthPenalty) + .def_prop_rw("early_stopping", 
&tle::SamplingConfig::getEarlyStopping, &tle::SamplingConfig::setEarlyStopping) + .def_prop_rw("no_repeat_ngram_size", &tle::SamplingConfig::getNoRepeatNgramSize, + &tle::SamplingConfig::setNoRepeatNgramSize) + .def_prop_rw("num_return_sequences", &tle::SamplingConfig::getNumReturnSequences, + &tle::SamplingConfig::setNumReturnSequences) + .def_prop_rw("min_p", &tle::SamplingConfig::getMinP, &tle::SamplingConfig::setMinP) + .def_prop_rw( + "beam_width_array", &tle::SamplingConfig::getBeamWidthArray, &tle::SamplingConfig::setBeamWidthArray) + .def("__getstate__", samplingConfigGetstate) + .def("__setstate__", samplingConfigSetstate); + + auto additionalModelOutputGetstate + = [](tle::AdditionalModelOutput const& self) { return nb::make_tuple(self.name, self.gatherContext); }; + auto additionalModelOutputSetstate = [](tle::AdditionalModelOutput& additionalModelOutput, nb::tuple const& state) + { + if (state.size() != 2) + { + throw std::runtime_error("Invalid AdditionalModelOutput state!"); + } + new (&additionalModelOutput) + tle::AdditionalModelOutput(nb::cast(state[0]), nb::cast(state[1])); + }; + nb::class_(m, "AdditionalModelOutput") + .def(nb::init(), nb::arg("name"), nb::arg("gather_context") = false) + .def_rw("name", &tle::AdditionalModelOutput::name) + .def_rw("gather_context", &tle::AdditionalModelOutput::gatherContext) + .def("__getstate__", additionalModelOutputGetstate) + .def("__setstate__", additionalModelOutputSetstate); + + auto outputConfigGetstate = [](tle::OutputConfig const& self) + { + return nb::make_tuple(self.returnLogProbs, self.returnContextLogits, self.returnGenerationLogits, + self.excludeInputFromOutput, self.returnEncoderOutput, self.returnPerfMetrics, self.additionalModelOutputs); + }; + auto outputConfigSetstate = [](tle::OutputConfig& outputConfig, nb::tuple const& state) + { + if (state.size() != 7) + { + throw std::runtime_error("Invalid OutputConfig state!"); + } + new (&outputConfig) tle::OutputConfig(nb::cast(state[0]), nb::cast(state[1]), + nb::cast(state[2]), nb::cast(state[3]), nb::cast(state[4]), nb::cast(state[5]), + nb::cast>>(state[6])); + }; + nb::class_(m, "OutputConfig") + .def(nb::init>>(), + nb::arg("return_log_probs").none() = false, nb::arg("return_context_logits") = false, + nb::arg("return_generation_logits") = false, nb::arg("exclude_input_from_output") = false, + nb::arg("return_encoder_output") = false, nb::arg("return_perf_metrics") = false, + nb::arg("additional_model_outputs") = nb::none()) + .def_rw("return_log_probs", &tle::OutputConfig::returnLogProbs) + .def_rw("return_context_logits", &tle::OutputConfig::returnContextLogits) + .def_rw("return_generation_logits", &tle::OutputConfig::returnGenerationLogits) + .def_rw("exclude_input_from_output", &tle::OutputConfig::excludeInputFromOutput) + .def_rw("return_encoder_output", &tle::OutputConfig::returnEncoderOutput) + .def_rw("return_perf_metrics", &tle::OutputConfig::returnPerfMetrics) + .def_rw("additional_model_outputs", &tle::OutputConfig::additionalModelOutputs) + .def("__getstate__", outputConfigGetstate) + .def("__setstate__", outputConfigSetstate); + + auto externalDraftTokensConfigGetstate = [](tle::ExternalDraftTokensConfig const& self) + { return nb::make_tuple(self.getTokens(), self.getLogits(), self.getAcceptanceThreshold()); }; + auto externalDraftTokensConfigSetstate + = [](tle::ExternalDraftTokensConfig& externalDraftTokensConfig, nb::tuple const& state) + { + if (state.size() != 3) + { + throw std::runtime_error("Invalid ExternalDraftTokensConfig state!"); + } + 
new (&externalDraftTokensConfig) tle::ExternalDraftTokensConfig(nb::cast(state[0]), + nb::cast>(state[1]), nb::cast>(state[2])); + }; + nb::class_(m, "ExternalDraftTokensConfig") + .def(nb::init, std::optional const&, std::optional>(), + nb::arg("tokens"), nb::arg("logits") = nb::none(), nb::arg("acceptance_threshold") = nb::none(), + nb::arg("fast_logits") = nb::none()) + .def_prop_ro("tokens", &tle::ExternalDraftTokensConfig::getTokens) + .def_prop_ro("logits", &tle::ExternalDraftTokensConfig::getLogits) + .def_prop_ro("acceptance_threshold", &tle::ExternalDraftTokensConfig::getAcceptanceThreshold) + .def("__getstate__", externalDraftTokensConfigGetstate) + .def("__setstate__", externalDraftTokensConfigSetstate) + .def_prop_ro("fast_logits", &tle::ExternalDraftTokensConfig::getFastLogits); + + auto promptTuningConfigGetstate = [](tle::PromptTuningConfig const& self) + { return nb::make_tuple(self.getEmbeddingTable(), self.getInputTokenExtraIds()); }; + auto promptTuningConfigSetstate = [](tle::PromptTuningConfig& promptTuningConfig, nb::tuple const& state) + { + if (state.size() != 2) + { + throw std::runtime_error("Invalid PromptTuningConfig state!"); + } + new (&promptTuningConfig) + tle::PromptTuningConfig(nb::cast(state[0]), nb::cast>(state[1])); + }; + nb::class_(m, "PromptTuningConfig") + .def(nb::init>(), nb::arg("embedding_table"), + nb::arg("input_token_extra_ids") = nb::none()) + .def_prop_ro("embedding_table", &tle::PromptTuningConfig::getEmbeddingTable) + .def_prop_ro("input_token_extra_ids", &tle::PromptTuningConfig::getInputTokenExtraIds) + .def("__getstate__", promptTuningConfigGetstate) + .def("__setstate__", promptTuningConfigSetstate); + + auto loraConfigGetstate = [](tle::LoraConfig const& self) + { return nb::make_tuple(self.getTaskId(), self.getWeights(), self.getConfig()); }; + auto loraConfigSetstate = [](tle::LoraConfig& loraConfig, nb::tuple const& state) + { + if (state.size() != 3) + { + throw std::runtime_error("Invalid LoraConfig state!"); + } + new (&loraConfig) tle::LoraConfig(nb::cast(state[0]), nb::cast>(state[1]), + nb::cast>(state[2])); + }; + nb::class_(m, "LoraConfig") + .def(nb::init, std::optional>(), nb::arg("task_id"), + nb::arg("weights") = nb::none(), nb::arg("config") = nb::none()) + .def_prop_ro("task_id", &tle::LoraConfig::getTaskId) + .def_prop_ro("weights", &tle::LoraConfig::getWeights) + .def_prop_ro("config", &tle::LoraConfig::getConfig) + .def("__getstate__", loraConfigGetstate) + .def("__setstate__", loraConfigSetstate); + + auto multimodalInputGetstate = [](tle::MultimodalInput const& self) + { return nb::make_tuple(self.getMultimodalHashes(), self.getMultimodalPositions(), self.getMultimodalLengths()); }; + auto multimodalInputSetstate = [](tle::MultimodalInput& multimodalInput, nb::tuple const& state) + { + if (state.size() != 3) + { + throw std::runtime_error("Invalid MultimodalInput state!"); + } + new (&multimodalInput) tle::MultimodalInput(nb::cast>>(state[0]), + nb::cast>(state[1]), nb::cast>(state[2])); + }; + nb::class_(m, "MultimodalInput") + .def(nb::init>, std::vector, std::vector>(), + nb::arg("multimodal_hashes"), nb::arg("multimodal_positions"), nb::arg("multimodal_lengths")) + .def_prop_ro("multimodal_hashes", &tle::MultimodalInput::getMultimodalHashes) + .def_prop_ro("multimodal_positions", &tle::MultimodalInput::getMultimodalPositions) + .def_prop_ro("multimodal_lengths", &tle::MultimodalInput::getMultimodalLengths) + .def("__getstate__", multimodalInputGetstate) + .def("__setstate__", multimodalInputSetstate); + + 
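// Illustrative, self-contained sketch of the pickling pattern used throughout this file, shown
// for a hypothetical `Point` type in a hypothetical `pickling_sketch` module (neither is part of
// TensorRT-LLM). `__getstate__` reduces the object to a plain tuple; `__setstate__` is handed
// uninitialized storage, so it validates the tuple and constructs the object in place with
// placement new, mirroring the *Getstate/*Setstate lambdas above.

#include <nanobind/nanobind.h>

#include <new>
#include <stdexcept>

namespace nb = nanobind;

struct Point
{
    Point(int x, int y)
        : x{x}
        , y{y}
    {
    }

    int x;
    int y;
};

NB_MODULE(pickling_sketch, m)
{
    nb::class_<Point>(m, "Point")
        .def(nb::init<int, int>(), nb::arg("x"), nb::arg("y"))
        .def_rw("x", &Point::x)
        .def_rw("y", &Point::y)
        // Reduce to a tuple of members, analogous to the getstate lambdas above.
        .def("__getstate__", [](Point const& self) { return nb::make_tuple(self.x, self.y); })
        // Rebuild from the tuple; `self` refers to uninitialized storage, hence placement new.
        .def("__setstate__",
            [](Point& self, nb::tuple const& state)
            {
                if (state.size() != 2)
                {
                    throw std::runtime_error("Invalid Point state!");
                }
                new (&self) Point(nb::cast<int>(state[0]), nb::cast<int>(state[1]));
            });
}

// Types that also carry Python-side attributes (such as the ExecutorConfig binding earlier in this
// diff, declared with nb::dynamic_attr) extend the same idea by pickling a (cpp_states, __dict__)
// pair and finishing the in-place construction with nb::inst_mark_ready.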
auto MropeConfigGetstate = [](tle::MropeConfig const& self) + { return nb::make_tuple(self.getMRopeRotaryCosSin(), self.getMRopePositionDeltas()); }; + auto MropeConfigSetstate = [](tle::MropeConfig& mropeConfig, nb::tuple const& state) + { + if (state.size() != 2) + { + throw std::runtime_error("Invalid MropeConfig state!"); + } + new (&mropeConfig) tle::MropeConfig(nb::cast(state[0]), nb::cast(state[1])); + }; + nb::class_(m, "MropeConfig") + .def(nb::init(), nb::arg("mrope_rotary_cos_sin"), nb::arg("mrope_position_deltas")) + .def_prop_ro("mrope_rotary_cos_sin", &tle::MropeConfig::getMRopeRotaryCosSin) + .def_prop_ro("mrope_position_deltas", &tle::MropeConfig::getMRopePositionDeltas) + .def("__getstate__", MropeConfigGetstate) + .def("__setstate__", MropeConfigSetstate); + + auto lookaheadDecodingConfigGetstate = [](tle::LookaheadDecodingConfig const& self) + { return nb::make_tuple(self.getWindowSize(), self.getNgramSize(), self.getVerificationSetSize()); }; + auto lookaheadDecodingConfigSetstate + = [](tle::LookaheadDecodingConfig& lookaheadDecodingConfig, nb::tuple const& state) + { + if (state.size() != 3) + { + throw std::runtime_error("Invalid LookaheadDecodingConfig state!"); + } + new (&lookaheadDecodingConfig) tle::LookaheadDecodingConfig( + nb::cast(state[0]), nb::cast(state[1]), nb::cast(state[2])); + }; + nb::class_(m, "LookaheadDecodingConfig") + .def(nb::init(), nb::arg("max_window_size"), nb::arg("max_ngram_size"), + nb::arg("max_verification_set_size")) + .def_prop_ro("max_window_size", &tle::LookaheadDecodingConfig::getWindowSize) + .def_prop_ro("max_ngram_size", &tle::LookaheadDecodingConfig::getNgramSize) + .def_prop_ro("max_verification_set_size", &tle::LookaheadDecodingConfig::getVerificationSetSize) + .def("calculate_speculative_resource", &tle::LookaheadDecodingConfig::calculateSpeculativeResource) + .def_static( + "calculate_speculative_resource_tuple", &tle::LookaheadDecodingConfig::calculateSpeculativeResourceTuple) + .def("__getstate__", lookaheadDecodingConfigGetstate) + .def("__setstate__", lookaheadDecodingConfigSetstate) + .def_static("get_default_lookahead_decoding_window", + []() { return tle::LookaheadDecodingConfig::kDefaultLookaheadDecodingWindow; }) + .def_static("get_default_lookahead_decoding_ngram", + []() { return tle::LookaheadDecodingConfig::kDefaultLookaheadDecodingNgram; }) + .def_static("get_default_lookahead_decoding_verification_set", + []() { return tle::LookaheadDecodingConfig::kDefaultLookaheadDecodingVerificationSet; }); + + auto TokenRangeRetentionConfigGetstate = [](tle::KvCacheRetentionConfig::TokenRangeRetentionConfig const& self) + { return nb::make_tuple(self.tokenStart, self.tokenEnd, self.priority, self.durationMs); }; + auto TokenRangeRetentionConfigSetstate + = [](tle::KvCacheRetentionConfig::TokenRangeRetentionConfig& tokenRangeRetentionConfig, nb::tuple const& state) + { + if (state.size() != 4) + { + throw std::runtime_error("Invalid state!"); + } + new (&tokenRangeRetentionConfig) tle::KvCacheRetentionConfig::TokenRangeRetentionConfig( + nb::cast(state[0]), nb::cast>(state[1]), + nb::cast(state[2]), nb::cast>(state[3])); + }; + auto kvCacheRetentionConfigGetstate = [](tle::KvCacheRetentionConfig const& self) + { + return nb::make_tuple(self.getTokenRangeRetentionConfigs(), self.getDecodeRetentionPriority(), + self.getDecodeDurationMs(), self.getTransferMode(), self.getDirectory()); + }; + auto kvCacheRetentionConfigSetstate + = [](tle::KvCacheRetentionConfig& kvCacheRetentionConfig, nb::tuple const& state) + { + if 
(state.size() != 5) + { + throw std::runtime_error("Invalid state!"); + } + new (&kvCacheRetentionConfig) tle::KvCacheRetentionConfig( + nb::cast>(state[0]), + nb::cast(state[1]), nb::cast>(state[2]), + nb::cast(state[3]), nb::cast>(state[4])); + }; + + auto kvCacheRetentionConfig = nb::class_(m, "KvCacheRetentionConfig"); + + nb::class_( + kvCacheRetentionConfig, "TokenRangeRetentionConfig") + .def(nb::init, tle::RetentionPriority, + std::optional>(), + nb::arg("token_start"), nb::arg("token_end"), nb::arg("priority"), nb::arg("duration_ms") = nb::none()) + .def_rw("token_start", &tle::KvCacheRetentionConfig::TokenRangeRetentionConfig::tokenStart) + .def_rw("token_end", &tle::KvCacheRetentionConfig::TokenRangeRetentionConfig::tokenEnd) + .def_rw("priority", &tle::KvCacheRetentionConfig::TokenRangeRetentionConfig::priority) + .def_rw("duration_ms", &tle::KvCacheRetentionConfig::TokenRangeRetentionConfig::durationMs) + .def("__getstate__", TokenRangeRetentionConfigGetstate) + .def("__setstate__", TokenRangeRetentionConfigSetstate) + .def("__eq__", &tle::KvCacheRetentionConfig::TokenRangeRetentionConfig::operator==); + + // There's a circular dependency between the declaration of the TokenRangeRetentionPriority and + // KvCacheRetentionConfig bindings. Defer definition of the KvCacheRetentionConfig bindings until the + // TokenRangeRetentionPriority bindings have been defined. + kvCacheRetentionConfig + .def(nb::init, tle::RetentionPriority, + std::optional, tle::KvCacheTransferMode, std::optional>(), + nb::arg("token_range_retention_configs"), + nb::arg("decode_retention_priority") = tle::KvCacheRetentionConfig::kDefaultRetentionPriority, + nb::arg("decode_duration_ms") = nb::none(), nb::arg("transfer_mode") = tle::KvCacheTransferMode::DRAM, + nb::arg("directory") = nb::none()) + .def_prop_ro("token_range_retention_configs", &tle::KvCacheRetentionConfig::getTokenRangeRetentionConfigs) + .def_prop_ro("decode_retention_priority", &tle::KvCacheRetentionConfig::getDecodeRetentionPriority) + .def_prop_ro("decode_duration_ms", &tle::KvCacheRetentionConfig::getDecodeDurationMs) + .def_prop_ro("transfer_mode", &tle::KvCacheRetentionConfig::getTransferMode) + .def_prop_ro("directory", &tle::KvCacheRetentionConfig::getDirectory) + .def("__getstate__", kvCacheRetentionConfigGetstate) + .def("__setstate__", kvCacheRetentionConfigSetstate) + .def("__eq__", &tle::KvCacheRetentionConfig::operator==); + + auto ContextPhaseParamsGetState = [](tle::ContextPhaseParams const& self) + { + if (self.getState() != nullptr) + { + auto serializedState = self.getSerializedState(); + return nb::make_tuple(self.getFirstGenTokens(), self.getReqId(), + nb::bytes(serializedState.data(), serializedState.size()), self.getDraftTokens()); + } + return nb::make_tuple(self.getFirstGenTokens(), self.getReqId(), nb::none(), self.getDraftTokens()); + }; + + auto ContextPhaseParamsSetState = [](tle::ContextPhaseParams& contextPhaseParams, nb::tuple const& state) + { + if (state.size() != 4) + { + throw std::runtime_error("Invalid ContextPhaseParams state!"); + } + if (!state[2].is_none()) + { + auto opaque_state = nb::cast(state[2]); + auto opaque_state_str_view = std::string_view(opaque_state.c_str(), opaque_state.size()); + new (&contextPhaseParams) tle::ContextPhaseParams(nb::cast(state[0]), + nb::cast(state[1]), + std::vector(opaque_state_str_view.begin(), opaque_state_str_view.end()), + nb::cast>(state[3])); + } + new (&contextPhaseParams) tle::ContextPhaseParams(nb::cast(state[0]), + nb::cast(state[1]), nb::cast>(state[3])); 
+ }; + + nb::class_(m, "ContextPhaseParams") + .def("__init__", + [](tle::ContextPhaseParams const& self, VecTokens const& first_gen_tokens, + tle::ContextPhaseParams::RequestIdType req_id, std::optional const& opaque_state, + std::optional const& draft_tokens) + { + if (opaque_state) + { + auto opaque_state_str_view + = std::string_view(opaque_state.value().c_str(), opaque_state.value().size()); + return std::make_unique(first_gen_tokens, req_id, + std::vector(opaque_state_str_view.begin(), opaque_state_str_view.end()), draft_tokens); + } + return std::make_unique(first_gen_tokens, req_id, draft_tokens); + }) + .def_prop_ro("first_gen_tokens", [](tle::ContextPhaseParams const& self) { return self.getFirstGenTokens(); }) + .def_prop_ro("draft_tokens", [](tle::ContextPhaseParams const& self) { return self.getDraftTokens(); }) + .def_prop_ro("req_id", &tle::ContextPhaseParams::getReqId) + .def_prop_ro("opaque_state", + [](tle::ContextPhaseParams const& self) + { + std::optional opaque_state{std::nullopt}; + if (self.getState() != nullptr) + { + auto serializedState = self.getSerializedState(); + opaque_state = nb::bytes(serializedState.data(), serializedState.size()); + } + return opaque_state; + }) + .def("__getstate__", ContextPhaseParamsGetState) + .def("__setstate__", ContextPhaseParamsSetState); + + auto EagleDecodingConfigGetstate = [](tle::EagleConfig const& self) + { + return nb::make_tuple(self.getEagleChoices(), self.isGreedySampling(), self.getPosteriorThreshold(), + self.useDynamicTree(), self.getDynamicTreeMaxTopK()); + }; + auto EagleDecodingConfigSetstate = [](tle::EagleConfig& eagleConfig, nb::tuple const& state) + { + if (state.size() != 5) + { + throw std::runtime_error("Invalid EagleConfig state!"); + } + new (&eagleConfig) tle::EagleConfig(nb::cast>(state[0]), + nb::cast(state[1]), nb::cast>(state[2]), nb::cast(state[3]), + nb::cast>(state[4])); + }; + nb::class_(m, "EagleConfig") + .def(nb::init, bool, std::optional, bool, std::optional>(), + nb::arg("eagle_choices") = nb::none(), nb::arg("greedy_sampling") = true, + nb::arg("posterior_threshold") = nb::none(), nb::arg("use_dynamic_tree") = false, + nb::arg("dynamic_tree_max_topK") = nb::none()) + .def_prop_ro("eagle_choices", &tle::EagleConfig::getEagleChoices) + .def_prop_ro("greedy_sampling", &tle::EagleConfig::isGreedySampling) + .def_prop_ro("posterior_threshold", &tle::EagleConfig::getPosteriorThreshold) + .def_prop_ro("use_dynamic_tree", &tle::EagleConfig::useDynamicTree) + .def_prop_ro("dynamic_tree_max_topK", &tle::EagleConfig::getDynamicTreeMaxTopK) + .def("__getstate__", EagleDecodingConfigGetstate) + .def("__setstate__", EagleDecodingConfigSetstate); + + // Guided decoding params + auto pyGuidedDecodingParams = nb::class_(m, "GuidedDecodingParams"); + + nb::enum_(pyGuidedDecodingParams, "GuideType") + .value("JSON", tle::GuidedDecodingParams::GuideType::kJSON) + .value("JSON_SCHEMA", tle::GuidedDecodingParams::GuideType::kJSON_SCHEMA) + .value("REGEX", tle::GuidedDecodingParams::GuideType::kREGEX) + .value("EBNF_GRAMMAR", tle::GuidedDecodingParams::GuideType::kEBNF_GRAMMAR) + .value("STRUCTURAL_TAG", tle::GuidedDecodingParams::GuideType::kSTRUCTURAL_TAG); + + auto guidedDecodingParamsGetstate + = [](tle::GuidedDecodingParams const& self) { return nb::make_tuple(self.getGuideType(), self.getGuide()); }; + + auto guidedDecodingParamsSetstate = [](tle::GuidedDecodingParams& guidedDecodingParams, nb::tuple const& state) + { + if (state.size() != 2) + { + throw std::runtime_error("Invalid GuidedDecodingParams 
state!"); + } + new (&guidedDecodingParams) tle::GuidedDecodingParams( + nb::cast(state[0]), nb::cast>(state[1])); + }; + + pyGuidedDecodingParams + .def(nb::init>(), nb::arg("guide_type"), + nb::arg("guide") = nb::none()) + .def_prop_ro("guide_type", &tle::GuidedDecodingParams::getGuideType) + .def_prop_ro("guide", &tle::GuidedDecodingParams::getGuide) + .def("__getstate__", guidedDecodingParamsGetstate) + .def("__setstate__", guidedDecodingParamsSetstate); + + auto requestGetstate = [](tle::Request const& self) + { + return nb::make_tuple(self.getInputTokenIds(), self.getMaxTokens(), self.getStreaming(), + self.getSamplingConfig(), self.getOutputConfig(), self.getEndId(), self.getPadId(), self.getPositionIds(), + self.getBadWords(), self.getStopWords(), self.getEmbeddingBias(), self.getExternalDraftTokensConfig(), + self.getPromptTuningConfig(), self.getMultimodalInput(), self.getMultimodalEmbedding(), + self.getMropeConfig(), self.getLoraConfig(), self.getLookaheadConfig(), self.getKvCacheRetentionConfig(), + self.getLogitsPostProcessorName(), self.getLogitsPostProcessor(), self.getEncoderInputTokenIds(), + self.getClientId(), self.getReturnAllGeneratedTokens(), self.getPriority(), self.getRequestType(), + self.getContextPhaseParams(), self.getEncoderInputFeatures(), self.getEncoderOutputLength(), + self.getCrossAttentionMask(), self.getEagleConfig(), self.getSkipCrossAttnBlocks(), + self.getGuidedDecodingParams()); + }; + auto requestSetstate = [](tle::Request& request, nb::tuple const& state) + { + if (state.size() != 33) + { + throw std::runtime_error("Invalid Request state!"); + } + new (&request) tle::Request(nb::cast(state[0]), nb::cast(state[1]), + nb::cast(state[2]), nb::cast(state[3]), nb::cast(state[4]), + nb::cast>(state[5]), nb::cast>(state[6]), + nb::cast>>(state[7]), + nb::cast>>(state[8]), + nb::cast>>(state[9]), nb::cast>(state[10]), + nb::cast>(state[11]), + nb::cast>(state[12]), + nb::cast>(state[13]), nb::cast>(state[14]), + nb::cast>(state[15]), nb::cast>(state[16]), + nb::cast>(state[17]), + nb::cast>(state[18]), + nb::cast>(state[19]), + nb::cast>(state[20]), nb::cast>(state[21]), + nb::cast>(state[22]), nb::cast(state[23]), + nb::cast(state[24]), nb::cast(state[25]), + nb::cast>(state[26]), + nb::cast>(state[27]), nb::cast>(state[28]), + nb::cast>(state[29]), 1, nb::cast>(state[30]), + nb::cast>(state[31]), + nb::cast>(state[32])); + }; + + nb::class_ request(m, "Request", nb::dynamic_attr()); + request + .def(nb::init const&, // endId + std::optional const&, // padId + std::optional>, // positionIds + std::optional>, // badWords + std::optional>, // stopWords + std::optional, // embeddingBias + std::optional, // externalDraftTokensConfig + std::optional, // pTuningConfig + std::optional, // multimodalInput + std::optional, // multimodalEmbedding + std::optional, // mRopeConfig + std::optional, // loraConfig + std::optional, // lookaheadConfig + std::optional, // kvCacheRetentionConfig + std::optional, // logitsPostProcessorName + std::optional, // logitsPostProcessor + std::optional, // encoderInputTokenIds + std::optional, // clientId + bool, // returnAllGeneratedTokens + tle::PriorityType, // priority + tle::RequestType, // type + std::optional, // contextPhaseParams + std::optional, // encoderInputFeatures + std::optional, // encoderOutputLength + std::optional, // crossAttentionMask + SizeType32, // numReturnSequences + std::optional, // eagleConfig + std::optional, // skipCrossAttnBlocks + std::optional, // guidedDecodingParams + std::optional, // 
languageAdapterUid + std::optional // allottedTimeMs + >(), + // clang-format off + nb::arg("input_token_ids"), + nb::arg("max_tokens"), + nb::kw_only(), + nb::arg("streaming") = false, + nb::arg("sampling_config") = tle::SamplingConfig(), + nb::arg("output_config") = tle::OutputConfig(), + nb::arg("end_id") = nb::none(), + nb::arg("pad_id") = nb::none(), + nb::arg("position_ids") = nb::none(), + nb::arg("bad_words") = nb::none(), + nb::arg("stop_words") = nb::none(), + nb::arg("embedding_bias") = nb::none(), + nb::arg("external_draft_tokens_config") = nb::none(), + nb::arg("prompt_tuning_config") = nb::none(), + nb::arg("multimodal_input") = nb::none(), + nb::arg("multimodal_embedding") = nb::none(), + nb::arg("mrope_config") = nb::none(), + nb::arg("lora_config") = nb::none(), + nb::arg("lookahead_config") = nb::none(), + nb::arg("kv_cache_retention_config") = nb::none(), + nb::arg("logits_post_processor_name") = nb::none(), + nb::arg("logits_post_processor") = nb::none(), + nb::arg("encoder_input_token_ids") = nb::none(), + nb::arg("client_id") = nb::none(), + nb::arg("return_all_generated_tokens") = false, + nb::arg("priority") = tle::Request::kDefaultPriority, + nb::arg("type") = tle::RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION, + nb::arg("context_phase_params") = nb::none(), + nb::arg("encoder_input_features") = nb::none(), + nb::arg("encoder_output_length") = nb::none(), + nb::arg("cross_attention_mask") = nb::none(), + nb::arg("num_return_sequences") = 1, + nb::arg("eagle_config") = nb::none(), + nb::arg("skip_cross_attn_blocks") = nb::none(), + nb::arg("guided_decoding_params") = nb::none(), + nb::arg("language_adapter_uid") = nb::none(), + nb::arg("allotted_time_ms") = nb::none() + ) // clang-format on + .def_prop_ro("input_token_ids", &tle::Request::getInputTokenIds) + .def_prop_ro("max_tokens", &tle::Request::getMaxTokens) + .def_prop_rw("streaming", &tle::Request::getStreaming, &tle::Request::setStreaming) + .def_prop_rw("sampling_config", &tle::Request::getSamplingConfig, &tle::Request::setSamplingConfig) + .def_prop_rw("output_config", &tle::Request::getOutputConfig, &tle::Request::setOutputConfig) + .def_prop_rw("end_id", &tle::Request::getEndId, &tle::Request::setEndId) + .def_prop_rw("pad_id", &tle::Request::getPadId, &tle::Request::setPadId) + .def_prop_rw("position_ids", &tle::Request::getPositionIds, &tle::Request::setPositionIds) + .def_prop_rw("bad_words", &tle::Request::getBadWords, &tle::Request::setBadWords) + .def_prop_rw("stop_words", &tle::Request::getStopWords, &tle::Request::setStopWords) + .def_prop_rw("embedding_bias", &tle::Request::getEmbeddingBias, &tle::Request::setEmbeddingBias) + .def_prop_rw("external_draft_tokens_config", &tle::Request::getExternalDraftTokensConfig, + &tle::Request::setExternalDraftTokensConfig) + .def_prop_rw("prompt_tuning_config", &tle::Request::getPromptTuningConfig, &tle::Request::setPromptTuningConfig) + .def_prop_rw("multimodal_input", &tle::Request::getMultimodalInput, &tle::Request::setMultimodalInput) + .def_prop_rw( + "multimodal_embedding", &tle::Request::getMultimodalEmbedding, &tle::Request::setMultimodalEmbedding) + .def_prop_rw("mrope_config", &tle::Request::getMropeConfig, &tle::Request::setMropeConfig) + .def_prop_rw("lora_config", &tle::Request::getLoraConfig, &tle::Request::setLoraConfig) + .def_prop_rw("lookahead_config", &tle::Request::getLookaheadConfig, &tle::Request::setLookaheadConfig) + .def_prop_rw("kv_cache_retention_config", &tle::Request::getKvCacheRetentionConfig, + 
&tle::Request::setKvCacheRetentionConfig) + .def_prop_rw("logits_post_processor_name", &tle::Request::getLogitsPostProcessorName, + &tle::Request::setLogitsPostProcessorName) + .def_prop_rw( + "logits_post_processor", &tle::Request::getLogitsPostProcessor, &tle::Request::setLogitsPostProcessor) + .def_prop_rw( + "encoder_input_token_ids", &tle::Request::getEncoderInputTokenIds, &tle::Request::setEncoderInputTokenIds) + .def_prop_rw("client_id", &tle::Request::getClientId, &tle::Request::setClientId) + .def_prop_rw("return_all_generated_tokens", &tle::Request::getReturnAllGeneratedTokens, + &tle::Request::setReturnAllGeneratedTokens) + .def_prop_rw("request_type", &tle::Request::getRequestType, &tle::Request::setRequestType) + .def_prop_rw( + "encoder_input_features", &tle::Request::getEncoderInputFeatures, &tle::Request::setEncoderInputFeatures) + .def_prop_rw("cross_attention_mask", &tle::Request::getCrossAttentionMask, &tle::Request::setCrossAttentionMask) + .def_prop_rw("eagle_config", &tle::Request::getEagleConfig, &tle::Request::setEagleConfig) + .def_prop_rw( + "skip_cross_attn_blocks", &tle::Request::getSkipCrossAttnBlocks, &tle::Request::setSkipCrossAttnBlocks) + .def_prop_rw( + "guided_decoding_params", &tle::Request::getGuidedDecodingParams, &tle::Request::setGuidedDecodingParams) + .def_prop_rw("allotted_time_ms", &tle::Request::getAllottedTimeMs, &tle::Request::setAllottedTimeMs) + .def_prop_rw("context_phase_params", &tle::Request::getContextPhaseParams, &tle::Request::setContextPhaseParams) + .def("__getstate__", requestGetstate) + .def("__setstate__", requestSetstate); + request.attr("BATCHED_POST_PROCESSOR_NAME") = tle::Request::kBatchedPostProcessorName; + + nb::class_(m, "SpeculativeDecodingFastLogitsInfo") + .def(nb::init<>()) + .def_rw("draft_request_id", &tle::SpeculativeDecodingFastLogitsInfo::draftRequestId) + .def_rw("draft_participant_id", &tle::SpeculativeDecodingFastLogitsInfo::draftParticipantId) + .def("to_tensor", &tle::SpeculativeDecodingFastLogitsInfo::toTensor); + + auto requestPerfMetrics = nb::class_(m, "RequestPerfMetrics"); + + auto timingMetricsGetstate = [](tle::RequestPerfMetrics::TimingMetrics const& self) + { + return nb::make_tuple(self.arrivalTime, self.firstScheduledTime, self.firstTokenTime, self.lastTokenTime, + self.kvCacheTransferStart, self.kvCacheTransferEnd, self.kvCacheSize); + }; + auto timingMetricsSetstate = [](tle::RequestPerfMetrics::TimingMetrics& timingMetrics, nb::tuple const& state) + { + if (state.size() != 7) + { + throw std::runtime_error("Invalid TimingMetrics state!"); + } + new (&timingMetrics) + tle::RequestPerfMetrics::TimingMetrics{nb::cast(state[0]), + nb::cast(state[1]), + nb::cast(state[2]), + nb::cast(state[3]), + nb::cast(state[4]), + nb::cast(state[5]), nb::cast(state[6])}; + }; + nb::class_(m, "TimingMetrics") + .def(nb::init<>()) + .def_rw("arrival_time", &tle::RequestPerfMetrics::TimingMetrics::arrivalTime) + .def_rw("first_scheduled_time", &tle::RequestPerfMetrics::TimingMetrics::firstScheduledTime) + .def_rw("first_token_time", &tle::RequestPerfMetrics::TimingMetrics::firstTokenTime) + .def_rw("last_token_time", &tle::RequestPerfMetrics::TimingMetrics::lastTokenTime) + .def_rw("kv_cache_transfer_start", &tle::RequestPerfMetrics::TimingMetrics::kvCacheTransferStart) + .def_rw("kv_cache_transfer_end", &tle::RequestPerfMetrics::TimingMetrics::kvCacheTransferEnd) + .def_rw("kv_cache_size", &tle::RequestPerfMetrics::TimingMetrics::kvCacheSize) + .def("__getstate__", timingMetricsGetstate) + .def("__setstate__", 
timingMetricsSetstate); + + auto kvCacheMetricsGetstate = [](tle::RequestPerfMetrics::KvCacheMetrics const& self) + { + return nb::make_tuple(self.numTotalAllocatedBlocks, self.numNewAllocatedBlocks, self.numReusedBlocks, + self.numMissedBlocks, self.kvCacheHitRate); + }; + auto kvCacheMetricsSetstate = [](tle::RequestPerfMetrics::KvCacheMetrics& kvCacheMetrics, nb::tuple const& state) + { + if (state.size() != 5) + { + throw std::runtime_error("Invalid KvCacheMetrics state!"); + } + new (&kvCacheMetrics) + tle::RequestPerfMetrics::KvCacheMetrics{nb::cast(state[0]), nb::cast(state[1]), + nb::cast(state[2]), nb::cast(state[3]), nb::cast(state[4])}; + }; + nb::class_(m, "KvCacheMetrics") + .def(nb::init<>()) + .def_rw("num_total_allocated_blocks", &tle::RequestPerfMetrics::KvCacheMetrics::numTotalAllocatedBlocks) + .def_rw("num_new_allocated_blocks", &tle::RequestPerfMetrics::KvCacheMetrics::numNewAllocatedBlocks) + .def_rw("num_reused_blocks", &tle::RequestPerfMetrics::KvCacheMetrics::numReusedBlocks) + .def_rw("num_missed_blocks", &tle::RequestPerfMetrics::KvCacheMetrics::numMissedBlocks) + .def_rw("kv_cache_hit_rate", &tle::RequestPerfMetrics::KvCacheMetrics::kvCacheHitRate) + .def("__getstate__", kvCacheMetricsGetstate) + .def("__setstate__", kvCacheMetricsSetstate); + + auto speculativeDecodingMetricsGetstate = [](tle::RequestPerfMetrics::SpeculativeDecodingMetrics const& self) + { return nb::make_tuple(self.acceptanceRate, self.totalAcceptedDraftTokens, self.totalDraftTokens); }; + auto speculativeDecodingMetricsSetstate + = [](tle::RequestPerfMetrics::SpeculativeDecodingMetrics& speculativeDecodingMetrics, nb::tuple const& state) + { + if (state.size() != 3) + { + throw std::runtime_error("Invalid SpeculativeDecodingMetrics state!"); + } + new (&speculativeDecodingMetrics) tle::RequestPerfMetrics::SpeculativeDecodingMetrics{ + nb::cast(state[0]), nb::cast(state[1]), nb::cast(state[2])}; + }; + + nb::class_(m, "SpeculativeDecodingMetrics") + .def(nb::init<>()) + .def_rw("acceptance_rate", &tle::RequestPerfMetrics::SpeculativeDecodingMetrics::acceptanceRate) + .def_rw("total_accepted_draft_tokens", + &tle::RequestPerfMetrics::SpeculativeDecodingMetrics::totalAcceptedDraftTokens) + .def_rw("total_draft_tokens", &tle::RequestPerfMetrics::SpeculativeDecodingMetrics::totalDraftTokens) + .def("__getstate__", speculativeDecodingMetricsGetstate) + .def("__setstate__", speculativeDecodingMetricsSetstate); + + auto requestPerfMetricsGetstate = [](tle::RequestPerfMetrics const& self) + { + return nb::make_tuple(self.timingMetrics, self.kvCacheMetrics, self.speculativeDecoding, self.firstIter, + self.lastIter, self.iter); + }; + auto requestPerfMetricsSetstate = [](tle::RequestPerfMetrics& requestPerfMetrics, nb::tuple const& state) + { + if (state.size() != 6) + { + throw std::runtime_error("Invalid RequestPerfMetrics state!"); + } + new (&requestPerfMetrics) tle::RequestPerfMetrics{nb::cast(state[0]), + nb::cast(state[1]), + nb::cast(state[2]), + nb::cast>(state[3]), + nb::cast>(state[4]), + nb::cast>(state[5])}; + }; + + // There's a circular dependency between the declaration of the TimingMetrics and RequestPerfMetrics bindings. + // Defer definition of the RequestPerfMetrics bindings until the TimingMetrics have been defined. 
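// Standalone sketch of the declare-first, define-later pattern referenced by the comments above
// and used for KvCacheRetentionConfig and RequestPerfMetrics in this file. The `Outer`/`Inner`
// names and the `deferred_sketch` module are hypothetical, not TensorRT-LLM types: the outer
// nb::class_ handle is created up front, the dependent type is registered (here nested under the
// outer class, as with TokenRangeRetentionConfig), and only then are the outer type's members
// bound, so that signatures and defaults involving the dependent type resolve against an
// already-registered binding.

#include <nanobind/nanobind.h>

namespace nb = nanobind;

struct Inner
{
    int value{0};
};

struct Outer
{
    Inner inner;
};

NB_MODULE(deferred_sketch, m)
{
    // Declare the outer class first so it exists as a Python type object.
    auto outer = nb::class_<Outer>(m, "Outer");

    // Register the dependent type, scoped under the outer class (it becomes Outer.Inner).
    nb::class_<Inner>(outer, "Inner").def(nb::init<>()).def_rw("value", &Inner::value);

    // Defer the outer class' own members until the dependent type is bound.
    outer.def(nb::init<>()).def_rw("inner", &Outer::inner);
}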
+ requestPerfMetrics.def(nb::init<>()) + .def_rw("timing_metrics", &tle::RequestPerfMetrics::timingMetrics) + .def_rw("kv_cache_metrics", &tle::RequestPerfMetrics::kvCacheMetrics) + .def_rw("speculative_decoding", &tle::RequestPerfMetrics::speculativeDecoding) + .def_rw("first_iter", &tle::RequestPerfMetrics::firstIter) + .def_rw("last_iter", &tle::RequestPerfMetrics::lastIter) + .def_rw("iter", &tle::RequestPerfMetrics::iter) + .def("__getstate__", requestPerfMetricsGetstate) + .def("__setstate__", requestPerfMetricsSetstate); + + nb::class_(m, "AdditionalOutput") + .def("__init__ ", + [](tle::AdditionalOutput const& self, std::string const& name, tle::Tensor const& output) + { return std::make_unique(name, output); }) + .def_rw("name", &tle::AdditionalOutput::name) + .def_rw("output", &tle::AdditionalOutput::output); + + auto resultSetstate = [](tle::Result& result, nb::tuple const& state) + { + if (state.size() != 13) + { + throw std::runtime_error("Invalid Request state!"); + } + new (&result) tle::Result(); + result.isFinal = nb::cast(state[0]); + result.outputTokenIds = nb::cast>(state[1]); + result.cumLogProbs = nb::cast>>(state[2]); + result.logProbs = nb::cast>>>(state[3]); + result.contextLogits = nb::cast>(state[4]); + result.generationLogits = nb::cast>(state[5]); + result.encoderOutput = nb::cast>(state[6]); + result.finishReasons = nb::cast>(state[7]); + result.sequenceIndex = nb::cast(state[8]); + result.isSequenceFinal = nb::cast(state[9]); + result.decodingIter = nb::cast(state[10]); + result.contextPhaseParams = nb::cast>(state[11]); + result.requestPerfMetrics = nb::cast>(state[12]); + }; + + auto resultGetstate = [](tle::Result const& self) + { + return nb::make_tuple(self.isFinal, self.outputTokenIds, self.cumLogProbs, self.logProbs, self.contextLogits, + self.generationLogits, self.encoderOutput, self.finishReasons, self.sequenceIndex, self.isSequenceFinal, + self.decodingIter, self.contextPhaseParams, self.requestPerfMetrics); + }; + + nb::class_(m, "Result") + .def(nb::init<>()) + .def_rw("is_final", &tle::Result::isFinal) + .def_rw("output_token_ids", &tle::Result::outputTokenIds) + .def_rw("cum_log_probs", &tle::Result::cumLogProbs) + .def_rw("log_probs", &tle::Result::logProbs) + .def_rw("context_logits", &tle::Result::contextLogits) + .def_rw("generation_logits", &tle::Result::generationLogits) + .def_rw("spec_dec_fast_logits_info", &tle::Result::specDecFastLogitsInfo) + .def_rw("encoder_output", &tle::Result::encoderOutput) + .def_rw("finish_reasons", &tle::Result::finishReasons) + .def_rw("sequence_index", &tle::Result::sequenceIndex) + .def_rw("is_sequence_final", &tle::Result::isSequenceFinal) + .def_rw("decoding_iter", &tle::Result::decodingIter) + .def_rw("context_phase_params", &tle::Result::contextPhaseParams) + .def_rw("request_perf_metrics", &tle::Result::requestPerfMetrics) + .def_rw("additional_outputs", &tle::Result::additionalOutputs) + .def("__getstate__", resultGetstate) + .def("__setstate__", resultSetstate); + + m.def("deserialize_result", + [](nb::bytes& x) + { + std::string str(x.c_str(), x.size()); + std::istringstream is(str); + return tle::serialize_utils::deserialize(is); + }); + + auto responseGetstate = [](tle::Response const& self) + { return nb::make_tuple(self.getRequestId(), self.getResult(), self.getClientId()); }; + + auto responseSetstate = [](tle::Response& response, nb::tuple const& state) + { + if (state.size() != 3) + { + throw std::runtime_error("Invalid Request state!"); + } + new (&response) tle::Response( + 
nb::cast(state[0]), nb::cast(state[1]), nb::cast(state[2])); + }; + + nb::class_(m, "Response") + .def(nb::init>(), nb::arg("request_id"), nb::arg("error_msg"), + nb::arg("client_id") = std::nullopt) + .def(nb::init>(), nb::arg("request_id"), nb::arg("result"), + nb::arg("client_id") = std::nullopt) + .def_prop_ro("request_id", &tle::Response::getRequestId) + .def_prop_ro("client_id", &tle::Response::getClientId) + .def("has_error", &tle::Response::hasError) + .def_prop_ro("error_msg", &tle::Response::getErrorMsg) + .def_prop_ro("result", &tle::Response::getResult) + .def("clear_context_logits", + [](tle::Response& self) + { + if (!self.hasError()) + { + auto& result = const_cast(self.getResult()); + result.contextLogits.reset(); + } + }) + .def("clear_generation_logits", + [](tle::Response& self) + { + if (!self.hasError()) + { + auto& result = const_cast(self.getResult()); + result.generationLogits.reset(); + } + }) + .def("__getstate__", responseGetstate) + .def("__setstate__", responseSetstate); +} + +} // namespace tensorrt_llm::nanobind::executor diff --git a/cpp/tensorrt_llm/nanobind/executor/request.h b/cpp/tensorrt_llm/nanobind/executor/request.h new file mode 100644 index 00000000000..5a5cf9acbee --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/executor/request.h @@ -0,0 +1,29 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +namespace nb = nanobind; + +namespace tensorrt_llm::nanobind::executor +{ + +// Register bindings for executor API. +void initRequestBindings(nb::module_& m); + +} // namespace tensorrt_llm::nanobind::executor diff --git a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp new file mode 100644 index 00000000000..f3be85bbbf2 --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp @@ -0,0 +1,388 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "bindings.h" +#include "moeBindings.h" +#include "tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h" +#include "tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h" +#include "tensorrt_llm/kernels/customAllReduceKernels.h" +#include "tensorrt_llm/kernels/delayStream.h" +#include "tensorrt_llm/nanobind/common/customCasters.h" +#include "tensorrt_llm/runtime/cudaEvent.h" +#include "tensorrt_llm/runtime/cudaStream.h" +#include "tensorrt_llm/runtime/decoderState.h" +#include "tensorrt_llm/runtime/decodingInput.h" +#include "tensorrt_llm/runtime/decodingOutput.h" +#include "tensorrt_llm/runtime/gptDecoder.h" +#include "tensorrt_llm/runtime/gptDecoderBatched.h" +#include "tensorrt_llm/runtime/iBuffer.h" +#include "tensorrt_llm/runtime/iGptDecoderBatched.h" +#include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/ipcUtils.h" +#include "tensorrt_llm/runtime/lookaheadBuffers.h" +#include "tensorrt_llm/runtime/loraCache.h" +#include "tensorrt_llm/runtime/mcastGPUBuffer.h" +#include "tensorrt_llm/runtime/request.h" +#include "tensorrt_llm/runtime/speculativeDecodingMode.h" +#include "tensorrt_llm/runtime/tllmRuntime.h" +#include "tensorrt_llm/runtime/torchView.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +namespace tr = tensorrt_llm::runtime; +namespace te = tensorrt_llm::executor; + +class PyIGptDecoder : public tr::IGptDecoder +{ +public: + NB_TRAMPOLINE(tr::IGptDecoder, 5); + + void setup(tr::SamplingConfig const& samplingConfig, size_t batchSize, + tr::DecodingInput::TensorConstPtr const& batchSlots, + std::optional const& output = std::nullopt, + std::optional explicitDraftTokensDType = std::nullopt, + std::optional> const& lookaheadPrompt = std::nullopt, + std::optional> const& lookaheadAlgoConfigs = std::nullopt) override + { + NB_OVERRIDE_PURE(setup, samplingConfig, batchSize, batchSlots, output, explicitDraftTokensDType, + lookaheadPrompt, lookaheadAlgoConfigs); + } + + void forwardAsync(tr::DecodingOutput& output, tr::DecodingInput const& input) override + { + NB_OVERRIDE_PURE(forwardAsync, output, input); + } + + void forwardSync(tr::DecodingOutput& output, tr::DecodingInput const& input) override + { + NB_OVERRIDE_PURE(forwardSync, output, input); + } + + tr::SamplingConfig const& getSamplingConfig() override + { + NB_OVERRIDE_PURE(getSamplingConfig); + } + + void disableLookahead(std::optional const& samplingConfig, tr::SizeType32 batchSize, + tr::DecodingInput::TensorConstPtr batchSlots) override + { + NB_OVERRIDE_PURE(disableLookahead, samplingConfig, batchSize, batchSlots); + } +}; + +namespace tensorrt_llm::nanobind::runtime +{ + +void initBindings(nb::module_& m) +{ + + nb::class_(m, "TaskLayerModuleConfig") + .def(nb::init<>()) + .def_rw("page_id", &tr::LoraCache::TaskLayerModuleConfig::pageId) + .def_rw("slot_idx", &tr::LoraCache::TaskLayerModuleConfig::slotIdx) + .def_rw("in_size", &tr::LoraCache::TaskLayerModuleConfig::inSize) + .def_rw("out_size", &tr::LoraCache::TaskLayerModuleConfig::outSize) + .def_rw("module_id", &tr::LoraCache::TaskLayerModuleConfig::moduleId) + .def_rw("layer_id", &tr::LoraCache::TaskLayerModuleConfig::layerId) + .def_rw("adapter_size", &tr::LoraCache::TaskLayerModuleConfig::adapterSize) + .def_rw("num_slots", &tr::LoraCache::TaskLayerModuleConfig::numSlots) + .def_rw("weights_in_pointer", &tr::LoraCache::TaskLayerModuleConfig::weightsInPointer) + .def_rw("weights_out_pointer", 
&tr::LoraCache::TaskLayerModuleConfig::weightsOutPointer) + .def_rw("scaling_vec_pointer", &tr::LoraCache::TaskLayerModuleConfig::scalingVecPointer) + .def(nb::self == nb::self); + + nb::class_(m, "BufferManager") + .def(nb::init(), nb::arg("stream"), nb::arg("trim_pool") = false) + .def_prop_ro("stream", &tr::BufferManager::getStream); + + nb::class_(m, "TllmRuntime") + .def( + "__init__", + [](tr::TllmRuntime* self, std::filesystem::path engine_path, float gpu_weights_percent = 1.0f, + bool use_shape_inference = true) + { + // Using default logger by passing nullptr + new (self) + tr::TllmRuntime(tr::RawEngine(engine_path), nullptr, gpu_weights_percent, use_shape_inference); + }, + nb::arg("engine_path"), nb::arg("gpu_weights_percent") = 1.0f, nb::arg("use_shape_inference") = true) + .def( + "__init__", + [](tr::TllmRuntime* self, nb::ndarray engine_buffer, float gpu_weights_percent = 1.0f, + bool use_shape_inference = true) + { + if (engine_buffer.ndim() != 1) + throw std::runtime_error("Expected 1-D array for engine buffer"); + new (self) tr::TllmRuntime(tr::RawEngine(engine_buffer.data(), engine_buffer.size()), nullptr, + gpu_weights_percent, use_shape_inference); + }, + nb::arg("engine_buffer"), nb::arg("gpu_weights_percent") = 1.0f, nb::arg("use_shape_inference") = true) + .def_prop_ro("num_contexts", &tr::TllmRuntime::getNbContexts) + .def_prop_ro("num_profiles", &tr::TllmRuntime::getNbProfiles) + .def("get_opt_profile_id", &tr::TllmRuntime::getOptProfileId, nb::arg("num_tokens"), nb::arg("split_points")) + .def("clear_contexts", &tr::TllmRuntime::clearContexts) + .def("execute_context", &tr::TllmRuntime::executeContext, nb::arg("context_id")) + .def_prop_ro("stream_ptr", &tr::TllmRuntime::getStreamPtr) + .def_prop_ro("buffer_manager", + static_cast(&tr::TllmRuntime::getBufferManager)) + .def("set_layer_profiler", &tr::TllmRuntime::setLayerProfiler) + .def("has_layer_profiler", &tr::TllmRuntime::hasLayerProfiler, nb::arg("context_id")) + .def_prop_ro("layer_profiler_info", &tr::TllmRuntime::getLayerProfileInfo) + .def("report_to_profiler", &tr::TllmRuntime::reportToProfiler, nb::arg("context_id")) + .def_prop_ro("logits_dtype_from_engine", + [](tr::TllmRuntime& self) { return self.getEngine().getTensorDataType("logits"); }); + + nb::class_(m, "Request") + .def(nb::init, + std::optional>(), + nb::arg("ids"), nb::arg("input_len"), nb::arg("max_new_tokens") = std::nullopt, + nb::arg("end_id") = std::nullopt) + .def_rw("ids", &tr::decoder_batch::Request::ids) + .def_rw("input_len", &tr::decoder_batch::Request::inputLen) + .def_rw("max_new_tokens", &tr::decoder_batch::Request::maxNewTokens) + .def_rw("end_id", &tr::decoder_batch::Request::endId) + .def_rw("draft_logits", &tr::decoder_batch::Request::draftLogits) + .def_rw("embedding_bias", &tr::decoder_batch::Request::embeddingBias) + .def_rw("bad_words_list", &tr::decoder_batch::Request::badWordsList) + .def_rw("stop_words_list", &tr::decoder_batch::Request::stopWordsList) + .def_rw("generated_tokens_per_engine_step", &tr::decoder_batch::Request::generatedTokensPerEngineStep) + .def_rw("medusa_paths", &tr::decoder_batch::Request::medusaPaths) + .def_rw("medusa_tree_ids", &tr::decoder_batch::Request::medusaTreeIds) + .def_rw("lookahead_runtime_config", &tr::decoder_batch::Request::lookaheadRuntimeConfig); + nb::bind_vector>(m, "RequestVector"); + + nb::class_(m, "DecoderBatchInput") + .def(nb::init>, tr::SizeType32>(), nb::arg("logits"), + nb::arg("max_decoding_engine_tokens")) + .def(nb::init>(), nb::arg("logits")) + .def_rw("logits", 
&tr::decoder_batch::Input::logits) + .def_rw("max_decoder_steps", &tr::decoder_batch::Input::maxDecoderSteps) + .def_rw("batch_slots", &tr::decoder_batch::Input::batchSlots); + + nb::class_(m, "LookaheadDecodingBuffers") + .def(nb::init(), nb::arg("max_num_sequences"), + nb::arg("max_tokens_per_step"), nb::arg("buffer_manager")) + .def_rw("generation_lengths", &tr::LookaheadDecodingBuffers::generationLengths) + .def_rw("position_offsets", &tr::LookaheadDecodingBuffers::positionOffsets) + .def_rw("packed_masks", &tr::LookaheadDecodingBuffers::packedMasks) + .def_rw("position_ids", &tr::LookaheadDecodingBuffers::positionIds); + + nb::class_(m, "ExplicitDraftTokensBuffersInputs") + .def("create", &tr::ExplicitDraftTokensBuffers::Inputs::create, nb::arg("max_num_sequences"), + nb::arg("runtime"), nb::arg("model_config"), nb::arg("world_config")) + .def_rw("temperatures", &tr::ExplicitDraftTokensBuffers::Inputs::temperatures) + .def_rw("position_ids_base", &tr::ExplicitDraftTokensBuffers::Inputs::positionIdsBase) + .def_rw("generation_lengths", &tr::ExplicitDraftTokensBuffers::Inputs::generationLengths) + .def_rw("random_data_sample", &tr::ExplicitDraftTokensBuffers::Inputs::randomDataSample) + .def_rw("random_data_validation", &tr::ExplicitDraftTokensBuffers::Inputs::randomDataValidation) + .def_rw("draft_tokens", &tr::ExplicitDraftTokensBuffers::Inputs::draftTokens) + .def_rw("draft_indices", &tr::ExplicitDraftTokensBuffers::Inputs::draftIndices) + .def_rw("draft_probs", &tr::ExplicitDraftTokensBuffers::Inputs::draftProbs) + .def_rw("packed_masks", &tr::ExplicitDraftTokensBuffers::Inputs::packedMasks) + .def_rw("position_ids", &tr::ExplicitDraftTokensBuffers::Inputs::positionIds) + .def_rw("max_gen_length_host", &tr::ExplicitDraftTokensBuffers::Inputs::maxGenLengthHost) + .def_rw("generation_lengths_host", &tr::ExplicitDraftTokensBuffers::Inputs::generationLengthsHost); + + nb::class_(m, "DecodingInput"); + nb::class_(m, "DecodingOutput"); + + nb::class_(m, "CudaEvent") + .def(nb::init(), nb::arg("flags") = cudaEventDisableTiming) + .def("synchronize", &tr::CudaEvent::synchronize); + + nb::class_(m, "IGptDecoder") + .def( + "setup", + [](tr::IGptDecoder& self, tr::SamplingConfig const& samplingConfig, size_t batchSize, + at::Tensor const& batchSlots, std::optional const& output = std::nullopt, + std::optional explicitDraftTokensDType = std::nullopt, + std::optional> const& lookaheadPrompt = std::nullopt, + std::optional> const& lookaheadAlgoConfigs = std::nullopt) + { + auto tensorPtrBatchSlots = tr::TorchView::of(batchSlots); + self.setup(samplingConfig, batchSize, std::move(tensorPtrBatchSlots), output, explicitDraftTokensDType, + lookaheadPrompt, lookaheadAlgoConfigs); + }, + nb::arg("sampling_config"), nb::arg("batch_size"), nb::arg("batch_slots"), nb::arg("output") = std::nullopt, + nb::arg("explicit_draft_tokens_d_type") = std::nullopt, nb::arg("lookahead_prompt") = std::nullopt, + nb::arg("lookahead_algo_configs") = std::nullopt); + + nb::class_(m, "DecoderState") + .def(nb::init<>()) + .def("setup", &tr::decoder::DecoderState::setup, nb::arg("max_batch_size"), nb::arg("max_beam_width"), + nb::arg("max_attention_window"), nb::arg("sink_token_length"), nb::arg("max_sequence_length"), + nb::arg("dtype"), nb::arg("model_config"), nb::arg("world_config"), nb::arg("buffer_manager")) + .def("setup_cache_indirection", &tr::decoder::DecoderState::setupCacheIndirection, nb::arg("max_batch_size"), + nb::arg("max_beam_width"), nb::arg("max_attention_window"), nb::arg("buffer_manager")) + 
.def("setup_speculative_decoding", &tr::decoder::DecoderState::setupSpeculativeDecoding, + nb::arg("speculative_decoding_mode"), nb::arg("max_tokens_per_engine_step"), nb::arg("dtype"), + nb::arg("model_config"), nb::arg("world_config"), nb::arg("buffer_manager")) + .def_prop_ro("joint_decoding_input", &tr::decoder::DecoderState::getJointDecodingInput) + .def_prop_ro("joint_decoding_output", &tr::decoder::DecoderState::getJointDecodingOutput) + .def_prop_ro("cache_indirection_input", &tr::decoder::DecoderState::getCacheIndirectionInput) + .def_prop_ro("cache_indirection_output", &tr::decoder::DecoderState::getCacheIndirectionOutput) + .def_prop_ro( + "sequence_lengths", nb::overload_cast<>(&tr::decoder::DecoderState::getSequenceLengths, nb::const_)) + .def("get_sequence_lengths", + nb::overload_cast(&tr::decoder::DecoderState::getSequenceLengths, nb::const_), + nb::arg("batch_idx")) + .def_prop_ro("all_new_tokens", &tr::decoder::DecoderState::getAllNewTokens) + .def_prop_ro("finished_sum", &tr::decoder::DecoderState::getFinishedSum) + .def_prop_ro("finish_reasons", &tr::decoder::DecoderState::getFinishReasons) + .def_prop_ro("ids", nb::overload_cast<>(&tr::decoder::DecoderState::getIds, nb::const_)) + .def("get_ids", nb::overload_cast(&tr::decoder::DecoderState::getIds, nb::const_), + nb::arg("batch_idx")) + .def_prop_ro("gathered_ids", nb::overload_cast<>(&tr::decoder::DecoderState::getGatheredIds, nb::const_)) + .def("get_gathered_ids", + nb::overload_cast(&tr::decoder::DecoderState::getGatheredIds, nb::const_), + nb::arg("batch_idx")) + .def_prop_ro("parent_ids", &tr::decoder::DecoderState::getParentIds) + .def_prop_ro("cum_log_probs", nb::overload_cast<>(&tr::decoder::DecoderState::getCumLogProbs, nb::const_)) + .def("get_cum_log_probs", + nb::overload_cast(&tr::decoder::DecoderState::getCumLogProbs, nb::const_), + nb::arg("batch_idx")) + .def_prop_ro("log_probs", nb::overload_cast<>(&tr::decoder::DecoderState::getLogProbs, nb::const_)) + .def("get_log_probs", nb::overload_cast(&tr::decoder::DecoderState::getLogProbs, nb::const_), + nb::arg("batch_idx")) + .def_prop_ro("next_draft_tokens", &tr::decoder::DecoderState::getNextDraftTokens) + .def_prop_ro("prev_draft_tokens_lengths", &tr::decoder::DecoderState::getPrevDraftTokensLengths) + .def_prop_ro("next_draft_tokens_lengths", &tr::decoder::DecoderState::getNextDraftTokensLengths) + .def_prop_ro("accepted_lengths_cum_sum", &tr::decoder::DecoderState::getAcceptedLengthsCumSum) + .def_prop_ro("accepted_packed_paths", &tr::decoder::DecoderState::getAcceptedPackedPaths) + .def_prop_ro("finished_steps", &tr::decoder::DecoderState::getFinishedSteps) + .def_prop_ro("max_beam_width", &tr::decoder::DecoderState::getMaxBeamWidth) + .def_prop_ro("max_sequence_length", &tr::decoder::DecoderState::getMaxSequenceLength) + .def_prop_ro("max_decoding_decoder_tokens", &tr::decoder::DecoderState::getMaxDecodingDecoderTokens) + .def_prop_ro("max_decoding_engine_tokens", &tr::decoder::DecoderState::getMaxDecodingEngineTokens) + .def_prop_ro("num_decoding_engine_tokens", + nb::overload_cast<>(&tr::decoder::DecoderState::getNumDecodingEngineTokens, nb::const_)) + .def("get_num_decoding_engine_tokens", + nb::overload_cast(&tr::decoder::DecoderState::getNumDecodingEngineTokens, nb::const_), + nb::arg("batch_idx")) + .def("set_num_decoding_engine_tokens", &tr::decoder::DecoderState::setNumDecodingEngineTokens, + nb::arg("batch_idx"), nb::arg("num_tokens")) + .def_prop_ro("speculative_decoding_mode", &tr::decoder::DecoderState::getSpeculativeDecodingMode) + 
.def_prop_rw("generation_steps", &tr::decoder::DecoderState::getGenerationSteps, + &tr::decoder::DecoderState::setGenerationSteps); + + nb::class_(m, "GptDecoderBatched") + .def(nb::init(), nb::arg("stream")) + .def("setup", &tr::GptDecoderBatched::setup, nb::arg("mode"), nb::arg("max_batch_size"), + nb::arg("max_beam_width"), nb::arg("dtype"), nb::arg("model_config"), nb::arg("world_config")) + .def("forward_async", &tr::GptDecoderBatched::forwardAsync, nb::arg("output"), nb::arg("input")) + .def("underlying_decoder", &tr::GptDecoderBatched::getUnderlyingDecoder, nb::rv_policy::reference) + .def("finalize", &tr::GptDecoderBatched::finalize, nb::arg("decoder_state"), nb::arg("batch_idx"), + nb::arg("sampling_config"), nb::arg("streaming")) + .def_prop_ro( + "decoder_stream", + [](tr::GptDecoderBatched& self) -> tr::CudaStream const& { return *self.getDecoderStream(); }, + nb::rv_policy::reference); + + m.def( + "lamport_initialize_all", + [](intptr_t buffer_0, intptr_t buffer_1, intptr_t buffer_2, size_t size) + { + tr::lamportInitializeAll(reinterpret_cast(buffer_0), reinterpret_cast(buffer_1), + reinterpret_cast(buffer_2), size); + }, + "Lamport initialize all buffers"); + m.def( + "lamport_initialize", + [](intptr_t buffer, size_t size) + { tensorrt_llm::kernels::ar_fusion::lamport_initialize(reinterpret_cast(buffer), size, 0); }, + "Lmaport initialize buffer"); + m.def( + "delay_kernel", + [](int64_t delay_micro_secs, nb::object py_stream) + { + // Get the raw stream handle from PyTorch stream object + auto stream_ptr = nb::cast(py_stream.attr("cuda_stream")); + cudaStream_t stream = reinterpret_cast(stream_ptr); + tensorrt_llm::kernels::invokeDelayStreamKernel(delay_micro_secs, stream); + }, + "Delay kernel launch on the default stream"); + m.def( + "max_workspace_size_lowprecision", + [](int32_t tp_size) { return tensorrt_llm::kernels::max_workspace_size_lowprecision(tp_size); }, + "Calculate the maximum workspace size needed for low precision all-reduce operations"); + + nb::class_(m, "McastGPUBuffer") + .def(nb::init()) + .def("get_uc_buffer", &tensorrt_llm::runtime::McastGPUBuffer::getUCBuffer) + .def("get_mc_buffer", &tensorrt_llm::runtime::McastGPUBuffer::getMCBuffer); + + nb::enum_(m, "AllReduceFusionOp") + .value("NONE", tensorrt_llm::kernels::AllReduceFusionOp::NONE) + .value("RESIDUAL_RMS_NORM", tensorrt_llm::kernels::AllReduceFusionOp::RESIDUAL_RMS_NORM) + .value("LAST_PROCESS_FOR_UB", tensorrt_llm::kernels::AllReduceFusionOp::LAST_PROCESS_FOR_UB) + .value("RESIDUAL_RMS_PREPOST_NORM", tensorrt_llm::kernels::AllReduceFusionOp::RESIDUAL_RMS_PREPOST_NORM) + .value("RESIDUAL_RMS_NORM_QUANT_FP8", tensorrt_llm::kernels::AllReduceFusionOp::RESIDUAL_RMS_NORM_QUANT_FP8) + .value("RESIDUAL_RMS_NORM_QUANT_NVFP4", tensorrt_llm::kernels::AllReduceFusionOp::RESIDUAL_RMS_NORM_QUANT_NVFP4) + .value("RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4", + tensorrt_llm::kernels::AllReduceFusionOp::RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4) + .value("RESIDUAL_RMS_NORM_OUT_QUANT_FP8", + tensorrt_llm::kernels::AllReduceFusionOp::RESIDUAL_RMS_NORM_OUT_QUANT_FP8); + + nb::enum_(m, "AllReduceStrategy") + .value("NCCL", tensorrt_llm::kernels::AllReduceStrategyType::NCCL) + .value("MIN_LATENCY", tensorrt_llm::kernels::AllReduceStrategyType::MIN_LATENCY) + .value("AUTO", tensorrt_llm::kernels::AllReduceStrategyType::AUTO) + .value("UB", tensorrt_llm::kernels::AllReduceStrategyType::UB) + .value("ONESHOT", tensorrt_llm::kernels::AllReduceStrategyType::ONESHOT) + .value("TWOSHOT", 
tensorrt_llm::kernels::AllReduceStrategyType::TWOSHOT); + + // Initialize MoeLoadBalancer bindings + initMoeBindings(m); +} + +void initBindingsEarly(nb::module_& m) +{ + nb::class_(m, "SpeculativeDecodingMode") + .def(nb::init(), nb::arg("state")) + .def_static("NoneType", &tr::SpeculativeDecodingMode::None) + .def_static("DraftTokensExternal", &tr::SpeculativeDecodingMode::DraftTokensExternal) + .def_static("Medusa", &tr::SpeculativeDecodingMode::Medusa) + .def_static("Eagle", &tr::SpeculativeDecodingMode::Eagle) + .def_static("LookaheadDecoding", &tr::SpeculativeDecodingMode::LookaheadDecoding) + .def_static("ExplicitDraftTokens", &tr::SpeculativeDecodingMode::ExplicitDraftTokens) + .def_prop_ro("is_none", &tr::SpeculativeDecodingMode::isNone) + .def_prop_ro("is_draft_tokens_external", &tr::SpeculativeDecodingMode::isDraftTokensExternal) + .def_prop_ro("is_medusa", &tr::SpeculativeDecodingMode::isMedusa) + .def_prop_ro("is_eagle", &tr::SpeculativeDecodingMode::isEagle) + .def_prop_ro("is_lookahead_decoding", &tr::SpeculativeDecodingMode::isLookaheadDecoding) + .def_prop_ro("is_explicit_draft_tokens", &tr::SpeculativeDecodingMode::isExplicitDraftTokens) + .def_prop_ro("updates_position_ids", &tr::SpeculativeDecodingMode::updatesPositionIds) + .def_prop_ro("requires_attention_mask", &tr::SpeculativeDecodingMode::requiresAttentionMask) + .def_prop_ro("predicts_draft_tokens", &tr::SpeculativeDecodingMode::predictsDraftTokens) + .def_prop_ro("needs_kv_cache_rewind", &tr::SpeculativeDecodingMode::needsKVCacheRewind) + .def_prop_ro("variable_draft_length", &tr::SpeculativeDecodingMode::variableDraftLength) + .def_prop_ro("has_draft_logits", &tr::SpeculativeDecodingMode::hasDraftLogits) + .def_prop_ro("needs_decoder_prologue", &tr::SpeculativeDecodingMode::needsDecoderPrologue); +} +} // namespace tensorrt_llm::nanobind::runtime diff --git a/cpp/tensorrt_llm/nanobind/runtime/bindings.h b/cpp/tensorrt_llm/nanobind/runtime/bindings.h new file mode 100644 index 00000000000..410dac80b05 --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/runtime/bindings.h @@ -0,0 +1,30 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace nb = nanobind; + +namespace tensorrt_llm::nanobind::runtime +{ + +void initBindings(nb::module_& m); +void initBindingsEarly(nb::module_& m); + +} // namespace tensorrt_llm::nanobind::runtime diff --git a/cpp/tensorrt_llm/nanobind/runtime/moeBindings.cpp b/cpp/tensorrt_llm/nanobind/runtime/moeBindings.cpp new file mode 100644 index 00000000000..c26fa84b661 --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/runtime/moeBindings.cpp @@ -0,0 +1,124 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "moeBindings.h" +#include "tensorrt_llm/nanobind/common/customCasters.h" +#include "tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.h" +#include "tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.h" +#include +#include +#include + +namespace nb = nanobind; +namespace tr = tensorrt_llm::runtime; +namespace tk = tensorrt_llm::kernels; + +namespace tensorrt_llm::nanobind::runtime +{ + +void pyDoReplication(tk::MoeLoadBalanceMetaInfo const& metaInfo, std::vector& expertLoadFactor, + tr::MoePlacementCpuInfo* cpuPlacement) +{ + TLLM_CHECK_WITH_INFO( + metaInfo.expertCount == expertLoadFactor.size(), "expert_count and expert_load_factor size mismatch"); + tr::doReplication(metaInfo, expertLoadFactor.data(), cpuPlacement); +}; + +void pyDoPlacement(tk::MoeLoadBalanceMetaInfo const& metaInfo, std::vector& expertLoadFactor, + tr::MoePlacementCpuInfo* cpuPlacement) +{ + TLLM_CHECK_WITH_INFO( + metaInfo.expertCount == expertLoadFactor.size(), "expert_count and expert_load_factor size mismatch"); + tr::doPlacement(metaInfo, expertLoadFactor.data(), cpuPlacement); +}; + +void initMoeBindings(nb::module_& m) +{ + // Bind MoeWeight struct + nb::class_(m, "MoeWeight") + .def(nb::init<>()) + .def_prop_rw("weight_ptr", &tr::MoeWeight::getWeightPtr, &tr::MoeWeight::setWeightPtr) + .def_rw("height", &tr::MoeWeight::mHeight) + .def_rw("width", &tr::MoeWeight::mWidth) + .def_rw("pitch", &tr::MoeWeight::mPitch) + .def("__repr__", + [](tr::MoeWeight const& self) + { + return ""; + }); + + // Bind MoeLoadBalanceMetaInfo struct + nb::class_(m, "MoeLoadBalanceMetaInfo") + .def(nb::init(), nb::arg("expert_count"), nb::arg("top_k"), nb::arg("ep_rank"), + nb::arg("ep_size"), nb::arg("slot_count_per_rank")) + .def_rw("expert_count", &tk::MoeLoadBalanceMetaInfo::expertCount) + .def_rw("top_k", &tk::MoeLoadBalanceMetaInfo::topK) + .def_rw("ep_rank", &tk::MoeLoadBalanceMetaInfo::epRank) + .def_rw("ep_size", &tk::MoeLoadBalanceMetaInfo::epSize) + .def_rw("slot_count_per_rank", &tk::MoeLoadBalanceMetaInfo::slotCountPerRank); + + // Bind MoePlacementCpuInfo struct + nb::class_(m, "MoePlacementCpuInfo") + .def(nb::init<>()) + .def_rw("expert_replica_count", &tr::MoePlacementCpuInfo::expertReplicaCount) + .def_rw("rank_expert_ids", &tr::MoePlacementCpuInfo::rankExpertIds); + + // Bind SingleLayerMoeLoadBalancer class + nb::class_(m, "SingleLayerMoeLoadBalancer") + .def("add_single_weight_slot", &tr::SingleLayerMoeLoadBalancer::addSingleWeightSlot, nb::arg("slot_id"), + nb::arg("name"), nb::arg("weight_slot"), "Add a single weight slot for a specific slot ID") + .def("add_single_host_weight", &tr::SingleLayerMoeLoadBalancer::addSingleHostWeight, nb::arg("expert_id"), + nb::arg("name"), nb::arg("host_weight"), "Add a single host weight for a specific expert ID") + .def("set_initial_weight_assignments", &tr::SingleLayerMoeLoadBalancer::setInitialWeightAssignments, + 
nb::arg("initial_weight_assignments"), "Set initial weight assignments for each slot") + .def("get_pointer", &tr::SingleLayerMoeLoadBalancer::getSelfPtr, + "Get the pointer of the SingleLayerMoeLoadBalancer") + .def("get_layer_id", &tr::SingleLayerMoeLoadBalancer::getLayerId, + "Get the layer id of the SingleLayerMoeLoadBalancer"); + + // Bind MoeLoadBalancer class + nb::class_(m, "MoeLoadBalancer") + .def(nb::init(), nb::arg("ep_rank"), nb::arg("ep_size"), nb::arg("layer_updates_per_iter"), + "Initialize the MoeLoadBalancer with the specified expert parallel rank, size, and update frequency") + .def("set_use_gpu_memcpy", &tr::MoeLoadBalancer::setUseGpuMemcpy, nb::arg("use_gpu_memcpy"), + "Set whether to use GPU memcpy for weight updates") + .def("add_layer", &tr::MoeLoadBalancer::AddLayer, nb::arg("expert_count"), nb::arg("top_k"), + nb::arg("slot_count_per_rank"), "Add a new MOE layer to the load balancer") + .def("finalize_model", &tr::MoeLoadBalancer::finalizeModel, + "Finalize the model structure, must be called after all layers are added") + .def("set_warm_up_iter_count", &tr::MoeLoadBalancer::setWarmUpIterCount, nb::arg("iter_count"), + "Set the number of warm-up iterations") + .def("start_iter", &tr::MoeLoadBalancer::startIter, nb::arg("iter_id"), nb::arg("enable_statistic"), + nb::arg("enable_update_weights"), "Start a new iteration with the given ID and settings") + .def("end_iter", &tr::MoeLoadBalancer::endIter, nb::arg("iter_id"), "End the iteration with the given ID") + .def("shutdown", &tr::MoeLoadBalancer::shutdown, "Shutdown the load balancer and clean up resources"); + + m.def("is_host_accessible_device_memory_supported", &tr::HostAccessibleDeviceAllocator::isSupported, + "If current system support host accessible device memory"); + + // Bind do_replication function for testing + m.def("do_replication", &pyDoReplication, nb::arg("meta_info"), nb::arg("expert_load_factor"), + nb::arg("cpu_placement"), "Do replication"); + + // Bind do_placement function for testing + m.def("do_placement", &pyDoPlacement, nb::arg("meta_info"), nb::arg("expert_load_factor"), nb::arg("cpu_placement"), + "Do placement"); +} + +} // namespace tensorrt_llm::nanobind::runtime diff --git a/cpp/tensorrt_llm/nanobind/runtime/moeBindings.h b/cpp/tensorrt_llm/nanobind/runtime/moeBindings.h new file mode 100644 index 00000000000..73b9a3ceec8 --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/runtime/moeBindings.h @@ -0,0 +1,29 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace nb = nanobind; + +namespace tensorrt_llm::nanobind::runtime +{ + +void initMoeBindings(nb::module_& m); + +} // namespace tensorrt_llm::nanobind::runtime diff --git a/cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.cpp b/cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.cpp new file mode 100644 index 00000000000..caef94c5def --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.cpp @@ -0,0 +1,87 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "modelSpecBinding.h" +#include "tensorrt_llm/nanobind/common/customCasters.h" +#include "tensorrt_llm/testing/modelSpec.h" + +#include + +namespace nb = nanobind; +using tensorrt_llm::testing::ModelSpec; +using tensorrt_llm::testing::KVCacheType; +using tensorrt_llm::testing::QuantMethod; +using tensorrt_llm::testing::OutputContentType; + +namespace tensorrt_llm::nanobind::testing +{ + +void initBindings(nb::module_& m) +{ + nb::enum_(m, "QuantMethod", nb::is_arithmetic(), "Quantization Method") + .value("NONE", QuantMethod::kNONE, "No Quantization") + .value("SMOOTH_QUANT", QuantMethod::kSMOOTH_QUANT, "Smooth Quantization"); + + nb::enum_(m, "OutputContentType", nb::is_arithmetic(), "Output Content Type") + .value("NONE", OutputContentType::kNONE, "No Output Content") + .value("CONTEXT_LOGITS", OutputContentType::kCONTEXT_LOGITS, "Context Logits") + .value("GENERATION_LOGITS", OutputContentType::kGENERATION_LOGITS, "Generation Logits") + .value("LOG_PROBS", OutputContentType::kLOG_PROBS, "Log Probs") + .value("CUM_LOG_PROBS", OutputContentType::kCUM_LOG_PROBS, "Cumulative Log"); + + nb::class_(m, "ModelSpec") + .def(nb::init()) + .def("use_gpt_plugin", &ModelSpec::useGptAttentionPlugin, nb::rv_policy::reference_internal) + .def("use_packed_input", &ModelSpec::usePackedInput, nb::rv_policy::reference_internal) + .def("set_kv_cache_type", &ModelSpec::setKVCacheType, nb::rv_policy::reference_internal) + .def("use_decoder_per_request", &ModelSpec::useDecoderPerRequest, nb::rv_policy::reference_internal) + .def("use_tensor_parallelism", &ModelSpec::useTensorParallelism, nb::rv_policy::reference_internal) + .def("use_pipeline_parallelism", &ModelSpec::usePipelineParallelism, nb::rv_policy::reference_internal) + .def("use_context_parallelism", &ModelSpec::useContextParallelism, nb::rv_policy::reference_internal) + .def("set_draft_tokens", &ModelSpec::setDraftTokens, nb::rv_policy::reference_internal) + .def("use_accept_by_logits", &ModelSpec::useAcceptByLogits, nb::rv_policy::reference_internal) + .def("use_mamba_plugin", &ModelSpec::useMambaPlugin, nb::rv_policy::reference_internal) + .def("gather_logits", &ModelSpec::gatherLogits, nb::rv_policy::reference_internal) + .def("replace_logits", &ModelSpec::replaceLogits, nb::rv_policy::reference_internal) + .def("return_log_probs", &ModelSpec::returnLogProbs, 
nb::rv_policy::reference_internal) + .def("smoke_test", &ModelSpec::smokeTest, nb::rv_policy::reference_internal) + .def("use_medusa", &ModelSpec::useMedusa, nb::rv_policy::reference_internal) + .def("use_eagle", &ModelSpec::useEagle, nb::rv_policy::reference_internal) + .def("use_lookahead_decoding", &ModelSpec::useLookaheadDecoding, nb::rv_policy::reference_internal) + .def("use_explicit_draft_tokens_decoding", &ModelSpec::useExplicitDraftTokensDecoding, + nb::rv_policy::reference_internal) + .def("use_draft_tokens_external_decoding", &ModelSpec::useDraftTokensExternalDecoding, + nb::rv_policy::reference_internal) + .def("use_logits", &ModelSpec::useLogits) + .def("use_multiple_profiles", &ModelSpec::useMultipleProfiles, nb::rv_policy::reference_internal) + .def("set_max_input_length", &ModelSpec::setMaxInputLength, nb::rv_policy::reference_internal) + .def("set_max_output_length", &ModelSpec::setMaxOutputLength, nb::rv_policy::reference_internal) + .def("set_quant_method", &ModelSpec::setQuantMethod, nb::rv_policy::reference_internal) + .def("use_lora_plugin", &ModelSpec::useLoraPlugin, nb::rv_policy::reference_internal) + .def("get_input_file", &ModelSpec::getInputFile) + .def("get_model_path", &ModelSpec::getModelPath) + .def("get_results_file", &ModelSpec::getResultsFile) + .def("get_generation_logits_file", &ModelSpec::getGenerationLogitsFile) + .def("get_context_logits_file", &ModelSpec::getContextLogitsFile) + .def("get_cum_log_probs_file", &ModelSpec::getCumLogProbsFile) + .def("get_log_probs_file", &ModelSpec::getLogProbsFile) + .def("enable_context_fmha_fp32_acc", &ModelSpec::enableContextFMHAFp32Acc, nb::rv_policy::reference_internal) + .def("get_enable_context_fmha_fp32_acc", &ModelSpec::getEnableContextFMHAFp32Acc) + .def("__copy__", [](ModelSpec const& self) { return ModelSpec(self); }); +} + +} // namespace tensorrt_llm::nanobind::testing diff --git a/cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.h b/cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.h new file mode 100644 index 00000000000..1aababc6ff8 --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.h @@ -0,0 +1,29 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace nb = nanobind; + +namespace tensorrt_llm::nanobind::testing +{ + +void initBindings(nb::module_& m); + +} // namespace tensorrt_llm::nanobind::testing diff --git a/cpp/tensorrt_llm/nanobind/userbuffers/bindings.cpp b/cpp/tensorrt_llm/nanobind/userbuffers/bindings.cpp new file mode 100644 index 00000000000..82e0d0a1f0c --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/userbuffers/bindings.cpp @@ -0,0 +1,47 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "bindings.h" +#include "tensorrt_llm/kernels/userbuffers/ub_interface.h" +#include "tensorrt_llm/kernels/userbuffers/userbuffersManager.h" +#include "tensorrt_llm/nanobind/common/customCasters.h" +#include + +namespace nb = nanobind; +namespace tub = tensorrt_llm::runtime::ub; + +namespace tensorrt_llm::kernels::userbuffers +{ + +void UserBufferBindings::initBindings(nb::module_& m) +{ + nb::class_(m, "UBBuffer") + .def_ro("size", &tub::UBBuffer::size) + .def_prop_ro("addr", [](tub::UBBuffer& self) { return reinterpret_cast(self.addr); }) + .def_ro("handle", &tub::UBBuffer::handle) + .def("invalid", &tub::UBBuffer::invalid); + + m.def("ub_initialize", [](int tp_size) { tub::ub_initialize(tp_size); }); + m.def("ub_is_initialized", &tub::ub_is_initialized); + m.def("ub_allocate", [](size_t bytes) { return tub::ub_allocate(bytes); }); + m.def("ub_deallocate", [](intptr_t addr) { return tub::ub_deallocate(reinterpret_cast(addr)); }); + m.def("ub_get", &tub::ub_get); + m.def("ub_supported", &tub::ub_supported); + + m.def("initialize_userbuffers_manager", &tub::initialize_userbuffers_manager); +} +} // namespace tensorrt_llm::kernels::userbuffers diff --git a/cpp/tensorrt_llm/nanobind/userbuffers/bindings.h b/cpp/tensorrt_llm/nanobind/userbuffers/bindings.h new file mode 100644 index 00000000000..15728bf6c1d --- /dev/null +++ b/cpp/tensorrt_llm/nanobind/userbuffers/bindings.h @@ -0,0 +1,30 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +namespace nb = nanobind; + +namespace tensorrt_llm::kernels::userbuffers +{ +class UserBufferBindings +{ +public: + static void initBindings(nb::module_& m); +}; +} // namespace tensorrt_llm::kernels::userbuffers diff --git a/cpp/tensorrt_llm/pybind/bindings.cpp b/cpp/tensorrt_llm/pybind/bindings.cpp index 1a5841d4b7a..962071c4857 100644 --- a/cpp/tensorrt_llm/pybind/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/bindings.cpp @@ -170,7 +170,7 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m) .value("CONTINUOUS", tr::ModelConfig::KVCacheType::kCONTINUOUS) .value("PAGED", tr::ModelConfig::KVCacheType::kPAGED) .value("DISABLED", tr::ModelConfig::KVCacheType::kDISABLED) - .def(py::init(&tr::ModelConfig::KVCacheTypeFromString)); + .def("from_string", &tr::ModelConfig::KVCacheTypeFromString); py::enum_(m, "LayerType") .value("ATTENTION", tr::ModelConfig::LayerType::kATTENTION) diff --git a/cpp/tensorrt_llm/pybind/executor/bindings.cpp b/cpp/tensorrt_llm/pybind/executor/bindings.cpp index d09157e1a8b..a8f6aaef73d 100644 --- a/cpp/tensorrt_llm/pybind/executor/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/executor/bindings.cpp @@ -244,7 +244,17 @@ void initBindings(pybind11::module_& m) py::class_>( executor_kv_cache, "KVCacheEventManager") - .def("get_latest_events", &tle::KVCacheEventManager::getLatestEvents, py::arg("timeout") = std::nullopt); + .def( + "get_latest_events", + [](tle::KVCacheEventManager& self, std::optional timeout_ms = std::nullopt) + { + if (timeout_ms) + { + return self.getLatestEvents(std::chrono::milliseconds(static_cast(*timeout_ms))); + } + return self.getLatestEvents(std::nullopt); + }, + py::arg("timeout_ms") = std::nullopt); tensorrt_llm::pybind::executor::initRequestBindings(m); tensorrt_llm::pybind::executor::initConfigBindings(m); diff --git a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp index 71a0b4af724..a413b8c9e67 100644 --- a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp +++ b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp @@ -336,7 +336,7 @@ void initConfigBindings(pybind11::module_& m) throw std::runtime_error("Invalid extendedRuntimePerfKnobConfig state!"); } return tle::ExtendedRuntimePerfKnobConfig( - state[0].cast(), state[1].cast(), state[2].cast(), state[2].cast()); + state[0].cast(), state[1].cast(), state[2].cast(), state[3].cast()); }; auto extendedRuntimePerfKnobConfigGetstate = [](tle::ExtendedRuntimePerfKnobConfig const& self) { diff --git a/examples/models/core/llama/summarize_long.py b/examples/models/core/llama/summarize_long.py index 9f127bc32a6..cee2e07fdd5 100644 --- a/examples/models/core/llama/summarize_long.py +++ b/examples/models/core/llama/summarize_long.py @@ -97,7 +97,7 @@ def TRTLLaMA(args, config): quantization_config = pretrained_config['quantization'] build_config = config['build_config'] - kv_cache_type = KVCacheType(build_config['kv_cache_type']) + kv_cache_type = KVCacheType.from_string(build_config['kv_cache_type']) plugin_config = build_config['plugin_config'] dtype = pretrained_config['dtype'] diff --git a/examples/models/core/qwen2audio/run.py b/examples/models/core/qwen2audio/run.py index e0d495a67f8..93e161c7e08 100644 --- a/examples/models/core/qwen2audio/run.py +++ b/examples/models/core/qwen2audio/run.py @@ -122,7 +122,8 @@ def get_model(self): num_kv_heads = config["pretrained_config"].get("num_key_value_heads", num_heads) if "kv_cache_type" in config["build_config"]: - kv_cache_type = 
KVCacheType(config["build_config"]["kv_cache_type"]) + kv_cache_type = KVCacheType.from_string( + config["build_config"]["kv_cache_type"]) else: kv_cache_type = KVCacheType.CONTINUOUS diff --git a/examples/models/core/qwenvl/run.py b/examples/models/core/qwenvl/run.py index a04c2b142e3..06ce341a9a0 100644 --- a/examples/models/core/qwenvl/run.py +++ b/examples/models/core/qwenvl/run.py @@ -118,7 +118,8 @@ def get_model(self): num_kv_heads = config["pretrained_config"].get("num_key_value_heads", num_heads) if "kv_cache_type" in config["build_config"]: - kv_cache_type = KVCacheType(config["build_config"]["kv_cache_type"]) + kv_cache_type = KVCacheType.from_string( + config["build_config"]["kv_cache_type"]) else: kv_cache_type = KVCacheType.CONTINUOUS diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy index bb8fd7816ce..77e12ee5100 100644 --- a/jenkins/Build.groovy +++ b/jenkins/Build.groovy @@ -47,6 +47,12 @@ CONFIG_LINUX_AARCH64 = "linux_aarch64" @Field def CONFIG_LINUX_AARCH64_LLVM = "linux_aarch64_LLVM" +@Field +def CONFIG_LINUX_X86_64_NANOBIND = "linux_x86_64_Nanobind" + +@Field +def CONFIG_LINUX_AARCH64_NANOBIND = "linux_aarch64_Nanobind" + @Field def BUILD_CONFIGS = [ // Vanilla TARNAME is used for packaging in runLLMPackage @@ -56,6 +62,11 @@ def BUILD_CONFIGS = [ (TARNAME) : "TensorRT-LLM.tar.gz", (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;120-real", ], + (CONFIG_LINUX_X86_64_NANOBIND) : [ + (WHEEL_EXTRA_ARGS) : "--binding_type nanobind --extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --micro_benchmarks", + (TARNAME) : "nanobind-TensorRT-LLM.tar.gz", + (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;120-real", + ], (CONFIG_LINUX_X86_64_SINGLE_DEVICE) : [ (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars ENABLE_MULTI_DEVICE=0 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars ENABLE_UCX=0 --micro_benchmarks", (TARNAME) : "single-device-TensorRT-LLM.tar.gz", @@ -71,6 +82,11 @@ def BUILD_CONFIGS = [ (TARNAME) : "TensorRT-LLM-GH200.tar.gz", (WHEEL_ARCHS): "90-real;100-real;120-real", ], + (CONFIG_LINUX_AARCH64_NANOBIND): [ + (WHEEL_EXTRA_ARGS) : "--binding_type nanobind --extra-cmake-vars WARNING_IS_ERROR=ON", + (TARNAME) : "nanobind-TensorRT-LLM-GH200.tar.gz", + (WHEEL_ARCHS): "90-real;100-real;120-real", + ], (CONFIG_LINUX_AARCH64_LLVM) : [ (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CUDA_HOST_COMPILER=clang -DCMAKE_LINKER_TYPE=LLD", (TARNAME) : "llvm-TensorRT-LLM-GH200.tar.gz", @@ -523,6 +539,8 @@ def launchStages(pipeline, cpu_arch, enableFailFast, globalVars) pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64 : CONFIG_LINUX_X86_64_VANILLA), "Build TRT-LLM LLVM": [LLM_DOCKER_IMAGE] + prepareLLMBuild( pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_LLVM : CONFIG_LINUX_X86_64_LLVM), + "Build TRT-LLM Nanobind": [LLM_DOCKER_IMAGE] + prepareLLMBuild( + pipeline, cpu_arch == AARCH64_TRIPLE ? 
CONFIG_LINUX_AARCH64_NANOBIND : CONFIG_LINUX_X86_64_NANOBIND), ] if (cpu_arch == X86_64_TRIPLE) { diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 6f6ae7c1186..35e7140ebda 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -64,6 +64,9 @@ def LLVM_CONFIG = "LLVM" @Field LINUX_AARCH64_CONFIG = "linux_aarch64" +@Field +def NANOBIND_CONFIG = "Nanobind" + @Field def BUILD_CONFIGS = [ // Vanilla TARNAME is used for packaging in runLLMPackage @@ -71,6 +74,7 @@ def BUILD_CONFIGS = [ (SINGLE_DEVICE_CONFIG) : [(TARNAME) : "single-device-TensorRT-LLM.tar.gz"], (LLVM_CONFIG) : [(TARNAME) : "llvm-TensorRT-LLM.tar.gz"], (LINUX_AARCH64_CONFIG) : [(TARNAME) : "TensorRT-LLM-GH200.tar.gz"], + (NANOBIND_CONFIG) : [(TARNAME) : "nanobind-TensorRT-LLM.tar.gz"], ] // TODO: Move common variables to an unified location @@ -1724,6 +1728,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) "A10-TensorRT-4": ["a10", "l0_a10", 4, 6], "A10-TensorRT-5": ["a10", "l0_a10", 5, 6], "A10-TensorRT-6": ["a10", "l0_a10", 6, 6], + "A10-Nanobind": ["a10", "l0_a10_nanobind", 1, 1], "A30-Triton-1": ["a30", "l0_a30", 1, 1], "A30-PyTorch-1": ["a30", "l0_a30", 1, 2], "A30-PyTorch-2": ["a30", "l0_a30", 2, 2], @@ -1800,6 +1805,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) if (key.contains("llvm")) { config = LLVM_CONFIG } + if (key.contains("Nanobind")) { + config = NANOBIND_CONFIG + } runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3]) }]]} fullSet = parallelJobs.keySet() diff --git a/tensorrt_llm/builder.py b/tensorrt_llm/builder.py index e2dc543ac42..11d528a853d 100644 --- a/tensorrt_llm/builder.py +++ b/tensorrt_llm/builder.py @@ -593,7 +593,7 @@ def from_dict(cls, config, plugin_config=None): defaults.get('max_prompt_embedding_table_size')) if "kv_cache_type" in config and config["kv_cache_type"] is not None: - kv_cache_type = KVCacheType(config.pop('kv_cache_type')) + kv_cache_type = KVCacheType.from_string(config.pop('kv_cache_type')) else: kv_cache_type = None gather_context_logits = config.pop( diff --git a/tensorrt_llm/commands/build.py b/tensorrt_llm/commands/build.py index a47e1485b71..e6b55f6e040 100644 --- a/tensorrt_llm/commands/build.py +++ b/tensorrt_llm/commands/build.py @@ -38,6 +38,23 @@ from tensorrt_llm.quantization.mode import QuantAlgo +def enum_type(enum_class): + + def parse_enum(value): + if isinstance(value, enum_class): + return value + + if isinstance(value, str): + return enum_class.from_string(value) + + valid_values = [e.name for e in enum_class] + raise argparse.ArgumentTypeError( + f"Invalid value '{value}' of type {type(value).__name__}. Expected one of {valid_values}" + ) + + return parse_enum + + def parse_arguments(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -131,7 +148,7 @@ def parse_arguments(): parser.add_argument( '--kv_cache_type', default=argparse.SUPPRESS, - type=KVCacheType, + type=enum_type(KVCacheType), help= "Set KV cache type (continuous, paged, or disabled). For disabled case, KV cache is disabled and only context phase is allowed." 
) diff --git a/tensorrt_llm/runtime/model_runner.py b/tensorrt_llm/runtime/model_runner.py index 486c58f6d15..a9f0fe8de40 100644 --- a/tensorrt_llm/runtime/model_runner.py +++ b/tensorrt_llm/runtime/model_runner.py @@ -86,7 +86,7 @@ def _builder_to_model_config(config: dict) -> Tuple[ModelConfig, dict]: dtype = builder_config['precision'] tp_size = builder_config['tensor_parallel'] pp_size = builder_config.get('pipeline_parallel', 1) - kv_cache_type = KVCacheType(builder_config.get('kv_cache_type')) + kv_cache_type = KVCacheType.from_string(builder_config.get('kv_cache_type')) world_size = tp_size * pp_size assert world_size == mpi_world_size(), \ f'Engine world size ({tp_size} * {pp_size}) != Runtime world size ({mpi_world_size()})' diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml index 2f63ab45f3a..5799ea27945 100644 --- a/tests/integration/test_lists/test-db/l0_a10.yml +++ b/tests/integration/test_lists/test-db/l0_a10.yml @@ -190,3 +190,18 @@ l0_a10: tests: - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test] - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test] +l0_a10_nanobind: +- condition: + ranges: + system_gpu_count: + gte: 1 + lte: 1 + wildcards: + gpu: + - '*a10*' + linux_distribution_name: ubuntu* + terms: + stage: pre_merge + backend: tensorrt + tests: + - unittest/bindings diff --git a/tests/unittest/bindings/test_bindings_ut.py b/tests/unittest/bindings/test_bindings_ut.py index 774accb080f..6fd46040b66 100644 --- a/tests/unittest/bindings/test_bindings_ut.py +++ b/tests/unittest/bindings/test_bindings_ut.py @@ -5,6 +5,7 @@ from pathlib import Path import numpy as np +import pytest import torch from utils.runtime_defaults import assert_runtime_defaults_are_parsed_correctly @@ -309,6 +310,8 @@ def parse_runtime_defaults(defaults_dict: dict | None = None): strict_keys=strict_keys) +@pytest.mark.skipif(_tb.binding_type == "nanobind", + reason="Test not supported for nanobind yet") def test_llm_request(): beam_width = 2 sampling_config = _tb.SamplingConfig(beam_width) @@ -418,6 +421,8 @@ def test_Mpicomm(): assert size2 == session_size +@pytest.mark.skipif(_tb.binding_type == "nanobind", + reason="Test not supported for nanobind yet") def test_SamplingConfig_pickle(): config = _tb.SamplingConfig() config.beam_width = 5 @@ -497,6 +502,8 @@ def test_KvCache_events_binding(): torch.cuda.empty_cache() +@pytest.mark.skipif(_tb.binding_type == "nanobind", + reason="Test not supported for nanobind yet") def test_ReqIdsSet_pickle(): ids = _tb.internal.batch_manager.ReqIdsSet() ids1 = pickle.loads(pickle.dumps(ids)) diff --git a/tests/unittest/bindings/test_executor_bindings.py b/tests/unittest/bindings/test_executor_bindings.py index 5d9460ffef0..56f71df079d 100644 --- a/tests/unittest/bindings/test_executor_bindings.py +++ b/tests/unittest/bindings/test_executor_bindings.py @@ -14,6 +14,7 @@ from binding_test_utils import * from pydantic import BaseModel +import tensorrt_llm.bindings as _tb import tensorrt_llm.bindings.executor as trtllm import tensorrt_llm.version as trtllm_version from tensorrt_llm.models.modeling_utils import PretrainedConfig @@ -484,6 +485,8 @@ def test_get_num_responses_ready(streaming: bool, assert executor.get_num_responses_ready() == num_expected_responses +@pytest.mark.skipif(_tb.binding_type == "nanobind", + 
reason="Test not supported for nanobind yet") @pytest.mark.parametrize("batching_type", [trtllm.BatchingType.INFLIGHT]) @pytest.mark.parametrize("streaming", [False, True]) @pytest.mark.parametrize("beam_width", [1]) @@ -688,6 +691,8 @@ def verify_output(beam_tokens, test_data, given_input_lengths): verify_output(tokens, test_data, given_input_lengths) +@pytest.mark.skipif(_tb.binding_type == "nanobind", + reason="Test not supported for nanobind yet") @pytest.mark.parametrize("streaming", [False, True]) @pytest.mark.parametrize("beam_width", [1]) def test_finish_reason(streaming: bool, beam_width: int, model_files, @@ -1112,6 +1117,8 @@ def test_spec_dec_fast_logits_info(): assert fast_logits_info.draft_participant_id == 5 +@pytest.mark.skipif(_tb.binding_type == "nanobind", + reason="Test not supported for nanobind yet") def test_result(): result = trtllm.Result() result.is_final = True @@ -1149,6 +1156,8 @@ def test_result(): assert (additional_output.output == torch.ones(1, 4, 100)).all() +@pytest.mark.skipif(_tb.binding_type == "nanobind", + reason="Test not supported for nanobind yet") def test_result_pickle(): result = trtllm.Result() result.is_final = True @@ -1495,6 +1504,8 @@ def test_eagle_config(): assert getattr(config, k) == v +@pytest.mark.skipif(_tb.binding_type == "nanobind", + reason="Test not supported for nanobind yet") def test_eagle_config_pickle(): config = trtllm.EagleConfig([[0, 0], [0, 1]], False, 0.5) config_copy = pickle.loads(pickle.dumps(config)) @@ -1867,6 +1878,8 @@ def logits_post_processor(req_id: int, logits: torch.Tensor, assert tokens[-max_tokens:] == [42] * max_tokens +@pytest.mark.skipif(_tb.binding_type == "nanobind", + reason="Test not supported for nanobind yet") def test_logits_post_processor_batched(model_files, model_path): # Define the logits post-processor callback @@ -2141,6 +2154,8 @@ def test_request_perf_metrics_kv_cache(model_path): assert kv_cache_metrics.kv_cache_hit_rate == 1.0 +@pytest.mark.skipif(_tb.binding_type == "nanobind", + reason="Test not supported for nanobind yet") @pytest.mark.parametrize("exclude_input_from_output", [False, True]) def test_request_perf_metrics_draft(model_path_draft_tokens_external, exclude_input_from_output: bool): @@ -2221,7 +2236,7 @@ def test_kv_event_stream_timeout(model_path): assert len(events) == 1 start = datetime.datetime.now() - events = cache_manager.get_latest_events(datetime.timedelta(seconds=1)) + events = cache_manager.get_latest_events(1000) end = datetime.datetime.now() # Make sure that it actually waited assert abs(end - start) > datetime.timedelta(milliseconds=900)