@@ -70,7 +70,7 @@ class CreateNewDecoderRequests : Algorithm
{
}

std::tuple<TensorPtr, std::vector<runtime::SamplingConfig>, std::vector<runtime::ITensor::SharedConstPtr>,
std::tuple<TensorPtr, runtime::SamplingConfig, std::vector<runtime::ITensor::SharedConstPtr>,
std::vector<executor::LookaheadDecodingConfig>>
operator()(runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
@@ -86,6 +86,10 @@ class CreateNewDecoderRequests : Algorithm
runtime::CudaStream const& runtimeStream, runtime::CudaStream const& decoderStream,
SizeType32 maxSequenceLength, OptionalRef<MedusaBuffers const> medusaBuffers) const;

static TensorPtr fillBatchSlots(RequestVector const& requests, DecoderInputBuffers& inputBuffers);

static std::optional<SamplingConfig> fuseSamplingConfigs(RequestVector const& requests);

private:
bool mSpeculativeDecodingFastLogits;
bool mIsLeaderInOrchMode;
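Taken together, the two new static helpers expose the setup work that previously lived only inside operator(): fillBatchSlots writes each request's seqSlot into the setup batch-slot buffer, and fuseSamplingConfigs folds the per-request sampling configs into a single batched SamplingConfig. A minimal caller-side sketch, mirroring the disableLookaheadDecoder() call site added in trtGptModelInflightBatching.cpp later in this diff (requests, inputBuffers, and decoder stand for the caller's own objects; the snippet is illustrative, not part of the change):

// Sketch only: combine the new helpers to reconfigure the underlying decoder.
auto batchSlots = CreateNewDecoderRequests::fillBatchSlots(requests, inputBuffers);
auto samplingConfig = CreateNewDecoderRequests::fuseSamplingConfigs(requests);
// fuseSamplingConfigs returns std::nullopt for an empty request vector.
decoder.disableLookahead(samplingConfig, batchSlots->getSize(), batchSlots);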
2 changes: 1 addition & 1 deletion cpp/include/tensorrt_llm/runtime/decoderState.h
@@ -66,7 +66,7 @@ class DecoderState
WorldConfig const& worldConfig, BufferManager const& bufferManager);

//! @brief Disable lookahead decoding.
void disableLookahead(RequestVector const& genRequests);
void disableLookahead();

//! @returns [batchSize], number of finished sequences per request, on gpu
[[nodiscard]] TensorPtr getFinishedSum() const;
14 changes: 2 additions & 12 deletions cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h
@@ -26,12 +26,6 @@
#include "tensorrt_llm/runtime/worldConfig.h"

#include <memory>
#include <vector>

namespace tensorrt_llm::batch_manager
{
class LlmRequest;
} // namespace tensorrt_llm::batch_manager

namespace tensorrt_llm::runtime
{
@@ -41,17 +35,13 @@ class GptDecoderBatched : public IGptDecoderBatched
{
public:
using CudaStreamPtr = std::shared_ptr<CudaStream>;
using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>;
using RequestVector = std::vector<LlmRequestPtr>;
using TensorPtr = ITensor::SharedPtr;

explicit GptDecoderBatched(CudaStreamPtr stream);

void setup(executor::DecodingMode const& mode, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
nvinfer1::DataType dtype, ModelConfig const& modelConfig, WorldConfig const& worldConfig) override;

void disableLookahead(RequestVector const& genRequests, TensorPtr const& batchSlots) override;

CudaEvent forwardAsync(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) override;
void forward(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) override;

@@ -60,12 +50,12 @@ class GptDecoderBatched : public IGptDecoderBatched
[[nodiscard]] CudaEvent finalize(decoder::DecoderState const& decoderState, SizeType32 batchSlot,
SamplingConfig const& samplingConfig, bool streaming) const override;

CudaStreamPtr getDecoderStream() const
[[nodiscard]] CudaStreamPtr getDecoderStream() const
{
return mDecoderStream;
}

IGptDecoder& getUnderlyingDecoder() const
[[nodiscard]] IGptDecoder& getUnderlyingDecoder() const
{
return *mDecoder.get();
}
10 changes: 0 additions & 10 deletions cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h
@@ -25,11 +25,6 @@
#include <memory>
#include <vector>

namespace tensorrt_llm::batch_manager
{
class LlmRequest;
}

namespace tensorrt_llm::runtime
{
class SamplingConfig;
@@ -81,18 +76,13 @@ class IGptDecoderBatched
{
public:
using CudaStreamPtr = std::shared_ptr<CudaStream>;
using LlmRequestPtr = std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>;
using RequestVector = std::vector<LlmRequestPtr>;
using TensorPtr = std::shared_ptr<ITensor>;

//! @brief Setup the decoder before calling `forward()`
virtual void setup(executor::DecodingMode const& mode, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
nvinfer1::DataType dtype, ModelConfig const& modelConfig, WorldConfig const& worldConfig)
= 0;

//! @brief Disable Lookahead decoding.
virtual void disableLookahead(RequestVector const& genRequests, TensorPtr const& batchSlots) = 0;

//! @brief Run one step for all requests without blocking the host process and return the token for synchronization.
virtual CudaEvent forwardAsync(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) = 0;

77 changes: 54 additions & 23 deletions cpp/tensorrt_llm/batch_manager/createNewDecoderRequests.cpp
@@ -54,7 +54,8 @@ using OptionalRef = tensorrt_llm::common::OptionalRef<T>;
namespace
{

void copySequenceLengths(RequestVector const& contextRequests, DecoderInputBuffers& inputBuffers,
//! @brief Fills the seqSlots and sequence lengths in the inputBuffers.
TensorPtr copySequenceLengths(RequestVector const& contextRequests, DecoderInputBuffers& inputBuffers,
ITensor& sequenceLengths, SizeType32 beamWidth, runtime::CudaStream const& stream)
{
auto const bufferManager = BufferManager{std::make_shared<CudaStream>(stream.get())};
@@ -82,6 +83,7 @@ void copySequenceLengths(RequestVector const& contextRequests, DecoderInputBuffe
}

// copy sequence lengths
if (!contextRequests.empty())
{
auto batchSlotsDeviceView = tr::ITensor::slice(inputBuffers.setupBatchSlotsDevice, 0, batchSize);
auto fillValuesViewDevice = tr::ITensor::slice(inputBuffers.fillValuesDevice, 0, batchSize);
@@ -90,6 +92,8 @@ void copySequenceLengths(RequestVector const& contextRequests, DecoderInputBuffe
bufferManager.copy(*fillValuesView, *fillValuesViewDevice);
tr::kernels::invokeFillBatch(sequenceLengths, *batchSlotsDeviceView, beamWidth, *fillValuesViewDevice, stream);
}

return batchSlotsView;
}
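For readers unfamiliar with invokeFillBatch, a hedged host-side sketch of what the kernel call above is assumed to do: scatter each request's fill value (its current sequence length) into all beamWidth entries of the sequence-length buffer at that request's batch slot. The layout and names below are illustrative assumptions, not taken from the kernel itself:

#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative CPU equivalent of the batched fill (assumed layout:
// sequenceLengths has maxNumSequences * beamWidth entries, grouped by slot).
void fillBatchOnHost(std::vector<std::int32_t>& sequenceLengths, std::vector<std::int32_t> const& batchSlots,
    std::vector<std::int32_t> const& fillValues, std::int32_t beamWidth)
{
    for (std::size_t i = 0; i < batchSlots.size(); ++i)
    {
        auto const slot = static_cast<std::size_t>(batchSlots[i]);
        for (std::int32_t beam = 0; beam < beamWidth; ++beam)
        {
            sequenceLengths[slot * static_cast<std::size_t>(beamWidth) + static_cast<std::size_t>(beam)] = fillValues[i];
        }
    }
}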

/// @brief Retrieve the embedding bias from the request. This potentially makes a copy of the tensor
@@ -131,7 +135,46 @@ void copySequenceLengths(RequestVector const& contextRequests, DecoderInputBuffe

} // namespace

std::tuple<TensorPtr, std::vector<runtime::SamplingConfig>, std::vector<runtime::ITensor::SharedConstPtr>,
// Similar to copySequenceLengths, but only fills the seqSlots.
TensorPtr CreateNewDecoderRequests::fillBatchSlots(RequestVector const& requests, DecoderInputBuffers& inputBuffers)
{
auto const batchSize = requests.size();
auto batchSlotsView = tr::ITensor::slice(inputBuffers.setupBatchSlots, 0, batchSize);

auto batchSlotsRange = tr::BufferRange<SizeType32>(*batchSlotsView);

// fill buffers on host
SizeType32 batchIdx{0};
for (auto const& llmReq : requests)
{
auto const seqSlot = llmReq->mSeqSlot.value();
batchSlotsRange[batchIdx] = seqSlot;
++batchIdx;
}

// TODO: copy to device and use in GptDecoder
// manager.copy(*batchSlotsView, *batchSlotsDeviceView);

return batchSlotsView;
}

std::optional<SamplingConfig> CreateNewDecoderRequests::fuseSamplingConfigs(RequestVector const& requests)
{
if (requests.empty())
{
return std::nullopt;
}

std::vector<SamplingConfig> samplingConfigs;
samplingConfigs.reserve(requests.size());
for (auto const& llmReq : requests)
{
samplingConfigs.push_back(llmReq->mSamplingConfig);
}
return SamplingConfig(samplingConfigs);
}
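
Conceptually, the fusion relies on the SamplingConfig constructor that takes a std::vector<SamplingConfig>, as used above, to turn N per-request configs into one batched config. A simplified, self-contained sketch of the idea with hypothetical stand-in types (not the real SamplingConfig fields or API; the assumption is that the real vector constructor performs an equivalent per-field merge):

#include <vector>

// Hypothetical per-request config (illustration only, not the real SamplingConfig).
struct MiniSamplingConfig
{
    float temperature{1.0F};
    int topK{0};
};

// Hypothetical fused form: one entry per request in the batch.
struct MiniBatchedSamplingConfig
{
    std::vector<float> temperature;
    std::vector<int> topK;
};

// Fuse per-request configs into batched vectors, mirroring the intent of
// CreateNewDecoderRequests::fuseSamplingConfigs above.
MiniBatchedSamplingConfig fuse(std::vector<MiniSamplingConfig> const& configs)
{
    MiniBatchedSamplingConfig fused;
    fused.temperature.reserve(configs.size());
    fused.topK.reserve(configs.size());
    for (auto const& cfg : configs)
    {
        fused.temperature.push_back(cfg.temperature);
        fused.topK.push_back(cfg.topK);
    }
    return fused;
}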

std::tuple<TensorPtr, runtime::SamplingConfig, std::vector<runtime::ITensor::SharedConstPtr>,
std::vector<executor::LookaheadDecodingConfig>>
CreateNewDecoderRequests::operator()(runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests, nvinfer1::DataType logitsType,
@@ -142,33 +185,21 @@ CreateNewDecoderRequests::operator()(runtime::ModelConfig const& modelConfig, ru
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
NVTX3_SCOPED_RANGE(CreateNewDecoderRequests);

RequestVector finishedContextRequests;
std::copy_if(contextRequests.begin(), contextRequests.end(), std::back_inserter(finishedContextRequests),
[](auto const& llmReq) { return llmReq->isLastContextChunk(); });
TLLM_CHECK_WITH_INFO(
!contextRequests.empty(), "CreateNewDecoderRequests should be called with at least one request");

if (!finishedContextRequests.empty())
{
copySequenceLengths(
finishedContextRequests, inputBuffers, *decoderState.getSequenceLengths(), beamWidth, runtimeStream);
}
auto batchSlotsView = copySequenceLengths(
contextRequests, inputBuffers, *decoderState.getSequenceLengths(), beamWidth, runtimeStream);

auto [lookaheadPrompt, lookaheadAlgoConfigs]
= createDecoderRequests(finishedContextRequests, inputBuffers.inputsIds, decodingConfig, decoderState,
logitsType, modelConfig, worldConfig, runtimeStream, decoderStream, maxSequenceLength, medusaBuffers);

auto const batchSize = finishedContextRequests.size();

std::vector<SamplingConfig> samplingConfigs;
samplingConfigs.reserve(batchSize);
for (auto const& llmReq : finishedContextRequests)
{
samplingConfigs.push_back(llmReq->mSamplingConfig);
}
= createDecoderRequests(contextRequests, inputBuffers.inputsIds, decodingConfig, decoderState, logitsType,
modelConfig, worldConfig, runtimeStream, decoderStream, maxSequenceLength, medusaBuffers);

TensorPtr batchSlotsView = runtime::ITensor::slice(inputBuffers.setupBatchSlots, 0, batchSize);
auto samplingConfig = fuseSamplingConfigs(contextRequests);
TLLM_CHECK(samplingConfig.has_value());

TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
return {std::move(batchSlotsView), std::move(samplingConfigs), std::move(lookaheadPrompt),
return {std::move(batchSlotsView), std::move(samplingConfig.value()), std::move(lookaheadPrompt),
std::move(lookaheadAlgoConfigs)};
}

56 changes: 42 additions & 14 deletions cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
@@ -1863,24 +1863,29 @@ void TrtGptModelInflightBatching::setupDecoderStep(

if (mWorldConfig.isLastPipelineParallelRank() && !contextRequests.empty())
{
auto const logitsType = mRuntime->getEngine().getTensorDataType("logits");
RequestVector finishedContextRequests;
std::copy_if(contextRequests.begin(), contextRequests.end(), std::back_inserter(finishedContextRequests),
[](auto const& llmReq) { return llmReq->isLastContextChunk(); });

auto [batchSlots, samplingConfigs, lookaheadPrompt, lookaheadAlgoConfigs]
= (*mCreateNewDecoderRequests)(mModelConfig, mWorldConfig, mDecodingConfig, contextRequests, logitsType,
inputBuffers, *mDecoderState, mRuntime->getStream(), *mDecoder->getDecoderStream(), getMaxSequenceLen(),
mOperatingBeamWidth, buffers.mMedusaBuffers);

auto const localBatchSize = batchSlots->getSize();
if (localBatchSize > 0)
if (!finishedContextRequests.empty())
{
Comment on lines +1866 to 1871
⚠️ Potential issue

Don’t drop disaggregated-generation setup when no “last context chunk” is present.

setupDecoderStep now filters to last-context chunks only. When called from prepareDistGenBufferAndDecoder (Lines 1681-1683) with generation requests, isLastContextChunk() will typically be false, so finishedContextRequests becomes empty and decoder setup is skipped, breaking the disaggregated generation init path.

Fix: fall back to using the passed-in requests if no last-context chunk is found.

-        RequestVector finishedContextRequests;
-        std::copy_if(contextRequests.begin(), contextRequests.end(), std::back_inserter(finishedContextRequests),
-            [](auto const& llmReq) { return llmReq->isLastContextChunk(); });
+        RequestVector finishedContextRequests;
+        std::copy_if(contextRequests.begin(), contextRequests.end(), std::back_inserter(finishedContextRequests),
+            [](auto const& llmReq) { return llmReq->isLastContextChunk(); });
+        // Disagg generation init path calls this with generation-ready requests (no context chunk in-flight).
+        if (finishedContextRequests.empty())
+        {
+            finishedContextRequests = contextRequests;
+        }

auto samplingConfig = SamplingConfig(samplingConfigs);
auto const logitsType = mRuntime->getEngine().getTensorDataType("logits");

auto [batchSlots, samplingConfig, lookaheadPrompt, lookaheadAlgoConfigs]
= (*mCreateNewDecoderRequests)(mModelConfig, mWorldConfig, mDecodingConfig, finishedContextRequests,
logitsType, inputBuffers, *mDecoderState, mRuntime->getStream(), *mDecoder->getDecoderStream(),
getMaxSequenceLen(), mOperatingBeamWidth, buffers.mMedusaBuffers);

auto const localBatchSize = batchSlots->getSize();
TLLM_CHECK_WITH_INFO(localBatchSize > 0, "Decoder setup should be called with at least one request");

mDecoder->getUnderlyingDecoder().setup(samplingConfig, localBatchSize, batchSlots,
{mDecoderState->getJointDecodingOutput()}, mModelConfig.getDataType(), lookaheadPrompt,
lookaheadAlgoConfigs);

auto const& stream = mDecoder->getDecoderStream();
auto const& decoderStream = mDecoder->getDecoderStream();
CudaEvent event{};
stream->record(event);
decoderStream->record(event);
mRuntime->getStreamPtr()->wait(event);
}
}
@@ -2515,6 +2520,24 @@ void TrtGptModelInflightBatching::changeBeamWidth(SizeType32 beamWidth)
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

void TrtGptModelInflightBatching::disableLookaheadDecoder(
RequestVector const& genRequests, DecoderInputBuffers& inputBuffers)
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);

auto batchSlots = CreateNewDecoderRequests::fillBatchSlots(genRequests, inputBuffers);
auto samplingConfig = CreateNewDecoderRequests::fuseSamplingConfigs(genRequests);

mDecoder->getUnderlyingDecoder().disableLookahead(samplingConfig, batchSlots->getSize(), batchSlots);

auto const& decoderStream = mDecoder->getDecoderStream();
CudaEvent event{};
decoderStream->record(event);
mRuntime->getStreamPtr()->wait(event);

TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

void TrtGptModelInflightBatching::changeSpecDecMode(ScheduledRequests const& scheduledRequests)
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
@@ -2602,11 +2625,16 @@ void TrtGptModelInflightBatching::changeSpecDecMode(ScheduledRequests const& sch
mDecodingConfig.setDecodingMode(executor::DecodingMode::Auto());
mBuffers.at(bufferId)->mLookaheadBuffers->disableLookaheadDecoding();
mDecoderOutputBuffers.at(getFusedBufferId()).disableLookaheadDecoding(getMaxNumSequences());
mDecoder->disableLookahead(
scheduledRequests.generationRequests, mDecoderInputBuffers.at(getFusedBufferId()).setupBatchSlots);
mDecoderState->disableLookahead(scheduledRequests.generationRequests);
disableLookaheadDecoder(scheduledRequests.generationRequests, mDecoderInputBuffers.at(getFusedBufferId()));
mDecoderState->disableLookahead();

for (auto const& llmReq : scheduledRequests.generationRequests)
{
if (llmReq->mSeqSlot)
{
mDecoderState->setNumDecodingEngineTokens(llmReq->mSeqSlot.value(), 1);
}

Comment on lines +2628 to +2637
🛠️ Refactor suggestion

Disable lookahead decoding (LAD) for all active slots (context and generation), and reset engine tokens consistently.

The current call disables lookahead only for scheduledRequests.generationRequests. When LAD is turned off due to constraints, there may be only context requests in-flight; underlying decoder state for those slots won’t be updated. Also, resetting “numDecodingEngineTokens = 1” should apply to all slots in this transition.

-        disableLookaheadDecoder(scheduledRequests.generationRequests, mDecoderInputBuffers.at(getFusedBufferId()));
-        mDecoderState->disableLookahead();
-
-        for (auto const& llmReq : scheduledRequests.generationRequests)
+        // Apply to all scheduled slots (both ctx and gen have seqSlot at this point)
+        RequestVector requestsForDisable;
+        requestsForDisable.reserve(
+            scheduledRequests.contextRequests.size() + scheduledRequests.generationRequests.size());
+        requestsForDisable.insert(requestsForDisable.end(),
+            scheduledRequests.contextRequests.begin(), scheduledRequests.contextRequests.end());
+        requestsForDisable.insert(requestsForDisable.end(),
+            scheduledRequests.generationRequests.begin(), scheduledRequests.generationRequests.end());
+
+        disableLookaheadDecoder(requestsForDisable, mDecoderInputBuffers.at(getFusedBufferId()));
+        mDecoderState->disableLookahead();
+
+        for (auto const& llmReq : requestsForDisable)
         {
             if (llmReq->mSeqSlot)
             {
                 mDecoderState->setNumDecodingEngineTokens(llmReq->mSeqSlot.value(), 1);
             }

if (llmReq->getNumDraftTokens() > 0)
{
llmReq->discardDraftTokens(llmReq->getNumDraftTokens());
2 changes: 2 additions & 0 deletions cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.h
@@ -388,6 +388,8 @@ class TrtGptModelInflightBatching : public TrtGptModel
/// @brief Change the speculative decoding mode.
void changeSpecDecMode(ScheduledRequests const& scheduledRequests);

void disableLookaheadDecoder(RequestVector const& genRequests, DecoderInputBuffers& inputBuffers);

void prefetchNextPromptTableChunk(RequestVector const& contextRequests, bool isFirstChunk, SizeType32 bufferId);

void remapInputTokensForPromptTable(
12 changes: 0 additions & 12 deletions cpp/tensorrt_llm/nanobind/bindings.cpp
@@ -64,14 +64,6 @@ using OptVec = std::optional<std::vector<T>>;
#error "TRTLLM_NB_MODULE must be defined"
#endif

namespace
{
tr::SamplingConfig makeSamplingConfig(std::vector<tr::SamplingConfig> const& configs)
{
return tr::SamplingConfig(configs);
}
} // namespace

NB_MODULE(TRTLLM_NB_MODULE, m)
{
m.doc() = "TensorRT-LLM Python bindings for C++ runtime";
@@ -425,10 +417,6 @@ NB_MODULE(TRTLLM_NB_MODULE, m)
.def("__setstate__", SamplingConfigSetState)
.def("__eq__", &tr::SamplingConfig::operator==);

nb::bind_vector<std::vector<tr::SamplingConfig>>(m, "SamplingConfigVector");

m.def("make_sampling_config", &makeSamplingConfig, nb::arg("configs"));

nb::class_<tr::GptJsonConfig>(m, "GptJsonConfig")
.def(nb::init<std::string, std::string, std::string, SizeType32, SizeType32, SizeType32, SizeType32,
tr::ModelConfig, std::optional<tr::RuntimeDefaults>>(),
2 changes: 0 additions & 2 deletions cpp/tensorrt_llm/nanobind/common/customCasters.h
@@ -21,7 +21,6 @@
#include "tensorrt_llm/batch_manager/decoderBuffers.h"
#include "tensorrt_llm/common/optionalRef.h"
#include "tensorrt_llm/runtime/cudaStream.h"
#include "tensorrt_llm/runtime/samplingConfig.h"
#include "tensorrt_llm/runtime/torch.h"
#include "tensorrt_llm/runtime/torchView.h"

@@ -45,7 +44,6 @@
// Opaque bindings
NB_MAKE_OPAQUE(tensorrt_llm::batch_manager::ReqIdsSet)
NB_MAKE_OPAQUE(std::vector<tensorrt_llm::batch_manager::SlotDecoderBuffers>)
NB_MAKE_OPAQUE(std::vector<tensorrt_llm::runtime::SamplingConfig>)

namespace nb = nanobind;

10 changes: 0 additions & 10 deletions cpp/tensorrt_llm/pybind/bindings.cpp
@@ -58,14 +58,6 @@ using OptVec = std::optional<std::vector<T>>;
#error "TRTLLM_PYBIND_MODULE must be defined"
#endif

namespace
{
tr::SamplingConfig makeSamplingConfig(std::vector<tr::SamplingConfig> const& configs)
{
return tr::SamplingConfig(configs);
}
} // namespace

PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
{
m.doc() = "TensorRT-LLM Python bindings for C++ runtime";
@@ -415,8 +407,6 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
.def(py::pickle(SamplingConfigGetState, SamplingConfigSetState))
.def("__eq__", &tr::SamplingConfig::operator==);

m.def("make_sampling_config", &makeSamplingConfig, py::arg("configs"));

py::class_<tr::GptJsonConfig>(m, "GptJsonConfig")
.def(py::init<std::string, std::string, std::string, SizeType32, SizeType32, SizeType32, SizeType32,
tr::ModelConfig, std::optional<tr::RuntimeDefaults>>(),