cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h
@@ -21,13 +21,20 @@
 #include "tensorrt_llm/common/algorithm.h"
 #include "tensorrt_llm/common/optionalRef.h"
 #include "tensorrt_llm/runtime/common.h"
-#include "tensorrt_llm/runtime/iGptDecoderBatched.h"
 #include "tensorrt_llm/runtime/modelConfig.h"
 
-namespace tensorrt_llm::runtime::decoder
+namespace tensorrt_llm::runtime
 {
+namespace decoder
+{
 class DecoderState;
-} // namespace tensorrt_llm::runtime::decoder
+} // namespace decoder
+
+namespace decoder_batch
+{
+class Input;
+} // namespace decoder_batch
+} // namespace tensorrt_llm::runtime
 
 namespace tensorrt_llm::batch_manager
 {
@@ -40,7 +47,7 @@ class MakeDecodingBatchInputOutput : Algorithm
     constexpr static auto name{"MakeDecodingBatchInputOutput"};
 
     using SizeType32 = tensorrt_llm::runtime::SizeType32;
-    using TensorPtr = runtime::decoder_batch::Input::TensorPtr;
+    using TensorPtr = runtime::ITensor::SharedPtr;
     template <typename T>
     using OptionalRef = tensorrt_llm::common::OptionalRef<T>;
 
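A note on the alias change above: pointing TensorPtr at runtime::ITensor::SharedPtr instead of runtime::decoder_batch::Input::TensorPtr is what lets this header drop the iGptDecoderBatched.h include and get by with forward declarations alone. A minimal sketch of the pattern, with hypothetical names rather than code from this PR:

    #include <memory>

    namespace rt
    {
    class Tensor; // forward declaration: enough to form pointers and references

    namespace decoder_batch
    {
    class Input; // forward declaration: the full definition is only needed in the .cpp
    } // namespace decoder_batch
    } // namespace rt

    class MakeBatchInput
    {
    public:
        // The alias is built from the forward-declared type directly rather than
        // via Input::TensorPtr, so Input's definition never has to be visible here.
        using TensorPtr = std::shared_ptr<rt::Tensor>;

        TensorPtr operator()(rt::decoder_batch::Input const& input) const; // defined in a .cpp
    };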
cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h (79 additions, 11 deletions)
@@ -19,13 +19,13 @@
 #include "tensorrt_llm/runtime/bufferManager.h"
 #include "tensorrt_llm/runtime/cudaEvent.h"
 #include "tensorrt_llm/runtime/cudaStream.h"
-#include "tensorrt_llm/runtime/decoderState.h"
-#include "tensorrt_llm/runtime/gptDecoder.h"
-#include "tensorrt_llm/runtime/iGptDecoderBatched.h"
+#include "tensorrt_llm/runtime/eagleBuffers.h"
+#include "tensorrt_llm/runtime/explicitDraftTokensBuffers.h"
+#include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/worldConfig.h"
 
 #include <memory>
+#include <optional>
 #include <vector>
 
 namespace tensorrt_llm::batch_manager
@@ -35,9 +35,72 @@ class LlmRequest;
 
 namespace tensorrt_llm::runtime
 {
+class SamplingConfig;
+class IGptDecoder;
+
+namespace decoder
+{
+class DecoderState;
+}
+
+namespace decoder_batch
+{
+
+class Input

> Copilot AI commented on Jul 1, 2025 (review comment on class Input):
> [nitpick] Consider extracting decoder_batch::Input into its own header (e.g., gptDecoderBatchedInput.h) to keep gptDecoderBatched.h focused on the decoder class and improve compilation modularity.

+{
+public:
+    using TensorConstPtr = ITensor::SharedConstPtr;
+    using TensorPtr = ITensor::SharedPtr;
+
+    explicit Input(std::vector<std::vector<TensorConstPtr>> const& logits, SizeType32 maxDecoderSteps)
+        : logits{logits}
+        , maxDecoderSteps{maxDecoderSteps}
+    {
+        TLLM_CHECK_WITH_INFO(
+            logits.size() == static_cast<size_t>(maxDecoderSteps), "logits vector size does not match maxDecoderSteps");
+    }
+
+    explicit Input(std::vector<TensorConstPtr> const& logits)
+        : Input{{logits}, 1}
+    {
+    }
+
+    //! Mandatory parameters
+    //! Logits
+    // FIXME: remove first dimension of tensors
+    //! [maxDecoderSteps][batchSize][1, beamWidth, vocabSizePadded], on gpu
+    std::vector<std::vector<TensorConstPtr>> logits;
+
+    //! Maximum number of decoding tokens of active slots
+    SizeType32 maxDecoderSteps;
+
+    //! Batch of active decoder slots, sorted by slots, [maxDecoderSteps][batchSize]
+    std::vector<TensorPtr> batchSlots;
+    //! Filled with slots in request order, [batchSize]
+    TensorPtr batchSlotsRequestOrder;
+
+    //! For Beam Search
+    //! The generation step of each request (for Variable-Beam-Width-Search), [batchSize]
+    std::vector<SizeType32> generationSteps;
+
+    //! For speculative decoding
+    //! Logits of draft
+    //! [maxBatchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded]
+    std::vector<std::vector<TensorPtr>> predictedDraftLogits;
+
+    //! Explicit draft tokens data
+    std::optional<ExplicitDraftTokensBuffers::EngineOutputs> explicitDraftTokensInputs;
+    std::optional<ExplicitDraftTokensBuffers::EngineInputs> explicitDraftTokensLastInputs;
+
+    //! Eagle data
+    std::optional<EagleBuffers::EngineOutputs> eagleInputs;
+    std::optional<EagleBuffers::Inputs> eagleLastInputs;
+};
+
+} // namespace decoder_batch
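For orientation, here is how a caller might build the simplest Input: the outer logits vector is indexed by decoder step, the inner one by request. A sketch only; the BufferManager-based allocation and the beamWidth/vocabSizePadded parameters are assumptions, not code from this PR:

    #include "tensorrt_llm/runtime/bufferManager.h"
    #include "tensorrt_llm/runtime/gptDecoderBatched.h"

    #include <vector>

    namespace tr = tensorrt_llm::runtime;

    // Sketch: build a one-step Input with one logits tensor per active request.
    tr::decoder_batch::Input makeSingleStepInput(tr::BufferManager& manager, tr::SizeType32 batchSize,
        tr::SizeType32 beamWidth, tr::SizeType32 vocabSizePadded)
    {
        std::vector<tr::ITensor::SharedConstPtr> logits;
        for (tr::SizeType32 i = 0; i < batchSize; ++i)
        {
            // One [1, beamWidth, vocabSizePadded] GPU tensor per request, matching
            // the shape documented on Input::logits.
            logits.emplace_back(
                manager.gpu(tr::ITensor::makeShape({1, beamWidth, vocabSizePadded}), nvinfer1::DataType::kFLOAT));
        }
        // The single-argument constructor wraps this as {logits} with maxDecoderSteps == 1,
        // which satisfies the TLLM_CHECK in the two-argument constructor.
        return tr::decoder_batch::Input{logits};
    }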

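On the nitpick above, one possible shape for the suggested split; the file name gptDecoderBatchedInput.h comes from the review comment and this split is not part of the PR, which keeps Input in gptDecoderBatched.h:

    // gptDecoderBatchedInput.h -- hypothetical header sketching the suggested extraction.
    #pragma once

    #include "tensorrt_llm/runtime/eagleBuffers.h"
    #include "tensorrt_llm/runtime/explicitDraftTokensBuffers.h"
    #include "tensorrt_llm/runtime/iTensor.h"

    #include <optional>
    #include <vector>

    namespace tensorrt_llm::runtime::decoder_batch
    {

    class Input
    {
        // ... body exactly as in the diff above ...
    };

    } // namespace tensorrt_llm::runtime::decoder_batch

gptDecoderBatched.h would then include this header instead of defining Input inline, and the eagleBuffers.h / explicitDraftTokensBuffers.h includes could move with it, so translation units that only need Input stop pulling in the decoder class.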
 //! GPT decoder class with support for in-flight batching
-class GptDecoderBatched : public IGptDecoderBatched
+class GptDecoderBatched
 {
 public:
     using CudaStreamPtr = std::shared_ptr<CudaStream>;
@@ -47,25 +110,29 @@ class GptDecoderBatched : public IGptDecoderBatched
 
     explicit GptDecoderBatched(CudaStreamPtr stream);
 
+    //! @brief Set up the decoder before calling `forward()`.
     void setup(executor::DecodingMode const& mode, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
-        nvinfer1::DataType dtype, ModelConfig const& modelConfig, WorldConfig const& worldConfig) override;
+        nvinfer1::DataType dtype, ModelConfig const& modelConfig, WorldConfig const& worldConfig);
 
-    void disableLookahead(RequestVector const& genRequests, TensorPtr const& batchSlots) override;
+    //! @brief Disable Lookahead decoding.
+    void disableLookahead(RequestVector const& genRequests, TensorPtr const& batchSlots);
 
-    CudaEvent forwardAsync(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) override;
-    void forward(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) override;
+    //! @brief Run one step for all requests without blocking the host process and return an event for synchronization.
+    CudaEvent forwardAsync(decoder::DecoderState const& decoderState, decoder_batch::Input const& input);
+    //! @brief Run one step for all requests and wait for completion on the host.
+    void forward(decoder::DecoderState const& decoderState, decoder_batch::Input const& input);
 
     //! @brief Gather final beam search results for request `batchSlot`.
     //! The result is only available after the returned event has completed.
     [[nodiscard]] CudaEvent finalize(decoder::DecoderState const& decoderState, SizeType32 batchSlot,
-        SamplingConfig const& samplingConfig, bool streaming) const override;
+        SamplingConfig const& samplingConfig, bool streaming) const;
 
-    CudaStreamPtr getDecoderStream() const
+    [[nodiscard]] CudaStreamPtr getDecoderStream() const
     {
         return mDecoderStream;
     }
 
-    IGptDecoder& getUnderlyingDecoder() const
+    [[nodiscard]] IGptDecoder& getUnderlyingDecoder() const
     {
         return *mDecoder.get();
     }
@@ -87,4 +154,5 @@
     using GptDecoderPtr = std::unique_ptr<IGptDecoder>;
     GptDecoderPtr mDecoder;
 };
+
 } // namespace tensorrt_llm::runtime
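The de-virtualized class keeps the same call sequence as before. A hedged sketch of a caller, using only the signatures declared in this header; decoderState and the per-step input are assumed to be built elsewhere, as they are not shown in this PR:

    #include "tensorrt_llm/runtime/gptDecoderBatched.h"

    namespace tr = tensorrt_llm::runtime;

    // Hypothetical driver for one decoding step.
    void decodeStep(tr::GptDecoderBatched& decoder, tr::decoder::DecoderState const& decoderState,
        tr::decoder_batch::Input const& input)
    {
        // Launch one step on the decoder stream without blocking the host...
        tr::CudaEvent event = decoder.forwardAsync(decoderState, input);
        // ...do other host-side work here, then wait on the returned event.
        event.synchronize();

        // Equivalent blocking form: decoder.forward(decoderState, input);
        // For beam search, finalize(...) likewise returns an event to wait on
        // before reading the gathered results.
    }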
cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h (0 additions, 113 deletions)

This file was deleted.

@@ -22,7 +22,7 @@
 #include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/logger.h"
 #include "tensorrt_llm/runtime/decoderState.h"
-#include "tensorrt_llm/runtime/iGptDecoderBatched.h"
+#include "tensorrt_llm/runtime/gptDecoderBatched.h"
 
 namespace tr = tensorrt_llm::runtime;
 
@@ -55,6 +55,8 @@
 #include "tensorrt_llm/kernels/decodingCommon.h"
 #include "tensorrt_llm/layers/defaultDecodingParams.h"
 #include "tensorrt_llm/runtime/common.h"
+#include "tensorrt_llm/runtime/decoderState.h"
+#include "tensorrt_llm/runtime/gptDecoder.h"
 #include "tensorrt_llm/runtime/gptDecoderBatched.h"
 #include "tensorrt_llm/runtime/iBuffer.h"
 #include "tensorrt_llm/runtime/iTensor.h"
cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp (1 addition, 0 deletions)
@@ -25,6 +25,7 @@
 #include "tensorrt_llm/batch_manager/rnnStateManager.h"
 #include "tensorrt_llm/batch_manager/sequenceSlotManager.h"
 #include "tensorrt_llm/nanobind/common/bindTypes.h"
+#include "tensorrt_llm/runtime/decoderState.h"
 #include "tensorrt_llm/runtime/gptDecoderBatched.h"
 #include "tensorrt_llm/runtime/runtimeKernels.h"
 #include "tensorrt_llm/runtime/torch.h"
cpp/tensorrt_llm/nanobind/runtime/bindings.cpp (0 additions, 1 deletion)
@@ -30,7 +30,6 @@
 #include "tensorrt_llm/runtime/gptDecoder.h"
 #include "tensorrt_llm/runtime/gptDecoderBatched.h"
 #include "tensorrt_llm/runtime/iBuffer.h"
-#include "tensorrt_llm/runtime/iGptDecoderBatched.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/ipcUtils.h"
 #include "tensorrt_llm/runtime/lookaheadBuffers.h"
cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp (1 addition, 0 deletions)
@@ -28,6 +28,7 @@
 #include "tensorrt_llm/batch_manager/pauseRequests.h"
 #include "tensorrt_llm/batch_manager/peftCacheManager.h"
 #include "tensorrt_llm/runtime/decoderState.h"
+#include "tensorrt_llm/runtime/gptDecoderBatched.h"
 #include "tensorrt_llm/runtime/torch.h"
 #include "tensorrt_llm/runtime/torchView.h"
 
cpp/tensorrt_llm/pybind/runtime/bindings.cpp (0 additions, 1 deletion)
@@ -29,7 +29,6 @@
 #include "tensorrt_llm/runtime/gptDecoder.h"
 #include "tensorrt_llm/runtime/gptDecoderBatched.h"
 #include "tensorrt_llm/runtime/iBuffer.h"
-#include "tensorrt_llm/runtime/iGptDecoderBatched.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/ipcUtils.h"
 #include "tensorrt_llm/runtime/lookaheadBuffers.h"
cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp (1 addition, 0 deletions)
@@ -26,6 +26,7 @@
 #include "tensorrt_llm/kernels/decodingKernels.h"
 #include "tensorrt_llm/runtime/bufferManager.h"
 #include "tensorrt_llm/runtime/cudaEvent.h"
+#include "tensorrt_llm/runtime/gptDecoder.h"
 
 #include <algorithm>
 #include <cassert>
cpp/tests/runtime/gptDecoderBatchedTest.cpp (2 additions, 1 deletion)
@@ -24,10 +24,11 @@
 #include "tensorrt_llm/executor/types.h"
 #include "tensorrt_llm/runtime/bufferManager.h"
 #include "tensorrt_llm/runtime/common.h"
+#include "tensorrt_llm/runtime/decoderState.h"
+#include "tensorrt_llm/runtime/gptDecoder.h"
 #include "tensorrt_llm/runtime/iBuffer.h"
-#include "tensorrt_llm/runtime/iGptDecoderBatched.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/modelConfig.h"
 #include "tensorrt_llm/runtime/runtimeKernels.h"
 #include "tensorrt_llm/runtime/worldConfig.h"
 
 #include <gmock/gmock-matchers.h>