diff --git a/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h b/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h
index cea23a4e7ec..ab6674a505f 100644
--- a/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h
+++ b/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h
@@ -21,13 +21,20 @@
 #include "tensorrt_llm/common/algorithm.h"
 #include "tensorrt_llm/common/optionalRef.h"
 #include "tensorrt_llm/runtime/common.h"
-#include "tensorrt_llm/runtime/iGptDecoderBatched.h"
 #include "tensorrt_llm/runtime/modelConfig.h"
 
-namespace tensorrt_llm::runtime::decoder
+namespace tensorrt_llm::runtime
+{
+namespace decoder
 {
 class DecoderState;
-} // namespace tensorrt_llm::runtime::decoder
+} // namespace decoder
+
+namespace decoder_batch
+{
+class Input;
+} // namespace decoder_batch
+} // namespace tensorrt_llm::runtime
 
 namespace tensorrt_llm::batch_manager
 {
@@ -40,7 +47,7 @@ class MakeDecodingBatchInputOutput : Algorithm
     constexpr static auto name{"MakeDecodingBatchInputOutput"};
 
     using SizeType32 = tensorrt_llm::runtime::SizeType32;
-    using TensorPtr = runtime::decoder_batch::Input::TensorPtr;
+    using TensorPtr = runtime::ITensor::SharedPtr;
     template <typename T>
     using OptionalRef = tensorrt_llm::common::OptionalRef<T>;
diff --git a/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h b/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h
index d0a9e726d13..1c546fc5c4f 100644
--- a/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h
+++ b/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h
@@ -19,13 +19,13 @@
 #include "tensorrt_llm/runtime/bufferManager.h"
 #include "tensorrt_llm/runtime/cudaEvent.h"
 #include "tensorrt_llm/runtime/cudaStream.h"
-#include "tensorrt_llm/runtime/decoderState.h"
-#include "tensorrt_llm/runtime/gptDecoder.h"
-#include "tensorrt_llm/runtime/iGptDecoderBatched.h"
+#include "tensorrt_llm/runtime/eagleBuffers.h"
+#include "tensorrt_llm/runtime/explicitDraftTokensBuffers.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/worldConfig.h"
 
 #include <memory>
+#include <optional>
 #include <vector>
 
 namespace tensorrt_llm::batch_manager
 {
@@ -35,9 +35,72 @@ class LlmRequest;
 
 namespace tensorrt_llm::runtime
 {
+class SamplingConfig;
+class IGptDecoder;
+
+namespace decoder
+{
+class DecoderState;
+}
+
+namespace decoder_batch
+{
+
+class Input
+{
+public:
+    using TensorConstPtr = ITensor::SharedConstPtr;
+    using TensorPtr = ITensor::SharedPtr;
+
+    explicit Input(std::vector<std::vector<TensorConstPtr>> const& logits, SizeType32 maxDecoderSteps)
+        : logits{logits}
+        , maxDecoderSteps{maxDecoderSteps}
+    {
+        TLLM_CHECK_WITH_INFO(
+            logits.size() == static_cast<size_t>(maxDecoderSteps), "logits vector size does not match maxDecoderSteps");
+    }
+
+    explicit Input(std::vector<TensorConstPtr> const& logits)
+        : Input{{logits}, 1}
+    {
+    }
+
+    //! Mandatory parameters
+    //! Logits
+    // FIXME: remove first dimension of tensors
+    //! [maxDecoderSteps][batchSize][1, beamWidth, vocabSizePadded], on gpu
+    std::vector<std::vector<TensorConstPtr>> logits;
+
+    //! Maximum number of decoding tokens of active slots
+    SizeType32 maxDecoderSteps;
+
+    //! Batch of active decoder slots, sorted by slots, [maxDecoderSteps][batchSize]
+    std::vector<TensorPtr> batchSlots;
+    //! Filled with slots in request order, [batchSize]
+    TensorPtr batchSlotsRequestOrder;
+
+    //! For Beam Search
+    //! The generation step of each request (for Variable-Beam-Width-Search), [batchSize]
+    std::vector<SizeType32> generationSteps;
+
+    //! For speculative decoding
+    //! Logits of draft
+    //! [maxBatchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded]
+    std::vector<std::vector<TensorPtr>> predictedDraftLogits;
+
+    //! Explicit draft tokens data
+    std::optional<ExplicitDraftTokensBuffers::EngineOutputs> explicitDraftTokensInputs;
+    std::optional<ExplicitDraftTokensBuffers::EngineInputs> explicitDraftTokensLastInputs;
+
+    //! Eagle data
+    std::optional<EagleBuffers::EngineOutputs> eagleInputs;
+    std::optional<EagleBuffers::Inputs> eagleLastInputs;
+};
+
+} // namespace decoder_batch
 
 //! GPT decoder class with support for in-flight batching
-class GptDecoderBatched : public IGptDecoderBatched
+class GptDecoderBatched
 {
 public:
     using CudaStreamPtr = std::shared_ptr<CudaStream>;
@@ -47,25 +110,29 @@
     explicit GptDecoderBatched(CudaStreamPtr stream);
 
+    //! @brief Setup the decoder before calling `forward()`
     void setup(executor::DecodingMode const& mode, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
-        nvinfer1::DataType dtype, ModelConfig const& modelConfig, WorldConfig const& worldConfig) override;
+        nvinfer1::DataType dtype, ModelConfig const& modelConfig, WorldConfig const& worldConfig);
 
-    void disableLookahead(RequestVector const& genRequests, TensorPtr const& batchSlots) override;
+    //! @brief Disable Lookahead decoding.
+    void disableLookahead(RequestVector const& genRequests, TensorPtr const& batchSlots);
 
-    CudaEvent forwardAsync(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) override;
-    void forward(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) override;
+    //! @brief Run one step for all requests without blocking the host process and return the token for synchronization.
+    CudaEvent forwardAsync(decoder::DecoderState const& decoderState, decoder_batch::Input const& input);
+
+    //! @brief Run one step for all requests and wait for completion on the host.
+    void forward(decoder::DecoderState const& decoderState, decoder_batch::Input const& input);
 
     //! @brief Gather final beam search results for request `batchSlot`.
     //! Result will only be available after event returned.
     [[nodiscard]] CudaEvent finalize(decoder::DecoderState const& decoderState, SizeType32 batchSlot,
-        SamplingConfig const& samplingConfig, bool streaming) const override;
+        SamplingConfig const& samplingConfig, bool streaming) const;
 
-    CudaStreamPtr getDecoderStream() const
+    [[nodiscard]] CudaStreamPtr getDecoderStream() const
     {
         return mDecoderStream;
     }
 
-    IGptDecoder& getUnderlyingDecoder() const
+    [[nodiscard]] IGptDecoder& getUnderlyingDecoder() const
     {
         return *mDecoder.get();
     }
 
@@ -87,4 +154,5 @@ class GptDecoderBatched : public IGptDecoderBatched
     using GptDecoderPtr = std::unique_ptr<IGptDecoder>;
     GptDecoderPtr mDecoder;
 };
+
 } // namespace tensorrt_llm::runtime
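For orientation, a minimal usage sketch of the consolidated header: `decoder_batch::Input` now lives next to the concrete `GptDecoderBatched`, so a single include gives callers both. This sketch is not part of the patch; it assumes `decoderState`, `modelConfig`, `worldConfig`, and a GPU logits tensor shaped [1, beamWidth, vocabSizePadded] are prepared elsewhere, and it elides the batch-slot wiring normally done by MakeDecodingBatchInputOutput.

    // Reviewer sketch only; illustrative values, not code from this patch.
    #include "tensorrt_llm/runtime/gptDecoderBatched.h"

    using namespace tensorrt_llm::runtime;

    void decodeOneStep(decoder::DecoderState const& decoderState, ModelConfig const& modelConfig,
        WorldConfig const& worldConfig, ITensor::SharedConstPtr const& logitsTensor)
    {
        auto stream = std::make_shared<CudaStream>();
        GptDecoderBatched decoder{stream};
        decoder.setup(executor::DecodingMode::TopKTopP(), /*maxNumSequences=*/8, /*maxBeamWidth=*/1,
            nvinfer1::DataType::kFLOAT, modelConfig, worldConfig);

        // One decoding step: one logits tensor per active request (maxDecoderSteps == 1).
        decoder_batch::Input input{std::vector<ITensor::SharedConstPtr>{logitsTensor}};
        // input.batchSlots must also name the active decoder slots; elided here.

        auto event = decoder.forwardAsync(decoderState, input); // non-blocking
        event.synchronize(); // or call decoder.forward(...) to block on the host
    }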
diff --git a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h b/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h
deleted file mode 100644
index 606ba3c98a4..00000000000
--- a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "tensorrt_llm/runtime/cudaEvent.h"
-#include "tensorrt_llm/runtime/cudaStream.h"
-#include "tensorrt_llm/runtime/iTensor.h"
-#include "tensorrt_llm/runtime/modelConfig.h"
-#include "tensorrt_llm/runtime/worldConfig.h"
-
-#include <memory>
-#include <vector>
-
-namespace tensorrt_llm::batch_manager
-{
-class LlmRequest;
-}
-
-namespace tensorrt_llm::runtime
-{
-class SamplingConfig;
-
-namespace decoder
-{
-class DecoderState;
-}
-
-namespace decoder_batch
-{
-
-class Input
-{
-public:
-    using TensorConstPtr = ITensor::SharedConstPtr;
-    using TensorPtr = ITensor::SharedPtr;
-
-    explicit Input(std::vector<std::vector<TensorConstPtr>> const& logits, SizeType32 maxDecoderSteps)
-        : logits{logits}
-        , maxDecoderSteps{maxDecoderSteps}
-    {
-        TLLM_CHECK_WITH_INFO(
-            logits.size() == static_cast<size_t>(maxDecoderSteps), "logits vector size does not match maxDecoderSteps");
-    }
-
-    explicit Input(std::vector<TensorConstPtr> const& logits)
-        : Input{{logits}, 1}
-    {
-    }
-
-    //! Mandatory parameters
-    //! Logits
-    // FIXME: remove first dimension of tensors
-    //! [maxDecoderSteps][batchSize][1, beamWidth, vocabSizePadded], on gpu
-    std::vector<std::vector<TensorConstPtr>> logits;
-
-    //! Maximum number of decoding tokens of active slots
-    SizeType32 maxDecoderSteps;
-
-    //! Batch of active decoder slots, sorted by slots, [maxDecoderSteps][batchSize]
-    std::vector<TensorPtr> batchSlots;
-};
-
-} // namespace decoder_batch
-
-//! GPT decoder class with support for in-flight batching
-class IGptDecoderBatched
-{
-public:
-    using CudaStreamPtr = std::shared_ptr<CudaStream>;
-    using LlmRequestPtr = std::shared_ptr<batch_manager::LlmRequest>;
-    using RequestVector = std::vector<LlmRequestPtr>;
-    using TensorPtr = std::shared_ptr<ITensor>;
-
-    //! @brief Setup the decoder before calling `forward()`
-    virtual void setup(executor::DecodingMode const& mode, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
-        nvinfer1::DataType dtype, ModelConfig const& modelConfig, WorldConfig const& worldConfig)
-        = 0;
-
-    //! @brief Disable Lookahead decoding.
-    virtual void disableLookahead(RequestVector const& genRequests, TensorPtr const& batchSlots) = 0;
-
-    //! @brief Run one step for all requests without blocking the host process and return the token for synchronization.
-    virtual CudaEvent forwardAsync(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) = 0;
-
-    //! @brief Run one step for all requests and wait for completion on the host.
-    virtual void forward(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) = 0;
-
-    //! @brief Gather final beam search results for request `batchIdx`.
-    //! Result will only be available after event returned
-    [[nodiscard]] virtual CudaEvent finalize(decoder::DecoderState const& decoderState, SizeType32 batchSlot,
-        SamplingConfig const& samplingConfig, bool streaming) const
-        = 0;
-
-protected:
-    IGptDecoderBatched() = default;
-    virtual ~IGptDecoderBatched() = default;
-};
-
-} // namespace tensorrt_llm::runtime
diff --git a/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp b/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp
index c9b2bb0b937..4ee0c0b7c00 100644
--- a/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp
+++ b/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp
@@ -22,7 +22,7 @@
 #include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/logger.h"
 #include "tensorrt_llm/runtime/decoderState.h"
-#include "tensorrt_llm/runtime/iGptDecoderBatched.h"
+#include "tensorrt_llm/runtime/gptDecoderBatched.h"
 
 namespace tr = tensorrt_llm::runtime;
diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
index 08cb4d407c1..78ecc0a3cf6 100644
--- a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
+++ b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
@@ -55,6 +55,8 @@
 #include "tensorrt_llm/kernels/decodingCommon.h"
 #include "tensorrt_llm/layers/defaultDecodingParams.h"
 #include "tensorrt_llm/runtime/common.h"
+#include "tensorrt_llm/runtime/decoderState.h"
+#include "tensorrt_llm/runtime/gptDecoder.h"
 #include "tensorrt_llm/runtime/gptDecoderBatched.h"
 #include "tensorrt_llm/runtime/iBuffer.h"
 #include "tensorrt_llm/runtime/iTensor.h"
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
index c170ca81015..959412ef3d4 100644
--- a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
@@ -25,6 +25,7 @@
 #include "tensorrt_llm/batch_manager/rnnStateManager.h"
 #include "tensorrt_llm/batch_manager/sequenceSlotManager.h"
 #include "tensorrt_llm/nanobind/common/bindTypes.h"
+#include "tensorrt_llm/runtime/decoderState.h"
 #include "tensorrt_llm/runtime/gptDecoderBatched.h"
 #include "tensorrt_llm/runtime/runtimeKernels.h"
 #include "tensorrt_llm/runtime/torch.h"
diff --git a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp
index 47be92e13f5..2484db632e8 100644
--- a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp
+++ b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp
@@ -30,7 +30,6 @@
 #include "tensorrt_llm/runtime/gptDecoder.h"
 #include "tensorrt_llm/runtime/gptDecoderBatched.h"
 #include "tensorrt_llm/runtime/iBuffer.h"
-#include "tensorrt_llm/runtime/iGptDecoderBatched.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/ipcUtils.h"
 #include "tensorrt_llm/runtime/lookaheadBuffers.h"
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp b/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp
index 8f0cc3315cd..199c7299a0b 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp
+++ b/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp
@@ -28,6 +28,7 @@
 #include "tensorrt_llm/batch_manager/pauseRequests.h"
 #include "tensorrt_llm/batch_manager/peftCacheManager.h"
 #include "tensorrt_llm/runtime/decoderState.h"
+#include "tensorrt_llm/runtime/gptDecoderBatched.h"
 #include "tensorrt_llm/runtime/torch.h"
 #include "tensorrt_llm/runtime/torchView.h"
diff --git a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
index 17aa48ef308..3781b559c46 100644
--- a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
@@ -29,7 +29,6 @@
 #include "tensorrt_llm/runtime/gptDecoder.h"
 #include "tensorrt_llm/runtime/gptDecoderBatched.h"
 #include "tensorrt_llm/runtime/iBuffer.h"
-#include "tensorrt_llm/runtime/iGptDecoderBatched.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/ipcUtils.h"
 #include "tensorrt_llm/runtime/lookaheadBuffers.h"
diff --git a/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp b/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp
index 6df7b1634b8..2c310c3a599 100644
--- a/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp
+++ b/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp
@@ -26,6 +26,7 @@
 #include "tensorrt_llm/kernels/decodingKernels.h"
 #include "tensorrt_llm/runtime/bufferManager.h"
 #include "tensorrt_llm/runtime/cudaEvent.h"
+#include "tensorrt_llm/runtime/gptDecoder.h"
 
 #include
 #include
diff --git a/cpp/tests/runtime/gptDecoderBatchedTest.cpp b/cpp/tests/runtime/gptDecoderBatchedTest.cpp
index 338b974aa0d..8b8ae44b61e 100644
--- a/cpp/tests/runtime/gptDecoderBatchedTest.cpp
+++ b/cpp/tests/runtime/gptDecoderBatchedTest.cpp
@@ -24,10 +24,11 @@
 #include "tensorrt_llm/executor/types.h"
 #include "tensorrt_llm/runtime/bufferManager.h"
 #include "tensorrt_llm/runtime/common.h"
+#include "tensorrt_llm/runtime/decoderState.h"
+#include "tensorrt_llm/runtime/gptDecoder.h"
 #include "tensorrt_llm/runtime/iBuffer.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/modelConfig.h"
-#include "tensorrt_llm/runtime/runtimeKernels.h"
 #include "tensorrt_llm/runtime/worldConfig.h"
 
 #include
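Beyond relocating `decoder_batch::Input` and removing the interface, most of the churn in this patch is include hygiene: headers that only name `DecoderState` or `Input` by reference forward-declare them, and each translation unit now includes `decoderState.h` or `gptDecoder.h` itself. A generic sketch of the idiom, using illustrative `lib::Widget`/`lib::HeavyState` names that are not from this patch:

    // widget.h -- forward-declaring keeps heavyState.h out of the include graph;
    // an incomplete type is fine for reference and pointer declarations.
    namespace lib
    {
    class HeavyState;

    class Widget
    {
    public:
        void process(HeavyState const& state);
    };
    } // namespace lib

    // widget.cpp -- only the implementation needs the full definition.
    // #include "heavyState.h"
    // void lib::Widget::process(HeavyState const& state) { /* use state */ }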