diff --git a/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h b/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h
index cea23a4e7ec..ab6674a505f 100644
--- a/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h
+++ b/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h
@@ -21,13 +21,20 @@
 #include "tensorrt_llm/common/algorithm.h"
 #include "tensorrt_llm/common/optionalRef.h"
 #include "tensorrt_llm/runtime/common.h"
-#include "tensorrt_llm/runtime/iGptDecoderBatched.h"
 #include "tensorrt_llm/runtime/modelConfig.h"
 
-namespace tensorrt_llm::runtime::decoder
+namespace tensorrt_llm::runtime
+{
+namespace decoder
 {
 class DecoderState;
-} // namespace tensorrt_llm::runtime::decoder
+} // namespace decoder
+
+namespace decoder_batch
+{
+class Input;
+} // namespace decoder_batch
+} // namespace tensorrt_llm::runtime
 
 namespace tensorrt_llm::batch_manager
 {
@@ -40,7 +47,7 @@ class MakeDecodingBatchInputOutput : Algorithm
     constexpr static auto name{"MakeDecodingBatchInputOutput"};
 
     using SizeType32 = tensorrt_llm::runtime::SizeType32;
-    using TensorPtr = runtime::decoder_batch::Input::TensorPtr;
+    using TensorPtr = runtime::ITensor::SharedPtr;
     template <typename T>
     using OptionalRef = tensorrt_llm::common::OptionalRef<T>;
diff --git a/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h b/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h
index d0a9e726d13..1c546fc5c4f 100644
--- a/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h
+++ b/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h
@@ -19,13 +19,13 @@
 #include "tensorrt_llm/runtime/bufferManager.h"
 #include "tensorrt_llm/runtime/cudaEvent.h"
 #include "tensorrt_llm/runtime/cudaStream.h"
-#include "tensorrt_llm/runtime/decoderState.h"
-#include "tensorrt_llm/runtime/gptDecoder.h"
-#include "tensorrt_llm/runtime/iGptDecoderBatched.h"
+#include "tensorrt_llm/runtime/eagleBuffers.h"
+#include "tensorrt_llm/runtime/explicitDraftTokensBuffers.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/worldConfig.h"
 
 #include <memory>
+#include <optional>
 #include <vector>
 
 namespace tensorrt_llm::batch_manager
 {
@@ -35,9 +35,72 @@ class LlmRequest;
 
 namespace tensorrt_llm::runtime
 {
+class SamplingConfig;
+class IGptDecoder;
+
+namespace decoder
+{
+class DecoderState;
+}
+
+namespace decoder_batch
+{
+
+class Input
+{
+public:
+    using TensorConstPtr = ITensor::SharedConstPtr;
+    using TensorPtr = ITensor::SharedPtr;
+
+    explicit Input(std::vector<std::vector<TensorConstPtr>> const& logits, SizeType32 maxDecoderSteps)
+        : logits{logits}
+        , maxDecoderSteps{maxDecoderSteps}
+    {
+        TLLM_CHECK_WITH_INFO(
+            logits.size() == static_cast<size_t>(maxDecoderSteps), "logits vector size does not match maxDecoderSteps");
+    }
+
+    explicit Input(std::vector<TensorConstPtr> const& logits)
+        : Input{{logits}, 1}
+    {
+    }
+
+    //! Mandatory parameters
+    //! Logits
+    // FIXME: remove first dimension of tensors
+    //! [maxDecoderSteps][batchSize][1, beamWidth, vocabSizePadded], on gpu
+    std::vector<std::vector<TensorConstPtr>> logits;
+
+    //! Maximum number of decoding tokens of active slots
+    SizeType32 maxDecoderSteps;
+
+    //! Batch of active decoder slots, sorted by slots, [maxDecoderSteps][batchSize]
+    std::vector<TensorPtr> batchSlots;
+    //! Filled with slots in request order, [batchSize]
+    TensorPtr batchSlotsRequestOrder;
+
+    //! For Beam Search
+    //! The generation step of each request (for Variable-Beam-Width-Search), [batchSize]
+    std::vector<SizeType32> generationSteps;
+
+    //! For speculative decoding
+    //! Logits of draft
+    //! [maxBatchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded]
+    std::vector<std::vector<TensorPtr>> predictedDraftLogits;
+
+    //! Explicit draft tokens data
+    std::optional<ExplicitDraftTokensBuffers::EngineOutputs> explicitDraftTokensInputs;
+    std::optional<ExplicitDraftTokensBuffers::EngineInputs> explicitDraftTokensLastInputs;
+
+    //! Eagle data
+    std::optional<EagleBuffers::EngineOutputs> eagleInputs;
+    std::optional<EagleBuffers::Inputs> eagleLastInputs;
+};
+
+} // namespace decoder_batch
 
 //! GPT decoder class with support for in-flight batching
-class GptDecoderBatched : public IGptDecoderBatched
+class GptDecoderBatched
 {
 public:
     using CudaStreamPtr = std::shared_ptr<CudaStream>;
@@ -47,25 +110,29 @@
     explicit GptDecoderBatched(CudaStreamPtr stream);
 
+    //! @brief Setup the decoder before calling `forward()`
     void setup(executor::DecodingMode const& mode, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
-        nvinfer1::DataType dtype, ModelConfig const& modelConfig, WorldConfig const& worldConfig) override;
+        nvinfer1::DataType dtype, ModelConfig const& modelConfig, WorldConfig const& worldConfig);
 
-    void disableLookahead(RequestVector const& genRequests, TensorPtr const& batchSlots) override;
+    //! @brief Disable Lookahead decoding.
+    void disableLookahead(RequestVector const& genRequests, TensorPtr const& batchSlots);
 
-    CudaEvent forwardAsync(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) override;
-    void forward(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) override;
+    //! @brief Run one step for all requests without blocking the host process and return the token for synchronization.
+    CudaEvent forwardAsync(decoder::DecoderState const& decoderState, decoder_batch::Input const& input);
+
+    //! @brief Run one step for all requests and wait for completion on the host.
+    void forward(decoder::DecoderState const& decoderState, decoder_batch::Input const& input);
 
     //! @brief Gather final beam search results for request `batchSlot`.
     //! Result will only be available after event returned.
     [[nodiscard]] CudaEvent finalize(decoder::DecoderState const& decoderState, SizeType32 batchSlot,
-        SamplingConfig const& samplingConfig, bool streaming) const override;
+        SamplingConfig const& samplingConfig, bool streaming) const;
 
-    CudaStreamPtr getDecoderStream() const
+    [[nodiscard]] CudaStreamPtr getDecoderStream() const
     {
         return mDecoderStream;
     }
 
-    IGptDecoder& getUnderlyingDecoder() const
+    [[nodiscard]] IGptDecoder& getUnderlyingDecoder() const
     {
         return *mDecoder.get();
     }
 
@@ -87,4 +154,5 @@ class GptDecoderBatched : public IGptDecoderBatched
     using GptDecoderPtr = std::unique_ptr<IGptDecoder>;
     GptDecoderPtr mDecoder;
 };
+
 } // namespace tensorrt_llm::runtime
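For orientation, a minimal usage sketch of the consolidated header: `decoder_batch::Input` now lives next to the concrete `GptDecoderBatched`, so a single include gives callers both. This sketch is not part of the patch; it assumes `decoderState`, `modelConfig`, `worldConfig`, and a GPU logits tensor shaped [1, beamWidth, vocabSizePadded] are prepared elsewhere, and it elides the batch-slot wiring normally done by MakeDecodingBatchInputOutput.

    // Reviewer sketch only; illustrative values, not code from this patch.
    #include "tensorrt_llm/runtime/gptDecoderBatched.h"

    using namespace tensorrt_llm::runtime;

    void decodeOneStep(decoder::DecoderState const& decoderState, ModelConfig const& modelConfig,
        WorldConfig const& worldConfig, ITensor::SharedConstPtr const& logitsTensor)
    {
        auto stream = std::make_shared<CudaStream>();
        GptDecoderBatched decoder{stream};
        decoder.setup(executor::DecodingMode::TopKTopP(), /*maxNumSequences=*/8, /*maxBeamWidth=*/1,
            nvinfer1::DataType::kFLOAT, modelConfig, worldConfig);

        // One decoding step: one logits tensor per active request (maxDecoderSteps == 1).
        decoder_batch::Input input{std::vector<ITensor::SharedConstPtr>{logitsTensor}};
        // input.batchSlots must also name the active decoder slots; elided here.

        auto event = decoder.forwardAsync(decoderState, input); // non-blocking
        event.synchronize(); // or call decoder.forward(...) to block on the host
    }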
diff --git a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h b/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h
deleted file mode 100644
index 606ba3c98a4..00000000000
--- a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "tensorrt_llm/runtime/cudaEvent.h"
-#include "tensorrt_llm/runtime/cudaStream.h"
-#include "tensorrt_llm/runtime/iTensor.h"
-#include "tensorrt_llm/runtime/modelConfig.h"
-#include "tensorrt_llm/runtime/worldConfig.h"
-
-#include <memory>
-#include <vector>
-
-namespace tensorrt_llm::batch_manager
-{
-class LlmRequest;
-}
-
-namespace tensorrt_llm::runtime
-{
-class SamplingConfig;
-
-namespace decoder
-{
-class DecoderState;
-}
-
-namespace decoder_batch
-{
-
-class Input
-{
-public:
-    using TensorConstPtr = ITensor::SharedConstPtr;
-    using TensorPtr = ITensor::SharedPtr;
-
-    explicit Input(std::vector<std::vector<TensorConstPtr>> const& logits, SizeType32 maxDecoderSteps)
-        : logits{logits}
-        , maxDecoderSteps{maxDecoderSteps}
-    {
-        TLLM_CHECK_WITH_INFO(
-            logits.size() == static_cast<size_t>(maxDecoderSteps), "logits vector size does not match maxDecoderSteps");
-    }
-
-    explicit Input(std::vector<TensorConstPtr> const& logits)
-        : Input{{logits}, 1}
-    {
-    }
-
-    //! Mandatory parameters
-    //! Logits
-    // FIXME: remove first dimension of tensors
-    //! [maxDecoderSteps][batchSize][1, beamWidth, vocabSizePadded], on gpu
-    std::vector<std::vector<TensorConstPtr>> logits;
-
-    //! Maximum number of decoding tokens of active slots
-    SizeType32 maxDecoderSteps;
-
-    //! Batch of active decoder slots, sorted by slots, [maxDecoderSteps][batchSize]
-    std::vector<TensorPtr> batchSlots;
-};
-
-} // namespace decoder_batch
-
-//! GPT decoder class with support for in-flight batching
-class IGptDecoderBatched
-{
-public:
-    using CudaStreamPtr = std::shared_ptr<CudaStream>;
-    using LlmRequestPtr = std::shared_ptr<batch_manager::LlmRequest>;
-    using RequestVector = std::vector<LlmRequestPtr>;
-    using TensorPtr = std::shared_ptr<ITensor>;
-
-    //! @brief Setup the decoder before calling `forward()`
-    virtual void setup(executor::DecodingMode const& mode, SizeType32 maxNumSequences, SizeType32 maxBeamWidth,
-        nvinfer1::DataType dtype, ModelConfig const& modelConfig, WorldConfig const& worldConfig)
-        = 0;
-
-    //! @brief Disable Lookahead decoding.
-    virtual void disableLookahead(RequestVector const& genRequests, TensorPtr const& batchSlots) = 0;
-
-    //! @brief Run one step for all requests without blocking the host process and return the token for synchronization.
-    virtual CudaEvent forwardAsync(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) = 0;
-
-    //! @brief Run one step for all requests and wait for completion on the host.
-    virtual void forward(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) = 0;
-
-    //! @brief Gather final beam search results for request `batchIdx`.
-    //! Result will only be available after event returned
-    [[nodiscard]] virtual CudaEvent finalize(decoder::DecoderState const& decoderState, SizeType32 batchSlot,
-        SamplingConfig const& samplingConfig, bool streaming) const
-        = 0;
-
-protected:
-    IGptDecoderBatched() = default;
-    virtual ~IGptDecoderBatched() = default;
-};
-
-} // namespace tensorrt_llm::runtime
diff --git a/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp b/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp
index c9b2bb0b937..4ee0c0b7c00 100644
--- a/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp
+++ b/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp
@@ -22,7 +22,7 @@
 #include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/logger.h"
 #include "tensorrt_llm/runtime/decoderState.h"
-#include "tensorrt_llm/runtime/iGptDecoderBatched.h"
+#include "tensorrt_llm/runtime/gptDecoderBatched.h"
 
 namespace tr = tensorrt_llm::runtime;
diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
index 08cb4d407c1..78ecc0a3cf6 100644
--- a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
+++ b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
@@ -55,6 +55,8 @@
 #include "tensorrt_llm/kernels/decodingCommon.h"
 #include "tensorrt_llm/layers/defaultDecodingParams.h"
 #include "tensorrt_llm/runtime/common.h"
+#include "tensorrt_llm/runtime/decoderState.h"
+#include "tensorrt_llm/runtime/gptDecoder.h"
 #include "tensorrt_llm/runtime/gptDecoderBatched.h"
 #include "tensorrt_llm/runtime/iBuffer.h"
 #include "tensorrt_llm/runtime/iTensor.h"
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
index c170ca81015..959412ef3d4 100644
--- a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
@@ -25,6 +25,7 @@
 #include "tensorrt_llm/batch_manager/rnnStateManager.h"
 #include "tensorrt_llm/batch_manager/sequenceSlotManager.h"
 #include "tensorrt_llm/nanobind/common/bindTypes.h"
+#include "tensorrt_llm/runtime/decoderState.h"
 #include "tensorrt_llm/runtime/gptDecoderBatched.h"
 #include "tensorrt_llm/runtime/runtimeKernels.h"
 #include "tensorrt_llm/runtime/torch.h"
diff --git a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp
index 47be92e13f5..2484db632e8 100644
--- a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp
+++ b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp
@@ -30,7 +30,6 @@
 #include "tensorrt_llm/runtime/gptDecoder.h"
 #include "tensorrt_llm/runtime/gptDecoderBatched.h"
 #include "tensorrt_llm/runtime/iBuffer.h"
-#include "tensorrt_llm/runtime/iGptDecoderBatched.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/ipcUtils.h"
 #include "tensorrt_llm/runtime/lookaheadBuffers.h"
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp b/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp
index 8f0cc3315cd..199c7299a0b 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp
+++ b/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp
@@ -28,6 +28,7 @@
 #include "tensorrt_llm/batch_manager/pauseRequests.h"
 #include "tensorrt_llm/batch_manager/peftCacheManager.h"
 #include "tensorrt_llm/runtime/decoderState.h"
+#include "tensorrt_llm/runtime/gptDecoderBatched.h"
 #include "tensorrt_llm/runtime/torch.h"
 #include "tensorrt_llm/runtime/torchView.h"
diff --git a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
index 17aa48ef308..3781b559c46 100644
--- a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
@@ -29,7 +29,6 @@
 #include "tensorrt_llm/runtime/gptDecoder.h"
 #include "tensorrt_llm/runtime/gptDecoderBatched.h"
 #include "tensorrt_llm/runtime/iBuffer.h"
-#include "tensorrt_llm/runtime/iGptDecoderBatched.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/ipcUtils.h"
 #include "tensorrt_llm/runtime/lookaheadBuffers.h"
diff --git a/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp b/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp
index 6df7b1634b8..2c310c3a599 100644
--- a/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp
+++ b/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp
@@ -26,6 +26,7 @@
 #include "tensorrt_llm/kernels/decodingKernels.h"
 #include "tensorrt_llm/runtime/bufferManager.h"
 #include "tensorrt_llm/runtime/cudaEvent.h"
+#include "tensorrt_llm/runtime/gptDecoder.h"
 
 #include
 #include
diff --git a/cpp/tests/runtime/gptDecoderBatchedTest.cpp b/cpp/tests/runtime/gptDecoderBatchedTest.cpp
index 338b974aa0d..8b8ae44b61e 100644
--- a/cpp/tests/runtime/gptDecoderBatchedTest.cpp
+++ b/cpp/tests/runtime/gptDecoderBatchedTest.cpp
@@ -24,10 +24,11 @@
 #include "tensorrt_llm/executor/types.h"
 #include "tensorrt_llm/runtime/bufferManager.h"
 #include "tensorrt_llm/runtime/common.h"
+#include "tensorrt_llm/runtime/decoderState.h"
+#include "tensorrt_llm/runtime/gptDecoder.h"
 #include "tensorrt_llm/runtime/iBuffer.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/modelConfig.h"
-#include "tensorrt_llm/runtime/runtimeKernels.h"
 #include "tensorrt_llm/runtime/worldConfig.h"
 
 #include
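Beyond relocating `decoder_batch::Input` and removing the interface, most of the churn in this patch is include hygiene: headers that only name `DecoderState` or `Input` by reference forward-declare them, and each translation unit now includes `decoderState.h` or `gptDecoder.h` itself. A generic sketch of the idiom, using illustrative `lib::Widget`/`lib::HeavyState` names that are not from this patch:

    // widget.h -- forward-declaring keeps heavyState.h out of the include graph;
    // an incomplete type is fine for reference and pointer declarations.
    namespace lib
    {
    class HeavyState;

    class Widget
    {
    public:
        void process(HeavyState const& state);
    };
    } // namespace lib

    // widget.cpp -- only the implementation needs the full definition.
    // #include "heavyState.h"
    // void lib::Widget::process(HeavyState const& state) { /* use state */ }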