
Commit 57d68bc

Merge branch 'main' into fix/input_prep_for_OOT_models
2 parents 90f1b6d + e968f98

2,189 files changed: +37,396 −9,728 lines


README.md

Lines changed: 2 additions & 2 deletions
@@ -9,7 +9,7 @@ TensorRT-LLM
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-12.9.1-green)](https://developer.nvidia.com/cuda-downloads)
 [![trt](https://img.shields.io/badge/TRT-10.11.0-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-1.0.0rc6-green)](./tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-1.1.0rc0-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

 [Architecture](./docs/source/torch/arch_overview.md)   |   [Performance](./docs/source/performance/perf-overview.md)   |   [Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)   |   [Documentation](./docs/source/)   |   [Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)
@@ -253,5 +253,5 @@ Deprecation is used to inform developers that some APIs and tools are no longer
 ## Useful Links
 - [Quantized models on Hugging Face](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4): A growing collection of quantized (e.g., FP8, FP4) and optimized LLMs, including [DeepSeek FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), ready for fast inference with TensorRT-LLM.
 - [NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo): A datacenter scale distributed inference serving framework that works seamlessly with TensorRT-LLM.
-- [AutoDeploy](./examples/auto_deploy/README.md): An experimental backend for TensorRT-LLM to simplify and accelerate the deployment of PyTorch models.
+- [AutoDeploy](./examples/auto_deploy/README.md): A prototype backend for TensorRT-LLM to simplify and accelerate the deployment of PyTorch models.
 - [WeChat Discussion Group](https://github.com/NVIDIA/TensorRT-LLM/issues/5359): A real-time channel for TensorRT-LLM Q&A and news.

cpp/include/tensorrt_llm/batch_manager/kvCacheEventManager.h

Lines changed: 20 additions & 2 deletions
@@ -18,6 +18,7 @@

 #include "tensorrt_llm/executor/executor.h"

+#include <atomic>
 #include <chrono>
 #include <condition_variable>
 #include <deque>
@@ -36,7 +37,8 @@ using BlockPtr = std::shared_ptr<KVCacheBlock>;
 class KVCacheEventManager
 {
 public:
-    explicit KVCacheEventManager(size_t maxKVEventEntries);
+    explicit KVCacheEventManager(size_t maxKVEventEntries, std::optional<SizeType32> attentionDpRank = std::nullopt,
+        std::optional<SizeType32> attentionDpSize = std::nullopt, SizeType32 attentionDpEventsGatherPeriodMs = 5);

     ~KVCacheEventManager();
     KVCacheEventManager(KVCacheEventManager& other) = delete;
@@ -61,14 +63,19 @@ class KVCacheEventManager
     // Worker thread which adds events to mEvents.
     void worker();

+    // Thread which exchanges events if attentionDP is enabled
+    void exchangeAttentionDpThread();
+
 private:
     // Add an event to mEventQueue
     void enqueueEvent(executor::KVCacheEvent&& event);

     /// @brief Flag to terminate the worker
-    bool mRun;
+    std::atomic<bool> mRun;
     /// @brief Worker thread
     std::thread mWorkerThread;
+    /// @brief Exchange thread for attention DP events
+    std::thread mExchangeAttentionDpThread;

     /// @brief The deque of events
     std::deque<executor::KVCacheEvent> mEvents;
@@ -91,6 +98,17 @@ class KVCacheEventManager
     size_t mMaxSize;
     /// @brief An auto-incrementing event id counter
     size_t mEventId;
+
+    /// @brief Attention DP ranks and size
+    /// If set, we will exchange KV cache events and accumulate on rank 0
+    std::optional<SizeType32> mAttentionDpRank;
+    std::optional<SizeType32> mAttentionDpSize;
+
+    /// @brief The period in milliseconds to gather attention DP events across rank
+    SizeType32 mAttentionDpEventsGatherPeriodMs;
+
+    /// @brief MPI communicator for attention DP
+    std::unique_ptr<tensorrt_llm::mpi::MpiComm> mMpiComm;
 };

 } // namespace tensorrt_llm::batch_manager::kv_cache_manager
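The shape of this change: the constructor gains an optional attention DP rank and size plus a gather period, a dedicated exchange thread joins the existing worker, and mRun becomes std::atomic<bool> because the shutdown flag is now read by two threads. Below is a minimal, self-contained sketch of that two-thread lifecycle; the class and every name in it are illustrative stand-ins, not the TensorRT-LLM implementation (the real exchange thread gathers serialized events over MPI on the configured period).

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <deque>
#include <iostream>
#include <mutex>
#include <thread>

// Sketch of the pattern the header implies: a worker draining an event queue
// and a periodic exchange loop, both watching one atomic termination flag.
class EventManagerSketch
{
public:
    explicit EventManagerSketch(int gatherPeriodMs)
        : mRun{true}
        , mWorkerThread{[this] { worker(); }}
        , mExchangeThread{[this, gatherPeriodMs] { exchange(gatherPeriodMs); }}
    {
    }

    ~EventManagerSketch()
    {
        {
            std::lock_guard<std::mutex> lock(mMutex);
            mRun.store(false); // written under the mutex so the CV wakeup is not missed
        }
        mCv.notify_all();
        mWorkerThread.join();
        mExchangeThread.join();
    }

    void enqueue(int event)
    {
        {
            std::lock_guard<std::mutex> lock(mMutex);
            mQueue.push_back(event);
        }
        mCv.notify_one();
    }

private:
    void worker()
    {
        std::unique_lock<std::mutex> lock(mMutex);
        while (true)
        {
            mCv.wait(lock, [this] { return !mQueue.empty() || !mRun.load(); });
            while (!mQueue.empty())
            {
                std::cout << "event " << mQueue.front() << '\n';
                mQueue.pop_front();
            }
            if (!mRun.load())
            {
                return;
            }
        }
    }

    void exchange(int periodMs)
    {
        // Stand-in for the periodic cross-rank gather; the real thread uses MPI.
        while (mRun.load())
        {
            std::this_thread::sleep_for(std::chrono::milliseconds(periodMs));
        }
    }

    std::atomic<bool> mRun;
    std::mutex mMutex;
    std::condition_variable mCv;
    std::deque<int> mQueue;
    std::thread mWorkerThread;
    std::thread mExchangeThread;
};

int main()
{
    EventManagerSketch manager{5};
    manager.enqueue(1);
    manager.enqueue(2);
    std::this_thread::sleep_for(std::chrono::milliseconds(20));
}

The atomic type is what lets the exchange loop poll the flag without taking the queue mutex, while the store in the destructor still happens under the mutex before notifying so the condition-variable wait cannot miss the shutdown.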

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 4 additions & 1 deletion
@@ -2027,7 +2027,7 @@ class GenericLlmRequest

         // Scatter the input tokens to other beam
         mTokens = BeamTokens(mSamplingConfig.beamWidth, inputTokens);
-        mLastTokens = VecTokens(mSamplingConfig.beamWidth);
+        mLastTokens = VecTokens(mSamplingConfig.beamWidth, inputTokens.back());

         // Init mUniqueTokens
         VecUniqueTokens uniqueTokens{inputTokens.size()};
@@ -2347,6 +2347,9 @@ class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
     void movePromptEmbeddingTableToGpu(runtime::BufferManager const& manager);

     void moveLoraWeightsToGpu(runtime::BufferManager const& manager);
+
+    // Remove LoRA weights and LoRA config tensors
+    void removeLoraTensors();
 };

 } // namespace tensorrt_llm::batch_manager
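The one-line fix above swaps VecTokens(beamWidth), which value-initializes every beam's last token to 0, for the fill constructor seeded with the last prompt token, matching the scatter of inputTokens to all beams on the preceding line. A tiny standalone illustration of the two std::vector constructors, assuming VecTokens is a vector of integer token ids as the surrounding code suggests:

#include <cassert>
#include <cstdint>
#include <vector>

using TokenIdType = std::int32_t;
using VecTokens = std::vector<TokenIdType>;

int main()
{
    VecTokens inputTokens{11, 42, 7};
    int const beamWidth = 4;

    // Before the fix: the size-only constructor value-initializes every
    // element, so each beam's "last token" starts out as 0.
    VecTokens before(beamWidth);
    assert(before == VecTokens({0, 0, 0, 0}));

    // After the fix: the fill constructor seeds every beam with the last
    // prompt token, consistent with mTokens scattering inputTokens per beam.
    VecTokens after(beamWidth, inputTokens.back());
    assert(after == VecTokens({7, 7, 7, 7}));
}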

cpp/include/tensorrt_llm/common/quantization.h

Lines changed: 57 additions & 16 deletions
@@ -122,6 +122,16 @@ class QuantMode
         return QuantMode(BaseType(1u) << 14);
     }

+    static constexpr QuantMode w4a8Mxfp4Mxfp8() noexcept
+    {
+        return QuantMode(BaseType(1u) << 15);
+    }
+
+    static constexpr QuantMode w4a16Mxfp4() noexcept
+    {
+        return QuantMode(BaseType(1u) << 16);
+    }
+
     constexpr BaseType value() const noexcept
     {
         return mValue;
@@ -202,14 +212,25 @@ class QuantMode
         return isSet(w4a8Mxfp4Fp8());
     }

+    constexpr bool hasW4a8Mxfp4Mxfp8() const noexcept
+    {
+        return isSet(w4a8Mxfp4Mxfp8());
+    }
+
+    constexpr bool hasW4a16Mxfp4() const noexcept
+    {
+        return isSet(w4a16Mxfp4());
+    }
+
     constexpr bool hasKvCacheQuant() const noexcept
     {
         return hasInt8KvCache() || hasFp8KvCache() || hasFp4KvCache();
     }

     static constexpr QuantMode fromDescription(bool quantizeWeights, bool quantizeActivations, bool perToken,
         bool perChannel, bool perGroup, bool useInt4Weights, bool useInt8KvCache, bool useFp8KvCache, bool useFp8Qdq,
-        bool useFp8RowWise, bool useW4a8QServe, bool useFp4Quant, bool useFp8BlockScales, bool useW4a8Mxfp4Fp8)
+        bool useFp8RowWise, bool useW4a8QServe, bool useFp4Quant, bool useFp8BlockScales, bool useW4a8Mxfp4Fp8,
+        bool useW4a8Mxfp4Mxfp8, bool useW4a16Mxfp4)
     {
         QuantMode quantMode{};
         if (quantizeWeights)
@@ -278,25 +299,35 @@ class QuantMode
             quantMode += w4a8Mxfp4Fp8();
         }

+        if (useW4a8Mxfp4Mxfp8)
+        {
+            quantMode += w4a8Mxfp4Mxfp8();
+        }
+
+        if (useW4a16Mxfp4)
+        {
+            quantMode += w4a16Mxfp4();
+        }
+
         return quantMode;
     }

     static constexpr QuantMode useSmoothQuant(bool perToken = false, bool perChannel = false)
     {
-        return fromDescription(
-            true, true, perToken, perChannel, false, false, false, false, false, false, false, false, false, false);
+        return fromDescription(true, true, perToken, perChannel, false, false, false, false, false, false, false, false,
+            false, false, false, false);
     }

     static constexpr QuantMode useQServe(bool perGroup)
     {
-        return fromDescription(
-            true, true, false, false, perGroup, true, false, false, false, false, true, false, false, false);
+        return fromDescription(true, true, false, false, perGroup, true, false, false, false, false, true, false, false,
+            false, false, false);
     }

     static constexpr QuantMode useWeightOnly(bool useInt4Weights = false, bool perGroup = false)
     {
         return fromDescription(true, false, false, false, perGroup, useInt4Weights, false, false, false, false, false,
-            false, false, false);
+            false, false, false, false, false);
     }

     static QuantMode const fromQuantAlgo(
@@ -353,28 +384,38 @@ class QuantMode
         }
         else if (quantAlgo == "FP8")
         {
-            quantMode = fromDescription(
-                false, false, false, false, false, false, false, false, true, false, false, false, false, false);
+            quantMode = fromDescription(false, false, false, false, false, false, false, false, true, false, false,
+                false, false, false, false, false);
         }
         else if (quantAlgo == "FP8_ROWWISE")
         {
-            quantMode = fromDescription(
-                false, false, true, true, false, false, false, false, false, true, false, false, false, false);
+            quantMode = fromDescription(false, false, true, true, false, false, false, false, false, true, false, false,
+                false, false, false, false);
         }
         else if (quantAlgo == "FP4")
        {
-            quantMode = fromDescription(
-                false, false, false, false, false, false, false, false, false, false, false, true, false, false);
+            quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false,
+                true, false, false, false, false);
         }
         else if (quantAlgo == "FP8_BLOCK_SCALES")
         {
-            quantMode = fromDescription(
-                false, false, false, false, false, false, false, false, false, false, false, false, true, false);
+            quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false,
+                false, true, false, false, false);
         }
         else if (quantAlgo == "W4A8_MXFP4_FP8")
         {
-            quantMode = fromDescription(
-                false, false, false, false, false, false, false, false, false, false, false, false, false, true);
+            quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false,
+                false, false, true, false, false);
+        }
+        else if (quantAlgo == "W4A8_MXFP4_MXFP8")
+        {
+            quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false,
+                false, false, false, true, false);
+        }
+        else if (quantAlgo == "W4A16_MXFP4")
+        {
+            quantMode = fromDescription(false, false, false, false, false, false, false, false, false, false, false,
+                false, false, false, false, true);
        }

         if (kvCacheQuantAlgo == "INT8")
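QuantMode packs each quantization feature into one bit of an integer mask: the two new algorithms claim bits 15 and 16, and because fromDescription takes positional booleans, every existing call site grows two trailing false arguments. A compact sketch of the same bit-flag pattern (a simplified illustration with only three flags, not the real class):

#include <cstdint>
#include <iostream>

// Bit positions mirror the diff (w4a8Mxfp4Fp8 = 14, w4a8Mxfp4Mxfp8 = 15,
// w4a16Mxfp4 = 16); composing modes is bitwise OR, querying is bitwise AND.
class QuantModeSketch
{
public:
    using BaseType = std::uint32_t;

    constexpr QuantModeSketch() noexcept = default;

    static constexpr QuantModeSketch w4a8Mxfp4Fp8() noexcept { return QuantModeSketch(BaseType(1u) << 14); }
    static constexpr QuantModeSketch w4a8Mxfp4Mxfp8() noexcept { return QuantModeSketch(BaseType(1u) << 15); }
    static constexpr QuantModeSketch w4a16Mxfp4() noexcept { return QuantModeSketch(BaseType(1u) << 16); }

    constexpr bool isSet(QuantModeSketch mode) const noexcept { return (mValue & mode.mValue) == mode.mValue; }
    constexpr bool hasW4a16Mxfp4() const noexcept { return isSet(w4a16Mxfp4()); }

    constexpr QuantModeSketch& operator+=(QuantModeSketch const& other) noexcept
    {
        mValue |= other.mValue;
        return *this;
    }

private:
    constexpr explicit QuantModeSketch(BaseType value) noexcept : mValue(value) {}
    BaseType mValue = 0;
};

int main()
{
    QuantModeSketch mode{};
    mode += QuantModeSketch::w4a16Mxfp4();
    std::cout << std::boolalpha << mode.hasW4a16Mxfp4() << '\n';        // true
    std::cout << mode.isSet(QuantModeSketch::w4a8Mxfp4Mxfp8()) << '\n'; // false
}

The diff itself shows the cost of positional booleans: each new flag touches every fromDescription call site, which is why the hunks above are mostly mechanical false-padding.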

cpp/include/tensorrt_llm/executor/executor.h

Lines changed: 16 additions & 2 deletions
@@ -1001,6 +1001,7 @@ class KvCacheConfig
         std::optional<FloatType> const& crossKvCacheFraction = std::nullopt,
         std::optional<RetentionPriority> secondaryOffloadMinPriority = std::nullopt, size_t eventBufferMaxSize = 0,
         bool enablePartialReuse = true, bool copyOnPartialReuse = true, bool useUvm = false,
+        SizeType32 attentionDpEventsGatherPeriodMs = 5,
         std::optional<tensorrt_llm::runtime::RuntimeDefaults> const& runtimeDefaults = std::nullopt);

     [[nodiscard]] bool getEnableBlockReuse() const;
@@ -1016,6 +1017,7 @@ class KvCacheConfig
     [[nodiscard]] std::optional<RetentionPriority> getSecondaryOffloadMinPriority() const;
     [[nodiscard]] size_t getEventBufferMaxSize() const;
     [[nodiscard]] bool getUseUvm() const;
+    [[nodiscard]] SizeType32 getAttentionDpEventsGatherPeriodMs() const;

     void setEnableBlockReuse(bool enableBlockReuse);
     void setEnablePartialReuse(bool enablePartialReuse);
@@ -1030,6 +1032,7 @@ class KvCacheConfig
     void setSecondaryOffloadMinPriority(std::optional<RetentionPriority> secondaryOffloadMinPriority);
     void setEventBufferMaxSize(size_t eventBufferMaxSize);
     void setUseUvm(bool useUvm);
+    void setAttentionDpEventsGatherPeriodMs(SizeType32 attentionDpEventsGatherPeriodMs);

     void fillEmptyFieldsFromRuntimeDefaults(tensorrt_llm::runtime::RuntimeDefaults const& runtimeDefaults);

@@ -1085,6 +1088,9 @@ class KvCacheConfig

     /// @brief Whether to use UVM for the KV cache.
     bool mUseUvm;
+
+    /// @brief The period in milliseconds to gather attention DP events across ranks
+    SizeType32 mAttentionDpEventsGatherPeriodMs;
 };

 /// @brief Configuration class for the runtime perf knobs
@@ -1702,6 +1708,12 @@ struct KVCacheUpdatedData
     explicit KVCacheUpdatedData(IdType blockHash)
         : blockHash{blockHash} {};

+    explicit KVCacheUpdatedData(IdType blockHash, std::optional<KVCacheEventDiff<SizeType32>> cacheLevel,
+        std::optional<KVCacheEventDiff<SizeType32>> priority)
+        : blockHash{blockHash}
+        , cacheLevel{cacheLevel}
+        , priority{priority} {};
+
     KVCacheUpdatedData& cacheLevelUpdated(SizeType32 oldValue, SizeType32 newValue)
     {
         cacheLevel = KVCacheEventDiff<SizeType32>{oldValue, newValue};
@@ -1726,15 +1738,17 @@ using KVCacheEventData = std::variant<KVCacheCreatedData, KVCacheStoredData, KVC

 struct KVCacheEvent
 {
-
-    KVCacheEvent(IdType eventId, KVCacheEventData data, SizeType32 windowSize);
+    KVCacheEvent(IdType eventId, KVCacheEventData data, SizeType32 windowSize,
+        std::optional<SizeType32> attentionDpRank = std::nullopt);

     /// @brief The unique id of this event
     IdType eventId;
     /// @brief The data corresponding to this event
     KVCacheEventData data;
     /// @brief The sliding window size
     SizeType32 windowSize;
+    /// @brief The attention DP rank of the event, if applicable
+    std::optional<SizeType32> attentionDpRank;
 };

 /// @brief Exposes a limited set of KV cache manager functionalities
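KVCacheEventData is a std::variant over the event payload types, and KVCacheEvent now carries an optional attention DP rank so events from runs without attention DP are representable unchanged. A hedged sketch of consuming such an event with std::visit; the payload structs below are hypothetical stand-ins for the real KVCacheCreatedData and friends:

#include <cstdint>
#include <iostream>
#include <optional>
#include <string>
#include <type_traits>
#include <variant>

// Illustrative payload types; the real ones are declared in executor.h.
struct CreatedData
{
    int numBlocks;
};

struct StoredData
{
    std::string blockHash;
};

using EventData = std::variant<CreatedData, StoredData>;

// Mirrors the extended KVCacheEvent shape: the rank is optional, mirroring
// std::optional<SizeType32> attentionDpRank in the header.
struct EventSketch
{
    std::uint64_t eventId;
    EventData data;
    int windowSize;
    std::optional<int> attentionDpRank;
};

void handle(EventSketch const& ev)
{
    // std::visit dispatches on whichever alternative the variant holds.
    std::visit(
        [](auto const& d)
        {
            using T = std::decay_t<decltype(d)>;
            if constexpr (std::is_same_v<T, CreatedData>)
            {
                std::cout << "created " << d.numBlocks << " blocks";
            }
            else
            {
                std::cout << "stored block " << d.blockHash;
            }
        },
        ev.data);
    std::cout << " (rank " << (ev.attentionDpRank ? std::to_string(*ev.attentionDpRank) : "n/a") << ")\n";
}

int main()
{
    handle(EventSketch{1, CreatedData{64}, 4096, std::nullopt});
    handle(EventSketch{2, StoredData{"abc123"}, 4096, 0});
}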

cpp/include/tensorrt_llm/executor/serialization.h

Lines changed: 47 additions & 0 deletions
@@ -302,6 +302,53 @@ class Serialization
     [[nodiscard]] static std::vector<RequestStatsPerIteration> deserializeRequestStatsPerIterationVec(
         std::vector<char>& buffer);

+    // KVCacheEvent deque
+    [[nodiscard]] static std::vector<char> serialize(std::deque<KVCacheEvent> const& kvCacheEvents);
+    [[nodiscard]] static std::deque<KVCacheEvent> deserializeKVCacheEvents(std::vector<char>& buffer);
+
+    // KVCacheEvent
+    [[nodiscard]] static size_t serializedSize(KVCacheEvent const& event);
+    static void serialize(KVCacheEvent const& event, std::ostream& os);
+    [[nodiscard]] static KVCacheEvent deserializeKVCacheEvent(std::istream& is);
+
+    // KVCacheCreatedData
+    [[nodiscard]] static size_t serializedSize(KVCacheCreatedData const& data);
+    static void serialize(KVCacheCreatedData const& data, std::ostream& os);
+    [[nodiscard]] static KVCacheCreatedData deserializeKVCacheCreatedData(std::istream& is);
+
+    // KVCacheStoredData
+    [[nodiscard]] static size_t serializedSize(KVCacheStoredData const& data);
+    static void serialize(KVCacheStoredData const& data, std::ostream& os);
+    [[nodiscard]] static KVCacheStoredData deserializeKVCacheStoredData(std::istream& is);
+
+    // KVCacheStoredBlockData
+    [[nodiscard]] static size_t serializedSize(KVCacheStoredBlockData const& data);
+    static void serialize(KVCacheStoredBlockData const& data, std::ostream& os);
+    [[nodiscard]] static KVCacheStoredBlockData deserializeKVCacheStoredBlockData(std::istream& is);
+
+    // KVCacheRemovedData
+    [[nodiscard]] static size_t serializedSize(KVCacheRemovedData const& data);
+    static void serialize(KVCacheRemovedData const& data, std::ostream& os);
+    [[nodiscard]] static KVCacheRemovedData deserializeKVCacheRemovedData(std::istream& is);
+
+    // KVCacheEventDiff
+    template <typename T>
+    [[nodiscard]] static size_t serializedSize(KVCacheEventDiff<T> const& data);
+    template <typename T>
+    static void serialize(KVCacheEventDiff<T> const& data, std::ostream& os);
+    template <typename T>
+    [[nodiscard]] static KVCacheEventDiff<T> deserializeKVCacheEventDiff(std::istream& is);
+
+    // KVCacheUpdateData
+    [[nodiscard]] static size_t serializedSize(KVCacheUpdatedData const& data);
+    static void serialize(KVCacheUpdatedData const& data, std::ostream& os);
+    [[nodiscard]] static KVCacheUpdatedData deserializeKVCacheUpdatedData(std::istream& is);
+
+    // UniqueToken
+    [[nodiscard]] static size_t serializedSize(tensorrt_llm::runtime::UniqueToken const& token);
+    static void serialize(tensorrt_llm::runtime::UniqueToken const& token, std::ostream& os);
+    [[nodiscard]] static tensorrt_llm::runtime::UniqueToken deserializeUniqueToken(std::istream& is);
+
     // String
     static std::string deserializeString(std::istream& is);
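Each newly serializable type follows the header's three-function convention: serializedSize to budget the buffer, serialize into a std::ostream, and a type-named deserialize from a std::istream (named per type because C++ cannot overload on return type alone). A minimal sketch of that triple for a hypothetical two-field struct, not the library's actual wire format:

#include <cassert>
#include <cstdint>
#include <istream>
#include <ostream>
#include <sstream>

// Hypothetical payload; the real KVCacheEventDiff et al. live in executor.h.
struct DiffSketch
{
    std::int32_t oldValue;
    std::int32_t newValue;
};

size_t serializedSize(DiffSketch const&)
{
    return 2 * sizeof(std::int32_t);
}

void serialize(DiffSketch const& d, std::ostream& os)
{
    // Fixed-width fields written raw; both ends must agree on the layout.
    os.write(reinterpret_cast<char const*>(&d.oldValue), sizeof(d.oldValue));
    os.write(reinterpret_cast<char const*>(&d.newValue), sizeof(d.newValue));
}

DiffSketch deserializeDiffSketch(std::istream& is)
{
    DiffSketch d{};
    is.read(reinterpret_cast<char*>(&d.oldValue), sizeof(d.oldValue));
    is.read(reinterpret_cast<char*>(&d.newValue), sizeof(d.newValue));
    return d;
}

int main()
{
    std::stringstream buffer;
    serialize(DiffSketch{1, 2}, buffer);
    assert(buffer.str().size() == serializedSize(DiffSketch{}));
    DiffSketch const roundTripped = deserializeDiffSketch(buffer);
    assert(roundTripped.oldValue == 1 && roundTripped.newValue == 2);
}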

cpp/include/tensorrt_llm/runtime/utils/mpiTags.h

Lines changed: 4 additions & 0 deletions
@@ -68,6 +68,10 @@ enum class MpiTag : int
     // LogitsThread
     kSpecDecLogitsId = 129,
     kSpecDecLogitsData = 1025,
+
+    // KvCacheEventManager
+    kKvCacheEventSize = 1026,
+    kKvCacheEvent = 1027
 };

 } // namespace tensorrt_llm::mpi
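Two adjacent tags suggest the usual two-message handshake for variable-length payloads: the byte count travels first under one tag so the receiver can size its buffer, then the serialized events follow under the second. A hedged sketch of that protocol in plain MPI (run with mpirun -n 2); only the tag values come from the diff, everything else is illustrative:

#include <mpi.h>

#include <cstdint>
#include <iostream>
#include <vector>

// Tag values mirror the new enum entries; the code is a generic two-message
// exchange, not the actual KVCacheEventManager implementation.
constexpr int kKvCacheEventSize = 1026;
constexpr int kKvCacheEvent = 1027;

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 1)
    {
        // Sender: announce the payload size first, then ship the bytes.
        std::vector<char> payload{'k', 'v'};
        std::uint64_t size = payload.size();
        MPI_Send(&size, 1, MPI_UINT64_T, /*dest=*/0, kKvCacheEventSize, MPI_COMM_WORLD);
        MPI_Send(payload.data(), static_cast<int>(size), MPI_CHAR, 0, kKvCacheEvent, MPI_COMM_WORLD);
    }
    else if (rank == 0)
    {
        // Receiver: learn the size, allocate, then receive the payload.
        std::uint64_t size = 0;
        MPI_Recv(&size, 1, MPI_UINT64_T, /*source=*/1, kKvCacheEventSize, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        std::vector<char> payload(size);
        MPI_Recv(payload.data(), static_cast<int>(size), MPI_CHAR, 1, kKvCacheEvent, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        std::cout << "received " << size << " bytes\n";
    }

    MPI_Finalize();
}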

cpp/kernels/fmha_v2/fmha_test.py

Lines changed: 3 additions & 3 deletions
@@ -50,7 +50,7 @@ def getSMVersion():
                          ids=["fp16", "bf16", "fp16-fp32", "e4m3"])
 @pytest.mark.parametrize('flag', [
     "-s-q 128 -paged-kv", "-s-q 63 -paged-kv", "-paged-kv",
-    "-softcapping-scale-bmm1 30", "-contiguous-q-kv"
+    "-softcapping-scale-bmm1 30", "-contiguous-q-kv", "-use-attention-sinks"
 ])
 @pytest.mark.parametrize('tiled_kernel', ["", "-force-non-tiled"])
 def test_trtllm_flash_attention_fmha(d, s, dtype, flag, tiled_kernel):
@@ -117,8 +117,8 @@ def test_trtllm_flash_attention_fmha(d, s, dtype, flag, tiled_kernel):
         f"bin/fmha.exe -d {d} -h 16 -b 8 -s {s} -min-s 128 -custom-mask -gqa 2 -v {verbose} {dtype} {epsilon} {flag} {tiled_kernel}",
         shell=True,
         check=True)
-    # alibi and softcapping-scale-bmm1 are mutually exclusive.
-    if '-softcapping-scale-bmm1' not in flag:
+    # alibi doesn't work with softcapping-scale-bmm1/use-attention-sinks.
+    if '-softcapping-scale-bmm1' not in flag and '-use-attention-sinks' not in flag:
         subprocess.run(
             f"bin/fmha.exe -d {d} -h 16 -b 8 -s {s} -min-s 128 -causal-mask -alibi -v {verbose} {dtype} {epsilon} {flag} {tiled_kernel}",
             shell=True,
