From b85caa72f5aaf5e9594a8ddb168209b1f7a479ed Mon Sep 17 00:00:00 2001
From: nachiketb
Date: Thu, 21 Aug 2025 17:08:28 -0700
Subject: [PATCH 1/3] chore: intermediate commit

---
 lib/llm/src/engines.rs | 6 +-
 lib/llm/src/preprocessor.rs | 5 +-
 .../src/protocols/openai/chat_completions.rs | 2 +-
 .../openai/chat_completions/delta.rs | 69 +-
 lib/llm/src/tokenizers/hf.rs | 6 +
 lib/llm/tests/http-service.rs | 4 +-
 lib/parsers/src/reasoning/base_parser.rs | 667 +++++++++---------
 .../src/reasoning/deepseek_r1_parser.rs | 13 +-
 lib/parsers/src/reasoning/mod.rs | 56 +-
 9 files changed, 442 insertions(+), 386 deletions(-)

diff --git a/lib/llm/src/engines.rs b/lib/llm/src/engines.rs
index bd7e0e6e3c..89a8111a49 100644
--- a/lib/llm/src/engines.rs
+++ b/lib/llm/src/engines.rs
@@ -183,7 +183,7 @@ impl
         incoming_request: SingleIn<NvCreateChatCompletionRequest>,
     ) -> Result<ManyOut<Annotated<NvCreateChatCompletionStreamResponse>>, Error> {
         let (request, context) = incoming_request.transfer(());
-        let mut deltas = request.response_generator();
+        let mut deltas = request.response_generator(None, None);
         let ctx = context.context();

         let req = request.inner.messages.into_iter().next_back().unwrap();
@@ -204,12 +204,12 @@ impl
         for c in prompt.chars() {
             // we are returning characters not tokens, so there will be some postprocessing overhead
             tokio::time::sleep(*TOKEN_ECHO_DELAY).await;
-            let response = deltas.create_choice(0, Some(c.to_string()), None, None);
+            let response = deltas.create_choice(0, Some(c.to_string()), None, None, None);
             yield Annotated{ id: Some(id.to_string()), data: Some(response), event: None, comment: None };
             id += 1;
         }

-        let response = deltas.create_choice(0, None, Some(dynamo_async_openai::types::FinishReason::Stop), None);
+        let response = deltas.create_choice(0, None, None, Some(dynamo_async_openai::types::FinishReason::Stop), None);
         yield Annotated { id: Some(id.to_string()), data: Some(response), event: None, comment: None };
     };

diff --git a/lib/llm/src/preprocessor.rs b/lib/llm/src/preprocessor.rs
index 917fcf0c50..f5c3513957 100644
--- a/lib/llm/src/preprocessor.rs
+++ b/lib/llm/src/preprocessor.rs
@@ -94,6 +94,7 @@ pub struct OpenAIPreprocessor {
     formatter: Arc<dyn OAIPromptFormatter>,
     tokenizer: Arc<dyn Tokenizer>,
     model_info: Arc<dyn ModelInfo>,
+    vocab: HashMap<String, u32>,
 }

 impl OpenAIPreprocessor {
@@ -113,6 +114,7 @@ impl OpenAIPreprocessor {
                 );
             }
         };
+        let vocab = tokenizer.get_vocab(true);
         let tokenizer = Arc::new(tokenizer);

         let Some(model_info) = mdc.model_info else {
@@ -127,6 +129,7 @@ impl OpenAIPreprocessor {
             tokenizer,
             model_info,
             mdcsum,
+            vocab
         }))
     }

@@ -499,7 +502,7 @@ impl
         let (request, context) = request.into_parts();

         // create a response generator
-        let response_generator = request.response_generator();
+        let response_generator = request.response_generator(None, None);
         let mut response_generator = Box::new(response_generator);

         // convert the chat completion request to a common completion request
diff --git a/lib/llm/src/protocols/openai/chat_completions.rs b/lib/llm/src/protocols/openai/chat_completions.rs
index b97e8d7f5a..adc59bde3f 100644
--- a/lib/llm/src/protocols/openai/chat_completions.rs
+++ b/lib/llm/src/protocols/openai/chat_completions.rs
@@ -28,7 +28,7 @@ use super::{
 };

 pub mod aggregator;
-mod delta;
+pub mod delta;

 pub use aggregator::DeltaAggregator;
 pub use delta::DeltaGenerator;
diff --git a/lib/llm/src/protocols/openai/chat_completions/delta.rs b/lib/llm/src/protocols/openai/chat_completions/delta.rs
index dad744959e..17e2c9f9e4 100644
--- a/lib/llm/src/protocols/openai/chat_completions/delta.rs
+++ b/lib/llm/src/protocols/openai/chat_completions/delta.rs
@@ -1,6 +1,9 @@
 //
SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0

+use std::{collections::HashMap, sync::Arc};
+use crate::tokenizers::{traits::Tokenizer};
+
 use dynamo_parsers::{ParserResult, ReasoningParser, ReasoningParserType, ReasoningParserWrapper};

 use super::{NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse};
@@ -15,14 +18,14 @@ impl NvCreateChatCompletionRequest {
     ///
     /// # Returns
     /// * [`DeltaGenerator`] configured with model name and response options.
-    pub fn response_generator(&self) -> DeltaGenerator {
+    pub fn response_generator(&self, tokenizer: Option<&Arc<(dyn Tokenizer + 'static)>>, vocab: Option<&HashMap<String, u32>>) -> DeltaGenerator {
         let options = DeltaGeneratorOptions {
             enable_usage: true,
             enable_logprobs: self.inner.logprobs.unwrap_or(false)
                 || self.inner.top_logprobs.unwrap_or(0) > 0,
         };

-        DeltaGenerator::new(self.inner.model.clone(), options)
+        DeltaGenerator::new(self.inner.model.clone(), options, tokenizer, vocab)
     }
 }

@@ -36,7 +39,6 @@ pub struct DeltaGeneratorOptions {
 }

 /// Generates incremental chat completion responses in a streaming fashion.
-#[derive(Debug)]
 pub struct DeltaGenerator {
     /// Unique identifier for the chat completion session.
     id: String,
@@ -58,7 +60,9 @@ pub struct DeltaGenerator {

     /// Reasoning Parser object
     /// This is used to parse reasoning content in the response.
-    reasoning_parser: ReasoningParserWrapper,
+    reasoning_parser: Option<ReasoningParserWrapper>,
+
+    tokenizer: Option<Arc<dyn Tokenizer>>,
 }

 impl DeltaGenerator {
@@ -70,7 +74,7 @@ impl DeltaGenerator {
     ///
     /// # Returns
     /// * A new instance of [`DeltaGenerator`].
-    pub fn new(model: String, options: DeltaGeneratorOptions) -> Self {
+    pub fn new(model: String, options: DeltaGeneratorOptions, tokenizer: Option<&Arc<(dyn Tokenizer + 'static)>>, vocab: Option<&HashMap<String, u32>>) -> Self {
         let now = std::time::SystemTime::now()
             .duration_since(std::time::UNIX_EPOCH)
             .unwrap()
@@ -94,7 +98,17 @@ impl DeltaGenerator {
         let reasoning_parser_type = ReasoningParserType::Basic;

         // Reasoning parser wrapper
-        let reasoning_parser = reasoning_parser_type.get_reasoning_parser();
+        let reasoning_parser = if let Some(v) = vocab {
+            Some(reasoning_parser_type.get_reasoning_parser(v))
+        } else {
+            None
+        };
+
+        let tokenizer = if let Some(tokenizer) = tokenizer {
+            Some(tokenizer.clone())
+        } else {
+            None
+        };

         Self {
             id: format!("chatcmpl-{}", uuid::Uuid::new_v4()),
@@ -107,6 +121,7 @@ impl DeltaGenerator {
             msg_counter: 0,
             options,
             reasoning_parser,
+            tokenizer,
         }
     }

@@ -183,13 +198,11 @@ impl DeltaGenerator {
         })
     }

-    fn create_reasoning_content(&mut self, text: Option<String>) -> Option<ParserResult> {
-        let text = text?;
-        let parser_result = self
-            .reasoning_parser
-            .parse_reasoning_streaming_incremental(&text);
-
-        Some(parser_result)
+    fn create_reasoning_content(&mut self, token_ids: Vec<u32>) -> Option<ParserResult> {
+        if self.tokenizer.is_none() || self.reasoning_parser.is_none() {
+            return None;
+        }
+        Some(self.reasoning_parser.as_mut().unwrap().parse_reasoning_streaming_incremental(&token_ids))
     }

     /// Creates a choice within a chat completion response.
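// --- Editor's note (illustrative sketch, not part of the patch) ---
// How the optional plumbing above is meant to be driven end to end,
// assuming an `hf_tokenizer` exposing the `get_vocab(true)` accessor added
// to `tokenizers/hf.rs` below, and a chat completion `request`:
//
//     let vocab: HashMap<String, u32> = hf_tokenizer.get_vocab(true);
//     let tokenizer: Arc<dyn Tokenizer> = Arc::new(hf_tokenizer);
//     // With both supplied, each delta can be routed through the reasoning
//     // parser; with `(None, None)` the generator behaves exactly as before.
//     let mut deltas = request.response_generator(Some(&tokenizer), Some(&vocab));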
@@ -207,17 +220,12 @@ impl DeltaGenerator { &mut self, index: u32, text: Option, + reasoning_content: Option, finish_reason: Option, logprobs: Option, ) -> NvCreateChatCompletionStreamResponse { - let reasoning_parser_result = self.create_reasoning_content(text).unwrap_or_default(); - - let (normal_text, reasoning_content) = ( - reasoning_parser_result.get_some_normal_text(), - reasoning_parser_result.get_some_reasoning(), - ); let delta = dynamo_async_openai::types::ChatCompletionStreamResponseDelta { - content: normal_text, + content: text, function_call: None, tool_calls: None, role: if self.msg_counter == 0 { @@ -226,7 +234,7 @@ impl DeltaGenerator { None }, refusal: None, - reasoning_content, + reasoning_content: reasoning_content }; let choice = dynamo_async_openai::types::ChatChoiceStream { @@ -292,7 +300,7 @@ impl crate::protocols::openai::DeltaGeneratorExt Self { HuggingFaceTokenizer { tokenizer } } + + pub fn get_vocab(&self, with_added_tokens: bool) -> HashMap { + self.tokenizer.get_vocab(with_added_tokens) + } } impl Encoder for HuggingFaceTokenizer { diff --git a/lib/llm/tests/http-service.rs b/lib/llm/tests/http-service.rs index 0e122313f0..65140dfddf 100644 --- a/lib/llm/tests/http-service.rs +++ b/lib/llm/tests/http-service.rs @@ -95,12 +95,12 @@ impl let max_tokens = request.inner.max_tokens.unwrap_or(0) as u64; // let generator = NvCreateChatCompletionStreamResponse::generator(request.model.clone()); - let mut generator = request.response_generator(); + let mut generator = request.response_generator(None, None); let stream = stream! { tokio::time::sleep(std::time::Duration::from_millis(max_tokens)).await; for i in 0..10 { - let output = generator.create_choice(i,Some(format!("choice {i}")), None, None); + let output = generator.create_choice(i,Some(format!("choice {i}")), None, None, None); yield Annotated::from_data(output); } diff --git a/lib/parsers/src/reasoning/base_parser.rs b/lib/parsers/src/reasoning/base_parser.rs index 96046b3962..d4f88813bd 100644 --- a/lib/parsers/src/reasoning/base_parser.rs +++ b/lib/parsers/src/reasoning/base_parser.rs @@ -1,16 +1,18 @@ +use std::collections::HashMap; + // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// SPDX-License-Identifier: Apache-2.0 use tracing as log; use crate::{ParserResult, ReasoningParser}; -#[derive(Default, Debug, Clone)] +#[derive(Debug, Default, Clone)] pub struct BasicReasoningParser { - think_start_token: String, - think_end_token: String, + think_start_token: u32, + think_end_token: u32, _in_reasoning: bool, stream_reasoning: bool, - _buffer: String, + _buffer: Vec, stripped_think_start: bool, } @@ -20,35 +22,48 @@ impl BasicReasoningParser { think_end_token: String, force_reasoning: bool, stream_reasoning: bool, + vocab: &HashMap, ) -> Self { + let think_start_token_id = vocab + .get(&think_start_token) + .cloned() + .unwrap(); + let think_end_token_id = vocab + .get(&think_end_token) + .cloned() + .unwrap(); + Self { - think_start_token, - think_end_token, + think_start_token: think_start_token_id, + think_end_token: think_end_token_id, _in_reasoning: force_reasoning, stream_reasoning, - _buffer: String::new(), + _buffer: Vec::new(), stripped_think_start: false, } } } impl ReasoningParser for BasicReasoningParser { - fn detect_and_parse_reasoning(&self, text: &str) -> ParserResult { - log::debug!("detect_and_parse_reasoning called with text: {:?}", text); + fn detect_and_parse_reasoning(&self, token_ids: &Vec) -> ParserResult { + log::debug!("detect_and_parse_reasoning called with token_ids: {:?}", token_ids); - let in_reasoning = self._in_reasoning || text.contains(&self.think_start_token); + let in_reasoning = self._in_reasoning || token_ids.contains(&self.think_start_token); log::debug!("in_reasoning: {}", in_reasoning); if !in_reasoning { log::debug!("No reasoning detected, returning normal text."); return ParserResult { - normal_text: text.to_string(), - reasoning_text: String::new(), + normal_token_ids: token_ids.clone(), + reasoning_token_ids: Vec::new(), }; } // The text is considered to be in a reasoning block. 
- let processed_text = text.replace(&self.think_start_token, "").trim().to_string(); + let mut processed_text = token_ids.clone(); + if let Some(pos) = processed_text.iter().position(|&x| x == self.think_start_token) { + processed_text.remove(pos); + } log::debug!( "Processed text after removing think_start_token: {:?}", processed_text @@ -60,37 +75,38 @@ impl ReasoningParser for BasicReasoningParser { ); // Assume reasoning was truncated before `think_end_token` return ParserResult { - normal_text: String::new(), - reasoning_text: processed_text, + normal_token_ids: Vec::new(), + reasoning_token_ids: processed_text, }; } // Extract reasoning content - let splits: Vec<&str> = processed_text.splitn(2, &self.think_end_token).collect(); - let reasoning_text = splits.first().unwrap_or(&"").to_string(); - let normal_text = splits - .get(1) - .map(|s| s.trim().to_string()) - .unwrap_or_default(); + let (reasoning_token_ids, normal_token_ids) = if let Some(pos) = processed_text.iter().position(|&x| x == self.think_end_token) { + let reasoning_token_ids = processed_text[..pos].to_vec(); + let normal_token_ids = processed_text[pos + 1..].to_vec(); + (reasoning_token_ids, normal_token_ids) + } else { + (processed_text, Vec::new()) + }; - log::debug!("Extracted reasoning_text: {:?}", reasoning_text); - log::debug!("Extracted normal_text: {:?}", normal_text); + log::debug!("Extracted reasoning_token_ids: {:?}", reasoning_token_ids); + log::debug!("Extracted normal_token_ids: {:?}", normal_token_ids); ParserResult { - normal_text, - reasoning_text, + normal_token_ids, + reasoning_token_ids, } } - fn parse_reasoning_streaming_incremental(&mut self, text: &str) -> ParserResult { + fn parse_reasoning_streaming_incremental(&mut self, token_ids: &Vec) -> ParserResult { // Incrementally parse the streaming text - self._buffer.push_str(text); - let mut current_text = self._buffer.to_string(); + self._buffer.extend(token_ids); + let mut current_text = self._buffer.clone(); // If the current text is a prefix of the think token, keep buffering log::debug!( "parse_reasoning_streaming_incremental called with text: {:?}", - text + token_ids ); log::debug!("current buffer: {:?}", self._buffer); log::debug!("current_text: {:?}", current_text); @@ -101,335 +117,338 @@ impl ReasoningParser for BasicReasoningParser { self.stream_reasoning ); - if self.think_start_token.starts_with(¤t_text) - && self.think_start_token.as_str() != current_text.as_str() - { - return ParserResult { - normal_text: String::new(), - reasoning_text: String::new(), - }; - } - if self.think_end_token.starts_with(¤t_text) - && self.think_end_token.as_str() != current_text.as_str() - { - return ParserResult { - normal_text: String::new(), - reasoning_text: String::new(), - }; - } + // if self.think_start_token.(¤t_text) + // && self.think_start_token.as_str() != current_text.as_str() + // { + // return ParserResult { + // normal_token_ids: String::new(), + // reasoning_token_ids: String::new(), + // }; + // } + // if self.think_end_token.starts_with(¤t_text) + // && self.think_end_token.as_str() != current_text.as_str() + // { + // return ParserResult { + // normal_token_ids: String::new(), + // reasoning_token_ids: String::new(), + // }; + // } // Strip `` token if present if !self.stripped_think_start && current_text.contains(&self.think_start_token) { - current_text = current_text.replace(&self.think_start_token, ""); - self._buffer = current_text.to_string(); + if let Some(pos) = current_text.iter().position(|&x| x == self.think_start_token) { 
+ current_text.remove(pos); + } + // current_text = current_text.replace(&self.think_start_token, ""); + self._buffer = current_text.clone(); self.stripped_think_start = true; self._in_reasoning = true; } // Handle end of reasoning block let mut think_end_idx = current_text.len(); if self._in_reasoning { - think_end_idx = current_text - .find(&self.think_end_token) - .unwrap_or(current_text.len()); + if let Some(pos) = current_text.iter().position(|&x| x == self.think_end_token) { + think_end_idx = pos; + } } if self._in_reasoning && think_end_idx < current_text.len() { - let reasoning_text = ¤t_text[..think_end_idx]; + let reasoning_token_ids = ¤t_text[..think_end_idx]; self._buffer.clear(); self._in_reasoning = false; - let start_idx = think_end_idx + self.think_end_token.len(); - let normal_text = if start_idx < current_text.len() { - ¤t_text[start_idx..] + let start_idx = think_end_idx + 1; + let normal_token_ids: &[u32] = if start_idx < current_text.len() { + ¤t_text[start_idx..].to_vec() } else { - "" + &[].to_vec() }; return ParserResult { - normal_text: normal_text.to_string(), - reasoning_text: reasoning_text.trim().to_string(), + normal_token_ids: normal_token_ids.to_vec(), + reasoning_token_ids: reasoning_token_ids.to_vec(), }; } // Continue with reasoning content if self._in_reasoning && self.stream_reasoning { // Stream the content immediately - let reasoning_text = current_text; + let reasoning_token_ids = current_text; self._buffer.clear(); ParserResult { - normal_text: String::new(), - reasoning_text, + normal_token_ids: Vec::new(), + reasoning_token_ids, } } else if !self._in_reasoning { // If we're not in a reasoning block return as normal text - let normal_text = current_text; + let normal_token_ids = current_text; self._buffer.clear(); ParserResult { - normal_text, - reasoning_text: String::new(), + normal_token_ids, + reasoning_token_ids: Vec::new(), } } else { // If we are in a reasoning block but no end token is found, return the current buffer ParserResult { - normal_text: String::new(), - reasoning_text: String::new(), + normal_token_ids: Vec::new(), + reasoning_token_ids: Vec::new(), } } } } -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_detect_and_parse_reasoning_reasoning() { - let parser = - BasicReasoningParser::new("".to_string(), "".to_string(), false, true); - let result = - parser.detect_and_parse_reasoning("with reasoning and more text."); - assert_eq!(result.normal_text, "and more text."); - assert_eq!(result.reasoning_text, "with reasoning"); - } - #[test] - fn test_detect_and_parse_reasoning_reasoning_no_reasoning() { - let parser = - BasicReasoningParser::new("".to_string(), "".to_string(), false, true); - let result = parser.detect_and_parse_reasoning("This is a test without reasoning."); - assert_eq!(result.normal_text, "This is a test without reasoning."); - assert_eq!(result.reasoning_text, ""); - } - #[test] - fn test_detect_and_parse_reasoning_reasoning_truncated_reasoning() { - let parser = - BasicReasoningParser::new("".to_string(), "".to_string(), false, true); - let result = parser.detect_and_parse_reasoning("with truncated reasoning"); - assert_eq!(result.normal_text, ""); - assert_eq!(result.reasoning_text, "with truncated reasoning"); - } - - #[test] - fn test_parse_reasoning_streaming_incremental() { - let mut parser = - BasicReasoningParser::new("".to_string(), "".to_string(), false, true); - let result = parser.parse_reasoning_streaming_incremental("".to_string(), "".to_string(), false, true); - let result = parser - 
.parse_reasoning_streaming_incremental("with reasoning and more text."); - assert_eq!(result.normal_text, " and more text."); - assert_eq!(result.reasoning_text, "with reasoning"); - } - - #[test] - fn test_parse_reasoning_streaming_incremental_no_end_token() { - let mut parser = - BasicReasoningParser::new("".to_string(), "".to_string(), true, true); - let result = parser.parse_reasoning_streaming_incremental("with reasoning"); - assert_eq!(result.normal_text, ""); - assert_eq!(result.reasoning_text, "with reasoning"); - } - - #[test] - fn test_detect_and_parse_reasoning_multiple_reasoning_blocks() { - let parser = - BasicReasoningParser::new("".to_string(), "".to_string(), false, true); - let result = parser.detect_and_parse_reasoning( - "first reasoning middle second reasoning end", - ); - // The current implementation only handles the first occurrence properly - assert_eq!(result.normal_text, "middle second reasoning end"); - assert_eq!(result.reasoning_text, "first reasoning"); - } - - #[test] - fn test_streaming_multiple_reasoning_blocks() { - let mut parser = - BasicReasoningParser::new("".to_string(), "".to_string(), false, false); - let result1 = - parser.parse_reasoning_streaming_incremental("first reasoning middle"); - assert_eq!(result1.normal_text, " middle"); - assert_eq!(result1.reasoning_text, "first reasoning"); - - // Basic parser assumes only one reasoning block at a time - let result2 = - parser.parse_reasoning_streaming_incremental(" second reasoning end"); - assert_eq!(result2.normal_text, " second reasoning end"); - assert_eq!(result2.reasoning_text, ""); - } - - #[test] - fn test_partial_token_matching_opening_tag() { - let mut parser = - BasicReasoningParser::new("".to_string(), "".to_string(), false, true); - - // Feed partial opening tag - let result1 = parser.parse_reasoning_streaming_incremental("reasoning content normal text"); - assert_eq!(result2.normal_text, " normal text"); - assert_eq!(result2.reasoning_text, "reasoning content"); - } - - #[test] - fn test_partial_token_matching_closing_tag() { - let mut parser = - BasicReasoningParser::new("".to_string(), "".to_string(), false, false); - - // Start with complete opening and partial content - let result1 = parser.parse_reasoning_streaming_incremental("reasoning content normal text"); - assert_eq!(result2.normal_text, " normal text"); - assert_eq!(result2.reasoning_text, "reasoning content"); - } - - #[test] - fn test_buffer_state_persistence_across_calls() { - let mut parser = - BasicReasoningParser::new("".to_string(), "".to_string(), false, false); - - // First call - partial opening tag - let result1 = parser.parse_reasoning_streaming_incremental("part1 "); - assert_eq!(result2.normal_text, ""); - assert_eq!(result2.reasoning_text, ""); - - // Third call - more reasoning content - let result3 = parser.parse_reasoning_streaming_incremental("part2 "); - assert_eq!(result3.normal_text, ""); - assert_eq!(result3.reasoning_text, ""); - - // Fourth call - end reasoning and normal text - let result4 = parser.parse_reasoning_streaming_incremental("part3 normal"); - assert_eq!(result4.normal_text, " normal"); - assert_eq!(result4.reasoning_text, "part1 part2 part3"); - } - - #[test] - fn test_streaming_with_stream_reasoning_enabled() { - let mut parser = - BasicReasoningParser::new("".to_string(), "".to_string(), false, true); - - // Start reasoning block - let result1 = parser.parse_reasoning_streaming_incremental("reasoning "); - assert_eq!(result1.normal_text, ""); - assert_eq!(result1.reasoning_text, 
"reasoning "); - - // Continue streaming reasoning - let result2 = parser.parse_reasoning_streaming_incremental("content "); - assert_eq!(result2.normal_text, ""); - assert_eq!(result2.reasoning_text, "content "); - - // End reasoning block - let result3 = parser.parse_reasoning_streaming_incremental("more normal"); - assert_eq!(result3.normal_text, " normal"); - assert_eq!(result3.reasoning_text, "more"); - } - - #[test] - fn test_nested_reasoning_blocks() { - let parser = - BasicReasoningParser::new("".to_string(), "".to_string(), false, true); - let result = parser.detect_and_parse_reasoning( - "outer inner reasoning normal", - ); - // Current implementation should handle this by finding the first closing tag - assert_eq!(result.normal_text, "reasoning normal"); - // All tags are stripped, so inner is not included - assert_eq!(result.reasoning_text, "outer inner"); - } - - #[test] - fn test_malformed_missing_closing_tag() { - let parser = - BasicReasoningParser::new("".to_string(), "".to_string(), false, true); - let result = parser.detect_and_parse_reasoning("reasoning without closing tag"); - assert_eq!(result.normal_text, ""); - assert_eq!(result.reasoning_text, "reasoning without closing tag"); - } - - #[test] - fn test_malformed_stray_closing_tag() { - let parser = - BasicReasoningParser::new("".to_string(), "".to_string(), false, true); - let result = parser.detect_and_parse_reasoning("normal text more normal"); - assert_eq!(result.normal_text, "normal text more normal"); - assert_eq!(result.reasoning_text, ""); - } - - #[test] - fn test_malformed_multiple_opening_tags() { - let parser = - BasicReasoningParser::new("".to_string(), "".to_string(), false, true); - let result = parser - .detect_and_parse_reasoning("first second reasoning normal"); - // Should handle by replacing all opening tags and using first closing tag - assert_eq!(result.normal_text, "normal"); - assert_eq!(result.reasoning_text, "first second reasoning"); - } - - #[test] - fn test_empty_reasoning_block() { - let parser = - BasicReasoningParser::new("".to_string(), "".to_string(), false, true); - let result = parser.detect_and_parse_reasoning(" normal text"); - assert_eq!(result.normal_text, "normal text"); - assert_eq!(result.reasoning_text, ""); - } - - #[test] - fn test_whitespace_only_reasoning_block() { - let parser = - BasicReasoningParser::new("".to_string(), "".to_string(), false, true); - let result = parser.detect_and_parse_reasoning(" \n\t normal text"); - assert_eq!(result.normal_text, "normal text"); - assert_eq!(result.reasoning_text, ""); // Should be empty after trim - } - - #[test] - fn test_force_reasoning_mode() { - let parser = - BasicReasoningParser::new("".to_string(), "".to_string(), true, true); - let result = parser.detect_and_parse_reasoning("no think tags here"); - assert_eq!(result.normal_text, ""); - assert_eq!(result.reasoning_text, "no think tags here"); - } - - #[test] - fn test_streaming_reset_state_after_complete_block() { - let mut parser = - BasicReasoningParser::new("".to_string(), "".to_string(), false, false); - - // Process complete reasoning block - let result1 = - parser.parse_reasoning_streaming_incremental("reasoning normal"); - assert_eq!(result1.normal_text, " normal"); - assert_eq!(result1.reasoning_text, "reasoning"); - - // Process normal text - should not be affected by previous state - let result2 = parser.parse_reasoning_streaming_incremental(" more normal text"); - assert_eq!(result2.normal_text, " more normal text"); - assert_eq!(result2.reasoning_text, ""); 
- - // Basic parser does not expect more than one reasoning block at a time - // So this should not affect the state - let result3 = - parser.parse_reasoning_streaming_incremental(" new reasoning final"); - assert_eq!(result3.normal_text, " new reasoning final"); - assert_eq!(result3.reasoning_text, ""); - } -} +// #[cfg(test)] +// mod tests { +// use super::*; + +// #[test] +// fn test_detect_and_parse_reasoning_reasoning() { +// let parser = +// BasicReasoningParser::new("".to_string(), "".to_string(), false, true); +// let result = +// parser.detect_and_parse_reasoning("with reasoning and more text."); +// assert_eq!(result.normal_token_ids, "and more text."); +// assert_eq!(result.reasoning_token_ids, "with reasoning"); +// } +// #[test] +// fn test_detect_and_parse_reasoning_reasoning_no_reasoning() { +// let parser = +// BasicReasoningParser::new("".to_string(), "".to_string(), false, true); +// let result = parser.detect_and_parse_reasoning("This is a test without reasoning."); +// assert_eq!(result.normal_token_ids, "This is a test without reasoning."); +// assert_eq!(result.reasoning_token_ids, ""); +// } +// #[test] +// fn test_detect_and_parse_reasoning_reasoning_truncated_reasoning() { +// let parser = +// BasicReasoningParser::new("".to_string(), "".to_string(), false, true); +// let result = parser.detect_and_parse_reasoning("with truncated reasoning"); +// assert_eq!(result.normal_token_ids, ""); +// assert_eq!(result.reasoning_token_ids, "with truncated reasoning"); +// } + +// #[test] +// fn test_parse_reasoning_streaming_incremental() { +// let mut parser = +// BasicReasoningParser::new("".to_string(), "".to_string(), false, true); +// let result = parser.parse_reasoning_streaming_incremental("".to_string(), "".to_string(), false, true); +// let result = parser +// .parse_reasoning_streaming_incremental("with reasoning and more text."); +// assert_eq!(result.normal_token_ids, " and more text."); +// assert_eq!(result.reasoning_token_ids, "with reasoning"); +// } + +// #[test] +// fn test_parse_reasoning_streaming_incremental_no_end_token() { +// let mut parser = +// BasicReasoningParser::new("".to_string(), "".to_string(), true, true); +// let result = parser.parse_reasoning_streaming_incremental("with reasoning"); +// assert_eq!(result.normal_token_ids, ""); +// assert_eq!(result.reasoning_token_ids, "with reasoning"); +// } + +// #[test] +// fn test_detect_and_parse_reasoning_multiple_reasoning_blocks() { +// let parser = +// BasicReasoningParser::new("".to_string(), "".to_string(), false, true); +// let result = parser.detect_and_parse_reasoning( +// "first reasoning middle second reasoning end", +// ); +// // The current implementation only handles the first occurrence properly +// assert_eq!(result.normal_token_ids, "middle second reasoning end"); +// assert_eq!(result.reasoning_token_ids, "first reasoning"); +// } + +// #[test] +// fn test_streaming_multiple_reasoning_blocks() { +// let mut parser = +// BasicReasoningParser::new("".to_string(), "".to_string(), false, false); +// let result1 = +// parser.parse_reasoning_streaming_incremental("first reasoning middle"); +// assert_eq!(result1.normal_token_ids, " middle"); +// assert_eq!(result1.reasoning_token_ids, "first reasoning"); + +// // Basic parser assumes only one reasoning block at a time +// let result2 = +// parser.parse_reasoning_streaming_incremental(" second reasoning end"); +// assert_eq!(result2.normal_token_ids, " second reasoning end"); +// assert_eq!(result2.reasoning_token_ids, ""); +// } + +// 
#[test] +// fn test_partial_token_matching_opening_tag() { +// let mut parser = +// BasicReasoningParser::new("".to_string(), "".to_string(), false, true); + +// // Feed partial opening tag +// let result1 = parser.parse_reasoning_streaming_incremental("reasoning content normal text"); +// assert_eq!(result2.normal_token_ids, " normal text"); +// assert_eq!(result2.reasoning_token_ids, "reasoning content"); +// } + +// #[test] +// fn test_partial_token_matching_closing_tag() { +// let mut parser = +// BasicReasoningParser::new("".to_string(), "".to_string(), false, false); + +// // Start with complete opening and partial content +// let result1 = parser.parse_reasoning_streaming_incremental("reasoning content normal text"); +// assert_eq!(result2.normal_token_ids, " normal text"); +// assert_eq!(result2.reasoning_token_ids, "reasoning content"); +// } + +// #[test] +// fn test_buffer_state_persistence_across_calls() { +// let mut parser = +// BasicReasoningParser::new("".to_string(), "".to_string(), false, false); + +// // First call - partial opening tag +// let result1 = parser.parse_reasoning_streaming_incremental("part1 "); +// assert_eq!(result2.normal_token_ids, ""); +// assert_eq!(result2.reasoning_token_ids, ""); + +// // Third call - more reasoning content +// let result3 = parser.parse_reasoning_streaming_incremental("part2 "); +// assert_eq!(result3.normal_token_ids, ""); +// assert_eq!(result3.reasoning_token_ids, ""); + +// // Fourth call - end reasoning and normal text +// let result4 = parser.parse_reasoning_streaming_incremental("part3 normal"); +// assert_eq!(result4.normal_token_ids, " normal"); +// assert_eq!(result4.reasoning_token_ids, "part1 part2 part3"); +// } + +// #[test] +// fn test_streaming_with_stream_reasoning_enabled() { +// let mut parser = +// BasicReasoningParser::new("".to_string(), "".to_string(), false, true); + +// // Start reasoning block +// let result1 = parser.parse_reasoning_streaming_incremental("reasoning "); +// assert_eq!(result1.normal_token_ids, ""); +// assert_eq!(result1.reasoning_token_ids, "reasoning "); + +// // Continue streaming reasoning +// let result2 = parser.parse_reasoning_streaming_incremental("content "); +// assert_eq!(result2.normal_token_ids, ""); +// assert_eq!(result2.reasoning_token_ids, "content "); + +// // End reasoning block +// let result3 = parser.parse_reasoning_streaming_incremental("more normal"); +// assert_eq!(result3.normal_token_ids, " normal"); +// assert_eq!(result3.reasoning_token_ids, "more"); +// } + +// #[test] +// fn test_nested_reasoning_blocks() { +// let parser = +// BasicReasoningParser::new("".to_string(), "".to_string(), false, true); +// let result = parser.detect_and_parse_reasoning( +// "outer inner reasoning normal", +// ); +// // Current implementation should handle this by finding the first closing tag +// assert_eq!(result.normal_token_ids, "reasoning normal"); +// // All tags are stripped, so inner is not included +// assert_eq!(result.reasoning_token_ids, "outer inner"); +// } + +// #[test] +// fn test_malformed_missing_closing_tag() { +// let parser = +// BasicReasoningParser::new("".to_string(), "".to_string(), false, true); +// let result = parser.detect_and_parse_reasoning("reasoning without closing tag"); +// assert_eq!(result.normal_token_ids, ""); +// assert_eq!(result.reasoning_token_ids, "reasoning without closing tag"); +// } + +// #[test] +// fn test_malformed_stray_closing_tag() { +// let parser = +// BasicReasoningParser::new("".to_string(), "".to_string(), false, true); 
+// let result = parser.detect_and_parse_reasoning("normal text more normal"); +// assert_eq!(result.normal_token_ids, "normal text more normal"); +// assert_eq!(result.reasoning_token_ids, ""); +// } + +// #[test] +// fn test_malformed_multiple_opening_tags() { +// let parser = +// BasicReasoningParser::new("".to_string(), "".to_string(), false, true); +// let result = parser +// .detect_and_parse_reasoning("first second reasoning normal"); +// // Should handle by replacing all opening tags and using first closing tag +// assert_eq!(result.normal_token_ids, "normal"); +// assert_eq!(result.reasoning_token_ids, "first second reasoning"); +// } + +// #[test] +// fn test_empty_reasoning_block() { +// let parser = +// BasicReasoningParser::new("".to_string(), "".to_string(), false, true); +// let result = parser.detect_and_parse_reasoning(" normal text"); +// assert_eq!(result.normal_token_ids, "normal text"); +// assert_eq!(result.reasoning_token_ids, ""); +// } + +// #[test] +// fn test_whitespace_only_reasoning_block() { +// let parser = +// BasicReasoningParser::new("".to_string(), "".to_string(), false, true); +// let result = parser.detect_and_parse_reasoning(" \n\t normal text"); +// assert_eq!(result.normal_token_ids, "normal text"); +// assert_eq!(result.reasoning_token_ids, ""); // Should be empty after trim +// } + +// #[test] +// fn test_force_reasoning_mode() { +// let parser = +// BasicReasoningParser::new("".to_string(), "".to_string(), true, true); +// let result = parser.detect_and_parse_reasoning("no think tags here"); +// assert_eq!(result.normal_token_ids, ""); +// assert_eq!(result.reasoning_token_ids, "no think tags here"); +// } + +// #[test] +// fn test_streaming_reset_state_after_complete_block() { +// let mut parser = +// BasicReasoningParser::new("".to_string(), "".to_string(), false, false); + +// // Process complete reasoning block +// let result1 = +// parser.parse_reasoning_streaming_incremental("reasoning normal"); +// assert_eq!(result1.normal_token_ids, " normal"); +// assert_eq!(result1.reasoning_token_ids, "reasoning"); + +// // Process normal text - should not be affected by previous state +// let result2 = parser.parse_reasoning_streaming_incremental(" more normal text"); +// assert_eq!(result2.normal_token_ids, " more normal text"); +// assert_eq!(result2.reasoning_token_ids, ""); + +// // Basic parser does not expect more than one reasoning block at a time +// // So this should not affect the state +// let result3 = +// parser.parse_reasoning_streaming_incremental(" new reasoning final"); +// assert_eq!(result3.normal_token_ids, " new reasoning final"); +// assert_eq!(result3.reasoning_token_ids, ""); +// } +// } diff --git a/lib/parsers/src/reasoning/deepseek_r1_parser.rs b/lib/parsers/src/reasoning/deepseek_r1_parser.rs index 22d9a33a93..e51aa57a53 100644 --- a/lib/parsers/src/reasoning/deepseek_r1_parser.rs +++ b/lib/parsers/src/reasoning/deepseek_r1_parser.rs @@ -1,6 +1,8 @@ // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// SPDX-License-Identifier: Apache-2.0

+use std::collections::HashMap;
+
 use super::base_parser::BasicReasoningParser;
 use crate::ParserResult;
 use crate::ReasoningParser;
@@ -11,24 +13,25 @@ pub struct DeepseekR1ReasoningParser {
 }

 impl DeepseekR1ReasoningParser {
-    pub fn new() -> Self {
+    pub fn new(vocab: &HashMap<String, u32>) -> Self {
         Self {
             base: BasicReasoningParser::new(
                 "<think>".to_string(),
                 "</think>".to_string(),
                 true,
                 true,
+                vocab
             ),
         }
     }
 }

 impl ReasoningParser for DeepseekR1ReasoningParser {
-    fn parse_reasoning_streaming_incremental(&mut self, text: &str) -> ParserResult {
-        self.base.parse_reasoning_streaming_incremental(text)
+    fn parse_reasoning_streaming_incremental(&mut self, token_ids: &Vec<u32>) -> ParserResult {
+        self.base.parse_reasoning_streaming_incremental(token_ids)
     }

-    fn detect_and_parse_reasoning(&self, text: &str) -> ParserResult {
-        self.base.detect_and_parse_reasoning(text)
+    fn detect_and_parse_reasoning(&self, token_ids: &Vec<u32>) -> ParserResult {
+        self.base.detect_and_parse_reasoning(token_ids)
     }
 }
diff --git a/lib/parsers/src/reasoning/mod.rs b/lib/parsers/src/reasoning/mod.rs
index 10975c0919..1b86773c7e 100644
--- a/lib/parsers/src/reasoning/mod.rs
+++ b/lib/parsers/src/reasoning/mod.rs
@@ -4,6 +4,8 @@
 mod base_parser;
 mod deepseek_r1_parser;

+use std::collections::HashMap;
+
 // Re-export main types and functions for convenience
 pub use base_parser::BasicReasoningParser;
 pub use deepseek_r1_parser::DeepseekR1ReasoningParser;
@@ -11,40 +13,39 @@ pub use deepseek_r1_parser::DeepseekR1ReasoningParser;
 #[derive(Debug, Clone, Default)]
 pub struct ParserResult {
     /// The normal text outside of reasoning blocks.
-    pub normal_text: String,
-
+    pub normal_token_ids: Vec<u32>,
     /// The extracted reasoning text from within reasoning blocks.
-    pub reasoning_text: String,
+    pub reasoning_token_ids: Vec<u32>,
 }

-impl ParserResult {
-    pub fn get_some_reasoning(&self) -> Option<String> {
-        if self.reasoning_text.is_empty() {
-            None
-        } else {
-            Some(self.reasoning_text.clone())
-        }
-    }
+// impl ParserResult {
+//     pub fn get_some_reasoning(&self) -> Option<Vec<u32>> {
+//         if self.reasoning_token_ids.is_empty() {
+//             None
+//         } else {
+//             Some(self.reasoning_token_ids.clone())
+//         }
+//     }

-    pub fn get_some_normal_text(&self) -> Option<String> {
-        if self.normal_text.is_empty() {
-            None
-        } else {
-            Some(self.normal_text.clone())
-        }
-    }
-}
+//     pub fn get_some_normal_text(&self) -> Option<Vec<u32>> {
+//         if self.normal_token_ids.is_empty() {
+//             None
+//         } else {
+//             Some(self.normal_token_ids.clone())
+//         }
+//     }
+// }

 pub trait ReasoningParser: Send + std::fmt::Debug {
     /// Parses a standalone, non-streaming input chunk. Implementations may reset or ignore
     /// internal streaming state and should return the split of normal vs reasoning text for
     /// this complete input. Marker tokens must not be included in either output.
-    fn detect_and_parse_reasoning(&self, text: &str) -> ParserResult;
+    fn detect_and_parse_reasoning(&self, token_ids: &Vec<u32>) -> ParserResult;

     /// Parses a streaming chunk and updates internal state. The return value should be the
     /// delta: only the newly discovered normal and reasoning text attributable to this chunk
     /// (not the cumulative totals). Marker tokens must not be included in either output.
- fn parse_reasoning_streaming_incremental(&mut self, text: &str) -> ParserResult; + fn parse_reasoning_streaming_incremental(&mut self, token_ids: &Vec) -> ParserResult; } #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -60,20 +61,20 @@ pub struct ReasoningParserWrapper { } impl ReasoningParser for ReasoningParserWrapper { - fn detect_and_parse_reasoning(&self, text: &str) -> ParserResult { - self.parser.detect_and_parse_reasoning(text) + fn detect_and_parse_reasoning(&self, token_ids: &Vec) -> ParserResult { + self.parser.detect_and_parse_reasoning(token_ids) } - fn parse_reasoning_streaming_incremental(&mut self, text: &str) -> ParserResult { - self.parser.parse_reasoning_streaming_incremental(text) + fn parse_reasoning_streaming_incremental(&mut self, token_ids: &Vec) -> ParserResult { + self.parser.parse_reasoning_streaming_incremental(token_ids) } } impl ReasoningParserType { - pub fn get_reasoning_parser(self) -> ReasoningParserWrapper { + pub fn get_reasoning_parser(self, vocab: &HashMap) -> ReasoningParserWrapper { match self { ReasoningParserType::DeepseekR1 => ReasoningParserWrapper { - parser: Box::new(DeepseekR1ReasoningParser::new()), + parser: Box::new(DeepseekR1ReasoningParser::new(vocab)), }, ReasoningParserType::Basic => ReasoningParserWrapper { parser: Box::new(BasicReasoningParser::new( @@ -81,6 +82,7 @@ impl ReasoningParserType { "".into(), false, true, + vocab )), }, } From f6c5ed46b020081c56227b75733d7ebc740ab499 Mon Sep 17 00:00:00 2001 From: nachiketb Date: Thu, 21 Aug 2025 17:48:34 -0700 Subject: [PATCH 2/3] fix: working example --- lib/llm/src/preprocessor.rs | 5 +- .../openai/chat_completions/delta.rs | 69 +++++++++++++------ lib/parsers/src/reasoning/base_parser.rs | 42 ++++++----- .../src/reasoning/deepseek_r1_parser.rs | 6 +- lib/parsers/src/reasoning/mod.rs | 10 +-- 5 files changed, 82 insertions(+), 50 deletions(-) diff --git a/lib/llm/src/preprocessor.rs b/lib/llm/src/preprocessor.rs index f5c3513957..5f81035226 100644 --- a/lib/llm/src/preprocessor.rs +++ b/lib/llm/src/preprocessor.rs @@ -129,7 +129,7 @@ impl OpenAIPreprocessor { tokenizer, model_info, mdcsum, - vocab + vocab, })) } @@ -502,7 +502,8 @@ impl let (request, context) = request.into_parts(); // create a response generator - let response_generator = request.response_generator(None, None); + let response_generator = + request.response_generator(Some(&self.tokenizer), Some(&self.vocab)); let mut response_generator = Box::new(response_generator); // convert the chat completion request to a common completion request diff --git a/lib/llm/src/protocols/openai/chat_completions/delta.rs b/lib/llm/src/protocols/openai/chat_completions/delta.rs index 17e2c9f9e4..43d4bdc7b4 100644 --- a/lib/llm/src/protocols/openai/chat_completions/delta.rs +++ b/lib/llm/src/protocols/openai/chat_completions/delta.rs @@ -1,8 +1,8 @@ // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +use crate::tokenizers::traits::Tokenizer; use std::{collections::HashMap, sync::Arc}; -use crate::tokenizers::{traits::Tokenizer}; use dynamo_parsers::{ParserResult, ReasoningParser, ReasoningParserType, ReasoningParserWrapper}; @@ -18,7 +18,11 @@ impl NvCreateChatCompletionRequest { /// /// # Returns /// * [`DeltaGenerator`] configured with model name and response options. 
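// --- Editor's note (illustrative, not part of the patch) ---
// The hunk below is commit 2 tidying commit 1: the two `if let Some(...)`
// blocks in `DeltaGenerator::new` collapse into Option combinators, i.e.
//
//     // build the parser only when a vocab was supplied
//     let reasoning_parser = vocab.map(|v| reasoning_parser_type.get_reasoning_parser(v));
//     // Option<&Arc<dyn Tokenizer>> -> Option<Arc<dyn Tokenizer>>
//     tokenizer: tokenizer.cloned(),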
- pub fn response_generator(&self, tokenizer: Option<&Arc<(dyn Tokenizer + 'static)>>, vocab: Option<&HashMap>) -> DeltaGenerator { + pub fn response_generator( + &self, + tokenizer: Option<&Arc<(dyn Tokenizer + 'static)>>, + vocab: Option<&HashMap>, + ) -> DeltaGenerator { let options = DeltaGeneratorOptions { enable_usage: true, enable_logprobs: self.inner.logprobs.unwrap_or(false) @@ -74,7 +78,12 @@ impl DeltaGenerator { /// /// # Returns /// * A new instance of [`DeltaGenerator`]. - pub fn new(model: String, options: DeltaGeneratorOptions, tokenizer: Option<&Arc<(dyn Tokenizer + 'static)>>, vocab: Option<&HashMap>) -> Self { + pub fn new( + model: String, + options: DeltaGeneratorOptions, + tokenizer: Option<&Arc<(dyn Tokenizer + 'static)>>, + vocab: Option<&HashMap>, + ) -> Self { let now = std::time::SystemTime::now() .duration_since(std::time::UNIX_EPOCH) .unwrap() @@ -98,17 +107,7 @@ impl DeltaGenerator { let reasoning_parser_type = ReasoningParserType::Basic; // Reasoning parser wrapper - let reasoning_parser = if let Some(v) = vocab { - Some(reasoning_parser_type.get_reasoning_parser(v)) - } else { - None - }; - - let tokenizer = if let Some(tokenizer) = tokenizer { - Some(tokenizer.clone()) - } else { - None - }; + let reasoning_parser = vocab.map(|v| reasoning_parser_type.get_reasoning_parser(v)); Self { id: format!("chatcmpl-{}", uuid::Uuid::new_v4()), @@ -121,7 +120,7 @@ impl DeltaGenerator { msg_counter: 0, options, reasoning_parser, - tokenizer, + tokenizer: tokenizer.cloned(), } } @@ -202,7 +201,12 @@ impl DeltaGenerator { if self.tokenizer.is_none() || self.reasoning_parser.is_none() { return None; } - Some(self.reasoning_parser.as_mut().unwrap().parse_reasoning_streaming_incremental(&token_ids)) + Some( + self.reasoning_parser + .as_mut() + .unwrap() + .parse_reasoning_streaming_incremental(&token_ids), + ) } /// Creates a choice within a chat completion response. @@ -234,7 +238,7 @@ impl DeltaGenerator { None }, refusal: None, - reasoning_content: reasoning_content + reasoning_content, }; let choice = dynamo_async_openai::types::ChatChoiceStream { @@ -328,8 +332,9 @@ impl crate::protocols::openai::DeltaGeneratorExt, ) -> Self { - let think_start_token_id = vocab - .get(&think_start_token) - .cloned() - .unwrap(); - let think_end_token_id = vocab - .get(&think_end_token) - .cloned() - .unwrap(); + let think_start_token_id = vocab.get(&think_start_token).cloned().unwrap(); + let think_end_token_id = vocab.get(&think_end_token).cloned().unwrap(); Self { think_start_token: think_start_token_id, @@ -45,23 +39,29 @@ impl BasicReasoningParser { } impl ReasoningParser for BasicReasoningParser { - fn detect_and_parse_reasoning(&self, token_ids: &Vec) -> ParserResult { - log::debug!("detect_and_parse_reasoning called with token_ids: {:?}", token_ids); + fn detect_and_parse_reasoning(&self, token_ids: &[u32]) -> ParserResult { + log::debug!( + "detect_and_parse_reasoning called with token_ids: {:?}", + token_ids + ); let in_reasoning = self._in_reasoning || token_ids.contains(&self.think_start_token); log::debug!("in_reasoning: {}", in_reasoning); + let mut processed_text = token_ids.to_vec(); if !in_reasoning { log::debug!("No reasoning detected, returning normal text."); return ParserResult { - normal_token_ids: token_ids.clone(), + normal_token_ids: processed_text, reasoning_token_ids: Vec::new(), }; } // The text is considered to be in a reasoning block. 
- let mut processed_text = token_ids.clone(); - if let Some(pos) = processed_text.iter().position(|&x| x == self.think_start_token) { + if let Some(pos) = processed_text + .iter() + .position(|&x| x == self.think_start_token) + { processed_text.remove(pos); } log::debug!( @@ -81,7 +81,10 @@ impl ReasoningParser for BasicReasoningParser { } // Extract reasoning content - let (reasoning_token_ids, normal_token_ids) = if let Some(pos) = processed_text.iter().position(|&x| x == self.think_end_token) { + let (reasoning_token_ids, normal_token_ids) = if let Some(pos) = processed_text + .iter() + .position(|&x| x == self.think_end_token) + { let reasoning_token_ids = processed_text[..pos].to_vec(); let normal_token_ids = processed_text[pos + 1..].to_vec(); (reasoning_token_ids, normal_token_ids) @@ -98,7 +101,7 @@ impl ReasoningParser for BasicReasoningParser { } } - fn parse_reasoning_streaming_incremental(&mut self, token_ids: &Vec) -> ParserResult { + fn parse_reasoning_streaming_incremental(&mut self, token_ids: &[u32]) -> ParserResult { // Incrementally parse the streaming text self._buffer.extend(token_ids); let mut current_text = self._buffer.clone(); @@ -136,7 +139,10 @@ impl ReasoningParser for BasicReasoningParser { // Strip `` token if present if !self.stripped_think_start && current_text.contains(&self.think_start_token) { - if let Some(pos) = current_text.iter().position(|&x| x == self.think_start_token) { + if let Some(pos) = current_text + .iter() + .position(|&x| x == self.think_start_token) + { current_text.remove(pos); } // current_text = current_text.replace(&self.think_start_token, ""); @@ -157,9 +163,9 @@ impl ReasoningParser for BasicReasoningParser { self._in_reasoning = false; let start_idx = think_end_idx + 1; let normal_token_ids: &[u32] = if start_idx < current_text.len() { - ¤t_text[start_idx..].to_vec() + ¤t_text[start_idx..] } else { - &[].to_vec() + [].as_ref() }; return ParserResult { normal_token_ids: normal_token_ids.to_vec(), diff --git a/lib/parsers/src/reasoning/deepseek_r1_parser.rs b/lib/parsers/src/reasoning/deepseek_r1_parser.rs index e51aa57a53..28d5c6080d 100644 --- a/lib/parsers/src/reasoning/deepseek_r1_parser.rs +++ b/lib/parsers/src/reasoning/deepseek_r1_parser.rs @@ -20,18 +20,18 @@ impl DeepseekR1ReasoningParser { "".to_string(), true, true, - vocab + vocab, ), } } } impl ReasoningParser for DeepseekR1ReasoningParser { - fn parse_reasoning_streaming_incremental(&mut self, token_ids: &Vec) -> ParserResult { + fn parse_reasoning_streaming_incremental(&mut self, token_ids: &[u32]) -> ParserResult { self.base.parse_reasoning_streaming_incremental(token_ids) } - fn detect_and_parse_reasoning(&self, token_ids: &Vec) -> ParserResult { + fn detect_and_parse_reasoning(&self, token_ids: &[u32]) -> ParserResult { self.base.detect_and_parse_reasoning(token_ids) } } diff --git a/lib/parsers/src/reasoning/mod.rs b/lib/parsers/src/reasoning/mod.rs index 1b86773c7e..c2ece2bfc1 100644 --- a/lib/parsers/src/reasoning/mod.rs +++ b/lib/parsers/src/reasoning/mod.rs @@ -40,12 +40,12 @@ pub trait ReasoningParser: Send + std::fmt::Debug { /// Parses a standalone, non-streaming input chunk. Implementations may reset or ignore /// internal streaming state and should return the split of normal vs reasoning text for /// this complete input. Marker tokens must not be included in either output. 
- fn detect_and_parse_reasoning(&self, token_ids: &Vec) -> ParserResult; + fn detect_and_parse_reasoning(&self, token_ids: &[u32]) -> ParserResult; /// Parses a streaming chunk and updates internal state. The return value should be the /// delta: only the newly discovered normal and reasoning text attributable to this chunk /// (not the cumulative totals). Marker tokens must not be included in either output. - fn parse_reasoning_streaming_incremental(&mut self, token_ids: &Vec) -> ParserResult; + fn parse_reasoning_streaming_incremental(&mut self, token_ids: &[u32]) -> ParserResult; } #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -61,11 +61,11 @@ pub struct ReasoningParserWrapper { } impl ReasoningParser for ReasoningParserWrapper { - fn detect_and_parse_reasoning(&self, token_ids: &Vec) -> ParserResult { + fn detect_and_parse_reasoning(&self, token_ids: &[u32]) -> ParserResult { self.parser.detect_and_parse_reasoning(token_ids) } - fn parse_reasoning_streaming_incremental(&mut self, token_ids: &Vec) -> ParserResult { + fn parse_reasoning_streaming_incremental(&mut self, token_ids: &[u32]) -> ParserResult { self.parser.parse_reasoning_streaming_incremental(token_ids) } } @@ -82,7 +82,7 @@ impl ReasoningParserType { "".into(), false, true, - vocab + vocab, )), }, } From 26ff25ee45a8614493a3e638097609f95086bfba Mon Sep 17 00:00:00 2001 From: nachiketb Date: Fri, 22 Aug 2025 00:38:39 -0700 Subject: [PATCH 3/3] feat: add gpt oss reasoning parser --- Cargo.lock | 337 +++++++++++++++++++- lib/parsers/Cargo.toml | 3 +- lib/parsers/src/reasoning/gpt_oss_parser.rs | 89 ++++++ lib/parsers/src/reasoning/mod.rs | 23 +- 4 files changed, 432 insertions(+), 20 deletions(-) create mode 100644 lib/parsers/src/reasoning/gpt_oss_parser.rs diff --git a/Cargo.lock b/Cargo.lock index 5d0423bbb9..8e1275ac2f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -172,6 +172,17 @@ version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" +[[package]] +name = "arg_enum_proc_macro" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.104", +] + [[package]] name = "arrayref" version = "0.3.9" @@ -341,6 +352,29 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "av1-grain" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f3efb2ca85bc610acfa917b5aaa36f3fcbebed5b3182d7f877b02531c4b80c8" +dependencies = [ + "anyhow", + "arrayvec", + "log", + "nom 7.1.3", + "num-rational", + "v_frame", +] + +[[package]] +name = "avif-serialize" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47c8fbc0f831f4519fe8b810b6a7a91410ec83031b8233f730a0480029f6a23f" +dependencies = [ + "arrayvec", +] + [[package]] name = "aws-lc-rs" version = "1.13.3" @@ -693,6 +727,12 @@ version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" +[[package]] +name = "bitstream-io" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6099cdc01846bc367c4e7dd630dc5966dccf36b652fae7a74e17b640411a91b2" + [[package]] name = "blake3" 
version = "1.8.2" @@ -753,9 +793,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4" dependencies = [ "memchr", + "regex-automata 0.4.9", "serde", ] +[[package]] +name = "built" +version = "0.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56ed6191a7e78c36abdb16ab65341eefd73d64d303fffccdbb00d51e4205967b" + [[package]] name = "bumpalo" version = "3.19.0" @@ -1991,6 +2038,7 @@ version = "0.4.1" dependencies = [ "anyhow", "dynamo-async-openai", + "openai-harmony", "regex", "serde", "serde_json", @@ -2353,6 +2401,17 @@ dependencies = [ "regex", ] +[[package]] +name = "fancy-regex" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "531e46835a22af56d1e3b66f04844bed63158bc094a628bec1d321d9b4c44bf2" +dependencies = [ + "bit-set 0.5.3", + "regex-automata 0.4.9", + "regex-syntax 0.8.5", +] + [[package]] name = "fancy-regex" version = "0.14.0" @@ -3130,6 +3189,12 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + [[package]] name = "hf-hub" version = "0.4.3" @@ -3548,6 +3613,9 @@ dependencies = [ "num-traits", "png", "qoi", + "ravif", + "rayon", + "rgb", "tiff", "zune-core", "zune-jpeg", @@ -3563,6 +3631,12 @@ dependencies = [ "quick-error 2.0.1", ] +[[package]] +name = "imgref" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0263a3d970d5c054ed9312c0057b4f3bde9c0b33836d3637361d4a9e6e7a408" + [[package]] name = "indexmap" version = "1.9.3" @@ -3571,6 +3645,7 @@ checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" dependencies = [ "autocfg", "hashbrown 0.12.3", + "serde", ] [[package]] @@ -3630,6 +3705,17 @@ dependencies = [ "cfg-if 1.0.1", ] +[[package]] +name = "interpolate_name" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.104", +] + [[package]] name = "interprocess" version = "2.2.3" @@ -3703,6 +3789,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.14.0" @@ -3863,6 +3958,16 @@ dependencies = [ "uuid 1.17.0", ] +[[package]] +name = "libfuzzer-sys" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5037190e1f70cbeef565bd267599242926f724d3b8a9f510fd7e0b540cfa4404" +dependencies = [ + "arbitrary", + "cc", +] + [[package]] name = "libloading" version = "0.8.8" @@ -3976,6 +4081,15 @@ version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +[[package]] +name = "loop9" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fae87c125b03c1d2c0150c90365d7d6bcc53fb73a9acaef207d2d065860f062" +dependencies = [ + "imgref", +] + 
[[package]] name = "lrtable" version = "0.13.10" @@ -4093,6 +4207,16 @@ dependencies = [ "rawpointer", ] +[[package]] +name = "maybe-rayon" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea1f30cedd69f0a2954655f7188c6a834246d2bcf1e315e2ac40c4b24dc9519" +dependencies = [ + "cfg-if 1.0.1", + "rayon", +] + [[package]] name = "memchr" version = "2.7.5" @@ -4383,7 +4507,7 @@ dependencies = [ "rustc-hash 2.1.1", "rustfft", "safetensors 0.6.1", - "schemars", + "schemars 0.8.22", "scraper", "serde", "serde-big-array", @@ -4702,6 +4826,12 @@ version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "610a5acd306ec67f907abe5567859a3c693fb9886eb1f012ab8f2a47bef3db51" +[[package]] +name = "noop_proc_macro" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0676bb32a98c1a483ce53e500a81ad9c3d5b3f7c920c28c24e9cb0980d0b5bc8" + [[package]] name = "ntapi" version = "0.4.1" @@ -4776,6 +4906,17 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +[[package]] +name = "num-derive" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.104", +] + [[package]] name = "num-integer" version = "0.1.46" @@ -4948,6 +5089,29 @@ version = "11.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" +[[package]] +name = "openai-harmony" +version = "0.0.4" +source = "git+https://github.com/openai/harmony#508cbaa7f6b0277bd37c9bdf6d4dc8a4d51aada5" +dependencies = [ + "anyhow", + "base64 0.22.1", + "bstr", + "clap 4.5.42", + "fancy-regex 0.13.0", + "futures", + "image", + "regex", + "reqwest 0.12.22", + "rustc-hash 1.1.0", + "serde", + "serde_json", + "serde_with", + "sha1", + "sha2", + "thiserror 2.0.12", +] + [[package]] name = "openssl" version = "0.10.73" @@ -5437,6 +5601,25 @@ dependencies = [ "yansi", ] +[[package]] +name = "profiling" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3eb8486b569e12e2c32ad3e204dbaba5e4b5b216e9367044f25f1dba42341773" +dependencies = [ + "profiling-procmacros", +] + +[[package]] +name = "profiling-procmacros" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52717f9a02b6965224f95ca2a81e2e0c5c43baacd28ca057577988930b6c3d5b" +dependencies = [ + "quote", + "syn 2.0.104", +] + [[package]] name = "prometheus" version = "0.14.0" @@ -5777,6 +5960,56 @@ dependencies = [ "rand_core 0.9.3", ] +[[package]] +name = "rav1e" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd87ce80a7665b1cce111f8a16c1f3929f6547ce91ade6addf4ec86a8dda5ce9" +dependencies = [ + "arbitrary", + "arg_enum_proc_macro", + "arrayvec", + "av1-grain", + "bitstream-io", + "built", + "cfg-if 1.0.1", + "interpolate_name", + "itertools 0.12.1", + "libc", + "libfuzzer-sys", + "log", + "maybe-rayon", + "new_debug_unreachable", + "noop_proc_macro", + "num-derive", + "num-traits", + "once_cell", + "paste", + "profiling", + "rand 0.8.5", + "rand_chacha 0.3.1", + "simd_helpers", + "system-deps", + "thiserror 1.0.69", + "v_frame", + "wasm-bindgen", +] + +[[package]] +name = "ravif" +version = "0.11.20" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5825c26fddd16ab9f515930d49028a630efec172e903483c94796cfe31893e6b" +dependencies = [ + "avif-serialize", + "imgref", + "loop9", + "quick-error 2.0.1", + "rav1e", + "rayon", + "rgb", +] + [[package]] name = "raw-cpuid" version = "10.7.0" @@ -5873,6 +6106,26 @@ dependencies = [ "thiserror 2.0.12", ] +[[package]] +name = "ref-cast" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a0ae411dbe946a674d89546582cea4ba2bb8defac896622d6496f14c23ba5cf" +dependencies = [ + "ref-cast-impl", +] + +[[package]] +name = "ref-cast-impl" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1165225c21bff1f3bbce98f5a1f889949bc902d3575308cc7b0de30b4f6d27c7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.104", +] + [[package]] name = "regex" version = "1.11.1" @@ -6025,6 +6278,12 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "rgb" +version = "0.8.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6a884d2998352bb4daf0183589aec883f16a6da1f4dde84d8e2e9a5409a1ce" + [[package]] name = "ring" version = "0.17.14" @@ -6418,6 +6677,30 @@ dependencies = [ "serde_json", ] +[[package]] +name = "schemars" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd191f9397d57d581cddd31014772520aa448f65ef991055d7f61582c65165f" +dependencies = [ + "dyn-clone", + "ref-cast", + "serde", + "serde_json", +] + +[[package]] +name = "schemars" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82d20c4491bc164fa2f6c5d44565947a52ad80b9505d8e36f8d54c27c739fcd0" +dependencies = [ + "dyn-clone", + "ref-cast", + "serde", + "serde_json", +] + [[package]] name = "schemars_derive" version = "0.8.22" @@ -6663,6 +6946,38 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_with" +version = "3.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2c45cd61fefa9db6f254525d46e392b852e0e61d9a1fd36e5bd183450a556d5" +dependencies = [ + "base64 0.22.1", + "chrono", + "hex", + "indexmap 1.9.3", + "indexmap 2.10.0", + "schemars 0.9.0", + "schemars 1.0.4", + "serde", + "serde_derive", + "serde_json", + "serde_with_macros", + "time", +] + +[[package]] +name = "serde_with_macros" +version = "3.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de90945e6565ce0d9a25098082ed4ee4002e047cb59892c318d66821e14bb30f" +dependencies = [ + "darling 0.20.11", + "proc-macro2", + "quote", + "syn 2.0.104", +] + [[package]] name = "serde_yaml" version = "0.9.34+deprecated" @@ -6824,6 +7139,15 @@ version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" +[[package]] +name = "simd_helpers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95890f873bec569a0362c235787f3aca6e1e887302ba4840839bcc6459c42da6" +dependencies = [ + "quote", +] + [[package]] name = "similar" version = "2.7.0" @@ -8313,6 +8637,17 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "v_frame" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "666b7727c8875d6ab5db9533418d7c764233ac9c0cff1d469aec8fa127597be2" +dependencies = [ + "aligned-vec", + "num-traits", + "wasm-bindgen", +] + [[package]] name = "validator" 
version = "0.20.0" diff --git a/lib/parsers/Cargo.toml b/lib/parsers/Cargo.toml index d9898c5fe8..9958c2102a 100644 --- a/lib/parsers/Cargo.toml +++ b/lib/parsers/Cargo.toml @@ -32,4 +32,5 @@ serde_json = { workspace = true } tracing = { workspace = true } uuid = { workspace = true } -regex = "1" \ No newline at end of file +regex = "1" +openai-harmony = { git = "https://github.com/openai/harmony", version = "0.0.4" } diff --git a/lib/parsers/src/reasoning/gpt_oss_parser.rs b/lib/parsers/src/reasoning/gpt_oss_parser.rs new file mode 100644 index 0000000000..ade2ed5493 --- /dev/null +++ b/lib/parsers/src/reasoning/gpt_oss_parser.rs @@ -0,0 +1,89 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::ops::Deref; + +use crate::ParserResult; +use crate::ReasoningParser; + +use openai_harmony::chat::TextContent; +use openai_harmony::StreamableParser; +use openai_harmony::{load_harmony_encoding, HarmonyEncodingName, HarmonyEncoding, chat::Role}; + +#[derive(Debug)] +pub struct GptOssReasoningParser { + enc: HarmonyEncoding, +} + +impl GptOssReasoningParser { + pub fn new() -> Self { + let enc = load_harmony_encoding(HarmonyEncodingName::HarmonyGptOss).unwrap(); + Self { enc } + } +} + +impl GptOssReasoningParser { + fn reason_parsing_wrapper(&self, token_ids: &[u32]) -> ParserResult { + let mut parser = StreamableParser::new(self.enc.clone(), Some(Role::Assistant)).unwrap(); + for token_id in token_ids { + parser.process(*token_id).unwrap(); + } + let output_msgs = parser.messages(); + // let mut reasoning_token_ids = vec![]; + // let mut normal_token_ids = vec![]; + match output_msgs.len() { + 0 => { + return ParserResult { + normal_token_ids: vec![], // No normal text in this example + reasoning_token_ids: self.enc.tokenizer().encode_with_special_tokens(parser.current_content().unwrap().deref()), + } + }, + 1 => { + let mut reasoning_token_ids = vec![]; + if let Some(openai_harmony::chat::Content::Text(TextContent { text })) = output_msgs[0].content.first() { + reasoning_token_ids.extend(self.enc.tokenizer().encode_with_special_tokens(text)); + } + return ParserResult { + normal_token_ids: self.enc.tokenizer().encode_with_special_tokens(parser.current_content().unwrap().deref()), + reasoning_token_ids: reasoning_token_ids, + }; + }, + _ => { + let mut reasoning_token_ids = vec![]; + let mut normal_token_ids = vec![]; + + // Loop until second last message + for i in 0..(output_msgs.len() - 1) { + let parse_msg = &output_msgs[i]; + if let Some(openai_harmony::chat::Content::Text(TextContent { text })) = parse_msg.content.first() { + reasoning_token_ids.extend(self.enc.tokenizer().encode_with_special_tokens(text)); + } + } + + let last_msg = &output_msgs[output_msgs.len() - 1]; + + // Handle the last message + if let Some(openai_harmony::chat::Content::Text(TextContent { text })) = last_msg.content.first() { + normal_token_ids.extend(self.enc.tokenizer().encode_with_special_tokens(text)); + } + + return ParserResult { + normal_token_ids, + reasoning_token_ids, + }; + } + } + } +} + +impl ReasoningParser for GptOssReasoningParser { + + fn detect_and_parse_reasoning(&self, token_ids: &[u32]) -> ParserResult { + self.reason_parsing_wrapper(token_ids) + } + + fn parse_reasoning_streaming_incremental(&mut self, token_ids: &[u32]) -> ParserResult { + self.reason_parsing_wrapper(token_ids) + } + +} \ No newline at end of file diff --git a/lib/parsers/src/reasoning/mod.rs 
b/lib/parsers/src/reasoning/mod.rs
index c2ece2bfc1..42d635e4a9 100644
--- a/lib/parsers/src/reasoning/mod.rs
+++ b/lib/parsers/src/reasoning/mod.rs
@@ -3,6 +3,7 @@
 
 mod base_parser;
 mod deepseek_r1_parser;
+mod gpt_oss_parser;
 
 use std::collections::HashMap;
 
@@ -18,24 +19,6 @@ pub struct ParserResult {
     pub reasoning_token_ids: Vec<u32>,
 }
 
-// impl ParserResult {
-//     pub fn get_some_reasoning(&self) -> Option<Vec<u32>> {
-//         if self.reasoning_token_ids.is_empty() {
-//             None
-//         } else {
-//             Some(self.reasoning_token_ids.clone())
-//         }
-//     }
-
-//     pub fn get_some_normal_text(&self) -> Option<Vec<u32>> {
-//         if self.normal_token_ids.is_empty() {
-//             None
-//         } else {
-//             Some(self.normal_token_ids.clone())
-//         }
-//     }
-// }
-
 pub trait ReasoningParser: Send + std::fmt::Debug {
     /// Parses a standalone, non-streaming input chunk. Implementations may reset or ignore
     /// internal streaming state and should return the split of normal vs reasoning text for
@@ -53,6 +36,7 @@
 pub enum ReasoningParserType {
     DeepseekR1,
     Basic,
+    GptOss,
 }
 
 #[derive(std::fmt::Debug)]
@@ -85,6 +69,9 @@ impl ReasoningParserType {
                     vocab,
                 )),
             },
+            ReasoningParserType::GptOss => ReasoningParserWrapper {
+                parser: Box::new(gpt_oss_parser::GptOssReasoningParser::new()),
+            },
         }
     }
 }
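
For reference, a minimal usage sketch (not part of the patch) of how a caller might drive the new GptOss variant through the selection path above. The function name `split_channels`, the `vocab` map, and the chunked `token_ids` are hypothetical; this also assumes `ReasoningParserWrapper` implements the `ReasoningParser` trait, as the imports elsewhere in this series suggest. `GptOssReasoningParser` itself ignores the vocabulary, since it carries its own Harmony encoding.

use std::collections::HashMap;

use dynamo_parsers::{ReasoningParser, ReasoningParserType};

fn split_channels(vocab: &HashMap<String, u32>, chunks: &[Vec<u32>]) {
    // Other variants use `vocab` to resolve their think-tag token ids;
    // the GptOss parser relies on the Harmony encoding instead.
    let mut parser = ReasoningParserType::GptOss.get_reasoning_parser(vocab);
    for token_ids in chunks {
        let result = parser.parse_reasoning_streaming_incremental(token_ids);
        // reasoning_token_ids carries analysis-channel tokens;
        // normal_token_ids carries final-channel tokens for the client.
        println!(
            "reasoning={:?} normal={:?}",
            result.reasoning_token_ids, result.normal_token_ids
        );
    }
}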