Remove unnecessarily exposed APIs

jaedeok-nvidia · jaedeok-nvidia · commit a90b238f835a · 2025-07-11T22:07:47.000+09:00
Signed-off-by: Jaedeok Kim <jaedeokk@nvidia.com>
diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -472,11 +472,6 @@ class GenericLlmRequest
         mExcludeInputFromOutput = exclude;
     }
 
-    bool getExcludeInputFromOutput()
-    {
-        return mExcludeInputFromOutput;
-    }
-
     /// @brief Get the params of the context
     /// @return The params of the context
     [[nodiscard]] std::optional<executor::ContextPhaseParams> const& getContextPhaseParams() const noexcept
@@ -774,11 +769,6 @@ class GenericLlmRequest
         return mParentRequestId;
     }
 
-    [[nodiscard]] SizeType32 getSequenceIndex() const
-    {
-        return mSequenceIndex;
-    }
-
     /// @brief Return a vector of the last-generated tokens of shape [num_beams]
     [[nodiscard]] VecTokens const& getLastTokens()
     {
@@ -1866,11 +1856,6 @@ class GenericLlmRequest
     // current position of the prompt tuning table (only used in chunked prefill mode)
     SizeType32 mPtableCurrentPosition{0};
 
-    [[nodiscard]] std::shared_ptr<std::vector<bool>> getSequenceFinalVec() const
-    {
-        return mSequenceFinalVec;
-    }
-
 protected:
     bool mIsStreaming;
 
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
@@ -113,8 +113,6 @@ void initBindings(pybind11::module_& m)
         .def("set_generated_tokens", &GenLlmReq::setGeneratedTokens, py::arg("generated_beam_tokens"))
         .def("pause", &GenLlmReq::pause, py::arg("max_input_len"))
         .def_property("max_sent_token_len", &GenLlmReq::getMaxSentTokenLen, &GenLlmReq::setMaxSentTokenLen)
-        .def_property(
-            "exclude_input_from_output", &GenLlmReq::getExcludeInputFromOutput, &GenLlmReq::setExcludeInputFromOutput)
         .def_property_readonly("prompt_embedding_table", &GenLlmReq::getPromptEmbeddingTable)
         .def_property_readonly("multimodal_embedding", &GenLlmReq::getMultimodalEmbedding)
         .def_property_readonly("mrope_rotary_cos_sin", &GenLlmReq::getMropeRotaryCosSin)
diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py
@@ -386,11 +386,11 @@ def create_child_request(self, request_id: int):
         child_request.py_request_id = child_request.request_id
         child_request.py_llm_request_type = child_request.llm_request_type
         child_request.py_batch_idx = None
+
+        # Mimic the behavior of the original LlmRequest.
         child_request.is_attention_dp_dummy = self.is_attention_dp_dummy
         child_request.is_cuda_graph_dummy = self.is_cuda_graph_dummy
         child_request.is_dummy = self.is_dummy
-
-        # Mimic the behavior of the original LlmRequest.
         child_request.create_response = partial(create_response, child_request)
         child_request.finish_by = partial(finish_by, child_request)