8 changes: 4 additions & 4 deletions docs/.vuepress/navbars/en/index.ts
@@ -84,14 +84,14 @@ export const enNavbar = defineNavbarConfig([
activeMatch: '^/guide/'
},
{
- text: "Agentic RAG Pipeline-Alpha",
- link: "/en/notes/guide/pipelines/AgenticRAGPipeline.md",
+ text: "Doc-to-QA Pipeline",
+ link: "/en/notes/guide/pipelines/Doc2QAPipeline.md",
icon: "solar:palette-round-linear",
activeMatch: '^/guide/'
},
{
- text: "Agentic RAG Pipeline-Beta",
- link: "/en/notes/guide/pipelines/AgenticRAGPipeline2.md",
+ text: "Agentic RAG Pipeline",
+ link: "/en/notes/guide/pipelines/AgenticRAGPipeline.md",
icon: "solar:palette-round-linear",
activeMatch: '^/guide/'
},
8 changes: 4 additions & 4 deletions docs/.vuepress/navbars/zh/index.ts
@@ -86,14 +86,14 @@ export const zhNavbar = defineNavbarConfig([
activeMatch: '^/guide/'
},
{
- text: "Agentic RAG数据合成流水线-Alpha",
- link: "/zh/notes/guide/pipelines/AgenticRAGPipeline.md",
+ text: "Doc-to-QA数据合成流水线",
+ link: "/zh/notes/guide/pipelines/Doc2QAPipeline.md",
icon: "solar:palette-round-linear",
activeMatch: '^/guide/'
},
{
- text: "Agentic RAG数据合成流水线-Beta",
- link: "/zh/notes/guide/pipelines/AgenticRAGPipeline2.md",
+ text: "Agentic RAG数据合成流水线",
+ link: "/zh/notes/guide/pipelines/AgenticRAGPipeline.md",
icon: "solar:palette-round-linear",
activeMatch: '^/guide/'
},
2 changes: 1 addition & 1 deletion docs/.vuepress/notes/en/guide.ts
@@ -52,8 +52,8 @@ export const Guide: ThemeNote = defineNoteConfig({
"TextPipeline",
"ReasoningPipeline",
"Text2SqlPipeline",
+ "Doc2QAPipeline",
"AgenticRAGPipeline",
- "AgenticRAGPipeline2",
"RAREPipeline",
"KnowledgeBaseCleaningPipeline",
"FuncCallPipeline",
2 changes: 1 addition & 1 deletion docs/.vuepress/notes/zh/guide.ts
@@ -52,8 +52,8 @@ export const Guide: ThemeNote = defineNoteConfig({
"TextPipeline",
"ReasoningPipeline",
"Text2SqlPipeline",
+ "Doc2QAPipeline",
"AgenticRAGPipeline",
- "AgenticRAGPipeline2",
"RAREPipeline",
"KnowledgeBaseCleaningPipeline",
"FuncCallPipeline",
235 changes: 33 additions & 202 deletions docs/en/notes/guide/domain_specific_operators/agenticrag_operators.md
@@ -8,7 +8,7 @@ permalink: /en/guide/agenticrag_operators/

## Overview

- AgenticRAG Operators are a specialized suite of tools designed for agentic RAG (Retrieval-Augmented Generation) tasks, with a particular focus on generating question-and-answer (QA) samples from provided text to support RL-based agentic RAG training. These operators are primarily categorized into two groups: **Data Generation Operators (Generators)** and **Processing Operators (Processors)**.
+ AgenticRAG Operators are a specialized suite of tools designed for agentic RAG (Retrieval-Augmented Generation) tasks, with a particular focus on generating question-and-answer (QA) samples from provided text to support RL-based agentic RAG training. These operators are primarily categorized into two groups: **Data Generation Operators (Generators)** and **Data Evaluation Operators (Evaluators)**.

- 🚀 **Independent Innovation**: Core algorithms developed from scratch, filling existing algorithmic gaps or further improving performance, breaking through current performance bottlenecks.
- ✨ **Open Source First**: First integration of this operator into mainstream community frameworks, facilitating use by more developers and achieving open-source sharing.
@@ -28,31 +28,19 @@ Data Generation Operators are responsible for producing RAG-related RL training
</thead>
<tbody>
<tr>
- <td class="tg-0pky">AutoPromptGenerator🚀</td>
- <td class="tg-0pky">Prompt Synthesis</td>
- <td class="tg-0pky">Generates prompts for question and answer creation tailored to specific content by leveraging large language models.</td>
- <td class="tg-0pky">-</td>
- </tr>
- <tr>
- <td class="tg-0pky">AtomicTaskGenerator✨</td>
+ <td class="tg-0pky">AgenticRAGAtomicTaskGenerator✨</td>
<td class="tg-0pky">Atomic Task Generation</td>
<td class="tg-0pky">Generates high-quality questions and verifiable answers based on the given text content.</td>
<td class="tg-0pky">Refined and improved from <a href="https://github.com/OPPO-PersonalAI/TaskCraft" target="_blank">https://github.com/OPPO-PersonalAI/TaskCraft</a></td>
</tr>
<tr>
- <td class="tg-0pky">QAGenerator✨</td>
- <td class="tg-0pky">Question and Answer Generation</td>
- <td class="tg-0pky">Produces questions and answers for given text content using large language models and generated prompts.</td>
- <td class="tg-0pky">-</td>
- </tr>
- <tr>
- <td class="tg-0pky">WidthQAGenerator✨</td>
+ <td class="tg-0pky">AgenticRAGWidthQAGenerator✨</td>
<td class="tg-0pky">QA Breadth Expansion</td>
<td class="tg-0pky">Combines multiple QA pairs to generate new, more difficult QA pairs.</td>
<td class="tg-0pky">Refined and improved from <a href="https://github.com/OPPO-PersonalAI/TaskCraft" target="_blank">https://github.com/OPPO-PersonalAI/TaskCraft</a></td>
</tr>
<tr>
- <td class="tg-0pky">DepthQAGenerator✨</td>
+ <td class="tg-0pky">AgenticRAGDepthQAGenerator✨</td>
<td class="tg-0pky">QA Depth Expansion</td>
<td class="tg-0pky">Expands individual QA pairs into new, more challenging QA pairs.</td>
<td class="tg-0pky">Refined and improved from <a href="https://github.com/OPPO-PersonalAI/TaskCraft" target="_blank">https://github.com/OPPO-PersonalAI/TaskCraft</a></td>
@@ -75,52 +75,22 @@ Data evaluation operators are responsible for assessing reinforcement learning t
</thead>
<tbody>
<tr>
- <td class="tg-0pky">QAScorer✨</td>
- <td class="tg-0pky">QA Scoring</td>
- <td class="tg-0pky">Evaluates the quality of questions, answer consistency, answer verifiability, and downstream utility for QA pairs and their related content.</td>
- <td class="tg-0pky">-</td>
- </tr>
- <tr>
- <td class="tg-0pky">F1Scorer🚀</td>
+ <td class="tg-0pky">AgenticRAGQAF1SampleEvaluator🚀</td>
<td class="tg-0pky">QA Scoring</td>
<td class="tg-0pky">Assesses the verifiability of answers with and without the presence of gold documents in QA tasks.</td>
<td class="tg-0pky">-</td>
</tr>
</tbody>
</table>


- ## Processing Operators
-
- Processing Operators are mainly tasked with choosing suitable data.
-
- <table class="tg">
- <thead>
- <tr>
- <th class="tg-0pky">Name</th>
- <th class="tg-0pky">Application Type</th>
- <th class="tg-0pky">Description</th>
- <th class="tg-0pky">Official Repository or Paper</th>
- </tr>
- </thead>
- <tbody>
- <tr>
- <td class="tg-0pky">ContentChooser🚀</td>
- <td class="tg-0pky">Content chooser</td>
- <td class="tg-0pky">Selects a subset of content from a larger collection for further processing within the pipeline.</td>
- <td class="tg-0pky">-</td>
- </tr>
- </tbody>
- </table>

## Operator Interface Usage Instructions

Specifically, for operators that specify storage paths or call models, we provide encapsulated **model interfaces** and **storage object interfaces**. You can predefine model API parameters for operators in the following way:

```python
from dataflow.llmserving import APILLMServing_request

- api_llm_serving = APILLMServing_request(
+ llm_serving = APILLMServing_request(
    api_url="your_api_url",
    model_name="model_name",
    max_workers=5
@@ -140,44 +98,15 @@ from dataflow.utils.storage import FileStorage
)
```

- The `api_llm_serving` and `self.storage` used in the following text are the interface objects defined here. Complete usage examples can be found in `test/test_agentic_rag.py`.
+ The `llm_serving` and `self.storage` objects used throughout the following sections are the interface objects defined here. A complete usage example can be found in `DataFlow/dataflow/statics/pipelines/api_pipelines/agentic_rag_pipeline.py`.
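
The `FileStorage` construction itself is elided by the hunk above; a rough sketch of a typical setup follows, in which every argument value is an assumption for illustration rather than content taken from this diff:

```python
from dataflow.utils.storage import FileStorage

# Sketch only: all argument values below are assumed, not taken from this diff.
self.storage = FileStorage(
    first_entry_file_name="your_file_path",   # seed data file for the first step
    cache_path="./cache",                     # directory for intermediate step outputs
    file_name_prefix="dataflow_cache_step",   # prefix for per-step cache files
    cache_type="jsonl",                       # serialization format for cached rows
)
```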

For parameter passing, the constructor of operator objects mainly passes information related to operator configuration, which can be configured once and called multiple times; while the `X.run()` function passes `key` information related to IO. Details can be seen in the operator description examples below.
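
As a minimal sketch of this split (operator and key names are taken from the examples below; the import path is an assumption):

```python
# Sketch: the constructor carries reusable configuration, set once;
# run() carries only per-call IO wiring (storage handle plus field names).
from dataflow.operators.generate import AgenticRAGAtomicTaskGenerator  # import path assumed

generator = AgenticRAGAtomicTaskGenerator(llm_serving=llm_serving)  # configure once

# ...then invoke as many times as needed, re-wiring only storage and keys
result = generator.run(
    storage=self.storage.step(),  # storage handle for this pipeline step
    input_key="contents",         # which field to read from
)
```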

## Detailed Operator Descriptions

### Data Generation Operators

- #### 1. AutoPromptGenerator
-
- **Function Description:** This operator is specifically designed to generate specialized prompts for creating question-and-answer pairs based on given text content.
-
- **Input Parameters:**
-
- - `__init__()`
-   - `llm_serving`: Large language model interface object to use (default: predefined value above)
- - `run()`
-   - `storage`: Storage interface object (default: predefined value above)
-   - `input_key`: Input text content field name (default: "text")
-   - `output_key`: Output generated prompt field name (default: "generated_prompt")
-
- **Key Features:**
-
- - Supports multiple types of text contents
- - Automatically generates suitable prompts
-
- **Usage Example:**
-
- ```python
- prompt_generator = AutoPromptGenerator(api_llm_serving)
- result = prompt_generator.run(
-     storage = self.storage.step(),
-     input_key = "text",
-     output_key = "generated_prompt"
- )
- ```
-
- #### 2. AtomicTaskGenerator
+ #### 1. AgenticRAGAtomicTaskGenerator

**Function Description:** This operator is used to generate appropriate high-quality questions and verifiable answers for the provided text content.

@@ -203,47 +132,17 @@
**Usage Example:**

```python
- atomic_task_gen = AtomicTaskGenerator(llm_serving=api_llm_serving)
- result = atomic_task_gen.run(
-     storage = self.storage.step(),
-     input_key = "text",
- )
- ```
-
- #### 3. QAGenerator
-
- **Function Description:** This operator generates a pair of question and answer for a special content.
-
- **Input Parameters:**
-
- - `__init__()`
-   - `llm_serving`: Large language model interface object to use (default: predefined value above)
- - `run()`
-   - `storage`: Storage interface object (default: predefined value above)
-   - `input_key`: Input text content field name (default: "text")
-   - `prompt_key`: Output answer field name (default: "generated_prompt")
-   - `output_quesion_key`: Output answer field name (default: "generated_question")
-   - `output_answer_key`: Output answer field name (default: "generated_answer")
-
- **Key Features:**
-
- - Supports multiple types of text contents
- - Generates suitable pairs of questions and answers
-
- **Usage Example:**
-
- ```python
- qa_gen = QAGenerator(llm_serving=api_llm_serving)
- result = qa_gen.run(
-     storage = self.storage.step(),
-     input_key="text",
-     prompt_key="generated_prompt",
-     output_quesion_key="generated_question",
-     output_answer_key="generated_answer"
- )
+ atomic_task_generator = AgenticRAGAtomicTaskGenerator(
+     llm_serving=self.llm_serving
+ )
+
+ result = atomic_task_generator.run(
+     storage = self.storage.step(),
+     input_key = "contents",
+ )
```

- #### 4. WidthQAGenerator
+ #### 2. AgenticRAGWidthQAGenerator

**Function Description:** This operator is used to combine two QA pairs and generate a new question.

@@ -265,16 +164,19 @@
**Usage Example:**

```python
- width_qa_gen = WidthQAGenerator(llm_serving=api_llm_serving)
- result = width_qa_gen.run(
+ width_qa_generator = AgenticRAGWidthQAGenerator(
+     llm_serving=self.llm_serving
+ )
+
+ result = width_qa_generator.run(
    storage = self.storage.step(),
    input_question_key = "question",
    input_identifier_key = "identifier",
    input_answer_key = "refined_answer"
)
```

- #### 5. DepthQAGenerator
+ #### 3. AgenticRAGDepthQAGenerator

**Function Description:** This operator is used to generate deeper questions based on existing QA pairs.

@@ -294,8 +196,11 @@
**Usage Example:**

```python
- depth_qa_gen = DepthQAGenerator(llm_serving=api_llm_serving)
- result = depth_qa_gen.run(
+ depth_qa_generator = AgenticRAGDepthQAGenerator(
+     llm_serving=self.llm_serving
+ )
+
+ result = depth_qa_generator.run(
    storage = self.storage.step(),
    input_key = "question",
    output_key = "depth_question"
)
```

@@ -304,48 +209,7 @@

### Data Evaluation Operators

- #### 1. QAScorer
-
- **Function Description:** This operator generates multiple evaluation scores for the produced question-and-answer pairs.
-
- **Input Parameters:**
-
- - `__init__()`
-   - `llm_serving`: Large language model interface object to use (default: predefined value above)
- - `run()`
-   - `storage`: Storage interface object (default: predefined value above)
-   - `input_question_key`: Input text content field name containing the generated questions (default: "generated_question")
-   - `input_answer_key`: Input text content field name containing the generated answers (default: "generated_answer")
-   - `output_question_quality_key`: Output field name for question quality grades (default: "question_quality_grades")
-   - `output_question_quality_feedback_key`: Output field name for detailed feedback on question quality (default: "question_quality_feedbacks")
-   - `output_answer_alignment_key`: Output field name for answer alignment grades (default: "answer_alignment_grades")
-   - `output_answer_alignment_feedback_key`: Output field name for detailed feedback on answer alignment (default: "answer_alignment_feedbacks")
-   - `output_answer_verifiability_key`: Output field name for answer verifiability grades (default: "answer_verifiability_grades")
-   - `output_answer_verifiability_feedback_key`: Output field name for detailed feedback on answer verifiability (default: "answer_verifiability_feedbacks")
-   - `output_downstream_value_key`: Output field name for downstream value grades (default: "downstream_value_grades")
-   - `output_downstream_value_feedback_key`: Output field name for detailed feedback on downstream value (default: "downstream_value_feedbacks")
-
- **Key Features:**
-
- - Generates multiple useful scores for further filtering
-
- **Usage Example:**
-
- ```python
- qa_scorer = QAScorer(llm_serving=api_llm_serving)
- result = qa_scorer.run(
-     storage = self.storage.step(),
-     input_question_key="generated_question",
-     input_answer_key="generated_answer",
-     output_question_quality_key="question_quality_grades",
-     output_question_quality_feedback_key="question_quality_feedbacks",
-     output_answer_alignment_key="answer_alignment_grades",
-     output_answer_alignment_feedback_key="answer_alignment_feedbacks",
-     output_answer_verifiability_key="answer_verifiability_grades",
- )
- ```
-
- #### 2. F1Scorer
+ #### 1. AgenticRAGQAF1SampleEvaluator

**Function Description:** This operator is used to evaluate the verifiability of QA tasks with and without support from gold documents.

@@ -366,44 +230,11 @@
**Usage Example:**

```python
- f1_scorer = F1Scorer(llm_serving=api_llm_serving)
+ f1_scorer = AgenticRAGQAF1SampleEvaluator()
result = f1_scorer.run(
-     storage = self.storage.step(),
-     prediction_key = "refined_answer",
-     ground_truth_key = "golden_doc_answer",
-     output_key = "F1Score",
- )
- ```
-
- ### Processing Operators
-
- #### 1. ContentChooser
-
- **Function Description:** This operator identifies and selects representative text content from a set of text contexts.
-
- **Input Parameters:**
-
- - `init()`
-   - `num_samples`: Numuber of choosen samples
-   - `method`: The method used to select from the original text contents (default: 'random')
-   - `embedding_server`: The server to generate embeddings for text contents
- - `run()`
-   - `storage`: Storage interface object (default: predefined value above)
-   - `input_key`: Input text content field name (default: "text")
-
- **Key Features:**
-
- - Supports random choose and kmean choose
- - Supports a lot of embedding models
-
- **Usage Example:**
-
- ```python
- embedding_serving = LocalModelLLMServing_vllm(hf_model_name_or_path="your_embedding_model_path", vllm_max_tokens=8192)
-
- content_chooser = ContentChooser(num_samples = 5, method = "kcenter", embedding_serving=embedding_serving)
- result = content_chooser.run(
-     storage = self.storage.step(),
-     input_key = "text",
- )
+     storage=self.storage.step(),
+     output_key="F1Score",
+     input_prediction_key="refined_answer",
+     input_ground_truth_key="golden_doc_answer"
+ )
```
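
For context, the `F1Score` written by this evaluator is a token-overlap F1 between the predicted and gold answers, as is standard in QA evaluation. A minimal sketch under that assumption follows; the operator's actual tokenization and normalization may differ:

```python
# Token-level F1 as commonly used for QA answer scoring.
# Assumes simple lowercased whitespace tokenization; the operator may differ.
from collections import Counter

def token_f1(prediction: str, ground_truth: str) -> float:
    pred = prediction.lower().split()
    gold = ground_truth.lower().split()
    if not pred or not gold:
        return float(pred == gold)  # both empty -> 1.0, one empty -> 0.0
    overlap = sum((Counter(pred) & Counter(gold)).values())  # shared token count
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred)
    recall = overlap / len(gold)
    return 2 * precision * recall / (precision + recall)

print(token_f1("the cat sat", "the cat sat down"))  # ~0.857
```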