From 7ec24d5e9f677fb775e7910fcdca873ed96ae3f3 Mon Sep 17 00:00:00 2001 From: wongzhenhao Date: Mon, 15 Sep 2025 15:38:28 +0800 Subject: [PATCH] [Update] Doc update for Sep 25 refactor of DataFlow --- docs/.vuepress/navbars/en/index.ts | 8 +- docs/.vuepress/navbars/zh/index.ts | 8 +- docs/.vuepress/notes/en/guide.ts | 2 +- docs/.vuepress/notes/zh/guide.ts | 2 +- .../agenticrag_operators.md | 235 ++-------- .../core_operators.md | 409 +++++++++++++++++ .../guide/pipelines/AgenticRAGPipeline.md | 191 +++----- .../guide/pipelines/AgenticRAGPipeline2.md | 155 ------- .../notes/guide/pipelines/Doc2QAPipeline.md | 228 ++++++++++ .../agenticrag_operators.md | 234 ++-------- .../core_operators.md | 415 ++++++++++++++++++ .../guide/pipelines/AgenticRAGPipeline.md | 192 +++----- .../guide/pipelines/AgenticRAGPipeline2.md | 159 ------- .../notes/guide/pipelines/Doc2QAPipeline.md | 234 ++++++++++ 14 files changed, 1479 insertions(+), 993 deletions(-) create mode 100644 docs/en/notes/guide/domain_specific_operators/core_operators.md delete mode 100644 docs/en/notes/guide/pipelines/AgenticRAGPipeline2.md create mode 100644 docs/en/notes/guide/pipelines/Doc2QAPipeline.md create mode 100644 docs/zh/notes/guide/domain_specific_operators/core_operators.md delete mode 100644 docs/zh/notes/guide/pipelines/AgenticRAGPipeline2.md create mode 100644 docs/zh/notes/guide/pipelines/Doc2QAPipeline.md diff --git a/docs/.vuepress/navbars/en/index.ts b/docs/.vuepress/navbars/en/index.ts index 1b3cdc644..fb621115e 100644 --- a/docs/.vuepress/navbars/en/index.ts +++ b/docs/.vuepress/navbars/en/index.ts @@ -84,14 +84,14 @@ export const enNavbar = defineNavbarConfig([ activeMatch: '^/guide/' }, { - text: "Agentic RAG Pipeline-Alpha", - link: "/en/notes/guide/pipelines/AgenticRAGPipeline.md", + text: "Doc-to-QA Pipeline", + link: "/en/notes/guide/pipelines/Doc2QAPipeline.md", icon: "solar:palette-round-linear", activeMatch: '^/guide/' }, { - text: "Agentic RAG Pipeline-Beta", - link: "/en/notes/guide/pipelines/AgenticRAGPipeline2.md", + text: "Agentic RAG Pipeline", + link: "/en/notes/guide/pipelines/AgenticRAGPipeline.md", icon: "solar:palette-round-linear", activeMatch: '^/guide/' }, diff --git a/docs/.vuepress/navbars/zh/index.ts b/docs/.vuepress/navbars/zh/index.ts index a44e19059..1ba33eed0 100644 --- a/docs/.vuepress/navbars/zh/index.ts +++ b/docs/.vuepress/navbars/zh/index.ts @@ -86,14 +86,14 @@ export const zhNavbar = defineNavbarConfig([ activeMatch: '^/guide/' }, { - text: "Agentic RAG数据合成流水线-Alpha", - link: "/zh/notes/guide/pipelines/AgenticRAGPipeline.md", + text: "Doc-to-QA数据合成流水线", + link: "/zh/notes/guide/pipelines/Doc2QAPipeline.md", icon: "solar:palette-round-linear", activeMatch: '^/guide/' }, { - text: "Agentic RAG数据合成流水线-Beta", - link: "/zh/notes/guide/pipelines/AgenticRAGPipeline2.md", + text: "Agentic RAG数据合成流水线", + link: "/zh/notes/guide/pipelines/AgenticRAGPipeline.md", icon: "solar:palette-round-linear", activeMatch: '^/guide/' }, diff --git a/docs/.vuepress/notes/en/guide.ts b/docs/.vuepress/notes/en/guide.ts index a2888ff10..c730f6368 100644 --- a/docs/.vuepress/notes/en/guide.ts +++ b/docs/.vuepress/notes/en/guide.ts @@ -52,8 +52,8 @@ export const Guide: ThemeNote = defineNoteConfig({ "TextPipeline", "ReasoningPipeline", "Text2SqlPipeline", + "Doc2QAPipeline", "AgenticRAGPipeline", - "AgenticRAGPipeline2", "RAREPipeline", "KnowledgeBaseCleaningPipeline", "FuncCallPipeline", diff --git a/docs/.vuepress/notes/zh/guide.ts b/docs/.vuepress/notes/zh/guide.ts index 6e54a8a96..b9ba27245 100644 --- 
a/docs/.vuepress/notes/zh/guide.ts +++ b/docs/.vuepress/notes/zh/guide.ts @@ -52,8 +52,8 @@ export const Guide: ThemeNote = defineNoteConfig({ "TextPipeline", "ReasoningPipeline", "Text2SqlPipeline", + "Doc2QAPipeline", "AgenticRAGPipeline", - "AgenticRAGPipeline2", "RAREPipeline", "KnowledgeBaseCleaningPipeline", "FuncCallPipeline", diff --git a/docs/en/notes/guide/domain_specific_operators/agenticrag_operators.md b/docs/en/notes/guide/domain_specific_operators/agenticrag_operators.md index 3c3cd51a2..7a82ff143 100644 --- a/docs/en/notes/guide/domain_specific_operators/agenticrag_operators.md +++ b/docs/en/notes/guide/domain_specific_operators/agenticrag_operators.md @@ -8,7 +8,7 @@ permalink: /en/guide/agenticrag_operators/ ## Overview -AgenticRAG Operators are a specialized suite of tools designed for agentic RAG (Retrieval-Augmented Generation) tasks, with a particular focus on generating question-and-answer (QA) samples from provided text to support RL-based agentic RAG training. These operators are primarily categorized into two groups: **Data Generation Operators (Generators)** and **Processing Operators (Processors)**. +AgenticRAG Operators are a specialized suite of tools designed for agentic RAG (Retrieval-Augmented Generation) tasks, with a particular focus on generating question-and-answer (QA) samples from provided text to support RL-based agentic RAG training. These operators are primarily categorized into two groups: **Data Generation Operators (Generators)** and **Evaluating Operators (Evaluators)**. - 🚀 **Independent Innovation**: Core algorithms developed from scratch, filling existing algorithmic gaps or further improving performance, breaking through current performance bottlenecks. - ✨ **Open Source First**: First integration of this operator into mainstream community frameworks, facilitating use by more developers and achieving open-source sharing. @@ -28,31 +28,19 @@ Data Generation Operators are responsible for producing RAG-related RL training - AutoPromptGenerator🚀 - Prompt Synthesis - Generates prompts for question and answer creation tailored to specific content by leveraging large language models. - - - - - AtomicTaskGenerator✨ + AgenticRAGAtomicTaskGenerator✨ Atomic Task Generation Generates high-quality questions and verifiable answers based on the given text content. Refined and improved from https://github.com/OPPO-PersonalAI/TaskCraft - QAGenerator✨ - Question and Answer Generation - Produces questions and answers for given text content using large language models and generated prompts. - - - - - WidthQAGenerator✨ + AgenticRAGWidthQAGenerator✨ QA Breadth Expansion Combines multiple QA pairs to generate new, more difficult QA pairs. Refined and improved from https://github.com/OPPO-PersonalAI/TaskCraft - DepthQAGenerator✨ + AgenticRAGDepthQAGenerator✨ QA Depth Expansion Expands individual QA pairs into new, more challenging QA pairs. Refined and improved from https://github.com/OPPO-PersonalAI/TaskCraft @@ -75,13 +63,7 @@ Data evaluation operators are responsible for assessing reinforcement learning t - QAScorer✨ - QA Scoring - Evaluates the quality of questions, answer consistency, answer verifiability, and downstream utility for QA pairs and their related content. - - - - - F1Scorer🚀 + AgenticRAGQAF1SampleEvaluator🚀 QA Scoring Assesses the verifiability of answers with and without the presence of gold documents in QA tasks. 
- @@ -89,30 +71,6 @@ Data evaluation operators are responsible for assessing reinforcement learning t - -## Processing Operators - -Processing Operators are mainly tasked with choosing suitable data. - - - - - - - - - - - - - - - - - - -
NameApplication TypeDescriptionOfficial Repository or Paper
ContentChooser🚀Content chooserSelects a subset of content from a larger collection for further processing within the pipeline.-
- ## Operator Interface Usage Instructions Specifically, for operators that specify storage paths or call models, we provide encapsulated **model interfaces** and **storage object interfaces**. You can predefine model API parameters for operators in the following way: @@ -120,7 +78,7 @@ Specifically, for operators that specify storage paths or call models, we provid ```python from dataflow.llmserving import APILLMServing_request -api_llm_serving = APILLMServing_request( +llm_serving = APILLMServing_request( api_url="your_api_url", model_name="model_name", max_workers=5 @@ -140,7 +98,7 @@ from dataflow.utils.storage import FileStorage ) ``` -The `api_llm_serving` and `self.storage` used in the following text are the interface objects defined here. Complete usage examples can be found in `test/test_agentic_rag.py`. +The `llm_serving` and `self.storage` used in the following text are the interface objects defined here. Complete usage examples can be found in `DataFlow/dataflow/statics/pipelines/api_pipelines/agentic_rag_pipeline.py`. For parameter passing, the constructor of operator objects mainly passes information related to operator configuration, which can be configured once and called multiple times; while the `X.run()` function passes `key` information related to IO. Details can be seen in the operator description examples below. @@ -148,36 +106,7 @@ For parameter passing, the constructor of operator objects mainly passes informa ### Data Generation Operators -#### 1. AutoPromptGenerator - -**Function Description:** This operator is specifically designed to generate specialized prompts for creating question-and-answer pairs based on given text content. - -**Input Parameters:** - -- `__init__()` - - `llm_serving`: Large language model interface object to use (default: predefined value above) -- `run()` - - `storage`: Storage interface object (default: predefined value above) - - `input_key`: Input text content field name (default: "text") - - `output_key`: Output generated prompt field name (default: "generated_prompt") - -**Key Features:** - -- Supports multiple types of text contents -- Automatically generates suitable prompts - -**Usage Example:** - -```python -prompt_generator = AutoPromptGenerator(api_llm_serving) -result = prompt_generator.run( - storage = self.storage.step(), - input_key = "text", - output_key = "generated_prompt" - ) -``` - -#### 2. AtomicTaskGenerator +#### 1. AgenticRAGAtomicTaskGenerator **Function Description:** This operator is used to generate appropriate high-quality questions and verifiable answers for the provided text content. @@ -203,47 +132,17 @@ result = prompt_generator.run( **Usage Example:** ```python -atomic_task_gen = AtomicTaskGenerator(llm_serving=api_llm_serving) -result = atomic_task_gen.run( - storage = self.storage.step(), - input_key = "text", -) -``` - -#### 3. QAGenerator - -**Function Description:** This operator generates a pair of question and answer for a special content. 
- -**Input Parameters:** - -- `__init__()` - - `llm_serving`: Large language model interface object to use (default: predefined value above) -- `run()` - - `storage`: Storage interface object (default: predefined value above) - - `input_key`: Input text content field name (default: "text") - - `prompt_key`: Output answer field name (default: "generated_prompt") - - `output_quesion_key`: Output answer field name (default: "generated_question") - - `output_answer_key`: Output answer field name (default: "generated_answer") - -**Key Features:** - -- Supports multiple types of text contents -- Generates suitable pairs of questions and answers - -**Usage Example:** +atomic_task_generator = AgenticRAGAtomicTaskGenerator( + llm_serving=self.llm_serving + ) -```python -qa_gen = QAGenerator(llm_serving=api_llm_serving) -result = qa_gen.run( +result = atomic_task_generator.run( storage = self.storage.step(), - input_key="text", - prompt_key="generated_prompt", - output_quesion_key="generated_question", - output_answer_key="generated_answer" - ) + input_key = "contents", + ) ``` -#### 4. WidthQAGenerator +#### 2. AgenticRAGWidthQAGenerator **Function Description:** This operator is used to combine two QA pairs and generate a new question. @@ -265,8 +164,11 @@ result = qa_gen.run( **Usage Example:** ```python -width_qa_gen = WidthQAGenerator(llm_serving=api_llm_serving) -result = width_qa_gen.run( +width_qa_generator = AgenticRAGWidthQAGenerator( + llm_serving=self.llm_serving + ) + +result = width_qa_generator.run( storage = self.storage.step(), input_question_key = "question", input_identifier_key = "identifier", @@ -274,7 +176,7 @@ result = width_qa_gen.run( ) ``` -#### 5. DepthQAGenerator +#### 3. AgenticRAGDepthQAGenerator **Function Description:** This operator is used to generate deeper questions based on existing QA pairs. @@ -294,8 +196,11 @@ result = width_qa_gen.run( **Usage Example:** ```python -depth_qa_gen = DepthQAGenerator(llm_serving=api_llm_serving) -result = depth_qa_gen.run( +depth_qa_generator = AgenticRAGDepthQAGenerator( + llm_serving=self.llm_serving + ) + +result = depth_qa_generator.run( storage = self.storage.step(), input_key = "question", output_key = "depth_question" @@ -304,48 +209,7 @@ result = depth_qa_gen.run( ### Data Evaluation Operators -#### 1. QAScorer - -**Function Description:** This operator generates multiple evaluation scores for the produced question-and-answer pairs. 
- -**Input Parameters:** - -- `__init__()` - - `llm_serving`: Large language model interface object to use (default: predefined value above) -- `run()` - - `storage`: Storage interface object (default: predefined value above) - - `input_question_key`: Input text content field name containing the generated questions (default: "generated_question") - - `input_answer_key`: Input text content field name containing the generated answers (default: "generated_answer") - - `output_question_quality_key`: Output field name for question quality grades (default: "question_quality_grades") - - `output_question_quality_feedback_key`: Output field name for detailed feedback on question quality (default: "question_quality_feedbacks") - - `output_answer_alignment_key`: Output field name for answer alignment grades (default: "answer_alignment_grades") - - `output_answer_alignment_feedback_key`: Output field name for detailed feedback on answer alignment (default: "answer_alignment_feedbacks") - - `output_answer_verifiability_key`: Output field name for answer verifiability grades (default: "answer_verifiability_grades") - - `output_answer_verifiability_feedback_key`: Output field name for detailed feedback on answer verifiability (default: "answer_verifiability_feedbacks") - - `output_downstream_value_key`: Output field name for downstream value grades (default: "downstream_value_grades") - - `output_downstream_value_feedback_key`: Output field name for detailed feedback on downstream value (default: "downstream_value_feedbacks") - -**Key Features:** - -- Generates multiple useful scores for further filtering - -**Usage Example:** - -```python -qa_scorer = QAScorer(llm_serving=api_llm_serving) -result = qa_scorer.run( - storage = self.storage.step(), - input_question_key="generated_question", - input_answer_key="generated_answer", - output_question_quality_key="question_quality_grades", - output_question_quality_feedback_key="question_quality_feedbacks", - output_answer_alignment_key="answer_alignment_grades", - output_answer_alignment_feedback_key="answer_alignment_feedbacks", - output_answer_verifiability_key="answer_verifiability_grades", - ) -``` - -#### 2. F1Scorer +#### 1. AgenticRAGQAF1SampleEvaluator **Function Description:** This operator is used to evaluate the verifiability of QA tasks with and without support from gold documents. @@ -366,44 +230,11 @@ result = qa_scorer.run( **Usage Example:** ```python -f1_scorer = F1Scorer(llm_serving=api_llm_serving) +f1_scorer = AgenticRAGQAF1SampleEvaluator() result = f1_scorer.run( - storage = self.storage.step(), - prediction_key = "refined_answer", - ground_truth_key = "golden_doc_answer", - output_key = "F1Score", -) -``` - -### Processing Operators - -#### 1. ContentChooser - -**Function Description:** This operator identifies and selects representative text content from a set of text contexts. 
- -**Input Parameters:** - -- `init()` - - `num_samples`: Numuber of choosen samples - - `method`: The method used to select from the original text contents (default: 'random') - - `embedding_server`: The server to generate embeddings for text contents -- `run()` - - `storage`: Storage interface object (default: predefined value above) - - `input_key`: Input text content field name (default: "text") - -**Key Features:** - -- Supports random choose and kmean choose -- Supports a lot of embedding models - -**Usage Example:** - -```python -embedding_serving = LocalModelLLMServing_vllm(hf_model_name_or_path="your_embedding_model_path", vllm_max_tokens=8192) - -content_chooser = ContentChooser(num_samples = 5, method = "kcenter", embedding_serving=embedding_serving) -result = content_chooser.run( - storage = self.storage.step(), - input_key = "text", - ) + storage=self.storage.step(), + output_key="F1Score", + input_prediction_key="refined_answer", + input_ground_truth_key="golden_doc_answer" + ) ``` \ No newline at end of file diff --git a/docs/en/notes/guide/domain_specific_operators/core_operators.md b/docs/en/notes/guide/domain_specific_operators/core_operators.md new file mode 100644 index 000000000..ad51ec978 --- /dev/null +++ b/docs/en/notes/guide/domain_specific_operators/core_operators.md @@ -0,0 +1,409 @@ +--- +title: AgenticRAG Operators +createTime: 2025/06/24 11:43:42 +permalink: /en/guide/core_operators/ +--- + +# AgenticRAG Operators + +## Overview + +AgenticRAG Operators are a specialized suite of tools designed for agentic RAG (Retrieval-Augmented Generation) tasks, with a particular focus on generating question-and-answer (QA) samples from provided text to support RL-based agentic RAG training. These operators are primarily categorized into two groups: **Data Generation Operators (Generators)** and **Evaluating Operators (Evaluators)**. + +- 🚀 **Independent Innovation**: Core algorithms developed from scratch, filling existing algorithmic gaps or further improving performance, breaking through current performance bottlenecks. +- ✨ **Open Source First**: First integration of this operator into mainstream community frameworks, facilitating use by more developers and achieving open-source sharing. + +## Data Generation Operators + +Data Generation Operators are responsible for producing RAG-related RL training data, including automated prompt creation, question and answer generation, and QA scoring. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameApplication TypeDescriptionOfficial Repository or Paper
AutoPromptGenerator🚀Prompt SynthesisGenerates prompts for question and answer creation tailored to specific content by leveraging large language models.-
AtomicTaskGenerator✨Atomic Task GenerationGenerates high-quality questions and verifiable answers based on the given text content.Refined and improved from https://github.com/OPPO-PersonalAI/TaskCraft
QAGenerator✨Question and Answer GenerationProduces questions and answers for given text content using large language models and generated prompts.-
WidthQAGenerator✨QA Breadth ExpansionCombines multiple QA pairs to generate new, more difficult QA pairs.Refined and improved from https://github.com/OPPO-PersonalAI/TaskCraft
DepthQAGenerator✨QA Depth ExpansionExpands individual QA pairs into new, more challenging QA pairs.Refined and improved from https://github.com/OPPO-PersonalAI/TaskCraft
+ +## Data Evaluation Operators + +Data evaluation operators are responsible for assessing reinforcement learning training data related to RAG, including quality scoring for questions and answers. + + + + + + + + + + + + + + + + + + + + + + + + +
NameApplication TypeDescriptionOfficial Repository or Paper
QAScorer✨QA ScoringEvaluates the quality of questions, answer consistency, answer verifiability, and downstream utility for QA pairs and their related content.-
F1Scorer🚀QA ScoringAssesses the verifiability of answers with and without the presence of gold documents in QA tasks.-
+ + +## Processing Operators + +Processing Operators are mainly tasked with choosing suitable data. + + + + + + + + + + + + + + + + + + +
NameApplication TypeDescriptionOfficial Repository or Paper
ContentChooser🚀Content chooserSelects a subset of content from a larger collection for further processing within the pipeline.-
+ +## Operator Interface Usage Instructions + +Specifically, for operators that specify storage paths or call models, we provide encapsulated **model interfaces** and **storage object interfaces**. You can predefine model API parameters for operators in the following way: + +```python +from dataflow.llmserving import APILLMServing_request + +api_llm_serving = APILLMServing_request( + api_url="your_api_url", + model_name="model_name", + max_workers=5 + ) +``` + +You can predefine storage parameters for operators in the following way: + +```python +from dataflow.utils.storage import FileStorage + + self.storage = FileStorage( + first_entry_file_name="your_file_path", + cache_path="./cache", + file_name_prefix="dataflow_cache_step", + cache_type="json", # jsonl, json, ... + ) +``` + +The `api_llm_serving` and `self.storage` used in the following text are the interface objects defined here. Complete usage examples can be found in `test/test_agentic_rag.py`. + +For parameter passing, the constructor of operator objects mainly passes information related to operator configuration, which can be configured once and called multiple times; while the `X.run()` function passes `key` information related to IO. Details can be seen in the operator description examples below. + +## Detailed Operator Descriptions + +### Data Generation Operators + +#### 1. AutoPromptGenerator + +**Function Description:** This operator is specifically designed to generate specialized prompts for creating question-and-answer pairs based on given text content. + +**Input Parameters:** + +- `__init__()` + - `llm_serving`: Large language model interface object to use (default: predefined value above) +- `run()` + - `storage`: Storage interface object (default: predefined value above) + - `input_key`: Input text content field name (default: "text") + - `output_key`: Output generated prompt field name (default: "generated_prompt") + +**Key Features:** + +- Supports multiple types of text contents +- Automatically generates suitable prompts + +**Usage Example:** + +```python +prompt_generator = AutoPromptGenerator(api_llm_serving) +result = prompt_generator.run( + storage = self.storage.step(), + input_key = "text", + output_key = "generated_prompt" + ) +``` + +#### 2. AtomicTaskGenerator + +**Function Description:** This operator is used to generate appropriate high-quality questions and verifiable answers for the provided text content. 
+
+**Input Parameters:**
+
+- `__init__()`
+  - `llm_serving`: Large language model interface object to use (default: predefined value above)
+- `run()`
+  - `storage`: Storage interface object (default: predefined value above)
+  - `input_key`: Field name of the input text content (default: `"prompts"`)
+  - `output_question_key`: Field name for the output question (default: `"question"`)
+  - `output_answer_key`: Field name for the output answer (default: `"answer"`)
+  - `output_refined_answer_key`: Field name for the output refined answer (default: `"refined_answer"`)
+  - `output_optional_answer_key`: Field name for the output optional refined answer (default: `"optional_answer"`)
+  - `output_golden_doc_answer_key`: Field name for the output answer based on gold documents (default: `"golden_doc_answer"`)
+
+**Key Features:**
+
+- Supports various types of text content
+- Capable of generating appropriate question-answer pairs
+- Generates verifiable answers and alternative answers
+
+**Usage Example:**
+
+```python
+atomic_task_gen = AtomicTaskGenerator(llm_serving=api_llm_serving)
+result = atomic_task_gen.run(
+    storage = self.storage.step(),
+    input_key = "text",
+)
+```
+
+#### 3. QAGenerator
+
+**Function Description:** This operator generates a question-and-answer pair for a given piece of content.
+
+**Input Parameters:**
+
+- `__init__()`
+  - `llm_serving`: Large language model interface object to use (default: predefined value above)
+- `run()`
+  - `storage`: Storage interface object (default: predefined value above)
+  - `input_key`: Input text content field name (default: "text")
+  - `prompt_key`: Input field name for the generated prompt (default: "generated_prompt")
+  - `output_quesion_key`: Output question field name (default: "generated_question")
+  - `output_answer_key`: Output answer field name (default: "generated_answer")
+
+**Key Features:**
+
+- Supports multiple types of text contents
+- Generates suitable pairs of questions and answers
+
+**Usage Example:**
+
+```python
+qa_gen = QAGenerator(llm_serving=api_llm_serving)
+result = qa_gen.run(
+    storage = self.storage.step(),
+    input_key="text",
+    prompt_key="generated_prompt",
+    output_quesion_key="generated_question",
+    output_answer_key="generated_answer"
+    )
+```
+
+#### 4. WidthQAGenerator
+
+**Function Description:** This operator is used to combine two QA pairs and generate a new question.
+
+**Input Parameters:**
+
+- `__init__()`
+  - `llm_serving`: Large language model interface object to use (default: predefined value above)
+- `run()`
+  - `storage`: Storage interface object (default: predefined value above)
+  - `input_question_key`: Field name for the input question (default: `"question"`)
+  - `input_identifier_key`: Field name for the input identifier (default: `"identifier"`)
+  - `input_answer_key`: Field name for the input answer (default: `"answer"`)
+  - `output_question_key`: Field name for the output question (default: `"generated_width_task"`)
+
+**Key Features:**
+
+- Combines two QA pairs to generate a more complex new question (for example, merging two single-hop questions that share an identifier into one multi-hop question).
+
+**Usage Example:**
+
+```python
+width_qa_gen = WidthQAGenerator(llm_serving=api_llm_serving)
+result = width_qa_gen.run(
+    storage = self.storage.step(),
+    input_question_key = "question",
+    input_identifier_key = "identifier",
+    input_answer_key = "refined_answer"
+)
+```
+
+#### 5. DepthQAGenerator
+
+**Function Description:** This operator is used to generate deeper questions based on existing QA pairs.
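+
+For intuition, depth expansion rewrites a question so that answering it requires at least one additional reasoning or retrieval hop. A hypothetical before/after record (illustrative values only, not actual operator output) might look like this:
+
+```python
+qa_before = {"question": "Who founded SpaceX?", "answer": "Elon Musk"}
+# A deeper variant replaces the named entity with a description that must
+# itself be resolved first, making the question harder to answer directly:
+qa_after = {"question": "Who founded the company that developed the Falcon 9 rocket?",
+            "answer": "Elon Musk"}
+```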
+ +**Input Parameters:** + +- `__init__()` + - `llm_serving`: Large language model interface object to use (default: predefined value above) +- `run()` + - `storage`: Storage interface object (default: predefined value above) + - `input_key`: Field name for the input (default: `"question"`) + - `output_key`: Field name for the output (default: `"depth_question"`) + +**Key Features:** + +- Generates deeper questions based on existing QA pairs + +**Usage Example:** + +```python +depth_qa_gen = DepthQAGenerator(llm_serving=api_llm_serving) +result = depth_qa_gen.run( + storage = self.storage.step(), + input_key = "question", + output_key = "depth_question" +) +``` + +### Data Evaluation Operators + +#### 1. QAScorer + +**Function Description:** This operator generates multiple evaluation scores for the produced question-and-answer pairs. + +**Input Parameters:** + +- `__init__()` + - `llm_serving`: Large language model interface object to use (default: predefined value above) +- `run()` + - `storage`: Storage interface object (default: predefined value above) + - `input_question_key`: Input text content field name containing the generated questions (default: "generated_question") + - `input_answer_key`: Input text content field name containing the generated answers (default: "generated_answer") + - `output_question_quality_key`: Output field name for question quality grades (default: "question_quality_grades") + - `output_question_quality_feedback_key`: Output field name for detailed feedback on question quality (default: "question_quality_feedbacks") + - `output_answer_alignment_key`: Output field name for answer alignment grades (default: "answer_alignment_grades") + - `output_answer_alignment_feedback_key`: Output field name for detailed feedback on answer alignment (default: "answer_alignment_feedbacks") + - `output_answer_verifiability_key`: Output field name for answer verifiability grades (default: "answer_verifiability_grades") + - `output_answer_verifiability_feedback_key`: Output field name for detailed feedback on answer verifiability (default: "answer_verifiability_feedbacks") + - `output_downstream_value_key`: Output field name for downstream value grades (default: "downstream_value_grades") + - `output_downstream_value_feedback_key`: Output field name for detailed feedback on downstream value (default: "downstream_value_feedbacks") + +**Key Features:** + +- Generates multiple useful scores for further filtering + +**Usage Example:** + +```python +qa_scorer = QAScorer(llm_serving=api_llm_serving) +result = qa_scorer.run( + storage = self.storage.step(), + input_question_key="generated_question", + input_answer_key="generated_answer", + output_question_quality_key="question_quality_grades", + output_question_quality_feedback_key="question_quality_feedbacks", + output_answer_alignment_key="answer_alignment_grades", + output_answer_alignment_feedback_key="answer_alignment_feedbacks", + output_answer_verifiability_key="answer_verifiability_grades", + ) +``` + +#### 2. F1Scorer + +**Function Description:** This operator is used to evaluate the verifiability of QA tasks with and without support from gold documents. 
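+
+The documentation does not spell out the metric itself; QA evaluation of this kind typically uses token-overlap F1 between the predicted and gold answers. A minimal sketch of that computation (our illustration, assuming whitespace tokenization; the operator's actual implementation may differ):
+
+```python
+from collections import Counter
+
+def token_f1(prediction: str, ground_truth: str) -> float:
+    # Count tokens shared between the predicted and gold answers.
+    pred_tokens = prediction.lower().split()
+    gold_tokens = ground_truth.lower().split()
+    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
+    if overlap == 0:
+        return 0.0
+    precision = overlap / len(pred_tokens)
+    recall = overlap / len(gold_tokens)
+    return 2 * precision * recall / (precision + recall)
+```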
+
+**Input Parameters:**
+
+- `__init__()`
+  - `llm_serving`: Large language model interface object to use (default: predefined value above)
+- `run()`
+  - `storage`: Storage interface object (default: predefined value above)
+  - `prediction_key`: Field name for the input prediction (default: `"refined_answer"`)
+  - `ground_truth_key`: Field name for the input ground truth answer (default: `"golden_doc_answer"`)
+  - `output_key`: Field name for the output QA quality score (default: `"F1Score"`)
+
+**Key Features:**
+
+- Generates verifiability evaluation results with and without gold document support, facilitating subsequent filtering.
+
+**Usage Example:**
+
+```python
+f1_scorer = F1Scorer(llm_serving=api_llm_serving)
+result = f1_scorer.run(
+    storage = self.storage.step(),
+    prediction_key = "refined_answer",
+    ground_truth_key = "golden_doc_answer",
+    output_key = "F1Score",
+)
+```
+
+### Processing Operators
+
+#### 1. ContentChooser
+
+**Function Description:** This operator identifies and selects representative text content from a set of text contexts.
+
+**Input Parameters:**
+
+- `__init__()`
+  - `num_samples`: Number of chosen samples
+  - `method`: The method used to select from the original text contents (default: 'random')
+  - `embedding_serving`: The serving object used to generate embeddings for text contents
+- `run()`
+  - `storage`: Storage interface object (default: predefined value above)
+  - `input_key`: Input text content field name (default: "text")
+
+**Key Features:**
+
+- Supports random selection and k-means-style clustering selection
+- Supports a wide range of embedding models
+
+**Usage Example:**
+
+```python
+embedding_serving = LocalModelLLMServing_vllm(hf_model_name_or_path="your_embedding_model_path", vllm_max_tokens=8192)
+
+content_chooser = ContentChooser(num_samples = 5, method = "kcenter", embedding_serving=embedding_serving)
+result = content_chooser.run(
+    storage = self.storage.step(),
+    input_key = "text",
+    )
+```
\ No newline at end of file
diff --git a/docs/en/notes/guide/pipelines/AgenticRAGPipeline.md b/docs/en/notes/guide/pipelines/AgenticRAGPipeline.md
index 2bf67ee09..1c53c4f68 100644
--- a/docs/en/notes/guide/pipelines/AgenticRAGPipeline.md
+++ b/docs/en/notes/guide/pipelines/AgenticRAGPipeline.md
@@ -1,7 +1,7 @@
 ---
-title: Agentic RAG Data Synthesis Pipeline-Alpha
+title: Agentic RAG Data Synthesis Pipeline
 icon: solar:palette-round-linear
-createTime: 2025/06/16 13:08:42
+createTime: 2025/07/14 16:37:14
 permalink: /en/guide/agenticrag_pipeline/
 ---
@@ -36,188 +36,119 @@ self.storage = FileStorage(
 )
 ```
 
-### 2. **Content Selection**
+### 2. **Atomic Task Generation**
 
-#### 2.1 **Selecting Content**
+#### 2.1 **Atomic Task Generator**
 
-The first step of the process is to use the **Content Chooser** operator (`ContentChooser`) to select a portion of text content from a large dataset. This step is crucial because it determines which text content will be used in the subsequent generation process.
+The first step of the process is to use the **Atomic Task Generator** operator (`AgenticRAGAtomicTaskGenerator`) to generate, for each item in a large dataset, a question, a reference answer, a refined reference answer, optional verifiable answers, and the LLM’s answer to the question when it is given the original document.
 
 **Functionality:**
 
-* Identifies and selects representative text content from a set of textual contexts.
+* Generates, for each item, a question, a reference answer, a refined reference answer, optional verifiable answers, and the LLM’s answer to the question when it is given the original document.
 
 **Input:** Original text content
 
-**Output:** Selected text content
+**Output:** The question, reference answer, refined reference answer, optional verifiable answers, and the LLM’s answer to the question when given the original document.
 
 ```python
-embedding_serving = APILLMServing_request(
-    api_url="https://api.openai.com/v1/embeddings",
-    model_name="text-embedding-ada-002",
-    max_workers=100
+self.llm_serving = APILLMServing_request(
+    api_url="https://api.openai.com/v1/chat/completions",
+    model_name="gpt-4o-mini",
+    max_workers=500
 )
 
-content_chooser = ContentChooser(num_samples = 5, method = "random", embedding_serving=embedding_serving)
-result = content_chooser.run(
-    storage = self.storage.step(),
-    input_key = "text",
-    )
-```
-
-### 3. **Question and Answer Generation**
-
-#### 3.1 **Automatic Prompt Generation**
-
-The second step of the process is to use the **Automatic Prompt Generator** operator (`AutoPromptGenerator`) to automatically generate dedicated prompts for question and answer generation. This step ensures that each selected text content is paired with an appropriate prompt for the subsequent Q&A generation.
-
-**Functionality:**
-
-* Automatically generates suitable prompts for each selected text content to guide the Q&A generation process.
-
-**Input:** Selected text content
-**Output:** Generated prompts for each text content
+atomic_task_generator = AgenticRAGAtomicTaskGenerator(
+    llm_serving=self.llm_serving
+    )
 
-```python
-prompt_generator = AutoPromptGenerator(api_llm_serving)
-result = prompt_generator.run(
+result = atomic_task_generator.run(
     storage = self.storage.step(),
-    input_key = "text",
-    output_key = "generated_prompt"
+    input_key = "contents",
     )
 ```
 
 ---
 
-#### 3.2 **Q&A Pair Generation**
+### 3. **Task Quality Evaluation**
 
-The third step of the process is to use the **Q&A Generator** operator (`QAGenerator`) to generate Q&A pairs for each text content and its corresponding prompt. This step produces the core data for subsequent evaluation and use.
+#### 3.1 **F1 Scorer**
+
+The second step of the process is to use the **F1 Scorer** operator (`AgenticRAGQAF1SampleEvaluator`) to evaluate the F1 score between the refined reference answer and the LLM’s answer to the question when provided with the original document. This step ensures that each constructed question, when paired with correct document retrieval, receives an appropriate reward, thereby maintaining the training quality of reinforcement learning.
 
 **Functionality:**
 
-* Generates questions and their corresponding answers based on the text content and the generated prompts.
+* Evaluates the F1 score between the refined reference answer and the LLM’s answer to the question given the original document.
 
-**Input:** Selected text content and its generated prompts
-**Output:** Generated Q&A pairs
+**Input:** Refined reference answer and the LLM’s answer to the question given the original document.
+**Output:** F1 scores ```python -qa_gen = QAGenerator(llm_serving=api_llm_serving) -result = qa_gen.run( - storage = self.storage.step(), - input_key="text", - prompt_key="generated_prompt", - output_quesion_key="generated_question", - output_answer_key="generated_answer" - ) +f1_scorer = AgenticRAGQAF1SampleEvaluator() + +result = f1_scorer.run( + storage=self.storage.step(), + output_key="F1Score", + input_prediction_key="refined_answer", + input_ground_truth_key="golden_doc_answer" + ) ``` --- -#### 3.3 **Q&A Pair Scoring** - -The fourth step of the process is to use the **Q&A Scorer** operator (`QAScorer`) to evaluate the quality of the generated Q&A pairs. This step provides multi-dimensional scores and feedback for each Q&A pair, supporting further filtering and improvement. - -**Functionality:** - -* Evaluates the generated Q&A pairs from multiple dimensions (such as question quality, answer consistency, answer verifiability, and downstream value), and provides scores and detailed feedback. - -**Input:** Generated Q&A pairs -**Output:** Evaluation scores and feedback for each Q&A pair - -```python -qa_scorer = QAScorer(llm_serving=api_llm_serving) -result = qa_scorer.run( - storage = self.storage.step(), - input_question_key="generated_question", - input_answer_key="generated_answer", - output_question_quality_key="question_quality_grades", - output_question_quality_feedback_key="question_quality_feedbacks", - output_answer_alignment_key="answer_alignment_grades", - output_answer_alignment_feedback_key="answer_alignment_feedbacks", - output_answer_verifiability_key="answer_verifiability_grades", - ) -``` - ## 3. Running the Process Run the complete process: ```python -from dataflow.operators.generate.AgenticRAG import ( - AutoPromptGenerator, - QAGenerator, - QAScorer -) +import pandas as pd +from dataflow.operators.agentic_rag import AgenticRAGQAF1SampleEvaluator -from dataflow.operators.process.AgenticRAG import ( - ContentChooser, +from dataflow.operators.agentic_rag import ( + AgenticRAGAtomicTaskGenerator, + AgenticRAGDepthQAGenerator, + AgenticRAGWidthQAGenerator ) + from dataflow.utils.storage import FileStorage -from dataflow.llmserving import APILLMServing_request +from dataflow.serving import APILLMServing_request +from dataflow.core import LLMServingABC + +class AgenticRAGEval_APIPipeline(): -class AgenticRAGPipeline(): def __init__(self, llm_serving=None): self.storage = FileStorage( - first_entry_file_name="../dataflow/example/AgenticRAGPipeline/pipeline_small_chunk.json", - cache_path="./cache", - file_name_prefix="dataflow_cache_step", + first_entry_file_name="../example_data/AgenticRAGPipeline/eval_test_data.jsonl", + cache_path="./agenticRAG_eval_cache", + file_name_prefix="agentic_rag_eval", cache_type="jsonl", ) - if llm_serving is None: - api_llm_serving = APILLMServing_request( - api_url="your_api_url", - model_name="gpt-4o", - max_workers=100 - ) - else: - api_llm_serving = llm_serving - - embedding_serving = APILLMServing_request( - api_url="https://api.openai.com/v1/embeddings", - model_name="text-embedding-ada-002", - max_workers=100 - ) - - self.content_chooser_step1 = ContentChooser(num_samples=5, method="kcenter", embedding_serving=embedding_serving) - self.prompt_generator_step2 = AutoPromptGenerator(api_llm_serving) + self.llm_serving = APILLMServing_request( + api_url="https://api.openai.com/v1/chat/completions", + model_name="gpt-4o-mini", + max_workers=500 + ) - self.qa_generator_step3 = QAGenerator(api_llm_serving) + self.task_step1 = 
AgenticRAGAtomicTaskGenerator( + llm_serving=self.llm_serving + ) - self.qa_scorer_step4 = QAScorer(api_llm_serving) + self.task_step2 = AgenticRAGQAF1SampleEvaluator() def forward(self): - self.content_chooser_step1.run( + self.task_step1.run( storage = self.storage.step(), - input_key= "text" + input_key = "contents", ) - self.prompt_generator_step2.run( - storage = self.storage.step(), - input_key = "text" + self.task_step2.run( + storage=self.storage.step(), + output_key="F1Score", + input_prediction_key="refined_answer", + input_ground_truth_key="golden_doc_answer" ) - self.qa_generator_step3.run( - storage = self.storage.step(), - input_key="text", - prompt_key="generated_prompt", - output_quesion_key="generated_question", - output_answer_key="generated_answer" - ) - - self.qa_scorer_step4.run( - storage = self.storage.step(), - input_question_key="generated_question", - input_answer_key="generated_answer", - output_question_quality_key="question_quality_grades", - output_question_quality_feedback_key="question_quality_feedbacks", - output_answer_alignment_key="answer_alignment_grades", - output_answer_alignment_feedback_key="answer_alignment_feedbacks", - output_answer_verifiability_key="answer_verifiability_grades", - ) - if __name__ == "__main__": - model = AgenticRAGPipeline() + model = AgenticRAGEval_APIPipeline() model.forward() ``` \ No newline at end of file diff --git a/docs/en/notes/guide/pipelines/AgenticRAGPipeline2.md b/docs/en/notes/guide/pipelines/AgenticRAGPipeline2.md deleted file mode 100644 index 1f9ce0cec..000000000 --- a/docs/en/notes/guide/pipelines/AgenticRAGPipeline2.md +++ /dev/null @@ -1,155 +0,0 @@ ---- -title: Agentic RAG Data Synthesis Pipeline-Beta -icon: solar:palette-round-linear -createTime: 2025/07/14 16:37:14 -permalink: /en/guide/agenticrag_pipeline2/ ---- - -# Agentic RAG Data Synthesis Pipeline - -## 1. Overview - -The **Agentic RAG Data Synthesis Pipeline** is an end-to-end framework to: -- Support RL-based agentic RAG training. -- Generate high-quality pairs of questions and answers from provided text contents. - -This pipeline only need text contexts for generating high-quality questions and answers for further training - ---- - -## 2. Data Flow and Pipeline Logic - -### 1. **Input Data** - -The input data for the pipeline includes the following fields: - -* **text**: various text contents - -These input data can be stored in designated files (such as `json` or `jsonl`) and managed and read via the `FileStorage` object. In the provided example, the default data path is loaded. In practical use, you can modify the path to load custom data and cache paths: - -```python -self.storage = FileStorage( - first_entry_file_name="../dataflow/example/AgenticRAGPipeline/pipeline_small_chunk.json", - cache_path="./cache_local", - file_name_prefix="dataflow_cache_step", - cache_type="jsonl", -) -``` - -### 2. **Atomic Task Generation** - -#### 2.1 **Generating Task** - -The first step of the process is to use the **Atomic Task Generator** operator (`AtomicTaskGenerator`) to generate the question, reference answer, refined reference answer, optional verifiable answers, and the LLM’s answer to the question when provided with the original document—all from a large dataset. - -**Functionality:** - -* Generate the question, reference answer, refined reference answer, optional verifiable answers, and the LLM’s answer to the question when provided with the original document—all from a large dataset. 
- -**Input:** Original text content - -**Output:** Question, reference answer, refined reference answer, optional verifiable answers, and the LLM’s answer to the question when provided with the original document—all from a large dataset. - -```python -llm_serving = APILLMServing_request( - api_url="https://api.openai.com/v1/chat/completions", - model_name="gpt-4o-mini", - max_workers=500 - ) - -atomic_task_generator = AtomicTaskGenerator( - llm_serving=llm_serving - ) -result = atomic_task_generator.run( - storage = self.storage.step(), - input_key = "text", - ) -``` - -### 3. **Task Quality Evaluation** - -#### 3.1 **F1 Scorer** - -The second step of the process is to use the **F1 Scorer** operator (`F1Scorer`) evaluate the F1 score between the refined reference answer and the LLM’s answer to the question when provided with the original document. This step ensures that each constructed question, when paired with correct document retrieval, receives an appropriate reward, thereby maintaining the training quality of reinforcement learning. - -**Functionality:** - -* Evaluate the F1 score between the refined reference answer and the LLM’s answer to the question given the original document. - -**Input:** refined reference answer, the LLM’s answer to the question given the original document. -**Output:** F1 scores - -```python -f1_scorer = F1Scorer( - prediction_key="refined_answer", - ground_truth_key="golden_doc_answer" - ) -result = f1_scorer.run( - storage=self.storage.step(), - output_key="F1Score" - ) -``` - ---- - -## 3. Running the Process - -Run the complete process: - -```python -import pandas as pd -from dataflow.operators.eval import * - -from dataflow.operators.generate import ( - AtomicTaskGenerator, - DepthQAGenerator, - WidthQAGenerator -) - -from dataflow.operators.filter import * -from dataflow.utils.storage import FileStorage -from dataflow.serving import APILLMServing_request, LocalModelLLMServing -from dataflow.core import LLMServingABC - -class AgenticRAGEvalPipeline(): - - def __init__(self, llm_serving=None): - - self.storage = FileStorage( - first_entry_file_name="../dataflow/example/AgenticRAGPipeline/pipeline_small_chunk.json", - cache_path="./agenticRAG_eval_cache", - file_name_prefix="agentic_rag_eval", - cache_type="jsonl", - ) - - llm_serving = APILLMServing_request( - api_url="https://api.openai.com/v1/chat/completions", - model_name="gpt-4o-mini", - max_workers=500 - ) - - self.task_step1 = AtomicTaskGenerator( - llm_serving=llm_serving - ) - - self.task_step2 = F1Scorer( - prediction_key="refined_answer", - ground_truth_key="golden_doc_answer" - ) - - def forward(self): - - self.task_step1.run( - storage = self.storage.step(), - input_key = "contents", - ) - - self.task_step2.run( - storage=self.storage.step(), - output_key="F1Score" - ) - -if __name__ == "__main__": - model = AgenticRAGEvalPipeline() - model.forward() -``` \ No newline at end of file diff --git a/docs/en/notes/guide/pipelines/Doc2QAPipeline.md b/docs/en/notes/guide/pipelines/Doc2QAPipeline.md new file mode 100644 index 000000000..ca24af6b2 --- /dev/null +++ b/docs/en/notes/guide/pipelines/Doc2QAPipeline.md @@ -0,0 +1,228 @@ +--- +title: Doc-to-QA Data Synthesis Pipeline +icon: solar:palette-round-linear +createTime: 2025/06/16 13:08:42 +permalink: /en/guide/doc2qa_pipeline/ +--- + +# Doc-to-QA Data Synthesis Pipeline + +## 1. Overview + +The **Doc-to-QA Data Synthesis Pipeline** is an end-to-end framework to: +- Support RL-based agentic RAG training. 
+- Generate high-quality pairs of questions and answers from provided text contents.
+
+This pipeline only needs text content to generate high-quality questions and answers for further training.
+
+---
+
+## 2. Data Flow and Pipeline Logic
+
+### 1. **Input Data**
+
+The input data for the pipeline includes the following fields:
+
+* **text**: various text contents
+
+These input data can be stored in designated files (such as `json` or `jsonl`) and managed and read via the `FileStorage` object. In the provided example, the default data path is loaded. In practical use, you can modify the path to load custom data and cache paths:
+
+```python
+self.storage = FileStorage(
+        first_entry_file_name="../example_data/core_text_data/pipeline_small_chunk.json",
+        cache_path="./cache_local",
+        file_name_prefix="dataflow_cache_step",
+        cache_type="json",
+    )
+```
+
+### 2. **Content Selection**
+
+#### 2.1 **Content Sampling**
+
+The first step of the process is to use the **Sampling** operator (`KCenterGreedyFilter`) to select a portion of text content from a large dataset. This step is crucial because it determines which text content will be used in the subsequent generation process.
+
+**Functionality:**
+
+* Identifies and selects representative text content from a set of textual contexts.
+
+**Input:** Original text content
+
+**Output:** Selected text content
+
+```python
+self.llm_serving = APILLMServing_request(
+    api_url="https://api.openai.com/v1/chat/completions",
+    model_name="gpt-4o",
+    max_workers=1
+)
+
+embedding_serving = APILLMServing_request(
+    api_url="https://api.openai.com/v1/embeddings",
+    model_name="text-embedding-ada-002",
+    max_workers=100
+)
+
+self.content_chooser_step1 = KCenterGreedyFilter(embedding_serving=embedding_serving, num_samples=5)
+
+result = self.content_chooser_step1.run(
+    storage = self.storage.step(),
+    input_key = "text"
+    )
+```
+
+### 3. **Question and Answer Generation**
+
+#### 3.1 **Automatic Prompt Generation**
+
+The second step of the process is to use the **Automatic Prompt Generator** operator (`Doc2PromptGenerator`) to automatically generate dedicated prompts for question and answer generation. This step ensures that each selected text content is paired with an appropriate prompt for the subsequent Q&A generation.
+
+**Functionality:**
+
+* Automatically generates suitable prompts for each selected text content to guide the Q&A generation process.
+
+**Input:** Selected text content
+**Output:** Generated prompts for each text content
+
+```python
+self.doc2prompt_generator_step2 = Doc2PromptGenerator(self.llm_serving)
+result = self.doc2prompt_generator_step2.run(
+    storage = self.storage.step(),
+    input_key = "text",
+    output_key = "generated_prompt"
+    )
+```
+
+---
+
+#### 3.2 **Q&A Pair Generation**
+
+The third step of the process is to use the **Q&A Generator** operator (`Doc2QAGenerator`) to generate Q&A pairs for each text content and its corresponding prompt. This step produces the core data for subsequent evaluation and use.
+
+**Functionality:**
+
+* Generates questions and their corresponding answers based on the text content and the generated prompts.
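+
+For a concrete picture, a single record after this step might carry fields like the following (hypothetical values, not actual pipeline output; the formal Input/Output summary follows below):
+
+```python
+record = {
+    "text": "The Amazon River discharges more water than any other river on Earth.",
+    "generated_prompt": "Ask one factual question answerable from the passage, then answer it concisely.",
+    "generated_question": "Which river discharges the most water on Earth?",
+    "generated_answer": "The Amazon River.",
+}
+```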
+ +**Input:** Selected text content and its generated prompts +**Output:** Generated Q&A pairs + +```python +self.doc2qa_generator_step3 = Doc2QAGenerator(self.llm_serving) +result = self.doc2qa_generator_step3.run( + storage = self.storage.step(), + input_key="text", + output_prompt_key="generated_prompt", + output_quesion_key="generated_question", + output_answer_key="generated_answer" + ) +``` + +--- + +#### 3.3 **Q&A Pair Scoring** + +The fourth step of the process is to use the **Q&A Scorer** operator (`Doc2QASampleEvaluator`) to evaluate the quality of the generated Q&A pairs. This step provides multi-dimensional scores and feedback for each Q&A pair, supporting further filtering and improvement. + +**Functionality:** + +* Evaluates the generated Q&A pairs from multiple dimensions (such as question quality, answer consistency, answer verifiability, and downstream value), and provides scores and detailed feedback. + +**Input:** Generated Q&A pairs +**Output:** Evaluation scores and feedback for each Q&A pair + +```python +self.doc2qa_scorer_step4 = Doc2QASampleEvaluator(self.llm_serving) +result = self.doc2qa_scorer_step4.run( + storage = self.storage.step(), + input_question_key="generated_question", + input_answer_key="generated_answer", + output_question_quality_key="question_quality_grades", + output_question_quality_feedback_key="question_quality_feedbacks", + output_answer_alignment_key="answer_alignment_grades", + output_answer_alignment_feedback_key="answer_alignment_feedbacks", + output_answer_verifiability_key="answer_verifiability_grades", + ) +``` + +## 3. Running the Process + +Run the complete process: + +```python +from dataflow.operators.core_text import ( + Doc2PromptGenerator, + Doc2QASampleEvaluator, + Doc2QAGenerator, + KCenterGreedyFilter +) + +from dataflow.utils.storage import FileStorage +from dataflow.serving import APILLMServing_request +from dataflow.serving import LocalModelLLMServing_vllm + +class AgenticRAG_APIPipeline(): + def __init__(self): + + self.storage = FileStorage( + first_entry_file_name="../example_data/core_text_data/pipeline_small_chunk.json", + cache_path="./cache_local", + file_name_prefix="dataflow_cache_step", + cache_type="json", + ) + + # use API server as LLM serving + self.llm_serving = APILLMServing_request( + api_url="https://api.openai.com/v1/chat/completions", + model_name="gpt-4o", + max_workers=1 + ) + + embedding_serving = APILLMServing_request( + api_url="https://api.openai.com/v1/embeddings", + model_name="text-embedding-ada-002", + max_workers=100 + ) + + self.content_chooser_step1 = KCenterGreedyFilter(embedding_serving=embedding_serving, num_samples=5) + + self.doc2prompt_generator_step2 = Doc2PromptGenerator(self.llm_serving) + + self.doc2qa_generator_step3 = Doc2QAGenerator(self.llm_serving) + + self.doc2qa_scorer_step4 = Doc2QASampleEvaluator(self.llm_serving) + + def forward(self): + + self.content_chooser_step1.run( + storage = self.storage.step(), + input_key = "text" + ) + + self.doc2prompt_generator_step2.run( + storage = self.storage.step(), + input_key = "text" + ) + + self.doc2qa_generator_step3.run( + storage = self.storage.step(), + input_key="text", + output_prompt_key="generated_prompt", + output_quesion_key="generated_question", + output_answer_key="generated_answer" + ) + + self.doc2qa_scorer_step4.run( + storage = self.storage.step(), + input_question_key="generated_question", + input_answer_key="generated_answer", + output_question_quality_key="question_quality_grades", + 
output_question_quality_feedback_key="question_quality_feedbacks", + output_answer_alignment_key="answer_alignment_grades", + output_answer_alignment_feedback_key="answer_alignment_feedbacks", + output_answer_verifiability_key="answer_verifiability_grades", + ) + +if __name__ == "__main__": + model = AgenticRAG_APIPipeline() + model.forward() +``` \ No newline at end of file diff --git a/docs/zh/notes/guide/domain_specific_operators/agenticrag_operators.md b/docs/zh/notes/guide/domain_specific_operators/agenticrag_operators.md index 8e83ac878..a82f85a35 100644 --- a/docs/zh/notes/guide/domain_specific_operators/agenticrag_operators.md +++ b/docs/zh/notes/guide/domain_specific_operators/agenticrag_operators.md @@ -8,7 +8,7 @@ permalink: /zh/guide/agenticrag_operators/ ## 概述 -AgenticRAG 算子是一套专为 agentic RAG(检索增强生成)任务设计的工具,特别聚焦于从给定文本中生成问答(QA)样本,以支持基于强化学习的 agentic RAG 训练。这些算子主要分为两类:**数据生成算子(Generators)** 和 **处理算子(Processors)** +AgenticRAG 算子是一套专为 agentic RAG(检索增强生成)任务设计的工具,特别聚焦于从给定文本中生成问答(QA)样本,以支持基于强化学习的 agentic RAG 训练。这些算子主要分为两类:**数据生成算子(Generators)** 和 **评估算子(Evaluators)** - 🚀 **自主创新**:核心算法原创研发,填补现有算法空白或是进一步提升性能,突破当下性能瓶颈。 - ✨ **开源首发**:首次将该算子集成到社区主流框架中,方便更多开发者使用,实现开源共享。 @@ -28,31 +28,19 @@ AgenticRAG 算子是一套专为 agentic RAG(检索增强生成)任务设计 - AutoPromptGenerator🚀 - 提示词合成 - 利用大语言模型为特定内容生成用于问答创建的提示词。 - - - - - AtomicTaskGenerator✨ + AgenticRAGAtomicTaskGenerator✨ 原子任务生成 为提供的文本内容生成合适的高质量问题与可验证答案 从https://github.com/OPPO-PersonalAI/TaskCraft提炼并改进 - QAGenerator✨ - 问答生成 - 使用大语言模型和生成的提示词,为给定文本内容生成问题和答案。 - - - - - WidthQAGenerator✨ + AgenticRAGWidthQAGenerator✨ 问答广度扩展 结合多个问答对,扩展成新的高难度问答对。 从https://github.com/OPPO-PersonalAI/TaskCraft提炼并改进 - DepthQAGenerator✨ + AgenticRAGDepthQAGenerator✨ 问答深度扩展 将问答对扩展成新的高难度问答对。 从https://github.com/OPPO-PersonalAI/TaskCraft提炼并改进 @@ -75,13 +63,7 @@ AgenticRAG 算子是一套专为 agentic RAG(检索增强生成)任务设计 - QAScorer✨ - 问答评分 - 对问答对及其相关内容进行问题质量、答案一致性、答案可验证性和下游价值的评估。 - - - - - F1Scorer🚀 + AgenticRAGQAF1SampleEvaluator🚀 问答评分 对问答任务在有无黄金文档支持下的可验证性评估。 - @@ -89,28 +71,6 @@ AgenticRAG 算子是一套专为 agentic RAG(检索增强生成)任务设计 -## 处理算子 - -处理算子主要负责选择合适的数据 - - - - - - - - - - - - - - - - - - -
名称应用类型描述官方仓库或论文
ContentChooser🚀内容选择器从更大的内容集合中选择一部分内容以供后续流程处理。-
## 算子接口使用说明 @@ -119,7 +79,7 @@ AgenticRAG 算子是一套专为 agentic RAG(检索增强生成)任务设计 ```python from dataflow.llmserving import APILLMServing_request -api_llm_serving = APILLMServing_request( +llm_serving = APILLMServing_request( api_url="your_api_url", model_name="model_name", max_workers=5 @@ -139,7 +99,7 @@ from dataflow.utils.storage import FileStorage ) ``` -下文中的 `api_llm_serving` 和 `self.storage` 即为此处定义的接口对象。完整的使用示例可见 `test/test_agentic_rag.py` +下文中的 `llm_serving` 和 `self.storage` 即为此处定义的接口对象。完整的使用示例可见 `DataFlow/dataflow/statics/pipelines/api_pipelines/agentic_rag_pipeline.py` 参数传递方面,算子对象的构造函数主要传递与算子配置相关的信息,可一次配置多次调用;而 `X.run()` 函数则传递与 IO 相关的 `key` 信息。具体细节可见下方算子描述示例。 @@ -147,36 +107,7 @@ from dataflow.utils.storage import FileStorage ### 数据生成算子 -#### 1. AutoPromptGenerator - -**功能描述** 该算子专为根据给定文本内容生成用于创建问答对的专用提示词而设计。 - -**输入参数** - -- `__init__()` - - `llm_serving`: 使用的大语言模型接口对象(默认:上文预定义值) -- `run()` - - `storage`: 存储接口对象(默认:上文预定义值) - - `input_key`: 输入文本内容字段名(默认:"text") - - `output_key`: 输出生成提示词字段名(默认:"generated_prompt") - -**主要特性** - -- 支持多种类型的文本内容 -- 自动生成合适的提示词 - -**使用示例** - -```python -prompt_generator = AutoPromptGenerator(api_llm_serving) -result = prompt_generator.run( - storage = self.storage.step(), - input_key = "text", - output_key = "generated_prompt" - ) -``` - -#### 2. AtomicTaskGenerator +#### 1. AgenticRAGAtomicTaskGenerator **函数描述:** 该算子用于为提供的文本内容生成合适的高质量问题与可验证答案。 @@ -203,48 +134,17 @@ result = prompt_generator.run( **使用示例:** ```python -atomic_task_gen = AtomicTaskGenerator(llm_serving=api_llm_serving) -result = atomic_task_gen.run( - storage = self.storage.step(), - input_key = "text", +atomic_task_generator = AgenticRAGAtomicTaskGenerator( + llm_serving=self.llm_serving ) -``` - -#### 3. QAGenerator - -**函数描述:** -该算子用于为特定内容生成一对问题和答案。 - -**输入参数:** - -- `__init__()` - - `llm_serving`:要使用的大语言模型接口对象(默认值:如上所述的预定义值) -- `run()` - - `storage`:存储接口对象(默认值:如上所述的预定义值) - - `input_key`:输入文本内容字段名(默认值:"text") - - `prompt_key`:输出提示字段名(默认值:"generated_prompt") - - `output_quesion_key`:输出问题字段名(默认值:"generated_question") - - `output_answer_key`:输出答案字段名(默认值:"generated_answer") - -**主要特性:** - -- 支持多种类型的文本内容 -- 能够生成合适的问题和答案对 -**使用示例:** - -```python -qa_gen = QAGenerator(llm_serving=api_llm_serving) -result = qa_gen.run( +result = atomic_task_generator.run( storage = self.storage.step(), - input_key="text", - prompt_key="generated_prompt", - output_quesion_key="generated_question", - output_answer_key="generated_answer" - ) + input_key = "contents", + ) ``` -#### 4. WidthQAGenerator +#### 2. AgenticRAGWidthQAGenerator **函数描述:** 该算子用于结合两个问答,生成新的问题。 @@ -267,8 +167,11 @@ result = qa_gen.run( **使用示例:** ```python -width_qa_gen = WidthQAGenerator(llm_serving=api_llm_serving) -result = width_qa_gen.run( +width_qa_generator = AgenticRAGWidthQAGenerator( + llm_serving=self.llm_serving + ) + +result = width_qa_generator.run( storage = self.storage.step(), input_question_key = "question", input_identifier_key= "identifier", @@ -276,7 +179,7 @@ result = width_qa_gen.run( ) ``` -#### 5. DepthQAGenerator +#### 3. AgenticRAGDepthQAGenerator **函数描述:** 该算子以已有问答生成更深度的问题。 @@ -297,8 +200,11 @@ result = width_qa_gen.run( **使用示例:** ```python -depth_qa_gen = DepthjQAGenerator(llm_serving=api_llm_serving) -result = depth_qa_gen.run( +depth_qa_generator = AgenticRAGDepthQAGenerator( + llm_serving=self.llm_serving + ) + +result = depth_qa_generator.run( storage = self.storage.step(), input_key= "question", output_key="depth_question" @@ -307,49 +213,7 @@ result = depth_qa_gen.run( ### 数据评估算子 -#### 1. 
@@ -307,49 +213,7 @@

 ### Data Evaluation Operators

-#### 1. QAScorer
-
-**Description:**
-This operator produces multiple evaluation scores for question-answer pairs.
-
-**Input Parameters:**
-
-- `__init__()`
-  - `llm_serving`: LLM interface object to use (default: the predefined value above)
-- `run()`
-  - `storage`: storage interface object (default: the predefined value above)
-  - `input_question_key`: field name of the generated question (default: "generated_question")
-  - `input_answer_key`: field name of the generated answer (default: "generated_answer")
-  - `output_question_quality_key`: field name for question quality grades (default: "question_quality_grades")
-  - `output_question_quality_feedback_key`: field name for detailed question quality feedback (default: "question_quality_feedbacks")
-  - `output_answer_alignment_key`: field name for answer alignment grades (default: "answer_alignment_grades")
-  - `output_answer_alignment_feedback_key`: field name for detailed answer alignment feedback (default: "answer_alignment_feedbacks")
-  - `output_answer_verifiability_key`: field name for answer verifiability grades (default: "answer_verifiability_grades")
-  - `output_answer_verifiability_feedback_key`: field name for detailed answer verifiability feedback (default: "answer_verifiability_feedbacks")
-  - `output_downstream_value_key`: field name for downstream value grades (default: "downstream_value_grades")
-  - `output_downstream_value_feedback_key`: field name for detailed downstream value feedback (default: "downstream_value_feedbacks")
-
-**Key Features:**
-
-- Produces several useful scores for downstream filtering
-
-**Usage Example:**
-
-```python
-qa_scorer = QAScorer(llm_serving=api_llm_serving)
-result = qa_scorer.run(
-    storage = self.storage.step(),
-    input_question_key="generated_question",
-    input_answer_key="generated_answer",
-    output_question_quality_key="question_quality_grades",
-    output_question_quality_feedback_key="question_quality_feedbacks",
-    output_answer_alignment_key="answer_alignment_grades",
-    output_answer_alignment_feedback_key="answer_alignment_feedbacks",
-    output_answer_verifiability_key="answer_verifiability_grades",
-    )
-```
-
-#### 2. F1Scorer
+#### 1. AgenticRAGQAF1SampleEvaluator

 **Description:**
 This operator evaluates the verifiability of QA tasks with and without golden-document support.

 **Input Parameters:**

 - `__init__()`
   - `llm_serving`: LLM interface object to use (default: the predefined value above)
 - `run()`
   - `storage`: storage interface object (default: the predefined value above)
-  - `prediction_key`: field name of the input prediction (default: "refined_answer")
-  - `ground_truth_key`: field name of the input ground truth (default: "golden_doc_answer")
+  - `input_prediction_key`: field name of the input prediction (default: "refined_answer")
+  - `input_ground_truth_key`: field name of the input ground truth (default: "golden_doc_answer")
   - `output_key`: field name of the output QA quality score (default: "F1Score")

 **Key Features:**

 - Produces a verifiability score with and without golden-document support, for downstream filtering

 **Usage Example:**

 ```python
-f1_scorer = F1Scorer(llm_serving=api_llm_serving)
-result = qa_scorer.run(
-    storage = self.storage.step(),
-    prediction_key="refined_answer",
-    ground_truth_key="golden_doc_answer",
+f1_scorer = AgenticRAGQAF1SampleEvaluator()
+result = f1_scorer.run(
+    storage=self.storage.step(),
    output_key="F1Score",
-    )
+    input_prediction_key="refined_answer",
+    input_ground_truth_key="golden_doc_answer"
+    )
 ```
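For intuition, the score used here is the standard token-overlap F1 between the prediction and the ground truth. A minimal illustrative sketch (whitespace tokenization is an assumption; the operator's actual tokenization may differ):

```python
from collections import Counter

def token_f1(prediction: str, ground_truth: str) -> float:
    """Token-overlap F1, the usual QA verifiability metric."""
    pred_tokens = prediction.lower().split()
    gold_tokens = ground_truth.lower().split()
    if not pred_tokens or not gold_tokens:
        return 0.0
    # Count tokens shared by prediction and ground truth (with multiplicity).
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

print(token_f1("the capital is Paris", "Paris"))  # precision 0.25, recall 1.0 -> 0.4
```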
-### Processing Operators
-
-#### 1. ContentChooser
-
-**Description:**
-This operator identifies and selects representative text content from a collection of texts.
-
-**Input Parameters:**
-
-- `init()`
-  - `num_samples`: number of samples to select
-  - `method`: method used to select from the original text content (default: 'random')
-  - `embedding_serving`: used to extract text feature vectors
-- `run()`
-  - `storage`: storage interface object (default: the predefined value above)
-  - `input_key`: field name of the input text content (default: "text")
-
-**Key Features:**
-
-- Supports random selection and k-means cluster-based selection
-- Supports multiple embedding models
-
-**Usage Example:**
-
-```python
-embedding_serving = LocalModelLLMServing_vllm(hf_model_name_or_path="your_embedding_model_path", vllm_max_tokens=8192)
-
-content_chooser = ContentChooser(num_samples = 5, method = "kcenter", embedding_serving=embedding_serving)
-result = content_chooser.run(
-    storage = self.storage.step(),
-    input_key = "text",
-    )
-```
\ No newline at end of file
diff --git a/docs/zh/notes/guide/domain_specific_operators/core_operators.md b/docs/zh/notes/guide/domain_specific_operators/core_operators.md
new file mode 100644
index 000000000..ee5288d52
--- /dev/null
+++ b/docs/zh/notes/guide/domain_specific_operators/core_operators.md
@@ -0,0 +1,415 @@
---
title: AgenticRAG Operators
createTime: 2025/06/24 11:43:42
permalink: /zh/guide/core_operators/
---

# AgenticRAG Operators

## Overview

AgenticRAG operators are a suite of tools designed for agentic RAG (Retrieval-Augmented Generation) tasks, focused on generating question-and-answer (QA) samples from given text to support RL-based agentic RAG training. They fall into three groups: **Data Generation Operators (Generators)**, **Data Evaluation Operators (Evaluators)**, and **Processing Operators (Processors)**.

- 🚀 **Independent Innovation**: Core algorithms developed from scratch, filling existing algorithmic gaps or further improving performance, breaking through current performance bottlenecks.
- ✨ **Open Source First**: First integration of the operator into mainstream community frameworks, making it available to more developers and enabling open-source sharing.

## Data Generation Operators

Data generation operators produce RAG-related RL training data, covering automatic prompt generation, QA generation, and QA scoring.

| Name | Type | Description | Repository or Paper |
| ---- | ---- | ----------- | ------------------- |
| AutoPromptGenerator🚀 | Prompt Synthesis | Uses an LLM to generate prompts for QA creation tailored to the given content. | - |
| AtomicTaskGenerator✨ | Atomic Task Generation | Generates suitable high-quality questions and verifiable answers for the given text content. | Refined and improved from https://github.com/OPPO-PersonalAI/TaskCraft |
| QAGenerator✨ | QA Generation | Uses an LLM and the generated prompts to produce questions and answers for the given text content. | - |
| WidthQAGenerator✨ | QA Breadth Expansion | Combines multiple QA pairs into new, more difficult QA pairs. | Refined and improved from https://github.com/OPPO-PersonalAI/TaskCraft |
| DepthQAGenerator✨ | QA Depth Expansion | Expands individual QA pairs into new, more difficult QA pairs. | Refined and improved from https://github.com/OPPO-PersonalAI/TaskCraft |

## Data Evaluation Operators

Data evaluation operators assess RAG-related RL training data, including quality scoring of questions and answers.

| Name | Type | Description | Repository or Paper |
| ---- | ---- | ----------- | ------------------- |
| QAScorer✨ | QA Scoring | Evaluates question quality, answer alignment, answer verifiability, and downstream value for QA pairs and their related content. | - |
| F1Scorer🚀 | QA Scoring | Evaluates the verifiability of answers with and without golden-document support in QA tasks. | - |

## Processing Operators

Processing operators are mainly responsible for selecting suitable data.

| Name | Type | Description | Repository or Paper |
| ---- | ---- | ----------- | ------------------- |
| ContentChooser🚀 | Content Selection | Selects a subset of items from a larger collection of content for downstream processing. | - |

## Operator Interface Usage

For operators that need a storage path or a model call, we provide wrapped **model interfaces** and **storage object interfaces**. You can predefine the model API parameters for an operator like this:

```python
from dataflow.llmserving import APILLMServing_request

api_llm_serving = APILLMServing_request(
        api_url="your_api_url",
        model_name="model_name",
        max_workers=5
        )
```

You can predefine the storage parameters for an operator like this:

```python
from dataflow.utils.storage import FileStorage

    self.storage = FileStorage(
        first_entry_file_name="your_file_path",
        cache_path="./cache",
        file_name_prefix="dataflow_cache_step",
        cache_type="json", # jsonl, json, ...
        )
```

The `api_llm_serving` and `self.storage` used below are the interface objects defined here. A complete usage example can be found in `test/test_agentic_rag.py`.

For parameter passing, an operator's constructor receives configuration-related information, which can be set once and then reused across many calls, while `X.run()` receives the IO-related `key` information. Details are given in the operator descriptions below.

## Operator Details

### Data Generation Operators

#### 1. AutoPromptGenerator

**Description:** This operator generates dedicated prompts for creating QA pairs from the given text content.

**Input Parameters:**

- `__init__()`
  - `llm_serving`: LLM interface object to use (default: the predefined value above)
- `run()`
  - `storage`: storage interface object (default: the predefined value above)
  - `input_key`: field name of the input text content (default: "text")
  - `output_key`: field name of the generated prompt (default: "generated_prompt")

**Key Features:**

- Supports many kinds of text content
- Automatically generates suitable prompts

**Usage Example:**

```python
prompt_generator = AutoPromptGenerator(api_llm_serving)
result = prompt_generator.run(
    storage = self.storage.step(),
    input_key = "text",
    output_key = "generated_prompt"
    )
```

#### 2. AtomicTaskGenerator

**Description:**
This operator generates suitable high-quality questions and verifiable answers for the provided text content.

**Input Parameters:**

- `__init__()`
  - `llm_serving`: LLM interface object to use (default: the predefined value above)
- `run()`
  - `storage`: storage interface object (default: the predefined value above)
  - `input_key`: field name of the input text content (default: "prompts")
  - `output_question_key`: field name of the output question (default: "question")
  - `output_answer_key`: field name of the output answer (default: "answer")
  - `output_refined_answer_key`: field name of the refined answer (default: "refined_answer")
  - `output_optional_answer_key`: field name of alternative refined answers (default: "optional_answer")
  - `output_golden_doc_answer_key`: field name of the LLM's answer given the golden document (default: "golden_doc_answer")

**Key Features:**

- Supports many kinds of text content
- Generates suitable question-answer pairs
- Produces verifiable answers and alternative answers

**Usage Example:**

```python
atomic_task_gen = AtomicTaskGenerator(llm_serving=api_llm_serving)
result = atomic_task_gen.run(
    storage = self.storage.step(),
    input_key = "text",
    )
```

#### 3. QAGenerator

**Description:**
This operator generates a question-answer pair for the given content.

**Input Parameters:**

- `__init__()`
  - `llm_serving`: LLM interface object to use (default: the predefined value above)
- `run()`
  - `storage`: storage interface object (default: the predefined value above)
  - `input_key`: field name of the input text content (default: "text")
  - `output_prompt_key`: field name of the output prompt (default: "generated_prompt")
  - `output_quesion_key`: field name of the output question (default: "generated_question")
  - `output_answer_key`: field name of the output answer (default: "generated_answer")

**Key Features:**

- Supports many kinds of text content
- Generates suitable question-answer pairs

**Usage Example:**

```python
qa_gen = QAGenerator(llm_serving=api_llm_serving)
result = qa_gen.run(
    storage = self.storage.step(),
    input_key="text",
    output_prompt_key="generated_prompt",
    output_quesion_key="generated_question",
    output_answer_key="generated_answer"
    )
```

#### 4. WidthQAGenerator

**Description:**
This operator combines two QA pairs to generate a new question.

**Input Parameters:**

- `__init__()`
  - `llm_serving`: LLM interface object to use (default: the predefined value above)
- `run()`
  - `storage`: storage interface object (default: the predefined value above)
  - `input_question_key`: field name of the input question (default: "question")
  - `input_identifier_key`: field name of the input identifier (default: "identifier")
  - `input_answer_key`: field name of the input answer (default: "answer")
  - `output_question_key`: field name of the output question (default: "generated_width_task")

**Key Features:**

- Combines two QA pairs into a more complex new question.

**Usage Example:**

```python
width_qa_gen = WidthQAGenerator(llm_serving=api_llm_serving)
result = width_qa_gen.run(
    storage = self.storage.step(),
    input_question_key = "question",
    input_identifier_key= "identifier",
    input_answer_key = "refined_answer"
    )
```

#### 5. DepthQAGenerator

**Description:**
This operator generates deeper questions from existing QA pairs.

**Input Parameters:**

- `__init__()`
  - `llm_serving`: LLM interface object to use (default: the predefined value above)
- `run()`
  - `storage`: storage interface object (default: the predefined value above)
  - `input_key`: field name of the input (default: "question")
  - `output_key`: field name of the output (default: "depth_question")

**Key Features:**

- Generates deeper questions from existing QA pairs

**Usage Example:**

```python
depth_qa_gen = DepthQAGenerator(llm_serving=api_llm_serving)
result = depth_qa_gen.run(
    storage = self.storage.step(),
    input_key= "question",
    output_key="depth_question"
    )
```

### Data Evaluation Operators

#### 1. QAScorer

**Description:**
This operator produces multiple evaluation scores for question-answer pairs.

**Input Parameters:**

- `__init__()`
  - `llm_serving`: LLM interface object to use (default: the predefined value above)
- `run()`
  - `storage`: storage interface object (default: the predefined value above)
  - `input_question_key`: field name of the generated question (default: "generated_question")
  - `input_answer_key`: field name of the generated answer (default: "generated_answer")
  - `output_question_quality_key`: field name for question quality grades (default: "question_quality_grades")
  - `output_question_quality_feedback_key`: field name for detailed question quality feedback (default: "question_quality_feedbacks")
  - `output_answer_alignment_key`: field name for answer alignment grades (default: "answer_alignment_grades")
  - `output_answer_alignment_feedback_key`: field name for detailed answer alignment feedback (default: "answer_alignment_feedbacks")
  - `output_answer_verifiability_key`: field name for answer verifiability grades (default: "answer_verifiability_grades")
  - `output_answer_verifiability_feedback_key`: field name for detailed answer verifiability feedback (default: "answer_verifiability_feedbacks")
  - `output_downstream_value_key`: field name for downstream value grades (default: "downstream_value_grades")
  - `output_downstream_value_feedback_key`: field name for detailed downstream value feedback (default: "downstream_value_feedbacks")

**Key Features:**

- Produces several useful scores for downstream filtering

**Usage Example:**

```python
qa_scorer = QAScorer(llm_serving=api_llm_serving)
result = qa_scorer.run(
    storage = self.storage.step(),
    input_question_key="generated_question",
    input_answer_key="generated_answer",
    output_question_quality_key="question_quality_grades",
    output_question_quality_feedback_key="question_quality_feedbacks",
    output_answer_alignment_key="answer_alignment_grades",
    output_answer_alignment_feedback_key="answer_alignment_feedbacks",
    output_answer_verifiability_key="answer_verifiability_grades",
    )
```

#### 2. F1Scorer

**Description:**
This operator evaluates the verifiability of QA tasks with and without golden-document support.

**Input Parameters:**

- `__init__()`
  - `llm_serving`: LLM interface object to use (default: the predefined value above)
- `run()`
  - `storage`: storage interface object (default: the predefined value above)
  - `prediction_key`: field name of the input prediction (default: "refined_answer")
  - `ground_truth_key`: field name of the input ground truth (default: "golden_doc_answer")
  - `output_key`: field name of the output QA quality score (default: "F1Score")

**Key Features:**

- Produces a verifiability score with and without golden-document support, for downstream filtering

**Usage Example:**

```python
f1_scorer = F1Scorer(llm_serving=api_llm_serving)
result = f1_scorer.run(
    storage = self.storage.step(),
    prediction_key="refined_answer",
    ground_truth_key="golden_doc_answer",
    output_key="F1Score",
    )
```

### Processing Operators

#### 1. ContentChooser

**Description:**
This operator identifies and selects representative text content from a collection of texts.

**Input Parameters:**

- `init()`
  - `num_samples`: number of samples to select
  - `method`: method used to select from the original text content (default: 'random')
  - `embedding_serving`: used to extract text feature vectors
- `run()`
  - `storage`: storage interface object (default: the predefined value above)
  - `input_key`: field name of the input text content (default: "text")

**Key Features:**

- Supports random selection and k-means cluster-based selection
- Supports multiple embedding models

**Usage Example:**

```python
embedding_serving = LocalModelLLMServing_vllm(hf_model_name_or_path="your_embedding_model_path", vllm_max_tokens=8192)

content_chooser = ContentChooser(num_samples = 5, method = "kcenter", embedding_serving=embedding_serving)
result = content_chooser.run(
    storage = self.storage.step(),
    input_key = "text",
    )
```
\ No newline at end of file
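For intuition about the "kcenter" method above: k-center greedy selection repeatedly picks the point farthest from everything chosen so far, giving good coverage of the embedding space. A minimal illustrative sketch with numpy (the actual ContentChooser implementation may differ):

```python
import numpy as np

def kcenter_greedy(embeddings: np.ndarray, num_samples: int) -> list[int]:
    """Pick indices that greedily maximize coverage of the embedding space."""
    n = embeddings.shape[0]
    selected = [0]  # start from an arbitrary point
    # Distance from every point to its nearest selected center.
    dists = np.linalg.norm(embeddings - embeddings[0], axis=1)
    while len(selected) < min(num_samples, n):
        farthest = int(np.argmax(dists))   # the point worst covered so far
        selected.append(farthest)
        new_d = np.linalg.norm(embeddings - embeddings[farthest], axis=1)
        dists = np.minimum(dists, new_d)   # update nearest-center distances
    return selected

rng = np.random.default_rng(0)
print(kcenter_greedy(rng.normal(size=(100, 8)), num_samples=5))
```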

diff --git a/docs/zh/notes/guide/pipelines/AgenticRAGPipeline.md b/docs/zh/notes/guide/pipelines/AgenticRAGPipeline.md
index c59f1ca83..c2d78b973 100644
--- a/docs/zh/notes/guide/pipelines/AgenticRAGPipeline.md
+++ b/docs/zh/notes/guide/pipelines/AgenticRAGPipeline.md
@@ -1,7 +1,7 @@
 ---
-title: AgenticRAG Data Synthesis Pipeline-Alpha
+title: AgenticRAG Data Synthesis Pipeline
 icon: solar:palette-round-linear
-createTime: 2025/06/16 13:08:42
+createTime: 2025/07/14 16:37:14
 permalink: /zh/guide/agenticrag_pipeline/
 ---
@@ -29,114 +29,61 @@ permalink: /zh/guide/agenticrag_pipeline/
 ```python
 self.storage = FileStorage(
-    first_entry_file_name="../dataflow/example/AgenticRAGPipeline/pipeline_small_chunk.json",
-    cache_path="./cache_local",
-    file_name_prefix="dataflow_cache_step",
+    first_entry_file_name="../example_data/AgenticRAGPipeline/eval_test_data.jsonl",
+    cache_path="./agenticRAG_eval_cache",
+    file_name_prefix="agentic_rag_eval",
    cache_type="jsonl",
 )
 ```

-### 2. **Content Selection**
-
-#### 2.1 **Selecting Content**
-
-The first step of the pipeline uses the **content chooser** operator (`ContentChooser`) to select a subset of texts from a large dataset. This step matters because it decides which texts enter the downstream generation steps.
-
-**Function:**
-
-* Identifies and selects representative text content from a set of text contexts.
-
-**Input**: raw text content
-
-**Output**: selected text content
-
-```python
-embedding_serving = APILLMServing_request(
-    api_url="https://api.openai.com/v1/embeddings",
-    model_name="text-embedding-ada-002",
-    max_workers=100
-)
-
-content_chooser = ContentChooser(num_samples = 5, method = "random", embedding_serving=embedding_serving)
-result = content_chooser.run(
-    storage = self.storage.step(),
-    input_key = "text",
-    )
-```
-
----
-
-### 3. **QA Generation**
-
-#### 3.1 **Automatic Prompt Generation**
-
-The second step of the pipeline uses the **automatic prompt generator** operator (`AutoPromptGenerator`) to generate dedicated prompts for QA generation, ensuring every selected text is paired with a suitable prompt.
-
-**Input**: selected text content
-**Output**: a generated prompt for each text
-
-```python
-prompt_generator = AutoPromptGenerator(api_llm_serving)
-result = prompt_generator.run(
-    storage = self.storage.step(),
-    input_key = "text",
-    output_key = "generated_prompt"
-    )
-```
-
----
-
-#### 3.2 **QA Pair Generation**
-
-The third step of the pipeline uses the **QA generator** operator (`QAGenerator`) to produce a QA pair for each text and its prompt, yielding the core data for later evaluation and use.
-
-**Input**: selected text content and its generated prompt
-**Output**: generated QA pairs
-
-```python
-qa_gen = QAGenerator(llm_serving=api_llm_serving)
-result = qa_gen.run(
-    storage = self.storage.step(),
-    input_key="text",
-    prompt_key="generated_prompt",
-    output_quesion_key="generated_question",
-    output_answer_key="generated_answer"
-    )
-```
-
----
-
-#### 3.3 **QA Pair Scoring**
-
-The fourth step of the pipeline uses the **QA scorer** operator (`QAScorer`) to assess the quality of the generated QA pairs, providing multi-dimensional grades and feedback for each pair to support filtering and improvement.
-
-**Input**: generated QA pairs
-**Output**: grades and feedback for each QA pair
-
-```python
-qa_scorer = QAScorer(llm_serving=api_llm_serving)
-result = qa_scorer.run(
-    storage = self.storage.step(),
-    input_question_key="generated_question",
-    input_answer_key="generated_answer",
-    output_question_quality_key="question_quality_grades",
-    output_question_quality_feedback_key="question_quality_feedbacks",
-    output_answer_alignment_key="answer_alignment_grades",
-    output_answer_alignment_feedback_key="answer_alignment_feedbacks",
-    output_answer_verifiability_key="answer_verifiability_grades",
-    )
-```
+### 2. **Atomic QA Generation**
+
+#### 2.1 **QA Generation**
+
+The first step of the pipeline uses the **atomic task generator** operator (`AgenticRAGAtomicTaskGenerator`) to generate, from a large dataset, a question, a reference answer, a refined reference answer, alternative (verification) answers, and the LLM's answer to the question when the original document is provided.
+
+**Function:**
+
+* From a set of text contexts, generates questions, reference answers, refined reference answers, alternative (verification) answers, and the LLM's answer given the original document.
+
+**Input**: raw text content
+
+**Output**: question, reference answer, refined reference answer, alternative (verification) answers, and the LLM's answer given the original document
+
+```python
+self.llm_serving = APILLMServing_request(
+    api_url="https://api.openai.com/v1/chat/completions",
+    model_name="gpt-4o-mini",
+    max_workers=500
+)
+
+atomic_task_generator = AgenticRAGAtomicTaskGenerator(
+    llm_serving=self.llm_serving
+    )
+
+result = atomic_task_generator.run(
+    storage = self.storage.step(),
+    input_key = "contents",
+    )
+```
+
+---
+
+### 3. **QA Quality Evaluation**
+
+#### 3.1 **F1 Scorer**
+
+The second step of the pipeline uses the **F1 scorer** operator (`AgenticRAGQAF1SampleEvaluator`) to compute the F1 score between the refined reference answer and the LLM's answer produced with the original document. This ensures that, for each constructed question, an answer obtained under correct document retrieval can be given an appropriate reward, safeguarding the quality of RL training.
+
+**Function:**
+
+* Computes the F1 score between the refined reference answer and the LLM's answer given the original document.
+
+**Input**: reference answer, LLM's answer given the original document
+**Output**: F1 score
+
+```python
+f1_scorer = AgenticRAGQAF1SampleEvaluator()
+result = f1_scorer.run(
+    storage=self.storage.step(),
+    output_key="F1Score",
+    input_prediction_key="refined_answer",
+    input_ground_truth_key="golden_doc_answer"
+    )
+```
+
+---
input_key = "contents", ) - self.prompt_generator_step2.run( - storage = self.storage.step(), - input_key = "text" + self.task_step2.run( + storage=self.storage.step(), + output_key="F1Score", + input_prediction_key="refined_answer", + input_ground_truth_key="golden_doc_answer" ) - self.qa_generator_step3.run( - storage = self.storage.step(), - input_key="text", - prompt_key="generated_prompt", - output_quesion_key="generated_question", - output_answer_key="generated_answer" - ) - - self.qa_scorer_step4.run( - storage = self.storage.step(), - input_question_key="generated_question", - input_answer_key="generated_answer", - output_question_quality_key="question_quality_grades", - output_question_quality_feedback_key="question_quality_feedbacks", - output_answer_alignment_key="answer_alignment_grades", - output_answer_alignment_feedback_key="answer_alignment_feedbacks", - output_answer_verifiability_key="answer_verifiability_grades", - ) - if __name__ == "__main__": - model = AgenticRAGPipeline() + model = AgenticRAGEval_APIPipeline() model.forward() ``` diff --git a/docs/zh/notes/guide/pipelines/AgenticRAGPipeline2.md b/docs/zh/notes/guide/pipelines/AgenticRAGPipeline2.md deleted file mode 100644 index 07b5c82fb..000000000 --- a/docs/zh/notes/guide/pipelines/AgenticRAGPipeline2.md +++ /dev/null @@ -1,159 +0,0 @@ ---- -title: AgenticRAG数据合成流水线-Beta -icon: solar:palette-round-linear -createTime: 2025/07/14 16:37:14 -permalink: /zh/guide/agenticrag_pipeline2/ ---- - -# Agentic RAG 数据合成流程 - -## 1. 概述 - -**Agentic RAG 数据合成流程**是一个端到端的框架,用于: -- 支持基于强化学习的 Agentic RAG 训练。 -- 从提供的文本内容中生成高质量的问题和答案对。 - -该流程只需要文本上下文即可生成高质量的问题和答案,用于后续训练。 - ---- - -## 2. 数据流与流程逻辑 - -### 1. **输入数据** - -该流程的输入数据包括以下字段: - -* **text**:各种文本内容 - -这些输入数据可以存储在指定的文件中(如 `json` 或 `jsonl`),并通过 `FileStorage` 对象进行管理和读取。在提供的示例中,默认数据路径被加载。在实际使用中,你可以修改路径以加载自定义数据和缓存路径: - -```python -self.storage = FileStorage( - first_entry_file_name="../dataflow/example/AgenticRAGPipeline/pipeline_small_chunk.json", - cache_path="./cache_local", - file_name_prefix="dataflow_cache_step", - cache_type="jsonl", -) -``` - -### 2. **原子问答生成** - -#### 2.1 **问答生成** - -流程的第一步是使用 **原子任务生成器** 算子(`AtomicTaskGenerator`)从大型数据集中分别生成问题、参考答案、精简的参考答案、可替代(验证)以及在提供原始文档下LLM对问题的回答。 - -**功能:** - -* 从一组文本上下文中生成问题、参考答案、精简的参考答案、可替代(验证)以及在提供原始文档下LLM对问题的回答。 - -**输入**:原始文本内容 - -**输出**:问题、参考答案、精简的参考答案、可替代(验证)以及在提供原始文档下LLM对问题的回答 - -```python -llm_serving = APILLMServing_request( - api_url="https://api.openai.com/v1/chat/completions", - model_name="gpt-4o-mini", - max_workers=500 - ) - -atomic_task_generator = AtomicTaskGenerator( - llm_serving=llm_serving - ) -result = atomic_task_generator.run( - storage = self.storage.step(), - input_key = "text", - ) -``` - ---- - -### 3. **问答生成质量评估** - -#### 3.1 **F1打分器** - -流程的第二步是使用 **F1打分器** 算子(`F1Scorer`)为精简的参考答案与提供原始文档下LLM对问题的回答之间的 F1 分数进行评估。这一步确保每个已构造的问题,在正确地文档检索下,所回答的答案能被合适地给予reward,保障强化学习的训练质量。 - -**功能:** - -* 为精简的参考答案与提供原始文档下LLM对问题的回答之间的 F1 分数进行评估。 - -**输入**:参考答案、 提供原始文档下LLM对问题的回答 -**输出**:F1 分数 - -```python -f1_scorer = F1Scorer( - prediction_key="refined_answer", - ground_truth_key="golden_doc_answer" - ) -result = f1_scorer.run( - storage=self.storage.step(), - output_key="F1Score" - ) -``` - ---- - -## 3. 
## 3. Running the Pipeline

Run the full pipeline:

```python
import pandas as pd
from dataflow.operators.eval import *

from dataflow.operators.generate import (
    AtomicTaskGenerator,
    DepthQAGenerator,
    WidthQAGenerator
)

from dataflow.operators.filter import *
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request, LocalModelLLMServing
from dataflow.core import LLMServingABC

class AgenticRAGEvalPipeline():

    def __init__(self, llm_serving=None):

        self.storage = FileStorage(
            first_entry_file_name="../dataflow/example/AgenticRAGPipeline/pipeline_small_chunk.json",
            cache_path="./agenticRAG_eval_cache",
            file_name_prefix="agentic_rag_eval",
            cache_type="jsonl",
        )

        llm_serving = APILLMServing_request(
            api_url="https://api.openai.com/v1/chat/completions",
            model_name="gpt-4o-mini",
            max_workers=500
        )

        self.task_step1 = AtomicTaskGenerator(
            llm_serving=llm_serving
        )

        self.task_step2 = F1Scorer(
            prediction_key="refined_answer",
            ground_truth_key="golden_doc_answer"
        )

    def forward(self):

        self.task_step1.run(
            storage = self.storage.step(),
            input_key = "contents",
        )

        self.task_step2.run(
            storage=self.storage.step(),
            output_key="F1Score"
        )

if __name__ == "__main__":
    model = AgenticRAGEvalPipeline()
    model.forward()
```

---
\ No newline at end of file
diff --git a/docs/zh/notes/guide/pipelines/Doc2QAPipeline.md b/docs/zh/notes/guide/pipelines/Doc2QAPipeline.md
new file mode 100644
index 000000000..6c06ded2b
--- /dev/null
+++ b/docs/zh/notes/guide/pipelines/Doc2QAPipeline.md
@@ -0,0 +1,234 @@
---
title: Doc-to-QA Data Synthesis Pipeline
icon: solar:palette-round-linear
createTime: 2025/06/16 13:08:42
permalink: /zh/guide/doc2qa_pipeline/
---

# Doc-to-QA Data Synthesis Pipeline

## 1. Overview

The **Doc2QA data synthesis pipeline** is an end-to-end framework that:
- Builds training corpora for document-grounded question answering.
- Generates high-quality question-answer pairs from the provided text content.

The pipeline needs only text contexts to produce high-quality questions and answers for downstream training.

---

## 2. Data Flow and Pipeline Logic

### 1. **Input Data**

The pipeline's input data includes the following field:

* **text**: text content of any kind

The input data can be stored in a designated file (such as `json` or `jsonl`) and is managed and read through a `FileStorage` object. The provided example loads the default data path; in practice you can change the path to load custom data and cache paths:

```python
self.storage = FileStorage(
    first_entry_file_name="../example_data/core_text_data/pipeline_small_chunk.json",
    cache_path="./cache_local",
    file_name_prefix="dataflow_cache_step",
    cache_type="json",
    )
```

### 2. **Content Selection**

#### 2.1 **Sample Selection**

The first step of the pipeline uses the **sampler** operator (`KCenterGreedyFilter`) to sample a subset of text content from a large dataset. This step matters because it decides which texts enter the downstream generation steps.

**Function:**

* Identifies and selects representative text content from a set of text contexts.

**Input**: raw text content

**Output**: selected text content

```python
self.llm_serving = APILLMServing_request(
    api_url="https://api.openai.com/v1/chat/completions",
    model_name="gpt-4o",
    max_workers=1
)

embedding_serving = APILLMServing_request(
    api_url="https://api.openai.com/v1/embeddings",
    model_name="text-embedding-ada-002",
    max_workers=100
)

self.content_chooser_step1 = KCenterGreedyFilter(embedding_serving=embedding_serving, num_samples=5)

result = self.content_chooser_step1.run(
    storage = self.storage.step(),
    input_key = "text"
    )
```

---
### 3. **QA Generation**

#### 3.1 **Automatic Prompt Generation**

The second step of the pipeline uses the **automatic prompt generator** operator (`Doc2PromptGenerator`) to generate dedicated prompts for QA generation. This step ensures every selected text is paired with a suitable prompt for the subsequent QA generation.

**Function:**

* Automatically generates a suitable prompt for each selected text to guide QA generation.

**Input**: selected text content
**Output**: a generated prompt for each text

```python
self.doc2prompt_generator_step2 = Doc2PromptGenerator(self.llm_serving)
result = self.doc2prompt_generator_step2.run(
    storage = self.storage.step(),
    input_key = "text",
    output_key = "generated_prompt"
    )
```

---

#### 3.2 **QA Pair Generation**

The third step of the pipeline uses the **QA generator** operator (`Doc2QAGenerator`) to produce a question-answer pair for each text and its corresponding prompt. This step produces the core data used in later evaluation and training.

**Function:**

* Generates a question and its answer from the text content and the generated prompt.

**Input**: selected text content and its generated prompt
**Output**: generated QA pairs

```python
self.doc2qa_generator_step3 = Doc2QAGenerator(self.llm_serving)
result = self.doc2qa_generator_step3.run(
    storage = self.storage.step(),
    input_key="text",
    output_prompt_key="generated_prompt",
    output_quesion_key="generated_question",
    output_answer_key="generated_answer"
    )
```

---

#### 3.3 **QA Pair Scoring**

The fourth step of the pipeline uses the **QA scorer** operator (`Doc2QASampleEvaluator`) to assess the quality of the generated QA pairs. This step provides multi-dimensional grades and feedback for every QA pair, supporting further filtering and improvement.

**Function:**

* Evaluates generated QA pairs along several dimensions (question quality, answer alignment, answer verifiability, and downstream value) and provides grades with detailed feedback.

**Input**: generated QA pairs
**Output**: grades and feedback for each QA pair

```python
self.doc2qa_scorer_step4 = Doc2QASampleEvaluator(self.llm_serving)
result = self.doc2qa_scorer_step4.run(
    storage = self.storage.step(),
    input_question_key="generated_question",
    input_answer_key="generated_answer",
    output_question_quality_key="question_quality_grades",
    output_question_quality_feedback_key="question_quality_feedbacks",
    output_answer_alignment_key="answer_alignment_grades",
    output_answer_alignment_feedback_key="answer_alignment_feedbacks",
    output_answer_verifiability_key="answer_verifiability_grades",
    )
```

---
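The grades written by the scorer can drive a simple post-hoc filter before training. A minimal sketch, assuming the scorer's cache step was written as JSON records and that the grades are numeric (the file name and grade scale here are hypothetical; adjust to your `cache_path` and the scale your scorer actually emits):

```python
import pandas as pd

# Hypothetical cache file produced by the scorer step.
df = pd.read_json("./cache_local/dataflow_cache_step_step4.json")

# Keep only QA pairs whose grades clear a chosen threshold on every dimension.
grade_columns = [
    "question_quality_grades",
    "answer_alignment_grades",
    "answer_verifiability_grades",
]
threshold = 4  # assumption: grades are ordinal, higher is better
filtered = df[(df[grade_columns] >= threshold).all(axis=1)]
filtered.to_json("filtered_qa.json", orient="records", force_ascii=False)
```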
## 3. Running the Pipeline

Run the full pipeline:

```python
from dataflow.operators.core_text import (
    Doc2PromptGenerator,
    Doc2QASampleEvaluator,
    Doc2QAGenerator,
    KCenterGreedyFilter
)

from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.serving import LocalModelLLMServing_vllm

class Doc2QA_APIPipeline():
    def __init__(self):

        self.storage = FileStorage(
            first_entry_file_name="../example_data/core_text_data/pipeline_small_chunk.json",
            cache_path="./cache_local",
            file_name_prefix="dataflow_cache_step",
            cache_type="json",
        )

        # use API server as LLM serving
        self.llm_serving = APILLMServing_request(
            api_url="https://api.openai.com/v1/chat/completions",
            model_name="gpt-4o",
            max_workers=1
        )

        embedding_serving = APILLMServing_request(
            api_url="https://api.openai.com/v1/embeddings",
            model_name="text-embedding-ada-002",
            max_workers=100
        )

        self.content_chooser_step1 = KCenterGreedyFilter(embedding_serving=embedding_serving, num_samples=5)

        self.doc2prompt_generator_step2 = Doc2PromptGenerator(self.llm_serving)

        self.doc2qa_generator_step3 = Doc2QAGenerator(self.llm_serving)

        self.doc2qa_scorer_step4 = Doc2QASampleEvaluator(self.llm_serving)

    def forward(self):

        self.content_chooser_step1.run(
            storage = self.storage.step(),
            input_key = "text"
        )

        self.doc2prompt_generator_step2.run(
            storage = self.storage.step(),
            input_key = "text"
        )

        self.doc2qa_generator_step3.run(
            storage = self.storage.step(),
            input_key="text",
            output_prompt_key="generated_prompt",
            output_quesion_key="generated_question",
            output_answer_key="generated_answer"
        )

        self.doc2qa_scorer_step4.run(
            storage = self.storage.step(),
            input_question_key="generated_question",
            input_answer_key="generated_answer",
            output_question_quality_key="question_quality_grades",
            output_question_quality_feedback_key="question_quality_feedbacks",
            output_answer_alignment_key="answer_alignment_grades",
            output_answer_alignment_feedback_key="answer_alignment_feedbacks",
            output_answer_verifiability_key="answer_verifiability_grades",
        )

if __name__ == "__main__":
    model = Doc2QA_APIPipeline()
    model.forward()
```

---
\ No newline at end of file