diff --git a/docs/en/notes/guide/pipelines/Pdf2ModelPipeline.md b/docs/en/notes/guide/pipelines/Pdf2ModelPipeline.md index cf2565d51..6bc073350 100644 --- a/docs/en/notes/guide/pipelines/Pdf2ModelPipeline.md +++ b/docs/en/notes/guide/pipelines/Pdf2ModelPipeline.md @@ -12,15 +12,14 @@ conda create -n dataflow python=3.10 conda activate dataflow git clone https://github.com/OpenDCAI/DataFlow.git cd DataFlow -pip install -e .[mineru] -pip install llamafactory[torch,metrics] -pip install open-dataflow[vllm] - +# Prepare environment +pip install -e .[llamafactory] +# Prepare models mineru-models-download cd .. -mkdir test -cd test +mkdir run_dataflow +cd run_dataflow # Initialize dataflow pdf2model init @@ -28,7 +27,7 @@ dataflow pdf2model init # Train dataflow pdf2model train -# Chat with the trained model, or chat with locally trained models +# Chat with the trained model, or chat with locally trained models in the workspace directory dataflow chat ``` @@ -42,10 +41,7 @@ conda activate dataflow cd DataFlow -pip install -e .[mineru] - -pip install llamafactory[torch,metrics] -pip install open-dataflow[vllm] +pip install -e .[llamafactory] ``` @@ -82,7 +78,7 @@ After initialization is complete, the project directory becomes: ```shell Project Root/ -├── Pdf2QAPipeline.py # pipeline execution file +├── pdf_to_qa_pipeline.py # pipeline execution file └── .cache/ # cache directory └── train_config.yaml # default config file for llamafactory training ``` @@ -101,7 +97,7 @@ After fine-tuning is complete, the project directory becomes: ``` Project Root/ -├── Pdf2QAPipeline.py # pipeline execution file +├── pdf_to_qa_pipeline.py # pipeline execution file └── .cache/ # cache directory ├── train_config.yaml # default config file for llamafactory training ├── data/ @@ -114,9 +110,9 @@ Project Root/ │ ├── batch_cleaning_step_step4.json │ └── pdf_list.jsonl ├── mineru/ - │ └── sample-1-7/auto/ + │ └── sample/auto/ └── saves/ - └── qwen2.5_7b_sft_model/ + └── pdf2model_cache_{timestamp}/ 
``` @@ -128,7 +124,6 @@ Project Root/ # Default path: .cache/saves/pdf2model_cache_{timestamp} dataflow chat --model ./custom_model_path -# Method 2: Navigate to model directory and run dataflow chat -cd .cache/saves/pdf2model_cache_20250901_143022 +# Method 2: Navigate to the workspace directory and run dataflow chat dataflow chat ``` \ No newline at end of file diff --git a/docs/zh/notes/guide/pipelines/Pdf2ModelPipeline.md b/docs/zh/notes/guide/pipelines/Pdf2ModelPipeline.md index 77d5a4bd9..6da7fe952 100644 --- a/docs/zh/notes/guide/pipelines/Pdf2ModelPipeline.md +++ b/docs/zh/notes/guide/pipelines/Pdf2ModelPipeline.md @@ -12,15 +12,14 @@ conda create -n dataflow python=3.10 conda activate dataflow git clone https://github.com/OpenDCAI/DataFlow.git cd DataFlow -pip install -e .[mineru] -pip install llamafactory[torch,metrics] -pip install open-dataflow[vllm] - +#环境准备 +pip install -e .[llamafactory] +#模型准备 mineru-models-download cd .. -mkdir test -cd test +mkdir run_dataflow +cd run_dataflow #初始化 dataflow pdf2model init @@ -41,11 +40,7 @@ conda create -n dataflow python=3.10 conda activate dataflow cd DataFlow - -pip install -e .[mineru] - -pip install llamafactory[torch,metrics] -pip install open-dataflow[vllm] +pip install -e .[llamafactory] ``` @@ -80,7 +75,7 @@ dataflow pdf2model init ```shell 项目根目录/ -├── Pdf2QAPipeline.py # pipeline执行文件 +├── pdf_to_qa_pipeline.py # pipeline执行文件 └── .cache/ # 缓存目录 └── train_config.yaml # llamafactory训练的默认配置文件 ``` @@ -99,7 +94,7 @@ dataflow pdf2model train ``` 项目根目录/ -├── Pdf2QAPipeline.py # pipeline执行文件 +├── pdf_to_qa_pipeline.py # pipeline执行文件 └── .cache/ # 缓存目录 ├── train_config.yaml # llamafactory训练的默认配置文件 ├── data/ @@ -112,7 +107,7 @@ dataflow pdf2model train │ ├── batch_cleaning_step_step4.json │ └── pdf_list.jsonl ├── mineru/ - │ └── sample-1-7/auto/ + │ └── sample/auto/ └── saves/ └── pdf2model_cache_{timestamp}/ ``` @@ -124,5 +119,8 @@ dataflow pdf2model train ``` #用法一:--model 可以指定 对话模型的路径位置(可选) 
#默认值为.cache/saves/pdf2model_cache_{timestamp} -#用法二:到模型文件夹下 运行dataflow chat +dataflow chat --model ./custom_model_path + +#用法二:在工作文件夹下 运行dataflow chat +dataflow chat ```