From 192f9caab432e74c8e117b149b14fb873045f51f Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Wed, 13 Aug 2025 18:50:52 +0800 Subject: [PATCH 01/13] Pre ce modified (#3335) (#3360) * Pre ce modified (#3335) * update * update * fix * fix * update * update * update * fix * update * update * update * add ut fix pr(3367) --- .github/workflows/_pre_ce_test.yml | 55 +++++++++++++------ .github/workflows/_unit_test_coverage.yml | 53 ++++++++++++++---- .github/workflows/pr_build_and_test.yml | 10 ++-- .../openai/test_serving_completion.py | 1 + 4 files changed, 87 insertions(+), 32 deletions(-) diff --git a/.github/workflows/_pre_ce_test.yml b/.github/workflows/_pre_ce_test.yml index 91368dfda1..1f61ef7869 100644 --- a/.github/workflows/_pre_ce_test.yml +++ b/.github/workflows/_pre_ce_test.yml @@ -21,6 +21,11 @@ on: required: false type: string default: "" + MODEL_CACHE_DIR: + description: "Cache Dir Use" + required: false + type: string + default: "" concurrency: group: ${{ github.event.pull_request.number }} @@ -28,7 +33,7 @@ concurrency: jobs: run_ce_cases: - runs-on: [self-hosted, GPU-L20-4Card] + runs-on: [self-hosted, PRE_CE_RUN_2Card] steps: - name: Print current runner name run: | @@ -67,37 +72,51 @@ jobs: env: docker_image: ${{ inputs.DOCKER_IMAGE }} fd_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }} + CACHE_DIR: ${{ inputs.CACHE_DIR }} + MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }} run: | runner_name="${{ runner.name }}" - last_char="${runner_name: -1}" + CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}') + DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,) + DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1) + + FLASK_PORT=$((42068 + DEVICE_PORT * 100)) + FD_API_PORT=$((42088 + DEVICE_PORT * 100)) + FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100)) + FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100)) + echo "Test ENV Parameter:" + echo "=========================================================" + echo "FLASK_PORT=${FLASK_PORT}" + echo "FD_API_PORT=${FD_API_PORT}" + echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" + echo "FD_METRICS_PORT=${FD_METRICS_PORT}" + echo "DEVICES=${DEVICES}" + echo "=========================================================" - if [ "${last_char}" = "1" ]; then - gpu_id=2 - DEVICES="2,3" - else - gpu_id=0 - DEVICES="0,1" + CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}" + echo "CACHE_DIR is set to ${CACHE_DIR}" + if [ ! 
-f "${CACHE_DIR}/gitconfig" ]; then + touch "${CACHE_DIR}/gitconfig" fi - FD_API_PORT=$((9180 + gpu_id * 100)) - FD_ENGINE_QUEUE_PORT=$((9150 + gpu_id * 100)) - FD_METRICS_PORT=$((9170 + gpu_id * 100)) - PARENT_DIR=$(dirname "$WORKSPACE") - echo "PARENT_DIR:$PARENT_DIR" docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ - -v "/ssd4/GithubActions/gitconfig:/etc/gitconfig:ro" \ - -v "/ssd4/GithubActions/ModelData:/ModelData:ro" \ - -v "/ssd4/GithubActions/CacheDir:/root/.cache" \ - -v "/ssd4/GithubActions/ConfigDir:/root/.config" \ + -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \ + -v "${CACHE_DIR}/.cache:/root/.cache" \ + -v "${CACHE_DIR}/ConfigDir:/root/.config" \ + -v "${MODEL_CACHE_DIR}:/ModelData:ro" \ -e "MODEL_PATH=/ModelData" \ -e "FD_API_PORT=${FD_API_PORT}" \ -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \ -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \ + -e "FLASK_PORT=${FLASK_PORT}" \ -e "fd_wheel_url=${fd_wheel_url}" \ - --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -c ' + --gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c ' git config --global --add safe.directory /workspace/FastDeploy cd FastDeploy python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ python -m pip install ${fd_wheel_url} + for port in $FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT; do + lsof -t -i :$port | xargs -r kill -9 || true + done bash scripts/run_pre_ce.sh ' diff --git a/.github/workflows/_unit_test_coverage.yml b/.github/workflows/_unit_test_coverage.yml index b959fff17f..130dbf74b4 100644 --- a/.github/workflows/_unit_test_coverage.yml +++ b/.github/workflows/_unit_test_coverage.yml @@ -22,6 +22,11 @@ on: required: false type: string default: "" + MODEL_CACHE_DIR: + description: "Cache Dir Use" + required: false + type: string + default: "" jobs: run_tests_with_coverage: @@ -67,11 +72,26 @@ jobs: fd_wheel_url: ${{ inputs.FASTDEPLOY_WHEEL_URL }} CACHE_DIR: ${{ inputs.CACHE_DIR }} BASE_REF: ${{ github.event.pull_request.base.ref }} + MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }} run: | set -x runner_name="${{ runner.name }}" CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}') - gpu_id=$(echo "$CARD_ID" | fold -w1 | paste -sd,) + DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,) + DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1) + + FLASK_PORT=$((42068 + DEVICE_PORT * 100)) + FD_API_PORT=$((42088 + DEVICE_PORT * 100)) + FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100)) + FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100)) + echo "Test ENV Parameter:" + echo "=========================================================" + echo "FLASK_PORT=${FLASK_PORT}" + echo "FD_API_PORT=${FD_API_PORT}" + echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" + echo "FD_METRICS_PORT=${FD_METRICS_PORT}" + echo "DEVICES=${DEVICES}" + echo "=========================================================" CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}" echo "CACHE_DIR is set to ${CACHE_DIR}" @@ -86,17 +106,22 @@ jobs: -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \ -v "${CACHE_DIR}/.cache:/root/.cache" \ -v "${CACHE_DIR}/ConfigDir:/root/.config" \ + -v "${MODEL_CACHE_DIR}:/ModelData:ro" \ + -e "MODEL_PATH=/ModelData" \ + -e "FD_API_PORT=${FD_API_PORT}" \ + -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \ + -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \ + -e "FLASK_PORT=${FLASK_PORT}" \ -e TZ="Asia/Shanghai" \ -e "fd_wheel_url=${fd_wheel_url}" \ -e "BASE_REF=${BASE_REF}" \ - --gpus "\"device=${gpu_id}\"" 
${docker_image} /bin/bash -c ' + --gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c ' git config --global --add safe.directory /workspace/FastDeploy cd FastDeploy python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ - pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple - + pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple python -m pip install coverage python -m pip install diff-cover @@ -152,32 +177,40 @@ jobs: echo "unittest_failed_url=${UNIT_TEST_RESULT_URL}" >> $GITHUB_OUTPUT echo "unittest_failed_url=${UNIT_TEST_RESULT_URL}" >> $GITHUB_ENV fi - - name: Determine Unit Succ and whether the coverage rate reaches 80% + - name: Check Unit Test Success shell: bash run: | + cd FastDeploy if [ "$TEST_EXIT_CODE" -eq 8 ]; then + filename=$(basename "$unittest_failed_url") if [ -z "${unittest_failed_url}" ]; then echo "No diff unit failed file URL provided." else - wget ${unittest_failed_url} || echo "Download unittest file failed, but continuing..." + rm -rf "${filename}" + wget -O ${filename} ${unittest_failed_url} || echo "Download unittest file failed, but continuing..." fi echo "Unit tests failed (exit code 8)" - filename=$(basename "$unittest_failed_url") if [ -f "${filename}" ];then echo "Failed test cases:" cat "${filename}" fi exit "$TEST_EXIT_CODE" fi + echo "All tests passed" + - name: Verify Code Coverage Threshold (80%) + shell: bash + run: | + cd FastDeploy if [ "$COVERAGE_EXIT_CODE" -eq 9 ]; then echo "Coverage generation failed (exit code 9)" + filename=$(basename "$diff_cov_result_json_url") if [ -z "${diff_cov_result_json_url}" ]; then echo "No diff cov result file URL provided." else - wget ${diff_cov_result_json_url} || echo "Download cov json file failed, but continuing..." + rm -rf "${filename}" + wget -O ${filename} ${diff_cov_result_json_url} || echo "Download cov json file failed, but continuing..." 
fi - filename=$(basename "$diff_cov_result_json_url") if [ -f "${filename}" ];then echo "Failed test cases:" if command -v jq >/dev/null 2>&1; then @@ -188,7 +221,7 @@ jobs: fi exit "$COVERAGE_EXIT_CODE" fi - echo "All tests and coverage passed" + echo "coverage passed" exit 0 diff_coverage_report: diff --git a/.github/workflows/pr_build_and_test.yml b/.github/workflows/pr_build_and_test.yml index 0123e5a554..73abc2440d 100644 --- a/.github/workflows/pr_build_and_test.yml +++ b/.github/workflows/pr_build_and_test.yml @@ -39,25 +39,27 @@ jobs: needs: [clone,build] uses: ./.github/workflows/_unit_test_coverage.yml with: - DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310 + DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }} FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }} + MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData" logprob_test: name: Run FastDeploy LogProb Tests needs: [build] uses: ./.github/workflows/_logprob_test_linux.yml with: - DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:cuda126-py310 + DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate PADDLETEST_ARCHIVE_URL: "https://xly-devops.bj.bcebos.com/PaddleTest/PaddleTest.tar.gz" FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }} - MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelCache" + MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData" pre_ce_test: name: Extracted partial CE model tasks to run in CI. needs: [clone,build] uses: ./.github/workflows/_pre_ce_test.yml with: - DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:fastdeploy-ciuse-cuda126 + DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }} FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }} + MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData" diff --git a/test/entrypoints/openai/test_serving_completion.py b/test/entrypoints/openai/test_serving_completion.py index 8d1a4eb663..4c7404a790 100644 --- a/test/entrypoints/openai/test_serving_completion.py +++ b/test/entrypoints/openai/test_serving_completion.py @@ -95,6 +95,7 @@ def test_request_output_to_completion_response(self): model_name=model_name, prompt_batched_token_ids=prompt_batched_token_ids, completion_batched_token_ids=completion_batched_token_ids, + text_after_process_list=["1", "1"], ) assert completion_response.id == request_id From a375378cc1edf452dcf73c6e72dc8fb834e7dc16 Mon Sep 17 00:00:00 2001 From: ming1753 <61511741+ming1753@users.noreply.github.com> Date: Thu, 14 Aug 2025 09:49:22 +0800 Subject: [PATCH 02/13] [Bug Fix] Fix V1 video bug (#3387) --- .../engine/sched/resource_manager_v1.py | 39 ++++++++++++------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index d1116980c5..4aecabcd5d 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -97,13 +97,13 @@ def _prepare_decode_task(self, request): def _prepare_preempt_task(self, request): return ScheduledPreemptTask(idx=request.idx, request_id=request.request_id) - + def reschedule_preempt_task(self, request_id): with self.lock: if request_id in 
self.to_be_rescheduled_request_id_set and request_id in self.requests: request = self.requests[request_id] self.waiting.appendleft(request) - self.to_be_rescheduled_request_id_set.remove(request_id) + self.to_be_rescheduled_request_id_set.remove(request_id) def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_reqs): can_schedule = True @@ -142,26 +142,31 @@ def _get_num_new_tokens(self, request, token_budget): input_ids_lst = request.prompt_token_ids + request.output_token_ids input_ids = paddle.to_tensor(input_ids_lst, dtype="int64") - grid_thw = [] - for one in inputs["grid_thw"]: - if one[0] == 1: - grid_thw.append(one) - else: - grid_thw.extend([[2, one[1], one[2]]] * (one[0] // 2)) - + input_ids = paddle.to_tensor(input_ids_lst, dtype="int64") image_patch_id = inputs["image_patch_id"] - grid_thw = paddle.to_tensor(grid_thw, dtype="int64") + if request.multimodal_img_boundaries is None: + grid_thw = [] + for one in inputs["grid_thw"]: + if one[0] == 1: + grid_thw.append(one) + else: + grid_thw.extend([[2, one[1], one[2]]] * (one[0] // 2)) + + grid_thw = paddle.to_tensor(grid_thw, dtype="int64") from fastdeploy.model_executor.ops.gpu import get_img_boundaries request.multimodal_img_boundaries = get_img_boundaries( task_input_ids=input_ids, grid_thw=grid_thw, image_patch_id=image_patch_id ).numpy() + grid_thw = grid_thw.numpy().reshape([-1, 3]) + inputs["grid_thw"] = grid_thw + + grid_thw = inputs["grid_thw"] img_boundaries_idx = request.multimodal_img_boundaries[0] img_num_per_boundary = request.multimodal_img_boundaries[1] ori_prompt_len = img_boundaries_idx[-1].item() - grid_thw = grid_thw.numpy().reshape([-1, 3]) pre_end_idx = request.num_computed_tokens new_end_idx = pre_end_idx + num_new_tokens if new_end_idx < ori_prompt_len and input_ids[new_end_idx - 1] == image_patch_id: @@ -421,9 +426,15 @@ def finish_requests(self, request_ids: Union[str, Iterable[str]]): self.running.remove(request) request.status = RequestStatus.FINISHED self._free_blocks(request) - if request.request_id in self.to_be_rescheduled_request_id_set: # finished after preempted, blocks have been recycled. - self.to_be_rescheduled_request_id_set.remove(request.request_id) # just remove from to_be_rescheduled_request_id_set - if request in self.waiting: # after finished, this request still scheduled from preempted to waiting, unexpected error, should not be here + if ( + request.request_id in self.to_be_rescheduled_request_id_set + ): # finished after preempted, blocks have been recycled. 
+ self.to_be_rescheduled_request_id_set.remove( + request.request_id + ) # just remove from to_be_rescheduled_request_id_set + if ( + request in self.waiting + ): # after finished, this request still scheduled from preempted to waiting, unexpected error, should not be here raise RuntimeError(f"request {request.request_id} scheduled into waiting list, after finished") self.tasks_list[request.idx] = None self.stop_flags[request.idx] = True From 4870919682086866cf29018d95bcf5180cb135eb Mon Sep 17 00:00:00 2001 From: JYChen Date: Thu, 14 Aug 2025 10:45:05 +0800 Subject: [PATCH 03/13] fix stopseq error info (#3342) Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com> --- fastdeploy/engine/engine.py | 4 ++-- fastdeploy/entrypoints/engine_client.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index 88067ed068..fa9fa61750 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -535,7 +535,7 @@ def add_requests(self, task, sampling_params=None, **kwargs): max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM) if len(stop_seqs_len) > max_stop_seqs_num: error_msg = ( - f"Length of stop ({stop_seqs_len}) exceeds the limit max_model_len({max_stop_seqs_num})." + f"Length of stop ({stop_seqs_len}) exceeds the limit max_stop_seqs_num({max_stop_seqs_num})." "Please reduce the number of stop or set a lager max_stop_seqs_num by `FD_MAX_STOP_SEQS_NUM`" ) llm_logger.error(error_msg) @@ -544,7 +544,7 @@ def add_requests(self, task, sampling_params=None, **kwargs): for single_stop_seq_len in stop_seqs_len: if single_stop_seq_len > stop_seqs_max_len: error_msg = ( - f"Length of stop_seqs({single_stop_seq_len}) exceeds the limit max_model_len({stop_seqs_max_len})." + f"Length of stop_seqs({single_stop_seq_len}) exceeds the limit stop_seqs_max_len({stop_seqs_max_len})." "Please reduce the length of stop sequences or set a larger stop_seqs_max_len by `FD_STOP_SEQS_MAX_LEN`" ) llm_logger.error(error_msg) diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py index 7c92390bc3..12d14f7e1c 100644 --- a/fastdeploy/entrypoints/engine_client.py +++ b/fastdeploy/entrypoints/engine_client.py @@ -152,7 +152,7 @@ def add_requests(self, task): max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM) if len(stop_seqs_len) > max_stop_seqs_num: error_msg = ( - f"Length of stop ({stop_seqs_len}) exceeds the limit max_model_len({max_stop_seqs_num})." + f"Length of stop ({stop_seqs_len}) exceeds the limit max_stop_seqs_num({max_stop_seqs_num})." "Please reduce the number of stop or set a lager max_stop_seqs_num by `FD_MAX_STOP_SEQS_NUM`" ) api_server_logger.error(error_msg) @@ -161,7 +161,7 @@ def add_requests(self, task): for single_stop_seq_len in stop_seqs_len: if single_stop_seq_len > stop_seqs_max_len: error_msg = ( - f"Length of stop_seqs({single_stop_seq_len}) exceeds the limit max_model_len({stop_seqs_max_len})." + f"Length of stop_seqs({single_stop_seq_len}) exceeds the limit stop_seqs_max_len({stop_seqs_max_len})." 
"Please reduce the length of stop sequences or set a larger stop_seqs_max_len by `FD_STOP_SEQS_MAX_LEN`" ) api_server_logger.error(error_msg) From f0a707e06f947176a34d09ade97e4387788c13ba Mon Sep 17 00:00:00 2001 From: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> Date: Thu, 14 Aug 2025 11:36:13 +0800 Subject: [PATCH 04/13] [BugFix] Fix default log level of paddleformers (#3377) Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com> --- fastdeploy/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fastdeploy/__init__.py b/fastdeploy/__init__.py index cc26ff07a0..836780ea42 100644 --- a/fastdeploy/__init__.py +++ b/fastdeploy/__init__.py @@ -24,7 +24,11 @@ os.environ["AISTUDIO_LOG"] = "critical" from fastdeploy.engine.sampling_params import SamplingParams from fastdeploy.entrypoints.llm import LLM -from fastdeploy.utils import version +from fastdeploy.utils import version, envs +from paddleformers.utils.log import logger as pf_logger +if envs.FD_DEBUG != "1": + import logging + pf_logger.logger.setLevel(logging.INFO) __all__ = ["LLM", "SamplingParams", "version"] From dc5d3ff5a00b676dc9821e89d5f8a30449726ab2 Mon Sep 17 00:00:00 2001 From: Jiang-Jia-Jun Date: Thu, 14 Aug 2025 14:05:29 +0800 Subject: [PATCH 05/13] [Polish Code] Remove useless notes --- fastdeploy/input/ernie_tokenizer.py | 2 -- fastdeploy/model_executor/layers/activation.py | 1 - 2 files changed, 3 deletions(-) diff --git a/fastdeploy/input/ernie_tokenizer.py b/fastdeploy/input/ernie_tokenizer.py index 2bbc798c5c..0575590151 100644 --- a/fastdeploy/input/ernie_tokenizer.py +++ b/fastdeploy/input/ernie_tokenizer.py @@ -14,8 +14,6 @@ # limitations under the License. """ -# cipher_token=WjI1fQOvhN # do not edit this line - import os import re from shutil import copyfile diff --git a/fastdeploy/model_executor/layers/activation.py b/fastdeploy/model_executor/layers/activation.py index 977a4f2f45..04476a5902 100644 --- a/fastdeploy/model_executor/layers/activation.py +++ b/fastdeploy/model_executor/layers/activation.py @@ -14,7 +14,6 @@ # limitations under the License. """ -# cipher_token=WjI1fQOvhN # do not edit this line from typing import Optional import paddle From d1d321bafdc11b8732a5803e757a2f0e0e66488f Mon Sep 17 00:00:00 2001 From: xiaolei373 Date: Thu, 14 Aug 2025 14:50:48 +0800 Subject: [PATCH 06/13] feat(log):add_request_and_response_log (#3392) --- fastdeploy/entrypoints/openai/api_server.py | 2 ++ fastdeploy/entrypoints/openai/serving_chat.py | 9 ++++++++- .../entrypoints/openai/serving_completion.py | 18 ++++++++++++++++-- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py index 6562bfac3c..e7064a38c6 100644 --- a/fastdeploy/entrypoints/openai/api_server.py +++ b/fastdeploy/entrypoints/openai/api_server.py @@ -244,6 +244,7 @@ async def create_chat_completion(request: ChatCompletionRequest): """ Create a chat completion for the provided prompt and parameters. """ + api_server_logger.info(f"Chat Received request: {request.model_dump_json()}") if app.state.dynamic_load_weight: status, msg = app.state.engine_client.is_workers_alive() if not status: @@ -272,6 +273,7 @@ async def create_completion(request: CompletionRequest): """ Create a completion for the provided prompt and parameters. 
""" + api_server_logger.info(f"Completion Received request: {request.model_dump_json()}") if app.state.dynamic_load_weight: status, msg = app.state.engine_client.is_workers_alive() if not status: diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index 829f39f3dc..0510a07845 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -239,6 +239,7 @@ async def chat_completion_stream_generator( prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens), ) yield f"data: {chunk.model_dump_json(exclude_unset=True)} \n\n" + api_server_logger.info(f"Chat Streaming response send_idx 0: {chunk.model_dump_json()}") first_iteration = False output = res["outputs"] @@ -265,6 +266,7 @@ async def chat_completion_stream_generator( logprobs=logprobs_res, arrival_time=arrival_time, ) + if res["finished"]: num_choices -= 1 work_process_metrics.e2e_request_latency.observe( @@ -299,6 +301,9 @@ async def chat_completion_stream_generator( if len(choices) == max_streaming_response_tokens or res["finished"]: chunk.choices = choices yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" + # 打印尾包 + if res["finished"]: + api_server_logger.info(f"Chat Streaming response last send: {chunk.model_dump_json()}") choices = [] if choices: @@ -452,13 +457,15 @@ async def chat_completion_full_generator( prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=final_res.get("num_cached_tokens", 0)), ) work_process_metrics.e2e_request_latency.observe(time.time() - final_res["metrics"]["request_start_time"]) - return ChatCompletionResponse( + res = ChatCompletionResponse( id=request_id, created=created_time, model=model_name, choices=choices, usage=usage, ) + api_server_logger.info(f"Chat response: {res.model_dump_json()}") + return res def _create_chat_logprobs( self, diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index b684079420..a6a799ac7f 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -221,8 +221,7 @@ async def completion_full_generator( valid_results[rid] = data num_choices -= 1 break - - return self.request_output_to_completion_response( + res = self.request_output_to_completion_response( final_res_batch=valid_results, request=request, request_id=request_id, @@ -232,6 +231,8 @@ async def completion_full_generator( completion_batched_token_ids=completion_batched_token_ids, text_after_process_list=text_after_process_list, ) + api_server_logger.info(f"Completion response: {res.model_dump_json()}") + return res except Exception as e: api_server_logger.error(f"Error in completion_full_generator: {e}", exc_info=True) raise @@ -322,6 +323,9 @@ async def completion_stream_generator( ], ) yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" + api_server_logger.info( + f"Completion Streaming response send_idx 0: {chunk.model_dump_json()}" + ) first_iteration[idx] = False self.engine_client.data_processor.process_response_dict( @@ -358,6 +362,15 @@ async def completion_stream_generator( choices[-1].finish_reason = self.calc_finish_reason( request.max_tokens, output_tokens[idx], output ) + send_idx = output.get("send_idx") + # 只有当 send_idx 明确为 0 时才记录日志 + if send_idx == 0 and not request.return_token_ids: + chunk_temp = chunk + chunk_temp.choices = choices + api_server_logger.info( + f"Completion Streaming response send_idx 0: 
{chunk_temp.model_dump_json()}" + ) + del chunk_temp if len(choices) == max_streaming_response_tokens or res["finished"]: chunk = CompletionStreamResponse( @@ -383,6 +396,7 @@ async def completion_stream_generator( ), ) yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n" + api_server_logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}") if choices: chunk.choices = choices yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" From b2df0311b8e8292c2b4407e09cf3ecf32ac5468b Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Thu, 14 Aug 2025 14:51:15 +0800 Subject: [PATCH 07/13] Optimize CI execution workflow. (#3371) (#3384) * fix --- .github/workflows/_logprob_test_linux.yml | 45 ++++-- .github/workflows/_pre_ce_test.yml | 22 ++- .github/workflows/_unit_test_coverage.yml | 162 +++++++++++++--------- scripts/coverage_run.sh | 18 ++- 4 files changed, 162 insertions(+), 85 deletions(-) diff --git a/.github/workflows/_logprob_test_linux.yml b/.github/workflows/_logprob_test_linux.yml index e55d937df2..79f6d47e2c 100644 --- a/.github/workflows/_logprob_test_linux.yml +++ b/.github/workflows/_logprob_test_linux.yml @@ -62,18 +62,22 @@ jobs: MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }} run: | runner_name="${{ runner.name }}" - last_char="${runner_name: -1}" + CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}') + DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,) + DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1) - if [[ "$last_char" =~ [0-7] ]]; then - DEVICES="$last_char" - else - DEVICES="0" - fi - - FLASK_PORT=$((9160 + DEVICES * 100)) - FD_API_PORT=$((9180 + DEVICES * 100)) - FD_ENGINE_QUEUE_PORT=$((9150 + DEVICES * 100)) - FD_METRICS_PORT=$((9170 + DEVICES * 100)) + FLASK_PORT=$((42068 + DEVICE_PORT * 100)) + FD_API_PORT=$((42088 + DEVICE_PORT * 100)) + FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100)) + FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100)) + echo "Test ENV Parameter:" + echo "=========================================================" + echo "FLASK_PORT=${FLASK_PORT}" + echo "FD_API_PORT=${FD_API_PORT}" + echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" + echo "FD_METRICS_PORT=${FD_METRICS_PORT}" + echo "DEVICES=${DEVICES}" + echo "=========================================================" CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}" echo "CACHE_DIR is set to ${CACHE_DIR}" @@ -85,7 +89,24 @@ jobs: exit 1 fi - PARENT_DIR=$(dirname "$WORKSPACE") + PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT) + LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log" + echo "==== LOG_FILE is ${LOG_FILE} ====" + + echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE + + for port in "${PORTS[@]}"; do + PIDS=$(lsof -t -i :$port || true) + if [ -n "$PIDS" ]; then + echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE + echo "$PIDS" | xargs -r kill -9 + echo "Port $port cleared" | tee -a $LOG_FILE + else + echo "Port $port is free" | tee -a $LOG_FILE + fi + done + + echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE docker run --ipc=host --pid=host --net=host \ -v $(pwd):/workspace \ diff --git a/.github/workflows/_pre_ce_test.yml b/.github/workflows/_pre_ce_test.yml index 1f61ef7869..637eeb249f 100644 --- a/.github/workflows/_pre_ce_test.yml +++ b/.github/workflows/_pre_ce_test.yml @@ -99,6 +99,25 @@ jobs: touch "${CACHE_DIR}/gitconfig" fi + PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT) + 
LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log" + echo "==== LOG_FILE is ${LOG_FILE} ====" + + echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE + + for port in "${PORTS[@]}"; do + PIDS=$(lsof -t -i :$port || true) + if [ -n "$PIDS" ]; then + echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE + echo "$PIDS" | xargs -r kill -9 + echo "Port $port cleared" | tee -a $LOG_FILE + else + echo "Port $port is free" | tee -a $LOG_FILE + fi + done + + echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE + docker run --rm --net=host -v $(pwd):/workspace -w /workspace \ -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \ -v "${CACHE_DIR}/.cache:/root/.cache" \ @@ -115,8 +134,5 @@ jobs: cd FastDeploy python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ python -m pip install ${fd_wheel_url} - for port in $FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT; do - lsof -t -i :$port | xargs -r kill -9 || true - done bash scripts/run_pre_ce.sh ' diff --git a/.github/workflows/_unit_test_coverage.yml b/.github/workflows/_unit_test_coverage.yml index 130dbf74b4..a29edb0aac 100644 --- a/.github/workflows/_unit_test_coverage.yml +++ b/.github/workflows/_unit_test_coverage.yml @@ -74,74 +74,100 @@ jobs: BASE_REF: ${{ github.event.pull_request.base.ref }} MODEL_CACHE_DIR: ${{ inputs.MODEL_CACHE_DIR }} run: | - set -x - runner_name="${{ runner.name }}" - CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}') - DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,) - DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1) - - FLASK_PORT=$((42068 + DEVICE_PORT * 100)) - FD_API_PORT=$((42088 + DEVICE_PORT * 100)) - FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100)) - FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100)) - echo "Test ENV Parameter:" - echo "=========================================================" - echo "FLASK_PORT=${FLASK_PORT}" - echo "FD_API_PORT=${FD_API_PORT}" - echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" - echo "FD_METRICS_PORT=${FD_METRICS_PORT}" - echo "DEVICES=${DEVICES}" - echo "=========================================================" - - CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}" - echo "CACHE_DIR is set to ${CACHE_DIR}" - if [ ! 
-f "${CACHE_DIR}/gitconfig" ]; then - touch "${CACHE_DIR}/gitconfig" - fi - PARENT_DIR=$(dirname "$WORKSPACE") - echo "PARENT_DIR:$PARENT_DIR" - docker run --rm --net=host \ - --cap-add=SYS_PTRACE --privileged --shm-size=64G \ - -v $(pwd):/workspace -w /workspace \ - -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \ - -v "${CACHE_DIR}/.cache:/root/.cache" \ - -v "${CACHE_DIR}/ConfigDir:/root/.config" \ - -v "${MODEL_CACHE_DIR}:/ModelData:ro" \ - -e "MODEL_PATH=/ModelData" \ - -e "FD_API_PORT=${FD_API_PORT}" \ - -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \ - -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \ - -e "FLASK_PORT=${FLASK_PORT}" \ - -e TZ="Asia/Shanghai" \ - -e "fd_wheel_url=${fd_wheel_url}" \ - -e "BASE_REF=${BASE_REF}" \ - --gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c ' - - git config --global --add safe.directory /workspace/FastDeploy - cd FastDeploy - python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ - - pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple - - python -m pip install coverage - python -m pip install diff-cover - python -m pip install ${fd_wheel_url} - export COVERAGE_FILE=/workspace/FastDeploy/coveragedata/.coverage - export COVERAGE_RCFILE=/workspace/FastDeploy/scripts/.coveragerc - TEST_EXIT_CODE=0 - bash scripts/coverage_run.sh || TEST_EXIT_CODE=8 - git diff origin/${BASE_REF}..HEAD --unified=0 > diff.txt - echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> exit_code.env - coverage combine coveragedata/ - coverage xml -o python_coverage_all.xml - COVERAGE_EXIT_CODE=0 - diff-cover python_coverage_all.xml --diff-file=diff.txt --fail-under=80 --json-report diff_coverage.json || COVERAGE_EXIT_CODE=9 - echo "COVERAGE_EXIT_CODE=${COVERAGE_EXIT_CODE}" >> exit_code.env - python scripts/generate_diff_coverage_xml.py diff.txt python_coverage_all.xml - ' - if [ -f FastDeploy/exit_code.env ]; then - cat FastDeploy/exit_code.env >> $GITHUB_ENV - fi + set -x + runner_name="${{ runner.name }}" + CARD_ID=$(echo "${runner_name}" | awk -F'-' '{print $NF}') + DEVICES=$(echo "$CARD_ID" | fold -w1 | paste -sd,) + DEVICE_PORT=$(echo "$DEVICES" | cut -d',' -f1) + + FLASK_PORT=$((42068 + DEVICE_PORT * 100)) + FD_API_PORT=$((42088 + DEVICE_PORT * 100)) + FD_ENGINE_QUEUE_PORT=$((42058 + DEVICE_PORT * 100)) + FD_METRICS_PORT=$((42078 + DEVICE_PORT * 100)) + echo "Test ENV Parameter:" + echo "=========================================================" + echo "FLASK_PORT=${FLASK_PORT}" + echo "FD_API_PORT=${FD_API_PORT}" + echo "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" + echo "FD_METRICS_PORT=${FD_METRICS_PORT}" + echo "DEVICES=${DEVICES}" + echo "=========================================================" + + CACHE_DIR="${CACHE_DIR:-$(dirname "$(dirname "${{ github.workspace }}")")}" + echo "CACHE_DIR is set to ${CACHE_DIR}" + if [ ! 
-f "${CACHE_DIR}/gitconfig" ]; then + touch "${CACHE_DIR}/gitconfig" + fi + + PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT) + LOG_FILE="./port_cleanup_$(date +%Y%m%d_%H%M%S).log" + echo "==== LOG_FILE is ${LOG_FILE} ====" + + echo "==== PORT CLEAN BEFORE TASK RUN ====" | tee -a $LOG_FILE + + for port in "${PORTS[@]}"; do + PIDS=$(lsof -t -i :$port || true) + if [ -n "$PIDS" ]; then + echo "Port $port is occupied by PID(s): $PIDS" | tee -a $LOG_FILE + echo "$PIDS" | xargs -r kill -9 + echo "Port $port cleared" | tee -a $LOG_FILE + else + echo "Port $port is free" | tee -a $LOG_FILE + fi + done + + echo "==== PORT CLEAN COMPLETE ====" | tee -a $LOG_FILE + + docker run --rm --net=host \ + --cap-add=SYS_PTRACE --shm-size=64G \ + -v $(pwd):/workspace -w /workspace \ + -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \ + -v "${CACHE_DIR}/.cache:/root/.cache" \ + -v "${CACHE_DIR}/ConfigDir:/root/.config" \ + -v "${MODEL_CACHE_DIR}:/ModelData:ro" \ + -e "MODEL_PATH=/ModelData" \ + -e "FD_API_PORT=${FD_API_PORT}" \ + -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \ + -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \ + -e "FLASK_PORT=${FLASK_PORT}" \ + -e TZ="Asia/Shanghai" \ + -e "fd_wheel_url=${fd_wheel_url}" \ + -e "BASE_REF=${BASE_REF}" \ + --gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c ' + + git config --global --add safe.directory /workspace/FastDeploy + cd FastDeploy + python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ + + pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + + python -m pip install coverage + python -m pip install diff-cover + python -m pip install ${fd_wheel_url} + if [ -d "test/plugins" ]; then + cd test/plugins + python setup.py install + cd ../.. + else + echo "Warning: test/plugins directory not found, skipping setup.py install" + fi + export COVERAGE_FILE=/workspace/FastDeploy/coveragedata/.coverage + export COVERAGE_RCFILE=/workspace/FastDeploy/scripts/.coveragerc + TEST_EXIT_CODE=0 + bash scripts/coverage_run.sh || TEST_EXIT_CODE=8 + git diff origin/${BASE_REF}..HEAD --unified=0 > diff.txt + echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> exit_code.env + coverage combine coveragedata/ + coverage xml -o python_coverage_all.xml + COVERAGE_EXIT_CODE=0 + diff-cover python_coverage_all.xml --diff-file=diff.txt --fail-under=80 --json-report diff_coverage.json || COVERAGE_EXIT_CODE=9 + echo "COVERAGE_EXIT_CODE=${COVERAGE_EXIT_CODE}" >> exit_code.env + python scripts/generate_diff_coverage_xml.py diff.txt python_coverage_all.xml + ' + if [ -f FastDeploy/exit_code.env ]; then + cat FastDeploy/exit_code.env >> $GITHUB_ENV + fi + - name: Upload unit resule and diff coverage to bos id: cov_upload shell: bash diff --git a/scripts/coverage_run.sh b/scripts/coverage_run.sh index 6b6cbbf850..443f2e1c37 100644 --- a/scripts/coverage_run.sh +++ b/scripts/coverage_run.sh @@ -66,11 +66,25 @@ for dir in "${dirs[@]}"; do echo "Skipping disabled test: $test_file" continue fi - - python -m coverage run "$test_file" + # TODO: Add a framework to manage unit test execution time + timeout 600 python -m coverage run "$test_file" if [ $? 
-ne 0 ]; then echo "$test_file" >> "$failed_tests_file" fail=$((fail + 1)) + + PORTS=($FLASK_PORT $FD_API_PORT $FD_ENGINE_QUEUE_PORT $FD_METRICS_PORT) + echo "==== PORT CLEAN AFTER UT FAILED ====" + + for port in "${PORTS[@]}"; do + PIDS=$(lsof -t -i :$port) + if [ -n "$PIDS" ]; then + echo "Port $port is occupied by PID(s): $PIDS" + echo "$PIDS" | xargs -r kill -9 + echo "Port $port cleared" + else + echo "Port $port is free" + fi + done else success=$((success + 1)) fi From 03347626a6a82d1b1fb8271488e0a39180409f7c Mon Sep 17 00:00:00 2001 From: ltd0924 <32387785+ltd0924@users.noreply.github.com> Date: Thu, 14 Aug 2025 17:01:25 +0800 Subject: [PATCH 08/13] [BugFix] fix control signal release failed (#3374) * [BugFix] * [BugFix] * [BugFix] * [BugFix] * fix * fix --------- Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com> Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> --- fastdeploy/entrypoints/openai/api_server.py | 10 ++-- fastdeploy/entrypoints/openai/serving_chat.py | 58 +++++++++---------- .../entrypoints/openai/serving_completion.py | 15 +++-- fastdeploy/inter_communicator/zmq_client.py | 12 +++- 4 files changed, 51 insertions(+), 44 deletions(-) diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py index e7064a38c6..2f501b2ef8 100644 --- a/fastdeploy/entrypoints/openai/api_server.py +++ b/fastdeploy/entrypoints/openai/api_server.py @@ -165,9 +165,9 @@ async def connection_manager(): yield except asyncio.TimeoutError: api_server_logger.info(f"Reach max request release: {connection_semaphore.status()}") - if connection_semaphore.locked(): - connection_semaphore.release() - raise HTTPException(status_code=429, detail="Too many requests") + raise HTTPException( + status_code=429, detail=f"Too many requests, current max concurrency is {args.max_concurrency}" + ) def wrap_streaming_generator(original_generator: AsyncGenerator): @@ -180,7 +180,7 @@ async def wrapped_generator(): async for chunk in original_generator: yield chunk finally: - api_server_logger.debug(f"release: {connection_semaphore.status()}") + api_server_logger.debug(f"current concurrency status: {connection_semaphore.status()}") connection_semaphore.release() return wrapped_generator @@ -255,9 +255,11 @@ async def create_chat_completion(request: ChatCompletionRequest): generator = await app.state.chat_handler.create_chat_completion(request) if isinstance(generator, ErrorResponse): connection_semaphore.release() + api_server_logger.debug(f"current concurrency status: {connection_semaphore.status()}") return JSONResponse(content={"detail": generator.model_dump()}, status_code=generator.code) elif isinstance(generator, ChatCompletionResponse): connection_semaphore.release() + api_server_logger.debug(f"current concurrency status: {connection_semaphore.status()}") return JSONResponse(content=generator.model_dump()) else: wrapped_generator = wrap_streaming_generator(generator) diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index 0510a07845..536cd7d807 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -78,45 +78,45 @@ async def create_chat_completion(self, request: ChatCompletionRequest): api_server_logger.error(err_msg) return ErrorResponse(message=err_msg, code=400) - if request.user is not None: - request_id = f"chatcmpl-{request.user}-{uuid.uuid4()}" - else: - request_id = f"chatcmpl-{uuid.uuid4()}" - 
api_server_logger.info(f"create chat completion request: {request_id}") - text_after_process = None - try: - current_req_dict = request.to_dict_for_infer(request_id) - current_req_dict["arrival_time"] = time.time() - prompt_token_ids = self.engine_client.format_and_add_data(current_req_dict) - text_after_process = current_req_dict.get("text_after_process") - if isinstance(prompt_token_ids, np.ndarray): - prompt_token_ids = prompt_token_ids.tolist() - except Exception as e: - return ErrorResponse(code=400, message=str(e)) - - del current_req_dict - try: - api_server_logger.debug(f"{self.engine_client.semaphore.status()}") if self.max_waiting_time < 0: await self.engine_client.semaphore.acquire() else: await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time) - except Exception: - return ErrorResponse(code=408, message=f"Request queued time exceed {self.max_waiting_time}") + api_server_logger.debug(f"current waiting request {self.engine_client.semaphore.status()}") - if request.stream: - return self.chat_completion_stream_generator( - request, request_id, request.model, prompt_token_ids, text_after_process - ) - else: + if request.user is not None: + request_id = f"chatcmpl-{request.user}-{uuid.uuid4()}" + else: + request_id = f"chatcmpl-{uuid.uuid4()}" + api_server_logger.info(f"create chat completion request: {request_id}") + text_after_process = None try: - return await self.chat_completion_full_generator( - request, request_id, request.model, prompt_token_ids, text_after_process - ) + current_req_dict = request.to_dict_for_infer(request_id) + current_req_dict["arrival_time"] = time.time() + prompt_token_ids = self.engine_client.format_and_add_data(current_req_dict) + text_after_process = current_req_dict.get("text_after_process") + if isinstance(prompt_token_ids, np.ndarray): + prompt_token_ids = prompt_token_ids.tolist() except Exception as e: return ErrorResponse(code=400, message=str(e)) + del current_req_dict + + if request.stream: + return self.chat_completion_stream_generator( + request, request_id, request.model, prompt_token_ids, text_after_process + ) + else: + try: + return await self.chat_completion_full_generator( + request, request_id, request.model, prompt_token_ids, text_after_process + ) + except Exception as e: + return ErrorResponse(code=400, message=str(e)) + except Exception: + return ErrorResponse(code=408, message=f"Request queued time exceed {self.max_waiting_time}") + def _create_streaming_error_response(self, message: str) -> str: error_response = ErrorResponse( code=400, diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index a6a799ac7f..cec597f78a 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -101,6 +101,13 @@ async def create_completion(self, request: CompletionRequest): api_server_logger.info(f"start inference for request {num_choices}") prompt_batched_token_ids = [] text_after_process_list = [] + try: + if self.max_waiting_time < 0: + await self.engine_client.semaphore.acquire() + else: + await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time) + except Exception: + return ErrorResponse(code=408, message=f"Request queued time exceed {self.max_waiting_time}") try: for idx, prompt in enumerate(request_prompts): request_id_idx = f"{request_id}-{idx}" @@ -117,14 +124,6 @@ async def create_completion(self, request: CompletionRequest): del 
current_req_dict - try: - if self.max_waiting_time < 0: - await self.engine_client.semaphore.acquire() - else: - await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time) - except Exception: - return ErrorResponse(code=408, message=f"Request queued time exceed {self.max_waiting_time}") - if request.stream: return self.completion_stream_generator( request=request, diff --git a/fastdeploy/inter_communicator/zmq_client.py b/fastdeploy/inter_communicator/zmq_client.py index 05e55929dd..5a9b6418db 100644 --- a/fastdeploy/inter_communicator/zmq_client.py +++ b/fastdeploy/inter_communicator/zmq_client.py @@ -67,6 +67,7 @@ def create_router(self): """ self.router = self.context.socket(zmq.ROUTER) self.router.setsockopt(zmq.SNDHWM, self.ZMQ_SNDHWM) + self.router.setsockopt(zmq.ROUTER_MANDATORY, 1) self.router.setsockopt(zmq.SNDTIMEO, -1) self.router.bind(f"ipc://{self.router_path}") @@ -111,7 +112,6 @@ def send_multipart(self, req_id, data): """ if self.router is None: raise RuntimeError("Router socket not created. Call create_router() first.") - while self.running: with self.mutex: if req_id not in self.req_dict: @@ -124,7 +124,11 @@ def send_multipart(self, req_id, data): continue else: break - + if self.req_dict[req_id] == -1: + if data[-1].finished: + with self.mutex: + self.req_dict.pop(req_id, None) + return try: start_send = time.time() if self.aggregate_send: @@ -133,7 +137,9 @@ def send_multipart(self, req_id, data): result = msgpack.packb([response.to_dict() for response in data]) self.router.send_multipart([self.req_dict[req_id], b"", result]) llm_logger.debug(f"send_multipart result: {req_id} len {len(data)} elapse: {time.time()-start_send}") - + except zmq.ZMQError as e: + llm_logger.error(f"[{req_id}] zmq error: {e}") + self.req_dict[req_id] = -1 except Exception as e: llm_logger.error(f"Send result to zmq client failed: {e}") From 28918702c24549abaa2e0671766314228ec709ab Mon Sep 17 00:00:00 2001 From: Jiang-Jia-Jun Date: Thu, 14 Aug 2025 17:20:29 +0800 Subject: [PATCH 09/13] Revert "Merge branch 'feature/online/vs_think_20250813' into release/2.1" This reverts commit 02596fc53755c8325e63b630439e5558b457e667, reversing changes made to 03347626a6a82d1b1fb8271488e0a39180409f7c. 
--- fastdeploy/engine/args_utils.py | 27 +- fastdeploy/engine/config.py | 8 +- fastdeploy/engine/engine.py | 1 - fastdeploy/engine/request.py | 2 - fastdeploy/entrypoints/chat_utils.py | 5 - fastdeploy/entrypoints/engine_client.py | 2 - fastdeploy/entrypoints/llm.py | 6 +- fastdeploy/entrypoints/openai/api_server.py | 5 +- fastdeploy/entrypoints/openai/protocol.py | 13 +- fastdeploy/entrypoints/openai/serving_chat.py | 36 +- .../entrypoints/openai/serving_completion.py | 36 +- .../openai/tool_parsers/__init__.py | 24 -- .../tool_parsers/abstract_tool_parser.py | 159 --------- .../tool_parsers/ernie_x1_tool_parser.py | 320 ------------------ .../entrypoints/openai/tool_parsers/utils.py | 137 -------- fastdeploy/input/ernie_processor.py | 40 +-- fastdeploy/input/ernie_vl_processor.py | 3 - fastdeploy/input/preprocess.py | 9 - fastdeploy/input/text_processor.py | 32 +- fastdeploy/reasoning/__init__.py | 2 - .../reasoning/ernie_x1_reasoning_parsers.py | 208 ------------ fastdeploy/utils.py | 17 - requirements.txt | 1 - 23 files changed, 39 insertions(+), 1054 deletions(-) delete mode 100644 fastdeploy/entrypoints/openai/tool_parsers/__init__.py delete mode 100644 fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py delete mode 100644 fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py delete mode 100644 fastdeploy/entrypoints/openai/tool_parsers/utils.py delete mode 100644 fastdeploy/reasoning/ernie_x1_reasoning_parsers.py diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index 834c8096b9..c254aaa1a2 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -15,10 +15,10 @@ """ import json -import os from dataclasses import asdict, dataclass from dataclasses import fields as dataclass_fields from typing import Any, Dict, List, Optional +import os from fastdeploy.config import ( CacheConfig, @@ -93,14 +93,6 @@ class EngineArgs: """ specifies the reasoning parser to use for extracting reasoning content from the model output """ - tool_call_parser: str = None - """ - specifies the tool call parser to use for extracting tool call from the model output - """ - tool_parser_plugin: str = None - """ - tool parser plugin used to register user defined tool parsers - """ enable_mm: bool = False """ Flags to enable multi-modal model @@ -429,18 +421,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help="Flag specifies the reasoning parser to use for extracting " "reasoning content from the model output", ) - model_group.add_argument( - "--tool-call-parser", - type=str, - default=EngineArgs.tool_call_parser, - help="Flag specifies the tool call parser to use for extracting" "tool call from the model output", - ) - model_group.add_argument( - "--tool-parser-plugin", - type=str, - default=EngineArgs.tool_parser_plugin, - help="tool parser plugin used to register user defined tool parsers", - ) model_group.add_argument( "--speculative-config", type=json.loads, @@ -886,10 +866,10 @@ def create_engine_config(self) -> Config: if self.enable_chunked_prefill: self.max_num_batched_tokens = 2048 else: - if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")): + if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')): self.max_num_batched_tokens = self.max_model_len else: - self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM + self.max_num_batched_tokens = 8192 all_dict = asdict(self) all_dict["model_cfg"] = model_cfg @@ -928,7 +908,6 @@ def create_engine_config(self) 
-> Config: mm_processor_kwargs=self.mm_processor_kwargs, enable_mm=self.enable_mm, reasoning_parser=self.reasoning_parser, - tool_parser=self.tool_call_parser, splitwise_role=self.splitwise_role, innode_prefill_ports=self.innode_prefill_ports, max_num_partial_prefills=self.max_num_partial_prefills, diff --git a/fastdeploy/engine/config.py b/fastdeploy/engine/config.py index 31f7c5c708..fb57884bf3 100644 --- a/fastdeploy/engine/config.py +++ b/fastdeploy/engine/config.py @@ -85,7 +85,6 @@ def __init__( max_long_partial_prefills: int = 1, long_prefill_token_threshold: int = 0, reasoning_parser: str = None, - tool_parser: str = None, guided_decoding_backend: Optional[str] = None, disable_any_whitespace: bool = False, enable_logprob: bool = False, @@ -166,7 +165,6 @@ def __init__( self.max_long_partial_prefills = max_long_partial_prefills self.long_prefill_token_threshold = long_prefill_token_threshold self.reasoning_parser = reasoning_parser - self.tool_parser = tool_parser self.graph_optimization_config = graph_optimization_config self.early_stop_config = early_stop_config self.guided_decoding_backend = guided_decoding_backend @@ -238,10 +236,10 @@ def postprocess(self): if self.cache_config.enable_chunked_prefill: self.max_num_batched_tokens = 2048 else: - if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")): + if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')): self.max_num_batched_tokens = self.max_model_len else: - self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM + self.max_num_batched_tokens = 8192 if self.long_prefill_token_threshold == 0: self.long_prefill_token_threshold = int(self.max_model_len * 0.04) @@ -289,7 +287,7 @@ def check(self): ) if not self.cache_config.enable_chunked_prefill: - if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")): + if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')): assert self.max_num_batched_tokens >= self.max_model_len, ( f"max_num_batched_tokens: {self.max_num_batched_tokens} " f"should be larger than or equal to max_model_len: {self.max_model_len}" diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index 16a89932d3..fa9fa61750 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -106,7 +106,6 @@ def __init__(self, cfg): cfg.limit_mm_per_prompt, cfg.mm_processor_kwargs, cfg.enable_mm, - cfg.tool_parser, ) self.start_queue_service() diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py index b9fa895e6a..acf717547a 100644 --- a/fastdeploy/engine/request.py +++ b/fastdeploy/engine/request.py @@ -24,7 +24,6 @@ import numpy as np from fastdeploy.engine.sampling_params import SamplingParams -from fastdeploy.entrypoints.openai.protocol import ToolCall from fastdeploy.utils import data_processor_logger from fastdeploy.worker.output import LogprobsLists, SampleLogprobs @@ -250,7 +249,6 @@ class CompletionOutput: draft_token_ids: list[int] = None text: Optional[str] = None reasoning_content: Optional[str] = None - tool_calls: Optional[ToolCall] = None def to_dict(self): """ diff --git a/fastdeploy/entrypoints/chat_utils.py b/fastdeploy/entrypoints/chat_utils.py index 059ecee01e..4f7357e11f 100644 --- a/fastdeploy/entrypoints/chat_utils.py +++ b/fastdeploy/entrypoints/chat_utils.py @@ -14,7 +14,6 @@ # limitations under the License. 
""" -import uuid from copy import deepcopy from typing import List, Literal, Union from urllib.parse import urlparse @@ -157,7 +156,3 @@ def parse_chat_messages(messages): conversation.append({"role": role, "content": parsed_content}) return conversation - - -def random_tool_call_id() -> str: - return f"chatcmpl-tool-{str(uuid.uuid4().hex)}" diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py index e7edacb26e..12d14f7e1c 100644 --- a/fastdeploy/entrypoints/engine_client.py +++ b/fastdeploy/entrypoints/engine_client.py @@ -45,7 +45,6 @@ def __init__( data_parallel_size=1, enable_logprob=False, workers=1, - tool_parser=None, ): input_processor = InputPreprocessor( tokenizer, @@ -53,7 +52,6 @@ def __init__( limit_mm_per_prompt, mm_processor_kwargs, enable_mm, - tool_parser, ) self.enable_logprob = enable_logprob self.enable_mm = enable_mm diff --git a/fastdeploy/entrypoints/llm.py b/fastdeploy/entrypoints/llm.py index 8291c974a1..3e150abf2d 100644 --- a/fastdeploy/entrypoints/llm.py +++ b/fastdeploy/entrypoints/llm.py @@ -28,7 +28,8 @@ from fastdeploy.engine.args_utils import EngineArgs from fastdeploy.engine.engine import LLMEngine from fastdeploy.engine.sampling_params import SamplingParams -from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager + +# from fastdeploy.entrypoints.chat_utils import ChatCompletionMessageParam from fastdeploy.utils import llm_logger, retrive_model_from_server from fastdeploy.worker.output import Logprob, LogprobsLists @@ -72,9 +73,6 @@ def __init__( **kwargs, ): model = retrive_model_from_server(model, revision) - tool_parser_plugin = kwargs.get("tool_parser_plugin") - if tool_parser_plugin: - ToolParserManager.import_tool_parser(tool_parser_plugin) engine_args = EngineArgs( model=model, tokenizer=tokenizer, diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py index 53168abc03..2f501b2ef8 100644 --- a/fastdeploy/entrypoints/openai/api_server.py +++ b/fastdeploy/entrypoints/openai/api_server.py @@ -41,7 +41,6 @@ ) from fastdeploy.entrypoints.openai.serving_chat import OpenAIServingChat from fastdeploy.entrypoints.openai.serving_completion import OpenAIServingCompletion -from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager from fastdeploy.metrics.metrics import ( EXCLUDE_LABELS, cleanup_prometheus_files, @@ -74,8 +73,7 @@ parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() args.model = retrive_model_from_server(args.model, args.revision) -if args.tool_parser_plugin: - ToolParserManager.import_tool_parser(args.tool_parser_plugin) + llm_engine = None @@ -128,7 +126,6 @@ async def lifespan(app: FastAPI): args.data_parallel_size, args.enable_logprob, args.workers, - args.tool_call_parser, ) app.state.dynamic_load_weight = args.dynamic_load_weight chat_handler = OpenAIServingChat(engine_client, pid, args.ips, args.max_waiting_time) diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py index 2049fb971f..678ae8dd06 100644 --- a/fastdeploy/entrypoints/openai/protocol.py +++ b/fastdeploy/entrypoints/openai/protocol.py @@ -72,6 +72,7 @@ class ToolCall(BaseModel): id: str = None type: Literal["function"] = "function" function: FunctionCall + index: int class DeltaFunctionCall(BaseModel): @@ -95,18 +96,6 @@ class DeltaToolCall(BaseModel): function: Optional[DeltaFunctionCall] = None -class ExtractedToolCallInformation(BaseModel): - # indicate if tools were called - tools_called: bool - - 
# extracted tool calls - tool_calls: Optional[list[ToolCall]] = None - - # content - per OpenAI spec, content AND tool calls can be returned rarely - # But some models will do this intentionally - content: Optional[str] = None - - class FunctionDefinition(BaseModel): """ Function definition. diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index 57b5945e3a..536cd7d807 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -141,7 +141,6 @@ async def chat_completion_stream_generator( previous_num_tokens = 0 num_prompt_tokens = 0 num_choices = 1 - tool_called = False max_streaming_response_tokens = ( request.max_streaming_response_tokens if request.max_streaming_response_tokens is not None @@ -246,28 +245,20 @@ async def chat_completion_stream_generator( output = res["outputs"] delta_text = output["text"] output_top_logprobs = output["top_logprobs"] - previous_num_tokens += len(output["token_ids"]) logprobs_res: Optional[LogProbs] = None if request.logprobs and output_top_logprobs is not None: logprobs_res = self._create_chat_logprobs( output_top_logprobs, request.logprobs, request.top_logprobs ) - if self.engine_client.data_processor.tool_parser_obj and not res["finished"]: - tool_delta_message = output["tool_delta_message"] - if tool_delta_message is None: - continue - delta_message = tool_delta_message - delta_message.reasoning_content = output.get("reasoning_content") - if delta_message.tool_calls: - tool_called = True - else: - delta_message = DeltaMessage( - content=delta_text, - reasoning_content=output.get("reasoning_content"), - prompt_token_ids=None, - completion_token_ids=None, - tool_calls=None, - ) + + previous_num_tokens += len(output["token_ids"]) + delta_message = DeltaMessage( + content=delta_text, + reasoning_content=output.get("reasoning_content"), + prompt_token_ids=None, + completion_token_ids=None, + tool_calls=output.get("tool_call_content", []), + ) choice = ChatCompletionResponseStreamChoice( index=0, @@ -285,7 +276,10 @@ async def chat_completion_stream_generator( max_tokens = request.max_completion_tokens or request.max_tokens if has_no_token_limit or previous_num_tokens != max_tokens: choice.finish_reason = "stop" - if tool_called: + if ( + self.engine_client.reasoning_parser == "ernie_x1" + and output.get("finish_reason", "") == "tool_calls" + ): choice.finish_reason = "tool_calls" else: choice.finish_reason = "length" @@ -425,7 +419,7 @@ async def chat_completion_full_generator( role="assistant", content=output["text"], reasoning_content=output.get("reasoning_content"), - tool_calls=output.get("tool_call"), + tool_calls=output.get("tool_call_content"), prompt_token_ids=prompt_token_ids if request.return_token_ids else None, completion_token_ids=completion_token_ids if request.return_token_ids else None, text_after_process=text_after_process if request.return_token_ids else None, @@ -445,7 +439,7 @@ async def chat_completion_full_generator( max_tokens = request.max_completion_tokens or request.max_tokens if has_no_token_limit or previous_num_tokens != max_tokens: choice.finish_reason = "stop" - if output.get("tool_call"): + if self.engine_client.reasoning_parser == "ernie_x1" and output.get("finish_reason", "") == "tool_calls": choice.finish_reason = "tool_calls" else: choice.finish_reason = "length" diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index 1e8ad0f86e..cec597f78a 
100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -240,9 +240,9 @@ async def completion_full_generator( dealer.close() self.engine_client.semaphore.release() - def calc_finish_reason(self, max_tokens, token_num, output, tool_called): + def calc_finish_reason(self, max_tokens, token_num, output): if max_tokens is None or token_num != max_tokens: - if tool_called or output.get("tool_call"): + if self.engine_client.reasoning_parser == "ernie_x1" and output.get("finish_reason", "") == "tool_calls": return "tool_calls" else: return "stop" @@ -271,7 +271,6 @@ async def completion_stream_generator( output_tokens = [0] * num_choices inference_start_time = [0] * num_choices first_iteration = [True] * num_choices - tool_called = False max_streaming_response_tokens = ( request.max_streaming_response_tokens if request.max_streaming_response_tokens is not None @@ -343,41 +342,24 @@ async def completion_stream_generator( if request.logprobs and output_top_logprobs is not None: logprobs_res = self._create_completion_logprobs(output_top_logprobs, request.logprobs, 0) - output_tokens[idx] += 1 - if self.engine_client.data_processor.tool_parser_obj and not res["finished"]: - tool_delta_message = output["tool_delta_message"] - if tool_delta_message is None: - continue - delta_message = CompletionResponseStreamChoice( - index=idx, - text=output["text"], - completion_token_ids=output.get("token_ids") if request.return_token_ids else None, - tool_calls=tool_delta_message.tool_calls, - reasoning_content=output.get("reasoning_content"), - arrival_time=arrival_time, - logprobs=logprobs_res, - ) - if tool_delta_message.tool_calls: - tool_called = True - else: - delta_message = CompletionResponseStreamChoice( + choices.append( + CompletionResponseStreamChoice( index=idx, text=output["text"], prompt_token_ids=None, completion_token_ids=output.get("token_ids") if request.return_token_ids else None, - tool_calls=None, raw_prediction=output.get("raw_prediction") if request.return_token_ids else None, + tool_calls=output.get("tool_call_content"), reasoning_content=output.get("reasoning_content"), arrival_time=arrival_time, logprobs=logprobs_res, ) - - choices.append(delta_message) + ) output_tokens[idx] += 1 if res["finished"]: choices[-1].finish_reason = self.calc_finish_reason( - request.max_tokens, output_tokens[idx], output, tool_called + request.max_tokens, output_tokens[idx], output ) send_idx = output.get("send_idx") # 只有当 send_idx 明确为 0 时才记录日志 @@ -476,7 +458,7 @@ def request_output_to_completion_response( token_ids = output["token_ids"] output_text = output["text"] - finish_reason = self.calc_finish_reason(request.max_tokens, final_res["output_token_ids"], output, False) + finish_reason = self.calc_finish_reason(request.max_tokens, final_res["output_token_ids"], output) choice_data = CompletionResponseChoice( token_ids=token_ids, @@ -487,7 +469,7 @@ def request_output_to_completion_response( raw_prediction=output.get("raw_prediction") if request.return_token_ids else None, text_after_process=text_after_process_list[idx] if request.return_token_ids else None, reasoning_content=output.get("reasoning_content"), - tool_calls=output.get("tool_call"), + tool_calls=output.get("tool_call_content"), logprobs=aggregated_logprobs, finish_reason=finish_reason, ) diff --git a/fastdeploy/entrypoints/openai/tool_parsers/__init__.py b/fastdeploy/entrypoints/openai/tool_parsers/__init__.py deleted file mode 100644 index 2078a8c9fe..0000000000 --- 
a/fastdeploy/entrypoints/openai/tool_parsers/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -from .abstract_tool_parser import ToolParser, ToolParserManager -from .ernie_x1_tool_parser import ErnieX1ToolParser - -__all__ = [ - "ToolParser", - "ToolParserManager", - "ErnieX1ToolParser", -] diff --git a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py deleted file mode 100644 index d6ac8f81aa..0000000000 --- a/fastdeploy/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ /dev/null @@ -1,159 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import os -from collections.abc import Sequence -from functools import cached_property -from typing import Callable, Optional, Union - -from fastdeploy.entrypoints.openai.protocol import ( - ChatCompletionRequest, - DeltaMessage, - ExtractedToolCallInformation, -) -from fastdeploy.utils import data_processor_logger, import_from_path, is_list_of - - -class ToolParser: - """ - Abstract ToolParser class that should not be used directly. Provided - properties and methods should be used in - derived classes. - """ - - def __init__(self, tokenizer): - self.prev_tool_call_arr: list[dict] = [] - # the index of the tool call that is currently being parsed - self.current_tool_id: int = -1 - self.current_tool_name_sent: bool = False - self.streamed_args_for_tool: list[str] = [] - - self.model_tokenizer = tokenizer - - @cached_property - def vocab(self) -> dict[str, int]: - # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab - # whereas all tokenizers have .get_vocab() - return self.model_tokenizer.get_vocab() - - def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: - """ - Static method that used to adjust the request parameters. - """ - return request - - def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) -> ExtractedToolCallInformation: - """ - Static method that should be implemented for extracting tool calls from - a complete model-generated string. - Used for non-streaming responses where we have the entire model response - available before sending to the client. - Static because it's stateless. 
- """ - raise NotImplementedError("AbstractToolParser.extract_tool_calls has not been implemented!") - - def extract_tool_calls_streaming( - self, - previous_text: str, - current_text: str, - delta_text: str, - previous_token_ids: Sequence[int], - current_token_ids: Sequence[int], - delta_token_ids: Sequence[int], - request: ChatCompletionRequest, - ) -> Union[DeltaMessage, None]: - """ - Instance method that should be implemented for extracting tool calls - from an incomplete response; for use when handling tool calls and - streaming. Has to be an instance method because it requires state - - the current tokens/diffs, but also the information about what has - previously been parsed and extracted (see constructor) - """ - raise NotImplementedError("AbstractToolParser.extract_tool_calls_streaming has not been " "implemented!") - - -class ToolParserManager: - tool_parsers: dict[str, type] = {} - - @classmethod - def get_tool_parser(cls, name) -> type: - """ - Get tool parser by name which is registered by `register_module`. - - Raise a KeyError exception if the name is not registered. - """ - if name in cls.tool_parsers: - return cls.tool_parsers[name] - - raise KeyError(f"tool helper: '{name}' not found in tool_parsers") - - @classmethod - def _register_module( - cls, module: type, module_name: Optional[Union[str, list[str]]] = None, force: bool = True - ) -> None: - if not issubclass(module, ToolParser): - raise TypeError(f"module must be subclass of ToolParser, but got {type(module)}") - if module_name is None: - module_name = module.__name__ - if isinstance(module_name, str): - module_name = [module_name] - for name in module_name: - if not force and name in cls.tool_parsers: - existed_module = cls.tool_parsers[name] - raise KeyError(f"{name} is already registered " f"at {existed_module.__module__}") - cls.tool_parsers[name] = module - - @classmethod - def register_module( - cls, name: Optional[Union[str, list[str]]] = None, force: bool = True, module: Union[type, None] = None - ) -> Union[type, Callable]: - """ - Register module with the given name or name list. it can be used as a - decoder(with module as None) or normal function(with module as not - None). - """ - if not isinstance(force, bool): - raise TypeError(f"force must be a boolean, but got {type(force)}") - - # raise the error ahead of time - if not (name is None or isinstance(name, str) or is_list_of(name, str)): - raise TypeError("name must be None, an instance of str, or a sequence of str, " f"but got {type(name)}") - - # use it as a normal method: x.register_module(module=SomeClass) - if module is not None: - cls._register_module(module=module, module_name=name, force=force) - return module - - # use it as a decorator: @x.register_module() - def _register(module): - cls._register_module(module=module, module_name=name, force=force) - return module - - return _register - - @classmethod - def import_tool_parser(cls, plugin_path: str) -> None: - """ - Import a user-defined tool parser by the path of the tool parser define - file. 
- """ - module_name = os.path.splitext(os.path.basename(plugin_path))[0] - - try: - import_from_path(module_name, plugin_path) - except Exception: - data_processor_logger.exception("Failed to load module '%s' from %s.", module_name, plugin_path) - return diff --git a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py b/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py deleted file mode 100644 index cec1f68401..0000000000 --- a/fastdeploy/entrypoints/openai/tool_parsers/ernie_x1_tool_parser.py +++ /dev/null @@ -1,320 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import re -import uuid -from collections.abc import Sequence -from typing import Union - -import partial_json_parser - - -def random_tool_call_id() -> str: - """Generate a random tool call ID""" - return f"chatcmpl-tool-{str(uuid.uuid4().hex)}" - - -from fastdeploy.entrypoints.openai.protocol import ( - ChatCompletionRequest, - DeltaFunctionCall, - DeltaMessage, - DeltaToolCall, - ExtractedToolCallInformation, - FunctionCall, - ToolCall, -) -from fastdeploy.entrypoints.openai.tool_parsers.abstract_tool_parser import ( - ToolParser, - ToolParserManager, -) -from fastdeploy.utils import data_processor_logger - - -@ToolParserManager.register_module("ernie_x1") -class ErnieX1ToolParser(ToolParser): - """ - Tool parser for Ernie model version 4.5.1. - This parser handles tool calls with newline formats. - """ - - def __init__(self, tokenizer): - super().__init__(tokenizer) - - self.prev_tool_call_arr: list[dict] = [] - self.current_tool_id: int = -1 - self.current_tool_name_sent: bool = False - self.streamed_args_for_tool: list[str] = [] # map what has been streamed for each tool so far to a list - self.buffer: str = "" # buffer for accumulating unprocessed streaming content - - if not self.model_tokenizer: - raise ValueError( - "The model tokenizer must be passed to the ToolCallParser constructor during construction." - ) - - def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) -> ExtractedToolCallInformation: - """ - Extract the tool calls from a complete model response. - Supports XML-style formats with newlines: - - XML format: \n...\n\n\n\n\n{...}\n\n... - - Handles boundary cases: - 1. Only name and partial arguments: {"name": "get_weather", "arguments": {"location": "北京" - 2. Only partial name: {"name": "get_we - 3. 
Only name and arguments field without content: {"name": "get_weather", "argume - """ - - try: - tool_calls = [] - - # Check for invalid tags before tool calls - if re.search(r"[\s\S]*?\s*(?=)", model_output): - data_processor_logger.error("Invalid format: tags found before ") - return ExtractedToolCallInformation(tools_called=False, content=model_output) - - function_call_arr = [] - remaining_text = model_output - - while True: - # 查找下一个tool_call块 - tool_call_pos = remaining_text.find("") - if tool_call_pos == -1: - break - - # 提取tool_call开始位置后的内容 - tool_content_start = tool_call_pos + len("") - tool_content_end = remaining_text.find("", tool_content_start) - - tool_json = "" - if tool_content_end == -1: - # 处理未闭合的tool_call块(截断情况) - tool_json = remaining_text[tool_content_start:].strip() - remaining_text = "" # 没有更多内容需要处理 - else: - # 处理完整的tool_call块 - tool_json = remaining_text[tool_content_start:tool_content_end].strip() - remaining_text = remaining_text[tool_content_end + len("") :] - - if not tool_json: - continue - - # 处理JSON内容 - tool_json = tool_json.strip() - if not tool_json.startswith("{"): - tool_json = "{" + tool_json - if not tool_json.endswith("}"): - tool_json = tool_json + "}" - - try: - # 首先尝试标准JSON解析 - try: - tool_data = json.loads(tool_json) - - if isinstance(tool_data, dict) and "name" in tool_data and "arguments" in tool_data: - function_call_arr.append( - { - "name": tool_data["name"], - "arguments": tool_data["arguments"], - "_is_complete": True, # 明确标记为完整解析 - } - ) - continue - except json.JSONDecodeError: - pass - - # 标准解析失败时尝试partial_json_parser - from partial_json_parser.core.options import Allow - - try: - tool_data = {} - flags = Allow.ALL & ~Allow.STR - - # 解析name字段 - name_match = re.search(r'"name"\s*:\s*"([^"]*)"', tool_json) - if name_match: - tool_data["name"] = name_match.group(1) - - # 解析arguments字段 - args_match = re.search(r'"arguments"\s*:\s*(\{.*)', tool_json) - if args_match: - try: - tool_data["arguments"] = partial_json_parser.loads(args_match.group(1), flags=flags) - except: - tool_data["arguments"] = None - - if isinstance(tool_data, dict): - function_call_arr.append( - { - "name": tool_data.get("name", ""), - "arguments": tool_data.get("arguments", {}), - "_is_partial": True, # 标记为部分解析 - } - ) - except Exception as e: - data_processor_logger.debug(f"Failed to parse tool call: {str(e)}") - continue - except Exception as e: - data_processor_logger.debug(f"Failed to parse tool call: {str(e)}") - continue - - if not function_call_arr: - data_processor_logger.error("No valid tool calls found") - return ExtractedToolCallInformation(tools_called=False, content=model_output) - - tool_calls = [] - all_complete = True # 初始设为True,只要有一个不完整就变为False - - for tool_call in function_call_arr: - # 记录工具调用解析状态 - is_complete = tool_call.get("_is_complete", False) - is_partial = tool_call.get("_is_partial", False) - - # 只要有一个不完整就认为整体不完整 - if not is_complete or is_partial: - all_complete = False - - # 处理参数序列化 - tool_args = tool_call.get("arguments", {}) - if not isinstance(tool_args, dict): - tool_args = {} - - try: - args_str = json.dumps(tool_args, ensure_ascii=False) if tool_args else "{}" - except: - args_str = "{}" - - tool_calls.append( - ToolCall( - type="function", - id=random_tool_call_id(), - function=FunctionCall( - name=tool_call.get("name", ""), - arguments=args_str, - ), - ) - ) - - # 只有当所有工具调用都明确标记为complete时才返回tools_called=True - return ExtractedToolCallInformation( - tools_called=all_complete, tool_calls=tool_calls if tool_calls else None, content="" - ) 
- - except Exception as e: - data_processor_logger.error(f"Error in extracting tool call from response: {str(e)}") - return ExtractedToolCallInformation(tools_called=False, tool_calls=None, content=model_output) - - def extract_tool_calls_streaming( - self, - previous_text: str, - current_text: str, - delta_text: str, - previous_token_ids: Sequence[int], - current_token_ids: Sequence[int], - delta_token_ids: Sequence[int], - request: dict, - ) -> Union[DeltaMessage, None]: - # 忽略空chunk - if len(delta_text.strip()) == 0: - return None - - try: - delta = None - # 使用buffer累积delta_text内容 - self.buffer += delta_text - - # 处理增量中的新tool_call开始 - if "" in delta_text and "" not in previous_text: - self.current_tool_id = ( - max(self.current_tool_id, 0) if self.current_tool_id == -1 else self.current_tool_id + 1 - ) - self.current_tool_name_sent = False - if len(self.streamed_args_for_tool) <= self.current_tool_id: - self.streamed_args_for_tool.append("") - data_processor_logger.debug(f"New tool call started with ID: {self.current_tool_id}") - - # 增量解析逻辑 - - # 1. 尝试解析name字段 - if not self.current_tool_name_sent and '"name"' in self.buffer: - name_match = re.search(r'"name"\s*:\s*"([^"]*)"', self.buffer) - if name_match: - name = name_match.group(1) - if name: - delta = DeltaMessage( - tool_calls=[ - DeltaToolCall( - index=self.current_tool_id, - type="function", - id=random_tool_call_id(), - function=DeltaFunctionCall(name=name).model_dump(exclude_none=True), - ) - ] - ) - print("delta name:", delta) - # 删除已处理的name部分 - self.buffer = self.buffer[name_match.end() :] - self.current_tool_name_sent = True - return delta - # 2. 尝试解析arguments字段 - if '"arguments"' in self.buffer: - args_match = re.search(r'"arguments"\s*:\s*(\{.*)', self.buffer) - if args_match: - args_content = args_match.group(1) - # 处理多余的大括号 - open_braces = args_content.count("{") - close_braces = args_content.count("}") - if close_braces > open_braces: - args_content = args_content[: args_content.rfind("}")] - try: - # 增量解析arguments - parsed_args = json.loads(args_content) - if isinstance(parsed_args, dict): - args_json = json.dumps(parsed_args, ensure_ascii=False) - if len(args_json) > len(self.streamed_args_for_tool[self.current_tool_id]): - argument_diff = args_json[len(self.streamed_args_for_tool[self.current_tool_id]) :] - delta = DeltaMessage( - tool_calls=[ - DeltaToolCall( - index=self.current_tool_id, - function=DeltaFunctionCall(arguments=argument_diff).model_dump( - exclude_none=True - ), - ) - ] - ) - print("delta argument:", delta) - # 删除已处理部分 - processed_pos = args_match.start() + len('"arguments":') - self.buffer = ( - self.buffer[:processed_pos] + self.buffer[processed_pos + len(args_json) :] - ) - self.streamed_args_for_tool[self.current_tool_id] = args_json - return delta - except Exception as e: - data_processor_logger.debug(f"Partial arguments parsing: {str(e)}") - - if "" in self.buffer: - end_pos = self.buffer.find("") - self.buffer = self.buffer[end_pos + len("") :] - - # 完成当前工具调用处理 - self.current_tool_id += 1 - self.current_tool_name_sent = False - self.streamed_args_for_tool.append("") - - return delta - - except Exception as e: - data_processor_logger.error(f"Error in streaming tool call extraction: {str(e)}") - return None diff --git a/fastdeploy/entrypoints/openai/tool_parsers/utils.py b/fastdeploy/entrypoints/openai/tool_parsers/utils.py deleted file mode 100644 index b7dff3c588..0000000000 --- a/fastdeploy/entrypoints/openai/tool_parsers/utils.py +++ /dev/null @@ -1,137 +0,0 @@ -""" -# Copyright (c) 2025 
PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import json -from json import JSONDecodeError, JSONDecoder -from typing import Any - -import partial_json_parser -from partial_json_parser.core.options import Allow - - -def find_common_prefix(s1: str, s2: str) -> str: - """ - Finds a common prefix that is shared between two strings, if there is one. - Order of arguments is NOT important. - - This function is provided as a UTILITY for extracting information from JSON - generated by partial_json_parser, to help in ensuring that the right tokens - are returned in streaming, so that close-quotes, close-brackets and - close-braces are not returned prematurely. - - e.g. find_common_prefix('{"fruit": "ap"}', '{"fruit": "apple"}') -> - '{"fruit": "ap' - """ - prefix = "" - min_length = min(len(s1), len(s2)) - for i in range(0, min_length): - if s1[i] == s2[i]: - prefix += s1[i] - else: - break - return prefix - - -def find_common_suffix(s1: str, s2: str) -> str: - """ - Finds a common suffix shared between two strings, if there is one. Order of - arguments is NOT important. - Stops when the suffix ends OR it hits an alphanumeric character - - e.g. find_common_suffix('{"fruit": "ap"}', '{"fruit": "apple"}') -> '"}' - """ - suffix = "" - min_length = min(len(s1), len(s2)) - for i in range(1, min_length + 1): - if s1[-i] == s2[-i] and not s1[-i].isalnum(): - suffix = s1[-i] + suffix - else: - break - return suffix - - -def extract_intermediate_diff(curr: str, old: str) -> str: - """ - Given two strings, extract the difference in the middle between two strings - that are known to have a common prefix and/or suffix. - - This function is provided as a UTILITY for extracting information from JSON - generated by partial_json_parser, to help in ensuring that the right tokens - are returned in streaming, so that close-quotes, close-brackets and - close-braces are not returned prematurely. The order of arguments IS - important - the new version of the partially-parsed JSON must be the first - argument, and the secnod argument must be from the previous generation. - - What it returns, is tokens that should be streamed to the client. - - e.g. extract_intermediate_diff('{"fruit": "apple"}', '{"fruit": "ap"}') - -> 'ple' - - """ - suffix = find_common_suffix(curr, old) - - old = old[::-1].replace(suffix[::-1], "", 1)[::-1] - prefix = find_common_prefix(curr, old) - diff = curr - if len(suffix): - diff = diff[::-1].replace(suffix[::-1], "", 1)[::-1] - - if len(prefix): - # replace the prefix only once in case it's mirrored - diff = diff.replace(prefix, "", 1) - - return diff - - -def find_all_indices(string: str, substring: str) -> list[int]: - """ - Find all (starting) indices of a substring in a given string. 
Useful for - tool call extraction - """ - indices = [] - index = -1 - while True: - index = string.find(substring, index + 1) - if index == -1: - break - indices.append(index) - return indices - - -# partial_json_parser doesn't support extra data and -# JSONDecoder.raw_decode doesn't support partial JSON -def partial_json_loads(input_str: str, flags: Allow) -> tuple[Any, int]: - try: - return (partial_json_parser.loads(input_str, flags), len(input_str)) - except JSONDecodeError as e: - if "Extra data" in e.msg: - dec = JSONDecoder() - return dec.raw_decode(input_str) - raise - - -def is_complete_json(input_str: str) -> bool: - try: - json.loads(input_str) - return True - except JSONDecodeError: - return False - - -def consume_space(i: int, s: str) -> int: - while i < len(s) and s[i].isspace(): - i += 1 - return i diff --git a/fastdeploy/input/ernie_processor.py b/fastdeploy/input/ernie_processor.py index 7f55c26ce7..a268ad562e 100644 --- a/fastdeploy/input/ernie_processor.py +++ b/fastdeploy/input/ernie_processor.py @@ -43,7 +43,7 @@ class ErnieProcessor(BaseDataProcessor): pad_token_id (int): 存储填充符号的token ID。 """ - def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_obj=None): + def __init__(self, model_name_or_path, reasoning_parser_obj=None): self.model_name_or_path = model_name_or_path data_processor_logger.info(f"model_name_or_path: {model_name_or_path}") @@ -63,7 +63,6 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_ob self.reasoning_parser = None if reasoning_parser_obj: self.reasoning_parser = reasoning_parser_obj(self.tokenizer) - self.tool_parser_obj = tool_parser_obj def _init_config(self): self.use_hf_tokenizer = int(envs.FD_USE_HF_TOKENIZER) == 1 @@ -205,12 +204,6 @@ def process_response(self, response_dict, **kwargs): response_dict.outputs.reasoning_content = reasoning_content else: response_dict.outputs.text = full_text - if self.tool_parser_obj: - tool_parser = self.tool_parser_obj(self.tokenizer) - tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict) - if tool_call_info.tools_called: - response_dict.outputs.tool_calls = tool_call_info.tool_calls - response_dict.outputs.text = tool_call_info.content data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}") if response_dict.outputs.text == "" and response_dict.outputs.reasoning_content == "": return None @@ -251,20 +244,12 @@ def process_response_dict_normal(self, response_dict, **kwargs): delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id) if is_end: full_text = previous_texts + delta_text - if self.reasoning_parser and ( - enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser" - ): + if enable_thinking and self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) response_dict["outputs"]["text"] = text response_dict["outputs"]["reasoning_content"] = reasoning_content else: response_dict["outputs"]["text"] = full_text - if self.tool_parser_obj: - tool_parser = self.tool_parser_obj(self.tokenizer) - tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict) - if tool_call_info.tools_called: - response_dict["outputs"]["tool_call"] = tool_call_info.tool_calls - response_dict["outputs"]["text"] = tool_call_info.content response_dict["outputs"]["raw_prediction"] = full_text data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}") del self.decode_status[req_id] @@ -289,9 +274,7 
@@ def process_response_dict_streaming(self, response_dict, **kwargs): if token_ids[-1] == self.tokenizer.eos_token_id: token_ids = token_ids[:-1] delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id) - if self.reasoning_parser and ( - enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser" - ): + if enable_thinking and self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content_streaming( previous_texts, previous_texts + delta_text, @@ -304,25 +287,10 @@ def process_response_dict_streaming(self, response_dict, **kwargs): response_dict["outputs"]["reasoning_content"] = reasoning_content else: response_dict["outputs"]["text"] = delta_text - if self.tool_parser_obj: - if req_id not in self.tool_parsers: - self.tool_parsers[req_id] = self.tool_parser_obj(self.tokenizer) - tool_parser = self.tool_parsers[req_id] - tool_call = tool_parser.extract_tool_calls_streaming( - previous_texts, - previous_texts + delta_text, - delta_text, - previous_token_ids, - previous_token_ids + token_ids, - token_ids, - response_dict, - ) - response_dict["outputs"]["tool_delta_message"] = tool_call + response_dict["outputs"]["raw_prediction"] = delta_text if is_end: data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}") del self.decode_status[req_id] - if req_id in self.tool_parsers: - del self.tool_parsers[req_id] return response_dict def messages2ids(self, request_or_messages): diff --git a/fastdeploy/input/ernie_vl_processor.py b/fastdeploy/input/ernie_vl_processor.py index 21a96e92b8..d2975c6971 100644 --- a/fastdeploy/input/ernie_vl_processor.py +++ b/fastdeploy/input/ernie_vl_processor.py @@ -34,7 +34,6 @@ def __init__( limit_mm_per_prompt=None, mm_processor_kwargs=None, reasoning_parser_obj=None, - tool_parser_obj=None, ): self.use_hf_tokenizer = False @@ -54,7 +53,6 @@ def __init__( self.image_patch_id = self.ernie_processor.image_patch_id self.spatial_conv_size = self.ernie_processor.spatial_conv_size - self.tool_parsers = dict() self.decode_status = dict() self._load_tokenizer() self.eos_token_ids = [self.tokenizer.eos_token_id] @@ -64,7 +62,6 @@ def __init__( self.reasoning_parser = None if reasoning_parser_obj: self.reasoning_parser = reasoning_parser_obj(self.tokenizer) - self.tool_parser_obj = tool_parser_obj # Generation config try: diff --git a/fastdeploy/input/preprocess.py b/fastdeploy/input/preprocess.py index 5c1e2e8027..8edd4eb4b7 100644 --- a/fastdeploy/input/preprocess.py +++ b/fastdeploy/input/preprocess.py @@ -18,7 +18,6 @@ from fastdeploy.config import ErnieArchitectures from fastdeploy.engine.config import ModelConfig -from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager from fastdeploy.reasoning import ReasoningParserManager @@ -49,7 +48,6 @@ def __init__( limit_mm_per_prompt: Optional[Dict[str, Any]] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, enable_mm: bool = False, - tool_parser: str = None, ) -> None: self.model_name_or_path = model_name_or_path @@ -57,7 +55,6 @@ def __init__( self.enable_mm = enable_mm self.limit_mm_per_prompt = limit_mm_per_prompt self.mm_processor_kwargs = mm_processor_kwargs - self.tool_parser = tool_parser def create_processor(self): """ @@ -71,11 +68,8 @@ def create_processor(self): DataProcessor or MultiModalRegistry.Processor (Union[DataProcessor, MultiModalRegistry.Processor]): 数据处理器。 """ reasoning_parser_obj = None - tool_parser_obj = None if self.reasoning_parser: reasoning_parser_obj 
= ReasoningParserManager.get_reasoning_parser(self.reasoning_parser) - if self.tool_parser: - tool_parser_obj = ToolParserManager.get_tool_parser(self.tool_parser) architectures = ModelConfig({"model": self.model_name_or_path}).architectures[0] if not self.enable_mm: if not ErnieArchitectures.contains_ernie_arch(architectures): @@ -84,7 +78,6 @@ def create_processor(self): self.processor = DataProcessor( model_name_or_path=self.model_name_or_path, reasoning_parser_obj=reasoning_parser_obj, - tool_parser_obj=tool_parser_obj, ) else: from fastdeploy.input.ernie_processor import ErnieProcessor @@ -92,7 +85,6 @@ def create_processor(self): self.processor = ErnieProcessor( model_name_or_path=self.model_name_or_path, reasoning_parser_obj=reasoning_parser_obj, - tool_parser_obj=tool_parser_obj, ) else: if not architectures.startswith("Ernie4_5_VLMoeForConditionalGeneration"): @@ -105,6 +97,5 @@ def create_processor(self): limit_mm_per_prompt=self.limit_mm_per_prompt, mm_processor_kwargs=self.mm_processor_kwargs, reasoning_parser_obj=reasoning_parser_obj, - tool_parser_obj=tool_parser_obj, ) return self.processor diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index 4bffee280d..eec346341a 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -148,7 +148,7 @@ def _load_tokenizer(self): class DataProcessor(BaseDataProcessor): - def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_obj=None): + def __init__(self, model_name_or_path, reasoning_parser_obj=None): """ Initializes the DecodeStatus object. @@ -168,7 +168,6 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_ob self._init_config() self.decode_status = dict() - self.tool_parsers = dict() self.tokenizer = self._load_tokenizer() data_processor_logger.info( f"tokenizer information: bos_token is {self.tokenizer.bos_token}, {self.tokenizer.bos_token_id}, \ @@ -181,7 +180,6 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_ob self.eos_token_id_len = len(self.eos_token_ids) self.pad_token_id = self.get_pad_id() self.reasoning_parser = None - self.tool_parser_obj = tool_parser_obj if reasoning_parser_obj: self.reasoning_parser = reasoning_parser_obj(self.tokenizer) self.tokenizer.pad_token_id = self.pad_token_id @@ -331,12 +329,6 @@ def process_response(self, response_dict, **kwargs): else: # 模型不支持思考,并且没单独设置enable_thinking为false response_dict.outputs.text = full_text - if self.tool_parser_obj: - tool_parser = self.tool_parser_obj(self.tokenizer) - tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict) - if tool_call_info.tools_called: - response_dict.outputs.tool_calls = tool_call_info.tool_calls - response_dict.outputs.text = tool_call_info.content data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}") return response_dict @@ -368,12 +360,6 @@ def process_response_dict_normal(self, response_dict, **kwargs): response_dict["outputs"]["reasoning_content"] = reasoning_content else: response_dict["outputs"]["text"] = full_text - if self.tool_parser_obj: - tool_parser = self.tool_parser_obj(self.tokenizer) - tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict) - if tool_call_info.tools_called: - response_dict["outputs"]["tool_call"] = tool_call_info.tool_calls - response_dict["outputs"]["text"] = tool_call_info.content data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}") del self.decode_status[req_id] 
return response_dict @@ -411,25 +397,9 @@ def process_response_dict_streaming(self, response_dict, **kwargs): response_dict["outputs"]["reasoning_content"] = reasoning_content else: response_dict["outputs"]["text"] = delta_text - if self.tool_parser_obj and not is_end: - if req_id not in self.tool_parsers: - self.tool_parsers[req_id] = self.tool_parser_obj(self.tokenizer) - tool_parser = self.tool_parsers[req_id] - tool_call = tool_parser.extract_tool_calls_streaming( - previous_texts, - previous_texts + delta_text, - delta_text, - previous_token_ids, - previous_token_ids + token_ids, - token_ids, - response_dict, - ) - response_dict["outputs"]["tool_delta_message"] = tool_call if is_end: data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}") del self.decode_status[req_id] - if req_id in self.tool_parsers: - del self.tool_parsers[req_id] return response_dict def process_response_dict(self, response_dict, **kwargs): diff --git a/fastdeploy/reasoning/__init__.py b/fastdeploy/reasoning/__init__.py index 51f59776e0..aa7d65e50b 100644 --- a/fastdeploy/reasoning/__init__.py +++ b/fastdeploy/reasoning/__init__.py @@ -16,7 +16,6 @@ from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager from .ernie_vl_reasoning_parsers import ErnieVLReasoningParser -from .ernie_x1_reasoning_parsers import ErnieX1ReasoningParser from .qwen3_reasoning_parsers import Qwen3ReasoningParser __all__ = [ @@ -24,5 +23,4 @@ "ReasoningParserManager", "ErnieVLReasoningParser", "Qwen3ReasoningParser", - "ErnieX1ReasoningParser", ] diff --git a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py b/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py deleted file mode 100644 index 4585052527..0000000000 --- a/fastdeploy/reasoning/ernie_x1_reasoning_parsers.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# -from collections.abc import Sequence -from typing import Tuple - -from fastdeploy.entrypoints.openai.protocol import ChatCompletionRequest -from fastdeploy.reasoning import ReasoningParser, ReasoningParserManager - -# -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -@ReasoningParserManager.register_module("ernie_x1") -class ErnieX1ReasoningParser(ReasoningParser): - """ - Reasoning parser for ernie_x1 model with stricter boundary checking. - - This implementation follows the user's proposed approach: - 1. For thinking content: waits for \n then checks for tag - 2. For response content: checks for tag first, then waits for \n - 3. 
Handles newlines in content more precisely - """ - - def __init__(self, tokenizer): - super().__init__(tokenizer) - self.think_end_token = "" - self.response_start_token = "" - self.response_end_token = "" - self.tool_call_start_token = "" - self.tool_call_end_token = "" - - if not self.model_tokenizer: - raise ValueError("The model tokenizer must be passed to the ReasoningParser constructor.") - - self.think_end_token_id = self.vocab.get("") - if self.think_end_token_id is None: - raise RuntimeError("Could not find think end token id in tokenizer vocabulary") - - def extract_reasoning_content_streaming( - self, - previous_text: str, - current_text: str, - delta_text: str, - previous_token_ids: Sequence[int], - current_token_ids: Sequence[int], - delta_token_ids: Sequence[int], - ) -> tuple[str, str]: - """ - 根据用户需求实现的流式解析方法: - 1. 初始内容都视为思考内容 - 2. 当遇到\n时检查后续是否是 - 3. 思考结束后检查是还是 - 4. 对于内容,处理换行和结束标记 - """ - # 如果还在思考阶段 - if not previous_text.endswith(self.think_end_token): - # 如果遇到\n后接或直接遇到,思考结束 - if (previous_text.endswith("\n") and delta_text == self.think_end_token) or ( - not previous_text.endswith("\n") and delta_text == self.think_end_token - ): - return "", "" - # 否则继续返回思考内容 - return delta_text, "" - - # 思考结束后检查是tool_call还是response - remaining_text = previous_text + delta_text - after_think = remaining_text[remaining_text.find(self.think_end_token) + len(self.think_end_token) :] - - # 跳过think后的换行 - after_think = after_think.lstrip("\n") - - # 处理tool_call情况 - if after_think.startswith(self.tool_call_start_token): - return "", "" - - # 处理response情况 - if after_think.startswith(self.response_start_token): - response_content = after_think[len(self.response_start_token) :] - # 跳过response后的换行 - response_content = response_content.lstrip("\n") - - # 检查response是否结束 - if response_content.endswith(self.response_end_token): - return "", "" - - # 返回response内容(使用delta_text确保流式输出) - return "", delta_text - - # 默认情况不返回内容 - return "", "" - - def extract_reasoning_content(self, model_output: str, request: ChatCompletionRequest) -> Tuple[str, str]: - """ - Batch version of the enhanced parser. - Modified to preserve newlines in both reasoning and response content, - only removing the single newline before closing tags. 
- """ - reasoning_content = "" - response_content = "" - - think_end_pos = model_output.find(self.think_end_token) - if think_end_pos != -1: - # Extract thinking content - only remove the last newline before - reasoning_content = model_output[:think_end_pos] - if think_end_pos > 0 and reasoning_content[-1] == "\n": - reasoning_content = reasoning_content[:-1] - - remaining = model_output[think_end_pos + len(self.think_end_token) :] - - # Skip newlines after - remaining = remaining.lstrip("\n") - - # Check for response or tool_call - if remaining.startswith(self.response_start_token): - response_pos = len(self.response_start_token) - remaining = remaining[response_pos:].lstrip("\n") - response_end_pos = remaining.find(self.response_end_token) - if response_end_pos != -1: - # Only strip the last newline before , not all - if response_end_pos > 0 and remaining[response_end_pos - 1] == "\n": - response_content = remaining[: response_end_pos - 1] - else: - response_content = remaining[:response_end_pos] - else: - # If no found, return the rest as response content - response_content = remaining - elif remaining.startswith(self.tool_call_start_token): - pass # No response content - else: - # No thinking content found, return the whole input as reasoning - reasoning_content = model_output - response_content = "" - return reasoning_content, response_content - - -import unittest -from unittest.mock import MagicMock - - -class TestErnieX1ReasoningParser(unittest.TestCase): - def setUp(self): - self.tokenizer = MagicMock() - self.tokenizer.vocab = { - "\n\n\n": 1001, - "\n": 1002, - "\n\n": 1003, - "\n": 1004, - "\n\n": 1005, - } - self.parser = ErnieX1ReasoningParser(self.tokenizer) - - def test_streaming_with_think_and_response(self): - # 测试标准情况:\n\n\n\ncontent\n\n - prev_text = "thinking" - delta_text = "\n\n\n\nanswer\n\n" - result = self.parser.extract_reasoning_content_streaming(prev_text, "", delta_text, [], [], []) - self.assertEqual(result, ("thinking", "answer")) - - def test_streaming_with_think_and_tool_call(self): - # 测试tool_call情况 - prev_text = "thinking" - delta_text = "\n\n\n\ndetails\n\n" - result = self.parser.extract_reasoning_content_streaming(prev_text, "", delta_text, [], [], []) - self.assertEqual(result, ("thinking", "")) - - def test_streaming_with_think_no_newline(self): - # 测试没有前置换行的情况 - prev_text = "thinking" - delta_text = "\n\nanswer\n" - result = self.parser.extract_reasoning_content_streaming(prev_text, "", delta_text, [], [], []) - self.assertEqual(result, ("thinking", "answer")) - - def test_streaming_response_without_leading_newline(self): - # 测试response内容没有前置换行 - prev_text = "thinking\n\n\n" - delta_text = "answer\n\n" - result = self.parser.extract_reasoning_content_streaming(prev_text, "", delta_text, [1001], [], []) - self.assertEqual(result, ("thinking", "answer")) - - def test_streaming_response_with_middle_newline(self): - # 测试response内容中间的换行符 - prev_text = "thinking\n\n\n\n" - delta_text = "line1\nline2\n\n" - result = self.parser.extract_reasoning_content_streaming(prev_text, "", delta_text, [1001], [], []) - self.assertEqual(result, ("thinking", "line1\nline2")) - - def test_streaming_partial_response(self): - # 测试不完整的response流式输出 - prev_text = "thinking\n\n\n\n" - delta_text = "partial answer" - result = self.parser.extract_reasoning_content_streaming(prev_text, "", delta_text, [1001], [], []) - self.assertEqual(result, ("thinking", "partial answer")) - - -if __name__ == "__main__": - unittest.main() diff --git a/fastdeploy/utils.py b/fastdeploy/utils.py 
index 70e5df1299..5d68c7681e 100644 --- a/fastdeploy/utils.py +++ b/fastdeploy/utils.py @@ -23,7 +23,6 @@ import random import re import socket -import sys import tarfile import time from datetime import datetime @@ -592,22 +591,6 @@ def is_list_of( assert_never(check) -def import_from_path(module_name: str, file_path: Union[str, os.PathLike]): - """ - Import a Python file according to its file path. - """ - spec = importlib.util.spec_from_file_location(module_name, file_path) - if spec is None: - raise ModuleNotFoundError(f"No module named '{module_name}'") - - assert spec.loader is not None - - module = importlib.util.module_from_spec(spec) - sys.modules[module_name] = module - spec.loader.exec_module(module) - return module - - def version(): """ Prints the contents of the version.txt file located in the parent directory of this script. diff --git a/requirements.txt b/requirements.txt index 0e0d5ca6f7..55489db3a7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,4 +37,3 @@ opentelemetry-instrumentation-mysql opentelemetry-distro  opentelemetry-exporter-otlp opentelemetry-instrumentation-fastapi -partial_json_parser From 101605869c87e5cb7f1a2c702fa592b1b5236c93 Mon Sep 17 00:00:00 2001 From: yinwei Date: Thu, 14 Aug 2025 17:41:40 +0800 Subject: [PATCH 10/13] [XPU] Fixed the issue of performance degradation caused by enabling ENABLE_V1_KVCACHE_SCHEDULER (#3393) * fix v1 schedule oom bug * fix v1 schedule oom bug --- fastdeploy/engine/args_utils.py | 11 ++++++++--- fastdeploy/engine/config.py | 9 ++++++--- fastdeploy/engine/sched/resource_manager_v1.py | 2 +- fastdeploy/worker/xpu_model_runner.py | 9 +++++++-- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index c254aaa1a2..835d3eb4dc 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -15,10 +15,12 @@ """ import json +import os from dataclasses import asdict, dataclass from dataclasses import fields as dataclass_fields from typing import Any, Dict, List, Optional -import os + +import paddle from fastdeploy.config import ( CacheConfig, @@ -866,10 +868,13 @@ def create_engine_config(self) -> Config: if self.enable_chunked_prefill: self.max_num_batched_tokens = 2048 else: - if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')): + if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")): self.max_num_batched_tokens = self.max_model_len else: - self.max_num_batched_tokens = 8192 + if paddle.is_compiled_with_xpu(): + self.max_num_batched_tokens = self.max_model_len + else: + self.max_num_batched_tokens = 8192 all_dict = asdict(self) all_dict["model_cfg"] = model_cfg diff --git a/fastdeploy/engine/config.py b/fastdeploy/engine/config.py index fb57884bf3..f6303d7b3a 100644 --- a/fastdeploy/engine/config.py +++ b/fastdeploy/engine/config.py @@ -236,10 +236,13 @@ def postprocess(self): if self.cache_config.enable_chunked_prefill: self.max_num_batched_tokens = 2048 else: - if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')): + if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")): self.max_num_batched_tokens = self.max_model_len else: - self.max_num_batched_tokens = 8192 + if paddle.is_compiled_with_xpu(): + self.max_num_batched_tokens = self.max_model_len + else: + self.max_num_batched_tokens = 8192 if self.long_prefill_token_threshold == 0: self.long_prefill_token_threshold = int(self.max_model_len * 0.04) @@ -287,7 +290,7 @@ def check(self): ) if not self.cache_config.enable_chunked_prefill: - if not 
int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')): + if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")): assert self.max_num_batched_tokens >= self.max_model_len, ( f"max_num_batched_tokens: {self.max_num_batched_tokens} " f"should be larger than or equal to max_model_len: {self.max_model_len}" diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index 4aecabcd5d..ba0197a90b 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -289,7 +289,7 @@ def schedule(self): while self.waiting and token_budget > 0: if len(self.running) == self.max_num_seqs: break - if self.config.enable_mm and self.exist_prefill(scheduled_reqs): + if (self.config.enable_mm or paddle.is_compiled_with_xpu()) and self.exist_prefill(scheduled_reqs): break request = self.waiting[0] if request.status == RequestStatus.WAITING: diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index a5558ac470..3c76b9a2c8 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -383,15 +383,18 @@ def insert_tasks_v1(self, req_dicts: List[Request]): req_len = len(req_dicts) has_prefill_task = False + has_decode_task = False for i in range(req_len): request = req_dicts[i] idx = request.idx if request.task_type.value == RequestType.PREFILL.value: # prefill task - logger.debug(f"Handle prefill request {request} at idx {idx}") prefill_start_index = request.prefill_start_index prefill_end_index = request.prefill_end_index length = prefill_end_index - prefill_start_index input_ids = request.prompt_token_ids + request.output_token_ids + logger.debug( + f"Handle prefill request {request} at idx {idx} prefill_start_index {prefill_start_index} prefill_end_index {prefill_end_index} need_prefilled_token_num {len(input_ids)}" + ) self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array( input_ids[prefill_start_index:prefill_end_index] ) @@ -420,6 +423,8 @@ def insert_tasks_v1(self, req_dicts: List[Request]): self.share_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array( request.block_tables, dtype="int32" ) + if self.share_inputs["is_block_step"][idx]: # has tasks to continue to decode + has_decode_task = True continue else: # preempted task logger.debug(f"Handle preempted request {request} at idx {idx}") @@ -460,7 +465,7 @@ def insert_tasks_v1(self, req_dicts: List[Request]): self.share_inputs["stop_seqs"][:stop_seqs_num, : len(request.get("stop_token_ids")[0])] = np.array( request.get("stop_token_ids"), dtype="int64" ) - if has_prefill_task: + if has_prefill_task or has_decode_task: self.share_inputs["not_need_stop"][0] = True def process_prefill_inputs(self, req_dicts: List[Request]): From ad8ea689067f71c0d4f1be2b8d84ef2170aec63f Mon Sep 17 00:00:00 2001 From: memoryCoderC <1137889088@qq.com> Date: Thu, 14 Aug 2025 19:10:07 +0800 Subject: [PATCH 11/13] [BugFix] fix ErnieProcessor not set raw_prediction (#3401) --- fastdeploy/input/ernie_processor.py | 1 + test/input/test_ernie_processor.py | 53 +++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 test/input/test_ernie_processor.py diff --git a/fastdeploy/input/ernie_processor.py b/fastdeploy/input/ernie_processor.py index a268ad562e..7cbb847f79 100644 --- a/fastdeploy/input/ernie_processor.py +++ b/fastdeploy/input/ernie_processor.py @@ -274,6 +274,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs): if token_ids[-1] == 
self.tokenizer.eos_token_id: token_ids = token_ids[:-1] delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id) + response_dict["outputs"]["raw_prediction"] = delta_text if enable_thinking and self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content_streaming( previous_texts, diff --git a/test/input/test_ernie_processor.py b/test/input/test_ernie_processor.py new file mode 100644 index 0000000000..19226b1622 --- /dev/null +++ b/test/input/test_ernie_processor.py @@ -0,0 +1,53 @@ +import unittest +from unittest.mock import MagicMock, patch + +from fastdeploy.input.ernie_processor import ErnieProcessor + + +class TestErnieProcessorProcessResponseDictStreaming(unittest.TestCase): + def setUp(self): + # 创建 ErnieProcessor 实例的模拟对象 + with patch.object(ErnieProcessor, "__init__", return_value=None) as mock_init: + self.processor = ErnieProcessor("model_path") + mock_init.side_effect = lambda *args, **kwargs: print(f"__init__ called with {args}, {kwargs}") + + # 设置必要的属性 + self.processor.tokenizer = MagicMock() + self.processor.tokenizer.eos_token_id = 1 + self.processor.decode_status = {} + self.processor.tool_parsers = {} + + # 模拟 ids2tokens 方法 + def mock_ids2tokens(token_ids, task_id): + return "delta_text", [2, 3], "previous_texts" + + self.processor.ids2tokens = mock_ids2tokens + + # 模拟推理解析器 + self.mock_reasoning_parser = MagicMock() + self.mock_reasoning_parser.__class__.__name__ = "ErnieX1ReasoningParser" + self.mock_reasoning_parser.extract_reasoning_content_streaming.return_value = ("reasoning", "text") + self.processor.reasoning_parser = self.mock_reasoning_parser + + # 模拟工具解析器 + self.mock_tool_parser = MagicMock() + self.mock_tool_parser.extract_tool_calls_streaming.return_value = "tool_call" + self.mock_tool_parser_obj = MagicMock() + self.mock_tool_parser_obj.return_value = self.mock_tool_parser + self.processor.tool_parser_obj = self.mock_tool_parser_obj + + def test_process_response_dict_streaming_normal_case(self): + """测试正常情况下的流式响应处理""" + # 准备输入 + response_dict = {"finished": False, "request_id": "req1", "outputs": {"token_ids": [4, 5]}} + kwargs = {"enable_thinking": True} + + # 调用方法 + result = self.processor.process_response_dict_streaming(response_dict, **kwargs) + + # 验证结果 + self.assertEqual(result["outputs"]["raw_prediction"], "delta_text") + + +if __name__ == "__main__": + unittest.main() From 8a15bdc0c8a4e2e378790e9be460f9058e38a1a5 Mon Sep 17 00:00:00 2001 From: yinwei Date: Thu, 14 Aug 2025 19:11:16 +0800 Subject: [PATCH 12/13] [Doc]Release fastdeploy-xpu 2.1.0 (#3407) * fix v1 schedule oom bug * fix v1 schedule oom bug * update release note --- docs/get_started/installation/kunlunxin_xpu.md | 14 +++++++------- docs/usage/kunlunxin_xpu_deployment.md | 8 +++++++- docs/zh/get_started/installation/kunlunxin_xpu.md | 14 +++++++------- docs/zh/usage/kunlunxin_xpu_deployment.md | 6 ++++++ 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/docs/get_started/installation/kunlunxin_xpu.md b/docs/get_started/installation/kunlunxin_xpu.md index 39c1832ca3..4950347ce1 100644 --- a/docs/get_started/installation/kunlunxin_xpu.md +++ b/docs/get_started/installation/kunlunxin_xpu.md @@ -5,7 +5,7 @@ - OS: Linux - Python: 3.10 - XPU Model: P800 -- XPU Driver Version: ≥ 5.0.21.10 +- XPU Driver Version: ≥ 5.0.21.26 - XPU Firmware Version: ≥ 1.31 Verified platform: @@ -15,7 +15,7 @@ Verified platform: - OS: CentOS release 7.6 (Final) - Python: 3.10 - XPU Model: P800 (OAM Edition) -- XPU Driver Version: 5.0.21.10 +- 
XPU Driver Version: 5.0.21.26 - XPU Firmware Version: 1.31 **Note:** Currently, only INTEL or Hygon CPU-based P800 (OAM Edition) servers have been verified. Other CPU types and P800 (PCIe Edition) servers have not been tested yet. @@ -25,9 +25,9 @@ Verified platform: ```bash mkdir Work cd Work -docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.3 +docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0 docker run --name fastdeploy-xpu --net=host -itd --privileged -v $PWD:/Work -w /Work \ - ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.3 \ + ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0 \ /bin/bash docker exec -it fastdeploy-xpu /bin/bash ``` @@ -37,7 +37,7 @@ docker exec -it fastdeploy-xpu /bin/bash ### Install PaddlePaddle ```bash -python -m pip install paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ +python -m pip install paddlepaddle-xpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ ``` Alternatively, you can install the latest version of PaddlePaddle (Not recommended) @@ -49,7 +49,7 @@ python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/ ### Install FastDeploy (**Do NOT install via PyPI source**) ```bash -python -m pip install fastdeploy-xpu==2.0.3 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple +python -m pip install fastdeploy-xpu==2.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple ``` Alternatively, you can install the latest version of FastDeploy (Not recommended) @@ -63,7 +63,7 @@ python -m pip install --pre fastdeploy-xpu -i https://www.paddlepaddle.org.cn/pa ### Install PaddlePaddle ```bash -python -m pip install paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ +python -m pip install paddlepaddle-xpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ ``` Alternatively, you can install the latest version of PaddlePaddle (Not recommended) diff --git a/docs/usage/kunlunxin_xpu_deployment.md b/docs/usage/kunlunxin_xpu_deployment.md index 4eb7c70f87..455152d59c 100644 --- a/docs/usage/kunlunxin_xpu_deployment.md +++ b/docs/usage/kunlunxin_xpu_deployment.md @@ -5,8 +5,14 @@ |ERNIE-4.5-300B-A47B|32K|WINT4|4 (recommend)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"
python -m fastdeploy.entrypoints.openai.api_server \<br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br> --port 8188 \<br> --tensor-parallel-size 4 \<br> --max-model-len 32768 \<br> --max-num-seqs 64 \<br> --quantization "wint4" \<br> --gpu-memory-utilization 0.9|>=2.0.0|
 |ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \<br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br> --port 8188 \<br> --tensor-parallel-size 8 \<br> --max-model-len 32768 \<br> --max-num-seqs 64 \<br> --quantization "wint4" \<br> --gpu-memory-utilization 0.9|>=2.0.0|
 |ERNIE-4.5-300B-A47B|128K|WINT4|8 (recommend)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \<br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br> --port 8188 \<br> --tensor-parallel-size 8 \<br> --max-model-len 131072 \<br> --max-num-seqs 64 \<br> --quantization "wint4" \<br> --gpu-memory-utilization 0.9|>=2.0.0|
+|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br> --port 8188 \<br> --tensor-parallel-size 1 \<br> --max-model-len 32768 \<br> --max-num-seqs 128 \<br> --gpu-memory-utilization 0.9|>=2.1.0|
+|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br> --port 8188 \<br> --tensor-parallel-size 1 \<br> --max-model-len 32768 \<br> --max-num-seqs 128 \<br> --quantization "wint8" \<br> --gpu-memory-utilization 0.9|>=2.1.0|
+|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br> --port 8188 \<br> --tensor-parallel-size 1 \<br> --max-model-len 32768 \<br> --max-num-seqs 128 \<br> --quantization "wint4" \<br> --gpu-memory-utilization 0.9|>=2.1.0|
+|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br> --port 8188 \<br> --tensor-parallel-size 1 \<br> --max-model-len 131072 \<br> --max-num-seqs 128 \<br> --gpu-memory-utilization 0.9|>=2.1.0|
+|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br> --port 8188 \<br> --tensor-parallel-size 1 \<br> --max-model-len 131072 \<br> --max-num-seqs 128 \<br> --quantization "wint8" \<br> --gpu-memory-utilization 0.9|>=2.1.0|
+|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br> --port 8188 \<br> --tensor-parallel-size 1 \<br> --max-model-len 131072 \<br> --max-num-seqs 128 \<br> --quantization "wint4" \<br> --gpu-memory-utilization 0.9|>=2.1.0|
 |ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br> --port 8188 \<br> --tensor-parallel-size 1 \<br> --max-model-len 32768 \<br> --max-num-seqs 128 \<br> --gpu-memory-utilization 0.9|>=2.0.3|
-|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="x" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br> --port 8188 \<br> --tensor-parallel-size 1 \<br> --max-model-len 32768 \<br> --max-num-seqs 128 \<br> --quantization "wint8" \<br> --gpu-memory-utilization 0.9|>=2.0.3|
+|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br> --port 8188 \<br> --tensor-parallel-size 1 \<br> --max-model-len 32768 \<br> --max-num-seqs 128 \<br> --quantization "wint8" \<br> --gpu-memory-utilization 0.9|>=2.0.3|
 |ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br> --port 8188 \<br> --tensor-parallel-size 1 \<br> --max-model-len 131072 \<br> --max-num-seqs 128 \<br> --gpu-memory-utilization 0.9|>=2.0.3|
 |ERNIE-4.5-0.3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br> --port 8188 \<br> --tensor-parallel-size 1 \
--max-model-len 131072 \
--max-num-seqs 128 \
--quantization "wint8" \
--gpu-memory-utilization 0.9|>=2.0.3| diff --git a/docs/zh/get_started/installation/kunlunxin_xpu.md b/docs/zh/get_started/installation/kunlunxin_xpu.md index 2e77dac4c5..e01d7c0f72 100644 --- a/docs/zh/get_started/installation/kunlunxin_xpu.md +++ b/docs/zh/get_started/installation/kunlunxin_xpu.md @@ -5,7 +5,7 @@ - OS:Linux - Python:3.10 - XPU 型号:P800 -- XPU 驱动版本:≥ 5.0.21.10 +- XPU 驱动版本:≥ 5.0.21.26 - XPU 固件版本:≥ 1.31 已验证的平台: @@ -15,7 +15,7 @@ - OS:CentOS release 7.6 (Final) - Python:3.10 - XPU 型号:P800(OAM 版) -- XPU 驱动版本:5.0.21.10 +- XPU 驱动版本:5.0.21.26 - XPU 固件版本:1.31 **注:** 目前只验证过 INTEL 或海光 CPU OAM 版 P800 服务器,暂未验证其它 CPU 和 PCIe 版 P800 服务器。 @@ -25,9 +25,9 @@ ```bash mkdir Work cd Work -docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.3 +docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0 docker run --name fastdeploy-xpu --net=host -itd --privileged -v $PWD:/Work -w /Work \ - ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.3 \ + ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0 \ /bin/bash docker exec -it fastdeploy-xpu /bin/bash ``` @@ -37,7 +37,7 @@ docker exec -it fastdeploy-xpu /bin/bash ### 安装 PaddlePaddle ```bash -python -m pip install paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ +python -m pip install paddlepaddle-xpu==3.1.1-i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ ``` 或者您也可以安装最新版 PaddlePaddle(不推荐) @@ -49,7 +49,7 @@ python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/ ### 安装 FastDeploy(**注意不要通过 pypi 源安装**) ```bash -python -m pip install fastdeploy-xpu==2.0.3 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple +python -m pip install fastdeploy-xpu==2.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple ``` 或者你也可以安装最新版 FastDeploy(不推荐) @@ -63,7 +63,7 @@ python -m pip install --pre fastdeploy-xpu -i https://www.paddlepaddle.org.cn/pa ### 安装 PaddlePaddle ```bash -python -m pip install paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ +python -m pip install paddlepaddle-xpu==3.1.1-i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ ``` 或者您也可以安装最新版 PaddlePaddle(不推荐) diff --git a/docs/zh/usage/kunlunxin_xpu_deployment.md b/docs/zh/usage/kunlunxin_xpu_deployment.md index fa4501f5c8..aabfd14925 100644 --- a/docs/zh/usage/kunlunxin_xpu_deployment.md +++ b/docs/zh/usage/kunlunxin_xpu_deployment.md @@ -5,6 +5,12 @@ |ERNIE-4.5-300B-A47B|32K|WINT4|4 (推荐)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"
python -m fastdeploy.entrypoints.openai.api_server \
--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \
--port 8188 \
--tensor-parallel-size 4 \
--max-model-len 32768 \
--max-num-seqs 64 \
--quantization "wint4" \
--gpu-memory-utilization 0.9|>=2.0.0| |ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
python -m fastdeploy.entrypoints.openai.api_server \
--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \
--port 8188 \
--tensor-parallel-size 8 \
--max-model-len 32768 \
--max-num-seqs 64 \
--quantization "wint4" \
--gpu-memory-utilization 0.9|>=2.0.0| |ERNIE-4.5-300B-A47B|128K|WINT4|8 (推荐)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
python -m fastdeploy.entrypoints.openai.api_server \
--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \
--port 8188 \
--tensor-parallel-size 8 \
--max-model-len 131072 \
--max-num-seqs 64 \
--quantization "wint4" \
--gpu-memory-utilization 0.9|>=2.0.0| +|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡
python -m fastdeploy.entrypoints.openai.api_server \
--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \
--port 8188 \
--tensor-parallel-size 1 \
--max-model-len 32768 \
--max-num-seqs 128 \
--gpu-memory-utilization 0.9|>=2.1.0| +|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡
python -m fastdeploy.entrypoints.openai.api_server \
--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \
--port 8188 \
--tensor-parallel-size 1 \
--max-model-len 32768 \
--max-num-seqs 128 \
--quantization "wint8" \
--gpu-memory-utilization 0.9|>=2.1.0| +|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡
python -m fastdeploy.entrypoints.openai.api_server \
--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \
--port 8188 \
--tensor-parallel-size 1 \
--max-model-len 32768 \
--max-num-seqs 128 \
--quantization "wint4" \
--gpu-memory-utilization 0.9|>=2.1.0| +|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡
python -m fastdeploy.entrypoints.openai.api_server \
--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \
--port 8188 \
--tensor-parallel-size 1 \
--max-model-len 131072 \
--max-num-seqs 128 \
--gpu-memory-utilization 0.9|>=2.1.0| +|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡
python -m fastdeploy.entrypoints.openai.api_server \
--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \
--port 8188 \
--tensor-parallel-size 1 \
--max-model-len 131072 \
--max-num-seqs 128 \
--quantization "wint8" \
--gpu-memory-utilization 0.9|>=2.1.0| +|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡
python -m fastdeploy.entrypoints.openai.api_server \
--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \
--port 8188 \
--tensor-parallel-size 1 \
--max-model-len 131072 \
--max-num-seqs 128 \
--quantization "wint4" \
--gpu-memory-utilization 0.9|>=2.1.0| |ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡
python -m fastdeploy.entrypoints.openai.api_server \
--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \
--port 8188 \
--tensor-parallel-size 1 \
--max-model-len 32768 \
--max-num-seqs 128 \
--gpu-memory-utilization 0.9|>=2.0.3| |ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="x" # 指定任意一张卡
python -m fastdeploy.entrypoints.openai.api_server \
--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \
--port 8188 \
--tensor-parallel-size 1 \
--max-model-len 32768 \
--max-num-seqs 128 \
--quantization "wint8" \
--gpu-memory-utilization 0.9|>=2.0.3| |ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡
python -m fastdeploy.entrypoints.openai.api_server \
--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \
--port 8188 \
--tensor-parallel-size 1 \
--max-model-len 131072 \
--max-num-seqs 128 \
--gpu-memory-utilization 0.9|>=2.0.3| From d998efbc17ece64178ae4a5214e89f44153f265e Mon Sep 17 00:00:00 2001 From: yinwei Date: Thu, 14 Aug 2025 19:19:54 +0800 Subject: [PATCH 13/13] [Doc]Release fastdeploy-xpu 2.0.3 (#3408) * fix v1 schedule oom bug * fix v1 schedule oom bug * update release note * update info --- docs/zh/get_started/installation/kunlunxin_xpu.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/zh/get_started/installation/kunlunxin_xpu.md b/docs/zh/get_started/installation/kunlunxin_xpu.md index e01d7c0f72..c14f49f5f6 100644 --- a/docs/zh/get_started/installation/kunlunxin_xpu.md +++ b/docs/zh/get_started/installation/kunlunxin_xpu.md @@ -37,7 +37,7 @@ docker exec -it fastdeploy-xpu /bin/bash ### 安装 PaddlePaddle ```bash -python -m pip install paddlepaddle-xpu==3.1.1-i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ +python -m pip install paddlepaddle-xpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ ``` 或者您也可以安装最新版 PaddlePaddle(不推荐) @@ -63,7 +63,7 @@ python -m pip install --pre fastdeploy-xpu -i https://www.paddlepaddle.org.cn/pa ### 安装 PaddlePaddle ```bash -python -m pip install paddlepaddle-xpu==3.1.1-i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ +python -m pip install paddlepaddle-xpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/ ``` 或者您也可以安装最新版 PaddlePaddle(不推荐)
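For reference, a server started with any of the deployment commands in the tables above can be smoke-tested with an OpenAI-style request. The sketch below is an assumption-laden example, not part of the patch: it presumes the server is reachable on the `--port` value used in the tables (8188) and exposes the usual OpenAI-compatible `/v1/chat/completions` route, and the model name should be replaced with whichever model is actually being served.

```bash
# Minimal smoke test against a locally running api_server.
# Assumptions: port 8188 (as used in the deployment tables above) and the
# OpenAI-compatible /v1/chat/completions endpoint; adjust both if your
# deployment differs.
curl -s http://127.0.0.1:8188/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "PaddlePaddle/ERNIE-4.5-0.3B-Paddle",
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 32
      }'
```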