From fcc494cd88c593f3daa364f3c0671c3f23eebbe4 Mon Sep 17 00:00:00 2001
From: iosmers
Date: Thu, 14 Aug 2025 04:56:00 +0000
Subject: [PATCH 1/3] fix v1 scheduler OOM bug

---
 fastdeploy/engine/args_utils.py                | 6 +++++-
 fastdeploy/engine/config.py                    | 5 ++++-
 fastdeploy/engine/sched/resource_manager_v1.py | 2 +-
 fastdeploy/worker/xpu_model_runner.py          | 7 +++++--
 4 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index c254aaa1a2..018edd9ab3 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -19,6 +19,7 @@
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional
 import os
+import paddle
 
 from fastdeploy.config import (
     CacheConfig,
@@ -869,7 +870,10 @@ def create_engine_config(self) -> Config:
             if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
                 self.max_num_batched_tokens = self.max_model_len
             else:
-                self.max_num_batched_tokens = 8192
+                if paddle.is_compiled_with_xpu():
+                    self.max_num_batched_tokens = self.max_model_len
+                else:
+                    self.max_num_batched_tokens = 8192
 
         all_dict = asdict(self)
         all_dict["model_cfg"] = model_cfg
diff --git a/fastdeploy/engine/config.py b/fastdeploy/engine/config.py
index fb57884bf3..36c5686e4f 100644
--- a/fastdeploy/engine/config.py
+++ b/fastdeploy/engine/config.py
@@ -239,7 +239,10 @@ def postprocess(self):
             if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
                 self.max_num_batched_tokens = self.max_model_len
             else:
-                self.max_num_batched_tokens = 8192
+                if paddle.is_compiled_with_xpu():
+                    self.max_num_batched_tokens = self.max_model_len
+                else:
+                    self.max_num_batched_tokens = 8192
 
         if self.long_prefill_token_threshold == 0:
             self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py
index d1116980c5..aeb5cfcc1e 100644
--- a/fastdeploy/engine/sched/resource_manager_v1.py
+++ b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -284,7 +284,7 @@ def schedule(self):
         while self.waiting and token_budget > 0:
             if len(self.running) == self.max_num_seqs:
                 break
-            if self.config.enable_mm and self.exist_prefill(scheduled_reqs):
+            if (self.config.enable_mm or paddle.is_compiled_with_xpu())and self.exist_prefill(scheduled_reqs):
                 break
             request = self.waiting[0]
             if request.status == RequestStatus.WAITING:
diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
index a5558ac470..d9596b5f92 100644
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -383,15 +383,16 @@ def insert_tasks_v1(self, req_dicts: List[Request]):
         req_len = len(req_dicts)
         has_prefill_task = False
+        has_decode_task = False
         for i in range(req_len):
             request = req_dicts[i]
             idx = request.idx
             if request.task_type.value == RequestType.PREFILL.value:  # prefill task
-                logger.debug(f"Handle prefill request {request} at idx {idx}")
                 prefill_start_index = request.prefill_start_index
                 prefill_end_index = request.prefill_end_index
                 length = prefill_end_index - prefill_start_index
                 input_ids = request.prompt_token_ids + request.output_token_ids
+                logger.debug(f"Handle prefill request {request} at idx {idx} prefill_start_index {prefill_start_index} prefill_end_index {prefill_end_index} need_prefilled_token_num {len(input_ids)}")
                 self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array(
                     input_ids[prefill_start_index:prefill_end_index]
                 )
@@ -420,6 +421,8 @@ def insert_tasks_v1(self, req_dicts: List[Request]):
                 self.share_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array(
                     request.block_tables, dtype="int32"
                 )
+                if self.share_inputs["is_block_step"][idx]:  # has tasks to continue to decode
+                    has_decode_task = True
                 continue
             else:  # preempted task
                 logger.debug(f"Handle preempted request {request} at idx {idx}")
@@ -460,7 +463,7 @@ def insert_tasks_v1(self, req_dicts: List[Request]):
             self.share_inputs["stop_seqs"][:stop_seqs_num, : len(request.get("stop_token_ids")[0])] = np.array(
                 request.get("stop_token_ids"), dtype="int64"
            )
-        if has_prefill_task:
+        if has_prefill_task or has_decode_task:
             self.share_inputs["not_need_stop"][0] = True
 
     def process_prefill_inputs(self, req_dicts: List[Request]):
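Note on the core of this fix: with the V1 KV-cache scheduler enabled, `max_num_batched_tokens` previously fell back to a fixed 8192, which on P800 XPUs can admit more concurrent prefill work than the KV cache holds. A minimal standalone sketch of the selection logic after this patch, for reference only; the function name and the `is_xpu` parameter are illustrative, not FastDeploy API:

```python
import os


def pick_max_num_batched_tokens(max_model_len: int, enable_chunked_prefill: bool, is_xpu: bool) -> int:
    """Sketch of the budget selection in create_engine_config()/postprocess() after this patch."""
    if enable_chunked_prefill:
        return 2048
    if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
        # V1 scheduler off: the batching budget simply tracks the context length.
        return max_model_len
    # V1 scheduler on: XPU builds now cap the budget at max_model_len instead
    # of the fixed 8192 that could over-admit prefill and trigger the OOM.
    return max_model_len if is_xpu else 8192
```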
From dbeb7de00a23a19237759d2a86de3171a4ac01e7 Mon Sep 17 00:00:00 2001
From: iosmers
Date: Thu, 14 Aug 2025 04:58:04 +0000
Subject: [PATCH 2/3] fix v1 scheduler OOM bug

---
 fastdeploy/engine/args_utils.py                |  5 +++--
 fastdeploy/engine/config.py                    |  4 ++--
 fastdeploy/engine/sched/resource_manager_v1.py | 18 ++++++++++++------
 fastdeploy/worker/xpu_model_runner.py          |  4 +++-
 4 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index 018edd9ab3..835d3eb4dc 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -15,10 +15,11 @@
 """
 
 import json
+import os
 from dataclasses import asdict, dataclass
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional
-import os
+
 import paddle
 
 from fastdeploy.config import (
@@ -867,7 +868,7 @@ def create_engine_config(self) -> Config:
         if self.enable_chunked_prefill:
             self.max_num_batched_tokens = 2048
         else:
-            if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
+            if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
                 self.max_num_batched_tokens = self.max_model_len
             else:
                 if paddle.is_compiled_with_xpu():
diff --git a/fastdeploy/engine/config.py b/fastdeploy/engine/config.py
index 36c5686e4f..f6303d7b3a 100644
--- a/fastdeploy/engine/config.py
+++ b/fastdeploy/engine/config.py
@@ -236,7 +236,7 @@ def postprocess(self):
         if self.cache_config.enable_chunked_prefill:
             self.max_num_batched_tokens = 2048
         else:
-            if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
+            if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
                 self.max_num_batched_tokens = self.max_model_len
             else:
                 if paddle.is_compiled_with_xpu():
@@ -290,7 +290,7 @@ def check(self):
         )
 
         if not self.cache_config.enable_chunked_prefill:
-            if not int(os.getenv('ENABLE_V1_KVCACHE_SCHEDULER', '0')):
+            if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
                 assert self.max_num_batched_tokens >= self.max_model_len, (
                     f"max_num_batched_tokens: {self.max_num_batched_tokens} "
                     f"should be larger than or equal to max_model_len: {self.max_model_len}"
diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py
index aeb5cfcc1e..d4d6c6143e 100644
--- a/fastdeploy/engine/sched/resource_manager_v1.py
+++ b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -97,13 +97,13 @@ def _prepare_decode_task(self, request):
 
     def _prepare_preempt_task(self, request):
         return ScheduledPreemptTask(idx=request.idx, request_id=request.request_id)
-    
+
     def reschedule_preempt_task(self, request_id):
         with self.lock:
             if request_id in self.to_be_rescheduled_request_id_set and request_id in self.requests:
                 request = self.requests[request_id]
                 self.waiting.appendleft(request)
-                self.to_be_rescheduled_request_id_set.remove(request_id)
+                self.to_be_rescheduled_request_id_set.remove(request_id)
@@ -284,7 +284,7 @@ def schedule(self):
         while self.waiting and token_budget > 0:
             if len(self.running) == self.max_num_seqs:
                 break
-            if (self.config.enable_mm or paddle.is_compiled_with_xpu())and self.exist_prefill(scheduled_reqs):
+            if (self.config.enable_mm or paddle.is_compiled_with_xpu()) and self.exist_prefill(scheduled_reqs):
                 break
             request = self.waiting[0]
             if request.status == RequestStatus.WAITING:
@@ -421,9 +421,15 @@ def finish_requests(self, request_ids: Union[str, Iterable[str]]):
                 self.running.remove(request)
                 request.status = RequestStatus.FINISHED
                 self._free_blocks(request)
-                if request.request_id in self.to_be_rescheduled_request_id_set:  # finished after preempted, blocks have been recycled.
-                    self.to_be_rescheduled_request_id_set.remove(request.request_id)  # just remove from to_be_rescheduled_request_id_set
-                if request in self.waiting:  # after finished, this request still scheduled from preempted to waiting, unexpected error, should not be here
+                if (
+                    request.request_id in self.to_be_rescheduled_request_id_set
+                ):  # finished after preempted, blocks have been recycled.
+                    self.to_be_rescheduled_request_id_set.remove(
+                        request.request_id
+                    )  # just remove from to_be_rescheduled_request_id_set
+                if (
+                    request in self.waiting
+                ):  # after finished, this request still scheduled from preempted to waiting, unexpected error, should not be here
                     raise RuntimeError(f"request {request.request_id} scheduled into waiting list, after finished")
                 self.tasks_list[request.idx] = None
                 self.stop_flags[request.idx] = True
diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
index d9596b5f92..3c76b9a2c8 100644
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -392,7 +392,9 @@ def insert_tasks_v1(self, req_dicts: List[Request]):
                 prefill_end_index = request.prefill_end_index
                 length = prefill_end_index - prefill_start_index
                 input_ids = request.prompt_token_ids + request.output_token_ids
-                logger.debug(f"Handle prefill request {request} at idx {idx} prefill_start_index {prefill_start_index} prefill_end_index {prefill_end_index} need_prefilled_token_num {len(input_ids)}")
+                logger.debug(
+                    f"Handle prefill request {request} at idx {idx} prefill_start_index {prefill_start_index} prefill_end_index {prefill_end_index} need_prefilled_token_num {len(input_ids)}"
+                )
                 self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array(
                     input_ids[prefill_start_index:prefill_end_index]
                 )
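The xpu_model_runner change in this series also matters for decode-only steps: before the fix, insert_tasks_v1 raised `not_need_stop` only when a new prefill task arrived, so a batch carrying nothing but resumed decode work could stop the worker loop while requests were still generating. A self-contained toy restatement of that flag decision; `RequestType`, `Task`, and `is_block_step` here are stand-ins for FastDeploy internals, not its actual types:

```python
from dataclasses import dataclass
from enum import Enum


class RequestType(Enum):  # stand-in for FastDeploy's RequestType
    PREFILL = 0
    DECODE = 1


@dataclass
class Task:  # stand-in for a scheduled request
    idx: int
    task_type: RequestType


def keep_worker_running(tasks, is_block_step):
    """Mirrors the has_prefill_task/has_decode_task decision in insert_tasks_v1."""
    has_prefill_task = False
    has_decode_task = False
    for t in tasks:
        if t.task_type is RequestType.PREFILL:
            has_prefill_task = True
        elif t.task_type is RequestType.DECODE and is_block_step.get(t.idx, False):
            has_decode_task = True  # an in-flight decode must keep the loop alive
    return has_prefill_task or has_decode_task


# A decode-only batch: False before the fix (worker stopped early), True after it.
print(keep_worker_running([Task(idx=0, task_type=RequestType.DECODE)], {0: True}))
```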
From d627670b85b5ccc1e579c37ddae7a9ecd4296c88 Mon Sep 17 00:00:00 2001
From: iosmers
Date: Thu, 14 Aug 2025 10:54:44 +0000
Subject: [PATCH 3/3] update release note

---
 docs/get_started/installation/kunlunxin_xpu.md    | 14 +++++++-------
 docs/usage/kunlunxin_xpu_deployment.md            |  8 +++++++-
 docs/zh/get_started/installation/kunlunxin_xpu.md | 14 +++++++-------
 docs/zh/usage/kunlunxin_xpu_deployment.md         |  6 ++++++
 4 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/docs/get_started/installation/kunlunxin_xpu.md b/docs/get_started/installation/kunlunxin_xpu.md
index 39c1832ca3..4950347ce1 100644
--- a/docs/get_started/installation/kunlunxin_xpu.md
+++ b/docs/get_started/installation/kunlunxin_xpu.md
@@ -5,7 +5,7 @@
 - OS: Linux
 - Python: 3.10
 - XPU Model: P800
-- XPU Driver Version: ≥ 5.0.21.10
+- XPU Driver Version: ≥ 5.0.21.26
 - XPU Firmware Version: ≥ 1.31
 
 Verified platform:
@@ -15,7 +15,7 @@
 - OS: CentOS release 7.6 (Final)
 - Python: 3.10
 - XPU Model: P800 (OAM Edition)
-- XPU Driver Version: 5.0.21.10
+- XPU Driver Version: 5.0.21.26
 - XPU Firmware Version: 1.31
 
 **Note:** Currently, only INTEL or Hygon CPU-based P800 (OAM Edition) servers have been verified. Other CPU types and P800 (PCIe Edition) servers have not been tested yet.
@@ -25,9 +25,9 @@
 ```bash
 mkdir Work
 cd Work
-docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.3
+docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0
 docker run --name fastdeploy-xpu --net=host -itd --privileged -v $PWD:/Work -w /Work \
-  ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.3 \
+  ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0 \
   /bin/bash
 docker exec -it fastdeploy-xpu /bin/bash
 ```
@@ -37,7 +37,7 @@ docker exec -it fastdeploy-xpu /bin/bash
 ### Install PaddlePaddle
 
 ```bash
-python -m pip install paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
+python -m pip install paddlepaddle-xpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
 ```
 
 Alternatively, you can install the latest version of PaddlePaddle (Not recommended)
@@ -49,7 +49,7 @@ python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/
 ### Install FastDeploy (**Do NOT install via PyPI source**)
 
 ```bash
-python -m pip install fastdeploy-xpu==2.0.3 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+python -m pip install fastdeploy-xpu==2.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
 ```
 
 Alternatively, you can install the latest version of FastDeploy (Not recommended)
@@ -63,7 +63,7 @@ python -m pip install --pre fastdeploy-xpu -i https://www.paddlepaddle.org.cn/pa
 ### Install PaddlePaddle
 
 ```bash
-python -m pip install paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
+python -m pip install paddlepaddle-xpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
 ```
 
 Alternatively, you can install the latest version of PaddlePaddle (Not recommended)
diff --git a/docs/usage/kunlunxin_xpu_deployment.md b/docs/usage/kunlunxin_xpu_deployment.md
index 4eb7c70f87..455152d59c 100644
--- a/docs/usage/kunlunxin_xpu_deployment.md
+++ b/docs/usage/kunlunxin_xpu_deployment.md
@@ -5,8 +5,14 @@
 |ERNIE-4.5-300B-A47B|32K|WINT4|4 (recommend)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 4 \<br>--max-model-len 32768 \<br>--max-num-seqs 64 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|>=2.0.0|
 |ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 8 \<br>--max-model-len 32768 \<br>--max-num-seqs 64 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|>=2.0.0|
 |ERNIE-4.5-300B-A47B|128K|WINT4|8 (recommend)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 8 \<br>--max-model-len 131072 \<br>--max-num-seqs 64 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|>=2.0.0|
+|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9|>=2.1.0|
+|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9|>=2.1.0|
+|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|>=2.1.0|
+|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9|>=2.1.0|
+|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9|>=2.1.0|
+|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|>=2.1.0|
 |ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9|>=2.0.3|
-|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="x" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9|>=2.0.3|
+|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9|>=2.0.3|
 |ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9|>=2.0.3|
 |ERNIE-4.5-0.3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9|>=2.0.3|
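Every launch command in the deployment table above exposes an OpenAI-compatible endpoint on the chosen port, so a quick smoke test can reuse the stock `openai` Python client. A sketch assuming the `openai` package is installed and a server was started with one of the commands above; the `api_key` value is a dummy placeholder, assumed unused by the server:

```python
from openai import OpenAI

# Point the standard OpenAI client at the FastDeploy server launched above.
client = OpenAI(base_url="http://localhost:8188/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle",  # match the --model you served
    messages=[{"role": "user", "content": "Hello from the XPU deployment!"}],
    max_tokens=64,
)
print(resp.choices[0].message.content)
```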
diff --git a/docs/zh/get_started/installation/kunlunxin_xpu.md b/docs/zh/get_started/installation/kunlunxin_xpu.md
index 2e77dac4c5..e01d7c0f72 100644
--- a/docs/zh/get_started/installation/kunlunxin_xpu.md
+++ b/docs/zh/get_started/installation/kunlunxin_xpu.md
@@ -5,7 +5,7 @@
 - OS:Linux
 - Python:3.10
 - XPU 型号:P800
-- XPU 驱动版本:≥ 5.0.21.10
+- XPU 驱动版本:≥ 5.0.21.26
 - XPU 固件版本:≥ 1.31
 
 已验证的平台:
@@ -15,7 +15,7 @@
 - OS:CentOS release 7.6 (Final)
 - Python:3.10
 - XPU 型号:P800(OAM 版)
-- XPU 驱动版本:5.0.21.10
+- XPU 驱动版本:5.0.21.26
 - XPU 固件版本:1.31
 
 **注:** 目前只验证过 INTEL 或海光 CPU OAM 版 P800 服务器,暂未验证其它 CPU 和 PCIe 版 P800 服务器。
@@ -25,9 +25,9 @@
 ```bash
 mkdir Work
 cd Work
-docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.3
+docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0
 docker run --name fastdeploy-xpu --net=host -itd --privileged -v $PWD:/Work -w /Work \
-  ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.3 \
+  ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.1.0 \
   /bin/bash
 docker exec -it fastdeploy-xpu /bin/bash
 ```
@@ -37,7 +37,7 @@ docker exec -it fastdeploy-xpu /bin/bash
 ### 安装 PaddlePaddle
 
 ```bash
-python -m pip install paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
+python -m pip install paddlepaddle-xpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
 ```
 
 或者您也可以安装最新版 PaddlePaddle(不推荐)
@@ -49,7 +49,7 @@ python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/
 ### 安装 FastDeploy(**注意不要通过 pypi 源安装**)
 
 ```bash
-python -m pip install fastdeploy-xpu==2.0.3 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+python -m pip install fastdeploy-xpu==2.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
 ```
 
 或者你也可以安装最新版 FastDeploy(不推荐)
@@ -63,7 +63,7 @@ python -m pip install --pre fastdeploy-xpu -i https://www.paddlepaddle.org.cn/pa
 ### 安装 PaddlePaddle
 
 ```bash
-python -m pip install paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
+python -m pip install paddlepaddle-xpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
 ```
 
 或者您也可以安装最新版 PaddlePaddle(不推荐)
diff --git a/docs/zh/usage/kunlunxin_xpu_deployment.md b/docs/zh/usage/kunlunxin_xpu_deployment.md
index fa4501f5c8..aabfd14925 100644
--- a/docs/zh/usage/kunlunxin_xpu_deployment.md
+++ b/docs/zh/usage/kunlunxin_xpu_deployment.md
@@ -5,6 +5,12 @@
 |ERNIE-4.5-300B-A47B|32K|WINT4|4 (推荐)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 4 \<br>--max-model-len 32768 \<br>--max-num-seqs 64 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|>=2.0.0|
 |ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 8 \<br>--max-model-len 32768 \<br>--max-num-seqs 64 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|>=2.0.0|
 |ERNIE-4.5-300B-A47B|128K|WINT4|8 (推荐)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 8 \<br>--max-model-len 131072 \<br>--max-num-seqs 64 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|>=2.0.0|
+|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9|>=2.1.0|
+|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9|>=2.1.0|
+|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|>=2.1.0|
+|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9|>=2.1.0|
+|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9|>=2.1.0|
+|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|>=2.1.0|
 |ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9|>=2.0.3|
 |ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="x" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9|>=2.0.3|
 |ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9|>=2.0.3|