
add support for multinode tp #751


Merged
52 commits merged on Mar 1, 2025
Commits (52)
65428a1
supporting multinode
jayfeather9 Feb 16, 2025
a26d48d
fix format
jayfeather9 Feb 16, 2025
64ccb4f
add cuda() with device id
jayfeather9 Feb 16, 2025
b3de424
Merge branch 'main' into multinode
Feb 18, 2025
3fd6e48
fix multinode abort
shihaobai Feb 20, 2025
c0b1146
support chunked prefill
shihaobai Feb 20, 2025
855e037
Merge branch 'main' into multinode
Feb 20, 2025
8ad585a
modify dist_utils & remove child_ips
Feb 21, 2025
e0844d3
Merge branch 'multinode' of https://github.com/ModelTC/lightllm into …
Feb 21, 2025
fa3f826
fix chunked_prefill for multinode
shihaobai Feb 23, 2025
7e92ff6
Merge branch 'multinode' of https://github.com/ModelTC/lightllm into …
shihaobai Feb 23, 2025
4a83a6a
merge main
shihaobai Feb 23, 2025
461ec65
fix health
shihaobai Feb 23, 2025
b15a487
update port
shihaobai Feb 24, 2025
4e419df
Adjust rank configuration.
hiworldwzj Feb 26, 2025
8237f19
refactor multinode
shihaobai Feb 27, 2025
b55edd7
Merge branch 'main' into multinode
shihaobai Feb 27, 2025
958b83d
fix get_dp_size
shihaobai Feb 27, 2025
fffb99e
remove tp_rank of get_available_gpu_memory
shihaobai Feb 27, 2025
ecd495c
fix chunked prefill
shihaobai Feb 27, 2025
82a756f
fix dist_utils
shihaobai Feb 27, 2025
814f095
multinode utils
shihaobai Feb 27, 2025
b33b3b4
update router multinode manager
shihaobai Feb 27, 2025
296a579
fix chunked prefill
shihaobai Feb 27, 2025
2afd14b
reformat
shihaobai Feb 27, 2025
7061bfb
fix
hiworldwzj Feb 27, 2025
0bde847
refactor order
shihaobai Feb 28, 2025
39b90bf
fix
shihaobai Feb 28, 2025
429f9c3
update httpserver sync
shihaobai Feb 28, 2025
4377c20
update
shihaobai Feb 28, 2025
4abb4a1
remove cudagraph_step_length
shihaobai Feb 28, 2025
7646d6e
modify the default value of current_waiting_num
shihaobai Feb 28, 2025
8446544
update
shihaobai Feb 28, 2025
877b98f
fix
shihaobai Feb 28, 2025
8ff7ed6
fix visualserver
shihaobai Feb 28, 2025
249dea7
fix
shihaobai Feb 28, 2025
64bdc11
update mem_manager
shihaobai Feb 28, 2025
d68e0d7
fix start rank params.
hiworldwzj Feb 28, 2025
2461069
fix
shihaobai Feb 28, 2025
b3dbecd
fix
hiworldwzj Mar 1, 2025
ea4dc98
fix
hiworldwzj Mar 1, 2025
1cffab4
fix
hiworldwzj Mar 1, 2025
4f68164
update docs
shihaobai Mar 1, 2025
2b3f07a
fix
hiworldwzj Mar 1, 2025
068663a
fix
hiworldwzj Mar 1, 2025
d14fcea
fix
hiworldwzj Mar 1, 2025
46dbbb2
fix
shihaobai Mar 1, 2025
72f4eb3
fix
hiworldwzj Mar 1, 2025
a640f72
fix
hiworldwzj Mar 1, 2025
5cd5dbf
fix
hiworldwzj Mar 1, 2025
5d13dc8
fix
hiworldwzj Mar 1, 2025
ce4d3eb
reformat
shihaobai Mar 1, 2025
23 changes: 23 additions & 0 deletions docs/CN/source/getting_started/quickstart.rst
@@ -56,6 +56,22 @@
.. note::
The ``--model_dir`` parameter in the command above should be changed to the actual model path on your machine.

To deploy the DeepSeek-R1 model on a single H200 node, use the following launch command:

.. code-block:: console

$ LOADWORKER=8 python -m lightllm.server.api_server --model_dir ~/models/DeepSeek-R1 --tp 8 --graph_max_batch_size 100

.. note::
LOADWORKER specifies the number of threads for model loading, which can speed up model loading. --graph_max_batch_size specifies the number of CUDA graphs to be captured, covering batch sizes from 1 to 100.

To deploy the DeepSeek-R1 model across two H100 nodes, use the following launch commands:

.. code-block:: console

$ # Node 0
$ LOADWORKER=8 python -m lightllm.server.api_server --model_dir ~/models/DeepSeek-R1 --tp 16 --graph_max_batch_size 100 --nccl_host master_addr --nnodes 2 --node_rank 0
$ # Node 1
$ LOADWORKER=8 python -m lightllm.server.api_server --model_dir ~/models/DeepSeek-R1 --tp 16 --graph_max_batch_size 100 --nccl_host master_addr --nnodes 2 --node_rank 1

3. (Optional) Test the model service
-------------------------
@@ -75,3 +91,10 @@
$ }'


For the DeepSeek-R1 model, you can run a test with the following script:

.. code-block:: console

$ cd test
$ python benchmark_client.py --num_clients 100 --input_num 2000 --tokenizer_path /nvme/DeepSeek-R1/ --url http://127.0.0.1:8000/generate_stream

10 changes: 9 additions & 1 deletion docs/EN/source/getting_started/quickstart.rst
@@ -53,7 +53,7 @@ After downloading the Llama-2-7b-chat model, use the following command in the te
.. note::
The ``--model_dir`` parameter in the above command should be changed to the actual path of your model on your machine.

For the DeepSeek-R1 model on H200, it can be launched with the following command:
For the DeepSeek-R1 model on a single H200, it can be launched with the following command:

.. code-block:: console

@@ -62,6 +62,14 @@ For the DeepSeek-R1 model on H200, it can be launched with the following command
.. note::
LOADWORKER specifies the number of threads for model loading, which can speed up model loading. The --graph_max_batch_size parameter specifies the number of CUDA graphs to be captured, covering batch sizes from 1 to 100.

For the DeepSeek-R1 model on two H100 nodes, it can be launched with the following commands:

.. code-block:: console

$ # Node 0
$ LOADWORKER=8 python -m lightllm.server.api_server --model_dir ~/models/DeepSeek-R1 --tp 16 --graph_max_batch_size 100 --nccl_host master_addr --nnodes 2 --node_rank 0
$ # Node 1
$ LOADWORKER=8 python -m lightllm.server.api_server --model_dir ~/models/DeepSeek-R1 --tp 16 --graph_max_batch_size 100 --nccl_host master_addr --nnodes 2 --node_rank 1


3. (Optional) Test the Model Service
--------------------------------------
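For a quick scripted check of a running deployment (instead of the curl/benchmark commands above), a minimal streaming client could look like the sketch below. The endpoint path comes from the docs in this PR; the payload fields (`inputs`, `parameters`) and their values are assumptions modeled on lightllm's TGI-style API and may need adjusting to your deployment.

```python
# Minimal streaming test client (sketch, not part of this PR).
# Assumption: the server accepts a TGI-style JSON body with "inputs"
# and "parameters"; adjust the fields to match your deployment.
import requests

URL = "http://127.0.0.1:8000/generate_stream"  # point this at node 0

payload = {
    "inputs": "What is tensor parallelism?",
    "parameters": {"max_new_tokens": 64, "do_sample": False},
}

with requests.post(URL, json=payload, stream=True, timeout=300) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        if line:
            # Print each streamed chunk as-is rather than assuming its schema.
            print(line.decode("utf-8"))
```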
3 changes: 2 additions & 1 deletion lightllm/common/basemodel/layer_weights/base_layer_weight.py
@@ -2,6 +2,7 @@
import numpy as np
import threading
from lightllm.common.basemodel.layer_weights.meta_weights import BaseWeight
from lightllm.utils.dist_utils import get_current_device_id


class BaseLayerWeight:
@@ -37,4 +38,4 @@ def _cuda(self, cpu_tensor):
if self.tp_rank_ is None:
return cpu_tensor.contiguous().to(self.data_type_).cuda()
else:
return cpu_tensor.contiguous().to(self.data_type_).cuda(self.tp_rank_)
return cpu_tensor.contiguous().to(self.data_type_).cuda(get_current_device_id())
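The hunk above shows the pattern repeated throughout this PR: `.cuda(self.tp_rank_)` becomes `.cuda(get_current_device_id())`, because with `--nnodes > 1` the global TP rank no longer equals the local GPU index. The sketch below illustrates that mapping; it is not lightllm's actual implementation (the real helper lives in `lightllm/utils/dist_utils.py`).

```python
# Sketch of mapping a global rank to a local CUDA device id in multinode TP.
# Hypothetical helper for illustration only; lightllm's real logic may differ.
import os
from typing import Optional

import torch


def current_device_id(global_rank: int, gpus_per_node: Optional[int] = None) -> int:
    """Return the GPU index to use on this node for a given global rank."""
    if gpus_per_node is None:
        gpus_per_node = torch.cuda.device_count() or 1
    # With 2 nodes x 8 GPUs, global ranks 8..15 must map to devices 0..7 on
    # the second node, so a plain `.cuda(rank)` would be out of range there.
    return global_rank % gpus_per_node


if __name__ == "__main__":
    rank = int(os.environ.get("RANK", "0"))
    print(f"global rank {rank} -> cuda:{current_device_id(rank)}")
```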
4 changes: 2 additions & 2 deletions lightllm/common/basemodel/layer_weights/hf_load_utils.py
@@ -3,14 +3,14 @@
import gc
from safetensors import safe_open
import lightllm.utils.petrel_helper as utils
from lightllm.utils.dist_utils import get_current_device_id


def load_func(file_, use_safetensors=False, pre_post_layer=None, transformer_layer_list=None, weight_dir=None):
# fix: during multi-threaded loading, the CUDA device inside each thread falls back to 0; set it explicitly so this bug cannot occur
import torch.distributed as dist

tp_rank = dist.get_rank()
torch.cuda.set_device(tp_rank)
torch.cuda.set_device(get_current_device_id())

if use_safetensors:
weights = safe_open(os.path.join(weight_dir, file_), "pt", "cpu")
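The comment in `load_func` above describes the underlying pitfall: threads spawned for weight loading do not inherit the caller's current CUDA device and default to device 0, so each worker thread must re-pin the device itself. A standalone illustration of that pattern follows (assuming a multi-GPU machine; the file names and device id are made up):

```python
# Sketch: each loader thread must call torch.cuda.set_device itself,
# otherwise tensors it creates land on cuda:0 regardless of the main thread.
from concurrent.futures import ThreadPoolExecutor

import torch


def load_one_file(path: str, device_id: int) -> str:
    # Re-pin the device inside the worker thread; the main thread's
    # torch.cuda.set_device() does not carry over to new threads.
    torch.cuda.set_device(device_id)
    tensor = torch.empty(1, device="cuda")  # stand-in for real weight loading
    return f"{path}: loaded on {tensor.device}"


if __name__ == "__main__":
    device_id = 1  # hypothetical local GPU (requires at least 2 GPUs)
    files = ["model-00001.safetensors", "model-00002.safetensors"]
    with ThreadPoolExecutor(max_workers=len(files)) as pool:
        for msg in pool.map(load_one_file, files, [device_id] * len(files)):
            print(msg)
```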
lightllm/common/basemodel/layer_weights/meta_weights/base_weight.py
@@ -1,7 +1,6 @@
import torch
from abc import ABC, abstractmethod
from lightllm.utils.dist_utils import get_world_size, get_rank
from lightllm.utils.device_utils import get_current_device_id
from lightllm.utils.dist_utils import get_global_world_size, get_global_rank, get_current_device_id


class BaseWeight(ABC):
@@ -19,8 +18,8 @@ def verify_load(self):

class BaseWeightTpl(BaseWeight):
def __init__(self):
self.world_size_ = get_world_size()
self.tp_rank_ = get_rank()
self.world_size_ = get_global_world_size()
self.tp_rank_ = get_global_rank()
self.device_id_ = get_current_device_id()

def load_hf_weights(self, weights):
lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight.py
@@ -5,9 +5,8 @@
from .base_weight import BaseWeight
from lightllm.common.quantization import vLLMFP8w8a8QuantizationMethod
from lightllm.common.quantization.quantize_method import QuantizationMethod
from lightllm.utils.dist_utils import get_world_size, get_rank
from lightllm.utils.dist_utils import get_global_world_size, get_global_rank, get_current_device_id
from lightllm.common.vllm_kernel import _custom_ops as ops
from lightllm.utils.device_utils import get_current_device_id


class FusedMoeWeight(BaseWeight):
@@ -39,7 +38,7 @@ def __init__(
self.n_routed_experts = n_routed_experts
self.split_inter_size = split_inter_size
self.data_type_ = data_type
self.tp_rank_ = get_rank()
self.tp_rank_ = get_global_rank()
self.experts_up_projs = [None] * self.n_routed_experts
self.experts_gate_projs = [None] * self.n_routed_experts
self.experts_up_proj_scales = [None] * self.n_routed_experts
@@ -159,7 +158,7 @@ def _fuse_weight_scale(self):
delattr(self, "experts_gate_proj_scales")

def _load_hf_weights_etp(self, weights):
world_size_ = get_world_size()
world_size_ = get_global_world_size()
assert self.n_routed_experts % world_size_ == 0
n_expert_ep = self.n_routed_experts // world_size_

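`_load_hf_weights_etp` above divides the routed experts evenly across the global world size, which in the multinode case spans all nodes. A tiny standalone sketch of the slice arithmetic (expert and rank counts are examples, not lightllm defaults):

```python
# Sketch: evenly partitioning routed experts across the global world size,
# as in expert-parallel weight loading. Counts below are illustrative only.
from typing import List


def expert_ids_for_rank(n_routed_experts: int, world_size: int, rank: int) -> List[int]:
    assert n_routed_experts % world_size == 0, "experts must divide evenly across ranks"
    n_expert_ep = n_routed_experts // world_size
    start = rank * n_expert_ep
    return list(range(start, start + n_expert_ep))


if __name__ == "__main__":
    # e.g. 256 experts over 2 nodes x 8 GPUs = 16 global ranks -> 16 experts each
    for rank in (0, 1, 15):
        print(rank, expert_ids_for_rank(256, 16, rank))
```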
36 changes: 17 additions & 19 deletions lightllm/common/basemodel/layer_weights/meta_weights/mm_weight.py
@@ -4,6 +4,7 @@
from typing import Optional, Tuple, List, Dict, Any
from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
from lightllm.common.quantization.quantize_method import QuantizationMethod
from lightllm.utils.dist_utils import get_current_device_id


def generate_scale_name(name, weight_scale_suffix, act_scale_suffix):
@@ -73,20 +74,17 @@ def _post_load_weights(self) -> None:
and (not self.static_activation or self.input_scale is not None)
):
if self.weight_scale.ndim > 1:
# make the k dim more contiguous; most split-k kernels are likely to run faster
self.weight_scale = self.weight_scale.cuda(self.device_id_).transpose(0, 1)
self.weight_scale = self.weight_scale.transpose(0, 1).cuda(get_current_device_id())
self.weight = [
# make the k dim more contiguous; most split-k kernels are likely to run faster
self.weight.cuda(self.device_id_).transpose(0, 1),
self.weight.cuda(get_current_device_id()).transpose(0, 1),
self.weight_scale,
self.input_scale,
]
else:
self.weight = self.quant_method.quantize(self.weight.to(self.data_type_).cuda(self.device_id_))
self.weight = self.quant_method.quantize(self.weight.to(self.data_type_).cuda(get_current_device_id()))
return

# make the k dim more contiguous; most split-k kernels are likely to run faster
self.weight = self.weight.to(self.data_type_).cuda(self.device_id_).transpose(0, 1)
self.weight = self.weight.to(self.data_type_).cuda(get_current_device_id()).transpose(0, 1)


class MMWeight(MMWeightTpl):
@@ -133,7 +131,7 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]) -> None:
self.weight = weight[self.start : self.end]
if self.bias_name in weights:
bias = weights[self.bias_name].to(self.data_type_)[self.start : self.end]
self.bias = bias.cuda(self.device_id_)
self.bias = bias.cuda(get_current_device_id())

if self.weight_scale_name is not None and self.weight_scale_name in weights:
block_size = 1
@@ -154,7 +152,7 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]) -> None:

if self.act_scale_name is not None and self.act_scale_name in weights:
input_scale = weights[self.act_scale_name].to(torch.float)
self.input_scale = input_scale.cuda()
self.input_scale = input_scale.cuda(get_current_device_id())

if weight is None and weight_scale is None and input_scale is None:
return
@@ -198,7 +196,7 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]) -> None:
self.weight = weight[:, self.start : self.end]
if self.bias_name in weights:
bias = weights[self.bias_name]
self.bias = (bias / self.world_size_).to(self.data_type_).cuda(self.device_id_)
self.bias = (bias / self.world_size_).to(self.data_type_).cuda(get_current_device_id())

if self.quantized_weight and self.weight_scale_name in weights:
block_size = 1
@@ -216,7 +214,7 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]) -> None:

if self.static_activation and self.act_scale_name in weights:
input_scale = weights[self.act_scale_name].to(torch.float)
self.input_scale = input_scale.cuda()
self.input_scale = input_scale.cuda(get_current_device_id())

if weight is None and weight_scale is None and input_scale is None:
return
@@ -294,19 +292,19 @@ def _fuse(self) -> None:
delattr(self, "weights")

if self.weight_scale is None and (None not in self.weight_scales):
self.weight_scale = torch.cat(self.weight_scales, dim=0).cuda()
self.weight_scale = torch.cat(self.weight_scales, dim=0).cuda(get_current_device_id())
self._post_load_weights()
delattr(self, "weight_scales")

if self.static_activation and self.input_scale is None and (None not in self.input_scales):
input_scales = torch.stack(self.input_scales, dim=0)
self.input_scale = torch.max(input_scales).cuda()
self.input_scale = torch.max(input_scales).cuda(get_current_device_id())
self._post_load_weights()
delattr(self, "input_scales")

if self.has_bias:
if self.bias is None and (None not in self.biases):
self.bias = torch.cat(self.biases, dim=0).cuda(self.device_id_)
self.bias = torch.cat(self.biases, dim=0).cuda(get_current_device_id())
delattr(self, "biases")
return self

@@ -449,10 +447,10 @@ def _post_load_weights(self) -> None:
and (not self.static_activation or self.input_scale is not None)
):
if self.weight_scale.ndim > 1:
self.weight_scale = self.weight_scale.cuda(self.device_id_)
self.weight = [self.weight.cuda(self.device_id_), self.weight_scale, self.input_scale]
self.weight_scale = self.weight_scale.cuda(get_current_device_id())
self.weight = [self.weight.cuda(get_current_device_id()), self.weight_scale, self.input_scale]
return
self.weight = self.weight.cuda(self.device_id_)
self.weight = self.weight.cuda(get_current_device_id())


class BMMWeight(BMMWeightTpl):
@@ -518,7 +516,7 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]) -> None:
self.weight = weight[self.start : self.end]
if self.bias_name in weights:
bias = weights[self.bias_name].to(self.data_type_)[self.start : self.end]
self.bias = bias.cuda(self.device_id_)
self.bias = bias.cuda(get_current_device_id())

if self.weight_scale_name is not None and self.weight_scale_name in weights:
weight_scale = weights[self.weight_scale_name]
@@ -532,7 +530,7 @@ def load_hf_weights(self, weights: Dict[str, torch.Tensor]) -> None:

if self.act_scale_name is not None and self.act_scale_name in weights:
input_scale = weights[self.act_scale_name].to(torch.float)
self.input_scale = input_scale.cuda()
self.input_scale = input_scale.cuda(get_current_device_id())

if weight is None and weight_scale is None and input_scale is None:
return
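Several hunks above keep the `transpose(0, 1)` that follows the comment about making the k dim more contiguous: a row-major `[n, k]` weight, once transposed, exposes k with stride 1, which split-k GEMM kernels generally prefer. The stride effect can be checked directly (illustration, not the PR's code):

```python
# Sketch: after transpose(0, 1), the k dimension of a row-major [n, k] weight
# becomes the stride-1 (contiguous, fast-moving) dimension.
import torch

n, k = 4, 8
w = torch.arange(n * k, dtype=torch.float32).reshape(n, k)  # contiguous [n, k]

print(w.shape, w.stride())    # torch.Size([4, 8]) (8, 1): k is dim 1 with stride 1
wt = w.transpose(0, 1)        # view with shape [k, n], no data copy
print(wt.shape, wt.stride())  # torch.Size([8, 4]) (1, 8): k is now dim 0 with stride 1

# A kernel that splits its work along k therefore walks memory contiguously.
```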
lightllm/common/basemodel/layer_weights/meta_weights/norm_weight.py
@@ -1,5 +1,6 @@
import torch
from .base_weight import BaseWeightTpl
from lightllm.utils.dist_utils import get_current_device_id


class NormWeight(BaseWeightTpl):
@@ -13,9 +14,9 @@ def __init__(self, weight_name, data_type, bias_name=None):

def load_hf_weights(self, weights):
if self.weight_name in weights:
self.weight = weights[self.weight_name].to(self.data_type_).cuda(self.device_id_)
self.weight = weights[self.weight_name].to(self.data_type_).cuda(get_current_device_id())
if self.bias_name in weights:
self.bias = weights[self.bias_name].to(self.data_type_).cuda(self.device_id_)
self.bias = weights[self.bias_name].to(self.data_type_).cuda(get_current_device_id())

def verify_load(self):
load_ok = True
@@ -33,7 +34,7 @@ def __init__(self, weight_name, data_type, bias_name=None):

def load_hf_weights(self, weights):
if self.weight_name in weights:
self.weight = (weights[self.weight_name] + 1).to(self.data_type_).cuda(self.device_id_)
self.weight = (weights[self.weight_name] + 1).to(self.data_type_).cuda(get_current_device_id())


class TpNormWeight(NormWeight):
Expand All @@ -46,6 +47,6 @@ def load_hf_weights(self, weights):
end = self.split_n_embed * (self.tp_rank_ + 1)

if self.weight_name in weights:
self.weight = weights[self.weight_name][start:end].to(self.data_type_).cuda(self.device_id_)
self.weight = weights[self.weight_name][start:end].to(self.data_type_).cuda(get_current_device_id())
if self.bias_name in weights:
self.bias = weights[self.bias_name][start:end].to(self.data_type_).cuda(self.device_id_)
self.bias = weights[self.bias_name][start:end].to(self.data_type_).cuda(get_current_device_id())