|
14 | 14 | from vllm.config import CacheConfig, LoRAConfig
|
15 | 15 | from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
16 | 16 | RowParallelLinear)
|
| 17 | +from vllm.model_executor.layers.pooler import Pooler, PoolingType |
17 | 18 | from vllm.model_executor.layers.quantization.base_config import (
|
18 | 19 | QuantizationConfig)
|
19 | 20 | from vllm.model_executor.model_loader.weight_utils import (
|
20 | 21 | default_weight_loader, maybe_remap_kv_scale_name)
|
21 |
| -from vllm.model_executor.layers.pooler import Pooler, PoolingType |
| 22 | +from vllm.model_executor.models.qwen2 import Qwen2Model |
22 | 23 | from vllm.model_executor.pooling_metadata import PoolingMetadata
|
23 |
| -from vllm.sequence import IntermediateTensors |
24 |
| -from vllm.sequence import PoolerOutput |
25 |
| - |
| 24 | +from vllm.sequence import IntermediateTensors, PoolerOutput |
26 | 25 |
|
27 | 26 | from .utils import is_pp_missing_parameter
|
28 |
| -from vllm.model_executor.models.qwen2 import Qwen2Model |
29 | 27 |
|
30 | 28 |
|
31 | 29 | class ReLU(nn.Module):
|
| 30 | + |
32 | 31 | def __init__(self):
|
33 | 32 | super().__init__()
|
34 | 33 | self.activation = nn.ReLU()
|
@@ -89,9 +88,12 @@ def __init__(
|
89 | 88 | self.model = Qwen2Model(config, cache_config, quant_config)
|
90 | 89 |
|
91 | 90 | self.score = nn.Sequential(
|
92 |
| - ColumnParallelLinear(config.hidden_size, config.hidden_size, quant_config=quant_config), |
| 91 | + ColumnParallelLinear(config.hidden_size, |
| 92 | + config.hidden_size, |
| 93 | + quant_config=quant_config), |
93 | 94 | ReLU(),
|
94 |
| - RowParallelLinear(config.hidden_size, 1, quant_config=quant_config), |
| 95 | + RowParallelLinear(config.hidden_size, 1, |
| 96 | + quant_config=quant_config), |
95 | 97 | )
|
96 | 98 | self._pooler = Pooler(pooling_type=PoolingType.ALL, normalize=False)
|
97 | 99 |
|
@@ -126,6 +128,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
|
126 | 128 | ]
|
127 | 129 | params_dict = dict(self.named_parameters(remove_duplicate=False))
|
128 | 130 | for name, loaded_weight in weights:
|
| 131 | + # Skip loading lm_head for embedding model |
129 | 132 | if name == "lm_head.weight":
|
130 | 133 | continue
|
131 | 134 | if "rotary_emb.inv_freq" in name:
|
|
0 commit comments