|
14 | 14 | from vllm.config import CacheConfig, LoRAConfig |
15 | 15 | from vllm.model_executor.layers.linear import (ColumnParallelLinear, |
16 | 16 | RowParallelLinear) |
| 17 | +from vllm.model_executor.layers.pooler import Pooler, PoolingType |
17 | 18 | from vllm.model_executor.layers.quantization.base_config import ( |
18 | 19 | QuantizationConfig) |
19 | 20 | from vllm.model_executor.model_loader.weight_utils import ( |
20 | 21 | default_weight_loader, maybe_remap_kv_scale_name) |
21 | | -from vllm.model_executor.layers.pooler import Pooler, PoolingType |
| 22 | +from vllm.model_executor.models.qwen2 import Qwen2Model |
22 | 23 | from vllm.model_executor.pooling_metadata import PoolingMetadata |
23 | | -from vllm.sequence import IntermediateTensors |
24 | | -from vllm.sequence import PoolerOutput |
25 | | - |
| 24 | +from vllm.sequence import IntermediateTensors, PoolerOutput |
26 | 25 |
|
27 | 26 | from .utils import is_pp_missing_parameter |
28 | | -from vllm.model_executor.models.qwen2 import Qwen2Model |
29 | 27 |
|
30 | 28 |
|
31 | 29 | class ReLU(nn.Module): |
| 30 | + |
32 | 31 | def __init__(self): |
33 | 32 | super().__init__() |
34 | 33 | self.activation = nn.ReLU() |
@@ -89,9 +88,12 @@ def __init__( |
89 | 88 | self.model = Qwen2Model(config, cache_config, quant_config) |
90 | 89 |
|
91 | 90 | self.score = nn.Sequential( |
92 | | - ColumnParallelLinear(config.hidden_size, config.hidden_size, quant_config=quant_config), |
| 91 | + ColumnParallelLinear(config.hidden_size, |
| 92 | + config.hidden_size, |
| 93 | + quant_config=quant_config), |
93 | 94 | ReLU(), |
94 | | - RowParallelLinear(config.hidden_size, 1, quant_config=quant_config), |
| 95 | + RowParallelLinear(config.hidden_size, 1, |
| 96 | + quant_config=quant_config), |
95 | 97 | ) |
96 | 98 | self._pooler = Pooler(pooling_type=PoolingType.ALL, normalize=False) |
97 | 99 |
|
@@ -126,6 +128,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): |
126 | 128 | ] |
127 | 129 | params_dict = dict(self.named_parameters(remove_duplicate=False)) |
128 | 130 | for name, loaded_weight in weights: |
| 131 | + # Skip loading lm_head for embedding model |
129 | 132 | if name == "lm_head.weight": |
130 | 133 | continue |
131 | 134 | if "rotary_emb.inv_freq" in name: |
|
0 commit comments