From 2980d3cfe41d532fee0353f062b2bbc1394faa1c Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 23 Jun 2024 01:05:03 -0700 Subject: [PATCH 01/15] add chameleon --- vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/chameleon.py | 410 +++++++++++++++++++ vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/chameleon.py | 98 +++++ 4 files changed, 511 insertions(+) create mode 100644 vllm/model_executor/models/chameleon.py create mode 100644 vllm/transformers_utils/configs/chameleon.py diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 5afb2e1d44d3..41d71fbf5823 100755 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -15,6 +15,7 @@ "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b "BloomForCausalLM": ("bloom", "BloomForCausalLM"), + "ChameleonForCausalLM": ("chameleon", "ChameleonForCausalLM"), "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"), "CohereForCausalLM": ("commandr", "CohereForCausalLM"), diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py new file mode 100644 index 000000000000..c23bfadb27bd --- /dev/null +++ b/vllm/model_executor/models/chameleon.py @@ -0,0 +1,410 @@ +from typing import Any, Dict, Iterable, List, Optional, Tuple + +import torch +from torch import nn + +from vllm.attention import Attention, AttentionMetadata +from vllm.config import CacheConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import SamplerOutput +from vllm.transformers_utils.configs import ChameleonConfig +from vllm.utils import print_warning_once + + +# Copied from vllm.model_executor.models.llama.LlamaMLP -> ChameleonMLP +class ChameleonMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + input_size=hidden_size, + output_sizes=[intermediate_size] * 2, + bias=bias, + quant_config=quant_config) + self.down_proj = RowParallelLinear(input_size=intermediate_size, + output_size=hidden_size, + bias=bias, + quant_config=quant_config) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +# Modified from vllm.model_executor.models.llama.LlamaAttention -> ChameleonAttention #noqa +class ChameleonAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 4096, + qk_layernorm: bool = False, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + cache_config: Optional[CacheConfig] = None, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + self.qk_layernorm = qk_layernorm + + self.qkv_proj = QKVParallelLinear( + hidden_size=hidden_size, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_kv_heads, + bias=bias, + quant_config=quant_config, + ) + self.o_proj = RowParallelLinear( + input_size=self.total_num_heads * self.head_dim, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + + if self.qk_layernorm: + self.q_norm = nn.LayerNorm(self.head_dim) + self.k_norm = nn.LayerNorm(self.head_dim) + + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + if self.qk_layernorm: + q = self.q_norm(q) + k = self.k_norm(k) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class ChameleonDecoderLayer(nn.Module): + + def __init__( + self, + config: ChameleonConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + if rope_scaling is not None and getattr( + config, "original_max_position_embeddings", None): + rope_scaling["original_max_position_embeddings"] = ( + config.original_max_position_embeddings) + max_position_embeddings = getattr(config, "max_position_embeddings", + 4096) + + self.self_attn = ChameleonAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=getattr(config, "num_key_value_heads", + config.num_attention_heads), + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + bias=False, + cache_config=cache_config, + ) + self.mlp = ChameleonMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + bias=getattr(config, "mlp_bias", False), + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.swin_norm = config.swin_norm + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + + if self.swin_norm: + if residual is None: + residual = hidden_states + + # Self Attention + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + + # Fully Connected + hidden_states = self.mlp(hidden_states) + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + + else: # No swin norm + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + + return hidden_states, residual + + +class ChameleonModel(nn.Module): + + def __init__( + self, + config: ChameleonConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + ChameleonDecoderLayer(config=config, + cache_config=cache_config, + quant_config=quant_config) + for _ in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + # TODO: Support image input + # self.vqmodel = ChameleonVQModel(config.vq_config) + # self.vocabulary_mapping = ChameleonImageVocabularyMapping(config.vocabulary_map) #noqa + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + attn_metadata, + residual, + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class ChameleonForCausalLM(nn.Module): + + def __init__( + self, + config: ChameleonConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.config = config + self.model = ChameleonModel(config, cache_config, quant_config) + self.unpadded_vocab_size = config.vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + ) + if config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, logit_scale) + self.sampler = Sampler() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + attn_metadata) + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head.weight, hidden_states, + sampling_metadata) + + # TODO: update logits for image tokens + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + # Skip loading vqgan + # TODO: add support for the vision model + if "vqmodel" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Remapping the name of FP8 kv-scale. + if name.endswith("kv_scale"): + remapped_kv_scale_name = name.replace( + ".kv_scale", ".attn.kv_scale") + if remapped_kv_scale_name not in params_dict: + print_warning_once( + f"Found kv scale in the checkpoint (e.g. {name}), " + "but not found the expected name in the model " + f"(e.g. {remapped_kv_scale_name}). kv-scale is " + "not loaded.") + continue + else: + name = remapped_kv_scale_name + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index d8170858c2a9..63f7510ec66f 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,3 +1,4 @@ +from vllm.transformers_utils.configs.chameleon import ChameleonConfig from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.dbrx import DbrxConfig # RWConfig is for the original tiiuae/falcon-40b(-instruct) and @@ -9,6 +10,7 @@ from vllm.transformers_utils.configs.mpt import MPTConfig __all__ = [ + "ChameleonConfig", "ChatGLMConfig", "DbrxConfig", "MPTConfig", diff --git a/vllm/transformers_utils/configs/chameleon.py b/vllm/transformers_utils/configs/chameleon.py new file mode 100644 index 000000000000..417d9c4fcc73 --- /dev/null +++ b/vllm/transformers_utils/configs/chameleon.py @@ -0,0 +1,98 @@ +from transformers import PretrainedConfig + + +class ChameleonConfig(PretrainedConfig): + + model_type = "chameleon" + is_composition = True + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=65536, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=32, + hidden_act="silu", + max_position_embeddings=4096, + initializer_range=0.02, + rms_norm_eps=1e-05, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + qk_layernorm=False, + swin_norm=False, + vq_config=None, + vocabulary_map=None, + mlp_bias=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.mlp_bias = mlp_bias + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.qk_layernorm = qk_layernorm + self.swin_norm = swin_norm + # vq config is currently ignored + # self.vq_config = ChameleonVQConfig(**vq_config) + self.vocabulary_map = vocabulary_map + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, + dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with with two fields, " + f"`type` and `factor`, got {self.rope_scaling}") + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in [ + "linear", "dynamic" + ]: + raise ValueError( + "`rope_scaling`'s type field must be one of ['linear', " + f"'dynamic'], got {rope_scaling_type}") + if rope_scaling_factor is None or not isinstance( + rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError( + "`rope_scaling`'s factor field must be a float > 1, " + f"got {rope_scaling_factor}") From 53eec672d77bba8b12de40ac51e988c3161c923d Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 23 Jun 2024 01:21:10 -0700 Subject: [PATCH 02/15] fix attn input --- vllm/model_executor/models/chameleon.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index c23bfadb27bd..a5093d5ab9ff 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -178,6 +178,7 @@ def __init__( rope_theta=rope_theta, rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, + qk_layernorm=config.qk_layernorm, quant_config=quant_config, bias=False, cache_config=cache_config, From e2d255563a52b4499fe827f76c81becf46a08296 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 8 Jul 2024 00:30:44 -0700 Subject: [PATCH 03/15] update chameleon --- vllm/model_executor/models/chameleon.py | 82 ++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index a5093d5ab9ff..e41a40f8fe24 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -1,3 +1,4 @@ +from functools import cached_property from typing import Any, Dict, Iterable, List, Optional, Tuple import torch @@ -132,6 +133,17 @@ def __init__( cache_config=cache_config, quant_config=quant_config) + def _apply_qk_norm(self, q: torch.Tensor, + k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + # reshape for layernorm + q = q.view(*q.shape[:-1], -1, self.head_dim) + k = k.view(*k.shape[:-1], -1, self.head_dim) + q, _ = self.q_norm(q) + k, _ = self.k_norm(k) + q = q.view(*q.shape[:-2], -1) + k = k.view(*k.shape[:-2], -1) + return q, k + def forward( self, positions: torch.Tensor, @@ -143,8 +155,9 @@ def forward( q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) if self.qk_layernorm: - q = self.q_norm(q) - k = self.k_norm(k) + # reshape for layernorm + q, k = self._apply_qk_norm(q, k) + q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v, kv_cache, attn_metadata) output, _ = self.o_proj(attn_output) @@ -246,6 +259,62 @@ def forward( return hidden_states, residual +class ChameleonImageVocabularyMapping: + """ + A class for mapping discrete image tokens from VQGAN to BPE tokens. + """ + + def __init__(self, vocab_map): + self.vocab_map = vocab_map + self.image_token_id = vocab_map.get("") + + @cached_property + def val2name(self): + return {v: k for k, v in self.vocab_map.items()} + + @cached_property + def image_tokens(self): + return sorted([ + val for name, val in self.vocab_map.items() + if name.startswith("IMGIMG") + ]) + + @cached_property + def bpe2img(self): + img_tkn_chr_mapping = {chr(ord("A") + i): str(i) for i in range(10)} + + def remap(old_name: str) -> str: + return "".join( + img_tkn_chr_mapping.get(c, c) + for c in old_name[len("IMGIMG"):-1]) + + return { + tok: int(remap(self.val2name[tok])) + for tok in self.image_tokens + } + + @cached_property + def img2bpe(self): + return {v: k for k, v in self.bpe2img.items()} + + @cached_property + def bpe2img_search_tensors(self): + return torch.tensor(sorted(self.bpe2img.keys())), torch.tensor( + sorted(self.bpe2img.values())) + + @cached_property + def img2bpe_mapping_tensor(self): + mapping = torch.zeros(max(self.img2bpe.keys()) + 1, dtype=torch.int) + for k, v in self.img2bpe.items(): + mapping[k] = v + return mapping + + def convert_img2bpe(self, img_batch: torch.Tensor) -> torch.Tensor: + device = img_batch.device + img_tokens = self.img2bpe_mapping_tensor[img_batch.to("cpu")] + return img_tokens.to(device) + + class ChameleonModel(nn.Module): def __init__( @@ -262,6 +331,8 @@ def __init__( self.vocab_size, config.hidden_size, ) + self.vocabulary_mapping = ChameleonImageVocabularyMapping( + config.vocabulary_map) self.layers = nn.ModuleList([ ChameleonDecoderLayer(config=config, cache_config=cache_config, @@ -334,6 +405,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, attn_metadata) return hidden_states @@ -343,7 +415,11 @@ def compute_logits(self, hidden_states: torch.Tensor, logits = self.logits_processor(self.lm_head.weight, hidden_states, sampling_metadata) - # TODO: update logits for image tokens + # Disallow image tokens which does not include special + # begin-image and end-image tokens + image_tokens = self.model.vocabulary_mapping.image_tokens + logits[:, image_tokens] = torch.finfo(logits.dtype).min + return logits def sample( From ecce4458f744309334b5fde57f7df77084e42702 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 8 Jul 2024 00:59:08 -0700 Subject: [PATCH 04/15] add config to registry --- vllm/transformers_utils/config.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 5e2fe116db9c..8b3981f07ca3 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -5,9 +5,10 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger -from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, - JAISConfig, MLPSpeculatorConfig, - MPTConfig, RWConfig) +from vllm.transformers_utils.configs import (ChameleonConfig, ChatGLMConfig, + DbrxConfig, JAISConfig, + MLPSpeculatorConfig, MPTConfig, + RWConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -17,6 +18,7 @@ logger = init_logger(__name__) _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { + "chameleon": ChameleonConfig, "chatglm": ChatGLMConfig, "dbrx": DbrxConfig, "mpt": MPTConfig, From 6c2368662dc5318fb90550e67b2ac8c3830bf470 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 8 Jul 2024 01:02:33 -0700 Subject: [PATCH 05/15] add param --- vllm/model_executor/models/chameleon.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index e41a40f8fe24..2849140a37bd 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -21,7 +21,7 @@ ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import SamplerOutput +from vllm.sequence import IntermediateTensors, SamplerOutput from vllm.transformers_utils.configs import ChameleonConfig from vllm.utils import print_warning_once @@ -404,6 +404,7 @@ def forward( positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, kv_caches, From 7c82bea1d8108af3725d41c0b9b37a925e4caaea Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 8 Jul 2024 01:04:10 -0700 Subject: [PATCH 06/15] fix layernorm output --- vllm/model_executor/models/chameleon.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 2849140a37bd..1802a7df3b35 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -138,8 +138,8 @@ def _apply_qk_norm(self, q: torch.Tensor, # reshape for layernorm q = q.view(*q.shape[:-1], -1, self.head_dim) k = k.view(*k.shape[:-1], -1, self.head_dim) - q, _ = self.q_norm(q) - k, _ = self.k_norm(k) + q = self.q_norm(q) + k = self.k_norm(k) q = q.view(*q.shape[:-2], -1) k = k.view(*k.shape[:-2], -1) return q, k From 80d9586960f0555b49ad97341c421448b7f302b0 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 8 Jul 2024 01:05:30 -0700 Subject: [PATCH 07/15] fix logits processor --- vllm/model_executor/models/chameleon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 1802a7df3b35..1228ef3ed04c 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -413,7 +413,7 @@ def forward( def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head.weight, hidden_states, + logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) # Disallow image tokens which does not include special From 06a4c3b1ffe317a45de43167549382b644134bd3 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 8 Jul 2024 09:40:18 -0700 Subject: [PATCH 08/15] update TODOs --- vllm/model_executor/models/chameleon.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 1228ef3ed04c..3e62f7ea639e 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -343,7 +343,6 @@ def __init__( # TODO: Support image input # self.vqmodel = ChameleonVQModel(config.vq_config) - # self.vocabulary_mapping = ChameleonImageVocabularyMapping(config.vocabulary_map) #noqa def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embed_tokens(input_ids) @@ -405,8 +404,14 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs, ) -> torch.Tensor: + # TODO (ywang96): Support image input + # image_tokens = self.process_image_input(**kwargs) + # image_mask = input_ids == self.vocabulary_mapping.image_token_id + # input_ids[special_image_mask] = image_tokens.flatten().to(input_ids.dtype) #noqa + hidden_states = self.model(input_ids, positions, kv_caches, attn_metadata) return hidden_states From a3b62851ca4a5cd305bff7e4c0f1b2a87551db90 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 8 Jul 2024 09:53:48 -0700 Subject: [PATCH 09/15] update comment --- vllm/model_executor/models/chameleon.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 3e62f7ea639e..b34ace739554 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -155,7 +155,6 @@ def forward( q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) if self.qk_layernorm: - # reshape for layernorm q, k = self._apply_qk_norm(q, k) q, k = self.rotary_emb(positions, q, k) From 3ed6475dc308b35e3565971440a4817c3b4b0bb9 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 9 Jul 2024 10:21:05 -0700 Subject: [PATCH 10/15] add TODO --- vllm/transformers_utils/configs/chameleon.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/transformers_utils/configs/chameleon.py b/vllm/transformers_utils/configs/chameleon.py index 417d9c4fcc73..73f0e0c33989 100644 --- a/vllm/transformers_utils/configs/chameleon.py +++ b/vllm/transformers_utils/configs/chameleon.py @@ -1,6 +1,9 @@ from transformers import PretrainedConfig +#TODO (ywang96): Remove this file and import it from +# transformers once the new release with Chameleon support +# is available. class ChameleonConfig(PretrainedConfig): model_type = "chameleon" From cc5c2f0e6836a83575740082d6ea759f539e9f00 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Wed, 10 Jul 2024 08:52:00 -0700 Subject: [PATCH 11/15] update qk norm --- vllm/model_executor/models/chameleon.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index b34ace739554..69b7a978649d 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -70,7 +70,6 @@ def __init__( rope_theta: float = 10000, rope_scaling: Optional[Dict[str, Any]] = None, max_position_embeddings: int = 4096, - qk_layernorm: bool = False, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, cache_config: Optional[CacheConfig] = None, @@ -97,7 +96,6 @@ def __init__( self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings - self.qk_layernorm = qk_layernorm self.qkv_proj = QKVParallelLinear( hidden_size=hidden_size, @@ -113,7 +111,9 @@ def __init__( bias=bias, quant_config=quant_config, ) - + self.q_norm = nn.LayerNorm(param_shape=(self.num_heads, self.head_dim)) + self.k_norm = nn.LayerNorm(param_shape=(self.num_kv_heads, + self.head_dim)) self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, @@ -122,10 +122,6 @@ def __init__( rope_scaling=rope_scaling, ) - if self.qk_layernorm: - self.q_norm = nn.LayerNorm(self.head_dim) - self.k_norm = nn.LayerNorm(self.head_dim) - self.attn = Attention(self.num_heads, self.head_dim, self.scaling, @@ -153,9 +149,7 @@ def forward( ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - - if self.qk_layernorm: - q, k = self._apply_qk_norm(q, k) + q, k = self._apply_qk_norm(q, k) q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v, kv_cache, attn_metadata) @@ -190,7 +184,6 @@ def __init__( rope_theta=rope_theta, rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, - qk_layernorm=config.qk_layernorm, quant_config=quant_config, bias=False, cache_config=cache_config, From 1f613635b500e51ec03e1323cbb4805770d39d3c Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 19 Jul 2024 15:29:17 -0700 Subject: [PATCH 12/15] update chameleon --- vllm/model_executor/models/__init__.py | 2 +- vllm/model_executor/models/chameleon.py | 28 +++++++++++++++++++------ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 40451e068944..88c178972b2b 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -15,7 +15,7 @@ "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b "BloomForCausalLM": ("bloom", "BloomForCausalLM"), - "ChameleonForCausalLM": ("chameleon", "ChameleonForCausalLM"), + "ChameleonForCausalLM": ("chameleon", "ChameleonForConditionalGeneration"), "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"), "CohereForCausalLM": ("commandr", "CohereForCausalLM"), diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 69b7a978649d..acd7449b3762 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -2,6 +2,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple import torch +import torch.nn.functional as F from torch import nn from vllm.attention import Attention, AttentionMetadata @@ -26,6 +27,22 @@ from vllm.utils import print_warning_once +class ChameleonLayerNorm(nn.LayerNorm): + + def __init__(self, hidden_size, *args, **kwargs): + super().__init__(hidden_size, *args, **kwargs) + self.normalized_shape = (hidden_size[-1], ) + + def forward(self, hidden_states): + hidden_states = F.layer_norm(hidden_states, + self.normalized_shape, + None, + None, + eps=1e-5) + hidden_states = hidden_states * self.weight + self.bias + return hidden_states + + # Copied from vllm.model_executor.models.llama.LlamaMLP -> ChameleonMLP class ChameleonMLP(nn.Module): @@ -111,9 +128,8 @@ def __init__( bias=bias, quant_config=quant_config, ) - self.q_norm = nn.LayerNorm(param_shape=(self.num_heads, self.head_dim)) - self.k_norm = nn.LayerNorm(param_shape=(self.num_kv_heads, - self.head_dim)) + self.q_norm = ChameleonLayerNorm((self.num_heads, self.head_dim)) + self.k_norm = ChameleonLayerNorm((self.num_kv_heads, self.head_dim)) self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, @@ -132,8 +148,8 @@ def __init__( def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: # reshape for layernorm - q = q.view(*q.shape[:-1], -1, self.head_dim) - k = k.view(*k.shape[:-1], -1, self.head_dim) + q = q.reshape(-1, self.num_heads, self.head_dim) + k = k.reshape(-1, self.num_kv_heads, self.head_dim) q = self.q_norm(q) k = self.k_norm(k) q = q.view(*q.shape[:-2], -1) @@ -365,7 +381,7 @@ def forward( return hidden_states -class ChameleonForCausalLM(nn.Module): +class ChameleonForConditionalGeneration(nn.Module): def __init__( self, From f1eb706e7c300eda81a18a42e9a55352b684c811 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 19 Jul 2024 15:30:32 -0700 Subject: [PATCH 13/15] format --- vllm/transformers_utils/config.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 827989411c46..f99bea356da8 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -7,9 +7,8 @@ from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChameleonConfig, ChatGLMConfig, DbrxConfig, JAISConfig, - MedusaConfig, MPTConfig, - MLPSpeculatorConfig, - RWConfig) + MedusaConfig, MLPSpeculatorConfig, + MPTConfig, RWConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig From 7c474b630535a25e0f4332299a8993f9b2475010 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 20 Jul 2024 23:58:16 -0700 Subject: [PATCH 14/15] fix swin norm --- vllm/model_executor/models/chameleon.py | 125 +++++++++++++++++------- 1 file changed, 89 insertions(+), 36 deletions(-) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index acd7449b3762..02a3cb02769f 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -215,7 +215,6 @@ def __init__( eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.swin_norm = config.swin_norm def forward( self, @@ -226,43 +225,95 @@ def forward( residual: Optional[torch.Tensor], ) -> Tuple[torch.Tensor, torch.Tensor]: - if self.swin_norm: - if residual is None: - residual = hidden_states - - # Self Attention - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - attn_metadata=attn_metadata, - ) + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: hidden_states, residual = self.input_layernorm( hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) - # Fully Connected - hidden_states = self.mlp(hidden_states) - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) - else: # No swin norm - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - attn_metadata=attn_metadata, - ) + return hidden_states, residual - # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) + +class ChameleonSwinDecoderLayer(nn.Module): + + def __init__( + self, + config: ChameleonConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + if rope_scaling is not None and getattr( + config, "original_max_position_embeddings", None): + rope_scaling["original_max_position_embeddings"] = ( + config.original_max_position_embeddings) + max_position_embeddings = getattr(config, "max_position_embeddings", + 4096) + + self.self_attn = ChameleonAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=getattr(config, "num_key_value_heads", + config.num_attention_heads), + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + bias=False, + cache_config=cache_config, + ) + self.mlp = ChameleonMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + bias=getattr(config, "mlp_bias", False), + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + + residual = hidden_states + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + + hidden_states = self.input_layernorm(hidden_states) + hidden_states = hidden_states + residual + + # Fully Connected + residual = hidden_states + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = residual + hidden_states return hidden_states, residual @@ -341,10 +392,12 @@ def __init__( ) self.vocabulary_mapping = ChameleonImageVocabularyMapping( config.vocabulary_map) + decoder_layer = ChameleonDecoderLayer if not self.config.swin_norm \ + else ChameleonSwinDecoderLayer self.layers = nn.ModuleList([ - ChameleonDecoderLayer(config=config, - cache_config=cache_config, - quant_config=quant_config) + decoder_layer(config=config, + cache_config=cache_config, + quant_config=quant_config) for _ in range(config.num_hidden_layers) ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) From e6372c072e2802fbf63ffd666046ea68880f322f Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 21 Jul 2024 00:12:04 -0700 Subject: [PATCH 15/15] add TODO --- vllm/model_executor/models/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 02f6b943166d..b73b4c3dfe9c 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -15,7 +15,9 @@ "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b "BloomForCausalLM": ("bloom", "BloomForCausalLM"), - "ChameleonForCausalLM": ("chameleon", "ChameleonForConditionalGeneration"), + "ChameleonForCausalLM": + ("chameleon", "ChameleonForConditionalGeneration" + ), #TODO(ywang96): fix model name when huggingface fixes it "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"), "CohereForCausalLM": ("commandr", "CohereForCausalLM"),