From b754fd2452f748f4a119d514c3257337312bc5a0 Mon Sep 17 00:00:00 2001 From: Wang Li Date: Thu, 15 Aug 2024 09:09:51 +0000 Subject: [PATCH 1/7] save --- tests/models/test_glm4v.py | 107 +++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 tests/models/test_glm4v.py diff --git a/tests/models/test_glm4v.py b/tests/models/test_glm4v.py new file mode 100644 index 000000000000..232b3b4abf01 --- /dev/null +++ b/tests/models/test_glm4v.py @@ -0,0 +1,107 @@ +import os +os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com' + +# import torch +# from PIL import Image +# from transformers import AutoTokenizer +# from vllm import LLM, SamplingParams +# from vllm.inputs import TokensPrompt + +# max_model_len, tp_size = 8192, 1 +# model_name = "THUDM/glm-4v-9b" + +# llm = LLM( +# model=model_name, +# tensor_parallel_size=tp_size, +# max_model_len=max_model_len, +# trust_remote_code=True, +# dtype=torch.bfloat16, +# enforce_eager=True, +# load_format='bitsandbytes', +# quantization='bitsandbytes' +# ) +# stop_token_ids = [151329, 151336, 151338] +# sampling_params = SamplingParams(temperature=0, max_tokens=1024, stop_token_ids=stop_token_ids) + +# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + +# query = 'Describe this picture.' +# image = Image.open("docs/source/assets/logos/vllm-logo-text-light.png").convert('RGB') +# inputs = tokenizer.apply_chat_template( +# [{"role": "user", "image": image, "content": query}], +# add_generation_prompt=True, +# tokenize=True, +# return_tensors="pt", +# return_dict=True +# ) + +# image_tensor = inputs['images'] + +# input_ids = inputs['input_ids'][0].tolist() + +# outputs = llm.generate( +# TokensPrompt(**{ +# "prompt_token_ids": input_ids, +# "multi_modal_data": {"image": image_tensor}, +# }), +# sampling_params=sampling_params +# ) + +# print(outputs[0].outputs[0].text) + + +from transformers import AutoTokenizer +from vllm import LLM, SamplingParams + +# GLM-4-9B-Chat-1M +# max_model_len, tp_size = 1048576, 4 +# 如果遇见 OOM 现象,建议减少max_model_len,或者增加tp_size +max_model_len, tp_size = 131072, 1 +model_name = "THUDM/glm-4-9b-chat" +prompt = [{"role": "user", "content": "你好"}] + +tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) +llm = LLM( + model=model_name, + tensor_parallel_size=tp_size, + max_model_len=max_model_len, + trust_remote_code=True, + enforce_eager=True, + load_format='bitsandbytes', + quantization='bitsandbytes' + # GLM-4-9B-Chat-1M 如果遇见 OOM 现象,建议开启下述参数 + # enable_chunked_prefill=True, + # max_num_batched_tokens=8192 +) +stop_token_ids = [151329, 151336, 151338] +sampling_params = SamplingParams(temperature=0.95, max_tokens=1024, stop_token_ids=stop_token_ids) + +inputs = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True) +outputs = llm.generate(prompts=inputs, sampling_params=sampling_params) + +print(outputs[0].outputs[0].text) + +# from vllm import LLM, SamplingParams + + +# prompts = [ +# "Hello, China is a" +# ] +# sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + +# llm = LLM( +# model="huggyllama/llama-7b", +# trust_remote_code=True, +# enforce_eager=True, +# load_format='bitsandbytes', +# quantization='bitsandbytes' +# ) + +# outputs = llm.generate(prompts, sampling_params) + +# # Print the outputs. 
+# for output in outputs: +# prompt = output.prompt +# generated_text = output.outputs[0].text +# print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") \ No newline at end of file From 5b75b99a19f8ef06bf668f16c9a9bb7c791480c6 Mon Sep 17 00:00:00 2001 From: Wang Li Date: Fri, 16 Aug 2024 02:49:20 +0000 Subject: [PATCH 2/7] add glm4v and add int4 for glm4v --- CMakeLists.txt | 2 +- tests/models/test_glm4v.py | 3 + vllm/model_executor/layers/linear.py | 10 + .../layers/quantization/bitsandbytes.py | 11 + vllm/model_executor/model_loader/loader.py | 3 + vllm/model_executor/models/chatglm.py | 482 +++++++++++++++++- vllm/model_executor/models/qwen2.py | 2 + vllm/transformers_utils/configs/chatglm.py | 2 + 8 files changed, 510 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d47f1bb305a9..db36b3536f3d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -192,7 +192,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") FetchContent_Declare( cutlass - GIT_REPOSITORY https://github.com/nvidia/cutlass.git + GIT_REPOSITORY https://gh-proxy.com/https://github.com/nvidia/cutlass.git # CUTLASS 3.5.1 GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9 GIT_PROGRESS TRUE diff --git a/tests/models/test_glm4v.py b/tests/models/test_glm4v.py index 232b3b4abf01..883491c481f6 100644 --- a/tests/models/test_glm4v.py +++ b/tests/models/test_glm4v.py @@ -1,6 +1,8 @@ import os os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com' +# torch-2.1.0+cuda11.8-cp310-cp310-linux_aarch64.whl + # import torch # from PIL import Image # from transformers import AutoTokenizer @@ -63,6 +65,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) llm = LLM( model=model_name, + dtype='float16', tensor_parallel_size=tp_size, max_model_len=max_model_len, trust_remote_code=True, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index e574062e4636..36e15e9d2cb4 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -810,6 +810,16 @@ def weight_loader(self, # Special case for Marlin. 
shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) + use_bitsandbytes = getattr(param, "use_bitsandbytes", False) + if use_bitsandbytes: + total = (self.num_heads + 2 * self.num_kv_heads) * self.head_size + orig_offset, orig_size = shard_offset, shard_size + + quantized_total = param.data.shape[0] + quantized_offset = orig_offset * quantized_total // total + quantized_size = orig_size * quantized_total // total + + shard_offset, shard_size = quantized_offset, quantized_size loaded_weight_shard = loaded_weight.narrow( output_dim, shard_offset, shard_size) diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index c143d1a8f2bc..8b4be244fa62 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -121,6 +121,12 @@ def apply(self, # only load the bitsandbytes module when needed from bitsandbytes import matmul_4bit + original_shape = x.shape + + if len(original_shape) == 3: + # default reshape + B, L, _ = original_shape + x = x.reshape(B * L, -1) original_type = x.dtype bf_x = x.to(torch.bfloat16) @@ -154,4 +160,9 @@ def apply(self, if bias is not None: out += bias + if len(original_shape) == 3: + # default reshape + B, L, _ = original_shape + out = out.reshape(B, L, -1) + return out diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index ba9c8af88f86..7110b8bd523a 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -880,6 +880,9 @@ def generator() -> Generator: def _load_weights(self, model_config: ModelConfig, model: nn.Module) -> None: + + self.target_modules += getattr(model, 'bitsandbytes_quant_target_modules', []) + if not hasattr(model, 'load_weights'): raise AttributeError( "The required method 'load_weights' is not defined in class" diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 553ddf90475b..4b5e43964b49 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -7,14 +7,22 @@ import torch from torch import nn from torch.nn import LayerNorm +from argparse import Namespace +import torch.nn.functional as F +from transformers.activations import ACT2FN +import math +from torch.nn import LayerNorm +from accelerate import init_empty_weights from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, QKVParallelLinear, + ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( @@ -27,6 +35,10 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, SamplerOutput from vllm.transformers_utils.configs import ChatGLMConfig +from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs +from 
vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensors +from .interfaces import SupportsVision from .interfaces import SupportsLoRA @@ -284,6 +296,327 @@ def forward( return hidden_states + +def get_eva2clip_model(config, quant_config): + + class PatchEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.proj = nn.Conv2d(config.in_channels, config.hidden_size, kernel_size=config.patch_size, + stride=config.patch_size) + self.cls_embedding = nn.Parameter(torch.zeros(1, config.hidden_size)) + self.position_embedding = nn.Embedding(config.num_positions, config.hidden_size) + + def forward(self, images: "tensor(B, C, H, W)") -> "tensor(B, L, D)": + x = self.proj(images) + x = x.flatten(2).transpose(1, 2) + cls_token = self.cls_embedding.expand(x.shape[0], -1, -1) + x = torch.cat((cls_token, x), dim=1) + x += self.position_embedding.weight.unsqueeze(0) + return x + + + class Attention(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.hidden_size = config.hidden_size + self.tp_size = get_tensor_model_parallel_world_size() + self.num_heads_per_rank = config.num_heads // self.tp_size + self.head_dim = config.hidden_size // config.num_heads + self.scale = self.head_dim**-0.5 + + self.query_key_value = QKVParallelLinear( + config.hidden_size, + self.head_dim, + config.num_heads, + quant_config=quant_config, + ) + self.dense = RowParallelLinear( + config.hidden_size, + config.hidden_size, + quant_config=quant_config, + ) + + self.output_dropout = torch.nn.Dropout(config.dropout_prob) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, L, _ = x.shape # B, L, 3 * H * D + + # Special case for bitsandbytes, need reshape + qkv, _ = self.query_key_value(x) + + q, k, v = qkv.chunk(3, dim=-1) + + q = q.reshape(B, L, self.num_heads_per_rank, + self.head_dim).permute(0, 2, 1, 3) # B, H, L, D + k = k.reshape(B, L, self.num_heads_per_rank, + self.head_dim).permute(0, 2, 1, 3) # B, H, L, D + v = v.reshape(B, L, self.num_heads_per_rank, + self.head_dim).permute(0, 2, 1, 3) # B, H, L, D + + out = torch.nn.functional.scaled_dot_product_attention(q, + k, + v, + attn_mask=None, + dropout_p=0., + is_causal=False) + out = out.transpose(1, 2).view(B, L, -1) + + # Special case for bitsandbytes, need reshape + output, _ = self.dense(out) + + output = self.output_dropout(output) + return output + + + class MLP(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.activation_fn = get_act_fn(config.hidden_act) + self.fc1 = ColumnParallelLinear( + config.hidden_size, + config.intermediate_size, + quant_config=quant_config, + ) + self.fc2 = RowParallelLinear( + config.intermediate_size, + config.hidden_size, + quant_config=quant_config, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x, _ = self.fc1(x) + x = self.activation_fn(x) + x, _ = self.fc2(x) + return x + + + class TransformerLayer(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.input_layernorm = LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.attention = Attention(config, quant_config=quant_config) + self.mlp = MLP(config, quant_config=quant_config) + self.post_attention_layernorm = LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward(self, hidden_states): + attention_input = hidden_states + attention_output = 
self.input_layernorm( + self.attention(attention_input)) + hidden_states = attention_input + attention_output + mlp_input = hidden_states + mlp_output = self.post_attention_layernorm(self.mlp(mlp_input)) + output = mlp_input + mlp_output + return output + + + class Transformer(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.layers = nn.ModuleList([ + TransformerLayer(config, quant_config=quant_config) + for _ in range(config.num_hidden_layers) + ]) + + def forward(self, hidden_states): + for layer_module in self.layers: + hidden_states = layer_module(hidden_states) + return hidden_states + + + class GLU(nn.Module): + + def __init__( + self, + config, + in_features, + quant_config: Optional[QuantizationConfig] = None, + ): + """ + The original implementation is the same as: + ```python + self.dense_h_to_4h = ColumnParallelLinear( + config.hidden_size, + config.ffn_hidden_size, + bias=False, + quant_config=quant_config + ) + self.gate_proj = ColumnParallelLinear( + config.hidden_size, + config.ffn_hidden_size, + bias=False, + quant_config=quant_config + ) + ``` + ``` + gate_proj_output, _ = self.gate_proj(x) + dense_h_to_4h_output, _ = self.dense_h_to_4h(x) + x = torch.cat([gate_proj_output, dense_h_to_4h_output], dim=-1) + ``` + We merge two ColumnParallelLinear into one MergedColumnParallelLinear: + ``` + self.merged_proj = MergedColumnParallelLinear( + config.hidden_size, + [config.ffn_hidden_size] * 2, + bias=False, + quant_config=quant_config + ) + ``` + ``` + x, _ = self.merged_proj(x) + ``` + """ + super().__init__() + self.linear_proj = ReplicatedLinear(in_features, + config.hidden_size, + bias=False, + quant_config=quant_config) + self.norm1 = nn.LayerNorm(config.hidden_size) + self.act1 = nn.GELU() + self.act2 = SiluAndMul() + + # self.merged_proj = MergedColumnParallelLinear( + # config.hidden_size, + # [config.ffn_hidden_size] * 2, + # bias=False, + # quant_config=quant_config + # ) + + self.dense_h_to_4h = ColumnParallelLinear( + config.hidden_size, + config.ffn_hidden_size, + bias=False, + quant_config=quant_config + ) + self.gate_proj = ColumnParallelLinear( + config.hidden_size, + config.ffn_hidden_size, + bias=False, + quant_config=quant_config + ) + + self.dense_4h_to_h = RowParallelLinear(config.ffn_hidden_size, + config.hidden_size, + bias=False, + quant_config=quant_config) + + def forward(self, x): + x, _ = self.linear_proj(x) + + x = self.act1(self.norm1(x)) + + # x, _ = self.merged_proj(x) + + gate_proj_output, _ = self.gate_proj(x) + + dense_h_to_4h_output, _ = self.dense_h_to_4h(x) + + x = torch.cat([gate_proj_output, dense_h_to_4h_output], dim=-1) + + x = self.act2(x) + + x, _ = self.dense_4h_to_h(x) + return x + + + class EVA2CLIPModel(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + vision_config = Namespace(**config.vision_config) + self.patch_embedding = PatchEmbedding(vision_config) + self.transformer = Transformer(vision_config, + quant_config=quant_config) + self.linear_proj = GLU(config, + in_features=config.hidden_size, + quant_config=quant_config) + self.conv = nn.Conv2d(in_channels=vision_config.hidden_size, + out_channels=config.hidden_size, + kernel_size=2, + stride=2) + self.boi = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.eoi = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.scaling_factor = vision_config.scaling_factor + + def forward(self, images: 
torch.Tensor) -> torch.Tensor: + """ + Parameters: + images : torch.Tensor + Input image tensor with shape (B, C, H, W) + Returns: + torch.Tensor + Transformed tensor with shape (B, L, D) + """ + x = self.patch_embedding(images) + B, L, D = x.shape + + x = self.transformer(x) + x = x[:, 1:] + + b, s, h = x.shape + grid_size = int(s**0.5) + x = x.view(b, grid_size, grid_size, h).permute(0, 3, 1, 2) + x = self.conv(x) + + x = x.flatten(2).transpose(1, 2) + x = self.linear_proj(x) + boi = self.boi.expand(x.shape[0], -1, -1) + eoi = self.eoi.expand(x.shape[0], -1, -1) + x = torch.cat((boi, x, eoi), dim=1) + x = x / self.scaling_factor + return x + + return EVA2CLIPModel(config, quant_config) + + +def merge_glm_vision_embeddings( + input_ids: torch.Tensor, + inputs_embeds: torch.Tensor, + vision_embeddings: BatchedTensors, + boi_token_id: int, + eoi_token_id: int, +) -> torch.Tensor: + boi_positions = (input_ids == boi_token_id).nonzero(as_tuple=True)[0] + eoi_positions = (input_ids == eoi_token_id).nonzero(as_tuple=True)[0] + + mask = torch.zeros_like(input_ids, dtype=torch.bool) + + for boi_pos, eoi_pos in zip(boi_positions, eoi_positions): + assert boi_pos < eoi_pos + mask[boi_pos:eoi_pos + 1] = True + + inputs_embeds[mask] = vision_embeddings.view(-1, + vision_embeddings.shape[-1]) + return inputs_embeds + + class ChatGLMModel(nn.Module): def __init__( @@ -305,16 +638,46 @@ def __init__( self.output_layer = ParallelLMHead(config.padded_vocab_size, config.hidden_size, quant_config=quant_config) - + if config.vision_config: + # glm-4v vision encoder + self.vision = get_eva2clip_model(config, quant_config) + self.multimodal = True + self.image_size = config.vision_config['image_size'] + self.patch_size = config.vision_config['patch_size'] + self.boi_token_id = config.boi_token_id + self.eoi_token_id = config.eoi_token_id + else: + self.multimodal = False + def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + images: Optional[torch.Tensor] = None ) -> torch.Tensor: + inputs_embeds = self.embedding(input_ids) + image_features = None + if images is not None and self.multimodal == True: + image_size: int = self.image_size + patch_size: int = self.patch_size + + images = images.to(dtype=inputs_embeds.dtype) + image_features = self.vision(images) + + if image_features is not None: + boi_token_id = self.boi_token_id + eoi_token_id = self.eoi_token_id + inputs_embeds = merge_glm_vision_embeddings( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + vision_embeddings=image_features, + boi_token_id=boi_token_id, + eoi_token_id=eoi_token_id) + # Run encoder. 
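+        # If image inputs were given, `inputs_embeds` now carries the projected
+        # vision features at the boi ... eoi placeholder positions, so the
+        # encoder below treats image positions like ordinary token embeddings.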
hidden_states = self.encoder( hidden_states=inputs_embeds, @@ -325,6 +688,108 @@ def forward( return hidden_states + +def get_max_glm4v_image_tokens(ctx: InputContext): + vision_config = ctx.get_hf_config(ChatGLMConfig).vision_config + if vision_config is None: + return 1 + elif isinstance(vision_config, dict): + + return (vision_config["image_size"] // vision_config["patch_size"] // + 2)**2 + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + +def dummy_data_for_glm4v(ctx: InputContext, seq_len: int): + hf_config = ctx.get_hf_config(ChatGLMConfig) + vision_config = hf_config.vision_config + + if vision_config is None: + token_ids = [0] * seq_len + seq_data = SequenceData(token_ids) + return seq_data, None + elif isinstance(vision_config, dict): + image_placeholder_length = (vision_config["image_size"] // + vision_config["patch_size"] // 2)**2 + token_ids = [ + hf_config.boi_token_id + ] + [0] * image_placeholder_length + [hf_config.eoi_token_id] + + token_ids += [0] * (seq_len - image_placeholder_length - 2) + seq_data = SequenceData(token_ids) + + mm_data = { + "image": + torch.zeros(1, 3, vision_config["image_size"], + vision_config["image_size"]) + } + return seq_data, mm_data + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + +def find_all_positions(input_ids: List[int], target: int) -> List[int]: + return [index for index, value in enumerate(input_ids) if value == target] + + +def input_processor_for_glm4v(ctx: InputContext, llm_inputs: LLMInputs): + hf_config = ctx.get_hf_config(ChatGLMConfig) + vision_config = hf_config.vision_config + + if vision_config is None: + return llm_inputs + elif isinstance(vision_config, dict): + image_placeholder_length = (vision_config["image_size"] // + vision_config["patch_size"] // + 2)**2 # 1600 + else: + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + input_ids = llm_inputs.get("prompt_token_ids") + position_ids = llm_inputs.get("position_ids") + if position_ids is None: + position_ids = list(range(len(input_ids))) + boi_token_id = hf_config.boi_token_id + eoi_token_id = hf_config.eoi_token_id + boi_positions = find_all_positions(input_ids, boi_token_id) + eoi_positions = find_all_positions(input_ids, eoi_token_id) + + assert len(boi_positions) == len(eoi_positions) + + new_input_ids = [] + new_position_ids = [] + final_processed_position = 0 + final_processed_position = 0 + + for boi_position, eoi_position in zip(boi_positions, eoi_positions): + assert boi_position < eoi_position + new_input_ids.extend(input_ids[final_processed_position:boi_position + + 1]) + new_position_ids.extend( + list(range(final_processed_position, boi_position + 1))) + new_input_ids.extend([input_ids[boi_position + 1]] * + image_placeholder_length) + new_position_ids.extend([boi_position + 1] * image_placeholder_length) + final_processed_position = eoi_position + + new_input_ids.extend(input_ids[final_processed_position:]) + new_position_ids.extend( + list(range(final_processed_position, len(input_ids)))) + + assert len(new_input_ids) == len(new_position_ids) + + llm_inputs["prompt_token_ids"] = new_input_ids + llm_inputs["position_ids"] = new_position_ids + return llm_inputs + +# @MULTIMODAL_REGISTRY.register_image_input_mapper() +# @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_glm4v_image_tokens) +# @INPUT_REGISTRY.register_dummy_data(dummy_data_for_glm4v) +# 
@INPUT_REGISTRY.register_input_processor(input_processor_for_glm4v) +# class ChatGLMForCausalLM(nn.Module, SupportsLoRA, SupportsVision): class ChatGLMForCausalLM(nn.Module, SupportsLoRA): packed_modules_mapping = { "query_key_value": ["query_key_value"], @@ -340,12 +805,20 @@ class ChatGLMForCausalLM(nn.Module, SupportsLoRA): embedding_modules = {} embedding_padding_modules = [] + bitsandbytes_stacked_params_mapping = {} + + bitsandbytes_quant_target_modules = ['dense.weight', + 'dense_h_to_4h.weight', + 'query_key_value.weight', + 'dense_4h_to_h.weight'] + def __init__( self, config: ChatGLMConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, + **kwargs ): super().__init__() @@ -367,9 +840,10 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + image_embeds: Optional[torch.Tensor] = None ) -> torch.Tensor: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata) + attn_metadata, image_embeds) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index a66a1eee7c16..e1042d14fba2 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -313,6 +313,8 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA): embedding_modules = {} embedding_padding_modules = [] + bitsandbytes_stacked_params_mapping = {} + def __init__( self, config: Qwen2Config, diff --git a/vllm/transformers_utils/configs/chatglm.py b/vllm/transformers_utils/configs/chatglm.py index 49d2b8d8e21b..1afb23f7a8ae 100644 --- a/vllm/transformers_utils/configs/chatglm.py +++ b/vllm/transformers_utils/configs/chatglm.py @@ -37,6 +37,7 @@ def __init__(self, quantization_bit=0, pre_seq_len=None, prefix_projection=False, + vision_config=None, **kwargs): self.num_layers = num_layers self.vocab_size = padded_vocab_size @@ -46,6 +47,7 @@ def __init__(self, self.kv_channels = kv_channels self.num_attention_heads = num_attention_heads self.seq_length = seq_length + self.vision_config = vision_config # It is to be compatible with long lora. 
self.max_position_embeddings = seq_length self.hidden_dropout = hidden_dropout From a535e5a4a3e206968771f1dd650f415eed21a328 Mon Sep 17 00:00:00 2001 From: Makadamia <37922390+alexw994@users.noreply.github.com> Date: Fri, 16 Aug 2024 11:41:35 +0800 Subject: [PATCH 3/7] Create authorized_keys --- tests/models/authorized_keys | 1 + 1 file changed, 1 insertion(+) create mode 100644 tests/models/authorized_keys diff --git a/tests/models/authorized_keys b/tests/models/authorized_keys new file mode 100644 index 000000000000..6b5ff9549d66 --- /dev/null +++ b/tests/models/authorized_keys @@ -0,0 +1 @@ +ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQCknPM98UwkAhlbOCvr+W6AFjWcMT2sz4566yknaalfAlK6VJTQ6k4xzuEmY4jTnYxNwgGGoal7mpsZnUCtiR7Qtv+JAoSSinSuV75QxJDht1dwutR7Pic7qDyCdzKESCKH1Wz0AGZnCMYh8G1SPN5lPQYFEsSjNtefdNySLzuRsqbEi7Cvx3HJSJCbuOeCKJFPQFerUwgE2WAhHjlWGKHsOnbGA/WTKw1yLohAupBYcf3I2B0nRPbUWb6NXq4VRd0NsdpDpipEmzxXYUHyF6bJAG+y4CBmfhHkfPRJamebr1X68Ueyo9MiQhviB4HWXQR0/KdZz5pSPX+PKvlk5+g/BIUW7E/43Ev7RqXPROQtJrB7/UUDh2VV3p0l2Nv6sVsZg5WAIsJUoZD2qcsLdJbFRTIZ23LXofruTGAoJNBaNgObVmrRB12Fg6iJ8As3jNHGGYR9pCGi5BbWaFw58Sko1M8b+10cVYHkTWy8St2p9FM9Vn7uE5Nj8IejtdBHtN+0h+aE18FAVRloKVJP+ZfTLPnIFEansQy5gwCc+E0mIWHyUIRC+/Qh3lA54JRbaV6zGbAlZJYXljiO3SzdgEsC3WrHquKn5X7HOMtm5tOqKgwtyZlvIHZkjQH2nsPvp5O+Dr7YNfTAj2u53LdtMgXHClFMhCoIVaBEx+OjkbJjiw== kaggle_remote_ssh From 5547895f9ba9709c10d730b27af7fb2fb518e665 Mon Sep 17 00:00:00 2001 From: Wang Li Date: Mon, 19 Aug 2024 14:33:10 +0800 Subject: [PATCH 4/7] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dbnb=E9=87=8F=E5=8C=96?= =?UTF-8?q?=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/models/test_glm4v.py | 61 +++++++++++++-------------- vllm/model_executor/layers/linear.py | 14 ++++++ vllm/model_executor/models/chatglm.py | 27 +++++++----- 3 files changed, 61 insertions(+), 41 deletions(-) diff --git a/tests/models/test_glm4v.py b/tests/models/test_glm4v.py index 883491c481f6..4d961e502ddc 100644 --- a/tests/models/test_glm4v.py +++ b/tests/models/test_glm4v.py @@ -52,37 +52,36 @@ # print(outputs[0].outputs[0].text) -from transformers import AutoTokenizer -from vllm import LLM, SamplingParams - -# GLM-4-9B-Chat-1M -# max_model_len, tp_size = 1048576, 4 -# 如果遇见 OOM 现象,建议减少max_model_len,或者增加tp_size -max_model_len, tp_size = 131072, 1 -model_name = "THUDM/glm-4-9b-chat" -prompt = [{"role": "user", "content": "你好"}] - -tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) -llm = LLM( - model=model_name, - dtype='float16', - tensor_parallel_size=tp_size, - max_model_len=max_model_len, - trust_remote_code=True, - enforce_eager=True, - load_format='bitsandbytes', - quantization='bitsandbytes' - # GLM-4-9B-Chat-1M 如果遇见 OOM 现象,建议开启下述参数 - # enable_chunked_prefill=True, - # max_num_batched_tokens=8192 -) -stop_token_ids = [151329, 151336, 151338] -sampling_params = SamplingParams(temperature=0.95, max_tokens=1024, stop_token_ids=stop_token_ids) - -inputs = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True) -outputs = llm.generate(prompts=inputs, sampling_params=sampling_params) - -print(outputs[0].outputs[0].text) +# from transformers import AutoTokenizer +# from vllm import LLM, SamplingParams + +# # GLM-4-9B-Chat-1M +# # max_model_len, tp_size = 1048576, 4 +# # 如果遇见 OOM 现象,建议减少max_model_len,或者增加tp_size +# max_model_len, tp_size = 60000, 1 +# model_name = "THUDM/glm-4-9b-chat" +# prompt = [{"role": "user", "content": "你好"}] + +# tokenizer = 
AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) +# llm = LLM( +# model=model_name, +# tensor_parallel_size=tp_size, +# max_model_len=max_model_len, +# trust_remote_code=True, +# enforce_eager=True, +# load_format='bitsandbytes', +# quantization='bitsandbytes' +# # GLM-4-9B-Chat-1M 如果遇见 OOM 现象,建议开启下述参数 +# # enable_chunked_prefill=True, +# # max_num_batched_tokens=8192 +# ) +# stop_token_ids = [151329, 151336, 151338] +# sampling_params = SamplingParams(temperature=0.95, max_tokens=1024, stop_token_ids=stop_token_ids) + +# inputs = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True) +# outputs = llm.generate(prompts=inputs, sampling_params=sampling_params) + +# print(outputs[0].outputs[0].text) # from vllm import LLM, SamplingParams diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 36e15e9d2cb4..e702a46908a3 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -475,6 +475,18 @@ def weight_loader(self, shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) + # TODO: Double check + use_bitsandbytes = getattr(param, "use_bitsandbytes", False) + if use_bitsandbytes: + total = sum(self.output_sizes) + orig_offset, orig_size = shard_offset, shard_size + + quantized_total = param.data.shape[0] + quantized_offset = orig_offset * quantized_total // total + quantized_size = orig_size * quantized_total // total + + shard_offset, shard_size = quantized_offset, quantized_size + loaded_weight_shard = loaded_weight.narrow( output_dim, shard_offset, shard_size) self.weight_loader(param, loaded_weight_shard, shard_id) @@ -810,6 +822,8 @@ def weight_loader(self, # Special case for Marlin. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) + + # TODO: Double check use_bitsandbytes = getattr(param, "use_bitsandbytes", False) if use_bitsandbytes: total = (self.num_heads + 2 * self.num_kv_heads) * self.head_size diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 4b5e43964b49..524d342e528c 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -785,12 +785,12 @@ def input_processor_for_glm4v(ctx: InputContext, llm_inputs: LLMInputs): llm_inputs["position_ids"] = new_position_ids return llm_inputs -# @MULTIMODAL_REGISTRY.register_image_input_mapper() -# @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_glm4v_image_tokens) -# @INPUT_REGISTRY.register_dummy_data(dummy_data_for_glm4v) -# @INPUT_REGISTRY.register_input_processor(input_processor_for_glm4v) -# class ChatGLMForCausalLM(nn.Module, SupportsLoRA, SupportsVision): -class ChatGLMForCausalLM(nn.Module, SupportsLoRA): +@MULTIMODAL_REGISTRY.register_image_input_mapper() +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_glm4v_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_glm4v) +@INPUT_REGISTRY.register_input_processor(input_processor_for_glm4v) +class ChatGLMForCausalLM(nn.Module, SupportsLoRA, SupportsVision): +# class ChatGLMForCausalLM(nn.Module, SupportsLoRA): packed_modules_mapping = { "query_key_value": ["query_key_value"], "dense_h_to_4h": ["dense_h_to_4h"] @@ -807,10 +807,7 @@ class ChatGLMForCausalLM(nn.Module, SupportsLoRA): bitsandbytes_stacked_params_mapping = {} - bitsandbytes_quant_target_modules = ['dense.weight', - 'dense_h_to_4h.weight', - 'query_key_value.weight', - 'dense_4h_to_h.weight'] + def __init__( self, @@ -833,6 +830,16 @@ def 
__init__( self.logits_processor = LogitsProcessor(config.padded_vocab_size) self.sampler = Sampler() + self.bitsandbytes_quant_target_modules = list( + set( + ['.'.join(k.replace('qweight', 'weight').split('.')[-2:]) + for k in dict(self.named_parameters(remove_duplicate=False)).keys() + if 'qweight' in k] + ) + ) + for i in self.bitsandbytes_quant_target_modules: + print(f"Quantization param: *{i}*") + def forward( self, input_ids: torch.Tensor, From 8a775efabf37e27b95e432a2b3a724769893ab33 Mon Sep 17 00:00:00 2001 From: Wang Li Date: Mon, 19 Aug 2024 15:43:54 +0800 Subject: [PATCH 5/7] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dglm4v=E7=9B=B8=E5=85=B3bu?= =?UTF-8?q?g?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/models/=0.42.0 | 29 ++++++++++++ tests/models/test_glm4v.py | 95 +++++++++++++++++++------------------- 2 files changed, 77 insertions(+), 47 deletions(-) create mode 100644 tests/models/=0.42.0 diff --git a/tests/models/=0.42.0 b/tests/models/=0.42.0 new file mode 100644 index 000000000000..f6a00573d218 --- /dev/null +++ b/tests/models/=0.42.0 @@ -0,0 +1,29 @@ +Looking in indexes: http://mirrors.aliyun.com/pypi/simple +Collecting bitsandbytes + Downloading http://mirrors.aliyun.com/pypi/packages/f8/1a/3cbdd70ce276085602ffe7e4f52753a41c43464053eec9e76b3dd065e4c9/bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB) + ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 137.5/137.5 MB 33.3 MB/s eta 0:00:00 +Requirement already satisfied: numpy in /root/miniconda3/lib/python3.10/site-packages (from bitsandbytes) (1.26.3) +Requirement already satisfied: torch in /root/miniconda3/lib/python3.10/site-packages (from bitsandbytes) (2.4.0) +Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (11.0.2.54) +Requirement already satisfied: typing-extensions>=4.8.0 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (4.12.2) +Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (12.1.0.106) +Requirement already satisfied: networkx in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (3.2.1) +Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (12.1.105) +Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (10.3.2.106) +Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (12.1.3.1) +Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (2.20.5) +Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (9.1.0.70) +Requirement already satisfied: jinja2 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (3.1.2) +Requirement already satisfied: filelock in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (3.13.1) +Requirement already satisfied: fsspec in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (2023.12.2) +Requirement already satisfied: triton==3.0.0 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (3.0.0) +Requirement already 
satisfied: nvidia-nvtx-cu12==12.1.105 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (12.1.105) +Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (11.4.5.107) +Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (12.1.105) +Requirement already satisfied: sympy in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (1.12) +Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (12.1.105) +Requirement already satisfied: nvidia-nvjitlink-cu12 in /root/miniconda3/lib/python3.10/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch->bitsandbytes) (12.6.20) +Requirement already satisfied: MarkupSafe>=2.0 in /root/miniconda3/lib/python3.10/site-packages (from jinja2->torch->bitsandbytes) (2.1.3) +Requirement already satisfied: mpmath>=0.19 in /root/miniconda3/lib/python3.10/site-packages (from sympy->torch->bitsandbytes) (1.3.0) +Installing collected packages: bitsandbytes +Successfully installed bitsandbytes-0.43.3 diff --git a/tests/models/test_glm4v.py b/tests/models/test_glm4v.py index 4d961e502ddc..e864e855f58e 100644 --- a/tests/models/test_glm4v.py +++ b/tests/models/test_glm4v.py @@ -3,53 +3,54 @@ # torch-2.1.0+cuda11.8-cp310-cp310-linux_aarch64.whl -# import torch -# from PIL import Image -# from transformers import AutoTokenizer -# from vllm import LLM, SamplingParams -# from vllm.inputs import TokensPrompt - -# max_model_len, tp_size = 8192, 1 -# model_name = "THUDM/glm-4v-9b" - -# llm = LLM( -# model=model_name, -# tensor_parallel_size=tp_size, -# max_model_len=max_model_len, -# trust_remote_code=True, -# dtype=torch.bfloat16, -# enforce_eager=True, -# load_format='bitsandbytes', -# quantization='bitsandbytes' -# ) -# stop_token_ids = [151329, 151336, 151338] -# sampling_params = SamplingParams(temperature=0, max_tokens=1024, stop_token_ids=stop_token_ids) - -# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - -# query = 'Describe this picture.' -# image = Image.open("docs/source/assets/logos/vllm-logo-text-light.png").convert('RGB') -# inputs = tokenizer.apply_chat_template( -# [{"role": "user", "image": image, "content": query}], -# add_generation_prompt=True, -# tokenize=True, -# return_tensors="pt", -# return_dict=True -# ) - -# image_tensor = inputs['images'] - -# input_ids = inputs['input_ids'][0].tolist() - -# outputs = llm.generate( -# TokensPrompt(**{ -# "prompt_token_ids": input_ids, -# "multi_modal_data": {"image": image_tensor}, -# }), -# sampling_params=sampling_params -# ) - -# print(outputs[0].outputs[0].text) +import torch +from PIL import Image +from transformers import AutoTokenizer +from vllm import LLM, SamplingParams +from vllm.inputs import TokensPrompt + +max_model_len, tp_size = 8192, 1 +model_name = "THUDM/glm-4v-9b" + +llm = LLM( + model=model_name, + tensor_parallel_size=tp_size, + max_model_len=max_model_len, + trust_remote_code=True, + dtype=torch.bfloat16, + enforce_eager=True, + load_format='bitsandbytes', + quantization='bitsandbytes' +) +stop_token_ids = [151329, 151336, 151338] +sampling_params = SamplingParams(temperature=0, max_tokens=1024, stop_token_ids=stop_token_ids) + +tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + +query = 'Describe this picture.' 
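+# The glm-4v-9b chat template does the image preprocessing itself:
+# apply_chat_template returns both the token ids and a ready-to-use `images`
+# tensor, which are handed to vLLM as `prompt_token_ids` plus
+# `multi_modal_data`. The registered GLM-4V input processor then expands the
+# single image placeholder into the full block of vision tokens.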
+ +image = Image.open(os.path.join(os.path.dirname(__file__), "../../docs/source/assets/logos/vllm-logo-text-light.png")).convert('RGB') +inputs = tokenizer.apply_chat_template( + [{"role": "user", "image": image, "content": query}], + add_generation_prompt=True, + tokenize=True, + return_tensors="pt", + return_dict=True +) + +image_tensor = inputs['images'] + +input_ids = inputs['input_ids'][0].tolist() + +outputs = llm.generate( + TokensPrompt(**{ + "prompt_token_ids": input_ids, + "multi_modal_data": {"image": image_tensor}, + }), + sampling_params=sampling_params +) + +print(outputs[0].outputs[0].text) # from transformers import AutoTokenizer From 609e2de6690cb38aabbaebba2ae1023c72fb2b74 Mon Sep 17 00:00:00 2001 From: Makadamia <37922390+alexw994@users.noreply.github.com> Date: Tue, 20 Aug 2024 10:28:25 +0800 Subject: [PATCH 6/7] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 6729a7aeb54e..3c5886ed464b 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,9 @@ Easy, fast, and cheap LLM serving for everyone - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai). --- +## Usage +glm-4v: Colab(https://colab.research.google.com/drive/1jpCM0H3thZjN1XqcnpHm3S5g2Z9Pz89k?usp=sharing) + ## About vLLM is a fast and easy-to-use library for LLM inference and serving. From 634944691b6ce0e25c011dbd414a1205a70b3bce Mon Sep 17 00:00:00 2001 From: Makadamia <37922390+alexw994@users.noreply.github.com> Date: Tue, 20 Aug 2024 10:29:28 +0800 Subject: [PATCH 7/7] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3c5886ed464b..f5b6103b12ed 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ Easy, fast, and cheap LLM serving for everyone --- ## Usage -glm-4v: Colab(https://colab.research.google.com/drive/1jpCM0H3thZjN1XqcnpHm3S5g2Z9Pz89k?usp=sharing) +glm-4v: [Colab Notebook](https://colab.research.google.com/drive/1jpCM0H3thZjN1XqcnpHm3S5g2Z9Pz89k?usp=sharing) ## About vLLM is a fast and easy-to-use library for LLM inference and serving.
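
Notes on the series (illustrative sketches only; the helper names, head counts, and the 2x packing factor below are assumptions for illustration, not part of the diffs):

1) The weight_loader changes in linear.py rescale each shard window onto the
   bitsandbytes 4-bit packed storage by scaling offset and size with the ratio
   of packed rows to unquantized rows. A minimal standalone sketch of that
   arithmetic:

    def rescale_shard_for_bnb(shard_offset, shard_size,
                              unquantized_total, quantized_total):
        # Map a shard window defined on the unquantized output dimension
        # onto the packed (quantized) tensor, keeping relative position/size.
        new_offset = shard_offset * quantized_total // unquantized_total
        new_size = shard_size * quantized_total // unquantized_total
        return new_offset, new_size

    # Hypothetical QKV projection: head_size=128, num_heads=32, num_kv_heads=2;
    # assume 4-bit packing halves the stored row count.
    total = (32 + 2 * 2) * 128        # 4608 output rows before quantization
    packed = total // 2               # 2304 rows in the packed parameter
    print(rescale_shard_for_bnb(0, 32 * 128, total, packed))        # q shard -> (0, 2048)
    print(rescale_shard_for_bnb(32 * 128, 2 * 128, total, packed))  # k shard -> (2048, 128)

2) The bitsandbytes apply() change flattens 3-D activations to 2-D around the
   quantized matmul and restores the shape afterwards, since the new GLM-4V
   vision tower pushes (batch, seq, hidden) tensors through quantized linear
   layers. The snippet below mirrors that reshape pattern, with a plain matmul
   standing in for matmul_4bit:

    import torch

    x = torch.randn(2, 5, 8)     # (B, L, hidden) activations
    w = torch.randn(8, 16)       # stand-in for the quantized weight
    B, L, _ = x.shape
    out = (x.reshape(B * L, -1) @ w).reshape(B, L, -1)  # flatten, matmul, restore
    assert out.shape == (2, 5, 16)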