From 64c3c1cbd7d72371faf24d145059a0409e9688d4 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 00:12:43 +0000 Subject: [PATCH 01/26] stash --- vllm/model_executor/layers/linear.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 684e1abf7bcf..26bc25b5c2d7 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -72,6 +72,7 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, + layer_name: Optional[str]=None, **extra_weight_attrs): """Create weights for a linear layer. The weights will be set as attributes of the layer. @@ -105,6 +106,7 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, + layer_name: Optional[str]=None, **extra_weight_attrs): weight = Parameter(torch.empty(sum(output_partition_sizes), input_size_per_partition, @@ -141,6 +143,7 @@ def __init__( skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, + layer_name: Optional[str] = None, ): super().__init__() @@ -179,15 +182,18 @@ def __init__(self, bias: bool = True, skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None): + quant_config: Optional[QuantizationConfig] = None, + layer_name: Optional[str] = None, + ): super().__init__(input_size, output_size, skip_bias_add, params_dtype, - quant_config) + quant_config, layer_name) # All the linear layer supports quant method. 
assert self.quant_method is not None self.quant_method.create_weights(self, self.input_size, [self.output_size], self.input_size, - self.output_size, self.params_dtype) + self.output_size, self.params_dtype, + layer_name=layer_name) if bias: self.bias = Parameter( @@ -249,7 +255,9 @@ def __init__(self, skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, - output_sizes: Optional[List[int]] = None): + output_sizes: Optional[List[int]] = None, + layer_name: Optional[str] = None, + ): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config) @@ -276,7 +284,8 @@ def __init__(self, input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - weight_loader=self.weight_loader) + layer_name=layer_name, + weight_loader=self.weight_loader,) if bias: self.bias = Parameter( torch.empty(self.output_size_per_partition, @@ -357,7 +366,8 @@ def __init__(self, gather_output: bool = False, skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None): + quant_config: Optional[QuantizationConfig] = None, + layer_name: Optional[str] = None): self.output_sizes = output_sizes tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) @@ -497,7 +507,8 @@ def __init__(self, bias: bool = True, skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None): + quant_config: Optional[QuantizationConfig] = None, + layer_name: Optional[str] = None): self.hidden_size = hidden_size self.head_size = head_size self.total_num_heads = total_num_heads From 891529793f286de6f45caee59d82c150bb4a48e0 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 00:54:49 +0000 Subject: [PATCH 02/26] re-enable nonuniform for llama --- vllm/model_executor/layers/linear.py | 8 +++++--- .../compressed_tensors/compressed_tensors.py | 16 ++++++++++++--- vllm/model_executor/models/llama.py | 20 ++++++++++++++----- vllm/model_executor/models/utils.py | 2 +- 4 files changed, 34 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 26bc25b5c2d7..d6ed2647343a 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -72,7 +72,7 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, - layer_name: Optional[str]=None, + layer_name: Optional[str] = None, **extra_weight_attrs): """Create weights for a linear layer. The weights will be set as attributes of the layer. 
@@ -106,7 +106,7 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, - layer_name: Optional[str]=None, + layer_name: Optional[str] = None, **extra_weight_attrs): weight = Parameter(torch.empty(sum(output_partition_sizes), input_size_per_partition, @@ -699,7 +699,8 @@ def __init__(self, skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, reduce_results: bool = True, - quant_config: Optional[QuantizationConfig] = None): + quant_config: Optional[QuantizationConfig] = None, + layer_name: Optional[str] = None): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config) @@ -717,6 +718,7 @@ def __init__(self, input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, + layer_name=layer_name, weight_loader=self.weight_loader) if not reduce_results and (bias and not skip_bias_add): raise ValueError("When not reduce the results, adding bias to the " diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 524b4c894b9b..4535589b34b0 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -10,7 +10,7 @@ W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, CompressedTensorsScheme, CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8, - CompressedTensorsWNA16) + CompressedTensorsWNA16, CompressedTensorsUnquantized) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, QuantizationType, find_first_name_or_class_match) @@ -201,10 +201,20 @@ def _get_schema(self, weight_quant: BaseModel, raise NotImplementedError( "No compressed-tensors compatible scheme was found.") - def get_scheme(self, layer: torch.nn.Module) -> "CompressedTensorsScheme": + def get_scheme( + self, + layer: torch.nn.Module, + layer_name: Optional[str] = None) -> "CompressedTensorsScheme": + + if layer_name is not None: + if layer_name in self.ignore: + return CompressedTensorsUnquantized() + else: + # fall back to + layer_name="" layer_type_name = find_first_name_or_class_match( - name="", + name=layer_name, module=layer, targets=self.layer_quant_details.keys(), check_contains=True) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f03e34b9e7c9..03f5b2da38f8 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -57,6 +57,7 @@ class LlamaMLP(nn.Module): def __init__( self, + parent_name: str, hidden_size: int, intermediate_size: int, hidden_act: str, @@ -68,11 +69,13 @@ def __init__( input_size=hidden_size, output_sizes=[intermediate_size] * 2, bias=bias, - quant_config=quant_config) + quant_config=quant_config, + layer_name=f"{parent_name}.gate_up_proj") self.down_proj = RowParallelLinear(input_size=intermediate_size, output_size=hidden_size, bias=bias, - quant_config=quant_config) + quant_config=quant_config, + layer_name=f"{parent_name}.down_proj") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" "Only silu is supported for now.") @@ -89,6 +92,7 @@ class LlamaAttention(nn.Module): def __init__( self, + parent_name: str, hidden_size: int, num_heads: int, num_kv_heads: int, @@ -129,12 +133,14 @@ def __init__( total_num_kv_heads=self.total_num_kv_heads, bias=bias, quant_config=quant_config, + layer_name=f"{parent_name}.qkv_proj", ) self.o_proj = RowParallelLinear( input_size=self.total_num_heads * self.head_dim, output_size=hidden_size, bias=bias, quant_config=quant_config, + layer_name=f"{parent_name}.o_proj", ) self.rotary_emb = get_rope( @@ -170,6 +176,7 @@ class LlamaDecoderLayer(nn.Module): def __init__( self, + parent_name: str, config: LlamaConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -189,6 +196,7 @@ def __init__( attention_bias = getattr(config, "attention_bias", False) or getattr( config, "bias", False) self.self_attn = LlamaAttention( + parent_name=f"{parent_name}.self_attn", hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=getattr(config, "num_key_value_heads", @@ -201,6 +209,7 @@ def __init__( cache_config=cache_config, ) self.mlp = LlamaMLP( + parent_name=f"{parent_name}.mlp", hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, @@ -264,9 +273,10 @@ def __init__( ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda: LlamaDecoderLayer(config=config, - cache_config=cache_config, - quant_config=quant_config)) + lambda idx: LlamaDecoderLayer(parent_name=f"model.layers.{idx}.", + config=config, + cache_config=cache_config, + quant_config=quant_config)) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index c135b2035220..1196cd7de5e6 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -65,7 +65,7 @@ def make_layers( get_pp_group().world_size) modules = torch.nn.ModuleList( [PPMissingLayer() for _ in range(start_layer)] + - [layer_fn() for _ in range(start_layer, end_layer)] + + [layer_fn(idx) for idx in range(start_layer, end_layer)] + [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)]) return start_layer, end_layer, modules From 18a71ae2b9383b0ab5922c5986f2c3f5d898e765 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 13:04:27 +0000 Subject: [PATCH 03/26] stash --- vllm/model_executor/layers/linear.py | 3 +- .../compressed_tensors/compressed_tensors.py | 45 +++++++++++++++---- .../quantization/compressed_tensors/utils.py | 5 +++ vllm/model_executor/models/llama.py | 3 +- 4 files changed, 46 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index d6ed2647343a..d080875bdea1 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -540,7 +540,8 @@ def __init__(self, gather_output=False, skip_bias_add=skip_bias_add, params_dtype=params_dtype, - quant_config=quant_config) + quant_config=quant_config, + layer_name=layer_name) def weight_loader(self, param: Parameter, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 4535589b34b0..b73e5ad1b0a5 100644 --- 
a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -13,7 +13,7 @@ CompressedTensorsWNA16, CompressedTensorsUnquantized) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, - QuantizationType, find_first_name_or_class_match) + QuantizationType, find_first_name_or_class_match, _FUSED_LAYER_NAME_MAPPING) from vllm.platforms import current_platform @@ -205,13 +205,42 @@ def get_scheme( self, layer: torch.nn.Module, layer_name: Optional[str] = None) -> "CompressedTensorsScheme": - + if layer_name is not None: - if layer_name in self.ignore: + + # layer_name = model.layers.0.self_attn.qkv_proj + # proj_name = qkv_proj + proj_name = layer_name.split(".")[-1] + + # Fused layers like gate_up_proj or qkv_proj will not be fused + # in the safetensors checkpoint. So, we convert the name + # from the fused version to unfused + check to make sure that + # each shard of the fused layer has the same scheme. + if proj_name in _FUSED_LAYER_NAME_MAPPING: + # Convert fused_name --> shard_names + shard_names = [ + layer_name.replace(proj_name, unfused_proj_name) for + unfused_proj_name in _FUSED_LAYER_NAME_MAPPING[proj_name]] + + # Check if this layer should be skipped. + should_ignore_layer = shard_names[0] in self.ignore + + # Confirm that all the shards are skipped or none are skipped. + for shard_name in shard_names: + should_ignore_shard = (shard_name in should_ignore_layer) + if should_ignore_shard != should_ignore_layer: + raise ValueError( + f"Found a different quantization scheme for {shard_name} in " + f"{shard_names[0]} in layer {layer_name}. vLLM requires all " + "shards in fused layers to share the same scheme.") + else: + should_ignore_layer = layer_name in self.ignore + + if should_ignore_layer: return CompressedTensorsUnquantized() - else: - # fall back to - layer_name="" + + + layer_name="" layer_type_name = find_first_name_or_class_match( name=layer_name, @@ -245,7 +274,7 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, - **extra_weight_attrs): + layer_name: Optional[str], **extra_weight_attrs): """ Use the CompressedTensorsScheme associated with each layer to create the necessary parameters for the layer. 
See LinearMethodBase for param @@ -254,7 +283,7 @@ def create_weights(self, layer: torch.nn.Module, """ weight_loader = extra_weight_attrs.get("weight_loader") - scheme = self.quantization_config.get_scheme(layer=layer) + scheme = self.quantization_config.get_scheme(layer=layer, layer_name=layer_name) scheme.create_weights( layer=layer, input_size=input_size, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 5b44c215535b..5ccd82c1d127 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -5,6 +5,11 @@ from pydantic import BaseModel, Field from torch.nn import Module +# fused_name: List[shard_name] +_FUSED_LAYER_NAME_MAPPING = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": [""] +} class CompressionFormat(Enum): dense = "dense" diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 03f5b2da38f8..f872cb379917 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -273,7 +273,7 @@ def __init__( ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda idx: LlamaDecoderLayer(parent_name=f"model.layers.{idx}.", + lambda idx: LlamaDecoderLayer(parent_name=f"model.layers.{idx}", config=config, cache_config=cache_config, quant_config=quant_config)) @@ -481,6 +481,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + breakpoint() # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). 
Thus, handled exceptions should From 197dd19fd998ce9dc1d54bef7b0cb5d8ad5a7091 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 13:27:49 +0000 Subject: [PATCH 04/26] working e2e --- vllm/model_executor/layers/linear.py | 3 ++- .../compressed_tensors/compressed_tensors.py | 11 ++++------- .../schemes/compressed_tensors_unquantized.py | 1 - .../layers/quantization/compressed_tensors/utils.py | 3 ++- vllm/model_executor/model_loader/loader.py | 1 - vllm/model_executor/models/llama.py | 1 - 6 files changed, 8 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index d080875bdea1..099224c03b5e 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -377,7 +377,8 @@ def __init__(self, gather_output=gather_output, skip_bias_add=skip_bias_add, params_dtype=params_dtype, - quant_config=quant_config) + quant_config=quant_config, + layer_name=layer_name,) def weight_loader(self, param: Parameter, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index b73e5ad1b0a5..9466369a177b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -183,7 +183,8 @@ def _get_schema(self, weight_quant: BaseModel, group_size=weight_quant.group_size) if (self.quant_format == CompressionFormat.int_quantized.value or - self.quant_format == CompressionFormat.float_quantized.value): + self.quant_format == CompressionFormat.float_quantized.value or + self.quant_format == CompressionFormat.naive_quantized.value): if self._is_fp8_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8Fp8( input_dynamic=input_quant.dynamic) @@ -207,7 +208,6 @@ def get_scheme( layer_name: Optional[str] = None) -> "CompressedTensorsScheme": if layer_name is not None: - # layer_name = model.layers.0.self_attn.qkv_proj # proj_name = qkv_proj proj_name = layer_name.split(".")[-1] @@ -227,7 +227,7 @@ def get_scheme( # Confirm that all the shards are skipped or none are skipped. 
for shard_name in shard_names: - should_ignore_shard = (shard_name in should_ignore_layer) + should_ignore_shard = (shard_name in self.ignore) if should_ignore_shard != should_ignore_layer: raise ValueError( f"Found a different quantization scheme for {shard_name} in " @@ -239,11 +239,8 @@ def get_scheme( if should_ignore_layer: return CompressedTensorsUnquantized() - - layer_name="" - layer_type_name = find_first_name_or_class_match( - name=layer_name, + name="", module=layer, targets=self.layer_quant_details.keys(), check_contains=True) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py index 2c7fe3e0e411..7d0c1e11f7ba 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py @@ -29,7 +29,6 @@ def create_weights(self, layer: torch.nn.Module, weight = Parameter(torch.empty(sum(output_partition_sizes), input_size_per_partition, - device="cuda", dtype=params_dtype), requires_grad=False) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 5ccd82c1d127..10cc05fa3eaa 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -8,12 +8,13 @@ # fused_name: List[shard_name] _FUSED_LAYER_NAME_MAPPING = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], - "gate_up_proj": [""] + "gate_up_proj": ["gate_proj", "up_proj"] } class CompressionFormat(Enum): dense = "dense" sparse_bitmask = "sparse-bitmask" + naive_quantized = "naive-quantized" float_quantized = "float-quantized" int_quantized = "int-quantized" pack_quantized = "pack-quantized" diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index a1a2b0b323f6..af0054a079e1 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -282,7 +282,6 @@ def load_model(self, *, model_config: ModelConfig, model, "fall_back_to_pt_during_load", True)), ) - for _, module in model.named_modules(): quant_method = getattr(module, "quant_method", None) if quant_method is not None: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f872cb379917..d4fc4a38ab77 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -481,7 +481,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) - breakpoint() # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). 
Thus, handled exceptions should From eb95b49a44de63041cfcf3f60e7a7e543d3cc3ff Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 14:23:17 +0000 Subject: [PATCH 05/26] cleanup code a bit --- .../compressed_tensors/compressed_tensors.py | 39 ++----- .../quantization/compressed_tensors/utils.py | 101 +++++++++++++++--- 2 files changed, 93 insertions(+), 47 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 9466369a177b..468d737c329a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -13,7 +13,8 @@ CompressedTensorsWNA16, CompressedTensorsUnquantized) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, - QuantizationType, find_first_name_or_class_match, _FUSED_LAYER_NAME_MAPPING) + QuantizationType, find_first_name_or_class_match, + should_ignore_layer) from vllm.platforms import current_platform @@ -206,38 +207,10 @@ def get_scheme( self, layer: torch.nn.Module, layer_name: Optional[str] = None) -> "CompressedTensorsScheme": - - if layer_name is not None: - # layer_name = model.layers.0.self_attn.qkv_proj - # proj_name = qkv_proj - proj_name = layer_name.split(".")[-1] - - # Fused layers like gate_up_proj or qkv_proj will not be fused - # in the safetensors checkpoint. So, we convert the name - # from the fused version to unfused + check to make sure that - # each shard of the fused layer has the same scheme. - if proj_name in _FUSED_LAYER_NAME_MAPPING: - # Convert fused_name --> shard_names - shard_names = [ - layer_name.replace(proj_name, unfused_proj_name) for - unfused_proj_name in _FUSED_LAYER_NAME_MAPPING[proj_name]] - - # Check if this layer should be skipped. - should_ignore_layer = shard_names[0] in self.ignore - - # Confirm that all the shards are skipped or none are skipped. - for shard_name in shard_names: - should_ignore_shard = (shard_name in self.ignore) - if should_ignore_shard != should_ignore_layer: - raise ValueError( - f"Found a different quantization scheme for {shard_name} in " - f"{shard_names[0]} in layer {layer_name}. vLLM requires all " - "shards in fused layers to share the same scheme.") - else: - should_ignore_layer = layer_name in self.ignore - - if should_ignore_layer: - return CompressedTensorsUnquantized() + + # Check if the layer is ignored (skipped for quantization). 
+ if should_ignore_layer(layer_name, ignore=self.ignore): + return CompressedTensorsUnquantized() layer_type_name = find_first_name_or_class_match( name="", diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 10cc05fa3eaa..c191105f9f90 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -5,12 +5,6 @@ from pydantic import BaseModel, Field from torch.nn import Module -# fused_name: List[shard_name] -_FUSED_LAYER_NAME_MAPPING = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - "gate_up_proj": ["gate_proj", "up_proj"] -} - class CompressionFormat(Enum): dense = "dense" sparse_bitmask = "sparse-bitmask" @@ -82,6 +76,71 @@ class QuantizationArgs(BaseModel): ) +# fused_name: List[shard_name] +_FUSED_LAYER_NAME_MAPPING = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] +} + +def should_ignore_layer(layer_name: Optional[str], ignore: Iterable[str]) -> bool: + if layer_name is None: + return False + + # layer_name = model.layers.0.self_attn.qkv_proj + # proj_name = qkv_proj + proj_name = layer_name.split(".")[-1] + + # Fused layers like gate_up_proj or qkv_proj will not be fused + # in the safetensors checkpoint. So, we convert the name + # from the fused version to unfused + check to make sure that + # each shard of the fused layer has the same scheme. + if proj_name in _FUSED_LAYER_NAME_MAPPING: + shard_proj_names = _FUSED_LAYER_NAME_MAPPING[proj_name] + + # Convert fused_name --> [shard_names] + shard_names = [ + layer_name.replace(proj_name, shard_proj_name) for + shard_proj_name in shard_proj_names + ] + + # Layer should be ignored if shards are ignored. + should_ignore_layer = None + for shard_name in shard_names: + should_ignore_shard = check_equal_or_regex_match( + layer_name=shard_name, targets=ignore) + + # If shard_idx=0, set layer ignore to match shard. + if should_ignore_layer is None: + should_ignore_layer = should_ignore_shard + + # If shard_idx=1+ confirm scheme matches prior shards. + elif should_ignore_shard != should_ignore_layer: + raise ValueError( + f"Found a different quantization schemes for " + f"{shard_proj_names} in {layer_name}. vLLM " + "requires all to use the same scheme.") + + # Unfused layers like down_proj and o_proj will match + # the safetensors checkpoint already. + else: + should_ignore_layer = check_equal_or_regex_match( + layer_name=layer_name, targets=ignore) + + return should_ignore_layer + + +def check_equal_or_regex_match(layer_name: str, + targets: Iterable[str]) -> bool: + """ + Checks whether a layer_name is exactly equal or a regex match for + if target starts with 're:' to any target in list. 
+ """ + for target in targets: + if _is_equal_or_regex_match(layer_name, target): + return True + return False + + def find_first_name_or_class_match( name: str, module: Module, @@ -117,13 +176,27 @@ def _find_first_match(value: str, """ for target in targets: - if target.startswith("re:"): - pattern = target[3:] - if re.match(pattern, value): - return target - elif check_contains: - if target.lower() in value.lower(): - return target - elif target == value: + if _is_equal_or_regex_match(value, target, + check_contains=check_contains): return target + return None + + +def _is_equal_or_regex_match(value: str, target: str, + check_contains: bool = False) -> bool: + """ + Checks whether a value is exactly equal or a regex match for target + if taget starts with 're:'. If check_contains is set to True, + additionally checks if the target string is contained within the value. + """ + + if target.startswith("re:"): + pattern = target[3:] + if re.match(pattern, value): + return target + elif check_contains: + if target.lower() in value.lower(): + return target + elif target == value: + return target From e3cf13554f5edfa9bf768441c09c47a77fd09cd5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 18:30:10 +0000 Subject: [PATCH 06/26] nonuniform --- .../compressed_tensors/compressed_tensors.py | 86 +++++++++++-------- .../quantization/compressed_tensors/utils.py | 40 ++++++--- 2 files changed, 79 insertions(+), 47 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 468d737c329a..f207b9ca2082 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -13,18 +13,21 @@ CompressedTensorsWNA16, CompressedTensorsUnquantized) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, - QuantizationType, find_first_name_or_class_match, - should_ignore_layer) + QuantizationType, find_matched_target, should_ignore_layer) from vllm.platforms import current_platform class CompressedTensorsConfig(QuantizationConfig): - def __init__(self, layer_quant_details: Dict[str, Any], ignore: List[str], + def __init__(self, + target_scheme_map: Dict[str, Any], + ignore: List[str], quant_format: str): + self.ignore = ignore - self.layer_quant_details = layer_quant_details self.quant_format = quant_format + # Map from [target -> scheme] + self.target_scheme_map = target_scheme_map def get_linear_method(self) -> "CompressedTensorsLinearMethod": return CompressedTensorsLinearMethod(self) @@ -51,7 +54,7 @@ def get_quant_method( @classmethod def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": - layer_quant_details: Dict[str, Any] = dict() + target_scheme_map: Dict[str, Any] = dict() ignore: List[str] = config.get("ignore", None) quant_format: str = config.get("format", None) @@ -63,21 +66,21 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": # details follow the structure defined by the QuantizationArgs # pydantic model, which is used to verify the structure of the # quant_config and also store the details for later use. 
- for key, quant_config in config["config_groups"].items(): + for _, quant_config in config["config_groups"].items(): targets = quant_config.get("targets") for target in targets: - layer_quant_details[target] = {} - layer_quant_details[target][ + target_scheme_map[target] = {} + target_scheme_map[target][ "weights"] = QuantizationArgs.parse_obj( quant_config.get("weights")) try: - layer_quant_details[target][ + target_scheme_map[target][ "input_activations"] = QuantizationArgs.parse_obj( quant_config.get("input_activations")) except Exception: - layer_quant_details[target]["input_activations"] = None + target_scheme_map[target]["input_activations"] = None - return cls(layer_quant_details=layer_quant_details, + return cls(target_scheme_map=target_scheme_map, ignore=ignore, quant_format=quant_format) @@ -165,8 +168,10 @@ def _is_wNa16_group_channel(self, weight_quant: BaseModel, return (is_channel_group and input_quant_none and is_symmetric and is_static) - def _get_schema(self, weight_quant: BaseModel, - input_quant: BaseModel) -> "CompressedTensorsScheme": + def _get_scheme_from_parts( + self, + weight_quant: BaseModel, + input_quant: BaseModel) -> "CompressedTensorsScheme": if self._is_wNa16_group_channel(weight_quant, input_quant): self._check_gptq_and_marlin_can_run() @@ -203,33 +208,44 @@ def _get_schema(self, weight_quant: BaseModel, raise NotImplementedError( "No compressed-tensors compatible scheme was found.") - def get_scheme( - self, - layer: torch.nn.Module, - layer_name: Optional[str] = None) -> "CompressedTensorsScheme": + def get_scheme(self, + layer: torch.nn.Module, + layer_name: Optional[str] = None) -> "CompressedTensorsScheme": + """ + compressed-tensors supports non uniform in the following way: - # Check if the layer is ignored (skipped for quantization). - if should_ignore_layer(layer_name, ignore=self.ignore): - return CompressedTensorsUnquantized() + ignore: List of layer_names or nn.Module names to be ignored. + targets of config_groups: There can be N config_groups which each + have a quantization scheme. Each config_group has a list of targets + which can be a full layer_name, a regex for a layer_name, or + an nn.Module name. - layer_type_name = find_first_name_or_class_match( - name="", - module=layer, - targets=self.layer_quant_details.keys(), - check_contains=True) + We first check whether a layer is in the ignore group and use + CompressedTensorsUnquantized (i.e. fp16/bf16) scheme for the layer - if layer_type_name is None: - raise ValueError(f"Could not matching target for layer {layer}") + We then detect whether a layer_name is found in any target and + use the quantization scheme corresponding to the matched target + to select the CompressedTensorsScheme used for infernece. + """ - layer_quant_details: Dict[str, Any] = self.layer_quant_details.get( - layer_type_name, None) - if layer_quant_details is None: - raise ValueError( - f"Could not find quantization details for {layer}.") + # Check if the layer is skipped for quantization. + # TODO (@robertgshaw2): support module names + if should_ignore_layer(layer_name, ignore=self.ignore): + return CompressedTensorsUnquantized() - return self._get_schema( - weight_quant=layer_quant_details["weights"], - input_quant=layer_quant_details["input_activations"]) + # Find the "target" in the compressed-tensors config + # that our layer conforms to. 
+ matched_target = find_matched_target( + layer_name=layer_name, + module=layer, + targets=self.target_scheme_map.keys()) + + # Find the quant_scheme + scheme = self.target_scheme_map[matched_target] + + return self._get_scheme_from_parts( + weight_quant=scheme["weights"], + input_quant=scheme["input_activations"]) class CompressedTensorsLinearMethod(LinearMethodBase): diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index c191105f9f90..84be26b5207b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -141,25 +141,41 @@ def check_equal_or_regex_match(layer_name: str, return False -def find_first_name_or_class_match( - name: str, +def find_matched_target( + layer_name: Optional[str], module: Module, - targets: Iterable[str], - check_contains: bool = False) -> Optional[str]: + targets: Iterable[str]) -> str: """ - Helper function to map the quantization details listed in the config - for a given list of targets against each model layer. First uses the - layer name to try and find a match. If no name match is found, uses - the layer class name. Returns None otherwise. + Helper function to look up which "target" in the compressed-tensors + config that a layer corresponds to. - :param name: layer name + Recall that a compressed-tensors configs has a concept of + config_groups, where each layer can be quantized with with a different + scheme. + + targets in each config_group will be a list of either layer names + (or regexes corresponding to layer names) or names of torch Modules. + + First, we try to match the layer_name with a target + Second, we try to match the module's name with a target + + :param layer_name: layer name :param module: torch.nn.Module :param targets: list of targets to match the layer against - :param check_contains: whether or not to do a substring match """ - return _find_first_match(name, targets) or _find_first_match( - module.__class__.__name__, targets, check_contains) + if layer_name is None: + layer_name = "" + + matched_target= (_find_first_match(layer_name, targets) or + _find_first_match(module.__class__.__name__, targets, True)) + + if matched_target is None: + raise ValueError( + f"Unable to find matching target for {module} in the " + "compressed-tensors config.") + + return matched_target def _find_first_match(value: str, From c6028554e84b2ef38d819e46bdc29dc1ddd57876 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 19:32:33 +0000 Subject: [PATCH 07/26] fix nit --- .../quantization/compressed_tensors/compressed_tensors.py | 5 +++-- vllm/model_executor/model_loader/loader.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index f207b9ca2082..2e6a3fc55570 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -208,8 +208,7 @@ def _get_scheme_from_parts( raise NotImplementedError( "No compressed-tensors compatible scheme was found.") - def get_scheme(self, - layer: torch.nn.Module, + def get_scheme(self, layer: torch.nn.Module, layer_name: Optional[str] = None) -> "CompressedTensorsScheme": """ compressed-tensors supports 
non uniform in the following way: @@ -235,6 +234,8 @@ def get_scheme(self, # Find the "target" in the compressed-tensors config # that our layer conforms to. + # TODO (@robertgshaw): add compressed-tensors as dep + # so we do not have to re-write these functions matched_target = find_matched_target( layer_name=layer_name, module=layer, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index af0054a079e1..a1a2b0b323f6 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -282,6 +282,7 @@ def load_model(self, *, model_config: ModelConfig, model, "fall_back_to_pt_during_load", True)), ) + for _, module in model.named_modules(): quant_method = getattr(module, "quant_method", None) if quant_method is not None: From d9355f41eff00f5ad51d65a58f72d49e4e00a8ed Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 19:50:08 +0000 Subject: [PATCH 08/26] format --- vllm/model_executor/layers/linear.py | 72 ++++++++++--------- .../compressed_tensors/compressed_tensors.py | 49 +++++++------ .../quantization/compressed_tensors/utils.py | 71 +++++++++--------- vllm/model_executor/models/llama.py | 21 +++--- 4 files changed, 113 insertions(+), 100 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 099224c03b5e..9edadf4f6110 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -72,7 +72,6 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, - layer_name: Optional[str] = None, **extra_weight_attrs): """Create weights for a linear layer. The weights will be set as attributes of the layer. @@ -106,7 +105,6 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, - layer_name: Optional[str] = None, **extra_weight_attrs): weight = Parameter(torch.empty(sum(output_partition_sizes), input_size_per_partition, @@ -143,7 +141,6 @@ def __init__( skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, - layer_name: Optional[str] = None, ): super().__init__() @@ -176,23 +173,26 @@ class ReplicatedLinear(LinearBase): quant_config: Quantization configure. """ - def __init__(self, - input_size: int, - output_size: int, - bias: bool = True, - skip_bias_add: bool = False, - params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None, - layer_name: Optional[str] = None, + def __init__( + self, + input_size: int, + output_size: int, + bias: bool = True, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + layer_name: Optional[str] = None, ): super().__init__(input_size, output_size, skip_bias_add, params_dtype, - quant_config, layer_name) + quant_config) # All the linear layer supports quant method. 
assert self.quant_method is not None - self.quant_method.create_weights(self, self.input_size, - [self.output_size], self.input_size, - self.output_size, self.params_dtype, + self.quant_method.create_weights(self, + self.input_size, [self.output_size], + self.input_size, + self.output_size, + self.params_dtype, layer_name=layer_name) if bias: @@ -247,16 +247,17 @@ class ColumnParallelLinear(LinearBase): the list would be size 3. """ - def __init__(self, - input_size: int, - output_size: int, - bias: bool = True, - gather_output: bool = False, - skip_bias_add: bool = False, - params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None, - output_sizes: Optional[List[int]] = None, - layer_name: Optional[str] = None, + def __init__( + self, + input_size: int, + output_size: int, + bias: bool = True, + gather_output: bool = False, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + output_sizes: Optional[List[int]] = None, + layer_name: Optional[str] = None, ): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config) @@ -285,7 +286,8 @@ def __init__(self, output_size=self.output_size, params_dtype=self.params_dtype, layer_name=layer_name, - weight_loader=self.weight_loader,) + weight_loader=self.weight_loader, + ) if bias: self.bias = Parameter( torch.empty(self.output_size_per_partition, @@ -371,14 +373,16 @@ def __init__(self, self.output_sizes = output_sizes tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) - super().__init__(input_size=input_size, - output_size=sum(output_sizes), - bias=bias, - gather_output=gather_output, - skip_bias_add=skip_bias_add, - params_dtype=params_dtype, - quant_config=quant_config, - layer_name=layer_name,) + super().__init__( + input_size=input_size, + output_size=sum(output_sizes), + bias=bias, + gather_output=gather_output, + skip_bias_add=skip_bias_add, + params_dtype=params_dtype, + quant_config=quant_config, + layer_name=layer_name, + ) def weight_loader(self, param: Parameter, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 2e6a3fc55570..85ba71ffe6d3 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -8,9 +8,9 @@ QuantizationConfig) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, - CompressedTensorsScheme, CompressedTensorsW4A16Sparse24, - CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8, - CompressedTensorsWNA16, CompressedTensorsUnquantized) + CompressedTensorsScheme, CompressedTensorsUnquantized, + CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, + CompressedTensorsW8A8Int8, CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, QuantizationType, find_matched_target, should_ignore_layer) @@ -19,9 +19,7 @@ class CompressedTensorsConfig(QuantizationConfig): - def __init__(self, - target_scheme_map: Dict[str, Any], - ignore: List[str], + def __init__(self, target_scheme_map: Dict[str, Any], ignore: List[str], quant_format: str): self.ignore = ignore @@ -169,8 +167,7 @@ def 
_is_wNa16_group_channel(self, weight_quant: BaseModel, and is_static) def _get_scheme_from_parts( - self, - weight_quant: BaseModel, + self, weight_quant: BaseModel, input_quant: BaseModel) -> "CompressedTensorsScheme": if self._is_wNa16_group_channel(weight_quant, input_quant): @@ -188,9 +185,10 @@ def _get_scheme_from_parts( strategy=weight_quant.strategy, group_size=weight_quant.group_size) - if (self.quant_format == CompressionFormat.int_quantized.value or - self.quant_format == CompressionFormat.float_quantized.value or - self.quant_format == CompressionFormat.naive_quantized.value): + if (self.quant_format == CompressionFormat.int_quantized.value + or self.quant_format == CompressionFormat.float_quantized.value + or self.quant_format + == CompressionFormat.naive_quantized.value): if self._is_fp8_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8Fp8( input_dynamic=input_quant.dynamic) @@ -208,8 +206,10 @@ def _get_scheme_from_parts( raise NotImplementedError( "No compressed-tensors compatible scheme was found.") - def get_scheme(self, layer: torch.nn.Module, - layer_name: Optional[str] = None) -> "CompressedTensorsScheme": + def get_scheme( + self, + layer: torch.nn.Module, + layer_name: Optional[str] = None) -> "CompressedTensorsScheme": """ compressed-tensors supports non uniform in the following way: @@ -233,17 +233,17 @@ def get_scheme(self, layer: torch.nn.Module, return CompressedTensorsUnquantized() # Find the "target" in the compressed-tensors config - # that our layer conforms to. + # that our layer conforms to. # TODO (@robertgshaw): add compressed-tensors as dep # so we do not have to re-write these functions matched_target = find_matched_target( layer_name=layer_name, module=layer, targets=self.target_scheme_map.keys()) - - # Find the quant_scheme + + # Find the quant_scheme scheme = self.target_scheme_map[matched_target] - + return self._get_scheme_from_parts( weight_quant=scheme["weights"], input_quant=scheme["input_activations"]) @@ -257,11 +257,15 @@ def __init__(self, quantization_config: CompressedTensorsConfig): def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.scheme.process_weights_after_loading(layer) - def create_weights(self, layer: torch.nn.Module, + def create_weights(self, + layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, - output_size: int, params_dtype: torch.dtype, - layer_name: Optional[str], **extra_weight_attrs): + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + layer_name: Optional[str] = None, + **extra_weight_attrs): """ Use the CompressedTensorsScheme associated with each layer to create the necessary parameters for the layer. 
See LinearMethodBase for param @@ -270,7 +274,8 @@ def create_weights(self, layer: torch.nn.Module, """ weight_loader = extra_weight_attrs.get("weight_loader") - scheme = self.quantization_config.get_scheme(layer=layer, layer_name=layer_name) + scheme = self.quantization_config.get_scheme(layer=layer, + layer_name=layer_name) scheme.create_weights( layer=layer, input_size=input_size, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 84be26b5207b..c10bcb2e2c22 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -5,6 +5,7 @@ from pydantic import BaseModel, Field from torch.nn import Module + class CompressionFormat(Enum): dense = "dense" sparse_bitmask = "sparse-bitmask" @@ -82,10 +83,12 @@ class QuantizationArgs(BaseModel): "gate_up_proj": ["gate_proj", "up_proj"] } -def should_ignore_layer(layer_name: Optional[str], ignore: Iterable[str]) -> bool: + +def should_ignore_layer(layer_name: Optional[str], + ignore: Iterable[str]) -> bool: if layer_name is None: return False - + # layer_name = model.layers.0.self_attn.qkv_proj # proj_name = qkv_proj proj_name = layer_name.split(".")[-1] @@ -96,11 +99,11 @@ def should_ignore_layer(layer_name: Optional[str], ignore: Iterable[str]) -> boo # each shard of the fused layer has the same scheme. if proj_name in _FUSED_LAYER_NAME_MAPPING: shard_proj_names = _FUSED_LAYER_NAME_MAPPING[proj_name] - + # Convert fused_name --> [shard_names] shard_names = [ - layer_name.replace(proj_name, shard_proj_name) for - shard_proj_name in shard_proj_names + layer_name.replace(proj_name, shard_proj_name) + for shard_proj_name in shard_proj_names ] # Layer should be ignored if shards are ignored. @@ -108,28 +111,28 @@ def should_ignore_layer(layer_name: Optional[str], ignore: Iterable[str]) -> boo for shard_name in shard_names: should_ignore_shard = check_equal_or_regex_match( layer_name=shard_name, targets=ignore) - - # If shard_idx=0, set layer ignore to match shard. + + # If shard_idx=0, set layer ignore to match shard. if should_ignore_layer is None: should_ignore_layer = should_ignore_shard - + # If shard_idx=1+ confirm scheme matches prior shards. elif should_ignore_shard != should_ignore_layer: - raise ValueError( - f"Found a different quantization schemes for " - f"{shard_proj_names} in {layer_name}. vLLM " - "requires all to use the same scheme.") + raise ValueError(f"Found a different quantization schemes for " + f"{shard_proj_names} in {layer_name}. vLLM " + "requires all to use the same scheme.") # Unfused layers like down_proj and o_proj will match # the safetensors checkpoint already. 
else: - should_ignore_layer = check_equal_or_regex_match( - layer_name=layer_name, targets=ignore) - + should_ignore_layer = check_equal_or_regex_match(layer_name=layer_name, + targets=ignore) + + assert should_ignore_layer is not None return should_ignore_layer -def check_equal_or_regex_match(layer_name: str, +def check_equal_or_regex_match(layer_name: str, targets: Iterable[str]) -> bool: """ Checks whether a layer_name is exactly equal or a regex match for @@ -141,10 +144,8 @@ def check_equal_or_regex_match(layer_name: str, return False -def find_matched_target( - layer_name: Optional[str], - module: Module, - targets: Iterable[str]) -> str: +def find_matched_target(layer_name: Optional[str], module: Module, + targets: Iterable[str]) -> str: """ Helper function to look up which "target" in the compressed-tensors config that a layer corresponds to. @@ -167,13 +168,13 @@ def find_matched_target( if layer_name is None: layer_name = "" - matched_target= (_find_first_match(layer_name, targets) or - _find_first_match(module.__class__.__name__, targets, True)) + matched_target = (_find_first_match(layer_name, targets) + or _find_first_match(module.__class__.__name__, targets, + True)) if matched_target is None: - raise ValueError( - f"Unable to find matching target for {module} in the " - "compressed-tensors config.") + raise ValueError(f"Unable to find matching target for {module} in the " + "compressed-tensors config.") return matched_target @@ -192,27 +193,29 @@ def _find_first_match(value: str, """ for target in targets: - if _is_equal_or_regex_match(value, target, - check_contains=check_contains): + if _is_equal_or_regex_match(value, + target, + check_contains=check_contains): return target - return None -def _is_equal_or_regex_match(value: str, target: str, +def _is_equal_or_regex_match(value: str, + target: str, check_contains: bool = False) -> bool: """ Checks whether a value is exactly equal or a regex match for target - if taget starts with 're:'. If check_contains is set to True, + if target starts with 're:'. If check_contains is set to True, additionally checks if the target string is contained within the value. """ - + if target.startswith("re:"): pattern = target[3:] if re.match(pattern, value): - return target + return True elif check_contains: if target.lower() in value.lower(): - return target + return True elif target == value: - return target + return True + return False diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 78619e19941d..50282af5c50e 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -71,11 +71,12 @@ def __init__( bias=bias, quant_config=quant_config, layer_name=f"{parent_name}.gate_up_proj") - self.down_proj = RowParallelLinear(input_size=intermediate_size, - output_size=hidden_size, - bias=bias, - quant_config=quant_config, - layer_name=f"{parent_name}.down_proj") + self.down_proj = RowParallelLinear( + input_size=intermediate_size, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + layer_name=f"{parent_name}.down_proj") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" "Only silu is supported for now.") @@ -276,11 +277,11 @@ def __init__( else: self.embed_tokens = PPMissingLayer() self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda idx: LlamaDecoderLayer(parent_name=f"model.layers.{idx}", - config=config, - cache_config=cache_config, - quant_config=quant_config)) + config.num_hidden_layers, lambda layer_idx: LlamaDecoderLayer( + parent_name=f"model.layers.{layer_idx}", + config=config, + cache_config=cache_config, + quant_config=quant_config)) if get_pp_group().is_last_rank: self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) else: From f036956d4652e18e4821cf6b512ce1281c7a2b19 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 19:51:34 +0000 Subject: [PATCH 09/26] tweak gpt2 --- vllm/model_executor/models/gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index d309a2b27f5d..23e5eabe3ddc 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -186,7 +186,7 @@ def __init__( self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, - lambda: GPT2Block(config, cache_config, quant_config)) + lambda layer_idx: GPT2Block(config, cache_config, quant_config)) self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) def forward( From ba115ebe132fbfcbec06687f8094ceb84715d2f6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 21:18:31 +0000 Subject: [PATCH 10/26] added test --- ...a-3-8B-Instruct-nonuniform-compressed-tensors.yaml | 11 +++++++++++ .buildkite/lm-eval-harness/configs/models-small.txt | 1 + 2 files changed, 12 insertions(+) create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml new file mode 100644 index 000000000000..3964f3be5e87 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.758 + - name: "exact_match,flexible-extract" + value: 0.759 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt index 3d1306f6bc4f..869fc9cef377 100644 --- a/.buildkite/lm-eval-harness/configs/models-small.txt +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -2,4 +2,5 @@ Meta-Llama-3-8B-Instruct.yaml Meta-Llama-3-8B-Instruct-FP8.yaml Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml +Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml From 6e28eade7f5b4cf5d6fa128d071c4381b6df4cd6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 01:21:07 +0000 Subject: [PATCH 11/26] update to parent --- .../schemes/compressed_tensors_w8a8_fp8.py | 9 +++++++++ 
vllm/model_executor/models/llama.py | 20 +++++++++---------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index b93425fb2d62..16acc412aea1 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -8,6 +8,7 @@ apply_fp8_linear, create_per_tensor_scale_param, cutlass_fp8_supported, requantize_with_max_scale) from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform __all__ = ["CompressedTensorsW8A8Fp8"] @@ -18,10 +19,18 @@ def __init__(self, input_dynamic: bool): self.input_dynamic = input_dynamic self.cutlass_fp8_supported = cutlass_fp8_supported() + # For GPUs that lack FP8 hardware support, we can leverage the Marlin + # kernel for fast weight-only FP8 quantization + capability = current_platform.get_device_capability() + capability = capability[0] * 10 + capability[1] + self.use_marlin = capability < 89 + # W8A8-Fp8 kernels support only per-tensor and per-channel cases. # So if we have a fused module (QKV, MLP) with per tensor scales (thus N # scales being passed to the kernel), we requantize with a single scale. def process_weights_after_loading(self, layer) -> None: + if self.use_marlin: + weight = layer.weight # Dequant -> Quant with max scale. max_w_scale, weight = requantize_with_max_scale( weight=layer.weight, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 50282af5c50e..9e655317b8a1 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -57,7 +57,7 @@ class LlamaMLP(nn.Module): def __init__( self, - parent_name: str, + prefix: str, hidden_size: int, intermediate_size: int, hidden_act: str, @@ -70,13 +70,13 @@ def __init__( output_sizes=[intermediate_size] * 2, bias=bias, quant_config=quant_config, - layer_name=f"{parent_name}.gate_up_proj") + layer_name=f"{prefix}.gate_up_proj") self.down_proj = RowParallelLinear( input_size=intermediate_size, output_size=hidden_size, bias=bias, quant_config=quant_config, - layer_name=f"{parent_name}.down_proj") + layer_name=f"{prefix}.down_proj") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" "Only silu is supported for now.") @@ -93,7 +93,7 @@ class LlamaAttention(nn.Module): def __init__( self, - parent_name: str, + prefix: str, hidden_size: int, num_heads: int, num_kv_heads: int, @@ -134,14 +134,14 @@ def __init__( total_num_kv_heads=self.total_num_kv_heads, bias=bias, quant_config=quant_config, - layer_name=f"{parent_name}.qkv_proj", + layer_name=f"{prefix}.qkv_proj", ) self.o_proj = RowParallelLinear( input_size=self.total_num_heads * self.head_dim, output_size=hidden_size, bias=bias, quant_config=quant_config, - layer_name=f"{parent_name}.o_proj", + layer_name=f"{prefix}.o_proj", ) self.rotary_emb = get_rope( @@ -177,7 +177,7 @@ class LlamaDecoderLayer(nn.Module): def __init__( self, - parent_name: str, + prefix: str, config: LlamaConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -197,7 +197,7 @@ def __init__( attention_bias = getattr(config, "attention_bias", False) or getattr( config, "bias", False) self.self_attn = LlamaAttention( - parent_name=f"{parent_name}.self_attn", + prefix=f"{prefix}.self_attn", hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=getattr(config, "num_key_value_heads", @@ -210,7 +210,7 @@ def __init__( cache_config=cache_config, ) self.mlp = LlamaMLP( - parent_name=f"{parent_name}.mlp", + prefix=f"{prefix}.mlp", hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, @@ -278,7 +278,7 @@ def __init__( self.embed_tokens = PPMissingLayer() self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda layer_idx: LlamaDecoderLayer( - parent_name=f"model.layers.{layer_idx}", + prefix=f"model.layers.{layer_idx}", config=config, cache_config=cache_config, quant_config=quant_config)) From 793abc740b21f9a808d4f7f7506ba41c134eb6ac Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 01:24:27 +0000 Subject: [PATCH 12/26] format --- vllm/model_executor/models/llama.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 9e655317b8a1..781d901462b9 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -71,12 +71,11 @@ def __init__( bias=bias, quant_config=quant_config, layer_name=f"{prefix}.gate_up_proj") - self.down_proj = RowParallelLinear( - input_size=intermediate_size, - output_size=hidden_size, - bias=bias, - quant_config=quant_config, - layer_name=f"{prefix}.down_proj") + self.down_proj = RowParallelLinear(input_size=intermediate_size, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + layer_name=f"{prefix}.down_proj") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" "Only silu is supported for now.") From bb7d44c213893276ea946c075d115a41d90c3bab Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 01:50:49 +0000 Subject: [PATCH 13/26] format --- vllm/model_executor/layers/linear.py | 24 ++++++++++++------- .../compressed_tensors/compressed_tensors.py | 15 ++++-------- vllm/model_executor/models/gpt2.py | 3 ++- vllm/model_executor/models/llama.py | 19 ++++++++------- vllm/model_executor/models/utils.py | 11 +++++---- 5 files changed, 40 insertions(+), 32 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 9edadf4f6110..8f9343063b7c 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -171,6 +171,8 @@ class ReplicatedLinear(LinearBase): skip_bias_add: If true, skip adding bias but instead return it. params_dtype: Data type for the parameters. quant_config: Quantization configure. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.qkv_proj) """ def __init__( @@ -181,7 +183,7 @@ def __init__( skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, - layer_name: Optional[str] = None, + prefix: Optional[str] = None, ): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config) @@ -193,7 +195,7 @@ def __init__( self.input_size, self.output_size, self.params_dtype, - layer_name=layer_name) + layer_name=prefix) if bias: self.bias = Parameter( @@ -245,6 +247,8 @@ class ColumnParallelLinear(LinearBase): quant_config: Quantization configure. output_sizes: list of output sizes packed into one output, like for QKV the list would be size 3. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.qkv_proj) """ def __init__( @@ -257,7 +261,7 @@ def __init__( params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, output_sizes: Optional[List[int]] = None, - layer_name: Optional[str] = None, + prefix: Optional[str] = None, ): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config) @@ -285,7 +289,7 @@ def __init__( input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - layer_name=layer_name, + layer_name=prefix, weight_loader=self.weight_loader, ) if bias: @@ -359,6 +363,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear): skip adding bias but instead return it. params_dtype: Data type for the parameters. quant_config: Quantization configure. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.qkv_proj) """ def __init__(self, @@ -369,7 +375,7 @@ def __init__(self, skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, - layer_name: Optional[str] = None): + prefix: Optional[str] = None): self.output_sizes = output_sizes tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) @@ -381,7 +387,7 @@ def __init__(self, skip_bias_add=skip_bias_add, params_dtype=params_dtype, quant_config=quant_config, - layer_name=layer_name, + prefix=prefix, ) def weight_loader(self, @@ -502,6 +508,8 @@ class QKVParallelLinear(ColumnParallelLinear): skip adding bias but instead return it. params_dtype: Data type for the parameters. quant_config: Quantization configure. 
+ prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.qkv_proj) """ def __init__(self, @@ -513,7 +521,7 @@ def __init__(self, skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, - layer_name: Optional[str] = None): + prefix: Optional[str] = None): self.hidden_size = hidden_size self.head_size = head_size self.total_num_heads = total_num_heads @@ -546,7 +554,7 @@ def __init__(self, skip_bias_add=skip_bias_add, params_dtype=params_dtype, quant_config=quant_config, - layer_name=layer_name) + prefix=prefix) def weight_loader(self, param: Parameter, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 85ba71ffe6d3..e4df97c465fd 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -257,25 +257,20 @@ def __init__(self, quantization_config: CompressedTensorsConfig): def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.scheme.process_weights_after_loading(layer) - def create_weights(self, - layer: torch.nn.Module, + def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - layer_name: Optional[str] = None, + output_partition_sizes: List[int], input_size: int, + output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): """ Use the CompressedTensorsScheme associated with each layer to create the necessary parameters for the layer. 
See LinearMethodBase for param details - """ weight_loader = extra_weight_attrs.get("weight_loader") + layer_name = extra_weight_attrs.get("prefix") - scheme = self.quantization_config.get_scheme(layer=layer, - layer_name=layer_name) + scheme = self.quantization_config.get_scheme(layer, layer_name) scheme.create_weights( layer=layer, input_size=input_size, diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 23e5eabe3ddc..f64f4e577408 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -186,7 +186,8 @@ def __init__( self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, - lambda layer_idx: GPT2Block(config, cache_config, quant_config)) + lambda prefix: GPT2Block(config, cache_config, quant_config), + prefix="") self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) def forward( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 781d901462b9..c00ec9ed82db 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -57,12 +57,12 @@ class LlamaMLP(nn.Module): def __init__( self, - prefix: str, hidden_size: int, intermediate_size: int, hidden_act: str, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, + prefix: str = "", ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( @@ -176,10 +176,10 @@ class LlamaDecoderLayer(nn.Module): def __init__( self, - prefix: str, config: LlamaConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -196,7 +196,6 @@ def __init__( attention_bias = getattr(config, "attention_bias", False) or getattr( config, "bias", False) self.self_attn = LlamaAttention( - prefix=f"{prefix}.self_attn", hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=getattr(config, "num_key_value_heads", @@ -207,14 +206,15 @@ def __init__( quant_config=quant_config, bias=attention_bias, cache_config=cache_config, + prefix=f"{prefix}.self_attn", ) self.mlp = LlamaMLP( - prefix=f"{prefix}.mlp", hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, bias=getattr(config, "mlp_bias", False), + prefix=f"{prefix}.mlp", ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -276,11 +276,12 @@ def __init__( else: self.embed_tokens = PPMissingLayer() self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, lambda layer_idx: LlamaDecoderLayer( - prefix=f"model.layers.{layer_idx}", - config=config, - cache_config=cache_config, - quant_config=quant_config)) + config.num_hidden_layers, + lambda prefix: LlamaDecoderLayer(config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), + prefix="model.layers") if get_pp_group().is_last_rank: self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) else: diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 1196cd7de5e6..db890b7f268c 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -53,7 +53,9 @@ def __init__(self, *args, **kwargs): def make_layers( - num_hidden_layers: int, layer_fn: Callable[[], torch.nn.Module] + num_hidden_layers: 
int, + layer_fn: Callable[[], torch.nn.Module], + prefix: str, ) -> Tuple[int, int, torch.nn.ModuleList]: """Make a list of layers with the given layer function, taking pipeline parallelism into account. @@ -64,9 +66,10 @@ def make_layers( get_pp_group().rank_in_group, get_pp_group().world_size) modules = torch.nn.ModuleList( - [PPMissingLayer() for _ in range(start_layer)] + - [layer_fn(idx) for idx in range(start_layer, end_layer)] + - [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)]) + [PPMissingLayer() for _ in range(start_layer)] + [ + layer_fn(prefix=f"{prefix}.{idx}") + for idx in range(start_layer, end_layer) + ] + [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)]) return start_layer, end_layer, modules From 24e5cdf9a1195dcefa0f5081a9b9971498e2f27f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 01:52:22 +0000 Subject: [PATCH 14/26] format --- vllm/model_executor/models/llama.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index c00ec9ed82db..206648463624 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -70,12 +70,12 @@ def __init__( output_sizes=[intermediate_size] * 2, bias=bias, quant_config=quant_config, - layer_name=f"{prefix}.gate_up_proj") + prefix=f"{prefix}.gate_up_proj") self.down_proj = RowParallelLinear(input_size=intermediate_size, output_size=hidden_size, bias=bias, quant_config=quant_config, - layer_name=f"{prefix}.down_proj") + prefix=f"{prefix}.down_proj") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. " "Only silu is supported for now.") @@ -92,7 +92,6 @@ class LlamaAttention(nn.Module): def __init__( self, - prefix: str, hidden_size: int, num_heads: int, num_kv_heads: int, @@ -102,6 +101,7 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, bias: bool = False, cache_config: Optional[CacheConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = hidden_size @@ -133,14 +133,14 @@ def __init__( total_num_kv_heads=self.total_num_kv_heads, bias=bias, quant_config=quant_config, - layer_name=f"{prefix}.qkv_proj", + prefix=f"{prefix}.qkv_proj", ) self.o_proj = RowParallelLinear( input_size=self.total_num_heads * self.head_dim, output_size=hidden_size, bias=bias, quant_config=quant_config, - layer_name=f"{prefix}.o_proj", + prefix=f"{prefix}.o_proj", ) self.rotary_emb = get_rope( From 75a617b51bf8a1e090afeddaa4ae0c8d69dc90c9 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 01:53:25 +0000 Subject: [PATCH 15/26] format --- vllm/model_executor/layers/linear.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 8f9343063b7c..740f42823790 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -195,7 +195,7 @@ def __init__( self.input_size, self.output_size, self.params_dtype, - layer_name=prefix) + prefix=prefix) if bias: self.bias = Parameter( @@ -289,7 +289,7 @@ def __init__( input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - layer_name=prefix, + prefix=prefix, weight_loader=self.weight_loader, ) if bias: @@ -714,7 +714,7 @@ def __init__(self, params_dtype: Optional[torch.dtype] = None, reduce_results: bool = True, quant_config: Optional[QuantizationConfig] = None, - layer_name: 
Optional[str] = None): + prefix: Optional[str] = None): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config) @@ -732,7 +732,7 @@ def __init__(self, input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - layer_name=layer_name, + prefix=prefix, weight_loader=self.weight_loader) if not reduce_results and (bias and not skip_bias_add): raise ValueError("When not reduce the results, adding bias to the " From 9f7aecafc637a0a563ca7c4432a889caf7b3df50 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 02:04:32 +0000 Subject: [PATCH 16/26] revert marlin change on wrong branch --- .../schemes/compressed_tensors_w8a8_fp8.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 16acc412aea1..9ed8a7839273 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -19,18 +19,10 @@ def __init__(self, input_dynamic: bool): self.input_dynamic = input_dynamic self.cutlass_fp8_supported = cutlass_fp8_supported() - # For GPUs that lack FP8 hardware support, we can leverage the Marlin - # kernel for fast weight-only FP8 quantization - capability = current_platform.get_device_capability() - capability = capability[0] * 10 + capability[1] - self.use_marlin = capability < 89 - # W8A8-Fp8 kernels support only per-tensor and per-channel cases. # So if we have a fused module (QKV, MLP) with per tensor scales (thus N # scales being passed to the kernel), we requantize with a single scale. def process_weights_after_loading(self, layer) -> None: - if self.use_marlin: - weight = layer.weight # Dequant -> Quant with max scale. 
max_w_scale, weight = requantize_with_max_scale( weight=layer.weight, From 5b1369acac39e44f7ddb087ba16a2691b83cf95d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 02:05:14 +0000 Subject: [PATCH 17/26] spurious chnage --- .../compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 9ed8a7839273..b93425fb2d62 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -8,7 +8,6 @@ apply_fp8_linear, create_per_tensor_scale_param, cutlass_fp8_supported, requantize_with_max_scale) from vllm.model_executor.utils import set_weight_attrs -from vllm.platforms import current_platform __all__ = ["CompressedTensorsW8A8Fp8"] From 45acd28983305b337cbdde28cb86801d21ba92e3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 02:10:21 +0000 Subject: [PATCH 18/26] finally done --- vllm/model_executor/layers/linear.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 740f42823790..d8805346bfde 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -289,9 +289,8 @@ def __init__( input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - prefix=prefix, weight_loader=self.weight_loader, - ) + prefix=prefix) if bias: self.bias = Parameter( torch.empty(self.output_size_per_partition, @@ -387,8 +386,7 @@ def __init__(self, skip_bias_add=skip_bias_add, params_dtype=params_dtype, quant_config=quant_config, - prefix=prefix, - ) + prefix=prefix) def weight_loader(self, param: Parameter, @@ -732,8 +730,9 @@ def __init__(self, input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - prefix=prefix, - weight_loader=self.weight_loader) + weight_loader=self.weight_loader, + prefix=prefix) + if not reduce_results and (bias and not skip_bias_add): raise ValueError("When not reduce the results, adding bias to the " "results can lead to incorrect results") From 87c8c87f9955bb459994d28a610083075e27fd4e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 02:11:53 +0000 Subject: [PATCH 19/26] replicated linear num lines --- vllm/model_executor/layers/linear.py | 35 +++++++++++++--------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index d8805346bfde..73327c09a653 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -175,16 +175,14 @@ class ReplicatedLinear(LinearBase): (e.g. 
model.layers.0.qkv_proj) """ - def __init__( - self, - input_size: int, - output_size: int, - bias: bool = True, - skip_bias_add: bool = False, - params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: Optional[str] = None, - ): + def __init__(self, + input_size: int, + output_size: int, + bias: bool = True, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: Optional[str] = None): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config) @@ -378,15 +376,14 @@ def __init__(self, self.output_sizes = output_sizes tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) - super().__init__( - input_size=input_size, - output_size=sum(output_sizes), - bias=bias, - gather_output=gather_output, - skip_bias_add=skip_bias_add, - params_dtype=params_dtype, - quant_config=quant_config, - prefix=prefix) + super().__init__(input_size=input_size, + output_size=sum(output_sizes), + bias=bias, + gather_output=gather_output, + skip_bias_add=skip_bias_add, + params_dtype=params_dtype, + quant_config=quant_config, + prefix=prefix) def weight_loader(self, param: Parameter, From 477b1ea49bc19dd165958ff7dde46c70eab4a257 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 02:13:27 +0000 Subject: [PATCH 20/26] remove unnessary changes --- vllm/model_executor/layers/linear.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 73327c09a653..86d15207fb6b 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -249,18 +249,16 @@ class ColumnParallelLinear(LinearBase): (e.g. 
model.layers.0.qkv_proj) """ - def __init__( - self, - input_size: int, - output_size: int, - bias: bool = True, - gather_output: bool = False, - skip_bias_add: bool = False, - params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None, - output_sizes: Optional[List[int]] = None, - prefix: Optional[str] = None, - ): + def __init__(self, + input_size: int, + output_size: int, + bias: bool = True, + gather_output: bool = False, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + output_sizes: Optional[List[int]] = None, + prefix: Optional[str] = None): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config) @@ -729,7 +727,6 @@ def __init__(self, params_dtype=self.params_dtype, weight_loader=self.weight_loader, prefix=prefix) - if not reduce_results and (bias and not skip_bias_add): raise ValueError("When not reduce the results, adding bias to the " "results can lead to incorrect results") From 998c84f6cb301517cd38b8659e25f76e8c56f5d8 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 19:46:49 +0000 Subject: [PATCH 21/26] stash --- vllm/model_executor/models/utils.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index db890b7f268c..0ec900a9dd08 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,4 +1,4 @@ -from typing import Callable, Dict, List, Tuple +from typing import Callable, Dict, List, Protocol, Tuple import torch @@ -52,9 +52,16 @@ def __init__(self, *args, **kwargs): super().__init__() +class LayerFn(Protocol): + def __call__( + self, prefix="", + ) -> torch.nn.Module: + ... 
+ + def make_layers( num_hidden_layers: int, - layer_fn: Callable[[], torch.nn.Module], + layer_fn: LayerFn, prefix: str, ) -> Tuple[int, int, torch.nn.ModuleList]: """Make a list of layers with the given layer function, taking From a826e666b5f27e66cef645a57dda3fd98bd10dda Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 20:38:13 +0000 Subject: [PATCH 22/26] staged --- vllm/model_executor/layers/fused_moe/layer.py | 1 + vllm/model_executor/models/gpt2.py | 15 ++++++++--- vllm/model_executor/models/mixtral.py | 26 ++++++++++++++----- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index bb2be3f3eb56..a6fa8ffe5111 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -158,6 +158,7 @@ def __init__( topk_group: Optional[int] = None, quant_config: Optional[QuantizationConfig] = None, tp_size: Optional[int] = None, + prefix: str = "", ): super().__init__() diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index f64f4e577408..86b6a3f62b7f 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -51,6 +51,7 @@ def __init__( config: GPT2Config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.hidden_size = config.hidden_size @@ -68,12 +69,14 @@ def __init__( total_num_heads, bias=True, quant_config=quant_config, + prefix=f"{prefix}.c_attn", ) self.c_proj = RowParallelLinear( self.hidden_size, self.hidden_size, bias=True, quant_config=quant_config, + prefix=f"{prefix}.c_proj", ) self.attn = Attention(self.num_heads, self.head_dim, @@ -101,6 +104,7 @@ def __init__( intermediate_size: int, config: GPT2Config, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() hidden_size = config.hidden_size @@ -109,12 +113,14 @@ def __init__( intermediate_size, bias=True, quant_config=quant_config, + prefix=f"{prefix}.c_fc", ) self.c_proj = RowParallelLinear( intermediate_size, hidden_size, bias=True, quant_config=quant_config, + prefix=f"{prefix}.c_proj", ) self.act = get_act_fn(config.activation_function, quant_config, intermediate_size) @@ -133,6 +139,7 @@ def __init__( config: GPT2Config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() hidden_size = config.hidden_size @@ -140,9 +147,9 @@ def __init__( hidden_size) self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.attn = GPT2Attention(config, cache_config, quant_config) + self.attn = GPT2Attention(config, cache_config, quant_config, prefix=f"{prefix}.attn") self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.mlp = GPT2MLP(inner_dim, config, quant_config) + self.mlp = GPT2MLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp") def forward( self, @@ -186,8 +193,8 @@ def __init__( self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, - lambda prefix: GPT2Block(config, cache_config, quant_config), - prefix="") + lambda prefix: GPT2Block(config, cache_config, quant_config, prefix=prefix), + prefix="transformer.h") self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) def forward( diff --git 
a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 28dbcb30bdf5..59ae0d5dfa09 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -67,7 +67,8 @@ def __init__(self, intermediate_size: int, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, - tp_size: Optional[int] = None): + tp_size: Optional[int] = None, + prefix: str = ""): super().__init__() self.hidden_size = hidden_size @@ -76,7 +77,8 @@ def __init__(self, num_experts, bias=False, params_dtype=params_dtype, - quant_config=None) + quant_config=None, + prefix=prefix) self.experts = FusedMoE(num_experts=num_experts, top_k=top_k, @@ -86,7 +88,8 @@ def __init__(self, reduce_results=True, renormalize=True, quant_config=quant_config, - tp_size=tp_size) + tp_size=tp_size, + prefix=prefix) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # NOTE: hidden_states can have either 1D or 2D shape. @@ -109,6 +112,7 @@ def __init__( rope_theta: float = 10000, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = hidden_size @@ -139,12 +143,14 @@ def __init__( self.total_num_kv_heads, bias=False, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, hidden_size, bias=False, quant_config=quant_config, + prefix=f"{prefix}.o_proj", ) self.rotary_emb = get_rope( self.head_dim, @@ -182,6 +188,7 @@ def __init__( config: MixtralConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -194,13 +201,15 @@ def __init__( num_kv_heads=config.num_key_value_heads, rope_theta=rope_theta, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.self_attn") self.block_sparse_moe = MixtralMoE( num_experts=config.num_local_experts, top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.block_sparse_moe") self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm(config.hidden_size, @@ -258,8 +267,11 @@ def __init__( ) self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, lambda: MixtralDecoderLayer( - config, cache_config, quant_config=quant_config)) + config.num_hidden_layers, + lambda prefix: MixtralDecoderLayer( + config, cache_config, + quant_config=quant_config, prefix=prefix), + prefix="model.layers") self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) From 3ab65a745ecb5d2e17b1856be474c8a105c5ecb1 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 20:50:15 +0000 Subject: [PATCH 23/26] format --- .../compressed_tensors/compressed_tensors.py | 4 ++-- .../layers/quantization/compressed_tensors/utils.py | 9 --------- vllm/model_executor/models/gpt2.py | 13 ++++++++++--- vllm/model_executor/models/mixtral.py | 6 +++--- vllm/model_executor/models/utils.py | 7 +++++-- 5 files changed, 20 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 
1f250b94e076..28c552b3654f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -13,8 +13,8 @@ CompressedTensorsW8A8Int8, CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, - QuantizationType, is_activation_quantization_format, - find_matched_target, should_ignore_layer) + QuantizationType, find_matched_target, is_activation_quantization_format, + should_ignore_layer) from vllm.platforms import current_platform diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 0579dd821542..b3110ce65330 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -86,15 +86,6 @@ def is_activation_quantization_format(format: str) -> bool: return format in _ACTIVATION_QUANTIZATION_FORMATS -def is_activation_quantization_format(format: str) -> bool: - _ACTIVATION_QUANTIZATION_FORMATS = [ - CompressionFormat.naive_quantized.value, - CompressionFormat.int_quantized.value, - CompressionFormat.float_quantized.value - ] - return format in _ACTIVATION_QUANTIZATION_FORMATS - - # fused_name: List[shard_name] _FUSED_LAYER_NAME_MAPPING = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 86b6a3f62b7f..6ef39aa0ceb9 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -147,9 +147,15 @@ def __init__( hidden_size) self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.attn = GPT2Attention(config, cache_config, quant_config, prefix=f"{prefix}.attn") + self.attn = GPT2Attention(config, + cache_config, + quant_config, + prefix=f"{prefix}.attn") self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.mlp = GPT2MLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp") + self.mlp = GPT2MLP(inner_dim, + config, + quant_config, + prefix=f"{prefix}.mlp") def forward( self, @@ -193,7 +199,8 @@ def __init__( self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, - lambda prefix: GPT2Block(config, cache_config, quant_config, prefix=prefix), + lambda prefix: GPT2Block( + config, cache_config, quant_config, prefix=prefix), prefix="transformer.h") self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 59ae0d5dfa09..a584e5465d6e 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -267,10 +267,10 @@ def __init__( ) self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, + config.num_hidden_layers, lambda prefix: MixtralDecoderLayer( - config, cache_config, - quant_config=quant_config, prefix=prefix), + config, cache_config, quant_config=quant_config, prefix=prefix + ), prefix="model.layers") self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 99be055e4dda..197d3839a766 100644 --- a/vllm/model_executor/models/utils.py +++ 
b/vllm/model_executor/models/utils.py @@ -1,4 +1,4 @@ -from typing import Callable, Dict, List, Protocol, Tuple +from typing import Dict, List, Protocol, Tuple import torch from torch.func import functional_call @@ -46,8 +46,10 @@ def merge_vision_embeddings(input_ids: torch.Tensor, class LayerFn(Protocol): + def __call__( - self, prefix="", + self, + prefix="", ) -> torch.nn.Module: ... @@ -124,6 +126,7 @@ def forward(*args, **kwargs): return module + def make_layers( num_hidden_layers: int, layer_fn: LayerFn, From 4d5fc07fc0a2b7f10a9ef23f1d93ad350086152e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 21:10:48 +0000 Subject: [PATCH 24/26] move root back --- vllm/model_executor/models/gpt2.py | 6 ++++-- vllm/model_executor/models/llama.py | 6 ++++-- vllm/model_executor/models/mixtral.py | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 6ef39aa0ceb9..65a755794115 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -188,6 +188,7 @@ def __init__( config: GPT2Config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config @@ -201,7 +202,7 @@ def __init__( config.num_hidden_layers, lambda prefix: GPT2Block( config, cache_config, quant_config, prefix=prefix), - prefix="transformer.h") + prefix=f"{prefix}.h") self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) def forward( @@ -244,7 +245,8 @@ def __init__( super().__init__() self.config = config self.quant_config = quant_config - self.transformer = GPT2Model(config, cache_config, quant_config) + self.transformer = GPT2Model(config, cache_config, + quant_config, prefix="transformer") self.lm_head = self.transformer.wte self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = Sampler() diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 206648463624..7d07b7194258 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -258,6 +258,7 @@ def __init__( cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.config = config @@ -281,7 +282,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=prefix), - prefix="model.layers") + prefix=f"{prefix}.layers") if get_pp_group().is_last_rank: self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) else: @@ -377,7 +378,8 @@ def __init__( self.model = LlamaModel(config, cache_config, quant_config, - lora_config=lora_config) + lora_config=lora_config, + prefix="model") if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size if lora_config: diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index a584e5465d6e..10717977fad4 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -252,6 +252,7 @@ def __init__( cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.padding_idx = config.pad_token_id @@ -271,7 +272,7 @@ def __init__( lambda prefix: MixtralDecoderLayer( config, cache_config, quant_config=quant_config, prefix=prefix ), 
- prefix="model.layers") + prefix=f"{prefix}.layers") self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -343,7 +344,8 @@ def __init__( self.model = MixtralModel(config, cache_config, quant_config, - lora_config=lora_config) + lora_config=lora_config, + prefix="model") self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size From 1fa917e0b9a30ca0faa8c00256556a2cab8f2fa6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 21:12:55 +0000 Subject: [PATCH 25/26] fix mixtral --- vllm/model_executor/models/mixtral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 10717977fad4..8fbd537a2c03 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -78,7 +78,7 @@ def __init__(self, bias=False, params_dtype=params_dtype, quant_config=None, - prefix=prefix) + prefix=f"{prefix}.gate") self.experts = FusedMoE(num_experts=num_experts, top_k=top_k, @@ -89,7 +89,7 @@ def __init__(self, renormalize=True, quant_config=quant_config, tp_size=tp_size, - prefix=prefix) + prefix=f"{prefix}.experts") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # NOTE: hidden_states can have either 1D or 2D shape. From 1e57ffadb7322014fc1dd171af9cedb26733efd1 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 21:21:15 +0000 Subject: [PATCH 26/26] formatted --- vllm/model_executor/models/gpt2.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 65a755794115..94cd67e75336 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -245,8 +245,10 @@ def __init__( super().__init__() self.config = config self.quant_config = quant_config - self.transformer = GPT2Model(config, cache_config, - quant_config, prefix="transformer") + self.transformer = GPT2Model(config, + cache_config, + quant_config, + prefix="transformer") self.lm_head = self.transformer.wte self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = Sampler()