From 64c3c1cbd7d72371faf24d145059a0409e9688d4 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 00:12:43 +0000 Subject: [PATCH 01/26] stash --- vllm/model_executor/layers/linear.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 684e1abf7bcf..26bc25b5c2d7 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -72,6 +72,7 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, + layer_name: Optional[str]=None, **extra_weight_attrs): """Create weights for a linear layer. The weights will be set as attributes of the layer. @@ -105,6 +106,7 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, + layer_name: Optional[str]=None, **extra_weight_attrs): weight = Parameter(torch.empty(sum(output_partition_sizes), input_size_per_partition, @@ -141,6 +143,7 @@ def __init__( skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, + layer_name: Optional[str] = None, ): super().__init__() @@ -179,15 +182,18 @@ def __init__(self, bias: bool = True, skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None): + quant_config: Optional[QuantizationConfig] = None, + layer_name: Optional[str] = None, + ): super().__init__(input_size, output_size, skip_bias_add, params_dtype, - quant_config) + quant_config, layer_name) # All the linear layer supports quant method. 
assert self.quant_method is not None self.quant_method.create_weights(self, self.input_size, [self.output_size], self.input_size, - self.output_size, self.params_dtype) + self.output_size, self.params_dtype, + layer_name=layer_name) if bias: self.bias = Parameter( @@ -249,7 +255,9 @@ def __init__(self, skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, - output_sizes: Optional[List[int]] = None): + output_sizes: Optional[List[int]] = None, + layer_name: Optional[str] = None, + ): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config) @@ -276,7 +284,8 @@ def __init__(self, input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - weight_loader=self.weight_loader) + layer_name=layer_name, + weight_loader=self.weight_loader,) if bias: self.bias = Parameter( torch.empty(self.output_size_per_partition, @@ -357,7 +366,8 @@ def __init__(self, gather_output: bool = False, skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None): + quant_config: Optional[QuantizationConfig] = None, + layer_name: Optional[str] = None): self.output_sizes = output_sizes tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) @@ -497,7 +507,8 @@ def __init__(self, bias: bool = True, skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None): + quant_config: Optional[QuantizationConfig] = None, + layer_name: Optional[str] = None): self.hidden_size = hidden_size self.head_size = head_size self.total_num_heads = total_num_heads From 891529793f286de6f45caee59d82c150bb4a48e0 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 00:54:49 +0000 Subject: [PATCH 02/26] re-enable nonuniform for llama --- vllm/model_executor/layers/linear.py | 8 +++++--- .../compressed_tensors/compressed_tensors.py | 16 ++++++++++++--- vllm/model_executor/models/llama.py | 20 ++++++++++++++----- vllm/model_executor/models/utils.py | 2 +- 4 files changed, 34 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 26bc25b5c2d7..d6ed2647343a 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -72,7 +72,7 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, - layer_name: Optional[str]=None, + layer_name: Optional[str] = None, **extra_weight_attrs): """Create weights for a linear layer. The weights will be set as attributes of the layer. 
@@ -106,7 +106,7 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, - layer_name: Optional[str]=None, + layer_name: Optional[str] = None, **extra_weight_attrs): weight = Parameter(torch.empty(sum(output_partition_sizes), input_size_per_partition, @@ -699,7 +699,8 @@ def __init__(self, skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, reduce_results: bool = True, - quant_config: Optional[QuantizationConfig] = None): + quant_config: Optional[QuantizationConfig] = None, + layer_name: Optional[str] = None): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config) @@ -717,6 +718,7 @@ def __init__(self, input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, + layer_name=layer_name, weight_loader=self.weight_loader) if not reduce_results and (bias and not skip_bias_add): raise ValueError("When not reduce the results, adding bias to the " diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 524b4c894b9b..4535589b34b0 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -10,7 +10,7 @@ W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, CompressedTensorsScheme, CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8, - CompressedTensorsWNA16) + CompressedTensorsWNA16, CompressedTensorsUnquantized) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, QuantizationType, find_first_name_or_class_match) @@ -201,10 +201,20 @@ def _get_schema(self, weight_quant: BaseModel, raise NotImplementedError( "No compressed-tensors compatible scheme was found.") - def get_scheme(self, layer: torch.nn.Module) -> "CompressedTensorsScheme": + def get_scheme( + self, + layer: torch.nn.Module, + layer_name: Optional[str] = None) -> "CompressedTensorsScheme": + + if layer_name is not None: + if layer_name in self.ignore: + return CompressedTensorsUnquantized() + else: + # fall back to + layer_name="" layer_type_name = find_first_name_or_class_match( - name="", + name=layer_name, module=layer, targets=self.layer_quant_details.keys(), check_contains=True) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f03e34b9e7c9..03f5b2da38f8 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -57,6 +57,7 @@ class LlamaMLP(nn.Module): def __init__( self, + parent_name: str, hidden_size: int, intermediate_size: int, hidden_act: str, @@ -68,11 +69,13 @@ def __init__( input_size=hidden_size, output_sizes=[intermediate_size] * 2, bias=bias, - quant_config=quant_config) + quant_config=quant_config, + layer_name=f"{parent_name}.gate_up_proj") self.down_proj = RowParallelLinear(input_size=intermediate_size, output_size=hidden_size, bias=bias, - quant_config=quant_config) + quant_config=quant_config, + layer_name=f"{parent_name}.down_proj") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" "Only silu is supported for now.") @@ -89,6 +92,7 @@ class LlamaAttention(nn.Module): def __init__( self, + parent_name: str, hidden_size: int, num_heads: int, num_kv_heads: int, @@ -129,12 +133,14 @@ def __init__( total_num_kv_heads=self.total_num_kv_heads, bias=bias, quant_config=quant_config, + layer_name=f"{parent_name}.qkv_proj", ) self.o_proj = RowParallelLinear( input_size=self.total_num_heads * self.head_dim, output_size=hidden_size, bias=bias, quant_config=quant_config, + layer_name=f"{parent_name}.o_proj", ) self.rotary_emb = get_rope( @@ -170,6 +176,7 @@ class LlamaDecoderLayer(nn.Module): def __init__( self, + parent_name: str, config: LlamaConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -189,6 +196,7 @@ def __init__( attention_bias = getattr(config, "attention_bias", False) or getattr( config, "bias", False) self.self_attn = LlamaAttention( + parent_name=f"{parent_name}.self_attn", hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=getattr(config, "num_key_value_heads", @@ -201,6 +209,7 @@ def __init__( cache_config=cache_config, ) self.mlp = LlamaMLP( + parent_name=f"{parent_name}.mlp", hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, @@ -264,9 +273,10 @@ def __init__( ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda: LlamaDecoderLayer(config=config, - cache_config=cache_config, - quant_config=quant_config)) + lambda idx: LlamaDecoderLayer(parent_name=f"model.layers.{idx}.", + config=config, + cache_config=cache_config, + quant_config=quant_config)) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index c135b2035220..1196cd7de5e6 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -65,7 +65,7 @@ def make_layers( get_pp_group().world_size) modules = torch.nn.ModuleList( [PPMissingLayer() for _ in range(start_layer)] + - [layer_fn() for _ in range(start_layer, end_layer)] + + [layer_fn(idx) for idx in range(start_layer, end_layer)] + [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)]) return start_layer, end_layer, modules From 18a71ae2b9383b0ab5922c5986f2c3f5d898e765 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 13:04:27 +0000 Subject: [PATCH 03/26] stash --- vllm/model_executor/layers/linear.py | 3 +- .../compressed_tensors/compressed_tensors.py | 45 +++++++++++++++---- .../quantization/compressed_tensors/utils.py | 5 +++ vllm/model_executor/models/llama.py | 3 +- 4 files changed, 46 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index d6ed2647343a..d080875bdea1 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -540,7 +540,8 @@ def __init__(self, gather_output=False, skip_bias_add=skip_bias_add, params_dtype=params_dtype, - quant_config=quant_config) + quant_config=quant_config, + layer_name=layer_name) def weight_loader(self, param: Parameter, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 4535589b34b0..b73e5ad1b0a5 100644 --- 
a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -13,7 +13,7 @@ CompressedTensorsWNA16, CompressedTensorsUnquantized) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, - QuantizationType, find_first_name_or_class_match) + QuantizationType, find_first_name_or_class_match, _FUSED_LAYER_NAME_MAPPING) from vllm.platforms import current_platform @@ -205,13 +205,42 @@ def get_scheme( self, layer: torch.nn.Module, layer_name: Optional[str] = None) -> "CompressedTensorsScheme": - + if layer_name is not None: - if layer_name in self.ignore: + + # layer_name = model.layers.0.self_attn.qkv_proj + # proj_name = qkv_proj + proj_name = layer_name.split(".")[-1] + + # Fused layers like gate_up_proj or qkv_proj will not be fused + # in the safetensors checkpoint. So, we convert the name + # from the fused version to unfused + check to make sure that + # each shard of the fused layer has the same scheme. + if proj_name in _FUSED_LAYER_NAME_MAPPING: + # Convert fused_name --> shard_names + shard_names = [ + layer_name.replace(proj_name, unfused_proj_name) for + unfused_proj_name in _FUSED_LAYER_NAME_MAPPING[proj_name]] + + # Check if this layer should be skipped. + should_ignore_layer = shard_names[0] in self.ignore + + # Confirm that all the shards are skipped or none are skipped. + for shard_name in shard_names: + should_ignore_shard = (shard_name in should_ignore_layer) + if should_ignore_shard != should_ignore_layer: + raise ValueError( + f"Found a different quantization scheme for {shard_name} in " + f"{shard_names[0]} in layer {layer_name}. vLLM requires all " + "shards in fused layers to share the same scheme.") + else: + should_ignore_layer = layer_name in self.ignore + + if should_ignore_layer: return CompressedTensorsUnquantized() - else: - # fall back to - layer_name="" + + + layer_name="" layer_type_name = find_first_name_or_class_match( name=layer_name, @@ -245,7 +274,7 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, - **extra_weight_attrs): + layer_name: Optional[str], **extra_weight_attrs): """ Use the CompressedTensorsScheme associated with each layer to create the necessary parameters for the layer. 
See LinearMethodBase for param @@ -254,7 +283,7 @@ def create_weights(self, layer: torch.nn.Module, """ weight_loader = extra_weight_attrs.get("weight_loader") - scheme = self.quantization_config.get_scheme(layer=layer) + scheme = self.quantization_config.get_scheme(layer=layer, layer_name=layer_name) scheme.create_weights( layer=layer, input_size=input_size, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 5b44c215535b..5ccd82c1d127 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -5,6 +5,11 @@ from pydantic import BaseModel, Field from torch.nn import Module +# fused_name: List[shard_name] +_FUSED_LAYER_NAME_MAPPING = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": [""] +} class CompressionFormat(Enum): dense = "dense" diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 03f5b2da38f8..f872cb379917 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -273,7 +273,7 @@ def __init__( ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda idx: LlamaDecoderLayer(parent_name=f"model.layers.{idx}.", + lambda idx: LlamaDecoderLayer(parent_name=f"model.layers.{idx}", config=config, cache_config=cache_config, quant_config=quant_config)) @@ -481,6 +481,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + breakpoint() # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). 
Thus, handled exceptions should From 197dd19fd998ce9dc1d54bef7b0cb5d8ad5a7091 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 13:27:49 +0000 Subject: [PATCH 04/26] working e2e --- vllm/model_executor/layers/linear.py | 3 ++- .../compressed_tensors/compressed_tensors.py | 11 ++++------- .../schemes/compressed_tensors_unquantized.py | 1 - .../layers/quantization/compressed_tensors/utils.py | 3 ++- vllm/model_executor/model_loader/loader.py | 1 - vllm/model_executor/models/llama.py | 1 - 6 files changed, 8 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index d080875bdea1..099224c03b5e 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -377,7 +377,8 @@ def __init__(self, gather_output=gather_output, skip_bias_add=skip_bias_add, params_dtype=params_dtype, - quant_config=quant_config) + quant_config=quant_config, + layer_name=layer_name,) def weight_loader(self, param: Parameter, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index b73e5ad1b0a5..9466369a177b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -183,7 +183,8 @@ def _get_schema(self, weight_quant: BaseModel, group_size=weight_quant.group_size) if (self.quant_format == CompressionFormat.int_quantized.value or - self.quant_format == CompressionFormat.float_quantized.value): + self.quant_format == CompressionFormat.float_quantized.value or + self.quant_format == CompressionFormat.naive_quantized.value): if self._is_fp8_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8Fp8( input_dynamic=input_quant.dynamic) @@ -207,7 +208,6 @@ def get_scheme( layer_name: Optional[str] = None) -> "CompressedTensorsScheme": if layer_name is not None: - # layer_name = model.layers.0.self_attn.qkv_proj # proj_name = qkv_proj proj_name = layer_name.split(".")[-1] @@ -227,7 +227,7 @@ def get_scheme( # Confirm that all the shards are skipped or none are skipped. 
for shard_name in shard_names: - should_ignore_shard = (shard_name in should_ignore_layer) + should_ignore_shard = (shard_name in self.ignore) if should_ignore_shard != should_ignore_layer: raise ValueError( f"Found a different quantization scheme for {shard_name} in " @@ -239,11 +239,8 @@ def get_scheme( if should_ignore_layer: return CompressedTensorsUnquantized() - - layer_name="" - layer_type_name = find_first_name_or_class_match( - name=layer_name, + name="", module=layer, targets=self.layer_quant_details.keys(), check_contains=True) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py index 2c7fe3e0e411..7d0c1e11f7ba 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py @@ -29,7 +29,6 @@ def create_weights(self, layer: torch.nn.Module, weight = Parameter(torch.empty(sum(output_partition_sizes), input_size_per_partition, - device="cuda", dtype=params_dtype), requires_grad=False) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 5ccd82c1d127..10cc05fa3eaa 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -8,12 +8,13 @@ # fused_name: List[shard_name] _FUSED_LAYER_NAME_MAPPING = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], - "gate_up_proj": [""] + "gate_up_proj": ["gate_proj", "up_proj"] } class CompressionFormat(Enum): dense = "dense" sparse_bitmask = "sparse-bitmask" + naive_quantized = "naive-quantized" float_quantized = "float-quantized" int_quantized = "int-quantized" pack_quantized = "pack-quantized" diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index a1a2b0b323f6..af0054a079e1 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -282,7 +282,6 @@ def load_model(self, *, model_config: ModelConfig, model, "fall_back_to_pt_during_load", True)), ) - for _, module in model.named_modules(): quant_method = getattr(module, "quant_method", None) if quant_method is not None: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f872cb379917..d4fc4a38ab77 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -481,7 +481,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) - breakpoint() # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). 
Thus, handled exceptions should From eb95b49a44de63041cfcf3f60e7a7e543d3cc3ff Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 14:23:17 +0000 Subject: [PATCH 05/26] cleanup code a bit --- .../compressed_tensors/compressed_tensors.py | 39 ++----- .../quantization/compressed_tensors/utils.py | 101 +++++++++++++++--- 2 files changed, 93 insertions(+), 47 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 9466369a177b..468d737c329a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -13,7 +13,8 @@ CompressedTensorsWNA16, CompressedTensorsUnquantized) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, - QuantizationType, find_first_name_or_class_match, _FUSED_LAYER_NAME_MAPPING) + QuantizationType, find_first_name_or_class_match, + should_ignore_layer) from vllm.platforms import current_platform @@ -206,38 +207,10 @@ def get_scheme( self, layer: torch.nn.Module, layer_name: Optional[str] = None) -> "CompressedTensorsScheme": - - if layer_name is not None: - # layer_name = model.layers.0.self_attn.qkv_proj - # proj_name = qkv_proj - proj_name = layer_name.split(".")[-1] - - # Fused layers like gate_up_proj or qkv_proj will not be fused - # in the safetensors checkpoint. So, we convert the name - # from the fused version to unfused + check to make sure that - # each shard of the fused layer has the same scheme. - if proj_name in _FUSED_LAYER_NAME_MAPPING: - # Convert fused_name --> shard_names - shard_names = [ - layer_name.replace(proj_name, unfused_proj_name) for - unfused_proj_name in _FUSED_LAYER_NAME_MAPPING[proj_name]] - - # Check if this layer should be skipped. - should_ignore_layer = shard_names[0] in self.ignore - - # Confirm that all the shards are skipped or none are skipped. - for shard_name in shard_names: - should_ignore_shard = (shard_name in self.ignore) - if should_ignore_shard != should_ignore_layer: - raise ValueError( - f"Found a different quantization scheme for {shard_name} in " - f"{shard_names[0]} in layer {layer_name}. vLLM requires all " - "shards in fused layers to share the same scheme.") - else: - should_ignore_layer = layer_name in self.ignore - - if should_ignore_layer: - return CompressedTensorsUnquantized() + + # Check if the layer is ignored (skipped for quantization). 
+ if should_ignore_layer(layer_name, ignore=self.ignore): + return CompressedTensorsUnquantized() layer_type_name = find_first_name_or_class_match( name="", diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 10cc05fa3eaa..c191105f9f90 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -5,12 +5,6 @@ from pydantic import BaseModel, Field from torch.nn import Module -# fused_name: List[shard_name] -_FUSED_LAYER_NAME_MAPPING = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - "gate_up_proj": ["gate_proj", "up_proj"] -} - class CompressionFormat(Enum): dense = "dense" sparse_bitmask = "sparse-bitmask" @@ -82,6 +76,71 @@ class QuantizationArgs(BaseModel): ) +# fused_name: List[shard_name] +_FUSED_LAYER_NAME_MAPPING = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] +} + +def should_ignore_layer(layer_name: Optional[str], ignore: Iterable[str]) -> bool: + if layer_name is None: + return False + + # layer_name = model.layers.0.self_attn.qkv_proj + # proj_name = qkv_proj + proj_name = layer_name.split(".")[-1] + + # Fused layers like gate_up_proj or qkv_proj will not be fused + # in the safetensors checkpoint. So, we convert the name + # from the fused version to unfused + check to make sure that + # each shard of the fused layer has the same scheme. + if proj_name in _FUSED_LAYER_NAME_MAPPING: + shard_proj_names = _FUSED_LAYER_NAME_MAPPING[proj_name] + + # Convert fused_name --> [shard_names] + shard_names = [ + layer_name.replace(proj_name, shard_proj_name) for + shard_proj_name in shard_proj_names + ] + + # Layer should be ignored if shards are ignored. + should_ignore_layer = None + for shard_name in shard_names: + should_ignore_shard = check_equal_or_regex_match( + layer_name=shard_name, targets=ignore) + + # If shard_idx=0, set layer ignore to match shard. + if should_ignore_layer is None: + should_ignore_layer = should_ignore_shard + + # If shard_idx=1+ confirm scheme matches prior shards. + elif should_ignore_shard != should_ignore_layer: + raise ValueError( + f"Found a different quantization schemes for " + f"{shard_proj_names} in {layer_name}. vLLM " + "requires all to use the same scheme.") + + # Unfused layers like down_proj and o_proj will match + # the safetensors checkpoint already. + else: + should_ignore_layer = check_equal_or_regex_match( + layer_name=layer_name, targets=ignore) + + return should_ignore_layer + + +def check_equal_or_regex_match(layer_name: str, + targets: Iterable[str]) -> bool: + """ + Checks whether a layer_name is exactly equal or a regex match for + if target starts with 're:' to any target in list. 
+ """ + for target in targets: + if _is_equal_or_regex_match(layer_name, target): + return True + return False + + def find_first_name_or_class_match( name: str, module: Module, @@ -117,13 +176,27 @@ def _find_first_match(value: str, """ for target in targets: - if target.startswith("re:"): - pattern = target[3:] - if re.match(pattern, value): - return target - elif check_contains: - if target.lower() in value.lower(): - return target - elif target == value: + if _is_equal_or_regex_match(value, target, + check_contains=check_contains): return target + return None + + +def _is_equal_or_regex_match(value: str, target: str, + check_contains: bool = False) -> bool: + """ + Checks whether a value is exactly equal or a regex match for target + if taget starts with 're:'. If check_contains is set to True, + additionally checks if the target string is contained within the value. + """ + + if target.startswith("re:"): + pattern = target[3:] + if re.match(pattern, value): + return target + elif check_contains: + if target.lower() in value.lower(): + return target + elif target == value: + return target From e3cf13554f5edfa9bf768441c09c47a77fd09cd5 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 18:30:10 +0000 Subject: [PATCH 06/26] nonuniform --- .../compressed_tensors/compressed_tensors.py | 86 +++++++++++-------- .../quantization/compressed_tensors/utils.py | 40 ++++++--- 2 files changed, 79 insertions(+), 47 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 468d737c329a..f207b9ca2082 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -13,18 +13,21 @@ CompressedTensorsWNA16, CompressedTensorsUnquantized) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, - QuantizationType, find_first_name_or_class_match, - should_ignore_layer) + QuantizationType, find_matched_target, should_ignore_layer) from vllm.platforms import current_platform class CompressedTensorsConfig(QuantizationConfig): - def __init__(self, layer_quant_details: Dict[str, Any], ignore: List[str], + def __init__(self, + target_scheme_map: Dict[str, Any], + ignore: List[str], quant_format: str): + self.ignore = ignore - self.layer_quant_details = layer_quant_details self.quant_format = quant_format + # Map from [target -> scheme] + self.target_scheme_map = target_scheme_map def get_linear_method(self) -> "CompressedTensorsLinearMethod": return CompressedTensorsLinearMethod(self) @@ -51,7 +54,7 @@ def get_quant_method( @classmethod def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": - layer_quant_details: Dict[str, Any] = dict() + target_scheme_map: Dict[str, Any] = dict() ignore: List[str] = config.get("ignore", None) quant_format: str = config.get("format", None) @@ -63,21 +66,21 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": # details follow the structure defined by the QuantizationArgs # pydantic model, which is used to verify the structure of the # quant_config and also store the details for later use. 
- for key, quant_config in config["config_groups"].items(): + for _, quant_config in config["config_groups"].items(): targets = quant_config.get("targets") for target in targets: - layer_quant_details[target] = {} - layer_quant_details[target][ + target_scheme_map[target] = {} + target_scheme_map[target][ "weights"] = QuantizationArgs.parse_obj( quant_config.get("weights")) try: - layer_quant_details[target][ + target_scheme_map[target][ "input_activations"] = QuantizationArgs.parse_obj( quant_config.get("input_activations")) except Exception: - layer_quant_details[target]["input_activations"] = None + target_scheme_map[target]["input_activations"] = None - return cls(layer_quant_details=layer_quant_details, + return cls(target_scheme_map=target_scheme_map, ignore=ignore, quant_format=quant_format) @@ -165,8 +168,10 @@ def _is_wNa16_group_channel(self, weight_quant: BaseModel, return (is_channel_group and input_quant_none and is_symmetric and is_static) - def _get_schema(self, weight_quant: BaseModel, - input_quant: BaseModel) -> "CompressedTensorsScheme": + def _get_scheme_from_parts( + self, + weight_quant: BaseModel, + input_quant: BaseModel) -> "CompressedTensorsScheme": if self._is_wNa16_group_channel(weight_quant, input_quant): self._check_gptq_and_marlin_can_run() @@ -203,33 +208,44 @@ def _get_schema(self, weight_quant: BaseModel, raise NotImplementedError( "No compressed-tensors compatible scheme was found.") - def get_scheme( - self, - layer: torch.nn.Module, - layer_name: Optional[str] = None) -> "CompressedTensorsScheme": + def get_scheme(self, + layer: torch.nn.Module, + layer_name: Optional[str] = None) -> "CompressedTensorsScheme": + """ + compressed-tensors supports non uniform in the following way: - # Check if the layer is ignored (skipped for quantization). - if should_ignore_layer(layer_name, ignore=self.ignore): - return CompressedTensorsUnquantized() + ignore: List of layer_names or nn.Module names to be ignored. + targets of config_groups: There can be N config_groups which each + have a quantization scheme. Each config_group has a list of targets + which can be a full layer_name, a regex for a layer_name, or + an nn.Module name. - layer_type_name = find_first_name_or_class_match( - name="", - module=layer, - targets=self.layer_quant_details.keys(), - check_contains=True) + We first check whether a layer is in the ignore group and use + CompressedTensorsUnquantized (i.e. fp16/bf16) scheme for the layer - if layer_type_name is None: - raise ValueError(f"Could not matching target for layer {layer}") + We then detect whether a layer_name is found in any target and + use the quantization scheme corresponding to the matched target + to select the CompressedTensorsScheme used for infernece. + """ - layer_quant_details: Dict[str, Any] = self.layer_quant_details.get( - layer_type_name, None) - if layer_quant_details is None: - raise ValueError( - f"Could not find quantization details for {layer}.") + # Check if the layer is skipped for quantization. + # TODO (@robertgshaw2): support module names + if should_ignore_layer(layer_name, ignore=self.ignore): + return CompressedTensorsUnquantized() - return self._get_schema( - weight_quant=layer_quant_details["weights"], - input_quant=layer_quant_details["input_activations"]) + # Find the "target" in the compressed-tensors config + # that our layer conforms to. 
+ matched_target = find_matched_target( + layer_name=layer_name, + module=layer, + targets=self.target_scheme_map.keys()) + + # Find the quant_scheme + scheme = self.target_scheme_map[matched_target] + + return self._get_scheme_from_parts( + weight_quant=scheme["weights"], + input_quant=scheme["input_activations"]) class CompressedTensorsLinearMethod(LinearMethodBase): diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index c191105f9f90..84be26b5207b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -141,25 +141,41 @@ def check_equal_or_regex_match(layer_name: str, return False -def find_first_name_or_class_match( - name: str, +def find_matched_target( + layer_name: Optional[str], module: Module, - targets: Iterable[str], - check_contains: bool = False) -> Optional[str]: + targets: Iterable[str]) -> str: """ - Helper function to map the quantization details listed in the config - for a given list of targets against each model layer. First uses the - layer name to try and find a match. If no name match is found, uses - the layer class name. Returns None otherwise. + Helper function to look up which "target" in the compressed-tensors + config that a layer corresponds to. - :param name: layer name + Recall that a compressed-tensors configs has a concept of + config_groups, where each layer can be quantized with with a different + scheme. + + targets in each config_group will be a list of either layer names + (or regexes corresponding to layer names) or names of torch Modules. + + First, we try to match the layer_name with a target + Second, we try to match the module's name with a target + + :param layer_name: layer name :param module: torch.nn.Module :param targets: list of targets to match the layer against - :param check_contains: whether or not to do a substring match """ - return _find_first_match(name, targets) or _find_first_match( - module.__class__.__name__, targets, check_contains) + if layer_name is None: + layer_name = "" + + matched_target= (_find_first_match(layer_name, targets) or + _find_first_match(module.__class__.__name__, targets, True)) + + if matched_target is None: + raise ValueError( + f"Unable to find matching target for {module} in the " + "compressed-tensors config.") + + return matched_target def _find_first_match(value: str, From c6028554e84b2ef38d819e46bdc29dc1ddd57876 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 19:32:33 +0000 Subject: [PATCH 07/26] fix nit --- .../quantization/compressed_tensors/compressed_tensors.py | 5 +++-- vllm/model_executor/model_loader/loader.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index f207b9ca2082..2e6a3fc55570 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -208,8 +208,7 @@ def _get_scheme_from_parts( raise NotImplementedError( "No compressed-tensors compatible scheme was found.") - def get_scheme(self, - layer: torch.nn.Module, + def get_scheme(self, layer: torch.nn.Module, layer_name: Optional[str] = None) -> "CompressedTensorsScheme": """ compressed-tensors supports 
non uniform in the following way: @@ -235,6 +234,8 @@ def get_scheme(self, # Find the "target" in the compressed-tensors config # that our layer conforms to. + # TODO (@robertgshaw): add compressed-tensors as dep + # so we do not have to re-write these functions matched_target = find_matched_target( layer_name=layer_name, module=layer, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index af0054a079e1..a1a2b0b323f6 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -282,6 +282,7 @@ def load_model(self, *, model_config: ModelConfig, model, "fall_back_to_pt_during_load", True)), ) + for _, module in model.named_modules(): quant_method = getattr(module, "quant_method", None) if quant_method is not None: From d9355f41eff00f5ad51d65a58f72d49e4e00a8ed Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 19:50:08 +0000 Subject: [PATCH 08/26] format --- vllm/model_executor/layers/linear.py | 72 ++++++++++--------- .../compressed_tensors/compressed_tensors.py | 49 +++++++------ .../quantization/compressed_tensors/utils.py | 71 +++++++++--------- vllm/model_executor/models/llama.py | 21 +++--- 4 files changed, 113 insertions(+), 100 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 099224c03b5e..9edadf4f6110 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -72,7 +72,6 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, - layer_name: Optional[str] = None, **extra_weight_attrs): """Create weights for a linear layer. The weights will be set as attributes of the layer. @@ -106,7 +105,6 @@ def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, output_partition_sizes: List[int], input_size: int, output_size: int, params_dtype: torch.dtype, - layer_name: Optional[str] = None, **extra_weight_attrs): weight = Parameter(torch.empty(sum(output_partition_sizes), input_size_per_partition, @@ -143,7 +141,6 @@ def __init__( skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, - layer_name: Optional[str] = None, ): super().__init__() @@ -176,23 +173,26 @@ class ReplicatedLinear(LinearBase): quant_config: Quantization configure. """ - def __init__(self, - input_size: int, - output_size: int, - bias: bool = True, - skip_bias_add: bool = False, - params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None, - layer_name: Optional[str] = None, + def __init__( + self, + input_size: int, + output_size: int, + bias: bool = True, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + layer_name: Optional[str] = None, ): super().__init__(input_size, output_size, skip_bias_add, params_dtype, - quant_config, layer_name) + quant_config) # All the linear layer supports quant method. 
assert self.quant_method is not None - self.quant_method.create_weights(self, self.input_size, - [self.output_size], self.input_size, - self.output_size, self.params_dtype, + self.quant_method.create_weights(self, + self.input_size, [self.output_size], + self.input_size, + self.output_size, + self.params_dtype, layer_name=layer_name) if bias: @@ -247,16 +247,17 @@ class ColumnParallelLinear(LinearBase): the list would be size 3. """ - def __init__(self, - input_size: int, - output_size: int, - bias: bool = True, - gather_output: bool = False, - skip_bias_add: bool = False, - params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None, - output_sizes: Optional[List[int]] = None, - layer_name: Optional[str] = None, + def __init__( + self, + input_size: int, + output_size: int, + bias: bool = True, + gather_output: bool = False, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + output_sizes: Optional[List[int]] = None, + layer_name: Optional[str] = None, ): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config) @@ -285,7 +286,8 @@ def __init__(self, output_size=self.output_size, params_dtype=self.params_dtype, layer_name=layer_name, - weight_loader=self.weight_loader,) + weight_loader=self.weight_loader, + ) if bias: self.bias = Parameter( torch.empty(self.output_size_per_partition, @@ -371,14 +373,16 @@ def __init__(self, self.output_sizes = output_sizes tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) - super().__init__(input_size=input_size, - output_size=sum(output_sizes), - bias=bias, - gather_output=gather_output, - skip_bias_add=skip_bias_add, - params_dtype=params_dtype, - quant_config=quant_config, - layer_name=layer_name,) + super().__init__( + input_size=input_size, + output_size=sum(output_sizes), + bias=bias, + gather_output=gather_output, + skip_bias_add=skip_bias_add, + params_dtype=params_dtype, + quant_config=quant_config, + layer_name=layer_name, + ) def weight_loader(self, param: Parameter, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 2e6a3fc55570..85ba71ffe6d3 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -8,9 +8,9 @@ QuantizationConfig) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, - CompressedTensorsScheme, CompressedTensorsW4A16Sparse24, - CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8, - CompressedTensorsWNA16, CompressedTensorsUnquantized) + CompressedTensorsScheme, CompressedTensorsUnquantized, + CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, + CompressedTensorsW8A8Int8, CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, QuantizationType, find_matched_target, should_ignore_layer) @@ -19,9 +19,7 @@ class CompressedTensorsConfig(QuantizationConfig): - def __init__(self, - target_scheme_map: Dict[str, Any], - ignore: List[str], + def __init__(self, target_scheme_map: Dict[str, Any], ignore: List[str], quant_format: str): self.ignore = ignore @@ -169,8 +167,7 @@ def 
_is_wNa16_group_channel(self, weight_quant: BaseModel, and is_static) def _get_scheme_from_parts( - self, - weight_quant: BaseModel, + self, weight_quant: BaseModel, input_quant: BaseModel) -> "CompressedTensorsScheme": if self._is_wNa16_group_channel(weight_quant, input_quant): @@ -188,9 +185,10 @@ def _get_scheme_from_parts( strategy=weight_quant.strategy, group_size=weight_quant.group_size) - if (self.quant_format == CompressionFormat.int_quantized.value or - self.quant_format == CompressionFormat.float_quantized.value or - self.quant_format == CompressionFormat.naive_quantized.value): + if (self.quant_format == CompressionFormat.int_quantized.value + or self.quant_format == CompressionFormat.float_quantized.value + or self.quant_format + == CompressionFormat.naive_quantized.value): if self._is_fp8_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8Fp8( input_dynamic=input_quant.dynamic) @@ -208,8 +206,10 @@ def _get_scheme_from_parts( raise NotImplementedError( "No compressed-tensors compatible scheme was found.") - def get_scheme(self, layer: torch.nn.Module, - layer_name: Optional[str] = None) -> "CompressedTensorsScheme": + def get_scheme( + self, + layer: torch.nn.Module, + layer_name: Optional[str] = None) -> "CompressedTensorsScheme": """ compressed-tensors supports non uniform in the following way: @@ -233,17 +233,17 @@ def get_scheme(self, layer: torch.nn.Module, return CompressedTensorsUnquantized() # Find the "target" in the compressed-tensors config - # that our layer conforms to. + # that our layer conforms to. # TODO (@robertgshaw): add compressed-tensors as dep # so we do not have to re-write these functions matched_target = find_matched_target( layer_name=layer_name, module=layer, targets=self.target_scheme_map.keys()) - - # Find the quant_scheme + + # Find the quant_scheme scheme = self.target_scheme_map[matched_target] - + return self._get_scheme_from_parts( weight_quant=scheme["weights"], input_quant=scheme["input_activations"]) @@ -257,11 +257,15 @@ def __init__(self, quantization_config: CompressedTensorsConfig): def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.scheme.process_weights_after_loading(layer) - def create_weights(self, layer: torch.nn.Module, + def create_weights(self, + layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, - output_size: int, params_dtype: torch.dtype, - layer_name: Optional[str], **extra_weight_attrs): + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + layer_name: Optional[str] = None, + **extra_weight_attrs): """ Use the CompressedTensorsScheme associated with each layer to create the necessary parameters for the layer. 
See LinearMethodBase for param @@ -270,7 +274,8 @@ def create_weights(self, layer: torch.nn.Module, """ weight_loader = extra_weight_attrs.get("weight_loader") - scheme = self.quantization_config.get_scheme(layer=layer, layer_name=layer_name) + scheme = self.quantization_config.get_scheme(layer=layer, + layer_name=layer_name) scheme.create_weights( layer=layer, input_size=input_size, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 84be26b5207b..c10bcb2e2c22 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -5,6 +5,7 @@ from pydantic import BaseModel, Field from torch.nn import Module + class CompressionFormat(Enum): dense = "dense" sparse_bitmask = "sparse-bitmask" @@ -82,10 +83,12 @@ class QuantizationArgs(BaseModel): "gate_up_proj": ["gate_proj", "up_proj"] } -def should_ignore_layer(layer_name: Optional[str], ignore: Iterable[str]) -> bool: + +def should_ignore_layer(layer_name: Optional[str], + ignore: Iterable[str]) -> bool: if layer_name is None: return False - + # layer_name = model.layers.0.self_attn.qkv_proj # proj_name = qkv_proj proj_name = layer_name.split(".")[-1] @@ -96,11 +99,11 @@ def should_ignore_layer(layer_name: Optional[str], ignore: Iterable[str]) -> boo # each shard of the fused layer has the same scheme. if proj_name in _FUSED_LAYER_NAME_MAPPING: shard_proj_names = _FUSED_LAYER_NAME_MAPPING[proj_name] - + # Convert fused_name --> [shard_names] shard_names = [ - layer_name.replace(proj_name, shard_proj_name) for - shard_proj_name in shard_proj_names + layer_name.replace(proj_name, shard_proj_name) + for shard_proj_name in shard_proj_names ] # Layer should be ignored if shards are ignored. @@ -108,28 +111,28 @@ def should_ignore_layer(layer_name: Optional[str], ignore: Iterable[str]) -> boo for shard_name in shard_names: should_ignore_shard = check_equal_or_regex_match( layer_name=shard_name, targets=ignore) - - # If shard_idx=0, set layer ignore to match shard. + + # If shard_idx=0, set layer ignore to match shard. if should_ignore_layer is None: should_ignore_layer = should_ignore_shard - + # If shard_idx=1+ confirm scheme matches prior shards. elif should_ignore_shard != should_ignore_layer: - raise ValueError( - f"Found a different quantization schemes for " - f"{shard_proj_names} in {layer_name}. vLLM " - "requires all to use the same scheme.") + raise ValueError(f"Found a different quantization schemes for " + f"{shard_proj_names} in {layer_name}. vLLM " + "requires all to use the same scheme.") # Unfused layers like down_proj and o_proj will match # the safetensors checkpoint already. 
else: - should_ignore_layer = check_equal_or_regex_match( - layer_name=layer_name, targets=ignore) - + should_ignore_layer = check_equal_or_regex_match(layer_name=layer_name, + targets=ignore) + + assert should_ignore_layer is not None return should_ignore_layer -def check_equal_or_regex_match(layer_name: str, +def check_equal_or_regex_match(layer_name: str, targets: Iterable[str]) -> bool: """ Checks whether a layer_name is exactly equal or a regex match for @@ -141,10 +144,8 @@ def check_equal_or_regex_match(layer_name: str, return False -def find_matched_target( - layer_name: Optional[str], - module: Module, - targets: Iterable[str]) -> str: +def find_matched_target(layer_name: Optional[str], module: Module, + targets: Iterable[str]) -> str: """ Helper function to look up which "target" in the compressed-tensors config that a layer corresponds to. @@ -167,13 +168,13 @@ def find_matched_target( if layer_name is None: layer_name = "" - matched_target= (_find_first_match(layer_name, targets) or - _find_first_match(module.__class__.__name__, targets, True)) + matched_target = (_find_first_match(layer_name, targets) + or _find_first_match(module.__class__.__name__, targets, + True)) if matched_target is None: - raise ValueError( - f"Unable to find matching target for {module} in the " - "compressed-tensors config.") + raise ValueError(f"Unable to find matching target for {module} in the " + "compressed-tensors config.") return matched_target @@ -192,27 +193,29 @@ def _find_first_match(value: str, """ for target in targets: - if _is_equal_or_regex_match(value, target, - check_contains=check_contains): + if _is_equal_or_regex_match(value, + target, + check_contains=check_contains): return target - return None -def _is_equal_or_regex_match(value: str, target: str, +def _is_equal_or_regex_match(value: str, + target: str, check_contains: bool = False) -> bool: """ Checks whether a value is exactly equal or a regex match for target - if taget starts with 're:'. If check_contains is set to True, + if target starts with 're:'. If check_contains is set to True, additionally checks if the target string is contained within the value. """ - + if target.startswith("re:"): pattern = target[3:] if re.match(pattern, value): - return target + return True elif check_contains: if target.lower() in value.lower(): - return target + return True elif target == value: - return target + return True + return False diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 78619e19941d..50282af5c50e 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -71,11 +71,12 @@ def __init__( bias=bias, quant_config=quant_config, layer_name=f"{parent_name}.gate_up_proj") - self.down_proj = RowParallelLinear(input_size=intermediate_size, - output_size=hidden_size, - bias=bias, - quant_config=quant_config, - layer_name=f"{parent_name}.down_proj") + self.down_proj = RowParallelLinear( + input_size=intermediate_size, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + layer_name=f"{parent_name}.down_proj") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" "Only silu is supported for now.") @@ -276,11 +277,11 @@ def __init__( else: self.embed_tokens = PPMissingLayer() self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda idx: LlamaDecoderLayer(parent_name=f"model.layers.{idx}", - config=config, - cache_config=cache_config, - quant_config=quant_config)) + config.num_hidden_layers, lambda layer_idx: LlamaDecoderLayer( + parent_name=f"model.layers.{layer_idx}", + config=config, + cache_config=cache_config, + quant_config=quant_config)) if get_pp_group().is_last_rank: self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) else: From f036956d4652e18e4821cf6b512ce1281c7a2b19 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 19:51:34 +0000 Subject: [PATCH 09/26] tweak gpt2 --- vllm/model_executor/models/gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index d309a2b27f5d..23e5eabe3ddc 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -186,7 +186,7 @@ def __init__( self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, - lambda: GPT2Block(config, cache_config, quant_config)) + lambda layer_idx: GPT2Block(config, cache_config, quant_config)) self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) def forward( From ba115ebe132fbfcbec06687f8094ceb84715d2f6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Wed, 17 Jul 2024 21:18:31 +0000 Subject: [PATCH 10/26] added test --- ...a-3-8B-Instruct-nonuniform-compressed-tensors.yaml | 11 +++++++++++ .buildkite/lm-eval-harness/configs/models-small.txt | 1 + 2 files changed, 12 insertions(+) create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml new file mode 100644 index 000000000000..3964f3be5e87 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.758 + - name: "exact_match,flexible-extract" + value: 0.759 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt index 3d1306f6bc4f..869fc9cef377 100644 --- a/.buildkite/lm-eval-harness/configs/models-small.txt +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -2,4 +2,5 @@ Meta-Llama-3-8B-Instruct.yaml Meta-Llama-3-8B-Instruct-FP8.yaml Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml +Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml From 6e28eade7f5b4cf5d6fa128d071c4381b6df4cd6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 01:21:07 +0000 Subject: [PATCH 11/26] update to parent --- .../schemes/compressed_tensors_w8a8_fp8.py | 9 +++++++++ 
vllm/model_executor/models/llama.py | 20 +++++++++---------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index b93425fb2d62..16acc412aea1 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -8,6 +8,7 @@ apply_fp8_linear, create_per_tensor_scale_param, cutlass_fp8_supported, requantize_with_max_scale) from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform __all__ = ["CompressedTensorsW8A8Fp8"] @@ -18,10 +19,18 @@ def __init__(self, input_dynamic: bool): self.input_dynamic = input_dynamic self.cutlass_fp8_supported = cutlass_fp8_supported() + # For GPUs that lack FP8 hardware support, we can leverage the Marlin + # kernel for fast weight-only FP8 quantization + capability = current_platform.get_device_capability() + capability = capability[0] * 10 + capability[1] + self.use_marlin = capability < 89 + # W8A8-Fp8 kernels support only per-tensor and per-channel cases. # So if we have a fused module (QKV, MLP) with per tensor scales (thus N # scales being passed to the kernel), we requantize with a single scale. def process_weights_after_loading(self, layer) -> None: + if self.use_marlin: + weight = layer.weight # Dequant -> Quant with max scale. max_w_scale, weight = requantize_with_max_scale( weight=layer.weight, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 50282af5c50e..9e655317b8a1 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -57,7 +57,7 @@ class LlamaMLP(nn.Module): def __init__( self, - parent_name: str, + prefix: str, hidden_size: int, intermediate_size: int, hidden_act: str, @@ -70,13 +70,13 @@ def __init__( output_sizes=[intermediate_size] * 2, bias=bias, quant_config=quant_config, - layer_name=f"{parent_name}.gate_up_proj") + layer_name=f"{prefix}.gate_up_proj") self.down_proj = RowParallelLinear( input_size=intermediate_size, output_size=hidden_size, bias=bias, quant_config=quant_config, - layer_name=f"{parent_name}.down_proj") + layer_name=f"{prefix}.down_proj") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" "Only silu is supported for now.") @@ -93,7 +93,7 @@ class LlamaAttention(nn.Module): def __init__( self, - parent_name: str, + prefix: str, hidden_size: int, num_heads: int, num_kv_heads: int, @@ -134,14 +134,14 @@ def __init__( total_num_kv_heads=self.total_num_kv_heads, bias=bias, quant_config=quant_config, - layer_name=f"{parent_name}.qkv_proj", + layer_name=f"{prefix}.qkv_proj", ) self.o_proj = RowParallelLinear( input_size=self.total_num_heads * self.head_dim, output_size=hidden_size, bias=bias, quant_config=quant_config, - layer_name=f"{parent_name}.o_proj", + layer_name=f"{prefix}.o_proj", ) self.rotary_emb = get_rope( @@ -177,7 +177,7 @@ class LlamaDecoderLayer(nn.Module): def __init__( self, - parent_name: str, + prefix: str, config: LlamaConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -197,7 +197,7 @@ def __init__( attention_bias = getattr(config, "attention_bias", False) or getattr( config, "bias", False) self.self_attn = LlamaAttention( - parent_name=f"{parent_name}.self_attn", + prefix=f"{prefix}.self_attn", hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=getattr(config, "num_key_value_heads", @@ -210,7 +210,7 @@ def __init__( cache_config=cache_config, ) self.mlp = LlamaMLP( - parent_name=f"{parent_name}.mlp", + prefix=f"{prefix}.mlp", hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, @@ -278,7 +278,7 @@ def __init__( self.embed_tokens = PPMissingLayer() self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda layer_idx: LlamaDecoderLayer( - parent_name=f"model.layers.{layer_idx}", + prefix=f"model.layers.{layer_idx}", config=config, cache_config=cache_config, quant_config=quant_config)) From 793abc740b21f9a808d4f7f7506ba41c134eb6ac Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 01:24:27 +0000 Subject: [PATCH 12/26] format --- vllm/model_executor/models/llama.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 9e655317b8a1..781d901462b9 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -71,12 +71,11 @@ def __init__( bias=bias, quant_config=quant_config, layer_name=f"{prefix}.gate_up_proj") - self.down_proj = RowParallelLinear( - input_size=intermediate_size, - output_size=hidden_size, - bias=bias, - quant_config=quant_config, - layer_name=f"{prefix}.down_proj") + self.down_proj = RowParallelLinear(input_size=intermediate_size, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + layer_name=f"{prefix}.down_proj") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" "Only silu is supported for now.") From bb7d44c213893276ea946c075d115a41d90c3bab Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 01:50:49 +0000 Subject: [PATCH 13/26] format --- vllm/model_executor/layers/linear.py | 24 ++++++++++++------- .../compressed_tensors/compressed_tensors.py | 15 ++++-------- vllm/model_executor/models/gpt2.py | 3 ++- vllm/model_executor/models/llama.py | 19 ++++++++------- vllm/model_executor/models/utils.py | 11 +++++---- 5 files changed, 40 insertions(+), 32 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 9edadf4f6110..8f9343063b7c 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -171,6 +171,8 @@ class ReplicatedLinear(LinearBase): skip_bias_add: If true, skip adding bias but instead return it. params_dtype: Data type for the parameters. quant_config: Quantization configure. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.qkv_proj) """ def __init__( @@ -181,7 +183,7 @@ def __init__( skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, - layer_name: Optional[str] = None, + prefix: Optional[str] = None, ): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config) @@ -193,7 +195,7 @@ def __init__( self.input_size, self.output_size, self.params_dtype, - layer_name=layer_name) + layer_name=prefix) if bias: self.bias = Parameter( @@ -245,6 +247,8 @@ class ColumnParallelLinear(LinearBase): quant_config: Quantization configure. output_sizes: list of output sizes packed into one output, like for QKV the list would be size 3. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.qkv_proj) """ def __init__( @@ -257,7 +261,7 @@ def __init__( params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, output_sizes: Optional[List[int]] = None, - layer_name: Optional[str] = None, + prefix: Optional[str] = None, ): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config) @@ -285,7 +289,7 @@ def __init__( input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - layer_name=layer_name, + layer_name=prefix, weight_loader=self.weight_loader, ) if bias: @@ -359,6 +363,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear): skip adding bias but instead return it. params_dtype: Data type for the parameters. quant_config: Quantization configure. + prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.qkv_proj) """ def __init__(self, @@ -369,7 +375,7 @@ def __init__(self, skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, - layer_name: Optional[str] = None): + prefix: Optional[str] = None): self.output_sizes = output_sizes tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) @@ -381,7 +387,7 @@ def __init__(self, skip_bias_add=skip_bias_add, params_dtype=params_dtype, quant_config=quant_config, - layer_name=layer_name, + prefix=prefix, ) def weight_loader(self, @@ -502,6 +508,8 @@ class QKVParallelLinear(ColumnParallelLinear): skip adding bias but instead return it. params_dtype: Data type for the parameters. quant_config: Quantization configure. 
+ prefix: The name of the layer in the state dict, including all parents + (e.g. model.layers.0.qkv_proj) """ def __init__(self, @@ -513,7 +521,7 @@ def __init__(self, skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, - layer_name: Optional[str] = None): + prefix: Optional[str] = None): self.hidden_size = hidden_size self.head_size = head_size self.total_num_heads = total_num_heads @@ -546,7 +554,7 @@ def __init__(self, skip_bias_add=skip_bias_add, params_dtype=params_dtype, quant_config=quant_config, - layer_name=layer_name) + prefix=prefix) def weight_loader(self, param: Parameter, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 85ba71ffe6d3..e4df97c465fd 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -257,25 +257,20 @@ def __init__(self, quantization_config: CompressedTensorsConfig): def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.scheme.process_weights_after_loading(layer) - def create_weights(self, - layer: torch.nn.Module, + def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - layer_name: Optional[str] = None, + output_partition_sizes: List[int], input_size: int, + output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): """ Use the CompressedTensorsScheme associated with each layer to create the necessary parameters for the layer. 
See LinearMethodBase for param details - """ weight_loader = extra_weight_attrs.get("weight_loader") + layer_name = extra_weight_attrs.get("prefix") - scheme = self.quantization_config.get_scheme(layer=layer, - layer_name=layer_name) + scheme = self.quantization_config.get_scheme(layer, layer_name) scheme.create_weights( layer=layer, input_size=input_size, diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 23e5eabe3ddc..f64f4e577408 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -186,7 +186,8 @@ def __init__( self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, - lambda layer_idx: GPT2Block(config, cache_config, quant_config)) + lambda prefix: GPT2Block(config, cache_config, quant_config), + prefix="") self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) def forward( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 781d901462b9..c00ec9ed82db 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -57,12 +57,12 @@ class LlamaMLP(nn.Module): def __init__( self, - prefix: str, hidden_size: int, intermediate_size: int, hidden_act: str, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, + prefix: str = "", ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( @@ -176,10 +176,10 @@ class LlamaDecoderLayer(nn.Module): def __init__( self, - prefix: str, config: LlamaConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -196,7 +196,6 @@ def __init__( attention_bias = getattr(config, "attention_bias", False) or getattr( config, "bias", False) self.self_attn = LlamaAttention( - prefix=f"{prefix}.self_attn", hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=getattr(config, "num_key_value_heads", @@ -207,14 +206,15 @@ def __init__( quant_config=quant_config, bias=attention_bias, cache_config=cache_config, + prefix=f"{prefix}.self_attn", ) self.mlp = LlamaMLP( - prefix=f"{prefix}.mlp", hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, bias=getattr(config, "mlp_bias", False), + prefix=f"{prefix}.mlp", ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -276,11 +276,12 @@ def __init__( else: self.embed_tokens = PPMissingLayer() self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, lambda layer_idx: LlamaDecoderLayer( - prefix=f"model.layers.{layer_idx}", - config=config, - cache_config=cache_config, - quant_config=quant_config)) + config.num_hidden_layers, + lambda prefix: LlamaDecoderLayer(config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), + prefix="model.layers") if get_pp_group().is_last_rank: self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) else: diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 1196cd7de5e6..db890b7f268c 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -53,7 +53,9 @@ def __init__(self, *args, **kwargs): def make_layers( - num_hidden_layers: int, layer_fn: Callable[[], torch.nn.Module] + num_hidden_layers: 
int, + layer_fn: Callable[[], torch.nn.Module], + prefix: str, ) -> Tuple[int, int, torch.nn.ModuleList]: """Make a list of layers with the given layer function, taking pipeline parallelism into account. @@ -64,9 +66,10 @@ def make_layers( get_pp_group().rank_in_group, get_pp_group().world_size) modules = torch.nn.ModuleList( - [PPMissingLayer() for _ in range(start_layer)] + - [layer_fn(idx) for idx in range(start_layer, end_layer)] + - [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)]) + [PPMissingLayer() for _ in range(start_layer)] + [ + layer_fn(prefix=f"{prefix}.{idx}") + for idx in range(start_layer, end_layer) + ] + [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)]) return start_layer, end_layer, modules From 24e5cdf9a1195dcefa0f5081a9b9971498e2f27f Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 01:52:22 +0000 Subject: [PATCH 14/26] format --- vllm/model_executor/models/llama.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index c00ec9ed82db..206648463624 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -70,12 +70,12 @@ def __init__( output_sizes=[intermediate_size] * 2, bias=bias, quant_config=quant_config, - layer_name=f"{prefix}.gate_up_proj") + prefix=f"{prefix}.gate_up_proj") self.down_proj = RowParallelLinear(input_size=intermediate_size, output_size=hidden_size, bias=bias, quant_config=quant_config, - layer_name=f"{prefix}.down_proj") + prefix=f"{prefix}.down_proj") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. " "Only silu is supported for now.") @@ -92,7 +92,6 @@ class LlamaAttention(nn.Module): def __init__( self, - prefix: str, hidden_size: int, num_heads: int, num_kv_heads: int, @@ -102,6 +101,7 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, bias: bool = False, cache_config: Optional[CacheConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = hidden_size @@ -133,14 +133,14 @@ def __init__( total_num_kv_heads=self.total_num_kv_heads, bias=bias, quant_config=quant_config, - layer_name=f"{prefix}.qkv_proj", + prefix=f"{prefix}.qkv_proj", ) self.o_proj = RowParallelLinear( input_size=self.total_num_heads * self.head_dim, output_size=hidden_size, bias=bias, quant_config=quant_config, - layer_name=f"{prefix}.o_proj", + prefix=f"{prefix}.o_proj", ) self.rotary_emb = get_rope( From 75a617b51bf8a1e090afeddaa4ae0c8d69dc90c9 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 01:53:25 +0000 Subject: [PATCH 15/26] format --- vllm/model_executor/layers/linear.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 8f9343063b7c..740f42823790 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -195,7 +195,7 @@ def __init__( self.input_size, self.output_size, self.params_dtype, - layer_name=prefix) + prefix=prefix) if bias: self.bias = Parameter( @@ -289,7 +289,7 @@ def __init__( input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - layer_name=prefix, + prefix=prefix, weight_loader=self.weight_loader, ) if bias: @@ -714,7 +714,7 @@ def __init__(self, params_dtype: Optional[torch.dtype] = None, reduce_results: bool = True, quant_config: Optional[QuantizationConfig] = None, - layer_name: 
Optional[str] = None): + prefix: Optional[str] = None): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config) @@ -732,7 +732,7 @@ def __init__(self, input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - layer_name=layer_name, + prefix=prefix, weight_loader=self.weight_loader) if not reduce_results and (bias and not skip_bias_add): raise ValueError("When not reduce the results, adding bias to the " From 9f7aecafc637a0a563ca7c4432a889caf7b3df50 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 02:04:32 +0000 Subject: [PATCH 16/26] revert marlin change on wrong branch --- .../schemes/compressed_tensors_w8a8_fp8.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 16acc412aea1..9ed8a7839273 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -19,18 +19,10 @@ def __init__(self, input_dynamic: bool): self.input_dynamic = input_dynamic self.cutlass_fp8_supported = cutlass_fp8_supported() - # For GPUs that lack FP8 hardware support, we can leverage the Marlin - # kernel for fast weight-only FP8 quantization - capability = current_platform.get_device_capability() - capability = capability[0] * 10 + capability[1] - self.use_marlin = capability < 89 - # W8A8-Fp8 kernels support only per-tensor and per-channel cases. # So if we have a fused module (QKV, MLP) with per tensor scales (thus N # scales being passed to the kernel), we requantize with a single scale. def process_weights_after_loading(self, layer) -> None: - if self.use_marlin: - weight = layer.weight # Dequant -> Quant with max scale. 
max_w_scale, weight = requantize_with_max_scale( weight=layer.weight, From 5b1369acac39e44f7ddb087ba16a2691b83cf95d Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 02:05:14 +0000 Subject: [PATCH 17/26] spurious chnage --- .../compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 9ed8a7839273..b93425fb2d62 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -8,7 +8,6 @@ apply_fp8_linear, create_per_tensor_scale_param, cutlass_fp8_supported, requantize_with_max_scale) from vllm.model_executor.utils import set_weight_attrs -from vllm.platforms import current_platform __all__ = ["CompressedTensorsW8A8Fp8"] From 45acd28983305b337cbdde28cb86801d21ba92e3 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 02:10:21 +0000 Subject: [PATCH 18/26] finally done --- vllm/model_executor/layers/linear.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 740f42823790..d8805346bfde 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -289,9 +289,8 @@ def __init__( input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - prefix=prefix, weight_loader=self.weight_loader, - ) + prefix=prefix) if bias: self.bias = Parameter( torch.empty(self.output_size_per_partition, @@ -387,8 +386,7 @@ def __init__(self, skip_bias_add=skip_bias_add, params_dtype=params_dtype, quant_config=quant_config, - prefix=prefix, - ) + prefix=prefix) def weight_loader(self, param: Parameter, @@ -732,8 +730,9 @@ def __init__(self, input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - prefix=prefix, - weight_loader=self.weight_loader) + weight_loader=self.weight_loader, + prefix=prefix) + if not reduce_results and (bias and not skip_bias_add): raise ValueError("When not reduce the results, adding bias to the " "results can lead to incorrect results") From 87c8c87f9955bb459994d28a610083075e27fd4e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 02:11:53 +0000 Subject: [PATCH 19/26] replicated linear num lines --- vllm/model_executor/layers/linear.py | 35 +++++++++++++--------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index d8805346bfde..73327c09a653 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -175,16 +175,14 @@ class ReplicatedLinear(LinearBase): (e.g. 
model.layers.0.qkv_proj) """ - def __init__( - self, - input_size: int, - output_size: int, - bias: bool = True, - skip_bias_add: bool = False, - params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: Optional[str] = None, - ): + def __init__(self, + input_size: int, + output_size: int, + bias: bool = True, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: Optional[str] = None): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config) @@ -378,15 +376,14 @@ def __init__(self, self.output_sizes = output_sizes tp_size = get_tensor_model_parallel_world_size() assert all(output_size % tp_size == 0 for output_size in output_sizes) - super().__init__( - input_size=input_size, - output_size=sum(output_sizes), - bias=bias, - gather_output=gather_output, - skip_bias_add=skip_bias_add, - params_dtype=params_dtype, - quant_config=quant_config, - prefix=prefix) + super().__init__(input_size=input_size, + output_size=sum(output_sizes), + bias=bias, + gather_output=gather_output, + skip_bias_add=skip_bias_add, + params_dtype=params_dtype, + quant_config=quant_config, + prefix=prefix) def weight_loader(self, param: Parameter, From 477b1ea49bc19dd165958ff7dde46c70eab4a257 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 02:13:27 +0000 Subject: [PATCH 20/26] remove unnessary changes --- vllm/model_executor/layers/linear.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 73327c09a653..86d15207fb6b 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -249,18 +249,16 @@ class ColumnParallelLinear(LinearBase): (e.g. 
model.layers.0.qkv_proj) """ - def __init__( - self, - input_size: int, - output_size: int, - bias: bool = True, - gather_output: bool = False, - skip_bias_add: bool = False, - params_dtype: Optional[torch.dtype] = None, - quant_config: Optional[QuantizationConfig] = None, - output_sizes: Optional[List[int]] = None, - prefix: Optional[str] = None, - ): + def __init__(self, + input_size: int, + output_size: int, + bias: bool = True, + gather_output: bool = False, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + output_sizes: Optional[List[int]] = None, + prefix: Optional[str] = None): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config) @@ -729,7 +727,6 @@ def __init__(self, params_dtype=self.params_dtype, weight_loader=self.weight_loader, prefix=prefix) - if not reduce_results and (bias and not skip_bias_add): raise ValueError("When not reduce the results, adding bias to the " "results can lead to incorrect results") From 998c84f6cb301517cd38b8659e25f76e8c56f5d8 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 19:46:49 +0000 Subject: [PATCH 21/26] stash --- vllm/model_executor/models/utils.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index db890b7f268c..0ec900a9dd08 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,4 +1,4 @@ -from typing import Callable, Dict, List, Tuple +from typing import Callable, Dict, List, Protocol, Tuple import torch @@ -52,9 +52,16 @@ def __init__(self, *args, **kwargs): super().__init__() +class LayerFn(Protocol): + def __call__( + self, prefix="", + ) -> torch.nn.Module: + ... 
+ + def make_layers( num_hidden_layers: int, - layer_fn: Callable[[], torch.nn.Module], + layer_fn: LayerFn, prefix: str, ) -> Tuple[int, int, torch.nn.ModuleList]: """Make a list of layers with the given layer function, taking From a826e666b5f27e66cef645a57dda3fd98bd10dda Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 20:38:13 +0000 Subject: [PATCH 22/26] staged --- vllm/model_executor/layers/fused_moe/layer.py | 1 + vllm/model_executor/models/gpt2.py | 15 ++++++++--- vllm/model_executor/models/mixtral.py | 26 ++++++++++++++----- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index bb2be3f3eb56..a6fa8ffe5111 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -158,6 +158,7 @@ def __init__( topk_group: Optional[int] = None, quant_config: Optional[QuantizationConfig] = None, tp_size: Optional[int] = None, + prefix: str = "", ): super().__init__() diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index f64f4e577408..86b6a3f62b7f 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -51,6 +51,7 @@ def __init__( config: GPT2Config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.hidden_size = config.hidden_size @@ -68,12 +69,14 @@ def __init__( total_num_heads, bias=True, quant_config=quant_config, + prefix=f"{prefix}.c_attn", ) self.c_proj = RowParallelLinear( self.hidden_size, self.hidden_size, bias=True, quant_config=quant_config, + prefix=f"{prefix}.c_proj", ) self.attn = Attention(self.num_heads, self.head_dim, @@ -101,6 +104,7 @@ def __init__( intermediate_size: int, config: GPT2Config, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() hidden_size = config.hidden_size @@ -109,12 +113,14 @@ def __init__( intermediate_size, bias=True, quant_config=quant_config, + prefix=f"{prefix}.c_fc", ) self.c_proj = RowParallelLinear( intermediate_size, hidden_size, bias=True, quant_config=quant_config, + prefix=f"{prefix}.c_proj", ) self.act = get_act_fn(config.activation_function, quant_config, intermediate_size) @@ -133,6 +139,7 @@ def __init__( config: GPT2Config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() hidden_size = config.hidden_size @@ -140,9 +147,9 @@ def __init__( hidden_size) self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.attn = GPT2Attention(config, cache_config, quant_config) + self.attn = GPT2Attention(config, cache_config, quant_config, prefix=f"{prefix}.attn") self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.mlp = GPT2MLP(inner_dim, config, quant_config) + self.mlp = GPT2MLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp") def forward( self, @@ -186,8 +193,8 @@ def __init__( self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, - lambda prefix: GPT2Block(config, cache_config, quant_config), - prefix="") + lambda prefix: GPT2Block(config, cache_config, quant_config, prefix=prefix), + prefix="transformer.h") self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) def forward( diff --git 
a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 28dbcb30bdf5..59ae0d5dfa09 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -67,7 +67,8 @@ def __init__(self, intermediate_size: int, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, - tp_size: Optional[int] = None): + tp_size: Optional[int] = None, + prefix: str = ""): super().__init__() self.hidden_size = hidden_size @@ -76,7 +77,8 @@ def __init__(self, num_experts, bias=False, params_dtype=params_dtype, - quant_config=None) + quant_config=None, + prefix=prefix) self.experts = FusedMoE(num_experts=num_experts, top_k=top_k, @@ -86,7 +88,8 @@ def __init__(self, reduce_results=True, renormalize=True, quant_config=quant_config, - tp_size=tp_size) + tp_size=tp_size, + prefix=prefix) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # NOTE: hidden_states can have either 1D or 2D shape. @@ -109,6 +112,7 @@ def __init__( rope_theta: float = 10000, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = hidden_size @@ -139,12 +143,14 @@ def __init__( self.total_num_kv_heads, bias=False, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, hidden_size, bias=False, quant_config=quant_config, + prefix=f"{prefix}.o_proj", ) self.rotary_emb = get_rope( self.head_dim, @@ -182,6 +188,7 @@ def __init__( config: MixtralConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -194,13 +201,15 @@ def __init__( num_kv_heads=config.num_key_value_heads, rope_theta=rope_theta, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.self_attn") self.block_sparse_moe = MixtralMoE( num_experts=config.num_local_experts, top_k=config.num_experts_per_tok, hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.block_sparse_moe") self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = RMSNorm(config.hidden_size, @@ -258,8 +267,11 @@ def __init__( ) self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, lambda: MixtralDecoderLayer( - config, cache_config, quant_config=quant_config)) + config.num_hidden_layers, + lambda prefix: MixtralDecoderLayer( + config, cache_config, + quant_config=quant_config, prefix=prefix), + prefix="model.layers") self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) From 3ab65a745ecb5d2e17b1856be474c8a105c5ecb1 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 20:50:15 +0000 Subject: [PATCH 23/26] format --- .../compressed_tensors/compressed_tensors.py | 4 ++-- .../layers/quantization/compressed_tensors/utils.py | 9 --------- vllm/model_executor/models/gpt2.py | 13 ++++++++++--- vllm/model_executor/models/mixtral.py | 6 +++--- vllm/model_executor/models/utils.py | 7 +++++-- 5 files changed, 20 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 
1f250b94e076..28c552b3654f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -13,8 +13,8 @@ CompressedTensorsW8A8Int8, CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, - QuantizationType, is_activation_quantization_format, - find_matched_target, should_ignore_layer) + QuantizationType, find_matched_target, is_activation_quantization_format, + should_ignore_layer) from vllm.platforms import current_platform diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 0579dd821542..b3110ce65330 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -86,15 +86,6 @@ def is_activation_quantization_format(format: str) -> bool: return format in _ACTIVATION_QUANTIZATION_FORMATS -def is_activation_quantization_format(format: str) -> bool: - _ACTIVATION_QUANTIZATION_FORMATS = [ - CompressionFormat.naive_quantized.value, - CompressionFormat.int_quantized.value, - CompressionFormat.float_quantized.value - ] - return format in _ACTIVATION_QUANTIZATION_FORMATS - - # fused_name: List[shard_name] _FUSED_LAYER_NAME_MAPPING = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 86b6a3f62b7f..6ef39aa0ceb9 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -147,9 +147,15 @@ def __init__( hidden_size) self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.attn = GPT2Attention(config, cache_config, quant_config, prefix=f"{prefix}.attn") + self.attn = GPT2Attention(config, + cache_config, + quant_config, + prefix=f"{prefix}.attn") self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.mlp = GPT2MLP(inner_dim, config, quant_config, prefix=f"{prefix}.mlp") + self.mlp = GPT2MLP(inner_dim, + config, + quant_config, + prefix=f"{prefix}.mlp") def forward( self, @@ -193,7 +199,8 @@ def __init__( self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, - lambda prefix: GPT2Block(config, cache_config, quant_config, prefix=prefix), + lambda prefix: GPT2Block( + config, cache_config, quant_config, prefix=prefix), prefix="transformer.h") self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 59ae0d5dfa09..a584e5465d6e 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -267,10 +267,10 @@ def __init__( ) self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, + config.num_hidden_layers, lambda prefix: MixtralDecoderLayer( - config, cache_config, - quant_config=quant_config, prefix=prefix), + config, cache_config, quant_config=quant_config, prefix=prefix + ), prefix="model.layers") self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 99be055e4dda..197d3839a766 100644 --- a/vllm/model_executor/models/utils.py +++ 
b/vllm/model_executor/models/utils.py @@ -1,4 +1,4 @@ -from typing import Callable, Dict, List, Protocol, Tuple +from typing import Dict, List, Protocol, Tuple import torch from torch.func import functional_call @@ -46,8 +46,10 @@ def merge_vision_embeddings(input_ids: torch.Tensor, class LayerFn(Protocol): + def __call__( - self, prefix="", + self, + prefix="", ) -> torch.nn.Module: ... @@ -124,6 +126,7 @@ def forward(*args, **kwargs): return module + def make_layers( num_hidden_layers: int, layer_fn: LayerFn, From 4d5fc07fc0a2b7f10a9ef23f1d93ad350086152e Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 21:10:48 +0000 Subject: [PATCH 24/26] move root back --- vllm/model_executor/models/gpt2.py | 6 ++++-- vllm/model_executor/models/llama.py | 6 ++++-- vllm/model_executor/models/mixtral.py | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 6ef39aa0ceb9..65a755794115 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -188,6 +188,7 @@ def __init__( config: GPT2Config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config @@ -201,7 +202,7 @@ def __init__( config.num_hidden_layers, lambda prefix: GPT2Block( config, cache_config, quant_config, prefix=prefix), - prefix="transformer.h") + prefix=f"{prefix}.h") self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) def forward( @@ -244,7 +245,8 @@ def __init__( super().__init__() self.config = config self.quant_config = quant_config - self.transformer = GPT2Model(config, cache_config, quant_config) + self.transformer = GPT2Model(config, cache_config, + quant_config, prefix="transformer") self.lm_head = self.transformer.wte self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = Sampler() diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 206648463624..7d07b7194258 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -258,6 +258,7 @@ def __init__( cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.config = config @@ -281,7 +282,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, prefix=prefix), - prefix="model.layers") + prefix=f"{prefix}.layers") if get_pp_group().is_last_rank: self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) else: @@ -377,7 +378,8 @@ def __init__( self.model = LlamaModel(config, cache_config, quant_config, - lora_config=lora_config) + lora_config=lora_config, + prefix="model") if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size if lora_config: diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index a584e5465d6e..10717977fad4 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -252,6 +252,7 @@ def __init__( cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.padding_idx = config.pad_token_id @@ -271,7 +272,7 @@ def __init__( lambda prefix: MixtralDecoderLayer( config, cache_config, quant_config=quant_config, prefix=prefix ), 
- prefix="model.layers") + prefix=f"{prefix}.layers") self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -343,7 +344,8 @@ def __init__( self.model = MixtralModel(config, cache_config, quant_config, - lora_config=lora_config) + lora_config=lora_config, + prefix="model") self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size From 1fa917e0b9a30ca0faa8c00256556a2cab8f2fa6 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 21:12:55 +0000 Subject: [PATCH 25/26] fix mixtral --- vllm/model_executor/models/mixtral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 10717977fad4..8fbd537a2c03 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -78,7 +78,7 @@ def __init__(self, bias=False, params_dtype=params_dtype, quant_config=None, - prefix=prefix) + prefix=f"{prefix}.gate") self.experts = FusedMoE(num_experts=num_experts, top_k=top_k, @@ -89,7 +89,7 @@ def __init__(self, renormalize=True, quant_config=quant_config, tp_size=tp_size, - prefix=prefix) + prefix=f"{prefix}.experts") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # NOTE: hidden_states can have either 1D or 2D shape. From 1e57ffadb7322014fc1dd171af9cedb26733efd1 Mon Sep 17 00:00:00 2001 From: "rshaw@neuralmagic.com" Date: Thu, 18 Jul 2024 21:21:15 +0000 Subject: [PATCH 26/26] formatted --- vllm/model_executor/models/gpt2.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 65a755794115..94cd67e75336 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -245,8 +245,10 @@ def __init__( super().__init__() self.config = config self.quant_config = quant_config - self.transformer = GPT2Model(config, cache_config, - quant_config, prefix="transformer") + self.transformer = GPT2Model(config, + cache_config, + quant_config, + prefix="transformer") self.lm_head = self.transformer.wte self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = Sampler()