From d695ec3ea08a628774293ae98e904a380c181fd9 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Wed, 5 Jun 2024 16:00:16 -0400 Subject: [PATCH 01/38] Add compressed-tensors HFQuantizer implementation --- src/transformers/modeling_utils.py | 2 +- src/transformers/quantizers/auto.py | 4 ++ .../quantizer_compressed_tensors.py | 69 +++++++++++++++++++ src/transformers/utils/quantization_config.py | 53 ++++++++++++++ 4 files changed, 127 insertions(+), 1 deletion(-) create mode 100644 src/transformers/quantizers/quantizer_compressed_tensors.py diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 27f26e42a84a..713f45411826 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -3826,7 +3826,7 @@ def from_pretrained( dispatch_model(model, **device_map_kwargs) if hf_quantizer is not None: - hf_quantizer.postprocess_model(model) + hf_quantizer.postprocess_model(model, resolved_archive_file=resolved_archive_file) model.hf_quantizer = hf_quantizer if _adapter_model_path is not None: diff --git a/src/transformers/quantizers/auto.py b/src/transformers/quantizers/auto.py index 2c65afa77e28..f2922ee9677d 100755 --- a/src/transformers/quantizers/auto.py +++ b/src/transformers/quantizers/auto.py @@ -19,6 +19,7 @@ AqlmConfig, AwqConfig, BitsAndBytesConfig, + CompressedTensorsConfig, EetqConfig, GPTQConfig, HqqConfig, @@ -30,6 +31,7 @@ from .quantizer_awq import AwqQuantizer from .quantizer_bnb_4bit import Bnb4BitHfQuantizer from .quantizer_bnb_8bit import Bnb8BitHfQuantizer +from .quantizer_compressed_tensors import CompressedTensorsHfQuantizer from .quantizer_eetq import EetqHfQuantizer from .quantizer_gptq import GptqHfQuantizer from .quantizer_hqq import HqqHfQuantizer @@ -45,6 +47,7 @@ "quanto": QuantoHfQuantizer, "eetq": EetqHfQuantizer, "hqq": HqqHfQuantizer, + "compressed_tensors": CompressedTensorsHfQuantizer, } AUTO_QUANTIZATION_CONFIG_MAPPING = { @@ -56,6 +59,7 @@ "aqlm": AqlmConfig, "quanto": QuantoConfig, "hqq": HqqConfig, + "compressed_tensors": CompressedTensorsConfig, } diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py new file mode 100644 index 000000000000..a201f504fc4e --- /dev/null +++ b/src/transformers/quantizers/quantizer_compressed_tensors.py @@ -0,0 +1,69 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .base import HfQuantizer + + +from ..utils import is_torch_available, logging +from ..utils.quantization_config import QuantizationConfigMixin + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +class CompressedTensorsHfQuantizer(HfQuantizer): + """ + Quantizer for the compressed_tensors package. 
Loads and restores models to + quantized state with compressed_tensors + """ + + requires_calibration = False + # requires_parameters_quantization = True + required_packages = ["compressed_tensors"] + + def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): + super().__init__(quantization_config, **kwargs) + + from compressed_tensors.compressors import ModelCompressor + self.compressor = ModelCompressor.from_compression_config(quantization_config) + + def validate_environment(self, *args, **kwargs): + # check torch and compressed_tensors are available, let ImportError raise otherwise + import torch + from compressed_tensors.compressors import ModelCompressor + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + torch_dtype = torch.float16 + elif torch_dtype != torch.float16: + logger.info("We suggest you to set `torch_dtype=torch.float16` for better efficiency with compressed_tensors.") + return torch_dtype + + def _process_model_before_weight_loading(self, model, **kwargs): + if self.quantization_config.quantization_config is not None: + from compressed_tensors.quantization import apply_quantization_config + apply_quantization_config(model, self.quantization_config.quantization_config) + + def _process_model_after_weight_loading(self, model, resolved_archive_file, **kwargs): + self.compressor.decompress(model_path=resolved_archive_file, model=model) + + @property + def is_trainable(self): + return False + + @property + def is_serializable(self): + return True diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index f9e503cf862f..496501557fde 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -42,6 +42,7 @@ class QuantizationMethod(str, Enum): QUANTO = "quanto" EETQ = "eetq" HQQ = "hqq" + COMPRESSED_TENSORS = "compressed_tensors" class AWQLinearVersion(str, Enum): @@ -1038,3 +1039,55 @@ def post_init(self): accepted_weights = ["int8"] if self.weights not in accepted_weights: raise ValueError(f"Only support weights in {accepted_weights} but found {self.weights}") + + +@dataclass +class CompressedTensorsConfig(QuantizationConfigMixin): + """ + This is a wrapper class that handles compressed-tensors quantization config options. + It is a wrapper around `compressed_tensors.QuantizationConfig` + + Args: + weights (`str`, *optional*, defaults to `"int8"`): + The target dtype for the weights. Supported value is only "int8" + modules_to_not_convert (`list`, *optional*, default to `None`): + The list of modules to not quantize, useful for quantizing models that explicitly require to have + some modules left in their original precision. 
+ """ + + def __init__( + self, + config_groups: Dict[str, Union["QuantizationScheme", List[str]]] = None, + quant_method: str = "sparseml", + format: str = "fakequant", + quantization_status: "QuantizationStatus" = "initialized", + global_compression_ratio: Optional[float] = None, + ignore: Optional[List[str]] = None, + sparsity_config: Dict[str, Any] = None, + **kwargs, + ): + from compressed_tensors import QuantizationConfig + from compressed_tensors.config import SparsityCompressionConfig + + self.quantization_config = None + self.sparsity_configq = None + + # parse from dict to load nested QuantizationScheme objects + if config_groups: + self.quantization_config = QuantizationConfig.parse_obj( + dict( + config_groups=config_groups, + quant_method=quant_method, + format=format, + quantization_status=quantization_status, + global_compression_ratio=global_compression_ratio, + ignore=ignore, + ) + ) + + if sparsity_config: + self.sparsity_config = SparsityCompressionConfig.load_from_registry( + sparsity_config.get("format"), **sparsity_config + ) + + super().__init__(quant_method=QuantizationMethod.COMPRESSED_TENSORS) From f4689647e5620fea55118fcafc414bedc76759a0 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Wed, 5 Jun 2024 16:10:08 -0400 Subject: [PATCH 02/38] flag serializable as False --- src/transformers/quantizers/quantizer_compressed_tensors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py index a201f504fc4e..d1ebe4dc6640 100644 --- a/src/transformers/quantizers/quantizer_compressed_tensors.py +++ b/src/transformers/quantizers/quantizer_compressed_tensors.py @@ -66,4 +66,4 @@ def is_trainable(self): @property def is_serializable(self): - return True + return False From 41224d3d5f0f194b87ec1099bba1e42aff0056f1 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Mon, 10 Jun 2024 19:56:33 +0000 Subject: [PATCH 03/38] run --- .../quantizer_compressed_tensors.py | 14 +++++++----- src/transformers/utils/quantization_config.py | 22 ++++++++++--------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py index d1ebe4dc6640..b24700d38c4e 100644 --- a/src/transformers/quantizers/quantizer_compressed_tensors.py +++ b/src/transformers/quantizers/quantizer_compressed_tensors.py @@ -11,11 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from .base import HfQuantizer - - from ..utils import is_torch_available, logging from ..utils.quantization_config import QuantizationConfigMixin +from .base import HfQuantizer if is_torch_available(): @@ -38,23 +36,27 @@ def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): super().__init__(quantization_config, **kwargs) from compressed_tensors.compressors import ModelCompressor + + # self.compressor = ModelCompressor.from_compression_config(quantization_config.to_dict()) self.compressor = ModelCompressor.from_compression_config(quantization_config) def validate_environment(self, *args, **kwargs): # check torch and compressed_tensors are available, let ImportError raise otherwise - import torch - from compressed_tensors.compressors import ModelCompressor + pass def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": if torch_dtype is None: torch_dtype = torch.float16 elif torch_dtype != torch.float16: - logger.info("We suggest you to set `torch_dtype=torch.float16` for better efficiency with compressed_tensors.") + logger.info( + "We suggest you to set `torch_dtype=torch.float16` for better efficiency with compressed_tensors." + ) return torch_dtype def _process_model_before_weight_loading(self, model, **kwargs): if self.quantization_config.quantization_config is not None: from compressed_tensors.quantization import apply_quantization_config + apply_quantization_config(model, self.quantization_config.quantization_config) def _process_model_after_weight_loading(self, model, resolved_archive_file, **kwargs): diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 496501557fde..3031ef6ff341 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -22,6 +22,8 @@ from enum import Enum from typing import Any, Dict, List, Optional, Union +from compressed_tensors.quantization.quant_config import QuantizationStatus +from compressed_tensors.quantization.quant_scheme import QuantizationScheme from packaging import version from ..utils import is_auto_awq_available, is_hqq_available, is_torch_available, logging @@ -1059,7 +1061,7 @@ def __init__( self, config_groups: Dict[str, Union["QuantizationScheme", List[str]]] = None, quant_method: str = "sparseml", - format: str = "fakequant", + format: str = "dense", # "fakequant" not in CompressionFormat quantization_status: "QuantizationStatus" = "initialized", global_compression_ratio: Optional[float] = None, ignore: Optional[List[str]] = None, @@ -1070,19 +1072,19 @@ def __init__( from compressed_tensors.config import SparsityCompressionConfig self.quantization_config = None - self.sparsity_configq = None + self.sparsity_config = None # parse from dict to load nested QuantizationScheme objects if config_groups: self.quantization_config = QuantizationConfig.parse_obj( - dict( - config_groups=config_groups, - quant_method=quant_method, - format=format, - quantization_status=quantization_status, - global_compression_ratio=global_compression_ratio, - ignore=ignore, - ) + { + "config_groups": config_groups, + "quant_method": quant_method, + "format": format, + "quantization_status": quantization_status, + "global_compression_ratio": global_compression_ratio, + "ignore": ignore, + } ) if sparsity_config: From b61bfb968db36e1f4f1a0f03f8697f7b116b0591 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Mon, 10 Jun 2024 20:47:38 +0000 Subject: [PATCH 04/38] revive lines deleted by ruff --- 
src/transformers/quantizers/quantizer_compressed_tensors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py index b24700d38c4e..3d0d2e009942 100644 --- a/src/transformers/quantizers/quantizer_compressed_tensors.py +++ b/src/transformers/quantizers/quantizer_compressed_tensors.py @@ -37,12 +37,12 @@ def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): from compressed_tensors.compressors import ModelCompressor - # self.compressor = ModelCompressor.from_compression_config(quantization_config.to_dict()) self.compressor = ModelCompressor.from_compression_config(quantization_config) def validate_environment(self, *args, **kwargs): # check torch and compressed_tensors are available, let ImportError raise otherwise - pass + import torch # noqa + from compressed_tensors.compressors import ModelCompressor # noqa def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": if torch_dtype is None: From ff8f1c5af0be2eb22095025cd12175345cb2f52e Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Tue, 11 Jun 2024 18:19:41 +0000 Subject: [PATCH 05/38] fixes to load+save from sparseml, edit config to quantization_config, and load back --- src/transformers/quantizers/auto.py | 2 +- .../quantizer_compressed_tensors.py | 2 +- src/transformers/utils/quantization_config.py | 22 +++++++++++++++++-- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/transformers/quantizers/auto.py b/src/transformers/quantizers/auto.py index f2922ee9677d..5e26ed91dc40 100755 --- a/src/transformers/quantizers/auto.py +++ b/src/transformers/quantizers/auto.py @@ -59,7 +59,7 @@ "aqlm": AqlmConfig, "quanto": QuantoConfig, "hqq": HqqConfig, - "compressed_tensors": CompressedTensorsConfig, + "compressed-tensors": CompressedTensorsConfig, } diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py index 3d0d2e009942..8493fbfd7fa3 100644 --- a/src/transformers/quantizers/quantizer_compressed_tensors.py +++ b/src/transformers/quantizers/quantizer_compressed_tensors.py @@ -68,4 +68,4 @@ def is_trainable(self): @property def is_serializable(self): - return False + return True diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 3031ef6ff341..f5aa79e70e42 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -18,13 +18,14 @@ import importlib.metadata import json import os -from dataclasses import dataclass +from dataclasses import asdict, dataclass, is_dataclass from enum import Enum from typing import Any, Dict, List, Optional, Union from compressed_tensors.quantization.quant_config import QuantizationStatus from compressed_tensors.quantization.quant_scheme import QuantizationScheme from packaging import version +from pydantic import BaseModel from ..utils import is_auto_awq_available, is_hqq_available, is_torch_available, logging @@ -70,6 +71,23 @@ class AwqBackendPackingMethod(str, Enum): LLMAWQ = "llm-awq" +def convert_to_dict(obj): + if is_dataclass(obj): + return asdict(obj) + elif isinstance(obj, BaseModel): + return obj.dict() + elif isinstance(obj, Enum): + return obj.value + elif isinstance(obj, dict): + return {k: convert_to_dict(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [convert_to_dict(i) for i in obj] + elif 
isinstance(obj, tuple): + return tuple(convert_to_dict(i) for i in obj) + else: + return obj + + @dataclass class QuantizationConfigMixin: """ @@ -133,7 +151,7 @@ def to_dict(self) -> Dict[str, Any]: Serializes this instance to a Python dictionary. Returns: `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. """ - return copy.deepcopy(self.__dict__) + return convert_to_dict(copy.deepcopy(self.__dict__)) def __iter__(self): """allows `dict(obj)` for situations where obj may be a dict or QuantizationConfigMixin""" From c1cb55debbd174b9208961b5951c99a9472dfebf Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Tue, 11 Jun 2024 18:21:26 +0000 Subject: [PATCH 06/38] address satrat comment --- src/transformers/utils/quantization_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index f5aa79e70e42..4ad61ddb229e 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -45,7 +45,7 @@ class QuantizationMethod(str, Enum): QUANTO = "quanto" EETQ = "eetq" HQQ = "hqq" - COMPRESSED_TENSORS = "compressed_tensors" + COMPRESSED_TENSORS = "compressed-tensors" class AWQLinearVersion(str, Enum): From ef9d3f174dfc5fe8022b71aa56c4cb13fdf3e66f Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Wed, 12 Jun 2024 14:20:01 +0000 Subject: [PATCH 07/38] compressed_tensors to compressed-tensors and revert back is_serializable --- src/transformers/quantizers/auto.py | 2 +- src/transformers/quantizers/quantizer_compressed_tensors.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/quantizers/auto.py b/src/transformers/quantizers/auto.py index 5e26ed91dc40..13b8f2bd68dd 100755 --- a/src/transformers/quantizers/auto.py +++ b/src/transformers/quantizers/auto.py @@ -47,7 +47,7 @@ "quanto": QuantoHfQuantizer, "eetq": EetqHfQuantizer, "hqq": HqqHfQuantizer, - "compressed_tensors": CompressedTensorsHfQuantizer, + "compressed-tensors": CompressedTensorsHfQuantizer, } AUTO_QUANTIZATION_CONFIG_MAPPING = { diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py index 8493fbfd7fa3..3d0d2e009942 100644 --- a/src/transformers/quantizers/quantizer_compressed_tensors.py +++ b/src/transformers/quantizers/quantizer_compressed_tensors.py @@ -68,4 +68,4 @@ def is_trainable(self): @property def is_serializable(self): - return True + return False From 117d0504899aac6210c4ec786c3be12087c1a60a Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Wed, 12 Jun 2024 15:53:00 +0000 Subject: [PATCH 08/38] rename quant_method from sparseml to compressed-tensors --- src/transformers/utils/quantization_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 4ad61ddb229e..c9824f24e6e8 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -1078,7 +1078,7 @@ class CompressedTensorsConfig(QuantizationConfigMixin): def __init__( self, config_groups: Dict[str, Union["QuantizationScheme", List[str]]] = None, - quant_method: str = "sparseml", + quant_method: str = "compressed-tensors", format: str = "dense", # "fakequant" not in CompressionFormat quantization_status: "QuantizationStatus" = "initialized", global_compression_ratio: Optional[float] = None, From 
1901c3e51d7801bae4ef9f129b4278978a74afa6 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Wed, 12 Jun 2024 22:05:50 +0000 Subject: [PATCH 09/38] tests --- .../compressed_tensor/__init__.py | 0 .../test_compressed_tensors.py | 76 +++++++++++++++++++ 2 files changed, 76 insertions(+) create mode 100644 tests/quantization/compressed_tensor/__init__.py create mode 100644 tests/quantization/compressed_tensor/test_compressed_tensors.py diff --git a/tests/quantization/compressed_tensor/__init__.py b/tests/quantization/compressed_tensor/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/quantization/compressed_tensor/test_compressed_tensors.py b/tests/quantization/compressed_tensor/test_compressed_tensors.py new file mode 100644 index 000000000000..53c294f25cec --- /dev/null +++ b/tests/quantization/compressed_tensor/test_compressed_tensors.py @@ -0,0 +1,76 @@ +# from transformers.quantizers.quantizer_compressed_tensors import CompressedTensorsHfQuantizer +# from transformers.quantizers.quantizer_compressed_tensors import CompressedTensorsHfQuantizer + +import gc +import unittest + +import torch + +from transformers import AutoModelForCausalLM, AutoTokenizer, CompressedTensorsConfig + + +class CompressedTensorsTest(unittest.TestCase): + model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + source_quantized_model_name = "nm-testing/tinyllama-oneshot-w8a8-test-static-shape-change-v3" + + prompt = "Paris is the capital of which country?" + # [' Paris is the capital of which country?\n\nA. London\n\nB. New York\n\nC. Paris\n\nD. Tokyo\n\n4. Which country is the capital of the European Union?\n\nA. France\n'] + expected_response = "" + + def tear_down(self): + gc.collect() + torch.cuda.empty_cache() + gc.collect() + + @classmethod + def setUpClass(self): + """ + Setup quantized model + """ + self.tokenizer = AutoTokenizer.from_pretrained(self.source_quantized_model_name) + self.source_quantized_model = AutoModelForCausalLM.from_pretrained(self.source_quantized_model_name) + + self.device = self.source_quantized_model.device + compression_config = self.source_quantized_model.config.quantization_config.quantization_config.config_groups + + self.config = CompressedTensorsConfig( + config_groups=compression_config, + sparsity_config=self.source_quantized_model.config.quantization_config.sparsity_config.dict(), + ) + + self.assertIsNotNone(self.config.sparsity_config, "sparsity_config should not be None") + self.assertIsNotNone(self.config.quantization_config, "quantization_config should not be None") + + @unittest.skip("scales not populated") + def test_apply_quantization(self): + # fails bc state_dict_scale = state_dict[f"{module_name}.{scale_name}"] + # KeyError: 'model.layers.0.self_attn.q_proj.weight_scale + self.quantization_model = AutoModelForCausalLM.from_pretrained( + self.model_name, quantization_config=self.config + ) + # check that the input layers of self.source_quantized_model and self.quantization_model is the same + + def test_quantized_model(self): + # test the quantized model, not the original model + + inputs = self.tokenizer(self.prompt, return_tensors="pt").to(self.device) + generated_ids = self.source_quantized_model.generate(**inputs, max_length=50) + outputs = self.tokenizer.batch_decode(generated_ids) + + self.expected_response = outputs + self.assertEqual(outputs, self.expected_response) + self.tear_down() + + def test_forward(self): + batch_size = context_size = 1024 + tensor1 = torch.rand(1024).long() + tensor2 = torch.rand(1024).long() + + 
input_tensor = torch.cat((tensor1, tensor2), dim=0) + input_tensor = input_tensor.unsqueeze(0) + with torch.no_grad(): + out = self.source_quantized_model(input_tensor) + self.assertEqual(out.shape[0], batch_size) + self.assertEqual(out.shape[1], context_size) + + self.tear_down() From 3ca270dfb50f8365eb6c9f92c8f5ca426339c5ca Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Thu, 13 Jun 2024 18:28:14 +0000 Subject: [PATCH 10/38] edit tests --- src/transformers/__init__.py | 1 + src/transformers/utils/quantization_config.py | 2 +- .../test_compressed_tensors.py | 30 ++++++++----------- 3 files changed, 14 insertions(+), 19 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 40b7905bfdbb..e4b7a227ab94 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -916,6 +916,7 @@ "AqlmConfig", "AwqConfig", "BitsAndBytesConfig", + "CompressedTensorsConfig", "EetqConfig", "GPTQConfig", "HqqConfig", diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index c9824f24e6e8..07e88d6394c3 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -1079,7 +1079,7 @@ def __init__( self, config_groups: Dict[str, Union["QuantizationScheme", List[str]]] = None, quant_method: str = "compressed-tensors", - format: str = "dense", # "fakequant" not in CompressionFormat + format: str = "dense", quantization_status: "QuantizationStatus" = "initialized", global_compression_ratio: Optional[float] = None, ignore: Optional[List[str]] = None, diff --git a/tests/quantization/compressed_tensor/test_compressed_tensors.py b/tests/quantization/compressed_tensor/test_compressed_tensors.py index 53c294f25cec..0b368f7fd785 100644 --- a/tests/quantization/compressed_tensor/test_compressed_tensors.py +++ b/tests/quantization/compressed_tensor/test_compressed_tensors.py @@ -7,6 +7,7 @@ import torch from transformers import AutoModelForCausalLM, AutoTokenizer, CompressedTensorsConfig +from transformers.testing_utils import slow class CompressedTensorsTest(unittest.TestCase): @@ -14,8 +15,6 @@ class CompressedTensorsTest(unittest.TestCase): source_quantized_model_name = "nm-testing/tinyllama-oneshot-w8a8-test-static-shape-change-v3" prompt = "Paris is the capital of which country?" - # [' Paris is the capital of which country?\n\nA. London\n\nB. New York\n\nC. Paris\n\nD. Tokyo\n\n4. Which country is the capital of the European Union?\n\nA. 
France\n'] - expected_response = "" def tear_down(self): gc.collect() @@ -41,35 +40,30 @@ def setUpClass(self): self.assertIsNotNone(self.config.sparsity_config, "sparsity_config should not be None") self.assertIsNotNone(self.config.quantization_config, "quantization_config should not be None") - @unittest.skip("scales not populated") - def test_apply_quantization(self): - # fails bc state_dict_scale = state_dict[f"{module_name}.{scale_name}"] - # KeyError: 'model.layers.0.self_attn.q_proj.weight_scale - self.quantization_model = AutoModelForCausalLM.from_pretrained( - self.model_name, quantization_config=self.config - ) - # check that the input layers of self.source_quantized_model and self.quantization_model is the same + # apply quantization config to the base model + self.quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, quantization_config=self.config) def test_quantized_model(self): - # test the quantized model, not the original model - + """Carry out generation""" inputs = self.tokenizer(self.prompt, return_tensors="pt").to(self.device) - generated_ids = self.source_quantized_model.generate(**inputs, max_length=50) + generated_ids = self.quantized_model.generate(**inputs, max_length=50) outputs = self.tokenizer.batch_decode(generated_ids) - self.expected_response = outputs - self.assertEqual(outputs, self.expected_response) + self.assertIsNotNone(outputs) self.tear_down() + @slow def test_forward(self): batch_size = context_size = 1024 - tensor1 = torch.rand(1024).long() - tensor2 = torch.rand(1024).long() + tensor1 = torch.rand(1024) * 1000 + tensor1 = tensor1.long() + tensor2 = torch.rand(1024) * 1000 + tensor2 = tensor2.long() input_tensor = torch.cat((tensor1, tensor2), dim=0) input_tensor = input_tensor.unsqueeze(0) with torch.no_grad(): - out = self.source_quantized_model(input_tensor) + out = self.quantized_model(input_tensor) self.assertEqual(out.shape[0], batch_size) self.assertEqual(out.shape[1], context_size) From 9a14b0922ab75cb131e94a9e77ef55fd8f666770 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Fri, 28 Jun 2024 13:51:49 -0400 Subject: [PATCH 11/38] clean up tests --- .../test_compressed_tensors.py | 51 ++++++------------- 1 file changed, 16 insertions(+), 35 deletions(-) diff --git a/tests/quantization/compressed_tensor/test_compressed_tensors.py b/tests/quantization/compressed_tensor/test_compressed_tensors.py index 0b368f7fd785..f0b309a4fb79 100644 --- a/tests/quantization/compressed_tensor/test_compressed_tensors.py +++ b/tests/quantization/compressed_tensor/test_compressed_tensors.py @@ -11,8 +11,7 @@ class CompressedTensorsTest(unittest.TestCase): - model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" - source_quantized_model_name = "nm-testing/tinyllama-oneshot-w8a8-test-static-shape-change-v3" + quantized_model_name = "nm-testing/tinyllama-oneshot-w8a8-test-static-shape-change-v3" prompt = "Paris is the capital of which country?" 
@@ -26,45 +25,27 @@ def setUpClass(self): """ Setup quantized model """ - self.tokenizer = AutoTokenizer.from_pretrained(self.source_quantized_model_name) - self.source_quantized_model = AutoModelForCausalLM.from_pretrained(self.source_quantized_model_name) - - self.device = self.source_quantized_model.device - compression_config = self.source_quantized_model.config.quantization_config.quantization_config.config_groups - - self.config = CompressedTensorsConfig( - config_groups=compression_config, - sparsity_config=self.source_quantized_model.config.quantization_config.sparsity_config.dict(), - ) - - self.assertIsNotNone(self.config.sparsity_config, "sparsity_config should not be None") - self.assertIsNotNone(self.config.quantization_config, "quantization_config should not be None") - - # apply quantization config to the base model - self.quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, quantization_config=self.config) + self.tokenizer = AutoTokenizer.from_pretrained(self.quantized_model_name) + self.quantized_model = AutoModelForCausalLM.from_pretrained(self.quantized_model_name) + self.device = self.quantized_model.device def test_quantized_model(self): """Carry out generation""" + self.assertIsNotNone( + self.quantized_model.config.quantization_config, + "quantization_config should not be None", + ) + self.assertTrue( + any( + key for key, tensor + in self.quantized_model.state_dict().items() + if "scale" in key and not torch.all(tensor == 1.0) + ), + "quantized model should load a non-trivail scale into the state dict" + ) inputs = self.tokenizer(self.prompt, return_tensors="pt").to(self.device) generated_ids = self.quantized_model.generate(**inputs, max_length=50) outputs = self.tokenizer.batch_decode(generated_ids) self.assertIsNotNone(outputs) self.tear_down() - - @slow - def test_forward(self): - batch_size = context_size = 1024 - tensor1 = torch.rand(1024) * 1000 - tensor1 = tensor1.long() - tensor2 = torch.rand(1024) * 1000 - tensor2 = tensor2.long() - - input_tensor = torch.cat((tensor1, tensor2), dim=0) - input_tensor = input_tensor.unsqueeze(0) - with torch.no_grad(): - out = self.quantized_model(input_tensor) - self.assertEqual(out.shape[0], batch_size) - self.assertEqual(out.shape[1], context_size) - - self.tear_down() From ec59052d8d5f329d9ce6a29eb8f03ebf99a9290b Mon Sep 17 00:00:00 2001 From: Benjamin Date: Fri, 28 Jun 2024 13:54:56 -0400 Subject: [PATCH 12/38] make style --- .../quantization/compressed_tensor/test_compressed_tensors.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/quantization/compressed_tensor/test_compressed_tensors.py b/tests/quantization/compressed_tensor/test_compressed_tensors.py index f0b309a4fb79..3658c6dbdf6b 100644 --- a/tests/quantization/compressed_tensor/test_compressed_tensors.py +++ b/tests/quantization/compressed_tensor/test_compressed_tensors.py @@ -6,8 +6,7 @@ import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, CompressedTensorsConfig -from transformers.testing_utils import slow +from transformers import AutoModelForCausalLM, AutoTokenizer class CompressedTensorsTest(unittest.TestCase): From 520ded87cc0efddbdd871ef198d75f62e30f9181 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Fri, 28 Jun 2024 15:02:27 -0400 Subject: [PATCH 13/38] cleanup --- .../quantization/compressed_tensor/test_compressed_tensors.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/quantization/compressed_tensor/test_compressed_tensors.py 
b/tests/quantization/compressed_tensor/test_compressed_tensors.py index 3658c6dbdf6b..39ba39bd456d 100644 --- a/tests/quantization/compressed_tensor/test_compressed_tensors.py +++ b/tests/quantization/compressed_tensor/test_compressed_tensors.py @@ -1,6 +1,3 @@ -# from transformers.quantizers.quantizer_compressed_tensors import CompressedTensorsHfQuantizer -# from transformers.quantizers.quantizer_compressed_tensors import CompressedTensorsHfQuantizer - import gc import unittest From 7dec8fc833a60d7fd5bcd725ed15d2dcb5515685 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Fri, 28 Jun 2024 15:03:17 -0400 Subject: [PATCH 14/38] cleanup --- src/transformers/utils/quantization_config.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 07e88d6394c3..43a89612e97d 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -22,8 +22,6 @@ from enum import Enum from typing import Any, Dict, List, Optional, Union -from compressed_tensors.quantization.quant_config import QuantizationStatus -from compressed_tensors.quantization.quant_scheme import QuantizationScheme from packaging import version from pydantic import BaseModel From d9b3660155d5526d83706fd02a3528c382c66713 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Thu, 25 Jul 2024 16:16:03 -0400 Subject: [PATCH 15/38] add test skip for when compressed tensors is not installed --- src/transformers/testing_utils.py | 8 ++++++++ src/transformers/utils/__init__.py | 1 + src/transformers/utils/import_utils.py | 5 +++++ .../compressed_tensor/test_compressed_tensors.py | 2 ++ 4 files changed, 16 insertions(+) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index edfc9519963b..e3010ce7bdb8 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -61,6 +61,7 @@ is_av_available, is_bitsandbytes_available, is_bs4_available, + is_compressed_tensors_available, is_cv2_available, is_cython_available, is_decord_available, @@ -1117,6 +1118,13 @@ def require_quanto(test_case): return unittest.skipUnless(is_quanto_available(), "test requires quanto")(test_case) +def require_compressed_tensors(test_case): + """ + Decorator for compressed_tensors dependency + """ + return unittest.skipUnless(is_compressed_tensors_available(), "test requires compressed_tensors")(test_case) + + def require_fbgemm_gpu(test_case): """ Decorator for fbgemm_gpu dependency diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index efe473a6cded..a916ecf45845 100755 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -119,6 +119,7 @@ is_bitsandbytes_available, is_bs4_available, is_coloredlogs_available, + is_compressed_tensors_available, is_cv2_available, is_cython_available, is_datasets_available, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index c52da62c1de8..1fa6791695af 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -138,6 +138,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ # `importlib.metadata.version` doesn't work with `awq` _auto_awq_available = importlib.util.find_spec("awq") is not None _quanto_available = _is_package_available("quanto") +_compressed_tensors_available = _is_package_available("compressed_tensors") _pandas_available = _is_package_available("pandas") 
_peft_available = _is_package_available("peft") _phonemizer_available = _is_package_available("phonemizer") @@ -888,6 +889,10 @@ def is_quanto_available(): return _quanto_available +def is_compressed_tensors_available(): + return _compressed_tensors_available + + def is_auto_gptq_available(): return _auto_gptq_available diff --git a/tests/quantization/compressed_tensor/test_compressed_tensors.py b/tests/quantization/compressed_tensor/test_compressed_tensors.py index 39ba39bd456d..7913167c75f5 100644 --- a/tests/quantization/compressed_tensor/test_compressed_tensors.py +++ b/tests/quantization/compressed_tensor/test_compressed_tensors.py @@ -4,8 +4,10 @@ import torch from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.testing_utils import require_compressed_tensors +@require_compressed_tensors class CompressedTensorsTest(unittest.TestCase): quantized_model_name = "nm-testing/tinyllama-oneshot-w8a8-test-static-shape-change-v3" From e51ac5945e0755d9d60fb15c27ab7864b9aef2ba Mon Sep 17 00:00:00 2001 From: Benjamin Date: Thu, 25 Jul 2024 16:29:33 -0400 Subject: [PATCH 16/38] remove pydantic import + style --- src/transformers/utils/quantization_config.py | 38 ++++++++----------- .../test_compressed_tensors.py | 6 +-- 2 files changed, 18 insertions(+), 26 deletions(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 287d763fcf9f..38f8341b8d3c 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -18,12 +18,11 @@ import importlib.metadata import json import os -from dataclasses import asdict, dataclass, is_dataclass +from dataclasses import dataclass from enum import Enum from typing import Any, Dict, List, Optional, Union from packaging import version -from pydantic import BaseModel from ..utils import is_auto_awq_available, is_hqq_available, is_torch_available, logging @@ -70,23 +69,6 @@ class AwqBackendPackingMethod(str, Enum): LLMAWQ = "llm-awq" -def convert_to_dict(obj): - if is_dataclass(obj): - return asdict(obj) - elif isinstance(obj, BaseModel): - return obj.dict() - elif isinstance(obj, Enum): - return obj.value - elif isinstance(obj, dict): - return {k: convert_to_dict(v) for k, v in obj.items()} - elif isinstance(obj, list): - return [convert_to_dict(i) for i in obj] - elif isinstance(obj, tuple): - return tuple(convert_to_dict(i) for i in obj) - else: - return obj - - @dataclass class QuantizationConfigMixin: """ @@ -150,7 +132,7 @@ def to_dict(self) -> Dict[str, Any]: Serializes this instance to a Python dictionary. Returns: `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. 
""" - return convert_to_dict(copy.deepcopy(self.__dict__)) + return copy.deepcopy(self.__dict__) def __iter__(self): """allows `dict(obj)` for situations where obj may be a dict or QuantizationConfigMixin""" @@ -1083,10 +1065,10 @@ class CompressedTensorsConfig(QuantizationConfigMixin): def __init__( self, - config_groups: Dict[str, Union["QuantizationScheme", List[str]]] = None, + config_groups: Dict[str, Union["QuantizationScheme", List[str]]] = None, # noqa: F821 quant_method: str = "compressed-tensors", format: str = "dense", - quantization_status: "QuantizationStatus" = "initialized", + quantization_status: "QuantizationStatus" = "initialized", # noqa: F821 global_compression_ratio: Optional[float] = None, ignore: Optional[List[str]] = None, sparsity_config: Dict[str, Any] = None, @@ -1118,7 +1100,17 @@ def __init__( super().__init__(quant_method=QuantizationMethod.COMPRESSED_TENSORS) - + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. + """ + return { + "quantization_config": self.quantization_config.dict(), + "sparsity_config": self.sparsity_config.dict(), + } + + class FbgemmFp8Config(QuantizationConfigMixin): """ This is a wrapper class about all possible attributes and features that you can play with a model that has been diff --git a/tests/quantization/compressed_tensor/test_compressed_tensors.py b/tests/quantization/compressed_tensor/test_compressed_tensors.py index 7913167c75f5..52639e95d63e 100644 --- a/tests/quantization/compressed_tensor/test_compressed_tensors.py +++ b/tests/quantization/compressed_tensor/test_compressed_tensors.py @@ -35,11 +35,11 @@ def test_quantized_model(self): ) self.assertTrue( any( - key for key, tensor - in self.quantized_model.state_dict().items() + key + for key, tensor in self.quantized_model.state_dict().items() if "scale" in key and not torch.all(tensor == 1.0) ), - "quantized model should load a non-trivail scale into the state dict" + "quantized model should load a non-trivail scale into the state dict", ) inputs = self.tokenizer(self.prompt, return_tensors="pt").to(self.device) generated_ids = self.quantized_model.generate(**inputs, max_length=50) From ccb5442350d06e5b524c06c233d32684a7637fd5 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Thu, 25 Jul 2024 16:41:28 -0400 Subject: [PATCH 17/38] delay torch import in test --- .../compressed_tensor/test_compressed_tensors.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/quantization/compressed_tensor/test_compressed_tensors.py b/tests/quantization/compressed_tensor/test_compressed_tensors.py index 52639e95d63e..6b9df0c95feb 100644 --- a/tests/quantization/compressed_tensor/test_compressed_tensors.py +++ b/tests/quantization/compressed_tensor/test_compressed_tensors.py @@ -1,13 +1,17 @@ import gc import unittest -import torch - from transformers import AutoModelForCausalLM, AutoTokenizer -from transformers.testing_utils import require_compressed_tensors +from transformers.testing_utils import require_compressed_tensors, require_torch +from transformers.utils import is_torch_available + + +if is_torch_available(): + import torch @require_compressed_tensors +@require_torch class CompressedTensorsTest(unittest.TestCase): quantized_model_name = "nm-testing/tinyllama-oneshot-w8a8-test-static-shape-change-v3" From bfd9220b47be5ebf0b9545077d9bc904c9511c48 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Tue, 30 Jul 2024 
17:56:49 -0400 Subject: [PATCH 18/38] initial docs --- docs/source/en/main_classes/quantization.md | 3 ++ .../en/quantization/compressed_tensors.md | 47 +++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 docs/source/en/quantization/compressed_tensors.md diff --git a/docs/source/en/main_classes/quantization.md b/docs/source/en/main_classes/quantization.md index fc5808415cbe..0d041f2c4232 100755 --- a/docs/source/en/main_classes/quantization.md +++ b/docs/source/en/main_classes/quantization.md @@ -61,3 +61,6 @@ Learn how to quantize models in the [Quantization](../quantization) guide. [[autodoc]] FbgemmFp8Config +## CompressedTensorsConfig + +[[autodoc]] CompressedTensorsConfig diff --git a/docs/source/en/quantization/compressed_tensors.md b/docs/source/en/quantization/compressed_tensors.md new file mode 100644 index 000000000000..1019a9e0afc6 --- /dev/null +++ b/docs/source/en/quantization/compressed_tensors.md @@ -0,0 +1,47 @@ + + +# Compressed Tensors + +Compressed tensors supports the quantization of models to a variety of formats and provides an extensible +framework for adding new formats and strategies. + +Compressed models can be easily created using [llm-compressor](https://github.com/vllm-project/llm-compressor). +Alternatively models can be created indepedenty and serialized with a compressed tensors config. + +Supported formats include: + + - FP8, INT4, INT8 (for Q/DQ arbitrary precision is allowed for INT) + - Activation quantization (static) + - Dynamic per-token activation quantization + - Supports quantization of arbitrary layer types + - Targeted support or ignoring of layers by name or class + +## Installation + +```bash +pip install compressed-tensors +``` + + +## Sample Model Load +```python +from transformers import AutoModelForCausalLM +compressed_tensors_model = AutoModelForCausalLM.from_pretrained("nm-testing/tinyllama-oneshot-w4a16-group128-v3") +``` + + +## More Coming Soon! From 71a80f92f791639ea3e27060731be922703ade7d Mon Sep 17 00:00:00 2001 From: Benjamin Date: Tue, 30 Jul 2024 18:02:59 -0400 Subject: [PATCH 19/38] update main init for compressed tensors config --- src/transformers/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 8f1dc4e28ce9..fb041a9bc7ab 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -5669,6 +5669,7 @@ AqlmConfig, AwqConfig, BitsAndBytesConfig, + CompressedTensorsConfig, EetqConfig, FbgemmFp8Config, GPTQConfig, From 547f9cce9afb99fe76510cb8988ce33013ef3b5b Mon Sep 17 00:00:00 2001 From: Benjamin Date: Tue, 30 Jul 2024 19:26:24 -0400 Subject: [PATCH 20/38] make fix-copies --- src/transformers/utils/quantization_config.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 38f8341b8d3c..9b1e1e2b526d 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -1056,11 +1056,13 @@ class CompressedTensorsConfig(QuantizationConfigMixin): This is a wrapper class that handles compressed-tensors quantization config options. It is a wrapper around `compressed_tensors.QuantizationConfig` Args: - weights (`str`, *optional*, defaults to `"int8"`): - The target dtype for the weights. 
Supported value is only "int8" - modules_to_not_convert (`list`, *optional*, default to `None`): - The list of modules to not quantize, useful for quantizing models that explicitly require to have - some modules left in their original precision. + config_groups (`typing.Dict[str, typing.Union[ForwardRef('QuantizationScheme'), typing.List[str]]]`, *optional*): + quant_method (`str`, *optional*, defaults to `"compressed-tensors"`): + format (`str`, *optional*, defaults to `"dense"`): + quantization_status (`QuantizationStatus`, *optional*, defaults to `"initialized"`): + global_compression_ratio (`typing.Union[float, NoneType]`, *optional*): + ignore (`typing.Union[typing.List[str], NoneType]`, *optional*): + sparsity_config (`typing.Dict[str, typing.Any]`, *optional*): """ def __init__( From 8acbc0901b1d46f4c86acb852477b2da68e9651b Mon Sep 17 00:00:00 2001 From: Benjamin Date: Tue, 30 Jul 2024 22:19:25 -0400 Subject: [PATCH 21/38] docstring --- src/transformers/utils/quantization_config.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 9b1e1e2b526d..9e846567a5ed 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -1056,24 +1056,31 @@ class CompressedTensorsConfig(QuantizationConfigMixin): This is a wrapper class that handles compressed-tensors quantization config options. It is a wrapper around `compressed_tensors.QuantizationConfig` Args: - config_groups (`typing.Dict[str, typing.Union[ForwardRef('QuantizationScheme'), typing.List[str]]]`, *optional*): - quant_method (`str`, *optional*, defaults to `"compressed-tensors"`): - format (`str`, *optional*, defaults to `"dense"`): - quantization_status (`QuantizationStatus`, *optional*, defaults to `"initialized"`): - global_compression_ratio (`typing.Union[float, NoneType]`, *optional*): - ignore (`typing.Union[typing.List[str], NoneType]`, *optional*): - sparsity_config (`typing.Dict[str, typing.Any]`, *optional*): + config_groups (`typing.Dict[str, typing.Union[ForwardRef('QuantizationScheme'), typing.List[str]]]`, *optional*): + dictionary mapping group name to a quantization scheme definition + format (`str`, *optional*, defaults to `"dense"`): + format the model is represented as + quantization_status (`QuantizationStatus`, *optional*, defaults to `"initialized"`): + status of model in the quantization lifecycle, ie 'initialized', 'calibration', 'frozen' + global_compression_ratio (`typing.Union[float, NoneType]`, *optional*): + 0-1 float percentage of model compression + ignore (`typing.Union[typing.List[str], NoneType]`, *optional*): + layer names or types to not quantize, supports regex prefixed by 're:' + sparsity_config (`typing.Dict[str, typing.Any]`, *optional*): : + configuration for sparsity compression + quant_method (`str`, *optional*, defaults to `"compressed-tensors"`): + do not override, should be compressed-tensors """ def __init__( self, config_groups: Dict[str, Union["QuantizationScheme", List[str]]] = None, # noqa: F821 - quant_method: str = "compressed-tensors", format: str = "dense", quantization_status: "QuantizationStatus" = "initialized", # noqa: F821 global_compression_ratio: Optional[float] = None, ignore: Optional[List[str]] = None, sparsity_config: Dict[str, Any] = None, + quant_method: str = "compressed-tensors", **kwargs, ): from compressed_tensors import QuantizationConfig From 
eaa5f20be3a1b9ba961804304631b1b682b32e24 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Tue, 30 Jul 2024 22:26:59 -0400 Subject: [PATCH 22/38] remove fill_docstring --- src/transformers/utils/quantization_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 9e846567a5ed..e978119e597b 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -1066,7 +1066,7 @@ class CompressedTensorsConfig(QuantizationConfigMixin): 0-1 float percentage of model compression ignore (`typing.Union[typing.List[str], NoneType]`, *optional*): layer names or types to not quantize, supports regex prefixed by 're:' - sparsity_config (`typing.Dict[str, typing.Any]`, *optional*): : + sparsity_config (`typing.Dict[str, typing.Any]`, *optional*): configuration for sparsity compression quant_method (`str`, *optional*, defaults to `"compressed-tensors"`): do not override, should be compressed-tensors From 4ba75fbc0365885a74351d0ef90c0bab2380dfba Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Tue, 6 Aug 2024 13:12:45 -0400 Subject: [PATCH 23/38] Apply suggestions from code review Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- src/transformers/quantizers/auto.py | 4 ++-- src/transformers/quantizers/quantizer_compressed_tensors.py | 1 - .../quantization/compressed_tensor/test_compressed_tensors.py | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/transformers/quantizers/auto.py b/src/transformers/quantizers/auto.py index 5e9542b89512..aed27a1cf74d 100755 --- a/src/transformers/quantizers/auto.py +++ b/src/transformers/quantizers/auto.py @@ -49,7 +49,7 @@ "quanto": QuantoHfQuantizer, "eetq": EetqHfQuantizer, "hqq": HqqHfQuantizer, - "compressed-tensors": CompressedTensorsHfQuantizer, + "compressed_tensors": CompressedTensorsHfQuantizer, "fbgemm_fp8": FbgemmFp8HfQuantizer, } @@ -62,7 +62,7 @@ "aqlm": AqlmConfig, "quanto": QuantoConfig, "hqq": HqqConfig, - "compressed-tensors": CompressedTensorsConfig, + "compressed_tensors": CompressedTensorsConfig, "fbgemm_fp8": FbgemmFp8Config, } diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py index 3d0d2e009942..798d4ecb19bc 100644 --- a/src/transformers/quantizers/quantizer_compressed_tensors.py +++ b/src/transformers/quantizers/quantizer_compressed_tensors.py @@ -29,7 +29,6 @@ class CompressedTensorsHfQuantizer(HfQuantizer): """ requires_calibration = False - # requires_parameters_quantization = True required_packages = ["compressed_tensors"] def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): diff --git a/tests/quantization/compressed_tensor/test_compressed_tensors.py b/tests/quantization/compressed_tensor/test_compressed_tensors.py index 6b9df0c95feb..46d0221ed6e7 100644 --- a/tests/quantization/compressed_tensor/test_compressed_tensors.py +++ b/tests/quantization/compressed_tensor/test_compressed_tensors.py @@ -17,7 +17,7 @@ class CompressedTensorsTest(unittest.TestCase): prompt = "Paris is the capital of which country?" 
- def tear_down(self): + def tearDown(self): gc.collect() torch.cuda.empty_cache() gc.collect() @@ -50,4 +50,3 @@ def test_quantized_model(self): outputs = self.tokenizer.batch_decode(generated_ids) self.assertIsNotNone(outputs) - self.tear_down() From 94ea0d3cdb0890b30965ce089d777e5a274e7a5f Mon Sep 17 00:00:00 2001 From: Benjamin Date: Tue, 6 Aug 2024 13:27:53 -0400 Subject: [PATCH 24/38] review comments --- .../quantizers/quantizer_compressed_tensors.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py index 798d4ecb19bc..b4ca3f3d567c 100644 --- a/src/transformers/quantizers/quantizer_compressed_tensors.py +++ b/src/transformers/quantizers/quantizer_compressed_tensors.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from ..utils import is_torch_available, logging +from ..utils import is_compressed_tensors_available, is_torch_available, logging from ..utils.quantization_config import QuantizationConfigMixin from .base import HfQuantizer @@ -39,12 +39,18 @@ def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): self.compressor = ModelCompressor.from_compression_config(quantization_config) def validate_environment(self, *args, **kwargs): - # check torch and compressed_tensors are available, let ImportError raise otherwise - import torch # noqa - from compressed_tensors.compressors import ModelCompressor # noqa + if not is_compressed_tensors_available(): + raise ImportError( + "Using `compressed_tensors` quantized models requires the compressed-tensors library: " + "`pip install compressed-tensors`" + ) + if not is_torch_available(): + # torch already should be installed as part of compressed tensors + raise ImportError("torch is required for using compressed-tensors quantization") def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": if torch_dtype is None: + logger.info("Loading model using torch.float16 for compressed-tensors quantization") torch_dtype = torch.float16 elif torch_dtype != torch.float16: logger.info( From c48840d0bf432fdee9c2bf49e43fd519bd5b4e48 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Tue, 6 Aug 2024 13:31:14 -0400 Subject: [PATCH 25/38] review comments --- src/transformers/utils/quantization_config.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index e978119e597b..17ec85d1c22e 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -1114,9 +1114,12 @@ def to_dict(self) -> Dict[str, Any]: Serializes this instance to a Python dictionary. Returns: `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. 
""" + quantization_config = self.quantization_config.dict() if self.quantization_config is not None else None + sparsity_config = self.sparsity_config.dict() if self.sparsity_config is not None else None + return { - "quantization_config": self.quantization_config.dict(), - "sparsity_config": self.sparsity_config.dict(), + "quantization_config": quantization_config, + "sparsity_config": sparsity_config, } From 2ecf7110ce0d8dabf44adab2ef805b57a9a3ed0c Mon Sep 17 00:00:00 2001 From: Benjamin Date: Tue, 20 Aug 2024 14:05:50 -0400 Subject: [PATCH 26/38] comments - suppress warnings on state dict load, tests, fixes --- src/transformers/modeling_utils.py | 1 + src/transformers/quantizers/auto.py | 4 +- src/transformers/quantizers/base.py | 10 ++++ .../quantizer_compressed_tensors.py | 18 ++++++ src/transformers/utils/quantization_config.py | 27 +++++++++ .../test_compressed_tensors.py | 56 +++++++++++++------ 6 files changed, 97 insertions(+), 19 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 9cef7ff13edb..125dd629f4cc 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -4182,6 +4182,7 @@ def _fix_key(key): for pat in cls._keys_to_ignore_on_load_unexpected: unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] if hf_quantizer is not None: + unexpected_keys = hf_quantizer.update_unexpected_keys(model, unexpected_keys, prefix) missing_keys = hf_quantizer.update_missing_keys(model, missing_keys, prefix) # retrieve weights on meta device and put them back on CPU. diff --git a/src/transformers/quantizers/auto.py b/src/transformers/quantizers/auto.py index d5c40436b00d..1dcd87c993a2 100755 --- a/src/transformers/quantizers/auto.py +++ b/src/transformers/quantizers/auto.py @@ -51,7 +51,7 @@ "quanto": QuantoHfQuantizer, "eetq": EetqHfQuantizer, "hqq": HqqHfQuantizer, - "compressed_tensors": CompressedTensorsHfQuantizer, + "compressed-tensors": CompressedTensorsHfQuantizer, "fbgemm_fp8": FbgemmFp8HfQuantizer, "torchao": TorchAoHfQuantizer, } @@ -65,7 +65,7 @@ "aqlm": AqlmConfig, "quanto": QuantoConfig, "hqq": HqqConfig, - "compressed_tensors": CompressedTensorsConfig, + "compressed-tensors": CompressedTensorsConfig, "fbgemm_fp8": FbgemmFp8Config, "torchao": TorchAoConfig, } diff --git a/src/transformers/quantizers/base.py b/src/transformers/quantizers/base.py index 3ee28ada1bb2..81eb8ac69562 100644 --- a/src/transformers/quantizers/base.py +++ b/src/transformers/quantizers/base.py @@ -99,6 +99,16 @@ def adjust_target_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": """ return torch_dtype + def update_unexpected_keys(self, model, unexpected_keys: List[str], prefix: str) -> List[str]: + """ + Override this method if you want to adjust the `unexpected_keys`. + + Args: + unexpected_keys (`List[str]`, *optional*): + The list of unexpected keys in the state dict of the model compared to the checkpoint + """ + return unexpected_keys + def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]: """ Override this method if you want to adjust the `missing_keys`. diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py index b4ca3f3d567c..3b37d9c52989 100644 --- a/src/transformers/quantizers/quantizer_compressed_tensors.py +++ b/src/transformers/quantizers/quantizer_compressed_tensors.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +from typing import List + from ..utils import is_compressed_tensors_available, is_torch_available, logging from ..utils.quantization_config import QuantizationConfigMixin from .base import HfQuantizer @@ -58,6 +60,22 @@ def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": ) return torch_dtype + def update_unexpected_keys(self, model, unexpected_keys: List[str], prefix: str) -> List[str]: + def _is_compressed_key(key: str) -> bool: + # key names in compressed state dict that will not be present in + # a decompressed state dict + return key.endswith("weight_shape") or key.endswith("weight_packed") + + return [key for key in unexpected_keys if not _is_compressed_key(key)] + + def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]: + def _is_decompressed_key(key: str) -> bool: + # key names in decompressed state dict that will not be present in + # a compressed state dict + return key.endswith("weight") or "scale" in key or "zero_point" in key + + return [key for key in missing_keys if not _is_decompressed_key(key)] + def _process_model_before_weight_loading(self, model, **kwargs): if self.quantization_config.quantization_config is not None: from compressed_tensors.quantization import apply_quantization_config diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index a6227386cb94..6d3f953f65df 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -1100,6 +1100,7 @@ def __init__( "quantization_status": quantization_status, "global_compression_ratio": global_compression_ratio, "ignore": ignore, + **kwargs, } ) @@ -1110,6 +1111,32 @@ def __init__( super().__init__(quant_method=QuantizationMethod.COMPRESSED_TENSORS) + @classmethod + def from_dict(cls, config_dict, return_unused_kwargs=False, **kwargs): + """ + Instantiates a [`CompressedTensorsConfig`] from a Python dictionary of parameters. + Optionally unwraps any args from the nested quantization_config + + Args: + config_dict (`Dict[str, Any]`): + Dictionary that will be used to instantiate the configuration object. + return_unused_kwargs (`bool`,*optional*, defaults to `False`): + Whether or not to return a list of unused keyword arguments. Used for `from_pretrained` method in + `PreTrainedModel`. + kwargs (`Dict[str, Any]`): + Additional parameters from which to initialize the configuration object. + + Returns: + [`QuantizationConfigMixin`]: The configuration object instantiated from those parameters. + """ + if "quantization_config" in config_dict: + config_dict = dict( + sparsity_config=config_dict.get("sparsity_config"), + **config_dict["quantization_config"], + ) + + return super().from_dict(config_dict, return_unused_kwargs=return_unused_kwargs, **kwargs) + def to_dict(self) -> Dict[str, Any]: """ Serializes this instance to a Python dictionary. 
Returns: diff --git a/tests/quantization/compressed_tensor/test_compressed_tensors.py b/tests/quantization/compressed_tensor/test_compressed_tensors.py index 46d0221ed6e7..e7710b0b594e 100644 --- a/tests/quantization/compressed_tensor/test_compressed_tensors.py +++ b/tests/quantization/compressed_tensor/test_compressed_tensors.py @@ -1,7 +1,7 @@ import gc import unittest -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer, CompressedTensorsConfig from transformers.testing_utils import require_compressed_tensors, require_torch from transformers.utils import is_torch_available @@ -13,7 +13,8 @@ @require_compressed_tensors @require_torch class CompressedTensorsTest(unittest.TestCase): - quantized_model_name = "nm-testing/tinyllama-oneshot-w8a8-test-static-shape-change-v3" + tinyllama_w8a8 = "nm-testing/tinyllama-oneshot-w8a8-test-static-shape-change-v3" + llama3_8b_fp8 = "nm-testing/Meta-Llama-3-8B-Instruct-fp8-hf_compat" prompt = "Paris is the capital of which country?" @@ -22,31 +23,52 @@ def tearDown(self): torch.cuda.empty_cache() gc.collect() - @classmethod - def setUpClass(self): - """ - Setup quantized model - """ - self.tokenizer = AutoTokenizer.from_pretrained(self.quantized_model_name) - self.quantized_model = AutoModelForCausalLM.from_pretrained(self.quantized_model_name) - self.device = self.quantized_model.device + def test_config_args(self): + with self.assertRaises(ValueError): + # passing quant scheme directly is not allowed + CompressedTensorsConfig(config_groups={"weights": {"num_bits": 8}}) + CompressedTensorsConfig( + config_groups={"FP8": ["Linear"]}, + ignore=["lm_head"], + quantization_status="frozen", + sparsity_config={"format": "dense"}, + ) + + def test_config_to_from_dict(self): + config = CompressedTensorsConfig(config_groups={"FP8": ["Linear"]}, sparsity_config={"format": "dense"}) + config_dict = config.to_dict() + config_from_dict = CompressedTensorsConfig.from_dict(config_dict) + + from compressed_tensors import QuantizationConfig, SparsityCompressionConfig + + self.assertIsInstance(config_from_dict.quantization_config, QuantizationConfig) + self.assertIsInstance(config_from_dict.sparsity_config, SparsityCompressionConfig) + + def test_tinyllama_w8a8(self): + self._test_quantized_model(self.tinyllama_w8a8) + + def test_llama_8b_fp8(self): + self._test_quantized_model(self.llama3_8b_fp8) - def test_quantized_model(self): + def _test_quantized_model(self, model_name: str): """Carry out generation""" + quantized_model = AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + device = quantized_model.device self.assertIsNotNone( - self.quantized_model.config.quantization_config, + quantized_model.config.quantization_config, "quantization_config should not be None", ) self.assertTrue( any( key - for key, tensor in self.quantized_model.state_dict().items() + for key, tensor in quantized_model.state_dict().items() if "scale" in key and not torch.all(tensor == 1.0) ), - "quantized model should load a non-trivail scale into the state dict", + "quantized model should load a non-trivial scale into the state dict", ) - inputs = self.tokenizer(self.prompt, return_tensors="pt").to(self.device) - generated_ids = self.quantized_model.generate(**inputs, max_length=50) - outputs = self.tokenizer.batch_decode(generated_ids) + inputs = tokenizer(self.prompt, return_tensors="pt").to(device) + generated_ids = quantized_model.generate(**inputs, max_length=50) + 
outputs = tokenizer.batch_decode(generated_ids) self.assertIsNotNone(outputs) From e1ae50492de7dbc447076aa16ce19b99a16a6d71 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Thu, 22 Aug 2024 15:31:05 -0400 Subject: [PATCH 27/38] bug-fix - remove unnecessary call to apply quant lifecycle --- src/transformers/quantizers/quantizer_compressed_tensors.py | 6 ------ .../compressed_tensor/test_compressed_tensors.py | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py index 3b37d9c52989..685ecc07dbe4 100644 --- a/src/transformers/quantizers/quantizer_compressed_tensors.py +++ b/src/transformers/quantizers/quantizer_compressed_tensors.py @@ -76,12 +76,6 @@ def _is_decompressed_key(key: str) -> bool: return [key for key in missing_keys if not _is_decompressed_key(key)] - def _process_model_before_weight_loading(self, model, **kwargs): - if self.quantization_config.quantization_config is not None: - from compressed_tensors.quantization import apply_quantization_config - - apply_quantization_config(model, self.quantization_config.quantization_config) - def _process_model_after_weight_loading(self, model, resolved_archive_file, **kwargs): self.compressor.decompress(model_path=resolved_archive_file, model=model) diff --git a/tests/quantization/compressed_tensor/test_compressed_tensors.py b/tests/quantization/compressed_tensor/test_compressed_tensors.py index e7710b0b594e..69f86718ad89 100644 --- a/tests/quantization/compressed_tensor/test_compressed_tensors.py +++ b/tests/quantization/compressed_tensor/test_compressed_tensors.py @@ -52,7 +52,7 @@ def test_llama_8b_fp8(self): def _test_quantized_model(self, model_name: str): """Carry out generation""" - quantized_model = AutoModelForCausalLM.from_pretrained(model_name) + quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) device = quantized_model.device self.assertIsNotNone( From ea9e927c5f6277daf2345c24f83fb1dadc5e474f Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Fri, 30 Aug 2024 21:03:46 +0000 Subject: [PATCH 28/38] run_compressed compatability --- .../quantizers/quantizer_compressed_tensors.py | 10 ++++++++-- src/transformers/utils/quantization_config.py | 4 ++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py index 685ecc07dbe4..7bde1feca7c1 100644 --- a/src/transformers/quantizers/quantizer_compressed_tensors.py +++ b/src/transformers/quantizers/quantizer_compressed_tensors.py @@ -76,8 +76,14 @@ def _is_decompressed_key(key: str) -> bool: return [key for key in missing_keys if not _is_decompressed_key(key)] - def _process_model_after_weight_loading(self, model, resolved_archive_file, **kwargs): - self.compressor.decompress(model_path=resolved_archive_file, model=model) + def _process_model_before_weight_loading(self, model, **kwargs): + from compressed_tensors.quantization import apply_quantization_config + + ct_quantization_config = self.compressor.quantization_config + apply_quantization_config(model, ct_quantization_config, run_compressed=True) + + def _process_model_after_weight_loading(self, model, **kwargs): + pass @property def is_trainable(self): diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 6d3f953f65df..abe79a86b0b6 
100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -1063,6 +1063,8 @@ class CompressedTensorsConfig(QuantizationConfigMixin): format the model is represented as quantization_status (`QuantizationStatus`, *optional*, defaults to `"initialized"`): status of model in the quantization lifecycle, ie 'initialized', 'calibration', 'frozen' + kv_cache_scheme (`typing.Union[QuantizationArgs, NoneType]`, *optional*): + specifies quantization of the kv cache. If None, kv cache is not quantized. global_compression_ratio (`typing.Union[float, NoneType]`, *optional*): 0-1 float percentage of model compression ignore (`typing.Union[typing.List[str], NoneType]`, *optional*): @@ -1078,6 +1080,7 @@ def __init__( config_groups: Dict[str, Union["QuantizationScheme", List[str]]] = None, # noqa: F821 format: str = "dense", quantization_status: "QuantizationStatus" = "initialized", # noqa: F821 + kv_cache_scheme: Optional["QuantizationArgs"] = None, # noqa: F821 global_compression_ratio: Optional[float] = None, ignore: Optional[List[str]] = None, sparsity_config: Dict[str, Any] = None, @@ -1098,6 +1101,7 @@ def __init__( "quant_method": quant_method, "format": format, "quantization_status": quantization_status, + "kv_cache_scheme": kv_cache_scheme, "global_compression_ratio": global_compression_ratio, "ignore": ignore, **kwargs, From 1c3ad5c944049bf56b25683114326810b26978f8 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Tue, 3 Sep 2024 18:33:09 +0000 Subject: [PATCH 29/38] revert changes not needed for compression --- src/transformers/modeling_utils.py | 2 +- .../quantizers/quantizer_compressed_tensors.py | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 125dd629f4cc..75ed37904d7a 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -4035,7 +4035,7 @@ def from_pretrained( dispatch_model(model, **device_map_kwargs) if hf_quantizer is not None: - hf_quantizer.postprocess_model(model, resolved_archive_file=resolved_archive_file) + hf_quantizer.postprocess_model(model) model.hf_quantizer = hf_quantizer if _adapter_model_path is not None: diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py index 7bde1feca7c1..8381f3d4165c 100644 --- a/src/transformers/quantizers/quantizer_compressed_tensors.py +++ b/src/transformers/quantizers/quantizer_compressed_tensors.py @@ -68,14 +68,6 @@ def _is_compressed_key(key: str) -> bool: return [key for key in unexpected_keys if not _is_compressed_key(key)] - def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]: - def _is_decompressed_key(key: str) -> bool: - # key names in decompressed state dict that will not be present in - # a compressed state dict - return key.endswith("weight") or "scale" in key or "zero_point" in key - - return [key for key in missing_keys if not _is_decompressed_key(key)] - def _process_model_before_weight_loading(self, model, **kwargs): from compressed_tensors.quantization import apply_quantization_config From aa1a4f978b2c8632f38ace1d82af11f62bc9193a Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Tue, 3 Sep 2024 18:36:50 +0000 Subject: [PATCH 30/38] no longer need unexpected keys fn --- src/transformers/modeling_utils.py | 1 - src/transformers/quantizers/base.py | 10 ---------- 2 files changed, 11 deletions(-) diff --git 
a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 75ed37904d7a..b92d4b447f19 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -4182,7 +4182,6 @@ def _fix_key(key): for pat in cls._keys_to_ignore_on_load_unexpected: unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] if hf_quantizer is not None: - unexpected_keys = hf_quantizer.update_unexpected_keys(model, unexpected_keys, prefix) missing_keys = hf_quantizer.update_missing_keys(model, missing_keys, prefix) # retrieve weights on meta device and put them back on CPU. diff --git a/src/transformers/quantizers/base.py b/src/transformers/quantizers/base.py index 81eb8ac69562..3ee28ada1bb2 100644 --- a/src/transformers/quantizers/base.py +++ b/src/transformers/quantizers/base.py @@ -99,16 +99,6 @@ def adjust_target_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": """ return torch_dtype - def update_unexpected_keys(self, model, unexpected_keys: List[str], prefix: str) -> List[str]: - """ - Override this method if you want to adjust the `unexpected_keys`. - - Args: - unexpected_keys (`List[str]`, *optional*): - The list of unexpected keys in the state dict of the model compared to the checkpoint - """ - return unexpected_keys - def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]: """ Override this method if you want to adjust the `missing_keys`. From 81a13dd7ec61e782e4984f5ae555355efdb4d83c Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Tue, 3 Sep 2024 18:42:04 +0000 Subject: [PATCH 31/38] unexpected keys not needed either --- .../quantizers/quantizer_compressed_tensors.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py index 8381f3d4165c..6d18bcf220c7 100644 --- a/src/transformers/quantizers/quantizer_compressed_tensors.py +++ b/src/transformers/quantizers/quantizer_compressed_tensors.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List from ..utils import is_compressed_tensors_available, is_torch_available, logging from ..utils.quantization_config import QuantizationConfigMixin @@ -60,14 +59,6 @@ def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": ) return torch_dtype - def update_unexpected_keys(self, model, unexpected_keys: List[str], prefix: str) -> List[str]: - def _is_compressed_key(key: str) -> bool: - # key names in compressed state dict that will not be present in - # a decompressed state dict - return key.endswith("weight_shape") or key.endswith("weight_packed") - - return [key for key in unexpected_keys if not _is_compressed_key(key)] - def _process_model_before_weight_loading(self, model, **kwargs): from compressed_tensors.quantization import apply_quantization_config From f53d7b990100bbc7b86dc4a497ff2037d260245f Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Mon, 9 Sep 2024 15:20:47 -0400 Subject: [PATCH 32/38] Apply suggestions from code review Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- src/transformers/quantizers/quantizer_compressed_tensors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py index 6d18bcf220c7..5531838e568a 100644 --- a/src/transformers/quantizers/quantizer_compressed_tensors.py +++ b/src/transformers/quantizers/quantizer_compressed_tensors.py @@ -29,7 +29,7 @@ class CompressedTensorsHfQuantizer(HfQuantizer): quantized state with compressed_tensors """ - requires_calibration = False + requires_calibration = True required_packages = ["compressed_tensors"] def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): From d8f7073c5c34f2e315de4a0d3edb976b5942a7b8 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Mon, 9 Sep 2024 19:22:23 +0000 Subject: [PATCH 33/38] add to_diff_dict --- src/transformers/utils/quantization_config.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index abe79a86b0b6..d97b5d0d332f 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -1154,6 +1154,27 @@ def to_dict(self) -> Dict[str, Any]: "sparsity_config": sparsity_config, } + def to_diff_dict(self) -> Dict[str, Any]: + """ + Removes all attributes from config which correspond to the default config attributes for better readability and + serializes to a Python dictionary. 
+ Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, + """ + config_dict = self.to_dict() + + # get the default config dict + default_config_dict = CompressedTensorsConfig().to_dict() + + serializable_config_dict = {} + + # only serialize values that differ from the default config + for key, value in config_dict.items(): + if value != default_config_dict[key]: + serializable_config_dict[key] = value + + return serializable_config_dict + class FbgemmFp8Config(QuantizationConfigMixin): """ From c4fbf70fde2ba09bfbc4d15d3557d10b52208d6e Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Wed, 11 Sep 2024 17:31:41 +0000 Subject: [PATCH 34/38] update docs and expand testing --- .../en/quantization/compressed_tensors.md | 4 ++++ docs/source/en/quantization/overview.md | 1 + .../test_compressed_tensors.py | 23 +++++++++++++++---- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/docs/source/en/quantization/compressed_tensors.md b/docs/source/en/quantization/compressed_tensors.md index 1019a9e0afc6..f4eeae475604 100644 --- a/docs/source/en/quantization/compressed_tensors.md +++ b/docs/source/en/quantization/compressed_tensors.md @@ -38,6 +38,10 @@ pip install compressed-tensors ## Sample Model Load +Quantized models can be easily loaded for inference as shown below. Only models that +have already been quantized can be loaded. To quantize a model into the compressed-tensors +format see [llm-compressor](https://github.com/vllm-project/llm-compressor). + ```python from transformers import AutoModelForCausalLM compressed_tensors_model = AutoModelForCausalLM.from_pretrained("nm-testing/tinyllama-oneshot-w4a16-group128-v3") diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index 9eb74793a127..4a42532a8554 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -50,6 +50,7 @@ Use the table below to help you decide which quantization method to use. | [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM | | [AWQ](./awq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ | | [bitsandbytes](./bitsandbytes) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/TimDettmers/bitsandbytes | +| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors | | [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | ? 
| 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ | | GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp | | [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | diff --git a/tests/quantization/compressed_tensor/test_compressed_tensors.py b/tests/quantization/compressed_tensor/test_compressed_tensors.py index 69f86718ad89..cbcf492f7c97 100644 --- a/tests/quantization/compressed_tensor/test_compressed_tensors.py +++ b/tests/quantization/compressed_tensor/test_compressed_tensors.py @@ -13,7 +13,9 @@ @require_compressed_tensors @require_torch class CompressedTensorsTest(unittest.TestCase): - tinyllama_w8a8 = "nm-testing/tinyllama-oneshot-w8a8-test-static-shape-change-v3" + tinyllama_w8a16 = "nm-testing/tinyllama-w8a16-dense-hf-quantizer" + tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed-hf-quantizer" + tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer" llama3_8b_fp8 = "nm-testing/Meta-Llama-3-8B-Instruct-fp8-hf_compat" prompt = "Paris is the capital of which country?" @@ -45,12 +47,22 @@ def test_config_to_from_dict(self): self.assertIsInstance(config_from_dict.sparsity_config, SparsityCompressionConfig) def test_tinyllama_w8a8(self): - self._test_quantized_model(self.tinyllama_w8a8) + expected_out = " Paris is the capital of which country?\n\n**A) Paris**\n\n**Q** ** Paris is the capital of which country?\n\n**A) Paris**\n\n**Q** ** Paris is the capital of which country" + self._test_quantized_model(self.tinyllama_w8a8, expected_out) + + def test_tinyllama_w4a16(self): + expected_out = " Paris is the capital of which country?\nAnswer: Paris is the capital of France.\nQuestion: Which country is the capital of which city?\nAnswer: The capital of the city of New York is New York.\nQuestion: Which" + self._test_quantized_model(self.tinyllama_w4a16, expected_out) + + def test_tinyllama_w8a16(self): + expected_out = " Paris is the capital of which country?\nA. France\nB. Germany\nC. Spain\nD. Italy\nE. Switzerland\nQ10. Which of the following is not a country in the European Union?\nA." + self._test_quantized_model(self.tinyllama_w8a16, expected_out) def test_llama_8b_fp8(self): - self._test_quantized_model(self.llama3_8b_fp8) + expected_out = "<|begin_of_text|>Paris is the capital of which country? France\nWhat is the name of the famous art museum in Paris? The Louvre\nWhat is the name of the famous opera house in Paris? 
Palais Garnier\nWhat is the name of the" + self._test_quantized_model(self.llama3_8b_fp8, expected_out) - def _test_quantized_model(self, model_name: str): + def _test_quantized_model(self, model_name: str, expected_output: str): """Carry out generation""" quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -68,7 +80,8 @@ def _test_quantized_model(self, model_name: str): "quantized model should load a non-trivial scale into the state dict", ) inputs = tokenizer(self.prompt, return_tensors="pt").to(device) - generated_ids = quantized_model.generate(**inputs, max_length=50) + generated_ids = quantized_model.generate(**inputs, max_length=50, do_sample=False) outputs = tokenizer.batch_decode(generated_ids) self.assertIsNotNone(outputs) + self.assertEqual(outputs[0], expected_output) From 298a6387bd4048e8b7615bd3998dab1259b4814d Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Wed, 18 Sep 2024 08:46:22 -0400 Subject: [PATCH 35/38] Update _toctree.yml with compressed-tensors --- docs/source/en/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 7eff2a383026..35e554e96577 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -173,6 +173,8 @@ title: Optimum - local: quantization/torchao title: TorchAO + - local: quantization/compressed_tensors + title: compressed-tensors - local: quantization/contribute title: Contribute new quantization method title: Quantization Methods From 3cb44153cee45a7a7313c863c174b3b46ebca9c6 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Sun, 22 Sep 2024 20:16:19 -0400 Subject: [PATCH 36/38] Update src/transformers/utils/quantization_config.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/utils/quantization_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index d97b5d0d332f..23a983af7420 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -1176,6 +1176,7 @@ def to_diff_dict(self) -> Dict[str, Any]: return serializable_config_dict +@dataclass class FbgemmFp8Config(QuantizationConfigMixin): """ This is a wrapper class about all possible attributes and features that you can play with a model that has been From 64f475adc665c54c59b8feb05a52635e65460a67 Mon Sep 17 00:00:00 2001 From: Dipika Date: Tue, 24 Sep 2024 18:50:51 +0000 Subject: [PATCH 37/38] update doc --- .../en/quantization/compressed_tensors.md | 205 ++++++++++++++++-- 1 file changed, 190 insertions(+), 15 deletions(-) diff --git a/docs/source/en/quantization/compressed_tensors.md b/docs/source/en/quantization/compressed_tensors.md index f4eeae475604..5518a081e842 100644 --- a/docs/source/en/quantization/compressed_tensors.md +++ b/docs/source/en/quantization/compressed_tensors.md @@ -13,39 +13,214 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> - # Compressed Tensors -Compressed tensors supports the quantization of models to a variety of formats and provides an extensible -framework for adding new formats and strategies. +The [`compressed-tensors`](https://github.com/neuralmagic/compressed-tensors) library provides a versatile and efficient way to store and manage compressed model checkpoints. 
This library supports various quantization and sparsity schemes, making it a unified format for handling different model optimizations like GPTQ, AWQ, SmoothQuant, INT8, FP8, SparseGPT, and more.
+
+Some of the supported formats include:
+1. `dense`
+2. `int-quantized`: INT8 quantized models
+    - sample [model/config](https://huggingface.co/nm-testing/tinyllama-w8a8-compressed-hf-quantizer)
+3. `float-quantized`: FP8 quantized models; currently supports E4M3
+    - sample [model/config](https://huggingface.co/nm-testing/Meta-Llama-3-8B-Instruct-fp8-hf_compat/tree/main)
+4. `pack-quantized`: INT4 or INT8 weight-quantized models, packed into INT32. For INT4, the weights have an INT4 range but are stored as INT8 and then packed into INT32.
+    - sample [model/config](https://huggingface.co/nm-testing/tinyllama-w4a16-compressed-hf-quantizer)

Compressed models can be easily created using [llm-compressor](https://github.com/vllm-project/llm-compressor).
Alternatively, models can be created independently and serialized with a compressed-tensors config.
-Supported formats include:
+To find existing models on the Hugging Face Model Hub, search for the [`compressed-tensors` tag](https://huggingface.co/models?other=compressed-tensors).

- - FP8, INT4, INT8 (for Q/DQ arbitrary precision is allowed for INT)
- - Activation quantization (static)
- - Dynamic per-token activation quantization
- - Supports quantization of arbitrary layer types
- - Targeted support or ignoring of layers by name or class
+#### Features:
+ - Weight and activation precisions: FP8, INT4, INT8 (for Q/DQ arbitrary precision is allowed for INT)
+ - Quantization scales and zero-points strategies: [tensor, channel, group, block, token](https://github.com/neuralmagic/compressed-tensors/blob/83b2e7a969d70606421a76b9a3d112646077c8de/src/compressed_tensors/quantization/quant_args.py#L43-L52)
+ - Dynamic per-token activation quantization (or any static strategy)
+ - Sparsity can be composed with quantization for extreme compression
+ - Supports quantization of arbitrary modules, not just Linear modules
+ - Targeted support or ignoring of modules by name or class

## Installation

+It is recommended to install stable releases of compressed-tensors from [PyPI](https://pypi.org/project/compressed-tensors):
```bash
pip install compressed-tensors
```
+Developers who want to experiment with the latest features can also install the package from source:
+```bash
+git clone https://github.com/neuralmagic/compressed-tensors
+cd compressed-tensors
+pip install -e .
+```

-## Sample Model Load
-Quantized models can be easily loaded for inference as shown below. Only models that
-have already been quantized can be loaded. To quantize a model into the compressed-tensors
-format see [llm-compressor](https://github.com/vllm-project/llm-compressor).
+## Quickstart Model Load
+Quantized models can be easily loaded for inference as shown below. Only models that have already been quantized can be loaded at the moment. To quantize a model into the compressed-tensors format see [llm-compressor](https://github.com/vllm-project/llm-compressor).
```python
from transformers import AutoModelForCausalLM
-compressed_tensors_model = AutoModelForCausalLM.from_pretrained("nm-testing/tinyllama-oneshot-w4a16-group128-v3")
+
+# Load the model in compressed-tensors format
+ct_model = AutoModelForCausalLM.from_pretrained("nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf")
+
+# Measure memory usage
+mem_params = sum([param.nelement()*param.element_size() for param in ct_model.parameters()])
+print(f"{mem_params/2**30:.4f} GB")
+# 8.4575 GB
```
+We can see just above that the compressed-tensors FP8 checkpoint of Llama 3.1 8B can be loaded for inference using about half the memory of the unquantized reference checkpoint.
+
+## Sample Use Cases - Load and run an FP8 model
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+prompt = [
+    "Hello, my name is",
+    "The capital of France is",
+    "The future of AI is"
+]
+
+model_name = "nm-testing/Meta-Llama-3-8B-Instruct-fp8-hf_compat"
+
+quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
+tokenizer = AutoTokenizer.from_pretrained(model_name)

-## More Coming Soon!
+inputs = tokenizer(prompt, return_tensors="pt")
+generated_ids = quantized_model.generate(**inputs, max_length=50, do_sample=False)
+outputs = tokenizer.batch_decode(generated_ids)
+
+print(outputs)
+
+"""
+['<|begin_of_text|>Hello, my name is [Name]. I am a [Your Profession/Student] and I am here to learn about the [Course/Program] at [University/Institution]. I am excited to be here and I am looking forward to', '<|begin_of_text|>The capital of France is Paris, which is located in the north-central part of the country. Paris is the most populous city in France and is known for its stunning architecture, art museums, fashion, and romantic atmosphere. The city is home to', "<|begin_of_text|>The future of AI is here, and it's already changing the way we live and work. From virtual assistants to self-driving cars, AI is transforming industries and revolutionizing the way we interact with technology. But what does the future of AI hold"]
+"""
+
+```
+## Deep dive into a compressed-tensors model checkpoint
+
+In this example we will examine how the compressed-tensors model nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf is defined through its configuration entry and see how this translates to the loaded model representation.
+
+First, let us look at the [`quantization_config` of the model](https://huggingface.co/nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf/blob/main/config.json). At a glance it looks overwhelming with the number of entries, but this is because compressed-tensors is a format that allows for flexible expression both during and after model compression.
+
+In practice, for checkpoint loading and inference the configuration can be simplified to not include all the default or empty entries, so we will do that here to focus on what compression is actually being represented.
+
+```yaml
+"quantization_config": {
+  "config_groups": {
+    "group_0": {
+      "input_activations": {
+        "num_bits": 8,
+        "strategy": "tensor",
+        "type": "float"
+      },
+      "targets": ["Linear"],
+      "weights": {
+        "num_bits": 8,
+        "strategy": "tensor",
+        "type": "float"
+      }
+    }
+  },
+  "format": "naive-quantized",
+  "ignore": ["lm_head"],
+  "quant_method": "compressed-tensors",
+  "quantization_status": "frozen"
+},
+```
+
+We can see from the above configuration that it is specifying one config group that includes weight and activation quantization to FP8 with a static per-tensor strategy.
It is also worth noting that in the `ignore` list there is an entry to skip quantization of the `lm_head` module, so that module should be untouched in the checkpoint. + +To see the result of the configuration in practice, we can simply use the [safetensors viewer](https://huggingface.co/nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf?show_file_info=model.safetensors.index.json) on the model card to see the quantized weights, input_scale, and weight_scale for all of the Linear modules in the first model layer (and so on for the rest of the layers). + +| Tensors | Shape | Precision | +| ------- | ----- | --------- | +model.layers.0.input_layernorm.weight | [4 096] | BF16 +model.layers.0.mlp.down_proj.input_scale | [1] | BF16 +model.layers.0.mlp.down_proj.weight | [4 096, 14 336] | F8_E4M3 +model.layers.0.mlp.down_proj.weight_scale | [1] | BF16 +model.layers.0.mlp.gate_proj.input_scale | [1] | BF16 +model.layers.0.mlp.gate_proj.weight | [14 336, 4 096] | F8_E4M3 +model.layers.0.mlp.gate_proj.weight_scale | [1] | BF16 +model.layers.0.mlp.up_proj.input_scale| [1] |BF16 +model.layers.0.mlp.up_proj.weight | [14 336, 4 096] | F8_E4M3 +model.layers.0.mlp.up_proj.weight_scale | [1] | BF16 +model.layers.0.post_attention_layernorm.weight | [4 096] |BF16 +model.layers.0.self_attn.k_proj.input_scale | [1] | BF16 +model.layers.0.self_attn.k_proj.weight | [1 024, 4 096]| F8_E4M3 +model.layers.0.self_attn.k_proj.weight_scale |[1] | BF16 +model.layers.0.self_attn.o_proj.input_scale | [1] | BF16 +model.layers.0.self_attn.o_proj.weight | [4 096, 4 096] | F8_E4M3 +model.layers.0.self_attn.o_proj.weight_scale | [1] | BF16 +model.layers.0.self_attn.q_proj.input_scale | [1] | BF16 +model.layers.0.self_attn.q_proj.weight | [4 096, 4 096] | F8_E4M3 +model.layers.0.self_attn.q_proj.weight_scale | [1] | BF16 +model.layers.0.self_attn.v_proj.input_scale | [1] | BF16 +model.layers.0.self_attn.v_proj.weight | [1 024, 4 096] | F8_E4M3 +model.layers.0.self_attn.v_proj.weight_scale | [1] | BF16 + +When we load the model with the compressed-tensors HFQuantizer integration, we can see that all of the Linear modules that are specified within the quantization configuration have been replaced by `CompressedLinear` modules that manage the compressed weights and forward pass for inference. Note that the `lm_head` mentioned before in the ignore list is still kept as an unquantized Linear module. 
+ +```python +from transformers import AutoModelForCausalLM + +ct_model = AutoModelForCausalLM.from_pretrained("nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf") +print(ct_model) +""" +LlamaForCausalLM( + (model): LlamaModel( + (embed_tokens): Embedding(128256, 4096) + (layers): ModuleList( + (0-31): 32 x LlamaDecoderLayer( + (self_attn): LlamaSdpaAttention( + (q_proj): CompressedLinear( + in_features=4096, out_features=4096, bias=False + (input_observer): MovingAverageMinMaxObserver() + (weight_observer): MovingAverageMinMaxObserver() + ) + (k_proj): CompressedLinear( + in_features=4096, out_features=1024, bias=False + (input_observer): MovingAverageMinMaxObserver() + (weight_observer): MovingAverageMinMaxObserver() + ) + (v_proj): CompressedLinear( + in_features=4096, out_features=1024, bias=False + (input_observer): MovingAverageMinMaxObserver() + (weight_observer): MovingAverageMinMaxObserver() + ) + (o_proj): CompressedLinear( + in_features=4096, out_features=4096, bias=False + (input_observer): MovingAverageMinMaxObserver() + (weight_observer): MovingAverageMinMaxObserver() + ) + (rotary_emb): LlamaRotaryEmbedding() + ) + (mlp): LlamaMLP( + (gate_proj): CompressedLinear( + in_features=4096, out_features=14336, bias=False + (input_observer): MovingAverageMinMaxObserver() + (weight_observer): MovingAverageMinMaxObserver() + ) + (up_proj): CompressedLinear( + in_features=4096, out_features=14336, bias=False + (input_observer): MovingAverageMinMaxObserver() + (weight_observer): MovingAverageMinMaxObserver() + ) + (down_proj): CompressedLinear( + in_features=14336, out_features=4096, bias=False + (input_observer): MovingAverageMinMaxObserver() + (weight_observer): MovingAverageMinMaxObserver() + ) + (act_fn): SiLU() + ) + (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05) + (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05) + ) + ) + (norm): LlamaRMSNorm((4096,), eps=1e-05) + (rotary_emb): LlamaRotaryEmbedding() + ) + (lm_head): Linear(in_features=4096, out_features=128256, bias=False) +) +""" +``` From fabe8a31f0eae3b7edd3c9f80d97a7058d160d05 Mon Sep 17 00:00:00 2001 From: Dipika Date: Tue, 24 Sep 2024 18:56:00 +0000 Subject: [PATCH 38/38] add note about saving a loaded model --- docs/source/en/quantization/compressed_tensors.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/en/quantization/compressed_tensors.md b/docs/source/en/quantization/compressed_tensors.md index 5518a081e842..f385aae965f6 100644 --- a/docs/source/en/quantization/compressed_tensors.md +++ b/docs/source/en/quantization/compressed_tensors.md @@ -97,6 +97,10 @@ print(outputs) """ ``` + +The above shows a quick example for running generation using a `compressed-tensors` +model. Currently, once loaded the model cannot be saved. + ## Deep dive into a compressed-tensors model checkpoint In this example we will examine how the compressed-tensors model nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf is defined through its configuration entry and see how this translates to the loaded model representation.