4 changes: 3 additions & 1 deletion docs/source/quantization_overview.rst
@@ -5,7 +5,7 @@ First we want to lay out the torchao stack::

Quantization Algorithms/Flows: weight only/dynamic/static quantization, hqq, awq, gptq etc.
---------------------------------------------------------------------------------------------
Quantized Tensors (derived dtypes): Int4Tensor, Int4PreshuffledTensor, Float8Tensor
Quantized Tensors (derived dtypes): Int4Tensor, Int4PreshuffledTensor, Int8Tensor, Float8Tensor
---------------------------------------------------------------------------------------------
Quantization Primitive Ops/Efficient Kernels: matmul, quantize, dequantize
---------------------------------------------------------------------------------------------
@@ -88,6 +88,8 @@ So in general we structure Tensor subclasses by derived dtype and packing format
- scaled int4
- preshuffled (special format to optimize for loading)
- float8 act + int4 weight dynamic quantization and int4 weight only quantization
* - Int8Tensor
- plain

.. note::
We don't have granularity-specific tensor subclasses, i.e. no Float8RowwiseTensor or Float8BlockwiseTensor; all granularities are implemented in the same Tensor. We typically use a general `block_size` attribute to distinguish between different granularities, and each Tensor is allowed to support only a subset of all possible granularity options.
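
For example, with the new Int8Tensor the granularity is encoded entirely in `block_size` — a minimal sketch, assuming the `Int8Tensor.from_hp(weight, block_size)` entry point added in this PR:

import torch
from torchao.quantization import Int8Tensor

w = torch.randn(256, 512, dtype=torch.bfloat16)
# per-tensor granularity: a single block (and scale) spanning the whole 2D weight
w_per_tensor = Int8Tensor.from_hp(w, block_size=[w.shape[0], w.shape[1]])
# per-row granularity: one block (and scale) per output row
w_per_row = Int8Tensor.from_hp(w, block_size=[1, w.shape[-1]])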
187 changes: 187 additions & 0 deletions test/quantization/quantize_/workflows/int8/test_int8_tensor.py
@@ -0,0 +1,187 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

import copy
import unittest
from typing import Tuple

import torch
from torch.testing._internal import common_utils

from torchao.quantization import (
Int8DynamicActivationInt8WeightConfig,
Int8WeightOnlyConfig,
PerRow,
PerTensor,
quantize_,
)
from torchao.quantization.quantize_.workflows.int8.int8_tensor import Int8Tensor
from torchao.quantization.utils import compute_error
from torchao.testing.utils import TorchAOIntegrationTestCase


# TODO: Refactor after https://github.com/pytorch/ao/pull/2729 is merged
class ToyTwoLinearModel(torch.nn.Module):
def __init__(
self,
input_dim,
hidden_dim,
output_dim,
has_bias=False,
dtype=None,
device=None,
):
super().__init__()
self.dtype = dtype
self.device = device
self.linear1 = torch.nn.Linear(
input_dim, hidden_dim, bias=has_bias, dtype=dtype, device=device
)
self.linear2 = torch.nn.Linear(
hidden_dim, output_dim, bias=has_bias, dtype=dtype, device=device
)

def forward(self, x):
x = self.linear1(x)
x = self.linear2(x)
return x


@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@common_utils.instantiate_parametrized_tests
class TestInt8Tensor(TorchAOIntegrationTestCase):
Contributor:
For the tests, maybe try to follow https://github.com/pytorch/ao/blob/main/test/quantization/quantize_/workflows/int4/test_int4_marlin_sparse_tensor.py for now, and also add some tests for slicing?

def test_slice(self, granularity):
config = Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
dtype = torch.bfloat16
device = "cuda"
dummy = torch.nn.Linear(256, 256, bias=False, dtype=dtype, device=device)
dummy1 = torch.nn.Linear(256, 64, bias=False, dtype=dtype, device=device)
dummy1.weight = torch.nn.Parameter(
dummy.weight.narrow(0, 0, 64), requires_grad=False
)
dummy2 = torch.nn.Linear(128, 256, dtype=dtype, device=device)
dummy2.weight = torch.nn.Parameter(
dummy.weight.narrow(1, 0, 128), requires_grad=False
)
quantize_(dummy, config)
weight1 = dummy.weight.clone().narrow(0, 0, 64)
weight2 = dummy.weight.clone().narrow(1, 0, 128)
self.assertEqual(
weight1.qdata,
dummy.weight.qdata.narrow(0, 0, 64),
)
self.assertEqual(
weight2.qdata,
dummy.weight.qdata.narrow(1, 0, 128),
)
if isinstance(granularity, PerRow):
self.assertEqual(
weight1.scale,
dummy.weight.scale.narrow(0, 0, 64),
)
self.assertEqual(
weight2.scale,
dummy.weight.scale,
)
else:
self.assertEqual(
weight1.scale,
dummy.weight.scale,
)
self.assertEqual(
weight2.scale,
dummy.weight.scale,
)
# check for sliced weight, before and after float8 quantization
# does not differ too much
input = torch.randn(2, 256, dtype=dtype, device=device)
res_ref = dummy1(input)
dummy.weight = torch.nn.Parameter(weight1.contiguous(), requires_grad=False)
res = dummy(input)
sqnr = compute_error(res, res_ref)
self.assertTrue(sqnr > 25, f"sqnr: {sqnr}")
input = torch.randn(2, 128, dtype=dtype, device=device)
res_ref = dummy2(input)
dummy.weight = torch.nn.Parameter(weight2.contiguous(), requires_grad=False)
res = dummy(input)
sqnr = compute_error(res, res_ref)
self.assertTrue(sqnr > 15, f"sqnr: {sqnr}")
and
def test_slice_preserves_aliasing(self, granularity):

Contributor Author:
Yes, the linked unit test is helpful for the slicing (PerTensor, PerRow) tests, but I haven't implemented granularity in this PR yet, to keep the PR smaller. Can I address it after this PR?

Contributor:
I don't think the slicing tests are specific to a granularity; you should be able to adapt them for the currently supported granularity.

def setUp(self):
super().setUp()
torch.manual_seed(42)
self.weight_fp = torch.randn(4, 3, dtype=torch.bfloat16)
self.input_fp = torch.randn(4, 3, dtype=torch.bfloat16)
self.bias = torch.randn(4, dtype=torch.bfloat16)
self.block_size = [4, 3]

def test_creation_and_attributes(self):
"""Test tensor creation, dtypes, and ranges"""
tensor = Int8Tensor.from_hp(self.weight_fp, self.block_size)

self.assertEqual(tensor.shape, (4, 3))
self.assertEqual(tensor.qdata.dtype, torch.int8)
self.assertTrue(
torch.all(tensor.qdata >= -128) and torch.all(tensor.qdata <= 127)
)

@common_utils.parametrize("dtype", [torch.bfloat16, torch.float16])
@common_utils.parametrize(
"sizes",
[
((128,), 256, 128),
((32, 128), 64, 256),
],
)
@common_utils.parametrize(
"config",
[
Int8DynamicActivationInt8WeightConfig(version=2),
Int8WeightOnlyConfig(version=2),
],
)
def test_int8_linear_variants(
self,
dtype: torch.dtype,
sizes: Tuple,
config,
):
M, N, K = sizes
input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")

# Create a linear layer
m = ToyTwoLinearModel(K, N, K).eval().to(dtype).to("cuda")
m_q = copy.deepcopy(m)

# Quantize
quantize_(m_q, config)

output_original = m(input_tensor)
output_quantized = m_q(input_tensor)

error = compute_error(output_original, output_quantized)
assert error > 20, f"Quantization error is too high got a SQNR of {error}"

@unittest.skip("granularity parameter not supported in current API")
@common_utils.parametrize("granularity", [PerTensor(), PerRow()])
def test_slice_preserves_aliasing(self, granularity):
config = Int8DynamicActivationInt8WeightConfig(
granularity=granularity, version=2
)
l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16)
l.weight = torch.nn.Parameter(
torch.zeros(1024, 1024, dtype=torch.bfloat16, device="cuda")
)
quantize_(l, config)
param = l.weight
param_data = param.data
param_data = param_data.narrow(0, 0, 512)
# Making sure the aliasing is preserved in sliced quantized Tensor
assert param.data.qdata.data_ptr() == param_data.qdata.data_ptr()
assert param.data.scale.data_ptr() == param_data.scale.data_ptr()

@common_utils.parametrize(
"config",
[
Int8DynamicActivationInt8WeightConfig(version=2),
Int8WeightOnlyConfig(version=2),
],
)
@common_utils.parametrize("device", ["cpu", "cuda"])
@common_utils.parametrize("dtype", [torch.bfloat16, torch.float16])
def test_slice(self, config, device, dtype):
"""Test tensor slicing"""
dummy = torch.nn.Linear(256, 256, bias=False, dtype=dtype, device=device)
quantize_(dummy, config)

weight1 = dummy.weight.clone().narrow(0, 0, 64)
weight2 = dummy.weight.clone().narrow(1, 0, 128)

self.assertEqual(weight1.qdata, dummy.weight.qdata.narrow(0, 0, 64))
self.assertEqual(weight2.qdata, dummy.weight.qdata.narrow(1, 0, 128))
Contributor comment on lines +146 to +147:
nit: add assert for scale as well?


# Int8DynamicActivationInt8WeightConfig uses per-row (PerRow)
# Int8WeightOnlyConfig uses per-tensor (PerTensor)
if isinstance(config, Int8DynamicActivationInt8WeightConfig):
# PerRow: dim 0 slicing affects scale, dim 1 doesn't
self.assertEqual(weight1.scale, dummy.weight.scale.narrow(0, 0, 64))
self.assertEqual(weight2.scale, dummy.weight.scale)
else:
# PerTensor: scale unchanged by slicing
self.assertEqual(weight1.scale, dummy.weight.scale)
self.assertEqual(weight2.scale, dummy.weight.scale)

def test_index_select(self):
"""test that `x_0 = x[0]` works when `x` is a 2D `Int8Tensor`."""
N, K = 256, 512
x = torch.randn(N, K, device="cuda", dtype=torch.bfloat16)
x_int8 = Int8Tensor.from_hp(x, block_size=[N, K])
x_int8_0 = x_int8[0]
torch.testing.assert_close(
x_int8.dequantize()[0], x_int8_0.dequantize(), atol=0, rtol=0
)

def test_error_handling_and_dequant(self):
"""Test input validation and dequantization accuracy"""
with self.assertRaises((AssertionError, ValueError, RuntimeError)):
Int8Tensor.from_hp(torch.randn(5), [1])

with self.assertRaises((AssertionError, ValueError, RuntimeError)):
Int8Tensor.from_hp(self.weight_fp, [1])

test_data = torch.tensor([[1.0, -1.0]], dtype=torch.bfloat16)
tensor = Int8Tensor.from_hp(test_data, [1, 2])

dequantized = torch.ops.aten.dequantize.self(tensor)
self.assertEqual(dequantized.shape, test_data.shape)
self.assertLess(torch.abs(dequantized - test_data).max().item(), 0.1)


if __name__ == "__main__":
common_utils.run_tests()
2 changes: 2 additions & 0 deletions torchao/quantization/__init__.py
@@ -97,6 +97,7 @@
Int4PreshuffledTensor,
Int4Tensor,
Int4TilePackedTo4dTensor,
Int8Tensor,
IntxOpaqueTensor,
IntxUnpackedToInt8Tensor,
)
@@ -170,6 +171,7 @@
"IntxOpaqueTensor",
"IntxUnpackedToInt8Tensor",
"Int4TilePackedTo4dTensor",
"Int8Tensor",
"Float8Tensor",
"Int4OpaqueTensor",
# smooth quant - subject to change
81 changes: 53 additions & 28 deletions torchao/quantization/quant_api.py
@@ -81,6 +81,7 @@
Int4PreshuffledTensor,
Int4Tensor,
Int4TilePackedTo4dTensor,
Int8Tensor,
IntxChooseQParamsAlgorithm,
IntxOpaqueTensor,
IntxPackingFormat,
@@ -1365,10 +1366,12 @@ class Int8WeightOnlyConfig(AOBaseConfig):
Otherwise, applies per-group quantization with the specified group size.
set_inductor_config: bool = True - If True, adjusts `torchinductor` settings to recommended values
for better performance with this quantization scheme.
version - Version of the config to use. Version 1 uses AffineQuantizedTensor for quantization; version 2 uses Int8Tensor.
"""

group_size: Optional[int] = None
set_inductor_config: bool = True
version: int = 1

def __post_init__(self):
torch._C._log_api_usage_once("torchao.quantization.Int8WeightOnlyConfig")
@@ -1379,22 +1382,30 @@ def __post_init__(self):


def _int8_weight_only_quantize_tensor(weight, config):
mapping_type = MappingType.SYMMETRIC
target_dtype = torch.int8
eps = torch.finfo(torch.float32).eps
zero_point_dtype = torch.int64
group_size = config.group_size
if group_size is None:
group_size = weight.shape[-1]
block_size = tuple([1 for x in range(weight.dim() - 1)] + [group_size])
new_weight = to_affine_quantized_intx(
weight,
mapping_type,
block_size,
target_dtype,
eps=eps,
zero_point_dtype=zero_point_dtype,
)
if config.version == 1:
warnings.warn(
"Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details"
)
mapping_type = MappingType.SYMMETRIC
target_dtype = torch.int8
eps = torch.finfo(torch.float32).eps
zero_point_dtype = torch.int64
group_size = config.group_size
if group_size is None:
group_size = weight.shape[-1]
block_size = tuple([1 for x in range(weight.dim() - 1)] + [group_size])
new_weight = to_affine_quantized_intx(
weight,
mapping_type,
block_size,
target_dtype,
eps=eps,
zero_point_dtype=zero_point_dtype,
)
else:
assert config.version == 2, f"Unexpected version: {config.version}"
block_size = [weight.shape[0], weight.shape[1]]
Contributor:
This should be the same as L1393 I think; you can extract L1390-L1393 out of the first if branch and use that.

Contributor Author (@namgyu-youn, Oct 7, 2025):
Isn't dividing the logic safer, and easier for deprecating the old API in the future? Other APIs like _float8_weight_only_quant_tensor also follow this convention without a common branch.

new_weight = Int8Tensor.from_hp(weight, block_size=block_size)
return new_weight
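
A sketch of the extraction the reviewer suggests above — hoisting the version-1 block_size computation (group_size defaulting to the last dim) into a shared helper; the helper name is hypothetical, and this is not how the PR currently branches:

import torch

def _int8_weight_only_block_size(weight: torch.Tensor, group_size=None) -> list:
    # default group_size to the last dim, i.e. one quantization group per row
    if group_size is None:
        group_size = weight.shape[-1]
    return [1 for _ in range(weight.dim() - 1)] + [group_size]

w = torch.randn(128, 64, dtype=torch.bfloat16)
print(_int8_weight_only_block_size(w))      # [1, 64] -> per-row blocks
print(_int8_weight_only_block_size(w, 32))  # [1, 32] -> groups of 32 along the last dim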


@@ -1522,12 +1533,14 @@ class Int8DynamicActivationInt8WeightConfig(AOBaseConfig):
in original precision during decode operations.
set_inductor_config: bool = True - If True, adjusts `torchinductor` settings to recommended values
for better performance with this quantization scheme.
version (int): the version of the config; version 1 uses AffineQuantizedTensor, which we plan to deprecate/split, and version 2 uses Int8Tensor
"""

layout: Optional[Layout] = PlainLayout()
act_mapping_type: Optional[MappingType] = MappingType.SYMMETRIC
weight_only_decode: bool = False
set_inductor_config: bool = True
version: int = 1

def __post_init__(self):
torch._C._log_api_usage_once(
@@ -1576,18 +1589,30 @@ def get_weight_block_size(x):
input_quant_func = _int8_asymm_per_token_quant

block_size = get_weight_block_size(weight)
new_weight = to_affine_quantized_intx(
weight,
mapping_type,
block_size,
target_dtype,
eps=eps,
zero_point_dtype=zero_point_dtype,
_layout=layout,
zero_point_domain=weight_zero_point_domain,
)
new_weight = to_linear_activation_quantized(new_weight, input_quant_func)
return new_weight
if config.version == 1:
warnings.warn(
"Config Deprecation: version 1 of Int8DynamicActivationInt8WeightConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details"
)
quantized_weight = to_affine_quantized_intx(
weight,
mapping_type,
block_size,
target_dtype,
eps=eps,
zero_point_dtype=zero_point_dtype,
_layout=layout,
zero_point_domain=weight_zero_point_domain,
)
quantized_weight = to_linear_activation_quantized(
quantized_weight, input_quant_func
)
else:
quantized_weight = Int8Tensor.from_hp(
weight,
block_size,
)

return quantized_weight
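
For completeness, a minimal end-to-end sketch of the version=2 configs, mirroring the test setup above (CUDA, bfloat16); per the test comments, the dynamic-activation config quantizes weights per-row and the weight-only config per-tensor:

import torch
from torchao.quantization import (
    Int8DynamicActivationInt8WeightConfig,
    Int8WeightOnlyConfig,
    quantize_,
)

model = torch.nn.Sequential(
    torch.nn.Linear(512, 256, bias=False, dtype=torch.bfloat16, device="cuda")
)
# dynamic int8 activations + int8 weights, routed to Int8Tensor when version=2
quantize_(model, Int8DynamicActivationInt8WeightConfig(version=2))

model_wo = torch.nn.Sequential(
    torch.nn.Linear(512, 256, bias=False, dtype=torch.bfloat16, device="cuda")
)
# weight-only int8 quantization
quantize_(model_wo, Int8WeightOnlyConfig(version=2))

x = torch.randn(2, 512, dtype=torch.bfloat16, device="cuda")
y = model(x)  # linear weights are now Int8Tensor instances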


@register_quantize_module_handler(Int8DynamicActivationInt8WeightConfig)
@@ -39,7 +39,9 @@ def _choose_quant_func_and_quantize_tensor(
"""
from torchao.quantization.quantize_.workflows import (
Float8Tensor,
Int8Tensor,
QuantizeTensorToFloat8Kwargs,
QuantizeTensorToInt8Kwargs,
)

if isinstance(quant_kwargs, QuantizeTensorToFloat8Kwargs):
@@ -52,5 +54,10 @@
quant_kwargs.hp_value_ub,
quant_kwargs.kernel_preference,
)
elif isinstance(quant_kwargs, QuantizeTensorToInt8Kwargs):
return Int8Tensor.from_hp(
tensor,
quant_kwargs.block_size or [1, tensor.shape[-1]],
Contributor:
nit: why not make block_size mandatory?

)

raise NotImplementedError(f"Quant kwargs not supported: {quant_kwargs}")
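
Regarding the nit above: when block_size is not supplied, this path falls back to per-row blocks. A standalone sketch of that default (hypothetical helper; the real dispatch goes through the kwargs class defined elsewhere in this PR):

import torch
from torchao.quantization import Int8Tensor

def quantize_to_int8(tensor: torch.Tensor, block_size=None) -> Int8Tensor:
    # mirrors the fallback above: default to one block per row along the last dim
    return Int8Tensor.from_hp(tensor, block_size or [1, tensor.shape[-1]])

t = torch.randn(16, 64, dtype=torch.bfloat16)
q = quantize_to_int8(t)  # uses per-row blocks: [1, 64]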
6 changes: 6 additions & 0 deletions torchao/quantization/quantize_/workflows/__init__.py
@@ -20,6 +20,10 @@
Int4Tensor,
)
from .int4.int4_tile_packed_to_4d_tensor import Int4TilePackedTo4dTensor
from .int8.int8_tensor import (
Int8Tensor,
QuantizeTensorToInt8Kwargs,
)
from .intx.intx_choose_qparams_algorithm import IntxChooseQParamsAlgorithm
from .intx.intx_opaque_tensor import (
IntxOpaqueTensor,
Expand All @@ -37,6 +41,8 @@
"Int4MarlinSparseTensor",
"Int4PlainInt32Tensor",
"Int4TilePackedTo4dTensor",
"Int8Tensor",
"QuantizeTensorToInt8Kwargs",
"Float8Tensor",
"QuantizeTensorToFloat8Kwargs",
"Int4OpaqueTensor",