
Commit f9b4bea

AlexKoff88, SunMarc, and amyeroberts authored
Added cache_block_outputs option to enable GPTQ for non-regular models (#27032)
* Added cache_block_outputs option to enable GPTQ for non-regular models

* Update src/transformers/utils/quantization_config.py
  Co-authored-by: Marc Sun <[email protected]>

* Update src/transformers/utils/quantization_config.py
  Co-authored-by: Marc Sun <[email protected]>

* Fixed style

* Update src/transformers/utils/quantization_config.py
  Co-authored-by: amyeroberts <[email protected]>

---------

Co-authored-by: Marc Sun <[email protected]>
Co-authored-by: amyeroberts <[email protected]>
1 parent 037fb7d commit f9b4bea

File tree

1 file changed: +4 additions, 0 deletions


src/transformers/utils/quantization_config.py

Lines changed: 4 additions & 0 deletions
@@ -360,6 +360,8 @@ class GPTQConfig(QuantizationConfigMixin):
         max_input_length (`int`, *optional*):
             The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input
             length. It is specific to the exllama backend with act-order.
+        cache_block_outputs (`bool`, *optional*, defaults to `True`):
+            Whether to cache block outputs to reuse as inputs for the succeeding block.
     """

     def __init__(
@@ -380,6 +382,7 @@ def __init__(
         pad_token_id: Optional[int] = None,
         disable_exllama: bool = False,
         max_input_length: Optional[int] = None,
+        cache_block_outputs: bool = True,
         **kwargs,
     ):
         self.quant_method = QuantizationMethod.GPTQ
@@ -399,6 +402,7 @@ def __init__(
         self.pad_token_id = pad_token_id
         self.disable_exllama = disable_exllama
         self.max_input_length = max_input_length
+        self.cache_block_outputs = cache_block_outputs
         self.post_init()

     def get_loading_attributes(self):
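A minimal usage sketch of the new flag, assuming the standard GPTQConfig loading path in transformers; the model id, dataset, and bit-width below are illustrative choices and not part of this commit:

from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

# Illustrative model id only; pick the model you actually want to quantize.
model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Per the new docstring, cache_block_outputs controls whether each block's
# outputs are cached and reused as inputs for the succeeding block.
# Setting it to False turns that caching off, which is the point of this
# commit for "non-regular" models where the block-to-block assumption
# does not hold.
gptq_config = GPTQConfig(
    bits=4,
    dataset="c4",
    tokenizer=tokenizer,
    cache_block_outputs=False,
)

# Quantization is triggered at load time when a GPTQConfig is passed.
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=gptq_config,
)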
