4 changes: 4 additions & 0 deletions src/transformers/utils/quantization_config.py
@@ -349,6 +349,8 @@ class GPTQConfig(QuantizationConfigMixin):
max_input_length (`int`, *optional*):
The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input
length. It is specific to the exllama backend with act-order.
cache_block_outputs (`bool`, *optional*, defaults to `True`):
Whether to cache block outputs so they can be reused as inputs for the succeeding block.
"""

def __init__(
@@ -369,6 +371,7 @@ def __init__(
pad_token_id: Optional[int] = None,
disable_exllama: bool = False,
max_input_length: Optional[int] = None,
cache_block_outputs: bool = True,
**kwargs,
):
self.quant_method = QuantizationMethod.GPTQ
@@ -388,6 +391,7 @@ def __init__(
self.pad_token_id = pad_token_id
self.disable_exllama = disable_exllama
self.max_input_length = max_input_length
self.cache_block_outputs = cache_block_outputs
self.post_init()

def get_loading_attributes(self):
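As a usage sketch (not part of the diff), the new flag would be passed when building a `GPTQConfig` for quantization. The model id and calibration dataset below are illustrative; only `cache_block_outputs` is the option introduced by this change.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

# Illustrative model id; any causal LM supported by GPTQ quantization would do.
model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)

quantization_config = GPTQConfig(
    bits=4,
    dataset="c4",           # calibration dataset used during quantization
    tokenizer=tokenizer,
    cache_block_outputs=False,  # the flag added in this PR; defaults to True
)

# Quantize the model at load time using the config above.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
)
```

Leaving `cache_block_outputs` at its default (`True`) keeps the existing behavior of reusing each block's outputs as the next block's inputs; setting it to `False` trades that reuse away for models where caching is undesirable.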