
Commit bc72e1c

Update deprecated parameters in Hugging Face library (#2982)
* Summary: In the `from_pretrained()` method in `huggingface/transformers`, `torch_dtype` is deprecated and `dtype` replaces it. To prevent deprecation warnings, this PR replaces `torch_dtype` with `dtype`.

  Test plan: CI

  Reference: huggingface/transformers#39782

* fix pre-commit

* revert to source: model uploader
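The change is mechanical and identical in every file below: pass `dtype` instead of the deprecated `torch_dtype` when calling `from_pretrained()`. A minimal before/after sketch, assuming a `transformers` release new enough to accept `dtype` (older releases only understand `torch_dtype`); the model id is only illustrative:

```python
import torch
from transformers import AutoModelForCausalLM

model_id = "microsoft/Phi-4-mini-instruct"  # illustrative checkpoint

# Before: emits a deprecation warning on recent transformers releases
# model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

# After: same behavior, no deprecation warning
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.bfloat16,  # replaces the deprecated torch_dtype
    device_map="auto",
)
```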
1 parent 9d88c16 commit bc72e1c

File tree

16 files changed: +25 -25 lines changed


.github/scripts/torchao_model_releases/quantize_and_upload.py

Lines changed: 1 addition & 1 deletion
@@ -592,7 +592,7 @@ def _untie_weights_and_save_locally(model_id):
 python -m executorch.examples.models.qwen3.convert_weights $(hf download {quantized_model}) pytorch_model_converted.bin
 ```
 
-Once we have the checkpoint, we export it to ExecuTorch with a max_seq_length/max_context_length of 1024 to the XNNPACK backend as follows.
+Once we have the checkpoint, we export it to ExecuTorch with a max_seq_length/max_context_length of 1024 to the XNNPACK backend as follows.
 
 [TODO: fix config path in note where necessary]
 (Note: ExecuTorch LLM export script requires config.json have certain key names. The correct config to use for the LLM export script is located at examples/models/qwen3/config/4b_config.json within the ExecuTorch repo.)

README.md

Lines changed: 1 addition & 1 deletion
@@ -159,7 +159,7 @@ quantization_config = TorchAoConfig(quant_type=Int4WeightOnlyConfig(group_size=1
 # Load and automatically quantize
 quantized_model = AutoModelForCausalLM.from_pretrained(
     "microsoft/Phi-4-mini-instruct",
-    torch_dtype="auto",
+    dtype="auto",
     device_map="auto",
     quantization_config=quantization_config
 )

benchmarks/_models/eval_hf_models.py

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ def quantize_model_and_save(model_id, quant_config, output_dir="results"):
     quantized_model = AutoModelForCausalLM.from_pretrained(
         model_id,
         device_map="auto",
-        torch_dtype=torch.bfloat16,
+        dtype=torch.bfloat16,
         quantization_config=quantization_config,
     )
     tokenizer = AutoTokenizer.from_pretrained(model_id)

docs/source/serving.rst

Lines changed: 4 additions & 4 deletions
@@ -85,7 +85,7 @@ Install the required packages:
 model = AutoModelForCausalLM.from_pretrained(
     model_path,
     device_map="auto",
-    torch_dtype="auto",
+    dtype="auto",
     trust_remote_code=True,
 )
 tokenizer = AutoTokenizer.from_pretrained(model_path)
@@ -134,7 +134,7 @@ Optionally, we can quantize the embedding and lm_head differently, since those l
 from transformers.modeling_utils import find_tied_parameters
 
 model_id = "microsoft/Phi-4-mini-instruct"
-untied_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
+untied_model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto", device_map="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 print(untied_model)
@@ -202,7 +202,7 @@ Quantizing the model for mobile deployment using TorchAO's ``Int8DynamicActivati
 quantization_config = TorchAoConfig(quant_type=quant_config, include_embedding=True, untie_embedding_weights=True, modules_to_not_convert=[])
 
 # either use `untied_model_id` or `untied_model_local_path`
-quantized_model = AutoModelForCausalLM.from_pretrained(untied_model_id, torch_dtype=torch.float32, device_map="auto", quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(untied_model_id, dtype=torch.float32, device_map="auto", quantization_config=quantization_config)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 # Push to hub
@@ -285,7 +285,7 @@ For Phi-4-mini-instruct, when quantized with float8 dynamic quant, we can reduce
 
 # use "microsoft/Phi-4-mini-instruct" or "pytorch/Phi-4-mini-instruct-float8dq"
 model_id = "pytorch/Phi-4-mini-instruct-float8dq"
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype=torch.bfloat16)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 torch.cuda.reset_peak_memory_stats()
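The last hunk above sits next to the doc's peak-memory measurement. A rough usage sketch of that measurement with the updated keyword, not taken verbatim from the repo; it assumes a CUDA device and the `pytorch/Phi-4-mini-instruct-float8dq` checkpoint mentioned in the docs, and the prompt text is illustrative:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "pytorch/Phi-4-mini-instruct-float8dq"
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

torch.cuda.reset_peak_memory_stats()

inputs = tokenizer("What are we having for dinner?", return_tensors="pt").to("cuda")
outputs = quantized_model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Peak GPU memory used during generation, in GB
print(f"peak memory: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
```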

docs/source/torchao_vllm_integration.md

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ quantization_config = TorchAoConfig(
 # Load and automatically quantize the model
 model = AutoModelForCausalLM.from_pretrained(
     "meta-llama/Llama-3.2-1B",
-    torch_dtype="auto",
+    dtype="auto",
     device_map="auto",
     quantization_config=quantization_config
 )

test/integration/test_load_and_run_checkpoint.py

Lines changed: 2 additions & 2 deletions
@@ -193,7 +193,7 @@ def test_deprecated_hf_models(self, model_info):
         with warnings.catch_warnings(record=True) as caught_warnings:
             quantized_model = AutoModelForCausalLM.from_pretrained(
                 model_name,
-                torch_dtype="bfloat16",
+                dtype="bfloat16",
                 device_map="cuda:0",
             )
             # version mismatch check in config.py
@@ -250,7 +250,7 @@ def test_deprecated_hf_models(self, model_info):
         with warnings.catch_warnings(record=True) as caught_warnings:
             _ = AutoModelForCausalLM.from_pretrained(
                 _HIGH_PRECISION_MODEL,
-                torch_dtype="bfloat16",
+                dtype="bfloat16",
                 device_map="cuda:0",
                 quantization_config=quantized_model.config.quantization_config,
             )
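Since the point of the commit is to stop triggering deprecation warnings, a check in the spirit of the `warnings.catch_warnings(record=True)` blocks above could assert that nothing mentioning `torch_dtype` was recorded. A minimal sketch, not part of the actual test file; the model id is an assumption:

```python
import warnings

import torch
from transformers import AutoModelForCausalLM

with warnings.catch_warnings(record=True) as caught_warnings:
    warnings.simplefilter("always")  # record every warning, not just the first
    _ = AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-4-mini-instruct",  # illustrative model id
        dtype=torch.bfloat16,
    )

# With `dtype` instead of the deprecated `torch_dtype`, no deprecation
# warning about torch_dtype should have been emitted.
assert not any("torch_dtype" in str(w.message) for w in caught_warnings)
```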

test/integration/test_vllm.py

Lines changed: 1 addition & 1 deletion
@@ -153,7 +153,7 @@ def quantize_and_save_model(
     # Load and quantize model
     quantized_model = AutoModelForCausalLM.from_pretrained(
         model_name,
-        torch_dtype="bfloat16",
+        dtype="bfloat16",
         device_map="cuda",
         quantization_config=quantization_config,
     )

torchao/prototype/autoround/autoround_llm.py

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ def main(args):
     # Get the model, tokenizer, and decoder_cls
     model_name_or_path = args.model_name_or_path
     model, tokenizer, decoder_cls = ar_utils.get_float_model_info(
-        model_name_or_path, torch_dtype=torch.bfloat16
+        model_name_or_path, dtype=torch.bfloat16
     )
     # Disable the `use_cache` for calibration stage.
     model.config.use_cache = False

torchao/prototype/autoround/eval_autoround.py

Lines changed: 1 addition & 1 deletion
@@ -86,7 +86,7 @@ def main(args):
     with torch.no_grad():
         model_name_or_path = args.model_name_or_path
         model, tokenizer, decoder_cls = ar_utils.get_float_model_info(
-            model_name_or_path, torch_dtype=torch.bfloat16
+            model_name_or_path, dtype=torch.bfloat16
         )
         model.eval()
         model_device = args.model_device

torchao/prototype/autoround/utils.py

Lines changed: 2 additions & 2 deletions
@@ -140,11 +140,11 @@ def _auto_detect_decoder_cls(model):
     return type(first_module)
 
 
-def get_float_model_info(model_name_or_path, torch_dtype=torch.float32):
+def get_float_model_info(model_name_or_path, dtype=torch.float32):
     import transformers
 
     model = transformers.AutoModelForCausalLM.from_pretrained(
-        model_name_or_path, torch_dtype=torch_dtype
+        model_name_or_path, dtype=dtype
     )
     tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
     decoder_cls = _auto_detect_decoder_cls(model)
