Commit f02db41

Support using Int4PreshuffledTensor after loading

Summary:

Int4PreshuffledTensor uses the fastest fbgemm int4 kernels, for both int4 weight-only and fp8 activation + int4 weight quantization, but we can't slice the tensor due to the preshuffling (and slicing has to preserve aliasing). So we serialize with Int4Tensor (plain format), which can be sliced during loading, and convert the tensor to the preshuffled format after loading using the `torchao.prototype.tensor_conversion.api.convert_to_packed_tensor_based_on_current_hardware` function.

Test Plan:

pytest tests/quantization/test_torchao.py -k test_opt_125m_int4wo_model_running_preshuffled_kernel

For the test we uploaded a plain int4 tensor checkpoint (https://huggingface.co/torchao-testing/opt-125m-Int4WeightOnlyConfig-v2-0.14.0.dev), load it in vLLM, and then check that the model has been transformed to use Int4PreshuffledTensor before inference.

Reviewers:

Subscribers:

Tasks:

Tags:

Signed-off-by: Jerry Zhang <[email protected]>
1 parent c312468 commit f02db41
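
As context for the change below, here is a minimal sketch (not part of this commit) of the round trip the summary describes: quantize to the plain, sliceable Int4Tensor format, then repack for the current hardware. It assumes torchao 0.14.0.dev+, where `quantize_` and `Int4WeightOnlyConfig` are the public quantization entry points; the `version=2` argument mirrors the "-v2-" checkpoint naming but is an assumption about the config API.

    # Sketch only, not from the commit; APIs per the assumptions above.
    import torch
    from torchao.quantization import Int4WeightOnlyConfig, quantize_
    from torchao.prototype.tensor_conversion.api import (
        convert_to_packed_tensor_based_on_current_hardware)

    linear = torch.nn.Linear(256, 512, bias=False,
                             dtype=torch.bfloat16, device="cuda")

    # Quantize to the plain Int4Tensor format; the plain layout supports
    # slicing, so vLLM's weight loader can shard it during loading.
    quantize_(linear, Int4WeightOnlyConfig(group_size=128, version=2))

    # Once loading (and any slicing) is done, repack into the fastest
    # layout for the current hardware, e.g. Int4PreshuffledTensor on
    # GPUs with fbgemm kernels.
    linear.weight = torch.nn.Parameter(
        convert_to_packed_tensor_based_on_current_hardware(linear.weight),
        requires_grad=False)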

File tree

2 files changed: +33 −0 lines changed
tests/quantization/test_torchao.py

Lines changed: 27 additions & 0 deletions

@@ -211,5 +211,32 @@ def test_reload_weights():
     # print("-" * 60)


+@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
+# @pytest.mark.skip(
+#     reason="since torchao nightly is only compatible with torch nightly "
+#     "currently (https://github.com/pytorch/ao/issues/2919), we'll have to "
+#     "skip torchao tests that require newer versions (0.14.0.dev+) for now")
+def test_opt_125m_int4wo_model_running_preshuffled_kernel(vllm_runner):
+    """Load a model with Int4Tensor (plain format) linear weights and
+    verify that the weight is updated to Int4PreshuffledTensor after
+    loading in vLLM.
+    """
+    torch._dynamo.reset()
+    model_name = ("torchao-testing/opt-125m-Int4WeightOnlyConfig-v2"
+                  "-0.14.0.dev")
+    with vllm_runner(model_name=model_name,
+                     quantization="torchao",
+                     dtype="bfloat16",
+                     pt_load_map_location="cuda:0") as llm:
+        model_runner = llm.llm_engine.model_executor.driver_worker.model_runner
+        orig_model = model_runner.model
+        print("orig model:", orig_model)
+
+        output = llm.generate_greedy(["The capital of France is"],
+                                     max_tokens=32)
+
+        assert output
+
+
 if __name__ == "__main__":
     pytest.main([__file__])

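The test prints the model and checks that greedy generation succeeds. A stricter check along the lines of the test plan could assert the weight type directly; a hedged sketch (not in the commit) follows, where the `Int4PreshuffledTensor` import path from `torchao.quantization` is an assumption:

    # Hedged sketch: assert that loading repacked the plain int4 weights.
    from torchao.quantization import Int4PreshuffledTensor

    preshuffled = [name for name, p in orig_model.named_parameters()
                   if isinstance(p, Int4PreshuffledTensor)]
    assert preshuffled, "expected int4 weights repacked to preshuffled format"
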
vllm/model_executor/layers/quantization/torchao.py

Lines changed: 6 additions & 0 deletions

@@ -260,6 +260,12 @@ def apply(

     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         if self.quant_config.is_checkpoint_torchao_serialized:
+            from torchao.prototype.tensor_conversion.api import (
+                convert_to_packed_tensor_based_on_current_hardware)
+            if hasattr(layer, "weight"):
+                layer.weight = Parameter(
+                    convert_to_packed_tensor_based_on_current_hardware(
+                        layer.weight))
             return

         # quantize the weight on the fly if the checkpoint is not already
