vllm-project · simon-mo · Jul 8, 2025 · Jun 13, 2025 · Jun 13, 2025 · Jun 13, 2025
diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py
@@ -4,6 +4,7 @@
 import argparse
 import dataclasses
 import json
+import logging
 import os
 import uuid
 
@@ -15,9 +16,13 @@
     TensorizerConfig,
     tensorize_lora_adapter,
     tensorize_vllm_model,
+    tensorizer_kwargs_arg,
 )
 from vllm.utils import FlexibleArgumentParser
 
+logger = logging.getLogger()
+
+
 # yapf conflicts with isort for this docstring
 # yapf: disable
 """
@@ -119,7 +124,7 @@
 """
 
 
-def parse_args():
+def get_parser():
     parser = FlexibleArgumentParser(
         description="An example script that can be used to serialize and "
         "deserialize vLLM models. These models "
@@ -135,13 +140,13 @@ def parse_args():
         required=False,
         help="Path to a LoRA adapter to "
         "serialize along with model tensors. This can then be deserialized "
-        "along with the model by passing a tensorizer_config kwarg to "
-        "LoRARequest with type TensorizerConfig. See the docstring for this "
-        "for a usage example."
-
+        "along with the model by instantiating a TensorizerConfig object, "
+        "creating a dict from it with TensorizerConfig.to_serializable(), "
+        "and passing it to LoRARequest's initializer with the kwarg "
+        "tensorizer_config_dict."
     )
 
-    subparsers = parser.add_subparsers(dest='command')
+    subparsers = parser.add_subparsers(dest='command', required=True)
 
     serialize_parser = subparsers.add_parser(
         'serialize', help="Serialize a model to `--serialized-directory`")
@@ -171,6 +176,14 @@ def parse_args():
         "where `suffix` is given by `--suffix` or a random UUID if not "
         "provided.")
 
+    serialize_parser.add_argument(
+        "--serialization-kwargs",
+        type=tensorizer_kwargs_arg,
+        required=False,
+        help=("A JSON string containing additional keyword arguments to "
+              "pass to Tensorizer's TensorSerializer during "
+              "serialization."))
+
     serialize_parser.add_argument(
         "--keyfile",
         type=str,
@@ -186,21 +199,45 @@ def parse_args():
     deserialize_parser.add_argument(
         "--path-to-tensors",
         type=str,
-        required=True,
+        required=False,
         help="The local path or S3 URI to the model tensors to deserialize. ")
 
+    deserialize_parser.add_argument(
+        "--serialized-directory",
+        type=str,
+        required=False,
+        help="Directory with model artifacts for loading. Assumes a "
+             "model.tensors file exists therein. Can supersede "
+             "--path-to-tensors.")
+
     deserialize_parser.add_argument(
         "--keyfile",
         type=str,
         required=False,
         help=("Path to a binary key to use to decrypt the model weights,"
               " if the model was serialized with encryption"))
 
-    TensorizerArgs.add_cli_args(deserialize_parser)
+    deserialize_parser.add_argument(
+        "--deserialization-kwargs",
+        type=tensorizer_kwargs_arg,
+        required=False,
+        help=("A JSON string containing additional keyword arguments to "
+              "pass to Tensorizer's `TensorDeserializer` during "
+              "deserialization."))
 
-    return parser.parse_args()
+    TensorizerArgs.add_cli_args(deserialize_parser)
 
+    return parser
 
+def merge_extra_config_with_tensorizer_config(extra_cfg: dict,
+                                              cfg: TensorizerConfig):
+    for k, v in extra_cfg.items():
+        if hasattr(cfg, k):
+            setattr(cfg, k, v)
+            logger.info(
+                "Updating TensorizerConfig with %s from "
+                "--model-loader-extra-config provided", k
+            )
 
 def deserialize(args, tensorizer_config):
     if args.lora_path:
@@ -230,7 +267,8 @@ def deserialize(args, tensorizer_config):
             lora_request=LoRARequest("sql-lora",
                                      1,
                                      args.lora_path,
-                                     tensorizer_config = tensorizer_config)
+                                     tensorizer_config_dict = tensorizer_config
+                                     .to_serializable())
             )
         )
     else:
@@ -243,7 +281,8 @@ def deserialize(args, tensorizer_config):
 
 
 def main():
-    args = parse_args()
+    parser = get_parser()
+    args = parser.parse_args()
 
     s3_access_key_id = (getattr(args, 's3_access_key_id', None)
                         or os.environ.get("S3_ACCESS_KEY_ID", None))
@@ -265,13 +304,24 @@ def main():
     else:
         keyfile = None
 
+    extra_config = {}
     if args.model_loader_extra_config:
-        config = json.loads(args.model_loader_extra_config)
-        tensorizer_args = \
-            TensorizerConfig(**config)._construct_tensorizer_args()
-        tensorizer_args.tensorizer_uri = args.path_to_tensors
-    else:
-        tensorizer_args = None
+        extra_config = json.loads(args.model_loader_extra_config)
+
+
+    tensorizer_dir = (args.serialized_directory or
+                      extra_config.get("tensorizer_dir"))
+    tensorizer_uri = (getattr(args, "path_to_tensors", None)
+                      or extra_config.get("tensorizer_uri"))
+
+    if tensorizer_dir and tensorizer_uri:
+        parser.error("--serialized-directory and --path-to-tensors "
+                     "cannot both be provided")
+
+    if not tensorizer_dir and not tensorizer_uri:
+        parser.error("Either --serialized-directory or --path-to-tensors "
+                     "must be provided")
+
 
     if args.command == "serialize":
         eng_args_dict = {f.name: getattr(args, f.name) for f in
@@ -281,7 +331,7 @@ def main():
             argparse.Namespace(**eng_args_dict)
         )
 
-        input_dir = args.serialized_directory.rstrip('/')
+        input_dir = tensorizer_dir.rstrip('/')
         suffix = args.suffix if args.suffix else uuid.uuid4().hex
         base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
         if engine_args.tensor_parallel_size > 1:
@@ -292,21 +342,29 @@ def main():
         tensorizer_config = TensorizerConfig(
             tensorizer_uri=model_path,
             encryption_keyfile=keyfile,
-            **credentials)
+            serialization_kwargs=args.serialization_kwargs or {},
+            **credentials
+        )
 
         if args.lora_path:
             tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
             tensorize_lora_adapter(args.lora_path, tensorizer_config)
 
+        merge_extra_config_with_tensorizer_config(extra_config,
+                                                  tensorizer_config)
         tensorize_vllm_model(engine_args, tensorizer_config)
 
     elif args.command == "deserialize":
-        if not tensorizer_args:
-            tensorizer_config = TensorizerConfig(
-                tensorizer_uri=args.path_to_tensors,
-                encryption_keyfile = keyfile,
-                **credentials
-            )
+        tensorizer_config = TensorizerConfig(
+            tensorizer_uri=args.path_to_tensors,
+            tensorizer_dir=args.serialized_directory,
+            encryption_keyfile=keyfile,
+            deserialization_kwargs=args.deserialization_kwargs or {},
+            **credentials
+        )
+
+        merge_extra_config_with_tensorizer_config(extra_config,
+                                                  tensorizer_config)
         deserialize(args, tensorizer_config)
     else:
         raise ValueError("Either serialize or deserialize must be specified.")

diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt
@@ -1,6 +1,6 @@
 # testing
 pytest
-tensorizer>=2.9.0
+tensorizer==2.10.1
 pytest-forked
 pytest-asyncio
 pytest-rerunfailures

diff --git a/requirements/rocm.txt b/requirements/rocm.txt
@@ -11,7 +11,7 @@ datasets
 ray>=2.10.0,<2.45.0
 peft
 pytest-asyncio
-tensorizer>=2.9.0
+tensorizer==2.10.1
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8

diff --git a/requirements/test.in b/requirements/test.in
@@ -1,6 +1,6 @@
 # testing
 pytest
-tensorizer>=2.9.0
+tensorizer==2.10.1
 pytest-forked
 pytest-asyncio
 pytest-rerunfailures

diff --git a/requirements/test.txt b/requirements/test.txt
@@ -739,7 +739,7 @@ tenacity==9.0.0
     # via
     #   lm-eval
     #   plotly
-tensorizer==2.9.0
+tensorizer==2.10.1
     # via -r requirements/test.in
 threadpoolctl==3.5.0
     # via scikit-learn

diff --git a/setup.py b/setup.py
@@ -689,7 +689,7 @@ def _read_requirements(filename: str) -> list[str]:
     install_requires=get_requirements(),
     extras_require={
         "bench": ["pandas", "datasets"],
-        "tensorizer": ["tensorizer>=2.9.0"],
+        "tensorizer": ["tensorizer==2.10.1"],
         "fastsafetensors": ["fastsafetensors >= 0.1.10"],
         "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
         "audio": ["librosa", "soundfile"],  # Required for audio processing

@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
-import json
+import os
 import tempfile
 
 import openai
@@ -58,18 +58,20 @@ def tensorize_model_and_lora(tmp_dir, model_uri):
 
 @pytest.fixture(scope="module")
 def server(model_uri, tensorize_model_and_lora):
-    model_loader_extra_config = {
-        "tensorizer_uri": model_uri,
-    }
+    # In this case, model_uri is a directory with a model.tensors
+    # file and all necessary model artifacts, particularly a
+    # HF `config.json` file. In this case, Tensorizer can infer the
+    # `TensorizerConfig` so --model-loader-extra-config can be completely
+    # omitted.
 
     ## Start OpenAI API server
     args = [
-        "--load-format", "tensorizer", "--device", "cuda",
-        "--model-loader-extra-config",
-        json.dumps(model_loader_extra_config), "--enable-lora"
+        "--load-format", "tensorizer", "--served-model-name", MODEL_NAME,
+        "--enable-lora"
     ]
 
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    model_dir = os.path.dirname(model_uri)
+    with RemoteOpenAIServer(model_dir, args) as remote_server:
         yield remote_server
 
 

@@ -169,7 +169,8 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
             f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model",
             MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size",
             str(tp_size), "serialize", "--serialized-directory",
-            str(tmp_path), "--suffix", suffix
+            str(tmp_path), "--suffix", suffix, "--serialization-kwargs",
+            '{"limit_cpu_concurrency": 4}'
         ],
                                 check=True,
                                 capture_output=True,
@@ -195,7 +196,7 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
                             tensor_parallel_size=2,
                             max_loras=2)
 
-    tensorizer_config_dict = tensorizer_config.to_dict()
+    tensorizer_config_dict = tensorizer_config.to_serializable()
 
     print("lora adapter created")
     assert do_sample(loaded_vllm_model,