Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
741f111
feat: Improve tensorizer S3 handling and arg passing
sangstar Jun 13, 2025
14413dd
fix: Refine `TensorizerConfig`'s `MutableMapping` implementation
sangstar Jun 13, 2025
b548d40
style: Run pre-commit
sangstar Jun 13, 2025
45c9c74
tests: Use timeout method compatible with all supported Python versions
sangstar Jun 13, 2025
c483863
tests: Add `is not None` assertion for mypy
sangstar Jun 16, 2025
e9224e5
fix: Add changes to appease `mypy`
sangstar Jun 16, 2025
62482cb
style: Re-run `yapf`
sangstar Jun 16, 2025
e0e6f4b
fix: Ensure `TensorizerConfig` becomes serializable dict
sangstar Jun 16, 2025
8d2c0c7
style: Re-run ruff
sangstar Jun 16, 2025
4a284ab
fix: Rename `tensorizer_config_is_validated`
sangstar Jun 16, 2025
3c1c349
fix: Rename and fix config validation fn
sangstar Jun 16, 2025
a534e59
chore: Fix typo in docstring
sangstar Jun 16, 2025
115225a
fix: Revert private attribute naming for some `TensorizerConfig` fields
sangstar Jun 20, 2025
0dfb3e1
fix: Additionally revert `model_class` parameter name change
sangstar Jun 20, 2025
c1160e9
chore: Apply suggested change
sangstar Jul 2, 2025
5f469d7
chore: Fix linting for previous change
sangstar Jul 2, 2025
66b7832
style: Run `isort`
sangstar Jul 3, 2025
e073d54
chore: Update `tensorizer` versions
sangstar Jul 3, 2025
cbe7bb8
chore: Update `tensorizer` version in `test.txt`
sangstar Jul 3, 2025
ab44e9d
chore: Remove semantic versioning, use pinned `2.10.1`
sangstar Jul 3, 2025
31b3633
fix: Fix `tensorizer_config` naming to `tensorizer_config_dict`
sangstar Jul 6, 2025
6b953e9
chore: Fix LoRA CLI args and docstring
sangstar Jul 7, 2025
092b045
chore: Rm backticks
sangstar Jul 7, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 83 additions & 25 deletions examples/others/tensorize_vllm_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import argparse
import dataclasses
import json
import logging
import os
import uuid

Expand All @@ -15,9 +16,13 @@
TensorizerConfig,
tensorize_lora_adapter,
tensorize_vllm_model,
tensorizer_kwargs_arg,
)
from vllm.utils import FlexibleArgumentParser

logger = logging.getLogger()


# yapf conflicts with isort for this docstring
# yapf: disable
"""
Expand Down Expand Up @@ -119,7 +124,7 @@
"""


def parse_args():
def get_parser():
parser = FlexibleArgumentParser(
description="An example script that can be used to serialize and "
"deserialize vLLM models. These models "
Expand All @@ -135,13 +140,13 @@ def parse_args():
required=False,
help="Path to a LoRA adapter to "
"serialize along with model tensors. This can then be deserialized "
"along with the model by passing a tensorizer_config kwarg to "
"LoRARequest with type TensorizerConfig. See the docstring for this "
"for a usage example."

"along with the model by instantiating a TensorizerConfig object, "
"creating a dict from it with TensorizerConfig.to_serializable(), "
"and passing it to LoRARequest's initializer with the kwarg "
"tensorizer_config_dict."
)

subparsers = parser.add_subparsers(dest='command')
subparsers = parser.add_subparsers(dest='command', required=True)

serialize_parser = subparsers.add_parser(
'serialize', help="Serialize a model to `--serialized-directory`")
Expand Down Expand Up @@ -171,6 +176,14 @@ def parse_args():
"where `suffix` is given by `--suffix` or a random UUID if not "
"provided.")

serialize_parser.add_argument(
"--serialization-kwargs",
type=tensorizer_kwargs_arg,
required=False,
help=("A JSON string containing additional keyword arguments to "
"pass to Tensorizer's TensorSerializer during "
"serialization."))

serialize_parser.add_argument(
"--keyfile",
type=str,
Expand All @@ -186,21 +199,45 @@ def parse_args():
deserialize_parser.add_argument(
"--path-to-tensors",
type=str,
required=True,
required=False,
help="The local path or S3 URI to the model tensors to deserialize. ")

deserialize_parser.add_argument(
"--serialized-directory",
type=str,
required=False,
help="Directory with model artifacts for loading. Assumes a "
"model.tensors file exists therein. Can supersede "
"--path-to-tensors.")

deserialize_parser.add_argument(
"--keyfile",
type=str,
required=False,
help=("Path to a binary key to use to decrypt the model weights,"
" if the model was serialized with encryption"))

TensorizerArgs.add_cli_args(deserialize_parser)
deserialize_parser.add_argument(
"--deserialization-kwargs",
type=tensorizer_kwargs_arg,
required=False,
help=("A JSON string containing additional keyword arguments to "
"pass to Tensorizer's `TensorDeserializer` during "
"deserialization."))

return parser.parse_args()
TensorizerArgs.add_cli_args(deserialize_parser)

return parser

def merge_extra_config_with_tensorizer_config(extra_cfg: dict,
                                              cfg: TensorizerConfig):
    """Apply --model-loader-extra-config overrides onto ``cfg`` in place.

    Args:
        extra_cfg: Mapping of attribute names to override values, as parsed
            from the --model-loader-extra-config JSON string. An empty or
            ``None`` mapping leaves ``cfg`` untouched.
        cfg: The config object to update. Only attributes it already
            exposes (as reported by ``hasattr``) are overwritten.

    Returns:
        None. ``cfg`` is mutated in place.

    Keys that do not name an existing attribute of ``cfg`` are not applied.
    A warning is logged for each one so that a misspelled option does not
    disappear silently.
    """
    for key, value in (extra_cfg or {}).items():
        if not hasattr(cfg, key):
            # Never create new attributes on the config: an unknown key is
            # most likely a typo, so surface it instead of dropping it.
            logger.warning(
                "Ignoring unrecognized key %s from "
                "--model-loader-extra-config", key
            )
            continue
        setattr(cfg, key, value)
        logger.info(
            "Updating TensorizerConfig with %s from "
            "--model-loader-extra-config provided", key
        )

def deserialize(args, tensorizer_config):
if args.lora_path:
Expand Down Expand Up @@ -230,7 +267,8 @@ def deserialize(args, tensorizer_config):
lora_request=LoRARequest("sql-lora",
1,
args.lora_path,
tensorizer_config = tensorizer_config)
tensorizer_config_dict = tensorizer_config
.to_serializable())
)
)
else:
Expand All @@ -243,7 +281,8 @@ def deserialize(args, tensorizer_config):


def main():
args = parse_args()
parser = get_parser()
args = parser.parse_args()

s3_access_key_id = (getattr(args, 's3_access_key_id', None)
or os.environ.get("S3_ACCESS_KEY_ID", None))
Expand All @@ -265,13 +304,24 @@ def main():
else:
keyfile = None

extra_config = {}
if args.model_loader_extra_config:
config = json.loads(args.model_loader_extra_config)
tensorizer_args = \
TensorizerConfig(**config)._construct_tensorizer_args()
tensorizer_args.tensorizer_uri = args.path_to_tensors
else:
tensorizer_args = None
extra_config = json.loads(args.model_loader_extra_config)


tensorizer_dir = (args.serialized_directory or
extra_config.get("tensorizer_dir"))
tensorizer_uri = (getattr(args, "path_to_tensors", None)
or extra_config.get("tensorizer_uri"))

if tensorizer_dir and tensorizer_uri:
parser.error("--serialized-directory and --path-to-tensors "
"cannot both be provided")

if not tensorizer_dir and not tensorizer_uri:
parser.error("Either --serialized-directory or --path-to-tensors "
"must be provided")


if args.command == "serialize":
eng_args_dict = {f.name: getattr(args, f.name) for f in
Expand All @@ -281,7 +331,7 @@ def main():
argparse.Namespace(**eng_args_dict)
)

input_dir = args.serialized_directory.rstrip('/')
input_dir = tensorizer_dir.rstrip('/')
suffix = args.suffix if args.suffix else uuid.uuid4().hex
base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
if engine_args.tensor_parallel_size > 1:
Expand All @@ -292,21 +342,29 @@ def main():
tensorizer_config = TensorizerConfig(
tensorizer_uri=model_path,
encryption_keyfile=keyfile,
**credentials)
serialization_kwargs=args.serialization_kwargs or {},
**credentials
)

if args.lora_path:
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
tensorize_lora_adapter(args.lora_path, tensorizer_config)

merge_extra_config_with_tensorizer_config(extra_config,
tensorizer_config)
tensorize_vllm_model(engine_args, tensorizer_config)

elif args.command == "deserialize":
if not tensorizer_args:
tensorizer_config = TensorizerConfig(
tensorizer_uri=args.path_to_tensors,
encryption_keyfile = keyfile,
**credentials
)
tensorizer_config = TensorizerConfig(
tensorizer_uri=args.path_to_tensors,
tensorizer_dir=args.serialized_directory,
encryption_keyfile=keyfile,
deserialization_kwargs=args.deserialization_kwargs or {},
**credentials
)

merge_extra_config_with_tensorizer_config(extra_config,
tensorizer_config)
deserialize(args, tensorizer_config)
else:
raise ValueError("Either serialize or deserialize must be specified.")
Expand Down
2 changes: 1 addition & 1 deletion requirements/nightly_torch_test.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# testing
pytest
tensorizer>=2.9.0
tensorizer==2.10.1
pytest-forked
pytest-asyncio
pytest-rerunfailures
Expand Down
2 changes: 1 addition & 1 deletion requirements/rocm.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ datasets
ray>=2.10.0,<2.45.0
peft
pytest-asyncio
tensorizer>=2.9.0
tensorizer==2.10.1
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
Expand Down
2 changes: 1 addition & 1 deletion requirements/test.in
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# testing
pytest
tensorizer>=2.9.0
tensorizer==2.10.1
pytest-forked
pytest-asyncio
pytest-rerunfailures
Expand Down
2 changes: 1 addition & 1 deletion requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -739,7 +739,7 @@ tenacity==9.0.0
# via
# lm-eval
# plotly
tensorizer==2.9.0
tensorizer==2.10.1
# via -r requirements/test.in
threadpoolctl==3.5.0
# via scikit-learn
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -689,7 +689,7 @@ def _read_requirements(filename: str) -> list[str]:
install_requires=get_requirements(),
extras_require={
"bench": ["pandas", "datasets"],
"tensorizer": ["tensorizer>=2.9.0"],
"tensorizer": ["tensorizer==2.10.1"],
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
"runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
"audio": ["librosa", "soundfile"], # Required for audio processing
Expand Down
18 changes: 10 additions & 8 deletions tests/entrypoints/openai/test_tensorizer_entrypoint.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import json
import os
import tempfile

import openai
Expand Down Expand Up @@ -58,18 +58,20 @@ def tensorize_model_and_lora(tmp_dir, model_uri):

@pytest.fixture(scope="module")
def server(model_uri, tensorize_model_and_lora):
model_loader_extra_config = {
"tensorizer_uri": model_uri,
}
# `os.path.dirname(model_uri)` (passed to the server as the model below)
# is a directory with a model.tensors file and all necessary model
# artifacts, particularly a HF `config.json` file. Tensorizer can
# therefore infer the `TensorizerConfig`, so --model-loader-extra-config
# can be omitted entirely.

## Start OpenAI API server
args = [
"--load-format", "tensorizer", "--device", "cuda",
"--model-loader-extra-config",
json.dumps(model_loader_extra_config), "--enable-lora"
"--load-format", "tensorizer", "--served-model-name", MODEL_NAME,
"--enable-lora"
]

with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
model_dir = os.path.dirname(model_uri)
with RemoteOpenAIServer(model_dir, args) as remote_server:
yield remote_server


Expand Down
5 changes: 3 additions & 2 deletions tests/lora/test_llama_tp.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,8 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model",
MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size",
str(tp_size), "serialize", "--serialized-directory",
str(tmp_path), "--suffix", suffix
str(tmp_path), "--suffix", suffix, "--serialization-kwargs",
'{"limit_cpu_concurrency": 4}'
],
check=True,
capture_output=True,
Expand All @@ -195,7 +196,7 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
tensor_parallel_size=2,
max_loras=2)

tensorizer_config_dict = tensorizer_config.to_dict()
tensorizer_config_dict = tensorizer_config.to_serializable()

print("lora adapter created")
assert do_sample(loaded_vllm_model,
Expand Down
Loading