2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -192,7 +192,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
GIT_REPOSITORY https://gh-proxy.com/https://github.com/nvidia/cutlass.git
# CUTLASS 3.5.1
GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9
GIT_PROGRESS TRUE
3 changes: 3 additions & 0 deletions README.md
@@ -27,6 +27,9 @@ Easy, fast, and cheap LLM serving for everyone
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).

---
## Usage
glm-4v: [Colab Notebook](https://colab.research.google.com/drive/1jpCM0H3thZjN1XqcnpHm3S5g2Z9Pz89k?usp=sharing)

## About
vLLM is a fast and easy-to-use library for LLM inference and serving.

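For readers who skip the notebook: below is a minimal sketch of the same flow, condensed from the `tests/models/test_glm4v.py` file added later in this PR. The model name, stop-token IDs, and bitsandbytes arguments come from that test; the image path is a placeholder and hardware settings will vary.

```python
import torch
from PIL import Image
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.inputs import TokensPrompt

model_name = "THUDM/glm-4v-9b"

# 4-bit bitsandbytes quantization so the 9B model fits on a single GPU.
llm = LLM(model=model_name, trust_remote_code=True, max_model_len=8192,
          dtype=torch.bfloat16, enforce_eager=True,
          load_format='bitsandbytes', quantization='bitsandbytes')

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
image = Image.open("example.jpg").convert("RGB")   # placeholder image path

# GLM-4V's chat template inserts the image placeholder tokens and returns
# both the token IDs and the preprocessed image tensor.
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "image": image, "content": "Describe this picture."}],
    add_generation_prompt=True, tokenize=True,
    return_tensors="pt", return_dict=True)

outputs = llm.generate(
    TokensPrompt(prompt_token_ids=inputs['input_ids'][0].tolist(),
                 multi_modal_data={"image": inputs['images']}),
    sampling_params=SamplingParams(temperature=0, max_tokens=1024,
                                   stop_token_ids=[151329, 151336, 151338]))
print(outputs[0].outputs[0].text)
```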
29 changes: 29 additions & 0 deletions tests/models/=0.42.0
@@ -0,0 +1,29 @@
Looking in indexes: http://mirrors.aliyun.com/pypi/simple
Collecting bitsandbytes
Downloading http://mirrors.aliyun.com/pypi/packages/f8/1a/3cbdd70ce276085602ffe7e4f52753a41c43464053eec9e76b3dd065e4c9/bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 137.5/137.5 MB 33.3 MB/s eta 0:00:00
Requirement already satisfied: numpy in /root/miniconda3/lib/python3.10/site-packages (from bitsandbytes) (1.26.3)
Requirement already satisfied: torch in /root/miniconda3/lib/python3.10/site-packages (from bitsandbytes) (2.4.0)
Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (11.0.2.54)
Requirement already satisfied: typing-extensions>=4.8.0 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (4.12.2)
Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (12.1.0.106)
Requirement already satisfied: networkx in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (3.2.1)
Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (12.1.105)
Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (10.3.2.106)
Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (12.1.3.1)
Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (2.20.5)
Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (9.1.0.70)
Requirement already satisfied: jinja2 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (3.1.2)
Requirement already satisfied: filelock in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (3.13.1)
Requirement already satisfied: fsspec in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (2023.12.2)
Requirement already satisfied: triton==3.0.0 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (3.0.0)
Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (12.1.105)
Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (11.4.5.107)
Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (12.1.105)
Requirement already satisfied: sympy in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (1.12)
Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (12.1.105)
Requirement already satisfied: nvidia-nvjitlink-cu12 in /root/miniconda3/lib/python3.10/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch->bitsandbytes) (12.6.20)
Requirement already satisfied: MarkupSafe>=2.0 in /root/miniconda3/lib/python3.10/site-packages (from jinja2->torch->bitsandbytes) (2.1.3)
Requirement already satisfied: mpmath>=0.19 in /root/miniconda3/lib/python3.10/site-packages (from sympy->torch->bitsandbytes) (1.3.0)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.3
1 change: 1 addition & 0 deletions tests/models/authorized_keys
@@ -0,0 +1 @@
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQCknPM98UwkAhlbOCvr+W6AFjWcMT2sz4566yknaalfAlK6VJTQ6k4xzuEmY4jTnYxNwgGGoal7mpsZnUCtiR7Qtv+JAoSSinSuV75QxJDht1dwutR7Pic7qDyCdzKESCKH1Wz0AGZnCMYh8G1SPN5lPQYFEsSjNtefdNySLzuRsqbEi7Cvx3HJSJCbuOeCKJFPQFerUwgE2WAhHjlWGKHsOnbGA/WTKw1yLohAupBYcf3I2B0nRPbUWb6NXq4VRd0NsdpDpipEmzxXYUHyF6bJAG+y4CBmfhHkfPRJamebr1X68Ueyo9MiQhviB4HWXQR0/KdZz5pSPX+PKvlk5+g/BIUW7E/43Ev7RqXPROQtJrB7/UUDh2VV3p0l2Nv6sVsZg5WAIsJUoZD2qcsLdJbFRTIZ23LXofruTGAoJNBaNgObVmrRB12Fg6iJ8As3jNHGGYR9pCGi5BbWaFw58Sko1M8b+10cVYHkTWy8St2p9FM9Vn7uE5Nj8IejtdBHtN+0h+aE18FAVRloKVJP+ZfTLPnIFEansQy5gwCc+E0mIWHyUIRC+/Qh3lA54JRbaV6zGbAlZJYXljiO3SzdgEsC3WrHquKn5X7HOMtm5tOqKgwtyZlvIHZkjQH2nsPvp5O+Dr7YNfTAj2u53LdtMgXHClFMhCoIVaBEx+OjkbJjiw== kaggle_remote_ssh
110 changes: 110 additions & 0 deletions tests/models/test_glm4v.py
@@ -0,0 +1,110 @@
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# torch-2.1.0+cuda11.8-cp310-cp310-linux_aarch64.whl

import torch
from PIL import Image
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.inputs import TokensPrompt

max_model_len, tp_size = 8192, 1
model_name = "THUDM/glm-4v-9b"

llm = LLM(
    model=model_name,
    tensor_parallel_size=tp_size,
    max_model_len=max_model_len,
    trust_remote_code=True,
    dtype=torch.bfloat16,
    enforce_eager=True,
    load_format='bitsandbytes',
    quantization='bitsandbytes'
)
stop_token_ids = [151329, 151336, 151338]
sampling_params = SamplingParams(temperature=0, max_tokens=1024, stop_token_ids=stop_token_ids)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

query = 'Describe this picture.'

image = Image.open(os.path.join(os.path.dirname(__file__), "../../docs/source/assets/logos/vllm-logo-text-light.png")).convert('RGB')
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "image": image, "content": query}],
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
    return_dict=True
)

image_tensor = inputs['images']

input_ids = inputs['input_ids'][0].tolist()

outputs = llm.generate(
    TokensPrompt(**{
        "prompt_token_ids": input_ids,
        "multi_modal_data": {"image": image_tensor},
    }),
    sampling_params=sampling_params
)

print(outputs[0].outputs[0].text)


# from transformers import AutoTokenizer
# from vllm import LLM, SamplingParams

# # GLM-4-9B-Chat-1M
# # max_model_len, tp_size = 1048576, 4
# # If OOM occurs, reduce max_model_len or increase tp_size
# max_model_len, tp_size = 60000, 1
# model_name = "THUDM/glm-4-9b-chat"
# prompt = [{"role": "user", "content": "你好"}]

# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# llm = LLM(
#     model=model_name,
#     tensor_parallel_size=tp_size,
#     max_model_len=max_model_len,
#     trust_remote_code=True,
#     enforce_eager=True,
#     load_format='bitsandbytes',
#     quantization='bitsandbytes'
#     # For GLM-4-9B-Chat-1M, if OOM occurs, enable the parameters below
#     # enable_chunked_prefill=True,
#     # max_num_batched_tokens=8192
# )
# stop_token_ids = [151329, 151336, 151338]
# sampling_params = SamplingParams(temperature=0.95, max_tokens=1024, stop_token_ids=stop_token_ids)

# inputs = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
# outputs = llm.generate(prompts=inputs, sampling_params=sampling_params)

# print(outputs[0].outputs[0].text)

# from vllm import LLM, SamplingParams


# prompts = [
#     "Hello, China is a"
# ]
# sampling_params = SamplingParams(temperature=0.8, top_p=0.95)


# llm = LLM(
#     model="huggyllama/llama-7b",
#     trust_remote_code=True,
#     enforce_eager=True,
#     load_format='bitsandbytes',
#     quantization='bitsandbytes'
# )

# outputs = llm.generate(prompts, sampling_params)

# # Print the outputs.
# for output in outputs:
#     prompt = output.prompt
#     generated_text = output.outputs[0].text
#     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
24 changes: 24 additions & 0 deletions vllm/model_executor/layers/linear.py
@@ -475,6 +475,18 @@ def weight_loader(self,
                    shard_size, shard_offset = adjust_marlin_shard(
                        param, shard_size, shard_offset)

                # TODO: Double check
                # The bitsandbytes-quantized param has a packed (smaller) output
                # dim, so rescale the shard offset/size proportionally onto it.
                use_bitsandbytes = getattr(param, "use_bitsandbytes", False)
                if use_bitsandbytes:
                    total = sum(self.output_sizes)
                    orig_offset, orig_size = shard_offset, shard_size

                    quantized_total = param.data.shape[0]
                    quantized_offset = orig_offset * quantized_total // total
                    quantized_size = orig_size * quantized_total // total

                    shard_offset, shard_size = quantized_offset, quantized_size

                loaded_weight_shard = loaded_weight.narrow(
                    output_dim, shard_offset, shard_size)
                self.weight_loader(param, loaded_weight_shard, shard_id)
@@ -811,6 +823,18 @@ def weight_loader(self,
                    shard_size, shard_offset = adjust_marlin_shard(
                        param, shard_size, shard_offset)

                # TODO: Double check
                # Same rescaling as above for the fused QKV projection: map the
                # shard onto the packed bitsandbytes parameter proportionally.
                use_bitsandbytes = getattr(param, "use_bitsandbytes", False)
                if use_bitsandbytes:
                    total = (self.num_heads + 2 * self.num_kv_heads) * self.head_size
                    orig_offset, orig_size = shard_offset, shard_size

                    quantized_total = param.data.shape[0]
                    quantized_offset = orig_offset * quantized_total // total
                    quantized_size = orig_size * quantized_total // total

                    shard_offset, shard_size = quantized_offset, quantized_size

                loaded_weight_shard = loaded_weight.narrow(
                    output_dim, shard_offset, shard_size)
                self.weight_loader(param, loaded_weight_shard, shard_id)
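For clarity, here is a self-contained sketch of the rescaling that both hunks above perform: a shard's offset and size, expressed in unquantized output rows, are mapped proportionally onto the bitsandbytes-quantized parameter, whose first dimension is smaller. The helper and the numbers below are illustrative only, not part of the PR.

```python
def rescale_shard(shard_offset: int, shard_size: int,
                  total_rows: int, quantized_rows: int) -> tuple[int, int]:
    """Map a shard from the unquantized output dim onto the quantized param."""
    return (shard_offset * quantized_rows // total_rows,
            shard_size * quantized_rows // total_rows)

# Hypothetical example: a fused gate/up projection with two 13696-row shards
# whose bitsandbytes parameter has 13696 rows in total.
total = 13696 + 13696            # sum(self.output_sizes)
quantized = 13696                # param.data.shape[0]
print(rescale_shard(0, 13696, total, quantized))      # -> (0, 6848)
print(rescale_shard(13696, 13696, total, quantized))  # -> (6848, 6848)
```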
11 changes: 11 additions & 0 deletions vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -121,6 +121,12 @@ def apply(self,

        # only load the bitsandbytes module when needed
        from bitsandbytes import matmul_4bit
        original_shape = x.shape

        if len(original_shape) == 3:
            # flatten (batch, seq, hidden) to 2-D for the 4-bit matmul below
            B, L, _ = original_shape
            x = x.reshape(B * L, -1)

        original_type = x.dtype
        bf_x = x.to(torch.bfloat16)
@@ -154,4 +160,9 @@ def apply(self,
        if bias is not None:
            out += bias

        if len(original_shape) == 3:
            # restore the original (batch, seq, hidden) layout
            B, L, _ = original_shape
            out = out.reshape(B, L, -1)

        return out
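The two hunks above implement a flatten-then-restore pattern around the 4-bit matmul. A standalone illustration of that pattern, using a plain matrix multiply as a stand-in for `bitsandbytes.matmul_4bit`:

```python
import torch

def matmul_2d_only(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
    """Apply a 2-D-only kernel to a possibly 3-D activation tensor."""
    original_shape = x.shape
    if len(original_shape) == 3:
        B, L, _ = original_shape
        x = x.reshape(B * L, -1)          # flatten (batch, seq, hidden) to 2-D
    out = x @ weight.t()                  # stand-in for bitsandbytes.matmul_4bit
    if len(original_shape) == 3:
        B, L, _ = original_shape
        out = out.reshape(B, L, -1)       # restore (batch, seq, out_features)
    return out

x = torch.randn(2, 5, 16)                 # batch=2, seq=5, hidden=16
w = torch.randn(32, 16)                   # out_features=32, in_features=16
assert matmul_2d_only(x, w).shape == (2, 5, 32)
```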
3 changes: 3 additions & 0 deletions vllm/model_executor/model_loader/loader.py
@@ -880,6 +880,9 @@ def generator() -> Generator:

    def _load_weights(self, model_config: ModelConfig,
                      model: nn.Module) -> None:

        self.target_modules += getattr(
            model, 'bitsandbytes_quant_target_modules', [])

        if not hasattr(model, 'load_weights'):
            raise AttributeError(
                "The required method 'load_weights' is not defined in class"
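With this change, a model can opt extra modules into bitsandbytes quantization by exposing a `bitsandbytes_quant_target_modules` attribute, which the loader appends to its default `target_modules`. A sketch of what that could look like on a model class; the class and module names here are hypothetical, not taken from the actual GLM-4V implementation:

```python
import torch.nn as nn

class MyVisionLanguageModel(nn.Module):
    # Read by BitsAndBytesModelLoader._load_weights via getattr(); these
    # suffixes are appended to the loader's default target_modules.
    # NOTE: hypothetical module names, for illustration only.
    bitsandbytes_quant_target_modules = [
        ".vision_mlp.fc1.",
        ".vision_mlp.fc2.",
    ]

    def load_weights(self, weights):
        # required by the loader; real models implement this
        ...
```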