2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -192,7 +192,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
GIT_REPOSITORY https://gh-proxy.com/https://github.com/nvidia/cutlass.git
# CUTLASS 3.5.1
GIT_TAG 06b21349bcf6ddf6a1686a47a137ad1446579db9
GIT_PROGRESS TRUE
3 changes: 3 additions & 0 deletions README.md
@@ -27,6 +27,9 @@ Easy, fast, and cheap LLM serving for everyone
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).

---
## Usage
glm-4v: [Colab Notebook](https://colab.research.google.com/drive/1jpCM0H3thZjN1XqcnpHm3S5g2Z9Pz89k?usp=sharing)

## About
vLLM is a fast and easy-to-use library for LLM inference and serving.

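For readers who skip the notebook: below is a minimal sketch of the same flow, condensed from the `tests/models/test_glm4v.py` file added later in this PR. The model name, stop-token IDs, and bitsandbytes arguments come from that test; the image path is a placeholder and hardware settings will vary.

```python
import torch
from PIL import Image
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.inputs import TokensPrompt

model_name = "THUDM/glm-4v-9b"

# 4-bit bitsandbytes quantization so the 9B model fits on a single GPU.
llm = LLM(model=model_name, trust_remote_code=True, max_model_len=8192,
          dtype=torch.bfloat16, enforce_eager=True,
          load_format='bitsandbytes', quantization='bitsandbytes')

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
image = Image.open("example.jpg").convert("RGB")   # placeholder image path

# GLM-4V's chat template inserts the image placeholder tokens and returns
# both the token IDs and the preprocessed image tensor.
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "image": image, "content": "Describe this picture."}],
    add_generation_prompt=True, tokenize=True,
    return_tensors="pt", return_dict=True)

outputs = llm.generate(
    TokensPrompt(prompt_token_ids=inputs['input_ids'][0].tolist(),
                 multi_modal_data={"image": inputs['images']}),
    sampling_params=SamplingParams(temperature=0, max_tokens=1024,
                                   stop_token_ids=[151329, 151336, 151338]))
print(outputs[0].outputs[0].text)
```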
29 changes: 29 additions & 0 deletions tests/models/=0.42.0
@@ -0,0 +1,29 @@
Looking in indexes: http://mirrors.aliyun.com/pypi/simple
Collecting bitsandbytes
Downloading http://mirrors.aliyun.com/pypi/packages/f8/1a/3cbdd70ce276085602ffe7e4f52753a41c43464053eec9e76b3dd065e4c9/bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 137.5/137.5 MB 33.3 MB/s eta 0:00:00
Requirement already satisfied: numpy in /root/miniconda3/lib/python3.10/site-packages (from bitsandbytes) (1.26.3)
Requirement already satisfied: torch in /root/miniconda3/lib/python3.10/site-packages (from bitsandbytes) (2.4.0)
Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (11.0.2.54)
Requirement already satisfied: typing-extensions>=4.8.0 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (4.12.2)
Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (12.1.0.106)
Requirement already satisfied: networkx in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (3.2.1)
Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (12.1.105)
Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (10.3.2.106)
Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (12.1.3.1)
Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (2.20.5)
Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (9.1.0.70)
Requirement already satisfied: jinja2 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (3.1.2)
Requirement already satisfied: filelock in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (3.13.1)
Requirement already satisfied: fsspec in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (2023.12.2)
Requirement already satisfied: triton==3.0.0 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (3.0.0)
Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (12.1.105)
Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (11.4.5.107)
Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (12.1.105)
Requirement already satisfied: sympy in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (1.12)
Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /root/miniconda3/lib/python3.10/site-packages (from torch->bitsandbytes) (12.1.105)
Requirement already satisfied: nvidia-nvjitlink-cu12 in /root/miniconda3/lib/python3.10/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch->bitsandbytes) (12.6.20)
Requirement already satisfied: MarkupSafe>=2.0 in /root/miniconda3/lib/python3.10/site-packages (from jinja2->torch->bitsandbytes) (2.1.3)
Requirement already satisfied: mpmath>=0.19 in /root/miniconda3/lib/python3.10/site-packages (from sympy->torch->bitsandbytes) (1.3.0)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.3
1 change: 1 addition & 0 deletions tests/models/authorized_keys
@@ -0,0 +1 @@
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQCknPM98UwkAhlbOCvr+W6AFjWcMT2sz4566yknaalfAlK6VJTQ6k4xzuEmY4jTnYxNwgGGoal7mpsZnUCtiR7Qtv+JAoSSinSuV75QxJDht1dwutR7Pic7qDyCdzKESCKH1Wz0AGZnCMYh8G1SPN5lPQYFEsSjNtefdNySLzuRsqbEi7Cvx3HJSJCbuOeCKJFPQFerUwgE2WAhHjlWGKHsOnbGA/WTKw1yLohAupBYcf3I2B0nRPbUWb6NXq4VRd0NsdpDpipEmzxXYUHyF6bJAG+y4CBmfhHkfPRJamebr1X68Ueyo9MiQhviB4HWXQR0/KdZz5pSPX+PKvlk5+g/BIUW7E/43Ev7RqXPROQtJrB7/UUDh2VV3p0l2Nv6sVsZg5WAIsJUoZD2qcsLdJbFRTIZ23LXofruTGAoJNBaNgObVmrRB12Fg6iJ8As3jNHGGYR9pCGi5BbWaFw58Sko1M8b+10cVYHkTWy8St2p9FM9Vn7uE5Nj8IejtdBHtN+0h+aE18FAVRloKVJP+ZfTLPnIFEansQy5gwCc+E0mIWHyUIRC+/Qh3lA54JRbaV6zGbAlZJYXljiO3SzdgEsC3WrHquKn5X7HOMtm5tOqKgwtyZlvIHZkjQH2nsPvp5O+Dr7YNfTAj2u53LdtMgXHClFMhCoIVaBEx+OjkbJjiw== kaggle_remote_ssh
110 changes: 110 additions & 0 deletions tests/models/test_glm4v.py
@@ -0,0 +1,110 @@
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# torch-2.1.0+cuda11.8-cp310-cp310-linux_aarch64.whl

import torch
from PIL import Image
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.inputs import TokensPrompt

max_model_len, tp_size = 8192, 1
model_name = "THUDM/glm-4v-9b"

llm = LLM(
    model=model_name,
    tensor_parallel_size=tp_size,
    max_model_len=max_model_len,
    trust_remote_code=True,
    dtype=torch.bfloat16,
    enforce_eager=True,
    load_format='bitsandbytes',
    quantization='bitsandbytes'
)
stop_token_ids = [151329, 151336, 151338]
sampling_params = SamplingParams(temperature=0, max_tokens=1024, stop_token_ids=stop_token_ids)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

query = 'Describe this picture.'

image = Image.open(os.path.join(os.path.dirname(__file__), "../../docs/source/assets/logos/vllm-logo-text-light.png")).convert('RGB')
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "image": image, "content": query}],
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
    return_dict=True
)

image_tensor = inputs['images']

input_ids = inputs['input_ids'][0].tolist()

outputs = llm.generate(
    TokensPrompt(**{
        "prompt_token_ids": input_ids,
        "multi_modal_data": {"image": image_tensor},
    }),
    sampling_params=sampling_params
)

print(outputs[0].outputs[0].text)


# from transformers import AutoTokenizer
# from vllm import LLM, SamplingParams

# # GLM-4-9B-Chat-1M
# # max_model_len, tp_size = 1048576, 4
# # If OOM occurs, reduce max_model_len or increase tp_size
# max_model_len, tp_size = 60000, 1
# model_name = "THUDM/glm-4-9b-chat"
# prompt = [{"role": "user", "content": "你好"}]

# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# llm = LLM(
#     model=model_name,
#     tensor_parallel_size=tp_size,
#     max_model_len=max_model_len,
#     trust_remote_code=True,
#     enforce_eager=True,
#     load_format='bitsandbytes',
#     quantization='bitsandbytes'
#     # For GLM-4-9B-Chat-1M, if OOM occurs, enable the parameters below
#     # enable_chunked_prefill=True,
#     # max_num_batched_tokens=8192
# )
# stop_token_ids = [151329, 151336, 151338]
# sampling_params = SamplingParams(temperature=0.95, max_tokens=1024, stop_token_ids=stop_token_ids)

# inputs = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
# outputs = llm.generate(prompts=inputs, sampling_params=sampling_params)

# print(outputs[0].outputs[0].text)

# from vllm import LLM, SamplingParams


# prompts = [
#     "Hello, China is a"
# ]
# sampling_params = SamplingParams(temperature=0.8, top_p=0.95)


# llm = LLM(
#     model="huggyllama/llama-7b",
#     trust_remote_code=True,
#     enforce_eager=True,
#     load_format='bitsandbytes',
#     quantization='bitsandbytes'
# )

# outputs = llm.generate(prompts, sampling_params)

# # Print the outputs.
# for output in outputs:
#     prompt = output.prompt
#     generated_text = output.outputs[0].text
#     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
24 changes: 24 additions & 0 deletions vllm/model_executor/layers/linear.py
@@ -475,6 +475,18 @@ def weight_loader(self,
                    shard_size, shard_offset = adjust_marlin_shard(
                        param, shard_size, shard_offset)

                # TODO: Double check
                # The bitsandbytes-quantized param has a packed (smaller) output
                # dim, so rescale the shard offset/size proportionally onto it.
                use_bitsandbytes = getattr(param, "use_bitsandbytes", False)
                if use_bitsandbytes:
                    total = sum(self.output_sizes)
                    orig_offset, orig_size = shard_offset, shard_size

                    quantized_total = param.data.shape[0]
                    quantized_offset = orig_offset * quantized_total // total
                    quantized_size = orig_size * quantized_total // total

                    shard_offset, shard_size = quantized_offset, quantized_size

                loaded_weight_shard = loaded_weight.narrow(
                    output_dim, shard_offset, shard_size)
                self.weight_loader(param, loaded_weight_shard, shard_id)
@@ -811,6 +823,18 @@ def weight_loader(self,
                    shard_size, shard_offset = adjust_marlin_shard(
                        param, shard_size, shard_offset)

                # TODO: Double check
                # Same rescaling as above for the fused QKV projection: map the
                # shard onto the packed bitsandbytes parameter proportionally.
                use_bitsandbytes = getattr(param, "use_bitsandbytes", False)
                if use_bitsandbytes:
                    total = (self.num_heads + 2 * self.num_kv_heads) * self.head_size
                    orig_offset, orig_size = shard_offset, shard_size

                    quantized_total = param.data.shape[0]
                    quantized_offset = orig_offset * quantized_total // total
                    quantized_size = orig_size * quantized_total // total

                    shard_offset, shard_size = quantized_offset, quantized_size

                loaded_weight_shard = loaded_weight.narrow(
                    output_dim, shard_offset, shard_size)
                self.weight_loader(param, loaded_weight_shard, shard_id)
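For clarity, here is a self-contained sketch of the rescaling that both hunks above perform: a shard's offset and size, expressed in unquantized output rows, are mapped proportionally onto the bitsandbytes-quantized parameter, whose first dimension is smaller. The helper and the numbers below are illustrative only, not part of the PR.

```python
def rescale_shard(shard_offset: int, shard_size: int,
                  total_rows: int, quantized_rows: int) -> tuple[int, int]:
    """Map a shard from the unquantized output dim onto the quantized param."""
    return (shard_offset * quantized_rows // total_rows,
            shard_size * quantized_rows // total_rows)

# Hypothetical example: a fused gate/up projection with two 13696-row shards
# whose bitsandbytes parameter has 13696 rows in total.
total = 13696 + 13696            # sum(self.output_sizes)
quantized = 13696                # param.data.shape[0]
print(rescale_shard(0, 13696, total, quantized))      # -> (0, 6848)
print(rescale_shard(13696, 13696, total, quantized))  # -> (6848, 6848)
```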
11 changes: 11 additions & 0 deletions vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -121,6 +121,12 @@ def apply(self,

        # only load the bitsandbytes module when needed
        from bitsandbytes import matmul_4bit
        original_shape = x.shape

        if len(original_shape) == 3:
            # flatten (batch, seq, hidden) to 2-D for the 4-bit matmul below
            B, L, _ = original_shape
            x = x.reshape(B * L, -1)

        original_type = x.dtype
        bf_x = x.to(torch.bfloat16)
@@ -154,4 +160,9 @@ def apply(self,
        if bias is not None:
            out += bias

        if len(original_shape) == 3:
            # restore the original (batch, seq, hidden) layout
            B, L, _ = original_shape
            out = out.reshape(B, L, -1)

        return out
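The two hunks above implement a flatten-then-restore pattern around the 4-bit matmul. A standalone illustration of that pattern, using a plain matrix multiply as a stand-in for `bitsandbytes.matmul_4bit`:

```python
import torch

def matmul_2d_only(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
    """Apply a 2-D-only kernel to a possibly 3-D activation tensor."""
    original_shape = x.shape
    if len(original_shape) == 3:
        B, L, _ = original_shape
        x = x.reshape(B * L, -1)          # flatten (batch, seq, hidden) to 2-D
    out = x @ weight.t()                  # stand-in for bitsandbytes.matmul_4bit
    if len(original_shape) == 3:
        B, L, _ = original_shape
        out = out.reshape(B, L, -1)       # restore (batch, seq, out_features)
    return out

x = torch.randn(2, 5, 16)                 # batch=2, seq=5, hidden=16
w = torch.randn(32, 16)                   # out_features=32, in_features=16
assert matmul_2d_only(x, w).shape == (2, 5, 32)
```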
3 changes: 3 additions & 0 deletions vllm/model_executor/model_loader/loader.py
@@ -880,6 +880,9 @@ def generator() -> Generator:

    def _load_weights(self, model_config: ModelConfig,
                      model: nn.Module) -> None:

        self.target_modules += getattr(
            model, 'bitsandbytes_quant_target_modules', [])

        if not hasattr(model, 'load_weights'):
            raise AttributeError(
                "The required method 'load_weights' is not defined in class"
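With this change, a model can opt extra modules into bitsandbytes quantization by exposing a `bitsandbytes_quant_target_modules` attribute, which the loader appends to its default `target_modules`. A sketch of what that could look like on a model class; the class and module names here are hypothetical, not taken from the actual GLM-4V implementation:

```python
import torch.nn as nn

class MyVisionLanguageModel(nn.Module):
    # Read by BitsAndBytesModelLoader._load_weights via getattr(); these
    # suffixes are appended to the loader's default target_modules.
    # NOTE: hypothetical module names, for illustration only.
    bitsandbytes_quant_target_modules = [
        ".vision_mlp.fc1.",
        ".vision_mlp.fc2.",
    ]

    def load_weights(self, weights):
        # required by the loader; real models implement this
        ...
```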