Description

Running the following script with `enable_lora=True` against a Qwen-1.8B-Chat checkpoint fails during engine initialization:
```python
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

llm = LLM(
    model="/home/data/llm/LLaMA-Factory-qwen1.5/models/Qwen-1_8B-Chat",
    enable_lora=True,
    trust_remote_code=True,
)

text_format_lora_path = "/home/data/llm/LLaMA-Factory-qwen1.5/qwen1_lora"

sampling_params = SamplingParams(
    temperature=0,
    max_tokens=256,
)

prompts = [
    "test",
]

outputs = llm.generate(
    prompts,
    sampling_params,
    lora_request=LoRARequest("text_format_adapter", 1, text_format_lora_path),
)
print(outputs)
```
```text
INFO 02-22 09:23:19 llm_engine.py:79] Initializing an LLM engine with config: model='/home/data/llm/LLaMA-Factory-qwen1.5/models/Qwen-1_8B-Chat', tokenizer='/home/data/llm/LLaMA-Factory-qwen1.5/models/Qwen-1_8B-Chat', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
WARNING 02-22 09:23:20 tokenizer.py:64] Using a slow tokenizer. This might cause a significant slowdown. Consider using a fast tokenizer instead.
Traceback (most recent call last):
  File "/home/data/llm/LLaMA-Factory-qwen1.5/multilora_configure.py", line 4, in <module>
    llm = LLM(model="/home/data/llm/LLaMA-Factory-qwen1.5/models/Qwen-1_8B-Chat", enable_lora=True, trust_remote_code=True)
  File "/home/data/llm/vllm_0.3.2/vllm-main/vllm/entrypoints/llm.py", line 109, in __init__
    self.llm_engine = LLMEngine.from_engine_args(engine_args)
  File "/home/data/llm/vllm_0.3.2/vllm-main/vllm/engine/llm_engine.py", line 371, in from_engine_args
    engine = cls(*engine_configs,
  File "/home/data/llm/vllm_0.3.2/vllm-main/vllm/engine/llm_engine.py", line 120, in __init__
    self._init_workers()
  File "/home/data/llm/vllm_0.3.2/vllm-main/vllm/engine/llm_engine.py", line 164, in _init_workers
    self._run_workers("load_model")
  File "/home/data/llm/vllm_0.3.2/vllm-main/vllm/engine/llm_engine.py", line 1014, in _run_workers
    driver_worker_output = getattr(self.driver_worker,
  File "/home/data/llm/vllm_0.3.2/vllm-main/vllm/worker/worker.py", line 100, in load_model
    self.model_runner.load_model()
  File "/home/data/llm/vllm_0.3.2/vllm-main/vllm/worker/model_runner.py", line 84, in load_model
    self.model = get_model(self.model_config, self.device_config,
  File "/home/data/llm/vllm_0.3.2/vllm-main/vllm/model_executor/model_loader.py", line 73, in get_model
    raise ValueError(
ValueError: Model QWenLMHeadModel does not support LoRA, but LoRA is enabled. Support for this model may be added in the future. If this is important to you, please open an issue on github.
```
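As the error says, LoRA support in vLLM 0.3.2 is gated per model class and `QWenLMHeadModel` is not on the supported list, so the request here is for that support to be added. Until then, one possible workaround is sketched below; it is not from this report and assumes the adapter at `qwen1_lora` was trained with PEFT. It merges the LoRA weights into the base checkpoint offline so the merged model can be served without `enable_lora`. The `merged_path` directory name is hypothetical, and this approach gives up per-request adapter switching via `LoRARequest`.

```python
# Hedged workaround sketch (assumes a PEFT-format adapter): merge the LoRA
# deltas into the base Qwen weights, then serve the merged checkpoint with
# vLLM without enable_lora, avoiding the unsupported-model check.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_path = "/home/data/llm/LLaMA-Factory-qwen1.5/models/Qwen-1_8B-Chat"
lora_path = "/home/data/llm/LLaMA-Factory-qwen1.5/qwen1_lora"
merged_path = "/home/data/llm/LLaMA-Factory-qwen1.5/models/Qwen-1_8B-Chat-merged"  # hypothetical output dir

# Load the base model (Qwen needs trust_remote_code) and attach the adapter.
base = AutoModelForCausalLM.from_pretrained(
    base_path, torch_dtype=torch.float16, trust_remote_code=True
)
model = PeftModel.from_pretrained(base, lora_path)

# Fold the LoRA weights into the base layers and drop the adapter wrappers.
merged = model.merge_and_unload()
merged.save_pretrained(merged_path)

# Keep the tokenizer next to the merged weights so vLLM can load it.
tokenizer = AutoTokenizer.from_pretrained(base_path, trust_remote_code=True)
tokenizer.save_pretrained(merged_path)
```

The merged directory can then be passed as `model=merged_path` to `LLM(...)` with no `enable_lora` flag, which is only a stopgap if a single adapter is enough for the deployment.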