```python
from PIL import Image
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

MODEL_NAME = "openbmb/MiniCPM-V-2_6"
# Load the input image and the model's tokenizer.
image = Image.open("dubu.png").convert("RGB")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

context_length = 2000
num_device = 1
llm = LLM(
    model=MODEL_NAME,
    trust_remote_code=True,
    max_model_len=context_length,
    max_seq_len_to_capture=context_length,
    speculative_max_model_len=context_length,
    tensor_parallel_size=num_device,
    worker_use_ray=num_device > 1,  # only use Ray workers when running multi-GPU tensor parallelism
    quantization="fp8",
    gpu_memory_utilization=0.95,
)
# MiniCPM-V expects the (<image>./</image>) placeholder where the image should appear.
messages = [{'role': 'user', 'content': '(<image>./</image>)\n' + 'what is in this image?'}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
stop_tokens = ['<|im_end|>', '<|endoftext|>']
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
sampling_params = SamplingParams(
    temperature=0.9,
    max_tokens=2000,
    best_of=3,
    stop_token_ids=stop_token_ids,  # pass the stop token ids so generation actually stops on them
)
outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": image},
    },
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
```
I have already given the way I like to use vLLM in my script above.
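As a side note, the same `llm.generate` call also accepts a list of prompt dicts, so several images can be run through the model in one batch. Below is a minimal sketch under that assumption; the second file name `dubu2.png` is only a hypothetical example and reuses the `prompt`, `llm`, and `sampling_params` defined above.

```python
# Hypothetical batched variant: generate() also accepts a list of prompt dicts.
images = [Image.open(p).convert("RGB") for p in ["dubu.png", "dubu2.png"]]  # dubu2.png is a made-up example file

batch = [
    {"prompt": prompt, "multi_modal_data": {"image": img}}
    for img in images
]

outputs = llm.generate(batch, sampling_params=sampling_params)
for out in outputs:
    print(out.outputs[0].text)
```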