1212from vllm .utils import FlexibleArgumentParser
1313
1414audio_assets = [AudioAsset ("mary_had_lamb" ), AudioAsset ("winning_call" )]
15- question_per_audio_count = [
16- "What is recited in the audio?" ,
17- "What sport and what nursery rhyme are referenced?"
18- ]
# Example question for each supported number of audio items per prompt.
# The key is the audio count used to look the question up; the entry for 0
# is a question that does not refer to any audio.
question_per_audio_count = {
    0: "What is 1+1?",
    1: "What is recited in the audio?",
    2: "What sport and what nursery rhyme are referenced?",
}
1920
2021
2122# Ultravox 0.3
22- def run_ultravox (question , audio_count ):
23+ def run_ultravox (question : str , audio_count : int ):
2324 model_name = "fixie-ai/ultravox-v0_3"
2425
2526 tokenizer = AutoTokenizer .from_pretrained (model_name )
@@ -42,9 +43,29 @@ def run_ultravox(question, audio_count):
4243 return llm , prompt , stop_token_ids
4344
4445
45- model_example_map = {
46- "ultravox" : run_ultravox ,
47- }
# Qwen2-Audio
def run_qwen2_audio(question: str, audio_count: int):
    """Build the engine and the chat prompt for the Qwen2-Audio example.

    Args:
        question: Text question placed after the audio placeholders in the
            user turn.
        audio_count: Number of audio items in the prompt. One numbered
            placeholder line is emitted per item; with 0 the user turn
            contains only the question.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple. ``stop_token_ids`` is
        always ``None`` for this model.
    """
    model_name = "Qwen/Qwen2-Audio-7B-Instruct"

    llm = LLM(model=model_name,
              max_model_len=4096,
              max_num_seqs=5,
              limit_mm_per_prompt={"audio": audio_count})

    # One placeholder line per audio item, numbered from 1.
    audio_in_prompt = "".join(
        f"Audio {idx + 1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
        for idx in range(audio_count))

    # ChatML layout: role name, newline, content, <|im_end|>, newline.
    # No padding spaces are inserted around the special tokens.
    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
              "<|im_start|>user\n"
              f"{audio_in_prompt}{question}<|im_end|>\n"
              "<|im_start|>assistant\n")
    stop_token_ids = None
    return llm, prompt, stop_token_ids
66+
67+
# Maps a model key to the function that builds its
# (llm, prompt, stop_token_ids) tuple.
model_example_map = {
    "ultravox": run_ultravox,
    "qwen2_audio": run_qwen2_audio,
}
4869
4970
5071def main (args ):
@@ -54,24 +75,25 @@ def main(args):
5475
5576 audio_count = args .num_audios
5677 llm , prompt , stop_token_ids = model_example_map [model ](
57- question_per_audio_count [audio_count - 1 ], audio_count )
78+ question_per_audio_count [audio_count ], audio_count )
5879
5980 # We set temperature to 0.2 so that outputs can be different
6081 # even when all prompts are identical when running batch inference.
6182 sampling_params = SamplingParams (temperature = 0.2 ,
6283 max_tokens = 64 ,
6384 stop_token_ids = stop_token_ids )
6485
65- assert args .num_prompts > 0
66- inputs = {
67- "prompt" : prompt ,
68- "multi_modal_data" : {
86+ mm_data = {}
87+ if audio_count > 0 :
88+ mm_data = {
6989 "audio" : [
7090 asset .audio_and_sample_rate
7191 for asset in audio_assets [:audio_count ]
7292 ]
73- },
74- }
93+ }
94+
95+ assert args .num_prompts > 0
96+ inputs = {"prompt" : prompt , "multi_modal_data" : mm_data }
7597 if args .num_prompts > 1 :
7698 # Batch inference
7799 inputs = [inputs ] * args .num_prompts
@@ -100,7 +122,7 @@ def main(args):
100122 parser .add_argument ("--num-audios" ,
101123 type = int ,
102124 default = 1 ,
103- choices = [1 , 2 ],
125+ choices = [0 , 1 , 2 ],
104126 help = "Number of audio items per prompt." )
105127
106128 args = parser .parse_args ()
0 commit comments