Commit 6c6cacb

faychu authored and DarkLight1337 committed
[Model] Add Qwen2-Audio model support (vllm-project#9248)
Co-authored-by: DarkLight1337 <[email protected]>
Signed-off-by: LeiWang1999 <[email protected]>
1 parent cc489f8 commit 6c6cacb

File tree

7 files changed (+515 -17)

docs/source/models/supported_models.rst

Lines changed: 6 additions & 0 deletions
@@ -459,6 +459,12 @@ Text Generation
     - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
     -
     - ✅︎
+  * - :code:`Qwen2AudioForConditionalGeneration`
+    - Qwen2-Audio
+    - T + A\ :sup:`+`
+    - :code:`Qwen/Qwen2-Audio-7B-Instruct`
+    -
+    - ✅︎
   * - :code:`Qwen2VLForConditionalGeneration`
     - Qwen2-VL
     - T + I\ :sup:`E+` + V\ :sup:`+`
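
For context on the new table entry, here is a minimal single-audio inference sketch assembled from the run_qwen2_audio example added later in this commit. It is a sketch under stated assumptions, not canonical API documentation: the AudioAsset import path (vllm.assets.audio) and the "mary_had_lamb" asset name are taken from the example script.

    # Minimal sketch: single-audio inference with Qwen2-Audio via vLLM.
    from vllm import LLM, SamplingParams
    from vllm.assets.audio import AudioAsset

    llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct",
              max_model_len=4096,
              limit_mm_per_prompt={"audio": 1})

    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
              "<|im_start|>user\n"
              "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
              "What is recited in the audio?<|im_end|>\n"
              "<|im_start|>assistant\n")

    # audio_and_sample_rate yields the (waveform, sample_rate) tuple
    # that vLLM expects for audio inputs.
    audio = AudioAsset("mary_had_lamb").audio_and_sample_rate

    outputs = llm.generate(
        {"prompt": prompt, "multi_modal_data": {"audio": [audio]}},
        SamplingParams(temperature=0.2, max_tokens=64))
    print(outputs[0].outputs[0].text)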

examples/offline_inference_audio_language.py

Lines changed: 38 additions & 16 deletions
@@ -12,14 +12,15 @@
 from vllm.utils import FlexibleArgumentParser

 audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
-question_per_audio_count = [
-    "What is recited in the audio?",
-    "What sport and what nursery rhyme are referenced?"
-]
+question_per_audio_count = {
+    0: "What is 1+1?",
+    1: "What is recited in the audio?",
+    2: "What sport and what nursery rhyme are referenced?"
+}


 # Ultravox 0.3
-def run_ultravox(question, audio_count):
+def run_ultravox(question: str, audio_count: int):
     model_name = "fixie-ai/ultravox-v0_3"

     tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -42,9 +43,29 @@ def run_ultravox(question, audio_count):
     return llm, prompt, stop_token_ids


-model_example_map = {
-    "ultravox": run_ultravox,
-}
+# Qwen2-Audio
+def run_qwen2_audio(question: str, audio_count: int):
+    model_name = "Qwen/Qwen2-Audio-7B-Instruct"
+
+    llm = LLM(model=model_name,
+              max_model_len=4096,
+              max_num_seqs=5,
+              limit_mm_per_prompt={"audio": audio_count})
+
+    audio_in_prompt = "".join([
+        f"Audio {idx+1}: "
+        f"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)
+    ])
+
+    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+              "<|im_start|>user\n"
+              f"{audio_in_prompt}{question}<|im_end|>\n"
+              "<|im_start|>assistant\n")
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids


 def main(args):
@@ -54,24 +75,25 @@ def main(args):

     audio_count = args.num_audios
     llm, prompt, stop_token_ids = model_example_map[model](
-        question_per_audio_count[audio_count - 1], audio_count)
+        question_per_audio_count[audio_count], audio_count)

     # We set temperature to 0.2 so that outputs can be different
     # even when all prompts are identical when running batch inference.
     sampling_params = SamplingParams(temperature=0.2,
                                      max_tokens=64,
                                      stop_token_ids=stop_token_ids)

-    assert args.num_prompts > 0
-    inputs = {
-        "prompt": prompt,
-        "multi_modal_data": {
+    mm_data = {}
+    if audio_count > 0:
+        mm_data = {
             "audio": [
                 asset.audio_and_sample_rate
                 for asset in audio_assets[:audio_count]
             ]
-        },
-    }
+        }
+
+    assert args.num_prompts > 0
+    inputs = {"prompt": prompt, "multi_modal_data": mm_data}
     if args.num_prompts > 1:
         # Batch inference
         inputs = [inputs] * args.num_prompts
@@ -100,7 +122,7 @@ def main(args):
     parser.add_argument("--num-audios",
                         type=int,
                         default=1,
-                        choices=[1, 2],
+                        choices=[0, 1, 2],
                         help="Number of audio items per prompt.")

     args = parser.parse_args()
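
The new 0 entry in question_per_audio_count pairs with choices=[0, 1, 2]: with --num-audios 0, the prompt contains no <|AUDIO|> placeholder and multi_modal_data stays empty, so the request degenerates to plain text generation. A hedged sketch of that zero-audio path, reusing the script's own helpers (assumed to run inside the example module, where run_qwen2_audio and question_per_audio_count are in scope):

    # Sketch of the zero-audio path: no "Audio N:" placeholders are emitted
    # and the multimodal payload stays empty, so Qwen2-Audio answers the
    # text-only question ("What is 1+1?").
    llm, prompt, stop_token_ids = run_qwen2_audio(question_per_audio_count[0], 0)
    outputs = llm.generate({"prompt": prompt, "multi_modal_data": {}},
                           SamplingParams(temperature=0.2, max_tokens=64,
                                          stop_token_ids=stop_token_ids))
    print(outputs[0].outputs[0].text)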

tests/distributed/test_pipeline_parallel.py

Lines changed: 1 addition & 0 deletions
@@ -199,6 +199,7 @@ def iter_params(self, model_name: str):
     "microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
     "mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"),  # noqa: E501
     "Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
+    "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
     "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
     "fixie-ai/ultravox-v0_3": PPTestSettings.fast(),
 }

vllm/entrypoints/chat_utils.py

Lines changed: 4 additions & 1 deletion
@@ -196,7 +196,10 @@ def _placeholder_str(self, modality: ModalityStr,
         elif modality == "audio":
             if model_type == "ultravox":
                 return "<|reserved_special_token_0|>"
-            raise TypeError(f"Unknown {modality} model type: {model_type}")
+            if model_type == "qwen2_audio":
+                return (f"Audio {current_count}: "
+                        f"<|audio_bos|><|AUDIO|><|audio_eos|>")
+            raise TypeError(f"Unknown model type: {model_type}")
         elif modality == "video":
             if model_type == "qwen2_vl":
                 return "<|vision_start|><|video_pad|><|vision_end|>"
