|
 
 from ....conftest import HfRunner, VllmRunner
 from ....utils import RemoteOpenAIServer
+from ...registry import HF_EXAMPLE_MODELS
 from ...utils import check_logprobs_close
 
 MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
@@ -55,7 +56,10 @@ def server(request, audio_assets):
         for key, value in request.param.items()
     ]
 
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    with RemoteOpenAIServer(MODEL_NAME,
+                            args,
+                            env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
+                                      "30"}) as remote_server:
         yield remote_server
 
 
@@ -106,6 +110,10 @@ def run_test(
     **kwargs,
 ):
     """Inference result should be the same between hf and vllm."""
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_available_online(on_fail="skip")
+    model_info.check_transformers_version(on_fail="skip")
+
     # NOTE: take care of the order. run vLLM first, and then run HF.
     # vLLM needs a fresh new process without cuda initialization.
     # if we run HF first, the cuda initialization will be done and it
@@ -156,6 +164,10 @@ def run_multi_audio_test(
     num_logprobs: int,
     **kwargs,
 ):
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_available_online(on_fail="skip")
+    model_info.check_transformers_version(on_fail="skip")
+
     with vllm_runner(model,
                      dtype=dtype,
                      enforce_eager=True,