@@ -1,14 +1,19 @@
 import asyncio
 import os
+from asyncio import CancelledError
 from dataclasses import dataclass
+from typing import Optional
 
 import pytest
+import pytest_asyncio
 import torch
 
 from vllm import SamplingParams
 from vllm.config import ParallelConfig
 from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
+from vllm.outputs import RequestOutput as RealRequestOutput
 
+from ..conftest import cleanup
 from ..utils import wait_for_gpu_memory_to_clear
 
 
@@ -118,33 +123,103 @@ async def test_new_requests_event():
     os.environ.pop("VLLM_ALLOW_ENGINE_USE_RAY")
 
 
-def test_asyncio_run():
+def start_engine():
     wait_for_gpu_memory_to_clear(
         devices=list(range(torch.cuda.device_count())),
         threshold_bytes=2 * 2**30,
         timeout_s=60,
     )
 
-    engine = AsyncLLMEngine.from_engine_args(
-        AsyncEngineArgs(model="facebook/opt-125m"))
+    return AsyncLLMEngine.from_engine_args(
+        AsyncEngineArgs(model="facebook/opt-125m", enforce_eager=True))
+
+
+@pytest_asyncio.fixture(scope="module")
+async def async_engine():
+    engine = await asyncio.get_event_loop().run_in_executor(executor=None,
+                                                            func=start_engine)
+    try:
+        yield engine
+    finally:
+        engine.shutdown_background_loop()
+        del engine
+        await asyncio.sleep(0.1)
+        cleanup()
+
+
+@pytest.fixture()
+def should_do_global_cleanup_after_test(request) -> bool:
+    # So we can share the async engine fixture between these tests
+    return False
+
+
+@pytest.mark.asyncio(scope="module")
+async def test_asyncio_run(async_engine):
 
     async def run(prompt: str):
         sampling_params = SamplingParams(
             temperature=0,
             max_tokens=32,
         )
 
-        async for output in engine.generate(prompt,
-                                            sampling_params,
-                                            request_id=prompt):
+        async for output in async_engine.generate(prompt,
+                                                  sampling_params,
+                                                  request_id=prompt):
             final_output = output
         return final_output
 
-    async def generate():
-        return await asyncio.gather(
-            run("test0"),
-            run("test1"),
-        )
-
-    results = asyncio.run(generate())
+    results = await asyncio.gather(
+        run("test0"),
+        run("test1"),
+    )
     assert len(results) == 2
+
+
+@pytest.mark.asyncio(scope="module")
+async def test_cancellation(async_engine):
+    sampling_params = SamplingParams(
+        temperature=0,
+        min_tokens=10,
+        max_tokens=10,
+    )
+
+    i = 0
+    with pytest.raises(CancelledError):
+        async for output in async_engine.generate("test2",
+                                                  sampling_params,
+                                                  request_id="test2"):
+            assert not output.finished
+            i += 1
+            if i == 5:
+                await async_engine.abort("test2")
+
+    assert i == 5
+
+
+@pytest.mark.asyncio(scope="module")
+async def test_delayed_generator(async_engine):
+    sampling_params = SamplingParams(
+        temperature=0,
+        min_tokens=10,
+        max_tokens=10,
+    )
+
+    stream = async_engine.generate("test3",
+                                   sampling_params,
+                                   request_id="test3")
+    i = 0
+    final_output: Optional[RealRequestOutput] = None
+    async for output in stream:
+        final_output = output
+        if i == 0:
+            # wait for generation to complete before consuming
+            # the remaining messages
+            await asyncio.sleep(1)
+        if i < 9:
+            assert not output.finished
+        i += 1
+
+    assert i == 10
+    assert final_output is not None
+    assert len(final_output.outputs[0].token_ids) == 10
+    assert final_output.finished