|
25 | 25 | DeploymentGraph, |
26 | 26 | Payload, |
27 | 27 | chat_completions_response_handler, |
28 | | - completions_response_handler, |
29 | 28 | ) |
30 | 29 | from tests.utils.managed_process import ManagedProcess |
31 | 30 |
|
|
56 | 55 | expected_response=["bus"], |
57 | 56 | ) |
58 | 57 |
|
59 | | -text_payload = Payload( |
60 | | - payload_chat={ |
61 | | - "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", |
62 | | - "messages": [ |
63 | | - { |
64 | | - "role": "user", |
65 | | - "content": text_prompt, # Shorter prompt |
66 | | - } |
67 | | - ], |
68 | | - "max_tokens": 150, # Reduced from 500 |
69 | | - "temperature": 0.1, |
70 | | - # "seed": 0, |
71 | | - }, |
72 | | - payload_completions={ |
73 | | - "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", |
74 | | - "prompt": text_prompt, |
75 | | - "max_tokens": 150, |
76 | | - "temperature": 0.1, |
77 | | - # "seed": 0, |
78 | | - }, |
79 | | - repeat_count=10, |
80 | | - expected_log=[], |
81 | | - expected_response=["AI"], |
82 | | -) |
83 | | - |
84 | 58 | deployment_graphs = { |
85 | | - "agg": ( |
86 | | - DeploymentGraph( |
87 | | - module="graphs.agg:Frontend", |
88 | | - config="configs/agg.yaml", |
89 | | - directory="/workspace/examples/llm", |
90 | | - endpoints=["v1/chat/completions"], |
91 | | - response_handlers=[ |
92 | | - chat_completions_response_handler, |
93 | | - ], |
94 | | - marks=[pytest.mark.gpu_1, pytest.mark.vllm], |
95 | | - ), |
96 | | - text_payload, |
97 | | - ), |
98 | | - "sglang_agg": ( |
99 | | - DeploymentGraph( |
100 | | - module="graphs.agg:Frontend", |
101 | | - config="configs/agg.yaml", |
102 | | - directory="/workspace/examples/sglang", |
103 | | - endpoints=["v1/chat/completions", "v1/completions"], |
104 | | - response_handlers=[ |
105 | | - chat_completions_response_handler, |
106 | | - completions_response_handler, |
107 | | - ], |
108 | | - marks=[pytest.mark.gpu_1, pytest.mark.sglang], |
109 | | - ), |
110 | | - text_payload, |
111 | | - ), |
112 | | - "disagg": ( |
113 | | - DeploymentGraph( |
114 | | - module="graphs.disagg:Frontend", |
115 | | - config="configs/disagg.yaml", |
116 | | - directory="/workspace/examples/llm", |
117 | | - endpoints=["v1/chat/completions"], |
118 | | - response_handlers=[ |
119 | | - chat_completions_response_handler, |
120 | | - ], |
121 | | - marks=[pytest.mark.gpu_2, pytest.mark.vllm], |
122 | | - ), |
123 | | - text_payload, |
124 | | - ), |
125 | | - "agg_router": ( |
126 | | - DeploymentGraph( |
127 | | - module="graphs.agg_router:Frontend", |
128 | | - config="configs/agg_router.yaml", |
129 | | - directory="/workspace/examples/llm", |
130 | | - endpoints=["v1/chat/completions"], |
131 | | - response_handlers=[ |
132 | | - chat_completions_response_handler, |
133 | | - ], |
134 | | - marks=[pytest.mark.gpu_1, pytest.mark.vllm], |
135 | | - # FIXME: This is a hack to allow deployments to start before sending any requests. |
136 | | - # When using KV-router, if all the endpoints are not registered, the service |
137 | | - # enters a non-recoverable state. |
138 | | - delayed_start=120, |
139 | | - ), |
140 | | - text_payload, |
141 | | - ), |
142 | | - "disagg_router": ( |
143 | | - DeploymentGraph( |
144 | | - module="graphs.disagg_router:Frontend", |
145 | | - config="configs/disagg_router.yaml", |
146 | | - directory="/workspace/examples/llm", |
147 | | - endpoints=["v1/chat/completions"], |
148 | | - response_handlers=[ |
149 | | - chat_completions_response_handler, |
150 | | - ], |
151 | | - marks=[pytest.mark.gpu_2, pytest.mark.vllm], |
152 | | - # FIXME: This is a hack to allow deployments to start before sending any requests. |
153 | | - # When using KV-router, if all the endpoints are not registered, the service |
154 | | - # enters a non-recoverable state. |
155 | | - delayed_start=120, |
156 | | - ), |
157 | | - text_payload, |
158 | | - ), |
159 | 59 | "multimodal_agg": ( |
160 | 60 | DeploymentGraph( |
161 | 61 | module="graphs.agg:Frontend", |
|
169 | 69 | ), |
170 | 70 | multimodal_payload, |
171 | 71 | ), |
172 | | - "vllm_v1_agg": ( |
173 | | - DeploymentGraph( |
174 | | - module="graphs.agg:Frontend", |
175 | | - config="configs/agg.yaml", |
176 | | - directory="/workspace/examples/vllm_v1", |
177 | | - endpoints=["v1/chat/completions", "v1/completions"], |
178 | | - response_handlers=[ |
179 | | - chat_completions_response_handler, |
180 | | - completions_response_handler, |
181 | | - ], |
182 | | - marks=[pytest.mark.gpu_1, pytest.mark.vllm], |
183 | | - ), |
184 | | - text_payload, |
185 | | - ), |
186 | | - "trtllm_agg": ( |
187 | | - DeploymentGraph( |
188 | | - module="graphs.agg:Frontend", |
189 | | - config="configs/agg.yaml", |
190 | | - directory="/workspace/examples/tensorrt_llm", |
191 | | - endpoints=["v1/chat/completions", "v1/completions"], |
192 | | - response_handlers=[ |
193 | | - chat_completions_response_handler, |
194 | | - completions_response_handler, |
195 | | - ], |
196 | | - marks=[pytest.mark.gpu_1, pytest.mark.tensorrtllm], |
197 | | - ), |
198 | | - text_payload, |
199 | | - ), |
200 | | - "trtllm_agg_router": ( |
201 | | - DeploymentGraph( |
202 | | - module="graphs.agg:Frontend", |
203 | | - config="configs/agg_router.yaml", |
204 | | - directory="/workspace/examples/tensorrt_llm", |
205 | | - endpoints=["v1/chat/completions", "v1/completions"], |
206 | | - response_handlers=[ |
207 | | - chat_completions_response_handler, |
208 | | - completions_response_handler, |
209 | | - ], |
210 | | - marks=[pytest.mark.gpu_1, pytest.mark.tensorrtllm], |
211 | | - # FIXME: This is a hack to allow deployments to start before sending any requests. |
212 | | - # When using KV-router, if all the endpoints are not registered, the service |
213 | | - # enters a non-recoverable state. |
214 | | - delayed_start=120, |
215 | | - ), |
216 | | - text_payload, |
217 | | - ), |
218 | | - "trtllm_disagg": ( |
219 | | - DeploymentGraph( |
220 | | - module="graphs.disagg:Frontend", |
221 | | - config="configs/disagg.yaml", |
222 | | - directory="/workspace/examples/tensorrt_llm", |
223 | | - endpoints=["v1/chat/completions", "v1/completions"], |
224 | | - response_handlers=[ |
225 | | - chat_completions_response_handler, |
226 | | - completions_response_handler, |
227 | | - ], |
228 | | - marks=[pytest.mark.gpu_2, pytest.mark.tensorrtllm], |
229 | | - ), |
230 | | - text_payload, |
231 | | - ), |
232 | | - "trtllm_disagg_router": ( |
233 | | - DeploymentGraph( |
234 | | - module="graphs.disagg:Frontend", |
235 | | - config="configs/disagg_router.yaml", |
236 | | - directory="/workspace/examples/tensorrt_llm", |
237 | | - endpoints=["v1/chat/completions", "v1/completions"], |
238 | | - response_handlers=[ |
239 | | - chat_completions_response_handler, |
240 | | - completions_response_handler, |
241 | | - ], |
242 | | - marks=[pytest.mark.gpu_2, pytest.mark.tensorrtllm], |
243 | | - # FIXME: This is a hack to allow deployments to start before sending any requests. |
244 | | - # When using KV-router, if all the endpoints are not registered, the service |
245 | | - # enters a non-recoverable state. |
246 | | - delayed_start=120, |
247 | | - ), |
248 | | - text_payload, |
249 | | - ), |
250 | 72 | } |
251 | 73 |
|
252 | 74 |
|
@@ -394,17 +216,6 @@ def wait_for_ready(self, payload, logger=logging.getLogger()): |
394 | 216 | @pytest.fixture( |
395 | 217 | params=[ |
396 | 218 | pytest.param("multimodal_agg", marks=[pytest.mark.vllm, pytest.mark.gpu_2]), |
397 | | - pytest.param("trtllm_agg", marks=[pytest.mark.tensorrtllm, pytest.mark.gpu_1]), |
398 | | - pytest.param( |
399 | | - "trtllm_agg_router", marks=[pytest.mark.tensorrtllm, pytest.mark.gpu_1] |
400 | | - ), |
401 | | - pytest.param( |
402 | | - "trtllm_disagg", marks=[pytest.mark.tensorrtllm, pytest.mark.gpu_2] |
403 | | - ), |
404 | | - pytest.param( |
405 | | - "trtllm_disagg_router", marks=[pytest.mark.tensorrtllm, pytest.mark.gpu_2] |
406 | | - ), |
407 | | - # pytest.param("sglang", marks=[pytest.mark.sglang, pytest.mark.gpu_2]), |
408 | 219 | ] |
409 | 220 | ) |
410 | 221 | def deployment_graph_test(request): |
|
0 commit comments