@@ -37,7 +37,7 @@ def stop_triton_server():
37
37
@pytest .mark .parametrize ("KV_CACHE_FREE_GPU_MEM_FRACTION" , ["" ])
38
38
@pytest .mark .parametrize ("ENABLE_TRT_OVERLAP" , ["False" ],
39
39
ids = ["disableTrtOverlap" ])
40
- @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" , "V1" ])
40
+ @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" ])
41
41
@pytest .mark .parametrize ("DECOUPLED_MODE" , ["True" , "False" ],
42
42
ids = ["enableDecoupleMode" , "disableDecoupleMode" ])
43
43
@pytest .mark .parametrize ("TRITON_MAX_BATCH_SIZE" , ["128" ])
@@ -170,7 +170,7 @@ def test_llama_v2_7b_ifb(
170
170
@pytest .mark .parametrize ("KV_CACHE_FREE_GPU_MEM_FRACTION" , ["" ])
171
171
@pytest .mark .parametrize ("ENABLE_TRT_OVERLAP" , ["False" ],
172
172
ids = ["disableTrtOverlap" ])
173
- @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" , "V1" ])
173
+ @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" ])
174
174
@pytest .mark .parametrize ("DECOUPLED_MODE" , ["True" , "False" ],
175
175
ids = ["enableDecoupleMode" , "disableDecoupleMode" ])
176
176
@pytest .mark .parametrize ("TRITON_MAX_BATCH_SIZE" , ["128" ])
@@ -287,7 +287,7 @@ def test_mistral_v1_7b_ifb(
287
287
@pytest .mark .parametrize ("KV_CACHE_FREE_GPU_MEM_FRACTION" , ["" ])
288
288
@pytest .mark .parametrize ("ENABLE_TRT_OVERLAP" , ["False" ],
289
289
ids = ["disableTrtOverlap" ])
290
- @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" , "V1" ])
290
+ @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" ])
291
291
@pytest .mark .parametrize ("DECOUPLED_MODE" , ["True" , "False" ],
292
292
ids = ["enableDecoupleMode" , "disableDecoupleMode" ])
293
293
@pytest .mark .parametrize ("TRITON_MAX_BATCH_SIZE" , ["128" ])
@@ -474,7 +474,7 @@ def test_mistral_v1_7b_python_backend(
474
474
@pytest .mark .parametrize ("KV_CACHE_FREE_GPU_MEM_FRACTION" , ["" ])
475
475
@pytest .mark .parametrize ("ENABLE_TRT_OVERLAP" , ["False" ],
476
476
ids = ["disableTrtOverlap" ])
477
- @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" , "V1" ])
477
+ @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" ])
478
478
@pytest .mark .parametrize ("DECOUPLED_MODE" , ["True" , "False" ],
479
479
ids = ["enableDecoupleMode" , "disableDecoupleMode" ])
480
480
@pytest .mark .parametrize ("TRITON_MAX_BATCH_SIZE" , ["128" ])
@@ -592,7 +592,7 @@ def test_llama_v2_70b_ifb(
592
592
@pytest .mark .parametrize ("KV_CACHE_FREE_GPU_MEM_FRACTION" , ["" ])
593
593
@pytest .mark .parametrize ("ENABLE_TRT_OVERLAP" , ["False" ],
594
594
ids = ["disableTrtOverlap" ])
595
- @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" , "V1" ])
595
+ @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" ])
596
596
@pytest .mark .parametrize ("DECOUPLED_MODE" , ["True" , "False" ],
597
597
ids = ["enableDecoupleMode" , "disableDecoupleMode" ])
598
598
@pytest .mark .parametrize ("TRITON_MAX_BATCH_SIZE" , ["128" ])
@@ -1043,7 +1043,7 @@ def test_gpt_350m_python_backend(
1043
1043
@pytest .mark .parametrize ("KV_CACHE_FREE_GPU_MEM_FRACTION" , ["" ])
1044
1044
@pytest .mark .parametrize ("ENABLE_TRT_OVERLAP" , ["False" ],
1045
1045
ids = ["disableTrtOverlap" ])
1046
- @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" , "V1" ])
1046
+ @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" ])
1047
1047
@pytest .mark .parametrize ("DECOUPLED_MODE" , ["True" , "False" ],
1048
1048
ids = ["enableDecoupleMode" , "disableDecoupleMode" ])
1049
1049
@pytest .mark .parametrize ("TRITON_MAX_BATCH_SIZE" , ["128" ])
@@ -1445,7 +1445,7 @@ def test_whisper_large_v3_ifb(
1445
1445
@pytest .mark .parametrize ("KV_CACHE_FREE_GPU_MEM_FRACTION" , ["0.2" ])
1446
1446
@pytest .mark .parametrize ("ENABLE_TRT_OVERLAP" , ["False" ],
1447
1447
ids = ["disableTrtOverlap" ])
1448
- @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" , "V1" ])
1448
+ @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" ])
1449
1449
@pytest .mark .parametrize ("DECOUPLED_MODE" , ["False" ],
1450
1450
ids = ["disableDecoupleMode" ])
1451
1451
@pytest .mark .parametrize ("TRITON_MAX_BATCH_SIZE" , ["128" ])
@@ -1569,7 +1569,7 @@ def test_gpt_gather_logits_ifb(
1569
1569
@pytest .mark .parametrize ("KV_CACHE_FREE_GPU_MEM_FRACTION" , ["0.2" ])
1570
1570
@pytest .mark .parametrize ("ENABLE_TRT_OVERLAP" , ["False" ],
1571
1571
ids = ["disableTrtOverlap" ])
1572
- @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" , "V1" ])
1572
+ @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" ])
1573
1573
@pytest .mark .parametrize ("DECOUPLED_MODE" , ["False" ],
1574
1574
ids = ["disableDecoupleMode" ])
1575
1575
@pytest .mark .parametrize ("TRITON_MAX_BATCH_SIZE" , ["128" ])
@@ -1759,7 +1759,7 @@ def test_gpt_350m_speculative_decoding(
1759
1759
@pytest .mark .parametrize ("KV_CACHE_FREE_GPU_MEM_FRACTION" , ["0.2" ])
1760
1760
@pytest .mark .parametrize ("ENABLE_TRT_OVERLAP" , ["False" ],
1761
1761
ids = ["disableTrtOverlap" ])
1762
- @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" , "V1" ])
1762
+ @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" ])
1763
1763
@pytest .mark .parametrize ("DECOUPLED_MODE" , ["False" ],
1764
1764
ids = ["disableDecoupleMode" ])
1765
1765
@pytest .mark .parametrize ("TRITON_MAX_BATCH_SIZE" , ["128" ])
@@ -1950,7 +1950,7 @@ def test_gpt_350m_speculative_decoding_return_logits(
1950
1950
@pytest .mark .parametrize ("KV_CACHE_FREE_GPU_MEM_FRACTION" , ["0.2" ])
1951
1951
@pytest .mark .parametrize ("ENABLE_TRT_OVERLAP" , ["False" ],
1952
1952
ids = ["disableTrtOverlap" ])
1953
- @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" , "V1" ])
1953
+ @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" ])
1954
1954
@pytest .mark .parametrize ("DECOUPLED_MODE" , ["False" ],
1955
1955
ids = ["disableDecoupleMode" ])
1956
1956
@pytest .mark .parametrize ("TRITON_MAX_BATCH_SIZE" , ["128" ])
@@ -2104,7 +2104,7 @@ def test_gpt_speculative_decoding_bls(
2104
2104
@pytest .mark .parametrize ("KV_CACHE_FREE_GPU_MEM_FRACTION" , ["0.2" ])
2105
2105
@pytest .mark .parametrize ("ENABLE_TRT_OVERLAP" , ["False" ],
2106
2106
ids = ["disableTrtOverlap" ])
2107
- @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" , "V1" ])
2107
+ @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" ])
2108
2108
@pytest .mark .parametrize ("DECOUPLED_MODE" , ["False" ],
2109
2109
ids = ["disableDecoupleMode" ])
2110
2110
@pytest .mark .parametrize ("TRITON_MAX_BATCH_SIZE" , ["128" ])
@@ -2278,7 +2278,7 @@ def test_llama_v3_speculative_decoding_bls(
2278
2278
@pytest .mark .parametrize ("KV_CACHE_FREE_GPU_MEM_FRACTION" , ["" ])
2279
2279
@pytest .mark .parametrize ("ENABLE_TRT_OVERLAP" , ["False" ],
2280
2280
ids = ["disableTrtOverlap" ])
2281
- @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" , "V1" ])
2281
+ @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" ])
2282
2282
@pytest .mark .parametrize ("DECOUPLED_MODE" , ["True" , "False" ],
2283
2283
ids = ["enableDecoupleMode" , "disableDecoupleMode" ])
2284
2284
@pytest .mark .parametrize ("TRITON_MAX_BATCH_SIZE" , ["128" ])
@@ -2394,7 +2394,7 @@ def test_gpt_175b_dummyWeights_ifb(
2394
2394
@pytest .mark .parametrize ("KV_CACHE_FREE_GPU_MEM_FRACTION" , ["0.7" ])
2395
2395
@pytest .mark .parametrize ("ENABLE_TRT_OVERLAP" , ["False" ],
2396
2396
ids = ["disableTrtOverlap" ])
2397
- @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" , "V1" ])
2397
+ @pytest .mark .parametrize ("BATCHING_STRATEGY" , ["inflight_fused_batching" ])
2398
2398
@pytest .mark .parametrize ("DECOUPLED_MODE" , ["True" , "False" ],
2399
2399
ids = ["enableDecoupleMode" , "disableDecoupleMode" ])
2400
2400
@pytest .mark .parametrize ("TRITON_MAX_BATCH_SIZE" , ["128" ])
0 commit comments