diff --git a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md
index 27512b16e5f..c50e73b8fd2 100644
--- a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md
+++ b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md
@@ -135,7 +135,6 @@ YOUR_DATA_PATH=
 cat >./extra-llm-api-config.yml<
 cat >./extra-llm-api-config.yml<
 context_extra-llm-api-config.yml
+echo -e "pytorch_backend_config:\n disable_overlap_scheduler: True\ncache_transceiver_config:\n max_num_tokens: 2048" > context_extra-llm-api-config.yml
 echo -e "cache_transceiver_config:\n max_num_tokens: 2048" > gen_extra-llm-api-config.yml
 export TRTLLM_USE_UCX_KVCACHE=1
@@ -65,7 +65,7 @@ model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 context_servers:
   num_instances: 1
   tensor_parallel_size: 1
diff --git a/examples/disaggregated/disagg_config.yaml b/examples/disaggregated/disagg_config.yaml
index 391ef87e8d2..a199a594522 100644
--- a/examples/disaggregated/disagg_config.yaml
+++ b/examples/disaggregated/disagg_config.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 context_servers:
   num_instances: 1
   tensor_parallel_size: 1
diff --git a/examples/llm-api/llm_inference_kv_events.py b/examples/llm-api/llm_inference_kv_events.py
index 69b9dc95a29..827427e538b 100644
--- a/examples/llm-api/llm_inference_kv_events.py
+++ b/examples/llm-api/llm_inference_kv_events.py
@@ -6,8 +6,7 @@
 def main():
-    pytorch_config = PyTorchConfig(enable_overlap_scheduler=True,
-                                   autotuner_enabled=False,
+    pytorch_config = PyTorchConfig(autotuner_enabled=False,
                                    kv_cache_dtype='auto')
     llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
diff --git a/examples/llm-api/llm_mgmn_trtllm_bench.sh b/examples/llm-api/llm_mgmn_trtllm_bench.sh
index 556f2d9e576..21a0ee48d9b 100644
--- a/examples/llm-api/llm_mgmn_trtllm_bench.sh
+++ b/examples/llm-api/llm_mgmn_trtllm_bench.sh
@@ -76,7 +76,6 @@ srun -l \
     cat > /tmp/pytorch_extra_args.txt << EOF
 pytorch_backend_config:
   use_cuda_graph: false
-  enable_overlap_scheduler: true
   cuda_graph_padding_enabled: false
 print_iter_log: true
 enable_attention_dp: false
diff --git a/examples/models/core/deepseek_v3/README.md b/examples/models/core/deepseek_v3/README.md
index cbbcf00227a..72b5196a40b 100644
--- a/examples/models/core/deepseek_v3/README.md
+++ b/examples/models/core/deepseek_v3/README.md
@@ -21,7 +21,10 @@ Please refer to [this guide](https://nvidia.github.io/TensorRT-LLM/installation/
   - [Quick Start](#quick-start)
     - [Run a single inference](#run-a-single-inference)
     - [Multi-Token Prediction (MTP)](#multi-token-prediction-mtp)
+    - [Relaxed acceptance](#relaxed-acceptance)
   - [Long context support](#long-context-support)
+    - [ISL-64k-OSL-1024](#isl-64k-osl-1024)
+    - [ISL-128k-OSL-1024](#isl-128k-osl-1024)
   - [Evaluation](#evaluation)
   - [Serving](#serving)
 - [Advanced Usages](#advanced-usages)
@@ -34,6 +37,7 @@ Please refer to [this guide](https://nvidia.github.io/TensorRT-LLM/installation/
   - [FP8 KV Cache and MLA](#fp8-kv-cache-and-mla)
   - [W4AFP8](#w4afp8)
 - [Notes and Troubleshooting](#notes-and-troubleshooting)
+  - [Known Issues](#known-issues)
 
 ## Hardware Requirements
@@ -134,7 +138,6 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
 cat < /tmp/extra-llm-api-config.yml
 pytorch_backend_config:
-  enable_overlap_scheduler: true
   use_cuda_graph: true
   cuda_graph_padding_enabled: true
   cuda_graph_batch_sizes: [1, 4, 8, 12]
@@ -163,7 +166,6 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
 cat < /tmp/extra-llm-api-config.yml
 pytorch_backend_config:
-  enable_overlap_scheduler: true
   use_cuda_graph: true
   cuda_graph_padding_enabled: true
   cuda_graph_batch_sizes: [1, 2]
@@ -190,7 +192,6 @@ Evaluate the model accuracy using `trtllm-eval`.
 cat >./extra-llm-api-config.yml <
 1:
             self.event_loop = self._executor_loop_pp
         else:
-            self.event_loop = self._executor_loop_overlap if enable_overlap_scheduler else self._executor_loop
+            self.event_loop = self._executor_loop if disable_overlap_scheduler else self._executor_loop_overlap
         if is_trace_enabled("TLLM_TRACE_EXECUTOR_LOOP"):
             self.event_loop = trace_func(self.event_loop)
@@ -1975,7 +1975,7 @@ def _handle_responses(self):
                 # If request is in transmission, so we don't need to emit a response
                 # Also, for the first iteration with overlap, we should skip since first token has already been emitted by context server
                 if request.is_disagg_generation_transmission_in_progress or (
-                        self.enable_overlap_scheduler
+                        not self.disable_overlap_scheduler
                         and request.py_decoding_iter <= 1):
                     new_active_requests.append(request)
                     continue
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
index c08b890d310..8241c4c0189 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -106,7 +106,7 @@ def create_py_executor(executor_config: ExecutorConfig,
     # PyTorchModelEngine modifies these fields, update them to executor_config
     max_seq_len = model_engine.max_seq_len
     origin_seq_len = max_seq_len
-    if pytorch_backend_config.enable_overlap_scheduler:
+    if not pytorch_backend_config.disable_overlap_scheduler:
         max_seq_len = model_engine.max_seq_len + 1
         if spec_config is not None:
             max_seq_len += spec_config.max_draft_tokens
diff --git a/tensorrt_llm/bench/benchmark/utils/general.py b/tensorrt_llm/bench/benchmark/utils/general.py
index 87755dce39a..3e7ed03cd21 100755
--- a/tensorrt_llm/bench/benchmark/utils/general.py
+++ b/tensorrt_llm/bench/benchmark/utils/general.py
@@ -148,7 +148,6 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
     pyt_options = {
         "use_cuda_graph": True,
         "cuda_graph_padding_enabled": True,
-        "enable_overlap_scheduler": True,
         "kv_cache_dtype": kv_cache_dtype,
         "cuda_graph_max_batch_size": max_batch_size,
     }
diff --git a/tensorrt_llm/commands/eval.py b/tensorrt_llm/commands/eval.py
index d1db9144714..632a956b6ff 100644
--- a/tensorrt_llm/commands/eval.py
+++ b/tensorrt_llm/commands/eval.py
@@ -115,7 +115,7 @@ def main(ctx, model: str, tokenizer: Optional[str], log_level: str,
         backend = None
     pytorch_backend_config = None
     if backend == "pytorch":
-        pytorch_backend_config = PyTorchConfig(enable_overlap_scheduler=True)
+        pytorch_backend_config = PyTorchConfig()
     llm_args = {
         "model": model,
diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py
index 1c919331a06..739c835b133 100644
--- a/tensorrt_llm/commands/serve.py
+++ b/tensorrt_llm/commands/serve.py
@@ -50,8 +50,7 @@ def get_llm_args(model: str,
     kv_cache_config = KvCacheConfig(
         free_gpu_memory_fraction=free_gpu_memory_fraction)
-    pytorch_backend_config = PyTorchConfig(
-        enable_overlap_scheduler=True) if backend == "pytorch" else None
+    pytorch_backend_config = PyTorchConfig() if backend == "pytorch" else None
     dynamic_batch_config = DynamicBatchConfig(
         enable_batch_size_tuning=True,
         enable_max_num_tokens_tuning=False,
diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py
index 5cfb360ba91..77bc65a3be4 100644
--- a/tensorrt_llm/executor/worker.py
+++ b/tensorrt_llm/executor/worker.py
@@ -384,7 +384,7 @@ def _enqueue_request(self, request: GenerationRequest) -> int:
             context_phase_params = request.disaggregated_params.get_context_phase_params(
             )
-        is_overlap_enabled = self._is_pytorch_backend and self._executor_config.pytorch_backend_config.enable_overlap_scheduler
+        is_overlap_enabled = self._is_pytorch_backend and not self._executor_config.pytorch_backend_config.disable_overlap_scheduler
         if is_overlap_enabled:
             is_disaggregated = self.engine.kv_cache_transceiver is not None
             if is_disaggregated and (
diff --git a/tensorrt_llm/scaffolding/worker.py b/tensorrt_llm/scaffolding/worker.py
index 66ceec64b8b..d133083dc9b 100644
--- a/tensorrt_llm/scaffolding/worker.py
+++ b/tensorrt_llm/scaffolding/worker.py
@@ -136,11 +136,11 @@ def init_with_new_llm(
         max_batch_size: int = 32,
         max_num_tokens: int = 4096,
         kv_cache_free_gpu_memory_fraction: float = 0.9,
-        enable_overlap_scheduler: bool = True,
+        disable_overlap_scheduler: bool = False,
     ):
         pytorch_backend_config = PyTorchConfig(
             mixed_decoder=True,
-            enable_overlap_scheduler=enable_overlap_scheduler,
+            disable_overlap_scheduler=disable_overlap_scheduler,
         )
         kv_cache_config = KvCacheConfig(
             free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction, )
diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
index d627a3f4481..c842cb7d53b 100644
--- a/tests/integration/defs/accuracy/test_disaggregated_serving.py
+++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -144,16 +144,16 @@ class TestLlama3_1_8B(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_less_device_memory(32000)
     @pytest.mark.skip_device_not_contain(["H100"])
-    @pytest.mark.parametrize("overlap_scheduler", [False, True])
-    def test_auto_dtype(self, overlap_scheduler):
+    @pytest.mark.parametrize("disable_overlap_scheduler", [False, True])
+    def test_auto_dtype(self, disable_overlap_scheduler):
         ctx_server_config = {
             "pytorch_backend_config": {
-                "enable_overlap_scheduler": False
+                "disable_overlap_scheduler": True
             }
         }
         gen_server_config = {
             "pytorch_backend_config": {
-                "enable_overlap_scheduler": overlap_scheduler
+                "disable_overlap_scheduler": disable_overlap_scheduler
             }
         }
         disaggregated_server_config = {
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index ca03cad0517..ac879df4f7e 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -63,6 +63,7 @@ def test_bfloat16(self, attn_backend, torch_compile):
             cuda_graph_padding_enabled=torch_compile,
             cuda_graph_batch_sizes=[4],
             attn_backend=attn_backend,
+            disable_overlap_scheduler=torch_compile,
         )
         llm = LLM(self.MODEL_PATH, pytorch_backend_config=pytorch_config)
         with llm:
@@ -87,6 +88,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, attn_backend,
             cuda_graph_padding_enabled=torch_compile,
             cuda_graph_batch_sizes=[4],
             attn_backend=attn_backend,
+            disable_overlap_scheduler=torch_compile,
         )
         llm = LLM(self.MODEL_PATH,
                   tensor_parallel_size=tp_size,
@@ -109,6 +111,7 @@ def test_fp8(self, fp8kv, attn_backend, torch_compile):
             cuda_graph_padding_enabled=torch_compile,
             cuda_graph_batch_sizes=[4],
             attn_backend=attn_backend,
+            disable_overlap_scheduler=torch_compile,
         )
         if fp8kv:
             quant_config.kv_cache_quant_algo = QuantAlgo.FP8
@@ -145,6 +148,7 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend,
             cuda_graph_padding_enabled=torch_compile,
             cuda_graph_batch_sizes=[4],
             attn_backend=attn_backend,
+            disable_overlap_scheduler=torch_compile,
         )
         if fp8kv:
             quant_config.kv_cache_quant_algo = QuantAlgo.FP8
@@ -319,7 +323,7 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
         # OOM on H100 with default free_gpu_memory_fraction=0.9
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         mtp_config = None
         if mtp_nextn > 0:
@@ -351,7 +355,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
         # OOM on H100 with default free_gpu_memory_fraction=0.9
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         mtp_config = None
         if mtp_nextn > 0:
@@ -384,7 +388,7 @@ def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph,
         # OOM on H100 with default free_gpu_memory_fraction=0.9
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         quant_config = QuantConfig()
@@ -435,7 +439,7 @@ def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
         # OOM on H100 with default free_gpu_memory_fraction=0.9
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         quant_config = QuantConfig()
@@ -480,7 +484,7 @@ def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                               (True, True, True, True)])
     def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler):
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         quant_config = QuantConfig()
@@ -521,7 +525,7 @@ def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler):
     def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
                          overlap_scheduler, tp_size, pp_size, ep_size):
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         quant_config = QuantConfig()
@@ -569,7 +573,7 @@ def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                          attention_dp, cuda_graph, overlap_scheduler):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         quant_config = QuantConfig()
@@ -615,7 +619,7 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                             batch_size):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         quant_config = QuantConfig()
@@ -667,7 +671,7 @@ class TestNemotronNas(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_less_device(8)
     def test_auto_dtype_tp8(self):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
-        pytorch_config = PyTorchConfig(enable_overlap_scheduler=True)
+        pytorch_config = PyTorchConfig()
         with LLM(self.MODEL_PATH,
                  tensor_parallel_size=8,
@@ -747,7 +751,7 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
     def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
                               cuda_graph, overlap_scheduler):
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8",
@@ -774,7 +778,7 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
     def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
                               cuda_graph, overlap_scheduler):
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B-FP8",
@@ -797,7 +801,7 @@ def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
     def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
                  overlap_scheduler):
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         llm = LLM(
@@ -821,7 +825,7 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
                    overlap_scheduler):
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         llm = LLM(
@@ -849,7 +853,7 @@ class TestQwen3_32B(LlmapiAccuracyTestHarness):
     def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
                               cuda_graph, overlap_scheduler):
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-32B-FP8",
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml
index 4586d86a788..7a850b121bc 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml
@@ -5,7 +5,7 @@ backend: "pytorch"
 free_gpu_memory_fraction: 0.1
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
   autotuner_enabled: False
 context_servers:
   num_instances: 2
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml
index 8f678bd51d5..2c9a83ecd65 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.15
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
   autotuner_enabled: False
 context_servers:
   num_instances: 1
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml
index 1b8132ebcaf..59db98e2ab7 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.1
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 context_servers:
   num_instances: 1
   tensor_parallel_size: 1
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml
index 37d0a6275d6..bf8b1484151 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.1
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 speculative_config:
   decoding_type: MTP
   num_nextn_predict_layers: 1
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml
index 96f06b77313..35b1cb6f4e9 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml
@@ -13,7 +13,7 @@ context_servers:
   enable_attention_dp: true
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: False
+    disable_overlap_scheduler: True
   urls:
       - "localhost:8001"
 generation_servers:
@@ -23,6 +23,6 @@ generation_servers:
   enable_attention_dp: true
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: True
+    disable_overlap_scheduler: False
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml
index 4c4fcecf662..b60de54c5eb 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 context_servers:
   num_instances: 1
   tensor_parallel_size: 2
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml
index c03f001892f..d01502cfc07 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 context_servers:
   num_instances: 1
   tensor_parallel_size: 2
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml
index 6e5e3e60f32..9f19e0699f9 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 context_servers:
   num_instances: 1
   tensor_parallel_size: 2
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml
index 1fce7be7129..ee05d96d063 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 context_servers:
   num_instances: 1
   tensor_parallel_size: 2
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml
index 150b865bd05..2c16cf7aefd 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 speculative_config:
   decoding_type: MTP
   num_nextn_predict_layers: 1
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml
index dcd6db9f9dd..b55acd05efb 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml
@@ -10,7 +10,7 @@ context_servers:
   enable_attention_dp: True
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: False
+    disable_overlap_scheduler: True
   urls:
       - "localhost:8001"
 generation_servers:
@@ -20,6 +20,6 @@ generation_servers:
   enable_attention_dp: True
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: True
+    disable_overlap_scheduler: False
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml
index df4756e1a05..9428e563d4a 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml
@@ -10,7 +10,7 @@ context_servers:
   enable_attention_dp: true
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: False
+    disable_overlap_scheduler: True
   urls:
       - "localhost:8001"
 generation_servers:
@@ -20,6 +20,6 @@ generation_servers:
   enable_attention_dp: true
   pytorch_backend_config:
     use_cuda_graph: True
-    enable_overlap_scheduler: True
+    disable_overlap_scheduler: False
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml
index 14265346982..a97ac33cb29 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml
@@ -9,7 +9,7 @@ context_servers:
   pipeline_parallel_size: 1
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: False
+    disable_overlap_scheduler: True
   urls:
       - "localhost:8001"
 generation_servers:
@@ -18,6 +18,6 @@ generation_servers:
   pipeline_parallel_size: 1
   pytorch_backend_config:
     use_cuda_graph: True
-    enable_overlap_scheduler: True
+    disable_overlap_scheduler: False
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml
index c26b84c1450..99060d86b74 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml
@@ -15,7 +15,7 @@ context_servers:
   pytorch_backend_config:
     use_cuda_graph: True
     cuda_graph_batch_sizes: [1,3000]
-    enable_overlap_scheduler: False
+    disable_overlap_scheduler: True
   urls:
       - "localhost:8001"
 generation_servers:
@@ -30,7 +30,7 @@ generation_servers:
       enable_partial_reuse: False
   pytorch_backend_config:
     use_cuda_graph: True
-    enable_overlap_scheduler: False
+    disable_overlap_scheduler: True
     cuda_graph_padding_enabled: True
     cuda_graph_batch_sizes: [1,4,8,16,24,32]
   urls:
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml
index becad3875ba..8ac4a59c5bb 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml
@@ -18,7 +18,7 @@ context_servers:
       enable_partial_reuse: False
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: False
+    disable_overlap_scheduler: True
   urls:
       - "localhost:8001"
       - "localhost:8002"
@@ -37,7 +37,7 @@ generation_servers:
       enable_partial_reuse: False
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: True
+    disable_overlap_scheduler: False
   urls:
       - "localhost:8003"
       - "localhost:8004"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml
index a4a5de9992d..290b076255d 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 context_servers:
   num_instances: 1
   tensor_parallel_size: 1
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml
index 51bb92bfd09..e35886d8b1a 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml
@@ -15,7 +15,7 @@ context_servers:
      enable_partial_reuse: False
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: False
+    disable_overlap_scheduler: True
   urls:
       - "localhost:8001"
 generation_servers:
@@ -30,6 +30,6 @@ generation_servers:
      enable_partial_reuse: False
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: True
+    disable_overlap_scheduler: False
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
index e4739ab8a9c..d8d0e2979ab 100644
--- a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
+++ b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
@@ -110,13 +110,13 @@ def verify_disaggregated(model, generation_overlap, enable_cuda_graph, prompt,
     # Context worker
     worker_pytorch_configs.append(
-        PyTorchConfig(enable_overlap_scheduler=False,
+        PyTorchConfig(disable_overlap_scheduler=True,
                       kv_cache_dtype="auto",
                       use_cuda_graph=enable_cuda_graph))
     # Generation worker
     worker_pytorch_configs.append(
-        PyTorchConfig(enable_overlap_scheduler=generation_overlap,
+        PyTorchConfig(disable_overlap_scheduler=not generation_overlap,
                       kv_cache_dtype="auto",
                       use_cuda_graph=enable_cuda_graph))
@@ -228,13 +228,13 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph,
     # Context worker
     worker_pytorch_configs.append(
-        PyTorchConfig(enable_overlap_scheduler=False,
+        PyTorchConfig(disable_overlap_scheduler=True,
                       kv_cache_dtype="auto",
                       use_cuda_graph=enable_cuda_graph))
     # Generation worker
     worker_pytorch_configs.append(
-        PyTorchConfig(enable_overlap_scheduler=generation_overlap,
+        PyTorchConfig(disable_overlap_scheduler=not generation_overlap,
                       kv_cache_dtype="auto",
                       use_cuda_graph=enable_cuda_graph))
diff --git a/tests/integration/defs/perf/model_yaml_config.py b/tests/integration/defs/perf/model_yaml_config.py
index aa995d91f09..c9ada66638e 100644
--- a/tests/integration/defs/perf/model_yaml_config.py
+++ b/tests/integration/defs/perf/model_yaml_config.py
@@ -29,7 +29,6 @@ def get_model_yaml_config(model_label: str) -> dict:
     base_config = {
         'enable_attention_dp': True,
         'pytorch_backend_config': {
-            'enable_overlap_scheduler': True,
             'print_iter_log': True,
             'use_cuda_graph': True,
             'cuda_graph_padding_enabled': True,
@@ -40,7 +39,6 @@ def get_model_yaml_config(model_label: str) -> dict:
         'deepseek_r1-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:fp8-reqs:10-ep:4-gpus:8': {
             'pytorch_backend_config': {
-                'enable_overlap_scheduler': True,
                 'use_cuda_graph': True,
             },
             'speculative_config': {
@@ -51,7 +49,6 @@ def get_model_yaml_config(model_label: str) -> dict:
         'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:nvfp4-reqs:10-ep:4-tp:8-gpus:8': {
             'pytorch_backend_config': {
-                'enable_overlap_scheduler': True,
                 'use_cuda_graph': True,
             },
             'speculative_config': {
diff --git a/tests/integration/defs/stress_test/stress_test.py b/tests/integration/defs/stress_test/stress_test.py
index da268ef7d09..5fc185f6842 100644
--- a/tests/integration/defs/stress_test/stress_test.py
+++ b/tests/integration/defs/stress_test/stress_test.py
@@ -502,9 +502,6 @@ def stress_test(config,
             "capacity_scheduler_policy":
             test_server_config.capacity_scheduler_policy
         },
-        "pytorch_backend_config": {
-            "enable_overlap_scheduler": True,
-        },
     }
     # Add DeepSeek-V3 specific configuration
@@ -519,7 +516,6 @@ def stress_test(config,
             "cuda_graph_batch_sizes":
             [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
             "print_iter_log": True,
-            "enable_overlap_scheduler": True
         }
     with tempfile.NamedTemporaryFile(mode='w',
                                      suffix='.yaml',
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index 1d8ec3d50b1..2ede4e7c766 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -425,7 +425,6 @@ def temp_extra_llm_api_options_file(request):
         if request.node.callspec.params['pytorch_backend_config']:
             extra_llm_api_options_dict["pytorch_backend_config"] = {
-                "enable_overlap_scheduler": True,
                 "use_cuda_graph": True,
                 "cuda_graph_batch_sizes": [1, 2, 3],
             }
@@ -1301,7 +1300,6 @@ def test_ptp_quickstart_advanced(llm_root, llm_venv, model_name, model_path):
                      delete_on_close=True) as running_log:
         llm_venv.run_cmd([
             str(example_root / "quickstart_advanced.py"),
-            "--enable_overlap_scheduler",
            "--enable_chunked_prefill",
            "--model_dir",
            f"{llm_models_root()}/{model_path}",
@@ -1326,7 +1324,6 @@ def test_ptq_quickstart_advanced_mtp(llm_root, llm_venv, model_name,
         llm_venv.run_cmd(
             [
                 str(example_root / "quickstart_advanced.py"),
-                "--enable_overlap_scheduler",
                 "--use_cuda_graph",
                 "--spec_decode_nextn",
                 "1",  # test 1 MTP module
@@ -1356,7 +1353,6 @@ def test_ptp_quickstart_advanced_deepseek_v3_2nodes_8gpus(
                      delete_on_close=True) as running_log:
         llm_venv.run_cmd([
             str(example_root / "quickstart_advanced.py"),
-            "--enable_overlap_scheduler",
            "--model_dir",
            f"{llm_models_root()}/{model_path}",
            "--moe_ep_size=8",
@@ -1394,6 +1390,7 @@ def test_ptp_quickstart_advanced_eagle3(llm_root, llm_venv, model_name,
            "--eagle_model_dir",
            f"{llm_models_root()}/{eagle_model_path}",
            "--disable_kv_cache_reuse",
+            "--disable_overlap_scheduler",
         ],
                          running_log=running_log)
         _check_mem_usage(running_log, [25.2, 0, 0, 0])
@@ -1417,7 +1414,6 @@ def test_ptp_quickstart_advanced_deepseek_r1_8gpus(llm_root, llm_venv,
                      delete_on_close=True) as running_log:
         llm_venv.run_cmd([
             str(example_root / "quickstart_advanced.py"),
-            "--enable_overlap_scheduler",
            "--model_dir",
            f"{llm_models_root()}/{model_path}",
            "--moe_tp_size=1",
@@ -1451,7 +1447,6 @@ def test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus(
                      delete_on_close=True) as running_log:
         llm_venv.run_cmd([
             str(example_root / "quickstart_advanced.py"),
-            "--enable_overlap_scheduler",
            "--model_dir",
            f"{llm_models_root()}/{model_path}",
"--moe_tp_size=1", @@ -1515,7 +1510,6 @@ def test_ptp_quickstart_advanced_8gpus(llm_root, llm_venv, model_name, delete_on_close=True) as running_log: llm_venv.run_cmd([ str(example_root / "quickstart_advanced.py"), - "--enable_overlap_scheduler", "--enable_chunked_prefill", "--model_dir", f"{llm_models_root()}/{model_path}", @@ -1541,7 +1535,6 @@ def test_ptp_quickstart_advanced_2gpus_sm120(llm_root, llm_venv, model_name, example_root = Path(os.path.join(llm_root, "examples", "pytorch")) llm_venv.run_cmd([ str(example_root / "quickstart_advanced.py"), - "--enable_overlap_scheduler", "--enable_chunked_prefill", "--model_dir", f"{llm_models_root()}/{model_path}", @@ -1786,7 +1779,8 @@ def test_ptp_quickstart_bert(llm_root, llm_venv, model_name, model_path, sampling_param = SamplingParams(max_tokens=32, return_context_logits=True) with LLM( model=model_dir, - pytorch_backend_config=PyTorchConfig(attn_backend=backend), + pytorch_backend_config=PyTorchConfig( + attn_backend=backend, disable_overlap_scheduler=True), ) as llm: outputs = llm.generate(prompts, sampling_params=sampling_param) diff --git a/tests/unittest/_torch/modeling/test_modeling_deepseek.py b/tests/unittest/_torch/modeling/test_modeling_deepseek.py index 550a55d18d0..f8b158cc4e1 100644 --- a/tests/unittest/_torch/modeling/test_modeling_deepseek.py +++ b/tests/unittest/_torch/modeling/test_modeling_deepseek.py @@ -57,7 +57,7 @@ def test_deepseek_trtllmgen(model_name): ] * 4 pytorch_config = PyTorchConfig( - enable_overlap_scheduler=False, + disable_overlap_scheduler=True, use_cuda_graph=False, kv_cache_dtype="auto", attn_backend="TRTLLM", diff --git a/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py b/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py index 49531c7e177..38b15ce1f1f 100644 --- a/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py +++ b/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py @@ -3,6 +3,7 @@ from parameterized import parameterized from tensorrt_llm._torch import LLM +from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.sampling_params import SamplingParams @@ -40,7 +41,9 @@ def test_llm_api(self, import_oot_code: bool): llm = LLM(model=model_dir, kv_cache_config=kv_cache_config, - max_num_tokens=2048) + max_num_tokens=2048, + pytorch_backend_config=PyTorchConfig( + disable_overlap_scheduler=True)) prompts = [ "Hello, my name is", diff --git a/tests/unittest/_torch/multi_gpu/test_star_attention.py b/tests/unittest/_torch/multi_gpu/test_star_attention.py index 3f04d993560..3938b4164fa 100644 --- a/tests/unittest/_torch/multi_gpu/test_star_attention.py +++ b/tests/unittest/_torch/multi_gpu/test_star_attention.py @@ -62,7 +62,8 @@ def test_model(backend, model_name, quant, sp_size, sa_block_size, max_output_tokens = 128 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7) pytorch_backend_config = PyTorchConfig( - attn_backend='FLASHINFER_STAR_ATTENTION') + attn_backend='FLASHINFER_STAR_ATTENTION', + disable_overlap_scheduler=True) llm = LLM(model=model_dir, backend=backend, diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py index df82ed67590..4ce920e574e 100644 --- a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py +++ b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py @@ -57,7 +57,7 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size): ] * 32 pytorch_config = PyTorchConfig( - 
-        enable_overlap_scheduler=False,
+        disable_overlap_scheduler=True,
         use_cuda_graph=False,
         kv_cache_dtype="auto",
         attn_backend=backend,
diff --git a/tests/unittest/_torch/speculative/test_eagle3.py b/tests/unittest/_torch/speculative/test_eagle3.py
index b6b210b99b2..5b3094bd3aa 100644
--- a/tests/unittest/_torch/speculative/test_eagle3.py
+++ b/tests/unittest/_torch/speculative/test_eagle3.py
@@ -25,7 +25,7 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str):
     models_path = llm_models_root()
     pytorch_config = PyTorchConfig(
-        enable_overlap_scheduler=False,
+        disable_overlap_scheduler=True,
         use_cuda_graph=use_cuda_graph,
         # Only create a single CUDA graph to prevent OOM in CI
         attn_backend=attn_backend,
diff --git a/tests/unittest/_torch/test_overlap_scheduler.py b/tests/unittest/_torch/test_overlap_scheduler.py
index af87717b55f..34402ab0929 100644
--- a/tests/unittest/_torch/test_overlap_scheduler.py
+++ b/tests/unittest/_torch/test_overlap_scheduler.py
@@ -22,11 +22,11 @@ def model_path():
     return llm_models_root() / "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
-def create_llm(model_dir, enable_overlap_scheduler, enable_trtllm_decoder):
+def create_llm(model_dir, disable_overlap_scheduler, enable_trtllm_decoder):
     """Create LLM with specific overlap scheduler setting"""
     pytorch_config = PyTorchConfig(
         use_cuda_graph=True,
-        enable_overlap_scheduler=enable_overlap_scheduler,
+        disable_overlap_scheduler=disable_overlap_scheduler,
         enable_trtllm_decoder=enable_trtllm_decoder)
     trt_kv_cache_config = TRT_KvCacheConfig(enable_block_reuse=False)
@@ -62,7 +62,7 @@ def test_overlap_scheduler_consistency(model_path, test_case,
     # Test with overlap scheduler enabled
     llm = create_llm(model_path,
-                     enable_overlap_scheduler=True,
+                     disable_overlap_scheduler=False,
                      enable_trtllm_decoder=enable_trtllm_decoder)
     outputs_with_overlap = llm.generate(prompts,
                                         sampling_params=sampling_config,
@@ -74,7 +74,7 @@ def test_overlap_scheduler_consistency(model_path, test_case,
     # Test with overlap scheduler disabled
     llm = create_llm(model_path,
-                     enable_overlap_scheduler=False,
+                     disable_overlap_scheduler=True,
                      enable_trtllm_decoder=enable_trtllm_decoder)
     outputs_without_overlap = llm.generate(prompts,
                                            sampling_params=sampling_config,
diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
index cd298967a31..a2278c0d996 100644
--- a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
+++ b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
@@ -26,7 +26,7 @@ def temp_extra_llm_api_options_file(request):
     extra_llm_api_options_dict = {
         "guided_decoding_backend": "xgrammar",
         "pytorch_backend_config": {
-            "enable_overlap_scheduler": False,
+            "disable_overlap_scheduler": True,
         }
     }
diff --git a/tests/unittest/llmapi/apps/_test_openai_metrics.py b/tests/unittest/llmapi/apps/_test_openai_metrics.py
index 2e3fd474122..e79c34da311 100755
--- a/tests/unittest/llmapi/apps/_test_openai_metrics.py
+++ b/tests/unittest/llmapi/apps/_test_openai_metrics.py
@@ -24,9 +24,7 @@ def client():
               kv_cache_config=KvCacheConfig(),
               backend="pytorch",
               pytorch_backend_config=PyTorchConfig(
-                  enable_overlap_scheduler=True,
-                  enable_iter_perf_stats=True,
-              ))
+                  enable_iter_perf_stats=True, ))
     hf_tokenizer = AutoTokenizer.from_pretrained(llama_model_path)
     app_instance = OpenAIServer(llm,
diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py
index 2ddb709930b..5ddd1a10e33 100644
--- a/tests/unittest/llmapi/test_llm.py
+++ b/tests/unittest/llmapi/test_llm.py
@@ -1875,7 +1875,7 @@ def llm_get_stats_test_harness(tp_size: int = 1,
         llm_args_extra["pytorch_backend_config"] = PyTorchConfig(
             enable_iter_perf_stats=True,
             enable_iter_req_stats=enable_iter_req_stats,
-            enable_overlap_scheduler=use_overlap)
+            disable_overlap_scheduler=not use_overlap)
         LLM_CLASS = LLM_torch
     else:
         LLM_CLASS = LLM
@@ -1944,8 +1944,8 @@ def llm_get_stats_async_test_harness(tp_size: int = 1,
         from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
         llm_args_extra["pytorch_backend_config"] = PyTorchConfig(
             enable_iter_perf_stats=True,
-            enable_overlap_scheduler=use_overlap,
-            enable_iter_req_stats=enable_iter_req_stats)
+            enable_iter_req_stats=enable_iter_req_stats,
+            disable_overlap_scheduler=not use_overlap)
         LLM_CLASS = LLM_torch
     else:
         LLM_CLASS = LLM
diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py
index d5c6635c2f8..b2bc5529002 100644
--- a/tests/unittest/llmapi/test_llm_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_pytorch.py
@@ -82,9 +82,9 @@ def test_llm_reward_model():
     from tensorrt_llm._torch import LLM as LLM_torch
     from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
-    llm = LLM_torch(
-        model=rm_model_path,
-        pytorch_backend_config=PyTorchConfig(attn_backend="VANILLA"))
+    llm = LLM_torch(model=rm_model_path,
+                    pytorch_backend_config=PyTorchConfig(
+                        attn_backend="VANILLA", disable_overlap_scheduler=True))
     sampling_params = SamplingParams(return_context_logits=True)