diff --git a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md
index 27512b16e5f..c50e73b8fd2 100644
--- a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md
+++ b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md
@@ -135,7 +135,6 @@ YOUR_DATA_PATH=
 cat >./extra-llm-api-config.yml<
 cat >./extra-llm-api-config.yml<
 context_extra-llm-api-config.yml
+echo -e "pytorch_backend_config:\n disable_overlap_scheduler: True\ncache_transceiver_config:\n max_num_tokens: 2048" > context_extra-llm-api-config.yml
 echo -e "cache_transceiver_config:\n max_num_tokens: 2048" > gen_extra-llm-api-config.yml
 export TRTLLM_USE_UCX_KVCACHE=1
@@ -65,7 +65,7 @@ model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 context_servers:
   num_instances: 1
   tensor_parallel_size: 1
diff --git a/examples/disaggregated/disagg_config.yaml b/examples/disaggregated/disagg_config.yaml
index 391ef87e8d2..a199a594522 100644
--- a/examples/disaggregated/disagg_config.yaml
+++ b/examples/disaggregated/disagg_config.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 context_servers:
   num_instances: 1
   tensor_parallel_size: 1
diff --git a/examples/llm-api/llm_inference_kv_events.py b/examples/llm-api/llm_inference_kv_events.py
index 69b9dc95a29..827427e538b 100644
--- a/examples/llm-api/llm_inference_kv_events.py
+++ b/examples/llm-api/llm_inference_kv_events.py
@@ -6,8 +6,7 @@
 def main():
-    pytorch_config = PyTorchConfig(enable_overlap_scheduler=True,
-                                   autotuner_enabled=False,
+    pytorch_config = PyTorchConfig(autotuner_enabled=False,
                                    kv_cache_dtype='auto')
     llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
diff --git a/examples/llm-api/llm_mgmn_trtllm_bench.sh b/examples/llm-api/llm_mgmn_trtllm_bench.sh
index 556f2d9e576..21a0ee48d9b 100644
--- a/examples/llm-api/llm_mgmn_trtllm_bench.sh
+++ b/examples/llm-api/llm_mgmn_trtllm_bench.sh
@@ -76,7 +76,6 @@ srun -l \
     cat > /tmp/pytorch_extra_args.txt << EOF
 pytorch_backend_config:
   use_cuda_graph: false
-  enable_overlap_scheduler: true
   cuda_graph_padding_enabled: false
 print_iter_log: true
 enable_attention_dp: false
diff --git a/examples/models/core/deepseek_v3/README.md b/examples/models/core/deepseek_v3/README.md
index cbbcf00227a..72b5196a40b 100644
--- a/examples/models/core/deepseek_v3/README.md
+++ b/examples/models/core/deepseek_v3/README.md
@@ -21,7 +21,10 @@ Please refer to [this guide](https://nvidia.github.io/TensorRT-LLM/installation/
   - [Quick Start](#quick-start)
     - [Run a single inference](#run-a-single-inference)
     - [Multi-Token Prediction (MTP)](#multi-token-prediction-mtp)
+    - [Relaxed acceptance](#relaxed-acceptance)
   - [Long context support](#long-context-support)
+    - [ISL-64k-OSL-1024](#isl-64k-osl-1024)
+    - [ISL-128k-OSL-1024](#isl-128k-osl-1024)
   - [Evaluation](#evaluation)
   - [Serving](#serving)
 - [Advanced Usages](#advanced-usages)
@@ -34,6 +37,7 @@ Please refer to [this guide](https://nvidia.github.io/TensorRT-LLM/installation/
   - [FP8 KV Cache and MLA](#fp8-kv-cache-and-mla)
   - [W4AFP8](#w4afp8)
 - [Notes and Troubleshooting](#notes-and-troubleshooting)
+  - [Known Issues](#known-issues)
 
 ## Hardware Requirements
@@ -134,7 +138,6 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
 cat < /tmp/extra-llm-api-config.yml
 pytorch_backend_config:
-  enable_overlap_scheduler: true
   use_cuda_graph: true
   cuda_graph_padding_enabled: true
   cuda_graph_batch_sizes: [1, 4, 8, 12]
@@ -163,7 +166,6 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
 cat < /tmp/extra-llm-api-config.yml
 pytorch_backend_config:
-  enable_overlap_scheduler: true
   use_cuda_graph: true
   cuda_graph_padding_enabled: true
   cuda_graph_batch_sizes: [1, 2]
@@ -190,7 +192,6 @@ Evaluate the model accuracy using `trtllm-eval`.
 cat >./extra-llm-api-config.yml <
 1:
             self.event_loop = self._executor_loop_pp
         else:
-            self.event_loop = self._executor_loop_overlap if enable_overlap_scheduler else self._executor_loop
+            self.event_loop = self._executor_loop if disable_overlap_scheduler else self._executor_loop_overlap
         if is_trace_enabled("TLLM_TRACE_EXECUTOR_LOOP"):
             self.event_loop = trace_func(self.event_loop)
@@ -1975,7 +1975,7 @@ def _handle_responses(self):
                 # If request is in transmission, so we don't need to emit a response
                 # Also, for the first iteration with overlap, we should skip since first token has already been emitted by context server
                 if request.is_disagg_generation_transmission_in_progress or (
-                        self.enable_overlap_scheduler
+                        not self.disable_overlap_scheduler
                         and request.py_decoding_iter <= 1):
                     new_active_requests.append(request)
                     continue
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
index c08b890d310..8241c4c0189 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -106,7 +106,7 @@ def create_py_executor(executor_config: ExecutorConfig,
     # PyTorchModelEngine modifies these fields, update them to executor_config
     max_seq_len = model_engine.max_seq_len
     origin_seq_len = max_seq_len
-    if pytorch_backend_config.enable_overlap_scheduler:
+    if not pytorch_backend_config.disable_overlap_scheduler:
         max_seq_len = model_engine.max_seq_len + 1
         if spec_config is not None:
             max_seq_len += spec_config.max_draft_tokens
diff --git a/tensorrt_llm/bench/benchmark/utils/general.py b/tensorrt_llm/bench/benchmark/utils/general.py
index 87755dce39a..3e7ed03cd21 100755
--- a/tensorrt_llm/bench/benchmark/utils/general.py
+++ b/tensorrt_llm/bench/benchmark/utils/general.py
@@ -148,7 +148,6 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
     pyt_options = {
         "use_cuda_graph": True,
         "cuda_graph_padding_enabled": True,
-        "enable_overlap_scheduler": True,
         "kv_cache_dtype": kv_cache_dtype,
         "cuda_graph_max_batch_size": max_batch_size,
     }
diff --git a/tensorrt_llm/commands/eval.py b/tensorrt_llm/commands/eval.py
index d1db9144714..632a956b6ff 100644
--- a/tensorrt_llm/commands/eval.py
+++ b/tensorrt_llm/commands/eval.py
@@ -115,7 +115,7 @@ def main(ctx, model: str, tokenizer: Optional[str], log_level: str,
         backend = None
     pytorch_backend_config = None
     if backend == "pytorch":
-        pytorch_backend_config = PyTorchConfig(enable_overlap_scheduler=True)
+        pytorch_backend_config = PyTorchConfig()
     llm_args = {
         "model": model,
diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py
index 1c919331a06..739c835b133 100644
--- a/tensorrt_llm/commands/serve.py
+++ b/tensorrt_llm/commands/serve.py
@@ -50,8 +50,7 @@ def get_llm_args(model: str,
     kv_cache_config = KvCacheConfig(
         free_gpu_memory_fraction=free_gpu_memory_fraction)
-    pytorch_backend_config = PyTorchConfig(
-        enable_overlap_scheduler=True) if backend == "pytorch" else None
+    pytorch_backend_config = PyTorchConfig() if backend == "pytorch" else None
     dynamic_batch_config = DynamicBatchConfig(
         enable_batch_size_tuning=True,
         enable_max_num_tokens_tuning=False,
diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py
index 5cfb360ba91..77bc65a3be4 100644
--- a/tensorrt_llm/executor/worker.py
+++ b/tensorrt_llm/executor/worker.py
@@ -384,7 +384,7 @@ def _enqueue_request(self, request: GenerationRequest) -> int:
             context_phase_params = request.disaggregated_params.get_context_phase_params(
             )
-        is_overlap_enabled = self._is_pytorch_backend and self._executor_config.pytorch_backend_config.enable_overlap_scheduler
+        is_overlap_enabled = self._is_pytorch_backend and not self._executor_config.pytorch_backend_config.disable_overlap_scheduler
         if is_overlap_enabled:
             is_disaggregated = self.engine.kv_cache_transceiver is not None
             if is_disaggregated and (
diff --git a/tensorrt_llm/scaffolding/worker.py b/tensorrt_llm/scaffolding/worker.py
index 66ceec64b8b..d133083dc9b 100644
--- a/tensorrt_llm/scaffolding/worker.py
+++ b/tensorrt_llm/scaffolding/worker.py
@@ -136,11 +136,11 @@ def init_with_new_llm(
         max_batch_size: int = 32,
         max_num_tokens: int = 4096,
         kv_cache_free_gpu_memory_fraction: float = 0.9,
-        enable_overlap_scheduler: bool = True,
+        disable_overlap_scheduler: bool = False,
     ):
         pytorch_backend_config = PyTorchConfig(
             mixed_decoder=True,
-            enable_overlap_scheduler=enable_overlap_scheduler,
+            disable_overlap_scheduler=disable_overlap_scheduler,
         )
         kv_cache_config = KvCacheConfig(
             free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction, )
diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
index d627a3f4481..c842cb7d53b 100644
--- a/tests/integration/defs/accuracy/test_disaggregated_serving.py
+++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -144,16 +144,16 @@ class TestLlama3_1_8B(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_less_device_memory(32000)
     @pytest.mark.skip_device_not_contain(["H100"])
-    @pytest.mark.parametrize("overlap_scheduler", [False, True])
-    def test_auto_dtype(self, overlap_scheduler):
+    @pytest.mark.parametrize("disable_overlap_scheduler", [False, True])
+    def test_auto_dtype(self, disable_overlap_scheduler):
         ctx_server_config = {
             "pytorch_backend_config": {
-                "enable_overlap_scheduler": False
+                "disable_overlap_scheduler": True
             }
         }
         gen_server_config = {
             "pytorch_backend_config": {
-                "enable_overlap_scheduler": overlap_scheduler
+                "disable_overlap_scheduler": disable_overlap_scheduler
             }
         }
         disaggregated_server_config = {
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index ca03cad0517..ac879df4f7e 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -63,6 +63,7 @@ def test_bfloat16(self, attn_backend, torch_compile):
             cuda_graph_padding_enabled=torch_compile,
             cuda_graph_batch_sizes=[4],
             attn_backend=attn_backend,
+            disable_overlap_scheduler=torch_compile,
         )
         llm = LLM(self.MODEL_PATH, pytorch_backend_config=pytorch_config)
         with llm:
@@ -87,6 +88,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, attn_backend,
             cuda_graph_padding_enabled=torch_compile,
             cuda_graph_batch_sizes=[4],
             attn_backend=attn_backend,
+            disable_overlap_scheduler=torch_compile,
         )
         llm = LLM(self.MODEL_PATH,
                   tensor_parallel_size=tp_size,
@@ -109,6 +111,7 @@ def test_fp8(self, fp8kv, attn_backend, torch_compile):
             cuda_graph_padding_enabled=torch_compile,
             cuda_graph_batch_sizes=[4],
             attn_backend=attn_backend,
+            disable_overlap_scheduler=torch_compile,
         )
         if fp8kv:
             quant_config.kv_cache_quant_algo = QuantAlgo.FP8
@@ -145,6 +148,7 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend,
             cuda_graph_padding_enabled=torch_compile,
             cuda_graph_batch_sizes=[4],
             attn_backend=attn_backend,
+            disable_overlap_scheduler=torch_compile,
         )
         if fp8kv:
             quant_config.kv_cache_quant_algo = QuantAlgo.FP8
@@ -319,7 +323,7 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
         # OOM on H100 with default free_gpu_memory_fraction=0.9
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         mtp_config = None
         if mtp_nextn > 0:
@@ -351,7 +355,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
         # OOM on H100 with default free_gpu_memory_fraction=0.9
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         mtp_config = None
         if mtp_nextn > 0:
@@ -384,7 +388,7 @@ def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph,
         # OOM on H100 with default free_gpu_memory_fraction=0.9
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         quant_config = QuantConfig()
@@ -435,7 +439,7 @@ def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
         # OOM on H100 with default free_gpu_memory_fraction=0.9
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         quant_config = QuantConfig()
@@ -480,7 +484,7 @@ def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                               (True, True, True, True)])
     def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler):
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         quant_config = QuantConfig()
@@ -521,7 +525,7 @@ def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler):
     def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
                          overlap_scheduler, tp_size, pp_size, ep_size):
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         quant_config = QuantConfig()
@@ -569,7 +573,7 @@ def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                          attention_dp, cuda_graph, overlap_scheduler):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         quant_config = QuantConfig()
@@ -615,7 +619,7 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                             batch_size):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         quant_config = QuantConfig()
@@ -667,7 +671,7 @@ class TestNemotronNas(LlmapiAccuracyTestHarness):
     @pytest.mark.skip_less_device(8)
     def test_auto_dtype_tp8(self):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
-        pytorch_config = PyTorchConfig(enable_overlap_scheduler=True)
+        pytorch_config = PyTorchConfig()
         with LLM(self.MODEL_PATH,
                  tensor_parallel_size=8,
@@ -747,7 +751,7 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
     def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
                               cuda_graph, overlap_scheduler):
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8",
@@ -774,7 +778,7 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
     def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
                               cuda_graph, overlap_scheduler):
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B-FP8",
@@ -797,7 +801,7 @@ def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
     def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
                  overlap_scheduler):
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         llm = LLM(
@@ -821,7 +825,7 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
                    overlap_scheduler):
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         llm = LLM(
@@ -849,7 +853,7 @@ class TestQwen3_32B(LlmapiAccuracyTestHarness):
     def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
                               cuda_graph, overlap_scheduler):
         pytorch_config = PyTorchConfig(
-            enable_overlap_scheduler=overlap_scheduler,
+            disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph)
         llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-32B-FP8",
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml
index 4586d86a788..7a850b121bc 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml
@@ -5,7 +5,7 @@ backend: "pytorch"
 free_gpu_memory_fraction: 0.1
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
   autotuner_enabled: False
 context_servers:
   num_instances: 2
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml
index 8f678bd51d5..2c9a83ecd65 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.15
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
   autotuner_enabled: False
 context_servers:
   num_instances: 1
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml
index 1b8132ebcaf..59db98e2ab7 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.1
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 context_servers:
   num_instances: 1
   tensor_parallel_size: 1
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml
index 37d0a6275d6..bf8b1484151 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.1
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 speculative_config:
   decoding_type: MTP
   num_nextn_predict_layers: 1
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml
index 96f06b77313..35b1cb6f4e9 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml
@@ -13,7 +13,7 @@ context_servers:
   enable_attention_dp: true
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: False
+    disable_overlap_scheduler: True
   urls:
       - "localhost:8001"
 generation_servers:
@@ -23,6 +23,6 @@ generation_servers:
   enable_attention_dp: true
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: True
+    disable_overlap_scheduler: False
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml
index 4c4fcecf662..b60de54c5eb 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 context_servers:
   num_instances: 1
   tensor_parallel_size: 2
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml
index c03f001892f..d01502cfc07 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 context_servers:
   num_instances: 1
   tensor_parallel_size: 2
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml
index 6e5e3e60f32..9f19e0699f9 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 context_servers:
   num_instances: 1
   tensor_parallel_size: 2
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml
index 1fce7be7129..ee05d96d063 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 context_servers:
   num_instances: 1
   tensor_parallel_size: 2
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml
index 150b865bd05..2c16cf7aefd 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 speculative_config:
   decoding_type: MTP
   num_nextn_predict_layers: 1
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml
index dcd6db9f9dd..b55acd05efb 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml
@@ -10,7 +10,7 @@ context_servers:
   enable_attention_dp: True
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: False
+    disable_overlap_scheduler: True
   urls:
       - "localhost:8001"
 generation_servers:
@@ -20,6 +20,6 @@ generation_servers:
   enable_attention_dp: True
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: True
+    disable_overlap_scheduler: False
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml
index df4756e1a05..9428e563d4a 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml
@@ -10,7 +10,7 @@ context_servers:
   enable_attention_dp: true
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: False
+    disable_overlap_scheduler: True
   urls:
       - "localhost:8001"
 generation_servers:
@@ -20,6 +20,6 @@ generation_servers:
   enable_attention_dp: true
   pytorch_backend_config:
     use_cuda_graph: True
-    enable_overlap_scheduler: True
+    disable_overlap_scheduler: False
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml
index 14265346982..a97ac33cb29 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml
@@ -9,7 +9,7 @@ context_servers:
   pipeline_parallel_size: 1
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: False
+    disable_overlap_scheduler: True
   urls:
       - "localhost:8001"
 generation_servers:
@@ -18,6 +18,6 @@ generation_servers:
   pipeline_parallel_size: 1
   pytorch_backend_config:
     use_cuda_graph: True
-    enable_overlap_scheduler: True
+    disable_overlap_scheduler: False
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml
index c26b84c1450..99060d86b74 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml
@@ -15,7 +15,7 @@ context_servers:
   pytorch_backend_config:
     use_cuda_graph: True
     cuda_graph_batch_sizes: [1,3000]
-    enable_overlap_scheduler: False
+    disable_overlap_scheduler: True
   urls:
       - "localhost:8001"
 generation_servers:
@@ -30,7 +30,7 @@ generation_servers:
       enable_partial_reuse: False
   pytorch_backend_config:
     use_cuda_graph: True
-    enable_overlap_scheduler: False
+    disable_overlap_scheduler: True
     cuda_graph_padding_enabled: True
     cuda_graph_batch_sizes: [1,4,8,16,24,32]
   urls:
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml
index becad3875ba..8ac4a59c5bb 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml
@@ -18,7 +18,7 @@ context_servers:
       enable_partial_reuse: False
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: False
+    disable_overlap_scheduler: True
   urls:
       - "localhost:8001"
       - "localhost:8002"
@@ -37,7 +37,7 @@ generation_servers:
       enable_partial_reuse: False
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: True
+    disable_overlap_scheduler: False
   urls:
       - "localhost:8003"
       - "localhost:8004"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml
index a4a5de9992d..290b076255d 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml
@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 context_servers:
   num_instances: 1
   tensor_parallel_size: 1
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml
index 51bb92bfd09..e35886d8b1a 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml
@@ -15,7 +15,7 @@ context_servers:
      enable_partial_reuse: False
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: False
+    disable_overlap_scheduler: True
   urls:
       - "localhost:8001"
 generation_servers:
@@ -30,6 +30,6 @@ generation_servers:
      enable_partial_reuse: False
   pytorch_backend_config:
     use_cuda_graph: False
-    enable_overlap_scheduler: True
+    disable_overlap_scheduler: False
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
index e4739ab8a9c..d8d0e2979ab 100644
--- a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
+++ b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
@@ -110,13 +110,13 @@ def verify_disaggregated(model, generation_overlap, enable_cuda_graph, prompt,
     # Context worker
     worker_pytorch_configs.append(
-        PyTorchConfig(enable_overlap_scheduler=False,
+        PyTorchConfig(disable_overlap_scheduler=True,
                       kv_cache_dtype="auto",
                       use_cuda_graph=enable_cuda_graph))
     # Generation worker
     worker_pytorch_configs.append(
-        PyTorchConfig(enable_overlap_scheduler=generation_overlap,
+        PyTorchConfig(disable_overlap_scheduler=not generation_overlap,
                       kv_cache_dtype="auto",
                       use_cuda_graph=enable_cuda_graph))
@@ -228,13 +228,13 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph,
     # Context worker
     worker_pytorch_configs.append(
-        PyTorchConfig(enable_overlap_scheduler=False,
+        PyTorchConfig(disable_overlap_scheduler=True,
                       kv_cache_dtype="auto",
                       use_cuda_graph=enable_cuda_graph))
     # Generation worker
     worker_pytorch_configs.append(
-        PyTorchConfig(enable_overlap_scheduler=generation_overlap,
+        PyTorchConfig(disable_overlap_scheduler=not generation_overlap,
                       kv_cache_dtype="auto",
                       use_cuda_graph=enable_cuda_graph))
diff --git a/tests/integration/defs/perf/model_yaml_config.py b/tests/integration/defs/perf/model_yaml_config.py
index aa995d91f09..c9ada66638e 100644
--- a/tests/integration/defs/perf/model_yaml_config.py
+++ b/tests/integration/defs/perf/model_yaml_config.py
@@ -29,7 +29,6 @@ def get_model_yaml_config(model_label: str) -> dict:
     base_config = {
         'enable_attention_dp': True,
         'pytorch_backend_config': {
-            'enable_overlap_scheduler': True,
             'print_iter_log': True,
             'use_cuda_graph': True,
             'cuda_graph_padding_enabled': True,
@@ -40,7 +39,6 @@ def get_model_yaml_config(model_label: str) -> dict:
         'deepseek_r1-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:fp8-reqs:10-ep:4-gpus:8': {
             'pytorch_backend_config': {
-                'enable_overlap_scheduler': True,
                 'use_cuda_graph': True,
             },
             'speculative_config': {
@@ -51,7 +49,6 @@ def get_model_yaml_config(model_label: str) -> dict:
         'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:nvfp4-reqs:10-ep:4-tp:8-gpus:8': {
             'pytorch_backend_config': {
-                'enable_overlap_scheduler': True,
                 'use_cuda_graph': True,
             },
             'speculative_config': {
diff --git a/tests/integration/defs/stress_test/stress_test.py b/tests/integration/defs/stress_test/stress_test.py
index da268ef7d09..5fc185f6842 100644
--- a/tests/integration/defs/stress_test/stress_test.py
+++ b/tests/integration/defs/stress_test/stress_test.py
@@ -502,9 +502,6 @@ def stress_test(config,
             "capacity_scheduler_policy":
             test_server_config.capacity_scheduler_policy
         },
-        "pytorch_backend_config": {
-            "enable_overlap_scheduler": True,
-        },
     }
     # Add DeepSeek-V3 specific configuration
@@ -519,7 +516,6 @@ def stress_test(config,
             "cuda_graph_batch_sizes":
             [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
             "print_iter_log": True,
-            "enable_overlap_scheduler": True
         }
     with tempfile.NamedTemporaryFile(mode='w',
                                      suffix='.yaml',
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index 1d8ec3d50b1..2ede4e7c766 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -425,7 +425,6 @@ def temp_extra_llm_api_options_file(request):
         if request.node.callspec.params['pytorch_backend_config']:
             extra_llm_api_options_dict["pytorch_backend_config"] = {
-                "enable_overlap_scheduler": True,
                 "use_cuda_graph": True,
                 "cuda_graph_batch_sizes": [1, 2, 3],
             }
@@ -1301,7 +1300,6 @@ def test_ptp_quickstart_advanced(llm_root, llm_venv, model_name, model_path):
                      delete_on_close=True) as running_log:
         llm_venv.run_cmd([
             str(example_root / "quickstart_advanced.py"),
-            "--enable_overlap_scheduler",
            "--enable_chunked_prefill",
            "--model_dir",
            f"{llm_models_root()}/{model_path}",
@@ -1326,7 +1324,6 @@ def test_ptq_quickstart_advanced_mtp(llm_root, llm_venv, model_name,
         llm_venv.run_cmd(
             [
                 str(example_root / "quickstart_advanced.py"),
-                "--enable_overlap_scheduler",
                 "--use_cuda_graph",
                 "--spec_decode_nextn",
                 "1",  # test 1 MTP module
@@ -1356,7 +1353,6 @@ def test_ptp_quickstart_advanced_deepseek_v3_2nodes_8gpus(
                      delete_on_close=True) as running_log:
         llm_venv.run_cmd([
             str(example_root / "quickstart_advanced.py"),
-            "--enable_overlap_scheduler",
            "--model_dir",
            f"{llm_models_root()}/{model_path}",
            "--moe_ep_size=8",
@@ -1394,6 +1390,7 @@ def test_ptp_quickstart_advanced_eagle3(llm_root, llm_venv, model_name,
            "--eagle_model_dir",
            f"{llm_models_root()}/{eagle_model_path}",
            "--disable_kv_cache_reuse",
+            "--disable_overlap_scheduler",
         ],
                          running_log=running_log)
         _check_mem_usage(running_log, [25.2, 0, 0, 0])
@@ -1417,7 +1414,6 @@ def test_ptp_quickstart_advanced_deepseek_r1_8gpus(llm_root, llm_venv,
                      delete_on_close=True) as running_log:
         llm_venv.run_cmd([
             str(example_root / "quickstart_advanced.py"),
-            "--enable_overlap_scheduler",
            "--model_dir",
            f"{llm_models_root()}/{model_path}",
            "--moe_tp_size=1",
@@ -1451,7 +1447,6 @@ def test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus(
                      delete_on_close=True) as running_log:
         llm_venv.run_cmd([
             str(example_root / "quickstart_advanced.py"),
-            "--enable_overlap_scheduler",
            "--model_dir",
            f"{llm_models_root()}/{model_path}",
"--moe_tp_size=1", @@ -1515,7 +1510,6 @@ def test_ptp_quickstart_advanced_8gpus(llm_root, llm_venv, model_name, delete_on_close=True) as running_log: llm_venv.run_cmd([ str(example_root / "quickstart_advanced.py"), - "--enable_overlap_scheduler", "--enable_chunked_prefill", "--model_dir", f"{llm_models_root()}/{model_path}", @@ -1541,7 +1535,6 @@ def test_ptp_quickstart_advanced_2gpus_sm120(llm_root, llm_venv, model_name, example_root = Path(os.path.join(llm_root, "examples", "pytorch")) llm_venv.run_cmd([ str(example_root / "quickstart_advanced.py"), - "--enable_overlap_scheduler", "--enable_chunked_prefill", "--model_dir", f"{llm_models_root()}/{model_path}", @@ -1786,7 +1779,8 @@ def test_ptp_quickstart_bert(llm_root, llm_venv, model_name, model_path, sampling_param = SamplingParams(max_tokens=32, return_context_logits=True) with LLM( model=model_dir, - pytorch_backend_config=PyTorchConfig(attn_backend=backend), + pytorch_backend_config=PyTorchConfig( + attn_backend=backend, disable_overlap_scheduler=True), ) as llm: outputs = llm.generate(prompts, sampling_params=sampling_param) diff --git a/tests/unittest/_torch/modeling/test_modeling_deepseek.py b/tests/unittest/_torch/modeling/test_modeling_deepseek.py index 550a55d18d0..f8b158cc4e1 100644 --- a/tests/unittest/_torch/modeling/test_modeling_deepseek.py +++ b/tests/unittest/_torch/modeling/test_modeling_deepseek.py @@ -57,7 +57,7 @@ def test_deepseek_trtllmgen(model_name): ] * 4 pytorch_config = PyTorchConfig( - enable_overlap_scheduler=False, + disable_overlap_scheduler=True, use_cuda_graph=False, kv_cache_dtype="auto", attn_backend="TRTLLM", diff --git a/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py b/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py index 49531c7e177..38b15ce1f1f 100644 --- a/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py +++ b/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py @@ -3,6 +3,7 @@ from parameterized import parameterized from tensorrt_llm._torch import LLM +from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.sampling_params import SamplingParams @@ -40,7 +41,9 @@ def test_llm_api(self, import_oot_code: bool): llm = LLM(model=model_dir, kv_cache_config=kv_cache_config, - max_num_tokens=2048) + max_num_tokens=2048, + pytorch_backend_config=PyTorchConfig( + disable_overlap_scheduler=True)) prompts = [ "Hello, my name is", diff --git a/tests/unittest/_torch/multi_gpu/test_star_attention.py b/tests/unittest/_torch/multi_gpu/test_star_attention.py index 3f04d993560..3938b4164fa 100644 --- a/tests/unittest/_torch/multi_gpu/test_star_attention.py +++ b/tests/unittest/_torch/multi_gpu/test_star_attention.py @@ -62,7 +62,8 @@ def test_model(backend, model_name, quant, sp_size, sa_block_size, max_output_tokens = 128 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7) pytorch_backend_config = PyTorchConfig( - attn_backend='FLASHINFER_STAR_ATTENTION') + attn_backend='FLASHINFER_STAR_ATTENTION', + disable_overlap_scheduler=True) llm = LLM(model=model_dir, backend=backend, diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py index df82ed67590..4ce920e574e 100644 --- a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py +++ b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py @@ -57,7 +57,7 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size): ] * 32 pytorch_config = PyTorchConfig( - 
-        enable_overlap_scheduler=False,
+        disable_overlap_scheduler=True,
         use_cuda_graph=False,
         kv_cache_dtype="auto",
         attn_backend=backend,
diff --git a/tests/unittest/_torch/speculative/test_eagle3.py b/tests/unittest/_torch/speculative/test_eagle3.py
index b6b210b99b2..5b3094bd3aa 100644
--- a/tests/unittest/_torch/speculative/test_eagle3.py
+++ b/tests/unittest/_torch/speculative/test_eagle3.py
@@ -25,7 +25,7 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str):
     models_path = llm_models_root()
     pytorch_config = PyTorchConfig(
-        enable_overlap_scheduler=False,
+        disable_overlap_scheduler=True,
         use_cuda_graph=use_cuda_graph,
         # Only create a single CUDA graph to prevent OOM in CI
         attn_backend=attn_backend,
diff --git a/tests/unittest/_torch/test_overlap_scheduler.py b/tests/unittest/_torch/test_overlap_scheduler.py
index af87717b55f..34402ab0929 100644
--- a/tests/unittest/_torch/test_overlap_scheduler.py
+++ b/tests/unittest/_torch/test_overlap_scheduler.py
@@ -22,11 +22,11 @@ def model_path():
     return llm_models_root() / "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
-def create_llm(model_dir, enable_overlap_scheduler, enable_trtllm_decoder):
+def create_llm(model_dir, disable_overlap_scheduler, enable_trtllm_decoder):
     """Create LLM with specific overlap scheduler setting"""
     pytorch_config = PyTorchConfig(
         use_cuda_graph=True,
-        enable_overlap_scheduler=enable_overlap_scheduler,
+        disable_overlap_scheduler=disable_overlap_scheduler,
         enable_trtllm_decoder=enable_trtllm_decoder)
     trt_kv_cache_config = TRT_KvCacheConfig(enable_block_reuse=False)
@@ -62,7 +62,7 @@ def test_overlap_scheduler_consistency(model_path, test_case,
     # Test with overlap scheduler enabled
     llm = create_llm(model_path,
-                     enable_overlap_scheduler=True,
+                     disable_overlap_scheduler=False,
                      enable_trtllm_decoder=enable_trtllm_decoder)
     outputs_with_overlap = llm.generate(prompts,
                                         sampling_params=sampling_config,
@@ -74,7 +74,7 @@ def test_overlap_scheduler_consistency(model_path, test_case,
     # Test with overlap scheduler disabled
     llm = create_llm(model_path,
-                     enable_overlap_scheduler=False,
+                     disable_overlap_scheduler=True,
                      enable_trtllm_decoder=enable_trtllm_decoder)
     outputs_without_overlap = llm.generate(prompts,
                                            sampling_params=sampling_config,
diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
index cd298967a31..a2278c0d996 100644
--- a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
+++ b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
@@ -26,7 +26,7 @@ def temp_extra_llm_api_options_file(request):
     extra_llm_api_options_dict = {
         "guided_decoding_backend": "xgrammar",
         "pytorch_backend_config": {
-            "enable_overlap_scheduler": False,
+            "disable_overlap_scheduler": True,
         }
     }
diff --git a/tests/unittest/llmapi/apps/_test_openai_metrics.py b/tests/unittest/llmapi/apps/_test_openai_metrics.py
index 2e3fd474122..e79c34da311 100755
--- a/tests/unittest/llmapi/apps/_test_openai_metrics.py
+++ b/tests/unittest/llmapi/apps/_test_openai_metrics.py
@@ -24,9 +24,7 @@ def client():
               kv_cache_config=KvCacheConfig(),
               backend="pytorch",
               pytorch_backend_config=PyTorchConfig(
-                  enable_overlap_scheduler=True,
-                  enable_iter_perf_stats=True,
-              ))
+                  enable_iter_perf_stats=True, ))
     hf_tokenizer = AutoTokenizer.from_pretrained(llama_model_path)
     app_instance = OpenAIServer(llm,
diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py
index 2ddb709930b..5ddd1a10e33 100644
--- a/tests/unittest/llmapi/test_llm.py
+++ b/tests/unittest/llmapi/test_llm.py
@@ -1875,7 +1875,7 @@ def llm_get_stats_test_harness(tp_size: int = 1,
         llm_args_extra["pytorch_backend_config"] = PyTorchConfig(
             enable_iter_perf_stats=True,
             enable_iter_req_stats=enable_iter_req_stats,
-            enable_overlap_scheduler=use_overlap)
+            disable_overlap_scheduler=not use_overlap)
         LLM_CLASS = LLM_torch
     else:
         LLM_CLASS = LLM
@@ -1944,8 +1944,8 @@ def llm_get_stats_async_test_harness(tp_size: int = 1,
         from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
         llm_args_extra["pytorch_backend_config"] = PyTorchConfig(
             enable_iter_perf_stats=True,
-            enable_overlap_scheduler=use_overlap,
-            enable_iter_req_stats=enable_iter_req_stats)
+            enable_iter_req_stats=enable_iter_req_stats,
+            disable_overlap_scheduler=not use_overlap)
         LLM_CLASS = LLM_torch
     else:
         LLM_CLASS = LLM
diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py
index d5c6635c2f8..b2bc5529002 100644
--- a/tests/unittest/llmapi/test_llm_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_pytorch.py
@@ -82,9 +82,9 @@ def test_llm_reward_model():
     from tensorrt_llm._torch import LLM as LLM_torch
     from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
-    llm = LLM_torch(
-        model=rm_model_path,
-        pytorch_backend_config=PyTorchConfig(attn_backend="VANILLA"))
+    llm = LLM_torch(model=rm_model_path,
+                    pytorch_backend_config=PyTorchConfig(
+                        attn_backend="VANILLA", disable_overlap_scheduler=True))
     sampling_params = SamplingParams(return_context_logits=True)