diff --git a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md index 7f90c391c00..d510209b4a5 100644 --- a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md +++ b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md @@ -134,9 +134,8 @@ To do the benchmark, run the following command: YOUR_DATA_PATH= cat >./extra-llm-api-config.yml<./extra-llm-api-config.yml < cat >./extra-llm-api-config.yml<./extra-llm-api-config.yml< cat >./extra-llm-api-config.yml< cat >./extra-llm-api-config.yml<, backend="autodeploy", build_config=build_config, - pytorch_backend_config=ad_config, + auto_deploy_config=ad_config, tensor_parallel_size=, ) diff --git a/examples/auto_deploy/build_and_run_ad.py b/examples/auto_deploy/build_and_run_ad.py index f5c9f1a6d11..1744cf1f4a7 100644 --- a/examples/auto_deploy/build_and_run_ad.py +++ b/examples/auto_deploy/build_and_run_ad.py @@ -73,7 +73,7 @@ def build_llm_from_config(config: SimpleConfig) -> LLM: model=factory.model, backend="autodeploy", build_config=build_config, - pytorch_backend_config=ad_config, + auto_deploy_config=ad_config, tensor_parallel_size=config.world_size, tokenizer=factory.init_tokenizer() if config.customize_tokenizer else None, ) diff --git a/examples/disaggregated/README.md b/examples/disaggregated/README.md index 70cc78f3619..fbba3b42748 100644 --- a/examples/disaggregated/README.md +++ b/examples/disaggregated/README.md @@ -9,7 +9,7 @@ You can use multiple `trtllm-serve` commands to launch the context and generatio for disaggregated serving. For example, you could launch two context servers and one generation servers as follows: ``` -echo -e "pytorch_backend_config:\n disable_overlap_scheduler: True\ncache_transceiver_config:\n max_num_tokens: 2048" > context_extra-llm-api-config.yml +echo -e "disable_overlap_scheduler: True\ncache_transceiver_config:\nmax_num_tokens: 2048" > context_extra-llm-api-config.yml echo -e "cache_transceiver_config:\n max_num_tokens: 2048" > gen_extra-llm-api-config.yml export TRTLLM_USE_UCX_KVCACHE=1 @@ -63,9 +63,8 @@ hostname: localhost port: 8000 model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True context_servers: num_instances: 1 tensor_parallel_size: 1 diff --git a/examples/disaggregated/disagg_config.yaml b/examples/disaggregated/disagg_config.yaml index a199a594522..eedb81f34b9 100644 --- a/examples/disaggregated/disagg_config.yaml +++ b/examples/disaggregated/disagg_config.yaml @@ -3,9 +3,8 @@ port: 8000 model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 free_gpu_memory_fraction: 0.25 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True context_servers: num_instances: 1 tensor_parallel_size: 1 diff --git a/examples/llm-api/llm_inference_kv_events.py b/examples/llm-api/llm_inference_kv_events.py index 827427e538b..009b7e0dee7 100644 --- a/examples/llm-api/llm_inference_kv_events.py +++ b/examples/llm-api/llm_inference_kv_events.py @@ -1,17 +1,15 @@ ### Get KV Cache Events from tensorrt_llm import LLM, SamplingParams -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import KvCacheConfig def main(): - pytorch_config = PyTorchConfig(autotuner_enabled=False, - kv_cache_dtype='auto') llm = 
LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", tensor_parallel_size=2, - pytorch_backend_config=pytorch_config, + autotuner_enabled=False, + kv_cache_dtype='auto', kv_cache_config=KvCacheConfig(enable_block_reuse=True, event_buffer_max_size=1024), backend="pytorch") diff --git a/examples/llm-api/llm_mgmn_trtllm_bench.sh b/examples/llm-api/llm_mgmn_trtllm_bench.sh index 21a0ee48d9b..ab556cf470a 100644 --- a/examples/llm-api/llm_mgmn_trtllm_bench.sh +++ b/examples/llm-api/llm_mgmn_trtllm_bench.sh @@ -74,10 +74,9 @@ srun -l \ # This is optional cat > /tmp/pytorch_extra_args.txt << EOF -pytorch_backend_config: - use_cuda_graph: false - cuda_graph_padding_enabled: false - print_iter_log: true +use_cuda_graph: false +cuda_graph_padding_enabled: false +print_iter_log: true enable_attention_dp: false EOF diff --git a/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py b/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py index 89e4a293db8..9faff894a4f 100644 --- a/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py +++ b/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py @@ -100,7 +100,6 @@ def __init__( if hasattr(PyTorchConfig, "moe_backend"): pytorch_config_params["moe_backend"] = self.moe_backend print(f"Info: moe_backend is set to {self.moe_backend}") - pytorch_config = PyTorchConfig(**pytorch_config_params) # stop words not currently supported by torch backend self.use_stop_words = False @@ -110,7 +109,7 @@ def __init__( tensor_parallel_size=tp, trust_remote_code=trust_remote_code, enable_chunked_prefill=False, - pytorch_backend_config=pytorch_config, + **pytorch_config_params, tokenizer=self.tokenizer, kv_cache_config=trt_kv_cache_config, moe_expert_parallel_size=self.moe_expert_parallel_size, diff --git a/examples/models/core/deepseek_v3/README.md b/examples/models/core/deepseek_v3/README.md index 04989e97fb4..71620d54549 100644 --- a/examples/models/core/deepseek_v3/README.md +++ b/examples/models/core/deepseek_v3/README.md @@ -140,10 +140,9 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \ --num-requests 24 > /tmp/benchmarking_64k.txt cat < /tmp/extra-llm-api-config.yml -pytorch_backend_config: - use_cuda_graph: true - cuda_graph_padding_enabled: true - cuda_graph_batch_sizes: [1, 4, 8, 12] +use_cuda_graph: true +cuda_graph_padding_enabled: true +cuda_graph_batch_sizes: [1, 4, 8, 12] EOF trtllm-bench -m deepseek-ai/DeepSeek-R1 --model_path ${DS_R1_NVFP4_MODEL_PATH} throughput \ @@ -168,11 +167,10 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \ --num-requests 4 > /tmp/benchmarking_128k.txt cat < /tmp/extra-llm-api-config.yml -pytorch_backend_config: - use_cuda_graph: true - cuda_graph_padding_enabled: true - cuda_graph_batch_sizes: [1, 2] - moe_max_num_tokens: 16384 +use_cuda_graph: true +cuda_graph_padding_enabled: true +cuda_graph_batch_sizes: [1, 2] +moe_max_num_tokens: 16384 EOF trtllm-bench -m deepseek-ai/DeepSeek-R1 --model_path ${DS_R1_NVFP4_MODEL_PATH} throughput \ @@ -193,8 +191,7 @@ Evaluate the model accuracy using `trtllm-eval`. 1. 
(Optional) Prepare an advanced configuration file: ```bash cat >./extra-llm-api-config.yml <./extra-llm-api-config.yml < /tmp/dataset.txt cat >/path/to/TensorRT-LLM/extra-llm-api-config.yml <./extra-llm-api-config.yml < None: - if isinstance(self.load_format, LoadFormat): - return - load_format = self.load_format.upper() - if load_format not in LoadFormat.__members__: - raise NotImplementedError(f"Invalid LoadFormat: {self.load_format}") - self.load_format = LoadFormat[load_format] - - def __post_init__(self) -> None: - if self.torch_compile_enabled and self.torch_compile_piecewise_cuda_graph: - assert self.torch_compile_fullgraph, "Fullgraph must be enabled for piecewise CUDA graph." - - if self.cuda_graph_batch_sizes is not None: - assert self.cuda_graph_max_batch_size == 0, ( - "Please don't set both cuda_graph_batch_sizes " - "and cuda_graph_max_batch_size.") - self.cuda_graph_batch_sizes = sorted(self.cuda_graph_batch_sizes) - else: - self.cuda_graph_max_batch_size = self.cuda_graph_max_batch_size or 128 - if self.cuda_graph_padding_enabled: - self.cuda_graph_batch_sizes = [1, 2, 4] + [ - i * 8 for i in range(1, 17) - ] - else: - self.cuda_graph_batch_sizes = list(range(1, 32)) + [32, 64, 128] - self.cuda_graph_batch_sizes += [ - 2**i for i in range( - 8, math.floor(math.log(self.cuda_graph_max_batch_size, 2))) - ] - self.cuda_graph_batch_sizes = [ - size for size in self.cuda_graph_batch_sizes - if size <= self.cuda_graph_max_batch_size - ] - if self.cuda_graph_max_batch_size != self.cuda_graph_batch_sizes[ - -1]: - self.cuda_graph_batch_sizes.append( - self.cuda_graph_max_batch_size) - - if isinstance(self.moe_load_balancer, str): - assert os.path.exists(self.moe_load_balancer) - if self.moe_load_balancer.endswith(".json"): - with open(self.moe_load_balancer) as f: - self.moe_load_balancer = json.load(f) - elif self.moe_load_balancer.endswith((".yaml", ".yml")): - with open(self.moe_load_balancer) as f: - self.moe_load_balancer = yaml.safe_load(f) - else: - raise ValueError( - f"Unsupported moe load balancer config file: {self.moe_load_balancer}" - ) - if isinstance(self.moe_load_balancer, dict): - self.moe_load_balancer = MoeLoadBalancerConfig( - **self.moe_load_balancer) - - self._convert_load_format() - EXETENDED_EXECUTOR_CONFIG_FIELDS = [ 'backend', diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 64437f7e78b..8dcfe25dfbb 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -310,11 +310,6 @@ def __init__( ) attn_backend = pytorch_backend_config.attn_backend - # _convert_load_format should already be called by - # __post_init__, but call it again just in case. - # The config object is not a frozen data class, so it's - # possible the user changed it after initialization. 
- pytorch_backend_config._convert_load_format() self.model = self._load_model( model_path, mapping=self.mapping, diff --git a/tensorrt_llm/bench/benchmark/throughput.py b/tensorrt_llm/bench/benchmark/throughput.py index cc474ec5296..71797268de7 100755 --- a/tensorrt_llm/bench/benchmark/throughput.py +++ b/tensorrt_llm/bench/benchmark/throughput.py @@ -340,8 +340,8 @@ def throughput_command( kwargs = kwargs | runtime_config.get_llm_args() kwargs['backend'] = backend - if "pytorch_backend_config" in kwargs and iteration_log is not None: - kwargs["pytorch_backend_config"].enable_iter_perf_stats = True + if backend == "pytorch": + kwargs["enable_iter_perf_stats"] = True if runtime_config.backend == 'pytorch': llm = PyTorchLLM(**kwargs) diff --git a/tensorrt_llm/bench/benchmark/utils/general.py b/tensorrt_llm/bench/benchmark/utils/general.py index 3e7ed03cd21..202d8662d9b 100755 --- a/tensorrt_llm/bench/benchmark/utils/general.py +++ b/tensorrt_llm/bench/benchmark/utils/general.py @@ -89,10 +89,8 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str, if extra_llm_api_options: with open(extra_llm_api_options, 'r') as f: llm_args_dict = yaml.safe_load(f) - if "pytorch_backend_config" in llm_args_dict: - if "kv_cache_dtype" in llm_args_dict["pytorch_backend_config"]: - kv_cache_dtype = llm_args_dict["pytorch_backend_config"][ - "kv_cache_dtype"] + if "kv_cache_dtype" in llm_args_dict: + kv_cache_dtype = llm_args_dict["kv_cache_dtype"] enable_chunked_prefill = llm_args_dict.get("enable_chunked_prefill", enable_chunked_prefill) diff --git a/tensorrt_llm/bench/dataclasses/configuration.py b/tensorrt_llm/bench/dataclasses/configuration.py index 41143abea42..e4576ee487e 100755 --- a/tensorrt_llm/bench/dataclasses/configuration.py +++ b/tensorrt_llm/bench/dataclasses/configuration.py @@ -81,8 +81,7 @@ def get_llm_args(self) -> Dict: } if self.backend in backend_config_map: - llm_args["pytorch_backend_config"] = backend_config_map[ - self.backend]() + llm_args.update(backend_config_map[self.backend]()) return update_llm_args_with_extra_options(llm_args, self.extra_llm_api_options) @@ -109,7 +108,7 @@ def get_perf_config(self) -> ExtendedRuntimePerfKnobConfig: return config def get_pytorch_perf_config(self) -> PyTorchConfig: - return PyTorchConfig(**self.pytorch_config) + return self.pytorch_config def get_autodeploy_perf_config(self) -> AutoDeployConfig: ad_config = AutoDeployConfig(**self.pytorch_config) diff --git a/tensorrt_llm/bench/dataclasses/reporting.py b/tensorrt_llm/bench/dataclasses/reporting.py index 995b8845041..699c61654e8 100755 --- a/tensorrt_llm/bench/dataclasses/reporting.py +++ b/tensorrt_llm/bench/dataclasses/reporting.py @@ -264,9 +264,8 @@ def get_statistics_dict(self) -> Dict[str, Any]: model = self.rt_cfg.model_path or self.rt_cfg.model model_config = ModelConfig.from_pretrained(model, trust_remote_code=True) - validate_and_set_kv_cache_quant( - model_config, - self.kwargs["pytorch_backend_config"].kv_cache_dtype) + validate_and_set_kv_cache_quant(model_config, + self.kwargs["kv_cache_dtype"]) stats_dict["engine"] |= { "backend": diff --git a/tensorrt_llm/commands/eval.py b/tensorrt_llm/commands/eval.py index 632a956b6ff..8f875cdabda 100644 --- a/tensorrt_llm/commands/eval.py +++ b/tensorrt_llm/commands/eval.py @@ -19,7 +19,6 @@ import tensorrt_llm.profiler as profiler from .._torch.llm import LLM as PyTorchLLM -from .._torch.pyexecutor.config import PyTorchConfig from ..evaluate import (GSM8K, MMLU, CnnDailymail, GPQADiamond, GPQAExtended, GPQAMain) from 
..llmapi import LLM, BuildConfig, KvCacheConfig @@ -113,9 +112,6 @@ def main(ctx, model: str, tokenizer: Optional[str], log_level: str, if backend == "tensorrt": backend = None - pytorch_backend_config = None - if backend == "pytorch": - pytorch_backend_config = PyTorchConfig() llm_args = { "model": model, @@ -128,7 +124,6 @@ def main(ctx, model: str, tokenizer: Optional[str], log_level: str, "build_config": build_config, "kv_cache_config": kv_cache_config, "backend": backend, - "pytorch_backend_config": pytorch_backend_config, } if extra_llm_api_options is not None: diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py index 28bfa7498c1..f622cd5f60e 100644 --- a/tensorrt_llm/commands/serve.py +++ b/tensorrt_llm/commands/serve.py @@ -8,7 +8,6 @@ from torch.cuda import device_count from tensorrt_llm._torch.llm import LLM as PyTorchLLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import (LLM, BuildConfig, CapacitySchedulerPolicy, DynamicBatchConfig, KvCacheConfig, SchedulerConfig) @@ -48,7 +47,6 @@ def get_llm_args(model: str, kv_cache_config = KvCacheConfig( free_gpu_memory_fraction=free_gpu_memory_fraction) - pytorch_backend_config = PyTorchConfig() if backend == "pytorch" else None dynamic_batch_config = DynamicBatchConfig( enable_batch_size_tuning=True, enable_max_num_tokens_tuning=False, @@ -74,7 +72,6 @@ def get_llm_args(model: str, "max_seq_len": max_seq_len, "kv_cache_config": kv_cache_config, "backend": backend if backend == "pytorch" else None, - "pytorch_backend_config": pytorch_backend_config, "_num_postprocess_workers": num_postprocess_workers, "_postprocess_tokenizer_dir": tokenizer or model, "_reasoning_parser": reasoning_parser, diff --git a/tensorrt_llm/executor/serialization.py b/tensorrt_llm/executor/serialization.py index dbd0b2bbf11..54df0b9bb84 100644 --- a/tensorrt_llm/executor/serialization.py +++ b/tensorrt_llm/executor/serialization.py @@ -8,8 +8,10 @@ # it is only needed in a single instance the class can be added at runtime # using register_approved_ipc_class. 
BASE_ZMQ_CLASSES = { - "builtins": ["Exception", "ValueError" - ], # each Exception Error class needs to be added explicitly + "builtins": [ + "Exception", "ValueError", "NotImplementedError", "AttributeError", + "AssertionError" + ], # each Exception Error class needs to be added explicitly "collections": ["OrderedDict"], "datetime": ["timedelta"], "pathlib": ["PosixPath"], @@ -57,6 +59,7 @@ "KvCacheRetentionConfig.TokenRangeRetentionConfig", "PeftCacheConfig", "SchedulerConfig", "DynamicBatchConfig" ], + "tensorrt_llm._torch.pyexecutor.config": ["PyTorchConfig"], "tensorrt_llm.builder": ["BuildConfig"], "tensorrt_llm.disaggregated_params": ["DisaggregatedParams"], "tensorrt_llm.executor.postproc_worker": [ @@ -77,7 +80,7 @@ "tensorrt_llm.llmapi.llm_args": [ "_ModelFormatKind", "_ParallelConfig", "CalibConfig", "CapacitySchedulerPolicy", "KvCacheConfig", "LookaheadDecodingConfig", - "TrtLlmArgs", "SchedulerConfig" + "TrtLlmArgs", "SchedulerConfig", "LoadFormat" ], "tensorrt_llm.llmapi.mpi_session": ["RemoteTask"], "tensorrt_llm.llmapi.llm_utils": diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index 9e7e8dd313a..476ea580079 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -112,9 +112,6 @@ def __init__(self, self._executor_cls = kwargs.pop("executor_cls", GenerationExecutor) try: - self.pytorch_backend_config = kwargs.pop('pytorch_backend_config', - None) - llm_args_cls = TorchLlmArgs if kwargs.get( 'backend', None) == 'pytorch' else TrtLlmArgs @@ -625,7 +622,8 @@ def _build_model(self): update_executor_config( executor_config, backend=self.args.backend, - pytorch_backend_config=self.pytorch_backend_config, + pytorch_backend_config=self.args.get_pytorch_backend_config() + if self.args.backend == "pytorch" else None, mapping=self.args.parallel_config.to_mapping(), build_config=self.args.build_config if self._on_trt_backend else None, diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index c8d5033ad41..6d769a12ee3 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -1,14 +1,17 @@ import json import math +import os from abc import ABC, abstractmethod -from dataclasses import dataclass, field, fields +from dataclasses import dataclass, field from enum import Enum, EnumMeta from pathlib import Path -from typing import Any, ClassVar, Dict, List, Literal, Optional, Union +from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Literal, Optional, + Union) import torch import yaml -from pydantic import BaseModel, Field, PrivateAttr, validator +from pydantic import (BaseModel, Field, PrivateAttr, field_validator, + model_validator) from strenum import StrEnum from transformers import PreTrainedTokenizerBase @@ -18,6 +21,9 @@ from .._utils import mpi_rank from ..auto_parallel import AutoParallelConfig, infer_cluster_config +if TYPE_CHECKING: + from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig + # yapf: disable # isort: off from ..bindings.executor import ( @@ -36,6 +42,8 @@ PeftCacheConfig as _PeftCacheConfig, SchedulerConfig as _SchedulerConfig) # isort: skip # isort: on +from transformers import PreTrainedTokenizerBase + # yapf: enable from ..builder import BuildConfig, EngineConfig from ..logger import logger @@ -549,7 +557,9 @@ class LookaheadDecodingConfig(DecodingBaseConfig, PybindMirror): get_default_lookahead_decoding_verification_set(), description="Number of NGrams in verification branch per step.") - @validator('max_window_size', 'max_ngram_size', 
'max_verification_set_size') + @field_validator('max_window_size', 'max_ngram_size', + 'max_verification_set_size') + @classmethod def validate_positive_values(cls, v): if v <= 0: raise ValueError(f"Value must be positive, got {v}") @@ -848,8 +858,8 @@ class BaseLlmArgs(BaseModel): default=None, description="Quantization config.") # Several options from ExecutorConfig, expanded here for less hierarchy - kv_cache_config: Optional[KvCacheConfig] = Field( - default=None, description="KV cache config.") + kv_cache_config: KvCacheConfig = Field(default_factory=KvCacheConfig, + description="KV cache config.") enable_chunked_prefill: bool = Field(default=False, description="Enable chunked prefill.") @@ -876,8 +886,8 @@ class BaseLlmArgs(BaseModel): peft_cache_config: Optional[PeftCacheConfig] = Field( default=None, description="PEFT cache config.") - scheduler_config: Optional[SchedulerConfig] = Field( - default=None, description="Scheduler config.") + scheduler_config: SchedulerConfig = Field(default_factory=SchedulerConfig, + description="Scheduler config.") cache_transceiver_config: Optional[CacheTransceiverConfig] = Field( default=None, description="Cache transceiver config.") @@ -991,10 +1001,6 @@ def model_post_init(self, __context: Any): enable_attention_dp=self.enable_attention_dp, cp_config=self.cp_config) - self.kv_cache_config = self.kv_cache_config or KvCacheConfig() - - self.scheduler_config = self.scheduler_config or SchedulerConfig() - @classmethod def from_kwargs(cls, **kwargs: Any) -> "BaseLlmArgs": """Create `LlmArgs` instance from kwargs. @@ -1016,8 +1022,7 @@ def to_dict(self) -> dict: Returns: dict: The dict that contains all fields of the `LlmArgs` instance. """ - return dict( - (field.name, getattr(self, field.name)) for field in fields(self)) + return self.model_dump() @staticmethod def _maybe_update_config_for_consistency( @@ -1444,6 +1449,12 @@ def model_post_init(self, __context): indent=' ' * 4) +class LoadFormat(Enum): + AUTO = 0 + # Initialize all weights randomly. + DUMMY = 1 + + class TorchLlmArgs(BaseLlmArgs): # Just a dummy BuildConfig to allow code reuse with the TrtLlmArgs @@ -1453,12 +1464,275 @@ class TorchLlmArgs(BaseLlmArgs): exclude_from_json=True, json_schema_extra={"type": f"Optional[{get_type_repr(BuildConfig)}]"}) + # PyTorch backend specific configurations + + use_cuda_graph: bool = Field( + default=False, + description= + "If true, use CUDA graphs for decoding. CUDA graphs are only created for the batch sizes in cuda_graph_batch_sizes, and are enabled for batches that consist of decoding requests *only* (the reason is that it's hard to capture a single graph with prefill requests since the input shapes are a function of the sequence lengths). Note that each CUDA graph can use up to 200 MB of extra memory." + ) + + cuda_graph_batch_sizes: Optional[List[int]] = Field( + default=None, + description="List of batch sizes to create CUDA graphs for.") + + cuda_graph_max_batch_size: int = Field( + default=0, description="Maximum batch size for CUDA graphs.") + + cuda_graph_padding_enabled: bool = Field( + default=False, + description= + "If true, batches are rounded up to the nearest cuda_graph_batch_size. This is usually a net win for performance." + ) + + disable_overlap_scheduler: bool = Field( + default=False, description="Disable the overlap scheduler.") + + moe_max_num_tokens: Optional[int] = Field( + default=None, + description= + "If set, at most moe_max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time. 
If the number of tokens exceeds moe_max_num_tokens, the input tensors will be split into chunks and a for loop will be used." + ) + + moe_load_balancer: Optional[Union[object, dict, str]] = Field( + default=None, + description="Configuration for MoE load balancing.", + json_schema_extra={"type": f"Union[MoeLoadBalancerConfig, dict, str]"}) + + attn_backend: str = Field(default='TRTLLM', + description="Attention backend to use.") + + moe_backend: str = Field(default='CUTLASS', + description="MoE backend to use.") + + mixed_sampler: bool = Field( + default=False, + description= + "If true, will iterate over sampling_params of each request and use the corresponding sampling strategy, e.g. top-k, top-p, etc." + ) + + enable_trtllm_sampler: bool = Field( + default=False, + description= + "If true, will use the TRTLLM sampler instead of the PyTorch sampler. The TRTLLM sampler has a wide coverage of sampling strategies." + ) + + kv_cache_dtype: str = Field(default="auto", + description="Data type for KV cache.") + + use_kv_cache: bool = Field(default=True, + description="Whether to use KV cache.") + + enable_iter_perf_stats: bool = Field( + default=False, description="Enable iteration performance statistics.") + + enable_iter_req_stats: bool = Field( + default=False, + description= + "If true, enables per request stats per iteration. Must also set enable_iter_perf_stats to true to get request stats." + ) + + print_iter_log: bool = Field(default=False, + description="Print iteration logs.") + + torch_compile_enabled: bool = Field( + default=False, description="Enable torch.compile optimization.") + + torch_compile_fullgraph: bool = Field( + default=True, + description="Enable full graph compilation in torch.compile.") + + torch_compile_inductor_enabled: bool = Field( + default=False, description="Enable inductor backend in torch.compile.") + + torch_compile_piecewise_cuda_graph: bool = Field( + default=False, + description="Enable piecewise CUDA graph in torch.compile.") + + torch_compile_enable_userbuffers: bool = Field( + default=True, + description= + "When torch compile is enabled, userbuffers is enabled by default.") + + autotuner_enabled: bool = Field( + default=True, + description="Enable autotuner only when torch compile is enabled.") + + enable_layerwise_nvtx_marker: bool = Field( + default=False, description="If true, enable layerwise nvtx marker.") + + auto_deploy_config: Optional[object] = Field( + default=None, + description="Auto deploy config.", + exclude_from_json=True, + json_schema_extra={"type": f"Optional[AutoDeployConfig]"}) + + load_format: Union[str, LoadFormat] = Field( + default=LoadFormat.AUTO, + description= + "How to load the model weights. By default, detect the weight type from the model checkpoint." + ) + + @field_validator('load_format', mode='before') + @classmethod + def convert_load_format(cls, v): + if isinstance(v, LoadFormat): + return v + load_format = v.upper() + if load_format not in LoadFormat.__members__: + raise ValueError(f"Invalid LoadFormat: {v}") + return LoadFormat[load_format] + + # Extra resource managers to use in addition to the KV cache manager. + # Each manager's prepare_resources method is called before the forward pass, + # and update_resources() is called after the pass finishes. free_resources() + # is called when a request finishes. The KV cache manager is guaranteed to + # be invoked after all of these extra managers in all stages. 
+ _extra_resource_managers: Dict[str, + object] = PrivateAttr(default_factory=dict, ) + + @property + def extra_resource_managers(self) -> Dict[str, object]: + return self._extra_resource_managers + + @extra_resource_managers.setter + def extra_resource_managers(self, value: Dict[str, object]) -> None: + self._extra_resource_managers = value + @print_traceback_on_error def model_post_init(self, __context): - super().model_post_init(__context) + from .._torch.model_config import MoeLoadBalancerConfig + super().model_post_init(__context) self.model_format = _ModelFormatKind.HF + if isinstance(self.moe_load_balancer, str): + assert os.path.exists(self.moe_load_balancer) + if self.moe_load_balancer.endswith(".json"): + with open(self.moe_load_balancer) as f: + self.moe_load_balancer = json.load(f) + elif self.moe_load_balancer.endswith((".yaml", ".yml")): + with open(self.moe_load_balancer) as f: + self.moe_load_balancer = yaml.safe_load(f) + else: + raise ValueError( + f"Unsupported moe load balancer config file: {self.moe_load_balancer}" + ) + if isinstance(self.moe_load_balancer, dict): + self.moe_load_balancer = MoeLoadBalancerConfig( + **self.moe_load_balancer) + + # TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig + def get_pytorch_backend_config(self) -> "PyTorchConfig": + from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig + + # TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig + # Just a WAR to support the auto_deploy + if self.auto_deploy_config is not None: + return self.auto_deploy_config + + return PyTorchConfig( + extra_resource_managers=self.extra_resource_managers, + use_cuda_graph=self.use_cuda_graph, + cuda_graph_batch_sizes=self.cuda_graph_batch_sizes, + cuda_graph_max_batch_size=self.cuda_graph_max_batch_size, + cuda_graph_padding_enabled=self.cuda_graph_padding_enabled, + disable_overlap_scheduler=self.disable_overlap_scheduler, + moe_max_num_tokens=self.moe_max_num_tokens, + moe_load_balancer=self.moe_load_balancer, + attn_backend=self.attn_backend, + moe_backend=self.moe_backend, + mixed_sampler=self.mixed_sampler, + enable_trtllm_sampler=self.enable_trtllm_sampler, + kv_cache_dtype=self.kv_cache_dtype, + use_kv_cache=self.use_kv_cache, + enable_iter_perf_stats=self.enable_iter_perf_stats, + enable_iter_req_stats=self.enable_iter_req_stats, + print_iter_log=self.print_iter_log, + torch_compile_enabled=self.torch_compile_enabled, + torch_compile_fullgraph=self.torch_compile_fullgraph, + torch_compile_inductor_enabled=self.torch_compile_inductor_enabled, + torch_compile_piecewise_cuda_graph=self. + torch_compile_piecewise_cuda_graph, + torch_compile_enable_userbuffers=self. + torch_compile_enable_userbuffers, + autotuner_enabled=self.autotuner_enabled, + enable_layerwise_nvtx_marker=self.enable_layerwise_nvtx_marker, + load_format=self.load_format) + + @field_validator('cuda_graph_max_batch_size') + @classmethod + def validate_cuda_graph_max_batch_size(cls, v): + """Validate cuda_graph_max_batch_size is non-negative.""" + if v < 0: + raise ValueError("cuda_graph_max_batch_size must be non-negative") + return v + + @staticmethod + def _generate_cuda_graph_batch_sizes(max_batch_size: int, + padding_enabled: bool) -> List[int]: + """Generate a list of batch sizes for CUDA graphs. 
+ + Args: + max_batch_size: Maximum batch size to generate up to + padding_enabled: Whether padding is enabled, which affects the batch size distribution + + Returns: + List of batch sizes to create CUDA graphs for + """ + if padding_enabled: + batch_sizes = [1, 2, 4] + [i * 8 for i in range(1, 17)] + else: + batch_sizes = list(range(1, 32)) + [32, 64, 128] + + # Add powers of 2 up to max_batch_size + batch_sizes += [ + 2**i for i in range(8, math.floor(math.log(max_batch_size, 2))) + ] + + # Filter and sort batch sizes + batch_sizes = sorted( + [size for size in batch_sizes if size <= max_batch_size]) + + # Add max_batch_size if not already included + if max_batch_size != batch_sizes[-1]: + batch_sizes.append(max_batch_size) + + return batch_sizes + + @model_validator(mode='after') + def validate_cuda_graph_config(self) -> 'TorchLlmArgs': + """Validate CUDA graph configuration. + + Ensures that: + 1. If cuda_graph_batch_sizes is provided, cuda_graph_max_batch_size must be 0 + 2. If cuda_graph_batch_sizes is not provided, it is generated based on cuda_graph_max_batch_size + 3. If both are provided, cuda_graph_batch_sizes must match the generated values + """ + if self.cuda_graph_batch_sizes is not None: + self.cuda_graph_batch_sizes = sorted(self.cuda_graph_batch_sizes) + if self.cuda_graph_max_batch_size != 0: + if self.cuda_graph_batch_sizes != self._generate_cuda_graph_batch_sizes( + self.cuda_graph_max_batch_size, + self.cuda_graph_padding_enabled): + raise ValueError( + "Please don't set both cuda_graph_batch_sizes " + "and cuda_graph_max_batch_size.\n" + f"cuda_graph_batch_sizes: {self.cuda_graph_batch_sizes}, " + f"cuda_graph_max_batch_size: {self.cuda_graph_max_batch_size}" + ) + else: + self.cuda_graph_max_batch_size = max( + self.cuda_graph_batch_sizes) + else: + max_batch_size = self.cuda_graph_max_batch_size or 128 + generated_sizes = self._generate_cuda_graph_batch_sizes( + max_batch_size, self.cuda_graph_padding_enabled) + self.cuda_graph_batch_sizes = generated_sizes + self.cuda_graph_max_batch_size = max_batch_size + + return self + def update_llm_args_with_extra_dict( llm_args: Dict, diff --git a/tensorrt_llm/llmapi/utils.py b/tensorrt_llm/llmapi/utils.py index 39cfd2d5739..5872174ab96 100644 --- a/tensorrt_llm/llmapi/utils.py +++ b/tensorrt_llm/llmapi/utils.py @@ -507,6 +507,9 @@ def generate_api_docs_as_docstring(model: Type[BaseModel], elif field_name in type_hints: type_str = str(type_hints[field_name]) type_str = type_str.replace("typing.", "") + # Extract just the class name from full class path + if " 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) llm = LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_config, enable_attention_dp=attention_dp, speculative_config=mtp_config) with llm: @@ -497,11 +497,12 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, pytest.skip("PP with torch.compile is not supported yet.") # OOM on H100 with default free_gpu_memory_fraction=0.9 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7) - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, use_cuda_graph=cuda_graph, torch_compile_enabled=torch_compile, - torch_compile_fullgraph=True) + torch_compile_fullgraph=True, + ) mtp_config = None if mtp_nextn > 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) @@ -510,7 +511,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, 
pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_config, enable_attention_dp=attention_dp, speculative_config=mtp_config) with llm: @@ -539,17 +540,18 @@ def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph, pytest.skip("https://nvbugs/5252559") # OOM on H100 with default free_gpu_memory_fraction=0.9 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8) - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, use_cuda_graph=cuda_graph, torch_compile_enabled=torch_compile, - torch_compile_fullgraph=True) + torch_compile_fullgraph=True, + ) quant_config = QuantConfig() quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES if fp8kv: quant_config.kv_cache_quant_algo = QuantAlgo.FP8 - pytorch_config.kv_cache_dtype = "fp8" + pytorch_config["kv_cache_dtype"] = "fp8" mtp_config = None if mtp_nextn > 0: @@ -557,7 +559,7 @@ def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph, llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_config, quant_config=quant_config, enable_attention_dp=attention_dp, speculative_config=mtp_config) @@ -578,13 +580,15 @@ def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph, def test_fp8_block_scales_cuda_graph_padding(self): # OOM on H100 with default free_gpu_memory_fraction=0.9 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8) - pytorch_config = PyTorchConfig(disable_overlap_scheduler=False, - use_cuda_graph=True, - cuda_graph_max_batch_size=512, - cuda_graph_padding_enabled=True) + pytorch_config = dict( + disable_overlap_scheduler=False, + use_cuda_graph=True, + cuda_graph_max_batch_size=512, + cuda_graph_padding_enabled=True, + ) llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config) + **pytorch_config) assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES with llm: task = MMLU(self.MODEL_NAME) @@ -620,17 +624,18 @@ def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, pytest.skip("PP with torch.compile is not supported yet.") # OOM on H100 with default free_gpu_memory_fraction=0.9 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8) - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, use_cuda_graph=cuda_graph, torch_compile_enabled=torch_compile, - torch_compile_fullgraph=True) + torch_compile_fullgraph=True, + ) quant_config = QuantConfig() quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES if fp8kv: quant_config.kv_cache_quant_algo = QuantAlgo.FP8 - pytorch_config.kv_cache_dtype = "fp8" + pytorch_config["kv_cache_dtype"] = "fp8" mtp_config = None if mtp_nextn > 0: @@ -641,7 +646,7 @@ def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_config, quant_config=quant_config, enable_attention_dp=attention_dp, speculative_config=mtp_config) @@ -676,13 +681,13 @@ def test_fp8_block_scales_4gpus_static_eplb(self): num_slots=num_slots, initial_global_assignments=initial_global_assignments, layer_updates_per_iter=0) - pytorch_config = PyTorchConfig(use_cuda_graph=True, + pytorch_backend_options = 
dict(use_cuda_graph=True, moe_load_balancer=eplb_config) llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", tensor_parallel_size=4, moe_expert_parallel_size=4, kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_backend_options, enable_attention_dp=True) with llm: task = MMLU(self.MODEL_NAME) @@ -705,21 +710,22 @@ def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler, pytest.skip("https://nvbugs/5252559") kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9) - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, use_cuda_graph=cuda_graph, torch_compile_enabled=torch_compile, - torch_compile_fullgraph=True) + torch_compile_fullgraph=True, + ) quant_config = QuantConfig() quant_config.quant_algo = QuantAlgo.NVFP4 if fp8kv: quant_config.kv_cache_quant_algo = QuantAlgo.FP8 - pytorch_config.kv_cache_dtype = "fp8" + pytorch_config["kv_cache_dtype"] = "fp8" llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only", kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_config, quant_config=quant_config, enable_attention_dp=attention_dp) @@ -756,24 +762,25 @@ def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph, if torch_compile and pp_size > 1: pytest.skip("PP with torch.compile is not supported yet.") kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9) - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, use_cuda_graph=cuda_graph, torch_compile_enabled=torch_compile, - torch_compile_fullgraph=True) + torch_compile_fullgraph=True, + ) quant_config = QuantConfig() quant_config.quant_algo = QuantAlgo.NVFP4 if fp8kv: quant_config.kv_cache_quant_algo = QuantAlgo.FP8 - pytorch_config.kv_cache_dtype = "fp8" + pytorch_config["kv_cache_dtype"] = "fp8" llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only", tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_config, quant_config=quant_config, enable_attention_dp=attention_dp) @@ -815,9 +822,10 @@ def test_no_kv_cache_reuse(self, quant_dtype, mtp_nextn, fp8kv, kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6, enable_block_reuse=False) - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, - use_cuda_graph=cuda_graph) + use_cuda_graph=cuda_graph, + ) mtp_config = None if mtp_nextn > 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) @@ -833,11 +841,11 @@ def test_no_kv_cache_reuse(self, quant_dtype, mtp_nextn, fp8kv, quant_config.quant_algo = QuantAlgo.NVFP4 if fp8kv: quant_config.kv_cache_quant_algo = QuantAlgo.FP8 - pytorch_config.kv_cache_dtype = "fp8" + pytorch_config["kv_cache_dtype"] = "fp8" llm = LLM(model_path, kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_config, quant_config=quant_config, enable_attention_dp=attention_dp, speculative_config=mtp_config) @@ -883,16 +891,15 @@ def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, pytest.skip("https://nvbugs/5302441") kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4) - pytorch_config = PyTorchConfig( - disable_overlap_scheduler=not overlap_scheduler, - use_cuda_graph=cuda_graph, - moe_backend=moe_backend) + pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler, + 
use_cuda_graph=cuda_graph, + moe_backend=moe_backend) quant_config = QuantConfig() quant_config.quant_algo = QuantAlgo.NVFP4 if fp8kv: quant_config.kv_cache_quant_algo = QuantAlgo.FP8 - pytorch_config.kv_cache_dtype = "fp8" + pytorch_config["kv_cache_dtype"] = "fp8" mtp_config = None if mtp_nextn > 0: @@ -903,12 +910,12 @@ def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_config, quant_config=quant_config, enable_attention_dp=attention_dp, speculative_config=mtp_config) - assert llm.pytorch_backend_config.moe_backend == moe_backend + assert llm.args.moe_backend == moe_backend assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 if fp8kv: assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 @@ -933,15 +940,16 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, attention_dp, cuda_graph, overlap_scheduler, max_batch_size): kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4) - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, - use_cuda_graph=cuda_graph) + use_cuda_graph=cuda_graph, + ) quant_config = QuantConfig() quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES if fp8kv: quant_config.kv_cache_quant_algo = QuantAlgo.FP8 - pytorch_config.kv_cache_dtype = "fp8" + pytorch_config["kv_cache_dtype"] = "fp8" mtp_config = None if mtp_nextn > 0: @@ -952,7 +960,7 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_config, quant_config=quant_config, enable_attention_dp=attention_dp, speculative_config=mtp_config) @@ -986,12 +994,12 @@ class TestNemotronNas(LlmapiAccuracyTestHarness): @pytest.mark.skip_less_device(8) def test_auto_dtype_tp8(self): kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7) - pytorch_config = PyTorchConfig() + pytorch_config = dict() with LLM(self.MODEL_PATH, tensor_parallel_size=8, kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config) as llm: + **pytorch_config) as llm: task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) @@ -1134,15 +1142,14 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness): ids=["latency"]) def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler): - pytorch_config = PyTorchConfig( - disable_overlap_scheduler=not overlap_scheduler, - use_cuda_graph=cuda_graph) + pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler, + use_cuda_graph=cuda_graph) llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8", tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, enable_attention_dp=attention_dp) with llm: task = CnnDailymail(self.MODEL_NAME) @@ -1162,15 +1169,14 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness): ids=["latency"]) def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler): - pytorch_config = PyTorchConfig( - disable_overlap_scheduler=not overlap_scheduler, - use_cuda_graph=cuda_graph) + pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler, + use_cuda_graph=cuda_graph) llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B-FP8", 
tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, enable_attention_dp=attention_dp) with llm: task = MMLU(self.MODEL_NAME) @@ -1185,16 +1191,15 @@ def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp, ids=["latency"]) def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler): - pytorch_config = PyTorchConfig( - disable_overlap_scheduler=not overlap_scheduler, - use_cuda_graph=cuda_graph) + pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler, + use_cuda_graph=cuda_graph) llm = LLM( f"{llm_models_root()}/Qwen3/saved_models_Qwen3-30B-A3B_fp8_hf", tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, enable_attention_dp=attention_dp) with llm: task = MMLU(self.MODEL_NAME) @@ -1225,7 +1230,7 @@ def test_nvfp4( overlap_scheduler, moe_backend, ): - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, use_cuda_graph=cuda_graph, moe_backend=moe_backend, @@ -1236,7 +1241,7 @@ def test_nvfp4( tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, enable_attention_dp=attention_dp) with llm: task = MMLU(self.MODEL_NAME) @@ -1255,15 +1260,14 @@ class TestQwen3_32B(LlmapiAccuracyTestHarness): ids=["latency"]) def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler): - pytorch_config = PyTorchConfig( - disable_overlap_scheduler=not overlap_scheduler, - use_cuda_graph=cuda_graph) + pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler, + use_cuda_graph=cuda_graph) llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-32B-FP8", tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, enable_attention_dp=attention_dp) with llm: task = CnnDailymail(self.MODEL_NAME) @@ -1282,9 +1286,8 @@ class TestQwen3_235B_A22B(LlmapiAccuracyTestHarness): ids=["latency", "throughput_latency"]) def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler): - pytorch_config = PyTorchConfig( - disable_overlap_scheduler=not overlap_scheduler, - use_cuda_graph=cuda_graph) + pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler, + use_cuda_graph=cuda_graph) kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7) llm = LLM( @@ -1292,7 +1295,7 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, enable_attention_dp=attention_dp, kv_cache_config=kv_cache_config) with llm: @@ -1308,16 +1311,15 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, ids=["latency", "throughput_latency"]) def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler): - pytorch_config = PyTorchConfig( - disable_overlap_scheduler=not overlap_scheduler, - use_cuda_graph=cuda_graph) + pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler, + use_cuda_graph=cuda_graph) llm = LLM( f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", tensor_parallel_size=tp_size, 
pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, enable_attention_dp=attention_dp) with llm: task = MMLU(self.MODEL_NAME) diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml index 7a850b121bc..42c439f6aaf 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml @@ -3,10 +3,9 @@ hostname: localhost port: 8000 backend: "pytorch" free_gpu_memory_fraction: 0.1 -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True - autotuner_enabled: False +use_cuda_graph: False +disable_overlap_scheduler: True +autotuner_enabled: False context_servers: num_instances: 2 router: diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml index 2c9a83ecd65..747f6ccc764 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml @@ -3,10 +3,9 @@ port: 8000 model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 free_gpu_memory_fraction: 0.15 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True - autotuner_enabled: False +use_cuda_graph: False +disable_overlap_scheduler: True +autotuner_enabled: False context_servers: num_instances: 1 tensor_parallel_size: 1 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml index 2a1a605078e..1a1285fcf6b 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml @@ -5,10 +5,9 @@ backend: "pytorch" free_gpu_memory_fraction: 0.15 conditional_disagg_config: max_local_prefill_length: 100 -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True - autotuner_enabled: False +use_cuda_graph: False +disable_overlap_scheduler: True +autotuner_enabled: False context_servers: num_instances: 1 tensor_parallel_size: 1 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml index 59db98e2ab7..2f70eab61f5 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml @@ -3,9 +3,8 @@ port: 8000 model: DeepSeek-V3-Lite/fp8 free_gpu_memory_fraction: 0.1 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True context_servers: num_instances: 1 tensor_parallel_size: 1 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml index bf8b1484151..0959df0052d 100644 --- 
a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml @@ -3,9 +3,8 @@ port: 8000 model: DeepSeek-V3-Lite/fp8 free_gpu_memory_fraction: 0.1 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True speculative_config: decoding_type: MTP num_nextn_predict_layers: 1 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml index 35b1cb6f4e9..9a83d2d0a8b 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml @@ -11,9 +11,8 @@ context_servers: tensor_parallel_size: 1 pipeline_parallel_size: 1 enable_attention_dp: true - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True + use_cuda_graph: False + disable_overlap_scheduler: True urls: - "localhost:8001" generation_servers: @@ -21,8 +20,7 @@ generation_servers: tensor_parallel_size: 1 pipeline_parallel_size: 1 enable_attention_dp: true - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: False + use_cuda_graph: False + disable_overlap_scheduler: False urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml index b60de54c5eb..d99a04097ef 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml @@ -3,9 +3,8 @@ port: 8000 model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 free_gpu_memory_fraction: 0.25 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True context_servers: num_instances: 1 tensor_parallel_size: 2 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml index d01502cfc07..1c5755c4287 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml @@ -3,9 +3,8 @@ port: 8000 model: DeepSeek-V3-Lite/fp8 free_gpu_memory_fraction: 0.25 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True context_servers: num_instances: 1 tensor_parallel_size: 2 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml index 9f19e0699f9..381a13a8a9d 100644 --- 
a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml @@ -3,9 +3,8 @@ port: 8000 model: DeepSeek-V3-Lite/fp8 free_gpu_memory_fraction: 0.25 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True context_servers: num_instances: 1 tensor_parallel_size: 2 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml index ee05d96d063..42687143c22 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml @@ -3,9 +3,8 @@ port: 8000 model: DeepSeek-V3-Lite/fp8 free_gpu_memory_fraction: 0.25 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True context_servers: num_instances: 1 tensor_parallel_size: 2 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml index 2c16cf7aefd..09aa1f6df1c 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml @@ -3,9 +3,8 @@ port: 8000 model: DeepSeek-V3-Lite/fp8 free_gpu_memory_fraction: 0.25 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True speculative_config: decoding_type: MTP num_nextn_predict_layers: 1 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml index b55acd05efb..616775cc6e2 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml @@ -8,9 +8,8 @@ context_servers: tensor_parallel_size: 2 pipeline_parallel_size: 1 enable_attention_dp: True - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True + use_cuda_graph: False + disable_overlap_scheduler: True urls: - "localhost:8001" generation_servers: @@ -18,8 +17,7 @@ generation_servers: tensor_parallel_size: 2 pipeline_parallel_size: 1 enable_attention_dp: True - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: False + use_cuda_graph: False + disable_overlap_scheduler: False urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml 
b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml index 9428e563d4a..87bcdc4ed2e 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml @@ -8,9 +8,8 @@ context_servers: tensor_parallel_size: 2 pipeline_parallel_size: 1 enable_attention_dp: true - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True + use_cuda_graph: False + disable_overlap_scheduler: True urls: - "localhost:8001" generation_servers: @@ -18,8 +17,7 @@ generation_servers: tensor_parallel_size: 2 pipeline_parallel_size: 1 enable_attention_dp: true - pytorch_backend_config: - use_cuda_graph: True - disable_overlap_scheduler: False + use_cuda_graph: True + disable_overlap_scheduler: False urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml index a97ac33cb29..ca4f4e64a2f 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml @@ -7,17 +7,15 @@ context_servers: num_instances: 1 tensor_parallel_size: 2 pipeline_parallel_size: 1 - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True + use_cuda_graph: False + disable_overlap_scheduler: True urls: - "localhost:8001" generation_servers: num_instances: 1 tensor_parallel_size: 2 pipeline_parallel_size: 1 - pytorch_backend_config: - use_cuda_graph: True - disable_overlap_scheduler: False + use_cuda_graph: True + disable_overlap_scheduler: False urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml index 99060d86b74..110179d3f08 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml @@ -12,10 +12,9 @@ context_servers: kv_cache_config: free_gpu_memory_fraction: 0.2 enable_partial_reuse: False - pytorch_backend_config: - use_cuda_graph: True - cuda_graph_batch_sizes: [1,3000] - disable_overlap_scheduler: True + use_cuda_graph: True + cuda_graph_batch_sizes: [1,3000] + disable_overlap_scheduler: True urls: - "localhost:8001" generation_servers: @@ -28,10 +27,9 @@ generation_servers: kv_cache_config: free_gpu_memory_fraction: 0.2 enable_partial_reuse: False - pytorch_backend_config: - use_cuda_graph: True - disable_overlap_scheduler: True - cuda_graph_padding_enabled: True - cuda_graph_batch_sizes: [1,4,8,16,24,32] + use_cuda_graph: True + disable_overlap_scheduler: True + cuda_graph_padding_enabled: True + cuda_graph_batch_sizes: [1,4,8,16,24,32] urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml index 30f7fc7721e..c6c51d854c0 100644 --- 
a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml @@ -12,8 +12,7 @@ generation_servers: free_gpu_memory_fraction: 0.2 enable_block_reuse: False enable_partial_reuse: False - pytorch_backend_config: - print_iter_log: True + print_iter_log: True urls: - "localhost:8002" - "localhost:8003" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml index 8ac4a59c5bb..dd95a06db0b 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml @@ -16,9 +16,8 @@ context_servers: kv_cache_config: free_gpu_memory_fraction: 0.15 enable_partial_reuse: False - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True + use_cuda_graph: False + disable_overlap_scheduler: True urls: - "localhost:8001" - "localhost:8002" @@ -35,9 +34,8 @@ generation_servers: kv_cache_config: free_gpu_memory_fraction: 0.15 enable_partial_reuse: False - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: False + use_cuda_graph: False + disable_overlap_scheduler: False urls: - "localhost:8003" - "localhost:8004" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml index 290b076255d..ebc9066d13b 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml @@ -3,9 +3,8 @@ port: 8000 model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 free_gpu_memory_fraction: 0.25 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True context_servers: num_instances: 1 tensor_parallel_size: 1 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml index e35886d8b1a..36b051dea02 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml @@ -13,9 +13,8 @@ context_servers: kv_cache_config: free_gpu_memory_fraction: 0.2 enable_partial_reuse: False - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True + use_cuda_graph: False + disable_overlap_scheduler: True urls: - "localhost:8001" generation_servers: @@ -28,8 +27,7 @@ generation_servers: kv_cache_config: free_gpu_memory_fraction: 0.2 enable_partial_reuse: False - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: False + use_cuda_graph: False + disable_overlap_scheduler: False urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py index 89c9d2ac2d7..e92a1ccbc72 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py +++ b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py @@ -11,7 +11,6 @@ from tensorrt_llm import DisaggregatedParams, SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig 
from tensorrt_llm._utils import set_mpi_comm from tensorrt_llm.llmapi import KvCacheConfig, MpiCommSession @@ -40,6 +39,7 @@ def model_path(model_name): async def run_worker(kv_cache_config, pytorch_config, model_name, rank): + assert isinstance(pytorch_config, dict) print(f"Running worker {rank}") port_name = MPI.Lookup_name('my_port') intercomm = MPI.COMM_WORLD.Connect(port_name) @@ -53,7 +53,7 @@ async def run_worker(kv_cache_config, pytorch_config, model_name, rank): auto_parallel=False, model=model_name, enable_chunked_prefill=False, - pytorch_backend_config=pytorch_config, + **pytorch_config, _mpi_session=mpi_session, kv_cache_config=kv_cache_config) print(f"LLM created") @@ -110,15 +110,15 @@ def verify_disaggregated(model, generation_overlap, enable_cuda_graph, prompt, # Context worker worker_pytorch_configs.append( - PyTorchConfig(disable_overlap_scheduler=True, - kv_cache_dtype="auto", - use_cuda_graph=enable_cuda_graph)) + dict(disable_overlap_scheduler=True, + kv_cache_dtype="auto", + use_cuda_graph=enable_cuda_graph)) # Generation worker worker_pytorch_configs.append( - PyTorchConfig(disable_overlap_scheduler=not generation_overlap, - kv_cache_dtype="auto", - use_cuda_graph=enable_cuda_graph)) + dict(disable_overlap_scheduler=not generation_overlap, + kv_cache_dtype="auto", + use_cuda_graph=enable_cuda_graph)) kv_cache_configs = [KvCacheConfig(max_tokens=2048 * 8) for _ in range(2)] model_names = [model_path(model) for _ in range(2)] @@ -231,15 +231,15 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph, # Context worker worker_pytorch_configs.append( - PyTorchConfig(disable_overlap_scheduler=True, - kv_cache_dtype="auto", - use_cuda_graph=enable_cuda_graph)) + dict(disable_overlap_scheduler=True, + kv_cache_dtype="auto", + use_cuda_graph=enable_cuda_graph)) # Generation worker worker_pytorch_configs.append( - PyTorchConfig(disable_overlap_scheduler=not generation_overlap, - kv_cache_dtype="auto", - use_cuda_graph=enable_cuda_graph)) + dict(disable_overlap_scheduler=not generation_overlap, + kv_cache_dtype="auto", + use_cuda_graph=enable_cuda_graph)) kv_cache_configs = [ KvCacheConfig(max_tokens=128, enable_block_reuse=False) diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py index 7792377649a..88766ad767d 100644 --- a/tests/integration/defs/perf/pytorch_model_config.py +++ b/tests/integration/defs/perf/pytorch_model_config.py @@ -28,18 +28,14 @@ def get_model_yaml_config(model_label: str, input_lens: list[int]) -> dict: """ base_config = { 'enable_attention_dp': True, - 'pytorch_backend_config': { - 'print_iter_log': True, - 'use_cuda_graph': True, - 'cuda_graph_padding_enabled': True, - } + 'print_iter_log': True, + 'use_cuda_graph': True, + 'cuda_graph_padding_enabled': True, } model_configs = { 'deepseek_r1-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:fp8-reqs:10-ep:4-gpus:8': { - 'pytorch_backend_config': { - 'use_cuda_graph': True, - }, + 'use_cuda_graph': True, 'speculative_config': { 'decoding_type': 'MTP', 'num_nextn_predict_layers': 3 @@ -47,9 +43,7 @@ def get_model_yaml_config(model_label: str, input_lens: list[int]) -> dict: }, 'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:nvfp4-reqs:10-ep:4-tp:8-gpus:8': { - 'pytorch_backend_config': { - 'use_cuda_graph': True, - }, + 'use_cuda_graph': True, 'speculative_config': { 'decoding_type': 'MTP', 'num_nextn_predict_layers': 3 @@ -57,25 +51,17 @@ def 
get_model_yaml_config(model_label: str, input_lens: list[int]) -> dict: }, 'deepseek_r1-bench-pytorch-float16-maxbs:128-maxnt:1127-input_output_len:1000,2000-quant:fp8-reqs:5120-con:1024-ep:8-gpus:8': { - 'pytorch_backend_config': { - 'cuda_graph_batch_sizes': [128] - }, + 'cuda_graph_batch_sizes': [128] }, 'deepseek_r1-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:nvfp4-reqs:49152-con:3072-ep:8-gpus:8': { - 'pytorch_backend_config': { - 'cuda_graph_padding_enabled': True, - 'cuda_graph_batch_sizes': - [1, 2, 4, 8, 16, 32, 64, 128, 256, 384] - }, + 'cuda_graph_padding_enabled': True, + 'cuda_graph_batch_sizes': [1, 2, 4, 8, 16, 32, 64, 128, 256, 384] }, 'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:nvfp4-reqs:49152-con:3072-ep:8-gpus:8': { - 'pytorch_backend_config': { - 'cuda_graph_padding_enabled': True, - 'cuda_graph_batch_sizes': - [1, 2, 4, 8, 16, 32, 64, 128, 256, 384] - }, + 'cuda_graph_padding_enabled': True, + 'cuda_graph_batch_sizes': [1, 2, 4, 8, 16, 32, 64, 128, 256, 384] } } # get model name from model_label diff --git a/tests/integration/defs/stress_test/stress_test.py b/tests/integration/defs/stress_test/stress_test.py index 6bb0d7e496b..347f49f08ba 100644 --- a/tests/integration/defs/stress_test/stress_test.py +++ b/tests/integration/defs/stress_test/stress_test.py @@ -510,13 +510,16 @@ def stress_test(config, extra_llm_options["enable_attention_dp"] = True if config.backend == "pytorch": - extra_llm_options["pytorch_backend_config"] = { - "use_cuda_graph": True, - "cuda_graph_padding_enabled": True, + extra_llm_options.update({ + "use_cuda_graph": + True, + "cuda_graph_padding_enabled": + True, "cuda_graph_batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384], - "print_iter_log": True, - } + "print_iter_log": + True, + }) with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as temp_file: diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index b8a0f059008..e1184664fa7 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -625,11 +625,17 @@ def temp_extra_llm_api_options_file(request): } } + pytorch_backend_config = {} if request.node.callspec.params['pytorch_backend_config']: - extra_llm_api_options_dict["pytorch_backend_config"] = { + pytorch_backend_config = { "use_cuda_graph": True, - "cuda_graph_batch_sizes": [1, 2, 3], + # trtllm-bench will set cuda_max_batch_size to + # max_batch_size, so the cuda_graph_batch_sizes is not + # needed. 
+ # "cuda_graph_batch_sizes": [1, 2, 3], } + # Flatten the pytorch_backend_config + extra_llm_api_options_dict.update(pytorch_backend_config) with open(temp_file_path, 'w') as f: yaml.dump(extra_llm_api_options_dict, f) @@ -1981,7 +1987,6 @@ def test_ptp_quickstart_bert(llm_root, llm_venv, model_name, model_path, from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM - from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.sampling_params import SamplingParams prompts = [ "Hello, my name is", @@ -1994,8 +1999,8 @@ def test_ptp_quickstart_bert(llm_root, llm_venv, model_name, model_path, sampling_param = SamplingParams(max_tokens=32, return_context_logits=True) with LLM( model=model_dir, - pytorch_backend_config=PyTorchConfig( - attn_backend=backend, disable_overlap_scheduler=True), + attn_backend=backend, + disable_overlap_scheduler=True, ) as llm: outputs = llm.generate(prompts, sampling_params=sampling_param) diff --git a/tests/unittest/_torch/modeling/test_modeling_deepseek.py b/tests/unittest/_torch/modeling/test_modeling_deepseek.py index f8b158cc4e1..ffa75e44995 100644 --- a/tests/unittest/_torch/modeling/test_modeling_deepseek.py +++ b/tests/unittest/_torch/modeling/test_modeling_deepseek.py @@ -9,7 +9,6 @@ from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.llmapi.utils import get_total_gpu_memory @@ -56,7 +55,7 @@ def test_deepseek_trtllmgen(model_name): "The president of the United States is", ] * 4 - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=True, use_cuda_graph=False, kv_cache_dtype="auto", @@ -73,7 +72,7 @@ def test_deepseek_trtllmgen(model_name): llm = LLM(model=tmp_model_dir, tensor_parallel_size=1, enable_chunked_prefill=False, - pytorch_backend_config=pytorch_config, + **pytorch_config, moe_expert_parallel_size=-1, moe_tensor_parallel_size=-1, enable_attention_dp=False, diff --git a/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py b/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py index 38b15ce1f1f..7c88900add8 100644 --- a/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py +++ b/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py @@ -3,7 +3,6 @@ from parameterized import parameterized from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.sampling_params import SamplingParams @@ -42,8 +41,7 @@ def test_llm_api(self, import_oot_code: bool): llm = LLM(model=model_dir, kv_cache_config=kv_cache_config, max_num_tokens=2048, - pytorch_backend_config=PyTorchConfig( - disable_overlap_scheduler=True)) + disable_overlap_scheduler=True) prompts = [ "Hello, my name is", diff --git a/tests/unittest/_torch/multi_gpu/test_star_attention.py b/tests/unittest/_torch/multi_gpu/test_star_attention.py index 3938b4164fa..7c387cd5ccf 100644 --- a/tests/unittest/_torch/multi_gpu/test_star_attention.py +++ b/tests/unittest/_torch/multi_gpu/test_star_attention.py @@ -7,7 +7,6 @@ from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.llmapi.utils import get_total_gpu_memory from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig @@ -61,9 +60,8 @@ def 
test_model(backend, model_name, quant, sp_size, sa_block_size, max_batch_size = 20 max_output_tokens = 128 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7) - pytorch_backend_config = PyTorchConfig( - attn_backend='FLASHINFER_STAR_ATTENTION', - disable_overlap_scheduler=True) + pytorch_backend_options = dict(attn_backend='FLASHINFER_STAR_ATTENTION', + disable_overlap_scheduler=True) llm = LLM(model=model_dir, backend=backend, @@ -72,7 +70,7 @@ def test_model(backend, model_name, quant, sp_size, sa_block_size, quant_config=quant_config, context_parallel_size=sp_size, cp_config=cp_config, - pytorch_backend_config=pytorch_backend_config, + **pytorch_backend_options, max_batch_size=max_batch_size, max_input_len=MAX_SEQ_LEN - max_output_tokens, max_seq_len=MAX_SEQ_LEN, diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py index 6f0248b86da..b73655719fa 100644 --- a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py +++ b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py @@ -9,7 +9,6 @@ from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.llmapi.utils import get_total_gpu_memory @@ -63,7 +62,7 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size): " the head of state and head of government of the", ] * 32 - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=True, use_cuda_graph=False, kv_cache_dtype="auto", @@ -78,7 +77,7 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size): llm = LLM(model=model_dir, tensor_parallel_size=tp_size, enable_chunked_prefill=False, - pytorch_backend_config=pytorch_config, + **pytorch_config, moe_expert_parallel_size=-1, moe_tensor_parallel_size=-1, enable_attention_dp=enable_attention_dp, diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py b/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py index e89c5e00197..87da3e12cc8 100644 --- a/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py +++ b/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py @@ -7,7 +7,6 @@ from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig @pytest.mark.parametrize( @@ -48,8 +47,7 @@ def test_llama4(model_name, backend, tp_size, use_cuda_graph, " the head of state and head of government of the", " solid white" ] - pytorch_config = PyTorchConfig(attn_backend=backend, - use_cuda_graph=use_cuda_graph) + pytorch_config = dict(attn_backend=backend, use_cuda_graph=use_cuda_graph) model_dir = str(llm_models_root() / "llama4-models" / model_name) llm = LLM( @@ -57,7 +55,7 @@ def test_llama4(model_name, backend, tp_size, use_cuda_graph, tensor_parallel_size=tp_size, moe_expert_parallel_size=ep_size, moe_tensor_parallel_size=tp_size // ep_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, pipeline_parallel_size=pp_size, enable_attention_dp=enable_attention_dp, ) diff --git a/tests/unittest/_torch/speculative/test_eagle3.py b/tests/unittest/_torch/speculative/test_eagle3.py index 5b3094bd3aa..b698c8a389f 100644 --- a/tests/unittest/_torch/speculative/test_eagle3.py +++ b/tests/unittest/_torch/speculative/test_eagle3.py @@ -7,7 +7,6 @@ from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import 
PyTorchConfig from tensorrt_llm.llmapi import BuildConfig, EagleDecodingConfig, KvCacheConfig sys.path.append(os.path.join(os.path.dirname(__file__), '..')) @@ -24,7 +23,7 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str): models_path = llm_models_root() - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=True, use_cuda_graph=use_cuda_graph, # Only create a single CUDA graph to prevent OOM in CI @@ -49,7 +48,7 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str): build_config = BuildConfig(max_seq_len=2048) llm_spec = LLM(model=target_model_dir, - pytorch_backend_config=pytorch_config, + **pytorch_config, kv_cache_config=kv_cache_config, speculative_config=spec_config, build_config=build_config) @@ -89,7 +88,7 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str): llm_spec.shutdown() llm_ref = LLM(model=target_model_dir, - pytorch_backend_config=pytorch_config, + **pytorch_config, kv_cache_config=kv_cache_config, build_config=build_config) diff --git a/tests/unittest/_torch/speculative/test_ngram.py b/tests/unittest/_torch/speculative/test_ngram.py index 29efa02e2b6..e996725e5f9 100644 --- a/tests/unittest/_torch/speculative/test_ngram.py +++ b/tests/unittest/_torch/speculative/test_ngram.py @@ -7,7 +7,6 @@ from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import KvCacheConfig, NGramDecodingConfig sys.path.append(os.path.join(os.path.dirname(__file__), '..')) @@ -26,7 +25,7 @@ def test_llama_ngram(use_cuda_graph: bool, attn_backend: str): models_path = llm_models_root() - pytorch_config = PyTorchConfig( + pytorch_config = dict( enable_overlap_scheduler=False, use_cuda_graph=use_cuda_graph, # Only create a single CUDA graph to prevent OOM in CI @@ -54,7 +53,7 @@ def test_llama_ngram(use_cuda_graph: bool, attn_backend: str): ) llm_spec = LLM(model=target_model_dir, max_batch_size=max_batch_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, kv_cache_config=kv_cache_config, speculative_config=spec_config) @@ -67,7 +66,7 @@ def test_llama_ngram(use_cuda_graph: bool, attn_backend: str): llm_ref = LLM(model=target_model_dir, max_batch_size=max_batch_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, kv_cache_config=kv_cache_config) results_ref = llm_ref.generate(prompts, sampling_params) diff --git a/tests/unittest/_torch/test_overlap_scheduler.py b/tests/unittest/_torch/test_overlap_scheduler.py index 7b281ef0531..18622f94cbd 100644 --- a/tests/unittest/_torch/test_overlap_scheduler.py +++ b/tests/unittest/_torch/test_overlap_scheduler.py @@ -6,7 +6,6 @@ from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig @@ -24,10 +23,9 @@ def model_path(): def create_llm(model_dir, disable_overlap_scheduler, enable_trtllm_sampler): """Create LLM with specific overlap scheduler setting""" - pytorch_config = PyTorchConfig( - use_cuda_graph=True, - disable_overlap_scheduler=disable_overlap_scheduler, - enable_trtllm_sampler=enable_trtllm_sampler) + pytorch_config = dict(use_cuda_graph=True, + disable_overlap_scheduler=disable_overlap_scheduler, + enable_trtllm_sampler=enable_trtllm_sampler) trt_kv_cache_config = TRT_KvCacheConfig(enable_block_reuse=False) @@ -36,7 +34,7 @@ def create_llm(model_dir, disable_overlap_scheduler, 
enable_trtllm_sampler): tensor_parallel_size=1, trust_remote_code=True, enable_chunked_prefill=True, - pytorch_backend_config=pytorch_config, + **pytorch_config, kv_cache_config=trt_kv_cache_config, max_num_tokens= 128 # Only one request longer than max_num_tokens is required to test chunked prefill diff --git a/tests/unittest/_torch/test_trtllm_sampler.py b/tests/unittest/_torch/test_trtllm_sampler.py index 3587c7b3b8b..bee47efddaf 100644 --- a/tests/unittest/_torch/test_trtllm_sampler.py +++ b/tests/unittest/_torch/test_trtllm_sampler.py @@ -7,7 +7,6 @@ from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig @@ -25,8 +24,7 @@ def model_path(): def create_llm(model_dir): """Create LLM with specific overlap scheduler setting""" - pytorch_config = PyTorchConfig(use_cuda_graph=True, - enable_trtllm_sampler=True) + pytorch_config = dict(use_cuda_graph=True, enable_trtllm_sampler=True) trt_kv_cache_config = TRT_KvCacheConfig(enable_block_reuse=False) @@ -35,7 +33,7 @@ def create_llm(model_dir): tensor_parallel_size=1, trust_remote_code=True, enable_chunked_prefill=True, - pytorch_backend_config=pytorch_config, + **pytorch_config, kv_cache_config=trt_kv_cache_config, max_num_tokens= 128 # Only one request longer than max_num_tokens is required to test chunked prefill diff --git a/tests/unittest/api_stability/references/llm.yaml b/tests/unittest/api_stability/references/llm.yaml index 78c822fedfb..9e9c05245f5 100644 --- a/tests/unittest/api_stability/references/llm.yaml +++ b/tests/unittest/api_stability/references/llm.yaml @@ -35,7 +35,7 @@ methods: annotation: Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig] default: null scheduler_config: - annotation: Optional[tensorrt_llm.llmapi.llm_args.SchedulerConfig] + annotation: tensorrt_llm.llmapi.llm_args.SchedulerConfig default: null extended_runtime_perf_knob_config: annotation: Optional[tensorrt_llm.llmapi.llm_args.ExtendedRuntimePerfKnobConfig] diff --git a/tests/unittest/api_stability/references_committed/llm.yaml b/tests/unittest/api_stability/references_committed/llm.yaml index cfb05b05cb2..f2c90635fbe 100644 --- a/tests/unittest/api_stability/references_committed/llm.yaml +++ b/tests/unittest/api_stability/references_committed/llm.yaml @@ -103,7 +103,7 @@ methods: annotation: bool default: false kv_cache_config: - annotation: Optional[tensorrt_llm.llmapi.llm_args.KvCacheConfig] + annotation: tensorrt_llm.llmapi.llm_args.KvCacheConfig default: null return_annotation: None generate: diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py index a2278c0d996..aeb46a8a0b0 100644 --- a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py +++ b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py @@ -25,9 +25,7 @@ def temp_extra_llm_api_options_file(request): try: extra_llm_api_options_dict = { "guided_decoding_backend": "xgrammar", - "pytorch_backend_config": { - "disable_overlap_scheduler": True, - } + "disable_overlap_scheduler": True, } with open(temp_file_path, 'w') as f: diff --git a/tests/unittest/llmapi/apps/_test_openai_metrics.py b/tests/unittest/llmapi/apps/_test_openai_metrics.py index e79c34da311..1b075b67565 100755 --- a/tests/unittest/llmapi/apps/_test_openai_metrics.py +++ b/tests/unittest/llmapi/apps/_test_openai_metrics.py @@ -5,7 +5,6 @@ from transformers import 
AutoTokenizer from tensorrt_llm._torch.llm import LLM as PyTorchLLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig from tensorrt_llm.serve.openai_server import OpenAIServer @@ -23,8 +22,7 @@ def client(): build_config=build_config, kv_cache_config=KvCacheConfig(), backend="pytorch", - pytorch_backend_config=PyTorchConfig( - enable_iter_perf_stats=True, )) + enable_iter_perf_stats=True) hf_tokenizer = AutoTokenizer.from_pretrained(llama_model_path) app_instance = OpenAIServer(llm, diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index aaa9771e1a9..97e93b92dee 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -1879,11 +1879,10 @@ def llm_get_stats_test_harness(tp_size: int = 1, if pytorch_backend: from tensorrt_llm._torch import LLM as LLM_torch - from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig - llm_args_extra["pytorch_backend_config"] = PyTorchConfig( - enable_iter_perf_stats=True, - enable_iter_req_stats=enable_iter_req_stats, - disable_overlap_scheduler=not use_overlap) + llm_args_extra.update( + dict(enable_iter_perf_stats=True, + enable_iter_req_stats=enable_iter_req_stats, + disable_overlap_scheduler=not use_overlap)) LLM_CLASS = LLM_torch else: LLM_CLASS = LLM @@ -1949,11 +1948,10 @@ def llm_get_stats_async_test_harness(tp_size: int = 1, if pytorch_backend: from tensorrt_llm._torch import LLM as LLM_torch - from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig - llm_args_extra["pytorch_backend_config"] = PyTorchConfig( - enable_iter_perf_stats=True, - enable_iter_req_stats=enable_iter_req_stats, - disable_overlap_scheduler=not use_overlap) + llm_args_extra.update( + dict(enable_iter_perf_stats=True, + enable_iter_req_stats=enable_iter_req_stats, + disable_overlap_scheduler=not use_overlap)) LLM_CLASS = LLM_torch else: LLM_CLASS = LLM diff --git a/tests/unittest/llmapi/test_llm_args.py b/tests/unittest/llmapi/test_llm_args.py index 99f5239e331..af26293fd8f 100644 --- a/tests/unittest/llmapi/test_llm_args.py +++ b/tests/unittest/llmapi/test_llm_args.py @@ -1,10 +1,10 @@ import tempfile +import pytest import yaml import tensorrt_llm.bindings.executor as tle from tensorrt_llm.llmapi.llm_args import * -from tensorrt_llm.llmapi.llm_utils import * from .test_llm import llama_model_path @@ -50,7 +50,7 @@ def test_update_llm_args_with_extra_dict_with_speculative_config(): f.seek(0) dict_content = yaml.safe_load(f) - llm_args = LlmArgs(llama_model_path) + llm_args = LlmArgs(model=llama_model_path) llm_args_dict = update_llm_args_with_extra_dict(llm_args.to_dict(), dict_content) llm_args = LlmArgs(**llm_args_dict) @@ -173,3 +173,46 @@ def test_PeftCacheConfig_declaration(): assert pybind_config.device_cache_percent == 0.5 assert pybind_config.host_cache_size == 1024 assert pybind_config.lora_prefetch_dir == "." 
+ + +class TestTorchLlmArgsCudaGraphSettings: + + def test_cuda_graph_batch_sizes_case_0(self): + # set both cuda_graph_batch_sizes and cuda_graph_max_batch_size, and + # cuda_graph_batch_sizes is not equal to generated + with pytest.raises(ValueError): + TorchLlmArgs(model=llama_model_path, + use_cuda_graph=True, + cuda_graph_batch_sizes=[1, 2, 3], + cuda_graph_max_batch_size=128) + + def test_cuda_graph_batch_sizes_case_0_1(self): + # set both cuda_graph_batch_sizes and cuda_graph_max_batch_size, and + # cuda_graph_batch_sizes is equal to generated + args = TorchLlmArgs(model=llama_model_path, + use_cuda_graph=True, + cuda_graph_padding_enabled=True, + cuda_graph_batch_sizes=TorchLlmArgs. + _generate_cuda_graph_batch_sizes(128, True), + cuda_graph_max_batch_size=128) + assert args.cuda_graph_batch_sizes == TorchLlmArgs._generate_cuda_graph_batch_sizes( + 128, True) + assert args.cuda_graph_max_batch_size == 128 + + def test_cuda_graph_batch_sizes_case_1(self): + # set cuda_graph_batch_sizes only + args = TorchLlmArgs(model=llama_model_path, + use_cuda_graph=True, + cuda_graph_padding_enabled=True, + cuda_graph_batch_sizes=[1, 2, 4]) + assert args.cuda_graph_batch_sizes == [1, 2, 4] + + def test_cuda_graph_batch_sizes_case_2(self): + # set cuda_graph_max_batch_size only + args = TorchLlmArgs(model=llama_model_path, + use_cuda_graph=True, + cuda_graph_padding_enabled=True, + cuda_graph_max_batch_size=128) + assert args.cuda_graph_batch_sizes == TorchLlmArgs._generate_cuda_graph_batch_sizes( + 128, True) + assert args.cuda_graph_max_batch_size == 128 diff --git a/tests/unittest/llmapi/test_llm_kv_cache_events.py b/tests/unittest/llmapi/test_llm_kv_cache_events.py index a98529af8fc..b445bd1990b 100644 --- a/tests/unittest/llmapi/test_llm_kv_cache_events.py +++ b/tests/unittest/llmapi/test_llm_kv_cache_events.py @@ -2,7 +2,6 @@ import time import tensorrt_llm -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequest from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager from tensorrt_llm._utils import KVCacheEventSerializer @@ -48,7 +47,7 @@ def create_llm(tensor_parallel_size=1): return LLM(model=llama_model_path, tensor_parallel_size=tensor_parallel_size, kv_cache_config=global_kvcache_config, - pytorch_backend_config=PyTorchConfig(autotuner_enabled=False), + autotuner_enabled=False, backend="pytorch") diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index 92d4b1f2e80..03861d266d5 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -90,10 +90,9 @@ def test_llm_reward_model(): tokenized_input = tokenizer(prompts, return_tensors="pt")["input_ids"] from tensorrt_llm._torch import LLM as LLM_torch - from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig llm = LLM_torch(model=rm_model_path, - pytorch_backend_config=PyTorchConfig( - attn_backend="VANILLA", disable_overlap_scheduler=True)) + attn_backend="VANILLA", + disable_overlap_scheduler=True) sampling_params = SamplingParams(return_context_logits=True) diff --git a/triton_backend/all_models/llmapi/tensorrt_llm/1/model.yaml b/triton_backend/all_models/llmapi/tensorrt_llm/1/model.yaml index 639e84a614e..df29e937bfc 100644 --- a/triton_backend/all_models/llmapi/tensorrt_llm/1/model.yaml +++ b/triton_backend/all_models/llmapi/tensorrt_llm/1/model.yaml @@ -8,8 +8,7 @@ backend: "pytorch" tensor_parallel_size: 1 pipeline_parallel_size: 1 
-pytorch_backend_config: - use_cuda_graph: False +use_cuda_graph: False # ======= Triton Server Configurations ======= # Triton Configurations to override the default values in config.pbtxt
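
The hunks above all apply the same migration: options that used to be wrapped in `pytorch_backend_config` (a `PyTorchConfig` object in Python, a nested `pytorch_backend_config:` mapping in YAML) are now passed as flat, top-level arguments. For readers updating their own scripts, the sketch below shows the migrated call pattern. It is a minimal illustration assembled from the tests in this patch; the model name, KV-cache fraction, and option values are illustrative only, not prescriptive.

```python
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Before this change:
#   LLM(..., pytorch_backend_config=PyTorchConfig(disable_overlap_scheduler=True,
#                                                 use_cuda_graph=False))
# After this change, the same options are plain keyword arguments on LLM.
llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # illustrative model id
    kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.25),
    disable_overlap_scheduler=True,
    use_cuda_graph=False,
)

outputs = llm.generate(["Hello, my name is"],
                       sampling_params=SamplingParams(max_tokens=32))
for output in outputs:
    print(output.outputs[0].text)
```

The YAML files change in the same way: keys previously nested under `pytorch_backend_config:` (for example `use_cuda_graph`, `disable_overlap_scheduler`, `cuda_graph_batch_sizes`, `print_iter_log`) now sit at the top level of the extra-LLM-API and disaggregated config files, as the config diffs above show.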