diff --git a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md index 7f90c391c00..d510209b4a5 100644 --- a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md +++ b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md @@ -134,9 +134,8 @@ To do the benchmark, run the following command: YOUR_DATA_PATH= cat >./extra-llm-api-config.yml<./extra-llm-api-config.yml < cat >./extra-llm-api-config.yml<./extra-llm-api-config.yml< cat >./extra-llm-api-config.yml< cat >./extra-llm-api-config.yml<, backend="autodeploy", build_config=build_config, - pytorch_backend_config=ad_config, + auto_deploy_config=ad_config, tensor_parallel_size=, ) diff --git a/examples/auto_deploy/build_and_run_ad.py b/examples/auto_deploy/build_and_run_ad.py index f5c9f1a6d11..1744cf1f4a7 100644 --- a/examples/auto_deploy/build_and_run_ad.py +++ b/examples/auto_deploy/build_and_run_ad.py @@ -73,7 +73,7 @@ def build_llm_from_config(config: SimpleConfig) -> LLM: model=factory.model, backend="autodeploy", build_config=build_config, - pytorch_backend_config=ad_config, + auto_deploy_config=ad_config, tensor_parallel_size=config.world_size, tokenizer=factory.init_tokenizer() if config.customize_tokenizer else None, ) diff --git a/examples/disaggregated/README.md b/examples/disaggregated/README.md index 70cc78f3619..fbba3b42748 100644 --- a/examples/disaggregated/README.md +++ b/examples/disaggregated/README.md @@ -9,7 +9,7 @@ You can use multiple `trtllm-serve` commands to launch the context and generatio for disaggregated serving. For example, you could launch two context servers and one generation servers as follows: ``` -echo -e "pytorch_backend_config:\n disable_overlap_scheduler: True\ncache_transceiver_config:\n max_num_tokens: 2048" > context_extra-llm-api-config.yml +echo -e "disable_overlap_scheduler: True\ncache_transceiver_config:\nmax_num_tokens: 2048" > context_extra-llm-api-config.yml echo -e "cache_transceiver_config:\n max_num_tokens: 2048" > gen_extra-llm-api-config.yml export TRTLLM_USE_UCX_KVCACHE=1 @@ -63,9 +63,8 @@ hostname: localhost port: 8000 model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True context_servers: num_instances: 1 tensor_parallel_size: 1 diff --git a/examples/disaggregated/disagg_config.yaml b/examples/disaggregated/disagg_config.yaml index a199a594522..eedb81f34b9 100644 --- a/examples/disaggregated/disagg_config.yaml +++ b/examples/disaggregated/disagg_config.yaml @@ -3,9 +3,8 @@ port: 8000 model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 free_gpu_memory_fraction: 0.25 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True context_servers: num_instances: 1 tensor_parallel_size: 1 diff --git a/examples/llm-api/llm_inference_kv_events.py b/examples/llm-api/llm_inference_kv_events.py index 827427e538b..009b7e0dee7 100644 --- a/examples/llm-api/llm_inference_kv_events.py +++ b/examples/llm-api/llm_inference_kv_events.py @@ -1,17 +1,15 @@ ### Get KV Cache Events from tensorrt_llm import LLM, SamplingParams -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import KvCacheConfig def main(): - pytorch_config = PyTorchConfig(autotuner_enabled=False, - kv_cache_dtype='auto') llm = 
LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", tensor_parallel_size=2, - pytorch_backend_config=pytorch_config, + autotuner_enabled=False, + kv_cache_dtype='auto', kv_cache_config=KvCacheConfig(enable_block_reuse=True, event_buffer_max_size=1024), backend="pytorch") diff --git a/examples/llm-api/llm_mgmn_trtllm_bench.sh b/examples/llm-api/llm_mgmn_trtllm_bench.sh index 21a0ee48d9b..ab556cf470a 100644 --- a/examples/llm-api/llm_mgmn_trtllm_bench.sh +++ b/examples/llm-api/llm_mgmn_trtllm_bench.sh @@ -74,10 +74,9 @@ srun -l \ # This is optional cat > /tmp/pytorch_extra_args.txt << EOF -pytorch_backend_config: - use_cuda_graph: false - cuda_graph_padding_enabled: false - print_iter_log: true +use_cuda_graph: false +cuda_graph_padding_enabled: false +print_iter_log: true enable_attention_dp: false EOF diff --git a/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py b/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py index 89e4a293db8..9faff894a4f 100644 --- a/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py +++ b/examples/llm-eval/lm-eval-harness/lm_eval_tensorrt_llm.py @@ -100,7 +100,6 @@ def __init__( if hasattr(PyTorchConfig, "moe_backend"): pytorch_config_params["moe_backend"] = self.moe_backend print(f"Info: moe_backend is set to {self.moe_backend}") - pytorch_config = PyTorchConfig(**pytorch_config_params) # stop words not currently supported by torch backend self.use_stop_words = False @@ -110,7 +109,7 @@ def __init__( tensor_parallel_size=tp, trust_remote_code=trust_remote_code, enable_chunked_prefill=False, - pytorch_backend_config=pytorch_config, + **pytorch_config_params, tokenizer=self.tokenizer, kv_cache_config=trt_kv_cache_config, moe_expert_parallel_size=self.moe_expert_parallel_size, diff --git a/examples/models/core/deepseek_v3/README.md b/examples/models/core/deepseek_v3/README.md index 04989e97fb4..71620d54549 100644 --- a/examples/models/core/deepseek_v3/README.md +++ b/examples/models/core/deepseek_v3/README.md @@ -140,10 +140,9 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \ --num-requests 24 > /tmp/benchmarking_64k.txt cat < /tmp/extra-llm-api-config.yml -pytorch_backend_config: - use_cuda_graph: true - cuda_graph_padding_enabled: true - cuda_graph_batch_sizes: [1, 4, 8, 12] +use_cuda_graph: true +cuda_graph_padding_enabled: true +cuda_graph_batch_sizes: [1, 4, 8, 12] EOF trtllm-bench -m deepseek-ai/DeepSeek-R1 --model_path ${DS_R1_NVFP4_MODEL_PATH} throughput \ @@ -168,11 +167,10 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \ --num-requests 4 > /tmp/benchmarking_128k.txt cat < /tmp/extra-llm-api-config.yml -pytorch_backend_config: - use_cuda_graph: true - cuda_graph_padding_enabled: true - cuda_graph_batch_sizes: [1, 2] - moe_max_num_tokens: 16384 +use_cuda_graph: true +cuda_graph_padding_enabled: true +cuda_graph_batch_sizes: [1, 2] +moe_max_num_tokens: 16384 EOF trtllm-bench -m deepseek-ai/DeepSeek-R1 --model_path ${DS_R1_NVFP4_MODEL_PATH} throughput \ @@ -193,8 +191,7 @@ Evaluate the model accuracy using `trtllm-eval`. 1. 
(Optional) Prepare an advanced configuration file: ```bash cat >./extra-llm-api-config.yml <./extra-llm-api-config.yml < /tmp/dataset.txt cat >/path/to/TensorRT-LLM/extra-llm-api-config.yml <./extra-llm-api-config.yml < None: - if isinstance(self.load_format, LoadFormat): - return - load_format = self.load_format.upper() - if load_format not in LoadFormat.__members__: - raise NotImplementedError(f"Invalid LoadFormat: {self.load_format}") - self.load_format = LoadFormat[load_format] - - def __post_init__(self) -> None: - if self.torch_compile_enabled and self.torch_compile_piecewise_cuda_graph: - assert self.torch_compile_fullgraph, "Fullgraph must be enabled for piecewise CUDA graph." - - if self.cuda_graph_batch_sizes is not None: - assert self.cuda_graph_max_batch_size == 0, ( - "Please don't set both cuda_graph_batch_sizes " - "and cuda_graph_max_batch_size.") - self.cuda_graph_batch_sizes = sorted(self.cuda_graph_batch_sizes) - else: - self.cuda_graph_max_batch_size = self.cuda_graph_max_batch_size or 128 - if self.cuda_graph_padding_enabled: - self.cuda_graph_batch_sizes = [1, 2, 4] + [ - i * 8 for i in range(1, 17) - ] - else: - self.cuda_graph_batch_sizes = list(range(1, 32)) + [32, 64, 128] - self.cuda_graph_batch_sizes += [ - 2**i for i in range( - 8, math.floor(math.log(self.cuda_graph_max_batch_size, 2))) - ] - self.cuda_graph_batch_sizes = [ - size for size in self.cuda_graph_batch_sizes - if size <= self.cuda_graph_max_batch_size - ] - if self.cuda_graph_max_batch_size != self.cuda_graph_batch_sizes[ - -1]: - self.cuda_graph_batch_sizes.append( - self.cuda_graph_max_batch_size) - - if isinstance(self.moe_load_balancer, str): - assert os.path.exists(self.moe_load_balancer) - if self.moe_load_balancer.endswith(".json"): - with open(self.moe_load_balancer) as f: - self.moe_load_balancer = json.load(f) - elif self.moe_load_balancer.endswith((".yaml", ".yml")): - with open(self.moe_load_balancer) as f: - self.moe_load_balancer = yaml.safe_load(f) - else: - raise ValueError( - f"Unsupported moe load balancer config file: {self.moe_load_balancer}" - ) - if isinstance(self.moe_load_balancer, dict): - self.moe_load_balancer = MoeLoadBalancerConfig( - **self.moe_load_balancer) - - self._convert_load_format() - EXETENDED_EXECUTOR_CONFIG_FIELDS = [ 'backend', diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 64437f7e78b..8dcfe25dfbb 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -310,11 +310,6 @@ def __init__( ) attn_backend = pytorch_backend_config.attn_backend - # _convert_load_format should already be called by - # __post_init__, but call it again just in case. - # The config object is not a frozen data class, so it's - # possible the user changed it after initialization. 
- pytorch_backend_config._convert_load_format() self.model = self._load_model( model_path, mapping=self.mapping, diff --git a/tensorrt_llm/bench/benchmark/throughput.py b/tensorrt_llm/bench/benchmark/throughput.py index cc474ec5296..71797268de7 100755 --- a/tensorrt_llm/bench/benchmark/throughput.py +++ b/tensorrt_llm/bench/benchmark/throughput.py @@ -340,8 +340,8 @@ def throughput_command( kwargs = kwargs | runtime_config.get_llm_args() kwargs['backend'] = backend - if "pytorch_backend_config" in kwargs and iteration_log is not None: - kwargs["pytorch_backend_config"].enable_iter_perf_stats = True + if backend == "pytorch": + kwargs["enable_iter_perf_stats"] = True if runtime_config.backend == 'pytorch': llm = PyTorchLLM(**kwargs) diff --git a/tensorrt_llm/bench/benchmark/utils/general.py b/tensorrt_llm/bench/benchmark/utils/general.py index 3e7ed03cd21..202d8662d9b 100755 --- a/tensorrt_llm/bench/benchmark/utils/general.py +++ b/tensorrt_llm/bench/benchmark/utils/general.py @@ -89,10 +89,8 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str, if extra_llm_api_options: with open(extra_llm_api_options, 'r') as f: llm_args_dict = yaml.safe_load(f) - if "pytorch_backend_config" in llm_args_dict: - if "kv_cache_dtype" in llm_args_dict["pytorch_backend_config"]: - kv_cache_dtype = llm_args_dict["pytorch_backend_config"][ - "kv_cache_dtype"] + if "kv_cache_dtype" in llm_args_dict: + kv_cache_dtype = llm_args_dict["kv_cache_dtype"] enable_chunked_prefill = llm_args_dict.get("enable_chunked_prefill", enable_chunked_prefill) diff --git a/tensorrt_llm/bench/dataclasses/configuration.py b/tensorrt_llm/bench/dataclasses/configuration.py index 41143abea42..e4576ee487e 100755 --- a/tensorrt_llm/bench/dataclasses/configuration.py +++ b/tensorrt_llm/bench/dataclasses/configuration.py @@ -81,8 +81,7 @@ def get_llm_args(self) -> Dict: } if self.backend in backend_config_map: - llm_args["pytorch_backend_config"] = backend_config_map[ - self.backend]() + llm_args.update(backend_config_map[self.backend]()) return update_llm_args_with_extra_options(llm_args, self.extra_llm_api_options) @@ -109,7 +108,7 @@ def get_perf_config(self) -> ExtendedRuntimePerfKnobConfig: return config def get_pytorch_perf_config(self) -> PyTorchConfig: - return PyTorchConfig(**self.pytorch_config) + return self.pytorch_config def get_autodeploy_perf_config(self) -> AutoDeployConfig: ad_config = AutoDeployConfig(**self.pytorch_config) diff --git a/tensorrt_llm/bench/dataclasses/reporting.py b/tensorrt_llm/bench/dataclasses/reporting.py index 995b8845041..699c61654e8 100755 --- a/tensorrt_llm/bench/dataclasses/reporting.py +++ b/tensorrt_llm/bench/dataclasses/reporting.py @@ -264,9 +264,8 @@ def get_statistics_dict(self) -> Dict[str, Any]: model = self.rt_cfg.model_path or self.rt_cfg.model model_config = ModelConfig.from_pretrained(model, trust_remote_code=True) - validate_and_set_kv_cache_quant( - model_config, - self.kwargs["pytorch_backend_config"].kv_cache_dtype) + validate_and_set_kv_cache_quant(model_config, + self.kwargs["kv_cache_dtype"]) stats_dict["engine"] |= { "backend": diff --git a/tensorrt_llm/commands/eval.py b/tensorrt_llm/commands/eval.py index 632a956b6ff..8f875cdabda 100644 --- a/tensorrt_llm/commands/eval.py +++ b/tensorrt_llm/commands/eval.py @@ -19,7 +19,6 @@ import tensorrt_llm.profiler as profiler from .._torch.llm import LLM as PyTorchLLM -from .._torch.pyexecutor.config import PyTorchConfig from ..evaluate import (GSM8K, MMLU, CnnDailymail, GPQADiamond, GPQAExtended, GPQAMain) from 
..llmapi import LLM, BuildConfig, KvCacheConfig @@ -113,9 +112,6 @@ def main(ctx, model: str, tokenizer: Optional[str], log_level: str, if backend == "tensorrt": backend = None - pytorch_backend_config = None - if backend == "pytorch": - pytorch_backend_config = PyTorchConfig() llm_args = { "model": model, @@ -128,7 +124,6 @@ def main(ctx, model: str, tokenizer: Optional[str], log_level: str, "build_config": build_config, "kv_cache_config": kv_cache_config, "backend": backend, - "pytorch_backend_config": pytorch_backend_config, } if extra_llm_api_options is not None: diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py index 28bfa7498c1..f622cd5f60e 100644 --- a/tensorrt_llm/commands/serve.py +++ b/tensorrt_llm/commands/serve.py @@ -8,7 +8,6 @@ from torch.cuda import device_count from tensorrt_llm._torch.llm import LLM as PyTorchLLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import (LLM, BuildConfig, CapacitySchedulerPolicy, DynamicBatchConfig, KvCacheConfig, SchedulerConfig) @@ -48,7 +47,6 @@ def get_llm_args(model: str, kv_cache_config = KvCacheConfig( free_gpu_memory_fraction=free_gpu_memory_fraction) - pytorch_backend_config = PyTorchConfig() if backend == "pytorch" else None dynamic_batch_config = DynamicBatchConfig( enable_batch_size_tuning=True, enable_max_num_tokens_tuning=False, @@ -74,7 +72,6 @@ def get_llm_args(model: str, "max_seq_len": max_seq_len, "kv_cache_config": kv_cache_config, "backend": backend if backend == "pytorch" else None, - "pytorch_backend_config": pytorch_backend_config, "_num_postprocess_workers": num_postprocess_workers, "_postprocess_tokenizer_dir": tokenizer or model, "_reasoning_parser": reasoning_parser, diff --git a/tensorrt_llm/executor/serialization.py b/tensorrt_llm/executor/serialization.py index dbd0b2bbf11..54df0b9bb84 100644 --- a/tensorrt_llm/executor/serialization.py +++ b/tensorrt_llm/executor/serialization.py @@ -8,8 +8,10 @@ # it is only needed in a single instance the class can be added at runtime # using register_approved_ipc_class. 
BASE_ZMQ_CLASSES = { - "builtins": ["Exception", "ValueError" - ], # each Exception Error class needs to be added explicitly + "builtins": [ + "Exception", "ValueError", "NotImplementedError", "AttributeError", + "AssertionError" + ], # each Exception Error class needs to be added explicitly "collections": ["OrderedDict"], "datetime": ["timedelta"], "pathlib": ["PosixPath"], @@ -57,6 +59,7 @@ "KvCacheRetentionConfig.TokenRangeRetentionConfig", "PeftCacheConfig", "SchedulerConfig", "DynamicBatchConfig" ], + "tensorrt_llm._torch.pyexecutor.config": ["PyTorchConfig"], "tensorrt_llm.builder": ["BuildConfig"], "tensorrt_llm.disaggregated_params": ["DisaggregatedParams"], "tensorrt_llm.executor.postproc_worker": [ @@ -77,7 +80,7 @@ "tensorrt_llm.llmapi.llm_args": [ "_ModelFormatKind", "_ParallelConfig", "CalibConfig", "CapacitySchedulerPolicy", "KvCacheConfig", "LookaheadDecodingConfig", - "TrtLlmArgs", "SchedulerConfig" + "TrtLlmArgs", "SchedulerConfig", "LoadFormat" ], "tensorrt_llm.llmapi.mpi_session": ["RemoteTask"], "tensorrt_llm.llmapi.llm_utils": diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index 9e7e8dd313a..476ea580079 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -112,9 +112,6 @@ def __init__(self, self._executor_cls = kwargs.pop("executor_cls", GenerationExecutor) try: - self.pytorch_backend_config = kwargs.pop('pytorch_backend_config', - None) - llm_args_cls = TorchLlmArgs if kwargs.get( 'backend', None) == 'pytorch' else TrtLlmArgs @@ -625,7 +622,8 @@ def _build_model(self): update_executor_config( executor_config, backend=self.args.backend, - pytorch_backend_config=self.pytorch_backend_config, + pytorch_backend_config=self.args.get_pytorch_backend_config() + if self.args.backend == "pytorch" else None, mapping=self.args.parallel_config.to_mapping(), build_config=self.args.build_config if self._on_trt_backend else None, diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index c8d5033ad41..6d769a12ee3 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -1,14 +1,17 @@ import json import math +import os from abc import ABC, abstractmethod -from dataclasses import dataclass, field, fields +from dataclasses import dataclass, field from enum import Enum, EnumMeta from pathlib import Path -from typing import Any, ClassVar, Dict, List, Literal, Optional, Union +from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Literal, Optional, + Union) import torch import yaml -from pydantic import BaseModel, Field, PrivateAttr, validator +from pydantic import (BaseModel, Field, PrivateAttr, field_validator, + model_validator) from strenum import StrEnum from transformers import PreTrainedTokenizerBase @@ -18,6 +21,9 @@ from .._utils import mpi_rank from ..auto_parallel import AutoParallelConfig, infer_cluster_config +if TYPE_CHECKING: + from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig + # yapf: disable # isort: off from ..bindings.executor import ( @@ -36,6 +42,8 @@ PeftCacheConfig as _PeftCacheConfig, SchedulerConfig as _SchedulerConfig) # isort: skip # isort: on +from transformers import PreTrainedTokenizerBase + # yapf: enable from ..builder import BuildConfig, EngineConfig from ..logger import logger @@ -549,7 +557,9 @@ class LookaheadDecodingConfig(DecodingBaseConfig, PybindMirror): get_default_lookahead_decoding_verification_set(), description="Number of NGrams in verification branch per step.") - @validator('max_window_size', 'max_ngram_size', 
'max_verification_set_size') + @field_validator('max_window_size', 'max_ngram_size', + 'max_verification_set_size') + @classmethod def validate_positive_values(cls, v): if v <= 0: raise ValueError(f"Value must be positive, got {v}") @@ -848,8 +858,8 @@ class BaseLlmArgs(BaseModel): default=None, description="Quantization config.") # Several options from ExecutorConfig, expanded here for less hierarchy - kv_cache_config: Optional[KvCacheConfig] = Field( - default=None, description="KV cache config.") + kv_cache_config: KvCacheConfig = Field(default_factory=KvCacheConfig, + description="KV cache config.") enable_chunked_prefill: bool = Field(default=False, description="Enable chunked prefill.") @@ -876,8 +886,8 @@ class BaseLlmArgs(BaseModel): peft_cache_config: Optional[PeftCacheConfig] = Field( default=None, description="PEFT cache config.") - scheduler_config: Optional[SchedulerConfig] = Field( - default=None, description="Scheduler config.") + scheduler_config: SchedulerConfig = Field(default_factory=SchedulerConfig, + description="Scheduler config.") cache_transceiver_config: Optional[CacheTransceiverConfig] = Field( default=None, description="Cache transceiver config.") @@ -991,10 +1001,6 @@ def model_post_init(self, __context: Any): enable_attention_dp=self.enable_attention_dp, cp_config=self.cp_config) - self.kv_cache_config = self.kv_cache_config or KvCacheConfig() - - self.scheduler_config = self.scheduler_config or SchedulerConfig() - @classmethod def from_kwargs(cls, **kwargs: Any) -> "BaseLlmArgs": """Create `LlmArgs` instance from kwargs. @@ -1016,8 +1022,7 @@ def to_dict(self) -> dict: Returns: dict: The dict that contains all fields of the `LlmArgs` instance. """ - return dict( - (field.name, getattr(self, field.name)) for field in fields(self)) + return self.model_dump() @staticmethod def _maybe_update_config_for_consistency( @@ -1444,6 +1449,12 @@ def model_post_init(self, __context): indent=' ' * 4) +class LoadFormat(Enum): + AUTO = 0 + # Initialize all weights randomly. + DUMMY = 1 + + class TorchLlmArgs(BaseLlmArgs): # Just a dummy BuildConfig to allow code reuse with the TrtLlmArgs @@ -1453,12 +1464,275 @@ class TorchLlmArgs(BaseLlmArgs): exclude_from_json=True, json_schema_extra={"type": f"Optional[{get_type_repr(BuildConfig)}]"}) + # PyTorch backend specific configurations + + use_cuda_graph: bool = Field( + default=False, + description= + "If true, use CUDA graphs for decoding. CUDA graphs are only created for the batch sizes in cuda_graph_batch_sizes, and are enabled for batches that consist of decoding requests *only* (the reason is that it's hard to capture a single graph with prefill requests since the input shapes are a function of the sequence lengths). Note that each CUDA graph can use up to 200 MB of extra memory." + ) + + cuda_graph_batch_sizes: Optional[List[int]] = Field( + default=None, + description="List of batch sizes to create CUDA graphs for.") + + cuda_graph_max_batch_size: int = Field( + default=0, description="Maximum batch size for CUDA graphs.") + + cuda_graph_padding_enabled: bool = Field( + default=False, + description= + "If true, batches are rounded up to the nearest cuda_graph_batch_size. This is usually a net win for performance." + ) + + disable_overlap_scheduler: bool = Field( + default=False, description="Disable the overlap scheduler.") + + moe_max_num_tokens: Optional[int] = Field( + default=None, + description= + "If set, at most moe_max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time. 
If the number of tokens exceeds moe_max_num_tokens, the input tensors will be split into chunks and a for loop will be used." + ) + + moe_load_balancer: Optional[Union[object, dict, str]] = Field( + default=None, + description="Configuration for MoE load balancing.", + json_schema_extra={"type": f"Union[MoeLoadBalancerConfig, dict, str]"}) + + attn_backend: str = Field(default='TRTLLM', + description="Attention backend to use.") + + moe_backend: str = Field(default='CUTLASS', + description="MoE backend to use.") + + mixed_sampler: bool = Field( + default=False, + description= + "If true, will iterate over sampling_params of each request and use the corresponding sampling strategy, e.g. top-k, top-p, etc." + ) + + enable_trtllm_sampler: bool = Field( + default=False, + description= + "If true, will use the TRTLLM sampler instead of the PyTorch sampler. The TRTLLM sampler has a wide coverage of sampling strategies." + ) + + kv_cache_dtype: str = Field(default="auto", + description="Data type for KV cache.") + + use_kv_cache: bool = Field(default=True, + description="Whether to use KV cache.") + + enable_iter_perf_stats: bool = Field( + default=False, description="Enable iteration performance statistics.") + + enable_iter_req_stats: bool = Field( + default=False, + description= + "If true, enables per request stats per iteration. Must also set enable_iter_perf_stats to true to get request stats." + ) + + print_iter_log: bool = Field(default=False, + description="Print iteration logs.") + + torch_compile_enabled: bool = Field( + default=False, description="Enable torch.compile optimization.") + + torch_compile_fullgraph: bool = Field( + default=True, + description="Enable full graph compilation in torch.compile.") + + torch_compile_inductor_enabled: bool = Field( + default=False, description="Enable inductor backend in torch.compile.") + + torch_compile_piecewise_cuda_graph: bool = Field( + default=False, + description="Enable piecewise CUDA graph in torch.compile.") + + torch_compile_enable_userbuffers: bool = Field( + default=True, + description= + "When torch compile is enabled, userbuffers is enabled by default.") + + autotuner_enabled: bool = Field( + default=True, + description="Enable autotuner only when torch compile is enabled.") + + enable_layerwise_nvtx_marker: bool = Field( + default=False, description="If true, enable layerwise nvtx marker.") + + auto_deploy_config: Optional[object] = Field( + default=None, + description="Auto deploy config.", + exclude_from_json=True, + json_schema_extra={"type": f"Optional[AutoDeployConfig]"}) + + load_format: Union[str, LoadFormat] = Field( + default=LoadFormat.AUTO, + description= + "How to load the model weights. By default, detect the weight type from the model checkpoint." + ) + + @field_validator('load_format', mode='before') + @classmethod + def convert_load_format(cls, v): + if isinstance(v, LoadFormat): + return v + load_format = v.upper() + if load_format not in LoadFormat.__members__: + raise ValueError(f"Invalid LoadFormat: {v}") + return LoadFormat[load_format] + + # Extra resource managers to use in addition to the KV cache manager. + # Each manager's prepare_resources method is called before the forward pass, + # and update_resources() is called after the pass finishes. free_resources() + # is called when a request finishes. The KV cache manager is guaranteed to + # be invoked after all of these extra managers in all stages. 
+ _extra_resource_managers: Dict[str, + object] = PrivateAttr(default_factory=dict, ) + + @property + def extra_resource_managers(self) -> Dict[str, object]: + return self._extra_resource_managers + + @extra_resource_managers.setter + def extra_resource_managers(self, value: Dict[str, object]) -> None: + self._extra_resource_managers = value + @print_traceback_on_error def model_post_init(self, __context): - super().model_post_init(__context) + from .._torch.model_config import MoeLoadBalancerConfig + super().model_post_init(__context) self.model_format = _ModelFormatKind.HF + if isinstance(self.moe_load_balancer, str): + assert os.path.exists(self.moe_load_balancer) + if self.moe_load_balancer.endswith(".json"): + with open(self.moe_load_balancer) as f: + self.moe_load_balancer = json.load(f) + elif self.moe_load_balancer.endswith((".yaml", ".yml")): + with open(self.moe_load_balancer) as f: + self.moe_load_balancer = yaml.safe_load(f) + else: + raise ValueError( + f"Unsupported moe load balancer config file: {self.moe_load_balancer}" + ) + if isinstance(self.moe_load_balancer, dict): + self.moe_load_balancer = MoeLoadBalancerConfig( + **self.moe_load_balancer) + + # TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig + def get_pytorch_backend_config(self) -> "PyTorchConfig": + from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig + + # TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig + # Just a WAR to support the auto_deploy + if self.auto_deploy_config is not None: + return self.auto_deploy_config + + return PyTorchConfig( + extra_resource_managers=self.extra_resource_managers, + use_cuda_graph=self.use_cuda_graph, + cuda_graph_batch_sizes=self.cuda_graph_batch_sizes, + cuda_graph_max_batch_size=self.cuda_graph_max_batch_size, + cuda_graph_padding_enabled=self.cuda_graph_padding_enabled, + disable_overlap_scheduler=self.disable_overlap_scheduler, + moe_max_num_tokens=self.moe_max_num_tokens, + moe_load_balancer=self.moe_load_balancer, + attn_backend=self.attn_backend, + moe_backend=self.moe_backend, + mixed_sampler=self.mixed_sampler, + enable_trtllm_sampler=self.enable_trtllm_sampler, + kv_cache_dtype=self.kv_cache_dtype, + use_kv_cache=self.use_kv_cache, + enable_iter_perf_stats=self.enable_iter_perf_stats, + enable_iter_req_stats=self.enable_iter_req_stats, + print_iter_log=self.print_iter_log, + torch_compile_enabled=self.torch_compile_enabled, + torch_compile_fullgraph=self.torch_compile_fullgraph, + torch_compile_inductor_enabled=self.torch_compile_inductor_enabled, + torch_compile_piecewise_cuda_graph=self. + torch_compile_piecewise_cuda_graph, + torch_compile_enable_userbuffers=self. + torch_compile_enable_userbuffers, + autotuner_enabled=self.autotuner_enabled, + enable_layerwise_nvtx_marker=self.enable_layerwise_nvtx_marker, + load_format=self.load_format) + + @field_validator('cuda_graph_max_batch_size') + @classmethod + def validate_cuda_graph_max_batch_size(cls, v): + """Validate cuda_graph_max_batch_size is non-negative.""" + if v < 0: + raise ValueError("cuda_graph_max_batch_size must be non-negative") + return v + + @staticmethod + def _generate_cuda_graph_batch_sizes(max_batch_size: int, + padding_enabled: bool) -> List[int]: + """Generate a list of batch sizes for CUDA graphs. 
+ + Args: + max_batch_size: Maximum batch size to generate up to + padding_enabled: Whether padding is enabled, which affects the batch size distribution + + Returns: + List of batch sizes to create CUDA graphs for + """ + if padding_enabled: + batch_sizes = [1, 2, 4] + [i * 8 for i in range(1, 17)] + else: + batch_sizes = list(range(1, 32)) + [32, 64, 128] + + # Add powers of 2 up to max_batch_size + batch_sizes += [ + 2**i for i in range(8, math.floor(math.log(max_batch_size, 2))) + ] + + # Filter and sort batch sizes + batch_sizes = sorted( + [size for size in batch_sizes if size <= max_batch_size]) + + # Add max_batch_size if not already included + if max_batch_size != batch_sizes[-1]: + batch_sizes.append(max_batch_size) + + return batch_sizes + + @model_validator(mode='after') + def validate_cuda_graph_config(self) -> 'TorchLlmArgs': + """Validate CUDA graph configuration. + + Ensures that: + 1. If cuda_graph_batch_sizes is provided, cuda_graph_max_batch_size must be 0 + 2. If cuda_graph_batch_sizes is not provided, it is generated based on cuda_graph_max_batch_size + 3. If both are provided, cuda_graph_batch_sizes must match the generated values + """ + if self.cuda_graph_batch_sizes is not None: + self.cuda_graph_batch_sizes = sorted(self.cuda_graph_batch_sizes) + if self.cuda_graph_max_batch_size != 0: + if self.cuda_graph_batch_sizes != self._generate_cuda_graph_batch_sizes( + self.cuda_graph_max_batch_size, + self.cuda_graph_padding_enabled): + raise ValueError( + "Please don't set both cuda_graph_batch_sizes " + "and cuda_graph_max_batch_size.\n" + f"cuda_graph_batch_sizes: {self.cuda_graph_batch_sizes}, " + f"cuda_graph_max_batch_size: {self.cuda_graph_max_batch_size}" + ) + else: + self.cuda_graph_max_batch_size = max( + self.cuda_graph_batch_sizes) + else: + max_batch_size = self.cuda_graph_max_batch_size or 128 + generated_sizes = self._generate_cuda_graph_batch_sizes( + max_batch_size, self.cuda_graph_padding_enabled) + self.cuda_graph_batch_sizes = generated_sizes + self.cuda_graph_max_batch_size = max_batch_size + + return self + def update_llm_args_with_extra_dict( llm_args: Dict, diff --git a/tensorrt_llm/llmapi/utils.py b/tensorrt_llm/llmapi/utils.py index 39cfd2d5739..5872174ab96 100644 --- a/tensorrt_llm/llmapi/utils.py +++ b/tensorrt_llm/llmapi/utils.py @@ -507,6 +507,9 @@ def generate_api_docs_as_docstring(model: Type[BaseModel], elif field_name in type_hints: type_str = str(type_hints[field_name]) type_str = type_str.replace("typing.", "") + # Extract just the class name from full class path + if " 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) llm = LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_config, enable_attention_dp=attention_dp, speculative_config=mtp_config) with llm: @@ -497,11 +497,12 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, pytest.skip("PP with torch.compile is not supported yet.") # OOM on H100 with default free_gpu_memory_fraction=0.9 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7) - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, use_cuda_graph=cuda_graph, torch_compile_enabled=torch_compile, - torch_compile_fullgraph=True) + torch_compile_fullgraph=True, + ) mtp_config = None if mtp_nextn > 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) @@ -510,7 +511,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, 
pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_config, enable_attention_dp=attention_dp, speculative_config=mtp_config) with llm: @@ -539,17 +540,18 @@ def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph, pytest.skip("https://nvbugs/5252559") # OOM on H100 with default free_gpu_memory_fraction=0.9 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8) - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, use_cuda_graph=cuda_graph, torch_compile_enabled=torch_compile, - torch_compile_fullgraph=True) + torch_compile_fullgraph=True, + ) quant_config = QuantConfig() quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES if fp8kv: quant_config.kv_cache_quant_algo = QuantAlgo.FP8 - pytorch_config.kv_cache_dtype = "fp8" + pytorch_config["kv_cache_dtype"] = "fp8" mtp_config = None if mtp_nextn > 0: @@ -557,7 +559,7 @@ def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph, llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_config, quant_config=quant_config, enable_attention_dp=attention_dp, speculative_config=mtp_config) @@ -578,13 +580,15 @@ def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph, def test_fp8_block_scales_cuda_graph_padding(self): # OOM on H100 with default free_gpu_memory_fraction=0.9 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8) - pytorch_config = PyTorchConfig(disable_overlap_scheduler=False, - use_cuda_graph=True, - cuda_graph_max_batch_size=512, - cuda_graph_padding_enabled=True) + pytorch_config = dict( + disable_overlap_scheduler=False, + use_cuda_graph=True, + cuda_graph_max_batch_size=512, + cuda_graph_padding_enabled=True, + ) llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config) + **pytorch_config) assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES with llm: task = MMLU(self.MODEL_NAME) @@ -620,17 +624,18 @@ def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, pytest.skip("PP with torch.compile is not supported yet.") # OOM on H100 with default free_gpu_memory_fraction=0.9 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8) - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, use_cuda_graph=cuda_graph, torch_compile_enabled=torch_compile, - torch_compile_fullgraph=True) + torch_compile_fullgraph=True, + ) quant_config = QuantConfig() quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES if fp8kv: quant_config.kv_cache_quant_algo = QuantAlgo.FP8 - pytorch_config.kv_cache_dtype = "fp8" + pytorch_config["kv_cache_dtype"] = "fp8" mtp_config = None if mtp_nextn > 0: @@ -641,7 +646,7 @@ def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_config, quant_config=quant_config, enable_attention_dp=attention_dp, speculative_config=mtp_config) @@ -676,13 +681,13 @@ def test_fp8_block_scales_4gpus_static_eplb(self): num_slots=num_slots, initial_global_assignments=initial_global_assignments, layer_updates_per_iter=0) - pytorch_config = PyTorchConfig(use_cuda_graph=True, + pytorch_backend_options = 
dict(use_cuda_graph=True, moe_load_balancer=eplb_config) llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8", tensor_parallel_size=4, moe_expert_parallel_size=4, kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_backend_options, enable_attention_dp=True) with llm: task = MMLU(self.MODEL_NAME) @@ -705,21 +710,22 @@ def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler, pytest.skip("https://nvbugs/5252559") kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9) - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, use_cuda_graph=cuda_graph, torch_compile_enabled=torch_compile, - torch_compile_fullgraph=True) + torch_compile_fullgraph=True, + ) quant_config = QuantConfig() quant_config.quant_algo = QuantAlgo.NVFP4 if fp8kv: quant_config.kv_cache_quant_algo = QuantAlgo.FP8 - pytorch_config.kv_cache_dtype = "fp8" + pytorch_config["kv_cache_dtype"] = "fp8" llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only", kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_config, quant_config=quant_config, enable_attention_dp=attention_dp) @@ -756,24 +762,25 @@ def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph, if torch_compile and pp_size > 1: pytest.skip("PP with torch.compile is not supported yet.") kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9) - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, use_cuda_graph=cuda_graph, torch_compile_enabled=torch_compile, - torch_compile_fullgraph=True) + torch_compile_fullgraph=True, + ) quant_config = QuantConfig() quant_config.quant_algo = QuantAlgo.NVFP4 if fp8kv: quant_config.kv_cache_quant_algo = QuantAlgo.FP8 - pytorch_config.kv_cache_dtype = "fp8" + pytorch_config["kv_cache_dtype"] = "fp8" llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only", tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_config, quant_config=quant_config, enable_attention_dp=attention_dp) @@ -815,9 +822,10 @@ def test_no_kv_cache_reuse(self, quant_dtype, mtp_nextn, fp8kv, kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6, enable_block_reuse=False) - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, - use_cuda_graph=cuda_graph) + use_cuda_graph=cuda_graph, + ) mtp_config = None if mtp_nextn > 0: mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) @@ -833,11 +841,11 @@ def test_no_kv_cache_reuse(self, quant_dtype, mtp_nextn, fp8kv, quant_config.quant_algo = QuantAlgo.NVFP4 if fp8kv: quant_config.kv_cache_quant_algo = QuantAlgo.FP8 - pytorch_config.kv_cache_dtype = "fp8" + pytorch_config["kv_cache_dtype"] = "fp8" llm = LLM(model_path, kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_config, quant_config=quant_config, enable_attention_dp=attention_dp, speculative_config=mtp_config) @@ -883,16 +891,15 @@ def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, pytest.skip("https://nvbugs/5302441") kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4) - pytorch_config = PyTorchConfig( - disable_overlap_scheduler=not overlap_scheduler, - use_cuda_graph=cuda_graph, - moe_backend=moe_backend) + pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler, + 
use_cuda_graph=cuda_graph, + moe_backend=moe_backend) quant_config = QuantConfig() quant_config.quant_algo = QuantAlgo.NVFP4 if fp8kv: quant_config.kv_cache_quant_algo = QuantAlgo.FP8 - pytorch_config.kv_cache_dtype = "fp8" + pytorch_config["kv_cache_dtype"] = "fp8" mtp_config = None if mtp_nextn > 0: @@ -903,12 +910,12 @@ def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_config, quant_config=quant_config, enable_attention_dp=attention_dp, speculative_config=mtp_config) - assert llm.pytorch_backend_config.moe_backend == moe_backend + assert llm.args.moe_backend == moe_backend assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 if fp8kv: assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8 @@ -933,15 +940,16 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, attention_dp, cuda_graph, overlap_scheduler, max_batch_size): kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4) - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, - use_cuda_graph=cuda_graph) + use_cuda_graph=cuda_graph, + ) quant_config = QuantConfig() quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES if fp8kv: quant_config.kv_cache_quant_algo = QuantAlgo.FP8 - pytorch_config.kv_cache_dtype = "fp8" + pytorch_config["kv_cache_dtype"] = "fp8" mtp_config = None if mtp_nextn > 0: @@ -952,7 +960,7 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config, + **pytorch_config, quant_config=quant_config, enable_attention_dp=attention_dp, speculative_config=mtp_config) @@ -986,12 +994,12 @@ class TestNemotronNas(LlmapiAccuracyTestHarness): @pytest.mark.skip_less_device(8) def test_auto_dtype_tp8(self): kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7) - pytorch_config = PyTorchConfig() + pytorch_config = dict() with LLM(self.MODEL_PATH, tensor_parallel_size=8, kv_cache_config=kv_cache_config, - pytorch_backend_config=pytorch_config) as llm: + **pytorch_config) as llm: task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) @@ -1134,15 +1142,14 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness): ids=["latency"]) def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler): - pytorch_config = PyTorchConfig( - disable_overlap_scheduler=not overlap_scheduler, - use_cuda_graph=cuda_graph) + pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler, + use_cuda_graph=cuda_graph) llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8", tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, enable_attention_dp=attention_dp) with llm: task = CnnDailymail(self.MODEL_NAME) @@ -1162,15 +1169,14 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness): ids=["latency"]) def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler): - pytorch_config = PyTorchConfig( - disable_overlap_scheduler=not overlap_scheduler, - use_cuda_graph=cuda_graph) + pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler, + use_cuda_graph=cuda_graph) llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B-FP8", 
tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, enable_attention_dp=attention_dp) with llm: task = MMLU(self.MODEL_NAME) @@ -1185,16 +1191,15 @@ def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp, ids=["latency"]) def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler): - pytorch_config = PyTorchConfig( - disable_overlap_scheduler=not overlap_scheduler, - use_cuda_graph=cuda_graph) + pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler, + use_cuda_graph=cuda_graph) llm = LLM( f"{llm_models_root()}/Qwen3/saved_models_Qwen3-30B-A3B_fp8_hf", tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, enable_attention_dp=attention_dp) with llm: task = MMLU(self.MODEL_NAME) @@ -1225,7 +1230,7 @@ def test_nvfp4( overlap_scheduler, moe_backend, ): - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=not overlap_scheduler, use_cuda_graph=cuda_graph, moe_backend=moe_backend, @@ -1236,7 +1241,7 @@ def test_nvfp4( tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, enable_attention_dp=attention_dp) with llm: task = MMLU(self.MODEL_NAME) @@ -1255,15 +1260,14 @@ class TestQwen3_32B(LlmapiAccuracyTestHarness): ids=["latency"]) def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler): - pytorch_config = PyTorchConfig( - disable_overlap_scheduler=not overlap_scheduler, - use_cuda_graph=cuda_graph) + pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler, + use_cuda_graph=cuda_graph) llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-32B-FP8", tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, enable_attention_dp=attention_dp) with llm: task = CnnDailymail(self.MODEL_NAME) @@ -1282,9 +1286,8 @@ class TestQwen3_235B_A22B(LlmapiAccuracyTestHarness): ids=["latency", "throughput_latency"]) def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler): - pytorch_config = PyTorchConfig( - disable_overlap_scheduler=not overlap_scheduler, - use_cuda_graph=cuda_graph) + pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler, + use_cuda_graph=cuda_graph) kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7) llm = LLM( @@ -1292,7 +1295,7 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, enable_attention_dp=attention_dp, kv_cache_config=kv_cache_config) with llm: @@ -1308,16 +1311,15 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, ids=["latency", "throughput_latency"]) def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler): - pytorch_config = PyTorchConfig( - disable_overlap_scheduler=not overlap_scheduler, - use_cuda_graph=cuda_graph) + pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler, + use_cuda_graph=cuda_graph) llm = LLM( f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", tensor_parallel_size=tp_size, 
pipeline_parallel_size=pp_size, moe_expert_parallel_size=ep_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, enable_attention_dp=attention_dp) with llm: task = MMLU(self.MODEL_NAME) diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml index 7a850b121bc..42c439f6aaf 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml @@ -3,10 +3,9 @@ hostname: localhost port: 8000 backend: "pytorch" free_gpu_memory_fraction: 0.1 -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True - autotuner_enabled: False +use_cuda_graph: False +disable_overlap_scheduler: True +autotuner_enabled: False context_servers: num_instances: 2 router: diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml index 2c9a83ecd65..747f6ccc764 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml @@ -3,10 +3,9 @@ port: 8000 model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 free_gpu_memory_fraction: 0.15 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True - autotuner_enabled: False +use_cuda_graph: False +disable_overlap_scheduler: True +autotuner_enabled: False context_servers: num_instances: 1 tensor_parallel_size: 1 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml index 2a1a605078e..1a1285fcf6b 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml @@ -5,10 +5,9 @@ backend: "pytorch" free_gpu_memory_fraction: 0.15 conditional_disagg_config: max_local_prefill_length: 100 -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True - autotuner_enabled: False +use_cuda_graph: False +disable_overlap_scheduler: True +autotuner_enabled: False context_servers: num_instances: 1 tensor_parallel_size: 1 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml index 59db98e2ab7..2f70eab61f5 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml @@ -3,9 +3,8 @@ port: 8000 model: DeepSeek-V3-Lite/fp8 free_gpu_memory_fraction: 0.1 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True context_servers: num_instances: 1 tensor_parallel_size: 1 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml index bf8b1484151..0959df0052d 100644 --- 
a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml @@ -3,9 +3,8 @@ port: 8000 model: DeepSeek-V3-Lite/fp8 free_gpu_memory_fraction: 0.1 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True speculative_config: decoding_type: MTP num_nextn_predict_layers: 1 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml index 35b1cb6f4e9..9a83d2d0a8b 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml @@ -11,9 +11,8 @@ context_servers: tensor_parallel_size: 1 pipeline_parallel_size: 1 enable_attention_dp: true - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True + use_cuda_graph: False + disable_overlap_scheduler: True urls: - "localhost:8001" generation_servers: @@ -21,8 +20,7 @@ generation_servers: tensor_parallel_size: 1 pipeline_parallel_size: 1 enable_attention_dp: true - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: False + use_cuda_graph: False + disable_overlap_scheduler: False urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml index b60de54c5eb..d99a04097ef 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml @@ -3,9 +3,8 @@ port: 8000 model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 free_gpu_memory_fraction: 0.25 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True context_servers: num_instances: 1 tensor_parallel_size: 2 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml index d01502cfc07..1c5755c4287 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml @@ -3,9 +3,8 @@ port: 8000 model: DeepSeek-V3-Lite/fp8 free_gpu_memory_fraction: 0.25 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True context_servers: num_instances: 1 tensor_parallel_size: 2 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml index 9f19e0699f9..381a13a8a9d 100644 --- 
a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml @@ -3,9 +3,8 @@ port: 8000 model: DeepSeek-V3-Lite/fp8 free_gpu_memory_fraction: 0.25 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True context_servers: num_instances: 1 tensor_parallel_size: 2 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml index ee05d96d063..42687143c22 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml @@ -3,9 +3,8 @@ port: 8000 model: DeepSeek-V3-Lite/fp8 free_gpu_memory_fraction: 0.25 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True context_servers: num_instances: 1 tensor_parallel_size: 2 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml index 2c16cf7aefd..09aa1f6df1c 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml @@ -3,9 +3,8 @@ port: 8000 model: DeepSeek-V3-Lite/fp8 free_gpu_memory_fraction: 0.25 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True speculative_config: decoding_type: MTP num_nextn_predict_layers: 1 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml index b55acd05efb..616775cc6e2 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml @@ -8,9 +8,8 @@ context_servers: tensor_parallel_size: 2 pipeline_parallel_size: 1 enable_attention_dp: True - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True + use_cuda_graph: False + disable_overlap_scheduler: True urls: - "localhost:8001" generation_servers: @@ -18,8 +17,7 @@ generation_servers: tensor_parallel_size: 2 pipeline_parallel_size: 1 enable_attention_dp: True - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: False + use_cuda_graph: False + disable_overlap_scheduler: False urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml 
b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml index 9428e563d4a..87bcdc4ed2e 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml @@ -8,9 +8,8 @@ context_servers: tensor_parallel_size: 2 pipeline_parallel_size: 1 enable_attention_dp: true - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True + use_cuda_graph: False + disable_overlap_scheduler: True urls: - "localhost:8001" generation_servers: @@ -18,8 +17,7 @@ generation_servers: tensor_parallel_size: 2 pipeline_parallel_size: 1 enable_attention_dp: true - pytorch_backend_config: - use_cuda_graph: True - disable_overlap_scheduler: False + use_cuda_graph: True + disable_overlap_scheduler: False urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml index a97ac33cb29..ca4f4e64a2f 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml @@ -7,17 +7,15 @@ context_servers: num_instances: 1 tensor_parallel_size: 2 pipeline_parallel_size: 1 - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True + use_cuda_graph: False + disable_overlap_scheduler: True urls: - "localhost:8001" generation_servers: num_instances: 1 tensor_parallel_size: 2 pipeline_parallel_size: 1 - pytorch_backend_config: - use_cuda_graph: True - disable_overlap_scheduler: False + use_cuda_graph: True + disable_overlap_scheduler: False urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml index 99060d86b74..110179d3f08 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml @@ -12,10 +12,9 @@ context_servers: kv_cache_config: free_gpu_memory_fraction: 0.2 enable_partial_reuse: False - pytorch_backend_config: - use_cuda_graph: True - cuda_graph_batch_sizes: [1,3000] - disable_overlap_scheduler: True + use_cuda_graph: True + cuda_graph_batch_sizes: [1,3000] + disable_overlap_scheduler: True urls: - "localhost:8001" generation_servers: @@ -28,10 +27,9 @@ generation_servers: kv_cache_config: free_gpu_memory_fraction: 0.2 enable_partial_reuse: False - pytorch_backend_config: - use_cuda_graph: True - disable_overlap_scheduler: True - cuda_graph_padding_enabled: True - cuda_graph_batch_sizes: [1,4,8,16,24,32] + use_cuda_graph: True + disable_overlap_scheduler: True + cuda_graph_padding_enabled: True + cuda_graph_batch_sizes: [1,4,8,16,24,32] urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml index 30f7fc7721e..c6c51d854c0 100644 --- 
a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml @@ -12,8 +12,7 @@ generation_servers: free_gpu_memory_fraction: 0.2 enable_block_reuse: False enable_partial_reuse: False - pytorch_backend_config: - print_iter_log: True + print_iter_log: True urls: - "localhost:8002" - "localhost:8003" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml index 8ac4a59c5bb..dd95a06db0b 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml @@ -16,9 +16,8 @@ context_servers: kv_cache_config: free_gpu_memory_fraction: 0.15 enable_partial_reuse: False - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True + use_cuda_graph: False + disable_overlap_scheduler: True urls: - "localhost:8001" - "localhost:8002" @@ -35,9 +34,8 @@ generation_servers: kv_cache_config: free_gpu_memory_fraction: 0.15 enable_partial_reuse: False - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: False + use_cuda_graph: False + disable_overlap_scheduler: False urls: - "localhost:8003" - "localhost:8004" diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml index 290b076255d..ebc9066d13b 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml @@ -3,9 +3,8 @@ port: 8000 model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 free_gpu_memory_fraction: 0.25 backend: "pytorch" -pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True +use_cuda_graph: False +disable_overlap_scheduler: True context_servers: num_instances: 1 tensor_parallel_size: 1 diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml index e35886d8b1a..36b051dea02 100644 --- a/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml +++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml @@ -13,9 +13,8 @@ context_servers: kv_cache_config: free_gpu_memory_fraction: 0.2 enable_partial_reuse: False - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: True + use_cuda_graph: False + disable_overlap_scheduler: True urls: - "localhost:8001" generation_servers: @@ -28,8 +27,7 @@ generation_servers: kv_cache_config: free_gpu_memory_fraction: 0.2 enable_partial_reuse: False - pytorch_backend_config: - use_cuda_graph: False - disable_overlap_scheduler: False + use_cuda_graph: False + disable_overlap_scheduler: False urls: - "localhost:8002" diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py index 89c9d2ac2d7..e92a1ccbc72 100644 --- a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py +++ b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py @@ -11,7 +11,6 @@ from tensorrt_llm import DisaggregatedParams, SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig 
from tensorrt_llm._utils import set_mpi_comm from tensorrt_llm.llmapi import KvCacheConfig, MpiCommSession @@ -40,6 +39,7 @@ def model_path(model_name): async def run_worker(kv_cache_config, pytorch_config, model_name, rank): + assert isinstance(pytorch_config, dict) print(f"Running worker {rank}") port_name = MPI.Lookup_name('my_port') intercomm = MPI.COMM_WORLD.Connect(port_name) @@ -53,7 +53,7 @@ async def run_worker(kv_cache_config, pytorch_config, model_name, rank): auto_parallel=False, model=model_name, enable_chunked_prefill=False, - pytorch_backend_config=pytorch_config, + **pytorch_config, _mpi_session=mpi_session, kv_cache_config=kv_cache_config) print(f"LLM created") @@ -110,15 +110,15 @@ def verify_disaggregated(model, generation_overlap, enable_cuda_graph, prompt, # Context worker worker_pytorch_configs.append( - PyTorchConfig(disable_overlap_scheduler=True, - kv_cache_dtype="auto", - use_cuda_graph=enable_cuda_graph)) + dict(disable_overlap_scheduler=True, + kv_cache_dtype="auto", + use_cuda_graph=enable_cuda_graph)) # Generation worker worker_pytorch_configs.append( - PyTorchConfig(disable_overlap_scheduler=not generation_overlap, - kv_cache_dtype="auto", - use_cuda_graph=enable_cuda_graph)) + dict(disable_overlap_scheduler=not generation_overlap, + kv_cache_dtype="auto", + use_cuda_graph=enable_cuda_graph)) kv_cache_configs = [KvCacheConfig(max_tokens=2048 * 8) for _ in range(2)] model_names = [model_path(model) for _ in range(2)] @@ -231,15 +231,15 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph, # Context worker worker_pytorch_configs.append( - PyTorchConfig(disable_overlap_scheduler=True, - kv_cache_dtype="auto", - use_cuda_graph=enable_cuda_graph)) + dict(disable_overlap_scheduler=True, + kv_cache_dtype="auto", + use_cuda_graph=enable_cuda_graph)) # Generation worker worker_pytorch_configs.append( - PyTorchConfig(disable_overlap_scheduler=not generation_overlap, - kv_cache_dtype="auto", - use_cuda_graph=enable_cuda_graph)) + dict(disable_overlap_scheduler=not generation_overlap, + kv_cache_dtype="auto", + use_cuda_graph=enable_cuda_graph)) kv_cache_configs = [ KvCacheConfig(max_tokens=128, enable_block_reuse=False) diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py index 7792377649a..88766ad767d 100644 --- a/tests/integration/defs/perf/pytorch_model_config.py +++ b/tests/integration/defs/perf/pytorch_model_config.py @@ -28,18 +28,14 @@ def get_model_yaml_config(model_label: str, input_lens: list[int]) -> dict: """ base_config = { 'enable_attention_dp': True, - 'pytorch_backend_config': { - 'print_iter_log': True, - 'use_cuda_graph': True, - 'cuda_graph_padding_enabled': True, - } + 'print_iter_log': True, + 'use_cuda_graph': True, + 'cuda_graph_padding_enabled': True, } model_configs = { 'deepseek_r1-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:fp8-reqs:10-ep:4-gpus:8': { - 'pytorch_backend_config': { - 'use_cuda_graph': True, - }, + 'use_cuda_graph': True, 'speculative_config': { 'decoding_type': 'MTP', 'num_nextn_predict_layers': 3 @@ -47,9 +43,7 @@ def get_model_yaml_config(model_label: str, input_lens: list[int]) -> dict: }, 'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:nvfp4-reqs:10-ep:4-tp:8-gpus:8': { - 'pytorch_backend_config': { - 'use_cuda_graph': True, - }, + 'use_cuda_graph': True, 'speculative_config': { 'decoding_type': 'MTP', 'num_nextn_predict_layers': 3 @@ -57,25 +51,17 @@ def 
get_model_yaml_config(model_label: str, input_lens: list[int]) -> dict: }, 'deepseek_r1-bench-pytorch-float16-maxbs:128-maxnt:1127-input_output_len:1000,2000-quant:fp8-reqs:5120-con:1024-ep:8-gpus:8': { - 'pytorch_backend_config': { - 'cuda_graph_batch_sizes': [128] - }, + 'cuda_graph_batch_sizes': [128] }, 'deepseek_r1-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:nvfp4-reqs:49152-con:3072-ep:8-gpus:8': { - 'pytorch_backend_config': { - 'cuda_graph_padding_enabled': True, - 'cuda_graph_batch_sizes': - [1, 2, 4, 8, 16, 32, 64, 128, 256, 384] - }, + 'cuda_graph_padding_enabled': True, + 'cuda_graph_batch_sizes': [1, 2, 4, 8, 16, 32, 64, 128, 256, 384] }, 'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:nvfp4-reqs:49152-con:3072-ep:8-gpus:8': { - 'pytorch_backend_config': { - 'cuda_graph_padding_enabled': True, - 'cuda_graph_batch_sizes': - [1, 2, 4, 8, 16, 32, 64, 128, 256, 384] - }, + 'cuda_graph_padding_enabled': True, + 'cuda_graph_batch_sizes': [1, 2, 4, 8, 16, 32, 64, 128, 256, 384] } } # get model name from model_label diff --git a/tests/integration/defs/stress_test/stress_test.py b/tests/integration/defs/stress_test/stress_test.py index 6bb0d7e496b..347f49f08ba 100644 --- a/tests/integration/defs/stress_test/stress_test.py +++ b/tests/integration/defs/stress_test/stress_test.py @@ -510,13 +510,16 @@ def stress_test(config, extra_llm_options["enable_attention_dp"] = True if config.backend == "pytorch": - extra_llm_options["pytorch_backend_config"] = { - "use_cuda_graph": True, - "cuda_graph_padding_enabled": True, + extra_llm_options.update({ + "use_cuda_graph": + True, + "cuda_graph_padding_enabled": + True, "cuda_graph_batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384], - "print_iter_log": True, - } + "print_iter_log": + True, + }) with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as temp_file: diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index b8a0f059008..e1184664fa7 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -625,11 +625,17 @@ def temp_extra_llm_api_options_file(request): } } + pytorch_backend_config = {} if request.node.callspec.params['pytorch_backend_config']: - extra_llm_api_options_dict["pytorch_backend_config"] = { + pytorch_backend_config = { "use_cuda_graph": True, - "cuda_graph_batch_sizes": [1, 2, 3], + # trtllm-bench will set cuda_max_batch_size to + # max_batch_size, so the cuda_graph_batch_sizes is not + # needed. 
+ # "cuda_graph_batch_sizes": [1, 2, 3], } + # Flatten the pytorch_backend_config + extra_llm_api_options_dict.update(pytorch_backend_config) with open(temp_file_path, 'w') as f: yaml.dump(extra_llm_api_options_dict, f) @@ -1981,7 +1987,6 @@ def test_ptp_quickstart_bert(llm_root, llm_venv, model_name, model_path, from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM - from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.sampling_params import SamplingParams prompts = [ "Hello, my name is", @@ -1994,8 +1999,8 @@ def test_ptp_quickstart_bert(llm_root, llm_venv, model_name, model_path, sampling_param = SamplingParams(max_tokens=32, return_context_logits=True) with LLM( model=model_dir, - pytorch_backend_config=PyTorchConfig( - attn_backend=backend, disable_overlap_scheduler=True), + attn_backend=backend, + disable_overlap_scheduler=True, ) as llm: outputs = llm.generate(prompts, sampling_params=sampling_param) diff --git a/tests/unittest/_torch/modeling/test_modeling_deepseek.py b/tests/unittest/_torch/modeling/test_modeling_deepseek.py index f8b158cc4e1..ffa75e44995 100644 --- a/tests/unittest/_torch/modeling/test_modeling_deepseek.py +++ b/tests/unittest/_torch/modeling/test_modeling_deepseek.py @@ -9,7 +9,6 @@ from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.llmapi.utils import get_total_gpu_memory @@ -56,7 +55,7 @@ def test_deepseek_trtllmgen(model_name): "The president of the United States is", ] * 4 - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=True, use_cuda_graph=False, kv_cache_dtype="auto", @@ -73,7 +72,7 @@ def test_deepseek_trtllmgen(model_name): llm = LLM(model=tmp_model_dir, tensor_parallel_size=1, enable_chunked_prefill=False, - pytorch_backend_config=pytorch_config, + **pytorch_config, moe_expert_parallel_size=-1, moe_tensor_parallel_size=-1, enable_attention_dp=False, diff --git a/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py b/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py index 38b15ce1f1f..7c88900add8 100644 --- a/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py +++ b/tests/unittest/_torch/modeling/test_modeling_out_of_tree.py @@ -3,7 +3,6 @@ from parameterized import parameterized from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.sampling_params import SamplingParams @@ -42,8 +41,7 @@ def test_llm_api(self, import_oot_code: bool): llm = LLM(model=model_dir, kv_cache_config=kv_cache_config, max_num_tokens=2048, - pytorch_backend_config=PyTorchConfig( - disable_overlap_scheduler=True)) + disable_overlap_scheduler=True) prompts = [ "Hello, my name is", diff --git a/tests/unittest/_torch/multi_gpu/test_star_attention.py b/tests/unittest/_torch/multi_gpu/test_star_attention.py index 3938b4164fa..7c387cd5ccf 100644 --- a/tests/unittest/_torch/multi_gpu/test_star_attention.py +++ b/tests/unittest/_torch/multi_gpu/test_star_attention.py @@ -7,7 +7,6 @@ from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.llmapi.utils import get_total_gpu_memory from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig @@ -61,9 +60,8 @@ def 
test_model(backend, model_name, quant, sp_size, sa_block_size, max_batch_size = 20 max_output_tokens = 128 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7) - pytorch_backend_config = PyTorchConfig( - attn_backend='FLASHINFER_STAR_ATTENTION', - disable_overlap_scheduler=True) + pytorch_backend_options = dict(attn_backend='FLASHINFER_STAR_ATTENTION', + disable_overlap_scheduler=True) llm = LLM(model=model_dir, backend=backend, @@ -72,7 +70,7 @@ def test_model(backend, model_name, quant, sp_size, sa_block_size, quant_config=quant_config, context_parallel_size=sp_size, cp_config=cp_config, - pytorch_backend_config=pytorch_backend_config, + **pytorch_backend_options, max_batch_size=max_batch_size, max_input_len=MAX_SEQ_LEN - max_output_tokens, max_seq_len=MAX_SEQ_LEN, diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py index 6f0248b86da..b73655719fa 100644 --- a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py +++ b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py @@ -9,7 +9,6 @@ from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import KvCacheConfig from tensorrt_llm.llmapi.utils import get_total_gpu_memory @@ -63,7 +62,7 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size): " the head of state and head of government of the", ] * 32 - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=True, use_cuda_graph=False, kv_cache_dtype="auto", @@ -78,7 +77,7 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size): llm = LLM(model=model_dir, tensor_parallel_size=tp_size, enable_chunked_prefill=False, - pytorch_backend_config=pytorch_config, + **pytorch_config, moe_expert_parallel_size=-1, moe_tensor_parallel_size=-1, enable_attention_dp=enable_attention_dp, diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py b/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py index e89c5e00197..87da3e12cc8 100644 --- a/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py +++ b/tests/unittest/_torch/multi_gpu_modeling/test_llama4.py @@ -7,7 +7,6 @@ from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig @pytest.mark.parametrize( @@ -48,8 +47,7 @@ def test_llama4(model_name, backend, tp_size, use_cuda_graph, " the head of state and head of government of the", " solid white" ] - pytorch_config = PyTorchConfig(attn_backend=backend, - use_cuda_graph=use_cuda_graph) + pytorch_config = dict(attn_backend=backend, use_cuda_graph=use_cuda_graph) model_dir = str(llm_models_root() / "llama4-models" / model_name) llm = LLM( @@ -57,7 +55,7 @@ def test_llama4(model_name, backend, tp_size, use_cuda_graph, tensor_parallel_size=tp_size, moe_expert_parallel_size=ep_size, moe_tensor_parallel_size=tp_size // ep_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, pipeline_parallel_size=pp_size, enable_attention_dp=enable_attention_dp, ) diff --git a/tests/unittest/_torch/speculative/test_eagle3.py b/tests/unittest/_torch/speculative/test_eagle3.py index 5b3094bd3aa..b698c8a389f 100644 --- a/tests/unittest/_torch/speculative/test_eagle3.py +++ b/tests/unittest/_torch/speculative/test_eagle3.py @@ -7,7 +7,6 @@ from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import 
PyTorchConfig from tensorrt_llm.llmapi import BuildConfig, EagleDecodingConfig, KvCacheConfig sys.path.append(os.path.join(os.path.dirname(__file__), '..')) @@ -24,7 +23,7 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str): models_path = llm_models_root() - pytorch_config = PyTorchConfig( + pytorch_config = dict( disable_overlap_scheduler=True, use_cuda_graph=use_cuda_graph, # Only create a single CUDA graph to prevent OOM in CI @@ -49,7 +48,7 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str): build_config = BuildConfig(max_seq_len=2048) llm_spec = LLM(model=target_model_dir, - pytorch_backend_config=pytorch_config, + **pytorch_config, kv_cache_config=kv_cache_config, speculative_config=spec_config, build_config=build_config) @@ -89,7 +88,7 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str): llm_spec.shutdown() llm_ref = LLM(model=target_model_dir, - pytorch_backend_config=pytorch_config, + **pytorch_config, kv_cache_config=kv_cache_config, build_config=build_config) diff --git a/tests/unittest/_torch/speculative/test_ngram.py b/tests/unittest/_torch/speculative/test_ngram.py index 29efa02e2b6..e996725e5f9 100644 --- a/tests/unittest/_torch/speculative/test_ngram.py +++ b/tests/unittest/_torch/speculative/test_ngram.py @@ -7,7 +7,6 @@ from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import KvCacheConfig, NGramDecodingConfig sys.path.append(os.path.join(os.path.dirname(__file__), '..')) @@ -26,7 +25,7 @@ def test_llama_ngram(use_cuda_graph: bool, attn_backend: str): models_path = llm_models_root() - pytorch_config = PyTorchConfig( + pytorch_config = dict( enable_overlap_scheduler=False, use_cuda_graph=use_cuda_graph, # Only create a single CUDA graph to prevent OOM in CI @@ -54,7 +53,7 @@ def test_llama_ngram(use_cuda_graph: bool, attn_backend: str): ) llm_spec = LLM(model=target_model_dir, max_batch_size=max_batch_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, kv_cache_config=kv_cache_config, speculative_config=spec_config) @@ -67,7 +66,7 @@ def test_llama_ngram(use_cuda_graph: bool, attn_backend: str): llm_ref = LLM(model=target_model_dir, max_batch_size=max_batch_size, - pytorch_backend_config=pytorch_config, + **pytorch_config, kv_cache_config=kv_cache_config) results_ref = llm_ref.generate(prompts, sampling_params) diff --git a/tests/unittest/_torch/test_overlap_scheduler.py b/tests/unittest/_torch/test_overlap_scheduler.py index 7b281ef0531..18622f94cbd 100644 --- a/tests/unittest/_torch/test_overlap_scheduler.py +++ b/tests/unittest/_torch/test_overlap_scheduler.py @@ -6,7 +6,6 @@ from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig @@ -24,10 +23,9 @@ def model_path(): def create_llm(model_dir, disable_overlap_scheduler, enable_trtllm_sampler): """Create LLM with specific overlap scheduler setting""" - pytorch_config = PyTorchConfig( - use_cuda_graph=True, - disable_overlap_scheduler=disable_overlap_scheduler, - enable_trtllm_sampler=enable_trtllm_sampler) + pytorch_config = dict(use_cuda_graph=True, + disable_overlap_scheduler=disable_overlap_scheduler, + enable_trtllm_sampler=enable_trtllm_sampler) trt_kv_cache_config = TRT_KvCacheConfig(enable_block_reuse=False) @@ -36,7 +34,7 @@ def create_llm(model_dir, disable_overlap_scheduler, 
enable_trtllm_sampler): tensor_parallel_size=1, trust_remote_code=True, enable_chunked_prefill=True, - pytorch_backend_config=pytorch_config, + **pytorch_config, kv_cache_config=trt_kv_cache_config, max_num_tokens= 128 # Only one request longer than max_num_tokens is required to test chunked prefill diff --git a/tests/unittest/_torch/test_trtllm_sampler.py b/tests/unittest/_torch/test_trtllm_sampler.py index 3587c7b3b8b..bee47efddaf 100644 --- a/tests/unittest/_torch/test_trtllm_sampler.py +++ b/tests/unittest/_torch/test_trtllm_sampler.py @@ -7,7 +7,6 @@ from tensorrt_llm import SamplingParams from tensorrt_llm._torch import LLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig @@ -25,8 +24,7 @@ def model_path(): def create_llm(model_dir): """Create LLM with specific overlap scheduler setting""" - pytorch_config = PyTorchConfig(use_cuda_graph=True, - enable_trtllm_sampler=True) + pytorch_config = dict(use_cuda_graph=True, enable_trtllm_sampler=True) trt_kv_cache_config = TRT_KvCacheConfig(enable_block_reuse=False) @@ -35,7 +33,7 @@ def create_llm(model_dir): tensor_parallel_size=1, trust_remote_code=True, enable_chunked_prefill=True, - pytorch_backend_config=pytorch_config, + **pytorch_config, kv_cache_config=trt_kv_cache_config, max_num_tokens= 128 # Only one request longer than max_num_tokens is required to test chunked prefill diff --git a/tests/unittest/api_stability/references/llm.yaml b/tests/unittest/api_stability/references/llm.yaml index 78c822fedfb..9e9c05245f5 100644 --- a/tests/unittest/api_stability/references/llm.yaml +++ b/tests/unittest/api_stability/references/llm.yaml @@ -35,7 +35,7 @@ methods: annotation: Optional[tensorrt_llm.llmapi.llm_args.PeftCacheConfig] default: null scheduler_config: - annotation: Optional[tensorrt_llm.llmapi.llm_args.SchedulerConfig] + annotation: tensorrt_llm.llmapi.llm_args.SchedulerConfig default: null extended_runtime_perf_knob_config: annotation: Optional[tensorrt_llm.llmapi.llm_args.ExtendedRuntimePerfKnobConfig] diff --git a/tests/unittest/api_stability/references_committed/llm.yaml b/tests/unittest/api_stability/references_committed/llm.yaml index cfb05b05cb2..f2c90635fbe 100644 --- a/tests/unittest/api_stability/references_committed/llm.yaml +++ b/tests/unittest/api_stability/references_committed/llm.yaml @@ -103,7 +103,7 @@ methods: annotation: bool default: false kv_cache_config: - annotation: Optional[tensorrt_llm.llmapi.llm_args.KvCacheConfig] + annotation: tensorrt_llm.llmapi.llm_args.KvCacheConfig default: null return_annotation: None generate: diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py index a2278c0d996..aeb46a8a0b0 100644 --- a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py +++ b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py @@ -25,9 +25,7 @@ def temp_extra_llm_api_options_file(request): try: extra_llm_api_options_dict = { "guided_decoding_backend": "xgrammar", - "pytorch_backend_config": { - "disable_overlap_scheduler": True, - } + "disable_overlap_scheduler": True, } with open(temp_file_path, 'w') as f: diff --git a/tests/unittest/llmapi/apps/_test_openai_metrics.py b/tests/unittest/llmapi/apps/_test_openai_metrics.py index e79c34da311..1b075b67565 100755 --- a/tests/unittest/llmapi/apps/_test_openai_metrics.py +++ b/tests/unittest/llmapi/apps/_test_openai_metrics.py @@ -5,7 +5,6 @@ from transformers import 
AutoTokenizer from tensorrt_llm._torch.llm import LLM as PyTorchLLM -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm.llmapi import BuildConfig, KvCacheConfig from tensorrt_llm.serve.openai_server import OpenAIServer @@ -23,8 +22,7 @@ def client(): build_config=build_config, kv_cache_config=KvCacheConfig(), backend="pytorch", - pytorch_backend_config=PyTorchConfig( - enable_iter_perf_stats=True, )) + enable_iter_perf_stats=True) hf_tokenizer = AutoTokenizer.from_pretrained(llama_model_path) app_instance = OpenAIServer(llm, diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index aaa9771e1a9..97e93b92dee 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -1879,11 +1879,10 @@ def llm_get_stats_test_harness(tp_size: int = 1, if pytorch_backend: from tensorrt_llm._torch import LLM as LLM_torch - from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig - llm_args_extra["pytorch_backend_config"] = PyTorchConfig( - enable_iter_perf_stats=True, - enable_iter_req_stats=enable_iter_req_stats, - disable_overlap_scheduler=not use_overlap) + llm_args_extra.update( + dict(enable_iter_perf_stats=True, + enable_iter_req_stats=enable_iter_req_stats, + disable_overlap_scheduler=not use_overlap)) LLM_CLASS = LLM_torch else: LLM_CLASS = LLM @@ -1949,11 +1948,10 @@ def llm_get_stats_async_test_harness(tp_size: int = 1, if pytorch_backend: from tensorrt_llm._torch import LLM as LLM_torch - from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig - llm_args_extra["pytorch_backend_config"] = PyTorchConfig( - enable_iter_perf_stats=True, - enable_iter_req_stats=enable_iter_req_stats, - disable_overlap_scheduler=not use_overlap) + llm_args_extra.update( + dict(enable_iter_perf_stats=True, + enable_iter_req_stats=enable_iter_req_stats, + disable_overlap_scheduler=not use_overlap)) LLM_CLASS = LLM_torch else: LLM_CLASS = LLM diff --git a/tests/unittest/llmapi/test_llm_args.py b/tests/unittest/llmapi/test_llm_args.py index 99f5239e331..af26293fd8f 100644 --- a/tests/unittest/llmapi/test_llm_args.py +++ b/tests/unittest/llmapi/test_llm_args.py @@ -1,10 +1,10 @@ import tempfile +import pytest import yaml import tensorrt_llm.bindings.executor as tle from tensorrt_llm.llmapi.llm_args import * -from tensorrt_llm.llmapi.llm_utils import * from .test_llm import llama_model_path @@ -50,7 +50,7 @@ def test_update_llm_args_with_extra_dict_with_speculative_config(): f.seek(0) dict_content = yaml.safe_load(f) - llm_args = LlmArgs(llama_model_path) + llm_args = LlmArgs(model=llama_model_path) llm_args_dict = update_llm_args_with_extra_dict(llm_args.to_dict(), dict_content) llm_args = LlmArgs(**llm_args_dict) @@ -173,3 +173,46 @@ def test_PeftCacheConfig_declaration(): assert pybind_config.device_cache_percent == 0.5 assert pybind_config.host_cache_size == 1024 assert pybind_config.lora_prefetch_dir == "." 
+ + +class TestTorchLlmArgsCudaGraphSettings: + + def test_cuda_graph_batch_sizes_case_0(self): + # set both cuda_graph_batch_sizes and cuda_graph_max_batch_size, and + # cuda_graph_batch_sizes is not equal to generated + with pytest.raises(ValueError): + TorchLlmArgs(model=llama_model_path, + use_cuda_graph=True, + cuda_graph_batch_sizes=[1, 2, 3], + cuda_graph_max_batch_size=128) + + def test_cuda_graph_batch_sizes_case_0_1(self): + # set both cuda_graph_batch_sizes and cuda_graph_max_batch_size, and + # cuda_graph_batch_sizes is equal to generated + args = TorchLlmArgs(model=llama_model_path, + use_cuda_graph=True, + cuda_graph_padding_enabled=True, + cuda_graph_batch_sizes=TorchLlmArgs. + _generate_cuda_graph_batch_sizes(128, True), + cuda_graph_max_batch_size=128) + assert args.cuda_graph_batch_sizes == TorchLlmArgs._generate_cuda_graph_batch_sizes( + 128, True) + assert args.cuda_graph_max_batch_size == 128 + + def test_cuda_graph_batch_sizes_case_1(self): + # set cuda_graph_batch_sizes only + args = TorchLlmArgs(model=llama_model_path, + use_cuda_graph=True, + cuda_graph_padding_enabled=True, + cuda_graph_batch_sizes=[1, 2, 4]) + assert args.cuda_graph_batch_sizes == [1, 2, 4] + + def test_cuda_graph_batch_sizes_case_2(self): + # set cuda_graph_max_batch_size only + args = TorchLlmArgs(model=llama_model_path, + use_cuda_graph=True, + cuda_graph_padding_enabled=True, + cuda_graph_max_batch_size=128) + assert args.cuda_graph_batch_sizes == TorchLlmArgs._generate_cuda_graph_batch_sizes( + 128, True) + assert args.cuda_graph_max_batch_size == 128 diff --git a/tests/unittest/llmapi/test_llm_kv_cache_events.py b/tests/unittest/llmapi/test_llm_kv_cache_events.py index a98529af8fc..b445bd1990b 100644 --- a/tests/unittest/llmapi/test_llm_kv_cache_events.py +++ b/tests/unittest/llmapi/test_llm_kv_cache_events.py @@ -2,7 +2,6 @@ import time import tensorrt_llm -from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequest from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager from tensorrt_llm._utils import KVCacheEventSerializer @@ -48,7 +47,7 @@ def create_llm(tensor_parallel_size=1): return LLM(model=llama_model_path, tensor_parallel_size=tensor_parallel_size, kv_cache_config=global_kvcache_config, - pytorch_backend_config=PyTorchConfig(autotuner_enabled=False), + autotuner_enabled=False, backend="pytorch") diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index 92d4b1f2e80..03861d266d5 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -90,10 +90,9 @@ def test_llm_reward_model(): tokenized_input = tokenizer(prompts, return_tensors="pt")["input_ids"] from tensorrt_llm._torch import LLM as LLM_torch - from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig llm = LLM_torch(model=rm_model_path, - pytorch_backend_config=PyTorchConfig( - attn_backend="VANILLA", disable_overlap_scheduler=True)) + attn_backend="VANILLA", + disable_overlap_scheduler=True) sampling_params = SamplingParams(return_context_logits=True) diff --git a/triton_backend/all_models/llmapi/tensorrt_llm/1/model.yaml b/triton_backend/all_models/llmapi/tensorrt_llm/1/model.yaml index 639e84a614e..df29e937bfc 100644 --- a/triton_backend/all_models/llmapi/tensorrt_llm/1/model.yaml +++ b/triton_backend/all_models/llmapi/tensorrt_llm/1/model.yaml @@ -8,8 +8,7 @@ backend: "pytorch" tensor_parallel_size: 1 pipeline_parallel_size: 1 
-pytorch_backend_config: - use_cuda_graph: False +use_cuda_graph: False # ======= Triton Server Configurations ======= # Triton Configurations to override the default values in config.pbtxt
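
The hunks above all apply the same migration: options that used to be wrapped in `pytorch_backend_config` (a `PyTorchConfig` object in Python, a nested `pytorch_backend_config:` mapping in YAML) are now passed as flat, top-level arguments. For readers updating their own scripts, the sketch below shows the migrated call pattern. It is a minimal illustration assembled from the tests in this patch; the model name, KV-cache fraction, and option values are illustrative only, not prescriptive.

```python
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Before this change:
#   LLM(..., pytorch_backend_config=PyTorchConfig(disable_overlap_scheduler=True,
#                                                 use_cuda_graph=False))
# After this change, the same options are plain keyword arguments on LLM.
llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # illustrative model id
    kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.25),
    disable_overlap_scheduler=True,
    use_cuda_graph=False,
)

outputs = llm.generate(["Hello, my name is"],
                       sampling_params=SamplingParams(max_tokens=32))
for output in outputs:
    print(output.outputs[0].text)
```

The YAML files change in the same way: keys previously nested under `pytorch_backend_config:` (for example `use_cuda_graph`, `disable_overlap_scheduler`, `cuda_graph_batch_sizes`, `print_iter_log`) now sit at the top level of the extra-LLM-API and disaggregated config files, as the config diffs above show.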