diff --git a/tensorrt_llm/bench/benchmark/utils/general.py b/tensorrt_llm/bench/benchmark/utils/general.py
index 3046f89698d..1447038251f 100755
--- a/tensorrt_llm/bench/benchmark/utils/general.py
+++ b/tensorrt_llm/bench/benchmark/utils/general.py
@@ -86,17 +86,15 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
 
     enable_chunked_prefill = params.get("enable_chunked_prefill", False)
 
     kv_cache_dtype = "auto"
-    cuda_graph_batch_sizes = None
     if extra_llm_api_options:
         with open(extra_llm_api_options, 'r') as f:
             llm_args_dict = yaml.safe_load(f)
-            if "kv_cache_dtype" in llm_args_dict:
-                kv_cache_dtype = llm_args_dict["kv_cache_dtype"]
-            if "cuda_graph_batch_sizes" in llm_args_dict:
-                cuda_graph_batch_sizes = llm_args_dict["cuda_graph_batch_sizes"]
-            enable_chunked_prefill = llm_args_dict.get("enable_chunked_prefill",
-                                                       enable_chunked_prefill)
+        if "kv_cache_dtype" in llm_args_dict:
+            kv_cache_dtype = llm_args_dict["kv_cache_dtype"]
+
+        enable_chunked_prefill = llm_args_dict.get("enable_chunked_prefill",
+                                                   enable_chunked_prefill)
 
     world_config = {
         "pp_size": params.get("pp"),
@@ -152,17 +150,17 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
         # Expecting this to be the max of chunk block and max_num_tokens.
         pass
 
+    cuda_graph_config = {
+        "padding_enabled": True,
+        "max_batch_size": max_batch_size
+    }
+
     pyt_options = {
-        "cuda_graph_config": {
-            "padding_enabled":
-            True,
-            "max_batch_size":
-            max_batch_size if cuda_graph_batch_sizes is None else 0,
-        },
+        "cuda_graph_config": cuda_graph_config,
         "kv_cache_dtype": kv_cache_dtype,
     }
 
-    backend = params.get("backend", "pytorch")
+    backend = params.get("backend", "pytorch")
     return {
         "sw_version": version("tensorrt_llm"),
         "model_path": model_path,
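
For reference, a minimal standalone sketch of the option-loading flow after this change. The helper name load_pyt_options, its signature, and the sample overrides mentioned below are illustrative only; they are not part of the patch.

from typing import Optional

import yaml


def load_pyt_options(extra_llm_api_options: Optional[str],
                     max_batch_size: int,
                     enable_chunked_prefill: bool = False) -> tuple:
    # kv_cache_dtype and enable_chunked_prefill can still be overridden
    # from the extra-options YAML file; cuda_graph_batch_sizes is no
    # longer consulted, so CUDA graph padding always sizes to
    # max_batch_size.
    kv_cache_dtype = "auto"
    if extra_llm_api_options:
        with open(extra_llm_api_options, 'r') as f:
            llm_args_dict = yaml.safe_load(f)
        if "kv_cache_dtype" in llm_args_dict:
            kv_cache_dtype = llm_args_dict["kv_cache_dtype"]
        enable_chunked_prefill = llm_args_dict.get("enable_chunked_prefill",
                                                   enable_chunked_prefill)

    cuda_graph_config = {
        "padding_enabled": True,
        "max_batch_size": max_batch_size,
    }
    pyt_options = {
        "cuda_graph_config": cuda_graph_config,
        "kv_cache_dtype": kv_cache_dtype,
    }
    # In get_settings, enable_chunked_prefill feeds the wider settings
    # dict rather than pyt_options; it is returned here alongside
    # pyt_options for visibility.
    return pyt_options, enable_chunked_prefill

With an options file containing, e.g., kv_cache_dtype: fp8 or enable_chunked_prefill: true, those overrides are picked up as before; a cuda_graph_batch_sizes entry in the same file is now simply ignored instead of forcing the CUDA graph max_batch_size to 0.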