Merged · 19 commits
26 changes: 12 additions & 14 deletions tensorrt_llm/bench/benchmark/utils/general.py
@@ -86,17 +86,15 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
     enable_chunked_prefill = params.get("enable_chunked_prefill", False)
 
     kv_cache_dtype = "auto"
-    cuda_graph_batch_sizes = None
     if extra_llm_api_options:
         with open(extra_llm_api_options, 'r') as f:
             llm_args_dict = yaml.safe_load(f)
-        if "kv_cache_dtype" in llm_args_dict:
-            kv_cache_dtype = llm_args_dict["kv_cache_dtype"]
-        if "cuda_graph_batch_sizes" in llm_args_dict:
-            cuda_graph_batch_sizes = llm_args_dict["cuda_graph_batch_sizes"]
-
-        enable_chunked_prefill = llm_args_dict.get("enable_chunked_prefill",
-                                                   enable_chunked_prefill)
+        if "kv_cache_dtype" in llm_args_dict:
+            kv_cache_dtype = llm_args_dict["kv_cache_dtype"]
+
+        enable_chunked_prefill = llm_args_dict.get("enable_chunked_prefill",
+                                                   enable_chunked_prefill)
 
     world_config = {
         "pp_size": params.get("pp"),
@@ -152,17 +150,17 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
         # Expecting this to be the max of chunk block and max_num_tokens.
         pass
 
+    cuda_graph_config = {
+        "padding_enabled": True,
+        "max_batch_size": max_batch_size
+    }
+
     pyt_options = {
-        "cuda_graph_config": {
-            "padding_enabled":
-            True,
-            "max_batch_size":
-            max_batch_size if cuda_graph_batch_sizes is None else 0,
-        },
+        "cuda_graph_config": cuda_graph_config,
         "kv_cache_dtype": kv_cache_dtype,
     }
-    backend = params.get("backend", "pytorch")
 
+    backend = params.get("backend", "pytorch")
     return {
         "sw_version": version("tensorrt_llm"),
         "model_path": model_path,