Commit 9bd4ef2

remove CLI support for mamba cache dtype setting
Signed-off-by: Shahar Mor <[email protected]>
1 parent: 2d40e87

File tree

3 files changed: +11, -33 lines

tensorrt_llm/bench/benchmark/low_latency.py

Lines changed: 0 additions & 6 deletions
@@ -56,12 +56,6 @@
     default=.90,
     help="The percentage of memory to use for KV Cache after model load.",
 )
-@optgroup.option(
-    "--mamba_ssm_cache_dtype",
-    type=click.Choice(["auto", "float16", "bfloat16", "float32"]),
-    default="auto",
-    help="Data type for Mamba SSM cache. If 'auto', inferred from model config.",
-)
 @optgroup.option(
     "--max_seq_len",
     type=int,
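
The flag removed above only forwarded a value into the LLM API's KvCacheConfig, so the SSM cache dtype should still be settable programmatically. A minimal sketch, assuming KvCacheConfig keeps its mamba_ssm_cache_dtype field (this commit only strips the CLI plumbing) and that the import path matches this TensorRT-LLM version:

    from tensorrt_llm.llmapi import LLM, KvCacheConfig

    # Build the cache config directly instead of via the removed CLI flag.
    kv_cache_config = KvCacheConfig(
        free_gpu_memory_fraction=0.90,
        mamba_ssm_cache_dtype="float16",  # the removed flag defaulted to "auto"
    )
    # "my-mamba-checkpoint" is a hypothetical model path / HF id.
    llm = LLM(model="my-mamba-checkpoint", kv_cache_config=kv_cache_config)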

tensorrt_llm/bench/benchmark/throughput.py

Lines changed: 0 additions & 6 deletions
@@ -103,12 +103,6 @@
     default=.90,
     help="The percentage of memory to use for KV Cache after model load.",
 )
-@optgroup.option(
-    "--mamba_ssm_cache_dtype",
-    type=click.Choice(["auto", "float16", "bfloat16", "float32"]),
-    default="auto",
-    help="Data type for Mamba SSM cache. If 'auto', inferred from model config.",
-)
 @optgroup.group(
     "Engine Input Configuration",
     help="Input configuration for driving the engine.",

tensorrt_llm/commands/serve.py

Lines changed: 11 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@ def get_llm_args(model: str,
8282
moe_expert_parallel_size: Optional[int] = None,
8383
gpus_per_node: Optional[int] = None,
8484
free_gpu_memory_fraction: Optional[float] = None,
85-
mamba_ssm_cache_dtype: str = "auto",
8685
num_postprocess_workers: int = 0,
8786
trust_remote_code: bool = False,
8887
reasoning_parser: Optional[str] = None,
@@ -98,8 +97,7 @@ def get_llm_args(model: str,
9897
max_beam_width=max_beam_width,
9998
max_seq_len=max_seq_len)
10099
kv_cache_config = KvCacheConfig(
101-
free_gpu_memory_fraction=free_gpu_memory_fraction,
102-
mamba_ssm_cache_dtype=mamba_ssm_cache_dtype)
100+
free_gpu_memory_fraction=free_gpu_memory_fraction, )
103101

104102
dynamic_batch_config = DynamicBatchConfig(
105103
enable_batch_size_tuning=True,
@@ -256,12 +254,6 @@ def launch_mm_encoder_server(
256254
default=0.9,
257255
help="Free GPU memory fraction reserved for KV Cache, "
258256
"after allocating model weights and buffers.")
259-
@click.option(
260-
"--mamba_ssm_cache_dtype",
261-
type=click.Choice(["auto", "float16", "bfloat16", "float32"]),
262-
default="auto",
263-
help="Data type for Mamba SSM cache. If 'auto', inferred from model config."
264-
)
265257
@click.option(
266258
"--num_postprocess_workers",
267259
type=int,
@@ -302,17 +294,16 @@ def launch_mm_encoder_server(
302294
help=
303295
"Exit with runtime error when attention window is too large to fit even a single sequence in the KV cache."
304296
)
305-
def serve(model: str, tokenizer: Optional[str], host: str, port: int,
306-
log_level: str, backend: str, max_beam_width: int,
307-
max_batch_size: int, max_num_tokens: int, max_seq_len: int,
308-
tp_size: int, pp_size: int, ep_size: Optional[int],
309-
cluster_size: Optional[int], gpus_per_node: Optional[int],
310-
kv_cache_free_gpu_memory_fraction: float, mamba_ssm_cache_dtype: str,
311-
num_postprocess_workers: int, trust_remote_code: bool,
312-
extra_llm_api_options: Optional[str], reasoning_parser: Optional[str],
313-
metadata_server_config_file: Optional[str],
314-
server_role: Optional[str],
315-
fail_fast_on_attention_window_too_large: bool):
297+
def serve(
298+
model: str, tokenizer: Optional[str], host: str, port: int,
299+
log_level: str, backend: str, max_beam_width: int, max_batch_size: int,
300+
max_num_tokens: int, max_seq_len: int, tp_size: int, pp_size: int,
301+
ep_size: Optional[int], cluster_size: Optional[int],
302+
gpus_per_node: Optional[int], kv_cache_free_gpu_memory_fraction: float,
303+
num_postprocess_workers: int, trust_remote_code: bool,
304+
extra_llm_api_options: Optional[str], reasoning_parser: Optional[str],
305+
metadata_server_config_file: Optional[str], server_role: Optional[str],
306+
fail_fast_on_attention_window_too_large: bool):
316307
"""Running an OpenAI API compatible server
317308
318309
MODEL: model name | HF checkpoint path | TensorRT engine path
@@ -333,7 +324,6 @@ def serve(model: str, tokenizer: Optional[str], host: str, port: int,
333324
moe_cluster_parallel_size=cluster_size,
334325
gpus_per_node=gpus_per_node,
335326
free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction,
336-
mamba_ssm_cache_dtype=mamba_ssm_cache_dtype,
337327
num_postprocess_workers=num_postprocess_workers,
338328
trust_remote_code=trust_remote_code,
339329
reasoning_parser=reasoning_parser,
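
After this change trtllm-serve builds KvCacheConfig without mamba_ssm_cache_dtype, so the field falls back to its own default. The --extra_llm_api_options file kept in the serve() signature is presumably the remaining command-line route for LLM API overrides like this; the YAML schema sketched below is an assumption, mirroring the KvCacheConfig field names:

    import yaml

    # Hypothetical override file for trtllm-serve; the kv_cache_config keys are
    # assumed to map onto KvCacheConfig fields.
    overrides = {"kv_cache_config": {"mamba_ssm_cache_dtype": "float16"}}
    with open("extra_llm_api_options.yaml", "w") as f:
        yaml.safe_dump(overrides, f)
    # then: trtllm-serve <model> --extra_llm_api_options extra_llm_api_options.yaml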
