@@ -82,7 +82,6 @@ def get_llm_args(model: str,
                   moe_expert_parallel_size: Optional[int] = None,
                   gpus_per_node: Optional[int] = None,
                   free_gpu_memory_fraction: Optional[float] = None,
-                  mamba_ssm_cache_dtype: str = "auto",
                   num_postprocess_workers: int = 0,
                   trust_remote_code: bool = False,
                   reasoning_parser: Optional[str] = None,
@@ -98,8 +97,7 @@ def get_llm_args(model: str,
                                max_beam_width=max_beam_width,
                                max_seq_len=max_seq_len)
     kv_cache_config = KvCacheConfig(
-        free_gpu_memory_fraction=free_gpu_memory_fraction,
-        mamba_ssm_cache_dtype=mamba_ssm_cache_dtype)
+        free_gpu_memory_fraction=free_gpu_memory_fraction, )

     dynamic_batch_config = DynamicBatchConfig(
         enable_batch_size_tuning=True,
@@ -256,12 +254,6 @@ def launch_mm_encoder_server(
     default=0.9,
     help="Free GPU memory fraction reserved for KV Cache, "
     "after allocating model weights and buffers.")
-@click.option(
-    "--mamba_ssm_cache_dtype",
-    type=click.Choice(["auto", "float16", "bfloat16", "float32"]),
-    default="auto",
-    help="Data type for Mamba SSM cache. If 'auto', inferred from model config."
-)
 @click.option(
     "--num_postprocess_workers",
     type=int,
@@ -302,17 +294,16 @@ def launch_mm_encoder_server(
     help=
     "Exit with runtime error when attention window is too large to fit even a single sequence in the KV cache."
 )
-def serve(model: str, tokenizer: Optional[str], host: str, port: int,
-          log_level: str, backend: str, max_beam_width: int,
-          max_batch_size: int, max_num_tokens: int, max_seq_len: int,
-          tp_size: int, pp_size: int, ep_size: Optional[int],
-          cluster_size: Optional[int], gpus_per_node: Optional[int],
-          kv_cache_free_gpu_memory_fraction: float, mamba_ssm_cache_dtype: str,
-          num_postprocess_workers: int, trust_remote_code: bool,
-          extra_llm_api_options: Optional[str], reasoning_parser: Optional[str],
-          metadata_server_config_file: Optional[str],
-          server_role: Optional[str],
-          fail_fast_on_attention_window_too_large: bool):
+def serve(
+        model: str, tokenizer: Optional[str], host: str, port: int,
+        log_level: str, backend: str, max_beam_width: int, max_batch_size: int,
+        max_num_tokens: int, max_seq_len: int, tp_size: int, pp_size: int,
+        ep_size: Optional[int], cluster_size: Optional[int],
+        gpus_per_node: Optional[int], kv_cache_free_gpu_memory_fraction: float,
+        num_postprocess_workers: int, trust_remote_code: bool,
+        extra_llm_api_options: Optional[str], reasoning_parser: Optional[str],
+        metadata_server_config_file: Optional[str], server_role: Optional[str],
+        fail_fast_on_attention_window_too_large: bool):
     """Running an OpenAI API compatible server

     MODEL: model name | HF checkpoint path | TensorRT engine path
@@ -333,7 +324,6 @@ def serve(model: str, tokenizer: Optional[str], host: str, port: int,
         moe_cluster_parallel_size=cluster_size,
         gpus_per_node=gpus_per_node,
         free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction,
-        mamba_ssm_cache_dtype=mamba_ssm_cache_dtype,
         num_postprocess_workers=num_postprocess_workers,
         trust_remote_code=trust_remote_code,
         reasoning_parser=reasoning_parser,