@@ -82,7 +82,6 @@ def get_llm_args(model: str,
                   moe_expert_parallel_size: Optional[int] = None,
                   gpus_per_node: Optional[int] = None,
                   free_gpu_memory_fraction: Optional[float] = None,
-                  mamba_ssm_cache_dtype: str = "auto",
                   num_postprocess_workers: int = 0,
                   trust_remote_code: bool = False,
                   reasoning_parser: Optional[str] = None,
@@ -98,8 +97,7 @@ def get_llm_args(model: str,
                                max_beam_width=max_beam_width,
                                max_seq_len=max_seq_len)
     kv_cache_config = KvCacheConfig(
-        free_gpu_memory_fraction=free_gpu_memory_fraction,
-        mamba_ssm_cache_dtype=mamba_ssm_cache_dtype)
+        free_gpu_memory_fraction=free_gpu_memory_fraction, )

     dynamic_batch_config = DynamicBatchConfig(
         enable_batch_size_tuning=True,
@@ -256,12 +254,6 @@ def launch_mm_encoder_server(
     default=0.9,
     help="Free GPU memory fraction reserved for KV Cache, "
     "after allocating model weights and buffers.")
-@click.option(
-    "--mamba_ssm_cache_dtype",
-    type=click.Choice(["auto", "float16", "bfloat16", "float32"]),
-    default="auto",
-    help="Data type for Mamba SSM cache. If 'auto', inferred from model config."
-)
 @click.option(
     "--num_postprocess_workers",
     type=int,
@@ -302,17 +294,16 @@ def launch_mm_encoder_server(
     help=
     "Exit with runtime error when attention window is too large to fit even a single sequence in the KV cache."
 )
-def serve(model: str, tokenizer: Optional[str], host: str, port: int,
-          log_level: str, backend: str, max_beam_width: int,
-          max_batch_size: int, max_num_tokens: int, max_seq_len: int,
-          tp_size: int, pp_size: int, ep_size: Optional[int],
-          cluster_size: Optional[int], gpus_per_node: Optional[int],
-          kv_cache_free_gpu_memory_fraction: float, mamba_ssm_cache_dtype: str,
-          num_postprocess_workers: int, trust_remote_code: bool,
-          extra_llm_api_options: Optional[str], reasoning_parser: Optional[str],
-          metadata_server_config_file: Optional[str],
-          server_role: Optional[str],
-          fail_fast_on_attention_window_too_large: bool):
+def serve(
+        model: str, tokenizer: Optional[str], host: str, port: int,
+        log_level: str, backend: str, max_beam_width: int, max_batch_size: int,
+        max_num_tokens: int, max_seq_len: int, tp_size: int, pp_size: int,
+        ep_size: Optional[int], cluster_size: Optional[int],
+        gpus_per_node: Optional[int], kv_cache_free_gpu_memory_fraction: float,
+        num_postprocess_workers: int, trust_remote_code: bool,
+        extra_llm_api_options: Optional[str], reasoning_parser: Optional[str],
+        metadata_server_config_file: Optional[str], server_role: Optional[str],
+        fail_fast_on_attention_window_too_large: bool):
     """Running an OpenAI API compatible server

     MODEL: model name | HF checkpoint path | TensorRT engine path
@@ -333,7 +324,6 @@ def serve(model: str, tokenizer: Optional[str], host: str, port: int,
         moe_cluster_parallel_size=cluster_size,
         gpus_per_node=gpus_per_node,
         free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction,
-        mamba_ssm_cache_dtype=mamba_ssm_cache_dtype,
         num_postprocess_workers=num_postprocess_workers,
         trust_remote_code=trust_remote_code,
         reasoning_parser=reasoning_parser,