44import argparse
55from typing import Optional
66
7+ from tensorrt_llm .llmapi import BuildConfig
8+
79from dynamo .trtllm .request_handlers .handler_base import (
810 DisaggregationMode ,
911 DisaggregationStrategy ,
@@ -27,8 +29,16 @@ def __init__(self) -> None:
2729 self .model_path : str = ""
2830 self .served_model_name : Optional [str ] = None
2931 self .tensor_parallel_size : int = 1
32+ self .pipeline_parallel_size : int = 1
33+ self .expert_parallel_size : Optional [int ] = None
3034 self .kv_block_size : int = 32
3135 self .migration_limit : int = 0
36+ self .gpus_per_node : Optional [int ] = None
37+ self .max_batch_size : int = BuildConfig .max_batch_size
38+ self .max_num_tokens : int = BuildConfig .max_num_tokens
39+ self .max_seq_len : int = BuildConfig .max_seq_len
40+ self .max_beam_width : int = BuildConfig .max_beam_width
41+ self .free_gpu_memory_fraction : Optional [float ] = None
3242 self .extra_engine_args : str = ""
3343 self .publish_events_and_metrics : bool = False
3444 self .disaggregation_mode : DisaggregationMode = DEFAULT_DISAGGREGATION_MODE
@@ -45,7 +55,15 @@ def __str__(self) -> str:
4555 f"model_path={ self .model_path } , "
4656 f"served_model_name={ self .served_model_name } , "
4757 f"tensor_parallel_size={ self .tensor_parallel_size } , "
58+ f"pipeline_parallel_size={ self .pipeline_parallel_size } , "
59+ f"expert_parallel_size={ self .expert_parallel_size } , "
4860 f"kv_block_size={ self .kv_block_size } , "
61+ f"gpus_per_node={ self .gpus_per_node } , "
62+ f"max_batch_size={ self .max_batch_size } , "
63+ f"max_num_tokens={ self .max_num_tokens } , "
64+ f"max_seq_len={ self .max_seq_len } , "
65+ f"max_beam_width={ self .max_beam_width } , "
66+ f"free_gpu_memory_fraction={ self .free_gpu_memory_fraction } , "
4967 f"extra_engine_args={ self .extra_engine_args } , "
5068 f"migration_limit={ self .migration_limit } , "
5169 f"publish_events_and_metrics={ self .publish_events_and_metrics } , "
@@ -108,8 +126,21 @@ def cmd_line_args():
108126 help = "Name to serve the model under. Defaults to deriving it from model path." ,
109127 )
110128 parser .add_argument (
111- "--tensor-parallel-size" , type = int , default = 1 , help = "Number of GPUs to use."
129+ "--tensor-parallel-size" , type = int , default = 1 , help = "Tensor parallelism size."
130+ )
131+ parser .add_argument (
132+ "--pipeline-parallel-size" ,
133+ type = int ,
134+ default = None ,
135+ help = "Pipeline parallelism size." ,
136+ )
137+ parser .add_argument (
138+ "--expert-parallel-size" ,
139+ type = int ,
140+ default = None ,
141+ help = "expert parallelism size." ,
112142 )
143+
113144 # IMPORTANT: We should ideally not expose this to users. We should be able to
114145 # query the block size from the TRTLLM engine.
115146 parser .add_argument (
@@ -121,6 +152,43 @@ def cmd_line_args():
121152 default = 0 ,
122153 help = "Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine." ,
123154 )
155+ parser .add_argument (
156+ "--gpus-per-node" ,
157+ type = int ,
158+ default = None ,
159+ help = "Number of GPUs per node. If not provided, will be inferred from the environment." ,
160+ )
161+ parser .add_argument (
162+ "--max-batch-size" ,
163+ type = int ,
164+ default = BuildConfig .max_batch_size ,
165+ help = "Maximum number of requests that the engine can schedule." ,
166+ )
167+ parser .add_argument (
168+ "--max-num-tokens" ,
169+ type = int ,
170+ default = BuildConfig .max_num_tokens ,
171+ help = "Maximum number of batched input tokens after padding is removed in each batch." ,
172+ )
173+ parser .add_argument (
174+ "--max-seq-len" ,
175+ type = int ,
176+ default = BuildConfig .max_seq_len ,
177+ help = "Maximum total length of one request, including prompt and outputs. "
178+ "If unspecified, the value is deduced from the model config." ,
179+ )
180+ parser .add_argument (
181+ "--max-beam-width" ,
182+ type = int ,
183+ default = BuildConfig .max_beam_width ,
184+ help = "Maximum number of beams for beam search decoding." ,
185+ )
186+ parser .add_argument (
187+ "--free-gpu-memory-fraction" ,
188+ type = float ,
189+ default = None ,
190+ help = "Free GPU memory fraction reserved for KV Cache, after allocating model weights and buffers." ,
191+ )
124192
125193 parser .add_argument (
126194 "--extra-engine-args" ,
@@ -195,6 +263,18 @@ def cmd_line_args():
195263 config .next_endpoint = args .next_endpoint
196264
197265 config .tensor_parallel_size = args .tensor_parallel_size
266+ if args .pipeline_parallel_size is not None :
267+ config .pipeline_parallel_size = args .pipeline_parallel_size
268+ if args .expert_parallel_size is not None :
269+ config .expert_parallel_size = args .expert_parallel_size
270+ if args .gpus_per_node is not None :
271+ config .gpus_per_node = args .gpus_per_node
272+ if args .free_gpu_memory_fraction is not None :
273+ config .free_gpu_memory_fraction = args .free_gpu_memory_fraction
274+ config .max_batch_size = args .max_batch_size
275+ config .max_num_tokens = args .max_num_tokens
276+ config .max_seq_len = args .max_seq_len
277+ config .max_beam_width = args .max_beam_width
198278 config .kv_block_size = args .kv_block_size
199279 config .migration_limit = args .migration_limit
200280 config .extra_engine_args = args .extra_engine_args
0 commit comments