@@ -298,49 +298,6 @@ def get_bindings_model_config(self,
         num_heads = self.pretrained_config.num_attention_heads // (
             self.mapping.tp_size * self.mapping.cp_size)
 
-        print("SMOR, in get_bindings_model_config")
-        from IPython import embed
-        embed()
-        # Handle both uniform and per-layer KV heads
-        num_kv_heads_per_layer = getattr(self.pretrained_config,
-                                         'num_kv_heads_per_layer', None)
-        if num_kv_heads_per_layer is not None:
-            kv_heads_per_layer_raw = num_kv_heads_per_layer
-            use_per_layer_kv_heads = True
-        else:
-            # Check if num_key_value_heads is a list (per-layer) or scalar (uniform)
-            num_kv_heads_raw = getattr(self.pretrained_config,
-                                       'num_key_value_heads', None)
-
-            if num_kv_heads_raw is not None and isinstance(
-                    num_kv_heads_raw, list):
-                kv_heads_per_layer_raw = num_kv_heads_raw
-                use_per_layer_kv_heads = True
-            else:
-                # num_key_value_heads is scalar or None - treat as uniform KV heads
-                if num_kv_heads_raw is None:
-                    # For uniform models, check: num_key_value_heads (standard) -> num_query_groups (NeMo) -> num_attention_heads
-                    num_kv_heads_raw = getattr(
-                        self.pretrained_config, 'num_query_groups',
-                        self.pretrained_config.num_attention_heads)
-
-                num_kv_heads = num_kv_heads_raw // (self.mapping.tp_size *
-                                                    self.mapping.cp_size)
-                use_per_layer_kv_heads = False
-
-        if use_per_layer_kv_heads:
-            # TRT-LLM LoRA requires uniform KV heads across layers
-            if self.lora_config is not None and len(
-                    set(kv_heads_per_layer_raw)) > 1:
-                raise ValueError(
-                    f"TRT-LLM LoRA requires uniform KV heads across layers, "
-                    f"got: {kv_heads_per_layer_raw}")
-            # Apply TP/CP scaling to each layer
-            num_kv_heads_per_layer = [
-                kv_heads // (self.mapping.tp_size * self.mapping.cp_size)
-                for kv_heads in kv_heads_per_layer_raw
-            ]
-
         hidden_size = self.pretrained_config.hidden_size // self.mapping.tp_size
 
         model_config_cpp = ModelConfigCpp(
@@ -361,9 +318,18 @@ def get_bindings_model_config(self,
         else:
             model_config_cpp.tokens_per_block = tokens_per_block
 
-        if use_per_layer_kv_heads:
+        num_key_value_heads = getattr(self.pretrained_config,
+                                      "num_key_value_heads", num_heads)
+        if isinstance(num_key_value_heads, (list, tuple)):
+            # Per-layer KV heads (e.g., Nemotron-NAS, variable GQA models)
+            num_kv_heads_per_layer = [
+                kv_heads // (self.mapping.tp_size * self.mapping.cp_size)
+                for kv_heads in num_key_value_heads
+            ]
             model_config_cpp.num_kv_heads_per_layer = num_kv_heads_per_layer
         else:
+            num_kv_heads = num_key_value_heads // (self.mapping.tp_size *
+                                                   self.mapping.cp_size)
             model_config_cpp.set_num_kv_heads(num_kv_heads)
 
         mlp_hidden_size = None
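
As context for the replacement branch in the second hunk: the per-layer vs. uniform decision now rests on a single `getattr` plus an `isinstance` check. Below is a minimal, standalone sketch of that resolution logic; the `resolve_kv_heads` helper and the `SimpleNamespace` stand-in configs are illustrative only and not part of the change (the real code reads `self.pretrained_config` and `self.mapping.tp_size`/`cp_size`).

```python
from types import SimpleNamespace

def resolve_kv_heads(config, tp_size, cp_size):
    """Return KV heads per rank: a per-layer list or a single uniform value."""
    # Fall back to num_attention_heads (plain MHA) when the field is absent.
    kv_heads = getattr(config, "num_key_value_heads", config.num_attention_heads)
    if isinstance(kv_heads, (list, tuple)):
        # Per-layer KV heads (variable-GQA models): scale every layer by TP*CP.
        return [h // (tp_size * cp_size) for h in kv_heads]
    # Uniform KV heads: one scalar scaled by TP*CP.
    return kv_heads // (tp_size * cp_size)

# Uniform GQA: 8 KV heads split across TP=2 ranks -> 4 per rank.
uniform = SimpleNamespace(num_attention_heads=32, num_key_value_heads=8)
print(resolve_kv_heads(uniform, tp_size=2, cp_size=1))    # 4

# Per-layer KV heads (e.g., Nemotron-NAS-style variable GQA).
per_layer = SimpleNamespace(num_attention_heads=32,
                            num_key_value_heads=[8, 8, 4, 4])
print(resolve_kv_heads(per_layer, tp_size=2, cp_size=1))  # [4, 4, 2, 2]
```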