31 changes: 20 additions & 11 deletions tensorrt_llm/_torch/models/modeling_llama.py
@@ -678,9 +678,11 @@ def __init__(self, model_config: ModelConfig[LlamaConfig]):
         self.aux_stream = torch.cuda.Stream()

         if self.model_config.mapping.enable_attention_dp:
-            self.embed_tokens = Embedding(config.vocab_size,
-                                          config.hidden_size,
-                                          dtype=config.torch_dtype)
+            self.embed_tokens = Embedding(
+                config.vocab_size,
+                config.hidden_size,
+                dtype=config.torch_dtype,
+            )
         else:
             self.embed_tokens = Embedding(
                 config.vocab_size,
@@ -756,14 +758,21 @@ def __init__(self, model_config: ModelConfig[LlamaConfig]):
                 weight = lora_loader.embed_tokens
                 self.has_custom_embed_tokens = True

-        self.embed_tokens = Embedding(
-            vocab_size,
-            config.hidden_size,
-            dtype=config.torch_dtype,
-            mapping=model_config.mapping,
-            tensor_parallel_mode=TensorParallelMode.COLUMN,
-            gather_output=True,
-        )
+        if self.model_config.mapping.enable_attention_dp:
+            self.embed_tokens = Embedding(
+                vocab_size,
+                config.hidden_size,
+                dtype=config.torch_dtype,
+            )
+        else:
+            self.embed_tokens = Embedding(
+                vocab_size,
+                config.hidden_size,
+                dtype=config.torch_dtype,
+                mapping=model_config.mapping,
+                tensor_parallel_mode=TensorParallelMode.COLUMN,
+                gather_output=True,
+            )

         if self.has_custom_embed_tokens:
             with torch.no_grad():
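For reference, a minimal sketch of the pattern this diff applies in modeling_llama.py. The build_embed_tokens helper below is hypothetical and only mirrors the branching visible in the hunks above (it assumes the Embedding and TensorParallelMode interfaces shown there; it is not code from the repository): with attention data parallelism enabled, the embedding is constructed without any tensor-parallel arguments, so each rank keeps an unsharded table; otherwise the weight is sharded across tensor-parallel ranks and the output is gathered.

# Illustrative sketch only; mirrors the branching introduced in the hunks above.
def build_embed_tokens(config, model_config, vocab_size):
    if model_config.mapping.enable_attention_dp:
        # Attention DP: no mapping/tensor_parallel_mode/gather_output arguments,
        # so the embedding table is kept whole on every rank.
        return Embedding(
            vocab_size,
            config.hidden_size,
            dtype=config.torch_dtype,
        )
    # Tensor parallelism: shard the embedding weight across TP ranks and gather
    # the output so every rank sees the complete embedding result.
    return Embedding(
        vocab_size,
        config.hidden_size,
        dtype=config.torch_dtype,
        mapping=model_config.mapping,
        tensor_parallel_mode=TensorParallelMode.COLUMN,
        gather_output=True,
    )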
9 changes: 0 additions & 9 deletions tensorrt_llm/_torch/models/modeling_utils.py
@@ -11,11 +11,9 @@
 from tqdm import tqdm

 from tensorrt_llm.lora_manager import HfLoraLoader
-from tensorrt_llm.mapping import Mapping
 from tensorrt_llm.models.convert_utils import split_matrix_tp

 from ...logger import logger
-from ...mapping import Mapping
 from ...models.modeling_utils import QuantConfig
 from ..attention_backend import AttentionMetadata
 from ..distributed.communicator import pp_recv, pp_send
@@ -356,13 +354,6 @@ def __init__(self, model: TModel, *, config: ModelConfig[TConfig],
                 vocab_size,
                 hidden_size,
                 dtype=config.pretrained_config.torch_dtype,
-                mapping=Mapping(
-                    world_size=1,
-                    tp_size=1,
-                    rank=0,
-                ),
-                tensor_parallel_mode=None,
-                gather_output=False,
             )
         else:
             # TODO(zhenhuanc): Currently lm_head Linear will not accept QuantConfig
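In modeling_utils.py, the only remaining use of Mapping was the explicit single-rank mapping passed to this Embedding, so once that argument is dropped the Mapping imports can be removed as well. A minimal sketch of the simplified call that remains, assuming (as the nearby TODO comment suggests) that this branch builds the lm_head weight; the attribute name is inferred from context, not confirmed by the diff:

# Sketch of the construction left after the deletion above. Omitting the
# explicit world_size=1/tp_size=1 Mapping, tensor_parallel_mode=None and
# gather_output=False presumably falls back to the same non-sharded default.
self.lm_head = Embedding(
    vocab_size,
    hidden_size,
    dtype=config.pretrained_config.torch_dtype,
)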