
Commit c93b8aa

arekay authored and nvzhihanj committed

[ModelLoad] Concurrent load model (NVIDIA#5291)

Signed-off-by: Rashid K <[email protected]>
Co-authored-by: Zhihan Jiang <[email protected]>

1 parent 11e777b commit c93b8aa

File tree

3 files changed: +87, -15 lines

tensorrt_llm/_torch/models/modeling_utils.py

Lines changed: 54 additions & 7 deletions
@@ -1,5 +1,6 @@
 import contextlib
 import math
+import os
 import time
 from dataclasses import dataclass
 from typing import Dict, Generic, List, Optional, Tuple, Type, TypeVar, Union
@@ -635,6 +636,42 @@ def filter_weights(prefix, weights: Dict):
     return result


+def run_concurrently(func,
+                     args_list,
+                     reduce_func=None,
+                     pbar=None,
+                     num_workers=None):
+    """
+    Run a function concurrently with a list of arguments.
+    func: the function to run concurrently.
+    args_list: a list of tuples of arguments for the function.
+    reduce_func: an optional function to reduce the results.
+    pbar: an optional tqdm progress bar.
+    """
+    from concurrent import futures
+    with futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
+        # Submit all tasks
+        future_to_result = {
+            executor.submit(func, *arg): arg
+            for arg in args_list
+        }
+
+        # Process completed tasks as they finish
+        for result in futures.as_completed(future_to_result):
+            arg = future_to_result[result]
+            try:
+                part_weights = result.result()
+                if reduce_func:
+                    reduce_func(part_weights)
+                if pbar:
+                    pbar.update(1)
+            except Exception as e:
+                logger.error(
+                    f"Error executing {func.__name__} with args {arg}: {str(e)}"
+                )
+                raise
+
+
 def _load_weights_impl(model: Union[nn.Module, DecoderModelForCausalLM],
                        weights: Dict,
                        skip_modules: List[str] = [],
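
For context, a minimal usage sketch of the run_concurrently helper introduced above; the toy square function and results dict are illustrative only and appear nowhere in the commit. Each entry of args_list is a tuple that is unpacked into the function's positional arguments, and reduce_func folds each result into an accumulator as futures complete:

    # Sketch only: `square` and `results` are hypothetical names.
    from tqdm import tqdm

    from tensorrt_llm._torch.models.modeling_utils import run_concurrently

    def square(x):
        return {x: x * x}

    results = {}
    pbar = tqdm(total=4, desc="Squaring in parallel")
    # Each argument set is a tuple; it is unpacked as func(*arg).
    run_concurrently(square, [(1, ), (2, ), (3, ), (4, )],
                     reduce_func=results.update,
                     pbar=pbar)
    assert results == {1: 1, 2: 4, 3: 9, 4: 16}

Because ThreadPoolExecutor uses threads rather than processes, this mainly speeds up I/O-bound work such as reading checkpoint files from disk; a CPU-bound func would still serialize on the GIL.
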
@@ -659,30 +696,29 @@ def _load_weights_impl(model: Union[nn.Module, DecoderModelForCausalLM],
         'gate_up_proj': ['gate_proj', 'up_proj']
     }

-    for name, module in tqdm(list(model.named_modules()),
-                             desc="Loading weights"):
+    def load_single_module(name, module):
         if len(module._parameters) > 0:
             # skip load weights if module is in skip_modules
             if any(skip_module in name for skip_module in skip_modules):
-                continue
+                return

             # skip load weights if tie word embeddings is enabled and layer is lm_head
             if model.config.tie_word_embeddings and name.startswith("lm_head"):
-                continue
+                return

             # Skip loading weights for embedding and lm_head if LoRA is enabled and has custom values
             if hasattr(model, "model") and hasattr(
                     model.model, 'has_custom_embed_tokens'
             ) and model.model.has_custom_embed_tokens and name == "model.embed_tokens":
-                continue
+                return
             if hasattr(model, 'has_custom_lm_head'
                        ) and model.has_custom_lm_head and name == "lm_head":
-                continue
+                return

             names = name.split('.')
             # WAR: better solution is that llama has its own load_weights function.
             if names[-1] == 'next_layer_layernorm':
-                continue
+                return
             if names[-1] in params_map:
                 module_weights = []
                 for new_name in params_map[names[-1]]:
@@ -713,3 +749,14 @@ def _load_weights_impl(model: Union[nn.Module, DecoderModelForCausalLM],
             for n, p in module._parameters.items():
                 if p is not None:
                     p.data.copy_(module_weights[n][:])
+
+    if os.environ.get("TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL",
+                      False) in ["True", "true", "1", "yes", "y"]:
+        for name, module in tqdm(list(model.named_modules()),
+                                 desc="Loading weights"):
+            load_single_module(name, module)
+    else:
+        pbar = tqdm(list(model.named_modules()),
+                    desc="Loading weights concurrently")
+        args_list = [(name, module) for name, module in model.named_modules()]
+        run_concurrently(load_single_module, args_list, pbar=pbar)
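
The concurrent path is the default and opt-out: when the variable is unset, os.environ.get returns False, which is not in the truthy-string list, so the else branch runs. A minimal sketch of forcing the sequential fallback (model construction elided):

    import os

    # Must be set before _load_weights_impl runs, e.g. at process start.
    os.environ["TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL"] = "1"
    # ... build the model and load weights as usual; the tqdm loop labeled
    # "Loading weights" now executes module by module, which can help when
    # isolating a weight-loading failure.
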

tensorrt_llm/_torch/modules/linear.py

Lines changed: 6 additions & 0 deletions
@@ -364,6 +364,9 @@ def load_weights_fused_qkv_linear(self, module: Linear,
         v_weight = v_weight.to(module.dtype) * weight_scale[2]

         fused_weight = torch.cat((q_weight, k_weight, v_weight))
+        if module.weight_scale.device != fused_weight.device:
+            module.weight_scale = Parameter(
+                module.weight_scale.data.to(fused_weight.device))
         fused_weight = (fused_weight / module.weight_scale).to(
             torch.float8_e4m3fn)
         copy_weight(module.weight, fused_weight)
@@ -385,6 +388,9 @@ def load_weights_fused_gate_up_linear(self, module: Linear,
         gate_weight = gate_weight.to(module.dtype) * weight_scale[0]
         up_weight = up_weight.to(module.dtype) * weight_scale[1]
         fused_weight = torch.cat((gate_weight, up_weight))
+        if module.weight_scale.device != fused_weight.device:
+            module.weight_scale = Parameter(
+                module.weight_scale.data.to(fused_weight.device))
         fused_weight = (fused_weight / module.weight_scale).to(
             torch.float8_e4m3fn)
         copy_weight(module.weight, fused_weight)
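
The two guards added above appear to exist because, with weights now loaded by worker threads, fused_weight can be materialized on a different device than the module's weight_scale, and PyTorch rejects elementwise ops across devices (a 0-dim CPU tensor is the one exception). A minimal reproduction, assuming a CUDA device is available; the tensor shapes are illustrative:

    import torch
    from torch.nn import Parameter

    weight_scale = Parameter(torch.tensor([2.0]))    # 1-element tensor on CPU
    fused_weight = torch.ones(4, 4, device="cuda")   # weights landed on GPU

    try:
        _ = fused_weight / weight_scale              # RuntimeError: tensors on different devices
    except RuntimeError as e:
        print(e)

    # The fix mirrors the committed guard: rewrap the scale on the weight's device.
    if weight_scale.device != fused_weight.device:
        weight_scale = Parameter(weight_scale.data.to(fused_weight.device))
    _ = fused_weight / weight_scale                  # now succeeds
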

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 27 additions & 8 deletions
@@ -18,6 +18,7 @@
 import safetensors
 import torch
 import torch._dynamo.config
+import tqdm

 import tensorrt_llm.bindings.internal.userbuffers as ub
 from tensorrt_llm._torch.pyexecutor.llm_request import LlmRequest
@@ -48,7 +49,7 @@
 from ..model_config import ModelConfig, MoeLoadBalancerConfig
 from ..models import AutoModelForCausalLM
 from ..models.modeling_utils import (DecoderModelForCausalLM, MetaInitMode,
-                                     timing)
+                                     run_concurrently, timing)
 from ..modules.fused_moe.moe_load_balancer import (
     MoeLoadBalancer, MoeLoadBalancerIterContext, maybe_create_moe_load_balancer)
 from ..speculative import SpecConfig, SpecMetadata, get_spec_metadata
@@ -180,19 +181,28 @@ def load_weights(checkpoint_dir: str):
                 f"Prefetching {prefetch_size / (1024**3):.2f}GB checkpoint files."
             )
             prefetch_files(weight_files)
-            for file in weight_files:
-                logger.info(f"Loading {file}")
-                part_weights = safetensors.torch.load_file(file)
-                weights.update(part_weights)
+
+            def load_safetensors_file(file):
+                return safetensors.torch.load_file(file)
+
+            pbar = tqdm.tqdm(total=len(weight_files),
+                             desc="Loading safetensors weights in parallel")
+
+            # Note that the function is called with a tuple of arguments, hence we need to wrap the arguments in a tuple via [(w,) for w in weight_files]
+            # specifically the comma right after the w is important to make it a tuple.
+            run_concurrently(load_safetensors_file, [(w, ) for w in weight_files],
+                             reduce_func=weights.update,
+                             pbar=pbar)
+
             return weights

         weight_files = glob.glob(f"{checkpoint_dir}/*.bin")
         if not weight_files:
             weight_files = glob.glob(f"{checkpoint_dir}/*.pth")

         if weight_files:
-            for file in weight_files:
-                # try mmap first, if failed, turn off mmap
+
+            def load_bin_or_path_file(file):
                 try:
                     part_weights = torch.load(file,
                                               weights_only=True,
@@ -206,7 +216,16 @@ def load_weights(checkpoint_dir: str):
                                               weights_only=True,
                                               map_location='cpu',
                                               mmap=False)
-                weights.update(part_weights)
+                finally:
+                    return part_weights
+
+            pbar = tqdm.tqdm(total=len(weight_files),
+                             desc="Loading bin weights in parallel")
+            # Note that the function is called with a tuple of arguments, hence we need to wrap the arguments in a tuple via [(w,) for w in weight_files]
+            # specifically the comma right after the w is important to make it a tuple.
+            run_concurrently(load_bin_or_path_file, [(w, ) for w in weight_files],
+                             reduce_func=weights.update,
+                             pbar=pbar)
             return weights

         raise RuntimeError(f"No weight files found in {checkpoint_dir}.")
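
The [(w, ) for w in weight_files] wrapping that both comments call out generalizes: run_concurrently unpacks every entry with func(*arg), so a one-parameter function needs one-element tuples, while a multi-parameter function takes ordinary tuples. A toy illustration; load_one and load_shard are hypothetical names, not functions in this codebase:

    # Each entry in args_list is unpacked as func(*arg).
    def load_one(path):
        print(f"loading {path}")

    def load_shard(path, device):
        print(f"loading {path} onto {device}")

    run_concurrently(load_one, [(w, ) for w in ["a.bin", "b.bin"]])
    run_concurrently(load_shard, [("a.bin", "cpu"), ("b.bin", "cuda:0")])
    # A bare (w) is just w, not a tuple: submit(func, *"a.bin") would splat
    # the string into single characters and fail with a TypeError.
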
