Commit f5e6e7e

fix: resolve GPU memory imbalance in concurrent weight loading
Signed-off-by: Necofish <[email protected]>
1 parent 1f39a11 commit f5e6e7e

File tree

1 file changed: +3 −0 lines changed
tensorrt_llm/_torch/models/modeling_utils.py

Lines changed: 3 additions & 0 deletions
@@ -13,6 +13,7 @@
 
 from tensorrt_llm.lora_manager import HfLoraLoader
 from tensorrt_llm.models.convert_utils import split_matrix_tp
+from tensorrt_llm._utils import mpi_rank
 
 from ...logger import logger
 from ...models.modeling_utils import QuantConfig
@@ -777,6 +778,7 @@ def _load_weights_impl(model: Union[nn.Module, DecoderModelForCausalLM],
     }
 
     def load_single_module(name, module):
+        torch.cuda.set_device(mpi_rank())
         if len(module._parameters) > 0:
             # skip load weights if module is in skip_modules
             if any(skip_module in name for skip_module in skip_modules):
@@ -873,6 +875,7 @@ def _load_weights_impl_v2(model: Union[nn.Module, DecoderModelForCausalLM],
         logger.info(f"Renamed weights with params_map: {params_map}")
 
     def load_single_module(name, module):
+        torch.cuda.set_device(mpi_rank())
         if len(module._parameters) > 0:
             if weight_mapper.should_skip_module(name):
                 return
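
The added torch.cuda.set_device(mpi_rank()) call pins each weight-loading worker thread to this rank's GPU. Below is a minimal sketch of the behavior the fix addresses, assuming load_single_module runs in a thread pool: PyTorch tracks the current CUDA device per thread and new threads start on device 0, so allocations made with the bare "cuda" device land on GPU 0 unless each worker pins its device first. The rank value, module names, and shapes here are made up for illustration; in the real code the rank comes from tensorrt_llm._utils.mpi_rank().

# Minimal sketch (illustrative, not taken from the commit): requires >1 GPU.
from concurrent.futures import ThreadPoolExecutor

import torch


def load_single_module(name: str, rank: int, pin_device: bool) -> int:
    if pin_device:
        # Mirrors the fix: make this worker thread's current CUDA device
        # the GPU that belongs to this rank.
        torch.cuda.set_device(rank)
    # Bare "cuda" resolves to the thread's current device (0 in a fresh thread).
    weight = torch.empty(1024, 1024, device="cuda")
    return weight.device.index


def demo(rank: int, pin_device: bool) -> None:
    names = [f"layer.{i}.weight" for i in range(4)]
    with ThreadPoolExecutor(max_workers=4) as pool:
        devices = list(pool.map(lambda n: load_single_module(n, rank, pin_device), names))
    print(f"pin_device={pin_device}: weights landed on GPUs {sorted(set(devices))}")


if __name__ == "__main__" and torch.cuda.device_count() > 1:
    # Pretend this process is tensor-parallel rank 1.
    demo(rank=1, pin_device=False)  # allocations pile onto GPU 0
    demo(rank=1, pin_device=True)   # allocations go to this rank's GPU 1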
