From 1b29e297f229672ef13c4f29b18a685b58cbbc63 Mon Sep 17 00:00:00 2001 From: zhengchenyu Date: Sun, 28 Sep 2025 14:05:11 +0800 Subject: [PATCH 1/2] Fixed the issue that universal checkpoint cannot be loaded for stage3 when the world size is expanded --- deepspeed/runtime/engine.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 363da7a765e1..4ba4ac92e4f6 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -3004,7 +3004,7 @@ def _get_zero_ckpt_name(self, checkpoints_path, tag): bf16_mode = self.bfloat16_enabled() return self._get_rank_zero_ckpt_name(checkpoints_path, tag, mp_rank, pp_rank, bf16_mode) - def _get_ckpt_name(self, checkpoints_path, tag, mp_placeholder=None): + def _get_ckpt_name(self, checkpoints_path, tag, mp_placeholder=None, pp_placeholder=None): if mp_placeholder is not None: mp_rank_str = mp_placeholder else: @@ -3012,7 +3012,12 @@ def _get_ckpt_name(self, checkpoints_path, tag, mp_placeholder=None): mp_rank_str = f"{mp_rank:02d}" if self.zero_optimization_partition_weights(): - filename = "zero_pp_rank_{}".format(dist.get_rank(group=self.optimizer.dp_process_group)) + if pp_placeholder is not None: + pp_rank = pp_placeholder + else: + pp_rank = dist.get_rank(group=self.optimizer.dp_process_group) + + filename = "zero_pp_rank_{}".format(pp_rank) ckpt_name = os.path.join( checkpoints_path, str(tag), @@ -3047,7 +3052,7 @@ def _get_expert_ckpt_name(checkpoints_path, layer_id, expert_id, tag, mpu=None): def _get_all_ckpt_names(self, checkpoints_path, tag): # It is required that (checkpoints_path, tag) are consistent among all ranks. 
- ckpt_file_pattern = self._get_ckpt_name(checkpoints_path, tag, mp_placeholder="*") + ckpt_file_pattern = self._get_ckpt_name(checkpoints_path, tag, mp_placeholder="*", pp_placeholder="*") import glob ckpt_files = glob.glob(ckpt_file_pattern) From 707a1d08a49fa7d9a8d19fa110ac870671a8dbc9 Mon Sep 17 00:00:00 2001 From: zhengchenyu Date: Mon, 29 Sep 2025 11:23:02 +0800 Subject: [PATCH 2/2] Fix checkpoint loading when universal checkpoint is disabled --- deepspeed/runtime/engine.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 4ba4ac92e4f6..d7e1c7c35a2e 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -3052,15 +3052,15 @@ def _get_expert_ckpt_name(checkpoints_path, layer_id, expert_id, tag, mpu=None): def _get_all_ckpt_names(self, checkpoints_path, tag): # It is required that (checkpoints_path, tag) are consistent among all ranks. - ckpt_file_pattern = self._get_ckpt_name(checkpoints_path, tag, mp_placeholder="*", pp_placeholder="*") + ckpt_file_pattern = self._get_ckpt_name(checkpoints_path, + tag, + mp_placeholder="*", + pp_placeholder="0" if self.load_universal_checkpoint() else None) import glob ckpt_files = glob.glob(ckpt_file_pattern) ckpt_files.sort() - if self.load_universal_checkpoint(): - return [ckpt_files[0]] - else: - return ckpt_files + return ckpt_files def load_checkpoint(self, load_dir,