Commit 14f2894

Author: Satish Pasumarthi (authored and committed)
fix: improve worker node wait logic
1 parent bb20f65 commit 14f2894

7 files changed: +318 -18 lines changed


src/sagemaker_training/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -69,3 +69,5 @@
     "OutOfRangeError",
     "InvalidArgumentError",
 ]
+
+_MPI_ERRORS_ = ["mpirun.real", "ORTE"]

src/sagemaker_training/mpi.py

Lines changed: 99 additions & 8 deletions
@@ -28,6 +28,8 @@
 logger = logging_config.get_logger()
 logging.getLogger("paramiko").setLevel(logging.INFO)
 
+MPI_FINISHED_STATUS_FILE = "/tmp/done"
+
 
 def get_modelparallel_exception_classes():
     """Set exception classes"""
@@ -60,7 +62,9 @@ class WorkerRunner(process.ProcessRunner):
     master execution to finish.
     """
 
-    def __init__(self, user_entry_point, args, env_vars, processes_per_host, master_hostname):
+    def __init__(
+        self, user_entry_point, args, env_vars, processes_per_host, master_hostname, current_host
+    ):
         """Initialize a WorkerRunner, which is responsible for preparing distributed
         training with MPI and waiting for MPI master execution to finish.
 
@@ -69,9 +73,11 @@ def __init__(self, user_entry_point, args, env_vars, processes_per_host, master_
             args ([str]): A list of arguments to include when executing the entry point.
             env_vars (dict(str,str)): A dictionary of environment variables.
             master_hostname (str): The master hostname.
+            current_hostname (str): Current hostname.
         """
         super(WorkerRunner, self).__init__(user_entry_point, args, env_vars, processes_per_host)
         self._master_hostname = str(master_hostname)
+        self._current_host = str(current_host)
 
     def run(
         self, wait=True, capture_error=False
@@ -81,6 +87,8 @@ def run(
         - wait for the MPI Master to create its SSH daemon
         - start its SSH daemon
         - monitor the MPI orted process and wait it to finish the MPI execution
+        - wait for the status file from master
+        - Exit once orted process is finished and status file is found.
         """
         logger.info("Starting MPI run as worker node.")
         if wait:
@@ -95,18 +103,41 @@ def run(
 
         if wait:
             logger.info("Waiting for MPI process to finish.")
-            _wait_orted_process_to_finish()
+            gone, alive = _wait_orted_process_to_finish()
+            logger.info(f"Reporting status for ORTEd process. gone: {gone} alive: {alive}")
             logger.info("Orted process exited")
             time.sleep(30)
+            logger.info(f"Begin looking for status file on {self._current_host}")
+            status_file = MPI_FINISHED_STATUS_FILE + "." + self._master_hostname
+            file_found = self._wait_for_status_file(status_file)
+            if file_found:
+                logger.info("MPI training job status file found. Exit gracefully")
+            else:
+                logger.info("Status file not found. Exiting...")
+            logger.info("End looking for status file")
         logger.info("MPI process finished.")
 
+    def _wait_for_status_file(self, status_file):
+        start_time = time.time()
+        file_found = os.path.exists(status_file)
+        while not file_found:
+            time.sleep(30)
+            curr_time = time.time()
+            # Check connectivity with master every 2 minutes
+            if int(curr_time - start_time) % 120 == 0:
+                logger.info("status file not found...")
+                if not _can_connect(self._master_hostname):
+                    return False
+            file_found = os.path.exists(status_file)
+        return True
+
     def _wait_master_to_start(self):  # type: () -> None
         while not _can_connect(self._master_hostname):
             time.sleep(1)
 
-    def _wait_master_to_finish(self):  # type: () -> None
-        while _can_connect(self._master_hostname):
-            time.sleep(30)
+    # def _wait_master_to_finish(self):  # type: () -> None
+    #     while _can_connect(self._master_hostname):
+    #         time.sleep(30)
 
 
 def _write_env_vars_to_file():  # type: () -> None
@@ -115,11 +146,17 @@ def _write_env_vars_to_file():  # type: () -> None
             f.write("{}={}\n".format(name, os.environ.get(name)))
 
 
+def _on_terminate(proc):
+    logger.info("Invoked on_terminate from psutil.wait_for_procs")
+    logger.info("process {} terminated with exit code {}".format(proc, proc.returncode))
+
+
 def _wait_orted_process_to_finish():  # type: () -> None
     orted = _orted_process()
     logger.info("Orted process found %s", orted)
     logger.info("Waiting for orted process %s", orted)
-    psutil.wait_procs(orted)
+    gone, alive = psutil.wait_procs(orted, callback=_on_terminate)
+    return gone, alive
 
 
 def _orted_process():  # pylint: disable=inconsistent-return-statements
@@ -150,6 +187,7 @@ def __init__(
         interval=1,
         timeout_in_seconds=60 * 60,
         num_processes=None,
+        instance_type="ml.p3.16xlarge",
     ):
         """Initialize a MasterRunner, which is responsible for preparing distributed
         training with MPI and synchronizing work among the Workers.
@@ -178,6 +216,8 @@ def __init__(
         self._custom_mpi_options = custom_mpi_options
         self._network_interface_name = network_interface_name
         self._interval = interval
+        self._env_vars = env_vars
+        self._instance_type = instance_type
         self.timeout_in_seconds = timeout_in_seconds
 
     def _setup(self):  # type: () -> None
@@ -265,6 +305,12 @@ def _create_command(self):
         ]
 
         command.extend(additional_options)
+        # EFA settings
+        if self._instance_type in ["ml.p3dn.24xlarge", "ml.p4d.24xlarge"]:
+            # Use EFA's RDMA functionality for one-sided and two-sided transfer
+            command.extend(["-x", "FI_EFA_USE_DEVICE_RDMA=1"])
+            # Use simple protocol to handle the out-of-order data delivery from EFA
+            command.extend(["-x", "NCCL_PROTO=simple"])
 
         for credential in [
             "AWS_ACCESS_KEY_ID",
@@ -280,6 +326,12 @@ def _create_command(self):
         command.extend(super(MasterRunner, self)._create_command())
         return command
 
+    def _get_instance_type(self):
+        """Get instance type"""
+        instance_type = self._env_vars.get("current_instance_type", None)
+        logger.info("instance type: %s" % instance_type)
+        return instance_type
+
     def _python_command(self):
         """Use mpi4py to force processes to abort if an uncaught exception occurs.
         https://docs.chainer.org/en/stable/chainermn/tutorial/tips_faqs.html#mpi-process-hangs-after-an-unhandled-python-exception
@@ -326,7 +378,26 @@ def run(self, wait=True, capture_error=False):
             capture_error=capture_error,
             cwd=environment.code_dir,
         )
-
+        logger.info("Begin writing status file from leader node to worker nodes (if any)")
+        # Write status file to all nodes
+        status_file = MPI_FINISHED_STATUS_FILE + "." + self._master_hostname
+        for host in self._hosts:
+            if host != self._master_hostname:
+                status = _write_status_file(host, status_file)
+                retry_count = 5 if not status else 0
+                while not status:
+                    if retry_count == 0:
+                        break
+                    logger.info(f"Retry creating status file onto {host}")
+                    retry_count -= 1
+                    time.sleep(1)
+                    status = _write_status_file(host, status_file)
+
+                if not status:
+                    logger.info(f"Failed to create status file onto {host}")
+
+        time.sleep(30)
+        logger.info("Finished writing status file from leader node to worker nodes (if any)")
         self._tear_down()
         return process_spawned
 
@@ -378,8 +449,28 @@ def _can_connect(host, port=22):  # type: (str, int) -> bool
         return True
     except Exception as e:  # pylint: disable=broad-except
         logger.info("Cannot connect to host %s", host)
+        logger.info(
+            "Connection failed with exception: \n %s. \
+            Can be ignored for worker when master completes and exits.",
+            str(e),
+        )
+        return False
 
-        logger.info("Connection failed with exception: \n %s", str(e))
+
+def _write_status_file(host, status_file):
+    try:
+        logger.info(f"Start writing mpirun finished status to {host}")
+        output = subprocess.run(
+            ["ssh", str(host), "touch", f"{status_file}"],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        logger.info(f"output from subprocess run {output}")
+        logger.info("Finished writing status file")
+        return True
+    except subprocess.CalledProcessError:
+        logger.info(f"Cannot connect to {host}")
         return False
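
Taken together, the mpi.py changes make the worker's exit condition explicit: the worker first waits for the orted processes to exit (psutil.wait_procs now gets an _on_terminate callback that logs each termination), then polls for a status file named /tmp/done.<master_hostname> that the master touches over SSH once mpirun has returned, falling back to a reachability check against the master roughly every two minutes. The following standalone sketch illustrates that worker-side wait; it mirrors the helper names and the 30-second poll interval from the diff, but the socket-based connectivity check and the function names are illustrative stand-ins, not the toolkit's own code.

import os
import socket
import time

MPI_FINISHED_STATUS_FILE = "/tmp/done"


def can_connect(host, port=22, timeout=5):
    # Coarse stand-in for mpi._can_connect(), which uses paramiko in the toolkit.
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False


def wait_for_status_file(master_hostname, poll_seconds=30, connect_check_seconds=120):
    # Poll for the file the master touches over SSH after mpirun finishes.
    status_file = MPI_FINISHED_STATUS_FILE + "." + master_hostname
    start_time = time.time()
    while not os.path.exists(status_file):
        time.sleep(poll_seconds)
        # Roughly every two minutes, confirm the master is still reachable;
        # if it is gone and never wrote the file, stop waiting.
        if int(time.time() - start_time) % connect_check_seconds == 0:
            if not can_connect(master_hostname):
                return False
    return True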

src/sagemaker_training/process.py

Lines changed: 2 additions & 1 deletion
@@ -27,6 +27,7 @@
 
 from sagemaker_training import (
     _entry_point_type,
+    _MPI_ERRORS_,
     _PYTHON_ERRORS_,
     environment,
     errors,
@@ -115,7 +116,7 @@ async def watch(stream, proc_per_host, error_classes=None):
             if any(
                 str(err) in err_line
                 for err in (
-                    _PYTHON_ERRORS_ + error_classes
+                    _PYTHON_ERRORS_ + _MPI_ERRORS_ + error_classes
                     if isinstance(error_classes, list)
                     else [error_classes]
                 )
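
Together with the new _MPI_ERRORS_ list in __init__.py, this widens the log watcher so that lines carrying mpirun.real/ORTE failure markers are surfaced the same way Python error markers already are. A simplified, synchronous sketch of that membership test is below; the real watch() is an async coroutine, and the _PYTHON_ERRORS_ value here is an abbreviated stand-in rather than the toolkit's actual list.

_PYTHON_ERRORS_ = ["Traceback (most recent call last)"]  # abbreviated stand-in
_MPI_ERRORS_ = ["mpirun.real", "ORTE"]


def is_error_line(err_line, error_classes=None):
    # Mirror the check in process.watch(): flag the line if any known Python,
    # MPI, or framework-specific error marker appears in it.
    if error_classes is None:
        error_classes = []
    elif not isinstance(error_classes, list):
        error_classes = [error_classes]
    markers = _PYTHON_ERRORS_ + _MPI_ERRORS_ + [str(err) for err in error_classes]
    return any(marker in err_line for marker in markers)


print(is_error_line("ORTE has lost communication with a remote daemon."))  # True
print(is_error_line("Epoch 3/10 completed"))  # False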

src/sagemaker_training/runner.py

Lines changed: 8 additions & 1 deletion
@@ -90,6 +90,7 @@ def _get_by_runner_type(
     elif identifier is RunnerType.MPI and env.is_master:
         num_processes = _mpi_param_value(mpi_args, env, params.MPI_NUM_PROCESSES)
         custom_mpi_options = _mpi_param_value(mpi_args, env, params.MPI_CUSTOM_OPTIONS, "")
+        current_instance_type = env.current_instance_type
         return mpi.MasterRunner(
             user_entry_point,
             args,
@@ -100,10 +101,16 @@ def _get_by_runner_type(
             custom_mpi_options,
             env.network_interface_name,
             num_processes=num_processes,
+            instance_type=current_instance_type,
         )
     elif identifier is RunnerType.MPI:
         return mpi.WorkerRunner(
-            user_entry_point, args, env_vars, processes_per_host, env.master_hostname
+            user_entry_point,
+            args,
+            env_vars,
+            processes_per_host,
+            env.master_hostname,
+            env.current_host,
         )
     elif identifier is RunnerType.PyTorchXLA:
         return pytorch_xla.PyTorchXLARunner(
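
These runner.py edits are the wiring that feeds the new behavior: the master path forwards env.current_instance_type so MasterRunner can decide whether to emit the EFA flags, and the worker path passes env.current_host into the expanded WorkerRunner constructor. A hedged construction example follows, with placeholder hostnames and entry-point arguments; in the toolkit these values come from the training environment rather than literals.

from sagemaker_training import mpi

# Placeholder values for illustration only; the toolkit reads them from the
# training environment (env.master_hostname, env.current_host, and so on).
worker = mpi.WorkerRunner(
    user_entry_point="train.py",
    args=["--epochs", "10"],
    env_vars={},
    processes_per_host=8,
    master_hostname="algo-1",
    current_host="algo-2",
)
# With this commit, worker.run() waits for the orted processes to exit and then
# polls for /tmp/done.algo-1 before returning.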

src/sagemaker_training/smdataparallel.py

Lines changed: 45 additions & 1 deletion
@@ -42,6 +42,8 @@
 )
 exception_classes = [errors.ExecuteUserScriptError]
 
+MPI_FINISHED_STATUS_FILE = "/tmp/done"
+
 
 class SMDataParallelRunner(process.ProcessRunner):
     """Prepare SMDataParallel-based distributed training.
@@ -185,9 +187,12 @@ def _get_mpirun_command(
         mpirun_command.extend(additional_options)
 
         instance_type = self._get_instance_type()
-        # Use EFA's RDMA functionality for one-sided and two-sided transfer
+        # EFA settings
         if instance_type in ["ml.p3dn.24xlarge", "ml.p4d.24xlarge"]:
+            # Use EFA's RDMA functionality for one-sided and two-sided transfer
             mpirun_command.extend(["-x", "FI_EFA_USE_DEVICE_RDMA=1"])
+            # Use simple protocol to handle the out-of-order data delivery from EFA
+            mpirun_command.extend(["-x", "NCCL_PROTO=simple"])
 
         if smdataparallel_server_addr and smdataparallel_server_port:
             # in case of multi-node [distributed] training, smdataparallel_server_addr,
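
The EFA-related edit above is the same one made in MasterRunner._create_command() in mpi.py: only on the two EFA-capable instance types does the generated mpirun command pick up the extra -x exports. A small sketch of that gate, written as a standalone helper for illustration (it is not a function in the toolkit):

def efa_mpirun_options(instance_type):
    # Mirror the instance-type check added to the mpirun command builders.
    options = []
    if instance_type in ["ml.p3dn.24xlarge", "ml.p4d.24xlarge"]:
        # Use EFA's RDMA functionality for one-sided and two-sided transfer
        options += ["-x", "FI_EFA_USE_DEVICE_RDMA=1"]
        # Use simple protocol to handle the out-of-order data delivery from EFA
        options += ["-x", "NCCL_PROTO=simple"]
    return options


print(efa_mpirun_options("ml.p4d.24xlarge"))  # ['-x', 'FI_EFA_USE_DEVICE_RDMA=1', '-x', 'NCCL_PROTO=simple']
print(efa_mpirun_options("ml.p3.16xlarge"))   # []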
@@ -300,6 +305,28 @@ def run(self, wait=True, capture_error=False):
             capture_error=capture_error,
             cwd=environment.code_dir,
         )
+
+        logger.info("Begin writing status file from leader node to worker nodes")
+        # Write status file to all nodes
+        status_file = MPI_FINISHED_STATUS_FILE + "." + self._master_hostname
+        for host in self._hosts:
+            if host != self._master_hostname:
+                status = _write_status_file(host, status_file)
+                retry_count = 5 if not status else 0
+                while not status:
+                    if retry_count == 0:
+                        break
+                    logger.info(f"Retry creating status file onto {host}")
+                    retry_count -= 1
+                    time.sleep(1)
+                    status = _write_status_file(host, status_file)
+
+                if not status:
+                    logger.info(f"Failed to create status file onto {host}")
+
+        time.sleep(30)
+        logger.info("Finished writing status file from leader node to worker nodes")
+
         self._tear_down()
         return process_spawned
 
@@ -357,6 +384,23 @@ def _can_connect(host, port=22):
     logger.info("Connection closed")
 
 
+def _write_status_file(host, status_file):
+    try:
+        logger.info(f"Start writing mpirun finished status to {host}")
+        output = subprocess.run(
+            ["ssh", str(host), "touch", f"{status_file}"],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        logger.info(f"output from subprocess run {output}")
+        logger.info("Finished writing status file")
+        return True
+    except subprocess.CalledProcessError:
+        logger.info(f"Cannot connect to {host}")
+        return False
+
+
 def _parse_custom_mpi_options(custom_mpi_options):
     """Parse custom MPI options provided by user. Known options default value will be overridden
     and unknown options will be identified separately."""
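
On the leader side, both MasterRunner.run() and SMDataParallelRunner.run() now finish the same way: for every other host, run "ssh <host> touch /tmp/done.<leader>", retry up to five times a second apart if the SSH call fails, then sleep 30 seconds before tearing down. A condensed, self-contained sketch of that notification loop is below; the function names and the hostnames in the trailing comment are illustrative, while the toolkit's own version lives in _write_status_file() and the run() methods shown above.

import subprocess
import time

MPI_FINISHED_STATUS_FILE = "/tmp/done"


def write_status_file(host, status_file):
    # Touch the status file on a worker over SSH, as _write_status_file() does.
    try:
        subprocess.run(
            ["ssh", str(host), "touch", status_file],
            capture_output=True,
            text=True,
            check=True,
        )
        return True
    except subprocess.CalledProcessError:
        return False


def notify_workers(hosts, master_hostname, retries=5):
    status_file = MPI_FINISHED_STATUS_FILE + "." + master_hostname
    for host in hosts:
        if host == master_hostname:
            continue
        status = write_status_file(host, status_file)
        attempts_left = retries
        while not status and attempts_left > 0:
            attempts_left -= 1
            time.sleep(1)
            status = write_status_file(host, status_file)
        if not status:
            print(f"Failed to create status file onto {host}")


# Example: notify_workers(["algo-1", "algo-2"], master_hostname="algo-1")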