|
56 | 56 | # or some combination thereof. |
57 | 57 | # Refer to qstat man page for additional details. |
58 | 58 | # o(rphaned) is not considered busy because we assume a node in the orphaned state is no longer present in the ASG |
59 | | -SGE_BUSY_STATES = ["u", "C", "s", "d", "D", "E", "P"] |
| 59 | +SGE_BUSY_STATES = ["u", "C", "s", "D", "E", "P"] |
| 60 | + |
| 61 | +# This state is set by nodewatcher when the node is locked and is being terminated. |
| 62 | +SGE_DISABLED_STATE = "d" |
60 | 63 |
|
61 | 64 | # If an o(rphaned) state is displayed for a queue instance, it indicates that the queue instance is no longer demanded |
62 | 65 | # by the current cluster queue configuration or the host group configuration. The queue instance is kept because jobs |
@@ -133,10 +136,11 @@ def remove_hosts_from_queue(hosts): |
133 | 136 | def install_sge_on_compute_nodes(hosts, cluster_user): |
134 | 137 | """Start sge on compute nodes in parallel.""" |
135 | 138 | command = ( |
136 | | - "sudo sh -c 'cd {0} && {0}/inst_sge -noremote -x -auto /opt/parallelcluster/templates/sge/sge_inst.conf'" |
| 139 | + "sudo sh -c 'ps aux | grep [s]ge_execd || " |
| 140 | + "(cd {0} && {0}/inst_sge -noremote -x -auto /opt/parallelcluster/templates/sge/sge_inst.conf)'" |
137 | 141 | ).format(sge.SGE_ROOT) |
138 | 142 | hostnames = [host.hostname for host in hosts] |
139 | | - result = RemoteCommandExecutor.run_remote_command_on_multiple_hosts(command, hostnames, cluster_user) |
| 143 | + result = RemoteCommandExecutor.run_remote_command_on_multiple_hosts(command, hostnames, cluster_user, timeout=20) |
140 | 144 |
|
141 | 145 | succeeded_hosts = [] |
142 | 146 | for host in hosts: |
@@ -206,6 +210,7 @@ def get_jobs_info(hostname_filter=None, job_state_filter=None): |
206 | 210 | def get_pending_jobs_info(max_slots_filter=None, skip_if_state=None): |
207 | 211 | """ |
208 | 212 | Retrieve the list of pending jobs. |
| 213 | +
|
209 | 214 | :param max_slots_filter: discard jobs that require a number of slots bigger than the given value |
210 | 215 | :param skip_if_state: discard jobs that are in the given state |
211 | 216 | :return: the list of filtered pending jobs. |
|
0 commit comments