Skip to content

Commit deb12f2

Browse files
committed
BF: CS-1387 qrsh takes up to 1s to terminate after job end
1 parent 4bc664b commit deb12f2

File tree

1 file changed

+89
-66
lines changed

1 file changed

+89
-66
lines changed

source/clients/qsh/sge_client_ijs.cc

Lines changed: 89 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@
2828
* All Rights Reserved.
2929
*
3030
* Portions of this code are Copyright 2011 Univa Inc.
31-
*
32-
* Portions of this software are Copyright (c) 2023-2024 HPC-Gridware GmbH
31+
*
32+
* Portions of this software are Copyright (c) 2023-2025 HPC-Gridware GmbH
3333
*
3434
************************************************************************/
3535
/*___INFO__MARK_END__*/
@@ -69,6 +69,7 @@ static unsigned int g_pid = 0; /* set by main thread, read by worker thread */
6969
static int g_raw_mode_state = 0; /* set by main thread, read by worker thread */
7070
static int g_suspend_remote = 0; /* set by main thread, read by worker thread */
7171
static COMM_HANDLE *g_comm_handle = nullptr;
72+
static int g_wakeup_pipe[2] = { -1, -1 }; /* pipe to wake up worker thread */
7273

7374
/*
7475
* static volatile sig_atomic_t received_window_change_signal = 1;
@@ -147,7 +148,7 @@ static void broken_pipe_handler(int sig)
147148
* static void signal_handler(int sig)
148149
*
149150
* FUNCTION
150-
* Handler for all signals that quit the program. These signals are trapped
151+
* Handler for all signals that quit the program. These signals are trapped
151152
* in order to restore the terminal modes.
152153
*
153154
* INPUTS
@@ -190,7 +191,7 @@ void set_signal_handlers()
190191
memset(&old_handler, 0, sizeof(old_handler));
191192
memset(&new_handler, 0, sizeof(new_handler));
192193

193-
/* Is SIGHUP necessary?
194+
/* Is SIGHUP necessary?
194195
* Yes: termio(7I) says:
195196
* "When a modem disconnect is detected, a SIGHUP signal is sent
196197
* to the terminal's controlling process.
@@ -288,7 +289,7 @@ static void client_check_window_change(COMM_HANDLE *handle)
288289
if (received_window_change_signal) {
289290
/*
290291
* here we can have a race condition between the two working threads,
291-
* but it doesn't matter - in the worst case, the new window size gets
292+
* but it doesn't matter - in the worst case, the new window size gets
292293
* submitted two times.
293294
*/
294295
received_window_change_signal = 0;
@@ -340,8 +341,13 @@ void* tty_to_commlib(void *t_conf)
340341
DSTRING_STATIC(err_msg, MAX_STRING_SIZE);
341342

342343
thread_func_startup(t_conf);
343-
344-
/*
344+
345+
// create a pipe to wake up this thread
346+
if (pipe(g_wakeup_pipe) < 0) {
347+
DPRINTF("tty_to_commlib: pipe() failed: %s\n", strerror(errno));
348+
}
349+
350+
/*
345351
* allocate working buffer
346352
*/
347353
bool do_exit = false;
@@ -352,28 +358,33 @@ void* tty_to_commlib(void *t_conf)
352358
/* wait for input on tty */
353359
FD_SET(STDIN_FILENO, &read_fds);
354360
}
361+
if (g_wakeup_pipe[0] != -1) {
362+
/* wait for input on wakeup pipe */
363+
FD_SET(g_wakeup_pipe[0], &read_fds);
364+
}
355365
struct timeval timeout;
356366
timeout.tv_sec = 1;
357367
timeout.tv_usec = 0;
358368

359369
if (received_signal == SIGCONT) {
360-
received_signal = 0;
361-
if (continue_handler (g_comm_handle, g_hostname) == 1) {
362-
do_exit = true;
363-
continue;
364-
}
365-
if (g_raw_mode_state == 1) {
366-
/* restore raw-mode after SIGCONT */
367-
if (terminal_enter_raw_mode () != 0) {
368-
DPRINTF("tty_to_commlib: couldn't enter raw mode for pty\n");
369-
do_exit = true;
370-
continue;
370+
received_signal = 0;
371+
if (continue_handler (g_comm_handle, g_hostname) == 1) {
372+
do_exit = true;
373+
continue;
374+
}
375+
if (g_raw_mode_state == 1) {
376+
/* restore raw-mode after SIGCONT */
377+
if (terminal_enter_raw_mode () != 0) {
378+
DPRINTF("tty_to_commlib: couldn't enter raw mode for pty\n");
379+
do_exit = true;
380+
continue;
371381
}
372-
}
373-
}
374-
382+
}
383+
}
384+
375385
DPRINTF("tty_to_commlib: Waiting in select() for data\n");
376-
int ret = select(STDIN_FILENO+1, &read_fds, nullptr, nullptr, &timeout);
386+
int ret = select(MAX(STDIN_FILENO, g_wakeup_pipe[0]) + 1, &read_fds, nullptr, nullptr, &timeout);
387+
DPRINTF("select returned %d\n", ret);
377388

378389
thread_testcancel(t_conf);
379390
client_check_window_change(g_comm_handle);
@@ -394,29 +405,36 @@ void* tty_to_commlib(void *t_conf)
394405
/* We should never get here if STDIN is closed */
395406
DPRINTF("tty_to_commlib: STDIN ready to read while it should be closed!!!\n");
396407
}
397-
DPRINTF("tty_to_commlib: trying to read() from stdin\n");
398-
int nread = read(STDIN_FILENO, pbuf, BUFSIZE-1);
399-
pbuf[nread] = '\0';
400-
sge_dstring_append (&dbuf, pbuf);
401-
DPRINTF("tty_to_commlib: nread = %d\n", nread);
402-
403-
if (nread < 0 && (errno == EINTR || errno == EAGAIN)) {
404-
DPRINTF("tty_to_commlib: EINTR or EAGAIN\n");
405-
/* do nothing */
406-
} else if (nread <= 0) {
407-
do_exit = true;
408-
} else {
409-
DPRINTF("tty_to_commlib: writing to commlib: %d bytes\n", nread);
410-
if (suspend_handler(g_comm_handle, g_hostname, g_is_rsh, g_suspend_remote, g_pid, &dbuf) == 1) {
411-
if (comm_write_message(g_comm_handle, g_hostname,
412-
COMM_CLIENT, 1, (unsigned char*)pbuf,
413-
(unsigned long)nread, STDIN_DATA_MSG, &err_msg) != (unsigned long)nread) {
414-
DPRINTF("tty_to_commlib: couldn't write all data\n");
415-
} else {
416-
DPRINTF("tty_to_commlib: data successfully written\n");
417-
}
408+
if (FD_ISSET(STDIN_FILENO, &read_fds)) {
409+
DPRINTF("tty_to_commlib: trying to read() from stdin\n");
410+
int nread = read(STDIN_FILENO, pbuf, BUFSIZE-1);
411+
pbuf[nread] = '\0';
412+
sge_dstring_append (&dbuf, pbuf);
413+
DPRINTF("tty_to_commlib: nread = %d\n", nread);
414+
415+
if (nread < 0 && (errno == EINTR || errno == EAGAIN)) {
416+
DPRINTF("tty_to_commlib: EINTR or EAGAIN\n");
417+
/* do nothing */
418+
} else if (nread <= 0) {
419+
do_exit = true;
420+
} else {
421+
DPRINTF("tty_to_commlib: writing to commlib: %d bytes\n", nread);
422+
if (suspend_handler(g_comm_handle, g_hostname, g_is_rsh, g_suspend_remote, g_pid, &dbuf) == 1) {
423+
if (comm_write_message(g_comm_handle, g_hostname,
424+
COMM_CLIENT, 1, (unsigned char *) pbuf,
425+
(unsigned long) nread, STDIN_DATA_MSG, &err_msg) != (unsigned long) nread) {
426+
DPRINTF("tty_to_commlib: couldn't write all data\n");
427+
} else {
428+
DPRINTF("tty_to_commlib: data successfully written\n");
429+
}
430+
}
431+
comm_flush_write_messages(g_comm_handle, &err_msg);
418432
}
419-
comm_flush_write_messages(g_comm_handle, &err_msg);
433+
} else if (FD_ISSET(g_wakeup_pipe[0], &read_fds)) {
434+
// If we received something on the wakeup pipe, we shall exit.
435+
// We will probably never get here as thread_testcancel() above will already terminate the thread.
436+
DPRINTF("wakeup pipe was triggered, exiting tty_to_commlib thread\n");
437+
do_exit = true;
420438
}
421439
sge_dstring_free(&dbuf);
422440
} else {
@@ -444,7 +462,7 @@ void* tty_to_commlib(void *t_conf)
444462

445463
/* clean up */
446464
thread_func_cleanup(t_conf);
447-
465+
448466
DPRINTF("tty_to_commlib: exiting tty_to_commlib thread!\n");
449467
DRETURN(nullptr);
450468
}
@@ -559,9 +577,9 @@ void* commlib_to_tty(void *t_conf)
559577
break;
560578
case REGISTER_CTRL_MSG:
561579
/* control message */
562-
/* a client registered with us. With the next loop, the
580+
/* a client registered with us. With the next loop, the
563581
* cl_commlib_trigger function will send the WINDOW_SIZE_CTRL_MSG
564-
* (and perhaps some data messages), which is already in the
582+
* (and perhaps some data messages), which is already in the
565583
* send_messages list of the connection, to the client.
566584
*/
567585
DPRINTF("commlib_to_tty: received register message!\n");
@@ -574,7 +592,7 @@ void* commlib_to_tty(void *t_conf)
574592
case UNREGISTER_CTRL_MSG:
575593
/* control message */
576594
/* the client wants to quit, as this is the last message the client
577-
* sends, we can be sure to have received all messages from the
595+
* sends, we can be sure to have received all messages from the
578596
* client. We answer with a UNREGISTER_RESPONSE_CTRL_MSG so
579597
* the client knows that it can quit now. We can quit, also.
580598
*/
@@ -591,7 +609,7 @@ void* commlib_to_tty(void *t_conf)
591609
* If the job was signalled, the exit code is 128+signal.
592610
*/
593611
sscanf(buf, "%d", &g_exit_status);
594-
comm_write_message(g_comm_handle, g_hostname, COMM_CLIENT, 1,
612+
comm_write_message(g_comm_handle, g_hostname, COMM_CLIENT, 1,
595613
(unsigned char*)" ", 1, UNREGISTER_RESPONSE_CTRL_MSG, &err_msg);
596614

597615
DPRINTF("commlib_to_tty: received exit_status from shepherd: %d\n", g_exit_status);
@@ -747,9 +765,9 @@ int run_ijs_server(COMM_HANDLE *handle, const char *remote_host,
747765

748766
/*
749767
* From here on, the two worker threads are doing all the work.
750-
* This main thread is just waiting until the client closes the
751-
* connection to us, which causes the commlib_to_tty thread to
752-
* exit. Then it closes the tty_to_commlib thread, too, and
768+
* This main thread is just waiting until the client closes the
769+
* connection to us, which causes the commlib_to_tty thread to
770+
* exit. Then it closes the tty_to_commlib thread, too, and
753771
* cleans up everything.
754772
*/
755773
DPRINTF("waiting for end of commlib_to_tty thread\n");
@@ -758,14 +776,19 @@ int run_ijs_server(COMM_HANDLE *handle, const char *remote_host,
758776
DPRINTF("shutting down tty_to_commlib thread\n");
759777
thread_shutdown(pthread_tty_to_commlib);
760778

761-
/*
762-
* Close stdin to awake the tty_to_commlib-thread from the select() call.
763-
* thread_shutdown() doesn't work on all architectures.
764-
*/
765-
close(STDIN_FILENO);
779+
// write a byte into the wakeup pipe to wake up the tty_to_commlib thread
780+
if (g_wakeup_pipe[1] != -1) {
781+
DPRINTF("tty_to_commlib: writing wakeup byte to pipe\n");
782+
if (write(g_wakeup_pipe[1], "x", 1) < 0) {
783+
DPRINTF("tty_to_commlib: write() to wakeup pipe failed: %s\n", strerror(errno));
784+
}
785+
} else {
786+
DPRINTF("tty_to_commlib: g_wakeup_pipe[1] is -1, not writing wakeup byte\n");
787+
}
766788

767789
DPRINTF("waiting for end of tty_to_commlib thread\n");
768790
thread_join(pthread_tty_to_commlib);
791+
DPRINTF("tty_to_commlib thread terminated\n");
769792
cleanup:
770793
/*
771794
* Set our terminal back to 'unraw' mode. Should be done automatically
@@ -775,7 +798,7 @@ int run_ijs_server(COMM_HANDLE *handle, const char *remote_host,
775798
DPRINTF("terminal_leave_raw_mode() returned %s (%d)\n", strerror(ret), ret);
776799
if (ret != 0) {
777800
sge_dstring_sprintf(p_err_msg, "error resetting terminal mode: %s (%d)", strerror(ret), ret);
778-
ret_val = 7;
801+
ret_val = 7;
779802
}
780803

781804
*p_exit_status = g_exit_status;
@@ -795,7 +818,7 @@ int run_ijs_server(COMM_HANDLE *handle, const char *remote_host,
795818
*
796819
* FUNCTION
797820
* Starts the commlib server for the commlib connection between the shepherd
798-
* of the interactive job (qrsh/qlogin) and the qrsh/qlogin command.
821+
* of the interactive job (qrsh/qlogin) and the qrsh/qlogin command.
799822
* Over this connection the stdin/stdout/stderr input/output is transferred.
800823
*
801824
* INPUTS
@@ -815,7 +838,7 @@ int run_ijs_server(COMM_HANDLE *handle, const char *remote_host,
815838
* 2: Can't set connection parameters
816839
*
817840
* NOTES
818-
* MT-NOTE: start_builtin_ijs_server() is not MT safe
841+
* MT-NOTE: start_builtin_ijs_server() is not MT safe
819842
*
820843
* SEE ALSO
821844
* sge_client_ijs/run_ijs_server()
@@ -829,7 +852,7 @@ int start_ijs_server(bool csp_mode, const char* username,
829852

830853
DENTER(TOP_LAYER);
831854

832-
/* we must copy the hostname here to a global variable, because the
855+
/* we must copy the hostname here to a global variable, because the
833856
* worker threads need it later.
834857
* It gets freed in force_ijs_server_shutdown().
835858
* TODO: Cleaner solution for this!
@@ -856,7 +879,7 @@ int start_ijs_server(bool csp_mode, const char* username,
856879
* interactive job support
857880
*
858881
* SYNOPSIS
859-
* int stop_ijs_server(COMM_HANDLE **phandle, dstring *p_err_msg)
882+
* int stop_ijs_server(COMM_HANDLE **phandle, dstring *p_err_msg)
860883
*
861884
* FUNCTION
862885
* Stops the commlib server for the commlib connection between the shepherd
@@ -875,7 +898,7 @@ int start_ijs_server(bool csp_mode, const char* username,
875898
* see p_err_msg for details
876899
*
877900
* NOTES
878-
* MT-NOTE: stop_ijs_server() is not MT safe
901+
* MT-NOTE: stop_ijs_server() is not MT safe
879902
*
880903
* SEE ALSO
881904
* sge_client_ijs/start_ijs_server()
@@ -914,8 +937,8 @@ int stop_ijs_server(COMM_HANDLE **phandle, dstring *p_err_msg)
914937
* interactive job support to shut down
915938
*
916939
* SYNOPSIS
917-
* int force_ijs_server_shutdown(COMM_HANDLE **phandle, const char
918-
* *this_component, dstring *p_err_msg)
940+
* int force_ijs_server_shutdown(COMM_HANDLE **phandle, const char
941+
* *this_component, dstring *p_err_msg)
919942
*
920943
* FUNCTION
921944
* Forces the commlib server for the builtin interactive job support to shut
@@ -933,7 +956,7 @@ int stop_ijs_server(COMM_HANDLE **phandle, dstring *p_err_msg)
933956
* 2: Can't shut down connection, see p_err_msg for details
934957
*
935958
* NOTES
936-
* MT-NOTE: force_ijs_server_shutdown() is not MT safe
959+
* MT-NOTE: force_ijs_server_shutdown() is not MT safe
937960
*
938961
* SEE ALSO
939962
* sge_client_ijs/start_ijs_server()

0 commit comments

Comments
 (0)