From ec6426fec9777d95ceae2b866882c0fb0eddc467 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@llnl.gov> Date: Fri, 25 May 2007 18:52:33 +0000 Subject: [PATCH] svn merge -r11571:11589 https://eris.llnl.gov/svn/slurm/branches/slurm-1.1 --- NEWS | 3 +++ src/common/node_select.c | 2 +- src/slurmd/slurmd/req.c | 21 ++++++++++++------ src/slurmd/slurmd/slurmd.c | 23 ++++++++++++++------ src/slurmd/slurmstepd/io.c | 15 ++++++++++--- src/slurmd/slurmstepd/mgr.c | 17 +++++++++++---- src/slurmd/slurmstepd/req.c | 34 ++++++++++++++++++++++-------- src/slurmd/slurmstepd/slurmstepd.c | 11 +++++----- src/slurmd/slurmstepd/slurmstepd.h | 1 + src/srun/srun.c | 12 ++++++++++- testsuite/expect/test9.1 | 2 +- 11 files changed, 103 insertions(+), 38 deletions(-) diff --git a/NEWS b/NEWS index 933740ed862..99f8814450c 100644 --- a/NEWS +++ b/NEWS @@ -313,6 +313,9 @@ documents those changes that are of interest to users and admins. - In sched/wiki2: Add NAME to job record. - Changed -w (--nodelist) option to only read in number of nodes specified by -N option unless nprocs was set and in Arbitrary layout mode. + - Added retry loops around pthread creates in case they fail, and fixed an + issue in srun so that it fails when the job has failed instead of waiting + around for threads that will never end. 
* Changes in SLURM 1.1.36 ========================= diff --git a/src/common/node_select.c b/src/common/node_select.c index 878eae17591..185b3133a13 100644 --- a/src/common/node_select.c +++ b/src/common/node_select.c @@ -381,7 +381,7 @@ extern int select_g_block_init(List block_list) * IN/OUT data - the data to get from node record */ extern int select_g_get_extra_jobinfo (struct node_record *node_ptr, - struct job_record *job_ptr, + struct job_record *job_ptr, enum select_data_info cr_info, void *data) { diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index ba733bd6050..c9c895e329f 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -256,7 +256,7 @@ _send_slurmstepd_init(int fd, slurmd_step_type_t type, void *req, hostset_t step_hset) { int len = 0; - Buf buffer; + Buf buffer = NULL; slurm_msg_t msg; uid_t uid = (uid_t)-1; struct passwd *pw = NULL; @@ -413,6 +413,8 @@ _send_slurmstepd_init(int fd, slurmd_step_type_t type, void *req, return 0; rwfail: + if(buffer) + free_buf(buffer); error("_send_slurmstepd_init failed"); return -1; } @@ -486,15 +488,19 @@ _forkexec_slurmstepd(slurmd_step_type_t type, void *req, return rc; } else { char *const argv[2] = { slurm_stepd_path, NULL}; + int failed = 0; /* * Child forks and exits */ - if (setsid() < 0) + if (setsid() < 0) { error("_forkexec_slurmstepd: setsid: %m"); - if ((pid = fork()) < 0) + failed = 1; + } + if ((pid = fork()) < 0) { error("_forkexec_slurmstepd: " "Unable to fork grandchild: %m"); - else if (pid > 0) { /* child */ + failed = 2; + } else if (pid > 0) { /* child */ exit(0); } @@ -523,9 +529,10 @@ _forkexec_slurmstepd(slurmd_step_type_t type, void *req, } fd_set_noclose_on_exec(STDERR_FILENO); log_fini(); - execvp(argv[0], argv); - - fatal("exec of slurmstepd failed: %m"); + if(!failed) { + execvp(argv[0], argv); + error("exec of slurmstepd failed: %m"); + } exit(2); } } diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c index e1295e9720b..df6e8c4be76 
100644 --- a/src/slurmd/slurmd/slurmd.c +++ b/src/slurmd/slurmd/slurmd.c @@ -291,7 +291,8 @@ static void _decrement_thd_count(void) { slurm_mutex_lock(&active_mutex); - active_threads--; + if(active_threads>0) + active_threads--; pthread_cond_signal(&active_cond); slurm_mutex_unlock(&active_mutex); } @@ -333,6 +334,7 @@ _handle_connection(slurm_fd fd, slurm_addr *cli) pthread_attr_t attr; pthread_t id; conn_t *arg = xmalloc(sizeof(conn_t)); + int retries = 0; arg->fd = fd; arg->cli_addr = cli; @@ -350,13 +352,20 @@ _handle_connection(slurm_fd fd, slurm_addr *cli) fd_set_close_on_exec(fd); _increment_thd_count(); - rc = pthread_create(&id, &attr, &_service_connection, (void *) arg); - slurm_attr_destroy(&attr); - if (rc != 0) { - error("msg_engine: pthread_create: %s", slurm_strerror(rc)); - _service_connection((void *) arg); - return; + while (pthread_create(&id, &attr, &_service_connection, (void *)arg)) { + error("msg_engine: pthread_create: %m"); + if (++retries > 3) { + error("running service_connection without starting " + "a new thread slurmd will be " + "unresponsive until done"); + + _service_connection((void *) arg); + info("slurmd should be responsive now"); + break; + } + usleep(10); /* sleep and again */ } + return; } diff --git a/src/slurmd/slurmstepd/io.c b/src/slurmd/slurmstepd/io.c index 143b6f3eb91..8c88b4e9761 100644 --- a/src/slurmd/slurmstepd/io.c +++ b/src/slurmd/slurmstepd/io.c @@ -72,6 +72,7 @@ #include "src/slurmd/slurmd/slurmd.h" #include "src/slurmd/slurmstepd/io.h" #include "src/slurmd/slurmstepd/fname.h" +#include "src/slurmd/slurmstepd/slurmstepd.h" /********************************************************************** * IO client socket declarations @@ -781,17 +782,25 @@ int io_thread_start(slurmd_job_t *job) { pthread_attr_t attr; + int rc = 0, retries = 0; slurm_attr_init(&attr); - if (pthread_create(&job->ioid, &attr, &_io_thr, (void *)job) != 0) - fatal("pthread_create: %m"); + while (pthread_create(&job->ioid, &attr, &_io_thr, 
(void *)job)) { + error("io_thread_start: pthread_create error %m"); + if (++retries > MAX_RETRIES) { + error("io_thread_start: Can't create pthread"); + rc = -1; + break; + } + usleep(10); /* sleep and again */ + } slurm_attr_destroy(&attr); /*fatal_add_cleanup(&_fatal_cleanup, (void *) job);*/ - return 0; + return rc; } diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index bb94092a702..a2222e55b6e 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -873,13 +873,11 @@ _fork_all_tasks(slurmd_job_t *job) */ for (i = 0; i < job->ntasks; i++) { pid_t pid; - if ((pid = fork ()) < 0) { - error("fork: %m"); + error("child fork: %m"); goto fail2; } else if (pid == 0) { /* child */ int j; - #ifdef HAVE_AIX (void) mkcrid(0); #endif @@ -994,6 +992,9 @@ _fork_all_tasks(slurmd_job_t *job) fail2: _reclaim_privileges (&sprivs); fail1: + xfree(writefds); + xfree(readfds); + pam_finish(); return SLURM_ERROR; } @@ -1187,12 +1188,20 @@ static void _delay_kill_thread(pthread_t thread_id, int secs) pthread_t kill_id; pthread_attr_t attr; kill_thread_t *kt = xmalloc(sizeof(kill_thread_t)); + int retries = 0; kt->thread_id = thread_id; kt->secs = secs; slurm_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); - pthread_create(&kill_id, &attr, &_kill_thr, (void *) kt); + while (pthread_create(&kill_id, &attr, &_kill_thr, (void *) kt)) { + error("_delay_kill_thread: pthread_create: %m"); + if (++retries > MAX_RETRIES) { + error("_delay_kill_thread: Can't create pthread"); + break; + } + usleep(10); /* sleep and again */ + } slurm_attr_destroy(&attr); } diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c index d79413d8fba..0522c0ab5e6 100644 --- a/src/slurmd/slurmstepd/req.c +++ b/src/slurmd/slurmstepd/req.c @@ -215,7 +215,7 @@ msg_thr_create(slurmd_job_t *job) int fd; eio_obj_t *eio_obj; pthread_attr_t attr; - + int rc = SLURM_SUCCESS, retries = 0; errno = 0; fd = 
_domain_socket_create(conf->spooldir, conf->node_name, job->jobid, job->stepid); @@ -229,15 +229,21 @@ msg_thr_create(slurmd_job_t *job) eio_new_initial_obj(job->msg_handle, eio_obj); slurm_attr_init(&attr); - if (pthread_create(&job->msgid, &attr, - &_msg_thr_internal, (void *)job) != 0) { - error("pthread_create: %m"); - slurm_attr_destroy(&attr); - return SLURM_ERROR; + + while (pthread_create(&job->msgid, &attr, + &_msg_thr_internal, (void *)job)) { + error("msg_thr_create: pthread_create error %m"); + if (++retries > MAX_RETRIES) { + error("msg_thr_create: Can't create pthread"); + rc = SLURM_ERROR; + break; + } + usleep(10); /* sleep and again */ } + slurm_attr_destroy(&attr); - return SLURM_SUCCESS; + return rc; } /* @@ -286,6 +292,7 @@ _msg_socket_accept(eio_obj_t *obj, List objs) struct request_params *param = NULL; pthread_attr_t attr; pthread_t id; + int retries = 0; debug3("Called _msg_socket_accept"); @@ -321,10 +328,19 @@ _msg_socket_accept(eio_obj_t *obj, List objs) param = xmalloc(sizeof(struct request_params)); param->fd = fd; param->job = job; - if (pthread_create(&id, &attr, &_handle_accept, (void *)param) != 0) { + while (pthread_create(&id, &attr, &_handle_accept, (void *)param)) { error("stepd_api message engine pthread_create: %m"); - _handle_accept((void *)param); + if (++retries > MAX_RETRIES) { + error("running handle_accept without " + "starting a thread stepd will be " + "unresponsive until done"); + _handle_accept((void *)param); + info("stepd should be responsive now"); + break; + } + usleep(10); /* sleep and again */ } + slurm_attr_destroy(&attr); param = NULL; diff --git a/src/slurmd/slurmstepd/slurmstepd.c b/src/slurmd/slurmstepd/slurmstepd.c index 9079cdbbe85..500dad27fb6 100644 --- a/src/slurmd/slurmstepd/slurmstepd.c +++ b/src/slurmd/slurmstepd/slurmstepd.c @@ -17,7 +17,7 @@ * any later version. 
* * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than @@ -86,14 +86,13 @@ main (int argc, char *argv[]) slurmd_job_t *job; int ngids; gid_t *gids; - int rc; + int rc = 0; xsignal_block(slurmstepd_blocked_signals); conf = xmalloc(sizeof(*conf)); conf->argv = &argv; conf->argc = &argc; init_setproctitle(argc, argv); - _init_from_slurmd(STDIN_FILENO, argv, &cli, &self, &msg, &ngids, &gids); @@ -111,7 +110,8 @@ main (int argc, char *argv[]) /* sets job->msg_handle and job->msgid */ if (msg_thr_create(job) == SLURM_ERROR) { _send_fail_to_slurmd(STDOUT_FILENO); - return -1; + rc = SLURM_FAILURE; + goto ending; } _send_ok_to_slurmd(STDOUT_FILENO); @@ -127,6 +127,7 @@ main (int argc, char *argv[]) eio_signal_shutdown(job->msg_handle); pthread_join(job->msgid, NULL); +ending: _step_cleanup(job, msg, rc); xfree(cli); @@ -137,7 +138,7 @@ main (int argc, char *argv[]) xfree(conf->logfile); xfree(conf); info("done with job"); - return 0; + return rc; } static void diff --git a/src/slurmd/slurmstepd/slurmstepd.h b/src/slurmd/slurmstepd/slurmstepd.h index 7495fcec67c..ef67ba3f094 100644 --- a/src/slurmd/slurmstepd/slurmstepd.h +++ b/src/slurmd/slurmstepd/slurmstepd.h @@ -42,6 +42,7 @@ #include "src/common/bitstring.h" #define STEPD_MESSAGE_COMP_WAIT 15 /* seconds */ +#define MAX_RETRIES 3 extern int slurmstepd_blocked_signals[]; diff --git a/src/srun/srun.c b/src/srun/srun.c index 60bdd19c92d..c112e0fcd49 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -406,7 +406,17 @@ int srun(int ac, char **av) info("Cancelling job"); srun_job_destroy(job, NO_VAL); exit(1); - } + } else if 
(job->state == SRUN_JOB_FAILED) { + /* This check here is to see if the job failed + because we (srun, slurmd or slurmstepd) weren't + able to fork or make a thread or something. We still + need the job failed check below in case the job + failed on its own. + */ + info("Job Failed"); + srun_job_destroy(job, NO_VAL); + exit(1); + } /* * We want to make sure we get the correct state of the job diff --git a/testsuite/expect/test9.1 b/testsuite/expect/test9.1 index 467c2be47d0..035f39b7e0b 100755 --- a/testsuite/expect/test9.1 +++ b/testsuite/expect/test9.1 @@ -47,7 +47,7 @@ if { [test_bluegene] } { if { [test_xcpu] } { set node_cnt 1-1 } else { - set node_cnt 1-4 + set node_cnt 1-6 } } -- GitLab