From a246d3cf1c74ef7c49d8ad0cc70bcf529730b9c0 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Thu, 29 Sep 2005 22:02:59 +0000 Subject: [PATCH] merge -r 5417:5444 from slurm-0-6-branch --- META | 6 +- NEWS | 19 +++ doc/man/man5/slurm.conf.5 | 13 +- etc/slurm.conf.example | 3 + src/common/read_config.c | 19 ++- src/plugins/mpi/lam/Makefile.am | 3 - src/plugins/mpi/mpichgm/Makefile.am | 7 +- src/plugins/mpi/mvapich/Makefile.am | 5 +- src/plugins/mpi/mvapich/mvapich.c | 20 ++- src/plugins/mpi/none/Makefile.am | 3 - src/plugins/select/bluegene/bluegene.c | 2 - src/plugins/select/bluegene/partition_sys.c | 11 +- src/slurmd/req.c | 17 ++- src/smap/job_functions.c | 4 +- src/smap/partition_functions.c | 157 +++++++++++++++----- src/srun/launch.c | 6 +- src/srun/msg.c | 45 ++++-- 17 files changed, 237 insertions(+), 103 deletions(-) diff --git a/META b/META index 7db18821c5f..cc54702ecc3 100644 --- a/META +++ b/META @@ -9,9 +9,9 @@ Name: slurm Major: 0 Minor: 6 - Micro: 0 - Version: 0.6.0 - Release: 0.pre8 + Micro: 1 + Version: 0.6.1 + Release: 1 API_CURRENT: 7 API_AGE: 4 API_REVISION: 0 diff --git a/NEWS b/NEWS index b32e7dc249e..a70d66d1210 100644 --- a/NEWS +++ b/NEWS @@ -13,6 +13,25 @@ documents those changes that are of interest to users and admins. REQUEST_KILL_JOB/TASKS changed to REQUEST_SIGNAL_JOB/TASKS. -- Add support for e-mail notification on job state changes. +* Changes in SLURM 0.6.2 +======================== + +* Changes in SLURM 0.6.1 +======================== + -- Fixed smap -Db to display slurm partitions correctly (take 2). + -- Add srun fork() retry logic for very heavily loaded systems. + -- Fix possible srun hang on task launch failure. + -- Add support for mvapich v0.9.4, 0.9.5 and gen2. + +* Changes in SLURM 0.6.0 +======================== + -- Add documentation for ProctrackType=proctrack/rms. + -- Make proctrack/rms be the default for switch/elan. + -- Do not precede SIGKILL or SIGTERM to job step with (non-requested) SIGCONT. 
+ -- Fixed smap -Db to display slurm partitions correctly. + -- Explicitly disallow ProctrackType=proctrack/linuxproc with + SwitchType=switch/elan. They will not work properly together. + * Changes in SLURM 0.6.0-pre8 ============================= -- Remove debugging xassert in switch/federation that were accidentally diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 303f19c9d30..b799b3e78c7 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -1,4 +1,4 @@ -.TH "slurm.conf" "5" "August 2005" "slurm.conf 0.6" "Slurm configuration file" +.TH "slurm.conf" "5" "September 2005" "slurm.conf 0.6" "Slurm configuration file" .SH "NAME" slurm.conf \- Slurm configuration file .SH "DESCRIPTION" @@ -235,11 +235,16 @@ The default value is "/usr/local/lib/slurm". Identifies the plugin to be used for process tracking. The slurmd daemon uses this mechanism to identify all processes which are children of processes it spawns for a user job. -Acceptable values at present include "proctrack/aix" (which -is the default for AIX systems) and "proctrack/pgid" (which -is the default for all other systems). +Acceptable values at present include +"proctrack/aix" (which uses an AIX kernel extension and is +the default for AIX systems), +"proctrack/linuxproc" (which uses the Linux process tree), +"proctrack/rms" (which uses the Quadrics kernel patch and is the +default if "SwitchType=switch/elan") and +"proctrack/pgid" (which is the default for all other systems). The slurmd daemon must be restarted for a change in ProctrackType to take effect. +NOTE: "proctrack/linuxproc" is not compatible with "switch/elan". 
.TP \fBProlog\fR Fully qualified pathname of a script to execute as user root on every diff --git a/etc/slurm.conf.example b/etc/slurm.conf.example index f2223bd353e..0255017fb88 100644 --- a/etc/slurm.conf.example +++ b/etc/slurm.conf.example @@ -231,6 +231,9 @@ # the default value on all other computers # "proctrack/linuxproc" : use parent process ID to establish process # tree, required for MPICH-GM use +# "proctrack/rms" : use Quadrics kernel infrastructure to track +# processes, strongly recommended for systems +# with a Quadrics switch # # ProctrackType=proctrack/pgid diff --git a/src/common/read_config.c b/src/common/read_config.c index 4da69a83d3f..877515c1fab 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -1310,8 +1310,20 @@ validate_config (slurm_ctl_conf_t *ctl_conf_ptr) if (ctl_conf_ptr->plugindir == NULL) ctl_conf_ptr->plugindir = xstrdup(SLURM_PLUGIN_PATH); - if (ctl_conf_ptr->proctrack_type == NULL) - ctl_conf_ptr->proctrack_type = xstrdup(DEFAULT_PROCTRACK_TYPE); + if (ctl_conf_ptr->switch_type == NULL) + ctl_conf_ptr->switch_type = xstrdup(DEFAULT_SWITCH_TYPE); + + if (ctl_conf_ptr->proctrack_type == NULL) { + if (!strcmp(ctl_conf_ptr->switch_type,"switch/elan")) + ctl_conf_ptr->proctrack_type = + xstrdup("proctrack/rms"); + else + ctl_conf_ptr->proctrack_type = + xstrdup(DEFAULT_PROCTRACK_TYPE); + } + if ((!strcmp(ctl_conf_ptr->switch_type, "switch/elan")) + && (!strcmp(ctl_conf_ptr->proctrack_type,"proctrack/linuxproc"))) + fatal("proctrack/linuxproc is incompatable with switch/elan"); if (ctl_conf_ptr->propagate_rlimits_except) { if ((parse_rlimits( ctl_conf_ptr->propagate_rlimits_except, @@ -1381,8 +1393,7 @@ validate_config (slurm_ctl_conf_t *ctl_conf_ptr) ctl_conf_ptr->state_save_location = xstrdup( DEFAULT_SAVE_STATE_LOC); - if (ctl_conf_ptr->switch_type == NULL) - ctl_conf_ptr->switch_type = xstrdup(DEFAULT_SWITCH_TYPE); + /* see above for switch_type, order dependent */ if (ctl_conf_ptr->tmp_fs == NULL) 
ctl_conf_ptr->tmp_fs = xstrdup(DEFAULT_TMP_FS); diff --git a/src/plugins/mpi/lam/Makefile.am b/src/plugins/mpi/lam/Makefile.am index b34a0a559ad..bb1d541f8de 100644 --- a/src/plugins/mpi/lam/Makefile.am +++ b/src/plugins/mpi/lam/Makefile.am @@ -11,6 +11,3 @@ pkglib_LTLIBRARIES = mpi_lam.la mpi_lam_la_SOURCES = mpi_lam.c lam.h mpi_lam_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) -mpi_lam_la_LIBADD = \ - $(top_builddir)/src/common/libcommon.la -lpthread \ - $(top_builddir)/src/api/libslurm.la diff --git a/src/plugins/mpi/mpichgm/Makefile.am b/src/plugins/mpi/mpichgm/Makefile.am index abc64f13306..324561f396e 100644 --- a/src/plugins/mpi/mpichgm/Makefile.am +++ b/src/plugins/mpi/mpichgm/Makefile.am @@ -10,8 +10,7 @@ INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common pkglib_LTLIBRARIES = mpi_mpichgm.la # Null switch plugin. -mpi_mpichgm_la_SOURCES = mpi_mpichgm.c mpichgm.c +mpi_mpichgm_la_SOURCES = mpi_mpichgm.c mpichgm.c \ + $(top_srcdir)/src/common/global_srun.c \ + $(top_srcdir)/src/common/net.c mpi_mpichgm_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) -mpi_mpichgm_la_LIBADD = \ - $(top_builddir)/src/common/libcommon.la -lpthread \ - $(top_builddir)/src/api/libslurm.la diff --git a/src/plugins/mpi/mvapich/Makefile.am b/src/plugins/mpi/mvapich/Makefile.am index ca57696a78d..76ef64eda6b 100644 --- a/src/plugins/mpi/mvapich/Makefile.am +++ b/src/plugins/mpi/mvapich/Makefile.am @@ -10,8 +10,5 @@ INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common pkglib_LTLIBRARIES = mpi_mvapich.la # Null switch plugin. 
-mpi_mvapich_la_SOURCES = mpi_mvapich.c mvapich.c +mpi_mvapich_la_SOURCES = mpi_mvapich.c mvapich.c $(top_srcdir)/src/common/net.c mpi_mvapich_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) -mpi_mvapich_la_LIBADD = \ - $(top_builddir)/src/common/libcommon.la -lpthread \ - $(top_builddir)/src/api/libslurm.la diff --git a/src/plugins/mpi/mvapich/mvapich.c b/src/plugins/mpi/mvapich/mvapich.c index 678e9100cb7..59fbfb5126d 100644 --- a/src/plugins/mpi/mvapich/mvapich.c +++ b/src/plugins/mpi/mvapich/mvapich.c @@ -60,7 +60,8 @@ struct mvapich_info int fd; /* fd for socket connection to MPI task */ int version; /* Version of mvapich startup protocol */ int rank; /* This process' MPI rank */ - int pid; /* This rank's local pid (V3 only) */ + int pidlen; /* length of pid buffer */ + char *pid; /* This rank's local pid (V3 only) */ int addrlen; /* Length of addr array in bytes */ int *addr; /* This process' address array, which for @@ -115,7 +116,7 @@ static struct mvapich_info * mvapich_info_create (int fd) if (fd_read_n (fd, &mvi->rank, sizeof (int)) < 0) E_RET ("mvapich: Unable to read rank id: %m", mvi->rank); - if (mvi->version != 2 && mvi->version != 3) + if (mvi->version <= 1 || mvi->version > 3) E_RET ("Unsupported version %d from rank %d", mvi->version, mvi->rank); if (fd_read_n (fd, &mvi->addrlen, sizeof (int)) < 0) @@ -127,25 +128,22 @@ static struct mvapich_info * mvapich_info_create (int fd) E_RET ("mvapich: Unable to read addr info for rank %d: %m", mvi->rank); if (mvi->version == 3) { - int pidlen; - if (fd_read_n (fd, &pidlen, sizeof (int)) < 0) + if (fd_read_n (fd, &mvi->pidlen, sizeof (int)) < 0) E_RET ("mvapich: Unable to read pidlen for rank %d: %m", mvi->rank); - if (pidlen != sizeof (mvi->pid)) - E_RET ("mvapich: Confused. 
Rank %d pidlen of %d not what I expected", - mvi->rank, pidlen); + mvi->pid = xmalloc (mvi->pidlen); - if (fd_read_n (fd, &mvi->pid, pidlen) < 0) + if (fd_read_n (fd, &mvi->pid, mvi->pidlen) < 0) E_RET ("mvapich: Unable to read pid for rank %d: %m", mvi->rank); } - return (mvi); } static void mvapich_info_destroy (struct mvapich_info *mvi) { xfree (mvi->addr); + xfree (mvi->pid); xfree (mvi); return; } @@ -201,7 +199,7 @@ static void mvapich_bcast (void) */ if (protocol_version == 3) { for (j = 0; j < nprocs; j++) - fd_write_n (m->fd, &mvarray[j]->pid, sizeof (int)); + fd_write_n (m->fd, &mvarray[j]->pid, mvarray[j]->pidlen); } } @@ -288,7 +286,7 @@ static void *mvapich_thr(void *arg) mvarray = xmalloc (nprocs * sizeof (*mvarray)); - debug ("mvapich-0.9.[45]: thread started: %ld", pthread_self ()); + debug ("mvapich-0.9.[45]/gen2: thread started: %ld", pthread_self ()); while (i < nprocs) { struct mvapich_info *mvi = NULL; diff --git a/src/plugins/mpi/none/Makefile.am b/src/plugins/mpi/none/Makefile.am index 5de265c825d..f1b7ab64f1f 100644 --- a/src/plugins/mpi/none/Makefile.am +++ b/src/plugins/mpi/none/Makefile.am @@ -12,6 +12,3 @@ pkglib_LTLIBRARIES = mpi_none.la # Null MPI plugin. 
mpi_none_la_SOURCES = mpi_none.c mpi_none_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) -mpi_none_la_LIBADD = \ - $(top_builddir)/src/common/libcommon.la -lpthread \ - $(top_builddir)/src/api/libslurm.la diff --git a/src/plugins/select/bluegene/bluegene.c b/src/plugins/select/bluegene/bluegene.c index 705e7e73c40..f627f9fb3bf 100644 --- a/src/plugins/select/bluegene/bluegene.c +++ b/src/plugins/select/bluegene/bluegene.c @@ -1482,8 +1482,6 @@ static void _process_nodes(bgl_record_t *bgl_record) int j=0, number; int start[PA_SYSTEM_DIMENSIONS]; int end[PA_SYSTEM_DIMENSIONS]; - char buffer[BUFSIZE]; - int funky=0; ListIterator itr; pa_node_t* pa_node = NULL; diff --git a/src/plugins/select/bluegene/partition_sys.c b/src/plugins/select/bluegene/partition_sys.c index d92826568d2..1e7a9afa37a 100755 --- a/src/plugins/select/bluegene/partition_sys.c +++ b/src/plugins/select/bluegene/partition_sys.c @@ -409,10 +409,13 @@ int read_bgl_partitions() slurm_user_name); } else { user_name = NULL; - if ((rc = rm_get_data(part_ptr, RM_PartitionFirstUser, - &user_name)) != STATUS_OK) { - error("rm_get_data(RM_PartitionFirstUser): %s", - bgl_err_str(rc)); + if ((rc = rm_get_data(part_ptr, + RM_PartitionFirstUser, + &user_name)) + != STATUS_OK) { + error("rm_get_data" + "(RM_PartitionFirstUser): %s", + bgl_err_str(rc)); } if(!user_name) { error("No user name was " diff --git a/src/slurmd/req.c b/src/slurmd/req.c index 64588ea92eb..78a4963f442 100644 --- a/src/slurmd/req.c +++ b/src/slurmd/req.c @@ -223,12 +223,17 @@ _fork_new_slurmd(void) * to return until signaled by grandchild process that * slurmd job manager has been successfully created. 
*/ - if (pipe(fds) < 0) + if (pipe(fds) < 0) { error("fork_slurmd: pipe: %m"); + return -1; + } - if ((pid = fork()) < 0) + if ((pid = fork()) < 0) { error("fork_slurmd: fork: %m"); - else if (pid > 0) { + close(fds[0]); + close(fds[1]); + return -1; + } else if (pid > 0) { if ((fds[1] >= 0) && (close(fds[1]) < 0)) error("Unable to close write-pipe in parent: %m"); @@ -817,6 +822,8 @@ _rpc_kill_tasks(slurm_msg_t *msg, slurm_addr *cli_addr) goto done; } +#if 0 + /* This code was used in an investigation of hung TotalView proceses */ if ((req->signal == SIGKILL) || (req->signal == SIGINT)) { /* for proctrack/linuxproc */ /* @@ -826,7 +833,9 @@ _rpc_kill_tasks(slurm_msg_t *msg, slurm_addr *cli_addr) slurm_container_signal(step->cont_id, SIGCONT); if (slurm_container_signal(step->cont_id, req->signal) < 0) rc = errno; - } else if (req->signal == 0) { + } else +#endif + if (req->signal == 0) { if (slurm_container_signal(step->cont_id, req->signal) < 0) rc = errno; /* SIGMIGRATE and SIGSOUND are used to initiate job checkpoint on AIX. 
diff --git a/src/smap/job_functions.c b/src/smap/job_functions.c index f9f24660e22..b0ba00e9d44 100644 --- a/src/smap/job_functions.c +++ b/src/smap/job_functions.c @@ -173,7 +173,7 @@ static void _print_header_job(void) pa_system_ptr->xcord += 3; mvwprintw(pa_system_ptr->text_win, pa_system_ptr->ycord, pa_system_ptr->xcord, "JOBID"); - pa_system_ptr->xcord += 6; + pa_system_ptr->xcord += 7; mvwprintw(pa_system_ptr->text_win, pa_system_ptr->ycord, pa_system_ptr->xcord, "PARTITION"); pa_system_ptr->xcord += 10; @@ -232,7 +232,7 @@ static int _print_text_job(job_info_t * job_ptr) pa_system_ptr->xcord += 3; mvwprintw(pa_system_ptr->text_win, pa_system_ptr->ycord, pa_system_ptr->xcord, "%d", job_ptr->job_id); - pa_system_ptr->xcord += 6; + pa_system_ptr->xcord += 7; mvwprintw(pa_system_ptr->text_win, pa_system_ptr->ycord, pa_system_ptr->xcord, "%.10s", job_ptr->partition); pa_system_ptr->xcord += 10; diff --git a/src/smap/partition_functions.c b/src/smap/partition_functions.c index 35f4a2a70f4..279394999aa 100644 --- a/src/smap/partition_functions.c +++ b/src/smap/partition_functions.c @@ -66,10 +66,10 @@ static int _set_start_finish(db2_block_info_t *db2_info_ptr); static void _block_list_del(void *object); static void _nodelist_del(void *object); static int _list_match_all(void *object, void *key); -static int _in_slurm_partition(db2_block_info_t *db2_info_ptr, - int *first, - int *last); +static int _in_slurm_partition(List slurm_nodes, List bgl_nodes); static int _print_rest(db2_block_info_t *block_ptr); +static int _addto_node_list(List nodelist, int *start, int *end); +static int _make_nodelist(char *nodes, List nodelist); #endif extern void get_slurm_part() @@ -166,7 +166,8 @@ extern void get_bgl_part() int number, start[PA_SYSTEM_DIMENSIONS], end[PA_SYSTEM_DIMENSIONS]; db2_block_info_t *block_ptr = NULL; ListIterator itr; - + List nodelist = NULL; + if (part_info_ptr) { error_code = slurm_load_partitions(part_info_ptr->last_update, &new_part_ptr, SHOW_ALL); @@ 
-247,6 +248,9 @@ extern void get_bgl_part() = xstrdup(new_bgl_ptr->bgl_info_array[i].bgl_part_id); block_ptr->nodes = xstrdup(new_bgl_ptr->bgl_info_array[i].nodes); + block_ptr->nodelist = list_create(_nodelist_del); + _make_nodelist(block_ptr->nodes,block_ptr->nodelist); + block_ptr->bgl_user_name = xstrdup(new_bgl_ptr->bgl_info_array[i].owner_name); block_ptr->state @@ -273,40 +277,22 @@ extern void get_bgl_part() if (!part.nodes || (part.nodes[0] == '\0')) continue; /* empty partition */ - while (part.nodes[j] != '\0') { - if ((part.nodes[j] == '[') - && (part.nodes[j+8] == ']') - && ((part.nodes[j+4] == 'x') - || (part.nodes[j+4] == '-'))) { - j++; - number = atoi(part.nodes + j); - start[X] = number / 100; - start[Y] = (number % 100) / 10; - start[Z] = (number % 10); - j += 4; - - number = atoi(part.nodes + j); - end[X] = number / 100; - end[Y] = (number % 100) / 10; - end[Z] = (number % 10); - break; - } - j++; - } + nodelist = list_create(_nodelist_del); + _make_nodelist(part.nodes,nodelist); if (block_list) { itr = list_iterator_create(block_list); while ((block_ptr = (db2_block_info_t*) list_next(itr)) != NULL) { - if(_in_slurm_partition(block_ptr, - start, - end)) { + if(_in_slurm_partition(nodelist, + block_ptr->nodelist)) { block_ptr->slurm_part_name = xstrdup(part.name); } } list_iterator_destroy(itr); } + list_destroy(nodelist); } /* Report the BGL Blocks */ @@ -730,6 +716,8 @@ static void _block_list_del(void *object) static void _nodelist_del(void *object) { + int *coord = (int *)object; + xfree(coord); return; } @@ -783,17 +771,39 @@ static int _set_start_finish(db2_block_info_t *db2_info_ptr) return 1; } -static int _in_slurm_partition(db2_block_info_t *db2_info_ptr, - int *first, int *last) +static int _in_slurm_partition(List slurm_nodes, List bgl_nodes) { - if((db2_info_ptr->start[X]>=first[X]) - && (db2_info_ptr->start[Y]>=first[Y]) - && (db2_info_ptr->start[Z]>=first[Z]) - && (db2_info_ptr->end[X]<=last[X]) - && 
(db2_info_ptr->end[Y]<=last[Y]) - && (db2_info_ptr->end[Z]<=last[Z])) + ListIterator slurm_itr; + ListIterator bgl_itr; + int *coord = NULL; + int *slurm_coord = NULL; + int found = 0; + + bgl_itr = list_iterator_create(bgl_nodes); + slurm_itr = list_iterator_create(slurm_nodes); + while ((coord = list_next(bgl_itr)) != NULL) { + list_iterator_reset(slurm_itr); + found = 0; + while ((slurm_coord = list_next(slurm_itr)) != NULL) { + if((coord[X] == slurm_coord[X]) + && (coord[Y] == slurm_coord[Y]) + && (coord[Z] == slurm_coord[Z])) { + found=1; + break; + } + + + } + if(!found) { + break; + } + } + list_iterator_destroy(slurm_itr); + list_iterator_destroy(bgl_itr); + + if(found) return 1; - else + else return 0; } @@ -823,6 +833,81 @@ static int _print_rest(db2_block_info_t *block_ptr) return SLURM_SUCCESS; } + +static int _addto_nodelist(List nodelist, int *start, int *end) +{ + int *coord = NULL; + int x,y,z; + + assert(end[X] < DIM_SIZE[X]); + assert(start[X] >= 0); + assert(end[Y] < DIM_SIZE[Y]); + assert(start[Y] >= 0); + assert(end[Z] < DIM_SIZE[Z]); + assert(start[Z] >= 0); + + for (x = start[X]; x <= end[X]; x++) { + for (y = start[Y]; y <= end[Y]; y++) { + for (z = start[Z]; z <= end[Z]; z++) { + coord = xmalloc(sizeof(int)*3); + coord[X] = x; + coord[Y] = y; + coord[Z] = z; + list_append(nodelist, coord); + } + } + } + return 1; +} + +static int _make_nodelist(char *nodes, List nodelist) +{ + int j = 0; + int number; + int start[PA_SYSTEM_DIMENSIONS]; + int end[PA_SYSTEM_DIMENSIONS]; + + if(!nodelist) + nodelist = list_create(_nodelist_del); + while (nodes[j] != '\0') { + if ((nodes[j] == '[' + || nodes[j] == ',') + && (nodes[j+8] == ']' + || nodes[j+8] == ',') + && (nodes[j+4] == 'x' + || nodes[j+4] == '-')) { + j++; + number = atoi(nodes + j); + start[X] = number / 100; + start[Y] = (number % 100) / 10; + start[Z] = (number % 10); + j += 4; + number = atoi(nodes + j); + end[X] = number / 100; + end[Y] = (number % 100) / 10; + end[Z] = (number % 10); + j 
+= 3; + _addto_nodelist(nodelist, start, end); + if(nodes[j] != ',') + break; + j--; + } else if((nodes[j] < 58 + && nodes[j] > 47)) { + + number = atoi(nodes + j); + start[X] = number / 100; + start[Y] = (number % 100) / 10; + start[Z] = (number % 10); + j+=3; + _addto_nodelist(nodelist, start, start); + if(nodes[j] != ',') + break; + } + j++; + } + return 1; +} + #endif static char* _convert_conn_type(enum connection_type conn_type) diff --git a/src/srun/launch.c b/src/srun/launch.c index 99f55c8310e..6cadabc8f38 100644 --- a/src/srun/launch.c +++ b/src/srun/launch.c @@ -310,6 +310,9 @@ static void _p_launch(slurm_msg_t *req, srun_job_t *job) continue; } + if (job->state > SRUN_JOB_LAUNCHING) + break; + pthread_mutex_lock(&active_mutex); while (active >= opt.max_threads || rc < 0) rc = _wait_on_active(thd, job); @@ -318,9 +321,6 @@ static void _p_launch(slurm_msg_t *req, srun_job_t *job) active++; pthread_mutex_unlock(&active_mutex); - if (job->state > SRUN_JOB_LAUNCHING) - break; - thd[i].task.req = &req[i]; thd[i].task.job = job; diff --git a/src/srun/msg.c b/src/srun/msg.c index c0f6cab7ef0..6ff00ed8c89 100644 --- a/src/srun/msg.c +++ b/src/srun/msg.c @@ -966,7 +966,9 @@ par_thr(void *arg) return (void *)1; } -int +/* NOTE: call this before creating any pthreads to avoid having forked process + * hang on localtime_t() mutex locked in parent processes pthread */ +extern int msg_thr_create(srun_job_t *job) { int i; @@ -990,18 +992,30 @@ msg_thr_create(srun_job_t *job) job->jaddr[i]).sin_port)); } - if (pipe(job->forked_msg->par_msg->msg_pipe) == -1) - return SLURM_ERROR; // there was an error - if (pipe(job->forked_msg->msg_par->msg_pipe) == -1) - return SLURM_ERROR; // there was an error + if (pipe(job->forked_msg->par_msg->msg_pipe) == -1) { + error("pipe(): %m"); + return SLURM_ERROR; + } + if (pipe(job->forked_msg->msg_par->msg_pipe) == -1) { + error("pipe(): %m"); + return SLURM_ERROR; + } debug2("created the pipes for communication"); - 
if((job->forked_msg->par_msg->pid = fork()) == -1) - return SLURM_ERROR; // there was an error - else if (job->forked_msg->par_msg->pid == 0) - { // child: -#ifdef DISABLE_LOCALTIME - disable_localtime(); -#endif + + /* retry fork for super-heavily loaded systems */ + for (i = 0; ; i++) { + if((job->forked_msg->par_msg->pid = fork()) != -1) + break; + if (i < 3) + usleep(1000); + else { + error("fork(): %m"); + return SLURM_ERROR; + } + } + + if (job->forked_msg->par_msg->pid == 0) { + /* child */ setsid(); message_thread = 1; close(job->forked_msg-> @@ -1027,10 +1041,9 @@ msg_thr_create(srun_job_t *job) xfree(job->forked_msg->msg_par); xfree(job->forked_msg); _exit(0); - } - else - { // parent: - + } else { + /* parent */ + slurm_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); if ((errno = pthread_create(&job->jtid, &attr, &par_thr, -- GitLab