diff --git a/NEWS b/NEWS index 970d14441a31419c7cd63ce51d9b89e7f0462908..f1add3d6be0d01f51aba7e474e985b9a139e8364 100644 --- a/NEWS +++ b/NEWS @@ -26,6 +26,15 @@ documents those changes that are of interest to users and administrators. -- Document which srun options apply to only job, only step, or job and step allocations. -- Use more compatible function to get thread name (>= 2.6.11). + -- Fix order of job then step id when noting cleaning flag being set. + -- Make it so the extern step sends a message with accounting information + back to the slurmctld. + -- Make it so the extern step calls the select_g_step_start|finish functions. + -- Don't print error when extern step is canceled because job is ending. + -- Handle a few error codes when dealing with the extern step to make sure + we have the pids added to the system correctly. + -- Add support for job dependencies with job array expressions. Previous logic + required listing each task of job array individually. * Changes in Slurm 16.05.1 ========================== diff --git a/doc/html/download.shtml b/doc/html/download.shtml index 03a5575cbb46ff6cc6367dd745ff207f7dada388..0feeb5d7aaab83eea4616afa2b1c58b42a974033 100644 --- a/doc/html/download.shtml +++ b/doc/html/download.shtml @@ -71,7 +71,7 @@ It may be included directly in a future release of Slurm.</li><br> <li><b>Debuggers</b> and debugging tools</li> <ul> -<li><a href="http://www.totalviewtech.com/"><b>TotalView</b></a> +<li><a href="http://www.roguewave.com/products-services/totalview"><b>TotalView</b></a> is a GUI-based source code debugger well suited for parallel applications.</li> <li><a href="http://padb.pittman.org.uk/"><b>Padb</b></a> is a job inspection tool for examining and debugging parallel programs, primarily it simplifies the process of gathering stack traces but also supports a wide range of other functions. 
diff --git a/doc/html/slurm_ug_cfp.shtml b/doc/html/slurm_ug_cfp.shtml index 00e04c7a739be1b31a10429371a446b69433582a..ebb925ca98f3c21b5b208825fc4e8dc4f166bbe7 100644 --- a/doc/html/slurm_ug_cfp.shtml +++ b/doc/html/slurm_ug_cfp.shtml @@ -10,7 +10,8 @@ or site report to be given at the Slurm User Group Meeting 2016. This event is sponsored and organized by <a href="http://www.grnet.gr/">Greek Research and Technology Network (GRNET)</a> and <a href="http://www.schedmd.com/">SchedMD</a>. -It will be held in Athens, Greece on 26-27 September 2016.</p> +It will be held at the Technopolis, 100 Pireos Street in Athens, Greece +on 26-27 September 2016.</p> <p>This international event is opened to everyone who wants to: <ul> @@ -32,6 +33,7 @@ or tutorial about Slurm is invited to send an abstract to </p> <p><b>Program Committee:</b><br> +Vangelis Floros (GRNET)<br> Yiannis Georgiou (Bull)<br> Brian Gilmer (Cray)<br> Matthieu Hautreux (CEA)<br> @@ -41,6 +43,6 @@ Morris Jette (SchedMD)</p> <p><a href="slurm_ug_registration.html">Registration information</a></p> --> -<p style="text-align:center;">Last modified 23 June 2016</p> +<p style="text-align:center;">Last modified 5 July 2016</p> <!--#include virtual="footer.txt"--> diff --git a/doc/man/man1/sinfo.1 b/doc/man/man1/sinfo.1 index c28c9cd8ea32658800d94c58cf3c2993cc260a5a..8186484bac8b5f58620836410cee9527843912ac 100644 --- a/doc/man/man1/sinfo.1 +++ b/doc/man/man1/sinfo.1 @@ -79,7 +79,7 @@ Don't convert units from their original type (e.g. 2048M won't be converted to .TP \fB\-N\fR, \fB\-\-Node\fR -Print information in a node\-oriented format. +Print information in a node\-oriented format with one line per node. The default is to print information in a partition\-oriented format. This is ignored if the \fB\-\-format\fR option is specified. 
diff --git a/src/plugins/select/cray/select_cray.c b/src/plugins/select/cray/select_cray.c index 1b776525ac6e0d97f36d5e059f0198457fbc80ce..8e12c230b27f807dbbe5e50af5635d9520c7e61a 100644 --- a/src/plugins/select/cray/select_cray.c +++ b/src/plugins/select/cray/select_cray.c @@ -2168,7 +2168,7 @@ extern int select_p_step_start(struct step_record *step_ptr) #endif jobinfo = step_ptr->job_ptr->select_jobinfo->data; - if (jobinfo->npc) { + if (jobinfo->npc && (step_ptr->step_id != SLURM_EXTERN_CONT)) { int i; select_jobinfo_t *step_jobinfo = step_ptr->select_jobinfo->data; select_nodeinfo_t *nodeinfo; diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index 15fba92ade076260cc4fc1102a056c2f34262f7e..3bd22c4832645c6fb390b78b2f2d04310fa84662 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -135,6 +135,7 @@ static pthread_mutex_t sched_mutex = PTHREAD_MUTEX_INITIALIZER; static int sched_pend_thread = 0; static bool sched_running = false; static struct timeval sched_last = {0, 0}; +static uint32_t max_array_size = NO_VAL; #ifdef HAVE_ALPS_CRAY static int sched_min_interval = 1000000; #else @@ -2634,6 +2635,75 @@ extern int test_job_dependency(struct job_record *job_ptr) return results; } +/* Given a new job dependency specification, expand job array specifications + * into a collection of task IDs that update_job_dependency can parse. + * (e.g. "after:123_[4-5]" to "after:123_4:123_5") + * Returns NULL if not valid job array specification. + * Returned value must be xfreed. 
*/ +static char *_xlate_array_dep(char *new_depend) +{ + char *new_array_dep = NULL, *array_tmp, *jobid_ptr = NULL, *sep; + bitstr_t *array_bitmap; + int i; + uint32_t job_id; + int32_t t, t_first, t_last; + + if (strstr(new_depend, "_[") == NULL) + return NULL; /* No job array expressions */ + + if (max_array_size == NO_VAL) { + slurm_ctl_conf_t *conf; + conf = slurm_conf_lock(); + max_array_size = conf->max_array_sz; + slurm_conf_unlock(); + } + + for (i = 0; new_depend[i]; i++) { + xstrfmtcat(new_array_dep, "%c", new_depend[i]); + if ((new_depend[i] >= '0') && (new_depend[i] <= '9')) { + if (jobid_ptr == NULL) + jobid_ptr = new_depend + i; + } else if ((new_depend[i] == '_') && (new_depend[i+1] == '[') && + (jobid_ptr != NULL)) { + job_id = (uint32_t) atol(jobid_ptr); + i += 2; /* Skip over "_[" */ + array_tmp = xstrdup(new_depend + i); + sep = strchr(array_tmp, ']'); + if (sep) + sep[0] = '\0'; + array_bitmap = bit_alloc(max_array_size); + if ((sep == NULL) || + (bit_unfmt(array_bitmap, array_tmp) != 0) || + ((t_first = bit_ffs(array_bitmap)) == -1)) { + /* Invalid format */ + xfree(array_tmp); + bit_free(array_bitmap); + xfree(new_array_dep); + return NULL; + } + i += (sep - array_tmp); /* Move to location of ']' */ + xfree(array_tmp); + t_last = bit_fls(array_bitmap); + for (t = t_first; t <= t_last; t++) { + if (!bit_test(array_bitmap, t)) + continue; + if (t == t_first) { + xstrfmtcat(new_array_dep, "%d", t); + } else { + xstrfmtcat(new_array_dep, ":%u_%d", + job_id, t); + } + } + bit_free(array_bitmap); + jobid_ptr = NULL; + } else { + jobid_ptr = NULL; + } + } + + return new_array_dep; +} + /* * Parse a job dependency string and use it to establish a "depend_spec" * list of dependencies. 
We accept both old format (a single job ID) and @@ -2648,7 +2718,7 @@ extern int update_job_dependency(struct job_record *job_ptr, char *new_depend) uint16_t depend_type = 0; uint32_t job_id = 0; uint32_t array_task_id; - char *tok = new_depend, *sep_ptr, *sep_ptr2 = NULL; + char *tok, *new_array_dep, *sep_ptr, *sep_ptr2 = NULL; List new_depend_list = NULL; struct depend_spec *dep_ptr; struct job_record *dep_job_ptr; @@ -2669,10 +2739,12 @@ extern int update_job_dependency(struct job_record *job_ptr, char *new_depend) } new_depend_list = list_create(_depend_list_del); - + if ((new_array_dep = _xlate_array_dep(new_depend))) + tok = new_array_dep; + else + tok = new_depend; /* validate new dependency string */ while (rc == SLURM_SUCCESS) { - /* test singleton dependency flag */ if ( strncasecmp(tok, "singleton", 9) == 0 ) { depend_type = SLURM_DEPEND_SINGLETON; @@ -2891,6 +2963,7 @@ extern int update_job_dependency(struct job_record *job_ptr, char *new_depend) } else { FREE_NULL_LIST(new_depend_list); } + xfree(new_array_dep); return rc; } diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index b51420fa4d8410d17538048bddd1df0a9ff33698..17bdfd15e402bc888cdfc348ae06762c9317de4f 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -2718,6 +2718,31 @@ static void _launch_prolog(struct job_record *job_ptr) agent_arg_ptr->msg_type = REQUEST_LAUNCH_PROLOG; agent_arg_ptr->msg_args = (void *) prolog_msg_ptr; + /* At least on a Cray we have to treat this as a real step, so + * this is where to do it. 
+ */ + if (slurmctld_conf.prolog_flags & PROLOG_FLAG_CONTAIN) { + struct step_record step_rec; + slurm_step_layout_t layout; + + memset(&step_rec, 0, sizeof(step_rec)); + memset(&layout, 0, sizeof(layout)); + +#ifdef HAVE_FRONT_END + layout.node_list = job_ptr->front_end_ptr->name; +#else + layout.node_list = job_ptr->nodes; +#endif + layout.node_cnt = agent_arg_ptr->node_count; + + step_rec.step_layout = &layout; + step_rec.step_id = SLURM_EXTERN_CONT; + step_rec.job_ptr = job_ptr; + step_rec.name = "external"; + + select_g_step_start(&step_rec); + } + /* Launch the RPC via agent */ agent_queue_request(agent_arg_ptr); } diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index cfd005ddeb66bacfd0af01a92d976f60e95e2417..6be281d5b941a1f75ca6ff9b3efb022b45e969de 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -739,7 +739,7 @@ int job_step_complete(uint32_t job_id, uint32_t step_id, uid_t uid, return ESLURM_INVALID_JOB_ID; if (step_ptr->step_id == SLURM_EXTERN_CONT) - return SLURM_SUCCESS; + return select_g_step_finish(step_ptr, true); /* If the job is already cleaning we have already been here * before, so just return. */ @@ -749,7 +749,7 @@ int job_step_complete(uint32_t job_id, uint32_t step_id, uid_t uid, if (cleaning) { /* Step hasn't finished cleanup yet. 
*/ debug("%s: Cleaning flag already set for " "job step %u.%u, no reason to cleanup again.", - __func__, step_ptr->step_id, step_ptr->job_ptr->job_id); + __func__, job_ptr->job_id, step_ptr->step_id); return SLURM_SUCCESS; } diff --git a/src/slurmd/common/proctrack.c b/src/slurmd/common/proctrack.c index 7e83cc2fdc09bc5e6010700161239b69b8a9adc6..327a0c86ea61bd1b2b6e6d709ee52f88dd065d34 100644 --- a/src/slurmd/common/proctrack.c +++ b/src/slurmd/common/proctrack.c @@ -178,9 +178,22 @@ extern int proctrack_g_create(stepd_step_rec_t * job) */ extern int proctrack_g_add(stepd_step_rec_t * job, pid_t pid) { + int i = 0, max_retry = 3, rc; + if (slurm_proctrack_init() < 0) return SLURM_ERROR; + /* Sometimes a plugin is transient in adding a pid, so lets + * try a few times before we call it quits. + */ + while ((rc = (*(ops.add)) (job, pid)) != SLURM_SUCCESS) { + if (i++ > max_retry) break; + debug("%s: %u.%u couldn't add pid %u, sleeping and trying again", + __func__, job->jobid, job->stepid, pid); + sleep(1); + } + - return (*(ops.add)) (job, pid); + return rc; } diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index c7eeb29a268034285df87de20ea75e6ee215a6dd..96be28da16dbf2e3c3f9c3d50d47e12d3cd0c8ea 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -974,6 +974,7 @@ static int _spawn_job_container(stepd_step_rec_t *job) jobacct_id_t jobacct_id; int status = 0; pid_t pid; + int rc = SLURM_SUCCESS; debug2("%s: Before call to spank_init()", __func__); if (spank_init(job) < 0) { @@ -1003,11 +1004,16 @@ static int _spawn_job_container(stepd_step_rec_t *job) } else if (pid < 0) { error("fork: %m"); _set_job_state(job, SLURMSTEPD_STEP_ENDING); - return SLURM_ERROR; + rc = SLURM_ERROR; + goto fail1; } job->pgid = pid; - proctrack_g_add(job, pid); + if ((rc = proctrack_g_add(job, pid)) != SLURM_SUCCESS) { + error("%s: Step %u.%u unable to add pid %d to the proctrack plugin", + __func__, job->jobid, job->stepid, pid); + goto fail1; + } 
jobacct_id.nodeid = job->nodeid; jobacct_id.taskid = job->nodeid; /* Treat node ID as global task ID */ @@ -1056,12 +1062,16 @@ static int _spawn_job_container(stepd_step_rec_t *job) * condition starting another job on these CPUs. */ while (_send_pending_exit_msgs(job)) {;} +fail1: debug2("%s: Before call to spank_fini()", __func__); if (spank_fini(job) < 0) error("spank_fini failed"); debug2("%s: After call to spank_fini()", __func__); - return SLURM_SUCCESS; + _set_job_state(job, SLURMSTEPD_STEP_ENDING); + _send_step_complete_msgs(job); + + return rc; } /* diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c index 147a6f0f57d169cbb2d4ccfbfda7cda865b0cb51..81dd954481ab0d58b2240ceb58831882400fe324 100644 --- a/src/slurmd/slurmstepd/req.c +++ b/src/slurmd/slurmstepd/req.c @@ -792,7 +792,8 @@ _handle_signal_container(int fd, stepd_step_rec_t *job, uid_t uid) ptr = getenvp(job->env, "SLURM_STEP_KILLED_MSG_NODE_ID"); if (ptr) target_node_id = atoi(ptr); - if ((job->nodeid == target_node_id) && (msg_sent == 0) && + if ((job->stepid != SLURM_EXTERN_CONT) && + (job->nodeid == target_node_id) && (msg_sent == 0) && (job->state < SLURMSTEPD_STEP_ENDING)) { time_t now = time(NULL); char entity[24], time_str[24]; @@ -1324,9 +1325,20 @@ static int _handle_add_extern_pid_internal(stepd_step_rec_t *job, pid_t pid) jobacct_id.nodeid = job->nodeid; jobacct_id.job = job; - proctrack_g_add(job, pid); - task_g_add_pid(pid); - jobacct_gather_add_task(pid, &jobacct_id, 1); + if (proctrack_g_add(job, pid) != SLURM_SUCCESS) { + error("%s: Job %u can't add pid %d to proctrack plugin in the extern_step.", __func__, job->jobid, pid); + return SLURM_FAILURE; + } + + if (task_g_add_pid(pid) != SLURM_SUCCESS) { + error("%s: Job %u can't add pid %d to task plugin in the extern_step.", __func__, job->jobid, pid); + return SLURM_FAILURE; + } + + if (jobacct_gather_add_task(pid, &jobacct_id, 1) != SLURM_SUCCESS) { + error("%s: Job %u can't add pid %d to jobacct_gather plugin in the extern_step.", __func__, job->jobid, pid); + return SLURM_FAILURE; + } /* spawn a thread that will wait on the pid given */ slurm_attr_init(&attr);