From c2e3713e765b66a9c0ae8a2fb4dc0e1e0f367050 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Fri, 1 Aug 2008 00:28:20 +0000 Subject: [PATCH] Report abormal task termination message (restored functionality present in slurm v1.2). --- NEWS | 2 + src/api/step_launch.c | 101 +++++++++++++++++++++++++++++++++++++++--- src/srun/srun.c | 10 ++--- 3 files changed, 103 insertions(+), 10 deletions(-) diff --git a/NEWS b/NEWS index 9fa39721878..30950a40648 100644 --- a/NEWS +++ b/NEWS @@ -12,6 +12,8 @@ documents those changes that are of interest to users and admins. -- Added Python module to process hostslists as used by SLURM. See contribs/python/hostlist. Supplied by Kent Engstrom, National Supercomputer Centre, Sweden. + -- Report abormal task termination message (restored functionality present + in slurm v1.2). * Changes in SLURM 1.3.6 ======================== diff --git a/src/api/step_launch.c b/src/api/step_launch.c index deea41826b0..7159dc12cb8 100644 --- a/src/api/step_launch.c +++ b/src/api/step_launch.c @@ -65,6 +65,12 @@ #include "src/api/step_ctx.h" #include "src/api/pmi_server.h" +#if defined (HAVE_DECL_STRSIGNAL) && !HAVE_DECL_STRSIGNAL +# ifndef strsignal + extern char *strsignal(int); +# endif +#endif /* defined HAVE_DECL_STRSIGNAL && !HAVE_DECL_STRSIGNAL */ + #define STEP_ABORT_TIME 2 extern char **environ; @@ -76,6 +82,9 @@ static int _launch_tasks(slurm_step_ctx_t *ctx, launch_tasks_request_msg_t *launch_msg, uint32_t timeout); static char *_lookup_cwd(void); +static void _print_exit_status(struct step_launch_state *sls, + task_exit_msg_t *exit_msg, + bitstr_t *tasks_exited); static void _print_launch_msg(launch_tasks_request_msg_t *msg, char *hostname, int nodeid); @@ -768,6 +777,7 @@ static void _exit_handler(struct step_launch_state *sls, slurm_msg_t *exit_msg) { task_exit_msg_t *msg = (task_exit_msg_t *) exit_msg->data; + bitstr_t *tasks_exited; int i; if ((msg->job_id != sls->mpi_info->jobid) || @@ -777,6 +787,11 @@ _exit_handler(struct step_launch_state *sls, slurm_msg_t *exit_msg) return; } + if (msg->num_tasks < 1) { + error("task_exit_msg has zero tasks"); + return; + } + /* Record SIGTERM and SIGKILL termination codes to * recognize abnormal termination */ if (WIFSIGNALED(msg->return_code)) { @@ -786,11 +801,12 @@ _exit_handler(struct step_launch_state *sls, slurm_msg_t *exit_msg) } pthread_mutex_lock(&sls->lock); - - for (i = 0; i < msg->num_tasks; i++) { - debug("task %u done", msg->task_id_list[i]); - bit_set(sls->tasks_exited, msg->task_id_list[i]); - } + tasks_exited = bit_alloc(sls->tasks_requested); + for (i = 0; i < msg->num_tasks; i++) + bit_set(tasks_exited, msg->task_id_list[i]); + _print_exit_status(sls, msg, tasks_exited); + bit_or(sls->tasks_exited, tasks_exited); + bit_free(tasks_exited); if (sls->callback.task_finish != NULL) (sls->callback.task_finish)(msg); @@ -799,6 +815,81 @@ _exit_handler(struct step_launch_state *sls, slurm_msg_t *exit_msg) pthread_mutex_unlock(&sls->lock); } +static void +_print_exit_status(struct step_launch_state *sls, task_exit_msg_t *exit_msg, + bitstr_t *tasks_exited) +{ + char buf[2048], term_msg[32], task_id[32]; + char *host_str, *core_str = "", *msg_str = NULL; + void (*print) (const char *, ...) = (void *) &error; + hostlist_t node_list, task_list; + int i, j; + + if (exit_msg->return_code == 0) { + if (exit_msg->num_tasks == 1) { + snprintf(buf, sizeof(buf), "%u", + exit_msg->task_id_list[0]); + verbose("task %s: Completed", buf); + } else { + bit_fmt(buf, sizeof(buf), tasks_exited); + verbose("task %s: Completed", buf); + } + return; + } + +#ifdef WCOREDUMP + if (WCOREDUMP(exit_msg->return_code)) + core_str = " (core dumped)"; +#endif + + + if (WIFSIGNALED(exit_msg->return_code)) { + /* + * Print message that task was signaled as verbose message + * not error message if the user generated the signal. + */ + if (sls->abort) + print = &verbose; + msg_str = strsignal(WTERMSIG(exit_msg->return_code)); + } else { + snprintf(term_msg, sizeof(term_msg), + "Exited with exit code %u", + WEXITSTATUS(exit_msg->return_code)); + msg_str = term_msg; + } + + /* We want to identify the nodes associated with the failure. + * The message may contain the response from several nodes, + * so we split out the results by node for this message */ + node_list = hostlist_create(sls->layout->node_list); + task_list = NULL; + if (node_list == NULL) + fatal("malloc failure"); + for (i=0; i<sls->layout->node_cnt; i++) { + host_str = hostlist_shift(node_list); + for (j=0; j<sls->layout->tasks[i]; j++) { + if (!bit_test(tasks_exited, + sls->layout->tids[i][j])) + continue; + if (task_list == NULL) + task_list = hostlist_create(NULL); + snprintf(task_id, sizeof(task_id), "%u", + sls->layout->tids[i][j]); + hostlist_push(task_list, task_id); + } + if (task_list) { + hostlist_ranged_string(task_list, sizeof(buf), buf); + hostlist_destroy(task_list); + task_list = NULL; + (*print) ("%s: task %s: %s%s", + host_str, buf, msg_str, core_str); + } + free(host_str); + } + hostlist_destroy(node_list); + return; +} + static void _job_complete_handler(struct step_launch_state *sls, slurm_msg_t *complete_msg) { diff --git a/src/srun/srun.c b/src/srun/srun.c index fae2b6e57fe..e5b20f3bd60 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -994,7 +994,7 @@ _task_state_struct_print(void) bit_copybits(tmp, task_state.finish_abnormal); bit_and(tmp, not_seen); bit_fmt(buf, BUFSIZ, tmp); - info("task%s: exited abnormally", buf); + info("task %s: exited abnormally", buf); bit_or(seen, tmp); bit_copybits(not_seen, seen); bit_not(not_seen); @@ -1004,7 +1004,7 @@ _task_state_struct_print(void) bit_copybits(tmp, task_state.finish_normal); bit_and(tmp, not_seen); bit_fmt(buf, BUFSIZ, tmp); - info("task%s: exited", buf); + info("task %s: exited", buf); bit_or(seen, tmp); bit_copybits(not_seen, seen); bit_not(not_seen); @@ -1014,7 +1014,7 @@ _task_state_struct_print(void) bit_copybits(tmp, task_state.start_failure); bit_and(tmp, not_seen); bit_fmt(buf, BUFSIZ, tmp); - info("task%s: failed to start", buf); + info("task %s: failed to start", buf); bit_or(seen, tmp); bit_copybits(not_seen, seen); bit_not(not_seen); @@ -1024,7 +1024,7 @@ _task_state_struct_print(void) bit_copybits(tmp, task_state.start_success); bit_and(tmp, not_seen); bit_fmt(buf, BUFSIZ, tmp); - info("task%s: running", buf); + info("task %s: running", buf); bit_or(seen, tmp); bit_copybits(not_seen, seen); bit_not(not_seen); @@ -1071,7 +1071,7 @@ static void _handle_intr() job->jobid, job->stepid); last_intr_sent = time(NULL); slurm_step_launch_fwd_signal(job->step_ctx, SIGINT); - + slurm_step_launch_abort(job->step_ctx); } else { job_force_termination(job); slurm_step_launch_abort(job->step_ctx); -- GitLab