From 7ed68bc9bfb3372f7f7bed97c2425387cb711f4c Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Fri, 1 Aug 2008 19:13:08 +0000
Subject: [PATCH] add hostlist to task exit messages

---
 src/srun/srun.c | 81 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 75 insertions(+), 6 deletions(-)

diff --git a/src/srun/srun.c b/src/srun/srun.c
index 16a518d63f1..2f1cb2c2723 100644
--- a/src/srun/srun.c
+++ b/src/srun/srun.c
@@ -72,6 +72,7 @@
 
 
 #include "src/common/fd.h"
+#include "src/common/hostlist.h"
 #include "src/common/log.h"
 #include "src/common/slurm_protocol_api.h"
 #include "src/common/switch.h"
@@ -916,11 +917,72 @@ _handle_max_wait(int signo)
 	_terminate_job_step(job->step_ctx);
 }
 
+/*
+ * _taskids_to_nodelist - translate a bitmap of task ids into a hostlist
+ *	expression naming the nodes those tasks were assigned to
+ * IN tasks_exited - bitmap of task ids; bits 0 to job->ntasks-1 are tested
+ * RET xmalloc'ed string that the caller must xfree(), or "Unknown" (also
+ *	xmalloc'ed) if the job has no step context
+ */
+static char *
+_taskids_to_nodelist(bitstr_t *tasks_exited)
+{
+	int i, hostid;
+	bitstr_t *nodes_exited = NULL;
+	char *hostname, *hostlist_str;
+	hostlist_t hostlist;
+	job_step_create_response_msg_t *step_resp;
+	slurm_step_layout_t *step_layout;
+
+	if (!job->step_ctx) {
+		error("No step_ctx");
+		hostlist_str = xstrdup("Unknown");
+		return hostlist_str;
+	}
+
+	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_RESP, &step_resp);
+	step_layout = step_resp->step_layout;
+
+	/*
+	 * nodes_exited is indexed by host id and records which nodes are
+	 * already in the hostlist, so a node running several of the exited
+	 * tasks is reported once. It must never be confused with
+	 * tasks_exited, which is indexed by task id.
+	 */
+	nodes_exited = bit_alloc(job->nhosts);
+	hostlist = hostlist_create(NULL);
+	for (i=0; i<job->ntasks; i++) {
+		if (!bit_test(tasks_exited, i))
+			continue;
+		hostid = slurm_step_layout_host_id(step_layout, i);
+		if ((hostid < 0) || (hostid >= job->nhosts))
+			continue;	/* task not mapped to a known node */
+		if (bit_test(nodes_exited, hostid))
+			continue;	/* node already listed */
+		bit_set(nodes_exited, hostid);
+		/*
+		 * NOTE(review): assumes slurm_step_layout_host_name() is
+		 * keyed by task id, like slurm_step_layout_host_id(), and
+		 * that hostlist_push() copies the name - confirm, and free
+		 * hostname here if the layout API returns an allocated copy.
+		 */
+		hostname = slurm_step_layout_host_name(step_layout, i);
+		if (hostname)
+			hostlist_push(hostlist, hostname);
+	}
+
+	hostlist_str = xmalloc(2048);
+	hostlist_ranged_string(hostlist, 2048, hostlist_str);
+	hostlist_destroy(hostlist);
+	bit_free(nodes_exited);
+	return hostlist_str;
+}
+
 static void
 _task_finish(task_exit_msg_t *msg)
 {
 	bitstr_t *tasks_exited = NULL;
-	char buf[2048], *core_str = "", *msg_str;
+	char buf[2048], *core_str = "", *msg_str, *node_list = NULL;
 	static bool first_done = true;
 	static bool first_error = true;
 	int rc = 0;
@@ -936,7 +998,9 @@ _task_finish(task_exit_msg_t *msg)
 		rc = WEXITSTATUS(msg->return_code);
 		if (rc != 0) {
 			bit_or(task_state.finish_abnormal, tasks_exited);
-			error("task %s: Exited with exit code %d", buf, rc);
+			node_list = _taskids_to_nodelist(tasks_exited);
+			error("%s: task %s: Exited with exit code %d",
+			      node_list, buf, rc);
 		} else {
 			bit_or(task_state.finish_normal, tasks_exited);
 			verbose("task %s: Completed", buf);
@@ -949,11 +1013,16 @@ _task_finish(task_exit_msg_t *msg)
 		if (WCOREDUMP(msg->return_code))
 			core_str = " (core dumped)";
 #endif
-		if (job->state >= SRUN_JOB_CANCELLED)
-			verbose("task %s: %s%s", buf, msg_str, core_str);
-		else
-			error("task %s: %s%s", buf, msg_str, core_str);
+		node_list = _taskids_to_nodelist(tasks_exited);
+		if (job->state >= SRUN_JOB_CANCELLED) {
+			verbose("%s: task %s: %s%s",
+				node_list, buf, msg_str, core_str);
+		} else {
+			error("%s: task %s: %s%s",
+			      node_list, buf, msg_str, core_str);
+		}
 	}
+	xfree(node_list);
 	bit_free(tasks_exited);
 
 	global_rc = MAX(global_rc, rc);
-- 
GitLab