diff --git a/NEWS b/NEWS index 5f1ce9f2f03268366321f7eec30398548fb2c339..da3270d32e1a3d228d70e30bef6a124cc96fe00b 100644 --- a/NEWS +++ b/NEWS @@ -304,6 +304,8 @@ documents those changes that are of interest to users and admins. * Changes in SLURM 1.1.36 ========================= - Permit node state specification of DRAIN in slurm.conf. + - In jobcomp/script - fix bug that prevented UID and JOBID environment + variables from being set. * Changes in SLURM 1.1.35 ========================= diff --git a/src/plugins/mpi/mvapich/mvapich.c b/src/plugins/mpi/mvapich/mvapich.c index dc2637f45b57003a6dfb6a4d3fc342f1a8528dca..3d0e4325605b3115462e0a318572c21a52552f03 100644 --- a/src/plugins/mpi/mvapich/mvapich.c +++ b/src/plugins/mpi/mvapich/mvapich.c @@ -650,39 +650,61 @@ mvapich_print_abort_message (mvapich_state_t *st, int rank, { slurm_step_layout_t *sl = st->job->step_layout; char *host; + char *msgstr; if (!mvapich_abort_sends_rank (st)) { info ("mvapich: Received ABORT message from an MPI process."); return; } + if (msg && (msglen > 0)) { + /* + * Remove trailing newline if it exists (syslog will add newline) + */ + if (msg [msglen - 1] == '\n') + msg [msglen - 1] = '\0'; + + msgstr = msg; + } + else { + msgstr = ""; + msglen = 0; + } + host = slurm_step_layout_host_name( sl, slurm_step_layout_host_id(sl, rank)); if (dest >= 0) { const char *dsthost = slurm_step_layout_host_name (sl, dest); - if (msg [msglen - 1] == '\n') - msg [msglen - 1] = '\0'; - info ("mvapich: %M: ABORT from MPI rank %d [on %s] dest rank %d [on %s]", rank, host, dest, dsthost); /* - * If we got a message from MVAPICH, log it to syslog + * Log the abort event to syslog * so that system administrators know about possible HW events. */ - if (msglen > 0) { - openlog ("srun", 0, LOG_USER); - syslog (LOG_WARNING, - "MVAPICH ABORT [jobid=%u.%u src=%d(%s) dst=%d(%s)]: %s", - st->job->jobid, st->job->stepid, rank, host, dest, dsthost, msg); - closelog(); - } + openlog ("srun", 0, LOG_USER); + syslog (LOG_WARNING, + "MVAPICH ABORT [jobid=%u.%u src=%d(%s) dst=%d(%s)]: %s", + st->job->jobid, st->job->stepid, + rank, host, dest, dsthost, msgstr); + closelog(); } else { info ("mvapich: %M: ABORT from MPI rank %d [on %s]", rank, host); + /* + * Log the abort event to syslog + * so that system administrators know about possible HW events. + */ + openlog ("srun", 0, LOG_USER); + syslog (LOG_WARNING, + "MVAPICH ABORT [jobid=%u.%u src=%d(%s) dst=-1()]: %s", + st->job->jobid, st->job->stepid, + rank, host, msgstr); + closelog(); + } return; }