From f091b17c502511da7e93f0af21806b068319be07 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Wed, 16 May 2007 20:22:17 +0000 Subject: [PATCH] svn merge -r11442:11518 https://eris.llnl.gov/svn/slurm/branches/slurm-1.1 --- NEWS | 2 ++ src/plugins/mpi/mvapich/mvapich.c | 44 +++++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/NEWS b/NEWS index 5f1ce9f2f03..da3270d32e1 100644 --- a/NEWS +++ b/NEWS @@ -304,6 +304,8 @@ documents those changes that are of interest to users and admins. * Changes in SLURM 1.1.36 ========================= - Permit node state specification of DRAIN in slurm.conf. + - In jobcomp/script - fix bug that prevented UID and JOBID environment + variables from being set. * Changes in SLURM 1.1.35 ========================= diff --git a/src/plugins/mpi/mvapich/mvapich.c b/src/plugins/mpi/mvapich/mvapich.c index dc2637f45b5..3d0e4325605 100644 --- a/src/plugins/mpi/mvapich/mvapich.c +++ b/src/plugins/mpi/mvapich/mvapich.c @@ -650,39 +650,61 @@ mvapich_print_abort_message (mvapich_state_t *st, int rank, { slurm_step_layout_t *sl = st->job->step_layout; char *host; + char *msgstr; if (!mvapich_abort_sends_rank (st)) { info ("mvapich: Received ABORT message from an MPI process."); return; } + if (msg && (msglen > 0)) { + /* + * Remove trailing newline if it exists (syslog will add newline) + */ + if (msg [msglen - 1] == '\n') + msg [msglen - 1] = '\0'; + + msgstr = msg; + } + else { + msgstr = ""; + msglen = 0; + } + host = slurm_step_layout_host_name( sl, slurm_step_layout_host_id(sl, rank)); if (dest >= 0) { const char *dsthost = slurm_step_layout_host_name (sl, dest); - if (msg [msglen - 1] == '\n') - msg [msglen - 1] = '\0'; - info ("mvapich: %M: ABORT from MPI rank %d [on %s] dest rank %d [on %s]", rank, host, dest, dsthost); /* - * If we got a message from MVAPICH, log it to syslog + * Log the abort event to syslog * so that system administrators know about possible HW events. */ - if (msglen > 0) { - openlog ("srun", 0, LOG_USER); - syslog (LOG_WARNING, - "MVAPICH ABORT [jobid=%u.%u src=%d(%s) dst=%d(%s)]: %s", - st->job->jobid, st->job->stepid, rank, host, dest, dsthost, msg); - closelog(); - } + openlog ("srun", 0, LOG_USER); + syslog (LOG_WARNING, + "MVAPICH ABORT [jobid=%u.%u src=%d(%s) dst=%d(%s)]: %s", + st->job->jobid, st->job->stepid, + rank, host, dest, dsthost, msgstr); + closelog(); } else { info ("mvapich: %M: ABORT from MPI rank %d [on %s]", rank, host); + /* + * Log the abort event to syslog + * so that system administrators know about possible HW events. + */ + openlog ("srun", 0, LOG_USER); + syslog (LOG_WARNING, + "MVAPICH ABORT [jobid=%u.%u src=%d(%s) dst=-1()]: %s", + st->job->jobid, st->job->stepid, + rank, host, msgstr); + closelog(); + } return; } -- GitLab