Skip to content
Snippets Groups Projects
Commit f091b17c authored by Moe Jette
Browse files
parent 0c928735
No related branches found
No related tags found
No related merge requests found
...@@ -304,6 +304,8 @@ documents those changes that are of interest to users and admins. ...@@ -304,6 +304,8 @@ documents those changes that are of interest to users and admins.
* Changes in SLURM 1.1.36 * Changes in SLURM 1.1.36
========================= =========================
- Permit node state specification of DRAIN in slurm.conf. - Permit node state specification of DRAIN in slurm.conf.
- In jobcomp/script - fix bug that prevented UID and JOBID environment
variables from being set.
* Changes in SLURM 1.1.35 * Changes in SLURM 1.1.35
========================= =========================
......
...@@ -650,39 +650,61 @@ mvapich_print_abort_message (mvapich_state_t *st, int rank, ...@@ -650,39 +650,61 @@ mvapich_print_abort_message (mvapich_state_t *st, int rank,
{ {
slurm_step_layout_t *sl = st->job->step_layout; slurm_step_layout_t *sl = st->job->step_layout;
char *host; char *host;
char *msgstr;
if (!mvapich_abort_sends_rank (st)) { if (!mvapich_abort_sends_rank (st)) {
info ("mvapich: Received ABORT message from an MPI process."); info ("mvapich: Received ABORT message from an MPI process.");
return; return;
} }
if (msg && (msglen > 0)) {
/*
* Remove trailing newline if it exists (syslog will add newline)
*/
if (msg [msglen - 1] == '\n')
msg [msglen - 1] = '\0';
msgstr = msg;
}
else {
msgstr = "";
msglen = 0;
}
host = slurm_step_layout_host_name( host = slurm_step_layout_host_name(
sl, slurm_step_layout_host_id(sl, rank)); sl, slurm_step_layout_host_id(sl, rank));
if (dest >= 0) { if (dest >= 0) {
const char *dsthost = slurm_step_layout_host_name (sl, dest); const char *dsthost = slurm_step_layout_host_name (sl, dest);
if (msg [msglen - 1] == '\n')
msg [msglen - 1] = '\0';
info ("mvapich: %M: ABORT from MPI rank %d [on %s] dest rank %d [on %s]", info ("mvapich: %M: ABORT from MPI rank %d [on %s] dest rank %d [on %s]",
rank, host, dest, dsthost); rank, host, dest, dsthost);
/* /*
* If we got a message from MVAPICH, log it to syslog * Log the abort event to syslog
* so that system administrators know about possible HW events. * so that system administrators know about possible HW events.
*/ */
if (msglen > 0) { openlog ("srun", 0, LOG_USER);
openlog ("srun", 0, LOG_USER); syslog (LOG_WARNING,
syslog (LOG_WARNING, "MVAPICH ABORT [jobid=%u.%u src=%d(%s) dst=%d(%s)]: %s",
"MVAPICH ABORT [jobid=%u.%u src=%d(%s) dst=%d(%s)]: %s", st->job->jobid, st->job->stepid,
st->job->jobid, st->job->stepid, rank, host, dest, dsthost, msg); rank, host, dest, dsthost, msgstr);
closelog(); closelog();
}
} }
else { else {
info ("mvapich: %M: ABORT from MPI rank %d [on %s]", info ("mvapich: %M: ABORT from MPI rank %d [on %s]",
rank, host); rank, host);
/*
* Log the abort event to syslog
* so that system administrators know about possible HW events.
*/
openlog ("srun", 0, LOG_USER);
syslog (LOG_WARNING,
"MVAPICH ABORT [jobid=%u.%u src=%d(%s) dst=-1()]: %s",
st->job->jobid, st->job->stepid,
rank, host, msgstr);
closelog();
} }
return; return;
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment