Skip to content
Snippets Groups Projects
Commit feab26f8 authored by Moe Jette
Browse files
parent dfc039c5
No related branches found
No related tags found
No related merge requests found
......@@ -214,6 +214,8 @@ documents those changes that are of interest to users and admins.
* Changes in SLURM 1.1.26
=========================
- In sched/wiki2, fixes for support of job features.
- In sched/wiki2, add "FLAGS=INTERACTIVE;" to GETJOBS response for
non-batch (not srun --batch) jobs.
* Changes in SLURM 1.1.25
=========================
......
......@@ -416,8 +416,10 @@ static void mvapich_barrier (void)
}
static void
mvapich_print_abort_message (slurm_step_layout_t *sl, int rank, int dest)
mvapich_print_abort_message (srun_job_t *job, int rank, int dest,
char *msg, int msglen)
{
slurm_step_layout_t *sl = job->step_layout;
char *host;
if (!mvapich_abort_sends_rank ()) {
......@@ -429,9 +431,25 @@ mvapich_print_abort_message (slurm_step_layout_t *sl, int rank, int dest)
sl, slurm_step_layout_host_id(sl, rank));
if (dest >= 0) {
const char *dsthost = step_layout_host_name (sl, dest);
if (msg [msglen - 1] == '\n')
msg [msglen - 1] = '\0';
info ("mvapich: %M: ABORT from MPI rank %d [on %s] dest rank %d [on %s]",
rank, host, dest,
slurm_step_layout_host_name (sl, dest));
rank, host, dest, dsthost);
/*
* If we got a message from MVAPICH, log it to syslog
* so that system administrators know about possible HW events.
*/
if (msglen > 0) {
openlog ("srun", 0, LOG_USER);
syslog (LOG_WARNING,
"MVAPICH ABORT [jobid=%u.%u src=%d(%s) dst=%d(%s)]: %s",
job->jobid, job->stepid, rank, host, dest, dsthost, msg);
closelog();
}
}
else {
info ("mvapich: %M: ABORT from MPI rank %d [on %s]",
......@@ -443,9 +461,11 @@ mvapich_print_abort_message (slurm_step_layout_t *sl, int rank, int dest)
static void mvapich_wait_for_abort(srun_job_t *job)
{
int rlen;
char rbuf[1024];
int *p = (int *) rbuf;
int src, dst;
int ranks[2];
int n;
char msg [1024] = "";
int msglen = 0;
/*
* Wait for abort notification from any process.
......@@ -464,17 +484,32 @@ static void mvapich_wait_for_abort(srun_job_t *job)
fd_set_blocking (newfd);
if ((rlen = fd_read_n (newfd, rbuf, sizeof (rbuf))) < 0) {
error("MPI recv (abort-wait) returned %d", rlen);
close(newfd);
ranks[1] = -1;
if ((n = fd_read_n (newfd, &ranks, sizeof (ranks))) < 0) {
error("mvapich: MPI recv (abort-wait) failed");
close (newfd);
continue;
}
/*
* If we read both src/dest rank, then also try to
* read an error message. If this fails, msglen will
* stay zero and no message will be printed.
*/
if (ranks[1] >= 0) {
dst = ranks[0];
src = ranks[1];
fd_read_n (newfd, &msglen, sizeof (int));
if (msglen)
fd_read_n (newfd, msg, msglen);
} else {
src = ranks[0];
dst = -1;
}
close(newfd);
if (rlen > sizeof (int))
mvapich_print_abort_message (job->step_layout, p[1], p[0]);
else
mvapich_print_abort_message (job->step_layout, p[0], -1);
mvapich_print_abort_message (job, src, dst, msg, msglen);
fwd_signal(job, SIGKILL, opt.max_threads);
}
......@@ -624,6 +659,9 @@ again:
while (i < nprocs) {
int fd;
mvapich_debug ("Waiting to accept remote connection %d of %d\n",
i, nprocs);
if ((fd = mvapich_get_next_connection (mvapich_fd)) < 0) {
error ("mvapich: accept: %m");
goto fail;
......
......@@ -79,6 +79,7 @@ static uint32_t _get_job_time_limit(struct job_record *job_ptr);
* [TASKLIST=<node1:node2>;] nodes in use, if running or completing
* [REJMESSAGE=<str>;] reason job is not running, if any
* UPDATETIME=<uts>; time last active
* [FLAGS=INTERACTIVE;] set if interactive (not batch) job
* WCLIMIT=<secs>; wall clock time limit, seconds
* TASKS=<cpus>; CPUs required
* QUEUETIME=<uts>; submission time
......@@ -246,6 +247,9 @@ static char * _dump_job(struct job_record *job_ptr, int state_info)
xstrcat(buf, tmp);
}
if (job_ptr->batch_flag == 0)
xstrcat(buf, "FLAGS=INTERACTIVE;");
snprintf(tmp, sizeof(tmp),
"UPDATETIME=%u;WCLIMIT=%u;",
(uint32_t) job_ptr->time_last_active,
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment