From eeb9705024fbdab4fe109f4be3d55105a4f21235 Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Thu, 16 Oct 2014 10:36:08 -0700
Subject: [PATCH] Cray PMI refinements

Refine commit 5f89223fe8457795bb2e66caebe39e5f90584e61 based upon
feedback from David Gloe:
* It's not only MPI jobs, but anything that uses PMI. That includes MPI,
  shmem, etc, so you may want to reword the error message.
* I added the terminated flag because if multiple tasks on a node exit,
  you would get an error message from each of them. That reduces it to
  one error message per node. Cray bug 810310 prompted that change.
* Since we're now relying on --kill-on-bad-exit, I think we should update
  the Cray slurm.conf template to default to 1 (set KillOnBadExit=1 in
  contribs/cray/slurm.conf.template).

bug 1171
---
 contribs/cray/slurm.conf.template |  2 +-
 src/plugins/task/cray/task_cray.c | 16 ++++++++++++++--
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/contribs/cray/slurm.conf.template b/contribs/cray/slurm.conf.template
index 7e42c302d66..43ff90c9df4 100644
--- a/contribs/cray/slurm.conf.template
+++ b/contribs/cray/slurm.conf.template
@@ -32,7 +32,7 @@ JobContainerType=job_container/cncu
 #JobFileAppend=0
 #JobRequeue=1
 #JobSubmitPlugins=1
-#KillOnBadExit=0
+#KillOnBadExit=1
 #LaunchType=launch/slurm
 #Licenses=foo*4,bar
 #MailProg=/bin/mail
diff --git a/src/plugins/task/cray/task_cray.c b/src/plugins/task/cray/task_cray.c
index 519720a6aaa..c201a885c1d 100644
--- a/src/plugins/task/cray/task_cray.c
+++ b/src/plugins/task/cray/task_cray.c
@@ -120,6 +120,8 @@ unsigned int numa_bitmask_weight(const struct bitmask *bmp);
 static int _get_numa_nodes(char *path, int *cnt, int **numa_array);
 static int _get_cpu_masks(int num_numa_nodes, int32_t *numa_array,
 			  cpu_set_t **cpuMasks);
+
+static int terminated = 0;
 #endif
 
 /*
@@ -386,8 +388,18 @@ extern int task_p_post_term (stepd_step_rec_t *job,
 	}
 
 	// Cancel the job step, since we didn't find the mpi_fini msg
-	error("step %u.%u task %u exited without calling mpi_fini()",
-	      job->jobid, job->stepid, task->gtid);
+	// srun only gets the error() messages by default, send one
+	// per compute node, but log all other events with info().
+	if (terminated) {
+		info("step %u.%u task %u exited without calling "
+		     "PMI_Finalize()",
+		     job->jobid, job->stepid, task->gtid);
+	} else {
+		error("step %u.%u task %u exited without calling "
+		      "PMI_Finalize()",
+		      job->jobid, job->stepid, task->gtid);
+		terminated = 1;
+	}
 	info("reset estatus from %d to %d", task->estatus, SIGKILL);
 	task->estatus = SIGKILL;
 }
-- 
GitLab