diff --git a/contribs/cray/slurm.conf.template b/contribs/cray/slurm.conf.template index 7e42c302d6665e677198aa559fae36bdc3dd608a..43ff90c9df4055ba5bb2bc87d6db8ce3700223c8 100644 --- a/contribs/cray/slurm.conf.template +++ b/contribs/cray/slurm.conf.template @@ -32,7 +32,7 @@ JobContainerType=job_container/cncu #JobFileAppend=0 #JobRequeue=1 #JobSubmitPlugins=1 -#KillOnBadExit=0 +#KillOnBadExit=1 #LaunchType=launch/slurm #Licenses=foo*4,bar #MailProg=/bin/mail diff --git a/src/plugins/task/cray/task_cray.c b/src/plugins/task/cray/task_cray.c index 519720a6aaa0fac28f1e22713821385493bc77a0..c201a885c1d7c18fc4ad52ae9cd8b973893dc8f7 100644 --- a/src/plugins/task/cray/task_cray.c +++ b/src/plugins/task/cray/task_cray.c @@ -120,6 +120,8 @@ unsigned int numa_bitmask_weight(const struct bitmask *bmp); static int _get_numa_nodes(char *path, int *cnt, int **numa_array); static int _get_cpu_masks(int num_numa_nodes, int32_t *numa_array, cpu_set_t **cpuMasks); + +static int terminated = 0; #endif /* @@ -386,8 +388,18 @@ extern int task_p_post_term (stepd_step_rec_t *job, } // Cancel the job step, since we didn't find the mpi_fini msg - error("step %u.%u task %u exited without calling mpi_fini()", - job->jobid, job->stepid, task->gtid); + // srun only receives error() messages by default, so emit one error() + // per compute node and log any further occurrences with info(). + if (terminated) { + info("step %u.%u task %u exited without calling " + "PMI_Finalize()", + job->jobid, job->stepid, task->gtid); + } else { + error("step %u.%u task %u exited without calling " + "PMI_Finalize()", + job->jobid, job->stepid, task->gtid); + terminated = 1; + } info("reset estatus from %d to %d", task->estatus, SIGKILL); task->estatus = SIGKILL; }