Skip to content
Snippets Groups Projects
Commit eeb97050 authored by Morris Jette
Browse files

Cray PMI refinements

Refine commit 5f89223f based upon
feedback from David Gloe:
* It's not only MPI jobs, but anything that uses PMI. That includes MPI,
shmem, etc., so you may want to reword the error message.
* I added the terminated flag because if multiple tasks on a node exit,
you would get an error message from each of them. That reduces it to one
error message per node. Cray bug 810310 prompted that change.
* Since we're now relying on --kill-on-bad-exit, I think we should update
the Cray slurm.conf template to default to 1 (set KillOnBadExit=1 in
contribs/cray/slurm.conf.template).
bug 1171
parent 5f89223f
No related branches found
No related tags found
No related merge requests found
...@@ -32,7 +32,7 @@ JobContainerType=job_container/cncu ...@@ -32,7 +32,7 @@ JobContainerType=job_container/cncu
#JobFileAppend=0 #JobFileAppend=0
#JobRequeue=1 #JobRequeue=1
#JobSubmitPlugins=1 #JobSubmitPlugins=1
#KillOnBadExit=0 #KillOnBadExit=1
#LaunchType=launch/slurm #LaunchType=launch/slurm
#Licenses=foo*4,bar #Licenses=foo*4,bar
#MailProg=/bin/mail #MailProg=/bin/mail
......
...@@ -120,6 +120,8 @@ unsigned int numa_bitmask_weight(const struct bitmask *bmp); ...@@ -120,6 +120,8 @@ unsigned int numa_bitmask_weight(const struct bitmask *bmp);
static int _get_numa_nodes(char *path, int *cnt, int **numa_array); static int _get_numa_nodes(char *path, int *cnt, int **numa_array);
static int _get_cpu_masks(int num_numa_nodes, int32_t *numa_array, static int _get_cpu_masks(int num_numa_nodes, int32_t *numa_array,
cpu_set_t **cpuMasks); cpu_set_t **cpuMasks);
static int terminated = 0;
#endif #endif
/* /*
...@@ -386,8 +388,18 @@ extern int task_p_post_term (stepd_step_rec_t *job, ...@@ -386,8 +388,18 @@ extern int task_p_post_term (stepd_step_rec_t *job,
} }
// Cancel the job step, since we didn't find the mpi_fini msg // Cancel the job step, since we didn't find the mpi_fini msg
error("step %u.%u task %u exited without calling mpi_fini()", // srun only gets the error() messages by default, send one
job->jobid, job->stepid, task->gtid); // per compute node, but log all other events with info().
if (terminated) {
info("step %u.%u task %u exited without calling "
"PMI_Finalize()",
job->jobid, job->stepid, task->gtid);
} else {
error("step %u.%u task %u exited without calling "
"PMI_Finalize()",
job->jobid, job->stepid, task->gtid);
terminated = 1;
}
info("reset estatus from %d to %d", task->estatus, SIGKILL); info("reset estatus from %d to %d", task->estatus, SIGKILL);
task->estatus = SIGKILL; task->estatus = SIGKILL;
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment