Skip to content
Snippets Groups Projects
Commit adb45114 authored by Brian Christiansen
Browse files

Fix locking around Cray CCM prolog/epilog

job_ptr was still being referenced after releasing the read locks. Also
prolog_running_decr() needs to have a job write lock and a fed read
lock.

Bug 4947
parent 1ba9eb76
No related branches found
No related tags found
No related merge requests found
......@@ -20,6 +20,7 @@ documents those changes that are of interest to users and administrators.
-- Add documentation for fix IDLE*+POWER due to capmc stuck in Cray systems.
-- Fix missing mutex unlock when prolog is failing on a node, leading to a
hung slurmd.
-- Fix locking around Cray CCM prolog/epilog.
* Changes in Slurm 17.11.5
==========================
......
......@@ -530,19 +530,33 @@ extern int ccm_check_partitions(struct job_record *job_ptr)
extern void *ccm_begin(void *args)
{
int i, j, num_ents, kill = 1;
uint32_t job_id;
size_t copysz;
ccm_info_t ccm_info;
char err_str_buf[128], srun_msg_buf[256];
struct job_record *job_ptr = (struct job_record *)args;
slurmctld_lock_t job_read_lock =
{NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
{ NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
slurmctld_lock_t job_write_lock =
{ NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, READ_LOCK };
lock_slurmctld(job_read_lock);
if (job_ptr->magic != JOB_MAGIC) {
unlock_slurmctld(job_read_lock);
error("ccm job has disappeared");
return NULL;
} else if (IS_JOB_COMPLETING(job_ptr)) {
unlock_slurmctld(job_read_lock);
debug("ccm %u job has already completed", job_ptr->job_id);
return NULL;
}
job_id = job_ptr->job_id;
debug2("CCM job %u_ccm_begin partition %s", job_ptr->job_id,
job_ptr->partition);
memset(&ccm_info, 0, sizeof(ccm_info_t));
lock_slurmctld(job_read_lock);
ccm_info.job_id = job_ptr->job_id;
ccm_info.user_id = job_ptr->user_id;
ccm_info.nodelist = xstrdup(job_ptr->nodes);
......@@ -585,7 +599,6 @@ extern void *ccm_begin(void *args)
ccm_info.task_dist = job_ptr->details->task_dist;
}
ccm_info.plane_size = job_ptr->details->plane_size;
unlock_slurmctld(job_read_lock);
debug("CCM job %u, user_id %u, nodelist %s, node_cnt %d, "
"num_tasks %d", ccm_info.job_id, ccm_info.user_id,
......@@ -601,10 +614,12 @@ extern void *ccm_begin(void *args)
num_ents++;
}
}
unlock_slurmctld(job_read_lock);
if (ccm_info.node_cnt != num_ents) {
CRAY_ERR("CCM job %u ccm_info.node_cnt %d doesn't match the "
"number of cpu_count_reps entries %d",
job_ptr->job_id, ccm_info.node_cnt, num_ents);
job_id, ccm_info.node_cnt, num_ents);
snprintf(err_str_buf, sizeof(err_str_buf),
"node_cnt %d != cpu_count_reps %d, prolog not run",
ccm_info.node_cnt, num_ents);
......@@ -614,6 +629,14 @@ extern void *ccm_begin(void *args)
snprintf(err_str_buf, sizeof(err_str_buf),
"prolog failed");
}
lock_slurmctld(job_write_lock);
if ((job_ptr->magic != JOB_MAGIC) ||
(job_ptr->job_id != job_id)) {
unlock_slurmctld(job_write_lock);
error("ccm job %u has disappeared after running ccm", job_id);
return NULL;
}
debug("CCM ccm_begin job %u prolog_running_decr, cur %d",
ccm_info.job_id, job_ptr->details->prolog_running);
prolog_running_decr(job_ptr);
......@@ -625,6 +648,7 @@ extern void *ccm_begin(void *args)
srun_user_message(job_ptr, srun_msg_buf);
(void) job_signal(job_ptr->job_id, SIGKILL, 0, 0, false);
}
unlock_slurmctld(job_write_lock);
/* Free the malloc'd fields within this structure */
_free_ccm_info(&ccm_info);
return NULL;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment