diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index e9959e1458a41cc5a7c486cd1443797d8b218fbf..f61fda3fd289c3b6623c0c2a10f45f22373805e6 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -253,13 +253,6 @@ static pthread_mutex_t job_limits_mutex = PTHREAD_MUTEX_INITIALIZER;
 static List job_limits_list = NULL;
 static bool job_limits_loaded = false;
 
-/*
- * To be fixed in 17.11 to match the count of cpus on a node instead of a hard
- * code.
- */
-#define FINI_JOB_CNT 256
-static pthread_mutex_t fini_mutex = PTHREAD_MUTEX_INITIALIZER;
-static uint32_t fini_job_id[FINI_JOB_CNT];
 static int next_fini_job_inx = 0;
 
 /* NUM_PARALLEL_SUSP_JOBS controls the number of jobs that can be suspended or
@@ -1835,24 +1828,24 @@ static bool _is_batch_job_finished(uint32_t job_id)
 	bool found_job = false;
 	int i;
 
-	slurm_mutex_lock(&fini_mutex);
-	for (i = 0; i < FINI_JOB_CNT; i++) {
+	slurm_mutex_lock(&fini_job_mutex);
+	for (i = 0; i < fini_job_cnt; i++) {
 		if (fini_job_id[i] == job_id) {
 			found_job = true;
 			break;
 		}
 	}
-	slurm_mutex_unlock(&fini_mutex);
+	slurm_mutex_unlock(&fini_job_mutex);
 	return found_job;
 }
 
 static void _note_batch_job_finished(uint32_t job_id)
 {
-	slurm_mutex_lock(&fini_mutex);
+	slurm_mutex_lock(&fini_job_mutex);
 	fini_job_id[next_fini_job_inx] = job_id;
-	if (++next_fini_job_inx >= FINI_JOB_CNT)
+	if (++next_fini_job_inx >= fini_job_cnt)
 		next_fini_job_inx = 0;
-	slurm_mutex_unlock(&fini_mutex);
+	slurm_mutex_unlock(&fini_job_mutex);
 }
 
 /* Send notification to slurmctld we are finished running the prolog.
diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c
index eda00ad142863b337ffa3713d672069e596f1de0..059b4cc88dbe19b39f1c5d64b17902f98c672fa3 100644
--- a/src/slurmd/slurmd/slurmd.c
+++ b/src/slurmd/slurmd/slurmd.c
@@ -125,6 +125,9 @@
 /* global, copied to STDERR_FILENO in tasks before the exec */
 int devnull = -1;
 slurmd_conf_t * conf = NULL;
+int fini_job_cnt = 0;
+uint32_t *fini_job_id = NULL;
+pthread_mutex_t fini_job_mutex = PTHREAD_MUTEX_INITIALIZER;
 
 /*
  * count of active threads
@@ -1506,7 +1509,8 @@ _slurmd_init(void)
 	 */
 	_read_config();
 
-	cpu_cnt = MAX(conf->conf_cpus, conf->block_map_size);
+	fini_job_cnt = cpu_cnt = MAX(conf->conf_cpus, conf->block_map_size);
+	fini_job_id = xmalloc(sizeof(uint32_t) * fini_job_cnt);
 
 	if ((gres_plugin_init() != SLURM_SUCCESS) ||
 	    (gres_plugin_node_config_load(cpu_cnt, conf->node_name, NULL)
@@ -1727,6 +1731,10 @@ _slurmd_fini(void)
 	acct_gather_conf_destroy();
 	fini_system_cgroup();
 	route_fini();
+	slurm_mutex_lock(&fini_job_mutex);
+	xfree(fini_job_id);
+	fini_job_cnt = 0;
+	slurm_mutex_unlock(&fini_job_mutex);
 
 	return SLURM_SUCCESS;
 }
diff --git a/src/slurmd/slurmd/slurmd.h b/src/slurmd/slurmd/slurmd.h
index b5e6339caddf40dbe649cfa6d639d8f771919782..f38a8f9f05977984c36dcd452b423fc0acc964ee 100644
--- a/src/slurmd/slurmd/slurmd.h
+++ b/src/slurmd/slurmd/slurmd.h
@@ -170,6 +170,9 @@ typedef struct slurmd_config {
 } slurmd_conf_t;
 
 extern slurmd_conf_t * conf;
+extern int fini_job_cnt;
+extern uint32_t *fini_job_id;
+extern pthread_mutex_t fini_job_mutex;
 
 /* Send node registration message with status to controller
  * IN status - same values slurm error codes (for node shutdown)
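For context on the pattern this patch adopts: the removed code kept a fixed 256-entry ring buffer (FINI_JOB_CNT) of recently finished batch job IDs, while the patch sizes that buffer at daemon startup from MAX(conf->conf_cpus, conf->block_map_size) and frees it under the lock at shutdown. Below is a minimal standalone sketch of the same mutex-protected ring-buffer technique; the fini_ring_* names are hypothetical, and plain pthreads/libc stand in for Slurm's slurm_mutex_* and xmalloc()/xfree() wrappers.

/*
 * Standalone sketch (not Slurm source) of a dynamically sized,
 * mutex-protected ring buffer of finished job IDs, as introduced by
 * the patch above.  All fini_ring_* names are hypothetical.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

static pthread_mutex_t ring_mutex = PTHREAD_MUTEX_INITIALIZER;
static uint32_t *ring_ids = NULL;	/* remembered job IDs */
static int ring_cnt = 0;		/* capacity, fixed at init */
static int ring_next = 0;		/* next slot to overwrite */

/* Size the buffer from a runtime value, e.g. the node's CPU count. */
static void fini_ring_init(int capacity)
{
	ring_cnt = capacity;
	ring_ids = calloc((size_t)ring_cnt, sizeof(uint32_t));
	if (!ring_ids)
		abort();	/* Slurm's xmalloc() is likewise fatal on failure */
}

/* Record a completion, overwriting the oldest entry on wraparound. */
static void fini_ring_note(uint32_t job_id)
{
	pthread_mutex_lock(&ring_mutex);
	ring_ids[ring_next] = job_id;
	if (++ring_next >= ring_cnt)
		ring_next = 0;
	pthread_mutex_unlock(&ring_mutex);
}

/* Linear scan; only the most recent ring_cnt completions are visible.
 * Zeroed slots never match, assuming real job IDs are nonzero. */
static bool fini_ring_seen(uint32_t job_id)
{
	bool found = false;
	int i;

	pthread_mutex_lock(&ring_mutex);
	for (i = 0; i < ring_cnt; i++) {
		if (ring_ids[i] == job_id) {
			found = true;
			break;
		}
	}
	pthread_mutex_unlock(&ring_mutex);
	return found;
}

/* Tear down under the lock, mirroring the patch's _slurmd_fini() hunk. */
static void fini_ring_destroy(void)
{
	pthread_mutex_lock(&ring_mutex);
	free(ring_ids);
	ring_ids = NULL;
	ring_cnt = 0;
	pthread_mutex_unlock(&ring_mutex);
}

Sizing the ring to the CPU count rather than a hard-coded 256 presumably bounds memory while covering at least as many jobs as the node can run concurrently. Note the structure is deliberately lossy: a lookup can miss a job that finished more than ring_cnt completions ago, which callers must tolerate.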