From 231af0d51c0c30cc70c1d03a12a0754f2641f64c Mon Sep 17 00:00:00 2001
From: Danny Auble <da@schedmd.com>
Date: Mon, 12 Jun 2017 16:33:30 -0600
Subject: [PATCH] Make the fini array in slurmd based on the number of cpus
 on the node instead of a hard-coded value.

This probably needs to be something higher to accommodate potential
systems with OverSubscribe enabled.

See bug 3833
---
 src/slurmd/slurmd/req.c    | 19 ++++++-------------
 src/slurmd/slurmd/slurmd.c | 10 +++++++++-
 src/slurmd/slurmd/slurmd.h |  3 +++
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index e9959e1458a..f61fda3fd28 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -253,13 +253,6 @@ static pthread_mutex_t job_limits_mutex = PTHREAD_MUTEX_INITIALIZER;
 static List job_limits_list = NULL;
 static bool job_limits_loaded = false;
 
-/*
- * To be fixed in 17.11 to match the count of cpus on a node instead of a hard
- * code.
- */
-#define FINI_JOB_CNT 256
-static pthread_mutex_t fini_mutex = PTHREAD_MUTEX_INITIALIZER;
-static uint32_t fini_job_id[FINI_JOB_CNT];
 static int next_fini_job_inx = 0;
 
 /* NUM_PARALLEL_SUSP_JOBS controls the number of jobs that can be suspended or
@@ -1835,24 +1828,24 @@ static bool _is_batch_job_finished(uint32_t job_id)
 	bool found_job = false;
 	int i;
 
-	slurm_mutex_lock(&fini_mutex);
-	for (i = 0; i < FINI_JOB_CNT; i++) {
+	slurm_mutex_lock(&fini_job_mutex);
+	for (i = 0; i < fini_job_cnt; i++) {
 		if (fini_job_id[i] == job_id) {
 			found_job = true;
 			break;
 		}
 	}
-	slurm_mutex_unlock(&fini_mutex);
+	slurm_mutex_unlock(&fini_job_mutex);
 
 	return found_job;
 }
 
 static void _note_batch_job_finished(uint32_t job_id)
 {
-	slurm_mutex_lock(&fini_mutex);
+	slurm_mutex_lock(&fini_job_mutex);
 	fini_job_id[next_fini_job_inx] = job_id;
-	if (++next_fini_job_inx >= FINI_JOB_CNT)
+	if (++next_fini_job_inx >= fini_job_cnt)
 		next_fini_job_inx = 0;
-	slurm_mutex_unlock(&fini_mutex);
+	slurm_mutex_unlock(&fini_job_mutex);
 }
 
 /* Send notification to slurmctld we are finished running the prolog.
diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c
index eda00ad1428..059b4cc88db 100644
--- a/src/slurmd/slurmd/slurmd.c
+++ b/src/slurmd/slurmd/slurmd.c
@@ -125,6 +125,9 @@
 /* global, copied to STDERR_FILENO in tasks before the exec */
 int devnull = -1;
 slurmd_conf_t * conf = NULL;
+int fini_job_cnt = 0;
+uint32_t *fini_job_id = NULL;
+pthread_mutex_t fini_job_mutex = PTHREAD_MUTEX_INITIALIZER;
 
 /*
  * count of active threads
@@ -1506,7 +1509,8 @@ _slurmd_init(void)
 	 */
 	_read_config();
 
-	cpu_cnt = MAX(conf->conf_cpus, conf->block_map_size);
+	fini_job_cnt = cpu_cnt = MAX(conf->conf_cpus, conf->block_map_size);
+	fini_job_id = xmalloc(sizeof(uint32_t) * fini_job_cnt);
 
 	if ((gres_plugin_init() != SLURM_SUCCESS) ||
 	    (gres_plugin_node_config_load(cpu_cnt, conf->node_name, NULL)
@@ -1727,6 +1731,10 @@ _slurmd_fini(void)
 	acct_gather_conf_destroy();
 	fini_system_cgroup();
 	route_fini();
+	slurm_mutex_lock(&fini_job_mutex);
+	xfree(fini_job_id);
+	fini_job_cnt = 0;
+	slurm_mutex_unlock(&fini_job_mutex);
 
 	return SLURM_SUCCESS;
 }
diff --git a/src/slurmd/slurmd/slurmd.h b/src/slurmd/slurmd/slurmd.h
index b5e6339cadd..f38a8f9f059 100644
--- a/src/slurmd/slurmd/slurmd.h
+++ b/src/slurmd/slurmd/slurmd.h
@@ -170,6 +170,9 @@ typedef struct slurmd_config {
 } slurmd_conf_t;
 
 extern slurmd_conf_t * conf;
+extern int fini_job_cnt;
+extern uint32_t *fini_job_id;
+extern pthread_mutex_t fini_job_mutex;
 
 /* Send node registration message with status to controller
  * IN status - same values slurm error codes (for node shutdown)
--
GitLab
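
A note on the pattern for readers outside the Slurm tree: the patch replaces a
compile-time ring buffer (FINI_JOB_CNT = 256) with one allocated at slurmd
startup and sized from the node's CPU count. The sketch below is a minimal,
self-contained C illustration of that pattern, not the slurmd code itself: the
fini_job_* globals mirror the patch, but fini_buf_init(), note_job_finished(),
is_job_finished(), and the job IDs in main() are hypothetical stand-ins for
the slurmd internals.

/*
 * Minimal sketch (assumptions noted above): a mutex-protected ring buffer
 * of recently finished job IDs, sized at init time rather than compile time.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t fini_job_mutex = PTHREAD_MUTEX_INITIALIZER;
static uint32_t *fini_job_id = NULL;	/* ring buffer of finished job IDs */
static int fini_job_cnt = 0;		/* ring size, set at init */
static int next_fini_job_inx = 0;	/* next slot to overwrite */

/* Size the ring once the CPU count is known (hypothetical init helper) */
static int fini_buf_init(int cpu_cnt)
{
	fini_job_cnt = cpu_cnt;
	fini_job_id = calloc(fini_job_cnt, sizeof(uint32_t));
	return fini_job_id ? 0 : -1;
}

/* Record a completion, overwriting the oldest entry once the ring is full */
static void note_job_finished(uint32_t job_id)
{
	pthread_mutex_lock(&fini_job_mutex);
	fini_job_id[next_fini_job_inx] = job_id;
	if (++next_fini_job_inx >= fini_job_cnt)
		next_fini_job_inx = 0;
	pthread_mutex_unlock(&fini_job_mutex);
}

/* Linear scan; only the last fini_job_cnt completions are visible */
static bool is_job_finished(uint32_t job_id)
{
	bool found = false;
	int i;

	pthread_mutex_lock(&fini_job_mutex);
	for (i = 0; i < fini_job_cnt; i++) {
		if (fini_job_id[i] == job_id) {
			found = true;
			break;
		}
	}
	pthread_mutex_unlock(&fini_job_mutex);
	return found;
}

int main(void)
{
	if (fini_buf_init(4))		/* pretend the node has 4 CPUs */
		return 1;
	note_job_finished(1001);
	note_job_finished(1002);
	printf("1001 finished: %d\n", is_job_finished(1001));	/* prints 1 */
	printf("9999 finished: %d\n", is_job_finished(9999));	/* prints 0 */
	free(fini_job_id);
	return 0;
}

The trade-off the commit message hints at: a ring this size only remembers the
last fini_job_cnt completions, so on a node where OverSubscribe allows more
concurrent jobs than CPUs, older finished-job IDs can be overwritten before
anyone asks about them. That is why the author notes the count "probably needs
to be something higher".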