diff --git a/NEWS b/NEWS index 9c40051a5b8fce3cacff33ba705d5bdc9d4a4563..8d5c7f4559adeae09d87a3d5a95dc945aea5136d 100644 --- a/NEWS +++ b/NEWS @@ -135,6 +135,8 @@ documents those changes that are of interest to users and administrators. * Changes in Slurm 16.05.7 ========================== + -- Fix issue in the priority/multifactor plugin where on a slurmctld restart + more time than should be allowed would be accounted for. * Changes in Slurm 16.05.6 ========================== diff --git a/src/plugins/priority/multifactor/priority_multifactor.c b/src/plugins/priority/multifactor/priority_multifactor.c index 367216f3cc8be80e4e10d8a5b2f76db25a901fd7..75fc0a4797f1ae8273bddf0470a58f75c2814aba 100644 --- a/src/plugins/priority/multifactor/priority_multifactor.c +++ b/src/plugins/priority/multifactor/priority_multifactor.c @@ -130,6 +130,8 @@ const uint32_t plugin_version = SLURM_VERSION_NUMBER; static pthread_t decay_handler_thread; static pthread_t cleanup_handler_thread; static pthread_mutex_t decay_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_mutex_t decay_init_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t decay_init_cond = PTHREAD_COND_INITIALIZER; static bool running_decay = 0, reconfig = 0, calc_fairshare = 1; static bool favor_small; /* favor small jobs over large */ static uint16_t damp_factor = 1; /* weight for age factor */ @@ -1206,6 +1208,9 @@ static void *_decay_thread(void *no_data) * * This explain the following declaration. */ + + slurm_mutex_lock(&decay_init_mutex); + if (decay_hl > 0) decay_factor = 1 - (0.693 / decay_hl); @@ -1216,6 +1221,9 @@ static void *_decay_thread(void *no_data) if (last_reset == 0) last_reset = start_time; + pthread_cond_signal(&decay_init_cond); + slurm_mutex_unlock(&decay_init_mutex); + _init_grp_used_cpu_run_secs(g_last_ran); while (1) { @@ -1622,6 +1630,17 @@ int init ( void ) _decay_thread, NULL)) fatal("pthread_create error %m"); + /* The decay_thread sets up some global variables that are + * needed outside of the decay_thread (i.e. decay_factor, + * g_last_ran). These are needed if a job was completing and + * the slurmctld was reset. If they aren't setup before + * continuing we could get more time added than should be on a + * restart. So wait until they are set up. + */ + slurm_mutex_lock(&decay_init_mutex); + pthread_cond_wait(&decay_init_cond, &decay_init_mutex); + slurm_mutex_unlock(&decay_init_mutex); + /* This is here to join the decay thread so we don't core * dump if in the sleep, since there is no other place to join * we have to create another thread to do it. */