From a99fec24c7efc6994564a32db1d475d037dcc6f0 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@schedmd.com> Date: Thu, 24 Jul 2014 10:33:05 -0700 Subject: [PATCH] Move functions around so all static functions are together, on top. --- .../multifactor/priority_multifactor.c | 1082 ++++++++--------- 1 file changed, 528 insertions(+), 554 deletions(-) diff --git a/src/plugins/priority/multifactor/priority_multifactor.c b/src/plugins/priority/multifactor/priority_multifactor.c index 8767e0ad084..26457a44c77 100644 --- a/src/plugins/priority/multifactor/priority_multifactor.c +++ b/src/plugins/priority/multifactor/priority_multifactor.c @@ -160,445 +160,6 @@ static double decay_factor = 1; /* The decay factor when decaying time. */ /* variables defined in prirority_multifactor.h */ bool priority_debug = 0; -/*void priority_p_set_assoc_usage(slurmdb_association_rec_t *assoc); -double priority_p_calc_fs_factor(long double usage_efctv, - long double shares_norm);*/ - - -static void _ticket_based_set_usage_efctv(slurmdb_association_rec_t *assoc); -static double _get_fairshare_priority(struct job_record *job_ptr); -static uint32_t _get_priority_internal(time_t start_time, - struct job_record *job_ptr); -static void _init_grp_used_cpu_run_secs(time_t last_ran); -static int _apply_new_usage(struct job_record *job_ptr, - time_t start_period, time_t end_period); -static int _filter_job(struct job_record *job_ptr, List req_job_list, - List req_user_list); -static void _set_norm_shares(List children_list); -static void _depth_oblivious_set_usage_efctv( - slurmdb_association_rec_t *assoc, - char *child, - char *child_str); -static void _set_usage_efctv(slurmdb_association_rec_t *assoc); -static void _internal_setup(void); - - -extern void priority_p_reconfig(bool assoc_clear) -{ - assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK, - NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; - - - reconfig = 1; - prevflags = flags; - _internal_setup(); - - /* Since LEVEL_BASED uses a different shares calculation method, we - * must reassign shares at reconfigure if the algorithm was switched to - * or from LEVEL_BASED */ - if ((flags & PRIORITY_FLAGS_LEVEL_BASED) != - (prevflags & PRIORITY_FLAGS_LEVEL_BASED)) { - assoc_mgr_lock(&locks); - _set_norm_shares(assoc_mgr_root_assoc->usage->children_list); - assoc_mgr_unlock(&locks); - } - - /* Since the used_cpu_run_secs has been reset by the reconfig, - * we need to remove the time that has past since the last - * poll. We can't just do the correct calculation in the - * first place because it will mess up everything in the poll - * since it is based off the g_last_ran time. - */ - if (assoc_clear) - _init_grp_used_cpu_run_secs(g_last_ran); - debug2("%s reconfigured", plugin_name); - - return; -} - - -extern uint32_t priority_p_set(uint32_t last_prio, struct job_record *job_ptr) -{ - uint32_t priority = _get_priority_internal(time(NULL), job_ptr); - - debug2("initial priority for job %u is %u", job_ptr->job_id, priority); - - return priority; -} - - -extern void priority_p_set_assoc_usage(slurmdb_association_rec_t *assoc) -{ - char *child; - char *child_str; - - xassert(assoc_mgr_root_assoc); - xassert(assoc); - xassert(assoc->usage); - xassert(assoc->usage->fs_assoc_ptr); - - if (assoc->user) { - child = "user"; - child_str = assoc->user; - } else { - child = "account"; - child_str = assoc->acct; - } - - if (assoc_mgr_root_assoc->usage->usage_raw) { - assoc->usage->usage_norm = assoc->usage->usage_raw - / assoc_mgr_root_assoc->usage->usage_raw; - } else { - /* This should only happen when no usage has occured - * at all so no big deal, the other usage should be 0 - * as well here. */ - assoc->usage->usage_norm = 0; - } - - if (priority_debug) { - info("Normalized usage for %s %s off %s(%s) %Lf / %Lf = %Lf", - child, child_str, - assoc->usage->parent_assoc_ptr->acct, - assoc->usage->fs_assoc_ptr->acct, - assoc->usage->usage_raw, - assoc_mgr_root_assoc->usage->usage_raw, - assoc->usage->usage_norm); - } - /* This is needed in case someone changes the half-life on the - * fly and now we have used more time than is available under - * the new config */ - if (assoc->usage->usage_norm > 1.0) - assoc->usage->usage_norm = 1.0; - - if (flags & PRIORITY_FLAGS_LEVEL_BASED) - assoc->usage->usage_efctv = - level_based_calc_assoc_usage(assoc); - else if (assoc->usage->fs_assoc_ptr == assoc_mgr_root_assoc) { - assoc->usage->usage_efctv = assoc->usage->usage_norm; - if (priority_debug) - info("Effective usage for %s %s off %s(%s) %Lf %Lf", - child, child_str, - assoc->usage->parent_assoc_ptr->acct, - assoc->usage->fs_assoc_ptr->acct, - assoc->usage->usage_efctv, - assoc->usage->usage_norm); - } else if (flags & PRIORITY_FLAGS_TICKET_BASED) { - _ticket_based_set_usage_efctv(assoc); - if (priority_debug) { - info("Effective usage for %s %s off %s(%s) = %Lf", - child, child_str, - assoc->usage->parent_assoc_ptr->acct, - assoc->usage->fs_assoc_ptr->acct, - assoc->usage->usage_efctv); - } - } else if (assoc->shares_raw == SLURMDB_FS_USE_PARENT) { - slurmdb_association_rec_t *parent_assoc = - assoc->usage->fs_assoc_ptr; - - assoc->usage->usage_efctv = - parent_assoc->usage->usage_efctv; - if (priority_debug) { - info("Effective usage for %s %s off %s %Lf", - child, child_str, - parent_assoc->acct, - parent_assoc->usage->usage_efctv); - } - } else if (flags & PRIORITY_FLAGS_DEPTH_OBLIVIOUS) { - _depth_oblivious_set_usage_efctv(assoc, child, child_str); - } else { - _set_usage_efctv(assoc); - if (priority_debug) { - info("Effective usage for %s %s off %s(%s) " - "%Lf + ((%Lf - %Lf) * %d / %d) = %Lf", - child, child_str, - assoc->usage->parent_assoc_ptr->acct, - assoc->usage->fs_assoc_ptr->acct, - assoc->usage->usage_norm, - assoc->usage->fs_assoc_ptr->usage->usage_efctv, - assoc->usage->usage_norm, - assoc->shares_raw, - assoc->usage->level_shares, - assoc->usage->usage_efctv); - } - } -} - - -extern double priority_p_calc_fs_factor(long double usage_efctv, - long double shares_norm) -{ - double priority_fs = 0.0; - - if (fuzzy_equal(usage_efctv, NO_VAL)) - return priority_fs; - - if (shares_norm <= 0) - return priority_fs; - - if (flags & PRIORITY_FLAGS_TICKET_BASED) { - if (usage_efctv < MIN_USAGE_FACTOR * shares_norm) - usage_efctv = MIN_USAGE_FACTOR * shares_norm; - priority_fs = shares_norm / usage_efctv; - } else { - priority_fs = - pow(2.0, -((usage_efctv/shares_norm) / damp_factor)); - } - - return priority_fs; -} - -extern List priority_p_get_priority_factors_list( - priority_factors_request_msg_t *req_msg, uid_t uid) -{ - List req_job_list; - List req_user_list; - List ret_list = NULL; - ListIterator itr; - priority_factors_object_t *obj = NULL; - struct job_record *job_ptr = NULL; - time_t start_time = time(NULL); - - /* Read lock on jobs, nodes, and partitions */ - slurmctld_lock_t job_read_lock = - { NO_LOCK, READ_LOCK, READ_LOCK, READ_LOCK }; - - xassert(req_msg); - req_job_list = req_msg->job_id_list; - req_user_list = req_msg->uid_list; - - lock_slurmctld(job_read_lock); - if (job_list && list_count(job_list)) { - ret_list = list_create(slurm_destroy_priority_factors_object); - itr = list_iterator_create(job_list); - while ((job_ptr = list_next(itr))) { - if (!(flags & PRIORITY_FLAGS_CALCULATE_RUNNING) && - !IS_JOB_PENDING(job_ptr)) - continue; - - /* - * This means the job is not eligible yet - */ - if (!job_ptr->details->begin_time - || (job_ptr->details->begin_time > start_time)) - continue; - - /* - * 0 means the job is held - */ - if (job_ptr->priority == 0) - continue; - - /* - * Priority has been set elsewhere (e.g. by SlurmUser) - */ - if (job_ptr->direct_set_prio) - continue; - - if (_filter_job(job_ptr, req_job_list, req_user_list)) - continue; - - if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS) - && (job_ptr->user_id != uid) - && !validate_operator(uid) - && !assoc_mgr_is_user_acct_coord( - acct_db_conn, uid, - job_ptr->account)) - continue; - - obj = xmalloc(sizeof(priority_factors_object_t)); - memcpy(obj, job_ptr->prio_factors, - sizeof(priority_factors_object_t)); - obj->job_id = job_ptr->job_id; - obj->user_id = job_ptr->user_id; - list_append(ret_list, obj); - } - list_iterator_destroy(itr); - if (!list_count(ret_list)) { - list_destroy(ret_list); - ret_list = NULL; - } - } - unlock_slurmctld(job_read_lock); - - return ret_list; -} - -/* at least slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, - * READ_LOCK, READ_LOCK }; should be locked before calling this */ -extern void priority_p_job_end(struct job_record *job_ptr) -{ - if (priority_debug) - info("priority_p_job_end: called for job %u", job_ptr->job_id); - - _apply_new_usage(job_ptr, g_last_ran, time(NULL)); -} - -extern bool decay_apply_new_usage(struct job_record *job_ptr, - time_t *start_time_ptr) -{ - - /* Don't need to handle finished jobs. */ - if (IS_JOB_FINISHED(job_ptr) || IS_JOB_COMPLETING(job_ptr)) - return false; - - /* apply new usage */ - if (((flags & PRIORITY_FLAGS_CALCULATE_RUNNING) || - !IS_JOB_PENDING(job_ptr)) && - job_ptr->start_time && job_ptr->assoc_ptr) { - if (!_apply_new_usage(job_ptr, g_last_ran, *start_time_ptr)) - return false; - } - return true; -} - - -extern void decay_apply_weighted_factors(struct job_record *job_ptr, - time_t *start_time_ptr) -{ - /* - * Priority 0 is reserved for held - * jobs. Also skip priority - * calculation for non-pending jobs. - */ - if ((job_ptr->priority == 0) || - (!IS_JOB_PENDING(job_ptr) && - !(flags & PRIORITY_FLAGS_CALCULATE_RUNNING))) - return; - - job_ptr->priority = _get_priority_internal(*start_time_ptr, job_ptr); - last_job_update = time(NULL); - debug2("priority for job %u is now %u", - job_ptr->job_id, job_ptr->priority); - -} - - -extern void set_priority_factors(time_t start_time, struct job_record *job_ptr) -{ - slurmdb_qos_rec_t *qos_ptr = NULL; - - xassert(job_ptr); - - if (!job_ptr->prio_factors) - job_ptr->prio_factors = - xmalloc(sizeof(priority_factors_object_t)); - else - memset(job_ptr->prio_factors, 0, - sizeof(priority_factors_object_t)); - - qos_ptr = (slurmdb_qos_rec_t *)job_ptr->qos_ptr; - - if (weight_age) { - uint32_t diff = 0; - time_t use_time; - - if (flags & PRIORITY_FLAGS_ACCRUE_ALWAYS) - use_time = job_ptr->details->submit_time; - else - use_time = job_ptr->details->begin_time; - - /* Only really add an age priority if the use_time is - past the start_time. - */ - if (start_time > use_time) - diff = start_time - use_time; - - if (job_ptr->details->begin_time - || (flags & PRIORITY_FLAGS_ACCRUE_ALWAYS)) { - if (diff < max_age) { - job_ptr->prio_factors->priority_age = - (double)diff / (double)max_age; - } else - job_ptr->prio_factors->priority_age = 1.0; - } - } - - if (job_ptr->assoc_ptr && weight_fs) { - job_ptr->prio_factors->priority_fs = - _get_fairshare_priority(job_ptr); - } - - if (weight_js) { - uint32_t cpu_cnt = 0, min_nodes = 1; - /* On the initial run of this we don't have total_cpus - so go off the requesting. After the first shot - total_cpus should be filled in. - */ - if (job_ptr->total_cpus) - cpu_cnt = job_ptr->total_cpus; - else if (job_ptr->details - && (job_ptr->details->max_cpus != NO_VAL)) - cpu_cnt = job_ptr->details->max_cpus; - else if (job_ptr->details && job_ptr->details->min_cpus) - cpu_cnt = job_ptr->details->min_cpus; - if (job_ptr->details) - min_nodes = job_ptr->details->min_nodes; - - if (flags & PRIORITY_FLAGS_SIZE_RELATIVE) { - uint32_t time_limit = 1; - /* Job size in CPUs (based upon average CPUs/Node */ - job_ptr->prio_factors->priority_js = - (double)min_nodes * - (double)cluster_cpus / - (double)node_record_count; - if (cpu_cnt > job_ptr->prio_factors->priority_js) { - job_ptr->prio_factors->priority_js = - (double)cpu_cnt; - } - /* Divide by job time limit */ - if (job_ptr->time_limit != NO_VAL) - time_limit = job_ptr->time_limit; - else if (job_ptr->part_ptr) - time_limit = job_ptr->part_ptr->max_time; - job_ptr->prio_factors->priority_js /= time_limit; - /* Normalize to max value of 1.0 */ - job_ptr->prio_factors->priority_js /= cluster_cpus; - if (favor_small) { - job_ptr->prio_factors->priority_js = - (double) 1.0 - - job_ptr->prio_factors->priority_js; - } - } else if (favor_small) { - job_ptr->prio_factors->priority_js = - (double)(node_record_count - min_nodes) - / (double)node_record_count; - if (cpu_cnt) { - job_ptr->prio_factors->priority_js += - (double)(cluster_cpus - cpu_cnt) - / (double)cluster_cpus; - job_ptr->prio_factors->priority_js /= 2; - } - } else { /* favor large */ - job_ptr->prio_factors->priority_js = - (double)min_nodes / (double)node_record_count; - if (cpu_cnt) { - job_ptr->prio_factors->priority_js += - (double)cpu_cnt / (double)cluster_cpus; - job_ptr->prio_factors->priority_js /= 2; - } - } - if (job_ptr->prio_factors->priority_js < .0) - job_ptr->prio_factors->priority_js = 0.0; - else if (job_ptr->prio_factors->priority_js > 1.0) - job_ptr->prio_factors->priority_js = 1.0; - } - - if (job_ptr->part_ptr && job_ptr->part_ptr->priority && weight_part) { - job_ptr->prio_factors->priority_part = - job_ptr->part_ptr->norm_priority; - } - - if (qos_ptr && qos_ptr->priority && weight_qos) { - job_ptr->prio_factors->priority_qos = - qos_ptr->usage->norm_priority; - } - - if (job_ptr->details) - job_ptr->prio_factors->nice = job_ptr->details->nice; - else - job_ptr->prio_factors->nice = NICE_OFFSET; -} - /* * apply decay factor to all associations usage_raw @@ -1844,6 +1405,147 @@ static void _internal_setup(void) } } + +/* Reursively call assoc_mgr_normalize_assoc_shares from assoc_mgr.c on + * children of an association + */ +static void _set_norm_shares(List children_list) +{ + ListIterator itr = NULL; + slurmdb_association_rec_t *assoc = NULL; + + if (!children_list || list_is_empty(children_list)) + return; + + itr = list_iterator_create(children_list); + while ((assoc = list_next(itr))) { + assoc_mgr_normalize_assoc_shares(assoc); + if (!assoc->user) + _set_norm_shares(assoc->usage->children_list); + } + + list_iterator_destroy(itr); +} + + +static void _depth_oblivious_set_usage_efctv( + slurmdb_association_rec_t *assoc, + char *child, + char *child_str) +{ + long double ratio_p, ratio_l, k, f, ratio_s; + slurmdb_association_rec_t *parent_assoc = NULL; + ListIterator sib_itr = NULL; + slurmdb_association_rec_t *sibling = NULL; + + /* We want priority_fs = pow(2.0, -R); where + R = ratio_p * ratio_l^k + */ + + /* ratio_p is R for our parent */ + + /* ratio_l is our usage ratio r divided by ratio_s, + * the usage ratio of our siblings (including + * ourselves). In the standard case where everything + * is consumed at the leaf accounts ratio_s=ratio_p + */ + + /* k is a factor which tends towards 0 when ratio_p + diverges from 1 and ratio_l would bring back R + towards 1 + */ + + /* Effective usage is now computed to be R*shares_norm + so that the general formula of + priority_fs = pow(2.0, -(usage_efctv / shares_norm)) + gives what we want: priority_fs = pow(2.0, -R); + */ + + f = 5.0; /* FIXME: This could be a tunable parameter + (higher f means more impact when parent consumption + is inadequate) */ + parent_assoc = assoc->usage->fs_assoc_ptr; + + if (assoc->usage->shares_norm && + parent_assoc->usage->shares_norm && + parent_assoc->usage->usage_efctv && + assoc->usage->usage_norm) { + ratio_p = (parent_assoc->usage->usage_efctv / + parent_assoc->usage->shares_norm); + + ratio_s = 0; + sib_itr = list_iterator_create( + parent_assoc->usage->children_list); + while ((sibling = list_next(sib_itr))) { + if(sibling->shares_raw != SLURMDB_FS_USE_PARENT) + ratio_s += sibling->usage->usage_norm; + } + list_iterator_destroy(sib_itr); + ratio_s /= parent_assoc->usage->shares_norm; + + ratio_l = (assoc->usage->usage_norm / + assoc->usage->shares_norm) / ratio_s; +#if defined(__FreeBSD__) + if (!ratio_p || !ratio_l + || log(ratio_p) * log(ratio_l) >= 0) { + k = 1; + } else { + k = 1 / (1 + pow(f * log(ratio_p), 2)); + } + + assoc->usage->usage_efctv = + ratio_p * pow(ratio_l, k) * + assoc->usage->shares_norm; +#else + if (!ratio_p || !ratio_l + || logl(ratio_p) * logl(ratio_l) >= 0) { + k = 1; + } else { + k = 1 / (1 + powl(f * logl(ratio_p), 2)); + } + + assoc->usage->usage_efctv = + ratio_p * pow(ratio_l, k) * + assoc->usage->shares_norm; +#endif + + if (priority_debug) { + info("Effective usage for %s %s off %s(%s) " + "(%Lf * %Lf ^ %Lf) * %f = %Lf", + child, child_str, + assoc->usage->parent_assoc_ptr->acct, + assoc->usage->fs_assoc_ptr->acct, + ratio_p, ratio_l, k, + assoc->usage->shares_norm, + assoc->usage->usage_efctv); + } + } else { + assoc->usage->usage_efctv = assoc->usage->usage_norm; + if (priority_debug) { + info("Effective usage for %s %s off %s(%s) %Lf", + child, child_str, + assoc->usage->parent_assoc_ptr->acct, + assoc->usage->fs_assoc_ptr->acct, + assoc->usage->usage_efctv); + } + } +} + +static void _set_usage_efctv(slurmdb_association_rec_t *assoc) +{ + /* Variable names taken from HTML documentation */ + long double ua_child = assoc->usage->usage_norm; + long double ue_parent = + assoc->usage->fs_assoc_ptr->usage->usage_efctv; + uint32_t s_child = assoc->shares_raw; + uint32_t s_all_siblings = assoc->usage->level_shares; + + assoc->usage->usage_efctv = ua_child + + (ue_parent - ua_child) * + (s_child / (long double) s_all_siblings); +} + + /* * init() is called when the plugin is loaded, before any other functions * are called. Put global initialization here. @@ -1928,147 +1630,419 @@ int fini ( void ) slurm_mutex_unlock(&decay_lock); - return SLURM_SUCCESS; + return SLURM_SUCCESS; +} + +extern uint32_t priority_p_set(uint32_t last_prio, struct job_record *job_ptr) +{ + uint32_t priority = _get_priority_internal(time(NULL), job_ptr); + + debug2("initial priority for job %u is %u", job_ptr->job_id, priority); + + return priority; +} + +extern void priority_p_reconfig(bool assoc_clear) +{ + assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK, + NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; + + + reconfig = 1; + prevflags = flags; + _internal_setup(); + + /* Since LEVEL_BASED uses a different shares calculation method, we + * must reassign shares at reconfigure if the algorithm was switched to + * or from LEVEL_BASED */ + if ((flags & PRIORITY_FLAGS_LEVEL_BASED) != + (prevflags & PRIORITY_FLAGS_LEVEL_BASED)) { + assoc_mgr_lock(&locks); + _set_norm_shares(assoc_mgr_root_assoc->usage->children_list); + assoc_mgr_unlock(&locks); + } + + /* Since the used_cpu_run_secs has been reset by the reconfig, + * we need to remove the time that has past since the last + * poll. We can't just do the correct calculation in the + * first place because it will mess up everything in the poll + * since it is based off the g_last_ran time. + */ + if (assoc_clear) + _init_grp_used_cpu_run_secs(g_last_ran); + debug2("%s reconfigured", plugin_name); + + return; +} + +extern void priority_p_set_assoc_usage(slurmdb_association_rec_t *assoc) +{ + char *child; + char *child_str; + + xassert(assoc_mgr_root_assoc); + xassert(assoc); + xassert(assoc->usage); + xassert(assoc->usage->fs_assoc_ptr); + + if (assoc->user) { + child = "user"; + child_str = assoc->user; + } else { + child = "account"; + child_str = assoc->acct; + } + + if (assoc_mgr_root_assoc->usage->usage_raw) { + assoc->usage->usage_norm = assoc->usage->usage_raw + / assoc_mgr_root_assoc->usage->usage_raw; + } else { + /* This should only happen when no usage has occured + * at all so no big deal, the other usage should be 0 + * as well here. */ + assoc->usage->usage_norm = 0; + } + + if (priority_debug) { + info("Normalized usage for %s %s off %s(%s) %Lf / %Lf = %Lf", + child, child_str, + assoc->usage->parent_assoc_ptr->acct, + assoc->usage->fs_assoc_ptr->acct, + assoc->usage->usage_raw, + assoc_mgr_root_assoc->usage->usage_raw, + assoc->usage->usage_norm); + } + /* This is needed in case someone changes the half-life on the + * fly and now we have used more time than is available under + * the new config */ + if (assoc->usage->usage_norm > 1.0) + assoc->usage->usage_norm = 1.0; + + if (flags & PRIORITY_FLAGS_LEVEL_BASED) + assoc->usage->usage_efctv = + level_based_calc_assoc_usage(assoc); + else if (assoc->usage->fs_assoc_ptr == assoc_mgr_root_assoc) { + assoc->usage->usage_efctv = assoc->usage->usage_norm; + if (priority_debug) + info("Effective usage for %s %s off %s(%s) %Lf %Lf", + child, child_str, + assoc->usage->parent_assoc_ptr->acct, + assoc->usage->fs_assoc_ptr->acct, + assoc->usage->usage_efctv, + assoc->usage->usage_norm); + } else if (flags & PRIORITY_FLAGS_TICKET_BASED) { + _ticket_based_set_usage_efctv(assoc); + if (priority_debug) { + info("Effective usage for %s %s off %s(%s) = %Lf", + child, child_str, + assoc->usage->parent_assoc_ptr->acct, + assoc->usage->fs_assoc_ptr->acct, + assoc->usage->usage_efctv); + } + } else if (assoc->shares_raw == SLURMDB_FS_USE_PARENT) { + slurmdb_association_rec_t *parent_assoc = + assoc->usage->fs_assoc_ptr; + + assoc->usage->usage_efctv = + parent_assoc->usage->usage_efctv; + if (priority_debug) { + info("Effective usage for %s %s off %s %Lf", + child, child_str, + parent_assoc->acct, + parent_assoc->usage->usage_efctv); + } + } else if (flags & PRIORITY_FLAGS_DEPTH_OBLIVIOUS) { + _depth_oblivious_set_usage_efctv(assoc, child, child_str); + } else { + _set_usage_efctv(assoc); + if (priority_debug) { + info("Effective usage for %s %s off %s(%s) " + "%Lf + ((%Lf - %Lf) * %d / %d) = %Lf", + child, child_str, + assoc->usage->parent_assoc_ptr->acct, + assoc->usage->fs_assoc_ptr->acct, + assoc->usage->usage_norm, + assoc->usage->fs_assoc_ptr->usage->usage_efctv, + assoc->usage->usage_norm, + assoc->shares_raw, + assoc->usage->level_shares, + assoc->usage->usage_efctv); + } + } +} + + +extern double priority_p_calc_fs_factor(long double usage_efctv, + long double shares_norm) +{ + double priority_fs = 0.0; + + if (fuzzy_equal(usage_efctv, NO_VAL)) + return priority_fs; + + if (shares_norm <= 0) + return priority_fs; + + if (flags & PRIORITY_FLAGS_TICKET_BASED) { + if (usage_efctv < MIN_USAGE_FACTOR * shares_norm) + usage_efctv = MIN_USAGE_FACTOR * shares_norm; + priority_fs = shares_norm / usage_efctv; + } else { + priority_fs = + pow(2.0, -((usage_efctv/shares_norm) / damp_factor)); + } + + return priority_fs; +} + +extern List priority_p_get_priority_factors_list( + priority_factors_request_msg_t *req_msg, uid_t uid) +{ + List req_job_list; + List req_user_list; + List ret_list = NULL; + ListIterator itr; + priority_factors_object_t *obj = NULL; + struct job_record *job_ptr = NULL; + time_t start_time = time(NULL); + + /* Read lock on jobs, nodes, and partitions */ + slurmctld_lock_t job_read_lock = + { NO_LOCK, READ_LOCK, READ_LOCK, READ_LOCK }; + + xassert(req_msg); + req_job_list = req_msg->job_id_list; + req_user_list = req_msg->uid_list; + + lock_slurmctld(job_read_lock); + if (job_list && list_count(job_list)) { + ret_list = list_create(slurm_destroy_priority_factors_object); + itr = list_iterator_create(job_list); + while ((job_ptr = list_next(itr))) { + if (!(flags & PRIORITY_FLAGS_CALCULATE_RUNNING) && + !IS_JOB_PENDING(job_ptr)) + continue; + + /* + * This means the job is not eligible yet + */ + if (!job_ptr->details->begin_time + || (job_ptr->details->begin_time > start_time)) + continue; + + /* + * 0 means the job is held + */ + if (job_ptr->priority == 0) + continue; + + /* + * Priority has been set elsewhere (e.g. by SlurmUser) + */ + if (job_ptr->direct_set_prio) + continue; + + if (_filter_job(job_ptr, req_job_list, req_user_list)) + continue; + + if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS) + && (job_ptr->user_id != uid) + && !validate_operator(uid) + && !assoc_mgr_is_user_acct_coord( + acct_db_conn, uid, + job_ptr->account)) + continue; + + obj = xmalloc(sizeof(priority_factors_object_t)); + memcpy(obj, job_ptr->prio_factors, + sizeof(priority_factors_object_t)); + obj->job_id = job_ptr->job_id; + obj->user_id = job_ptr->user_id; + list_append(ret_list, obj); + } + list_iterator_destroy(itr); + if (!list_count(ret_list)) { + list_destroy(ret_list); + ret_list = NULL; + } + } + unlock_slurmctld(job_read_lock); + + return ret_list; +} + +/* at least slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, + * READ_LOCK, READ_LOCK }; should be locked before calling this */ +extern void priority_p_job_end(struct job_record *job_ptr) +{ + if (priority_debug) + info("priority_p_job_end: called for job %u", job_ptr->job_id); + + _apply_new_usage(job_ptr, g_last_ran, time(NULL)); } - -/* Reursively call assoc_mgr_normalize_assoc_shares from assoc_mgr.c on - * children of an association - */ -static void _set_norm_shares(List children_list) +extern bool decay_apply_new_usage(struct job_record *job_ptr, + time_t *start_time_ptr) { - ListIterator itr = NULL; - slurmdb_association_rec_t *assoc = NULL; - if (!children_list || list_is_empty(children_list)) - return; + /* Don't need to handle finished jobs. */ + if (IS_JOB_FINISHED(job_ptr) || IS_JOB_COMPLETING(job_ptr)) + return false; - itr = list_iterator_create(children_list); - while ((assoc = list_next(itr))) { - assoc_mgr_normalize_assoc_shares(assoc); - if (!assoc->user) - _set_norm_shares(assoc->usage->children_list); + /* apply new usage */ + if (((flags & PRIORITY_FLAGS_CALCULATE_RUNNING) || + !IS_JOB_PENDING(job_ptr)) && + job_ptr->start_time && job_ptr->assoc_ptr) { + if (!_apply_new_usage(job_ptr, g_last_ran, *start_time_ptr)) + return false; } - - list_iterator_destroy(itr); + return true; } -static void _depth_oblivious_set_usage_efctv( - slurmdb_association_rec_t *assoc, - char *child, - char *child_str) +extern void decay_apply_weighted_factors(struct job_record *job_ptr, + time_t *start_time_ptr) { - long double ratio_p, ratio_l, k, f, ratio_s; - slurmdb_association_rec_t *parent_assoc = NULL; - ListIterator sib_itr = NULL; - slurmdb_association_rec_t *sibling = NULL; + /* + * Priority 0 is reserved for held + * jobs. Also skip priority + * calculation for non-pending jobs. + */ + if ((job_ptr->priority == 0) || + (!IS_JOB_PENDING(job_ptr) && + !(flags & PRIORITY_FLAGS_CALCULATE_RUNNING))) + return; - /* We want priority_fs = pow(2.0, -R); where - R = ratio_p * ratio_l^k - */ + job_ptr->priority = _get_priority_internal(*start_time_ptr, job_ptr); + last_job_update = time(NULL); + debug2("priority for job %u is now %u", + job_ptr->job_id, job_ptr->priority); - /* ratio_p is R for our parent */ +} - /* ratio_l is our usage ratio r divided by ratio_s, - * the usage ratio of our siblings (including - * ourselves). In the standard case where everything - * is consumed at the leaf accounts ratio_s=ratio_p - */ - /* k is a factor which tends towards 0 when ratio_p - diverges from 1 and ratio_l would bring back R - towards 1 - */ +extern void set_priority_factors(time_t start_time, struct job_record *job_ptr) +{ + slurmdb_qos_rec_t *qos_ptr = NULL; - /* Effective usage is now computed to be R*shares_norm - so that the general formula of - priority_fs = pow(2.0, -(usage_efctv / shares_norm)) - gives what we want: priority_fs = pow(2.0, -R); - */ + xassert(job_ptr); - f = 5.0; /* FIXME: This could be a tunable parameter - (higher f means more impact when parent consumption - is inadequate) */ - parent_assoc = assoc->usage->fs_assoc_ptr; + if (!job_ptr->prio_factors) + job_ptr->prio_factors = + xmalloc(sizeof(priority_factors_object_t)); + else + memset(job_ptr->prio_factors, 0, + sizeof(priority_factors_object_t)); - if (assoc->usage->shares_norm && - parent_assoc->usage->shares_norm && - parent_assoc->usage->usage_efctv && - assoc->usage->usage_norm) { - ratio_p = (parent_assoc->usage->usage_efctv / - parent_assoc->usage->shares_norm); + qos_ptr = (slurmdb_qos_rec_t *)job_ptr->qos_ptr; - ratio_s = 0; - sib_itr = list_iterator_create( - parent_assoc->usage->children_list); - while ((sibling = list_next(sib_itr))) { - if(sibling->shares_raw != SLURMDB_FS_USE_PARENT) - ratio_s += sibling->usage->usage_norm; - } - list_iterator_destroy(sib_itr); - ratio_s /= parent_assoc->usage->shares_norm; + if (weight_age) { + uint32_t diff = 0; + time_t use_time; - ratio_l = (assoc->usage->usage_norm / - assoc->usage->shares_norm) / ratio_s; -#if defined(__FreeBSD__) - if (!ratio_p || !ratio_l - || log(ratio_p) * log(ratio_l) >= 0) { - k = 1; - } else { - k = 1 / (1 + pow(f * log(ratio_p), 2)); - } + if (flags & PRIORITY_FLAGS_ACCRUE_ALWAYS) + use_time = job_ptr->details->submit_time; + else + use_time = job_ptr->details->begin_time; - assoc->usage->usage_efctv = - ratio_p * pow(ratio_l, k) * - assoc->usage->shares_norm; -#else - if (!ratio_p || !ratio_l - || logl(ratio_p) * logl(ratio_l) >= 0) { - k = 1; - } else { - k = 1 / (1 + powl(f * logl(ratio_p), 2)); + /* Only really add an age priority if the use_time is + past the start_time. + */ + if (start_time > use_time) + diff = start_time - use_time; + + if (job_ptr->details->begin_time + || (flags & PRIORITY_FLAGS_ACCRUE_ALWAYS)) { + if (diff < max_age) { + job_ptr->prio_factors->priority_age = + (double)diff / (double)max_age; + } else + job_ptr->prio_factors->priority_age = 1.0; } + } - assoc->usage->usage_efctv = - ratio_p * pow(ratio_l, k) * - assoc->usage->shares_norm; -#endif + if (job_ptr->assoc_ptr && weight_fs) { + job_ptr->prio_factors->priority_fs = + _get_fairshare_priority(job_ptr); + } - if (priority_debug) { - info("Effective usage for %s %s off %s(%s) " - "(%Lf * %Lf ^ %Lf) * %f = %Lf", - child, child_str, - assoc->usage->parent_assoc_ptr->acct, - assoc->usage->fs_assoc_ptr->acct, - ratio_p, ratio_l, k, - assoc->usage->shares_norm, - assoc->usage->usage_efctv); - } - } else { - assoc->usage->usage_efctv = assoc->usage->usage_norm; - if (priority_debug) { - info("Effective usage for %s %s off %s(%s) %Lf", - child, child_str, - assoc->usage->parent_assoc_ptr->acct, - assoc->usage->fs_assoc_ptr->acct, - assoc->usage->usage_efctv); + if (weight_js) { + uint32_t cpu_cnt = 0, min_nodes = 1; + /* On the initial run of this we don't have total_cpus + so go off the requesting. After the first shot + total_cpus should be filled in. + */ + if (job_ptr->total_cpus) + cpu_cnt = job_ptr->total_cpus; + else if (job_ptr->details + && (job_ptr->details->max_cpus != NO_VAL)) + cpu_cnt = job_ptr->details->max_cpus; + else if (job_ptr->details && job_ptr->details->min_cpus) + cpu_cnt = job_ptr->details->min_cpus; + if (job_ptr->details) + min_nodes = job_ptr->details->min_nodes; + + if (flags & PRIORITY_FLAGS_SIZE_RELATIVE) { + uint32_t time_limit = 1; + /* Job size in CPUs (based upon average CPUs/Node */ + job_ptr->prio_factors->priority_js = + (double)min_nodes * + (double)cluster_cpus / + (double)node_record_count; + if (cpu_cnt > job_ptr->prio_factors->priority_js) { + job_ptr->prio_factors->priority_js = + (double)cpu_cnt; + } + /* Divide by job time limit */ + if (job_ptr->time_limit != NO_VAL) + time_limit = job_ptr->time_limit; + else if (job_ptr->part_ptr) + time_limit = job_ptr->part_ptr->max_time; + job_ptr->prio_factors->priority_js /= time_limit; + /* Normalize to max value of 1.0 */ + job_ptr->prio_factors->priority_js /= cluster_cpus; + if (favor_small) { + job_ptr->prio_factors->priority_js = + (double) 1.0 - + job_ptr->prio_factors->priority_js; + } + } else if (favor_small) { + job_ptr->prio_factors->priority_js = + (double)(node_record_count - min_nodes) + / (double)node_record_count; + if (cpu_cnt) { + job_ptr->prio_factors->priority_js += + (double)(cluster_cpus - cpu_cnt) + / (double)cluster_cpus; + job_ptr->prio_factors->priority_js /= 2; + } + } else { /* favor large */ + job_ptr->prio_factors->priority_js = + (double)min_nodes / (double)node_record_count; + if (cpu_cnt) { + job_ptr->prio_factors->priority_js += + (double)cpu_cnt / (double)cluster_cpus; + job_ptr->prio_factors->priority_js /= 2; + } } + if (job_ptr->prio_factors->priority_js < .0) + job_ptr->prio_factors->priority_js = 0.0; + else if (job_ptr->prio_factors->priority_js > 1.0) + job_ptr->prio_factors->priority_js = 1.0; } -} + if (job_ptr->part_ptr && job_ptr->part_ptr->priority && weight_part) { + job_ptr->prio_factors->priority_part = + job_ptr->part_ptr->norm_priority; + } -static void _set_usage_efctv(slurmdb_association_rec_t *assoc) -{ - /* Variable names taken from HTML documentation */ - long double ua_child = assoc->usage->usage_norm; - long double ue_parent = - assoc->usage->fs_assoc_ptr->usage->usage_efctv; - uint32_t s_child = assoc->shares_raw; - uint32_t s_all_siblings = assoc->usage->level_shares; + if (qos_ptr && qos_ptr->priority && weight_qos) { + job_ptr->prio_factors->priority_qos = + qos_ptr->usage->norm_priority; + } - assoc->usage->usage_efctv = ua_child + - (ue_parent - ua_child) * - (s_child / (long double) s_all_siblings); + if (job_ptr->details) + job_ptr->prio_factors->nice = job_ptr->details->nice; + else + job_ptr->prio_factors->nice = NICE_OFFSET; } - -- GitLab