diff --git a/META b/META index c6f6426e3e38235ae7759f20dc5545910e0f1c95..9dc5a72f47bd884fe5a1368d3a421daea0557b9e 100644 --- a/META +++ b/META @@ -11,7 +11,7 @@ Minor: 6 Micro: 0 Version: 2.6.0 - Release: 0pre4 + Release: 0pre4 ## # When changing API_CURRENT update src/common/slurm_protocol_common.h diff --git a/NEWS b/NEWS index 532078f6867001e0c437a81ef9b49e3ecf828437..63216d201276839903771bff48f334a238ea9c62 100644 --- a/NEWS +++ b/NEWS @@ -166,6 +166,10 @@ documents those changes that are of interest to users and admins. progressing, holding job #" -- checkpoint/blcr - Reset max_nodes from zero to NO_VAL on job restart. -- launch/poe - Fix for hostlist file support with repeated host names. +<<<<<<< HEAD +======= + -- priority/multifactor2 - Prevent possible divide by zero. +>>>>>>> slurm-2.5 * Changes in Slurm 2.5.6 ======================== diff --git a/src/plugins/priority/multifactor/priority_multifactor.c b/src/plugins/priority/multifactor/priority_multifactor.c index 39d0b1867834fc85f2561637113246891de0b001..dc21918180c172269b0d021e4ccedc22eadbf43a 100644 --- a/src/plugins/priority/multifactor/priority_multifactor.c +++ b/src/plugins/priority/multifactor/priority_multifactor.c @@ -390,21 +390,25 @@ static int _write_last_decay_ran(time_t last_ran, time_t last_reset) /* Set the effective usage of a node. */ static void _set_usage_efctv(slurmdb_association_rec_t *assoc) { - if ((assoc->shares_raw == SLURMDB_FS_USE_PARENT) - && assoc->usage->parent_assoc_ptr) { - assoc->usage->shares_norm = - assoc->usage->parent_assoc_ptr->usage->shares_norm; - assoc->usage->usage_norm = - assoc->usage->parent_assoc_ptr->usage->usage_norm; - } + long double min_shares_norm; - if (assoc->usage->usage_norm > MIN_USAGE_FACTOR - * (assoc->shares_raw / assoc->usage->level_shares)) { + if ((assoc->shares_raw == SLURMDB_FS_USE_PARENT) + && assoc->usage->parent_assoc_ptr) { + assoc->usage->shares_norm = + assoc->usage->parent_assoc_ptr->usage->shares_norm; + assoc->usage->usage_norm = + assoc->usage->parent_assoc_ptr->usage->usage_norm; + } + + if (assoc->usage->level_shares) { + min_shares_norm = (long double) MIN_USAGE_FACTOR + * assoc->shares_raw / assoc->usage->level_shares; + if (assoc->usage->usage_norm > min_shares_norm) + assoc->usage->usage_efctv = assoc->usage->usage_norm; + else + assoc->usage->usage_efctv = min_shares_norm; + } else assoc->usage->usage_efctv = assoc->usage->usage_norm; - } else { - assoc->usage->usage_efctv = MIN_USAGE_FACTOR - * (assoc->shares_raw / assoc->usage->level_shares); - } } @@ -1655,7 +1659,8 @@ extern double priority_p_calc_fs_factor(long double usage_efctv, { double priority_fs = 0.0; - xassert(!fuzzy_equal(usage_efctv, NO_VAL)); + if (fuzzy_equal(usage_efctv, NO_VAL)) + return priority_fs; if (shares_norm <= 0) return priority_fs; diff --git a/src/plugins/priority/multifactor2/priority_multifactor2.c b/src/plugins/priority/multifactor2/priority_multifactor2.c new file mode 100644 index 0000000000000000000000000000000000000000..da4d0c80ea124a75a0ae212f4b2b5e18512de541 --- /dev/null +++ b/src/plugins/priority/multifactor2/priority_multifactor2.c @@ -0,0 +1,1569 @@ +/*****************************************************************************\ + * priority_multifactor2.c - slurm multifactor priority plugin version 2. + ***************************************************************************** + * Copyright (C) 2012 Aalto University + * Written by Janne Blomqvist <janne.blomqvist@aalto.fi> + * + * Based on priority_multifactor.c, whose copyright information is + * reproduced below: + * + * Copyright (C) 2008-2009 Lawrence Livermore National Security. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Danny Auble <da@llnl.gov> + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.schedmd.com/slurmdocs/>. + * Please also read the included file: DISCLAIMER. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +\*****************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#if HAVE_STDINT_H +# include <stdint.h> +#endif +#if HAVE_INTTYPES_H +# include <inttypes.h> +#endif +#ifdef WITH_PTHREADS +# include <pthread.h> +#endif /* WITH_PTHREADS */ + +#include <sys/stat.h> +#include <stdio.h> +#include <fcntl.h> + +#include <math.h> +#include "slurm/slurm_errno.h" + +#include "src/common/slurm_priority.h" +#include "src/common/xstring.h" +#include "src/common/assoc_mgr.h" +#include "src/common/parse_time.h" + +#include "src/slurmctld/locks.h" + +#define SECS_PER_DAY (24 * 60 * 60) +#define SECS_PER_WEEK (7 * SECS_PER_DAY) + +#define MIN_USAGE_FACTOR 0.01 + +/* These are defined here so when we link with something other than + * the slurmctld we will have these symbols defined. They will get + * overwritten when linking with the slurmctld. + */ +#if defined (__APPLE__) +void *acct_db_conn __attribute__((weak_import)) = NULL; +uint32_t cluster_cpus __attribute__((weak_import)) = NO_VAL; +List job_list __attribute__((weak_import)) = NULL; +time_t last_job_update __attribute__((weak_import)); +#else +void *acct_db_conn = NULL; +uint32_t cluster_cpus = NO_VAL; +List job_list = NULL; +time_t last_job_update; +#endif + +/* + * These variables are required by the generic plugin interface. If they + * are not found in the plugin, the plugin loader will ignore it. + * + * plugin_name - a string giving a human-readable description of the + * plugin. There is no maximum length, but the symbol must refer to + * a valid string. + * + * plugin_type - a string suggesting the type of the plugin or its + * applicability to a particular form of data or method of data handling. + * If the low-level plugin API is used, the contents of this string are + * unimportant and may be anything. SLURM uses the higher-level plugin + * interface which requires this string to be of the form + * + * <application>/<method> + * + * where <application> is a description of the intended application of + * the plugin (e.g., "jobcomp" for SLURM job completion logging) and <method> + * is a description of how this plugin satisfies that application. SLURM will + * only load job completion logging plugins if the plugin_type string has a + * prefix of "jobcomp/". + * + * plugin_version - an unsigned 32-bit integer giving the version number + * of the plugin. If major and minor revisions are desired, the major + * version number may be multiplied by a suitable magnitude constant such + * as 100 or 1000. Various SLURM versions will likely require a certain + * minimum version for their plugins as the job completion logging API + * matures. + */ +const char plugin_name[] = "Priority MULTIFACTOR 2 plugin"; +const char plugin_type[] = "priority/multifactor2"; +const uint32_t plugin_version = 100; + +static pthread_t decay_handler_thread; +static pthread_mutex_t decay_lock = PTHREAD_MUTEX_INITIALIZER; +static bool running_decay = 0, reconfig = 0, + calc_fairshare = 1, priority_debug = 0; +static bool favor_small; /* favor small jobs over large */ +static uint32_t max_age; /* time when not to add any more + * priority to a job if reached */ +static uint32_t weight_age; /* weight for age factor */ +static uint32_t weight_fs; /* weight for Fairshare factor */ +static uint32_t weight_js; /* weight for Job Size factor */ +static uint32_t weight_part; /* weight for Partition factor */ +static uint32_t weight_qos; /* weight for QOS factor */ +static uint32_t flags; /* Priority Flags */ + +static uint32_t max_tickets; /* Maximum number of tickets given to a + * user. Protected by assoc_mgr lock. */ + +extern void priority_p_set_assoc_usage(slurmdb_association_rec_t *assoc); +extern double priority_p_calc_fs_factor(long double usage_efctv, + long double shares_norm); + +/* + * apply decay factor to all associations usage_raw + * IN: decay_factor - decay to be applied to each associations' used + * shares. This should already be modified with the amount of delta + * time from last application.. + * RET: SLURM_SUCCESS on SUCCESS, SLURM_ERROR else. + */ +static int _apply_decay(double decay_factor) +{ + ListIterator itr = NULL; + slurmdb_association_rec_t *assoc = NULL; + slurmdb_qos_rec_t *qos = NULL; + assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK, + WRITE_LOCK, NO_LOCK, NO_LOCK }; + + /* continue if decay_factor is 0 or 1 since that doesn't help + * us at all. 1 means no decay and 0 will just zero + * everything out so don't waste time doing it */ + if (!decay_factor) + return SLURM_ERROR; + else if (!calc_fairshare) + return SLURM_SUCCESS; + + assoc_mgr_lock(&locks); + + xassert(assoc_mgr_association_list); + xassert(assoc_mgr_qos_list); + + itr = list_iterator_create(assoc_mgr_association_list); + if (!itr) + fatal("list_iterator_create: malloc failure"); + /* We want to do this to all associations including + * root. All usage_raws are calculated from the bottom up. */ + while ((assoc = list_next(itr))) { + assoc->usage->usage_raw *= decay_factor; + assoc->usage->grp_used_wall *= decay_factor; + } + list_iterator_destroy(itr); + + itr = list_iterator_create(assoc_mgr_qos_list); + if (!itr) + fatal("list_iterator_create: malloc failure"); + while ((qos = list_next(itr))) { + qos->usage->usage_raw *= decay_factor; + qos->usage->grp_used_wall *= decay_factor; + } + list_iterator_destroy(itr); + assoc_mgr_unlock(&locks); + + return SLURM_SUCCESS; +} + +/* + * reset usage_raw, and grp_used_wall on all associations + * This should be called every PriorityUsageResetPeriod + * RET: SLURM_SUCCESS on SUCCESS, SLURM_ERROR else. + */ +static int _reset_usage(void) +{ + ListIterator itr = NULL; + slurmdb_association_rec_t *assoc = NULL; + slurmdb_qos_rec_t *qos = NULL; + assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK, + WRITE_LOCK, NO_LOCK, NO_LOCK }; + + if (!calc_fairshare) + return SLURM_SUCCESS; + + assoc_mgr_lock(&locks); + + xassert(assoc_mgr_association_list); + + itr = list_iterator_create(assoc_mgr_association_list); + if (!itr) + fatal("list_iterator_create: malloc failure"); + /* We want to do this to all associations including + * root. All usage_raws are calculated from the bottom up. */ + while ((assoc = list_next(itr))) { + assoc->usage->usage_raw = 0; + assoc->usage->grp_used_wall = 0; + } + list_iterator_destroy(itr); + + itr = list_iterator_create(assoc_mgr_qos_list); + while ((qos = list_next(itr))) { + qos->usage->usage_raw = 0; + qos->usage->grp_used_wall = 0; + } + list_iterator_destroy(itr); + assoc_mgr_unlock(&locks); + + return SLURM_SUCCESS; +} + +static void _read_last_decay_ran(time_t *last_ran, time_t *last_reset) +{ + int data_allocated, data_read = 0; + uint32_t data_size = 0; + int state_fd; + char *data = NULL, *state_file; + Buf buffer; + + xassert(last_ran); + xassert(last_reset); + + (*last_ran) = 0; + (*last_reset) = 0; + + /* read the file */ + state_file = xstrdup(slurmctld_conf.state_save_location); + xstrcat(state_file, "/priority_last_decay_ran"); + lock_state_files(); + state_fd = open(state_file, O_RDONLY); + if (state_fd < 0) { + info("No last decay (%s) to recover", state_file); + unlock_state_files(); + return; + } else { + data_allocated = BUF_SIZE; + data = xmalloc(data_allocated); + while (1) { + data_read = read(state_fd, &data[data_size], + BUF_SIZE); + if (data_read < 0) { + if (errno == EINTR) + continue; + else { + error("Read error on %s: %m", + state_file); + break; + } + } else if (data_read == 0) /* eof */ + break; + data_size += data_read; + data_allocated += data_read; + xrealloc(data, data_allocated); + } + close(state_fd); + } + xfree(state_file); + unlock_state_files(); + + buffer = create_buf(data, data_size); + safe_unpack_time(last_ran, buffer); + safe_unpack_time(last_reset, buffer); + free_buf(buffer); + if (priority_debug) + info("Last ran decay on jobs at %ld", (long)*last_ran); + + return; + +unpack_error: + error("Incomplete priority last decay file returning"); + free_buf(buffer); + return; + +} + +static int _write_last_decay_ran(time_t last_ran, time_t last_reset) +{ + /* Save high-water mark to avoid buffer growth with copies */ + static int high_buffer_size = BUF_SIZE; + int error_code = SLURM_SUCCESS; + int state_fd; + char *old_file, *new_file, *state_file; + Buf buffer; + + if (!strcmp(slurmctld_conf.state_save_location, "/dev/null")) { + error("Can not save priority state information, " + "StateSaveLocation is /dev/null"); + return error_code; + } + + buffer = init_buf(high_buffer_size); + pack_time(last_ran, buffer); + pack_time(last_reset, buffer); + + /* read the file */ + old_file = xstrdup(slurmctld_conf.state_save_location); + xstrcat(old_file, "/priority_last_decay_ran.old"); + state_file = xstrdup(slurmctld_conf.state_save_location); + xstrcat(state_file, "/priority_last_decay_ran"); + new_file = xstrdup(slurmctld_conf.state_save_location); + xstrcat(new_file, "/priority_last_decay_ran.new"); + + lock_state_files(); + state_fd = creat(new_file, 0600); + if (state_fd < 0) { + error("Can't save decay state, create file %s error %m", + new_file); + error_code = errno; + } else { + int pos = 0, nwrite = get_buf_offset(buffer), amount; + char *data = (char *)get_buf_data(buffer); + high_buffer_size = MAX(nwrite, high_buffer_size); + while (nwrite > 0) { + amount = write(state_fd, &data[pos], nwrite); + if ((amount < 0) && (errno != EINTR)) { + error("Error writing file %s, %m", new_file); + error_code = errno; + break; + } + nwrite -= amount; + pos += amount; + } + fsync(state_fd); + close(state_fd); + } + + if (error_code != SLURM_SUCCESS) + (void) unlink(new_file); + else { /* file shuffle */ + (void) unlink(old_file); + if (link(state_file, old_file)) + debug3("unable to create link for %s -> %s: %m", + state_file, old_file); + (void) unlink(state_file); + if (link(new_file, state_file)) + debug3("unable to create link for %s -> %s: %m", + new_file, state_file); + (void) unlink(new_file); + } + xfree(old_file); + xfree(state_file); + xfree(new_file); + + unlock_state_files(); + debug4("done writing time %ld", (long)last_ran); + free_buf(buffer); + + return error_code; +} + + +/* Set the effective usage of a node. */ +static void _set_usage_efctv(slurmdb_association_rec_t *assoc) +{ + long double min_shares_norm; + + if ((assoc->shares_raw == SLURMDB_FS_USE_PARENT) + && assoc->usage->parent_assoc_ptr) { + assoc->usage->shares_norm = + assoc->usage->parent_assoc_ptr->usage->shares_norm; + assoc->usage->usage_norm = + assoc->usage->parent_assoc_ptr->usage->usage_norm; + } + + if (assoc->usage->level_shares) { + min_shares_norm = (long double) MIN_USAGE_FACTOR + * assoc->shares_raw / assoc->usage->level_shares; + if (assoc->usage->usage_norm > min_shares_norm) + assoc->usage->usage_efctv = assoc->usage->usage_norm; + else + assoc->usage->usage_efctv = min_shares_norm; + } else + assoc->usage->usage_efctv = assoc->usage->usage_norm; +} + + +/* This should initially get the childern list from + * assoc_mgr_root_assoc. Since our algorythm goes from top down we + * calculate all the non-user associations now. When a user submits a + * job, that norm_fairshare is calculated. Here we will set the + * usage_efctv to NO_VAL for users to not have to calculate a bunch + * of things that will never be used. + * + * NOTE: acct_mgr_association_lock must be locked before this is called. + */ +static int _set_children_usage_efctv(List childern_list) +{ + slurmdb_association_rec_t *assoc = NULL; + ListIterator itr = NULL; + + if (!childern_list || !list_count(childern_list)) + return SLURM_SUCCESS; + + itr = list_iterator_create(childern_list); + if (!itr) + fatal("list_iterator_create: malloc failure"); + while ((assoc = list_next(itr))) { + if (assoc->user) { + assoc->usage->usage_efctv = (long double)NO_VAL; + continue; + } + priority_p_set_assoc_usage(assoc); + _set_children_usage_efctv(assoc->usage->childern_list); + } + list_iterator_destroy(itr); + return SLURM_SUCCESS; +} + + +/* Distribute the tickets to child nodes recursively. + * + * NOTE: acct_mgr_association_lock must be locked before this is called. + */ +static int _distribute_tickets(List childern_list, uint32_t tickets) +{ + ListIterator itr; + slurmdb_association_rec_t *assoc; + double sfsum = 0, fs; + + if (!childern_list || !list_count(childern_list)) + return SLURM_SUCCESS; + + itr = list_iterator_create(childern_list); + if (!itr) + fatal("list_iterator_create: malloc failure"); + while ((assoc = list_next(itr))) { + if (assoc->usage->active_seqno + != assoc_mgr_root_assoc->usage->active_seqno) + continue; + if (fuzzy_equal(assoc->usage->usage_efctv, NO_VAL)) + priority_p_set_assoc_usage(assoc); + fs = priority_p_calc_fs_factor(assoc->usage->usage_efctv, + assoc->usage->shares_norm); + sfsum += assoc->usage->shares_norm * fs; + } + list_iterator_destroy(itr); + + itr = list_iterator_create(childern_list); + if (!itr) + fatal("list_iterator_create: malloc failure"); + while ((assoc = list_next(itr))) { + if (assoc->usage->active_seqno + != assoc_mgr_root_assoc->usage->active_seqno) + continue; + fs = priority_p_calc_fs_factor(assoc->usage->usage_efctv, + assoc->usage->shares_norm); + assoc->usage->tickets = tickets * assoc->usage->shares_norm + * fs / sfsum; + if (priority_debug) { + if (assoc->user) + info("User %s in account %s gets %u tickets", + assoc->user, assoc->acct, + assoc->usage->tickets); + else + info("Account %s gets %u tickets", + assoc->acct, assoc->usage->tickets); + } + if (assoc->user && assoc->usage->tickets > max_tickets) + max_tickets = assoc->usage->tickets; + _distribute_tickets(assoc->usage->childern_list, + assoc->usage->tickets); + } + list_iterator_destroy(itr); + + return SLURM_SUCCESS; +} + + +/* job_ptr should already have the partition priority and such added + * here before had we will be adding to it + */ +static double _get_fairshare_priority( struct job_record *job_ptr) +{ + slurmdb_association_rec_t *job_assoc = + (slurmdb_association_rec_t *)job_ptr->assoc_ptr; + slurmdb_association_rec_t *fs_assoc = NULL; + double priority_fs = 0.0; + assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, + NO_LOCK, NO_LOCK, NO_LOCK }; + + if (!calc_fairshare) + return 0; + + if (!job_assoc) { + error("Job %u has no association. Unable to " + "compute fairshare.", job_ptr->job_id); + return 0; + } + + fs_assoc = job_assoc; + + assoc_mgr_lock(&locks); + + /* Use values from parent when FairShare=SLURMDB_FS_USE_PARENT */ + while ((fs_assoc->shares_raw == SLURMDB_FS_USE_PARENT) + && fs_assoc->usage->parent_assoc_ptr + && (fs_assoc != assoc_mgr_root_assoc)) { + fs_assoc = fs_assoc->usage->parent_assoc_ptr; + } + + if (fuzzy_equal(fs_assoc->usage->usage_efctv, NO_VAL)) + priority_p_set_assoc_usage(fs_assoc); + + /* Priority is 0 -> 1 */ + if (fs_assoc->usage->active_seqno + == assoc_mgr_root_assoc->usage->active_seqno && max_tickets) + priority_fs = (double) fs_assoc->usage->tickets / max_tickets; + else + priority_fs = 0; + if (priority_debug) { + info("Fairshare priority of job %u for user %s in acct" + " %s is %f", + job_ptr->job_id, job_assoc->user, job_assoc->acct, + priority_fs); + } + + assoc_mgr_unlock(&locks); + + return priority_fs; +} + +static void _get_priority_factors(time_t start_time, struct job_record *job_ptr) +{ + slurmdb_qos_rec_t *qos_ptr = NULL; + + xassert(job_ptr); + + if (!job_ptr->prio_factors) { + job_ptr->prio_factors = + xmalloc(sizeof(priority_factors_object_t)); + } else { + memset(job_ptr->prio_factors, 0, + sizeof(priority_factors_object_t)); + } + + qos_ptr = (slurmdb_qos_rec_t *)job_ptr->qos_ptr; + + if (weight_age) { + uint32_t diff = 0; + time_t use_time; + + if (flags & PRIORITY_FLAGS_ACCRUE_ALWAYS) + use_time = job_ptr->details->submit_time; + else + use_time = job_ptr->details->begin_time; + + /* Only really add an age priority if the use_time is + * past the start_time. */ + if (start_time > use_time) + diff = start_time - use_time; + + if (job_ptr->details->begin_time) { + if (diff < max_age) { + job_ptr->prio_factors->priority_age = + (double)diff / (double)max_age; + } else + job_ptr->prio_factors->priority_age = 1.0; + } else if (flags & PRIORITY_FLAGS_ACCRUE_ALWAYS) { + if (diff < max_age) { + job_ptr->prio_factors->priority_age = + (double)diff / (double)max_age; + } else + job_ptr->prio_factors->priority_age = 1.0; + } + } + + if (job_ptr->assoc_ptr && weight_fs) { + job_ptr->prio_factors->priority_fs = + _get_fairshare_priority(job_ptr); + } + + if (weight_js) { + uint32_t cpu_cnt = 0; + /* On the initial run of this we don't have total_cpus + * so go off the requesting. After the first shot + * total_cpus should be filled in. */ + if (job_ptr->total_cpus) + cpu_cnt = job_ptr->total_cpus; + else if (job_ptr->details + && (job_ptr->details->max_cpus != NO_VAL)) + cpu_cnt = job_ptr->details->max_cpus; + else if (job_ptr->details && job_ptr->details->min_cpus) + cpu_cnt = job_ptr->details->min_cpus; + + if (favor_small) { + job_ptr->prio_factors->priority_js = + (double)(node_record_count + - job_ptr->details->min_nodes) + / (double)node_record_count; + if (cpu_cnt) { + job_ptr->prio_factors->priority_js += + (double)(cluster_cpus - cpu_cnt) + / (double)cluster_cpus; + job_ptr->prio_factors->priority_js /= 2; + } + } else { + job_ptr->prio_factors->priority_js = + (double)job_ptr->details->min_nodes + / (double)node_record_count; + if (cpu_cnt) { + job_ptr->prio_factors->priority_js += + (double)cpu_cnt / (double)cluster_cpus; + job_ptr->prio_factors->priority_js /= 2; + } + } + if (job_ptr->prio_factors->priority_js < .0) + job_ptr->prio_factors->priority_js = 0.0; + else if (job_ptr->prio_factors->priority_js > 1.0) + job_ptr->prio_factors->priority_js = 1.0; + } + + if (job_ptr->part_ptr && job_ptr->part_ptr->priority && weight_part) { + job_ptr->prio_factors->priority_part = + job_ptr->part_ptr->norm_priority; + } + + if (qos_ptr && qos_ptr->priority && weight_qos) { + job_ptr->prio_factors->priority_qos = + qos_ptr->usage->norm_priority; + } + + job_ptr->prio_factors->nice = job_ptr->details->nice; +} + +static uint32_t _get_priority_internal(time_t start_time, + struct job_record *job_ptr) +{ + double priority = 0.0; + priority_factors_object_t pre_factors; + + if (job_ptr->direct_set_prio && (job_ptr->priority > 0)) + return job_ptr->priority; + + if (!job_ptr->details) { + error("_get_priority_internal: job %u does not have a " + "details symbol set, can't set priority", + job_ptr->job_id); + return 0; + } + + /* figure out the priority */ + _get_priority_factors(start_time, job_ptr); + memcpy(&pre_factors, job_ptr->prio_factors, + sizeof(priority_factors_object_t)); + + job_ptr->prio_factors->priority_age *= (double)weight_age; + job_ptr->prio_factors->priority_fs *= (double)weight_fs; + job_ptr->prio_factors->priority_js *= (double)weight_js; + job_ptr->prio_factors->priority_part *= (double)weight_part; + job_ptr->prio_factors->priority_qos *= (double)weight_qos; + + priority = job_ptr->prio_factors->priority_age + + job_ptr->prio_factors->priority_fs + + job_ptr->prio_factors->priority_js + + job_ptr->prio_factors->priority_part + + job_ptr->prio_factors->priority_qos + - (double)(job_ptr->prio_factors->nice - NICE_OFFSET); + + /* Priority 0 is reserved for held jobs */ + if (priority < 1) + priority = 1; + + if (priority_debug) { + info("Weighted Age priority is %f * %u = %.2f", + pre_factors.priority_age, weight_age, + job_ptr->prio_factors->priority_age); + info("Weighted Fairshare priority is %f * %u = %.2f", + pre_factors.priority_fs, weight_fs, + job_ptr->prio_factors->priority_fs); + info("Weighted JobSize priority is %f * %u = %.2f", + pre_factors.priority_js, weight_js, + job_ptr->prio_factors->priority_js); + info("Weighted Partition priority is %f * %u = %.2f", + pre_factors.priority_part, weight_part, + job_ptr->prio_factors->priority_part); + info("Weighted QOS priority is %f * %u = %.2f", + pre_factors.priority_qos, weight_qos, + job_ptr->prio_factors->priority_qos); + info("Job %u priority: %.2f + %.2f + %.2f + %.2f + %.2f - %d " + "= %.2f", + job_ptr->job_id, job_ptr->prio_factors->priority_age, + job_ptr->prio_factors->priority_fs, + job_ptr->prio_factors->priority_js, + job_ptr->prio_factors->priority_part, + job_ptr->prio_factors->priority_qos, + (job_ptr->prio_factors->nice - NICE_OFFSET), + priority); + } + return (uint32_t)priority; +} + + +/* Mark an association and its parents as active (i.e. it may be given + * tickets) during the current scheduling cycle. The association + * manager lock should be held on entry. */ +static bool _mark_assoc_active(struct job_record *job_ptr) +{ + slurmdb_association_rec_t *job_assoc = + (slurmdb_association_rec_t *)job_ptr->assoc_ptr, + *assoc; + + if (!job_assoc) { + error("Job %u has no association. Unable to " + "mark assiciation as active.", job_ptr->job_id); + return false; + } + + for (assoc = job_assoc; assoc != assoc_mgr_root_assoc; + assoc = assoc->usage->parent_assoc_ptr) { + if (assoc->usage->active_seqno + == assoc_mgr_root_assoc->usage->active_seqno) + break; + assoc->usage->active_seqno + = assoc_mgr_root_assoc->usage->active_seqno; + } + return true; +} + + +/* based upon the last reset time, compute when the next reset should be */ +static time_t _next_reset(uint16_t reset_period, time_t last_reset) +{ + struct tm last_tm; + time_t tmp_time, now = time(NULL); + + if (localtime_r(&last_reset, &last_tm) == NULL) + return (time_t) 0; + + last_tm.tm_sec = 0; + last_tm.tm_min = 0; + last_tm.tm_hour = 0; +/* last_tm.tm_wday = 0 ignored */ +/* last_tm.tm_yday = 0; ignored */ + last_tm.tm_isdst = -1; + switch (reset_period) { + case PRIORITY_RESET_DAILY: + tmp_time = mktime(&last_tm); + tmp_time += SECS_PER_DAY; + while ((tmp_time + SECS_PER_DAY) < now) + tmp_time += SECS_PER_DAY; + return tmp_time; + case PRIORITY_RESET_WEEKLY: + tmp_time = mktime(&last_tm); + tmp_time += (SECS_PER_DAY * (7 - last_tm.tm_wday)); + while ((tmp_time + SECS_PER_WEEK) < now) + tmp_time += SECS_PER_WEEK; + return tmp_time; + case PRIORITY_RESET_MONTHLY: + last_tm.tm_mday = 1; + if (last_tm.tm_mon < 11) + last_tm.tm_mon++; + else { + last_tm.tm_mon = 0; + last_tm.tm_year++; + } + break; + case PRIORITY_RESET_QUARTERLY: + last_tm.tm_mday = 1; + if (last_tm.tm_mon < 3) + last_tm.tm_mon = 3; + else if (last_tm.tm_mon < 6) + last_tm.tm_mon = 6; + else if (last_tm.tm_mon < 9) + last_tm.tm_mon = 9; + else { + last_tm.tm_mon = 0; + last_tm.tm_year++; + } + break; + case PRIORITY_RESET_YEARLY: + last_tm.tm_mday = 1; + last_tm.tm_mon = 0; + last_tm.tm_year++; + break; + default: + return (time_t) 0; + } + return mktime(&last_tm); +} + +/* + * Remove previously used time from qos and assocs grp_used_cpu_run_secs. + * + * When restarting slurmctld acct_policy_job_begin() is called for all + * running jobs. There every jobs total requested cputime (total_cpus * + * time_limit) is added to grp_used_cpu_run_secs of assocs and qos. + * + * This function will subtract all cputime that was used until the + * decay thread last ran. This kludge is necessary as the decay thread + * last_ran variable can't be accessed from acct_policy_job_begin(). + */ +static void _init_grp_used_cpu_run_secs(time_t last_ran) +{ + struct job_record *job_ptr = NULL; + ListIterator itr; + assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK, + WRITE_LOCK, NO_LOCK, NO_LOCK }; + slurmctld_lock_t job_read_lock = + { NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; + uint64_t delta; + slurmdb_qos_rec_t *qos; + slurmdb_association_rec_t *assoc; + + if (priority_debug) + info("Initializing grp_used_cpu_run_secs"); + + if (!(job_list && list_count(job_list))) + return; + + lock_slurmctld(job_read_lock); + itr = list_iterator_create(job_list); + if (itr == NULL) + fatal("list_iterator_create: malloc failure"); + + assoc_mgr_lock(&locks); + while ((job_ptr = list_next(itr))) { + if (priority_debug) + debug2("job: %u",job_ptr->job_id); + qos = NULL; + assoc = NULL; + delta = 0; + + if (!IS_JOB_RUNNING(job_ptr)) + continue; + + if (job_ptr->start_time > last_ran) + continue; + + delta = job_ptr->total_cpus * (last_ran - job_ptr->start_time); + + qos = (slurmdb_qos_rec_t *) job_ptr->qos_ptr; + assoc = (slurmdb_association_rec_t *) job_ptr->assoc_ptr; + + if (qos) { + if (priority_debug) + info("Subtracting %"PRIu64" from qos " + "%u grp_used_cpu_run_secs " + "%"PRIu64" = %"PRIu64"", + delta, + qos->id, + qos->usage->grp_used_cpu_run_secs, + qos->usage->grp_used_cpu_run_secs - + delta); + qos->usage->grp_used_cpu_run_secs -= delta; + } + while (assoc) { + if (priority_debug) + info("Subtracting %"PRIu64" from assoc %u " + "grp_used_cpu_run_secs " + "%"PRIu64" = %"PRIu64"", + delta, + assoc->id, + assoc->usage->grp_used_cpu_run_secs, + assoc->usage->grp_used_cpu_run_secs - + delta); + assoc->usage->grp_used_cpu_run_secs -= delta; + assoc = assoc->usage->parent_assoc_ptr; + } + } + assoc_mgr_unlock(&locks); + list_iterator_destroy(itr); + unlock_slurmctld(job_read_lock); +} + +/* If the job is running then apply decay to the job. + * + * Return 0 if we don't need to process the job any further, 1 if + * futher processing is needed. + */ +static int _apply_new_usage(struct job_record *job_ptr, double decay_factor, + time_t start_period, time_t end_period) +{ + slurmdb_qos_rec_t *qos; + slurmdb_association_rec_t *assoc; + double run_delta = 0.0, run_decay = 0.0, real_decay = 0.0; + uint64_t cpu_run_delta = 0; + uint64_t job_time_limit_ends = 0; + assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK, + WRITE_LOCK, NO_LOCK, NO_LOCK }; + assoc_mgr_lock_t qos_read_lock = { NO_LOCK, NO_LOCK, + READ_LOCK, NO_LOCK, NO_LOCK }; + + /* If usage_factor is 0 just skip this since we don't add the usage. */ + assoc_mgr_lock(&qos_read_lock); + qos = (slurmdb_qos_rec_t *)job_ptr->qos_ptr; + if (qos && !qos->usage_factor) { + assoc_mgr_unlock(&qos_read_lock); + return 0; + } + assoc_mgr_unlock(&qos_read_lock); + + if (job_ptr->start_time > start_period) + start_period = job_ptr->start_time; + + if (job_ptr->end_time + && (end_period > job_ptr->end_time)) + end_period = job_ptr->end_time; + + run_delta = difftime(end_period, start_period); + + /* job already has been accounted for, go to next */ + if (run_delta < 1) + return 0; + + /* cpu_run_delta will is used to + * decrease qos and assocs + * grp_used_cpu_run_secs values. When + * a job is started only seconds until + * start_time+time_limit is added, so + * for jobs running over their + * timelimit we should only subtract + * the used time until the time limit. */ + job_time_limit_ends = + (uint64_t)job_ptr->start_time + + (uint64_t)job_ptr->time_limit * 60; + + if ((uint64_t)start_period >= job_time_limit_ends) + cpu_run_delta = 0; + else if (end_period > job_time_limit_ends) + cpu_run_delta = job_ptr->total_cpus * + (job_time_limit_ends - (uint64_t)start_period); + else + cpu_run_delta = job_ptr->total_cpus * run_delta; + + if (priority_debug) { + info("job %u ran for %g seconds on %u cpus", + job_ptr->job_id, run_delta, job_ptr->total_cpus); + } + + /* get the time in decayed fashion */ + run_decay = run_delta * pow(decay_factor, run_delta); + + real_decay = run_decay * (double)job_ptr->total_cpus; + + assoc_mgr_lock(&locks); + /* Just to make sure we don't make a window where the qos_ptr could of + * changed make sure we get it again here. */ + qos = (slurmdb_qos_rec_t *)job_ptr->qos_ptr; + assoc = (slurmdb_association_rec_t *)job_ptr->assoc_ptr; + + /* now apply the usage factor for this qos */ + if (qos) { + if (qos->usage_factor >= 0) { + real_decay *= qos->usage_factor; + run_decay *= qos->usage_factor; + } + qos->usage->grp_used_wall += run_decay; + qos->usage->usage_raw += (long double)real_decay; + if (qos->usage->grp_used_cpu_run_secs >= cpu_run_delta) { + if (priority_debug) + info("grp_used_cpu_run_secs is %"PRIu64", " + "will subtract %"PRIu64"", + qos->usage->grp_used_cpu_run_secs, + cpu_run_delta); + qos->usage->grp_used_cpu_run_secs -= cpu_run_delta; + } else { + if (priority_debug) + info("jobid %u, qos %s: setting " + "grp_used_cpu_run_secs " + "to 0 because %"PRIu64" < %"PRIu64"", + job_ptr->job_id, qos->name, + qos->usage->grp_used_cpu_run_secs, + cpu_run_delta); + qos->usage->grp_used_cpu_run_secs = 0; + } + } + + /* We want to do this all the way up + * to and including root. This way we + * can keep track of how much usage + * has occured on the entire system + * and use that to normalize against. */ + while (assoc) { + if (assoc->usage->grp_used_cpu_run_secs >= cpu_run_delta) { + if (priority_debug) + info("grp_used_cpu_run_secs is %"PRIu64", " + "will subtract %"PRIu64"", + assoc->usage->grp_used_cpu_run_secs, + cpu_run_delta); + assoc->usage->grp_used_cpu_run_secs -= cpu_run_delta; + } else { + if (priority_debug) + info("jobid %u, assoc %u: setting " + "grp_used_cpu_run_secs " + "to 0 because %"PRIu64" < %"PRIu64"", + job_ptr->job_id, assoc->id, + assoc->usage->grp_used_cpu_run_secs, + cpu_run_delta); + assoc->usage->grp_used_cpu_run_secs = 0; + } + + assoc->usage->grp_used_wall += run_decay; + assoc->usage->usage_raw += (long double)real_decay; + if (priority_debug) + info("adding %f new usage to assoc %u (user='%s' " + "acct='%s') raw usage is now %Lf. Group wall " + "added %f making it %f. GrpCPURunMins is " + "%"PRIu64"", + real_decay, assoc->id, + assoc->user, assoc->acct, + assoc->usage->usage_raw, + run_decay, + assoc->usage->grp_used_wall, + assoc->usage->grp_used_cpu_run_secs/60); + assoc = assoc->usage->parent_assoc_ptr; + } + assoc_mgr_unlock(&locks); + return 1; +} + +static void *_decay_thread(void *no_data) +{ + struct job_record *job_ptr = NULL; + ListIterator itr; + time_t start_time = time(NULL); + time_t last_ran = 0; + time_t last_reset = 0, next_reset = 0; + uint32_t calc_period = slurm_get_priority_calc_period(); + double decay_hl = (double)slurm_get_priority_decay_hl(); + double decay_factor = 1; + uint16_t reset_period = slurm_get_priority_reset_period(); + + /* Write lock on jobs, read lock on nodes and partitions */ + slurmctld_lock_t job_write_lock = + { NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; + slurmctld_lock_t job_read_lock = + { NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; + assoc_mgr_lock_t locks = { WRITE_LOCK, NO_LOCK, + NO_LOCK, NO_LOCK, NO_LOCK }; + + + if (decay_hl > 0) + decay_factor = 1 - (0.693 / decay_hl); + + (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + (void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); + + _read_last_decay_ran(&last_ran, &last_reset); + if (last_reset == 0) + last_reset = start_time; + + _init_grp_used_cpu_run_secs(last_ran); + + while (1) { + time_t now = start_time; + double run_delta = 0.0, real_decay = 0.0; + + slurm_mutex_lock(&decay_lock); + running_decay = 1; + + /* If reconfig is called handle all that happens + * outside of the loop here */ + if (reconfig) { + /* if decay_hl is 0 or less that means no + * decay is to be had. This also means we + * flush the used time at a certain time set + * by PriorityUsageResetPeriod in the slurm.conf */ + calc_period = slurm_get_priority_calc_period(); + reset_period = slurm_get_priority_reset_period(); + next_reset = 0; + decay_hl = (double)slurm_get_priority_decay_hl(); + if (decay_hl > 0) + decay_factor = 1 - (0.693 / decay_hl); + else + decay_factor = 1; + + reconfig = 0; + } + + /* this needs to be done right away so as to + * incorporate it into the decay loop. + */ + switch(reset_period) { + case PRIORITY_RESET_NONE: + break; + case PRIORITY_RESET_NOW: /* do once */ + _reset_usage(); + reset_period = PRIORITY_RESET_NONE; + last_reset = now; + break; + case PRIORITY_RESET_DAILY: + case PRIORITY_RESET_WEEKLY: + case PRIORITY_RESET_MONTHLY: + case PRIORITY_RESET_QUARTERLY: + case PRIORITY_RESET_YEARLY: + if (next_reset == 0) { + next_reset = _next_reset(reset_period, + last_reset); + } + if (now >= next_reset) { + _reset_usage(); + last_reset = next_reset; + next_reset = _next_reset(reset_period, + last_reset); + } + } + + /* now calculate all the normalized usage here */ + assoc_mgr_lock(&locks); + _set_children_usage_efctv( + assoc_mgr_root_assoc->usage->childern_list); + assoc_mgr_unlock(&locks); + + if (!last_ran) + goto calc_tickets; + else + run_delta = difftime(start_time, last_ran); + + if (run_delta <= 0) + goto calc_tickets; + + real_decay = pow(decay_factor, run_delta); + + if (priority_debug) + info("Decay factor over %g seconds goes " + "from %.15f -> %.15f", + run_delta, decay_factor, real_decay); + + /* first apply decay to used time */ + if (_apply_decay(real_decay) != SLURM_SUCCESS) { + error("problem applying decay"); + running_decay = 0; + slurm_mutex_unlock(&decay_lock); + break; + } + + + /* Multifactor2 core algo 1/3. Iterate through all + * jobs, mark parent associations with the current + * sequence id, so that we know which + * associations/users are active. At the same time as + * we're looping through all the jobs anyway, apply + * the new usage of running jobs too. + */ + + calc_tickets: + lock_slurmctld(job_read_lock); + assoc_mgr_lock(&locks); + /* seqno 0 is a special invalid value. */ + assoc_mgr_root_assoc->usage->active_seqno++; + if (!assoc_mgr_root_assoc->usage->active_seqno) + assoc_mgr_root_assoc->usage->active_seqno++; + assoc_mgr_unlock(&locks); + itr = list_iterator_create(job_list); + while ((job_ptr = list_next(itr))) { + /* apply new usage */ + if (!IS_JOB_PENDING(job_ptr) && + job_ptr->start_time && job_ptr->assoc_ptr + && last_ran) + _apply_new_usage(job_ptr, decay_factor, + last_ran, start_time); + + if (IS_JOB_PENDING(job_ptr) && job_ptr->assoc_ptr) { + assoc_mgr_lock(&locks); + _mark_assoc_active(job_ptr); + assoc_mgr_unlock(&locks); + } + } + list_iterator_destroy(itr); + unlock_slurmctld(job_read_lock); + + /* Multifactor2 core algo 2/3. Start from the root, + * distribute tickets to active child associations + * proportional to the fair share (s*F). We start with + * UINT32_MAX tickets at the root. + */ + assoc_mgr_lock(&locks); + max_tickets = 0; + assoc_mgr_root_assoc->usage->tickets = (uint32_t) -1; + _distribute_tickets (assoc_mgr_root_assoc->usage->childern_list, + (uint32_t) -1); + assoc_mgr_unlock(&locks); + + /* Multifactor2 core algo 3/3. Iterate through the job + * list again, give priorities proportional to the + * maximum number of tickets given to any user. + */ + lock_slurmctld(job_write_lock); + itr = list_iterator_create(job_list); + while ((job_ptr = list_next(itr))) { + /* + * Priority 0 is reserved for held jobs. Also skip + * priority calculation for non-pending jobs. + */ + if ((job_ptr->priority == 0) + || !IS_JOB_PENDING(job_ptr)) + continue; + + job_ptr->priority = + _get_priority_internal(start_time, job_ptr); + last_job_update = time(NULL); + debug2("priority for job %u is now %u", + job_ptr->job_id, job_ptr->priority); + } + list_iterator_destroy(itr); + unlock_slurmctld(job_write_lock); + + last_ran = start_time; + + _write_last_decay_ran(last_ran, last_reset); + + running_decay = 0; + slurm_mutex_unlock(&decay_lock); + + /* Sleep until the next time. */ + now = time(NULL); + double elapsed = difftime(now, start_time); + if (elapsed < calc_period) { + sleep(calc_period - elapsed); + start_time = time(NULL); + } else + start_time = now; + /* repeat ;) */ + } + return NULL; +} + +/* Selects the specific jobs that the user wanted to see + * Requests that include job id(s) and user id(s) must match both to be passed. + * Returns 1 if job should be omitted */ +static int _filter_job(struct job_record *job_ptr, List req_job_list, + List req_user_list) +{ + int filter = 0; + ListIterator iterator; + uint32_t *job_id; + uint32_t *user_id; + + if (req_job_list) { + filter = 1; + iterator = list_iterator_create(req_job_list); + while ((job_id = list_next(iterator))) { + if (*job_id == job_ptr->job_id) { + filter = 0; + break; + } + } + list_iterator_destroy(iterator); + if (filter == 1) { + return 1; + } + } + + if (req_user_list) { + filter = 1; + iterator = list_iterator_create(req_user_list); + while ((user_id = list_next(iterator))) { + if (*user_id == job_ptr->user_id) { + filter = 0; + break; + } + } + list_iterator_destroy(iterator); + if (filter == 1) + return 1; + } + + return filter; +} + + +static void _internal_setup(void) +{ + if (slurm_get_debug_flags() & DEBUG_FLAG_PRIO) + priority_debug = 1; + else + priority_debug = 0; + + favor_small = slurm_get_priority_favor_small(); + + max_age = slurm_get_priority_max_age(); + weight_age = slurm_get_priority_weight_age(); + weight_fs = slurm_get_priority_weight_fairshare(); + weight_js = slurm_get_priority_weight_job_size(); + weight_part = slurm_get_priority_weight_partition(); + weight_qos = slurm_get_priority_weight_qos(); + flags = slurmctld_conf.priority_flags; + + if (priority_debug) { + info("priority: Max Age is %u", max_age); + info("priority: Weight Age is %u", weight_age); + info("priority: Weight Fairshare is %u", weight_fs); + info("priority: Weight JobSize is %u", weight_js); + info("priority: Weight Part is %u", weight_part); + info("priority: Weight QOS is %u", weight_qos); + info("priority: Flags is %u", flags); + } +} + +/* + * init() is called when the plugin is loaded, before any other functions + * are called. Put global initialization here. + */ +int init ( void ) +{ + pthread_attr_t thread_attr; + char *temp = NULL; + + /* This means we aren't running from the controller so skip setup. */ + if (cluster_cpus == NO_VAL) + return SLURM_SUCCESS; + + _internal_setup(); + + /* Check to see if we are running a supported accounting plugin */ + temp = slurm_get_accounting_storage_type(); + if (strcasecmp(temp, "accounting_storage/slurmdbd") + && strcasecmp(temp, "accounting_storage/mysql")) { + error("You are not running a supported " + "accounting_storage plugin\n(%s).\n" + "Fairshare can only be calculated with either " + "'accounting_storage/slurmdbd' " + "or 'accounting_storage/mysql' enabled. " + "If you want multifactor priority without fairshare " + "ignore this message.", + temp); + calc_fairshare = 0; + weight_fs = 0; + } else if (assoc_mgr_root_assoc) { + if (!cluster_cpus) + fatal("We need to have a cluster cpu count " + "before we can init the priority/multifactor " + "plugin"); + assoc_mgr_root_assoc->usage->usage_efctv = 1.0; + slurm_attr_init(&thread_attr); + if (pthread_create(&decay_handler_thread, &thread_attr, + _decay_thread, NULL)) + fatal("pthread_create error %m"); + slurm_attr_destroy(&thread_attr); + } else { + if (weight_fs) + fatal("It appears you don't have any association " + "data from your database. " + "The priority/multifactor plugin requires " + "this information to run correctly. Please " + "check your database connection and try again."); + + calc_fairshare = 0; + } + + xfree(temp); + + debug("%s loaded", plugin_name); + return SLURM_SUCCESS; +} + +int fini ( void ) +{ + /* Daemon termination handled here */ + if (running_decay) + debug("Waiting for decay thread to finish."); + + slurm_mutex_lock(&decay_lock); + + /* cancel the decay thread and then join it */ + if (decay_handler_thread) { + pthread_cancel(decay_handler_thread); + pthread_join(decay_handler_thread, NULL); + } + + slurm_mutex_unlock(&decay_lock); + + return SLURM_SUCCESS; +} + +extern uint32_t priority_p_set(uint32_t last_prio, struct job_record *job_ptr) +{ + uint32_t priority = _get_priority_internal(time(NULL), job_ptr); + + debug2("initial priority for job %u is %u", job_ptr->job_id, priority); + + return priority; +} + +extern void priority_p_reconfig(void) +{ + reconfig = 1; + _internal_setup(); + debug2("%s reconfigured", plugin_name); + + return; +} + +extern void priority_p_set_assoc_usage(slurmdb_association_rec_t *assoc) +{ + char *child; + char *child_str; + + xassert(assoc_mgr_root_assoc); + xassert(assoc); + xassert(assoc->usage); + xassert(assoc->usage->parent_assoc_ptr); + + if (assoc->user) { + child = "user"; + child_str = assoc->user; + } else { + child = "account"; + child_str = assoc->acct; + } + + if (assoc_mgr_root_assoc->usage->usage_raw) + assoc->usage->usage_norm = assoc->usage->usage_raw + / assoc_mgr_root_assoc->usage->usage_raw; + else + /* This should only happen when no usage has occured + at all so no big deal, the other usage should be 0 + as well here. + */ + assoc->usage->usage_norm = 0; + + if (priority_debug) + info("Normalized usage for %s %s off %s %Lf / %Lf = %Lf", + child, child_str, assoc->usage->parent_assoc_ptr->acct, + assoc->usage->usage_raw, + assoc_mgr_root_assoc->usage->usage_raw, + assoc->usage->usage_norm); + /* This is needed in case someone changes the half-life on the + * fly and now we have used more time than is available under + * the new config */ + if (assoc->usage->usage_norm > 1.0) + assoc->usage->usage_norm = 1.0; + + if (assoc->usage->parent_assoc_ptr == assoc_mgr_root_assoc) { + assoc->usage->usage_efctv = assoc->usage->usage_norm; + if (priority_debug) + info("Effective usage for %s %s off %s %Lf %Lf", + child, child_str, + assoc->usage->parent_assoc_ptr->acct, + assoc->usage->usage_efctv, + assoc->usage->usage_norm); + } else { + _set_usage_efctv(assoc); + if (priority_debug) { + info("Effective usage for %s %s off %s = %Lf", + child, child_str, + assoc->usage->parent_assoc_ptr->acct, + assoc->usage->usage_efctv); + } + } +} + +extern double priority_p_calc_fs_factor(long double usage_efctv, + long double shares_norm) +{ + double priority_fs; + + if (fuzzy_equal(usage_efctv, NO_VAL)) + return 0.0; + + if (shares_norm > 0.0) { + if (usage_efctv < MIN_USAGE_FACTOR * shares_norm) + usage_efctv = MIN_USAGE_FACTOR * shares_norm; + priority_fs = shares_norm / usage_efctv; + } + else + priority_fs = 0.0; + + return priority_fs; +} + +extern List priority_p_get_priority_factors_list( + priority_factors_request_msg_t *req_msg, uid_t uid) +{ + List req_job_list; + List req_user_list; + List ret_list = NULL; + ListIterator itr; + priority_factors_object_t *obj = NULL; + struct job_record *job_ptr = NULL; + time_t start_time = time(NULL); + + xassert(req_msg); + req_job_list = req_msg->job_id_list; + req_user_list = req_msg->uid_list; + + /* Read lock on jobs, nodes, and partitions */ + slurmctld_lock_t job_read_lock = + { NO_LOCK, READ_LOCK, READ_LOCK, READ_LOCK }; + + if (job_list && list_count(job_list)) { + ret_list = list_create(slurm_destroy_priority_factors_object); + lock_slurmctld(job_read_lock); + itr = list_iterator_create(job_list); + if (itr == NULL) + fatal("list_iterator_create: malloc failure"); + while ((job_ptr = list_next(itr))) { + /* + * We are only looking for pending jobs + */ + if (!IS_JOB_PENDING(job_ptr)) + continue; + + /* + * This means the job is not eligible yet + */ + if (!job_ptr->details->begin_time + || (job_ptr->details->begin_time > start_time)) + continue; + + /* + * 0 means the job is held + */ + if (job_ptr->priority == 0) + continue; + + /* + * Priority has been set elsewhere (e.g. by SlurmUser) + */ + if (job_ptr->direct_set_prio) + continue; + + if (_filter_job(job_ptr, req_job_list, req_user_list)) + continue; + + if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS) + && (job_ptr->user_id != uid) + && !validate_operator(uid) + && !assoc_mgr_is_user_acct_coord( + acct_db_conn, uid, + job_ptr->account)) + continue; + + obj = xmalloc(sizeof(priority_factors_object_t)); + memcpy(obj, job_ptr->prio_factors, + sizeof(priority_factors_object_t)); + obj->job_id = job_ptr->job_id; + obj->user_id = job_ptr->user_id; + list_append(ret_list, obj); + } + list_iterator_destroy(itr); + unlock_slurmctld(job_read_lock); + if (!list_count(ret_list)) { + list_destroy(ret_list); + ret_list = NULL; + } + } + + return ret_list; +}