diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 57da96230a08964da3fc8e98e53ecd2fccb00a04..baebc7575208207ee332deaa2dba808ab9ac6c9a 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -4049,6 +4049,43 @@ Also refer to DenyQos. Partition name of alternate partition to be used if the state of this partition is "DRAIN" or "INACTIVE." +.TP +\fBChargeRate\fR +The ChargeRate is used to define a CPU Equivalent that is multiplied by time +when calculating the usage of a job. Different charge rates may be specified +per resource type; the CPU equivalent is the MAX() of the charges. + +Charge rates are specified as a comma-separated list of +\fIResourceType\fR=\fIResourceChargeRate\fR pairs. The job's quantity of +\fIResourceType\fR is multiplied by the corresponding +\fIResourceChargeRate\fR (floating point). For example, when a job +is allocated 1 CPU core and 8 GB of memory and the partition is configured with +ChargeRate="CPU=1.0,MemGB=0.25,GRES:gpu=2.0", the CPU equivalent is +MAX(1*1.0, 8*0.25, 0*2.0) = 2.0 + +The CPU charge rate defaults to 1.0 and all others to 0.0. Available resource +types include: + +.RS +.TP 10 +\fBCPU\fP +Charge per allocated CPU. Charge rates other than 1.0 can be used to charge +more or less for CPUs in a partition that are considered to be more or less +capable than those in other partitions. 
+.TP +\fBMemGB\fP +Charge per allocated GB of memory +.TP +\fBNode\fP +Charge per allocated node +.TP +\fBGRES:\fR\fI<type>\fR\fB\fP +Charge per allocated GRES of type \fI<type>\fR +.TP +\fBLicense:\fR\fI<type>\fR\fB\fP +Charge per allocated license of type \fI<type>\fR +.RE + .TP \fBDefault\fR If this keyword is set, jobs submitted without a partition diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 3e3eea34b8cb32e8c5c5aa09005359e3e0afd635..f234e8240b3d30e75bdfe98f371e5bad55f89106 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -1403,6 +1403,7 @@ typedef struct job_info { uint16_t contiguous; /* 1 if job requires contiguous nodes */ uint16_t core_spec; /* specialized core count */ uint16_t cores_per_socket; /* cores per socket required by job */ + double cpu_equiv; /* cpu equivalents cache. updated upon resize */ uint16_t cpus_per_task; /* number of processors required for * each task */ uint32_t cpu_freq_min; /* Minimum cpu frequency */ diff --git a/src/common/read_config.c b/src/common/read_config.c index 7ed7d85bed28985f679dbf91e4b3778458082a27..5d369875b23eb13bf2c703222902f9cf8bc56494 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -1044,6 +1044,7 @@ static int _parse_partitionname(void **dest, slurm_parser_enum_t type, {"AllowGroups", S_P_STRING}, {"AllowQos", S_P_STRING}, {"Alternate", S_P_STRING}, + {"ChargeRate", S_P_STRING}, {"DefMemPerCPU", S_P_UINT32}, {"DefMemPerNode", S_P_UINT32}, {"Default", S_P_BOOLEAN}, /* YES or NO */ @@ -1137,6 +1138,10 @@ static int _parse_partitionname(void **dest, slurm_parser_enum_t type, if (!s_p_get_string(&p->alternate, "Alternate", tbl)) s_p_get_string(&p->alternate, "Alternate", dflt); + if (!s_p_get_string(&p->charge_rate, "ChargeRate", tbl) && + !s_p_get_string(&p->charge_rate, "ChargeRate", dflt)) + xfree(p->charge_rate); + if (!s_p_get_boolean(&p->default_flag, "Default", tbl) && !s_p_get_boolean(&p->default_flag, "Default", dflt)) p->default_flag = false; @@ -4855,6 +4860,60 @@ 
extern int sort_key_pairs(void *v1, void *v2) return 0; } + +extern void destroy_config_key_double_pair(void *object) +{ + config_key_double_pair_t *pair = object; + + if (!pair) + return; + /* release the name before the record that holds it */ + xfree(pair->name); + xfree(pair); +} + +extern void pack_config_key_double_pair(void *in, uint16_t rpc_version, + Buf buffer) +{ + config_key_double_pair_t *pair = in; /* wire order: name, value */ + packstr(pair->name, buffer); + packdouble(pair->value, buffer); +} + +extern int unpack_config_key_double_pair(void **object, uint16_t rpc_version, + Buf buffer) +{ + uint32_t name_len; + config_key_double_pair_t *pair = xmalloc(sizeof(*pair)); + + /* publish the record first; the error path below takes it back */ + *object = pair; + safe_unpackstr_xmalloc(&pair->name, &name_len, buffer); + safe_unpackdouble(&pair->value, buffer); + return SLURM_SUCCESS; + +unpack_error: + /* free the partial record and clear the caller's pointer */ + destroy_config_key_double_pair(pair); + *object = NULL; + return SLURM_ERROR; +} + +extern int sort_key_double_pairs(void *v1, void *v2) +{ + config_key_double_pair_t *key_a = *(config_key_double_pair_t **)v1; + config_key_double_pair_t *key_b = *(config_key_double_pair_t **)v2; + int cmp = strcmp(key_a->name, key_b->name); + + /* normalize strcmp()'s result to exactly -1, 0 or 1 */ + if (cmp > 0) + return 1; + if (cmp < 0) + return -1; + + return 0; +} + /* * Return the pathname of the extra .conf file */ diff --git a/src/common/read_config.h b/src/common/read_config.h index 2445b74a9d4f0a48654fcf12015c25a701e46d90..4ec86c4a2013a7ea21864c75bd8c1cd6cf7afaad 100644 --- a/src/common/read_config.h +++ b/src/common/read_config.h @@ -251,6 +251,7 @@ typedef struct slurm_conf_partition { char *alternate; /* name of alternate partition */ uint16_t cr_type; /* Custom CR values for partition (supported * by select/cons_res plugin only) */ + char *charge_rate; /* per resource charge rates */ uint32_t def_mem_per_cpu; /* default MB memory per allocated CPU */ bool default_flag; /* Set if 
default partition */ uint32_t default_time; /* minutes or INFINITE */ @@ -292,6 +293,11 @@ typedef struct { char *value; } config_key_pair_t; +typedef struct { + char *name; + double value; +} config_key_double_pair_t; + /* Destroy a front_end record built by slurm_conf_frontend_array() */ extern void destroy_frontend(void *ptr); @@ -563,6 +569,12 @@ extern void pack_config_key_pair(void *in, uint16_t rpc_version, Buf buffer); extern int unpack_config_key_pair(void **object, uint16_t rpc_version, Buf buffer); extern int sort_key_pairs(void *v1, void *v2); +extern void destroy_config_key_double_pair(void *object); +extern void pack_config_key_double_pair(void *in, uint16_t rpc_version, + Buf buffer); +extern int unpack_config_key_double_pair(void **object, uint16_t rpc_version, + Buf buffer); +extern int sort_key_double_pairs(void *v1, void *v2); /* * Return the pathname of the extra .conf file * return value must be xfreed diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index f1378dbe6af34a8014bd7a74790148849edd29fb..d7d0912bdfd1425fb36c5c5f1ff9352215a6cdaa 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -5905,6 +5905,7 @@ _unpack_job_info_members(job_info_t * job, Buf buffer, safe_unpack_time(&job->resize_time, buffer); safe_unpack_time(&job->preempt_time, buffer); safe_unpack32(&job->priority, buffer); + safe_unpackdouble(&job->cpu_equiv, buffer); safe_unpackstr_xmalloc(&job->nodes, &uint32_tmp, buffer); safe_unpackstr_xmalloc(&job->sched_nodes, &uint32_tmp, buffer); safe_unpackstr_xmalloc(&job->partition, &uint32_tmp, buffer); diff --git a/src/plugins/priority/multifactor/priority_multifactor.c b/src/plugins/priority/multifactor/priority_multifactor.c index e94aef80133edf8be28a82df808f1ddd7144028e..126d4542cff7fabcba1e0a1b41f5035fe7faf020 100644 --- a/src/plugins/priority/multifactor/priority_multifactor.c +++ b/src/plugins/priority/multifactor/priority_multifactor.c @@ -76,6 +76,9 @@ 
#include "src/common/parse_time.h" #include "src/common/slurm_time.h" #include "src/common/xstring.h" +#include "src/common/gres.h" + +#include "src/slurmctld/licenses.h" #include "fair_tree.h" @@ -670,6 +673,119 @@ static time_t _next_reset(uint16_t reset_period, time_t last_reset) return slurm_mktime(&last_tm); } + +/* + * Calculate CPU equivalents based on partition's defined ChargeRate. If no + * partition or ChargeRate is available, return total_cpus. The result is cached + * in job_ptr->cpu_equiv; it is redone if unset or resize_time >= start_time. + */ +static double _cpu_equivalents(struct job_record *job_ptr, time_t start_time) +{ + double equiv = 0.0; + double charge_mem_mb = 0.0; + struct part_record *part_ptr = job_ptr->part_ptr; + uint32_t total_memory = 0; + uint32_t gres_alloc_count = 0; + uint32_t lic_alloc_count = 0; + config_key_double_pair_t *charge_pair; + ListIterator itr; + + /* Don't recalculate unless the job is new or resized */ + if ((!fuzzy_equal(job_ptr->cpu_equiv, NO_VAL)) && + difftime(job_ptr->resize_time, start_time) < 0.0) + return job_ptr->cpu_equiv; + + debug3("ChargeRate: job %u is either new or it was resized", + job_ptr->job_id); + + /* No partition or no charge rate defined.
Return CPU count */ + if (!part_ptr || !part_ptr->charge_rate) { + job_ptr->cpu_equiv = job_ptr->total_cpus; + return job_ptr->cpu_equiv; + } + + debug3("ChargeRate: job %u using ChargeRate=\"%s\" from partition %s", + job_ptr->job_id, part_ptr->charge_rate, + job_ptr->part_ptr->name); + + /* Calculate total memory since it is stored either per node or cpu */ + if (job_ptr->details->pn_min_memory & MEM_PER_CPU) + total_memory = (job_ptr->details->pn_min_memory ^ MEM_PER_CPU) * + job_ptr->total_cpus; + else + total_memory = job_ptr->details->pn_min_memory * + job_ptr->total_nodes; + + charge_mem_mb = part_ptr->charge_mem_gb / 1024.0; /* GB rate -> MB */ + + equiv = (double)job_ptr->total_cpus * part_ptr->charge_cpu; + equiv = MAX(equiv, + (double)job_ptr->total_nodes * part_ptr->charge_node); + equiv = MAX(equiv, (double)total_memory * charge_mem_mb); + + /* + * Potential performance improvement: + * It would likely be lower overhead to iterate through allocated + * gres and maybe licenses first then compare to charge rates + * rather than iterate through all defined charge rates then compare to + * gres and licenses. Site-specific differences are likely. + * + * It may be very uncommon for per partition per license charges to + * exist but still have licenses requested per job. It is much more + * likely to define a gres charge but have many jobs that don't request + * gres. This means that the license code is probably fine for most + * sites but gres may be better or worse the current way depending + * on a site's configuration and job patterns.
+ */ + + /* Calculate for each gres */ + if (job_ptr->gres_list && !list_is_empty(job_ptr->gres_list)) { + itr = list_iterator_create(part_ptr->charge_gres); + while ((charge_pair = list_next(itr))) { + gres_alloc_count = + gres_get_value_by_type(job_ptr->gres_list, + charge_pair->name); + if (gres_alloc_count > 0) { + debug3("ChargeRate: GRES:%s = %u * %f", + charge_pair->name, gres_alloc_count, + charge_pair->value); + equiv = MAX(equiv, gres_alloc_count * + (double)charge_pair->value); + } + } + list_iterator_destroy(itr); + } + + /* Calculate for each license */ + if (job_ptr->license_list && !list_is_empty(job_ptr->license_list)) { + itr = list_iterator_create(part_ptr->charge_lic); + while ((charge_pair = list_next(itr))) { + lic_alloc_count = lic_get_value_by_type( + job_ptr->license_list, + charge_pair->name); + if (lic_alloc_count > 0) { + debug3("ChargeRate: License:%s = %u * %f", + charge_pair->name, lic_alloc_count, + charge_pair->value); + equiv = MAX(equiv, lic_alloc_count * + (double)charge_pair->value); + } + } + list_iterator_destroy(itr); + } + + debug3("ChargeRate: CPU = %u * %f", job_ptr->total_cpus, + part_ptr->charge_cpu); + debug3("ChargeRate: Node = %u * %f", job_ptr->total_nodes, + part_ptr->charge_node); + debug3("ChargeRate: Mem (MB) = %u * (%f/1024)", total_memory, + part_ptr->charge_mem_gb); + debug3("ChargeRate: job %u MAX(...) = %f", job_ptr->job_id, equiv); + job_ptr->cpu_equiv = equiv; + return equiv; +} + + /* * Remove previously used time from qos and assocs grp_used_cpu_run_secs.
* @@ -859,7 +975,7 @@ static int _apply_new_usage(struct job_record *job_ptr, /* get the time in decayed fashion */ run_decay = run_delta * pow(decay_factor, run_delta); - real_decay = run_decay * (double)job_ptr->total_cpus; + real_decay = run_decay * _cpu_equivalents(job_ptr, start_period); assoc_mgr_lock(&locks); /* Just to make sure we don't make a diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index d6ca64784f8bea8d3e8b7a8061bf3f3a035cdfce..52f6027d7270416f0c7943c1e11d5649df15c490 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -468,6 +468,7 @@ static struct job_record *_create_job_record(int *error_code, uint32_t num_jobs) detail_ptr->submit_time = time(NULL); job_ptr->requid = -1; /* force to -1 for sacct to know this * hasn't been set yet */ + job_ptr->cpu_equiv = (double)NO_VAL; (void) list_append(job_list, job_ptr); return job_ptr; @@ -1161,6 +1162,7 @@ static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer) pack8(dump_job_ptr->power_flags, buffer); pack8(dump_job_ptr->sicp_mode, buffer); pack16(dump_job_ptr->start_protocol_ver, buffer); + packdouble(dump_job_ptr->cpu_equiv, buffer); if (IS_JOB_COMPLETING(dump_job_ptr)) { if (dump_job_ptr->nodes_completing == NULL) { @@ -1275,6 +1277,7 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) slurmdb_qos_rec_t qos_rec; bool job_finished = false; char jbuf[JBUFSIZ]; + double cpu_equiv = (double)NO_VAL; char *tres_alloc_str = NULL, *tres_fmt_alloc_str = NULL; if (protocol_version >= SLURM_15_08_PROTOCOL_VERSION) { @@ -1373,6 +1376,7 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) safe_unpack8(&power_flags, buffer); safe_unpack8(&sicp_mode, buffer); safe_unpack16(&start_protocol_ver, buffer); + safe_unpackdouble(&cpu_equiv, buffer); if (job_state & JOB_COMPLETING) { safe_unpackstr_xmalloc(&nodes_completing, @@ -1878,6 +1882,7 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) xfree(job_ptr->comment); 
job_ptr->comment = comment; comment = NULL; /* reused, nothing left to free */ + job_ptr->cpu_equiv = cpu_equiv; xfree(job_ptr->gres); job_ptr->gres = gres; gres = NULL; /* reused, nothing left to free */ @@ -7877,6 +7882,7 @@ void pack_job(struct job_record *dump_job_ptr, uint16_t show_flags, Buf buffer, pack_time(dump_job_ptr->resize_time, buffer); pack_time(dump_job_ptr->preempt_time, buffer); pack32(dump_job_ptr->priority, buffer); + packdouble(dump_job_ptr->cpu_equiv, buffer); /* Only send the allocated nodelist since we are only sending * the number of cpus and nodes that are currently allocated. */ diff --git a/src/slurmctld/licenses.c b/src/slurmctld/licenses.c index 3674362db13103d6c7b9bf653c8b41f2d9512baa..83debdef8a13eedf47bb4210921dd177a085f73b 100644 --- a/src/slurmctld/licenses.c +++ b/src/slurmctld/licenses.c @@ -232,6 +232,20 @@ static void _add_res_rec_2_lic_list(slurmdb_res_rec_t *rec, bool sync) last_license_update = time(NULL); } +/* Get how many of a given license are in a list; 0 if "name" is not found */ +extern uint32_t lic_get_value_by_type(List license_list, char *name) +{ + licenses_t *license_entry; + uint32_t used = 0; + /* NOTE(review): confirm this matcher is not limited to remote licenses */ + license_entry = list_find_first( + license_list, _license_find_remote_rec, name); + /* NOTE(review): confirm job license lists actually populate "used" */ + if(license_entry) + used = license_entry->used; + return used; +} + /* Get string of used license information. 
Caller must xfree return value */ extern char *get_licenses_used(void) { diff --git a/src/slurmctld/licenses.h b/src/slurmctld/licenses.h index 0ef28d6b0b0a0ff7c2263c0752c1d29d898f767f..0f8da29ade4ab973eeb1e3d1fe7e442ff83d3805 100644 --- a/src/slurmctld/licenses.h +++ b/src/slurmctld/licenses.h @@ -74,6 +74,14 @@ extern void license_free(void); /* Free a license_t record (for use by list_destroy) */ extern void license_free_rec(void *x); +/* + * lic_get_value_by_type - Return count of named licenses used by job + * IN license_list - list containing licenses_t records + * IN name - name of the license + * RET number of licenses of the particular type used, 0 if not in the list + */ +extern uint32_t lic_get_value_by_type(List license_list, char *name); + /* * license_job_copy - create a copy of a job's license list * IN license_list_src - job license list to be copied diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index c22932a6e156ef48d1864ea9d7e6fe6e79d82a2e..0dac9cdfb2b438f63d63b8b10bd27cc97400bb82 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -134,6 +134,83 @@ static int _compare_hostnames(struct node_record *old_node_table, struct node_record *node_table, int node_count); + + +/* Convert the value of a k=v pair to a double. 
Inputs are the k=v string and an + * integer offset of the value, which must be entirely one in-range number */ +static double _charge_rate_item_to_double(char *item_str, int value_offset) +{ + char *end_ptr = NULL; + double d; + + errno = 0; + d = strtod(item_str + value_offset, &end_ptr); + if (errno || (end_ptr == (item_str + value_offset)) || *end_ptr) + fatal("Unable to convert %s value to double in ChargeRate", + item_str); + return d; +} + +static void _charge_rate_item_add_to_list(List l, char *item_str, int offset) +{ + char *kv = item_str + offset; + char *tmp_str = strstr(kv, "="); + config_key_double_pair_t *pair; + + if (!tmp_str) + fatal("\"%s\" is an invalid ChargeRate entry", item_str); + + pair = xmalloc(sizeof(config_key_double_pair_t)); + pair->name = xstrndup(kv, tmp_str - kv); + pair->value = _charge_rate_item_to_double(item_str, + tmp_str - item_str + 1); + list_append(l, pair); +} + +static void _charge_rate_item(struct part_record *p, char *item_str) +{ + if (!item_str) + fatal("ChargeRate item is null"); + if (!strncasecmp(item_str, "MemGB=", 6)) + p->charge_mem_gb = _charge_rate_item_to_double(item_str, 6); + else if (!strncasecmp(item_str, "CPU=", 4)) + p->charge_cpu = _charge_rate_item_to_double(item_str, 4); + else if (!strncasecmp(item_str, "Node=", 5)) + p->charge_node = _charge_rate_item_to_double(item_str, 5); + else if (!strncasecmp(item_str, "GRES:", 5)) + _charge_rate_item_add_to_list(p->charge_gres, item_str, 5); + else if (!strncasecmp(item_str, "License:", 8)) + _charge_rate_item_add_to_list(p->charge_lic, item_str, 8); + else /* unknown resource type: skip it, but say so */ + error("Ignoring unknown ChargeRate entry \"%s\"", item_str); +} + +static void _charge_rate(struct part_record *p, char *charge_str) +{ + char *tmp_str = xstrdup(charge_str); + char *token, *last = NULL; + + p->charge_cpu = 1.0; + p->charge_mem_gb = 0.0; + p->charge_node = 0.0; + + if(p->charge_gres) + list_destroy(p->charge_gres); + p->charge_gres = list_create(destroy_config_key_double_pair); + + if(p->charge_lic) + list_destroy(p->charge_lic); + p->charge_lic = list_create(destroy_config_key_double_pair); + + token = strtok_r(tmp_str, ",", &last); + while (token) { + 
_charge_rate_item(p, token); + token = strtok_r(NULL, ",", &last); + } + xfree(tmp_str); + return; +} + /* Verify that Slurm directories are secure, not world writable */ static void _stat_slurm_dirs(void) { @@ -807,6 +884,12 @@ static int _build_single_partitionline_info(slurm_conf_partition_t *part) part_ptr->grace_time = part->grace_time; part_ptr->cr_type = part->cr_type; + if (part->charge_rate) { + xfree(part_ptr->charge_rate); + part_ptr->charge_rate = xstrdup(part->charge_rate); + _charge_rate(part_ptr, part_ptr->charge_rate); + } + if (part->allow_accounts) { xfree(part_ptr->allow_accounts); part_ptr->allow_accounts = xstrdup(part->allow_accounts); diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 542b613210def9c5b6717a9d90e4f9376e437378..2b4d17dc87bc51f4c211a126f1a112656d753d82 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -317,6 +317,12 @@ struct part_record { bitstr_t *allow_qos_bitstr; /* (DON'T PACK) assocaited with * char *allow_qos but used internally */ char *alternate; /* name of alternate partition */ + double charge_cpu; /* charge per allocated CPU */ + List charge_gres; /* list of per allocated GRES charges */ + List charge_lic; /* list of per allocated license charges */ + double charge_mem_gb; /* charge per allocated memory (GB) */ + double charge_node; /* charge per allocated node */ + char *charge_rate; /* per resource charge rate string */ uint32_t def_mem_per_cpu; /* default MB memory per allocated CPU */ uint32_t default_time; /* minutes, NO_VAL or INFINITE */ char *deny_accounts; /* comma delimited list of denied accounts */ @@ -572,6 +578,11 @@ struct job_record { * by the job, decremented while job is * completing (N/A for bluegene * systems) */ + double cpu_equiv; /* CPU equivalents allocated to the job, + * as defined by the partition's charge + * rate. Recalculated upon job resize. + * Cannot be calculated until the job is + * allocated resources. 
*/ uint16_t cr_enabled; /* specify if Consumable Resources * is enabled. Needed since CR deals * with a finer granularity in its