From edf3880c66292dd7a5376c44dc7204897644069b Mon Sep 17 00:00:00 2001
From: Nathan Yee <nyee32@schedmd.com>
Date: Tue, 5 Jan 2016 16:52:31 -0800
Subject: [PATCH] Initial commit for rejecting a multi-partition job if it
 violates a partition limit.

---
 doc/man/man5/slurm.conf.5        |  11 ++-
 slurm/slurm.h.in                 |   6 ++
 src/api/config_info.c            |   3 +-
 src/common/read_config.c         |  15 +++-
 src/common/slurm_protocol_defs.c |  42 +++++++++
 src/common/slurm_protocol_defs.h |   4 +
 src/slurmctld/job_mgr.c          | 142 +++++++++++++++++++++++--------
 7 files changed, 179 insertions(+), 44 deletions(-)

diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 00466d57c46..b16386edd44 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -770,10 +770,13 @@ May not exceed 65533.
 .TP
 \fBEnforcePartLimits\fR
-If set to "YES" then jobs which exceed a partition's size and/or time limits
-will be rejected at submission time. If set to "NO" then the job will be
-accepted and remain queued until the partition limits are altered.
-The default value is "NO".
+If set to "ALL" then jobs which exceed a partition's size and/or time limits
+will be rejected at submission time. If the job is submitted to multiple
+partitions, it must satisfy the limits (time and node limits) on all of the
+requested partitions. If set to "ANY" or "YES" the job need only satisfy the
+limits of at least one of the requested partitions to be accepted.
+If set to "NO" then the job will be accepted and remain queued until the
+partition limits are altered. The default value is "NO".
 NOTE: If set, then a job's QOS can not be used to exceed partition limits.
 .TP
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 236da4121e8..03aa004dfe2 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -557,6 +557,12 @@ enum job_acct_types {
 #define PARTITION_DRAIN		(PARTITION_SCHED)
 #define PARTITION_INACTIVE	0x00
 
+/* Partition enforce flags for jobs */
+#define PARTITION_ENFORCE_NONE 0
+#define PARTITION_ENFORCE_ALL  1	/* job limits must be valid for ALL
					 * partitions */
+#define PARTITION_ENFORCE_ANY  2	/* job limits must be valid for ANY
					 * partition */
 enum connection_type {
 	SELECT_MESH,	/* nodes wired in mesh */
 	SELECT_TORUS,	/* nodes wired in torus */
diff --git a/src/api/config_info.c b/src/api/config_info.c
index 6a459477b01..320c4ce72d6 100644
--- a/src/api/config_info.c
+++ b/src/api/config_info.c
@@ -569,7 +569,8 @@ extern void *slurm_ctl_conf_2_key_pairs (slurm_ctl_conf_t* slurm_ctl_conf_ptr)
 	list_append(ret_list, key_pair);
 	key_pair->name = xstrdup("EnforcePartLimits");
 	key_pair->value = xstrdup(
-		slurm_ctl_conf_ptr->enforce_part_limits ? "Yes" : "No");
+		parse_part_enforce_type_2str(
+			slurm_ctl_conf_ptr->enforce_part_limits));
 
 	key_pair = xmalloc(sizeof(config_key_pair_t));
 	key_pair->name = xstrdup("Epilog");
diff --git a/src/common/read_config.c b/src/common/read_config.c
index a41555074d0..231e717ba69 100644
--- a/src/common/read_config.c
+++ b/src/common/read_config.c
@@ -208,7 +208,7 @@ s_p_options_t slurm_conf_options[] = {
 	{"DefMemPerNode", S_P_UINT32},
 	{"DisableRootJobs", S_P_BOOLEAN},
 	{"EioTimeout", S_P_UINT16},
-	{"EnforcePartLimits", S_P_BOOLEAN},
+	{"EnforcePartLimits", S_P_STRING},
 	{"Epilog", S_P_STRING},
 	{"EpilogMsgTime", S_P_UINT32},
 	{"EpilogSlurmctld", S_P_STRING},
@@ -3153,9 +3153,18 @@ _validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl)
 			    "DisableRootJobs", hashtbl))
 		conf->disable_root_jobs = DEFAULT_DISABLE_ROOT_JOBS;
 
-	if (!s_p_get_boolean((bool *) &conf->enforce_part_limits,
-			     "EnforcePartLimits", hashtbl))
+	if (s_p_get_string(&temp_str, "EnforcePartLimits", hashtbl)) {
+		uint16_t enforce_param;
+		if (parse_part_enforce_type(temp_str, &enforce_param) < 0) {
+			error("Bad EnforcePartLimits: %s", temp_str);
+			xfree(temp_str);
+			return SLURM_ERROR;
+		}
+		conf->enforce_part_limits = enforce_param;
+		xfree(temp_str);
+	} else {
 		conf->enforce_part_limits = DEFAULT_ENFORCE_PART_LIMITS;
+	}
 
 	s_p_get_string(&conf->epilog, "Epilog", hashtbl);
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index e9120444695..f99ddfda88d 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -4449,3 +4449,45 @@ extern void slurm_free_assoc_mgr_info_request_msg(
 	FREE_NULL_LIST(msg->user_list);
 	xfree(msg);
 }
+
+extern int parse_part_enforce_type(char *enforce_part_type, uint16_t *param)
+{
+	int rc = SLURM_SUCCESS;
+	char *value = xstrdup(enforce_part_type);
+
+	if (!strcasecmp(value, "yes") || !strcasecmp(value, "up") ||
+	    !strcasecmp(value, "true") || !strcasecmp(value, "1") ||
+	    !strcasecmp(value, "any")) {
+		*param = PARTITION_ENFORCE_ANY;
+	} else if (!strcasecmp(value, "no") || !strcasecmp(value, "down") ||
+		   !strcasecmp(value, "false") || !strcasecmp(value, "0")) {
+		*param = PARTITION_ENFORCE_NONE;
+	} else if (!strcasecmp(value, "all")) {
+		*param = PARTITION_ENFORCE_ALL;
+	} else {
+		error("Bad EnforcePartLimits: %s", value);
+		rc = SLURM_FAILURE;
+	}
+
+	xfree(value);
+	return rc;
+}
+
+extern char *parse_part_enforce_type_2str(uint16_t type)
+{
+	static char type_str[1024];
+
+	if (type == PARTITION_ENFORCE_NONE) {
+		strcpy(type_str, "NO");
+	} else if (type == PARTITION_ENFORCE_ANY) {
+		strcpy(type_str, "ANY");
+	} else if (type == PARTITION_ENFORCE_ALL) {
+		strcpy(type_str, "ALL");
+	}
+
+	return type_str;
+}
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index f5f7c71eb80..deea37cd876 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -1427,6 +1427,10 @@ extern char *reservation_flags_string(uint32_t flags);
 extern char * slurm_bb_flags2str(uint32_t bb_flags);
 extern uint32_t slurm_bb_str2flags(char *bb_str);
 
+/* Convert the partition limit enforcement type between string and number */
+extern int parse_part_enforce_type(char *enforce_part_type, uint16_t *param);
+extern char *parse_part_enforce_type_2str(uint16_t type);
+
 /* Given a protocol opcode return its string
  * description mapping the slurm_msg_type_t
  * to its name.
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index f2707ae9dc2..47f1376348c 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -5094,7 +5094,7 @@ static int _part_access_check(struct part_record *part_ptr,
 			      uid_t submit_uid, slurmdb_qos_rec_t *qos_ptr,
 			      char *acct)
 {
-	uint32_t total_nodes;
+	uint32_t total_nodes, min_nodes_tmp, max_nodes_tmp;
 	size_t resv_name_leng = 0;
 	int rc = SLURM_SUCCESS;
 
@@ -5150,15 +5150,27 @@ static int _part_access_check(struct part_record *part_ptr,
 	}
 
 	if ((part_ptr->state_up & PARTITION_SCHED) &&
-	    (job_desc->min_cpus != NO_VAL) &&
-	    (job_desc->min_cpus > part_ptr->total_cpus)) {
-		info("_part_access_check: Job requested too many cpus (%u) of "
-		     "partition %s(%u)",
-		     job_desc->min_cpus, part_ptr->name,
-		     part_ptr->total_cpus);
-		return ESLURM_TOO_MANY_REQUESTED_CPUS;
-	}
-
+	    (job_desc->min_cpus != NO_VAL)) {
+		if (job_desc->min_cpus > part_ptr->total_cpus) {
+			info("_part_access_check: Job requested too many "
+			     "cpus (%u) of partition %s(%u)",
+			     job_desc->min_cpus, part_ptr->name,
+			     part_ptr->total_cpus);
+			return ESLURM_TOO_MANY_REQUESTED_CPUS;
+		} else if (job_desc->min_cpus >
+			   (part_ptr->max_cpus_per_node *
+			    part_ptr->total_nodes)) {
+			info("_part_access_check: Job requested too many "
+			     "cpus (%u) of partition %s(%u)",
+			     job_desc->min_cpus, part_ptr->name,
+			     (part_ptr->max_cpus_per_node *
+			      part_ptr->total_nodes));
+			return ESLURM_TOO_MANY_REQUESTED_CPUS;
+		}
+	}
+
+	/* Check against total nodes on the partition */
 	total_nodes = part_ptr->total_nodes;
 	select_g_alter_node_cnt(SELECT_APPLY_NODE_MAX_OFFSET, &total_nodes);
 	if ((part_ptr->state_up & PARTITION_SCHED) &&
@@ -5176,6 +5188,36 @@
 		return ESLURM_REQUESTED_NODES_NOT_IN_PARTITION;
 	}
 
+	/* Check against the partition's min/max node limits */
+	min_nodes_tmp = part_ptr->min_nodes;
+	max_nodes_tmp = part_ptr->max_nodes;
+	if ((part_ptr->state_up & PARTITION_SCHED) &&
+	    (job_desc->min_nodes != NO_VAL) &&
+	    (job_desc->min_nodes < min_nodes_tmp)) {
+		info("_part_access_check: Job requested %u nodes, "
+		     "less than partition %s min nodes (%u)",
+		     job_desc->min_nodes, part_ptr->name, min_nodes_tmp);
+		return ESLURM_INVALID_NODE_COUNT;
+	}
+
+	if ((part_ptr->state_up & PARTITION_SCHED) &&
+	    (job_desc->max_nodes != NO_VAL) &&
+	    (job_desc->max_nodes > max_nodes_tmp)) {
+		info("_part_access_check: Job requested %u nodes, "
+		     "more than partition %s max nodes (%u)",
+		     job_desc->max_nodes, part_ptr->name, max_nodes_tmp);
+		return ESLURM_INVALID_NODE_COUNT;
+	}
+
+	/* Check against the partition's time limit */
+	if ((part_ptr->state_up & PARTITION_SCHED) &&
+	    (job_desc->time_limit != NO_VAL) &&
+	    (job_desc->time_limit > part_ptr->max_time)) {
+		info("_part_access_check: Job time limit (%u) exceeds limit "
+		     "of partition %s(%u)",
+		     job_desc->time_limit, part_ptr->name, part_ptr->max_time);
+		return ESLURM_INVALID_TIME_LIMIT;
+	}
+
 	if (slurmctld_conf.enforce_part_limits) {
 		if ((rc = part_policy_valid_acct(part_ptr, acct))
 		    != SLURM_SUCCESS)
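For illustration only, the new _part_access_check() hunks above boil down to the
comparisons sketched below. This is a simplified standalone model with made-up
values and a stand-in structure, not the slurmctld code itself; it only assumes
that NO_VAL marks an unset request, as in slurm.h.

#include <stdio.h>
#include <stdint.h>

#define NO_VAL (0xfffffffe)		/* "not set", as used by slurm.h */

struct part_limits {			/* simplified stand-in for part_record */
	const char *name;
	uint32_t min_nodes, max_nodes, max_time;
};

/* Return 0 if the request fits the partition limits, -1 otherwise */
static int check_limits(const struct part_limits *p, uint32_t min_nodes,
			uint32_t max_nodes, uint32_t time_limit)
{
	if ((min_nodes != NO_VAL) && (min_nodes < p->min_nodes))
		return -1;	/* fewer nodes than the partition minimum */
	if ((max_nodes != NO_VAL) && (max_nodes > p->max_nodes))
		return -1;	/* more nodes than the partition maximum */
	if ((time_limit != NO_VAL) && (time_limit > p->max_time))
		return -1;	/* longer than the partition time limit */
	return 0;
}

int main(void)
{
	struct part_limits batch = { "batch", 2, 32, 720 };	/* minutes */

	printf("4 nodes, 60 min:   %s\n",
	       check_limits(&batch, 4, 4, 60) ? "reject" : "ok");
	printf("64 nodes, 60 min:  %s\n",
	       check_limits(&batch, 64, 64, 60) ? "reject" : "ok");
	printf("4 nodes, 2000 min: %s\n",
	       check_limits(&batch, 4, 4, 2000) ? "reject" : "ok");
	return 0;
}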
@@ -5304,11 +5346,11 @@ static int _valid_job_part(job_desc_msg_t * job_desc,
 	slurmdb_assoc_rec_t assoc_rec;
 	uint32_t min_nodes_orig = INFINITE, max_nodes_orig = 1;
 	uint32_t max_time = 0;
+	bool any_check = false;
 
 	/* Change partition pointer(s) to alternates as needed */
 	if (part_ptr_list) {
 		int fail_rc = SLURM_SUCCESS;
-		bool rebuild_name_list = false;
 		ListIterator iter = list_iterator_create(part_ptr_list);
 
 		while ((part_ptr_tmp = (struct part_record *)list_next(iter))) {
@@ -5339,12 +5381,24 @@ static int _valid_job_part(job_desc_msg_t * job_desc,
 						  qos_ptr,
 						  assoc_ptr ? assoc_ptr->acct :
 						  NULL);
+
 			if (rc != SLURM_SUCCESS) {
 				fail_rc = rc;
-				list_remove(iter);
-				rebuild_name_list = true;
-				continue;
-			}
+				if (slurmctld_conf.enforce_part_limits ==
+				    PARTITION_ENFORCE_ALL)
+					break;
+			} else {
+				any_check = true;
+			}
+
+			/* Set to success since we found a usable partition */
+			if (any_check && (slurmctld_conf.enforce_part_limits ==
+					  PARTITION_ENFORCE_ANY))
+				fail_rc = SLURM_SUCCESS;
 
 			min_nodes_orig = MIN(min_nodes_orig,
 					     part_ptr_tmp->min_nodes_orig);
@@ -5353,28 +5407,23 @@ static int _valid_job_part(job_desc_msg_t * job_desc,
 			max_time = MAX(max_time, part_ptr_tmp->max_time);
 		}
 		list_iterator_destroy(iter);
-		if (list_is_empty(part_ptr_list)) {
-			if (fail_rc != SLURM_SUCCESS)
+
+		if (list_is_empty(part_ptr_list) ||
+		    (slurmctld_conf.enforce_part_limits &&
+		     (fail_rc != SLURM_SUCCESS))) {
+			if (slurmctld_conf.enforce_part_limits ==
+			    PARTITION_ENFORCE_ALL)
 				rc = fail_rc;
-			else
+			else if ((slurmctld_conf.enforce_part_limits ==
+				  PARTITION_ENFORCE_ANY) && !any_check)
+				rc = fail_rc;
+			else {
 				rc = ESLURM_PARTITION_NOT_AVAIL;
-			goto fini;
-		}
-		rc = SLURM_SUCCESS;	/* At least some partition usable */
-		if (rebuild_name_list) {
-			*part_pptr = part_ptr = NULL;
-			xfree(job_desc->partition);
-			iter = list_iterator_create(part_ptr_list);
-			while ((part_ptr_tmp = list_next(iter))) {
-				if (job_desc->partition)
-					xstrcat(job_desc->partition, ",");
-				else
-					*part_pptr = part_ptr = part_ptr_tmp;
-				xstrcat(job_desc->partition,
-					part_ptr_tmp->name);
 			}
-			list_iterator_destroy(iter);
+			goto fini;
 		}
+		rc = SLURM_SUCCESS;	/* At least some partition
+					 * usable */
 	} else {
 		min_nodes_orig = part_ptr->min_nodes_orig;
 		max_nodes_orig = part_ptr->max_nodes_orig;
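The loop above is the heart of the multi-partition policy: each requested
partition is checked with _part_access_check(), and EnforcePartLimits decides
how the per-partition results combine. Below is a minimal standalone model of
that decision; accept_job() is a hypothetical helper for illustration only, and
the real code additionally propagates fail_rc and the specific error codes.

#include <stdbool.h>
#include <stdio.h>

#define PARTITION_ENFORCE_NONE 0
#define PARTITION_ENFORCE_ALL  1
#define PARTITION_ENFORCE_ANY  2

/* Per-partition check results: 0 = job fits, non-zero = violates a limit */
static int fits[] = { 0, 1, 0 };	/* job fits partitions 0 and 2 only */

/* Decide whether to accept a multi-partition job under the given policy */
static bool accept_job(int enforce, const int *rc, int nparts)
{
	bool any_ok = false, all_ok = true;

	for (int i = 0; i < nparts; i++) {
		if (rc[i] == 0)
			any_ok = true;
		else
			all_ok = false;
	}
	if (enforce == PARTITION_ENFORCE_ALL)
		return all_ok;	/* every requested partition must fit */
	if (enforce == PARTITION_ENFORCE_ANY)
		return any_ok;	/* at least one partition must fit */
	return true;		/* NONE: accept and leave the job queued */
}

int main(void)
{
	printf("ALL: %s\n", accept_job(PARTITION_ENFORCE_ALL, fits, 3) ?
	       "accept" : "reject");
	printf("ANY: %s\n", accept_job(PARTITION_ENFORCE_ANY, fits, 3) ?
	       "accept" : "reject");
	printf("NO:  %s\n", accept_job(PARTITION_ENFORCE_NONE, fits, 3) ?
	       "accept" : "reject");
	return 0;
}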
@@ -5382,11 +5431,18 @@ static int _valid_job_part(job_desc_msg_t * job_desc,
 		rc = _part_access_check(part_ptr, job_desc, req_bitmap,
 					submit_uid, qos_ptr,
 					assoc_ptr ? assoc_ptr->acct : NULL);
-		if (rc != SLURM_SUCCESS)
+
+		if (rc != SLURM_SUCCESS && slurmctld_conf.enforce_part_limits)
 			goto fini;
+
+		/* EnforcePartLimits=NO: accept the job anyway */
+		rc = SLURM_SUCCESS;
 	}
 
 	/* Validate job limits against partition limits */
+
+	/* When multiple partitions were requested, check against the
+	 * partition with the highest limits */
 	if (job_desc->min_nodes == NO_VAL) {
 		/* Avoid setting the job request to 0 nodes if the
 		 * user didn't ask for 0.
@@ -6223,7 +6279,7 @@ static int _test_job_desc_fields(job_desc_msg_t * job_desc)
 	    _test_strlen(job_desc->licenses, "licenses", 1024)		||
 	    _test_strlen(job_desc->linuximage, "linuximage", 1024)	||
 	    _test_strlen(job_desc->mail_user, "mail_user", 1024)	||
-	    _test_strlen(job_desc->mcs_label, "mcs_label", 1024)	||
+	    _test_strlen(job_desc->mcs_label, "mcs_label", 1024)	||
 	    _test_strlen(job_desc->mem_bind, "mem_bind", 1024)		||
 	    _test_strlen(job_desc->mloaderimage, "mloaderimage", 1024)	||
 	    _test_strlen(job_desc->name, "name", 1024)			||
@@ -7716,8 +7772,9 @@ static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate,
 				job_desc_msg->pn_min_memory =
 					slurmctld_conf.def_mem_per_cpu;
 		}
-	} else if (!_validate_min_mem_partition(job_desc_msg, part_ptr, part_list))
+	} else if (!_validate_min_mem_partition(job_desc_msg, part_ptr, part_list)) {
 		return ESLURM_INVALID_TASK_MEMORY;
+	}
 	if (job_desc_msg->pn_min_memory == MEM_PER_CPU) {
 		/* Map --mem-per-cpu=0 to --mem=0 for simpler logic */
 		job_desc_msg->pn_min_memory = 0;
@@ -7760,8 +7817,21 @@ _validate_min_mem_partition(job_desc_msg_t *job_desc_msg,
 	cc = false;
 	iter = list_iterator_create(part_list);
 	while ((part = list_next(iter))) {
-		if ((cc = _valid_pn_min_mem(job_desc_msg, part)))
+		if (!(cc = _valid_pn_min_mem(job_desc_msg, part))) {
+			if (slurmctld_conf.enforce_part_limits ==
+			    PARTITION_ENFORCE_ALL) {
+				break;
+			} else if (slurmctld_conf.enforce_part_limits ==
+				   PARTITION_ENFORCE_ANY) {
+				info("%s: Requested memory (%u MB) is invalid "
+				     "for partition %s",
+				     __func__, job_desc_msg->pn_min_memory,
+				     part->name);
+			}
+		} else {
+			/* Memory request fits this partition */
 			break;
+		}
 	}
 	list_iterator_destroy(iter);
-- 
GitLab
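As a usage illustration (hypothetical configuration and output, not part of
this patch): with EnforcePartLimits=ALL in slurm.conf, a job submitted with
something like "sbatch --partition=debug,batch --nodes=16 --time=12:00:00"
must fit the node and time limits of both partitions or it is rejected at
submit time; with ANY (or YES) it only needs to fit one of them; with NO it is
accepted and left queued. Since the configured value is now reported through
parse_part_enforce_type_2str(), output along these lines would be expected:

    $ scontrol show config | grep EnforcePartLimits
    EnforcePartLimits       = ANY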