diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 4ea3897a1d9b321e531d5d88c1ea9e0a461c7b69..a891603213db6df43d7184c4ba11e4f9bc0d498d 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -1351,7 +1351,8 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 	}
 
 	/* return result */
-	if (error_code) {
+	if ((error_code != SLURM_SUCCESS) &&
+	    (error_code != ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE)) {
 		info("_slurm_rpc_submit_batch_job time=%ld, error=%s",
 		     (long) (clock() - start_time),
 		     slurm_strerror(error_code));
@@ -1361,7 +1362,8 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 		     "_slurm_rpc_submit_batch_job success for id=%u, time=%ld",
 		     job_id, (long) (clock() - start_time));
 		/* send job_ID */
-		submit_msg.job_id = job_id;
+		submit_msg.job_id     = job_id;
+		submit_msg.error_code = error_code;
 		response_msg.msg_type = RESPONSE_SUBMIT_BATCH_JOB;
 		response_msg.data = &submit_msg;
 		slurm_send_node_msg(msg->conn_fd, &response_msg);
@@ -1382,14 +1384,14 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
 	char *node_list_ptr = NULL;
 	uint16_t num_cpu_groups = 0;
 	uint32_t *cpus_per_node = NULL, *cpu_count_reps = NULL;
-	uint32_t job_id;
+	uint32_t job_id = 0;
 	resource_allocation_response_msg_t alloc_msg;
 	/* Locks: Write job, write node, read partition */
 	slurmctld_lock_t job_write_lock = {
 		NO_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK };
 	uid_t uid;
-	uint16_t node_cnt;
-	slurm_addr *node_addr;
+	uint16_t node_cnt = 0;
+	slurm_addr *node_addr = NULL;
 
 	start_time = clock();
 	debug("Processing RPC: REQUEST_RESOURCE_ALLOCATION");
@@ -1403,6 +1405,7 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
 		error("Security violation, RESOURCE_ALLOCATE from uid %u",
 		      (unsigned int) uid);
 	}
+
 	if (error_code == SLURM_SUCCESS) {
 		int immediate = job_desc_msg->immediate;
 		lock_slurmctld(job_write_lock);
@@ -1415,7 +1418,8 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
 	}
 
 	/* return result */
-	if (error_code) {
+	if ((error_code != SLURM_SUCCESS) &&
+	    (error_code != ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE)) {
 		info("_slurm_rpc_allocate_resources time=%ld, error=%s ",
 		     (long) (clock() - start_time),
 		     slurm_strerror(error_code));
@@ -1426,14 +1430,14 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
 		     node_list_ptr, job_id,
 		     (long) (clock() - start_time));
 		/* send job_ID and node_name_ptr */
-
-		alloc_msg.job_id = job_id;
-		alloc_msg.node_list = node_list_ptr;
-		alloc_msg.num_cpu_groups = num_cpu_groups;
-		alloc_msg.cpus_per_node = cpus_per_node;
 		alloc_msg.cpu_count_reps = cpu_count_reps;
-		alloc_msg.node_cnt = node_cnt;
-		alloc_msg.node_addr = node_addr;
+		alloc_msg.cpus_per_node  = cpus_per_node;
+		alloc_msg.error_code     = error_code;
+		alloc_msg.job_id         = job_id;
+		alloc_msg.node_addr      = node_addr;
+		alloc_msg.node_cnt       = node_cnt;
+		alloc_msg.node_list      = node_list_ptr;
+		alloc_msg.num_cpu_groups = num_cpu_groups;
 
 		response_msg.msg_type = RESPONSE_RESOURCE_ALLOCATION;
 		response_msg.data = &alloc_msg;
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index eec48e8b6abe99c9ef0889099527974b8176def1..eaa144d5c6d3eba41ceb6c50debd24e741daeea0 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -1110,12 +1110,15 @@ int job_allocate(job_desc_msg_t * job_specs, uint32_t * new_job_id,
 
 	no_alloc = test_only || (!top_prio);
 	error_code = select_nodes(job_ptr, no_alloc);
-	if (error_code == ESLURM_NODES_BUSY) {
+	if ((error_code == ESLURM_NODES_BUSY) ||
+	    (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE)) {
+		/* Not fatal error, but job can't be scheduled right now */
 		if (immediate) {
 			job_ptr->job_state = JOB_FAILED;
 			job_ptr->end_time  = 0;
 		} else		/* job remains queued */
-			error_code = 0;
+			if (error_code == ESLURM_NODES_BUSY)
+				error_code = SLURM_SUCCESS;
 		return error_code;
 	}
 
@@ -1305,7 +1308,7 @@ static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id,
 			int allocate, int will_run,
 			struct job_record **job_rec_ptr, uid_t submit_uid)
 {
-	int error_code, i;
+	int error_code = SLURM_SUCCESS, i;
 	struct part_record *part_ptr;
 	bitstr_t *req_bitmap = NULL, *exc_bitmap = NULL;
 
@@ -1413,14 +1416,10 @@ static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id,
 		error_code = ESLURM_TOO_MANY_REQUESTED_CPUS;
 		goto cleanup;
 	}
-	if ((job_desc->min_nodes > part_ptr->total_nodes) ||
-	    (job_desc->min_nodes > part_ptr->max_nodes)) {
-		if (part_ptr->total_nodes > part_ptr->max_nodes)
-			i = part_ptr->max_nodes;
-		else
-			i = part_ptr->total_nodes;
+	if (job_desc->min_nodes > part_ptr->total_nodes) {
 		info("Job requested too many nodes (%d) of partition %s(%d)",
-		     job_desc->min_nodes, part_ptr->name, i);
+		     job_desc->min_nodes, part_ptr->name,
+		     part_ptr->total_nodes);
 		error_code = ESLURM_TOO_MANY_REQUESTED_NODES;
 		goto cleanup;
 	}
@@ -1430,15 +1429,13 @@ static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id,
 		error_code = ESLURM_TOO_MANY_REQUESTED_NODES;
 		goto cleanup;
 	}
-	if (job_desc->max_nodes > part_ptr->max_nodes)
-		job_desc->max_nodes = part_ptr->max_nodes;
 
 	if ((error_code =_validate_job_create_req(job_desc)))
 		goto cleanup;
 
 	if (will_run) {
-		error_code = 0;
+		error_code = SLURM_SUCCESS;
 		goto cleanup;
 	}
 
@@ -1462,15 +1459,26 @@ static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id,
 		(*job_rec_ptr)->batch_flag = 1;
 	} else
 		(*job_rec_ptr)->batch_flag = 0;
-
-	if (part_ptr->shared == SHARED_FORCE)	/* shared=force */
-		(*job_rec_ptr)->details->shared = 1;
-	else if (((*job_rec_ptr)->details->shared != 1) ||
-		 (part_ptr->shared == SHARED_NO))	/* can't share */
-		(*job_rec_ptr)->details->shared = 0;
-
 	*new_job_id = (*job_rec_ptr)->job_id;
-	return SLURM_SUCCESS;
+
+	/* Insure that requested partition is valid right now,
+	 * otherwise leave job queued and provide warning code */
+	if (job_desc->min_nodes > part_ptr->max_nodes) {
+		info("Job %u requested too many nodes (%d) of partition %s(%d)",
+		     *new_job_id, job_desc->min_nodes, part_ptr->name,
+		     part_ptr->max_nodes);
+		error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
+	} else if ((job_desc->max_nodes != 0) &&	/* no max_nodes for job */
+		   (job_desc->max_nodes < part_ptr->min_nodes)) {
+		info("Job %u requested too few nodes (%d) of partition %s(%d)",
+		     *new_job_id, job_desc->max_nodes,
+		     part_ptr->name, part_ptr->min_nodes);
+		error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
+	} else if (part_ptr->state_up == 0) {
+		info("Job %u requested down partition %s",
+		     *new_job_id, part_ptr->name);
+		error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
+	}
 
       cleanup:
 	FREE_NULL_BITMAP(req_bitmap);
@@ -1802,7 +1810,7 @@ _copy_job_desc_to_job_record(job_desc_msg_t * job_desc,
 	strncpy(job_ptr->partition, part_ptr->name, MAX_NAME_LEN);
 	job_ptr->part_ptr = part_ptr;
-	if (job_desc->job_id != NO_VAL)
+	if (job_desc->job_id != NO_VAL)		/* already confirmed unique */
 		job_ptr->job_id = job_desc->job_id;
 	else
 		_set_job_id(job_ptr);
@@ -1817,11 +1825,12 @@ _copy_job_desc_to_job_record(job_desc_msg_t * job_desc,
 	job_ptr->time_limit = job_desc->time_limit;
 	job_ptr->alloc_sid  = job_desc->alloc_sid;
 	job_ptr->alloc_node = xstrdup(job_desc->alloc_node);
-	if ((job_desc->priority !=
-	     NO_VAL) /* also check submit UID is root */ )
+
+	if (job_desc->priority != NO_VAL)	/* already confirmed submit_uid==0 */
 		job_ptr->priority = job_desc->priority;
 	else
 		_set_job_prio(job_ptr);
+
 	if (job_desc->kill_on_node_fail != (uint16_t) NO_VAL)
 		job_ptr->kill_on_node_fail = job_desc->kill_on_node_fail;
@@ -1967,7 +1976,7 @@ static void _job_timed_out(struct job_record *job_ptr)
 
 /* _validate_job_desc - validate that a job descriptor for job submit or
 *	allocate has valid data, set values to defaults as required
- * IN job_desc_msg - pointer to job descriptor
+ * IN/OUT job_desc_msg - pointer to job descriptor, modified as needed
 * IN allocate - if clear job to be queued, if set allocate for user now
 * IN submit_uid - who request originated
 */
@@ -2020,6 +2029,9 @@ static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate,
 		_purge_job_record(job_desc_msg->job_id);
 	}
 
+	if (submit_uid != 0)	/* only root can set job priority */
+		job_desc_msg->priority = NO_VAL;
+
 	if (job_desc_msg->num_procs == NO_VAL)
 		job_desc_msg->num_procs = 1;	/* default cpu count of 1 */
 	if (job_desc_msg->min_nodes == NO_VAL)
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index 70a00bf7fc2290a35a6185434dae9b57a2073ed3..9de3d73e1e97581eb5284c70dc44ea3c67bd61b4 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -143,6 +143,10 @@ int schedule(void)
 					sizeof(struct part_record *));
 			failed_parts[failed_part_cnt++] =
 					job_ptr->part_ptr;
+		} else if (error_code ==
+			   ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) {
+			debug2("job %u not runnable with present config",
+			       job_ptr->job_id);
 		} else if (error_code == SLURM_SUCCESS) {
 			/* job initiated */
 			last_job_update = time(NULL);
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index afb039bdb6ecd0bd34df812ec36a7e81f9c37901..1f03c1aac87de45b196811467884671455d990a3 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -76,8 +76,7 @@ static int _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
 			    bitstr_t ** req_bitmap, uint32_t req_cpus,
 			    uint32_t min_nodes, uint32_t max_nodes,
-			    int contiguous, int shared,
-			    uint32_t node_lim);
+			    int contiguous, int shared);
 static int _valid_features(char *requested, char *available);
@@ -489,8 +488,6 @@ _enough_nodes(int avail_nodes, int rem_nodes, int min_nodes, int max_nodes)
 * IN max_nodes - maximum count of nodes required by the job (0==no limit)
 * IN contiguous - 1 if allocated nodes must be contiguous, 0 otherwise
 * IN shared - set to 1 if nodes may be shared, 0 otherwise
- * IN node_lim - maximum number of nodes permitted for job,
- *	INFIITE for no limit (partition limit)
 * RET 0 on success, EAGAIN if request can not be satisfied now, EINVAL if
 *	request can never be satisfied (insufficient contiguous nodes)
 * NOTE: the caller must xfree memory pointed to by req_bitmap
@@ -511,7 +508,7 @@ static int
 _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
 		 bitstr_t ** req_bitmap, uint32_t req_cpus,
 		 uint32_t min_nodes, uint32_t max_nodes,
-		 int contiguous, int shared, uint32_t node_lim)
+		 int contiguous, int shared)
 {
 	int error_code = SLURM_SUCCESS, i, j, pick_code;
 	int total_nodes = 0, total_cpus = 0;	/* total resources configured
@@ -526,14 +523,6 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
 		info("_pick_best_nodes: empty node set for selection");
 		return EINVAL;
 	}
-	if (node_lim != INFINITE) {
-		if (min_nodes > node_lim) {
-			info("_pick_best_nodes: exceed partition node limit");
-			return EINVAL;
-		}
-		if (max_nodes > node_lim)
-			max_nodes = node_lim;
-	}
 
 	if (*req_bitmap) {	/* specific nodes required */
 		/* we have already confirmed that all of these nodes have a
@@ -542,8 +531,8 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
 		total_nodes = bit_set_count(*req_bitmap);
 		if (req_cpus != 0)
 			total_cpus = count_cpus(*req_bitmap);
-		if (total_nodes > node_lim) {
-			info("_pick_best_nodes: exceed partition node limit");
+		if (total_nodes > max_nodes) {
+			info("_pick_best_nodes: required nodes exceed limit");
 			return EINVAL;
 		}
 		if ((min_nodes <= total_nodes) &&
@@ -588,23 +577,16 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
 			if ((*req_bitmap) &&
 			    (!bit_super_set(*req_bitmap, avail_bitmap)))
 				continue;
-			if ((avail_nodes < min_nodes) ||
-			    (avail_cpus < req_cpus) ||
-			    ((max_nodes > min_nodes) && (avail_nodes < max_nodes)))
-				continue;
-			pick_code =
-			    _pick_best_quadrics(avail_bitmap, *req_bitmap,
-						min_nodes, max_nodes,
-						req_cpus, contiguous);
-			if ((pick_code == 0) && (node_lim != INFINITE) &&
-			    (bit_set_count(avail_bitmap) > node_lim)) {
-				info("_pick_best_nodes: %u nodes, max is %u",
-				     bit_set_count(avail_bitmap), node_lim);
-				error_code = EINVAL;
-				break;
-			}
-			if (pick_code == 0) {
+			if ((avail_nodes < min_nodes) ||
+			    (avail_cpus < req_cpus) ||
+			    ((max_nodes > min_nodes) && (avail_nodes < max_nodes)))
+				continue;	/* Keep accumulating nodes */
+			pick_code = _pick_best_quadrics(avail_bitmap,
+						*req_bitmap, min_nodes,
+						max_nodes, req_cpus,
+						contiguous);
+			if (pick_code == SLURM_SUCCESS) {
 				FREE_NULL_BITMAP(total_bitmap);
 				FREE_NULL_BITMAP(*req_bitmap);
 				*req_bitmap = avail_bitmap;
@@ -613,13 +595,13 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
 			}
 			/* try to get max_nodes now for this feature */
-			if ((max_nodes > min_nodes) &&
+			if ((max_nodes > min_nodes) &&
 			    (avail_nodes >= min_nodes) &&
 			    (avail_nodes < max_nodes)) {
 				pick_code = _pick_best_quadrics(avail_bitmap,
 						*req_bitmap, min_nodes,
 						max_nodes, req_cpus,
 						contiguous);
-				if (pick_code == 0) {
+				if (pick_code == SLURM_SUCCESS) {
 					FREE_NULL_BITMAP(total_bitmap);
 					FREE_NULL_BITMAP(*req_bitmap);
 					*req_bitmap = avail_bitmap;
@@ -629,22 +611,15 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size,
 		/* determine if job could possibly run (if all configured
 		 * nodes available) */
-		if ((error_code == 0) && (!runable) &&
+		if ((error_code == SLURM_SUCCESS) && (!runable) &&
 		    (total_nodes >= min_nodes) &&
 		    (total_cpus >= req_cpus) &&
 		    ((*req_bitmap == NULL) ||
-		     (bit_super_set(*req_bitmap, total_bitmap))) &&
-		    ((node_lim == INFINITE) || (min_nodes <= node_lim))) {
-			pick_code =
-			    _pick_best_quadrics(total_bitmap, *req_bitmap,
-						min_nodes, 0,
-						req_cpus, contiguous);
-			if ((pick_code == 0) && (node_lim != INFINITE) &&
-			    (bit_set_count(total_bitmap) > node_lim)) {
-				info("_pick_best_nodes: %u nodes, max is %u",
-				     bit_set_count(avail_bitmap), node_lim);
-				error_code = EINVAL;
-			}
-			if (pick_code == 0)
+		     (bit_super_set(*req_bitmap, total_bitmap)))) {
+			pick_code = _pick_best_quadrics(total_bitmap,
+						*req_bitmap, min_nodes,
+						max_nodes, req_cpus,
+						contiguous);
+			if (pick_code == SLURM_SUCCESS)
 				runable = true;
 		}
 		FREE_NULL_BITMAP(avail_bitmap);
@@ -706,10 +681,11 @@ _add_node_set_info(struct node_set *node_set_ptr,
 */
 int select_nodes(struct job_record *job_ptr, bool test_only)
 {
-	int error_code = SLURM_SUCCESS, i, node_set_size = 0;
+	int error_code = SLURM_SUCCESS, i, shared, node_set_size = 0;
 	bitstr_t *req_bitmap = NULL;
 	struct node_set *node_set_ptr = NULL;
 	struct part_record *part_ptr = job_ptr->part_ptr;
+	uint32_t min_nodes, max_nodes;
 
 	if (job_ptr == NULL)
 		fatal ("select_nodes: job_ptr == NULL");
@@ -718,17 +694,22 @@ int select_nodes(struct job_record *job_ptr, bool test_only)
 	/* insure that partition exists and is up */
 	if (part_ptr == NULL) {
 		part_ptr = find_part_record(job_ptr->partition);
+		if (part_ptr == NULL)
+			fatal("Invalid partition name %s for job %u",
+			      job_ptr->partition, job_ptr->job_id);
 		job_ptr->part_ptr = part_ptr;
 		error("partition pointer reset for job %u, part %s",
 		      job_ptr->job_id, job_ptr->partition);
 	}
-	if (part_ptr == NULL)
-		fatal("Invalid partition name %s for job %u",
-		      job_ptr->partition, job_ptr->job_id);
-	if (part_ptr->state_up == 0)
-		return ESLURM_NODES_BUSY;
 
-	/* get sets of nodes from the configuration list */
+	/* Confirm that partition is up and has compatible nodes limits */
+	if ((part_ptr->state_up == 0) ||
+	    ((job_ptr->details->max_nodes != 0) &&	/* no node limit */
+	     (job_ptr->details->max_nodes < part_ptr->min_nodes)) ||
+	    (job_ptr->details->min_nodes > part_ptr->max_nodes))
+		return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE;
+
+	/* build sets of usable nodes based upon their configuration */
 	error_code = _build_node_list(job_ptr, &node_set_ptr, &node_set_size);
 	if (error_code)
 		return error_code;
@@ -746,14 +727,25 @@ int select_nodes(struct job_record *job_ptr, bool test_only)
 	}
 
 	/* pick the nodes providing a best-fit */
+	min_nodes = MAX(job_ptr->details->min_nodes, part_ptr->min_nodes);
+	if (part_ptr->max_nodes == INFINITE)
+		max_nodes = job_ptr->details->max_nodes;
+	else if (job_ptr->details->max_nodes == 0)
+		max_nodes = part_ptr->max_nodes;
+	else
+		max_nodes = MIN(job_ptr->details->max_nodes,
+				part_ptr->max_nodes);
+	if (part_ptr->shared == SHARED_FORCE)	/* shared=force */
+		shared = 1;
+	else if (part_ptr->shared == SHARED_NO)	/* can't share */
+		shared = 0;
+	else
+		shared = job_ptr->details->shared;
+
 	error_code = _pick_best_nodes(node_set_ptr, node_set_size,
-				      &req_bitmap,
-				      job_ptr->details->num_procs,
-				      job_ptr->details->min_nodes,
-				      job_ptr->details->max_nodes,
-				      job_ptr->details->contiguous,
-				      job_ptr->details->shared,
-				      part_ptr->max_nodes);
+				      &req_bitmap, job_ptr->details->num_procs,
+				      min_nodes, max_nodes,
+				      job_ptr->details->contiguous, shared);
 	if (error_code == EAGAIN) {
 		error_code = ESLURM_NODES_BUSY;
 		goto cleanup;
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index a500d61c847afc9312a05208f12f570aeeea5cca..d3cd32ff2b96ea96ee039cd65459b14e05fa2629 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -68,9 +68,11 @@
 		_X = NULL;	\
 	} while (0)
 #define IS_JOB_FINISHED(_X)	\
-	((_X->job_state & (~JOB_COMPLETING)) > JOB_RUNNING) 
+	((_X->job_state & (~JOB_COMPLETING)) > JOB_RUNNING)
 #define IS_JOB_PENDING(_X)	\
 	((_X->job_state & (~JOB_COMPLETING)) == JOB_PENDING)
+#define MAX(x,y) (((x)>(y))?(x):(y))
+#define MIN(x,y) (((x)<(y))?(x):(y))
 
/*****************************************************************************\
 *  GENERAL CONFIGURATION parameters and data structures