diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c index 91596a6409e7ba814cc9d6c9bbea32f79ff7525a..7e0707ed85ec2f86cc7148e91d9caad6434d3588 100644 --- a/src/slurmctld/agent.c +++ b/src/slurmctld/agent.c @@ -67,12 +67,6 @@ #include "src/slurmctld/agent.h" #include "src/slurmctld/locks.h" -#define FREE_NULL(_X) \ - do { \ - if (_X) xfree (_X); \ - _X = NULL; \ - } while (0) - #if COMMAND_TIMEOUT == 1 # define WDOG_POLL 1 /* secs */ #else diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 43fae4bf5163a04f9c3b031a9393ec04f006da18..1f19fe2a313ec12cbd8bc280f423657fcb0e38c1 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -1411,9 +1411,8 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg) /* return result */ if (error_code) { - info( - "_slurm_rpc_allocate_resources error %d allocating resources, time=%ld", - error_code, (long) (clock() - start_time)); + info("_slurm_rpc_allocate_resources error %d, time=%ld", + error_code, (long) (clock() - start_time)); slurm_send_rc_msg(msg, error_code); } else { info( @@ -1854,8 +1853,7 @@ static void _slurm_rpc_job_step_create(slurm_msg_t * msg) (long) (clock() - start_time)); job_step_resp.job_step_id = step_rec->step_id; - job_step_resp.node_list = - bitmap2node_name(step_rec->node_bitmap); + job_step_resp.node_list = xstrdup(step_rec->step_node_list); job_step_resp.credentials = &step_rec->job_ptr->details->credential; diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 2291ff2c3422fb65f10eb9e74c36529a7db7585a..463fab39065d74ad3e6dd00369b6b7de269f2b16 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -65,23 +65,11 @@ slurm_ssl_key_ctx_t sign_ctx; #define DETAILS_FLAG 0xdddd -#define MAX_STR_PACK 128 +#define MAX_STR_PACK 1024 #define SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0 0 #define STEP_FLAG 0xbbbb #define TOP_PRIORITY 0xffff0000 /* large, but leave headroom for higher */ -#define FREE_NULL(_X) \ - do { \ - if (_X) xfree (_X); 
\ - _X = NULL; \ - } while (0) - -#define FREE_NULL_BITMAP(_X) \ - do { \ - if (_X) bit_free (_X); \ - _X = NULL; \ - } while (0) - #define JOB_HASH_INX(_job_id) (_job_id % MAX_JOB_COUNT) #define YES_OR_NO(_in_string) \ @@ -107,10 +95,11 @@ static int _copy_job_desc_to_file(job_desc_msg_t * job_desc, static int _copy_job_desc_to_job_record(job_desc_msg_t * job_desc, struct job_record **job_ptr, struct part_record *part_ptr, - bitstr_t * req_bitmap); + bitstr_t ** req_bitmap, + bitstr_t ** exc_bitmap); static void _del_batch_list_rec(void *x); static void _delete_job_desc_files(uint32_t job_id); -static void _dump_job_details_state(struct job_details *detail_ptr, +static void _dump_job_details(struct job_details *detail_ptr, Buf buffer); static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer); static void _dump_job_step_state(struct step_record *step_ptr, Buf buffer); @@ -123,6 +112,9 @@ static int _job_create(job_desc_msg_t * job_specs, uint32_t * new_job_id, struct job_record **job_rec_ptr, uid_t submit_uid); static void _list_delete_job(void *job_entry); static int _list_find_job_old(void *job_entry, void *key); +static int _load_job_details(struct job_record *job_ptr, Buf buffer); +static int _load_job_state(Buf buffer); +static int _load_step_state(struct job_record *job_ptr, Buf buffer); static void _pack_job_details(struct job_details *detail_ptr, Buf buffer); static void _read_data_array_from_file(char *file_name, char ***data, uint16_t * size); @@ -134,7 +126,6 @@ static void _signal_job_on_node(uint32_t job_id, uint16_t step_id, int signum, struct node_record *node_ptr); static void _spawn_signal_agent(agent_arg_t *agent_info); static bool _top_priority(struct job_record *job_ptr); -static int _unload_step_state(struct job_record *job_ptr, Buf buffer); static int _validate_job_create_req(job_desc_msg_t * job_desc); static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate); static void _validate_job_files(List batch_dirs); 
@@ -204,16 +195,17 @@ void delete_job_details(struct job_record *job_entry) if (job_entry->details->magic != DETAILS_MAGIC) fatal ("delete_job_details: passed invalid job details pointer"); - FREE_NULL(job_entry->details->credential.node_list); FREE_NULL(job_entry->details->req_nodes); + FREE_NULL(job_entry->details->exc_nodes); FREE_NULL_BITMAP(job_entry->details->req_node_bitmap); + FREE_NULL_BITMAP(job_entry->details->exc_node_bitmap); + FREE_NULL(job_entry->details->credential.node_list); FREE_NULL(job_entry->details->features); FREE_NULL(job_entry->details->err); FREE_NULL(job_entry->details->in); FREE_NULL(job_entry->details->out); FREE_NULL(job_entry->details->work_dir); - xfree(job_entry->details); - job_entry->details = NULL; + FREE_NULL(job_entry->details); } /* _delete_job_desc_files - delete job descriptor related files */ @@ -242,7 +234,7 @@ static void _delete_job_desc_files(uint32_t job_id) xfree(dir_name); } -/* dump_all_job_state - save the state of all jobs to file +/* dump_all_job_state - save the state of all jobs to file for checkpoint * RET 0 or error code */ int dump_all_job_state(void) { @@ -311,6 +303,68 @@ int dump_all_job_state(void) return error_code; } +/* + * load_all_job_state - load the job state from file, recover from last + * checkpoint. Execute this after loading the configuration file data. 
+ * RET 0 or error code + */ +int load_all_job_state(void) +{ + int data_allocated, data_read = 0, error_code = 0; + uint32_t data_size = 0; + int state_fd; + char *data = NULL, *state_file; + Buf buffer; + time_t buf_time; + + /* read the file */ + state_file = xstrdup(slurmctld_conf.state_save_location); + xstrcat(state_file, "/job_state"); + lock_state_files(); + state_fd = open(state_file, O_RDONLY); + if (state_fd < 0) { + info("No job state file (%s) to recover", state_file); + error_code = ENOENT; + } else { + data_allocated = BUF_SIZE; + data = xmalloc(data_allocated); + while ((data_read = + read(state_fd, &data[data_size], + BUF_SIZE)) == BUF_SIZE) { + data_size += data_read; + data_allocated += BUF_SIZE; + xrealloc(data, data_allocated); + } + if (data_read > 0) data_size += data_read; /* data_read is -1 on a failed read; adding it would wrap the unsigned data_size */ + close(state_fd); + if (data_read < 0) + error("Error reading file %s: %m", state_file); + } + xfree(state_file); + unlock_state_files(); + + if (job_id_sequence < 0) + job_id_sequence = slurmctld_conf.first_job_id; + + buffer = create_buf(data, data_size); + safe_unpack_time(&buf_time, buffer); + + while (remaining_buf(buffer) > 0) { + error_code = _load_job_state(buffer); + if (error_code != SLURM_SUCCESS) + goto unpack_error; + } + + free_buf(buffer); + return error_code; + + unpack_error: + error("Incomplete job data checkpoint file"); + error("Job state not completely restored"); + free_buf(buffer); + return SLURM_FAILURE; +} + /* * _dump_job_state - dump the state of a specific job, its details, and * steps to a buffer @@ -348,7 +402,7 @@ static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer) if (detail_ptr->magic != DETAILS_MAGIC) fatal("dump_all_job: job detail integrity is bad"); pack16((uint16_t) DETAILS_FLAG, buffer); - _dump_job_details_state(detail_ptr, buffer); + _dump_job_details(detail_ptr, buffer); } else pack16((uint16_t) 0, buffer); /* no details flag */ @@ -364,411 +418,293 @@ static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer) 
pack16((uint16_t) 0, buffer); /* no step flag */ } -/* - * _dump_job_details_state - dump the state of a specific job details to - * a buffer - * IN detail_ptr - pointer to job details for which information is requested - * IN/OUT buffer - location to store data, pointers automatically advanced - */ -void _dump_job_details_state(struct job_details *detail_ptr, Buf buffer) +/* Unpack a job's state information from a buffer */ +static int _load_job_state(Buf buffer) { - char tmp_str[MAX_STR_PACK]; + uint32_t job_id, user_id, time_limit, priority; + time_t start_time, end_time; + uint16_t job_state, next_step_id, details, batch_flag, step_flag; + uint16_t kill_on_node_fail, kill_on_step_done, name_len; + char *nodes = NULL, *partition = NULL, *name = NULL; + bitstr_t *node_bitmap = NULL; + struct job_record *job_ptr; + struct part_record *part_ptr; + int error_code; - pack_job_credential(&detail_ptr->credential, buffer); + safe_unpack32(&job_id, buffer); + safe_unpack32(&user_id, buffer); + safe_unpack32(&time_limit, buffer); + safe_unpack32(&priority, buffer); - pack32((uint32_t) detail_ptr->num_procs, buffer); - pack32((uint32_t) detail_ptr->min_nodes, buffer); + safe_unpack_time(&start_time, buffer); + safe_unpack_time(&end_time, buffer); - pack16((uint16_t) detail_ptr->shared, buffer); - pack16((uint16_t) detail_ptr->contiguous, buffer); + safe_unpack16(&job_state, buffer); + safe_unpack16(&next_step_id, buffer); + safe_unpack16(&kill_on_node_fail, buffer); + safe_unpack16(&kill_on_step_done, buffer); + safe_unpack16(&batch_flag, buffer); - pack32((uint32_t) detail_ptr->min_procs, buffer); - pack32((uint32_t) detail_ptr->min_memory, buffer); - pack32((uint32_t) detail_ptr->min_tmp_disk, buffer); - pack_time(detail_ptr->submit_time, buffer); - pack32((uint32_t) detail_ptr->total_procs, buffer); + safe_unpackstr_xmalloc(&nodes, &name_len, buffer); + safe_unpackstr_xmalloc(&partition, &name_len, buffer); + safe_unpackstr_xmalloc(&name, &name_len, buffer); - if 
((detail_ptr->req_nodes == NULL) || - (strlen(detail_ptr->req_nodes) < MAX_STR_PACK)) - packstr(detail_ptr->req_nodes, buffer); - else { - strncpy(tmp_str, detail_ptr->req_nodes, MAX_STR_PACK); - tmp_str[MAX_STR_PACK - 1] = (char) NULL; - packstr(tmp_str, buffer); + /* validity test as possible */ + if ((job_state >= JOB_END) || (batch_flag > 1)) { + error("Invalid data for job %u: job_state=%u batch_flag=%u", + job_id, job_state, batch_flag); + goto unpack_error; } - - if (detail_ptr->features == NULL || - strlen(detail_ptr->features) < MAX_STR_PACK) - packstr(detail_ptr->features, buffer); - else { - strncpy(tmp_str, detail_ptr->features, MAX_STR_PACK); - tmp_str[MAX_STR_PACK - 1] = (char) NULL; - packstr(tmp_str, buffer); + if (kill_on_node_fail > 1) { /* each flag is tested on its own so the message names the bad field */ + error("Invalid data for job %u: kill_on_node_fail=%u", + job_id, kill_on_node_fail); + goto unpack_error; } - - if ((detail_ptr->err == NULL) || - (strlen(detail_ptr->err) < MAX_STR_PACK)) - packstr(detail_ptr->err, buffer); - else { - strncpy(tmp_str, detail_ptr->err, MAX_STR_PACK); - tmp_str[MAX_STR_PACK - 1] = (char) NULL; - packstr(tmp_str, buffer); + if (kill_on_step_done > 1) { + error("Invalid data for job %u: kill_on_step_done=%u", + job_id, kill_on_step_done); + goto unpack_error; } - - if ((detail_ptr->in == NULL) || - (strlen(detail_ptr->in) < MAX_STR_PACK)) - packstr(detail_ptr->in, buffer); - else { - strncpy(tmp_str, detail_ptr->in, MAX_STR_PACK); - tmp_str[MAX_STR_PACK - 1] = (char) NULL; - packstr(tmp_str, buffer); + if ((nodes) && (node_name2bitmap(nodes, &node_bitmap))) { + error("_load_job_state: invalid nodes (%s) for job_id %u", + nodes, job_id); + goto unpack_error; } - if (detail_ptr->out == NULL || - strlen(detail_ptr->out) < MAX_STR_PACK) - packstr(detail_ptr->out, buffer); - else { - strncpy(tmp_str, detail_ptr->out, MAX_STR_PACK); - tmp_str[MAX_STR_PACK - 1] = (char) NULL; - packstr(tmp_str, buffer); + job_ptr = 
find_job_record(job_id); + if (job_ptr == NULL) { + part_ptr = list_find_first(part_list, &list_find_part, + partition); + if (part_ptr == NULL) { + info("Invalid partition (%s) for job_id %u", + partition, job_id); + goto unpack_error; + } + job_ptr = create_job_record(&error_code); + if (error_code) { + error("Create job entry failed for job_id %u", + job_id); + goto unpack_error; + } + job_ptr->job_id = job_id; + strncpy(job_ptr->partition, partition, MAX_NAME_LEN); + job_ptr->part_ptr = part_ptr; + _add_job_hash(job_ptr); + } + + if (default_prio >= priority) + default_prio = priority - 1; + if (job_id_sequence <= job_id) + job_id_sequence = job_id + 1; + + safe_unpack16(&details, buffer); + if ((details == DETAILS_FLAG) && + (_load_job_details(job_ptr, buffer))) + goto unpack_error; + + job_ptr->user_id = user_id; + job_ptr->time_limit = time_limit; + job_ptr->priority = priority; + job_ptr->start_time = start_time; + job_ptr->end_time = end_time; + job_ptr->time_last_active = time(NULL); + job_ptr->job_state = job_state; + job_ptr->next_step_id = next_step_id; + strncpy(job_ptr->name, name, MAX_NAME_LEN); + FREE_NULL(name); + job_ptr->nodes = nodes; + nodes = NULL; /* reused, nothing left to free */ + job_ptr->node_bitmap = node_bitmap; node_bitmap = NULL; /* now owned by job_ptr; unpack_error must not free it */ + FREE_NULL(partition); + job_ptr->kill_on_node_fail = kill_on_node_fail; + job_ptr->kill_on_step_done = kill_on_step_done; + job_ptr->batch_flag = batch_flag; + info("recovered job id %u", job_id); + + safe_unpack16(&step_flag, buffer); + while (step_flag == STEP_FLAG) { + if ((error_code = _load_step_state(job_ptr, buffer))) + goto unpack_error; + safe_unpack16(&step_flag, buffer); } - if ((detail_ptr->work_dir == NULL) || - (strlen(detail_ptr->work_dir) < MAX_STR_PACK)) - packstr(detail_ptr->work_dir, buffer); - else { - strncpy(tmp_str, detail_ptr->work_dir, MAX_STR_PACK); - tmp_str[MAX_STR_PACK - 1] = (char) NULL; - packstr(tmp_str, buffer); - } + return SLURM_SUCCESS; + + unpack_error: + FREE_NULL(nodes); + 
FREE_NULL(partition); + FREE_NULL(name); + FREE_NULL_BITMAP(node_bitmap); + return SLURM_FAILURE; } /* - * _dump_job_step_state - dump the state of a specific job step to a buffer - * IN detail_ptr - pointer to job step for which information is requested + * _dump_job_details - dump the state of a specific job details to + * a buffer + * IN detail_ptr - pointer to job details for which information is requested * IN/OUT buffer - location to store data, pointers automatically advanced */ -static void _dump_job_step_state(struct step_record *step_ptr, Buf buffer) +void _dump_job_details(struct job_details *detail_ptr, Buf buffer) { - char *node_list; + pack_job_credential(&detail_ptr->credential, buffer); - pack16((uint16_t) step_ptr->step_id, buffer); - pack16((uint16_t) step_ptr->cyclic_alloc, buffer); - pack32(step_ptr->num_tasks, buffer); - pack_time(step_ptr->start_time, buffer); - node_list = bitmap2node_name(step_ptr->node_bitmap); - packstr(node_list, buffer); - xfree(node_list); -#ifdef HAVE_LIBELAN3 - qsw_pack_jobinfo(step_ptr->qsw_job, buffer); -#endif -} + pack32((uint32_t) detail_ptr->num_procs, buffer); + pack32((uint32_t) detail_ptr->min_nodes, buffer); + pack32((uint32_t) detail_ptr->max_nodes, buffer); + pack32((uint32_t) detail_ptr->total_procs, buffer); -/* - * load_job_state - load the job state from file, recover from last slurmctld - * checkpoint. Execute this after loading the configuration file data. 
- * RET 0 or error code - */ -int load_job_state(void) -{ - int data_allocated, data_read = 0, error_code = 0; - uint32_t data_size = 0; - int state_fd; - char *data = NULL, *state_file; - Buf buffer; - uint32_t job_id, user_id, time_limit, priority, total_procs; - time_t buf_time, start_time, end_time, submit_time; - uint16_t job_state, next_step_id, details; - char *nodes = NULL, *partition = NULL, *name = NULL; - uint32_t num_procs, min_nodes, min_procs, min_memory, min_tmp_disk; - uint16_t shared, contiguous, batch_flag; - uint16_t kill_on_node_fail, kill_on_step_done, name_len; - char *req_nodes = NULL, *features = NULL; - char *err = NULL, *in = NULL, *out = NULL, *work_dir = NULL; - slurm_job_credential_t *credential_ptr = NULL; - struct job_record *job_ptr; - struct part_record *part_ptr; - bitstr_t *node_bitmap = NULL, *req_node_bitmap = NULL; - uint16_t step_flag; + pack16((uint16_t) detail_ptr->shared, buffer); + pack16((uint16_t) detail_ptr->contiguous, buffer); - /* read the file */ - state_file = xstrdup(slurmctld_conf.state_save_location); - xstrcat(state_file, "/job_state"); - lock_state_files(); - state_fd = open(state_file, O_RDONLY); - if (state_fd < 0) { - info("No job state file (%s) to recover", state_file); - error_code = ENOENT; - } else { - data_allocated = BUF_SIZE; - data = xmalloc(data_allocated); - while ((data_read = - read(state_fd, &data[data_size], - BUF_SIZE)) == BUF_SIZE) { - data_size += data_read; - data_allocated += BUF_SIZE; - xrealloc(data, data_allocated); - } - data_size += data_read; - close(state_fd); - if (data_read < 0) - error("Error reading file %s: %m", state_file); - } - xfree(state_file); - unlock_state_files(); + pack32((uint32_t) detail_ptr->min_procs, buffer); + pack32((uint32_t) detail_ptr->min_memory, buffer); + pack32((uint32_t) detail_ptr->min_tmp_disk, buffer); + pack_time(detail_ptr->submit_time, buffer); - if (job_id_sequence < 0) - job_id_sequence = slurmctld_conf.first_job_id; + 
safe_packstr(detail_ptr->req_nodes, MAX_STR_PACK, buffer); + safe_packstr(detail_ptr->exc_nodes, MAX_STR_PACK, buffer); + safe_packstr(detail_ptr->features, MAX_STR_PACK, buffer); - buffer = create_buf(data, data_size); - safe_unpack_time(&buf_time, buffer); + safe_packstr(detail_ptr->err, MAX_STR_PACK, buffer); + safe_packstr(detail_ptr->in, MAX_STR_PACK, buffer); + safe_packstr(detail_ptr->out, MAX_STR_PACK, buffer); + safe_packstr(detail_ptr->work_dir, MAX_STR_PACK, buffer); +} - while (remaining_buf(buffer) > 0) { - safe_unpack32(&job_id, buffer); - safe_unpack32(&user_id, buffer); - safe_unpack32(&time_limit, buffer); - safe_unpack32(&priority, buffer); - - safe_unpack_time(&start_time, buffer); - safe_unpack_time(&end_time, buffer); - - safe_unpack16(&job_state, buffer); - safe_unpack16(&next_step_id, buffer); - safe_unpack16(&kill_on_node_fail, buffer); - safe_unpack16(&kill_on_step_done, buffer); - safe_unpack16(&batch_flag, buffer); - - safe_unpackstr_xmalloc(&nodes, &name_len, buffer); - safe_unpackstr_xmalloc(&partition, &name_len, buffer); - safe_unpackstr_xmalloc(&name, &name_len, buffer); - - /* validity test as possible */ - if ((job_state >= JOB_END) || - (kill_on_node_fail > 1) || - (kill_on_step_done > 1) || - (batch_flag > 1)) { - error - ("Invalid data for job %u: job_state=%u batch_flag=%u kill_on_node_fail=%u kill_on_step_done=%u", - job_id, job_state, batch_flag, - kill_on_node_fail, kill_on_step_done); - error - ("No more job data will be processed from the checkpoint file"); - FREE_NULL(nodes); - FREE_NULL(partition); - FREE_NULL(name); - error_code = EINVAL; - break; - } +/* _load_job_details - Unpack a job details information from buffer */ +static int _load_job_details(struct job_record *job_ptr, Buf buffer) +{ + char *req_nodes = NULL, *exc_nodes = NULL, *features = NULL; + char *err = NULL, *in = NULL, *out = NULL, *work_dir = NULL; + bitstr_t *req_node_bitmap = NULL, *exc_node_bitmap = NULL; + slurm_job_credential_t *credential_ptr = 
NULL; + uint32_t num_procs, min_nodes, max_nodes, min_procs; + uint16_t shared, contiguous, name_len; + uint32_t min_memory, min_tmp_disk, total_procs; + time_t submit_time; - safe_unpack16(&details, buffer); - - if (details == DETAILS_FLAG) { - if (unpack_job_credential(&credential_ptr, buffer)) - goto unpack_error; - - safe_unpack32(&num_procs, buffer); - safe_unpack32(&min_nodes, buffer); - - safe_unpack16(&shared, buffer); - safe_unpack16(&contiguous, buffer); - - safe_unpack32(&min_procs, buffer); - safe_unpack32(&min_memory, buffer); - safe_unpack32(&min_tmp_disk, buffer); - safe_unpack_time(&submit_time, buffer); - safe_unpack32(&total_procs, buffer); - - safe_unpackstr_xmalloc(&req_nodes, &name_len, - buffer); - safe_unpackstr_xmalloc(&features, &name_len, - buffer); - safe_unpackstr_xmalloc(&err, &name_len, buffer); - safe_unpackstr_xmalloc(&in, &name_len, buffer); - safe_unpackstr_xmalloc(&out, &name_len, buffer); - safe_unpackstr_xmalloc(&work_dir, &name_len, - buffer); - - /* validity test as possible */ - if ((shared > 1) || - (contiguous > 1) || (batch_flag > 1)) { - error - ("Invalid data for job %u: shared=%u contiguous=%u", - job_id, shared, contiguous); - error - ("No more job data will be processed from the checkpoint file"); - FREE_NULL(req_nodes); - FREE_NULL(features); - FREE_NULL(err); - FREE_NULL(in); - FREE_NULL(out); - FREE_NULL(work_dir); - error_code = EINVAL; - break; - } - } + /* unpack the job's details from the buffer */ + if (unpack_job_credential(&credential_ptr, buffer)) + goto unpack_error; - if (nodes) { - error_code = node_name2bitmap(nodes, &node_bitmap); - if (error_code) { - error - ("load_job_state: invalid nodes (%s) for job_id %u", - nodes, job_id); - goto cleanup; - } - } - if (req_nodes) { - error_code = - node_name2bitmap(req_nodes, &req_node_bitmap); - if (error_code) { - error - ("load_job_state: invalid req_nodes (%s) for job_id %u", - req_nodes, job_id); - goto cleanup; - } - } + safe_unpack32(&num_procs, buffer); 
+ safe_unpack32(&min_nodes, buffer); + safe_unpack32(&max_nodes, buffer); + safe_unpack32(&total_procs, buffer); - job_ptr = find_job_record(job_id); - if (job_ptr == NULL) { - part_ptr = - list_find_first(part_list, &list_find_part, - partition); - if (part_ptr == NULL) { - info("load_job_state: invalid partition (%s) for job_id %u", - partition, job_id); - error_code = EINVAL; - goto cleanup; - } - job_ptr = create_job_record(&error_code); - if (error_code) { - error - ("load_job_state: unable to create job entry for job_id %u", - job_id); - goto cleanup; - } - job_ptr->job_id = job_id; - strncpy(job_ptr->partition, partition, - MAX_NAME_LEN); - job_ptr->part_ptr = part_ptr; - _add_job_hash(job_ptr); - info("recovered job id %u", job_id); - } + safe_unpack16(&shared, buffer); + safe_unpack16(&contiguous, buffer); - job_ptr->user_id = user_id; - job_ptr->time_limit = time_limit; - job_ptr->priority = priority; - job_ptr->start_time = start_time; - job_ptr->end_time = end_time; - job_ptr->time_last_active = time(NULL); - job_ptr->job_state = job_state; - job_ptr->next_step_id = next_step_id; - strncpy(job_ptr->name, name, MAX_NAME_LEN); - job_ptr->nodes = nodes; - nodes = NULL; - job_ptr->node_bitmap = node_bitmap; - node_bitmap = NULL; - job_ptr->kill_on_node_fail = kill_on_node_fail; - job_ptr->kill_on_step_done = kill_on_step_done; - job_ptr->batch_flag = batch_flag; - build_node_details(job_ptr); - - if (default_prio >= priority) - default_prio = priority - 1; - if (job_id_sequence <= job_id) - job_id_sequence = job_id + 1; - - if (details == DETAILS_FLAG) { - job_ptr->details->num_procs = num_procs; - job_ptr->details->min_nodes = min_nodes; - job_ptr->details->shared = shared; - job_ptr->details->contiguous = contiguous; - job_ptr->details->min_procs = min_procs; - job_ptr->details->min_memory = min_memory; - job_ptr->details->min_tmp_disk = min_tmp_disk; - job_ptr->details->submit_time = submit_time; - job_ptr->details->total_procs = total_procs; - 
job_ptr->details->req_nodes = req_nodes; - req_nodes = NULL; - job_ptr->details->req_node_bitmap = - req_node_bitmap; - req_node_bitmap = NULL; - job_ptr->details->features = features; - features = NULL; - job_ptr->details->err = err; - err = NULL; - job_ptr->details->in = in; - in = NULL; - job_ptr->details->out = out; - out = NULL; - job_ptr->details->work_dir = work_dir; - work_dir = NULL; - memcpy(&job_ptr->details->credential, - credential_ptr, - sizeof(job_ptr->details->credential)); - } + safe_unpack32(&min_procs, buffer); + safe_unpack32(&min_memory, buffer); + safe_unpack32(&min_tmp_disk, buffer); + safe_unpack_time(&submit_time, buffer); - safe_unpack16(&step_flag, buffer); - while (step_flag == STEP_FLAG) { - if ((error_code = _unload_step_state(job_ptr, buffer))) - goto unpack_error; - safe_unpack16(&step_flag, buffer); - } - if (error_code) - break; + safe_unpackstr_xmalloc(&req_nodes, &name_len, buffer); + safe_unpackstr_xmalloc(&exc_nodes, &name_len, buffer); + safe_unpackstr_xmalloc(&features, &name_len, buffer); - cleanup: - FREE_NULL(nodes); - FREE_NULL(partition); - FREE_NULL(name); - FREE_NULL(req_nodes); - FREE_NULL(features); - FREE_NULL(err); - FREE_NULL(in); - FREE_NULL(out); - FREE_NULL(work_dir); - FREE_NULL_BITMAP(node_bitmap); - FREE_NULL_BITMAP(req_node_bitmap); - FREE_NULL(credential_ptr); - } + safe_unpackstr_xmalloc(&err, &name_len, buffer); + safe_unpackstr_xmalloc(&in, &name_len, buffer); + safe_unpackstr_xmalloc(&out, &name_len, buffer); + safe_unpackstr_xmalloc(&work_dir, &name_len, buffer); - free_buf(buffer); - return error_code; + /* validity test as possible */ + if ((shared > 1) || (contiguous > 1)) { + error("Invalid data for job %u: shared=%u contiguous=%u", + job_ptr->job_id, shared, contiguous); + goto unpack_error; + } + if ((req_nodes) && (node_name2bitmap(req_nodes, &req_node_bitmap))) { + error("Invalid req_nodes (%s) for job_id %u", + req_nodes, job_ptr->job_id); + goto unpack_error; + } + if ((exc_nodes) && 
(node_name2bitmap(exc_nodes, &exc_node_bitmap))) { + error("Invalid exc_nodes (%s) for job_id %u", + exc_nodes, job_ptr->job_id); + goto unpack_error; + } + + /* now put the details into the job record */ + memcpy(&job_ptr->details->credential, credential_ptr, + sizeof(job_ptr->details->credential)); + job_ptr->details->num_procs = num_procs; + job_ptr->details->min_nodes = min_nodes; + job_ptr->details->max_nodes = max_nodes; + job_ptr->details->total_procs = total_procs; + job_ptr->details->shared = shared; + job_ptr->details->contiguous = contiguous; + job_ptr->details->min_procs = min_procs; + job_ptr->details->min_memory = min_memory; + job_ptr->details->min_tmp_disk = min_tmp_disk; + job_ptr->details->submit_time = submit_time; + job_ptr->details->req_nodes = req_nodes; + job_ptr->details->req_node_bitmap = req_node_bitmap; + job_ptr->details->exc_nodes = exc_nodes; + job_ptr->details->exc_node_bitmap = exc_node_bitmap; + job_ptr->details->features = features; + job_ptr->details->err = err; + job_ptr->details->in = in; + job_ptr->details->out = out; + job_ptr->details->work_dir = work_dir; + build_node_details(job_ptr); /* set: num_cpu_groups, cpus_per_node, + * cpu_count_reps, node_cnt, and + * node_addr */ + return SLURM_SUCCESS; unpack_error: - error("Incomplete job data checkpoint file"); - error("Job state not completely restored"); - FREE_NULL(nodes); - FREE_NULL(partition); - FREE_NULL(name); FREE_NULL(req_nodes); + FREE_NULL(exc_nodes); + FREE_NULL_BITMAP(req_node_bitmap); + FREE_NULL_BITMAP(exc_node_bitmap); FREE_NULL(features); FREE_NULL(err); FREE_NULL(in); FREE_NULL(out); FREE_NULL(work_dir); - free_buf(buffer); - return EFAULT; + return SLURM_FAILURE; +} + + +/* + * _dump_job_step_state - dump the state of a specific job step to a buffer + * IN detail_ptr - pointer to job step for which information is requested + * IN/OUT buffer - location to store data, pointers automatically advanced + */ +static void _dump_job_step_state(struct step_record 
*step_ptr, Buf buffer) +{ + pack16((uint16_t) step_ptr->step_id, buffer); + pack16((uint16_t) step_ptr->cyclic_alloc, buffer); + pack32(step_ptr->num_tasks, buffer); + pack_time(step_ptr->start_time, buffer); + + safe_packstr(step_ptr->step_node_list, MAX_STR_PACK, buffer); +#ifdef HAVE_LIBELAN3 + qsw_pack_jobinfo(step_ptr->qsw_job, buffer); +#endif } -/* Unpack a job step state information from a buffer */ -static int _unload_step_state(struct job_record *job_ptr, Buf buffer) +/* Unpack job step state information from a buffer */ +static int _load_step_state(struct job_record *job_ptr, Buf buffer) { struct step_record *step_ptr; uint16_t step_id, cyclic_alloc, name_len; uint32_t num_tasks; time_t start_time; - char *node_list = NULL; + char *step_node_list = NULL; safe_unpack16(&step_id, buffer); safe_unpack16(&cyclic_alloc, buffer); safe_unpack32(&num_tasks, buffer); safe_unpack_time(&start_time, buffer); - safe_unpackstr_xmalloc(&node_list, &name_len, buffer); + safe_unpackstr_xmalloc(&step_node_list, &name_len, buffer); /* validity test as possible */ if (cyclic_alloc > 1) { error("Invalid data for job %u.%u: cyclic_alloc=%u", job_ptr->job_id, step_id, cyclic_alloc); - return SLURM_FAILURE; + goto unpack_error; } step_ptr = create_step_record(job_ptr); @@ -778,22 +714,23 @@ static int _unload_step_state(struct job_record *job_ptr, Buf buffer) step_ptr->cyclic_alloc = cyclic_alloc; step_ptr->num_tasks = num_tasks; step_ptr->start_time = start_time; - info("recovered job step %u.%u", job_ptr->job_id, step_id); - if (node_list) { - (void) node_name2bitmap(node_list, &(step_ptr->node_bitmap)); - FREE_NULL(node_list); - } + step_ptr->step_node_list = step_node_list; + if (step_node_list) + (void) node_name2bitmap(step_node_list, + &(step_ptr->step_node_bitmap)); + step_node_list = NULL; /* re-used, nothing left to free */ #ifdef HAVE_LIBELAN3 qsw_alloc_jobinfo(&step_ptr->qsw_job); if (qsw_unpack_jobinfo(step_ptr->qsw_job, buffer)) { 
qsw_free_jobinfo(step_ptr->qsw_job); - return SLURM_FAILURE; + goto unpack_error; } #endif + info("recovered job step %u.%u", job_ptr->job_id, step_id); return SLURM_SUCCESS; unpack_error: - FREE_NULL(node_list); + FREE_NULL(step_node_list); return SLURM_FAILURE; } @@ -897,8 +834,7 @@ int kill_running_job_by_node_name(char *node_name) (struct job_record *) list_next(job_record_iterator))) { if (job_record_point->job_state != JOB_RUNNING) continue; /* job not active */ - if (bit_test(job_record_point->node_bitmap, bit_position) - == 0) + if (!bit_test(job_record_point->node_bitmap, bit_position)) continue; /* job not on this node */ error("Running job_id %u on failed node %s", @@ -915,6 +851,7 @@ int kill_running_job_by_node_name(char *node_name) /* Remove node from this job's list */ _excise_node_from_job(job_record_point, node_record_point); + make_node_idle(node_record_point); } } @@ -1292,7 +1229,7 @@ static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id, { int error_code, i; struct part_record *part_ptr; - bitstr_t *req_bitmap = NULL; + bitstr_t *req_bitmap = NULL, *exc_bitmap = NULL; if ((error_code = _validate_job_desc(job_desc, allocate))) return error_code; @@ -1343,8 +1280,10 @@ static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id, if (job_desc->req_nodes) { error_code = node_name2bitmap(job_desc->req_nodes, &req_bitmap); - if (error_code == EINVAL) + if (error_code == EINVAL) { + error_code = ESLURM_INVALID_NODE_NAME; goto cleanup; + } if (error_code != 0) { error_code = EAGAIN; /* no memory */ goto cleanup; @@ -1365,8 +1304,34 @@ static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id, if (i > job_desc->min_nodes) job_desc->min_nodes = i; } + if (job_desc->exc_nodes) { + error_code = + node_name2bitmap(job_desc->exc_nodes, &exc_bitmap); + if (error_code == EINVAL) { + error_code = ESLURM_INVALID_NODE_NAME; + goto cleanup; + } + } + if ((exc_bitmap != NULL) && (req_bitmap != NULL)) { + bitstr_t 
*tmp_bitmap = NULL; + int first_set; /* int, not bool: holds the bit_ffs() result compared against -1 below */ + tmp_bitmap = bit_copy(exc_bitmap); + bit_and(tmp_bitmap, req_bitmap); + first_set = bit_ffs(tmp_bitmap); + FREE_NULL_BITMAP(tmp_bitmap); + if (first_set != -1) { + info("Job's required and excluded node lists overlap"); + error_code = ESLURM_INVALID_NODE_NAME; + goto cleanup; + } + } + + if (job_desc->min_nodes == NO_VAL) + job_desc->min_nodes = 1; + if (job_desc->max_nodes == NO_VAL) + job_desc->max_nodes = 0; if (job_desc->min_nodes > part_ptr->total_cpus) { - info("_job_create: too many cpus (%d) requested of partition %s(%d)", + info("Job requested too many cpus (%d) of partition %s(%d)", job_desc->min_nodes, part_ptr->name, part_ptr->total_cpus); error_code = ESLURM_TOO_MANY_REQUESTED_CPUS; @@ -1378,11 +1343,20 @@ static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id, i = part_ptr->max_nodes; else i = part_ptr->total_nodes; - info("_job_create: too many nodes (%d) requested of partition %s(%d)", + info("Job requested too many nodes (%d) of partition %s(%d)", job_desc->min_nodes, part_ptr->name, i); error_code = ESLURM_TOO_MANY_REQUESTED_NODES; goto cleanup; } + if (job_desc->max_nodes && + (job_desc->max_nodes < job_desc->min_nodes)) { + info("Job's max_nodes < min_nodes"); + error_code = ESLURM_TOO_MANY_REQUESTED_CPUS; + goto cleanup; + } + if (job_desc->max_nodes > part_ptr->max_nodes) + job_desc->max_nodes = part_ptr->max_nodes; + if ((error_code =_validate_job_create_req(job_desc))) goto cleanup; @@ -1395,7 +1369,8 @@ static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id, if ((error_code = _copy_job_desc_to_job_record(job_desc, job_rec_ptr, part_ptr, - req_bitmap))) { + &req_bitmap, + &exc_bitmap))) { error_code = ESLURM_ERROR_ON_DESC_TO_RECORD_COPY; goto cleanup; } @@ -1423,6 +1398,7 @@ static int _job_create(job_desc_msg_t * job_desc, uint32_t * new_job_id, cleanup: FREE_NULL_BITMAP(req_bitmap); + FREE_NULL_BITMAP(exc_bitmap); return error_code; } @@ -1736,7 +1712,8 @@ static int 
_copy_job_desc_to_job_record(job_desc_msg_t * job_desc, struct job_record **job_rec_ptr, struct part_record *part_ptr, - bitstr_t * req_bitmap) + bitstr_t ** req_bitmap, + bitstr_t ** exc_bitmap) { int error_code; struct job_details *detail_ptr; @@ -1772,9 +1749,16 @@ _copy_job_desc_to_job_record(job_desc_msg_t * job_desc, detail_ptr = job_ptr->details; detail_ptr->num_procs = job_desc->num_procs; detail_ptr->min_nodes = job_desc->min_nodes; + detail_ptr->max_nodes = job_desc->max_nodes; if (job_desc->req_nodes) { detail_ptr->req_nodes = xstrdup(job_desc->req_nodes); - detail_ptr->req_node_bitmap = req_bitmap; + detail_ptr->req_node_bitmap = *req_bitmap; + *req_bitmap = NULL; /* Reused nothing left to free */ + } + if (job_desc->exc_nodes) { + detail_ptr->exc_nodes = xstrdup(job_desc->exc_nodes); + detail_ptr->exc_node_bitmap = *exc_bitmap; + *exc_bitmap = NULL; /* Reused nothing left to free */ } if (job_desc->features) detail_ptr->features = xstrdup(job_desc->features); @@ -1904,11 +1888,11 @@ static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate) job_desc_msg->name); return ESLURM_JOB_NAME_TOO_LONG; } - if (job_desc_msg->contiguous == NO_VAL) + if (job_desc_msg->contiguous == (uint16_t) NO_VAL) job_desc_msg->contiguous = 0; - if (job_desc_msg->kill_on_node_fail == NO_VAL) + if (job_desc_msg->kill_on_node_fail == (uint16_t) NO_VAL) job_desc_msg->kill_on_node_fail = 1; - if (job_desc_msg->shared == NO_VAL) + if (job_desc_msg->shared == (uint16_t) NO_VAL) job_desc_msg->shared = 0; if ((job_desc_msg->job_id != NO_VAL) && @@ -1925,7 +1909,7 @@ static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate) job_desc_msg->min_memory = 1; /* default 1 MB mem per node */ if (job_desc_msg->min_tmp_disk == NO_VAL) job_desc_msg->min_tmp_disk = 1; /* default 1 MB disk per node */ - if (job_desc_msg->shared == NO_VAL) + if (job_desc_msg->shared == (uint16_t) NO_VAL) job_desc_msg->shared = 0; /* default not shared nodes */ if 
(job_desc_msg->min_procs == NO_VAL) job_desc_msg->min_procs = 1; /* default 1 cpu per node */ @@ -2070,7 +2054,6 @@ pack_all_jobs(char **buffer_ptr, int *buffer_size) */ void pack_job(struct job_record *dump_job_ptr, Buf buffer) { - char tmp_str[MAX_STR_PACK]; struct job_details *detail_ptr; pack32(dump_job_ptr->job_id, buffer); @@ -2087,12 +2070,8 @@ void pack_job(struct job_record *dump_job_ptr, Buf buffer) packstr(dump_job_ptr->nodes, buffer); packstr(dump_job_ptr->partition, buffer); packstr(dump_job_ptr->name, buffer); - if (dump_job_ptr->node_bitmap) { - (void) bit_fmt(tmp_str, MAX_STR_PACK, - dump_job_ptr->node_bitmap); - packstr(tmp_str, buffer); - } else - packstr(NULL, buffer); + safe_pack_bit_fmt(dump_job_ptr->node_bitmap, + MAX_STR_PACK, buffer); detail_ptr = dump_job_ptr->details; if (detail_ptr && dump_job_ptr->job_state == JOB_PENDING) @@ -2101,10 +2080,10 @@ void pack_job(struct job_record *dump_job_ptr, Buf buffer) _pack_job_details(NULL, buffer); } +/* pack job details for "get_job_info" RPC */ static void _pack_job_details(struct job_details *detail_ptr, Buf buffer) { if (detail_ptr) { - char tmp_str[MAX_STR_PACK]; pack32((uint32_t) detail_ptr->num_procs, buffer); pack32((uint32_t) detail_ptr->min_nodes, buffer); pack16((uint16_t) detail_ptr->shared, buffer); @@ -2114,32 +2093,10 @@ static void _pack_job_details(struct job_details *detail_ptr, Buf buffer) pack32((uint32_t) detail_ptr->min_memory, buffer); pack32((uint32_t) detail_ptr->min_tmp_disk, buffer); - if ((detail_ptr->req_nodes == NULL) || - (strlen(detail_ptr->req_nodes) < MAX_STR_PACK)) - packstr(detail_ptr->req_nodes, buffer); - else { - strncpy(tmp_str, detail_ptr->req_nodes, - MAX_STR_PACK); - tmp_str[MAX_STR_PACK - 1] = (char) NULL; - packstr(tmp_str, buffer); - } - - if (detail_ptr->req_node_bitmap) { - (void) bit_fmt(tmp_str, MAX_STR_PACK, - detail_ptr->req_node_bitmap); - packstr(tmp_str, buffer); - } else - packstr(NULL, buffer); - - if (detail_ptr->features == NULL || - 
strlen(detail_ptr->features) < MAX_STR_PACK) - packstr(detail_ptr->features, buffer); - else { - strncpy(tmp_str, detail_ptr->features, - MAX_STR_PACK); - tmp_str[MAX_STR_PACK - 1] = (char) NULL; - packstr(tmp_str, buffer); - } + safe_packstr(detail_ptr->req_nodes, MAX_STR_PACK, buffer); + safe_pack_bit_fmt(detail_ptr->req_node_bitmap, + MAX_STR_PACK, buffer); + safe_packstr(detail_ptr->features, MAX_STR_PACK, buffer); } else { @@ -2157,6 +2114,7 @@ static void _pack_job_details(struct job_details *detail_ptr, Buf buffer) packstr(NULL, buffer); } } + /* * purge_old_job - purge old job records. * the jobs must have completed at least MIN_JOB_AGE minutes ago @@ -2214,6 +2172,12 @@ void reset_job_bitmaps(void) req_nodes, &job_record_point->details-> req_node_bitmap); + FREE_NULL_BITMAP(job_record_point->details->exc_node_bitmap); + if (job_record_point->details->exc_nodes) + node_name2bitmap(job_record_point->details-> + exc_nodes, + &job_record_point->details-> + exc_node_bitmap); } list_iterator_destroy(job_record_iterator); diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 3be7b5540a03ac5e45881aecd5f9a406755144d6..8e07c28de4f52548b7448bb902e3e3caf6c95520 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -412,11 +412,11 @@ _dump_node_state (struct node_record *dump_node_ptr, Buf buffer) } /* - * load_node_state - load the node state from file, recover from slurmctld + * load_all_node_state - load the node state from file, recover on slurmctld * restart. execute this after loading the configuration file data. 
* data goes into common storage */ -int load_node_state ( void ) +int load_all_node_state ( void ) { char *node_name, *data = NULL, *state_file; int data_allocated, data_read = 0, error_code = 0; diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index aa6348c660c46f888908d9b69c70bb73560b2db3..a230b11a67ecd6d5246d19eecc4ea92f9722b34b 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -53,15 +53,27 @@ struct node_set { /* set of nodes with same configuration */ bitstr_t *my_bitmap; }; +static void _add_node_set_info(struct node_set *node_set_ptr, + bitstr_t ** node_bitmap, + int *node_cnt, int *cpu_cnt); +static int _build_node_list(struct job_record *job_ptr, + struct node_set **node_set_pptr, + int *node_set_size); +static void _filter_nodes_in_set(struct node_set *node_set_ptr, + struct job_details *detail_ptr); static int _match_feature(char *seek, char *available); +static int _nodes_in_sets(bitstr_t *req_bitmap, + struct node_set * node_set_ptr, + int node_set_size); static int _pick_best_quadrics(bitstr_t * bitmap, bitstr_t * req_bitmap, - int req_nodes, int req_cpus, + int min_nodes, int max_nodes, int req_cpus, int consecutive); static int _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, bitstr_t ** req_bitmap, - uint32_t req_cpus, uint32_t req_nodes, + uint32_t req_cpus, + uint32_t min_nodes, uint32_t max_nodes, int contiguous, int shared, - uint32_t max_nodes); + uint32_t node_lim); static int _valid_features(char *requested, char *available); @@ -124,7 +136,6 @@ void deallocate_nodes(struct job_record *job_ptr) pthread_attr_t attr_agent; pthread_t thread_agent; int buf_rec_size = 0; - uint16_t no_resp_flag, base_state; agent_args = xmalloc(sizeof(agent_arg_t)); agent_args->msg_type = REQUEST_REVOKE_JOB_CREDENTIAL; @@ -154,23 +165,15 @@ void deallocate_nodes(struct job_record *job_ptr) node_names[MAX_NAME_LEN * agent_args->node_count], node_record_table_ptr[i].name, 
MAX_NAME_LEN); agent_args->node_count++; - base_state = - node_record_table_ptr[i]. - node_state & (~NODE_STATE_NO_RESPOND); - no_resp_flag = - node_record_table_ptr[i]. - node_state & NODE_STATE_NO_RESPOND; - if (base_state == NODE_STATE_DRAINING) { - node_record_table_ptr[i].node_state = - NODE_STATE_DRAINED; - bit_clear(idle_node_bitmap, i); - bit_clear(up_node_bitmap, i); - } else { - node_record_table_ptr[i].node_state = - NODE_STATE_IDLE | no_resp_flag; - if (no_resp_flag == 0) - bit_set(idle_node_bitmap, i); - } + make_node_idle(&node_record_table_ptr[i]); + } + + if (agent_args->node_count == 0) { + error("Job %u allocated no nodes on for credential revoke", + job_ptr->job_id); + xfree(revoke_job_cred); + xfree(agent_args); + return; } agent_args->msg_args = revoke_job_cred; @@ -197,6 +200,25 @@ void deallocate_nodes(struct job_record *job_ptr) } +/* make_node_idle - flag specified node as no longer being in use */ +void make_node_idle(struct node_record *node_ptr) +{ + int inx = node_ptr - node_record_table_ptr; + uint16_t no_resp_flag, base_state; + + base_state = node_ptr->node_state & (~NODE_STATE_NO_RESPOND); + no_resp_flag = node_ptr->node_state & NODE_STATE_NO_RESPOND; + if (base_state == NODE_STATE_DRAINING) { + node_ptr->node_state = NODE_STATE_DRAINED; + bit_clear(idle_node_bitmap, inx); + bit_clear(up_node_bitmap, inx); + } else { + node_ptr->node_state = NODE_STATE_IDLE | no_resp_flag; + if (no_resp_flag == 0) + bit_set(idle_node_bitmap, inx); + } +} + /* * _match_feature - determine if the desired feature is one of those available * IN seek - desired feature @@ -240,7 +262,8 @@ static int _match_feature(char *seek, char *available) * IN/OUT bitmap - usable nodes are set on input, nodes not required to * satisfy the request are cleared, other left set * IN req_bitmap - map of required nodes - * IN req_nodes - count of required nodes + * IN min_nodes - minimum count of nodes + * IN max_nodes - maximum count of nodes (0==don't care) * IN req_cpus 
- count of required processors * IN consecutive - allocated nodes must be consecutive if set * RET zero on success, EINVAL otherwise @@ -251,9 +274,10 @@ static int _match_feature(char *seek, char *available) */ static int _pick_best_quadrics(bitstr_t * bitmap, bitstr_t * req_bitmap, - int req_nodes, int req_cpus, int consecutive) + int min_nodes, int max_nodes, + int req_cpus, int consecutive) { - int i, index, error_code, sufficient; + int i, index, error_code = EINVAL, sufficient; int *consec_nodes; /* how many nodes we can add from this * consecutive set of nodes */ int *consec_cpus; /* how many nodes we can add from this @@ -270,21 +294,23 @@ _pick_best_quadrics(bitstr_t * bitmap, bitstr_t * req_bitmap, if (bitmap == NULL) fatal("_pick_best_quadrics: bitmap pointer is NULL"); - error_code = EINVAL; /* default is no fit */ consec_index = 0; - consec_size = 50; /* start allocation for 50 sets of + consec_size = 50; /* start allocation for 50 sets of * consecutive nodes */ - consec_cpus = xmalloc(sizeof(int) * consec_size); + consec_cpus = xmalloc(sizeof(int) * consec_size); consec_nodes = xmalloc(sizeof(int) * consec_size); consec_start = xmalloc(sizeof(int) * consec_size); - consec_end = xmalloc(sizeof(int) * consec_size); - consec_req = xmalloc(sizeof(int) * consec_size); + consec_end = xmalloc(sizeof(int) * consec_size); + consec_req = xmalloc(sizeof(int) * consec_size); /* Build table with information about sets of consecutive nodes */ consec_cpus[consec_index] = consec_nodes[consec_index] = 0; consec_req[consec_index] = -1; /* no required nodes here by default */ rem_cpus = req_cpus; - rem_nodes = req_nodes; + if (max_nodes) + rem_nodes = max_nodes; + else + rem_nodes = min_nodes; for (index = 0; index < node_record_count; index++) { if (bit_test(bitmap, index)) { if (consec_nodes[consec_index] == 0) @@ -423,39 +449,38 @@ _pick_best_quadrics(bitstr_t * bitmap, bitstr_t * req_bitmap, } } if ((rem_nodes <= 0) && (rem_cpus <= 0)) { - error_code = 0; + 
error_code = SLURM_SUCCESS; break; } consec_cpus[best_fit_location] = 0; consec_nodes[best_fit_location] = 0; } + if (error_code && (rem_cpus <= 0) && + max_nodes && ((max_nodes - rem_nodes) >= min_nodes)) + error_code = SLURM_SUCCESS; - if (consec_cpus) - xfree(consec_cpus); - if (consec_nodes) - xfree(consec_nodes); - if (consec_start) - xfree(consec_start); - if (consec_end) - xfree(consec_end); - if (consec_req) - xfree(consec_req); + FREE_NULL(consec_cpus); + FREE_NULL(consec_nodes); + FREE_NULL(consec_start); + FREE_NULL(consec_end); + FREE_NULL(consec_req); return error_code; } /* - * _pick_best_nodes - from a weigh order table of all nodes satisfying a + * _pick_best_nodes - from a weigh order list of all nodes satisfying a * job's specifications, select the "best" for use * IN node_set_ptr - pointer to node specification information * IN node_set_size - number of entries in records pointed to by node_set_ptr * IN/OUT req_bitmap - pointer to bitmap of specific nodes required by the * job, could be NULL, returns bitmap of selected nodes, must xfree * IN req_cpus - count of cpus required by the job - * IN req_nodes - count of nodes required by the job + * IN min_nodes - minimum count of nodes required by the job + * IN max_nodes - maximum count of nodes required by the job (0==no limit) * IN contiguous - 1 if allocated nodes must be contiguous, 0 otherwise * IN shared - set to 1 if nodes may be shared, 0 otherwise - * IN max_nodes - maximum number of nodes permitted for job, + * IN node_lim - maximum number of nodes permitted for job, * INFIITE for no limit (partition limit) * RET 0 on success, EAGAIN if request can not be satisfied now, EINVAL if * request can never be satisfied (insufficient contiguous nodes) @@ -464,7 +489,7 @@ _pick_best_quadrics(bitstr_t * bitmap, bitstr_t * req_bitmap, * 1) If required node list is specified, determine implicitly required * processor and node count * 2) Determine how many disjoint required "features" are represented - * 
(e.g. "FS1|FS2") + * (e.g. "FS1|FS2|FS3") * 3) For each feature: find matching node table entries, identify nodes * that are up and available (idle or shared) and add them to a bit * map, call _pick_best_quadrics() to select the "best" of those @@ -476,47 +501,49 @@ _pick_best_quadrics(bitstr_t * bitmap, bitstr_t * req_bitmap, static int _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, bitstr_t ** req_bitmap, uint32_t req_cpus, - uint32_t req_nodes, int contiguous, int shared, - uint32_t max_nodes) + uint32_t min_nodes, uint32_t max_nodes, + int contiguous, int shared, uint32_t node_lim) { - int error_code, i, j, pick_code; - int total_nodes, total_cpus; /* total resources configured in - partition */ - int avail_nodes, avail_cpus; /* resources available for use now */ - bitstr_t *avail_bitmap, *total_bitmap; + int error_code = SLURM_SUCCESS, i, j, pick_code; + int total_nodes = 0, total_cpus = 0; /* total resources configured + * in partition */ + int avail_nodes = 0, avail_cpus = 0; /* resources available for + * use now */ + bitstr_t *avail_bitmap = NULL, *total_bitmap = NULL; int max_feature, min_feature; - int avail_set, total_set, runable; + bool runable = false; if (node_set_size == 0) { info("_pick_best_nodes: empty node set for selection"); return EINVAL; } - if ((max_nodes != INFINITE) && (req_nodes > max_nodes)) { - info("_pick_best_nodes: more nodes required than partition limit"); - return EINVAL; + if (node_lim != INFINITE) { + if (min_nodes > node_lim) { + info("_pick_best_nodes: exceed partition node limit"); + return EINVAL; + } + if (max_nodes > node_lim) + max_nodes = node_lim; } - error_code = 0; - avail_bitmap = total_bitmap = NULL; - avail_nodes = avail_cpus = 0; - total_nodes = total_cpus = 0; - if (req_bitmap[0]) { /* specific nodes required */ + + if (*req_bitmap) { /* specific nodes required */ /* we have already confirmed that all of these nodes have a * usable configuration and are in the proper partition */ - if 
(req_nodes != 0) - total_nodes = bit_set_count(req_bitmap[0]); + if (min_nodes != 0) + total_nodes = bit_set_count(*req_bitmap); if (req_cpus != 0) - total_cpus = count_cpus(req_bitmap[0]); - if (total_nodes > max_nodes) { - info("_pick_best_nodes: more nodes required than partition limit"); + total_cpus = count_cpus(*req_bitmap); + if (total_nodes > node_lim) { + info("_pick_best_nodes: exceed partition node limit"); return EINVAL; } - if ((req_nodes <= total_nodes) && (req_cpus <= total_cpus)) { - if (bit_super_set(req_bitmap[0], up_node_bitmap) != - 1) + if ((min_nodes <= total_nodes) && + (max_nodes <= min_nodes ) && + (req_cpus <= total_cpus )) { + if (!bit_super_set(*req_bitmap, up_node_bitmap)) return EAGAIN; - if ((shared != 1) && - (bit_super_set(req_bitmap[0], idle_node_bitmap) - != 1)) + if ((!shared) && + (!bit_super_set(*req_bitmap, idle_node_bitmap))) return EAGAIN; return SLURM_SUCCESS; /* user can have selected * nodes, we're done! */ @@ -533,120 +560,124 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, min_feature = node_set_ptr[i].feature; } - runable = 0; /* assume not runable until proven otherwise */ for (j = min_feature; j <= max_feature; j++) { - avail_set = total_set = 0; for (i = 0; i < node_set_size; i++) { if (node_set_ptr[i].feature != j) continue; - if (runable == 0) { - if (total_set) - bit_or(total_bitmap, - node_set_ptr[i].my_bitmap); - else { - total_bitmap = - bit_copy(node_set_ptr[i]. 
- my_bitmap); - if (total_bitmap == NULL) - fatal - ("bit_copy failed to allocate memory"); - total_set = 1; - } - total_nodes += node_set_ptr[i].nodes; - total_cpus += - (node_set_ptr[i].nodes * - node_set_ptr[i].cpus_per_node); - } + if (!runable) + _add_node_set_info(&node_set_ptr[i], + &total_bitmap, + &total_nodes, &total_cpus); bit_and(node_set_ptr[i].my_bitmap, up_node_bitmap); - if (shared != 1) + if (!shared) bit_and(node_set_ptr[i].my_bitmap, idle_node_bitmap); node_set_ptr[i].nodes = bit_set_count(node_set_ptr[i].my_bitmap); - if (avail_set) - bit_or(avail_bitmap, - node_set_ptr[i].my_bitmap); - else { - avail_bitmap = - bit_copy(node_set_ptr[i].my_bitmap); - if (avail_bitmap == NULL) - fatal - ("bit_copy memory allocation failure"); - avail_set = 1; - } - avail_nodes += node_set_ptr[i].nodes; - avail_cpus += - (node_set_ptr[i].nodes * - node_set_ptr[i].cpus_per_node); - if ((req_bitmap[0]) - && (bit_super_set(req_bitmap[0], avail_bitmap) - == 0)) + _add_node_set_info(&node_set_ptr[i], &avail_bitmap, + &avail_nodes, &avail_cpus); + if ((*req_bitmap) && + (!bit_super_set(*req_bitmap, avail_bitmap))) continue; - if (avail_nodes < req_nodes) - continue; - if (avail_cpus < req_cpus) + if ((avail_nodes < min_nodes) || + (avail_cpus < req_cpus) || + ((max_nodes > min_nodes) && + (avail_nodes < max_nodes))) continue; pick_code = - _pick_best_quadrics(avail_bitmap, - req_bitmap[0], req_nodes, + _pick_best_quadrics(avail_bitmap, *req_bitmap, + min_nodes, max_nodes, req_cpus, contiguous); - if ((pick_code == 0) && (max_nodes != INFINITE) - && (bit_set_count(avail_bitmap) > max_nodes)) { - info("_pick_best_nodes: too many nodes selected %u partition maximum is %u", - bit_set_count(avail_bitmap), max_nodes); + if ((pick_code == 0) && (node_lim != INFINITE) && + (bit_set_count(avail_bitmap) > node_lim)) { + info("_pick_best_nodes: %u nodes, max is %u", + bit_set_count(avail_bitmap), node_lim); error_code = EINVAL; break; } if (pick_code == 0) { - if (total_bitmap) - 
bit_free(total_bitmap); - if (req_bitmap[0]) - bit_free(req_bitmap[0]); - req_bitmap[0] = avail_bitmap; + FREE_NULL_BITMAP(total_bitmap); + FREE_NULL_BITMAP(*req_bitmap); + *req_bitmap = avail_bitmap; return SLURM_SUCCESS; } } - /* determine if job could possibly run (if configured - * nodes all available) */ - if ((error_code == 0) && (runable == 0) && - (total_nodes >= req_nodes) && (total_cpus >= req_cpus) - && ((req_bitmap[0] == NULL) - || (bit_super_set(req_bitmap[0], total_bitmap) == - 1)) && ((max_nodes == INFINITE) - || (req_nodes <= max_nodes))) { + /* try to get max_nodes now for this feature */ + if ((max_nodes > min_nodes) && + (avail_nodes < max_nodes)) { pick_code = - _pick_best_quadrics(total_bitmap, - req_bitmap[0], req_nodes, + _pick_best_quadrics(avail_bitmap, *req_bitmap, + min_nodes, max_nodes, req_cpus, contiguous); - if ((pick_code == 0) && (max_nodes != INFINITE) - && (bit_set_count(total_bitmap) > max_nodes)) { + if (pick_code == 0) { + FREE_NULL_BITMAP(total_bitmap); + FREE_NULL_BITMAP(*req_bitmap); + *req_bitmap = avail_bitmap; + return SLURM_SUCCESS; + } + } + + /* determine if job could possibly run (if all configured + * nodes available) */ + if ((error_code == 0) && (!runable) && + (total_nodes >= min_nodes) && (total_cpus >= req_cpus) && + ((*req_bitmap == NULL) || + (bit_super_set(*req_bitmap, total_bitmap))) && + ((node_lim == INFINITE) || (min_nodes <= node_lim))) { + pick_code = + _pick_best_quadrics(total_bitmap, *req_bitmap, + min_nodes, 0, + req_cpus, contiguous); + if ((pick_code == 0) && (node_lim != INFINITE) && + (bit_set_count(total_bitmap) > node_lim)) { + info("_pick_best_nodes: %u nodes, max is %u", + bit_set_count(avail_bitmap), node_lim); error_code = EINVAL; - info("_pick_best_nodes: %u nodes selected, max is %u", - bit_set_count(avail_bitmap), max_nodes); } if (pick_code == 0) - runable = 1; + runable = true; } - if (avail_bitmap) - bit_free(avail_bitmap); - if (total_bitmap) - bit_free(total_bitmap); - avail_bitmap 
= total_bitmap = NULL; + FREE_NULL_BITMAP(avail_bitmap); + FREE_NULL_BITMAP(total_bitmap); if (error_code != 0) break; } - if (runable == 0) { + if (!runable) { error_code = EINVAL; info("_pick_best_nodes: job never runnable"); } - if (error_code == 0) + if (error_code == SLURM_SUCCESS) error_code = EAGAIN; return error_code; } +/* + * _add_node_set_info - add info in node_set_ptr to + * IN node_set_ptr - node set info + * IN/OUT node_bitmap - add nodes in set to this bitmap + * IN/OUT node_cnt - add count of nodes in set to this total + * IN/OUT cpu_cnt - add count of cpus in set to this total + */ +static void +_add_node_set_info(struct node_set *node_set_ptr, + bitstr_t ** node_bitmap, + int *node_cnt, int *cpu_cnt) +{ + if (*node_bitmap) + bit_or(*node_bitmap, node_set_ptr->my_bitmap); + else { + *node_bitmap = bit_copy(node_set_ptr->my_bitmap); + if (*node_bitmap == NULL) + fatal("bit_copy malloc"); + } + *node_cnt += node_set_ptr->nodes; + *cpu_cnt += node_set_ptr->nodes * + node_set_ptr->cpus_per_node; +} + /* * select_nodes - select and allocate nodes to a specific job * IN job_ptr - pointer to the job record @@ -666,19 +697,10 @@ _pick_best_nodes(struct node_set *node_set_ptr, int node_set_size, */ int select_nodes(struct job_record *job_ptr, bool test_only) { - int error_code, i, node_set_index, node_set_size = 0; - bitstr_t *req_bitmap, *scratch_bitmap; - ListIterator config_record_iterator; - struct config_record *config_record_point; - struct node_set *node_set_ptr; - struct part_record *part_ptr; - int tmp_feature, check_node_config; - - error_code = SLURM_SUCCESS; - req_bitmap = scratch_bitmap = NULL; - config_record_iterator = (ListIterator) NULL; - node_set_ptr = NULL; - part_ptr = NULL; + int error_code = SLURM_SUCCESS, i, node_set_size = 0; + bitstr_t *req_bitmap = NULL; + struct node_set *node_set_ptr = NULL; + struct part_record *part_ptr = job_ptr->part_ptr; if (job_ptr == NULL) fatal("select_nodes: NULL job pointer value"); @@ -686,155 
+708,41 @@ int select_nodes(struct job_record *job_ptr, bool test_only) fatal("select_nodes: bad job pointer value"); /* insure that partition exists and is up */ - part_ptr = find_part_record(job_ptr->partition); + if (part_ptr == NULL) { + part_ptr = find_part_record(job_ptr->partition); + job_ptr->part_ptr = part_ptr; + error("partition pointer reset for job %u, part %s", + job_ptr->job_id, job_ptr->partition); + } if (part_ptr == NULL) - fatal("select_nodes: invalid partition name %s for job %u", + fatal("Invalid partition name %s for job %u", job_ptr->partition, job_ptr->job_id); if (part_ptr->state_up == 0) return ESLURM_NODES_BUSY; - /* pick up nodes from the weight ordered configuration list */ - node_set_index = 0; - node_set_size = 0; - node_set_ptr = - (struct node_set *) xmalloc(sizeof(struct node_set)); - node_set_ptr[node_set_size++].my_bitmap = NULL; - if (job_ptr->details->req_node_bitmap) /* insure selected nodes in - this partition */ - req_bitmap = bit_copy(job_ptr->details->req_node_bitmap); - - config_record_iterator = list_iterator_create(config_list); - if (config_record_iterator == NULL) - fatal - ("select_nodes: ListIterator_create unable to allocate memory"); - - while ((config_record_point = (struct config_record *) - list_next(config_record_iterator))) { - - tmp_feature = _valid_features(job_ptr->details->features, - config_record_point-> - feature); - if (tmp_feature == 0) - continue; - - /* since nodes can register with more resources than defined */ - /* in the configuration, we want to use those higher values */ - /* for scheduling, but only as needed */ - if (slurmctld_conf.fast_schedule) - check_node_config = 0; - else if ((job_ptr->details->min_procs > - config_record_point->cpus) - || (job_ptr->details->min_memory > - config_record_point->real_memory) - || (job_ptr->details->min_tmp_disk > - config_record_point->tmp_disk)) { - check_node_config = 1; - } else - check_node_config = 0; - - node_set_ptr[node_set_index].my_bitmap = - 
bit_copy(config_record_point->node_bitmap); - if (node_set_ptr[node_set_index].my_bitmap == NULL) - fatal("bit_copy memory allocation failure"); - bit_and(node_set_ptr[node_set_index].my_bitmap, - part_ptr->node_bitmap); - node_set_ptr[node_set_index].nodes = - bit_set_count(node_set_ptr[node_set_index].my_bitmap); - - /* check configuration of individual nodes only if the check */ - /* of baseline values in the configuration file are too low. */ - /* this will slow the scheduling for very large cluster. */ - if (check_node_config - && (node_set_ptr[node_set_index].nodes != 0)) { - for (i = 0; i < node_record_count; i++) { - if (bit_test - (node_set_ptr[node_set_index]. - my_bitmap, i) == 0) - continue; - if ((job_ptr->details->min_procs <= - node_record_table_ptr[i].cpus) - && (job_ptr->details->min_memory <= - node_record_table_ptr[i]. - real_memory) - && (job_ptr->details->min_tmp_disk <= - node_record_table_ptr[i].tmp_disk)) - continue; - bit_clear(node_set_ptr[node_set_index]. - my_bitmap, i); - if ((--node_set_ptr[node_set_index]. - nodes) == 0) - break; - } - } - if (node_set_ptr[node_set_index].nodes == 0) { - bit_free(node_set_ptr[node_set_index].my_bitmap); - node_set_ptr[node_set_index].my_bitmap = NULL; - continue; - } - if (req_bitmap) { - if (scratch_bitmap) - bit_or(scratch_bitmap, - node_set_ptr[node_set_index]. - my_bitmap); - else { - scratch_bitmap = - bit_copy(node_set_ptr[node_set_index]. 
- my_bitmap); - if (scratch_bitmap == NULL) - fatal - ("bit_copy memory allocation failure"); - } - } - node_set_ptr[node_set_index].cpus_per_node = - config_record_point->cpus; - node_set_ptr[node_set_index].weight = - config_record_point->weight; - node_set_ptr[node_set_index].feature = tmp_feature; - debug - ("found %d usable nodes from configuration containing nodes %s", - node_set_ptr[node_set_index].nodes, - config_record_point->nodes); - - node_set_index++; - xrealloc(node_set_ptr, - sizeof(struct node_set) * (node_set_index + 1)); - node_set_ptr[node_set_size++].my_bitmap = NULL; - } - if (node_set_index == 0) { - info("select_nodes: no node configurations satisfy requirements procs=%u:mem=%u:disk=%u:feature=%s", - job_ptr->details->min_procs, - job_ptr->details->min_memory, - job_ptr->details->min_tmp_disk, - job_ptr->details->features); - error_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; - goto cleanup; - } - /* eliminate last (incomplete) node_set record */ - if (node_set_ptr[node_set_index].my_bitmap) - bit_free(node_set_ptr[node_set_index].my_bitmap); - node_set_ptr[node_set_index].my_bitmap = NULL; - node_set_size = node_set_index; - - if (req_bitmap) { - if ((scratch_bitmap == NULL) - || (bit_super_set(req_bitmap, scratch_bitmap) != 1)) { - info("select_nodes: requested nodes do not satisfy configurations requirements procs=%u:mem=%u:disk=%u:feature=%s", - job_ptr->details->min_procs, - job_ptr->details->min_memory, - job_ptr->details->min_tmp_disk, - job_ptr->details->features); - error_code = - ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; - goto cleanup; + /* get sets of nodes from the configuration list */ + error_code = _build_node_list(job_ptr, &node_set_ptr, &node_set_size); + if (error_code) + return error_code; + + /* insure that selected nodes in these node sets */ + if (job_ptr->details->req_node_bitmap) { + error_code = _nodes_in_sets(job_ptr->details->req_node_bitmap, + node_set_ptr, node_set_size); + if (error_code) { + info("No nodes 
satify requirements for job %u", + job_ptr->job_id); + return error_code; } + req_bitmap = bit_copy(job_ptr->details->req_node_bitmap); } - /* pick the nodes providing a best-fit */ error_code = _pick_best_nodes(node_set_ptr, node_set_size, &req_bitmap, job_ptr->details->num_procs, job_ptr->details->min_nodes, + job_ptr->details->max_nodes, job_ptr->details->contiguous, job_ptr->details->shared, part_ptr->max_nodes); @@ -868,22 +776,172 @@ int select_nodes(struct job_record *job_ptr, bool test_only) build_job_cred(job_ptr); /* uses end_time set above */ cleanup: - if (req_bitmap) - bit_free(req_bitmap); - if (scratch_bitmap) - bit_free(scratch_bitmap); + FREE_NULL_BITMAP(req_bitmap); if (node_set_ptr) { - for (i = 0; i < node_set_size; i++) { - if (node_set_ptr[i].my_bitmap) - bit_free(node_set_ptr[i].my_bitmap); - } + for (i = 0; i < node_set_size; i++) + FREE_NULL_BITMAP(node_set_ptr[i].my_bitmap); xfree(node_set_ptr); } - if (config_record_iterator) - list_iterator_destroy(config_record_iterator); return error_code; } +/* + * _build_node_list - identify which nodes could be allocated to a job + * IN job_ptr - pointer to node to be scheduled + * OUT node_set_pptr - list of node sets which could be used for the job + * OUT node_set_size - number of node_set entries + * RET error code + */ +static int _build_node_list(struct job_record *job_ptr, + struct node_set **node_set_pptr, + int *node_set_size) +{ + int node_set_inx; + struct node_set *node_set_ptr; + struct config_record *config_ptr; + struct part_record *part_ptr = job_ptr->part_ptr; + ListIterator config_record_iterator; + int tmp_feature, check_node_config; + struct job_details *detail_ptr = job_ptr->details; + bitstr_t *exc_node_mask = NULL; + + node_set_inx = 0; + node_set_ptr = (struct node_set *) xmalloc(sizeof(struct node_set) * 2); + node_set_ptr[node_set_inx+1].my_bitmap = NULL; + if (detail_ptr->exc_node_bitmap) { + exc_node_mask = bit_copy(detail_ptr->exc_node_bitmap); + bit_not(exc_node_mask); 
+ } + + config_record_iterator = list_iterator_create(config_list); + if (config_record_iterator == NULL) + fatal("list_iterator_create malloc failure"); + + while ((config_ptr = (struct config_record *) + list_next(config_record_iterator))) { + + tmp_feature = _valid_features(job_ptr->details->features, + config_ptr->feature); + if (tmp_feature == 0) + continue; + + /* since nodes can register with more resources than defined */ + /* in the configuration, we want to use those higher values */ + /* for scheduling, but only as needed (slower) */ + if (slurmctld_conf.fast_schedule) + check_node_config = 0; + else if ((detail_ptr->min_procs > config_ptr->cpus ) || + (detail_ptr->min_memory > config_ptr->real_memory) || + (detail_ptr->min_tmp_disk > config_ptr->tmp_disk)) { + check_node_config = 1; + } else + check_node_config = 0; + + node_set_ptr[node_set_inx].my_bitmap = + bit_copy(config_ptr->node_bitmap); + if (node_set_ptr[node_set_inx].my_bitmap == NULL) + fatal("bit_copy memory allocation failure"); + bit_and(node_set_ptr[node_set_inx].my_bitmap, + part_ptr->node_bitmap); + if (exc_node_mask) + bit_and(node_set_ptr[node_set_inx].my_bitmap, + exc_node_mask); + node_set_ptr[node_set_inx].nodes = + bit_set_count(node_set_ptr[node_set_inx].my_bitmap); + if (check_node_config && + (node_set_ptr[node_set_inx].nodes != 0)) + _filter_nodes_in_set(&node_set_ptr[node_set_inx], + detail_ptr); + + if (node_set_ptr[node_set_inx].nodes == 0) { + FREE_NULL_BITMAP(node_set_ptr[node_set_inx].my_bitmap); + continue; + } + node_set_ptr[node_set_inx].cpus_per_node = + config_ptr->cpus; + node_set_ptr[node_set_inx].weight = + config_ptr->weight; + node_set_ptr[node_set_inx].feature = tmp_feature; + debug("found %d usable nodes from config containing %s", + node_set_ptr[node_set_inx].nodes, config_ptr->nodes); + + node_set_inx++; + xrealloc(node_set_ptr, + sizeof(struct node_set) * (node_set_inx + 2)); + node_set_ptr[node_set_inx + 1].my_bitmap = NULL; + } + 
list_iterator_destroy(config_record_iterator); + /* eliminate last (incomplete) node_set record */ + FREE_NULL_BITMAP(node_set_ptr[node_set_inx].my_bitmap); + FREE_NULL_BITMAP(exc_node_mask); + + if (node_set_inx == 0) { + info("No nodes satisfy job %u requirements", + job_ptr->job_id); + FREE_NULL(node_set_ptr); + return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; + } + + *node_set_size = node_set_inx; + *node_set_pptr = node_set_ptr; + return SLURM_SUCCESS; +} + +/* Remove from the node set any nodes which lack sufficient resources + * to satisfy the job's request */ +static void _filter_nodes_in_set(struct node_set *node_set_ptr, + struct job_details *detail_ptr) +{ + int i; + + for (i = 0; i < node_record_count; i++) { + if (bit_test(node_set_ptr->my_bitmap, i) == 0) + continue; + if ((detail_ptr->min_procs <= + node_record_table_ptr[i].cpus) && + (detail_ptr->min_memory <= + node_record_table_ptr[i].real_memory) && + (detail_ptr->min_tmp_disk <= + node_record_table_ptr[i].tmp_disk)) + continue; + bit_clear(node_set_ptr->my_bitmap, i); + if ((--(node_set_ptr->nodes)) == 0) + break; + } +} + +/* + * IN req_bitmap - nodes specifically required by the job + * IN node_set_ptr - sets of valid nodes + * IN node_set_size - count of node_set entries + * RET 0 if in set, otherwise an error code + */ +static int _nodes_in_sets(bitstr_t *req_bitmap, + struct node_set * node_set_ptr, + int node_set_size) +{ + bitstr_t *scratch_bitmap = NULL; + int error_code = SLURM_SUCCESS, i; + + for (i=0; i<node_set_size; i++) { + if (scratch_bitmap) + bit_or(scratch_bitmap, + node_set_ptr[i].my_bitmap); + else { + scratch_bitmap = + bit_copy(node_set_ptr[i].my_bitmap); + if (scratch_bitmap == NULL) + fatal("bit_copy malloc failure"); + } + } + + if ((scratch_bitmap == NULL) + || (bit_super_set(req_bitmap, scratch_bitmap) != 1)) + error_code = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; + + FREE_NULL_BITMAP(scratch_bitmap); + return error_code; +} /* * build_node_details - set cpu counts 
and addresses for allocated nodes diff --git a/src/slurmctld/partition_mgr.c b/src/slurmctld/partition_mgr.c index 6098abf3e919679d420de0428d3e90b30f1215fa..a4188c78c0809980b7e147b133374d99a1ac11b1 100644 --- a/src/slurmctld/partition_mgr.c +++ b/src/slurmctld/partition_mgr.c @@ -334,11 +334,11 @@ static void _dump_part_state(struct part_record *part_record_point, Buf buffer) } /* - * load_part_state - load the partition state from file, recover from + * load_all_part_state - load the partition state from file, recover on * slurmctld restart. execute this after loading the configuration * file data. */ -int load_part_state(void) +int load_all_part_state(void) { char *part_name, *allow_groups, *nodes, *state_file, *data = NULL; uint32_t max_time, max_nodes; @@ -428,7 +428,7 @@ int load_part_state(void) xfree(part_ptr->nodes); part_ptr->nodes = nodes; } else { - info("load_part_state: partition %s removed from configuration file", + info("load_all_part_state: partition %s removed from configuration file", part_name); } diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 720c8b39cd9f64f8a0675ea7535cc55b052a738d..d81b55e055e77c3e89c4106a2e122eb4d2315444 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -48,11 +48,6 @@ #include "src/slurmctld/slurmctld.h" #define BUF_SIZE 1024 -#define FREE_NULL(_X) \ - do { \ - if (_X) xfree (_X); \ - _X = NULL; \ - } while (0) static int _build_bitmaps(void); static int _init_all_slurm_conf(void); @@ -776,9 +771,9 @@ int read_slurm_conf(int recover) set_slurmd_addr(); if (recover) { - (void) load_node_state(); - (void) load_part_state(); - (void) load_job_state(); + (void) load_all_node_state(); + (void) load_all_part_state(); + (void) load_all_job_state(); } (void) sync_job_files(); diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 4c20f9f9df99363f24befb28fbaf58221c8cd830..50c45ef5e8a7445207a7aaca5df733a0ee5553a9 100644 --- a/src/slurmctld/slurmctld.h +++ 
b/src/slurmctld/slurmctld.h @@ -62,6 +62,18 @@ #include "src/common/slurm_protocol_api.h" #include "src/common/xmalloc.h" +#define FREE_NULL(_X) \ + do { \ + if (_X) xfree (_X); \ + _X = NULL; \ + } while (0) + +#define FREE_NULL_BITMAP(_X) \ + do { \ + if (_X) bit_free (_X); \ + _X = NULL; \ + } while (0) + /*****************************************************************************\ * GENERAL CONFIGURATION parameters and data structures \*****************************************************************************/ @@ -190,7 +202,7 @@ extern time_t last_job_update; /* time of last update to part records */ extern int job_count; /* number of jobs in the system */ /* job_details - specification of a job's constraints, - can be purged after initiation */ + * can be purged after initiation */ struct job_details { uint32_t magic; /* magic cookie for data integrity */ uint32_t num_procs; /* minimum number of processors */ @@ -199,6 +211,7 @@ struct job_details { char *req_nodes; /* required nodes */ char *exc_nodes; /* excluded nodes */ bitstr_t *req_node_bitmap; /* bitmap of required nodes */ + bitstr_t *exc_node_bitmap; /* bitmap of excluded nodes */ slurm_job_credential_t credential; /* job credential */ char *features; /* required features */ uint16_t shared; /* set node can be shared*/ @@ -258,7 +271,9 @@ struct step_record { across nodes */ uint32_t num_tasks; /* number of tasks required */ time_t start_time; /* step allocation time */ - bitstr_t *node_bitmap; /* bitmap of nodes allocated to job + char *step_node_list; /* list of nodes allocated to job + step */ + bitstr_t *step_node_bitmap; /* bitmap of nodes allocated to job step */ #ifdef HAVE_LIBELAN3 qsw_jobinfo_t qsw_job; /* Elan3 switch context, opaque */ @@ -641,18 +656,18 @@ int list_compare_config (void *config_entry1, void *config_entry2); extern int list_find_part (void *part_entry, void *key); /* - * load_job_state - load the job state from file, recover from last slurmctld + * load_all_job_state - 
load the job state from file, recover from last * checkpoint. Execute this after loading the configuration file data. * RET 0 or error code */ -extern int load_job_state ( void ); +extern int load_all_job_state ( void ); /* - * load_node_state - load the node state from file, recover from slurmctld + * load_all_node_state - load the node state from file, recover on slurmctld * restart. execute this after loading the configuration file data. * data goes into common storage */ -extern int load_node_state ( void ); +extern int load_all_node_state ( void ); /* * load_part_uid_allow_list - reload the allow_uid list of partitions @@ -662,11 +677,14 @@ extern int load_node_state ( void ); extern void load_part_uid_allow_list ( int force ); /* - * load_part_state - load the partition state from file, recover from + * load_all_part_state - load the partition state from file, recover from * slurmctld restart. execute this after loading the configuration * file data. */ -extern int load_part_state ( void ); +extern int load_all_part_state ( void ); + +/* make_node_idle - flag specified node as no longer being in use */ +extern void make_node_idle(struct node_record *node_ptr); /* * node_name2bitmap - given a node name regular expression, build a bitmap diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index b851828675a9606d2477a949aa2f0c85284756cb..a9307b680ec11138f1f7f3c22f4baed4c098f9f2 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -96,8 +96,8 @@ delete_all_step_records (struct job_record *job_ptr) #ifdef HAVE_LIBELAN3 qsw_free_jobinfo (step_record_point->qsw_job); #endif - if (step_record_point->node_bitmap) - bit_free (step_record_point->node_bitmap); + FREE_NULL(step_record_point->step_node_list); + FREE_NULL_BITMAP(step_record_point->step_node_bitmap); xfree (step_record_point); } @@ -130,8 +130,8 @@ delete_step_record (struct job_record *job_ptr, uint32_t step_id) #ifdef HAVE_LIBELAN3 qsw_free_jobinfo 
(step_record_point->qsw_job); #endif - if (step_record_point->node_bitmap) - bit_free (step_record_point->node_bitmap); + FREE_NULL(step_record_point->step_node_list); + FREE_NULL_BITMAP(step_record_point->step_node_bitmap); xfree (step_record_point); error_code = 0; break; @@ -401,15 +401,12 @@ _pick_step_nodes (struct job_record *job_ptr, step_specs *step_spec ) { } } - if (nodes_avail) - bit_free(nodes_avail); + FREE_NULL_BITMAP(nodes_avail); return nodes_picked; cleanup: - if (nodes_avail) - bit_free(nodes_avail); - if (nodes_picked) - bit_free(nodes_picked); + FREE_NULL_BITMAP(nodes_avail); + FREE_NULL_BITMAP(nodes_picked); return NULL; } @@ -484,7 +481,8 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record, fatal ("create_step_record failed with no memory"); /* set the step_record values */ - step_ptr->node_bitmap = nodeset; + step_ptr->step_node_list = bitmap2node_name(nodeset); + step_ptr->step_node_bitmap = nodeset; step_ptr->cyclic_alloc = (uint16_t) (step_specs->task_dist == SLURM_DIST_CYCLIC); step_ptr->num_tasks = step_specs->num_tasks; @@ -492,13 +490,13 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record, #ifdef HAVE_LIBELAN3 if (qsw_alloc_jobinfo (&step_ptr->qsw_job) < 0) fatal ("step_create: qsw_alloc_jobinfo error"); - first = bit_ffs (step_ptr->node_bitmap); - last = bit_fls (step_ptr->node_bitmap); + first = bit_ffs (step_ptr->step_node_bitmap); + last = bit_fls (step_ptr->step_node_bitmap); nodeset = bit_alloc (node_set_size); if (nodeset == NULL) fatal ("step_create: bit_alloc error"); for (i = first; i <= last; i++) { - if (bit_test (step_ptr->node_bitmap, i)) { + if (bit_test (step_ptr->step_node_bitmap, i)) { node_id = qsw_getnodeid_byhost ( node_record_table_ptr[i].name); if (node_id >= 0) /* no lookup error */ @@ -533,23 +531,13 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record, */ static void _pack_ctld_job_step_info(struct step_record *step, Buf buffer) { - 
char *node_list; - - if (step->node_bitmap) - node_list = bitmap2node_name(step->node_bitmap); - else { - node_list = xmalloc(1); - node_list[0] = '\0'; - } - pack_job_step_info_members(step->job_ptr->job_id, step->step_id, step->job_ptr->user_id, step->num_tasks, step->start_time, step->job_ptr->partition, - node_list, buffer); - xfree(node_list); + step->step_node_list, buffer); } /*