From 24c6b84666263d3dd111cd17950ccd6d0c9675b6 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Mon, 17 May 2010 21:30:45 +0000 Subject: [PATCH] add logic to save/restore a job's gres info and sync it (more work needed to save/restore node state for proper data sync) --- src/common/gres.c | 106 +++++++++++++++++++++++++++++++- src/common/gres.h | 24 +++++++- src/common/node_select.h | 4 +- src/plugins/gres/gpu/gres_gpu.c | 63 +++++++++++++++++++ src/plugins/gres/nic/gres_nic.c | 63 +++++++++++++++++++ src/slurmctld/job_mgr.c | 3 +- src/slurmctld/node_mgr.c | 17 +++-- 7 files changed, 264 insertions(+), 16 deletions(-) diff --git a/src/common/gres.c b/src/common/gres.c index c0f4f394bc8..b7be1450141 100644 --- a/src/common/gres.c +++ b/src/common/gres.c @@ -99,6 +99,10 @@ typedef struct slurm_gres_ops { int (*unpack_node_state) ( void **gres_data, Buf buffer ); void * (*dup_node_state) ( void *gres_data ); + void (*node_state_dealloc) ( void *gres_data ); + int (*node_state_realloc) ( void *job_gres_data, + int node_offset, + void *node_gres_data ); void (*node_state_log) ( void *gres_data, char *node_name ); @@ -175,6 +179,8 @@ static int _load_gres_plugin(char *plugin_name, "pack_node_state", "unpack_node_state", "dup_node_state", + "node_state_dealloc", + "node_state_realloc", "node_state_log", "job_config_delete", "job_gres_validate", @@ -427,12 +433,18 @@ extern int gres_plugin_reconfig(bool *did_change) slurm_mutex_unlock(&gres_context_lock); if (plugin_change) { - info("GresPlugins changed from %s to %s", + error("GresPlugins changed from %s to %s ignored", gres_plugin_list, plugin_names); + error("Restart the slurmctld daemon to change GresPlugins"); *did_change = true; +#if 0 + /* This logic would load new plugins, but we need the old + * plugins to persist in order to process old state + * information. 
*/ rc = gres_plugin_fini(); if (rc == SLURM_SUCCESS) rc = gres_plugin_init(); +#endif } xfree(plugin_names); @@ -868,6 +880,8 @@ unpack_error: /* * Duplicate a node gres status (used for will-run logic) + * IN gres_list - node gres state information + * RET a copy of gres_list or NULL on failure */ extern List gres_plugin_dup_node_state(List gres_list) { @@ -915,6 +929,94 @@ extern List gres_plugin_dup_node_state(List gres_list) return new_list; } +/* + * Deallocate all resources on this node previously allocated to any jobs. + * This function is used to synchronize state after slurmctld restarts or + * is reconfigured. + * IN gres_list - node gres state information + */ +extern void gres_plugin_node_state_dealloc(List gres_list) +{ + int i; + ListIterator gres_iter; + gres_state_t *gres_ptr; + + if (gres_list == NULL) + return; + + (void) gres_plugin_init(); + + slurm_mutex_lock(&gres_context_lock); + gres_iter = list_iterator_create(gres_list); + while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) { + for (i=0; i<gres_context_cnt; i++) { + if (gres_ptr->plugin_id != + *(gres_context[i].ops.plugin_id)) + continue; + (*(gres_context[i].ops.node_state_dealloc)) + (gres_ptr->gres_data); + break; + } + } + list_iterator_destroy(gres_iter); + slurm_mutex_unlock(&gres_context_lock); +} + +/* + * Allocate in this node's record the resources previously allocated to this + * job. This function is used to synchronize state after slurmctld restarts + * or is reconfigured. 
+ * IN job_gres_list - job gres state information + * IN node_offset - zero-origin index of this node in the job's allocation + * IN node_gres_list - node gres state information + * RET SLURM_SUCCESS or error code + */ +extern int gres_plugin_node_state_realloc(List job_gres_list, int node_offset, + List node_gres_list) +{ + ListIterator job_gres_iter, node_gres_iter; + gres_state_t *job_gres_ptr, *node_gres_ptr; + int i; + + if (job_gres_list == NULL) + return SLURM_SUCCESS; + if (node_gres_list == NULL) + return SLURM_ERROR; + + (void) gres_plugin_init(); + + slurm_mutex_lock(&gres_context_lock); + job_gres_iter = list_iterator_create(job_gres_list); + while ((job_gres_ptr = (gres_state_t *) list_next(job_gres_iter))) { + node_gres_iter = list_iterator_create(node_gres_list); + while ((node_gres_ptr = (gres_state_t *) + list_next(node_gres_iter))) { + if (job_gres_ptr->plugin_id == node_gres_ptr->plugin_id) + break; + } + list_iterator_destroy(node_gres_iter); + if (node_gres_ptr == NULL) { + error("Could not find plugin id %u to realloc job", + job_gres_ptr->plugin_id); + continue; + } + + for (i=0; i<gres_context_cnt; i++) { + if (job_gres_ptr->plugin_id != + *(gres_context[i].ops.plugin_id)) + continue; + (*(gres_context[i].ops.node_state_realloc)) + (job_gres_ptr->gres_data, node_offset, + node_gres_ptr->gres_data); + break; + } + } + list_iterator_destroy(job_gres_iter); + slurm_mutex_unlock(&gres_context_lock); + + return SLURM_SUCCESS; +} + /* * Log a node's current gres state * IN gres_list - generated by gres_plugin_node_config_validate() @@ -971,7 +1073,7 @@ static void _gres_job_list_delete(void *list_element) * Given a job's requested gres configuration, validate it and build a gres list * IN req_config - job request's gres input string * OUT gres_list - List of Gres records for this job to track usage - * RET SLURM_SUCCESS or ESLURM__INVALIDGRES + * RET SLURM_SUCCESS or ESLURM_INVALID_GRES */ extern int gres_plugin_job_gres_validate(char *req_config, 
List *gres_list) { diff --git a/src/common/gres.h b/src/common/gres.h index 4581ee4e922..25128c6aa3b 100644 --- a/src/common/gres.h +++ b/src/common/gres.h @@ -157,9 +157,31 @@ extern int gres_plugin_unpack_node_state(List *gres_list, Buf buffer, /* * Duplicate a node gres status (used for will-run logic) + * IN gres_list - node gres state information + * RET a copy of gres_list or NULL on failure */ extern List gres_plugin_dup_node_state(List gres_list); +/* + * Deallocate all resources on this node previously allocated to any jobs. + * This function is used to synchronize state after slurmctld restarts or + * is reconfigured. + * IN gres_list - node gres state information + */ +extern void gres_plugin_node_state_dealloc(List gres_list); + +/* + * Allocate in this node's record the resources previously allocated to this + * job. This function is used to synchronize state after slurmctld restarts + * or is reconfigured. + * IN job_gres_list - job gres state information + * IN node_offset - zero-origin index of this node in the job's allocation + * IN node_gres_list - node gres state information + * RET SLURM_SUCCESS or error code + */ +extern int gres_plugin_node_state_realloc(List job_gres_list, int node_offset, + List node_gres_list); + /* * Log a node's current gres state * IN gres_list - generated by gres_plugin_node_config_validate() @@ -171,7 +193,7 @@ extern void gres_plugin_node_state_log(List gres_list, char *node_name); * Given a job's requested gres configuration, validate it and build a gres list * IN req_config - job request's gres input string * OUT gres_list - List of Gres records for this job to track usage - * RET SLURM_SUCCESS or ESLURM__INVALIDGRES + * RET SLURM_SUCCESS or ESLURM_INVALID_GRES */ extern int gres_plugin_job_gres_validate(char *req_config, List *gres_list); diff --git a/src/common/node_select.h b/src/common/node_select.h index 8d0e5e036dd..e83d5a8de2c 100644 --- a/src/common/node_select.h +++ b/src/common/node_select.h @@ -355,8 +355,8 
@@ extern char *select_g_select_jobinfo_sprint(dynamic_plugin_data_t *jobinfo, * IN mode - print mode, see enum select_print_mode * RET - the string, same as buf */ -extern char *select_g_select_jobinfo_xstrdup( - dynamic_plugin_data_t *jobinfo, int mode); +extern char *select_g_select_jobinfo_xstrdup(dynamic_plugin_data_t *jobinfo, + int mode); /*******************************************************\ * NODE-SPECIFIC SELECT CREDENTIAL MANAGEMENT FUNCIONS * diff --git a/src/plugins/gres/gpu/gres_gpu.c b/src/plugins/gres/gpu/gres_gpu.c index 55182e93213..9caa5f8f9b7 100644 --- a/src/plugins/gres/gpu/gres_gpu.c +++ b/src/plugins/gres/gpu/gres_gpu.c @@ -527,6 +527,69 @@ extern void *dup_node_state(void *gres_data) return new_gres; } +extern void node_state_dealloc(void *gres_data) +{ + gpu_node_state_t *gres_ptr = (gpu_node_state_t *) gres_data; + + gres_ptr->gpu_cnt_alloc = 0; + if (gres_ptr->gpu_bit_alloc) { + int i = bit_size(gres_ptr->gpu_bit_alloc) - 1; + bit_nclear(gres_ptr->gpu_bit_alloc, 0, i); + } +} + +extern int node_state_realloc(void *job_gres_data, int node_offset, + void *node_gres_data) +{ + gpu_job_state_t *job_gres_ptr = (gpu_job_state_t *) job_gres_data; + gpu_node_state_t *node_gres_ptr = (gpu_node_state_t *) node_gres_data; + int i, job_bit_size, node_bit_size; + + xassert(job_gres_ptr); + xassert(node_gres_ptr); + + if (node_offset >= job_gres_ptr->node_cnt) { + error("%s job node offset is bad (%d >= %u)", + plugin_name, node_offset, job_gres_ptr->node_cnt); + return EINVAL; + } + + if ((job_gres_ptr->gpu_bit_alloc == NULL) || + (job_gres_ptr->gpu_bit_alloc[node_offset] == NULL)) { + error("%s job bit_alloc is NULL", plugin_name); + return EINVAL; + } + + if (node_gres_ptr->gpu_bit_alloc == NULL) { + error("%s node bit_alloc is NULL", plugin_name); + return EINVAL; + } + + job_bit_size = bit_size(job_gres_ptr->gpu_bit_alloc[node_offset]); + node_bit_size = bit_size(node_gres_ptr->gpu_bit_alloc); + if (job_bit_size != node_bit_size) { + error("%s 
job/node bit size mismatch (%d != %d)", + plugin_name, job_bit_size, node_bit_size); + /* Update what we can */ + node_bit_size = MIN(job_bit_size, node_bit_size); + for (i=0; i<node_bit_size; i++) { + if (!bit_test(job_gres_ptr->gpu_bit_alloc[node_offset], + i)) + continue; + node_gres_ptr->gpu_cnt_alloc++; + bit_set(node_gres_ptr->gpu_bit_alloc, i); + } + } else { + node_gres_ptr->gpu_cnt_alloc += bit_set_count(job_gres_ptr-> + gpu_bit_alloc + [node_offset]); + bit_or(node_gres_ptr->gpu_bit_alloc, + job_gres_ptr->gpu_bit_alloc[node_offset]); + } + + return SLURM_SUCCESS; +} + extern void node_state_log(void *gres_data, char *node_name) { gpu_node_state_t *gres_ptr; diff --git a/src/plugins/gres/nic/gres_nic.c b/src/plugins/gres/nic/gres_nic.c index 148d19cee63..631f3a67643 100644 --- a/src/plugins/gres/nic/gres_nic.c +++ b/src/plugins/gres/nic/gres_nic.c @@ -527,6 +527,69 @@ extern void *dup_node_state(void *gres_data) return new_gres; } +extern void node_state_dealloc(void *gres_data) +{ + nic_node_state_t *gres_ptr = (nic_node_state_t *) gres_data; + + gres_ptr->nic_cnt_alloc = 0; + if (gres_ptr->nic_bit_alloc) { + int i = bit_size(gres_ptr->nic_bit_alloc) - 1; + bit_nclear(gres_ptr->nic_bit_alloc, 0, i); + } +} + +extern int node_state_realloc(void *job_gres_data, int node_offset, + void *node_gres_data) +{ + nic_job_state_t *job_gres_ptr = (nic_job_state_t *) job_gres_data; + nic_node_state_t *node_gres_ptr = (nic_node_state_t *) node_gres_data; + int i, job_bit_size, node_bit_size; + + xassert(job_gres_ptr); + xassert(node_gres_ptr); + + if (node_offset >= job_gres_ptr->node_cnt) { + error("%s job node offset is bad (%d >= %u)", + plugin_name, node_offset, job_gres_ptr->node_cnt); + return EINVAL; + } + + if ((job_gres_ptr->nic_bit_alloc == NULL) || + (job_gres_ptr->nic_bit_alloc[node_offset] == NULL)) { + error("%s job bit_alloc is NULL", plugin_name); + return EINVAL; + } + + if (node_gres_ptr->nic_bit_alloc == NULL) { + error("%s node bit_alloc is NULL", 
plugin_name); + return EINVAL; + } + + job_bit_size = bit_size(job_gres_ptr->nic_bit_alloc[node_offset]); + node_bit_size = bit_size(node_gres_ptr->nic_bit_alloc); + if (job_bit_size != node_bit_size) { + error("%s job/node bit size mismatch (%d != %d)", + plugin_name, job_bit_size, node_bit_size); + /* Update what we can */ + node_bit_size = MIN(job_bit_size, node_bit_size); + for (i=0; i<node_bit_size; i++) { + if (!bit_test(job_gres_ptr->nic_bit_alloc[node_offset], + i)) + continue; + node_gres_ptr->nic_cnt_alloc++; + bit_set(node_gres_ptr->nic_bit_alloc, i); + } + } else { + node_gres_ptr->nic_cnt_alloc += bit_set_count(job_gres_ptr-> + nic_bit_alloc + [node_offset]); + bit_or(node_gres_ptr->nic_bit_alloc, + job_gres_ptr->nic_bit_alloc[node_offset]); + } + + return SLURM_SUCCESS; +} + extern void node_state_log(void *gres_data, char *node_name) { nic_node_state_t *gres_ptr; diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 213a5949ce5..7ed95bad0cc 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -753,8 +753,7 @@ static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer) if (IS_JOB_COMPLETING(dump_job_ptr)) { if (dump_job_ptr->nodes_completing == NULL) { dump_job_ptr->nodes_completing = - bitmap2node_name( - dump_job_ptr->node_bitmap); + bitmap2node_name(dump_job_ptr->node_bitmap); } packstr(dump_job_ptr->nodes_completing, buffer); } diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index f53e3bc9efc..c99b21bbd3d 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -118,9 +118,10 @@ int dump_all_node_state ( void ) static int high_buffer_size = (1024 * 1024); int error_code = 0, inx, log_fd; char *old_file, *new_file, *reg_file; + struct node_record *node_ptr; /* Locks: Read config and node */ slurmctld_lock_t node_read_lock = { READ_LOCK, NO_LOCK, READ_LOCK, - NO_LOCK }; + NO_LOCK }; Buf buffer = init_buf(high_buffer_size); DEF_TIMERS; @@ -131,12 +132,11 @@ int 
dump_all_node_state ( void ) /* write node records to buffer */ lock_slurmctld (node_read_lock); - for (inx = 0; inx < node_record_count; inx++) { - xassert (node_record_table_ptr[inx].magic == NODE_MAGIC); - xassert (node_record_table_ptr[inx].config_ptr->magic == - CONFIG_MAGIC); - - _dump_node_state (&node_record_table_ptr[inx], buffer); + for (inx = 0, node_ptr = node_record_table_ptr; inx < node_record_count; + inx++, node_ptr++) { + xassert (node_ptr->magic == NODE_MAGIC); + xassert (node_ptr->config_ptr->magic == CONFIG_MAGIC); + _dump_node_state (node_ptr, buffer); } old_file = xstrdup (slurmctld_conf.state_save_location); @@ -151,8 +151,7 @@ int dump_all_node_state ( void ) lock_state_files(); log_fd = creat (new_file, 0600); if (log_fd < 0) { - error ("Can't save state, error creating file %s %m", - new_file); + error ("Can't save state, error creating file %s %m", new_file); error_code = errno; } else { int pos = 0, nwrite = get_buf_offset(buffer), amount, rc; -- GitLab