diff --git a/src/common/gres.c b/src/common/gres.c index fd2a1a2cb13d73b0bc240318dc8b3b7d07f5aac5..b6cdbec92bbd0cb88223af33d204a185b56a05d3 100644 --- a/src/common/gres.c +++ b/src/common/gres.c @@ -164,6 +164,8 @@ static void _set_gres_cnt(char *orig_config, char **new_config, char *gres_name_colon, int gres_name_colon_len); static int _step_state_validate(char *config, void **gres_data, slurm_gres_context_t *context_ptr); +static uint32_t _step_test(void *step_gres_data, void *job_gres_data, + int node_offset, bool ignore_alloc, char *gres_name); static int _strcmp(const char *s1, const char *s2); static int _unload_gres_plugin(slurm_gres_context_t *plugin_context); static void _validate_config(slurm_gres_context_t *context_ptr); @@ -2144,6 +2146,9 @@ List gres_plugin_job_state_dup(List gres_list) * IN gres_list - generated by gres_plugin_job_config_validate() * IN/OUT buffer - location to write state to * IN job_id - job's ID + * + * NOTE: A job's allocation to steps is not recorded here, but recovered with + * the job step state information upon slurmctld restart. 
*/ extern int gres_plugin_job_state_pack(List gres_list, Buf buffer, uint32_t job_id) @@ -2152,7 +2157,6 @@ extern int gres_plugin_job_state_pack(List gres_list, Buf buffer, uint32_t top_offset, tail_offset; uint32_t magic = GRES_MAGIC; uint16_t rec_cnt = 0; - uint8_t has_bitmap; ListIterator gres_iter; gres_state_t *gres_ptr; gres_job_state_t *gres_job_ptr; @@ -2174,15 +2178,13 @@ extern int gres_plugin_job_state_pack(List gres_list, Buf buffer, pack32(gres_job_ptr->gres_cnt_alloc, buffer); pack32(gres_job_ptr->node_cnt, buffer); if (gres_job_ptr->gres_bit_alloc) { - has_bitmap = 1; - pack8(has_bitmap, buffer); + pack8((uint8_t) 1, buffer); for (i=0; i<gres_job_ptr->node_cnt; i++) { pack_bit_str(gres_job_ptr->gres_bit_alloc[i], buffer); } } else { - has_bitmap = 0; - pack8(has_bitmap, buffer); + pack8((uint8_t) 0, buffer); } rec_cnt++; } @@ -2252,11 +2254,11 @@ extern int gres_plugin_job_state_unpack(List *gres_list, Buf buffer, break; } if (i >= gres_context_cnt) { + /* A likely sign that GresPlugins has changed. + * Not a fatal error, skip over the data. */ error("gres_plugin_job_state_unpack: no plugin " "configured to unpack data type %u from job %u", plugin_id, job_id); - /* A likely sign that GresPlugins has changed. - * Not a fatal error, skip over the data. 
*/ _job_state_delete(gres_job_ptr); continue; } @@ -3123,24 +3125,6 @@ List gres_plugin_step_state_dup(List gres_list) return new_gres_list; } -static int _step_state_pack(void *gres_data, Buf buffer) -{ - int i; - gres_step_state_t *gres_ptr = (gres_step_state_t *) gres_data; - - pack32(gres_ptr->gres_cnt_alloc, buffer); - pack32(gres_ptr->node_cnt, buffer); - if (gres_ptr->gres_bit_alloc) { - pack8((uint8_t) 1, buffer); - for (i=0; i<gres_ptr->node_cnt; i++) - pack_bit_str(gres_ptr->gres_bit_alloc[i], buffer); - } else { - pack8((uint8_t) 0, buffer); - } - - return SLURM_SUCCESS; -} - /* * Pack a step's current gres status, called from slurmctld for save/restore * IN gres_list - generated by gres_plugin_step_allocate() @@ -3150,13 +3134,12 @@ static int _step_state_pack(void *gres_data, Buf buffer) extern int gres_plugin_step_state_pack(List gres_list, Buf buffer, uint32_t job_id, uint32_t step_id) { - int i, rc = SLURM_SUCCESS, rc2; - uint32_t top_offset, gres_size = 0; - uint32_t header_offset, size_offset, data_offset, tail_offset; - uint32_t magic = GRES_MAGIC; + int i, rc = SLURM_SUCCESS; + uint32_t top_offset, tail_offset, magic = GRES_MAGIC; uint16_t rec_cnt = 0; ListIterator gres_iter; gres_state_t *gres_ptr; + gres_step_state_t *gres_step_ptr; top_offset = get_buf_offset(buffer); pack16(rec_cnt, buffer); /* placeholder if data */ @@ -3169,35 +3152,20 @@ extern int gres_plugin_step_state_pack(List gres_list, Buf buffer, slurm_mutex_lock(&gres_context_lock); gres_iter = list_iterator_create(gres_list); while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) { - for (i=0; i<gres_context_cnt; i++) { - if (gres_ptr->plugin_id != - gres_context[i].plugin_id) - continue; - header_offset = get_buf_offset(buffer); - pack32(magic, buffer); - pack32(gres_ptr->plugin_id, buffer); - size_offset = get_buf_offset(buffer); - pack32(gres_size, buffer); /* placeholder */ - data_offset = get_buf_offset(buffer); - rc2 = _step_state_pack(gres_ptr->gres_data, buffer); - if 
(rc2 != SLURM_SUCCESS) { - rc = rc2; - set_buf_offset(buffer, header_offset); - continue; - } - tail_offset = get_buf_offset(buffer); - set_buf_offset(buffer, size_offset); - gres_size = tail_offset - data_offset; - pack32(gres_size, buffer); - set_buf_offset(buffer, tail_offset); - rec_cnt++; - break; - } - if (i >= gres_context_cnt) { - error("Could not find plugin id %u to pack record for " - "step %u.%u", - gres_ptr->plugin_id, job_id, step_id); + gres_step_ptr = (gres_step_state_t *) gres_ptr->gres_data; + pack32(magic, buffer); + pack32(gres_ptr->plugin_id, buffer); + pack32(gres_step_ptr->gres_cnt_alloc, buffer); + pack32(gres_step_ptr->node_cnt, buffer); + if (gres_step_ptr->gres_bit_alloc) { + pack8((uint8_t) 1, buffer); + for (i=0; i<gres_step_ptr->node_cnt; i++) + pack_bit_str(gres_step_ptr->gres_bit_alloc[i], + buffer); + } else { + pack8((uint8_t) 0, buffer); } + rec_cnt++; } list_iterator_destroy(gres_iter); slurm_mutex_unlock(&gres_context_lock); @@ -3210,42 +3178,6 @@ extern int gres_plugin_step_state_pack(List gres_list, Buf buffer, return rc; } -static int _step_state_unpack(void **gres_data, Buf buffer, char *gres_name) -{ - int i; - gres_step_state_t *gres_ptr; - uint8_t gres_bit_flag; - - gres_ptr = xmalloc(sizeof(gres_step_state_t)); - - if (buffer) { - safe_unpack32(&gres_ptr->gres_cnt_alloc, buffer); - safe_unpack32(&gres_ptr->node_cnt, buffer); - safe_unpack8(&gres_bit_flag, buffer); - if (gres_bit_flag) { - gres_ptr->gres_bit_alloc = xmalloc(sizeof(bitstr_t *) * - gres_ptr->node_cnt); - for (i=0; i<gres_ptr->node_cnt; i++) - unpack_bit_str(&gres_ptr->gres_bit_alloc[i], - buffer); - } - } - - *gres_data = gres_ptr; - return SLURM_SUCCESS; - -unpack_error: - error("Unpacking gres/%s step state info", gres_name); - if (gres_ptr->gres_bit_alloc) { - for (i=0; i<gres_ptr->node_cnt; i++) - FREE_NULL_BITMAP(gres_ptr->gres_bit_alloc[i]); - xfree(gres_ptr->gres_bit_alloc); - } - xfree(gres_ptr); - *gres_data = NULL; - return SLURM_ERROR; -} - /* * 
Unpack a step's current gres status, called from slurmctld for save/restore * OUT gres_list - restored state stored by gres_plugin_step_state_pack() @@ -3255,11 +3187,12 @@ unpack_error: extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer, uint32_t job_id, uint32_t step_id) { - int i, rc, rc2; - uint32_t gres_size, magic, tail_offset, plugin_id; + int i, rc; + uint32_t magic, plugin_id; uint16_t rec_cnt; + uint8_t has_file; gres_state_t *gres_ptr; - void *gres_data; + gres_step_state_t *gres_step_ptr = NULL; safe_unpack16(&rec_cnt, buffer); if (rec_cnt == 0) @@ -3282,41 +3215,49 @@ extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer, if (magic != GRES_MAGIC) goto unpack_error; safe_unpack32(&plugin_id, buffer); - safe_unpack32(&gres_size, buffer); + gres_step_ptr = xmalloc(sizeof(gres_step_state_t)); + safe_unpack32(&gres_step_ptr->gres_cnt_alloc, buffer); + safe_unpack32(&gres_step_ptr->node_cnt, buffer); + safe_unpack8(&has_file, buffer); + if (has_file) { + gres_step_ptr->gres_bit_alloc = + xmalloc(sizeof(bitstr_t *) * + gres_step_ptr->node_cnt); + for (i=0; i<gres_step_ptr->node_cnt; i++) { + unpack_bit_str(&gres_step_ptr->gres_bit_alloc[i], + buffer); + } + } + for (i=0; i<gres_context_cnt; i++) { if (gres_context[i].plugin_id == plugin_id) break; } if (i >= gres_context_cnt) { - error("gres_plugin_step_state_unpack: no plugin " + /* A likely sign that GresPlugins has changed. + * Not a fatal error, skip over the data. */ + info("gres_plugin_step_state_unpack: no plugin " "configured to unpack data type %u from " "step %u.%u", plugin_id, job_id, step_id); - /* A likely sign that GresPlugins has changed. - * Not a fatal error, skip over the data. 
*/ - tail_offset = get_buf_offset(buffer); - tail_offset += gres_size; - set_buf_offset(buffer, tail_offset); + _step_state_delete(gres_step_ptr); + gres_step_ptr = NULL; continue; } - rc2 = _step_state_unpack(&gres_data, buffer, - gres_context[i].gres_name); - if (rc2 != SLURM_SUCCESS) { - rc = rc2; - } else { - gres_ptr = xmalloc(sizeof(gres_state_t)); - gres_ptr->plugin_id = gres_context[i].plugin_id; - gres_ptr->gres_data = gres_data; - list_append(*gres_list, gres_ptr); - } + gres_ptr = xmalloc(sizeof(gres_state_t)); + gres_ptr->plugin_id = gres_context[i].plugin_id; + gres_ptr->gres_data = gres_step_ptr; + gres_step_ptr = NULL; + list_append(*gres_list, gres_ptr); } slurm_mutex_unlock(&gres_context_lock); - return rc; unpack_error: - error("gres_plugin_job_state_unpack: unpack error from step %u.%u", + error("gres_plugin_step_state_unpack: unpack error from step %u.%u", job_id, step_id); + if (gres_step_ptr) + _step_state_delete(gres_step_ptr); slurm_mutex_unlock(&gres_context_lock); return SLURM_ERROR; } @@ -3333,7 +3274,9 @@ static void _step_state_log(void *gres_data, uint32_t job_id, uint32_t step_id, info(" gres_cnt:%u node_cnt:%u", gres_ptr->gres_cnt_alloc, gres_ptr->node_cnt); - if (gres_ptr->node_cnt && gres_ptr->gres_bit_alloc) { + if (gres_ptr->gres_bit_alloc == NULL) + info(" gres_bit_alloc:NULL"); + else { for (i=0; i<gres_ptr->node_cnt; i++) { if (gres_ptr->gres_bit_alloc[i]) { bit_fmt(tmp_str, sizeof(tmp_str), @@ -3342,8 +3285,6 @@ static void _step_state_log(void *gres_data, uint32_t job_id, uint32_t step_id, } else info(" gres_bit_alloc[%d]:NULL", i); } - } else { - info(" gres_bit_alloc:NULL"); } } @@ -3368,8 +3309,7 @@ extern void gres_plugin_step_state_log(List gres_list, uint32_t job_id, gres_iter = list_iterator_create(gres_list); while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) { for (i=0; i<gres_context_cnt; i++) { - if (gres_ptr->plugin_id != - gres_context[i].plugin_id) + if (gres_ptr->plugin_id != gres_context[i].plugin_id) 
continue; _step_state_log(gres_ptr->gres_data, job_id, step_id, gres_context[i].gres_name); diff --git a/src/common/gres.h b/src/common/gres.h index df3a5d36584f4980d44f12e46d0fb4010d14ffa1..a4a91ad0b27148e4a0f06fb67745991e259608b1 100644 --- a/src/common/gres.h +++ b/src/common/gres.h @@ -113,7 +113,6 @@ typedef struct gres_step_state { /* Resources currently allocated to the job step on each node */ uint32_t node_cnt; bitstr_t **gres_bit_alloc; - uint32_t *gres_cnt_step_alloc; } gres_step_state_t; /* @@ -296,6 +295,9 @@ List gres_plugin_job_state_dup(List gres_list); * IN gres_list - generated by gres_plugin_job_config_validate() * IN/OUT buffer - location to write state to * IN job_id - job's ID + * + * NOTE: A job's allocation to steps is not recorded here, but recovered with + * the job step state information upon slurmctld restart. */ extern int gres_plugin_job_state_pack(List gres_list, Buf buffer, uint32_t job_id);