Skip to content
Snippets Groups Projects
Commit 4f775b41 authored by Moe Jette
Browse files

streamline step gres state save/restore

parent 9b5facf4
No related branches found
No related tags found
No related merge requests found
......@@ -164,6 +164,8 @@ static void _set_gres_cnt(char *orig_config, char **new_config,
char *gres_name_colon, int gres_name_colon_len);
static int _step_state_validate(char *config, void **gres_data,
slurm_gres_context_t *context_ptr);
static uint32_t _step_test(void *step_gres_data, void *job_gres_data,
int node_offset, bool ignore_alloc, char *gres_name);
static int _strcmp(const char *s1, const char *s2);
static int _unload_gres_plugin(slurm_gres_context_t *plugin_context);
static void _validate_config(slurm_gres_context_t *context_ptr);
......@@ -2144,6 +2146,9 @@ List gres_plugin_job_state_dup(List gres_list)
* IN gres_list - generated by gres_plugin_job_config_validate()
* IN/OUT buffer - location to write state to
* IN job_id - job's ID
*
* NOTE: A job's allocation to steps is not recorded here, but recovered with
* the job step state information upon slurmctld restart.
*/
extern int gres_plugin_job_state_pack(List gres_list, Buf buffer,
uint32_t job_id)
......@@ -2152,7 +2157,6 @@ extern int gres_plugin_job_state_pack(List gres_list, Buf buffer,
uint32_t top_offset, tail_offset;
uint32_t magic = GRES_MAGIC;
uint16_t rec_cnt = 0;
uint8_t has_bitmap;
ListIterator gres_iter;
gres_state_t *gres_ptr;
gres_job_state_t *gres_job_ptr;
......@@ -2174,15 +2178,13 @@ extern int gres_plugin_job_state_pack(List gres_list, Buf buffer,
pack32(gres_job_ptr->gres_cnt_alloc, buffer);
pack32(gres_job_ptr->node_cnt, buffer);
if (gres_job_ptr->gres_bit_alloc) {
has_bitmap = 1;
pack8(has_bitmap, buffer);
pack8((uint8_t) 1, buffer);
for (i=0; i<gres_job_ptr->node_cnt; i++) {
pack_bit_str(gres_job_ptr->gres_bit_alloc[i],
buffer);
}
} else {
has_bitmap = 0;
pack8(has_bitmap, buffer);
pack8((uint8_t) 0, buffer);
}
rec_cnt++;
}
......@@ -2252,11 +2254,11 @@ extern int gres_plugin_job_state_unpack(List *gres_list, Buf buffer,
break;
}
if (i >= gres_context_cnt) {
/* A likely sign that GresPlugins has changed.
* Not a fatal error, skip over the data. */
error("gres_plugin_job_state_unpack: no plugin "
"configured to unpack data type %u from job %u",
plugin_id, job_id);
/* A likely sign that GresPlugins has changed.
* Not a fatal error, skip over the data. */
_job_state_delete(gres_job_ptr);
continue;
}
......@@ -3123,24 +3125,6 @@ List gres_plugin_step_state_dup(List gres_list)
return new_gres_list;
}
/*
 * Write one step's gres allocation record to buffer.
 * IN gres_data - pointer to the step's gres_step_state_t
 * IN/OUT buffer - location to write state to
 * RET SLURM_SUCCESS (no failure path exists in this function)
 *
 * Layout: gres_cnt_alloc (32 bits), node_cnt (32 bits), a one-byte flag that
 * is 1 when per-node bitmaps follow and 0 otherwise, then node_cnt bitmaps
 * when the flag is 1.
 */
static int _step_state_pack(void *gres_data, Buf buffer)
{
	gres_step_state_t *step_state = (gres_step_state_t *) gres_data;
	uint8_t has_bitmap = step_state->gres_bit_alloc ? 1 : 0;
	int node_inx;

	pack32(step_state->gres_cnt_alloc, buffer);
	pack32(step_state->node_cnt, buffer);
	pack8(has_bitmap, buffer);

	/* Nothing follows the flag when no bitmap array was allocated */
	if (!has_bitmap)
		return SLURM_SUCCESS;

	for (node_inx = 0; node_inx < step_state->node_cnt; node_inx++) {
		pack_bit_str(step_state->gres_bit_alloc[node_inx], buffer);
	}

	return SLURM_SUCCESS;
}
/*
* Pack a step's current gres status, called from slurmctld for save/restore
* IN gres_list - generated by gres_plugin_step_allocate()
......@@ -3150,13 +3134,12 @@ static int _step_state_pack(void *gres_data, Buf buffer)
extern int gres_plugin_step_state_pack(List gres_list, Buf buffer,
uint32_t job_id, uint32_t step_id)
{
int i, rc = SLURM_SUCCESS, rc2;
uint32_t top_offset, gres_size = 0;
uint32_t header_offset, size_offset, data_offset, tail_offset;
uint32_t magic = GRES_MAGIC;
int i, rc = SLURM_SUCCESS;
uint32_t top_offset, tail_offset, magic = GRES_MAGIC;
uint16_t rec_cnt = 0;
ListIterator gres_iter;
gres_state_t *gres_ptr;
gres_step_state_t *gres_step_ptr;
top_offset = get_buf_offset(buffer);
pack16(rec_cnt, buffer); /* placeholder if data */
......@@ -3169,35 +3152,20 @@ extern int gres_plugin_step_state_pack(List gres_list, Buf buffer,
slurm_mutex_lock(&gres_context_lock);
gres_iter = list_iterator_create(gres_list);
while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
for (i=0; i<gres_context_cnt; i++) {
if (gres_ptr->plugin_id !=
gres_context[i].plugin_id)
continue;
header_offset = get_buf_offset(buffer);
pack32(magic, buffer);
pack32(gres_ptr->plugin_id, buffer);
size_offset = get_buf_offset(buffer);
pack32(gres_size, buffer); /* placeholder */
data_offset = get_buf_offset(buffer);
rc2 = _step_state_pack(gres_ptr->gres_data, buffer);
if (rc2 != SLURM_SUCCESS) {
rc = rc2;
set_buf_offset(buffer, header_offset);
continue;
}
tail_offset = get_buf_offset(buffer);
set_buf_offset(buffer, size_offset);
gres_size = tail_offset - data_offset;
pack32(gres_size, buffer);
set_buf_offset(buffer, tail_offset);
rec_cnt++;
break;
}
if (i >= gres_context_cnt) {
error("Could not find plugin id %u to pack record for "
"step %u.%u",
gres_ptr->plugin_id, job_id, step_id);
gres_step_ptr = (gres_step_state_t *) gres_ptr->gres_data;
pack32(magic, buffer);
pack32(gres_ptr->plugin_id, buffer);
pack32(gres_step_ptr->gres_cnt_alloc, buffer);
pack32(gres_step_ptr->node_cnt, buffer);
if (gres_step_ptr->gres_bit_alloc) {
pack8((uint8_t) 1, buffer);
for (i=0; i<gres_step_ptr->node_cnt; i++)
pack_bit_str(gres_step_ptr->gres_bit_alloc[i],
buffer);
} else {
pack8((uint8_t) 0, buffer);
}
rec_cnt++;
}
list_iterator_destroy(gres_iter);
slurm_mutex_unlock(&gres_context_lock);
......@@ -3210,42 +3178,6 @@ extern int gres_plugin_step_state_pack(List gres_list, Buf buffer,
return rc;
}
/*
 * Build a gres_step_state_t record, filling it from buffer when one is given.
 * OUT gres_data - set to the new record on success, NULL on unpack failure
 * IN/OUT buffer - location to read state from; when NULL an otherwise
 *                 untouched record is returned
 * IN gres_name - name of the gres type, used only in the error message
 * RET SLURM_SUCCESS or SLURM_ERROR
 *
 * Fields are read in this order: gres_cnt_alloc, node_cnt, a one-byte flag,
 * then (only when the flag is non-zero) node_cnt bitmaps.
 *
 * NOTE(review): the safe_unpack*() calls are presumably macros that jump to
 * the unpack_error label on a short or corrupt buffer - confirm in pack.h.
 */
static int _step_state_unpack(void **gres_data, Buf buffer, char *gres_name)
{
	gres_step_state_t *step_state;
	uint8_t has_bitmap;
	int node_inx;

	step_state = xmalloc(sizeof(gres_step_state_t));
	if (buffer == NULL)
		goto done;

	safe_unpack32(&step_state->gres_cnt_alloc, buffer);
	safe_unpack32(&step_state->node_cnt, buffer);
	safe_unpack8(&has_bitmap, buffer);
	if (has_bitmap) {
		/* One bitmap pointer per node, filled in node order */
		step_state->gres_bit_alloc = xmalloc(sizeof(bitstr_t *) *
						     step_state->node_cnt);
		for (node_inx = 0; node_inx < step_state->node_cnt;
		     node_inx++) {
			unpack_bit_str(&step_state->gres_bit_alloc[node_inx],
				       buffer);
		}
	}

done:
	*gres_data = step_state;
	return SLURM_SUCCESS;

unpack_error:
	error("Unpacking gres/%s step state info", gres_name);
	/* NOTE(review): relies on gres_bit_alloc being NULL until assigned,
	 * i.e. on xmalloc() returning zeroed memory - confirm. */
	if (step_state->gres_bit_alloc != NULL) {
		for (node_inx = 0; node_inx < step_state->node_cnt;
		     node_inx++) {
			FREE_NULL_BITMAP(step_state->gres_bit_alloc[node_inx]);
		}
		xfree(step_state->gres_bit_alloc);
	}
	xfree(step_state);
	*gres_data = NULL;
	return SLURM_ERROR;
}
/*
* Unpack a step's current gres status, called from slurmctld for save/restore
* OUT gres_list - restored state stored by gres_plugin_step_state_pack()
......@@ -3255,11 +3187,12 @@ unpack_error:
extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer,
uint32_t job_id, uint32_t step_id)
{
int i, rc, rc2;
uint32_t gres_size, magic, tail_offset, plugin_id;
int i, rc;
uint32_t magic, plugin_id;
uint16_t rec_cnt;
uint8_t has_file;
gres_state_t *gres_ptr;
void *gres_data;
gres_step_state_t *gres_step_ptr = NULL;
safe_unpack16(&rec_cnt, buffer);
if (rec_cnt == 0)
......@@ -3282,41 +3215,49 @@ extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer,
if (magic != GRES_MAGIC)
goto unpack_error;
safe_unpack32(&plugin_id, buffer);
safe_unpack32(&gres_size, buffer);
gres_step_ptr = xmalloc(sizeof(gres_step_state_t));
safe_unpack32(&gres_step_ptr->gres_cnt_alloc, buffer);
safe_unpack32(&gres_step_ptr->node_cnt, buffer);
safe_unpack8(&has_file, buffer);
if (has_file) {
gres_step_ptr->gres_bit_alloc =
xmalloc(sizeof(bitstr_t) *
gres_step_ptr->node_cnt);
for (i=0; i<gres_step_ptr->node_cnt; i++) {
unpack_bit_str(&gres_step_ptr->gres_bit_alloc[i],
buffer);
}
}
for (i=0; i<gres_context_cnt; i++) {
if (gres_context[i].plugin_id == plugin_id)
break;
}
if (i >= gres_context_cnt) {
error("gres_plugin_step_state_unpack: no plugin "
/* A likely sign that GresPlugins has changed.
* Not a fatal error, skip over the data. */
info("gres_plugin_step_state_unpack: no plugin "
"configured to unpack data type %u from "
"step %u.%u",
plugin_id, job_id, step_id);
/* A likely sign that GresPlugins has changed.
* Not a fatal error, skip over the data. */
tail_offset = get_buf_offset(buffer);
tail_offset += gres_size;
set_buf_offset(buffer, tail_offset);
_step_state_delete(gres_step_ptr);
gres_step_ptr = NULL;
continue;
}
rc2 = _step_state_unpack(&gres_data, buffer,
gres_context[i].gres_name);
if (rc2 != SLURM_SUCCESS) {
rc = rc2;
} else {
gres_ptr = xmalloc(sizeof(gres_state_t));
gres_ptr->plugin_id = gres_context[i].plugin_id;
gres_ptr->gres_data = gres_data;
list_append(*gres_list, gres_ptr);
}
gres_ptr = xmalloc(sizeof(gres_state_t));
gres_ptr->plugin_id = gres_context[i].plugin_id;
gres_ptr->gres_data = gres_step_ptr;
gres_step_ptr = NULL;
list_append(*gres_list, gres_ptr);
}
slurm_mutex_unlock(&gres_context_lock);
return rc;
unpack_error:
error("gres_plugin_job_state_unpack: unpack error from step %u.%u",
error("gres_plugin_step_state_unpack: unpack error from step %u.%u",
job_id, step_id);
if (gres_step_ptr)
_step_state_delete(gres_step_ptr);
slurm_mutex_unlock(&gres_context_lock);
return SLURM_ERROR;
}
......@@ -3333,7 +3274,9 @@ static void _step_state_log(void *gres_data, uint32_t job_id, uint32_t step_id,
info(" gres_cnt:%u node_cnt:%u", gres_ptr->gres_cnt_alloc,
gres_ptr->node_cnt);
if (gres_ptr->node_cnt && gres_ptr->gres_bit_alloc) {
if (gres_ptr->gres_bit_alloc == NULL)
info(" gres_bit_alloc:NULL");
else {
for (i=0; i<gres_ptr->node_cnt; i++) {
if (gres_ptr->gres_bit_alloc[i]) {
bit_fmt(tmp_str, sizeof(tmp_str),
......@@ -3342,8 +3285,6 @@ static void _step_state_log(void *gres_data, uint32_t job_id, uint32_t step_id,
} else
info(" gres_bit_alloc[%d]:NULL", i);
}
} else {
info(" gres_bit_alloc:NULL");
}
}
......@@ -3368,8 +3309,7 @@ extern void gres_plugin_step_state_log(List gres_list, uint32_t job_id,
gres_iter = list_iterator_create(gres_list);
while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
for (i=0; i<gres_context_cnt; i++) {
if (gres_ptr->plugin_id !=
gres_context[i].plugin_id)
if (gres_ptr->plugin_id != gres_context[i].plugin_id)
continue;
_step_state_log(gres_ptr->gres_data, job_id, step_id,
gres_context[i].gres_name);
......
......@@ -113,7 +113,6 @@ typedef struct gres_step_state {
/* Resources currently allocated to the job step on each node */
uint32_t node_cnt;
bitstr_t **gres_bit_alloc;
uint32_t *gres_cnt_step_alloc;
} gres_step_state_t;
/*
......@@ -296,6 +295,9 @@ List gres_plugin_job_state_dup(List gres_list);
* IN gres_list - generated by gres_plugin_job_config_validate()
* IN/OUT buffer - location to write state to
* IN job_id - job's ID
*
* NOTE: A job's allocation to steps is not recorded here, but recovered with
* the job step state information upon slurmctld restart.
*/
extern int gres_plugin_job_state_pack(List gres_list, Buf buffer,
uint32_t job_id);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment