From 14a9a55c0c6a616f3c00177af6b433a34c9d49a9 Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Mon, 21 Dec 2015 13:18:22 -0800 Subject: [PATCH] burst_buffer/cray: work Interactive pools support state save/restore fix --- .../burst_buffer/common/burst_buffer_common.c | 2 + .../burst_buffer/common/burst_buffer_common.h | 2 +- .../burst_buffer/cray/burst_buffer_cray.c | 54 +++++++++++++------ 3 files changed, 41 insertions(+), 17 deletions(-) diff --git a/src/plugins/burst_buffer/common/burst_buffer_common.c b/src/plugins/burst_buffer/common/burst_buffer_common.c index 2b3cb458839..8a8720f84d0 100644 --- a/src/plugins/burst_buffer/common/burst_buffer_common.c +++ b/src/plugins/burst_buffer/common/burst_buffer_common.c @@ -993,6 +993,7 @@ extern bb_alloc_t *bb_alloc_job_rec(bb_state_t *state_ptr, xstrfmtcat(bb_alloc->name, "%u", job_ptr->job_id); bb_alloc->next = state_ptr->bb_ahash[i]; bb_alloc->partition = xstrdup(bb_job->partition); + bb_alloc->pool = xstrdup(bb_job->job_pool); bb_alloc->qos = xstrdup(bb_job->qos); state_ptr->bb_ahash[i] = bb_alloc; bb_alloc->size = bb_job->total_size; @@ -1035,6 +1036,7 @@ extern void bb_free_alloc_buf(bb_alloc_t *bb_alloc) xfree(bb_alloc->gres_ptr); xfree(bb_alloc->name); xfree(bb_alloc->partition); + xfree(bb_alloc->pool); xfree(bb_alloc->qos); xfree(bb_alloc); } diff --git a/src/plugins/burst_buffer/common/burst_buffer_common.h b/src/plugins/burst_buffer/common/burst_buffer_common.h index e49338c3617..e6e9db9ebc4 100644 --- a/src/plugins/burst_buffer/common/burst_buffer_common.h +++ b/src/plugins/burst_buffer/common/burst_buffer_common.h @@ -299,7 +299,7 @@ extern int bb_job_queue_sort(void *x, void *y); /* Load and process configuration parameters */ extern void bb_load_config(bb_state_t *state_ptr, char *plugin_type); -/* Pack individual burst buffer records into a buffer */ +/* Pack individual burst buffer records into a buffer */ extern int bb_pack_bufs(uid_t uid, bb_state_t *state_ptr, Buf buffer, uint16_t protocol_version); diff --git a/src/plugins/burst_buffer/cray/burst_buffer_cray.c b/src/plugins/burst_buffer/cray/burst_buffer_cray.c index 81059cfccd3..8be1ee35f6e 100644 --- a/src/plugins/burst_buffer/cray/burst_buffer_cray.c +++ b/src/plugins/burst_buffer/cray/burst_buffer_cray.c @@ -431,7 +431,6 @@ static void *_bb_agent(void *args) static bb_job_t *_get_bb_job(struct job_record *job_ptr) { char *bb_specs, *bb_hurry, *bb_name, *bb_type, *bb_access, *bb_pool; - char *job_pool = NULL; char *end_ptr = NULL, *save_ptr = NULL, *sub_tok, *tok; bool have_bb = false; uint64_t tmp_cnt; @@ -566,9 +565,9 @@ static bb_job_t *_get_bb_job(struct job_record *job_ptr) } bb_job->total_size += tmp_cnt; if ((sub_tok = strstr(tok, "pool="))) { - xfree(job_pool); - job_pool = xstrdup(sub_tok + 5); - sub_tok = strchr(job_pool, ' '); + xfree(bb_job->job_pool); + bb_job->job_pool = xstrdup(sub_tok + 5); + sub_tok = strchr(bb_job->job_pool, ' '); if (sub_tok) sub_tok[0] = '\0'; } @@ -613,9 +612,9 @@ static bb_job_t *_get_bb_job(struct job_record *job_ptr) bb_job->total_size += (bb_job->swap_size * bb_job->swap_nodes); if ((sub_tok = strstr(tok, "pool="))) { - xfree(job_pool); - job_pool = xstrdup(sub_tok + 5); - sub_tok = strchr(job_pool, ' '); + xfree(bb_job->job_pool); + bb_job->job_pool = xstrdup(sub_tok + 5); + sub_tok = strchr(bb_job->job_pool, ' '); if (sub_tok) sub_tok[0] = '\0'; } @@ -639,6 +638,8 @@ static bb_job_t *_get_bb_job(struct job_record *job_ptr) bb_job_del(&bb_state, job_ptr->job_id); return NULL; } + if (!bb_job->job_pool) + bb_job->job_pool = xstrdup(bb_state.bb_config.default_pool); if (bb_state.bb_config.debug_flag) bb_job_log(&bb_state, bb_job); return bb_job; @@ -676,7 +677,7 @@ static void _save_bb_state(void) char *old_file = NULL, *new_file = NULL, *reg_file = NULL; int i, count_offset, offset, state_fd; int error_code = 0; - uint16_t protocol_version = SLURM_15_08_PROTOCOL_VERSION; + uint16_t protocol_version = SLURM_PROTOCOL_VERSION; if ((bb_state.last_update_time <= last_save_time) && !bb_state.term_flag) @@ -891,8 +892,6 @@ static void _recover_bb_state(void) bb_alloc->array_job_id = bb_alloc->job_id; bb_alloc->array_task_id = NO_VAL; } - bb_alloc->pool = pool; - pool = NULL; bb_alloc->seen_time = time(NULL); bb_alloc->size = size; } else { @@ -1930,6 +1929,7 @@ static int _test_size_limit(struct job_record *job_ptr, bb_job_t *bb_job) if (add_space > bb_state.total_space) return 1; +//FIXME: Needs work for multiple pools resv_bb = job_test_bb_resv(job_ptr, now); if (resv_bb) { burst_buffer_info_t *resv_bb_ptr; @@ -1961,7 +1961,7 @@ static int _test_size_limit(struct job_record *job_ptr, bb_job_t *bb_job) granularity); bb_job->gres_ptr[i].count = tmp_g; if (tmp_g > bb_state.bb_config.gres_ptr[j].avail_cnt) { - debug("%s: %s requests more %s GRES than" + debug("%s: %s requests more in pool %s than" "configured", __func__, jobid2fmt(job_ptr, jobid_buf, sizeof(jobid_buf)), @@ -1982,7 +1982,7 @@ static int _test_size_limit(struct job_record *job_ptr, bb_job_t *bb_job) break; } if (j >= bb_state.bb_config.gres_cnt) { - debug("%s: %s requests %s GRES which are undefined", + debug("%s: %s requests resources in undefined pool %s", __func__, jobid2fmt(job_ptr, jobid_buf, sizeof(jobid_buf)), bb_job->gres_ptr[i].name); @@ -2361,7 +2361,7 @@ static int _xlate_batch(struct job_descriptor *job_desc) * burst_buffer options in a batch script file */ static int _xlate_interactive(struct job_descriptor *job_desc) { - char *access = NULL, *type = NULL; + char *access = NULL, *pool = NULL, *type = NULL; char *end_ptr = NULL, *tok; uint64_t buf_size = 0, swap_cnt = 0; int rc = SLURM_SUCCESS; @@ -2387,6 +2387,17 @@ static int _xlate_interactive(struct job_descriptor *job_desc) } } + + if ((tok = strstr(job_desc->burst_buffer, "pool="))) { + pool = xstrdup(tok + 5); + tok = strchr(pool, ','); + if (tok) + tok[0] = '\0'; + tok = strchr(pool, ' '); + if (tok) + tok[0] = '\0'; + } + if ((tok = strstr(job_desc->burst_buffer, "swap="))) swap_cnt = strtol(tok + 5, &end_ptr, 10); @@ -2405,9 +2416,15 @@ static int _xlate_interactive(struct job_descriptor *job_desc) if ((rc == SLURM_SUCCESS) && (swap_cnt || buf_size)) { if (swap_cnt) { xstrfmtcat(job_desc->burst_buffer, - "#DW swap %"PRIu64"GiB\n", swap_cnt); + "#DW swap %"PRIu64"GiB", swap_cnt); + if (pool) { + xstrfmtcat(job_desc->burst_buffer, + " pool=%s", pool); + } } if (buf_size) { + if (job_desc->burst_buffer) + xstrfmtcat(job_desc->burst_buffer, "\n"); xstrfmtcat(job_desc->burst_buffer, "#DW jobdw capacity=%s", bb_get_size_str(buf_size)); @@ -2415,15 +2432,19 @@ static int _xlate_interactive(struct job_descriptor *job_desc) xstrfmtcat(job_desc->burst_buffer, " access_mode=%s", access); } + if (pool) { + xstrfmtcat(job_desc->burst_buffer, + " pool=%s", pool); + } if (type) { xstrfmtcat(job_desc->burst_buffer, " type=%s", type); } - xstrfmtcat(job_desc->burst_buffer, "\n"); } } fini: xfree(access); + xfree(pool); xfree(type); return rc; } @@ -3570,7 +3591,8 @@ static int _create_bufs(struct job_record *job_ptr, bb_job_t *bb_job, if (bb_job->persist_add >= bb_alloc->size) { bb_job->persist_add -= bb_alloc->size; } else { - error("%s: Persistent buffer size underflow for job %u", + error("%s: Persistent buffer size " + "underflow for job %u", __func__, job_ptr->job_id); bb_job->persist_add = 0; } -- GitLab