From 60a9ec12e18f260cde569cc3e675cf2cb1688e4d Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Fri, 9 Jan 2015 15:44:17 -0800 Subject: [PATCH] Burst buffer enhancements Remove GRES spec from burst_buffer.conf There is no logic in the generic plugin to support this and the cray plugin does not get the information from the config file. Change allow/deny user separator from colon to comma Confirm AllowUsers for swap and gres space Make sure nodes not shared on cray system Minor changes in several other places --- doc/man/man5/burst_buffer.conf.5 | 37 ++++++++++--------- src/common/slurm_errno.c | 2 +- .../burst_buffer/common/burst_buffer_common.c | 22 ++++++++--- .../burst_buffer/cray/burst_buffer_cray.c | 16 +++++--- 4 files changed, 47 insertions(+), 30 deletions(-) diff --git a/doc/man/man5/burst_buffer.conf.5 b/doc/man/man5/burst_buffer.conf.5 index 66bc16b5f41..70231215608 100644 --- a/doc/man/man5/burst_buffer.conf.5 +++ b/doc/man/man5/burst_buffer.conf.5 @@ -54,17 +54,18 @@ Slurm distribution for an example. Granularity of job space allocations in units of gigabytes. The default value is 1 gigabyte. -.TP -\fBGres\fR -Generic resources associated with burst buffers. -This is a completely separate name space from the Gres defined in the slurm.conf -file. -The Gres value consistes of a comma separated list of generic resources, -each of which includes a name separated by a colon and a numeric value. -The numeric value can include a suffic of "k", "m" or "g", which multiplies -the numeric value by 1,024, 1,048,576, or 1,073,741,824 respectively. -The numeric value is a 32-bit value. -See the example below. +.\ Possible future enhancement +.\ .TP +.\ \fBGres\fR +.\ Generic resources associated with burst buffers. +.\ This is a completely separate name space from the Gres defined in the slurm.conf +.\ file. 
+.\ The Gres value consists of a comma separated list of generic resources, +.\ each of which includes a name separated by a colon and a numeric value. +.\ The numeric value can include a suffix of "k", "m" or "g", which multiplies +.\ the numeric value by 1,024, 1,048,576, or 1,073,741,824 respectively. +.\ The numeric value is a 32-bit value. +.\ See the example below. .TP \fBJobSizeLimit\fR @@ -155,19 +156,19 @@ By default there is no job allocation size limit. .br ################################################################## .br -AllowUsers=alan:brenda +AllowUsers=alan,brenda .br PrivateData=true -.br -Gres=nodes:10,other:20 +.\ .br +.\ Gres=nodes:10,other:20 .br # .br Granularity=1GB .br -JobSizeLimit=20GB # Applies to each job +JobSizeLimit=200GB # Applies to each job .br -UserSizeLimit=50GB # Applies to ALL users +UserSizeLimit=500GB # Applies to each user .br # .br @@ -177,9 +178,9 @@ PrioBoostAlloc=200 .br # .br -StageInTimeout=30 +StageInTimeout=30 # Seconds .br -StageOutTimeout=30 +StageOutTimeout=30 # Seconds .br # .br diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index 21b71fd52d6..34c9dda31b3 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -310,7 +310,7 @@ static slurm_errtab_t slurm_errtab[] = { "BurstBufferType change requires restart of slurmctld daemon " "to take effect"}, { ESLURM_BURST_BUFFER_PERMISSION, - "Burst Buffer permssion denied" }, + "Burst Buffer permission denied" }, { ESLURM_BURST_BUFFER_LIMIT, "Burst Buffer resource limit exceeded" }, { ESLURM_INVALID_BURST_BUFFER_REQUEST, diff --git a/src/plugins/burst_buffer/common/burst_buffer_common.c b/src/plugins/burst_buffer/common/burst_buffer_common.c index ffd3310d56e..63b5ef29ae1 100644 --- a/src/plugins/burst_buffer/common/burst_buffer_common.c +++ b/src/plugins/burst_buffer/common/burst_buffer_common.c @@ -67,7 +67,10 @@ #include "burst_buffer_common.h" -/* Translate colon delimitted list of users into a UID array, +/* For possible future 
use by burst_buffer/generic */ +#define _SUPPORT_GRES 0 + +/* Translate comma delimitted list of users into a UID array, * Return value must be xfreed */ static uid_t *_parse_users(char *buf) { @@ -83,7 +86,7 @@ static uid_t *_parse_users(char *buf) delim[0] = '\0'; array_size = 1; user_array = xmalloc(sizeof(uid_t) * array_size); - tok = strtok_r(tmp, ":", &save_ptr); + tok = strtok_r(tmp, ",", &save_ptr); while (tok) { if ((uid_from_string(tok, user_array + inx) == -1) || (user_array[inx] == 0)) { @@ -95,7 +98,7 @@ static uid_t *_parse_users(char *buf) sizeof(uid_t)*array_size); } } - tok = strtok_r(NULL, ":", &save_ptr); + tok = strtok_r(NULL, ",", &save_ptr); } xfree(tmp); return user_array; @@ -116,7 +119,7 @@ static char *_print_users(uid_t *buf) if (!user_elem) continue; if (user_str) - xstrcat(user_str, ":"); + xstrcat(user_str, ","); xstrcat(user_str, user_elem); xfree(user_elem); } @@ -322,6 +325,7 @@ extern void bb_remove_user_load(bb_alloc_t *bb_ptr, bb_state_t *state_ptr) } } +#if _SUPPORT_GRES static uint32_t _atoi(char *tok) { char *end_ptr = NULL; @@ -341,20 +345,24 @@ static uint32_t _atoi(char *tok) } return size_u; } +#endif /* Load and process configuration parameters */ extern void bb_load_config(bb_state_t *state_ptr, char *type) { s_p_hashtbl_t *bb_hashtbl = NULL; - char *bb_conf, *colon, *save_ptr, *tmp = NULL, *tok, *value; + char *bb_conf, *tmp = NULL, *value; +#if _SUPPORT_GRES + char *colon, *save_ptr = NULL, *tok; uint32_t gres_cnt; +#endif int fd, i; static s_p_options_t bb_options[] = { {"AllowUsers", S_P_STRING}, {"DenyUsers", S_P_STRING}, {"GetSysState", S_P_STRING}, {"Granularity", S_P_STRING}, - {"Gres", S_P_STRING}, +/* {"Gres", S_P_STRING}, */ {"JobSizeLimit", S_P_STRING}, {"PrioBoostAlloc", S_P_UINT32}, {"PrioBoostUse", S_P_UINT32}, @@ -417,6 +425,7 @@ extern void bb_load_config(bb_state_t *state_ptr, char *type) state_ptr->bb_config.granularity = 1; } } +#if _SUPPORT_GRES if (s_p_get_string(&tmp, "Gres", bb_hashtbl)) { tok = 
strtok_r(tmp, ",", &save_ptr); while (tok) { @@ -441,6 +450,7 @@ extern void bb_load_config(bb_state_t *state_ptr, char *type) } xfree(tmp); } +#endif if (s_p_get_string(&tmp, "JobSizeLimit", bb_hashtbl)) { state_ptr->bb_config.job_size_limit = bb_get_size_num(tmp, 1); xfree(tmp); diff --git a/src/plugins/burst_buffer/cray/burst_buffer_cray.c b/src/plugins/burst_buffer/cray/burst_buffer_cray.c index 6569a0d94ff..1d65169087f 100644 --- a/src/plugins/burst_buffer/cray/burst_buffer_cray.c +++ b/src/plugins/burst_buffer/cray/burst_buffer_cray.c @@ -1586,7 +1586,7 @@ extern int bb_p_load_state(bool init_config) { pthread_mutex_lock(&bb_state.bb_mutex); if (bb_state.bb_config.debug_flag) - info("%s: %s", plugin_type, __func__); + debug("%s: %s", plugin_type, __func__); _load_state(); pthread_mutex_unlock(&bb_state.bb_mutex); @@ -1652,6 +1652,7 @@ extern int bb_p_state_pack(uid_t uid, Buf buffer, uint16_t protocol_version) extern int bb_p_job_validate(struct job_descriptor *job_desc, uid_t submit_uid) { + bool have_gres = false, have_swap = false; int32_t bb_size = 0; char *key; int i, rc; @@ -1672,8 +1673,12 @@ extern int bb_p_job_validate(struct job_descriptor *job_desc, bb_size = bb_get_size_num(key + 11, bb_state.bb_config.granularity); } + if (strstr(job_desc->burst_buffer, "SLURM_GRES=")) + have_gres = true; + if (strstr(job_desc->burst_buffer, "SLURM_SWAP=")) + have_swap = true; } - if (bb_size == 0) + if ((bb_size == 0) && (have_gres == false) && (have_swap == false)) return SLURM_SUCCESS; if (bb_size < 0) return ESLURM_BURST_BUFFER_LIMIT; @@ -1716,9 +1721,10 @@ extern int bb_p_job_validate(struct job_descriptor *job_desc, "but total space is only %u", job_desc->user_id, bb_size, bb_state.total_space); } - pthread_mutex_unlock(&bb_state.bb_mutex); + job_desc->shared = 0; /* Compute nodes can not be shared */ + return SLURM_SUCCESS; } @@ -1951,10 +1957,10 @@ extern int bb_p_job_test_stage_in(struct job_record *job_ptr, bool test_only) jobid2fmt(job_ptr, jobid_buf, 
sizeof(jobid_buf)), (int) test_only); } - if (job_ptr->array_recs && (job_ptr->array_task_id == NO_VAL)) - return -1; if ((bb_spec = _get_bb_spec(job_ptr)) == NULL) return rc; + if (job_ptr->array_recs && (job_ptr->array_task_id == NO_VAL)) + return -1; pthread_mutex_lock(&bb_state.bb_mutex); bb_ptr = bb_find_job_rec(job_ptr, bb_state.bb_hash); if (!bb_ptr) { -- GitLab