From bc2e11e058e41e661586b8a743b9e83de10d9f93 Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Fri, 4 Jan 2019 16:11:35 -0700 Subject: [PATCH] Add infrastructure to support GRES suffix of "%" This also adds documentation and some tests. The actual scheduling logic is still not written. --- doc/html/gres.shtml | 2 +- doc/man/man1/salloc.1 | 7 ++++ doc/man/man1/sbatch.1 | 7 ++++ doc/man/man1/srun.1 | 7 ++++ src/common/gres.c | 70 +++++++++++++++++++++++++++++++++------ src/common/gres.h | 5 +++ testsuite/expect/test40.1 | 50 ++++++++++++++++++++++++++++ 7 files changed, 137 insertions(+), 11 deletions(-) diff --git a/doc/html/gres.shtml b/doc/html/gres.shtml index 09523db5f5f..7c3e309a8aa 100644 --- a/doc/html/gres.shtml +++ b/doc/html/gres.shtml @@ -329,7 +329,7 @@ select/cons_tres plugin.</P> <p>Job requests for MPS will be processed the same as any other GRES except that the request must be satisfied using only one GPU per node. -A job request for "--gres=mps:50" will not be satisfied by using 20 percent +A job request for "--gres=mps:50%" will not be satisfied by using 20 percent of one GPU and 30 percent of a second GPU on a single node. Also note that GRES types of GPU <u>and</u> MPS can not be requested within a single job.</p> diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1 index a37096c0751..a5ecd06b35b 100644 --- a/doc/man/man1/salloc.1 +++ b/doc/man/man1/salloc.1 @@ -654,6 +654,13 @@ Specifies a comma delimited list of generic consumable resources. The format of each entry on the list is "name[[:type]:count]". The name is that of the consumable resource. The count is the number of those resources with a default value of 1. +The count can have a suffix of +"k" or "K" (multiple of 1024), +"m" or "M" (multiple of 1024 x 1024), +"g" or "G" (multiple of 1024 x 1024 x 1024), +"t" or "T" (multiple of 1024 x 1024 x 1024 x 1024), +"p" or "P" (multiple of 1024 x 1024 x 1024 x 1024 x 1024), or +"%" (percentage of a GPU for gres/mps). The specified resources will be allocated to the job on each node. The available generic consumable resources is configurable by the system administrator. diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index 362e5c93cfb..eb3238f7006 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -747,6 +747,13 @@ Specifies a comma delimited list of generic consumable resources. The format of each entry on the list is "name[[:type]:count]". The name is that of the consumable resource. The count is the number of those resources with a default value of 1. +The count can have a suffix of +"k" or "K" (multiple of 1024), +"m" or "M" (multiple of 1024 x 1024), +"g" or "G" (multiple of 1024 x 1024 x 1024), +"t" or "T" (multiple of 1024 x 1024 x 1024 x 1024), +"p" or "P" (multiple of 1024 x 1024 x 1024 x 1024 x 1024), or +"%" (percentage of a GPU for gres/mps). The specified resources will be allocated to the job on each node. The available generic consumable resources is configurable by the system administrator. diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index 67ca9c3d672..0944e1c230b 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -928,6 +928,13 @@ Specifies a comma delimited list of generic consumable resources. The format of each entry on the list is "name[[:type]:count]". The name is that of the consumable resource. The count is the number of those resources with a default value of 1. +The count can have a suffix of +"k" or "K" (multiple of 1024), +"m" or "M" (multiple of 1024 x 1024), +"g" or "G" (multiple of 1024 x 1024 x 1024), +"t" or "T" (multiple of 1024 x 1024 x 1024 x 1024), +"p" or "P" (multiple of 1024 x 1024 x 1024 x 1024 x 1024), or +"%" (percentage of a GPU for gres/mps). The specified resources will be allocated to the job on each node. The available generic consumable resources is configurable by the system administrator. diff --git a/src/common/gres.c b/src/common/gres.c index ae58e2dc926..b33dd4a6488 100644 --- a/src/common/gres.c +++ b/src/common/gres.c @@ -3793,18 +3793,22 @@ static int _test_gres_cnt(gres_job_state_t *job_gres_data, * in_val IN - initial input string * type OUT - must be xfreed by caller * cnt OUT - count of values + * flags OUT - user flags (GRES_PERCENT, GRES_NO_CONSUME) * save_ptr IN/OUT - NULL on initial call, otherwise value from previous call * RET rc - error code */ static int _get_next_gres(char *in_val, char **type_ptr, int *context_inx_ptr, - uint64_t *cnt, char **save_ptr) + uint64_t *cnt, uint16_t *flags, char **save_ptr) { char *end_ptr = NULL, *comma, *sep, *sep2, *name = NULL, *type = NULL; size_t offset = 0; int i, rc = SLURM_SUCCESS; unsigned long long int value; + xassert(cnt); + xassert(flags); xassert(save_ptr); + *flags = 0; if (!in_val && (*save_ptr == NULL)) { return rc; @@ -3894,7 +3898,10 @@ next: if (*save_ptr[0] == '\0') { /* Empty input token */ rc = ESLURM_INVALID_GRES; goto fini; } - if ((end_ptr[0] == 'k') || (end_ptr[0] == 'K')) { + if (end_ptr[0] == '%') { + *flags |= GRES_PERCENT; + end_ptr++; + } else if ((end_ptr[0] == 'k') || (end_ptr[0] == 'K')) { value *= 1024; end_ptr++; } else if ((end_ptr[0] == 'm') || (end_ptr[0] == 'M')) { @@ -3958,6 +3965,7 @@ static gres_job_state_t *_get_next_job_gres(char *in_val, uint64_t *cnt, gres_state_t *gres_ptr; gres_key_t job_search_key; char *type = NULL, *name = NULL; + uint16_t flags = 0; xassert(save_ptr); if (!in_val && (*save_ptr == NULL)) { @@ -3976,12 +3984,19 @@ static gres_job_state_t *_get_next_job_gres(char *in_val, uint64_t *cnt, return NULL; } - if ((my_rc = _get_next_gres(in_val, &type, - &context_inx, cnt, &prev_save_ptr)) || + if ((my_rc = _get_next_gres(in_val, &type, &context_inx, + cnt, &flags, &prev_save_ptr)) || (context_inx == NO_VAL)) { prev_save_ptr = NULL; goto fini; } + if ((flags & GRES_PERCENT) && + ((*cnt > 100) || + !_shared_gres(gres_context[context_inx].plugin_id))) { + my_rc = ESLURM_INVALID_GRES; + goto fini; + } + /* Find the job GRES record */ job_search_key.plugin_id = gres_context[context_inx].plugin_id; job_search_key.type_id = gres_plugin_build_id(type); @@ -4002,6 +4017,7 @@ static gres_job_state_t *_get_next_job_gres(char *in_val, uint64_t *cnt, gres_ptr->gres_data = job_gres_data; list_append(gres_list, gres_ptr); } + job_gres_data->flags = flags; fini: xfree(name); xfree(type); @@ -5010,6 +5026,7 @@ extern int gres_plugin_job_state_pack(List gres_list, Buf buffer, pack32(magic, buffer); pack32(gres_ptr->plugin_id, buffer); pack16(gres_job_ptr->cpus_per_gres, buffer); + pack16(gres_job_ptr->flags, buffer); pack64(gres_job_ptr->gres_per_job, buffer); pack64(gres_job_ptr->gres_per_node, buffer); pack64(gres_job_ptr->gres_per_socket, buffer); @@ -5205,6 +5222,7 @@ extern int gres_plugin_job_state_unpack(List *gres_list, Buf buffer, safe_unpack32(&plugin_id, buffer); gres_job_ptr = xmalloc(sizeof(gres_job_state_t)); safe_unpack16(&gres_job_ptr->cpus_per_gres, buffer); + safe_unpack16(&gres_job_ptr->flags, buffer); safe_unpack64(&gres_job_ptr->gres_per_job, buffer); safe_unpack64(&gres_job_ptr->gres_per_node, buffer); safe_unpack64(&gres_job_ptr->gres_per_socket, buffer); @@ -9733,6 +9751,25 @@ extern void gres_plugin_job_set_defs(List job_gres_list, char *gres_name, list_iterator_destroy(gres_iter); } +/* + * Translate GRES flag to string. + * NOT reentrant + */ +static char *_gres_flags_str(uint16_t flags) +{ + static char flag_str[128]; + + flag_str[0] = '\0'; + if (flags & GRES_PERCENT) + strcat(flag_str, "percent"); + if (flags & GRES_NO_CONSUME) { + if (flag_str[0]) + strcat(flag_str, ","); + strcat(flag_str, "no_consume"); + } + return flag_str; +} + static void _job_state_log(void *gres_data, uint32_t job_id, uint32_t plugin_id) { gres_job_state_t *gres_ptr; @@ -9741,9 +9778,9 @@ static void _job_state_log(void *gres_data, uint32_t job_id, uint32_t plugin_id) xassert(gres_data); gres_ptr = (gres_job_state_t *) gres_data; - info("gres:%s(%u) type:%s(%u) job:%u state", + info("gres:%s(%u) type:%s(%u) job:%u flags:%s state", gres_ptr->gres_name, plugin_id, gres_ptr->type_name, - gres_ptr->type_id, job_id); + gres_ptr->type_id, job_id, _gres_flags_str(gres_ptr->flags)); if (gres_ptr->cpus_per_gres) info(" cpus_per_gres:%u", gres_ptr->cpus_per_gres); else if (gres_ptr->def_cpus_per_gres) @@ -10137,6 +10174,7 @@ static gres_step_state_t *_get_next_step_gres(char *in_val, uint64_t *cnt, gres_state_t *gres_ptr; gres_key_t step_search_key; char *type = NULL, *name = NULL; + uint16_t flags = 0; xassert(save_ptr); if (!in_val && (*save_ptr == NULL)) { @@ -10155,12 +10193,20 @@ static gres_step_state_t *_get_next_step_gres(char *in_val, uint64_t *cnt, return NULL; } - if ((my_rc = _get_next_gres(in_val, &type, - &context_inx, cnt, &prev_save_ptr)) || + if ((my_rc = _get_next_gres(in_val, &type, &context_inx, + cnt, &flags, &prev_save_ptr)) || (context_inx == NO_VAL)) { prev_save_ptr = NULL; goto fini; } + if ((flags & GRES_PERCENT) && + ((*cnt > 100) || + !_shared_gres(gres_context[context_inx].plugin_id))) { + my_rc = ESLURM_INVALID_GRES; + prev_save_ptr = NULL; + goto fini; + } + /* Find the step GRES record */ step_search_key.plugin_id = gres_context[context_inx].plugin_id; step_search_key.type_id = gres_plugin_build_id(type); @@ -10179,6 +10225,7 @@ static gres_step_state_t *_get_next_step_gres(char *in_val, uint64_t *cnt, gres_ptr->gres_data = step_gres_data; list_append(gres_list, gres_ptr); } + step_gres_data->flags = flags; fini: xfree(name); xfree(type); @@ -10631,6 +10678,7 @@ extern int gres_plugin_step_state_pack(List gres_list, Buf buffer, pack32(magic, buffer); pack32(gres_ptr->plugin_id, buffer); pack16(gres_step_ptr->cpus_per_gres, buffer); + pack16(gres_step_ptr->flags, buffer); pack64(gres_step_ptr->gres_per_step, buffer); pack64(gres_step_ptr->gres_per_node, buffer); pack64(gres_step_ptr->gres_per_socket, buffer); @@ -10757,6 +10805,7 @@ extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer, safe_unpack32(&plugin_id, buffer); gres_step_ptr = xmalloc(sizeof(gres_step_state_t)); safe_unpack16(&gres_step_ptr->cpus_per_gres, buffer); + safe_unpack16(&gres_step_ptr->flags, buffer); safe_unpack64(&gres_step_ptr->gres_per_step, buffer); safe_unpack64(&gres_step_ptr->gres_per_node, buffer); safe_unpack64(&gres_step_ptr->gres_per_socket, buffer); @@ -11247,8 +11296,9 @@ static void _step_state_log(void *gres_data, uint32_t job_id, uint32_t step_id, int i; xassert(gres_ptr); - info("gres:%s type:%s(%u) step:%u.%u state", gres_name, - gres_ptr->type_name, gres_ptr->type_id, job_id, step_id); + info("gres:%s type:%s(%u) step:%u.%u flags:%s state", gres_name, + gres_ptr->type_name, gres_ptr->type_id, job_id, step_id, + _gres_flags_str(gres_ptr->flags)); if (gres_ptr->cpus_per_gres) info(" cpus_per_gres:%u", gres_ptr->cpus_per_gres); if (gres_ptr->gres_per_step) diff --git a/src/common/gres.h b/src/common/gres.h index f3a8172358f..a26b8df5fe6 100644 --- a/src/common/gres.h +++ b/src/common/gres.h @@ -68,6 +68,9 @@ typedef struct { #define GRES_CONF_HAS_FILE 0x02 /* File= is configured */ #define GRES_CONF_HAS_TYPE 0x04 /* Type= is configured */ +#define GRES_PERCENT 0x0001 /* Requesting percentage of available */ +#define GRES_NO_CONSUME 0x0002 /* Requesting no consume of resources */ + /* Gres state information gathered by slurmd daemon */ typedef struct gres_slurmd_conf { uint8_t config_flags; /* See GRES_CONF_* values above */ @@ -170,6 +173,7 @@ typedef struct gres_job_state { char *gres_name; /* GRES name (e.g. "gpu") */ uint32_t type_id; /* GRES type (e.g. model ID) */ char *type_name; /* GRES type (e.g. model name) */ + uint16_t flags; /* GRES_PERCENT, GRES_NO_CONSUME, etc. */ /* Count of required GRES resources plus associated CPUs and memory */ uint16_t cpus_per_gres; @@ -222,6 +226,7 @@ typedef struct gres_job_state { typedef struct gres_step_state { uint32_t type_id; /* GRES type (e.g. model ID) */ char *type_name; /* GRES type (e.g. model name) */ + uint16_t flags; /* GRES_PERCENT, GRES_NO_CONSUME, etc. */ /* Count of required GRES resources plus associated CPUs and memory */ uint16_t cpus_per_gres; diff --git a/testsuite/expect/test40.1 b/testsuite/expect/test40.1 index 99f074a6e32..24130c64eab 100755 --- a/testsuite/expect/test40.1 +++ b/testsuite/expect/test40.1 @@ -104,6 +104,56 @@ expect { } } +# +# Request over 100% MPS +# +send_user "\n\n==== TEST 3 ====\n" +spawn $sbatch --gres=mps:101% -N1 --output=/dev/null -t1 --wrap $bin_hostname +expect { + -re "Submitted batch job ($number)" { + cancel_job $expect_out(1,string) + send_user "\nFAILURE: batch request not rejected\n" + set exit_code 1 + exp_continue + } + -re "error: " { + send_user "\nError is expected, no worries.\n" + exp_continue + } + timeout { + send_user "\nFAILURE: sbatch not responding\n" + set exit_code 1 + } + eof { + wait + } +} + +# +# Request over percentage of GPUs +# +send_user "\n\n==== TEST 4 ====\n" +spawn $sbatch --gres=gpu:10% -N1 --output=/dev/null -t1 --wrap $bin_hostname +expect { + -re "Submitted batch job ($number)" { + cancel_job $expect_out(1,string) + send_user "\nFAILURE: batch request not rejected\n" + set exit_code 1 + exp_continue + } + -re "error: " { + send_user "\nError is expected, no worries.\n" + exp_continue + } + timeout { + send_user "\nFAILURE: sbatch not responding\n" + set exit_code 1 + } + eof { + wait + } +} + # # Request MPS per job with node count > 1 # Request MPS per socket with socket count > 1 -- GitLab