From bc2e11e058e41e661586b8a743b9e83de10d9f93 Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Fri, 4 Jan 2019 16:11:35 -0700
Subject: [PATCH] Add infrastructure to support GRES suffix of "%"

This also adds documentation and some tests. The actual scheduling
logic is still not written.
---
 doc/html/gres.shtml       |  2 +-
 doc/man/man1/salloc.1     |  7 ++++
 doc/man/man1/sbatch.1     |  7 ++++
 doc/man/man1/srun.1       |  7 ++++
 src/common/gres.c         | 70 +++++++++++++++++++++++++++++++++------
 src/common/gres.h         |  5 +++
 testsuite/expect/test40.1 | 50 ++++++++++++++++++++++++++++
 7 files changed, 137 insertions(+), 11 deletions(-)

diff --git a/doc/html/gres.shtml b/doc/html/gres.shtml
index 09523db5f5f..7c3e309a8aa 100644
--- a/doc/html/gres.shtml
+++ b/doc/html/gres.shtml
@@ -329,7 +329,7 @@ select/cons_tres plugin.</P>
 
 <p>Job requests for MPS will be processed the same as any other GRES except
 that the request must be satisfied using only one GPU per node.
-A job request for "--gres=mps:50" will not be satisfied by using 20 percent
+A job request for "--gres=mps:50%" will not be satisfied by using 20 percent
 of one GPU and 30 percent of a second GPU on a single node.
 Also note that GRES types of GPU <u>and</u> MPS can not be requested within
 a single job.</p>
diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1
index a37096c0751..a5ecd06b35b 100644
--- a/doc/man/man1/salloc.1
+++ b/doc/man/man1/salloc.1
@@ -654,6 +654,13 @@ Specifies a comma delimited list of generic consumable resources.
 The format of each entry on the list is "name[[:type]:count]".
 The name is that of the consumable resource.
 The count is the number of those resources with a default value of 1.
+The count can have a suffix of
+"k" or "K" (multiple of 1024),
+"m" or "M" (multiple of 1024 x 1024),
+"g" or "G" (multiple of 1024 x 1024 x 1024),
+"t" or "T" (multiple of 1024 x 1024 x 1024 x 1024),
+"p" or "P" (multiple of 1024 x 1024 x 1024 x 1024 x 1024), or
+"%" (percentage of a GPU for gres/mps).
 The specified resources will be allocated to the job on each node.
 The available generic consumable resources is configurable by the system
 administrator.
diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1
index 362e5c93cfb..eb3238f7006 100644
--- a/doc/man/man1/sbatch.1
+++ b/doc/man/man1/sbatch.1
@@ -747,6 +747,13 @@ Specifies a comma delimited list of generic consumable resources.
 The format of each entry on the list is "name[[:type]:count]".
 The name is that of the consumable resource.
 The count is the number of those resources with a default value of 1.
+The count can have a suffix of
+"k" or "K" (multiple of 1024),
+"m" or "M" (multiple of 1024 x 1024),
+"g" or "G" (multiple of 1024 x 1024 x 1024),
+"t" or "T" (multiple of 1024 x 1024 x 1024 x 1024),
+"p" or "P" (multiple of 1024 x 1024 x 1024 x 1024 x 1024), or
+"%" (percentage of a GPU for gres/mps).
 The specified resources will be allocated to the job on each node.
 The available generic consumable resources is configurable by the system
 administrator.
diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1
index 67ca9c3d672..0944e1c230b 100644
--- a/doc/man/man1/srun.1
+++ b/doc/man/man1/srun.1
@@ -928,6 +928,13 @@ Specifies a comma delimited list of generic consumable resources.
 The format of each entry on the list is "name[[:type]:count]".
 The name is that of the consumable resource.
 The count is the number of those resources with a default value of 1.
+The count can have a suffix of
+"k" or "K" (multiple of 1024),
+"m" or "M" (multiple of 1024 x 1024),
+"g" or "G" (multiple of 1024 x 1024 x 1024),
+"t" or "T" (multiple of 1024 x 1024 x 1024 x 1024),
+"p" or "P" (multiple of 1024 x 1024 x 1024 x 1024 x 1024), or
+"%" (percentage of a GPU for gres/mps).
 The specified resources will be allocated to the job on each node.
 The available generic consumable resources is configurable by the system
 administrator.
diff --git a/src/common/gres.c b/src/common/gres.c
index ae58e2dc926..b33dd4a6488 100644
--- a/src/common/gres.c
+++ b/src/common/gres.c
@@ -3793,18 +3793,22 @@ static int _test_gres_cnt(gres_job_state_t *job_gres_data,
  * in_val IN - initial input string
  * type OUT -  must be xfreed by caller
  * cnt OUT - count of values
+ * flags OUT - user flags (GRES_PERCENT, GRES_NO_CONSUME)
  * save_ptr IN/OUT - NULL on initial call, otherwise value from previous call
  * RET rc - error code
  */
 static int _get_next_gres(char *in_val, char **type_ptr, int *context_inx_ptr,
-			  uint64_t *cnt, char **save_ptr)
+			  uint64_t *cnt, uint16_t *flags, char **save_ptr)
 {
 	char *end_ptr = NULL, *comma, *sep, *sep2, *name = NULL, *type = NULL;
 	size_t offset = 0;
 	int i, rc = SLURM_SUCCESS;
 	unsigned long long int value;
 
+	xassert(cnt);
+	xassert(flags);
 	xassert(save_ptr);
+	*flags = 0;
 
 	if (!in_val && (*save_ptr == NULL)) {
 		return rc;
@@ -3894,7 +3898,10 @@ next:	if (*save_ptr[0] == '\0') {	/* Empty input token */
 			rc = ESLURM_INVALID_GRES;
 			goto fini;
 		}
-		if ((end_ptr[0] == 'k') || (end_ptr[0] == 'K')) {
+		if (end_ptr[0] == '%') {
+			*flags |= GRES_PERCENT;
+			end_ptr++;
+		} else if ((end_ptr[0] == 'k') || (end_ptr[0] == 'K')) {
 			value *= 1024;
 			end_ptr++;
 		} else if ((end_ptr[0] == 'm') || (end_ptr[0] == 'M')) {
@@ -3958,6 +3965,7 @@ static gres_job_state_t *_get_next_job_gres(char *in_val, uint64_t *cnt,
 	gres_state_t *gres_ptr;
 	gres_key_t job_search_key;
 	char *type = NULL, *name = NULL;
+	uint16_t flags = 0;
 
 	xassert(save_ptr);
 	if (!in_val && (*save_ptr == NULL)) {
@@ -3976,12 +3984,19 @@ static gres_job_state_t *_get_next_job_gres(char *in_val, uint64_t *cnt,
 		return NULL;
 	}
 
-	if ((my_rc = _get_next_gres(in_val, &type,
-				    &context_inx, cnt, &prev_save_ptr)) ||
+	if ((my_rc = _get_next_gres(in_val, &type, &context_inx,
+				    cnt, &flags, &prev_save_ptr)) ||
 	    (context_inx == NO_VAL)) {
 		prev_save_ptr = NULL;
 		goto fini;
 	}
+	if ((flags & GRES_PERCENT) &&
+	    ((*cnt > 100) ||
+	     !_shared_gres(gres_context[context_inx].plugin_id))) {
+		my_rc = ESLURM_INVALID_GRES;
+		goto fini;
+	}
+
 	/* Find the job GRES record */
 	job_search_key.plugin_id = gres_context[context_inx].plugin_id;
 	job_search_key.type_id = gres_plugin_build_id(type);
@@ -4002,6 +4017,7 @@ static gres_job_state_t *_get_next_job_gres(char *in_val, uint64_t *cnt,
 		gres_ptr->gres_data = job_gres_data;
 		list_append(gres_list, gres_ptr);
 	}
+	job_gres_data->flags = flags;
 
 fini:	xfree(name);
 	xfree(type);
@@ -5010,6 +5026,7 @@ extern int gres_plugin_job_state_pack(List gres_list, Buf buffer,
 			pack32(magic, buffer);
 			pack32(gres_ptr->plugin_id, buffer);
 			pack16(gres_job_ptr->cpus_per_gres, buffer);
+			pack16(gres_job_ptr->flags, buffer);
 			pack64(gres_job_ptr->gres_per_job, buffer);
 			pack64(gres_job_ptr->gres_per_node, buffer);
 			pack64(gres_job_ptr->gres_per_socket, buffer);
@@ -5205,6 +5222,7 @@ extern int gres_plugin_job_state_unpack(List *gres_list, Buf buffer,
 			safe_unpack32(&plugin_id, buffer);
 			gres_job_ptr = xmalloc(sizeof(gres_job_state_t));
 			safe_unpack16(&gres_job_ptr->cpus_per_gres, buffer);
+			safe_unpack16(&gres_job_ptr->flags, buffer);
 			safe_unpack64(&gres_job_ptr->gres_per_job, buffer);
 			safe_unpack64(&gres_job_ptr->gres_per_node, buffer);
 			safe_unpack64(&gres_job_ptr->gres_per_socket, buffer);
@@ -9733,6 +9751,25 @@ extern void gres_plugin_job_set_defs(List job_gres_list, char *gres_name,
 	list_iterator_destroy(gres_iter);
 }
 
+/*
+ * Translate GRES flag to string.
+ * NOT reentrant
+ */
+static char *_gres_flags_str(uint16_t flags)
+{
+	static char flag_str[128];
+
+	flag_str[0] = '\0';
+	if (flags & GRES_PERCENT)
+		strcat(flag_str, "percent");
+	if (flags & GRES_NO_CONSUME) {
+		if (flag_str[0])
+			strcat(flag_str, ",");
+		strcat(flag_str, "no_consume");
+	}
+	return flag_str;
+}
+
 static void _job_state_log(void *gres_data, uint32_t job_id, uint32_t plugin_id)
 {
 	gres_job_state_t *gres_ptr;
@@ -9741,9 +9778,9 @@ static void _job_state_log(void *gres_data, uint32_t job_id, uint32_t plugin_id)
 
 	xassert(gres_data);
 	gres_ptr = (gres_job_state_t *) gres_data;
-	info("gres:%s(%u) type:%s(%u) job:%u state",
+	info("gres:%s(%u) type:%s(%u) job:%u flags:%s state",
 	      gres_ptr->gres_name, plugin_id, gres_ptr->type_name,
-	      gres_ptr->type_id, job_id);
+	      gres_ptr->type_id, job_id, _gres_flags_str(gres_ptr->flags));
 	if (gres_ptr->cpus_per_gres)
 		info("  cpus_per_gres:%u", gres_ptr->cpus_per_gres);
 	else if (gres_ptr->def_cpus_per_gres)
@@ -10137,6 +10174,7 @@ static gres_step_state_t *_get_next_step_gres(char *in_val, uint64_t *cnt,
 	gres_state_t *gres_ptr;
 	gres_key_t step_search_key;
 	char *type = NULL, *name = NULL;
+	uint16_t flags = 0;
 
 	xassert(save_ptr);
 	if (!in_val && (*save_ptr == NULL)) {
@@ -10155,12 +10193,20 @@ static gres_step_state_t *_get_next_step_gres(char *in_val, uint64_t *cnt,
 		return NULL;
 	}
 
-	if ((my_rc = _get_next_gres(in_val, &type,
-				    &context_inx, cnt, &prev_save_ptr)) ||
+	if ((my_rc = _get_next_gres(in_val, &type, &context_inx,
+				    cnt, &flags, &prev_save_ptr)) ||
 	    (context_inx == NO_VAL)) {
 		prev_save_ptr = NULL;
 		goto fini;
 	}
+	if ((flags & GRES_PERCENT) &&
+	    ((*cnt > 100) ||
+	     !_shared_gres(gres_context[context_inx].plugin_id))) {
+		my_rc = ESLURM_INVALID_GRES;
+		prev_save_ptr = NULL;
+		goto fini;
+	}
+
 	/* Find the step GRES record */
 	step_search_key.plugin_id = gres_context[context_inx].plugin_id;
 	step_search_key.type_id = gres_plugin_build_id(type);
@@ -10179,6 +10225,7 @@ static gres_step_state_t *_get_next_step_gres(char *in_val, uint64_t *cnt,
 		gres_ptr->gres_data = step_gres_data;
 		list_append(gres_list, gres_ptr);
 	}
+	step_gres_data->flags = flags;
 
 fini:	xfree(name);
 	xfree(type);
@@ -10631,6 +10678,7 @@ extern int gres_plugin_step_state_pack(List gres_list, Buf buffer,
 			pack32(magic, buffer);
 			pack32(gres_ptr->plugin_id, buffer);
 			pack16(gres_step_ptr->cpus_per_gres, buffer);
+			pack16(gres_step_ptr->flags, buffer);
 			pack64(gres_step_ptr->gres_per_step, buffer);
 			pack64(gres_step_ptr->gres_per_node, buffer);
 			pack64(gres_step_ptr->gres_per_socket, buffer);
@@ -10757,6 +10805,7 @@ extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer,
 			safe_unpack32(&plugin_id, buffer);
 			gres_step_ptr = xmalloc(sizeof(gres_step_state_t));
 			safe_unpack16(&gres_step_ptr->cpus_per_gres, buffer);
+			safe_unpack16(&gres_step_ptr->flags, buffer);
 			safe_unpack64(&gres_step_ptr->gres_per_step, buffer);
 			safe_unpack64(&gres_step_ptr->gres_per_node, buffer);
 			safe_unpack64(&gres_step_ptr->gres_per_socket, buffer);
@@ -11247,8 +11296,9 @@ static void _step_state_log(void *gres_data, uint32_t job_id, uint32_t step_id,
 	int i;
 
 	xassert(gres_ptr);
-	info("gres:%s type:%s(%u) step:%u.%u state", gres_name,
-	     gres_ptr->type_name, gres_ptr->type_id, job_id, step_id);
+	info("gres:%s type:%s(%u) step:%u.%u flags:%s state", gres_name,
+	     gres_ptr->type_name, gres_ptr->type_id, job_id, step_id,
+	     _gres_flags_str(gres_ptr->flags));
 	if (gres_ptr->cpus_per_gres)
 		info("  cpus_per_gres:%u", gres_ptr->cpus_per_gres);
 	if (gres_ptr->gres_per_step)
diff --git a/src/common/gres.h b/src/common/gres.h
index f3a8172358f..a26b8df5fe6 100644
--- a/src/common/gres.h
+++ b/src/common/gres.h
@@ -68,6 +68,9 @@ typedef struct {
 #define GRES_CONF_HAS_FILE	0x02	/* File= is configured */
 #define GRES_CONF_HAS_TYPE	0x04	/* Type= is configured */
 
+#define GRES_PERCENT		0x0001	/* Requesting percentage of available */
+#define GRES_NO_CONSUME		0x0002	/* Requesting no consume of resources */
+
 /* Gres state information gathered by slurmd daemon */
 typedef struct gres_slurmd_conf {
 	uint8_t config_flags;	/* See GRES_CONF_* values above */
@@ -170,6 +173,7 @@ typedef struct gres_job_state {
 	char *gres_name;		/* GRES name (e.g. "gpu") */
 	uint32_t type_id;		/* GRES type (e.g. model ID) */
 	char *type_name;		/* GRES type (e.g. model name) */
+	uint16_t flags;			/* GRES_PERCENT, GRES_NO_CONSUME, etc. */
 
 	/* Count of required GRES resources plus associated CPUs and memory */
 	uint16_t cpus_per_gres;
@@ -222,6 +226,7 @@ typedef struct gres_job_state {
 typedef struct gres_step_state {
 	uint32_t type_id;		/* GRES type (e.g. model ID) */
 	char *type_name;		/* GRES type (e.g. model name) */
+	uint16_t flags;			/* GRES_PERCENT, GRES_NO_CONSUME, etc. */
 
 	/* Count of required GRES resources plus associated CPUs and memory */
 	uint16_t cpus_per_gres;
diff --git a/testsuite/expect/test40.1 b/testsuite/expect/test40.1
index 99f074a6e32..24130c64eab 100755
--- a/testsuite/expect/test40.1
+++ b/testsuite/expect/test40.1
@@ -104,6 +104,56 @@ expect {
 	}
 }
 
+#
+# Request over 100% MPS
+#
+send_user "\n\n==== TEST 3 ====\n"
+spawn $sbatch --gres=mps:101% -N1 --output=/dev/null -t1 --wrap $bin_hostname
+expect {
+	-re "Submitted batch job ($number)" {
+		cancel_job $expect_out(1,string)
+		send_user "\nFAILURE: batch request not rejected\n"
+		set exit_code 1
+		exp_continue
+	}
+	-re "error: " {
+		send_user "\nError is expected, no worries.\n"
+		exp_continue
+	}
+	timeout {
+		send_user "\nFAILURE: sbatch not responding\n"
+		set exit_code 1
+	}
+	eof {
+		wait
+	}
+}
+
+#
+# Request over percentage of GPUs
+#
+send_user "\n\n==== TEST 4 ====\n"
+spawn $sbatch --gres=gpu:10% -N1 --output=/dev/null -t1 --wrap $bin_hostname
+expect {
+	-re "Submitted batch job ($number)" {
+		cancel_job $expect_out(1,string)
+		send_user "\nFAILURE: batch request not rejected\n"
+		set exit_code 1
+		exp_continue
+	}
+	-re "error: " {
+		send_user "\nError is expected, no worries.\n"
+		exp_continue
+	}
+	timeout {
+		send_user "\nFAILURE: sbatch not responding\n"
+		set exit_code 1
+	}
+	eof {
+		wait
+	}
+}
+
 #
 # Request MPS per job with node count > 1
 # Request MPS per socket with socket count > 1
-- 
GitLab