From a6fad4035879d90ab7746035e257917e6cef3aae Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Fri, 30 Jul 2010 19:16:27 +0000
Subject: [PATCH] improve logic for node's gres state save/restore/deallocate.
 let job/step specify gres requirement with suffix of M (mega)

---
 src/common/gres.c | 200 ++++++++++++++++++----------------------------
 1 file changed, 79 insertions(+), 121 deletions(-)

diff --git a/src/common/gres.c b/src/common/gres.c
index 04a756cf5e7..70da277e31a 100644
--- a/src/common/gres.c
+++ b/src/common/gres.c
@@ -54,14 +54,17 @@
 #      include <stdint.h>
 #    endif
 #  endif /* HAVE_INTTYPES_H */
+#  ifdef HAVE_LIMITS_H
+#    include <limits.h>
+#  endif
 #else /* ! HAVE_CONFIG_H */
+#  include <limits.h>
 #  include <sys/types.h>
 #  include <stdint.h>
 #  include <stdlib.h>
 #  include <string.h>
 #endif /* HAVE_CONFIG_H */
 
-#include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <slurm/slurm.h>
@@ -151,15 +154,12 @@ static int	_node_reconfig(char *node_name, char *orig_config,
 			       char **new_config, gres_state_t *gres_ptr,
 			       uint16_t fast_schedule,
 			       slurm_gres_context_t *context_ptr);
-static void	_node_state_dealloc(void *gres_data);
+static void	_node_state_dealloc(gres_state_t *gres_ptr);
 static void *	_node_state_dup(void *gres_data);
 static void	_node_state_log(void *gres_data, char *node_name,
 				char *gres_name);
-static void	_node_state_pack(void *gres_data, Buf buffer);
 static int	_node_state_realloc(void *job_gres_data, int node_offset,
 				    void *node_gres_data, char *gres_name);
-static int	_node_state_unpack(void **gres_data, Buf buffer,
-				   bool has_file);
 static int	_parse_gres_config(void **dest, slurm_parser_enum_t type,
 				   const char *key, const char *value,
 				   const char *line, char **leftover);
@@ -633,8 +633,9 @@ static int _parse_gres_config(void **dest, slurm_parser_enum_t type,
 			fatal("bit_alloc: malloc failure");
 		i = bit_unfmt(cpu_bitmap, p->cpus);
 		if (i != 0) {
-			fatal("Invalid gres data for %s, CPUs=%s",
-			      p->name, p->cpus);
+			fatal("Invalid gres data for %s, CPUs=%s (only %u CPUs"
+			      " are available)",
+			      p->name, p->cpus, gres_cpu_cnt);
 		}
 		FREE_NULL_BITMAP(cpu_bitmap);
 	}
@@ -1486,55 +1487,6 @@ extern int gres_plugin_node_reconfig(char *node_name,
 	return rc;
 }
 
-static void _node_state_pack(void *gres_data, Buf buffer)
-{
-	gres_node_state_t *gres_ptr = (gres_node_state_t *) gres_data;
-
-	pack32(gres_ptr->gres_cnt_avail,  buffer);
-	pack32(gres_ptr->gres_cnt_alloc,  buffer);
-	pack_bit_str(gres_ptr->gres_bit_alloc, buffer);
-}
-
-static int _node_state_unpack(void **gres_data, Buf buffer, bool has_file)
-{
-	gres_node_state_t *gres_ptr;
-
-	gres_ptr = xmalloc(sizeof(gres_node_state_t));
-
-	gres_ptr->gres_cnt_found = NO_VAL;
-	if (buffer) {
-		safe_unpack32(&gres_ptr->gres_cnt_avail,  buffer);
-		safe_unpack32(&gres_ptr->gres_cnt_alloc,  buffer);
-		unpack_bit_str(&gres_ptr->gres_bit_alloc, buffer);
-		if (!has_file)
-			FREE_NULL_BITMAP(gres_ptr->gres_bit_alloc);
-		if ((gres_ptr->gres_bit_alloc != NULL) &&
-		    (gres_ptr->gres_cnt_avail !=
-		     bit_size(gres_ptr->gres_bit_alloc))) {
-			gres_ptr->gres_bit_alloc =
-					bit_realloc(gres_ptr->gres_bit_alloc,
-						    gres_ptr->gres_cnt_avail);
-			if (gres_ptr->gres_bit_alloc == NULL)
-				goto unpack_error;
-		}
-		if ((gres_ptr->gres_bit_alloc != NULL) &&
-		    (gres_ptr->gres_cnt_alloc != 
-		     bit_set_count(gres_ptr->gres_bit_alloc))) {
-			error("gres _node_state_unpack bit count inconsistent");
-			goto unpack_error;
-		}
-	}
-
-	*gres_data = gres_ptr;
-	return SLURM_SUCCESS;
-
-unpack_error:
-	FREE_NULL_BITMAP(gres_ptr->gres_bit_alloc);
-	xfree(gres_ptr);
-	*gres_data = NULL;
-	return SLURM_ERROR;
-}
-
 /*
  * Pack a node's current gres status, called from slurmctld for save/restore
  * IN gres_list - generated by gres_plugin_node_config_validate()
@@ -1544,13 +1496,14 @@ unpack_error:
 extern int gres_plugin_node_state_pack(List gres_list, Buf buffer,
 				       char *node_name)
 {
-	int i, rc = SLURM_SUCCESS;
-	uint32_t top_offset, gres_size = 0;
-	uint32_t header_offset, size_offset, data_offset, tail_offset;
+	int rc = SLURM_SUCCESS;
+	uint32_t top_offset, tail_offset;
 	uint32_t magic = GRES_MAGIC;
 	uint16_t rec_cnt = 0;
+	uint8_t  has_bitmap;
 	ListIterator gres_iter;
 	gres_state_t *gres_ptr;
+	gres_node_state_t *gres_node_ptr;
 
 	if (gres_list == NULL) {
 		pack16(rec_cnt, buffer);
@@ -1568,29 +1521,19 @@ extern int gres_plugin_node_state_pack(List gres_list, Buf buffer,
 	slurm_mutex_lock(&gres_context_lock);
 	gres_iter = list_iterator_create(gres_list);
 	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
-		for (i=0; i<gres_context_cnt; i++) {
-			if (gres_ptr->plugin_id != gres_context[i].plugin_id)
-				continue;
-			header_offset = get_buf_offset(buffer);
-			pack32(magic, buffer);
-			pack32(gres_ptr->plugin_id, buffer);
-			size_offset = get_buf_offset(buffer);
-			pack32(gres_size, buffer);	/* placeholder */
-			data_offset = get_buf_offset(buffer);
-			_node_state_pack(gres_ptr->gres_data, buffer);
-			tail_offset = get_buf_offset(buffer);
-			set_buf_offset(buffer, size_offset);
-			gres_size = tail_offset - data_offset;
-			pack32(gres_size, buffer);
-			set_buf_offset(buffer, tail_offset);
-			rec_cnt++;
-			break;
-		}
-		if (i >= gres_context_cnt) {
-			error("Could not find plugin id %u to pack record for "
-			      "node %s",
-			      gres_ptr->plugin_id, node_name);
-		}
+		/* Record: magic, plugin_id, gres_cnt_avail, has_bitmap */
+		gres_node_ptr = (gres_node_state_t *) gres_ptr->gres_data;
+		pack32(magic, buffer);
+		pack32(gres_ptr->plugin_id, buffer);
+		pack32(gres_node_ptr->gres_cnt_avail, buffer);
+		/* Just note if gres_bit_alloc exists.
+		 * Rebuild it based upon the state of recovered jobs */
+		if (gres_node_ptr->gres_bit_alloc)
+			has_bitmap = 1;
+		else
+			has_bitmap = 0;
+		pack8(has_bitmap, buffer);
+		rec_cnt++;
 	}
 	list_iterator_destroy(gres_iter);
 	slurm_mutex_unlock(&gres_context_lock);
@@ -1612,11 +1555,12 @@ extern int gres_plugin_node_state_pack(List gres_list, Buf buffer,
 extern int gres_plugin_node_state_unpack(List *gres_list, Buf buffer,
 					 char *node_name)
 {
-	int i, rc, rc2;
-	uint32_t gres_size, magic, tail_offset, plugin_id;
+	int i, rc;
+	uint32_t gres_cnt_avail, magic, plugin_id;
 	uint16_t rec_cnt;
+	uint8_t  has_bitmap;
 	gres_state_t *gres_ptr;
-	void *gres_data;
+	gres_node_state_t *gres_node_ptr;
 
 	safe_unpack16(&rec_cnt, buffer);
 	if (rec_cnt == 0)
@@ -1642,7 +1586,8 @@ extern int gres_plugin_node_state_unpack(List *gres_list, Buf buffer,
 		if (magic != GRES_MAGIC)
 			goto unpack_error;
 		safe_unpack32(&plugin_id, buffer);
-		safe_unpack32(&gres_size, buffer);
+		safe_unpack32(&gres_cnt_avail, buffer);
+		safe_unpack8(&has_bitmap, buffer);
 		for (i=0; i<gres_context_cnt; i++) {
 			if (gres_context[i].plugin_id == plugin_id)
 				break;
@@ -1653,25 +1598,21 @@ extern int gres_plugin_node_state_unpack(List *gres_list, Buf buffer,
 			      plugin_id, node_name);
 			/* A likely sign that GresPlugins has changed.
 			 * Not a fatal error, skip over the data. */
-			tail_offset = get_buf_offset(buffer);
-			tail_offset += gres_size;
-			set_buf_offset(buffer, tail_offset);
 			continue;
 		}
 		gres_context[i].unpacked_info = true;
-		rc2 = _node_state_unpack(&gres_data, buffer,
-					 gres_context[i].has_file);
-		if (rc2 != SLURM_SUCCESS) {
-			error("gres_plugin_node_state_unpack: error unpacking "
-			      "data of type %s from node %s",
-			      gres_context[i].gres_name, node_name);
-			rc = rc2;
-		} else {
-			gres_ptr = xmalloc(sizeof(gres_state_t));
-			gres_ptr->plugin_id = gres_context[i].plugin_id;
-			gres_ptr->gres_data = gres_data;
-			list_append(*gres_list, gres_ptr);
+		gres_node_ptr = _build_gres_node_state();
+		gres_node_ptr->gres_cnt_avail = gres_cnt_avail;
+		if (has_bitmap) {
+			gres_node_ptr->gres_bit_alloc =
+				bit_alloc(gres_cnt_avail);
+			if (gres_node_ptr->gres_bit_alloc == NULL)
+				fatal("bit_alloc: malloc failure");
 		}
+		gres_ptr = xmalloc(sizeof(gres_state_t));
+		gres_ptr->plugin_id = gres_context[i].plugin_id;
+		gres_ptr->gres_data = gres_node_ptr;
+		list_append(*gres_list, gres_ptr);
 	}
 
 fini:	/* Insure that every gres plugin is called for unpack, even if no data
@@ -1681,18 +1622,12 @@ fini:	/* Insure that every gres plugin is called for unpack, even if no data
 		if (gres_context[i].unpacked_info)
 			continue;
 		error("gres_plugin_node_state_unpack: no info packed for %s "
-		      "by node %s",
-		      gres_context[i].gres_type, node_name);
-		rc2 = _node_state_unpack(&gres_data, NULL,
-					 gres_context[i].has_file);
-		if (rc2 != SLURM_SUCCESS) {
-			rc = rc2;
-		} else {
-			gres_ptr = xmalloc(sizeof(gres_state_t));
-			gres_ptr->plugin_id = gres_context[i].plugin_id;
-			gres_ptr->gres_data = gres_data;
-			list_append(*gres_list, gres_ptr);
-		}
+		      "by node %s", gres_context[i].gres_type, node_name);
+		gres_node_ptr = _build_gres_node_state();
+		gres_ptr = xmalloc(sizeof(gres_state_t));
+		gres_ptr->plugin_id = gres_context[i].plugin_id;
+		gres_ptr->gres_data = gres_node_ptr;
+		list_append(*gres_list, gres_ptr);
 	}
 	slurm_mutex_unlock(&gres_context_lock);
 
@@ -1721,6 +1656,8 @@ static void *_node_state_dup(void *gres_data)
 	new_gres->gres_cnt_alloc  = gres_ptr->gres_cnt_alloc;
 	if (gres_ptr->gres_bit_alloc)
 		new_gres->gres_bit_alloc = bit_copy(gres_ptr->gres_bit_alloc);
+	if (gres_ptr->topo_cnt == 0)
+		return new_gres;
 
 	new_gres->topo_cnt         = gres_ptr->topo_cnt;
 	new_gres->topo_cpus_bitmap = xmalloc(gres_ptr->topo_cnt *
@@ -1736,12 +1673,14 @@ static void *_node_state_dup(void *gres_data)
 			bit_copy(gres_ptr->topo_cpus_bitmap[i]);
 		new_gres->topo_gres_bitmap[i] =
 			bit_copy(gres_ptr->topo_gres_bitmap[i]);
+		if ((new_gres->topo_cpus_bitmap[i] == NULL) ||
+		    (new_gres->topo_gres_bitmap[i] == NULL))
+			fatal("bit_copy: malloc failure");
 		new_gres->topo_gres_cnt_alloc[i] =
 			gres_ptr->topo_gres_cnt_alloc[i];
 		new_gres->topo_gres_cnt_avail[i] =
 			gres_ptr->topo_gres_cnt_avail[i];
 	}
-
 	return new_gres;
 }
 
@@ -1794,15 +1733,40 @@ extern List gres_plugin_node_state_dup(List gres_list)
 	return new_list;
 }
 
-static void _node_state_dealloc(void *gres_data)
+/*
+ * Clear the allocation state of one node gres record: zero gres_cnt_alloc,
+ * clear every bit of gres_bit_alloc (if present) and zero each entry of
+ * topo_gres_cnt_alloc.
+ * IN/OUT gres_ptr - record whose gres_data is cast to gres_node_state_t
+ */
+static void _node_state_dealloc(gres_state_t *gres_ptr)
 {
-	gres_node_state_t *gres_ptr = (gres_node_state_t *) gres_data;
+	int i;
+	gres_node_state_t *gres_node_ptr;
+	const char *gres_name = "unknown";
 
-	gres_ptr->gres_cnt_alloc = 0;
-	if (gres_ptr->gres_bit_alloc) {
-		int i = bit_size(gres_ptr->gres_bit_alloc) - 1;
-		if (i > 0)
-			bit_nclear(gres_ptr->gres_bit_alloc, 0, i);
+	gres_node_ptr = (gres_node_state_t *) gres_ptr->gres_data;
+	gres_node_ptr->gres_cnt_alloc = 0;
+	if (gres_node_ptr->gres_bit_alloc) {
+		/* i is the index of the last bit; it is 0 for a one-bit
+		 * bitmap, which must be cleared too */
+		i = bit_size(gres_node_ptr->gres_bit_alloc) - 1;
+		if (i >= 0)
+			bit_nclear(gres_node_ptr->gres_bit_alloc, 0, i);
+	}
+	if (gres_node_ptr->topo_cnt && !gres_node_ptr->topo_gres_cnt_alloc) {
+		for (i=0; i<gres_context_cnt; i++) {
+			if (gres_ptr->plugin_id == gres_context[i].plugin_id) {
+				gres_name = gres_context[i].gres_name;
+				break;
+			}
+		}
+		error("gres_plugin_node_state_dealloc: gres/%s topo_cnt!=0 "
+		      "and topo_gres_cnt_alloc is NULL", gres_name);
+	} else {
+		for (i=0; i<gres_node_ptr->topo_cnt; i++) {
+			gres_node_ptr->topo_gres_cnt_alloc[i] = 0;
+		}
 	}
 }
 
@@ -1825,7 +1781,7 @@ extern void gres_plugin_node_state_dealloc(List gres_list)
 	slurm_mutex_lock(&gres_context_lock);
 	gres_iter = list_iterator_create(gres_list);
 	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
-		_node_state_dealloc(gres_ptr->gres_data);
+		_node_state_dealloc(gres_ptr);
 	}
 	list_iterator_destroy(gres_iter);
 	slurm_mutex_unlock(&gres_context_lock);
@@ -2070,6 +2026,8 @@ static int _job_config_validate(char *config, uint32_t *gres_cnt,
 			;
 		else if ((last_num[0] == 'k') || (last_num[0] == 'K'))
 			cnt *= 1024;
+		else if ((last_num[0] == 'm') || (last_num[0] == 'M'))
+			cnt *= (1024 * 1024);
 		else
 			return SLURM_ERROR;
 		if (cnt <= 0)
-- 
GitLab