From 3a6c788fc867b6a61a90f1d083af6dee3fbadcf5 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Fri, 21 May 2010 17:02:11 +0000
Subject: [PATCH] add logic to save/restore/log a job step's gres details (i.e.
 specific bits)

---
 src/common/gres.c               | 216 +++++++++++++++++++++++++++++++-
 src/common/gres.h               |  26 ++++
 src/plugins/gres/gpu/gres_gpu.c |  97 ++++++++++++++
 src/plugins/gres/nic/gres_nic.c |  97 ++++++++++++++
 src/slurmctld/job_mgr.c         |   2 +-
 src/slurmctld/slurmctld.h       |   7 +-
 src/slurmctld/step_mgr.c        |  17 ++-
 7 files changed, 456 insertions(+), 6 deletions(-)

diff --git a/src/common/gres.c b/src/common/gres.c
index d2aa93b92a5..67dfa0f19c1 100644
--- a/src/common/gres.c
+++ b/src/common/gres.c
@@ -130,6 +130,12 @@ typedef struct slurm_gres_ops {
 	int		(*job_dealloc)		( void *job_gres_data,
 						  void *node_gres_data,
 						  int node_offset );
+	int		(*step_state_pack)	( void *gres_data,
+						  Buf buffer );
+	int		(*step_state_unpack)	( void **gres_data,
+						  Buf buffer );
+	void		(*step_state_log)	( void *gres_data,
+						  uint32_t job_id );
 } slurm_gres_ops_t;
 
 typedef struct slurm_gres_context {
@@ -194,7 +200,10 @@ static int _load_gres_plugin(char *plugin_name,
 		"job_state_log",
 		"job_test",
 		"job_alloc",
-		"job_dealloc"
+		"job_dealloc",
+		"step_state_pack",
+		"step_state_unpack",
+		"step_state_log"
 	};
 	int n_syms = sizeof(syms) / sizeof(char *);
 
@@ -1341,7 +1350,7 @@ extern int gres_plugin_job_state_unpack(List *gres_list, Buf buffer,
 		}
 		if (i >= gres_context_cnt) {
 			error("gres_plugin_job_state_unpack: no plugin "
-			      "configured to unpack data type %u from job %j",
+			      "configured to unpack data type %u from job %u",
 			      plugin_id, job_id);
 			/* A likely sign that GresPlugins has changed.
 			 * Not a fatal error, skip over the data. */
@@ -1597,3 +1606,206 @@ extern void gres_plugin_job_state_log(List gres_list, uint32_t job_id)
 	list_iterator_destroy(gres_iter);
 	slurm_mutex_unlock(&gres_context_lock);
 }
+
+/*
+ * Pack a step's current gres status, called from slurmctld for save/restore
+ * IN gres_list - generated by gres_plugin_step_allocate()
+ * IN/OUT buffer - location to write state to
+ * IN job_id, step_id - job and step ID for logging
+ * RET SLURM_SUCCESS or the error code of the last plugin that failed to pack
+ */
+extern int gres_plugin_step_state_pack(List gres_list, Buf buffer,
+				       uint32_t job_id, uint32_t step_id)
+{
+	int i, rc = SLURM_SUCCESS, rc2;
+	uint32_t top_offset, gres_size = 0;
+	uint32_t header_offset, size_offset, data_offset, tail_offset;
+	uint32_t magic = GRES_MAGIC;
+	uint16_t rec_cnt = 0;
+	ListIterator gres_iter;
+	gres_state_t *gres_ptr;
+
+	top_offset = get_buf_offset(buffer);
+	pack16(rec_cnt, buffer);	/* placeholder if data */
+
+	if (gres_list == NULL)
+		return rc;
+
+	(void) gres_plugin_init();
+	slurm_mutex_lock(&gres_context_lock);
+	gres_iter = list_iterator_create(gres_list);
+	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
+		for (i=0; i<gres_context_cnt; i++) {
+			if (gres_ptr->plugin_id !=
+			    *(gres_context[i].ops.plugin_id))
+				continue;
+			header_offset = get_buf_offset(buffer);
+			pack32(magic, buffer);
+			pack32(gres_ptr->plugin_id, buffer);
+			size_offset = get_buf_offset(buffer);
+			pack32(gres_size, buffer);	/* placeholder */
+			data_offset = get_buf_offset(buffer);
+			rc2 = (*(gres_context[i].ops.step_state_pack))
+					(gres_ptr->gres_data, buffer);
+			if (rc2 != SLURM_SUCCESS) {
+				rc = rc2;
+				set_buf_offset(buffer, header_offset);
+				break;	/* plugin found, record dropped */
+			}
+			tail_offset = get_buf_offset(buffer);
+			set_buf_offset(buffer, size_offset);
+			gres_size = tail_offset - data_offset;
+			pack32(gres_size, buffer);
+			set_buf_offset(buffer, tail_offset);
+			rec_cnt++;
+			break;
+		}
+		if (i >= gres_context_cnt) {
+			error("Could not find plugin id %u to pack record for "
+			      "step %u.%u",
+			      gres_ptr->plugin_id, job_id, step_id);
+		}
+	}
+	list_iterator_destroy(gres_iter);
+	slurm_mutex_unlock(&gres_context_lock);
+
+	tail_offset = get_buf_offset(buffer);
+	set_buf_offset(buffer, top_offset);
+	pack16(rec_cnt, buffer);
+	set_buf_offset(buffer, tail_offset);
+
+	return rc;
+}
+
+/*
+ * Unpack a step's current gres status, called from slurmctld for save/restore
+ * OUT gres_list - restored state stored by gres_plugin_step_state_pack()
+ * IN/OUT buffer - location to read state from
+ * IN job_id, step_id - job and step ID for logging
+ */
+extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer,
+					 uint32_t job_id, uint32_t step_id)
+{
+	int i, rc, rc2;
+	uint32_t gres_size, magic, tail_offset, plugin_id;
+	uint16_t rec_cnt;
+	gres_state_t *gres_ptr;
+	void *gres_data;
+	bool locked = false;	/* set once gres_context_lock is held */
+
+	safe_unpack16(&rec_cnt, buffer);
+	if (rec_cnt == 0)
+		return SLURM_SUCCESS;
+
+	rc = gres_plugin_init();
+	slurm_mutex_lock(&gres_context_lock);
+	locked = true;
+	if ((gres_context_cnt > 0) && (*gres_list == NULL)) {
+		*gres_list = list_create(_gres_job_list_delete);
+		if (*gres_list == NULL)
+			fatal("list_create malloc failure");
+	}
+
+	for (i=0; i<gres_context_cnt; i++)
+		gres_context[i].unpacked_info = false;
+
+	while ((rc == SLURM_SUCCESS) && (rec_cnt)) {
+		if ((buffer == NULL) || (remaining_buf(buffer) == 0))
+			break;
+		rec_cnt--;
+		safe_unpack32(&magic, buffer);
+		if (magic != GRES_MAGIC)
+			goto unpack_error;
+		safe_unpack32(&plugin_id, buffer);
+		safe_unpack32(&gres_size, buffer);
+		for (i=0; i<gres_context_cnt; i++) {
+			if (*(gres_context[i].ops.plugin_id) == plugin_id)
+				break;
+		}
+		if (i >= gres_context_cnt) {
+			error("gres_plugin_step_state_unpack: no plugin "
+			      "configured to unpack data type %u from "
+			      "step %u.%u", plugin_id, job_id, step_id);
+			/* A likely sign that GresPlugins has changed.
+			 * Not a fatal error, skip over the data. */
+			tail_offset = get_buf_offset(buffer);
+			tail_offset += gres_size;
+			set_buf_offset(buffer, tail_offset);
+			continue;
+		}
+		gres_context[i].unpacked_info = true;
+		rc2 = (*(gres_context[i].ops.step_state_unpack))
+				(&gres_data, buffer);
+		if (rc2 != SLURM_SUCCESS) {
+			rc = rc2;
+		} else {
+			gres_ptr = xmalloc(sizeof(gres_state_t));
+			gres_ptr->plugin_id = *(gres_context[i].ops.plugin_id);
+			gres_ptr->gres_data = gres_data;
+			list_append(*gres_list, gres_ptr);
+		}
+	}
+
+fini:	/* Insure that every gres plugin is called for unpack, even if no data
+	 * was packed by the step (GresPlugins was likely reconfigured). */
+	for (i=0; i<gres_context_cnt; i++) {
+		if (gres_context[i].unpacked_info)
+			continue;
+		debug("gres_plugin_step_state_unpack: no info packed for %s "
+		      "by step %u.%u",
+		      gres_context[i].gres_type, job_id, step_id);
+		rc2 = (*(gres_context[i].ops.step_state_unpack))
+				(&gres_data, NULL);
+		if (rc2 != SLURM_SUCCESS) {
+			rc = rc2;
+		} else {
+			gres_ptr = xmalloc(sizeof(gres_state_t));
+			gres_ptr->plugin_id = *(gres_context[i].ops.plugin_id);
+			gres_ptr->gres_data = gres_data;
+			list_append(*gres_list, gres_ptr);
+		}
+	}
+	slurm_mutex_unlock(&gres_context_lock);
+	return rc;
+
+unpack_error:
+	error("gres_plugin_step_state_unpack: unpack error from step %u.%u",
+	      job_id, step_id);
+	rc = SLURM_ERROR;
+	if (locked)
+		goto fini;
+	return rc;	/* failed before locking: nothing to finish or unlock */
+}
+
+/*
+ * Log a step's current gres state
+ * IN gres_list - generated by gres_plugin_step_allocate()
+ * IN job_id, step_id - job and step ID for logging
+ */
+extern void gres_plugin_step_state_log(List gres_list, uint32_t job_id,
+				       uint32_t step_id)
+{
+	int i;
+	ListIterator gres_iter;
+	gres_state_t *gres_ptr;
+	void (*log_fn)(void *, uint32_t, uint32_t);	/* plugins' prototype */
+
+	if (!gres_debug || (gres_list == NULL))
+		return;
+	(void) gres_plugin_init();
+
+	slurm_mutex_lock(&gres_context_lock);
+	gres_iter = list_iterator_create(gres_list);
+	while ((gres_ptr = (gres_state_t *) list_next(gres_iter))) {
+		for (i=0; i<gres_context_cnt; i++) {
+			if (gres_ptr->plugin_id !=
+			    *(gres_context[i].ops.plugin_id))
+				continue;
+			log_fn = (void (*)(void *, uint32_t, uint32_t))
+				 gres_context[i].ops.step_state_log;
+			(*log_fn)(gres_ptr->gres_data, job_id, step_id);
+			break;
+		}
+	}
+	list_iterator_destroy(gres_iter);
+	slurm_mutex_unlock(&gres_context_lock);
+}
diff --git a/src/common/gres.h b/src/common/gres.h
index a6f079f5b10..727cf7fa089 100644
--- a/src/common/gres.h
+++ b/src/common/gres.h
@@ -275,4 +275,30 @@ extern int gres_plugin_job_dealloc(List job_gres_list, List node_gres_list,
  */
 extern void gres_plugin_job_state_log(List gres_list, uint32_t job_id);
 
+/*
+ * Pack a step's current gres status, called from slurmctld for save/restore
+ * IN gres_list - generated by gres_plugin_step_allocate()
+ * IN/OUT buffer - location to write state to
+ * IN job_id, step_id - job and step ID for logging
+ */
+extern int gres_plugin_step_state_pack(List gres_list, Buf buffer,
+				       uint32_t job_id, uint32_t step_id);
+
+/*
+ * Unpack a step's current gres status, called from slurmctld for save/restore
+ * OUT gres_list - restored state stored by gres_plugin_step_state_pack()
+ * IN/OUT buffer - location to read state from
+ * IN job_id, step_id - job and step ID for logging
+ */
+extern int gres_plugin_step_state_unpack(List *gres_list, Buf buffer,
+					 uint32_t job_id, uint32_t step_id);
+
+/*
+ * Log a step's current gres state
+ * IN gres_list - generated by gres_plugin_step_allocate()
+ * IN job_id, step_id - job and step ID for logging
+ */
+extern void gres_plugin_step_state_log(List gres_list, uint32_t job_id,
+				       uint32_t step_id);
+
 #endif /* !_GRES_H */
diff --git a/src/plugins/gres/gpu/gres_gpu.c b/src/plugins/gres/gpu/gres_gpu.c
index 6859fd76696..c48a0b07c37 100644
--- a/src/plugins/gres/gpu/gres_gpu.c
+++ b/src/plugins/gres/gpu/gres_gpu.c
@@ -140,8 +140,29 @@ typedef struct gpu_job_state {
 	/* Resources currently allocated to job on each node */
 	uint32_t node_cnt;
 	bitstr_t **gpu_bit_alloc;
+#if 0
+Need to modify state save, allocation, free, etc.
+	/* Resources currently allocated to job steps on each node.
+	 * This will be a subset of resources allocated to the job.
+	 * gpu_bit_step_alloc is a subset of gpu_bit_alloc */
+	bitstr_t **gpu_bit_step_alloc;
+#endif
 } gpu_job_state_t;
 
+/* Gpu gres state of a job step, as used by slurmctld. */
+typedef struct gpu_step_state {
+	/* Count of gpus, per node or per CPU as selected by gpu_cnt_mult */
+	uint32_t gpu_cnt_alloc;
+
+	/* If 0 then gpu_cnt_alloc is per node,
+	 * if 1 then gpu_cnt_alloc is per CPU */
+	uint8_t  gpu_cnt_mult;
+
+	/* Resources currently allocated to the job step on each node */
+	uint32_t node_cnt;		/* count of bitmaps in gpu_bit_alloc */
+	bitstr_t **gpu_bit_alloc;	/* one bitmap per node of the step */
+} gpu_step_state_t;
+
 /*
  * This will be the output for "--gres=help" option.
  * Called only by salloc, sbatch and srun.
@@ -988,3 +1009,79 @@ extern void job_state_log(void *gres_data, uint32_t job_id)
 		info("  gpu_bit_alloc:NULL");
 	}
 }
+
+/* Pack a step's gpu state in the order step_state_unpack() reads it */
+extern int step_state_pack(void *gres_data, Buf buffer)
+{
+	gpu_step_state_t *step_state = (gpu_step_state_t *) gres_data;
+	int node_inx;
+
+	pack32(step_state->gpu_cnt_alloc, buffer);
+	pack8(step_state->gpu_cnt_mult, buffer);
+	pack32(step_state->node_cnt, buffer);
+	for (node_inx = 0; node_inx < step_state->node_cnt; node_inx++)
+		pack_bit_str(step_state->gpu_bit_alloc[node_inx], buffer);
+
+	return SLURM_SUCCESS;
+}
+
+/* Unpack step state written by step_state_pack(); buffer may be NULL */
+extern int step_state_unpack(void **gres_data, Buf buffer)
+{
+	int i;
+	gpu_step_state_t *gres_ptr;
+
+	gres_ptr = xmalloc(sizeof(gpu_step_state_t));
+	if (buffer) {	/* with no buffer, return the record as allocated */
+		safe_unpack32(&gres_ptr->gpu_cnt_alloc,  buffer);
+		safe_unpack8 (&gres_ptr->gpu_cnt_mult,   buffer);
+
+		safe_unpack32(&gres_ptr->node_cnt,       buffer);
+		gres_ptr->gpu_bit_alloc = xmalloc(sizeof(bitstr_t *) *
+						  (gres_ptr->node_cnt + 1));
+		for (i=0; i<gres_ptr->node_cnt; i++)
+			unpack_bit_str(&gres_ptr->gpu_bit_alloc[i], buffer);
+	}
+
+	*gres_data = gres_ptr;
+	return SLURM_SUCCESS;
+
+unpack_error:	/* presumably jumped to by the unpack macros on failure */
+	error("Unpacking %s step state info", plugin_name);
+	if (gres_ptr->gpu_bit_alloc) {
+		for (i=0; i<gres_ptr->node_cnt; i++) {
+			if (gres_ptr->gpu_bit_alloc[i])
+				bit_free(gres_ptr->gpu_bit_alloc[i]);
+		}
+		xfree(gres_ptr->gpu_bit_alloc);
+	}
+	xfree(gres_ptr);
+	*gres_data = NULL;
+	return SLURM_ERROR;
+}
+
+/*
+ * Log a step's gpu count and the gpu bitmap allocated on each of its nodes
+ */
+extern void step_state_log(void *gres_data, uint32_t job_id, uint32_t step_id)
+{
+	gpu_step_state_t *step_state = (gpu_step_state_t *) gres_data;
+	char bit_str[128];
+	int node_inx;
+
+	xassert(gres_data);
+	info("%s state for step %u.%u", plugin_name, job_id, step_id);
+	info("  gpu_cnt %u per %s", step_state->gpu_cnt_alloc,
+	     step_state->gpu_cnt_mult ? "cpu" : "node");
+
+	if (!step_state->node_cnt || !step_state->gpu_bit_alloc) {
+		info("  gpu_bit_alloc:NULL");
+		return;
+	}
+
+	for (node_inx = 0; node_inx < step_state->node_cnt; node_inx++) {
+		bit_fmt(bit_str, sizeof(bit_str),
+			step_state->gpu_bit_alloc[node_inx]);
+		info("  gpu_bit_alloc[%d]:%s", node_inx, bit_str);
+	}
+}
diff --git a/src/plugins/gres/nic/gres_nic.c b/src/plugins/gres/nic/gres_nic.c
index adee32805ad..d123bb11800 100644
--- a/src/plugins/gres/nic/gres_nic.c
+++ b/src/plugins/gres/nic/gres_nic.c
@@ -140,8 +140,29 @@ typedef struct nic_job_state {
 	/* Resources currently allocated to job on each node */
 	uint32_t node_cnt;
 	bitstr_t **nic_bit_alloc;
+#if 0
+Need to modify state save, allocation, free, etc.
+	/* Resources currently allocated to job steps on each node.
+	 * This will be a subset of resources allocated to the job.
+	 * nic_bit_step_alloc is a subset of nic_bit_alloc */
+	bitstr_t **nic_bit_step_alloc;
+#endif
 } nic_job_state_t;
 
+/* Nic gres state of a job step, as used by slurmctld. */
+typedef struct nic_step_state {
+	/* Count of nics, per node or per CPU as selected by nic_cnt_mult */
+	uint32_t nic_cnt_alloc;
+
+	/* If 0 then nic_cnt_alloc is per node,
+	 * if 1 then nic_cnt_alloc is per CPU */
+	uint8_t  nic_cnt_mult;
+
+	/* Resources currently allocated to the job step on each node */
+	uint32_t node_cnt;		/* count of bitmaps in nic_bit_alloc */
+	bitstr_t **nic_bit_alloc;	/* one bitmap per node of the step */
+} nic_step_state_t;
+
 /*
  * This will be the output for "--gres=help" option.
  * Called only by salloc, sbatch and srun.
@@ -988,3 +1009,79 @@ extern void job_state_log(void *gres_data, uint32_t job_id)
 		info("  nic_bit_alloc:NULL");
 	}
 }
+
+/* Pack a step's nic state in the order step_state_unpack() reads it */
+extern int step_state_pack(void *gres_data, Buf buffer)
+{
+	nic_step_state_t *step_state = (nic_step_state_t *) gres_data;
+	int node_inx;
+
+	pack32(step_state->nic_cnt_alloc, buffer);
+	pack8(step_state->nic_cnt_mult, buffer);
+	pack32(step_state->node_cnt, buffer);
+	for (node_inx = 0; node_inx < step_state->node_cnt; node_inx++)
+		pack_bit_str(step_state->nic_bit_alloc[node_inx], buffer);
+
+	return SLURM_SUCCESS;
+}
+
+/* Unpack step state written by step_state_pack(); buffer may be NULL */
+extern int step_state_unpack(void **gres_data, Buf buffer)
+{
+	int i;
+	nic_step_state_t *gres_ptr;
+
+	gres_ptr = xmalloc(sizeof(nic_step_state_t));
+	if (buffer) {	/* with no buffer, return the record as allocated */
+		safe_unpack32(&gres_ptr->nic_cnt_alloc,  buffer);
+		safe_unpack8 (&gres_ptr->nic_cnt_mult,   buffer);
+
+		safe_unpack32(&gres_ptr->node_cnt,       buffer);
+		gres_ptr->nic_bit_alloc = xmalloc(sizeof(bitstr_t *) *
+						  (gres_ptr->node_cnt + 1));
+		for (i=0; i<gres_ptr->node_cnt; i++)
+			unpack_bit_str(&gres_ptr->nic_bit_alloc[i], buffer);
+	}
+
+	*gres_data = gres_ptr;
+	return SLURM_SUCCESS;
+
+unpack_error:	/* presumably jumped to by the unpack macros on failure */
+	error("Unpacking %s step state info", plugin_name);
+	if (gres_ptr->nic_bit_alloc) {
+		for (i=0; i<gres_ptr->node_cnt; i++) {
+			if (gres_ptr->nic_bit_alloc[i])
+				bit_free(gres_ptr->nic_bit_alloc[i]);
+		}
+		xfree(gres_ptr->nic_bit_alloc);
+	}
+	xfree(gres_ptr);
+	*gres_data = NULL;
+	return SLURM_ERROR;
+}
+
+/*
+ * Log a step's nic count and the nic bitmap allocated on each of its nodes
+ */
+extern void step_state_log(void *gres_data, uint32_t job_id, uint32_t step_id)
+{
+	nic_step_state_t *step_state = (nic_step_state_t *) gres_data;
+	char bit_str[128];
+	int node_inx;
+
+	xassert(gres_data);
+	info("%s state for step %u.%u", plugin_name, job_id, step_id);
+	info("  nic_cnt %u per %s", step_state->nic_cnt_alloc,
+	     step_state->nic_cnt_mult ? "cpu" : "node");
+
+	if (!step_state->node_cnt || !step_state->nic_bit_alloc) {
+		info("  nic_bit_alloc:NULL");
+		return;
+	}
+
+	for (node_inx = 0; node_inx < step_state->node_cnt; node_inx++) {
+		bit_fmt(bit_str, sizeof(bit_str),
+			step_state->nic_bit_alloc[node_inx]);
+		info("  nic_bit_alloc[%d]:%s", node_inx, bit_str);
+	}
+}
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 61139bb0bd4..47cbd75e040 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -797,7 +797,7 @@ static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer)
 	while ((step_ptr = (struct step_record *)
 		list_next(step_iterator))) {
 		pack16((uint16_t) STEP_FLAG, buffer);
-		dump_job_step_state(step_ptr, buffer);
+		dump_job_step_state(dump_job_ptr, step_ptr, buffer);
 	}
 	list_iterator_destroy(step_iterator);
 	pack16((uint16_t) 0, buffer);	/* no step flag */
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 38853f1e270..7d14b6fd86a 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -413,7 +413,7 @@ struct job_record {
 	uint32_t exit_code;		/* exit code for job (status from
 					 * wait call) */
 	char *gres;			/* generic resources */
-	List gres_list;			/* generic resource allocation detaisl */
+	List gres_list;			/* generic resource allocation detail */
 	uint32_t group_id;		/* group submitted under */
 	uint32_t job_id;		/* job ID */
 	struct job_record *job_next;	/* next entry with same hash index */
@@ -525,6 +525,7 @@ struct 	step_record {
 	uint32_t exit_code;		/* highest exit code from any task */
 	bitstr_t *exit_node_bitmap;	/* bitmap of exited nodes */
 	char *gres;			/* generic resources required */
+	List gres_list;			/* generic resource allocation detail */
 	char *host;			/* host for srun communications */
 	struct job_record* job_ptr; 	/* ptr to the job that owns the step */
 	jobacctinfo_t *jobacct;         /* keep track of process info in the
@@ -686,10 +687,12 @@ extern void dump_job_desc(job_desc_msg_t * job_specs);
 /*
  * dump_job_step_state - dump the state of a specific job step to a buffer,
  *	load with load_step_state
+ * IN job_ptr - pointer to job for which information is to be dumped
  * IN step_ptr - pointer to job step for which information is to be dumpped
  * IN/OUT buffer - location to store data, pointers automatically advanced
  */
-extern void dump_job_step_state(struct step_record *step_ptr, Buf buffer);
+extern void dump_job_step_state(struct job_record *job_ptr, 
+				struct step_record *step_ptr, Buf buffer);
 
 /*
  * dump_step_desc - dump the incoming step initiate request message
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index 282ca8d0426..9e20be8528d 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -57,6 +57,7 @@
 #include "src/common/bitstring.h"
 #include "src/common/checkpoint.h"
 #include "src/common/forward.h"
+#include "src/common/gres.h"
 #include "src/common/slurm_accounting_storage.h"
 #include "src/common/slurm_jobacct_gather.h"
 #include "src/common/slurm_protocol_interface.h"
@@ -2328,10 +2329,12 @@ resume_job_step(struct job_record *job_ptr)
 /*
  * dump_job_step_state - dump the state of a specific job step to a buffer,
  *	load with load_step_state
+ * IN job_ptr - pointer to job for which information is to be dumped
  * IN step_ptr - pointer to job step for which information is to be dumpped
  * IN/OUT buffer - location to store data, pointers automatically advanced
  */
-extern void dump_job_step_state(struct step_record *step_ptr, Buf buffer)
+extern void dump_job_step_state(struct job_record *job_ptr, 
+				struct step_record *step_ptr, Buf buffer)
 {
 	pack32(step_ptr->step_id, buffer);
 	pack16(step_ptr->cyclic_alloc, buffer);
@@ -2368,7 +2371,11 @@ extern void dump_job_step_state(struct step_record *step_ptr, Buf buffer)
 	packstr(step_ptr->name, buffer);
 	packstr(step_ptr->network, buffer);
 	packstr(step_ptr->ckpt_dir, buffer);
+
 	packstr(step_ptr->gres, buffer);
+	(void) gres_plugin_step_state_pack(step_ptr->gres_list, buffer,
+					   job_ptr->job_id, step_ptr->step_id);
+
 	pack16(step_ptr->batch_step, buffer);
 	if (!step_ptr->batch_step) {
 		pack_slurm_step_layout(step_ptr->step_layout, buffer,
@@ -2400,6 +2407,7 @@ extern int load_step_state(struct job_record *job_ptr, Buf buffer,
 	switch_jobinfo_t *switch_tmp = NULL;
 	check_jobinfo_t check_tmp = NULL;
 	slurm_step_layout_t *step_layout = NULL;
+	List gres_list = NULL;
 
 	if(protocol_version >= SLURM_2_2_PROTOCOL_VERSION) {
 		safe_unpack32(&step_id, buffer);
@@ -2433,7 +2441,11 @@ extern int load_step_state(struct job_record *job_ptr, Buf buffer,
 		safe_unpackstr_xmalloc(&name, &name_len, buffer);
 		safe_unpackstr_xmalloc(&network, &name_len, buffer);
 		safe_unpackstr_xmalloc(&ckpt_dir, &name_len, buffer);
+
 		safe_unpackstr_xmalloc(&gres, &name_len, buffer);
+		if (gres_plugin_step_state_unpack(&gres_list, buffer,
+				job_ptr->job_id, step_id) != SLURM_SUCCESS)
+			goto unpack_error;
 
 		safe_unpack16(&batch_step, buffer);
 		if (!batch_step) {
@@ -2534,6 +2546,7 @@ extern int load_step_state(struct job_record *job_ptr, Buf buffer,
 	step_ptr->no_kill      = no_kill;
 	step_ptr->ckpt_dir     = ckpt_dir;
 	step_ptr->gres         = gres;
+	step_ptr->gres_list    = gres_list;
 	step_ptr->port         = port;
 	step_ptr->ckpt_interval= ckpt_interval;
 	step_ptr->mem_per_cpu  = mem_per_cpu;
@@ -2591,6 +2604,8 @@ extern int load_step_state(struct job_record *job_ptr, Buf buffer,
 	xfree(network);
 	xfree(ckpt_dir);
 	xfree(gres);
+	if (gres_list)
+		list_destroy(gres_list);
 	xfree(bit_fmt);
 	xfree(core_job);
 	if (switch_tmp)
-- 
GitLab