From 3edd414fb004785a30f98f1bdf09ba2fdf9f8075 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Fri, 16 Aug 2002 22:34:23 +0000
Subject: [PATCH] Support larger buffers for dumping job info (the elan
 credential is 2k bytes).

---
 src/slurmctld/controller.c |  2 +-
 src/slurmctld/job_mgr.c    | 40 ++++++++++++++++++++++++++++----------
 src/slurmctld/slurmctld.h  |  4 ++++
 src/slurmctld/step_mgr.c   | 36 ++++++++++++++++++++++++++++++++--
 4 files changed, 69 insertions(+), 13 deletions(-)

diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 627ae991f24..0634ff89254 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -260,7 +260,7 @@ slurmctld_rpc_mgr ( void * no_data )
 		pthread_mutex_lock(&thread_count_lock);
 		server_thread_count++;
 		pthread_mutex_unlock(&thread_count_lock);
-		if (server_thread_count > MAX_SERVER_THREAD_COUNT) {
+		if (server_thread_count >= MAX_SERVER_THREAD_COUNT) {
 			info ("Warning: server_thread_count is %d, over system limit", server_thread_count);
 			no_thread = 1;
 		}
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index ddf43a09e66..be4b520cb45 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -39,6 +39,13 @@
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <unistd.h>
+#ifdef HAVE_LIBELAN3
+#include <elan3/elan3.h>
+#include <elan3/elanvp.h>
+#define BUF_SIZE (1024 + ELAN_MAX_VPS)
+#else
+#define BUF_SIZE 1024
+#endif
 
 #include <src/common/list.h>
 #include <src/common/macros.h>
@@ -51,7 +58,6 @@
 #include <src/common/credential_utils.h>
 slurm_ssl_key_ctx_t sign_ctx ;
 
-#define BUF_SIZE 1024
 #define MAX_STR_PACK 128
 #define SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0 0
 #define TOP_PRIORITY 100000;
@@ -277,6 +283,7 @@ dump_all_job_state ( void )
 {
 	int buf_len, buffer_allocated, buffer_offset = 0, error_code = 0, log_fd;
 	char *buffer;
+	int buffer_needed;
 	void *buf_ptr;
 	char *old_file, *new_file, *reg_file;
 	/* Locks: Read config and node */
@@ -298,15 +305,18 @@ dump_all_job_state ( void )
 	while ((job_record_point = (struct job_record *) list_next (job_record_iterator))) {
 		if (job_record_point->magic != JOB_MAGIC)
 			fatal ("dump_all_job: job integrity is bad");
-
+		buffer_needed = BUF_SIZE;
+#ifdef HAVE_LIBELAN3
+		buffer_needed += (step_count (job_record_point) * ELAN_MAX_VPS / 8);
+#endif
+		if (buf_len < buffer_needed) {
+			buffer_allocated += buffer_needed;
+			buf_len += buffer_needed;
+			buffer_offset = (char *)buf_ptr - buffer;
+			xrealloc(buffer, buffer_allocated);
+			buf_ptr = buffer + buffer_offset;
+		}
 		dump_job_state (job_record_point, &buf_ptr, &buf_len);
-		if (buf_len > BUF_SIZE) 
-			continue;
-		buffer_allocated += (BUF_SIZE*16);
-		buf_len += (BUF_SIZE*16);
-		buffer_offset = (char *)buf_ptr - buffer;
-		xrealloc(buffer, buffer_allocated);
-		buf_ptr = buffer + buffer_offset;
 	}		
 	unlock_slurmctld (job_read_lock);
 	list_iterator_destroy (job_record_iterator);
@@ -394,6 +404,13 @@ dump_job_state (struct job_record *dump_job_ptr, void **buf_ptr, int *buf_len)
 	/* Dump job steps */
 	step_record_iterator = list_iterator_create (dump_job_ptr->step_list);		
 	while ((step_record_ptr = (struct step_record *) list_next (step_record_iterator))) {
+#ifdef HAVE_LIBELAN3
+		if (*buf_len < ((ELAN_MAX_VPS / 8) + 60)) {
+			fatal ("dump_job_state, buffer space too small for %u.%u",
+				dump_job_ptr->job_id, step_record_ptr->step_id);
+			break;
+		}
+#endif
 		pack16  ((uint16_t) 0xbbbb, buf_ptr, buf_len);	/* step flag */
 		dump_job_step_state (step_record_ptr, buf_ptr, buf_len);
 	};
@@ -496,6 +513,7 @@ dump_job_step_state (struct step_record *step_ptr, void **buf_ptr, int *buf_len)
 	char *node_list;
 
 	pack16  ((uint16_t) step_ptr->step_id, buf_ptr, buf_len);
+	pack16  ((uint16_t) step_ptr->cyclic_alloc, buf_ptr, buf_len);
 	pack32  ((uint32_t) step_ptr->start_time, buf_ptr, buf_len);
 	node_list = bitmap2node_name (step_ptr->node_bitmap);
 	packstr (node_list, buf_ptr, buf_len);
@@ -689,11 +707,12 @@ load_job_state ( void )
 		safe_unpack16 (&step_flag, &buf_ptr, &buffer_size);
 		while ((step_flag == 0xbbbb) && (buffer_size > (2 * sizeof (uint32_t)))) {
 			struct step_record *step_ptr;
-			uint16_t step_id;
+			uint16_t step_id, cyclic_alloc;
 			uint32_t start_time;
 			char *node_list;
 
 			safe_unpack16 (&step_id, &buf_ptr, &buffer_size);
+			safe_unpack16 (&cyclic_alloc, &buf_ptr, &buffer_size);
 			safe_unpack32 (&start_time, &buf_ptr, &buffer_size);
 			safe_unpackstr_xmalloc (&node_list, &name_len, &buf_ptr, &buffer_size);
 
@@ -701,6 +720,7 @@ load_job_state ( void )
 			if (step_ptr == NULL) 
 				break;
 			step_ptr->step_id = step_id;
+			step_ptr->cyclic_alloc = cyclic_alloc;
 			step_ptr->start_time = start_time;
 			info ("recovered job step %u.%u", job_id, step_id);
 			if (node_list) {
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 6e72a67d322..cdc159b8b83 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -200,6 +200,7 @@ struct job_record {
 struct 	step_record {
 	struct job_record* job_ptr; 	/* ptr to the job that owns the step */
 	uint16_t step_id;		/* step number */
+	uint16_t cyclic_alloc;		/* set for cyclic task allocation to nodes */
 	time_t start_time;      	/* step allocation time */
 	bitstr_t *node_bitmap;		/* bitmap of nodes in allocated to job step */
 #ifdef HAVE_LIBELAN3
@@ -472,6 +473,9 @@ extern void	set_job_prio (struct job_record *job_ptr);
 /* set_slurmd_addr - establish the slurm_addr for the slurmd on each node */
 extern void set_slurmd_addr (void);
 
+/* step_count - return a count of steps associated with a specific job */
+extern int step_count (struct job_record *job_ptr);
+
 /* step_create - parse the suppied job step specification and create step_records for it */
 extern int step_create ( step_specs *step_specs, struct step_record** );
 
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index 51d572651c5..e9d51f0ed9e 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -342,10 +342,15 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record  )
 	for (i = first; i <= last; i++) {
 		if (bit_test (step_ptr->node_bitmap, i)) {
 			node_id = qsw_getnodeid_byhost (node_record_table_ptr[i].name);
-			bit_set(nodeset, node_id);
+			if (node_id >= 0)	/* no lookup error */
+				bit_set(nodeset, node_id);
+			else
+				error ("qsw_getnodeid_byhost lookup failure on %s",
+				       node_record_table_ptr[i].name);
 		}
 	}
-	if (qsw_setup_jobinfo (step_ptr->qsw_job, nprocs, nodeset, step_ptr->node_bitmap) < 0)
+	nprocs = 1;	/* allocate based upon nodeset only */
+	if (qsw_setup_jobinfo (step_ptr->qsw_job, nprocs, nodeset, step_ptr->cyclic_alloc) < 0)
 		fatal ("step_create: qsw_setup_jobinfo error %m");
 	bit_free (nodeset);
 #endif
@@ -353,3 +358,30 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record  )
 	*new_step_record = step_ptr;
 	return SLURM_SUCCESS;
 }
+
+/* 
+ * step_count - return a count of steps associated with a specific job
+ * input: job_ptr - pointer to job table entry to have step record added
+ * output: returns count of job steps
+ */
+int
+step_count (struct job_record *job_ptr) 
+{
+	int step_count = 0;
+	ListIterator step_record_iterator;
+	struct step_record *step_record_point;
+
+	if (job_ptr == NULL)
+		return step_count;
+
+	step_record_iterator = list_iterator_create (job_ptr->step_list);		
+
+	while ((step_record_point = (struct step_record *) list_next (step_record_iterator))) {
+		step_count++;
+	}		
+
+	list_iterator_destroy (step_record_iterator);
+	return step_count;
+}
+
+
-- 
GitLab