From 89143f58d431d26f43d6861ed0f20e2d9c4a93ca Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Fri, 4 Jul 2003 18:55:40 +0000
Subject: [PATCH] Make MaxJobCount into a changeable parameter. Create the job
 hash table at slurmctld initialization.

---
 doc/man/man5/slurm.conf.5   |  4 +++-
 src/common/read_config.c    |  4 ++++
 src/slurmctld/job_mgr.c     | 43 +++++++++++++++++++++++++++----------
 src/slurmctld/node_mgr.c    |  4 ++--
 src/slurmctld/read_config.c | 19 ++++++----------
 src/slurmctld/slurmctld.h   |  8 +++++--
 6 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 97c8e0a2c9e..ddaf901bcd4 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -118,7 +118,9 @@ The maximum number of jobs SLURM can have in its active database
 at one time. Set the values of \fBMaxJobCount\fR and \fBMinJobAge\fR 
 to insure the slurmctld daemon does not exhaust its memory or other 
 resources. Once this limit is reached, requests to submit additional 
-jobs will fail. The default value is 2000 jobs.
+jobs will fail. The default value is 2000 jobs. This value may not 
+be reset via "scontrol reconfig". It only takes effect upon restart 
+of the slurmctld daemon.
 .TP
 \fBMinJobAge\fR
 The minimum age of a completed job before its record is purged from 
diff --git a/src/common/read_config.c b/src/common/read_config.c
index 52c5af72abd..cc5b7540147 100644
--- a/src/common/read_config.c
+++ b/src/common/read_config.c
@@ -745,6 +745,10 @@ validate_config (slurm_ctl_conf_t *ctl_conf_ptr)
 		xfree (ctl_conf_ptr->backup_controller);
 	}
 
+	if (ctl_conf_ptr->max_job_cnt < 1)
+		fatal ("MaxJobCount=%u, No jobs permitted",
+		       ctl_conf_ptr->max_job_cnt);
+
 	if (ctl_conf_ptr->slurmctld_port == (uint32_t) NO_VAL) {
 		servent = getservbyname (SLURMCTLD_PORT, NULL);
 		if (servent)
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index f51c4a625ba..130bd6409eb 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -69,7 +69,7 @@
 #define STEP_FLAG 0xbbbb
 #define TOP_PRIORITY 0xffff0000	/* large, but leave headroom for higher */
 
-#define JOB_HASH_INX(_job_id)	(_job_id % DEFAULT_MAX_JOB_COUNT)
+#define JOB_HASH_INX(_job_id)	((_job_id) % hash_table_size)	/* job_hash index; hash_table_size must be nonzero (set by rehash_jobs) */
 
 #define YES_OR_NO(_in_string)	\
 		(( strcmp ((_in_string),"YES"))? \
@@ -80,12 +80,13 @@ List job_list = NULL;		/* job_record list */
 time_t last_job_update;		/* time of last update to job records */
 
 /* Local variables */
-static int default_prio = TOP_PRIORITY;
-static int job_count;		/* job's in the system */
-static long job_id_sequence = -1;	/* first job_id to assign new job */
-static struct job_record *job_hash[DEFAULT_MAX_JOB_COUNT];
-static struct job_record *job_hash_over[DEFAULT_MAX_JOB_COUNT];
-static int max_hash_over = 0;
+static int    default_prio = TOP_PRIORITY;
+static int    hash_table_size = 0;  /* entries in job_hash and job_hash_over, set by rehash_jobs() */
+static int    job_count = 0;        /* jobs in the system */
+static long   job_id_sequence = -1; /* first job_id to assign new job */
+static struct job_record **job_hash = NULL;      /* indexed by JOB_HASH_INX(job_id) */
+static struct job_record **job_hash_over = NULL; /* holds jobs whose job_hash slot is already occupied */
+static int    max_hash_over = 0;    /* job_hash_over index used for the next overflow insert */
 
 /* Local functions */
 static void _add_job_hash(struct job_record *job_ptr);
@@ -154,7 +155,7 @@ struct job_record *create_job_record(int *error_code)
 	struct job_record *job_record_point;
 	struct job_details *job_details_point;
 
-	if (job_count >= DEFAULT_MAX_JOB_COUNT) {
+	if (job_count >= slurmctld_conf.max_job_cnt) {
 		error("create_job_record: job_count exceeds limit");
 		*error_code = EAGAIN;
 		return NULL;
@@ -770,7 +771,7 @@ void _add_job_hash(struct job_record *job_ptr)
 
 	inx = JOB_HASH_INX(job_ptr->job_id);
 	if (job_hash[inx]) {
-		if (max_hash_over >= DEFAULT_MAX_JOB_COUNT)
+		if (max_hash_over >= hash_table_size)
 			fatal("Job hash table overflow");
 		job_hash_over[max_hash_over++] = job_ptr;
 	} else
@@ -1009,7 +1010,8 @@ void dump_job_desc(job_desc_msg_t * job_specs)
  * init_job_conf - initialize the job configuration tables and values. 
  *	this should be called after creating node information, but 
  *	before creating any job entries. Pre-existing job entries are 
- *	left unchanged.
+ *	left unchanged. 
+ *	NOTE: The job hash table size does not change after initial creation.
  * RET 0 if no error, otherwise an error code
  * global: last_job_update - time of last job table update
  *	job_list - pointer to global job list
@@ -1020,13 +1022,32 @@ int init_job_conf(void)
 		job_count = 0;
 		job_list = list_create(&_list_delete_job);
 		if (job_list == NULL)
-			fatal ("Memory allocation failure");;
+			fatal ("Memory allocation failure");
 	}
 
 	last_job_update = time(NULL);
 	return SLURM_SUCCESS;
 }
 
+/* rehash_jobs - Create the job hash tables (job_hash, job_hash_over) on the 
+ * first call, sized by MaxJobCount. Later calls preserve the existing tables. */
+void rehash_jobs(void)
+{
+	if (job_hash == NULL) {
+		hash_table_size = slurmctld_conf.max_job_cnt;
+		job_hash = (struct job_record **) xmalloc(hash_table_size *
+					sizeof(struct job_record *));
+		job_hash_over = (struct job_record **) xmalloc(hash_table_size *
+					sizeof(struct job_record *));
+	} else if (hash_table_size < slurmctld_conf.max_job_cnt) {
+		/* MaxJobCount was raised above the size the tables were 
+		 * created with. We don't presently rebuild the hash tables, 
+		 * so log an error and cut MaxJobCount back to the original 
+		 * table size. */ 
+		error ("MaxJobCount reset too high, restart slurmctld");
+		slurmctld_conf.max_job_cnt = hash_table_size;
+	}
+}
 
 /*
  * job_allocate - create job_records for the suppied job specification and 
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index d55277545b9..2c1eec9f368 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -803,7 +803,7 @@ static void _pack_node (struct node_record *dump_node_ptr, Buf buffer)
 
 
 /* 
- * rehash - build a hash table of the node_record entries. this is a large 
+ * rehash_node - build a hash table of the node_record entries. this is a large 
  *	hash table to permit the immediate finding of a record based only 
  *	upon its name without regards to their number. there should be no 
  *	need for a search. 
@@ -811,7 +811,7 @@ static void _pack_node (struct node_record *dump_node_ptr, Buf buffer)
  *         hash_table - table of hash indecies
  * NOTE: manages memory for hash_table
  */
-void rehash (void) 
+void rehash_node (void) 
 {
 	int i, inx;
 
diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c
index 474f4c5c992..3412dfb2868 100644
--- a/src/slurmctld/read_config.c
+++ b/src/slurmctld/read_config.c
@@ -721,8 +721,8 @@ int read_slurm_conf(int recover)
 	}
 	fclose(slurm_spec_file);
 
-	validate_config(&slurmctld_conf);
 	_set_config_defaults(&slurmctld_conf);
+	validate_config(&slurmctld_conf);
 	update_logging();
 
 	if (default_part_loc == NULL) {
@@ -737,7 +737,8 @@ int read_slurm_conf(int recover)
 		return EINVAL;
 	}
 
-	rehash();
+	rehash_node();
+	rehash_jobs();
 	set_slurmd_addr();
 
 	if ((error_code = getnodename(node_name, MAX_NAME_LEN)))
@@ -804,13 +805,12 @@ static void _restore_node_state(struct node_record *old_node_table_ptr,
 
 
 /* Set configuration parameters to default values if not initialized 
- * by the configuration file 
+ * by the configuration file or common/read_config.c:validate_config()
  */
 static void _set_config_defaults(slurm_ctl_conf_t * ctl_conf_ptr)
 {
 	if (ctl_conf_ptr->backup_controller == NULL)
-		info(
-		   "read_slurm_conf: backup_controller value not specified.");
+		info("read_slurm_conf: backup_controller value not specified.");
 
 	if (ctl_conf_ptr->fast_schedule == (uint16_t) NO_VAL)
 		ctl_conf_ptr->fast_schedule = DEFAULT_FAST_SCHEDULE;
@@ -827,9 +827,6 @@ static void _set_config_defaults(slurm_ctl_conf_t * ctl_conf_ptr)
 	if (ctl_conf_ptr->inactive_limit == (uint16_t) NO_VAL)
 		ctl_conf_ptr->inactive_limit = DEFAULT_INACTIVE_LIMIT;
 
-	if (ctl_conf_ptr->kill_wait == (uint16_t) NO_VAL)
-		ctl_conf_ptr->kill_wait = DEFAULT_KILL_WAIT;
-
 	if (ctl_conf_ptr->max_job_cnt == (uint16_t) NO_VAL)
 		ctl_conf_ptr->max_job_cnt = DEFAULT_MAX_JOB_COUNT;
 
@@ -846,14 +843,10 @@ static void _set_config_defaults(slurm_ctl_conf_t * ctl_conf_ptr)
 		ctl_conf_ptr->slurmd_timeout = DEFAULT_SLURMD_TIMEOUT;
 
 	if (ctl_conf_ptr->state_save_location == NULL)
-		ctl_conf_ptr->state_save_location =
-		    xstrdup(DEFAULT_TMP_FS);
+		ctl_conf_ptr->state_save_location = xstrdup(DEFAULT_TMP_FS);
 
 	if (ctl_conf_ptr->tmp_fs == NULL)
 		ctl_conf_ptr->tmp_fs = xstrdup(DEFAULT_TMP_FS);
-
-	if (ctl_conf_ptr->wait_time == (uint16_t) NO_VAL)
-		ctl_conf_ptr->wait_time = DEFAULT_WAIT_TIME;
 }
 
 
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 2f527cc3ab2..6542e1b0ab8 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -833,8 +833,12 @@ void purge_old_job (void);
  */
 extern int  read_slurm_conf (int recover);
 
+/* rehash_jobs - Create the job hash tables (job_hash, job_hash_over) on the 
+ * first call, sized by MaxJobCount. Later calls preserve the existing tables. */
+void rehash_jobs(void);
+
 /* 
- * rehash - build a hash table of the node_record entries. this is a large 
+ * rehash_node - build a hash table of the node_record entries. this is a large 
  *	hash table to permit the immediate finding of a record based only 
  *	upon its name without regards to their number. there should be no 
  *	need for a search. 
@@ -842,7 +846,7 @@ extern int  read_slurm_conf (int recover);
  *         hash_table - table of hash indecies
  * NOTE: manages memory for hash_table
  */
-extern void rehash (void);
+extern void rehash_node (void);
 
 /* update first assigned job id as needed on reconfigure */
 extern void reset_first_job_id(void);
-- 
GitLab