From 2358d42ad71cb3b1f0ec00568a3bdadb3f81f302 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Fri, 22 Feb 2008 23:49:04 +0000
Subject: [PATCH] Add EpilogMsgTime configuration parameter. See "man
 slurm.conf" for details.

---
 NEWS                                     |  1 +
 RELEASE_NOTES                            |  6 +++
 doc/html/configurator.html.in            |  1 +
 doc/man/man5/slurm.conf.5                | 11 ++++
 slurm/slurm.h.in                         |  6 ++-
 src/api/config_info.c                    |  2 +
 src/common/read_config.c                 |  5 ++
 src/common/read_config.h                 |  1 +
 src/common/slurm_protocol_api.c          | 14 +++++
 src/common/slurm_protocol_api.h          |  5 ++
 src/common/slurm_protocol_pack.c         |  2 +
 src/plugins/sched/gang/gang.c            | 66 ++++++------------------
 src/plugins/select/cons_res/dist_tasks.c |  3 +-
 src/slurmctld/proc_req.c                 |  1 +
 src/slurmd/slurmd/req.c                  |  4 +-
 15 files changed, 73 insertions(+), 55 deletions(-)

diff --git a/NEWS b/NEWS
index 023077e4684..6d2543b6dc2 100644
--- a/NEWS
+++ b/NEWS
@@ -23,6 +23,7 @@ documents those changes that are of interest to users and admins.
     for a partition, this plugin will gang-schedule or "timeslice" jobs that
     share common resources within the partition. Note that resources that are
     shared across partitions are not gang-scheduled.
+ -- Add EpilogMsgTime configuration parameter. See "man slurm.conf" for details.
  
 * Changes in SLURM 1.3.0-pre9
 =============================
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 572f160303d..47c29e3d7d1 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -75,6 +75,11 @@ CONFIGURATION FILE CHANGES
   upon two factors. First the priority of its partition and the job's priority. 
   Since nodes can be configured in multiple partitions, this can be used to configure
   high priority partitions (queues).
+* Added new parameters "HealthCheckInterval" and "HealthCheckProgram" to automatically
+  test the health of compute nodes.
+* Added "EpilogMsgTime" to prevent a burst of EpilogComplete messages from being 
+  sent at the same time from many slurmd daemons to slurmctld. This should help
+  prevent lost messages and improve throughput for large jobs.
 * The partition parameter "Shared" now has a job count. For example:
   Shared=YES:4     (Up to 4 jobs may share each resource)
   Shared=FORCE:2   (Up to 2 jobs must share each resource)
@@ -82,6 +87,7 @@ CONFIGURATION FILE CHANGES
   hours:minutes:seconds, days-hours, days-hours:minutes, 
   days-hours:minutes:seconds or "UNLIMITED".
 * Checkpoint plugin added for XLCH.
+* Added sched/gang plugin for time-slicing of parallel jobs.
 * See "man slurm.conf" for more information.
 
 OTHER CHANGES
diff --git a/doc/html/configurator.html.in b/doc/html/configurator.html.in
index 143cd82780d..fcb42d95841 100644
--- a/doc/html/configurator.html.in
+++ b/doc/html/configurator.html.in
@@ -183,6 +183,7 @@ function displayfile()
    "InactiveLimit=" + document.config.inactive_limit.value + "<br>" +
    "MinJobAge=" + document.config.min_job_age.value + "<br>" +
    "KillWait=" + document.config.kill_wait.value + "<br>" +
+   "#EpilogMsgTime=2000 <br>" +
    "#GetEnvTimeout=2 <br>" +
    "#HealthCheckInterval=0 <br>" +
    "#HealthCheckProgram= <br>" +
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 5e7da434d25..bf7a5de1542 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -130,6 +130,17 @@ Fully qualified pathname of a script to execute as user root on every
 node when a user's job completes (e.g. "/usr/local/slurm/epilog"). This may 
 be used to purge files, disable user login, etc. By default there is no epilog.
 
+.TP
+\fBEpilogMsgTime\fR
+The number of microseconds that the slurmctld daemon requires to process
+an epilog completion message from the slurmd daemons. This parameter can
+be used to prevent a burst of epilog completion messages from being sent
+at the same time, which should help prevent lost messages and improve
+throughput for large jobs.
+The default value is 2000 microseconds.
+For a 1000-node job, this spreads the epilog completion messages out over
+two seconds.
+
 .TP
 \fBFastSchedule\fR
 Controls how a node's configuration specifications in slurm.conf are used.
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 7ea3569bd63..491c8e9e132 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -934,10 +934,12 @@ typedef struct slurm_ctl_conf {
 	char *crypto_type;	/* cryptographic signature plugin */
 	uint32_t def_mem_per_task; /* default MB memory per spawned task */
 	char *epilog;		/* pathname of job epilog */
+	uint32_t epilog_msg_time;  /* usecs for slurmctld to process an
+				 * epilog complete message */
+	uint16_t fast_schedule;	/* 1 to *not* check configurations by node
+				 * (only check configuration file, faster) */
 	uint32_t first_job_id;	/* first slurm generated job_id to assign */
 	uint32_t next_job_id;	/* next slurm generated job_id to assign */
-	uint16_t fast_schedule;	/* 1 to *not* check configurations by node 
-				 * (only check configuration file, faster) */
 	uint16_t get_env_timeout; /* timeout for srun --get-user-env option */
 	uint16_t health_check_interval;	/* secs between health checks */
 	char * health_check_program;	/* pathname of health check program */
diff --git a/src/api/config_info.c b/src/api/config_info.c
index 5e92cc051b6..85e5afe8929 100644
--- a/src/api/config_info.c
+++ b/src/api/config_info.c
@@ -144,6 +144,8 @@ void slurm_print_ctl_conf ( FILE* out,
 		fprintf(out, "DefMemPerTask           = UNLIMITED\n");
 	fprintf(out, "Epilog                  = %s\n",
 		slurm_ctl_conf_ptr->epilog);
+	fprintf(out, "EpilogMsgTime           = %u\n",
+		slurm_ctl_conf_ptr->epilog_msg_time);
 	fprintf(out, "FastSchedule            = %u\n",
 		slurm_ctl_conf_ptr->fast_schedule);
 	fprintf(out, "FirstJobId              = %u\n",
diff --git a/src/common/read_config.c b/src/common/read_config.c
index f9fb7481474..690b365afb6 100644
--- a/src/common/read_config.c
+++ b/src/common/read_config.c
@@ -138,6 +138,7 @@ s_p_options_t slurm_conf_options[] = {
 	{"CryptoType", S_P_STRING},
 	{"DefMemPerTask", S_P_UINT32},
 	{"Epilog", S_P_STRING},
+	{"EpilogMsgTime", S_P_UINT32},
 	{"FastSchedule", S_P_UINT16},
 	{"FirstJobId", S_P_UINT32},
 	{"HashBase", S_P_LONG, defunct_option},
@@ -1144,6 +1145,7 @@ init_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr)
 	xfree (ctl_conf_ptr->crypto_type);
 	ctl_conf_ptr->def_mem_per_task          = 0;
 	xfree (ctl_conf_ptr->epilog);
+	ctl_conf_ptr->epilog_msg_time		= (uint32_t) NO_VAL;
 	ctl_conf_ptr->fast_schedule		= (uint16_t) NO_VAL;
 	ctl_conf_ptr->first_job_id		= (uint32_t) NO_VAL;
 	ctl_conf_ptr->health_check_interval	= 0;
@@ -1516,6 +1518,9 @@ validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl)
 
 	s_p_get_string(&conf->epilog, "Epilog", hashtbl);
 
+	if (!s_p_get_uint32(&conf->epilog_msg_time, "EpilogMsgTime", hashtbl))
+		conf->epilog_msg_time = DEFAULT_EPILOG_MSG_TIME;
+
 	if (!s_p_get_uint16(&conf->fast_schedule, "FastSchedule", hashtbl))
 		conf->fast_schedule = DEFAULT_FAST_SCHEDULE;
 
diff --git a/src/common/read_config.h b/src/common/read_config.h
index 768d98c5ef3..34eea1322d5 100644
--- a/src/common/read_config.h
+++ b/src/common/read_config.h
@@ -51,6 +51,7 @@ extern char *default_plugstack;
 #define DEFAULT_AUTH_TYPE          "auth/none"
 #define DEFAULT_CACHE_GROUPS        0
 #define DEFAULT_CRYPTO_TYPE        "crypto/openssl"
+#define DEFAULT_EPILOG_MSG_TIME     2000
 #define DEFAULT_FAST_SCHEDULE       1
 #define DEFAULT_FIRST_JOB_ID        1
 #define DEFAULT_GET_ENV_TIMEOUT     2
diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c
index 6528f4d3eb9..67630952fd0 100644
--- a/src/common/slurm_protocol_api.c
+++ b/src/common/slurm_protocol_api.c
@@ -211,6 +211,20 @@ uint32_t slurm_get_max_mem_per_task(void)
 	return mem_per_task;
 }
 
+/* slurm_get_epilog_msg_time
+ * RET EpilogMsgTime value from slurm.conf
+ */
+uint32_t slurm_get_epilog_msg_time(void)
+{
+	uint32_t epilog_msg_time;
+	slurm_ctl_conf_t *conf;
+
+	conf = slurm_conf_lock();
+	epilog_msg_time = conf->epilog_msg_time;
+	slurm_conf_unlock();
+	return epilog_msg_time;
+}
+
 /* slurm_get_env_timeout
  * return default timeout for srun/sbatch --get-user-env option
  */
diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h
index fc86cc31f64..0746e183e74 100644
--- a/src/common/slurm_protocol_api.h
+++ b/src/common/slurm_protocol_api.h
@@ -107,6 +107,11 @@ uint32_t slurm_get_def_mem_per_task(void);
  */
 uint32_t slurm_get_max_mem_per_task(void);
 
+/* slurm_get_epilog_msg_time
+ * RET EpilogMsgTime value from slurm.conf
+ */
+uint32_t slurm_get_epilog_msg_time(void);
+
 /* slurm_get_env_timeout
  * return default timeout for srun/sbatch --get-user-env option
  */
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index 739d87445a2..83ce2b343a7 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -2212,6 +2212,7 @@ _pack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t * build_ptr, Buf buffer)
 	pack32(build_ptr->def_mem_per_task, buffer);
 
 	packstr(build_ptr->epilog, buffer);
+	pack32(build_ptr->epilog_msg_time, buffer);
 
 	pack16(build_ptr->fast_schedule, buffer);
 	pack32(build_ptr->first_job_id, buffer);
@@ -2357,6 +2358,7 @@ _unpack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t **
 	safe_unpack32(&build_ptr->def_mem_per_task, buffer);
 
 	safe_unpackstr_xmalloc(&build_ptr->epilog, &uint32_tmp, buffer);
+	safe_unpack32(&build_ptr->epilog_msg_time, buffer);
 
 	safe_unpack16(&build_ptr->fast_schedule, buffer);
 	safe_unpack32(&build_ptr->first_job_id, buffer);
diff --git a/src/plugins/sched/gang/gang.c b/src/plugins/sched/gang/gang.c
index b9b102dc7d5..adeadc71cc0 100644
--- a/src/plugins/sched/gang/gang.c
+++ b/src/plugins/sched/gang/gang.c
@@ -222,20 +222,15 @@ _load_phys_res_cnt()
 {
 	int i, array_size = GS_CPU_ARRAY_INCREMENT;
 	uint32_t adder;
-	if (gs_cpus_per_res)
-		xfree(gs_cpus_per_res);
-	gs_cpus_per_res = NULL;
-	if (gs_cpu_count_reps)
-		xfree(gs_cpu_count_reps);
-	gs_cpu_count_reps = NULL;
+
+	xfree(gs_cpus_per_res);
+	xfree(gs_cpu_count_reps);
 	gs_num_groups = 0;
 	if (gr_type == GS_NODE || gr_type == GS_SOCKET)
 		return;
 
 	gs_cpus_per_res   = xmalloc(array_size * sizeof(uint16_t));
 	gs_cpu_count_reps = xmalloc(array_size * sizeof(uint32_t));
-	gs_cpus_per_res[0] = 0;
-	gs_cpu_count_reps[0] = 0;
 	for (i = 0; i < node_record_count; i++) {
 		uint16_t res = _compute_resources(i, 0);
 		if (gs_cpus_per_res[gs_num_groups] == res) {
@@ -301,31 +296,17 @@ _destroy_parts() {
 		xfree(tmp->part_name);
 		for (i = 0; i < tmp->num_jobs; i++) {
 			j_ptr = tmp->job_list[i];
-			if (j_ptr->bitmap) {
+			if (j_ptr->bitmap)
 				bit_free(j_ptr->bitmap);
-				j_ptr->bitmap = NULL;
-			}
-			if (j_ptr->alloc_cpus) {
-				xfree(j_ptr->alloc_cpus);
-				j_ptr->alloc_cpus = NULL;
-			}
+			xfree(j_ptr->alloc_cpus);
 			xfree(j_ptr);
 		}
-		if (tmp->job_list) {
-			xfree(tmp->job_list);
-			tmp->job_list = NULL;
-		}
-		if (tmp->active_bitmap) {
+		if (tmp->active_bitmap)
 			bit_free(tmp->active_bitmap);
-			tmp->active_bitmap = NULL;
-		}
-		if (tmp->active_cpus) {
-			xfree(tmp->active_cpus);
-			tmp->active_cpus = NULL;
-		}
+		xfree(tmp->active_cpus);
+		xfree(tmp->job_list);
 	}
 	xfree(gs_part_list);
-	gs_part_list = NULL;
 }
 
 /* just build the gs_part_list. The job_list will be created */
@@ -347,17 +328,10 @@ _build_parts() {
 		fatal ("memory allocation failure");
 
 	gs_part_list = xmalloc(num_parts * sizeof(struct gs_part));
-
 	i = 0;
 	while ((p_ptr = (struct part_record *) list_next(part_iterator))) {
 		gs_part_list[i].part_name = xstrdup(p_ptr->name);
-		gs_part_list[i].num_jobs = 0;
-		gs_part_list[i].job_list = NULL;
-		gs_part_list[i].job_list_size = 0;
-		gs_part_list[i].jobs_running = 0;
-		gs_part_list[i].active_bitmap = NULL;
-		gs_part_list[i].array_size = 0;
-		gs_part_list[i].active_cpus = NULL;
+		/* everything else is already set to zero/NULL */
 		gs_part_list[i].next = &(gs_part_list[i+1]);
 		i++;
 	}
@@ -563,8 +537,8 @@ static void
 _load_alloc_cpus(struct gs_job *j_ptr, bitstr_t *nodemap)
 {
 	int i, a, alloc_index, sz;
-	if (j_ptr->alloc_cpus)
-		xfree(j_ptr->alloc_cpus);
+
+	xfree(j_ptr->alloc_cpus);
 	sz = bit_set_count(j_ptr->bitmap);
 	j_ptr->alloc_cpus = xmalloc(sz * sizeof(uint16_t));
 
@@ -648,8 +622,7 @@ _add_job_to_part(struct gs_part *p_ptr, uint32_t job_id, bitstr_t *job_bitmap)
 		p_ptr->job_list_size = default_job_list_size;
 		p_ptr->job_list = xmalloc(p_ptr->job_list_size *
 						sizeof(struct gs_job *));
-		for (i = 0; i < p_ptr->job_list_size; i++)
-			p_ptr->job_list[i] = NULL;
+		/* job_list is already initialized to all NULL */
 	}
 	/* protect against duplicates */
 	for (i = 0; i < p_ptr->num_jobs; i++) {
@@ -745,10 +718,7 @@ _remove_job_from_part(uint32_t job_id, struct gs_part *p_ptr)
 		_signal_job(j_ptr->job_id, GS_RESUME);
 	}
 	bit_free(j_ptr->bitmap);
-	j_ptr->bitmap = NULL;
-	if (j_ptr->alloc_cpus)
-		xfree(j_ptr->alloc_cpus);
-	j_ptr->alloc_cpus = NULL;
+	xfree(j_ptr->alloc_cpus);
 	xfree(j_ptr);
 
 	/* in order to remove this job from the active row,
@@ -914,14 +884,8 @@ gs_fini()
 	
 	pthread_mutex_lock(&data_mutex);
 	_destroy_parts();
-	if (gs_cpus_per_res) {
-		xfree(gs_cpus_per_res);
-		gs_cpus_per_res = NULL;
-	}
-	if (gs_cpu_count_reps) {
-		xfree(gs_cpu_count_reps);
-		gs_cpu_count_reps = NULL;
-	}
+	xfree(gs_cpus_per_res);
+	xfree(gs_cpu_count_reps);
 	gs_num_groups = 0;
 	pthread_mutex_unlock(&data_mutex);
 	debug3("sched/gang: leaving gs_fini");
diff --git a/src/plugins/select/cons_res/dist_tasks.c b/src/plugins/select/cons_res/dist_tasks.c
index 2ecbb4a621c..68c06621862 100644
--- a/src/plugins/select/cons_res/dist_tasks.c
+++ b/src/plugins/select/cons_res/dist_tasks.c
@@ -89,7 +89,8 @@ int compute_c_b_task_dist(struct select_cr_job *job)
 
 #if (CR_DEBUG)	
 	for (i = 0; i < job->nhosts; i++) {
-		info("cons_res _c_b_task_dist %u host_index %d nprocs %u maxtasks %u cpus %u alloc_cpus %u", 
+		info("cons_res _c_b_task_dist %u host_index %d nprocs %u "
+		     "maxtasks %u cpus %u alloc_cpus %u", 
 		     job->job_id, i, job->nprocs, 
 		     maxtasks, job->cpus[i], job->alloc_cpus[i]);
 	}
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index 7a99c72b585..e6c73d45668 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -329,6 +329,7 @@ void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr)
 	conf_ptr->crypto_type         = xstrdup(conf->crypto_type);
 	conf_ptr->def_mem_per_task    = conf->def_mem_per_task;
 	conf_ptr->epilog              = xstrdup(conf->epilog);
+	conf_ptr->epilog_msg_time     = conf->epilog_msg_time;
 	conf_ptr->fast_schedule       = conf->fast_schedule;
 	conf_ptr->first_job_id        = conf->first_job_id;
 	conf_ptr->inactive_limit      = conf->inactive_limit;
diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index b5c5a7daa34..463e976f363 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -2505,6 +2505,7 @@ static void _sync_messages_kill(kill_job_msg_t *req)
 	int host_cnt, host_inx;
 	char *host;
 	hostset_t hosts;
+	int epilog_msg_time;
 
 	hosts = hostset_create(req->nodes);
 	host_cnt = hostset_count(hosts);
@@ -2523,7 +2524,8 @@ static void _sync_messages_kill(kill_job_msg_t *req)
 		}
 		free(host);
 	}
-	_delay_rpc(host_inx, host_cnt, 10000);
+	epilog_msg_time = slurm_get_epilog_msg_time();
+	_delay_rpc(host_inx, host_cnt, epilog_msg_time);
 
  fini:	hostset_destroy(hosts);
 }
-- 
GitLab
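
For context, the staggering that EpilogMsgTime controls can be sketched in a
few lines of C. This is a minimal stand-alone sketch, not the patch's
_delay_rpc() implementation: the function name delay_epilog_msg and the
parameter usec_per_msg are invented for the example, while host_inx and
host_cnt mirror the variables used in _sync_messages_kill() above. Each slurmd
sleeps in proportion to its position among the job's nodes, so slurmctld
receives roughly one epilog-complete message per EpilogMsgTime interval
instead of a simultaneous burst.

#include <stdint.h>
#include <time.h>

static void delay_epilog_msg(int host_inx, int host_cnt,
			     uint32_t usec_per_msg)
{
	/* host_inx is this node's index among the job's host_cnt nodes;
	 * usec_per_msg corresponds to EpilogMsgTime (default 2000 usec,
	 * so a 1000-node job is spread over about two seconds). */
	uint64_t usec = (uint64_t) host_inx * usec_per_msg;
	struct timespec ts;

	if (host_cnt <= 1)
		return;			/* single node: nothing to spread out */
	ts.tv_sec  = usec / 1000000;
	ts.tv_nsec = (usec % 1000000) * 1000;
	nanosleep(&ts, NULL);		/* sketch only: ignore early wakeups */
}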