From dd4aa1c3c471eeb4e0af711f308f9eefca96014b Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Wed, 2 Apr 2014 15:08:45 -0700
Subject: [PATCH] Defer scheduling for many batch jobs

Permit multiple batch job submissions to be made for each run of the
scheduler logic if the job submissions occur at nearly the same time.
bug 616
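
Illustrative sketch (not part of this patch) of the coalescing pattern,
using hypothetical stand-ins (run_scheduler(), submit_batch_job(),
SCHED_BATCH_LIMIT) for the slurmctld internals: each submit RPC bumps a
mutex-protected counter, and the scheduler runs once for the whole backlog
when no other submit RPC is active or the backlog exceeds a threshold.

    /* Illustrative sketch only; assumes stand-in names not in the patch. */
    #include <pthread.h>
    #include <stdio.h>

    #define SCHED_BATCH_LIMIT 32    /* flush threshold, as in the patch */

    static pthread_mutex_t sched_cnt_mutex = PTHREAD_MUTEX_INITIALIZER;
    static int sched_cnt = 0;       /* submissions awaiting a scheduler run */
    static int active_rpc_cnt = 0;  /* concurrently executing submit RPCs */

    /* Stand-in for schedule(): run the scheduler once for job_cnt jobs */
    static void run_scheduler(int job_cnt)
    {
            printf("scheduling after %d queued submission(s)\n", job_cnt);
    }

    /* Called by each batch submit RPC after its job record is created */
    static void submit_batch_job(void)
    {
            int sched_now_cnt = 0;

            pthread_mutex_lock(&sched_cnt_mutex);
            sched_cnt++;
            /* Flush the backlog if no other submit RPC is active or the
             * backlog is large; otherwise defer to a later submission. */
            if ((active_rpc_cnt == 0) || (sched_cnt > SCHED_BATCH_LIMIT)) {
                    sched_now_cnt = sched_cnt;
                    sched_cnt = 0;
            }
            pthread_mutex_unlock(&sched_cnt_mutex);

            if (sched_now_cnt)
                    run_scheduler(sched_now_cnt);
    }

    int main(void)
    {
            for (int i = 0; i < 3; i++)
                    submit_batch_job();
            return 0;
    }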
---
 NEWS                     |  2 ++
 src/slurmctld/proc_req.c | 48 ++++++++++++++++++++++++++--------------
 2 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/NEWS b/NEWS
index 3eca0aa52f7..3ba9fab3dca 100644
--- a/NEWS
+++ b/NEWS
@@ -23,6 +23,8 @@ documents those changes that are of interest to users and admins.
  -- Fix preempt/partition_prio to avoid preempting jobs in partitions with
     PreemptMode=OFF
  -- launch/poe - Implicitly set --network in job step create request as needed.
+ -- Permit multiple batch job submissions to be made for each run of the
+    scheduler logic if the job submissions occur at nearly the same time.
 
 * Changes in Slurm 2.6.7
 ========================
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index 7c941100e0a..7aa829a46bf 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -2776,6 +2776,9 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 	static time_t config_update = 0;
 	static bool defer_sched = false;
 	static int active_rpc_cnt = 0;
+	static pthread_mutex_t sched_cnt_mutex = PTHREAD_MUTEX_INITIALIZER;
+	static int sched_cnt = 0;
+	int sched_now_cnt = 0;
 	int error_code = SLURM_SUCCESS;
 	DEF_TIMERS;
 	uint32_t step_id = 0;
@@ -2822,6 +2825,7 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 	if (error_code == SLURM_SUCCESS) {
 		_throttle_start(&active_rpc_cnt);
 		lock_slurmctld(job_write_lock);
+		START_TIMER;	/* Restart after we have locks */
 		if (job_desc_msg->job_id != SLURM_BATCH_SCRIPT) {
 			job_ptr = find_job_record(job_desc_msg->job_id);
 			if (job_ptr && IS_JOB_FINISHED(job_ptr)) {
@@ -2833,7 +2837,7 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 						ESLURM_DUPLICATE_JOB_ID);
 					unlock_slurmctld(job_write_lock);
 					_throttle_fini(&active_rpc_cnt);
-					return;
+					goto fini;
 				}
 				job_ptr = NULL;	/* OK to re-use job id */
 			}
@@ -2852,7 +2856,7 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 				slurm_send_rc_msg(msg, ESLURM_NO_STEPS);
 				unlock_slurmctld(job_write_lock);
 				_throttle_fini(&active_rpc_cnt);
-				return;
+				goto fini;
 			}
 #endif
 
@@ -2865,14 +2869,14 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 				slurm_send_rc_msg(msg, ESLURM_USER_ID_MISSING);
 				unlock_slurmctld(job_write_lock);
 				_throttle_fini(&active_rpc_cnt);
-				return;
+				goto fini;
 			}
 			if (job_ptr->details &&
 			    job_ptr->details->prolog_running) {
 				slurm_send_rc_msg(msg, EAGAIN);
 				unlock_slurmctld(job_write_lock);
 				_throttle_fini(&active_rpc_cnt);
-				return;
+				goto fini;
 			}
 
 			error_code = _launch_batch_step(job_desc_msg, uid,
@@ -2900,7 +2904,7 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 						    &response_msg);
 				schedule_job_save();
 			}
-			return;
+			goto fini;
 		}
 
 		/* Create new job allocation */
@@ -2936,20 +2940,32 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 		response_msg.msg_type = RESPONSE_SUBMIT_BATCH_JOB;
 		response_msg.data = &submit_msg;
 		slurm_send_node_msg(msg->conn_fd, &response_msg);
-		/* We need to use schedule() to initiate a batch job in order
-		 * to run the various prologs, boot the node, etc.
-		 * We also run schedule() even if this job could not start,
-		 * say due to a higher priority job, since the locks are
-		 * released above and we might start some other job here.
-		 *
-		 * In defer mode, avoid triggering the scheduler logic
-		 * for every submit batch job request.
-		 */
-		if (!defer_sched)
-			(void) schedule(schedule_cnt); /* has own locks */
+
+		/* In defer mode, avoid triggering the scheduler logic
+		 * for every submit batch job request. */
+		if (!defer_sched) {
+			slurm_mutex_lock(&sched_cnt_mutex);
+			sched_cnt += schedule_cnt;
+			slurm_mutex_unlock(&sched_cnt_mutex);
+		}
 		schedule_job_save();	/* has own locks */
 		schedule_node_save();	/* has own locks */
 	}
+
+fini:	/* We need to use schedule() to initiate a batch job in order to run
+	 * the various prologs, boot the node, etc. We also run schedule()
+	 * even if this job could not start, say due to a higher priority job,
+	 * since the locks are released above and we might start some other
+	 * job here. We do not run schedule() on each batch submission to
+	 * limit its overhead on large numbers of job submissions */
+	slurm_mutex_lock(&sched_cnt_mutex);
+	if ((active_rpc_cnt == 0) || (sched_cnt > 32)) {
+		sched_now_cnt = sched_cnt;
+		sched_cnt = 0;
+	}
+	slurm_mutex_unlock(&sched_cnt_mutex);
+	if (sched_now_cnt)
+		(void) schedule(sched_now_cnt); /* has own locks */
 }
 
 /* _slurm_rpc_update_job - process RPC to update the configuration of a
-- 
GitLab