From 25367a648ad001e12b0013a17f208366ddf16a4e Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Wed, 23 Jan 2008 22:30:09 +0000
Subject: [PATCH] set node write lock for backfill.

---
 src/plugins/sched/backfill/backfill.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c
index a42754ee823..06d13e22953 100644
--- a/src/plugins/sched/backfill/backfill.c
+++ b/src/plugins/sched/backfill/backfill.c
@@ -76,6 +76,7 @@ typedef struct node_space_map {
 	bitstr_t *avail_bitmap;
 	int next;	/* next record, by time, zero termination */
 } node_space_map_t;
+int backfilled_jobs = 0;
 
 /*********************** local variables *********************/
 static bool new_work      = false;
@@ -90,7 +91,7 @@ static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER;
 
 /* Set __DEBUG to get detailed logging for this thread without 
  * detailed logging for the entire slurmctld daemon */
-#define __DEBUG			1
+#define __DEBUG			0
 
 /* Do not attempt to build job/resource/time record for
  * more than MAX_BACKFILL_JOB_CNT records */
@@ -164,14 +165,17 @@ extern void *backfill_agent(void *args)
 	char tv_str[20];
 	time_t now;
 	static time_t last_backfill_time = 0;
-	/* Read config, node, and partitions; Write jobs */
+	/* Read config, and partitions; Write jobs and nodes */
 	slurmctld_lock_t all_locks = {
-		READ_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
+		READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK };
 
 	while (!stop_backfill) {
 		sleep(1);		/* don't run continuously */
 
 		now = time(NULL);
+		/* Avoid resource fragmentation if important */
+		if (switch_no_frag() && job_is_completing())
+			continue;
 		if ((difftime(now, last_backfill_time) < BACKFILL_INTERVAL) ||
 		    stop_backfill || (!_more_work()))
 			continue;
@@ -347,20 +351,27 @@ static int _start_job(struct job_record *job_ptr, bitstr_t *avail_bitmap)
 	if (rc == SLURM_SUCCESS) {	
 		/* job initiated */
 		last_job_update = time(NULL);
-		info("backfill: JobId=%u NodeList=%s",
+		info("backfill: Started JobId=%u on %s",
 		     job_ptr->job_id, job_ptr->nodes);
 		if (job_ptr->batch_flag)
 			launch_job(job_ptr);
 		else
 			srun_allocate(job_ptr->job_id);
+		backfilled_jobs++;
+#if __DEBUG
+		info("backfill: Jobs backfilled: %d", backfilled_jobs);
+#endif
 	} else if (job_ptr->job_id != fail_jobid) {
 		char *node_list = bitmap2node_name(avail_bitmap);
-		error("backfill: could not start JobId=%u on nodes:%s",
-		      job_ptr->job_id, node_list);
+		/* This happens when a job has sharing disabled and
+		 * a selected node is still completing some job, 
+		 * which should be rare. */
+		verbose("backfill: Failed to start JobId=%u on %s: %s",
+			job_ptr->job_id, node_list, slurm_strerror(rc));
 		xfree(node_list);
 		fail_jobid = job_ptr->job_id;
 	} else {
-		debug3("backfill: could not start JobId=%u", job_ptr->job_id);
+		debug3("backfill: Failed to start JobId=%u", job_ptr->job_id);
 	}
 
 	return rc;
-- 
GitLab