From e104201caa31cbbe904ad24eaf881133f3651d33 Mon Sep 17 00:00:00 2001
From: Danny Auble <da@llnl.gov>
Date: Wed, 9 Sep 2009 23:22:30 +0000
Subject: [PATCH] QOS support appears fully functional, at least for
 limits; only lightly tested so far
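
Enforce QOS limits alongside association limits:

- acct_policy_job_runnable() now checks the job's QOS grp_cpu_mins,
  grp_jobs, grp_nodes, grp_wall, max_jobs_pu, max_nodes_pj and
  max_wall_pj limits before walking the association tree.
- job_time_limit() enforces QOS grp_cpu_mins, grp_wall and
  max_cpu_mins_pj on running jobs; a QOS limit, when set, trumps the
  corresponding association limit.
- _validate_acct_policy() validates QOS grp_nodes, grp_submit_jobs,
  max_nodes_pj, max_submit_jobs_pu and max_wall_pj at submit time.
- _determine_and_validate_qos() now takes the association and returns
  the QOS record, reporting errors through errno, so it can be called
  before a job record exists.
- The empty acct_policy_update_running_job_usage() stub is removed.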

---
 src/slurmctld/acct_policy.c | 202 ++++++++++++---
 src/slurmctld/job_mgr.c     | 495 +++++++++++++++++++++++++-----------
 2 files changed, 513 insertions(+), 184 deletions(-)

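Reviewer note: the new max_jobs_pu and max_submit_jobs_pu checks share one
pattern: scan the QOS's per-user usage list for the submitting uid, then
compare that user's count against the cap, where the all-ones value means
"no limit" and a missing record means no usage yet. Below is a minimal
standalone sketch of that pattern; the struct, list, and names are
illustrative stand-ins, not SLURM's acct_used_limits_t or its List API.

/* Standalone sketch of the per-user QOS limit lookup added in this
 * patch: walk a list of per-user usage records, then compare against
 * the QOS cap, treating the all-ones value as "no limit". */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define INFINITE 0xffffffff	/* stand-in for SLURM's INFINITE */

struct used_limits {
	uint32_t uid;
	uint32_t jobs;		/* running jobs this user has in the QOS */
	struct used_limits *next;
};

/* Return false (job must be held) when the user's running-job count
 * has reached the QOS per-user cap. */
static bool user_under_max_jobs(const struct used_limits *list,
				uint32_t uid, uint32_t max_jobs_pu)
{
	const struct used_limits *u;

	if (max_jobs_pu == INFINITE)	/* no per-user limit set */
		return true;
	for (u = list; u; u = u->next)	/* find this user's record */
		if (u->uid == uid)
			break;
	if (u && (u->jobs >= max_jobs_pu))
		return false;
	return true;			/* no record yet: no usage yet */
}

int main(void)
{
	struct used_limits u2 = { .uid = 1002, .jobs = 4, .next = NULL };
	struct used_limits u1 = { .uid = 1001, .jobs = 1, .next = &u2 };

	printf("uid 1001 runnable: %d\n", user_under_max_jobs(&u1, 1001, 4));
	printf("uid 1002 runnable: %d\n", user_under_max_jobs(&u1, 1002, 4));
	return 0;
}

In the patch itself the scan uses list_iterator_create()/list_next() over
qos_ptr->user_limit_list while holding assoc_mgr_qos_lock.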
diff --git a/src/slurmctld/acct_policy.c b/src/slurmctld/acct_policy.c
index 3095cf18eda..b92b23e8f63 100644
--- a/src/slurmctld/acct_policy.c
+++ b/src/slurmctld/acct_policy.c
@@ -361,9 +361,12 @@ extern void acct_policy_job_fini(struct job_record *job_ptr)
  */
 extern bool acct_policy_job_runnable(struct job_record *job_ptr)
 {
+	acct_qos_rec_t *qos_ptr;
 	acct_association_rec_t *assoc_ptr;
 	uint32_t time_limit;
 	bool rc = true;
+	uint64_t usage_mins;
+	uint32_t wall_mins;
 	int parent = 0; /*flag to tell us if we are looking at the
 			 * parent or not 
 			 */
@@ -387,18 +390,174 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr)
 	    (job_ptr->state_reason == WAIT_ASSOC_TIME_LIMIT))
                 job_ptr->state_reason = WAIT_NO_REASON;
 
+	slurm_mutex_lock(&assoc_mgr_qos_lock);
+	qos_ptr = job_ptr->qos_ptr;
+	if(qos_ptr) {
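+		/* usage_raw accumulates CPU-seconds; the grp_cpu_mins
+		 * and grp_wall limits below are expressed in minutes */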
+		usage_mins = (uint64_t)(qos_ptr->usage_raw / 60.0);
+		wall_mins = qos_ptr->grp_used_wall / 60;
+
+		if ((qos_ptr->grp_cpu_mins != (uint64_t)INFINITE)
+		    && (usage_mins >= qos_ptr->grp_cpu_mins)) {
+			job_ptr->state_reason = WAIT_ASSOC_JOB_LIMIT;
+			xfree(job_ptr->state_desc);
+			debug2("Job %u being held, "
+			       "the job is at or exceeds QOS %s's "
+			       "group max cpu minutes of %llu with %llu",
+			       job_ptr->job_id, 
+			       qos_ptr->name, qos_ptr->grp_cpu_mins,
+			       usage_mins);
+			rc = false;
+			goto end_qos;
+		}
+
+		/* NOTE: We can't enforce qos_ptr->grp_cpus at this
+		 * time because we don't have access to a CPU count for the job
+		 * due to how all of the job's specifications interact */
+
+		if ((qos_ptr->grp_jobs != INFINITE) &&
+		    (qos_ptr->grp_used_jobs >= qos_ptr->grp_jobs)) {
+			job_ptr->state_reason = WAIT_ASSOC_JOB_LIMIT;
+			xfree(job_ptr->state_desc);
+			debug2("job %u being held, "
+			       "the job is at or exceeds QOS %s's "
+			       "group max jobs limit %u with %u for qos %s",
+			       job_ptr->job_id, 
+			       qos_ptr->grp_jobs, 
+			       qos_ptr->grp_used_jobs, qos_ptr->name);
+			
+			rc = false;
+			goto end_qos;
+		}
+		
+		if (qos_ptr->grp_nodes != INFINITE) {
+			if (job_ptr->details->min_nodes > qos_ptr->grp_nodes) {
+				info("job %u is being cancelled, "
+				     "min node request %u exceeds "
+				     "group max node limit %u for qos '%s'",
+				     job_ptr->job_id, 
+				     job_ptr->details->min_nodes, 
+				     qos_ptr->grp_nodes,
+				     qos_ptr->name);
+				_cancel_job(job_ptr);
+				rc = false;
+				goto end_qos;
+			} else if ((qos_ptr->grp_used_nodes + 
+				    job_ptr->details->min_nodes) > 
+				   qos_ptr->grp_nodes) {
+				job_ptr->state_reason = 
+					WAIT_ASSOC_RESOURCE_LIMIT;
+				xfree(job_ptr->state_desc);
+				debug2("job %u being held, "
+				       "the job is at or exceeds "
+				       "group max node limit %u "
+				       "with already used %u + requested %u "
+				       "for qos %s",
+				       job_ptr->job_id,
+				       qos_ptr->grp_nodes, 
+				       qos_ptr->grp_used_nodes,
+				       job_ptr->details->min_nodes, 
+				       qos_ptr->name);
+				rc = false;
+				goto end_qos;
+			}
+		}
+
+		/* we don't need to check submit_jobs here */
+		
+		if ((qos_ptr->grp_wall != INFINITE)
+		    && (wall_mins >= qos_ptr->grp_wall)) {
+			job_ptr->state_reason = WAIT_ASSOC_JOB_LIMIT;
+			xfree(job_ptr->state_desc);
+			debug2("job %u being held, "
+			       "the job is at or exceeds "
+			       "group wall limit %u "
+			       "with %u for qos %s",
+			       job_ptr->job_id,
+			       qos_ptr->grp_wall, 
+			       wall_mins, qos_ptr->name);
+			       
+			rc = false;
+			goto end_qos;
+		}
+				
+		/* NOTE: We can't enforce qos_ptr->max_cpu_mins_pj at this
+		 * time because we don't have access to a CPU count for the job
+		 * due to how all of the job's specifications interact */
+		
+		/* NOTE: We can't enforce qos_ptr->max_cpus at this
+		 * time because we don't have access to a CPU count for the job
+		 * due to how all of the job's specifications interact */
+				
+		if (qos_ptr->max_jobs_pu != INFINITE) {
+			acct_used_limits_t *used_limits = NULL;
+			if(qos_ptr->user_limit_list) {
+				ListIterator itr = list_iterator_create(
+					qos_ptr->user_limit_list);
+				while((used_limits = list_next(itr))) {
+					if(used_limits->uid == job_ptr->user_id)
+						break;
+				}
+				list_iterator_destroy(itr);
+			}
+			if(used_limits && (used_limits->jobs
+					   >= qos_ptr->max_jobs_pu)) {
+				job_ptr->state_reason =
+					WAIT_ASSOC_JOB_LIMIT;
+				xfree(job_ptr->state_desc);
+				debug2("job %u being held, "
+				       "the job is at or exceeds "
+				       "max jobs limit %u with %u for QOS %s",
+				       job_ptr->job_id,
+				       qos_ptr->max_jobs_pu, 
+				       used_limits->jobs, qos_ptr->name);
+				rc = false;
+				goto end_qos;
+			}
+		}
+		
+		if (qos_ptr->max_nodes_pj != INFINITE) {
+			if (job_ptr->details->min_nodes > 
+			    qos_ptr->max_nodes_pj) {
+				info("job %u being cancelled, "
+				     "min node limit %u exceeds "
+				     "qos max %u",
+				     job_ptr->job_id,
+				     job_ptr->details->min_nodes, 
+				     qos_ptr->max_nodes_pj);
+				_cancel_job(job_ptr);
+				rc = false;
+				goto end_qos;
+			}
+		}
+			
+		/* we don't need to check submit_jobs_pu here */
+
+		/* if the qos limits have changed since job
+		 * submission and job can not run, then kill it */
+		if (qos_ptr->max_wall_pj != INFINITE) {
+			time_limit = qos_ptr->max_wall_pj;
+			if ((job_ptr->time_limit != NO_VAL) &&
+			    (job_ptr->time_limit > time_limit)) {
+				info("job %u being cancelled, "
+				     "time limit %u exceeds account max %u",
+				     job_ptr->job_id, job_ptr->time_limit, 
+				     time_limit);
+				_cancel_job(job_ptr);
+				rc = false;
+				goto end_qos;
+			}
+		}		
+	}
+end_qos:
+	slurm_mutex_unlock(&assoc_mgr_qos_lock);
+	if(!rc)
+		return rc;
+
 	slurm_mutex_lock(&assoc_mgr_association_lock);
 	assoc_ptr = job_ptr->assoc_ptr;
 	while(assoc_ptr) {
-		uint64_t usage_mins =
-			(uint64_t)(assoc_ptr->usage_raw / 60.0);
-		uint32_t wall_mins = assoc_ptr->grp_used_wall / 60;
+		usage_mins = (uint64_t)(assoc_ptr->usage_raw / 60.0);
+		wall_mins = assoc_ptr->grp_used_wall / 60;
 #if _DEBUG
 		info("acct_job_limits: %u of %u", 
 		     assoc_ptr->used_jobs, assoc_ptr->max_jobs);
 #endif		
-		if ((assoc_ptr->grp_cpu_mins != (uint64_t)NO_VAL)
-		    && (assoc_ptr->grp_cpu_mins != (uint64_t)INFINITE)
+		if ((assoc_ptr->grp_cpu_mins != (uint64_t)INFINITE)
 		    && (usage_mins >= assoc_ptr->grp_cpu_mins)) {
 			job_ptr->state_reason = WAIT_ASSOC_JOB_LIMIT;
 			xfree(job_ptr->state_desc);
@@ -414,8 +573,7 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr)
 			goto end_it;
 		}
 
-		if ((assoc_ptr->grp_jobs != NO_VAL) &&
-		    (assoc_ptr->grp_jobs != INFINITE) &&
+		if ((assoc_ptr->grp_jobs != INFINITE) &&
 		    (assoc_ptr->used_jobs >= assoc_ptr->grp_jobs)) {
 			job_ptr->state_reason = WAIT_ASSOC_JOB_LIMIT;
 			xfree(job_ptr->state_desc);
@@ -430,8 +588,7 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr)
 			goto end_it;
 		}
 		
-		if ((assoc_ptr->grp_nodes != NO_VAL) &&
-		    (assoc_ptr->grp_nodes != INFINITE)) {
+		if (assoc_ptr->grp_nodes != INFINITE) {
 			if (job_ptr->details->min_nodes > 
 			    assoc_ptr->grp_nodes) {
 				info("job %u being cancelled, "
@@ -464,8 +621,7 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr)
 
 		/* we don't need to check submit_jobs here */
 
-		if ((assoc_ptr->grp_wall != NO_VAL) 
-		    && (assoc_ptr->grp_wall != INFINITE)
+		if ((assoc_ptr->grp_wall != INFINITE)
 		    && (wall_mins >= assoc_ptr->grp_wall)) {
 			job_ptr->state_reason = WAIT_ASSOC_JOB_LIMIT;
 			xfree(job_ptr->state_desc);
@@ -499,8 +655,7 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr)
 		 * time because we don't have access to a CPU count for the job
 		 * due to how all of the job's specifications interact */
 
-		if ((assoc_ptr->max_jobs != NO_VAL) &&
-		    (assoc_ptr->max_jobs != INFINITE) &&
+		if ((assoc_ptr->max_jobs != INFINITE) &&
 		    (assoc_ptr->used_jobs >= assoc_ptr->max_jobs)) {
 			job_ptr->state_reason = WAIT_ASSOC_JOB_LIMIT;
 			xfree(job_ptr->state_desc);
@@ -514,8 +669,7 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr)
 			goto end_it;
 		}
 		
-		if ((assoc_ptr->max_nodes_pj != NO_VAL) &&
-		    (assoc_ptr->max_nodes_pj != INFINITE)) {
+		if (assoc_ptr->max_nodes_pj != INFINITE) {
 			if (job_ptr->details->min_nodes > 
 			    assoc_ptr->max_nodes_pj) {
 				info("job %u being cancelled, "
@@ -534,8 +688,7 @@ extern bool acct_policy_job_runnable(struct job_record *job_ptr)
 
 		/* if the association limits have changed since job
 		 * submission and job can not run, then kill it */
-		if ((assoc_ptr->max_wall_pj != NO_VAL) &&
-		    (assoc_ptr->max_wall_pj != INFINITE)) {
+		if (assoc_ptr->max_wall_pj != INFINITE) {
 			time_limit = assoc_ptr->max_wall_pj;
 			if ((job_ptr->time_limit != NO_VAL) &&
 			    (job_ptr->time_limit > time_limit)) {
@@ -556,18 +709,3 @@ end_it:
 	slurm_mutex_unlock(&assoc_mgr_association_lock);
 	return rc;
 }
-
-/* FIX ME: This function should be called every so often to update time, and
- * shares used.  It doesn't do anything right now.
- */
-extern void acct_policy_update_running_job_usage(struct job_record *job_ptr)
-{
-	acct_association_rec_t *assoc_ptr;
-
-	slurm_mutex_lock(&assoc_mgr_association_lock);
-	assoc_ptr = job_ptr->assoc_ptr;
-	while(assoc_ptr) {
-		assoc_ptr = assoc_ptr->parent_assoc_ptr;
-	}
-	slurm_mutex_unlock(&assoc_mgr_association_lock);
-}
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 26837d38726..938503e6f55 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -122,12 +122,13 @@ static int  _copy_job_desc_to_job_record(job_desc_msg_t * job_desc,
 					 struct part_record *part_ptr,
 					 bitstr_t ** exc_bitmap,
 					 bitstr_t ** req_bitmap);
-static job_desc_msg_t * _copy_job_record_to_job_desc(struct job_record *job_ptr);
+static job_desc_msg_t * _copy_job_record_to_job_desc(
+	struct job_record *job_ptr);
 static char *_copy_nodelist_no_dup(char *node_list);
 static void _del_batch_list_rec(void *x);
 static void _delete_job_desc_files(uint32_t job_id);
-static int  _determine_and_validate_qos(struct job_record *job_ptr, 
-					acct_qos_rec_t *qos_rec);
+static acct_qos_rec_t *_determine_and_validate_qos(
+	acct_association_rec_t *assoc_ptr, acct_qos_rec_t *qos_rec);
 static void _dump_job_details(struct job_details *detail_ptr,
 			      Buf buffer);
 static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer);
@@ -169,7 +170,8 @@ static int  _suspend_job_nodes(struct job_record *job_ptr, bool clear_prio);
 static bool _top_priority(struct job_record *job_ptr);
 static bool _validate_acct_policy(job_desc_msg_t *job_desc,
 				  struct part_record *part_ptr,
-				  acct_association_rec_t *assoc_ptr);
+				  acct_association_rec_t *assoc_in,
+				  acct_qos_rec_t *qos_ptr);
 static int  _validate_job_create_req(job_desc_msg_t * job_desc);
 static int  _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate,
 			       uid_t submit_uid);
@@ -297,22 +299,19 @@ static void _delete_job_desc_files(uint32_t job_id)
 	xfree(dir_name);
 }
 
-static int _determine_and_validate_qos(struct job_record *job_ptr, 
-				       acct_qos_rec_t *qos_rec)
+static acct_qos_rec_t *_determine_and_validate_qos(
+	acct_association_rec_t *assoc_ptr, acct_qos_rec_t *qos_rec)
 { 
-	acct_association_rec_t *assoc_ptr = NULL;
+	acct_qos_rec_t *qos_ptr = NULL;
 
 	/* If enforcing associations make sure this is a valid qos
 	   with the association.  If not just fill in the qos and
 	   continue. */
 
-	xassert(job_ptr);
 	if(accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)
-		xassert(job_ptr->assoc_ptr);
+		xassert(assoc_ptr);
 	xassert(qos_rec);
 
-	assoc_ptr = (acct_association_rec_t *)job_ptr->assoc_ptr;	
-
 	if(!qos_rec->name && !qos_rec->id) {
 		if(assoc_ptr && assoc_ptr->valid_qos 
 		   && bit_set_count(assoc_ptr->valid_qos) == 1)
@@ -322,11 +321,11 @@ static int _determine_and_validate_qos(struct job_record *job_ptr,
 	}
 
 	if(assoc_mgr_fill_in_qos(acct_db_conn, qos_rec, accounting_enforce,
-				 (acct_qos_rec_t **) &job_ptr->qos_ptr)
+				 &qos_ptr)
 	   != SLURM_SUCCESS) {
-		error("Invalid qos (%s) for job_id %u", qos_rec->name,
-		      job_ptr->job_id);
-		return ESLURM_INVALID_QOS;
+		error("Invalid qos (%s)", qos_rec->name);
+		errno = ESLURM_INVALID_QOS;
+		return NULL;
 	} 
 	
 	if((accounting_enforce & ACCOUNTING_ENFORCE_QOS)
@@ -338,12 +337,13 @@ static int _determine_and_validate_qos(struct job_record *job_ptr,
 		      "access to qos %s", 
 		      assoc_ptr->id, assoc_ptr->acct, assoc_ptr->user,
 		      assoc_ptr->partition, qos_rec->name);
-		return ESLURM_INVALID_QOS;
+		errno = ESLURM_INVALID_QOS;
+		return NULL;
 	}			
 
-	job_ptr->qos = qos_rec->id;
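+	/* on success errno is SLURM_SUCCESS; on failure it is
+	 * ESLURM_INVALID_QOS and NULL is returned */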
+	errno = SLURM_SUCCESS;
 
-	return SLURM_SUCCESS;
+	return qos_ptr;
 }
 
 
@@ -1051,8 +1051,9 @@ static int _load_job_state(Buf buffer)
 	if(job_ptr->qos) {
 		memset(&qos_rec, 0, sizeof(acct_qos_rec_t));
 		qos_rec.id = job_ptr->qos;
-		if(_determine_and_validate_qos(job_ptr, &qos_rec)
-		   != SLURM_SUCCESS) {
+		job_ptr->qos_ptr = _determine_and_validate_qos(
+			job_ptr->assoc_ptr, &qos_rec);
+		if(errno != SLURM_SUCCESS) {
 			info("Cancelling job %u with invalid qos", job_id);
 			job_ptr->job_state = JOB_CANCELLED;
 			job_ptr->state_reason = FAIL_BANK_ACCOUNT;
@@ -1062,6 +1063,7 @@ static int _load_job_state(Buf buffer)
 			job_ptr->end_time = now;
 			job_completion_logger(job_ptr);
 		} 
+		job_ptr->qos = qos_rec.id;
 	}
 	safe_unpack16(&step_flag, buffer);
 
@@ -2360,7 +2362,7 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run,
 	acct_association_rec_t assoc_rec, *assoc_ptr;
 	List license_list = NULL;
 	bool valid;
-	acct_qos_rec_t qos_rec;
+	acct_qos_rec_t qos_rec, *qos_ptr;
 
 #ifdef HAVE_BG
 	uint16_t geo[SYSTEM_DIMENSIONS];
@@ -2496,8 +2498,26 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run,
 	}
 	if (job_desc->account == NULL)
 		job_desc->account = xstrdup(assoc_rec.acct);
+
+	/* This must be done after we have the assoc_ptr set */
+	
+	memset(&qos_rec, 0, sizeof(acct_qos_rec_t));
+	qos_rec.name = job_desc->qos;
+	if (wiki_sched && job_desc->comment &&
+	    strstr(job_desc->comment, "QOS:")) {
+		if (strstr(job_desc->comment, "FLAGS:PREEMPTOR"))
+			qos_rec.name = "expedite";
+		else if (strstr(job_desc->comment, "FLAGS:PREEMPTEE"))
+			qos_rec.name = "standby";
+	}
+
+	qos_ptr = _determine_and_validate_qos(assoc_ptr, &qos_rec);
+	
+	if(errno != SLURM_SUCCESS)
+		return errno;
+
 	if ((accounting_enforce & ACCOUNTING_ENFORCE_LIMITS) &&
-	    (!_validate_acct_policy(job_desc, part_ptr, assoc_ptr))) {
+	    (!_validate_acct_policy(job_desc, part_ptr, assoc_ptr, qos_ptr))) {
 		info("_job_create: exceeded association's node or time limit "
 		     "for user %u", job_desc->user_id);
 		error_code = ESLURM_ACCOUNTING_POLICY;
@@ -2582,7 +2602,7 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run,
 			geo[i] = 0;
 		}
 		select_g_select_jobinfo_set(job_desc->select_jobinfo,
-				     SELECT_JOBDATA_GEOMETRY, &geo);
+					    SELECT_JOBDATA_GEOMETRY, &geo);
 	} else if (geo[0] != 0) {
 		uint32_t i, tot = 1;
 		for (i=0; i<SYSTEM_DIMENSIONS; i++)
@@ -2596,28 +2616,29 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run,
 		job_desc->min_nodes = tot;
 	}
 	select_g_select_jobinfo_get(job_desc->select_jobinfo,
-			     SELECT_JOBDATA_REBOOT, &reboot);
+				    SELECT_JOBDATA_REBOOT, &reboot);
 	if (reboot == (uint16_t) NO_VAL) {
 		reboot = 0;	/* default is no reboot */
 		select_g_select_jobinfo_set(job_desc->select_jobinfo,
-				     SELECT_JOBDATA_REBOOT, &reboot);
+					    SELECT_JOBDATA_REBOOT, &reboot);
 	}
 	select_g_select_jobinfo_get(job_desc->select_jobinfo,
-			     SELECT_JOBDATA_ROTATE, &rotate);
+				    SELECT_JOBDATA_ROTATE, &rotate);
 	if (rotate == (uint16_t) NO_VAL) {
 		rotate = 1;	/* default is to rotate */
 		select_g_select_jobinfo_set(job_desc->select_jobinfo,
-				     SELECT_JOBDATA_ROTATE, &rotate);
+					    SELECT_JOBDATA_ROTATE, &rotate);
 	}
 	select_g_select_jobinfo_get(job_desc->select_jobinfo,
-			     SELECT_JOBDATA_CONN_TYPE, &conn_type);
+				    SELECT_JOBDATA_CONN_TYPE, &conn_type);
 	if (conn_type == (uint16_t) NO_VAL) {
 		conn_type = (uint16_t) SELECT_TORUS;
 		select_g_select_jobinfo_set(job_desc->select_jobinfo,
-				     SELECT_JOBDATA_CONN_TYPE, &conn_type);
+					    SELECT_JOBDATA_CONN_TYPE,
+					    &conn_type);
 	}
 #endif
 
 	if (job_desc->max_nodes == NO_VAL)
 		job_desc->max_nodes = 0;
 	if ((part_ptr->state_up)
@@ -2675,23 +2696,9 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run,
 	
 	job_ptr->assoc_id = assoc_rec.id;
 	job_ptr->assoc_ptr = (void *) assoc_ptr;
+	job_ptr->qos_ptr = (void *) qos_ptr;
+	job_ptr->qos = qos_rec.id;
 
-	/* This must be done after we have the assoc_ptr set */
-	
-	memset(&qos_rec, 0, sizeof(acct_qos_rec_t));
-	qos_rec.name = job_desc->qos;
-	if (wiki_sched && job_ptr->comment &&
-	    strstr(job_ptr->comment, "QOS:")) {
-		if (strstr(job_ptr->comment, "FLAGS:PREEMPTOR"))
-			qos_rec.name = "expedite";
-		else if (strstr(job_ptr->comment, "FLAGS:PREEMPTEE"))
-			qos_rec.name = "standby";
-	}
-
-	if((error_code = _determine_and_validate_qos(job_ptr, &qos_rec))
-	   != SLURM_SUCCESS) 
-		goto cleanup_fail;
-	
 	/* already confirmed submit_uid==0 */
 	/* If the priority isn't given we will figure it out later
 	 * after we see if the job is eligible or not. So we want
@@ -3597,6 +3604,9 @@ void job_time_limit(void)
 	time_t over_run;
 	int resv_status = 0;
 	uint64_t job_cpu_usage_mins = 0;
+	uint64_t usage_mins;
+	uint32_t wall_mins;
+
 	if (slurmctld_conf.over_time_limit == (uint16_t) INFINITE)
 		over_run = now - (365 * 24 * 60 * 60);	/* one year */
 	else
@@ -3605,7 +3615,7 @@ void job_time_limit(void)
 	begin_job_resv_check();
 	job_iterator = list_iterator_create(job_list);
 	while ((job_ptr =(struct job_record *) list_next(job_iterator))) {
-/* 		acct_qos_rec_t *qos = NULL; */
+		acct_qos_rec_t *qos = NULL;
 		acct_association_rec_t *assoc =	NULL;
 
 		xassert (job_ptr->magic == JOB_MAGIC);
@@ -3637,7 +3647,7 @@ void job_time_limit(void)
 		if (!IS_JOB_RUNNING(job_ptr))
 			continue;
 
-/* 		qos = (acct_qos_rec_t *)job_ptr->qos_ptr; */
+		qos = (acct_qos_rec_t *)job_ptr->qos_ptr;
 		assoc =	(acct_association_rec_t *)job_ptr->assoc_ptr;
 
 		/* find out how many cpu minutes this job has been
@@ -3691,55 +3701,68 @@ void job_time_limit(void)
 		    (list_count(job_ptr->step_list) > 0))
 			check_job_step_time_limit(job_ptr, now);
 
-		/* Too be added later once qos actually works.  The
-		 * idea here is for qos to trump what an association
+		/* The idea here is for qos to trump what an association
 		 * has set for a limit, so if an association set of
 		 * wall 10 mins and the qos has 20 mins set and the
 		 * job has been running for 11 minutes it continues
 		 * until 20.
 		 */
-/* 		if(qos) { */
-/* 			slurm_mutex_lock(&assoc_mgr_qos_lock); */
-/* 			if ((qos->grp_cpu_mins != (uint64_t)NO_VAL) */
-/* 			    && (qos->grp_cpu_mins != (uint64_t)INFINITE) */
-/* 			    && ((uint64_t)qos->usage_raw  */
-/* 				>= qos->grp_cpu_mins)) { */
-/* 				last_job_update = now; */
-/* 				info("QOS %s group max cpu minutes is " */
-/* 				     "at or exceeds %llu with %Lf for JobId=%u", */
-/* 				     qos->name, qos->grp_cpu_mins, */
-/* 				     qos->usage_raw, job_ptr->job_id); */
-/* 				_job_timed_out(job_ptr); */
-/* 				job_ptr->state_reason = FAIL_TIMEOUT; */
-/* 			} */
-
-/* 			if ((qos->max_wall_pj != NO_VAL) */
-/* 			    && (qos->max_wall_pj != INFINITE) */
-/* 			    && (job_ptr-> >= qos->max_wall_pj)) { */
-/* 				last_job_update = now; */
-/* 				info("QOS %s group max cpu minutes is " */
-/* 				     "at or exceeds %llu with %Lf for JobId=%u", */
-/* 				     qos->name, qos->grp_cpu_mins, */
-/* 				     qos->usage_raw, job_ptr->job_id); */
-/* 				_job_timed_out(job_ptr); */
-/* 				job_ptr->state_reason = FAIL_TIMEOUT; */
-/* 			} */
-/* 			slurm_mutex_unlock(&assoc_mgr_qos_lock); */
-
-/* 			if(job_ptr->state_reason == FAIL_TIMEOUT) { */
-/* 				xfree(job_ptr->state_desc); */
-/* 				continue; */
-/* 			} */
-/* 		} */
+		if(qos) {
+			slurm_mutex_lock(&assoc_mgr_qos_lock);
+			usage_mins = (uint64_t)(qos->usage_raw / 60.0);
+			wall_mins = qos->grp_used_wall / 60;
+
+			if ((qos->grp_cpu_mins != (uint64_t)INFINITE)
+			    && (usage_mins >= qos->grp_cpu_mins)) {
+				last_job_update = now;
+				info("Job %u timed out, "
+				     "the job is at or exceeds QOS %s's "
+				     "group max cpu minutes of %llu with %llu",
+				     job_ptr->job_id, 
+				     qos->name, qos->grp_cpu_mins,
+				     usage_mins);
+				job_ptr->state_reason = FAIL_TIMEOUT;
+				slurm_mutex_unlock(&assoc_mgr_qos_lock);
+				goto job_failed;
+			}
+
+			if ((qos->grp_wall != INFINITE)
+			    && (wall_mins >= qos->grp_wall)) {
+				last_job_update = now;
+				info("Job %u timed out, "
+				     "the job is at or exceeds QOS %s's "
+				     "group wall limit of %u with %u",
+				     job_ptr->job_id, 
+				     qos->name, qos->grp_wall,
+				     wall_mins);
+				job_ptr->state_reason = FAIL_TIMEOUT;
+				slurm_mutex_unlock(&assoc_mgr_qos_lock);
+				goto job_failed;
+			}
+			
+			if ((qos->max_cpu_mins_pj != (uint64_t)INFINITE)
+			    && (job_cpu_usage_mins >= qos->max_cpu_mins_pj)) {
+				last_job_update = now;
+				info("Job %u timed out, "
+				     "the job is at or exceeds QOS %s's "
+				     "max cpu minutes of %llu with %llu",
+				     job_ptr->job_id, 
+				     qos->name, qos->max_cpu_mins_pj,
+				     job_cpu_usage_mins);
+				job_ptr->state_reason = FAIL_TIMEOUT;
+				slurm_mutex_unlock(&assoc_mgr_qos_lock);
+				goto job_failed;
+			}
+			slurm_mutex_unlock(&assoc_mgr_qos_lock);
+		}
 
 		/* handle any association stuff here */
 		slurm_mutex_lock(&assoc_mgr_association_lock);
 		while(assoc) {
-			uint64_t usage_mins =
-				(uint64_t)(assoc->usage_raw / 60.0);
-			uint32_t wall_mins = assoc->grp_used_wall / 60;
+			usage_mins = (uint64_t)(assoc->usage_raw / 60.0);
+			wall_mins = assoc->grp_used_wall / 60;
 			
-			if ((assoc->grp_cpu_mins != (uint64_t)NO_VAL)
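+			/* enforce the association limit only when the
+			 * QOS does not set one of its own */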
+			if ((!qos || (qos->grp_cpu_mins == (uint64_t)INFINITE))
 			    && (assoc->grp_cpu_mins != (uint64_t)INFINITE)
 			    && (usage_mins >= assoc->grp_cpu_mins)) {
 				info("Job %u timed out, "
@@ -3753,7 +3776,7 @@ void job_time_limit(void)
 				break;
 			}
 
-			if ((assoc->grp_wall != NO_VAL)
+			if ((!qos || (qos->grp_wall == INFINITE))
 			    && (assoc->grp_wall != INFINITE)
 			    && (wall_mins >= assoc->grp_wall)) {
 				info("Job %u timed out, "
@@ -3767,9 +3790,9 @@ void job_time_limit(void)
 				break;
 			}
 
-			if ((assoc->max_cpu_mins_pj != (uint64_t)NO_VAL)   &&
-			    (assoc->max_cpu_mins_pj != (uint64_t)INFINITE) &&
-			    (job_cpu_usage_mins >= assoc->max_cpu_mins_pj)) {
+			if ((!qos || (qos->max_cpu_mins_pj == (uint64_t)INFINITE))
+			    && (assoc->max_cpu_mins_pj != (uint64_t)INFINITE) 
+			    && (job_cpu_usage_mins >= assoc->max_cpu_mins_pj)) {
 				info("Job %u timed out, "
 				     "assoc %u is at or exceeds "
 				     "max cpu minutes limit %llu "
@@ -3787,7 +3810,7 @@ void job_time_limit(void)
 				break;
 		}
 		slurm_mutex_unlock(&assoc_mgr_association_lock);
-		
+	job_failed:
 		if(job_ptr->state_reason == FAIL_TIMEOUT) {
 			last_job_update = now;
 			_job_timed_out(job_ptr);
@@ -4745,7 +4768,12 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid)
 	time_t now = time(NULL);
 	multi_core_data_t *mc_ptr = NULL;
 	bool update_accounting = false;
-
+#ifdef HAVE_BG
+	uint16_t reboot = (uint16_t) NO_VAL;
+	uint16_t rotate = (uint16_t) NO_VAL;
+	uint16_t geometry[SYSTEM_DIMENSIONS] = {(uint16_t) NO_VAL};
+	char *image = NULL;
+#endif
 	job_ptr = find_job_record(job_specs->job_id);
 	if (job_ptr == NULL) {
 		error("update_job: job_id %u does not exist.",
@@ -4847,43 +4875,6 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid)
 		}
 	}
 
-	if (job_specs->comment && wiki_sched && (!super_user)) {
-		/* User must use Moab command to change job comment */
-		error("Attempt to change comment for job %u",
-		      job_specs->job_id);
-		error_code = ESLURM_ACCESS_DENIED;
-	} else if (job_specs->comment) {
-		xfree(job_ptr->comment);
-		job_ptr->comment = job_specs->comment;
-		job_specs->comment = NULL;	/* Nothing left to free */
-		info("update_job: setting comment to %s for job_id %u",
-		     job_ptr->comment, job_specs->job_id);
-
-		if (wiki_sched && strstr(job_ptr->comment, "QOS:")) {
-			acct_qos_rec_t qos_rec;
-
-			memset(&qos_rec, 0, sizeof(acct_qos_rec_t));
-
-			if (strstr(job_ptr->comment, "FLAGS:PREEMPTOR"))
-				qos_rec.name = "expedite";
-			else if (strstr(job_ptr->comment, "FLAGS:PREEMPTEE"))
-				qos_rec.name = "standby";
-			
-			error_code = _determine_and_validate_qos(job_ptr, 
-								 &qos_rec);
-		}
-	} else if(job_specs->qos) {
-		acct_qos_rec_t qos_rec;
-		
-		info("update_job: setting qos to %s for job_id %u",
-		     job_specs->qos, job_specs->job_id);		
-		
-		memset(&qos_rec, 0, sizeof(acct_qos_rec_t));
-		qos_rec.name = job_specs->qos;
-		
-		error_code = _determine_and_validate_qos(job_ptr, &qos_rec);
-	}	
-
 	if (job_specs->requeue != (uint16_t) NO_VAL) {
 		detail_ptr->requeue = job_specs->requeue;
 		info("update_job: setting requeue to %u for job_id %u",
@@ -5412,13 +5403,7 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid)
 	}
 
 #ifdef HAVE_BG
- {
-	 uint16_t reboot = (uint16_t) NO_VAL;
-	 uint16_t rotate = (uint16_t) NO_VAL;
-	 uint16_t geometry[SYSTEM_DIMENSIONS] = {(uint16_t) NO_VAL};
-	 char *image = NULL;
-
-	 select_g_select_jobinfo_get(job_specs->select_jobinfo,
+ 	 select_g_select_jobinfo_get(job_specs->select_jobinfo,
 				     SELECT_JOBDATA_ROTATE, &rotate);
 	 if (rotate != (uint16_t) NO_VAL) {
 		 if (!IS_JOB_PENDING(job_ptr))
@@ -5525,16 +5510,60 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid)
 				 image);
 		 }
 	 }
- }
 #endif
-        if(update_accounting) {
-		if (job_ptr->details && job_ptr->details->begin_time) {
-			/* Update job record in accounting to reflect changes */
-			jobacct_storage_g_job_start(
-				acct_db_conn, slurmctld_cluster_name, job_ptr);
-		}
-	}
-	return error_code;
+	 /* Always do this last, in case the assoc_ptr changed */
+	 if (job_specs->comment && wiki_sched && (!super_user)) {
+		 /* User must use Moab command to change job comment */
+		 error("Attempt to change comment for job %u",
+		       job_specs->job_id);
+		 error_code = ESLURM_ACCESS_DENIED;
+	 } else if (job_specs->comment) {
+		 xfree(job_ptr->comment);
+		 job_ptr->comment = job_specs->comment;
+		 job_specs->comment = NULL;	/* Nothing left to free */
+		 info("update_job: setting comment to %s for job_id %u",
+		      job_ptr->comment, job_specs->job_id);
+		 
+		 if (wiki_sched && strstr(job_ptr->comment, "QOS:")) {
+			 acct_qos_rec_t qos_rec;
+
+			 memset(&qos_rec, 0, sizeof(acct_qos_rec_t));
+
+			 if (strstr(job_ptr->comment, "FLAGS:PREEMPTOR"))
+				 qos_rec.name = "expedite";
+			 else if (strstr(job_ptr->comment, "FLAGS:PREEMPTEE"))
+				 qos_rec.name = "standby";
+			
+			 job_ptr->qos_ptr = _determine_and_validate_qos(
+				 job_ptr->assoc_ptr, &qos_rec);
+			 job_ptr->qos = qos_rec.id;			
+			 error_code = errno;
+		 }
+	 } else if(job_specs->qos) {
+		 acct_qos_rec_t qos_rec;
+		
+		 info("update_job: setting qos to %s for job_id %u",
+		      job_specs->qos, job_specs->job_id);		
+		
+		 memset(&qos_rec, 0, sizeof(acct_qos_rec_t));
+		 qos_rec.name = job_specs->qos;
+		
+		 job_ptr->qos_ptr = _determine_and_validate_qos(
+			 job_ptr->assoc_ptr, &qos_rec);
+		 job_ptr->qos = qos_rec.id;
+		 error_code = errno;
+	 }
+	
+	 if(update_accounting) {
+		 if (job_ptr->details && job_ptr->details->begin_time) {
+			 /* Update job record in accounting to reflect
+			  * changes */
+			 jobacct_storage_g_job_start(
+				 acct_db_conn, slurmctld_cluster_name, job_ptr);
+		 }
+	 }
+	 return error_code;
 }
 
 
@@ -6740,7 +6769,8 @@ extern void update_job_nodes_completing(void)
 
 static bool _validate_acct_policy(job_desc_msg_t *job_desc,
 				  struct part_record *part_ptr,
-				  acct_association_rec_t *assoc_in)
+				  acct_association_rec_t *assoc_in,
+				  acct_qos_rec_t *qos_ptr)
 {
 	uint32_t time_limit;
 	acct_association_rec_t *assoc_ptr = assoc_in;
@@ -6750,6 +6780,167 @@ static bool _validate_acct_policy(job_desc_msg_t *job_desc,
 	char *user_name = assoc_ptr->user;
 	bool rc = true;
 
+	slurm_mutex_lock(&assoc_mgr_qos_lock);
+	if(qos_ptr) {
+		/* for validation we don't need to look at 
+		 * qos_ptr->grp_cpu_mins.
+		 */
+
+		/* NOTE: We can't enforce qos_ptr->grp_cpus at this
+		 * time because we don't have access to a CPU count for the job
+		 * due to how all of the job's specifications interact */
+
+		/* for validation we don't need to look at 
+		 * qos_ptr->grp_jobs.
+		 */
+
+		if (qos_ptr->grp_nodes != INFINITE) {
+			if (job_desc->min_nodes > qos_ptr->grp_nodes) {
+				info("job submit for user %s(%u): "
+				     "min node request %u exceeds "
+				     "group max node limit %u for qos '%s'",
+				     user_name,
+				     job_desc->user_id, 
+				     job_desc->min_nodes, 
+				     qos_ptr->grp_nodes,
+				     qos_ptr->name);
+				rc = false;
+				goto end_qos;
+			} else if (job_desc->max_nodes == 0
+				   || (max_nodes_set 
+				       && (job_desc->max_nodes 
+					   > qos_ptr->grp_nodes))) {
+				job_desc->max_nodes = qos_ptr->grp_nodes;
+				max_nodes_set = 1;
+			} else if (job_desc->max_nodes > 
+				   qos_ptr->grp_nodes) {
+				info("job submit for user %s(%u): "
+				     "max node changed %u -> %u because "
+				     "of qos limit",
+				     user_name,
+				     job_desc->user_id, 
+				     job_desc->max_nodes, 
+				     qos_ptr->grp_nodes);
+				job_desc->max_nodes = qos_ptr->grp_nodes;
+			}
+		}
+
+		if ((qos_ptr->grp_submit_jobs != INFINITE) &&
+		    (qos_ptr->grp_used_submit_jobs 
+		     >= qos_ptr->grp_submit_jobs)) {
+			info("job submit for user %s(%u): "
+			     "group max submit job limit exceded %u "
+			     "for qos '%s'",
+			     user_name,
+			     job_desc->user_id, 
+			     qos_ptr->grp_submit_jobs,
+			     qos_ptr->name);
+			rc = false;
+			goto end_qos;
+		}
+		
+		
+		/* for validation we don't need to look at 
+		 * qos_ptr->grp_wall. It is checked while the job is running.
+		 */
+		
+				
+		/* for validation we don't need to look at 
+		 * qos_ptr->max_cpu_mins_pj.
+		 */
+		
+		/* NOTE: We can't enforce qos_ptr->max_cpus at this
+		 * time because we don't have access to a CPU count for the job
+		 * due to how all of the job's specifications interact */
+				
+		/* for validation we don't need to look at 
+		 * qos_ptr->max_jobs.
+		 */
+				
+		if (qos_ptr->max_nodes_pj != INFINITE) {
+			if (job_desc->min_nodes > qos_ptr->max_nodes_pj) {
+				info("job submit for user %s(%u): "
+				     "min node limit %u exceeds "
+				     "qos max %u",
+				     user_name,
+				     job_desc->user_id, 
+				     job_desc->min_nodes, 
+				     qos_ptr->max_nodes_pj);
+				rc = false;
+				goto end_qos;
+			} else if (job_desc->max_nodes == 0
+				   || (max_nodes_set 
+				       && (job_desc->max_nodes 
+					   > qos_ptr->max_nodes_pj))) {
+				job_desc->max_nodes = qos_ptr->max_nodes_pj;
+				max_nodes_set = 1;
+			} else if (job_desc->max_nodes > 
+				   qos_ptr->max_nodes_pj) {
+				info("job submit for user %s(%u): "
+				     "max node changed %u -> %u because "
+				     "of qos limit",
+				     user_name,
+				     job_desc->user_id, 
+				     job_desc->max_nodes, 
+				     qos_ptr->max_nodes_pj);
+				job_desc->max_nodes = qos_ptr->max_nodes_pj;
+			}
+		}
+			
+		if (qos_ptr->max_submit_jobs_pu != INFINITE) {
+			acct_used_limits_t *used_limits = NULL;
+			if(qos_ptr->user_limit_list) {
+				ListIterator itr = list_iterator_create(
+					qos_ptr->user_limit_list);
+				while((used_limits = list_next(itr))) {
+					if(used_limits->uid 
+					   == job_desc->user_id)
+						break;
+				}
+				list_iterator_destroy(itr);
+			}
+			if(used_limits && (used_limits->submit_jobs 
+					   >= qos_ptr->max_submit_jobs_pu)) {
+				info("job submit for user %s(%u): "
+				     "account max submit job limit exceded %u",
+				     user_name,
+				     job_desc->user_id, 
+				     qos_ptr->max_submit_jobs_pu);
+				rc = false;
+				goto end_qos;
+			}
+		}
+
+		if (qos_ptr->max_wall_pj != INFINITE) {
+			time_limit = qos_ptr->max_wall_pj;
+			if (job_desc->time_limit == NO_VAL) {
+				if (part_ptr->max_time == INFINITE)
+					job_desc->time_limit = time_limit;
+				else 
+					job_desc->time_limit =
+						MIN(time_limit, 
+						    part_ptr->max_time);
+				timelimit_set = 1;
+			} else if (timelimit_set && 
+				   job_desc->time_limit > time_limit) {
+				job_desc->time_limit = time_limit;
+			} else if (job_desc->time_limit > time_limit) {
+				info("job submit for user %s(%u): "
+				     "time limit %u exceeds account max %u",
+				     user_name,
+				     job_desc->user_id, 
+				     job_desc->time_limit, time_limit);
+				rc = false;
+				goto end_qos;
+			}
+		}
+
+	}
+end_qos:
+	slurm_mutex_unlock(&assoc_mgr_qos_lock);
+	if(!rc)
+		return rc;
+
 	slurm_mutex_lock(&assoc_mgr_association_lock);
 	while(assoc_ptr) {
 		/* for validation we don't need to look at 
@@ -6764,7 +6955,7 @@ static bool _validate_acct_policy(job_desc_msg_t *job_desc,
 		 * assoc_ptr->grp_jobs.
 		 */
 
-		if ((assoc_ptr->grp_nodes != NO_VAL) &&
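+		/* an association limit applies only when the QOS does
+		 * not set its own */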
+		if ((!qos_ptr || (qos_ptr->grp_nodes == INFINITE)) &&
 		    (assoc_ptr->grp_nodes != INFINITE)) {
 			if (job_desc->min_nodes > assoc_ptr->grp_nodes) {
 				info("job submit for user %s(%u): "
@@ -6796,7 +6987,7 @@ static bool _validate_acct_policy(job_desc_msg_t *job_desc,
 			}
 		}
 
-		if ((assoc_ptr->grp_submit_jobs != NO_VAL) &&
+		if ((!qos_ptr || (qos_ptr->grp_submit_jobs == INFINITE)) &&
 		    (assoc_ptr->grp_submit_jobs != INFINITE) &&
 		    (assoc_ptr->used_submit_jobs 
 		     >= assoc_ptr->grp_submit_jobs)) {
@@ -6837,7 +7028,7 @@ static bool _validate_acct_policy(job_desc_msg_t *job_desc,
 		 * assoc_ptr->max_jobs.
 		 */
 		
-		if ((assoc_ptr->max_nodes_pj != NO_VAL) &&
+		if ((!qos_ptr || (qos_ptr->max_nodes_pj == INFINITE)) &&
 		    (assoc_ptr->max_nodes_pj != INFINITE)) {
 			if (job_desc->min_nodes > assoc_ptr->max_nodes_pj) {
 				info("job submit for user %s(%u): "
@@ -6868,7 +7059,7 @@ static bool _validate_acct_policy(job_desc_msg_t *job_desc,
 			}
 		}
 
-		if ((assoc_ptr->max_submit_jobs != NO_VAL) &&
+		if ((!qos_ptr || (qos_ptr->max_submit_jobs_pu == INFINITE)) &&
 		    (assoc_ptr->max_submit_jobs != INFINITE) &&
 		    (assoc_ptr->used_submit_jobs 
 		     >= assoc_ptr->max_submit_jobs)) {
@@ -6881,7 +7072,7 @@ static bool _validate_acct_policy(job_desc_msg_t *job_desc,
 			break;
 		}
 		
-		if ((assoc_ptr->max_wall_pj != NO_VAL) &&
+		if ((!qos_ptr || (qos_ptr->max_wall_pj == INFINITE)) &&
 		    (assoc_ptr->max_wall_pj != INFINITE)) {
 			time_limit = assoc_ptr->max_wall_pj;
 			if (job_desc->time_limit == NO_VAL) {
-- 
GitLab