Commit 80b3b84a authored by Morris Jette

Start to flesh out preempt/job_prio

parent 99063507
@@ -80,11 +80,281 @@ const char plugin_name[] = "Preempt by Job Priority and Runtime";
const char plugin_type[] = "preempt/job_prio";
const uint32_t plugin_version = 100;
static bool _job_prio_preemptable(struct job_record *preemptor,
struct job_record *preemptee);
/* The acct_usage_element data structure holds information about
* an association's current usage and current CPU count (a usage sketch
* follows _find_acct_usage_list_entry below). */
typedef struct
{
uint32_t *id;
double *current_usage;
uint32_t *current_cpu_count;
} acct_usage_element;
/*****End of plugin specific declarations**********************************/
/* Destroy an acct_usage_element data structure. */
static void _destroy_acct_usage_element(void *object)
{
acct_usage_element *tmp = (acct_usage_element *)object;
xfree(tmp->id);
xfree(tmp->current_usage);
xfree(tmp->current_cpu_count);
xfree(tmp);
}
/* Find the entry with the matching association ID in the usage_acct_list List. */
static int _find_acct_usage_list_entry(void *x, void *key)
{
acct_usage_element *element_ptr = (acct_usage_element *) x;
uint32_t *keyid = (uint32_t*)key;
if (*(element_ptr->id) == *keyid)
return 1;
return 0;
}
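/* Illustrative sketch: one way the structure and helpers above could be used
 * together with slurmctld's List API. _track_acct_usage is a hypothetical
 * helper, not referenced by this plugin; it assumes list_create(),
 * list_find_first(), list_append() and xmalloc() are available through the
 * plugin's existing includes, and that the caller built the list with
 * list_create(_destroy_acct_usage_element). */
static void _track_acct_usage(List usage_list, uint32_t assoc_id,
			      double usage, uint32_t cpu_cnt)
{
	acct_usage_element *elem;

	elem = (acct_usage_element *) list_find_first(usage_list,
			_find_acct_usage_list_entry, &assoc_id);
	if (elem == NULL) {
		/* First time this association is seen: add a new entry */
		elem = xmalloc(sizeof(acct_usage_element));
		elem->id = xmalloc(sizeof(uint32_t));
		elem->current_usage = xmalloc(sizeof(double));
		elem->current_cpu_count = xmalloc(sizeof(uint32_t));
		*(elem->id) = assoc_id;
		list_append(usage_list, elem);
	}
	/* Accumulate the association's usage and CPU count */
	*(elem->current_usage) += usage;
	*(elem->current_cpu_count) += cpu_cnt;
}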
/* Code taken from job_info.c to calculate the cumulative run time of a job */
static time_t _get_job_runtime(struct job_record *job_ptr)
{
time_t end_time, run_time;
if (IS_JOB_PENDING(job_ptr))
run_time = 0;
else if (IS_JOB_SUSPENDED(job_ptr))
run_time = job_ptr->pre_sus_time;
else {
if (IS_JOB_RUNNING(job_ptr) || (job_ptr->end_time == 0))
end_time = time(NULL);
else
end_time = job_ptr->end_time;
if (job_ptr->suspend_time) {
run_time = (time_t)
(difftime(end_time, job_ptr->suspend_time)
+ job_ptr->pre_sus_time);
} else {
run_time = (time_t)
difftime(end_time, job_ptr->start_time);
}
}
return run_time;
}
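/* Worked example (illustrative numbers, assuming suspend_time is updated to
 * the resume time and pre_sus_time accumulates run time prior to suspension,
 * as in job_info.c): a job that ran for 800 seconds before being suspended
 * reports run_time = pre_sus_time = 800 while it stays suspended. If it is
 * resumed at t=2600 and inspected at t=3000, run_time = (3000 - 2600) + 800 =
 * 1200 seconds; the suspended interval is not counted. */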
/* Return true if the cumulative run time of job1 is greater than that of job2 */
static bool _is_job_runtime_greater(struct job_record *job_ptr1,
struct job_record *job_ptr2)
{
time_t runtime_job1, runtime_job2;
double timediff_job1_job2 = 0.0;
runtime_job1 = _get_job_runtime(job_ptr1);
runtime_job2 = _get_job_runtime(job_ptr2);
timediff_job1_job2 = difftime(runtime_job1, runtime_job2);
if (timediff_job1_job2 > 0) {
if (slurm_get_debug_flags() & DEBUG_FLAG_PRIO) {
info("%s: Runtime of JobId %u > JobId %u (%u > %u)",
plugin_type, job_ptr1->job_id, job_ptr2->job_id,
(uint32_t) runtime_job1, (uint32_t) runtime_job2);
}
return true;
} else {
if (slurm_get_debug_flags() & DEBUG_FLAG_PRIO) {
info("%s: Runtime of JobId %u <= JobId %u (%u <= %u)",
plugin_type, job_ptr1->job_id, job_ptr2->job_id,
(uint32_t) runtime_job1, (uint32_t) runtime_job2);
}
return false;
}
}
/* This _get_nb_cpus function is closely based on the Job_Size calculation
* in job_manager.c, reused here to determine the resources a job requests. As
* noted in the comment on the Job_Size calculation, the first scheduling run
* may not have the actual total_cpus yet, so we start with the amount
* requested; the actual required CPU count is filled in later. This function
* estimates the future value of total_cpus when it is not yet set.
*/
static int _get_nb_cpus(struct job_record *job_ptr)
{
uint32_t cpu_cnt = 0;
uint32_t min_nodes = 0;
uint32_t max_nodes = 0;
uint32_t req_nodes = 0;
uint32_t cpus_per_node;
cpus_per_node = (uint32_t) job_ptr->part_ptr->total_cpus /
job_ptr->part_ptr->total_nodes;
min_nodes = MAX(job_ptr->details->min_nodes,
job_ptr->part_ptr->min_nodes);
if (job_ptr->details->max_nodes == 0) {
max_nodes = job_ptr->part_ptr->max_nodes;
} else {
max_nodes = MIN(job_ptr->details->max_nodes,
job_ptr->part_ptr->max_nodes);
}
max_nodes = MIN(max_nodes, 500000); /* prevent overflows */
if (!job_ptr->limit_set_max_nodes && job_ptr->details->max_nodes)
req_nodes = max_nodes;
else
req_nodes = min_nodes;
if (job_ptr->total_cpus) {
/* This indicates that nodes have been allocated already, but
* the job might have been requeued afterward. */
cpu_cnt = job_ptr->total_cpus;
if (slurm_get_debug_flags() & DEBUG_FLAG_PRIO) {
info("%s: JobId=%u (%s) total_cpus=%u",
plugin_type, job_ptr->job_id, job_ptr->name,
cpu_cnt);
}
} else {
cpu_cnt = req_nodes * cpus_per_node;
if (slurm_get_debug_flags() & DEBUG_FLAG_PRIO) {
info("%s: JobId=%u (%s) req_cpus=%u",
plugin_type, job_ptr->job_id, job_ptr->name,
cpu_cnt);
}
}
return cpu_cnt;
}
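/* Worked example (illustrative numbers): a partition with total_cpus=1024 on
 * total_nodes=64 gives cpus_per_node=16. A pending job that itself requested
 * 2-8 nodes (limit_set_max_nodes not set) uses req_nodes = max_nodes = 8, so
 * the estimate is 8 * 16 = 128 CPUs. Once an allocation has filled in
 * total_cpus, that value is used directly instead of the estimate. */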
/* Test if the preemptor's request would over-allocate its account */
static int _overalloc_test(struct job_record *preemptor,
struct job_record *preemptee)
{
uint32_t cpu_cnt_preemptee, cpu_cnt_preemptor;
slurmdb_association_rec_t *assoc_preemptee, *assoc_preemptor;
double shares_preemptee, shares_preemptor;
uint32_t new_usage_preemptee, new_usage_preemptor;
double allotment_preemptee, allotment_preemptor;
double new_fairshare_preemptee, new_fairshare_preemptor;
double new_fairshare_diff;
char *relation = "equal";
int rc = 0;
cpu_cnt_preemptee = _get_nb_cpus(preemptee);
cpu_cnt_preemptor = _get_nb_cpus(preemptor);
assoc_preemptee = (slurmdb_association_rec_t *)preemptee->assoc_ptr;
assoc_preemptor = (slurmdb_association_rec_t *)preemptor->assoc_ptr;
shares_preemptee = assoc_preemptee->usage->shares_norm;
shares_preemptor = assoc_preemptor->usage->shares_norm;
new_usage_preemptee = assoc_preemptee->usage->grp_used_cpus;
new_usage_preemptor = assoc_preemptor->usage->grp_used_cpus +
cpu_cnt_preemptor;
allotment_preemptee = shares_preemptee * preemptee->part_ptr->total_cpus;
allotment_preemptor = shares_preemptor * preemptor->part_ptr->total_cpus;
/* Fairshare will be less than 1 if running the job will not overrun
* the share allocation */
new_fairshare_preemptee = (double)new_usage_preemptee /
allotment_preemptee;
new_fairshare_preemptor = (double)new_usage_preemptor /
allotment_preemptor;
new_fairshare_diff = new_fairshare_preemptee - new_fairshare_preemptor;
/* We don't always want to preempt based solely on priority.
* A fairshare value greater than 1 means the account's share allocation
* would be overrun.
* 1) if both jobs would over-allocate their accounts' shares -> use the
* priority value
* 2) if fairshare for the preemptor is less than 1 but fairshare for the
* preemptee is greater than 1 -> the preemptor CAN preempt
* 3) if fairshare for the preemptee is less than 1 but fairshare for the
* preemptor is greater than 1 -> the preemptor WILL NOT preempt
* 4) if fairshare for both jobs is less than 1 -> use the priority value
* 5) if both jobs have equal fairshare OR are from the same account ->
* use the priority value
*/
if (((new_fairshare_preemptee > 1.0 && new_fairshare_preemptor < 1.0) ||
(new_fairshare_preemptee < 1.0 && new_fairshare_preemptor > 1.0))&&
(new_fairshare_diff != 0.0) &&
(strcmp(assoc_preemptor->acct, assoc_preemptee->acct) != 0)) {
if (new_fairshare_diff > 0.0) {
relation = "lower (better)";
rc = 1; /* Preemptor can preempt */
} else {
relation = "higher (worse)";
rc = -1; /* Preemptor can not preempt */
}
}
if (slurm_get_debug_flags() & DEBUG_FLAG_PRIO) {
info("%s: Preemptor(%u, %s) acccount %s have %s "
"fairshare than preemptee(%u, %s) account %s %f vs. %f",
plugin_type, preemptor->job_id, preemptor->name,
assoc_preemptor->acct, relation, preemptee->job_id,
preemptee->name, assoc_preemptee->acct,
new_fairshare_preemptor, new_fairshare_preemptor);
info(" CPU CNT: %u and %u USED CPUS: %u and %u "
"SHARES: %f and %f TOT-CPUS: %u and %u",
cpu_cnt_preemptor, cpu_cnt_preemptee,
assoc_preemptor->usage->grp_used_cpus,
assoc_preemptee->usage->grp_used_cpus,
shares_preemptor, shares_preemptee,
preemptor->part_ptr->total_cpus,
preemptee->part_ptr->total_cpus);
}
return rc;
}
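/* Worked example for the test above (illustrative numbers): both associations
 * have shares_norm = 0.25 in a 1000-CPU partition, so each allotment is 250
 * CPUs. If the preemptee's account already uses 300 CPUs, its fairshare is
 * 300 / 250 = 1.2 (over-allocated). If the preemptor's account uses 100 CPUs
 * and the preemptor requests 64 more, its fairshare would be 164 / 250 =
 * 0.656. The difference is positive and the accounts differ, so rc = 1 and
 * the preemptor may preempt regardless of relative job priority (case 2 in
 * the comment above). */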
/* Return true if the preemptor can preempt the preemptee, otherwise false */
static bool _job_prio_preemptable(struct job_record *preemptor,
struct job_record *preemptee)
{
uint32_t job_prio1, job_prio2;
int rc;
if (CHECK_FOR_PREEMPTOR_OVERALLOC) {
rc = _overalloc_test(preemptor, preemptee);
if (rc > 0)
return true;
else if (rc < 0)
return false;
}
job_prio1 = preemptor->priority;
job_prio2 = preemptee->priority;
if (job_prio1 > job_prio2) {
if (slurm_get_debug_flags() & DEBUG_FLAG_PRIO) {
info("%s: Priority of JobId %u > JobId %u (%u > %u)",
plugin_type, preemptor->job_id, preemptee->job_id,
job_prio1, job_prio2);
}
return true; /* Preemptor can preempt */
} else {
if (slurm_get_debug_flags() & DEBUG_FLAG_PRIO) {
info("%s: Priority of JobId %u <= JobId %u (%u <= %u)",
plugin_type, preemptor->job_id, preemptee->job_id,
job_prio1, job_prio2);
}
return false; /* Preemptor can not preempt */
}
}
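/* Note: when CHECK_FOR_PREEMPTOR_OVERALLOC is set, the fairshare test above
 * acts as a gate: a non-zero result from _overalloc_test() decides the
 * outcome on its own, and only a zero result (comparable fairshare, or jobs
 * from the same account) falls through to the plain priority comparison. */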
/* Sort jobs by priority. Use runtime as secondary key */
static int _sort_by_job_prio(void *x, void *y)
{
struct job_record *job_ptr1 = (struct job_record *) x;
struct job_record *job_ptr2 = (struct job_record *) y;
if (job_ptr1->priority > job_ptr2->priority)
return 1;
else if (job_ptr1->priority < job_ptr2->priority)
return -1;
else if (_is_job_runtime_greater(job_ptr1, job_ptr2))
return 1;
return 0;
}
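/* Illustrative note: if this comparator is handed to list_sort() on the
 * candidate preemptee list (that call is not visible in this hunk), jobs are
 * ordered by ascending priority, and jobs of equal priority are ordered so
 * that the one with less accumulated run time comes first. The presumable
 * intent is to preempt the lowest-priority, shortest-running jobs first and
 * so minimize lost work. */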
/**************************************************************************/
/* TAG( init ) */
/**************************************************************************/
@@ -109,7 +379,7 @@ extern int init( void )
/**************************************************************************/
/* TAG( fini ) */
/**************************************************************************/
extern void fini( void )
extern void fini(void)
{
/* Empty. */
}
@@ -130,12 +400,12 @@ extern List find_preemptable_jobs(struct job_record *job_ptr)
return preemptee_job_list;
}
if (!IS_JOB_PENDING(preemptor_job_ptr)) {
error("%s: job %u not pending",
error("%s: JobId %u not pending",
plugin_type, preemptor_job_ptr->job_id);
return preemptee_job_list;
}
if (preemptor_job_ptr->part_ptr == NULL) {
error("%s: job %u has NULL partition ptr",
error("%s: JobId %u has NULL partition ptr",
plugin_type, preemptor_job_ptr->job_id);
return preemptee_job_list;
}
@@ -146,7 +416,7 @@ extern List find_preemptable_jobs(struct job_record *job_ptr)
}
if (slurm_get_debug_flags() & DEBUG_FLAG_PRIO) {
info("%s: Looking for jobs to preempt for job %u",
info("%s: Looking for jobs to preempt for JobId %u",
plugin_type, preemptor_job_ptr->job_id);
}
@@ -157,7 +427,7 @@ extern List find_preemptable_jobs(struct job_record *job_ptr)
if (!IS_JOB_RUNNING(preemptee_job_ptr) &&
!IS_JOB_SUSPENDED(preemptee_job_ptr))
continue;
if (!_job_prio_preemptable(preemptor_job_ptr, preemptee_job_ptr))
if (!_job_prio_preemptable(preemptor_job_ptr,preemptee_job_ptr))
continue;
if ((preemptee_job_ptr->node_bitmap == NULL) ||
(bit_overlap(preemptee_job_ptr->node_bitmap,
@@ -185,25 +455,6 @@ extern List find_preemptable_jobs(struct job_record *job_ptr)
return preemptee_job_list;
}
/*
* Return true if the preemptor can preempt the preemptee, otherwise false
* */
static bool _job_prio_preemptable(struct job_record *preemptor,
struct job_record *preemptee)
{
uint32_t job_prio1, job_prio2;
job_prio1 = preemptor->priority;
job_prio2 = preemptee->priority;
if (job_prio2 >= job_prio1) {
return false; /* Preemptor can not preempt */
} else {
return true; /* Preemptor can preempt */
}
}
/**************************************************************************/
/* TAG( job_preempt_mode ) */
/**************************************************************************/
......