Skip to content
Snippets Groups Projects
Commit 22f2accc authored by Moe Jette's avatar Moe Jette
Browse files

Backfill was attempting to run while jobs were in the completing state if

CompleteWait was configured to 0 (the default). Change it to wait.
parent 466be21a
No related branches found
No related tags found
No related merge requests found
...@@ -119,6 +119,7 @@ static void _add_reservation(uint32_t start_time, uint32_t end_reserve, ...@@ -119,6 +119,7 @@ static void _add_reservation(uint32_t start_time, uint32_t end_reserve,
static void _attempt_backfill(void); static void _attempt_backfill(void);
static void _diff_tv_str(struct timeval *tv1,struct timeval *tv2, static void _diff_tv_str(struct timeval *tv1,struct timeval *tv2,
char *tv_str, int len_tv_str); char *tv_str, int len_tv_str);
static bool _job_is_completing(void);
static bool _more_work(void); static bool _more_work(void);
static int _num_feature_count(struct job_record *job_ptr); static int _num_feature_count(struct job_record *job_ptr);
static int _start_job(struct job_record *job_ptr, bitstr_t *avail_bitmap); static int _start_job(struct job_record *job_ptr, bitstr_t *avail_bitmap);
...@@ -166,6 +167,37 @@ static void _diff_tv_str(struct timeval *tv1,struct timeval *tv2, ...@@ -166,6 +167,37 @@ static void _diff_tv_str(struct timeval *tv1,struct timeval *tv2,
snprintf(tv_str, len_tv_str, "usec=%ld", delta_t); snprintf(tv_str, len_tv_str, "usec=%ld", delta_t);
} }
/*
 * _job_is_completing - Determine if jobs are in the process of completing.
 *	This is a variant of job_is_completing in slurmctld/job_scheduler.c.
 *	It always gives completing jobs at least 5 secs to complete, even
 *	when the configured complete wait time is smaller (e.g. 0).
 * RET - True if any job is in the process of completing
 */
static bool _job_is_completing(void)
{
	bool completing = false;
	ListIterator job_iterator;
	struct job_record *job_ptr = NULL;
	uint16_t complete_wait = slurm_get_complete_wait();
	time_t grace_secs, recent;

	if (job_list == NULL)
		return completing;

	/* Use the LARGER of the configured wait and 5 seconds. Taking the
	 * smaller value would shrink the window to zero whenever the
	 * configured wait is 0, so a job that started completing a moment
	 * ago would not be noticed. */
	grace_secs = (complete_wait > 5) ? complete_wait : 5;
	recent = time(NULL) - grace_secs;

	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
		/* Only jobs whose end_time falls inside the grace window
		 * count; older completing jobs are ignored. */
		if (IS_JOB_COMPLETING(job_ptr) &&
		    (job_ptr->end_time >= recent)) {
			completing = true;
			break;
		}
	}
	list_iterator_destroy(job_iterator);

	return completing;
}
/* test if job has feature count specification */ /* test if job has feature count specification */
static int _num_feature_count(struct job_record *job_ptr) static int _num_feature_count(struct job_record *job_ptr)
{ {
...@@ -498,7 +530,7 @@ static void _attempt_backfill(void) ...@@ -498,7 +530,7 @@ static void _attempt_backfill(void)
else if (rc != SLURM_SUCCESS) else if (rc != SLURM_SUCCESS)
/* Planned to start job, but something bad /* Planned to start job, but something bad
* happened. */ * happened. */
continue; break;
} }
if (job_ptr->start_time > (now + BACKFILL_WINDOW)) { if (job_ptr->start_time > (now + BACKFILL_WINDOW)) {
/* Starts too far in the future to worry about */ /* Starts too far in the future to worry about */
...@@ -573,7 +605,7 @@ static int _start_job(struct job_record *job_ptr, bitstr_t *resv_bitmap) ...@@ -573,7 +605,7 @@ static int _start_job(struct job_record *job_ptr, bitstr_t *resv_bitmap)
/* This happens when a job has sharing disabled and /* This happens when a job has sharing disabled and
* a selected node is still completing some job, * a selected node is still completing some job,
* which should be a temporary situation. */ * which should be a temporary situation. */
verbose("backfill: Failed to start JobId=%u in %s: %s", verbose("backfill: Failed to start JobId=%u on %s: %s",
job_ptr->job_id, node_list, slurm_strerror(rc)); job_ptr->job_id, node_list, slurm_strerror(rc));
xfree(node_list); xfree(node_list);
fail_jobid = job_ptr->job_id; fail_jobid = job_ptr->job_id;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment