diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 2464d97a212e862b30c9a1283c8f669f4ea80471..cf3a78c90337d12e8bdf6410e0d2e56d7a7c438f 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -3172,28 +3172,33 @@ void job_time_limit(void) time_t now = time(NULL); time_t old = now - slurmctld_conf.inactive_limit; time_t over_run; + int resv_status; if (slurmctld_conf.over_time_limit == (uint16_t) INFINITE) over_run = now - (365 * 24 * 60 * 60); /* one year */ else over_run = now - (slurmctld_conf.over_time_limit * 60); + begin_job_resv_check(); job_iterator = list_iterator_create(job_list); while ((job_ptr = (struct job_record *) list_next(job_iterator))) { xassert (job_ptr->magic == JOB_MAGIC); + + resv_status = job_resv_check(job_ptr); + if (job_ptr->job_state != JOB_RUNNING) continue; /* Consider a job active if it has any active steps */ - if (job_ptr->step_list - && (list_count(job_ptr->step_list) > 0)) + if (job_ptr->step_list && + (list_count(job_ptr->step_list) > 0)) job_ptr->time_last_active = now; - if (slurmctld_conf.inactive_limit - && (job_ptr->time_last_active <= old) - && (job_ptr->part_ptr) - && (job_ptr->part_ptr->root_only == 0)) { + if (slurmctld_conf.inactive_limit && + (job_ptr->time_last_active <= old) && + (job_ptr->part_ptr) && + (job_ptr->part_ptr->root_only == 0)) { /* job inactive, kill it */ info("Inactivity time limit reached for JobId=%u", job_ptr->job_id); @@ -3202,8 +3207,8 @@ void job_time_limit(void) xfree(job_ptr->state_desc); continue; } - if ((job_ptr->time_limit != INFINITE) - && (job_ptr->end_time <= over_run)) { + if ((job_ptr->time_limit != INFINITE) && + (job_ptr->end_time <= over_run)) { last_job_update = now; info("Time limit exhausted for JobId=%u", job_ptr->job_id); @@ -3213,12 +3218,23 @@ void job_time_limit(void) continue; } + if (resv_status != SLURM_SUCCESS) { + last_job_update = now; + info("Reservation ended for JobId=%u", + job_ptr->job_id); + _job_timed_out(job_ptr); + 
job_ptr->state_reason = FAIL_TIMEOUT; + xfree(job_ptr->state_desc); + continue; + } + /* Give srun command warning message about pending timeout */ if (job_ptr->end_time <= (now + PERIODIC_TIMEOUT * 2)) srun_timeout (job_ptr); } list_iterator_destroy(job_iterator); + fini_job_resv_check(); } /* Terminate a job that has exhausted its time limit */ diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 9b4e5a400dc654cb213f8ca6a03b5d50dfca7ae6..d098f27475b81b750f8fc96003d1da552efa25fe 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -371,7 +371,8 @@ void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr) bzero(conf_ptr, sizeof(slurm_ctl_conf_t)); conf_ptr->last_update = time(NULL); - conf_ptr->accounting_storage_enforce = conf->accounting_storage_enforce; + conf_ptr->accounting_storage_enforce = + conf->accounting_storage_enforce; conf_ptr->accounting_storage_host = xstrdup(conf->accounting_storage_host); conf_ptr->accounting_storage_loc = diff --git a/src/slurmctld/reservation.c b/src/slurmctld/reservation.c index f747ddc8d390af1df94336cb707c9ab9bebe2d10..50cb73d9ef9f51608f8d85fd6f06865eb9c23a3a 100644 --- a/src/slurmctld/reservation.c +++ b/src/slurmctld/reservation.c @@ -75,10 +75,10 @@ /* Change RESV_STATE_VERSION value when changing the state save format */ #define RESV_STATE_VERSION "VER001" -time_t last_resv_update = (time_t) 0; - -List resv_list = (List) NULL; -uint32_t top_suffix = 0; +time_t last_resv_update = (time_t) 0; +List resv_list = (List) NULL; +uint32_t resv_over_run; +uint32_t top_suffix = 0; static int _build_account_list(char *accounts, int *account_cnt, char ***account_list); @@ -1146,7 +1146,6 @@ extern int dump_all_resv_state(void) slurmctld_lock_t resv_read_lock = { READ_LOCK, NO_LOCK, READ_LOCK, NO_LOCK }; Buf buffer = init_buf(BUF_SIZE); - time_t now = time(NULL); DEF_TIMERS; START_TIMER; @@ -1163,15 +1162,8 @@ extern int dump_all_resv_state(void) iter = list_iterator_create(resv_list); if (!iter) 
fatal("malloc: list_iterator_create"); - while ((resv_ptr = (slurmctld_resv_t *) list_next(iter))) { - if (resv_ptr->end_time > now) { - _pack_resv(resv_ptr, buffer, true); - } else { - debug("Purging vestigial reservation record %s", - resv_ptr->name); - list_delete_item(iter); - } - } + while ((resv_ptr = (slurmctld_resv_t *) list_next(iter))) + _pack_resv(resv_ptr, buffer, true); list_iterator_destroy(iter); /* Maintain config read lock until we copy state_save_location *\ \* unlock_slurmctld(resv_read_lock); - see below */ @@ -1303,7 +1295,6 @@ static void _validate_all_reservations(void) { ListIterator iter; slurmctld_resv_t *resv_ptr; - time_t now = time(NULL); char *tmp; uint32_t res_num; @@ -1311,11 +1302,7 @@ static void _validate_all_reservations(void) if (!iter) fatal("malloc: list_iterator_create"); while ((resv_ptr = (slurmctld_resv_t *) list_next(iter))) { - if (resv_ptr->end_time < now) { - debug("Purging vestigial reservation record %s", - resv_ptr->name); - list_delete_item(iter); - } else if (!_validate_one_reservation(resv_ptr)) { + if (!_validate_one_reservation(resv_ptr)) { error("Purging invalid reservation record %s", resv_ptr->name); list_delete_item(iter); @@ -1783,3 +1770,83 @@ extern int job_test_resv(struct job_record *job_ptr, time_t *when, return rc; } + +/* Begin scan of all jobs for valid reservations */ +extern void begin_job_resv_check(void) +{ + ListIterator iter; + slurmctld_resv_t *resv_ptr; + slurm_ctl_conf_t *conf; + + if (!resv_list) + return; + + conf = slurm_conf_lock(); + resv_over_run = conf->resv_over_run; + slurm_conf_unlock(); + if (resv_over_run == (uint16_t) INFINITE) + resv_over_run = 365 * 24 * 60 * 60; + else + resv_over_run *= 60; + + iter = list_iterator_create(resv_list); + if (!iter) + fatal("malloc: list_iterator_create"); + while ((resv_ptr = (slurmctld_resv_t *) list_next(iter))) + resv_ptr->job_cnt = 0; + list_iterator_destroy(iter); +} + +/* Test a particular job for valid reservation + * RET 
ESLURM_INVALID_TIME_VALUE if reservation is terminated + * SLURM_SUCCESS if reservation is still valid */ +extern int job_resv_check(struct job_record *job_ptr) +{ + if (!job_ptr->resv_name) + return SLURM_SUCCESS; + + if (!job_ptr->resv_ptr) { + job_ptr->resv_ptr = (slurmctld_resv_t *) list_find_first ( + resv_list, _find_resv_name, + job_ptr->resv_name); + if (!job_ptr->resv_ptr) { + /* This should only happen when we have trouble + * on a slurm restart and fail to recover a + * reservation */ + error("JobId %u linked to defunct reservation %s", + job_ptr->job_id, job_ptr->resv_name); + return ESLURM_INVALID_TIME_VALUE; + } + } + + job_ptr->resv_ptr->job_cnt++; + if ((job_ptr->resv_ptr->end_time + resv_over_run) < time(NULL)) + return ESLURM_INVALID_TIME_VALUE; + return SLURM_SUCCESS; +} + +/* Finish scan of all jobs for valid reservations */ +extern void fini_job_resv_check(void) +{ + ListIterator iter; + slurmctld_resv_t *resv_ptr; + time_t now = time(NULL); + + if (!resv_list) + return; + + iter = list_iterator_create(resv_list); + if (!iter) + fatal("malloc: list_iterator_create"); + while ((resv_ptr = (slurmctld_resv_t *) list_next(iter))) { + if ((resv_ptr->job_cnt == 0) && + (resv_ptr->end_time <= now)) { + info("Purging vestigial reservation record %s", + resv_ptr->name); + list_delete_item(iter); + last_resv_update = now; + } + + } + list_iterator_destroy(iter); +} diff --git a/src/slurmctld/reservation.h b/src/slurmctld/reservation.h index a01b423b4b3d30344fbce204653727a0b116301d..bcc8a8fcac750af3c60345d62a0861af77f544eb 100644 --- a/src/slurmctld/reservation.h +++ b/src/slurmctld/reservation.h @@ -93,4 +93,15 @@ extern int validate_job_resv(struct job_record *job_ptr); extern int job_test_resv(struct job_record *job_ptr, time_t *when, bitstr_t **node_bitmap); +/* Begin scan of all jobs for valid reservations */ +extern void begin_job_resv_check(void); + +/* Test a particular job for valid reservation + * RET ESLURM_INVALID_TIME_VALUE if reservation is 
terminated + * SLURM_SUCCESS if reservation is still valid */ +extern int job_resv_check(struct job_record *job_ptr); + +/* Finish scan of all jobs for valid reservations */ +extern void fini_job_resv_check(void); + #endif /* !_RESERVATION_H */ diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index d8c359074c3142b365756f821f627c2feebd25e0..6a8acc1c92a504547ba4915f126352b2371dc0db 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -313,6 +313,7 @@ typedef struct slurmctld_resv { uint32_t cpu_cnt; /* number of reserved CPUs */ time_t end_time; /* end time of reservation */ char *features; /* required node features */ + uint32_t job_cnt; /* number of jobs associated with this */ uint16_t magic; /* magic cookie, RESV_MAGIC */ char *name; /* name of reservation */ uint32_t node_cnt; /* count of nodes required */