diff --git a/src/slurmctld/acct_policy.c b/src/slurmctld/acct_policy.c index 18cfa082e1c94cb5a1062585da78c32d5aa675b3..7baf13fad8c1c8fc5f3c65d5c9792cac8264dec3 100644 --- a/src/slurmctld/acct_policy.c +++ b/src/slurmctld/acct_policy.c @@ -2005,3 +2005,153 @@ extern int acct_policy_update_pending_job(struct job_record *job_ptr) return rc; } + +/* + * acct_policy_job_runnable - Determine of the specified job has timed + * out based on it's QOS or association. + */ +extern bool acct_policy_job_time_out(struct job_record *job_ptr) +{ + uint64_t job_cpu_usage_mins = 0; + uint64_t usage_mins; + uint32_t wall_mins; + slurmdb_qos_rec_t *qos = NULL; + slurmdb_association_rec_t *assoc = NULL; + assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, + READ_LOCK, NO_LOCK, NO_LOCK }; + time_t now; + + /* now see if we are enforcing limits */ + if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) + return false; + + assoc_mgr_lock(&locks); + + qos = (slurmdb_qos_rec_t *)job_ptr->qos_ptr; + assoc = (slurmdb_association_rec_t *)job_ptr->assoc_ptr; + + now = time(NULL); + + /* find out how many cpu minutes this job has been + * running for. */ + job_cpu_usage_mins = (uint64_t) + ((((now - job_ptr->start_time) + - job_ptr->tot_sus_time) / 60) + * job_ptr->total_cpus); + + /* The idea here is for qos to trump what an association + * has set for a limit, so if an association set of + * wall 10 mins and the qos has 20 mins set and the + * job has been running for 11 minutes it continues + * until 20. + */ + if (qos) { + usage_mins = (uint64_t)(qos->usage->usage_raw / 60.0); + wall_mins = qos->usage->grp_used_wall / 60; + + if ((qos->grp_cpu_mins != (uint64_t)INFINITE) + && (usage_mins >= qos->grp_cpu_mins)) { + last_job_update = now; + info("Job %u timed out, " + "the job is at or exceeds QOS %s's " + "group max cpu minutes of %"PRIu64" " + "with %"PRIu64"", + job_ptr->job_id, + qos->name, + qos->grp_cpu_mins, + usage_mins); + job_ptr->state_reason = FAIL_TIMEOUT; + goto job_failed; + } + + if ((qos->grp_wall != INFINITE) + && (wall_mins >= qos->grp_wall)) { + last_job_update = now; + info("Job %u timed out, " + "the job is at or exceeds QOS %s's " + "group wall limit of %u with %u", + job_ptr->job_id, + qos->name, qos->grp_wall, + wall_mins); + job_ptr->state_reason = FAIL_TIMEOUT; + goto job_failed; + } + + if ((qos->max_cpu_mins_pj != (uint64_t)INFINITE) + && (job_cpu_usage_mins >= qos->max_cpu_mins_pj)) { + last_job_update = now; + info("Job %u timed out, " + "the job is at or exceeds QOS %s's " + "max cpu minutes of %"PRIu64" " + "with %"PRIu64"", + job_ptr->job_id, + qos->name, + qos->max_cpu_mins_pj, + job_cpu_usage_mins); + job_ptr->state_reason = FAIL_TIMEOUT; + goto job_failed; + } + } + + /* handle any association stuff here */ + while (assoc) { + usage_mins = (uint64_t)(assoc->usage->usage_raw / 60.0); + wall_mins = assoc->usage->grp_used_wall / 60; + + if ((qos && (qos->grp_cpu_mins == INFINITE)) + && (assoc->grp_cpu_mins != (uint64_t)INFINITE) + && (usage_mins >= assoc->grp_cpu_mins)) { + info("Job %u timed out, " + "assoc %u is at or exceeds " + "group max cpu minutes limit %"PRIu64" " + "with %"PRIu64" for account %s", + job_ptr->job_id, assoc->id, + assoc->grp_cpu_mins, + usage_mins, + assoc->acct); + job_ptr->state_reason = FAIL_TIMEOUT; + break; + } + + if ((qos && (qos->grp_wall == INFINITE)) + && (assoc->grp_wall != INFINITE) + && (wall_mins >= assoc->grp_wall)) { + info("Job %u timed out, " + "assoc %u is at or exceeds " + "group wall limit %u " + "with %u for account %s", + job_ptr->job_id, assoc->id, + assoc->grp_wall, + wall_mins, assoc->acct); + job_ptr->state_reason = FAIL_TIMEOUT; + break; + } + + if ((qos && (qos->max_cpu_mins_pj == INFINITE)) + && (assoc->max_cpu_mins_pj != (uint64_t)INFINITE) + && (job_cpu_usage_mins >= assoc->max_cpu_mins_pj)) { + info("Job %u timed out, " + "assoc %u is at or exceeds " + "max cpu minutes limit %"PRIu64" " + "with %"PRIu64" for account %s", + job_ptr->job_id, assoc->id, + assoc->max_cpu_mins_pj, + job_cpu_usage_mins, + assoc->acct); + job_ptr->state_reason = FAIL_TIMEOUT; + break; + } + + assoc = assoc->usage->parent_assoc_ptr; + /* these limits don't apply to the root assoc */ + if(assoc == assoc_mgr_root_assoc) + break; + } +job_failed: + assoc_mgr_unlock(&locks); + + if (job_ptr->state_reason == FAIL_TIMEOUT) + return true; + + return false; +} diff --git a/src/slurmctld/acct_policy.h b/src/slurmctld/acct_policy.h index bac08981564614ef115572b5175f67b42aed3902..0f4f3ae24588d115408338ac45fed0f525a90747 100644 --- a/src/slurmctld/acct_policy.h +++ b/src/slurmctld/acct_policy.h @@ -108,4 +108,11 @@ extern bool acct_policy_job_runnable_state(struct job_record *job_ptr); */ extern int acct_policy_update_pending_job(struct job_record *job_ptr); +/* + * acct_policy_job_runnable - Determine of the specified job has timed + * out based on it's QOS or association. Returns True if job is + * timed out and sets job_ptr->state_reason = FAIL_TIMEOUT; + */ +extern bool acct_policy_job_time_out(struct job_record *job_ptr); + #endif /* !_HAVE_ACCT_POLICY_H */ diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 5924c7e8dccbce314f2fc74303f90de3a3e7bbef..9c9e121a37bc94ad1ee8ff7bf5b7663788203c5e 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -4850,9 +4850,6 @@ void job_time_limit(void) slurmctld_conf.msg_timeout + 1); time_t over_run; int resv_status = 0; - uint64_t job_cpu_usage_mins = 0; - uint64_t usage_mins; - uint32_t wall_mins; if (slurmctld_conf.over_time_limit == (uint16_t) INFINITE) over_run = now - (365 * 24 * 60 * 60); /* one year */ @@ -4862,11 +4859,6 @@ void job_time_limit(void) begin_job_resv_check(); job_iterator = list_iterator_create(job_list); while ((job_ptr =(struct job_record *) list_next(job_iterator))) { - slurmdb_qos_rec_t *qos = NULL; - slurmdb_association_rec_t *assoc = NULL; - assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, - READ_LOCK, NO_LOCK, NO_LOCK }; - xassert (job_ptr->magic == JOB_MAGIC); if (IS_JOB_CONFIGURING(job_ptr)) { @@ -4888,13 +4880,6 @@ void job_time_limit(void) if (!IS_JOB_RUNNING(job_ptr)) continue; - /* find out how many cpu minutes this job has been - * running for. */ - job_cpu_usage_mins = (uint64_t) - ((((now - job_ptr->start_time) - - job_ptr->tot_sus_time) / 60) - * job_ptr->total_cpus); - if (slurmctld_conf.inactive_limit && (job_ptr->batch_flag == 0) && (job_ptr->time_last_active <= old) && @@ -4946,120 +4931,7 @@ void job_time_limit(void) (list_count(job_ptr->step_list) > 0)) check_job_step_time_limit(job_ptr, now); - assoc_mgr_lock(&locks); - qos = (slurmdb_qos_rec_t *)job_ptr->qos_ptr; - assoc = (slurmdb_association_rec_t *)job_ptr->assoc_ptr; - - /* The idea here is for qos to trump what an association - * has set for a limit, so if an association set of - * wall 10 mins and the qos has 20 mins set and the - * job has been running for 11 minutes it continues - * until 20. - */ - if(qos) { - usage_mins = (uint64_t)(qos->usage->usage_raw / 60.0); - wall_mins = qos->usage->grp_used_wall / 60; - - if ((qos->grp_cpu_mins != (uint64_t)INFINITE) - && (usage_mins >= qos->grp_cpu_mins)) { - last_job_update = now; - info("Job %u timed out, " - "the job is at or exceeds QOS %s's " - "group max cpu minutes of %"PRIu64" " - "with %"PRIu64"", - job_ptr->job_id, - qos->name, - qos->grp_cpu_mins, - usage_mins); - job_ptr->state_reason = FAIL_TIMEOUT; - goto job_failed; - } - - if ((qos->grp_wall != INFINITE) - && (wall_mins >= qos->grp_wall)) { - last_job_update = now; - info("Job %u timed out, " - "the job is at or exceeds QOS %s's " - "group wall limit of %u with %u", - job_ptr->job_id, - qos->name, qos->grp_wall, - wall_mins); - job_ptr->state_reason = FAIL_TIMEOUT; - goto job_failed; - } - - if ((qos->max_cpu_mins_pj != (uint64_t)INFINITE) - && (job_cpu_usage_mins >= qos->max_cpu_mins_pj)) { - last_job_update = now; - info("Job %u timed out, " - "the job is at or exceeds QOS %s's " - "max cpu minutes of %"PRIu64" " - "with %"PRIu64"", - job_ptr->job_id, - qos->name, - qos->max_cpu_mins_pj, - job_cpu_usage_mins); - job_ptr->state_reason = FAIL_TIMEOUT; - goto job_failed; - } - } - - /* handle any association stuff here */ - while(assoc) { - usage_mins = (uint64_t)(assoc->usage->usage_raw / 60.0); - wall_mins = assoc->usage->grp_used_wall / 60; - - if ((qos && (qos->grp_cpu_mins == INFINITE)) - && (assoc->grp_cpu_mins != (uint64_t)INFINITE) - && (usage_mins >= assoc->grp_cpu_mins)) { - info("Job %u timed out, " - "assoc %u is at or exceeds " - "group max cpu minutes limit %"PRIu64" " - "with %"PRIu64" for account %s", - job_ptr->job_id, assoc->id, - assoc->grp_cpu_mins, - usage_mins, - assoc->acct); - job_ptr->state_reason = FAIL_TIMEOUT; - break; - } - - if ((qos && (qos->grp_wall == INFINITE)) - && (assoc->grp_wall != INFINITE) - && (wall_mins >= assoc->grp_wall)) { - info("Job %u timed out, " - "assoc %u is at or exceeds " - "group wall limit %u " - "with %u for account %s", - job_ptr->job_id, assoc->id, - assoc->grp_wall, - wall_mins, assoc->acct); - job_ptr->state_reason = FAIL_TIMEOUT; - break; - } - - if ((qos && (qos->max_cpu_mins_pj == INFINITE)) - && (assoc->max_cpu_mins_pj != (uint64_t)INFINITE) - && (job_cpu_usage_mins >= assoc->max_cpu_mins_pj)) { - info("Job %u timed out, " - "assoc %u is at or exceeds " - "max cpu minutes limit %"PRIu64" " - "with %"PRIu64" for account %s", - job_ptr->job_id, assoc->id, - assoc->max_cpu_mins_pj, - job_cpu_usage_mins, - assoc->acct); - job_ptr->state_reason = FAIL_TIMEOUT; - break; - } - - assoc = assoc->usage->parent_assoc_ptr; - /* these limits don't apply to the root assoc */ - if(assoc == assoc_mgr_root_assoc) - break; - } - job_failed: - assoc_mgr_unlock(&locks); + acct_policy_job_time_out(job_ptr); if(job_ptr->state_reason == FAIL_TIMEOUT) { last_job_update = now;