From f6e22a48676880d8d016b5b44f1ece366cc50b66 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@schedmd.com> Date: Thu, 8 Nov 2012 14:17:25 -0800 Subject: [PATCH] Accounting - Move logic from job_mgr.c to acct_policy.c. One behavioral change: a check was added to see whether limits are enforced before doing any of the work. Previously that check was missing, so some jobs could get killed if limits were enforced and then turned off. --- src/slurmctld/acct_policy.c | 150 ++++++++++++++++++++++++++++++++++++ src/slurmctld/acct_policy.h | 7 ++ src/slurmctld/job_mgr.c | 130 +------------------------------ 3 files changed, 158 insertions(+), 129 deletions(-) diff --git a/src/slurmctld/acct_policy.c b/src/slurmctld/acct_policy.c index 18cfa082e1c..7baf13fad8c 100644 --- a/src/slurmctld/acct_policy.c +++ b/src/slurmctld/acct_policy.c @@ -2005,3 +2005,153 @@ extern int acct_policy_update_pending_job(struct job_record *job_ptr) return rc; } + +/* + * acct_policy_job_time_out - Determine if the specified job has timed + * out based on its QOS or association. + */ +extern bool acct_policy_job_time_out(struct job_record *job_ptr) +{ + uint64_t job_cpu_usage_mins = 0; + uint64_t usage_mins; + uint32_t wall_mins; + slurmdb_qos_rec_t *qos = NULL; + slurmdb_association_rec_t *assoc = NULL; + assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, + READ_LOCK, NO_LOCK, NO_LOCK }; + time_t now; + + /* now see if we are enforcing limits */ + if (!(accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) + return false; + + assoc_mgr_lock(&locks); + + qos = (slurmdb_qos_rec_t *)job_ptr->qos_ptr; + assoc = (slurmdb_association_rec_t *)job_ptr->assoc_ptr; + + now = time(NULL); + + /* find out how many cpu minutes this job has been + * running for. 
*/ + job_cpu_usage_mins = (uint64_t) + ((((now - job_ptr->start_time) + - job_ptr->tot_sus_time) / 60) + * job_ptr->total_cpus); + + /* The idea here is for qos to trump what an association + * has set for a limit, so if an association sets a + * wall limit of 10 mins and the qos has 20 mins set and the + * job has been running for 11 minutes it continues + * until 20. + */ + if (qos) { + usage_mins = (uint64_t)(qos->usage->usage_raw / 60.0); + wall_mins = qos->usage->grp_used_wall / 60; + + if ((qos->grp_cpu_mins != (uint64_t)INFINITE) + && (usage_mins >= qos->grp_cpu_mins)) { + last_job_update = now; + info("Job %u timed out, " + "the job is at or exceeds QOS %s's " + "group max cpu minutes of %"PRIu64" " + "with %"PRIu64"", + job_ptr->job_id, + qos->name, + qos->grp_cpu_mins, + usage_mins); + job_ptr->state_reason = FAIL_TIMEOUT; + goto job_failed; + } + + if ((qos->grp_wall != INFINITE) + && (wall_mins >= qos->grp_wall)) { + last_job_update = now; + info("Job %u timed out, " + "the job is at or exceeds QOS %s's " + "group wall limit of %u with %u", + job_ptr->job_id, + qos->name, qos->grp_wall, + wall_mins); + job_ptr->state_reason = FAIL_TIMEOUT; + goto job_failed; + } + + if ((qos->max_cpu_mins_pj != (uint64_t)INFINITE) + && (job_cpu_usage_mins >= qos->max_cpu_mins_pj)) { + last_job_update = now; + info("Job %u timed out, " + "the job is at or exceeds QOS %s's " + "max cpu minutes of %"PRIu64" " + "with %"PRIu64"", + job_ptr->job_id, + qos->name, + qos->max_cpu_mins_pj, + job_cpu_usage_mins); + job_ptr->state_reason = FAIL_TIMEOUT; + goto job_failed; + } + } + + /* handle any association stuff here */ + while (assoc) { + usage_mins = (uint64_t)(assoc->usage->usage_raw / 60.0); + wall_mins = assoc->usage->grp_used_wall / 60; + + if ((qos && (qos->grp_cpu_mins == INFINITE)) + && (assoc->grp_cpu_mins != (uint64_t)INFINITE) + && (usage_mins >= assoc->grp_cpu_mins)) { + info("Job %u timed out, " + "assoc %u is at or exceeds " + "group max cpu minutes limit %"PRIu64" " + 
"with %"PRIu64" for account %s", + job_ptr->job_id, assoc->id, + assoc->grp_cpu_mins, + usage_mins, + assoc->acct); + job_ptr->state_reason = FAIL_TIMEOUT; + break; + } + + if ((qos && (qos->grp_wall == INFINITE)) + && (assoc->grp_wall != INFINITE) + && (wall_mins >= assoc->grp_wall)) { + info("Job %u timed out, " + "assoc %u is at or exceeds " + "group wall limit %u " + "with %u for account %s", + job_ptr->job_id, assoc->id, + assoc->grp_wall, + wall_mins, assoc->acct); + job_ptr->state_reason = FAIL_TIMEOUT; + break; + } + + if ((qos && (qos->max_cpu_mins_pj == INFINITE)) + && (assoc->max_cpu_mins_pj != (uint64_t)INFINITE) + && (job_cpu_usage_mins >= assoc->max_cpu_mins_pj)) { + info("Job %u timed out, " + "assoc %u is at or exceeds " + "max cpu minutes limit %"PRIu64" " + "with %"PRIu64" for account %s", + job_ptr->job_id, assoc->id, + assoc->max_cpu_mins_pj, + job_cpu_usage_mins, + assoc->acct); + job_ptr->state_reason = FAIL_TIMEOUT; + break; + } + + assoc = assoc->usage->parent_assoc_ptr; + /* these limits don't apply to the root assoc */ + if(assoc == assoc_mgr_root_assoc) + break; + } +job_failed: + assoc_mgr_unlock(&locks); + + if (job_ptr->state_reason == FAIL_TIMEOUT) + return true; + + return false; +} diff --git a/src/slurmctld/acct_policy.h b/src/slurmctld/acct_policy.h index bac08981564..0f4f3ae2458 100644 --- a/src/slurmctld/acct_policy.h +++ b/src/slurmctld/acct_policy.h @@ -108,4 +108,11 @@ extern bool acct_policy_job_runnable_state(struct job_record *job_ptr); */ extern int acct_policy_update_pending_job(struct job_record *job_ptr); +/* + * acct_policy_job_time_out - Determine if the specified job has timed + * out based on its QOS or association. 
Returns True if job is + * timed out and sets job_ptr->state_reason = FAIL_TIMEOUT; + */ +extern bool acct_policy_job_time_out(struct job_record *job_ptr); + #endif /* !_HAVE_ACCT_POLICY_H */ diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 5924c7e8dcc..9c9e121a37b 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -4850,9 +4850,6 @@ void job_time_limit(void) slurmctld_conf.msg_timeout + 1); time_t over_run; int resv_status = 0; - uint64_t job_cpu_usage_mins = 0; - uint64_t usage_mins; - uint32_t wall_mins; if (slurmctld_conf.over_time_limit == (uint16_t) INFINITE) over_run = now - (365 * 24 * 60 * 60); /* one year */ @@ -4862,11 +4859,6 @@ void job_time_limit(void) begin_job_resv_check(); job_iterator = list_iterator_create(job_list); while ((job_ptr =(struct job_record *) list_next(job_iterator))) { - slurmdb_qos_rec_t *qos = NULL; - slurmdb_association_rec_t *assoc = NULL; - assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, - READ_LOCK, NO_LOCK, NO_LOCK }; - xassert (job_ptr->magic == JOB_MAGIC); if (IS_JOB_CONFIGURING(job_ptr)) { @@ -4888,13 +4880,6 @@ void job_time_limit(void) if (!IS_JOB_RUNNING(job_ptr)) continue; - /* find out how many cpu minutes this job has been - * running for. */ - job_cpu_usage_mins = (uint64_t) - ((((now - job_ptr->start_time) - - job_ptr->tot_sus_time) / 60) - * job_ptr->total_cpus); - if (slurmctld_conf.inactive_limit && (job_ptr->batch_flag == 0) && (job_ptr->time_last_active <= old) && @@ -4946,120 +4931,7 @@ void job_time_limit(void) (list_count(job_ptr->step_list) > 0)) check_job_step_time_limit(job_ptr, now); - assoc_mgr_lock(&locks); - qos = (slurmdb_qos_rec_t *)job_ptr->qos_ptr; - assoc = (slurmdb_association_rec_t *)job_ptr->assoc_ptr; - - /* The idea here is for qos to trump what an association - * has set for a limit, so if an association set of - * wall 10 mins and the qos has 20 mins set and the - * job has been running for 11 minutes it continues - * until 20. 
- */ - if(qos) { - usage_mins = (uint64_t)(qos->usage->usage_raw / 60.0); - wall_mins = qos->usage->grp_used_wall / 60; - - if ((qos->grp_cpu_mins != (uint64_t)INFINITE) - && (usage_mins >= qos->grp_cpu_mins)) { - last_job_update = now; - info("Job %u timed out, " - "the job is at or exceeds QOS %s's " - "group max cpu minutes of %"PRIu64" " - "with %"PRIu64"", - job_ptr->job_id, - qos->name, - qos->grp_cpu_mins, - usage_mins); - job_ptr->state_reason = FAIL_TIMEOUT; - goto job_failed; - } - - if ((qos->grp_wall != INFINITE) - && (wall_mins >= qos->grp_wall)) { - last_job_update = now; - info("Job %u timed out, " - "the job is at or exceeds QOS %s's " - "group wall limit of %u with %u", - job_ptr->job_id, - qos->name, qos->grp_wall, - wall_mins); - job_ptr->state_reason = FAIL_TIMEOUT; - goto job_failed; - } - - if ((qos->max_cpu_mins_pj != (uint64_t)INFINITE) - && (job_cpu_usage_mins >= qos->max_cpu_mins_pj)) { - last_job_update = now; - info("Job %u timed out, " - "the job is at or exceeds QOS %s's " - "max cpu minutes of %"PRIu64" " - "with %"PRIu64"", - job_ptr->job_id, - qos->name, - qos->max_cpu_mins_pj, - job_cpu_usage_mins); - job_ptr->state_reason = FAIL_TIMEOUT; - goto job_failed; - } - } - - /* handle any association stuff here */ - while(assoc) { - usage_mins = (uint64_t)(assoc->usage->usage_raw / 60.0); - wall_mins = assoc->usage->grp_used_wall / 60; - - if ((qos && (qos->grp_cpu_mins == INFINITE)) - && (assoc->grp_cpu_mins != (uint64_t)INFINITE) - && (usage_mins >= assoc->grp_cpu_mins)) { - info("Job %u timed out, " - "assoc %u is at or exceeds " - "group max cpu minutes limit %"PRIu64" " - "with %"PRIu64" for account %s", - job_ptr->job_id, assoc->id, - assoc->grp_cpu_mins, - usage_mins, - assoc->acct); - job_ptr->state_reason = FAIL_TIMEOUT; - break; - } - - if ((qos && (qos->grp_wall == INFINITE)) - && (assoc->grp_wall != INFINITE) - && (wall_mins >= assoc->grp_wall)) { - info("Job %u timed out, " - "assoc %u is at or exceeds " - "group wall limit 
%u " - "with %u for account %s", - job_ptr->job_id, assoc->id, - assoc->grp_wall, - wall_mins, assoc->acct); - job_ptr->state_reason = FAIL_TIMEOUT; - break; - } - - if ((qos && (qos->max_cpu_mins_pj == INFINITE)) - && (assoc->max_cpu_mins_pj != (uint64_t)INFINITE) - && (job_cpu_usage_mins >= assoc->max_cpu_mins_pj)) { - info("Job %u timed out, " - "assoc %u is at or exceeds " - "max cpu minutes limit %"PRIu64" " - "with %"PRIu64" for account %s", - job_ptr->job_id, assoc->id, - assoc->max_cpu_mins_pj, - job_cpu_usage_mins, - assoc->acct); - job_ptr->state_reason = FAIL_TIMEOUT; - break; - } - - assoc = assoc->usage->parent_assoc_ptr; - /* these limits don't apply to the root assoc */ - if(assoc == assoc_mgr_root_assoc) - break; - } - job_failed: - assoc_mgr_unlock(&locks); + acct_policy_job_time_out(job_ptr); if(job_ptr->state_reason == FAIL_TIMEOUT) { last_job_update = now; -- GitLab