From 3014c4ba437392b9990bf151eea3e8a3b8b8d9f5 Mon Sep 17 00:00:00 2001
From: Danny Auble <da@schedmd.com>
Date: Mon, 13 Mar 2017 16:21:24 -0600
Subject: [PATCH] This is the meat of getting the consumed energy for the job
 to the database.

We use the last step in the allocation (works for everything but salloc)
to get the data to the job. We update the job as steps finish and send a
new tres_alloc_str to the database which will update the job record with
the info. We could not use the job_comp message for anything but an
salloc since there is no guarantee it will get there after the last step
does.
---
Review notes (text here is ignored by git-am and is not part of the
commit message):
 * Every test of TRES_STR_CALC uses bitwise '&'. The flag is set with
   'bit_flags |= TRES_STR_CALC', so a logical '&&' test would be true
   whenever any bit in bit_flags is set.
 * _internal_step_complete() checks job_ptr->tres_alloc_cnt for NULL
   before indexing it in the job-finished test, matching the guard on
   the energy accumulation just above it.
 * NOTE(review): the energy slot is written through TRES_ARRAY_ENERGY
   but tested through TRES_ENERGY. Confirm both name the same index of
   tres_alloc_cnt; left unchanged here.
 * NOTE(review): in _load_job_state() the job_complete call is no
   longer conditional on slurmctld_init_db. Confirm that is intended.

 .../accounting_storage/mysql/as_mysql_job.c   | 14 +++++++++++++
 .../slurmdbd/accounting_storage_slurmdbd.c    |  8 +++++++
 src/slurmctld/job_mgr.c                       | 19 ++++++++++++++---
 src/slurmctld/step_mgr.c                      | 21 +++++++++++++++++++
 src/slurmdbd/proc_req.c                       |  7 +++++++
 5 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/src/plugins/accounting_storage/mysql/as_mysql_job.c b/src/plugins/accounting_storage/mysql/as_mysql_job.c
index c09a778b847..daa2431efa1 100644
--- a/src/plugins/accounting_storage/mysql/as_mysql_job.c
+++ b/src/plugins/accounting_storage/mysql/as_mysql_job.c
@@ -1296,6 +1296,20 @@ extern int as_mysql_step_complete(mysql_conn_t *mysql_conn,
 	rc = mysql_db_query(mysql_conn, query);
 	xfree(query);
 
+	/* set the energy for the entire job. */
+	if (step_ptr->job_ptr->tres_alloc_str) {
+		query = xstrdup_printf(
+			"update \"%s_%s\" set tres_alloc='%s' where "
+			"job_db_inx=%"PRIu64,
+			mysql_conn->cluster_name, job_table,
+			step_ptr->job_ptr->tres_alloc_str,
+			step_ptr->job_ptr->db_index);
+		if (debug_flags & DEBUG_FLAG_DB_STEP)
+			DB_DEBUG(mysql_conn->conn, "query\n%s", query);
+		rc = mysql_db_query(mysql_conn, query);
+		xfree(query);
+	}
+
 	return rc;
 }
 
diff --git a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c
index 572c485e5c2..edb8cfe4a86 100644
--- a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c
+++ b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c
@@ -2645,6 +2645,9 @@ extern int jobacct_storage_p_job_complete(void *db_conn,
 		req.submit_time = job_ptr->details->submit_time;
 	}
 
+	if (!(job_ptr->bit_flags & TRES_STR_CALC))
+		req.tres_alloc_str = job_ptr->tres_alloc_str;
+
 	msg.msg_type = DBD_JOB_COMPLETE;
 	msg.data = &req;
 
@@ -2738,6 +2741,7 @@ extern int jobacct_storage_p_step_start(void *db_conn,
 
 	req.total_tasks = tasks;
 	req.tres_alloc_str = step_ptr->tres_alloc_str;
+
 	req.req_cpufreq_min = step_ptr->cpu_freq_min;
 	req.req_cpufreq_max = step_ptr->cpu_freq_max;
 	req.req_cpufreq_gov = step_ptr->cpu_freq_gov;
@@ -2807,6 +2811,10 @@ extern int jobacct_storage_p_step_complete(void *db_conn,
 		req.job_submit_time = step_ptr->job_ptr->resize_time;
 	else if (step_ptr->job_ptr->details)
 		req.job_submit_time = step_ptr->job_ptr->details->submit_time;
+
+	if (step_ptr->job_ptr->bit_flags & TRES_STR_CALC)
+		req.job_tres_alloc_str = step_ptr->job_ptr->tres_alloc_str;
+
 	req.state = step_ptr->state;
 	req.step_id = step_ptr->step_id;
 	req.total_tasks = tasks;
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 62164459274..2e2fae18f3d 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -2202,9 +2202,13 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version)
 		/* make sure we have this job completed in the
 		 * database */
 		if (IS_JOB_FINISHED(job_ptr)) {
-			if (slurmctld_init_db)
-				jobacct_storage_g_job_complete(
-					acct_db_conn, job_ptr);
+			if (slurmctld_init_db &&
+			    !(job_ptr->bit_flags & TRES_STR_CALC) &&
+			    job_ptr->tres_alloc_cnt &&
+			    (job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64))
+				set_job_tres_alloc_str(job_ptr, false);
+			jobacct_storage_g_job_complete(
+				acct_db_conn, job_ptr);
 			job_finished = 1;
 		}
 	}
@@ -13325,6 +13329,11 @@ extern void job_completion_logger(struct job_record *job_ptr, bool requeue)
 	if (!with_slurmdbd && !job_ptr->db_index)
 		jobacct_storage_g_job_start(acct_db_conn, job_ptr);
 
+	if (!(job_ptr->bit_flags & TRES_STR_CALC) &&
+	    job_ptr->tres_alloc_cnt &&
+	    (job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64))
+		set_job_tres_alloc_str(job_ptr, false);
+
 	jobacct_storage_g_job_complete(acct_db_conn, job_ptr);
 }
 
@@ -15655,6 +15664,10 @@ extern bool job_hold_requeue(struct job_record *job_ptr)
 		return false;
 
 	/* Sent event requeue to the database. */
+	if (!(job_ptr->bit_flags & TRES_STR_CALC) &&
+	    job_ptr->tres_alloc_cnt &&
+	    (job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64))
+		set_job_tres_alloc_str(job_ptr, false);
 	jobacct_storage_g_job_complete(acct_db_conn, job_ptr);
 
 	debug("%s: job %u state 0x%x", __func__, job_ptr->job_id, state);
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index c4e44ce4c01..7bf2c19a130 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -245,6 +245,27 @@ static void _build_pending_step(struct job_record *job_ptr,
 static void _internal_step_complete(struct job_record *job_ptr,
 				    struct step_record *step_ptr)
 {
+	struct jobacctinfo *jobacct = (struct jobacctinfo *)step_ptr->jobacct;
+	if (jobacct && job_ptr->tres_alloc_cnt &&
+	    (jobacct->energy.consumed_energy != NO_VAL64)) {
+		if (job_ptr->tres_alloc_cnt[TRES_ARRAY_ENERGY] == NO_VAL64)
+			job_ptr->tres_alloc_cnt[TRES_ARRAY_ENERGY] = 0;
+		job_ptr->tres_alloc_cnt[TRES_ARRAY_ENERGY] +=
+			jobacct->energy.consumed_energy;
+	}
+
+	if (IS_JOB_FINISHED(job_ptr) && job_ptr->tres_alloc_cnt &&
+	    (job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64) &&
+	    (list_count(job_ptr->step_list) == 1)) {
+		set_job_tres_alloc_str(job_ptr, false);
+		/* This flag says we have processed the tres alloc including
+		 * energy from all steps, so don't process or handle it again
+		 * with the job. It also tells the slurmdbd plugin to send it
+		 * to the DBD.
+		 */
+		job_ptr->bit_flags |= TRES_STR_CALC;
+	}
+
 	jobacct_storage_g_step_complete(acct_db_conn, step_ptr);
 
 	if (step_ptr->step_id == SLURM_PENDING_STEP)
diff --git a/src/slurmdbd/proc_req.c b/src/slurmdbd/proc_req.c
index 698050d56b2..318fabd3107 100644
--- a/src/slurmdbd/proc_req.c
+++ b/src/slurmdbd/proc_req.c
@@ -1814,6 +1814,8 @@ static int _job_complete(slurmdbd_conn_t *slurmdbd_conn,
 	job.nodes = job_comp_msg->nodes;
 	job.start_time = job_comp_msg->start_time;
 	details.submit_time = job_comp_msg->submit_time;
+	job.start_protocol_ver = slurmdbd_conn->conn->version;
+	job.tres_alloc_str = job_comp_msg->tres_alloc_str;
 
 	job.details = &details;
 
@@ -1904,6 +1906,7 @@ static int _job_suspend(slurmdbd_conn_t *slurmdbd_conn,
 	job.job_id = job_suspend_msg->job_id;
 	job.job_state = job_suspend_msg->job_state;
 	details.submit_time = job_suspend_msg->submit_time;
+	job.start_protocol_ver = slurmdbd_conn->conn->version;
 	job.suspend_time = job_suspend_msg->suspend_time;
 
 	job.details = &details;
@@ -2552,6 +2555,7 @@ static void _process_job_start(slurmdbd_conn_t *slurmdbd_conn,
 	job.qos_id = job_start_msg->qos_id;
 	job.resv_id = job_start_msg->resv_id;
 	job.priority = job_start_msg->priority;
+	job.start_protocol_ver = slurmdbd_conn->conn->version;
 	job.start_time = job_start_msg->start_time;
 	job.time_limit = job_start_msg->timelimit;
 	job.tres_alloc_str = job_start_msg->tres_alloc_str;
@@ -3359,7 +3363,9 @@ static int _step_complete(slurmdbd_conn_t *slurmdbd_conn,
 	step.jobacct = step_comp_msg->jobacct;
 	job.job_id = step_comp_msg->job_id;
 	step.requid = step_comp_msg->req_uid;
+	job.start_protocol_ver = slurmdbd_conn->conn->version;
 	job.start_time = step_comp_msg->start_time;
+	job.tres_alloc_str = step_comp_msg->job_tres_alloc_str;
 	step.state = step_comp_msg->state;
 	step.step_id = step_comp_msg->step_id;
 	details.submit_time = step_comp_msg->job_submit_time;
@@ -3426,6 +3432,7 @@ static int _step_start(slurmdbd_conn_t *slurmdbd_conn,
 	step.name = step_start_msg->name;
 	job.nodes = step_start_msg->nodes;
 	step.network = step_start_msg->node_inx;
+	job.start_protocol_ver = slurmdbd_conn->conn->version;
 	step.start_time = step_start_msg->start_time;
 	details.submit_time = step_start_msg->job_submit_time;
 	step.step_id = step_start_msg->step_id;
-- 
GitLab