From 652b98700e25a6c1c20f3bd5a4e2cb2f1da98d80 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@llnl.gov> Date: Fri, 9 Apr 2010 22:11:03 +0000 Subject: [PATCH] ok, it appears things are handled correctly for jobs when they resize, it is still needed to add steps resizing --- src/common/slurm_protocol_defs.h | 2 + .../filetxt/accounting_storage_filetxt.c | 2 +- .../mysql/accounting_storage_mysql.c | 36 +++--- .../accounting_storage/mysql/as_mysql_job.c | 121 ++++++++++++------ .../mysql/as_mysql_jobacct_process.c | 32 +++-- .../accounting_storage/pgsql/jobacct.c | 87 ++++++++----- .../slurmdbd/accounting_storage_slurmdbd.c | 60 +++++---- src/plugins/jobcomp/mysql/jobcomp_mysql.c | 2 +- src/plugins/jobcomp/pgsql/jobcomp_pgsql.c | 2 +- src/plugins/jobcomp/script/jobcomp_script.c | 2 +- src/sacct/options.c | 1 - src/slurmctld/job_mgr.c | 31 +++-- src/slurmdbd/proc_req.c | 16 ++- 13 files changed, 251 insertions(+), 143 deletions(-) diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index bfb7d722826..c84dfb15828 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -99,6 +99,8 @@ ((_X->job_state & JOB_STATE_BASE) > JOB_SUSPENDED) #define IS_JOB_COMPLETED(_X) \ (IS_JOB_FINISHED(_X) && ((_X->job_state & JOB_COMPLETING) == 0)) +#define IS_JOB_RESIZING(_X) \ + (_X->job_state & JOB_RESIZING) /* Defined node states */ #define IS_NODE_UNKNOWN(_X) \ diff --git a/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c b/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c index 7730e23da70..703698c2559 100644 --- a/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c +++ b/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c @@ -598,7 +598,7 @@ extern int jobacct_storage_p_job_complete(void *db_conn, } debug2("slurmdb_job_complete() called"); - if (job_ptr->job_state & JOB_RESIZING) { + if (IS_JOB_RESIZING(job_ptr)) { job_state = JOB_RESIZING; if (job_ptr->resize_time) duration = time(NULL) - job_ptr->resize_time; diff --git a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c index c472e129525..8a04842f0ec 100644 --- a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c +++ b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c @@ -672,7 +672,7 @@ extern int create_cluster_tables(MYSQL *db_conn, char *cluster_name) storage_field_t assoc_table_fields[] = { { "creation_time", "int unsigned not null" }, { "mod_time", "int unsigned default 0 not null" }, - { "deleted", "tinyint default 0" }, + { "deleted", "tinyint default 0 not null" }, { "id_assoc", "int not null auto_increment" }, { "user", "tinytext not null default ''" }, { "acct", "tinytext not null" }, @@ -701,25 +701,25 @@ extern int create_cluster_tables(MYSQL *db_conn, char *cluster_name) storage_field_t assoc_usage_table_fields[] = { { "creation_time", "int unsigned not null" }, { "mod_time", "int unsigned default 0 not null" }, - { "deleted", "tinyint default 0" }, + { "deleted", "tinyint default 0 not null" }, { "id_assoc", "int not null" }, { "time_start", "int unsigned not null" }, - { "alloc_cpu_secs", "bigint default 0" }, + { "alloc_cpu_secs", "bigint default 0 not null" }, { NULL, NULL} }; storage_field_t cluster_usage_table_fields[] = { { "creation_time", "int unsigned not null" }, { "mod_time", "int unsigned default 0 not null" }, - { "deleted", "tinyint default 0" }, + { "deleted", "tinyint default 0 not null" }, { "time_start", "int unsigned not null" }, - { "cpu_count", "int default 0" }, - { "alloc_cpu_secs", "bigint default 0" }, - { "down_cpu_secs", "bigint default 0" }, - { "pdown_cpu_secs", "bigint default 0" }, - { "idle_cpu_secs", "bigint default 0" }, - { "resv_cpu_secs", "bigint default 0" }, - { "over_cpu_secs", "bigint default 0" }, + { "cpu_count", "int default 0 not null" }, + { "alloc_cpu_secs", "bigint default 0 not null" }, + { "down_cpu_secs", "bigint default 0 not null" }, + { "pdown_cpu_secs", "bigint default 0 not null" }, + { "idle_cpu_secs", "bigint default 0 not null" }, + { "resv_cpu_secs", "bigint default 0 not null" }, + { "over_cpu_secs", "bigint default 0 not null" }, { NULL, NULL} }; @@ -737,7 +737,7 @@ extern int create_cluster_tables(MYSQL *db_conn, char *cluster_name) storage_field_t job_table_fields[] = { { "job_db_inx", "int not null auto_increment" }, - { "deleted", "tinyint default 0" }, + { "deleted", "tinyint default 0 not null" }, { "account", "tinytext" }, { "cpus_req", "int unsigned not null" }, { "cpus_alloc", "int unsigned not null" }, @@ -756,7 +756,7 @@ extern int create_cluster_tables(MYSQL *db_conn, char *cluster_name) { "node_inx", "text" }, { "partition", "tinytext not null" }, { "priority", "int not null" }, - { "qos", "smallint default 0" }, + { "qos", "smallint default 0 not null" }, { "state", "smallint unsigned not null" }, { "timelimit", "int unsigned default 0 not null" }, { "time_submit", "int unsigned default 0 not null" }, @@ -771,7 +771,7 @@ extern int create_cluster_tables(MYSQL *db_conn, char *cluster_name) storage_field_t resv_table_fields[] = { { "id_resv", "int unsigned default 0 not null" }, - { "deleted", "tinyint default 0" }, + { "deleted", "tinyint default 0 not null" }, { "assoclist", "text not null default ''" }, { "cpus", "int unsigned not null" }, { "flags", "smallint unsigned default 0 not null" }, @@ -785,7 +785,7 @@ extern int create_cluster_tables(MYSQL *db_conn, char *cluster_name) storage_field_t step_table_fields[] = { { "job_db_inx", "int not null" }, - { "deleted", "tinyint default 0" }, + { "deleted", "tinyint default 0 not null" }, { "cpus_alloc", "int unsigned not null" }, { "exit_code", "int default 0 not null" }, { "id_step", "smallint not null" }, @@ -796,7 +796,7 @@ extern int create_cluster_tables(MYSQL *db_conn, char *cluster_name) { "state", "smallint unsigned not null" }, { "step_name", "text not null" }, { "task_cnt", "int unsigned not null" }, - { "task_dist", "smallint default 0" }, + { "task_dist", "smallint default 0 not null" }, { "time_start", "int unsigned default 0 not null" }, { "time_end", "int unsigned default 0 not null" }, { "time_suspended", "int unsigned default 0 not null" }, @@ -834,7 +834,7 @@ extern int create_cluster_tables(MYSQL *db_conn, char *cluster_name) storage_field_t wckey_table_fields[] = { { "creation_time", "int unsigned not null" }, { "mod_time", "int unsigned default 0 not null" }, - { "deleted", "tinyint default 0" }, + { "deleted", "tinyint default 0 not null" }, { "id_wckey", "int not null auto_increment" }, { "wckey_name", "tinytext not null default ''" }, { "user", "tinytext not null" }, @@ -844,7 +844,7 @@ extern int create_cluster_tables(MYSQL *db_conn, char *cluster_name) storage_field_t wckey_usage_table_fields[] = { { "creation_time", "int unsigned not null" }, { "mod_time", "int unsigned default 0 not null" }, - { "deleted", "tinyint default 0" }, + { "deleted", "tinyint default 0 not null" }, { "id_wckey", "int not null" }, { "time_start", "int unsigned not null" }, { "alloc_cpu_secs", "bigint default 0" }, diff --git a/src/plugins/accounting_storage/mysql/as_mysql_job.c b/src/plugins/accounting_storage/mysql/as_mysql_job.c index 47a29fbb123..c00c9e01fcd 100644 --- a/src/plugins/accounting_storage/mysql/as_mysql_job.c +++ b/src/plugins/accounting_storage/mysql/as_mysql_job.c @@ -206,7 +206,7 @@ no_wckeyid: /* extern functions */ extern int as_mysql_job_start(mysql_conn_t *mysql_conn, - struct job_record *job_ptr) + struct job_record *job_ptr) { int rc=SLURM_SUCCESS; char *nodes = NULL, *jname = NULL, *node_inx = NULL; @@ -217,8 +217,10 @@ extern int as_mysql_job_start(mysql_conn_t *mysql_conn, time_t begin_time, check_time, start_time, submit_time; uint32_t wckeyid = 0; int job_state, node_cnt = 0; +// uint32_t job_db_inx = job_ptr->db_index; - if (!job_ptr->details || !job_ptr->details->submit_time) { + if ((!job_ptr->details || !job_ptr->details->submit_time) + && !job_ptr->resize_time) { error("as_mysql_job_start: " "Not inputing this job, it has no submit time."); return SLURM_ERROR; @@ -228,6 +230,21 @@ extern int as_mysql_job_start(mysql_conn_t *mysql_conn, return ESLURM_DB_CONNECTION; debug2("as_mysql_slurmdb_job_start() called"); + + job_state = job_ptr->job_state; + + /* Since we need a new db_inx make sure the old db_inx + * removed. This is most likely the only time we are going to + * be notified of the change also so make the state without + * the resize. */ + if(IS_JOB_RESIZING(job_ptr)) { + as_mysql_job_complete(mysql_conn, job_ptr); + job_state &= (~JOB_RESIZING); + job_ptr->db_index = 0; + } + + job_state &= JOB_STATE_BASE; + if (job_ptr->resize_time) { begin_time = job_ptr->resize_time; submit_time = job_ptr->resize_time; @@ -237,10 +254,6 @@ extern int as_mysql_job_start(mysql_conn_t *mysql_conn, submit_time = job_ptr->details->submit_time; start_time = job_ptr->start_time; } - if (job_ptr->job_state & JOB_RESIZING) - job_state = JOB_RESIZING; - else - job_state = job_ptr->job_state & JOB_STATE_BASE; /* See what we are hearing about here if no start time. If * this job latest time is before the last roll up we will @@ -368,13 +381,6 @@ no_rollup_change: mysql_conn->cluster_name, job_ptr->assoc_id); - - /* We need to put a 0 for 'end' incase of funky job state - * files from a hot start of the controllers we call - * job_start on jobs we may still know about after - * job_flush has been called so we need to restart - * them by zeroing out the end. - */ if(!job_ptr->db_index) { if(!begin_time) begin_time = submit_time; @@ -491,6 +497,10 @@ no_rollup_change: xfree(block_id); xfree(query); + /* now we will reset all the steps */ + /* if(IS_JOB_RESIZING(job_ptr)) */ + /* rc = _resize_steps(job_db_inx, job_ptr); */ + return rc; } @@ -499,10 +509,11 @@ extern int as_mysql_job_complete(mysql_conn_t *mysql_conn, { char *query = NULL, *nodes = NULL; int rc = SLURM_SUCCESS, job_state; - time_t start_time, end_time; + time_t submit_time, end_time; if (!job_ptr->db_index - && (!job_ptr->details || !job_ptr->details->submit_time)) { + && ((!job_ptr->details || !job_ptr->details->submit_time) + && !job_ptr->resize_time)) { error("as_mysql_job_complete: " "Not inputing this job, it has no submit time."); return SLURM_ERROR; @@ -512,13 +523,13 @@ extern int as_mysql_job_complete(mysql_conn_t *mysql_conn, return ESLURM_DB_CONNECTION; debug2("as_mysql_slurmdb_job_complete() called"); - if (job_ptr->resize_time) { - start_time = job_ptr->resize_time; - } else { - start_time = job_ptr->start_time; - } - if (job_ptr->job_state & JOB_RESIZING) { - end_time = time(NULL); + if (job_ptr->resize_time) + submit_time = job_ptr->resize_time; + else + submit_time = job_ptr->details->submit_time; + + if (IS_JOB_RESIZING(job_ptr)) { + end_time = job_ptr->resize_time; job_state = JOB_RESIZING; } else { /* If we get an error with this just fall through to avoid an @@ -531,8 +542,6 @@ extern int as_mysql_job_complete(mysql_conn_t *mysql_conn, end_time = job_ptr->end_time; job_state = job_ptr->job_state & JOB_STATE_BASE; } - if (start_time > end_time) - start_time = 0; slurm_mutex_lock(&rollup_lock); if(end_time < global_last_rollup) { @@ -558,7 +567,7 @@ extern int as_mysql_job_complete(mysql_conn_t *mysql_conn, if(!job_ptr->db_index) { if(!(job_ptr->db_index = _get_db_index(mysql_conn, - job_ptr->details->submit_time, + submit_time, job_ptr->job_id, job_ptr->assoc_id))) { /* If we get an error with this just fall @@ -573,13 +582,14 @@ extern int as_mysql_job_complete(mysql_conn_t *mysql_conn, } } - query = xstrdup_printf("update \"%s_%s\" set time_start=%d, time_end=%d, " + query = xstrdup_printf("update \"%s_%s\" set time_end=%d, " "state=%d, nodelist='%s', exit_code=%d, " - "kill_requid=%d where job_db_inx=%d", + "kill_requid=%d where job_db_inx=%d;", mysql_conn->cluster_name, job_table, - start_time, end_time, job_state, + end_time, job_state, nodes, job_ptr->exit_code, job_ptr->requid, job_ptr->db_index); + debug3("%d(%s:%d) query\n%s", mysql_conn->conn, THIS_FILE, __LINE__, query); rc = mysql_db_query(mysql_conn->db_conn, query); @@ -589,25 +599,37 @@ extern int as_mysql_job_complete(mysql_conn_t *mysql_conn, } extern int as_mysql_step_start(mysql_conn_t *mysql_conn, - struct step_record *step_ptr) + struct step_record *step_ptr) { int cpus = 0, tasks = 0, nodes = 0, task_dist = 0; int rc=SLURM_SUCCESS; char node_list[BUFFER_SIZE]; char *node_inx = NULL; + time_t start_time, submit_time; + #ifdef HAVE_BG char *ionodes = NULL; #endif char *query = NULL; if (!step_ptr->job_ptr->db_index - && (!step_ptr->job_ptr->details - || !step_ptr->job_ptr->details->submit_time)) { + && ((!step_ptr->job_ptr->details + || !step_ptr->job_ptr->details->submit_time) + && !step_ptr->job_ptr->resize_time)) { error("as_mysql_step_start: " "Not inputing this job, it has no submit time."); return SLURM_ERROR; } + if (step_ptr->job_ptr->resize_time) { + submit_time = start_time = step_ptr->job_ptr->resize_time; + if(step_ptr->start_time > submit_time) + start_time = step_ptr->start_time; + } else { + start_time = step_ptr->start_time; + submit_time = step_ptr->job_ptr->details->submit_time; + } + if(check_connection(mysql_conn) != SLURM_SUCCESS) return ESLURM_DB_CONNECTION; if(slurmdbd_conf) { @@ -660,7 +682,7 @@ extern int as_mysql_step_start(mysql_conn_t *mysql_conn, if(!step_ptr->job_ptr->db_index) { if(!(step_ptr->job_ptr->db_index = _get_db_index(mysql_conn, - step_ptr->job_ptr->details->submit_time, + submit_time, step_ptr->job_ptr->job_id, step_ptr->job_ptr->assoc_id))) { /* If we get an error with this just fall @@ -690,7 +712,7 @@ extern int as_mysql_step_start(mysql_conn_t *mysql_conn, mysql_conn->cluster_name, step_table, step_ptr->job_ptr->db_index, step_ptr->step_id, - (int)step_ptr->start_time, step_ptr->name, + (int)start_time, step_ptr->name, JOB_RUNNING, cpus, nodes, tasks, node_list, node_inx, task_dist, cpus, nodes, tasks, JOB_RUNNING, node_list, node_inx, task_dist); @@ -716,15 +738,26 @@ extern int as_mysql_step_complete(mysql_conn_t *mysql_conn, char *query = NULL; int rc =SLURM_SUCCESS; uint32_t exit_code = 0; + time_t start_time, submit_time; if (!step_ptr->job_ptr->db_index - && (!step_ptr->job_ptr->details - || !step_ptr->job_ptr->details->submit_time)) { + && ((!step_ptr->job_ptr->details + || !step_ptr->job_ptr->details->submit_time) + && !step_ptr->job_ptr->resize_time)) { error("as_mysql_step_complete: " "Not inputing this job, it has no submit time."); return SLURM_ERROR; } + if (step_ptr->job_ptr->resize_time) { + submit_time = start_time = step_ptr->job_ptr->resize_time; + if(step_ptr->start_time > submit_time) + start_time = step_ptr->start_time; + } else { + start_time = step_ptr->start_time; + submit_time = step_ptr->job_ptr->details->submit_time; + } + if (jobacct == NULL) { /* JobAcctGather=slurmdb_gather/none, no data to process */ memset(&dummy_jobacct, 0, sizeof(dummy_jobacct)); @@ -753,8 +786,8 @@ extern int as_mysql_step_complete(mysql_conn_t *mysql_conn, #endif } - if ((elapsed=now-step_ptr->start_time)<0) - elapsed=0; /* For *very* short jobs, if clock is wrong */ + if ((elapsed = (now - start_time)) < 0) + elapsed = 0; /* For *very* short jobs, if clock is wrong */ exit_code = step_ptr->exit_code; if (exit_code == NO_VAL) { @@ -786,7 +819,7 @@ extern int as_mysql_step_complete(mysql_conn_t *mysql_conn, if(!step_ptr->job_ptr->db_index) { if(!(step_ptr->job_ptr->db_index = _get_db_index(mysql_conn, - step_ptr->job_ptr->details->submit_time, + submit_time, step_ptr->job_ptr->job_id, step_ptr->job_ptr->assoc_id))) { /* If we get an error with this just fall @@ -853,18 +886,26 @@ extern int as_mysql_step_complete(mysql_conn_t *mysql_conn, return rc; } -extern int as_mysql_suspend(mysql_conn_t *mysql_conn, struct job_record *job_ptr) +extern int as_mysql_suspend(mysql_conn_t *mysql_conn, + struct job_record *job_ptr) { char *query = NULL; int rc = SLURM_SUCCESS; bool suspended = false; + time_t submit_time; if(check_connection(mysql_conn) != SLURM_SUCCESS) return ESLURM_DB_CONNECTION; + + if (job_ptr->resize_time) + submit_time = job_ptr->resize_time; + else + submit_time = job_ptr->details->submit_time; + if(!job_ptr->db_index) { if(!(job_ptr->db_index = _get_db_index(mysql_conn, - job_ptr->details->submit_time, + submit_time, job_ptr->job_id, job_ptr->assoc_id))) { /* If we get an error with this just fall diff --git a/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c b/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c index cd0f291477f..64f756378a2 100644 --- a/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c +++ b/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c @@ -322,7 +322,7 @@ static int _cluster_get_jobs(mysql_conn_t *mysql_conn, int set = 0; char *prefix="t2"; int rc = SLURM_SUCCESS; - int last_id = -1, curr_id = -1; + int last_id = -1, curr_id = -1, last_state = -1; local_cluster_t *curr_cluster = NULL; /* This is here to make sure we are looking at only this user @@ -390,12 +390,10 @@ static int _cluster_get_jobs(mysql_conn_t *mysql_conn, } /* Here we want to order them this way in such a way so it is - easy to look for duplicates + easy to look for duplicates, it is also easy to sort the + resized jobs. */ - if(job_cond && !job_cond->duplicates) - xstrcat(query, " order by id_job, time_submit desc"); - else - xstrcat(query, " order by time_submit desc"); + xstrcat(query, " group by id_job, time_submit desc"); debug3("%d(%s:%d) query\n%s", mysql_conn->conn, THIS_FILE, __LINE__, query); @@ -413,22 +411,32 @@ static int _cluster_get_jobs(mysql_conn_t *mysql_conn, curr_id = atoi(row[JOB_REQ_JOBID]); - if(job_cond && !job_cond->duplicates && curr_id == last_id) + if(job_cond && !job_cond->duplicates + && (curr_id == last_id) + && (atoi(row[JOB_REQ_STATE]) != JOB_RESIZING)) continue; - last_id = curr_id; - /* check the bitmap to see if this is one of the jobs we are looking for */ if(!good_nodes_from_inx(local_cluster_list, (void **)&curr_cluster, - row[JOB_REQ_NODE_INX], submit)) + row[JOB_REQ_NODE_INX], submit)) { + last_id = curr_id; continue; + } job = slurmdb_create_job_rec(); - list_append(job_list, job); - job->state = atoi(row[JOB_REQ_STATE]); + last_state = job->state; + if(curr_id == last_id) + /* put in reverse so we order by the submit getting + larger which it is given to us in reverse + order from the database */ + list_prepend(job_list, job); + else + list_append(job_list, job); + last_id = curr_id; + job->alloc_cpus = atoi(row[JOB_REQ_ALLOC_CPUS]); job->alloc_nodes = atoi(row[JOB_REQ_ALLOC_NODES]); job->associd = atoi(row[JOB_REQ_ASSOCID]); diff --git a/src/plugins/accounting_storage/pgsql/jobacct.c b/src/plugins/accounting_storage/pgsql/jobacct.c index 5cab8b2d3d4..46298b903be 100644 --- a/src/plugins/accounting_storage/pgsql/jobacct.c +++ b/src/plugins/accounting_storage/pgsql/jobacct.c @@ -190,13 +190,20 @@ _get_db_index(pgsql_conn_t *pg_conn, time_t submit, uint32_t jobid, static int _check_job_db_index(pgsql_conn_t *pg_conn, struct job_record *job_ptr) { + time_t submit_time; + + if (job_ptr->resize_time) + submit_time = job_ptr->resize_time; + else + submit_time = job_ptr->details->submit_time; + if(!job_ptr->db_index) { job_ptr->db_index = _get_db_index( pg_conn, - job_ptr->details->submit_time, + submit_time, job_ptr->job_id, job_ptr->assoc_id); - if (! job_ptr->db_index) { + if (!job_ptr->db_index) { /* If we get an error with this just fall * through to avoid an infinite loop */ @@ -375,7 +382,8 @@ js_p_job_start(pgsql_conn_t *pg_conn, int job_state, node_cnt = 0; uint32_t wckeyid = 0; - if (!job_ptr->details || !job_ptr->details->submit_time) { + if ((!job_ptr->details || !job_ptr->details->submit_time) + && !job_ptr->resize_time) { error("as/pg: job_start: Not inputing this job, " "it has no submit time."); return SLURM_ERROR; @@ -385,6 +393,21 @@ js_p_job_start(pgsql_conn_t *pg_conn, return ESLURM_DB_CONNECTION; debug3("as/pg: job_start() called"); + + job_state = job_ptr->job_state; + + /* Since we need a new db_inx make sure the old db_inx + * removed. This is most likely the only time we are going to + * be notified of the change also so make the state without + * the resize. */ + if(job_state & JOB_RESIZING) { + js_p_job_complete(pg_conn, job_ptr); + job_state &= (~JOB_RESIZING); + job_ptr->db_index = 0; + } + + job_state &= JOB_STATE_BASE; + if (job_ptr->resize_time) { begin_time = job_ptr->resize_time; submit_time = job_ptr->resize_time; @@ -394,10 +417,6 @@ js_p_job_start(pgsql_conn_t *pg_conn, submit_time = job_ptr->details->submit_time; start_time = job_ptr->start_time; } - if (job_ptr->job_state & JOB_RESIZING) - job_state = JOB_RESIZING; - else - job_state = job_ptr->job_state & JOB_STATE_BASE; /* See what we are hearing about here if no start time. If * this job latest time is before the last roll up we will @@ -512,12 +531,6 @@ no_rollup_change: job_ptr->user_id, pg_conn->cluster_name, job_ptr->assoc_id); - /* We need to put a 0 for 'end' incase of funky job state - * files from a hot start of the controllers we call - * job_start on jobs we may still know about after - * job_flush has been called so we need to restart - * them by zeroing out the end. - */ if(!job_ptr->db_index) { if (!begin_time) begin_time = submit_time; @@ -626,10 +639,11 @@ js_p_job_complete(pgsql_conn_t *pg_conn, { char *query = NULL, *nodes = NULL; int rc = SLURM_SUCCESS, job_state; - time_t start_time, end_time; + time_t end_time; if (!job_ptr->db_index - && (!job_ptr->details || !job_ptr->details->submit_time)) { + && ((!job_ptr->details || !job_ptr->details->submit_time) + && !job_ptr->resize_time)) { error("jobacct_storage_p_job_complete: " "Not inputing this job, it has no submit time."); return SLURM_ERROR; @@ -640,13 +654,8 @@ js_p_job_complete(pgsql_conn_t *pg_conn, debug2("as/pg: job_complete() called"); - if (job_ptr->resize_time) { - start_time = job_ptr->resize_time; - } else { - start_time = job_ptr->start_time; - } - if (job_ptr->job_state & JOB_RESIZING) { - end_time = time(NULL); + if (IS_JOB_RESIZING(job_ptr)) { + end_time = job_ptr->resize_time; job_state = JOB_RESIZING; } else { /* If we get an error with this just fall through to avoid an @@ -659,8 +668,6 @@ js_p_job_complete(pgsql_conn_t *pg_conn, end_time = job_ptr->end_time; job_state = job_ptr->job_state & JOB_STATE_BASE; } - if (start_time > end_time) - start_time = 0; slurm_mutex_lock(&rollup_lock); if(end_time < global_last_rollup) { @@ -686,10 +693,10 @@ js_p_job_complete(pgsql_conn_t *pg_conn, if (_check_job_db_index(pg_conn, job_ptr) != SLURM_SUCCESS) return SLURM_SUCCESS; - query = xstrdup_printf("UPDATE %s SET start=%d, endtime=%d, state=%d, " + query = xstrdup_printf("UPDATE %s SET endtime=%d, state=%d, " "nodelist='%s', comp_code=%d, " "kill_requid=%d WHERE id=%d", - job_table, start_time, end_time, job_state, + job_table, end_time, job_state, nodes, job_ptr->exit_code, job_ptr->requid, job_ptr->db_index); rc = DEF_QUERY_RET_RC; @@ -716,15 +723,22 @@ js_p_step_start(pgsql_conn_t *pg_conn, char *ionodes = NULL; #endif char *query = NULL, *rec = NULL; + time_t start_time; if (!step_ptr->job_ptr->db_index - && (!step_ptr->job_ptr->details - || !step_ptr->job_ptr->details->submit_time)) { + && ((!step_ptr->job_ptr->details + || !step_ptr->job_ptr->details->submit_time) + && !step_ptr->job_ptr->resize_time)) { error("jobacct_storage_p_step_start: " "Not inputing this job step, it has no submit time."); return SLURM_ERROR; } + if(step_ptr->start_time > step_ptr->job_ptr->resize_time) + start_time = step_ptr->start_time; + else + start_time = step_ptr->job_ptr->resize_time; + if(check_db_connection(pg_conn) != SLURM_SUCCESS) return ESLURM_DB_CONNECTION; @@ -785,7 +799,7 @@ js_p_step_start(pgsql_conn_t *pg_conn, step_ptr->job_ptr->db_index, /* deleted=0 */ step_ptr->step_id, - step_ptr->start_time, + start_time, /* endtime=0 */ /* suspended=0 */ step_ptr->name ?: "", @@ -829,15 +843,22 @@ js_p_step_complete(pgsql_conn_t *pg_conn, char *query = NULL; int rc =SLURM_SUCCESS; uint32_t exit_code; + time_t start_time; if (!step_ptr->job_ptr->db_index - && (!step_ptr->job_ptr->details - || !step_ptr->job_ptr->details->submit_time)) { + && ((!step_ptr->job_ptr->details + || !step_ptr->job_ptr->details->submit_time) + && !step_ptr->job_ptr->resize_time)) { error("jobacct_storage_p_step_complete: " "Not inputing this job step, it has no submit time."); return SLURM_ERROR; } + if(step_ptr->start_time > step_ptr->job_ptr->resize_time) + start_time = step_ptr->start_time; + else + start_time = step_ptr->job_ptr->resize_time; + if(check_db_connection(pg_conn) != SLURM_SUCCESS) return ESLURM_DB_CONNECTION; @@ -866,8 +887,8 @@ js_p_step_complete(pgsql_conn_t *pg_conn, #endif } - if ((elapsed=now-step_ptr->start_time)<0) - elapsed=0; /* For *very* short jobs, if clock is wrong */ + if ((elapsed = (now - start_time)) < 0) + elapsed = 0; /* For *very* short jobs, if clock is wrong */ exit_code = step_ptr->exit_code; if (exit_code == NO_VAL) { diff --git a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c index faa6394bcce..460766cd5d6 100644 --- a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c +++ b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c @@ -1620,24 +1620,21 @@ extern int jobacct_storage_p_job_start(void *db_conn, req.alloc_nodes = job_ptr->node_cnt; #endif req.block_id = block_id; - req.db_index = job_ptr->db_index; if (job_ptr->resize_time) { req.eligible_time = job_ptr->resize_time; - req.start_time = job_ptr->resize_time; req.submit_time = job_ptr->resize_time; - } else { - if (job_ptr->details) { - req.eligible_time = job_ptr->details->begin_time; - req.submit_time = job_ptr->details->submit_time; - } - req.start_time = job_ptr->start_time; + } else if (job_ptr->details) { + req.eligible_time = job_ptr->details->begin_time; + req.submit_time = job_ptr->details->submit_time; } + + req.start_time = job_ptr->start_time; req.gid = job_ptr->group_id; req.job_id = job_ptr->job_id; - if (job_ptr->job_state & JOB_RESIZING) - req.job_state = JOB_RESIZING; - else - req.job_state = job_ptr->job_state & JOB_STATE_BASE; + + req.db_index = job_ptr->db_index; + + req.job_state = job_ptr->job_state; req.name = job_ptr->name; req.nodes = job_ptr->nodes; if(job_ptr->node_bitmap) { @@ -1660,7 +1657,7 @@ extern int jobacct_storage_p_job_start(void *db_conn, /* if we already have the db_index don't wait around for it * again just send the message */ - if(req.db_index) { + if(req.db_index && !IS_JOB_RESIZING(job_ptr)) { if (slurm_send_slurmdbd_msg(SLURMDBD_VERSION, &msg) < 0) { xfree(block_id); return SLURM_ERROR; @@ -1715,8 +1712,8 @@ extern int jobacct_storage_p_job_complete(void *db_conn, req.db_index = job_ptr->db_index; req.exit_code = job_ptr->exit_code; req.job_id = job_ptr->job_id; - if (job_ptr->job_state & JOB_RESIZING) { - req.end_time = time(NULL); + if (IS_JOB_RESIZING(job_ptr)) { + req.end_time = job_ptr->resize_time; req.job_state = JOB_RESIZING; } else { req.end_time = job_ptr->end_time; @@ -1724,6 +1721,7 @@ extern int jobacct_storage_p_job_complete(void *db_conn, } req.req_uid = job_ptr->requid; req.nodes = job_ptr->nodes; + if (job_ptr->resize_time) { req.start_time = job_ptr->resize_time; req.submit_time = job_ptr->resize_time; @@ -1810,8 +1808,15 @@ extern int jobacct_storage_p_step_start(void *db_conn, step_ptr->step_node_bitmap); } req.node_cnt = nodes; - req.start_time = step_ptr->start_time; - if (step_ptr->job_ptr->details) + + if(step_ptr->start_time > step_ptr->job_ptr->resize_time) + req.start_time = step_ptr->start_time; + else + req.start_time = step_ptr->job_ptr->resize_time; + + if (step_ptr->job_ptr->resize_time) + req.job_submit_time = step_ptr->job_ptr->resize_time; + else if (step_ptr->job_ptr->details) req.job_submit_time = step_ptr->job_ptr->details->submit_time; req.step_id = step_ptr->step_id; if (step_ptr->step_layout) @@ -1873,8 +1878,9 @@ extern int jobacct_storage_p_step_complete(void *db_conn, #endif if (!step_ptr->job_ptr->db_index - && (!step_ptr->job_ptr->details - || !step_ptr->job_ptr->details->submit_time)) { + && ((!step_ptr->job_ptr->details + || !step_ptr->job_ptr->details->submit_time) + && !step_ptr->job_ptr->resize_time)) { error("jobacct_storage_p_step_complete: " "Not inputing this job, it has no submit time."); return SLURM_ERROR; @@ -1889,8 +1895,14 @@ extern int jobacct_storage_p_step_complete(void *db_conn, req.jobacct = step_ptr->jobacct; req.job_id = step_ptr->job_ptr->job_id; req.req_uid = step_ptr->requid; - req.start_time = step_ptr->start_time; - if (step_ptr->job_ptr->details) + if(step_ptr->start_time > step_ptr->job_ptr->resize_time) + req.start_time = step_ptr->start_time; + else + req.start_time = step_ptr->job_ptr->resize_time; + + if (step_ptr->job_ptr->resize_time) + req.job_submit_time = step_ptr->job_ptr->resize_time; + else if (step_ptr->job_ptr->details) req.job_submit_time = step_ptr->job_ptr->details->submit_time; req.step_id = step_ptr->step_id; req.total_cpus = cpus; @@ -1920,8 +1932,12 @@ extern int jobacct_storage_p_suspend(void *db_conn, req.job_id = job_ptr->job_id; req.db_index = job_ptr->db_index; req.job_state = job_ptr->job_state & JOB_STATE_BASE; - if (job_ptr->details) + + if (job_ptr->resize_time) + req.submit_time = job_ptr->resize_time; + else if (job_ptr->details) req.submit_time = job_ptr->details->submit_time; + req.suspend_time = job_ptr->suspend_time; msg.msg_type = DBD_JOB_SUSPEND; msg.data = &req; diff --git a/src/plugins/jobcomp/mysql/jobcomp_mysql.c b/src/plugins/jobcomp/mysql/jobcomp_mysql.c index 542d7966282..0ae69e3452c 100644 --- a/src/plugins/jobcomp/mysql/jobcomp_mysql.c +++ b/src/plugins/jobcomp/mysql/jobcomp_mysql.c @@ -318,7 +318,7 @@ extern int slurm_jobcomp_log_record(struct job_record *job_ptr) /* Job will typically be COMPLETING when this is called. * We remove the flags to get the eventual completion state: * JOB_FAILED, JOB_TIMEOUT, etc. */ - if (job_ptr->job_state & JOB_RESIZING) { + if (IS_JOB_RESIZING(job_ptr)) { job_state = JOB_RESIZING; if (job_ptr->resize_time) start_time = job_ptr->resize_time; diff --git a/src/plugins/jobcomp/pgsql/jobcomp_pgsql.c b/src/plugins/jobcomp/pgsql/jobcomp_pgsql.c index 53ef6a80db9..3ec7ceb7061 100644 --- a/src/plugins/jobcomp/pgsql/jobcomp_pgsql.c +++ b/src/plugins/jobcomp/pgsql/jobcomp_pgsql.c @@ -340,7 +340,7 @@ extern int slurm_jobcomp_log_record(struct job_record *job_ptr) /* Job will typically be COMPLETING when this is called. * We remove the flags to get the eventual completion state: * JOB_FAILED, JOB_TIMEOUT, etc. */ - if (job_ptr->job_state & JOB_RESIZING) { + if (IS_JOB_RESIZING(job_ptr)) { job_state = JOB_RESIZING; if (job_ptr->resize_time) start_time = job_ptr->resize_time; diff --git a/src/plugins/jobcomp/script/jobcomp_script.c b/src/plugins/jobcomp/script/jobcomp_script.c index f6ee790bd00..6f15422317d 100644 --- a/src/plugins/jobcomp/script/jobcomp_script.c +++ b/src/plugins/jobcomp/script/jobcomp_script.c @@ -205,7 +205,7 @@ static struct jobcomp_info * _jobcomp_info_create (struct job_record *job) j->gid = job->group_id; j->name = xstrdup (job->name); - if (job->job_state & JOB_RESIZING) { + if (IS_JOB_RESIZING(job)) { state = JOB_RESIZING; j->jobstate = xstrdup (job_state_string (state)); if (job->resize_time) diff --git a/src/sacct/options.c b/src/sacct/options.c index f3587586c83..eb5f085c229 100644 --- a/src/sacct/options.c +++ b/src/sacct/options.c @@ -523,7 +523,6 @@ int get_data(void) jobs = g_slurm_jobcomp_get_jobs(job_cond); return SLURM_SUCCESS; } else { - jobs = slurmdb_jobs_get(acct_db_conn, job_cond); } diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 0f9406f39e0..492c2e451ab 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -1754,7 +1754,8 @@ extern int kill_running_job_by_node_name(char *node_name) job_ptr->job_id, node_name); _set_job_prio(job_ptr); snprintf(requeue_msg, sizeof(requeue_msg), - "Job requeued due to failure of node %s", + "Job requeued due to failure " + "of node %s", node_name); slurm_sched_requeue(job_ptr, requeue_msg); job_ptr->time_last_active = now; @@ -6028,10 +6029,14 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) excise_node_from_job(job_ptr, node_ptr); } job_post_resize_acctg(job_ptr); - } + update_accounting = false; + } else + /* Since job_post_resize_acctg will restart + things don't do it again. */ + update_accounting = true; + FREE_NULL_BITMAP(req_bitmap); xfree(job_specs->req_nodes); - update_accounting = true; } #endif @@ -6100,8 +6105,10 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) info("sched: update_job: set nodes to %s for " "job_id %u", job_ptr->nodes, job_specs->job_id); + /* Since job_post_resize_acctg will restart + things don't do it again. */ + update_accounting = false; } - update_accounting = true; } #endif @@ -6366,6 +6373,7 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) #endif if(update_accounting) { + info("updating accounting"); if (job_ptr->details && job_ptr->details->begin_time) { /* Update job record in accounting to reflect * changes */ @@ -6380,9 +6388,9 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) extern void job_pre_resize_acctg(struct job_record *job_ptr) { job_ptr->job_state |= JOB_RESIZING; + job_ptr->resize_time = time(NULL); job_completion_logger(job_ptr); job_ptr->job_state &= (~JOB_RESIZING); - job_ptr->resize_time = time(NULL); } /* Record accounting information for a job immediately after changing size */ @@ -6997,14 +7005,13 @@ void job_fini (void) extern void job_completion_logger(struct job_record *job_ptr) { int base_state; - bool job_resizing, sent_start = false; + bool sent_start = false; xassert(job_ptr); - job_resizing = job_ptr->job_state & JOB_RESIZING; acct_policy_remove_job_submit(job_ptr); - if (!job_resizing) { + if (!IS_JOB_RESIZING(job_ptr)) { /* Remove configuring state just to make sure it isn't there * since it will throw off displays of the job. */ job_ptr->job_state &= (~JOB_CONFIGURING); @@ -7028,6 +7035,11 @@ extern void job_completion_logger(struct job_record *job_ptr) g_slurm_jobcomp_write(job_ptr); + /* When starting the resized job everything is taken care of + there, so don't call it here. */ + if (IS_JOB_RESIZING(job_ptr)) + return; + if(!job_ptr->assoc_id) { slurmdb_association_rec_t assoc_rec; /* In case accounting enabled after starting the job */ @@ -7053,9 +7065,8 @@ extern void job_completion_logger(struct job_record *job_ptr) * keep track of all jobs, so we will set the db_inx to * INFINITE and the database will understand what happened. */ - if(!job_ptr->nodes && !job_ptr->db_index && !sent_start) { + if(!job_ptr->nodes && !job_ptr->db_index && !sent_start) jobacct_storage_g_job_start(acct_db_conn, job_ptr); - } jobacct_storage_g_job_complete(acct_db_conn, job_ptr); } diff --git a/src/slurmdbd/proc_req.c b/src/slurmdbd/proc_req.c index 72c9cb7a995..0f6c2b96eb9 100644 --- a/src/slurmdbd/proc_req.c +++ b/src/slurmdbd/proc_req.c @@ -1789,8 +1789,6 @@ static int _job_complete(slurmdbd_conn_t *slurmdbd_conn, goto end_it; } - debug2("DBD_JOB_COMPLETE: ID:%u ", job_comp_msg->job_id); - memset(&job, 0, sizeof(struct job_record)); memset(&details, 0, sizeof(struct job_details)); @@ -1806,6 +1804,13 @@ static int _job_complete(slurmdbd_conn_t *slurmdbd_conn, details.submit_time = job_comp_msg->submit_time; job.details = &details; + + if(job.job_state & JOB_RESIZING) { + job.resize_time = job_comp_msg->end_time; + debug2("DBD_JOB_COMPLETE: RESIZE ID:%u", job_comp_msg->job_id); + } else + debug2("DBD_JOB_COMPLETE: ID:%u", job_comp_msg->job_id); + rc = jobacct_storage_g_job_complete(slurmdbd_conn->db_conn, &job); if(rc && errno == 740) /* meaning data is already there */ @@ -1876,7 +1881,12 @@ static int _job_start(slurmdbd_conn_t *slurmdbd_conn, job.details = &details; - if(job.start_time) { + if(job.job_state & JOB_RESIZING) { + job.resize_time = job_start_msg->eligible_time; + debug2("DBD_JOB_START: RESIZE CALL ID:%u NAME:%s INX:%u", + job_start_msg->job_id, job_start_msg->name, + job.db_index); + } else if(job.start_time) { debug2("DBD_JOB_START: START CALL ID:%u NAME:%s INX:%u", job_start_msg->job_id, job_start_msg->name, job.db_index); -- GitLab