From 652b98700e25a6c1c20f3bd5a4e2cb2f1da98d80 Mon Sep 17 00:00:00 2001
From: Danny Auble <da@llnl.gov>
Date: Fri, 9 Apr 2010 22:11:03 +0000
Subject: [PATCH] Jobs now appear to be handled correctly when they resize;
 step resizing still needs to be added

---
 src/common/slurm_protocol_defs.h              |   2 +
 .../filetxt/accounting_storage_filetxt.c      |   2 +-
 .../mysql/accounting_storage_mysql.c          |  36 +++---
 .../accounting_storage/mysql/as_mysql_job.c   | 121 ++++++++++++------
 .../mysql/as_mysql_jobacct_process.c          |  32 +++--
 .../accounting_storage/pgsql/jobacct.c        |  87 ++++++++-----
 .../slurmdbd/accounting_storage_slurmdbd.c    |  60 +++++----
 src/plugins/jobcomp/mysql/jobcomp_mysql.c     |   2 +-
 src/plugins/jobcomp/pgsql/jobcomp_pgsql.c     |   2 +-
 src/plugins/jobcomp/script/jobcomp_script.c   |   2 +-
 src/sacct/options.c                           |   1 -
 src/slurmctld/job_mgr.c                       |  31 +++--
 src/slurmdbd/proc_req.c                       |  16 ++-
 13 files changed, 251 insertions(+), 143 deletions(-)

diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index bfb7d722826..c84dfb15828 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -99,6 +99,8 @@
 	((_X->job_state & JOB_STATE_BASE) >  JOB_SUSPENDED)
 #define IS_JOB_COMPLETED(_X)		\
 	(IS_JOB_FINISHED(_X) && ((_X->job_state & JOB_COMPLETING) == 0))
+#define IS_JOB_RESIZING(_X)		\
+	(_X->job_state & JOB_RESIZING)
 
 /* Defined node states */
 #define IS_NODE_UNKNOWN(_X)		\
diff --git a/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c b/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c
index 7730e23da70..703698c2559 100644
--- a/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c
+++ b/src/plugins/accounting_storage/filetxt/accounting_storage_filetxt.c
@@ -598,7 +598,7 @@ extern int jobacct_storage_p_job_complete(void *db_conn,
 	}
 
 	debug2("slurmdb_job_complete() called");
-	if (job_ptr->job_state & JOB_RESIZING) {
+	if (IS_JOB_RESIZING(job_ptr)) {
 		job_state = JOB_RESIZING;
 		if (job_ptr->resize_time)
 			duration = time(NULL) - job_ptr->resize_time;
diff --git a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c
index c472e129525..8a04842f0ec 100644
--- a/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c
+++ b/src/plugins/accounting_storage/mysql/accounting_storage_mysql.c
@@ -672,7 +672,7 @@ extern int create_cluster_tables(MYSQL *db_conn, char *cluster_name)
 	storage_field_t assoc_table_fields[] = {
 		{ "creation_time", "int unsigned not null" },
 		{ "mod_time", "int unsigned default 0 not null" },
-		{ "deleted", "tinyint default 0" },
+		{ "deleted", "tinyint default 0 not null" },
 		{ "id_assoc", "int not null auto_increment" },
 		{ "user", "tinytext not null default ''" },
 		{ "acct", "tinytext not null" },
@@ -701,25 +701,25 @@ extern int create_cluster_tables(MYSQL *db_conn, char *cluster_name)
 	storage_field_t assoc_usage_table_fields[] = {
 		{ "creation_time", "int unsigned not null" },
 		{ "mod_time", "int unsigned default 0 not null" },
-		{ "deleted", "tinyint default 0" },
+		{ "deleted", "tinyint default 0 not null" },
 		{ "id_assoc", "int not null" },
 		{ "time_start", "int unsigned not null" },
-		{ "alloc_cpu_secs", "bigint default 0" },
+		{ "alloc_cpu_secs", "bigint default 0 not null" },
 		{ NULL, NULL}
 	};
 
 	storage_field_t cluster_usage_table_fields[] = {
 		{ "creation_time", "int unsigned not null" },
 		{ "mod_time", "int unsigned default 0 not null" },
-		{ "deleted", "tinyint default 0" },
+		{ "deleted", "tinyint default 0 not null" },
 		{ "time_start", "int unsigned not null" },
-		{ "cpu_count", "int default 0" },
-		{ "alloc_cpu_secs", "bigint default 0" },
-		{ "down_cpu_secs", "bigint default 0" },
-		{ "pdown_cpu_secs", "bigint default 0" },
-		{ "idle_cpu_secs", "bigint default 0" },
-		{ "resv_cpu_secs", "bigint default 0" },
-		{ "over_cpu_secs", "bigint default 0" },
+		{ "cpu_count", "int default 0 not null" },
+		{ "alloc_cpu_secs", "bigint default 0 not null" },
+		{ "down_cpu_secs", "bigint default 0 not null" },
+		{ "pdown_cpu_secs", "bigint default 0 not null" },
+		{ "idle_cpu_secs", "bigint default 0 not null" },
+		{ "resv_cpu_secs", "bigint default 0 not null" },
+		{ "over_cpu_secs", "bigint default 0 not null" },
 		{ NULL, NULL}
 	};
 
@@ -737,7 +737,7 @@ extern int create_cluster_tables(MYSQL *db_conn, char *cluster_name)
 
 	storage_field_t job_table_fields[] = {
 		{ "job_db_inx", "int not null auto_increment" },
-		{ "deleted", "tinyint default 0" },
+		{ "deleted", "tinyint default 0 not null" },
 		{ "account", "tinytext" },
 		{ "cpus_req", "int unsigned not null" },
 		{ "cpus_alloc", "int unsigned not null" },
@@ -756,7 +756,7 @@ extern int create_cluster_tables(MYSQL *db_conn, char *cluster_name)
 		{ "node_inx", "text" },
 		{ "partition", "tinytext not null" },
 		{ "priority", "int not null" },
-		{ "qos", "smallint default 0" },
+		{ "qos", "smallint default 0 not null" },
 		{ "state", "smallint unsigned not null" },
 		{ "timelimit", "int unsigned default 0 not null" },
 		{ "time_submit", "int unsigned default 0 not null" },
@@ -771,7 +771,7 @@ extern int create_cluster_tables(MYSQL *db_conn, char *cluster_name)
 
 	storage_field_t resv_table_fields[] = {
 		{ "id_resv", "int unsigned default 0 not null" },
-		{ "deleted", "tinyint default 0" },
+		{ "deleted", "tinyint default 0 not null" },
 		{ "assoclist", "text not null default ''" },
 		{ "cpus", "int unsigned not null" },
 		{ "flags", "smallint unsigned default 0 not null" },
@@ -785,7 +785,7 @@ extern int create_cluster_tables(MYSQL *db_conn, char *cluster_name)
 
 	storage_field_t step_table_fields[] = {
 		{ "job_db_inx", "int not null" },
-		{ "deleted", "tinyint default 0" },
+		{ "deleted", "tinyint default 0 not null" },
 		{ "cpus_alloc", "int unsigned not null" },
 		{ "exit_code", "int default 0 not null" },
 		{ "id_step", "smallint not null" },
@@ -796,7 +796,7 @@ extern int create_cluster_tables(MYSQL *db_conn, char *cluster_name)
 		{ "state", "smallint unsigned not null" },
 		{ "step_name", "text not null" },
 		{ "task_cnt", "int unsigned not null" },
-		{ "task_dist", "smallint default 0" },
+		{ "task_dist", "smallint default 0 not null" },
 		{ "time_start", "int unsigned default 0 not null" },
 		{ "time_end", "int unsigned default 0 not null" },
 		{ "time_suspended", "int unsigned default 0 not null" },
@@ -834,7 +834,7 @@ extern int create_cluster_tables(MYSQL *db_conn, char *cluster_name)
 	storage_field_t wckey_table_fields[] = {
 		{ "creation_time", "int unsigned not null" },
 		{ "mod_time", "int unsigned default 0 not null" },
-		{ "deleted", "tinyint default 0" },
+		{ "deleted", "tinyint default 0 not null" },
 		{ "id_wckey", "int not null auto_increment" },
 		{ "wckey_name", "tinytext not null default ''" },
 		{ "user", "tinytext not null" },
@@ -844,7 +844,7 @@ extern int create_cluster_tables(MYSQL *db_conn, char *cluster_name)
 	storage_field_t wckey_usage_table_fields[] = {
 		{ "creation_time", "int unsigned not null" },
 		{ "mod_time", "int unsigned default 0 not null" },
-		{ "deleted", "tinyint default 0" },
+		{ "deleted", "tinyint default 0 not null" },
 		{ "id_wckey", "int not null" },
 		{ "time_start", "int unsigned not null" },
 		{ "alloc_cpu_secs", "bigint default 0" },
diff --git a/src/plugins/accounting_storage/mysql/as_mysql_job.c b/src/plugins/accounting_storage/mysql/as_mysql_job.c
index 47a29fbb123..c00c9e01fcd 100644
--- a/src/plugins/accounting_storage/mysql/as_mysql_job.c
+++ b/src/plugins/accounting_storage/mysql/as_mysql_job.c
@@ -206,7 +206,7 @@ no_wckeyid:
 /* extern functions */
 
 extern int as_mysql_job_start(mysql_conn_t *mysql_conn,
-			   struct job_record *job_ptr)
+			      struct job_record *job_ptr)
 {
 	int rc=SLURM_SUCCESS;
 	char *nodes = NULL, *jname = NULL, *node_inx = NULL;
@@ -217,8 +217,10 @@ extern int as_mysql_job_start(mysql_conn_t *mysql_conn,
 	time_t begin_time, check_time, start_time, submit_time;
 	uint32_t wckeyid = 0;
 	int job_state, node_cnt = 0;
+//	uint32_t job_db_inx = job_ptr->db_index;
 
-	if (!job_ptr->details || !job_ptr->details->submit_time) {
+	if ((!job_ptr->details || !job_ptr->details->submit_time)
+	    && !job_ptr->resize_time) {
 		error("as_mysql_job_start: "
 		      "Not inputing this job, it has no submit time.");
 		return SLURM_ERROR;
@@ -228,6 +230,21 @@ extern int as_mysql_job_start(mysql_conn_t *mysql_conn,
 		return ESLURM_DB_CONNECTION;
 
 	debug2("as_mysql_slurmdb_job_start() called");
+
+	job_state = job_ptr->job_state;
+
+	/* A resized job needs a new db_inx, so first make sure the
+	 * record under the old db_inx is completed. This is most likely
+	 * the only time we will be notified of the change, so also
+	 * record the job state without the resize flag. */
+	if(IS_JOB_RESIZING(job_ptr)) {
+		as_mysql_job_complete(mysql_conn, job_ptr);
+		job_state &= (~JOB_RESIZING);
+		job_ptr->db_index = 0;
+	}
+
+	job_state &= JOB_STATE_BASE;
+
 	if (job_ptr->resize_time) {
 		begin_time  = job_ptr->resize_time;
 		submit_time = job_ptr->resize_time;
@@ -237,10 +254,6 @@ extern int as_mysql_job_start(mysql_conn_t *mysql_conn,
 		submit_time = job_ptr->details->submit_time;
 		start_time  = job_ptr->start_time;
 	}
-	if (job_ptr->job_state & JOB_RESIZING)
-		job_state = JOB_RESIZING;
-	else
-		job_state = job_ptr->job_state & JOB_STATE_BASE;
 
 	/* See what we are hearing about here if no start time. If
 	 * this job latest time is before the last roll up we will
@@ -368,13 +381,6 @@ no_rollup_change:
 				       mysql_conn->cluster_name,
 				       job_ptr->assoc_id);
 
-
-	/* We need to put a 0 for 'end' incase of funky job state
-	 * files from a hot start of the controllers we call
-	 * job_start on jobs we may still know about after
-	 * job_flush has been called so we need to restart
-	 * them by zeroing out the end.
-	 */
 	if(!job_ptr->db_index) {
 		if(!begin_time)
 			begin_time = submit_time;
@@ -491,6 +497,10 @@ no_rollup_change:
 	xfree(block_id);
 	xfree(query);
 
+	/* now we will reset all the steps */
+	/* if(IS_JOB_RESIZING(job_ptr)) */
+	/* 	rc = _resize_steps(job_db_inx, job_ptr); */
+
 	return rc;
 }
 
@@ -499,10 +509,11 @@ extern int as_mysql_job_complete(mysql_conn_t *mysql_conn,
 {
 	char *query = NULL, *nodes = NULL;
 	int rc = SLURM_SUCCESS, job_state;
-	time_t start_time, end_time;
+	time_t submit_time, end_time;
 
 	if (!job_ptr->db_index
-	    && (!job_ptr->details || !job_ptr->details->submit_time)) {
+	    && ((!job_ptr->details || !job_ptr->details->submit_time)
+		&& !job_ptr->resize_time)) {
 		error("as_mysql_job_complete: "
 		      "Not inputing this job, it has no submit time.");
 		return SLURM_ERROR;
@@ -512,13 +523,13 @@ extern int as_mysql_job_complete(mysql_conn_t *mysql_conn,
 		return ESLURM_DB_CONNECTION;
 	debug2("as_mysql_slurmdb_job_complete() called");
 
-	if (job_ptr->resize_time) {
-		start_time  = job_ptr->resize_time;
-	} else {
-		start_time  = job_ptr->start_time;
-	}
-	if (job_ptr->job_state & JOB_RESIZING) {
-		end_time = time(NULL);
+	if (job_ptr->resize_time)
+		submit_time = job_ptr->resize_time;
+	else
+		submit_time = job_ptr->details->submit_time;
+
+	if (IS_JOB_RESIZING(job_ptr)) {
+		end_time = job_ptr->resize_time;
 		job_state = JOB_RESIZING;
 	} else {
 		/* If we get an error with this just fall through to avoid an
@@ -531,8 +542,6 @@ extern int as_mysql_job_complete(mysql_conn_t *mysql_conn,
 		end_time = job_ptr->end_time;
 		job_state = job_ptr->job_state & JOB_STATE_BASE;
 	}
-	if (start_time > end_time)
-		start_time = 0;
 
 	slurm_mutex_lock(&rollup_lock);
 	if(end_time < global_last_rollup) {
@@ -558,7 +567,7 @@ extern int as_mysql_job_complete(mysql_conn_t *mysql_conn,
 	if(!job_ptr->db_index) {
 		if(!(job_ptr->db_index =
 		     _get_db_index(mysql_conn,
-				   job_ptr->details->submit_time,
+				   submit_time,
 				   job_ptr->job_id,
 				   job_ptr->assoc_id))) {
 			/* If we get an error with this just fall
@@ -573,13 +582,14 @@ extern int as_mysql_job_complete(mysql_conn_t *mysql_conn,
 		}
 	}
 
-	query = xstrdup_printf("update \"%s_%s\" set time_start=%d, time_end=%d, "
+	query = xstrdup_printf("update \"%s_%s\" set time_end=%d, "
 			       "state=%d, nodelist='%s', exit_code=%d, "
-			       "kill_requid=%d where job_db_inx=%d",
+			       "kill_requid=%d where job_db_inx=%d;",
 			       mysql_conn->cluster_name, job_table,
-			       start_time, end_time, job_state,
+			       end_time, job_state,
 			       nodes, job_ptr->exit_code,
 			       job_ptr->requid, job_ptr->db_index);
+
 	debug3("%d(%s:%d) query\n%s",
 	       mysql_conn->conn, THIS_FILE, __LINE__, query);
 	rc = mysql_db_query(mysql_conn->db_conn, query);
@@ -589,25 +599,37 @@ extern int as_mysql_job_complete(mysql_conn_t *mysql_conn,
 }
 
 extern int as_mysql_step_start(mysql_conn_t *mysql_conn,
-			    struct step_record *step_ptr)
+			       struct step_record *step_ptr)
 {
 	int cpus = 0, tasks = 0, nodes = 0, task_dist = 0;
 	int rc=SLURM_SUCCESS;
 	char node_list[BUFFER_SIZE];
 	char *node_inx = NULL;
+	time_t start_time, submit_time;
+
 #ifdef HAVE_BG
 	char *ionodes = NULL;
 #endif
 	char *query = NULL;
 
 	if (!step_ptr->job_ptr->db_index
-	    && (!step_ptr->job_ptr->details
-		|| !step_ptr->job_ptr->details->submit_time)) {
+	    && ((!step_ptr->job_ptr->details
+		 || !step_ptr->job_ptr->details->submit_time)
+		&& !step_ptr->job_ptr->resize_time)) {
 		error("as_mysql_step_start: "
 		      "Not inputing this job, it has no submit time.");
 		return SLURM_ERROR;
 	}
 
+	if (step_ptr->job_ptr->resize_time) {
+		submit_time = start_time = step_ptr->job_ptr->resize_time;
+		if(step_ptr->start_time > submit_time)
+			start_time = step_ptr->start_time;
+	} else {
+		start_time = step_ptr->start_time;
+		submit_time = step_ptr->job_ptr->details->submit_time;
+	}
+
 	if(check_connection(mysql_conn) != SLURM_SUCCESS)
 		return ESLURM_DB_CONNECTION;
 	if(slurmdbd_conf) {
@@ -660,7 +682,7 @@ extern int as_mysql_step_start(mysql_conn_t *mysql_conn,
 	if(!step_ptr->job_ptr->db_index) {
 		if(!(step_ptr->job_ptr->db_index =
 		     _get_db_index(mysql_conn,
-				   step_ptr->job_ptr->details->submit_time,
+				   submit_time,
 				   step_ptr->job_ptr->job_id,
 				   step_ptr->job_ptr->assoc_id))) {
 			/* If we get an error with this just fall
@@ -690,7 +712,7 @@ extern int as_mysql_step_start(mysql_conn_t *mysql_conn,
 		mysql_conn->cluster_name, step_table,
 		step_ptr->job_ptr->db_index,
 		step_ptr->step_id,
-		(int)step_ptr->start_time, step_ptr->name,
+		(int)start_time, step_ptr->name,
 		JOB_RUNNING, cpus, nodes, tasks, node_list, node_inx, task_dist,
 		cpus, nodes, tasks, JOB_RUNNING,
 		node_list, node_inx, task_dist);
@@ -716,15 +738,26 @@ extern int as_mysql_step_complete(mysql_conn_t *mysql_conn,
 	char *query = NULL;
 	int rc =SLURM_SUCCESS;
 	uint32_t exit_code = 0;
+	time_t start_time, submit_time;
 
 	if (!step_ptr->job_ptr->db_index
-	    && (!step_ptr->job_ptr->details
-		|| !step_ptr->job_ptr->details->submit_time)) {
+	    && ((!step_ptr->job_ptr->details
+		 || !step_ptr->job_ptr->details->submit_time)
+		&& !step_ptr->job_ptr->resize_time)) {
 		error("as_mysql_step_complete: "
 		      "Not inputing this job, it has no submit time.");
 		return SLURM_ERROR;
 	}
 
+	if (step_ptr->job_ptr->resize_time) {
+		submit_time = start_time = step_ptr->job_ptr->resize_time;
+		if(step_ptr->start_time > submit_time)
+			start_time = step_ptr->start_time;
+	} else {
+		start_time = step_ptr->start_time;
+		submit_time = step_ptr->job_ptr->details->submit_time;
+	}
+
 	if (jobacct == NULL) {
 		/* JobAcctGather=slurmdb_gather/none, no data to process */
 		memset(&dummy_jobacct, 0, sizeof(dummy_jobacct));
@@ -753,8 +786,8 @@ extern int as_mysql_step_complete(mysql_conn_t *mysql_conn,
 #endif
 	}
 
-	if ((elapsed=now-step_ptr->start_time)<0)
-		elapsed=0;	/* For *very* short jobs, if clock is wrong */
+	if ((elapsed = (now - start_time)) < 0)
+		elapsed = 0;	/* For *very* short jobs, if clock is wrong */
 
 	exit_code = step_ptr->exit_code;
 	if (exit_code == NO_VAL) {
@@ -786,7 +819,7 @@ extern int as_mysql_step_complete(mysql_conn_t *mysql_conn,
 	if(!step_ptr->job_ptr->db_index) {
 		if(!(step_ptr->job_ptr->db_index =
 		     _get_db_index(mysql_conn,
-				   step_ptr->job_ptr->details->submit_time,
+				   submit_time,
 				   step_ptr->job_ptr->job_id,
 				   step_ptr->job_ptr->assoc_id))) {
 			/* If we get an error with this just fall
@@ -853,18 +886,26 @@ extern int as_mysql_step_complete(mysql_conn_t *mysql_conn,
 	return rc;
 }
 
-extern int as_mysql_suspend(mysql_conn_t *mysql_conn, struct job_record *job_ptr)
+extern int as_mysql_suspend(mysql_conn_t *mysql_conn,
+			    struct job_record *job_ptr)
 {
 	char *query = NULL;
 	int rc = SLURM_SUCCESS;
 	bool suspended = false;
+	time_t submit_time;
 
 	if(check_connection(mysql_conn) != SLURM_SUCCESS)
 		return ESLURM_DB_CONNECTION;
+
+	if (job_ptr->resize_time)
+		submit_time = job_ptr->resize_time;
+	else
+		submit_time = job_ptr->details->submit_time;
+
 	if(!job_ptr->db_index) {
 		if(!(job_ptr->db_index =
 		     _get_db_index(mysql_conn,
-				   job_ptr->details->submit_time,
+				   submit_time,
 				   job_ptr->job_id,
 				   job_ptr->assoc_id))) {
 			/* If we get an error with this just fall
diff --git a/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c b/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c
index cd0f291477f..64f756378a2 100644
--- a/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c
+++ b/src/plugins/accounting_storage/mysql/as_mysql_jobacct_process.c
@@ -322,7 +322,7 @@ static int _cluster_get_jobs(mysql_conn_t *mysql_conn,
 	int set = 0;
 	char *prefix="t2";
 	int rc = SLURM_SUCCESS;
-	int last_id = -1, curr_id = -1;
+	int last_id = -1, curr_id = -1, last_state = -1;
 	local_cluster_t *curr_cluster = NULL;
 
 	/* This is here to make sure we are looking at only this user
@@ -390,12 +390,10 @@ static int _cluster_get_jobs(mysql_conn_t *mysql_conn,
 	}
 
 	/* Here we want to order them this way in such a way so it is
-	   easy to look for duplicates
+	   easy to look for duplicates; this also makes it easy to sort
+	   the resized jobs.
 	*/
-	if(job_cond && !job_cond->duplicates)
-		xstrcat(query, " order by id_job, time_submit desc");
-	else
-		xstrcat(query, " order by time_submit desc");
+	xstrcat(query, " group by id_job, time_submit desc");
 
 	debug3("%d(%s:%d) query\n%s",
 	       mysql_conn->conn, THIS_FILE, __LINE__, query);
@@ -413,22 +411,32 @@ static int _cluster_get_jobs(mysql_conn_t *mysql_conn,
 
 		curr_id = atoi(row[JOB_REQ_JOBID]);
 
-		if(job_cond && !job_cond->duplicates && curr_id == last_id)
+		if(job_cond && !job_cond->duplicates
+		   && (curr_id == last_id)
+		   && (atoi(row[JOB_REQ_STATE]) != JOB_RESIZING))
 			continue;
 
-		last_id = curr_id;
-
 		/* check the bitmap to see if this is one of the jobs
 		   we are looking for */
 		if(!good_nodes_from_inx(local_cluster_list,
 					(void **)&curr_cluster,
-					row[JOB_REQ_NODE_INX], submit))
+					row[JOB_REQ_NODE_INX], submit)) {
+			last_id = curr_id;
 			continue;
+		}
 
 		job = slurmdb_create_job_rec();
-		list_append(job_list, job);
-
 		job->state = atoi(row[JOB_REQ_STATE]);
+		last_state = job->state;
+		if(curr_id == last_id)
+			/* prepend so that records for the same job end up
+			   ordered by increasing submit time; the database
+			   hands them to us in reverse (descending) order */
+			list_prepend(job_list, job);
+		else
+			list_append(job_list, job);
+		last_id = curr_id;
+
 		job->alloc_cpus = atoi(row[JOB_REQ_ALLOC_CPUS]);
 		job->alloc_nodes = atoi(row[JOB_REQ_ALLOC_NODES]);
 		job->associd = atoi(row[JOB_REQ_ASSOCID]);
diff --git a/src/plugins/accounting_storage/pgsql/jobacct.c b/src/plugins/accounting_storage/pgsql/jobacct.c
index 5cab8b2d3d4..46298b903be 100644
--- a/src/plugins/accounting_storage/pgsql/jobacct.c
+++ b/src/plugins/accounting_storage/pgsql/jobacct.c
@@ -190,13 +190,20 @@ _get_db_index(pgsql_conn_t *pg_conn, time_t submit, uint32_t jobid,
 static int
 _check_job_db_index(pgsql_conn_t *pg_conn, struct job_record *job_ptr)
 {
+	time_t submit_time;
+
+	if (job_ptr->resize_time)
+		submit_time = job_ptr->resize_time;
+	else
+		submit_time = job_ptr->details->submit_time;
+
 	if(!job_ptr->db_index) {
 		job_ptr->db_index = _get_db_index(
 			pg_conn,
-			job_ptr->details->submit_time,
+			submit_time,
 			job_ptr->job_id,
 			job_ptr->assoc_id);
-		if (! job_ptr->db_index) {
+		if (!job_ptr->db_index) {
 			/* If we get an error with this just fall
 			 * through to avoid an infinite loop
 			 */
@@ -375,7 +382,8 @@ js_p_job_start(pgsql_conn_t *pg_conn,
 	int job_state, node_cnt = 0;
 	uint32_t wckeyid = 0;
 
-	if (!job_ptr->details || !job_ptr->details->submit_time) {
+	if ((!job_ptr->details || !job_ptr->details->submit_time)
+	    && !job_ptr->resize_time) {
 		error("as/pg: job_start: Not inputing this job, "
 		      "it has no submit time.");
 		return SLURM_ERROR;
@@ -385,6 +393,21 @@ js_p_job_start(pgsql_conn_t *pg_conn,
 		return ESLURM_DB_CONNECTION;
 
 	debug3("as/pg: job_start() called");
+
+	job_state = job_ptr->job_state;
+
+	/* A resized job needs a new db_inx, so first make sure the
+	 * record under the old db_inx is completed. This is most likely
+	 * the only time we will be notified of the change, so also
+	 * record the job state without the resize flag. */
+	if(job_state & JOB_RESIZING) {
+		js_p_job_complete(pg_conn, job_ptr);
+		job_state &= (~JOB_RESIZING);
+		job_ptr->db_index = 0;
+	}
+
+	job_state &= JOB_STATE_BASE;
+
 	if (job_ptr->resize_time) {
 		begin_time  = job_ptr->resize_time;
 		submit_time = job_ptr->resize_time;
@@ -394,10 +417,6 @@ js_p_job_start(pgsql_conn_t *pg_conn,
 		submit_time = job_ptr->details->submit_time;
 		start_time  = job_ptr->start_time;
 	}
-	if (job_ptr->job_state & JOB_RESIZING)
-		job_state = JOB_RESIZING;
-	else
-		job_state = job_ptr->job_state & JOB_STATE_BASE;
 
 	/* See what we are hearing about here if no start time. If
 	 * this job latest time is before the last roll up we will
@@ -512,12 +531,6 @@ no_rollup_change:
 				      job_ptr->user_id, pg_conn->cluster_name,
 				      job_ptr->assoc_id);
 
-	/* We need to put a 0 for 'end' incase of funky job state
-	 * files from a hot start of the controllers we call
-	 * job_start on jobs we may still know about after
-	 * job_flush has been called so we need to restart
-	 * them by zeroing out the end.
-	 */
 	if(!job_ptr->db_index) {
 		if (!begin_time)
 			begin_time = submit_time;
@@ -626,10 +639,11 @@ js_p_job_complete(pgsql_conn_t *pg_conn,
 {
 	char *query = NULL, *nodes = NULL;
 	int rc = SLURM_SUCCESS, job_state;
-	time_t start_time, end_time;
+	time_t end_time;
 
 	if (!job_ptr->db_index
-	    && (!job_ptr->details || !job_ptr->details->submit_time)) {
+	    && ((!job_ptr->details || !job_ptr->details->submit_time)
+		&& !job_ptr->resize_time)) {
 		error("jobacct_storage_p_job_complete: "
 		      "Not inputing this job, it has no submit time.");
 		return SLURM_ERROR;
@@ -640,13 +654,8 @@ js_p_job_complete(pgsql_conn_t *pg_conn,
 
 	debug2("as/pg: job_complete() called");
 
-	if (job_ptr->resize_time) {
-		start_time = job_ptr->resize_time;
-	} else {
-		start_time = job_ptr->start_time;
-	}
-	if (job_ptr->job_state & JOB_RESIZING) {
-		end_time = time(NULL);
+	if (IS_JOB_RESIZING(job_ptr)) {
+		end_time = job_ptr->resize_time;
 		job_state = JOB_RESIZING;
 	} else {
 		/* If we get an error with this just fall through to avoid an
@@ -659,8 +668,6 @@ js_p_job_complete(pgsql_conn_t *pg_conn,
 		end_time = job_ptr->end_time;
 		job_state = job_ptr->job_state & JOB_STATE_BASE;
 	}
-	if (start_time > end_time)
- 		start_time = 0;
 
 	slurm_mutex_lock(&rollup_lock);
 	if(end_time < global_last_rollup) {
@@ -686,10 +693,10 @@ js_p_job_complete(pgsql_conn_t *pg_conn,
 	if (_check_job_db_index(pg_conn, job_ptr) != SLURM_SUCCESS)
 		return SLURM_SUCCESS;
 
-	query = xstrdup_printf("UPDATE %s SET start=%d, endtime=%d, state=%d, "
+	query = xstrdup_printf("UPDATE %s SET endtime=%d, state=%d, "
 			       "nodelist='%s', comp_code=%d, "
 			       "kill_requid=%d WHERE id=%d",
-			       job_table, start_time, end_time, job_state,
+			       job_table, end_time, job_state,
 			       nodes, job_ptr->exit_code,
 			       job_ptr->requid, job_ptr->db_index);
 	rc = DEF_QUERY_RET_RC;
@@ -716,15 +723,22 @@ js_p_step_start(pgsql_conn_t *pg_conn,
 	char *ionodes = NULL;
 #endif
 	char *query = NULL, *rec = NULL;
+	time_t start_time;
 
 	if (!step_ptr->job_ptr->db_index
-	    && (!step_ptr->job_ptr->details
-		|| !step_ptr->job_ptr->details->submit_time)) {
+	    && ((!step_ptr->job_ptr->details
+		 || !step_ptr->job_ptr->details->submit_time)
+		&& !step_ptr->job_ptr->resize_time)) {
 		error("jobacct_storage_p_step_start: "
 		      "Not inputing this job step, it has no submit time.");
 		return SLURM_ERROR;
 	}
 
+	if(step_ptr->start_time > step_ptr->job_ptr->resize_time)
+		start_time = step_ptr->start_time;
+	else
+		start_time = step_ptr->job_ptr->resize_time;
+
 	if(check_db_connection(pg_conn) != SLURM_SUCCESS)
 		return ESLURM_DB_CONNECTION;
 
@@ -785,7 +799,7 @@ js_p_step_start(pgsql_conn_t *pg_conn,
 			     step_ptr->job_ptr->db_index,
 			     /* deleted=0 */
 			     step_ptr->step_id,
-			     step_ptr->start_time,
+			     start_time,
 			     /* endtime=0 */
 			     /* suspended=0 */
 			     step_ptr->name ?: "",
@@ -829,15 +843,22 @@ js_p_step_complete(pgsql_conn_t *pg_conn,
 	char *query = NULL;
 	int rc =SLURM_SUCCESS;
 	uint32_t exit_code;
+	time_t start_time;
 
 	if (!step_ptr->job_ptr->db_index
-	    && (!step_ptr->job_ptr->details
-		|| !step_ptr->job_ptr->details->submit_time)) {
+	    && ((!step_ptr->job_ptr->details
+		 || !step_ptr->job_ptr->details->submit_time)
+		&& !step_ptr->job_ptr->resize_time)) {
 		error("jobacct_storage_p_step_complete: "
 		      "Not inputing this job step, it has no submit time.");
 		return SLURM_ERROR;
 	}
 
+	if(step_ptr->start_time > step_ptr->job_ptr->resize_time)
+		start_time = step_ptr->start_time;
+	else
+		start_time = step_ptr->job_ptr->resize_time;
+
 	if(check_db_connection(pg_conn) != SLURM_SUCCESS)
 		return ESLURM_DB_CONNECTION;
 
@@ -866,8 +887,8 @@ js_p_step_complete(pgsql_conn_t *pg_conn,
 #endif
 	}
 
-	if ((elapsed=now-step_ptr->start_time)<0)
-		elapsed=0;	/* For *very* short jobs, if clock is wrong */
+	if ((elapsed = (now - start_time)) < 0)
+		elapsed = 0;	/* For *very* short jobs, if clock is wrong */
 
 	exit_code = step_ptr->exit_code;
 	if (exit_code == NO_VAL) {
diff --git a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c
index faa6394bcce..460766cd5d6 100644
--- a/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c
+++ b/src/plugins/accounting_storage/slurmdbd/accounting_storage_slurmdbd.c
@@ -1620,24 +1620,21 @@ extern int jobacct_storage_p_job_start(void *db_conn,
 	req.alloc_nodes      = job_ptr->node_cnt;
 #endif
 	req.block_id      = block_id;
-	req.db_index      = job_ptr->db_index;
 	if (job_ptr->resize_time) {
 		req.eligible_time = job_ptr->resize_time;
-		req.start_time    = job_ptr->resize_time;
 		req.submit_time   = job_ptr->resize_time;
-	} else {
-		if (job_ptr->details) {
-			req.eligible_time = job_ptr->details->begin_time;
-			req.submit_time   = job_ptr->details->submit_time;
-		}
-		req.start_time    = job_ptr->start_time;
+	} else if (job_ptr->details) {
+		req.eligible_time = job_ptr->details->begin_time;
+		req.submit_time   = job_ptr->details->submit_time;
 	}
+
+	req.start_time    = job_ptr->start_time;
 	req.gid           = job_ptr->group_id;
 	req.job_id        = job_ptr->job_id;
-	if (job_ptr->job_state & JOB_RESIZING)
-		req.job_state     = JOB_RESIZING;
-	else
-		req.job_state     = job_ptr->job_state & JOB_STATE_BASE;
+
+	req.db_index      = job_ptr->db_index;
+
+	req.job_state     = job_ptr->job_state;
 	req.name          = job_ptr->name;
 	req.nodes         = job_ptr->nodes;
 	if(job_ptr->node_bitmap) {
@@ -1660,7 +1657,7 @@ extern int jobacct_storage_p_job_start(void *db_conn,
 	/* if we already have the db_index don't wait around for it
 	 * again just send the message
 	 */
-	if(req.db_index) {
+	if(req.db_index && !IS_JOB_RESIZING(job_ptr)) {
 		if (slurm_send_slurmdbd_msg(SLURMDBD_VERSION, &msg) < 0) {
 			xfree(block_id);
 			return SLURM_ERROR;
@@ -1715,8 +1712,8 @@ extern int jobacct_storage_p_job_complete(void *db_conn,
 	req.db_index    = job_ptr->db_index;
 	req.exit_code   = job_ptr->exit_code;
 	req.job_id      = job_ptr->job_id;
-	if (job_ptr->job_state & JOB_RESIZING) {
-		req.end_time    = time(NULL);
+	if (IS_JOB_RESIZING(job_ptr)) {
+		req.end_time    = job_ptr->resize_time;
 		req.job_state   = JOB_RESIZING;
 	} else {
 		req.end_time    = job_ptr->end_time;
@@ -1724,6 +1721,7 @@ extern int jobacct_storage_p_job_complete(void *db_conn,
 	}
 	req.req_uid     = job_ptr->requid;
 	req.nodes       = job_ptr->nodes;
+
 	if (job_ptr->resize_time) {
 		req.start_time  = job_ptr->resize_time;
 		req.submit_time = job_ptr->resize_time;
@@ -1810,8 +1808,15 @@ extern int jobacct_storage_p_step_start(void *db_conn,
 				       step_ptr->step_node_bitmap);
 	}
 	req.node_cnt    = nodes;
-	req.start_time  = step_ptr->start_time;
-	if (step_ptr->job_ptr->details)
+
+	if(step_ptr->start_time > step_ptr->job_ptr->resize_time)
+		req.start_time = step_ptr->start_time;
+	else
+		req.start_time = step_ptr->job_ptr->resize_time;
+
+	if (step_ptr->job_ptr->resize_time)
+		req.job_submit_time   = step_ptr->job_ptr->resize_time;
+	else if (step_ptr->job_ptr->details)
 		req.job_submit_time   = step_ptr->job_ptr->details->submit_time;
 	req.step_id     = step_ptr->step_id;
 	if (step_ptr->step_layout)
@@ -1873,8 +1878,9 @@ extern int jobacct_storage_p_step_complete(void *db_conn,
 #endif
 
 	if (!step_ptr->job_ptr->db_index
-	    && (!step_ptr->job_ptr->details
-		|| !step_ptr->job_ptr->details->submit_time)) {
+	    && ((!step_ptr->job_ptr->details
+		 || !step_ptr->job_ptr->details->submit_time)
+		&& !step_ptr->job_ptr->resize_time)) {
 		error("jobacct_storage_p_step_complete: "
 		      "Not inputing this job, it has no submit time.");
 		return SLURM_ERROR;
@@ -1889,8 +1895,14 @@ extern int jobacct_storage_p_step_complete(void *db_conn,
 	req.jobacct     = step_ptr->jobacct;
 	req.job_id      = step_ptr->job_ptr->job_id;
 	req.req_uid     = step_ptr->requid;
-	req.start_time  = step_ptr->start_time;
-	if (step_ptr->job_ptr->details)
+	if(step_ptr->start_time > step_ptr->job_ptr->resize_time)
+		req.start_time = step_ptr->start_time;
+	else
+		req.start_time = step_ptr->job_ptr->resize_time;
+
+	if (step_ptr->job_ptr->resize_time)
+		req.job_submit_time   = step_ptr->job_ptr->resize_time;
+	else if (step_ptr->job_ptr->details)
 		req.job_submit_time   = step_ptr->job_ptr->details->submit_time;
 	req.step_id     = step_ptr->step_id;
 	req.total_cpus = cpus;
@@ -1920,8 +1932,12 @@ extern int jobacct_storage_p_suspend(void *db_conn,
 	req.job_id       = job_ptr->job_id;
 	req.db_index     = job_ptr->db_index;
 	req.job_state    = job_ptr->job_state & JOB_STATE_BASE;
-	if (job_ptr->details)
+
+	if (job_ptr->resize_time)
+		req.submit_time   = job_ptr->resize_time;
+	else if (job_ptr->details)
 		req.submit_time   = job_ptr->details->submit_time;
+
 	req.suspend_time = job_ptr->suspend_time;
 	msg.msg_type     = DBD_JOB_SUSPEND;
 	msg.data         = &req;
diff --git a/src/plugins/jobcomp/mysql/jobcomp_mysql.c b/src/plugins/jobcomp/mysql/jobcomp_mysql.c
index 542d7966282..0ae69e3452c 100644
--- a/src/plugins/jobcomp/mysql/jobcomp_mysql.c
+++ b/src/plugins/jobcomp/mysql/jobcomp_mysql.c
@@ -318,7 +318,7 @@ extern int slurm_jobcomp_log_record(struct job_record *job_ptr)
 	/* Job will typically be COMPLETING when this is called.
 	 * We remove the flags to get the eventual completion state:
 	 * JOB_FAILED, JOB_TIMEOUT, etc. */
-	if (job_ptr->job_state & JOB_RESIZING) {
+	if (IS_JOB_RESIZING(job_ptr)) {
 		job_state = JOB_RESIZING;
 		if (job_ptr->resize_time)
 			start_time = job_ptr->resize_time;
diff --git a/src/plugins/jobcomp/pgsql/jobcomp_pgsql.c b/src/plugins/jobcomp/pgsql/jobcomp_pgsql.c
index 53ef6a80db9..3ec7ceb7061 100644
--- a/src/plugins/jobcomp/pgsql/jobcomp_pgsql.c
+++ b/src/plugins/jobcomp/pgsql/jobcomp_pgsql.c
@@ -340,7 +340,7 @@ extern int slurm_jobcomp_log_record(struct job_record *job_ptr)
 	/* Job will typically be COMPLETING when this is called.
 	 * We remove the flags to get the eventual completion state:
 	 * JOB_FAILED, JOB_TIMEOUT, etc. */
-	if (job_ptr->job_state & JOB_RESIZING) {
+	if (IS_JOB_RESIZING(job_ptr)) {
 		job_state = JOB_RESIZING;
 		if (job_ptr->resize_time)
 			start_time = job_ptr->resize_time;
diff --git a/src/plugins/jobcomp/script/jobcomp_script.c b/src/plugins/jobcomp/script/jobcomp_script.c
index f6ee790bd00..6f15422317d 100644
--- a/src/plugins/jobcomp/script/jobcomp_script.c
+++ b/src/plugins/jobcomp/script/jobcomp_script.c
@@ -205,7 +205,7 @@ static struct jobcomp_info * _jobcomp_info_create (struct job_record *job)
 	j->gid = job->group_id;
 	j->name = xstrdup (job->name);
 
-	if (job->job_state & JOB_RESIZING) {
+	if (IS_JOB_RESIZING(job)) {
 		state = JOB_RESIZING;
 		j->jobstate = xstrdup (job_state_string (state));
 		if (job->resize_time)
diff --git a/src/sacct/options.c b/src/sacct/options.c
index f3587586c83..eb5f085c229 100644
--- a/src/sacct/options.c
+++ b/src/sacct/options.c
@@ -523,7 +523,6 @@ int get_data(void)
 		jobs = g_slurm_jobcomp_get_jobs(job_cond);
 		return SLURM_SUCCESS;
 	} else {
-
 		jobs = slurmdb_jobs_get(acct_db_conn, job_cond);
 	}
 
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 0f9406f39e0..492c2e451ab 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -1754,7 +1754,8 @@ extern int kill_running_job_by_node_name(char *node_name)
 				     job_ptr->job_id, node_name);
 				_set_job_prio(job_ptr);
 				snprintf(requeue_msg, sizeof(requeue_msg),
-					 "Job requeued due to failure of node %s",
+					 "Job requeued due to failure "
+					 "of node %s",
 					 node_name);
 				slurm_sched_requeue(job_ptr, requeue_msg);
 				job_ptr->time_last_active  = now;
@@ -6028,10 +6029,14 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid)
 				excise_node_from_job(job_ptr, node_ptr);
 			}
 			job_post_resize_acctg(job_ptr);
-		}
+			update_accounting = false;
+		} else
+			/* job_post_resize_acctg did not run on this
+			   path, so accounting still needs an update. */
+			update_accounting = true;
+
 		FREE_NULL_BITMAP(req_bitmap);
 		xfree(job_specs->req_nodes);
-		update_accounting = true;
 	}
 #endif
 
@@ -6100,8 +6105,10 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid)
 			info("sched: update_job: set nodes to %s for "
 			     "job_id %u",
 			     job_ptr->nodes, job_specs->job_id);
+			/* Since job_post_resize_acctg will restart
+			   things don't do it again. */
+			update_accounting = false;
 		}
-		update_accounting = true;
 	}
 #endif
 
@@ -6366,6 +6373,7 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid)
 
 #endif
 	 if(update_accounting) {
+		 info("updating accounting");
 		 if (job_ptr->details && job_ptr->details->begin_time) {
 			/* Update job record in accounting to reflect
 			 * changes */
@@ -6380,9 +6388,9 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid)
 extern void job_pre_resize_acctg(struct job_record *job_ptr)
 {
 	job_ptr->job_state |= JOB_RESIZING;
+	job_ptr->resize_time = time(NULL); /* set before the logger runs */
 	job_completion_logger(job_ptr);
 	job_ptr->job_state &= (~JOB_RESIZING);
-	job_ptr->resize_time = time(NULL);
 }
 
 /* Record accounting information for a job immediately after changing size */
@@ -6997,14 +7005,13 @@ void job_fini (void)
 extern void job_completion_logger(struct job_record  *job_ptr)
 {
 	int base_state;
-	bool job_resizing, sent_start = false;
+	bool sent_start = false;
 
 	xassert(job_ptr);
-	job_resizing = job_ptr->job_state & JOB_RESIZING;
 
 	acct_policy_remove_job_submit(job_ptr);
 
-	if (!job_resizing) {
+	if (!IS_JOB_RESIZING(job_ptr)) {
 		/* Remove configuring state just to make sure it isn't there
 		 * since it will throw off displays of the job. */
 		job_ptr->job_state &= (~JOB_CONFIGURING);
@@ -7028,6 +7035,11 @@ extern void job_completion_logger(struct job_record  *job_ptr)
 
 	g_slurm_jobcomp_write(job_ptr);
 
+	/* When starting the resized job everything is taken care of
+	   there, so skip the accounting storage calls below. */
+	if (IS_JOB_RESIZING(job_ptr))
+		return;
+
 	if(!job_ptr->assoc_id) {
 		slurmdb_association_rec_t assoc_rec;
 		/* In case accounting enabled after starting the job */
@@ -7053,9 +7065,8 @@ extern void job_completion_logger(struct job_record  *job_ptr)
 	 * keep track of all jobs, so we will set the db_inx to
 	 * INFINITE and the database will understand what happened.
 	 */
-	if(!job_ptr->nodes && !job_ptr->db_index && !sent_start) {
+	if(!job_ptr->nodes && !job_ptr->db_index && !sent_start)
 		jobacct_storage_g_job_start(acct_db_conn, job_ptr);
-	}
 
 	jobacct_storage_g_job_complete(acct_db_conn, job_ptr);
 }
diff --git a/src/slurmdbd/proc_req.c b/src/slurmdbd/proc_req.c
index 72c9cb7a995..0f6c2b96eb9 100644
--- a/src/slurmdbd/proc_req.c
+++ b/src/slurmdbd/proc_req.c
@@ -1789,8 +1789,6 @@ static int  _job_complete(slurmdbd_conn_t *slurmdbd_conn,
 		goto end_it;
 	}
 
-	debug2("DBD_JOB_COMPLETE: ID:%u ", job_comp_msg->job_id);
-
 	memset(&job, 0, sizeof(struct job_record));
 	memset(&details, 0, sizeof(struct job_details));
 
@@ -1806,6 +1804,13 @@ static int  _job_complete(slurmdbd_conn_t *slurmdbd_conn,
 	details.submit_time = job_comp_msg->submit_time;
 
 	job.details = &details;
+
+	if(job.job_state & JOB_RESIZING) {
+		job.resize_time = job_comp_msg->end_time;
+		debug2("DBD_JOB_COMPLETE: RESIZE ID:%u", job_comp_msg->job_id);
+	} else
+		debug2("DBD_JOB_COMPLETE: ID:%u", job_comp_msg->job_id);
+
 	rc = jobacct_storage_g_job_complete(slurmdbd_conn->db_conn, &job);
 
 	if(rc && errno == 740) /* meaning data is already there */
@@ -1876,7 +1881,12 @@ static int  _job_start(slurmdbd_conn_t *slurmdbd_conn,
 
 	job.details = &details;
 
-	if(job.start_time) {
+	if(job.job_state & JOB_RESIZING) {
+		job.resize_time = job_start_msg->eligible_time;
+		debug2("DBD_JOB_START: RESIZE CALL ID:%u NAME:%s INX:%u",
+		       job_start_msg->job_id, job_start_msg->name,
+		       job.db_index);
+	} else if(job.start_time) {
 		debug2("DBD_JOB_START: START CALL ID:%u NAME:%s INX:%u",
 		       job_start_msg->job_id, job_start_msg->name,
 		       job.db_index);
-- 
GitLab