diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 271e8dfde9a4c1b2bda616305114e0f72fb074ad..b233db616069a451c575aa2b99d8fd0798401d86 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -477,6 +477,7 @@ static void *_thread_per_node_rpc(void *args)
 	state_t thread_state = DSH_NO_RESP;
 	sigset_t set;
 #if AGENT_IS_THREAD
+	struct node_record *node_ptr;
 	/* Locks: Write write node */
 	slurmctld_lock_t node_write_lock =
 	    { NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK };
@@ -545,26 +546,30 @@ static void *_thread_per_node_rpc(void *args)
 		goto cleanup;
 	}
 
+#if AGENT_IS_THREAD
+	/* SPECIAL CASE: Immediately mark node as IDLE */
+	if ((task_ptr->msg_type == REQUEST_REVOKE_JOB_CREDENTIAL) &&
+	    (node_ptr = find_node_record(thread_ptr->node_name))) {
+		revoke_credential_msg_t *revoke_job_cred;
+		revoke_job_cred = (revoke_credential_msg_t *)
+				   task_ptr->msg_args_ptr;
+		debug3("Revoke on node %s job_id %u",
+		       thread_ptr->node_name, revoke_job_cred->job_id);
+		lock_slurmctld(node_write_lock);
+		make_node_idle(node_ptr, 
+			       find_job_record(revoke_job_cred->job_id));
+		unlock_slurmctld(node_write_lock);
+		/* scheduler(); Overhead too high, 
+		 * only do when last node registers */
+	}
+#endif
+
 	switch (response_msg->msg_type) {
 	case RESPONSE_SLURM_RC:
 		slurm_rc_msg = (return_code_msg_t *) response_msg->data;
 		rc = slurm_rc_msg->return_code;
 		slurm_free_return_code_msg(slurm_rc_msg);
 		if (rc == 0) {
-#if AGENT_IS_THREAD
-			/* SPECIAL CASE: Immediately mark node as idle */
-			if ((task_ptr->msg_type == 
-			     REQUEST_REVOKE_JOB_CREDENTIAL) &&
-			    (rc == SLURM_SUCCESS)) {
-				lock_slurmctld(node_write_lock);
-				make_node_idle(
-					find_node_record(
-					thread_ptr->node_name));
-				unlock_slurmctld(node_write_lock);
-				/* scheduler(); Overhead too high, 
-				 * do when last node registers */
-			}
-#endif
 			debug3("agent processed RPC to node %s",
 			       thread_ptr->node_name);
 			thread_state = DSH_DONE;
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 920dd1953750f20bc3f69837a3e732cd4c33cbd0..cc393ee3da2aaabedd843e16d17a43800c3aaf29 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -214,7 +214,7 @@ int main(int argc, char *argv[])
 	}
 
 	if ((error_code = getnodename(node_name, MAX_NAME_LEN)))
-		fatal("getnodename error %d", error_code);
+		fatal("getnodename error %s", slurm_strerror(error_code));
 
 	/* init ssl job credential stuff */
 	slurm_ssl_init();
@@ -346,12 +346,10 @@ static void *_slurmctld_signal_hand(void *no_data)
 			info("Reconfigure signal (SIGHUP) received");
 			lock_slurmctld(config_write_lock);
 			error_code = read_slurm_conf(0);
-			if (error_code == 0)
-				reset_job_bitmaps();
 			unlock_slurmctld(config_write_lock);
 			if (error_code)
-				error("read_slurm_conf error %d",
-				      error_code);
+				error("read_slurm_conf error %s",
+				      slurm_strerror(error_code));
 			else 
 				_update_logging();
 			break;
@@ -509,9 +507,8 @@ static void *_slurmctld_background(void *no_data)
 		WRITE_LOCK, NO_LOCK
 	};
 	/* Locks: Write partition */
-	slurmctld_lock_t part_write_lock = { NO_LOCK, NO_LOCK,
-		NO_LOCK, WRITE_LOCK
-	};
+	slurmctld_lock_t part_write_lock = { 
+		NO_LOCK, NO_LOCK, NO_LOCK, WRITE_LOCK };
 
 	/* Let the dust settle before doing work */
 	now = time(NULL);
@@ -788,9 +785,8 @@ static void _slurm_rpc_dump_conf(slurm_msg_t * msg)
 	last_update_msg_t *last_time_msg = (last_update_msg_t *) msg->data;
 	slurm_ctl_conf_info_msg_t config_tbl;
 	/* Locks: Read config */
-	slurmctld_lock_t config_read_lock = { READ_LOCK, NO_LOCK,
-		NO_LOCK, NO_LOCK
-	};
+	slurmctld_lock_t config_read_lock = { 
+		READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
 
 	start_time = clock();
 	debug("Processing RPC: REQUEST_BUILD_INFO");
@@ -828,9 +824,8 @@ static void _slurm_rpc_dump_jobs(slurm_msg_t * msg)
 	job_info_request_msg_t *last_time_msg =
 	    (job_info_request_msg_t *) msg->data;
 	/* Locks: Read job */
-	slurmctld_lock_t job_read_lock = { NO_LOCK, READ_LOCK,
-		NO_LOCK, NO_LOCK
-	};
+	slurmctld_lock_t job_read_lock = { 
+		NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
 
 	start_time = clock();
 	debug("Processing RPC: REQUEST_JOB_INFO");
@@ -868,9 +863,8 @@ static void _slurm_rpc_dump_nodes(slurm_msg_t * msg)
 	slurm_msg_t response_msg;
 	last_update_msg_t *last_time_msg = (last_update_msg_t *) msg->data;
 	/* Locks: Read node */
-	slurmctld_lock_t node_read_lock = { NO_LOCK, NO_LOCK,
-		READ_LOCK, NO_LOCK
-	};
+	slurmctld_lock_t node_read_lock = { 
+		NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK };
 
 	start_time = clock();
 	debug("Processing RPC: REQUEST_NODE_INFO");
@@ -908,9 +902,8 @@ static void _slurm_rpc_dump_partitions(slurm_msg_t * msg)
 	slurm_msg_t response_msg;
 	last_update_msg_t *last_time_msg = (last_update_msg_t *) msg->data;
 	/* Locks: Read partition */
-	slurmctld_lock_t part_read_lock = { NO_LOCK, NO_LOCK,
-		NO_LOCK, READ_LOCK
-	};
+	slurmctld_lock_t part_read_lock = { 
+		NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK };
 
 	start_time = clock();
 	debug("Processing RPC: REQUEST_PARTITION_INFO");
@@ -944,14 +937,13 @@ static void _slurm_rpc_dump_partitions(slurm_msg_t * msg)
 static void _slurm_rpc_job_step_kill(slurm_msg_t * msg)
 {
 	/* init */
-	int error_code = 0;
+	int error_code = SLURM_SUCCESS;
 	clock_t start_time;
 	job_step_kill_msg_t *job_step_kill_msg =
 	    (job_step_kill_msg_t *) msg->data;
 	/* Locks: Write job, write node */
-	slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK,
-		WRITE_LOCK, NO_LOCK
-	};
+	slurmctld_lock_t job_write_lock = { 
+		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
 	uid_t uid;
 
 	start_time = clock();
@@ -968,13 +960,14 @@ static void _slurm_rpc_job_step_kill(slurm_msg_t * msg)
 		/* return result */
 		if (error_code) {
 			info(
-			   "_slurm_rpc_job_step_kill error %d for %u, time=%ld", 
-			   error_code, job_step_kill_msg->job_id, 
-			   (long) (clock() - start_time));
+			   "_slurm_rpc_job_step_kill JobId=%u, time=%ld, error=%s", 
+			   job_step_kill_msg->job_id, 
+			   (long) (clock() - start_time),
+			   slurm_strerror(error_code));
 			slurm_send_rc_msg(msg, error_code);
 		} else {
 			info(
-			   "_slurm_rpc_job_step_kill success for JobId=%u, time=%ld", 
+			   "_slurm_rpc_job_step_kill JobId=%u, time=%ld, success", 
 			   job_step_kill_msg->job_id, 
 			   (long) (clock() - start_time));
 			slurm_send_rc_msg(msg, SLURM_SUCCESS);
@@ -993,14 +986,15 @@ static void _slurm_rpc_job_step_kill(slurm_msg_t * msg)
 		/* return result */
 		if (error_code) {
 			info(
-			   "_slurm_rpc_job_step_kill error %d for %u.%u, time=%ld", 
-			   error_code, job_step_kill_msg->job_id, 
+			   "_slurm_rpc_job_step_kill StepId=%u.%u, time=%ld, error=%s", 
+			   job_step_kill_msg->job_id, 
 			   job_step_kill_msg->job_step_id, 
-			   (long) (clock() - start_time));
+			   (long) (clock() - start_time),
+			   slurm_strerror(error_code));
 			slurm_send_rc_msg(msg, error_code);
 		} else {
 			info(
-			   "_slurm_rpc_job_step_kill success for %u.%u, time=%ld", 
+			   "_slurm_rpc_job_step_kill StepId=%u.%u, time=%ld, success", 
 			   job_step_kill_msg->job_id, 
 			   job_step_kill_msg->job_step_id, 
 			   (long) (clock() - start_time));
@@ -1031,7 +1025,7 @@ static void _slurm_rpc_job_step_complete(slurm_msg_t * msg)
 	lock_slurmctld(job_write_lock);
 
 	/* do RPC call */
-	/* First set node down as needed on fatal error */
+	/* First set node DOWN if fatal error */
 	if (complete_job_step_msg->slurm_rc == ESLURM_ALREADY_DONE) {
 		/* race condition on job termination, not a real error */
 		info("slurmd error running job %u from node %s: %s",
@@ -1072,13 +1066,14 @@ static void _slurm_rpc_job_step_complete(slurm_msg_t * msg)
 		/* return result */
 		if (error_code) {
 			info(
-			   "_slurm_rpc_job_step_complete error %d for %u, time=%ld", 
-			   error_code, complete_job_step_msg->job_id, 
-			   (long) (clock() - start_time));
+			   "_slurm_rpc_job_step_complete JobId=%u, time=%ld, error=%s", 
+			   complete_job_step_msg->job_id, 
+			   (long) (clock() - start_time), 
+			   slurm_strerror(error_code));
 			slurm_send_rc_msg(msg, error_code);
 		} else {
 			info(
-			   "_slurm_rpc_job_step_complete success for JobId=%u, time=%ld", 
+			   "_slurm_rpc_job_step_complete JobId=%u, time=%ld, success", 
 			   complete_job_step_msg->job_id, 
 			   (long) (clock() - start_time));
 			slurm_send_rc_msg(msg, SLURM_SUCCESS);
@@ -1096,14 +1091,15 @@ static void _slurm_rpc_job_step_complete(slurm_msg_t * msg)
 		/* return result */
 		if (error_code) {
 			info(
-			   "_slurm_rpc_job_step_complete error %d for %u.%u, time=%ld", 
-			   error_code, complete_job_step_msg->job_id, 
+			   "_slurm_rpc_job_step_complete StepId=%u.%u, time=%ld, error=%s", 
+			   complete_job_step_msg->job_id, 
 			   complete_job_step_msg->job_step_id, 
-			   (long) (clock() - start_time));
+			   (long) (clock() - start_time),
+			   slurm_strerror(error_code));
 			slurm_send_rc_msg(msg, error_code);
 		} else {
 			info(
-			   "_slurm_rpc_job_step_complete success for %u.%u, time=%ld", 
+			   "_slurm_rpc_job_step_complete StepId=%u.%u, time=%ld, success", 
 			   complete_job_step_msg->job_id, 
 			   complete_job_step_msg->job_step_id, 
 			   (long) (clock() - start_time));
@@ -1119,7 +1115,7 @@ static void _slurm_rpc_job_step_get_info(slurm_msg_t * msg)
 	clock_t start_time;
 	void *resp_buffer = NULL;
 	int resp_buffer_size = 0;
-	int error_code = 0;
+	int error_code = SLURM_SUCCESS;
 	job_step_info_request_msg_t *request =
 	    (job_step_info_request_msg_t *) msg->data;
 	/* Locks: Read job */
@@ -1153,8 +1149,9 @@ static void _slurm_rpc_job_step_get_info(slurm_msg_t * msg)
 			   (long) (clock() - start_time));
 		else if (error_code)
 			error
-			    ("_slurm_rpc_job_step_get_info, error %d, time=%ld",
-			     error_code, (long) (clock() - start_time));
+			    ("_slurm_rpc_job_step_get_info, time=%ld, error=%s",
+			     (long) (clock() - start_time),
+			     slurm_strerror(error_code));
 	}
 
 	if (error_code)
@@ -1182,9 +1179,8 @@ static void _slurm_rpc_update_job(slurm_msg_t * msg)
 	clock_t start_time;
 	job_desc_msg_t *job_desc_msg = (job_desc_msg_t *) msg->data;
 	/* Locks: Write job, read node, read partition */
-	slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK,
-		READ_LOCK, READ_LOCK
-	};
+	slurmctld_lock_t job_write_lock = { 
+		NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
 	uid_t uid;
 
 	start_time = clock();
@@ -1199,9 +1195,9 @@ static void _slurm_rpc_update_job(slurm_msg_t * msg)
 	/* return result */
 	if (error_code) {
 		error(
-		     "_slurm_rpc_update_job error %d for job id %u, time=%ld",
-		     error_code, job_desc_msg->job_id,
-		     (long) (clock() - start_time));
+		     "_slurm_rpc_update_job JobID=%u, time=%ld, error=%s",
+		     job_desc_msg->job_id, (long) (clock() - start_time),
+		     slurm_strerror(error_code));
 		slurm_send_rc_msg(msg, error_code);
 	} else {
 		info(
@@ -1219,14 +1215,13 @@ static void _slurm_rpc_update_job(slurm_msg_t * msg)
 static void _slurm_rpc_update_node(slurm_msg_t * msg)
 {
 	/* init */
-	int error_code = 0;
+	int error_code = SLURM_SUCCESS;
 	clock_t start_time;
 	update_node_msg_t *update_node_msg_ptr =
 	    			(update_node_msg_t *) msg->data;
 	/* Locks: Write node */
-	slurmctld_lock_t node_write_lock = { NO_LOCK, NO_LOCK,
-		WRITE_LOCK, NO_LOCK
-	};
+	slurmctld_lock_t node_write_lock = { 
+		NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK };
 	uid_t uid;
 
 	start_time = clock();
@@ -1238,7 +1233,7 @@ static void _slurm_rpc_update_node(slurm_msg_t * msg)
 		      (unsigned int) uid);
 	}
 
-	if (error_code == 0) {
+	if (error_code == SLURM_SUCCESS) {
 		/* do RPC call */
 		lock_slurmctld(node_write_lock);
 		error_code = update_node(update_node_msg_ptr);
@@ -1247,10 +1242,10 @@ static void _slurm_rpc_update_node(slurm_msg_t * msg)
 
 	/* return result */
 	if (error_code) {
-		error
-		    ("_slurm_rpc_update_node error %d for node %s, time=%ld",
-		     error_code, update_node_msg_ptr->node_names,
-		     (long) (clock() - start_time));
+		error("_slurm_rpc_update_node node=%s, time=%ld, error=%s",
+		      update_node_msg_ptr->node_names,
+		      (long) (clock() - start_time), 
+		      slurm_strerror(error_code));
 		slurm_send_rc_msg(msg, error_code);
 	} else {
 		info(
@@ -1271,13 +1266,12 @@ static void _slurm_rpc_update_node(slurm_msg_t * msg)
 static void _slurm_rpc_update_partition(slurm_msg_t * msg)
 {
 	/* init */
-	int error_code = 0;
+	int error_code = SLURM_SUCCESS;
 	clock_t start_time;
 	update_part_msg_t *part_desc_ptr = (update_part_msg_t *) msg->data;
 	/* Locks: Read node, write partition */
-	slurmctld_lock_t part_write_lock = { NO_LOCK, NO_LOCK,
-		READ_LOCK, WRITE_LOCK
-	};
+	slurmctld_lock_t part_write_lock = { 
+		NO_LOCK, NO_LOCK, READ_LOCK, WRITE_LOCK };
 	uid_t uid;
 
 	start_time = clock();
@@ -1290,7 +1284,7 @@ static void _slurm_rpc_update_partition(slurm_msg_t * msg)
 		     (unsigned int) uid);
 	}
 
-	if (error_code == 0) {
+	if (error_code == SLURM_SUCCESS) {
 		/* do RPC call */
 		lock_slurmctld(part_write_lock);
 		error_code = update_part(part_desc_ptr);
@@ -1300,9 +1294,10 @@ static void _slurm_rpc_update_partition(slurm_msg_t * msg)
 	/* return result */
 	if (error_code) {
 		error(
-		     "_slurm_rpc_update_partition error %d for partition %s, time=%ld",
-		     error_code, part_desc_ptr->name,
-		     (long) (clock() - start_time));
+		     "_slurm_rpc_update_partition partition=%s, time=%ld, error=%s",
+		     part_desc_ptr->name,
+		     (long) (clock() - start_time),
+		     slurm_strerror(error_code));
 		slurm_send_rc_msg(msg, error_code);
 	} else {
 		info(
@@ -1321,16 +1316,15 @@ static void _slurm_rpc_update_partition(slurm_msg_t * msg)
 static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 {
 	/* init */
-	int error_code = 0;
+	int error_code = SLURM_SUCCESS;
 	clock_t start_time;
 	uint32_t job_id;
 	slurm_msg_t response_msg;
 	submit_response_msg_t submit_msg;
 	job_desc_msg_t *job_desc_msg = (job_desc_msg_t *) msg->data;
 	/* Locks: Write job, read node, read partition */
-	slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK,
-		READ_LOCK, READ_LOCK
-	};
+	slurmctld_lock_t job_write_lock = { 
+		NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
 	uid_t uid;
 
 	start_time = clock();
@@ -1345,7 +1339,7 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 		error("Security violation, SUBMIT_JOB from uid %u",
 		      (unsigned int) uid);
 	}
-	if (error_code == 0) {
+	if (error_code == SLURM_SUCCESS) {
 		lock_slurmctld(job_write_lock);
 		error_code = job_allocate(job_desc_msg, &job_id,
 					  (char **) NULL,
@@ -1358,8 +1352,9 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 
 	/* return result */
 	if (error_code) {
-		info("_slurm_rpc_submit_batch_job error %d, time=%ld",
-		     error_code, (long) (clock() - start_time));
+		info("_slurm_rpc_submit_batch_job time=%ld, error=%s",
+		     (long) (clock() - start_time),
+		     slurm_strerror(error_code));
 		slurm_send_rc_msg(msg, error_code);
 	} else {
 		info(
@@ -1380,7 +1375,7 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
 {
 	/* init */
-	int error_code = 0;
+	int error_code = SLURM_SUCCESS;
 	slurm_msg_t response_msg;
 	clock_t start_time;
 	job_desc_msg_t *job_desc_msg = (job_desc_msg_t *) msg->data;
@@ -1390,9 +1385,8 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
 	uint32_t job_id;
 	resource_allocation_response_msg_t alloc_msg;
 	/* Locks: Write job, write node, read partition */
-	slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK,
-		WRITE_LOCK, READ_LOCK
-	};
+	slurmctld_lock_t job_write_lock = { 
+		NO_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK };
 	uid_t uid;
 	uint16_t node_cnt;
 	slurm_addr *node_addr;
@@ -1409,7 +1403,7 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
 		error("Security violation, RESOURCE_ALLOCATE from uid %u",
 		      (unsigned int) uid);
 	}
-	if (error_code == 0) {
+	if (error_code == SLURM_SUCCESS) {
 		int immediate = job_desc_msg->immediate;
 		lock_slurmctld(job_write_lock);
 		error_code = job_allocate(job_desc_msg, &job_id,
@@ -1422,8 +1416,9 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
 
 	/* return result */
 	if (error_code) {
-		info("_slurm_rpc_allocate_resources error %d, time=%ld", 
-		     error_code, (long) (clock() - start_time));
+		info("_slurm_rpc_allocate_resources time=%ld, error=%s ", 
+		     (long) (clock() - start_time), 
+		     slurm_strerror(error_code));
 		slurm_send_rc_msg(msg, error_code);
 	} else {
 		info(
@@ -1452,7 +1447,7 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
 static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg)
 {
 	/* init */
-	int error_code = 0;
+	int error_code = SLURM_SUCCESS;
 	slurm_msg_t response_msg;
 	clock_t start_time;
 	job_desc_msg_t *job_desc_msg = (job_desc_msg_t *) msg->data;
@@ -1464,8 +1459,8 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg)
 	struct step_record *step_rec;
 	job_step_create_request_msg_t req_step_msg;
 	/* Locks: Write job, write node, read partition */
-	slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK,
-		WRITE_LOCK, READ_LOCK };
+	slurmctld_lock_t job_write_lock = { 
+		NO_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK };
 	uid_t uid;
 	uint16_t node_cnt;
 	slurm_addr *node_addr;
@@ -1484,7 +1479,7 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg)
 		     (unsigned int) uid);
 	}
 
-	if (error_code == 0) {
+	if (error_code == SLURM_SUCCESS) {
 		int immediate = true; /* job_desc_msg->immediate == true */
 		lock_slurmctld(job_write_lock);
 		error_code = job_allocate(job_desc_msg, &job_id,
@@ -1497,9 +1492,9 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg)
 	/* return result */
 	if (error_code) {
 		unlock_slurmctld(job_write_lock);
-		info(
-		   "_slurm_rpc_allocate_and_run error %d allocating resources, time=%ld", 
-		   error_code, (long) (clock() - start_time));
+		info("_slurm_rpc_allocate_and_run time=%ld, error=%s", 
+		     (long) (clock() - start_time), 
+		     slurm_strerror(error_code));
 		slurm_send_rc_msg(msg, error_code);
 		return;
 	}
@@ -1516,8 +1511,9 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg)
 		job_complete(job_id, job_desc_msg->user_id, false, 0);
 		unlock_slurmctld(job_write_lock);
 		info(
-		   "_slurm_rpc_allocate_and_run error %d creating job step, time=%ld", 
-		   error_code, (long) (clock() - start_time));
+		   "_slurm_rpc_allocate_and_run creating job step, time=%ld, error=%s", 
+		   (long) (clock() - start_time),
+		   slurm_strerror(error_code));
 		slurm_send_rc_msg(msg, error_code);
 	} else {
 
@@ -1555,7 +1551,7 @@ static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg)
 /* _slurm_rpc_old_job_alloc - process RPC to get details on existing job */
 static void _slurm_rpc_old_job_alloc(slurm_msg_t * msg)
 {
-	int error_code = 0;
+	int error_code = SLURM_SUCCESS;
 	slurm_msg_t response_msg;
 	clock_t start_time;
 	old_job_alloc_msg_t *job_desc_msg =
@@ -1565,8 +1561,8 @@ static void _slurm_rpc_old_job_alloc(slurm_msg_t * msg)
 	uint32_t *cpus_per_node = NULL, *cpu_count_reps = NULL;
 	resource_allocation_response_msg_t alloc_msg;
 	/* Locks: Read job, read node */
-	slurmctld_lock_t job_read_lock = { NO_LOCK, READ_LOCK,
-		READ_LOCK, NO_LOCK };
+	slurmctld_lock_t job_read_lock = { 
+		NO_LOCK, READ_LOCK, READ_LOCK, NO_LOCK };
 	uint16_t node_cnt;
 	slurm_addr *node_addr;
 	uid_t uid;
@@ -1581,7 +1577,7 @@ static void _slurm_rpc_old_job_alloc(slurm_msg_t * msg)
 		error("Security violation, RESOURCE_ALLOCATE from uid %u",
 		      (unsigned int) uid);
 	}
-	if (error_code == 0) {
+	if (error_code == SLURM_SUCCESS) {
 		lock_slurmctld(job_read_lock);
 		error_code = old_job_info(job_desc_msg->uid,
 					  job_desc_msg->job_id,
@@ -1593,10 +1589,11 @@ static void _slurm_rpc_old_job_alloc(slurm_msg_t * msg)
 
 	/* return result */
 	if (error_code) {
-		info(
-		   "_slurm_rpc_old_job_alloc error %d getting info, job=%u, uid=%u, time=%ld", 
-		   error_code, job_desc_msg->job_id, job_desc_msg->uid, 
-		   (long) (clock() - start_time));
+		debug(
+		   "_slurm_rpc_old_job_alloc: JobId=%u, uid=%u, time=%ld, error=%s", 
+		   job_desc_msg->job_id, job_desc_msg->uid, 
+		   (long) (clock() - start_time), 
+		   slurm_strerror(error_code));
 		slurm_send_rc_msg(msg, error_code);
 	} else {
 		info(
@@ -1625,7 +1622,7 @@ static void _slurm_rpc_old_job_alloc(slurm_msg_t * msg)
 static void _slurm_rpc_job_will_run(slurm_msg_t * msg)
 {
 	/* init */
-	int error_code = 0;
+	int error_code = SLURM_SUCCESS;
 	clock_t start_time;
 	uint16_t num_cpu_groups = 0;
 	uint32_t *cpus_per_node = NULL, *cpu_count_reps = NULL;
@@ -1633,9 +1630,8 @@ static void _slurm_rpc_job_will_run(slurm_msg_t * msg)
 	job_desc_msg_t *job_desc_msg = (job_desc_msg_t *) msg->data;
 	char *node_list_ptr = NULL;
 	/* Locks: Write job, read node, read partition */
-	slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK,
-		READ_LOCK, READ_LOCK
-	};
+	slurmctld_lock_t job_write_lock = { 
+		NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
 	uid_t uid;
 
 	start_time = clock();
@@ -1651,7 +1647,7 @@ static void _slurm_rpc_job_will_run(slurm_msg_t * msg)
 		      (unsigned int) uid);
 	}
 
-	if (error_code == 0) {
+	if (error_code == SLURM_SUCCESS) {
 		lock_slurmctld(job_write_lock);
 		error_code = job_allocate(job_desc_msg, &job_id,
 					  &node_list_ptr, &num_cpu_groups,
@@ -1663,8 +1659,9 @@ static void _slurm_rpc_job_will_run(slurm_msg_t * msg)
 
 	/* return result */
 	if (error_code) {
-		info("_slurm_rpc_job_will_run error %d, time=%ld",
-		     error_code, (long) (clock() - start_time));
+		info("_slurm_rpc_job_will_run time=%ld, error=%s",
+		     (long) (clock() - start_time),
+		     slurm_strerror(error_code));
 		slurm_send_rc_msg(msg, error_code);
 	} else {
 		info("_slurm_rpc_job_will_run success for , time=%ld",
@@ -1699,9 +1696,8 @@ static void _slurm_rpc_reconfigure_controller(slurm_msg_t * msg)
 	int error_code = SLURM_SUCCESS;
 	clock_t start_time;
 	/* Locks: Write configuration, job, node and partition */
-	slurmctld_lock_t config_write_lock = { WRITE_LOCK, WRITE_LOCK,
-		WRITE_LOCK, WRITE_LOCK
-	};
+	slurmctld_lock_t config_write_lock = { 
+		WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK };
 	uid_t uid;
 
 	start_time = clock();
@@ -1717,10 +1713,8 @@ static void _slurm_rpc_reconfigure_controller(slurm_msg_t * msg)
 	if (error_code == SLURM_SUCCESS) {
 		lock_slurmctld(config_write_lock);
 		error_code = read_slurm_conf(0);
-		if (error_code == SLURM_SUCCESS) {
-			reset_job_bitmaps();
+		if (error_code == SLURM_SUCCESS)
 			msg_to_slurmd(REQUEST_RECONFIGURE);
-		}
 		unlock_slurmctld(config_write_lock);
 	}
 	if (error_code == SLURM_SUCCESS) {  /* Stuff to do after unlock */
@@ -1735,12 +1729,13 @@ static void _slurm_rpc_reconfigure_controller(slurm_msg_t * msg)
 	/* return result */
 	if (error_code) {
 		error(
-		     "_slurm_rpc_reconfigure_controller error %d, time=%ld",
-		     error_code, (long) (clock() - start_time));
+		     "_slurm_rpc_reconfigure_controller: time=%ld, error=%s",
+		     (long) (clock() - start_time),
+		     slurm_strerror(error_code));
 		slurm_send_rc_msg(msg, error_code);
 	} else {
 		info(
-		   "_slurm_rpc_reconfigure_controller completed, time=%ld", 
+		   "_slurm_rpc_reconfigure_controller: completed, time=%ld", 
 		   (long) (clock() - start_time));
 		slurm_send_rc_msg(msg, SLURM_SUCCESS);
 		schedule();
@@ -1752,14 +1747,13 @@ static void _slurm_rpc_reconfigure_controller(slurm_msg_t * msg)
 /* _slurm_rpc_shutdown_controller - process RPC to shutdown slurmctld */
 static void _slurm_rpc_shutdown_controller(slurm_msg_t * msg)
 {
-	int error_code = 0, i;
+	int error_code = SLURM_SUCCESS, i;
 	uint16_t core_arg = 0;
 	shutdown_msg_t *shutdown_msg = (shutdown_msg_t *) msg->data;
 	uid_t uid;
 	/* Locks: Read node */
-	slurmctld_lock_t node_read_lock = { NO_LOCK, NO_LOCK,
-		READ_LOCK, NO_LOCK
-	};
+	slurmctld_lock_t node_read_lock = { 
+		NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK };
 
 	uid = g_slurm_auth_get_uid(msg->cred);
 	if ((uid != 0) && (uid != getuid())) {
@@ -1803,7 +1797,7 @@ static void _slurm_rpc_shutdown_controller(slurm_msg_t * msg)
 			sleep(1);
 	}
 	slurm_send_rc_msg(msg, error_code);
-	if ((error_code == 0) && core_arg)
+	if ((error_code == SLURM_SUCCESS) && core_arg)
 		fatal("Aborting per RPC request");
 }
 
@@ -1811,7 +1805,7 @@ static void _slurm_rpc_shutdown_controller(slurm_msg_t * msg)
  *	slurmctld */
 static void _slurm_rpc_shutdown_controller_immediate(slurm_msg_t * msg)
 {
-	int error_code = 0;
+	int error_code = SLURM_SUCCESS;
 	uid_t uid;
 
 	uid = g_slurm_auth_get_uid(msg->cred);
@@ -1824,7 +1818,7 @@ static void _slurm_rpc_shutdown_controller_immediate(slurm_msg_t * msg)
 
 	/* do RPC call */
 	/* No op: just used to knock loose accept RPC thread */
-	if (error_code == 0)
+	if (error_code == SLURM_SUCCESS)
 		debug("Performing RPC: REQUEST_SHUTDOWN_IMMEDIATE");
 }
 
@@ -1833,7 +1827,7 @@ static void _slurm_rpc_shutdown_controller_immediate(slurm_msg_t * msg)
 static void _slurm_rpc_job_step_create(slurm_msg_t * msg)
 {
 	/* init */
-	int error_code = 0;
+	int error_code = SLURM_SUCCESS;
 	clock_t start_time;
 
 	slurm_msg_t resp;
@@ -1842,9 +1836,8 @@ static void _slurm_rpc_job_step_create(slurm_msg_t * msg)
 	job_step_create_request_msg_t *req_step_msg =
 	    (job_step_create_request_msg_t *) msg->data;
 	/* Locks: Write jobs, read nodes */
-	slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK,
-		READ_LOCK, NO_LOCK
-	};
+	slurmctld_lock_t job_write_lock = { 
+		NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK };
 	uid_t uid;
 
 	start_time = clock();
@@ -1859,7 +1852,7 @@ static void _slurm_rpc_job_step_create(slurm_msg_t * msg)
 		     (unsigned int) uid);
 	}
 
-	if (error_code == 0) {
+	if (error_code == SLURM_SUCCESS) {
 		/* issue the RPC */
 		lock_slurmctld(job_write_lock);
 		error_code = step_create(req_step_msg, &step_rec, false);
@@ -1868,12 +1861,12 @@ static void _slurm_rpc_job_step_create(slurm_msg_t * msg)
 	/* return result */
 	if (error_code) {
 		unlock_slurmctld(job_write_lock);
-		info("_slurm_rpc_job_step_create error %s, time=%ld",
-		     slurm_strerror(error_code),
-		     (long) (clock() - start_time));
+		info("_slurm_rpc_job_step_create: time=%ld error=%s",
+		     (long) (clock() - start_time),
+		     slurm_strerror(error_code));
 		slurm_send_rc_msg(msg, error_code);
 	} else {
-		info("_slurm_rpc_job_step_create %u.%u success time=%ld",
+		info("_slurm_rpc_job_step_create: %u.%u success time=%ld",
 		     step_rec->job_ptr->job_id, step_rec->step_id,
 		     (long) (clock() - start_time));
 
@@ -1909,9 +1902,8 @@ static void _slurm_rpc_node_registration(slurm_msg_t * msg)
 	slurm_node_registration_status_msg_t *node_reg_stat_msg =
 	    (slurm_node_registration_status_msg_t *) msg->data;
 	/* Locks: Write job and node */
-	slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK,
-		WRITE_LOCK, NO_LOCK
-	};
+	slurmctld_lock_t job_write_lock = { 
+		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
 	uid_t uid;
 
 	start_time = clock();
@@ -1922,7 +1914,7 @@ static void _slurm_rpc_node_registration(slurm_msg_t * msg)
 		error("Security violation,  NODE_REGISTER RPC from uid %u",
 		      (unsigned int) uid);
 	}
-	if (error_code == 0) {
+	if (error_code == SLURM_SUCCESS) {
 		/* do RPC call */
 		lock_slurmctld(job_write_lock);
 		validate_jobs_on_node(node_reg_stat_msg->node_name,
@@ -1944,9 +1936,10 @@ static void _slurm_rpc_node_registration(slurm_msg_t * msg)
 	/* return result */
 	if (error_code) {
 		error(
-		     "_slurm_rpc_node_registration error %d for %s, time=%ld",
-		     error_code, node_reg_stat_msg->node_name,
-		     (long) (clock() - start_time));
+		     "_slurm_rpc_node_registration node=%s, time=%ld, error=%s",
+		     node_reg_stat_msg->node_name,
+		     (long) (clock() - start_time),
+		     slurm_strerror(error_code));
 		slurm_send_rc_msg(msg, error_code);
 	} else {
 		info(
@@ -2188,7 +2181,6 @@ static void _run_backup(void)
 
 	if (read_slurm_conf(1))	/* Recover all state */
 		fatal("Unable to recover slurm state");
-	reset_job_bitmaps();
 	shutdown_time = (time_t) 0;
 	return;
 }
@@ -2279,7 +2271,7 @@ static void *_background_rpc_mgr(void *no_data)
 			error("slurm_receive_msg error %m");
 		else {
 			error_code = _background_process_msg(msg);
-			if ((error_code == 0) &&
+			if ((error_code == SLURM_SUCCESS) &&
 			    (msg->msg_type == REQUEST_SHUTDOWN_IMMEDIATE))
 				done_flag = true;
 		}
@@ -2299,7 +2291,7 @@ static void *_background_rpc_mgr(void *no_data)
 /* _background_process_msg - process an RPC to the backup_controller */
 static int _background_process_msg(slurm_msg_t * msg)
 {
-	int error_code = 0;
+	int error_code = SLURM_SUCCESS;
 	uid_t uid;
 
 	uid = g_slurm_auth_get_uid(msg->cred);
@@ -2309,7 +2301,7 @@ static int _background_process_msg(slurm_msg_t * msg)
 		error_code = ESLURM_USER_ID_MISSING;
 	}
 
-	if (error_code == 0) {
+	if (error_code == SLURM_SUCCESS) {
 		if (msg->msg_type == REQUEST_SHUTDOWN_IMMEDIATE) {
 			debug3
 			    ("Performing RPC: REQUEST_SHUTDOWN_IMMEDIATE");
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index c4c6b298af0f8195d03f3aa5fc5cb6634e724ede..1ac821e1df0dc83ff8628c67d680e1e00976f19a 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -56,6 +56,7 @@
 #include "src/common/list.h"
 #include "src/common/macros.h"
 #include "src/common/pack.h"
+#include "src/common/xassert.h"
 #include "src/common/xstring.h"
 
 #include "src/slurmctld/agent.h"
@@ -108,6 +109,7 @@ static void _excise_node_from_job(struct job_record *job_record_ptr,
 				  struct node_record *node_record_ptr);
 static int  _find_batch_dir(void *x, void *key);
 static void _get_batch_job_dir_ids(List batch_dirs);
+static void _job_timed_out(struct job_record *job_ptr);
 static int  _job_create(job_desc_msg_t * job_specs, uint32_t * new_job_id,
 		        int allocate, int will_run,
 		        struct job_record **job_rec_ptr, uid_t submit_uid);
@@ -123,6 +125,8 @@ static void _read_data_array_from_file(char *file_name, char ***data,
 				       uint16_t * size);
 static void _read_data_from_file(char *file_name, char **data);
 static void _remove_defunct_batch_dirs(List batch_dirs);
+static void _reset_detail_bitmaps(struct job_record *job_ptr);
+static void _reset_step_bitmaps(struct job_record *job_ptr);
 static void _set_job_id(struct job_record *job_ptr);
 static void _set_job_prio(struct job_record *job_ptr);
 static void _signal_job_on_node(uint32_t job_id, uint16_t step_id,
@@ -168,17 +172,17 @@ struct job_record *create_job_record(int *error_code)
 	job_details_point =
 	    (struct job_details *) xmalloc(sizeof(struct job_details));
 
-	job_record_point->magic = JOB_MAGIC;
+	xassert (job_record_point->magic = JOB_MAGIC); /* sets value */
 	job_record_point->details = job_details_point;
 	job_record_point->step_list = list_create(NULL);
 	if (job_record_point->step_list == NULL)
-		fatal("list_create can not allocate memory");
+		fatal("memory allocation failure");
 
-	job_details_point->magic = DETAILS_MAGIC;
+	xassert (job_details_point->magic = DETAILS_MAGIC); /* sets value */
 	job_details_point->submit_time = time(NULL);
 
-	if (list_append(job_list, job_record_point) == NULL)
-		fatal("create_job_record: unable to allocate memory");
+	if (list_append(job_list, job_record_point) == 0)
+		fatal("list_append memory allocation failure");
 
 	return job_record_point;
 }
@@ -196,9 +200,7 @@ void delete_job_details(struct job_record *job_entry)
 		return;
 
 	_delete_job_desc_files(job_entry->job_id);
-	if (job_entry->details->magic != DETAILS_MAGIC)
-		fatal
-		    ("delete_job_details: passed invalid job details pointer");
+	xassert (job_entry->details->magic == DETAILS_MAGIC);
 	xfree(job_entry->details->req_nodes);
 	xfree(job_entry->details->exc_nodes);
 	FREE_NULL_BITMAP(job_entry->details->req_node_bitmap);
@@ -259,8 +261,7 @@ int dump_all_job_state(void)
 	job_record_iterator = list_iterator_create(job_list);
 	while ((job_record_point =
 		(struct job_record *) list_next(job_record_iterator))) {
-		if (job_record_point->magic != JOB_MAGIC)
-			fatal("dump_all_job: job integrity is bad");
+		xassert (job_record_point->magic == JOB_MAGIC);
 		_dump_job_state(job_record_point, buffer);
 	}
 	unlock_slurmctld(job_read_lock);
@@ -405,8 +406,7 @@ static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer)
 	/* Dump job details, if available */
 	detail_ptr = dump_job_ptr->details;
 	if (detail_ptr) {
-		if (detail_ptr->magic != DETAILS_MAGIC)
-			fatal("dump_all_job: job detail integrity is bad");
+		xassert (detail_ptr->magic == DETAILS_MAGIC);
 		pack16((uint16_t) DETAILS_FLAG, buffer);
 		_dump_job_details(detail_ptr, buffer);
 	} else
@@ -459,7 +459,8 @@ static int _load_job_state(Buf buffer)
 	safe_unpackstr_xmalloc(&alloc_node, &name_len, buffer);
 
 	/* validity test as possible */
-	if ((job_state >= JOB_END) || (batch_flag > 1)) {
+	if (((job_state & (~JOB_COMPLETING)) >= JOB_END) || 
+	    (batch_flag > 1)) {
 		error("Invalid data for job %u: job_state=%u batch_flag=%u",
 		      job_id, job_state, batch_flag);
 		goto unpack_error;
@@ -479,16 +480,16 @@ static int _load_job_state(Buf buffer)
 		      nodes, job_id);
 		goto unpack_error;
 	}
+	part_ptr = list_find_first(part_list, &list_find_part,
+				   partition);
+	if (part_ptr == NULL) {
+		error("Invalid partition (%s) for job_id %u", 
+		     partition, job_id);
+		goto unpack_error;
+	}
 
 	job_ptr = find_job_record(job_id);
 	if (job_ptr == NULL) {
-		part_ptr = list_find_first(part_list, &list_find_part,
-					   partition);
-		if (part_ptr == NULL) {
-			info("Invalid partition (%s) for job_id %u", 
-			     partition, job_id);
-			goto unpack_error;
-		}
 		job_ptr = create_job_record(&error_code);
 		if (error_code) {
 			error("Create job entry failed for job_id %u",
@@ -496,8 +497,6 @@ static int _load_job_state(Buf buffer)
 			goto unpack_error;
 		}
 		job_ptr->job_id = job_id;
-		strncpy(job_ptr->partition, partition, MAX_NAME_LEN);
-		job_ptr->part_ptr = part_ptr;
 		_add_job_hash(job_ptr);
 	}
 
@@ -517,20 +516,28 @@ static int _load_job_state(Buf buffer)
 	job_ptr->alloc_sid    = alloc_sid;
 	job_ptr->start_time   = start_time;
 	job_ptr->end_time     = end_time;
-	job_ptr->time_last_active = time(NULL);
 	job_ptr->job_state    = job_state;
 	job_ptr->next_step_id = next_step_id;
+	job_ptr->time_last_active = time(NULL);
 	strncpy(job_ptr->name, name, MAX_NAME_LEN);
 	xfree(name);
-	job_ptr->nodes = nodes;
+	xfree(job_ptr->nodes);
+	job_ptr->nodes        = nodes;
 	nodes = NULL;	/* reused, nothing left to free */
-	job_ptr->alloc_node = alloc_node;
+	xfree(job_ptr->alloc_node);
+	job_ptr->alloc_node   = alloc_node;
 	alloc_node = NULL;	/* reused, nothing left to free */
-	job_ptr->node_bitmap = node_bitmap;
+	FREE_NULL_BITMAP(job_ptr->node_bitmap);
+	job_ptr->node_bitmap  = node_bitmap;
+	strncpy(job_ptr->partition, partition, MAX_NAME_LEN);
 	xfree(partition);
+	job_ptr->part_ptr = part_ptr;
 	job_ptr->kill_on_node_fail = kill_on_node_fail;
 	job_ptr->kill_on_step_done = kill_on_step_done;
 	job_ptr->batch_flag        = batch_flag;
+	build_node_details(job_ptr);	/* set: num_cpu_groups, cpus_per_node, 
+					 *	cpu_count_reps, node_cnt, and
+					 *	node_addr */
 	info("recovered job id %u", job_id);
 
 	safe_unpack16(&step_flag, buffer);
@@ -639,6 +646,17 @@ static int _load_job_details(struct job_record *job_ptr, Buf buffer)
 		goto unpack_error;
 	}
 
+	/* free any left-over data */
+	xfree(job_ptr->details->req_nodes);
+	FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
+	xfree(job_ptr->details->exc_nodes);
+	FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
+	xfree(job_ptr->details->features);
+	xfree(job_ptr->details->err);
+	xfree(job_ptr->details->in);
+	xfree(job_ptr->details->out);
+	xfree(job_ptr->details->work_dir);
+
 	/* now put the details into the job record */
 	memcpy(&job_ptr->details->credential, credential_ptr,
 	       sizeof(job_ptr->details->credential));
@@ -661,9 +679,7 @@ static int _load_job_details(struct job_record *job_ptr, Buf buffer)
 	job_ptr->details->in = in;
 	job_ptr->details->out = out;
 	job_ptr->details->work_dir = work_dir;
-	build_node_details(job_ptr);	/* set: num_cpu_groups, cpus_per_node, 
-					 *	cpu_count_reps, node_cnt, and
-					 *	node_addr */
+
 	return SLURM_SUCCESS;
 
       unpack_error:
@@ -720,13 +736,21 @@ static int _load_step_state(struct job_record *job_ptr, Buf buffer)
 		goto unpack_error;
 	}
 
-	step_ptr = create_step_record(job_ptr);
+	step_ptr = find_step_record(job_ptr, step_id);
+	if (step_ptr == NULL)
+		step_ptr = create_step_record(job_ptr);
 	if (step_ptr == NULL)
 		return SLURM_FAILURE;
-	step_ptr->step_id = step_id;
+
+	/* free any left-over values */
+	xfree(step_ptr->step_node_list);
+	FREE_NULL_BITMAP(step_ptr->step_node_bitmap);
+
+	/* set new values */
+	step_ptr->step_id      = step_id;
 	step_ptr->cyclic_alloc = cyclic_alloc;
-	step_ptr->num_tasks = num_tasks;
-	step_ptr->start_time = start_time;
+	step_ptr->num_tasks    = num_tasks;
+	step_ptr->start_time   = start_time;
 	step_ptr->step_node_list = step_node_list;
 	if (step_node_list)
 		(void) node_name2bitmap(step_node_list, 
@@ -824,8 +848,8 @@ struct job_record *find_running_job_by_node_name(char *node_name)
 }
 
 /*
- * kill_running_job_by_node_name - Given a node name, deallocate jobs 
- *	from the node or kill them 
+ * kill_running_job_by_node_name - Given a node name, deallocate RUNNING 
+ *	or COMPLETING jobs from the node or kill them 
  * IN node_name - name of a node
  * IN step_test - if true, only kill the job if a step is running on the node
  * RET number of killed jobs
@@ -833,42 +857,56 @@ struct job_record *find_running_job_by_node_name(char *node_name)
 int kill_running_job_by_node_name(char *node_name, bool step_test)
 {
 	ListIterator job_record_iterator;
-	struct job_record *job_record_point;
-	struct node_record *node_record_point;
+	struct job_record *job_ptr;
+	struct node_record *node_ptr;
 	int bit_position;
 	int job_count = 0;
 
-	node_record_point = find_node_record(node_name);
-	if (node_record_point == NULL)	/* No such node */
+	node_ptr = find_node_record(node_name);
+	if (node_ptr == NULL)	/* No such node */
 		return 0;
-	bit_position = node_record_point - node_record_table_ptr;
+	bit_position = node_ptr - node_record_table_ptr;
 
 	job_record_iterator = list_iterator_create(job_list);
-	while ((job_record_point =
+	while ((job_ptr =
 		(struct job_record *) list_next(job_record_iterator))) {
-		if (job_record_point->job_state != JOB_RUNNING)
-			continue;	/* job not active */
-		if (!bit_test(job_record_point->node_bitmap, bit_position))
+		if ((job_ptr->node_bitmap == NULL) ||
+		    (!bit_test(job_ptr->node_bitmap, bit_position)))
 			continue;	/* job not on this node */
-		if (step_test && 
-		    (step_on_node(job_record_point, node_record_point) == 0))
-			continue;
-		error("Running job_id %u on failed node %s",
-		      job_record_point->job_id, node_name);
-		job_count++;
-		if ((job_record_point->details == NULL) ||
-		    (job_record_point->kill_on_node_fail) ||
-		    (job_record_point->node_cnt <= 1)) {
-			job_record_point->job_state = JOB_NODE_FAIL;
-			job_record_point->end_time = time(NULL);
-			deallocate_nodes(job_record_point);
-			delete_all_step_records(job_record_point);
-			delete_job_details(job_record_point);
-		} else {
-			/* Remove node from this job's list */
-			_excise_node_from_job(job_record_point, 
-					      node_record_point);
-			make_node_idle(node_record_point);
+		if (job_ptr->job_state & JOB_COMPLETING) {
+			job_count++;
+			bit_clear(job_ptr->node_bitmap, bit_position);
+			if (job_ptr->node_cnt)
+				(job_ptr->node_cnt)--;
+			else
+				error("node_cnt underflow on JobId=%u", 
+			   	      job_ptr->job_id);
+			if (job_ptr->node_cnt == 0)
+				job_ptr->job_state &= (~JOB_COMPLETING);
+			if (node_ptr->comp_job_cnt)
+				(node_ptr->comp_job_cnt)--;
+			else
+				error("Node %s comp_job_cnt underflow, JobId=%u", 
+				      node_ptr->name, job_ptr->job_id);
+		} else if (job_ptr->job_state == JOB_RUNNING) {
+			if (step_test && 
+			    (step_on_node(job_ptr, node_ptr) == 0))
+				continue;
+			error("Running job_id %u on failed node %s",
+		   	   job_ptr->job_id, node_name);
+			job_count++;
+			if ((job_ptr->details == NULL) ||
+			    (job_ptr->kill_on_node_fail) ||
+			    (job_ptr->node_cnt <= 1)) {
+				job_ptr->job_state = JOB_NODE_FAIL | 
+						     JOB_COMPLETING;
+				job_ptr->end_time = time(NULL);
+				deallocate_nodes(job_ptr);
+				delete_all_step_records(job_ptr);
+			} else {
+				/* Remove node from this job's list */
+				_excise_node_from_job(job_ptr, node_ptr);
+			}
 		}
 
 	}
@@ -883,10 +921,7 @@ int kill_running_job_by_node_name(char *node_name, bool step_test)
 static void _excise_node_from_job(struct job_record *job_record_ptr, 
 				  struct node_record *node_record_ptr)
 {
-	int bit_position;
-
-	bit_position = node_record_ptr - node_record_table_ptr;
-	bit_clear(job_record_ptr->node_bitmap, bit_position);
+	make_node_idle(node_record_ptr, job_record_ptr); /* clear node_bitmap */
 	job_record_ptr->nodes = bitmap2node_name(job_record_ptr->node_bitmap);
 	xfree(job_record_ptr->cpus_per_node);
 	xfree(job_record_ptr->cpu_count_reps);
@@ -982,7 +1017,8 @@ void dump_job_desc(job_desc_msg_t * job_specs)
 /* 
  * init_job_conf - initialize the job configuration tables and values. 
  *	this should be called after creating node information, but 
- *	before creating any job entries.
+ *	before creating any job entries. Pre-existing job entries are 
+ *	left unchanged.
  * RET 0 if no error, otherwise an error code
  * global: last_job_update - time of last job table update
  *	job_list - pointer to global job list
@@ -993,7 +1029,7 @@ int init_job_conf(void)
 		job_count = 0;
 		job_list = list_create(&_list_delete_job);
 		if (job_list == NULL)
-			fatal("init_job_conf: No memory");
+			fatal ("Memory allocation failure");;
 	}
 
 	last_job_update = time(NULL);
@@ -1133,9 +1169,7 @@ int job_signal(uint32_t job_id, uint16_t signal, uid_t uid)
 		return ESLURM_INVALID_JOB_ID;
 	}
 
-	if ((job_ptr->job_state == JOB_FAILED) ||
-	    (job_ptr->job_state == JOB_COMPLETE) ||
-	    (job_ptr->job_state == JOB_TIMEOUT) ||
+	if ((IS_JOB_FINISHED(job_ptr)) ||
 	    (job_ptr->kill_on_step_done & KILL_IN_PROGRESS))
 		return ESLURM_ALREADY_DONE;
 
@@ -1169,18 +1203,16 @@ int job_signal(uint32_t job_id, uint16_t signal, uid_t uid)
 		}
 		list_iterator_destroy (step_record_iterator);
 
-		if ((signal == SIGKILL) &&
-		    ((job_ptr->kill_on_step_done & KILL_IN_PROGRESS) == 0)) {
+		if (signal == SIGKILL) {
 			job_ptr->kill_on_step_done = KILL_IN_PROGRESS;
 			job_ptr->time_last_active = now;
 			last_job_update = now;
 		}
 		if ((signal == SIGKILL) && (step_cnt == 0)) {
 			/* kill job with no active steps */
-			job_ptr->job_state = JOB_COMPLETE;
+			job_ptr->job_state = JOB_COMPLETE | JOB_COMPLETING;
 			job_ptr->end_time = now;
 			deallocate_nodes(job_ptr);
-			delete_job_details(job_ptr);
 		}
 		verbose("job_signal of running job %u successful", job_id);
 		return SLURM_SUCCESS;
@@ -1196,7 +1228,7 @@ int job_signal(uint32_t job_id, uint16_t signal, uid_t uid)
  * IN job_id - id of the job which completed
  * IN uid - user id of user issuing the RPC
  * IN requeue - job should be run again if possible
- * IN job_return_code - job's return code, if set then set state to JOB_FAILED
+ * IN job_return_code - job's return code, if set then set state to FAILED
  * RET - 0 on success, otherwise ESLURM error code 
  * global: job_list - pointer global job list
  *	last_job_update - time of last job table update
@@ -1207,6 +1239,7 @@ job_complete(uint32_t job_id, uid_t uid, bool requeue,
 {
 	struct job_record *job_ptr;
 	time_t now = time(NULL);
+	uint32_t job_comp_flag = 0;
 
 	job_ptr = find_job_record(job_id);
 	if (job_ptr == NULL) {
@@ -1214,10 +1247,7 @@ job_complete(uint32_t job_id, uid_t uid, bool requeue,
 		return ESLURM_INVALID_JOB_ID;
 	}
 
-	if ((job_ptr->job_state == JOB_FAILED) ||
-	    (job_ptr->job_state == JOB_COMPLETE) ||
-	    (job_ptr->job_state == JOB_TIMEOUT) ||
-	    (job_ptr->job_state == JOB_NODE_FAIL))
+	if (IS_JOB_FINISHED(job_ptr))
 		return ESLURM_ALREADY_DONE;
 
 	if ((job_ptr->user_id != uid) && (uid != 0) && (uid != getuid())) {
@@ -1226,31 +1256,31 @@ job_complete(uint32_t job_id, uid_t uid, bool requeue,
 		return ESLURM_USER_ID_MISSING;
 	}
 
-	if (job_ptr->job_state == JOB_PENDING) {
-		verbose("job_complete for job id %u successful", job_id);
-	} else if (job_ptr->job_state == JOB_RUNNING) {
-		deallocate_nodes(job_ptr);
-		verbose("job_complete for job id %u successful", job_id);
-	} else {
-		error("job_complete for job id %u from bad state",
-		      job_id, job_ptr->job_state);
-	}
-
+	if (job_ptr->job_state == JOB_RUNNING)
+		job_comp_flag = JOB_COMPLETING;
 	if (requeue && job_ptr->details && job_ptr->batch_flag) {
-		job_ptr->job_state = JOB_PENDING;
+		job_ptr->job_state = JOB_PENDING | job_comp_flag;
 		info("Requeing job %u", job_ptr->job_id);
 	} else {
 		if (job_return_code)
-			job_ptr->job_state = JOB_FAILED;
-		else if (job_ptr->end_time < now)
-			job_ptr->job_state = JOB_TIMEOUT;
+			job_ptr->job_state = JOB_FAILED   | job_comp_flag;
+		else if (job_comp_flag &&		/* job was running */
+			 (job_ptr->end_time < now))	/* over time limit */
+			job_ptr->job_state = JOB_TIMEOUT  | job_comp_flag;
 		else
-			job_ptr->job_state = JOB_COMPLETE;
+			job_ptr->job_state = JOB_COMPLETE | job_comp_flag;
 		job_ptr->end_time = now;
-		delete_job_details(job_ptr);
 		delete_all_step_records(job_ptr);
 	}
+
 	last_job_update = now;
+	if (job_comp_flag) {	/* job was running */
+		deallocate_nodes(job_ptr);
+		verbose("job_complete for job id %u successful", job_id);
+	} else {
+		verbose("job_complete for job id %u successful", job_id);
+	}
+
 	return SLURM_SUCCESS;
 }
 
@@ -1876,29 +1906,24 @@ void job_time_limit(void)
 	job_record_iterator = list_iterator_create(job_list);
 	while ((job_ptr =
 		(struct job_record *) list_next(job_record_iterator))) {
-		if (job_ptr->magic != JOB_MAGIC)
-			fatal("job_time_limit: job integrity is bad");
-		if ((job_ptr->job_state == JOB_PENDING) ||
-		    (job_ptr->job_state == JOB_FAILED) ||
-		    (job_ptr->job_state == JOB_COMPLETE) ||
-		    (job_ptr->job_state == JOB_TIMEOUT) ||
-		    (job_ptr->job_state == JOB_NODE_FAIL))
+		xassert (job_ptr->magic == JOB_MAGIC);
+		if (job_ptr->job_state != JOB_RUNNING)
 			continue;
 
-		if ((job_ptr->kill_on_step_done & KILL_IN_PROGRESS) &&
-		    (difftime(now, job_ptr->time_last_active) >
-		     JOB_KILL_TIMEOUT)) {
+		if (job_ptr->kill_on_step_done & KILL_IN_PROGRESS) {
+			if (difftime(now, job_ptr->time_last_active) <=
+			    JOB_KILL_TIMEOUT)
+				continue;
+			last_job_update = now;
 			info("Job_id %u not properly terminating, forcing it",
 			     job_ptr->job_id);
 			last_job_update = now;
-			job_ptr->job_state = JOB_TIMEOUT;
 			job_ptr->end_time = time(NULL);
+			job_ptr->job_state = JOB_TIMEOUT | JOB_COMPLETING;
 			deallocate_nodes(job_ptr);
 			delete_all_step_records(job_ptr);
-			delete_job_details(job_ptr);
-		}
-		if (job_ptr->kill_on_step_done & KILL_IN_PROGRESS)
 			continue;
+		}
 
 		if (slurmctld_conf.inactive_limit) {
 			if (job_ptr->step_list &&
@@ -1917,14 +1942,29 @@ void job_time_limit(void)
 		last_job_update = now;
 		info("Time limit exhausted for job_id %u, terminating",
 		     job_ptr->job_id);
-		job_signal(job_ptr->job_id, SIGKILL, 0);
-		if (job_ptr->job_state == JOB_COMPLETE)
-			job_ptr->job_state = JOB_TIMEOUT;
+		_job_timed_out(job_ptr);
 	}
 
 	list_iterator_destroy(job_record_iterator);
 }
 
+/* Terminate a job that has exhausted its time limit */
+static void _job_timed_out(struct job_record *job_ptr)
+{
+#if NEW_TIME_LIMIT_RPC
+	// FIXME
+	// SET UP AND ISSUE NEW RPC TO ALL ALLOCATED NODES,
+	// see deallocate_nodes code for template
+#else
+	job_signal(job_ptr->job_id, SIGKILL, 0);
+#endif
+
+	job_ptr->time_last_active   = time(NULL);
+	job_ptr->job_state          = JOB_TIMEOUT | JOB_COMPLETING;
+	job_ptr->kill_on_step_done &= KILL_IN_PROGRESS;
+	return;
+}
+
 /* _validate_job_desc - validate that a job descriptor for job submit or 
  *	allocate has valid data, set values to defaults as required 
  * IN job_desc_msg - pointer to job descriptor
@@ -1937,7 +1977,7 @@ static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate,
 	if ((job_desc_msg->num_procs == NO_VAL) &&
 	    (job_desc_msg->min_nodes == NO_VAL) &&
 	    (job_desc_msg->req_nodes == NULL)) {
-		info("_validate_job_desc: job failed to specify num_procs, min_nodes or req_nodes");
+		info("Job failed to specify num_procs, min_nodes or req_nodes");
 		return ESLURM_JOB_MISSING_SIZE_SPECIFICATION;
 	}
 	if ((allocate == SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0) &&
@@ -1971,8 +2011,7 @@ static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate,
 		}
 		dup_job_ptr = find_job_record((uint32_t) job_desc_msg->job_id);
 		if (dup_job_ptr && 
-		    ((dup_job_ptr->job_state == JOB_PENDING) ||
-		     (dup_job_ptr->job_state == JOB_RUNNING))) {
+		    (!(IS_JOB_FINISHED(dup_job_ptr)))) {
 			info("attempt re-use active job_id %u", 
 			     job_desc_msg->job_id);
 			return ESLURM_DUPLICATE_JOB_ID;
@@ -2011,9 +2050,8 @@ static void _list_delete_job(void *job_entry)
 
 	job_record_point = (struct job_record *) job_entry;
 	if (job_record_point == NULL)
-		fatal("_list_delete_job: passed null job pointer");
-	if (job_record_point->magic != JOB_MAGIC)
-		fatal("_list_delete_job: passed invalid job pointer");
+		fatal ("_list_delete_job: job_record_point == NULL");
+	xassert (job_record_point->magic == JOB_MAGIC);
 
 	if (job_hash[JOB_HASH_INX(job_record_point->job_id)] ==
 	    job_record_point)
@@ -2070,18 +2108,14 @@ static int _list_find_job_id(void *job_entry, void *key)
  */
 static int _list_find_job_old(void *job_entry, void *key)
 {
-	time_t min_age;
+	time_t min_age = time(NULL) - MIN_JOB_AGE;
+	struct job_record *job_ptr = (struct job_record *)job_entry;
 
-	min_age = time(NULL) - MIN_JOB_AGE;
+	if (job_ptr->end_time > min_age)
+		return 0;	/* Too new to purge */
 
-	if (((struct job_record *) job_entry)->end_time > min_age)
-		return 0;
-
-	if ((((struct job_record *) job_entry)->job_state != JOB_COMPLETE)
-	    && (((struct job_record *) job_entry)->job_state != JOB_FAILED)
-	    && (((struct job_record *) job_entry)->job_state !=
-		JOB_TIMEOUT))
-		return 0;
+	if (!(IS_JOB_FINISHED(job_ptr)))
+		return 0;	/* Still active, can't purge */
 
 	return 1;
 }
@@ -2120,8 +2154,7 @@ pack_all_jobs(char **buffer_ptr, int *buffer_size)
 	job_record_iterator = list_iterator_create(job_list);
 	while ((job_record_point =
 		(struct job_record *) list_next(job_record_iterator))) {
-		if (job_record_point->magic != JOB_MAGIC)
-			fatal("dump_all_job: job integrity is bad");
+		xassert (job_record_point->magic == JOB_MAGIC);
 
 		pack_job(job_record_point, buffer);
 		jobs_packed++;
@@ -2254,47 +2287,91 @@ static int _purge_job_record(uint32_t job_id)
 void reset_job_bitmaps(void)
 {
 	ListIterator job_record_iterator;
-	struct job_record *job_record_point;
-
-	if (job_list == NULL)
-		fatal
-		    ("init_job_conf: list_create can not allocate memory");
+	struct job_record *job_ptr;
+	struct part_record *part_ptr;
 
+	if (job_list == NULL) 
+		fatal ("reset_job_bitmaps: job_list == NULL");
 	job_record_iterator = list_iterator_create(job_list);
-	while ((job_record_point =
+	while ((job_ptr =
 		(struct job_record *) list_next(job_record_iterator))) {
-		if (job_record_point->magic != JOB_MAGIC)
-			fatal("dump_all_job: job integrity is bad");
-		FREE_NULL_BITMAP(job_record_point->node_bitmap);
-		if (job_record_point->nodes) {
-			node_name2bitmap(job_record_point->nodes,
-					 &job_record_point->node_bitmap);
-			if (job_record_point->job_state == JOB_RUNNING)
-				allocate_nodes(job_record_point->
-					       node_bitmap);
+		xassert (job_ptr->magic == JOB_MAGIC);
+		part_ptr = list_find_first(part_list, &list_find_part,
+				   job_ptr->partition);
+		if (part_ptr == NULL) {
+			error("Invalid partition (%s) for job_id %u", 
+		    	      job_ptr->partition, job_ptr->job_id);
+			job_ptr->job_state = JOB_NODE_FAIL;
+		}
+		job_ptr->part_ptr = part_ptr;
 
+		FREE_NULL_BITMAP(job_ptr->node_bitmap);
+		if ((job_ptr->nodes) && 
+		    (node_name2bitmap(job_ptr->nodes, &job_ptr->node_bitmap))) {
+			error("Invalid nodes (%s) for job_id %u", 
+		    	      job_ptr->nodes, job_ptr->job_id);
+			job_ptr->job_state = JOB_NODE_FAIL;
 		}
+		build_node_details(job_ptr);	/* set: num_cpu_groups, 
+						 * cpu_count_reps, node_cnt, 
+						 * cpus_per_node, node_addr */
+		_reset_detail_bitmaps(job_ptr);
+		_reset_step_bitmaps(job_ptr);
 
-		if (job_record_point->details == NULL)
-			continue;
-		FREE_NULL_BITMAP(job_record_point->details->req_node_bitmap);
-		if (job_record_point->details->req_nodes)
-			node_name2bitmap(job_record_point->details->
-					 req_nodes,
-					 &job_record_point->details->
-					 req_node_bitmap);
-		FREE_NULL_BITMAP(job_record_point->details->exc_node_bitmap);
-		if (job_record_point->details->exc_nodes)
-			node_name2bitmap(job_record_point->details->
-					 exc_nodes,
-					 &job_record_point->details->
-					 exc_node_bitmap);
+		if ((job_ptr->kill_on_step_done) &&
+		    (list_count(job_ptr->step_list) <= 1))
+			job_ptr->job_state = JOB_NODE_FAIL;
 	}
 
 	list_iterator_destroy(job_record_iterator);
 	last_job_update = time(NULL);
 }
 
+static void _reset_detail_bitmaps(struct job_record *job_ptr)
+{
+	if (job_ptr->details == NULL) 
+		return;
+
+	FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
+	if ((job_ptr->details->req_nodes) && 
+	    (node_name2bitmap(job_ptr->details->req_nodes, 
+			      &job_ptr->details->req_node_bitmap))) {
+		error("Invalid req_nodes (%s) for job_id %u", 
+	    	      job_ptr->details->req_nodes, job_ptr->job_id);
+		job_ptr->job_state = JOB_NODE_FAIL;
+	}
+
+	FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
+	if ((job_ptr->details->exc_nodes) && 
+	    (node_name2bitmap(job_ptr->details->exc_nodes, 
+			      &job_ptr->details->exc_node_bitmap))) {
+		error("Invalid exc_nodes (%s) for job_id %u", 
+	    	      job_ptr->details->exc_nodes, job_ptr->job_id);
+		job_ptr->job_state = JOB_NODE_FAIL;
+	}
+}
+
+static void _reset_step_bitmaps(struct job_record *job_ptr)
+{
+	ListIterator step_record_iterator;
+	struct step_record *step_ptr;
+
+	step_record_iterator = list_iterator_create (job_ptr->step_list);		
+	while ((step_ptr = (struct step_record *) 
+			   list_next (step_record_iterator))) {
+		if ((step_ptr->step_node_list) && 		
+		    (node_name2bitmap(step_ptr->step_node_list, 
+			      &step_ptr->step_node_bitmap))) {
+			error("Invalid step_node_list (%s) for step_id %u.%u", 
+	   	 	      step_ptr->step_node_list, 
+			      job_ptr->job_id, step_ptr->step_id);
+			delete_step_record (job_ptr, step_ptr->step_id);
+		}
+	}		
+
+	list_iterator_destroy (step_record_iterator);
+	return;
+}
 
 /*
  * _set_job_id - set a default job_id, insure that it is unique
@@ -2307,8 +2384,9 @@ static void _set_job_id(struct job_record *job_ptr)
 	if (job_id_sequence < 0)
 		job_id_sequence = slurmctld_conf.first_job_id;
 
-	if ((job_ptr == NULL) || (job_ptr->magic != JOB_MAGIC))
-		fatal("_set_job_id: invalid job_ptr");
+	if (job_ptr == NULL)
+		fatal ("_set_job_id: job_ptr == NULL");
+	xassert (job_ptr->magic == JOB_MAGIC);
 	if ((job_ptr->partition == NULL)
 	    || (strlen(job_ptr->partition) == 0))
 		fatal("_set_job_id: partition not set");
@@ -2331,8 +2409,9 @@ static void _set_job_id(struct job_record *job_ptr)
  */
 static void _set_job_prio(struct job_record *job_ptr)
 {
-	if ((job_ptr == NULL) || (job_ptr->magic != JOB_MAGIC))
-		fatal("_set_job_prio: invalid job_ptr");
+	if (job_ptr == NULL)
+		fatal ("_set_job_prio: job_ptr == NULL");
+	xassert (job_ptr->magic == JOB_MAGIC);
 	job_ptr->priority = default_prio--;
 }
 
@@ -2356,8 +2435,7 @@ static bool _top_priority(struct job_record *job_ptr)
 	job_record_iterator = list_iterator_create(job_list);
 	while ((job_record_point =
 		(struct job_record *) list_next(job_record_iterator))) {
-		if (job_record_point->magic != JOB_MAGIC)
-			fatal("_top_priority: job integrity is bad");
+		xassert (job_record_point->magic == JOB_MAGIC);
 		if (job_record_point == job_ptr)
 			continue;
 		if (job_record_point->job_state != JOB_PENDING)
@@ -2637,6 +2715,7 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count,
 
 	/* If no job is running here, ensure none are assigned to this node */
 	if (*job_count == 0) {
+		debug("Node %s registered with no jobs", node_name);
 		(void) kill_running_job_by_node_name(node_name, true);
 		return;
 	}
@@ -2758,7 +2837,7 @@ static void _spawn_signal_agent(agent_arg_t *agent_info)
  * old_job_info - get details about an existing job allocation
  * IN uid - job issuing the code
  * IN job_id - ID of job for which info is requested
- * OUT everything else - the job's detains
+ * OUT everything else - the job's details
  */
 int
 old_job_info(uint32_t uid, uint32_t job_id, char **node_list,
@@ -2773,9 +2852,9 @@ old_job_info(uint32_t uid, uint32_t job_id, char **node_list,
 		return ESLURM_INVALID_JOB_ID;
 	if ((uid != 0) && (job_ptr->user_id != uid))
 		return ESLURM_ACCESS_DENIED;
-	if (job_ptr->job_state == JOB_PENDING)
+	if (IS_JOB_PENDING(job_ptr))
 		return ESLURM_JOB_PENDING;
-	if (job_ptr->job_state != JOB_RUNNING)
+	if (IS_JOB_FINISHED(job_ptr))
 		return ESLURM_ALREADY_DONE;
 
 	if (node_list)
@@ -2844,7 +2923,8 @@ static void _get_batch_job_dir_ids(List batch_dirs)
 
 /* All pending batch jobs must have a batch_dir entry, 
  *	otherwise we flag it as FAILED and don't schedule
- * If the batch_dir entry exists for a batch job, remove it */
+ * If the batch_dir entry exists for a PENDING or RUNNING batch job, 
+ *	remove it from the list (of directories to be deleted) */
 static void _validate_job_files(List batch_dirs)
 {
 	ListIterator job_record_iterator;
@@ -2856,13 +2936,13 @@ static void _validate_job_files(List batch_dirs)
 		    (struct job_record *) list_next(job_record_iterator))) {
 		if (!job_ptr->batch_flag)
 			continue;
-		if ((job_ptr->job_state != JOB_PENDING) &&
-		    (job_ptr->job_state != JOB_RUNNING))
+		if (IS_JOB_FINISHED(job_ptr))
 			continue;
 		/* Want to keep this job's files */
 		del_cnt = list_delete_all(batch_dirs, _find_batch_dir, 
 					  &(job_ptr->job_id));
-		if ((del_cnt == 0) && (job_ptr->job_state == JOB_PENDING)) {
+		if ((del_cnt == 0) && 
+		    (job_ptr->job_state == JOB_PENDING)) {
 			error("Script for job %u lost, state set to FAILED",
 			      job_ptr->job_id);
 			job_ptr->job_state = JOB_FAILED;
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index d728053a8f6210a7ec82d6b0aa7d57be9b63dd77..70a00bf7fc2290a35a6185434dae9b57a2073ed3 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -37,6 +37,7 @@
 
 #include "src/common/list.h"
 #include "src/common/macros.h"
+#include "src/common/xassert.h"
 #include "src/common/xstring.h"
 #include "src/slurmctld/agent.h"
 #include "src/slurmctld/locks.h"
@@ -77,8 +78,7 @@ static int _build_job_queue(struct job_queue **job_queue)
 			continue;
 		if (job_record_point->priority == 0)	/* held */
 			continue;
-		if (job_record_point->magic != JOB_MAGIC)
-			fatal("prio_order_job: data integrity is bad");
+		xassert (job_record_point->magic == JOB_MAGIC);
 		if (job_buffer_size <= job_queue_size) {
 			job_buffer_size += 50;
 			xrealloc(my_job_queue, job_buffer_size *
@@ -139,8 +139,8 @@ int schedule(void)
 		error_code = select_nodes(job_ptr, false);
 		if (error_code == ESLURM_NODES_BUSY) {
 			xrealloc(failed_parts,
-				 (failed_part_cnt +
-				  1) * sizeof(struct part_record *));
+				 (failed_part_cnt + 1) * 
+				 sizeof(struct part_record *));
 			failed_parts[failed_part_cnt++] =
 			    job_ptr->part_ptr;
 		} else if (error_code == SLURM_SUCCESS) {	
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index a0237e81ff7d5c208083706d3400989962baf7a8..dc581965b039d9e667eba38d5f44af6a31d941b2 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -43,6 +43,7 @@
 
 #include "src/common/hostlist.h"
 #include "src/common/pack.h"
+#include "src/common/xassert.h"
 #include "src/common/xstring.h"
 #include "src/slurmctld/agent.h"
 #include "src/slurmctld/locks.h"
@@ -198,34 +199,33 @@ char * bitmap2node_name (bitstr_t *bitmap)
  */
 struct config_record * create_config_record (void) 
 {
-	struct config_record *config_point;
+	struct config_record *config_ptr;
 
 	last_node_update = time (NULL);
-	config_point =
-		(struct config_record *)
-		xmalloc (sizeof (struct config_record));
+	config_ptr = (struct config_record *)
+		     xmalloc (sizeof (struct config_record));
 
 	/* set default values */
-	config_point->cpus = default_config_record.cpus;
-	config_point->real_memory = default_config_record.real_memory;
-	config_point->tmp_disk = default_config_record.tmp_disk;
-	config_point->weight = default_config_record.weight;
-	config_point->nodes = NULL;
-	config_point->node_bitmap = NULL;
-	config_point->magic = CONFIG_MAGIC;
+	config_ptr->cpus = default_config_record.cpus;
+	config_ptr->real_memory = default_config_record.real_memory;
+	config_ptr->tmp_disk = default_config_record.tmp_disk;
+	config_ptr->weight = default_config_record.weight;
+	config_ptr->nodes = NULL;
+	config_ptr->node_bitmap = NULL;
+	xassert (config_ptr->magic = CONFIG_MAGIC);  /* set value */
 	if (default_config_record.feature) {
-		config_point->feature =
+		config_ptr->feature =
 			(char *)
 			xmalloc (strlen (default_config_record.feature) + 1);
-		strcpy (config_point->feature, default_config_record.feature);
+		strcpy (config_ptr->feature, default_config_record.feature);
 	}
 	else
-		config_point->feature = (char *) NULL;
+		config_ptr->feature = (char *) NULL;
 
-	if (list_append(config_list, config_point) == NULL)
+	if (list_append(config_list, config_ptr) == NULL)
 		fatal ("create_config_record: unable to allocate memory");
 
-	return config_point;
+	return config_ptr;
 }
 
 
@@ -279,7 +279,7 @@ create_node_record (struct config_record *config_point, char *node_name)
 	node_record_point->cpus = config_point->cpus;
 	node_record_point->real_memory = config_point->real_memory;
 	node_record_point->tmp_disk = config_point->tmp_disk;
-	node_record_point->magic = NODE_MAGIC;
+	node_record_point->magic = NODE_MAGIC;
+	xassert (node_record_point->magic == NODE_MAGIC);
 	last_bitmap_update = time (NULL);
 	return node_record_point;
 }
@@ -299,37 +299,6 @@ static int _delete_config_record (void)
 }
 
 
-/* 
- * delete_node_record - delete the node record for a node with specified name
- *   to avoid invalidating the bitmaps and hash table, we just clear the name 
- *   set its state to NODE_STATE_DOWN
- * IN name - name of the desired node
- * RET 0 on success, errno otherwise
- * global: node_record_table_ptr - pointer to global node table
- */
-int delete_node_record (char *name) 
-{
-	struct node_record *node_record_point;	/* pointer to node_record */
-
-	last_node_update = time (NULL);
-	node_record_point = find_node_record (name);
-	if (node_record_point == (struct node_record *) NULL) {
-		error("delete_node_record: can't delete non-existent node %s", 
-		      name);
-		return ENOENT;
-	}  
-
-	if (node_record_point->partition_ptr) {
-		(node_record_point->partition_ptr->total_nodes)--;
-		(node_record_point->partition_ptr->total_cpus) -=
-			node_record_point->cpus;
-	}
-	strcpy (node_record_point->name, "");
-	_make_node_down(node_record_point);
-	return SLURM_SUCCESS;
-}
-
-
 /* dump_all_node_state - save the state of all nodes to file */
 int dump_all_node_state ( void )
 {
@@ -346,10 +315,9 @@ int dump_all_node_state ( void )
 	/* write node records to buffer */
 	lock_slurmctld (node_read_lock);
 	for (inx = 0; inx < node_record_count; inx++) {
-		if ((node_record_table_ptr[inx].magic != NODE_MAGIC) ||
-		    (node_record_table_ptr[inx].config_ptr->magic != 
-							CONFIG_MAGIC))
-			fatal ("dump_all_node_state: data integrity is bad");
+		xassert (node_record_table_ptr[inx].magic == NODE_MAGIC);
+		xassert (node_record_table_ptr[inx].config_ptr->magic == 
+			 CONFIG_MAGIC);
 
 		_dump_node_state (&node_record_table_ptr[inx], buffer);
 	}
@@ -651,7 +619,7 @@ int init_node_conf (void)
 		config_list = list_create (&_list_delete_config);
 
 	if (config_list == NULL)
-		fatal ("init_node_conf: list_create can not allocate memory");
+		fatal ("memory allocation failure");
 	return SLURM_SUCCESS;
 }
 
@@ -671,13 +639,16 @@ int list_compare_config (void *config_entry1, void *config_entry2)
  *	see list.h for documentation */
 static void _list_delete_config (void *config_entry) 
 {
-	struct config_record *config_record_point;
-
-	config_record_point = (struct config_record *) config_entry;
-	xfree (config_record_point->feature);
-	xfree (config_record_point->nodes);
-	FREE_NULL_BITMAP (config_record_point->node_bitmap);
-	xfree (config_record_point);
+	struct config_record *config_ptr = (struct config_record *) 
+					   config_entry;
+
+	if (config_ptr == NULL)
+		fatal ("_list_delete_config: config_ptr == NULL");
+	xassert(config_ptr->magic == CONFIG_MAGIC);
+	xfree (config_ptr->feature);
+	xfree (config_ptr->nodes);
+	FREE_NULL_BITMAP (config_ptr->node_bitmap);
+	xfree (config_ptr);
 }
 
 
@@ -727,8 +698,8 @@ int node_name2bitmap (char *node_names, bitstr_t **bitmap)
 	}
 
 	my_bitmap = (bitstr_t *) bit_alloc (node_record_count);
-	if (my_bitmap == NULL)
-		fatal("bit_alloc memory allocation failure");
+	if (my_bitmap == NULL)
+		fatal ("memory allocation failure");
 
 	while ( (this_node_name = hostlist_shift (host_list)) ) {
 		node_record_point = find_node_record (this_node_name);
@@ -780,10 +751,9 @@ void pack_all_node (char **buffer_ptr, int *buffer_size)
 
 	/* write node records */
 	for (inx = 0; inx < node_record_count; inx++) {
-		if ((node_record_table_ptr[inx].magic != NODE_MAGIC) ||
-		    (node_record_table_ptr[inx].config_ptr->magic != 
-							CONFIG_MAGIC))
-			fatal ("pack_all_node: data integrity is bad");
+		xassert (node_record_table_ptr[inx].magic == NODE_MAGIC);
+		xassert (node_record_table_ptr[inx].config_ptr->magic ==  
+			 CONFIG_MAGIC);
 
 		_pack_node(&node_record_table_ptr[inx], buffer);
 		nodes_packed ++ ;
@@ -1222,8 +1192,8 @@ void set_node_down (char *name)
 		return;
 	}
 
-	_make_node_down(node_ptr);
 	(void) kill_running_job_by_node_name(name, false);
+	_make_node_down(node_ptr);
 
 	return;
 }
@@ -1269,9 +1239,9 @@ void ping_nodes (void)
 		    (base_state != NODE_STATE_DOWN)) {
 			error ("Node %s not responding, setting DOWN", 
 			       node_record_table_ptr[i].name);
-			_make_node_down(&node_record_table_ptr[i]);
 			kill_running_job_by_node_name (
 					node_record_table_ptr[i].name, false);
+			_make_node_down(&node_record_table_ptr[i]);
 			continue;
 		}
 
@@ -1480,13 +1450,15 @@ void msg_to_slurmd (slurm_msg_type_t msg_type)
 void make_node_alloc(struct node_record *node_ptr)
 {
 	int inx = node_ptr - node_record_table_ptr;
-	uint16_t no_resp_flag;
+	uint16_t no_resp_flag, base_state;
 
 	last_node_update = time (NULL);
-	no_resp_flag = node_ptr->node_state & NODE_STATE_NO_RESPOND;
-	node_ptr->node_state = NODE_STATE_ALLOCATED | no_resp_flag;
-	(node_ptr->job_cnt)++;
+	(node_ptr->run_job_cnt)++;
 	bit_clear(idle_node_bitmap, inx);
+	base_state   = node_ptr->node_state & (~NODE_STATE_NO_RESPOND);
+	no_resp_flag = node_ptr->node_state &   NODE_STATE_NO_RESPOND ;
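+	/* Leave a COMPLETING node in that state; make_node_idle() will 
+	 * mark it ALLOCATED when its completing jobs finish, since 
+	 * run_job_cnt is now non-zero */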
+	if (base_state != NODE_STATE_COMPLETING)
+		node_ptr->node_state = NODE_STATE_ALLOCATED | no_resp_flag;
 }
 
 /* make_node_comp - flag specified node as completing a job */
@@ -1496,7 +1468,7 @@ void make_node_comp(struct node_record *node_ptr)
 
 	last_node_update = time (NULL);
 	base_state   = node_ptr->node_state & (~NODE_STATE_NO_RESPOND);
-	no_resp_flag = node_ptr->node_state & NODE_STATE_NO_RESPOND;
+	no_resp_flag = node_ptr->node_state &   NODE_STATE_NO_RESPOND;
 	if ((base_state == NODE_STATE_DOWN) ||
 	    (base_state == NODE_STATE_DRAINED) ||
 	    (base_state == NODE_STATE_DRAINING)) {
@@ -1506,6 +1478,12 @@ void make_node_comp(struct node_record *node_ptr)
 	} else {
 		node_ptr->node_state = NODE_STATE_COMPLETING | no_resp_flag;
 	}
+
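+	/* Move this job from the node's running count to its completing count */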
+	if (node_ptr->run_job_cnt)
+		(node_ptr->run_job_cnt)--;
+	else
+		error("Node %s run_job_cnt underflow", node_ptr->name);
+	(node_ptr->comp_job_cnt)++;
 }
 
 /* _make_node_down - flag specified node as down */
@@ -1517,28 +1495,55 @@ static void _make_node_down(struct node_record *node_ptr)
 	last_node_update = time (NULL);
 	no_resp_flag = node_ptr->node_state & NODE_STATE_NO_RESPOND;
 	node_ptr->node_state = NODE_STATE_DOWN | no_resp_flag;
-	node_ptr->job_cnt = 0;
 	bit_clear (up_node_bitmap, inx);
 	bit_clear (idle_node_bitmap, inx);
 }
 
-/* make_node_idle - flag specified node as having completed a job */
-void make_node_idle(struct node_record *node_ptr)
+/*
+ * make_node_idle - flag specified node as having completed a job
+ * IN node_ptr - pointer to node reporting job completion
+ * IN job_ptr - pointer to job that just completed
+ */
+void make_node_idle(struct node_record *node_ptr, 
+		    struct job_record *job_ptr)
 {
 	int inx = node_ptr - node_record_table_ptr;
 	uint16_t no_resp_flag, base_state;
 
+	if ((job_ptr) &&			/* Specific job completed */
+	    (bit_test(job_ptr->node_bitmap, inx))) {	/* Not a replay */
+		last_job_update = time (NULL);
+		bit_clear(job_ptr->node_bitmap, inx);
+		if (job_ptr->node_cnt) {
+			if ((--job_ptr->node_cnt) == 0)
+				job_ptr->job_state &= (~JOB_COMPLETING);
+		} else {
+			error("node_cnt underflow on job_id %u", 
+			      job_ptr->job_id);
+		}
+
+		if (node_ptr->comp_job_cnt)
+			(node_ptr->comp_job_cnt)--;
+		else
+			error("Node %s comp_job_cnt underflow, job_id %u", 
+			      node_ptr->name, job_ptr->job_id);
+		if (node_ptr->comp_job_cnt > 0) 
+			return;		/* More jobs completing */
+	}
+
 	last_node_update = time (NULL);
 	base_state   = node_ptr->node_state & (~NODE_STATE_NO_RESPOND);
 	no_resp_flag = node_ptr->node_state & NODE_STATE_NO_RESPOND;
 	if ((base_state == NODE_STATE_DOWN) ||
 	    (base_state == NODE_STATE_DRAINED)) {
-		debug3("Node %s being left in state %s", 
-		       node_state_string((enum node_states)node_ptr->name));
+		debug3("Node %s being left in state %s", node_ptr->name, 
+		       node_state_string((enum node_states)base_state));
 	} else if (base_state == NODE_STATE_DRAINING) {
 		node_ptr->node_state = NODE_STATE_DRAINED;
 		bit_clear(idle_node_bitmap, inx);
 		bit_clear(up_node_bitmap, inx);
+	} else if (node_ptr->run_job_cnt) {
+		node_ptr->node_state = NODE_STATE_ALLOCATED | no_resp_flag;
 	} else {
 		node_ptr->node_state = NODE_STATE_IDLE | no_resp_flag;
 		if (no_resp_flag == 0)
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 41d9b4ed56a751a96aa793c6baa7ec48bcbdcd49..4cf4a58a8f57eb4070882cb009858256155a5581 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -40,6 +40,7 @@
 #include <slurm/slurm_errno.h>
 
 #include "src/common/hostlist.h"
+#include "src/common/xassert.h"
 #include "src/common/xmalloc.h"
 #include "src/slurmctld/agent.h"
 #include "src/slurmctld/slurmctld.h"
@@ -137,6 +138,10 @@ void deallocate_nodes(struct job_record *job_ptr)
 	pthread_attr_t attr_agent;
 	pthread_t thread_agent;
 	int buf_rec_size = 0;
+	if (job_ptr == NULL)
+		fatal ("deallocate_nodes: job_ptr == NULL");
+	if (job_ptr->details == NULL)
+		fatal ("deallocate_nodes: job_ptr->details == NULL");
 
 	agent_args = xmalloc(sizeof(agent_arg_t));
 	agent_args->msg_type = REQUEST_REVOKE_JOB_CREDENTIAL;
@@ -275,7 +280,7 @@ _pick_best_quadrics(bitstr_t * bitmap, bitstr_t * req_bitmap,
 	int best_fit_location = 0, best_fit_sufficient;
 
 	if (bitmap == NULL)
-		fatal("_pick_best_quadrics: bitmap pointer is NULL");
+		fatal ("_pick_best_quadrics: bitmap == NULL");
 
 	consec_index = 0;
 	consec_size  = 50;	/* start allocation for 50 sets of 
@@ -702,9 +707,8 @@ int select_nodes(struct job_record *job_ptr, bool test_only)
 	struct part_record *part_ptr = job_ptr->part_ptr;
 
 	if (job_ptr == NULL)
-		fatal("select_nodes: NULL job pointer value");
-	if (job_ptr->magic != JOB_MAGIC)
-		fatal("select_nodes: bad job pointer value");
+		fatal ("select_nodes: job_ptr == NULL");
+	xassert (job_ptr->magic == JOB_MAGIC);
 
 	/* insure that partition exists and is up */
 	if (part_ptr == NULL) {
@@ -761,8 +765,8 @@ int select_nodes(struct job_record *job_ptr, bool test_only)
 
 	/* assign the nodes and stage_in the job */
 	job_ptr->nodes = bitmap2node_name(req_bitmap);
-	allocate_nodes(req_bitmap);
 	job_ptr->node_bitmap = req_bitmap;
+	allocate_nodes(job_ptr->node_bitmap);
 	build_node_details(job_ptr);
 	req_bitmap = NULL;
 	job_ptr->job_state = JOB_RUNNING;
@@ -944,11 +948,9 @@ static int _nodes_in_sets(bitstr_t *req_bitmap,
 }
 
 /*
- * build_node_details - set cpu counts and addresses for allocated nodes
+ * build_node_details - set cpu counts and addresses for allocated nodes:
+ *	cpu_count_reps, cpus_per_node, node_addr, node_cnt, num_cpu_groups
  * IN job_ptr - pointer to a job record
- * NOTE: the arrays cpus_per_node, cpu_count_reps and node_addr in the job 
- *	details record are allocated by build_node_details and must be 
- *	xfreed by the caller, preferably using delete_job_details
  */
 void build_node_details(struct job_record *job_ptr)
 {
diff --git a/src/slurmctld/partition_mgr.c b/src/slurmctld/partition_mgr.c
index 37c41be0ca6ae573bc678dd4ed802b290287c6bf..c83b42fe71ec1afbaa87b2103baad97cf295d3fb 100644
--- a/src/slurmctld/partition_mgr.c
+++ b/src/slurmctld/partition_mgr.c
@@ -170,43 +170,43 @@ static int _build_part_bitmap(struct part_record *part_record_point)
  */
 struct part_record *create_part_record(void)
 {
-	struct part_record *part_record_point;
+	struct part_record *part_ptr;
 
 	last_part_update = time(NULL);
 
-	part_record_point =
+	part_ptr =
 	    (struct part_record *) xmalloc(sizeof(struct part_record));
 
-	strcpy(part_record_point->name, "DEFAULT");
-	part_record_point->max_time = default_part.max_time;
-	part_record_point->max_nodes = default_part.max_nodes;
-	part_record_point->root_only = default_part.root_only;
-	part_record_point->state_up = default_part.state_up;
-	part_record_point->shared = default_part.shared;
-	part_record_point->total_nodes = default_part.total_nodes;
-	part_record_point->total_cpus = default_part.total_cpus;
-	part_record_point->node_bitmap = NULL;
-	part_record_point->magic = PART_MAGIC;
+	strcpy(part_ptr->name, "DEFAULT");
+	part_ptr->max_time    = default_part.max_time;
+	part_ptr->max_nodes   = default_part.max_nodes;
+	part_ptr->root_only   = default_part.root_only;
+	part_ptr->state_up    = default_part.state_up;
+	part_ptr->shared      = default_part.shared;
+	part_ptr->total_nodes = default_part.total_nodes;
+	part_ptr->total_cpus  = default_part.total_cpus;
+	part_ptr->node_bitmap = NULL;
+	xassert (part_ptr->magic = PART_MAGIC);  /* set value */
 
 	if (default_part.allow_groups) {
-		part_record_point->allow_groups =
+		part_ptr->allow_groups =
 		    (char *) xmalloc(strlen(default_part.allow_groups) + 1);
-		strcpy(part_record_point->allow_groups,
+		strcpy(part_ptr->allow_groups,
 		       default_part.allow_groups);
 	} else
-		part_record_point->allow_groups = NULL;
+		part_ptr->allow_groups = NULL;
 
 	if (default_part.nodes) {
-		part_record_point->nodes =
+		part_ptr->nodes =
 		    (char *) xmalloc(strlen(default_part.nodes) + 1);
-		strcpy(part_record_point->nodes, default_part.nodes);
+		strcpy(part_ptr->nodes, default_part.nodes);
 	} else
-		part_record_point->nodes = NULL;
+		part_ptr->nodes = NULL;
 
-	if (list_append(part_list, part_record_point) == NULL)
+	if (list_append(part_list, part_ptr) == NULL)
 		fatal("create_part_record: unable to allocate memory");
 
-	return part_record_point;
+	return part_ptr;
 }
 
 
@@ -256,8 +256,7 @@ int dump_all_part_state(void)
 	part_record_iterator = list_iterator_create(part_list);
 	while ((part_record_point =
 		(struct part_record *) list_next(part_record_iterator))) {
-		if (part_record_point->magic != PART_MAGIC)
-			fatal("dump_all_part_state: data integrity is bad");
+		xassert (part_record_point->magic == PART_MAGIC);
 		_dump_part_state(part_record_point, buffer);
 	}
 	list_iterator_destroy(part_record_iterator);
@@ -407,8 +406,8 @@ int load_all_part_state(void)
 		}
 
 		/* find record and perform update */
-		part_ptr =
-		    list_find_first(part_list, &list_find_part, part_name);
+		part_ptr = list_find_first(part_list, &list_find_part, 
+					   part_name);
 
 		if (part_ptr) {
 			part_ptr->max_time = max_time;
@@ -467,13 +466,13 @@ int init_part_conf(void)
 	last_part_update = time(NULL);
 
 	strcpy(default_part.name, "DEFAULT");
-	default_part.max_time = INFINITE;
-	default_part.max_nodes = INFINITE;
-	default_part.root_only = 0;
-	default_part.state_up = 1;
-	default_part.shared = SHARED_NO;
+	default_part.max_time    = INFINITE;
+	default_part.max_nodes   = INFINITE;
+	default_part.root_only   = 0;
+	default_part.state_up    = 1;
+	default_part.shared      = SHARED_NO;
 	default_part.total_nodes = 0;
-	default_part.total_cpus = 0;
+	default_part.total_cpus  = 0;
 	xfree(default_part.nodes);
 	xfree(default_part.allow_groups);
 	xfree(default_part.allow_uids);
@@ -485,9 +484,7 @@ int init_part_conf(void)
 		part_list = list_create(&_list_delete_part);
 
 	if (part_list == NULL)
-		fatal
-		    ("init_part_conf: list_create can not allocate memory");
-
+		fatal ("memory allocation failure");
 
 	strcpy(default_part_name, "");
 	default_part_loc = (struct part_record *) NULL;
@@ -573,8 +570,7 @@ pack_all_part(char **buffer_ptr, int *buffer_size)
 	part_record_iterator = list_iterator_create(part_list);
 	while ((part_record_point =
 		(struct part_record *) list_next(part_record_iterator))) {
-		if (part_record_point->magic != PART_MAGIC)
-			fatal("pack_all_part: data integrity is bad");
+		xassert (part_record_point->magic == PART_MAGIC);
 
 		pack_part(part_record_point, buffer);
 		parts_packed++;
diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c
index f72ec18c7f4ca9d13b6811867fdab7421e170089..74cf467704435ea7f394b3820ac66302245f6467 100644
--- a/src/slurmctld/read_config.c
+++ b/src/slurmctld/read_config.c
@@ -54,7 +54,9 @@ static int  _init_all_slurm_conf(void);
 static int  _parse_node_spec(char *in_line);
 static int  _parse_part_spec(char *in_line);
 static void _set_config_defaults(slurm_ctl_conf_t * ctl_conf_ptr);
+static int  _sync_nodes_to_comp_job(void);
 static int  _sync_nodes_to_jobs(void);
+static int  _sync_nodes_to_run_job(struct job_record *job_ptr);
 #ifdef 	HAVE_LIBELAN3
 static void _validate_node_proc_count(void);
 #endif
@@ -95,25 +97,22 @@ static int _build_bitmaps(void)
 	FREE_NULL_BITMAP(idle_node_bitmap);
 	FREE_NULL_BITMAP(up_node_bitmap);
 	idle_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
-	up_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
-	if ((idle_node_bitmap == NULL) || (up_node_bitmap == NULL))
-		fatal("bit_alloc memory allocation failure");
-
+	up_node_bitmap   = (bitstr_t *) bit_alloc(node_record_count);
+	if ((idle_node_bitmap == NULL) ||
+	    (up_node_bitmap   == NULL)) 
+		fatal ("memory allocation failure");
 	/* initialize the configuration bitmaps */
 	config_record_iterator = list_iterator_create(config_list);
 	if (config_record_iterator == NULL)
-		fatal
-		    ("_build_bitmaps: list_iterator_create unable to allocate memory");
+		fatal ("memory allocation failure");
 
-	while ((config_record_point =
-		(struct config_record *)
-		list_next(config_record_iterator))) {
+	while ((config_record_point = (struct config_record *)
+				      list_next(config_record_iterator))) {
 		FREE_NULL_BITMAP(config_record_point->node_bitmap);
-
 		config_record_point->node_bitmap =
 		    (bitstr_t *) bit_alloc(node_record_count);
 		if (config_record_point->node_bitmap == NULL)
-			fatal("bit_alloc memory allocation failure");
+			fatal ("memory allocation failure");
 	}
 	list_iterator_destroy(config_record_iterator);
 
@@ -124,15 +123,13 @@ static int _build_bitmaps(void)
 
 		if (node_record_table_ptr[i].name[0] == '\0')
 			continue;	/* defunct */
-		base_state =
-		    node_record_table_ptr[i].node_state & 
-		    (~NODE_STATE_NO_RESPOND);
-		no_resp_flag =
-		    node_record_table_ptr[i].node_state & 
-		    NODE_STATE_NO_RESPOND;
+		base_state   = node_record_table_ptr[i].node_state & 
+			       (~NODE_STATE_NO_RESPOND);
+		no_resp_flag = node_record_table_ptr[i].node_state & 
+			       NODE_STATE_NO_RESPOND;
 		if (base_state == NODE_STATE_IDLE)
 			bit_set(idle_node_bitmap, i);
-		if ((base_state != NODE_STATE_DOWN) &&
+		if ((base_state != NODE_STATE_DOWN)    &&
 		    (base_state != NODE_STATE_UNKNOWN) &&
 		    (base_state != NODE_STATE_DRAINED) &&
 		    (no_resp_flag == 0))
@@ -145,11 +142,10 @@ static int _build_bitmaps(void)
 	/* scan partition table and identify nodes in each */
 	all_part_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
 	if (all_part_node_bitmap == NULL)
-		fatal("bit_alloc memory allocation failure");
+		fatal ("memory allocation failure");
 	part_record_iterator = list_iterator_create(part_list);
 	if (part_record_iterator == NULL)
-		fatal
-		    ("_build_bitmaps: list_iterator_create unable to allocate memory");
+		fatal ("memory allocation failure");
 
 	while ((part_record_point =
 		(struct part_record *) list_next(part_record_iterator))) {
@@ -157,7 +153,7 @@ static int _build_bitmaps(void)
 		part_record_point->node_bitmap =
 		    (bitstr_t *) bit_alloc(node_record_count);
 		if (part_record_point->node_bitmap == NULL)
-			fatal("bit_alloc memory allocation failure");
+			fatal ("memory allocation failure");
 
 		/* check for each node in the partition */
 		if ((part_record_point->nodes == NULL) ||
@@ -209,9 +205,10 @@ static int _build_bitmaps(void)
 
 /* 
  * _init_all_slurm_conf - initialize or re-initialize the slurm 
- *	configuration values.   
- * RET 0 if no error, otherwise an error code
- * Note: Operates on common variables, no arguments
+ *	configuration values.  
+ * RET 0 if no error, otherwise an error code.
+ * NOTE: We leave the job table intact
+ * NOTE: Operates on common variables, no arguments
  */
 static int _init_all_slurm_conf(void)
 {
@@ -314,7 +311,7 @@ static int _parse_node_spec(char *in_line)
 			free(this_node_name);
 			this_node_name = malloc(128);
 			if (this_node_name == NULL)
-				fatal("memory allocation failure");
+				fatal ("memory allocation failure");
 			getnodename(this_node_name, 128);
 		}
 		if (strcasecmp(this_node_name, "DEFAULT") == 0) {
@@ -593,7 +590,7 @@ static int _parse_part_spec(char *in_line)
 			xfree(nodes);
 			nodes = xmalloc(128);
 			if (nodes == NULL)
-				fatal("memory allocation failure");
+				fatal ("memory allocation failure");
 			getnodename(nodes, 128);
 		}
 		part_record_point->nodes = nodes;
@@ -728,33 +725,35 @@ int read_slurm_conf(int recover)
 	}
 
 	rehash();
-	if (old_node_table_ptr) {
-		info("restoring original state of nodes");
-		for (i = 0; i < old_node_record_count; i++) {
-			node_record_point =
-			    find_node_record(old_node_table_ptr[i].name);
-			if (node_record_point)
-				node_record_point->node_state =
-				    old_node_table_ptr[i].node_state;
-		}
-		xfree(old_node_table_ptr);
-	}
 	set_slurmd_addr();
 
 	if (recover) {
 		(void) load_all_node_state();
 		(void) load_all_part_state();
 		(void) load_all_job_state();
+	} else {
+		if (old_node_table_ptr) {
+			info("restoring original state of nodes");
+			for (i = 0; i < old_node_record_count; i++) {
+				node_record_point  = 
+				  find_node_record(old_node_table_ptr[i].name);
+				if (node_record_point)
+					node_record_point->node_state =
+					    old_node_table_ptr[i].node_state;
+			}
+		}
+		reset_job_bitmaps();
 	}
+	(void) _sync_nodes_to_jobs();
 	(void) sync_job_files();
+	xfree(old_node_table_ptr);
 
 	if ((error_code = _build_bitmaps()))
 		return error_code;
 #ifdef 	HAVE_LIBELAN3
 	_validate_node_proc_count();
 #endif
-	if (recover)
-		(void) _sync_nodes_to_jobs();
+	(void) _sync_nodes_to_comp_job();
 
 	load_part_uid_allow_list(1);
 
@@ -816,10 +815,8 @@ static void _set_config_defaults(slurm_ctl_conf_t * ctl_conf_ptr)
 
 /*
  * _sync_nodes_to_jobs - sync node state to job states on slurmctld restart.
- *	we perform "lazy" updates on node states due to their number (assumes  
- *	number of jobs is much smaller than the number of nodes). This   
- *	routine marks nodes allocated to a job as busy no matter what the  
- *	node's last saved state 
+ *	This routine marks nodes allocated to a job as busy no matter what 
+ *	the node's last saved state 
  * RET count of nodes having state changed
  * Note: Operates on common variables, no arguments
  */
@@ -827,28 +824,41 @@ static int _sync_nodes_to_jobs(void)
 {
 	struct job_record *job_ptr;
 	ListIterator job_record_iterator;
-	int i, update_cnt = 0;
-	uint16_t no_resp_flag;
+	int update_cnt = 0;
 
 	job_record_iterator = list_iterator_create(job_list);
-	while ((job_ptr =
-		(struct job_record *) list_next(job_record_iterator))) {
-		if (job_ptr->job_state > JOB_COMPLETING)
-			continue;
+	while ((job_ptr = (struct job_record *) 
+			  list_next(job_record_iterator))) {
 		if (job_ptr->node_bitmap == NULL)
 			continue;
-		for (i = 0; i < node_record_count; i++) {
-			if (bit_test(job_ptr->node_bitmap, i) == 0)
-				continue;
-			node_record_table_ptr[i].job_cnt++;
-			if (node_record_table_ptr[i].node_state ==
-			    NODE_STATE_ALLOCATED)
-				continue;	/* already in proper state */
+
+		if ((job_ptr->job_state == JOB_RUNNING) ||
+		    (job_ptr->job_state &  JOB_COMPLETING))
+			update_cnt += _sync_nodes_to_run_job(job_ptr);
+	}
+	if (update_cnt)
+		info("_sync_nodes_to_jobs updated state of %d nodes",
+		     update_cnt);
+	return update_cnt;
+}
+
+/* For jobs which are in state COMPLETING, deallocate the nodes and 
+ * issue the RPC to revoke credentials */
+static int _sync_nodes_to_comp_job(void)
+{
+	struct job_record *job_ptr;
+	ListIterator job_record_iterator;
+	int update_cnt = 0;
+
+	job_record_iterator = list_iterator_create(job_list);
+	while ((job_ptr = (struct job_record *) 
+			  list_next(job_record_iterator))) {
+		if ((job_ptr->node_bitmap) &&
+		    (job_ptr->job_state & JOB_COMPLETING)) {
 			update_cnt++;
-			no_resp_flag = node_record_table_ptr[i].node_state & 
-				       NODE_STATE_NO_RESPOND;
-			node_record_table_ptr[i].node_state =
-				    NODE_STATE_ALLOCATED | no_resp_flag;
+			info("Revoking credentials for job_id %u",
+			     job_ptr->job_id);
+			deallocate_nodes(job_ptr);
 		}
 	}
 	if (update_cnt)
@@ -857,6 +867,32 @@ static int _sync_nodes_to_jobs(void)
 	return update_cnt;
 }
 
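+/* 
+ * _sync_nodes_to_run_job - account for the nodes allocated to one running 
+ *	or completing job: increment each node's run_job_cnt, set the job 
+ *	to JOB_NODE_FAIL|JOB_COMPLETING if any of its nodes is DOWN, and 
+ *	mark UNKNOWN, IDLE or DRAINED nodes as ALLOCATED
+ * RET count of nodes having their state changed
+ */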
+static int _sync_nodes_to_run_job(struct job_record *job_ptr)
+{
+	int i, cnt = 0;
+	uint16_t base_state, no_resp_flag;
+
+	for (i = 0; i < node_record_count; i++) {
+		if (bit_test(job_ptr->node_bitmap, i) == 0)
+			continue;
+		node_record_table_ptr[i].run_job_cnt++;
+		base_state = node_record_table_ptr[i].node_state & 
+			     (~NODE_STATE_NO_RESPOND);
+		if (base_state == NODE_STATE_DOWN)
+			job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING;
+		if ((base_state == NODE_STATE_UNKNOWN) || 
+		    (base_state == NODE_STATE_IDLE)    ||
+		    (base_state == NODE_STATE_DRAINED)) {
+			cnt++;
+			no_resp_flag = node_record_table_ptr[i].node_state & 
+				       NODE_STATE_NO_RESPOND;
+			node_record_table_ptr[i].node_state =
+				    NODE_STATE_ALLOCATED | no_resp_flag;
+		}
+	}
+	return cnt;
+}
+
 #ifdef 	HAVE_LIBELAN3
 /* Every node in a given partition must have the same processor count 
  * at present, this function insure it */
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 482f85fbe8793d96868c74d3c39abaac275ba6a4..d38dc8fc8f730926ad7781272337dc7ef392acfb 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -67,6 +67,10 @@
 		if (_X) bit_free (_X);	\
 		_X	= NULL; 	\
 	} while (0)
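+/* Test a job's base state, ignoring the JOB_COMPLETING flag */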
+#define IS_JOB_FINISHED(_X)		\
+	(((_X)->job_state & (~JOB_COMPLETING)) > JOB_RUNNING)
+#define IS_JOB_PENDING(_X)		\
+	(((_X)->job_state & (~JOB_COMPLETING)) == JOB_PENDING)
 
 /*****************************************************************************\
  *  GENERAL CONFIGURATION parameters and data structures
@@ -144,7 +148,8 @@ struct node_record {
 	struct part_record *partition_ptr; /* partition for this node */
 	char comm_name[MAX_NAME_LEN];	/* communications path name to node */
 	slurm_addr slurm_addr;		/* network address */
-	uint16_t job_cnt;		/* count of jobs allocated to node */
+	uint16_t comp_job_cnt;		/* count of jobs completing on node */
+	uint16_t run_job_cnt;		/* count of jobs running on node */
 };
 
 extern struct node_record *node_record_table_ptr;  /* ptr to node records */
@@ -237,10 +242,7 @@ struct job_record {
 	struct part_record *part_ptr;	/* pointer to the partition record */
 	uint16_t batch_flag;		/* 1 if batch job (with script) */
 	uint32_t user_id;		/* user the job runs as */
-	enum job_states job_state;	/* state of the job, NOTE: state
-					 * JOB_COMPLETING is set in pack_job 
-					 * when (job state > JOB_RUNNING) &&
-					 * (node_count > 0), its artificial */
+	enum job_states job_state;	/* state of the job */
 	uint16_t kill_on_node_fail;	/* 1 if job should be killed on 
 					   node failure */
 	uint16_t kill_on_step_done;	/* 1 if job should be killed when 
@@ -321,11 +323,9 @@ extern char * bitmap2node_name (bitstr_t *bitmap) ;
 void build_job_cred(struct job_record *job_ptr);
 
 /*
- * build_node_details - set cpu counts and addresses for allocated nodes
+ * build_node_details - set cpu counts and addresses for allocated nodes:
+ *	cpu_count_reps, cpus_per_node, node_addr, node_cnt, num_cpu_groups
  * IN job_ptr - pointer to a job record
- * NOTE: the arrays cpus_per_node, cpu_count_reps and node_addr in the job 
- *	details record are allocated by build_node_details and must be 
- *	xfreed by the caller, preferably using delete_job_details
  */
 extern void build_node_details (struct job_record *job_ptr);
 
@@ -418,16 +418,6 @@ extern void delete_all_step_records (struct job_record *job_ptr);
  */
 extern void  delete_job_details (struct job_record *job_entry);
 
-/* 
- * delete_node_record - delete the node record for a node with specified name
- *   to avoid invalidating the bitmaps and hash table, we just clear the name 
- *   set its state to NODE_STATE_DOWN
- * IN name - name of the desired node
- * RET 0 on success, errno otherwise
- * global: node_record_table_ptr - pointer to global node table
- */
-extern int delete_node_record (char *name);
-
 /* 
  * delete_step_record - delete record for job step for specified job_ptr 
  *	and step_id
@@ -711,8 +701,13 @@ extern void make_node_alloc(struct node_record *node_ptr);
 /* make_node_comp - flag specified node as completing a job */
 extern void make_node_comp(struct node_record *node_ptr);
 
-/* make_node_idle - flag specified node as no longer being in use */
-extern void make_node_idle(struct node_record *node_ptr);
+/*
+ * make_node_idle - flag specified node as having completed a job
+ * IN node_ptr - pointer to node reporting job completion
+ * IN job_ptr - pointer to job that just completed
+ */
+extern void make_node_idle(struct node_record *node_ptr, 
+			   struct job_record *job_ptr);
 
 /* msg_to_slurmd - send given msg_type every slurmd, no args */
 extern void msg_to_slurmd (slurm_msg_type_t msg_type);
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index d63b76301a782d594b91623eee04a89b86e96a1c..ae820eda500d4d92925f49304ab25941888617c8 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -29,7 +29,6 @@
 #endif
 
 #include <time.h>
-#include <assert.h>
 #include <ctype.h>
 #include <errno.h>
 #include <stdio.h>
@@ -63,7 +62,8 @@ create_step_record (struct job_record *job_ptr)
 {
 	struct step_record *step_record_point;
 
-	assert (job_ptr);
+	if (job_ptr == NULL)
+		fatal ("create_step_record: job_ptr == NULL");
 	step_record_point = 
 		(struct step_record *) xmalloc (sizeof (struct step_record));
 
@@ -88,7 +88,8 @@ delete_all_step_records (struct job_record *job_ptr)
 	ListIterator step_record_iterator;
 	struct step_record *step_record_point;
 
-	assert (job_ptr);
+	if (job_ptr == NULL)
+		fatal ("delete_all_step_records: job_ptr == NULL");
 	step_record_iterator = list_iterator_create (job_ptr->step_list);		
 
 	while ((step_record_point = 
@@ -120,7 +121,8 @@ delete_step_record (struct job_record *job_ptr, uint32_t step_id)
 	struct step_record *step_record_point;
 	int error_code;
 
-	assert (job_ptr);
+	if (job_ptr == NULL)
+		fatal ("delete_step_record: job_ptr == NULL");
 	error_code = ENOENT;
 	step_record_iterator = list_iterator_create (job_ptr->step_list);		
 
@@ -174,21 +176,21 @@ struct step_record *
 find_step_record(struct job_record *job_ptr, uint16_t step_id) 
 {
 	ListIterator step_record_iterator;
-	struct step_record *step_record_point;
+	struct step_record *step_ptr;
 
 	if (job_ptr == NULL)
 		return NULL;
 
 	step_record_iterator = list_iterator_create (job_ptr->step_list);		
-	while ((step_record_point = 
-		(struct step_record *) list_next (step_record_iterator))) {
-		if (step_record_point->step_id == step_id) {
+	while ((step_ptr = (struct step_record *) 
+			   list_next (step_record_iterator))) {
+		if (step_ptr->step_id == step_id) {
 			break;
 		}
 	}		
 
 	list_iterator_destroy (step_record_iterator);
-	return step_record_point;
+	return step_ptr;
 }
 
 
@@ -215,10 +217,7 @@ int job_step_signal(uint32_t job_id, uint32_t step_id,
 		return ESLURM_INVALID_JOB_ID;
 	}
 
-	if ((job_ptr->job_state == JOB_FAILED)    ||
-	    (job_ptr->job_state == JOB_COMPLETE)  ||
-	    (job_ptr->job_state == JOB_NODE_FAIL) ||
-	    (job_ptr->job_state == JOB_TIMEOUT))
+	if (IS_JOB_FINISHED(job_ptr))
 		return ESLURM_ALREADY_DONE;
 
 	if ((job_ptr->user_id != uid) && (uid != 0) && (uid != getuid())) {
@@ -231,7 +230,7 @@ int job_step_signal(uint32_t job_id, uint32_t step_id,
 	if (step_ptr == NULL) {
 		info("job_step_cancel step %u.%u not found",
 		     job_id, step_id);
-		return ESLURM_ALREADY_DONE;
+		return ESLURM_INVALID_JOB_ID;
 	}
 
 	signal_step_tasks(step_ptr, signal);
@@ -337,9 +336,7 @@ int job_step_complete(uint32_t job_id, uint32_t step_id, uid_t uid,
 	    (list_count(job_ptr->step_list) <= 1))
 		return job_complete(job_id, uid, requeue, job_return_code);
 
-	if ((job_ptr->job_state == JOB_FAILED) ||
-	    (job_ptr->job_state == JOB_COMPLETE) ||
-	    (job_ptr->job_state == JOB_TIMEOUT))
+	if (IS_JOB_FINISHED(job_ptr))
 		return ESLURM_ALREADY_DONE;
 
 	if ((job_ptr->user_id != uid) && (uid != 0) && (uid != getuid())) {
@@ -510,9 +507,10 @@ step_create ( step_specs *step_specs, struct step_record** new_step_record,
 	    (step_specs->user_id != 0))
 		return ESLURM_ACCESS_DENIED ;
 
-	if ((job_ptr->job_state == JOB_COMPLETE) || 
-	    (job_ptr->job_state == JOB_FAILED) ||
-	    (job_ptr->job_state == JOB_TIMEOUT) ||
+	if (IS_JOB_PENDING(job_ptr))
+		return ESLURM_INVALID_JOB_ID ;
+
+	if (IS_JOB_FINISHED(job_ptr) || 
 	    (job_ptr->end_time <= time(NULL)))
 		return ESLURM_ALREADY_DONE;
 
diff --git a/src/squeue/opts.c b/src/squeue/opts.c
index 4cdf72282b7e56520eb7ff0678ed7585a8442689..7611487b71e9eaa69678ed5cf9ec9c6ac452ed39 100644
--- a/src/squeue/opts.c
+++ b/src/squeue/opts.c
@@ -190,7 +190,7 @@ parse_command_line( int argc, char* argv[] )
 }
 
 /*
- * _parse_state - convert state name string to numeric value
+ * _parse_state - convert job state name string to numeric value
  * IN str - state name
  * OUT states - enum job_states value corresponding to str
  * RET 0 or error code
@@ -211,13 +211,20 @@ _parse_state( char* str, enum job_states* states )
 			return SLURM_SUCCESS;
 		}	
 	}
-	
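+	/* JOB_COMPLETING is a state flag outside the enumerated job states 
+	 * checked above, so test for it separately */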
+	if ((strcasecmp(job_state_string(JOB_COMPLETING), str) == 0) ||
+	    (strcasecmp(job_state_string_compact(JOB_COMPLETING),str) == 0)) {
+		*states = JOB_COMPLETING;
+		return SLURM_SUCCESS;
+	}	
+
 	fprintf (stderr, "Invalid job state specified: %s\n", str);
 	state_names = xstrdup(job_state_string(0));
 	for (i=1; i<JOB_END; i++) {
 		xstrcat(state_names, ",");
 		xstrcat(state_names, job_state_string(i));
 	}
+	xstrcat(state_names, ",");
+	xstrcat(state_names, job_state_string(JOB_COMPLETING));
 	fprintf (stderr, "Valid job states include: %s\n", state_names);
 	xfree (state_names);
 	return SLURM_ERROR;
@@ -557,8 +564,8 @@ _build_part_list( char* str )
 }
 
 /*
- * _build_state_list - build a list of node states
- * IN str - comma separated list of node states
+ * _build_state_list - build a list of job states
+ * IN str - comma separated list of job states
  * RET List of enum job_states values
  */
 static List 
@@ -594,7 +601,7 @@ _build_state_list( char* str )
 }
 
 /*
- * _build_all_states_list - build a list containing all possible node states
+ * _build_all_states_list - build a list containing all possible job states
  * RET List of enum job_states values
  */
 static List 
@@ -610,6 +617,9 @@ _build_all_states_list( void )
 		*state_id = ( enum job_states ) i;
 		list_append( my_list, state_id );
 	}
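+	/* Also include JOB_COMPLETING, which lies outside the enumerated states */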
+	state_id = xmalloc( sizeof( enum job_states ) );
+	*state_id = ( enum job_states ) JOB_COMPLETING;
+	list_append( my_list, state_id );
 	return my_list;
 
 }