diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 4ba0c2f6f73e53046751630a58cebf4d77a1f6f4..7965eea52a6e4794fd058e2aae6317a87ba7d768 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -399,6 +399,14 @@ static void *_wdog(void *args)
 				node_not_resp(thread_ptr[i].node_name,
 				              thread_ptr[i].start_time);
 		}
+		if (agent_ptr->msg_type == REQUEST_BATCH_JOB_LAUNCH) {
+			/* Requeue the request */
+			batch_job_launch_msg_t *launch_msg_ptr =
+					*agent_ptr->msg_args_pptr;
+			uint32_t job_id = launch_msg_ptr->job_id;
+			info("Non-responding node, requeue JobId=%u", job_id);
+			job_complete(job_id, 0, true, 0);
+		}
 		unlock_slurmctld(node_write_lock);
 #else
 		/* Build a list of all non-responding nodes and send 
@@ -569,7 +577,7 @@ static void *_thread_per_node_rpc(void *args)
 			job_id, slurm_strerror(rc));
 		thread_state = DSH_DONE;
 		lock_slurmctld(job_write_lock);
-		job_signal(job_id, SIGKILL, 0);
+		job_complete(job_id, 0, false, 1);
 		unlock_slurmctld(job_write_lock);
 		goto cleanup;
 	}
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index a201f458a32d341e9f3779c8ede80be5e7d7b41c..ed33424f0b943d3c74c4af8fd4ddb0beb924361d 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -111,6 +111,7 @@ static int  _load_job_state(Buf buffer);
 static int  _load_step_state(struct job_record *job_ptr, Buf buffer);
 static void _pack_job_details(struct job_details *detail_ptr, Buf buffer);
 static int  _purge_job_record(uint32_t job_id);
+static void _purge_lost_batch_jobs(int node_inx, time_t now);
 static void _read_data_array_from_file(char *file_name, char ***data,
 				       uint16_t * size);
 static void _read_data_from_file(char *file_name, char **data);
@@ -2714,6 +2715,7 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count,
 	int i, node_inx, jobs_on_node;
 	struct node_record *node_ptr;
 	struct job_record *job_ptr;
+	time_t now = time(NULL);
 
 	node_ptr = find_node_record(node_name);
 	if (node_ptr == NULL) {
@@ -2738,6 +2740,9 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count,
 				debug3("Registered job %u.%u on node %s ",
 				       job_id_ptr[i], step_id_ptr[i], 
 				       node_name);
+				if ((job_ptr->batch_flag) &&
+				    (node_inx == bit_ffs(job_ptr->node_bitmap)))
+					job_ptr->time_last_active = now;
 			} else {
 				error
 				    ("Registered job %u.u on wrong node %s ",
@@ -2770,12 +2775,15 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count,
 	}
 
 	jobs_on_node = node_ptr->run_job_cnt + node_ptr->comp_job_cnt;
+	if (jobs_on_node)
+		_purge_lost_batch_jobs(node_inx, now);
 
 	if (jobs_on_node != *job_count) {
 		/* slurmd will not know of a job unless the job has
 		 * steps active at registration time, so this is not 
-		 * an error condition */
-		info("resetting job_count on node %s from %d to %d", 
+		 * an error condition, slurmd is also reporting steps
+		 * rather than jobs */
+		debug3("resetting job_count on node %s from %d to %d",
 		     node_name, *job_count, jobs_on_node);
 		*job_count = jobs_on_node;
 	}
@@ -2783,6 +2791,29 @@ validate_jobs_on_node(char *node_name, uint32_t * job_count,
 	return;
 }
 
+/* Purge any batch job that should have its script running on node
+ * node_inx, but is not (i.e. its time_last_active != now) */
+static void _purge_lost_batch_jobs(int node_inx, time_t now)
+{
+	ListIterator job_record_iterator;
+	struct job_record *job_ptr;
+
+	job_record_iterator = list_iterator_create(job_list);
+	while ((job_ptr =
+		    (struct job_record *) list_next(job_record_iterator))) {
+		if ((job_ptr->job_state != JOB_RUNNING) ||
+		    (job_ptr->batch_flag == 0)          ||
+		    (job_ptr->time_last_active == now)  ||
+		    (node_inx != bit_ffs(job_ptr->node_bitmap)))
+			continue;
+
+		info("Master node lost JobId=%u, killing it",
+			job_ptr->job_id);
+		job_complete(job_ptr->job_id, 0, false, 0);
+	}
+	list_iterator_destroy(job_record_iterator);
+}
+
 /* _kill_job_on_node - Kill the specific job_id on a specific node */
 static void
 _kill_job_on_node(uint32_t job_id, struct node_record *node_ptr)
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index 267ace7319c2bc6f54a225f4103fc5ad94b81f19..ebd0e536838af46c801798a2ab5e6b9963f339d2 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -76,9 +76,9 @@ static int _build_job_queue(struct job_queue **job_queue)
 
 	while ((job_record_point =
 		(struct job_record *) list_next(job_record_iterator))) {
-		if (job_record_point->job_state != JOB_PENDING)
-			continue;
-		if (job_record_point->priority == 0)	/* held */
+		if ((job_record_point->job_state != JOB_PENDING)   ||
+		    (job_record_point->job_state & JOB_COMPLETING) ||
+		    (job_record_point->priority == 0))	/* held */
 			continue;
 		xassert (job_record_point->magic == JOB_MAGIC);
 		if (job_buffer_size <= job_queue_size) {
@@ -154,12 +154,13 @@ int schedule(void)
 		} else if (error_code == SLURM_SUCCESS) {	
 			/* job initiated */
 			last_job_update = time(NULL);
-			info("schedule: job_id %u on nodes %s",
+			info("schedule: JobId=%u NodeList=%s",
 			     job_ptr->job_id, job_ptr->nodes);
-			_launch_job(job_ptr);
+			if (job_ptr->batch_flag)
+				_launch_job(job_ptr);
 			job_cnt++;
 		} else {
-			info("schedule: job_id %u non-runnable, error %m",
+			info("schedule: JobId=%u non-runnable: %m",
 			     job_ptr->job_id);
 			last_job_update = time(NULL);
 			job_ptr->job_state = JOB_FAILED;
@@ -219,9 +220,6 @@ static void _launch_job(struct job_record *job_ptr)
 	pthread_t thread_agent;
 	int retries = 0;
 
-	if (job_ptr->batch_flag == 0)
-		return;
-
 	node_ptr = find_first_node_record(job_ptr->node_bitmap);
 	if (node_ptr == NULL)
 		return;
@@ -245,7 +243,7 @@ static void _launch_job(struct job_record *job_ptr)
 
 	agent_arg_ptr = (agent_arg_t *) xmalloc(sizeof(agent_arg_t));
 	agent_arg_ptr->node_count = 1;
-	agent_arg_ptr->retry = 1;
+	agent_arg_ptr->retry = 0;
 	agent_arg_ptr->slurm_addr = xmalloc(sizeof(struct sockaddr_in));
 	memcpy(agent_arg_ptr->slurm_addr,
 	       &(node_ptr->slurm_addr), sizeof(struct sockaddr_in));
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index 08ee4ea0b8db38ef6b4bbd35cc974a4f8befd2bf..d6d9c23c9d849fcc87982be5f7e84218a87d65b3 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -1028,14 +1028,14 @@ void build_node_details(struct job_record *job_ptr)
 				job_ptr->cpu_count_reps[cpu_inx]++;
 
 		} else {
-			error("Invalid node %s in job_id %u",
+			error("Invalid node %s in JobId=%u",
 			      this_node_name, job_ptr->job_id);
 		}
 		free(this_node_name);
 	}
 	hostlist_destroy(host_list);
 	if (job_ptr->node_cnt != node_inx) {
-		error("Node count mismatch for job_id %u (%u,%u)",
+		error("Node count mismatch for JobId=%u (%u,%u)",
 		      job_ptr->job_id, job_ptr->node_cnt, node_inx);
 		job_ptr->node_cnt = node_inx;
 	}