diff --git a/NEWS b/NEWS index f5527aa25cf888805a61f532bfbcd989db7e14d2..29791f61795c2f9403b957a30009537bba9c1778 100644 --- a/NEWS +++ b/NEWS @@ -122,6 +122,11 @@ documents those changes that are of interest to users and administrators. slurmstepd versions where some are less than 17.11.6. -- Prevent the backup slurmctld from loosing the active/available node features list on takeover. + -- Add documentation for fix IDLE*+POWER due to capmc stuck in Cray systems. + -- Fix missing mutex unlock when prolog is failing on a node, leading to a + hung slurmd. + -- Fix locking around Cray CCM prolog/epilog. + -- Add missing fed_mgr read locks. * Changes in Slurm 17.11.5 ========================== diff --git a/doc/html/faq.shtml b/doc/html/faq.shtml index ff899f0b63994c311690f7d1e7bdf055239a3c08..a03c62a7d577a61e1f31cc9eb832e35abe5ab43d 100644 --- a/doc/html/faq.shtml +++ b/doc/html/faq.shtml @@ -1159,16 +1159,10 @@ Create a resource reservation as described b. Slurm's <p><a name="pam"><b>9. How can PAM be used to control a user's limits on or access to compute nodes?</b></a><br> -The pam_slurm_adopt PAM module is highly recommended for most installations, -and is documented in its <a href="pam_slurm_adopt.shtml">own guide</a>. The -following info applies to the older less-functional pam_slurm plugin instead. - -You will need to build and install Slurm including it's PAM module -(a <i>slurm_pam</i> package is provided, the code is located in the -<i>contribs/pam</i> directory). -First, enable Slurm's use of PAM by setting <i>UsePAM=1</i> in -<i>slurm.conf</i>.<br> -Second, establish PAM configuration file(s) for Slurm in <i>/etc/pam.conf</i> +To control a user's limits on a compute node:<br> +<p>First, enable Slurm's use of PAM by setting <i>UsePAM=1</i> in +<i>slurm.conf</i>.</p> +<p>Second, establish PAM configuration file(s) for Slurm in <i>/etc/pam.conf</i> or the appropriate files in the <i>/etc/pam.d</i> directory (e.g. <i>/etc/pam.d/sshd</i> by adding the line "account required pam_slurm.so". A basic configuration you might use is:</p> @@ -1190,16 +1184,21 @@ all resource limits are propagated from that session. For example, adding the following line to <i>slurm.conf</i> will prevent the locked memory limit from being propagated:<i>PropagateResourceLimitsExcept=MEMLOCK</i>.</p> -<p>We also have a PAM module for Slurm that prevents users from +<p>To control a user's access to a compute node:</p> +<p>The pam_slurm_adopt and pam_slurm modules prevent users from logging into nodes that they have not been allocated (except for user root, which can always login). -This pam_slurm module is included with the Slurm distribution. -The module is built by default, but can be disabled using the -.rpmmacros option "%_without_pam 1" or by entering the command line +They are both included with the Slurm distribution. +<p>The pam_slurm_adopt module is highly recommended for most installations, +and is documented in its <a href="pam_slurm_adopt.shtml">own guide</a>.</p> +<p>pam_slurm is older and less functional. +These modules are built by default for RPM packages, but can be disabled using +the .rpmmacros option "%_without_pam 1" or by entering the command line option "--without pam" when the configure program is executed. -It's source code is in the directory "contribs/pam". -The use of pam_slurm does not require <i>UsePAM</i> being set. The -two uses of PAM are independent. 
diff --git a/doc/html/power_mgmt.shtml b/doc/html/power_mgmt.shtml
index ded4c2c93df0a15dbcab423ad9ad9e58a40bffab..66ca056868a522317b44d00cc1a5a630e802f2bb 100644
--- a/doc/html/power_mgmt.shtml
+++ b/doc/html/power_mgmt.shtml
@@ -227,8 +227,15 @@ to be distributed over 9 nodes or 184 watts per node.</p>
 It is not clear that configuration is practical to support as gang scheduling
 time slices will typically be smaller than the power management
 balance_interval and synchronizing changes may be difficult</li>
+<li>There can be situations where the capmc program gets stuck and the node
+    remains in the IDLE*+POWER state until ResumeTimeout is reached, even if
+    the node has been rebooted or manually cleaned up.
+    In this situation the node can be brought back into service by issuing
+    'scontrol update nodename=xxx state=power_down', which cancels the
+    previous power_up request. The capmc program must then be diagnosed and fixed.
+</li>
 </ul>
 
-<p style="text-align:center;">Last modified 5 May 2015</p>
+<p style="text-align:center;">Last modified 7 Mar 2018</p>
 
 <!--#include virtual="footer.txt"-->
diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1
index c881046267e7d97387fd88d1c7fc7c1fca9c9359..3e08900d9633e879af615eca1340158e667101e4 100644
--- a/doc/man/man1/scontrol.1
+++ b/doc/man/man1/scontrol.1
@@ -1124,8 +1124,13 @@ node to be terminated.
 "POWER_DOWN" and "POWER_UP" will use the configured \fISuspendProg\fR and
 \fIResumeProg\fR programs to explicitly place a node in or out of a power
 saving mode. If a node is already in the process of being powered up or down,
-the command will have no effect until the configured ResumeTimeout or
-SuspendTimeout is reached.
+the command will only change the state of the node but will not take
+effect until the configured ResumeTimeout or SuspendTimeout is reached.
+This can be useful when a \fIResumeProg\fR such as \fIcapmc\fR on Cray
+machines is stalled and you want to restore the node to "IDLE" manually.
+In that case, rebooting the node and setting the state to "POWER_DOWN"
+will cancel the previous "POWER_UP" request and the node will become
+"IDLE".
 The "NoResp" state will only set the "NoResp" flag for a node without
 changing its underlying state.
 While all of the above states are valid, some of them are not valid new
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 1a4450fbc08d6d98fbfd21d1e011a1e2f36a08af..f6962fcc5a9987ef98c6994836fba751ad7d07ff 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -2055,7 +2055,7 @@ which uses process group IDs
 \fBProlog\fR
 Fully qualified pathname of a program for the slurmd to execute whenever
 it is asked to run a job step from a new job allocation (e.g.
-"/usr/local/slurm/prolog"). A glob pattern (See \fBglob\fR(7)) may
+"/usr/local/slurm/prolog"). A glob pattern (See \fBglob\fR (7)) may
 also be used to specify more than one program to run (e.g.
 "/etc/slurm/prolog.d/*"). The slurmd executes the prolog before starting
 the first job step. The prolog script or scripts may be used to purge files,
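Illustrative sketch (not part of the patch): the scontrol.1 and power_mgmt.shtml changes above describe clearing a node stuck in IDLE*+POWER with 'scontrol update nodename=xxx state=power_down'. The same request can in principle be issued through Slurm's public C API. The node name "nid00042" is hypothetical, and NODE_STATE_POWER_SAVE is only assumed to be the node_state value that scontrol maps "POWER_DOWN" to; verify against slurm.h for the release in use.

```c
/*
 * Hedged sketch: request POWER_DOWN for a stuck node via the Slurm API,
 * equivalent in intent to:  scontrol update nodename=nid00042 state=power_down
 * "nid00042" is a hypothetical node name; NODE_STATE_POWER_SAVE is assumed
 * to correspond to scontrol's "POWER_DOWN".  Build with: gcc ... -lslurm
 */
#include <stdio.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

int main(void)
{
	update_node_msg_t node_msg;
	char node_name[] = "nid00042";		/* hypothetical node name */

	slurm_init_update_node_msg(&node_msg);
	node_msg.node_names = node_name;
	node_msg.node_state = NODE_STATE_POWER_SAVE;	/* assumed POWER_DOWN */

	if (slurm_update_node(&node_msg) != SLURM_SUCCESS) {
		slurm_perror("slurm_update_node");
		return 1;
	}
	printf("POWER_DOWN requested; pending POWER_UP request cancelled\n");
	return 0;
}
```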
diff --git a/src/plugins/burst_buffer/cray/burst_buffer_cray.c b/src/plugins/burst_buffer/cray/burst_buffer_cray.c
index 4f2d71bf693c9b32f67444e289a1225c0e33ad08..99a9fdfc2292cec5b601593f9809bd77785066c9 100644
--- a/src/plugins/burst_buffer/cray/burst_buffer_cray.c
+++ b/src/plugins/burst_buffer/cray/burst_buffer_cray.c
@@ -4052,7 +4052,7 @@ static void *_start_pre_run(void *x)
 		NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
 	/* Locks: write job */
 	slurmctld_lock_t job_write_lock = {
-		NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
+		NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, READ_LOCK };
 	pre_run_args_t *pre_run_args = (pre_run_args_t *) x;
 	char *resp_msg = NULL;
 	char jobid_buf[64];
diff --git a/src/plugins/select/cray/ccm.c b/src/plugins/select/cray/ccm.c
index 5f20824cc43f27f1e17c654afd465b15534c126b..6ce9bfb475bbf474ec35e130630c4a53d7c718cc 100644
--- a/src/plugins/select/cray/ccm.c
+++ b/src/plugins/select/cray/ccm.c
@@ -530,19 +530,33 @@ extern int ccm_check_partitions(struct job_record *job_ptr)
 extern void *ccm_begin(void *args)
 {
 	int i, j, num_ents, kill = 1;
+	uint32_t job_id;
 	size_t copysz;
 	ccm_info_t ccm_info;
 	char err_str_buf[128], srun_msg_buf[256];
 	struct job_record *job_ptr = (struct job_record *)args;
 	slurmctld_lock_t job_read_lock =
-		{NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
+		{ NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
+	slurmctld_lock_t job_write_lock =
+		{ NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, READ_LOCK };
+
+	lock_slurmctld(job_read_lock);
+	if (job_ptr->magic != JOB_MAGIC) {
+		unlock_slurmctld(job_read_lock);
+		error("ccm job has disappeared");
+		return NULL;
+	} else if (IS_JOB_COMPLETING(job_ptr)) {
+		unlock_slurmctld(job_read_lock);
+		debug("ccm %u job has already completed", job_ptr->job_id);
+		return NULL;
+	}
+
+	job_id = job_ptr->job_id;
 
 	debug2("CCM job %u_ccm_begin partition %s", job_ptr->job_id,
 	       job_ptr->partition);
 	memset(&ccm_info, 0, sizeof(ccm_info_t));
-	lock_slurmctld(job_read_lock);
-
 	ccm_info.job_id = job_ptr->job_id;
 	ccm_info.user_id = job_ptr->user_id;
 	ccm_info.nodelist = xstrdup(job_ptr->nodes);
@@ -585,7 +599,6 @@ extern void *ccm_begin(void *args)
 		ccm_info.task_dist = job_ptr->details->task_dist;
 	}
 	ccm_info.plane_size = job_ptr->details->plane_size;
-	unlock_slurmctld(job_read_lock);
 
 	debug("CCM job %u, user_id %u, nodelist %s, node_cnt %d, "
 	      "num_tasks %d", ccm_info.job_id, ccm_info.user_id,
@@ -601,10 +614,12 @@ extern void *ccm_begin(void *args)
 			num_ents++;
 		}
 	}
+	unlock_slurmctld(job_read_lock);
+
 	if (ccm_info.node_cnt != num_ents) {
 		CRAY_ERR("CCM job %u ccm_info.node_cnt %d doesn't match the "
 			 "number of cpu_count_reps entries %d",
-			 job_ptr->job_id, ccm_info.node_cnt, num_ents);
+			 job_id, ccm_info.node_cnt, num_ents);
 		snprintf(err_str_buf, sizeof(err_str_buf),
 			 "node_cnt %d != cpu_count_reps %d, prolog not run",
 			 ccm_info.node_cnt, num_ents);
@@ -614,6 +629,14 @@ extern void *ccm_begin(void *args)
 			snprintf(err_str_buf, sizeof(err_str_buf),
 				 "prolog failed");
 	}
+
+	lock_slurmctld(job_write_lock);
+	if ((job_ptr->magic != JOB_MAGIC) ||
+	    (job_ptr->job_id != job_id)) {
+		unlock_slurmctld(job_write_lock);
+		error("ccm job %u has disappeared after running ccm", job_id);
+		return NULL;
+	}
 	debug("CCM ccm_begin job %u prolog_running_decr, cur %d",
 	      ccm_info.job_id, job_ptr->details->prolog_running);
 	prolog_running_decr(job_ptr);
@@ -625,6 +648,7 @@ extern void *ccm_begin(void *args)
 		srun_user_message(job_ptr, srun_msg_buf);
 		(void) job_signal(job_ptr->job_id, SIGKILL, 0, 0, false);
 	}
+	unlock_slurmctld(job_write_lock);
 	/* Free the malloc'd fields within this structure */
 	_free_ccm_info(&ccm_info);
 	return NULL;
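Illustrative sketch (not part of the patch): the ccm.c hunks above drop the job read lock, run the CCM prolog, then take a job write lock (which now also holds a federation read lock) and re-validate the job_record before touching it again. The pattern, condensed; locked_job_update() and do_work_on_job() are hypothetical names, and the internal headers are assumed to be available inside the Slurm source tree.

```c
/*
 * Sketch of the re-validation pattern used in ccm_begin() above: a cached
 * job pointer can only be trusted again once the write lock is held and the
 * record still matches the remembered job id.
 */
#include "src/common/log.h"		/* error() */
#include "src/slurmctld/locks.h"	/* slurmctld_lock_t, lock_slurmctld() */
#include "src/slurmctld/slurmctld.h"	/* struct job_record, JOB_MAGIC */

extern void do_work_on_job(struct job_record *job_ptr);	/* hypothetical */

static void locked_job_update(struct job_record *job_ptr, uint32_t job_id)
{
	/* Write jobs; read federation state, matching the hunks above */
	slurmctld_lock_t job_write_lock =
		{ NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, READ_LOCK };

	lock_slurmctld(job_write_lock);
	if ((job_ptr->magic != JOB_MAGIC) || (job_ptr->job_id != job_id)) {
		/* Job was freed or replaced while the locks were dropped */
		unlock_slurmctld(job_write_lock);
		error("job %u has disappeared", job_id);
		return;
	}
	do_work_on_job(job_ptr);	/* e.g. prolog_running_decr() above */
	unlock_slurmctld(job_write_lock);
}
```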
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index 52eff2ee0a25b0566848dacbb9a75ea76c5b299f..44344dbee34f42f81b54cf8ef303b9943161aef5 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -4289,7 +4289,8 @@ static void *_wait_boot(void *arg)
 		READ_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
 	/* Locks: Write jobs; write nodes */
 	slurmctld_lock_t node_write_lock = {
-		READ_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
+		READ_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, READ_LOCK };
+	bitstr_t *boot_node_bitmap;
 	uint16_t resume_timeout = slurm_get_resume_timeout();
 	struct node_record *node_ptr;
 	time_t start_time = time(NULL);
@@ -4389,7 +4390,7 @@ static void *_run_prolog(void *arg)
 	char *argv[2], **my_env;
 	/* Locks: Read config; Write jobs, nodes */
 	slurmctld_lock_t config_read_lock = {
-		READ_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
+		READ_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, READ_LOCK };
 	bitstr_t *node_bitmap = NULL;
 	time_t now = time(NULL);
 	uint16_t resume_timeout = slurm_get_resume_timeout();
@@ -4504,6 +4505,9 @@ static void *_run_prolog(void *arg)
 /* Decrement a job's prolog_running counter and launch the job if zero */
 extern void prolog_running_decr(struct job_record *job_ptr)
 {
+	xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
+	xassert(verify_lock(FED_LOCK, READ_LOCK));
+
 	if (!job_ptr)
 		return;
 
diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index 351e0236f0edc6292c3e83dcb20e9b1b80e52add..4e980f1df7223a773ee1a8e6e7d871c0763da09e 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -2335,6 +2335,7 @@ _rpc_batch_job(slurm_msg_t *msg, bool new_msg)
 		if (retry_cnt > 50) {
 			rc = ESLURMD_PROLOG_FAILED;
+			slurm_mutex_unlock(&prolog_mutex);
 			goto done;
 		}
 
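Illustrative sketch (not part of the patch): the req.c hunk adds the missing prolog_mutex unlock on the retry-exhausted error path, so slurmd no longer hangs holding the lock. Reduced to a self-contained example with a plain pthread mutex standing in for prolog_mutex and -1 standing in for ESLURMD_PROLOG_FAILED, the corrected shape of that path is:

```c
/*
 * Minimal sketch of the error-path pattern fixed above: every early exit
 * from the critical section must release the mutex it holds.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t prolog_mutex = PTHREAD_MUTEX_INITIALIZER;

static int wait_for_prolog(int retry_cnt)
{
	int rc = 0;

	pthread_mutex_lock(&prolog_mutex);
	if (retry_cnt > 50) {
		rc = -1;	/* stands in for ESLURMD_PROLOG_FAILED */
		/* The fix: unlock before bailing out of the critical section */
		pthread_mutex_unlock(&prolog_mutex);
		goto done;
	}
	/* ... normal path would continue waiting for the prolog here ... */
	pthread_mutex_unlock(&prolog_mutex);
done:
	return rc;
}

int main(void)
{
	printf("rc=%d\n", wait_for_prolog(51));
	return 0;
}
```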
diff --git a/testsuite/expect/globals b/testsuite/expect/globals
index 742a04ab2d84eeca472c64d2e279e38c49e25e26..6535aa321815de24e5e7b0737717541d0868098f 100755
--- a/testsuite/expect/globals
+++ b/testsuite/expect/globals
@@ -3546,6 +3546,74 @@ proc get_partition_nodes {partition states} {
 	return $node_list
 }
 
+#####################################################################
+#
+# Proc: get_partition_maximum_time_limit
+#
+# Purpose: Get the maximum time limit in a given partition
+#
+# Input: partition - partition to get the max time limit of
+#
+# Returns: time limit in seconds, -1 if undefined or error
+#
+#####################################################################
+
+proc get_partition_maximum_time_limit {partition} {
+
+	global sinfo number exit_code
+
+	if {[string length $partition] == 0} {
+		set partition [default_partition]
+	}
+
+	set secs 0
+	log_user 0
+	set sinfo_pid [spawn -noecho $sinfo -h -p $partition -O time -e]
+	expect {
+		-re "infinite" {
+			set secs -1
+			exp_continue
+		}
+		-re "n/a" {
+			set secs -1
+			exp_continue
+		}
+		-re "($number)-($number):($number):($number)" {
+			set days [expr $expect_out(1,string) * 24 * 60 * 60]
+			set hours [expr $expect_out(2,string) * 60 * 60]
+			set mins [expr $expect_out(3,string) * 60]
+			set secs [expr $days + $hours + $mins + $expect_out(4,string)]
+			exp_continue
+		}
+		-re "($number):($number):($number)" {
+			set hours [expr $expect_out(1,string) * 60 * 60]
+			set mins [expr $expect_out(2,string) * 60]
+			set secs [expr $hours + $mins + $expect_out(3,string)]
+			exp_continue
+		}
+		-re "($number):($number)" {
+			set mins [expr $expect_out(1,string) * 60]
+			set secs [expr $mins + $expect_out(2,string)]
+			exp_continue
+		}
+		-re "($number)" {
+			set secs [expr $expect_out(1,string) * 60]
+			exp_continue
+		}
+		timeout {
+			send_user "\nFAILURE: sinfo not responding\n"
+			slow_kill $sinfo_pid
+			set exit_code 1
+		}
+		eof {
+			wait
+		}
+	}
+
+	log_user 1
+	return $secs
+}
+
 #####################################################################
 #
 # Proc: get_partition_default_time_limit
@@ -3560,7 +3628,7 @@ proc get_partition_nodes {partition states} {
 
 proc get_partition_default_time_limit {partition} {
 
-	global sinfo number
+	global sinfo number exit_code
 
 	if {[string length $partition] == 0} {
 		set partition [default_partition]
 	}
@@ -3570,6 +3638,10 @@ proc get_partition_default_time_limit {partition} {
 	log_user 0
 	set sinfo_pid [spawn -noecho $sinfo -h -p $partition -O defaulttime -e]
 	expect {
+		-re "infinite" {
+			set secs -1
+			exp_continue
+		}
 		-re "n/a" {
 			set secs -1
 			exp_continue
 		}
@@ -3968,6 +4040,51 @@ proc test_pack_step { } {
 	return $pack_step
 }
 
+################################################################
+#
+# Proc: reconfigure
+#
+# Purpose: Calls scontrol reconfigure.
+#
+# Input    (optional) cluster - The cluster to reconfigure.
+# Output   global exit_code - Sets exit_code to 1 on failure.
+#
+# Returns void
+#
+################################################################
+proc reconfigure { {cluster ""} } {
+	global exit_code scontrol
+
+	if { $cluster == "" } {
+		spawn $scontrol reconfigure
+	} else {
+		spawn $scontrol -M$cluster reconfigure
+	}
+	expect {
+		-re "slurm_reconfigure error: Invalid user id" {
+			log_error "Invalid user id"
+			set exit_code 1
+			exp_continue
+		}
+		-re "Error|error" {
+			log_error "scontrol reconfigure error"
+			set exit_code 1
+			exp_continue
+		}
+		timeout {
+			log_error "scontrol not responding"
+			set exit_code 1
+		}
+		eof {
+			wait
+		}
+	}
+	#
+	# Wait for reconfigure to complete, then return.
+	#
+	sleep 5
+}
+
 proc log_error {msg} {
 	send_user "\nFAILURE: $msg\n"
 }
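Illustrative sketch (not part of the patch): get_partition_maximum_time_limit above converts the time formats printed by sinfo (days-hours:minutes:seconds, hours:minutes:seconds, minutes:seconds, a bare minutes value, or infinite/n/a) into seconds. The same arithmetic expressed as a standalone C helper; the function name and the -1 error convention are this sketch's own.

```c
/*
 * Sketch: convert a Slurm time-limit string as printed by sinfo into
 * seconds, mirroring the arithmetic in get_partition_maximum_time_limit.
 * Returns -1 for "infinite"/"n/a" or unparsable input.
 */
#include <stdio.h>
#include <string.h>

static long timelimit_to_secs(const char *s)
{
	long d, h, m, sec;

	if (!strcmp(s, "infinite") || !strcmp(s, "n/a"))
		return -1;
	if (sscanf(s, "%ld-%ld:%ld:%ld", &d, &h, &m, &sec) == 4)
		return d * 86400 + h * 3600 + m * 60 + sec;
	if (sscanf(s, "%ld:%ld:%ld", &h, &m, &sec) == 3)
		return h * 3600 + m * 60 + sec;
	if (sscanf(s, "%ld:%ld", &m, &sec) == 2)
		return m * 60 + sec;
	if (sscanf(s, "%ld", &m) == 1)	/* a bare value is in minutes */
		return m * 60;
	return -1;
}

int main(void)
{
	printf("%ld\n", timelimit_to_secs("1-00:30:00"));	/* 88200 */
	printf("%ld\n", timelimit_to_secs("infinite"));		/* -1 */
	return 0;
}
```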
diff --git a/testsuite/expect/test17.54 b/testsuite/expect/test17.54
index 25838e2217e1e8eafe2bfbe5fd6138303f5e5b77..b38f583c151b465827ff093e5bf5d4bcd54addcd 100755
--- a/testsuite/expect/test17.54
+++ b/testsuite/expect/test17.54
@@ -40,6 +40,50 @@ set job_id      0
 
 print_header $test_id
 
+#
+# We must be SlurmUser or root in order to change the partition MaxTime limit,
+# otherwise this test may fail.
+#
+if {[test_super_user] == 0} {
+	log_warn "You must be SlurmUser or root to run this test."
+	exit 0
+}
+
+set default_part [default_partition]
+if { $default_part == "" } {
+	log_error "Unable to get the default partition."
+	exit 1
+}
+
+#
+# Ensure that MaxTime is UNLIMITED so this test won't fail due to trying to set
+# time limits greater than MaxTime. Ensure DefaultTime is UNLIMITED so the job
+# time limit is what we expect.
+#
+spawn $scontrol update partitionname=$default_part MaxTime=-1 DefaultTime=-1
+expect {
+	timeout {
+		log_error "scontrol not responding"
+		set exit_code 1
+	}
+	eof {
+		wait
+	}
+}
+if { $exit_code == 1 } {
+	exit $exit_code
+}
+set timelimit [get_partition_maximum_time_limit $default_part]
+if { $timelimit != -1 } {
+	log_error "Unable to update partition MaxTime to UNLIMITED"
+	exit 1
+}
+set timelimit [get_partition_default_time_limit $default_part]
+if { $timelimit != -1 } {
+	log_error "Unable to update partition DefaultTime to UNLIMITED"
+	exit 1
+}
+
 #
 # Build input script file
 #
@@ -105,6 +149,12 @@ expect {
 	}
 }
 
+#
+# Restore partition MaxTime and DefaultTime.
+# reconfigure will set exit_code and log any errors if it fails.
+#
+reconfigure
+
 if {$exit_code == 0} {
 	exec $bin_rm -f $file_in
 	send_user "\nSUCCESS\n"
diff --git a/testsuite/expect/test17.8 b/testsuite/expect/test17.8
index 9f7965f8e4d30222416c15c69cdab0f4c101342c..c813f541b0b84ec4f577ef2044a545986dd1165e 100755
--- a/testsuite/expect/test17.8
+++ b/testsuite/expect/test17.8
@@ -41,6 +41,44 @@ set time_get    0
 
 print_header $test_id
 
+#
+# We must be SlurmUser or root in order to change the partition MaxTime limit,
+# otherwise this test may fail.
+#
+if {[test_super_user] == 0} {
+	log_warn "You must be SlurmUser or root to run this test."
+	exit 0
+}
+
+set default_part [default_partition]
+if { $default_part == "" } {
+	log_error "Unable to get the default partition."
+	exit 1
+}
+
+#
+# Ensure that MaxTime is UNLIMITED so this test won't fail due to trying to set
+# time limits greater than MaxTime.
+#
+spawn $scontrol update partitionname=$default_part MaxTime=-1
+expect {
+	timeout {
+		log_error "scontrol not responding"
+		set exit_code 1
+	}
+	eof {
+		wait
+	}
+}
+if { $exit_code == 1 } {
+	exit $exit_code
+}
+set maxtime [get_partition_maximum_time_limit $default_part]
+if { $maxtime != -1 } {
+	log_error "Unable to update partition MaxTime"
+	exit 1
+}
+
 #
 # Delete left-over stdin/out/err files
 # Build stdin file
@@ -184,6 +222,12 @@ if {$time_get != 1500} {
 }
 cancel_job $job_id
 
+#
+# Restore partition MaxTime.
+# reconfigure will set exit_code and log any errors if it fails.
+#
+reconfigure
+
 if {$exit_code == 0} {
 	exec $bin_rm -f $file_in
 	send_user "\nSUCCESS\n"