From 4224c25c4ad0575354d0892f485a91a6666f615e Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Thu, 28 Jul 2011 08:12:00 -0700
Subject: [PATCH] Document use of consistent Cray/SLURM node timeout values

---
 doc/html/cray.shtml | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/doc/html/cray.shtml b/doc/html/cray.shtml
index 2459d970bce..b538747c55b 100644
--- a/doc/html/cray.shtml
+++ b/doc/html/cray.shtml
@@ -380,6 +380,12 @@ This is specified in the <i>slurm.conf</i> file by using the
 <i>FrontendName</i> and optionally the <i>FrontEndAddr</i> fields
 as seen in the examples below.</p>
 
+<p>Note that by default SLURM kills running jobs when a node goes DOWN,
+whereas a DOWN node in ALPS only prevents new jobs from being scheduled on
+that node. To keep the two systems consistent, we recommend setting
+<i>SlurmdTimeout</i> in the <i>slurm.conf</i> file to the same value as the
+<i>suspectend</i> parameter in ALPS' <i>nodehealth.conf</i> file.</p>
+
 <p>You need to specify the appropriate resource selection plugin (the
 <i>SelectType</i> option in SLURM's <i>slurm.conf</i> configuration file).
 Configure <i>SelectType</i> to <i>select/cray</i> The <i>select/cray</i> 
@@ -450,6 +456,10 @@ SlurmdPidFile=/var/run/slurmd.pid
 # Return DOWN nodes to service when e.g. slurmd has been unresponsive
 ReturnToService=1
 
+# Configure the suspectend parameter in ALPS' nodehealth.conf file to the same
+# value as SlurmdTimeout for consistent behavior (e.g. "suspectend: 600")
+SlurmdTimeout=600
+
 # Controls how a node's configuration specifications in slurm.conf are
 # used.
 # 0 - use hardware configuration (must agree with slurm.conf)
@@ -621,6 +631,6 @@ allocation.</p>
 
 <p class="footer"><a href="#top">top</a></p>
 
-<p style="text-align:center;">Last modified 27 July 2011</p></td>
+<p style="text-align:center;">Last modified 28 July 2011</p></td>
 
 <!--#include virtual="footer.txt"-->
-- 
GitLab