From 7246b955f63d12f6b049be301fcb1f72d720b5fa Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Fri, 6 May 2016 10:01:18 -0700
Subject: [PATCH] Document lack of current support for KNL quad/SNC4

Note that Slurm cannot support heterogeneous core counts across
NUMA nodes.
bug 2704
---
 doc/html/intel_knl.shtml | 19 +++++++++++++------
 doc/man/man5/knl.conf.5  |  9 ++++++++-
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/doc/html/intel_knl.shtml b/doc/html/intel_knl.shtml
index 327cd016a54..d43b86341f7 100644
--- a/doc/html/intel_knl.shtml
+++ b/doc/html/intel_knl.shtml
@@ -46,8 +46,8 @@ value for that configuration will be used.</p>
 <tr><td>NUMA</td><td>a2a</td><td>All to all</td></tr>
 <tr><td>NUMA</td><td>hemi</td><td>Hemisphere</td></tr>
 <tr><td>NUMA</td><td>snc2</td><td>Sub-NUMA cluster 2</td></tr>
-<tr><td>NUMA</td><td>snc4</td><td>Sub-NUMA cluster 4</td></tr>
-<tr><td>NUMA</td><td>quad</td><td>Quadrant</td></tr>
+<tr><td>NUMA</td><td>snc4</td><td>Sub-NUMA cluster 4 (<a href="#note">NOTE</a>)</td></tr>
+<tr><td>NUMA</td><td>quad</td><td>Quadrant (<a href="#note">NOTE</a>)</td></tr>
 </table>
 
 <p>Jobs requiring some or all of the KNL high bandwidth memory (HBM) should
@@ -55,6 +55,13 @@ explicitly request that memory using Slurm's Generic RESource (GRES) options.
 The HBM will always be known by Slurm GRES name of "hbm".
 Examples below demonstrate use of HBM.</p>
 
+<p><a name="note"><b>NOTE</b></a>: Slurm version 16.05 and earlier can only
+support homogeneous nodes (e.g. the same number of cores per NUMA node).
+KNL snc4 and quad modes are not homogeneous, but each NUMA node will have
+either 16 or 18 cores. This will result in Slurm using the lower core count,
+finding a total of 256 threads rather than 272 threads and setting the node
+to a DOWN state.</p>
+
 <h3>Accounting</h3>
 
 <p>If a node requires rebooting for a job's required configuration, the job
@@ -82,8 +89,8 @@ $ srun --constraint=flat --gres=hbm:2g -n36 a.out
 $ sinfo -o "%30N %20b %f"
 NODELIST       ACTIVE_FEATURES  AVAIL_FEATURES
 nid000[10-11]
-nid000[12-35]  flat,a2a         flat,a2a,snc2,snc4,hemi,quad
-nid000[36-43]  cache,a2a        flat,equal,cache,a2a,hemi,quad
+nid000[12-35]  flat,a2a         flat,a2a,snc2,hemi
+nid000[36-43]  cache,a2a        flat,equal,cache,a2a,hemi
 </pre>
 
 <h3>Network Topology</h3>
@@ -212,8 +219,8 @@ information provided by the capmc program.</p>
 CapmcPath=/opt/cray/capmc/default/bin/capmc
 CapmcTimeout=2000	# msec
 DefaultNUMA=a2a         # NUMA=all2all
+AllowNUMA=a2a,snc2,hemi
 DefaultMCDRAM=cache     # MCDRAM=cache
-</pre>
 
 <h3>Sample slurm.conf File</h3>
 
@@ -233,6 +240,6 @@ NodeName=nid[00000-00127] State=UNKNOWN
 
 <p class="footer"><a href="#top">top</a></p>
 
-<p style="text-align:center;">Last modified 30 March 2016</p>
+<p style="text-align:center;">Last modified 6 May 2016</p>
 
 <!--#include virtual="footer.txt"-->
diff --git a/doc/man/man5/knl.conf.5 b/doc/man/man5/knl.conf.5
index e58d43eb4ea..85831960753 100644
--- a/doc/man/man5/knl.conf.5
+++ b/doc/man/man5/knl.conf.5
@@ -1,4 +1,4 @@
-.TH "knl.conf" "5" "Slurm Configuration File" "April 2016" "Slurm Configuration File"
+.TH "knl.conf" "5" "Slurm Configuration File" "May 2016" "Slurm Configuration File"
 
 .SH "NAME"
 knl.conf \- Slurm configuration file for Intel Knights Landing processor.
@@ -49,6 +49,13 @@ This may be a subset of NUMA modes supported by the node.
 If not specified, all NUMA modes supported by the node are available for use.
 The comma separated list of allowed NUMA modes may include any of the modes
 listed below.
+Note that Slurm version 16.05 and earlier can only support homogeneous nodes
+(e.g. the same number of cores per NUMA node).
+KNL snc4 and quad modes are not homogeneous, but each NUMA node will have
+either 16 or 18 cores.
+This will result in Slurm using the lower core count and finding a total of
+256 threads rather than 272 threads and setting the node to a DOWN state.
+Therefore it is recommended that snc4 and quad mode not be allowed at this time.
 .RS
 .TP 17
 \fBa2a\fR
-- 
GitLab