From 412846edceb21a66a35242639cccd690e4e60682 Mon Sep 17 00:00:00 2001
From: Dominik Bartkiewicz <bart@schedmd.com>
Date: Mon, 18 Dec 2017 11:16:45 -0700
Subject: [PATCH] Avoid using knl_generic plugin except on real KNL nodes

Add "Force=1" to knl_generic.conf to override (for testing)
bug 4487
---
 doc/man/man5/knl.conf.5                       | 10 ++++-
 .../knl_generic/node_features_knl_generic.c   | 39 +++++++++++++++----
 2 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/doc/man/man5/knl.conf.5 b/doc/man/man5/knl.conf.5
index 0eb0e17fc81..581a66c006e 100644
--- a/doc/man/man5/knl.conf.5
+++ b/doc/man/man5/knl.conf.5
@@ -1,4 +1,4 @@
-.TH "knl.conf" "5" "Slurm Configuration File" "May 2016" "Slurm Configuration File"
+.TH "knl.conf" "5" "Slurm Configuration File" "December 2017" "Slurm Configuration File"
 
 .SH "NAME"
 knl.conf \- Slurm configuration file for Intel Knights Landing processor.
@@ -135,6 +135,12 @@ The value can include one of the possible values identified with the
 \fBAllowNUMA\fR configuration parameter above.
 The default value is "a2a".
 
+.TP
+\fBForce\fR
+If set to a non\-zero value then load the node_features/generic plugin even
+on non\-KNL nodes.
+Used primarily for testing purposes.
+
 .TP
 \fBLogFile\fR
 Fully qualified path to a log file.
@@ -205,7 +211,7 @@ LogFile=/var/tmp/slurm_node_feature.log
 SyscfgPath=/usr/sbin/syscfg
 
 .SH "COPYING"
-Copyright (C) 2015-2016 SchedMD LLC.
+Copyright (C) 2015-2017 SchedMD LLC.
 .LP
 This file is part of Slurm, a resource management program.
 For details, see <https://slurm.schedmd.com/>.
diff --git a/src/plugins/node_features/knl_generic/node_features_knl_generic.c b/src/plugins/node_features/knl_generic/node_features_knl_generic.c
index ada184f5029..02933924db7 100644
--- a/src/plugins/node_features/knl_generic/node_features_knl_generic.c
+++ b/src/plugins/node_features/knl_generic/node_features_knl_generic.c
@@ -178,6 +178,8 @@ static knl_system_type_t knl_system_type = KNL_SYSTEM_TYPE_INTEL;
 static uint32_t ume_check_interval = 0;
 static pthread_mutex_t ume_mutex = PTHREAD_MUTEX_INITIALIZER;
 static pthread_t ume_thread = 0;
+static uint32_t force_load = 0;
+static int hw_is_knl = -1;
 
 /* Percentage of MCDRAM used for cache by type, updated from syscfg */
 static int mcdram_pct[KNL_MCDRAM_CNT];
@@ -190,6 +192,7 @@ static s_p_options_t knl_conf_file_options[] = {
 	{"BootTime", S_P_UINT32},
 	{"DefaultMCDRAM", S_P_STRING},
 	{"DefaultNUMA", S_P_STRING},
+	{"Force", S_P_UINT32},
 	{"LogFile", S_P_STRING},
 	{"McPath", S_P_STRING},
 	{"SyscfgPath", S_P_STRING},
@@ -714,6 +717,9 @@ extern int init(void)
 	s_p_hashtbl_t *tbl;
 	struct stat stat_buf;
 	int rc = SLURM_SUCCESS;
+	char* cpuinfo_path = "/proc/cpuinfo";
+	FILE *cpu_info_file;
+	char buf[1024];
 
 	/* Set default values */
 	allow_mcdram = KNL_MCDRAM_FLAG;
@@ -774,6 +780,7 @@ extern int init(void)
 			}
 			xfree(tmp_str);
 		}
+		(void) s_p_get_uint32(&force_load, "Force", tbl);
 		(void) s_p_get_string(&mc_path, "McPath", tbl);
 		(void) s_p_get_string(&syscfg_path, "SyscfgPath", tbl);
 		if (s_p_get_string(&tmp_str, "SystemType", tbl)) {
@@ -802,6 +809,20 @@ extern int init(void)
 	else
 		syscfg_found = 0;
 
+	hw_is_knl = 0;
+	cpu_info_file = fopen(cpuinfo_path, "r");
+	if (cpu_info_file == NULL) {
+		error("Error opening/reading %s: %m", cpuinfo_path);
+	} else {
+		while (fgets(buf, sizeof(buf), cpu_info_file)) {
+			if (strstr(buf, "Xeon Phi")) {
+				hw_is_knl = 1;
+				break;
+			}
+		}
+		fclose(cpu_info_file);
+	}
+
 	if ((resume_program = slurm_get_resume_program())) {
 		error("Use of ResumeProgram with %s not currently supported",
 		      plugin_name);
@@ -824,6 +845,7 @@ extern int init(void)
 		info("BootTIme=%u", boot_time);
 		info("DefaultMCDRAM=%s DefaultNUMA=%s",
 		     default_mcdram_str, default_numa_str);
+		info("Force=%u", force_load);
 		info("McPath=%s", mc_path);
 		info("SyscfgPath=%s", syscfg_path);
 		info("SyscfgTimeout=%u msec", syscfg_timeout);
@@ -911,11 +933,11 @@ extern void node_features_p_node_state(char **avail_modes, char **current_mode)
 
 	if (!syscfg_path || !avail_modes || !current_mode)
 		return;
-	if (syscfg_found == 0) {
+	if ((syscfg_found == 0) || (!hw_is_knl && !force_load)) {
 		/* This node on cluster lacks syscfg; should not be KNL */
 		static bool log_event = true;
 		if (log_event) {
-			info("%s: syscfg program not found, can not get KNL modes",
+			info("%s: syscfg program not found or node isn't KNL, can not get KNL modes",
 			     __func__);
 			log_event = false;
 		}
@@ -1289,11 +1311,14 @@ extern int node_features_p_node_set(char *active_features)
 		error("%s: SyscfgPath not configured", __func__);
 		return SLURM_ERROR;
 	}
-	if (syscfg_found == 0) {
-		/* This node on cluster lacks syscfg; should not be KNL.
-		 * This code should never be reached */
-		error("%s: syscfg program not found; can not set KNL modes",
-		      __func__);
+	if ((syscfg_found == 0) || (!hw_is_knl && !force_load)) {
+		/* This node on cluster lacks syscfg; should not be KNL */
+		static bool log_event = true;
+		if (log_event) {
+			error("%s: syscfg program not found or node isn't KNL; can not set KNL modes",
+			      __func__);
+			log_event = false;
+		}
 		return SLURM_ERROR;
 	}
 
-- 
GitLab