From c38beaec91f8b79cd2e7ee081e964a19f6acbca9 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 19 May 2009 23:34:42 +0000
Subject: [PATCH] Slurmd notifies slurmctld of node boot time to better clean
 up after node     reboots. Slurmd sends node registration information
 repeatedly until successful     transmit.

---
 NEWS                              |   7 ++
 src/common/slurm_protocol_api.c   |  17 +++
 src/common/slurm_protocol_api.h   |   5 +
 src/common/slurm_protocol_defs.h  |   1 +
 src/common/slurm_protocol_pack.c  |  24 +++--
 src/slurmctld/job_mgr.c           |  80 ++++++++++----
 src/slurmctld/slurmctld.h         |   1 +
 src/slurmd/slurmd/get_mach_stat.c | 103 +++++++++++++-----
 src/slurmd/slurmd/get_mach_stat.h |   1 +
 src/slurmd/slurmd/req.c           |   8 +-
 src/slurmd/slurmd/slurmd.c        | 168 +++++++++++++++++++++---------
 src/slurmd/slurmd/slurmd.h        |   2 +
 12 files changed, 304 insertions(+), 113 deletions(-)

diff --git a/NEWS b/NEWS
index 4eeb3f751ca..0e9f8e3abc5 100644
--- a/NEWS
+++ b/NEWS
@@ -1,6 +1,13 @@
 This file describes changes in recent versions of SLURM. It primarily
 documents those changes that are of interest to users and admins.
 
+* Changes in SLURM 2.1.0-pre1
+=============================
+ -- Slurmd notifies slurmctld of node boot time to better clean up after node
+    reboots.
+ -- Slurmd sends node registration information repeatedly until successful
+    transmit.
+
 * Changes in SLURM 2.0.0
 ============================
  -- Fix for bluegene systems to be able to create 32 node blocks with only 
diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c
index 8a35ce11604..57b6725fc13 100644
--- a/src/common/slurm_protocol_api.c
+++ b/src/common/slurm_protocol_api.c
@@ -245,6 +245,23 @@ uint16_t slurm_get_resume_timeout(void)
         return resume_timeout;
 }
 
+/* slurm_get_suspend_time
+ * RET SuspendTime value from slurm.conf
+ */
+uint16_t slurm_get_suspend_time(void)
+{
+	uint16_t suspend_time = 0;
+	slurm_ctl_conf_t *conf;
+
+	if(slurmdbd_conf) {
+	} else {
+		conf = slurm_conf_lock();
+		suspend_time = conf->suspend_time;
+		slurm_conf_unlock();
+	}
+	return suspend_time;
+}
+
 /* slurm_get_def_mem_per_task
  * RET DefMemPerTask value from slurm.conf
  */
diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h
index 5b7f67b96b4..e5c51b6bfef 100644
--- a/src/common/slurm_protocol_api.h
+++ b/src/common/slurm_protocol_api.h
@@ -108,6 +108,11 @@ uint16_t slurm_get_batch_start_timeout(void);
  */
 uint16_t slurm_get_resume_timeout(void);
 
+/* slurm_get_suspend_time
+ * RET SuspendTime value from slurm.conf
+ */
+uint16_t slurm_get_suspend_time(void);
+
 /* slurm_get_complete_wait
  * RET CompleteWait value from slurm.conf
  */
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index ebb731f40ae..e96ed53ecc6 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -798,6 +798,7 @@ typedef struct slurm_node_registration_status_msg {
 	uint32_t *step_id;	/* IDs of running job steps (if any) */
 	uint32_t status;	/* node status code, same as return codes */
 	uint16_t startup;	/* slurmd just restarted */
+	uint32_t up_time;	/* seconds since reboot */
 	switch_node_info_t switch_nodeinfo;	/* set only if startup != 0 */
 } slurm_node_registration_status_msg_t;
 
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index cb240584c1e..7ca80258d93 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -1679,24 +1679,26 @@ _pack_node_registration_status_msg(slurm_node_registration_status_msg_t *
 	xassert(msg != NULL);
 
 	pack_time(msg->timestamp, buffer);
-	pack32((uint32_t)msg->status, buffer);
+	pack32(msg->status, buffer);
 	packstr(msg->node_name, buffer);
 	packstr(msg->arch, buffer);
 	packstr(msg->os, buffer);
-	pack16((uint32_t)msg->cpus, buffer);
-	pack16((uint32_t)msg->sockets, buffer);
-	pack16((uint32_t)msg->cores, buffer);
-	pack16((uint32_t)msg->threads, buffer);
-	pack32((uint32_t)msg->real_memory, buffer);
-	pack32((uint32_t)msg->tmp_disk, buffer);
-	pack32((uint32_t)msg->job_count, buffer);
+	pack16(msg->cpus, buffer);
+	pack16(msg->sockets, buffer);
+	pack16(msg->cores, buffer);
+	pack16(msg->threads, buffer);
+	pack32(msg->real_memory, buffer);
+	pack32(msg->tmp_disk, buffer);
+	pack32(msg->up_time, buffer);
+
+	pack32(msg->job_count, buffer);
 	for (i = 0; i < msg->job_count; i++) {
-		pack32((uint32_t)msg->job_id[i], buffer);
+		pack32(msg->job_id[i], buffer);
 	}
 	for (i = 0; i < msg->job_count; i++) {
 		pack32(msg->step_id[i], buffer);
 	}
-	pack16((uint16_t)msg->startup, buffer);
+	pack16(msg->startup, buffer);
 	if (msg->startup)
 		switch_g_pack_node_info(msg->switch_nodeinfo, buffer);
 }
@@ -1727,6 +1729,8 @@ _unpack_node_registration_status_msg(slurm_node_registration_status_msg_t
 	safe_unpack16(&node_reg_ptr->threads, buffer);
 	safe_unpack32(&node_reg_ptr->real_memory, buffer);
 	safe_unpack32(&node_reg_ptr->tmp_disk, buffer);
+	safe_unpack32(&node_reg_ptr->up_time, buffer);
+
 	safe_unpack32(&node_reg_ptr->job_count, buffer);
 	node_reg_ptr->job_id =
 		xmalloc(sizeof(uint32_t) * node_reg_ptr->job_count);
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 3516beb26c3..362cd15335c 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -140,14 +140,14 @@ static int  _list_find_job_old(void *job_entry, void *key);
 static int  _load_job_details(struct job_record *job_ptr, Buf buffer);
 static int  _load_job_state(Buf buffer);
 static void _notify_srun_missing_step(struct job_record *job_ptr, int node_inx,
-				      time_t now);
+				      time_t now, time_t node_boot_time);
 static void _pack_job_for_ckpt (struct job_record *job_ptr, Buf buffer);
 static void _pack_default_job_details(struct job_details *detail_ptr,
 				      Buf buffer);
 static void _pack_pending_job_details(struct job_details *detail_ptr,
 				      Buf buffer);
 static int  _purge_job_record(uint32_t job_id);
-static void _purge_lost_batch_jobs(int node_inx, time_t now);
+static void _purge_missing_jobs(int node_inx, time_t now);
 static void _read_data_array_from_file(char *file_name, char ***data,
 				       uint32_t * size);
 static void _read_data_from_file(char *file_name, char **data);
@@ -5336,6 +5336,13 @@ extern void validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg)
 			reg_msg->node_name);
 		return;
 	}
+
+	if (node_ptr->up_time > reg_msg->up_time) {
+		verbose("Node %s rebooted %u secs ago", 
+			reg_msg->node_name, reg_msg->up_time);
+	}
+	node_ptr->up_time = reg_msg->up_time;
+
 	node_inx = node_ptr - node_record_table_ptr;
 
 	/* Check that jobs running are really supposed to be there */
@@ -5415,7 +5422,7 @@ extern void validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg)
 
 	jobs_on_node = node_ptr->run_job_cnt + node_ptr->comp_job_cnt;
 	if (jobs_on_node)
-		_purge_lost_batch_jobs(node_inx, now);
+		_purge_missing_jobs(node_inx, now);
 
 	if (jobs_on_node != reg_msg->job_count) {
 		/* slurmd will not know of a job unless the job has
@@ -5431,42 +5438,64 @@ extern void validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg)
 }
 
 /* Purge any batch job that should have its script running on node 
- * node_inx, but is not. Allow "batch_start_timeout" secs for startup. 
+ * node_inx, but is not. Allow BatchStartTimeout + ResumeTimeout seconds 
+ * for startup. 
+ *
+ * Purge all job steps that were started before the node was last booted.
  *
  * Also notify srun if any job steps should be active on this node
  * but are not found. */
-static void _purge_lost_batch_jobs(int node_inx, time_t now)
+static void _purge_missing_jobs(int node_inx, time_t now)
 {
 	ListIterator job_iterator;
 	struct job_record *job_ptr;
-	uint16_t batch_start_timeout = slurm_get_batch_start_timeout() +
-				       slurm_get_resume_timeout();
-	time_t recent = now - batch_start_timeout;
+	struct node_record *node_ptr = node_record_table_ptr + node_inx;
+	uint16_t batch_start_timeout	= slurm_get_batch_start_timeout();
+	uint16_t msg_timeout		= slurm_get_msg_timeout();
+	uint16_t resume_timeout		= slurm_get_resume_timeout();
+	uint16_t suspend_time		= slurm_get_suspend_time();
+	time_t batch_startup_time, node_boot_time = (time_t) 0, startup_time;
+
+	if (node_ptr->up_time) {
+		node_boot_time = now - node_ptr->up_time;
+		node_boot_time -= msg_timeout;
+		node_boot_time -= 5;	/* allow for other delays */
+	}
+	batch_startup_time  = now - batch_start_timeout;
+	batch_startup_time -= msg_timeout;
 
 	job_iterator = list_iterator_create(job_list);
 	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
 		bool job_active = ((job_ptr->job_state == JOB_RUNNING) ||
 				   (job_ptr->job_state == JOB_SUSPENDED));
+
 		if ((!job_active) ||
 		    (!bit_test(job_ptr->node_bitmap, node_inx)))
 			continue;
-		if (job_ptr->batch_flag == 0) {
-			_notify_srun_missing_step(job_ptr, node_inx, now);
-			continue;
+		if ((job_ptr->batch_flag != 0)			&&
+		    (suspend_time != 0) /* power mgmt on */	&&
+		    (job_ptr->start_time < node_boot_time)) {
+			startup_time = batch_startup_time - resume_timeout;
+		} else
+			startup_time = batch_startup_time;
+
+		if ((job_ptr->batch_flag != 0)			&&
+		    (job_ptr->time_last_active < startup_time)	&&
+		    (job_ptr->start_time       < startup_time)	&&
+		    (node_inx == bit_ffs(job_ptr->node_bitmap))) {
+			info("Batch JobId=%u missing from node 0, killing it", 
+			     job_ptr->job_id);
+			job_complete(job_ptr->job_id, 0, false, NO_VAL);
+		} else {
+			_notify_srun_missing_step(job_ptr, node_inx, 
+						  now, node_boot_time);
 		}
-		if (((job_ptr->time_last_active+batch_start_timeout) > now) ||
-		    (job_ptr->start_time >= recent)     ||
-		    (node_inx != bit_ffs(job_ptr->node_bitmap)))
-			continue;
-		info("Batch JobId=%u missing from master node, killing it", 
-			job_ptr->job_id);
-		job_complete(job_ptr->job_id, 0, false, NO_VAL);
 	}
 	list_iterator_destroy(job_iterator);
 }
 
 static void _notify_srun_missing_step(struct job_record *job_ptr, int node_inx, 
-				      time_t now)
+				      time_t now, time_t node_boot_time)
 {
 	ListIterator step_iterator;
 	struct step_record *step_ptr;
@@ -5475,6 +5504,8 @@ static void _notify_srun_missing_step(struct job_record *job_ptr, int node_inx,
 	xassert(job_ptr);
 	step_iterator = list_iterator_create (job_ptr->step_list);
 	while ((step_ptr = (struct step_record *) list_next (step_iterator))) {
+		if (!bit_test(step_ptr->step_node_bitmap, node_inx))
+			continue;
 		if (step_ptr->time_last_active >= now) {
 			/* Back up timer in case more than one node 
 			 * registration happens at this same time.
@@ -5482,7 +5513,18 @@ static void _notify_srun_missing_step(struct job_record *job_ptr, int node_inx,
 			 * to count toward a different node's 
 			 * registration message. */
 			step_ptr->time_last_active = now - 1;
+		} else if ((step_ptr->start_time < node_boot_time) &&
+			   (step_ptr->no_kill == 0)) {
+			/* There is a risk that the job step's tasks completed
+			 * on this node before its reboot, but that should be 
+			 * very rare */
+			info("Node %s rebooted, killing missing step %u.%u", 
+			     node_name, job_ptr->job_id, step_ptr->step_id);
+			srun_step_complete(step_ptr);
+			signal_step_tasks(step_ptr, SIGKILL);
 		} else if (step_ptr->host && step_ptr->port) {
+			/* srun may be able to verify step exists on
+			 * this node using I/O sockets */
 			srun_step_missing(step_ptr, node_name);
 		}
 	}		
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 8cc32ee7a78..267230b7bf8 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -213,6 +213,7 @@ struct node_record {
 	uint16_t threads;		/* number of threads per core */
 	uint32_t real_memory;		/* MB real memory on the node */
 	uint32_t tmp_disk;		/* MB total disk in TMP_FS */
+	uint32_t up_time;		/* seconds since node boot */
 	struct config_record *config_ptr;  /* configuration spec ptr */
 	uint16_t part_cnt;		/* number of associated partitions */
 	struct part_record **part_pptr;	/* array of pointers to partitions 
diff --git a/src/slurmd/slurmd/get_mach_stat.c b/src/slurmd/slurmd/get_mach_stat.c
index c5c83f145a6..f4de42301e2 100644
--- a/src/slurmd/slurmd/get_mach_stat.c
+++ b/src/slurmd/slurmd/get_mach_stat.c
@@ -64,6 +64,16 @@
 #include <stdlib.h>
 #include <string.h>
 #include <syslog.h>
+
+#ifdef HAVE_AIX
+#  include <sys/times.h>
+#  include <sys/types.h>
+#else
+/* NOTE: Getting the system uptime on AIX uses completely different logic.
+ * sys/sysinfo.h on AIX defines structures that conflict with SLURM code. */
+#  include <sys/sysinfo.h>
+#endif
+
 #include <sys/utsname.h>
 #ifdef HAVE_SYS_VFS_H
 #  include <sys/vfs.h>
@@ -79,17 +89,17 @@
 
 static char* _cpuinfo_path = "/proc/cpuinfo";
 
-int compute_block_map(uint16_t numproc,
-			uint16_t **block_map, uint16_t **block_map_inv);
-int chk_cpuinfo_str(char *buffer, char *keyword, char **valptr);
-int chk_cpuinfo_uint32(char *buffer, char *keyword, uint32_t *val);
-int chk_cpuinfo_float(char *buffer, char *keyword, float *val);
+static int _compute_block_map(uint16_t numproc,
+			      uint16_t **block_map, uint16_t **block_map_inv);
+static int _chk_cpuinfo_str(char *buffer, char *keyword, char **valptr);
+static int _chk_cpuinfo_uint32(char *buffer, char *keyword, uint32_t *val);
 
 /* #define DEBUG_DETAIL	1 */	/* enable detailed debugging within SLURM */
 
 #if DEBUG_MODULE
 #define DEBUG_DETAIL	1
-#define debug0	printf
+#define error 	printf
+#define debug 	printf
 #define debug1	printf
 #define debug2	printf
 #define debug3	printf
@@ -110,6 +120,8 @@ main(int argc, char * argv[])
 	char node_name[MAX_SLURM_NAME];
 	float speed;
 	uint16_t testnumproc = 0;
+	uint32_t up_time = 0;
+	int days, hours, mins, secs;
 
 	if (argc > 1) {
 	    	_cpuinfo_path = argv[1];
@@ -132,6 +144,7 @@ main(int argc, char * argv[])
 	xfree(block_map_inv);	/* not used here */
 	error_code += get_memory(&this_node.real_memory);
 	error_code += get_tmp_disk(&this_node.tmp_disk, "/tmp");
+	error_code += get_up_time(&up_time);
 #ifdef USE_CPU_SPEED
 	error_code += get_speed(&speed);
 #endif
@@ -142,6 +155,12 @@ main(int argc, char * argv[])
 		this_node.sockets, this_node.cores, this_node.threads);
 	debug3("\tRealMemory=%u TmpDisk=%u Speed=%f\n",
 		this_node.real_memory, this_node.tmp_disk, speed);
+	secs  = up_time % 60;
+	mins  = (up_time / 60) % 60;
+	hours = (up_time / 3600) % 24;
+	days  = (up_time / 86400);
+	debug3("\tUpTime=%u=%u-%2.2u:%2.2u:%2.2u\n",
+	       up_time, days, hours, mins, secs);
 	if (error_code != 0) 
 		debug3("get_mach_stat error_code=%d encountered\n", error_code);
 	exit (error_code);
@@ -368,8 +387,37 @@ get_tmp_disk(uint32_t *tmp_disk, char *tmp_fs)
 	return error_code;
 }
 
+extern int get_up_time(uint32_t *up_time)
+{
+#ifdef HAVE_AIX
+	clock_t tm;
+	struct tms buf;
+
+	tm = times(&buf);
+	if (tm == (clock_t) -1) {
+		*up_time = 0;
+		return errno;
+	}
+
+	*up_time = tm / sysconf(_SC_CLK_TCK);
+#else
+	/* NOTE for Linux: The return value of times() may overflow the 
+	 * possible range of type clock_t. There is also an offset of 
+	 * 429 million seconds on some implementations. We just use the 
+	 * simpler sysinfo() function instead. */
+	struct sysinfo info;
+
+	if (sysinfo(&info) < 0) {
+		*up_time = 0;
+		return errno;
+	}
+
+	*up_time = info.uptime;
+#endif
+	return 0;
+}
 
-/* chk_cpuinfo_str
+/* _chk_cpuinfo_str
  *	check a line of cpuinfo data (buffer) for a keyword.  If it
  *	exists, return the string value for that keyword in *valptr.
  * Input:  buffer - single line of cpuinfo data
@@ -377,7 +425,7 @@ get_tmp_disk(uint32_t *tmp_disk, char *tmp_fs)
  * Output: valptr - string value corresponding to keyword
  *         return code - true if keyword found, false if not found
  */
-int chk_cpuinfo_str(char *buffer, char *keyword, char **valptr)
+static int _chk_cpuinfo_str(char *buffer, char *keyword, char **valptr)
 {
 	char *ptr;
 	if (strncmp(buffer, keyword, strlen(keyword)))
@@ -390,7 +438,7 @@ int chk_cpuinfo_str(char *buffer, char *keyword, char **valptr)
 	return true;
 }
 
-/* chk_cpuinfo_uint32
+/* _chk_cpuinfo_uint32
  *	check a line of cpuinfo data (buffer) for a keyword.  If it
  *	exists, return the uint16 value for that keyword in *valptr.
  * Input:  buffer - single line of cpuinfo data
@@ -398,18 +446,18 @@ int chk_cpuinfo_str(char *buffer, char *keyword, char **valptr)
  * Output: valptr - uint32 value corresponding to keyword
  *         return code - true if keyword found, false if not found
  */
-int chk_cpuinfo_uint32(char *buffer, char *keyword, uint32_t *val)
+static int _chk_cpuinfo_uint32(char *buffer, char *keyword, uint32_t *val)
 {
 	char *valptr;
-	if (chk_cpuinfo_str(buffer, keyword, &valptr)) {
+	if (_chk_cpuinfo_str(buffer, keyword, &valptr)) {
 		*val = strtoul(valptr, (char **)NULL, 10);
 		return true;
 	} else {
 		return false;
 	}
 }
-
-/* chk_cpuinfo_float
+#ifdef USE_CPU_SPEED
+/* _chk_cpuinfo_float
  *	check a line of cpuinfo data (buffer) for a keyword.  If it
  *	exists, return the float value for that keyword in *valptr.
  * Input:  buffer - single line of cpuinfo data
@@ -417,10 +465,10 @@ int chk_cpuinfo_uint32(char *buffer, char *keyword, uint32_t *val)
  * Output: valptr - float value corresponding to keyword
  *         return code - true if keyword found, false if not found
  */
-int chk_cpuinfo_float(char *buffer, char *keyword, float *val)
+static int _chk_cpuinfo_float(char *buffer, char *keyword, float *val)
 {
 	char *valptr;
-	if (chk_cpuinfo_str(buffer, keyword, &valptr)) {
+	if (_chk_cpuinfo_str(buffer, keyword, &valptr)) {
 		*val = (float) strtod(valptr, (char **)NULL);
 		return true;
 	} else {
@@ -428,7 +476,6 @@ int chk_cpuinfo_float(char *buffer, char *keyword, float *val)
 	}
 }
 
-#ifdef USE_CPU_SPEED
 /*
  * get_speed - Return the speed of procs on this system (MHz clock)
  * Input: procs - buffer for the CPU speed
@@ -444,12 +491,12 @@ get_speed(float *speed)
 	*speed = 1.0;
 	cpu_info_file = fopen(_cpuinfo_path, "r");
 	if (cpu_info_file == NULL) {
-		error ("get_speed: error %d opening %s\n", errno, _cpuinfo_path);
+		error("get_speed: error %d opening %s\n", errno, _cpuinfo_path);
 		return errno;
 	} 
 
 	while (fgets(buffer, sizeof(buffer), cpu_info_file) != NULL) {
-		chk_cpuinfo_float(buffer, "cpu MHz", speed);
+		_chk_cpuinfo_float(buffer, "cpu MHz", speed);
 	} 
 
 	fclose(cpu_info_file);
@@ -533,7 +580,7 @@ get_cpuinfo(uint16_t numproc,
 	curcpu = 0;
 	while (fgets(buffer, sizeof(buffer), cpu_info_file) != NULL) {
 		uint32_t val;
-		if (chk_cpuinfo_uint32(buffer, "processor", &val)) {
+		if (_chk_cpuinfo_uint32(buffer, "processor", &val)) {
 			numcpu++;
 			curcpu = val;
 		    	if (val >= numproc) {	/* out of bounds, ignore */
@@ -545,7 +592,7 @@ get_cpuinfo(uint16_t numproc,
 			cpuinfo[val].cpuid = val;
 			maxcpuid = MAX(maxcpuid, val);
 			mincpuid = MIN(mincpuid, val);
-		} else if (chk_cpuinfo_uint32(buffer, "physical id", &val)) {
+		} else if (_chk_cpuinfo_uint32(buffer, "physical id", &val)) {
 			/* see if the ID has already been seen */
 			for (i=0; i<numproc; i++) {
 				if ((cpuinfo[i].physid == val)
@@ -566,7 +613,7 @@ get_cpuinfo(uint16_t numproc,
 
 			maxphysid = MAX(maxphysid, val);
 			minphysid = MIN(minphysid, val);
-		} else if (chk_cpuinfo_uint32(buffer, "core id", &val)) {
+		} else if (_chk_cpuinfo_uint32(buffer, "core id", &val)) {
 			/* see if the ID has already been seen */
 			for (i = 0; i < numproc; i++) {
 				if ((cpuinfo[i].coreid == val)
@@ -587,7 +634,7 @@ get_cpuinfo(uint16_t numproc,
 
 			maxcoreid = MAX(maxcoreid, val);
 			mincoreid = MIN(mincoreid, val);
-		} else if (chk_cpuinfo_uint32(buffer, "siblings", &val)) {
+		} else if (_chk_cpuinfo_uint32(buffer, "siblings", &val)) {
 			/* Note: this value is a count, not an index */
 		    	if (val > numproc) {	/* out of bounds, ignore */
 				debug("siblings is %u (> %d), ignored",
@@ -598,7 +645,7 @@ get_cpuinfo(uint16_t numproc,
 				cpuinfo[curcpu].siblings = val;
 			maxsibs = MAX(maxsibs, val);
 			minsibs = MIN(minsibs, val);
-		} else if (chk_cpuinfo_uint32(buffer, "cpu cores", &val)) {
+		} else if (_chk_cpuinfo_uint32(buffer, "cpu cores", &val)) {
 			/* Note: this value is a count, not an index */
 		    	if (val > numproc) {	/* out of bounds, ignore */
 				debug("cores is %u (> %d), ignored",
@@ -695,7 +742,7 @@ get_cpuinfo(uint16_t numproc,
 #endif
 
 	*block_map_size = numcpu;
-	retval = compute_block_map(*block_map_size, block_map, block_map_inv);
+	retval = _compute_block_map(*block_map_size, block_map, block_map_inv);
 
 	xfree(cpuinfo);		/* done with raw cpuinfo data */
 
@@ -703,7 +750,7 @@ get_cpuinfo(uint16_t numproc,
 }
 
 /*
- * compute_block_map - Compute abstract->machine block mapping (and inverse)
+ * _compute_block_map - Compute abstract->machine block mapping (and inverse)
  *   allows computation of CPU ID masks for an abstract block distribution
  *   of logical processors which can then be mapped the IDs used in the
  *   actual machine processor ID ordering (which can be BIOS/OS dependendent)
@@ -764,7 +811,7 @@ static int _icmp32(uint32_t a, uint32_t b)
 	}
 }
 
-int _compare_cpus(const void *a1, const void *b1) {
+static int _compare_cpus(const void *a1, const void *b1) {
 	uint16_t *a = (uint16_t *) a1;
 	uint16_t *b = (uint16_t *) b1;
 	int cmp;
@@ -785,8 +832,8 @@ int _compare_cpus(const void *a1, const void *b1) {
 	return cmp;
 }
 
-int compute_block_map(uint16_t numproc,
-		uint16_t **block_map, uint16_t **block_map_inv)
+static int _compute_block_map(uint16_t numproc,
+			      uint16_t **block_map, uint16_t **block_map_inv)
 {
 	uint16_t i;
 	/* Compute abstract->machine block mapping (and inverse) */
diff --git a/src/slurmd/slurmd/get_mach_stat.h b/src/slurmd/slurmd/get_mach_stat.h
index 1fcb7812106..f82ba29f47c 100644
--- a/src/slurmd/slurmd/get_mach_stat.h
+++ b/src/slurmd/slurmd/get_mach_stat.h
@@ -60,6 +60,7 @@ extern int get_cpuinfo(uint16_t numproc,
 extern int get_mach_name(char *node_name);
 extern int get_memory(uint32_t *real_memory);
 extern int get_tmp_disk(uint32_t *tmp_disk, char *tmp_fs);
+extern int get_up_time(uint32_t *up_time);
 
 #ifdef USE_OS_NAME
 extern int get_os_name(char *os_name);
diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index 56473ee02c3..5410d4311f4 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -154,7 +154,7 @@ static gids_t *_gids_cache_lookup(char *user, gid_t gid);
 static List waiters;
 
 static pthread_mutex_t launch_mutex = PTHREAD_MUTEX_INITIALIZER;
-static time_t booted = 0;
+static time_t startup = 0;		/* daemon startup time */
 static time_t last_slurmctld_msg = 0;
 
 static pthread_mutex_t job_limits_mutex = PTHREAD_MUTEX_INITIALIZER;
@@ -175,8 +175,8 @@ slurmd_req(slurm_msg_t *msg)
 	int rc;
 
 	if (msg == NULL) {
-		if (booted == 0)
-			booted = time(NULL);
+		if (startup == 0)
+			startup = time(NULL);
 		if (waiters) {
 			list_destroy(waiters);
 			waiters = NULL;
@@ -1731,7 +1731,7 @@ _rpc_daemon_status(slurm_msg_t *msg)
 	resp->actual_threads     = conf->actual_threads;
 	resp->actual_real_mem    = conf->real_memory_size;
 	resp->actual_tmp_disk    = conf->tmp_disk_space;
-	resp->booted             = booted;
+	resp->booted             = startup;
 	resp->hostname           = xstrdup(conf->node_name);
 	resp->step_list          = _get_step_list();
 	resp->last_slurmctld_msg = last_slurmctld_msg;
diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c
index 77c1b8145f0..58cc1cc5bda 100644
--- a/src/slurmd/slurmd/slurmd.c
+++ b/src/slurmd/slurmd/slurmd.c
@@ -72,6 +72,7 @@
 #include "src/common/slurm_cred.h"
 #include "src/common/slurm_protocol_api.h"
 #include "src/common/parse_spec.h"
+#include "src/common/parse_time.h"
 #include "src/common/hostlist.h"
 #include "src/common/macros.h"
 #include "src/common/fd.h"
@@ -121,33 +122,37 @@ typedef struct connection {
 static sig_atomic_t _shutdown = 0;
 static sig_atomic_t _reconfig = 0;
 static pthread_t msg_pthread = (pthread_t) 0;
+static time_t sent_reg_time = (time_t) 0;
 
-static void      _term_handler(int);
+static void      _atfork_final(void);
+static void      _atfork_prepare(void);
+static void      _create_msg_socket(void);
+static void      _decrement_thd_count(void);
+static void      _destroy_conf(void);
+static void      _fill_registration_msg(slurm_node_registration_status_msg_t *);
+static void      _handle_connection(slurm_fd fd, slurm_addr *client);
 static void      _hup_handler(int);
+static void      _increment_thd_count(void);
+static void      _init_conf(void);
+static void      _install_fork_handlers(void);
+static void 	 _kill_old_slurmd(void);
+static void      _msg_engine(void);
+static void      _print_conf(void);
 static void      _process_cmdline(int ac, char **av);
-static void      _create_msg_socket();
-static void      _msg_engine();
-static int       _slurmd_init();
-static int       _slurmd_fini();
-static void      _init_conf();
-static void      _destroy_conf();
-static void      _print_conf();
-static void      _read_config();
-static void 	 _kill_old_slurmd();
-static void      _reconfigure();
+static void      _read_config(void);
+static void      _reconfigure(void);
+static void     *_registration_engine(void *arg);
 static int       _restore_cred_state(slurm_cred_ctx_t ctx);
-static void      _increment_thd_count();
-static void      _decrement_thd_count();
-static void      _wait_for_all_threads();
-static int       _set_slurmd_spooldir(void);
-static void      _usage();
-static void      _handle_connection(slurm_fd fd, slurm_addr *client);
 static void     *_service_connection(void *);
-static void      _fill_registration_msg(slurm_node_registration_status_msg_t *);
+static int       _set_slurmd_spooldir(void);
+static int       _slurmd_init(void);
+static int       _slurmd_fini(void);
+static void      _spawn_registration_engine(void);
+static void      _term_handler(int);
 static void      _update_logging(void);
-static void      _atfork_prepare(void);
-static void      _atfork_final(void);
-static void      _install_fork_handlers(void);
+static void      _usage(void);
+static void      _wait_for_all_threads(void);
+
 
 int 
 main (int argc, char *argv[])
@@ -193,11 +198,11 @@ main (int argc, char *argv[])
 		char *curr_user = NULL;
 
 		/* since when you do a getpwuid you get a pointer to a
-		   structure you have to do a xstrdup on the first
-		   call or your information will just get over
-		   written.  This is a memory leak, but a fatal is
-		   called right after so it isn't that big of a deal.
-		*/
+		 * structure you have to do a xstrdup on the first
+		 * call or your information will just get over
+		 * written.  This is a memory leak, but a fatal is
+		 * called right after so it isn't that big of a deal.
+		 */
 		if ((pw=getpwuid(slurmd_uid)))
 			slurmd_user = xstrdup(pw->pw_name);	
 		if ((pw=getpwuid(curr_uid)))
@@ -283,13 +288,11 @@ main (int argc, char *argv[])
 
 	info("%s started on %T", xbasename(argv[0]));
 
-        if (send_registration_msg(SLURM_SUCCESS, true) < 0) 
-		error("Unable to register with slurm controller");
-
 	_install_fork_handlers();
 	list_install_fork_handlers();
 	slurm_conf_install_fork_handlers();
-	
+
+	_spawn_registration_engine();
 	_msg_engine();
 
 	/*
@@ -314,9 +317,59 @@ main (int argc, char *argv[])
        	return 0;
 }
 
+static void
+_spawn_registration_engine(void)
+{
+	int            rc;
+	pthread_attr_t attr;
+	pthread_t      id;
+	int            retries = 0;
+
+	slurm_attr_init(&attr);
+	rc = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+	if (rc != 0) {
+		errno = rc;
+		fatal("Unable to set detachstate on attr: %m");
+		slurm_attr_destroy(&attr);
+		return;
+	}
+
+	while (pthread_create(&id, &attr, &_registration_engine, NULL)) {
+		error("msg_engine: pthread_create: %m");
+		if (++retries > 3)
+			fatal("msg_engine: pthread_create: %m");
+		usleep(10);	/* sleep and again */
+	}
+	
+	return;
+}
+
+/* Spawn a thread to make sure we send at least one registration message to 
+ * slurmctld. If slurmctld restarts, it will request another registration 
+ * message. */
+static void *
+_registration_engine(void *arg)
+{
+	_increment_thd_count();
+
+	while (!_shutdown) {
+		if ((sent_reg_time == (time_t) 0) &&
+        	    (send_registration_msg(SLURM_SUCCESS, true) != 
+		     SLURM_SUCCESS)) {
+			debug("Unable to register with slurm controller, "
+			      "retrying");
+		} else if (_shutdown || sent_reg_time) {
+			break;
+		}
+		sleep(1);
+	}
+
+	_decrement_thd_count();
+	return NULL;
+}
 
 static void
-_msg_engine()
+_msg_engine(void)
 {
 	slurm_fd sock;
 
@@ -350,7 +403,7 @@ static void
 _decrement_thd_count(void)
 {
 	slurm_mutex_lock(&active_mutex);
-	if(active_threads>0)
+	if (active_threads>0)
 		active_threads--;
 	pthread_cond_signal(&active_cond);
 	slurm_mutex_unlock(&active_mutex);
@@ -375,7 +428,7 @@ _increment_thd_count(void)
 }
 
 static void
-_wait_for_all_threads()
+_wait_for_all_threads(void)
 {
 	slurm_mutex_lock(&active_mutex);
 	while (active_threads > 0) {
@@ -460,10 +513,10 @@ cleanup:
 	return NULL;
 }
 
-int
+extern int
 send_registration_msg(uint32_t status, bool startup)
 {
-	int retval = SLURM_SUCCESS;
+	int ret_val = SLURM_SUCCESS;
 	slurm_msg_t req;
 	slurm_msg_t resp;
 	slurm_node_registration_status_msg_t *msg = 
@@ -481,15 +534,17 @@ send_registration_msg(uint32_t status, bool startup)
 
 	if (slurm_send_recv_controller_msg(&req, &resp) < 0) {
 		error("Unable to register: %m");
-		retval = SLURM_FAILURE;
-	} else
-		slurm_free_return_code_msg(resp.data);	
+		ret_val = SLURM_FAILURE;
+	} else {
+		sent_reg_time = time(NULL);
+		slurm_free_return_code_msg(resp.data);
+	}
 	slurm_free_node_registration_status_msg (msg);
 
 	/* XXX look at response msg
 	 */
 
-	return SLURM_SUCCESS;
+	return ret_val;
 }
 
 static void
@@ -511,17 +566,20 @@ _fill_registration_msg(slurm_node_registration_status_msg_t *msg)
 	msg->real_memory = conf->real_memory_size;
 	msg->tmp_disk    = conf->tmp_disk_space;
 
+	get_up_time(&conf->up_time);
+	msg->up_time     = conf->up_time;
+
 	if (first_msg) {
 		first_msg = false;
 		info("Procs=%u Sockets=%u Cores=%u Threads=%u "
-		     "Memory=%u TmpDisk=%u",
+		     "Memory=%u TmpDisk=%u Uptime=%u",
 		     msg->cpus, msg->sockets, msg->cores, msg->threads,
-		     msg->real_memory, msg->tmp_disk);
+		     msg->real_memory, msg->tmp_disk, msg->up_time);
 	} else {
 		debug3("Procs=%u Sockets=%u Cores=%u Threads=%u "
-		       "Memory=%u TmpDisk=%u",
+		       "Memory=%u TmpDisk=%u Uptime=%u",
 		       msg->cpus, msg->sockets, msg->cores, msg->threads,
-		       msg->real_memory, msg->tmp_disk);
+		       msg->real_memory, msg->tmp_disk, msg->up_time);
 	}
 	uname(&buf);
 	if ((arch = getenv("SLURM_ARCH")))
@@ -609,7 +667,7 @@ _massage_pathname(char **path)
  * values into the slurmd configuration in preference of the defaults.
  */
 static void
-_read_config()
+_read_config(void)
 {
         char *path_pubkey = NULL;
 	slurm_ctl_conf_t *cf = NULL;
@@ -679,6 +737,7 @@ _read_config()
 	conf->threads = conf->actual_threads;
 
 	get_memory(&conf->real_memory_size);
+	get_up_time(&conf->up_time);
 
 	cf = slurm_conf_lock();
 	get_tmp_disk(&conf->tmp_disk_space, cf->tmp_fs);
@@ -706,6 +765,7 @@ _read_config()
 		fatal("Unable to establish controller machine");
 	if (cf->slurmctld_port == 0)
 		fatal("Unable to establish controller port");
+	conf->slurmd_timeout = cf->slurmd_timeout;
 	conf->use_pam = cf->use_pam;
 	conf->task_plugin_param = cf->task_plugin_param;
 
@@ -742,10 +802,10 @@ _reconfigure(void)
 }
 
 static void
-_print_conf()
+_print_conf(void)
 {
 	slurm_ctl_conf_t *cf;
-	char *str;
+	char *str, time_str[32];
 	int i;
 
 	cf = slurm_conf_lock();
@@ -768,6 +828,10 @@ _print_conf()
 	       conf->threads,
 	       conf->conf_threads,
 	       conf->actual_threads);
+
+	secs2time_str((time_t)conf->up_time, time_str, sizeof(time_str));
+	debug3("UpTime      = %u = %s", conf->up_time, time_str);
+
 	str = xmalloc(conf->block_map_size*5);
 	str[0] = '\0';
 	for (i = 0; i < conf->block_map_size; i++) {
@@ -811,7 +875,7 @@ _print_conf()
 /* Initialize slurmd configuration table.
  * Everything is already NULL/zero filled when called */
 static void
-_init_conf()
+_init_conf(void)
 {
 	char  host[MAXHOSTNAMELEN];
 	log_options_t lopts = LOG_OPTS_INITIALIZER;
@@ -833,7 +897,7 @@ _init_conf()
 }
 
 static void
-_destroy_conf()
+_destroy_conf(void)
 {
 	if(conf) {
 		xfree(conf->block_map);
@@ -902,7 +966,7 @@ _process_cmdline(int ac, char **av)
 			exit(0);
 			break;
 		default:
-			_usage(c);
+			_usage();
 			exit(1);
 			break;
 		}
@@ -911,7 +975,7 @@ _process_cmdline(int ac, char **av)
 
 
 static void
-_create_msg_socket()
+_create_msg_socket(void)
 {
 	char* node_addr;
 
@@ -940,7 +1004,7 @@ _create_msg_socket()
 
 
 static int
-_slurmd_init()
+_slurmd_init(void)
 {
 	struct rlimit rlim;
 	slurm_ctl_conf_t *cf;
@@ -1097,7 +1161,7 @@ cleanup:
  * All allocated memory should be freed
 \**************************************************************************/
 static int
-_slurmd_fini()
+_slurmd_fini(void)
 {
 	save_cred_state(conf->vctx);
 	int slurm_proctrack_init();
diff --git a/src/slurmd/slurmd/slurmd.h b/src/slurmd/slurmd/slurmd.h
index 42680fc7eb4..f4a89129b71 100644
--- a/src/slurmd/slurmd/slurmd.h
+++ b/src/slurmd/slurmd/slurmd.h
@@ -90,6 +90,7 @@ typedef struct slurmd_config {
 	uint16_t     actual_threads;    /* actual thread per core count    */
 	uint32_t     real_memory_size;  /* amount of real memory	   */
 	uint32_t     tmp_disk_space;    /* size of temporary disk	   */
+	uint32_t     up_time;		/* seconds since last boot time    */
 	uint16_t     block_map_size;	/* size of block map               */
 	uint16_t     *block_map;	/* abstract->machine block map     */
 	uint16_t     *block_map_inv;	/* machine->abstract (inverse) map */
@@ -121,6 +122,7 @@ typedef struct slurmd_config {
 
 	slurm_cred_ctx_t vctx;          /* slurm_cred_t verifier context   */
 
+	uint16_t	slurmd_timeout;	/* SlurmdTimeout                   */
 	uid_t           slurm_user_id;	/* UID that slurmctld runs as      */
 	pthread_mutex_t config_mutex;	/* lock for slurmd_config access   */
 	uint16_t        job_acct_gather_freq;
-- 
GitLab