From b9cb202d67dac2a174d6a41fd1880debf4fdc7d1 Mon Sep 17 00:00:00 2001
From: Danny Auble <da@llnl.gov>
Date: Wed, 23 Jan 2008 22:38:30 +0000
Subject: [PATCH] svn merge -r13067:13077 
 https://eris.llnl.gov/svn/slurm/branches/slurm-1.2

---
 NEWS                                          |   3 +
 slurm.spec                                    |   5 +
 .../gold/jobacct_storage_gold.c               | 328 +-----------------
 .../gold/nodeacct_storage_gold.c              |  70 ++--
 src/slurmctld/controller.c                    |  30 ++
 5 files changed, 82 insertions(+), 354 deletions(-)

diff --git a/NEWS b/NEWS
index 57c3b63fc04..ba41777edc6 100644
--- a/NEWS
+++ b/NEWS
@@ -159,6 +159,9 @@ documents those changes that are of interest to users and admins.
  -- Add support for node UP/DOWN event logging in jobacct/gold plugin
     WARNING: using the jobacct/gold plugin slows the system startup set the
     MessageTimeout variable in the slurm.conf to around 20+.
+ -- Added check at start of slurmctld for /tmp/slurm_gold_first. If the file
+    exists and the gold plugin is in use, slurm will make a record of all
+    nodes in downed or drained state.
 
 * Changes in SLURM 1.2.21
 =========================
diff --git a/slurm.spec b/slurm.spec
index 7a2b8749164..55778a9c280 100644
--- a/slurm.spec
+++ b/slurm.spec
@@ -376,6 +376,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_libdir}/slurm/jobacct_gather_linux.so
 %{_libdir}/slurm/jobacct_gather_none.so
 %{_libdir}/slurm/jobacct_storage_filetxt.so
+%{_libdir}/slurm/jobacct_storage_gold.so
 %{_libdir}/slurm/jobacct_storage_mysql.so
 %{_libdir}/slurm/jobacct_storage_none.so
 %{_libdir}/slurm/jobacct_storage_pgsql.so
@@ -384,6 +385,10 @@ rm -rf $RPM_BUILD_ROOT
 %{_libdir}/slurm/jobcomp_mysql.so
 %{_libdir}/slurm/jobcomp_pgsql.so
 %{_libdir}/slurm/jobcomp_script.so
+%{_libdir}/slurm/nodeacct_storage_gold.so
+%{_libdir}/slurm/nodeacct_storage_mysql.so
+%{_libdir}/slurm/nodeacct_storage_none.so
+%{_libdir}/slurm/nodeacct_storage_pgsql.so
 %{_libdir}/slurm/proctrack_pgid.so
 %{_libdir}/slurm/proctrack_linuxproc.so
 %{_libdir}/slurm/sched_backfill.so
diff --git a/src/plugins/jobacct_storage/gold/jobacct_storage_gold.c b/src/plugins/jobacct_storage/gold/jobacct_storage_gold.c
index 64b7335f51a..e81d2e30945 100644
--- a/src/plugins/jobacct_storage/gold/jobacct_storage_gold.c
+++ b/src/plugins/jobacct_storage/gold/jobacct_storage_gold.c
@@ -420,12 +420,12 @@ extern int jobacct_storage_p_init(char *gold_info)
 	return SLURM_SUCCESS;
 }
 
-int jobacct_storage_p_fini()
+extern int jobacct_storage_p_fini()
 {
 	return SLURM_SUCCESS;
 }
 
-int jobacct_storage_p_job_start(struct job_record *job_ptr)
+extern int jobacct_storage_p_job_start(struct job_record *job_ptr)
 {
 	gold_object_t action = GOLD_ACTION_CREATE;
 	
@@ -439,7 +439,7 @@ int jobacct_storage_p_job_start(struct job_record *job_ptr)
 	return _add_edit_job(job_ptr, action);
 }
 
-int jobacct_storage_p_job_complete(struct job_record *job_ptr) 
+extern int jobacct_storage_p_job_complete(struct job_record *job_ptr) 
 {
 	gold_object_t action = GOLD_ACTION_MODIFY;
 	
@@ -452,7 +452,7 @@ int jobacct_storage_p_job_complete(struct job_record *job_ptr)
 	return _add_edit_job(job_ptr, action);
 }
 
-int jobacct_storage_p_step_start(struct step_record *step)
+extern int jobacct_storage_p_step_start(struct step_record *step)
 {
 	gold_object_t action = GOLD_ACTION_MODIFY;
 	
@@ -467,12 +467,12 @@ int jobacct_storage_p_step_start(struct step_record *step)
 
 }
 
-int jobacct_storage_p_step_complete(struct step_record *step)
+extern int jobacct_storage_p_step_complete(struct step_record *step)
 {
 	return SLURM_SUCCESS;	
 }
 
-int jobacct_storage_p_suspend(struct job_record *job_ptr)
+extern int jobacct_storage_p_suspend(struct job_record *job_ptr)
 {
 	return SLURM_SUCCESS;
 }
@@ -482,7 +482,7 @@ int jobacct_storage_p_suspend(struct job_record *job_ptr)
  * returns List of job_rec_t *
  * note List needs to be freed when called
  */
-int jobacct_storage_p_get_jobs(List job_list,
+extern int jobacct_storage_p_get_jobs(List job_list,
 			       List selected_steps,
 			       List selected_parts,
 			       void *params)
@@ -502,317 +502,3 @@ extern void jobacct_storage_p_archive(List selected_parts,
 	
 	return;
 }
-
-int jobacct_p_endpoll()
-{
-	return SLURM_SUCCESS;
-}
-
-int jobacct_p_set_proctrack_container_id(uint32_t id)
-{
-	return SLURM_SUCCESS;
-}
-
-int jobacct_p_add_task(pid_t pid, jobacct_id_t *jobacct_id)
-{
-	return SLURM_SUCCESS;
-}
-
-struct jobacctinfo *jobacct_p_stat_task(pid_t pid)
-{
-	return NULL;
-}
-
-struct jobacctinfo *jobacct_p_remove_task(pid_t pid)
-{
-	return NULL;
-}
-
-void jobacct_p_suspend_poll()
-{
-	return;
-}
-
-void jobacct_p_resume_poll()
-{
-	return;
-}
-
-#define _DEBUG 0
-
-extern int jobacct_p_node_down(struct node_record *node_ptr, time_t event_time,
-			       char *reason)
-{
-	uint16_t cpus;
-	int rc = SLURM_ERROR;
-	gold_request_t *gold_request = NULL;
-	gold_response_t *gold_response = NULL;
-	char tmp_buff[50];
-
-	if (slurmctld_conf.fast_schedule)
-		cpus = node_ptr->config_ptr->cpus;
-	else
-		cpus = node_ptr->cpus;
-
-#if _DEBUG
-	slurm_make_time_str(&event_time, tmp_buff, sizeof(tmp_buff));
-	info("Node_acct_down: %s at %s with %u cpus due to %s", 
-	     node_ptr->name, tmp_buff, cpus, node_ptr->reason);
-#endif
-	/* If the node was already down end that record since the
-	 * reason will most likely be different
-	 */
-
-	gold_request = create_gold_request(GOLD_OBJECT_EVENT,
-					   GOLD_ACTION_MODIFY);
-	if(!gold_request) 
-		return rc;
-	
-	gold_request_add_condition(gold_request, "Machine", cluster_name,
-				   GOLD_OPERATOR_NONE);
-	gold_request_add_condition(gold_request, "EndTime", "0",
-				   GOLD_OPERATOR_NONE);
-	gold_request_add_condition(gold_request, "Name", node_ptr->name,
-				   GOLD_OPERATOR_NONE);
-
-	snprintf(tmp_buff, sizeof(tmp_buff), "%d", ((int)event_time - 1));
-	gold_request_add_assignment(gold_request, "EndTime", tmp_buff);		
-			
-	gold_response = get_gold_response(gold_request);	
-	destroy_gold_request(gold_request);
-
-	if(!gold_response) {
-		error("jobacct_p_cluster_procs: no response received");
-		return rc;
-	}
-
-	if(gold_response->rc) {
-		error("gold_response has non-zero rc(%d): %s",
-		      gold_response->rc,
-		      gold_response->message);
-		destroy_gold_response(gold_response);
-		return rc;
-	}
-	destroy_gold_response(gold_response);
-
-	/* now add the new one */
-	gold_request = create_gold_request(GOLD_OBJECT_EVENT,
-					   GOLD_ACTION_CREATE);
-	if(!gold_request) 
-		return rc;
-	
-	gold_request_add_assignment(gold_request, "Machine", cluster_name);
-	snprintf(tmp_buff, sizeof(tmp_buff), "%d", (int)event_time);
-	gold_request_add_assignment(gold_request, "StartTime", tmp_buff);
-	gold_request_add_assignment(gold_request, "Name", node_ptr->name);
-	snprintf(tmp_buff, sizeof(tmp_buff), "%u", node_ptr->cpus);
-	gold_request_add_assignment(gold_request, "CPUCount", tmp_buff);
-	if(reason)
-		gold_request_add_assignment(gold_request, "Reason", reason);
-	else	
-		gold_request_add_assignment(gold_request, "Reason", 
-					    node_ptr->reason);
-			
-	gold_response = get_gold_response(gold_request);	
-	destroy_gold_request(gold_request);
-
-	if(!gold_response) {
-		error("jobacct_p_cluster_procs: no response received");
-		return rc;
-	}
-
-	if(!gold_response->rc) 
-		rc = SLURM_SUCCESS;
-	else {
-		error("gold_response has non-zero rc(%d): %s",
-		      gold_response->rc,
-		      gold_response->message);
-	}
-	destroy_gold_response(gold_response);
-
-	return rc;
-}
-
-extern int jobacct_p_node_up(struct node_record *node_ptr, time_t event_time)
-{
-	int rc = SLURM_ERROR;
-	gold_request_t *gold_request = NULL;
-	gold_response_t *gold_response = NULL;
-	char tmp_buff[50];
-
-#if _DEBUG
-	slurm_make_time_str(&event_time, tmp_buff, sizeof(tmp_buff));
-	info("Node_acct_up: %s at %s", node_ptr->name, tmp_buff);
-#endif
-	/* FIXME: WRITE TO DATABASE HERE */
-
-	gold_request = create_gold_request(GOLD_OBJECT_EVENT,
-					   GOLD_ACTION_MODIFY);
-	if(!gold_request) 
-		return rc;
-	
-	gold_request_add_condition(gold_request, "Machine", cluster_name,
-				   GOLD_OPERATOR_NONE);
-	gold_request_add_condition(gold_request, "EndTime", "0",
-				   GOLD_OPERATOR_NONE);
-	gold_request_add_condition(gold_request, "Name", node_ptr->name,
-				   GOLD_OPERATOR_NONE);
-
-	snprintf(tmp_buff, sizeof(tmp_buff), "%d", ((int)event_time - 1));
-	gold_request_add_assignment(gold_request, "EndTime", tmp_buff);		
-			
-	gold_response = get_gold_response(gold_request);	
-	destroy_gold_request(gold_request);
-
-	if(!gold_response) {
-		error("jobacct_p_node_up: no response received");
-		return rc;
-	}
-
-	if(gold_response->rc) {
-		error("gold_response has non-zero rc(%d): %s",
-		      gold_response->rc,
-		      gold_response->message);
-		destroy_gold_response(gold_response);
-		return rc;
-	}
-	destroy_gold_response(gold_response);
-
-
-	return rc;
-}
-
-extern int jobacct_p_cluster_procs(uint32_t procs, time_t event_time)
-{
-	static uint32_t last_procs = -1;
-	gold_request_t *gold_request = NULL;
-	gold_response_t *gold_response = NULL;
-	char tmp_buff[50];
-	int rc = SLURM_ERROR;
-
-	if (procs == last_procs) {
-		debug3("we have the same procs as before no need to "
-		       "query the database.");
-		return SLURM_SUCCESS;
-	}
-	last_procs = procs;
-
-	/* Record the processor count */
-#if _DEBUG
-	slurm_make_time_str(&event_time, tmp_buff, sizeof(tmp_buff));
-	info("Node_acct_procs: %s has %u total CPUs at %s", 
-	     cluster_name, procs, tmp_buff);
-#endif
-	
-	/* get the last known one */
-	gold_request = create_gold_request(GOLD_OBJECT_EVENT,
-					   GOLD_ACTION_QUERY);
-	if(!gold_request) 
-		return rc;
-	gold_request_add_condition(gold_request, "Machine", cluster_name,
-				   GOLD_OPERATOR_NONE);
-	gold_request_add_condition(gold_request, "EndTime", "0",
-				   GOLD_OPERATOR_NONE);
-	gold_request_add_condition(gold_request, "Name", "NULL",
-				   GOLD_OPERATOR_NONE);
-
-	gold_request_add_selection(gold_request, "CPUCount");
-		
-	gold_response = get_gold_response(gold_request);	
-	destroy_gold_request(gold_request);
-
-	if(!gold_response) {
-		error("jobacct_p_cluster_procs: no response received");
-		return rc;
-	}
-
-	if(gold_response->entry_cnt > 0) {
-		gold_response_entry_t *resp_entry = 
-			list_pop(gold_response->entries);
-		gold_name_value_t *name_val = list_pop(resp_entry->name_val);
-
-		if(procs == atoi(name_val->value)) {
-			debug("System hasn't changed since last entry");
-			destroy_gold_name_value(name_val);
-			destroy_gold_response_entry(resp_entry);
-			destroy_gold_response(gold_response);
-			return SLURM_SUCCESS;
-		} else {
-			debug("System has changed from %s cpus to %d",
-			      name_val->value, procs);   
-		}
-
-		destroy_gold_name_value(name_val);
-		destroy_gold_response_entry(resp_entry);
-	} else {
-		debug("We don't have an entry for this machine "
-		      "most likely a first time running.");
-	}
-
-	destroy_gold_response(gold_response);
-	
-
-
-	gold_request = create_gold_request(GOLD_OBJECT_EVENT,
-					   GOLD_ACTION_MODIFY);
-	if(!gold_request) 
-		return rc;
-	
-	gold_request_add_condition(gold_request, "Machine", cluster_name,
-				   GOLD_OPERATOR_NONE);
-	gold_request_add_condition(gold_request, "EndTime", "0",
-				   GOLD_OPERATOR_NONE);
-	gold_request_add_condition(gold_request, "Name", "NULL",
-				   GOLD_OPERATOR_NONE);
-
-	snprintf(tmp_buff, sizeof(tmp_buff), "%d", ((int)event_time - 1));
-	gold_request_add_assignment(gold_request, "EndTime", tmp_buff);		
-			
-	gold_response = get_gold_response(gold_request);	
-	destroy_gold_request(gold_request);
-
-	if(!gold_response) {
-		error("jobacct_p_cluster_procs: no response received");
-		return rc;
-	}
-
-	if(gold_response->rc) {
-		error("gold_response has non-zero rc(%d): %s",
-		      gold_response->rc,
-		      gold_response->message);
-		destroy_gold_response(gold_response);
-		return rc;
-	}
-	destroy_gold_response(gold_response);
-
-	/* now add the new one */
-	gold_request = create_gold_request(GOLD_OBJECT_EVENT,
-					   GOLD_ACTION_CREATE);
-	if(!gold_request) 
-		return rc;
-	
-	gold_request_add_assignment(gold_request, "Machine", cluster_name);
-	snprintf(tmp_buff, sizeof(tmp_buff), "%d", (int)event_time);
-	gold_request_add_assignment(gold_request, "StartTime", tmp_buff);
-	snprintf(tmp_buff, sizeof(tmp_buff), "%u", procs);
-	gold_request_add_assignment(gold_request, "CPUCount", tmp_buff);
-			
-	gold_response = get_gold_response(gold_request);	
-	destroy_gold_request(gold_request);
-
-	if(!gold_response) {
-		error("jobacct_p_cluster_procs: no response received");
-		return rc;
-	}
-
-	if(!gold_response->rc) 
-		rc = SLURM_SUCCESS;
-	else {
-		error("gold_response has non-zero rc(%d): %s",
-		      gold_response->rc,
-		      gold_response->message);
-	}
-	destroy_gold_response(gold_response);
-
-	return rc;
-}
diff --git a/src/plugins/nodeacct_storage/gold/nodeacct_storage_gold.c b/src/plugins/nodeacct_storage/gold/nodeacct_storage_gold.c
index fd5ae9d8045..a6123110095 100644
--- a/src/plugins/nodeacct_storage/gold/nodeacct_storage_gold.c
+++ b/src/plugins/nodeacct_storage/gold/nodeacct_storage_gold.c
@@ -178,7 +178,7 @@ extern int nodeacct_storage_p_node_down(struct node_record *node_ptr,
 	destroy_gold_request(gold_request);
 
 	if(!gold_response) {
-		error("nodeacct_p_cluster_procs: no response received");
+		error("nodeacct_storage_p_node_down: no response received");
 		return rc;
 	}
 
@@ -213,7 +213,7 @@ extern int nodeacct_storage_p_node_down(struct node_record *node_ptr,
 	destroy_gold_request(gold_request);
 
 	if(!gold_response) {
-		error("nodeacct_p_cluster_procs: no response received");
+		error("nodeacct_storage_p_node_down: no response received");
 		return rc;
 	}
 
@@ -286,6 +286,7 @@ extern int nodeacct_storage_p_cluster_procs(uint32_t procs, time_t event_time)
 	gold_response_t *gold_response = NULL;
 	char tmp_buff[50];
 	int rc = SLURM_ERROR;
+	bool no_modify = 0;
 
 	if (procs == last_procs) {
 		debug3("we have the same procs as before no need to "
@@ -344,43 +345,46 @@ extern int nodeacct_storage_p_cluster_procs(uint32_t procs, time_t event_time)
 	} else {
 		debug("We don't have an entry for this machine "
 		      "most likely a first time running.");
+		no_modify = 1;
 	}
 
 	destroy_gold_response(gold_response);
 	
-
-
-	gold_request = create_gold_request(GOLD_OBJECT_EVENT,
-					   GOLD_ACTION_MODIFY);
-	if(!gold_request) 
-		return rc;
-	
-	gold_request_add_condition(gold_request, "Machine", cluster_name,
-				   GOLD_OPERATOR_NONE);
-	gold_request_add_condition(gold_request, "EndTime", "0",
-				   GOLD_OPERATOR_NONE);
-	gold_request_add_condition(gold_request, "Name", "NULL",
-				   GOLD_OPERATOR_NONE);
-
-	snprintf(tmp_buff, sizeof(tmp_buff), "%d", ((int)event_time - 1));
-	gold_request_add_assignment(gold_request, "EndTime", tmp_buff);		
-			
-	gold_response = get_gold_response(gold_request);	
-	destroy_gold_request(gold_request);
-
-	if(!gold_response) {
-		error("nodeacct_p_cluster_procs: no response received");
-		return rc;
-	}
-
-	if(gold_response->rc) {
-		error("gold_response has non-zero rc(%d): %s",
-		      gold_response->rc,
-		      gold_response->message);
+	/* Close out the previous cluster record only when the query above
+	 * actually found one; on a first run there is nothing to modify. */
+	if(!no_modify) {
+		gold_request = create_gold_request(GOLD_OBJECT_EVENT,
+						   GOLD_ACTION_MODIFY);
+		if(!gold_request)
+			return rc;
+		gold_request_add_condition(gold_request, "Machine",
+					   cluster_name,
+					   GOLD_OPERATOR_NONE);
+		gold_request_add_condition(gold_request, "EndTime", "0",
+					   GOLD_OPERATOR_NONE);
+		gold_request_add_condition(gold_request, "Name", "NULL",
+					   GOLD_OPERATOR_NONE);
+
+		snprintf(tmp_buff, sizeof(tmp_buff), "%d",
+			 ((int)event_time - 1));
+		gold_request_add_assignment(gold_request, "EndTime", tmp_buff);
+		gold_response = get_gold_response(gold_request);
+		destroy_gold_request(gold_request);
+
+		if(!gold_response) {
+			error("nodeacct_storage_p_cluster_procs: "
+			      "no response received");
+			return rc;
+		}
+		if(gold_response->rc) {
+			error("gold_response has non-zero rc(%d): %s",
+			      gold_response->rc,
+			      gold_response->message);
+			destroy_gold_response(gold_response);
+			return rc;
+		}
 		destroy_gold_response(gold_response);
-		return rc;
 	}
-	destroy_gold_response(gold_response);
 
 	/* now add the new one */
 	gold_request = create_gold_request(GOLD_OBJECT_EVENT,
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index d688adce23f..24e8696c889 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -324,9 +324,39 @@ int main(int argc, char *argv[])
 					slurmctld_conf.slurm_conf,
 					slurm_strerror(error_code));
 			}
+			
 			if (recover == 0)
 				_gold_mark_all_nodes_down("cold-start",
 							  time(NULL));
+			else if (!stat("/tmp/slurm_gold_first", &stat_buf)) {
+				/* The marker file flags the first start
+				 * with gold: report every named node that
+				 * has a reason string set, then remove it.
+				 */
+				struct node_record *node_ptr =
+					node_record_table_ptr;
+				int i=0;
+				time_t event_time = time(NULL);
+				debug("found /tmp/slurm_gold_first, "
+				      "setting nodes down");
+				for (i = 0;
+				     i < node_record_count;
+				     i++, node_ptr++) {
+					if (!node_ptr->name || !*node_ptr->name
+					    || !node_ptr->reason)
+						continue;
+
+					if(jobacct_g_node_down(
+						   node_ptr,
+						   event_time,
+						   node_ptr->reason)
+					   == SLURM_ERROR)
+						break;
+				}
+				if(unlink("/tmp/slurm_gold_first") < 0)
+					error("Error deleting "
+					      "/tmp/slurm_gold_first");
+			}
 		} else {
 			error("this host (%s) not valid controller (%s or %s)",
 				node_name, slurmctld_conf.control_machine,
-- 
GitLab