From b9cb202d67dac2a174d6a41fd1880debf4fdc7d1 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@llnl.gov> Date: Wed, 23 Jan 2008 22:38:30 +0000 Subject: [PATCH] svn merge -r13067:13077 https://eris.llnl.gov/svn/slurm/branches/slurm-1.2 --- NEWS | 3 + slurm.spec | 5 + .../gold/jobacct_storage_gold.c | 328 +----------------- .../gold/nodeacct_storage_gold.c | 70 ++-- src/slurmctld/controller.c | 30 ++ 5 files changed, 82 insertions(+), 354 deletions(-) diff --git a/NEWS b/NEWS index 57c3b63fc04..ba41777edc6 100644 --- a/NEWS +++ b/NEWS @@ -159,6 +159,9 @@ documents those changes that are of interest to users and admins. -- Add support for node UP/DOWN event logging in jobacct/gold plugin WARNING: using the jobacct/gold plugin slows the system startup set the MessageTimeout variable in the slurm.conf to around 20+. + -- Added check at start of slurmctld to look for /tmp/slurm_gold_first; if + it is there and the gold plugin is in use, slurm will make a record of + all nodes in a downed or drained state. * Changes in SLURM 1.2.21 ========================= diff --git a/slurm.spec b/slurm.spec index 7a2b8749164..55778a9c280 100644 --- a/slurm.spec +++ b/slurm.spec @@ -376,6 +376,7 @@ rm -rf $RPM_BUILD_ROOT %{_libdir}/slurm/jobacct_gather_linux.so %{_libdir}/slurm/jobacct_gather_none.so %{_libdir}/slurm/jobacct_storage_filetxt.so +%{_libdir}/slurm/jobacct_storage_gold.so %{_libdir}/slurm/jobacct_storage_mysql.so %{_libdir}/slurm/jobacct_storage_none.so %{_libdir}/slurm/jobacct_storage_pgsql.so @@ -384,6 +385,10 @@ rm -rf $RPM_BUILD_ROOT %{_libdir}/slurm/jobcomp_mysql.so %{_libdir}/slurm/jobcomp_pgsql.so %{_libdir}/slurm/jobcomp_script.so +%{_libdir}/slurm/nodeacct_storage_gold.so +%{_libdir}/slurm/nodeacct_storage_mysql.so +%{_libdir}/slurm/nodeacct_storage_none.so +%{_libdir}/slurm/nodeacct_storage_pgsql.so %{_libdir}/slurm/proctrack_pgid.so %{_libdir}/slurm/proctrack_linuxproc.so %{_libdir}/slurm/sched_backfill.so diff --git a/src/plugins/jobacct_storage/gold/jobacct_storage_gold.c 
b/src/plugins/jobacct_storage/gold/jobacct_storage_gold.c index 64b7335f51a..e81d2e30945 100644 --- a/src/plugins/jobacct_storage/gold/jobacct_storage_gold.c +++ b/src/plugins/jobacct_storage/gold/jobacct_storage_gold.c @@ -420,12 +420,12 @@ extern int jobacct_storage_p_init(char *gold_info) return SLURM_SUCCESS; } -int jobacct_storage_p_fini() +extern int jobacct_storage_p_fini() { return SLURM_SUCCESS; } -int jobacct_storage_p_job_start(struct job_record *job_ptr) +extern int jobacct_storage_p_job_start(struct job_record *job_ptr) { gold_object_t action = GOLD_ACTION_CREATE; @@ -439,7 +439,7 @@ int jobacct_storage_p_job_start(struct job_record *job_ptr) return _add_edit_job(job_ptr, action); } -int jobacct_storage_p_job_complete(struct job_record *job_ptr) +extern int jobacct_storage_p_job_complete(struct job_record *job_ptr) { gold_object_t action = GOLD_ACTION_MODIFY; @@ -452,7 +452,7 @@ int jobacct_storage_p_job_complete(struct job_record *job_ptr) return _add_edit_job(job_ptr, action); } -int jobacct_storage_p_step_start(struct step_record *step) +extern int jobacct_storage_p_step_start(struct step_record *step) { gold_object_t action = GOLD_ACTION_MODIFY; @@ -467,12 +467,12 @@ int jobacct_storage_p_step_start(struct step_record *step) } -int jobacct_storage_p_step_complete(struct step_record *step) +extern int jobacct_storage_p_step_complete(struct step_record *step) { return SLURM_SUCCESS; } -int jobacct_storage_p_suspend(struct job_record *job_ptr) +extern int jobacct_storage_p_suspend(struct job_record *job_ptr) { return SLURM_SUCCESS; } @@ -482,7 +482,7 @@ int jobacct_storage_p_suspend(struct job_record *job_ptr) * returns List of job_rec_t * * note List needs to be freed when called */ -int jobacct_storage_p_get_jobs(List job_list, +extern int jobacct_storage_p_get_jobs(List job_list, List selected_steps, List selected_parts, void *params) @@ -502,317 +502,3 @@ extern void jobacct_storage_p_archive(List selected_parts, return; } - -int 
jobacct_p_endpoll() -{ - return SLURM_SUCCESS; -} - -int jobacct_p_set_proctrack_container_id(uint32_t id) -{ - return SLURM_SUCCESS; -} - -int jobacct_p_add_task(pid_t pid, jobacct_id_t *jobacct_id) -{ - return SLURM_SUCCESS; -} - -struct jobacctinfo *jobacct_p_stat_task(pid_t pid) -{ - return NULL; -} - -struct jobacctinfo *jobacct_p_remove_task(pid_t pid) -{ - return NULL; -} - -void jobacct_p_suspend_poll() -{ - return; -} - -void jobacct_p_resume_poll() -{ - return; -} - -#define _DEBUG 0 - -extern int jobacct_p_node_down(struct node_record *node_ptr, time_t event_time, - char *reason) -{ - uint16_t cpus; - int rc = SLURM_ERROR; - gold_request_t *gold_request = NULL; - gold_response_t *gold_response = NULL; - char tmp_buff[50]; - - if (slurmctld_conf.fast_schedule) - cpus = node_ptr->config_ptr->cpus; - else - cpus = node_ptr->cpus; - -#if _DEBUG - slurm_make_time_str(&event_time, tmp_buff, sizeof(tmp_buff)); - info("Node_acct_down: %s at %s with %u cpus due to %s", - node_ptr->name, tmp_buff, cpus, node_ptr->reason); -#endif - /* If the node was already down end that record since the - * reason will most likely be different - */ - - gold_request = create_gold_request(GOLD_OBJECT_EVENT, - GOLD_ACTION_MODIFY); - if(!gold_request) - return rc; - - gold_request_add_condition(gold_request, "Machine", cluster_name, - GOLD_OPERATOR_NONE); - gold_request_add_condition(gold_request, "EndTime", "0", - GOLD_OPERATOR_NONE); - gold_request_add_condition(gold_request, "Name", node_ptr->name, - GOLD_OPERATOR_NONE); - - snprintf(tmp_buff, sizeof(tmp_buff), "%d", ((int)event_time - 1)); - gold_request_add_assignment(gold_request, "EndTime", tmp_buff); - - gold_response = get_gold_response(gold_request); - destroy_gold_request(gold_request); - - if(!gold_response) { - error("jobacct_p_cluster_procs: no response received"); - return rc; - } - - if(gold_response->rc) { - error("gold_response has non-zero rc(%d): %s", - gold_response->rc, - gold_response->message); - 
destroy_gold_response(gold_response); - return rc; - } - destroy_gold_response(gold_response); - - /* now add the new one */ - gold_request = create_gold_request(GOLD_OBJECT_EVENT, - GOLD_ACTION_CREATE); - if(!gold_request) - return rc; - - gold_request_add_assignment(gold_request, "Machine", cluster_name); - snprintf(tmp_buff, sizeof(tmp_buff), "%d", (int)event_time); - gold_request_add_assignment(gold_request, "StartTime", tmp_buff); - gold_request_add_assignment(gold_request, "Name", node_ptr->name); - snprintf(tmp_buff, sizeof(tmp_buff), "%u", node_ptr->cpus); - gold_request_add_assignment(gold_request, "CPUCount", tmp_buff); - if(reason) - gold_request_add_assignment(gold_request, "Reason", reason); - else - gold_request_add_assignment(gold_request, "Reason", - node_ptr->reason); - - gold_response = get_gold_response(gold_request); - destroy_gold_request(gold_request); - - if(!gold_response) { - error("jobacct_p_cluster_procs: no response received"); - return rc; - } - - if(!gold_response->rc) - rc = SLURM_SUCCESS; - else { - error("gold_response has non-zero rc(%d): %s", - gold_response->rc, - gold_response->message); - } - destroy_gold_response(gold_response); - - return rc; -} - -extern int jobacct_p_node_up(struct node_record *node_ptr, time_t event_time) -{ - int rc = SLURM_ERROR; - gold_request_t *gold_request = NULL; - gold_response_t *gold_response = NULL; - char tmp_buff[50]; - -#if _DEBUG - slurm_make_time_str(&event_time, tmp_buff, sizeof(tmp_buff)); - info("Node_acct_up: %s at %s", node_ptr->name, tmp_buff); -#endif - /* FIXME: WRITE TO DATABASE HERE */ - - gold_request = create_gold_request(GOLD_OBJECT_EVENT, - GOLD_ACTION_MODIFY); - if(!gold_request) - return rc; - - gold_request_add_condition(gold_request, "Machine", cluster_name, - GOLD_OPERATOR_NONE); - gold_request_add_condition(gold_request, "EndTime", "0", - GOLD_OPERATOR_NONE); - gold_request_add_condition(gold_request, "Name", node_ptr->name, - GOLD_OPERATOR_NONE); - - snprintf(tmp_buff, 
sizeof(tmp_buff), "%d", ((int)event_time - 1)); - gold_request_add_assignment(gold_request, "EndTime", tmp_buff); - - gold_response = get_gold_response(gold_request); - destroy_gold_request(gold_request); - - if(!gold_response) { - error("jobacct_p_node_up: no response received"); - return rc; - } - - if(gold_response->rc) { - error("gold_response has non-zero rc(%d): %s", - gold_response->rc, - gold_response->message); - destroy_gold_response(gold_response); - return rc; - } - destroy_gold_response(gold_response); - - - return rc; -} - -extern int jobacct_p_cluster_procs(uint32_t procs, time_t event_time) -{ - static uint32_t last_procs = -1; - gold_request_t *gold_request = NULL; - gold_response_t *gold_response = NULL; - char tmp_buff[50]; - int rc = SLURM_ERROR; - - if (procs == last_procs) { - debug3("we have the same procs as before no need to " - "query the database."); - return SLURM_SUCCESS; - } - last_procs = procs; - - /* Record the processor count */ -#if _DEBUG - slurm_make_time_str(&event_time, tmp_buff, sizeof(tmp_buff)); - info("Node_acct_procs: %s has %u total CPUs at %s", - cluster_name, procs, tmp_buff); -#endif - - /* get the last known one */ - gold_request = create_gold_request(GOLD_OBJECT_EVENT, - GOLD_ACTION_QUERY); - if(!gold_request) - return rc; - gold_request_add_condition(gold_request, "Machine", cluster_name, - GOLD_OPERATOR_NONE); - gold_request_add_condition(gold_request, "EndTime", "0", - GOLD_OPERATOR_NONE); - gold_request_add_condition(gold_request, "Name", "NULL", - GOLD_OPERATOR_NONE); - - gold_request_add_selection(gold_request, "CPUCount"); - - gold_response = get_gold_response(gold_request); - destroy_gold_request(gold_request); - - if(!gold_response) { - error("jobacct_p_cluster_procs: no response received"); - return rc; - } - - if(gold_response->entry_cnt > 0) { - gold_response_entry_t *resp_entry = - list_pop(gold_response->entries); - gold_name_value_t *name_val = list_pop(resp_entry->name_val); - - if(procs == 
atoi(name_val->value)) { - debug("System hasn't changed since last entry"); - destroy_gold_name_value(name_val); - destroy_gold_response_entry(resp_entry); - destroy_gold_response(gold_response); - return SLURM_SUCCESS; - } else { - debug("System has changed from %s cpus to %d", - name_val->value, procs); - } - - destroy_gold_name_value(name_val); - destroy_gold_response_entry(resp_entry); - } else { - debug("We don't have an entry for this machine " - "most likely a first time running."); - } - - destroy_gold_response(gold_response); - - - - gold_request = create_gold_request(GOLD_OBJECT_EVENT, - GOLD_ACTION_MODIFY); - if(!gold_request) - return rc; - - gold_request_add_condition(gold_request, "Machine", cluster_name, - GOLD_OPERATOR_NONE); - gold_request_add_condition(gold_request, "EndTime", "0", - GOLD_OPERATOR_NONE); - gold_request_add_condition(gold_request, "Name", "NULL", - GOLD_OPERATOR_NONE); - - snprintf(tmp_buff, sizeof(tmp_buff), "%d", ((int)event_time - 1)); - gold_request_add_assignment(gold_request, "EndTime", tmp_buff); - - gold_response = get_gold_response(gold_request); - destroy_gold_request(gold_request); - - if(!gold_response) { - error("jobacct_p_cluster_procs: no response received"); - return rc; - } - - if(gold_response->rc) { - error("gold_response has non-zero rc(%d): %s", - gold_response->rc, - gold_response->message); - destroy_gold_response(gold_response); - return rc; - } - destroy_gold_response(gold_response); - - /* now add the new one */ - gold_request = create_gold_request(GOLD_OBJECT_EVENT, - GOLD_ACTION_CREATE); - if(!gold_request) - return rc; - - gold_request_add_assignment(gold_request, "Machine", cluster_name); - snprintf(tmp_buff, sizeof(tmp_buff), "%d", (int)event_time); - gold_request_add_assignment(gold_request, "StartTime", tmp_buff); - snprintf(tmp_buff, sizeof(tmp_buff), "%u", procs); - gold_request_add_assignment(gold_request, "CPUCount", tmp_buff); - - gold_response = get_gold_response(gold_request); - 
destroy_gold_request(gold_request); - - if(!gold_response) { - error("jobacct_p_cluster_procs: no response received"); - return rc; - } - - if(!gold_response->rc) - rc = SLURM_SUCCESS; - else { - error("gold_response has non-zero rc(%d): %s", - gold_response->rc, - gold_response->message); - } - destroy_gold_response(gold_response); - - return rc; -} diff --git a/src/plugins/nodeacct_storage/gold/nodeacct_storage_gold.c b/src/plugins/nodeacct_storage/gold/nodeacct_storage_gold.c index fd5ae9d8045..a6123110095 100644 --- a/src/plugins/nodeacct_storage/gold/nodeacct_storage_gold.c +++ b/src/plugins/nodeacct_storage/gold/nodeacct_storage_gold.c @@ -178,7 +178,7 @@ extern int nodeacct_storage_p_node_down(struct node_record *node_ptr, destroy_gold_request(gold_request); if(!gold_response) { - error("nodeacct_p_cluster_procs: no response received"); + error("nodeacct_storage_p_node_down: no response received"); return rc; } @@ -213,7 +213,7 @@ extern int nodeacct_storage_p_node_down(struct node_record *node_ptr, destroy_gold_request(gold_request); if(!gold_response) { - error("nodeacct_p_cluster_procs: no response received"); + error("nodeacct_p_node_down: no response received"); return rc; } @@ -286,6 +286,7 @@ extern int nodeacct_storage_p_cluster_procs(uint32_t procs, time_t event_time) gold_response_t *gold_response = NULL; char tmp_buff[50]; int rc = SLURM_ERROR; + bool no_modify = 0; if (procs == last_procs) { debug3("we have the same procs as before no need to " @@ -344,43 +345,46 @@ extern int nodeacct_storage_p_cluster_procs(uint32_t procs, time_t event_time) } else { debug("We don't have an entry for this machine " "most likely a first time running."); + no_modify = 1; } destroy_gold_response(gold_response); - - - gold_request = create_gold_request(GOLD_OBJECT_EVENT, - GOLD_ACTION_MODIFY); - if(!gold_request) - return rc; - - gold_request_add_condition(gold_request, "Machine", cluster_name, - GOLD_OPERATOR_NONE); - gold_request_add_condition(gold_request, 
"EndTime", "0", - GOLD_OPERATOR_NONE); - gold_request_add_condition(gold_request, "Name", "NULL", - GOLD_OPERATOR_NONE); - - snprintf(tmp_buff, sizeof(tmp_buff), "%d", ((int)event_time - 1)); - gold_request_add_assignment(gold_request, "EndTime", tmp_buff); - - gold_response = get_gold_response(gold_request); - destroy_gold_request(gold_request); - - if(!gold_response) { - error("nodeacct_p_cluster_procs: no response received"); - return rc; - } - - if(gold_response->rc) { - error("gold_response has non-zero rc(%d): %s", - gold_response->rc, - gold_response->message); + if(no_modify) { + gold_request = create_gold_request(GOLD_OBJECT_EVENT, + GOLD_ACTION_MODIFY); + if(!gold_request) + return rc; + + gold_request_add_condition(gold_request, "Machine", + cluster_name, + GOLD_OPERATOR_NONE); + gold_request_add_condition(gold_request, "EndTime", "0", + GOLD_OPERATOR_NONE); + gold_request_add_condition(gold_request, "Name", "NULL", + GOLD_OPERATOR_NONE); + + snprintf(tmp_buff, sizeof(tmp_buff), "%d", + ((int)event_time - 1)); + gold_request_add_assignment(gold_request, "EndTime", tmp_buff); + + gold_response = get_gold_response(gold_request); + destroy_gold_request(gold_request); + + if(!gold_response) { + error("jobacct_p_cluster_procs: no response received"); + return rc; + } + + if(gold_response->rc) { + error("gold_response has non-zero rc(%d): %s", + gold_response->rc, + gold_response->message); + destroy_gold_response(gold_response); + return rc; + } destroy_gold_response(gold_response); - return rc; } - destroy_gold_response(gold_response); /* now add the new one */ gold_request = create_gold_request(GOLD_OBJECT_EVENT, diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index d688adce23f..24e8696c889 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -324,9 +324,39 @@ int main(int argc, char *argv[]) slurmctld_conf.slurm_conf, slurm_strerror(error_code)); } + if (recover == 0) _gold_mark_all_nodes_down("cold-start", 
time(NULL)); + else if (!stat("/tmp/slurm_gold_first", &stat_buf)) { + /* this is here for when slurm is + * started with gold for the first + * time to log any downed nodes. + */ + struct node_record *node_ptr = + node_record_table_ptr; + int i=0; + time_t event_time = time(NULL); + debug("found /tmp/slurm_gold_first, " + "setting nodes down"); + for (i = 0; + i < node_record_count; + i++, node_ptr++) { + if (node_ptr->name == '\0' + || !node_ptr->reason) + continue; + + if(jobacct_g_node_down( + node_ptr, + event_time, + node_ptr->reason) + == SLURM_ERROR) + break; + } + if(unlink("/tmp/slurm_gold_first") < 0) + error("Error deleting " + "/tmp/slurm_gold_first"); + } } else { error("this host (%s) not valid controller (%s or %s)", node_name, slurmctld_conf.control_machine, -- GitLab