Record why a node was set DOWN.

set_node_down() gains a second argument, `reason`. Every call site in this
patch (agent.c, node_mgr.c, ping_nodes.c) passes a short reason string, and
set_node_down() stores a copy in node_ptr->reason when that field is NULL.
validate_node_specs() tracks which resource check failed in the new local
`reason_down`; it starts out NULL so that no path can pass an indeterminate
pointer to set_node_down().
The IN/OUT parameter comments in proc_req.c and proc_req.h are reordered to
the "IN name - description" form.

diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 076ba438cc853082e827eaa4c04ea7a76adc428f..135c0f9ded3438249aa9e912c3250b8d773a45f9 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -430,7 +430,8 @@ static void *_wdog(void *args)
 		lock_slurmctld(node_write_lock);
 		for (i = 0; i < agent_ptr->thread_count; i++) {
 			if (thread_ptr[i].state == DSH_FAILED)
-				set_node_down(thread_ptr[i].node_name);
+				set_node_down(thread_ptr[i].node_name,
+					"Prolog/epilog failure");
 			if ((thread_ptr[i].state == DSH_DONE) ||
 			    (thread_ptr[i].state == DSH_JOB_HUNG))
 				node_did_resp(thread_ptr[i].node_name);
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index 9f9d81de342411e7ed3068a94d1e9bc60949a57b..b466a6b18f46bd3a7cd5f845738acc9af37be041 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -1036,6 +1036,7 @@ validate_node_specs (char *node_name, uint32_t cpus,
 	struct config_record *config_ptr;
 	struct node_record *node_ptr;
 	uint16_t resp_state;
+	char *reason_down = NULL;	/* stays NULL unless a check below fails */
 
 	node_ptr = find_node_record (node_name);
 	if (node_ptr == NULL)
@@ -1047,8 +1048,8 @@ validate_node_specs (char *node_name, uint32_t cpus,
 
 	if (cpus < config_ptr->cpus) {
 		error ("Node %s has low cpu count %u", node_name, cpus);
-		error_code = EINVAL;
-	}
+		error_code  = EINVAL;
+		reason_down = "Low CPUs";	}
 	node_ptr->cpus = cpus;
 	if ((config_ptr->cpus != cpus) && (node_ptr->partition_ptr))
 		node_ptr->partition_ptr->total_cpus +=
@@ -1057,7 +1058,8 @@ validate_node_specs (char *node_name, uint32_t cpus,
 	if (real_memory < config_ptr->real_memory) {
 		error ("Node %s has low real_memory size %u",
 			node_name, real_memory);
-		error_code = EINVAL;
+		error_code  = EINVAL;
+		reason_down = "Low RealMemory";
 	}
 	node_ptr->real_memory = real_memory;
 
@@ -1065,6 +1067,7 @@ validate_node_specs (char *node_name, uint32_t cpus,
 		error ("Node %s has low tmp_disk size %u",
 			node_name, tmp_disk);
 		error_code = EINVAL;
+		reason_down = "Low TmpDisk";
 	}
 	node_ptr->tmp_disk = tmp_disk;
 
@@ -1074,14 +1077,14 @@ validate_node_specs (char *node_name, uint32_t cpus,
 		if ((node_ptr->node_state != NODE_STATE_DRAINING) &&
 		    (node_ptr->node_state != NODE_STATE_DRAINED)) {
 			error ("Setting node %s state to DOWN", node_name);
-			set_node_down(node_name);
+			set_node_down(node_name, reason_down);
 		}
 	} else if (status == ESLURMD_PROLOG_FAILED) {
 		if ((node_ptr->node_state != NODE_STATE_DRAINING) &&
 		    (node_ptr->node_state != NODE_STATE_DRAINED)) {
 			error ("Prolog failure on node %s, state to DOWN",
 				node_name);
-			set_node_down(node_name);
+			set_node_down(node_name, "Prolog failed");
 		}
 	} else {
 		node_ptr->cpus = cpus;
@@ -1092,8 +1095,8 @@ validate_node_specs (char *node_name, uint32_t cpus,
 		 * processor count at present */
 		if ((slurmctld_conf.fast_schedule == 0) &&
 		    (node_ptr->config_ptr->cpus != cpus)) {
-			error ("Node %s processor count inconsistent with rest of partition",
-				node_name);
+			error ("Node %s processor count inconsistent with rest "
+				"of partition", node_name);
 			return EINVAL;		/* leave node down */
 		}
 #endif
@@ -1216,8 +1219,9 @@ void node_not_resp (char *name, time_t msg_time)
  * set_node_down - make the specified node's state DOWN if possible
  *	(not in a DRAIN state), kill jobs as needed
  * IN name - name of the node
+ * IN reason - why the node is DOWN
  */
-void set_node_down (char *name)
+void set_node_down (char *name, char *reason)
 {
 	struct node_record *node_ptr;
 
@@ -1231,6 +1235,8 @@ void set_node_down (char *name)
 	    (node_ptr->node_state != NODE_STATE_DRAINED))
 		_make_node_down(node_ptr);
 	(void) kill_running_job_by_node_name(name, false);
+	if (node_ptr->reason == NULL)
+		node_ptr->reason = xstrdup(reason);
 
 	return;
 }
diff --git a/src/slurmctld/ping_nodes.c b/src/slurmctld/ping_nodes.c
index b3a7df42777079adf2fd5d19a0f0a54965729177..ebf8213c43b755d3b691494b7b6f376a041e7641 100644
--- a/src/slurmctld/ping_nodes.c
+++ b/src/slurmctld/ping_nodes.c
@@ -137,7 +137,8 @@ void ping_nodes (void)
 		     (base_state != NODE_STATE_DRAINED))) {
 			error ("Node %s not responding, setting DOWN",
 			       node_record_table_ptr[i].name);
-			set_node_down(node_record_table_ptr[i].name);
+			set_node_down(node_record_table_ptr[i].name,
+					"Not responding");
 			continue;
 		}
 
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index c9511b78a2dce6e4ea50cbb78e65e78b6064095d..63408d34c96ddd024b3c04c9a5c408b2fe57c80d 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -93,10 +93,10 @@ inline static void _update_cred_key(void);
 
 /*
  * diff_tv_str - build a string showing the time difference between two times
- * tv1 IN - start of event
- * tv2 IN - end of event
- * tv_str OUT - place to put delta time in format "usec=%ld"
- * len_tv_str IN - size of tv_str in bytes
+ * IN tv1 - start of event
+ * IN tv2 - end of event
+ * OUT tv_str - place to put delta time in format "usec=%ld"
+ * IN len_tv_str - size of tv_str in bytes
  */
 inline void diff_tv_str(struct timeval *tv1,struct timeval *tv2,
 		char *tv_str, int len_tv_str)
@@ -109,7 +109,7 @@ inline void diff_tv_str(struct timeval *tv1,struct timeval *tv2,
 
 /*
  * slurmctld_req - Process an individual RPC request
- * IN/OUT - the request message, data associated with the message is freed
+ * IN/OUT msg - the request message, data associated with the message is freed
  */
 void slurmctld_req (slurm_msg_t * msg)
 {
diff --git a/src/slurmctld/proc_req.h b/src/slurmctld/proc_req.h
index 5db54b0bc98f0ece3c3ed3c0e3728ea86d6729f7..bcb9d9f8c466ed780d453d9b8db6bb20513f2350 100644
--- a/src/slurmctld/proc_req.h
+++ b/src/slurmctld/proc_req.h
@@ -46,17 +46,17 @@
 
 /*
  * diff_tv_str - build a string showing the time difference between two times
- * tv1 IN - start of event
- * tv2 IN - end of event
- * tv_str OUT - place to put delta time in format "usec=%ld"
- * len_tv_str IN - size of tv_str in bytes
+ * IN tv1 - start of event
+ * IN tv2 - end of event
+ * OUT tv_str - place to put delta time in format "usec=%ld"
+ * IN len_tv_str - size of tv_str in bytes
  */
 extern inline void diff_tv_str(struct timeval *tv1,struct timeval *tv2,
 		char *tv_str, int len_tv_str);
 
 /*
  * slurmctld_req - Process an individual RPC request
- * IN/OUT - the request message, data associated with the message is freed
+ * IN/OUT msg - the request message, data associated with the message is freed
  */
 void slurmctld_req (slurm_msg_t * msg);
 
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 5f05ef8e7a8cfe6df8782c0dd491e02d204d596d..765c36b318aea1ff26cab9053de5f4bf46b1453f 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -936,8 +936,9 @@ extern int select_nodes (struct job_record *job_ptr, bool test_only);
  * set_node_down - make the specified node's state DOWN if possible
  *	(not in a DRAIN state), kill jobs as needed
  * IN name - name of the node
+ * IN reason - why the node is DOWN
  */
-extern void set_node_down (char *name);
+extern void set_node_down (char *name, char *reason);
 
 /* set_slurmd_addr - establish the slurm_addr for the slurmd on each node
  *	Uses common data structures. */