Skip to content
Snippets Groups Projects
Commit fc65aaf3 authored by Moe Jette
Browse files

Make nodes DOWN and log event on slurmd prolog/epilog failure.

parent e6d85f72
No related branches found
No related tags found
No related merge requests found
...@@ -73,7 +73,8 @@ ...@@ -73,7 +73,8 @@
# define WDOG_POLL 2 /* secs */ # define WDOG_POLL 2 /* secs */
#endif #endif
typedef enum { DSH_NEW, DSH_ACTIVE, DSH_DONE, DSH_FAILED } state_t; typedef enum { DSH_NEW, DSH_ACTIVE, DSH_DONE, DSH_NO_RESP,
DSH_FAILED } state_t;
typedef struct thd { typedef struct thd {
pthread_t thread; /* thread ID */ pthread_t thread; /* thread ID */
...@@ -296,7 +297,8 @@ static int _valid_agent_arg(agent_arg_t *agent_arg_ptr) ...@@ -296,7 +297,8 @@ static int _valid_agent_arg(agent_arg_t *agent_arg_ptr)
*/ */
static void *_wdog(void *args) static void *_wdog(void *args)
{ {
int i, fail_cnt, work_done, delay, max_delay = 0; int fail_cnt, no_resp_cnt, work_done;
int i, delay, max_delay = 0;
agent_info_t *agent_ptr = (agent_info_t *) args; agent_info_t *agent_ptr = (agent_info_t *) args;
thd_t *thread_ptr = agent_ptr->thread_struct; thd_t *thread_ptr = agent_ptr->thread_struct;
#if AGENT_IS_THREAD #if AGENT_IS_THREAD
...@@ -309,8 +311,10 @@ static void *_wdog(void *args) ...@@ -309,8 +311,10 @@ static void *_wdog(void *args)
#endif #endif
while (1) { while (1) {
work_done = 1; /* assume all threads complete for now */ work_done = 1; /* assume all threads complete */
fail_cnt = 0; /* assume all threads complete sucessfully for now */ fail_cnt = 0; /* assume no threads failures */
no_resp_cnt = 0; /* assume all threads respond */
sleep(WDOG_POLL); sleep(WDOG_POLL);
slurm_mutex_lock(&agent_ptr->thread_mutex); slurm_mutex_lock(&agent_ptr->thread_mutex);
...@@ -332,6 +336,9 @@ static void *_wdog(void *args) ...@@ -332,6 +336,9 @@ static void *_wdog(void *args)
max_delay = max_delay =
(int) thread_ptr[i].time; (int) thread_ptr[i].time;
break; break;
case DSH_NO_RESP:
no_resp_cnt++;
break;
case DSH_FAILED: case DSH_FAILED:
fail_cnt++; fail_cnt++;
break; break;
...@@ -343,12 +350,12 @@ static void *_wdog(void *args) ...@@ -343,12 +350,12 @@ static void *_wdog(void *args)
} }
/* Notify slurmctld of non-responding nodes */ /* Notify slurmctld of non-responding nodes */
if (fail_cnt) { if (no_resp_cnt) {
#if AGENT_IS_THREAD #if AGENT_IS_THREAD
/* Update node table data for non-responding nodes */ /* Update node table data for non-responding nodes */
lock_slurmctld(node_write_lock); lock_slurmctld(node_write_lock);
for (i = 0; i < agent_ptr->thread_count; i++) { for (i = 0; i < agent_ptr->thread_count; i++) {
if (thread_ptr[i].state == DSH_FAILED) if (thread_ptr[i].state == DSH_NO_RESP)
node_not_resp(thread_ptr[i].node_name); node_not_resp(thread_ptr[i].node_name);
} }
unlock_slurmctld(node_write_lock); unlock_slurmctld(node_write_lock);
...@@ -358,7 +365,7 @@ static void *_wdog(void *args) ...@@ -358,7 +365,7 @@ static void *_wdog(void *args)
slurm_names = xmalloc(fail_cnt * MAX_NAME_LEN); slurm_names = xmalloc(fail_cnt * MAX_NAME_LEN);
fail_cnt = 0; fail_cnt = 0;
for (i = 0; i < agent_ptr->thread_count; i++) { for (i = 0; i < agent_ptr->thread_count; i++) {
if (thread_ptr[i].state == DSH_FAILED) { if (thread_ptr[i].state == DSH_NO_RESP) {
strncpy(&slurm_names strncpy(&slurm_names
[MAX_NAME_LEN * fail_cnt], [MAX_NAME_LEN * fail_cnt],
thread_ptr[i].node_name, thread_ptr[i].node_name,
...@@ -383,6 +390,8 @@ static void *_wdog(void *args) ...@@ -383,6 +390,8 @@ static void *_wdog(void *args)
/* Update last_response on responding nodes */ /* Update last_response on responding nodes */
lock_slurmctld(node_write_lock); lock_slurmctld(node_write_lock);
for (i = 0; i < agent_ptr->thread_count; i++) { for (i = 0; i < agent_ptr->thread_count; i++) {
if (thread_ptr[i].state == DSH_FAILED)
set_node_down(thread_ptr[i].node_name);
if (thread_ptr[i].state == DSH_DONE) if (thread_ptr[i].state == DSH_DONE)
node_did_resp(thread_ptr[i].node_name); node_did_resp(thread_ptr[i].node_name);
} }
...@@ -390,16 +399,17 @@ static void *_wdog(void *args) ...@@ -390,16 +399,17 @@ static void *_wdog(void *args)
#else #else
/* Build a list of all responding nodes and send it to slurmctld to /* Build a list of all responding nodes and send it to slurmctld to
* update time stamps */ * update time stamps */
done_cnt = agent_ptr->thread_count - fail_cnt; done_cnt = agent_ptr->thread_count - fail_cnt - no_resp_cnt;
slurm_names = xmalloc(done_cnt * MAX_NAME_LEN); slurm_names = xmalloc(done_cnt * MAX_NAME_LEN);
done_cnt = 0; done_cnt = 0;
for (i = 0; i < agent_ptr->thread_count; i++) { for (i = 0; i < agent_ptr->thread_count; i++) {
if (thread_ptr[i].state == DSH_DONE) { if (thread_ptr[i].state == DSH_DONE)
strncpy(&slurm_names[MAX_NAME_LEN * done_cnt], strncpy(&slurm_names[MAX_NAME_LEN * done_cnt],
thread_ptr[i].node_name, MAX_NAME_LEN); thread_ptr[i].node_name, MAX_NAME_LEN);
done_cnt++; done_cnt++;
} }
} }
/* need support for node failures here too */
/* send RPC */ /* send RPC */
fatal("Code development needed here if agent is not thread"); fatal("Code development needed here if agent is not thread");
...@@ -428,7 +438,7 @@ static void *_thread_per_node_rpc(void *args) ...@@ -428,7 +438,7 @@ static void *_thread_per_node_rpc(void *args)
return_code_msg_t *slurm_rc_msg; return_code_msg_t *slurm_rc_msg;
task_info_t *task_ptr = (task_info_t *) args; task_info_t *task_ptr = (task_info_t *) args;
thd_t *thread_ptr = task_ptr->thread_struct_ptr; thd_t *thread_ptr = task_ptr->thread_struct_ptr;
state_t thread_state = DSH_FAILED; state_t thread_state = DSH_NO_RESP;
sigset_t set; sigset_t set;
/* set up SIGALRM handler */ /* set up SIGALRM handler */
...@@ -494,19 +504,27 @@ static void *_thread_per_node_rpc(void *args) ...@@ -494,19 +504,27 @@ static void *_thread_per_node_rpc(void *args)
slurm_rc_msg = (return_code_msg_t *) response_msg->data; slurm_rc_msg = (return_code_msg_t *) response_msg->data;
rc = slurm_rc_msg->return_code; rc = slurm_rc_msg->return_code;
slurm_free_return_code_msg(slurm_rc_msg); slurm_free_return_code_msg(slurm_rc_msg);
if (rc) if (rc == 0) {
error("_thread_per_node_rpc/rc error from host %s: %s", debug3("agent processed RPC to node %s",
thread_ptr->node_name);
thread_state = DSH_DONE;
} else if (rc == ESLURMD_EPILOG_FAILED) {
error("Epilog failure on host %s, setting DOWN",
thread_ptr->node_name);
thread_state = DSH_FAILED;
} else if (rc == ESLURMD_PROLOG_FAILED) {
error("Prolog failure on host %s, setting DOWN",
thread_ptr->node_name);
thread_state = DSH_FAILED;
} else {
error("agent error from host %s: %s",
thread_ptr->node_name, thread_ptr->node_name,
slurm_strerror(rc)); /* Don't use %m */ slurm_strerror(rc)); /* Don't use %m */
else { thread_state = DSH_DONE;
debug3
("agent sucessfully processed RPC to node %s",
thread_ptr->node_name);
} }
thread_state = DSH_DONE;
break; break;
default: default:
error("_thread_per_node_rpc from host %s, bad msg_type %d", error("agent reply from host %s, bad msg_type %d",
thread_ptr->node_name, response_msg->msg_type); thread_ptr->node_name, response_msg->msg_type);
break; break;
} }
...@@ -578,7 +596,7 @@ static void _queue_agent_retry(agent_info_t * agent_info_ptr, int count) ...@@ -578,7 +596,7 @@ static void _queue_agent_retry(agent_info_t * agent_info_ptr, int count)
j = 0; j = 0;
for (i = 0; i < agent_info_ptr->thread_count; i++) { for (i = 0; i < agent_info_ptr->thread_count; i++) {
if (thread_ptr[i].state != DSH_FAILED) if (thread_ptr[i].state != DSH_NO_RESP)
continue; continue;
agent_arg_ptr->slurm_addr[j] = thread_ptr[i].slurm_addr; agent_arg_ptr->slurm_addr[j] = thread_ptr[i].slurm_addr;
strncpy(&agent_arg_ptr->node_names[j * MAX_NAME_LEN], strncpy(&agent_arg_ptr->node_names[j * MAX_NAME_LEN],
......
...@@ -1194,6 +1194,32 @@ void node_not_resp (char *name) ...@@ -1194,6 +1194,32 @@ void node_not_resp (char *name)
return; return;
} }
/* set_node_down - make the specified node's state DOWN, kill jobs as needed
* IN name - name of the node */
void set_node_down (char *name)
{
struct node_record *node_ptr;
int node_inx;
uint16_t resp_state;
node_ptr = find_node_record (name);
if (node_ptr == NULL) {
error ("node_not_resp unable to find node %s", name);
return;
}
node_inx = node_ptr - node_record_table_ptr;
last_node_update = time (NULL);
/* preserve NODE_STATE_NO_RESPOND flag if set */
resp_state = node_ptr->node_state & NODE_STATE_NO_RESPOND;
node_ptr->node_state = NODE_STATE_DOWN | resp_state;
bit_clear (up_node_bitmap, node_inx);
bit_clear (idle_node_bitmap, node_inx);
(void) kill_running_job_by_node_name(name, false);
return;
}
/* ping_nodes - check that all nodes and daemons are alive, /* ping_nodes - check that all nodes and daemons are alive,
* get nodes in UNKNOWN state to register */ * get nodes in UNKNOWN state to register */
void ping_nodes (void) void ping_nodes (void)
......
...@@ -211,7 +211,9 @@ void make_node_idle(struct node_record *node_ptr) ...@@ -211,7 +211,9 @@ void make_node_idle(struct node_record *node_ptr)
base_state = node_ptr->node_state & (~NODE_STATE_NO_RESPOND); base_state = node_ptr->node_state & (~NODE_STATE_NO_RESPOND);
no_resp_flag = node_ptr->node_state & NODE_STATE_NO_RESPOND; no_resp_flag = node_ptr->node_state & NODE_STATE_NO_RESPOND;
if (base_state == NODE_STATE_DRAINING) { if (base_state == NODE_STATE_DOWN) {
debug3("Node %s being left DOWN", node_ptr->name);
} else if (base_state == NODE_STATE_DRAINING) {
node_ptr->node_state = NODE_STATE_DRAINED; node_ptr->node_state = NODE_STATE_DRAINED;
bit_clear(idle_node_bitmap, inx); bit_clear(idle_node_bitmap, inx);
bit_clear(up_node_bitmap, inx); bit_clear(up_node_bitmap, inx);
......
...@@ -883,6 +883,10 @@ extern int select_nodes (struct job_record *job_ptr, bool test_only); ...@@ -883,6 +883,10 @@ extern int select_nodes (struct job_record *job_ptr, bool test_only);
*/ */
extern int set_batch_job_sid(uid_t uid, uint32_t job_id, uint32_t batch_sid); extern int set_batch_job_sid(uid_t uid, uint32_t job_id, uint32_t batch_sid);
/* set_node_down - make the specified node's state DOWN, kill jobs as needed
 *	(any NODE_STATE_NO_RESPOND flag already set on the node is preserved;
 *	an error is logged and nothing changes if the node is not found)
 * IN name - name of the node */
extern void set_node_down (char *name);
/* set_slurmd_addr - establish the slurm_addr for the slurmd on each node /* set_slurmd_addr - establish the slurm_addr for the slurmd on each node
* Uses common data structures. */ * Uses common data structures. */
extern void set_slurmd_addr (void); extern void set_slurmd_addr (void);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment