Skip to content
Snippets Groups Projects
Commit 6a398331 authored by Moe Jette
Browse files

Shutdown RPC to slurmctld sends shutdown RPC to all slurmd daemons.

parent 428d5591
No related branches found
No related tags found
No related merge requests found
...@@ -283,7 +283,9 @@ static int _valid_agent_arg(agent_arg_t *agent_arg_ptr) ...@@ -283,7 +283,9 @@ static int _valid_agent_arg(agent_arg_t *agent_arg_ptr)
(agent_arg_ptr->msg_type != REQUEST_KILL_TASKS) && (agent_arg_ptr->msg_type != REQUEST_KILL_TASKS) &&
(agent_arg_ptr->msg_type != REQUEST_PING) && (agent_arg_ptr->msg_type != REQUEST_PING) &&
(agent_arg_ptr->msg_type != REQUEST_BATCH_JOB_LAUNCH) && (agent_arg_ptr->msg_type != REQUEST_BATCH_JOB_LAUNCH) &&
(agent_arg_ptr->msg_type != REQUEST_SHUTDOWN) &&
(agent_arg_ptr->msg_type != REQUEST_UPDATE_JOB_TIME)) (agent_arg_ptr->msg_type != REQUEST_UPDATE_JOB_TIME))
/* Add appropriate free msg type to agent() as needed */
fatal("agent passed invalid message type %d", fatal("agent passed invalid message type %d",
agent_arg_ptr->msg_type); agent_arg_ptr->msg_type);
return SLURM_SUCCESS; return SLURM_SUCCESS;
...@@ -477,9 +479,10 @@ static void *_thread_per_node_rpc(void *args) ...@@ -477,9 +479,10 @@ static void *_thread_per_node_rpc(void *args)
goto cleanup; goto cleanup;
} }
/* receive message */ /* receive message as needed (most message types) */
if ((msg_size = slurm_receive_msg(sockfd, response_msg)) if ((task_ptr->msg_type != REQUEST_SHUTDOWN) &&
== SLURM_SOCKET_ERROR) { ((msg_size = slurm_receive_msg(sockfd, response_msg))
== SLURM_SOCKET_ERROR)) {
error( error(
"_thread_per_node_rpc/slurm_receive_msg to host %s: %m", "_thread_per_node_rpc/slurm_receive_msg to host %s: %m",
thread_ptr->node_name); thread_ptr->node_name);
...@@ -493,6 +496,10 @@ static void *_thread_per_node_rpc(void *args) ...@@ -493,6 +496,10 @@ static void *_thread_per_node_rpc(void *args)
thread_ptr->node_name); thread_ptr->node_name);
goto cleanup; goto cleanup;
} }
if (task_ptr->msg_type == REQUEST_SHUTDOWN) {
thread_state = DSH_DONE;
goto cleanup;
}
if (msg_size) { if (msg_size) {
error("_thread_per_node_rpc/msg_size to host %s error %d", error("_thread_per_node_rpc/msg_size to host %s error %d",
thread_ptr->node_name, msg_size); thread_ptr->node_name, msg_size);
......
...@@ -1750,16 +1750,15 @@ static void _slurm_rpc_shutdown_controller(slurm_msg_t * msg) ...@@ -1750,16 +1750,15 @@ static void _slurm_rpc_shutdown_controller(slurm_msg_t * msg)
/* do RPC call */ /* do RPC call */
if (error_code); if (error_code);
else if (core_arg) else if (core_arg)
debug3 debug3("performing immeditate shutdown without state save");
("performing immeditate shutdown without state save");
else if (shutdown_time) else if (shutdown_time)
debug3 debug3("shutdown RPC issued when already in progress");
("_slurm_rpc_shutdown_controller RPC issued after shutdown in progress");
else if (thread_id_sig) { else if (thread_id_sig) {
shutdown_slurmd();
pthread_kill(thread_id_sig, SIGTERM); /* signal clean-up */ pthread_kill(thread_id_sig, SIGTERM); /* signal clean-up */
} else { } else {
error shutdown_slurmd();
("thread_id_sig undefined, doing shutdown the hard way"); error("thread_id_sig undefined, doing shutdown the hard way");
shutdown_time = time(NULL); shutdown_time = time(NULL);
/* send REQUEST_SHUTDOWN_IMMEDIATE RPC */ /* send REQUEST_SHUTDOWN_IMMEDIATE RPC */
_slurmctld_shutdown(); _slurmctld_shutdown();
......
...@@ -1406,4 +1406,61 @@ static void _dump_hash (void) ...@@ -1406,4 +1406,61 @@ static void _dump_hash (void)
} }
#endif #endif
/*
 * shutdown_slurmd - tell every slurmd to shutdown
 *
 * Builds a REQUEST_SHUTDOWN agent request addressed to every entry in
 * node_record_table_ptr[0 .. node_record_count-1] and hands it to agent(),
 * normally in a detached thread; if the thread can not be created the
 * agent is executed inline in the caller's thread.
 *
 * NOTE(review): once agent() has been given kill_agent_args it is presumed
 * to own (and release) the request, its address/name arrays and msg_args --
 * confirm against agent().
 */
void shutdown_slurmd (void)
{
	int i, pos, rc;
	shutdown_msg_t *shutdown_req;
	int kill_buf_rec_size = 0;	/* records currently allocated */
	agent_arg_t *kill_agent_args;
	pthread_attr_t kill_attr_agent;
	pthread_t kill_thread_agent;

	shutdown_req = xmalloc (sizeof (shutdown_msg_t));
	shutdown_req->core = 0;

	kill_agent_args = xmalloc (sizeof (agent_arg_t));
	kill_agent_args->msg_type = REQUEST_SHUTDOWN;
	kill_agent_args->msg_args = shutdown_req;
	kill_agent_args->retry = 0;
	/* Set explicitly so the growth logic below does not depend on the
	 * allocator handing back zero-filled memory */
	kill_agent_args->node_count = 0;
	kill_agent_args->slurm_addr = NULL;
	kill_agent_args->node_names = NULL;

	for (i = 0; i < node_record_count; i++) {
		/* grow the address and name arrays 32 records at a time */
		if ((kill_agent_args->node_count + 1) > kill_buf_rec_size) {
			kill_buf_rec_size += 32;
			xrealloc ((kill_agent_args->slurm_addr),
				  (sizeof (struct sockaddr_in) *
				   kill_buf_rec_size));
			xrealloc ((kill_agent_args->node_names),
				  (MAX_NAME_LEN * kill_buf_rec_size));
		}
		kill_agent_args->slurm_addr[kill_agent_args->node_count] =
			node_record_table_ptr[i].slurm_addr;
		/* names are stored as fixed MAX_NAME_LEN byte slots;
		 * NOTE(review): strncpy does not terminate a source of
		 * MAX_NAME_LEN or more characters -- assumes node names
		 * are shorter than that, confirm */
		pos = MAX_NAME_LEN * kill_agent_args->node_count;
		strncpy (&kill_agent_args->node_names[pos],
			 node_record_table_ptr[i].name, MAX_NAME_LEN);
		kill_agent_args->node_count++;
	}

	if (kill_agent_args->node_count == 0) {
		/* No agent will run, so nothing else can release the
		 * message body: free it here along with the request */
		xfree (shutdown_req);
		xfree (kill_agent_args);
		return;
	}

	debug ("Spawning slurmd shutdown agent");
	if (pthread_attr_init (&kill_attr_agent))
		fatal ("pthread_attr_init error %m");
	if (pthread_attr_setdetachstate (&kill_attr_agent,
					 PTHREAD_CREATE_DETACHED))
		error ("pthread_attr_setdetachstate error %m");
#ifdef PTHREAD_SCOPE_SYSTEM
	if (pthread_attr_setscope (&kill_attr_agent,
				   PTHREAD_SCOPE_SYSTEM))
		error ("pthread_attr_setscope error %m");
#endif
	rc = pthread_create (&kill_thread_agent, &kill_attr_agent,
			     agent, (void *)kill_agent_args);
	if (rc)
		error ("pthread_create error %m");
	/* The attribute object is not needed once pthread_create has
	 * returned; release it whether or not the thread started */
	(void) pthread_attr_destroy (&kill_attr_agent);
	if (rc)
		agent ((void *)kill_agent_args);	/* do inline */
}
...@@ -891,6 +891,9 @@ extern void set_node_down (char *name); ...@@ -891,6 +891,9 @@ extern void set_node_down (char *name);
* Uses common data structures. */ * Uses common data structures. */
extern void set_slurmd_addr (void); extern void set_slurmd_addr (void);
/* shutdown_slurmd - tell every slurmd to shutdown */
extern void shutdown_slurmd (void);
/* /*
* signal_step_tasks - send specific signal to specific job step * signal_step_tasks - send specific signal to specific job step
* IN step_ptr - step record pointer * IN step_ptr - step record pointer
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment