diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index 2829f20ef750ecafe8f738793f286f69f78dcfff..9bf654b07187ae724f306c9e14e538d6a4dd3f6b 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -283,7 +283,9 @@ static int _valid_agent_arg(agent_arg_t *agent_arg_ptr)
 	    (agent_arg_ptr->msg_type != REQUEST_KILL_TASKS) &&
 	    (agent_arg_ptr->msg_type != REQUEST_PING) &&
 	    (agent_arg_ptr->msg_type != REQUEST_BATCH_JOB_LAUNCH) &&
+	    (agent_arg_ptr->msg_type != REQUEST_SHUTDOWN) &&
 	    (agent_arg_ptr->msg_type != REQUEST_UPDATE_JOB_TIME))
+		/* Add appropriate free msg type to agent() as needed */
 		fatal("agent passed invalid message type %d",
 		      agent_arg_ptr->msg_type);
 	return SLURM_SUCCESS;
@@ -477,9 +479,10 @@ static void *_thread_per_node_rpc(void *args)
 		goto cleanup;
 	}
 
-	/* receive message */
-	if ((msg_size = slurm_receive_msg(sockfd, response_msg))
-	    == SLURM_SOCKET_ERROR) {
+	/* receive message as needed (most message types) */
+	if ((task_ptr->msg_type != REQUEST_SHUTDOWN) &&
+	    ((msg_size = slurm_receive_msg(sockfd, response_msg))
+	     == SLURM_SOCKET_ERROR)) {
 		error(
 		    "_thread_per_node_rpc/slurm_receive_msg to host %s: %m",
 		    thread_ptr->node_name);
@@ -493,6 +496,10 @@ static void *_thread_per_node_rpc(void *args)
 		    thread_ptr->node_name);
 		goto cleanup;
 	}
+	if (task_ptr->msg_type == REQUEST_SHUTDOWN) {
+		thread_state = DSH_DONE;
+		goto cleanup;
+	}
 	if (msg_size) {
 		error("_thread_per_node_rpc/msg_size to host %s error %d",
 		      thread_ptr->node_name, msg_size);
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 00b9fee9ee325d49a46e26f6293759866be2488a..149c02422e75e932f3913a6eb749ee67b1a4a403 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -1750,16 +1750,15 @@ static void _slurm_rpc_shutdown_controller(slurm_msg_t * msg)
 	/* do RPC call */
 	if (error_code);
 	else if (core_arg)
-		debug3
-		    ("performing immeditate shutdown without state save");
+		debug3("performing immeditate shutdown without state save");
 	else if (shutdown_time)
-		debug3
-		    ("_slurm_rpc_shutdown_controller RPC issued after shutdown in progress");
+		debug3("shutdown RPC issued when already in progress");
 	else if (thread_id_sig) {
+		shutdown_slurmd();
 		pthread_kill(thread_id_sig, SIGTERM);	/* signal clean-up */
 	} else {
-		error
-		    ("thread_id_sig undefined, doing shutdown the hard way");
+		shutdown_slurmd();
+		error("thread_id_sig undefined, doing shutdown the hard way");
 		shutdown_time = time(NULL);
 		/* send REQUEST_SHUTDOWN_IMMEDIATE RPC */
 		_slurmctld_shutdown();
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index f4403e51cb376e42fb34ed82ac2d0e35176b596d..8ca51460839d58e3601da6b6f7ce1d7f123adf59 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -1406,4 +1406,73 @@ static void _dump_hash (void)
 
 }
 #endif
 
+/*
+ * shutdown_slurmd - tell every slurmd to shutdown
+ *
+ * Builds one REQUEST_SHUTDOWN agent request listing every entry of
+ * node_record_table_ptr and runs agent() on it, in a detached thread when
+ * pthread_create() succeeds and inline in the caller otherwise. The request
+ * is handed over to agent(); presumably agent() releases it (TODO confirm).
+ * With an empty node table nothing is sent and the request is freed here.
+ */
+void shutdown_slurmd (void)
+{
+	int i, pos;
+	shutdown_msg_t *shutdown_req;
+
+	int kill_buf_rec_size = 0;
+	agent_arg_t *kill_agent_args;
+	pthread_attr_t kill_attr_agent;
+	pthread_t kill_thread_agent;
+
+	shutdown_req = xmalloc(sizeof(shutdown_msg_t));
+	shutdown_req->core = 0;
+
+	kill_agent_args = xmalloc (sizeof (agent_arg_t));
+	kill_agent_args->msg_type = REQUEST_SHUTDOWN;
+	kill_agent_args->msg_args = shutdown_req;
+	kill_agent_args->retry = 0;
+
+	for (i = 0; i < node_record_count; i++) {
+		/* grow the address and name arrays 32 records at a time */
+		if ((kill_agent_args->node_count+1) > kill_buf_rec_size) {
+			kill_buf_rec_size += 32;
+			xrealloc ((kill_agent_args->slurm_addr),
+				  (sizeof (struct sockaddr_in) *
+				  kill_buf_rec_size));
+			xrealloc ((kill_agent_args->node_names),
+				  (MAX_NAME_LEN * kill_buf_rec_size));
+		}
+		kill_agent_args->slurm_addr[kill_agent_args->node_count] =
+					node_record_table_ptr[i].slurm_addr;
+		/* names are packed in fixed MAX_NAME_LEN-byte slots */
+		pos = MAX_NAME_LEN * kill_agent_args->node_count;
+		strncpy (&kill_agent_args->node_names[pos],
+			 node_record_table_ptr[i].name, MAX_NAME_LEN);
+		kill_agent_args->node_count++;
+
+	}
+	if (kill_agent_args->node_count == 0) {
+		/* no agent will run, so release the payload as well */
+		xfree (shutdown_req);
+		xfree (kill_agent_args);
+	} else {
+		debug ("Spawning slurmd shutdown agent");
+		if (pthread_attr_init (&kill_attr_agent))
+			fatal ("pthread_attr_init error %m");
+		if (pthread_attr_setdetachstate (&kill_attr_agent,
+						PTHREAD_CREATE_DETACHED))
+			error ("pthread_attr_setdetachstate error %m");
+#ifdef PTHREAD_SCOPE_SYSTEM
+		if (pthread_attr_setscope (&kill_attr_agent,
+					  PTHREAD_SCOPE_SYSTEM))
+			error ("pthread_attr_setscope error %m");
+#endif
+		if (pthread_create (&kill_thread_agent, &kill_attr_agent,
+				    agent, (void *)kill_agent_args)) {
+			error ("pthread_create error %m");
+			agent((void *)kill_agent_args);	/* do inline */
+		}
+	}
+}
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 0fdce02a7fdbd0ded7eb2e3c79561afb87880957..00ea35367e78725af134734b0d3adc10ebfdfb15 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -891,6 +891,9 @@ extern void set_node_down (char *name);
  *	Uses common data structures.
  */
 extern void set_slurmd_addr (void);
+/* shutdown_slurmd - tell every slurmd to shutdown */
+extern void shutdown_slurmd (void);
+
 /*
  * signal_step_tasks - send specific signal to specific job step
  * IN step_ptr - step record pointer