diff --git a/doc/man/man3/slurm_admin.3 b/doc/man/man3/slurm_admin.3 index 2cbb2303ad18e9dbf374cf714a9bbdd4752ac4bf..80f9b8d2950e86c9baab795a303e1a125366e6d8 100644 --- a/doc/man/man3/slurm_admin.3 +++ b/doc/man/man3/slurm_admin.3 @@ -15,6 +15,8 @@ void \fBslurm_free_key\fR ( slurm_key_t *\fBslurm_get_key\fR ( ); .LP int \fBslurm_reconfigure\fR ( ); +.LP +int \fBslurm_shutdown\fR ( ); .LP int \fBslurm_update_job\fR ( .br @@ -63,6 +65,8 @@ Specifies the pointer to a partition update request specification. See slurm.h f .LP \fBslurm_reconfigure\fR Request that the Slurm controller re-read its configuration file. The new configuration parameters take effect immediately. This function may only be successfully executed by user root. .LP +\fBslurm_shutdown\fR Request that the Slurm controller save its state and terminate. +.LP \fBslurm_update_job\fR Request that the configuration of a job be updated. Note that most, but not all paramters of a job may be changed by this function. Initialize the data structure using the \fBslurm_init_job_desc_msg\fR function to avoid making unanticipated changes to a job's configuration. This function may only be successfully executed by user root. .LP \fBslurm_update_node\fR Request that the state of one or more nodes be updated. Note that the state of a node (e.g. DRAINING, IDLE, etc.) may be changed, but its hardware configuration may not be changed by this function. If the hardware configuration of a node changes, update the Slurm configuration file and execute the \fBslurm_reconfigure\fR function. This function may only be successfully executed by user root. diff --git a/doc/man/man3/slurm_error.3 b/doc/man/man3/slurm_error.3 index f85ee92b6e1e70f226068415cd2083079596e5b4..1f09c8b6a68c127b332b40ffe3fb08f3f8d59c47 100644 --- a/doc/man/man3/slurm_error.3 +++ b/doc/man/man3/slurm_error.3 @@ -92,7 +92,7 @@ details. 
\fBslurm_init_part_desc_msg\fR(3), \fBslurm_init_job_desc_msg\fR(3), \fBslurm_job_will_run\fR(3), \fBslurm_load_ctl_conf\fR(3), \fBslurm_load_jobs\fR(3), \fBslurm_load_node\fR(3), \fBslurm_load_partitions\fR(3), -\fBslurm_reconfigure\fR(3), \fBslurm_submit_batch_job\fR(3), +\fBslurm_reconfigure\fR(3), \fBslurm_shutdown\fR(3), \fBslurm_submit_batch_job\fR(3), \fBslurm_update_job\fR(3), \fBslurm_update_node\fR(3), \fBslurm_update_partition\fR(3) diff --git a/src/api/reconfigure.c b/src/api/reconfigure.c index e0a042e6809e86cc65bf634b76ada71aff44dbc0..e349bb7cea604eb040a4cd61c7d6336948a60c96 100644 --- a/src/api/reconfigure.c +++ b/src/api/reconfigure.c @@ -1,5 +1,6 @@ /*****************************************************************************\ - * reconfigure.c - request that slurmctld re-read the configuration files + * reconfigure.c - request that slurmctld shutdown or re-read the + * configuration files ***************************************************************************** * Copyright (C) 2002 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -96,3 +97,72 @@ slurm_reconfigure () return SLURM_PROTOCOL_SUCCESS ; } +/* + * slurm_shutdown - issue RPC to have slurmctld shutdown + * RET SLURM_PROTOCOL_SUCCESS when slurmctld replies with a zero return code, + * SLURM_SOCKET_ERROR when the connection cannot be opened, used or closed, + * SLURM_PROTOCOL_ERROR on a non-zero return code or unexpected reply type; + * the slurm errno is set on each of those failures + */ +int +slurm_shutdown () +{ + int msg_size ; + int rc ; + slurm_fd sockfd ; + slurm_msg_t request_msg ; + slurm_msg_t response_msg ; + return_code_msg_t * slurm_rc_msg ; + + /* init message connection for message communication with controller */ + if ( ( sockfd = slurm_open_controller_conn ( ) ) == SLURM_SOCKET_ERROR ) { + slurm_seterrno ( SLURM_COMMUNICATIONS_CONNECTION_ERROR ); + return SLURM_SOCKET_ERROR ; + } + + /* send request message */ + request_msg . 
msg_type = REQUEST_SHUTDOWN ; + + if ( ( rc = slurm_send_controller_msg ( sockfd , & request_msg ) ) == SLURM_SOCKET_ERROR ) { + /* close the connection so a failed send does not leak it */ + slurm_shutdown_msg_conn ( sockfd ); + slurm_seterrno ( SLURM_COMMUNICATIONS_SEND_ERROR ); + return SLURM_SOCKET_ERROR ; + } + + /* receive message */ + if ( ( msg_size = slurm_receive_msg ( sockfd , & response_msg ) ) == SLURM_SOCKET_ERROR ) { + /* close the connection so a failed receive does not leak it */ + slurm_shutdown_msg_conn ( sockfd ); + slurm_seterrno ( SLURM_COMMUNICATIONS_RECEIVE_ERROR ); + return SLURM_SOCKET_ERROR ; + } + + /* shutdown message connection */ + if ( ( rc = slurm_shutdown_msg_conn ( sockfd ) ) == SLURM_SOCKET_ERROR ) { + slurm_seterrno ( SLURM_COMMUNICATIONS_SHUTDOWN_ERROR ); + return SLURM_SOCKET_ERROR ; + } + /* NOTE(review): any non-zero receive result is returned as-is, without setting the slurm errno - confirm slurm_receive_msg returns 0 on success */ + if ( msg_size ) + return msg_size; + + switch ( response_msg . msg_type ) + { + case RESPONSE_SLURM_RC: + slurm_rc_msg = ( return_code_msg_t * ) response_msg . 
data ; + rc = slurm_rc_msg->return_code; + slurm_free_return_code_msg ( slurm_rc_msg ); + if (rc) { + slurm_seterrno ( rc ); + return SLURM_PROTOCOL_ERROR; + } + break ; + default: + slurm_seterrno ( SLURM_UNEXPECTED_MSG_ERROR ); + return SLURM_PROTOCOL_ERROR; + } + + return SLURM_PROTOCOL_SUCCESS ; +} + diff --git a/src/api/slurm.h b/src/api/slurm.h index 5f5cb334bdff78dcd0f7bc6c6c11c8928b4c1211..03f572e8a97b621316afdff0258a711985a27f4c 100644 --- a/src/api/slurm.h +++ b/src/api/slurm.h @@ -133,12 +133,13 @@ extern int slurm_submit_batch_job (job_desc_msg_t * job_desc_msg, */ extern int slurm_job_will_run (job_desc_msg_t * job_desc_msg , resource_allocation_response_msg_t ** job_alloc_resp_msg ); -/* - * reconfigure - _ request that slurmctld re-read the configuration files - * output: returns 0 on success, errno otherwise - */ +/* slurm_reconfigure - request that slurmctld re-read the configuration files */ extern int slurm_reconfigure (); +/* + * slurm_shutdown - request that slurmctld terminate gracefully */ +extern int slurm_shutdown (); + /* update a job, node, or partition's configuration, root access only */ extern int slurm_update_job ( job_desc_msg_t * job_msg ) ; 
extern int slurm_update_node ( update_node_msg_t * node_msg ) ; diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index 6a69ca8ffde7bd8786faf3c61550425021b93801..d15bd5226da5fed997432fdf338b7016a5a35a2b 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -96,6 +96,8 @@ typedef enum { MESSAGE_NODE_REGISTRATION_STATUS, REQUEST_RECONFIGURE, RESPONSE_RECONFIGURE, + REQUEST_SHUTDOWN, + RESPONSE_SHUTDOWN, REQUEST_BUILD_INFO=2001, RESPONSE_BUILD_INFO, diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index dd65c97cba6f6989ca4bfcdc889bcedebf896c08..8e647f568da3eb91c188190fe43b5794a5cef844 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -128,6 +128,7 @@ int pack_msg ( slurm_msg_t const * msg , char ** buffer , uint32_t * buf_len ) break ; case REQUEST_NODE_REGISTRATION_STATUS : case REQUEST_RECONFIGURE : + case REQUEST_SHUTDOWN : /* Message contains no body/information */ break ; case RESPONSE_SUBMIT_BATCH_JOB: @@ -177,6 +178,7 @@ int pack_msg ( slurm_msg_t const * msg , char ** buffer , uint32_t * buf_len ) case REQUEST_SIGNAL_JOB_STEP : break ; case RESPONSE_RECONFIGURE : + case RESPONSE_SHUTDOWN : case RESPONSE_CANCEL_JOB_STEP : case RESPONSE_COMPLETE_JOB_STEP : case RESPONSE_SIGNAL_JOB : @@ -267,6 +269,7 @@ int unpack_msg ( slurm_msg_t * msg , char ** buffer , uint32_t * buf_len ) break ; case REQUEST_NODE_REGISTRATION_STATUS : case REQUEST_RECONFIGURE : + case REQUEST_SHUTDOWN : /* Message contains no body/information */ break ; case RESPONSE_SUBMIT_BATCH_JOB : @@ -318,6 +321,7 @@ int unpack_msg ( slurm_msg_t * msg , char ** buffer , uint32_t * buf_len ) case REQUEST_SIGNAL_JOB_STEP : break ; case RESPONSE_RECONFIGURE : + case RESPONSE_SHUTDOWN : case RESPONSE_CANCEL_JOB_STEP : case RESPONSE_COMPLETE_JOB_STEP : case RESPONSE_SIGNAL_JOB : diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c index 
e79d9ae2103c564d551bf93b09bca34cf8f746dd..c7d9f3abb237d05fa7d54fb901244b77de4e6773 100644 --- a/src/scontrol/scontrol.c +++ b/src/scontrol/scontrol.c @@ -568,6 +568,15 @@ process_command (int argc, char *argv[]) argv[1], argv[0]); } + } + else if (strncmp_i (argv[0], "shutdown", 5) == 0) { + if (argc > 2) + fprintf (stderr, + "too many arguments for keyword:%s\n", argv[0]); + error_code = slurm_shutdown (); /* ask the controller to save its state and terminate */ + if ((error_code != 0) && (quiet_flag != 1)) + fprintf (stderr, "error %d from shutdown\n", error_code); + } else if (strcmp_i (argv[0], "update") == 0) { if (argc < 2) { @@ -869,7 +878,8 @@ usage () { printf (" quit terminate this command.\n"); printf (" reconfigure re-read configuration files.\n"); printf (" show <ENTITY> [<ID>] display state of identified entity, default is all records.\n"); - printf (" update <SPECIFICATIONS> update job, node, or partition configuration.\n"); + printf (" shutdown shutdown slurm controller.\n"); + printf (" update <SPECIFICATIONS> update job, node, or partition configuration.\n"); printf (" verbose enable detailed logging.\n"); printf (" version display tool version number.\n"); printf (" <ENTITY> may be \"config\", \"job\", \"node\", or \"partition\".\n");