diff --git a/NEWS b/NEWS index b88e869c7506951c68ce4b5f073a21427da6b132..a28aa11c70c3ef36db3eff04b92e71b229c88554 100644 --- a/NEWS +++ b/NEWS @@ -15,6 +15,8 @@ documents those changes that are of interest to users and admins. "tux5-1"). -- Avoid re-use of job_id (if specified at submit time) when the existing job is in completing state (possible race condition with Moab). + -- Add support for "scontrol takeover" command for backup controller to + assume control immediately. Patch from Matthieu Hautreux, CEA. * Changes in SLURM 2.0.0-rc1 ============================== diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am index 24c23cdb0096343835174855a9613713c997208d..3ee8ff7965911739361d38d68e3d86324170a74d 100644 --- a/doc/man/Makefile.am +++ b/doc/man/Makefile.am @@ -135,6 +135,7 @@ man3_MANS = man3/slurm_hostlist_create.3 \ man3/slurm_strerror.3 \ man3/slurm_submit_batch_job.3 \ man3/slurm_suspend.3 \ + man3/slurm_takeover.3 \ man3/slurm_terminate_job.3 \ man3/slurm_terminate_job_step.3 \ man3/slurm_update_job.3 \ diff --git a/doc/man/Makefile.in b/doc/man/Makefile.in index 31c9b10c181bb3dc7b9eff5d086b807343d83041..5faf27b8c0258967f5f39c884924da3b724c3ed5 100644 --- a/doc/man/Makefile.in +++ b/doc/man/Makefile.in @@ -387,6 +387,7 @@ man3_MANS = man3/slurm_hostlist_create.3 \ man3/slurm_strerror.3 \ man3/slurm_submit_batch_job.3 \ man3/slurm_suspend.3 \ + man3/slurm_takeover.3 \ man3/slurm_terminate_job.3 \ man3/slurm_terminate_job_step.3 \ man3/slurm_update_job.3 \ diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index 68478f44e8013d40c07f099217080e4a8e813370..0503dd93d0ccd061a72b4dbc42593c42d14d5aa7 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -1,4 +1,4 @@ -.TH SCONTROL "1" "April 2009" "scontrol 2.0" "Slurm components" +.TH SCONTROL "1" "May 2009" "scontrol 2.0" "Slurm components" .SH "NAME" scontrol \- Used view and modify Slurm configuration and state. 
@@ -263,6 +263,18 @@ User processes must stop on receipt of SIGSTOP signal and resume upon receipt of SIGCONT for this operation to be effective. Not all architectures and configurations support job suspension. +.TP +\fBtakeover\fP +Instruct SLURM's backup controller (slurmctld) to take over system control. +SLURM's backup controller requests control from the primary and waits for +its termination. After that, it switches from backup mode to controller +mode. If the primary controller cannot be contacted, it switches directly to +controller mode. This can be used to speed up the SLURM controller +fail\-over mechanism when the primary node is down. +It can also be used to minimize disruption when the computer executing the +primary SLURM controller is scheduled to go down. +(Note: SLURM's primary controller will take control back at startup.) + .TP \fBupdate\fP \fISPECIFICATION\fP Update job, node, partition, or reservation configuration per the supplied @@ -755,6 +767,7 @@ details. \fBslurm_load_partitions\fR(3), \fBslurm_reconfigure\fR(3), \fBslurm_requeue\fR(3), \fBslurm_resume\fR(3), \fBslurm_shutdown\fR(3), \fBslurm_suspend\fR(3), +\fBslurm_takeover\fR(3), \fBslurm_update_job\fR(3), \fBslurm_update_node\fR(3), \fBslurm_update_partition\fR(3), \fBslurm.conf\fR(5) diff --git a/doc/man/man3/slurm_reconfigure.3 b/doc/man/man3/slurm_reconfigure.3 index 7bb4dd7221e1db786ecc01eb0adcf990463a8cba..a04c25569db67b26a34c678379a91189adc18989 100644 --- a/doc/man/man3/slurm_reconfigure.3 +++ b/doc/man/man3/slurm_reconfigure.3 @@ -1,9 +1,9 @@ -.TH "Slurm API" "3" "Jan 2009" "Morris Jette et.al." 
"Slurm administrative calls" +.TH "Slurm API" "3" "May 2009" "Morris Jette" "Slurm administrative calls" .SH "NAME" slurm_create_partition, slurm_create_reservation, slurm_delete_partition, slurm_delete_reservation, slurm_init_part_desc_msg, slurm_init_resv_desc_msg, -slurm_reconfigure, slurm_shutdown, slurm_update_job, +slurm_reconfigure, slurm_shutdown, slurm_takeover, slurm_update_job, ,slurm_init_update_node_msg slurm_update_node, slurm_update_partition, slurm_update_reservation \- Slurm administrative functions @@ -54,6 +54,8 @@ int \fBslurm_shutdown\fR ( uint16_t \fIshutdown_options\fP .br ); +.LP +int \fBslurm_takeover\fR ( ); .LP int \fBslurm_update_job\fR ( .br @@ -162,6 +164,10 @@ immediately. This function may only be successfully executed by user root. \fBslurm_shutdown\fR Request that the Slurm controller terminate. This function may only be successfully executed by user root. .LP +\fBslurm_takeover\fR Request that the Slurm primary controller shutdown +immediately and the backup controller take over. +This function may only be successfully executed by user root. +.LP \fBslurm_update_job\fR Request that the configuration of a job be updated. Note that most, but not all parameters of a job may be changed by this function. 
Initialize the data structure using the \fBslurm_init_job_desc_msg\fR function diff --git a/doc/man/man3/slurm_takeover.3 b/doc/man/man3/slurm_takeover.3 new file mode 100644 index 0000000000000000000000000000000000000000..8c2ed98140da9b138eaf0ab3b329016669dac41f --- /dev/null +++ b/doc/man/man3/slurm_takeover.3 @@ -0,0 +1 @@ +.so man3/slurm_reconfigure.3 diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 1bf4da80db7427456d097c5ff5fd43860f32f7c3..1aea5747699206013df8804853c952a6818d7b5e 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -2232,6 +2232,14 @@ extern int slurm_reconfigure PARAMS(( void )); */ extern int slurm_shutdown PARAMS(( uint16_t options )); +/* + * slurm_takeover - issue RPC to have Slurm backup controller (slurmctld) + * take over the primary controller. + * + * RET 0 or a slurm error code + */ +extern int slurm_takeover PARAMS(( void )); + /* * slurm_set_debug_level - issue RPC to set slurm controller debug level * IN debug_level - requested debug level diff --git a/src/api/reconfigure.c b/src/api/reconfigure.c index 64ac9f6a7f9f273732192448d65b238e49a3512d..e7e5d281457889a063d5b9ae8f0f8656e1cc9e56 100644 --- a/src/api/reconfigure.c +++ b/src/api/reconfigure.c @@ -134,6 +134,23 @@ slurm_shutdown (uint16_t options) return _send_message_controller(PRIMARY_CONTROLLER, &req_msg); } +/* + * slurm_takeover - issue RPC to have Slurm backup controller take over the + * primary controller. 
REQUEST_CONTROL is sent by the backup + * to the primary controller to take control + * RET 0 or a slurm error code + */ +int +slurm_takeover ( void ) +{ + slurm_msg_t req_msg; + + slurm_msg_t_init(&req_msg); + req_msg.msg_type = REQUEST_TAKEOVER; + + return _send_message_controller(SECONDARY_CONTROLLER, &req_msg); +} + int _send_message_controller (enum controller_id dest, slurm_msg_t *req) { diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 2db2c6b6443d21c682f3b2e9c0b8d4d93701c69d..37c687465921c2dec78d8f5fada1ee8e170b7c28 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -1758,6 +1758,7 @@ extern int slurm_free_msg_data(slurm_msg_type_t type, void *data) case REQUEST_PING: case REQUEST_RECONFIGURE: case REQUEST_CONTROL: + case REQUEST_TAKEOVER: case REQUEST_SHUTDOWN_IMMEDIATE: case RESPONSE_FORWARD_FAILED: case REQUEST_DAEMON_STATUS: diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index 08f1dcc841b359e1b4da9f4076c5b9b9c7fd8e7a..ebb731f40ae86c5299da0a647981274808cb896a 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -96,6 +96,7 @@ typedef enum { REQUEST_CONTROL, REQUEST_SET_DEBUG_LEVEL, REQUEST_HEALTH_CHECK, + REQUEST_TAKEOVER, REQUEST_BUILD_INFO = 2001, RESPONSE_BUILD_INFO, diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index d004ae76692ff681c86aec505e2f849c105907a8..3d3890cf2d5f1d13939f88613f4984879bfa44ab 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -530,6 +530,7 @@ pack_msg(slurm_msg_t const *msg, Buf buffer) case REQUEST_SHUTDOWN_IMMEDIATE: case REQUEST_PING: case REQUEST_CONTROL: + case REQUEST_TAKEOVER: case REQUEST_DAEMON_STATUS: case REQUEST_HEALTH_CHECK: case ACCOUNTING_FIRST_REG: @@ -906,6 +907,7 @@ unpack_msg(slurm_msg_t * msg, Buf buffer) case REQUEST_SHUTDOWN_IMMEDIATE: case REQUEST_PING: case REQUEST_CONTROL: + case 
REQUEST_TAKEOVER: case REQUEST_DAEMON_STATUS: case REQUEST_HEALTH_CHECK: case ACCOUNTING_FIRST_REG: diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c index ef3f5f71c8352c253ae4619c60b7ae4b8063c863..f7fb098ef5f031b8f69210bacd0a4c308cdf9cc4 100644 --- a/src/scontrol/scontrol.c +++ b/src/scontrol/scontrol.c @@ -710,6 +710,27 @@ _process_command (int argc, char *argv[]) else if (strncasecmp (tag, "show", MAX(taglen, 3)) == 0) { _show_it (argc, argv); } + else if (strncasecmp (tag, "takeover", MAX(taglen, 8)) == 0) { + char *secondary = NULL; + slurm_ctl_conf_info_msg_t *slurm_ctl_conf_ptr = NULL; + + slurm_ctl_conf_ptr = slurm_conf_lock(); + secondary = xstrdup(slurm_ctl_conf_ptr->backup_controller); + slurm_conf_unlock(); + + if ( secondary && secondary[0] != '\0' ) { + error_code = slurm_takeover(); + if (error_code) { + exit_code = 1; + if (quiet_flag != 1) + slurm_perror("slurm_takeover error"); + } + } else { + fprintf(stderr, "slurm_takeover error: no backup " + "controller defined\n"); + } + xfree(secondary); + } else if (strncasecmp (tag, "shutdown", MAX(taglen, 8)) == 0) { /* require full command name */ uint16_t options = 0; @@ -1274,6 +1295,8 @@ scontrol [<OPTION>] [<COMMAND>] \n\ show <ENTITY> [<ID>] display state of identified entity, default \n\ is all records. 
\n\ shutdown <OPTS> shutdown slurm daemons \n\ + takeover ask slurm backup controller to take over \n\ + (the primary controller will be stopped) \n\ suspend <job_id> susend specified job \n\ resume <job_id> resume previously suspended job \n\ update <SPECIFICATIONS> update job, node, partition, reservation, or \n\ diff --git a/src/slurmctld/backup.c b/src/slurmctld/backup.c index e56ca4887b439be5d71061d11d502d79b857d7ff..51eaaf10d79a12fa3fdb2478cb997171004332f5 100644 --- a/src/slurmctld/backup.c +++ b/src/slurmctld/backup.c @@ -67,15 +67,27 @@ #include "src/slurmctld/read_config.h" #include "src/slurmctld/slurmctld.h" +#ifndef VOLATILE +#if defined(__STDC__) || defined(__cplusplus) +#define VOLATILE volatile +#else +#define VOLATILE +#endif +#endif + +#define SHUTDOWN_WAIT 2 /* Time to wait for primary server shutdown */ + static int _background_process_msg(slurm_msg_t * msg); static int _backup_reconfig(void); static void * _background_rpc_mgr(void *no_data); static void * _background_signal_hand(void *no_data); static int _ping_controller(void); inline static void _update_cred_key(void); +static int _shutdown_primary_controller(int wait_time); /* Local variables */ -static bool dump_core = false; +static bool dump_core = false; +static VOLATILE bool takeover = false; /* * Static list of signals to block in this process @@ -97,6 +109,8 @@ void run_backup(void) READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; info("slurmctld running in background mode"); + takeover = false; + /* default: don't resume if shutdown */ slurmctld_config.resume_backup = false; if (xsignal_block(backup_sigarray) < 0) @@ -125,23 +139,29 @@ void run_backup(void) while (slurmctld_config.shutdown_time == 0) { sleep(1); /* Lock of slurmctld_conf below not important */ - if (slurmctld_conf.slurmctld_timeout - && (difftime(time(NULL), last_ping) < + if (slurmctld_conf.slurmctld_timeout && + (takeover == false) && + (difftime(time(NULL), last_ping) < (slurmctld_conf.slurmctld_timeout / 3))) continue; 
last_ping = time(NULL); if (_ping_controller() == 0) last_controller_response = time(NULL); - else { + else if ( takeover == true ) { + /* in takeover mode, take control as soon as */ + /* primary no longer respond */ + break; + } else { uint32_t timeout; lock_slurmctld(config_read_lock); timeout = slurmctld_conf.slurmctld_timeout; unlock_slurmctld(config_read_lock); if (difftime(time(NULL), last_controller_response) > - timeout) + timeout) { break; + } } } @@ -366,6 +386,12 @@ static int _background_process_msg(slurm_msg_t * msg) (msg->msg_type == REQUEST_SHUTDOWN)) { info("Performing RPC: REQUEST_SHUTDOWN"); pthread_kill(slurmctld_config.thread_id_sig, SIGTERM); + } else if (super_user && + (msg->msg_type == REQUEST_TAKEOVER)) { + info("Performing RPC: REQUEST_TAKEOVER"); + _shutdown_primary_controller(SHUTDOWN_WAIT); + takeover = true ; + error_code = SLURM_SUCCESS; } else if (super_user && (msg->msg_type == REQUEST_CONTROL)) { debug3("Ignoring RPC: REQUEST_CONTROL"); @@ -430,3 +456,55 @@ static int _backup_reconfig(void) slurmctld_conf.last_update = time(NULL); return SLURM_SUCCESS; } + +/* + * Tell the primary_controller to relinquish control, primary control_machine + * has to suspend operation + * Based on _shutdown_backup_controller from controller.c + * wait_time - How long to wait for primary controller to write state, seconds. 
+ * RET 0 or an error code + * NOTE: READ lock_slurmctld config before entry (or be single-threaded) + */ +static int _shutdown_primary_controller(int wait_time) +{ + int rc; + slurm_msg_t req; + + slurm_msg_t_init(&req); + if ((slurmctld_conf.control_addr == NULL) || + (slurmctld_conf.control_addr[0] == '\0')) { + error("_shutdown_primary_controller: " + "no primary controller to shutdown"); + return SLURM_ERROR; + } + + slurm_set_addr(&req.address, slurmctld_conf.slurmctld_port, + slurmctld_conf.control_addr); + + /* send request message */ + req.msg_type = REQUEST_CONTROL; + + if (slurm_send_recv_rc_msg_only_one(&req, &rc, + (CONTROL_TIMEOUT * 1000)) < 0) { + error("_shutdown_primary_controller:send/recv: %m"); + return SLURM_ERROR; + } + if (rc == ESLURM_DISABLED) + debug("primary controller responding"); + else if (rc == 0) { + debug("primary controller has relinquished control"); + } else { + error("_shutdown_primary_controller: %s", slurm_strerror(rc)); + return SLURM_ERROR; + } + + /* FIXME: Ideally the REQUEST_CONTROL RPC does not return until all + * other activity has ceased and the state has been saved. That is + * not presently the case (it returns when no other work is pending, + * so the state save should occur right away). 
We sleep for a while + * here and give the primary controller time to shutdown */ + if (wait_time) + sleep(wait_time); + + return SLURM_SUCCESS; +} diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 1675f03b85078241a41d36b9752df0a79f15c518..a0ecfa7b2f6b4f3f5393dfd311d741cde11a9b46 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -165,6 +165,7 @@ static int recover = DEFAULT_RECOVER; static pthread_cond_t server_thread_cond = PTHREAD_COND_INITIALIZER; static pid_t slurmctld_pid; static char *slurm_conf_filename; +static int primary = 1 ; /* * Static list of signals to block in this process * *Must be zero-terminated* @@ -408,6 +409,7 @@ int main(int argc, char *argv[]) (strcmp(node_name, slurmctld_conf.backup_controller) == 0)) { slurm_sched_fini(); /* make sure shutdown */ + primary = 0; run_backup(); } else if (slurmctld_conf.control_machine && (strcmp(node_name, slurmctld_conf.control_machine) @@ -427,6 +429,8 @@ int main(int argc, char *argv[]) if (recover == 0) _accounting_mark_all_nodes_down("cold-start"); + + primary = 1; } else { error("this host (%s) not valid controller (%s or %s)", @@ -541,6 +545,12 @@ int main(int argc, char *argv[]) if (slurmctld_config.resume_backup == false) break; + + /* primary controller doesn't resume backup mode */ + if ((slurmctld_config.resume_backup == true) && + (primary == 1)) + break; + recover = 2; } diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index b0ac95c64dd19ce2ecdb86971a17a496ed0ebc79..60e1643796de005013392900047998adfbfd9e8a 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -122,6 +122,7 @@ inline static void _slurm_rpc_resv_update(slurm_msg_t * msg); inline static void _slurm_rpc_resv_delete(slurm_msg_t * msg); inline static void _slurm_rpc_resv_show(slurm_msg_t * msg); inline static void _slurm_rpc_requeue(slurm_msg_t * msg); +inline static void _slurm_rpc_takeover(slurm_msg_t * msg); inline static void 
_slurm_rpc_shutdown_controller(slurm_msg_t * msg); inline static void _slurm_rpc_shutdown_controller_immediate(slurm_msg_t * msg); @@ -245,6 +246,10 @@ void slurmctld_req (slurm_msg_t * msg) _slurm_rpc_shutdown_controller(msg); /* No body to free */ break; + case REQUEST_TAKEOVER: + _slurm_rpc_takeover(msg); + /* No body to free */ + break; case REQUEST_SHUTDOWN: _slurm_rpc_shutdown_controller(msg); slurm_free_shutdown_msg(msg->data); @@ -1785,6 +1790,28 @@ static void _slurm_rpc_reconfigure_controller(slurm_msg_t * msg) } } +/* _slurm_rpc_takeover - process takeover RPC */ +static void _slurm_rpc_takeover(slurm_msg_t * msg) +{ + int error_code = SLURM_SUCCESS; + uid_t uid = g_slurm_auth_get_uid(msg->auth_cred, NULL); + + /* We could authenticate here, if desired */ + if (!validate_super_user(uid)) { + error("Security violation, TAKEOVER RPC from uid=%u", + (unsigned int) uid); + error_code = ESLURM_USER_ID_MISSING; + } else { + /* takeover is not possible in controller mode */ + /* return success */ + info("Performing RPC: REQUEST_TAKEOVER : " + "already in controller mode - skipping"); + } + + slurm_send_rc_msg(msg, error_code); + +} + /* _slurm_rpc_shutdown_controller - process RPC to shutdown slurmctld */ static void _slurm_rpc_shutdown_controller(slurm_msg_t * msg) {