diff --git a/src/api/reconfigure.c b/src/api/reconfigure.c index e349bb7cea604eb040a4cd61c7d6336948a60c96..7bf7fbaae9dbd7f100c58d6fa79ba858ef7fec60 100644 --- a/src/api/reconfigure.c +++ b/src/api/reconfigure.c @@ -99,7 +99,7 @@ slurm_reconfigure () /* slurm_shutdown - issue RPC to have slurmctld shutdown */ int -slurm_shutdown () +slurm_shutdown (uint16_t core) { int msg_size ; int rc ; @@ -107,6 +107,7 @@ slurm_shutdown () slurm_msg_t request_msg ; slurm_msg_t response_msg ; return_code_msg_t * slurm_rc_msg ; + shutdown_msg_t shutdown_msg ; /* init message connection for message communication with controller */ if ( ( sockfd = slurm_open_controller_conn ( ) ) == SLURM_SOCKET_ERROR ) { @@ -115,7 +116,9 @@ slurm_shutdown () } /* send request message */ + shutdown_msg . core = core ; request_msg . msg_type = REQUEST_SHUTDOWN ; + request_msg . data = &shutdown_msg; if ( ( rc = slurm_send_controller_msg ( sockfd , & request_msg ) ) == SLURM_SOCKET_ERROR ) { slurm_seterrno ( SLURM_COMMUNICATIONS_SEND_ERROR ); diff --git a/src/api/slurm.h b/src/api/slurm.h index a921b8897013dab7f76841e909d37180e8a9daba..23e3fbc797e8b83963817376282fa42ec70aa23b 100644 --- a/src/api/slurm.h +++ b/src/api/slurm.h @@ -125,7 +125,7 @@ extern int slurm_job_will_run (job_desc_msg_t * job_desc_msg , resource_allocati extern int slurm_reconfigure (); /* slurm_shutdown - request that slurmctld terminate gracefully */ -extern int slurm_shutdown (); +extern int slurm_shutdown (uint16_t core); /* update a job, node, or partition's configuration, root access only */ extern int slurm_update_job ( job_desc_msg_t * job_msg ) ; diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 3b7ed27abb96a02f626fa64b8cf390000e13444c..3820e48230776e60db16d20b7d4a8fecafb89b99 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -42,6 +42,11 @@ void slurm_free_last_update_msg(last_update_msg_t * msg) xfree(msg); } +void slurm_free_shutdown_msg(shutdown_msg_t * msg) +{ + xfree(msg); +} + void slurm_free_job_id_msg(job_id_msg_t * msg) { xfree(msg); diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index 43e2f6fd2bf4d2d7f19369d031bc55c86c6ddba1..42dd60fe9c608c99503d161f0eb8a851fd8c5b69 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -411,6 +411,10 @@ typedef struct kill_tasks_msg { uint32_t signal; } kill_tasks_msg_t; +typedef struct shutdown_msg { + uint16_t core; +} shutdown_msg_t; + typedef struct last_update_msg { uint32_t last_update; } last_update_msg_t; @@ -542,6 +546,7 @@ void inline slurm_free_job_step_id(job_step_id_t * msg); #define slurm_free_job_info_request_msg(msg) slurm_free_job_step_id(msg) void inline slurm_free_ctl_conf(slurm_ctl_conf_info_msg_t * build_ptr); +void inline slurm_free_shutdown_msg (shutdown_msg_t * msg); void inline slurm_free_job_desc_msg(job_desc_msg_t * msg); void inline @@ -554,8 +559,7 @@ void inline slurm_free_submit_response_response_msg(submit_response_msg_t * msg); void inline -slurm_free_node_registration_status_msg -(slurm_node_registration_status_msg_t * msg); +slurm_free_node_registration_status_msg (slurm_node_registration_status_msg_t * msg); void inline slurm_free_job_info_msg(job_info_msg_t * msg); void inline slurm_free_job_info(job_info_t * job); diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index b9ed180d0e6f08a7fa0885e6c3d992c6bc3b5cdc..52435b99632496ca96bcad90a9e10db3de150fc8 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -134,15 +134,17 @@ int pack_msg ( slurm_msg_t const * msg , char ** buffer , uint32_t * buf_len ) case REQUEST_SUBMIT_BATCH_JOB : case REQUEST_IMMEDIATE_RESOURCE_ALLOCATION : case REQUEST_JOB_WILL_RUN : - case REQUEST_ALLOCATION_AND_RUN_JOB_STEP : + case REQUEST_ALLOCATION_AND_RUN_JOB_STEP : pack_job_desc ( (job_desc_msg_t * ) msg -> data , ( void ** ) buffer , buf_len ) ; break ; case REQUEST_NODE_REGISTRATION_STATUS : case REQUEST_RECONFIGURE : - case REQUEST_SHUTDOWN : case REQUEST_SHUTDOWN_IMMEDIATE : /* Message contains no body/information */ break ; + case REQUEST_SHUTDOWN : + pack_shutdown_msg ( (shutdown_msg_t *) msg -> data, ( void ** ) buffer , buf_len ) ; + break; case RESPONSE_SUBMIT_BATCH_JOB: pack_submit_response_msg ( ( submit_response_msg_t * ) msg -> data , ( void ** ) buffer , buf_len ) ; break ; @@ -287,10 +289,12 @@ int unpack_msg ( slurm_msg_t * msg , char ** buffer , uint32_t * buf_len ) break ; case REQUEST_NODE_REGISTRATION_STATUS : case REQUEST_RECONFIGURE : - case REQUEST_SHUTDOWN : case REQUEST_SHUTDOWN_IMMEDIATE : /* Message contains no body/information */ break ; + case REQUEST_SHUTDOWN : + unpack_shutdown_msg ( ( shutdown_msg_t **) & ( msg-> data ), ( void ** ) buffer , buf_len ) ; + break ; case RESPONSE_SUBMIT_BATCH_JOB : unpack_submit_response_msg ( ( submit_response_msg_t ** ) & ( msg -> data ) , ( void ** ) buffer , buf_len ) ; break ; @@ -1406,7 +1410,7 @@ int unpack_cancel_tasks_msg ( kill_tasks_msg_t ** msg_ptr , void ** buffer , uin kill_tasks_msg_t * msg ; msg = xmalloc ( sizeof ( kill_tasks_msg_t ) ) ; - if ( msg == NULL) + if ( msg == NULL) { *msg_ptr = NULL ; return ENOMEM ; @@ -1419,6 +1423,27 @@ int unpack_cancel_tasks_msg ( kill_tasks_msg_t ** msg_ptr , void ** buffer , uin return 0 ; } +void pack_shutdown_msg ( shutdown_msg_t * msg , void ** buffer , uint32_t * length ) +{ + pack16 ( msg -> core , buffer , length ) ; +} + +int unpack_shutdown_msg ( shutdown_msg_t ** msg_ptr , void ** buffer , uint32_t * length ) +{ + shutdown_msg_t * msg ; + + msg = xmalloc ( sizeof ( shutdown_msg_t ) ) ; + if ( msg == NULL) + { + *msg_ptr = NULL ; + return ENOMEM ; + } + + unpack16 ( & msg -> core , buffer , length ) ; + *msg_ptr = msg ; + return 0 ; +} + void pack_job_step_id ( job_step_id_t * msg , void ** buffer , uint32_t * length ) { pack32 ( msg -> last_update , buffer , length ) ; diff --git a/src/common/slurm_protocol_pack.h b/src/common/slurm_protocol_pack.h index 17c2da249895a4e514937c55b41c0e66df2c2331..b105f05145ff7588390213312f5b3074894a81d2 100644 --- a/src/common/slurm_protocol_pack.h +++ b/src/common/slurm_protocol_pack.h @@ -121,6 +121,9 @@ int unpack_partition_table_msg ( partition_desc_msg_t ** msg_ptr , void ** buff void pack_update_partition_msg ( update_part_msg_t * msg , void ** buffer, uint32_t * length ); int unpack_update_partition_msg ( update_part_msg_t ** msg_ptr , void ** buffer, uint32_t * length ); +void pack_shutdown_msg ( shutdown_msg_t * msg , void ** buffer, uint32_t * length ); +int unpack_shutdown_msg ( shutdown_msg_t ** msg_ptr , void ** buffer, uint32_t * length ); + void pack_launch_tasks_request_msg ( launch_tasks_request_msg_t * msg , void ** buffer , uint32_t * length ); int unpack_launch_tasks_request_msg ( launch_tasks_request_msg_t ** msg_ptr , void ** buffer , uint32_t * length ); diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c index 696e3bdf8bb11bae4e26f4ca53e5ab1be4e212f2..0cdfb75aec59d0c7b4823d3bbd04fa532d57cc0f 100644 --- a/src/scontrol/scontrol.c +++ b/src/scontrol/scontrol.c @@ -542,6 +542,14 @@ process_command (int argc, char *argv[]) if (quiet_flag == -1) fprintf(stderr, "no input"); } + else if (strncasecmp (argv[0], "abort", 5) == 0) { + if (argc > 2) + fprintf (stderr, + "too many arguments for keyword:%s\n", argv[0]); + error_code = slurm_shutdown (1); + if ((error_code != 0) && (quiet_flag != 1)) + slurm_perror ("slurm_shutdown error"); + } else if ((strcasecmp (argv[0], "exit") == 0) || (strcasecmp (argv[0], "quit") == 0)) { if (argc > 1) @@ -566,7 +574,7 @@ process_command (int argc, char *argv[]) fprintf (stderr, "too many arguments for keyword:%s\n", argv[0]); error_code = slurm_reconfigure (); if ((error_code != 0) && (quiet_flag != 1)) - fprintf (stderr, "error %d from reconfigure\n", error_code); + fprintf (stderr, "error from reconfigure %s\n", slurm_strerror (error_code)); } else if (strcasecmp (argv[0], "show") == 0) { @@ -619,10 +627,9 @@ process_command (int argc, char *argv[]) if (argc > 2) fprintf (stderr, "too many arguments for keyword:%s\n", argv[0]); - error_code = slurm_shutdown (); + error_code = slurm_shutdown (0); if ((error_code != 0) && (quiet_flag != 1)) - fprintf (stderr, "error %d from shutdown\n", error_code); - + slurm_perror ("slurm_shutdown error"); } else if (strcasecmp (argv[0], "update") == 0) { if (argc < 2) { @@ -918,6 +925,7 @@ usage () { printf (" <keyword> may be omitted from the execute line and scontrol will execute in interactive\n"); printf (" mode. It will process commands as entered until explicitly terminated.\n"); printf (" Valid <COMMAND> values are:\n"); + printf (" abort shutdown slurm controller immediately generating a core file.\n"); printf (" exit terminate this command.\n"); printf (" help print this description of use.\n"); printf (" quiet print no messages other than error messages.\n"); diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 27b8d49d91eaa3c35947bad7e16f3760662cf192..7f239e9ce5593122e16adc754af23a8b523780a3 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -82,7 +82,7 @@ inline static void save_all_state ( void ); void *slurmctld_background ( void * no_data ); void *slurmctld_signal_hand ( void * no_data ); void *slurmctld_rpc_mgr( void * no_data ); -int slurm_shutdown ( void ); +inline static int slurmctld_shutdown ( void ); void * service_connection ( void * arg ); void usage (char *prog_name); @@ -99,7 +99,8 @@ inline static void slurm_rpc_job_step_get_info ( slurm_msg_t * msg ) ; inline static void slurm_rpc_job_will_run ( slurm_msg_t * msg ) ; inline static void slurm_rpc_node_registration ( slurm_msg_t * msg ) ; inline static void slurm_rpc_reconfigure_controller ( slurm_msg_t * msg ) ; -inline static void slurm_rpc_shutdown_controller ( slurm_msg_t * msg, int response ); +inline static void slurm_rpc_shutdown_controller ( slurm_msg_t * msg ); +inline static void slurm_rpc_shutdown_controller_immediate ( slurm_msg_t * msg ); inline static void slurm_rpc_submit_batch_job ( slurm_msg_t * msg ) ; inline static void slurm_rpc_update_job ( slurm_msg_t * msg ) ; inline static void slurm_rpc_update_node ( slurm_msg_t * msg ) ; @@ -136,6 +137,10 @@ main (int argc, char *argv[]) if ( ( error_code = read_slurm_conf (recover)) ) fatal ("read_slurm_conf error %d reading %s", error_code, SLURM_CONFIG_FILE); + if (daemonize) { + if (chdir (slurmctld_conf.state_save_location)) + fatal ("chdir to %s error %m", slurmctld_conf.state_save_location); + } if ( ( error_code = getnodename (node_name, MAX_NAME_LEN) ) ) fatal ("getnodename error %d", error_code); @@ -218,7 +223,7 @@ slurmctld_signal_hand ( void * no_data ) info ("Terminate signal (SIGINT or SIGTERM) received\n"); shutdown_time = time (NULL); /* send REQUEST_SHUTDOWN_IMMEDIATE RPC */ - slurm_shutdown (); + slurmctld_shutdown (); /* ssl clean up */ slurm_destroy_ssl_key_ctx ( & sign_ctx ) ; slurm_ssl_destroy ( ) ; @@ -546,10 +551,11 @@ slurmctld_req ( slurm_msg_t * msg ) slurm_rpc_reconfigure_controller ( msg ) ; break; case REQUEST_SHUTDOWN: - slurm_rpc_shutdown_controller ( msg , 1 ) ; + slurm_rpc_shutdown_controller ( msg ) ; + slurm_free_shutdown_msg ( msg -> data ) ; break; case REQUEST_SHUTDOWN_IMMEDIATE: - slurm_rpc_shutdown_controller ( msg , 0 ) ; + slurm_rpc_shutdown_controller_immediate ( msg ) ; break; case REQUEST_UPDATE_JOB: slurm_rpc_update_job ( msg ) ; @@ -1289,6 +1295,11 @@ slurm_rpc_reconfigure_controller ( slurm_msg_t * msg ) error_code = read_slurm_conf (0); if (error_code == 0) reset_job_bitmaps (); + + if (daemonize) { + if (chdir (slurmctld_conf.state_save_location)) + fatal ("chdir to %s error %m", slurmctld_conf.state_save_location); + } unlock_slurmctld (config_write_lock); /* return result */ @@ -1311,33 +1322,44 @@ slurm_rpc_reconfigure_controller ( slurm_msg_t * msg ) /* slurm_rpc_shutdown_controller - process RPC to shutdown slurmctld */ void -slurm_rpc_shutdown_controller ( slurm_msg_t * msg, int response ) +slurm_rpc_shutdown_controller ( slurm_msg_t * msg ) { + shutdown_msg_t * shutdown_msg = (shutdown_msg_t *) msg->data; /* must be user root */ /* do RPC call */ - if (response) - debug ("Performing RPC: REQUEST_SHUTDOWN"); - else - debug ("Performing RPC: REQUEST_SHUTDOWN_IMMEDIATE"); + debug ("Performing RPC: REQUEST_SHUTDOWN"); - if (shutdown_time) + if (shutdown_msg->core) + debug3 ("performing immeditate shutdown without state save"); + else if (shutdown_time) debug3 ("slurm_rpc_shutdown_controller RPC issued after shutdown in progress"); else if (thread_id_sig) { pthread_kill (thread_id_sig, SIGTERM); /* tell master to clean-up */ info ("slurm_rpc_shutdown_controller completed successfully"); - } else { + } + else { error ("thread_id_sig undefined, doing shutdown the hard way"); shutdown_time = time (NULL); /* send REQUEST_SHUTDOWN_IMMEDIATE RPC */ - slurm_shutdown (); + slurmctld_shutdown (); } - if (response) - slurm_send_rc_msg ( msg , SLURM_SUCCESS ); + slurm_send_rc_msg ( msg , SLURM_SUCCESS ); + if (shutdown_msg->core) + fatal ("Aborting per RPC request"); } +/* slurm_rpc_shutdown_controller_immediate - process RPC to shutdown slurmctld */ +void +slurm_rpc_shutdown_controller_immediate ( slurm_msg_t * msg ) +{ +/* must be user root */ + /* do RPC call */ + debug ("Performing RPC: REQUEST_SHUTDOWN_IMMEDIATE"); + /* No op: just used to knock loose accept RPC thread */ +} /* slurm_rpc_create_job_step - process RPC to creates/registers a job step with the step_mgr */ void slurm_rpc_job_step_create( slurm_msg_t* msg ) @@ -1435,11 +1457,11 @@ slurm_rpc_node_registration ( slurm_msg_t * msg ) } /* - * slurm_shutdown - issue RPC to have slurmctld shutdown, + * slurmctld_shutdown - issue RPC to have slurmctld shutdown, * knocks loose an slurm_accept_msg_conn() if we have a thread hung there */ int -slurm_shutdown () +slurmctld_shutdown () { int rc ; slurm_fd sockfd ; diff --git a/src/slurmd/slurmd.c b/src/slurmd/slurmd.c index 59fcf6ab2db34d44c93a2913356d1f39e6c0d999..aaa2dd5d7854dd8329a461ce210244c2dbd4d20d 100644 --- a/src/slurmd/slurmd.c +++ b/src/slurmd/slurmd.c @@ -80,6 +80,8 @@ slurmd_config_t slurmd_conf; /* function prototypes */ static char *public_cert_filename(); +inline static void reset_cwd(void); +inline static char *state_save_location (void); static void slurmd_req(slurm_msg_t * msg); static void *slurmd_msg_engine(void *args); inline static int send_node_registration_status_msg(); @@ -120,15 +122,9 @@ int main(int argc, char *argv[]) if (slurmd_conf.daemonize == true) { daemon(false, true); + reset_cwd(); } -/* - if ( ( rc = init_slurm_conf () ) ) - fatal ("slurmd: init_slurm_conf error %d", rc); - if ( ( rc = read_slurm_conf ( ) ) ) - fatal ("slurmd: error %d from read_slurm_conf reading %s", rc, SLURM_CONFIG_FILE); -*/ - /* shared memory init */ slurmd_init(); @@ -191,7 +187,8 @@ void *slurmd_handle_signals(void *args) break; case SIGHUP: /* kill -1 */ info("Reconfigure signal (SIGHUP) received\n"); - //error_code = read_slurm_conf ( ); + if (slurmd_conf.daemonize == true) + reset_cwd(); break; default: error("Invalid signal (%d) received", sig); @@ -773,3 +770,83 @@ int parse_commandline_args(int argc, char **argv, } return SLURM_SUCCESS; } + +/* reset_cwd - reset the current working directory per slurm configuration file + * this makes the core file go to StateSaveLocation if a daemon */ +void +reset_cwd(void) +{ + char *dir; + + dir = state_save_location (); + if (dir == NULL) + error ("No state save location specified in configuration file"); + else { + if (chdir (dir)) + error ("chdir to %s error %m", dir); +debug ("chdir %s", dir); + xfree (dir); + } +} + +/* state_save_location - returns the value of StateSaveLocation from the slurm configuration file + * NOTE: The caller must xfree the return value */ +char * +state_save_location (void) +{ + FILE *slurm_spec_file; + char in_line[BUF_SIZE]; /* input line */ + char *dir = NULL; + int i, j, error_code, line_num = 0; + + slurm_spec_file = fopen (SLURM_CONFIG_FILE, "r"); + if (slurm_spec_file == NULL) { + error ( "state_save_location error %d opening file %s: %m", + errno, SLURM_CONFIG_FILE); + return NULL ; + } + + while (fgets (in_line, BUF_SIZE, slurm_spec_file) != NULL) { + line_num++; + if (strlen (in_line) >= (BUF_SIZE - 1)) { + error ("state_save_location line %d, of input file %s too long\n", + line_num, SLURM_CONFIG_FILE); + fclose (slurm_spec_file); + return NULL; + } + + /* everything after a non-escaped "#" is a comment */ + /* replace comment flag "#" with an end of string (NULL) */ + for (i = 0; i < BUF_SIZE; i++) { + if (in_line[i] == (char) NULL) + break; + if (in_line[i] != '#') + continue; + if ((i > 0) && (in_line[i - 1] == '\\')) { /* escaped "#" */ + for (j = i; j < BUF_SIZE; j++) { + in_line[j - 1] = in_line[j]; + } + continue; + } + in_line[i] = (char) NULL; + break; + } + + /* parse what is left */ + /* overall slurm configuration parameters */ + error_code = slurm_parser(in_line, + "StateSaveLocation=", 's', &dir, + "END"); + if (error_code) { + error ("error parsing configuration file input line %d", line_num); + fclose (slurm_spec_file); + return NULL; + } + + if ( dir ) { + fclose (slurm_spec_file); + return dir; + } + } + return NULL; +}