From e6f525e64accd0da4b3e63d59f27bac1cf52ba04 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Sat, 14 Sep 2002 00:11:11 +0000
Subject: [PATCH] Support abort RPC, generate core file in save state directory
 (per configuration file for all daemons).

---
 src/api/reconfigure.c            |  5 +-
 src/api/slurm.h                  |  2 +-
 src/common/slurm_protocol_defs.c |  5 ++
 src/common/slurm_protocol_defs.h |  8 ++-
 src/common/slurm_protocol_pack.c | 33 ++++++++++--
 src/common/slurm_protocol_pack.h |  3 ++
 src/scontrol/scontrol.c          | 16 ++++--
 src/slurmctld/controller.c       | 56 +++++++++++++------
 src/slurmd/slurmd.c              | 93 +++++++++++++++++++++++++++++---
 9 files changed, 184 insertions(+), 37 deletions(-)

diff --git a/src/api/reconfigure.c b/src/api/reconfigure.c
index e349bb7cea6..7bf7fbaae9d 100644
--- a/src/api/reconfigure.c
+++ b/src/api/reconfigure.c
@@ -99,7 +99,7 @@ slurm_reconfigure ()
 
 /* slurm_shutdown - issue RPC to have slurmctld shutdown */
 int
-slurm_shutdown ()
+slurm_shutdown (uint16_t core)
 {
 	int msg_size ;
 	int rc ;
@@ -107,6 +107,7 @@ slurm_shutdown ()
 	slurm_msg_t request_msg ;
 	slurm_msg_t response_msg ;
 	return_code_msg_t * slurm_rc_msg ;
+	shutdown_msg_t shutdown_msg ;
 
         /* init message connection for message communication with controller */
 	if ( ( sockfd = slurm_open_controller_conn ( ) ) == SLURM_SOCKET_ERROR ) {
@@ -115,7 +116,9 @@ slurm_shutdown ()
 	}
 
 	/* send request message */
+	shutdown_msg . core = core ;
 	request_msg . msg_type = REQUEST_SHUTDOWN ;
+	request_msg . data = &shutdown_msg;
 
 	if ( ( rc = slurm_send_controller_msg ( sockfd , & request_msg ) ) == SLURM_SOCKET_ERROR ) {
 		slurm_seterrno ( SLURM_COMMUNICATIONS_SEND_ERROR );
diff --git a/src/api/slurm.h b/src/api/slurm.h
index a921b889701..23e3fbc797e 100644
--- a/src/api/slurm.h
+++ b/src/api/slurm.h
@@ -125,7 +125,7 @@ extern int slurm_job_will_run (job_desc_msg_t * job_desc_msg , resource_allocati
 extern int slurm_reconfigure ();
 
 /* slurm_shutdown - request that slurmctld terminate gracefully */
-extern int slurm_shutdown ();
+extern int slurm_shutdown (uint16_t core);
 
 /* update a job, node, or partition's configuration, root access only */ 
 extern int slurm_update_job ( job_desc_msg_t * job_msg ) ;
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index 3b7ed27abb9..3820e482307 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -42,6 +42,11 @@ void slurm_free_last_update_msg(last_update_msg_t * msg)
 	xfree(msg);
 }
 
+void slurm_free_shutdown_msg(shutdown_msg_t * msg)
+{
+	xfree(msg);	/* shutdown_msg_t holds only a uint16_t, so there is nothing nested to release */
+}
+
 void slurm_free_job_id_msg(job_id_msg_t * msg)
 {
 	xfree(msg);
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index 43e2f6fd2bf..42dd60fe9c6 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -411,6 +411,10 @@ typedef struct kill_tasks_msg {
 	uint32_t signal;
 } kill_tasks_msg_t;
 
+typedef struct shutdown_msg {
+	uint16_t core;	/* scontrol sends 1 for "abort", 0 for "shutdown"; non-zero makes slurmctld reply and then call fatal() */
+} shutdown_msg_t;
+
 typedef struct last_update_msg {
 	uint32_t last_update;
 } last_update_msg_t;
@@ -542,6 +546,7 @@ void inline slurm_free_job_step_id(job_step_id_t * msg);
 #define slurm_free_job_info_request_msg(msg) slurm_free_job_step_id(msg)
 
 void inline slurm_free_ctl_conf(slurm_ctl_conf_info_msg_t * build_ptr);
+void inline slurm_free_shutdown_msg (shutdown_msg_t * msg);
 
 void inline slurm_free_job_desc_msg(job_desc_msg_t * msg);
 void inline
@@ -554,8 +559,7 @@ void inline slurm_free_submit_response_response_msg(submit_response_msg_t *
 						    msg);
 
 void inline
-slurm_free_node_registration_status_msg
-(slurm_node_registration_status_msg_t * msg);
+slurm_free_node_registration_status_msg (slurm_node_registration_status_msg_t * msg);
 
 void inline slurm_free_job_info_msg(job_info_msg_t * msg);
 void inline slurm_free_job_info(job_info_t * job);
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index b9ed180d0e6..52435b99632 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -134,15 +134,17 @@ int pack_msg ( slurm_msg_t const * msg , char ** buffer , uint32_t * buf_len )
 		case REQUEST_SUBMIT_BATCH_JOB :
 		case REQUEST_IMMEDIATE_RESOURCE_ALLOCATION : 
 		case REQUEST_JOB_WILL_RUN : 
-		case REQUEST_ALLOCATION_AND_RUN_JOB_STEP : 
+		case REQUEST_ALLOCATION_AND_RUN_JOB_STEP :
 			pack_job_desc ( (job_desc_msg_t * )  msg -> data , ( void ** ) buffer , buf_len )  ;
 			break ;
 		case REQUEST_NODE_REGISTRATION_STATUS :
 		case REQUEST_RECONFIGURE :
-		case REQUEST_SHUTDOWN :
 		case REQUEST_SHUTDOWN_IMMEDIATE :
 			/* Message contains no body/information */
 			break ;
+		case REQUEST_SHUTDOWN :
+			pack_shutdown_msg ( (shutdown_msg_t *) msg -> data, ( void ** ) buffer , buf_len )  ;
+			break;
 		case RESPONSE_SUBMIT_BATCH_JOB:
 			pack_submit_response_msg ( ( submit_response_msg_t * ) msg -> data , ( void ** ) buffer , buf_len ) ;
 			break ;
@@ -287,10 +289,12 @@ int unpack_msg ( slurm_msg_t * msg , char ** buffer , uint32_t * buf_len )
 			break ;
 		case REQUEST_NODE_REGISTRATION_STATUS :
 		case REQUEST_RECONFIGURE :
-		case REQUEST_SHUTDOWN :
 		case REQUEST_SHUTDOWN_IMMEDIATE :
 			/* Message contains no body/information */
 			break ;
+		case REQUEST_SHUTDOWN :
+			unpack_shutdown_msg ( ( shutdown_msg_t **) & ( msg-> data ), ( void ** ) buffer , buf_len ) ;
+			break ;
 		case RESPONSE_SUBMIT_BATCH_JOB :
 			unpack_submit_response_msg ( ( submit_response_msg_t ** ) & ( msg -> data ) , ( void ** ) buffer , buf_len ) ;
 			break ;
@@ -1406,7 +1410,7 @@ int unpack_cancel_tasks_msg ( kill_tasks_msg_t ** msg_ptr , void ** buffer , uin
 	kill_tasks_msg_t * msg ;
 
 	msg = xmalloc ( sizeof ( kill_tasks_msg_t ) ) ;
-	if ( msg == NULL) 
+	if ( msg == NULL)
 	{
 		*msg_ptr = NULL ;
 		return ENOMEM ;
@@ -1419,6 +1423,27 @@ int unpack_cancel_tasks_msg ( kill_tasks_msg_t ** msg_ptr , void ** buffer , uin
 	return 0 ;
 }
 
+void pack_shutdown_msg ( shutdown_msg_t * msg , void ** buffer , uint32_t * length )
+{
+	pack16 ( msg -> core , buffer , length ) ;	/* the core flag is the entire message body */
+}
+
+/* unpack_shutdown_msg - allocate a shutdown_msg_t and read its core flag from buffer
+ *	stores the new message (or NULL) in *msg_ptr; returns 0 on success, ENOMEM if xmalloc fails
+ *	NOTE: the message is released with slurm_free_shutdown_msg */
+int unpack_shutdown_msg ( shutdown_msg_t ** msg_ptr , void ** buffer , uint32_t * length )
+{
+	shutdown_msg_t * new_msg ;
+
+	new_msg = xmalloc ( sizeof ( * new_msg ) ) ;
+	*msg_ptr = new_msg ;	/* NULL when the allocation failed */
+	if ( new_msg == NULL )
+		return ENOMEM ;
+
+	unpack16 ( & new_msg -> core , buffer , length ) ;
+	return 0 ;
+}
+
 void pack_job_step_id ( job_step_id_t * msg , void ** buffer , uint32_t * length )
 {
 	pack32 ( msg -> last_update , buffer , length ) ;
diff --git a/src/common/slurm_protocol_pack.h b/src/common/slurm_protocol_pack.h
index 17c2da24989..b105f05145f 100644
--- a/src/common/slurm_protocol_pack.h
+++ b/src/common/slurm_protocol_pack.h
@@ -121,6 +121,9 @@ int unpack_partition_table_msg ( partition_desc_msg_t **  msg_ptr , void ** buff
 void pack_update_partition_msg ( update_part_msg_t * msg , void ** buffer, uint32_t * length  );
 int unpack_update_partition_msg ( update_part_msg_t ** msg_ptr , void ** buffer, uint32_t * length  );
 
+void pack_shutdown_msg ( shutdown_msg_t * msg , void ** buffer, uint32_t * length  );
+int unpack_shutdown_msg ( shutdown_msg_t ** msg_ptr , void ** buffer, uint32_t * length  );
+
 void pack_launch_tasks_request_msg ( launch_tasks_request_msg_t * msg , void ** buffer , uint32_t * length );
 int unpack_launch_tasks_request_msg ( launch_tasks_request_msg_t ** msg_ptr , void ** buffer , uint32_t * length );
 
diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c
index 696e3bdf8bb..0cdfb75aec5 100644
--- a/src/scontrol/scontrol.c
+++ b/src/scontrol/scontrol.c
@@ -542,6 +542,14 @@ process_command (int argc, char *argv[])
 		if (quiet_flag == -1)
 			fprintf(stderr, "no input");
 	}
+	else if (strncasecmp (argv[0], "abort", 5) == 0) {
+		if (argc > 2)
+			fprintf (stderr,
+				 "too many arguments for keyword:%s\n", argv[0]);
+		error_code = slurm_shutdown (1);
+		if ((error_code != 0) && (quiet_flag != 1))
+			slurm_perror ("slurm_shutdown error");
+	}
 	else if ((strcasecmp (argv[0], "exit") == 0) ||
 	         (strcasecmp (argv[0], "quit") == 0)) {
 		if (argc > 1)
@@ -566,7 +574,7 @@ process_command (int argc, char *argv[])
 			fprintf (stderr, "too many arguments for keyword:%s\n", argv[0]);
 		error_code = slurm_reconfigure ();
 		if ((error_code != 0) && (quiet_flag != 1))
-			fprintf (stderr, "error %d from reconfigure\n", error_code);
+			fprintf (stderr, "error from reconfigure %s\n", slurm_strerror (error_code));
 
 	}
 	else if (strcasecmp (argv[0], "show") == 0) {
@@ -619,10 +627,9 @@ process_command (int argc, char *argv[])
 		if (argc > 2)
 			fprintf (stderr,
 				 "too many arguments for keyword:%s\n", argv[0]);
-		error_code = slurm_shutdown ();
+		error_code = slurm_shutdown (0);
 		if ((error_code != 0) && (quiet_flag != 1))
-			fprintf (stderr, "error %d from shutdown\n", error_code);
-
+			slurm_perror ("slurm_shutdown error");
 	}
 	else if (strcasecmp (argv[0], "update") == 0) {
 		if (argc < 2) {
@@ -918,6 +925,7 @@ usage () {
 	printf ("  <keyword> may be omitted from the execute line and scontrol will execute in interactive\n");
 	printf ("    mode. It will process commands as entered until explicitly terminated.\n");
 	printf ("    Valid <COMMAND> values are:\n");
+	printf ("     abort                    shutdown slurm controller immediately generating a core file.\n");
 	printf ("     exit                     terminate this command.\n");
 	printf ("     help                     print this description of use.\n");
 	printf ("     quiet                    print no messages other than error messages.\n");
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 27b8d49d91e..7f239e9ce55 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -82,7 +82,7 @@ inline static void save_all_state ( void );
 void *slurmctld_background ( void * no_data );
 void *slurmctld_signal_hand ( void * no_data );
 void *slurmctld_rpc_mgr( void * no_data );
-int slurm_shutdown ( void );
+inline static int slurmctld_shutdown ( void );
 void * service_connection ( void * arg );
 void usage (char *prog_name);
 
@@ -99,7 +99,8 @@ inline static void slurm_rpc_job_step_get_info ( slurm_msg_t * msg ) ;
 inline static void slurm_rpc_job_will_run ( slurm_msg_t * msg ) ;
 inline static void slurm_rpc_node_registration ( slurm_msg_t * msg ) ;
 inline static void slurm_rpc_reconfigure_controller ( slurm_msg_t * msg ) ;
-inline static void slurm_rpc_shutdown_controller ( slurm_msg_t * msg, int response );
+inline static void slurm_rpc_shutdown_controller ( slurm_msg_t * msg );
+inline static void slurm_rpc_shutdown_controller_immediate ( slurm_msg_t * msg );
 inline static void slurm_rpc_submit_batch_job ( slurm_msg_t * msg ) ;
 inline static void slurm_rpc_update_job ( slurm_msg_t * msg ) ;
 inline static void slurm_rpc_update_node ( slurm_msg_t * msg ) ;
@@ -136,6 +137,10 @@ main (int argc, char *argv[])
 
 	if ( ( error_code = read_slurm_conf (recover)) ) 
 		fatal ("read_slurm_conf error %d reading %s", error_code, SLURM_CONFIG_FILE);
+	if (daemonize) {
+		if (chdir (slurmctld_conf.state_save_location))
+			fatal ("chdir to %s error %m", slurmctld_conf.state_save_location);
+	}
 	if ( ( error_code = getnodename (node_name, MAX_NAME_LEN) ) ) 
 		fatal ("getnodename error %d", error_code);
 
@@ -218,7 +223,7 @@ slurmctld_signal_hand ( void * no_data )
 				info ("Terminate signal (SIGINT or SIGTERM) received\n");
 				shutdown_time = time (NULL);
 				/* send REQUEST_SHUTDOWN_IMMEDIATE RPC */
-				slurm_shutdown ();
+				slurmctld_shutdown ();
 				/* ssl clean up */
 				slurm_destroy_ssl_key_ctx ( & sign_ctx ) ;
 				slurm_ssl_destroy ( ) ;
@@ -546,10 +551,11 @@ slurmctld_req ( slurm_msg_t * msg )
 			slurm_rpc_reconfigure_controller ( msg ) ;
 			break;
 		case REQUEST_SHUTDOWN:
-			slurm_rpc_shutdown_controller ( msg , 1 ) ;
+			slurm_rpc_shutdown_controller ( msg ) ;
+			slurm_free_shutdown_msg ( msg -> data ) ;
 			break;
 		case REQUEST_SHUTDOWN_IMMEDIATE:
-			slurm_rpc_shutdown_controller ( msg , 0 ) ;
+			slurm_rpc_shutdown_controller_immediate ( msg ) ;
 			break;
 		case REQUEST_UPDATE_JOB:
 			slurm_rpc_update_job ( msg ) ;
@@ -1289,6 +1295,11 @@ slurm_rpc_reconfigure_controller ( slurm_msg_t * msg )
 	error_code = read_slurm_conf (0);
 	if (error_code == 0)
 		reset_job_bitmaps ();
+
+	if (daemonize) {
+		if (chdir (slurmctld_conf.state_save_location))
+			fatal ("chdir to %s error %m", slurmctld_conf.state_save_location);
+	}
 	unlock_slurmctld (config_write_lock);
 
 	/* return result */
@@ -1311,33 +1322,44 @@ slurm_rpc_reconfigure_controller ( slurm_msg_t * msg )
 
 /* slurm_rpc_shutdown_controller - process RPC to shutdown slurmctld */
 void 
-slurm_rpc_shutdown_controller ( slurm_msg_t * msg, int response )
+slurm_rpc_shutdown_controller ( slurm_msg_t * msg )
 {
+	shutdown_msg_t * shutdown_msg = (shutdown_msg_t *) msg->data;	/* request body; core != 0 selects abort */
 /* must be user root */
 
 	/* do RPC call */
-	if (response)
-		debug ("Performing RPC: REQUEST_SHUTDOWN");
-	else
-		debug ("Performing RPC: REQUEST_SHUTDOWN_IMMEDIATE");
+	debug ("Performing RPC: REQUEST_SHUTDOWN");
 
-	if (shutdown_time)
+	if (shutdown_msg->core)	/* abort: skip the orderly SIGTERM path entirely */
+		debug3 ("performing immediate shutdown without state save");
+	else if (shutdown_time)
 		debug3 ("slurm_rpc_shutdown_controller RPC issued after shutdown in progress");
 	else if (thread_id_sig) {
 		pthread_kill (thread_id_sig, SIGTERM);	/* tell master to clean-up */
 		info ("slurm_rpc_shutdown_controller completed successfully");
-	} else {
+	}
+	else {
 		error ("thread_id_sig undefined, doing shutdown the hard way");
 		shutdown_time = time (NULL);
 		/* send REQUEST_SHUTDOWN_IMMEDIATE RPC */
-		slurm_shutdown ();
+		slurmctld_shutdown ();
 	}
 
-	if (response)
-		slurm_send_rc_msg ( msg , SLURM_SUCCESS );
+	slurm_send_rc_msg ( msg , SLURM_SUCCESS );	/* reply first so the requester gets an answer even when aborting */
+	if (shutdown_msg->core)	/* NOTE(review): presumably fatal() ends the daemon; confirm it actually dumps core */
+		fatal ("Aborting per RPC request");
 }
 
+/* slurm_rpc_shutdown_controller_immediate - handle REQUEST_SHUTDOWN_IMMEDIATE: log it and return, no reply is sent */
+void 
+slurm_rpc_shutdown_controller_immediate ( slurm_msg_t * msg )
+{
+/* must be user root - NOTE(review): no uid check is performed in this function */
 
+	/* do RPC call */
+	debug ("Performing RPC: REQUEST_SHUTDOWN_IMMEDIATE");
+	/* No op: the RPC is only used to knock loose the thread waiting to accept an RPC */
+}
 /* slurm_rpc_create_job_step - process RPC to creates/registers a job step with the step_mgr */
 void 
 slurm_rpc_job_step_create( slurm_msg_t* msg )
@@ -1435,11 +1457,11 @@ slurm_rpc_node_registration ( slurm_msg_t * msg )
 }
 
 /*
- * slurm_shutdown - issue RPC to have slurmctld shutdown, 
+ * slurmctld_shutdown - issue RPC to have slurmctld shutdown, 
  *	knocks loose an slurm_accept_msg_conn() if we have a thread hung there
  */
 int
-slurm_shutdown ()
+slurmctld_shutdown ()
 {
 	int rc ;
 	slurm_fd sockfd ;
diff --git a/src/slurmd/slurmd.c b/src/slurmd/slurmd.c
index 59fcf6ab2db..aaa2dd5d785 100644
--- a/src/slurmd/slurmd.c
+++ b/src/slurmd/slurmd.c
@@ -80,6 +80,8 @@ slurmd_config_t slurmd_conf;
 
 /* function prototypes */
 static char *public_cert_filename();
+inline static void reset_cwd(void);
+inline static char *state_save_location (void);
 static void slurmd_req(slurm_msg_t * msg);
 static void *slurmd_msg_engine(void *args);
 inline static int send_node_registration_status_msg();
@@ -120,15 +122,9 @@ int main(int argc, char *argv[])
 
 	if (slurmd_conf.daemonize == true) {
 		daemon(false, true);
+		reset_cwd();
 	}
 
-/*
-	if ( ( rc = init_slurm_conf () ) ) 
-		fatal ("slurmd: init_slurm_conf error %d", rc);
-	if ( ( rc = read_slurm_conf ( ) ) ) 
-		fatal ("slurmd: error %d from read_slurm_conf reading %s", rc, SLURM_CONFIG_FILE);
-*/
-
 	/* shared memory init */
 	slurmd_init();
 
@@ -191,7 +187,8 @@ void *slurmd_handle_signals(void *args)
 			break;
 		case SIGHUP:	/* kill -1 */
 			info("Reconfigure signal (SIGHUP) received\n");
-			//error_code = read_slurm_conf ( );
+			if (slurmd_conf.daemonize == true)
+				reset_cwd();
 			break;
 		default:
 			error("Invalid signal (%d) received", sig);
@@ -773,3 +770,83 @@ int parse_commandline_args(int argc, char **argv,
 	}
 	return SLURM_SUCCESS;
 }
+
+/* reset_cwd - change the working directory to StateSaveLocation from the slurm configuration file
+ *	so a daemon's core file lands there; failures are logged and the directory is left unchanged */
+void 
+reset_cwd(void)
+{
+	char *dir = state_save_location ();	/* caller-owned string, released below */
+
+	if (dir == NULL) {
+		error ("No state save location specified in configuration file");
+		return;
+	}
+	if (chdir (dir))
+		error ("chdir to %s error %m", dir);
+	else
+		debug ("chdir %s", dir);	/* report the new directory only when chdir succeeded */
+	xfree (dir);
+}
+
+/* state_save_location - return the first StateSaveLocation value found in SLURM_CONFIG_FILE, or NULL
+ *	on open error, over-long line, parse error or missing key; the caller must xfree the result */
+char *
+state_save_location (void)
+{
+	FILE *slurm_spec_file;
+	char in_line[BUF_SIZE];	/* input line */
+	char *dir = NULL;
+	int i, j, error_code, line_num = 0;
+
+	slurm_spec_file = fopen (SLURM_CONFIG_FILE, "r");
+	if (slurm_spec_file == NULL) {
+		error ( "state_save_location error %d opening file %s: %m",
+			errno, SLURM_CONFIG_FILE);
+		return NULL ;
+	}
+
+	while (fgets (in_line, BUF_SIZE, slurm_spec_file) != NULL) {
+		line_num++;
+		if (strlen (in_line) >= (BUF_SIZE - 1)) {
+			error ("state_save_location line %d, of input file %s too long\n",
+				 line_num, SLURM_CONFIG_FILE);
+			fclose (slurm_spec_file);
+			return NULL;
+		}
+
+		/* everything after a non-escaped "#" is a comment: cut the line at that "#" */
+		for (i = 0; i < BUF_SIZE; i++) {
+			if (in_line[i] == '\0')
+				break;
+			if (in_line[i] != '#')
+				continue;
+			if ((i > 0) && (in_line[i - 1] == '\\')) {	/* escaped "#" */
+				for (j = i; j < BUF_SIZE; j++) {
+					in_line[j - 1] = in_line[j];
+				}
+				i--;	/* text moved left by one: index i now holds an unexamined character */
+				continue;
+			}
+			in_line[i] = '\0';
+			break;
+		}
+
+		/* parse what is left, looking only for the StateSaveLocation keyword */
+		error_code = slurm_parser(in_line,
+			"StateSaveLocation=", 's', &dir,
+			"END");
+		if (error_code) {
+			error ("error parsing configuration file input line %d", line_num);
+			fclose (slurm_spec_file);
+			return NULL;
+		}
+
+		if ( dir ) {
+			fclose (slurm_spec_file);
+			return dir;
+		}
+	}
+	fclose (slurm_spec_file);	/* key not found: do not leak the stream on this path */
+	return NULL;
+}
-- 
GitLab