From 39bc50bd31b6851fb30cbc2ea847d39a05152dbd Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 18 Dec 2007 18:58:35 +0000
Subject: [PATCH] Added new scontrol option "setdebug" that can change the
 slurmctld daemons  debug level at any time (Hongjia Cao, NUDT).

---
 NEWS                             |  2 +
 doc/man/man1/scontrol.1          | 14 ++++++-
 slurm/slurm.h.in                 |  8 ++++
 src/api/reconfigure.c            | 36 ++++++++++++++++++
 src/common/slurm_protocol_defs.c |  9 +++++
 src/common/slurm_protocol_defs.h | 10 ++++-
 src/common/slurm_protocol_pack.c | 33 ++++++++++++++++
 src/scontrol/scontrol.c          | 51 +++++++++++++++++++++++++
 src/slurmctld/controller.c       |  3 ++
 src/slurmctld/proc_req.c         | 64 ++++++++++++++++++++++++++++++++
 10 files changed, 227 insertions(+), 3 deletions(-)

diff --git a/NEWS b/NEWS
index 5376b5fc7d1..2f34eba43d8 100644
--- a/NEWS
+++ b/NEWS
@@ -14,6 +14,8 @@ documents those changes that are of interest to users and admins.
     accounting data collection polling interval.
  -- In sched/wiki2 add support for hostlist expression use in GETNODES command
     with HostFormat=2 in the wiki.conf file.
+ -- Added new scontrol option "setdebug" that can change the slurmctld daemon's
+    debug level at any time (Hongjia Cao, NUDT).
 
 * Changes in SLURM 1.3.0-pre7
 =============================
diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1
index da1311c9934..16ef7e4e91d 100644
--- a/doc/man/man1/scontrol.1
+++ b/doc/man/man1/scontrol.1
@@ -1,4 +1,4 @@
-.TH SCONTROL "1" "October 2007" "scontrol 1.3" "Slurm components"
+.TH SCONTROL "1" "December 2007" "scontrol 1.3" "Slurm components"
 
 .SH "NAME"
 scontrol \- Used view and modify Slurm configuration and state.
@@ -181,6 +181,18 @@ Resume a previously suspended job.
 \fBrequeue\fP \fIjob_id\fP
 Requeue a running or pending SLURM batch job.
 
+.TP
+\fBsetdebug\fP \fILEVEL\fP
+Change the debug level of the slurmctld daemon.
+\fILEVEL\fP may be an integer value between zero and nine (using the 
+same values as \fISlurmctldDebug\fP in the \fIslurm.conf\fP file) or 
+the name of the most detailed message type to be printed: 
+"quiet", "fatal", "error", "info", "verbose", "debug", "debug2", "debug3", 
+"debug4", or "debug5".
+This value is temporary and will be overwritten whenever the slurmctld 
+daemon reads the slurm.conf configuration file (e.g. when the daemon 
+is restarted or "scontrol reconfigure" is executed).
+
 .TP
 \fBshow\fP \fIENTITY\fP \fIID\fP
 Display the state of the specified entity with the specified identification.
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 9c3d3cc56c2..66bcc526366 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -1859,6 +1859,14 @@ extern int slurm_reconfigure PARAMS(( void ));
  */
 extern int slurm_shutdown PARAMS(( uint16_t core ));
 
+/*
+ * slurm_set_debug_level - issue RPC to set slurm controller debug level
+ * IN debug_level - requested debug level
+ * RET 0 on success, otherwise return -1 and set errno to indicate the error
+ */
+extern int slurm_set_debug_level PARAMS((uint32_t debug_level));
+
+
 /*****************************************************************************\
  *      SLURM JOB SUSPEND FUNCTIONS
 \*****************************************************************************/
diff --git a/src/api/reconfigure.c b/src/api/reconfigure.c
index f378a726992..5d23e513f9f 100644
--- a/src/api/reconfigure.c
+++ b/src/api/reconfigure.c
@@ -165,3 +165,39 @@ _send_message_controller (enum controller_id dest, slurm_msg_t *req)
         return rc;
 }
 
+/*
+ * slurm_set_debug_level - issue RPC to set slurm controller debug level
+ * IN debug_level - requested debug level
+ * RET 0 on success, otherwise return -1 and set errno to indicate the error
+ */
+int
+slurm_set_debug_level (uint32_t debug_level)
+{
+	set_debug_level_msg_t level_req;
+	slurm_msg_t request;
+	slurm_msg_t response;
+	int status;
+
+	slurm_msg_t_init(&request);
+	slurm_msg_t_init(&response);
+
+	/* the payload is a local; the request only points at it */
+	level_req.debug_level = debug_level;
+	request.data          = &level_req;
+	request.msg_type      = REQUEST_SET_DEBUG_LEVEL;
+
+	if (slurm_send_recv_controller_msg(&request, &response) < 0)
+		return SLURM_ERROR;
+
+	/* anything other than a return-code reply is reported via errno */
+	if (response.msg_type != RESPONSE_SLURM_RC)
+		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
+
+	/* copy the code out before the reply payload is freed */
+	status = ((return_code_msg_t *) response.data)->return_code;
+	slurm_free_return_code_msg(response.data);
+	if (status)
+		slurm_seterrno_ret(status);
+
+	return SLURM_PROTOCOL_SUCCESS;
+}
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index 4f1d7397511..04631b8f4fd 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -1142,6 +1142,12 @@ void inline slurm_free_trigger_msg(trigger_info_msg_t *msg)
 	xfree(msg);
 }
 
+void slurm_free_set_debug_level_msg(set_debug_level_msg_t *msg)
+{
+	xfree(msg);	/* struct holds only an integer, nothing nested to release */
+}
+
+
 extern int slurm_free_msg_data(slurm_msg_type_t type, void *data)
 {
 	switch(type) {
@@ -1270,6 +1276,9 @@ extern int slurm_free_msg_data(slurm_msg_type_t type, void *data)
 	case RESPONSE_SLURM_RC:
 		slurm_free_return_code_msg(data);
 		break;
+	case REQUEST_SET_DEBUG_LEVEL:
+		slurm_free_set_debug_level_msg(data);
+		break;
 	case SLURM_SUCCESS:		
 	case REQUEST_PING:		
 	case REQUEST_RECONFIGURE:
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index 6ea5c1fe637..28357539fd1 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -94,7 +94,8 @@ typedef enum {
 	RESPONSE_SHUTDOWN,
 	REQUEST_PING,
 	REQUEST_CONTROL,
-
+	REQUEST_SET_DEBUG_LEVEL,
+	
 	REQUEST_BUILD_INFO = 2001,
 	RESPONSE_BUILD_INFO,
 	REQUEST_JOB_INFO,
@@ -388,6 +389,10 @@ typedef struct last_update_msg {
 	time_t last_update;
 } last_update_msg_t;
 
+typedef struct set_debug_level_msg {
+	uint32_t debug_level;	/* requested level; same type as slurm_set_debug_level()'s argument */
+} set_debug_level_msg_t;
+
 typedef struct job_step_specs {
 	uint32_t job_id;	/* job ID */
 	uint32_t user_id;	/* user the job runs as */
@@ -712,7 +717,7 @@ extern void slurm_msg_t_init (slurm_msg_t *msg);
 extern void slurm_msg_t_copy(slurm_msg_t *dest, slurm_msg_t *src);
 
 /* free message functions */
-void slurm_free_checkpoint_tasks_msg(checkpoint_tasks_msg_t * msg);
+void inline slurm_free_checkpoint_tasks_msg(checkpoint_tasks_msg_t * msg);
 void inline slurm_free_last_update_msg(last_update_msg_t * msg);
 void inline slurm_free_return_code_msg(return_code_msg_t * msg);
 void inline slurm_free_job_alloc_info_msg(job_alloc_info_msg_t * msg);
@@ -721,6 +726,7 @@ void inline slurm_free_job_step_info_request_msg(
 		job_step_info_request_msg_t *msg);
 void inline slurm_free_node_info_request_msg(node_info_request_msg_t *msg);
 void inline slurm_free_part_info_request_msg(part_info_request_msg_t *msg);
+void inline slurm_free_set_debug_level_msg(set_debug_level_msg_t *msg);
 
 #define	slurm_free_timelimit_msg(msg) \
 	slurm_free_kill_job_msg(msg)
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index 15579392003..7fe1a5aa58b 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -336,6 +336,9 @@ static int  _unpack_slurmd_status(slurmd_status_t **msg_ptr, Buf buffer);
 static void _pack_job_notify(job_notify_msg_t *msg, Buf buffer);
 static int  _unpack_job_notify(job_notify_msg_t **msg_ptr, Buf buffer);
 
+static void _pack_set_debug_level_msg(set_debug_level_msg_t * msg, Buf buffer);
+static int _unpack_set_debug_level_msg(set_debug_level_msg_t ** msg_ptr, Buf buffer);
+
 /* pack_header
  * packs a slurm protocol header that proceeds every slurm message
  * IN header - the header structure to pack
@@ -711,6 +714,9 @@ pack_msg(slurm_msg_t const *msg, Buf buffer)
 	case REQUEST_JOB_NOTIFY:
 		_pack_job_notify((job_notify_msg_t *) msg->data, buffer);
 		break;
+	case REQUEST_SET_DEBUG_LEVEL:
+		_pack_set_debug_level_msg((set_debug_level_msg_t *)msg->data, buffer);
+		break;
 	default:
 		debug("No pack method for msg type %u", msg->msg_type);
 		return EINVAL;
@@ -1054,6 +1060,9 @@ unpack_msg(slurm_msg_t * msg, Buf buffer)
 		rc =  _unpack_job_notify((job_notify_msg_t **)
 					 &msg->data, buffer);
 		break;
+	case REQUEST_SET_DEBUG_LEVEL:
+		rc = _unpack_set_debug_level_msg((set_debug_level_msg_t **)&(msg->data), buffer);
+		break;
 	default:
 		debug("No unpack method for msg type %u", msg->msg_type);
 		return EINVAL;
@@ -4594,6 +4603,30 @@ unpack_error:
 	return SLURM_ERROR;
 }
 
+static void
+_pack_set_debug_level_msg(set_debug_level_msg_t * msg, Buf buffer)
+{
+	pack32(msg->debug_level, buffer);	/* the message's only field */
+}
+
+/* Unpack a set_debug_level_msg_t from buffer into a newly xmalloc'd message */
+static int
+_unpack_set_debug_level_msg(set_debug_level_msg_t ** msg_ptr, Buf buffer)
+{
+	set_debug_level_msg_t *level_msg = xmalloc(sizeof(*level_msg));
+
+	/* hand the allocation to the caller up front; withdrawn again on error */
+	*msg_ptr = level_msg;
+	safe_unpack32(&level_msg->debug_level, buffer);
+	return SLURM_SUCCESS;
+
+ unpack_error:	/* presumably reached from safe_unpack32() on failure */
+	xfree(level_msg);
+	*msg_ptr = NULL;
+	return SLURM_ERROR;
+}
+
+
 /* template 
    void pack_ ( * msg , Buf buffer )
    {
diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c
index de132910dd4..6e535d487a4 100644
--- a/src/scontrol/scontrol.c
+++ b/src/scontrol/scontrol.c
@@ -627,6 +627,51 @@ _process_command (int argc, char *argv[])
 			}
 		}
 	}
+	else if (strncasecmp (argv[0], "setdebug", 4) == 0) {
+		if (argc > 2) {
+			exit_code = 1;
+			if (quiet_flag != 1)
+				fprintf(stderr, "too many arguments for keyword:%s\n",
+					argv[0]);
+		} else if (argc < 2) {
+			exit_code = 1;
+			if (quiet_flag != 1)
+				fprintf(stderr, "too few arguments for keyword:%s\n",
+					argv[0]);
+		} else {
+			/* LEVEL is a level name or a decimal integer 0 - 9; -1 means "not yet parsed" */
+			int level = -1, index;
+			char *endptr;
+			char *levels[] = {
+				"quiet", "fatal", "error", "info", "verbose",
+				"debug", "debug2", "debug3", "debug4", "debug5", NULL};
+			for (index = 0; levels[index]; index++) {	/* a name's index is its level */
+				if (strcasecmp(argv[1], levels[index]) == 0) {
+					level = index;
+					break;
+				}
+			}
+			if (level == -1) {
+				long value = strtol (argv[1], &endptr, 10);	/* signed parse: negatives must be rejected, not wrapped */
+				if ((endptr != argv[1]) && (*endptr == '\0') && (value >= 0) && (value <= 9)) {
+					level = (int) value;
+				} else {
+					exit_code = 1;
+					if (quiet_flag != 1)
+						fprintf(stderr, "invalid debug level: %s\n",
+							argv[1]);
+				}
+			}
+			if (level != -1) {
+				error_code = slurm_set_debug_level(level);
+				if (error_code) {
+					exit_code = 1;
+					if (quiet_flag != 1)
+						slurm_perror ("slurm_set_debug_level error");
+				}
+			}
+		}
+	}
 	else if (strncasecmp (argv[0], "show", 3) == 0) {
 		if (argc > 3) {
 			exit_code = 1;
@@ -1015,11 +1060,13 @@ scontrol [<OPTION>] [<COMMAND>]                                            \n\
      quit                     terminate this command.                      \n\
      reconfigure              re-read configuration files.                 \n\
      requeue <job_id>         re-queue a batch job                         \n\
+     setdebug <LEVEL>         reset slurmctld debug level                  \n\
      show <ENTITY> [<ID>]     display state of identified entity, default  \n\
                               is all records.                              \n\
      shutdown                 shutdown slurm controller.                   \n\
      suspend <job_id>         susend specified job                         \n\
      resume <job_id>          resume previously suspended job              \n\
+     setdebug <level>         set slurmctld debug level                    \n\
      update <SPECIFICATIONS>  update job, node, partition, or bluegene     \n\
                               block/subbp configuration                    \n\
      verbose                  enable detailed logging.                     \n\
@@ -1037,6 +1084,10 @@ scontrol [<OPTION>] [<COMMAND>]                                            \n\
        absolute pathname of a file (with leading '/' containing host names \n\
        either separated by commas or new-lines                             \n\
                                                                            \n\
+  <LEVEL> may be an integer value like SlurmctldDebug in the slurm.conf    \n\
+       file or the name of the most detailed errors to report (e.g. \"info\",\n\
+       \"verbose\", \"debug\", \"debug2\", etc.).                          \n\
+                                                                           \n\
   Node names may be specified using simple range expressions,              \n\
   (e.g. \"lx[10-20]\" corresponsds to lx10, lx11, lx12, ...)               \n\
   The job step id is the job id followed by a period and the step id.      \n\
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 223b2e0c3c3..8ce3c6a3122 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -229,6 +229,7 @@ int main(int argc, char *argv[])
 		fatal("Unable to initialize StateSaveLocation");
 
 	if (daemonize) {
+		slurmctld_config.daemonize = 1;
 		error_code = daemon(1, 1);
 		log_alter(log_opts, LOG_DAEMON, 
 			  slurmctld_conf.slurmctld_logfile);
@@ -252,6 +253,8 @@ int main(int argc, char *argv[])
 					slurmctld_conf.state_save_location);
 			}
 		}
+	} else {
+		slurmctld_config.daemonize = 0;
 	}
 	info("slurmctld version %s started", SLURM_VERSION);
 
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index 67807d6a863..1c25a7a5d99 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -126,6 +126,7 @@ inline static void  _slurm_rpc_update_node(slurm_msg_t * msg);
 inline static void  _slurm_rpc_update_partition(slurm_msg_t * msg);
 inline static void  _slurm_rpc_end_time(slurm_msg_t * msg);
 inline static void  _update_cred_key(void);
+inline static void  _slurm_rpc_set_debug_level(slurm_msg_t *msg);
 
 
 /*
@@ -296,6 +297,10 @@ void slurmctld_req (slurm_msg_t * msg)
 		_slurm_rpc_job_notify(msg);
 		slurm_free_job_notify_msg(msg->data);
 		break;
+	case REQUEST_SET_DEBUG_LEVEL:
+		_slurm_rpc_set_debug_level(msg);
+		slurm_free_set_debug_level_msg(msg->data);
+		break;
 	default:
 		error("invalid RPC msg_type=%d", msg->msg_type);
 		slurm_send_rc_msg(msg, EINVAL);
@@ -2707,3 +2712,62 @@ inline static void  _slurm_rpc_job_notify(slurm_msg_t * msg)
 	slurm_send_rc_msg(msg, error_code);
 }
 
+/* _slurm_rpc_set_debug_level - process RPC to change the slurmctld
+ *	logging level at run time
+ * IN msg - request carrying a set_debug_level_msg_t plus the sender's
+ *	authentication credential; a return-code reply is always sent */
+inline static void  _slurm_rpc_set_debug_level(slurm_msg_t *msg)
+{
+	/* held while the log file name is read and logging is altered */
+	slurmctld_lock_t config_read_lock =
+		{ READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
+	set_debug_level_msg_t *level_req = (set_debug_level_msg_t *) msg->data;
+	log_options_t new_opts = LOG_OPTS_INITIALIZER;
+	slurm_ctl_conf_t *conf_ptr;
+	uid_t req_uid;
+	int new_level;
+
+	debug2("Processing RPC: REQUEST_SET_DEBUG_LEVEL");
+
+	/* requests from anyone but a super user are refused with EACCES */
+	req_uid = g_slurm_auth_get_uid(msg->auth_cred);
+	if (!validate_super_user(req_uid)) {
+		error("set debug level request from non-super user uid=%d",
+		      req_uid);
+		slurm_send_rc_msg(msg, EACCES);
+		return;
+	}
+
+	/* Clamp the request into LOG_LEVEL_QUIET .. (LOG_LEVEL_END - 1).
+	 * NOTE: not offset by LOG_LEVEL_INFO, since it's inconvenient
+	 * to provide negative values for scontrol */
+	new_level = MIN (level_req->debug_level, (LOG_LEVEL_END - 1));
+	new_level = MAX (new_level, LOG_LEVEL_QUIET);
+
+	info ("Setting debug level to %d", new_level);
+
+	lock_slurmctld (config_read_lock);
+
+	/* silence every sink, then raise exactly the ones in use */
+	new_opts.stderr_level  = LOG_LEVEL_QUIET;
+	new_opts.syslog_level  = LOG_LEVEL_QUIET;
+	new_opts.logfile_level = LOG_LEVEL_QUIET;
+	if (slurmctld_conf.slurmctld_logfile)
+		new_opts.logfile_level = new_level;	/* log file, when configured */
+	if (!slurmctld_config.daemonize)
+		new_opts.stderr_level = new_level;	/* foreground: stderr too */
+	else if (!slurmctld_conf.slurmctld_logfile)
+		new_opts.syslog_level = new_level;	/* daemon without log file */
+
+	log_alter(new_opts, LOG_DAEMON, slurmctld_conf.slurmctld_logfile);
+
+	unlock_slurmctld (config_read_lock);
+
+	/* store the new level in the configuration and stamp the update time */
+	conf_ptr = slurm_conf_lock();
+	conf_ptr->slurmctld_debug = new_level;
+	slurm_conf_unlock();
+	slurmctld_conf.last_update = time(NULL);
+
+	slurm_send_rc_msg(msg, SLURM_SUCCESS);
+}
-- 
GitLab