From 39bc50bd31b6851fb30cbc2ea847d39a05152dbd Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Tue, 18 Dec 2007 18:58:35 +0000 Subject: [PATCH] Added new scontrol option "setdebug" that can change the slurmctld daemons debug level at any time (Hongjia Cao, NUDT). --- NEWS | 2 + doc/man/man1/scontrol.1 | 14 ++++++- slurm/slurm.h.in | 8 ++++ src/api/reconfigure.c | 36 ++++++++++++++++++ src/common/slurm_protocol_defs.c | 9 +++++ src/common/slurm_protocol_defs.h | 10 ++++- src/common/slurm_protocol_pack.c | 33 ++++++++++++++++ src/scontrol/scontrol.c | 51 +++++++++++++++++++++++++ src/slurmctld/controller.c | 3 ++ src/slurmctld/proc_req.c | 64 ++++++++++++++++++++++++++++++++ 10 files changed, 227 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index 5376b5fc7d1..2f34eba43d8 100644 --- a/NEWS +++ b/NEWS @@ -14,6 +14,8 @@ documents those changes that are of interest to users and admins. accounting data collection polling interval. -- In sched/wiki2 add support for hostlist expression use in GETNODES command with HostFormat=2 in the wiki.conf file. + -- Added new scontrol option "setdebug" that can change the slurmctld daemons + debug level at any time (Hongjia Cao, NUDT). * Changes in SLURM 1.3.0-pre7 ============================= diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index da1311c9934..16ef7e4e91d 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -1,4 +1,4 @@ -.TH SCONTROL "1" "October 2007" "scontrol 1.3" "Slurm components" +.TH SCONTROL "1" "December 2007" "scontrol 1.3" "Slurm components" .SH "NAME" scontrol \- Used view and modify Slurm configuration and state. @@ -181,6 +181,18 @@ Resume a previously suspended job. \fBrequeue\fP \fIjob_id\fP Requeue a running or pending SLURM batch job. +.TP +\fBsetdebug\fP \fILEVEL\fP +Change the debug level of the slurmctld daemon. 
+\fILEVEL\fP may be an integer value between zero and nine (using the +same values as \fISlurmctldDebug\fP in the \fIslurm.conf\fP file) or +the name of the most detailed message type to be printed: +"quiet", "fatal", "error", "info", "verbose", "debug", "debug2", "debug3", +"debug4", or "debug5". +This value is temporary and will be overwritten whenever the slurmctld +daemon reads the slurm.conf configuration file (e.g. when the daemon +is restarted or "scontrol reconfigure" is executed). + .TP \fBshow\fP \fIENTITY\fP \fIID\fP Display the state of the specified entity with the specified identification. diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 9c3d3cc56c2..66bcc526366 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -1859,6 +1859,14 @@ extern int slurm_reconfigure PARAMS(( void )); */ extern int slurm_shutdown PARAMS(( uint16_t core )); +/* + * slurm_set_debug_level - issue RPC to set slurm controller debug level + * IN debug_level - requested debug level + * RET 0 on success, otherwise return -1 and set errno to indicate the error + */ +extern int slurm_set_debug_level PARAMS((uint32_t debug_level)); + + /*****************************************************************************\ * SLURM JOB SUSPEND FUNCTIONS \*****************************************************************************/ diff --git a/src/api/reconfigure.c b/src/api/reconfigure.c index f378a726992..5d23e513f9f 100644 --- a/src/api/reconfigure.c +++ b/src/api/reconfigure.c @@ -165,3 +165,39 @@ _send_message_controller (enum controller_id dest, slurm_msg_t *req) return rc; } +/* + * slurm_set_debug_level - issue RPC to set slurm controller debug level + * IN debug_level - requested debug level + * RET 0 on success, otherwise return -1 and set errno to indicate the error + */ +int +slurm_set_debug_level (uint32_t debug_level) +{ + int rc; + slurm_msg_t req_msg; + slurm_msg_t resp_msg; + set_debug_level_msg_t req; + + slurm_msg_t_init(&req_msg); + slurm_msg_t_init(&resp_msg); + 
+ req.debug_level = debug_level; + req_msg.msg_type = REQUEST_SET_DEBUG_LEVEL; + req_msg.data = &req; + + if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0) + return SLURM_ERROR; + + switch (resp_msg.msg_type) { + case RESPONSE_SLURM_RC: + rc = ((return_code_msg_t *) resp_msg.data)->return_code; + slurm_free_return_code_msg(resp_msg.data); + if (rc) + slurm_seterrno_ret(rc); + break; + default: + slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR); + break; + } + return SLURM_PROTOCOL_SUCCESS; +} diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 4f1d7397511..04631b8f4fd 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -1142,6 +1142,12 @@ void inline slurm_free_trigger_msg(trigger_info_msg_t *msg) xfree(msg); } +void slurm_free_set_debug_level_msg(set_debug_level_msg_t *msg) +{ + xfree(msg); +} + + extern int slurm_free_msg_data(slurm_msg_type_t type, void *data) { switch(type) { @@ -1270,6 +1276,9 @@ extern int slurm_free_msg_data(slurm_msg_type_t type, void *data) case RESPONSE_SLURM_RC: slurm_free_return_code_msg(data); break; + case REQUEST_SET_DEBUG_LEVEL: + slurm_free_set_debug_level_msg(data); + break; case SLURM_SUCCESS: case REQUEST_PING: case REQUEST_RECONFIGURE: diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index 6ea5c1fe637..28357539fd1 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -94,7 +94,8 @@ typedef enum { RESPONSE_SHUTDOWN, REQUEST_PING, REQUEST_CONTROL, - + REQUEST_SET_DEBUG_LEVEL, + REQUEST_BUILD_INFO = 2001, RESPONSE_BUILD_INFO, REQUEST_JOB_INFO, @@ -388,6 +389,10 @@ typedef struct last_update_msg { time_t last_update; } last_update_msg_t; +typedef struct set_debug_level_msg { + int32_t debug_level; +} set_debug_level_msg_t; + typedef struct job_step_specs { uint32_t job_id; /* job ID */ uint32_t user_id; /* user the job runs as */ @@ -712,7 +717,7 @@ extern void slurm_msg_t_init 
(slurm_msg_t *msg); extern void slurm_msg_t_copy(slurm_msg_t *dest, slurm_msg_t *src); /* free message functions */ -void slurm_free_checkpoint_tasks_msg(checkpoint_tasks_msg_t * msg); +void inline slurm_free_checkpoint_tasks_msg(checkpoint_tasks_msg_t * msg); void inline slurm_free_last_update_msg(last_update_msg_t * msg); void inline slurm_free_return_code_msg(return_code_msg_t * msg); void inline slurm_free_job_alloc_info_msg(job_alloc_info_msg_t * msg); @@ -721,6 +726,7 @@ void inline slurm_free_job_step_info_request_msg( job_step_info_request_msg_t *msg); void inline slurm_free_node_info_request_msg(node_info_request_msg_t *msg); void inline slurm_free_part_info_request_msg(part_info_request_msg_t *msg); +void inline slurm_free_set_debug_level_msg(set_debug_level_msg_t *msg); #define slurm_free_timelimit_msg(msg) \ slurm_free_kill_job_msg(msg) diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 15579392003..7fe1a5aa58b 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -336,6 +336,9 @@ static int _unpack_slurmd_status(slurmd_status_t **msg_ptr, Buf buffer); static void _pack_job_notify(job_notify_msg_t *msg, Buf buffer); static int _unpack_job_notify(job_notify_msg_t **msg_ptr, Buf buffer); +static void _pack_set_debug_level_msg(set_debug_level_msg_t * msg, Buf buffer); +static int _unpack_set_debug_level_msg(set_debug_level_msg_t ** msg_ptr, Buf buffer); + /* pack_header * packs a slurm protocol header that proceeds every slurm message * IN header - the header structure to pack @@ -711,6 +714,9 @@ pack_msg(slurm_msg_t const *msg, Buf buffer) case REQUEST_JOB_NOTIFY: _pack_job_notify((job_notify_msg_t *) msg->data, buffer); break; + case REQUEST_SET_DEBUG_LEVEL: + _pack_set_debug_level_msg((set_debug_level_msg_t *)msg->data, buffer); + break; default: debug("No pack method for msg type %u", msg->msg_type); return EINVAL; @@ -1054,6 +1060,9 @@ unpack_msg(slurm_msg_t * msg, Buf buffer) rc = 
_unpack_job_notify((job_notify_msg_t **) &msg->data, buffer); break; + case REQUEST_SET_DEBUG_LEVEL: + rc = _unpack_set_debug_level_msg((set_debug_level_msg_t **)&(msg->data), buffer); + break; default: debug("No unpack method for msg type %u", msg->msg_type); return EINVAL; @@ -4594,6 +4603,30 @@ unpack_error: return SLURM_ERROR; } +static void +_pack_set_debug_level_msg(set_debug_level_msg_t * msg, Buf buffer) +{ + pack32(msg->debug_level, buffer); +} + +static int +_unpack_set_debug_level_msg(set_debug_level_msg_t ** msg_ptr, Buf buffer) +{ + set_debug_level_msg_t *msg; + + msg = xmalloc(sizeof(set_debug_level_msg_t)); + *msg_ptr = msg; + + safe_unpack32(&msg->debug_level, buffer); + return SLURM_SUCCESS; + + unpack_error: + xfree(msg); + *msg_ptr = NULL; + return SLURM_ERROR; +} + + /* template void pack_ ( * msg , Buf buffer ) { diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c index de132910dd4..6e535d487a4 100644 --- a/src/scontrol/scontrol.c +++ b/src/scontrol/scontrol.c @@ -627,6 +627,51 @@ _process_command (int argc, char *argv[]) } } } + else if (strncasecmp (argv[0], "setdebug", 4) == 0) { + if (argc > 2) { + exit_code = 1; + if (quiet_flag != 1) + fprintf(stderr, "too many arguments for keyword:%s\n", + argv[0]); + } else if (argc < 2) { + exit_code = 1; + if (quiet_flag != 1) + fprintf(stderr, "too few arguments for keyword:%s\n", + argv[0]); + } else { + int level = -1; + char *endptr; + char *levels[] = { + "quiet", "fatal", "error", "info", "verbose", + "debug", "debug2", "debug3", "debug4", "debug5", NULL}; + int index = 0; + while (levels[index]) { + if (strcasecmp(argv[1], levels[index]) == 0) { + level = index; + break; + } + index ++; + } + if (level == -1) { + level = (int)strtoul (argv[1], &endptr, 10); /* effective levels: 0 - 9 */ + if (*endptr != '\0' || level > 9) { + level = -1; + exit_code = 1; + if (quiet_flag != 1) + fprintf(stderr, "invalid debug level: %s\n", + argv[1]); + } + } + if (level != -1) { + error_code = 
slurm_set_debug_level(level); + if (error_code) { + exit_code = 1; + if (quiet_flag != 1) + slurm_perror ("slurm_set_debug_level error"); + } + } + } + } else if (strncasecmp (argv[0], "show", 3) == 0) { if (argc > 3) { exit_code = 1; @@ -1015,11 +1060,13 @@ scontrol [<OPTION>] [<COMMAND>] \n\ quit terminate this command. \n\ reconfigure re-read configuration files. \n\ requeue <job_id> re-queue a batch job \n\ + setdebug <LEVEL> reset slurmctld debug level \n\ show <ENTITY> [<ID>] display state of identified entity, default \n\ is all records. \n\ shutdown shutdown slurm controller. \n\ suspend <job_id> susend specified job \n\ resume <job_id> resume previously suspended job \n\ + setdebug <level> set slurmctld debug level \n\ update <SPECIFICATIONS> update job, node, partition, or bluegene \n\ block/subbp configuration \n\ verbose enable detailed logging. \n\ @@ -1037,6 +1084,10 @@ scontrol [<OPTION>] [<COMMAND>] \n\ absolute pathname of a file (with leading '/' containing host names \n\ either separated by commas or new-lines \n\ \n\ + <LEVEL> may be an integer value like SlurmctldDebug in the slurm.conf \n\ + file or the name of the most detailed errors to report (e.g. \"info\",\n\ + \"verbose\", \"debug\", \"debug2\", etc.). \n\ + \n\ Node names may be specified using simple range expressions, \n\ (e.g. \"lx[10-20]\" corresponsds to lx10, lx11, lx12, ...) \n\ The job step id is the job id followed by a period and the step id. 
\n\ diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 223b2e0c3c3..8ce3c6a3122 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -229,6 +229,7 @@ int main(int argc, char *argv[]) fatal("Unable to initialize StateSaveLocation"); if (daemonize) { + slurmctld_config.daemonize = 1; error_code = daemon(1, 1); log_alter(log_opts, LOG_DAEMON, slurmctld_conf.slurmctld_logfile); @@ -252,6 +253,8 @@ int main(int argc, char *argv[]) slurmctld_conf.state_save_location); } } + } else { + slurmctld_config.daemonize = 0; } info("slurmctld version %s started", SLURM_VERSION); diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 67807d6a863..1c25a7a5d99 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -126,6 +126,7 @@ inline static void _slurm_rpc_update_node(slurm_msg_t * msg); inline static void _slurm_rpc_update_partition(slurm_msg_t * msg); inline static void _slurm_rpc_end_time(slurm_msg_t * msg); inline static void _update_cred_key(void); +inline static void _slurm_rpc_set_debug_level(slurm_msg_t *msg); /* @@ -296,6 +297,10 @@ void slurmctld_req (slurm_msg_t * msg) _slurm_rpc_job_notify(msg); slurm_free_job_notify_msg(msg->data); break; + case REQUEST_SET_DEBUG_LEVEL: + _slurm_rpc_set_debug_level(msg); + slurm_free_set_debug_level_msg(msg->data); + break; default: error("invalid RPC msg_type=%d", msg->msg_type); slurm_send_rc_msg(msg, EINVAL); @@ -2707,3 +2712,62 @@ inline static void _slurm_rpc_job_notify(slurm_msg_t * msg) slurm_send_rc_msg(msg, error_code); } +/* defined in controller.c */ +inline static void _slurm_rpc_set_debug_level(slurm_msg_t *msg) +{ + int debug_level; + uid_t uid; + slurmctld_lock_t config_read_lock = + { READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; + set_debug_level_msg_t *request_msg = (set_debug_level_msg_t *) msg->data; + log_options_t log_opts = LOG_OPTS_INITIALIZER; + slurm_ctl_conf_t *conf; + + debug2("Processing RPC: REQUEST_SET_DEBUG_LEVEL"); + + uid = 
g_slurm_auth_get_uid(msg->auth_cred); + if (!validate_super_user(uid)) { + error("set debug level request from non-super user uid=%d", + uid); + slurm_send_rc_msg(msg, EACCES); + return; + } + + /* NOTE: not offset by LOG_LEVEL_INFO, since it's inconvenient + * to provide negative values for scontrol */ + debug_level = MIN (request_msg->debug_level, (LOG_LEVEL_END - 1)); + debug_level = MAX (debug_level, LOG_LEVEL_QUIET); + + info ("Setting debug level to %d", debug_level); + + lock_slurmctld (config_read_lock); + + if (slurmctld_config.daemonize) { + log_opts.stderr_level = LOG_LEVEL_QUIET; + if (slurmctld_conf.slurmctld_logfile) { + log_opts.logfile_level = debug_level; + log_opts.syslog_level = LOG_LEVEL_QUIET; + } else { + log_opts.syslog_level = debug_level; + log_opts.logfile_level = LOG_LEVEL_QUIET; + } + } else { + log_opts.syslog_level = LOG_LEVEL_QUIET; + log_opts.stderr_level = debug_level; + if (slurmctld_conf.slurmctld_logfile) + log_opts.logfile_level = debug_level; + else + log_opts.logfile_level = LOG_LEVEL_QUIET; + } + + log_alter(log_opts, LOG_DAEMON, slurmctld_conf.slurmctld_logfile); + + unlock_slurmctld (config_read_lock); + + conf = slurm_conf_lock(); + conf->slurmctld_debug = debug_level; + slurm_conf_unlock(); + slurmctld_conf.last_update = time(NULL); + + slurm_send_rc_msg(msg, SLURM_SUCCESS); +} -- GitLab