From 31113a5d45104ca6519d836bde66d5e1b43d2e7e Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Thu, 3 Mar 2011 21:38:38 +0000 Subject: [PATCH] -- Add the ability for scontrol to modify slurmctld DebugFlags values. --- NEWS | 1 + RELEASE_NOTES | 4 +-- doc/man/man1/scontrol.1 | 10 +++++- slurm/slurm.h.in | 9 +++++ src/api/config_info.c | 2 +- src/api/reconfigure.c | 42 ++++++++++++++++++++++- src/common/slurm_protocol_api.c | 14 ++++++++ src/common/slurm_protocol_api.h | 4 +++ src/common/slurm_protocol_defs.c | 8 +++++ src/common/slurm_protocol_defs.h | 7 ++++ src/common/slurm_protocol_pack.c | 43 +++++++++++++++++++++++ src/common/slurm_protocol_util.c | 1 + src/scontrol/scontrol.c | 59 ++++++++++++++++++++++++++++++++ src/slurmctld/proc_req.c | 44 +++++++++++++++++++++--- 14 files changed, 237 insertions(+), 11 deletions(-) diff --git a/NEWS b/NEWS index 9fb04dfda96..d58266fb635 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,7 @@ documents those changes that are of interest to users and admins. ============================= -- Add GraceTime to Partition and QOS data structures. Preempted jobs will be given this time interval before termination. Work by Bill Brophy, Bull. + -- Add the ability for scontrol to modify slurmctld DebugFlags values. * Changes in SLURM 2.3.0.pre3 ============================= diff --git a/RELEASE_NOTES b/RELEASE_NOTES index f59ab0f2965..d098ac8029e 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -37,9 +37,7 @@ CONFIGURATION FILE CHANGES (see "man slurm.conf" for details) * In order to support more than one front end node, new parameters have been added to support a new data structure: FrontendName, FrontendAddr, Port, State and Reason. - * DebugFlags of Frontend added - * Added new configuration parameter MaxJobId. Use with FirstJobId to limit range of job ID values. 
@@ -47,7 +45,7 @@ CONFIGURATION FILE CHANGES (see "man slurm.conf" for details) COMMAND CHANGES (see man pages for details) =========================================== * scontrol has the ability to get and set front end node state. - +* scontrol has the ability to set slurmctld's DebugFlags. * Add new scontrol option of "show aliases" to report every NodeName that is associated with a given NodeHostName when running multiple slurmd daemons per compute node (typically used for testing purposes). diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index 1c0fe6d0965..ea039f61003 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -1,4 +1,4 @@ -.TH SCONTROL "1" "December 2010" "scontrol 2.3" "Slurm components" +.TH SCONTROL "1" "March 2011" "scontrol 2.3" "Slurm components" .SH "NAME" scontrol \- Used view and modify Slurm configuration and state. @@ -276,6 +276,14 @@ This value is temporary and will be overwritten whenever the slurmctld daemon reads the slurm.conf configuration file (e.g. when the daemon is restarted or \fBscontrol reconfigure\fR is executed). +.TP +\fBsetdebugflags\fP [+|\-]\fIFLAG\fP +Add or remove DebugFlags of the slurmctld daemon. +See "man slurm.conf" for a list of supported DebugFlags. +NOTE: Changing the value of some DebugFlags will have no effect without +restarting the slurmctld daemon, which would set DebugFlags based upon the +contents of the slurm.conf configuration file. + .TP \fBshow\fP \fIENTITY\fP \fIID\fP Display the state of the specified entity with the specified identification. 
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 3f7904aece4..47cac5a8936 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -3147,6 +3147,15 @@ extern int slurm_shutdown PARAMS((uint16_t options));
  */
 extern int slurm_takeover PARAMS((void));
 
+/*
+ * slurm_set_debugflags - issue RPC to set slurm controller debug flags
+ * IN debug_flags_plus - debug flags to be added
+ * IN debug_flags_minus - debug flags to be removed
+ * RET 0 on success, otherwise return -1 and set errno to indicate the error
+ */
+extern int slurm_set_debugflags PARAMS((uint32_t debug_flags_plus,
+					uint32_t debug_flags_minus));
+
 /*
  * slurm_set_debug_level - issue RPC to set slurm controller debug level
  * IN debug_level - requested debug level
diff --git a/src/api/config_info.c b/src/api/config_info.c
index 351592c466d..0038972dc0f 100644
--- a/src/api/config_info.c
+++ b/src/api/config_info.c
@@ -255,7 +255,7 @@ extern void *slurm_ctl_conf_2_key_pairs (slurm_ctl_conf_t* slurm_ctl_conf_ptr)
 
 	key_pair = xmalloc(sizeof(config_key_pair_t));
 	key_pair->name = xstrdup("DebugFlags");
-	key_pair->value = debug_flags2str(slurm_ctl_conf_ptr->debug_flags);;
+	key_pair->value = debug_flags2str(slurm_ctl_conf_ptr->debug_flags);
 	list_append(ret_list, key_pair);
 
 	key_pair = xmalloc(sizeof(config_key_pair_t));
diff --git a/src/api/reconfigure.c b/src/api/reconfigure.c
index cb9ff603f4b..a8867490f5e 100644
--- a/src/api/reconfigure.c
+++ b/src/api/reconfigure.c
@@ -189,12 +189,52 @@ _send_message_controller (enum controller_id dest, slurm_msg_t *req)
 	return rc;
 }
 
+/*
+ * slurm_set_debugflags - issue RPC to set slurm controller debug flags
+ * IN debug_flags_plus - debug flags to be added
+ * IN debug_flags_minus - debug flags to be removed
+ * NOTE: slurmctld clears debug_flags_minus before setting debug_flags_plus
+ * RET 0 on success, otherwise return -1 and set errno to indicate the error
+ */
+extern int
+slurm_set_debugflags (uint32_t debug_flags_plus, uint32_t debug_flags_minus)
+{
+	int rc;
+	slurm_msg_t req_msg;
+	slurm_msg_t resp_msg;
+	set_debug_flags_msg_t req;
+
+	slurm_msg_t_init(&req_msg);
+	slurm_msg_t_init(&resp_msg);
+
+	req.debug_flags_minus = debug_flags_minus;
+	req.debug_flags_plus = debug_flags_plus;
+	req_msg.msg_type = REQUEST_SET_DEBUG_FLAGS;
+	req_msg.data = &req;
+
+	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
+		return SLURM_ERROR;
+
+	switch (resp_msg.msg_type) {
+	case RESPONSE_SLURM_RC:
+		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
+		slurm_free_return_code_msg(resp_msg.data);
+		if (rc)
+			slurm_seterrno_ret(rc);
+		break;
+	default:
+		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
+		break;
+	}
+	return SLURM_PROTOCOL_SUCCESS;
+}
+
 /*
  * slurm_set_debug_level - issue RPC to set slurm controller debug level
  * IN debug_level - requested debug level
  * RET 0 on success, otherwise return -1 and set errno to indicate the error
  */
-int
+extern int
 slurm_set_debug_level (uint32_t debug_level)
 {
 	int rc;
diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c
index 9a205877a0f..2f9478c12ff 100644
--- a/src/common/slurm_protocol_api.c
+++ b/src/common/slurm_protocol_api.c
@@ -331,6 +331,20 @@ uint32_t slurm_get_debug_flags(void)
 	return debug_flags;
 }
 
+/* slurm_set_debug_flags - overwrite conf->debug_flags (no-op if slurmdbd_conf)
+ */
+void slurm_set_debug_flags(uint32_t debug_flags)
+{
+	slurm_ctl_conf_t *conf;
+
+	if (slurmdbd_conf) {
+	} else {
+		conf = slurm_conf_lock();
+		conf->debug_flags = debug_flags;
+		slurm_conf_unlock();
+	}
+}
+
 /* slurm_get_max_mem_per_cpu
  * RET MaxMemPerCPU/Node value from slurm.conf
  */
diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h
index 146590fd678..46e44eb01fc 100644
--- a/src/common/slurm_protocol_api.h
+++ b/src/common/slurm_protocol_api.h
@@ -128,6 +128,10 @@ uint16_t slurm_get_complete_wait(void);
  */
 uint32_t slurm_get_debug_flags(void);
 
+/* slurm_set_debug_flags - overwrite conf->debug_flags (no-op if slurmdbd_conf)
+ */
+void slurm_set_debug_flags(uint32_t debug_flags);
+
 /* slurm_get_def_mem_per_cpu
  * RET DefMemPerCPU/Node value from slurm.conf
  */
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index 9c4374fac3b..e415ec75c81 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -2097,6 +2097,11 @@ inline void slurm_free_trigger_msg(trigger_info_msg_t *msg)
 	xfree(msg);
 }
 
+void slurm_free_set_debug_flags_msg(set_debug_flags_msg_t *msg)
+{
+	xfree(msg);
+}
+
 void slurm_free_set_debug_level_msg(set_debug_level_msg_t *msg)
 {
 	xfree(msg);
@@ -2339,6 +2344,9 @@ extern int slurm_free_msg_data(slurm_msg_type_t type, void *data)
 	case RESPONSE_SLURM_RC:
 		slurm_free_return_code_msg(data);
 		break;
+	case REQUEST_SET_DEBUG_FLAGS:
+		slurm_free_set_debug_flags_msg(data);
+		break;
 	case REQUEST_SET_DEBUG_LEVEL:
 	case REQUEST_SET_SCHEDLOG_LEVEL:
 		slurm_free_set_debug_level_msg(data);
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index 5e213e7c4c6..385672a5970 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -179,6 +179,7 @@ typedef enum {
 	REQUEST_HEALTH_CHECK,
 	REQUEST_TAKEOVER,
 	REQUEST_SET_SCHEDLOG_LEVEL,
+	REQUEST_SET_DEBUG_FLAGS,
 
 	REQUEST_BUILD_INFO = 2001,
 	RESPONSE_BUILD_INFO,
@@ -564,6 +565,11 @@ typedef struct last_update_msg {
 	time_t last_update;
 } last_update_msg_t;
 
+typedef struct set_debug_flags_msg {
+	uint32_t debug_flags_minus;
+	uint32_t debug_flags_plus;
+} set_debug_flags_msg_t;
+
 typedef struct set_debug_level_msg {
 	uint32_t debug_level;
 } set_debug_level_msg_t;
@@ -959,6 +965,7 @@ inline void slurm_free_front_end_info_request_msg(
 inline void slurm_free_node_info_request_msg(node_info_request_msg_t *msg);
 inline void slurm_free_part_info_request_msg(part_info_request_msg_t *msg);
 inline void slurm_free_resv_info_request_msg(resv_info_request_msg_t *msg);
+inline void slurm_free_set_debug_flags_msg(set_debug_flags_msg_t *msg);
 inline void slurm_free_set_debug_level_msg(set_debug_level_msg_t *msg);
 inline void slurm_destroy_association_shares_object(void *object);
 inline void slurm_free_shares_request_msg(shares_request_msg_t *msg);
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index 9924cc57de4..8678a5c4fef 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -538,6 +538,12 @@ static void _pack_job_notify(job_notify_msg_t *msg, Buf buffer,
 static int _unpack_job_notify(job_notify_msg_t **msg_ptr, Buf buffer,
 			      uint16_t protocol_version);
 
+static void _pack_set_debug_flags_msg(set_debug_flags_msg_t * msg, Buf buffer,
+				      uint16_t protocol_version);
+static int _unpack_set_debug_flags_msg(set_debug_flags_msg_t ** msg_ptr,
+				       Buf buffer,
+				       uint16_t protocol_version);
+
 static void _pack_set_debug_level_msg(set_debug_level_msg_t * msg, Buf buffer,
 				      uint16_t protocol_version);
 static int _unpack_set_debug_level_msg(set_debug_level_msg_t ** msg_ptr,
@@ -1098,6 +1104,11 @@ pack_msg(slurm_msg_t const *msg, Buf buffer)
 		_pack_job_notify((job_notify_msg_t *) msg->data, buffer,
 				 msg->protocol_version);
 		break;
+	case REQUEST_SET_DEBUG_FLAGS:
+		_pack_set_debug_flags_msg(
+			(set_debug_flags_msg_t *)msg->data, buffer,
+			msg->protocol_version);
+		break;
 	case REQUEST_SET_DEBUG_LEVEL:
 	case REQUEST_SET_SCHEDLOG_LEVEL:
 		_pack_set_debug_level_msg(
@@ -1624,6 +1635,11 @@ unpack_msg(slurm_msg_t * msg, Buf buffer)
 			&msg->data, buffer,
 			msg->protocol_version);
 		break;
+	case REQUEST_SET_DEBUG_FLAGS:
+		rc = _unpack_set_debug_flags_msg(
+			(set_debug_flags_msg_t **)&(msg->data), buffer,
+			msg->protocol_version);
+		break;
 	case REQUEST_SET_DEBUG_LEVEL:
 	case REQUEST_SET_SCHEDLOG_LEVEL:
 		rc = _unpack_set_debug_level_msg(
@@ -9302,6 +9318,33 @@ unpack_error:
 	return SLURM_ERROR;
 }
 
+static void
+_pack_set_debug_flags_msg(set_debug_flags_msg_t * msg, Buf buffer,
+			  uint16_t protocol_version)
+{
+	pack32(msg->debug_flags_minus, buffer);
+	pack32(msg->debug_flags_plus, buffer);
+}
+
+static int
+_unpack_set_debug_flags_msg(set_debug_flags_msg_t ** msg_ptr, Buf buffer,
+			    uint16_t protocol_version)
+{
+	set_debug_flags_msg_t *msg;
+
+	msg = xmalloc(sizeof(set_debug_flags_msg_t));
+	*msg_ptr = msg;
+
+	safe_unpack32(&msg->debug_flags_minus, buffer);
+	safe_unpack32(&msg->debug_flags_plus, buffer);
+	return SLURM_SUCCESS;
+
+unpack_error:
+	slurm_free_set_debug_flags_msg(msg);
+	*msg_ptr = NULL;
+	return SLURM_ERROR;
+}
+
 static void
 _pack_set_debug_level_msg(set_debug_level_msg_t * msg, Buf buffer,
 			  uint16_t protocol_version)
diff --git a/src/common/slurm_protocol_util.c b/src/common/slurm_protocol_util.c
index 5f58245225f..8d783015c6d 100644
--- a/src/common/slurm_protocol_util.c
+++ b/src/common/slurm_protocol_util.c
@@ -87,6 +87,7 @@ int check_header_version(header_t * header)
 		case REQUEST_PRIORITY_FACTORS:
 		case REQUEST_RECONFIGURE:
 		case REQUEST_RESERVATION_INFO:
+		case REQUEST_SET_DEBUG_FLAGS:
 		case REQUEST_SET_DEBUG_LEVEL:
 		case REQUEST_SHARE_INFO:
 		case REQUEST_SHUTDOWN:
diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c
index 5a81d2c6e46..f10e561192e 100644
--- a/src/scontrol/scontrol.c
+++ b/src/scontrol/scontrol.c
@@ -846,6 +846,64 @@ _process_command (int argc, char *argv[])
 			exit_code = 1;
 		}
 	}
+	else if (strncasecmp (tag, "setdebugflags", MAX(tag_len, 9)) == 0) {
+		if (argc > 2) {
+			exit_code = 1;
+			if (quiet_flag != 1)
+				fprintf(stderr,
+					"too many arguments for keyword:%s\n",
+					tag);
+		} else if (argc < 2) {
+			exit_code = 1;
+			if (quiet_flag != 1)
+				fprintf(stderr,
+					"too few arguments for keyword:%s\n",
+					tag);
+		} else {
+			int i, mode = 0;
+			uint32_t debug_flags_plus = 0;
+			uint32_t debug_flags_minus = 0, flags;
+
+			for (i = 1; i < argc; i++) {
+				if (argv[i][0] == '+')
+					mode = 1;
+				else if (argv[i][0] == '-')
+					mode = -1;
+				else {
+					mode = 0;
+					break;
+				}
+				flags = debug_str2flags(&argv[i][1]);
+				if (flags == NO_VAL)
+					break;
+				if (mode == 1)
+					debug_flags_plus |= flags;
+				else
+					debug_flags_minus |= flags;
+			}
+			if (i < argc) {
+				exit_code = 1;
+				if (quiet_flag != 1) {
+					fprintf(stderr, "invalid debug "
+						"flag: %s\n", argv[i]);
+				}
+				if ((quiet_flag != 1) && (mode == 0)) {
+					fprintf(stderr, "Usage: setdebugflags"
+						" [+|-]NAME\n");
+				}
+			} else {
+				error_code = slurm_set_debugflags(
+					debug_flags_plus, debug_flags_minus);
+				if (error_code) {
+					exit_code = 1;
+					if (quiet_flag != 1)
+						slurm_perror(
+							"slurm_set_debugflags"
+							" error");
+				}
+			}
+		}
+	}
 	else if (strncasecmp (tag, "setdebug", MAX(tag_len, 2)) == 0) {
 		if (argc > 2) {
 			exit_code = 1;
@@ -1623,6 +1681,7 @@ scontrol [<OPTION>] [<COMMAND>] \n\
      requeue <job_id> re-queue a batch job \n\
      resume <job_id> resume previously suspended job (see suspend)\n\
      setdebug <level> set slurmctld debug level \n\
+     setdebugflags [+|-]<flag> add or remove slurmctld DebugFlags \n\
      schedloglevel <slevel> set scheduler log level \n\
      show <ENTITY> [<ID>] display state of identified entity, default \n\
      is all records. \n\
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index 15f990e281b..19c6a80e963 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -134,6 +134,7 @@ inline static void _slurm_rpc_resv_delete(slurm_msg_t * msg);
 inline static void _slurm_rpc_resv_show(slurm_msg_t * msg);
 inline static void _slurm_rpc_requeue(slurm_msg_t * msg);
 inline static void _slurm_rpc_takeover(slurm_msg_t * msg);
+inline static void _slurm_rpc_set_debug_flags(slurm_msg_t *msg);
 inline static void _slurm_rpc_set_debug_level(slurm_msg_t *msg);
 inline static void _slurm_rpc_set_schedlog_level(slurm_msg_t *msg);
 inline static void _slurm_rpc_shutdown_controller(slurm_msg_t * msg);
@@ -390,6 +391,10 @@ void slurmctld_req (slurm_msg_t * msg)
 		_slurm_rpc_job_notify(msg);
 		slurm_free_job_notify_msg(msg->data);
 		break;
+	case REQUEST_SET_DEBUG_FLAGS:
+		_slurm_rpc_set_debug_flags(msg);
+		slurm_free_set_debug_flags_msg(msg->data);
+		break;
 	case REQUEST_SET_DEBUG_LEVEL:
 		_slurm_rpc_set_debug_level(msg);
 		slurm_free_set_debug_level_msg(msg->data);
@@ -3833,13 +3838,42 @@ inline static void _slurm_rpc_job_notify(slurm_msg_t * msg)
 	slurm_send_rc_msg(msg, error_code);
 }
 
-/* defined in controller.c */
+inline static void _slurm_rpc_set_debug_flags(slurm_msg_t *msg)
+{
+	uid_t uid = g_slurm_auth_get_uid(msg->auth_cred, NULL);
+	slurmctld_lock_t config_write_lock =
+		{ WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
+	set_debug_flags_msg_t *request_msg =
+		(set_debug_flags_msg_t *) msg->data;
+	uint32_t debug_flags;
+	char *flag_string;
+
+	debug2("Processing RPC: REQUEST_SET_DEBUG_FLAGS from uid=%d", uid);
+	if (!validate_super_user(uid)) {
+		error("set debug flags request from non-super user uid=%d",
+		      uid);
+		slurm_send_rc_msg(msg, EACCES);
+		return;
+	}
+
+	lock_slurmctld (config_write_lock);
+	debug_flags = slurm_get_debug_flags();
+	debug_flags &= (~request_msg->debug_flags_minus);
+	debug_flags |= request_msg->debug_flags_plus;
+	slurm_set_debug_flags(debug_flags);
+	unlock_slurmctld (config_write_lock);
+	flag_string = debug_flags2str(debug_flags);
+	info("Set DebugFlags to %s", flag_string);
+	xfree(flag_string);
+	slurm_send_rc_msg(msg, SLURM_SUCCESS);
+}
+
 inline static void _slurm_rpc_set_debug_level(slurm_msg_t *msg)
 {
 	int debug_level, old_debug_level;
 	uid_t uid = g_slurm_auth_get_uid(msg->auth_cred, NULL);
-	slurmctld_lock_t config_read_lock =
-		{ READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
+	slurmctld_lock_t config_write_lock =
+		{ WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
 	set_debug_level_msg_t *request_msg =
 		(set_debug_level_msg_t *) msg->data;
 	log_options_t log_opts = LOG_OPTS_INITIALIZER;
@@ -3858,7 +3892,7 @@ inline static void _slurm_rpc_set_debug_level(slurm_msg_t *msg)
 	debug_level = MIN (request_msg->debug_level, (LOG_LEVEL_END - 1));
 	debug_level = MAX (debug_level, LOG_LEVEL_QUIET);
 
-	lock_slurmctld (config_read_lock);
+	lock_slurmctld (config_write_lock);
 	if (slurmctld_config.daemonize) {
 		log_opts.stderr_level = LOG_LEVEL_QUIET;
 		if (slurmctld_conf.slurmctld_logfile) {
@@ -3877,7 +3911,7 @@ inline static void _slurm_rpc_set_debug_level(slurm_msg_t *msg)
 			log_opts.logfile_level = LOG_LEVEL_QUIET;
 	}
 	log_alter(log_opts, LOG_DAEMON, slurmctld_conf.slurmctld_logfile);
-	unlock_slurmctld (config_read_lock);
+	unlock_slurmctld (config_write_lock);
 
 	conf = slurm_conf_lock();
 	old_debug_level = conf->slurmctld_debug;
-- 
GitLab