From a86f70b1ff3a82fea59276dd4d7e9171457c3eaa Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Thu, 25 Mar 2010 16:09:52 +0000 Subject: [PATCH] Add support for slurmctld and slurmd option of "-n <value>" to reset the daemon's nice value. --- NEWS | 2 ++ RELEASE_NOTES | 3 +++ doc/man/man8/slurmctld.8 | 6 +++++- doc/man/man8/slurmd.8 | 12 +++++++++--- src/slurmctld/controller.c | 30 +++++++++++++++++++++++++++++- src/slurmd/slurmd/slurmd.c | 32 ++++++++++++++++++++++++++++++-- src/slurmd/slurmd/slurmd.h | 5 +++-- 7 files changed, 81 insertions(+), 9 deletions(-) diff --git a/NEWS b/NEWS index 5da2be3cbe7..f54bf276ddd 100644 --- a/NEWS +++ b/NEWS @@ -24,6 +24,8 @@ documents those changes that are of interest to users and admins. partitions. -- Added sacctmgr list events which will list events that have happened on clusters in accounting. + -- Add support for slurmctld and slurmd option of "-n <value>" to reset the + daemon's nice value. * Changes in SLURM 2.2.0.pre3 ============================= diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 2aecfa25c59..a5c9f773511 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -93,6 +93,9 @@ COMMAND CHANGES (see man pages for details) * scontrol now has the ability to change a job step's time limit. +* Add support for slurmctld and slurmd option of "-n <value>" to reset the + daemon's nice value. + BLUEGENE SPECIFIC CHANGES ========================= diff --git a/doc/man/man8/slurmctld.8 b/doc/man/man8/slurmctld.8 index b751282393d..fcd824c475e 100644 --- a/doc/man/man8/slurmctld.8 +++ b/doc/man/man8/slurmctld.8 @@ -1,4 +1,4 @@ -.TH SLURMCTLD "8" "June 2006" "slurmctld 2.0" "Slurm components" +.TH SLURMCTLD "8" "March 2010" "slurmctld 2.2" "Slurm components" .SH "NAME" slurmctld \- The central management daemon of Slurm. .SH "SYNOPSIS" @@ -32,6 +32,10 @@ Help; print a brief summary of command options. \fB\-L <file>\fR Write log messages to the specified file. 
+.TP +\fB\-n <value>\fR +Set the daemon's nice value to the specified value, typically a negative number. + .TP \fB\-R\fR Recover full state from last checkpoint: jobs, node, and partition state. diff --git a/doc/man/man8/slurmd.8 b/doc/man/man8/slurmd.8 index 6cfde1df931..2eb3e3d5085 100644 --- a/doc/man/man8/slurmd.8 +++ b/doc/man/man8/slurmd.8 @@ -1,4 +1,4 @@ -.TH SLURMD "8" "March 2009" "slurmd 2.0" "Slurm components" +.TH SLURMD "8" "March 2010" "slurmd 2.2" "Slurm components" .SH "NAME" slurmd \- The compute node daemon for SLURM. @@ -19,7 +19,7 @@ abnormally. .TP \fB\-d <file>\fR Specify the fully qualified pathname to the \fBslurmstepd\fR program to be used -for sheperding user job steps. This can be useful for testing purposes. +for shepherding user job steps. This can be useful for testing purposes. .TP \fB\-D\fR Run slurmd in the foreground. Error and debug messages will be copied to stderr. @@ -39,6 +39,12 @@ paging of the slurmd process. This may help in cases where nodes are marked DOWN during periods of heavy swap activity. If the mlockall(2) system call is not available, an error will be printed to the log and slurmd will continue as normal. + +.TP +\fB\-n <value>\fR +Set the daemon's nice value to the specified value, typically a negative number. +Also note the \fBPropagatePrioProcess\fR configuration parameter. + .TP \fB\-v\fR Verbose operation. Multiple \-v's increase verbosity. @@ -66,7 +72,7 @@ configuration file, \fBslurm.conf\fR. .SH "COPYING" Copyright (C) 2002\-2007 The Regents of the University of California. -Copyright (C) 2008\-2009 Lawrence Livermore National Security. +Copyright (C) 2008\-2010 Lawrence Livermore National Security. Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). CODE\-OCEC\-09\-009. All rights reserved. 
.LP diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 9439e0b5624..8c079d3ad52 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -162,6 +162,7 @@ static int daemonize = DEFAULT_DAEMONIZE; static int debug_level = 0; static char *debug_logfile = NULL; static bool dump_core = false; +static int new_nice = 0; static char node_name[MAX_SLURM_NAME]; static int recover = DEFAULT_RECOVER; static pthread_cond_t server_thread_cond = PTHREAD_COND_INITIALIZER; @@ -196,6 +197,7 @@ static void * _slurmctld_background(void *no_data); static void * _slurmctld_rpc_mgr(void *no_data); static void * _slurmctld_signal_hand(void *no_data); inline static void _update_cred_key(void); +static void _update_nice(void); inline static void _usage(char *prog_name); static bool _valid_controller(void); static bool _wait_for_server_thread(void); @@ -228,6 +230,7 @@ int main(int argc, char *argv[]) slurm_conf_reinit(slurm_conf_filename); update_logging(); + _update_nice(); _kill_old_slurmctld(); /* @@ -1488,9 +1491,10 @@ extern int optind, opterr, optopt; static void _parse_commandline(int argc, char *argv[]) { int c = 0; + char *tmp_char; opterr = 0; - while ((c = getopt(argc, argv, "cdDf:hL:rRvV")) != -1) + while ((c = getopt(argc, argv, "cdDf:hL:n:rRvV")) != -1) switch (c) { case 'c': recover = 0; @@ -1512,6 +1516,11 @@ static void _parse_commandline(int argc, char *argv[]) case 'L': debug_logfile = xstrdup(optarg); break; + case 'n': + new_nice = strtol(optarg, &tmp_char, 10); + if (tmp_char[0] != '\0') + new_nice = NO_VAL; + break; case 'r': recover = 1; bg_recover = 1; @@ -1676,6 +1685,25 @@ void update_logging(void) slurmctld_conf.sched_logfile); } +/* Reset slurmctld nice value */ +static void _update_nice(void) +{ + int cur_nice; + id_t pid; + + if (new_nice == NO_VAL) { + error("Invalid option for -n option (nice value), ignored"); + return; + } + + pid = getpid(); + cur_nice = getpriority(PRIO_PROCESS, pid); + if (cur_nice == new_nice) + 
return; + if (setpriority(PRIO_PROCESS, pid, new_nice)) + error("Unable to reset nice value to %d: %m", new_nice); +} + /* Kill the currently running slurmctld * NOTE: No need to lock the config data since we are still single-threaded */ static void diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c index e0ca9f816eb..977debbb937 100644 --- a/src/slurmd/slurmd/slurmd.c +++ b/src/slurmd/slurmd/slurmd.c @@ -93,7 +93,7 @@ #include "src/slurmd/common/task_plugin.h" #include "src/slurmd/common/set_oomadj.h" -#define GETOPT_ARGS "cd:Df:hL:MN:vV" +#define GETOPT_ARGS "cd:Df:hL:Mn:N:vV" #ifndef MAXHOSTNAMELEN # define MAXHOSTNAMELEN 64 @@ -157,6 +157,7 @@ static int _slurmd_fini(void); static void _spawn_registration_engine(void); static void _term_handler(int); static void _update_logging(void); +static void _update_nice(void); static void _usage(void); static void _wait_for_all_threads(void); @@ -746,6 +747,8 @@ _read_config(void) conf->block_map_size = 0; _update_logging(); + _update_nice(); + get_procs(&conf->actual_cpus); get_cpuinfo(conf->actual_cpus, &conf->actual_sockets, @@ -1027,6 +1030,7 @@ static void _process_cmdline(int ac, char **av) { int c; + char *tmp_char; conf->prog = xbasename(av[0]); @@ -1054,6 +1058,11 @@ _process_cmdline(int ac, char **av) case 'M': conf->mlock_pages = 1; break; + case 'n': + conf->nice = strtol(optarg, &tmp_char, 10); + if (tmp_char[0] != '\0') + conf->nice = NO_VAL; + break; case 'N': conf->node_name = xstrdup(optarg); break; @@ -1465,7 +1474,7 @@ _kill_old_slurmd(void) } } -/* Reset slurmctld logging based upon configuration parameters */ +/* Reset slurmd logging based upon configuration parameters */ static void _update_logging(void) { log_options_t *o = &conf->log_opts; @@ -1501,6 +1510,25 @@ static void _update_logging(void) log_alter(conf->log_opts, SYSLOG_FACILITY_DAEMON, conf->logfile); } +/* Reset slurmd nice value */ +static void _update_nice(void) +{ + int cur_nice; + id_t pid; + + if (conf->nice == 
NO_VAL) { + error("Invalid option for -n option (nice value), ignored"); + return; + } + + pid = getpid(); + cur_nice = getpriority(PRIO_PROCESS, pid); + if (cur_nice == conf->nice) + return; + if (setpriority(PRIO_PROCESS, pid, conf->nice)) + error("Unable to reset nice value to %d: %m", conf->nice); +} + /* * Lock the fork mutex to protect fork-critical regions */ diff --git a/src/slurmd/slurmd/slurmd.h b/src/slurmd/slurmd/slurmd.h index 89d349a85ad..d890345eb8a 100644 --- a/src/slurmd/slurmd/slurmd.h +++ b/src/slurmd/slurmd/slurmd.h @@ -3,7 +3,7 @@ * $Id$ ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. - * Copyright (C) 2008-2009 Lawrence Livermore National Security. + * Copyright (C) 2008-2010 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Mark Grondona <mgrondona@llnl.gov>. * CODE-OCEC-09-009. All rights reserved. @@ -94,9 +94,10 @@ typedef struct slurmd_config { uint16_t block_map_size; /* size of block map */ uint16_t *block_map; /* abstract->machine block map */ uint16_t *block_map_inv; /* machine->abstract (inverse) map */ - uint16_t cr_type; /* Consumable Resource Type: * + uint16_t cr_type; /* Consumable Resource Type: * * CR_SOCKET, CR_CORE, CR_MEMORY, * * CR_DEFAULT, etc. */ + int nice; /* command line nice value spec */ char *node_name; /* node name */ char *node_addr; /* node's address */ char *node_topo_addr; /* node's topology address */ -- GitLab