diff --git a/NEWS b/NEWS index c8a4470440fdb0ddb18370455e53dcc459db3f55..b25b3aa690c5666460b67562abb2f0917c2a60bb 100644 --- a/NEWS +++ b/NEWS @@ -9,6 +9,9 @@ documents those changes that are of interest to users and admins. -- Fix bug that was setting a job's requeue value on any update of the job using the "scontrol update" command. The invalid value of an updated job prevents it's recovery when slurmctld restarts. + -- Add support for cluster-wide consumable resources. See "Licenses" + parameter in slurm.conf man page and "--licenses" option in salloc, + sbatch and srun man pages. * Changes in SLURM 1.3.0 ======================== diff --git a/doc/html/configurator.html.in b/doc/html/configurator.html.in index 97a67377379ed317fa7fe25319d9767a9cad6795..3bd70de6fea44e18c754e41f12a07be8b49d7432 100644 --- a/doc/html/configurator.html.in +++ b/doc/html/configurator.html.in @@ -20,8 +20,6 @@ details. You should have received a copy of the GNU General Public License along with SLURM; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - -$Id$ --> <HTML> <HEAD><TITLE>SLURM System Configuration Tool</TITLE> @@ -148,6 +146,7 @@ function displayfile() get_field("JobCredentialPublicCertificate", document.config.public_key) + "<br>" + "#JobFileAppend=0 <br>" + "#JobRequeue=1 <br>" + + "#Licenses=foo*4,bar <br>" + "#MailProg=/bin/mail <br>" + "#MaxJobCount=5000 <br>" + "MpiDefault=" + get_radio_value(document.config.mpi_default) + "<br>" + @@ -778,7 +777,7 @@ before terminating all remaining tasks. 
A value of zero indicates unlimited wait <P> </FORM> <HR> -<P class="footer">UCRL-WEB-225274<BR> -Last modified 10 March 2008</P> +<P class="footer">LLNL-WEB-402631<BR> +Last modified 1 April 2008</P> </BODY> diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1 index 618bbdc4226bdcf4fd99664863a5b7a4e472d31f..6bb26e55530ad502a4184f9f5ef34b334cabf2de 100644 --- a/doc/man/man1/salloc.1 +++ b/doc/man/man1/salloc.1 @@ -1,4 +1,4 @@ -.TH "salloc" "1" "SLURM 1.2" "February 2008" "SLURM Commands" +.TH "salloc" "1" "SLURM 1.3" "April 2008" "SLURM Commands" .SH "NAME" .LP salloc \- Obtain a SLURM job allocation (a set of nodes), execute a command, and then release the allocation when the command is finished. @@ -265,6 +265,15 @@ new job steps on the remaining nodes in their allocation. By default SLURM terminates the entire job allocation if any node fails in its range of allocated nodes. +.TP +\fB\-L\fR, \fB\-\-licenses\fR= +Specification of licenses (or other resources available on all +nodes of the cluster) which must be allocated to this job. +License names can be followed by an asterisk and count +(the default count is one). +Multiple license names should be comma separated (e.g. +"\-\-licenses=foo*4,bar"). + .TP \fB\-m\fR, \fB\-\-distribution\fR= (\fIblock\fR|\fIcyclic\fR|\fIarbitrary\fR|\fIplane=<options>\fR) diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index 3c4b5960aab583efb8f06f514a0238a6452bef4d..e00f724a62e3003c8f0acdbcfd9ebdcd68e450be 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -1,4 +1,4 @@ -.TH "sbatch" "1" "SLURM 1.3" "February 2008" "SLURM Commands" +.TH "sbatch" "1" "SLURM 1.3" "April 2008" "SLURM Commands" .SH "NAME" .LP sbatch \- Submit a batch script to SLURM. @@ -289,6 +289,15 @@ new job steps on the remaining nodes in their allocation. By default SLURM terminates the entire job allocation if any node fails in its range of allocated nodes. 
+.TP +\fB\-L\fR, \fB\-\-licenses\fR= +Specification of licenses (or other resources available on all +nodes of the cluster) which must be allocated to this job. +License names can be followed by an asterisk and count +(the default count is one). +Multiple license names should be comma separated (e.g. +"\-\-licenses=foo*4,bar"). + .TP \fB\-m\fR, \fB\-\-distribution\fR= (\fIblock\fR|\fIcyclic\fR|\fIarbitrary\fR|\fIplane=<options>\fR) diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index b94d390df4dd00216adb46e221d2a8b59623441e..aaac754d4041e9624f3c06d25f2b215dfa2709c0 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -1,6 +1,6 @@ .\" $Id$ .\" -.TH SRUN "1" "Frebruary 2008" "srun 1.3" "slurm components" +.TH SRUN "1" "April 2008" "srun 1.3" "slurm components" .SH "NAME" srun \- run parallel jobs @@ -356,6 +356,15 @@ from remote tasks is line\-buffered directly to the stdout and stderr of The \fB\-\-label\fR option will prepend lines of output with the remote task id. +.TP +\fB\-L\fR, \fB\-\-licenses\fR= +Specification of licenses (or other resources available on all +nodes of the cluster) which must be allocated to this job. +License names can be followed by an asterisk and count +(the default count is one). +Multiple license names should be comma separated (e.g. +"\-\-licenses=foo*4,bar"). + .TP \fB\-m\fR, \fB\-\-relative\fR (\fIblock\fR|\fIcyclic\fR|\fIarbitrary\fR|\fIplane=<options>\fR) diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 542af8b0adde328cf0cfc47db685a44808a71184..0af7fa482b19bd444128a8c68f611549c4ec2899 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -1,4 +1,4 @@ -.TH "slurm.conf" "5" "February 2008" "slurm.conf 1.3" "Slurm configuration file" +.TH "slurm.conf" "5" "April 2008" "slurm.conf 1.3" "Slurm configuration file" .SH "NAME" slurm.conf \- Slurm configuration file .SH "DESCRIPTION" @@ -472,6 +472,15 @@ in the interval specified, it will be forcibly terminated. 
The default value is 30 seconds. May not exceed 65533. +.TP +\fBLicenses\fR +Specification of licenses (or other resources available on all +nodes of the cluster) which can be allocated to jobs. +License names can be followed by an asterisk and count +(the default count is one). +Multiple license names should be comma separated (e.g. +"Licenses=foo*4,bar"). + .TP \fBMailProg\fR Fully qualified pathname to the program used to send email per user request. diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index dc0b309402f0d3391bcf1b407f09a0fb57e6284d..f3f587ee1a175f72dff98b4756aa94b27552fa5a 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -220,6 +220,7 @@ enum job_state_reason { WAIT_PART_STATE, /* requested partition is down */ WAIT_HELD, /* job is held, priority==0 */ WAIT_TIME, /* job waiting for specific begin time */ + WAIT_LICENSES, /* job is waiting for licenses */ WAIT_TBD1, WAIT_TBD2, FAIL_DOWN_PARTITION, /* partition for job is DOWN */ diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h index a882fb624c4979b7bd8f4c3002cdbc2657fa42f8..9ea452bf454d3ba0266943d3cb118bd1a26f0936 100644 --- a/slurm/slurm_errno.h +++ b/slurm/slurm_errno.h @@ -154,6 +154,7 @@ enum { ESLURM_INVALID_BANK_ACCOUNT, ESLURM_INVALID_TASK_MEMORY, ESLURM_INVALID_ACCOUNT, + ESLURM_INVALID_LICENSES, /* switch specific error codes, specific values defined in plugin module */ ESLURM_SWITCH_MIN = 3000, diff --git a/src/api/Makefile.in b/src/api/Makefile.in index cfeee9318e206c1a443713307ebc6a7212b6cb5b..9c8c09c346d5497eab2ce8b59eefb8cbd6fb7c62 100644 --- a/src/api/Makefile.in +++ b/src/api/Makefile.in @@ -382,6 +382,7 @@ libslurm_la_LDFLAGS = \ -version-info $(current):$(rev):$(age) \ $(OTHER_FLAGS) + # # The libpmi_la_LIBADD specification below causes libpmi.la to relink # when running "make install", but removing it prevents essential slurm diff --git a/src/common/read_config.c b/src/common/read_config.c index 
ed49d58b130eceb83f60cf23d2941e84f97722b1..462c02db59333662ecd110017d44e006cb6a8d3d 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -1635,7 +1635,7 @@ validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl) if (!s_p_get_uint16(&conf->kill_wait, "KillWait", hashtbl)) conf->kill_wait = DEFAULT_KILL_WAIT; - s_p_get_string(&conf->mail_prog, "Licenses", hashtbl); + s_p_get_string(&conf->licenses, "Licenses", hashtbl); if (!s_p_get_string(&conf->mail_prog, "MailProg", hashtbl)) conf->mail_prog = xstrdup(DEFAULT_MAIL_PROG); diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index 467dd863e8fd00dae2be145a460c2543b5224e05..cd9e186fa9bbd81c7be99beeb3b44eb5fde1517f 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -204,6 +204,8 @@ static slurm_errtab_t slurm_errtab[] = { "Memory required by task is not available" }, { ESLURM_INVALID_ACCOUNT, "Job has invalid account" }, + { ESLURM_INVALID_LICENSES, + "Job has invalid license specification" }, /* slurmd error codes */ diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 9baf34bf362e904eeba73ef2890c1dc6c3b0f8d2..efb1e6cc45765c676e9015eac8cb4b0c1dd0066e 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -594,6 +594,8 @@ extern char *job_reason_string(enum job_state_reason inx) return "JobHeld"; case WAIT_TIME: return "BeginTime"; + case WAIT_LICENSES: + return "Licenses"; case FAIL_DOWN_PARTITION: return "PartitionDown"; case FAIL_DOWN_NODE: diff --git a/src/salloc/opt.c b/src/salloc/opt.c index 34f7bb7b0e7f30ce59f36cca8690cf112a729879..c97e3f062164cec79de1020f25d7b5d9286ab051 100644 --- a/src/salloc/opt.c +++ b/src/salloc/opt.c @@ -485,6 +485,7 @@ void set_options(const int argc, char **argv) {"job-name", required_argument, 0, 'J'}, {"no-kill", no_argument, 0, 'k'}, {"kill-command", optional_argument, 0, 'K'}, + {"licenses", required_argument, 0, 'L'}, {"distribution", 
required_argument, 0, 'm'}, {"tasks", required_argument, 0, 'n'}, {"ntasks", required_argument, 0, 'n'}, @@ -541,7 +542,7 @@ void set_options(const int argc, char **argv) {"get-user-env", optional_argument, 0, LONG_OPT_GET_USER_ENV}, {NULL, 0, 0, 0} }; - char *opt_string = "+a:B:c:C:d:D:F:g:hHIJ:kK:m:n:N:Op:qR:st:uU:vVw:W:x:"; + char *opt_string = "+a:B:c:C:d:D:F:g:hHIJ:kK:L:m:n:N:Op:qR:st:uU:vVw:W:x:"; opt.progname = xbasename(argv[0]); optind = 0; @@ -628,6 +629,10 @@ void set_options(const int argc, char **argv) } opt.kill_command_signal_set = true; break; + case 'L': + xfree(opt.licenses); + opt.licenses = xstrdup(optarg); + break; case 'm': opt.distribution = verify_dist_type(optarg, &opt.plane_size); @@ -1320,7 +1325,7 @@ static void _usage(void) " [[-c cpus-per-node] [-r n] [-p partition] [--hold] [-t minutes]\n" " [--immediate] [--no-kill] [--overcommit] [-D path]\n" " [--share] [-J jobname] [--jobid=id]\n" -" [--verbose] [--gid=group] [--uid=user]\n" +" [--verbose] [--gid=group] [--uid=user] [--licenses=names]\n" " [-W sec] [--minsockets=n] [--mincores=n] [--minthreads=n]\n" " [--contiguous] [--mincpus=n] [--mem=MB] [--tmp=MB] [-C list]\n" " [--account=name] [--dependency=type:jobid] [--comment=name]\n" @@ -1369,6 +1374,7 @@ static void _help(void) " -U, --account=name charge job to specified account\n" " --begin=time defer job until HH:MM DD/MM/YY\n" " --comment=name arbitrary comment\n" +" -L, --licenses=names required license, comma separated\n" " --mail-type=type notify on state change: BEGIN, END, FAIL or ALL\n" " --mail-user=user who to send email notification for job state changes\n" " --bell ring the terminal bell when the job is allocated\n" diff --git a/src/salloc/opt.h b/src/salloc/opt.h index ab76f5ba6bda338fc73dd7031ba714107bde8919..3f964a93797c0e0f3b5d8d3b21d343ea0011f51b 100644 --- a/src/salloc/opt.h +++ b/src/salloc/opt.h @@ -93,6 +93,7 @@ typedef struct salloc_options { bool hold; /* --hold, -H */ bool no_kill; /* --no-kill, -k */ int 
acctg_freq; /* --acctg-freq=secs */ + char *licenses; /* --licenses, -L */ bool overcommit; /* --overcommit -O */ int kill_command_signal;/* --kill-command, -K */ bool kill_command_signal_set; diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c index 405e4c820d8f9005b10df5f4ea688411a6aeebb9..df7531c47e3b62143a97b733d1e587143bcb87ba 100644 --- a/src/salloc/salloc.c +++ b/src/salloc/salloc.c @@ -308,6 +308,8 @@ static int fill_job_desc_from_opts(job_desc_msg_t *desc) desc->task_dist = opt.distribution; if (opt.plane_size != NO_VAL) desc->plane_size = opt.plane_size; + if (opt.licenses) + desc->licenses = xstrdup(opt.licenses); if (opt.nice) desc->nice = NICE_OFFSET + opt.nice; desc->mail_type = opt.mail_type; diff --git a/src/sbatch/opt.c b/src/sbatch/opt.c index 89099797f83852768a0b493123524ecb8819d257..976c4acb7d1bbd4731d42670c8c5b3e2fcbf9da0 100644 --- a/src/sbatch/opt.c +++ b/src/sbatch/opt.c @@ -485,6 +485,7 @@ static struct option long_options[] = { {"immediate", no_argument, 0, 'I'}, {"job-name", required_argument, 0, 'J'}, {"no-kill", no_argument, 0, 'k'}, + {"licenses", required_argument, 0, 'L'}, {"distribution", required_argument, 0, 'm'}, {"tasks", required_argument, 0, 'n'}, {"ntasks", required_argument, 0, 'n'}, @@ -545,7 +546,7 @@ static struct option long_options[] = { }; static char *opt_string = - "+a:bB:c:C:d:D:e:F:g:hHi:IJ:km:n:N:o:Op:qR:st:uU:vVw:x:"; + "+a:bB:c:C:d:D:e:F:g:hHi:IJ:kL:m:n:N:o:Op:qR:st:uU:vVw:x:"; /* @@ -998,6 +999,10 @@ static void _set_options(int argc, char **argv) case 'k': opt.no_kill = true; break; + case 'L': + xfree(opt.licenses); + opt.licenses = xstrdup(optarg); + break; case 'm': opt.distribution = verify_dist_type(optarg, &opt.plane_size); @@ -2103,7 +2108,7 @@ static void _usage(void) "Usage: sbatch [-N nnodes] [-n ntasks]\n" " [-c ncpus] [-r n] [-p partition] [--hold] [-t minutes]\n" " [-D path] [--immediate] [--no-kill] [--overcommit]\n" -" [--input file] [--output file] [--error file]\n" +" [--input file] 
[--output file] [--error file] [--licenses=names]\n" " [--workdir=directory] [--share] [-m dist] [-J jobname]\n" " [--jobid=id] [--verbose] [--gid=group] [--uid=user]\n" " [-W sec] [--minsockets=n] [--mincores=n] [--minthreads=n]\n" @@ -2153,6 +2158,7 @@ static void _help(void) " -U, --account=name charge job to specified account\n" " --begin=time defer job until HH:MM DD/MM/YY\n" " --comment=name arbitrary comment\n" +" -L, --licenses=names required license, comma separated\n" " --mail-type=type notify on state change: BEGIN, END, FAIL or ALL\n" " --mail-user=user who to send email notification for job state changes\n" " --gid=group_id group ID to run job as (user root only)\n" diff --git a/src/sbatch/opt.h b/src/sbatch/opt.h index 6f458d0e5c0114416e88d765a4cf45bedd6ef937..50a8ac435ec54e3ed5a9f58a8104d2af5dd42d6a 100644 --- a/src/sbatch/opt.h +++ b/src/sbatch/opt.h @@ -102,6 +102,7 @@ typedef struct sbatch_options { int acctg_freq; /* --acctg-freq=secs */ bool overcommit; /* --overcommit -O */ uint16_t shared; /* --share, -s */ + char *licenses; /* --licenses, -L */ int quiet; int verbose; char *wrap; diff --git a/src/sbatch/sbatch.c b/src/sbatch/sbatch.c index 07439f6246f61ff5e4d2fb9d0f19ef6df301aa01..f007a9aacae54f7a4fdc84d8b81acb4bcd64e200 100644 --- a/src/sbatch/sbatch.c +++ b/src/sbatch/sbatch.c @@ -164,6 +164,8 @@ static int fill_job_desc_from_opts(job_desc_msg_t *desc) desc->exc_nodes = opt.exc_nodes; desc->partition = opt.partition; desc->min_nodes = opt.min_nodes; + if (opt.licenses) + desc->licenses = xstrdup(opt.licenses); if (opt.max_nodes) desc->max_nodes = opt.max_nodes; desc->user_id = opt.uid; diff --git a/src/scontrol/update_job.c b/src/scontrol/update_job.c index b6839244ccc7fdb358dd5288fe7c836b4c5f89ff..49ae2df8ab6a1baaf52ce4c9e21d80307275241b 100644 --- a/src/scontrol/update_job.c +++ b/src/scontrol/update_job.c @@ -1,7 +1,8 @@ /*****************************************************************************\ * update_job.c - update job functions 
for scontrol. ***************************************************************************** - * Copyright (C) 2002-2006 The Regents of the University of California. + * Copyright (C) 2002-2007 The Regents of the University of California. + * Copyright (C) 2008 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> * LLNL-CODE-402394. @@ -439,6 +440,10 @@ scontrol_update_job (int argc, char *argv[]) update_cnt++; } #endif + else if (strncasecmp(argv[i], "Licenses=", 9) == 0) { + job_msg.licenses = &argv[i][9]; + update_cnt++; + } else if (strncasecmp(argv[i], "StartTime=", 10) == 0) { job_msg.begin_time = parse_time(&argv[i][10]); update_cnt++; diff --git a/src/slurmctld/Makefile.am b/src/slurmctld/Makefile.am index 39181b917e79f8bda58e36db13a1f0b974ded8be..3f5a6f9395d153651378a7f66b0eb0f8ffc310a4 100644 --- a/src/slurmctld/Makefile.am +++ b/src/slurmctld/Makefile.am @@ -21,6 +21,8 @@ slurmctld_SOURCES = \ job_mgr.c \ job_scheduler.c \ job_scheduler.h \ + licenses.c \ + licenses.h \ locks.c \ locks.h \ node_mgr.c \ diff --git a/src/slurmctld/Makefile.in b/src/slurmctld/Makefile.in index efe47e4c7952021e7caff3e8b63242da6e2b1f5a..fbee0e55171a0f67a67dacecb478bb494c74a79c 100644 --- a/src/slurmctld/Makefile.in +++ b/src/slurmctld/Makefile.in @@ -72,11 +72,12 @@ sbinPROGRAMS_INSTALL = $(INSTALL_PROGRAM) PROGRAMS = $(sbin_PROGRAMS) am_slurmctld_OBJECTS = agent.$(OBJEXT) backup.$(OBJEXT) \ controller.$(OBJEXT) job_mgr.$(OBJEXT) job_scheduler.$(OBJEXT) \ - locks.$(OBJEXT) node_mgr.$(OBJEXT) node_scheduler.$(OBJEXT) \ - partition_mgr.$(OBJEXT) ping_nodes.$(OBJEXT) \ - power_save.$(OBJEXT) proc_req.$(OBJEXT) read_config.$(OBJEXT) \ - sched_plugin.$(OBJEXT) srun_comm.$(OBJEXT) \ - state_save.$(OBJEXT) step_mgr.$(OBJEXT) trigger_mgr.$(OBJEXT) + licenses.$(OBJEXT) locks.$(OBJEXT) node_mgr.$(OBJEXT) \ + node_scheduler.$(OBJEXT) partition_mgr.$(OBJEXT) \ + ping_nodes.$(OBJEXT) 
power_save.$(OBJEXT) proc_req.$(OBJEXT) \ + read_config.$(OBJEXT) sched_plugin.$(OBJEXT) \ + srun_comm.$(OBJEXT) state_save.$(OBJEXT) step_mgr.$(OBJEXT) \ + trigger_mgr.$(OBJEXT) slurmctld_OBJECTS = $(am_slurmctld_OBJECTS) slurmctld_DEPENDENCIES = $(top_builddir)/src/common/libcommon.la \ $(top_builddir)/src/common/libdaemonize.la @@ -280,6 +281,8 @@ slurmctld_SOURCES = \ job_mgr.c \ job_scheduler.c \ job_scheduler.h \ + licenses.c \ + licenses.h \ locks.c \ locks.h \ node_mgr.c \ @@ -383,6 +386,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/controller.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/job_mgr.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/job_scheduler.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/licenses.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/locks.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/node_mgr.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/node_scheduler.Po@am__quote@ diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index b06a1f87c70f22959fcc6657495ca0a4133787f2..93ec25feca858dfd49a97c8f69562b8022a73bac 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -81,6 +81,7 @@ #include "src/slurmctld/agent.h" #include "src/slurmctld/job_scheduler.h" +#include "src/slurmctld/licenses.h" #include "src/slurmctld/locks.h" #include "src/slurmctld/ping_nodes.h" #include "src/slurmctld/proc_req.h" @@ -241,7 +242,10 @@ int main(int argc, char *argv[]) "jobacct_gather/none")) info("Job accounting information stored, " "but details not gathered"); - } + } + + if (license_init(slurmctld_conf.licenses) != SLURM_SUCCESS) + fatal("Invalid Licenses value: %s", slurmctld_conf.licenses); #ifndef NDEBUG # ifdef PR_SET_DUMPABLE diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 1b438bdc3b724496b152a2564488e561dcfde657..9a13ee76d06af9c069236cf173b6c05417de88d3 100644 --- 
a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -6,7 +6,7 @@ * $Id$ ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. - * Copyright (C) 2008 Lawrence Livermore National Security + * Copyright (C) 2008 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> * LLNL-CODE-402394. @@ -73,6 +73,7 @@ #include "src/slurmctld/agent.h" #include "src/slurmctld/job_scheduler.h" +#include "src/slurmctld/licenses.h" #include "src/slurmctld/locks.h" #include "src/slurmctld/node_scheduler.h" #include "src/slurmctld/proc_req.h" @@ -569,6 +570,7 @@ static int _load_job_state(Buf buffer) safe_unpack32(&exit_code, buffer); safe_unpack32(&db_index, buffer); safe_unpack32(&assoc_id, buffer); + safe_unpack_time(&start_time, buffer); safe_unpack_time(&end_time, buffer); safe_unpack_time(&suspend_time, buffer); @@ -1417,6 +1419,8 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, xassert(job_ptr); independent = job_independent(job_ptr); + if (license_job_test(job_ptr) != SLURM_SUCCESS) + independent = false; /* Avoid resource fragmentation if important */ if (independent && switch_no_frag() && @@ -1791,6 +1795,8 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run, struct job_record *job_ptr; uint32_t total_nodes, max_procs; acct_association_rec_t assoc_rec; + List license_list = NULL; + bool valid; #if SYSTEM_DIMENSIONS uint16_t geo[SYSTEM_DIMENSIONS]; @@ -1992,6 +1998,14 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run, goto cleanup; } + license_list = license_job_validate(job_desc->licenses, &valid); + if (!valid) { + info("Job's requested licenses are invalid: %s", + job_desc->licenses); + error_code = ESLURM_INVALID_LICENSES; + goto cleanup; + } + if ((error_code =_validate_job_create_req(job_desc))) goto 
cleanup; @@ -2026,6 +2040,9 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run, } else job_ptr->batch_flag = 0; + job_ptr->license_list = license_list; + license_list = NULL; + /* Insure that requested partition is valid right now, * otherwise leave job queued and provide warning code */ detail_ptr = job_ptr->details; @@ -2060,6 +2077,8 @@ static int _job_create(job_desc_msg_t * job_desc, int allocate, int will_run, cleanup: + if (license_list) + list_destroy(license_list); FREE_NULL_BITMAP(req_bitmap); FREE_NULL_BITMAP(exc_bitmap); return error_code; @@ -2921,6 +2940,8 @@ static void _list_delete_job(void *job_entry) xfree(job_ptr->account); xfree(job_ptr->resp_host); xfree(job_ptr->licenses); + if (job_ptr->license_list) + list_destroy(job_ptr->license_list); xfree(job_ptr->mail_user); xfree(job_ptr->network); xfree(job_ptr->alloc_lps); @@ -3252,12 +3273,19 @@ void reset_job_bitmaps(void) while ((job_ptr = (struct job_record *) list_next(job_iterator))) { xassert (job_ptr->magic == JOB_MAGIC); job_fail = false; - part_ptr = list_find_first(part_list, &list_find_part, - job_ptr->partition); - if (part_ptr == NULL) { - error("Invalid partition (%s) for job_id %u", - job_ptr->partition, job_ptr->job_id); + + if (job_ptr->partition == NULL) { + error("No partition for job_id %u", job_ptr->job_id); + part_ptr = NULL; job_fail = true; + } else { + part_ptr = list_find_first(part_list, &list_find_part, + job_ptr->partition); + if (part_ptr == NULL) { + error("Invalid partition (%s) for job_id %u", + job_ptr->partition, job_ptr->job_id); + job_fail = true; + } } job_ptr->part_ptr = part_ptr; @@ -4041,6 +4069,50 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) error_code = ESLURM_DISABLED; } + if (job_specs->licenses) { + List license_list = NULL; + bool valid; + license_list = license_job_validate(job_specs->licenses, + &valid); + + if (!valid) { + info("update_job: invalid licenses: %s", + job_specs->licenses); + error_code = 
ESLURM_INVALID_LICENSES; + } else if (IS_JOB_PENDING(job_ptr)) { + if (job_ptr->license_list) + list_destroy(job_ptr->license_list); + job_ptr->license_list = license_list; + xfree(job_ptr->licenses); + job_ptr->licenses = job_specs->licenses; + job_specs->licenses = NULL; /* nothing to free */ + info("update_job: setting licenses to %s for job %u", + job_ptr->licenses, job_ptr->job_id); + } else if ((job_ptr->job_state == JOB_RUNNING) && super_user) { + /* NOTE: This can result in oversubscription of + * licenses */ + license_job_return(job_ptr); + if (job_ptr->license_list) + list_destroy(job_ptr->license_list); + job_ptr->license_list = license_list; + info("update_job: changing licenses from %s to %s for " + " running job %u", + job_ptr->licenses, job_specs->licenses, + job_ptr->job_id); + xfree(job_ptr->licenses); + job_ptr->licenses = job_specs->licenses; + job_specs->licenses = NULL; /* nothing to free */ + license_job_get(job_ptr); + } else { + /* licenses are valid, but job state or user not + * allowed to make changes */ + info("update_job: could not change licenses for job %u", + job_ptr->job_id); + error_code = ESLURM_DISABLED; + list_destroy(license_list); + } + } + #ifdef HAVE_BG { uint16_t reboot = (uint16_t) NO_VAL; diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index 071d3a2003f3dd74545b5a68ba3089e4ecde1d95..da7138dde5472e75172379657acd8ea8a5ea7da5 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -46,16 +46,17 @@ #include <string.h> #include <unistd.h> +#include "src/common/assoc_mgr.h" #include "src/common/list.h" #include "src/common/macros.h" #include "src/common/node_select.h" #include "src/common/slurm_accounting_storage.h" #include "src/common/xassert.h" #include "src/common/xstring.h" -#include "src/common/assoc_mgr.h" #include "src/slurmctld/agent.h" #include "src/slurmctld/job_scheduler.h" +#include "src/slurmctld/licenses.h" #include "src/slurmctld/locks.h" #include 
"src/slurmctld/node_scheduler.h" #include "src/slurmctld/slurmctld.h" @@ -91,7 +92,7 @@ extern int build_job_queue(struct job_queue **job_queue) (job_ptr->job_state & JOB_COMPLETING) || (job_ptr->priority == 0)) /* held */ continue; - if (!job_independent(job_ptr)) /* waiting for other job */ + if (!job_independent(job_ptr)) /* can not run now */ continue; if (job_buffer_size <= job_queue_size) { job_buffer_size += 200; @@ -288,7 +289,12 @@ extern int schedule(void) failed_part_cnt)) { continue; } - + + if (license_job_test(job_ptr) != SLURM_SUCCESS) { + job_ptr->state_reason = WAIT_LICENSES; + continue; + } + if (assoc_mgr_validate_assoc_id(acct_db_conn, job_ptr->assoc_id, accounting_enforce)) { /* NOTE: This only happens if a user's account is @@ -297,7 +303,7 @@ extern int schedule(void) * very rare. */ info("schedule: JobId=%u has invalid account", job_ptr->job_id); - last_job_update = time(NULL); + last_job_update = time(NULL); job_ptr->job_state = JOB_FAILED; job_ptr->exit_code = 1; job_ptr->state_reason = FAIL_BANK_ACCOUNT; diff --git a/src/slurmctld/licenses.c b/src/slurmctld/licenses.c new file mode 100644 index 0000000000000000000000000000000000000000..f3c3bdc747d7f1f6bc506c3302890187c8feac51 --- /dev/null +++ b/src/slurmctld/licenses.c @@ -0,0 +1,391 @@ +/*****************************************************************************\ + * licenses.c - Functions for handling cluster-wide consumable resources + ***************************************************************************** + * Copyright (C) 2008 Lawrence Livermore National Security. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Morris Jette <jette@llnl.gov>, et. al. + * LLNL-CODE-402394. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. 
+ * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+\*****************************************************************************/
+
+#include <ctype.h>
+#include <errno.h>
+#include <pthread.h>
+#include <slurm/slurm_errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "src/common/list.h"
+#include "src/common/log.h"
+#include "src/common/macros.h"
+#include "src/common/xmalloc.h"
+#include "src/common/xstring.h"
+#include "src/slurmctld/licenses.h"
+#include "src/slurmctld/slurmctld.h"
+
+#define _DEBUG 0
+
+/* Licenses configured for the cluster; NULL when none are configured.
+ * Every access in this file is made with license_mutex held. */
+List license_list = (List) NULL;
+static pthread_mutex_t license_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* Print all licenses on a list (compiled to a no-op unless _DEBUG is set)
+ * IN header - text identifying the caller, logged before the records
+ * IN licenses - list of licenses_t records, may be NULL */
+static inline void _licenses_print(char *header, List licenses)
+{
+#if _DEBUG
+	ListIterator iter;
+	licenses_t *license_entry;
+
+	info("licenses: %s", header);
+	if (licenses == NULL)
+		return;
+
+	iter = list_iterator_create(licenses);
+	if (iter == NULL)
+		fatal("malloc failure from list_iterator_create");
+	while ((license_entry = (licenses_t *) list_next(iter))) {
+		info("name:%s total:%u used:%u", license_entry->name,
+		     license_entry->total, license_entry->used);
+	}
+	list_iterator_destroy(iter);
+#endif
+}
+
+/* Free a licenses_t record and its name (for use by list_destroy) */
+static void _license_free_rec(void *x)
+{
+	licenses_t *license_entry = (licenses_t *) x;
+
+	if (license_entry) {
+		xfree(license_entry->name);
+		xfree(license_entry);
+	}
+}
+
+/* Find a licenses_t record by license name (for use by list_find_first)
+ * RET 1 if the record's name equals key, otherwise 0 (also when either
+ *	name is NULL) */
+static int _license_find_rec(void *x, void *key)
+{
+	licenses_t *license_entry = (licenses_t *) x;
+	char *name = (char *) key;
+
+	if ((license_entry->name == NULL) || (name == NULL))
+		return 0;
+	if (strcmp(license_entry->name, name))
+		return 0;
+	return 1;
+}
+
+/* Look up a license by name on a list that may be NULL.
+ * A NULL list means that no licenses exist, so nothing can match; the
+ * list must not be passed on to list_find_first() in that case.
+ * RET the matching record or NULL if not found */
+static licenses_t *_license_lookup(List list, char *name)
+{
+	if (list == NULL)
+		return NULL;
+	return (licenses_t *) list_find_first(list, _license_find_rec, name);
+}
+
+/* Given a license string, return a list of licenses_t records
+ * IN licenses - comma (or semicolon) separated names, each optionally
+ *	followed by "*<count>", e.g. "foo*4,bar". The default count is one.
+ * OUT valid - false if the string can not be parsed: white space in a
+ *	token, an empty name, or a count that is not a plain decimal number
+ *	in the range 1 to 65535
+ * RET list of records, or NULL if the string is empty or invalid.
+ *	The list must be destroyed by the caller. */
+static List _build_license_list(char *licenses, bool *valid)
+{
+	int i;
+	char *end_num, *tmp_str, *token, *last;
+	licenses_t *license_entry;
+	List lic_list;
+
+	*valid = true;
+	if ((licenses == NULL) || (licenses[0] == '\0'))
+		return NULL;
+
+	lic_list = list_create(_license_free_rec);
+	tmp_str = xstrdup(licenses);
+	token = strtok_r(tmp_str, ",;", &last);
+	while (token && *valid) {
+		/* Parse into a long so that out of range and negative
+		 * counts can be detected before the value is stored */
+		long num = 1;
+		for (i=0; token[i]; i++) {
+			if (isspace((unsigned char) token[i])) {
+				*valid = false;
+				break;
+			}
+			if (token[i] == '*') {
+				/* Split the name from the count */
+				token[i++] = '\0';
+				/* Require a digit so that "", "-1", "+1"
+				 * and " 1" are all rejected */
+				if (!isdigit((unsigned char) token[i])) {
+					*valid = false;
+					break;
+				}
+				num = strtol(&token[i], &end_num, 10);
+				if (*end_num != '\0')	/* trailing junk */
+					*valid = false;
+				/* The rest of the token has been consumed.
+				 * Stop here rather than letting the loop
+				 * step beyond the end of the token. */
+				break;
+			}
+		}
+		if (!(*valid))
+			break;
+		/* The count must fit in 16 bits, as it always has */
+		if ((token[0] == '\0') || (num <= 0) || (num > 0xffff)) {
+			*valid = false;
+			break;
+		}
+		license_entry = xmalloc(sizeof(licenses_t));
+		license_entry->name = xstrdup(token);
+		license_entry->total = num;
+		list_push(lic_list, license_entry);
+		token = strtok_r(NULL, ",;", &last);
+	}
+	xfree(tmp_str);
+
+	if (*valid == false) {
+		list_destroy(lic_list);
+		lic_list = NULL;
+	}
+	return lic_list;
+}
+
+/* Initialize licenses on this system based upon slurm.conf
+ * IN licenses - value of the Licenses configuration parameter, may be NULL
+ * RET SLURM_SUCCESS. Calls fatal() if licenses were already initialized
+ *	or if the specification is invalid. */
+extern int license_init(char *licenses)
+{
+	bool valid;
+
+	slurm_mutex_lock(&license_mutex);
+	if (license_list)
+		fatal("license_list already defined");
+
+	license_list = _build_license_list(licenses, &valid);
+	if (!valid)
+		fatal("Invalid configured licenses: %s", licenses);
+
+	_licenses_print("licences_init", license_list);
+	slurm_mutex_unlock(&license_mutex);
+	return SLURM_SUCCESS;
+}
+
+
+/* Update licenses on this system based upon slurm.conf.
+ * Preserve all previously allocated licenses
+ * IN licenses - new value of the Licenses configuration parameter
+ * RET SLURM_SUCCESS. Calls fatal() if the specification is invalid. */
+extern int license_update(char *licenses)
+{
+	ListIterator iter;
+	licenses_t *license_entry, *match;
+	List new_list;
+	bool valid;
+
+	new_list = _build_license_list(licenses, &valid);
+	if (!valid)
+		fatal("Invalid configured licenses: %s", licenses);
+
+	slurm_mutex_lock(&license_mutex);
+	if (!license_list) {	/* no licenses before now */
+		license_list = new_list;
+		slurm_mutex_unlock(&license_mutex);
+		return SLURM_SUCCESS;
+	}
+
+	iter = list_iterator_create(license_list);
+	if (iter == NULL)
+		fatal("malloc failure from list_iterator_create");
+	while ((license_entry = (licenses_t *) list_next(iter))) {
+		/* new_list is NULL if every license has been removed
+		 * from the configuration */
+		match = _license_lookup(new_list, license_entry->name);
+		if (!match) {
+			info("license %s removed with %u in use",
+			     license_entry->name, license_entry->used);
+		} else {
+			/* carry the use count over to the new record */
+			match->used = license_entry->used;
+			if (match->used > match->total)
+				info("license %s count decreased", match->name);
+		}
+	}
+	list_iterator_destroy(iter);
+
+	list_destroy(license_list);
+	license_list = new_list;
+	_licenses_print("licences_update", license_list);
+	slurm_mutex_unlock(&license_mutex);
+	return SLURM_SUCCESS;
+}
+
+/* Free memory associated with licenses on this system */
+extern void license_free(void)
+{
+	slurm_mutex_lock(&license_mutex);
+	if (license_list) {
+		list_destroy(license_list);
+		license_list = (List) NULL;
+	}
+	slurm_mutex_unlock(&license_mutex);
+}
+
+/*
+ * license_job_validate - Test if the licenses required by a job are valid
+ * IN licenses - required licenses
+ * OUT valid - true if required licenses are valid and a sufficient number
+ *	are configured (though not necessarily available now)
+ * RET license_list, must be destroyed by caller
+ */
+extern List license_job_validate(char *licenses, bool *valid)
+{
+	ListIterator iter;
+	licenses_t *license_entry, *match;
+	List job_license_list;
+
+	job_license_list = _build_license_list(licenses, valid);
+	_licenses_print("job_validate", job_license_list);
+	if (!job_license_list)
+		return job_license_list;
+
+	slurm_mutex_lock(&license_mutex);
+	iter = list_iterator_create(job_license_list);
+	if (iter == NULL)
+		fatal("malloc failure from list_iterator_create");
+	while ((license_entry = (licenses_t *) list_next(iter))) {
+		/* license_list is NULL if no licenses are configured,
+		 * in which case any request is invalid */
+		match = _license_lookup(license_list, license_entry->name);
+		if (!match) {
+			debug("could not find license %s for job",
+			      license_entry->name);
+			*valid = false;
+			break;
+		} else if (license_entry->total > match->total) {
+			debug("job wants more %s licenses than configured",
+			      match->name);
+			*valid = false;
+			break;
+		}
+	}
+	list_iterator_destroy(iter);
+	slurm_mutex_unlock(&license_mutex);
+
+	if (!(*valid)) {
+		list_destroy(job_license_list);
+		job_license_list = NULL;
+	}
+	return job_license_list;
+}
+
+/*
+ * license_job_test - Test if the licenses required for a job are available
+ * IN job_ptr - job identification
+ * RET: SLURM_SUCCESS, EAGAIN (not available now), SLURM_ERROR (never runnable)
+ */
+extern int license_job_test(struct job_record *job_ptr)
+{
+	ListIterator iter;
+	licenses_t *license_entry, *match;
+	int rc = SLURM_SUCCESS;
+
+	if (!job_ptr->license_list)	/* no licenses needed */
+		return rc;
+
+	slurm_mutex_lock(&license_mutex);
+	iter = list_iterator_create(job_ptr->license_list);
+	if (iter == NULL)
+		fatal("malloc failure from list_iterator_create");
+	while ((license_entry = (licenses_t *) list_next(iter))) {
+		/* license_list may be NULL (no licenses configured) */
+		match = _license_lookup(license_list, license_entry->name);
+		if (!match) {
+			error("could not find license %s for job %u",
+			      license_entry->name, job_ptr->job_id);
+			rc = SLURM_ERROR;
+			break;
+		} else if (license_entry->total > match->total) {
+			info("job %u wants more %s licenses than configured",
+			     job_ptr->job_id, match->name);
+			rc = SLURM_ERROR;
+			break;
+		} else if ((license_entry->total + match->used) >
+			   match->total) {
+			rc = EAGAIN;
+			break;
+		}
+	}
+	list_iterator_destroy(iter);
+	slurm_mutex_unlock(&license_mutex);
+	return rc;
+}
+
+/*
+ * license_job_get - Get the licenses required for a job
+ * IN job_ptr - job identification
+ * RET SLURM_SUCCESS or failure code
+ */
+extern int license_job_get(struct job_record *job_ptr)
+{
+	ListIterator iter;
+	licenses_t *license_entry, *match;
+	int rc = SLURM_SUCCESS;
+
+	if (!job_ptr->license_list)	/* no licenses needed */
+		return rc;
+
+	slurm_mutex_lock(&license_mutex);
+	iter = list_iterator_create(job_ptr->license_list);
+	if (iter == NULL)
+		fatal("malloc failure from list_iterator_create");
+	while ((license_entry = (licenses_t *) list_next(iter))) {
+		/* license_list may be NULL (no licenses configured) */
+		match = _license_lookup(license_list, license_entry->name);
+		if (match) {
+			/* count the allocation both system wide and in
+			 * the job's own record */
+			match->used += license_entry->total;
+			license_entry->used += license_entry->total;
+		} else {
+			error("could not find license %s for job %u",
+			      license_entry->name, job_ptr->job_id);
+			rc = SLURM_ERROR;
+		}
+	}
+	list_iterator_destroy(iter);
+	_licenses_print("licences_job_get", license_list);
+	slurm_mutex_unlock(&license_mutex);
+	return rc;
+}
+
+/*
+ * license_job_return - Return the licenses allocated to a job
+ * IN job_ptr - job identification
+ * RET SLURM_SUCCESS or failure code
+ */
+extern int license_job_return(struct job_record *job_ptr)
+{
+	ListIterator iter;
+	licenses_t *license_entry, *match;
+	int rc = SLURM_SUCCESS;
+
+	if (!job_ptr->license_list)	/* no licenses needed */
+		return rc;
+
+	slurm_mutex_lock(&license_mutex);
+	iter = list_iterator_create(job_ptr->license_list);
+	if (iter == NULL)
+		fatal("malloc failure from list_iterator_create");
+	while ((license_entry = (licenses_t *) list_next(iter))) {
+		match = list_find_first(license_list, _license_find_rec,
+			license_entry->name);
+		if (match) {
+			if (match->used >= license_entry->total)
+				match->used -= license_entry->total;
+			else {
+				error("license use count underflow for %s",
+				      match->name);
+				match->used = 0;
+				rc = SLURM_ERROR;
+			}
+ license_entry->used = 0; + } else { + /* This can happen after a reconfiguration */ + error("job returning unknown license %s", + license_entry->name); + } + } + list_iterator_destroy(iter); + _licenses_print("licences_job_return", license_list); + slurm_mutex_unlock(&license_mutex); + return rc; +} + diff --git a/src/slurmctld/licenses.h b/src/slurmctld/licenses.h new file mode 100644 index 0000000000000000000000000000000000000000..449e1002b1138748518a997500cd6affaf62e2d5 --- /dev/null +++ b/src/slurmctld/licenses.h @@ -0,0 +1,94 @@ +/*****************************************************************************\ + * licenses.h - Definitions for handling cluster-wide consumable resources + ***************************************************************************** + * Copyright (C) 2008 Lawrence Livermore National Security. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Morris Jette <jette@llnl.gov>, et. al. + * LLNL-CODE-402394. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. 
If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +\*****************************************************************************/ + +#ifndef _LICENSES_H +#define _LICENSES_H + +#include "src/common/list.h" +#include "src/slurmctld/slurmctld.h" + +typedef struct licenses { + char * name; /* name associated with a license */ + uint16_t total; /* total licenses configured */ + uint16_t used; /* used licenses */ +} licenses_t; + +extern List license_list; + + +/* Initialize licenses on this system based upon slurm.conf */ +extern int license_init(char *licenses); + +/* Update licenses on this system based upon slurm.conf. 
+ * Preserve all previously allocated licenses */ +extern int license_update(char *licenses); + +/* Free memory associated with licenses on this system */ +extern void license_free(void); + + +/* + * license_job_get - Get the licenses required for a job + * IN job_ptr - job identification + * RET SLURM_SUCCESS or failure code + */ +extern int license_job_get(struct job_record *job_ptr); + +/* + * license_job_return - Return the licenses allocated to a job + * IN job_ptr - job identification + * RET SLURM_SUCCESS or failure code + */ +extern int license_job_return(struct job_record *job_ptr); + +/* + * license_job_test - Test if the licenses required for a job are available + * IN job_ptr - job identification + * RET SLURM_SUCCESS, EAGAIN (not available now), SLURM_ERROR (never runnable) + */ +extern int license_job_test(struct job_record *job_ptr); + +/* + * license_job_validate - Test if the licenses required by a job are valid + * IN licenses - required licenses + * OUT valid - true if required licenses are valid and a sufficient number + * are configured (though not necessarily available now) + * RET license_list, must be destroyed by caller + */ +extern List license_job_validate(char *licenses, bool *valid); + +#endif /* !_LICENSES_H */ diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 35e94cf38e5e67e1e72b49efcbf3398fc553f2aa..abbfdc775aab1c94a1503f4562db53d9c4d2cd6d 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -1,10 +1,9 @@ /*****************************************************************************\ * node_scheduler.c - select and allocated nodes to jobs * Note: there is a global node table (node_record_table_ptr) - * - * $Id$ ***************************************************************************** - * Copyright (C) 2002-2006 The Regents of the University of California. + * Copyright (C) 2002-2007 The Regents of the University of California. 
+ * Copyright (C) 2008 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> * LLNL-CODE-402394. @@ -65,6 +64,7 @@ #include "src/common/slurm_accounting_storage.h" #include "src/slurmctld/agent.h" +#include "src/slurmctld/licenses.h" #include "src/slurmctld/node_scheduler.h" #include "src/slurmctld/sched_plugin.h" #include "src/slurmctld/slurmctld.h" @@ -110,10 +110,8 @@ static bitstr_t *_valid_features(struct job_details *detail_ptr, /* * allocate_nodes - change state of specified nodes to NODE_STATE_ALLOCATED + * also claim required licenses * IN job_ptr - job being allocated resources - * globals: node_record_count - number of nodes in the system - * node_record_table_ptr - pointer to global node table - * last_node_update - last update time of node table */ extern void allocate_nodes(struct job_record *job_ptr) { @@ -125,6 +123,8 @@ extern void allocate_nodes(struct job_record *job_ptr) if (bit_test(job_ptr->node_bitmap, i)) make_node_alloc(&node_record_table_ptr[i], job_ptr); } + + license_job_get(job_ptr); return; } @@ -132,13 +132,12 @@ extern void allocate_nodes(struct job_record *job_ptr) /* * deallocate_nodes - for a given job, deallocate its nodes and make * their state NODE_STATE_COMPLETING + * also release the job's licenses * IN job_ptr - pointer to terminating job (already in some COMPLETING state) * IN timeout - true if job exhausted time limit, send REQUEST_KILL_TIMELIMIT * RPC instead of REQUEST_TERMINATE_JOB * IN suspended - true if job was already suspended (node's job_run_cnt * already decremented); - * globals: node_record_count - number of nodes in the system - * node_record_table_ptr - pointer to global node table */ extern void deallocate_nodes(struct job_record *job_ptr, bool timeout, bool suspended) @@ -152,6 +151,7 @@ extern void deallocate_nodes(struct job_record *job_ptr, bool timeout, xassert(job_ptr); xassert(job_ptr->details); + 
license_job_return(job_ptr); if (slurm_sched_freealloc(job_ptr) != SLURM_SUCCESS) error("slurm_sched_freealloc(%u): %m", job_ptr->job_id); if (select_g_job_fini(job_ptr) != SLURM_SUCCESS) diff --git a/src/slurmctld/node_scheduler.h b/src/slurmctld/node_scheduler.h index 6257fe85f1275335c9514d6a44ba083b194987ff..42a4bf7dfc18419ab758c73840b3561e875778d2 100644 --- a/src/slurmctld/node_scheduler.h +++ b/src/slurmctld/node_scheduler.h @@ -1,7 +1,8 @@ /*****************************************************************************\ * node_scheduler.h - definitions of functions in node_scheduler.c ***************************************************************************** - * Copyright (C) 2004 The Regents of the University of California. + * Copyright (C) 2004-2007 The Regents of the University of California. + * Copyright (C) 2008 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette@llnl.gov> et. al. * LLNL-CODE-402394. 
@@ -40,10 +41,8 @@ /* * allocate_nodes - change state of specified nodes to NODE_STATE_ALLOCATED + * also claim required licenses * IN job_ptr - job being allocated resources - * globals: node_record_count - number of nodes in the system - * node_record_table_ptr - pointer to global node table - * last_node_update - last update time of node table */ extern void allocate_nodes(struct job_record *job_ptr); @@ -57,13 +56,12 @@ extern void build_node_details(struct job_record *job_ptr); /* * deallocate_nodes - for a given job, deallocate its nodes and make * their state NODE_STATE_COMPLETING + * also release the job's licenses * IN job_ptr - pointer to terminating job (already in some COMPLETING state) * IN timeout - true if job exhausted time limit, send REQUEST_KILL_TIMELIMIT * RPC instead of REQUEST_TERMINATE_JOB * IN suspended - true if job was already suspended (node's job_run_cnt * already decremented); - * globals: node_record_count - number of nodes in the system - * node_record_table_ptr - pointer to global node table */ extern void deallocate_nodes(struct job_record *job_ptr, bool timeout, bool suspended); diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 98b18b0101ea074b66076e00483ce55ddbdb8143..ea2909f2dbf02fa8bfcd2982d924caba0b52ad25 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -62,12 +62,12 @@ #include "src/common/parse_spec.h" #include "src/common/read_config.h" #include "src/common/slurm_jobcomp.h" +#include "src/common/slurm_rlimits_info.h" #include "src/common/switch.h" #include "src/common/xstring.h" -#include "src/common/node_select.h" -#include "src/common/slurm_rlimits_info.h" #include "src/slurmctld/job_scheduler.h" +#include "src/slurmctld/licenses.h" #include "src/slurmctld/locks.h" #include "src/slurmctld/node_scheduler.h" #include "src/slurmctld/proc_req.h" @@ -784,6 +784,11 @@ int read_slurm_conf(int recover) if ((error_code = _build_bitmaps())) return error_code; + + 
license_free(); + if (license_init(slurmctld_conf.licenses) != SLURM_SUCCESS) + fatal("Invalid Licenses value: %s", slurmctld_conf.licenses); + _restore_job_dependencies(); restore_node_features(); #ifdef HAVE_ELAN @@ -1130,7 +1135,7 @@ static void _validate_node_proc_count(void) #endif /* - * _restore_job_dependencies - Build depend_list for every job + * _restore_job_dependencies - Build depend_list and license_list for every job */ static int _restore_job_dependencies(void) { @@ -1141,6 +1146,14 @@ static int _restore_job_dependencies(void) job_iterator = list_iterator_create(job_list); while ((job_ptr = (struct job_record *) list_next(job_iterator))) { + if (job_ptr->job_state == JOB_RUNNING) { + bool valid; + List license_list; + license_list = license_job_validate(job_ptr->licenses, &valid); + if (valid) + license_job_get(job_ptr); + } + if ((job_ptr->details == NULL) || (job_ptr->details->dependency == NULL)) continue; diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 6119e1e007d143b8e01f30b23a1b758a9cfcf42f..ec20114e15c48de08404bb45831749115fdd42f1 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -382,6 +382,7 @@ struct job_record { * the job step completes, 2 if kill * in progress */ char *licenses; /* licenses required by the job */ + List license_list; /* structure with license info */ uint16_t mail_type; /* see MAIL_JOB_* in slurm.h */ char *mail_user; /* user to get e-mail notification */ uint32_t magic; /* magic cookie for data integrity */ diff --git a/src/srun/allocate.c b/src/srun/allocate.c index 2ca1614fa26b5386b3f1d4a2360cde6af8930992..170c7aa5df9959f2131b760c9a3f2a80662d0b37 100644 --- a/src/srun/allocate.c +++ b/src/srun/allocate.c @@ -394,6 +394,8 @@ job_desc_msg_create_from_opts () j->mail_user = xstrdup(opt.mail_user); if (opt.begin) j->begin_time = opt.begin; + if (opt.licenses) + j->licenses = xstrdup(opt.licenses); if (opt.network) j->network = xstrdup(opt.network); if (opt.account) diff 
--git a/src/srun/opt.c b/src/srun/opt.c index a1749e8f93159d8716245c34fbaa7a1626629b19..96bb478c73a4d49dfe53a0507fd08ba9efa41111 100644 --- a/src/srun/opt.c +++ b/src/srun/opt.c @@ -956,6 +956,7 @@ static void set_options(const int argc, char **argv) {"no-kill", no_argument, 0, 'k'}, {"kill-on-bad-exit", no_argument, 0, 'K'}, {"label", no_argument, 0, 'l'}, + {"licenses", required_argument, 0, 'L'}, {"distribution", required_argument, 0, 'm'}, {"ntasks", required_argument, 0, 'n'}, {"nodes", required_argument, 0, 'N'}, @@ -1038,7 +1039,7 @@ static void set_options(const int argc, char **argv) {"acctg-freq", required_argument, 0, LONG_OPT_ACCTG_FREQ}, {NULL, 0, 0, 0} }; - char *opt_string = "+aAbB:c:C:d:D:e:g:Hi:IjJ:kKlm:n:N:" + char *opt_string = "+aAbB:c:C:d:D:e:g:Hi:IjJ:kKlL:m:n:N:" "o:Op:P:qQr:R:st:T:uU:vVw:W:x:XZ"; struct option *optz = spank_option_table_create (long_options); @@ -1155,6 +1156,10 @@ static void set_options(const int argc, char **argv) case (int)'l': opt.labelio = true; break; + case 'L': + xfree(opt.licenses); + opt.licenses = xstrdup(optarg); + break; case (int)'m': opt.distribution = verify_dist_type(optarg, &opt.plane_size); @@ -2212,7 +2217,7 @@ static void _usage(void) " [--share] [--label] [--unbuffered] [-m dist] [-J jobname]\n" " [--jobid=id] [--verbose] [--slurmd_debug=#]\n" " [--core=type] [-T threads] [-W sec] [--checkpoint=time]\n" -" [--checkpoint-path=dir]\n" +" [--checkpoint-path=dir] [--licenses=names]\n" " [--contiguous] [--mincpus=n] [--mem=MB] [--tmp=MB] [-C list]\n" " [--mpi=type] [--account=name] [--dependency=type:jobid]\n" " [--kill-on-bad-exit] [--propagate[=rlimits] [--comment=name]\n" @@ -2292,6 +2297,7 @@ static void _help(void) " --multi-prog if set the program name specified is the\n" " configuration specification for multiple programs\n" " --get-user-env used by Moab. 
See srun man page.\n" +" -L, --licenses=names required license, comma separated\n" " --checkpoint=time job step checkpoint interval\n" " --checkpoint-path=dir path to store job step checkpoint image files\n" #ifdef HAVE_PTY_H diff --git a/src/srun/opt.h b/src/srun/opt.h index 87f9d90b39113b9287cb1e3cf200a742c78d81a0..f4668e37e2ee650a91fed53dacfcb9890a94a5c1 100644 --- a/src/srun/opt.h +++ b/src/srun/opt.h @@ -162,6 +162,7 @@ typedef struct srun_options { char *propagate; /* --propagate[=RLIMIT_CORE,...]*/ char *task_epilog; /* --task-epilog= */ char *task_prolog; /* --task-prolog= */ + char *licenses; /* --licenses, -L */ /* constraint options */ int32_t job_min_cpus; /* --mincpus=n */