diff --git a/NEWS b/NEWS index 5a25e46349cc21b0bcd4ee34e191f8bae66e67dc..e79cce7b7f8cbca7d9fa394a9cb6542610ea1690 100644 --- a/NEWS +++ b/NEWS @@ -134,7 +134,8 @@ documents those changes that are of interest to users and admins. added the default will be set to this if none is given in the sacctmgr line. -- Added configure option --enable-sun-const for Sun Constellation system with 3D torus interconnect. Supports proper smap and sview displays for 3-D - topology. + topology. Node names are automatically put into Hilbert curve order given + a one-line nodelist definition in slurm.conf (e.g. NodeName=sun[000x533]). * Changes in SLURM 1.3.12 ========================= diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index bd99bd0cbeff82b4de99257ab4233431d341f058..faf296b330270a669097f670294afaa57b8ce94a 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -279,18 +279,21 @@ parameter in slurm.conf. .TP \fB\-\-exclusive\fR -When used to initiate a job step within an existing resource allocation, -proceed only when processors can be dedicated to the job step without -sharing with other job steps. This can be used to initiate many -job steps simultaneously within an existing job allocation and have -SLURM perform resource management for the job. -In this mode, use with the \fB\-\-ntasks\fR option and NOT the -\fB\-\-nodes\fR, \fB\-\-relative\fR, \fB\-\-relative\fR=\fIarbitrary\fR -options (which provide user control over task layout). +When used to initiate a job, the job allocation cannot share nodes with +other running jobs. This is the opposite of \-\-share; whichever option +is seen last on the command line will win. (The default shared/exclusive +behaviour depends on system configuration.) + +This option can also be used when initiating more than one job step within +an existing resource allocation and you want separate processors to +be dedicated to each job step. If sufficient processors are not +available to initiate the job step, it will be deferred. 
This can +be thought of as providing resource management for the job within +its allocation. Since resource management is provided by +processor, the \fB\-\-ntasks\fR option must be specified, but the +following options should NOT be specified: \fB\-\-nodes\fR, +\fB\-\-relative\fR, \fB\-\-distribution\fR=\fIarbitrary\fR. See \fBEXAMPLE\fR below. -When used to initiate a job, dedicate whole nodes to the job rather -than individual processors even if consumable resources are enabled -(e.g. \fBSelectType=select/cons_res\fR). .TP \fB\-\-gid\fR=\fIgroup\fR diff --git a/src/slurmctld/Makefile.am b/src/slurmctld/Makefile.am index 584b4a2e3b93aa7855d0311df34d10cdcc4d8de5..5ccac1a4f1ae9306b5829a275a6a3f6822c598be 100644 --- a/src/slurmctld/Makefile.am +++ b/src/slurmctld/Makefile.am @@ -20,6 +20,9 @@ slurmctld_SOURCES = \ agent.h \ backup.c \ controller.c \ + hilbert.c \ + hilbert.h \ + hilbert_slurm.c \ job_mgr.c \ job_scheduler.c \ job_scheduler.h \ diff --git a/src/slurmctld/Makefile.in b/src/slurmctld/Makefile.in index b2916d54a753654f87db000edd4854021ac6f3b9..a735c6182d6efb39a4a9da1797b557045918a5f1 100644 --- a/src/slurmctld/Makefile.in +++ b/src/slurmctld/Makefile.in @@ -75,7 +75,8 @@ am__installdirs = "$(DESTDIR)$(sbindir)" sbinPROGRAMS_INSTALL = $(INSTALL_PROGRAM) PROGRAMS = $(sbin_PROGRAMS) am_slurmctld_OBJECTS = acct_policy.$(OBJEXT) agent.$(OBJEXT) \ - backup.$(OBJEXT) controller.$(OBJEXT) job_mgr.$(OBJEXT) \ + backup.$(OBJEXT) controller.$(OBJEXT) hilbert.$(OBJEXT) \ + hilbert_slurm.$(OBJEXT) job_mgr.$(OBJEXT) \ job_scheduler.$(OBJEXT) licenses.$(OBJEXT) locks.$(OBJEXT) \ node_mgr.$(OBJEXT) node_scheduler.$(OBJEXT) \ partition_mgr.$(OBJEXT) ping_nodes.$(OBJEXT) \ @@ -288,6 +289,9 @@ slurmctld_SOURCES = \ agent.h \ backup.c \ controller.c \ + hilbert.c \ + hilbert.h \ + hilbert_slurm.c \ job_mgr.c \ job_scheduler.c \ job_scheduler.h \ @@ -393,6 +397,8 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/agent.Po@am__quote@ @AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/backup.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/controller.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hilbert.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hilbert_slurm.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/job_mgr.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/job_scheduler.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/licenses.Po@am__quote@ diff --git a/src/slurmctld/hilbert.c b/src/slurmctld/hilbert.c new file mode 100644 index 0000000000000000000000000000000000000000..6322fe0df981db6e3952c2369cdc1c6b96f7fa7f --- /dev/null +++ b/src/slurmctld/hilbert.c @@ -0,0 +1,88 @@ +//+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +// Filename: hilbert.c +// +// Purpose: Hilbert and Linked-list utility procedures for BayeSys3. +// +// History: TreeSys.c 17 Apr 1996 - 31 Dec 2002 +// Peano.c 10 Apr 2001 - 11 Jan 2003 +// merged 1 Feb 2003 +// Arith debug 28 Aug 2003 +// Hilbert.c 14 Oct 2003 +// 2 Dec 2003 +//----------------------------------------------------------------------------- +/* + Copyright (c) 1996-2003 Maximum Entropy Data Consultants Ltd, + 114c Milton Road, Cambridge CB4 1XE, England + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+#include "license.txt"
+*/
+
+#include "src/slurmctld/hilbert.h"
+
+extern void TransposetoAxes(   // rewrites X in place; going by the name: transposed Hilbert form -> axes
+coord_t* X,            // I O  position   [n]: n words, read and overwritten in place
+int      b,            // I    # bits used in each word of X
+int      n)            // I    dimension (number of words in X)
+{
+    coord_t  M, P, Q, t;
+    int      i;
+
+// Gray decode by  H ^ (H/2)
+    t = X[n-1] >> 1;
+    for( i = n-1; i; i-- )
+        X[i] ^= X[i-1];
+    X[0] ^= t;
+
+// Undo excess work: for each bit level Q = 2 .. 2^(b-1), P masks the bits below Q
+    M = 2 << (b - 1);  // NOTE(review): 2 is a signed int literal; b near the word width overflows it -- confirm callers keep b small
+    for( Q = 2; Q != M; Q <<= 1 )
+    {
+        P = Q - 1;
+        for( i = n-1; i; i-- )
+            if( X[i] & Q ) X[0] ^= P;                              // invert low bits of X[0]
+            else{ t = (X[0] ^ X[i]) & P;  X[0] ^= t;  X[i] ^= t; } // exchange low bits of X[0] and X[i]
+        if( X[0] & Q ) X[0] ^= P;                                  // invert
+    }
+}
+extern void AxestoTranspose(   // rewrites X in place: axis coordinates -> transposed Hilbert form
+coord_t* X,            // I O  position   [n]: n words, read and overwritten in place
+int      b,            // I    # bits used in each word of X
+int      n)            // I    dimension (number of words in X)
+{
+    coord_t  P, Q, t;
+    int      i;
+
+// Inverse undo: same invert/exchange steps, walking Q from the top bit 2^(b-1) down to 2
+    for( Q = 1 << (b - 1); Q > 1; Q >>= 1 )  // NOTE(review): 1 is a signed int literal; b == 32 would shift into the sign bit
+    {
+        P = Q - 1;
+        if( X[0] & Q ) X[0] ^= P;                                  // invert
+        for( i = 1; i < n; i++ )
+            if( X[i] & Q ) X[0] ^= P;                              // invert low bits of X[0]
+            else{ t = (X[0] ^ X[i]) & P;  X[0] ^= t;  X[i] ^= t; } // exchange low bits of X[0] and X[i]
+    }
+
+// Gray encode (inverse of decode)
+    for( i = 1; i < n; i++ )
+        X[i] ^= X[i-1];
+    t = X[n-1];
+    for( i = 1; i < b; i <<= 1 )
+        X[n-1] ^= X[n-1] >> i;
+    t ^= X[n-1];
+    for( i = n-2; i >= 0; i-- )
+        X[i] ^= t;
+}
diff --git a/src/slurmctld/hilbert.h b/src/slurmctld/hilbert.h
new file mode 100644
index 0000000000000000000000000000000000000000..4fec84fd8d118467e8cfe30800f847d16e6dc5b4
--- /dev/null
+++ b/src/slurmctld/hilbert.h
@@ -0,0 +1,44 @@
+//+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+// Filename:  hilbert.h
+//
+// Purpose:   Hilbert and Linked-list utility procedures for BayeSys3.
+// +// History: TreeSys.c 17 Apr 1996 - 31 Dec 2002 +// Peano.c 10 Apr 2001 - 11 Jan 2003 +// merged 1 Feb 2003 +// Arith debug 28 Aug 2003 +// Hilbert.c 14 Oct 2003 +// 2 Dec 2003 +//----------------------------------------------------------------------------- +/* + Copyright (c) 1996-2003 Maximum Entropy Data Consultants Ltd, + 114c Milton Road, Cambridge CB4 1XE, England + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +#include "license.txt" +*/ + +typedef unsigned int coord_t; // char,short,int for up to 8,16,32 bits per word + +extern void TransposetoAxes( +coord_t* X, // I O position [n] +int b, // I # bits +int n); // I dimension + +extern void AxestoTranspose( +coord_t* X, // I O position [n] +int b, // I # bits +int n); // I dimension diff --git a/src/slurmctld/hilbert_slurm.c b/src/slurmctld/hilbert_slurm.c new file mode 100644 index 0000000000000000000000000000000000000000..ad044fc53ffa7a45587e88fc531637bea9b7ae66 --- /dev/null +++ b/src/slurmctld/hilbert_slurm.c @@ -0,0 +1,161 @@ +/*****************************************************************************\ + * hilbert_slurm.c - Reorder the node records to place them into order + * on a Hilbert curve so that the resource allocation problem in + * N-dimensions can be reduced to a 1-dimension problem + 
***************************************************************************** + * Copyright (C) 2008 Lawrence Livermore National Security. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Morris Jette <jette1@llnl.gov>, et. al. + * LLNL-CODE-402394. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+\*****************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "src/slurmctld/hilbert.h" +#include "src/slurmctld/slurmctld.h" + +static int _coord(char coord) +{ + if ((coord >= '0') && (coord <= '9')) + return (coord - '0'); + if ((coord >= 'A') && (coord <= 'Z')) + return (coord - 'A'); + return -1; +} + +/* Using the node record table, generate a Hilbert integer for each node + * based upon its coordinates and sort the records in that order. This must + * be called once, immediately after reading the slurm.conf file. */ +extern void nodes_to_hilbert_curve(void) +{ + int coord_inx, i, j, k, max_coord = 0, min_inx; + uint32_t min_val; + int *coords; + struct node_record *node_ptr, *node_ptr2; +#ifdef HAVE_3D + coord_t hilbert[3]; + int dims = 3; +#else + coord_t hilbert[2]; + int dims = 2; + fatal("current logic only supports 3-dimensions"); +#endif /* HAVE_3D */ + + /* Get the coordinates for each node based upon its numeric suffix */ + coords = xmalloc(sizeof(int) * node_record_count * dims); + for (i=0, coord_inx=0, node_ptr=node_record_table_ptr; + i<node_record_count; i++, node_ptr++) { + j = strlen(node_ptr->name); + if (j < dims) { + fatal("hostname %s lacks numeric %d dimension suffix", + node_ptr->name, dims); + } + for (k=dims; k; k--) { + coords[coord_inx] = _coord(node_ptr->name[j-k]); + if (coords[coord_inx] < 0) { + fatal("hostname %s lacks valid numeric suffix", + node_ptr->name); + } + max_coord = MAX(max_coord, coords[coord_inx]); + coord_inx++; /* Don't put into MAX macro */ + } + } + if (max_coord > 31) { + fatal("maximum node coordinate exceeds system limit (%d>32)", + max_coord); + } + + /* Generate each node's Hilbert integer */ + for (i=0, coord_inx=0, node_ptr=node_record_table_ptr; + i<node_record_count; i++, node_ptr++) { + for (j=0; j<dims; j++) + hilbert[j] = coords[coord_inx++]; + AxestoTranspose(hilbert, 5, dims); +#ifdef HAVE_3D + 
node_ptr->hilbert_integer = + ((hilbert[0]>>4 & 1) << 14) + ((hilbert[1]>>4 & 1) << 13) + + ((hilbert[2]>>4 & 1) << 12) + ((hilbert[0]>>3 & 1) << 11) + + ((hilbert[1]>>3 & 1) << 10) + ((hilbert[2]>>3 & 1) << 9) + + ((hilbert[0]>>2 & 1) << 8) + ((hilbert[1]>>2 & 1) << 7) + + ((hilbert[2]>>2 & 1) << 6) + ((hilbert[0]>>1 & 1) << 5) + + ((hilbert[1]>>1 & 1) << 4) + ((hilbert[2]>>1 & 1) << 3) + + ((hilbert[0]>>0 & 1) << 2) + ((hilbert[1]>>0 & 1) << 1) + + ((hilbert[2]>>0 & 1) << 0); +#else + /* A variation on the above calculation would be required here + * for other dimension counts */ +#endif + } + + /* Now we need to sort the node records. We only need to move a few + * fields since the others were all initialized to identical values */ + for (i=0; i<node_record_count; i++) { + min_val = node_record_table_ptr[i].hilbert_integer; + min_inx = i; + for (j=(i+1); j<node_record_count; j++) { + if (node_record_table_ptr[j].hilbert_integer < + min_val) { + min_val = node_record_table_ptr[j]. + hilbert_integer; + min_inx = j; + } + } + if (min_inx != i) { /* swap records */ + char *tmp_name; + int tmp_val; + node_ptr = node_record_table_ptr + i; + node_ptr2 = node_record_table_ptr + min_inx; + + tmp_name = node_ptr->name; + node_ptr->name = node_ptr2->name; + node_ptr2->name = tmp_name; + + tmp_name = node_ptr->comm_name; + node_ptr->comm_name = node_ptr2->comm_name; + node_ptr2->comm_name = tmp_name; + + tmp_val = node_ptr->hilbert_integer; + node_ptr->hilbert_integer = node_ptr2->hilbert_integer; + node_ptr2->hilbert_integer = tmp_val; + } + } + +#if 0 + /* Log the results */ + for (i=0, node_ptr=node_record_table_ptr; i<node_record_count; + i++, node_ptr++) { + info("%s: %u", node_ptr->name, node_ptr->hilbert_integer); + } +#endif +} + diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index e7ce99fbc3e2526cecb07ffc2934d6b41851e428..74190d9c3277fd3bb52f905a3422324ba6243101 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ 
-587,8 +587,12 @@ static int _build_all_nodeline_info(slurm_ctl_conf_t *conf) if (node_rec == NULL) fatal("No node %s configured", node_000); xfree(node_000); +#ifndef HAVE_BG + if (count == 1) + nodes_to_hilbert_curve(); +#endif /* ! HAVE_BG */ } -#endif +#endif /* HAVE_3D */ return SLURM_SUCCESS; } diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index da25b8de18e0ed2bb0579b7926df95d65fa8266e..84d3679e51072675eff505b9ef58417f8309944c 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -226,7 +226,10 @@ struct node_record { * use for scheduling purposes */ char *arch; /* computer architecture */ char *os; /* operating system currently running */ - struct node_record *node_next; /* next entry with same hash index */ + struct node_record *node_next; /* next entry with same hash index */ +#ifdef HAVE_3D + uint32_t hilbert_integer; /* Hilbert number based on node name */ +#endif }; extern struct node_record *node_record_table_ptr; /* ptr to node records */ @@ -1176,6 +1179,11 @@ extern void node_not_resp (char *name, time_t msg_time); * and log that the node is not responding using a hostlist expression */ extern void node_no_resp_msg(void); +/* Using the node record table, generate a Hilbert integer for each node + * based upon its coordinates and sort the records in that order. This must + * be called once, immediately after reading the slurm.conf file. */ +extern void nodes_to_hilbert_curve(void); + /* * job_alloc_info - get details about an existing job allocation * IN uid - job issuing the code @@ -1185,7 +1193,6 @@ extern void node_no_resp_msg(void); extern int job_alloc_info(uint32_t uid, uint32_t job_id, struct job_record **job_pptr); - /* * pack_all_jobs - dump all job information for all jobs in * machine independent form (for network transmission)