diff --git a/NEWS b/NEWS index eb348e542b751fd76f1ce8aeff49a383eb0502b1..360c9468ebdaec8bc5c02dcd55c0a53a566b5c3c 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,12 @@ documents those changes that are of interest to users and admins. ============================= -- OpenMPI users only: Add srun logic to automatically recreate and re-launch a job step if the step fails with a reserved port conflict. + -- Add TopologyPlugin configuration parameter (not currently used). + -- Add switch topology data structure to slurmctld (for use by select plugin) + and load it based upon new slurm.conf parameters: SwitchName, Nodes, and + Switches. + -- Modify select/linear and select/cons_res plugins to optimize resource + allocation with respect to network topology. * Changes in SLURM 1.4.0-pre8 ============================= diff --git a/RELEASE_NOTES b/RELEASE_NOTES index ad2ed4723370e99ea9be7b953dc3bada5d9704e2..60f442c692a5994ea70c3e5b6f8ec133be6c350c 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -54,6 +54,8 @@ HIGHLIGHTS priority partition will resume once preempting job completes. For more information see: https://computing.llnl.gov/linux/slurm/preempt.html +* Add support for optimized resource allocation with respect to network + topology. Requires switch configuration information be added to slurm.conf. * Support added for Sun Constellation system with optimized resource allocation for a 3-dimensional torus interconnect. For more information see: https://computing.llnl.gov/linux/slurm/sun_const.html @@ -95,6 +97,7 @@ CONFIGURATION FILE CHANGES (see "man slurm.conf" for details) it has been allocated. The entity bound to (sockets, cores or threads) will be automatically set based upon the allocation size and task count SLURM's SPANK cpuset plugin is no longer be needed. +* Added switch topology configuration options: SwitchName, Nodes, Switches. * BLUEGENE - Added option DenyPassthrough in the bluegene.conf. 
Can be set to any combination of X,Y,Z to not allow passthroughs when running in dynamic layout mode. (see "man bluegene.conf" for details) @@ -113,7 +116,7 @@ COMMAND CHANGES (see man pages for details) rather than "update" with a new partition name. * Time format of all SLURM command set to ISO 8601 (yyyy-mm-ddThh:mm:ss) unless the configure option "--disable-iso8601" is used at build time. -* sacct -S to satus a job will no longer work. Use sstat from now on. +* sacct -S to status a job will no longer work. Use sstat from now on. * sacct and sstat have been rewritten to have a more sacctmgr like feel ACCOUNTING CHANGES diff --git a/RELEASE_NOTES_LLNL b/RELEASE_NOTES_LLNL index f08de5f3239a8d6296a9998afeb16b59a28f7e2e..cbb974d2510114558a3b0c7ff7da1d4bd9b7d840 100644 --- a/RELEASE_NOTES_LLNL +++ b/RELEASE_NOTES_LLNL @@ -9,8 +9,14 @@ said sampling rate from the default (every 30 seconds) by setting the "JobAcctGatherFrequency" option to a different number of seconds in the slurm.conf. -Configure "TaskPlugin=task/affinity" and remove the "auto-affinity.so" -SPANK plugin with equivalent functionality. +For InfiniBand switch systems, add switch topology information to +slurm.conf. Options used are SwitchName, Switches, and Nodes. The +SwitchName is any convenient name for bookkeeping purposes only. +For example: +SwitchName=s0 Nodes=tux[0-11] +SwitchName=s1 Nodes=tux[12-23] +SwitchName=s2 Nodes=tux[24-35] +SwitchName=s3 Switches=s[0-2] Remove the "preserve-env.so" SPANK plugin. The functionality is now directly in SLURM. 
diff --git a/doc/html/configurator.html.in b/doc/html/configurator.html.in index ab342d42c40c67fab00efffad37145e131ea9b1f..2e629e036365b61baee61013d7f3de15e26bc263 100644 --- a/doc/html/configurator.html.in +++ b/doc/html/configurator.html.in @@ -186,6 +186,7 @@ function displayfile() "TaskPlugin=task/" + get_radio_value(document.config.task_plugin) + "<br>" + get_task_plugin_param() + "<br>" + get_field("TaskProlog",document.config.task_prolog) + "<br>" + + "#TopologyPlugin=topology/slurm.conf <br>" + "#TmpFs=/tmp <br>" + "#TrackWCKey=no <br>" + "#TreeWidth= <br>" + @@ -835,6 +836,6 @@ before terminating all remaining tasks. A value of zero indicates unlimited wait </FORM> <HR> <P class="footer">LLNL-WEB-402631<BR> -Last modified 15 December 2008</P> +Last modified 4 March 2009</P> </BODY> diff --git a/doc/html/news.shtml b/doc/html/news.shtml index 57de31ad4b1c44af33e004835ceca0f0a5712563..92b765f7f55bf5d7d59b483d1875505a78f3084d 100644 --- a/doc/html/news.shtml +++ b/doc/html/news.shtml @@ -58,6 +58,8 @@ of the higher priority job.</li> preempt or gang schedule jobs.</li> <li>A new configuration parameter, <i>PrologSlurmctld</i>, can be used to support the booting of different operating systems for each job.</li> +<li>Added switch topology configuration options to optimize job resource +allocation with respect to communication performance.</li> <li>Automatic <a href="checkpoint_blcr.html">Checkpoint/Restart using BRCR</a> is now available.</li> </ul> @@ -77,6 +79,6 @@ hierarchical switches).</li> and refresh.</li> </ul> -<p style="text-align:center;">Last modified 3 March 2009</p> +<p style="text-align:center;">Last modified 5 March 2009</p> <!--#include virtual="footer.txt"--> diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index c89c8026b56bbd6a827c1e7c14f0c517a67a4952..1f3fe20375143d92a9d8c634aed41cb440397c2d 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -1,4 +1,4 @@ -.TH "slurm.conf" "5" "February 2009" 
"slurm.conf 2.0" "Slurm configuration file" +.TH "slurm.conf" "5" "March 2009" "slurm.conf 2.0" "Slurm configuration file" .SH "NAME" slurm.conf \- Slurm configuration file @@ -1493,6 +1493,17 @@ temporary storage. This parameter is used in establishing a node's \fBTmpDisk\fR space. The default value is "/tmp". +.TP +\fBTopologyPlugin\fR +Identifies the plugin to be used for determining the network topology +and optimizing job allocations to minimize network contention. +The only currently acceptable value is +"topology/slurm.conf", which will determine the network topology based +upon information contained in the slurm.conf file. +See \fBNETWORK TOPOLOGY\fR below for details. +Additional plugins may be provided in the future which gather topology +information directly from the network. + .TP \fBTrackWCKey\fR Boolean yes or no. Used to set display and track of the Workload @@ -2028,6 +2039,47 @@ Recommended only for systems running with gang scheduling State of partition or availability for use. Possible values are "UP" or "DOWN". The default value is "UP". +.SH "NETWORK TOPOLOGY" +SLURM is able to optimze job allocations to minimize network contention. +Special SLURM logic is used to optimize allocations on systems with a +three\-dimensional interconnect (BlueGene, Sun Constellation, etc.) +and information about configuring those systems are availble on +web pages available here: <https://computing.llnl.gov/linux/slurm/>. +For a hierarchical network, SLURM needs to have detailed information +about how nodes are configured on the network switches as described below. +.LP +Given network topology information, SLURM allocates all of a job's +resources onto a single leaf of the network (if possible) using a best\-fit +algorithm. +Otherwise it will allocate a job's resources onto multiple leaf switches +so as to minimize the use of higher\-level switches. +The \fBTopologyPlugin\fR parameter controls which plugin is used to +collect network topology information. 
+The only value presently supported is "topology/slurm.conf", which is the +default and will load that information from the slurm.conf file. +Future plugins may gather topology information directly from the network. +The topology information is optional. +If not provided, SLURM will perform a best\-fit algorithm assuming the +nodes are in a one\-dimensional array as configured and the communications +cost is related to the node distance in this array. +.LP +The network topology configuration is one line defining a switch name and +its children, either node names or switch names. +SLURM hostlist expressions can be used for either. +.TP +\fBSwitchName\fR +The name of a switch. This name is internal to SLURM and arbitrary. +Each switch should have a unique name. +This field must be specified. +.TP +\fBSwitches\fR +Child switches of the named switch. +Either this option or the \fBNodes\fR option must be specified. +.TP +\fBNodes\fR +Child nodes of the named leaf switch. +Either this option or the \fBSwitches\fR option must be specified. + .SH "RELOCATING CONTROLLERS" If the cluster's computers used for the primary or backup controller will be out of service for an extended period of time, it may be @@ -2155,6 +2207,20 @@ PartitionName=debug Nodes=dev[0\-8,18\-25] Default=YES PartitionName=batch Nodes=dev[9\-17] MinNodes=4 .br PartitionName=long Nodes=dev[9\-17] MaxTime=120 AllowGroups=admin +.br +# +.br +# Switch Configurations +.br +# +.br +SwitchName=s0 Nodes=dev[0\-5] +.br +SwitchName=s1 Nodes=dev[6\-11] +.br +SwitchName=s2 Nodes=dev[12\-17] +.br +SwitchName=s3 Switches=s[0\-2] .SH "COPYING" Copyright (C) 2002\-2007 The Regents of the University of California. 
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index df5a6d801903e6731fde38e6e748a1f215988367..b436dc1c193f51836e255731dc5f226f84884b8c 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -1174,6 +1174,7 @@ typedef struct slurm_ctl_conf { uint16_t task_plugin_param; /* see TASK_PARAM_* */ char *task_prolog; /* pathname of task launch prolog */ char *tmp_fs; /* pathname of temporary file system */ + char *topology_plugin; /* network topology plugin */ uint16_t track_wckey; /* see if we are using wckey or not */ uint16_t tree_width; /* number of threads per node to span */ char *unkillable_program; /* program run by the slurmstepd when diff --git a/src/api/config_info.c b/src/api/config_info.c index f719e1f327edc2c6aced5cb3720f5e5bc21dbcaf..b2f2669bab7eb7fba9f9df4598e1eb7c301959eb 100644 --- a/src/api/config_info.c +++ b/src/api/config_info.c @@ -394,6 +394,8 @@ void slurm_print_ctl_conf ( FILE* out, slurm_ctl_conf_ptr->task_prolog); fprintf(out, "TmpFS = %s\n", slurm_ctl_conf_ptr->tmp_fs); + fprintf(out, "TopologyPlugin = %s\n", + slurm_ctl_conf_ptr->topology_plugin); fprintf(out, "TrackWCKey = %u\n", slurm_ctl_conf_ptr->track_wckey); fprintf(out, "TreeWidth = %u\n", diff --git a/src/common/bitstring.c b/src/common/bitstring.c index 757dbd497670d4d027cea29cb709a3b1f705169d..e99faa631688bff430080071ea91a3bcc9ee74d3 100644 --- a/src/common/bitstring.c +++ b/src/common/bitstring.c @@ -655,12 +655,21 @@ int bit_set_count(bitstr_t *b) { int count = 0; - bitoff_t bit; + bitoff_t bit, bit_cnt; + int word_size = sizeof(bitstr_t) * 8; _assert_bitstr_valid(b); - for (bit = 0; bit < _bitstr_bits(b); bit += sizeof(bitstr_t)*8) + bit_cnt = _bitstr_bits(b); + for (bit = 0; bit < bit_cnt; bit += word_size) { + if ((bit + word_size - 1) >= bit_cnt) + break; count += hweight(b[_bit_word(bit)]); + } + for ( ; bit < bit_cnt; bit++) { + if (bit_test(b, bit)) + count++; + } return count; } @@ -672,14 +681,23 @@ extern int bit_overlap(bitstr_t *b1, bitstr_t *b2) { int count = 0; - 
bitoff_t bit; - + bitoff_t bit, bit_cnt; + int word_size = sizeof(bitstr_t) * 8; + _assert_bitstr_valid(b1); _assert_bitstr_valid(b2); assert(_bitstr_bits(b1) == _bitstr_bits(b2)); - for (bit = 0; bit < _bitstr_bits(b1); bit += sizeof(bitstr_t)*8) + bit_cnt = _bitstr_bits(b1); + for (bit = 0; bit < bit_cnt; bit += word_size) { + if ((bit + word_size - 1) >= bit_cnt) + break; count += hweight(b1[_bit_word(bit)] & b2[_bit_word(bit)]); + } + for ( ; bit < bit_cnt; bit++) { + if (bit_test(b1, bit) && bit_test(b2, bit)) + count++; + } return count; } @@ -825,6 +843,7 @@ bitstr_t * bit_pick_cnt(bitstr_t *b, bitoff_t nbits) { bitoff_t bit = 0, new_bits, count = 0; bitstr_t *new; + int word_size = sizeof(bitstr_t) * 8; _assert_bitstr_valid(b); @@ -839,15 +858,16 @@ bit_pick_cnt(bitstr_t *b, bitoff_t nbits) { int word = _bit_word(bit); if (b[word] == 0) { - bit += sizeof(bitstr_t)*8; + bit += word_size; continue; } new_bits = hweight(b[word]); - if ((count + new_bits) <= nbits) { + if (((count + new_bits) <= nbits) && + ((bit + word_size - 1) < _bitstr_bits(b))) { new[word] = b[word]; count += new_bits; - bit += sizeof(bitstr_t)*8; + bit += word_size; continue; } while ((bit < _bitstr_bits(b)) && (count < nbits)) { diff --git a/src/common/read_config.c b/src/common/read_config.c index 939e47be8874037b4e52bce29542452735724cd2..ff6a07cd8d227203cb81aa551b51bc8d89abc884 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -119,6 +119,10 @@ static int parse_downnodes(void **dest, slurm_parser_enum_t type, const char *key, const char *value, const char *line, char **leftover); static void destroy_downnodes(void *ptr); +static int parse_switches(void **dest, slurm_parser_enum_t type, + const char *key, const char *value, + const char *line, char **leftover); +static void destroy_switches(void *ptr); static int defunct_option(void **dest, slurm_parser_enum_t type, const char *key, const char *value, const char *line, char **leftover); @@ -252,6 +256,7 @@ 
s_p_options_t slurm_conf_options[] = { {"TaskPlugin", S_P_STRING}, {"TaskPluginParam", S_P_STRING}, {"TmpFS", S_P_STRING}, + {"TopologyPlugin", S_P_STRING}, {"TrackWCKey", S_P_BOOLEAN}, {"TreeWidth", S_P_UINT16}, {"UnkillableStepProgram", S_P_STRING}, @@ -263,6 +268,7 @@ s_p_options_t slurm_conf_options[] = { {"PartitionName", S_P_ARRAY, parse_partitionname, destroy_partitionname}, {"DownNodes", S_P_ARRAY, parse_downnodes, destroy_downnodes}, + {"SwitchName", S_P_ARRAY, parse_switches, destroy_switches}, {NULL} }; @@ -713,7 +719,7 @@ static void destroy_downnodes(void *ptr) xfree(ptr); } -int slurm_conf_downnodes_array(slurm_conf_downnodes_t **ptr_array[]) +extern int slurm_conf_downnodes_array(slurm_conf_downnodes_t **ptr_array[]) { int count; slurm_conf_downnodes_t **ptr; @@ -727,6 +733,67 @@ int slurm_conf_downnodes_array(slurm_conf_downnodes_t **ptr_array[]) } } +static int parse_switches(void **dest, slurm_parser_enum_t type, + const char *key, const char *value, + const char *line, char **leftover) +{ + s_p_hashtbl_t *tbl; + slurm_conf_switches_t *s; + static s_p_options_t _switch_options[] = { + {"Nodes", S_P_STRING}, + {"Switches", S_P_STRING}, + {NULL} + }; + + tbl = s_p_hashtbl_create(_switch_options); + s_p_parse_line(tbl, *leftover, leftover); + + s = xmalloc(sizeof(slurm_conf_switches_t)); + s->switch_name = xstrdup(value); + s_p_get_string(&s->nodes, "Nodes", tbl); + s_p_get_string(&s->switches, "Switches", tbl); + + if (s->nodes && s->switches) { + error("switch %s has both child switches and nodes", + s->switch_name); + destroy_switches(s); + return -1; + } + if (!s->nodes && !s->switches) { + error("switch %s has neither child switches nor nodes", + s->switch_name); + destroy_switches(s); + return -1; + } + + *dest = (void *)s; + + return 1; +} + +static void destroy_switches(void *ptr) +{ + slurm_conf_switches_t *s = (slurm_conf_switches_t *)ptr; + xfree(s->nodes); + xfree(s->switch_name); + xfree(s->switches); + xfree(ptr); +} + +extern int 
slurm_conf_switch_array(slurm_conf_switches_t **ptr_array[]) +{ + int count; + slurm_conf_switches_t **ptr; + + if (s_p_get_array((void ***)&ptr, &count, "SwitchName", conf_hashtbl)) { + *ptr_array = ptr; + return count; + } else { + *ptr_array = NULL; + return 0; + } +} + static void _free_name_hashtbl() { int i; @@ -1242,6 +1309,7 @@ free_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr, bool purge_node_hash) xfree (ctl_conf_ptr->task_plugin); xfree (ctl_conf_ptr->task_prolog); xfree (ctl_conf_ptr->tmp_fs); + xfree (ctl_conf_ptr->topology_plugin); xfree (ctl_conf_ptr->unkillable_program); if (purge_node_hash) @@ -1358,6 +1426,7 @@ init_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr) ctl_conf_ptr->task_plugin_param = 0; xfree (ctl_conf_ptr->task_prolog); xfree (ctl_conf_ptr->tmp_fs); + xfree (ctl_conf_ptr->topology_plugin); ctl_conf_ptr->tree_width = (uint16_t) NO_VAL; xfree (ctl_conf_ptr->unkillable_program); ctl_conf_ptr->unkillable_timeout = (uint16_t) NO_VAL; @@ -2228,7 +2297,10 @@ validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl) if (!s_p_get_uint16(&conf->wait_time, "WaitTime", hashtbl)) conf->wait_time = DEFAULT_WAIT_TIME; - + + if (!s_p_get_string(&conf->topology_plugin, "TopologyPlugin", hashtbl)) + conf->topology_plugin = xstrdup(DEFAULT_TOPOLOGY_PLUGIN); + if (s_p_get_uint16(&conf->tree_width, "TreeWidth", hashtbl)) { if (conf->tree_width == 0) { error("TreeWidth=0 is invalid"); diff --git a/src/common/read_config.h b/src/common/read_config.h index 949ac154ca1ec5f0b67e499bf0ca6e52ac538486..ca4794b36d44a268e0aef025b93d108c0fcb8070 100644 --- a/src/common/read_config.h +++ b/src/common/read_config.h @@ -122,6 +122,7 @@ extern char *default_plugstack; #define DEFAULT_SWITCH_TYPE "switch/none" #define DEFAULT_TASK_PLUGIN "task/none" #define DEFAULT_TMP_FS "/tmp" +#define DEFAULT_TOPOLOGY_PLUGIN "topology/slurm.conf" #define DEFAULT_WAIT_TIME 0 #define DEFAULT_TREE_WIDTH 50 #define DEFAULT_UNKILLABLE_TIMEOUT 60 /* seconds */ @@ -173,6 +174,14 
@@ typedef struct slurm_conf_downnodes { char *state; } slurm_conf_downnodes_t; +typedef struct slurm_conf_switches { + char *nodes; /* names of nodes directly connected to + * this switch, if any */ + char *switch_name; /* name of this switch */ + char *switches; /* names of child switches directly + * connected to this switch, if any */ +} slurm_conf_switches_t; + /* * slurm_conf_init - load the slurm configuration from the a file. * IN file_name - name of the slurm configuration file to be read @@ -238,12 +247,20 @@ extern int slurm_conf_partition_array(slurm_conf_partition_t **ptr_array[]); /* * Set "ptr_array" with the pointer to an array of pointers to - * slurm_conf_node_t structures. + * slurm_conf_downnodes_t structures. * * Return value is the length of the array. */ extern int slurm_conf_downnodes_array(slurm_conf_downnodes_t **ptr_array[]); +/* + * Set "ptr_array" with the pointer to an array of pointers to + * slurm_conf_switches_t structures. + * + * Return value is the length of the array. 
+ */ +extern int slurm_conf_switch_array(slurm_conf_switches_t **ptr_array[]); + /* * slurm_conf_get_hostname - Return the NodeHostname for given NodeName * diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 941523bb6b540c36f0f23c4830d5cbdbcdda82da..6cd913c6e27ba0af9e513a92fa723376b61e5cb5 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -687,6 +687,24 @@ extern char *slurm_get_crypto_type(void) return crypto_type; } +/* slurm_get_topology_plugin + * returns the value of topology_plugin in slurmctld_conf object + * RET char * - topology type, MUST be xfreed by caller + */ +extern char * slurm_get_topology_plugin(void) +{ + char *topology_plugin = NULL; + slurm_ctl_conf_t *conf; + + if(slurmdbd_conf) { + } else { + conf = slurm_conf_lock(); + topology_plugin = xstrdup(conf->topology_plugin); + slurm_conf_unlock(); + } + return topology_plugin; +} + /* slurm_get_propagate_prio_process * return the PropagatePrioProcess flag from slurmctld_conf object */ diff --git a/src/common/slurm_protocol_api.h b/src/common/slurm_protocol_api.h index f8fbcef37d0f072d4da952b70c0ac090a0017721..df90212e2226899ab5b912b381a20ce1859dcf2e 100644 --- a/src/common/slurm_protocol_api.h +++ b/src/common/slurm_protocol_api.h @@ -286,16 +286,24 @@ extern char *slurm_get_crypto_type(void); */ extern uint16_t slurm_get_fast_schedule(void); -/* slurm_get_track_wckey - * returns the value of track_wckey in slurmctld_conf object +/* slurm_get_crypto_type + * returns the crypto_type from slurmctld_conf object + * RET char * - crypto type, MUST be xfreed by caller */ -extern uint16_t slurm_get_track_wckey(void); +extern char *slurm_get_crypto_type(void); + +/* slurm_get_topology_plugin + * returns the value of topology_plugin in slurmctld_conf object + * RET char * - topology type, MUST be xfreed by caller + */ +extern char * slurm_get_topology_plugin(void); /* slurm_set_tree_width * sets the value of tree_width in slurmctld_conf 
object * RET 0 or error code */ extern int slurm_set_tree_width(uint16_t tree_width); + /* slurm_get_tree_width * returns the value of tree_width in slurmctld_conf object */ diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 722bafc5b6d2e3ab9093ff608786842ba304bddf..b7dc360cd691da78998fc0079d4bf0b7ec7bacac 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -1211,6 +1211,7 @@ void slurm_free_ctl_conf(slurm_ctl_conf_info_msg_t * config_ptr) xfree(config_ptr->task_plugin); xfree(config_ptr->task_prolog); xfree(config_ptr->tmp_fs); + xfree(config_ptr->topology_plugin); xfree(config_ptr->unkillable_program); xfree(config_ptr); } diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 1d73ef495d0adf3b7e7b6fdcf8e9c88f158a2c83..41235ee6ec1879a8ac14d2f59d9d976eedc85ae4 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -2692,6 +2692,7 @@ _pack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t * build_ptr, Buf buffer) packstr(build_ptr->task_plugin, buffer); pack16(build_ptr->task_plugin_param, buffer); packstr(build_ptr->tmp_fs, buffer); + packstr(build_ptr->topology_plugin, buffer); pack16(build_ptr->track_wckey, buffer); pack16(build_ptr->tree_width, buffer); @@ -2891,6 +2892,7 @@ _unpack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t ** safe_unpackstr_xmalloc(&build_ptr->task_plugin, &uint32_tmp, buffer); safe_unpack16(&build_ptr->task_plugin_param, buffer); safe_unpackstr_xmalloc(&build_ptr->tmp_fs, &uint32_tmp, buffer); + safe_unpackstr_xmalloc(&build_ptr->topology_plugin, &uint32_tmp, buffer); safe_unpack16(&build_ptr->track_wckey, buffer); safe_unpack16(&build_ptr->tree_width, buffer); diff --git a/src/plugins/select/cons_res/job_test.c b/src/plugins/select/cons_res/job_test.c index 4fdb1d6b91a0625751745c5e8bc71c69377a4eee..02a1030339b6d2923374f92286d9d8bc5ffffc17 100644 --- a/src/plugins/select/cons_res/job_test.c +++ 
b/src/plugins/select/cons_res/job_test.c @@ -105,6 +105,16 @@ #include "job_test.h" #include "select_cons_res.h" +#define SELECT_DEBUG 0 + +static int _eval_nodes(struct job_record *job_ptr, bitstr_t *node_map, + uint32_t min_nodes, uint32_t max_nodes, + uint32_t req_nodes, uint32_t cr_node_cnt, + uint16_t *cpu_cnt, uint32_t *freq, uint32_t size); +static int _eval_nodes_topo(struct job_record *job_ptr, bitstr_t *node_map, + uint32_t min_nodes, uint32_t max_nodes, + uint32_t req_nodes, uint32_t cr_node_cnt, + uint16_t *cpu_cnt, uint32_t *freq, uint32_t size); /* _allocate_sockets - Given the job requirements, determine which sockets * from the given node can be allocated (if any) to this @@ -927,10 +937,25 @@ static int _eval_nodes(struct job_record *job_ptr, bitstr_t *node_map, uint16_t *layout_ptr = job_ptr->details->req_node_layout; xassert(node_map); - + + if (cr_node_cnt != node_record_count) { + error("cons_res: node count inconsistent with slurmctld"); + return error_code; + } if (bit_set_count(node_map) < min_nodes) return error_code; + if ((job_ptr->details->req_node_bitmap) && + (!bit_super_set(job_ptr->details->req_node_bitmap, node_map))) + return error_code; + + if (switch_record_cnt && switch_record_table) { + /* Perform optimized resource selection based upon topology */ + return _eval_nodes_topo(job_ptr, node_map, + min_nodes, max_nodes, req_nodes, + cr_node_cnt, cpu_cnt, freq, size); + } + consec_size = 50; /* start allocation for 50 sets of * consecutive nodes */ consec_cpus = xmalloc(sizeof(int) * consec_size); @@ -1127,6 +1152,296 @@ static int _eval_nodes(struct job_record *job_ptr, bitstr_t *node_map, return error_code; } +/* + * A network topology aware version of _eval_nodes(). + * NOTE: The logic here is almost identical to that of _job_test_topo() + * in select_linear.c. Any bug found here is probably also there. 
+ */ +static int _eval_nodes_topo(struct job_record *job_ptr, bitstr_t *bitmap, + uint32_t min_nodes, uint32_t max_nodes, + uint32_t req_nodes, uint32_t cr_node_cnt, + uint16_t *cpu_cnt, uint32_t *freq, uint32_t size) +{ + bitstr_t **switches_bitmap; /* nodes on this switch */ + int *switches_cpu_cnt; /* total CPUs on switch */ + int *switches_node_cnt; /* total nodes on switch */ + int *switches_required; /* set if has required node */ + + bitstr_t *avail_nodes_bitmap = NULL; /* nodes on any switch */ + bitstr_t *req_nodes_bitmap = NULL; + int rem_cpus, rem_nodes; /* remaining resources desired */ + int avail_cpus, alloc_cpus = 0; + int i, j, rc = SLURM_SUCCESS; + int best_fit_inx, first, last; + int best_fit_nodes, best_fit_cpus; + int best_fit_location = 0, best_fit_sufficient; + bool sufficient; + + rem_cpus = job_ptr->num_procs; + if (req_nodes > min_nodes) + rem_nodes = req_nodes; + else + rem_nodes = min_nodes; + + if (job_ptr->details->req_node_bitmap) { + req_nodes_bitmap = bit_copy(job_ptr->details->req_node_bitmap); + i = bit_set_count(req_nodes_bitmap); + if (i > max_nodes) { + info("job %u requires more nodes than currently " + "available (%u>%u)", + job_ptr->job_id, i, max_nodes); + rc = SLURM_ERROR; + goto fini; + } + } + + /* Construct a set of switch array entries, + * use the same indexes as switch_record_table in slurmctld */ + switches_bitmap = xmalloc(sizeof(bitstr_t *) * switch_record_cnt); + switches_cpu_cnt = xmalloc(sizeof(int) * switch_record_cnt); + switches_node_cnt = xmalloc(sizeof(int) * switch_record_cnt); + switches_required = xmalloc(sizeof(int) * switch_record_cnt); + avail_nodes_bitmap = bit_alloc(node_record_count); + for (i=0; i<switch_record_cnt; i++) { + switches_bitmap[i] = bit_copy(switch_record_table[i]. 
+ node_bitmap); + bit_and(switches_bitmap[i], bitmap); + bit_or(avail_nodes_bitmap, switches_bitmap[i]); + switches_node_cnt[i] = bit_set_count(switches_bitmap[i]); + if (req_nodes_bitmap && + bit_overlap(req_nodes_bitmap, switches_bitmap[i])) { + switches_required[i] = 1; + } + } + bit_nclear(bitmap, 0, node_record_count - 1); + +#if SELECT_DEBUG + /* Don't compile this, it slows things down too much */ + for (i=0; i<switch_record_cnt; i++) { + char *node_names = NULL; + if (switches_node_cnt[i]) + node_names = bitmap2node_name(switches_bitmap[i]); + debug("switch=%s nodes=%u:%s required:%u", + switch_record_table[i].name, + switches_node_cnt[i], node_names, + switches_required[i]); + xfree(node_names); + } +#endif + + if (req_nodes_bitmap && + (!bit_super_set(req_nodes_bitmap, avail_nodes_bitmap))) { + info("job %u requires nodes not available on any switch", + job_ptr->job_id); + rc = SLURM_ERROR; + goto fini; + } + + if (req_nodes_bitmap) { + /* Accumulate specific required resources, if any */ + first = bit_ffs(req_nodes_bitmap); + last = bit_fls(req_nodes_bitmap); + for (i=first; i<=last; i++) { + if (!bit_test(req_nodes_bitmap, i)) + continue; + if (max_nodes <= 0) { + info("job %u requires nodes than allowed", + job_ptr->job_id); + rc = SLURM_ERROR; + goto fini; + } + bit_set(bitmap, i); + bit_clear(avail_nodes_bitmap, i); + rem_nodes--; + max_nodes--; + avail_cpus = _get_cpu_cnt(job_ptr, i, cpu_cnt, + freq, size); + rem_cpus -= avail_cpus; + alloc_cpus += avail_cpus; + for (j=0; j<switch_record_cnt; j++) { + if (!bit_test(switches_bitmap[j], i)) + continue; + bit_clear(switches_bitmap[j], i); + switches_node_cnt[j]--; + } + } + if ((rem_nodes <= 0) && (rem_cpus <= 0)) + goto fini; + + /* Accumulate additional resources from leafs that + * contain required nodes */ + for (j=0; j<switch_record_cnt; j++) { + if ((switch_record_table[j].level != 0) || + (switches_node_cnt[j] == 0) || + (switches_required[j] == 0)) { + continue; + } + while ((max_nodes > 0) && 
+ ((rem_nodes > 0) || (rem_cpus > 0))) { + i = bit_ffs(switches_bitmap[j]); + if (i == -1) + break; + bit_set(bitmap, i); + bit_clear(avail_nodes_bitmap, i); + bit_clear(switches_bitmap[j], i); + switches_node_cnt[j]--; + rem_nodes--; + max_nodes--; + avail_cpus = _get_cpu_cnt(job_ptr, i, cpu_cnt, + freq, size); + rem_cpus -= avail_cpus; + alloc_cpus += avail_cpus; + } + } + if ((rem_nodes <= 0) && (rem_cpus <= 0)) + goto fini; + + /* Update bitmaps and node counts for higher-level switches */ + for (j=0; j<switch_record_cnt; j++) { + if (switches_node_cnt[j] == 0) + continue; + first = bit_ffs(switches_bitmap[j]); + last = bit_fls(switches_bitmap[j]); + for (i=first; i<=last; i++) { + if (!bit_test(switches_bitmap[j], i)) + continue; + if (!bit_test(avail_nodes_bitmap, i)) { + /* cleared from lower level */ + bit_clear(switches_bitmap[j], i); + switches_node_cnt[j]--; + } else { + switches_cpu_cnt[j] += + _get_cpu_cnt(job_ptr, i, + cpu_cnt, freq, + size); + } + } + } + } else { + /* No specific required nodes, calculate CPU counts */ + for (j=0; j<switch_record_cnt; j++) { + first = bit_ffs(switches_bitmap[j]); + last = bit_fls(switches_bitmap[j]); + for (i=first; i<=last; i++) { + if (!bit_test(switches_bitmap[j], i)) + continue; + switches_cpu_cnt[j] += + _get_cpu_cnt(job_ptr, i, cpu_cnt, + freq, size); + } + } + } + + /* Determine lowest level switch satifying request with best fit */ + best_fit_inx = -1; + for (j=0; j<switch_record_cnt; j++) { + if ((switches_cpu_cnt[j] < rem_cpus) || + (!_enough_nodes(switches_node_cnt[j], rem_nodes, + min_nodes, req_nodes))) + continue; + if ((best_fit_inx == -1) || + (switch_record_table[j].level < + switch_record_table[best_fit_inx].level) || + ((switch_record_table[j].level == + switch_record_table[best_fit_inx].level) && + (switches_node_cnt[j] < switches_node_cnt[best_fit_inx]))) + best_fit_inx = j; + } + if (best_fit_inx == -1) { + error("job %u: best_fit topology failure", job_ptr->job_id); + rc = SLURM_ERROR; + goto 
fini; + } + bit_and(avail_nodes_bitmap, switches_bitmap[best_fit_inx]); + + /* Identify usable leafs (within higher switch having best fit) */ + for (j=0; j<switch_record_cnt; j++) { + if ((switch_record_table[j].level != 0) || + (!bit_super_set(switches_bitmap[j], + switches_bitmap[best_fit_inx]))) { + switches_node_cnt[j] = 0; + } + } + + /* Select resources from these leafs on a best-fit basis */ + while ((max_nodes > 0) && ((rem_nodes > 0) || (rem_cpus > 0))) { + best_fit_cpus = best_fit_nodes = best_fit_sufficient = 0; + for (j=0; j<switch_record_cnt; j++) { + if (switches_node_cnt[j] == 0) + continue; + sufficient = (switches_cpu_cnt[j] >= rem_cpus) && + _enough_nodes(switches_node_cnt[j], + rem_nodes, min_nodes, + req_nodes); + /* If first possibility OR */ + /* first set large enough for request OR */ + /* tightest fit (less resource waste) OR */ + /* nothing yet large enough, but this is biggest */ + if ((best_fit_nodes == 0) || + (sufficient && (best_fit_sufficient == 0)) || + (sufficient && + (switches_cpu_cnt[j] < best_fit_cpus)) || + ((sufficient == 0) && + (switches_cpu_cnt[j] > best_fit_cpus))) { + best_fit_cpus = switches_cpu_cnt[j]; + best_fit_nodes = switches_node_cnt[j]; + best_fit_location = j; + best_fit_sufficient = sufficient; + } + } + if (best_fit_nodes == 0) + break; + if ((switches_node_cnt[best_fit_location] <= max_nodes) && + ((switches_node_cnt[best_fit_location] <= rem_nodes) || + (switches_cpu_cnt[best_fit_location] <= rem_cpus))) { + /* Use the entire leaf */ + bit_or(bitmap, switches_bitmap[best_fit_location]); + rem_nodes -= switches_node_cnt[best_fit_location]; + max_nodes -= switches_node_cnt[best_fit_location]; + rem_cpus -= switches_cpu_cnt[best_fit_location]; + alloc_cpus += switches_cpu_cnt[best_fit_location]; + } else {/* Use select nodes from this leaf */ + first = bit_ffs(switches_bitmap[best_fit_location]); + last = bit_fls(switches_bitmap[best_fit_location]); + for (i=first; i<=last; i++) { + if 
(!bit_test(switches_bitmap + [best_fit_location], i)) + continue; + bit_set(bitmap, i); + rem_nodes--; + max_nodes--; + avail_cpus = _get_cpu_cnt(job_ptr, i, cpu_cnt, + freq, size); + rem_cpus -= avail_cpus; + alloc_cpus += avail_cpus; + if ((max_nodes <= 0) || + ((rem_nodes <= 0) && (rem_cpus <= 0))) + break; + } + } + switches_node_cnt[best_fit_location] = 0; + } + if ((rem_cpus <= 0) && + _enough_nodes(0, rem_nodes, min_nodes, req_nodes)) { + rc = SLURM_SUCCESS; + } else + rc = SLURM_ERROR; + + fini: if (rc == SLURM_SUCCESS) { + /* Job's total_procs is needed for SELECT_MODE_WILL_RUN */ + job_ptr->total_procs = alloc_cpus; + } + FREE_NULL_BITMAP(avail_nodes_bitmap); + FREE_NULL_BITMAP(req_nodes_bitmap); + for (i=0; i<switch_record_cnt; i++) + bit_free(switches_bitmap[i]); + xfree(switches_bitmap); + xfree(switches_cpu_cnt); + xfree(switches_node_cnt); + xfree(switches_required); + + return rc; +} /* this is an intermediary step between _select_nodes and _eval_nodes * to tackle the knapsack problem. This code incrementally removes nodes diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c index 98277fe89e6f427315254abc95f010f8bc7d25b3..167c7eb51954358a58a358923dcd1c230ea87b80 100644 --- a/src/plugins/select/linear/select_linear.c +++ b/src/plugins/select/linear/select_linear.c @@ -4,7 +4,7 @@ * of sets of consecutive nodes using a best-fit algorithm. ***************************************************************************** * Copyright (C) 2004-2007 The Regents of the University of California. - * Copyright (C) 2008 Lawrence Livermore National Security. + * Copyright (C) 2008-2009 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> * CODE-OCEC-09-009. All rights reserved. 
@@ -93,6 +93,9 @@ static int _job_count_bitmap(struct node_cr_record *node_cr_ptr, static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes); +static int _job_test_topo(struct job_record *job_ptr, bitstr_t *bitmap, + uint32_t min_nodes, uint32_t max_nodes, + uint32_t req_nodes); static int _rm_job_from_nodes(struct node_cr_record *node_cr_ptr, struct job_record *job_ptr, char *pre_err, int remove_all); @@ -849,8 +852,8 @@ static int _find_job_mate(struct job_record *job_ptr, bitstr_t *bitmap, /* _job_test - does most of the real work for select_p_job_test(), which * pretty much just handles load-leveling and max_share logic */ static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap, - uint32_t min_nodes, uint32_t max_nodes, - uint32_t req_nodes) + uint32_t min_nodes, uint32_t max_nodes, + uint32_t req_nodes) { int i, index, error_code = EINVAL, sufficient; int *consec_nodes; /* how many nodes we can add from this @@ -867,10 +870,19 @@ static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap, int best_fit_location = 0, best_fit_sufficient; int avail_cpus, alloc_cpus = 0; + if (bit_set_count(bitmap) < min_nodes) + return error_code; + if ((job_ptr->details->req_node_bitmap) && (!bit_super_set(job_ptr->details->req_node_bitmap, bitmap))) return error_code; + if (switch_record_cnt && switch_record_table) { + /* Perform optimized resource selection based upon topology */ + return _job_test_topo(job_ptr, bitmap, + min_nodes, max_nodes, req_nodes); + } + consec_index = 0; consec_size = 50; /* start allocation for 50 sets of * consecutive nodes */ @@ -941,7 +953,7 @@ static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap, consec_end[consec_index++] = index - 1; #if SELECT_DEBUG - /* don't compile this, slows things down too much */ + /* don't compile this, it slows things down too much */ debug3("rem_cpus=%d, rem_nodes=%d", rem_cpus, rem_nodes); for (i = 0; i < consec_index; 
i++) { if (consec_req[i] != -1) @@ -1070,6 +1082,289 @@ static int _job_test(struct job_record *job_ptr, bitstr_t *bitmap, return error_code; } +/* + * _job_test_topo - A topology aware version of _job_test() + * NOTE: The logic here is almost identical to that of _eval_nodes_topo() in + * select/cons_res/job_test.c. Any bug found here is probably also there. + */ +static int _job_test_topo(struct job_record *job_ptr, bitstr_t *bitmap, + uint32_t min_nodes, uint32_t max_nodes, + uint32_t req_nodes) +{ + bitstr_t **switches_bitmap; /* nodes on this switch */ + int *switches_cpu_cnt; /* total CPUs on switch */ + int *switches_node_cnt; /* total nodes on switch */ + int *switches_required; /* set if has required node */ + + bitstr_t *avail_nodes_bitmap = NULL; /* nodes on any switch */ + bitstr_t *req_nodes_bitmap = NULL; + int rem_cpus, rem_nodes; /* remaining resources desired */ + int avail_cpus, alloc_cpus = 0; + int i, j, rc = SLURM_SUCCESS; + int best_fit_inx, first, last; + int best_fit_nodes, best_fit_cpus; + int best_fit_location = 0, best_fit_sufficient; + bool sufficient; + + rem_cpus = job_ptr->num_procs; + if (req_nodes > min_nodes) + rem_nodes = req_nodes; + else + rem_nodes = min_nodes; + + if (job_ptr->details->req_node_bitmap) { + req_nodes_bitmap = bit_copy(job_ptr->details->req_node_bitmap); + i = bit_set_count(req_nodes_bitmap); + if (i > max_nodes) { + info("job %u requires more nodes than currently " + "available (%u>%u)", + job_ptr->job_id, i, max_nodes); + rc = EINVAL; + goto fini; + } + } + + /* Construct a set of switch array entries, + * use the same indexes as switch_record_table in slurmctld */ + switches_bitmap = xmalloc(sizeof(bitstr_t *) * switch_record_cnt); + switches_cpu_cnt = xmalloc(sizeof(int) * switch_record_cnt); + switches_node_cnt = xmalloc(sizeof(int) * switch_record_cnt); + switches_required = xmalloc(sizeof(int) * switch_record_cnt); + avail_nodes_bitmap = bit_alloc(node_record_count); + for (i=0; i<switch_record_cnt; i++) 
{ + switches_bitmap[i] = bit_copy(switch_record_table[i]. + node_bitmap); + bit_and(switches_bitmap[i], bitmap); + bit_or(avail_nodes_bitmap, switches_bitmap[i]); + switches_node_cnt[i] = bit_set_count(switches_bitmap[i]); + if (req_nodes_bitmap && + bit_overlap(req_nodes_bitmap, switches_bitmap[i])) { + switches_required[i] = 1; + } + } + bit_nclear(bitmap, 0, node_record_count - 1); + +#if SELECT_DEBUG + /* Don't compile this, it slows things down too much */ + for (i=0; i<switch_record_cnt; i++) { + char *node_names = NULL; + if (switches_node_cnt[i]) + node_names = bitmap2node_name(switches_bitmap[i]); + debug("switch=%s nodes=%u:%s required:%u", + switch_record_table[i].name, + switches_node_cnt[i], node_names, + switches_required[i]); + xfree(node_names); + } +#endif + + if (req_nodes_bitmap && + (!bit_super_set(req_nodes_bitmap, avail_nodes_bitmap))) { + info("job %u requires nodes not available on any switch", + job_ptr->job_id); + rc = EINVAL; + goto fini; + } + + if (req_nodes_bitmap) { + /* Accumulate specific required resources, if any */ + first = bit_ffs(req_nodes_bitmap); + last = bit_fls(req_nodes_bitmap); + for (i=first; i<=last; i++) { + if (!bit_test(req_nodes_bitmap, i)) + continue; + if (max_nodes <= 0) { + info("job %u requires more nodes than allowed", + job_ptr->job_id); + rc = EINVAL; + goto fini; + } + bit_set(bitmap, i); + bit_clear(avail_nodes_bitmap, i); + rem_nodes--; + max_nodes--; + avail_cpus = _get_avail_cpus(job_ptr, i); + rem_cpus -= avail_cpus; + alloc_cpus += avail_cpus; + for (j=0; j<switch_record_cnt; j++) { + if (!bit_test(switches_bitmap[j], i)) + continue; + bit_clear(switches_bitmap[j], i); + switches_node_cnt[j]--; + } + } + if ((rem_nodes <= 0) && (rem_cpus <= 0)) + goto fini; + + /* Accumulate additional resources from leafs that + * contain required nodes */ + for (j=0; j<switch_record_cnt; j++) { + if ((switch_record_table[j].level != 0) || + (switches_node_cnt[j] == 0) || + (switches_required[j] == 0)) { + continue; + }
+ while ((max_nodes > 0) && + ((rem_nodes > 0) || (rem_cpus > 0))) { + i = bit_ffs(switches_bitmap[j]); + if (i == -1) + break; + bit_set(bitmap, i); + bit_clear(avail_nodes_bitmap, i); + bit_clear(switches_bitmap[j], i); + switches_node_cnt[j]--; + rem_nodes--; + max_nodes--; + avail_cpus = _get_avail_cpus(job_ptr, i); + rem_cpus -= avail_cpus; + alloc_cpus += avail_cpus; + } + } + if ((rem_nodes <= 0) && (rem_cpus <= 0)) + goto fini; + + /* Update bitmaps and node counts for higher-level switches */ + for (j=0; j<switch_record_cnt; j++) { + if (switches_node_cnt[j] == 0) + continue; + first = bit_ffs(switches_bitmap[j]); + last = bit_fls(switches_bitmap[j]); + for (i=first; i<=last; i++) { + if (!bit_test(switches_bitmap[j], i)) + continue; + if (!bit_test(avail_nodes_bitmap, i)) { + /* cleared from lower level */ + bit_clear(switches_bitmap[j], i); + switches_node_cnt[j]--; + } else { + switches_cpu_cnt[j] += + _get_avail_cpus(job_ptr, i); + } + } + } + } else { + /* No specific required nodes, calculate CPU counts */ + for (j=0; j<switch_record_cnt; j++) { + first = bit_ffs(switches_bitmap[j]); + last = bit_fls(switches_bitmap[j]); + for (i=first; i<=last; i++) { + if (!bit_test(switches_bitmap[j], i)) + continue; + switches_cpu_cnt[j] += + _get_avail_cpus(job_ptr, i); + } + } + } + + /* Determine lowest level switch satisfying request with best fit */ + best_fit_inx = -1; + for (j=0; j<switch_record_cnt; j++) { + if ((switches_cpu_cnt[j] < rem_cpus) || + (!_enough_nodes(switches_node_cnt[j], rem_nodes, + min_nodes, req_nodes))) + continue; + if ((best_fit_inx == -1) || + (switch_record_table[j].level < + switch_record_table[best_fit_inx].level) || + ((switch_record_table[j].level == + switch_record_table[best_fit_inx].level) && + (switches_node_cnt[j] < switches_node_cnt[best_fit_inx]))) + best_fit_inx = j; + } + if (best_fit_inx == -1) { + error("job %u: best_fit topology failure", job_ptr->job_id); + rc = EINVAL; + goto fini; + } + bit_and(avail_nodes_bitmap,
switches_bitmap[best_fit_inx]); + + /* Identify usable leafs (within higher switch having best fit) */ + for (j=0; j<switch_record_cnt; j++) { + if ((switch_record_table[j].level != 0) || + (!bit_super_set(switches_bitmap[j], + switches_bitmap[best_fit_inx]))) { + switches_node_cnt[j] = 0; + } + } + + /* Select resources from these leafs on a best-fit basis */ + while ((max_nodes > 0) && ((rem_nodes > 0) || (rem_cpus > 0))) { + best_fit_cpus = best_fit_nodes = best_fit_sufficient = 0; + for (j=0; j<switch_record_cnt; j++) { + if (switches_node_cnt[j] == 0) + continue; + sufficient = (switches_cpu_cnt[j] >= rem_cpus) && + _enough_nodes(switches_node_cnt[j], + rem_nodes, min_nodes, + req_nodes); + /* If first possibility OR */ + /* first set large enough for request OR */ + /* tightest fit (less resource waste) OR */ + /* nothing yet large enough, but this is biggest */ + if ((best_fit_nodes == 0) || + (sufficient && (best_fit_sufficient == 0)) || + (sufficient && + (switches_cpu_cnt[j] < best_fit_cpus)) || + ((sufficient == 0) && + (switches_cpu_cnt[j] > best_fit_cpus))) { + best_fit_cpus = switches_cpu_cnt[j]; + best_fit_nodes = switches_node_cnt[j]; + best_fit_location = j; + best_fit_sufficient = sufficient; + } + } + if (best_fit_nodes == 0) + break; + if ((switches_node_cnt[best_fit_location] <= max_nodes) && + ((switches_node_cnt[best_fit_location] <= rem_nodes) || + (switches_cpu_cnt[best_fit_location] <= rem_cpus))) { + /* Use the entire leaf */ + bit_or(bitmap, switches_bitmap[best_fit_location]); + rem_nodes -= switches_node_cnt[best_fit_location]; + max_nodes -= switches_node_cnt[best_fit_location]; + rem_cpus -= switches_cpu_cnt[best_fit_location]; + alloc_cpus += switches_cpu_cnt[best_fit_location]; + } else {/* Use select nodes from this leaf */ + first = bit_ffs(switches_bitmap[best_fit_location]); + last = bit_fls(switches_bitmap[best_fit_location]); + for (i=first; i<=last; i++) { + if (!bit_test(switches_bitmap + [best_fit_location], i)) + 
continue; + bit_set(bitmap, i); + rem_nodes--; + max_nodes--; + avail_cpus = _get_avail_cpus(job_ptr, i); + rem_cpus -= avail_cpus; + alloc_cpus += avail_cpus; + if ((max_nodes <= 0) || + ((rem_nodes <= 0) && (rem_cpus <= 0))) + break; + } + } + switches_node_cnt[best_fit_location] = 0; + } + if ((rem_cpus <= 0) && + _enough_nodes(0, rem_nodes, min_nodes, req_nodes)) { + rc = SLURM_SUCCESS; + } else + rc = EINVAL; + + fini: if (rc == SLURM_SUCCESS) { + /* Job's total_procs is needed for SELECT_MODE_WILL_RUN */ + job_ptr->total_procs = alloc_cpus; + } + FREE_NULL_BITMAP(avail_nodes_bitmap); + FREE_NULL_BITMAP(req_nodes_bitmap); + for (i=0; i<switch_record_cnt; i++) + bit_free(switches_bitmap[i]); + xfree(switches_bitmap); + xfree(switches_cpu_cnt); + xfree(switches_node_cnt); + xfree(switches_required); + + return rc; +} extern int select_p_job_begin(struct job_record *job_ptr) { int rc = SLURM_SUCCESS; diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index ed3657fcdaa4398ec3802f0d34f95d3f115073c4..75e42efc915d4b920a1d06bfdecc85e11a8ee10d 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -151,6 +151,8 @@ int accounting_enforce = 0; int association_based_accounting = 0; bool ping_nodes_now = false; int cluster_procs = 0; +struct switch_record *switch_record_table = NULL; +int switch_record_cnt = 0; /* Local variables */ static int daemonize = DEFAULT_DAEMONIZE; @@ -591,6 +593,7 @@ int main(int argc, char *argv[]) trigger_fini(); assoc_mgr_fini(slurmctld_conf.state_save_location); reserve_port_config(NULL); + free_switch_record_table(); /* Some plugins are needed to purge job/node data structures, * unplug after other data structures are purged */ diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 2b0c47d9041d74f1f2c14ed097c41ae7c35f01db..fd9ade6a0cda2558cc91ae49bf13d34a574896c7 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -127,22 +127,28 @@ static void _dump_hash (void); */ 
char * bitmap2node_name (bitstr_t *bitmap) { - int i; + int i, first, last; hostlist_t hl; char buf[8192]; if (bitmap == NULL) return xstrdup(""); + first = bit_ffs(bitmap); + if (first == -1) + return xstrdup(""); + + last = bit_fls(bitmap); hl = hostlist_create(""); - for (i = 0; i < node_record_count; i++) { - if (bit_test (bitmap, i) == 0) + for (i = first; i <= last; i++) { + if (bit_test(bitmap, i) == 0) continue; hostlist_push(hl, node_record_table_ptr[i].name); } hostlist_uniq(hl); hostlist_ranged_string(hl, sizeof(buf), buf); hostlist_destroy(hl); + return xstrdup(buf); } diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index f7b7646159321733d50521d1fd0fc1c785ba542e..3f7a2e0b5accd1cee6971b2a41dddd3d11ed8b5a 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -523,6 +523,7 @@ void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr) conf_ptr->task_plugin = xstrdup(conf->task_plugin); conf_ptr->task_plugin_param = conf->task_plugin_param; conf_ptr->tmp_fs = xstrdup(conf->tmp_fs); + conf_ptr->topology_plugin = xstrdup(conf->topology_plugin); conf_ptr->track_wckey = conf->track_wckey; conf_ptr->tree_width = conf->tree_width; diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index d730d0cb43140bede0d6c4f300115f74689a9706..6b20bfefc32f56374e82168cfbb93e8d16776681 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -86,24 +86,27 @@ static void _acct_restore_active_jobs(void); static int _build_bitmaps(void); static void _build_bitmaps_pre_select(void); +static int _get_switch_inx(const char *name); static int _init_all_slurm_conf(void); -static void _purge_old_node_state(struct node_record *old_node_table_ptr, - int old_node_record_count); -static int _restore_job_dependencies(void); -static int _restore_node_state(struct node_record *old_node_table_ptr, - int old_node_record_count); +static void _log_switches(void); static int _preserve_select_type_param(slurm_ctl_conf_t * ctl_conf_ptr, 
select_type_plugin_info_t old_select_type_p); static int _preserve_plugins(slurm_ctl_conf_t * ctl_conf_ptr, char *old_auth_type, char *old_checkpoint_type, char *old_crypto_type, char *old_sched_type, char *old_select_type, char *old_switch_type); +static void _purge_old_node_state(struct node_record *old_node_table_ptr, + int old_node_record_count); +static int _restore_job_dependencies(void); +static int _restore_node_state(struct node_record *old_node_table_ptr, + int old_node_record_count); static int _sync_nodes_to_comp_job(void); static int _sync_nodes_to_jobs(void); static int _sync_nodes_to_active_job(struct job_record *job_ptr); #ifdef HAVE_ELAN static void _validate_node_proc_count(void); #endif +static void _validate_switches(void); static char *highest_node_name = NULL; int node_record_count = 0; @@ -118,7 +121,6 @@ static void _build_bitmaps_pre_select(void) struct node_record *node_ptr; ListIterator part_iterator; int i; - /* scan partition table and identify nodes in each */ part_iterator = list_iterator_create(part_list); @@ -154,6 +156,8 @@ static void _build_bitmaps_pre_select(void) } } list_iterator_destroy(part_iterator); + + _validate_switches(); return; } @@ -520,7 +524,7 @@ cleanup: return error_code; } -static void _handle_all_downnodes() +static void _handle_all_downnodes(void) { slurm_conf_downnodes_t *ptr, **ptr_array; int count; @@ -598,6 +602,173 @@ static int _build_all_nodeline_info(slurm_ctl_conf_t *conf) return SLURM_SUCCESS; } +static void _validate_switches(void) +{ + slurm_conf_switches_t *ptr, **ptr_array; + int depth, i, j; + struct switch_record *switch_ptr; + hostlist_t hl; + char *child; + bitstr_t *switches_bitmap = NULL; + + free_switch_record_table(); + /* We currently only read the switch configuration directly from + * slurm.conf, but could read it from some other plugin based upon + * the value of TopologyPlugin (topology_plugin). + * We can add support for such a plugin at some time in the future. 
*/ + switch_record_cnt = slurm_conf_switch_array(&ptr_array); + if (switch_record_cnt == 0) { + debug("No switches"); + return; + } + + switch_record_table = xmalloc(sizeof(struct switch_record) * + switch_record_cnt); + switch_ptr = switch_record_table; + for (i=0; i<switch_record_cnt; i++, switch_ptr++) { + ptr = ptr_array[i]; + switch_ptr->name = xstrdup(ptr->switch_name); + if (ptr->nodes) { + switch_ptr->level = 0; /* leaf switch */ + switch_ptr->nodes = xstrdup(ptr->nodes); + if (node_name2bitmap(ptr->nodes, false, + &switch_ptr->node_bitmap)) { + fatal("Invalid node name (%s) in switch " + "config (%s)", + ptr->nodes, ptr->switch_name); + } + } else if (ptr->switches) { + switch_ptr->level = -1; /* determine later */ + switch_ptr->switches = xstrdup(ptr->switches); + } else { + fatal("Switch configuration (%s) lacks children", + ptr->switch_name); + } + } + + for (depth=1; ; depth++) { + bool resolved = true; + switch_ptr = switch_record_table; + for (i=0; i<switch_record_cnt; i++, switch_ptr++) { + if (switch_ptr->level != -1) + continue; + hl = hostlist_create(switch_ptr->switches); + if (!hl) + fatal("hostlist_create: malloc failure"); + while ((child = hostlist_pop(hl))) { + j = _get_switch_inx(child); + if ((j < 0) || (j == i)) { + fatal("Switch configuration %s has " + "invalid child (%s)", + switch_ptr->name, child); + } + if (switch_record_table[j].level == -1) { + /* Children not resolved */ + resolved = false; + switch_ptr->level = -1; + FREE_NULL_BITMAP(switch_ptr-> + node_bitmap); + free(child); + break; + } + if (switch_ptr->level == -1) { + switch_ptr->level = 1 + + switch_record_table[j].level; + switch_ptr->node_bitmap = + bit_copy(switch_record_table[j]. + node_bitmap); + } else { + switch_ptr->level = + MAX(switch_ptr->level, + (1 + switch_record_table[j]. + level)); + bit_or(switch_ptr->node_bitmap, + switch_record_table[j]. 
+ node_bitmap); + } + free(child); + } + hostlist_destroy(hl); + } + if (resolved) + break; + } + + switch_ptr = switch_record_table; + for (i=0; i<switch_record_cnt; i++, switch_ptr++) { + if (switch_ptr->node_bitmap == NULL) { + error("switch %s has no nodes", switch_ptr->name); + continue; + } + if (switches_bitmap) + bit_or(switches_bitmap, switch_ptr->node_bitmap); + else + switches_bitmap = bit_copy(switch_ptr->node_bitmap); + } + if (switches_bitmap) { + bit_not(switches_bitmap); + i = bit_set_count(switches_bitmap); + if (i >= 0) { + child = bitmap2node_name(switches_bitmap); + error("switches lack access to %d nodes: %s", + i, child); + xfree(child); + } + bit_free(switches_bitmap); + } else + fatal("switches contain no nodes"); + _log_switches(); +} + +static void _log_switches(void) +{ + int i; + struct switch_record *switch_ptr; + + switch_ptr = switch_record_table; + for (i=0; i<switch_record_cnt; i++, switch_ptr++) { + if (!switch_ptr->nodes) { + switch_ptr->nodes = bitmap2node_name(switch_ptr-> + node_bitmap); + } + debug("Switch level:%d name:%s nodes:%s switches:%s", + switch_ptr->level, switch_ptr->name, + switch_ptr->nodes, switch_ptr->switches); + } +} + +/* Return the index of a given switch name or -1 if not found */ +static int _get_switch_inx(const char *name) +{ + int i; + struct switch_record *switch_ptr; + + switch_ptr = switch_record_table; + for (i=0; i<switch_record_cnt; i++, switch_ptr++) { + if (strcmp(switch_ptr->name, name) == 0) + return i; + } + + return -1; +} + +/* Free all memory associated with switch_record_table structure */ +extern void free_switch_record_table(void) +{ + int i; + + if (switch_record_table) { + for (i=0; i<switch_record_cnt; i++) { + xfree(switch_record_table[i].name); + xfree(switch_record_table[i].nodes); + xfree(switch_record_table[i].switches); + FREE_NULL_BITMAP(switch_record_table[i].node_bitmap); + } + xfree(switch_record_table); + switch_record_cnt = 0; + } +} + /* * 
_build_single_partitionline_info - get a array of slurm_conf_partition_t * structures from the slurm.conf reader, build table, and set values diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index cfbd292fc9c5363c1359148ea32a293d8de15542..6aac00f0d4381cfc1241c44f27408f7745e1bfc7 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -263,6 +263,21 @@ extern bitstr_t *idle_node_bitmap; /* bitmap of idle nodes */ extern bitstr_t *share_node_bitmap; /* bitmap of sharable nodes */ extern bitstr_t *up_node_bitmap; /* bitmap of up nodes, not DOWN */ +/*****************************************************************************\ + * SWITCH topology data structures +\*****************************************************************************/ +struct switch_record { + int level; /* level in hierarchy, leaf=0 */ + char *name; /* switch name */ + bitstr_t *node_bitmap; /* bitmap of all nodes descended from + * this switch */ + char *nodes; /* name of direct descendant nodes */ + char *switches; /* name of direct descendant switches */ +}; + +extern struct switch_record *switch_record_table; /* ptr to switch records */ +extern int switch_record_cnt; /* size of switch_record_table */ + /*****************************************************************************\ * PARTITION parameters and data structures \*****************************************************************************/ @@ -774,6 +789,9 @@ extern struct node_record *find_node_record (char *name); */ extern struct part_record *find_part_record (char *name); +/* Free all memory associated with switch_record_table structure */ +extern void free_switch_record_table(void); + /* * get_job_env - return the environment variables and their count for a * given job