From 215a7b93fdc46d460b7d6c4f97585420ff95dba4 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Tue, 1 Nov 2005 19:33:38 +0000
Subject: [PATCH] Permit nodes to be in multiple partitions simultaneously.

---
 NEWS                                  |   1 +
 doc/html/quickstart_admin.html        |  11 ++-
 doc/man/man5/slurm.conf.5             |  18 +++--
 etc/slurm.conf.example                |   2 +-
 slurm/slurm.h.in                      |   1 -
 src/api/node_info.c                   |  13 +---
 src/common/slurm_protocol_defs.c      |   1 -
 src/common/slurm_protocol_pack.c      |   2 -
 src/plugins/sched/backfill/backfill.c |  11 ++-
 src/sinfo/sinfo.c                     | 104 +++++++++++++-------
 src/slurmctld/node_mgr.c              |  50 +++++++++----
 src/slurmctld/partition_mgr.c         |  70 ++++++++++++-----
 src/slurmctld/read_config.c           |  37 ++++-----
 src/slurmctld/sched_upcalls.c         |  11 ++-
 src/slurmctld/slurmctld.h             |   4 +-
 testsuite/expect/test3.5              |  72 +++++++++++++++++-
 16 files changed, 268 insertions(+), 140 deletions(-)

diff --git a/NEWS b/NEWS
index 1659de504c2..188e01d6e1f 100644
--- a/NEWS
+++ b/NEWS
@@ -7,6 +7,7 @@ documents those changes that are of interest to users and admins.
     with a DRAIN flag. COMPLETING state is changed to a COMPLETING flag.
  -- Test suite moved into testsuite/expect from separate repository.
  -- Added new document describing slurm APIs (doc/html/api.html).
+ -- Permit nodes to be in multiple partitions simultaneously.

 * Changes in SLURM 0.7.0-pre2
 =============================
diff --git a/doc/html/quickstart_admin.html b/doc/html/quickstart_admin.html
index d2a2d652f22..6275ba6243d 100644
--- a/doc/html/quickstart_admin.html
+++ b/doc/html/quickstart_admin.html
@@ -9,7 +9,7 @@
 <meta http-equiv="keywords"
       content="Simple Linux Utility for Resource Management, SLURM, resource management, Linux clusters, high-performance computing, Livermore Computing">
 <meta name="LLNLRandR" content="UCRL-WEB-213976">
-<meta name="LLNLRandRdate" content="25 October 2005">
+<meta name="LLNLRandRdate" content="1 November 2005">
 <meta name="distribution" content="global">
 <meta name="description" content="Simple Linux Utility for Resource Management">
 <meta name="copyright"
@@ -315,8 +315,11 @@ In this case "emcri" is the private management network interface
 for the host "mcri". Port numbers to be used for communications are specified
 as well as various timer values.</p>
-<p>A description of the nodes and their grouping into non-overlapping partitions
-is required. Partition and node specifications use node range expressions to identify
+<p>A description of the nodes and their grouping into partitions is required.
+Nodes can be in more than one partition and each partition can have different
+constraints (permitted users, time limits, job size limits, etc.).
+Each partition can thus be considered a separate queue.
+Partition and node specifications use node range expressions to identify
 nodes in a concise fashion. This configuration file defines a 1154-node cluster
 for SLURM, but it might be used for a much larger cluster by just changing a few
 node range expressions. Specify the minimum processor count (Procs), real memory
@@ -571,7 +574,7 @@ in the NEWS file.
 <td colspan="3"><hr>
 <p>For information about this page, contact <a href="mailto:slurm-dev@lists.llnl.gov">slurm-dev@lists.llnl.gov</a>.</p>
 <p><a href="http://www.llnl.gov/"><img align=middle src="lll.gif" width="32" height="32" border="0"></a></p>
 <p class="footer">UCRL-WEB-213976<br>
-Last modified 25 October 2005</p></td>
+Last modified 1 November 2005</p></td>
 </tr>
 </table>
 </td>
diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index 47db07119d9..98dd95b7e4e 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -666,9 +666,13 @@ Weight is an integer value with a default value of 1.
 .LP
 The partition configuration permits you to establish different job limits
 or access controls for various groups (or partitions) of nodes.
-Nodes may be in only one partition. Jobs are allocated resources
-within a single partition. The partition configuration
-file contains the following information:
+Nodes may be in more than one partition, making partitions serve
+as general purpose queues.
+For example, one may put the same set of nodes into two different
+partitions, each with different constraints (time limit, job sizes,
+groups allowed to use the partition, etc.).
+Jobs are allocated resources within a single partition.
+The partition configuration file contains the following information:
 .TP
 \fBAllowGroups\fR
 Comma separated list of group IDs which may execute jobs in the partition.
@@ -843,11 +847,13 @@ NodeName=dev20 State=DOWN Reason="power,ETA=Dec25"
 .br
 #
 .br
-PartitionName=DEFAULT MaxTime=30 MaxNodes=10
+PartitionName=DEFAULT MaxTime=30 MaxNodes=10 State=UP
 .br
-PartitionName=debug Nodes=dev[0-8,18-25] State=UP Default=YES
+PartitionName=debug Nodes=dev[0-8,18-25] Default=YES
 .br
-PartitionName=batch Nodes=dev[9-17] State=UP MinNodes=4
+PartitionName=batch Nodes=dev[9-17] MinNodes=4
+.br
+PartitionName=long Nodes=dev[9-17] MaxTime=120 AllowGroups=admin
 .SH "COPYING"
 Copyright (C) 2002 The Regents of the University of California.
diff --git a/etc/slurm.conf.example b/etc/slurm.conf.example
index 6e0830d231a..53b1d4e32e3 100644
--- a/etc/slurm.conf.example
+++ b/etc/slurm.conf.example
@@ -549,7 +549,7 @@ JobAcctType=jobacct/none
 # o Partition Configuration
 #
 # Paritions are groups of nodes which (possibly) have different limits
-# and access controls. Nodes may only be in one partition and jobs will
+# and access controls. Nodes may be in multiple partitions. Jobs will
 # not be allowed to span partitions.  The following partition configuration
 # parameters are recognized:
 #
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index fcbefd2db86..c8fef5607b3 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -448,7 +448,6 @@ typedef struct node_info {
        uint32_t tmp_disk;      /* configured MB of total disk in TMP_FS */
        uint32_t weight;        /* arbitrary priority of node for scheduling */
        char *features;         /* arbitrary list of features for node */
-       char *partition;        /* name of partition node configured to */
        char *reason;           /* reason for node being DOWN or DRAINING */
 } node_info_t;
diff --git a/src/api/node_info.c b/src/api/node_info.c
index 3a59be12eca..fe3c1b87b0e 100644
--- a/src/api/node_info.c
+++ b/src/api/node_info.c
@@ -102,20 +102,13 @@ slurm_print_node_table ( FILE * out, node_info_t * node_ptr, int one_liner )
                fprintf ( out, "\n   ");

        /****** Line 2 ******/
-       fprintf ( out, "Weight=%u Partition=%s Features=%s",
-               node_ptr->weight, node_ptr->partition, node_ptr->features);
+       fprintf ( out, "Weight=%u Features=%s ",
+               node_ptr->weight, node_ptr->features);
+       fprintf ( out, "Reason=%s", node_ptr->reason);
        if (one_liner)
                fprintf ( out, " ");
        else
                fprintf ( out, "\n   ");
-
-
-       /****** Line 3 ******/
-       fprintf ( out, "Reason=%s", node_ptr->reason);
-       if (one_liner)
-               fprintf ( out, "\n");
-       else
-               fprintf ( out, "\n\n");
 }
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index 9ffd53dbd9d..18fe226a9d3 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -866,7 +866,6 @@ static void _slurm_free_node_info_members(node_info_t * node)
        if (node) {
                xfree(node->name);
                xfree(node->features);
-               xfree(node->partition);
                xfree(node->reason);
        }
 }
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index 54bd1865a07..e3e6619036b 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -1168,7 +1168,6 @@ _unpack_node_info_members(node_info_t * node, Buf buffer)
        safe_unpack32(&node->tmp_disk, buffer);
        safe_unpack32(&node->weight, buffer);
        safe_unpackstr_xmalloc(&node->features, &uint16_tmp, buffer);
-       safe_unpackstr_xmalloc(&node->partition, &uint16_tmp, buffer);
        safe_unpackstr_xmalloc(&node->reason, &uint16_tmp, buffer);

        return SLURM_SUCCESS;
@@ -1176,7 +1175,6 @@ _unpack_node_info_members(node_info_t * node, Buf buffer)
 unpack_error:
        xfree(node->name);
        xfree(node->features);
-       xfree(node->partition);
        xfree(node->reason);
        return SLURM_ERROR;
 }
diff --git a/src/plugins/sched/backfill/backfill.c b/src/plugins/sched/backfill/backfill.c
index 57781a9c182..a07e7c0aeb6 100644
--- a/src/plugins/sched/backfill/backfill.c
+++ b/src/plugins/sched/backfill/backfill.c
@@ -317,7 +317,7 @@ _attempt_backfill(struct part_record *part_ptr)
 static void
 _get_part_specs(struct part_record *part_ptr, part_specs_t *part_specs)
 {
-       int i;
+       int i, j;

        part_specs->idle_node_cnt = 0;
        part_specs->max_cpus      = 0;
@@ -327,8 +327,15 @@ _get_part_specs(struct part_record *part_ptr, part_specs_t *part_specs)

        for (i=0; i<node_record_count; i++) {
                struct node_record *node_ptr = &node_record_table_ptr[i];
+               bool found_part = false;

-               if (node_ptr->partition_ptr != part_ptr)
+               for (j=0; j<node_ptr->part_cnt; j++) {
+                       if (node_ptr->part_pptr[j] != part_ptr)
+                               continue;
+                       found_part = true;
+                       break;
+               }
+               if (found_part == false)
                        continue;       /* different partition */
                if (node_ptr->node_state == NODE_STATE_IDLE)
                        part_specs->idle_node_cnt++;
diff --git a/src/sinfo/sinfo.c b/src/sinfo/sinfo.c
index 6168f66a489..af1930322dd 100644
--- a/src/sinfo/sinfo.c
+++ b/src/sinfo/sinfo.c
@@ -57,8 +57,7 @@ static void _create_sinfo(List sinfo_list, partition_info_t* part_ptr,
                        uint16_t part_inx, node_info_t *node_ptr);
 static bool _filter_out(node_info_t *node_ptr);
 static void _sinfo_list_delete(void *data);
-static partition_info_t *_find_part(char *part_name,
-               partition_info_msg_t *partition_msg, uint16_t *part_inx);
+static node_info_t *_find_node(char *node_name, node_info_msg_t *node_msg);
 static bool _match_node_data(sinfo_data_t *sinfo_ptr,
                        node_info_t *node_ptr);
 static bool _match_part_data(sinfo_data_t *sinfo_ptr,
@@ -67,8 +66,7 @@ static int _query_server(partition_info_msg_t ** part_pptr,
                node_info_msg_t ** node_pptr);
 static void _sort_hostlist(List sinfo_list);
 static int  _strcmp(char *data1, char *data2);
-static void _update_sinfo(sinfo_data_t *sinfo_ptr,
-               partition_info_t* part_ptr, node_info_t *node_ptr);
+static void _update_sinfo(sinfo_data_t *sinfo_ptr, node_info_t *node_ptr);

 int main(int argc, char *argv[])
 {
@@ -277,7 +275,9 @@ static int _build_sinfo_data(List sinfo_list,
        partition_info_t* part_ptr;
        ListIterator i;
        int j;
-       uint16_t part_inx;
+       hostlist_t hl;
+       sinfo_data_t *sinfo_ptr;
+       char *node_name = NULL;

        /* by default every partition is shown, even if no nodes */
        if ((!params.node_flag) && params.match_flags.partition_flag) {
@@ -290,37 +290,42 @@ static int _build_sinfo_data(List sinfo_list,
                }
        }

-       /* make sinfo_list entries for each node */
-       for (j=0; j<node_msg->record_count; j++) {
-               sinfo_data_t *sinfo_ptr;
-               node_ptr = &(node_msg->node_array[j]);
-
-               if (params.filtering && _filter_out(node_ptr))
-                       continue;
-
-               part_ptr = _find_part(node_ptr->partition, partition_msg,
-                               &part_inx);
-               if ( ! part_ptr )
+       /* make sinfo_list entries for every node in every partition */
+       for (j=0; j<partition_msg->record_count; j++, part_ptr++) {
+               part_ptr = &(partition_msg->partition_array[j]);
+               if (params.filtering
+               &&  _strcmp(part_ptr->name, params.partition))
                        continue;

-               i = list_iterator_create(sinfo_list);
-               /* test if node can be added to existing sinfo_data entry */
-               while ((sinfo_ptr = list_next(i))) {
-                       if (!_match_part_data(sinfo_ptr, part_ptr))
+               hl = hostlist_create(part_ptr->nodes);
+               while (1) {
+                       if (node_name)
+                               free(node_name);
+                       node_name = hostlist_shift(hl);
+                       if (!node_name)
+                               break;
+                       node_ptr = _find_node(node_name, node_msg);
+                       if (!node_ptr)
                                continue;
-                       if (sinfo_ptr->nodes_tot &&
-                           (!_match_node_data(sinfo_ptr, node_ptr)))
+                       if (params.filtering && _filter_out(node_ptr))
                                continue;
-
-                       /* This node has the same configuration as this
-                        * sinfo_data, just add to this record */
-                       _update_sinfo(sinfo_ptr, part_ptr, node_ptr);
-                       break;
+                       i = list_iterator_create(sinfo_list);
+                       while ((sinfo_ptr = list_next(i))) {
+                               if (!_match_part_data(sinfo_ptr, part_ptr))
+                                       continue;
+                               if (sinfo_ptr->nodes_tot
+                               &&  (!_match_node_data(sinfo_ptr, node_ptr)))
+                                       continue;
+                               _update_sinfo(sinfo_ptr, node_ptr);
+                               break;
+                       }
+                       /* if no match, create new sinfo_data entry */
+                       if (sinfo_ptr == NULL) {
+                               _create_sinfo(sinfo_list, part_ptr,
+                                       (uint16_t) j, node_ptr);
+                       }
+                       list_iterator_destroy(i);
                }
-
-               /* no match, create new sinfo_data entry */
-               if (sinfo_ptr == NULL)
-                       _create_sinfo(sinfo_list, part_ptr, part_inx, node_ptr);
-               list_iterator_destroy(i);
+               hostlist_destroy(hl);
        }

        _sort_hostlist(sinfo_list);
@@ -337,10 +342,6 @@ static bool _filter_out(node_info_t *node_ptr)
 {
        static hostlist_t host_list = NULL;

-       if (params.partition &&
-           _strcmp(node_ptr->partition, params.partition))
-               return true;
-
        if (params.nodes) {
                if (host_list == NULL)
                        host_list = hostlist_create(params.nodes);
@@ -477,8 +478,7 @@ static bool _match_part_data(sinfo_data_t *sinfo_ptr,
        return true;
 }

-static void _update_sinfo(sinfo_data_t *sinfo_ptr, partition_info_t* part_ptr,
-               node_info_t *node_ptr)
+static void _update_sinfo(sinfo_data_t *sinfo_ptr, node_info_t *node_ptr)
 {
        uint16_t base_state;

@@ -494,6 +494,10 @@ static void _update_sinfo(sinfo_data_t *sinfo_ptr, node_info_t *node_ptr)
                sinfo_ptr->max_mem    = node_ptr->real_memory;
                sinfo_ptr->min_weight = node_ptr->weight;
                sinfo_ptr->max_weight = node_ptr->weight;
+       } else if (hostlist_find(sinfo_ptr->nodes, node_ptr->name) != -1) {
+               /* we already have this node in this record,
+                * just return, don't duplicate */
+               return;
        } else {
                if (sinfo_ptr->min_cpus > node_ptr->cpus)
@@ -585,25 +589,23 @@ static void _create_sinfo(List sinfo_list, partition_info_t* part_ptr,
 }

 /*
- * _find_part - find a partition by name
- * part_name IN     - name of partition to locate
- * partition_msg IN - partition information message from API
- * part_inx OUT     - index of the partition within the table (0-origin)
+ * _find_node - find a node by name
+ * node_name IN - name of node to locate
+ * node_msg IN  - node information message from API
  */
-static partition_info_t *_find_part(char *part_name,
-               partition_info_msg_t *partition_msg,
-               uint16_t *part_inx)
+static node_info_t *_find_node(char *node_name, node_info_msg_t *node_msg)
 {
        int i;
-       for (i=0; i<partition_msg->record_count; i++) {
-               if (_strcmp(part_name,
-                           partition_msg->partition_array[i].name))
+
+       if (node_name == NULL)
+               return NULL;
+
+       for (i=0; i<node_msg->record_count; i++) {
+               if (_strcmp(node_name, node_msg->node_array[i].name))
                        continue;
-               *part_inx = i;
-               return &(partition_msg->partition_array[i]);
+               return &(node_msg->node_array[i]);
        }
-       *part_inx = 0;  /* not correct, but better than random data */
+
+       /* not found */
        return NULL;
 }
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
index acf3df8ab62..6c9481fdc14 100644
--- a/src/slurmctld/node_mgr.c
+++ b/src/slurmctld/node_mgr.c
@@ -80,6 +80,7 @@ static void     _list_delete_config (void *config_entry);
 static int      _list_find_config (void *config_entry, void *key);
 static void     _make_node_down(struct node_record *node_ptr);
 static void     _node_did_resp(struct node_record *node_ptr);
+static bool     _node_is_hidden(struct node_record *node_ptr);
 static void     _node_not_resp (struct node_record *node_ptr, time_t msg_time);
 static void     _pack_node (struct node_record *dump_node_ptr, Buf buffer);
 static void     _sync_bitmaps(struct node_record *node_ptr, int job_count);
@@ -197,7 +198,8 @@ create_node_record (struct config_record *config_ptr, char *node_name)
        node_ptr->node_state = default_node_record.node_state;
        node_ptr->last_response = default_node_record.last_response;
        node_ptr->config_ptr = config_ptr;
-       node_ptr->partition_ptr = NULL;
+       node_ptr->part_cnt = 0;
+       node_ptr->part_pptr = NULL;
        /* these values will be overwritten when the node actually registers */
        node_ptr->cpus = config_ptr->cpus;
        node_ptr->real_memory = config_ptr->real_memory;
@@ -414,6 +416,8 @@ extern int load_all_node_state ( bool state_only )
                        node_ptr->node_state    = node_state;
                        xfree(node_ptr->reason);
                        node_ptr->reason        = reason;
+                       node_ptr->part_cnt      = 0;
+                       xfree(node_ptr->part_pptr);
                        node_ptr->cpus          = cpus;
                        node_ptr->real_memory   = real_memory;
                        node_ptr->tmp_disk      = tmp_disk;
@@ -530,7 +534,8 @@ int init_node_conf (void)
        default_node_record.real_memory = 1;
        default_node_record.tmp_disk = 1;
        default_node_record.config_ptr = NULL;
-       default_node_record.partition_ptr = NULL;
+       default_node_record.part_cnt = 0;
+       default_node_record.part_pptr = NULL;
        default_config_record.cpus = 1;
        default_config_record.real_memory = 1;
        default_config_record.tmp_disk = 1;
@@ -650,6 +655,22 @@ extern int node_name2bitmap (char *node_names, bool best_effort,
        return rc;
 }

+static bool _node_is_hidden(struct node_record *node_ptr)
+{
+       int i;
+       bool shown = false;
+
+       for (i=0; i<node_ptr->part_cnt; i++) {
+               if (node_ptr->part_pptr[i]->hidden == 0) {
+                       shown = true;
+                       break;
+               }
+       }
+
+       if (shown || (node_ptr->part_cnt == 0))
+               return false;
+       return true;
+}

 /*
  * pack_all_node - dump all configuration and node information for all nodes
@@ -689,9 +710,8 @@ extern void pack_all_node (char **buffer_ptr, int *buffer_size,

                        xassert (node_ptr->config_ptr->magic == CONFIG_MAGIC);

-                       if (((show_flags & SHOW_ALL) == 0) &&
-                           (node_ptr->partition_ptr) &&
-                           (node_ptr->partition_ptr->hidden))
+                       if (((show_flags & SHOW_ALL) == 0)
+                       &&  (_node_is_hidden(node_ptr)))
                                continue;

                        _pack_node(node_ptr, buffer);
@@ -735,10 +755,6 @@ static void _pack_node (struct node_record *dump_node_ptr, Buf buffer)
        }
        pack32  (dump_node_ptr->config_ptr->weight, buffer);
        packstr (dump_node_ptr->config_ptr->feature, buffer);
-       if (dump_node_ptr->partition_ptr)
-               packstr (dump_node_ptr->partition_ptr->name, buffer);
-       else
-               packstr (NULL, buffer);
        packstr (dump_node_ptr->reason, buffer);
 }
@@ -1037,7 +1053,7 @@ validate_node_specs (char *node_name, uint32_t cpus,
                     uint32_t real_memory, uint32_t tmp_disk, uint32_t job_count,
                     uint32_t status)
 {
-       int error_code;
+       int error_code, i;
        struct config_record *config_ptr;
        struct node_record *node_ptr;
        char *reason_down = NULL;
@@ -1056,9 +1072,13 @@ validate_node_specs (char *node_name, uint32_t cpus,
                error_code = EINVAL;
                reason_down = "Low CPUs";
        }
-       if ((node_ptr->cpus != cpus) && (node_ptr->partition_ptr) &&
-           (slurmctld_conf.fast_schedule == 0))
-               node_ptr->partition_ptr->total_cpus += (cpus - node_ptr->cpus);
+       if ((node_ptr->cpus != cpus)
+       &&  (slurmctld_conf.fast_schedule == 0)) {
+               for (i=0; i<node_ptr->part_cnt; i++) {
+                       node_ptr->part_pptr[i]->total_cpus +=
+                               (cpus - node_ptr->cpus);
+               }
+       }
        node_ptr->cpus = cpus;

        if (real_memory < config_ptr->real_memory) {
@@ -1849,8 +1869,10 @@ void node_fini(void)
                config_list = NULL;
        }

-       for (i=0; i< node_record_count; i++)
+       for (i=0; i< node_record_count; i++) {
+               xfree(node_record_table_ptr[i].part_pptr);
                xfree(node_record_table_ptr[i].reason);
+       }

        FREE_NULL_BITMAP(idle_node_bitmap);
        FREE_NULL_BITMAP(avail_node_bitmap);
diff --git a/src/slurmctld/partition_mgr.c b/src/slurmctld/partition_mgr.c
index 44b69095393..c9680a5baf7 100644
--- a/src/slurmctld/partition_mgr.c
+++ b/src/slurmctld/partition_mgr.c
@@ -72,7 +72,8 @@ static uid_t *_get_group_members(char *group_name);
 static time_t _get_group_tlm(void);
 static void   _list_delete_part(void *part_entry);
 static int    _uid_list_size(uid_t * uid_list_ptr);
-static void   _unlink_free_nodes(bitstr_t *old_bitmap);
+static void   _unlink_free_nodes(bitstr_t *old_bitmap,
+               struct part_record *part_ptr);

 /*
  * _build_part_bitmap - update the total_cpus, total_nodes, and node_bitmap
@@ -109,7 +110,7 @@ static int _build_part_bitmap(struct part_record *part_ptr)
        }

        if (part_ptr->nodes == NULL) {  /* no nodes in partition */
-               _unlink_free_nodes(old_bitmap);
+               _unlink_free_nodes(old_bitmap, part_ptr);
                FREE_NULL_BITMAP(old_bitmap);
                return 0;
        }
@@ -136,7 +137,10 @@ static int _build_part_bitmap(struct part_record *part_ptr)
                        part_ptr->total_cpus += node_ptr->config_ptr->cpus;
                else
                        part_ptr->total_cpus += node_ptr->cpus;
-               node_ptr->partition_ptr = part_ptr;
+               node_ptr->part_cnt++;
+               xrealloc(node_ptr->part_pptr, (node_ptr->part_cnt *
+                       sizeof(struct part_record *)));
+               node_ptr->part_pptr[node_ptr->part_cnt-1] = part_ptr;
                if (old_bitmap)
                        bit_clear(old_bitmap,
                                  (int) (node_ptr -
@@ -147,27 +151,41 @@ static int _build_part_bitmap(struct part_record *part_ptr)
        }
        hostlist_destroy(host_list);

-       _unlink_free_nodes(old_bitmap);
+       _unlink_free_nodes(old_bitmap, part_ptr);
        last_node_update = time(NULL);
        FREE_NULL_BITMAP(old_bitmap);
        return 0;
 }

-/* unlink nodes removed from the partition */
-static void _unlink_free_nodes(bitstr_t *old_bitmap)
+/* unlink nodes removed from a partition */
+static void _unlink_free_nodes(bitstr_t *old_bitmap,
+               struct part_record *part_ptr)
 {
-       int i, update_nodes = 0;
+       int i, j, k, update_nodes = 0;
+       struct node_record *node_ptr;

-       if (old_bitmap) {
-               for (i = 0; i < node_record_count; i++) {
-                       if (bit_test(old_bitmap, i) == 0)
+       if (old_bitmap == NULL)
+               return;
+
+       node_ptr = &node_record_table_ptr[0];
+       for (i = 0; i < node_record_count; i++, node_ptr++) {
+               if (bit_test(old_bitmap, i) == 0)
+                       continue;
+               for (j=0; j<node_ptr->part_cnt; j++) {
+                       if (node_ptr->part_pptr[j] != part_ptr)
                                continue;
-                       node_record_table_ptr[i].partition_ptr = NULL;
-                       update_nodes = 1;
+                       node_ptr->part_cnt--;
+                       for (k=j; k<node_ptr->part_cnt; k++) {
+                               node_ptr->part_pptr[k] =
+                                       node_ptr->part_pptr[k+1];
+                       }
+                       break;
                }
-               if (update_nodes)
-                       last_node_update = time(NULL);
+               update_nodes = 1;
        }
+
+       if (update_nodes)
+               last_node_update = time(NULL);
 }
@@ -537,14 +555,22 @@ int init_part_conf(void)
 static void _list_delete_part(void *part_entry)
 {
        struct part_record *part_ptr;
-       int i;
+       struct node_record *node_ptr;
+       int i, j, k;

        part_ptr = (struct part_record *) part_entry;
-       for (i = 0; i < node_record_count; i++) {
-               if (node_record_table_ptr[i].partition_ptr !=
-                   part_ptr)
-                       continue;
-               node_record_table_ptr[i].partition_ptr = NULL;
+       node_ptr = &node_record_table_ptr[0];
+       for (i = 0; i < node_record_count; i++, node_ptr++) {
+               for (j=0; j<node_ptr->part_cnt; j++) {
+                       if (node_ptr->part_pptr[j] != part_ptr)
+                               continue;
+                       node_ptr->part_cnt--;
+                       for (k=j; k<node_ptr->part_cnt; k++) {
+                               node_ptr->part_pptr[k] =
+                                       node_ptr->part_pptr[k+1];
+                       }
+                       break;
+               }
        }
        xfree(part_ptr->allow_groups);
        xfree(part_ptr->allow_uids);
@@ -1038,6 +1064,10 @@ extern int delete_partition(delete_part_msg_t *part_desc_ptr)
        if (part_ptr == NULL)   /* No such partition */
                return ESLURM_INVALID_PARTITION_NAME;

+       if (default_part_loc == part_ptr) {
+               error("Deleting default partition %s", part_ptr->name);
+               default_part_loc = NULL;
+       }
        (void) kill_job_by_part_name(part_desc_ptr->name);
        list_delete_all(part_list, list_find_part, part_desc_ptr->name);
        last_part_update = time(NULL);
diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c
index 0027756d4ff..4e3fa9e2fa1 100644
--- a/src/slurmctld/read_config.c
+++ b/src/slurmctld/read_config.c
@@ -106,7 +106,6 @@ static int _build_bitmaps(void)
        struct node_record *node_ptr;
        struct job_record  *job_ptr;
        ListIterator job_iterator;
-       bitstr_t *all_part_node_bitmap;
        hostlist_t host_list;

        last_node_update = time(NULL);
@@ -187,9 +186,6 @@ static int _build_bitmaps(void)
        }

        /* scan partition table and identify nodes in each */
-       all_part_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
-       if (all_part_node_bitmap == NULL)
-               fatal ("bit_alloc malloc failure");
        part_iterator = list_iterator_create(part_list);
        if (part_iterator == NULL)
                fatal ("memory allocation failure");
@@ -222,29 +218,22 @@ static int _build_bitmaps(void)
                                continue;
                        }
                        j = node_ptr - node_record_table_ptr;
-                       if (bit_test(all_part_node_bitmap, j) == 1) {
-                               error("_build_bitmaps: node %s defined in "
-                                     "more than one partition",
-                                     this_node_name);
-                               error("_build_bitmaps: only the first "
-                                     "specification is honored");
-                       } else {
-                               bit_set(part_ptr->node_bitmap, j);
-                               bit_set(all_part_node_bitmap, j);
-                               part_ptr->total_nodes++;
-                               if (slurmctld_conf.fast_schedule)
-                                       part_ptr->total_cpus +=
-                                               node_ptr->config_ptr->cpus;
-                               else
-                                       part_ptr->total_cpus += node_ptr->cpus;
-                               node_ptr->partition_ptr = part_ptr;
-                       }
+                       bit_set(part_ptr->node_bitmap, j);
+                       part_ptr->total_nodes++;
+                       if (slurmctld_conf.fast_schedule)
+                               part_ptr->total_cpus +=
+                                       node_ptr->config_ptr->cpus;
+                       else
+                               part_ptr->total_cpus += node_ptr->cpus;
+                       node_ptr->part_cnt++;
+                       xrealloc(node_ptr->part_pptr, (node_ptr->part_cnt *
+                               sizeof(struct part_record *)));
+                       node_ptr->part_pptr[node_ptr->part_cnt-1] = part_ptr;
                        free(this_node_name);
                }
                hostlist_destroy(host_list);
        }
        list_iterator_destroy(part_iterator);
-       bit_free(all_part_node_bitmap);

        return error_code;
 }
@@ -934,8 +923,10 @@ static void _purge_old_node_state(struct node_record *old_node_table_ptr,
 {
        int i;

-       for (i = 0; i < old_node_record_count; i++)
+       for (i = 0; i < old_node_record_count; i++) {
+               xfree(old_node_table_ptr[i].part_pptr);
                xfree(old_node_table_ptr[i].reason);
+       }
        xfree(old_node_table_ptr);
 }
diff --git a/src/slurmctld/sched_upcalls.c b/src/slurmctld/sched_upcalls.c
index 42ef602a875..93634514adb 100644
--- a/src/slurmctld/sched_upcalls.c
+++ b/src/slurmctld/sched_upcalls.c
@@ -1010,6 +1010,11 @@ sched_get_node_tmp_disk( sched_obj_list_t node_data,

 /* ************************************************************************ */
 /*  TAG( sched_get_node_partition )                                         */
+/* NOTE: A SLURM node can be in multiple partitions/queues at the same time */
+/* We return only the first of these partition names here or NULL if there */
+/* are no associated partitions. There are 'part_cnt' partitions associated */
+/* with each node. There is an array of pointers to these partitions in the */
+/* array 'part_pptr'. We probably want to change this function accordingly. */
 /* ************************************************************************ */
 void * sched_get_node_partition( sched_obj_list_t node_data,
@@ -1017,7 +1022,11 @@ sched_get_node_partition( sched_obj_list_t node_data,
                                 char *type )
 {
        if ( type ) *type = 's';
-       return ( (struct node_record *) node_data->data )[ idx ].partition_ptr->name;
+       if ( ((struct node_record *) node_data->data )[ idx ].part_cnt == 0 )
+               return NULL;
+
+       return ( (struct node_record *) node_data->data )[ idx ].
+               part_pptr[0]->name;
 }

 /* ************************************************************************ */
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 7e17d20e5b5..c4e7245067d 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -167,7 +167,9 @@ struct node_record {
        uint32_t real_memory;           /* MB real memory on the node */
        uint32_t tmp_disk;              /* MB total disk in TMP_FS */
        struct config_record *config_ptr;  /* configuration spec ptr */
-       struct part_record *partition_ptr; /* partition for this node */
+       uint16_t part_cnt;              /* number of associated partitions */
+       struct part_record **part_pptr; /* array of pointers to partitions
+                                        * associated with this node */
        char comm_name[MAX_NAME_LEN];   /* communications path name to node */
        slurm_addr slurm_addr;          /* network address */
        uint16_t comp_job_cnt;          /* count of jobs completing on node */
diff --git a/testsuite/expect/test3.5 b/testsuite/expect/test3.5
index 8776a52e0cd..b2fb109de51 100755
--- a/testsuite/expect/test3.5
+++ b/testsuite/expect/test3.5
@@ -70,10 +70,44 @@ if {$found == 1} {
        exit 1
 }

+#
+# Identify usable nodes in default partition
+#
+set def_name ""
+set def_node ""
+spawn $sinfo -h -o %32P
+expect {
+       -re "($alpha_numeric_under)(\\*)" {
+               set def_name $expect_out(1,string)
+               exp_continue
+       }
+       eof {
+               wait
+       }
+}
+if {[string compare $def_name ""] == 0} {
+       send_user "\nFAILURE: failed to find default partition\n"
+       exit 1
+}
+spawn $sinfo -h -o %N -p $def_name
+expect {
+       -re "(\[a-zA-Z0-9_-\]+)" {
+               set def_node $expect_out(1,string)
+               exp_continue
+       }
+       eof {
+               wait
+       }
+}
+if {[string compare $def_node ""] == 0} {
+       send_user "\nFAILURE: default partition seems to have no nodes\n"
+       exit 1
+}
+
 #
 # Create a new partition
 #
-spawn $scontrol update PartitionName=$part_name
+spawn $scontrol update PartitionName=$part_name Nodes=$def_node
 expect {
        -re "slurm_update error: ($alpha_numeric) ($alpha_numeric)" {
                set access_err 0
@@ -139,7 +173,7 @@ if {$allow != 1} {
 }

 #
-# Now set group to mine
+# Now set AllowGroups to mine and MaxTime=1
 #
 spawn $bin_id -gn
 expect {
@@ -155,7 +189,7 @@ expect {
                wait
        }
 }
-spawn $scontrol update PartitionName=$part_name AllowGroups=$my_group
+spawn $scontrol update PartitionName=$part_name AllowGroups=$my_group MaxTime=1
 expect {
        timeout {
                send_user "\nFAILURE: scontrol not responding\n"
@@ -185,6 +219,38 @@ if {$found != 1} {
        set exit_code 1
 }

+#
+# Run a job in this new partition and validate the time limit
+#
+set timed_out 0
+set sleep_time 300
+set timeout [expr $max_job_delay + $sleep_time]
+spawn $srun -t1 -p $part_name $bin_sleep $sleep_time
+expect {
+       -re "job exceeded timelimit" {
+               set timed_out 1
+               exp_continue
+       }
+       -re "Terminated" {
+               set timed_out 1
+               exp_continue
+       }
+       timeout {
+               send_user "\nFAILURE: srun not responding\n"
+               kill_srun
+               exp_continue
+       }
+       eof {
+               wait
+       }
+}
+if {$timed_out == 1} {
+       send_user "Early termination is expected, no worries.\n"
+} else {
+       send_user "\nFAILURE: partition time limit not enforced\n"
+       set exit_code 1
+}
+
 #
 # Now reset AllowGroups to ALL
 #
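
The heart of this patch is a data-structure change: struct node_record's single
partition_ptr back-pointer becomes a counted array (part_cnt plus part_pptr)
that is grown with xrealloc() each time a partition claims the node and
compacted by shifting the tail down when a partition releases it (see
_build_part_bitmap(), _unlink_free_nodes(), and _list_delete_part() above).
The following standalone C sketch illustrates only that bookkeeping; the
struct tags echo slurmctld.h, but the simplified fields, the plain realloc()
standing in for SLURM's xrealloc(), and the main() driver are assumptions made
for this example, not SLURM code.

#include <stdio.h>
#include <stdlib.h>

struct part_record {
        char name[16];
};

struct node_record {
        char name[16];
        int part_cnt;                   /* number of associated partitions */
        struct part_record **part_pptr; /* grow-on-demand pointer array */
};

/* Append a partition pointer, as _build_part_bitmap() does via xrealloc() */
static void node_add_part(struct node_record *node, struct part_record *part)
{
        node->part_cnt++;
        node->part_pptr = realloc(node->part_pptr,
                node->part_cnt * sizeof(struct part_record *));
        node->part_pptr[node->part_cnt - 1] = part;
}

/* Remove one partition pointer by shifting the tail down, mirroring
 * _unlink_free_nodes() and _list_delete_part() */
static void node_del_part(struct node_record *node, struct part_record *part)
{
        int j, k;
        for (j = 0; j < node->part_cnt; j++) {
                if (node->part_pptr[j] != part)
                        continue;
                node->part_cnt--;
                for (k = j; k < node->part_cnt; k++)
                        node->part_pptr[k] = node->part_pptr[k + 1];
                break;
        }
}

int main(void)
{
        struct part_record batch = { "batch" }, longp = { "long" };
        struct node_record node = { "dev9", 0, NULL };
        int i;

        node_add_part(&node, &batch);   /* dev9 now in two partitions */
        node_add_part(&node, &longp);
        node_del_part(&node, &batch);   /* drop one; "long" shifts down */

        for (i = 0; i < node.part_cnt; i++)
                printf("%s is in partition %s\n", node.name,
                       node.part_pptr[i]->name);
        free(node.part_pptr);
        return 0;
}

Compacting on removal keeps part_pptr dense, so consumers can simply iterate
indices 0 .. part_cnt-1, as backfill.c's _get_part_specs() now does.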
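
Two readers of that array in the patch are worth pulling out. _node_is_hidden()
in node_mgr.c hides a node only when it belongs to at least one partition and
every such partition is hidden, while sched_get_node_partition() in
sched_upcalls.c can still surface only a single name and therefore falls back
to part_pptr[0] or NULL, as its NOTE comment concedes. Below is a minimal
sketch of both decision rules; the types are simplified stand-ins invented for
this example, not SLURM's own declarations.

#include <stdbool.h>
#include <stdio.h>

struct part_record {
        const char *name;
        int hidden;             /* 1 = omit from unprivileged queries */
};

struct node_record {
        int part_cnt;
        struct part_record **part_pptr;
};

/* A node is hidden only if it has partitions and all of them are hidden,
 * matching the logic of _node_is_hidden() */
static bool node_is_hidden(const struct node_record *node)
{
        int i;
        for (i = 0; i < node->part_cnt; i++) {
                if (node->part_pptr[i]->hidden == 0)
                        return false;   /* visible via this partition */
        }
        /* no partitions at all also means "shown" */
        return node->part_cnt != 0;
}

/* The scheduler upcall reports one name: part_pptr[0], or NULL if none */
static const char *first_partition_name(const struct node_record *node)
{
        return node->part_cnt ? node->part_pptr[0]->name : NULL;
}

int main(void)
{
        struct part_record debug = { "debug", 1 }, batch = { "batch", 0 };
        struct part_record *parts[] = { &debug, &batch };
        struct node_record node = { 2, parts };

        printf("hidden=%d first=%s\n", node_is_hidden(&node),
               first_partition_name(&node));
        return 0;
}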