diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index 68b63570efcbc9c57f4942c29cd95e0551419662..eda3bdf783f514b22f026a4c56469ca0c0ec7be4 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -1107,9 +1107,11 @@ not possible to also explicitly specify allowed accounts. .TP \fICoreCnt\fP=<num> -Identify number of cores to be reserved. This should only be used for -reservations that are less than one node in size. Otherwise use the -\fINodeCnt\fP option described below. +This option is only supported when SelectType=select/cons_res. Identify the +number of cores to be reserved. If NodeCnt is used, this is the total number of +cores to reserve, where the number of cores per node is CoreCnt/NodeCnt. If a +nodelist is used, this should be one core count per node in the list: Nodes=node[1-5] CoreCnt=2,2,3,3,4 + .TP \fILicenses\fP=<license> diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 7795fc833f4597bfd8540fb4c1a5a8635c2fdcf1..1d192d624dbfb3c236ce05a4d7c41ccc3dc1018d 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -1790,7 +1790,7 @@ typedef struct resv_desc_msg { uint16_t flags; /* see RESERVE_FLAG_* above */ char *licenses; /* names of licenses to be reserved */ char *name; /* name of reservation (optional on create) */ - uint32_t core_cnt; /* Count of cores required */ + uint32_t *core_cnt; /* Count of cores required */ uint32_t *node_cnt; /* Count of nodes required. Specify set of job * sizes with trailing zero to optimize layout * for those jobs just specify their total size diff --git a/src/common/node_select.c b/src/common/node_select.c index c6cdf4d0dda673142fb542cf43c0bb7b45617317..01beca48821afe20d8d3ec330319796d6bcc5f9b 100644 --- a/src/common/node_select.c +++ b/src/common/node_select.c @@ -1244,7 +1244,7 @@ extern int select_g_reconfigure (void) * RET - nodes selected for use by the reservation */ extern bitstr_t * select_g_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, - uint32_t core_cnt, bitstr_t **core_bitmap) + uint32_t *core_cnt, bitstr_t **core_bitmap) { if (slurm_select_init(0) < 0) return NULL; diff --git a/src/common/node_select.h b/src/common/node_select.h index 608410b3dde861317c1176607447a746481ead2b..ef8990b09e1890350f8094bfea12244f28ea52fb 100644 --- a/src/common/node_select.h +++ b/src/common/node_select.h @@ -675,7 +675,7 @@ extern int select_g_step_finish(struct step_record *step_ptr); * RET - nodes selected for use by the reservation */ extern bitstr_t * select_g_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, - uint32_t core_cnt, + uint32_t *core_cnt, bitstr_t **core_bitmap); /*****************************\ diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index bdb88803b35b246e29ebc431f19f387cc8db7091..ad697fac1f914de0f15523a7f349f799e989c610 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -3215,7 +3215,15 @@ _pack_update_resv_msg(resv_desc_msg_t * msg, Buf buffer, } else array_len = 0; pack32_array(msg->node_cnt, array_len, buffer); - pack32(msg->core_cnt, buffer); + if (msg->core_cnt) { + for (array_len = 0; msg->core_cnt[array_len]; + array_len++) { + /* determine array length */ + } + array_len++; /* Include trailing zero */ + } else + array_len = 0; + pack32_array(msg->core_cnt, array_len, buffer); packstr(msg->node_list, buffer); packstr(msg->features, buffer); packstr(msg->licenses, buffer); @@ -3297,7 +3305,15 @@ _unpack_update_resv_msg(resv_desc_msg_t ** msg, Buf buffer, /* This avoids a pointer to a zero length buffer */ xfree(tmp_ptr->node_cnt); } -
safe_unpack32(&tmp_ptr->core_cnt, buffer); + safe_unpack32_array(&tmp_ptr->core_cnt, &uint32_tmp, buffer); + if (uint32_tmp > 0) { + /* Must be zero terminated */ + if (tmp_ptr->core_cnt[uint32_tmp-1] != 0) + goto unpack_error; + } else { + /* This avoids a pointer to a zero length buffer */ + xfree(tmp_ptr->core_cnt); + } safe_unpackstr_xmalloc(&tmp_ptr->node_list, &uint32_tmp, buffer); safe_unpackstr_xmalloc(&tmp_ptr->features, diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index 18ace3d46c2a23c9942ecf85e6df44d885770e65..abdc27c015b84168dba11ee408ded86c2e3f74f0 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -2493,25 +2493,52 @@ bitstr_t *_make_core_bitmap_filtered(bitstr_t *node_map, int filter) return core_map; } -/* Once here, if core_cnt=0, avail_bitmap has nodes not used by any job or +/* Once here, if core_cnt is NULL, avail_bitmap has nodes not used by any job or * reservation */ bitstr_t *sequential_pick(bitstr_t *avail_bitmap, uint32_t node_cnt, - uint32_t core_cnt, bitstr_t **core_bitmap) + uint32_t *core_cnt, bitstr_t **core_bitmap) { bitstr_t *sp_avail_bitmap; char str[300]; - /* Just allowing symetric requests today */ - uint32_t cores_per_node = core_cnt / MAX(node_cnt, 1); + uint32_t cores_per_node = 0; bitstr_t *tmpcore; + int total_core_cnt = 0; + + /* We have these cases here: + * 1) Reservation request using only a number of nodes + * - core_cnt is NULL + * 2) Reservation request using a number of nodes + a number of cores + * 3) Reservation request using a node list + * - node_cnt is 0 + * - core_cnt is NULL + * 4) Reservation request using a node list + a per-node core count list + * - node_cnt is 0 + */ + + if ((node_cnt) && (core_cnt)) { + total_core_cnt = core_cnt[0]; + cores_per_node = core_cnt[0] / MAX(node_cnt, 1); + debug2("reserving %u cores per node in %d nodes", + cores_per_node, node_cnt); + } + if ((!node_cnt) && (core_cnt)) { + int num_nodes = bit_set_count(avail_bitmap); + int i; + bit_fmt(str, (sizeof(str) - 1), avail_bitmap); + debug2("Reserving cores from nodes: %s", str); + for (i=0; i < num_nodes; i++) + total_core_cnt += core_cnt[i]; + } - debug2("reserving %u cores per node in %d nodes", - cores_per_node, node_cnt); + debug2("Reservation requires %d cores", total_core_cnt); sp_avail_bitmap = bit_alloc(bit_size(avail_bitmap)); bit_fmt(str, (sizeof(str) - 1), avail_bitmap); bit_fmt(str, (sizeof(str) - 1), sp_avail_bitmap); if (core_cnt) { /* Reservation is using partial nodes */ + int node_list_inx = 0; + debug2("Reservation is using partial nodes"); /* if not NULL = Cores used by other core based reservations @@ -2527,12 +2554,15 @@ bitstr_t *sequential_pick(bitstr_t *avail_bitmap, uint32_t node_cnt, debug2("tmpcore contains just current free cores: %s", str); bit_and(*core_bitmap, tmpcore); /* clear core_bitmap */ - while (core_cnt) { + while (total_core_cnt) { int inx, coff, coff2; int i; int cores_in_node; int local_cores; + if (node_cnt == 0) + cores_per_node = core_cnt[node_list_inx]; + inx = bit_ffs(avail_bitmap); if (inx < 0) { info("reservation request can not be satisfied"); @@ -2543,8 +2573,6 @@ bitstr_t *sequential_pick(bitstr_t *avail_bitmap, uint32_t node_cnt, debug2("Using node %d", inx); coff = cr_get_coremap_offset(inx); - /* TODO: is next right for the last possible node at - * avail_bitmap?
*/ coff2 = cr_get_coremap_offset(inx + 1); local_cores = coff2 - coff; @@ -2564,14 +2592,17 @@ bitstr_t *sequential_pick(bitstr_t *avail_bitmap, uint32_t node_cnt, if (cores_in_node < cores_per_node) continue; + debug2("Using node %d (avail: %d, needed: %d)", + inx, cores_in_node, cores_per_node); + cores_in_node = 0; for (i = 0; i < local_cores; i++) { if (bit_test(tmpcore, coff + i)) { bit_set(*core_bitmap, coff + i); - core_cnt--; + total_core_cnt--; cores_in_node++; if ((cores_in_node == cores_per_node) || - (core_cnt == 0)) + (total_core_cnt == 0)) break; } } @@ -2584,6 +2615,7 @@ bitstr_t *sequential_pick(bitstr_t *avail_bitmap, uint32_t node_cnt, } else { debug2("Reservation NOT using node %d", inx); } + node_list_inx++; } FREE_NULL_BITMAP(tmpcore); @@ -2591,7 +2623,7 @@ bitstr_t *sequential_pick(bitstr_t *avail_bitmap, uint32_t node_cnt, bit_fmt(str, (sizeof(str) - 1), *core_bitmap); info("sequential pick using coremap: %s", str); - if (core_cnt) { + if (total_core_cnt) { info("reservation request can not be satisfied"); FREE_NULL_BITMAP(sp_avail_bitmap); return NULL; @@ -2661,7 +2693,7 @@ static int _get_avail_core_in_node(bitstr_t *core_bitmap, int node) * RET - nodes selected for use by the reservation */ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, - uint32_t core_cnt, bitstr_t **core_bitmap) + uint32_t *core_cnt, bitstr_t **core_bitmap) { bitstr_t **switches_bitmap; /* nodes on this switch */ bitstr_t **switches_core_bitmap; /* cores on this switch */ @@ -2681,7 +2713,8 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, xassert(avail_bitmap); - if (!switch_record_cnt || !switch_record_table) { + /* When the reservation includes a nodelist, use the sequential_pick code */ + if (!switch_record_cnt || !switch_record_table || !node_cnt) { return sequential_pick(avail_bitmap, node_cnt, core_cnt, core_bitmap); } @@ -2694,10 +2727,11 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, *core_bitmap = _make_core_bitmap_filtered(avail_bitmap, 0); rem_nodes = node_cnt; - rem_cores = core_cnt; + rem_cores = core_cnt ? core_cnt[0] : 0; - /* TODO: allowing asymmetric cluster */ - cores_per_node = core_cnt / MAX(node_cnt, 1); + /* Assuming a symmetric cluster */ + if (core_cnt) + cores_per_node = core_cnt[0] / MAX(node_cnt, 1); /* Construct a set of switch array entries, * use the same indexes as switch_record_table in slurmctld */ @@ -2766,7 +2800,7 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, best_fit_inx = -1; for (j=0; j<switch_record_cnt; j++) { if ((switches_node_cnt[j] < rem_nodes) || - (core_cnt && (switches_cpu_cnt[j] < core_cnt))) + (core_cnt && (switches_cpu_cnt[j] < core_cnt[0]))) continue; if ((best_fit_inx == -1) || (switch_record_table[j].level < @@ -2800,8 +2834,12 @@ extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, for (j=0; j<switch_record_cnt; j++) { if (switches_node_cnt[j] == 0) continue; - sufficient = (switches_node_cnt[j] >= rem_nodes) && - (switches_cpu_cnt[j] >= core_cnt); + if (core_cnt) + sufficient = + (switches_node_cnt[j] >= rem_nodes) && + (switches_cpu_cnt[j] >= core_cnt[0]); + else + sufficient = switches_node_cnt[j] >= rem_nodes; /* If first possibility OR */ /* first set large enough for request OR */ /* tightest fit (less resource waste) OR */ @@ -2886,9 +2924,9 @@ fini: for (i=0; i<switch_record_cnt; i++) { *core_bitmap = bit_alloc(bit_size(exc_core_bitmap)); } - cores_per_node = core_cnt / MAX(node_cnt, 1); +
cores_per_node = core_cnt[0] / MAX(node_cnt, 1); - while (core_cnt) { + while (core_cnt[0]) { uint32_t inx, coff; int i; int avail_cores_in_node; @@ -2897,8 +2935,8 @@ fini: for (i=0; i<switch_record_cnt; i++) { if ((inx < 0) || (inx > bit_size(avail_bitmap))) break; - debug2("Using node inx %d cores_per_node: %d " - "core_cnt: %d", inx, cores_per_node, core_cnt); + debug2("Using node inx %d cores_per_node %d " + "core_cnt %d", inx, cores_per_node, core_cnt[0]); coff = cr_get_coremap_offset(inx); /* Clear this node from the initial available bitmap */ @@ -2924,12 +2962,12 @@ fini: for (i=0; i<switch_record_cnt; i++) { for (i = 0; i < cr_node_num_cores[inx]; i++) { if (!bit_test(exc_core_bitmap, coff + i)) { bit_set(*core_bitmap, coff + i); - core_cnt--; + core_cnt[0]--; avail_cores_in_node++; } if ((avail_cores_in_node == cores_per_node) || - (core_cnt == 0)) + (core_cnt[0] == 0)) break; } @@ -2943,7 +2981,7 @@ fini: for (i=0; i<switch_record_cnt; i++) { //bit_fmt(str, (sizeof(str) - 1), *core_bitmap); //info("sequential pick using coremap: %s", str); - if (core_cnt) { + if (core_cnt[0]) { info("reservation request can not be satisfied"); FREE_NULL_BITMAP(sp_avail_bitmap); return NULL; diff --git a/src/plugins/select/cray/select_cray.c b/src/plugins/select/cray/select_cray.c index 9ef87b10d1b3b6f5eeb713a0948c0105c26f85cd..a97dd67ad275b7c63fc30e69f33efd30c728ee06 100644 --- a/src/plugins/select/cray/select_cray.c +++ b/src/plugins/select/cray/select_cray.c @@ -848,7 +848,7 @@ extern int select_p_reconfigure(void) } extern bitstr_t * select_p_resv_test(bitstr_t *avail_bitmap, uint32_t node_cnt, - uint32_t core_cnt, bitstr_t **core_bitmap) + uint32_t *core_cnt, bitstr_t **core_bitmap) { return other_resv_test(avail_bitmap, node_cnt, core_cnt, core_bitmap); } diff --git a/src/scontrol/create_res.c b/src/scontrol/create_res.c index 78007f517fe59b0a08f7e5aa8c563be8c28a7877..326eb794d688c35b5c232a77c997bb531fc986dd 100644 --- a/src/scontrol/create_res.c +++ b/src/scontrol/create_res.c @@ -295,8 +295,29 @@ scontrol_parse_res_options(int argc, char *argv[], const char *msg, strncasecmp(tag, "CoreCount", MAX(taglen,5)) == 0 || strncasecmp(tag, "CPUCnt", MAX(taglen,5)) == 0 || strncasecmp(tag, "CPUCount", MAX(taglen,5)) == 0) { - char *endptr = NULL; - resv_msg_ptr->core_cnt = strtol(val, &endptr, 10); + + char *endptr = NULL, *core_cnt, *tok, *ptrptr = NULL; + int node_inx = 0; + + core_cnt = xstrdup(val); + tok = strtok_r(core_cnt, ",", &ptrptr); + while (tok) { + xrealloc(resv_msg_ptr->core_cnt, + sizeof(uint32_t) * (node_inx + 2)); + resv_msg_ptr->core_cnt[node_inx] = + strtol(tok, &endptr, 10); + if ((endptr == NULL) || + (endptr[0] != '\0') || + (tok[0] == '\0')) { + exit_code = 1; + error("Invalid core count %s. 
%s", argv[i], msg); + xfree(core_cnt); + return -1; + } + node_inx++; + tok = strtok_r(NULL, ",", &ptrptr); + } + xfree(core_cnt); } else if (strncasecmp(tag, "Nodes", MAX(taglen, 5)) == 0) { resv_msg_ptr->node_list = val; diff --git a/src/slurmctld/reservation.c b/src/slurmctld/reservation.c index 9ea669e2f38971c0edcc4003bd4f0315b67a81cf..78bf2f4655cc13e9777e28955137d990e1b9eac8 100644 --- a/src/slurmctld/reservation.c +++ b/src/slurmctld/reservation.c @@ -1406,6 +1406,8 @@ static bool _resv_overlap(time_t start_time, time_t end_time, continue; /* no specific nodes in reservation */ if (!bit_overlap(resv_ptr->node_bitmap, node_bitmap)) continue; /* no overlap */ + if (!resv_ptr->full_nodes) + continue; for (i=0; ((i<7) && (!rc)); i++) { /* look forward one week */ s_time1 = start_time; @@ -1579,9 +1581,6 @@ extern int create_resv(resv_desc_msg_t *resv_desc_ptr) } } - if (resv_desc_ptr->core_cnt == NO_VAL) - resv_desc_ptr->core_cnt = 0; - #ifdef HAVE_BG if (!cnodes_per_bp) { select_g_alter_node_cnt(SELECT_GET_NODE_SCALING, @@ -1648,18 +1647,37 @@ extern int create_resv(resv_desc_msg_t *resv_desc_ptr) } total_node_cnt = bit_set_count(node_bitmap); if (!(resv_desc_ptr->flags & RESERVE_FLAG_IGN_JOBS) && + !resv_desc_ptr->core_cnt && _job_overlap(resv_desc_ptr->start_time, resv_desc_ptr->flags, node_bitmap)) { info("Reservation request overlaps jobs"); rc = ESLURM_NODES_BUSY; goto bad_parse; } - /* We do no allow to request cores with nodelist */ - info("Reservation CoreCnt cleared due to Nodes specification"); - resv_desc_ptr->core_cnt = 0; + /* We do allow cores to be requested with a nodelist */ + if (resv_desc_ptr->core_cnt) { + int nodecnt = bit_set_count(node_bitmap); + int nodeinx = 0; + while (nodeinx < nodecnt) { + if (!resv_desc_ptr->core_cnt[nodeinx]) { + info("Core count for reservation nodelist is not consistent!"); + rc = ESLURM_INVALID_NODE_NAME; + goto bad_parse; + } + debug2("Requesting %d cores for node_list index %d", + resv_desc_ptr->core_cnt[nodeinx], + nodeinx); + nodeinx++; + } + if ((rc = _select_nodes(resv_desc_ptr, &part_ptr, &node_bitmap, + &core_bitmap)) != SLURM_SUCCESS) { + goto bad_parse; + } + } } else if (((resv_desc_ptr->node_cnt == NULL) || (resv_desc_ptr->node_cnt[0] == 0)) && - (resv_desc_ptr->core_cnt == 0) && + (!resv_desc_ptr->core_cnt) && ((resv_desc_ptr->flags & RESERVE_FLAG_LIC_ONLY) == 0)) { info("Reservation request lacks node specification"); rc = ESLURM_INVALID_NODE_NAME; @@ -1731,14 +1749,14 @@ extern int create_resv(resv_desc_msg_t *resv_desc_ptr) resv_ptr->user_not = user_not; resv_desc_ptr->users = NULL; /* Nothing left to free */ - if (resv_desc_ptr->core_cnt == 0) { + if (!resv_desc_ptr->core_cnt) { debug2("reservation using full nodes"); _set_cpu_cnt(resv_ptr); resv_ptr->full_nodes = 1; } else { debug2("reservation using partial nodes: core count %u", - resv_desc_ptr->core_cnt); - resv_ptr->cpu_cnt = resv_desc_ptr->core_cnt; + bit_set_count(resv_ptr->core_bitmap)); + resv_ptr->cpu_cnt = bit_set_count(resv_ptr->core_bitmap); resv_ptr->full_nodes = 0; } @@ -2836,7 +2854,10 @@ static int _select_nodes(resv_desc_msg_t *resv_desc_ptr, } /* Start with all nodes in the partition */ - node_bitmap = bit_copy((*part_ptr)->node_bitmap); + if (*resv_bitmap) + node_bitmap = bit_copy(*resv_bitmap); + else + node_bitmap = bit_copy((*part_ptr)->node_bitmap); /* Don't use node already reserved */ if (!(resv_desc_ptr->flags & RESERVE_FLAG_OVERLAP)) { @@ -3135,7 +3156,7 @@ static bitstr_t *_pick_idle_node_cnt(bitstr_t *avail_bitmap, if (job_ptr->end_time <
resv_desc_ptr->start_time) continue; - if (resv_desc_ptr->core_cnt == 0) { + if (!resv_desc_ptr->core_cnt) { bit_not(job_ptr->node_bitmap); bit_and(avail_bitmap, job_ptr->node_bitmap); bit_not(job_ptr->node_bitmap); diff --git a/testsuite/expect/inc3.11.1 b/testsuite/expect/inc3.11.1 index b2698db8c73210aba1b6f40eb85f3b3e941fab48..f34e73bc92756118a17efe56bcfbbee699d6a21d 100644 --- a/testsuite/expect/inc3.11.1 +++ b/testsuite/expect/inc3.11.1 @@ -33,7 +33,7 @@ proc inc3_11_1 {} { global def_node user_name def_partition exit_code cluster_cpus res_name - global cons_res_actived + global cons_res_actived def_node_name set num_nodes [available_nodes $def_partition] @@ -60,6 +60,7 @@ proc inc3_11_1 {} { {StartTime=now Duration=5 Nodes=$def_node User=$user_name Flags=badtype,ignore_jobs} {StartTime=now+10minutes EndTime=now Nodes=$def_node User=$user_name Flags=ignore_jobs} {StartTime=now Duration=5 Nodes=$def_node User=$user_name Licenses=DUMMY_FOR_TESTING Flags=ignore_jobs} + {StartTime=now Duration=5 Nodes=$def_node_name\[1\-2\] CoreCnt=1 User=$user_name} " #{StartTime=now Duration=5 Nodes=$def_node Account=badaccountname} if {$cons_res_actived == 1} { diff --git a/testsuite/expect/inc3.11.9 b/testsuite/expect/inc3.11.9 new file mode 100644 index 0000000000000000000000000000000000000000..113abac4ea8575e86da38295baa10901b32ef3fb --- /dev/null +++ b/testsuite/expect/inc3.11.9 @@ -0,0 +1,560 @@ +############################################################################ +# Purpose: Test of SLURM functionality +# to be called from test3.11 +# Several cases for core based reservations using nodelists +# Pluging select/cons_res needed +# +############################################################################ +# Copyright (C) 2009 Lawrence Livermore National Security +# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). +# Written by Dave Bremer <dbremer@llnl.gov> +# CODE-OCEC-09-009. All rights reserved. +# +# +# This file is part of SLURM, a resource management program. +# For details, see <http://www.schedmd.com/slurmdocs/>. +# Please also read the included file: DISCLAIMER. +# +# SLURM is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free +# Software Foundation; either version 2 of the License, or (at your option) +# any later version. +# +# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License along +# with SLURM; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+############################################################################ + +proc inc3_11_9 {} { + global user_name exit_code res_name res_nodes res_nodecnt res_corecnt + global bin_rm file_in bin_sleep sbatch number scontrol + global alpha_numeric_under scancel + global cluster_cpus cores_per_node def_partition + global res_nodes res_thread_cnt node_count + global def_node_name + + send_user "\n+++++ STARTING TEST 9 +++++\n" + + # Make the job script + exec $bin_rm -f $file_in + make_bash_script $file_in "$bin_sleep 100" + + # Make a reservation, just to get node size infomation + set ret_code [create_res "StartTime=now Duration=1 NodeCnt=1 User=$user_name" 0] + if {$ret_code != 0} { + send_user "\n\033\[31mFAILURE: Unable to create a valid reservation\033\[m\n" + exit $ret_code + } + # Delete the reservation + set ret_code [delete_res $res_name] + if {$ret_code != 0} { + exit $ret_code + } + + set num_nodes [available_nodes $def_partition] + set core_res_num [ expr $cores_per_node / 2 ] + set thread_res_num [ expr $core_res_num * $res_thread_cnt ] + + # Submit a batch job using half the threads on the nodes + set sbatch_pid [spawn $sbatch --nodes=1-$num_nodes --time=30:00 --ntasks-per-node=$thread_res_num --output=/dev/null $file_in] + expect { + -re "Submitted batch job ($number)" { + set job_id $expect_out(1,string) + exp_continue + } + timeout { + send_user "\n\033\[31mFAILURE: sbatch not responding\033\[m\n" + slow_kill $sbatch_pid + set exit_code 1 + } + eof { + wait + } + } + if {$job_id == 0} { + send_user "\n\033\[31mFAILURE: batch submit failure\033\[m\n" + exit 1 + } + + if {[wait_for_job $job_id "RUNNING"] != 0} { + send_user "\nFAILURE: job failed to start\n" + cancel_job $job_id + exit 1 + } + send_user "\nJOB is running as expected\n" + + # Make a reservation using 1 core per node in first 5 nodes + + if {$num_nodes < 5} { + send_user "\nWARNING: test can not work with current nodes\n"; + cancel_job $job_id + exit 1 + + } + + set ret_code [create_res "StartTime=now Duration=60 Nodes=$def_node_name\[1\-5\] CoreCnt=1,1,1,1,1 User=$user_name" 0] + if {$ret_code != 0} { + send_user "\n\033\[31mFAILURE: Unable to create a valid reservation\033\[m\n" + exit $ret_code + } + + if {$res_nodecnt != 5} { + send_user "\n\033\[31mFAILURE: reservation created with $res_nodecnt nodes when 5 were requested\033\[m\n" + set exit_code 1 + exit $ret_code + } + + if {$res_corecnt != 5} { + send_user "\n\033\[31mFAILURE: reservation created with $res_corecnt cores when just 5 was requested\033\[m\n" + set exit_code 1 + exit $ret_code + } + + send_user "\n\033\[32mSUCCESS: reservation was created as expected\033\[m\n" + + # Delete the reservation + set ret_code [delete_res $res_name] + if {$ret_code != 0} { + cancel_job $job_id + exit $ret_code + } + + set core_res_num [expr $core_res_num + 1] + # Make the reservation using more cores then free in a node + set ret_code [create_res "StartTime=now Duration=60 Nodes=$def_node_name\[1\-5\] CoreCnt=1,1,1,1,$core_res_num User=$user_name" 1] + if {$ret_code != 0} { + send_user "\n\033\[32mSUCCESS: Reservation can not be created as expected\033\[m\n" + } else { + send_user "\n\033\[31mFAILURE: reservation was created when it should have not\033\[m\n" + set exit_code 1 + + # Delete the reservation + set ret_code [delete_res $res_name] + } + + # Make the reservation using more cores than free in a node (now) + # but those cores being free at reservation start time + set ret_code [create_res "StartTime=now+3600 Duration=60 Nodes=$def_node_name\[1\-5\] 
CoreCnt=1,1,1,1,$core_res_num User=$user_name" 0] + if {$ret_code != 0} { + send_user "\n\033\[31mFAILURE: Reservation can not be created when it should\033\[m\n" + set exit_code 1 + } else { + send_user "\n\033\[32mSUCCESS: Reservation can be created as expected\033\[m\n" + # Delete the reservation + set ret_code [delete_res $res_name] + } + + # Make the reservation using more cores than free at reservation start time + set ret_code [create_res "StartTime=now+300 Duration=60 Nodes=$def_node_name\[1\-5\] CoreCnt=1,1,1,1,$core_res_num User=$user_name" 1] + if {$ret_code != 0} { + send_user "\n\033\[32mSUCCESS: Reservation can not be created as expected\033\[m\n" + } else { + send_user "\n\033\[31mFAILURE: Reservation can be created when it should not\033\[m\n" + set exit_code 1 + + # Delete the reservation + set ret_code [delete_res $res_name] + } + + cancel_job $job_id + + + send_user "\n\nLet's check overlapping reservations\n\n"; + + set core_res_num [ expr $cores_per_node / 2 ] + set total_core_res [ expr $core_res_num * $node_count ] + # Make a reservation for all nodes using just half the processor in each node + set ret_code [create_res "StartTime=now Duration=60 Nodecnt=$node_count CoreCnt=$total_core_res User=$user_name" 1] + if {$ret_code != 0} { + send_user "\n\033\[31mFAILURE: Unable to create a valid reservation\033\[m\n" + exit $ret_code + } + + send_user "\n\033\[32mSUCCESS: reservation was created as expected\033\[m\n" + + + if {$core_res_num < 2} { + send_user "\n\033\[32mWARNING: not enough cores for testing\033\[m\n" + set ret_code [delete_res $res_name] + } + + set res_name1 "$res_name" + + set total_core_res [ expr $core_res_num + 1 ] + # Now creating a reservation using first 5 nodes and more cores per node than available + set ret_code [create_res "StartTime=now Duration=60 Nodes=$def_node_name\[1\-5\] CoreCnt=1,1,1,1,$total_core_res User=$user_name" 0] + if {$ret_code == 0} { + send_user "\n\033\[31mFAILURE: reservation was created when it should not\033\[m\n" + set ret_code [delete_res $res_name1] + exit $ret_code + } + + send_user "\n\033\[32mSUCCESS: reservation was not created as expected\033\[m\n" + + # Now creating a reservation using first 5 nodes and just 1 core per node + set ret_code [create_res "StartTime=now Duration=60 Nodes=$def_node_name\[1\-5\] CoreCnt=1,1,1,1,1 User=$user_name" 1] + if {$ret_code != 0} { + send_user "\n\033\[31mFAILURE: Unable to create a valid reservation\033\[m\n" + set ret_code [delete_res $res_name1] + exit $ret_code + } + + send_user "\n\033\[32mSUCCESS: reservation was created as expected\033\[m\n" + + # Submit a batch job: a job using cores availbale in first 5 nodes + set core_res_num [ expr $cores_per_node / 2 ] + set core_res_num [ expr $core_res_num - 1 ] + set sbatch_pid [spawn $sbatch --ntasks-per-node=$core_res_num --nodelist=$def_node_name\[1\-5\] --output=/dev/null $file_in] + expect { + -re "Submitted batch job ($number)" { + set job_id $expect_out(1,string) + exp_continue + } + timeout { + send_user "\n\033\[31mFAILURE: sbatch not responding\033\[m\n" + slow_kill $sbatch_pid + set exit_code 1 + } + eof { + wait + } + } + if {$job_id == 0} { + send_user "\n\033\[31mFAILURE: batch submit failure\033\[m\n" + set ret_code [delete_res $res_name1] + set ret_code [delete_res $res_name] + exit 1 + } + + sleep 1 + # Show the job, make sure reservation tag is right + spawn $scontrol show job $job_id + expect { + -re "Invalid job id specified" { + send_user "\n\033\[31mFAILURE: Job $job_id not found\033\[m\n" + set exit_code 1 
+ exp_continue + } + -re "JobState=PENDING" { + send_user "\n\033\[31mFAILURE: Job $job_id is PENDING\033\[m\n" + set exit_code 1 + exp_continue + } + timeout { + send_user "\n\033\[31mFAILURE: scontrol not responding\033\[m\n" + set exit_code 1 + } + eof { + wait + } + } + + if { $exit_code == 1 } { + set ret_code [delete_res $res_name1] + set ret_code [delete_res $res_name] + exit 1 + } + + send_user "\n\033\[32mJOB is running as expected\033\[m\n" + + cancel_job $job_id + + # Submit a batch job: a job using more cores than availbale in first 5 nodes + set core_res_num [ expr $cores_per_node / 2 ] + set sbatch_pid [spawn $sbatch --ntasks-per-node=$core_res_num --nodelist=$def_node_name\[1\-5\] --output=/dev/null $file_in] + expect { + -re "Submitted batch job ($number)" { + set job_id $expect_out(1,string) + exp_continue + } + timeout { + send_user "\n\033\[31mFAILURE: sbatch not responding\033\[m\n" + slow_kill $sbatch_pid + set exit_code 1 + } + eof { + wait + } + } + if {$job_id == 0} { + send_user "\n\033\[31mFAILURE: batch submit failure\033\[m\n" + set ret_code [delete_res $res_name1] + set ret_code [delete_res $res_name] + exit 1 + } + + sleep 1 + # Show the job, make sure reservation tag is right + spawn $scontrol show job $job_id + expect { + -re "Invalid job id specified" { + send_user "\n\033\[31mFAILURE: Job $job_id not found\033\[m\n" + set exit_code 1 + exp_continue + } + -re "JobState=PENDING" { + send_user "\n\033\[32m Job is PENDING as expected\033\[m\n" + exp_continue + } + -re "JobState=RUNNING" { + send_user "\n\033\[31mFAILURE: Job $job_id is RUNNING\033\[m\n" + set exit_code 1 + exp_continue + } + timeout { + send_user "\n\033\[31mFAILURE: scontrol not responding\033\[m\n" + set exit_code 1 + } + eof { + wait + } + } + if { $exit_code == 1 } { + set ret_code [delete_res $res_name1] + set ret_code [delete_res $res_name] + exit 1 + } + + cancel_job $job_id + + # Submit a batch job: a job using cores reserved in first reservation + set core_res_num [ expr $cores_per_node / 2 ] + set sbatch_pid [spawn $sbatch --ntasks-per-node=$core_res_num --nodelist=$def_node_name\[1\-5\] --reservation=$res_name1 --output=/dev/null $file_in] + expect { + -re "Submitted batch job ($number)" { + set job_id $expect_out(1,string) + exp_continue + } + timeout { + send_user "\n\033\[31mFAILURE: sbatch not responding\033\[m\n" + slow_kill $sbatch_pid + set exit_code 1 + } + eof { + wait + } + } + if {$job_id == 0} { + send_user "\n\033\[31mFAILURE: batch submit failure\033\[m\n" + set ret_code [delete_res $res_name1] + set ret_code [delete_res $res_name] + exit 1 + } + + sleep 1 + # Show the job, make sure reservation tag is right + spawn $scontrol show job $job_id + expect { + -re "Invalid job id specified" { + send_user "\n\033\[31mFAILURE: Job $job_id not found\033\[m\n" + set exit_code 1 + exp_continue + } + -re "JobState=RUNNING" { + send_user "\n\033\[32m Job is RUNNING as expected\033\[m\n" + exp_continue + } + -re "JobState=PENDING" { + send_user "\n\033\[31mFAILURE: Job $job_id is PENDING\033\[m\n" + set exit_code 1 + exp_continue + } + timeout { + send_user "\n\033\[31mFAILURE: scontrol not responding\033\[m\n" + set exit_code 1 + } + eof { + wait + } + } + if { $exit_code == 1 } { + set ret_code [delete_res $res_name1] + set ret_code [delete_res $res_name] + exit 1 + } + + cancel_job $job_id + + # Submit a batch job: a job using more cores than reserved in first reservation + set core_res_num [ expr $cores_per_node / 2 ] + set core_res_num [ expr $core_res_num * 5 ] + set 
core_res_num [ expr $core_res_num + 1 ] + set sbatch_pid [spawn $sbatch --ntasks-per-node=$core_res_num --nodelist=$def_node_name\[1\-5\] --reservation=$res_name1 --output=/dev/null $file_in] + expect { + -re "Submitted batch job ($number)" { + set job_id $expect_out(1,string) + exp_continue + } + timeout { + send_user "\n\033\[31mFAILURE: sbatch not responding\033\[m\n" + slow_kill $sbatch_pid + set exit_code 1 + } + eof { + wait + } + } + if {$job_id == 0} { + send_user "\n\033\[31mFAILURE: batch submit failure\033\[m\n" + set ret_code [delete_res $res_name1] + set ret_code [delete_res $res_name] + exit 1 + } + + sleep 1 + # Show the job, make sure reservation tag is right + spawn $scontrol show job $job_id + expect { + -re "Invalid job id specified" { + send_user "\n\033\[31mFAILURE: Job $job_id not found\033\[m\n" + set exit_code 1 + exp_continue + } + -re "JobState=PENDING" { + send_user "\n\033\[32m Job is PENDING as expected\033\[m\n" + exp_continue + } + -re "JobState=RUNNING" { + send_user "\n\033\[31mFAILURE: Job $job_id is RUNNING\033\[m\n" + set exit_code 1 + exp_continue + } + timeout { + send_user "\n\033\[31mFAILURE: scontrol not responding\033\[m\n" + set exit_code 1 + } + eof { + wait + } + } + if { $exit_code == 1 } { + set ret_code [delete_res $res_name1] + set ret_code [delete_res $res_name] + exit 1 + } + + cancel_job $job_id + + # Submit a batch job: a job using cores reserved in second reservation + set sbatch_pid [spawn $sbatch --ntasks-per-node=1 --nodelist=$def_node_name\[1\-5\] --reservation=$res_name --output=/dev/null $file_in] + expect { + -re "Submitted batch job ($number)" { + set job_id $expect_out(1,string) + exp_continue + } + timeout { + send_user "\n\033\[31mFAILURE: sbatch not responding\033\[m\n" + slow_kill $sbatch_pid + set exit_code 1 + } + eof { + wait + } + } + if {$job_id == 0} { + send_user "\n\033\[31mFAILURE: batch submit failure\033\[m\n" + set ret_code [delete_res $res_name1] + set ret_code [delete_res $res_name] + exit 1 + } + + sleep 1 + # Show the job, make sure reservation tag is right + spawn $scontrol show job $job_id + expect { + -re "Invalid job id specified" { + send_user "\n\033\[31mFAILURE: Job $job_id not found\033\[m\n" + set exit_code 1 + exp_continue + } + -re "JobState=RUNNING" { + send_user "\n\033\[32m Job is RUNNING as expected\033\[m\n" + exp_continue + } + -re "JobState=PENDING" { + send_user "\n\033\[31mFAILURE: Job $job_id is PENDING\033\[m\n" + set exit_code 1 + exp_continue + } + timeout { + send_user "\n\033\[31mFAILURE: scontrol not responding\033\[m\n" + set exit_code 1 + } + eof { + wait + } + } + if { $exit_code == 1 } { + set ret_code [delete_res $res_name1] + set ret_code [delete_res $res_name] + exit 1 + } + + cancel_job $job_id + + # Submit a batch job: a job using more cores than reserved in second reservation + set sbatch_pid [spawn $sbatch --ntasks-per-node=2 --nodelist=$def_node_name\[1\-5\] --reservation=$res_name --output=/dev/null $file_in] + expect { + -re "Submitted batch job ($number)" { + set job_id $expect_out(1,string) + exp_continue + } + timeout { + send_user "\n\033\[31mFAILURE: sbatch not responding\033\[m\n" + slow_kill $sbatch_pid + set exit_code 1 + } + eof { + wait + } + } + if {$job_id == 0} { + send_user "\n\033\[31mFAILURE: batch submit failure\033\[m\n" + set ret_code [delete_res $res_name1] + set ret_code [delete_res $res_name] + exit 1 + } + + sleep 1 + # Show the job, make sure reservation tag is right + spawn $scontrol show job $job_id + expect { + -re "Invalid job id specified" 
{ + send_user "\n\033\[31mFAILURE: Job $job_id not found\033\[m\n" + set exit_code 1 + exp_continue + } + -re "JobState=PENDING" { + send_user "\n\033\[32m Job is PENDING as expected\033\[m\n" + exp_continue + } + -re "JobState=RUNNING" { + send_user "\n\033\[31mFAILURE: Job $job_id is RUNNING\033\[m\n" + set exit_code 1 + exp_continue + } + timeout { + send_user "\n\033\[31mFAILURE: scontrol not responding\033\[m\n" + set exit_code 1 + } + eof { + wait + } + } + if { $exit_code == 1 } { + set ret_code [delete_res $res_name1] + set ret_code [delete_res $res_name] + exit 1 + } + + cancel_job $job_id + + set ret_code [delete_res $res_name1] + set ret_code [delete_res $res_name] + +} diff --git a/testsuite/expect/test3.11 b/testsuite/expect/test3.11 index 824c93675bf939566c31025434e2288d850d2713..354d0b232fc5df9ec540aa46a1b09a8b63e23cd3 100755 --- a/testsuite/expect/test3.11 +++ b/testsuite/expect/test3.11 @@ -40,6 +40,7 @@ source ./inc3.11.5 source ./inc3.11.6 source ./inc3.11.7 source ./inc3.11.8 +source ./inc3.11.9 @@ -53,6 +54,7 @@ set res_thread_cnt 0 set user_name "" set def_partition "" set def_node "" +set def_node_name "" set ii 0 print_header $test_id @@ -276,6 +278,16 @@ spawn $sinfo -h -o "=%N=" -p $def_partition expect { -re "=(.+)=" { set def_node $expect_out(1,string) + send_user "\nDefault node $def_node\n"; + } + eof { + wait + } +} +spawn $sinfo -h -o "=%N=" -p $def_partition +expect { + -re "=($alpha_numeric)(\[)($alpha_numeric)-($alpha_numeric)(\])=" { + set def_node_name $expect_out(1,string) exp_continue } eof { @@ -355,6 +367,7 @@ inc3_11_6 if {$cons_res_actived == 1} { inc3_11_7 inc3_11_8 + inc3_11_9 } if {$exit_code == 0} {
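
For illustration only (not part of the patch), the reservation syntax documented in the scontrol.1 change above could be exercised as follows; the reservation names, node names, and user are hypothetical, and SelectType=select/cons_res is assumed per the man page note:

# Total core count with a node count: 8 cores spread over 4 nodes (2 cores per node)
scontrol create reservation ReservationName=resv_cores1 StartTime=now Duration=60 NodeCnt=4 CoreCnt=8 User=alice

# Per-node core counts with an explicit nodelist, as in the man page example
scontrol create reservation ReservationName=resv_cores2 StartTime=now Duration=60 Nodes=node[1-5] CoreCnt=2,2,3,3,4 User=alice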