diff --git a/NEWS b/NEWS index 32bbaaad529df77a5738bc75f2e2d3baa216514a..02dd4418424a80739ce12dbb66c3df2b1491ba27 100644 --- a/NEWS +++ b/NEWS @@ -9,11 +9,13 @@ documents those changes that are of interest to users and admins. -- BLUEGENE - Added state save on slurmctld shutdown of blocks in an error state on real systems and total block config on emulation systems. -- Major update to Slurm's PMI internal logic for better scalability. - Communications now supported directly between appliciation tasks via + Communications now supported directly between application tasks via Slurm's PMI library. Srun sends single message to one task on each node - and that forwards key-pairs to other tasks on that nodes. Old code send - key-pairs directly to each task. + and that task forwards key-pairs to other tasks on that node. The old + code sent key-pairs directly to each task. NOTE: PMI applications must re-link with this new library. + -- For multi-core support: Fix task distribution bug and add automated + tests, patch.1.2.0-pre11.070111.plane from Dan Palermo (HP). * Changes in SLURM 1.2.0-pre11 ============================== diff --git a/src/common/slurm_step_layout.c b/src/common/slurm_step_layout.c index da90dcf53720c0fb1074b32f87277c607f10e11a..1818bdafe6b1da9a253c54873ef822c00e77d05b 100644 --- a/src/common/slurm_step_layout.c +++ b/src/common/slurm_step_layout.c @@ -632,114 +632,73 @@ static int _task_layout_cyclic(slurm_step_layout_t *step_layout, /* - * Compute the number of tasks per node for the plane - * distribution given a plane size of "plane_size". + * The plane distribution results in a block cyclic of block size + * "plane_size". * The plane distribution does not do any workload balancing and * just use the user specified blocksize: "plane_size". * This distribution does not take the hardware (number of CPUs * per node) into account when computing the number of tasks per * hosts. 
- * - */ -static uint32_t * -_task_count_layout_plane(const uint32_t node_cnt, - const uint32_t num_tasks, - const uint16_t plane_size) -{ - int i, left = 0; - uint32_t *ntask = NULL; - int tasks_all; - int planes_per_host; - - ntask = (uint32_t *) xmalloc(sizeof(uint32_t *) * node_cnt); - - tasks_all = node_cnt*plane_size; - planes_per_host = num_tasks/tasks_all; - - for(i = 0; i <node_cnt; i++) { - ntask[i] = planes_per_host*plane_size; - } - - left = num_tasks - (node_cnt*(planes_per_host*plane_size)); - if(left > 0) { - for (i = 0; i < node_cnt; i++) { - if (left <= 0) - continue; - if (left < plane_size) { - ntask[i] += left; - info("I ntask[%d]: %d", i, ntask[i]); - } else { - ntask[i] += plane_size; - info("II ntask[%d]: %d", i, ntask[i]); - } - left -= plane_size; - } - } - - return ntask; -} - -/* - * The plane distribution results in a block cyclic of block size " - * plane_size". - * The plane distribution does not do any workload balancing and - * just use the user specified blocksize: "plane_size". - * This distribution does not take the hardware (number of CPUs - * per node) into account when computing the number of tasks per - * hosts. - * + * For example: + * plane_size = 2 + * node Node0 Node1 + * -- -- -- -- + * task distribution: 0 1 2 3 + * 4 5 6 7 + * 8 9 10 11 + * 12 13 14 15 etc. 
*/ static int _task_layout_plane(slurm_step_layout_t *step_layout, uint32_t *cpus) { - int i, j, taskid = 0; - uint32_t *temp_tasks = NULL; + int i, j, k, taskid = 0; - debug3("_task_layout_plane plane_size %u", step_layout->plane_size); + debug3("_task_layout_plane plane_size %u node_cnt %u task_cnt %u", + step_layout->plane_size, + step_layout->node_cnt, step_layout->task_cnt); - if(step_layout->plane_size <= 0) + if (step_layout->plane_size <= 0) return SLURM_ERROR; - for (i=0; i<step_layout->node_cnt; i++) { - step_layout->tids[i] = xmalloc(sizeof(int) * step_layout->task_cnt); - } - - temp_tasks = _task_count_layout_plane(step_layout->node_cnt, - step_layout->task_cnt, - step_layout->plane_size); - - for(i=0; i < step_layout->node_cnt; i++) - step_layout->tasks[i]= temp_tasks[i]; - xfree(temp_tasks); + if (step_layout->tasks == NULL) + return SLURM_ERROR; - info("node_cnt %u task_cnt %u plane_size %u", - step_layout->node_cnt, step_layout->task_cnt, - step_layout->plane_size); - for(i = 0; i <step_layout->node_cnt; i++) { - info("tasks[%d]: %u", i, step_layout->tasks[i]); + for (i=0; i<step_layout->node_cnt; i++) { + step_layout->tids[i] = xmalloc(sizeof(uint32_t) + * step_layout->task_cnt); } - if(step_layout->tasks == NULL) - return SLURM_ERROR; - taskid = 0; - for(i = 0; i <step_layout->node_cnt; i++) { - for(j = 0; j < step_layout->tasks[i]; j++) { - step_layout->tids[i][j] = taskid++; + for (j=0; taskid<step_layout->task_cnt; j++) { /* cycle counter */ + for (i=0; ((i<step_layout->node_cnt) + && (taskid<step_layout->task_cnt)); i++) { + /* assign a block of 'plane_size' tasks to this node */ + for (k=0; ((k<step_layout->plane_size) + && (taskid<step_layout->task_cnt)); k++) { + step_layout->tids[i][step_layout->tasks[i]] = + taskid; + taskid++; + step_layout->tasks[i]++; + } } } - if(taskid != step_layout->task_cnt) { + + if (taskid != step_layout->task_cnt) { error("_task_layout_plane: Mismatch in task count (%d != %d) ", taskid, step_layout->task_cnt); 
return SLURM_ERROR; } + + for (i=0; i < step_layout->node_cnt; i++) { + info("tasks[%d]: %u", i, step_layout->tasks[i]); + } #if(0) - /* debugging only remove */ + /* debugging only */ for (i=0; i < step_layout->node_cnt; i++) { info ("Host %d _plane_ # of tasks %u", i, step_layout->tasks[i]); for (j=0; j<step_layout->tasks[i]; j++) { - info ("Host %d _plane_ taskid %d task %u", + info ("Host %d _plane_ localid %d taskid %u", i, j, step_layout->tids[i][j]); } } diff --git a/testsuite/expect/Makefile.am b/testsuite/expect/Makefile.am index d1c6fa6757d5a3249e37b4676a766476155b148f..dc86c61448dae03cd0d92a55f97b12930411887a 100644 --- a/testsuite/expect/Makefile.am +++ b/testsuite/expect/Makefile.am @@ -88,6 +88,10 @@ EXTRA_DIST = \ test1.89.prog.c \ test1.90 \ test1.90.prog.c \ + test1.91 \ + test1.91.prog.c \ + test1.92 \ + test1.92.bash \ test2.1 \ test2.2 \ test2.3 \ diff --git a/testsuite/expect/Makefile.in b/testsuite/expect/Makefile.in index 5816e2dac8a4e668a4e0f50dfb9c26a7c8329cb0..9f1b541e03ee4f5979a6d8ab74fa32c80ef642d5 100644 --- a/testsuite/expect/Makefile.in +++ b/testsuite/expect/Makefile.in @@ -332,6 +332,10 @@ EXTRA_DIST = \ test1.89.prog.c \ test1.90 \ test1.90.prog.c \ + test1.91 \ + test1.91.prog.c \ + test1.92 \ + test1.92.bash \ test2.1 \ test2.2 \ test2.3 \ diff --git a/testsuite/expect/README b/testsuite/expect/README index 3b5e9de072b18338050ce867ec9a8b6cf944906d..78d69147cf616ae50cb24ed4fc373f0d3153191f 100644 --- a/testsuite/expect/README +++ b/testsuite/expect/README @@ -159,6 +159,8 @@ test1.87 Confirm node selection from within a job step on existing allocation test1.88 Basic MPI functionality tests via srun. test1.89 Test of CPU affinity support. test1.90 Test of memory affinity support for NUMA systems. +test1.91 Test of CPU affinity for multi-core systems. +test1.92 Test of task distribution support on multi-core systems. 
**NOTE** The above tests for mutliple processor/partition systems only test2.# Testing of scontrol options (to be run as unprivileged user). diff --git a/testsuite/expect/test1.91 b/testsuite/expect/test1.91 new file mode 100644 index 0000000000000000000000000000000000000000..5b84355bd0e25acbf3faf964fb2318a15bc7241a --- /dev/null +++ b/testsuite/expect/test1.91 @@ -0,0 +1,416 @@ +#!/usr/bin/expect +############################################################################ +# Purpose: Test of SLURM functionality +# Test of CPU affinity support for multi-core systems. +# +# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR +# "WARNING: ..." with an explanation of why the test can't be made, OR +# "FAILURE: ..." otherwise with an explanation of the failure, OR +# anything else indicates a failure mode that must be investigated. +############################################################################ +# Copyright (C) 2005 The Regents of the University of California. +# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). +# Written by Morris Jette <jette1@llnl.gov> +# UCRL-CODE-226842. +# +# This file is part of SLURM, a resource management program. +# For details, see <http://www.llnl.gov/linux/slurm/>. +# +# SLURM is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free +# Software Foundation; either version 2 of the License, or (at your option) +# any later version. +# +# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License along +# with SLURM; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+############################################################################ +source ./globals + +set test_id "1.91" +set exit_code 0 +set file_prog "test$test_id.prog" + +print_header $test_id + +# +# Test if task affinity support is supported. +# +set affinity 0 +log_user 0 +spawn $scontrol show config +expect { + -re "task/affinity" { + set affinity 1 + exp_continue + } + eof { + wait + } +} +log_user 1 +if {$affinity == 0} { + send_user "\nWARNING: task affinity not supported on this system\n" + exit 0 +} +send_user "\ntask affinity plugin installed\n" + +set num_sockets 0 +set num_cores 0 +set num_threads 0 +log_user 0 +spawn $scontrol show node +expect { + -re "Sockets=($number)" { + set num_sockets $expect_out(1,string) + exp_continue + } + -re "Cores=($number)" { + set num_cores $expect_out(1,string) + exp_continue + } + -re "Threads=($number)" { + set num_threads $expect_out(1,string) + exp_continue + } + eof { + wait + } +} +log_user 1 +if {$num_sockets == 0 || $num_cores == 0 || $num_threads == 0} { + send_user "\nWARNING: Could not determine number of Sockets:Cores:Threads (saw $num_sockets:$num_cores:$num_threads)\n" + exit 0 +} +send_user "Node config: Sockets=$num_sockets Cores=$num_cores Threads=$num_threads\n\n" + +# +# Build a test program to report affinity by task +# +exec $bin_rm -f $file_prog +exec $bin_make -f /dev/null $file_prog +exec $bin_chmod 700 $file_prog + +# +# Create an allocation +# +global env +set env(SLURM_CPU_BIND) "verbose" +set srun_pid [spawn $srun --allocate -N1 --verbose -t2] + +############################################################################# +# +# Run a job step to get allocated processor count and affinity +# +expect -re $prompt +set mask 0 +set task_cnt 0 +send "$srun -c1 $file_prog\n" +expect { + -re "TASK_ID:($number),MASK:($number)" { + incr task_cnt + set mask $expect_out(2,string) + exp_continue + } + -re "error" { + send_user "\nFAILURE: some error occurred\n" + set exit_code 1 + exp_continue + } + 
timeout { + send_user "\nFAILURE: srun (from --allocate) not responding " + send_user "or failure to recognize prompt\n" + slow_kill $srun_pid + exit 1 + } + -re $prompt +} + +############################################################################# +# +# Run a job step with affinity to verify unique masks with min -B 1:1:1 +# +set expected_mask [ expr ((1 << $task_cnt) - 1) ] +set task_mask 0 +send "$srun -c1 -n $task_cnt -B 1:1:1 $file_prog\n" +expect { + -re "TASK_ID:($number),MASK:($number)" { + incr task_mask $expect_out(2,string) + exp_continue + } + -re "error" { + send_user "\nFAILURE: some error occurred\n" + set exit_code 1 + exp_continue + } + timeout { + send_user "\nFAILURE: srun (from --allocate) not responding " + send_user "or failure to recognize prompt\n" + set exit_code 1 + } + -re $prompt +} +if {$task_mask != $expected_mask} { + send_user "\nFAILURE: affinity mask inconsistency ($task_mask,$expected_mask)\n" + set exit_code 1 +} + +############################################################################# +# +# Run varying number of sockets, verify task count and number of set bits +# +set this_cnt 1 +while {$this_cnt <= $num_sockets} { + set expected_tasks [ expr $this_cnt * $num_cores * $num_threads ] + set num_tasks 0 + set num_bits 0 + set task_mask 0 + send "$srun -B $this_cnt-$this_cnt:$num_cores:$num_threads $file_prog\n" + expect { + -re "TASK_ID:($number),MASK:($number)" { + incr task_mask $expect_out(2,string) + incr num_tasks 1 + # count number of set bits + set this_mask $expect_out(2,string) + while {$this_mask > 0} { + if {$this_mask & 1} { + incr num_bits 1 + } + set this_mask [ expr $this_mask >> 1 ] + } + exp_continue + } + -re "error" { + send_user "\nFAILURE: some error occurred\n" + set exit_code 1 + exp_continue + } + timeout { + send_user "\nFAILURE: srun (from --allocate) not responding " + send_user "or failure to recognize prompt\n" + set exit_code 1 + } + -re $prompt + } + + if {$num_tasks != $expected_tasks} { 
+ send_user "\nFAILURE: number of tasks inconsistent ($num_tasks,$expected_tasks)\n" + set exit_code 1 + } + if {$num_bits != $expected_tasks} { + send_user "\nFAILURE: number of set bits inconsistent ($num_bits,$expected_tasks)\n" + set exit_code 1 + } + incr this_cnt 1 +} + + +############################################################################# +# +# Run varying number of cores, verify task count and number of set bits +# +set this_cnt 1 +while {$this_cnt <= $num_cores} { + set expected_tasks [ expr $num_sockets * $this_cnt * $num_threads ] + set num_tasks 0 + set num_bits 0 + set task_mask 0 + send "$srun -B $num_sockets:$this_cnt-$this_cnt:$num_threads $file_prog\n" + expect { + -re "TASK_ID:($number),MASK:($number)" { + incr task_mask $expect_out(2,string) + incr num_tasks 1 + # count number of set bits + set this_mask $expect_out(2,string) + while {$this_mask > 0} { + if {$this_mask & 1} { + incr num_bits 1 + } + set this_mask [ expr $this_mask >> 1 ] + } + exp_continue + } + -re "error" { + send_user "\nFAILURE: some error occurred\n" + set exit_code 1 + exp_continue + } + timeout { + send_user "\nFAILURE: srun (from --allocate) not responding " + send_user "or failure to recognize prompt\n" + set exit_code 1 + } + -re $prompt + } + + if {$num_tasks != $expected_tasks} { + send_user "\nFAILURE: number of tasks inconsistent ($num_tasks,$expected_tasks)\n" + set exit_code 1 + } + if {$num_bits != $expected_tasks} { + send_user "\nFAILURE: number of set bits inconsistent ($num_bits,$expected_tasks)\n" + set exit_code 1 + } + incr this_cnt 1 +} + + +############################################################################# +# +# Run varying number of threads, verify task count and number of set bits +# +set this_cnt 1 +while {$this_cnt <= $num_threads} { + set expected_tasks [ expr $num_sockets * $num_cores * $this_cnt ] + set num_tasks 0 + set num_bits 0 + set task_mask 0 + send "$srun -B $num_sockets:$num_cores:$this_cnt-$this_cnt $file_prog\n" + 
expect { + -re "TASK_ID:($number),MASK:($number)" { + incr task_mask $expect_out(2,string) + incr num_tasks 1 + # count number of set bits + set this_mask $expect_out(2,string) + while {$this_mask > 0} { + if {$this_mask & 1} { + incr num_bits 1 + } + set this_mask [ expr $this_mask >> 1 ] + } + exp_continue + } + -re "error" { + send_user "\nFAILURE: some error occurred\n" + set exit_code 1 + exp_continue + } + timeout { + send_user "\nFAILURE: srun (from --allocate) not responding " + send_user "or failure to recognize prompt\n" + set exit_code 1 + } + -re $prompt + } + + if {$num_tasks != $expected_tasks} { + send_user "\nFAILURE: number of tasks inconsistent ($num_tasks,$expected_tasks)\n" + set exit_code 1 + } + if {$num_bits != $expected_tasks} { + send_user "\nFAILURE: number of set bits inconsistent ($num_bits,$expected_tasks)\n" + set exit_code 1 + } + incr this_cnt 1 +} + +############################################################################# +# +# Run varying cpus per task, verify task count and number of set bits +# +set this_cnt 1 +while {$this_cnt <= $task_cnt} { + set expected_tasks 1 + set num_tasks 0 + set num_bits 0 + set task_mask 0 + send "$srun -c$this_cnt -B 1:1:1 $file_prog\n" + expect { + -re "TASK_ID:($number),MASK:($number)" { + incr task_mask $expect_out(2,string) + incr num_tasks 1 + # count number of set bits + set this_mask $expect_out(2,string) + while {$this_mask > 0} { + if {$this_mask & 1} { + incr num_bits 1 + } + set this_mask [ expr $this_mask >> 1 ] + } + exp_continue + } + -re "error" { + send_user "\nFAILURE: some error occurred\n" + set exit_code 1 + exp_continue + } + timeout { + send_user "\nFAILURE: srun (from --allocate) not responding " + send_user "or failure to recognize prompt\n" + set exit_code 1 + } + -re $prompt + } + + if {$num_tasks != $expected_tasks} { + send_user "\nFAILURE: number of tasks inconsistent ($num_tasks,$expected_tasks)\n" + set exit_code 1 + } + if {$num_bits != $this_cnt} { + send_user 
"\nFAILURE: number of set bits inconsistent ($num_bits,$this_cnt)\n" + set exit_code 1 + } + incr this_cnt 1 +} + +############################################################################# +# +# Run a job step with plane distribution to exercise option +# +set expected_mask [ expr ((1 << $task_cnt) - 1) * $task_cnt ] +set task_mask 0 +send "$srun -n $task_cnt -m plane=4 $file_prog\n" +expect { + -re "TASK_ID:($number),MASK:($number)" { + incr task_mask $expect_out(2,string) + exp_continue + } + -re "error" { + send_user "\nFAILURE: some error occurred\n" + set exit_code 1 + exp_continue + } + timeout { + send_user "\nFAILURE: srun (from --allocate) not responding " + send_user "or failure to recognize prompt\n" + set exit_code 1 + } + -re $prompt +} +if {$task_mask != $expected_mask} { + send_user "\nFAILURE: affinity mask inconsistency ($task_mask,$expected_mask)\n" + set exit_code 1 +} + +############################################################################# +# +# Terminate the job, free the allocation +# +send "exit\n" +expect { + -re "error" { + send_user "\nFAILURE: some error occurred\n" + set exit_code 1 + } + timeout { + send_user "\nFAILURE: srun (from --allocate) not responding " + send_user "or failure to recognize prompt\n" + slow_kill $srun_pid + set exit_code 1 + } + eof { + wait + } +} + +if {$exit_code == 0} { + exec $bin_rm -f $file_prog + send_user "\nSUCCESS\n" +} +exit $exit_code + diff --git a/testsuite/expect/test1.91.prog.c b/testsuite/expect/test1.91.prog.c new file mode 100644 index 0000000000000000000000000000000000000000..bcd2c40ec02dfa8477fc0d1026e238c216fcc168 --- /dev/null +++ b/testsuite/expect/test1.91.prog.c @@ -0,0 +1,79 @@ +/*****************************************************************************\ + * test1.91.prog.c - Simple test program for SLURM regression test1.91. 
+ * Reports SLURM task ID and the CPU mask, + * similar functionality to "taskset" command + ***************************************************************************** + * Copyright (C) 2005 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Morris Jette <jette1@llnl.gov> + * UCRL-CODE-226842. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+\*****************************************************************************/
+#define _GNU_SOURCE
+#define __USE_GNU
+#include <errno.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../../config.h"
+/* Load the calling process's CPU affinity into *mask; exit(1) on error. */
+static void _load_mask(cpu_set_t *mask)
+{
+	int rc;
+	/* sched_getaffinity() takes 3 or 2 arguments depending on the system */
+#ifdef SCHED_GETAFFINITY_THREE_ARGS
+	rc = sched_getaffinity((pid_t) 0, (unsigned int) sizeof(cpu_set_t),
+			       mask);
+#else
+	rc = sched_getaffinity((pid_t) 0, mask);
+#endif
+	if (rc != 0) {
+		fprintf(stderr, "ERROR: sched_getaffinity: %s\n",
+			strerror(errno));
+		exit(1);
+	}
+}
+/* Fold *mask into a bitmask (bit i = CPU i); CPUs past rc's width are dropped */
+static unsigned int _mask_to_int(cpu_set_t *mask)
+{
+	unsigned int i, rc = 0;
+	for (i=0; (i<CPU_SETSIZE) && (i<(8*sizeof(rc))); i++) {
+		if (CPU_ISSET(i, mask))
+			rc |= (1U << i);
+	}
+	return rc;
+}
+
+/* Print "TASK_ID:<id>,MASK:<mask>" for the task named by SLURM_PROCID */
+int main (int argc, char **argv)
+{
+	char *task_str;
+	cpu_set_t mask;
+	int task_id;
+
+	_load_mask(&mask);
+	if ((task_str = getenv("SLURM_PROCID")) == NULL) {
+		fprintf(stderr, "ERROR: getenv(SLURM_PROCID) failed\n");
+		exit(1);
+	}
+	task_id = atoi(task_str);
+	printf("TASK_ID:%d,MASK:%u\n", task_id, _mask_to_int(&mask));
+	exit(0);
+}
diff --git a/testsuite/expect/test1.92 b/testsuite/expect/test1.92
new file mode 100644
index 0000000000000000000000000000000000000000..151005f3f92eb8e88f06c63b6fad476857e9b2b8
--- /dev/null
+++ b/testsuite/expect/test1.92
@@ -0,0 +1,264 @@
+#!/usr/bin/expect
+############################################################################
+# Purpose: Test of SLURM functionality
+# Test of task distribution support on multi-core systems.
+#
+# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR
+# "WARNING: ..." with an explanation of why the test can't be made, OR
+# "FAILURE: ..." otherwise with an explanation of the failure, OR
+# anything else indicates a failure mode that must be investigated.
+############################################################################
+# Copyright (C) 2005 The Regents of the University of California.
+# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). +# Written by Morris Jette <jette1@llnl.gov> +# UCRL-CODE-226842. +# +# This file is part of SLURM, a resource management program. +# For details, see <http://www.llnl.gov/linux/slurm/>. +# +# SLURM is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free +# Software Foundation; either version 2 of the License, or (at your option) +# any later version. +# +# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License along +# with SLURM; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +############################################################################ +source ./globals + +set test_id "1.92" +set exit_code 0 +set file_bash "test$test_id.bash" + +print_header $test_id + +# +# Create an allocation +# +set srun_pid [spawn $srun --allocate -N2 --verbose -t2] + +############################################################################# +# +# Run a job step to get allocated processor count +# +expect -re $prompt +set mask 0 +set task_cnt 0 +set prev_node -1 +set node_cnt 0 +send "$srun -l -c1 $file_bash | sort -n\n" +expect { + -re "nodeid:($number) taskid:($number)" { + set this_node $expect_out(1,string) + set this_tid $expect_out(2,string) + incr task_cnt 1 + if {$this_node != $prev_node} { + incr node_cnt 1 + set prev_node $this_node + } + exp_continue + } + -re "error" { + send_user "\nFAILURE: some error occurred\n" + set exit_code 1 + exp_continue + } + timeout { + send_user "\nFAILURE: srun (from --allocate) not responding " + send_user "or failure to recognize prompt\n" + slow_kill $srun_pid 
+ exit 1 + } + -re $prompt +} + +if {$node_cnt != 2} { + send_user "\nWARNING: need 2 nodes to perform test\n" + exit $exit_code +} + +if {$task_cnt < (2 * $node_cnt)} { + send_user "\nWARNING: need at least 2 CPUs per node, test is not applicable\n" + exit $exit_code +} + +############################################################################# +# +# Run a job step with block distribution +# +set this_cnt 0 +set prev_node -1 +set this_node -1 +send "$srun -l -n $task_cnt -m block $file_bash | sort -n\n" +expect { + -re "nodeid:($number) taskid:($number) localid:($number)" { + set this_node $expect_out(1,string) + set this_tid $expect_out(2,string) + set this_lid $expect_out(3,string) + incr this_cnt 1 + if {$prev_node != $this_node} { + if {$prev_node > $this_node } { + send_user "\nFAILURE: incorrect distribution " + send_user " $this_node, $prev_node\n" + set exit_code 1 + } + set prev_node $this_node + set prev_cnt 1 + } else { + incr prev_cnt 1 + } + exp_continue + } + -re "error" { + send_user "\nFAILURE: some error occurred\n" + set exit_code 1 + exp_continue + } + timeout { + send_user "\nFAILURE: srun not responding " + send_user "or failure to recognize prompt\n" + set exit_code 1 + } + -re $prompt +} +if {$prev_node > $this_node } { + send_user "\nFAILURE: incorrect final distribution\n" + set exit_code 1 +} +if {$this_cnt != $task_cnt} { + send_user "\nFAILURE: task count inconsistency ($this_cnt,$task_cnt)\n" + set exit_code 1 +} + +############################################################################# +# +# Run a job step with cyclic distribution +# +set block_size 1 +set this_cnt 0 +set prev_node -1 +set this_node -1 +set prev_cnt $block_size +send "$srun -l -n $task_cnt -m cyclic $file_bash | sort -n\n" +expect { + -re "nodeid:($number) taskid:($number) localid:($number)" { + set this_node $expect_out(1,string) + set this_tid $expect_out(2,string) + set this_lid $expect_out(3,string) + incr this_cnt 1 + if {$prev_node != $this_node} { + 
if {$prev_cnt != $block_size } { + send_user "\nFAILURE: incorrect distribution " + send_user " $this_node, $prev_node, $prev_cnt\n" + set exit_code 1 + } + set prev_node $this_node + set prev_cnt 1 + } else { + incr prev_cnt 1 + } + exp_continue + } + -re "error" { + send_user "\nFAILURE: some error occurred\n" + set exit_code 1 + exp_continue + } + timeout { + send_user "\nFAILURE: srun not responding " + send_user "or failure to recognize prompt\n" + set exit_code 1 + } + -re $prompt +} +if {$prev_cnt != $block_size} { + send_user "\nFAILURE: incorrect final distribution\n" + set exit_code 1 +} +if {$this_cnt != $task_cnt} { + send_user "\nFAILURE: task count inconsistency ($this_cnt,$task_cnt)\n" + set exit_code 1 +} + +############################################################################# +# +# Run a job step with plane distribution +# +set block_size 2 +set this_cnt 0 +set prev_node -1 +set this_node -1 +set prev_cnt $block_size +send "$srun -l -n $task_cnt -m plane=$block_size $file_bash | sort -n\n" +expect { + -re "nodeid:($number) taskid:($number) localid:($number)" { + set this_node $expect_out(1,string) + set this_tid $expect_out(2,string) + set this_lid $expect_out(3,string) + incr this_cnt 1 + if {$prev_node != $this_node} { + if {$prev_cnt != $block_size } { + send_user "\nFAILURE: incorrect distribution " + send_user " $this_node, $prev_node, $prev_cnt\n" + set exit_code 1 + } + set prev_node $this_node + set prev_cnt 1 + } else { + incr prev_cnt 1 + } + exp_continue + } + -re "error" { + send_user "\nFAILURE: some error occurred\n" + set exit_code 1 + exp_continue + } + timeout { + send_user "\nFAILURE: srun not responding " + send_user "or failure to recognize prompt\n" + set exit_code 1 + } + -re $prompt +} +if {$prev_cnt != $block_size} { + send_user "\nFAILURE: incorrect final distribution\n" + set exit_code 1 +} +if {$this_cnt != $task_cnt} { + send_user "\nFAILURE: task count inconsistency ($this_cnt,$task_cnt)\n" + set exit_code 1 +} 
+ +############################################################################# +# +# Terminate the job, free the allocation +# +send "exit\n" +expect { + -re "error" { + send_user "\nFAILURE: some error occurred\n" + set exit_code 1 + } + timeout { + send_user "\nFAILURE: srun (from --allocate) not responding " + send_user "or failure to recognize prompt\n" + slow_kill $srun_pid + set exit_code 1 + } + eof { + wait + } +} + +if {$exit_code == 0} { + send_user "\nSUCCESS\n" +} +exit $exit_code + diff --git a/testsuite/expect/test1.92.bash b/testsuite/expect/test1.92.bash new file mode 100644 index 0000000000000000000000000000000000000000..8afbe3289e2f769a18475e9b4ebc363c0259ec3c --- /dev/null +++ b/testsuite/expect/test1.92.bash @@ -0,0 +1,5 @@ +#!/bin/bash + +echo nodeid:$SLURM_NODEID taskid:$SLURM_PROCID localid:$SLURM_LOCALID +exit 0 +