diff --git a/testsuite/expect/Makefile.am b/testsuite/expect/Makefile.am
index 2d9499f50835eaa670bcf66d23e06f489a5a919b..14a0d2c3e5f599358176aae6d33952a12c8cd69e 100644
--- a/testsuite/expect/Makefile.am
+++ b/testsuite/expect/Makefile.am
@@ -576,6 +576,7 @@ EXTRA_DIST = \
 	test29.8 \
 	test30.1 \
 	test31.1 \
+	test31.2 \
 	test32.1 \
 	test32.2 \
 	test32.3 \
diff --git a/testsuite/expect/Makefile.in b/testsuite/expect/Makefile.in
index 0d6c65d28b78c465ebd1ec7dec0444c50ebd0874..d6af38565f9a138ae432035af11acf23aaa2e8c5 100644
--- a/testsuite/expect/Makefile.in
+++ b/testsuite/expect/Makefile.in
@@ -991,6 +991,7 @@ EXTRA_DIST = \
 	test29.8 \
 	test30.1 \
 	test31.1 \
+	test31.2 \
 	test32.1 \
 	test32.2 \
 	test32.3 \
diff --git a/testsuite/expect/README b/testsuite/expect/README
index 2356ab6a6b9db3b7616c6db3956ea6b17efa4592..8fd9ed4c44157ca58fc115f128091a3277d787fb 100644
--- a/testsuite/expect/README
+++ b/testsuite/expect/README
@@ -757,6 +757,7 @@ test30.1   Validates that RPMs are built with the correct prefix.
 
 test31.#   Test of Slurm Epilog and Prolog.
 ===========================================
 test31.1   Validate proper epilog and prolog child process management.
+test31.2   Validate that job is set to requeue held state at prolog failure.
 
 test32.#   Test of sgather command and options.
diff --git a/testsuite/expect/test31.2 b/testsuite/expect/test31.2
new file mode 100755
index 0000000000000000000000000000000000000000..bd18f7068643314a37c686ea75522b36f3a61108
--- /dev/null
+++ b/testsuite/expect/test31.2
@@ -0,0 +1,282 @@
+#!/usr/bin/env expect
+############################################################################
+# Purpose: Test of SLURM functionality
+#          Validate that a job is held and reason is set to launch
+#          failed requeued held when the prolog fails
+#
+# Output:  "TEST: #.#" followed by "SUCCESS" if test was successful, OR
+#          "FAILURE: ..." otherwise with an explanation of the failure, OR
+#          anything else indicates a failure mode that must be investigated.
+############################################################################
+# Copyright (C) 2015 SchedMD LLC
+# Written by Nathan Yee <nyee32@schedmd.com>
+#
+# This file is part of SLURM, a resource management program.
+# For details, see <http://slurm.schedmd.com/>.
+# Please also read the included file: DISCLAIMER.
+#
+# SLURM is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 2 of the License, or (at your option)
+# any later version.
+#
+# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License along
+# with SLURM; if not, write to the Free Software Foundation, Inc.
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+############################################################################
+source ./globals
+
+set test_id     31.2
+set exit_code   0
+set good_prolog "test$test_id\_good.sh"
+set bad_prolog  "test$test_id\_bad.sh"
+set test_node   [get_idle_node_in_part [default_partition]]
+set job_id      0
+set test_script "test$test_id\.sh"
+
+print_header $test_id
+
+if { [test_super_user] == 0 } {
+	send_user "WARNING: Test can only be run as SlurmUser\n"
+	exit $exit_code
+}
+
+# Reconfigure the slurmctld so that slurm.conf changes take effect
+proc update_conf { } {
+	global scontrol exit_code
+
+	spawn $scontrol reconfigure
+	expect {
+		timeout {
+			send_user "\nFAILURE: scontrol is not responding\n"
+			set exit_code 1
+		}
+		eof {
+			wait
+		}
+	}
+	# Wait for the reconfigure to complete (especially with valgrind)
+	sleep 5
+}
+
+# Restore the original slurm.conf, remove temporary scripts and
+# return the drained test node to the idle state
+proc clean_up { } {
+
+	global cwd config_dir exit_code bin_rm bin_cp good_prolog bad_prolog
+	global update_conf scontrol test_node test_script
+
+	exec $bin_cp -v $cwd/slurm.conf.orig $config_dir/slurm.conf
+	exec $bin_rm $test_script $good_prolog $bad_prolog $cwd/slurm.conf.orig
+	update_conf
+
+	spawn $scontrol update node=$test_node state=idle
+	expect {
+		timeout {
+			send_user "\nFAILURE: scontrol is not responding\n"
+			set exit_code 1
+		}
+		eof {
+			wait
+		}
+	}
+}
+
+# Cancel the specified job id
+proc cancel_job { job } {
+
+	global scancel exit_code
+
+	spawn $scancel $job
+	expect {
+		timeout {
+			send_user "\nFAILURE: scancel is not responding\n"
+			set exit_code 1
+		}
+		eof {
+			wait
+		}
+	}
+}
+
+make_bash_script $good_prolog "
+exit 0
+"
+make_bash_script $bad_prolog "
+exit 1
+"
+make_bash_script $test_script "
+sleep 20
+"
+
+# Get the location of the slurm.conf file
+set got_config 0
+spawn $scontrol show config
+expect {
+	-re "SLURM_CONF.*= (/.*)/($alpha).*SLURM_VERSION" {
+		set config_dir $expect_out(1,string)
+		set got_config 1
+		exp_continue
+	}
+	timeout {
+		send_user "\nFAILURE: scontrol is not responding\n"
+		set exit_code 1
+	}
+	eof {
+		wait
+	}
+}
+if {$got_config == 0} {
+	send_user "\nFAILURE: Could not identify slurm.conf location\n"
+	exit 1
+}
+
+#
+# Copy slurm.conf file
+#
+set cwd "[$bin_pwd]"
+exec $bin_rm -fr $cwd/slurm.conf.orig
+spawn $bin_cp -v $config_dir/slurm.conf $cwd/slurm.conf.orig
+expect {
+	timeout {
+		send_user "\nFAILURE: slurm.conf was not copied\n"
+		set exit_code 1
+	}
+	eof {
+		wait
+	}
+}
+
+# Append Prolog config to the slurm.conf
+exec $bin_echo "prolog=$cwd/$good_prolog" >> $config_dir/slurm.conf
+update_conf
+
+spawn $sbatch -t1 -N1 -w$test_node --exclusive -o/dev/null $test_script
+expect {
+	-re "Submitted batch job ($number)" {
+		set job_id $expect_out(1,string)
+		exp_continue
+	}
+	timeout {
+		send_user "\nFAILURE: sbatch is not responding\n"
+		set exit_code 1
+	}
+	eof {
+		wait
+	}
+}
+
+if {$job_id == 0} {
+	send_user "\nFAILURE: sbatch did not submit job\n"
+	set exit_code 1
+}
+
+wait_for_job $job_id "RUNNING"
+
+# Check that the job was submitted with no error
+set match 0
+spawn $squeue -h -j$job_id -o%t|%r
+expect {
+	-re "R.None" {
+		set match 1
+		exp_continue
+	}
+	timeout {
+		send_user "\nFAILURE: squeue is not responding\n"
+		set exit_code 1
+	}
+	eof {
+		wait
+	}
+}
+
+if {$match == 0} {
+	send_user "\nFAILURE: job $job_id is not in the correct state. "
+	send_user "Job $job_id should be Running and have reason: None\n"
+	set exit_code 1
+}
+
+cancel_job $job_id
+
+# Append Prolog config to the slurm.conf
+exec $bin_echo "prolog=$cwd/$bad_prolog" >> $config_dir/slurm.conf
+update_conf
+
+spawn $sbatch -t1 -N1 -w$test_node --exclusive -o/dev/null $test_script
+expect {
+	-re "Submitted batch job ($number)" {
+		set job_id $expect_out(1,string)
+		exp_continue
+	}
+	timeout {
+		send_user "\nFAILURE: sbatch is not responding\n"
+		set exit_code 1
+	}
+	eof {
+		wait
+	}
+}
+
+if {$job_id == 0} {
+	send_user "\nFAILURE: sbatch did not submit job\n"
+	set exit_code 1
+}
+
+# Wait a bit for the pending reason to be set
+sleep 10
+
+# Check that the job is held due to the failed prolog
+set match 0
+spawn $squeue -h -j$job_id -o%t|%r
+expect {
+	-re "PD.job requeued in held state" {
+		set match 1
+		exp_continue
+	}
+	timeout {
+		send_user "\nFAILURE: squeue is not responding\n"
+		set exit_code 1
+	}
+	eof {
+		wait
+	}
+}
+
+if {$match == 0} {
+	send_user "\nFAILURE: job $job_id is not in the correct state. "
+	send_user "Job $job_id should be Pending and have reason: "
+	send_user "launch failed requeued held\n"
+	set exit_code 1
+}
+
+# Check that the node that the job ran on is in fact drained
+set match 0
+spawn $sinfo -h -o%T -n$test_node
+expect {
+	-re "drained" {
+		set match 1
+		exp_continue
+	}
+	timeout {
+		send_user "\nFAILURE: sinfo is not responding\n"
+		set exit_code 1
+	}
+	eof {
+		wait
+	}
+}
+
+if {$match == 0} {
+	send_user "\nFAILURE: node $test_node was not drained when "
+	send_user "it should be\n"
+	set exit_code 1
+}
+
+cancel_job $job_id
+clean_up
+
+if {$exit_code == 0} {
+	send_user "\nSUCCESS\n"
+}
+exit $exit_code