From acbf625dc998e0280b3996472b4675214a5d8b31 Mon Sep 17 00:00:00 2001 From: Marshall Garey <marshall@schedmd.com> Date: Thu, 13 Feb 2020 12:57:46 -0700 Subject: [PATCH] Testsuite - Add new test to test dependencies Bug 6068 --- testsuite/expect/Makefile.am | 1 + testsuite/expect/Makefile.in | 1 + testsuite/expect/README | 1 + testsuite/expect/test37.17 | 963 +++++++++++++++++++++++++++++++++++ 4 files changed, 966 insertions(+) create mode 100755 testsuite/expect/test37.17 diff --git a/testsuite/expect/Makefile.am b/testsuite/expect/Makefile.am index 251fc14e154..d35f4ae7e41 100644 --- a/testsuite/expect/Makefile.am +++ b/testsuite/expect/Makefile.am @@ -616,6 +616,7 @@ EXTRA_DIST = \ test37.14 \ test37.15 \ test37.16 \ + test37.17 \ test38.1 \ test38.2 \ test38.3 \ diff --git a/testsuite/expect/Makefile.in b/testsuite/expect/Makefile.in index 8274f6fdc1c..d5c543c6d2c 100644 --- a/testsuite/expect/Makefile.in +++ b/testsuite/expect/Makefile.in @@ -1037,6 +1037,7 @@ EXTRA_DIST = \ test37.14 \ test37.15 \ test37.16 \ + test37.17 \ test38.1 \ test38.2 \ test38.3 \ diff --git a/testsuite/expect/README b/testsuite/expect/README index 7d86bdcf4cf..d669f69e600 100644 --- a/testsuite/expect/README +++ b/testsuite/expect/README @@ -762,6 +762,7 @@ test37.13 Validate federated arrays test37.14 Validate federated scontrol notify test37.15 Validate federated scontrol suspend test37.16 Validate job cleanup when clusters are removed federation +test37.17 Test local and remote job dependencies test38.# Testing of heterogeneous jobs. ========================================= diff --git a/testsuite/expect/test37.17 b/testsuite/expect/test37.17 new file mode 100755 index 00000000000..6b432a769ec --- /dev/null +++ b/testsuite/expect/test37.17 @@ -0,0 +1,963 @@ +#!/usr/bin/env expect +############################################################################ +# Purpose: Test local and remote job dependencies +# +# Reqs: 1. Using slurmdbd accounting storage type and is up +# 2. 
fed_slurm_base is defined in globals.local - set to directory that +# has access to each federation configure (fedc1, fedc2, fedc3). +# Eg. +# fedr/slurm/ (src) +# fedr/fed1/bin +# fedr/fed1/sbin +# fedr/fed1/etc +# fedr/fed1/... +# fedr/fed2/... +# fedr/fed3/... +# 3. controllers are up and running. +# +# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR +# "FAILURE: ..." otherwise with an explanation of the failure, OR +# anything else indicates a failure mode that must be investigated. +############################################################################ +# Copyright (C) 2020 SchedMD LLC. +# Written by Marshall Garey <marshall@schedmd.com> +# +# This file is part of Slurm, a resource management program. +# For details, see <https://slurm.schedmd.com/>. +# Please also read the included file: DISCLAIMER. +# +# Slurm is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free +# Software Foundation; either version 2 of the License, or (at your option) +# any later version. +# +# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License along +# with Slurm; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+############################################################################
+
+source ./globals
+source ./globals_accounting
+source ./globals_federation
+
+set test_id "37.17"
+set c1 $fedc1
+set c2 $fedc2
+set c3 $fedc3
+set exit_code 0
+set job_id1 0
+set job_id2 0
+set user_name [get_my_user_name]
+set file_in_long "test$test_id\_long.in"
+set file_in_short "test$test_id\_short.in"
+set my_scancel "${fed_slurm_base}/$c1/bin/scancel"
+set my_scontrol "${fed_slurm_base}/$c1/bin/scontrol"
+set reason ""
+set dependency ""
+set fed_name "fed_test$test_id"
+
+###############################################################################
+# Functions
+###############################################################################
+
+#
+# Cancel every job owned by the test user on all three clusters, then pause
+# briefly so the cancellations can take effect before the next test starts.
+#
+proc cancel_all_jobs { } {
+	global user_name c1 c2 c3 my_scancel
+
+	spawn $my_scancel -M$c1,$c2,$c3 -u $user_name
+	expect {
+		eof {
+			wait
+		}
+	}
+	sleep 5
+}
+
+#
+# Cancel a single job and block until it is reported as DONE.
+#
+# job_id   - job (or array task) to cancel
+# clusters - cluster(s) passed through to wait_for_fed_job
+#
+proc cancel_job { job_id clusters } {
+	global my_scancel
+
+	spawn $my_scancel $job_id
+	expect {
+		eof {
+			wait
+		}
+	}
+	wait_for_fed_job $job_id "DONE" $clusters
+}
+
+#
+# Tear down the test federation, cancel leftover jobs, remove the generated
+# batch scripts, report the result and exit.
+#
+# rc - exit code; any value > 0 is reported as a failure
+#
+proc cleanup { rc } {
+	global bin_rm file_in_long file_in_short fed_name test_id
+
+	delete_federations $fed_name
+	cancel_all_jobs
+	exec $bin_rm -f $file_in_long
+	exec $bin_rm -f $file_in_short
+	if { $rc > 0 } {
+		print_failure $test_id
+	} else {
+		print_success $test_id
+	}
+	exit $rc
+}
+
+#
+# Submit a one-minute batch job with the sbatch binary of cluster "cdir".
+#
+# options - extra sbatch options (e.g. -M and --depend)
+# cdir    - cluster directory under fed_slurm_base whose sbatch is used
+# file_in - batch script to submit
+#
+# Returns the job id. Ends the test via "cleanup 1" if the submission fails.
+#
+proc submit_job { options cdir file_in } {
+	global number fed_slurm_base test_id
+
+	set job_id 0
+	set my_sbatch "${fed_slurm_base}/$cdir/bin/sbatch"
+	set command "$my_sbatch --job-name=test$test_id\_job -t1 \
+		$options --output=/dev/null $file_in"
+	set sbatch_pid [spawn {*}$command]
+	expect {
+		-re "Submitted batch job ($number)" {
+			set job_id $expect_out(1,string)
+			exp_continue
+		}
+		timeout {
+			log_error "sbatch not responding"
+			slow_kill $sbatch_pid
+			cleanup 1
+		}
+		eof {
+			wait
+		}
+	}
+	if { $job_id == 0 } {
+		log_error "Failed to submit job\n"
+		cleanup 1
+	}
+
+	return $job_id
+}
+
+#
+# Read the Reason= and Dependency= fields of a job from "scontrol show job".
+#
+# The values are stored in the globals "reason" and "dependency" (both are
+# reset to "" first, and stay "" if scontrol printed nothing that matched).
+#
+# Returns the dependency string.
+#
+proc get_job_dependency { job_id } {
+	global my_scontrol reason dependency alpha
+
+	set reason ""
+	set dependency ""
+	set reason_match "\[a-zA-Z_\]+"
+	# Possible dependency syntax:
+	# <type>:<jobid>+<time>(state)
+	# <type>:<jobid>(state)
+	# singleton(unfulfilled)
+	# Notes:
+	# * Multiple dependencies are separated by a comma or a question mark
+	# * The state for singleton will only ever be "unfulfilled"
+	# * The state is either failed or unfulfilled. Fulfilled dependencies
+	#   are cleared from the list
+	# * When there are no dependencies, it will be this string: "(null)"
+
+	# The regex built below handles dependencies separated by a comma or
+	# a question mark. Once assembled it is equivalent to:
+	# ([a-zA-Z_]+:[0-9_*+]+\([a-zA-Z]+\),*\?*|singleton\(unfulfilled\),*\?*)+|\(null\)
+	set type "\[a-zA-Z_\]+"
+	set jobid_time "\[0-9_*+\]+"
+	set state "\\($alpha\\)"
+	set delim ",*\\?*"
+	set depend_regex "\($type:$jobid_time$state$delim\|singleton\\(unfulfilled\\)$delim\)+"
+	set no_depend "\\(null\\)"
+	set depend_match "$depend_regex\|$no_depend"
+
+	log_user 0
+	spawn $my_scontrol show job $job_id
+	expect {
+		-re "Reason=($reason_match) Dependency=($depend_match)" {
+			set reason $expect_out(1,string)
+			set dependency $expect_out(2,string)
+			exp_continue
+		}
+		timeout {
+			# Re-enable output before reporting the failure.
+			log_user 1
+			log_error "scontrol not responding"
+			cleanup 1
+		}
+		eof {
+			wait
+		}
+	}
+	log_user 1
+	log_info "job $job_id; actual reason: \"$reason\"; dependency: \"$dependency\""
+	return $dependency
+}
+
+#
+# Compare the current reason/dependency of a job with the expected values.
+#
+# Returns 0 when both match, 1 otherwise. The actual values are left in the
+# globals "reason" and "dependency".
+#
+proc check_depend { job_id expected_reason expected_dependency } {
+	global reason dependency
+
+	get_job_dependency $job_id
+	if { [string compare "$reason" "$expected_reason"] } {
+		return 1
+	}
+	if { [string compare "$dependency" "$expected_dependency"] } {
+		return 1
+	}
+	return 0
+}
+
+#
+# Poll a job until its reason and dependency match the expected values.
+#
+# Returns 0 on a match. Ends the test via "cleanup 1" if the job reaches
+# DependencyNeverSatisfied when that was not expected, or if the values have
+# not matched once max_delay seconds have elapsed.
+#
+proc wait_for_depend { job_id expected_reason expected_dependency } {
+	global reason dependency bin_sleep
+
+	set error 0
+	set my_delay 0
+	# max_delay 30 seconds because by default we test remote dependencies
+	# every 30 seconds, so we might have to wait that long for a result.
+	# Make this interval shorter by decreasing MinJobAge in slurm.conf
+	# because dependencies also get tested every MinJobAge seconds.
+	set max_delay 30
+	set poll_interval 3
+
+	if { [string compare $expected_reason "DependencyNeverSatisfied"] } {
+		set want_never_satisfied 0
+	} else {
+		set want_never_satisfied 1
+	}
+
+	log_info "job $job_id; expected reason: \"$expected_reason\"; dependency: \"$expected_dependency\""
+
+	while 1 {
+		if { ![check_depend $job_id $expected_reason \
+		       $expected_dependency] } {
+			return 0
+		}
+
+		# A failed dependency never recovers, so stop polling at once.
+		if { (!$want_never_satisfied) && \
+		     !([string compare $reason \
+			"DependencyNeverSatisfied"]) } {
+			log_error "Job dependency failed, but it shouldn't have."
+			set error 1
+		}
+		if { $my_delay >= $max_delay } {
+			log_info "delay $my_delay max $max_delay"
+			log_error "Timeout waiting for dependency to change."
+			set error 1
+		}
+
+		if { $error } {
+			log_error "Job $job_id actual: reason=\"$reason\"; dependency=\"$dependency\"; expected: reason=\"$expected_reason\"; dependency=\"$expected_dependency\""
+			cleanup 1
+		}
+
+		exec $bin_sleep $poll_interval
+		incr my_delay $poll_interval
+	}
+}
+
+#
+# Wait for a job to reach "state" and end the test via "cleanup 1" unless
+# wait_for_fed_job reports exactly the expected cluster.
+#
+proc my_wait_for_fed_job { job_id state cluster } {
+	set ret_cluster [wait_for_fed_job $job_id $state $cluster]
+	if { [string compare $cluster $ret_cluster] } {
+		cleanup 1
+	}
+}
+
+#
+# Check whether a job record exists in the given cluster, using
+# "scontrol -M<cluster> --local show job".
+#
+# Returns 1 if the job was found and 0 if scontrol reported an invalid job
+# id. Ends the test via "cleanup 1" if scontrol does not respond or if its
+# output allows neither conclusion.
+#
+proc is_job_on_cluster { job_id cluster } {
+	global my_scontrol
+
+	# -1 means "not determined yet".
+	set found -1
+
+	log_user 0
+	spawn $my_scontrol -M$cluster --local -o show job $job_id
+	expect {
+		-re "JobId=$job_id" {
+			set found 1
+			exp_continue
+		}
+		-re "Invalid job id specified" {
+			set found 0
+			exp_continue
+		}
+		timeout {
+			log_user 1
+			log_error "scontrol not responding"
+			cleanup 1
+		}
+		eof {
+			# Always reap the spawned scontrol process.
+			wait
+		}
+	}
+	log_user 1
+
+	if { $found < 0 } {
+		log_error "Unable to determine whether job $job_id is in cluster $cluster"
+		cleanup 1
+	}
+	if { $found } {
+		log_info "Found job $job_id in cluster $cluster"
+	} else {
+		log_info "Did not find job $job_id in cluster $cluster"
+	}
+	return $found
+}
+
+#
+# --depend=after: local and remote success, plus the after:<jobid>+<time>
+# form.
+#
+proc test_after { } {
+	global c1 c2 file_in_long bin_sleep
+	send_user "
+#############################################################################
+# Test after
+#############################################################################
+	\n\n"
+
+	# Local dependency succeeds
+	log_info "after: test that local dependency succeeds:"
+	set job_id1 [submit_job "-M$c1 --begin=now+5" $c1 $file_in_long]
+	set job_id2 [submit_job "--depend=after:$job_id1 -M$c1" $c1 \
+		     $file_in_long]
+
+	wait_for_depend $job_id2 "Dependency" "after:$job_id1\(unfulfilled\)"
+	my_wait_for_fed_job $job_id1 "RUNNING" $c1
+	wait_for_depend $job_id2 "None" "(null)"
+	my_wait_for_fed_job $job_id2 "RUNNING" $c1
+	cancel_job $job_id1 $c1
+	cancel_job $job_id2 $c1
+
+	# Remote dependency succeeds
+	log_info "after: test that remote dependency succeeds:"
+	set job_id1 [submit_job "-M$c2 --begin=now+5" $c2 $file_in_long]
+	set job_id2 [submit_job "--depend=after:$job_id1 -M$c1" $c1 \
+		     $file_in_long]
+
+	wait_for_depend $job_id2 "Dependency" "after:$job_id1\(unfulfilled\)"
+	my_wait_for_fed_job $job_id1 "RUNNING" $c2
+	wait_for_depend $job_id2 "None" "(null)"
+	my_wait_for_fed_job $job_id2 "RUNNING" $c1
+	cancel_job $job_id1 $c2
+	cancel_job $job_id2 $c1
+
+	# Test after with a time attached. file_in_long sleeps for 60 seconds.
+	log_info "after: test that a after+time works:"
+	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
+	set job_id2 [submit_job "--depend=after:$job_id1+1 -M$c1" $c1 \
+		     $file_in_long]
+
+	my_wait_for_fed_job $job_id1 "RUNNING" $c1
+	my_wait_for_fed_job $job_id2 "PENDING" $c1
+	wait_for_depend $job_id2 "Dependency" "after:$job_id1+1(unfulfilled)"
+	log_info "Check that job $job_id2 is still dependent after 45 seconds"
+	exec $bin_sleep 45
+	my_wait_for_fed_job $job_id2 "PENDING" $c1
+	wait_for_depend $job_id2 "Dependency" "after:$job_id1+1(unfulfilled)"
+	log_info "Wait for job $job_id2 dependency to be fulfilled"
+	wait_for_depend $job_id2 "None" "(null)"
+	my_wait_for_fed_job $job_id2 "RUNNING" $c1
+	cancel_job $job_id2 $c1
+
+	# After dependency never fails.
+}
+
+#
+# --depend=afterany: local and remote success, plus the old
+# "--depend=jobid[,jobid...]" syntax.
+#
+proc test_afterany { } {
+	global c1 c2 file_in_long
+
+	send_user "
+#############################################################################
+# Test afterany
+#############################################################################
+	\n\n"
+
+	# Local dependency succeeds
+	log_info "afterany: test that local dependency succeeds:"
+	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
+	set job_id2 [submit_job "--depend=afterany:$job_id1 -M$c1" $c1 \
+		     $file_in_long]
+
+	wait_for_depend $job_id2 "Dependency" "afterany:$job_id1\(unfulfilled\)"
+	my_wait_for_fed_job $job_id1 "RUNNING" $c1
+	cancel_job $job_id1 $c1
+	wait_for_depend $job_id2 "None" "(null)"
+	my_wait_for_fed_job $job_id2 "RUNNING" $c1
+	cancel_job $job_id2 $c1
+
+	# Remote dependency succeeds
+	log_info "afterany: test that remote dependency succeeds:"
+	set job_id1 [submit_job "-M$c2" $c2 $file_in_long]
+	set job_id2 [submit_job "--depend=afterany:$job_id1 -M$c1" $c1 \
+		     $file_in_long]
+
+	wait_for_depend $job_id2 "Dependency" "afterany:$job_id1\(unfulfilled\)"
+	my_wait_for_fed_job $job_id1 "RUNNING" $c2
+	cancel_job $job_id1 $c2
+	wait_for_depend $job_id2 "None" "(null)"
+	my_wait_for_fed_job $job_id2 "RUNNING" $c1
+	cancel_job $job_id2 $c1
+
+	# Test old syntax: --depend=jobid,jobid
+	log_info "afterany: test old syntax: --depend=jobid\[,jobid,jobid...\]"
+	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
+	set job_id2 [submit_job "--depend=$job_id1 -M$c1" $c1 \
+		     $file_in_long]
+	set job_id3 [submit_job "--depend=$job_id1,$job_id2 -M$c1" $c1 \
+		     $file_in_long]
+
+	my_wait_for_fed_job $job_id1 "RUNNING" $c1
+	my_wait_for_fed_job $job_id2 "PENDING" $c1
+	my_wait_for_fed_job $job_id3 "PENDING" $c1
+	wait_for_depend $job_id2 "Dependency" "afterany:$job_id1\(unfulfilled\)"
+	wait_for_depend $job_id3 "Dependency" \
+		"afterany:$job_id1\(unfulfilled\),afterany:$job_id2\(unfulfilled\)"
+	cancel_all_jobs
+}
+
+#
+# --depend=aftercorr with job arrays: local and remote, success and failure.
+#
+proc test_aftercorr { } {
+	global c1 c2 kill_invalid_depend file_in_long file_in_short
+
+	send_user "
+#############################################################################
+# Test aftercorr
+#############################################################################
+	\n\n"
+
+	# Local dependency succeeds
+	log_info "aftercorr: test that local dependency succeeds:"
+	set job_array1 [submit_job "-M$c1 --array=1-2" $c1 $file_in_short]
+	set job_array2 [submit_job "--depend=aftercorr:$job_array1 -M$c1 \
+			--array=1-2" $c1 $file_in_long]
+
+	my_wait_for_fed_job "$job_array1\_1" "RUNNING" $c1
+	my_wait_for_fed_job "$job_array1\_2" "RUNNING" $c1
+	my_wait_for_fed_job "$job_array2\_1" "PENDING" $c1
+	my_wait_for_fed_job "$job_array2\_2" "PENDING" $c1
+	wait_for_depend "$job_array2\_1" "Dependency" \
+		"aftercorr:$job_array1\_*(unfulfilled)"
+	wait_for_depend "$job_array2\_2" "Dependency" \
+		"aftercorr:$job_array1\_*(unfulfilled)"
+
+	my_wait_for_fed_job "$job_array1\_1" "DONE" $c1
+	my_wait_for_fed_job "$job_array1\_2" "DONE" $c1
+	wait_for_depend "$job_array2\_1" "None" "(null)"
+	wait_for_depend "$job_array2\_2" "None" "(null)"
+	my_wait_for_fed_job "$job_array2\_1" "RUNNING" $c1
+	my_wait_for_fed_job "$job_array2\_2" "RUNNING" $c1
+	cancel_job $job_array2 $c1
+
+	# Local dependency fails
+	log_info "aftercorr: test that local dependency fails:"
+	set job_array1 [submit_job "-M$c1 --array=1-2" $c1 $file_in_long]
+	set job_array2 [submit_job "--depend=aftercorr:$job_array1 -M$c1 \
+			--array=1-2" $c1 $file_in_long]
+
+	my_wait_for_fed_job "$job_array1\_1" "RUNNING" $c1
+	my_wait_for_fed_job "$job_array1\_2" "RUNNING" $c1
+	my_wait_for_fed_job "$job_array2\_1" "PENDING" $c1
+	my_wait_for_fed_job "$job_array2\_2" "PENDING" $c1
+	wait_for_depend "$job_array2\_1" "Dependency" \
+		"aftercorr:$job_array1\_*(unfulfilled)"
+	wait_for_depend "$job_array2\_2" "Dependency" \
+		"aftercorr:$job_array1\_*(unfulfilled)"
+
+	cancel_job "$job_array1\_1" $c1
+	wait_for_depend "$job_array2\_1" "DependencyNeverSatisfied" \
+		"aftercorr:$job_array1\_*(failed)"
+	cancel_job "$job_array1\_2" $c1
+	wait_for_depend "$job_array2\_2" "DependencyNeverSatisfied" \
+		"aftercorr:$job_array1\_*(failed)"
+	if { !$kill_invalid_depend } {
+		cancel_job "$job_array2" $c1
+	}
+
+	# Remote dependency succeeds
+	log_info "aftercorr: test that remote dependency succeeds:"
+	set job_array1 [submit_job "-M$c2 --array=1-2" $c2 $file_in_short]
+	set job_array2 [submit_job "--depend=aftercorr:$job_array1 -M$c1 \
+			--array=1-2" $c1 $file_in_long]
+
+	my_wait_for_fed_job "$job_array1\_1" "RUNNING" $c2
+	my_wait_for_fed_job "$job_array1\_2" "RUNNING" $c2
+	my_wait_for_fed_job "$job_array2\_1" "PENDING" $c1
+	my_wait_for_fed_job "$job_array2\_2" "PENDING" $c1
+	# The dependency on the remote side has _*, but the dependency locally
+	# doesn't because it couldn't find the remote job.
+	wait_for_depend "$job_array2\_1" "Dependency" \
+		"aftercorr:$job_array1\(unfulfilled)"
+	wait_for_depend "$job_array2\_2" "Dependency" \
+		"aftercorr:$job_array1\(unfulfilled)"
+
+	my_wait_for_fed_job "$job_array1\_1" "DONE" $c2
+	my_wait_for_fed_job "$job_array1\_2" "DONE" $c2
+	wait_for_depend "$job_array2\_1" "None" "(null)"
+	wait_for_depend "$job_array2\_2" "None" "(null)"
+	my_wait_for_fed_job "$job_array2\_1" "RUNNING" $c1
+	my_wait_for_fed_job "$job_array2\_2" "RUNNING" $c1
+	cancel_job $job_array2 $c1
+
+	# Remote dependency fails
+	log_info "aftercorr: test that remote dependency fails:"
+	set job_array1 [submit_job "-M$c2 --array=1-2" $c2 $file_in_long]
+	set job_array2 [submit_job "--depend=aftercorr:$job_array1 -M$c1 \
+			--array=1-2" $c1 $file_in_long]
+
+	my_wait_for_fed_job "$job_array1\_1" "RUNNING" $c2
+	my_wait_for_fed_job "$job_array1\_2" "RUNNING" $c2
+	my_wait_for_fed_job "$job_array2\_1" "PENDING" $c1
+	my_wait_for_fed_job "$job_array2\_2" "PENDING" $c1
+	wait_for_depend "$job_array2\_1" "Dependency" \
+		"aftercorr:$job_array1\(unfulfilled)"
+	wait_for_depend "$job_array2\_2" "Dependency" \
+		"aftercorr:$job_array1\(unfulfilled)"
+
+	cancel_job "$job_array1\_1" $c2
+	wait_for_depend "$job_array2\_1" "DependencyNeverSatisfied" \
+		"aftercorr:$job_array1\(failed)"
+	cancel_job "$job_array1\_2" $c2
+	wait_for_depend "$job_array2\_2" "DependencyNeverSatisfied" \
+		"aftercorr:$job_array1\(failed)"
+	if { !$kill_invalid_depend } {
+		cancel_job "$job_array2" $c1
+	}
+}
+
+#
+# --depend=afterok: local and remote, success and failure.
+#
+proc test_afterok { } {
+	global c1 c2 kill_invalid_depend file_in_long file_in_short
+
+	send_user "
+#############################################################################
+# Test afterok
+#############################################################################
+	\n\n"
+
+	# Local dependency succeeds
+	log_info "afterok: test that local dependency succeeds:"
+	set job_id1 [submit_job "-M$c1" $c1 $file_in_short]
+	set job_id2 [submit_job "--depend=afterok:$job_id1 -M$c1" $c1 \
+		     $file_in_long]
+
+	my_wait_for_fed_job $job_id1 "RUNNING" $c1
+	my_wait_for_fed_job $job_id2 "PENDING" $c1
+	wait_for_depend $job_id2 "Dependency" "afterok:$job_id1\(unfulfilled\)"
+	my_wait_for_fed_job $job_id1 "DONE" $c1
+	wait_for_depend $job_id2 "None" "(null)"
+	my_wait_for_fed_job $job_id2 "RUNNING" $c1
+	cancel_job $job_id2 $c1
+
+	# Local dependency fails
+	log_info "afterok: test that local dependency fails:"
+	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
+	set job_id2 [submit_job "--depend=afterok:$job_id1 -M$c1" $c1 \
+		     $file_in_long]
+
+	my_wait_for_fed_job $job_id1 "RUNNING" $c1
+	my_wait_for_fed_job $job_id2 "PENDING" $c1
+	wait_for_depend $job_id2 "Dependency" "afterok:$job_id1\(unfulfilled\)"
+	cancel_job $job_id1 $c1
+	wait_for_depend $job_id2 "DependencyNeverSatisfied" \
+		"afterok:$job_id1\(failed\)"
+	if { !$kill_invalid_depend } {
+		cancel_job $job_id2 $c1
+	}
+
+	# Remote dependency succeeds
+	log_info "afterok: test that remote dependency succeeds:"
+	set job_id1 [submit_job "-M$c2" $c2 $file_in_short]
+	set job_id2 [submit_job "--depend=afterok:$job_id1 -M$c1" $c1 \
+		     $file_in_long]
+
+	my_wait_for_fed_job $job_id1 "RUNNING" $c2
+	my_wait_for_fed_job $job_id2 "PENDING" $c1
+	wait_for_depend $job_id2 "Dependency" "afterok:$job_id1\(unfulfilled\)"
+	my_wait_for_fed_job $job_id1 "DONE" $c2
+	wait_for_depend $job_id2 "None" "(null)"
+	my_wait_for_fed_job $job_id2 "RUNNING" $c1
+	cancel_job $job_id2 $c1
+
+	# Remote dependency fails
+	log_info "afterok: test that remote dependency fails"
+	set job_id1 [submit_job "-M$c2" $c2 $file_in_long]
+	set job_id2 [submit_job "--depend=afterok:$job_id1 -M$c1" $c1 \
+		     $file_in_long]
+
+	my_wait_for_fed_job $job_id1 "RUNNING" $c2
+	my_wait_for_fed_job $job_id2 "PENDING" $c1
+	wait_for_depend $job_id2 "Dependency" "afterok:$job_id1\(unfulfilled\)"
+	cancel_job $job_id1 $c2
+	wait_for_depend $job_id2 "DependencyNeverSatisfied" \
+		"afterok:$job_id1\(failed\)"
+	if { !$kill_invalid_depend } {
+		cancel_job $job_id2 $c1
+	}
+}
+
+#
+# --depend=afternotok: local and remote, success and failure.
+#
+proc test_afternotok { } {
+	global c1 c2 kill_invalid_depend file_in_long file_in_short
+
+	send_user "
+#############################################################################
+# Test afternotok
+#############################################################################
+	\n\n"
+
+	# Local dependency succeeds
+	log_info "afternotok: test that local dependency succeeds:"
+	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
+	set job_id2 [submit_job "--depend=afternotok:$job_id1 -M$c1" $c1 \
+		     $file_in_long]
+
+	my_wait_for_fed_job $job_id1 "RUNNING" $c1
+	my_wait_for_fed_job $job_id2 "PENDING" $c1
+	wait_for_depend $job_id2 "Dependency" \
+		"afternotok:$job_id1\(unfulfilled\)"
+	cancel_job $job_id1 $c1
+	wait_for_depend $job_id2 "None" "(null)"
+	my_wait_for_fed_job $job_id2 "RUNNING" $c1
+	cancel_job $job_id2 $c1
+
+	# Local dependency fails
+	log_info "afternotok: test that local dependency fails:"
+	set job_id1 [submit_job "-M$c1" $c1 $file_in_short]
+	set job_id2 [submit_job "--depend=afternotok:$job_id1 -M$c1" $c1 \
+		     $file_in_long]
+
+	my_wait_for_fed_job $job_id1 "RUNNING" $c1
+	my_wait_for_fed_job $job_id2 "PENDING" $c1
+	wait_for_depend $job_id2 "Dependency" \
+		"afternotok:$job_id1\(unfulfilled\)"
+	my_wait_for_fed_job $job_id1 "DONE" $c1
+	wait_for_depend $job_id2 "DependencyNeverSatisfied" \
+		"afternotok:$job_id1\(failed\)"
+	if { !$kill_invalid_depend } {
+		cancel_job $job_id2 $c1
+	}
+
+	# Remote dependency succeeds
+	log_info "afternotok: test that remote dependency succeeds:"
+	set job_id1 [submit_job "-M$c2" $c2 $file_in_long]
+	set job_id2 [submit_job "--depend=afternotok:$job_id1 -M$c1" $c1 \
+		     $file_in_long]
+
+	my_wait_for_fed_job $job_id1 "RUNNING" $c2
+	my_wait_for_fed_job $job_id2 "PENDING" $c1
+	wait_for_depend $job_id2 "Dependency" \
+		"afternotok:$job_id1\(unfulfilled\)"
+	cancel_job $job_id1 $c2
+	wait_for_depend $job_id2 "None" "(null)"
+	my_wait_for_fed_job $job_id2 "RUNNING" $c1
+	cancel_job $job_id2 $c1
+
+	# Remote dependency fails
+	log_info "afternotok: test that remote dependency fails"
+	set job_id1 [submit_job "-M$c2" $c2 $file_in_short]
+	set job_id2 [submit_job "--depend=afternotok:$job_id1 -M$c1" $c1 \
+		     $file_in_long]
+
+	my_wait_for_fed_job $job_id1 "RUNNING" $c2
+	my_wait_for_fed_job $job_id2 "PENDING" $c1
+	wait_for_depend $job_id2 "Dependency" \
+		"afternotok:$job_id1\(unfulfilled\)"
+	my_wait_for_fed_job $job_id1 "DONE" $c2
+	wait_for_depend $job_id2 "DependencyNeverSatisfied" \
+		"afternotok:$job_id1\(failed\)"
+	if { !$kill_invalid_depend } {
+		cancel_job $job_id2 $c1
+	}
+}
+
+#
+# --depend=singleton on one cluster and across clusters. The multi-cluster
+# scenario depends on whether disable_remote_singleton is configured.
+#
+proc test_singleton { } {
+	global c1 c2 c3 disable_remote_singleton file_in_long
+
+	send_user "
+#############################################################################
+# Test singleton
+#############################################################################
+	\n\n"
+
+	# Test one cluster
+	log_info "singleton: test on one cluster"
+	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
+	set job_id2 [submit_job "--depend=singleton -M$c1" $c1 $file_in_long]
+
+	my_wait_for_fed_job $job_id1 "RUNNING" $c1
+	my_wait_for_fed_job $job_id2 "PENDING" $c1
+	wait_for_depend $job_id2 "Dependency" "singleton(unfulfilled)"
+	cancel_job $job_id1 $c1
+	wait_for_depend $job_id2 "None" "(null)"
+	my_wait_for_fed_job $job_id2 "RUNNING" $c1
+	cancel_job $job_id2 $c1
+
+	# Test multiple clusters
+	if { $disable_remote_singleton } {
+		# Test that remote jobs don't affect the singleton dependency
+		log_info "singleton: test that disable_remote_singleton works"
+		set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
+		set job_id2 [submit_job "-M$c2" $c2 $file_in_long]
+		set job_id3 [submit_job "--depend=singleton -M$c1" $c1 \
+			     $file_in_long]
+
+		my_wait_for_fed_job $job_id1 "RUNNING" $c1
+		my_wait_for_fed_job $job_id2 "RUNNING" $c2
+		my_wait_for_fed_job $job_id3 "PENDING" $c1
+		wait_for_depend $job_id3 "Dependency" "singleton(unfulfilled)"
+		# Cancel job 1 - job 3 should start running even though job 2 is
+		# running on another cluster
+		cancel_job $job_id1 $c1
+		wait_for_depend $job_id3 "None" "(null)"
+		my_wait_for_fed_job $job_id3 "RUNNING" $c1
+		cancel_job $job_id2 $c2
+		cancel_job $job_id3 $c1
+	} else {
+		# Test that singleton doesn't get cleared until jobs on all
+		# clusters are done
+		log_info "singleton: test with jobs on all clusters"
+		set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
+		set job_id2 [submit_job "-M$c2" $c2 $file_in_long]
+		set job_id3 [submit_job "-M$c3" $c3 $file_in_long]
+		set job_id4 [submit_job "--depend=singleton -M$c1" $c1 \
+			     $file_in_long]
+
+		my_wait_for_fed_job $job_id1 "RUNNING" $c1
+		my_wait_for_fed_job $job_id2 "RUNNING" $c2
+		my_wait_for_fed_job $job_id3 "RUNNING" $c3
+		my_wait_for_fed_job $job_id4 "PENDING" $c1
+		wait_for_depend $job_id4 "Dependency" "singleton(unfulfilled)"
+
+		# Job 4 shouldn't start until jobs 1, 2, and 3 are done.
+		# Test that it starts when a remote job is finished last.
+		cancel_job $job_id1 $c1
+		# Should still have the same dependency
+		wait_for_depend $job_id4 "Dependency" "singleton(unfulfilled)"
+		cancel_job $job_id2 $c2
+		cancel_job $job_id3 $c3
+		# Now the dependency should be cleared
+		wait_for_depend $job_id4 "None" "(null)"
+		my_wait_for_fed_job $job_id4 "RUNNING" $c1
+		cancel_job $job_id4 $c1
+	}
+}
+
+#
+# Removing a cluster from the federation must fail dependencies on its jobs;
+# adding it back must re-establish singleton dependencies on it.
+#
+# The federation is always restored (c3 re-added) before returning so that
+# later tests see all three clusters.
+#
+proc test_add_remove_clusters { } {
+	global c1 c2 c3 fed_name file_in_long \
+	       disable_remote_singleton kill_invalid_depend
+
+	# Test adding/removing clusters from the federation
+	# Removing a cluster from a federation should cause dependencies on
+	# jobs on that cluster to fail.
+	# Adding a cluster to a federation means that any singleton dependencies
+	# have to be fulfilled on that cluster.
+	send_user "
+#############################################################################
+# Test adding/removing a cluster from the federation.
+#############################################################################
+	\n\n"
+
+	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
+	set job_id2 [submit_job "-M$c2" $c2 $file_in_long]
+	set job_id3 [submit_job "-M$c3" $c3 $file_in_long]
+	set job_id4 [submit_job "--depend=afterok:$job_id3 -M$c1" $c1 \
+		     $file_in_long]
+	set job_id5 [submit_job "--depend=singleton -M$c1" $c1 $file_in_long]
+	my_wait_for_fed_job $job_id1 "RUNNING" $c1
+	my_wait_for_fed_job $job_id2 "RUNNING" $c2
+	my_wait_for_fed_job $job_id3 "RUNNING" $c3
+	my_wait_for_fed_job $job_id4 "PENDING" $c1
+	wait_for_depend $job_id4 "Dependency" "afterok:$job_id3\(unfulfilled\)"
+	wait_for_depend $job_id5 "Dependency" "singleton(unfulfilled)"
+
+	log_info "Test that removing cluster $c3 from fed $fed_name makes dependencies on jobs on $c3 fail"
+	if { [remove_cluster_from_fed $c3 $fed_name] } {
+		cleanup 1
+	}
+	wait_for_depend $job_id4 "DependencyNeverSatisfied" \
+		"afterok:$job_id3\(failed\)"
+	if { !$kill_invalid_depend } {
+		cancel_job $job_id4 $c1
+	}
+
+	# Re-add c3 in every configuration: the jobs cancelled below include
+	# one on c3, and the tests that follow need c3 in the federation.
+	if { !$disable_remote_singleton } {
+		log_info "Test that the singleton dependency was resent back to cluster $c3 when it was added back to the federation."
+	}
+	if { [add_cluster_to_fed $c3 $fed_name] } {
+		cleanup 1
+	}
+
+	cancel_job $job_id1 $c1
+	cancel_job $job_id2 $c2
+	cancel_job $job_id3 $c3
+
+	if { $disable_remote_singleton } {
+		# Remote singleton handling is off, so there is nothing more
+		# to verify; just clean up the dependent job.
+		cancel_job $job_id5 $c1
+		return
+	}
+
+	wait_for_depend $job_id5 "None" "(null)"
+	my_wait_for_fed_job $job_id5 "RUNNING" $c1
+	cancel_job $job_id5 $c1
+}
+
+#
+# A dependent job submitted to all clusters must exist only on its origin
+# cluster until the dependency clears, then appear on every sibling.
+#
+proc test_submit_to_all_clusters { } {
+	global c1 c2 c3 file_in_long
+
+	send_user "
+#############################################################################
+# Test submitting a dependent job to all clusters.
+#############################################################################
+	\n\n"
+
+	log_info "Test that a dependent job is only on its origin cluster while dependent and that it is submitted to all clusters when its dependency is cleared."
+	set job_id1 [submit_job "-M$c2" $c2 $file_in_long]
+	set job_id2 [submit_job "--depend=afternotok:$job_id1 -M$c1,$c2,$c3 \
+		     --begin=now+60" $c1 $file_in_long]
+
+	my_wait_for_fed_job $job_id1 "RUNNING" $c2
+	my_wait_for_fed_job $job_id2 "PENDING" $c1
+	wait_for_depend $job_id2 "Dependency" \
+		"afternotok:$job_id1\(unfulfilled\)"
+
+	log_info "Test that job $job_id2 is not on clusters $c2 or $c3."
+	if { [is_job_on_cluster $job_id2 $c2] || \
+	     [is_job_on_cluster $job_id2 $c3] } {
+		log_error "Job $job_id2 is in cluster $c2 and/or $c3 when it shouldn't be."
+		cleanup 1
+	}
+
+	log_info "Test that job $job_id2 is submitted to all sibling clusters $c2 and $c3 when its dependency is fulfilled."
+	cancel_job $job_id1 $c2
+	wait_for_depend $job_id2 "BeginTime" "(null)"
+	my_wait_for_fed_job $job_id2 "PENDING" "$c1"
+	my_wait_for_fed_job $job_id2 "PENDING" "$c2"
+	my_wait_for_fed_job $job_id2 "PENDING" "$c3"
+	cancel_job $job_id2 "$c1,$c2,$c3"
+}
+
+#
+# OR ("?") dependencies: one fulfilled dependency satisfies the whole set,
+# and the set only fails once every dependency has failed.
+#
+proc test_or_dependencies { } {
+	global c1 c2 file_in_long kill_invalid_depend
+
+	send_user "
+#############################################################################
+# Test OR dependencies.
+#############################################################################
+	\n\n"
+
+	log_info "OR dependencies: Test that one fulfilled dependency makes the whole dependency fulfilled:"
+	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
+	set job_id2 [submit_job "-M$c2" $c2 $file_in_long]
+	set job_id3 [submit_job \
+		     "--depend=afternotok:$job_id1?afternotok:$job_id2 -M$c1" \
+		     $c1 $file_in_long]
+
+	my_wait_for_fed_job $job_id1 "RUNNING" $c1
+	my_wait_for_fed_job $job_id2 "RUNNING" $c2
+	my_wait_for_fed_job $job_id3 "PENDING" $c1
+	wait_for_depend $job_id3 "Dependency" \
+		"afternotok:$job_id1\(unfulfilled\)?afternotok:$job_id2\(unfulfilled\)"
+
+	cancel_job $job_id2 $c2
+	wait_for_depend $job_id3 "None" "(null)"
+	my_wait_for_fed_job $job_id3 "RUNNING" $c1
+	cancel_job $job_id1 $c1
+	cancel_job $job_id3 $c1
+
+	log_info "OR dependencies: Test that the dependency doesn't fail until all dependencies have failed:"
+	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
+	set job_id2 [submit_job "-M$c2" $c2 $file_in_long]
+	set job_id3 [submit_job "--depend=afterok:$job_id1?afterok:$job_id2 \
+		     -M$c1" $c1 $file_in_long]
+
+	my_wait_for_fed_job $job_id1 "RUNNING" $c1
+	my_wait_for_fed_job $job_id2 "RUNNING" $c2
+	my_wait_for_fed_job $job_id3 "PENDING" $c1
+	wait_for_depend $job_id3 "Dependency" \
+		"afterok:$job_id1\(unfulfilled\)?afterok:$job_id2\(unfulfilled\)"
+
+	cancel_job $job_id1 $c1
+	wait_for_depend $job_id3 "Dependency" \
+		"afterok:$job_id1\(failed\)?afterok:$job_id2\(unfulfilled\)"
+	cancel_job $job_id2 $c2
+	wait_for_depend $job_id3 "DependencyNeverSatisfied" \
+		"afterok:$job_id1\(failed\)?afterok:$job_id2\(failed\)"
+	if { !$kill_invalid_depend } {
+		cancel_job $job_id3 $c1
+	}
+}
+
+#
+# AND (",") dependencies: every dependency must be fulfilled, and a single
+# failed dependency fails the whole set.
+#
+proc test_and_dependencies { } {
+	global c1 c2 file_in_long kill_invalid_depend
+
+	send_user "
+#############################################################################
+# Test AND dependencies.
+#############################################################################
+	\n\n"
+
+	log_info "AND dependencies: Test that the dependency isn't fulfilled until all dependencies are fulfilled:"
+	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
+	set job_id2 [submit_job "-M$c2" $c2 $file_in_long]
+	set job_id3 [submit_job \
+		     "--depend=afternotok:$job_id1,afternotok:$job_id2 -M$c1" \
+		     $c1 $file_in_long]
+
+	my_wait_for_fed_job $job_id1 "RUNNING" $c1
+	my_wait_for_fed_job $job_id2 "RUNNING" $c2
+	my_wait_for_fed_job $job_id3 "PENDING" $c1
+	wait_for_depend $job_id3 "Dependency" \
+		"afternotok:$job_id1\(unfulfilled\),afternotok:$job_id2\(unfulfilled\)"
+
+	cancel_job $job_id1 $c1
+	wait_for_depend $job_id3 "Dependency" \
+		"afternotok:$job_id2\(unfulfilled\)"
+	cancel_job $job_id2 $c2
+	wait_for_depend $job_id3 "None" "(null)"
+	my_wait_for_fed_job $job_id3 "RUNNING" $c1
+	cancel_job $job_id3 $c1
+
+	log_info "AND dependencies: Test that the whole dependency fails when a single dependency fails:"
+	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
+	set job_id2 [submit_job "-M$c2" $c2 $file_in_long]
+	set job_id3 [submit_job "--depend=afterok:$job_id1,afterok:$job_id2 \
+		     -M$c1" $c1 $file_in_long]
+
+	my_wait_for_fed_job $job_id1 "RUNNING" $c1
+	my_wait_for_fed_job $job_id2 "RUNNING" $c2
+	my_wait_for_fed_job $job_id3 "PENDING" $c1
+	wait_for_depend $job_id3 "Dependency" \
+		"afterok:$job_id1\(unfulfilled\),afterok:$job_id2\(unfulfilled\)"
+
+	cancel_job $job_id2 $c2
+	wait_for_depend $job_id3 "DependencyNeverSatisfied" \
+		"afterok:$job_id1\(unfulfilled\),afterok:$job_id2\(failed\)"
+	if { !$kill_invalid_depend } {
+		cancel_job $job_id3 $c1
+	}
+	cancel_job $job_id1 $c1
+}
+
+###############################################################################
+# Begin test
+###############################################################################
+
+print_header $test_id
+
+if { [test_account_storage] == 0 } {
+	log_warn "This test can't be run without a usable AccountStorageType"
+	exit 0
+}
+
+if {[test_federation_setup]} {
+	log_warn "WARNING: This test can't be run without fed_slurm_base, fedc1, fedc2, fedc3 setup in globals.local."
+	exit 0
+}
+
+if {[test_all_up]} {
+	exit 0
+}
+
+delete_federations $fed_name
+if { [setup_federation $fed_name] } {
+	cleanup 1
+}
+
+# Use file_in_short when we have to wait for the job to end.
+# Use file_in_long everywhere else.
+make_bash_script $file_in_long "$bin_sleep 60"
+make_bash_script $file_in_short "$bin_sleep 5"
+
+set permit_job_expansion [test_scheduler_params "permit_job_expansion"]
+log_info "permit_job_expansion: $permit_job_expansion"
+
+set kill_invalid_depend [test_dependency_params "kill_invalid_depend"]
+set disable_remote_singleton [test_dependency_params "disable_remote_singleton"]
+log_info "kill_invalid_depend: $kill_invalid_depend; disable_remote_singleton: $disable_remote_singleton\n"
+
+cancel_all_jobs
+
+test_after
+test_afterany
+# --depend=afterburstbuffer is tested in test35.6
+test_aftercorr
+test_afterok
+test_afternotok
+test_singleton
+# test --depend=expand in another test.
+test_add_remove_clusters
+test_submit_to_all_clusters
+test_or_dependencies
+test_and_dependencies
+
+cleanup 0
-- 
GitLab