Skip to content
Snippets Groups Projects
Commit acbf625d authored by Marshall Garey's avatar Marshall Garey Committed by Brian Christiansen
Browse files

Testsuite - Add new test to test dependencies

Bug 6068
parent 5fc9a174
No related branches found
No related tags found
No related merge requests found
......@@ -616,6 +616,7 @@ EXTRA_DIST = \
test37.14 \
test37.15 \
test37.16 \
test37.17 \
test38.1 \
test38.2 \
test38.3 \
......
......@@ -1037,6 +1037,7 @@ EXTRA_DIST = \
test37.14 \
test37.15 \
test37.16 \
test37.17 \
test38.1 \
test38.2 \
test38.3 \
......
......@@ -762,6 +762,7 @@ test37.13 Validate federated arrays
test37.14 Validate federated scontrol notify
test37.15 Validate federated scontrol suspend
test37.16 Validate job cleanup when clusters are removed from federation
test37.17 Test local and remote job dependencies
test38.# Testing of heterogeneous jobs.
=========================================
......
#!/usr/bin/env expect
############################################################################
# Purpose: Test local and remote job dependencies
#
# Reqs: 1. Using slurmdbd accounting storage type and is up
# 2. fed_slurm_base is defined in globals.local - set to the directory
# that contains each federated cluster's installation (fedc1, fedc2, fedc3).
# Eg.
# fedr/slurm/ (src)
# fedr/fed1/bin
# fedr/fed1/sbin
# fedr/fed1/etc
# fedr/fed1/...
# fedr/fed2/...
# fedr/fed3/...
# 3. controllers are up and running.
#
# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR
# "FAILURE: ..." otherwise with an explanation of the failure, OR
# anything else indicates a failure mode that must be investigated.
############################################################################
# Copyright (C) 2020 SchedMD LLC.
# Written by Marshall Garey <marshall@schedmd.com>
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
source ./globals_accounting
source ./globals_federation
# Test identity and names derived from it.
set test_id "37.17"
set fed_name "fed_test$test_id"
set file_in_long "test$test_id\_long.in"
set file_in_short "test$test_id\_short.in"

# Short aliases for the three federated cluster names.
set c1 $fedc1
set c2 $fedc2
set c3 $fedc3

# Commands are always issued through the first cluster's binaries.
set my_scancel "${fed_slurm_base}/$c1/bin/scancel"
set my_scontrol "${fed_slurm_base}/$c1/bin/scontrol"

# User whose jobs get cancelled by cancel_all_jobs.
set user_name [get_my_user_name]

# Bookkeeping values.
set exit_code 0
set job_id1 0
set job_id2 0

# Filled in by get_job_dependency on every call.
set reason ""
set dependency ""
###############################################################################
# Functions
###############################################################################
# Cancel every job owned by the test user on all three clusters.
#
# Spawns scancel once with the comma separated cluster list, reaps the
# process when it reaches EOF and then pauses for 5 seconds.
proc cancel_all_jobs { } {
	global user_name c1 c2 c3 my_scancel

	set cluster_list [join [list $c1 $c2 $c3] ","]
	spawn $my_scancel -M$cluster_list -u $user_name
	expect eof {
		wait
	}
	sleep 5
}
# Cancel one job and wait for it to be reported as "DONE".
#
# Arguments:
#   job_id   - job to hand to scancel
#   clusters - cluster (or comma separated clusters) passed through to
#              wait_for_fed_job
#
# Returns: whatever wait_for_fed_job returns.
proc cancel_job { job_id clusters } {
	global my_scancel

	spawn $my_scancel $job_id
	expect eof {
		wait
	}
	wait_for_fed_job $job_id "DONE" $clusters
}
# Tear the test down and terminate the script.
#
# Deletes the test federation, cancels all of the user's jobs, removes both
# generated batch scripts, prints the final verdict and exits.
#
# Arguments:
#   rc - exit status; a value greater than 0 is reported as a failure
proc cleanup { rc } {
	global bin_rm file_in_long file_in_short fed_name test_id

	delete_federations $fed_name
	cancel_all_jobs
	foreach script [list $file_in_long $file_in_short] {
		exec $bin_rm -f $script
	}

	if { $rc <= 0 } {
		print_success $test_id
	} else {
		print_failure $test_id
	}
	exit $rc
}
# Submit a one minute batch job through a given cluster's sbatch.
#
# Arguments:
#   options - extra sbatch options, given as one whitespace separated string
#   cdir    - directory name under fed_slurm_base whose bin/sbatch is used
#   file_in - batch script to submit
#
# Returns: the job id printed by sbatch.
#
# Calls "cleanup 1" (which exits) when sbatch times out or when no
# "Submitted batch job" line was seen.
proc submit_job { options cdir file_in } {
	# Only variables belong here; slow_kill is a command and needs no
	# global declaration.
	global number fed_slurm_base test_id

	set job_id 0
	set my_sbatch "${fed_slurm_base}/$cdir/bin/sbatch"
	set command "$my_sbatch --job-name=test$test_id\_job -t1 \
		$options --output=/dev/null $file_in"

	# The command string is expanded into separate words for spawn.
	set sbatch_pid [spawn {*}$command]
	expect {
		-re "Submitted batch job ($number)" {
			set job_id $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "sbatch not responding"
			slow_kill $sbatch_pid
			cleanup 1
		}
		eof {
			wait
		}
	}

	if { $job_id == 0 } {
		log_error "Failed to submit job\n"
		cleanup 1
	}
	return $job_id
}
# Query scontrol for a job's pending reason and dependency string.
#
# Runs "$my_scontrol show job <job_id>" and parses the
# "Reason=<reason> Dependency=<dependency>" part of its output.
#
# Arguments:
#   job_id - job id handed to "scontrol show job"
#
# Side effects:
#   Resets the globals "reason" and "dependency" to "" and then stores the
#   parsed values in them; both stay "" if the pattern never matches.
#   Calls "cleanup 1" if scontrol times out.
#
# Returns: the parsed dependency string (same value as the global).
proc get_job_dependency { job_id } {
global my_scontrol reason dependency alpha
set reason ""
set dependency ""
set reason_match "\[a-zA-Z_\]+"
# Possible dependency syntax:
# <type>:<jobid>+<time>(state)
# <type>:<jobid>(state)
# singleton(unfulfilled)
# Notes:
# * Multiple dependencies are separated by a comma or a question mark
# * The state for singleton will only ever be "unfulfilled"
# * The state is either failed or unfulfilled. Fulfilled dependencies
# are cleared from the list
# * When there are no dependencies, it will be this string: "(null)"
# This regex takes care of handling dependencies separated by comma or
# question mark.
# ([a-zA-Z_]+:[0-9_*+]+\([a-zA-Z]+\)\?*\,*|singleton\(unfulfilled\)\?*\,*)+|\(null\)
# Earlier single-dependency pattern, kept for reference only:
#set depend_match "\[a-zA-Z_\]+:\[0-9_*+\]+\\($alpha\\)\|\\(null\\)\|singleton\\(unfulfilled\\)"
set type "\[a-zA-Z_\]+"
set jobid_time "\[0-9_*+\]+"
# NOTE(review): $alpha comes from the sourced globals; presumably it matches
# a run of letters - confirm.
# The trailing backslash is deliberate: once $delim is appended it produces
# "\," in the final regex, i.e. an escaped literal comma.
set state "\\($alpha\\)\\"
# Optional separators after one dependency: commas and/or question marks.
set delim ",*\\?*"
# One or more "<type>:<jobid...>(state)" or "singleton(unfulfilled)" items.
set depend_regex "\($type:$jobid_time$state$delim\|singleton\\(unfulfilled\\)$delim\)+"
set no_depend "\\(null\\)"
set depend_match "$depend_regex\|$no_depend"
# Hide the raw scontrol output while it is parsed; it is switched back on
# after the spawned process has ended. The timeout branch goes straight to
# cleanup without switching it back on.
log_user 0
spawn $my_scontrol show job $job_id
expect {
-re "Reason=($reason_match) Dependency=($depend_match)" {
set reason $expect_out(1,string)
set dependency $expect_out(2,string)
exp_continue
}
timeout {
log_error "scontrol not responding"
cleanup 1
}
eof {
wait
}
}
log_user 1
log_info "job $job_id; actual reason: \"$reason\"; dependency: \"$dependency\""
return $dependency
}
# Compare a job's current reason and dependency with the expected values.
#
# Refreshes the globals "reason" and "dependency" through
# get_job_dependency before comparing.
#
# Arguments:
#   job_id              - job to inspect
#   expected_reason     - exact reason string wanted
#   expected_dependency - exact dependency string wanted
#
# Returns: 0 when both strings match exactly, 1 otherwise.
proc check_depend { job_id expected_reason expected_dependency } {
	global reason dependency

	get_job_dependency $job_id
	set mismatch [expr {
		$reason ne $expected_reason ||
		$dependency ne $expected_dependency
	}]
	return $mismatch
}
# Poll a job until its reason and dependency match the expected strings.
#
# Arguments:
#   job_id              - job to poll
#   expected_reason     - reason string to wait for
#   expected_dependency - dependency string to wait for
#
# Returns: 0 once check_depend reports a match.
#
# Calls "cleanup 1" when the job reaches "DependencyNeverSatisfied" although
# that was not the expected reason, or when no match is seen after the
# maximum delay. The failure message lists both actual and expected values.
proc wait_for_depend { job_id expected_reason expected_dependency } {
	global reason dependency

	set never_satisfied "DependencyNeverSatisfied"
	set error 0
	set my_delay 0
	# max_delay 30 seconds because by default we test remote dependencies
	# every 30 seconds, so we might have to wait that long for a result.
	# Make this interval shorter by decreasing MinJobAge in slurm.conf
	# because dependencies also get tested every MinJobAge seconds.
	set max_delay 30
	set poll_interval 3
	set want_never_satisfied [expr { $expected_reason eq $never_satisfied }]

	log_info "job $job_id; expected reason: \"$expected_reason\"; dependency: \"$expected_dependency\""
	while 1 {
		if { ![check_depend $job_id $expected_reason \
				$expected_dependency] } {
			return 0
		}

		# A failed dependency never recovers, so stop waiting right away.
		if { !$want_never_satisfied && $reason eq $never_satisfied } {
			log_error "Job dependency failed, but it shouldn't have."
			set error 1
		}
		if { $my_delay >= $max_delay } {
			log_info "delay $my_delay max $max_delay"
			log_error "Timeout waiting for dependency to change."
			set error 1
		}
		if { $error } {
			log_error "Job $job_id actual: reason=\"$reason\"; dependency=\"$dependency\"; expected: reason=\"$expected_reason\"; dependency=\"$expected_dependency\""
			cleanup 1
		}

		exec sleep $poll_interval
		incr my_delay $poll_interval
	}
}
# Wait for a job to reach a state on one specific cluster.
#
# Delegates to wait_for_fed_job and calls "cleanup 1" when the cluster it
# returns is not the one that was requested.
#
# Arguments:
#   job_id  - job to wait for
#   state   - state name handed to wait_for_fed_job
#   cluster - cluster the job must be reported on
proc my_wait_for_fed_job { job_id state cluster } {
	set ret_cluster [wait_for_fed_job $job_id $state $cluster]
	if { $ret_cluster ne $cluster } {
		cleanup 1
	}
}
# Report whether a job record exists locally on one cluster.
#
# Runs "scontrol -M<cluster> --local -o show job <job_id>" and reads its
# output up to EOF so that the spawned process is always reaped.
#
# Arguments:
#   job_id  - job to look for
#   cluster - cluster to query
#
# Returns: 1 when "JobId=<job_id>" was printed, 0 when scontrol reported
# "Invalid job id specified".
#
# Calls "cleanup 1" when scontrol times out or prints neither of the two
# strings, because the answer cannot be determined in that case.
proc is_job_on_cluster { job_id cluster } {
	global my_scontrol

	# -1 means that neither pattern has been seen yet.
	set found -1

	log_user 0
	spawn $my_scontrol -M$cluster --local -o show job $job_id
	expect {
		-re "JobId=$job_id" {
			set found 1
			exp_continue
		}
		-re "Invalid job id specified" {
			set found 0
			exp_continue
		}
		timeout {
			log_user 1
			log_error "scontrol not responding"
			cleanup 1
		}
		eof {
			wait
		}
	}
	log_user 1

	if { $found == 1 } {
		log_info "Found job $job_id in cluster $cluster"
		return 1
	}
	if { $found == 0 } {
		log_info "Did not find job $job_id in cluster $cluster"
		return 0
	}

	# Always hand a boolean back to callers that use the result in an
	# expression; an undetermined answer is treated as a test failure.
	log_error "Unable to determine whether job $job_id is in cluster $cluster"
	cleanup 1
	return 0
}
# Exercise --depend=after with a local job, a remote job and a time offset.
#
# Each scenario submits a leading job and a job depending on it, then
# checks the reason/dependency strings before and after the dependency is
# cleared. Any failed check ends the test through the helpers.
proc test_after { } {
	global c1 c2 file_in_long bin_sleep

	send_user "
#############################################################################
# Test after
#############################################################################
\n\n"

	# Scenario 1: the leading job is on the same cluster.
	log_info "after: test that local dependency succeeds:"
	set job_id1 [submit_job "-M$c1 --begin=now+5" $c1 $file_in_long]
	set job_id2 [submit_job "--depend=after:$job_id1 -M$c1" $c1 $file_in_long]
	wait_for_depend $job_id2 "Dependency" "after:$job_id1\(unfulfilled\)"
	my_wait_for_fed_job $job_id1 "RUNNING" $c1
	wait_for_depend $job_id2 "None" "(null)"
	my_wait_for_fed_job $job_id2 "RUNNING" $c1
	cancel_job $job_id1 $c1
	cancel_job $job_id2 $c1

	# Scenario 2: the leading job is on the second cluster.
	log_info "after: test that remote dependency succeeds:"
	set job_id1 [submit_job "-M$c2 --begin=now+5" $c2 $file_in_long]
	set job_id2 [submit_job "--depend=after:$job_id1 -M$c1" $c1 $file_in_long]
	wait_for_depend $job_id2 "Dependency" "after:$job_id1\(unfulfilled\)"
	my_wait_for_fed_job $job_id1 "RUNNING" $c2
	wait_for_depend $job_id2 "None" "(null)"
	my_wait_for_fed_job $job_id2 "RUNNING" $c1
	cancel_job $job_id1 $c2
	cancel_job $job_id2 $c1

	# Scenario 3: after with a time attached. file_in_long sleeps for
	# 60 seconds.
	log_info "after: test that a after+time works:"
	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
	set job_id2 [submit_job "--depend=after:$job_id1+1 -M$c1" $c1 $file_in_long]
	my_wait_for_fed_job $job_id1 "RUNNING" $c1
	my_wait_for_fed_job $job_id2 "PENDING" $c1
	wait_for_depend $job_id2 "Dependency" "after:$job_id1+1(unfulfilled)"

	log_info "Check that job $job_id2 is still dependent after 45 seconds"
	exec $bin_sleep 45
	my_wait_for_fed_job $job_id2 "PENDING" $c1
	wait_for_depend $job_id2 "Dependency" "after:$job_id1+1(unfulfilled)"

	log_info "Wait for job $job_id2 dependency to be fulfilled"
	wait_for_depend $job_id2 "None" "(null)"
	my_wait_for_fed_job $job_id2 "RUNNING" $c1
	cancel_job $job_id2 $c1

	# An after dependency never fails, so there is no failure scenario.
}
# Exercise --depend=afterany locally, remotely and with the old
# "--depend=jobid[,jobid]" spelling.
proc test_afterany { } {
	global c1 c2 file_in_long

	send_user "
#############################################################################
# Test afterany
#############################################################################
\n\n"

	# Scenario 1: the leading job is on the same cluster.
	log_info "afterany: test that local dependency succeeds:"
	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
	set job_id2 [submit_job "--depend=afterany:$job_id1 -M$c1" $c1 $file_in_long]
	wait_for_depend $job_id2 "Dependency" "afterany:$job_id1\(unfulfilled\)"
	my_wait_for_fed_job $job_id1 "RUNNING" $c1
	cancel_job $job_id1 $c1
	wait_for_depend $job_id2 "None" "(null)"
	my_wait_for_fed_job $job_id2 "RUNNING" $c1
	cancel_job $job_id2 $c1

	# Scenario 2: the leading job is on the second cluster.
	log_info "afterany: test that remote dependency succeeds:"
	set job_id1 [submit_job "-M$c2" $c2 $file_in_long]
	set job_id2 [submit_job "--depend=afterany:$job_id1 -M$c1" $c1 $file_in_long]
	wait_for_depend $job_id2 "Dependency" "afterany:$job_id1\(unfulfilled\)"
	my_wait_for_fed_job $job_id1 "RUNNING" $c2
	cancel_job $job_id1 $c2
	wait_for_depend $job_id2 "None" "(null)"
	my_wait_for_fed_job $job_id2 "RUNNING" $c1
	cancel_job $job_id2 $c1

	# Scenario 3: bare job ids are expected to show up as afterany.
	log_info "afterany: test old syntax: --depend=jobid\[,jobid,jobid...\]"
	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
	set job_id2 [submit_job "--depend=$job_id1 -M$c1" $c1 $file_in_long]
	set job_id3 [submit_job "--depend=$job_id1,$job_id2 -M$c1" $c1 $file_in_long]
	my_wait_for_fed_job $job_id1 "RUNNING" $c1
	my_wait_for_fed_job $job_id2 "PENDING" $c1
	my_wait_for_fed_job $job_id3 "PENDING" $c1
	wait_for_depend $job_id2 "Dependency" "afterany:$job_id1\(unfulfilled\)"
	wait_for_depend $job_id3 "Dependency" \
		"afterany:$job_id1\(unfulfilled\),afterany:$job_id2\(unfulfilled\)"
	cancel_all_jobs
}
# Exercise --depend=aftercorr between two-task job arrays.
#
# Four scenarios: local success, local failure, remote success and remote
# failure. A failure is provoked by cancelling the tasks of the leading
# array.
proc test_aftercorr { } {
	global c1 c2 kill_invalid_depend file_in_long file_in_short

	send_user "
#############################################################################
# Test aftercorr
#############################################################################
\n\n"

	# Scenario 1: local leading array runs to completion.
	log_info "aftercorr: test that local dependency succeeds:"
	set job_array1 [submit_job "-M$c1 --array=1-2" $c1 $file_in_short]
	set job_array2 [submit_job "--depend=aftercorr:$job_array1 -M$c1 \
		--array=1-2" $c1 $file_in_long]
	my_wait_for_fed_job "$job_array1\_1" "RUNNING" $c1
	my_wait_for_fed_job "$job_array1\_2" "RUNNING" $c1
	my_wait_for_fed_job "$job_array2\_1" "PENDING" $c1
	my_wait_for_fed_job "$job_array2\_2" "PENDING" $c1
	wait_for_depend "$job_array2\_1" "Dependency" \
		"aftercorr:$job_array1\_*(unfulfilled)"
	wait_for_depend "$job_array2\_2" "Dependency" \
		"aftercorr:$job_array1\_*(unfulfilled)"
	my_wait_for_fed_job "$job_array1\_1" "DONE" $c1
	my_wait_for_fed_job "$job_array1\_2" "DONE" $c1
	wait_for_depend "$job_array2\_1" "None" "(null)"
	wait_for_depend "$job_array2\_2" "None" "(null)"
	my_wait_for_fed_job "$job_array2\_1" "RUNNING" $c1
	my_wait_for_fed_job "$job_array2\_2" "RUNNING" $c1
	cancel_job $job_array2 $c1

	# Scenario 2: local leading array tasks are cancelled.
	log_info "aftercorr: test that local dependency fails:"
	set job_array1 [submit_job "-M$c1 --array=1-2" $c1 $file_in_long]
	set job_array2 [submit_job "--depend=aftercorr:$job_array1 -M$c1 \
		--array=1-2" $c1 $file_in_long]
	my_wait_for_fed_job "$job_array1\_1" "RUNNING" $c1
	my_wait_for_fed_job "$job_array1\_2" "RUNNING" $c1
	my_wait_for_fed_job "$job_array2\_1" "PENDING" $c1
	my_wait_for_fed_job "$job_array2\_2" "PENDING" $c1
	wait_for_depend "$job_array2\_1" "Dependency" \
		"aftercorr:$job_array1\_*(unfulfilled)"
	wait_for_depend "$job_array2\_2" "Dependency" \
		"aftercorr:$job_array1\_*(unfulfilled)"
	cancel_job "$job_array1\_1" $c1
	wait_for_depend "$job_array2\_1" "DependencyNeverSatisfied" \
		"aftercorr:$job_array1\_*(failed)"
	cancel_job "$job_array1\_2" $c1
	wait_for_depend "$job_array2\_2" "DependencyNeverSatisfied" \
		"aftercorr:$job_array1\_*(failed)"
	if { !$kill_invalid_depend } {
		cancel_job "$job_array2" $c1
	}

	# Scenario 3: remote leading array runs to completion.
	log_info "aftercorr: test that remote dependency succeeds:"
	set job_array1 [submit_job "-M$c2 --array=1-2" $c2 $file_in_short]
	set job_array2 [submit_job "--depend=aftercorr:$job_array1 -M$c1 \
		--array=1-2" $c1 $file_in_long]
	my_wait_for_fed_job "$job_array1\_1" "RUNNING" $c2
	my_wait_for_fed_job "$job_array1\_2" "RUNNING" $c2
	my_wait_for_fed_job "$job_array2\_1" "PENDING" $c1
	my_wait_for_fed_job "$job_array2\_2" "PENDING" $c1
	# The dependency on the remote side has _*, but the dependency locally
	# doesn't because it couldn't find the remote job.
	wait_for_depend "$job_array2\_1" "Dependency" \
		"aftercorr:$job_array1\(unfulfilled)"
	wait_for_depend "$job_array2\_2" "Dependency" \
		"aftercorr:$job_array1\(unfulfilled)"
	my_wait_for_fed_job "$job_array1\_1" "DONE" $c2
	my_wait_for_fed_job "$job_array1\_2" "DONE" $c2
	wait_for_depend "$job_array2\_1" "None" "(null)"
	wait_for_depend "$job_array2\_2" "None" "(null)"
	my_wait_for_fed_job "$job_array2\_1" "RUNNING" $c1
	my_wait_for_fed_job "$job_array2\_2" "RUNNING" $c1
	cancel_job $job_array2 $c1

	# Scenario 4: remote leading array tasks are cancelled.
	log_info "aftercorr: test that remote dependency fails:"
	set job_array1 [submit_job "-M$c2 --array=1-2" $c2 $file_in_long]
	set job_array2 [submit_job "--depend=aftercorr:$job_array1 -M$c1 \
		--array=1-2" $c1 $file_in_long]
	my_wait_for_fed_job "$job_array1\_1" "RUNNING" $c2
	my_wait_for_fed_job "$job_array1\_2" "RUNNING" $c2
	my_wait_for_fed_job "$job_array2\_1" "PENDING" $c1
	my_wait_for_fed_job "$job_array2\_2" "PENDING" $c1
	wait_for_depend "$job_array2\_1" "Dependency" \
		"aftercorr:$job_array1\(unfulfilled)"
	wait_for_depend "$job_array2\_2" "Dependency" \
		"aftercorr:$job_array1\(unfulfilled)"
	cancel_job "$job_array1\_1" $c2
	wait_for_depend "$job_array2\_1" "DependencyNeverSatisfied" \
		"aftercorr:$job_array1\(failed)"
	cancel_job "$job_array1\_2" $c2
	wait_for_depend "$job_array2\_2" "DependencyNeverSatisfied" \
		"aftercorr:$job_array1\(failed)"
	if { !$kill_invalid_depend } {
		cancel_job "$job_array2" $c1
	}
}
# Exercise --depend=afterok: local/remote success and local/remote failure.
#
# Success uses the short script so the leading job finishes on its own;
# failure is provoked by cancelling a long-running leading job.
proc test_afterok { } {
	global c1 c2 kill_invalid_depend file_in_long file_in_short

	send_user "
#############################################################################
# Test afterok
#############################################################################
\n\n"

	# Scenario 1: local leading job completes.
	log_info "afterok: test that local dependency succeeds:"
	set job_id1 [submit_job "-M$c1" $c1 $file_in_short]
	set job_id2 [submit_job "--depend=afterok:$job_id1 -M$c1" $c1 $file_in_long]
	my_wait_for_fed_job $job_id1 "RUNNING" $c1
	my_wait_for_fed_job $job_id2 "PENDING" $c1
	wait_for_depend $job_id2 "Dependency" "afterok:$job_id1\(unfulfilled\)"
	my_wait_for_fed_job $job_id1 "DONE" $c1
	wait_for_depend $job_id2 "None" "(null)"
	my_wait_for_fed_job $job_id2 "RUNNING" $c1
	cancel_job $job_id2 $c1

	# Scenario 2: local leading job is cancelled.
	log_info "afterok: test that local dependency fails:"
	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
	set job_id2 [submit_job "--depend=afterok:$job_id1 -M$c1" $c1 $file_in_long]
	my_wait_for_fed_job $job_id1 "RUNNING" $c1
	my_wait_for_fed_job $job_id2 "PENDING" $c1
	wait_for_depend $job_id2 "Dependency" "afterok:$job_id1\(unfulfilled\)"
	cancel_job $job_id1 $c1
	wait_for_depend $job_id2 "DependencyNeverSatisfied" \
		"afterok:$job_id1\(failed\)"
	if { !$kill_invalid_depend } {
		cancel_job $job_id2 $c1
	}

	# Scenario 3: remote leading job completes.
	log_info "afterok: test that remote dependency succeeds:"
	set job_id1 [submit_job "-M$c2" $c2 $file_in_short]
	set job_id2 [submit_job "--depend=afterok:$job_id1 -M$c1" $c1 $file_in_long]
	my_wait_for_fed_job $job_id1 "RUNNING" $c2
	my_wait_for_fed_job $job_id2 "PENDING" $c1
	wait_for_depend $job_id2 "Dependency" "afterok:$job_id1\(unfulfilled\)"
	my_wait_for_fed_job $job_id1 "DONE" $c2
	wait_for_depend $job_id2 "None" "(null)"
	my_wait_for_fed_job $job_id2 "RUNNING" $c1
	cancel_job $job_id2 $c1

	# Scenario 4: remote leading job is cancelled.
	log_info "afterok: test that remote dependency fails"
	set job_id1 [submit_job "-M$c2" $c2 $file_in_long]
	set job_id2 [submit_job "--depend=afterok:$job_id1 -M$c1" $c1 $file_in_long]
	my_wait_for_fed_job $job_id1 "RUNNING" $c2
	my_wait_for_fed_job $job_id2 "PENDING" $c1
	wait_for_depend $job_id2 "Dependency" "afterok:$job_id1\(unfulfilled\)"
	cancel_job $job_id1 $c2
	wait_for_depend $job_id2 "DependencyNeverSatisfied" \
		"afterok:$job_id1\(failed\)"
	if { !$kill_invalid_depend } {
		cancel_job $job_id2 $c1
	}
}
# Exercise --depend=afternotok: local/remote success and local/remote
# failure.
#
# Here cancelling the leading job clears the dependency, while letting the
# short script finish on its own makes the dependency fail.
proc test_afternotok { } {
	global c1 c2 kill_invalid_depend file_in_long file_in_short

	send_user "
#############################################################################
# Test afternotok
#############################################################################
\n\n"

	# Scenario 1: local leading job is cancelled.
	log_info "afternotok: test that local dependency succeeds:"
	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
	set job_id2 [submit_job "--depend=afternotok:$job_id1 -M$c1" $c1 $file_in_long]
	my_wait_for_fed_job $job_id1 "RUNNING" $c1
	my_wait_for_fed_job $job_id2 "PENDING" $c1
	wait_for_depend $job_id2 "Dependency" \
		"afternotok:$job_id1\(unfulfilled\)"
	cancel_job $job_id1 $c1
	wait_for_depend $job_id2 "None" "(null)"
	my_wait_for_fed_job $job_id2 "RUNNING" $c1
	cancel_job $job_id2 $c1

	# Scenario 2: local leading job completes.
	log_info "afternotok: test that local dependency fails:"
	set job_id1 [submit_job "-M$c1" $c1 $file_in_short]
	set job_id2 [submit_job "--depend=afternotok:$job_id1 -M$c1" $c1 $file_in_long]
	my_wait_for_fed_job $job_id1 "RUNNING" $c1
	my_wait_for_fed_job $job_id2 "PENDING" $c1
	wait_for_depend $job_id2 "Dependency" \
		"afternotok:$job_id1\(unfulfilled\)"
	my_wait_for_fed_job $job_id1 "DONE" $c1
	wait_for_depend $job_id2 "DependencyNeverSatisfied" \
		"afternotok:$job_id1\(failed\)"
	if { !$kill_invalid_depend } {
		cancel_job $job_id2 $c1
	}

	# Scenario 3: remote leading job is cancelled.
	log_info "afternotok: test that remote dependency succeeds:"
	set job_id1 [submit_job "-M$c2" $c2 $file_in_long]
	set job_id2 [submit_job "--depend=afternotok:$job_id1 -M$c1" $c1 $file_in_long]
	my_wait_for_fed_job $job_id1 "RUNNING" $c2
	my_wait_for_fed_job $job_id2 "PENDING" $c1
	wait_for_depend $job_id2 "Dependency" \
		"afternotok:$job_id1\(unfulfilled\)"
	cancel_job $job_id1 $c2
	wait_for_depend $job_id2 "None" "(null)"
	my_wait_for_fed_job $job_id2 "RUNNING" $c1
	cancel_job $job_id2 $c1

	# Scenario 4: remote leading job completes.
	log_info "afternotok: test that remote dependency fails"
	set job_id1 [submit_job "-M$c2" $c2 $file_in_short]
	set job_id2 [submit_job "--depend=afternotok:$job_id1 -M$c1" $c1 $file_in_long]
	my_wait_for_fed_job $job_id1 "RUNNING" $c2
	my_wait_for_fed_job $job_id2 "PENDING" $c1
	wait_for_depend $job_id2 "Dependency" \
		"afternotok:$job_id1\(unfulfilled\)"
	my_wait_for_fed_job $job_id1 "DONE" $c2
	wait_for_depend $job_id2 "DependencyNeverSatisfied" \
		"afternotok:$job_id1\(failed\)"
	if { !$kill_invalid_depend } {
		cancel_job $job_id2 $c1
	}
}
# Exercise --depend=singleton on one cluster and across clusters.
#
# The multi-cluster part depends on the disable_remote_singleton setting:
# when set, jobs on other clusters must not hold the singleton job back;
# otherwise the singleton job waits for the jobs on every cluster.
proc test_singleton { } {
	global c1 c2 c3 disable_remote_singleton file_in_long

	send_user "
#############################################################################
# Test singleton
#############################################################################
\n\n"

	# Single cluster: the singleton job starts once the first job is gone.
	log_info "singleton: test on one cluster"
	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
	set job_id2 [submit_job "--depend=singleton -M$c1" $c1 $file_in_long]
	my_wait_for_fed_job $job_id1 "RUNNING" $c1
	my_wait_for_fed_job $job_id2 "PENDING" $c1
	wait_for_depend $job_id2 "Dependency" "singleton(unfulfilled)"
	cancel_job $job_id1 $c1
	wait_for_depend $job_id2 "None" "(null)"
	my_wait_for_fed_job $job_id2 "RUNNING" $c1
	cancel_job $job_id2 $c1

	# Multiple clusters
	if { $disable_remote_singleton } {
		# Remote jobs must not affect the singleton dependency.
		log_info "singleton: test that disable_remote_singleton works"
		set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
		set job_id2 [submit_job "-M$c2" $c2 $file_in_long]
		set job_id3 [submit_job "--depend=singleton -M$c1" $c1 $file_in_long]
		my_wait_for_fed_job $job_id1 "RUNNING" $c1
		my_wait_for_fed_job $job_id2 "RUNNING" $c2
		my_wait_for_fed_job $job_id3 "PENDING" $c1
		wait_for_depend $job_id3 "Dependency" "singleton(unfulfilled)"

		# Cancel job 1 - job 3 should start running even though job 2
		# is running on another cluster
		cancel_job $job_id1 $c1
		wait_for_depend $job_id3 "None" "(null)"
		my_wait_for_fed_job $job_id3 "RUNNING" $c1
		cancel_job $job_id2 $c2
		cancel_job $job_id3 $c1
	} else {
		# The singleton must not be cleared until the jobs on all
		# clusters are done.
		log_info "singleton: test with jobs on all clusters"
		set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
		set job_id2 [submit_job "-M$c2" $c2 $file_in_long]
		set job_id3 [submit_job "-M$c3" $c3 $file_in_long]
		set job_id4 [submit_job "--depend=singleton -M$c1" $c1 $file_in_long]
		my_wait_for_fed_job $job_id1 "RUNNING" $c1
		my_wait_for_fed_job $job_id2 "RUNNING" $c2
		my_wait_for_fed_job $job_id3 "RUNNING" $c3
		my_wait_for_fed_job $job_id4 "PENDING" $c1
		wait_for_depend $job_id4 "Dependency" "singleton(unfulfilled)"

		# Job 4 shouldn't start until jobs 1, 2, and 3 are done.
		# Test that it starts when a remote job is finished last.
		cancel_job $job_id1 $c1
		# Should still have the same dependency
		wait_for_depend $job_id4 "Dependency" "singleton(unfulfilled)"
		cancel_job $job_id2 $c2
		cancel_job $job_id3 $c3
		# Now the dependency should be cleared
		wait_for_depend $job_id4 "None" "(null)"
		my_wait_for_fed_job $job_id4 "RUNNING" $c1
		cancel_job $job_id4 $c1
	}
}
# Exercise dependencies while a cluster leaves and rejoins the federation.
#
# Removing a cluster from a federation should cause dependencies on jobs on
# that cluster to fail. Adding a cluster to a federation means that any
# singleton dependencies have to be fulfilled on that cluster.
#
# The third cluster is always added back to the federation before this
# procedure returns, so that later scenarios which submit to all three
# clusters find the federation complete.
proc test_add_remove_clusters { } {
	global c1 c2 c3 fed_name file_in_long \
		disable_remote_singleton kill_invalid_depend

	send_user "
#############################################################################
# Test adding/removing a cluster from the federation.
#############################################################################
\n\n"

	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
	set job_id2 [submit_job "-M$c2" $c2 $file_in_long]
	set job_id3 [submit_job "-M$c3" $c3 $file_in_long]
	set job_id4 [submit_job "--depend=afterok:$job_id3 -M$c1" $c1 \
		$file_in_long]
	set job_id5 [submit_job "--depend=singleton -M$c1" $c1 $file_in_long]
	my_wait_for_fed_job $job_id1 "RUNNING" $c1
	my_wait_for_fed_job $job_id2 "RUNNING" $c2
	my_wait_for_fed_job $job_id3 "RUNNING" $c3
	my_wait_for_fed_job $job_id4 "PENDING" $c1
	wait_for_depend $job_id4 "Dependency" "afterok:$job_id3\(unfulfilled\)"
	wait_for_depend $job_id5 "Dependency" "singleton(unfulfilled)"

	log_info "Test that removing cluster $c3 from fed $fed_name makes dependencies on jobs on $c3 fail"
	if { [remove_cluster_from_fed $c3 $fed_name] } {
		cleanup 1
	}
	wait_for_depend $job_id4 "DependencyNeverSatisfied" \
		"afterok:$job_id3\(failed\)"
	if { !$kill_invalid_depend } {
		cancel_job $job_id4 $c1
	}

	if { $disable_remote_singleton } {
		# Nothing more to verify for the singleton job, but put the
		# cluster back before leaving so the federation is complete
		# again for whatever runs next.
		if { [add_cluster_to_fed $c3 $fed_name] } {
			cleanup 1
		}
		cancel_job $job_id1 $c1
		cancel_job $job_id2 $c2
		cancel_job $job_id3 $c3
		cancel_job $job_id5 $c1
		return
	}

	log_info "Test that the singleton dependency was resent back to cluster $c3 when it was added back to the federation."
	if { [add_cluster_to_fed $c3 $fed_name] } {
		cleanup 1
	}
	cancel_job $job_id1 $c1
	cancel_job $job_id2 $c2
	cancel_job $job_id3 $c3
	wait_for_depend $job_id5 "None" "(null)"
	my_wait_for_fed_job $job_id5 "RUNNING" $c1
	cancel_job $job_id5 $c1
}
# Check where a dependent job lives when it is submitted to all clusters.
#
# While the dependency is unfulfilled the job must not be found on the
# second or third cluster. After the leading job is cancelled the job is
# expected to be pending, with reason "BeginTime", on all three clusters.
proc test_submit_to_all_clusters { } {
	global c1 c2 c3 file_in_long

	send_user "
#############################################################################
# Test submitting a dependent job to all clusters.
#############################################################################
\n\n"

	log_info "Test that a dependent job is only on its origin cluster while dependent and that it is submitted to all clusters when its dependency is cleared."
	set job_id1 [submit_job "-M$c2" $c2 $file_in_long]
	set job_id2 [submit_job "--depend=afternotok:$job_id1 -M$c1,$c2,$c3 \
		--begin=now+60" $c1 $file_in_long]
	my_wait_for_fed_job $job_id1 "RUNNING" $c2
	my_wait_for_fed_job $job_id2 "PENDING" $c1
	wait_for_depend $job_id2 "Dependency" \
		"afternotok:$job_id1\(unfulfilled\)"

	log_info "Test that job $job_id2 is not on clusters $c2 or $c3."
	if { [is_job_on_cluster $job_id2 $c2] ||
	     [is_job_on_cluster $job_id2 $c3] } {
		log_error "Job $job_id2 is in cluster $c2 and/or $c3 when it shouldn't be."
		cleanup 1
	}

	log_info "Test that job $job_id2 is submitted to all sibling clusters $c2 and $c3 when its dependency is fulfilled."
	cancel_job $job_id1 $c2
	wait_for_depend $job_id2 "BeginTime" "(null)"
	my_wait_for_fed_job $job_id2 "PENDING" "$c1"
	my_wait_for_fed_job $job_id2 "PENDING" "$c2"
	my_wait_for_fed_job $job_id2 "PENDING" "$c3"
	cancel_job $job_id2 "$c1,$c2,$c3"
}
# Exercise OR ("?") dependency lists that mix a local and a remote job.
#
# First part: one fulfilled member clears the whole dependency.
# Second part: the dependency only fails once every member has failed.
proc test_or_dependencies { } {
	global c1 c2 file_in_long kill_invalid_depend

	send_user "
#############################################################################
# Test OR dependencies.
#############################################################################
\n\n"

	log_info "OR dependencies: Test that one fulfilled dependency makes the whole dependency fulfilled:"
	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
	set job_id2 [submit_job "-M$c2" $c2 $file_in_long]
	set job_id3 [submit_job \
		"--depend=afternotok:$job_id1?afternotok:$job_id2 -M$c1" \
		$c1 $file_in_long]
	my_wait_for_fed_job $job_id1 "RUNNING" $c1
	my_wait_for_fed_job $job_id2 "RUNNING" $c2
	my_wait_for_fed_job $job_id3 "PENDING" $c1
	wait_for_depend $job_id3 "Dependency" \
		"afternotok:$job_id1\(unfulfilled\)?afternotok:$job_id2\(unfulfilled\)"
	cancel_job $job_id2 $c2
	wait_for_depend $job_id3 "None" "(null)"
	my_wait_for_fed_job $job_id3 "RUNNING" $c1
	cancel_job $job_id1 $c1
	cancel_job $job_id3 $c1

	log_info "OR dependencies: Test that the dependency doesn't fail until all dependencies have failed:"
	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
	set job_id2 [submit_job "-M$c2" $c2 $file_in_long]
	set job_id3 [submit_job "--depend=afterok:$job_id1?afterok:$job_id2 \
		-M$c1" $c1 $file_in_long]
	my_wait_for_fed_job $job_id1 "RUNNING" $c1
	my_wait_for_fed_job $job_id2 "RUNNING" $c2
	my_wait_for_fed_job $job_id3 "PENDING" $c1
	wait_for_depend $job_id3 "Dependency" \
		"afterok:$job_id1\(unfulfilled\)?afterok:$job_id2\(unfulfilled\)"
	cancel_job $job_id1 $c1
	wait_for_depend $job_id3 "Dependency" \
		"afterok:$job_id1\(failed\)?afterok:$job_id2\(unfulfilled\)"
	cancel_job $job_id2 $c2
	wait_for_depend $job_id3 "DependencyNeverSatisfied" \
		"afterok:$job_id1\(failed\)?afterok:$job_id2\(failed\)"
	if { !$kill_invalid_depend } {
		cancel_job $job_id3 $c1
	}
}
# Exercise AND (",") dependency lists that mix a local and a remote job.
#
# First part: the dependency is only cleared after every member is
# fulfilled. Second part: one failed member fails the whole dependency.
proc test_and_dependencies { } {
	global c1 c2 file_in_long kill_invalid_depend

	send_user "
#############################################################################
# Test AND dependencies.
#############################################################################
\n\n"

	log_info "AND dependencies: Test that the dependency isn't fulfilled until all dependencies are fulfilled:"
	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
	set job_id2 [submit_job "-M$c2" $c2 $file_in_long]
	set job_id3 [submit_job \
		"--depend=afternotok:$job_id1,afternotok:$job_id2 -M$c1" \
		$c1 $file_in_long]
	my_wait_for_fed_job $job_id1 "RUNNING" $c1
	my_wait_for_fed_job $job_id2 "RUNNING" $c2
	my_wait_for_fed_job $job_id3 "PENDING" $c1
	wait_for_depend $job_id3 "Dependency" \
		"afternotok:$job_id1\(unfulfilled\),afternotok:$job_id2\(unfulfilled\)"
	cancel_job $job_id1 $c1
	wait_for_depend $job_id3 "Dependency" \
		"afternotok:$job_id2\(unfulfilled\)"
	cancel_job $job_id2 $c2
	wait_for_depend $job_id3 "None" "(null)"
	my_wait_for_fed_job $job_id3 "RUNNING" $c1
	cancel_job $job_id3 $c1

	log_info "AND dependencies: Test that the whole dependency fails when a single dependency fails:"
	set job_id1 [submit_job "-M$c1" $c1 $file_in_long]
	set job_id2 [submit_job "-M$c2" $c2 $file_in_long]
	set job_id3 [submit_job "--depend=afterok:$job_id1,afterok:$job_id2 \
		-M$c1" $c1 $file_in_long]
	my_wait_for_fed_job $job_id1 "RUNNING" $c1
	my_wait_for_fed_job $job_id2 "RUNNING" $c2
	my_wait_for_fed_job $job_id3 "PENDING" $c1
	wait_for_depend $job_id3 "Dependency" \
		"afterok:$job_id1\(unfulfilled\),afterok:$job_id2\(unfulfilled\)"
	cancel_job $job_id2 $c2
	wait_for_depend $job_id3 "DependencyNeverSatisfied" \
		"afterok:$job_id1\(unfulfilled\),afterok:$job_id2\(failed\)"
	if { !$kill_invalid_depend } {
		cancel_job $job_id3 $c1
	}
	cancel_job $job_id1 $c1
}
###############################################################################
# Begin test
###############################################################################
print_header $test_id

# Prerequisites: the test exits with status 0 when the environment cannot
# support it.
if { [test_account_storage] == 0 } {
	log_warn "This test can't be run without a usable AccountStorageType"
	exit 0
}
if { [test_federation_setup] } {
	log_warn "WARNING: This test can't be run without fed_slurm_base, fedc1, fedc2, fedc3 setup in globals.local."
	exit 0
}
if { [test_all_up] } {
	exit 0
}

# Start from a freshly created federation.
delete_federations $fed_name
if { [setup_federation $fed_name] } {
	cleanup 1
}

# Use file_in_short when we have to wait for the job to end.
# Use file_in_long everywhere else.
make_bash_script $file_in_long "$bin_sleep 60"
make_bash_script $file_in_short "$bin_sleep 5"

# Configuration switches consulted by the scenarios.
set permit_job_expansion [test_scheduler_params "permit_job_expansion"]
log_info "permit_job_expansion: $permit_job_expansion"
set kill_invalid_depend [test_dependency_params "kill_invalid_depend"]
set disable_remote_singleton [test_dependency_params "disable_remote_singleton"]
log_info "kill_invalid_depend: $kill_invalid_depend; disable_remote_singleton: $disable_remote_singleton\n"

cancel_all_jobs

# Scenarios run in this order.
# --depend=afterburstbuffer is tested in test35.6
# test --depend=expand in another test.
foreach scenario {
	test_after
	test_afterany
	test_aftercorr
	test_afterok
	test_afternotok
	test_singleton
	test_add_remove_clusters
	test_submit_to_all_clusters
	test_or_dependencies
	test_and_dependencies
} {
	$scenario
}

cleanup 0
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment