diff --git a/testsuite/expect/globals b/testsuite/expect/globals index 65c8a08adc00777e5bb4efa3b45407100badeced..b9a95e7bd802620aeab9d3a11a40b143f368c125 100755 --- a/testsuite/expect/globals +++ b/testsuite/expect/globals @@ -1182,16 +1182,19 @@ proc run_command_status args { proc cancel_job { job_id_list {het_job 0}} { global scancel + set job_list_clean [list] foreach job_id $job_id_list { - if {$job_id == 0} { - continue + if {$job_id != 0} { + lappend job_list_clean $job_id } + } - log_debug "Cancelling $job_id" - if {[run_command_status "$scancel -Q $job_id"]} { - log_warn "scancel command returned error" - return $::RETURN_ERROR - } + log_debug "Cancelling $job_list_clean" + if {[run_command_status "$scancel -Q $job_list_clean"]} { + log_warn "scancel command returned error" + return $::RETURN_ERROR + } + foreach job_id $job_list_clean { if {[wait_for_job $job_id "DONE" $het_job]} { log_warn "Job $job_id not ended" return $::RETURN_ERROR @@ -1485,7 +1488,7 @@ proc kill_srun { } { # -fail # abort the test with failure if the condition is not met # -timeout <float_number> -# time in seconds to wait for the command to complete before +# time in seconds to wait for the condition to be met before # timing out (default is 60.0) # -pollinterval <float_number> # time in seconds between each loop execution and condition check @@ -1563,120 +1566,149 @@ proc wait_for args { ################################################################ # # NAME -# wait_for_command - waits for command output to match a pattern +# wait_for_command - repeat a command until it is successful or meets a specified condition # # SYNOPSIS -# wait_for_command ?options? command args regex ?matches_in? ?or_more? ?matches_out? +# wait_for_command ?options? command ?condition? # # DESCRIPTION -# Executes a command every poll interval until a regex -# pattern is matched in the output, or until timeout. +# A command is repeated until it meets a condition or a timeout is reached. +# If a condition is not specified, the command will be repeated until it +# is successful (the exit code is zero). # # OPTIONS -# -timeout <integer_number> -# time in seconds to wait for the command to complete before -# timing out (default is 120) -# -pollinterval <integer_number> -# time in seconds between each loop execution and condition check -# (default is 1) +# -fail +# abort the test with failure if the condition is not met by +# the timeout +# -timeout <float_number> +# time in seconds to wait for the condition to be met before +# timing out (default is 60.0) +# -pollinterval <float_number> +# time in seconds between each loop execution and condition +# check (default is 1.0) # # ARGUMENTS # command -# The command to run via spawn. -# args -# The arguments to the command, as a single string. -# regex -# The regex pattern to search for in the command -# output. Can be a simple string. -# matches_in -# The number of times to match the regex. Defaults to 1. -# or_more -# If 1, allow for matching the regex pattern match_cnt *or more* -# times, instead of exactly match_cnt times. Defaults to 0. -# matches_out -# The upvar (a variable name to "pass by reference" in TCL) -# to set/return the number of matches found. -# Useful if or_more is 1 and the caller wants the matches found. +# a string containing the command and arguments to execute +# condition +# The boolean expression to test. For each command invocation, +# the result variable will be set to the dictionary returned +# from run_command. +# The condition expression will normally involve a comparison +# with one or more values of this dictionary. If a condition is +# not specified, this condition will be used: +# { [dict get $result exit_code] == 0 } # # RETURN VALUE -# RETURN_SUCCESS on success and RETURN_TIMEOUT on timeout. -# On failure, an error is logged to the output. -# If matches_out is specified, the number of matches will -# be returned via the reference/upvar matches_out. +# RETURN_SUCCESS if the condition is met before the timeout occurs, +# RETURN_TIMEOUT if the timeout occurs before the condition is met # ################################################################ proc wait_for_command args { - global bin_sleep - - set timeout 120 + set fatal false + set timeout 60 set poll_interval 1 - set matches_in 1 - set or_more 0 - set matches_out "" while {[llength $args]} { switch -glob -- [lindex $args 0] { + -fail {set fatal true; set args [lrange $args 1 end]} -time* {set args [lassign $args - timeout]} -poll* {set args [lassign $args - poll_interval]} -* {fail "Unknown option: [lindex $args 0]"} default break } } + set argument_count [llength $args] - if {$argument_count < 3} { + if {$argument_count < 1} { fail "Too few arguments ($argument_count): $args" - } else { - lassign $args command arguments regex - } - if {$argument_count >= 4} { set matches_in [lindex $args 3] } - if {$argument_count >= 5} { set or_more [lindex $args 4] } - if {$argument_count == 6} { set matches_out [lindex $args 5] } - if {$argument_count > 6} { + } elseif {$argument_count > 2} { fail "Too many arguments ($argument_count): $args" } - - if {$matches_out != ""} { - upvar $matches_out matches + lassign $args command + if {$argument_count == 2} { + set condition [lindex $args 1] + } else { + set condition { [dict get $result exit_code] == 0 } } - set delay 0 - while {$delay < $timeout} { - set matches 0 - # `{*}` breaks apart a string into individual pieces - spawn $command {*}$arguments - expect { - -re $regex { - incr matches - exp_continue - } - timeout { - log_error "$command not responding after $delay seconds polling" - return $::RETURN_TIMEOUT - } - eof { - wait - } - } + set start_time [format "%.3f" [expr [clock milliseconds] / 1000.000]] + + log_debug "Waiting for $condition starting at [clock format [expr int($start_time)] -format %Y-%m-%dT%X].[lindex [split $start_time '.'] 1]" - if {($matches == $matches_in) || ($or_more == 1 && - $matches >= $matches_in)} { + while {1} { + # Run command + set result [run_command $command] + + # Check condition + if {[eval expr [format "{%s}" $condition]]} { + set now [format "%.3f" [expr [clock milliseconds] / 1000.000]] + log_debug "Condition ($condition) was met" return $::RETURN_SUCCESS + } else { + log_trace "Condition ($condition) was not met" } - log_debug "Polled $matches matches of '$regex', but expecting $matches_in" - exec $bin_sleep $poll_interval - incr delay $poll_interval + # Sleep poll interval + log_trace "Sleeping for $poll_interval seconds" + after [expr {int($poll_interval * 1000)}] + # Check if we have surpassed our timeout + set now [format "%.3f" [expr [clock milliseconds] / 1000.000]] + log_trace "Checking whether the current time ([clock format [expr int($now)] -format %Y-%m-%dT%X].[lindex [split $now '.'] 1]) is greater than the start time plus the timeout ([clock format [expr int($start_time + $timeout)] -format %Y-%m-%dT%X].[lindex [split [expr $start_time + $timeout] '.'] 1])" + if {$now > $start_time + $timeout} { + set message "Condition ($condition) did not occur before timeout ($timeout) seconds" + if {$fatal} { + fail $message + } else { + log_warn $message + return $::RETURN_TIMEOUT + } + } } - if {$or_more == 1} { - set match_str "$matches_in or more times" - } elseif {$matches_in == 1} { - set match_str "exactly $matches_in time" - } else { - set match_str "exactly $matches_in times" - } - log_error "Failed to match regex `$regex` $match_str after $timeout seconds for command `$command $arguments`." - return $::RETURN_TIMEOUT +} + + +################################################################ +# +# NAME +# wait_for_command_match - repeat a command until its output matches the specified pattern +# +# SYNOPSIS +# wait_for_command_match ?options? command pattern +# +# DESCRIPTION +# A command is repeated until its output matches the specified pattern +# +# OPTIONS +# -fail +# abort the test with failure if output does not match the pattern +# by the timeout +# -timeout <float_number> +# time in seconds to wait for the pattern to be matched before +# timing out (default is 60.0) +# -pollinterval <float_number> +# time in seconds between each loop execution and match check +# (default is 1.0) +# +# ARGUMENTS +# command +# a string containing the command and arguments to execute +# pattern +# The regular expression to match against the command output +# +# RETURN VALUE +# RETURN_SUCCESS if the pattern is matched before the timeout occurs, +# RETURN_TIMEOUT if the timeout occurs before the pattern is matched +# +################################################################ + +proc wait_for_command_match args { + + set pattern [lindex $args end] + set args [lrange $args 0 end-1] + + return [wait_for_command {*}$args "\[regexp {$pattern} \[dict get \$result output\]\] == 1"] } diff --git a/testsuite/expect/test12.3 b/testsuite/expect/test12.3 index 44637e2b3a9675405ef3f7b13e4d3760734202be..47dcae6c859f35fee9640374205d8c2f20455bc0 100755 --- a/testsuite/expect/test12.3 +++ b/testsuite/expect/test12.3 @@ -211,14 +211,10 @@ if {$job_2_node_inx != 2} { # # Displaying start and end times to troubleshoot bug 9681 # -if [wait_for_command $sacct "-A 'slurm_test-account.1' --name=$job_name_1 - -X -n -P -j $job_id_1 -o jobid,jobname,start,end" \ - "$job_id_1.$job_name_1"] { +if [wait_for_command_match "$sacct -A 'slurm_test-account.1' --name=$job_name_1 -X -n -P -j $job_id_1 -o jobid,jobname,start,end" "$job_id_1.$job_name_1"] { fail "Accounting data for job 1 ($job_id_1) not propogated to slurmdbd" } -if [wait_for_command $sacct "-A 'slurm_test-account.1' --name=$job_name_2 - -X -n -P -j $job_id_2 -o jobid,jobname,start,end" \ - "$job_id_2.$job_name_2"] { +if [wait_for_command_match "$sacct -A 'slurm_test-account.1' --name=$job_name_2 -X -n -P -j $job_id_2 -o jobid,jobname,start,end" "$job_id_2.$job_name_2"] { fail "Accounting data for job 2 ($job_id_2) not propogated to slurmdbd" } diff --git a/testsuite/expect/test20.15 b/testsuite/expect/test20.15 index a986aba424b42bef16be53cffd5711d2500eab44..ad23ed1f08999415761f5cceda7fd4185242ead1 100755 --- a/testsuite/expect/test20.15 +++ b/testsuite/expect/test20.15 @@ -34,20 +34,20 @@ set out [run_command_output -fail -xfail "$seff -h"] subtest {[regexp "Usage:" $out]} "seff -h should return Usage:" set job_id [submit_job -fail "-H --wrap=\"sleep 10\" -o /dev/null -e /dev/null"] -wait_for_command "$sacct" "-n -j $job_id" "PENDING" +wait_for_command_match "$sacct -n -j $job_id" "PENDING" set out [run_command_output -fail "$seff $job_id" ] subtest {[regexp "State: PENDING" $out]} "Job should be returned as in PENDING state by seff" subtest {[regexp "Cores: 1" $out]} "Job should be reported as 1 core" subtest {[regexp "Efficiency not available for jobs in the PENDING state." $out]} "Job should be reported as efficiency not available for PENDING state" run_command -fail "$scontrol release $job_id" -wait_for_command "$sacct" "-X -n -j $job_id" "RUNNING" +wait_for_command_match "$sacct -X -n -j $job_id" "RUNNING" set out [run_command_output -fail "$seff $job_id"] subtest {[regexp "State: RUNNING" $out]} "Job should be reported as RUNNING by seff" subtest {[regexp "WARNING: Efficiency statistics may be misleading for RUNNING jobs." $out]} "seff output should contain warning about staistics for RUNNING job" -wait_for_command "$sacct" "-X -n -j $job_id" "COMPLETED" +wait_for_command_match "$sacct -X -n -j $job_id" "COMPLETED" set out [run_command_output -fail "$seff $job_id"] subtest {[regexp "State: COMPLETED" $out]} "Job should be reported as COMPLETED by seff" subtest {[regexp "Cores: \(\\d+\)" $out {} cores] || \ diff --git a/testsuite/expect/test24.5 b/testsuite/expect/test24.5 index 0903c1fea9145dda25568d150109ab8c8485e139..0d57af8496d6433ab291b12ab4af8154d37fab55 100755 --- a/testsuite/expect/test24.5 +++ b/testsuite/expect/test24.5 @@ -88,7 +88,8 @@ proc sub_job { cpu_cnt state } { proc sprio_args { args regex match_cnt } { global sprio - if {[wait_for_command $sprio $args $regex $match_cnt]} { + + if [wait_for_command "$sprio $args" "\[regexp -all {$regex} \[dict get \$result output\]\] == $match_cnt"] { fail "sprio did not match" } } diff --git a/testsuite/expect/test31.3 b/testsuite/expect/test31.3 index 6dcfdcc0eb40d5d718c8637f8f0b473536cd373a..c16c64f74d6783fd730e0940d12f1a09a905171c 100755 --- a/testsuite/expect/test31.3 +++ b/testsuite/expect/test31.3 @@ -47,7 +47,7 @@ proc cleanup { } { proc check_prolog { pattern } { global prolog_out exit_code bin_grep - set rc [wait_for_command -timeout 5 "$bin_grep" "$pattern $prolog_out" $pattern] + set rc [wait_for_command_match -timeout 5 "$bin_grep $pattern $prolog_out" $pattern] if { $rc == 0 } { log_info "Prolog: Found '$pattern'" } else { @@ -59,7 +59,7 @@ proc check_prolog { pattern } { proc check_epilog { pattern } { global epilog_out exit_code bin_grep - set rc [wait_for_command -timeout 5 "$bin_grep" "$pattern $epilog_out" $pattern] + set rc [wait_for_command_match -timeout 5 "$bin_grep $pattern $epilog_out" $pattern] if { $rc == 0 } { log_info "Epilog: Found '$pattern'" } else { diff --git a/testsuite/expect/test9.8 b/testsuite/expect/test9.8 index eef9bc7987096e2bc238599a3f56d7ce79754f9f..73cedf1e8f5b0480ab002cb0e233c267359e7616 100755 --- a/testsuite/expect/test9.8 +++ b/testsuite/expect/test9.8 @@ -30,18 +30,19 @@ ############################################################################ source ./globals -set exit_code 0 set file_in "test$test_id.input" set job_cnt 10 set delay 10 set job_name "test$test_id" set sleep_time 300 set task_cnt 60 +set jobs_list [list] proc cleanup {} { - global bin_rm file_in + global bin_rm file_in jobs_list exec $bin_rm -f $file_in + cancel_job $jobs_list } if {![param_contains [get_config_param "SelectTypeParameters"] "CR_*MEMORY"]} { @@ -82,191 +83,47 @@ $srun -N1 -n1 --mem=0 --overlap $bin_sleep $sleep_time set start_cnt 0 set timeout 30 for {set inx 0} {$inx < $job_cnt} {incr inx} { - set sbatch_pid [spawn $sbatch --gres=craynetwork:0 --job-name=$job_name --mem-per-cpu=[expr $mem_per_step * 10] --output=/dev/null --error=/dev/null -t5 $file_in] - expect { - -re "Submitted batch job ($number)" { - incr start_cnt - exp_continue - } - -re "Unable to contact" { - log_error "Slurm appears to be down" - exp_continue - } - timeout { - slow_kill $sbatch_pid - fail "sbatch not responding" - } - eof { - wait - } + set job_id [submit_job -fail "--gres=craynetwork:0 --job-name=$job_name --mem-per-cpu=[expr $mem_per_step * 10] --output=/dev/null --error=/dev/null -t5 $file_in"] + lappend jobs_list $job_id +} +log_debug "All $start_cnt jobs submitted" + +# Wait for at least $job_cnt jobs to be started +# Because we want an external variable set with the match count, it is +# simpler to use wait_for here than wait_for_command +set job_count 0 +if [ + wait_for -timeout $delay {$job_count >= $job_cnt} { + set job_count [ + regexp -all $job_name [ + run_command_output -fail "$squeue --state R --name $job_name" + ] + ] } +] { + log_warn "Not all jobs were started ($job_count < $job_cnt). This is ok as long as it is at least 1" } -if {$start_cnt < $job_cnt} { - log_error "$job_cnt of $start_cnt jobs submitted" - set exit_code 1 -} else { - log_debug "All $start_cnt jobs submitted" +if {$job_count < 1} { + fail "No jobs were started" } -set user_name [get_my_user_name] - # # There could be hundreds of job steps, we don't want to see # the details, but want to make sure that we did start many # set desired_tasks [expr $task_cnt * 2 / 3] -# -# Give the jobs a few seconds to get initiated, check for steps, -# then kill them all -# -exec $bin_sleep $delay - -set job_count 0 -set step_count 0 -set timeout 60 -log_user 0 -while { 1 } { - set job_count 0 - spawn $squeue --state R --name $job_name --user $user_name - expect { - -re "$job_name" { - incr job_count - exp_continue - } - timeout { - fail "squeue not responding" - } - eof { - wait - } - } - - set step_count 0 - spawn $squeue --steps --name $job_name --user $user_name - expect { - -re "sleep" { - incr step_count - exp_continue - } - -re "error:" { - log_error "squeue error" - set exit_code 1 - exp_continue - } - timeout { - fail "squeue not responding" - } - eof { - wait - } - } - if {$step_count >= $desired_tasks || $step_count == 0} { - break - } - set scaled_task_cnt [expr $job_count * $desired_tasks] - if {$step_count >= $scaled_task_cnt} { - log_debug "Only started $job_count jobs, reducing step count target to $scaled_task_cnt" - set desired_tasks $scaled_task_cnt - } - exec $bin_sleep 3 -} - -log_user 1 -if {$step_count < $desired_tasks} { - log_error "Only started $job_count jobs and $step_count steps. We expected at least $desired_tasks and possibly hundreds" - set exit_code 1 -} else { - log_debug "We found $job_count jobs and $step_count steps" -} -spawn $scancel --quiet --user $user_name -expect { - eof { - wait +# We want to see a decent number of steps running +if [ + wait_for -timeout $delay {$step_count >= $desired_tasks} { + set step_count [ + regexp -all sleep [ + run_command_output -fail "$squeue --steps --name $job_name" + ] + ] } +] { + fail "Only started $job_count jobs and $step_count steps. We expected at least $desired_tasks and possibly hundreds" } -# -# Give a few seconds for clean-up and ensure things are still fine -# If message are lost, slurmctld re-sends job kill RPC 120 seconds later -# In any case, make sure that all jobs get completed -# -exec $bin_sleep 10 -set completing_jobs 0 -set running_jobs 0 -spawn $squeue --noheader --user $user_name -expect { - -re "test9.8.*$user_name *CG" { - incr completing_jobs - exp_continue - } - -re "test9.8.*$user_name" { - incr running_jobs - exp_continue - } - eof { - wait - } -} -# -# The following logic handles the scancel request failing -# due to a very busy system (reports failure above) -# -# Increasing the MessageTimeout configuration parameter -# should fix this problem. -# -if {$running_jobs != 0} { - log_error "Jobs not all cancelled" - set exit_code 1 - - spawn $scancel --quiet --user $user_name - expect { - eof { - wait - } - } -} -if {$completing_jobs != 0} { - log_info "Waiting for slurmctld to re-send job kill RPC" - log_info "This will take 120 seconds.." - exec $bin_sleep 120 - set completing_jobs 0 - spawn $squeue --noheader --user $user_name - expect { - -re "$job_name *$user_name *CG" { - incr completing_jobs - exp_continue - } - eof { - wait - } - } - if {$completing_jobs != 0} { - log_error "Jobs not completing" - set exit_code 1 - } -} -if {$completing_jobs != 0} { - set max_wait [expr $sleep_time - 120] - if {$max_wait > 0} { - set completing_jobs 0 - exec $bin_sleep $max_wait - spawn $squeue --noheader --user $user_name - expect { - -re "$job_name *$user_name *CG" { - incr completing_jobs - exp_continue - } - eof { - wait - } - } - } -} -if {$completing_jobs != 0} { - log_error "Jobs not completing. Subsequent tests may fail!" -} - -if {$exit_code != 0} { - fail "Test failed due to previous errors (\$exit_code = $exit_code)" -} +log_debug "We found $job_count jobs and $step_count steps"