diff --git a/testsuite/expect/test9.8 b/testsuite/expect/test9.8 index 73e167ce4efa0dad7978998d24330e329338705e..bbebdca7455ec6fcf983cc7af12e6cfe3388b471 100755 --- a/testsuite/expect/test9.8 +++ b/testsuite/expect/test9.8 @@ -30,18 +30,31 @@ ############################################################################ source ./globals -set exit_code 0 set file_in "test$test_id.input" set job_cnt 10 set delay 10 set job_name "test$test_id" set sleep_time 300 set task_cnt 60 +set user_name [get_my_user_name] proc cleanup {} { - global bin_rm file_in + global bin_rm file_in scancel squeue user_name exec $bin_rm -f $file_in + + # The wait_for_command handles the scancel request failing + # due to a very busy system. Increasing the MessageTimeout + # configuration parameter should fix this problem. + wait_for_command -fail "$scancel --quiet --user $user_name" + + # Wait for our jobs to cancel so it does not interfere with other tests + if {![regexp {^$} [run_command_output "$squeue --noheader --user $user_name"]]} { + log_info "We still have some jobs in the completing state. Waiting for slurmctld to re-send job kill RPC. This could take up to 120 seconds..." + if [wait_for_command_match -timeout 120 "$squeue --noheader --user $user_name" {^$}] { + fail "Jobs not cancelling. Subsequent tests may fail!" + } + } } if {![param_contains [get_config_param "SelectTypeParameters"] "CR_*MEMORY"]} { @@ -102,171 +115,47 @@ for {set inx 0} {$inx < $job_cnt} {incr inx} { } } if {$start_cnt < $job_cnt} { - log_error "$job_cnt of $start_cnt jobs submitted" - set exit_code 1 + fail "Only $job_cnt of $start_cnt jobs submitted" } else { log_debug "All $start_cnt jobs submitted" } -set user_name [get_my_user_name] - -# -# There could be hundreds of job steps, we don't want to see -# the details, but want to make sure that we did start many -# -set desired_tasks [expr $task_cnt * 2 / 3] - -# -# Give the jobs a few seconds to get initiated, check for steps, -# then kill them all -# -exec $bin_sleep $delay - -set job_count 0 -set step_count 0 -set timeout 60 -log_user 0 -while { 1 } { - set job_count 0 - spawn $squeue --state R --name $job_name --user $user_name - expect { - -re "$job_name" { - incr job_count - exp_continue - } - timeout { - fail "squeue not responding" - } - eof { - wait - } - } - - set step_count 0 - spawn $squeue --steps --name $job_name --user $user_name - expect { - -re "sleep" { - incr step_count - exp_continue - } - -re "error:" { - log_error "squeue error" - set exit_code 1 - exp_continue - } - timeout { - fail "squeue not responding" - } - eof { - wait - } +# Wait for at least $job_cnt jobs to be started +# Because we want an external variable set with the match count, it is +# simpler to use wait_for here than wait_for_command +set job_count 0 +if [ + wait_for -timeout $delay {$job_count >= $job_cnt} { + set job_count [ + regexp -all $job_name [ + run_command_output -fail "$squeue --state R --name $job_name --user $user_name" + ] + ] } - if {$step_count >= $desired_tasks || $step_count == 0} { - break - } - set scaled_task_cnt [expr $job_count * $desired_tasks] - if {$step_count >= $scaled_task_cnt} { - log_debug "Only started $job_count jobs, reducing step count target to $scaled_task_cnt" - set desired_tasks $scaled_task_cnt - } - exec $bin_sleep 3 +] { + log_warn "Not all jobs were started ($job_count < $job_cnt). This is ok as long as it is at least 1" } - -log_user 1 -if {$step_count < $desired_tasks} { - log_error "Only started $job_count jobs and $step_count steps. We expected at least $desired_tasks and possibly hundreds" - set exit_code 1 -} else { - log_debug "We found $job_count jobs and $step_count steps" -} -spawn $scancel --quiet --user $user_name -expect { - eof { - wait - } +if {$job_count < 1} { + fail "No jobs were started" } # -# Give a few seconds for clean-up and ensure things are still fine -# If message are lost, slurmctld re-sends job kill RPC 120 seconds later -# In any case, make sure that all jobs get completed -# -exec $bin_sleep 10 -set completing_jobs 0 -set running_jobs 0 -spawn $squeue --noheader --user $user_name -expect { - -re "test9.8.*$user_name *CG" { - incr completing_jobs - exp_continue - } - -re "test9.8.*$user_name" { - incr running_jobs - exp_continue - } - eof { - wait - } -} -# -# The following logic handles the scancel request failing -# due to a very busy system (reports failure above) -# -# Increasing the MessageTimeout configuration parameter -# should fix this problem. +# There could be hundreds of job steps, we don't want to see +# the details, but want to make sure that we did start many # -if {$running_jobs != 0} { - log_error "Jobs not all cancelled" - set exit_code 1 +set desired_tasks [expr $task_cnt * 2 / 3] - spawn $scancel --quiet --user $user_name - expect { - eof { - wait - } +# We want to see a decent number of steps running +if [ + wait_for -timeout $delay {$step_count >= $desired_tasks} { + set step_count [ + regexp -all sleep [ + run_command_output -fail "$squeue --steps --name $job_name --user $user_name" + ] + ] } -} -if {$completing_jobs != 0} { - log_info "Waiting for slurmctld to re-send job kill RPC" - log_info "This will take 120 seconds.." - exec $bin_sleep 120 - set completing_jobs 0 - spawn $squeue --noheader --user $user_name - expect { - -re "$job_name *$user_name *CG" { - incr completing_jobs - exp_continue - } - eof { - wait - } - } - if {$completing_jobs != 0} { - log_error "Jobs not completing" - set exit_code 1 - } -} -if {$completing_jobs != 0} { - set max_wait [expr $sleep_time - 120] - if {$max_wait > 0} { - set completing_jobs 0 - exec $bin_sleep $max_wait - spawn $squeue --noheader --user $user_name - expect { - -re "$job_name *$user_name *CG" { - incr completing_jobs - exp_continue - } - eof { - wait - } - } - } -} -if {$completing_jobs != 0} { - log_error "Jobs not completing. Subsequent tests may fail!" +] { + fail "Only started $job_count jobs and $step_count steps. We expected at least $desired_tasks and possibly hundreds" } -if {$exit_code != 0} { - fail "Test failed due to previous errors (\$exit_code = $exit_code)" -} +log_debug "We found $job_count jobs and $step_count steps"