Skip to content
Snippets Groups Projects
Commit 1e669fe4 authored by Scott Jackson's avatar Scott Jackson Committed by Albert Gil
Browse files

Testsuite - Improve test9.8 using polling rather than sleeps

This avoids intermittent race-condition failures and significantly
reduces the time needed to run the test.

Bug 10439
parent 3cc64a18
No related branches found
No related tags found
No related merge requests found
......@@ -30,18 +30,31 @@
############################################################################
# Load the shared testsuite definitions.
# NOTE(review): presumably this is where $test_id, $squeue, $scancel, $bin_rm
# and the helper procs used below come from - confirm against ./globals.
source ./globals
# Accumulated error flag; the test fails at the end if it is non-zero.
set exit_code 0
# Name of the input file for this test; cleanup removes it.
set file_in "test$test_id.input"
# Number of jobs to submit, and the running-job count the test waits for.
set job_cnt 10
# Used both as a sleep duration and as the -timeout value given to wait_for.
set delay 10
# Job name passed to squeue --name when counting this test's jobs and steps.
set job_name "test$test_id"
# Used when computing the extra wait for completing jobs ($sleep_time - 120).
# NOTE(review): any other use is outside the visible part of the file.
set sleep_time 300
# Basis for the step target: the test wants at least two thirds of this value.
set task_cnt 60
# User whose jobs are queried with squeue and cancelled with scancel.
set user_name [get_my_user_name]
# cleanup - test teardown.
#
# Removes the generated input file, cancels every job owned by $user_name and
# then waits until squeue no longer lists any job for that user, so leftover
# jobs cannot interfere with tests that run afterwards.
#
# Takes no arguments; reads the globals bin_rm, file_in, scancel, squeue and
# user_name.
proc cleanup {} {
    global bin_rm file_in scancel squeue user_name

    exec $bin_rm -f $file_in

    # The wait_for_command handles the scancel request failing
    # due to a very busy system. Increasing the MessageTimeout
    # configuration parameter should fix this problem.
    wait_for_command -fail "$scancel --quiet --user $user_name"

    # Wait for our jobs to cancel so it does not interfere with other tests.
    # The same squeue command is used for the initial check and for polling.
    set squeue_cmd "$squeue --noheader --user $user_name"
    if {![regexp {^$} [run_command_output $squeue_cmd]]} {
        log_info "We still have some jobs in the completing state. Waiting for slurmctld to re-send job kill RPC. This could take up to 120 seconds..."
        # The condition is braced so the helper's result is evaluated exactly
        # once; a non-zero result is treated as "queue did not drain in time".
        if {[wait_for_command_match -timeout 120 $squeue_cmd {^$}]} {
            fail "Jobs not cancelling. Subsequent tests may fail!"
        }
    }
}
if {![param_contains [get_config_param "SelectTypeParameters"] "CR_*MEMORY"]} {
......@@ -102,171 +115,47 @@ for {set inx 0} {$inx < $job_cnt} {incr inx} {
}
}
# Verify that every requested job was submitted. $start_cnt is the number of
# jobs actually submitted and $job_cnt the number requested, so the failure
# message reports "submitted of requested" in that order.
# NOTE(review): assumes fail reports the error and ends the test, which makes
# a separate log_error / exit_code update before it redundant - confirm.
if {$start_cnt < $job_cnt} {
    fail "Only $start_cnt of $job_cnt jobs submitted"
} else {
    log_debug "All $start_cnt jobs submitted"
}
set user_name [get_my_user_name]
#
# There could be hundreds of job steps, we don't want to see
# the details, but want to make sure that we did start many
#
set desired_tasks [expr $task_cnt * 2 / 3]
#
# Give the jobs a few seconds to get initiated, check for steps,
# then kill them all
#
exec $bin_sleep $delay
set job_count 0
set step_count 0
set timeout 60
log_user 0
while { 1 } {
set job_count 0
spawn $squeue --state R --name $job_name --user $user_name
expect {
-re "$job_name" {
incr job_count
exp_continue
}
timeout {
fail "squeue not responding"
}
eof {
wait
}
}
set step_count 0
spawn $squeue --steps --name $job_name --user $user_name
expect {
-re "sleep" {
incr step_count
exp_continue
}
-re "error:" {
log_error "squeue error"
set exit_code 1
exp_continue
}
timeout {
fail "squeue not responding"
}
eof {
wait
}
# Wait for at least $job_cnt jobs to be started
# Because we want an external variable set with the match count, it is
# simpler to use wait_for here than wait_for_command
set job_count 0
if [
wait_for -timeout $delay {$job_count >= $job_cnt} {
set job_count [
regexp -all $job_name [
run_command_output -fail "$squeue --state R --name $job_name --user $user_name"
]
]
}
if {$step_count >= $desired_tasks || $step_count == 0} {
break
}
set scaled_task_cnt [expr $job_count * $desired_tasks]
if {$step_count >= $scaled_task_cnt} {
log_debug "Only started $job_count jobs, reducing step count target to $scaled_task_cnt"
set desired_tasks $scaled_task_cnt
}
exec $bin_sleep 3
] {
log_warn "Not all jobs were started ($job_count < $job_cnt). This is ok as long as it is at least 1"
}
log_user 1
if {$step_count < $desired_tasks} {
log_error "Only started $job_count jobs and $step_count steps. We expected at least $desired_tasks and possibly hundreds"
set exit_code 1
} else {
log_debug "We found $job_count jobs and $step_count steps"
}
spawn $scancel --quiet --user $user_name
expect {
eof {
wait
}
if {$job_count < 1} {
fail "No jobs were started"
}
#
# Give a few seconds for clean-up and ensure things are still fine
# If messages are lost, slurmctld re-sends the job kill RPC 120 seconds later
# In any case, make sure that all jobs get completed
#
exec $bin_sleep 10
set completing_jobs 0
set running_jobs 0
spawn $squeue --noheader --user $user_name
expect {
-re "test9.8.*$user_name *CG" {
incr completing_jobs
exp_continue
}
-re "test9.8.*$user_name" {
incr running_jobs
exp_continue
}
eof {
wait
}
}
#
# The following logic handles the scancel request failing
# due to a very busy system (reports failure above)
#
# Increasing the MessageTimeout configuration parameter
# should fix this problem.
# There could be hundreds of job steps, we don't want to see
# the details, but want to make sure that we did start many
#
if {$running_jobs != 0} {
log_error "Jobs not all cancelled"
set exit_code 1
set desired_tasks [expr $task_cnt * 2 / 3]
spawn $scancel --quiet --user $user_name
expect {
eof {
wait
}
# We want to see a decent number of steps running
if [
wait_for -timeout $delay {$step_count >= $desired_tasks} {
set step_count [
regexp -all sleep [
run_command_output -fail "$squeue --steps --name $job_name --user $user_name"
]
]
}
}
if {$completing_jobs != 0} {
log_info "Waiting for slurmctld to re-send job kill RPC"
log_info "This will take 120 seconds.."
exec $bin_sleep 120
set completing_jobs 0
spawn $squeue --noheader --user $user_name
expect {
-re "$job_name *$user_name *CG" {
incr completing_jobs
exp_continue
}
eof {
wait
}
}
if {$completing_jobs != 0} {
log_error "Jobs not completing"
set exit_code 1
}
}
if {$completing_jobs != 0} {
set max_wait [expr $sleep_time - 120]
if {$max_wait > 0} {
set completing_jobs 0
exec $bin_sleep $max_wait
spawn $squeue --noheader --user $user_name
expect {
-re "$job_name *$user_name *CG" {
incr completing_jobs
exp_continue
}
eof {
wait
}
}
}
}
if {$completing_jobs != 0} {
log_error "Jobs not completing. Subsequent tests may fail!"
] {
fail "Only started $job_count jobs and $step_count steps. We expected at least $desired_tasks and possibly hundreds"
}
if {$exit_code != 0} {
fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}
log_debug "We found $job_count jobs and $step_count steps"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment