Skip to content
Snippets Groups Projects
Commit 1e669fe4 authored by Scott Jackson, committed by Albert Gil
Browse files

Testsuite - Improve test9.8 using polling rather than sleeps

This avoids intermittent race-condition failures and significantly
reduces the time needed to run the test.

Bug 10439
parent 3cc64a18
No related branches found
No related tags found
No related merge requests found
...@@ -30,18 +30,31 @@ ...@@ -30,18 +30,31 @@
############################################################################ ############################################################################
source ./globals source ./globals
set exit_code 0
set file_in "test$test_id.input" set file_in "test$test_id.input"
set job_cnt 10 set job_cnt 10
set delay 10 set delay 10
set job_name "test$test_id" set job_name "test$test_id"
set sleep_time 300 set sleep_time 300
set task_cnt 60 set task_cnt 60
set user_name [get_my_user_name]
proc cleanup {} { proc cleanup {} {
global bin_rm file_in global bin_rm file_in scancel squeue user_name
exec $bin_rm -f $file_in exec $bin_rm -f $file_in
# The wait_for_command handles the scancel request failing
# due to a very busy system. Increasing the MessageTimeout
# configuration parameter should fix this problem.
wait_for_command -fail "$scancel --quiet --user $user_name"
# Wait for our jobs to cancel so it does not interfere with other tests
if {![regexp {^$} [run_command_output "$squeue --noheader --user $user_name"]]} {
log_info "We still have some jobs in the completing state. Waiting for slurmctld to re-send job kill RPC. This could take up to 120 seconds..."
if [wait_for_command_match -timeout 120 "$squeue --noheader --user $user_name" {^$}] {
fail "Jobs not cancelling. Subsequent tests may fail!"
}
}
} }
if {![param_contains [get_config_param "SelectTypeParameters"] "CR_*MEMORY"]} { if {![param_contains [get_config_param "SelectTypeParameters"] "CR_*MEMORY"]} {
...@@ -102,171 +115,47 @@ for {set inx 0} {$inx < $job_cnt} {incr inx} { ...@@ -102,171 +115,47 @@ for {set inx 0} {$inx < $job_cnt} {incr inx} {
} }
} }
if {$start_cnt < $job_cnt} { if {$start_cnt < $job_cnt} {
log_error "$job_cnt of $start_cnt jobs submitted" fail "Only $job_cnt of $start_cnt jobs submitted"
set exit_code 1
} else { } else {
log_debug "All $start_cnt jobs submitted" log_debug "All $start_cnt jobs submitted"
} }
set user_name [get_my_user_name] # Wait for at least $job_cnt jobs to be started
# Because we want an external variable set with the match count, it is
# # simpler to use wait_for here than wait_for_command
# There could be hundreds of job steps, we don't want to see set job_count 0
# the details, but want to make sure that we did start many if [
# wait_for -timeout $delay {$job_count >= $job_cnt} {
set desired_tasks [expr $task_cnt * 2 / 3] set job_count [
regexp -all $job_name [
# run_command_output -fail "$squeue --state R --name $job_name --user $user_name"
# Give the jobs a few seconds to get initiated, check for steps, ]
# then kill them all ]
#
exec $bin_sleep $delay
set job_count 0
set step_count 0
set timeout 60
log_user 0
while { 1 } {
set job_count 0
spawn $squeue --state R --name $job_name --user $user_name
expect {
-re "$job_name" {
incr job_count
exp_continue
}
timeout {
fail "squeue not responding"
}
eof {
wait
}
}
set step_count 0
spawn $squeue --steps --name $job_name --user $user_name
expect {
-re "sleep" {
incr step_count
exp_continue
}
-re "error:" {
log_error "squeue error"
set exit_code 1
exp_continue
}
timeout {
fail "squeue not responding"
}
eof {
wait
}
} }
if {$step_count >= $desired_tasks || $step_count == 0} { ] {
break log_warn "Not all jobs were started ($job_count < $job_cnt). This is ok as long as it is at least 1"
}
set scaled_task_cnt [expr $job_count * $desired_tasks]
if {$step_count >= $scaled_task_cnt} {
log_debug "Only started $job_count jobs, reducing step count target to $scaled_task_cnt"
set desired_tasks $scaled_task_cnt
}
exec $bin_sleep 3
} }
if {$job_count < 1} {
log_user 1 fail "No jobs were started"
if {$step_count < $desired_tasks} {
log_error "Only started $job_count jobs and $step_count steps. We expected at least $desired_tasks and possibly hundreds"
set exit_code 1
} else {
log_debug "We found $job_count jobs and $step_count steps"
}
spawn $scancel --quiet --user $user_name
expect {
eof {
wait
}
} }
# #
# Give a few seconds for clean-up and ensure things are still fine # There could be hundreds of job steps, we don't want to see
# If message are lost, slurmctld re-sends job kill RPC 120 seconds later # the details, but want to make sure that we did start many
# In any case, make sure that all jobs get completed
#
exec $bin_sleep 10
set completing_jobs 0
set running_jobs 0
spawn $squeue --noheader --user $user_name
expect {
-re "test9.8.*$user_name *CG" {
incr completing_jobs
exp_continue
}
-re "test9.8.*$user_name" {
incr running_jobs
exp_continue
}
eof {
wait
}
}
#
# The following logic handles the scancel request failing
# due to a very busy system (reports failure above)
#
# Increasing the MessageTimeout configuration parameter
# should fix this problem.
# #
if {$running_jobs != 0} { set desired_tasks [expr $task_cnt * 2 / 3]
log_error "Jobs not all cancelled"
set exit_code 1
spawn $scancel --quiet --user $user_name # We want to see a decent number of steps running
expect { if [
eof { wait_for -timeout $delay {$step_count >= $desired_tasks} {
wait set step_count [
} regexp -all sleep [
run_command_output -fail "$squeue --steps --name $job_name --user $user_name"
]
]
} }
} ] {
if {$completing_jobs != 0} { fail "Only started $job_count jobs and $step_count steps. We expected at least $desired_tasks and possibly hundreds"
log_info "Waiting for slurmctld to re-send job kill RPC"
log_info "This will take 120 seconds.."
exec $bin_sleep 120
set completing_jobs 0
spawn $squeue --noheader --user $user_name
expect {
-re "$job_name *$user_name *CG" {
incr completing_jobs
exp_continue
}
eof {
wait
}
}
if {$completing_jobs != 0} {
log_error "Jobs not completing"
set exit_code 1
}
}
if {$completing_jobs != 0} {
set max_wait [expr $sleep_time - 120]
if {$max_wait > 0} {
set completing_jobs 0
exec $bin_sleep $max_wait
spawn $squeue --noheader --user $user_name
expect {
-re "$job_name *$user_name *CG" {
incr completing_jobs
exp_continue
}
eof {
wait
}
}
}
}
if {$completing_jobs != 0} {
log_error "Jobs not completing. Subsequent tests may fail!"
} }
if {$exit_code != 0} { log_debug "We found $job_count jobs and $step_count steps"
fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment