Skip to content
Snippets Groups Projects
Commit 1aeb53eb authored by Moe Jette's avatar Moe Jette
Browse files

Make test9.8 more robust on busy systems where MessageTimeout causes failures in scancel or squeue commands.
parent a23ccbf6
No related branches found
No related tags found
No related merge requests found
...@@ -38,7 +38,7 @@ set test_id "9.8" ...@@ -38,7 +38,7 @@ set test_id "9.8"
set exit_code 0 set exit_code 0
set file_in "test$test_id.input" set file_in "test$test_id.input"
set job_cnt 10 set job_cnt 10
set delay 5 set delay 10
set job_name "test$test_id" set job_name "test$test_id"
set sleep_time 300 set sleep_time 300
set task_cnt 60 set task_cnt 60
...@@ -183,8 +183,9 @@ expect { ...@@ -183,8 +183,9 @@ expect {
# If messages are lost, slurmctld re-sends job kill RPC 120 seconds later # If messages are lost, slurmctld re-sends job kill RPC 120 seconds later
# In any case, make sure that all jobs get completed # In any case, make sure that all jobs get completed
# #
exec $bin_sleep 10 exec $bin_sleep 10
set completing_jobs 0 set completing_jobs 0
set running_jobs 0
spawn $squeue --noheader --user $user_name spawn $squeue --noheader --user $user_name
expect { expect {
-re "test9.8.*$user_name *CG" { -re "test9.8.*$user_name *CG" {
...@@ -192,13 +193,31 @@ expect { ...@@ -192,13 +193,31 @@ expect {
exp_continue exp_continue
} }
-re "test9.8.*$user_name" { -re "test9.8.*$user_name" {
send_user "\nFAILURE: jobs not all gone\n" incr running_jobs
set exit_code 1 exp_continue
} }
eof { eof {
wait wait
} }
} }
#
# The following logic handles the scancel request failing
# due to a very busy system: if any jobs are still listed
# as not completing, it reports FAILURE and issues scancel
# again for this user's jobs.
#
# Increasing the MessageTimeout configuration parameter
# should fix this problem.
#
if {$running_jobs != 0} {
send_user "\nFAILURE: jobs not all cancelled\n"
set exit_code 1
spawn $scancel --quiet --user $user_name
expect {
eof {
wait
}
}
}
if {$completing_jobs != 0} { if {$completing_jobs != 0} {
send_user "\nWaiting for slurmctld to re-send job kill RPC\n" send_user "\nWaiting for slurmctld to re-send job kill RPC\n"
send_user "This will take 120 seconds...\n" send_user "This will take 120 seconds...\n"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment