Skip to content
Snippets Groups Projects
Commit b92200d3 authored by Morris Jette's avatar Morris Jette
Browse files

Change job state handling in tests

We want to avoid printing "FAILURE" in the function wait_for_job
as a new use case may result in a job not starting in a timely
fashion and NOT be an error. So change "FAILURE" in wait_for_job
to "WARNING" and add checks for function errors in the tests
as needed (most places already check and log errors).
There were also many cases where "FAILURE" would be printed by
wait_for_job, but the job would not have a non-zero exit code
and those are now fixed.
parent 1bc20510
No related branches found
No related tags found
No related merge requests found
Showing
with 143 additions and 32 deletions
...@@ -638,13 +638,13 @@ proc wait_for_job { job_id desired_state } { ...@@ -638,13 +638,13 @@ proc wait_for_job { job_id desired_state } {
"RUNNING" {} "RUNNING" {}
"SUSPENDED" {} "SUSPENDED" {}
default { default {
send_user "FAILURE: wait_for_job with invalid state: $desired_state\n" send_user "WARNING: wait_for_job with invalid state: $desired_state\n"
return 1 return 1
} }
} }
if {$job_id == 0} { if {$job_id == 0} {
send_user "FAILURE: wait_for_job with invalid job ID: $job_id\n" send_user "WARNING: wait_for_job with invalid job ID: $job_id\n"
return 1 return 1
} }
...@@ -713,7 +713,7 @@ proc wait_for_job { job_id desired_state } { ...@@ -713,7 +713,7 @@ proc wait_for_job { job_id desired_state } {
} }
if { $my_delay > $max_job_state_delay } { if { $my_delay > $max_job_state_delay } {
send_user "FAILURE: Timeout waiting for job state $desired_state\n" send_user "WARNING: Timeout waiting for job state $desired_state\n"
return 1 return 1
} }
......
...@@ -60,7 +60,12 @@ proc run_spread_job { task_cnt } { ...@@ -60,7 +60,12 @@ proc run_spread_job { task_cnt } {
exit 1 exit 1
} }
wait_for_job $job_id "DONE" if {[wait_for_job $job_id "DONE"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to complete\n"
cancel_job $job_id
set exit_code 1
}
set timeout 10 set timeout 10
set num_nodes 0 set num_nodes 0
spawn $scontrol show job $job_id spawn $scontrol show job $job_id
......
...@@ -118,7 +118,11 @@ proc sub_job { freq } { ...@@ -118,7 +118,11 @@ proc sub_job { freq } {
exit 1 exit 1
} }
wait_for_job $job_id DONE if {[wait_for_job $job_id "DONE"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to complete\n"
cancel_job $job_id
set exit_code 1
}
spawn $sacct -j$job_id -oavecpufreq --noheader spawn $sacct -j$job_id -oavecpufreq --noheader
expect { expect {
......
...@@ -319,7 +319,11 @@ expect { ...@@ -319,7 +319,11 @@ expect {
} }
} }
wait_for_job $tmp_id DONE if {[wait_for_job $job_id "DONE"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to complete\n"
cancel_job $job_id
set exit_code 1
}
spawn $scontrol show job $tmp_id spawn $scontrol show job $tmp_id
expect { expect {
......
...@@ -238,7 +238,10 @@ if {$tmp_id == 0} { ...@@ -238,7 +238,10 @@ if {$tmp_id == 0} {
exit 1 exit 1
} }
wait_for_job $tmp_id RUNNING if {[wait_for_job $tmp_id "RUNNING"] != 0} {
send_user "\nFAILURE: error waiting for job $tmp_id to start\n"
set exit_code 1
}
set match 0 set match 0
spawn $srun -t1 sleep 10 spawn $srun -t1 sleep 10
expect { expect {
......
...@@ -177,7 +177,10 @@ if {$job_id == 0} { ...@@ -177,7 +177,10 @@ if {$job_id == 0} {
exit 1 exit 1
} }
wait_for_job $job_id RUNNING if {[wait_for_job $job_id "RUNNING"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to start\n"
set exit_code 1
}
set found 0 set found 0
spawn $scontrol show job $job_id spawn $scontrol show job $job_id
...@@ -225,7 +228,10 @@ if {$job_id == 0} { ...@@ -225,7 +228,10 @@ if {$job_id == 0} {
exit 1 exit 1
} }
wait_for_job $job_id RUNNING if {[wait_for_job $job_id "RUNNING"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to start\n"
set exit_code 1
}
# Wait for batch script to start (after message delays, prologs, etc.) # Wait for batch script to start (after message delays, prologs, etc.)
sleep 5 sleep 5
...@@ -241,7 +247,10 @@ mod_state "resume" "test$test_id" ...@@ -241,7 +247,10 @@ mod_state "resume" "test$test_id"
# Check the job state # Check the job state
send_user "\n\nTest 2\n" send_user "\n\nTest 2\n"
wait_for_job $job_id PENDING if {[wait_for_job $job_id "PENDING"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to pend\n"
set exit_code 1
}
set fail_count [check_sacct_states "NODE_FAIL" 1] set fail_count [check_sacct_states "NODE_FAIL" 1]
if {$fail_count != 1} { if {$fail_count != 1} {
endit 1 "FAILURE: Bad NODE_FAIL count ($fail_count != 1)" endit 1 "FAILURE: Bad NODE_FAIL count ($fail_count != 1)"
...@@ -252,7 +261,10 @@ if {$pend_count != 1} { ...@@ -252,7 +261,10 @@ if {$pend_count != 1} {
} }
send_user "So far, so good\n\n" send_user "So far, so good\n\n"
wait_for_job $job_id RUNNING if {[wait_for_job $job_id "RUNNING"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to start\n"
set exit_code 1
}
# Wait for batch script to start (after message delays, prologs, etc.) # Wait for batch script to start (after message delays, prologs, etc.)
sleep 5 sleep 5
...@@ -300,7 +312,10 @@ if {$pend_count != 1} { ...@@ -300,7 +312,10 @@ if {$pend_count != 1} {
} }
send_user "So far, so good\n\n" send_user "So far, so good\n\n"
wait_for_job $job_id RUNNING if {[wait_for_job $job_id "RUNNING"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to start\n"
set exit_code 1
}
# Wait for batch script to start (after message delays, prologs, etc.) # Wait for batch script to start (after message delays, prologs, etc.)
sleep 5 sleep 5
...@@ -327,7 +342,11 @@ if {$run_count != 1} { ...@@ -327,7 +342,11 @@ if {$run_count != 1} {
} }
send_user "So far, so good\n\n" send_user "So far, so good\n\n"
wait_for_job $job_id DONE if {[wait_for_job $job_id "DONE"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to complete\n"
cancel_job $job_id
set exit_code 1
}
# Check steps after job has completed # Check steps after job has completed
check_step 2 check_step 2
......
...@@ -73,7 +73,11 @@ if {$job_id == 0} { ...@@ -73,7 +73,11 @@ if {$job_id == 0} {
exit 1 exit 1
} }
wait_for_job $job_id DONE if {[wait_for_job $job_id "DONE"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to complete\n"
cancel_job $job_id
set exit_code 1
}
set failed_cnt 0 set failed_cnt 0
set timeout_cnt 0 set timeout_cnt 0
......
...@@ -54,7 +54,11 @@ proc check_job_nodes {test_job node_name find_node} { ...@@ -54,7 +54,11 @@ proc check_job_nodes {test_job node_name find_node} {
set nodelist "" set nodelist ""
wait_for_job $test_job "RUNNING" if {[wait_for_job $test_job "RUNNING"] != 0} {
send_user "\nFAILURE: error waiting for job $test_job to start\n"
cancel_job $test_job
set exit_code 1
}
# Check that job that the job used the correct nodes # Check that job that the job used the correct nodes
log_user 0 log_user 0
...@@ -103,7 +107,11 @@ proc check_job {nnode test_job} { ...@@ -103,7 +107,11 @@ proc check_job {nnode test_job} {
set nodelist "" set nodelist ""
wait_for_job $test_job "RUNNING" if {[wait_for_job $test_job "RUNNING"] != 0} {
send_user "\nFAILURE: error waiting for job $test_job to start\n"
cancel_job $test_job
set exit_code 1
}
# Check that job that the job used the correct nodes # Check that job that the job used the correct nodes
spawn $scontrol show job $test_job spawn $scontrol show job $test_job
......
...@@ -232,7 +232,11 @@ proc check_job { exp_num_jobs } { ...@@ -232,7 +232,11 @@ proc check_job { exp_num_jobs } {
global squeue job_id num_jobs file_in exit_code global squeue job_id num_jobs file_in exit_code
# Wait a bit for the job to start # Wait a bit for the job to start
wait_for_job ${job_id}_0 RUNNING if {[wait_for_job ${job_id}_0 "RUNNING"] != 0} {
send_user "\nFAILURE: error waiting for job ${job_id}_0 to start\n"
cancel_job ${job_id}_0
set exit_code 1
}
set job_cnt 0 set job_cnt 0
# If gang scheduling is configured, some jobs will be suspended # If gang scheduling is configured, some jobs will be suspended
...@@ -319,7 +323,11 @@ if {$job_id == 0} { ...@@ -319,7 +323,11 @@ if {$job_id == 0} {
} }
# Wait a bit for job to start # Wait a bit for job to start
wait_for_job $job_id RUNNING if {[wait_for_job $job_id "RUNNING"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to start\n"
cancel_job $job_id
set exit_code 1
}
# Identify node to use for testing # Identify node to use for testing
set got_node 0 set got_node 0
......
...@@ -60,7 +60,11 @@ if { $job_id1 == 0 } { ...@@ -60,7 +60,11 @@ if { $job_id1 == 0 } {
exit 1 exit 1
} }
wait_for_job $job_id1 RUNNING if {[wait_for_job $job_id1 "RUNNING"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id1 to start\n"
cancel_job $job_id1
set exit_code 1
}
# Submit a job that depends on job above # Submit a job that depends on job above
spawn $sbatch -t1 -dafternotok:$job_id1 -o/dev/null $script spawn $sbatch -t1 -dafternotok:$job_id1 -o/dev/null $script
...@@ -82,7 +86,11 @@ if { $job_id2 == 0 } { ...@@ -82,7 +86,11 @@ if { $job_id2 == 0 } {
exit 1 exit 1
} }
wait_for_job $job_id1 DONE if {[wait_for_job $job_id1 "DONE"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id1 to complete\n"
cancel_job $job_id1
set exit_code 1
}
# Check exit code of the first job # Check exit code of the first job
set match 0 set match 0
......
...@@ -95,7 +95,11 @@ if {$job_id == 0} { ...@@ -95,7 +95,11 @@ if {$job_id == 0} {
exit 1 exit 1
} }
wait_for_job $job_id DONE if {[wait_for_job $job_id "DONE"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to complete\n"
set exit_code 1
cancel_job $job_id
}
set sig 0 set sig 0
spawn $bin_cat $file_out spawn $bin_cat $file_out
...@@ -175,7 +179,11 @@ if {$job_id == 0} { ...@@ -175,7 +179,11 @@ if {$job_id == 0} {
exit 1 exit 1
} }
wait_for_job $job_id DONE if {[wait_for_job $job_id "DONE"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to complete\n"
set exit_code 1
cancel_job $job_id
}
set sig 0 set sig 0
spawn $bin_cat $file_out spawn $bin_cat $file_out
......
...@@ -141,7 +141,11 @@ if {$match == 0} { ...@@ -141,7 +141,11 @@ if {$match == 0} {
} }
# Wait for the fast job to finish after submitting dependent job # Wait for the fast job to finish after submitting dependent job
wait_for_job $fast_id DONE if {[wait_for_job $fast_id "DONE"] != 0} {
send_user "\nFAILURE: error waiting for job $fast_id to complete\n"
set exit_code 1
cancel_job $fast_id
}
# Wait for dependency job to start once the fast job is complete # Wait for dependency job to start once the fast job is complete
if {[wait_for_job $dep_id RUNNING]} { if {[wait_for_job $dep_id RUNNING]} {
......
...@@ -89,7 +89,12 @@ if {$job_id == 0} { ...@@ -89,7 +89,12 @@ if {$job_id == 0} {
} }
# Wait for the job to finish # Wait for the job to finish
wait_for_job $job_id DONE if {[wait_for_job $job_id "DONE"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to complete\n"
cancel_job $job_id
set exit_code 1
}
foreach option [array names check_list] { foreach option [array names check_list] {
#send_user "\n$option=$check_list($option)\n" #send_user "\n$option=$check_list($option)\n"
......
...@@ -92,7 +92,11 @@ if { $job_id == 0 } { ...@@ -92,7 +92,11 @@ if { $job_id == 0 } {
} }
# Wait for the job to be in the complete state # Wait for the job to be in the complete state
wait_for_job $job_id DONE if {[wait_for_job $job_id "DONE"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to complete\n"
cancel_job $job_id
set exit_code 1
}
# Requeue the job when it is complete # Requeue the job when it is complete
requeue_job $job_id requeue_job $job_id
...@@ -127,7 +131,11 @@ if { $job_id == 0 } { ...@@ -127,7 +131,11 @@ if { $job_id == 0 } {
} }
# Wait for the job to be in the complete state # Wait for the job to be in the complete state
wait_for_job $job_id DONE if {[wait_for_job $job_id "DONE"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to complete\n"
cancel_job $job_id
set exit_code 1
}
# Requeue the job when it is complete # Requeue the job when it is complete
requeue_job $job_id requeue_job $job_id
......
...@@ -121,7 +121,11 @@ if {$job_id == 0} { ...@@ -121,7 +121,11 @@ if {$job_id == 0} {
exit 1 exit 1
} }
wait_for_job $job_id DONE if {[wait_for_job $job_id "DONE"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to complete\n"
cancel_job $job_id
set exit_code 1
}
spawn $scontrol requeuehold $job_id spawn $scontrol requeuehold $job_id
expect { expect {
......
...@@ -95,7 +95,11 @@ if { $job_id == 0 } { ...@@ -95,7 +95,11 @@ if { $job_id == 0 } {
exit 1 exit 1
} }
wait_for_job $job_id DONE if {[wait_for_job $job_id "DONE"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to complete\n"
cancel_job $job_id
set exit_code 1
}
spawn $scontrol requeuehold State=SpecialExit $job_id spawn $scontrol requeuehold State=SpecialExit $job_id
expect { expect {
......
...@@ -149,7 +149,10 @@ if {$exit_code} { ...@@ -149,7 +149,10 @@ if {$exit_code} {
exit $exit_code exit $exit_code
} }
wait_for_job $job_id "RUNNING" if {[wait_for_job $job_id "RUNNING"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to start\n"
set exit_code 1
}
# Check that the job was submitted with no error # Check that the job was submitted with no error
set match 0 set match 0
......
...@@ -100,7 +100,11 @@ if {$job_id == 0} { ...@@ -100,7 +100,11 @@ if {$job_id == 0} {
exit 1 exit 1
} }
wait_for_job $job_id "DONE" if {[wait_for_job $job_id "DONE"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to complete\n"
cancel_job $job_id
set exit_code 1
}
set number_1 -1 set number_1 -1
set number_2 -1 set number_2 -1
......
...@@ -98,7 +98,11 @@ if {$job_id == 0} { ...@@ -98,7 +98,11 @@ if {$job_id == 0} {
exit 1 exit 1
} }
wait_for_job $job_id "DONE" if {[wait_for_job $job_id "DONE"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to complete\n"
cancel_job $job_id
set exit_code 1
}
set verbose 0 set verbose 0
set number_1 -1 set number_1 -1
......
...@@ -98,7 +98,11 @@ if {$job_id == 0} { ...@@ -98,7 +98,11 @@ if {$job_id == 0} {
exit 1 exit 1
} }
wait_for_job $job_id "DONE" if {[wait_for_job $job_id "DONE"] != 0} {
send_user "\nFAILURE: error waiting for job $job_id to complete\n"
cancel_job $job_id
set exit_code 1
}
set number_1 -1 set number_1 -1
set number_2 -1 set number_2 -1
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment