Skip to content
Snippets Groups Projects
Commit fd8e7c6b authored by Christopher J. Morrone
Browse files

Make wait_for_job a quieter procedure.

parent c9af48ca
No related branches found
No related tags found
No related merge requests found
...@@ -305,83 +305,74 @@ proc wait_for_file { file_name } { ...@@ -305,83 +305,74 @@ proc wait_for_file { file_name } {
# #
# Input: job_id -- The SLURM job id of a job we want to # Input: job_id -- The SLURM job id of a job we want to
# wait for. # wait for.
# state -- The state you want the job to attain before # desired_state -- The state you want the job to attain before
# returning. Currently supports: # returning. Currently supports:
# DONE any terminated state # DONE any terminated state
# RUNNING job is running # RUNNING job is running
# #
# NOTE: We sleep for two seconds before replying that a job is # NOTE: We sleep for two seconds before replying that a job is
# done to give time for I/O completion (stdout/stderr files) # done to give time for I/O completion (stdout/stderr files)
# #
################################################################ ################################################################
proc wait_for_job { job_id state } { proc wait_for_job { job_id desired_state } {
global max_job_delay global scontrol
global scontrol
set is_done 0
set is_running 0
set sleep_time 1
log_user 0 # First verify that desired_state is supported
while { 1 == 1 } { switch $desired_state {
spawn -noecho $scontrol -o show job $job_id "DONE" {}
expect { "RUNNING" {}
-re "Job \[0-9]* not found" { default {
set is_done 1 send_user "Unsupported desired state: $desired_state\n"
exp_continue return 1
} }
-re "JobState=CANCELLED" { }
set is_done 1
exp_continue set sleep_time 1
} while 1 {
-re "JobState=COMPLETE" { set fd [open "|$scontrol -o show job $job_id"]
set is_done 1 gets $fd line
exp_continue close $fd
} if {[regexp {JobState\s*=\s*(\w+)} $line foo state] != 1} {
-re "JobState=FAILED" { set state "NOT_FOUND"
set is_done 1 }
exp_continue
} switch $state {
-re "JobState=TIMEOUT" { "NOT_FOUND" -
set is_done 1 "CANCELLED" -
exp_continue "FAILED" -
} "TIMEOUT" -
-re "JobState=NODE_FAIL" { "NODE_FAIL" -
set is_done 1 "COMPLETED" {
exp_continue if {[string compare $desired_state "DONE"] == 0} {
} send_user "Job $job_id is DONE\n"
-re "JobState=RUNNING" { sleep 2
set is_running 1 return 0
exp_continue } else {
} if {[string compare $desired_state "RUNNING"] == 0} {
timeout { send_user "Job $job_id is $state, but we wanted RUNNING\n"
send_user "\nFAILURE: scontrol not responding\n" }
log_user 1 return 1
return 1
}
eof {
wait
}
}
log_user 1
if {[string compare $state "DONE"] == 0 && $is_done == 1 } {
sleep 2
return 0
}
if {[string compare $state "RUNNING"] == 0 && $is_running == 1 } {
return 0
} }
if { $is_done == 1 } { }
return 1 "RUNNING" {
if {[string compare $desired_state "RUNNING"] == 0} {
send_user "Job $job_id is RUNNING\n"
return 0
} }
send_user "Job $job_id is in state $state, desire $desired_state\n"
}
default {
send_user "Job $job_id is in state $state, desire $desired_state\n"
}
}
sleep $sleep_time sleep $sleep_time
set sleep_time [expr $sleep_time * 2] set sleep_time [expr $sleep_time * 2]
if { $sleep_time > 10 } { if { $sleep_time > 10 } {
set sleep_time 10 set sleep_time 10
}
} }
}
} }
################################################################ ################################################################
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment