diff --git a/src/slaunch/slaunch.c b/src/slaunch/slaunch.c index 2955a9c838cea41e906fd8c3161d4b8de623d842..06ba8c730e84a9bfb5e41174e47aca5ff27fe8ed 100644 --- a/src/slaunch/slaunch.c +++ b/src/slaunch/slaunch.c @@ -62,6 +62,7 @@ #include <slurm/slurm.h> +#include "src/common/macros.h" #include "src/common/fd.h" #include "src/common/log.h" #include "src/common/slurm_protocol_api.h" @@ -86,6 +87,7 @@ extern char **environ; slurm_step_ctx step_ctx; +int global_rc; /* * declaration of static funcs @@ -247,7 +249,7 @@ cleanup: slurm_step_ctx_destroy(step_ctx); _mpir_cleanup(); - return 0; + return global_rc; } /* Set SLURM_UMASK environment variable with current state */ @@ -535,9 +537,23 @@ static void _task_finish(task_exit_msg_t *msg) { static bool first_done = true; + int rc; - verbose("%d tasks finished (rc=%d)", + verbose("%d tasks finished (rc=%u)", msg->num_tasks, msg->return_code); + if (WIFEXITED(msg->return_code)) { + rc = WEXITSTATUS(msg->return_code); + if (rc != 0) { + /* FIXME - needs to print task id list, not + just the first id in the list */ + error("task%u: Exited with exit code %d", + msg->task_id_list[0], rc); + } + } else { + debug("tasks did not exit normally"); + rc = 1; + } + global_rc = MAX(global_rc, rc); /* If these are the first tasks to finish we need to start a timer * to kill off the job step if the other tasks don't finish diff --git a/testsuite/expect/test18.17 b/testsuite/expect/test18.17 index c13345fd14195822169c1cfa12cdf5f25478f3dd..5347f06cf07168bdb4eac55c34ebe47f3d0a5f0d 100755 --- a/testsuite/expect/test18.17 +++ b/testsuite/expect/test18.17 @@ -37,86 +37,75 @@ set exit_script "./test$test_id.exit.bash" set test_script "./test$test_id.bash" print_header $test_id +set timeout $max_job_delay + # # Delete left-over scripts and build new ones # -make_bash_script $exit_script " - RC=`expr \$SLURM_PROCID + 10` - exit \$RC -" - -make_bash_script $test_script " - $salloc -N1 -t1 -v $slaunch -n2 --overcommit $exit_script - echo 
salloc_exit_code_\$? -" +make_bash_script $exit_script {exit $((SLURM_PROCID + 10))} # -# Spawn program and check for exit code messages from slaunch +# Check the return code of slaunch. To do so, we spawn +# salloc and slaunch commands separately. # -set job_id 0 -set sum 0 -set timeout $max_job_delay -spawn $salloc -N1 -t1 $slaunch -n2 --overcommit $exit_script +set job_id 0 +set matches 0 + +# First start salloc and wait for the allocation +set salloc_pid [spawn $salloc -N1 -t1 --kill-command sleep 300] +set salloc_spawn_id $spawn_id +set spawn_id $salloc_spawn_id expect { -re "Granted job allocation ($number)" { + send_user "got job id\n" set job_id $expect_out(1,string) - exp_continue - } - -re "exit code ($number)" { - send_user "This error is expected, no worries\n" - incr sum $expect_out(1,string) - exp_continue } timeout { - send_user "\nFAILURE: salloc not responding\n" if {$job_id == 0} { - kill_salloc + slow_kill $salloc_pid } else { cancel_job $job_id } - set exit_code 1 - exp_continue - } - eof { - wait } } -if {$sum != 21} { - send_user "\nFAILURE: slaunch failed to report exit code\n" - set exit_code 1 -} -exit $exit_code -# -# Spawn program to check the exit code of slaunch itself -# -set job_id 0 -set matches 0 -spawn $test_script + +# Then spawn slaunch using the job allocation from the previous salloc +set slaunch_pid [spawn $slaunch --jobid=$job_id -n2 --overcommit $exit_script] +set slaunch_spawn_id $spawn_id +set spawn_id $slaunch_spawn_id +set sum 0 expect { - -re "Granted job allocation ($number)" { - set job_id $expect_out(1,string) - exp_continue - } - -re "salloc_exit_code_11" { - send_user "Above error are expected, no worries\n" - incr matches + -re "exit code ($number)" { + send_user "This error is expected, no worries\n" + incr sum $expect_out(1,string) exp_continue } timeout { - if {$job_id == 0} { - kill_salloc - } else { - cancel_job $job_id - } + send_user "\nFAILURE: slaunch not responding\n" + slow_kill $slaunch_pid + set 
exit_code 1 + exp_continue } eof { - wait + set slaunch_rc [lindex [wait] 3] } } -if {$matches != 1} { - send_user "\nFAILURE: salloc's exit code is bad\n" +if {$sum != 21} { + send_user "\nFAILURE: slaunch failed to report individual task error codes\n" set exit_code 1 } +send_user "slaunch rc = $slaunch_rc\n" +if {$slaunch_rc != 11} { + send_user "\nFAILURE: slaunch's return code was not the maximum task return code\n" + set exit_code 1 +} + +# signal the entire process group of salloc to kill the "sleep 300" +exec $bin_kill -s INT "-$salloc_pid" +set salloc_rc [lindex [wait -i $salloc_spawn_id] 3] +# We don't care about salloc's return code (it is probably 1 because +# we are killing the sleep command with SIGINT). +#send_user "salloc rc = $salloc_rc\n" # # Post-processing diff --git a/testsuite/expect/test18.9 b/testsuite/expect/test18.9 index 91be0eaff39eb9c55605d92d6766b7a5dfcd618a..c09f171bb2a9b2b9ba5dce5675df2fcf61210b25 100755 --- a/testsuite/expect/test18.9 +++ b/testsuite/expect/test18.9 @@ -66,7 +66,7 @@ expect { incr matches exp_continue } - -re "task.0,2-9.: running" { # bracket escape not working + -re "task\\\[0,2-9]: running" { incr matches exp_continue }