From 4823a6488230ef6ca7b0c47cf5ff91374fcca28b Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Fri, 27 Oct 2017 09:41:51 -0600 Subject: [PATCH] Harden pack step SPANK test --- testsuite/expect/test38.6 | 13 ++++++++----- testsuite/expect/test38.6.prog.c | 30 +++++++++++++++++++++++------- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/testsuite/expect/test38.6 b/testsuite/expect/test38.6 index b84a2515ccc..802c70771d6 100755 --- a/testsuite/expect/test38.6 +++ b/testsuite/expect/test38.6 @@ -71,6 +71,7 @@ proc end_it { exit_code } { } else { send_user "\nFAILURE: This may be due to NFS sychronization problems: " send_user "Multiple processes on differrent nodes writing to the same file\n" + send_user "The SlurmdLogFile on each node should include SPANK logs for each step\n" } exit $exit_code } @@ -176,10 +177,10 @@ expect { # slurmd/slurmstepd on the compute node will not get the updated # configuration. # -$bin_sleep 30 +exec $bin_sleep 20 send_user "\n\nTest locally logged messages..........\n\n" make_bash_script $file_in " - $srun --mpi=none --test_suite_srun=5 --pack-group=0,1 $bin_uname + $srun --mpi=none --test_suite_srun=5 --pack-group=0,1 $bin_hostname " set job_id 0 @@ -290,7 +291,8 @@ if {[wait_for_file $spank_out] == 0} { set sbatch_arg $expect_out(2,string) set srun_arg $expect_out(3,string) - if {$spank_type eq "exit"} { + if {($spank_type eq "exit") && ($sbatch_arg eq "4")} { +# Skip (possible) external job containers incr matches_spank_exit } elseif {$spank_type eq "task_init"} { incr matches_spank_init @@ -315,6 +317,7 @@ if {[wait_for_file $spank_out] == 0} { wait } } + if {$matches_spank_exit != 3} { send_user "\nFAILURE: remote (slurmd - spank_exit) spank plugin failure ($matches_spank_exit != 3)\n" end_it 1 @@ -331,11 +334,11 @@ if {[wait_for_file $spank_out] == 0} { send_user "\nFAILURE: remote (slurmd) spank plugin failure ($matches != 6)\n" end_it 1 } else { - send_user "\n remote (slurmd) spank plugin success\n" + send_user "\nRemote (slurmd) spank plugin success\n" } } else { send_user "\nWARNING: This can be caused by the plugstack.conf file not\n" - send_user " being propagated to the compute node or not being in a\n" + send_user " being propagated to the compute nodes or not being in a\n" send_user " shared file system.\n" end_it 1 } diff --git a/testsuite/expect/test38.6.prog.c b/testsuite/expect/test38.6.prog.c index 78125097bad..08c29b27eaa 100644 --- a/testsuite/expect/test38.6.prog.c +++ b/testsuite/expect/test38.6.prog.c @@ -144,15 +144,19 @@ int slurm_spank_task_init(spank_t sp, int ac, char **av) uid_t my_uid; int argc, i; char **argv; + char hostname[64] = ""; + gethostname(hostname, sizeof(hostname)); if (opt_out_file) { FILE *fp = NULL; for (i = 0; (i < 10) && !fp; i++) fp = fopen(opt_out_file, "a"); if (!fp) return -1; - fprintf(fp, "%s: opt_arg_sbatch=%d opt_arg_srun=%d\n", - __func__, opt_arg_sbatch, opt_arg_srun); + if (opt_arg_sbatch || opt_arg_srun) + usleep(getpid() % 500000); /* Reduce NFS collisions */ + fprintf(fp, "%s: opt_arg_sbatch=%d opt_arg_srun=%d hostname=%s\n", + __func__, opt_arg_sbatch, opt_arg_srun, hostname); if (spank_get_item(sp, S_JOB_UID, &my_uid) == ESPANK_SUCCESS) fprintf(fp, "spank_get_item: my_uid=%d\n", my_uid); if (spank_get_item(sp, S_JOB_ARGV, &argc, &argv) == @@ -164,6 +168,11 @@ int slurm_spank_task_init(spank_t sp, int ac, char **av) } fclose(fp); } + + slurm_info("%s: opt_arg_sbatch=%d opt_arg_srun=%d hostname=%s out_file=%s", + __func__, opt_arg_sbatch, opt_arg_srun, hostname, + opt_out_file); + return 0; } @@ -176,19 +185,26 @@ int slurm_spank_task_exit(spank_t sp, int ac, char **av) */ /* Called from both srun and slurmd */ int slurm_spank_exit(spank_t sp, int ac, char **av) { + char hostname[64] = ""; int i; + gethostname(hostname, sizeof(hostname)); if (opt_out_file) { FILE *fp = NULL; for (i = 0; (i < 10) && !fp; i++) fp = fopen(opt_out_file, "a"); if (!fp) return -1; - fprintf(fp, "%s: opt_arg_sbatch=%d opt_arg_srun=%d\n", - __func__, opt_arg_sbatch, opt_arg_srun); + if (opt_arg_sbatch || opt_arg_srun) + usleep(getpid() % 500000); /* Reduce NFS collisions */ + fprintf(fp, "%s: opt_arg_sbatch=%d opt_arg_srun=%d hostname=%s\n", + __func__, opt_arg_sbatch, opt_arg_srun, hostname); fclose(fp); - } else - slurm_info("%s: opt_arg_sbatch=%d opt_arg_srun=%d", - __func__, opt_arg_sbatch, opt_arg_srun); + } + + slurm_info("%s: opt_arg_sbatch=%d opt_arg_srun=%d hostname=%s out_file=%s", + __func__, opt_arg_sbatch, opt_arg_srun, hostname, + opt_out_file); + return 0; } -- GitLab