diff --git a/src/slurmctld/reservation.c b/src/slurmctld/reservation.c index 3a15797ce4ca486ca668f0051e889ddc50f2486d..95ab28085aaf2e6f4c67935cb234fff24f33e4f7 100644 --- a/src/slurmctld/reservation.c +++ b/src/slurmctld/reservation.c @@ -1896,7 +1896,8 @@ extern int load_all_resv_state(int recover) unpack_error: _validate_all_reservations(); - error("Incomplete reservation data checkpoint file"); + if (state_fd >= 0) + error("Incomplete reservation data checkpoint file"); info("Recovered state of %d reservations", list_count(resv_list)); if (resv_ptr) _del_resv_rec(resv_ptr); diff --git a/src/srun/Makefile.am b/src/srun/Makefile.am index f16818e2a87d674ac7f74aa09bd64ca21457c46f..5972d540d569bee52fa689540e7f8e57011fe12f 100644 --- a/src/srun/Makefile.am +++ b/src/srun/Makefile.am @@ -21,6 +21,7 @@ srun_SOURCES = \ core-format.c \ core-format.h \ multi_prog.c multi_prog.h \ + task_state.c task_state.h \ srun.wrapper.c convenience_libs = \ diff --git a/src/srun/Makefile.in b/src/srun/Makefile.in index cb6b25ee16284e31c5f41ba3604048194b409a1e..59f6f49f96afb53865aa89fcdbc0103e537051dc 100644 --- a/src/srun/Makefile.in +++ b/src/srun/Makefile.in @@ -78,7 +78,7 @@ PROGRAMS = $(bin_PROGRAMS) am_srun_OBJECTS = srun.$(OBJEXT) opt.$(OBJEXT) srun_job.$(OBJEXT) \ srun_pty.$(OBJEXT) debugger.$(OBJEXT) fname.$(OBJEXT) \ allocate.$(OBJEXT) core-format.$(OBJEXT) multi_prog.$(OBJEXT) \ - srun.wrapper.$(OBJEXT) + task_state.$(OBJEXT) srun.wrapper.$(OBJEXT) srun_OBJECTS = $(am_srun_OBJECTS) am__DEPENDENCIES_1 = $(top_builddir)/src/api/libslurm.o srun_DEPENDENCIES = $(am__DEPENDENCIES_1) @@ -292,6 +292,7 @@ srun_SOURCES = \ core-format.c \ core-format.h \ multi_prog.c multi_prog.h \ + task_state.c task_state.h \ srun.wrapper.c convenience_libs = \ @@ -382,6 +383,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/srun.wrapper.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/srun_job.Po@am__quote@ @AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/srun_pty.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/task_state.Po@am__quote@ .c.o: @am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< diff --git a/src/srun/srun.c b/src/srun/srun.c index 53575f5aa0e1db51702cb6cae4bbad6ba746f0df..02263c884b3bd199dc3b63a8b1d717dfe511a55b 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -93,6 +93,7 @@ #include "src/srun/srun.h" #include "src/srun/srun_pty.h" #include "src/srun/multi_prog.h" +#include "src/srun/task_state.h" #include "src/api/pmi_server.h" #include "src/api/step_launch.h" @@ -120,19 +121,13 @@ mpi_plugin_client_info_t mpi_job_info[1]; static struct termios termdefaults; uint32_t global_rc = 0; srun_job_t *job = NULL; +task_state_t task_state; #define MAX_STEP_RETRIES 4 time_t launch_start_time; bool retry_step_begin = false; int retry_step_cnt = 0; -struct { - bitstr_t *start_success; - bitstr_t *start_failure; - bitstr_t *finish_normal; - bitstr_t *finish_abnormal; -} task_state; - /* * forward declaration of static funcs */ @@ -157,9 +152,6 @@ static int _set_umask_env(void); static int _slurm_debug_env_val (void); static void _task_start(launch_tasks_response_msg_t *msg); static void _task_finish(task_exit_msg_t *msg); -static void _task_state_struct_init(int num_tasks); -static void _task_state_struct_print(void); -static void _task_state_struct_free(void); static char *_uint16_array_to_str(int count, const uint16_t *array); int srun(int ac, char **av) @@ -359,7 +351,7 @@ int srun(int ac, char **av) xfree(env); re_launch: - _task_state_struct_init(opt.nprocs); + task_state = task_state_create(opt.nprocs); slurm_step_launch_params_t_init(&launch_params); launch_params.gid = opt.gid; launch_params.argc = opt.argc; @@ -453,6 +445,7 @@ int srun(int ac, char **av) if (create_job_step(job, true) < 0) exit(1); } + task_state_destroy(task_state); goto re_launch; } @@ -464,7 +457,7 @@ cleanup: _run_srun_epilog(job); 
slurm_step_ctx_destroy(job->step_ctx);
 	mpir_cleanup();
-	_task_state_struct_free();
+	task_state_destroy(task_state);
 	log_fini();
 
 	return (int)global_rc;
@@ -927,9 +920,9 @@ _task_start(launch_tasks_response_msg_t *msg)
 		table->pid = msg->local_pids[i];
 
 		if (msg->return_code == 0) {
-			bit_set(task_state.start_success, taskid);
+			task_state_update(task_state, taskid, TS_START_SUCCESS);
 		} else {
-			bit_set(task_state.start_failure, taskid);
+			task_state_update(task_state, taskid, TS_START_FAILURE);
 		}
 	}
 
@@ -943,6 +936,7 @@ _terminate_job_step(slurm_step_ctx_t *step_ctx)
 	slurm_step_ctx_get(step_ctx, SLURM_STEP_CTX_JOBID, &job_id);
 	slurm_step_ctx_get(step_ctx, SLURM_STEP_CTX_STEPID, &step_id);
 	info("Terminating job step %u.%u", job_id, step_id);
+	update_job_state(job, SRUN_JOB_CANCELLED);
 	slurm_kill_job_step(job_id, step_id, SIGKILL);
 }
 
@@ -950,7 +944,7 @@ static void
 _handle_max_wait(int signo)
 {
 	info("First task exited %ds ago", opt.max_wait);
-	_task_state_struct_print();
+	task_state_print(task_state, (log_f) info);
 	_terminate_job_step(job->step_ctx);
 }
 
@@ -985,6 +979,32 @@ _taskids_to_nodelist(bitstr_t *tasks_exited)
 	return hostlist_str;
 }
 
+static void
+_update_task_exit_state(uint32_t ntasks, uint32_t taskids[], int abnormal)
+{
+	uint32_t i;	/* same type as ntasks: no signed/unsigned compare */
+	task_state_type_t t = abnormal ? TS_ABNORMAL_EXIT : TS_NORMAL_EXIT;
+	/* Record one exit event of type t for every id in taskids[]. */
+	for (i = 0; i < ntasks; i++)
+		task_state_update(task_state, taskids[i], t);
+}
+/* True if opt.kill_bad_exit or slurm_get_kill_on_bad_exit() is set. */
+static int _kill_on_bad_exit(void)
+{
+	return (opt.kill_bad_exit || slurm_get_kill_on_bad_exit());
+}
+/* Arm SIGALRM so _handle_max_wait() runs after opt.max_wait seconds. */
+static void _setup_max_wait_timer(void)
+{
+	/* If these are the first tasks to finish we need to
+	 * start a timer to kill off the job step if the other
+	 * tasks don't finish within opt.max_wait seconds.
+	 */
+	verbose("First task exited. Terminating job in %ds.", opt.max_wait);
+	xsignal(SIGALRM, _handle_max_wait);
+	alarm(opt.max_wait);
+}
+
 static void
 _task_finish(task_exit_msg_t *msg)
 {
@@ -1004,7 +1024,8 @@ _task_finish(task_exit_msg_t *msg)
 	if (WIFEXITED(msg->return_code)) {
 		rc = WEXITSTATUS(msg->return_code);
 		if (rc != 0) {
-			bit_or(task_state.finish_abnormal, tasks_exited);
+			_update_task_exit_state(msg->num_tasks,
+					msg->task_id_list, 1);
 			node_list = _taskids_to_nodelist(tasks_exited);
 			if ((rc == OPEN_MPI_PORT_ERROR) &&
 			    (opt.resv_port_cnt != NO_VAL) &&
@@ -1029,11 +1050,13 @@ _task_finish(task_exit_msg_t *msg)
 				      node_list, buf, rc);
 			}
 		} else {
-			bit_or(task_state.finish_normal, tasks_exited);
 			verbose("task %s: Completed", buf);
+			_update_task_exit_state(msg->num_tasks,
+					msg->task_id_list, 0);
 		}
+	/* A task killed by a signal is always recorded as an abnormal exit. */
 	} else if (WIFSIGNALED(msg->return_code)) {
-		bit_or(task_state.finish_abnormal, tasks_exited);
+		_update_task_exit_state(msg->num_tasks, msg->task_id_list, 1);
 		msg_str = strsignal(WTERMSIG(msg->return_code));
 #ifdef WCOREDUMP
 		if (WCOREDUMP(msg->return_code))
@@ -1052,102 +1075,25 @@ _task_finish(task_exit_msg_t *msg)
 	}
 	xfree(node_list);
 	bit_free(tasks_exited);
+
+	/*
+	 * Update global srun return code
+	 */
 	global_rc = MAX(global_rc, rc);
 
-	if (first_error && (rc > 0) &&
-	    (opt.kill_bad_exit || slurm_get_kill_on_bad_exit())) {
+	if (first_error && (task_state_abnormal_count(task_state) > 0) &&
+	    _kill_on_bad_exit()) {
+		_terminate_job_step(job->step_ctx);
 		first_error = false;
-		_terminate_job_step(job->step_ctx);
-	} else if (first_done && opt.max_wait > 0) {
-		/* If these are the first tasks to finish we need to
-		 * start a timer to kill off the job step if the other
-		 * tasks don't finish within opt.max_wait seconds.
- */ - first_done = false; - debug2("First task has exited"); - xsignal(SIGALRM, _handle_max_wait); - verbose("starting alarm of %d seconds", opt.max_wait); - alarm(opt.max_wait); - } -} - -static void -_task_state_struct_init(int num_tasks) -{ - task_state.start_success = bit_alloc(num_tasks); - task_state.start_failure = bit_alloc(num_tasks); - task_state.finish_normal = bit_alloc(num_tasks); - task_state.finish_abnormal = bit_alloc(num_tasks); -} - -/* - * Tasks will most likely have bits set in multiple of the task_state - * bit strings (e.g. a task can start normally and then later exit normally) - * so we ensure that a task is only "seen" once. - */ -static void -_task_state_struct_print(void) -{ - bitstr_t *tmp, *seen, *not_seen; - char buf[65536]; - int len; - - len = bit_size(task_state.finish_abnormal); /* all the same length */ - tmp = bit_alloc(len); - seen = bit_alloc(len); - not_seen = bit_alloc(len); - bit_not(not_seen); - - if (bit_set_count(task_state.finish_abnormal) > 0) { - bit_copybits(tmp, task_state.finish_abnormal); - bit_and(tmp, not_seen); - bit_fmt(buf, sizeof(buf), tmp); - info("task %s: exited abnormally", buf); - bit_or(seen, tmp); - bit_copybits(not_seen, seen); - bit_not(not_seen); - } - - if (bit_set_count(task_state.finish_normal) > 0) { - bit_copybits(tmp, task_state.finish_normal); - bit_and(tmp, not_seen); - bit_fmt(buf, sizeof(buf), tmp); - info("task %s: exited", buf); - bit_or(seen, tmp); - bit_copybits(not_seen, seen); - bit_not(not_seen); - } - - if (bit_set_count(task_state.start_failure) > 0) { - bit_copybits(tmp, task_state.start_failure); - bit_and(tmp, not_seen); - bit_fmt(buf, sizeof(buf), tmp); - info("task %s: failed to start", buf); - bit_or(seen, tmp); - bit_copybits(not_seen, seen); - bit_not(not_seen); } - if (bit_set_count(task_state.start_success) > 0) { - bit_copybits(tmp, task_state.start_success); - bit_and(tmp, not_seen); - bit_fmt(buf, BUFSIZ, tmp); - info("task %s: running", buf); - bit_or(seen, tmp); - 
bit_copybits(not_seen, seen); - bit_not(not_seen); + if (first_done && (task_state_exited_count(task_state) > 0) && + (opt.max_wait > 0)) { + _setup_max_wait_timer(); + first_done = false; } } -static void -_task_state_struct_free(void) -{ - bit_free(task_state.start_success); - bit_free(task_state.start_failure); - bit_free(task_state.finish_normal); - bit_free(task_state.finish_abnormal); -} - static void _handle_intr() { static time_t last_intr = 0; @@ -1163,7 +1109,7 @@ static void _handle_intr() info("interrupt (one more within 1 sec to abort)"); else info("interrupt (abort already in progress)"); - _task_state_struct_print(); + task_state_print(task_state, (log_f) info); last_intr = time(NULL); } else { /* second Ctrl-C in half as many seconds */ update_job_state(job, SRUN_JOB_CANCELLED); diff --git a/src/srun/task_state.c b/src/srun/task_state.c new file mode 100644 index 0000000000000000000000000000000000000000..b4a0e95cdca8223ccfa3f4a7963678d8c3077dea --- /dev/null +++ b/src/srun/task_state.c @@ -0,0 +1,183 @@ +/*****************************************************************************\ + * src/srun/task_state.c - task state container + * $Id$ + ***************************************************************************** + * Copyright (C) 2002 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Mark Grondona <mgrondona@llnl.gov>. + * CODE-OCEC-09-009. All rights reserved. + * + * This file is part of SLURM, a resource management program. + * For details, see <https://computing.llnl.gov/linux/slurm/>. + * Please also read the included file: DISCLAIMER. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+\*****************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#include <string.h>
+
+#include "src/common/xmalloc.h"
+#include "src/common/bitstring.h"
+#include "src/common/xassert.h"
+
+#include "src/srun/task_state.h"
+/* Task state container: event counters plus one bitmap per task state. */
+struct task_state_struct {
+	int n_tasks;		/* size passed to bit_alloc() for each bitmap */
+	int n_started;		/* TS_START_SUCCESS events recorded */
+	int n_abnormal;		/* TS_ABNORMAL_EXIT events recorded */
+	int n_exited;		/* exit events recorded, normal or abnormal */
+	bitstr_t *start_failed;	/* bit set on TS_START_FAILURE */
+	bitstr_t *running;	/* bit set on start, cleared on either exit */
+	bitstr_t *normal_exit;	/* bit set on TS_NORMAL_EXIT */
+	bitstr_t *abnormal_exit; /* bit set on TS_ABNORMAL_EXIT */
+};
+/* Allocate a container and its four ntasks-bit bitmaps; returns the handle. */
+task_state_t task_state_create (int ntasks)
+{
+	task_state_t ts = xmalloc (sizeof (*ts));
+
+	/* ts is zero filled by xmalloc() */
+	ts->n_tasks = ntasks;
+	ts->running = bit_alloc (ntasks);
+	ts->start_failed = bit_alloc (ntasks);
+	ts->normal_exit = bit_alloc (ntasks);
+	ts->abnormal_exit = bit_alloc (ntasks);
+
+	return (ts);
+}
+/* Free the bitmaps and then ts itself; a NULL ts is ignored. */
+void task_state_destroy (task_state_t ts)
+{
+	if (ts == NULL)
+		return;
+	if (ts->start_failed)
+		bit_free (ts->start_failed);
+	if (ts->running)
+		bit_free (ts->running);
+	if (ts->normal_exit)
+		bit_free (ts->normal_exit);
+	if (ts->abnormal_exit)
+		bit_free (ts->abnormal_exit);
+	xfree (ts);
+}
+/* Return the enumerator name of t, or "Unknown", for the debug3 message. */
+static const char *_task_state_type_str (task_state_type_t t)
+{
+	switch (t) {
+	case TS_START_SUCCESS:
+		return ("TS_START_SUCCESS");
+	case TS_START_FAILURE:
+		return ("TS_START_FAILURE");
+	case TS_NORMAL_EXIT:
+		return ("TS_NORMAL_EXIT");
+	case TS_ABNORMAL_EXIT:
+		return ("TS_ABNORMAL_EXIT");
+	}
+	return ("Unknown");
+}
+/* Record event t for task taskid: update the matching bitmap and counters. */
+void task_state_update (task_state_t ts, int taskid, task_state_type_t t)
+{
+	xassert (ts != NULL);
+	xassert (taskid >= 0);
+	xassert (taskid < ts->n_tasks);
+
+	debug3("task_state_update(taskid=%d, %s)",
+	       taskid, _task_state_type_str (t));
+
+	switch (t) {
+	case TS_START_SUCCESS:
+		bit_set (ts->running, taskid);
+		ts->n_started++;
+		break;
+	case TS_START_FAILURE:
+		bit_set (ts->start_failed, taskid);
+		break;
+	case TS_NORMAL_EXIT:
+		bit_set (ts->normal_exit, taskid);
+		bit_clear (ts->running, taskid);
+		ts->n_exited++;
+		break;
+	case TS_ABNORMAL_EXIT:
+		bit_clear (ts->running, taskid);
+		bit_set (ts->abnormal_exit, taskid);
+		ts->n_exited++;
+		ts->n_abnormal++;
+		break;
+	}
+	/* Sanity check: n_exited must equal the bits set in both exit bitmaps. */
+	xassert ((bit_set_count(ts->abnormal_exit) +
+	          bit_set_count(ts->normal_exit)) == ts->n_exited);
+}
+/* Number of exit events (normal or abnormal) recorded so far. */
+int task_state_exited_count (task_state_t ts)
+{
+	return (ts->n_exited);
+}
+/* Number of abnormal exit events recorded so far. */
+int task_state_abnormal_count (task_state_t ts)
+{
+	return (ts->n_abnormal);
+}
+/* Log "task"/"tasks", the bit_fmt() rendering of b, and msg through fn. */
+static void _do_log_msg (bitstr_t *b, log_f fn, const char *msg)
+{
+	char buf [65536];
+	char *s = bit_set_count (b) == 1 ? "" : "s";
+	(*fn) ("task%s %s: %s\n", s, bit_fmt (buf, sizeof(buf), b), msg);
+}
+/* Log each non-empty state set through fn, then the rest as "unknown". */
+void task_state_print (task_state_t ts, log_f fn)
+{
+	bitstr_t *unseen = bit_alloc (ts->n_tasks);
+	/* unseen first accumulates every task reported; bit_not() inverts it. */
+	if (bit_set_count (ts->start_failed)) {
+		_do_log_msg (ts->start_failed, fn, "failed to start");
+		bit_or (unseen, ts->start_failed);
+	}
+	if (bit_set_count (ts->running)) {
+		_do_log_msg (ts->running, fn, "running");
+		bit_or (unseen, ts->running);
+	}
+	if (bit_set_count (ts->abnormal_exit)) {
+		_do_log_msg (ts->abnormal_exit, fn, "exited abnormally");
+		bit_or (unseen, ts->abnormal_exit);
+	}
+	if (bit_set_count (ts->normal_exit)) {
+		_do_log_msg (ts->normal_exit, fn, "exited");
+		bit_or (unseen, ts->normal_exit);
+	}
+	bit_not (unseen);
+	if (bit_set_count (unseen))
+		_do_log_msg (unseen, fn, "unknown");
+	bit_free (unseen);
+}
+
diff --git a/src/srun/task_state.h b/src/srun/task_state.h
new file mode 100644
index 0000000000000000000000000000000000000000..36d31cc38fdbd06cad6116d14b016a96be2d45e1
--- /dev/null
+++ b/src/srun/task_state.h
@@ -0,0 +1,66 @@
+/*****************************************************************************\
+ *  src/srun/task_state.h - task state container for srun
+ *  $Id$
+ *****************************************************************************
+ *  Copyright (C) 2002 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Mark Grondona <mgrondona@llnl.gov>. + * CODE-OCEC-09-009. All rights reserved. + * + * This file is part of SLURM, a resource management program. + * For details, see <https://computing.llnl.gov/linux/slurm/>. + * Please also read the included file: DISCLAIMER. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+\*****************************************************************************/
+
+#ifndef _HAVE_TASK_STATE_H
+#define _HAVE_TASK_STATE_H
+/* Opaque handle; struct task_state_struct is not defined in this header. */
+typedef struct task_state_struct * task_state_t;
+/* Events accepted by task_state_update(). */
+typedef enum {
+	TS_START_SUCCESS,	/* task started */
+	TS_START_FAILURE,	/* task failed to start */
+	TS_NORMAL_EXIT,		/* task exited normally */
+	TS_ABNORMAL_EXIT	/* task exited abnormally */
+} task_state_type_t;
+/* Allocate a task state container sized for ntasks tasks. */
+task_state_t task_state_create (int ntasks);
+/* Release a container from task_state_create(); NULL is ignored. */
+void task_state_destroy (task_state_t ts);
+/* Record event t for task taskid; taskid must be in [0, ntasks). */
+void task_state_update (task_state_t ts, int taskid, task_state_type_t t);
+/* Number of exit events (normal or abnormal) recorded so far. */
+int task_state_exited_count (task_state_t ts);
+/* Number of abnormal exit events recorded so far. */
+int task_state_abnormal_count (task_state_t ts);
+/* printf-style logging callback handed to task_state_print(). */
+typedef void (*log_f) (const char *, ...);
+/* Log the tasks in each state, one message per non-empty state, via fn. */
+void task_state_print (task_state_t ts, log_f fn);
+
+#endif /* !_HAVE_TASK_STATE_H */
diff --git a/testsuite/expect/test1.15 b/testsuite/expect/test1.15
index 67f4c56519f3badd46a6e47823d8e26e89080995..b47f9c8ee7c8c9fa8e05e93622c08e012ca53f8b 100755
--- a/testsuite/expect/test1.15
+++ b/testsuite/expect/test1.15
@@ -62,14 +62,10 @@ expect {
 		incr matches
 		exp_continue
 	}
-	-re "task \\\[0,2-9\\\]: running" {
+	-re "tasks \\\[0,2-9\\\]: running" {
 		incr matches
 		exp_continue
 	}
-	-re "Exited with exit code" {
-		send_user "This error is expected, no worries\n"
-		exp_continue
-	}
 	timeout {
 		send_user "\nFAILURE: srun not responding\n"
 		slow_kill $srun_pid
diff --git a/testsuite/expect/test1.60 b/testsuite/expect/test1.60
index faa4d9043985c548bcbd31b8ace1a98e6ca623be..09d003c4ae07c49a14e41082a82a6006bc5593f8 100755
--- a/testsuite/expect/test1.60
+++ b/testsuite/expect/test1.60
@@ -78,7 +78,7 @@ if {$job_id == 0} {
 }
 
 set node_count 0
-spawn $squeue -j $job_id -o "%i %D"
+spawn $squeue -tall -j $job_id -o "%i %D"
 expect {
 	-re "$job_id ($number)" {
 		set node_count $expect_out(1,string)