Skip to content
Snippets Groups Projects
Commit 96986199 authored by Hongjia Cao's avatar Hongjia Cao Committed by Morris Jette
Browse files

Add SLURM_SRUN_REDUCE_TASK_EXIT_MSG environment variable

When a job launched directly with srun ends abnormally, there is a
step-killed message (slurmd[cn123]: *** 1234.0 KILLED AT ... WITH
SIGNAL 9 ***) from each node and/or a task-exit message
(srun: error: task[0-1]: Terminated) for each node. For large-scale
jobs, these messages become tedious and bury the other error messages.
The two attached patches (for slurm-2.5.1) introduce two environment
variables to control the output of such messages:

SLURM_STEP_KILLED_MSG_NODE_ID: if set, only the specified node will
print the step-killed-message;

SLURM_SRUN_REDUCE_TASK_EXIT_MSG: if set and non-zero, successive task
exit messages with the same exit code will be printed only once.
parent 232ab305
No related branches found
No related tags found
No related merge requests found
......@@ -1828,6 +1828,10 @@ Same as \fB\-e, \-\-error\fR
\fBSLURM_STDINMODE\fR
Same as \fB\-i, \-\-input\fR
.TP
\fBSLURM_SRUN_REDUCE_TASK_EXIT_MSG\fR
If set and non-zero, successive task exit messages with the same exit code will
be printed only once.
.TP
\fBSLURM_STEP_GRES\fR
Same as \fB\-\-gres\fR (only applies to job steps, not to job allocations).
Also see \fBSLURM_GRES\fR
......
......@@ -288,9 +288,19 @@ static void _task_finish(task_exit_msg_t *msg)
char *hosts;
uint32_t rc = 0;
int normal_exit = 0;
static int reduce_task_exit_msg = -1;
static int msg_printed = 0, last_task_exit_rc;
const char *task_str = _taskstr(msg->num_tasks);
if (reduce_task_exit_msg == -1) {
char *ptr = getenv("SLURM_SRUN_REDUCE_TASK_EXIT_MSG");
if (ptr && atoi(ptr) != 0)
reduce_task_exit_msg = 1;
else
reduce_task_exit_msg = 0;
}
verbose("Received task exit notification for %d %s (status=0x%04x).",
msg->num_tasks, task_str, msg->return_code);
......@@ -306,8 +316,13 @@ static void _task_finish(task_exit_msg_t *msg)
_handle_openmpi_port_error(tasks, hosts,
local_srun_job->step_ctx);
} else {
error("%s: %s %s: Exited with exit code %d",
hosts, task_str, tasks, rc);
if (reduce_task_exit_msg == 0 ||
msg_printed == 0 ||
msg->return_code != last_task_exit_rc) {
error("%s: %s %s: Exited with exit code %d",
hosts, task_str, tasks, rc);
msg_printed = 1;
}
}
if (!WIFEXITED(*local_global_rc)
|| (rc > WEXITSTATUS(*local_global_rc)))
......@@ -325,8 +340,14 @@ static void _task_finish(task_exit_msg_t *msg)
hosts, task_str, tasks, signal_str, core_str);
} else {
rc = msg->return_code;
error("%s: %s %s: %s%s",
hosts, task_str, tasks, signal_str, core_str);
if (reduce_task_exit_msg == 0 ||
msg_printed == 0 ||
msg->return_code != last_task_exit_rc) {
error("%s: %s %s: %s%s",
hosts, task_str, tasks, signal_str,
core_str);
msg_printed = 1;
}
}
if (*local_global_rc == 0)
*local_global_rc = msg->return_code;
......@@ -344,6 +365,8 @@ static void _task_finish(task_exit_msg_t *msg)
if (task_state_first_exit(task_state) && (opt.max_wait > 0))
_setup_max_wait_timer();
last_task_exit_rc = msg->return_code;
}
/* Load the multi_prog config file into argv, pass the entire file contents
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment