From 232ab3059fce18dc17ba3a14f6cb8313de65b365 Mon Sep 17 00:00:00 2001 From: Hongjia Cao <hjcao@nudt.edu.cn> Date: Mon, 14 Jan 2013 09:58:17 -0800 Subject: [PATCH] Add SLURM_STEP_KILLED_MSG_NODE_ID environment variable When jobs launched directly with srun end abnormally, there will be a step-killed-message (slurmd[cn123]: *** 1234.0 KILLED AT ... WITH SIGNAL 9 ***) from each node, and/or a task-exit-message (srun: error: task[0-1]: Terminated) for each node. For large-scale jobs, these messages become tedious and bury the other error messages. The two attached patches (for slurm-2.5.1) introduce two environment variables to control the output of such messages: SLURM_STEP_KILLED_MSG_NODE_ID: if set, only the specified node will print the step-killed-message; SLURM_SRUN_REDUCE_TASK_EXIT_MSG: if set and non-zero, successive task exit messages with the same exit code will be printed only once. --- doc/man/man1/sbatch.1 | 4 ++++ doc/man/man1/srun.1 | 4 ++++ src/slurmd/slurmstepd/req.c | 7 ++++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index dfbe5e2e723..850e65b0c63 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -1388,6 +1388,10 @@ Same as \fB\-t, \-\-time\fR .TP \fBSBATCH_WAIT_ALL_NODES\fR Same as \fB\-\-wait\-all\-nodes\fR +.TP +\fBSLURM_STEP_KILLED_MSG_NODE_ID\fR=ID +If set, only the specified node will log when the job or step are killed +by a signal. .SH "OUTPUT ENVIRONMENT VARIABLES" .PP diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index b9a50b78f16..b9c6a3731ac 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -1832,6 +1832,10 @@ Same as \fB\-i, \-\-input\fR Same as \fB\-\-gres\fR (only applies to job steps, not to job allocations). Also see \fBSLURM_GRES\fR .TP +\fBSLURM_STEP_KILLED_MSG_NODE_ID\fR=ID +If set, only the specified node will log when the job or step are killed +by a signal. 
+.TP \fBSLURM_STDOUTMODE\fR Same as \fB\-o, \-\-output\fR .TP diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c index d4ab8d45d0e..7effe5b2f71 100644 --- a/src/slurmd/slurmstepd/req.c +++ b/src/slurmd/slurmstepd/req.c @@ -48,6 +48,7 @@ #include <unistd.h> #include <signal.h> #include <time.h> +#include <stdlib.h> #include "src/common/cpu_frequency.h" #include "src/common/fd.h" @@ -591,6 +592,7 @@ _handle_signal_process_group(int fd, slurmd_job_t *job, uid_t uid) { int rc = SLURM_SUCCESS; int signal; + char *ptr = NULL; debug3("_handle_signal_process_group for job %u.%u", job->jobid, job->stepid); @@ -628,8 +630,11 @@ _handle_signal_process_group(int fd, slurmd_job_t *job, uid_t uid) /* * Print a message in the step output before killing when * SIGTERM or SIGKILL are sent + * hjcao: print JOB/STEP KILLED msg on specific node id only */ - if ((signal == SIGTERM) || (signal == SIGKILL)) { + ptr = getenvp(job->env, "SLURM_STEP_KILLED_MSG_NODE_ID"); + if ((!ptr || atoi(ptr) == job->nodeid) && + ((signal == SIGTERM) || (signal == SIGKILL))) { time_t now = time(NULL); char entity[24], time_str[24]; if (job->stepid == SLURM_BATCH_SCRIPT) { -- GitLab