From 232ab3059fce18dc17ba3a14f6cb8313de65b365 Mon Sep 17 00:00:00 2001
From: Hongjia Cao <hjcao@nudt.edu.cn>
Date: Mon, 14 Jan 2013 09:58:17 -0800
Subject: [PATCH] Add SLURM_STEP_KILLED_MSG_NODE_ID environment variable

When jobs launched directly with srun end abnormally, there will be a
step-killed message (slurmd[cn123]: *** 1234.0 KILLED AT ... WITH
SIGNAL 9 ***) from each node, and/or a task-exit message (srun: error:
task[0-1]: Terminated) for each node. For large-scale jobs, these
messages become tedious and bury the other error messages. The two
attached patches (for slurm-2.5.1) introduce two environment variables
to control the output of such messages:

SLURM_STEP_KILLED_MSG_NODE_ID: if set, only the specified node will
print the step-killed-message;

SLURM_SRUN_REDUCE_TASK_EXIT_MSG: if set and non-zero, successive task
exit messages with the same exit code will be printed only once.
---
 doc/man/man1/sbatch.1       | 4 ++++
 doc/man/man1/srun.1         | 4 ++++
 src/slurmd/slurmstepd/req.c | 7 ++++++-
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1
index dfbe5e2e723..850e65b0c63 100644
--- a/doc/man/man1/sbatch.1
+++ b/doc/man/man1/sbatch.1
@@ -1388,6 +1388,10 @@ Same as \fB\-t, \-\-time\fR
 .TP
 \fBSBATCH_WAIT_ALL_NODES\fR
 Same as \fB\-\-wait\-all\-nodes\fR
+.TP
+\fBSLURM_STEP_KILLED_MSG_NODE_ID\fR=ID
+If set, only the node with the specified node ID will log a message when
+the job or step is killed by a signal.
 
 .SH "OUTPUT ENVIRONMENT VARIABLES"
 .PP
diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1
index b9a50b78f16..b9c6a3731ac 100644
--- a/doc/man/man1/srun.1
+++ b/doc/man/man1/srun.1
@@ -1832,6 +1832,10 @@ Same as \fB\-i, \-\-input\fR
 Same as \fB\-\-gres\fR (only applies to job steps, not to job allocations).
 Also see \fBSLURM_GRES\fR
 .TP
+\fBSLURM_STEP_KILLED_MSG_NODE_ID\fR=ID
+If set, only the node with the specified node ID will log a message when
+the job or step is killed by a signal.
+.TP
 \fBSLURM_STDOUTMODE\fR
 Same as \fB\-o, \-\-output\fR
 .TP
diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c
index d4ab8d45d0e..7effe5b2f71 100644
--- a/src/slurmd/slurmstepd/req.c
+++ b/src/slurmd/slurmstepd/req.c
@@ -48,6 +48,7 @@
 #include <unistd.h>
 #include <signal.h>
 #include <time.h>
+#include <stdlib.h>
 
 #include "src/common/cpu_frequency.h"
 #include "src/common/fd.h"
@@ -591,6 +592,7 @@ _handle_signal_process_group(int fd, slurmd_job_t *job, uid_t uid)
 {
 	int rc = SLURM_SUCCESS;
 	int signal;
+	char *ptr = NULL;
 
 	debug3("_handle_signal_process_group for job %u.%u",
 	      job->jobid, job->stepid);
@@ -628,8 +630,11 @@ _handle_signal_process_group(int fd, slurmd_job_t *job, uid_t uid)
 	/*
 	 * Print a message in the step output before killing when
 	 * SIGTERM or SIGKILL are sent
+	 * (if SLURM_STEP_KILLED_MSG_NODE_ID is set, on that node id only)
 	 */
-	if ((signal == SIGTERM) || (signal == SIGKILL)) {
+	ptr = getenvp(job->env, "SLURM_STEP_KILLED_MSG_NODE_ID");
+	if ((!ptr || atoi(ptr) == job->nodeid) &&
+	    ((signal == SIGTERM) || (signal == SIGKILL))) {
 		time_t now = time(NULL);
 		char entity[24], time_str[24];
 		if (job->stepid == SLURM_BATCH_SCRIPT) {
-- 
GitLab