diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index dfbe5e2e7236a07f2a2d998c6dc6dbbb600a8592..850e65b0c633ab34d89f945c50eb0edc20d71f84 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -1388,6 +1388,10 @@ Same as \fB\-t, \-\-time\fR .TP \fBSBATCH_WAIT_ALL_NODES\fR Same as \fB\-\-wait\-all\-nodes\fR +.TP +\fBSLURM_STEP_KILLED_MSG_NODE_ID\fR=ID +If set, only the specified node will log when the job or step is killed +by a signal. .SH "OUTPUT ENVIRONMENT VARIABLES" .PP diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index b9a50b78f166b3b3ac7065048e946da99c74dfa8..b9c6a3731ac71f38f706872a1b63f6a697cc0f87 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -1832,6 +1832,10 @@ Same as \fB\-i, \-\-input\fR Same as \fB\-\-gres\fR (only applies to job steps, not to job allocations). Also see \fBSLURM_GRES\fR .TP +\fBSLURM_STEP_KILLED_MSG_NODE_ID\fR=ID +If set, only the specified node will log when the job or step is killed +by a signal. +.TP \fBSLURM_STDOUTMODE\fR Same as \fB\-o, \-\-output\fR .TP diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c index d4ab8d45d0e61fb0e7b0c4ad45015460e6ed59aa..7effe5b2f711e21346df6b3069dbfe9eb770c6e2 100644 --- a/src/slurmd/slurmstepd/req.c +++ b/src/slurmd/slurmstepd/req.c @@ -48,6 +48,7 @@ #include <unistd.h> #include <signal.h> #include <time.h> +#include <stdlib.h> #include "src/common/cpu_frequency.h" #include "src/common/fd.h" @@ -591,6 +592,7 @@ _handle_signal_process_group(int fd, slurmd_job_t *job, uid_t uid) { int rc = SLURM_SUCCESS; int signal; + char *ptr = NULL; debug3("_handle_signal_process_group for job %u.%u", job->jobid, job->stepid); @@ -628,8 +630,11 @@ _handle_signal_process_group(int fd, slurmd_job_t *job, uid_t uid) /* * Print a message in the step output before killing when * SIGTERM or SIGKILL are sent + * hjcao: print JOB/STEP KILLED msg on specific node id only */ - if ((signal == SIGTERM) || (signal == SIGKILL)) { + ptr = getenvp(job->env, 
"SLURM_STEP_KILLED_MSG_NODE_ID"); + if ((!ptr || atoi(ptr) == job->nodeid) && + ((signal == SIGTERM) || (signal == SIGKILL))) { time_t now = time(NULL); char entity[24], time_str[24]; if (job->stepid == SLURM_BATCH_SCRIPT) {