From 09adf5005257bbde31c0de316067cf57ddd7757d Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Wed, 10 Oct 2007 23:06:30 +0000
Subject: [PATCH] scontrol "notify" command added to send message to stdout of
 srun for specified job id. Format is "scontrol notify <jobid> <message>"

---
 NEWS                             |  4 ++-
 RELEASE_NOTES                    |  2 ++
 doc/man/man1/scontrol.1          | 19 ++++++++------
 slurm/slurm.h.in                 |  9 +++++++
 src/api/signal.c                 | 34 +++++++++++++++++++++++++
 src/common/slurm_protocol_defs.c |  7 ++++++
 src/common/slurm_protocol_defs.h |  8 ++++++
 src/common/slurm_protocol_pack.c | 42 +++++++++++++++++++++++++++++++
 src/scontrol/scontrol.c          | 16 ++++++++++--
 src/scontrol/scontrol.h          |  1 +
 src/scontrol/update_job.c        | 32 ++++++++++++++++++++++++
 src/slurmctld/proc_req.c         | 43 ++++++++++++++++++++++++++++++++
 12 files changed, 207 insertions(+), 10 deletions(-)

diff --git a/NEWS b/NEWS
index 70b03228f71..3383d12dcfb 100644
--- a/NEWS
+++ b/NEWS
@@ -5,8 +5,10 @@ documents those changes that are of interest to users and admins.
 =============================
  -- Add select_g_reconfigure() function to node changes in slurmctld configuration
     that can impact node scheduling.
- -- Scontrol to set/get partition's MaxTime and job's Timelimit in minutes plus
+ -- scontrol to set/get partition's MaxTime and job's Timelimit in minutes plus
     new formats: min:sec, hr:min:sec, days-hr:min:sec, days-hr, etc.
+ -- scontrol "notify" command added to send message to stdout of srun for 
+    specified job id.
 
 * Changes in SLURM 1.3.0-pre4
 =============================
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 9daf680c401..f124155889c 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -38,6 +38,8 @@ COMMAND CHANGES
   for times now accept minutes, minutes:seconds, hours:minutes:seconds, 
   days-hours, days-hours:minutes, days-hours:minutes:seconds or "UNLIMITED".
 * sacct -c can now be used to view job completion data
+* scontrol "notify" command added to send message to stdout of srun for 
+  specified job id. 
 
 CONFIGURATION FILE CHANGES
 
diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1
index ca92fee1f2e..c983e1361aa 100644
--- a/doc/man/man1/scontrol.1
+++ b/doc/man/man1/scontrol.1
@@ -110,25 +110,30 @@ Do not display partitiion, job or jobs step information for partitions that are
 configured as hidden or partitions that are unavailable to the user's group. 
 This is the default behavior.
 
+.TP
+\fBnotify\fP \fIjob_id\fP \fImessage\fP
+Send a message to standard output of the srun command associated with the 
+specified \fIjob_id\fP.
+
 .TP
 \fBoneliner\fP
 Print information one line per record.
 This is an independent command with no options meant for use in interactive mode.
 
 .TP
-\fBpidinfo\fP \fIPROC_ID\fP
+\fBpidinfo\fP \fIproc_id\fP
 Print the Slurm job id and scheduled termination time corresponding to the 
-supplied process id, \fIPROC_ID\fP, on the current node.  This will work only
+supplied process id, \fIproc_id\fP, on the current node.  This will work only
 with processes on node on which scontrol is run, and only for those processes
 spawned by SLURM and their descendants.
 
 .TP
-\fBlistpids\fP [JOBID[.STEPID]] [NodeName]
+\fBlistpids\fP [\fIjob_id\fP[.\fIstep_id\fP]] [\fINodeName\fP]
 Print a listing of the process IDs in a job step (if JOBID.STEPID is provided),
-or all of the job steps in a job (if JOBID is provided), or all of the job
-steps in all of the jobs on the local node (if JOBID is not provided or JOBID
-is "*").  This will work only with processes on the node on which
-scontrol is run, and only for those processes spawned by SLURM and
+or all of the job steps in a job (if \fIjob_id\fP is provided), or all of the job
+steps in all of the jobs on the local node (if \fIjob_id\fP is not provided 
+or \fIjob_id\fP is "*").  This will work only with processes on the node on 
+which scontrol is run, and only for those processes spawned by SLURM and
 their descendants. Note that some SLURM configurations
 (\fIProctrackType\fP value of \fIpgid\fP or \fIaix\fP) 
 are unable to identify all processes associated with a job or job step.
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 7706496dff2..7072528a649 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -1528,6 +1528,15 @@ extern int slurm_update_job PARAMS(( job_desc_msg_t * job_msg )) ;
 extern int slurm_get_select_jobinfo PARAMS((select_jobinfo_t jobinfo,
 		enum select_data_type data_type, void *data));
 
+/*
+ * slurm_notify_job - send message to the stdout of the job's srun command,
+ *	slurmctld honors the request only for a super user
+ * IN job_id - slurm job_id of the job to be notified
+ * IN message - arbitrary message
+ * RET 0 or -1 on error
+ */
+extern int slurm_notify_job PARAMS(( uint32_t job_id, char *message ));
+
 /*****************************************************************************\
  *	SLURM JOB STEP CONFIGURATION READ/PRINT/UPDATE FUNCTIONS
 \*****************************************************************************/
diff --git a/src/api/signal.c b/src/api/signal.c
index 7d34621e4b0..8139a63fca2 100644
--- a/src/api/signal.c
+++ b/src/api/signal.c
@@ -404,3 +404,37 @@ static int _terminate_batch_script_step(
 	return rc;
 }
 
+/*
+ * slurm_notify_job - ask the controller to deliver a message to a job
+ *	(slurmctld accepts the request only from a super user)
+ * IN job_id - slurm job_id
+ * IN message - arbitrary message text, passed through unchanged
+ * RET SLURM_SUCCESS, or SLURM_FAILURE on error
+ */
+extern int slurm_notify_job (uint32_t job_id, char *message)
+{
+	int ctld_rc;
+	slurm_msg_t req_msg;
+	job_notify_msg_t notify;
+
+	slurm_msg_t_init(&req_msg);
+
+	/* Build the REQUEST_JOB_NOTIFY payload */
+	notify.job_id      = job_id;
+	notify.job_step_id = NO_VAL;	/* currently not used */
+	notify.message     = message;
+
+	req_msg.msg_type = REQUEST_JOB_NOTIFY;
+	req_msg.data     = &notify;
+
+	/* Send it and collect the controller's return code */
+	if (slurm_send_recv_controller_rc_msg(&req_msg, &ctld_rc) < 0)
+		return SLURM_FAILURE;
+	if (ctld_rc == 0)
+		return SLURM_SUCCESS;
+
+	/* Non-zero reply: hand it to the slurm errno machinery */
+	slurm_seterrno_ret(ctld_rc);
+	return SLURM_FAILURE;
+}
+
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index c6a288d4fc2..05a709b16e8 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -1286,3 +1286,10 @@ extern uint32_t slurm_get_return_code(slurm_msg_type_t type, void *data)
 	return rc;
 }
 
+void inline slurm_free_job_notify_msg(job_notify_msg_t * msg)
+{
+	if (!msg)		/* NULL message: nothing to release */
+		return;
+	xfree(msg->message);	/* the text first, then its container */
+	xfree(msg);
+}
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index c3ac8a3e9b2..831f729d104 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -142,6 +142,7 @@ typedef enum {
 	REQUEST_JOB_READY,
 	RESPONSE_JOB_READY,
 	REQUEST_JOB_END_TIME,
+	REQUEST_JOB_NOTIFY,
 
 	REQUEST_JOB_STEP_CREATE = 5001,
 	RESPONSE_JOB_STEP_CREATE,
@@ -287,6 +288,12 @@ typedef struct job_step_kill_msg {
 	uint16_t batch_flag;
 } job_step_kill_msg_t;
 
+typedef struct job_notify_msg {
+	uint32_t job_id;	/* job to be notified */
+	uint32_t job_step_id;	/* currently not used */
+	char *   message;	/* text to deliver */
+} job_notify_msg_t;
+
 typedef struct job_id_msg {
 	uint32_t job_id;
 } job_id_msg_t;
@@ -796,6 +803,7 @@ void inline slurm_free_step_complete_msg(step_complete_msg_t *msg);
 void inline slurm_free_stat_jobacct_msg(stat_jobacct_msg_t *msg);
 void inline slurm_free_node_select_msg(
 		node_info_select_request_msg_t *msg);
+void inline slurm_free_job_notify_msg(job_notify_msg_t * msg);
 extern int slurm_free_msg_data(slurm_msg_type_t type, void *data);
 extern uint32_t slurm_get_return_code(slurm_msg_type_t type, void *data);
 
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index 16072f1da07..07bd55df9ec 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -326,6 +326,9 @@ static int  _unpack_trigger_msg(trigger_info_msg_t ** msg_ptr , Buf buffer );
 static void _pack_slurmd_status(slurmd_status_t *msg, Buf buffer);
 static int  _unpack_slurmd_status(slurmd_status_t **msg_ptr, Buf buffer);
 
+static void _pack_job_notify(job_notify_msg_t *msg, Buf buffer);
+static int  _unpack_job_notify(job_notify_msg_t **msg_ptr, Buf buffer);
+
 /* pack_header
  * packs a slurm protocol header that proceeds every slurm message
  * IN header - the header structure to pack
@@ -689,6 +692,9 @@ pack_msg(slurm_msg_t const *msg, Buf buffer)
 	case RESPONSE_SLURMD_STATUS:
 		_pack_slurmd_status((slurmd_status_t *) msg->data, buffer);
 		break;
+	case REQUEST_JOB_NOTIFY:
+		_pack_job_notify((job_notify_msg_t *) msg->data, buffer);
+		break;
 	default:
 		debug("No pack method for msg type %u", msg->msg_type);
 		return EINVAL;
@@ -1020,6 +1026,10 @@ unpack_msg(slurm_msg_t * msg, Buf buffer)
 		rc = _unpack_slurmd_status((slurmd_status_t **)
 					&msg->data, buffer);
 		break;
+	case REQUEST_JOB_NOTIFY:
+		rc =  _unpack_job_notify((job_notify_msg_t **)
+					 &msg->data, buffer);
+		break;
 	default:
 		debug("No unpack method for msg type %u", msg->msg_type);
 		return EINVAL;
@@ -4427,6 +4437,38 @@ unpack_error:
 	return SLURM_ERROR;
 }
 
+static void _pack_job_notify(job_notify_msg_t *msg, Buf buffer)
+{
+	xassert(msg);
+	/* Serialize msg; field order must match _unpack_job_notify() */
+	pack32(msg->job_id,      buffer);
+	pack32(msg->job_step_id, buffer);
+	packstr(msg->message,    buffer);
+}
+
+static int  _unpack_job_notify(job_notify_msg_t **msg_ptr, Buf buffer)
+{
+	uint16_t str_len;	/* length reported by the string unpack */
+	job_notify_msg_t *notify;
+
+	xassert(msg_ptr);
+
+	/* Fields are read in the order written by _pack_job_notify() */
+	notify = xmalloc(sizeof(job_notify_msg_t));
+	safe_unpack32(&notify->job_id,      buffer);
+	safe_unpack32(&notify->job_step_id, buffer);
+	safe_unpackstr_xmalloc(&notify->message, &str_len, buffer);
+
+	*msg_ptr = notify;
+	return SLURM_SUCCESS;
+
+unpack_error:	/* presumably reached via the safe_unpack*() macros */
+	xfree(notify->message);
+	xfree(notify);
+	*msg_ptr = NULL;
+	return SLURM_ERROR;
+}
+
 /* template 
    void pack_ ( * msg , Buf buffer )
    {
diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c
index 86fc0095ce2..de132910dd4 100644
--- a/src/scontrol/scontrol.c
+++ b/src/scontrol/scontrol.c
@@ -572,7 +572,7 @@ _process_command (int argc, char *argv[])
 				        argv[0]);
 		}
 		else {
-			error_code =scontrol_checkpoint(argv[1], argv[2]);
+			error_code = scontrol_checkpoint(argv[1], argv[2]);
 			if (error_code) {
 				exit_code = 1;
 				if (quiet_flag != 1)
@@ -594,7 +594,7 @@ _process_command (int argc, char *argv[])
 					"too few arguments for keyword:%s\n",
 					argv[0]);
 		} else {
-			error_code =scontrol_requeue(argv[1]);
+			error_code = scontrol_requeue(argv[1]);
 			if (error_code) {
 				exit_code = 1;
 				if (quiet_flag != 1)
@@ -773,6 +773,17 @@ _process_command (int argc, char *argv[])
 					    argc <= 2 ? NULL : argv[2]);
 		}
 	}
+	else if (strncasecmp (argv[0], "notify", 6) == 0) {
+		if (argc < 3) {
+			exit_code = 1;
+			fprintf (stderr, 
+				 "too few arguments for keyword:%s\n", 
+				 argv[0]);
+		} else if (scontrol_job_notify(argc-1, &argv[1])) {
+			exit_code = 1;
+			slurm_perror("job notify failure");
+		}
+	}
 	else {
 		exit_code = 1;
 		fprintf (stderr, "invalid keyword: %s\n", argv[0]);
@@ -996,6 +1007,7 @@ scontrol [<OPTION>] [<COMMAND>]                                            \n\
                               scontrol is ran on, and only for those       \n\
                               processes spawned by SLURM and their         \n\
                               descendants)                                 \n\
+     notify <job_id> msg      send message to specified job                \n\
      oneliner                 report output one record per line.           \n\
      pidinfo <pid>            return slurm job information for given pid.  \n\
      ping                     print status of slurmctld daemons.           \n\
diff --git a/src/scontrol/scontrol.h b/src/scontrol/scontrol.h
index f943382c333..1de5fdd8f4d 100644
--- a/src/scontrol/scontrol.h
+++ b/src/scontrol/scontrol.h
@@ -99,6 +99,7 @@ extern int quiet_flag;	/* quiet=1, verbose=-1, normal=0 */
 
 extern int	scontrol_checkpoint(char *op, char *job_step_id_str);
 extern int	scontrol_encode_hostlist(char *hostlist);
+extern int	scontrol_job_notify(int argc, char *argv[]);
 extern int 	scontrol_load_jobs (job_info_msg_t ** job_buffer_pptr);
 extern int 	scontrol_load_nodes (node_info_msg_t ** node_buffer_pptr, 
 			uint16_t show_flags);
diff --git a/src/scontrol/update_job.c b/src/scontrol/update_job.c
index 809f39fa0d8..1a03924bb6e 100644
--- a/src/scontrol/update_job.c
+++ b/src/scontrol/update_job.c
@@ -448,3 +448,35 @@ scontrol_update_job (int argc, char *argv[])
 	else
 		return 0;
 }
+
+/*
+ * Send message to stdout of specified job
+ * argv[0] == jobid, a positive decimal number that fits in uint32_t
+ * argv[1]++ the message words; joined by spaces, truncated to buffer size
+ * RET 0 on success, 1 on invalid jobid, else the slurm errno
+ */
+extern int
+scontrol_job_notify(int argc, char *argv[])
+{
+	int i;
+	long job_id;
+	char message[256], *end_ptr = NULL;
+
+	job_id = strtol(argv[0], &end_ptr, 10);
+	if ((end_ptr == argv[0]) || (end_ptr[0] != '\0') ||
+	    (job_id <= 0) || (job_id != (long) (uint32_t) job_id)) {
+		fprintf(stderr, "Invalid job_id %s\n", argv[0]);
+		return 1;
+	}
+
+	message[0] = '\0';
+	for (i = 1; i < argc; i++) {
+		size_t len = strlen(message);	/* always < sizeof(message) */
+		snprintf(message + len, sizeof(message) - len, "%s%s",
+			 (i > 1) ? " " : "", argv[i]);
+	}
+
+	if (slurm_notify_job((uint32_t) job_id, message))
+		return slurm_get_errno ();
+	return 0;
+}
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index 7934d99775d..f4e973b0580 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -76,6 +76,7 @@
 #include "src/slurmctld/read_config.h"
 #include "src/slurmctld/sched_plugin.h"
 #include "src/slurmctld/slurmctld.h"
+#include "src/slurmctld/srun_comm.h"
 #include "src/slurmctld/state_save.h"
 #include "src/slurmctld/trigger_mgr.h"
 
@@ -96,6 +97,7 @@ inline static void  _slurm_rpc_dump_jobs(slurm_msg_t * msg);
 inline static void  _slurm_rpc_dump_nodes(slurm_msg_t * msg);
 inline static void  _slurm_rpc_dump_partitions(slurm_msg_t * msg);
 inline static void  _slurm_rpc_epilog_complete(slurm_msg_t * msg);
+inline static void  _slurm_rpc_job_notify(slurm_msg_t * msg);
 inline static void  _slurm_rpc_job_ready(slurm_msg_t * msg);
 inline static void  _slurm_rpc_job_step_kill(slurm_msg_t * msg);
 inline static void  _slurm_rpc_job_step_create(slurm_msg_t * msg);
@@ -285,6 +287,10 @@ void slurmctld_req (slurm_msg_t * msg)
 		_slurm_rpc_trigger_clear(msg);
 		slurm_free_trigger_msg(msg->data);
 		break;
+	case REQUEST_JOB_NOTIFY:
+		_slurm_rpc_job_notify(msg);
+		slurm_free_job_notify_msg(msg->data);
+		break;
 	default:
 		error("invalid RPC msg_type=%d", msg->msg_type);
 		slurm_send_rc_msg(msg, EINVAL);
@@ -2626,3 +2632,40 @@ inline static void  _slurm_rpc_trigger_set(slurm_msg_t * msg)
 
 	slurm_send_rc_msg(msg, rc);
 }
+
+/* _slurm_rpc_job_notify - process RPC asking to pass a message to a job */
+inline static void  _slurm_rpc_job_notify(slurm_msg_t * msg)
+{
+	int rc = SLURM_SUCCESS;
+	/* Locks: read job */
+	slurmctld_lock_t job_read_lock = {
+		NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
+	job_notify_msg_t * req = (job_notify_msg_t *) msg->data;
+	struct job_record *job_ptr;
+	uid_t uid;
+	DEF_TIMERS;
+
+	START_TIMER;
+	debug("Processing RPC: REQUEST_JOB_NOTIFY");
+	uid = g_slurm_auth_get_uid(msg->auth_cred);
+	if (validate_super_user(uid)) {
+		/* authorized: find the job under the job read lock */
+		lock_slurmctld(job_read_lock);
+		job_ptr = find_job_record(req->job_id);
+		if (job_ptr == NULL)
+			rc = ESLURM_INVALID_JOB_ID;
+		else
+			srun_user_message(job_ptr, req->message);
+		unlock_slurmctld(job_read_lock);
+	} else {
+		rc = ESLURM_USER_ID_MISSING;
+		error("Security violation, REQUEST_JOB_NOTIFY RPC from uid=%u",
+		      (unsigned int) uid);
+	}
+
+	END_TIMER2("_slurm_rpc_job_notify");
+	/* log the outcome, then return the code to the requester */
+	info("NOTIFY job %u: %s %d", req->job_id, req->message, rc);
+	slurm_send_rc_msg(msg, rc);
+}
+
-- 
GitLab