diff --git a/NEWS b/NEWS index 70b03228f71323b30192ce880afff8009132d5ff..3383d12dcfb85b965a9c9dea96cbde8071287157 100644 --- a/NEWS +++ b/NEWS @@ -5,8 +5,10 @@ documents those changes that are of interest to users and admins. ============================= -- Add select_g_reconfigure() function to node changes in slurmctld configuration that can impact node scheduling. - -- Scontrol to set/get partition's MaxTime and job's Timelimit in minutes plus + -- scontrol to set/get partition's MaxTime and job's Timelimit in minutes plus new formats: min:sec, hr:min:sec, days-hr:min:sec, days-hr, etc. + -- scontrol "notify" command added to send message to stdout of srun for + specified job id. * Changes in SLURM 1.3.0-pre4 ============================= diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 9daf680c401d68e12405473ebd23ddbfd72b071e..f124155889cb97bf397dbb5d5982f41d0aa8d9fa 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -38,6 +38,8 @@ COMMAND CHANGES for times now accept minutes, minutes:seconds, hours:minutes:seconds, days-hours, days-hours:minutes, days-hours:minutes:seconds or "UNLIMITED". * sacct -c can now be used to view job completion data +* scontrol "notify" command added to send message to stdout of srun for + specified job id. CONFIGURATION FILE CHANGES diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index ca92fee1f2ee1084d18d906d29bbd7d19d3f1973..c983e1361aab5aba516e2712d86e99387ff62b01 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -110,25 +110,30 @@ Do not display partitiion, job or jobs step information for partitions that are configured as hidden or partitions that are unavailable to the user's group. This is the default behavior. +.TP +\fBnotify\fP \fIjob_id\fP \fImessage\fP +Send a message to standard output of the srun command associated with the +specified \fIjob_id\fP. + .TP \fBoneliner\fP Print information one line per record. 
This is an independent command with no options meant for use in interactive mode. .TP -\fBpidinfo\fP \fIPROC_ID\fP +\fBpidinfo\fP \fIproc_id\fP Print the Slurm job id and scheduled termination time corresponding to the -supplied process id, \fIPROC_ID\fP, on the current node. This will work only +supplied process id, \fIproc_id\fP, on the current node. This will work only with processes on node on which scontrol is run, and only for those processes spawned by SLURM and their descendants. .TP -\fBlistpids\fP [JOBID[.STEPID]] [NodeName] +\fBlistpids\fP [\fIjob_id\fP[.\fIstep_id\fP]] [\fINodeName\fP] -Print a listing of the process IDs in a job step (if JOBID.STEPID is provided), +Print a listing of the process IDs in a job step (if \fIjob_id\fP.\fIstep_id\fP is provided), -or all of the job steps in a job (if JOBID is provided), or all of the job -steps in all of the jobs on the local node (if JOBID is not provided or JOBID -is "*"). This will work only with processes on the node on which -scontrol is run, and only for those processes spawned by SLURM and +or all of the job steps in a job (if \fIjob_id\fP is provided), or all of the job +steps in all of the jobs on the local node (if \fIjob_id\fP is not provided +or \fIjob_id\fP is "*"). This will work only with processes on the node on +which scontrol is run, and only for those processes spawned by SLURM and their descendants. Note that some SLURM configurations (\fIProctrackType\fP value of \fIpgid\fP or \fIaix\fP) are unable to identify all processes associated with a job or job step. 
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 7706496dff2891dded1617a66f0621ca6b2813fa..7072528a64930d11ce888d5f94c1c740e86725d3 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -1528,6 +1528,15 @@ extern int slurm_update_job PARAMS(( job_desc_msg_t * job_msg )) ; extern int slurm_get_select_jobinfo PARAMS((select_jobinfo_t jobinfo, enum select_data_type data_type, void *data)); +/* + * slurm_notify_job - send message to the job's stdout, + * usable only by a user that slurmctld's validate_super_user() accepts + * IN job_id - slurm job_id (NOTE(review): "0 for all jobs" is not visibly handled; the slurmctld side only calls find_job_record(job_id) - confirm) + * IN message - arbitrary message + * RET SLURM_SUCCESS, or SLURM_FAILURE on error + */ +extern int slurm_notify_job PARAMS(( uint32_t job_id, char *message )); + /*****************************************************************************\ * SLURM JOB STEP CONFIGURATION READ/PRINT/UPDATE FUNCTIONS \*****************************************************************************/ diff --git a/src/api/signal.c b/src/api/signal.c index 7d34621e4b064b9a7681fd95050c899cba0bf028..8139a63fca2628f5187dd86c17fb33eccadc2153 100644 --- a/src/api/signal.c +++ b/src/api/signal.c @@ -404,3 +404,37 @@ static int _terminate_batch_script_step( return rc; } +/* + * slurm_notify_job - send message to the job's stdout, + * usable only by a user that slurmctld's validate_super_user() accepts + * IN job_id - slurm job_id (NOTE(review): "0 for all jobs" is not visibly handled; the slurmctld side only calls find_job_record(job_id) - confirm) + * IN message - arbitrary message; only the pointer is stored in the request, the string is not copied here + * RET SLURM_SUCCESS, or SLURM_FAILURE on error + */ +extern int slurm_notify_job (uint32_t job_id, char *message) +{ + int rc; + slurm_msg_t msg; + job_notify_msg_t req; + + slurm_msg_t_init(&msg); + /* + * Request message: + */ + req.job_id = job_id; + req.job_step_id = NO_VAL; /* currently not used */ + req.message = message; + msg.msg_type = REQUEST_JOB_NOTIFY; + msg.data = &req; + + if (slurm_send_recv_controller_rc_msg(&msg, &rc) < 0) /* the exchange with the controller itself failed */ + return SLURM_FAILURE; + + if (rc) { /* controller answered with a non-zero return code */ + slurm_seterrno_ret(rc); /* presumably records rc as the slurm errno - confirm against its definition */ + return SLURM_FAILURE; + } + + return SLURM_SUCCESS; +} + diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index
c6a288d4fc293be132b03707ddd831f41a3369e1..05a709b16e892b4aaa1b2f8debcbce535f68995c 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -1286,3 +1286,10 @@ extern uint32_t slurm_get_return_code(slurm_msg_type_t type, void *data) return rc; } +void inline slurm_free_job_notify_msg(job_notify_msg_t * msg) /* frees msg->message, then msg itself; a NULL msg is ignored */ +{ + if (msg) { + xfree(msg->message); + xfree(msg); + } +} diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index c3ac8a3e9b2b2ccce64e9909a748f03d76def596..831f729d104d569bf9baa7827ad5fdc22927fe9e 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -142,6 +142,7 @@ typedef enum { REQUEST_JOB_READY, RESPONSE_JOB_READY, REQUEST_JOB_END_TIME, + REQUEST_JOB_NOTIFY, /* request whose data is a job_notify_msg_t */ REQUEST_JOB_STEP_CREATE = 5001, RESPONSE_JOB_STEP_CREATE, @@ -287,6 +288,12 @@ typedef struct job_step_kill_msg { uint16_t batch_flag; } job_step_kill_msg_t; +typedef struct job_notify_msg { /* body of a REQUEST_JOB_NOTIFY message */ + uint32_t job_id; /* job that slurmctld looks up with find_job_record() */ + uint32_t job_step_id; /* currently not used */ + char * message; /* text slurmctld hands to srun_user_message(); released by slurm_free_job_notify_msg() */ +} job_notify_msg_t; + typedef struct job_id_msg { uint32_t job_id; } job_id_msg_t; @@ -796,6 +803,7 @@ void inline slurm_free_step_complete_msg(step_complete_msg_t *msg); void inline slurm_free_stat_jobacct_msg(stat_jobacct_msg_t *msg); void inline slurm_free_node_select_msg( node_info_select_request_msg_t *msg); +void inline slurm_free_job_notify_msg(job_notify_msg_t * msg); extern int slurm_free_msg_data(slurm_msg_type_t type, void *data); extern uint32_t slurm_get_return_code(slurm_msg_type_t type, void *data); diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 16072f1da077ce7dcd364f7a5c140c00c58a4131..07bd55df9ec6d53301eccd8226c83bee9393f13e 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -326,6 +326,9 @@ static int _unpack_trigger_msg(trigger_info_msg_t ** msg_ptr , Buf buffer ); static void _pack_slurmd_status(slurmd_status_t *msg, Buf buffer); static
int _unpack_slurmd_status(slurmd_status_t **msg_ptr, Buf buffer); +static void _pack_job_notify(job_notify_msg_t *msg, Buf buffer); +static int _unpack_job_notify(job_notify_msg_t **msg_ptr, Buf buffer); + /* pack_header * packs a slurm protocol header that proceeds every slurm message * IN header - the header structure to pack @@ -689,6 +692,9 @@ pack_msg(slurm_msg_t const *msg, Buf buffer) case RESPONSE_SLURMD_STATUS: _pack_slurmd_status((slurmd_status_t *) msg->data, buffer); break; + case REQUEST_JOB_NOTIFY: + _pack_job_notify((job_notify_msg_t *) msg->data, buffer); + break; default: debug("No pack method for msg type %u", msg->msg_type); return EINVAL; @@ -1020,6 +1026,10 @@ unpack_msg(slurm_msg_t * msg, Buf buffer) rc = _unpack_slurmd_status((slurmd_status_t **) &msg->data, buffer); break; + case REQUEST_JOB_NOTIFY: + rc = _unpack_job_notify((job_notify_msg_t **) + &msg->data, buffer); + break; default: debug("No unpack method for msg type %u", msg->msg_type); return EINVAL; @@ -4427,6 +4437,38 @@ unpack_error: return SLURM_ERROR; } +static void _pack_job_notify(job_notify_msg_t *msg, Buf buffer) /* writes job_id, job_step_id, then message, in that order */ +{ + xassert(msg); + + pack32(msg->job_id, buffer); + pack32(msg->job_step_id, buffer); + packstr(msg->message, buffer); +} + +static int _unpack_job_notify(job_notify_msg_t **msg_ptr, Buf buffer) /* allocates a job_notify_msg_t, reads the fields in the order _pack_job_notify() writes them, and returns it via *msg_ptr */ +{ + uint16_t uint16_tmp; /* presumably receives the unpacked string length; not used afterwards */ + job_notify_msg_t *msg; + + xassert(msg_ptr); + + msg = xmalloc(sizeof(job_notify_msg_t)); + + safe_unpack32(&msg->job_id, buffer); + safe_unpack32(&msg->job_step_id, buffer); + safe_unpackstr_xmalloc(&msg->message, &uint16_tmp, buffer); + + *msg_ptr = msg; + return SLURM_SUCCESS; + +unpack_error: /* presumably the jump target of the safe_unpack*() macros on failure - confirm; frees the partial message and hands back NULL */ + xfree(msg->message); + xfree(msg); + *msg_ptr = NULL; + return SLURM_ERROR; +} + /* template void pack_ ( * msg , Buf buffer ) { diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c index 86fc0095ce20c59c55339cab93fb82b829a678cf..de132910dd4cc4e14e3effe730ac2207d361291e 100644 --- a/src/scontrol/scontrol.c +++ b/src/scontrol/scontrol.c @@
-572,7 +572,7 @@ _process_command (int argc, char *argv[]) argv[0]); } else { - error_code =scontrol_checkpoint(argv[1], argv[2]); + error_code = scontrol_checkpoint(argv[1], argv[2]); if (error_code) { exit_code = 1; if (quiet_flag != 1) @@ -594,7 +594,7 @@ _process_command (int argc, char *argv[]) "too few arguments for keyword:%s\n", argv[0]); } else { - error_code =scontrol_requeue(argv[1]); + error_code = scontrol_requeue(argv[1]); if (error_code) { exit_code = 1; if (quiet_flag != 1) @@ -773,6 +773,17 @@ _process_command (int argc, char *argv[]) argc <= 2 ? NULL : argv[2]); } } + else if (strncasecmp (argv[0], "notify", 6) == 0) { + if (argc < 3) { + exit_code = 1; + fprintf (stderr, + "too few arguments for keyword:%s\n", + argv[0]); + } else if (scontrol_job_notify(argc-1, &argv[1])) { + exit_code = 1; + slurm_perror("job notify failure"); + } + } else { exit_code = 1; fprintf (stderr, "invalid keyword: %s\n", argv[0]); @@ -996,6 +1007,7 @@ scontrol [<OPTION>] [<COMMAND>] \n\ scontrol is ran on, and only for those \n\ processes spawned by SLURM and their \n\ descendants) \n\ + notify <job_id> msg send message to specified job \n\ oneliner report output one record per line. \n\ pidinfo <pid> return slurm job information for given pid. \n\ ping print status of slurmctld daemons. 
\n\ diff --git a/src/scontrol/scontrol.h b/src/scontrol/scontrol.h index f943382c333f9e78287dbf4a50c8fb850bbea9ac..1de5fdd8f4d0504b53c4b767241ea99e34d1ccb2 100644 --- a/src/scontrol/scontrol.h +++ b/src/scontrol/scontrol.h @@ -99,6 +99,7 @@ extern int quiet_flag; /* quiet=1, verbose=-1, normal=0 */ extern int scontrol_checkpoint(char *op, char *job_step_id_str); extern int scontrol_encode_hostlist(char *hostlist); +extern int scontrol_job_notify(int argc, char *argv[]); extern int scontrol_load_jobs (job_info_msg_t ** job_buffer_pptr); extern int scontrol_load_nodes (node_info_msg_t ** node_buffer_pptr, uint16_t show_flags); diff --git a/src/scontrol/update_job.c b/src/scontrol/update_job.c index 809f39fa0d859904ddc95ccbf160dc61f4bde8ef..1a03924bb6e9dee62e0991a74956ec0a709dd886 100644 --- a/src/scontrol/update_job.c +++ b/src/scontrol/update_job.c @@ -448,3 +448,40 @@ scontrol_update_job (int argc, char *argv[]) else return 0; } + +/* + * Send message to stdout of specified job + * argv[0] == jobid + * argv[1]++ the message + * RET 0 on success, 1 if the job id is invalid, else the slurm errno + */ +extern int +scontrol_job_notify(int argc, char *argv[]) +{ + int i; + long job_id; + char *end_ptr = NULL; + char message[256]; + + /* parse as signed long: atoi() stored in a uint32_t let negative ids wrap past the <= 0 test */ + job_id = strtol(argv[0], &end_ptr, 10); + if ((end_ptr == argv[0]) || (*end_ptr != '\0') || + (job_id <= 0) || ((unsigned long) job_id > 0xffffffffUL)) { + fprintf(stderr, "Invalid job_id %s\n", argv[0]); + return 1; + } + + message[0] = '\0'; + for (i=1; i<argc; i++) { + /* strncat() takes the room still free (less the NUL), not the buffer size; overlong text is truncated */ + if (i > 1) + strncat(message, " ", sizeof(message) - strlen(message) - 1); + strncat(message, argv[i], sizeof(message) - strlen(message) - 1); + } + + if (slurm_notify_job((uint32_t) job_id, message)) + return slurm_get_errno (); + else + return 0; +} + diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 7934d99775d38f56c71a5d10c358084806f62c80..f4e973b0580e6788fa7872b808448a1d95880384 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -76,6 +76,7 @@ #include "src/slurmctld/read_config.h" #include "src/slurmctld/sched_plugin.h" #include "src/slurmctld/slurmctld.h" +#include "src/slurmctld/srun_comm.h" #include "src/slurmctld/state_save.h" #include "src/slurmctld/trigger_mgr.h" @@ -96,6
+97,7 @@ inline static void _slurm_rpc_dump_jobs(slurm_msg_t * msg); inline static void _slurm_rpc_dump_nodes(slurm_msg_t * msg); inline static void _slurm_rpc_dump_partitions(slurm_msg_t * msg); inline static void _slurm_rpc_epilog_complete(slurm_msg_t * msg); +inline static void _slurm_rpc_job_notify(slurm_msg_t * msg); inline static void _slurm_rpc_job_ready(slurm_msg_t * msg); inline static void _slurm_rpc_job_step_kill(slurm_msg_t * msg); inline static void _slurm_rpc_job_step_create(slurm_msg_t * msg); @@ -285,6 +287,10 @@ void slurmctld_req (slurm_msg_t * msg) _slurm_rpc_trigger_clear(msg); slurm_free_trigger_msg(msg->data); break; + case REQUEST_JOB_NOTIFY: + _slurm_rpc_job_notify(msg); + slurm_free_job_notify_msg(msg->data); + break; default: error("invalid RPC msg_type=%d", msg->msg_type); slurm_send_rc_msg(msg, EINVAL); @@ -2626,3 +2632,45 @@ inline static void _slurm_rpc_trigger_set(slurm_msg_t * msg) slurm_send_rc_msg(msg, rc); } + +/* + * _slurm_rpc_job_notify - process RPC asking slurmctld to pass a message to a job's srun + * Replies with ESLURM_USER_ID_MISSING unless validate_super_user() accepts the sender, + * and with ESLURM_INVALID_JOB_ID when find_job_record() finds no such job. + */ +inline static void _slurm_rpc_job_notify(slurm_msg_t * msg) +{ + int error_code = SLURM_SUCCESS; + /* Locks: read job */ + slurmctld_lock_t job_read_lock = { + NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; + uid_t uid; + job_notify_msg_t * notify_msg = (job_notify_msg_t *) msg->data; + DEF_TIMERS; + + START_TIMER; + debug("Processing RPC: REQUEST_JOB_NOTIFY"); + uid = g_slurm_auth_get_uid(msg->auth_cred); + if (!validate_super_user(uid)) { + error_code = ESLURM_USER_ID_MISSING; + error("Security violation, REQUEST_JOB_NOTIFY RPC from uid=%u", + (unsigned int) uid); + } + + if (error_code == SLURM_SUCCESS) { + /* do RPC call */ + struct job_record *job_ptr; + lock_slurmctld(job_read_lock); + job_ptr = find_job_record(notify_msg->job_id); + if (job_ptr) + srun_user_message(job_ptr, notify_msg->message); + else + error_code = ESLURM_INVALID_JOB_ID; + unlock_slurmctld(job_read_lock); + } + + END_TIMER2("_slurm_rpc_job_notify"); + debug("_slurm_rpc_job_notify job_id=%u rc=%d", notify_msg->job_id, error_code); /* replaces a stray info() that dumped the message text, even for rejected requests */ +
slurm_send_rc_msg(msg, error_code); +} +