From 13623e6da65af442f6ddddd37819a5df849d31c7 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Wed, 10 Feb 2010 21:21:35 +0000 Subject: [PATCH] Add scontrol ability to update job step time limits. --- NEWS | 1 + doc/man/Makefile.am | 4 +- doc/man/Makefile.in | 4 +- doc/man/man1/scontrol.1 | 25 +++- doc/man/man3/slurm_init_update_step_msg.3 | 1 + doc/man/man3/slurm_reconfigure.3 | 39 +----- doc/man/man3/slurm_update_job.3 | 154 +++++++++++++++++++++- doc/man/man3/slurm_update_step.3 | 1 + slurm/slurm.h.in | 22 +++- src/api/init_msg.c | 13 ++ src/api/update_config.c | 10 +- src/common/slurm_protocol_defs.c | 9 +- src/common/slurm_protocol_defs.h | 3 +- src/common/slurm_protocol_pack.c | 43 ++++++ src/scontrol/Makefile.am | 3 +- src/scontrol/Makefile.in | 7 +- src/scontrol/scontrol.c | 29 ++-- src/scontrol/scontrol.h | 8 +- src/scontrol/update_job.c | 4 +- src/scontrol/update_step.c | 117 ++++++++++++++++ src/slurmctld/proc_req.c | 40 +++++- src/slurmctld/slurmctld.h | 6 +- src/slurmctld/step_mgr.c | 53 +++++++- 23 files changed, 520 insertions(+), 76 deletions(-) create mode 100644 doc/man/man3/slurm_init_update_step_msg.3 create mode 100644 doc/man/man3/slurm_update_step.3 create mode 100644 src/scontrol/update_step.c diff --git a/NEWS b/NEWS index cfc70e3b38d..d4a76f85e60 100644 --- a/NEWS +++ b/NEWS @@ -10,6 +10,7 @@ documents those changes that are of interest to users and admins. killing an entire job if one step exceeds its memory limit. -- Added configuration parameter VSizeFactor to enforce virtual memory limits for jobs and job steps as a percentage of their real memory allocation. + -- Add scontrol ability to update job step time limits. 
* Changes in SLURM 2.2.0.pre1 ============================= diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am index 3e229630d17..6dc61e4d584 100644 --- a/doc/man/Makefile.am +++ b/doc/man/Makefile.am @@ -76,6 +76,7 @@ man3_MANS = man3/slurm_hostlist_create.3 \ man3/slurm_init_part_desc_msg.3 \ man3/slurm_init_resv_desc_msg.3 \ man3/slurm_init_update_node_msg.3 \ + man3/slurm_init_update_step_msg.3 \ man3/slurm_job_step_create.3 \ man3/slurm_job_step_launch_t_init.3 \ man3/slurm_job_step_layout_get.3 \ @@ -142,7 +143,8 @@ man3_MANS = man3/slurm_hostlist_create.3 \ man3/slurm_update_job.3 \ man3/slurm_update_node.3 \ man3/slurm_update_partition.3 \ - man3/slurm_update_reservation.3 + man3/slurm_update_reservation.3 \ + man3/slurm_update_step.3 man5_MANS = man5/bluegene.conf.5 \ man5/slurm.conf.5 \ diff --git a/doc/man/Makefile.in b/doc/man/Makefile.in index 7c8d19d1936..3f97a917b23 100644 --- a/doc/man/Makefile.in +++ b/doc/man/Makefile.in @@ -369,6 +369,7 @@ man3_MANS = man3/slurm_hostlist_create.3 \ man3/slurm_init_part_desc_msg.3 \ man3/slurm_init_resv_desc_msg.3 \ man3/slurm_init_update_node_msg.3 \ + man3/slurm_init_update_step_msg.3 \ man3/slurm_job_step_create.3 \ man3/slurm_job_step_launch_t_init.3 \ man3/slurm_job_step_layout_get.3 \ @@ -435,7 +436,8 @@ man3_MANS = man3/slurm_hostlist_create.3 \ man3/slurm_update_job.3 \ man3/slurm_update_node.3 \ man3/slurm_update_partition.3 \ - man3/slurm_update_reservation.3 + man3/slurm_update_reservation.3 \ + man3/slurm_update_step.3 man5_MANS = man5/bluegene.conf.5 \ man5/slurm.conf.5 \ diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index 23e1759987a..65890fe6917 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -1,4 +1,4 @@ -.TH SCONTROL "1" "September 2009" "scontrol 2.1" "Slurm components" +.TH SCONTROL "1" "February 2010" "scontrol 2.1" "Slurm components" .SH "NAME" scontrol \- Used view and modify Slurm configuration and state. 
@@ -294,8 +294,8 @@ primary SLURM controller is scheduled down. .TP \fBupdate\fP \fISPECIFICATION\fP -Update job, node, partition, or reservation configuration per the supplied -specification. \fISPECIFICATION\fP is in the same format as the Slurm +Update job, step, node, partition, or reservation configuration per the +supplied specification. \fISPECIFICATION\fP is in the same format as the Slurm configuration file and the output of the \fIshow\fP command described above. It may be desirable to execute the \fIshow\fP command (described above) on the specific entity you which to update, then use cut\-and\-paste tools to enter @@ -543,6 +543,25 @@ a further state such as RUNNING, COMPLETE, etc. In these cases the code explicitly returns zero for these values. These values are meaningless once the job resources have been allocated and the job has started. + +.TP +\fBSPECIFICATIONS FOR UPDATE COMMAND, STEPS\fR +.TP +\fIStepId\fP=<job_id>[.<step_id>] +Identify the step to be updated. +If the job_id is given, but no step_id is specified then all steps of +the identified job will be modified. +This specification is required. +.TP +\fITimeLimit\fP=<time> +The job step's time limit. +Output format is [days\-]hours:minutes:seconds or "UNLIMITED". +Input format (for \fBupdate\fR command) set is minutes, minutes:seconds, +hours:minutes:seconds, days\-hours, days\-hours:minutes or +days\-hours:minutes:seconds. +Time resolution is one minute and second values are rounded up to +the next minute. 
+ .TP \fBSPECIFICATIONS FOR UPDATE COMMAND, NODES\fR .TP diff --git a/doc/man/man3/slurm_init_update_step_msg.3 b/doc/man/man3/slurm_init_update_step_msg.3 new file mode 100644 index 00000000000..ba66f51f6b2 --- /dev/null +++ b/doc/man/man3/slurm_init_update_step_msg.3 @@ -0,0 +1 @@ +.so man3/slurm_update_job.3 diff --git a/doc/man/man3/slurm_reconfigure.3 b/doc/man/man3/slurm_reconfigure.3 index dabc039a4b8..7779f35d095 100644 --- a/doc/man/man3/slurm_reconfigure.3 +++ b/doc/man/man3/slurm_reconfigure.3 @@ -3,7 +3,7 @@ slurm_create_partition, slurm_create_reservation, slurm_delete_partition, slurm_delete_reservation, slurm_init_part_desc_msg, slurm_init_resv_desc_msg, -slurm_reconfigure, slurm_shutdown, slurm_takeover, slurm_update_job, +slurm_reconfigure, slurm_shutdown, slurm_takeover, ,slurm_init_update_node_msg slurm_update_node, slurm_update_partition, slurm_update_reservation \- Slurm administrative functions @@ -57,12 +57,6 @@ int \fBslurm_shutdown\fR ( .LP int \fBslurm_takeover\fR ( ); .LP -int \fBslurm_update_job\fR ( -.br - job_desc_msg_t *\fIupdate_job_msg_ptr\fP -.br -); -.LP void \fBslurm_init_update_node_msg\fR( .br update_node_msg_t *\fIupdate_node_msg_ptr\fP @@ -104,10 +98,6 @@ See slurm.h for full details on the data structure's contents. Specifies the pointer to a reservation delete request specification. See slurm.h for full details on the data structure's contents. .TP -\fIupdate_job_msg_ptr\fP -Specifies the pointer to a job update request specification. See slurm.h -for full details on the data structure's contents. -.TP \fIupdate_node_msg_ptr\fP Specifies the pointer to a node update request specification. See slurm.h for full details on the data structure's contents. @@ -168,17 +158,6 @@ function may only be successfully executed by user root. immediately and the backup controller take over. This function may only be successfully executed by user root. .LP -\fBslurm_update_job\fR Request that the configuration of a job be updated. 
Note -that most, but not all parameters of a job may be changed by this function. -Initialize the data structure using the \fBslurm_init_job_desc_msg\fR function -prior to setting values of the parameters to be changed. Note: -\fBslurm_init_job_desc_msg\fR is not equivalent to setting the data structure -values to zero. This function may only be successfully executed by user root. -Note the job priority of zero represents a job that will not be scheduled. -Slurm uses the priority one to represent jobs that can not be scheduled until -additional nodes are returned to service (i.e. not DOWN, DRAINED, or FAILED). -This permits lower priority jobs to utilize those resources which are available. -.LP \fBslurm_init_update_node_msg\fR Initialize the contents of an update mpde descriptor with default values. Note: \fBslurm_init_update_node_msg\fR is not equivalent to setting the data structure values to zero. Execute @@ -280,8 +259,6 @@ use an expired reservation. int main (int argc, char *argv[]) .br { -.br - job_desc_msg_t update_job_msg; .br update_node_msg_t update_node_msg; .br @@ -300,20 +277,6 @@ int main (int argc, char *argv[]) exit (1); .br } -.LP - slurm_init_job_desc_msg( &update_job_msg ); -.br - update_job_msg.job_id = 1234; -.br - update_job_msg time_limit = 200; -.br - if (slurm_update_job (&update_job_msg)) { -.br - slurm_perror ("slurm_update_job error"); -.br - exit (1); -.br - } .LP slurm_init_part_desc_msg ( &update_part_msg ); .br diff --git a/doc/man/man3/slurm_update_job.3 b/doc/man/man3/slurm_update_job.3 index 8c2ed98140d..789c73478d8 100644 --- a/doc/man/man3/slurm_update_job.3 +++ b/doc/man/man3/slurm_update_job.3 @@ -1 +1,153 @@ -.so man3/slurm_reconfigure.3 +.TH "Slurm API" "3" "February 2010" "Morris Jette" "Slurm job and step update functions" + +.SH "NAME" +slurm_init_job_desc_msg, slurm_init_update_step_msg, +slurm_update_job, slurm_update_step +\- Slurm job and step update functions + +.SH "SYNTAX" +.LP +#include <slurm/slurm.h> +.LP 
+void \fBslurm_init_job_desc_msg\fR ( +.br + job_desc_msg_t *\fIjob_msg\fP +.br +); +.LP +void \fBslurm_init_update_step_msg\fR ( +.br + step_update_request_msg_t * \fIstep_msg\fP +.br +); +.LP +int \fBslurm_update_job\fR ( +.br + job_desc_msg_t * \fIjob_msg\fP +.br +); +.LP +int \fBslurm_update_step\fR ( +.br + step_update_request_msg_t * \fIstep_msg\fP +.br +); + +.SH "ARGUMENTS" +.LP +.TP +\fIjob_msg\fP +Specifies the pointer to a job descriptor. +See slurm.h for full details on the data structure's contents. +.TP +\fIstep_msg\fP +Specifies the pointer to a step descriptor. +See slurm.h for full details on the data structure's contents. + +.SH "DESCRIPTION" +.LP +\fBslurm_init_job_desc_msg\fR Initialize the contents of a job descriptor with default values. +Execute this function before issuing a request to submit or modify a job. +.LP +\fBslurm_init_update_step_msg\fR Initialize the contents of a job step update +descriptor with default values. +Execute this function before issuing a request to modify a job step. +.LP +\fBslurm_update_job\fR Update a job with the changes made to the data +structure passed as an argument to the function. +.LP +\fBslurm_update_step\fR Update a job step with the changes made to the data +structure passed as an argument to the function. + +.SH "RETURN VALUE" +.LP +On success, zero is returned. On error, \-1 is returned, and the Slurm error +code is set appropriately. + +.SH "ERRORS" +.LP +\fBSLURM_PROTOCOL_VERSION_ERROR\fR Protocol version has changed, re\-link your +code. +.LP +\fBESLURM_ACCESS_DENIED\fR The requesting user lacks authorization for +the requested action (e.g. trying to modify another user's job). +.LP +\fBESLURM_INVALID_TIME_VALUE\fR Invalid time value. 
+ +.SH "EXAMPLE" +.LP +#include <stdio.h> +.br +#include <slurm/slurm.h> +.br +#include <slurm/slurm_errno.h> +.LP +int main (int argc, char *argv[]) +.br +{ +.br + job_desc_msg_t update_job_msg; +.br + step_update_request_msg_t update_step_msg; +.LP + slurm_init_job_desc_msg( &update_job_msg ); +.br + update_job_msg.job_id = 1234; +.br + update_job_msg.time_limit = 200; +.br + if (slurm_update_job (&update_job_msg)) { +.br + slurm_perror ("slurm_update_job error"); +.br + exit (1); +.br + } +.LP + slurm_init_update_step_msg( &update_step_msg ); +.br + update_step_msg.job_id = 1234; +.br + update_step_msg.step_id = 2; +.br + update_step_msg.time_limit = 30; +.br + if (slurm_update_step (&update_step_msg)) { +.br + slurm_perror ("slurm_update_step error"); +.br + exit (1); +.br + } +.br + exit (0); +.br +} + +.SH "NOTE" +These functions are included in the libslurm library, +which must be linked to your process for use +(e.g. "cc \-lslurm myprog.c"). + +.SH "COPYING" +Copyright (C) 2009\-2010 Lawrence Livermore National Security. +Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). +CODE\-OCEC\-09\-009. All rights reserved. +.LP +This file is part of SLURM, a resource management program. +For details, see <https://computing.llnl.gov/linux/slurm/>. +.LP +SLURM is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2 of the License, or (at your option) +any later version. +.LP +SLURM is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +details. 
+ +.SH "SEE ALSO" +.LP +\fBscontrol\fR(1), \fBslurm_get_errno\fR(3), +\fBslurm_perror\fR(3), \fBslurm_strerror\fR(3) diff --git a/doc/man/man3/slurm_update_step.3 b/doc/man/man3/slurm_update_step.3 new file mode 100644 index 00000000000..ba66f51f6b2 --- /dev/null +++ b/doc/man/man3/slurm_update_step.3 @@ -0,0 +1 @@ +.so man3/slurm_update_job.3 diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 8cb9e29d25e..bb781d55fa2 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -743,6 +743,12 @@ typedef struct job_info_msg { job_info_t *job_array; /* the job records */ } job_info_msg_t; +typedef struct step_update_request_msg { + uint32_t job_id; + uint32_t step_id; + uint32_t time_limit; /* In minutes */ +} step_update_request_msg_t; + typedef struct slurm_step_layout { uint32_t node_cnt; /* node count */ char *node_list; /* list of nodes in step */ @@ -1898,7 +1904,8 @@ extern void *slurm_ctl_conf_2_key_pairs PARAMS(( * RET 0 or -1 on error * NOTE: free the response using slurm_free_slurmd_status() */ -extern int slurm_load_slurmd_status PARAMS((slurmd_status_t **slurmd_status_ptr)); +extern int slurm_load_slurmd_status PARAMS(( + slurmd_status_t **slurmd_status_ptr)); /* * slurm_free_slurmd_status - free slurmd state information @@ -1927,6 +1934,19 @@ void slurm_print_slurmd_status PARAMS(( void slurm_print_key_pairs PARAMS(( FILE* out, void* key_pairs, char *title)); +/* + * slurm_init_update_step_msg - initialize step update message with default + * values before calling slurm_update_step() + * OUT step_msg - step update message descriptor + */ +extern void slurm_init_update_step_msg PARAMS(( + step_update_request_msg_t * step_msg)); + +/* Update the time limit of a job step, + * IN step_msg - step update message descriptor + * RET 0 or -1 on error */ +extern int slurm_update_step PARAMS(( step_update_request_msg_t * step_msg )); + /*****************************************************************************\ * SLURM JOB RESOURCES READ/PRINT FUNCTIONS 
\*****************************************************************************/ diff --git a/src/api/init_msg.c b/src/api/init_msg.c index 7fbfd7945fc..acab69ff9ad 100644 --- a/src/api/init_msg.c +++ b/src/api/init_msg.c @@ -101,6 +101,19 @@ void slurm_init_job_desc_msg(job_desc_msg_t * job_desc_msg) job_desc_msg->user_id = NO_VAL; } +/* + * slurm_init_update_step_msg - initialize step update message with default + * values before calling slurm_update_step() + * OUT step_msg - step update message descriptor + */ +extern void slurm_init_update_step_msg (step_update_request_msg_t * step_msg) +{ + memset(step_msg, 0, sizeof(step_update_request_msg_t)); + step_msg->job_id = NO_VAL; + step_msg->step_id = NO_VAL; + step_msg->time_limit = NO_VAL; +} + /* * slurm_init_part_desc_msg - initialize partition descriptor with * default values diff --git a/src/api/update_config.c b/src/api/update_config.c index be7076bc7ce..69ad8c415a9 100644 --- a/src/api/update_config.c +++ b/src/api/update_config.c @@ -3,7 +3,7 @@ * $Id$ ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. - * Copyright (C) 2008-2009 Lawrence Livermore National Security. + * Copyright (C) 2008-2010 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> and Kevin Tew <tew1@llnl.gov>. * CODE-OCEC-09-009. All rights reserved. 
@@ -186,6 +186,14 @@ slurm_update_block ( update_block_msg_t * block_msg ) return _slurm_update ((void *) block_msg, REQUEST_UPDATE_BLOCK); } +/* Update the time limit of a job step, + * step_id == NO_VAL updates all job steps of the specified job_id + * RET 0 or -1 on error */ +int +slurm_update_step ( step_update_request_msg_t * step_msg ) +{ + return _slurm_update ((void *) step_msg, REQUEST_UPDATE_JOB_STEP); +} /* _slurm_update - issue RPC for all update requests */ static int diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 989c0b86440..fd98890b3a2 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -4,7 +4,7 @@ * the slurm daemons directly, not for user client use. ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. - * Copyright (C) 2008-2009 Lawrence Livermore National Security. + * Copyright (C) 2008-2010 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Kevin Tew <tew1@llnl.gov> et. al. * CODE-OCEC-09-009. All rights reserved. 
@@ -269,6 +269,11 @@ void slurm_free_job_id_request_msg(job_id_request_msg_t * msg) xfree(msg); } +void slurm_free_update_step_msg(step_update_request_msg_t * msg) +{ + xfree(msg); +} + void slurm_free_job_id_response_msg(job_id_response_msg_t * msg) { xfree(msg); @@ -1956,6 +1961,9 @@ extern int slurm_free_msg_data(slurm_msg_type_t type, void *data) case RESPONSE_TOPO_INFO: slurm_free_topo_info_msg(data); break; + case REQUEST_UPDATE_JOB_STEP: + slurm_free_update_step_msg(data); + break; default: error("invalid type trying to be freed %u", type); break; diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index b9bd4b67b41..0c43ec1f9d3 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -240,7 +240,7 @@ typedef enum { RESPONSE_RUN_JOB_STEP, REQUEST_CANCEL_JOB_STEP, RESPONSE_CANCEL_JOB_STEP, - DEFUNCT_REQUEST_COMPLETE_JOB_STEP, /* DEFUNCT */ + REQUEST_UPDATE_JOB_STEP, DEFUNCT_RESPONSE_COMPLETE_JOB_STEP, /* DEFUNCT */ REQUEST_CHECKPOINT, RESPONSE_CHECKPOINT, @@ -1001,6 +1001,7 @@ void inline slurm_free_checkpoint_comp_msg(checkpoint_comp_msg_t *msg); void inline slurm_free_checkpoint_task_comp_msg(checkpoint_task_comp_msg_t *msg); void inline slurm_free_checkpoint_resp_msg(checkpoint_resp_msg_t *msg); void inline slurm_free_suspend_msg(suspend_msg_t *msg); +void slurm_free_update_step_msg(step_update_request_msg_t * msg); void slurm_free_resource_allocation_response_msg ( resource_allocation_response_msg_t * msg); void slurm_free_job_alloc_info_response_msg ( diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 850a17c67bf..2e9038089a4 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -550,6 +550,11 @@ static int _unpack_job_sbcast_cred_msg(job_sbcast_cred_msg_t **msg, Buf buffer, uint16_t protocol_version); +static void _pack_update_job_step_msg(step_update_request_msg_t * msg, + Buf buffer, uint16_t protocol_version); +static int 
_unpack_update_job_step_msg(step_update_request_msg_t ** msg_ptr, + Buf buffer, uint16_t protocol_version); + /* pack_header * packs a slurm protocol header that proceeds every slurm message * IN header - the header structure to pack @@ -684,6 +689,11 @@ pack_msg(slurm_msg_t const *msg, Buf buffer) msg->data, buffer, msg->protocol_version); break; + case REQUEST_UPDATE_JOB_STEP: + _pack_update_job_step_msg((step_update_request_msg_t *) + msg->data, buffer, + msg->protocol_version); + break; case REQUEST_JOB_END_TIME: case REQUEST_JOB_ALLOCATION_INFO: case REQUEST_JOB_ALLOCATION_INFO_LITE: @@ -1147,6 +1157,11 @@ unpack_msg(slurm_msg_t * msg, Buf buffer) buffer, msg->protocol_version); break; + case REQUEST_UPDATE_JOB_STEP: + rc = _unpack_update_job_step_msg( + (step_update_request_msg_t **) & (msg->data), + buffer, msg->protocol_version); + break; case REQUEST_JOB_END_TIME: case REQUEST_JOB_ALLOCATION_INFO: case REQUEST_JOB_ALLOCATION_INFO_LITE: @@ -5718,6 +5733,35 @@ unpack_error: return SLURM_ERROR; } +static void +_pack_update_job_step_msg(step_update_request_msg_t * msg, Buf buffer, + uint16_t protocol_version) +{ + pack32(msg->job_id, buffer); + pack32(msg->step_id, buffer); + pack32(msg->time_limit, buffer); +} + +static int +_unpack_update_job_step_msg(step_update_request_msg_t ** msg_ptr, Buf buffer, + uint16_t protocol_version) +{ + step_update_request_msg_t *msg; + + msg = xmalloc(sizeof(step_update_request_msg_t)); + *msg_ptr = msg; + + safe_unpack32(&msg->job_id, buffer); + safe_unpack32(&msg->step_id, buffer); + safe_unpack32(&msg->time_limit, buffer); + return SLURM_SUCCESS; + +unpack_error: + slurm_free_update_step_msg(msg); + *msg_ptr = NULL; + return SLURM_ERROR; +} + static void _pack_complete_job_allocation_msg( complete_job_allocation_msg_t * msg, Buf buffer, diff --git a/src/scontrol/Makefile.am b/src/scontrol/Makefile.am index 48df13f8dbb..23b4a62f227 100644 --- a/src/scontrol/Makefile.am +++ b/src/scontrol/Makefile.am @@ -17,7 +17,8 @@ scontrol_SOURCES 
= \ scontrol.h \ update_job.c \ update_node.c \ - update_part.c + update_part.c \ + update_step.c convenience_libs = $(top_builddir)/src/api/libslurm.o -ldl -lm diff --git a/src/scontrol/Makefile.in b/src/scontrol/Makefile.in index b5daf3da792..e65115d147e 100644 --- a/src/scontrol/Makefile.in +++ b/src/scontrol/Makefile.in @@ -86,7 +86,8 @@ PROGRAMS = $(bin_PROGRAMS) am_scontrol_OBJECTS = create_res.$(OBJEXT) info_block.$(OBJEXT) \ info_job.$(OBJEXT) info_node.$(OBJEXT) info_part.$(OBJEXT) \ info_res.$(OBJEXT) scontrol.$(OBJEXT) update_job.$(OBJEXT) \ - update_node.$(OBJEXT) update_part.$(OBJEXT) + update_node.$(OBJEXT) update_part.$(OBJEXT) \ + update_step.$(OBJEXT) scontrol_OBJECTS = $(am_scontrol_OBJECTS) am__DEPENDENCIES_1 = $(top_builddir)/src/api/libslurm.o am__DEPENDENCIES_2 = @@ -309,7 +310,8 @@ scontrol_SOURCES = \ scontrol.h \ update_job.c \ update_node.c \ - update_part.c + update_part.c \ + update_step.c convenience_libs = $(top_builddir)/src/api/libslurm.o -ldl -lm scontrol_LDADD = \ @@ -414,6 +416,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/update_job.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/update_node.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/update_part.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/update_step.Po@am__quote@ .c.o: @am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c index 9c461d8af41..4f7e27dad26 100644 --- a/src/scontrol/scontrol.c +++ b/src/scontrol/scontrol.c @@ -3,7 +3,7 @@ * provides interface to read, write, update, and configurations. ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. - * Copyright (C) 2008-2009 Lawrence Livermore National Security. + * Copyright (C) 2008-2010 Lawrence Livermore National Security. 
* Portions Copyright (C) 2008 Vijay Ramasubramanian. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> @@ -729,8 +729,9 @@ _process_command (int argc, char *argv[]) level = -1; exit_code = 1; if (quiet_flag != 1) - fprintf(stderr, "invalid debug " - "level: %s\n", argv[1]); + fprintf(stderr, "invalid " + "debug level: %s\n", + argv[1]); } } if (level != -1) { @@ -1081,7 +1082,7 @@ _update_it (int argc, char *argv[]) int i, error_code = SLURM_SUCCESS; int nodetag=0, partag=0, jobtag=0; int blocktag=0, subtag=0, restag=0; - int debugtag=0; + int debugtag=0, steptag=0; /* First identify the entity to update */ for (i=0; i<argc; i++) { @@ -1097,8 +1098,10 @@ _update_it (int argc, char *argv[]) nodetag=1; } else if (!strncasecmp(tag, "PartitionName", MAX(taglen, 3))) { partag=1; - } else if (!strncasecmp(tag, "JobId", MAX(taglen, 1))) { + } else if (!strncasecmp(tag, "JobId", MAX(taglen, 3))) { jobtag=1; + } else if (!strncasecmp(tag, "StepId", MAX(taglen, 4))) { + steptag=1; } else if (!strncasecmp(tag, "BlockName", MAX(taglen, 3))) { blocktag=1; } else if (!strncasecmp(tag, "SubBPName", MAX(taglen, 3))) { @@ -1113,14 +1116,16 @@ _update_it (int argc, char *argv[]) } /* The order of tests matters here. An update job request can include - partition and reservation tags, possibly before the jobid tag, but - none of the other updates have a jobid tag, so check jobtag first. - Likewise, check restag next, because reservations can have a - partition tag. The order of the rest doesn't matter because there - aren't any other duplicate tags. */ + * partition and reservation tags, possibly before the jobid tag, but + * none of the other updates have a jobid tag, so check jobtag first. + * Likewise, check restag next, because reservations can have a + * partition tag. The order of the rest doesn't matter because there + * aren't any other duplicate tags. 
*/ if (jobtag) error_code = scontrol_update_job (argc, argv); + else if (steptag) + error_code = scontrol_update_step (argc, argv); else if (restag) error_code = scontrol_update_res (argc, argv); else if (nodetag) @@ -1394,8 +1399,8 @@ scontrol [<OPTION>] [<COMMAND>] \n\ (the primary controller will be stopped) \n\ suspend <job_id> susend specified job \n\ resume <job_id> resume previously suspended job \n\ - update <SPECIFICATIONS> update job, node, partition, reservation, or \n\ - bluegene block/subbp configuration \n\ + update <SPECIFICATIONS> update job, node, partition, reservation, \n\ + step or bluegene block/subbp configuration \n\ verbose enable detailed logging. \n\ version display tool version number. \n\ !! Repeat the last command entered. \n\ diff --git a/src/scontrol/scontrol.h b/src/scontrol/scontrol.h index 4e2baeb610b..ba31d76bf68 100644 --- a/src/scontrol/scontrol.h +++ b/src/scontrol/scontrol.h @@ -107,10 +107,9 @@ extern int scontrol_job_notify(int argc, char *argv[]); extern int scontrol_load_jobs (job_info_msg_t ** job_buffer_pptr); extern int scontrol_load_nodes (node_info_msg_t ** node_buffer_pptr, uint16_t show_flags); -extern int scontrol_load_partitions ( - partition_info_msg_t **part_info_pptr); -extern int scontrol_load_block( - block_info_msg_t **block_info_pptr); +extern int scontrol_load_partitions (partition_info_msg_t ** + part_info_pptr); +extern int scontrol_load_block (block_info_msg_t **block_info_pptr); extern void scontrol_pid_info(pid_t job_pid); extern void scontrol_print_completing (void); extern void scontrol_print_completing_job(job_info_t *job_ptr, @@ -131,6 +130,7 @@ extern int scontrol_update_job (int argc, char *argv[]); extern int scontrol_update_node (int argc, char *argv[]); extern int scontrol_update_part (int argc, char *argv[]); extern int scontrol_update_res (int argc, char *argv[]); +extern int scontrol_update_step (int argc, char *argv[]); extern void scontrol_list_pids(const char *jobid_str, const char 
*node_name); extern int scontrol_create_part(int argc, char *argv[]); diff --git a/src/scontrol/update_job.c b/src/scontrol/update_job.c index b1a8d96268e..91d81d2c408 100644 --- a/src/scontrol/update_job.c +++ b/src/scontrol/update_job.c @@ -2,7 +2,7 @@ * update_job.c - update job functions for scontrol. ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. - * Copyright (C) 2008-2009 Lawrence Livermore National Security. + * Copyright (C) 2008-2010 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> * CODE-OCEC-09-009. All rights reserved. @@ -296,7 +296,7 @@ scontrol_update_job (int argc, char *argv[]) return -1; } - if (strncasecmp(tag, "JobId", MAX(taglen, 1)) == 0) { + if (strncasecmp(tag, "JobId", MAX(taglen, 3)) == 0) { job_msg.job_id = (uint32_t) strtol(val, (char **) NULL, 10); } diff --git a/src/scontrol/update_step.c b/src/scontrol/update_step.c new file mode 100644 index 00000000000..313b65b8520 --- /dev/null +++ b/src/scontrol/update_step.c @@ -0,0 +1,117 @@ +/*****************************************************************************\ + * update_step.c - update step functions for scontrol. + ***************************************************************************** + * Copyright (C) 2002-2007 The Regents of the University of California. + * Copyright (C) 2008-2010 Lawrence Livermore National Security. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Morris Jette <jette1@llnl.gov> + * CODE-OCEC-09-009. All rights reserved. + * + * This file is part of SLURM, a resource management program. + * For details, see <https://computing.llnl.gov/linux/slurm/>. + * Please also read the included file: DISCLAIMER. 
+ * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +\*****************************************************************************/ + +#include "scontrol.h" +#include "src/common/proc_args.h" + +/* + * scontrol_update_step - update the slurm step configuration per the supplied + * arguments + * IN argc - count of arguments + * IN argv - list of arguments + * RET 0 if no slurm error, errno otherwise. 
parsing error prints + * error message and returns 0 + */ +extern int scontrol_update_step (int argc, char *argv[]) +{ + int i, update_cnt = 0; + char *tag, *val; + int taglen, vallen; + step_update_request_msg_t step_msg; + + slurm_init_update_step_msg (&step_msg); + + for (i=0; i<argc; i++) { + tag = argv[i]; + val = strchr(argv[i], '='); + if (val) { + taglen = val - argv[i]; + val++; + vallen = strlen(val); + } else { + exit_code = 1; + fprintf (stderr, "Invalid input: %s\n", argv[i]); + fprintf (stderr, "Request aborted\n"); + return -1; + } + + if (strncasecmp(tag, "StepId", MAX(taglen, 4)) == 0) { + char *end_ptr; + step_msg.job_id = (uint32_t) strtol(val, &end_ptr, 10); + if (end_ptr[0] == '.') { + step_msg.step_id = (uint32_t) + strtol(end_ptr+1, (char **) NULL, 10); + } else if (end_ptr[0] != '\0') { + exit_code = 1; + fprintf (stderr, "Invalid StepID parameter: " + "%s\n", argv[i]); + fprintf (stderr, "Request aborted\n"); + return 0; + } /* else apply to all steps of this job_id */ + } + else if (strncasecmp(tag, "TimeLimit", MAX(taglen, 2)) == 0) { + int new_limit = time_str2mins(val); + if ((new_limit < 0) && (new_limit != INFINITE)) { + error("Invalid TimeLimit value"); + exit_code = 1; + return 0; + } + step_msg.time_limit = new_limit; + update_cnt++; + } + else { + exit_code = 1; + fprintf (stderr, "Update of this parameter is not " + "supported: %s\n", argv[i]); + fprintf (stderr, "Request aborted\n"); + return 0; + } + } + + if (update_cnt == 0) { + exit_code = 1; + fprintf (stderr, "No changes specified\n"); + return 0; + } + + if (slurm_update_step(&step_msg)) + return slurm_get_errno (); + else + return 0; +} diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 39d744afa43..6a06f47ca95 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -90,6 +90,9 @@ static int _launch_batch_step(job_desc_msg_t *job_desc_msg, uid_t uid, uint32_t *step_id); static int _make_step_cred(struct step_record *step_rec, 
slurm_cred_t **slurm_cred); + +inline static void _slurm_rpc_accounting_update_msg(slurm_msg_t *msg); +inline static void _slurm_rpc_accounting_first_reg(slurm_msg_t *msg); inline static void _slurm_rpc_allocate_resources(slurm_msg_t * msg); inline static void _slurm_rpc_checkpoint(slurm_msg_t * msg); inline static void _slurm_rpc_checkpoint_comp(slurm_msg_t * msg); @@ -105,6 +108,7 @@ inline static void _slurm_rpc_get_topo(slurm_msg_t * msg); inline static void _slurm_rpc_get_priority_factors(slurm_msg_t *msg); inline static void _slurm_rpc_dump_nodes(slurm_msg_t * msg); inline static void _slurm_rpc_dump_partitions(slurm_msg_t * msg); +inline static void _slurm_rpc_end_time(slurm_msg_t * msg); inline static void _slurm_rpc_epilog_complete(slurm_msg_t * msg); inline static void _slurm_rpc_job_notify(slurm_msg_t * msg); inline static void _slurm_rpc_job_ready(slurm_msg_t * msg); @@ -125,11 +129,13 @@ inline static void _slurm_rpc_resv_delete(slurm_msg_t * msg); inline static void _slurm_rpc_resv_show(slurm_msg_t * msg); inline static void _slurm_rpc_requeue(slurm_msg_t * msg); inline static void _slurm_rpc_takeover(slurm_msg_t * msg); +inline static void _slurm_rpc_set_debug_level(slurm_msg_t *msg); inline static void _slurm_rpc_shutdown_controller(slurm_msg_t * msg); inline static void _slurm_rpc_shutdown_controller_immediate(slurm_msg_t * msg); inline static void _slurm_rpc_step_complete(slurm_msg_t * msg); inline static void _slurm_rpc_step_layout(slurm_msg_t * msg); +inline static void _slurm_rpc_step_update(slurm_msg_t * msg); inline static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg); inline static void _slurm_rpc_suspend(slurm_msg_t * msg); inline static void _slurm_rpc_trigger_clear(slurm_msg_t * msg); @@ -139,11 +145,8 @@ inline static void _slurm_rpc_update_job(slurm_msg_t * msg); inline static void _slurm_rpc_update_node(slurm_msg_t * msg); inline static void _slurm_rpc_update_partition(slurm_msg_t * msg); inline static void 
_slurm_rpc_update_block(slurm_msg_t * msg);
-inline static void _slurm_rpc_end_time(slurm_msg_t * msg);
+
 inline static void _update_cred_key(void);
-inline static void _slurm_rpc_set_debug_level(slurm_msg_t *msg);
-inline static void _slurm_rpc_accounting_update_msg(slurm_msg_t *msg);
-inline static void _slurm_rpc_accounting_first_reg(slurm_msg_t *msg);
 
 
 /*
@@ -347,6 +350,10 @@ void slurmctld_req (slurm_msg_t * msg)
 		_slurm_rpc_step_layout(msg);
 		slurm_free_job_step_id_msg(msg->data);
 		break;
+	case REQUEST_UPDATE_JOB_STEP:
+		_slurm_rpc_step_update(msg);
+		slurm_free_update_step_msg(msg->data);
+		break;
 	case REQUEST_TRIGGER_SET:
 		_slurm_rpc_trigger_set(msg);
 		slurm_free_trigger_msg(msg->data);
@@ -2178,6 +2185,31 @@ static void _slurm_rpc_step_layout(slurm_msg_t *msg)
 	slurm_step_layout_destroy(step_layout);
 }
 
+/* _slurm_rpc_step_update - process RPC to update a job step and send the
+ *	return code of update_step() back to the requester */
+static void _slurm_rpc_step_update(slurm_msg_t *msg)
+{
+	DEF_TIMERS;
+	step_update_request_msg_t *req =
+		(step_update_request_msg_t *) msg->data;
+	/* Locks: Write job */
+	slurmctld_lock_t job_write_lock = {
+		NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
+	uid_t uid = g_slurm_auth_get_uid(msg->auth_cred, NULL);
+	int rc;
+
+	START_TIMER;
+	debug2("Processing RPC: REQUEST_UPDATE_JOB_STEP, from uid=%u",
+		(unsigned int) uid);
+
+	lock_slurmctld(job_write_lock);
+	rc = update_step(req, uid);	/* validates job, step and uid */
+	unlock_slurmctld(job_write_lock);
+	END_TIMER2("_slurm_rpc_step_update");
+
+	slurm_send_rc_msg(msg, rc);
+}
+
 /* _slurm_rpc_submit_batch_job - process RPC to submit a batch job */
 static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 {
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index bb2bdaf7f87..7831b5db5d5 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -274,7 +274,7 @@ typedef struct slurmctld_resv {
 /*****************************************************************************\
  * JOB parameters and data structures
\*****************************************************************************/
-extern time_t last_job_update;	/* time of last update to part records */
+extern time_t last_job_update;	/* time of last update to job records */
 
 #define DETAILS_MAGIC 0xdea84e7
 #define JOB_MAGIC 0xf0b7392c
@@ -1549,6 +1549,10 @@ extern int update_node ( update_node_msg_t * update_node_msg ) ;
  */
 extern int update_part (update_part_msg_t * part_desc, bool create_flag);
 
+/* Set the time limit of one job step, or of every step of the job when
+ * req->step_id is NO_VAL, if uid is permitted. RET - 0 or error code */
+extern int update_step(step_update_request_msg_t *req, uid_t uid);
+
 /*
  * validate_alloc_node - validate that the allocating node
  * is allowed to use this partition
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index 7b5656b29c3..05125fe335d 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -2302,7 +2302,8 @@ extern int load_step_state(struct job_record *job_ptr, Buf buffer,
 	uint32_t step_id, time_limit;
 	time_t start_time, pre_sus_time, tot_sus_time, ckpt_time;
 	char *host = NULL, *ckpt_dir = NULL, *core_job = NULL;
-	char *resv_ports = NULL, *name = NULL, *network = NULL, *bit_fmt = NULL;
+	char *resv_ports = NULL, *name = NULL, *network = NULL;
+	char *bit_fmt = NULL;
 	switch_jobinfo_t *switch_tmp = NULL;
 	check_jobinfo_t check_tmp = NULL;
 	slurm_step_layout_t *step_layout = NULL;
@@ -2616,7 +2617,6 @@ check_job_step_time_limit (struct job_record *job_ptr, time_t now)
 			_signal_step_timelimit(job_ptr, step_ptr, now);
 		}
 	}
-
 	list_iterator_destroy (step_iterator);
 }
 
@@ -2639,3 +2639,52 @@ static bool _is_mem_resv(void)
 
 	return mem_resv_value;
 }
+
+/* Set the time limit of one job step, or of every step of the job when
+ * req->step_id is NO_VAL, if uid is permitted. RET - 0 or error code */
+extern int update_step(step_update_request_msg_t *req, uid_t uid)
+{
+	struct job_record *job_ptr;
+	struct step_record *step_ptr;
+	ListIterator step_iterator;
+	int mod_cnt = 0;
+
+	job_ptr = find_job_record(req->job_id);
+	if (job_ptr == NULL) {
+		error("update_step: invalid job id %u", req->job_id);
+		return ESLURM_INVALID_JOB_ID;
+	}
+
+	/* Allowed: the job's owner, uid 0 and the uid this process runs as */
+	if ((job_ptr->user_id != uid) && (uid != 0) && (uid != getuid())) {
+		error("Security violation, STEP_UPDATE RPC from uid %u",
+		      (unsigned int) uid);
+		return ESLURM_USER_ID_MISSING;
+	}
+
+	/* No need to limit step time limit as job time limit will kill
+	 * any steps with any time limit */
+	if (req->step_id == NO_VAL) {
+		step_iterator = list_iterator_create (job_ptr->step_list);
+		while ((step_ptr = (struct step_record *)
+				   list_next (step_iterator))) {
+			step_ptr->time_limit = req->time_limit;
+			mod_cnt++;
+			info("Updating step %u.%u time limit to %u",
+			     req->job_id, step_ptr->step_id, req->time_limit);
+		}
+		list_iterator_destroy (step_iterator);
+	} else {
+		step_ptr = find_step_record(job_ptr, req->step_id);
+		if (step_ptr == NULL)
+			return ESLURM_INVALID_JOB_ID;
+		step_ptr->time_limit = req->time_limit;
+		mod_cnt++;
+		info("Updating step %u.%u time limit to %u",
+		     req->job_id, req->step_id, req->time_limit);
+	}
+	if (mod_cnt)
+		last_job_update = time(NULL);
+
+	return SLURM_SUCCESS;
+}
-- 
GitLab