From d9ad1b45d4ce5f9fe0c9c1cac98acb4f1eaa4d6d Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Thu, 23 Mar 2006 02:52:29 +0000 Subject: [PATCH] Add new function slurm_get_rem_time() for job's time limit. --- NEWS | 6 +- doc/man/Makefile.am | 2 +- doc/man/man3/slurm_free_job_info_msg.3 | 63 +++----- ...{slurm_job_warn.3 => slurm_get_rem_time.3} | 0 slurm/slurm.h.in | 17 +-- src/api/job_info.c | 144 +++++++++--------- src/common/slurm_protocol_defs.c | 2 +- src/common/slurm_protocol_defs.h | 1 + src/common/slurm_protocol_pack.c | 2 + src/scontrol/scontrol.c | 13 +- src/slurmctld/job_mgr.c | 25 ++- src/slurmctld/proc_req.c | 43 +++++- src/slurmctld/slurmctld.h | 9 ++ src/slurmctld/srun_comm.c | 12 +- src/slurmctld/srun_comm.h | 7 +- testsuite/expect/test2.7 | 24 ++- 16 files changed, 223 insertions(+), 147 deletions(-) rename doc/man/man3/{slurm_job_warn.3 => slurm_get_rem_time.3} (100%) diff --git a/NEWS b/NEWS index 16d6afe7e43..7ac64d4f83d 100644 --- a/NEWS +++ b/NEWS @@ -11,12 +11,12 @@ documents those changes that are of interest to users and admins. WARNING: A NodeName may now occur only once in a slurm.conf file. If you want to temporarily make nodes DOWN in the slurm.conf, use the new DownNodes keyword (see "man slurm.conf"). - -- Gracefully handle request to submit batch job from within an existing batch job. + -- Gracefully handle request to submit batch job from within an existing + batch job. -- Warn user attempting to create a job allocation from within an existing job allocation. -- Add web page description for proctrack plugin. - -- Add new function slurm_job_warn() to notify when a job's time limit approches - (not yet fully implemented). + -- Add new function slurm_get_rem_time() for job's time limit. -- JobAcct plugin renamed from "log" to "linux" in preparation for support of new system types. WARNING: "JobAcctType=jobacct/log" is no longer supported. 
diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am index 758481a8b0f..328bc49c199 100644 --- a/doc/man/Makefile.am +++ b/doc/man/Makefile.am @@ -43,11 +43,11 @@ man3_MANS = man3/slurm_hostlist_create.3 \ man3/slurm_get_end_time.3 \ man3/slurm_get_errno.3 \ man3/slurm_get_job_steps.3 \ + man3/slurm_get_rem_time.3 \ man3/slurm_get_select_jobinfo.3 \ man3/slurm_init_job_desc_msg.3 \ man3/slurm_init_part_desc_msg.3 \ man3/slurm_job_step_create.3 \ - man3/slurm_job_warn.3 \ man3/slurm_job_will_run.3 \ man3/slurm_jobinfo_ctx_get.3 \ man3/slurm_kill_job.3 \ diff --git a/doc/man/man3/slurm_free_job_info_msg.3 b/doc/man/man3/slurm_free_job_info_msg.3 index dcd8911eb02..80f3e92d44e 100644 --- a/doc/man/man3/slurm_free_job_info_msg.3 +++ b/doc/man/man3/slurm_free_job_info_msg.3 @@ -1,15 +1,16 @@ .TH "Slurm API" "3" "March 2006" "Morris Jette" "Slurm job information reporting functions" .SH "NAME" slurm_free_job_info_msg, slurm_get_end_time, -slurm_get_select_jobinfo, slurm_job_warn, +slurm_get_rem_time, slurm_get_select_jobinfo, slurm_load_jobs, slurm_pid2jobid, slurm_print_job_info, slurm_print_job_info_msg \- Slurm job information reporting functions - .SH "SYNTAX" .LP #include <stdio.h> .br +#include <time.h> +.br #include <slurm/slurm.h> .br #include <sys/types.h> @@ -46,17 +47,9 @@ int \fBslurm_get_end_time\fR ( .br ); .LP -int \fBslurm_job_warn\fR ( -.br - uint32_t \fIjobid\fP, -.br - uint32_t \fImin_time\fP, -.br - uint32_t * \fIrem_time_ptr\fP, +long \fBslurm_get_rem_time\fR ( .br - uint16_t \fIsignal\fP, -.br - uint16_t * \fIwarn_ptr\fP + uint32_t \fIjobid\fP .br ); .LP @@ -117,7 +110,7 @@ number into the node information records and the data is terminated with a value of -1. See slurm.h for full details on the data structure's contents. .TP \fIjob_id\fP -Specifies a slurm job id. If zero, use the SLURM_JOBID environment variable +Specifies a slurm job id. If zero, use the SLURM_JOBID environment variable to get the jobid. 
.TP \fIjob_id_ptr\fP @@ -137,23 +130,13 @@ Specifies a process id of some process on the current node. \fIjob_ptr\fP Specifies a pointer to a single job records from the \fIjob_info_msg_ptr\fP data structure. -.TP -\fImin_time\fP -Number of seconds before termination to notify job. -.TP +.TP \fIone_liner\fP Print one record per line if non-zero. .TP \fIout_file\fP Specifies the file to print data to. -.TP -\fIrem_time_ptr\fP -Pointer to variable that is set to the number of seconds remaining -before job termination. Unused if NULL. -.TP -\fIsignal\fP -Signal to send to the job. No signal is sent if value is zero. -.TP +.TP \fIshow_flags\fP Job filtering flags, may be ORed. Information about jobs in partitions that are configured as @@ -167,27 +150,20 @@ For all of the following informational calls, if update_time is equal to or greater than the last time changes where made to that information, new information is not returned. Otherwise all the configuration. job, node, or partition records are returned. -.TP -\fIwarn_ptr\fP -Pointer to integer which is set to 1 when remaining time is less than or -equal to \fImin_time\fP, user should initialize the integer to 0. -Unused if NULL. - .SH "DESCRIPTION" .LP \fBslurm_free_job_info_msg\fR Release the storage generated by the \fBslurm_load_jobs\fR function. .LP \fBslurm_get_end_time\fR Returns the expected termination time of a specified -Slurm job id. The time corresponds to the exhaustion of the job's or partition's -time limit. +SLURM job. The time corresponds to the exhaustion of the job's or partition's +time limit. NOTE: The data is cached locally and only retrieved from the +SLURM controller once per minute. .LP -\fBslurm_job_warn\fR Warns a job before it reaches its time limit and is -terminated. The user's job can be warned via signal and/or a flag with a -lead time that the user specifies. The signal will be sent to the specific -process issuing this call. 
-NOTE: Only the last function call with a \fIsignal\fP or \fIwarn_ptr\fP -set will be honored. Earlier requests for notification will be ignored. +\fBslurm_get_rem_time\fR Returns the number of seconds remaining before the +expected termination time of a specified SLURM job id. The time corresponds +to the exhaustion of the job's or partition's time limit. NOTE: The data is +cached locally and only retrieved from the SLURM controller once per minute. .LP \fBslurm_load_jobs\fR Returns a job_info_msg_t that contains an update time, record count, and array of job_table records for all jobs. @@ -202,10 +178,12 @@ describing a single job records from the data loaded by the .LP \fBslurm_print_job_info_msg\fR Prints the contents of the data structure describing all job records loaded by the \fBslurm_load_node\fR function. + .SH "RETURN VALUE" .LP -On success, zero is returned. On error, -1 is returned, and Slurm error code -is set appropriately. +For \fBslurm_get_rem_time\fR on success a number of seconds is returned. +For all other functions zero is returned on success. +On error, -1 is returned, and Slurm error code is set appropriately. .SH "ERRORS" .LP @@ -218,6 +196,8 @@ your code. .LP \fBSLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT\fR Timeout in communicating with SLURM controller. +.LP +\fBEINVAL\fR Invalid function argument. .SH "EXAMPLE" .LP @@ -341,6 +321,7 @@ SLURM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ .SH "SEE ALSO" .LP \fBscontrol\fR(1), \fBsqueue\fR(1), \fBslurm_allocation_lookup\fR(3), diff --git a/doc/man/man3/slurm_job_warn.3 b/doc/man/man3/slurm_get_rem_time.3 similarity index 100% rename from doc/man/man3/slurm_job_warn.3 rename to doc/man/man3/slurm_get_rem_time.3 diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 08f6a65df46..dba14ba52ea 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -1034,20 +1034,11 @@ extern void slurm_print_job_info PARAMS(( FILE*, job_info_t * job_ptr, extern int slurm_get_end_time PARAMS((uint32_t jobid, time_t *end_time_ptr)); /* - * slurm_job_warn - warn a job before it reaches its time limit and is - * terminated. - * IN jobid - slurm job id (if zero, use SLURM_JOBID env var) - * IN min_time - number of seconds before termination to notify job - * OUT rem_time_ptr - number of seconds remaining before termination, - * unused if NULL - * IN signal - signal to send job (if zero, do not signal) - * OUT warn_ptr - set to 1 when remaining time is less than or - * equal to min_time, user should initialize to 0, - * unused if NULL - * RET 0 or -1 on error + * slurm_get_rem_time - get the expected time remaining for a given job + * IN jobid - slurm job id + * RET remaining time in seconds or -1 on error */ -extern int slurm_job_warn PARAMS((uint32_t jobid, uint32_t min_time, - uint32_t *rem_time_ptr, uint16_t signal, uint16_t *warn_ptr)); +extern long slurm_get_rem_time PARAMS((uint32_t jobid)); /* * slurm_pid2jobid - issue RPC to get the slurm job_id given a process_id diff --git a/src/api/job_info.c b/src/api/job_info.c index 08310666cc7..ec1591fff57 100644 --- a/src/api/job_info.c +++ b/src/api/job_info.c @@ -45,6 +45,7 @@ #include "src/common/node_select.h" #include "src/common/slurm_protocol_api.h" #include "src/common/uid.h" +#include "src/common/xstring.h" /* * slurm_print_job_info_msg - output information about all Slurm @@ -412,98 +413,103 @@ slurm_pid2jobid (pid_t job_pid, uint32_t *jobid) } /* - * 
slurm_get_end_time - get the expected end time for a given slurm job + * slurm_get_rem_time - get the expected time remaining for a given job * IN jobid - slurm job id - * end_time_ptr - location in which to store scheduled end time for job - * RET 0 or -1 on error + * RET remaining time in seconds or -1 on error */ -extern int -slurm_get_end_time(uint32_t jobid, time_t *end_time_ptr) +extern long slurm_get_rem_time(uint32_t jobid) { - int error_code, i; - job_info_msg_t *jinfo; - job_info_t *job_ptr; - - if (jobid == 0) { - char *env = getenv("SLURM_JOBID"); - if (env) - jobid = (uint32_t) atol(env); - if (jobid == 0) { - slurm_seterrno(ESLURM_INVALID_JOB_ID); - return SLURM_ERROR; - } - } - - if ((error_code = slurm_load_jobs ((time_t) NULL, &jinfo, 1))) - return error_code; + time_t now = time(NULL); + time_t end_time; + long rc; - error_code = SLURM_ERROR; /* error until job found */ - job_ptr = jinfo->job_array; - for (i = 0; i < jinfo->record_count; i++) { - if (job_ptr[i].job_id != jobid) - continue; - *end_time_ptr = job_ptr[i].end_time; - error_code = SLURM_SUCCESS; - break; - } - slurm_free_job_info_msg(jinfo); + if (slurm_get_end_time(jobid, &end_time) != SLURM_SUCCESS) + return -1L; - if (error_code) - slurm_seterrno(ESLURM_INVALID_JOB_ID); - return error_code; + rc = difftime(end_time, now); + if (rc < 0) + rc = 0L; + return rc; } /* - * slurm_job_warn - warn a job before it reaches its time limit and is - * terminated. 
- * IN jobid - slurm job id (if zero, use SLURM_JOBID env var) - * IN min_time - number of seconds before termination to notify job - * OUT rem_time_ptr - number of seconds remaining before termination, - * unused if NULL - * IN signal - signal to send job (if zero, do not signal) - * OUT warn_ptr - set to 1 when remaining time is less than or - * equal to min_time, user should initialize to 0, - * unused if NULL + * slurm_get_end_time - get the expected end time for a given slurm job + * IN jobid - slurm job id + * end_time_ptr - location in which to store scheduled end time for job * RET 0 or -1 on error */ extern int -slurm_job_warn(uint32_t jobid, uint32_t min_time, uint32_t *rem_time_ptr, - uint16_t signal, uint16_t *warn_ptr) +slurm_get_end_time(uint32_t jobid, time_t *end_time_ptr) { - time_t end_time; - long end_delay; + int rc; + slurm_msg_t resp_msg; + slurm_msg_t req_msg; + old_job_alloc_msg_t job_msg; + srun_timeout_msg_t *timeout_msg; + time_t now = time(NULL); + static uint32_t jobid_cache = 0; + static uint32_t jobid_env = 0; + static time_t endtime_cache = 0; + static time_t last_test_time = 0; + + if (!end_time_ptr) + slurm_seterrno_ret(EINVAL); if (jobid == 0) { - char *env = getenv("SLURM_JOBID"); - if (env) - jobid = (uint32_t) atol(env); + if (jobid_env) { + jobid = jobid_env; + } else { + char *env = getenv("SLURM_JOBID"); + if (env) { + jobid = (uint32_t) atol(env); + jobid_env = jobid; + } + } if (jobid == 0) { slurm_seterrno(ESLURM_INVALID_JOB_ID); return SLURM_ERROR; } } - /* If/when there is a new system call for this, modify - * slurm_get_end_time() to use it as well */ - if (slurm_get_end_time(jobid, &end_time)) + /* Just use cached data if data less than 60 seconds old */ + if ((jobid == jobid_cache) + && (difftime(now, last_test_time) < 60)) { + *end_time_ptr = endtime_cache; + return SLURM_SUCCESS; + } + + job_msg.job_id = jobid; + req_msg.msg_type = REQUEST_JOB_END_TIME; + req_msg.data = &job_msg; + + if 
(slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0) return SLURM_ERROR; - end_delay = (long) difftime(end_time, time(NULL)); - if (rem_time_ptr) - *rem_time_ptr = (uint32_t) MAX(end_delay, 0); - if (min_time <= *rem_time_ptr) { - if (warn_ptr) - *warn_ptr = 1; - if (signal) - kill(getpid(), signal); - } else { - /* work to be done in the future */ - /* we need to address changing time limits */ - if (warn_ptr || signal) { - slurm_seterrno(ESLURM_NOT_SUPPORTED); - return SLURM_ERROR; - } + switch (resp_msg.msg_type) { + case SRUN_TIMEOUT: + timeout_msg = (srun_timeout_msg_t *) resp_msg.data; + last_test_time = time(NULL); + jobid_cache = jobid; + endtime_cache = timeout_msg->timeout; + *end_time_ptr = endtime_cache; + slurm_free_srun_timeout_msg(resp_msg.data); + break; + case RESPONSE_SLURM_RC: + rc = ((return_code_msg_t *) resp_msg.data)->return_code; + slurm_free_return_code_msg(resp_msg.data); + if (endtime_cache) + *end_time_ptr = endtime_cache; + else if (rc) + slurm_seterrno_ret(rc); + break; + default: + if (endtime_cache) + *end_time_ptr = endtime_cache; + else + slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR); + break; } + return SLURM_SUCCESS; } diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 540b8d8df9d..403c1103138 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -5,7 +5,7 @@ * * $Id$ ***************************************************************************** - * Copyright (C) 2002 The Regents of the University of California. + * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Kevin Tew <tew1@llnl.gov> et. al. * UCRL-CODE-217948. 
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index 8c2b2fd6ee1..ef58ee2e898 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -126,6 +126,7 @@ typedef enum { REQUEST_UPDATE_JOB_TIME, REQUEST_JOB_READY, RESPONSE_JOB_READY, + REQUEST_JOB_END_TIME, REQUEST_JOB_STEP_CREATE = 5001, RESPONSE_JOB_STEP_CREATE, diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 5900b8d97dc..639b39945f0 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -431,6 +431,7 @@ pack_msg(slurm_msg_t const *msg, Buf buffer) _pack_job_desc_msg((job_desc_msg_t *) msg->data, buffer); break; + case REQUEST_JOB_END_TIME: case REQUEST_OLD_JOB_RESOURCE_ALLOCATION: _pack_old_job_desc_msg((old_job_alloc_msg_t *) msg->data, buffer); @@ -700,6 +701,7 @@ unpack_msg(slurm_msg_t * msg, Buf buffer) rc = _unpack_job_desc_msg((job_desc_msg_t **) & (msg->data), buffer); break; + case REQUEST_JOB_END_TIME: case REQUEST_OLD_JOB_RESOURCE_ALLOCATION: rc = _unpack_old_job_desc_msg((old_job_alloc_msg_t **) & (msg->data), buffer); diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c index 42963c0b8d6..e09da89c171 100644 --- a/src/scontrol/scontrol.c +++ b/src/scontrol/scontrol.c @@ -446,9 +446,10 @@ _load_partitions (partition_info_msg_t **part_buffer_pptr) static void _pid_info(pid_t job_pid) { - int error_code; + int error_code, i; uint32_t job_id; - time_t end_time; + time_t end_time, start_time = time(NULL); + long rem_time; error_code = slurm_pid2jobid (job_pid, &job_id); if (error_code) { @@ -465,9 +466,13 @@ _pid_info(pid_t job_pid) slurm_perror ("slurm_get_end_time error"); return; } + for (i=0; i<10000; i++) + rem_time = slurm_get_rem_time(0); - /* printf("Slurm job id: %u\n", job_id); old format */ - printf("Slurm job id %u ends at %s", job_id, ctime(&end_time)); + printf("Slurm job id %u ends at %s\n", job_id, ctime(&end_time)); + printf("slurm_get_rem_time is 
%ld\n", rem_time); + printf("10000 slurm_get_rem_time calls in %d seconds\n", + (int) difftime(time(NULL), start_time)); return; } diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 477b43fbfda..d9f40599602 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -2357,8 +2357,8 @@ void job_time_limit(void) } /* Give srun command warning message about pending timeout */ - if (job_ptr->end_time <= (now + 60)) - srun_timeout (job_ptr->job_id, job_ptr->end_time); + if (job_ptr->end_time <= (now + PERIODIC_TIMEOUT * 2)) + srun_timeout (job_ptr); /* test for and purge inactive job steps */ if (slurmctld_conf.inactive_limit == 0) @@ -4139,3 +4139,24 @@ extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid, return rc; } +/* + * job_end_time - Process JOB_END_TIME + * IN time_req_msg - job end time request + * OUT timeout_msg - job timeout response to be sent + * RET SLURM_SUCCESS or an error code + */ +extern int job_end_time(old_job_alloc_msg_t *time_req_msg, + srun_timeout_msg_t *timeout_msg) +{ + struct job_record *job_ptr; + xassert(timeout_msg); + + job_ptr = find_job_record(time_req_msg->job_id); + if (!job_ptr) + return ESLURM_INVALID_JOB_ID; + + timeout_msg->job_id = time_req_msg->job_id; + timeout_msg->step_id = NO_VAL; + timeout_msg->timeout = job_ptr->end_time; + return SLURM_SUCCESS; +} diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 2589ec722ea..2043dc510be 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -78,6 +78,7 @@ inline static void _slurm_rpc_allocate_resources(slurm_msg_t * msg); inline static void _slurm_rpc_allocate_and_run(slurm_msg_t * msg); inline static void _slurm_rpc_checkpoint(slurm_msg_t * msg); inline static void _slurm_rpc_checkpoint_comp(slurm_msg_t * msg); +inline static void _slurm_rpc_delete_partition(slurm_msg_t * msg); inline static void _slurm_rpc_dump_conf(slurm_msg_t * msg); inline static void _slurm_rpc_dump_jobs(slurm_msg_t * msg); inline static void 
_slurm_rpc_dump_nodes(slurm_msg_t * msg); @@ -102,7 +103,7 @@ inline static void _slurm_rpc_suspend(slurm_msg_t * msg); inline static void _slurm_rpc_update_job(slurm_msg_t * msg); inline static void _slurm_rpc_update_node(slurm_msg_t * msg); inline static void _slurm_rpc_update_partition(slurm_msg_t * msg); -inline static void _slurm_rpc_delete_partition(slurm_msg_t * msg); +inline static void _slurm_rpc_end_time(slurm_msg_t * msg); inline static void _update_cred_key(void); @@ -147,6 +148,10 @@ void slurmctld_req (slurm_msg_t * msg) _slurm_rpc_dump_jobs(msg); slurm_free_job_info_request_msg(msg->data); break; + case REQUEST_JOB_END_TIME: + _slurm_rpc_end_time(msg); + slurm_free_old_job_alloc_msg(msg->data); + break; case REQUEST_NODE_INFO: _slurm_rpc_dump_nodes(msg); slurm_free_node_info_request_msg(msg->data); @@ -263,7 +268,7 @@ void slurmctld_req (slurm_msg_t * msg) rc); slurm_free_jobacct_msg(msg->data); } - break; + break; default: error("invalid RPC msg_type=%d", msg->msg_type); slurm_send_rc_msg(msg, EINVAL); @@ -697,6 +702,40 @@ static void _slurm_rpc_dump_jobs(slurm_msg_t * msg) } } +/* _slurm_rpc_end_time - Process RPC for job end time */ +static void _slurm_rpc_end_time(slurm_msg_t * msg) +{ + DEF_TIMERS; + old_job_alloc_msg_t *time_req_msg = + (old_job_alloc_msg_t *) msg->data; + srun_timeout_msg_t timeout_msg; + slurm_msg_t response_msg; + int rc; + /* Locks: Read job */ + slurmctld_lock_t job_read_lock = { + NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; + + START_TIMER; + debug2("Processing RPC: REQUEST JOB_END_TIME"); + lock_slurmctld(job_read_lock); + rc = job_end_time(time_req_msg, &timeout_msg); + unlock_slurmctld(job_read_lock); + END_TIMER; + + if (rc != SLURM_SUCCESS) { + slurm_send_rc_msg(msg, rc); + } else { + response_msg.address = msg->address; + response_msg.msg_type = SRUN_TIMEOUT; + response_msg.data = &timeout_msg; + forward_init(&response_msg.forward, NULL); + response_msg.ret_list = NULL; + slurm_send_node_msg(msg->conn_fd, 
&response_msg); + } + debug2("_slurm_rpc_end_time jobid=%u %s", + time_req_msg->job_id, TIME_STR); +} + /* _slurm_rpc_dump_nodes - process RPC for node state information */ static void _slurm_rpc_dump_nodes(slurm_msg_t * msg) { diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index c8fb4aea1c6..4376d4703ab 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -656,6 +656,15 @@ extern void job_completion_logger(struct job_record *job_ptr); extern bool job_epilog_complete(uint32_t job_id, char *node_name, uint32_t return_code); +/* + * job_end_time - Process JOB_END_TIME + * IN time_req_msg - job end time request + * OUT timeout_msg - job timeout response to be sent + * RET SLURM_SUCCESS or an error code + */ +extern int job_end_time(old_job_alloc_msg_t *time_req_msg, + srun_timeout_msg_t *timeout_msg); + /* job_fini - free all memory associated with job records */ extern void job_fini (void); diff --git a/src/slurmctld/srun_comm.c b/src/slurmctld/srun_comm.c index 119e4e78cbc..2e417224f3b 100644 --- a/src/slurmctld/srun_comm.c +++ b/src/slurmctld/srun_comm.c @@ -215,12 +215,10 @@ extern void srun_ping (void) /* * srun_timeout - notify srun of a job's imminent timeout - * IN job_id - if of job to notify - * IN timeout - when job is scheduled to be killed + * IN job_ptr - pointer to the slurmctld job record */ -extern void srun_timeout (uint32_t job_id, time_t timeout) +extern void srun_timeout (struct job_record *job_ptr) { - struct job_record *job_ptr = find_job_record (job_id); slurm_addr * addr; srun_timeout_msg_t *msg_arg; ListIterator step_iterator; @@ -234,9 +232,9 @@ extern void srun_timeout (uint32_t job_id, time_t timeout) addr = xmalloc(sizeof(struct sockaddr_in)); slurm_set_addr(addr, job_ptr->port, job_ptr->host); msg_arg = xmalloc(sizeof(srun_timeout_msg_t)); - msg_arg->job_id = job_id; + msg_arg->job_id = job_ptr->job_id; msg_arg->step_id = NO_VAL; - msg_arg->timeout = timeout; + msg_arg->timeout = job_ptr->end_time; 
_srun_agent_launch(addr, job_ptr->host, SRUN_TIMEOUT, msg_arg); } @@ -254,7 +252,7 @@ extern void srun_timeout (uint32_t job_id, time_t timeout) msg_arg = xmalloc(sizeof(srun_timeout_msg_t)); msg_arg->job_id = job_ptr->job_id; msg_arg->step_id = step_ptr->step_id; - msg_arg->timeout = timeout; + msg_arg->timeout = job_ptr->end_time; _srun_agent_launch(addr, step_ptr->host, SRUN_TIMEOUT, msg_arg); } diff --git a/src/slurmctld/srun_comm.h b/src/slurmctld/srun_comm.h index 081b5843e5c..cc297272c0e 100644 --- a/src/slurmctld/srun_comm.h +++ b/src/slurmctld/srun_comm.h @@ -30,6 +30,8 @@ #include <sys/types.h> #include <time.h> +#include "src/slurmctld/slurmctld.h" + /* * srun_allocate - notify srun of a resource allocation * IN job_id - id of the job allocated resource @@ -55,9 +57,8 @@ extern void srun_response(uint32_t job_id, uint32_t step_id); /* * srun_timeout - notify srun of a job's timeout - * IN job_id - if of job to notify - * IN timeout - when job is scheduled to be killed + * IN job_ptr - pointer to the slurmctld job record */ -extern void srun_timeout (uint32_t job_id, time_t timeout); +extern void srun_timeout (struct job_record *job_ptr); #endif /* !_HAVE_SRUN_COMM_H */ diff --git a/testsuite/expect/test2.7 b/testsuite/expect/test2.7 index d7a9204cbf2..7c4eea833e1 100755 --- a/testsuite/expect/test2.7 +++ b/testsuite/expect/test2.7 @@ -7,7 +7,7 @@ # "FAILURE: ..." otherwise with an explanation of the failure, OR # anything else indicates a failure mode that must be investigated. ############################################################################ -# Copyright (C) 2002 The Regents of the University of California. +# Copyright (C) 2002-2006 The Regents of the University of California. # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). # Written by Morris Jette <jette1@llnl.gov> # UCRL-CODE-217948. 
@@ -88,6 +88,8 @@ if {[wait_for_job $job_id "DONE"] != 0} { # # Verify job_id in output file # +set rem_time 999 +set delta 999 if {[wait_for_file $file_out] == 0} { spawn $bin_cat $file_out expect { @@ -95,11 +97,31 @@ if {[wait_for_file $file_out] == 0} { set scontrol_id $expect_out(1,string) exp_continue } + -re "slurm_get_rem_time is ($number)" { + set rem_time $expect_out(1,string) + exp_continue + } + -re "slurm_get_rem_time calls in ($number) seconds" { + set delta $expect_out(1,string) + exp_continue + } eof { wait } } } +if {$rem_time > 60} { + send_user "\nFAILURE: job remaining time is wrong\n" + set exit_code 1 +} +if {$rem_time < 59} { + send_user "\nFAILURE: job remaining time seems too small\n" + set exit_code 1 +} +if {$delta > 1} { + send_user "\nFAILURE: slurm_get_rem_time calls too slow\n" + set exit_code 1 +} # # Check for errors in log -- GitLab