diff --git a/NEWS b/NEWS
index b0c0011debae7a72e26b5c174798ee769246e217..58cdf28fd9a04f3327e57b24fa5b02a551d66b30 100644
--- a/NEWS
+++ b/NEWS
@@ -37,7 +37,8 @@ documents those changes that are of interest to users and admins.
    Put into new RPM: sjstat.
  -- Add sched/wiki2 (Moab) JOBMODIFY command support for VARIABLELIST option
     to set supplemental environment variables for pending batch jobs.
- -- BLUEGENE - add support for scontrol show blocks
+ -- BLUEGENE - add support for scontrol show blocks.
+ -- Added support for job step time limits.
 
 * Changes in SLURM 2.0.2
 ========================
diff --git a/doc/man/man1/squeue.1 b/doc/man/man1/squeue.1
index c9b841c8d791e5471e25d9f7443b68b31896c7b4..7708f4d18f9c9b26cec51dc2e77c7f6aa1c3b854 100644
--- a/doc/man/man1/squeue.1
+++ b/doc/man/man1/squeue.1
@@ -157,7 +157,7 @@ Minimum number of threads per core requested by the job.
 This reports the value of the \fBsrun \-\-minthreads\fR option.
 .TP
 \fB%l\fR
-Time limit of the job in days\-hours:minutes:seconds.
+Time limit of the job or job step in days\-hours:minutes:seconds.
 The value may be "NOT_SET" if not yet established or "UNLIMITED" for no limit.
 .TP
 \fB%m\fR
diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1
index 1d0641ec74f60c1dea62db38de66cc2aa1ac5d89..d82d39540c9259beede9e254f5b47a3096b2efe3 100644
--- a/doc/man/man1/srun.1
+++ b/doc/man/man1/srun.1
@@ -910,9 +910,12 @@
 very small memory computers.
 .TP
 \fB\-t\fR, \fB\-\-time\fR=<\fItime\fR>
-Set a limit on the total run time of the job step. If the
-requested time limit exceeds the partition's time limit, the job will
-be left in a PENDING state (possibly indefinitely). The default time
+Set a limit on the total run time of the job or job step.
+If the requested time limit for a job exceeds the partition's time limit,
+the job will be left in a PENDING state (possibly indefinitely).
+If the requested time limit for a job step exceeds the partition's
+time limit, the job step will not be initiated.
+The default time
 limit is the partition's time limit. When the time limit is reached,
 all of the job's tasks are sent SIGTERM followed by SIGKILL.
 The interval between signals is specified by the SLURM configuration
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index fc28ad9dcc9bbd29edccfb7ae2ec4f586706babb..3ae430bd9d1da1b67a9fe8941b99c7c66166ca36 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -836,6 +836,7 @@ typedef struct {
 	uint32_t task_count;	/* number of tasks required */
 	uint16_t task_dist;	/* see enum task_dist_state, default
 				 * is SLURM_DIST_CYCLIC */
+	uint32_t time_limit;	/* step time limit */
 	uid_t uid;		/* user ID */
 	uint16_t verbose_level;	/* for extra logging decisions in step
 				 * launch api */
@@ -917,6 +918,7 @@ typedef struct {
 	time_t run_time;	/* net run time (factor out time suspended) */
 	time_t start_time;	/* step start time */
 	uint32_t step_id;	/* step ID */
+	uint32_t time_limit;	/* step time limit */
 	uint32_t user_id;	/* user the job runs as */
 } job_step_info_t;
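For orientation, here is a minimal sketch (not part of the patch) of how a client might request a step limit through the field added above. It assumes an existing allocation identified by job_id and uses the stock slurm_step_ctx_params_t_init()/slurm_step_ctx_create() entry points; error handling is elided.

#include <stdint.h>
#include <slurm/slurm.h>

/* Sketch: ask for a 30-minute limit on a new step of an existing job. */
static void start_limited_step(uint32_t job_id)
{
	slurm_step_ctx_params_t params;
	slurm_step_ctx_t *ctx;

	slurm_step_ctx_params_t_init(&params);
	params.job_id = job_id;
	params.time_limit = 30;	/* minutes; 0/NO_VAL fall back to defaults */

	ctx = slurm_step_ctx_create(&params);
	if (ctx != NULL)
		slurm_step_ctx_destroy(ctx);
}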
diff --git a/src/api/job_step_info.c b/src/api/job_step_info.c
index 06996a0cbedb203abc49ac15abdc721b16477a37..fc0454038dc0af8453e9e1c84c8039be64a5ad8a 100644
--- a/src/api/job_step_info.c
+++ b/src/api/job_step_info.c
@@ -112,16 +112,22 @@ slurm_sprint_job_step_info ( job_step_info_t * job_step_ptr,
 	int one_liner )
 {
 	char time_str[32];
+	char limit_str[32];
 	char tmp_line[128];
 	char *out = NULL;
 
 	/****** Line 1 ******/
 	slurm_make_time_str ((time_t *)&job_step_ptr->start_time, time_str,
 		sizeof(time_str));
+	if (job_step_ptr->time_limit == INFINITE)
+		sprintf(limit_str, "UNLIMITED");
+	else
+		secs2time_str ((time_t)job_step_ptr->time_limit * 60,
+			       limit_str, sizeof(limit_str));
 	snprintf(tmp_line, sizeof(tmp_line),
-		"StepId=%u.%u UserId=%u Tasks=%u StartTime=%s",
+		"StepId=%u.%u UserId=%u StartTime=%s TimeLimit=%s",
 		job_step_ptr->job_id, job_step_ptr->step_id,
-		job_step_ptr->user_id, job_step_ptr->num_tasks, time_str);
+		job_step_ptr->user_id, time_str, limit_str);
 	out = xstrdup(tmp_line);
 	if (one_liner)
 		xstrcat(out, " ");
@@ -130,9 +136,10 @@
 
 	/****** Line 2 ******/
 	snprintf(tmp_line, sizeof(tmp_line),
-		"Partition=%s Nodes=%s Name=%s Network=%s",
+		"Partition=%s Nodes=%s Tasks=%u Name=%s Network=%s",
 		job_step_ptr->partition, job_step_ptr->nodes,
-		job_step_ptr->name, job_step_ptr->network);
+		job_step_ptr->num_tasks, job_step_ptr->name,
+		job_step_ptr->network);
 	xstrcat(out, tmp_line);
 	if (one_liner)
 		xstrcat(out, " ");
diff --git a/src/api/step_ctx.c b/src/api/step_ctx.c
index b1bb5cf876eee77195d7d1bb2bd7c8ac3ff25153..83220c5d1f795de9b6fb7e923724d673f4b0105c 100644
--- a/src/api/step_ctx.c
+++ b/src/api/step_ctx.c
@@ -103,6 +103,7 @@ static job_step_create_request_msg_t *_create_step_request(
 	step_req->no_kill = step_params->no_kill;
 	step_req->overcommit = step_params->overcommit ? 1 : 0;
 	step_req->mem_per_task = step_params->mem_per_task;
+	step_req->time_limit = step_params->time_limit;
 
 	return step_req;
 }
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index fb5ddd2c81dd0a09c160db2bb84d954f0c72c4d8..7e2cec65473cd392427e0d937940b29f8b6f1650 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -562,6 +562,8 @@ typedef struct job_step_specs {
 	uint16_t relative;	/* first node to use of job's allocation */
 	uint16_t resv_port_cnt;	/* reserve ports for MPI if set */
 	uint16_t task_dist;	/* see enum task_dist_state */
+	uint32_t time_limit;	/* maximum run time in minutes, default is
+				 * partition limit */
 	uint32_t user_id;	/* user the job runs as */
 } job_step_create_request_msg_t;
 
@@ -665,6 +667,7 @@ typedef struct return_code_msg {
 #define SIG_FAILURE	999	/* Dummy signal value to signify sys failure */
 typedef struct kill_job_msg {
 	uint32_t job_id;
+	uint32_t step_id;
 	uint16_t job_state;
 	uint32_t job_uid;
 	time_t time;		/* slurmctld's time of request */
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index 3d8dc983a5fadc20c1ecd7bc74d35cda95c2d244..8dcea5962b8a699c9ffec5c17f64b362ea2df168 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -2253,6 +2253,7 @@ _pack_job_step_create_request_msg(job_step_create_request_msg_t
 	pack32(msg->cpu_count, buffer);
 	pack32(msg->num_tasks, buffer);
 	pack32(msg->mem_per_task, buffer);
+	pack32(msg->time_limit, buffer);
 
 	pack16(msg->relative, buffer);
 	pack16(msg->task_dist, buffer);
@@ -2291,6 +2292,7 @@ _unpack_job_step_create_request_msg(job_step_create_request_msg_t ** msg,
 	safe_unpack32(&(tmp_ptr->cpu_count), buffer);
 	safe_unpack32(&(tmp_ptr->num_tasks), buffer);
 	safe_unpack32(&(tmp_ptr->mem_per_task), buffer);
+	safe_unpack32(&(tmp_ptr->time_limit), buffer);
 
 	safe_unpack16(&(tmp_ptr->relative), buffer);
 	safe_unpack16(&(tmp_ptr->task_dist), buffer);
@@ -2324,6 +2326,7 @@ _pack_kill_job_msg(kill_job_msg_t * msg, Buf buffer)
 	xassert(msg != NULL);
 
 	pack32(msg->job_id, buffer);
+	pack32(msg->step_id, buffer);
 	pack16(msg->job_state, buffer);
 	pack32(msg->job_uid, buffer);
 	pack_time(msg->time, buffer);
@@ -2344,6 +2347,7 @@ _unpack_kill_job_msg(kill_job_msg_t ** msg, Buf buffer)
 	*msg = tmp_ptr;
 
 	safe_unpack32(&(tmp_ptr->job_id), buffer);
+	safe_unpack32(&(tmp_ptr->step_id), buffer);
 	safe_unpack16(&(tmp_ptr->job_state), buffer);
 	safe_unpack32(&(tmp_ptr->job_uid), buffer);
 	safe_unpack_time(&(tmp_ptr->time), buffer);
@@ -2669,6 +2673,7 @@ _unpack_job_step_info_members(job_step_info_t * step, Buf buffer)
 
 	safe_unpack32(&step->user_id, buffer);
 	safe_unpack32(&step->num_tasks, buffer);
+	safe_unpack32(&step->time_limit, buffer);
 
 	safe_unpack_time(&step->start_time, buffer);
 	safe_unpack_time(&step->run_time, buffer);
 	safe_unpackstr_xmalloc(&step->partition, &uint32_tmp, buffer);
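The pack and unpack routines above must stay mirror images: every pack32() added on the send side needs a safe_unpack32() in the same position on the receive side, which is why time_limit and step_id each appear in matched pairs. A toy illustration of that invariant, using stand-in helpers rather than SLURM's Buf API:

#include <arpa/inet.h>
#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Toy stand-ins for pack32()/safe_unpack32() over a flat buffer. */
static void put32(uint32_t v, unsigned char **p)
{
	uint32_t n = htonl(v);
	memcpy(*p, &n, sizeof(n));
	*p += sizeof(n);
}

static uint32_t get32(unsigned char **p)
{
	uint32_t n;
	memcpy(&n, *p, sizeof(n));
	*p += sizeof(n);
	return ntohl(n);
}

int main(void)
{
	unsigned char buf[12], *w = buf, *r = buf;

	put32(8, &w);	/* num_tasks */
	put32(512, &w);	/* mem_per_task */
	put32(30, &w);	/* time_limit: new field, same slot on both sides */

	assert(get32(&r) == 8);
	assert(get32(&r) == 512);
	assert(get32(&r) == 30);	/* unpacked in the identical order */
	return 0;
}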
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 89139ef9d3a1363ff4601e289c06b82b8efe97fa..56ec17c3695e740b2d49adb904a43286c723bc09 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -3577,6 +3577,12 @@ void job_time_limit(void)
 			xfree(job_ptr->state_desc);
 			continue;
 		}
+
+		/* check if any individual job steps have exceeded
+		 * their time limit */
+		if (job_ptr->step_list &&
+		    (list_count(job_ptr->step_list) > 0))
+			check_job_step_time_limit(job_ptr, now);
 
 		/* Too be added later once qos actually works.  The
 		 * idea here is for qos to trump what an association
@@ -5676,6 +5682,7 @@ abort_job_on_node(uint32_t job_id, struct job_record *job_ptr,
 
 	kill_req = xmalloc(sizeof(kill_job_msg_t));
 	kill_req->job_id	= job_id;
+	kill_req->step_id	= NO_VAL;
 	kill_req->time		= time(NULL);
 	kill_req->nodes		= xstrdup(node_ptr->name);
 	if (job_ptr) {	/* NULL if unknown */
@@ -5713,6 +5720,7 @@ kill_job_on_node(uint32_t job_id, struct job_record *job_ptr,
 
 	kill_req = xmalloc(sizeof(kill_job_msg_t));
 	kill_req->job_id	= job_id;
+	kill_req->step_id	= NO_VAL;
 	kill_req->time		= time(NULL);
 	kill_req->nodes		= xstrdup(node_ptr->name);
 	if (job_ptr) {	/* NULL if unknown */
diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c
index a17b516db42a735e5c4eab3e8a5af756cec39ec1..49a9e33b6c3d9e716b73c06adec47c55420d91a3 100644
--- a/src/slurmctld/node_scheduler.c
+++ b/src/slurmctld/node_scheduler.c
@@ -181,6 +181,7 @@ extern void deallocate_nodes(struct job_record *job_ptr, bool timeout,
 	kill_job = xmalloc(sizeof(kill_job_msg_t));
 	last_node_update    = time(NULL);
 	kill_job->job_id    = job_ptr->job_id;
+	kill_job->step_id   = NO_VAL;
 	kill_job->job_state = job_ptr->job_state;
 	kill_job->job_uid   = job_ptr->user_id;
 	kill_job->nodes     = xstrdup(job_ptr->nodes);
@@ -1767,6 +1768,7 @@ extern void re_kill_job(struct job_record *job_ptr)
 	agent_args->retry = 0;
 	kill_job = xmalloc(sizeof(kill_job_msg_t));
 	kill_job->job_id    = job_ptr->job_id;
+	kill_job->step_id   = NO_VAL;
 	kill_job->job_uid   = job_ptr->user_id;
 	kill_job->job_state = job_ptr->job_state;
 	kill_job->time      = time(NULL);
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index c828d7cb3bcc26ea0455dae6ddad3c2d329b0cc5..fb7232a9b537b368f98a135c605fee88a5e4f20b 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -602,7 +602,8 @@ struct step_record {
 	int *resv_port_array;	/* reserved port indexes */
 	uint16_t resv_port_cnt;	/* count of ports reserved per node */
 	char *resv_ports;	/* ports reserved for job */
-	time_t start_time;	/* step allocation time */
+	time_t start_time;	/* step allocation start time */
+	uint32_t time_limit;	/* step allocation time limit */
 	uint32_t step_id;	/* step number */
 	slurm_step_layout_t *step_layout;/* info about how tasks are laid out
					  * in the step */
@@ -1146,6 +1147,14 @@ extern int job_step_signal(uint32_t job_id, uint32_t step_id,
  */
 extern void job_time_limit (void);
 
+/*
+ * check_job_step_time_limit - terminate jobsteps which have exceeded
+ *	their time limit
+ * IN job_ptr - pointer to job containing steps to check
+ * IN now - current time to use for the limit check
+ */
+extern void check_job_step_time_limit (struct job_record *job_ptr, time_t now);
+
 /*
  * kill_job_by_part_name - Given a partition name, deallocate resource for
  *	its jobs and kill them
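Note that every pre-existing sender of kill_job_msg_t now fills in step_id = NO_VAL, which preserves the old whole-job meaning of the RPC; only the new step-timeout path sets a real step id. A self-contained sketch of the receiver-side dispatch this enables (trimmed struct and hypothetical names; the NO_VAL value is assumed to match slurm.h):

#include <stdint.h>
#include <stdio.h>

#define NO_VAL (0xfffffffe)	/* assumption: matches slurm.h */

typedef struct {		/* trimmed stand-in for kill_job_msg_t */
	uint32_t job_id;
	uint32_t step_id;
} kill_msg_t;

static void handle_timelimit(const kill_msg_t *req)
{
	if (req->step_id == NO_VAL)	/* legacy meaning: whole job */
		printf("job %u: entire job has hit its limit\n",
		       req->job_id);
	else				/* new meaning: one step only */
		printf("job %u: only step %u has hit its limit\n",
		       req->job_id, req->step_id);
}

int main(void)
{
	kill_msg_t whole = { 42, NO_VAL };
	kill_msg_t one   = { 42, 7 };

	handle_timelimit(&whole);
	handle_timelimit(&one);
	return 0;
}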
diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index 5e9d562105799f616834677e4032b0721944e983..884ecf8ec85168190d3dd3e80d83fc995b638312 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -112,6 +112,7 @@ static struct step_record * _create_step_record(struct job_record *job_ptr)
 	last_job_update = time(NULL);
 	step_ptr->job_ptr = job_ptr;
 	step_ptr->start_time = time(NULL) ;
+	step_ptr->time_limit = INFINITE ;
 	step_ptr->jobacct = jobacct_gather_g_create(NULL);
 	step_ptr->ckpt_dir = NULL;
 	if (list_append (job_ptr->step_list, step_ptr) == NULL)
@@ -238,8 +239,8 @@ dump_step_desc(job_step_create_request_msg_t *step_spec)
 	debug3("   mem_per_task=%u resv_port_cnt=%u immediate=%u no_kill=%u",
 	       step_spec->mem_per_task, step_spec->resv_port_cnt,
 	       step_spec->immediate, step_spec->no_kill);
-	debug3("   overcommit=%d",
-	       step_spec->overcommit);
+	debug3("   overcommit=%d time_limit=%u",
+	       step_spec->overcommit, step_spec->time_limit);
 }
 
@@ -1290,8 +1291,8 @@ step_create(job_step_create_request_msg_t *step_specs,
 
 	/* set the step_record values */
 
 	/* Here is where the node list is set for the step */
-	if(step_specs->node_list
-	   && step_specs->task_dist == SLURM_DIST_ARBITRARY) {
+	if(step_specs->node_list &&
+	   step_specs->task_dist == SLURM_DIST_ARBITRARY) {
 		step_node_list = xstrdup(step_specs->node_list);
 		xfree(step_specs->node_list);
 		step_specs->node_list = bitmap2node_name(nodeset);
@@ -1341,6 +1342,28 @@ step_create(job_step_create_request_msg_t *step_specs,
 	else
 		step_ptr->network = xstrdup(job_ptr->network);
 
+	/* the step time_limit is recorded as submitted (INFINITE
+	 * or partition->max_time by default), but the allocation
+	 * time limits may cut it short */
+	if (step_specs->time_limit == NO_VAL || step_specs->time_limit == 0 ||
+	    step_specs->time_limit == INFINITE) {
+		if (job_ptr->part_ptr->default_time != NO_VAL)
+			step_ptr->time_limit = job_ptr->part_ptr->default_time;
+		else
+			step_ptr->time_limit = job_ptr->part_ptr->max_time;
+	} else {
+		/* enforce partition limits if necessary */
+		if ((step_specs->time_limit > job_ptr->part_ptr->max_time) &&
+		    slurmctld_conf.enforce_part_limits) {
+			info("step_create: step time greater than partition's "
+			     "(%u > %u)", step_specs->time_limit,
+			     job_ptr->part_ptr->max_time);
+			delete_step_record (job_ptr, step_ptr->step_id);
+			return ESLURM_INVALID_TIME_LIMIT;
+		}
+		step_ptr->time_limit = step_specs->time_limit;
+	}
+
 	/* a batch script does not need switch info */
 	if (!batch_step) {
 		step_ptr->step_layout =
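Restated compactly, the defaulting rules above are: an unset, zero, or INFINITE request falls back to the partition's default time (or, failing that, its maximum), while an explicit request is kept, subject to the enforce_part_limits check. A hypothetical helper capturing just that logic (NO_VAL and INFINITE values assumed to match slurm.h):

#include <assert.h>
#include <stdint.h>

#define NO_VAL   (0xfffffffe)	/* assumption: matches slurm.h */
#define INFINITE (0xffffffff)

static uint32_t resolve_step_limit(uint32_t requested,
				   uint32_t part_default_time,
				   uint32_t part_max_time)
{
	if (requested == NO_VAL || requested == 0 ||
	    requested == INFINITE) {
		if (part_default_time != NO_VAL)
			return part_default_time;
		return part_max_time;
	}
	return requested;	/* caller still checks requested <= max */
}

int main(void)
{
	assert(resolve_step_limit(NO_VAL, 60, 120) == 60);
	assert(resolve_step_limit(0, NO_VAL, 120) == 120);
	assert(resolve_step_limit(90, 60, 120) == 90);
	return 0;
}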
@@ -1510,6 +1533,7 @@ static void _pack_ctld_job_step_info(struct step_record *step_ptr, Buf buffer)
 
 	pack32(step_ptr->job_ptr->user_id, buffer);
 	pack32(task_cnt, buffer);
+	pack32(step_ptr->time_limit, buffer);
 	pack_time(step_ptr->start_time, buffer);
 	if (IS_JOB_SUSPENDED(step_ptr->job_ptr)) {
 		run_time = step_ptr->pre_sus_time;
@@ -2171,6 +2195,7 @@ extern void dump_job_step_state(struct step_record *step_ptr, Buf buffer)
 	} else
 		pack32((uint32_t) 0, buffer);
 
+	pack32(step_ptr->time_limit, buffer);
 	pack_time(step_ptr->start_time, buffer);
 	pack_time(step_ptr->pre_sus_time, buffer);
 	pack_time(step_ptr->tot_sus_time, buffer);
@@ -2202,7 +2227,7 @@ extern int load_step_state(struct job_record *job_ptr, Buf buffer)
 	uint16_t cyclic_alloc, port, batch_step, bit_cnt;
 	uint16_t ckpt_interval, cpus_per_task, resv_port_cnt;
 	uint32_t core_size, cpu_count, exit_code, mem_per_task, name_len;
-	uint32_t step_id;
+	uint32_t step_id, time_limit;
 	time_t start_time, pre_sus_time, tot_sus_time, ckpt_time;
 	char *host = NULL, *ckpt_dir = NULL, *core_job = NULL;
 	char *resv_ports = NULL, *name = NULL, *network = NULL, *bit_fmt = NULL;
@@ -2230,6 +2255,7 @@ extern int load_step_state(struct job_record *job_ptr, Buf buffer)
 	if (core_size)
 		safe_unpackstr_xmalloc(&core_job, &name_len, buffer);
 
+	safe_unpack32(&time_limit, buffer);
 	safe_unpack_time(&start_time, buffer);
 	safe_unpack_time(&pre_sus_time, buffer);
 	safe_unpack_time(&tot_sus_time, buffer);
@@ -2288,6 +2314,7 @@ extern int load_step_state(struct job_record *job_ptr, Buf buffer)
 	host = NULL;	/* re-used, nothing left to free */
 	step_ptr->batch_step = batch_step;
 	step_ptr->start_time = start_time;
+	step_ptr->time_limit = time_limit;
 	step_ptr->pre_sus_time = pre_sus_time;
 	step_ptr->tot_sus_time = tot_sus_time;
 	step_ptr->ckpt_time = ckpt_time;
@@ -2435,3 +2462,85 @@
 	}
 	list_iterator_destroy(job_iterator);
 }
+
+static void _signal_step_timelimit(struct job_record *job_ptr,
+				   struct step_record *step_ptr, time_t now)
+{
+	int i;
+	kill_job_msg_t *kill_step;
+	agent_arg_t *agent_args = NULL;
+
+	xassert(step_ptr);
+	agent_args = xmalloc(sizeof(agent_arg_t));
+	agent_args->msg_type = REQUEST_KILL_TIMELIMIT;
+	agent_args->retry = 1;
+	agent_args->hostlist = hostlist_create("");
+	kill_step = xmalloc(sizeof(kill_job_msg_t));
+	kill_step->job_id    = job_ptr->job_id;
+	kill_step->step_id   = step_ptr->step_id;
+	kill_step->job_state = job_ptr->job_state;
+	kill_step->job_uid   = job_ptr->user_id;
+	kill_step->nodes     = xstrdup(job_ptr->nodes);
+	kill_step->time      = now;
+	kill_step->select_jobinfo = select_g_select_jobinfo_copy(
+			job_ptr->select_jobinfo);
+
+	for (i = 0; i < node_record_count; i++) {
+		if (bit_test(step_ptr->step_node_bitmap, i) == 0)
+			continue;
+		hostlist_push(agent_args->hostlist,
+			      node_record_table_ptr[i].name);
+		agent_args->node_count++;
+#ifdef HAVE_FRONT_END	/* Operate only on front-end */
+		break;
+#endif
+	}
+
+	if (agent_args->node_count == 0) {
+		hostlist_destroy(agent_args->hostlist);
+		xfree(agent_args);
+		if (kill_step->select_jobinfo) {
+			select_g_select_jobinfo_free(
+				kill_step->select_jobinfo);
+		}
+		xfree(kill_step);
+		return;
+	}
+
+	agent_args->msg_args = kill_step;
+	agent_queue_request(agent_args);
+	return;
+}
+
+extern void
+check_job_step_time_limit (struct job_record *job_ptr, time_t now)
+{
+	ListIterator step_iterator;
+	struct step_record *step_ptr;
+	uint32_t job_run_mins = 0;
+
+	xassert(job_ptr);
+
+	if (job_ptr->job_state != JOB_RUNNING)
+		return;
+
+	step_iterator = list_iterator_create (job_ptr->step_list);
+	while ((step_ptr = (struct step_record *) list_next (step_iterator))) {
+
+		if (step_ptr->time_limit == INFINITE ||
+		    step_ptr->time_limit == NO_VAL)
+			continue;
+		job_run_mins = (uint32_t) (((now - step_ptr->start_time) -
+				step_ptr->tot_sus_time) / 60);
+		if (job_run_mins >= step_ptr->time_limit) {
+			/* this step has timed out */
+			info("check_job_step_time_limit: job %u step %u "
+			     "has timed out (%u)",
+			     job_ptr->job_id, step_ptr->step_id,
+			     step_ptr->time_limit);
+			_signal_step_timelimit(job_ptr, step_ptr, now);
+		}
+	}
+
+	list_iterator_destroy (step_iterator);
+}
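check_job_step_time_limit() charges a step only for wall-clock time not spent suspended: elapsed seconds minus tot_sus_time, divided down to minutes, compared against time_limit. A standalone worked example of that arithmetic:

#include <assert.h>
#include <stdint.h>
#include <time.h>

int main(void)
{
	time_t start     = 1000;		/* step start time */
	time_t suspended = 4 * 60;		/* 4 min spent suspended */
	time_t now       = start + 65 * 60;	/* 65 min of wall clock */
	uint32_t limit   = 60;			/* one-hour step limit */

	uint32_t run_mins = (uint32_t)(((now - start) - suspended) / 60);

	assert(run_mins == 61);		/* 65 wall-clock minus 4 suspended */
	assert(run_mins >= limit);	/* so the step would be signalled */
	return 0;
}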
diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index 4e66d8c5c082df8c0a7433d749fdbfbec6d99a92..5e6546c9bb85978b8d926815b22df5adf0e58ea5 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -1483,44 +1483,31 @@ _rpc_health_check(slurm_msg_t *msg)
 	return rc;
 }
 
-static void
-_rpc_signal_tasks(slurm_msg_t *msg)
+
+static int
+_signal_jobstep(uint32_t jobid, uint32_t stepid, uid_t req_uid,
+		uint32_t signal)
 {
-	int               fd;
-	int               rc = SLURM_SUCCESS;
-	uid_t             req_uid = g_slurm_auth_get_uid(msg->auth_cred, NULL);
-	kill_tasks_msg_t *req = (kill_tasks_msg_t *) msg->data;
+	int fd, rc = SLURM_SUCCESS;
 	slurmstepd_info_t *step;
 
-#ifdef HAVE_XCPU
-	if (!_slurm_authorized_user(req_uid)) {
-		error("REQUEST_SIGNAL_TASKS not support with XCPU system");
-		rc = ESLURM_NOT_SUPPORTED;
-		goto done;
-	}
-#endif
-
-	fd = stepd_connect(conf->spooldir, conf->node_name,
-			   req->job_id, req->job_step_id);
+	fd = stepd_connect(conf->spooldir, conf->node_name, jobid, stepid);
 	if (fd == -1) {
 		debug("signal for nonexistant %u.%u stepd_connect failed: %m",
-		      req->job_id, req->job_step_id);
-		rc = ESLURM_INVALID_JOB_ID;
-		goto done;
+		      jobid, stepid);
+		return ESLURM_INVALID_JOB_ID;
 	}
 
 	if ((step = stepd_get_info(fd)) == NULL) {
 		debug("signal for nonexistent job %u.%u requested",
-		      req->job_id, req->job_step_id);
-		rc = ESLURM_INVALID_JOB_ID;
-		goto done2;
+		      jobid, stepid);
+		close(fd);
+		return ESLURM_INVALID_JOB_ID;
 	}
 
 	if ((req_uid != step->uid) && (!_slurm_authorized_user(req_uid))) {
 		debug("kill req from uid %ld for job %u.%u owned by uid %ld",
-		      (long) req_uid, req->job_id, req->job_step_id,
-		      (long) step->uid);
+		      (long) req_uid, jobid, stepid, (long) step->uid);
 		rc = ESLURM_USER_ID_MISSING;	/* or bad in this case */
-		goto done3;
+		goto done2;
 	}
 
 #ifdef HAVE_AIX
@@ -1529,23 +1516,45 @@ _rpc_signal_tasks(slurm_msg_t *msg)
 	/* SIGMIGRATE and SIGSOUND are used to initiate job checkpoint on AIX.
 	 * These signals are not sent to the entire process group, but just a
 	 * single process, namely the PMD. */
-	if (req->signal == SIGMIGRATE || req->signal == SIGSOUND) {
-		rc = stepd_signal_task_local(fd, req->signal, 0);
-		goto done;
+	if (signal == SIGMIGRATE || signal == SIGSOUND) {
+		rc = stepd_signal_task_local(fd, signal, 0);
+		goto done2;
 	}
 #  endif
 # endif
 #endif
 
-	rc = stepd_signal(fd, req->signal);
-	if (rc == -1)
-		rc = ESLURMD_JOB_NOTRUNNING;
+	if (signal == SIG_TIME_LIMIT) {
+		/* notify this step that it has exceeded its time limit */
+		rc = stepd_signal_container(fd, SIG_TIME_LIMIT);
+	} else {
+		rc = stepd_signal(fd, signal);
+		if (rc == -1)
+			rc = ESLURMD_JOB_NOTRUNNING;
+	}
 
-done3:
-	xfree(step);
 done2:
+	xfree(step);
 	close(fd);
-done:
+	return rc;
+}
+
+static void
+_rpc_signal_tasks(slurm_msg_t *msg)
+{
+	int rc = SLURM_SUCCESS;
+	uid_t req_uid = g_slurm_auth_get_uid(msg->auth_cred, NULL);
+	kill_tasks_msg_t *req = (kill_tasks_msg_t *) msg->data;
+
+#ifdef HAVE_XCPU
+	if (!_slurm_authorized_user(req_uid)) {
+		error("REQUEST_SIGNAL_TASKS not supported with XCPU system");
+		slurm_send_rc_msg(msg, ESLURM_NOT_SUPPORTED);
+		return;
+	}
+#endif
+
+	rc = _signal_jobstep(req->job_id, req->job_step_id, req_uid,
+			     req->signal);
 	slurm_send_rc_msg(msg, rc);
 }
@@ -1822,7 +1831,6 @@ _rpc_stat_jobacct(slurm_msg_t *msg)
 	return SLURM_SUCCESS;
 }
 
-
 /*
  * For the specified job_id: reply to slurmctld,
  * sleep(configured kill_wait), then send SIGKILL
@@ -1832,7 +1840,7 @@ _rpc_timelimit(slurm_msg_t *msg)
 {
 	uid_t           uid = g_slurm_auth_get_uid(msg->auth_cred, NULL);
 	kill_job_msg_t *req = msg->data;
-	int             nsteps;
+	int             nsteps, rc;
 
 	if (!_slurm_authorized_user(uid)) {
 		error ("Security violation: rpc_timelimit req from uid %ld",
@@ -1848,6 +1856,34 @@ _rpc_timelimit(slurm_msg_t *msg)
 	slurm_close_accepted_conn(msg->conn_fd);
 	msg->conn_fd = -1;
 
+	if (req->step_id != NO_VAL) {
+		slurm_ctl_conf_t *cf;
+		int delay;
+		/* A jobstep has timed out:
+		 * - send the container a SIG_TIME_LIMIT to note the occasion
+		 * - send a SIGCONT to resume any suspended tasks
+		 * - send a SIGTERM to begin termination
+		 * - sleep KILL_WAIT
+		 * - send a SIGKILL to clean up
+		 */
+		rc = _signal_jobstep(req->job_id, req->step_id, uid,
+				     SIG_TIME_LIMIT);
+		if (rc != SLURM_SUCCESS)
+			return;
+		rc = _signal_jobstep(req->job_id, req->step_id, uid, SIGCONT);
+		if (rc != SLURM_SUCCESS)
+			return;
+		rc = _signal_jobstep(req->job_id, req->step_id, uid, SIGTERM);
+		if (rc != SLURM_SUCCESS)
+			return;
+		cf = slurm_conf_lock();
+		delay = MAX(cf->kill_wait, 5);
+		slurm_conf_unlock();
+		sleep(delay);
+		_signal_jobstep(req->job_id, req->step_id, uid, SIGKILL);
+		return;
+	}
+
 	_kill_all_active_steps(req->job_id, SIG_TIME_LIMIT, true);
 	nsteps = xcpu_signal(SIGTERM, req->nodes) +
 		_kill_all_active_steps(req->job_id, SIGTERM, false);
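The escalation in _rpc_timelimit() can be read as a fixed signal sequence with a kill-wait pause before the final SIGKILL. A simplified, runnable sketch of that pattern (send_step_signal() is a hypothetical stand-in for _signal_jobstep(); the internal SIG_TIME_LIMIT notification is omitted):

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static int send_step_signal(int sig)
{
	printf("signalling step with %d\n", sig);	/* stub for the RPC */
	return 0;	/* 0 mirrors SLURM_SUCCESS */
}

int main(void)
{
	/* SIGCONT first, so suspended tasks wake up and can see SIGTERM */
	const int seq[] = { SIGCONT, SIGTERM };
	int kill_wait = 30;	/* e.g. KillWait from slurm.conf */
	size_t i;

	for (i = 0; i < sizeof(seq) / sizeof(seq[0]); i++) {
		if (send_step_signal(seq[i]) != 0)
			return 1;
	}
	sleep(kill_wait > 5 ? kill_wait : 5);	/* MAX(KillWait, 5) */
	return send_step_signal(SIGKILL) ? 1 : 0;
}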
diff --git a/src/squeue/opts.c b/src/squeue/opts.c
index c8bab6d300cb0d6ea22d26ede013c293ffc7a381..f326db513d44eb5601ba3be655c6a8bea2315320 100644
--- a/src/squeue/opts.c
+++ b/src/squeue/opts.c
@@ -430,7 +430,11 @@ extern int parse_format( char* format )
 							   field_size,
 							   right_justify,
 							   suffix );
-
+			else if (field[0] == 'l')
+				step_format_add_time_limit( params.format_list,
+							    field_size,
+							    right_justify,
+							    suffix );
 			else if (field[0] == 'M')
 				step_format_add_time_used( params.format_list,
 							   field_size,
diff --git a/src/squeue/print.c b/src/squeue/print.c
index 0fb98b76e6c46b45b968f3f9c6d7ba7a5f33f036..490e2e651776646dee9381988a728c19e195850b 100644
--- a/src/squeue/print.c
+++ b/src/squeue/print.c
@@ -1156,6 +1156,20 @@ int _print_step_user_name(job_step_info_t * step, int width, bool right,
 	return SLURM_SUCCESS;
 }
 
+int _print_step_time_limit(job_step_info_t * step, int width, bool right,
+			   char* suffix)
+{
+	if (step == NULL)	/* Print the Header instead */
+		_print_str("LIMIT", width, false, true);
+	else if (step->time_limit == INFINITE)
+		_print_str("UNLIMITED", width, right, true);
+	else
+		_print_secs(step->time_limit * 60, width, right, false);
+	if (suffix)
+		printf("%s", suffix);
+	return SLURM_SUCCESS;
+}
+
 int _print_step_time_start(job_step_info_t * step, int width, bool right,
 			   char* suffix)
 {
diff --git a/src/squeue/print.h b/src/squeue/print.h
index d77c9d5f10b28fe8fe683b009f78ea69c7f8595a..05a8cda8cc9c920c7b58c4026364973aef86f25a 100644
--- a/src/squeue/print.h
+++ b/src/squeue/print.h
@@ -274,6 +274,8 @@ int step_format_add_function(List list, int width, bool right_justify,
 	step_format_add_function(list,wid,right,suffix,_print_step_user_id)
 #define step_format_add_user_name(list,wid,right,suffix) \
 	step_format_add_function(list,wid,right,suffix,_print_step_user_name)
+#define step_format_add_time_limit(list,wid,right,suffix) \
+	step_format_add_function(list,wid,right,suffix,_print_step_time_limit)
 #define step_format_add_time_start(list,wid,right,suffix) \
 	step_format_add_function(list,wid,right,suffix,_print_step_time_start)
 #define step_format_add_time_used(list,wid,right,suffix) \
@@ -298,6 +300,8 @@ int _print_step_user_id(job_step_info_t * step, int width,
 			bool right_justify, char *suffix);
 int _print_step_user_name(job_step_info_t * step, int width,
 			  bool right_justify, char *suffix);
+int _print_step_time_limit(job_step_info_t * step, int width,
+			   bool right_justify, char *suffix);
 int _print_step_time_start(job_step_info_t * step, int width,
 			   bool right_justify, char *suffix);
 int _print_step_time_used(job_step_info_t * step, int width,
diff --git a/src/squeue/sort.c b/src/squeue/sort.c
index d3c89242d5660dd7ff9e67d57424c81fcdd248b1..62bd4cad295236eb5e307d07493727f841f631a0 100644
--- a/src/squeue/sort.c
+++ b/src/squeue/sort.c
@@ -79,6 +79,7 @@ static int _sort_step_by_id(void *void1, void *void2);
 static int _sort_step_by_node_list(void *void1, void *void2);
 static int _sort_step_by_partition(void *void1, void *void2);
 static int _sort_step_by_time_start(void *void1, void *void2);
+static int _sort_step_by_time_limit(void *void1, void *void2);
 static int _sort_step_by_time_used(void *void1, void *void2);
 static int _sort_step_by_user_id(void *void1, void *void2);
 static int _sort_step_by_user_name(void *void1, void *void2);
@@ -190,6 +191,8 @@ void sort_step_list(List step_list)
 			list_sort(step_list, _sort_step_by_node_list);
 		else if (params.sort[i] == 'P')
 			list_sort(step_list, _sort_step_by_partition);
+		else if (params.sort[i] == 'l')
+			list_sort(step_list, _sort_step_by_time_limit);
 		else if (params.sort[i] == 'S')
 			list_sort(step_list, _sort_step_by_time_start);
 		else if (params.sort[i] == 'M')
@@ -693,6 +696,19 @@ static int _sort_step_by_partition(void *void1, void *void2)
 	return diff;
 }
 
+static int _sort_step_by_time_limit(void *void1, void *void2)
+{
+	int diff;
+	job_step_info_t *step1 = (job_step_info_t *) void1;
+	job_step_info_t *step2 = (job_step_info_t *) void2;
+
+	diff = step1->time_limit - step2->time_limit;
+
+	if (reverse_order)
+		diff = -diff;
+	return diff;
+}
+
 static int _sort_step_by_time_start(void *void1, void *void2)
 {
 	int diff;
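_sort_step_by_time_limit() follows the subtraction idiom used by the neighboring comparators, but with uint32_t operands the difference can exceed the range of int (for example, when one step's limit is INFINITE, 0xffffffff), making the ordering unreliable in that corner case. A branch-based variant avoids this; a sketch, not part of the patch:

#include <assert.h>
#include <stdint.h>

/* Overflow-safe three-way comparison of two uint32_t values. */
static int cmp_u32(uint32_t a, uint32_t b)
{
	return (a > b) - (a < b);
}

int main(void)
{
	assert(cmp_u32(30, 0xffffffff) < 0);	/* 30 min sorts below UNLIMITED */
	assert(cmp_u32(30, 30) == 0);
	assert(cmp_u32(60, 30) > 0);
	return 0;
}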
diff --git a/src/srun/allocate.c b/src/srun/allocate.c
index 924ef08b4230d639efbe73a38ba5818152612c61..6665a987a570cef9c9ad471920beb932c8aef1c8 100644
--- a/src/srun/allocate.c
+++ b/src/srun/allocate.c
@@ -721,6 +721,8 @@ create_job_step(srun_job_t *job, bool use_all_cpus)
 	job->ctx_params.ckpt_dir = opt.ckpt_dir;
 	job->ctx_params.exclusive = (uint16_t)opt.exclusive;
 	job->ctx_params.immediate = (uint16_t)opt.immediate;
+	if (opt.time_limit != NO_VAL)
+		job->ctx_params.time_limit = (uint32_t)opt.time_limit;
 	job->ctx_params.verbose_level = (uint16_t)_verbose;
 	if (opt.resv_port_cnt != NO_VAL)
 		job->ctx_params.resv_port_cnt = (uint16_t) opt.resv_port_cnt;
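End to end, the new limit can now be set by srun (above) and read back through the step query API whose response type gained time_limit earlier in this patch. A minimal reader sketch, assuming the stock slurm_get_job_steps() call (NO_VAL for the job and step ids means "all"):

#include <slurm/slurm.h>
#include <stdio.h>

int main(void)
{
	job_step_info_response_msg_t *resp = NULL;
	uint32_t i;

	if (slurm_get_job_steps((time_t) 0, NO_VAL, NO_VAL, &resp,
				SHOW_ALL) != SLURM_SUCCESS)
		return 1;

	for (i = 0; i < resp->job_step_count; i++) {
		job_step_info_t *s = &resp->job_steps[i];
		if (s->time_limit == INFINITE)
			printf("%u.%u UNLIMITED\n", s->job_id, s->step_id);
		else
			printf("%u.%u %u min\n",
			       s->job_id, s->step_id, s->time_limit);
	}
	slurm_free_job_step_info_response_msg(resp);
	return 0;
}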