diff --git a/NEWS b/NEWS
index 26f577d255739f5472e1dfd6473a097c9308ca23..533546d314168cee07a4ef37f165b0742ed9e810 100644
--- a/NEWS
+++ b/NEWS
@@ -72,8 +72,12 @@ documents those changes that are of interest to users and admins.
  -- Fixed bug in sview to be able to edit partitions correctly
  -- Fixed bug so in slurm.conf files where SlurmdPort isn't defined things
     work correctly.
- - In sched/wiki2 and sched/wiki add support for batch job being requeued
-   in Slurm either when nodes fail or upon request.
+ -- In sched/wiki2 and sched/wiki add support for batch job being requeued
+    in Slurm either when nodes fail or upon request.
+ -- In sched/wiki2 and sched/wiki with FastSchedule=2 configured and nodes
+    configured with more CPUs than actually exist, return a value of TASKS
+    equal to the number of configured CPUs that are allocated to a job rather
+    than the number of physical CPUs allocated.
 
 * Changes in SLURM 1.2.17
 =========================
diff --git a/src/plugins/sched/wiki2/get_jobs.c b/src/plugins/sched/wiki2/get_jobs.c
index 08f394157b555de230d6e6e61c263e4aedde67ad..c070b585f41cd1905d82b4101df3942b9c14882f 100644
--- a/src/plugins/sched/wiki2/get_jobs.c
+++ b/src/plugins/sched/wiki2/get_jobs.c
@@ -96,8 +96,8 @@ reject_msg_t reject_msgs[REJECT_MSG_MAX];
 * UPDATETIME=<uts>;	time last active
 * [FLAGS=INTERACTIVE;]	set if interactive (not batch) job
 * WCLIMIT=<secs>;	wall clock time limit, seconds
- * [TASKS=<cpus>;]	CPUs required
- * [NODES=<nodes>;]	nodes required
+ * TASKS=<cpus>;	CPUs required
+ * NODES=<nodes>;	nodes required
 * QUEUETIME=<uts>;	submission time
 * STARTTIME=<uts>;	time execution started
 * RCLASS=<partition>;	SLURM partition name
@@ -294,15 +294,11 @@ static char * _dump_job(struct job_record *job_ptr, int state_info)
 		(uint32_t) _get_job_time_limit(job_ptr));
 	xstrcat(buf, tmp);
 
-	if (job_ptr->job_state == JOB_PENDING) {
-		/* Don't report actual tasks or nodes allocated since
-		 * this can impact requeue on heterogenous clusters */
-		snprintf(tmp, sizeof(tmp),
-			"TASKS=%u;NODES=%u;",
-			_get_job_tasks(job_ptr),
-			_get_job_min_nodes(job_ptr));
-		xstrcat(buf, tmp);
-	}
+	snprintf(tmp, sizeof(tmp),
+		"TASKS=%u;NODES=%u;",
+		_get_job_tasks(job_ptr),
+		_get_job_min_nodes(job_ptr));
+	xstrcat(buf, tmp);
 
 	snprintf(tmp, sizeof(tmp),
 		"QUEUETIME=%u;STARTTIME=%u;RCLASS=%s;",
diff --git a/src/plugins/sched/wiki2/start_job.c b/src/plugins/sched/wiki2/start_job.c
index 1d818382d25f7eec87273a77ed796d973ebad23c..26d7f0e3b999b8cfd695121db25df0f8d9d4571a 100644
--- a/src/plugins/sched/wiki2/start_job.c
+++ b/src/plugins/sched/wiki2/start_job.c
@@ -245,11 +245,12 @@ static int _start_job(uint32_t jobid, int task_cnt, char *hostlist,
 
 	/* No errors so far */
 	(void) schedule();	/* provides own locking */
 
+	/* Check to insure the job was actually started */
 	lock_slurmctld(job_write_lock);
-	/* job_ptr = find_job_record(jobid);	don't bother */
-
-	if ((job_ptr->job_id == jobid) && job_ptr->details &&
+	if (job_ptr->job_id != jobid)
+		job_ptr = find_job_record(jobid);
+	if (job_ptr && (job_ptr->job_id == jobid) && job_ptr->details &&
 	    (job_ptr->job_state == JOB_RUNNING)) {
 		/* Restore required node list */
 		xfree(job_ptr->details->req_nodes);
@@ -260,10 +261,11 @@ static int _start_job(uint32_t jobid, int task_cnt, char *hostlist,
 	} else {
 		xfree(save_req_nodes);
 		FREE_NULL_BITMAP(save_req_bitmap);
-		FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
+		if (job_ptr && (job_ptr->job_id == jobid) && job_ptr->details)
+			FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
 	}
 
-	if ((job_ptr->job_id == jobid)
+	if (job_ptr && (job_ptr->job_id == jobid)
 	&&  (job_ptr->job_state != JOB_RUNNING)) {
 		uint16_t wait_reason = 0;
 		char *wait_string;
diff --git a/src/slurmctld/trigger_mgr.c b/src/slurmctld/trigger_mgr.c
index a337b3bd3a72d1d0c676bb50471d4af950bff840..f3eaef6d87da7fa0ee53688f56ef6b0354caa941 100644
--- a/src/slurmctld/trigger_mgr.c
+++ b/src/slurmctld/trigger_mgr.c
@@ -874,6 +874,12 @@ static void _trigger_node_event(trig_mgr_info_t *trig_in, time_t now)
 	}
 }
 
+/* Ideally we would use the existing proctrack plugin to prevent any
+ * processes from escaping our control, but that plugin is tied
+ * to various slurmd data structures. We just use the process group ID
+ * to kill the spawned program after MAX_PROG_TIME. Since triggers are
+ * meant primarily for system administrators rather than users, this
+ * may be sufficient. */
 static void _trigger_run_program(trig_mgr_info_t *trig_in)
 {
 	char program[1024], arg0[1024], arg1[1024], user_name[1024], *pname;
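
Note on the get_jobs.c change above: TASKS= and NODES= are now reported unconditionally in the
job record string that sched/wiki2 hands to the external scheduler. As a minimal, hypothetical
sketch (not code from the SLURM or Moab/Maui trees), a consumer could pull those two fields out
of such a record like this; the record string and helper name are assumptions for illustration:

/* Hypothetical stand-alone sketch: extract the TASKS= and NODES= fields
 * from a wiki2-style job record.  Illustrative only; not SLURM code. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int _get_uint_field(const char *rec, const char *key, unsigned int *val)
{
	const char *p = strstr(rec, key);

	if (p == NULL)
		return -1;		/* field not present */
	p += strlen(key);
	*val = (unsigned int) strtoul(p, NULL, 10);
	return 0;
}

int main(void)
{
	const char *rec = "WCLIMIT=3600;TASKS=8;NODES=2;QUEUETIME=1200000000;";
	unsigned int tasks = 0, nodes = 0;

	if ((_get_uint_field(rec, "TASKS=", &tasks) == 0) &&
	    (_get_uint_field(rec, "NODES=", &nodes) == 0))
		printf("tasks=%u nodes=%u\n", tasks, nodes);
	else
		printf("TASKS or NODES missing from record\n");
	return 0;
}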
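
Note on the new trigger_mgr.c comment above: the spawned trigger program is killed by its process
group ID once MAX_PROG_TIME has elapsed. The following stand-alone sketch shows that general
fork/setpgid/killpg pattern; MAX_PROG_TIME_SECS and the /bin/sh command line are assumptions for
illustration, and this is not the slurmctld implementation:

/* Hypothetical stand-alone sketch of the "kill by process group" idea
 * described in the trigger_mgr.c comment above. */
#include <signal.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#define MAX_PROG_TIME_SECS 5

int main(void)
{
	pid_t pid = fork();
	int i, status;

	if (pid < 0) {
		perror("fork");
		return 1;
	}
	if (pid == 0) {
		/* Child: become leader of a new process group so the
		 * parent can signal every descendant at once. */
		setpgid(0, 0);
		execl("/bin/sh", "sh", "-c", "sleep 60", (char *) NULL);
		_exit(127);		/* exec failed */
	}

	/* Parent: set the group as well to avoid a race with the child. */
	setpgid(pid, pid);

	/* Poll for completion, then kill the whole group on timeout. */
	for (i = 0; i < MAX_PROG_TIME_SECS; i++) {
		if (waitpid(pid, &status, WNOHANG) == pid)
			return 0;	/* program finished on its own */
		sleep(1);
	}
	killpg(pid, SIGKILL);		/* signal the entire process group */
	waitpid(pid, NULL, 0);		/* reap the group leader */
	printf("spawned program ran over %d secs, killed process group %d\n",
	       MAX_PROG_TIME_SECS, (int) pid);
	return 0;
}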