Commit 9744078c authored by Moe Jette
parent 5fd5cc92
@@ -72,8 +72,12 @@ documents those changes that are of interest to users and admins.
  -- Fixed bug in sview to be able to edit partitions correctly
  -- Fixed bug so in slurm.conf files where SlurmdPort isn't defined things
     work correctly.
- -  In sched/wiki2 and sched/wiki add support for batch job being requeued
+ -- In sched/wiki2 and sched/wiki add support for batch job being requeued
     in Slurm either when nodes fail or upon request.
+ -- In sched/wiki2 and sched/wiki with FastSchedule=2 configured and nodes
+    configured with more CPUs than actually exist, return a value of TASKS
+    equal to the number of configured CPUs that are allocated to a job rather
+    than the number of physical CPUs allocated.
 * Changes in SLURM 1.2.17
 =========================
......
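Note on the FastSchedule=2 entry above: with FastSchedule=2, slurmctld schedules against the CPU counts given in slurm.conf rather than the counts the nodes actually report, so the TASKS value passed through the wiki/wiki2 interface now reflects the configured CPUs allocated to the job. A minimal, hypothetical illustration of that distinction (the struct and function below are made up for this sketch, not SLURM code):

#include <stdio.h>

/* Illustration only: configured vs. physical CPU counts for one node. */
struct node_cfg {
    unsigned configured_cpus;   /* CPUs listed in slurm.conf */
    unsigned physical_cpus;     /* CPUs the node actually has */
};

static unsigned cpus_to_report(const struct node_cfg *node, int fast_schedule)
{
    /* FastSchedule=2: trust the configuration, even if it overstates reality */
    if (fast_schedule == 2)
        return node->configured_cpus;
    return node->physical_cpus;
}

int main(void)
{
    struct node_cfg n = { .configured_cpus = 8, .physical_cpus = 4 };
    printf("TASKS=%u;\n", cpus_to_report(&n, 2));   /* prints TASKS=8; */
    return 0;
}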
@@ -96,8 +96,8 @@ reject_msg_t reject_msgs[REJECT_MSG_MAX];
  *	UPDATETIME=<uts>;	time last active
  *	[FLAGS=INTERACTIVE;]	set if interactive (not batch) job
  *	WCLIMIT=<secs>;		wall clock time limit, seconds
- *	[TASKS=<cpus>;]		CPUs required
- *	[NODES=<nodes>;]	nodes required
+ *	TASKS=<cpus>;		CPUs required
+ *	NODES=<nodes>;		nodes required
  *	QUEUETIME=<uts>;	submission time
  *	STARTTIME=<uts>;	time execution started
  *	RCLASS=<partition>;	SLURM partition name
@@ -294,15 +294,11 @@ static char * _dump_job(struct job_record *job_ptr, int state_info)
 		(uint32_t) _get_job_time_limit(job_ptr));
 	xstrcat(buf, tmp);
-	if (job_ptr->job_state == JOB_PENDING) {
-		/* Don't report actual tasks or nodes allocated since
-		 * this can impact requeue on heterogenous clusters */
-		snprintf(tmp, sizeof(tmp),
-			"TASKS=%u;NODES=%u;",
-			_get_job_tasks(job_ptr),
-			_get_job_min_nodes(job_ptr));
-		xstrcat(buf, tmp);
-	}
+	snprintf(tmp, sizeof(tmp),
+		"TASKS=%u;NODES=%u;",
+		_get_job_tasks(job_ptr),
+		_get_job_min_nodes(job_ptr));
+	xstrcat(buf, tmp);
 	snprintf(tmp, sizeof(tmp),
 		"QUEUETIME=%u;STARTTIME=%u;RCLASS=%s;",
......
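The fields documented above are serialized by _dump_job() into one semicolon-delimited KEY=value string, and with this change the TASKS= and NODES= pair is appended for every job rather than only for pending ones (hence the brackets dropped from the field list). A self-contained sketch of that string building, using snprintf()/strncat() in place of SLURM's xstrcat() and with made-up values standing in for _get_job_tasks() and _get_job_min_nodes():

#include <stdio.h>
#include <string.h>

int main(void)
{
    char buf[256] = "";
    char tmp[64];

    /* Hypothetical values; the plugin reads these from struct job_record. */
    unsigned tasks = 8, nodes = 2;
    unsigned queue_time = 1190000000u, start_time = 1190000100u;

    /* TASKS/NODES are now reported unconditionally */
    snprintf(tmp, sizeof(tmp), "TASKS=%u;NODES=%u;", tasks, nodes);
    strncat(buf, tmp, sizeof(buf) - strlen(buf) - 1);

    snprintf(tmp, sizeof(tmp), "QUEUETIME=%u;STARTTIME=%u;RCLASS=%s;",
             queue_time, start_time, "debug");
    strncat(buf, tmp, sizeof(buf) - strlen(buf) - 1);

    printf("%s\n", buf);
    return 0;
}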
@@ -245,11 +245,12 @@ static int _start_job(uint32_t jobid, int task_cnt, char *hostlist,
 	/* No errors so far */
 	(void) schedule();	/* provides own locking */
 	/* Check to insure the job was actually started */
 	lock_slurmctld(job_write_lock);
-	/* job_ptr = find_job_record(jobid);	don't bother */
-	if ((job_ptr->job_id == jobid) && job_ptr->details &&
+	if (job_ptr->job_id != jobid)
+		job_ptr = find_job_record(jobid);
+	if (job_ptr && (job_ptr->job_id == jobid) && job_ptr->details &&
 	    (job_ptr->job_state == JOB_RUNNING)) {
 		/* Restore required node list */
 		xfree(job_ptr->details->req_nodes);
@@ -260,10 +261,11 @@ static int _start_job(uint32_t jobid, int task_cnt, char *hostlist,
 	} else {
 		xfree(save_req_nodes);
 		FREE_NULL_BITMAP(save_req_bitmap);
-		FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
+		if (job_ptr && (job_ptr->job_id == jobid) && job_ptr->details)
+			FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
 	}
-	if ((job_ptr->job_id == jobid)
+	if (job_ptr && (job_ptr->job_id == jobid)
 	    && (job_ptr->job_state != JOB_RUNNING)) {
 		uint16_t wait_reason = 0;
 		char *wait_string;
......
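The change above stops trusting the job_ptr that was cached before schedule() released the locks: the record is looked up again with find_job_record() and every later dereference is guarded against a NULL or mismatched pointer, since the job may have been purged in the meantime. A stripped-down sketch of the same defensive pattern; the record type and lookup table here are placeholders, not slurmctld's job table:

#include <stddef.h>
#include <stdio.h>

struct job_rec {
    unsigned job_id;
    int running;
};

static struct job_rec jobs[] = { { 42, 1 }, { 43, 0 } };

/* Placeholder for find_job_record(): NULL when the job no longer exists. */
static struct job_rec *lookup_job(unsigned job_id)
{
    for (size_t i = 0; i < sizeof(jobs) / sizeof(jobs[0]); i++) {
        if (jobs[i].job_id == job_id)
            return &jobs[i];
    }
    return NULL;
}

static void check_started(struct job_rec *job_ptr, unsigned jobid)
{
    /* Re-resolve the record if the cached pointer is stale or missing */
    if (job_ptr == NULL || job_ptr->job_id != jobid)
        job_ptr = lookup_job(jobid);

    /* Guard every dereference: the job may have vanished while unlocked */
    if (job_ptr && (job_ptr->job_id == jobid) && job_ptr->running)
        printf("job %u is running\n", jobid);
    else
        printf("job %u did not start\n", jobid);
}

int main(void)
{
    check_started(NULL, 42);    /* stale pointer: re-looked up, running */
    check_started(NULL, 99);    /* job gone: guards prevent a crash */
    return 0;
}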
@@ -874,6 +874,12 @@ static void _trigger_node_event(trig_mgr_info_t *trig_in, time_t now)
 	}
 }
+/* Ideally we would use the existing proctrack plugin to prevent any
+ * processes from escaping our control, but that plugin is tied
+ * to various slurmd data structures. We just use the process group ID
+ * to kill the spawned program after MAX_PROG_TIME. Since triggers are
+ * meant primarily for system administrators rather than users, this
+ * may be sufficient. */
 static void _trigger_run_program(trig_mgr_info_t *trig_in)
 {
 	char program[1024], arg0[1024], arg1[1024], user_name[1024], *pname;
......
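The comment added above explains that the spawned trigger program is controlled through its process group and killed after MAX_PROG_TIME rather than being tracked by the proctrack plugin. A self-contained sketch of that general fork/setpgid/killpg pattern; the timeout value, the program being run, and the polling loop are illustrative choices, not the slurmctld implementation:

#include <signal.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#define MAX_PROG_TIME 5     /* seconds; illustrative, not SLURM's value */

int main(void)
{
    pid_t child = fork();

    if (child < 0) {
        perror("fork");
        return 1;
    }
    if (child == 0) {
        /* Child: become its own process group leader so the parent can
         * later signal the program and anything it spawned in one call. */
        setpgid(0, 0);
        execl("/bin/sleep", "sleep", "60", (char *) NULL);
        _exit(127);         /* exec failed */
    }
    setpgid(child, child);  /* set it from the parent too, avoiding a race */

    /* Poll up to MAX_PROG_TIME seconds, then kill the whole group. */
    for (int waited = 0; waited < MAX_PROG_TIME; waited++) {
        if (waitpid(child, NULL, WNOHANG) == child)
            return 0;       /* program finished on its own */
        sleep(1);
    }
    killpg(child, SIGKILL); /* kill the spawned program and its children */
    waitpid(child, NULL, 0);
    return 0;
}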