Commit 9744078c authored by Moe Jette
parent 5fd5cc92
@@ -72,8 +72,12 @@ documents those changes that are of interest to users and admins.
  -- Fixed bug in sview to be able to edit partitions correctly
  -- Fixed bug so in slurm.conf files where SlurmdPort isn't defined things
     work correctly.
- -  In sched/wiki2 and sched/wiki add support for batch job being requeued
+ -- In sched/wiki2 and sched/wiki add support for batch job being requeued
     in Slurm either when nodes fail or upon request.
+ -- In sched/wiki2 and sched/wiki with FastSchedule=2 configured and nodes
+    configured with more CPUs than actually exist, return a value of TASKS
+    equal to the number of configured CPUs that are allocated to a job rather
+    than the number of physical CPUs allocated.
 * Changes in SLURM 1.2.17
 =========================
......
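Note on the FastSchedule=2 entry above: with FastSchedule=2, slurmctld schedules against the CPU counts given in slurm.conf rather than the counts the nodes actually report, so the TASKS value passed through the wiki/wiki2 interface now reflects the configured CPUs allocated to the job. A minimal, hypothetical illustration of that distinction (the struct and function below are made up for this sketch, not SLURM code):

#include <stdio.h>

/* Illustration only: configured vs. physical CPU counts for one node. */
struct node_cfg {
    unsigned configured_cpus;   /* CPUs listed in slurm.conf */
    unsigned physical_cpus;     /* CPUs the node actually has */
};

static unsigned cpus_to_report(const struct node_cfg *node, int fast_schedule)
{
    /* FastSchedule=2: trust the configuration, even if it overstates reality */
    if (fast_schedule == 2)
        return node->configured_cpus;
    return node->physical_cpus;
}

int main(void)
{
    struct node_cfg n = { .configured_cpus = 8, .physical_cpus = 4 };
    printf("TASKS=%u;\n", cpus_to_report(&n, 2));   /* prints TASKS=8; */
    return 0;
}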
@@ -96,8 +96,8 @@ reject_msg_t reject_msgs[REJECT_MSG_MAX];
  *	UPDATETIME=<uts>;	time last active
  *	[FLAGS=INTERACTIVE;]	set if interactive (not batch) job
  *	WCLIMIT=<secs>;		wall clock time limit, seconds
- *	[TASKS=<cpus>;]		CPUs required
- *	[NODES=<nodes>;]	nodes required
+ *	TASKS=<cpus>;		CPUs required
+ *	NODES=<nodes>;		nodes required
  *	QUEUETIME=<uts>;	submission time
  *	STARTTIME=<uts>;	time execution started
  *	RCLASS=<partition>;	SLURM partition name
@@ -294,15 +294,11 @@ static char * _dump_job(struct job_record *job_ptr, int state_info)
 		(uint32_t) _get_job_time_limit(job_ptr));
 	xstrcat(buf, tmp);
-	if (job_ptr->job_state == JOB_PENDING) {
-		/* Don't report actual tasks or nodes allocated since
-		 * this can impact requeue on heterogenous clusters */
-		snprintf(tmp, sizeof(tmp),
-			"TASKS=%u;NODES=%u;",
-			_get_job_tasks(job_ptr),
-			_get_job_min_nodes(job_ptr));
-		xstrcat(buf, tmp);
-	}
+	snprintf(tmp, sizeof(tmp),
+		"TASKS=%u;NODES=%u;",
+		_get_job_tasks(job_ptr),
+		_get_job_min_nodes(job_ptr));
+	xstrcat(buf, tmp);
 	snprintf(tmp, sizeof(tmp),
 		"QUEUETIME=%u;STARTTIME=%u;RCLASS=%s;",
......
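The fields documented above are serialized by _dump_job() into one semicolon-delimited KEY=value string, and with this change the TASKS= and NODES= pair is appended for every job rather than only for pending ones (hence the brackets dropped from the field list). A self-contained sketch of that string building, using snprintf()/strncat() in place of SLURM's xstrcat() and with made-up values standing in for _get_job_tasks() and _get_job_min_nodes():

#include <stdio.h>
#include <string.h>

int main(void)
{
    char buf[256] = "";
    char tmp[64];

    /* Hypothetical values; the plugin reads these from struct job_record. */
    unsigned tasks = 8, nodes = 2;
    unsigned queue_time = 1190000000u, start_time = 1190000100u;

    /* TASKS/NODES are now reported unconditionally */
    snprintf(tmp, sizeof(tmp), "TASKS=%u;NODES=%u;", tasks, nodes);
    strncat(buf, tmp, sizeof(buf) - strlen(buf) - 1);

    snprintf(tmp, sizeof(tmp), "QUEUETIME=%u;STARTTIME=%u;RCLASS=%s;",
             queue_time, start_time, "debug");
    strncat(buf, tmp, sizeof(buf) - strlen(buf) - 1);

    printf("%s\n", buf);
    return 0;
}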
@@ -245,11 +245,12 @@ static int _start_job(uint32_t jobid, int task_cnt, char *hostlist,
 	/* No errors so far */
 	(void) schedule();	/* provides own locking */
 	/* Check to insure the job was actually started */
 	lock_slurmctld(job_write_lock);
-	/* job_ptr = find_job_record(jobid);	don't bother */
-	if ((job_ptr->job_id == jobid) && job_ptr->details &&
+	if (job_ptr->job_id != jobid)
+		job_ptr = find_job_record(jobid);
+	if (job_ptr && (job_ptr->job_id == jobid) && job_ptr->details &&
 	    (job_ptr->job_state == JOB_RUNNING)) {
 		/* Restore required node list */
 		xfree(job_ptr->details->req_nodes);
@@ -260,10 +261,11 @@ static int _start_job(uint32_t jobid, int task_cnt, char *hostlist,
 	} else {
 		xfree(save_req_nodes);
 		FREE_NULL_BITMAP(save_req_bitmap);
-		FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
+		if (job_ptr && (job_ptr->job_id == jobid) && job_ptr->details)
+			FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
 	}
-	if ((job_ptr->job_id == jobid)
+	if (job_ptr && (job_ptr->job_id == jobid)
 	    && (job_ptr->job_state != JOB_RUNNING)) {
 		uint16_t wait_reason = 0;
 		char *wait_string;
......
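The change above stops trusting the job_ptr that was cached before schedule() released the locks: the record is looked up again with find_job_record() and every later dereference is guarded against a NULL or mismatched pointer, since the job may have been purged in the meantime. A stripped-down sketch of the same defensive pattern; the record type and lookup table here are placeholders, not slurmctld's job table:

#include <stddef.h>
#include <stdio.h>

struct job_rec {
    unsigned job_id;
    int running;
};

static struct job_rec jobs[] = { { 42, 1 }, { 43, 0 } };

/* Placeholder for find_job_record(): NULL when the job no longer exists. */
static struct job_rec *lookup_job(unsigned job_id)
{
    for (size_t i = 0; i < sizeof(jobs) / sizeof(jobs[0]); i++) {
        if (jobs[i].job_id == job_id)
            return &jobs[i];
    }
    return NULL;
}

static void check_started(struct job_rec *job_ptr, unsigned jobid)
{
    /* Re-resolve the record if the cached pointer is stale or missing */
    if (job_ptr == NULL || job_ptr->job_id != jobid)
        job_ptr = lookup_job(jobid);

    /* Guard every dereference: the job may have vanished while unlocked */
    if (job_ptr && (job_ptr->job_id == jobid) && job_ptr->running)
        printf("job %u is running\n", jobid);
    else
        printf("job %u did not start\n", jobid);
}

int main(void)
{
    check_started(NULL, 42);    /* stale pointer: re-looked up, running */
    check_started(NULL, 99);    /* job gone: guards prevent a crash */
    return 0;
}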
@@ -874,6 +874,12 @@ static void _trigger_node_event(trig_mgr_info_t *trig_in, time_t now)
 	}
 }
+/* Ideally we would use the existing proctrack plugin to prevent any
+ * processes from escaping our control, but that plugin is tied
+ * to various slurmd data structures. We just use the process group ID
+ * to kill the spawned program after MAX_PROG_TIME. Since triggers are
+ * meant primarily for system administrators rather than users, this
+ * may be sufficient. */
 static void _trigger_run_program(trig_mgr_info_t *trig_in)
 {
 	char program[1024], arg0[1024], arg1[1024], user_name[1024], *pname;
......
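The comment added above explains that the spawned trigger program is controlled through its process group and killed after MAX_PROG_TIME rather than being tracked by the proctrack plugin. A self-contained sketch of that general fork/setpgid/killpg pattern; the timeout value, the program being run, and the polling loop are illustrative choices, not the slurmctld implementation:

#include <signal.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#define MAX_PROG_TIME 5     /* seconds; illustrative, not SLURM's value */

int main(void)
{
    pid_t child = fork();

    if (child < 0) {
        perror("fork");
        return 1;
    }
    if (child == 0) {
        /* Child: become its own process group leader so the parent can
         * later signal the program and anything it spawned in one call. */
        setpgid(0, 0);
        execl("/bin/sleep", "sleep", "60", (char *) NULL);
        _exit(127);         /* exec failed */
    }
    setpgid(child, child);  /* set it from the parent too, avoiding a race */

    /* Poll up to MAX_PROG_TIME seconds, then kill the whole group. */
    for (int waited = 0; waited < MAX_PROG_TIME; waited++) {
        if (waitpid(child, NULL, WNOHANG) == child)
            return 0;       /* program finished on its own */
        sleep(1);
    }
    killpg(child, SIGKILL); /* kill the spawned program and its children */
    waitpid(child, NULL, 0);
    return 0;
}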