From 8e3c864c126c3aca019f17d6edfeebe1b8a42501 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Fri, 23 May 2008 19:52:37 +0000
Subject: [PATCH] Fix infinite loop in task/affinity when there is a
 bookkeeping error.
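
Previously lllp_distribution() ignored the return codes of the
_task_layout_lllp_* functions: it tested the "masks" pointer instead,
freed the masks only on the success path, and logged an error that
applied a "%s" format to the numeric req->task_dist. Capture the
return code, process the masks only on SLURM_SUCCESS, and free them
unconditionally.

Each layout loop also gains a progress guard: if a complete placement
pass assigns no task, the function reports the failure and returns
SLURM_ERROR instead of iterating forever. The guard has the same shape
in all three layout functions; a minimal sketch of the pattern (names
as in the surrounding code):

	int last_taskcount = -1, taskcount = 0;
	while (taskcount < maxtasks) {
		if (taskcount == last_taskcount) {
			/* no task placed in the last full pass */
			error("_task_layout_lllp_* failure");
			return SLURM_ERROR;
		}
		last_taskcount = taskcount;
		/* ... place tasks, incrementing taskcount
		 * on each successful placement ... */
	}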

---
 NEWS                                   |  1 +
 src/plugins/task/affinity/dist_tasks.c | 41 ++++++++++++++++----------
 2 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/NEWS b/NEWS
index 6ede0cb6040..e289042bc98 100644
--- a/NEWS
+++ b/NEWS
@@ -12,6 +12,7 @@ documents those changes that are of interest to users and admins.
     CEA).
  -- For BlueGene only, log "Prolog failure" once per job not once per node.
  -- Reopen slurmctld log file after reconfigure or SIGHUP is received.
+ -- In TaskPlugin=task/affinity, fix possible infinite loop in slurmd.
 
 * Changes in SLURM 1.3.2
 ========================
diff --git a/src/plugins/task/affinity/dist_tasks.c b/src/plugins/task/affinity/dist_tasks.c
index 8a707a15dfb..61ec4ea6945 100644
--- a/src/plugins/task/affinity/dist_tasks.c
+++ b/src/plugins/task/affinity/dist_tasks.c
@@ -162,24 +162,24 @@ void lllp_distribution(launch_tasks_request_msg_t *req, uint32_t node_id)
 	switch (req->task_dist) {
 	case SLURM_DIST_BLOCK_BLOCK:
 	case SLURM_DIST_CYCLIC_BLOCK:
-		_task_layout_lllp_block(req, gtid, maxtasks, &masks);
+		rc = _task_layout_lllp_block(req, gtid, maxtasks, &masks);
 		break;
 	case SLURM_DIST_CYCLIC:
 	case SLURM_DIST_BLOCK:
 	case SLURM_DIST_CYCLIC_CYCLIC:
 	case SLURM_DIST_BLOCK_CYCLIC:
-		_task_layout_lllp_cyclic(req, gtid, maxtasks, &masks); 
+		rc = _task_layout_lllp_cyclic(req, gtid, maxtasks, &masks);
 		break;
 	case SLURM_DIST_PLANE:
-		_task_layout_lllp_plane(req, gtid, maxtasks, &masks); 
+		rc = _task_layout_lllp_plane(req, gtid, maxtasks, &masks);
 		break;
 	default:
-		_task_layout_lllp_cyclic(req, gtid, maxtasks, &masks); 
+		rc = _task_layout_lllp_cyclic(req, gtid, maxtasks, &masks);
 		req->task_dist = SLURM_DIST_BLOCK_CYCLIC;
 		break;
 	}
 
-	if (masks) {
+	if (rc == SLURM_SUCCESS) {
 		_task_layout_display_masks(req, gtid, maxtasks, masks); 
 		if (req->cpus_per_task > 1) {
 			_lllp_enlarge_masks(req, maxtasks, masks);
@@ -190,12 +190,8 @@ void lllp_distribution(launch_tasks_request_msg_t *req, uint32_t node_id)
 	    	_lllp_map_abstract_masks(maxtasks, masks);
 		_task_layout_display_masks(req, gtid, maxtasks, masks); 
 	    	_lllp_generate_cpu_bind(req, maxtasks, masks);
-	    	_lllp_free_masks(req, maxtasks, masks);
 	}
-
-	if(rc != SLURM_SUCCESS)
-		error (" Error in lllp_distribution_create %s ", 
-		       req->task_dist);
+	_lllp_free_masks(req, maxtasks, masks);
 }
 
 static
@@ -725,7 +721,7 @@ static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req,
 				    const uint32_t maxtasks,
 				    bitstr_t ***masks_p)
 {
-	int retval, i, taskcount = 0, taskid = 0;
+	int retval, i, last_taskcount = -1, taskcount = 0, taskid = 0;
 	uint16_t socket_index = 0, core_index = 0, thread_index = 0;
 	uint16_t hw_sockets = 0, hw_cores = 0, hw_threads = 0;
 	uint16_t usable_cpus = 0, avail_cpus = 0;
@@ -754,7 +750,12 @@ static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req,
 		return retval;
 	masks = *masks_p;
 
-	for (i=0; taskcount<maxtasks; i++) { 
+	for (i=0; taskcount<maxtasks; i++) {
+		if (taskcount == last_taskcount) {	/* no task placed in last pass */
+			error("_task_layout_lllp_cyclic failure");
+			return SLURM_ERROR;
+		}
+		last_taskcount = taskcount;
 		for (thread_index=0; thread_index<usable_threads; thread_index++) {
 			for (core_index=0; core_index<usable_cores; core_index++) {
 				for (socket_index=0; socket_index<usable_sockets; 
@@ -809,7 +810,7 @@ static int _task_layout_lllp_block(launch_tasks_request_msg_t *req,
 				   const uint32_t maxtasks,
 				   bitstr_t ***masks_p)
 {
-        int retval, j, k, l, m, taskcount = 0, taskid = 0;
+	int retval, j, k, l, m, last_taskcount = -1, taskcount = 0, taskid = 0;
 	int over_subscribe  = 0, space_remaining = 0;
 	uint16_t core_index = 0, thread_index = 0;
 	uint16_t hw_sockets = 0, hw_cores = 0, hw_threads = 0;
@@ -846,6 +847,11 @@ static int _task_layout_lllp_block(launch_tasks_request_msg_t *req,
 	}
 	
 	while(taskcount < maxtasks) {
+		if (taskcount == last_taskcount) {	/* no task placed in last pass */
+			error("_task_layout_lllp_block failure");
+			return SLURM_ERROR;
+		}
+		last_taskcount = taskcount;
 		for (j=0; j<usable_sockets; j++) {
 			for(core_index=0; core_index < usable_cores; core_index++) {
 				if((core_index < usable_cores) || (over_subscribe)) {
@@ -885,7 +891,7 @@ static int _task_layout_lllp_block(launch_tasks_request_msg_t *req,
 	}
 	
 	/* Distribute the tasks and create masks for the task
-	   affinity plug-in */
+	 * affinity plug-in */
 	taskid = 0;
 	taskcount = 0;
 	for (j=0; j<usable_sockets; j++) {
@@ -948,7 +954,7 @@ static int _task_layout_lllp_plane(launch_tasks_request_msg_t *req,
 				   const uint32_t maxtasks,
 				   bitstr_t ***masks_p)
 {
-        int retval, j, k, l, m, taskid = 0, next = 0;
+	int retval, j, k, l, m, taskid = 0, last_taskcount = -1, next = 0;
 	uint16_t core_index = 0, thread_index = 0;
 	uint16_t hw_sockets = 0, hw_cores = 0, hw_threads = 0;
 	uint16_t usable_cpus = 0, avail_cpus = 0;
@@ -984,6 +990,11 @@ static int _task_layout_lllp_plane(launch_tasks_request_msg_t *req,
 	next = 0;
 
 	for (j=0; next<maxtasks; j++) {
+		if (next == last_taskcount) {	/* no task placed in last pass */
+			error("_task_layout_lllp_plane failure");
+			return SLURM_ERROR;
+		}
+		last_taskcount = next;
 		for (k=0; k<usable_sockets; k++) {
 			max_plane_size = (plane_size > usable_cores) ? plane_size : usable_cores;
 			for (m=0; m<max_plane_size; m++) {
-- 
GitLab