From 148562d97bc34cb311231ef922a308b5f40b87c7 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Thu, 11 Dec 2003 21:29:38 +0000
Subject: [PATCH] Fix bug in slurmd -c handling of shared memory deletion.

---
 NEWS             |  5 ++++-
 src/slurmd/shm.c | 13 ++++++-------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/NEWS b/NEWS
index 62de0e0e93f..33bb27b682e 100644
--- a/NEWS
+++ b/NEWS
@@ -1,11 +1,12 @@
 This file describes changes in recent versions of SLURM. It primarily
 documents those changes that are of interest to users and admins. 
 
-* Changes in SLURM 0.3.0-pre3 (not yet tagged or built)
+* Changes in SLURM 0.3.0-pre3
 =============================
  -- Fixes for reported problems:
    - slurm/328: Slurmd was restarting with a new shared memory segment and 
      losing track of jobs
+   - slurm/329: Job processing may be left running when one task dies
    - slurm/333: Slurmd fails to launch a job and deletes a step, due to 
      a race condition in shared memory management
    - slurm/334: Slurmd was getting a segv due to a race condition in shared 
@@ -35,6 +36,8 @@ documents those changes that are of interest to users and admins.
  -- Insure proper enforcement of node sharing by job
  -- Treat lack of SpoolDir or StateSaveDir as a fatal error
  -- Quickstart.html guide expanded
+ -- Increase maximum job steps per node from 16 to 64
+ -- Delete correct shared memory segment on slurmd -c (clean start)
 
 * Changes in SLURM 0.3.0-pre2
 =============================
diff --git a/src/slurmd/shm.c b/src/slurmd/shm.c
index af4ac0f9fa9..a3b9858efa5 100644
--- a/src/slurmd/shm.c
+++ b/src/slurmd/shm.c
@@ -73,8 +73,7 @@
 /* We use Chris Dunlap's POSIX semaphore implementation if necessary */
 #include "src/slurmd/semaphore.h"
 
-#define MAX_JOB_STEPS	16
-#define MAX_BATCH_JOBS	128
+#define MAX_JOB_STEPS	64
 #define MAX_TASKS	1024
 
 #define SHM_LOCKNAME	"/.slurm.lock"
@@ -200,7 +199,7 @@ shm_fini(void)
 	slurmd_shm = NULL;
 
 	if (destroy && (shmctl(shmid, IPC_RMID, NULL) < 0)) {
-		error("shmctl: %m");
+		error("Can't delete shm segment (%d): %m", shmid);
 		goto error;
 	}
 	_shm_unlock();
@@ -239,10 +238,10 @@ shm_cleanup(void)
 		 *  region.
 		 */
 		id = shmget(key, 1, 0);
-	} 
+	}
 
-	if ((id > 0) && (shmctl(shmid, IPC_RMID, NULL) < 0)) {
-		error ("Unable to destroy existing shm segment");
+	if ((id > 0) && (shmctl(id, IPC_RMID, NULL) < 0)) {
+		error ("Can't destroy existing shm segment (%d): %m", id);
 	}
 }
 
@@ -961,7 +960,7 @@ _shm_attach()
 		      shmid, (int)(shmi.shm_segsz/1024), 
 		      (sizeof(slurmd_shm_t)/1024));
 		error("You probably need to run with `-c' "
-			"or just delete old segment.");
+			"or just delete old segment (see `ipcrm').");
 		slurm_seterrno_ret(EINVAL);
 	}
 
-- 
GitLab