From 148562d97bc34cb311231ef922a308b5f40b87c7 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Thu, 11 Dec 2003 21:29:38 +0000 Subject: [PATCH] Fix bug in slurmd -c handling of shared memory deletion. --- NEWS | 5 ++++- src/slurmd/shm.c | 13 ++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/NEWS b/NEWS index 62de0e0e93f..33bb27b682e 100644 --- a/NEWS +++ b/NEWS @@ -1,11 +1,12 @@ This file describes changes in recent versions of SLURM. It primarily documents those changes that are of interest to users and admins. -* Changes in SLURM 0.3.0-pre3 (not yet tagged or built) +* Changes in SLURM 0.3.0-pre3 ============================= -- Fixes for reported problems: - slurm/328: Slurmd was restarting with a new shared memory segment and losing track of jobs + - slurm/329: Job processing may be left running when one task dies - slurm/333: Slurmd fails to launch a job and deletes a step, due to a race condition in shared memory management - slurm/334: Slurmd was getting a segv due to a race condition in shared @@ -35,6 +36,8 @@ documents those changes that are of interest to users and admins. 
-- Insure proper enforcement of node sharing by job -- Treat lack of SpoolDir or StateSaveDir as a fatal error -- Quickstart.html guide expanded + -- Increase maximum job steps per node from 16 to 64 + -- Delete correct shared memory segment on slurmd -c (clean start) * Changes in SLURM 0.3.0-pre2 ============================= diff --git a/src/slurmd/shm.c b/src/slurmd/shm.c index af4ac0f9fa9..a3b9858efa5 100644 --- a/src/slurmd/shm.c +++ b/src/slurmd/shm.c @@ -73,8 +73,7 @@ /* We use Chris Dunlap's POSIX semaphore implementation if necessary */ #include "src/slurmd/semaphore.h" -#define MAX_JOB_STEPS 16 -#define MAX_BATCH_JOBS 128 +#define MAX_JOB_STEPS 64 #define MAX_TASKS 1024 #define SHM_LOCKNAME "/.slurm.lock" @@ -200,7 +199,7 @@ shm_fini(void) slurmd_shm = NULL; if (destroy && (shmctl(shmid, IPC_RMID, NULL) < 0)) { - error("shmctl: %m"); + error("Can't delete shm segment (%d): %m", shmid); goto error; } _shm_unlock(); @@ -239,10 +238,10 @@ shm_cleanup(void) * region. */ id = shmget(key, 1, 0); - } + } - if ((id > 0) && (shmctl(shmid, IPC_RMID, NULL) < 0)) { - error ("Unable to destroy existing shm segment"); + if ((id > 0) && (shmctl(id, IPC_RMID, NULL) < 0)) { + error ("Can't destroy existing shm segment (%d): %m", id); } } @@ -961,7 +960,7 @@ _shm_attach() shmid, (int)(shmi.shm_segsz/1024), (sizeof(slurmd_shm_t)/1024)); error("You probably need to run with `-c' " - "or just delete old segment."); + "or just delete old segment (see `ipcrm')."); slurm_seterrno_ret(EINVAL); } -- GitLab