From af6b702bd96a7f16c5e4321a6f4dc1d0f05a720d Mon Sep 17 00:00:00 2001
From: Mark Grondona <mgrondona@llnl.gov>
Date: Wed, 23 Oct 2002 20:49:37 +0000
Subject: [PATCH] Typo fixes; better shm_init() failure handling.

---
 src/slurmd/elan_interconnect.c |  3 ---
 src/slurmd/io.c                | 49 +++++++++++++++++-----------------
 src/slurmd/mgr.c               |  2 ++
 src/slurmd/shm.c               | 48 ++++++++++++++++++++-------------
 src/slurmd/slurmd.c            |  3 ++-
 5 files changed, 58 insertions(+), 47 deletions(-)

diff --git a/src/slurmd/elan_interconnect.c b/src/slurmd/elan_interconnect.c
index 2032eb604f3..9a9739069f4 100644
--- a/src/slurmd/elan_interconnect.c
+++ b/src/slurmd/elan_interconnect.c
@@ -52,8 +52,6 @@ _wait_and_destroy_prg(qsw_jobinfo_t qsw_job, pid_t pid)
 	int i = 0;
 	int sleeptime = 1;
 
-	shm_init();
-
 	debug3("waiting to destory program description...");
   again:
 	if (waitpid(pid, NULL, 0) < 0) {
@@ -80,7 +78,6 @@ _wait_and_destroy_prg(qsw_jobinfo_t qsw_job, pid_t pid)
 		sleep(sleeptime*=2);
 	}
 
-	shm_fini();
 	exit(0);
 	return SLURM_SUCCESS;
 }
diff --git a/src/slurmd/io.c b/src/slurmd/io.c
index 5ca27c814df..11894afc89d 100644
--- a/src/slurmd/io.c
+++ b/src/slurmd/io.c
@@ -63,9 +63,8 @@ typedef enum slurmd_io_tupe {
 	CLIENT_STDOUT,
 } slurmd_io_type_t;
 
-static char *slurmd_io_str[] = 
+static char *_io_str[] = 
 {
-	"domain socket",
 	"task stderr",
 	"task stdout",
 	"task stdin",
@@ -332,8 +331,8 @@ _validate_task_out(struct io_info *t, int type)
 	while ((r = list_next(i))) {
 		if (r->type != type) {
 			fatal("_validate_io: %s reader is %s",
-					slurmd_io_str[t->type],
-					slurmd_io_str[r->type]);
+					_io_str[t->type],
+					_io_str[r->type]);
 		}
 	}
 	list_iterator_destroy(i);
@@ -357,13 +356,13 @@ _validate_task_in(struct io_info *t)
 	while ((r = list_next(i)) != NULL) {
 		if (r->magic != IO_MAGIC) {
 			error("_validate_io: %s writer is invalid", 
-					slurmd_io_str[t->type]);
+					_io_str[t->type]);
 			return 0;
 		}
 		if (r->type != CLIENT_STDOUT) {
 			error("_validate_io: %s writer is %s",
-					slurmd_io_str[t->type],
-					slurmd_io_str[r->type]);
+					_io_str[t->type],
+					_io_str[r->type]);
 			retval = 0;
 		}
 	}
@@ -386,7 +385,7 @@ _validate_client_stdout(struct io_info *client)
 	while ((t = list_next(i))) {
 		if (t->type != TASK_STDIN) {
 			error("_validate_io: client stdin reader is %s",
-					slurmd_io_str[t->type]);
+					_io_str[t->type]);
 			retval = 0;
 		}
 	}
@@ -396,7 +395,7 @@ _validate_client_stdout(struct io_info *client)
 	while ((t = list_next(i))) {
 		if (t->type != TASK_STDOUT) {
 			error("_validate_io: client stdout writer is %s",
-					slurmd_io_str[t->type]);
+					_io_str[t->type]);
 			retval = 0;
 		}
 	}
@@ -421,7 +420,7 @@ _validate_client_stderr(struct io_info *client)
 	while ((t = list_next(i))) {
 		if (t->type != TASK_STDERR) {
 			error("_validate_io: client stderr writer is %s",
-					slurmd_io_str[t->type]);
+					_io_str[t->type]);
 			retval = 0;
 		}
 	}
@@ -691,8 +690,7 @@ _readable(io_obj_t *obj)
 
 	xassert(io->magic == IO_MAGIC);
 
-	if ((rc = (!io->disconnected && !io->eof && (obj->fd > 0))))
-		debug3("readable %s", slurmd_io_str[io->type]);
+	rc = (!io->disconnected && !io->eof && (obj->fd > 0));
 
 	return rc;
 }
@@ -707,8 +705,7 @@ _writable(io_obj_t *obj)
 
 	rc = (!io->disconnected 
 		&& ((cbuf_used(io->buf) > 0) || io->eof));
-	if (rc)
-		debug3("writable %s", slurmd_io_str[io->type]);
+
 	return rc;
 }
 
@@ -725,7 +722,7 @@ _write(io_obj_t *obj, List objs)
 		return 0;
 
 	verbose("Need to write %ld bytes to %s %d", 
-		cbuf_used(io->buf), slurmd_io_str[io->type], io->id);
+		cbuf_used(io->buf), _io_str[io->type], io->id);
 
 
 	if (io->eof && (cbuf_used(io->buf) == 0)) {
@@ -741,16 +738,18 @@ _write(io_obj_t *obj, List objs)
 	}
 
 	while ((n = cbuf_read_fd(io->buf, obj->fd, -1)) < 0) {
-		int local_errno = errno;
 		if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) 
 			continue;
-		error("task <%ld> write failed: %s", io->id, 
-				slurm_strerror(local_errno));
+		error("task <%ld> write failed: %m", io->id);
+		if (io->type == CLIENT_STDERR || io->type == CLIENT_STDOUT)
+			_io_disconnect_client(io);
+		else
+			_shutdown_task_obj(io);
 		return -1;
 	}
 
 	verbose("Wrote %d bytes to %s %d", 
-		 n, slurmd_io_str[io->type], io->id);
+		 n, _io_str[io->type], io->id);
 
 	return 0;
 }
@@ -796,7 +795,7 @@ _task_read(io_obj_t *obj, List objs)
 			goto again;
 		if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) {
 		        error("%s %d: read returned EAGAIN",
-			       slurmd_io_str[t->type], t->id);
+			       _io_str[t->type], t->id);
 			return 0;
 		}
 		error("Unable to read from task %ld fd %d errno %d %m", 
@@ -804,7 +803,7 @@ _task_read(io_obj_t *obj, List objs)
 		return -1;
 	}
 	verbose("read %d bytes from %s %d", 
-		n, slurmd_io_str[t->type], t->id);
+		n, _io_str[t->type], t->id);
 
 	if (n == 0) {  /* got eof */
 		verbose("got eof on task %ld", t->id);
@@ -821,7 +820,7 @@ _task_read(io_obj_t *obj, List objs)
 	while((r = list_next(i))) {
 		n = cbuf_write(r->buf, (void *) buf, n, NULL);
 		verbose("wrote %ld bytes into %s buf", n, 
-				slurmd_io_str[r->type]);
+				_io_str[r->type]);
 	}
 	list_iterator_destroy(i);
 
@@ -834,7 +833,7 @@ _task_error(io_obj_t *obj, List objs)
 	struct io_info *t = (struct io_info *) obj->arg;
 	xassert(t->magic == IO_MAGIC);
 
-	error("error on %s %d", slurmd_io_str[t->type], t->id);
+	error("error on %s %d", _io_str[t->type], t->id);
 	_shutdown_task_obj(t);
 	obj->fd = -1;
 	list_delete_all(objs, (ListFindF) find_obj, obj);
@@ -867,7 +866,7 @@ _client_read(io_obj_t *obj, List objs)
 		return -1;
 	}
 
-	debug("read %d bytes from %s %d", n, slurmd_io_str[client->type],
+	debug("read %d bytes from %s %d", n, _io_str[client->type],
 			client->id);
 
 	if (n == 0)  { /* got eof, disconnect this client */
@@ -904,7 +903,7 @@ _client_error(io_obj_t *obj, List objs)
 
 	xassert(io->magic == IO_MAGIC);
 
-	error("%s task %d", slurmd_io_str[io->type], io->id); 
+	error("%s task %d", _io_str[io->type], io->id); 
 	return 0;
 }
 
diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c
index a2885ab179a..1d21e197fff 100644
--- a/src/slurmd/mgr.c
+++ b/src/slurmd/mgr.c
@@ -61,6 +61,8 @@ mgr_launch_tasks(launch_tasks_request_msg_t *msg)
 	slurmd_job_t *job;
 
 	log_reinit();
+
+	/* New process, so we must reinit shm */
 	if (shm_init() < 0) 
 		return SLURM_ERROR;
 	if (!(job = job_create(msg)))
diff --git a/src/slurmd/shm.c b/src/slurmd/shm.c
index da6b43d25a5..f3d8646b1e9 100644
--- a/src/slurmd/shm.c
+++ b/src/slurmd/shm.c
@@ -121,7 +121,6 @@ int
 shm_fini(void)
 {
 	int destroy = 0;
-	info("process %ld detaching from shm", getpid());
 	xassert(slurmd_shm != NULL);
 	_shm_lock();
 	if (--slurmd_shm->users == 0)
@@ -316,17 +315,20 @@ shm_signal_step(uint32_t jobid, uint32_t stepid, uint32_t signal)
 	int         retval = SLURM_SUCCESS;
 	int         i;
 	job_step_t *s;
-	task_t     *t;
+	task_t     *t, *tlast;
 
 	_shm_lock();
 	if ((i = _shm_find_step(jobid, stepid)) >= 0) {
 		s = &slurmd_shm->step[i];
+		tlast = NULL;
 		for (t = s->task_list; t; t = t->next) {
+			xassert(t != tlast);
 			if (t->pid > 0 && kill(t->pid, signo) < 0) {
-				error("kill %d.%d pid %ld: %m", 
-				      jobid, stepid, (long)t->pid);
+				error("kill %d.%d task %d pid %ld: %m", 
+				      jobid, stepid, t->id, (long)t->pid);
 				retval = errno;
 			}
+			tlast = t;
 		}	
 	} else
 		retval = ESRCH;
@@ -550,6 +552,7 @@ shm_add_task(uint32_t jobid, uint32_t stepid, task_t *task)
 		slurm_seterrno_ret(ESRCH);
 	} 
 	s = &slurmd_shm->step[i];
+	debug2("adding task %d to step %d.%d", task->id, jobid, stepid);
 	if (_shm_find_task_in_step(s, task->id)) {
 		_shm_unlock();
 		slurm_seterrno_ret(EEXIST);
@@ -578,9 +581,9 @@ _shm_find_task_in_step(job_step_t *s, int taskid)
 	task_t *t = NULL;
 	for (t = s->task_list; t && t->used; t = t->next) {
 		if (t->id == taskid)
-			break;
+			return t;
 	}
-	return t;
+	return NULL;
 }
 
 static task_t *
@@ -588,8 +591,10 @@ _shm_alloc_task(void)
 {
 	int i;
 	for (i = 0; i < MAX_TASKS; i++) {
-		if (!slurmd_shm->task[i].used) 
+		if (!slurmd_shm->task[i].used) {
+			slurmd_shm->task[i].used = true;
 			return &slurmd_shm->task[i];
+		}
 	}
 	return NULL;
 }
@@ -599,6 +604,7 @@ _shm_task_copy(task_t *to, task_t *from)
 {
 	*to = *from;
 	/* next and step are not valid for copying */
+	to->used = true;
 	to->next = NULL;
 	to->job_step = NULL;
 }
@@ -606,12 +612,9 @@ _shm_task_copy(task_t *to, task_t *from)
 static void 
 _shm_step_copy(job_step_t *to, job_step_t *from)
 {
-	task_t *t = NULL;
-	if (to->task_list)
-		t = to->task_list;
 	*to = *from;
 	to->state = SLURMD_JOB_ALLOCATED;
-	to->task_list = t; /* addition of tasks is another step */
+	to->task_list = NULL; /* addition of tasks is another step */
 }
 
 static void
@@ -640,9 +643,14 @@ _shm_create()
 	key_t key = ftok(".", 'a');
 
 	if ((shmid = shmget(key, sizeof(slurmd_shm_t), oflags)) < 0) {
-		if ((shmid = shmget(key, sizeof(slurmd_shm_t), 0600)) < 0)
-		error("shmget: %m");
-		return SLURM_ERROR;
+		if ((shmid = shmget(key, sizeof(slurmd_shm_t), 0600)) < 0) {
+			if (errno == EINVAL) {
+				error("shm_init: Existing shm invalid. "
+				      "Please remove.");
+			} else
+				error("shmget: %m");
+			return SLURM_ERROR;
+		}
 	}
 
 	slurmd_shm = shmat(shmid, NULL, 0);
@@ -663,13 +671,13 @@ _shm_attach()
 	key_t key = ftok(".", 'a');
 
 	if ((shmid = shmget(key, sizeof(slurmd_shm_t), oflags)) < 0) 
-		fatal("shm_attach: %m");
+		return SLURM_ERROR;
 
 	slurmd_shm = shmat(shmid, NULL, 0);
 	if (slurmd_shm == (void *)-1 || !slurmd_shm) 
-		fatal("shmat: %m");
+		return SLURM_ERROR;
 
-	return 1;
+	return SLURM_SUCCESS;
 }
 
 /* 
@@ -701,7 +709,11 @@ _shm_reopen()
 	}
 
 	/* Attach to shared memory region */
-	_shm_attach();
+	if ((_shm_attach() < 0) && (_shm_create() < 0)) {
+		error("shm_create(): %m");
+		return SLURM_FAILURE;
+	}
+
 
 	/* Lock and unlock semaphore to ensure data is initialized */
 	_shm_lock();
diff --git a/src/slurmd/slurmd.c b/src/slurmd/slurmd.c
index 6c1cb963f25..e6dafb1b71e 100644
--- a/src/slurmd/slurmd.c
+++ b/src/slurmd/slurmd.c
@@ -128,7 +128,8 @@ int main(int argc, char *argv[])
 	}
 
 	/* shared memory init */
-	slurmd_init();
+	if (slurmd_init() < 0)
+		exit (1);
 
 	if ((rc = getnodename(node_name, MAX_NAME_LEN)))
 		fatal("getnodename: %m");
-- 
GitLab