From af6b702bd96a7f16c5e4321a6f4dc1d0f05a720d Mon Sep 17 00:00:00 2001 From: Mark Grondona <mgrondona@llnl.gov> Date: Wed, 23 Oct 2002 20:49:37 +0000 Subject: [PATCH] o typo fixes. better shm_init() failure handling. --- src/slurmd/elan_interconnect.c | 3 --- src/slurmd/io.c | 49 +++++++++++++++++----------------- src/slurmd/mgr.c | 2 ++ src/slurmd/shm.c | 48 ++++++++++++++++++++------------- src/slurmd/slurmd.c | 3 ++- 5 files changed, 58 insertions(+), 47 deletions(-) diff --git a/src/slurmd/elan_interconnect.c b/src/slurmd/elan_interconnect.c index 2032eb604f3..9a9739069f4 100644 --- a/src/slurmd/elan_interconnect.c +++ b/src/slurmd/elan_interconnect.c @@ -52,8 +52,6 @@ _wait_and_destroy_prg(qsw_jobinfo_t qsw_job, pid_t pid) int i = 0; int sleeptime = 1; - shm_init(); - debug3("waiting to destory program description..."); again: if (waitpid(pid, NULL, 0) < 0) { @@ -80,7 +78,6 @@ _wait_and_destroy_prg(qsw_jobinfo_t qsw_job, pid_t pid) sleep(sleeptime*=2); } - shm_fini(); exit(0); return SLURM_SUCCESS; } diff --git a/src/slurmd/io.c b/src/slurmd/io.c index 5ca27c814df..11894afc89d 100644 --- a/src/slurmd/io.c +++ b/src/slurmd/io.c @@ -63,9 +63,8 @@ typedef enum slurmd_io_tupe { CLIENT_STDOUT, } slurmd_io_type_t; -static char *slurmd_io_str[] = +static char *_io_str[] = { - "domain socket", "task stderr", "task stdout", "task stdin", @@ -332,8 +331,8 @@ _validate_task_out(struct io_info *t, int type) while ((r = list_next(i))) { if (r->type != type) { fatal("_validate_io: %s reader is %s", - slurmd_io_str[t->type], - slurmd_io_str[r->type]); + _io_str[t->type], + _io_str[r->type]); } } list_iterator_destroy(i); @@ -357,13 +356,13 @@ _validate_task_in(struct io_info *t) while ((r = list_next(i)) != NULL) { if (r->magic != IO_MAGIC) { error("_validate_io: %s writer is invalid", - slurmd_io_str[t->type]); + _io_str[t->type]); return 0; } if (r->type != CLIENT_STDOUT) { error("_validate_io: %s writer is %s", - slurmd_io_str[t->type], - slurmd_io_str[r->type]); + _io_str[t->type], + _io_str[r->type]); retval = 0; } } @@ -386,7 +385,7 @@ _validate_client_stdout(struct io_info *client) while ((t = list_next(i))) { if (t->type != TASK_STDIN) { error("_validate_io: client stdin reader is %s", - slurmd_io_str[t->type]); + _io_str[t->type]); retval = 0; } } @@ -396,7 +395,7 @@ _validate_client_stdout(struct io_info *client) while ((t = list_next(i))) { if (t->type != TASK_STDOUT) { error("_validate_io: client stdout writer is %s", - slurmd_io_str[t->type]); + _io_str[t->type]); retval = 0; } } @@ -421,7 +420,7 @@ _validate_client_stderr(struct io_info *client) while ((t = list_next(i))) { if (t->type != TASK_STDERR) { error("_validate_io: client stderr writer is %s", - slurmd_io_str[t->type]); + _io_str[t->type]); retval = 0; } } @@ -691,8 +690,7 @@ _readable(io_obj_t *obj) xassert(io->magic == IO_MAGIC); - if ((rc = (!io->disconnected && !io->eof && (obj->fd > 0)))) - debug3("readable %s", slurmd_io_str[io->type]); + rc = (!io->disconnected && !io->eof && (obj->fd > 0)); return rc; } @@ -707,8 +705,7 @@ _writable(io_obj_t *obj) rc = (!io->disconnected && ((cbuf_used(io->buf) > 0) || io->eof)); - if (rc) - debug3("writable %s", slurmd_io_str[io->type]); + return rc; } @@ -725,7 +722,7 @@ _write(io_obj_t *obj, List objs) return 0; verbose("Need to write %ld bytes to %s %d", - cbuf_used(io->buf), slurmd_io_str[io->type], io->id); + cbuf_used(io->buf), _io_str[io->type], io->id); if (io->eof && (cbuf_used(io->buf) == 0)) { @@ -741,16 +738,18 @@ _write(io_obj_t *obj, List objs) } while ((n = cbuf_read_fd(io->buf, obj->fd, -1)) < 0) { - int local_errno = errno; if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) continue; - error("task <%ld> write failed: %s", io->id, - slurm_strerror(local_errno)); + error("task <%ld> write failed: %m", io->id); + if (io->type == CLIENT_STDERR || io->type == CLIENT_STDOUT) + _io_disconnect_client(io); + else + _shutdown_task_obj(io); return -1; } verbose("Wrote %d bytes to %s %d", - n, slurmd_io_str[io->type], io->id); + n, _io_str[io->type], io->id); return 0; } @@ -796,7 +795,7 @@ _task_read(io_obj_t *obj, List objs) goto again; if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) { error("%s %d: read returned EAGAIN", - slurmd_io_str[t->type], t->id); + _io_str[t->type], t->id); return 0; } error("Unable to read from task %ld fd %d errno %d %m", @@ -804,7 +803,7 @@ _task_read(io_obj_t *obj, List objs) return -1; } verbose("read %d bytes from %s %d", - n, slurmd_io_str[t->type], t->id); + n, _io_str[t->type], t->id); if (n == 0) { /* got eof */ verbose("got eof on task %ld", t->id); @@ -821,7 +820,7 @@ _task_read(io_obj_t *obj, List objs) while((r = list_next(i))) { n = cbuf_write(r->buf, (void *) buf, n, NULL); verbose("wrote %ld bytes into %s buf", n, - slurmd_io_str[r->type]); + _io_str[r->type]); } list_iterator_destroy(i); @@ -834,7 +833,7 @@ _task_error(io_obj_t *obj, List objs) struct io_info *t = (struct io_info *) obj->arg; xassert(t->magic == IO_MAGIC); - error("error on %s %d", slurmd_io_str[t->type], t->id); + error("error on %s %d", _io_str[t->type], t->id); _shutdown_task_obj(t); obj->fd = -1; list_delete_all(objs, (ListFindF) find_obj, obj); @@ -867,7 +866,7 @@ _client_read(io_obj_t *obj, List objs) return -1; } - debug("read %d bytes from %s %d", n, slurmd_io_str[client->type], + debug("read %d bytes from %s %d", n, _io_str[client->type], client->id); if (n == 0) { /* got eof, disconnect this client */ @@ -904,7 +903,7 @@ _client_error(io_obj_t *obj, List objs) xassert(io->magic == IO_MAGIC); - error("%s task %d", slurmd_io_str[io->type], io->id); + error("%s task %d", _io_str[io->type], io->id); return 0; } diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c index a2885ab179a..1d21e197fff 100644 --- a/src/slurmd/mgr.c +++ b/src/slurmd/mgr.c @@ -61,6 +61,8 @@ mgr_launch_tasks(launch_tasks_request_msg_t *msg) slurmd_job_t *job; log_reinit(); + + /* New process, so we must reinit shm */ if (shm_init() < 0) return SLURM_ERROR; if (!(job = job_create(msg))) diff --git a/src/slurmd/shm.c b/src/slurmd/shm.c index da6b43d25a5..f3d8646b1e9 100644 --- a/src/slurmd/shm.c +++ b/src/slurmd/shm.c @@ -121,7 +121,6 @@ int shm_fini(void) { int destroy = 0; - info("process %ld detaching from shm", getpid()); xassert(slurmd_shm != NULL); _shm_lock(); if (--slurmd_shm->users == 0) @@ -316,17 +315,20 @@ shm_signal_step(uint32_t jobid, uint32_t stepid, uint32_t signal) int retval = SLURM_SUCCESS; int i; job_step_t *s; - task_t *t; + task_t *t, *tlast; _shm_lock(); if ((i = _shm_find_step(jobid, stepid)) >= 0) { s = &slurmd_shm->step[i]; + tlast = NULL; for (t = s->task_list; t; t = t->next) { + xassert(t != tlast); if (t->pid > 0 && kill(t->pid, signo) < 0) { - error("kill %d.%d pid %ld: %m", - jobid, stepid, (long)t->pid); + error("kill %d.%d task %d pid %ld: %m", + jobid, stepid, t->id, (long)t->pid); retval = errno; } + tlast = t; } } else retval = ESRCH; @@ -550,6 +552,7 @@ shm_add_task(uint32_t jobid, uint32_t stepid, task_t *task) slurm_seterrno_ret(ESRCH); } s = &slurmd_shm->step[i]; + debug2("adding task %d to step %d.%d", task->id, jobid, stepid); if (_shm_find_task_in_step(s, task->id)) { _shm_unlock(); slurm_seterrno_ret(EEXIST); @@ -578,9 +581,9 @@ _shm_find_task_in_step(job_step_t *s, int taskid) task_t *t = NULL; for (t = s->task_list; t && t->used; t = t->next) { if (t->id == taskid) - break; + return t; } - return t; + return NULL; } static task_t * @@ -588,8 +591,10 @@ _shm_alloc_task(void) { int i; for (i = 0; i < MAX_TASKS; i++) { - if (!slurmd_shm->task[i].used) + if (!slurmd_shm->task[i].used) { + slurmd_shm->task[i].used = true; return &slurmd_shm->task[i]; + } } return NULL; } @@ -599,6 +604,7 @@ _shm_task_copy(task_t *to, task_t *from) { *to = *from; /* next and step are not valid for copying */ + to->used = true; to->next = NULL; to->job_step = NULL; } @@ -606,12 +612,9 @@ _shm_task_copy(task_t *to, task_t *from) static void _shm_step_copy(job_step_t *to, job_step_t *from) { - task_t *t = NULL; - if (to->task_list) - t = to->task_list; *to = *from; to->state = SLURMD_JOB_ALLOCATED; - to->task_list = t; /* addition of tasks is another step */ + to->task_list = NULL; /* addition of tasks is another step */ } static void @@ -640,9 +643,14 @@ _shm_create() key_t key = ftok(".", 'a'); if ((shmid = shmget(key, sizeof(slurmd_shm_t), oflags)) < 0) { - if ((shmid = shmget(key, sizeof(slurmd_shm_t), 0600)) < 0) - error("shmget: %m"); - return SLURM_ERROR; + if ((shmid = shmget(key, sizeof(slurmd_shm_t), 0600)) < 0) { + if (errno == EINVAL) { + error("shm_init: Existing shm invalid. " + "Please remove."); + } else + error("shmget: %m"); + return SLURM_ERROR; + } } slurmd_shm = shmat(shmid, NULL, 0); @@ -663,13 +671,13 @@ _shm_attach() key_t key = ftok(".", 'a'); if ((shmid = shmget(key, sizeof(slurmd_shm_t), oflags)) < 0) - fatal("shm_attach: %m"); + return SLURM_ERROR; slurmd_shm = shmat(shmid, NULL, 0); if (slurmd_shm == (void *)-1 || !slurmd_shm) - fatal("shmat: %m"); + return SLURM_ERROR; - return 1; + return SLURM_SUCCESS; } /* @@ -701,7 +709,11 @@ _shm_reopen() } /* Attach to shared memory region */ - _shm_attach(); + if ((_shm_attach() < 0) && (_shm_create() < 0)) { + error("shm_create(): %m"); + return SLURM_FAILURE; + } + /* Lock and unlock semaphore to ensure data is initialized */ _shm_lock(); diff --git a/src/slurmd/slurmd.c b/src/slurmd/slurmd.c index 6c1cb963f25..e6dafb1b71e 100644 --- a/src/slurmd/slurmd.c +++ b/src/slurmd/slurmd.c @@ -128,7 +128,8 @@ int main(int argc, char *argv[]) } /* shared memory init */ - slurmd_init(); + if (slurmd_init() < 0) + exit (1); if ((rc = getnodename(node_name, MAX_NAME_LEN))) fatal("getnodename: %m"); -- GitLab