diff --git a/Makefile.am b/Makefile.am index 2b2ca549ea6025f84d5081fb42a5a6bc25245be3..00dad97e1a6b81b8724cc31e27c598da5acc2b59 100644 --- a/Makefile.am +++ b/Makefile.am @@ -2,9 +2,15 @@ AUTOMAKE_OPTIONS = foreign SUBDIRS = src testsuite doc -EXTRA_DIST = auxdir etc +EXTRA_DIST = \ + auxdir \ + etc \ + doc \ + autogen.sh MAINTAINERCLEANFILES = Makefile.in aclocal.m4 config.guess \ config.h.in config.sub configure install-sh \ ltconfig ltmain.sh missing mkinstalldirs \ stamp-h.in + +include $(top_srcdir)/Make-rpm.mk diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index 015230b3b614257d61e418a84323f740ec188ab0..3cf46711be34608497b0796428d70c2316cec44f 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -154,8 +154,8 @@ verbose operation. Multiple \fB-v\fR's will further increase the verbosity of .TP \fB\-d\fR, \fB\-\-slurmd-debug\fR=\fIlevel\fR Specify a debug level for slurmd(8). \fIlevel\fR may be an integer value -between 0 [quiet, only errors are displayed] and 6 [insanely verbose -operation]. The slurmd debug information is copied to the stderr of +between 0 [quiet, only errors are displayed] and 5 [insanely verbose +operation]. The slurmd debug information is copied onto the stderr of the job. .TP \fB\-W\fR, \fB\-\-wait\fR=\fIseconds\fR diff --git a/src/common/daemonize.c b/src/common/daemonize.c index 08b07ade2db59222a288ebf83c20c7ec233b1a47..5790b999e0b506b7bbfc91b4219620ef420d365a 100644 --- a/src/common/daemonize.c +++ b/src/common/daemonize.c @@ -83,8 +83,14 @@ daemon(int nochdir, int noclose) } +/* + * Read and return pid stored in pidfile. + * Returns 0 if file doesn't exist or pid cannot be read. + * If pidfd != NULL, the file will be kept open and the fd + * returned. + */ pid_t -read_pidfile(const char *pidfile) +read_pidfile(const char *pidfile, int *pidfd) { int fd; FILE *fp = NULL; @@ -113,6 +119,11 @@ read_pidfile(const char *pidfile) fatal ("pidfile locked by %ld but contains pid=%ld", (long) lpid, pid); + if (pidfd != NULL) + *pidfd = fd; + else + (void) close(fd); /* Ignore errors */ + return (lpid); } diff --git a/src/common/daemonize.h b/src/common/daemonize.h index a4f382ee7d2cf577723ba45c47734592c6afde68..903464826fac8a0bb3a19fc55bc4a022c8f56427 100644 --- a/src/common/daemonize.h +++ b/src/common/daemonize.h @@ -44,7 +44,9 @@ int create_pidfile(char *pidfilename); /* * Attempt to read an old pid from the configured pidfile * Returns 0 if no pidfile exists (No running process) + * If pidfilefd is not NULL, returns open file descriptor for + * pidfile (when pid != 0). */ -pid_t read_pidfile(char *pidfilename); +pid_t read_pidfile(char *pidfilename, int *pidfilefd); #endif /* !_HAVE_DAEMONIZE_H */ diff --git a/src/common/log.c b/src/common/log.c index 1cebb6f6e6c798c46cd87ba48437cf2461f4a2ab..5659c778a69298f1291c55e430ff07b4c218e841 100644 --- a/src/common/log.c +++ b/src/common/log.c @@ -179,7 +179,7 @@ _log_init(char *prog, log_options_t opt, log_facility_t fac, char *logfile ) if (log->opt.syslog_level > LOG_LEVEL_QUIET) log->facility = fac; - if (logfile && log->opt.logfile_level > LOG_LEVEL_QUIET) { + if (logfile && (log->opt.logfile_level > LOG_LEVEL_QUIET)) { FILE *fp; fp = safeopen(logfile, "a", SAFEOPEN_LINK_OK); diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index dc91240987089c00812bf039826da9606d9aabbd..b6c5b2ad291a8d5e8eaaac523fb989a57673a404 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -193,7 +193,7 @@ int main(int argc, char *argv[]) if (chdir(slurmctld_conf.state_save_location)) fatal("chdir to %s error %m", slurmctld_conf.state_save_location); - error_code = daemon(0, 0); + error_code = daemon(1, 0); log_alter(log_opts, LOG_DAEMON, slurmctld_conf.slurmctld_logfile); if (error_code) diff --git a/src/slurmd/req.c b/src/slurmd/req.c index 6ce669b8200600a9a52d106de96c468ffab6a41b..43f8cdc167b63530abed58e5ceaa4af19e868712 100644 --- a/src/slurmd/req.c +++ b/src/slurmd/req.c @@ -186,7 +186,8 @@ _launch_tasks(launch_tasks_request_msg_t *req, slurm_addr *cli) static void _rpc_launch_tasks(slurm_msg_t *msg, slurm_addr *cli) { - int rc; + int retval = 0; + int rc = 0; uint16_t port; char host[MAXHOSTNAMELEN]; uid_t req_uid; @@ -199,9 +200,8 @@ _rpc_launch_tasks(slurm_msg_t *msg, slurm_addr *cli) if ((super_user == false) && (req_uid != req->uid)) { error("Security violation, launch task RCP from uid %u", (unsigned int) req_uid); - rc = ESLURM_USER_ID_MISSING; /* or invalid user */ - slurm_send_rc_msg(msg, rc); - return; + retval = ESLURM_USER_ID_MISSING; /* or invalid user */ + goto done; } slurm_get_addr(cli, &port, host, sizeof(host)); @@ -216,21 +216,23 @@ _rpc_launch_tasks(slurm_msg_t *msg, slurm_addr *cli) conf->cred_state_list); if ((rc != SLURM_SUCCESS) && (super_user == false)) { + retval = errno; error("Invalid credential from %ld@%s", req_uid, host); - slurm_send_rc_msg(msg, rc); - return; + goto done; } /* Run job prolog if necessary */ if (run_prolog && (_run_prolog(req->job_id, req->uid) != 0)) { error("[job %d] prolog failed", req->job_id); - slurm_send_rc_msg(msg, ESLURMD_PROLOG_FAILED); - return; + retval = ESLURMD_PROLOG_FAILED; + goto done; } - rc = _launch_tasks(req, cli); + if (_launch_tasks(req, cli) < 0) + retval = errno; - slurm_send_rc_msg(msg, rc); + done: + slurm_send_rc_msg(msg, retval); } diff --git a/src/slurmd/semaphore.c b/src/slurmd/semaphore.c index ae6c362041cb18743d2677f019def85f0a134e33..29794104b92e6456544398843ee60c28ee5970ab 100644 --- a/src/slurmd/semaphore.c +++ b/src/slurmd/semaphore.c @@ -48,6 +48,8 @@ #include <stdio.h> #include "semaphore.h" +#include "../common/log.h" + #define MAX_TRIES 3 @@ -157,6 +159,7 @@ sem_t * sem_open(const char *name, int oflag, ...) } } + /* Open (presumably) existing semaphore. Either O_CREAT was not specified, * or O_CREAT was specified w/o O_EXCL and the semaphore already exists. */ @@ -239,13 +242,15 @@ int sem_unlink(const char *name) if ((key = ftok(name, 1)) == -1) { return(-1); } - if (unlink(name) == -1) { - return(-1); - } if ((semid = semget(key, 0, 0)) == -1) { - return(-1); + goto done; } if (semctl(semid, 0, IPC_RMID) == -1) { + goto done; + } + + done: + if (unlink(name) == -1) { return(-1); } return(0); @@ -262,7 +267,7 @@ int sem_wait(sem_t *sem) } op.sem_num = 0; op.sem_op = -1; - op.sem_flg = 0; + op.sem_flg = SEM_UNDO; if (semop(sem->id, &op, 1) == -1) { return(-1); } diff --git a/src/slurmd/shm.c b/src/slurmd/shm.c index 4a0f9380bdc981bd4e2c0ef633798ed5c582cb5a..5034ede844a69050a4c487a940aea483c8225648 100644 --- a/src/slurmd/shm.c +++ b/src/slurmd/shm.c @@ -156,11 +156,9 @@ shm_fini(void) debug3("%ld calling shm_fini() (attached by %ld)", getpid(), attach_pid); - /* xassert(attach_pid == getpid()); */ - /* if ((attach_pid == getpid()) && (--slurmd_shm->users == 0)) - * destroy = 1; - */ + debug("[%ld] shm_fini: shm_users = %d", getpid(), slurmd_shm->users); + if (--slurmd_shm->users == 0) destroy = 1; @@ -191,7 +189,7 @@ shm_cleanup(void) char *s; if ((s = _create_ipc_name(SHM_LOCKNAME))) { - verbose("request to destroy shm lock `%s'", s); + info("request to destroy shm lock `%s'", s); if (sem_unlink(s) < 0) error("sem_unlink: %m"); xfree(s); diff --git a/src/slurmd/slurmd.c b/src/slurmd/slurmd.c index bd05ff97d2ee7ebb52f8b0dccec4b2194bd40a08..b5efe2ea34df801f236681edc6363c7a8a37d323 100644 --- a/src/slurmd/slurmd.c +++ b/src/slurmd/slurmd.c @@ -66,7 +66,7 @@ # define MAXHOSTNAMELEN 64 #endif -#define DEFAULT_SPOOLDIR "/tmp" +#define DEFAULT_SPOOLDIR "/tmp/slurmd" #define DEFAULT_PIDFILE "/var/run/slurmd.pid" typedef struct connection { @@ -136,12 +136,13 @@ main (int argc, char *argv[]) _print_conf(); _kill_old_slurmd(); + _create_msg_socket(); + + conf->pid = getpid(); create_pidfile(conf->pidfile); info("%s started on %T", xbasename(argv[0])); - _create_msg_socket(); - conf->pid = getpid(); if (_slurmd_init() < 0) exit(1); @@ -217,11 +218,12 @@ _wait_for_all_threads() ListIterator i; pthread_t *ptid; - debug("Cancelling %d running threads", list_count(conf->threads)); + debug("Waiting for %d running threads", list_count(conf->threads)); i = list_iterator_create(conf->threads); while ((ptid = list_next(i))) { - pthread_cancel(*ptid); + pthread_join(*ptid, NULL); + debug2("thread %d finished", *ptid); } list_iterator_destroy(i); } @@ -452,6 +454,7 @@ _init_conf() conf->prolog = NULL; conf->port = 0; conf->daemonize = 1; + conf->shm_cleanup = 0; conf->lfd = -1; conf->log_opts = lopts; conf->pidfile = xstrdup(DEFAULT_PIDFILE); @@ -486,7 +489,7 @@ _process_cmdline(int ac, char **av) conf->logfile = xstrdup(optarg); break; case 'c': - shm_cleanup(); + conf->shm_cleanup = 1; break; default: _usage(c); @@ -529,6 +532,8 @@ _slurmd_init() slurm_init_verifier(&conf->vctx, conf->pubkey); _restore_cred_state(&conf->cred_state_list); conf->threads = list_create((ListDelF) _tid_free); + if (conf->shm_cleanup) + shm_cleanup(); if (shm_init() < 0) return SLURM_FAILURE; return SLURM_SUCCESS; @@ -680,11 +685,18 @@ _set_slurmd_spooldir(void) static void _kill_old_slurmd(void) { - pid_t oldpid = read_pidfile(conf->pidfile); + int fd; + pid_t oldpid = read_pidfile(conf->pidfile, &fd); if (oldpid != (pid_t) 0) { info ("killing old slurmd[%ld]", (long) oldpid); kill(oldpid, SIGTERM); - sleep(2); + + /* + * Wait for previous daemon to terminate + */ + if (fd_get_readw_lock(fd) < 0) + fatal ("unable to wait for readw lock: %m"); + (void) close(fd); /* Ignore errors */ } } diff --git a/src/slurmd/slurmd.h b/src/slurmd/slurmd.h index 2d2d62ba1c7e47005e64797ea92c0e292de65f37..144d72b8fd43657d5fe3d79e7826ef13b29d98ee 100644 --- a/src/slurmd/slurmd.h +++ b/src/slurmd/slurmd.h @@ -75,6 +75,7 @@ typedef struct slurmd_config { log_options_t log_opts; /* current logging options */ int debug_level; /* logging detail level */ int daemonize:1; /* daemonize flag */ + int shm_cleanup:1; List cred_state_list; /* credential stat list */ List threads; /* list of active threads */