From 685b46c7a90137fe11b0d8cc8c7e093879f86904 Mon Sep 17 00:00:00 2001 From: Mark Grondona <mgrondona@llnl.gov> Date: Thu, 20 Mar 2003 00:08:36 +0000 Subject: [PATCH] o fix bug in slurmd shared memory initialization when lockfile exists but sysv semaphore doesn't o changes to some of the logic in slurm_cred_t validity checks --- src/slurmd/req.c | 32 ++++++++++++++++++------ src/slurmd/semaphore.c | 4 +-- src/slurmd/shm.c | 55 ++++++++++++++++++++++++++++++++---------- 3 files changed, 68 insertions(+), 23 deletions(-) diff --git a/src/slurmd/req.c b/src/slurmd/req.c index f20e8676846..65a363db0ac 100644 --- a/src/slurmd/req.c +++ b/src/slurmd/req.c @@ -53,6 +53,7 @@ #define MAXHOSTNAMELEN 64 #endif + static bool _job_still_running(uint32_t job_id); static int _kill_all_active_steps(uint32_t jobid, int sig); static int _launch_tasks(launch_tasks_request_msg_t *, slurm_addr *); @@ -71,6 +72,7 @@ static int _run_prolog(uint32_t jobid, uid_t uid); static int _run_epilog(uint32_t jobid, uid_t uid); static void _wait_for_procs(uint32_t job_id, uid_t job_uid); + void slurmd_req(slurm_msg_t *msg, slurm_addr *cli) { @@ -144,7 +146,6 @@ _launch_batch_job(batch_job_launch_msg_t *req, slurm_addr *cli) { pid_t pid; int rc; - switch ((pid = fork())) { case -1: @@ -202,9 +203,21 @@ _check_job_credential(slurm_cred_t cred, uint32_t jobid, slurm_cred_arg_t arg; hostset_t hset = NULL; - if (slurm_cred_verify(conf->vctx, cred, &arg) < 0) + /* + * First call slurm_cred_verify() so that all valid + * credentials are checked + */ + if ( (slurm_cred_verify(conf->vctx, cred, &arg) < 0) + && (uid != conf->slurm_user_id) ) return SLURM_ERROR; + /* + * If the requesting user is the slurm user, do not perform + * any more validity checks + */ + if (uid == conf->slurm_user_id) + return SLURM_SUCCESS; + if ((arg.jobid != jobid) || (arg.stepid != stepid)) { error("job credential for %d.%d, expected %d.%d", arg.jobid, arg.stepid, jobid, stepid); @@ -232,6 +245,9 @@ _check_job_credential(slurm_cred_t cred, uint32_t jobid, goto fail; } + hostset_destroy(hset); + xfree(arg.hostlist); + return SLURM_SUCCESS; fail: @@ -273,14 +289,14 @@ _rpc_launch_tasks(slurm_msg_t *msg, slurm_addr *cli) if (!slurm_cred_jobid_cached(conf->vctx, req->job_id)) run_prolog = true; - - if ( (_check_job_credential(req->cred, jobid, stepid, req_uid) < 0) - && (super_user == false) ) { + if (_check_job_credential(req->cred, jobid, stepid, req_uid) < 0) { retval = errno; - error("Invalid credential from %ld@%s: %m", req_uid, host); + error("Invalid job credential from %ld@%s: %m", req_uid, host); goto done; } + xassert(slurm_cred_jobid_cached(conf->vctx, req->job_id)); + /* Run job prolog if necessary */ if (run_prolog && (_run_prolog(req->job_id, req->uid) != 0)) { error("[job %d] prolog failed", req->job_id); @@ -439,7 +455,6 @@ _kill_running_session_mgrs(uint32_t jobid, int signum) List steps = shm_get_steps(); ListIterator i = list_iterator_create(steps); job_step_t *s = NULL; - int step_cnt = 0; while ((s = list_next(i))) { if (s->jobid == jobid) { @@ -448,7 +463,7 @@ _kill_running_session_mgrs(uint32_t jobid, int signum) } list_destroy(steps); - return step_cnt; + return; } /* For the specified job_id: Send SIGXCPU, reply to slurmctld, @@ -771,3 +786,4 @@ _run_epilog(uint32_t jobid, uid_t uid) return error_code; } + diff --git a/src/slurmd/semaphore.c b/src/slurmd/semaphore.c index 29794104b92..80eeb9385be 100644 --- a/src/slurmd/semaphore.c +++ b/src/slurmd/semaphore.c @@ -46,9 +46,9 @@ #include <sys/ipc.h> #include <sys/sem.h> #include <stdio.h> -#include "semaphore.h" -#include "../common/log.h" +#include "src/slurmd/semaphore.h" +#include "src/common/log.h" #define MAX_TRIES 3 diff --git a/src/slurmd/shm.c b/src/slurmd/shm.c index 99b8968f111..05fbcee25e4 100644 --- a/src/slurmd/shm.c +++ b/src/slurmd/shm.c @@ -192,9 +192,9 @@ shm_cleanup(void) key_t key; int id = -1; + info("request to destroy shm lock [%s]", SHM_LOCKNAME); if ((s = _create_ipc_name(SHM_LOCKNAME))) { key = ftok(s, 1); - info("request to destroy shm lock `%s'", s); if (sem_unlink(s) < 0) error("sem_unlink: %m"); xfree(s); @@ -923,24 +923,52 @@ static int _shm_reopen() { int retval = SLURM_SUCCESS; + int oflags = O_EXCL; /* Try to reopen semaphore first */ + + debug2("going to reopen slurmd shared memory"); + + shm_lock = _sem_open(SHM_LOCKNAME, oflags, 0600, 0); + /* + * If open of shm lock failed, we could be in one of two + * situations: + * + * 1. The lockfile associated with the semaphore exists, + * but the semaphore does not exist (errno == ENOENT) + * or + * 2. system failure trying to attach to semaphore. + * + * For 1, we can cleanup the shm lock, then initialize + * a new shared memory region, but for 2, we need to + * exit with a failure + */ - if ((shm_lock = _sem_open(SHM_LOCKNAME, O_CREAT|O_EXCL, 0600, 0)) - == SEM_FAILED) { + if ((shm_lock == SEM_FAILED)) { if (errno != ENOENT) { error("Unable to initialize semaphore: %m"); return SLURM_FAILURE; } - debug("Lockfile found but semaphore deleted: " - "creating new shm segment"); - shm_cleanup(); - if ((shm_lock = _sem_open(SHM_LOCKNAME,O_CREAT|O_EXCL, - 0600, 0)) == SEM_FAILED) { - error("Unable to initialize semaphore: %m"); - return SLURM_FAILURE; - } + + debug2( "lockfile exists, but semaphore was deleted: " + "reinitializing shm" ); + + /* + * Unlink old lockfile, reopen semaphore with create flag, + * and create new shared memory area + */ + sem_unlink(lockname); + shm_lock = _sem_open(SHM_LOCKNAME, oflags|O_CREAT, 0600, 0); + return _shm_new(); + } + + if (shm_lock == SEM_FAILED) { + error("Unable to initialize semaphore: %m"); + return SLURM_FAILURE; } - /* Attach to shared memory region */ + /* + * Attach to shared memory region + * If attach fails, try to create a new shm segment + */ if ((_shm_attach() < 0) && (_shm_create() < 0)) { error("shm_create(): %m"); return SLURM_FAILURE; @@ -983,7 +1011,8 @@ _shm_lock_and_initialize() } shm_lock = _sem_open(SHM_LOCKNAME, O_CREAT|O_EXCL, 0600, 0); - debug3("lockname is `%s'", lockname); + debug3("slurmd lockfile is `%s': %m", lockname); + if (shm_lock != SEM_FAILED) /* lock didn't exist. Create shmem */ return _shm_new(); else /* lock exists. Attach to shared memory */ -- GitLab