Skip to content
Snippets Groups Projects
Commit 685b46c7 authored by Mark Grondona's avatar Mark Grondona
Browse files

o fix bug in slurmd shared memory initialization when lockfile exists

   but sysv semaphore doesn't

 o changes to some of the logic in slurm_cred_t validity checks
parent 53362884
No related branches found
No related tags found
No related merge requests found
...@@ -53,6 +53,7 @@ ...@@ -53,6 +53,7 @@
#define MAXHOSTNAMELEN 64 #define MAXHOSTNAMELEN 64
#endif #endif
static bool _job_still_running(uint32_t job_id); static bool _job_still_running(uint32_t job_id);
static int _kill_all_active_steps(uint32_t jobid, int sig); static int _kill_all_active_steps(uint32_t jobid, int sig);
static int _launch_tasks(launch_tasks_request_msg_t *, slurm_addr *); static int _launch_tasks(launch_tasks_request_msg_t *, slurm_addr *);
...@@ -71,6 +72,7 @@ static int _run_prolog(uint32_t jobid, uid_t uid); ...@@ -71,6 +72,7 @@ static int _run_prolog(uint32_t jobid, uid_t uid);
static int _run_epilog(uint32_t jobid, uid_t uid); static int _run_epilog(uint32_t jobid, uid_t uid);
static void _wait_for_procs(uint32_t job_id, uid_t job_uid); static void _wait_for_procs(uint32_t job_id, uid_t job_uid);
void void
slurmd_req(slurm_msg_t *msg, slurm_addr *cli) slurmd_req(slurm_msg_t *msg, slurm_addr *cli)
{ {
...@@ -144,7 +146,6 @@ _launch_batch_job(batch_job_launch_msg_t *req, slurm_addr *cli) ...@@ -144,7 +146,6 @@ _launch_batch_job(batch_job_launch_msg_t *req, slurm_addr *cli)
{ {
pid_t pid; pid_t pid;
int rc; int rc;
switch ((pid = fork())) { switch ((pid = fork())) {
case -1: case -1:
...@@ -202,9 +203,21 @@ _check_job_credential(slurm_cred_t cred, uint32_t jobid, ...@@ -202,9 +203,21 @@ _check_job_credential(slurm_cred_t cred, uint32_t jobid,
slurm_cred_arg_t arg; slurm_cred_arg_t arg;
hostset_t hset = NULL; hostset_t hset = NULL;
if (slurm_cred_verify(conf->vctx, cred, &arg) < 0) /*
* First call slurm_cred_verify() so that all valid
* credentials are checked
*/
if ( (slurm_cred_verify(conf->vctx, cred, &arg) < 0)
&& (uid != conf->slurm_user_id) )
return SLURM_ERROR; return SLURM_ERROR;
/*
* If the requesting user is the slurm user, do not perform
* any more validity checks
*/
if (uid == conf->slurm_user_id)
return SLURM_SUCCESS;
if ((arg.jobid != jobid) || (arg.stepid != stepid)) { if ((arg.jobid != jobid) || (arg.stepid != stepid)) {
error("job credential for %d.%d, expected %d.%d", error("job credential for %d.%d, expected %d.%d",
arg.jobid, arg.stepid, jobid, stepid); arg.jobid, arg.stepid, jobid, stepid);
...@@ -232,6 +245,9 @@ _check_job_credential(slurm_cred_t cred, uint32_t jobid, ...@@ -232,6 +245,9 @@ _check_job_credential(slurm_cred_t cred, uint32_t jobid,
goto fail; goto fail;
} }
hostset_destroy(hset);
xfree(arg.hostlist);
return SLURM_SUCCESS; return SLURM_SUCCESS;
fail: fail:
...@@ -273,14 +289,14 @@ _rpc_launch_tasks(slurm_msg_t *msg, slurm_addr *cli) ...@@ -273,14 +289,14 @@ _rpc_launch_tasks(slurm_msg_t *msg, slurm_addr *cli)
if (!slurm_cred_jobid_cached(conf->vctx, req->job_id)) if (!slurm_cred_jobid_cached(conf->vctx, req->job_id))
run_prolog = true; run_prolog = true;
if (_check_job_credential(req->cred, jobid, stepid, req_uid) < 0) {
if ( (_check_job_credential(req->cred, jobid, stepid, req_uid) < 0)
&& (super_user == false) ) {
retval = errno; retval = errno;
error("Invalid credential from %ld@%s: %m", req_uid, host); error("Invalid job credential from %ld@%s: %m", req_uid, host);
goto done; goto done;
} }
xassert(slurm_cred_jobid_cached(conf->vctx, req->job_id));
/* Run job prolog if necessary */ /* Run job prolog if necessary */
if (run_prolog && (_run_prolog(req->job_id, req->uid) != 0)) { if (run_prolog && (_run_prolog(req->job_id, req->uid) != 0)) {
error("[job %d] prolog failed", req->job_id); error("[job %d] prolog failed", req->job_id);
...@@ -439,7 +455,6 @@ _kill_running_session_mgrs(uint32_t jobid, int signum) ...@@ -439,7 +455,6 @@ _kill_running_session_mgrs(uint32_t jobid, int signum)
List steps = shm_get_steps(); List steps = shm_get_steps();
ListIterator i = list_iterator_create(steps); ListIterator i = list_iterator_create(steps);
job_step_t *s = NULL; job_step_t *s = NULL;
int step_cnt = 0;
while ((s = list_next(i))) { while ((s = list_next(i))) {
if (s->jobid == jobid) { if (s->jobid == jobid) {
...@@ -448,7 +463,7 @@ _kill_running_session_mgrs(uint32_t jobid, int signum) ...@@ -448,7 +463,7 @@ _kill_running_session_mgrs(uint32_t jobid, int signum)
} }
list_destroy(steps); list_destroy(steps);
return step_cnt; return;
} }
/* For the specified job_id: Send SIGXCPU, reply to slurmctld, /* For the specified job_id: Send SIGXCPU, reply to slurmctld,
...@@ -771,3 +786,4 @@ _run_epilog(uint32_t jobid, uid_t uid) ...@@ -771,3 +786,4 @@ _run_epilog(uint32_t jobid, uid_t uid)
return error_code; return error_code;
} }
...@@ -46,9 +46,9 @@ ...@@ -46,9 +46,9 @@
#include <sys/ipc.h> #include <sys/ipc.h>
#include <sys/sem.h> #include <sys/sem.h>
#include <stdio.h> #include <stdio.h>
#include "semaphore.h"
#include "../common/log.h" #include "src/slurmd/semaphore.h"
#include "src/common/log.h"
#define MAX_TRIES 3 #define MAX_TRIES 3
......
...@@ -192,9 +192,9 @@ shm_cleanup(void) ...@@ -192,9 +192,9 @@ shm_cleanup(void)
key_t key; key_t key;
int id = -1; int id = -1;
info("request to destroy shm lock [%s]", SHM_LOCKNAME);
if ((s = _create_ipc_name(SHM_LOCKNAME))) { if ((s = _create_ipc_name(SHM_LOCKNAME))) {
key = ftok(s, 1); key = ftok(s, 1);
info("request to destroy shm lock `%s'", s);
if (sem_unlink(s) < 0) if (sem_unlink(s) < 0)
error("sem_unlink: %m"); error("sem_unlink: %m");
xfree(s); xfree(s);
...@@ -923,24 +923,52 @@ static int ...@@ -923,24 +923,52 @@ static int
_shm_reopen() _shm_reopen()
{ {
int retval = SLURM_SUCCESS; int retval = SLURM_SUCCESS;
int oflags = O_EXCL; /* Try to reopen semaphore first */
debug2("going to reopen slurmd shared memory");
shm_lock = _sem_open(SHM_LOCKNAME, oflags, 0600, 0);
/*
* If open of shm lock failed, we could be in one of two
* situations:
*
* 1. The lockfile associated with the semaphore exists,
* but the semaphore does not exist (errno == ENOENT)
* or
* 2. system failure trying to attach to semaphore.
*
* For 1, we can cleanup the shm lock, then initialize
* a new shared memory region, but for 2, we need to
* exit with a failure
*/
if ((shm_lock = _sem_open(SHM_LOCKNAME, O_CREAT|O_EXCL, 0600, 0)) if ((shm_lock == SEM_FAILED)) {
== SEM_FAILED) {
if (errno != ENOENT) { if (errno != ENOENT) {
error("Unable to initialize semaphore: %m"); error("Unable to initialize semaphore: %m");
return SLURM_FAILURE; return SLURM_FAILURE;
} }
debug("Lockfile found but semaphore deleted: "
"creating new shm segment"); debug2( "lockfile exists, but semaphore was deleted: "
shm_cleanup(); "reinitializing shm" );
if ((shm_lock = _sem_open(SHM_LOCKNAME,O_CREAT|O_EXCL,
0600, 0)) == SEM_FAILED) { /*
error("Unable to initialize semaphore: %m"); * Unlink old lockfile, reopen semaphore with create flag,
return SLURM_FAILURE; * and create new shared memory area
} */
sem_unlink(lockname);
shm_lock = _sem_open(SHM_LOCKNAME, oflags|O_CREAT, 0600, 0);
return _shm_new();
}
if (shm_lock == SEM_FAILED) {
error("Unable to initialize semaphore: %m");
return SLURM_FAILURE;
} }
/* Attach to shared memory region */ /*
* Attach to shared memory region
* If attach fails, try to create a new shm segment
*/
if ((_shm_attach() < 0) && (_shm_create() < 0)) { if ((_shm_attach() < 0) && (_shm_create() < 0)) {
error("shm_create(): %m"); error("shm_create(): %m");
return SLURM_FAILURE; return SLURM_FAILURE;
...@@ -983,7 +1011,8 @@ _shm_lock_and_initialize() ...@@ -983,7 +1011,8 @@ _shm_lock_and_initialize()
} }
shm_lock = _sem_open(SHM_LOCKNAME, O_CREAT|O_EXCL, 0600, 0); shm_lock = _sem_open(SHM_LOCKNAME, O_CREAT|O_EXCL, 0600, 0);
debug3("lockname is `%s'", lockname); debug3("slurmd lockfile is `%s': %m", lockname);
if (shm_lock != SEM_FAILED) /* lock didn't exist. Create shmem */ if (shm_lock != SEM_FAILED) /* lock didn't exist. Create shmem */
return _shm_new(); return _shm_new();
else /* lock exists. Attach to shared memory */ else /* lock exists. Attach to shared memory */
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment