From e371646323d3b70f2aa010fc89c497e5bdeabd30 Mon Sep 17 00:00:00 2001
From: Mark Grondona <mgrondona@llnl.gov>
Date: Fri, 21 Nov 2003 21:46:20 +0000
Subject: [PATCH] o Merged fixes from slurm-0-2-branch at slurm-0-2-22-1

 - fixes to help ensure slurmd uses the same key for shared memory on a
   restart (to avoid losing track of jobs)
 - slurmd only runs one launch thread at a time
 - fix bug in slurmd where multiple threads used the same buffer for the
   connecting client's address.
 - srun always sends SIGKILL to job step before issuing complete request
 - Changed short string for draining nodes to "DRNG" from "DRAIN".
 - srun default launch message timeout increased to 5s.
---
 doc/man/man5/slurm.conf.5        | 13 +++--
 src/common/slurm_protocol_defs.c |  2 +-
 src/slurmd/req.c                 |  5 ++
 src/slurmd/shm.c                 | 43 ++++++++++-----
 src/slurmd/slurmd.c              | 93 +++++++++++++++++---------------
 src/srun/job.c                   | 20 +++++--
 src/srun/job.h                   |  3 +-
 src/srun/opt.c                   |  2 +-
 8 files changed, 113 insertions(+), 68 deletions(-)

diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index d41de1d60d7..0f1b6b78751 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -238,10 +238,15 @@ on the same nodes or the values of \fBSlurmctldPort\fR and
 \fBSlurmdPort\fR must be different.
 .TP
 \fBSlurmdSpoolDir\fR
-Fully qualified pathname of a file into which the \fBslurmd\fR daemon's state
-information is written. This must be a common pathname for all nodes, but
-should represent a file which is local to each node (reference a local file
-system). The default value is "/tmp/slurmd".
+Fully qualified pathname of a directory into which the \fBslurmd\fR
+daemon's state information and batch job script information are written. This
+must be a common pathname for all nodes, but should represent a directory which
+is local to each node (reference a local file system). The default value
+is "/var/spool/slurmd". \fBNOTE\fR: This directory is also used to store
+\fBslurmd\fR's shared memory lockfile, and \fBshould not be changed\fR unless
+the system is being cleanly restarted. If the location of \fBSlurmdSpoolDir\fR
+is changed and \fBslurmd\fR is restarted, the new daemon will attach to a
+different shared memory region and lose track of any running jobs.
 .TP
 \fBSlurmdTimeout\fR
 The interval, in seconds, that the SLURM controller waits for \fBslurmd\fR
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index 594cada2f0e..44f94dabaf2 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -417,7 +417,7 @@ char *node_state_string_compact(enum node_states inx)
 		"IDLE",
 		"ALLOC",
 		"DRAIN",
-		"DRAIN",
+		"DRNG",
 		"COMP",
 		"END"
 	};
diff --git a/src/slurmd/req.c b/src/slurmd/req.c
index 6490dcb70c9..7b09998321a 100644
--- a/src/slurmd/req.c
+++ b/src/slurmd/req.c
@@ -78,6 +78,7 @@ static int _run_prolog(uint32_t jobid, uid_t uid);
 static int _run_epilog(uint32_t jobid, uid_t uid);
 static int _wait_for_procs(uint32_t job_id);
 
+static pthread_mutex_t launch_mutex = PTHREAD_MUTEX_INITIALIZER;
 
 void
 slurmd_req(slurm_msg_t *msg, slurm_addr *cli)
@@ -86,12 +87,16 @@ slurmd_req(slurm_msg_t *msg, slurm_addr *cli)
 
 	switch(msg->msg_type) {
 	case REQUEST_BATCH_JOB_LAUNCH:
+		slurm_mutex_lock(&launch_mutex);
 		_rpc_batch_job(msg, cli);
 		slurm_free_job_launch_msg(msg->data);
+		slurm_mutex_unlock(&launch_mutex);
 		break;
 	case REQUEST_LAUNCH_TASKS:
+		slurm_mutex_lock(&launch_mutex);
 		_rpc_launch_tasks(msg, cli);
 		slurm_free_launch_tasks_request_msg(msg->data);
+		slurm_mutex_unlock(&launch_mutex);
 		break;
 	case REQUEST_KILL_TASKS:
 		_rpc_kill_tasks(msg, cli);
diff --git a/src/slurmd/shm.c b/src/slurmd/shm.c
index 5ab547f3cb0..af4ac0f9fa9 100644
--- a/src/slurmd/shm.c
+++ b/src/slurmd/shm.c
@@ -117,7 +117,8 @@ typedef struct shmem_struct {
  * static variables:
  * */
 static sem_t *shm_lock;
-static char *lockname;
+static char *lockname;
+static char *lockdir;
 static slurmd_shm_t *slurmd_shm;
 static int shmid;
 static pid_t attach_pid = (pid_t) 0;
@@ -193,22 +194,25 @@ shm_fini(void)
 	/* detach segment from local memory */
 	if (shmdt(slurmd_shm) < 0) {
 		error("shmdt: %m");
-		return -1;
+		goto error;
 	}
 	slurmd_shm = NULL;
 
 	if (destroy && (shmctl(shmid, IPC_RMID, NULL) < 0)) {
 		error("shmctl: %m");
-		return -1;
+		goto error;
 	}
 
 	_shm_unlock();
 	if (destroy && (_shm_unlink_lock() < 0)) {
 		error("_shm_unlink_lock: %m");
-		return -1;
+		goto error;
 	}
 
 	return 0;
+
+   error:
+	return -1;
 }
 
 void
@@ -218,8 +222,13 @@ shm_cleanup(void)
 {
 	char *s;
 	key_t key;
 	int id = -1;
 
-	info("request to destroy shm lock [%s]", SHM_LOCKNAME);
+	if (!lockdir)
+		lockdir = xstrdup(conf->spooldir);
+	if ((s = _create_ipc_name(SHM_LOCKNAME))) {
+
+		info("request to destroy shm lock [%s]", s);
+		key = ftok(s, 1);
 		if (sem_unlink(s) < 0)
 			error("sem_unlink: %m");
@@ -304,25 +313,24 @@ _is_valid_ipc_name(const char *name)
 	return(1);
 }
 
+/*
+ * Create IPC name by appending `name' to slurmd spooldir
+ * setting.
+ */
 static char *
 _create_ipc_name(const char *name)
 {
 	char *dst = NULL, *dir = NULL, *slash = NULL;
 	int rc;
 
+	xassert (lockdir != NULL);
+
 	if ((rc = _is_valid_ipc_name(name)) != 1)
 		fatal("invalid ipc name: `%s' %d", name, rc);
 	else if (!(dst = xmalloc(PATH_MAX)))
 		fatal("memory allocation failure");
 
-#if defined(POSIX_IPC_PREFIX) && defined(HAVE_POSIX_SEMS)
-	dir = POSIX_IPC_PREFIX;
-#else
-	if ( !(dir = conf->spooldir)
-	    && !(strlen(dir))
-	    && !(dir = getenv("TMPDIR")))
-		dir = "/tmp";
-#endif /* POSIX_IPC_PREFIX */
+	dir = lockdir;
 
 	slash = (dir[strlen(dir) - 1] == '/') ? "" : "/";
 
@@ -1086,6 +1094,15 @@ _shm_lock_and_initialize()
 	/*
 	 * Create locked semaphore (initial value == 0)
 	 */
+
+	/*
+	 * Init lockdir to slurmd spooldir.
+	 * Make sure it does not change for this instance of slurmd,
+	 * even if spooldir does.
+	 */
+	if (!lockdir)
+		lockdir = xstrdup(conf->spooldir);
+
 	shm_lock = _sem_open(SHM_LOCKNAME, O_CREAT|O_EXCL, 0600, 0);
 	debug3("slurmd lockfile is \"%s\"", lockname);
 
diff --git a/src/slurmd/slurmd.c b/src/slurmd/slurmd.c
index 3920164b7b4..3a732375a74 100644
--- a/src/slurmd/slurmd.c
+++ b/src/slurmd/slurmd.c
@@ -31,6 +31,7 @@
 
 #include <fcntl.h>
 #include <string.h>
+#include <stdlib.h>
 #include <pthread.h>
 #include <sys/stat.h>
 #include <sys/time.h>
@@ -138,6 +139,10 @@ main (int argc, char *argv[])
 
 	log_init(argv[0], conf->log_opts, LOG_DAEMON, conf->logfile);
 
+	xsignal(SIGTERM, &_term_handler);
+	xsignal(SIGINT,  &_term_handler);
+	xsignal(SIGHUP,  &_hup_handler );
+
 	/*
 	 * Run slurmd_init() here in order to report early errors
 	 * (with shared memory and public keyfile)
@@ -161,6 +166,12 @@ main (int argc, char *argv[])
 
 	_kill_old_slurmd();
 
+	/*
+	 * Restore any saved revoked credential information
+	 */
+	if (_restore_cred_state(conf->vctx))
+		return SLURM_FAILURE;
+
 	if (interconnect_node_init() < 0)
 		fatal("Unable to initialize interconnect.");
 
@@ -174,10 +185,6 @@ main (int argc, char *argv[])
 	if (send_registration_msg(SLURM_SUCCESS) < 0)
 		error("Unable to register with slurm controller");
 
-	xsignal(SIGTERM, &_term_handler);
-	xsignal(SIGINT, &_term_handler);
-	xsignal(SIGHUP, &_hup_handler );
-
 	_install_fork_handlers();
 	list_install_fork_handlers();
 
@@ -201,34 +208,32 @@ main (int argc, char *argv[])
 	return 0;
 }
 
+
 static void
 _msg_engine()
 {
 	slurm_fd sock;
-	slurm_addr cli;
 
-	while (1) {
-		if (_shutdown)
-			break;
-again:
-		if ((sock = slurm_accept_msg_conn(conf->lfd, &cli)) < 0) {
-			if (errno == EINTR) {
-				if (_shutdown) {
-					verbose("got shutdown request");
-					break;
-				}
-				if (_reconfig) {
-					_reconfigure();
-					verbose("got reconfigure request");
-				}
-				goto again;
-			}
-			error("accept: %m");
+	while (!_shutdown) {
+		slurm_addr *cli = xmalloc (sizeof (*cli));
+		if ((sock = slurm_accept_msg_conn(conf->lfd, cli)) >= 0) {
+			_handle_connection(sock, cli);
 			continue;
 		}
-		if (sock > 0)
-			_handle_connection(sock, &cli);
+		/*
+		 *  Otherwise, accept() failed.
+		 */
+		xfree (cli);
+		if (errno == EINTR) {
+			if (_reconfig) {
+				verbose("got reconfigure request");
+				_reconfigure();
+			}
+			continue;
+		}
+		error("accept: %m");
 	}
+	verbose("got shutdown request");
 	slurm_shutdown_msg_engine(conf->lfd);
 	return;
 }
@@ -334,6 +339,7 @@ _service_connection(void *arg)
 		error ("close(%d): %m", con->fd);
 
    done:
+	xfree(con->cli_addr);
 	xfree(con);
 	slurm_free_msg(msg);
 	_decrement_thd_count();
@@ -343,21 +349,24 @@ _service_connection(void *arg)
 int
 send_registration_msg(uint32_t status)
 {
+	int retval = SLURM_SUCCESS;
 	slurm_msg_t req;
 	slurm_msg_t resp;
-	slurm_node_registration_status_msg_t msg;
+	slurm_node_registration_status_msg_t *msg = xmalloc (sizeof (*msg));
 
-	_fill_registration_msg(&msg);
-	msg.status = status;
+	_fill_registration_msg(msg);
+	msg->status = status;
 
 	req.msg_type = MESSAGE_NODE_REGISTRATION_STATUS;
-	req.data = &msg;
+	req.data = msg;
 
 	if (slurm_send_recv_controller_msg(&req, &resp) < 0) {
 		error("Unable to register: %m");
-		return SLURM_FAILURE;
+		retval = SLURM_FAILURE;
 	}
 
+	slurm_free_node_registration_status_msg (msg);
+
 	/* XXX look at response msg */
 
@@ -372,7 +381,7 @@ _fill_registration_msg(slurm_node_registration_status_msg_t *msg)
 	job_step_t *s;
 	int n;
 
-	msg->node_name = conf->hostname;
+	msg->node_name = xstrdup (conf->hostname);
 	get_procs(&msg->cpus);
 	get_memory(&msg->real_memory_size);
 
@@ -640,10 +649,12 @@ _slurmd_init()
 		return SLURM_FAILURE;
 	/*
-	 * Restore any saved revoked credential information
+	 * Create slurmd spool directory if necessary.
 	 */
-	if (_restore_cred_state(conf->vctx))
+	if (_set_slurmd_spooldir() < 0) {
+		error("Unable to initialize slurmd spooldir");
 		return SLURM_FAILURE;
+	}
 
 	/*
 	 * Cleanup shared memory if so configured
 	 */
@@ -660,18 +671,14 @@ _slurmd_init()
 	/*
 	 * Initialize slurmd shared memory
+	 * This *must* be called after _set_slurmd_spooldir()
+	 * since the default location of the slurmd lockfile is
+	 * _in_ the spooldir.
+	 *
 	 */
 	if (shm_init(true) < 0)
 		return SLURM_FAILURE;
 
-	/*
-	 * Create slurmd spool directory if necessary.
-	 */
-	if (_set_slurmd_spooldir() < 0) {
-		error("Unable to initialize slurmd spooldir");
-		return SLURM_FAILURE;
-	}
-
 	if (conf->daemonize && (chdir("/tmp") < 0)) {
 		error("Unable to chdir to /tmp");
 		return SLURM_FAILURE;
 	}
@@ -689,8 +696,8 @@ _restore_cred_state(slurm_cred_ctx_t ctx)
 	int cred_fd, data_allocated, data_read = 0;
 	Buf buffer = NULL;
 
-	if ((mkdir(conf->spooldir, 0755) < 0) &&
-	    (errno != EEXIST)) {
+	if ( (mkdir(conf->spooldir, 0755) < 0)
+	    && (errno != EEXIST) ) {
 		error("mkdir(%s): %m", conf->spooldir);
 		return SLURM_ERROR;
 	}
@@ -725,7 +732,7 @@ static int
 _slurmd_fini()
 {
 	save_cred_state(conf->vctx);
-	shm_fini();
+	shm_fini();
 	return SLURM_SUCCESS;
 }
 
diff --git a/src/srun/job.c b/src/srun/job.c
index cf27a2107c6..d3b901a5d35 100644
--- a/src/srun/job.c
+++ b/src/srun/job.c
@@ -219,12 +219,15 @@ int
 job_rc(job_t *job)
 {
 	int i;
-	int rc;
+	int rc = 0;
 
-	if (job->rc) return(job->rc);
+	if (job->rc >= 0) return(job->rc);
 
-	for (i = 0; i < opt.nprocs; i++)
-		job->rc |= job->tstatus[i];
+
+	for (i = 0; i < opt.nprocs; i++) {
+		if (job->rc < job->tstatus[i])
+			job->rc = job->tstatus[i];
+	}
 
 	if ((rc = WEXITSTATUS(job->rc)))
 		job->rc = rc;
@@ -248,8 +251,12 @@ void job_fatal(job_t *job, const char *msg)
 void
 job_destroy(job_t *job, int error)
 {
+	if (job->removed)
+		return;
+
 	if (job->old_job) {
 		debug("cancelling job step %u.%u", job->jobid, job->stepid);
+		slurm_kill_job_step(job->jobid, job->stepid, SIGKILL);
 		slurm_complete_job_step(job->jobid, job->stepid, 0, error);
 	} else if (!opt.no_alloc) {
 		debug("cancelling job %u", job->jobid);
@@ -263,6 +270,8 @@ job_destroy(job_t *job, int error)
 
 #ifdef HAVE_TOTALVIEW
 	if (error) tv_launch_failure();
 #endif
+
+	job->removed = true;
 }
 
@@ -389,7 +398,7 @@ _job_create_internal(allocation_info_t *info)
 
 	job->state = SRUN_JOB_INIT;
 	job->signaled = false;
-	job->rc = 0;
+	job->rc = -1;
 
 	job->nodelist = xstrdup(info->nodelist);
 	hl = hostlist_create(job->nodelist);
@@ -398,6 +407,7 @@ _job_create_internal(allocation_info_t *info)
 	job->jobid = info->jobid;
 	job->stepid = info->stepid;
 	job->old_job = false;
+	job->removed = false;
 
 	/*
 	 * Initialize Launch and Exit timeout values
diff --git a/src/srun/job.h b/src/srun/job.h
index 6c254d331ec..177f6c2b6b9 100644
--- a/src/srun/job.h
+++ b/src/srun/job.h
@@ -75,6 +75,7 @@ typedef struct srun_job {
 	uint32_t jobid;		/* assigned job id */
 	uint32_t stepid;	/* assigned step id */
 	bool old_job;		/* run job step under previous allocation */
+	bool removed;		/* job has been removed from SLURM */
 
 	job_state_t state;	/* job state */
 	pthread_mutex_t state_mutex;
@@ -92,7 +93,7 @@ typedef struct srun_job {
 	uint32_t **tids;	/* host id => task ids mapping */
 	uint32_t *hostid;	/* task id => host id mapping */
 
-	slurm_addr *slurmd_addr;/* slurm_addr vector to slurmd's */
+	slurm_addr *slurmd_addr;/* slurm_addr vector to slurmd's */
 
 	pthread_t sigid;	/* signals thread tid */
 
diff --git a/src/srun/opt.c b/src/srun/opt.c
index 6797678be34..6d204c26a37 100644
--- a/src/srun/opt.c
+++ b/src/srun/opt.c
@@ -424,7 +424,7 @@ static void _opt_default()
 	opt.exc_nodes = NULL;
 	opt.max_launch_time = 60; /* 60 seconds to launch job */
 	opt.max_exit_timeout= 60; /* Warn user 60 seconds after task exit */
-	opt.msg_timeout = 2;  /* Default launch msg timeout */
+	opt.msg_timeout = 5;  /* Default launch msg timeout */
 
 	mode = MODE_NORMAL;
 
-- 
GitLab
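A note on the shared-memory key fix above: slurmd derives its System V IPC key with ftok(3) on a lockfile that now always lives under the spool directory (_create_ipc_name() builds the path; the new shm_cleanup() code calls ftok() on it). The key is therefore only as stable as that path, which is why the slurm.conf.5 hunk warns against moving SlurmdSpoolDir while jobs are running. The standalone sketch below illustrates only the ftok()/shmget() mechanism; it is not SLURM code, and the default path is a placeholder (the file must already exist for ftok() to succeed).

/*
 * Sketch: a stable path yields a stable IPC key, so a restarted
 * daemon finds its old shared memory segment. Build: cc -o shmkey shmkey.c
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(int argc, char **argv)
{
	/* Stand-in for the slurmd lockfile; must exist and must be
	 * the same path across restarts, or ftok() yields a new key. */
	const char *path = (argc > 1) ? argv[1] : "/var/spool/slurmd/lockfile";

	key_t key = ftok(path, 1);
	if (key == (key_t) -1) {
		perror("ftok");
		return EXIT_FAILURE;
	}

	/* Same key => shmget() attaches the existing segment rather
	 * than creating a fresh, empty one. */
	int id = shmget(key, 4096, IPC_CREAT | 0600);
	if (id < 0) {
		perror("shmget");
		return EXIT_FAILURE;
	}
	printf("path %s -> key 0x%lx -> shmid %d\n",
	       path, (unsigned long) key, id);
	return EXIT_SUCCESS;
}

Run it twice with the same path and the reported shmid is identical; point it at a different file and a new segment appears, which is exactly the failure mode the patch guards against.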
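The new launch_mutex in req.c serializes launch RPCs: slurm_mutex_lock()/slurm_mutex_unlock() are SLURM's thin wrappers over the corresponding pthread calls, and the mutex is held across the entire handling of REQUEST_BATCH_JOB_LAUNCH and REQUEST_LAUNCH_TASKS. A minimal sketch of the same pattern, with a stand-in handler in place of _rpc_launch_tasks() (build with cc -pthread):

/*
 * Sketch: one static mutex held for the duration of each launch
 * request, so only one launch ever runs at a time.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t launch_mutex = PTHREAD_MUTEX_INITIALIZER;

static void handle_launch(int req_id)
{
	/* A second launch request blocks here until the first
	 * one has completely finished. */
	pthread_mutex_lock(&launch_mutex);
	printf("launching request %d\n", req_id);
	pthread_mutex_unlock(&launch_mutex);
}

static void *worker(void *arg)
{
	handle_launch(*(int *) arg);
	return NULL;
}

int main(void)
{
	pthread_t t[2];
	int id[2] = { 1, 2 };

	for (int i = 0; i < 2; i++)
		pthread_create(&t[i], NULL, worker, &id[i]);
	for (int i = 0; i < 2; i++)
		pthread_join(t[i], NULL);
	return 0;
}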
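The _msg_engine() rewrite fixes the "same buffer" bug from the changelog: every connection handler used to receive a pointer to a single stack-allocated slurm_addr, so the next accept() could overwrite the client address while an earlier handler thread was still reading it. The fixed shape, one heap allocation per connection, freed by the handler (hence the added xfree(con->cli_addr) in _service_connection()), looks like the sketch below, written with plain BSD sockets and illustrative names rather than SLURM's types:

/*
 * Sketch: per-connection address allocation in an accept loop.
 * Build: cc -pthread -o engine engine.c ; connect to 127.0.0.1:7070.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <unistd.h>

struct conn {
	int fd;
	struct sockaddr_in *cli;	/* owned by the handler thread */
};

static void *handle_connection(void *arg)
{
	struct conn *c = arg;
	char ip[INET_ADDRSTRLEN];

	inet_ntop(AF_INET, &c->cli->sin_addr, ip, sizeof(ip));
	printf("connection from %s\n", ip);
	close(c->fd);
	free(c->cli);		/* mirrors the added xfree(con->cli_addr) */
	free(c);
	return NULL;
}

int main(void)
{
	int lfd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in sin = { .sin_family = AF_INET,
				   .sin_port = htons(7070),
				   .sin_addr.s_addr = htonl(INADDR_LOOPBACK) };

	if (lfd < 0 || bind(lfd, (struct sockaddr *) &sin, sizeof(sin)) < 0 ||
	    listen(lfd, 16) < 0) {
		perror("socket/bind/listen");
		return EXIT_FAILURE;
	}
	for (;;) {
		/* Fresh address per connection: never shared between
		 * concurrently running handler threads. */
		struct sockaddr_in *cli = calloc(1, sizeof(*cli));
		socklen_t len = sizeof(*cli);
		int fd = accept(lfd, (struct sockaddr *) cli, &len);
		if (fd < 0) {
			free(cli);	/* accept failed: don't leak */
			continue;
		}
		struct conn *c = malloc(sizeof(*c));
		c->fd = fd;
		c->cli = cli;
		pthread_t tid;
		pthread_create(&tid, NULL, handle_connection, c);
		pthread_detach(tid);
	}
}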
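In srun, job_rc() now reports the maximum task wait-status instead of OR-ing all statuses together (OR-ing can fabricate a value no task actually returned), and job->rc is initialized to -1 so a legitimate status of zero is no longer mistaken for "not yet set". A simplified standalone rendering of that logic (the WIFEXITED() guard here is an extra safety check, not in the original):

/*
 * Sketch: job return code = max of task wait-statuses, collapsed to
 * an exit code when one is present.
 */
#include <stdio.h>
#include <sys/wait.h>

static int job_rc(int *tstatus, int nprocs)
{
	int rc = -1;	/* -1 == not yet computed */

	for (int i = 0; i < nprocs; i++)
		if (rc < tstatus[i])
			rc = tstatus[i];

	/* Collapse a wait() status to its exit code when present. */
	if (WIFEXITED(rc) && WEXITSTATUS(rc))
		rc = WEXITSTATUS(rc);
	return rc;
}

int main(void)
{
	/* Exit statuses as wait() encodes them: code << 8. */
	int tstatus[] = { 0 << 8, 1 << 8, 0 << 8 };

	printf("job rc = %d\n", job_rc(tstatus, 3));	/* prints 1 */
	return 0;
}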
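Finally, job_destroy() is made idempotent through the new removed flag, and for steps run under a pre-existing allocation (old_job, per the job.h comment) srun now sends SIGKILL to the step before issuing the complete request, so no task can outlive the step-complete message. A compact sketch with stubbed stand-ins for slurm_kill_job_step() and slurm_complete_job_step():

/*
 * Sketch: idempotent teardown; kill first, then complete.
 */
#include <signal.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
	uint32_t jobid, stepid;
	bool old_job;
	bool removed;	/* job has been removed from SLURM */
} job_t;

/* Stand-ins for slurm_kill_job_step()/slurm_complete_job_step(). */
static void kill_job_step(job_t *j, int sig)
{
	printf("kill step %u.%u sig %d\n",
	       (unsigned) j->jobid, (unsigned) j->stepid, sig);
}
static void complete_job_step(job_t *j, int rc)
{
	printf("complete step %u.%u rc %d\n",
	       (unsigned) j->jobid, (unsigned) j->stepid, rc);
}

static void job_destroy(job_t *job, int error)
{
	if (job->removed)	/* second call is a no-op */
		return;

	if (job->old_job) {
		/* Kill first, then complete, so no task survives
		 * the completion request. */
		kill_job_step(job, SIGKILL);
		complete_job_step(job, error);
	}
	job->removed = true;
}

int main(void)
{
	job_t j = { .jobid = 42, .stepid = 0, .old_job = true };

	job_destroy(&j, 0);
	job_destroy(&j, 0);	/* ignored: already removed */
	return 0;
}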