From e371646323d3b70f2aa010fc89c497e5bdeabd30 Mon Sep 17 00:00:00 2001
From: Mark Grondona <mgrondona@llnl.gov>
Date: Fri, 21 Nov 2003 21:46:20 +0000
Subject: [PATCH] o Merged fixes from slurm-0-2-branch at slurm-0-2-22-1
   - fixes to help ensure slurmd uses the same key for shared memory on a
     restart (to avoid losing track of jobs)
   - slurmd only runs one launch thread at a time
   - fix bug in slurmd where multiple threads used the same address space
     for the connecting client address
   - srun always sends SIGKILL to the job step before issuing a complete
     request
   - changed short string for draining nodes from "drain" to "drng"
   - srun default launch message timeout increased to 5s

---
 doc/man/man5/slurm.conf.5        | 13 +++--
 src/common/slurm_protocol_defs.c |  2 +-
 src/slurmd/req.c                 |  5 ++
 src/slurmd/shm.c                 | 43 ++++++++++-----
 src/slurmd/slurmd.c              | 93 +++++++++++++++++---------------
 src/srun/job.c                   | 20 +++++--
 src/srun/job.h                   |  3 +-
 src/srun/opt.c                   |  2 +-
 8 files changed, 113 insertions(+), 68 deletions(-)

diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5
index d41de1d60d7..0f1b6b78751 100644
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -238,10 +238,15 @@ on the same nodes or the values of \fBSlurmctldPort\fR and \fBSlurmdPort\fR
 must be different.
 .TP
 \fBSlurmdSpoolDir\fR
-Fully qualified pathname of a file into which the \fBslurmd\fR daemon's state 
-information is written. This must be a common pathname for all nodes, but 
-should represent a file which is local to each node (reference a local file
-system). The default value is "/tmp/slurmd".
+Fully qualified pathname of a directory into which the \fBslurmd\fR
+daemon's state information and batch job script information are written. This
+must be a common pathname for all nodes, but should represent a directory which
+is local to each node (reference a local file system). The default value
+is "/var/spool/slurmd." \fBNOTE\fR: This directory is also used to store \fBslurmd\fR's
+shared memory lockfile, and \fBshould not be changed\fR unless the system
+is being cleanly restarted. If the location of \fBSlurmdSpoolDir\fR is
+changed and \fBslurmd\fR is restarted, the new daemon will attach to a
+different shared memory region and lose track of any running jobs.
 .TP
 \fBSlurmdTimeout\fR
 The interval, in seconds, that the SLURM controller waits for \fBslurmd\fR 
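
Note: the documented default above can be pinned explicitly in slurm.conf;
a minimal fragment (the path shown is the default, not a requirement):

    # slurm.conf fragment -- keep this path stable across slurmd
    # restarts, since the shared memory lockfile (and thus the IPC
    # key) lives underneath it.
    SlurmdSpoolDir=/var/spool/slurmd
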
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index 594cada2f0e..44f94dabaf2 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -417,7 +417,7 @@ char *node_state_string_compact(enum node_states inx)
 		"IDLE",
 		"ALLOC",
 		"DRAIN",
-		"DRAIN",
+		"DRNG",
 		"COMP",
 		"END"
 	};
diff --git a/src/slurmd/req.c b/src/slurmd/req.c
index 6490dcb70c9..7b09998321a 100644
--- a/src/slurmd/req.c
+++ b/src/slurmd/req.c
@@ -78,6 +78,7 @@ static int  _run_prolog(uint32_t jobid, uid_t uid);
 static int  _run_epilog(uint32_t jobid, uid_t uid);
 static int  _wait_for_procs(uint32_t job_id);
 
+static pthread_mutex_t launch_mutex = PTHREAD_MUTEX_INITIALIZER;
 
 void
 slurmd_req(slurm_msg_t *msg, slurm_addr *cli)
@@ -86,12 +87,16 @@ slurmd_req(slurm_msg_t *msg, slurm_addr *cli)
 
 	switch(msg->msg_type) {
 	case REQUEST_BATCH_JOB_LAUNCH:
+		slurm_mutex_lock(&launch_mutex);
 		_rpc_batch_job(msg, cli);
 		slurm_free_job_launch_msg(msg->data);
+		slurm_mutex_unlock(&launch_mutex);
 		break;
 	case REQUEST_LAUNCH_TASKS:
+		slurm_mutex_lock(&launch_mutex);
 		_rpc_launch_tasks(msg, cli);
 		slurm_free_launch_tasks_request_msg(msg->data);
+		slurm_mutex_unlock(&launch_mutex);
 		break;
 	case REQUEST_KILL_TASKS:
 		_rpc_kill_tasks(msg, cli);
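
Note: the two lock/unlock pairs above serialize launch RPCs while leaving
other request types concurrent. A standalone sketch of the same pattern,
assuming nothing from slurmd beyond pthreads (handle_launch is an
illustrative name, not a slurmd symbol):

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t launch_mutex = PTHREAD_MUTEX_INITIALIZER;

    static void *handle_launch(void *arg)
    {
        long id = (long) arg;

        pthread_mutex_lock(&launch_mutex);
        printf("launch %ld: start\n", id);
        usleep(100000);                 /* stand-in for prolog + exec */
        printf("launch %ld: done\n", id);
        pthread_mutex_unlock(&launch_mutex);
        return NULL;
    }

    int main(void)
    {
        pthread_t t[4];
        long i;

        for (i = 0; i < 4; i++)
            pthread_create(&t[i], NULL, handle_launch, (void *) i);
        for (i = 0; i < 4; i++)
            pthread_join(t[i], NULL);
        return 0;                       /* launches never interleave */
    }
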
diff --git a/src/slurmd/shm.c b/src/slurmd/shm.c
index 5ab547f3cb0..af4ac0f9fa9 100644
--- a/src/slurmd/shm.c
+++ b/src/slurmd/shm.c
@@ -117,7 +117,8 @@ typedef struct shmem_struct {
  * static variables: 
  * */
 static sem_t        *shm_lock;
-static char         *lockname;
+static char         *lockname;
+static char         *lockdir;
 static slurmd_shm_t *slurmd_shm;
 static int           shmid;
 static pid_t         attach_pid = (pid_t) 0;
@@ -193,22 +194,25 @@ shm_fini(void)
 	/* detach segment from local memory */
 	if (shmdt(slurmd_shm) < 0) {
 		error("shmdt: %m");
-		return -1;
+		goto error;
 	}
 
 	slurmd_shm = NULL;
 
 	if (destroy && (shmctl(shmid, IPC_RMID, NULL) < 0)) {
 		error("shmctl: %m");
-		return -1;
+		goto error;
 	}
 	_shm_unlock();
 	if (destroy && (_shm_unlink_lock() < 0)) {
 		error("_shm_unlink_lock: %m");
-		return -1;
+		goto error;
 	}
 
 	return 0;
+
+    error:
+	return -1;
 }
 
 void
@@ -218,8 +222,13 @@ shm_cleanup(void)
 	key_t key;
 	int id = -1;
 
-	info("request to destroy shm lock [%s]", SHM_LOCKNAME);
+	if (!lockdir)
+		lockdir = xstrdup(conf->spooldir);
+
 	if ((s = _create_ipc_name(SHM_LOCKNAME))) {
+
+		info("request to destroy shm lock [%s]", s);
+
 		key = ftok(s, 1);
 		if (sem_unlink(s) < 0)
 			error("sem_unlink: %m");
@@ -304,25 +313,24 @@ _is_valid_ipc_name(const char *name)
 	return(1);
 }
 
+/*
+ *  Create IPC name by appending `name' to the slurmd spooldir
+ *  setting.
+ */
 static char *
 _create_ipc_name(const char *name)
 {
 	char *dst = NULL, *dir = NULL, *slash = NULL;
 	int rc;
 
+	xassert (lockdir != NULL);
+
 	if ((rc = _is_valid_ipc_name(name)) != 1)
 		fatal("invalid ipc name: `%s' %d", name, rc);
 	else if (!(dst = xmalloc(PATH_MAX)))
 		fatal("memory allocation failure");
 
-#if defined(POSIX_IPC_PREFIX) && defined(HAVE_POSIX_SEMS)
-	dir = POSIX_IPC_PREFIX;
-#else
-	if (  !(dir = conf->spooldir) 
-	   && !(strlen(dir)) 
-	   && !(dir = getenv("TMPDIR"))) 
-		dir = "/tmp";
-#endif /* POSIX_IPC_PREFIX */
+	dir = lockdir;
 
 	slash = (dir[strlen(dir) - 1] == '/') ? "" : "/";
 
@@ -1086,6 +1094,15 @@ _shm_lock_and_initialize()
 	/*
 	 * Create locked semaphore (initial value == 0)
 	 */
+
+	/* 
+	 * Init lockdir to slurmd spooldir. 
+	 * Make sure it does not change for this instance of slurmd,
+	 *  even if spooldir does.
+	 */
+	if (!lockdir)
+		lockdir = xstrdup(conf->spooldir);
+
 	shm_lock = _sem_open(SHM_LOCKNAME, O_CREAT|O_EXCL, 0600, 0);
 	debug3("slurmd lockfile is \"%s\"", lockname);
 
diff --git a/src/slurmd/slurmd.c b/src/slurmd/slurmd.c
index 3920164b7b4..3a732375a74 100644
--- a/src/slurmd/slurmd.c
+++ b/src/slurmd/slurmd.c
@@ -31,6 +31,7 @@
 
 #include <fcntl.h>
 #include <string.h>
+#include <stdlib.h>
 #include <pthread.h>
 #include <sys/stat.h>
 #include <sys/time.h>
@@ -138,6 +139,10 @@ main (int argc, char *argv[])
 
 	log_init(argv[0], conf->log_opts, LOG_DAEMON, conf->logfile);
 
+	xsignal(SIGTERM, &_term_handler);
+	xsignal(SIGINT,  &_term_handler);
+	xsignal(SIGHUP,  &_hup_handler );
+
 	/* 
 	 * Run slurmd_init() here in order to report early errors
 	 * (with shared memory and public keyfile)
@@ -161,6 +166,12 @@ main (int argc, char *argv[])
 
 	_kill_old_slurmd();
 
+	/* 
+	 * Restore any saved revoked credential information
+	 */
+	if (_restore_cred_state(conf->vctx))
+		return SLURM_FAILURE;
+	
 	if (interconnect_node_init() < 0)
 		fatal("Unable to initialize interconnect.");
 
@@ -174,10 +185,6 @@ main (int argc, char *argv[])
         if (send_registration_msg(SLURM_SUCCESS) < 0) 
 		error("Unable to register with slurm controller");
 
-	xsignal(SIGTERM, &_term_handler);
-	xsignal(SIGINT,  &_term_handler);
-	xsignal(SIGHUP,  &_hup_handler );
-
 	_install_fork_handlers();
 	list_install_fork_handlers();
 
@@ -201,34 +208,32 @@ main (int argc, char *argv[])
 	return 0;
 }
 
+
 static void
 _msg_engine()
 {
 	slurm_fd sock;
-	slurm_addr cli;
 
-	while (1) {
-		if (_shutdown)
-			break;
-  again:
-		if ((sock = slurm_accept_msg_conn(conf->lfd, &cli)) < 0) {
-			if (errno == EINTR) {
-				if (_shutdown) {
-					verbose("got shutdown request");
-					break;
-				}
-				if (_reconfig) {
-					_reconfigure();
-					verbose("got reconfigure request");
-				}
-				goto again;
-			}
-			error("accept: %m");
+	while (!_shutdown) {
+		slurm_addr *cli = xmalloc (sizeof (*cli));
+		if ((sock = slurm_accept_msg_conn(conf->lfd, cli)) >= 0) {
+			_handle_connection(sock, cli);
 			continue;
 		}
-		if (sock > 0)
-			_handle_connection(sock, &cli);
+		/*
+		 *  Otherwise, accept() failed.
+		 */
+		xfree (cli);
+		if (errno == EINTR) {
+			if (_reconfig) {
+				verbose("got reconfigure request");
+				_reconfigure();
+			}
+			continue;
+		} 
+		error("accept: %m");
 	}
+	verbose("got shutdown request");
 	slurm_shutdown_msg_engine(conf->lfd);
 	return;
 }
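
Note: the rewritten loop above fixes the bug where every service thread
received a pointer to the same slurm_addr, letting a fast second accept()
overwrite it before the first thread read it. A minimal sketch of the
per-connection heap fix; conn_info and serve are illustrative names:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct conn_info { int client_id; };

    static void *serve(void *arg)
    {
        struct conn_info *ci = arg;
        printf("serving client %d\n", ci->client_id);
        free(ci);                       /* handler owns and frees it */
        return NULL;
    }

    int main(void)
    {
        int i;
        for (i = 0; i < 4; i++) {
            /* one private copy per "connection" -- never a shared
             * stack variable reused across loop iterations         */
            struct conn_info *ci = malloc(sizeof(*ci));
            pthread_t t;

            if (!ci)
                return 1;
            ci->client_id = i;
            pthread_create(&t, NULL, serve, ci);
            pthread_detach(t);
        }
        pthread_exit(NULL);             /* let detached threads finish */
    }
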
@@ -334,6 +339,7 @@ _service_connection(void *arg)
 		error ("close(%d): %m", con->fd);
 
     done:
+	xfree(con->cli_addr);
 	xfree(con);
 	slurm_free_msg(msg);
 	_decrement_thd_count();
@@ -343,21 +349,24 @@ _service_connection(void *arg)
 int
 send_registration_msg(uint32_t status)
 {
+	int retval = SLURM_SUCCESS;
 	slurm_msg_t req;
 	slurm_msg_t resp;
-	slurm_node_registration_status_msg_t msg;
+	slurm_node_registration_status_msg_t *msg = xmalloc (sizeof (*msg));
 
-	_fill_registration_msg(&msg);
-	msg.status   = status;
+	_fill_registration_msg(msg);
+	msg->status  = status;
 
 	req.msg_type = MESSAGE_NODE_REGISTRATION_STATUS;
-	req.data     = &msg;
+	req.data     = msg;
 
 	if (slurm_send_recv_controller_msg(&req, &resp) < 0) {
 		error("Unable to register: %m");
-		return SLURM_FAILURE;
+		retval = SLURM_FAILURE;
 	}
 
+	slurm_free_node_registration_status_msg (msg);
+
 	/* XXX look at response msg
 	 */
 
@@ -372,7 +381,7 @@ _fill_registration_msg(slurm_node_registration_status_msg_t *msg)
 	job_step_t  *s;
 	int          n;
 
-	msg->node_name = conf->hostname;
+	msg->node_name = xstrdup (conf->hostname);
 
 	get_procs(&msg->cpus);
 	get_memory(&msg->real_memory_size);
@@ -640,10 +649,12 @@ _slurmd_init()
 		return SLURM_FAILURE;
 
 	/* 
-	 * Restore any saved revoked credential information
+	 * Create slurmd spool directory if necessary.
 	 */
-	if (_restore_cred_state(conf->vctx))
+	if (_set_slurmd_spooldir() < 0) {
+		error("Unable to initialize slurmd spooldir");
 		return SLURM_FAILURE;
+	}
 
 	/*
 	 * Cleanup shared memory if so configured
@@ -660,18 +671,14 @@ _slurmd_init()
 
 	/*
 	 * Initialize slurmd shared memory
+	 *  This *must* be called after _set_slurmd_spooldir()
+	 *  since the default location of the slurmd lockfile is
+	 *  _in_ the spooldir.
+	 *
 	 */
 	if (shm_init(true) < 0)
 		return SLURM_FAILURE;
 
-	/* 
-	 * Create slurmd spool directory if necessary.
-	 */
-	if (_set_slurmd_spooldir() < 0) {
-		error("Unable to initialize slurmd spooldir");
-		return SLURM_FAILURE;
-	}
-
 	if (conf->daemonize && (chdir("/tmp") < 0)) {
 		error("Unable to chdir to /tmp");
 		return SLURM_FAILURE;
@@ -689,8 +696,8 @@ _restore_cred_state(slurm_cred_ctx_t ctx)
 	int cred_fd, data_allocated, data_read = 0;
 	Buf buffer = NULL;
 
-	if ((mkdir(conf->spooldir, 0755) < 0) && 
-	    (errno != EEXIST)) {
+	if ( (mkdir(conf->spooldir, 0755) < 0) 
+	   && (errno != EEXIST) ) {
 		error("mkdir(%s): %m", conf->spooldir);
 		return SLURM_ERROR;
 	}
@@ -725,7 +732,7 @@ static int
 _slurmd_fini()
 {
 	save_cred_state(conf->vctx);
-	shm_fini();
+	shm_fini();
 	return SLURM_SUCCESS;
 }
 
diff --git a/src/srun/job.c b/src/srun/job.c
index cf27a2107c6..d3b901a5d35 100644
--- a/src/srun/job.c
+++ b/src/srun/job.c
@@ -219,12 +219,15 @@ int
 job_rc(job_t *job)
 {
 	int i;
-	int rc;
+	int rc = 0;
 
-	if (job->rc) return(job->rc);
+	if (job->rc >= 0) return(job->rc);
 
-	for (i = 0; i < opt.nprocs; i++) 
-		job->rc |= job->tstatus[i];
+
+	for (i = 0; i < opt.nprocs; i++) {
+		if (job->rc < job->tstatus[i])
+			job->rc = job->tstatus[i];
+	}
 
 	if ((rc = WEXITSTATUS(job->rc)))
 		job->rc = rc;
@@ -248,8 +251,12 @@ void job_fatal(job_t *job, const char *msg)
 void 
 job_destroy(job_t *job, int error)
 {
+	if (job->removed)
+		return;
+
 	if (job->old_job) {
 		debug("cancelling job step %u.%u", job->jobid, job->stepid);
+		slurm_kill_job_step(job->jobid, job->stepid, SIGKILL);
 		slurm_complete_job_step(job->jobid, job->stepid, 0, error);
 	} else if (!opt.no_alloc) {
 		debug("cancelling job %u", job->jobid);
@@ -263,6 +270,8 @@ job_destroy(job_t *job, int error)
 #ifdef HAVE_TOTALVIEW
 	if (error) tv_launch_failure();
 #endif
+
+	job->removed = true;
 }
 
 
@@ -389,7 +398,7 @@ _job_create_internal(allocation_info_t *info)
 	job->state = SRUN_JOB_INIT;
 
 	job->signaled = false;
-	job->rc       = 0;
+	job->rc       = -1;
 
 	job->nodelist = xstrdup(info->nodelist);
 	hl = hostlist_create(job->nodelist);
@@ -398,6 +407,7 @@ _job_create_internal(allocation_info_t *info)
 	job->jobid   = info->jobid;
 	job->stepid  = info->stepid;
 	job->old_job = false;
+	job->removed = false;
 
 	/* 
 	 *  Initialize Launch and Exit timeout values
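
Note: the job_rc() change above replaces an OR-fold of raw wait statuses
with a max-fold, since OR-ing two distinct exit codes fabricates a third.
A small demonstration; raw statuses are faked here as (code << 8),
matching the layout WEXITSTATUS() decodes on typical systems:

    #include <stdio.h>
    #include <sys/wait.h>

    int main(void)
    {
        int tstatus[2] = { 1 << 8, 2 << 8 };   /* exit codes 1 and 2 */
        int or_rc = 0, max_rc = 0, i;

        for (i = 0; i < 2; i++) {
            or_rc |= tstatus[i];
            if (max_rc < tstatus[i])
                max_rc = tstatus[i];
        }
        printf("OR-fold  -> exit code %d\n", WEXITSTATUS(or_rc));  /* 3 */
        printf("max-fold -> exit code %d\n", WEXITSTATUS(max_rc)); /* 2 */
        return 0;
    }
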
diff --git a/src/srun/job.h b/src/srun/job.h
index 6c254d331ec..177f6c2b6b9 100644
--- a/src/srun/job.h
+++ b/src/srun/job.h
@@ -75,6 +75,7 @@ typedef struct srun_job {
 	uint32_t jobid;		/* assigned job id 	                  */
 	uint32_t stepid;	/* assigned step id 	                  */
 	bool old_job;           /* run job step under previous allocation */
+	bool removed;           /* job has been removed from SLURM         */
 
 	job_state_t state;	/* job state	   	                  */
 	pthread_mutex_t state_mutex; 
@@ -92,7 +93,7 @@ typedef struct srun_job {
 	uint32_t **tids;	/* host id => task ids mapping    */
 	uint32_t *hostid;	/* task id => host id mapping     */
 
-        slurm_addr *slurmd_addr;/* slurm_addr vector to slurmd's */
+	slurm_addr *slurmd_addr;/* slurm_addr vector to slurmd's */
 
 	pthread_t sigid;	/* signals thread tid		  */
 
diff --git a/src/srun/opt.c b/src/srun/opt.c
index 6797678be34..6d204c26a37 100644
--- a/src/srun/opt.c
+++ b/src/srun/opt.c
@@ -424,7 +424,7 @@ static void _opt_default()
 	opt.exc_nodes	    = NULL;
 	opt.max_launch_time = 60; /* 60 seconds to launch job             */
 	opt.max_exit_timeout= 60; /* Warn user 60 seconds after task exit */
-	opt.msg_timeout     = 2;  /* Default launch msg timeout           */
+	opt.msg_timeout     = 5;  /* Default launch msg timeout           */
 
 	mode	= MODE_NORMAL;
 
-- 
GitLab