From 808f912a69557d6c0d59cd8df10360ba56aaa47c Mon Sep 17 00:00:00 2001
From: Mark Grondona <mgrondona@llnl.gov>
Date: Fri, 21 Mar 2003 01:00:57 +0000
Subject: [PATCH]  o src/slurmd : fail job launch if shared memory is filled. 
 o src/slurmd : do not bother to sleep full timeout in rpc_timelimit    if the
 job step goes away on SIGTERM.  o src/common/slurm_auth.c : get default
 plugin dir from SLURM_PLUGIN_PATH    rather than always "/usr/local/lib"
  o other small fixes

---
 slurm/slurm_errno.h           |  2 ++
 src/common/io_hdr.c           |  2 ++
 src/common/slurm_auth.c       |  2 +-
 src/common/slurm_errno.c      |  4 ++++
 src/plugins/auth/auth_munge.c |  2 ++
 src/slurmd/job.c              |  8 +++++---
 src/slurmd/job.h              |  2 +-
 src/slurmd/mgr.c              |  8 +++++++-
 src/slurmd/req.c              | 21 ++++++++++++++-------
 src/slurmd/shm.c              |  7 +++----
 src/slurmd/smgr.c             |  2 +-
 11 files changed, 42 insertions(+), 18 deletions(-)

diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h
index 2a976cd2b6f..2173a6dcd0a 100644
--- a/slurm/slurm_errno.h
+++ b/slurm/slurm_errno.h
@@ -158,6 +158,8 @@ enum {
 	ESLURMD_PROLOG_FAILED,
 	ESLURMD_EPILOG_FAILED,
 	ESLURMD_SESSION_KILLED,
+	ESLURMD_TOOMANYSTEPS,
+	ESLURMD_STEP_EXISTS,
 
 	/* slurmd errors in user batch job */
 	ESCRIPT_CHDIR_FAILED =			4100,
diff --git a/src/common/io_hdr.c b/src/common/io_hdr.c
index 5bebe568d91..c855c28e992 100644
--- a/src/common/io_hdr.c
+++ b/src/common/io_hdr.c
@@ -34,6 +34,7 @@
 
 #define IO_HDR_VERSION 0xa001
 
+/*
 static void
 _print_data(char *data, int datalen)
 {
@@ -46,6 +47,7 @@ _print_data(char *data, int datalen)
 
 	info("data: %s", buf);
 }
+*/
 
 
 static void
diff --git a/src/common/slurm_auth.c b/src/common/slurm_auth.c
index 7f45a7dbe7e..465533a0776 100644
--- a/src/common/slurm_auth.c
+++ b/src/common/slurm_auth.c
@@ -104,7 +104,7 @@ get_plugin_dir( void )
 		read_slurm_conf_ctl( &conf );
 	}
 	if ( conf.plugindir == NULL ) {
-		conf.plugindir = xstrdup( "/usr/local/lib" );
+		conf.plugindir = xstrdup( SLURM_PLUGIN_PATH );
 	}
 	slurm_mutex_unlock( &config_lock );
 	
diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c
index ca1413abcef..23e96bf8e38 100644
--- a/src/common/slurm_errno.c
+++ b/src/common/slurm_errno.c
@@ -218,6 +218,10 @@ static slurm_errtab_t slurm_errtab[] = {
 	  "Job epilog failed"			        	},
 	{ ESLURMD_SESSION_KILLED,
 	  "Session manager killed"		        	},
+	{ ESLURMD_TOOMANYSTEPS,
+	  "Too many job steps on node"		        	},
+	{ ESLURMD_STEP_EXISTS,
+	  "Job step already in shared memory"	        	},
 
 	/* slurmd errors in user batch job */
 	{ ESCRIPT_CHDIR_FAILED,
diff --git a/src/plugins/auth/auth_munge.c b/src/plugins/auth/auth_munge.c
index dd48818827c..cd5a55bd0a5 100644
--- a/src/plugins/auth/auth_munge.c
+++ b/src/plugins/auth/auth_munge.c
@@ -171,6 +171,7 @@ slurm_auth_free( slurm_auth_credential_t *cred )
 	 */
 	if (cred->m_str) free(cred->m_str);
 	xfree(cred);
+	return SLURM_SUCCESS;
 }
 
 /*
@@ -381,6 +382,7 @@ slurm_auth_print( slurm_auth_credential_t *cred, FILE *fp )
 	fprintf(fp, "BEGIN SLURM MUNGE AUTHENTICATION CREDENTIAL\n" );
 	fprintf(fp, "%s\n", cred->m_str );
 	fprintf(fp, "END SLURM MUNGE AUTHENTICATION CREDENTIAL\n" );
+	return SLURM_SUCCESS;
 }
 
 int
diff --git a/src/slurmd/job.c b/src/slurmd/job.c
index ac53d461749..117fb5fec5c 100644
--- a/src/slurmd/job.c
+++ b/src/slurmd/job.c
@@ -422,7 +422,7 @@ task_info_destroy(task_info_t *t)
 	xfree(t);
 }
 
-void
+int
 job_update_shm(slurmd_job_t *job)
 {
 	job_step_t s;
@@ -436,13 +436,15 @@ job_update_shm(slurmd_job_t *job)
 	s.sw_id     = 0;
 	s.io_update = false;
 
-	if (shm_insert_step(&s) < 0)
-		error("Updating shm with new step info: %m");
+	if (shm_insert_step(&s) < 0) 
+		return SLURM_ERROR;
 
 	if (job->stepid == NO_VAL)
 		debug("updated shm with job %d", job->jobid);
 	else
 		debug("updated shm with step %d.%d", job->jobid, job->stepid);
+
+	return SLURM_SUCCESS;
 }
 
 void 
diff --git a/src/slurmd/job.h b/src/slurmd/job.h
index 735f9d5eb9b..f337bc47251 100644
--- a/src/slurmd/job.h
+++ b/src/slurmd/job.h
@@ -141,7 +141,7 @@ struct task_info * task_info_create(int taskid, int gtaskid);
 
 void task_info_destroy(struct task_info *t);
 
-void job_update_shm(slurmd_job_t *job);
+int job_update_shm(slurmd_job_t *job);
 
 void job_delete_shm(slurmd_job_t *job);
 
diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c
index 376df11a530..3a77bed2173 100644
--- a/src/slurmd/mgr.c
+++ b/src/slurmd/mgr.c
@@ -362,7 +362,13 @@ _job_mgr(slurmd_job_t *job)
 	if (shm_init() < 0)
 		goto fail0;
 
-	job_update_shm(job);
+	if (job_update_shm(job) < 0) {
+		if (errno == ENOSPC) 
+			rc = ESLURMD_TOOMANYSTEPS;
+		else if (errno == EEXIST)
+			rc = ESLURMD_STEP_EXISTS;
+		goto fail0;
+	}
 
 	if (!job->batch && (interconnect_preinit(job) < 0)) {
 		rc = ESLURM_INTERCONNECT_FAILURE;
diff --git a/src/slurmd/req.c b/src/slurmd/req.c
index 65a363db0ac..fd0b9471a6f 100644
--- a/src/slurmd/req.c
+++ b/src/slurmd/req.c
@@ -490,13 +490,21 @@ _rpc_timelimit(slurm_msg_t *msg, slurm_addr *cli_addr)
 	 */
 	_kill_running_session_mgrs(req->job_id, SIGXCPU);
 
-	step_cnt = _kill_all_active_steps(req->job_id, SIGTERM);
+	if ((step_cnt = _kill_all_active_steps(req->job_id, SIGTERM)))
+		found_job = true;
 
-	info("Timeout for job=%u, step_cnt=%d, kill_wait=%u", 
-	     req->job_id, step_cnt, conf->cf.kill_wait);
+	verbose( "Job %u: timeout: sent SIGTERM to %d active steps", 
+	         req->job_id, step_cnt );
+
+	sleep(1);
+	/*
+	 * Check to see if any processes are still around
+	 */
+	if (found_job && _kill_all_active_steps(req->job_id, 0)) {
+
+		verbose( "Job %u: waiting %d secs for SIGKILL", 
+			 req->job_id, conf->cf.kill_wait       );
 
-	if (step_cnt) {
-		found_job = true;
 		sleep(conf->cf.kill_wait);
 	}
 
@@ -653,8 +661,7 @@ _kill_all_active_steps(uint32_t jobid, int sig)
 	}
 	list_destroy(steps);
 	if (step_cnt == 0)
-		debug2("No steps in jobid %d to send signal %d",
-		       jobid, sig);
+		debug2("No steps in jobid %d to send signal %d", jobid, sig);
 	return step_cnt;
 }
 
diff --git a/src/slurmd/shm.c b/src/slurmd/shm.c
index 05fbcee25e4..1f41896aa78 100644
--- a/src/slurmd/shm.c
+++ b/src/slurmd/shm.c
@@ -278,7 +278,7 @@ _is_valid_ipc_name(const char *name)
 static char *
 _create_ipc_name(const char *name)
 {
-	char *dst, *dir, *slash;
+	char *dst = NULL, *dir = NULL, *slash = NULL;
 	int rc;
 
 	if ((rc = _is_valid_ipc_name(name)) != 1)
@@ -419,11 +419,10 @@ shm_signal_step(uint32_t jobid, uint32_t stepid, uint32_t signal)
 	if ((i = _shm_find_step(jobid, stepid)) >= 0) {
 		s = &slurmd_shm->step[i];
 		for (t = _taskp(s->task_list); t; t = _taskp(t->next)) {
+			pid_t sid = getsid(t->pid);
 
-			if (getsid(t->pid) != s->sid) {
-				error ("Task pid is not in my session!");
+			if ((sid < (pid_t) 0) || (sid != s->sid))
 				continue;
-			}
 
 			if (t->pid > 0 && kill(t->pid, signo) < 0) {
 				error("kill %d.%d task %d pid %ld: %m", 
diff --git a/src/slurmd/smgr.c b/src/slurmd/smgr.c
index 6a9d910c2fc..67162b9cd5d 100644
--- a/src/slurmd/smgr.c
+++ b/src/slurmd/smgr.c
@@ -282,7 +282,7 @@ _exec_task(slurmd_job_t *job, int i)
 }
 
 static sig_atomic_t timelimit_exceeded = 0;
-static
+static void
 _xcpu_handler()
 {
 	timelimit_exceeded = 1;
-- 
GitLab