From 1cb0007c2df5b12e4157282ae15f44bcdda2fa22 Mon Sep 17 00:00:00 2001
From: "Christopher J. Morrone" <morrone2@llnl.gov>
Date: Fri, 16 Dec 2005 19:47:57 +0000
Subject: [PATCH] svn merge -r6839:6846 
 https://eris.llnl.gov/svn/slurm/branches/slurm-0-6-branch

---
 NEWS                                          |   1 +
 src/plugins/proctrack/linuxproc/kill_tree.c   | 159 +++++++++---------
 src/plugins/proctrack/linuxproc/kill_tree.h   |   1 -
 .../proctrack/linuxproc/proctrack_linuxproc.c |   2 +-
 src/plugins/proctrack/rms/proctrack_rms.c     |  11 ++
 src/plugins/switch/elan/qsw.c                 |  12 ++
 src/slurmd/slurmstepd/mgr.c                   |  44 ++++-
 src/slurmd/slurmstepd/pdebug.c                |  19 ++-
 8 files changed, 161 insertions(+), 88 deletions(-)

diff --git a/NEWS b/NEWS
index 4d59acaa052..ca9af4de580 100644
--- a/NEWS
+++ b/NEWS
@@ -102,6 +102,7 @@ documents those changes that are of interest to users and admins.
  -- Add job_id to maui scheduler plugin start job status message.
  -- Fix for srun's handling of null characters in stdout or stderr.
  -- Update job accounting for larger systems (Andy Riebs, uptodate.patch).
+ -- Fixes for proctrack/linuxproc and mpich-gm support (Takao Hatazaki, HP).
 
 * Changes in SLURM 0.6.9
 ========================
diff --git a/src/plugins/proctrack/linuxproc/kill_tree.c b/src/plugins/proctrack/linuxproc/kill_tree.c
index 2d60024a3e3..9bd2303aca6 100644
--- a/src/plugins/proctrack/linuxproc/kill_tree.c
+++ b/src/plugins/proctrack/linuxproc/kill_tree.c
@@ -39,12 +39,15 @@
 #include <strings.h>
 #include <unistd.h>
 #include <string.h>
+#include <limits.h>
 
 #include "src/common/xmalloc.h"
 #include "src/common/log.h"
 
 typedef struct xpid_s {
 	pid_t pid;
+	int is_usercmd;
+	char *cmd;
 	struct xpid_s *next;
 } xpid_t;
 
@@ -54,33 +57,36 @@ typedef struct xppid_s {
 	struct xppid_s *next;
 } xppid_t;
 
-#define MAX_NAME_LEN 64
 #define HASH_LEN 64
 
 #define GET_HASH_IDX(ppid) ((ppid)%HASH_LEN)
 
-static xpid_t *_alloc_pid(pid_t pid, xpid_t *next)
+static xpid_t *_alloc_pid(pid_t pid, int is_usercmd, char *cmd, xpid_t *next)
 {
 	xpid_t *new;
 
 	new = (xpid_t *)xmalloc(sizeof(*new));
 	new->pid = pid;
+	new->is_usercmd = is_usercmd;
+	new->cmd = xstrdup(cmd);
 	new->next = next;
 	return new;
 }
 
-static xppid_t *_alloc_ppid(pid_t ppid, pid_t pid, xppid_t *next)
+static xppid_t *_alloc_ppid(pid_t ppid, pid_t pid, int is_usercmd, char *cmd,
+			    xppid_t *next)
 {
 	xppid_t *new;
 
 	new = xmalloc(sizeof(*new));
 	new->ppid = ppid;
-	new->list = _alloc_pid(pid, NULL);
+	new->list = _alloc_pid(pid, is_usercmd, cmd, NULL);
 	new->next = next;
 	return new;
 }
 
-static void _push_to_hashtbl(pid_t ppid, pid_t pid, xppid_t **hashtbl)
+static void _push_to_hashtbl(pid_t ppid, pid_t pid,
+			     int is_usercmd, char *cmd, xppid_t **hashtbl)
 {
 	int idx;
 	xppid_t *ppids, *newppid;
@@ -90,21 +96,57 @@ static void _push_to_hashtbl(pid_t ppid, pid_t pid, xppid_t **hashtbl)
 	ppids = hashtbl[idx];
 	while (ppids) {
 		if (ppids->ppid == ppid) {
-			newpid = _alloc_pid(pid, ppids->list);
+			newpid = _alloc_pid(pid, is_usercmd, cmd, ppids->list);
 			ppids->list = newpid;
 			return;
 		}
 		ppids = ppids->next;
 	}
-	newppid = _alloc_ppid(ppid, pid, hashtbl[idx]);
+	newppid = _alloc_ppid(ppid, pid, is_usercmd, cmd, hashtbl[idx]);
 	hashtbl[idx] = newppid;    
 }
 
+/*
+ * Store the second whitespace-delimited field of /proc/<our pid>/stat
+ * (the command name, see proc(5)) in "s".
+ *
+ * "s" must be able to hold sizeof(rbuf) bytes, since the field is
+ * extracted from a buffer of that size; the caller in this file
+ * passes a 1024-byte array.
+ *
+ * Returns 0 on success, -1 (after logging an error) on failure.
+ */
+static int get_myname(char *s)
+{
+	char path[PATH_MAX], rbuf[1024];
+	ssize_t len;
+	int fd;
+
+	snprintf(path, sizeof(path), "/proc/%ld/stat", (long)getpid());
+	if ((fd = open(path, O_RDONLY)) < 0) {
+		error("Cannot open /proc/getpid()/stat");
+		return -1;
+	}
+	/* Leave room for the terminator: read() does not add one */
+	len = read(fd, rbuf, sizeof(rbuf) - 1);
+	close(fd);
+	if (len <= 0) {
+		error("Cannot read /proc/getpid()/stat");
+		return -1;
+	}
+	rbuf[len] = '\0';
+	if (sscanf(rbuf, "%*ld %s ", s) != 1) {
+		error("Cannot get the command name from /proc/getpid()/stat");
+		return -1;
+	}
+	return 0;
+}
+
 static xppid_t **_build_hashtbl()
 {
 	DIR *dir;
 	struct dirent *de;
-	char path[MAX_NAME_LEN], *endptr, *num, rbuf[1024];
+	char path[PATH_MAX], *endptr, *num, rbuf[1024];
+	char myname[1024], cmd[1024];
 	int fd;
 	long pid, ppid;
 	xppid_t **hashtbl;
@@ -113,6 +143,12 @@ static xppid_t **_build_hashtbl()
 		error("opendir(/proc): %m");
 		return NULL;
 	}
+	if (get_myname(myname) < 0) {
+		/* Do not leak the /proc directory stream on failure */
+		closedir(dir);
+		return NULL;
+	}
+	debug3("Myname in build_hashtbl: %s", myname);
 
 	hashtbl = (xppid_t **)xmalloc(HASH_LEN * sizeof(xppid_t *));
 
@@ -121,7 +153,7 @@ static xppid_t **_build_hashtbl()
 		strtol(num, &endptr, 10);
 		if (endptr == NULL || *endptr != 0)
 			continue;
-		snprintf(path, MAX_NAME_LEN, "/proc/%s/stat", num);
+		snprintf(path, sizeof(path), "/proc/%s/stat", num);
 		if ((fd = open(path, O_RDONLY)) < 0) {
 			continue;
 		}
@@ -129,35 +161,44 @@ static xppid_t **_build_hashtbl()
 			close(fd);
 			continue;
 		}
-		if (sscanf(rbuf, "%ld %*s %*s %ld", &pid, &ppid) != 2) {
+		if (sscanf(rbuf, "%ld %s %*s %ld", &pid, cmd, &ppid) != 3) {
 			close(fd);
 			continue;
 		}
 		close(fd);
-		_push_to_hashtbl((pid_t)ppid, (pid_t)pid, hashtbl);
+
+		/* Record cmd for debugging purpose */
+		_push_to_hashtbl((pid_t)ppid, (pid_t)pid, 
+				 strcmp(myname, cmd), cmd, hashtbl);
 	}
 	closedir(dir);
 	return hashtbl;
 }
 
+static void _destroy_list(xpid_t *list)
+{
+	xpid_t *tmp;
+
+	while (list) {
+		tmp = list->next;
+		xfree(list->cmd);
+		xfree(list);
+		list = tmp;
+	}
+}
+
 static void _destroy_hashtbl(xppid_t **hashtbl)
 {
 	int i;
-	xppid_t *ppid, *tmp2;
-	xpid_t *list, *tmp;
+	xppid_t *ppid, *tmp;
 
 	for (i=0; i<HASH_LEN; i++) {
 		ppid = hashtbl[i];
 		while (ppid) {
-			list = ppid->list;
-			while (list) {
-				tmp = list->next;
-				xfree(list);
-				list = tmp;
-			}
-			tmp2 = ppid->next;
+			_destroy_list(ppid->list);
+			tmp = ppid->next;
 			xfree(ppid);
-			ppid = tmp2;
+			ppid = tmp;
 		}
 	}
 	xfree(hashtbl);
@@ -174,7 +215,10 @@ static xpid_t *_get_list(int top, xpid_t *list, xppid_t **hashtbl)
 		if (ppid->ppid == top) {
 			children = ppid->list;
 			while (children) {
-				list = _alloc_pid(children->pid, list);
+				list = _alloc_pid(children->pid,
+						  children->is_usercmd,
+						  children->cmd,
+						  list);
 				children = children->next;
 			}
 			children = ppid->list;
@@ -189,25 +233,23 @@ static xpid_t *_get_list(int top, xpid_t *list, xppid_t **hashtbl)
 	return list;
 }
 
-static void _destroy_list(xpid_t *list)
-{
-	xpid_t *tmp;
-
-	while (list) {
-		tmp = list->next;
-		xfree(list);
-		list = tmp;
-	}
-}
-
 static int _kill_proclist(xpid_t *list, int sig)
 {
-	int rc = -1;
+	int rc, rc0;
 
+	rc = 0;
 	while (list) {
 		if (list->pid > 1) {
-			verbose("Sending %d to %d", sig, list->pid);
-			rc &= kill(list->pid, sig);
+			if (! list->is_usercmd) {
+				debug2("%ld %s is not a user command.  "
+				       "Skipped sending signal %d",
+				       (long)list->pid, list->cmd, sig);
+			} else {
+				verbose("Sending %d to %d %s",
+					sig, list->pid, list->cmd);
+				rc0 = kill(list->pid, sig);
+				if (rc0) rc = errno; /* save the last error */
+			}
 		}
 		list = list->next;
 	}
@@ -230,7 +272,7 @@ extern int kill_proc_tree(pid_t top, int sig)
 	if ((hashtbl = _build_hashtbl()) == NULL)
 		return -1;
 	
-	list = _get_list(top, _alloc_pid(top, NULL), hashtbl);
+	list = _get_list(top, NULL, hashtbl);
 	rc = _kill_proclist(list, sig);
 	_destroy_hashtbl(hashtbl);
 	_destroy_list(list);
@@ -238,50 +280,13 @@ extern int kill_proc_tree(pid_t top, int sig)
 }
 
 
-static int _kill_proclist_exclude(xpid_t *list, pid_t exclude, int sig)
-{
-	int rc = -1;
-
-	while (list) {
-		if (list->pid > 1 && list->pid != exclude) {
-			verbose("Sending %d to %d", sig, list->pid);
-			rc &= kill(list->pid, sig);
-		}
-		list = list->next;
-	}
-
-	return rc;
-}
-
-
-/*
- * Send signal "sig" to every process in the tree EXCEPT for the top.
- */
-extern int kill_proc_tree_not_top(pid_t top, int sig)
-{
-	xpid_t *list;
-	int rc;
-	xppid_t **hashtbl;
-
-	if ((hashtbl = _build_hashtbl()) == NULL)
-		return -1;
-
-	list = _get_list(top, _alloc_pid(top, NULL), hashtbl);
-	rc = _kill_proclist_exclude(list, top, sig);
-	_destroy_hashtbl(hashtbl);
-	_destroy_list(list);
-	
-	return rc;
-}
-
-
 /*
  * Return the pid of the process named "process_name" 
  * which is the ancestor of "process".
  */
 extern pid_t find_ancestor(pid_t process, char *process_name)
 {
-	char path[MAX_NAME_LEN], rbuf[1024];
+	char path[PATH_MAX], rbuf[1024];
 	int fd;
 	long pid, ppid;
 
@@ -291,7 +296,7 @@ extern pid_t find_ancestor(pid_t process, char *process_name)
 			return 0;
 		}
 
-		snprintf(path, MAX_NAME_LEN, "/proc/%d/stat", ppid);
+		snprintf(path, sizeof(path), "/proc/%ld/stat", ppid);
 		if ((fd = open(path, O_RDONLY)) < 0) {
 			return 0;
 		}
@@ -304,7 +309,7 @@ extern pid_t find_ancestor(pid_t process, char *process_name)
 			return 0;
 		}
 
-		snprintf(path, MAX_NAME_LEN, "/proc/%d/cmdline", pid);
+		snprintf(path, sizeof(path), "/proc/%ld/cmdline", pid);
 		if ((fd = open(path, O_RDONLY)) < 0) {
 			continue;
 		}
diff --git a/src/plugins/proctrack/linuxproc/kill_tree.h b/src/plugins/proctrack/linuxproc/kill_tree.h
index e185dd0441f..7e34ce6aaa4 100644
--- a/src/plugins/proctrack/linuxproc/kill_tree.h
+++ b/src/plugins/proctrack/linuxproc/kill_tree.h
@@ -31,7 +31,6 @@
 #include <sys/types.h>
 
 extern int kill_proc_tree(pid_t top, int sig);
-extern int kill_proc_tree_not_top(pid_t top, int sig);
 extern pid_t find_ancestor(pid_t process, char *process_name);
 /*
  * Some of processes may not be in the same process group
diff --git a/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c b/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c
index 1941c6e5d83..37c541e3485 100644
--- a/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c
+++ b/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c
@@ -105,7 +105,7 @@ extern int slurm_container_add ( slurmd_job_t *job, pid_t pid )
 
 extern int slurm_container_signal ( uint32_t id, int signal )
 {
-	return kill_proc_tree_not_top((pid_t)id, signal);
+	return kill_proc_tree((pid_t)id, signal);
 }
 
 extern int slurm_container_destroy ( uint32_t id )
diff --git a/src/plugins/proctrack/rms/proctrack_rms.c b/src/plugins/proctrack/rms/proctrack_rms.c
index ed91fa1692a..4d1f945fc77 100644
--- a/src/plugins/proctrack/rms/proctrack_rms.c
+++ b/src/plugins/proctrack/rms/proctrack_rms.c
@@ -218,10 +218,21 @@ _prg_destructor_fork()
 	} else if (pid > 0) {
 		/* parent */
 		close(fdpair[0]);
+		waitpid(pid, (int *)NULL, 0);
 		return fdpair[1];
 	}
 	
 	/****************************************/
+	/* fork again so the destructor process
+	 * will not be a child of the slurmd
+	 */
+	pid = fork();
+	if (pid < 0) {
+		error("_prg_destructor_fork: second fork failed");
+	} else if (pid > 0) {
+		exit(0);
+	}
+
 	/* child */
 	close(fdpair[1]);
 
diff --git a/src/plugins/switch/elan/qsw.c b/src/plugins/switch/elan/qsw.c
index 3ad42944ad6..cb213882ef9 100644
--- a/src/plugins/switch/elan/qsw.c
+++ b/src/plugins/switch/elan/qsw.c
@@ -1021,10 +1021,22 @@ _prg_destructor_fork()
 	} else if (pid > 0) {
 		/* parent */
 		close(fdpair[0]);
+		waitpid(pid, (int *)NULL, 0);
 		return fdpair[1];
 	}
 	
 	/****************************************/
+	/*
+	 * fork again so the destructor process
+	 * will not be a child of the slurmd
+	 */
+	pid = fork();
+	if (pid < 0) {
+		error("switch/elan: second fork failed");
+	} else if (pid > 0) {
+		exit(0);
+	}
+
 	/* child */
 	close(fdpair[1]);
 
diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c
index 5eb46b3da07..a8884bd2e0e 100644
--- a/src/slurmd/slurmstepd/mgr.c
+++ b/src/slurmd/slurmstepd/mgr.c
@@ -725,6 +725,9 @@ _send_pending_exit_msgs(slurmd_job_t *job)
  *
  * If waitflag is false, do repeated non-blocking waits until
  * there are no more processes to reap (waitpid returns 0).
+ *
+ * Returns the number of tasks for which a wait3() was successfully
+ * performed, or -1 if there are no child tasks.
  */
 static int
 _wait_for_any_task(slurmd_job_t *job, bool waitflag)
@@ -738,8 +741,26 @@ _wait_for_any_task(slurmd_job_t *job, bool waitflag)
 
 	do {
 		pid = wait3(&status, waitflag ? 0 : WNOHANG, &rusage);
-		if (pid <= 0)
-			continue;
+		if (pid == -1) {
+			if (errno == ECHILD) {
+				debug("No child processes");
+				/* Report -1 only if nothing was reaped
+				 * in this call (completed still 0) */
+				if (completed == 0)
+					completed = -1;
+				break;
+			} else if (errno == EINTR) {
+				debug("wait3 was interrupted");
+				continue;
+			} else {
+				debug("Unknown errno %d", errno);
+				continue;
+			}
+		} else if (pid == 0) {
+			/* WNOHANG and no child has changed state:
+			 * nothing to match against the task list */
+			continue;
+		}
 
 		/* See if the pid matches that of one of the tasks */
 		for (i = 0; i < job->ntasks; i++) {
@@ -785,12 +799,31 @@ _wait_for_any_task(slurmd_job_t *job, bool waitflag)
 static void
 _wait_for_all_tasks(slurmd_job_t *job)
 {
+	int tasks_left = 0;
 	int i;
 
-	for (i = 0; i < job->ntasks; ) {
-		i += _wait_for_any_task(job, true);
-		if (i < job->ntasks)
-			i += _wait_for_any_task(job, false);
+	/* Tasks already marked complete do not need to be waited for */
+	for (i = 0; i < job->ntasks; i++) {
+		if (job->task[i]->state < SLURMD_TASK_COMPLETE) {
+			tasks_left++;
+		}
+	}
+	if (tasks_left < job->ntasks)
+		verbose("Only %d of %d requested tasks successfully launched",
+			tasks_left, job->ntasks);
+
+	for (i = 0; i < tasks_left; ) {
+		int rc;
+		rc = _wait_for_any_task(job, true);
+		if (rc == -1) /* Got ECHILD */
+			break;
+		i += rc;
+		if (i < tasks_left) {
+			rc = _wait_for_any_task(job, false);
+			if (rc == -1) /* Got ECHILD */
+				break;
+			i += rc;
+		}
 
 		while (_send_pending_exit_msgs(job)) {;}
 	}
diff --git a/src/slurmd/slurmstepd/pdebug.c b/src/slurmd/slurmstepd/pdebug.c
index e84ac80ed98..a7560d5b309 100644
--- a/src/slurmd/slurmstepd/pdebug.c
+++ b/src/slurmd/slurmstepd/pdebug.c
@@ -43,16 +43,29 @@ pdebug_trace_process(slurmd_job_t *job, pid_t pid)
 		int status;
 		waitpid(pid, &status, WUNTRACED);
 		if (!WIFSTOPPED(status)) {
-			debug("pdebug_trace_process WIFSTOPPED false"
+			int i;
+			error("pdebug_trace_process WIFSTOPPED false"
 			      " for pid %lu", pid);
 			if (WIFEXITED(status)) {
-				debug("Process %lu exited \"normally\""
+				error("Process %lu exited \"normally\""
 				      " with return code %d",
 				      pid, WEXITSTATUS(status));
 			} else if (WIFSIGNALED(status)) {
-				debug("Process %lu kill by signal %d",
+				error("Process %lu killed by signal %d",
 				      pid, WTERMSIG(status));
 			}
+
+			/*
+			 * Mark this process as complete since it died
+			 * prematurely.
+			 */
+			for (i = 0; i < job->ntasks; i++) {
+				if (job->task[i]->pid == pid) {
+					job->task[i]->state =
+						SLURMD_TASK_COMPLETE;
+				}
+			}
+
 			return SLURM_ERROR;
 		}
 		if ((pid > (pid_t) 0) && (kill(pid, SIGSTOP) < 0)) {
-- 
GitLab