From cf519f6252aaaec21355de1e9365bd69d3d6e55a Mon Sep 17 00:00:00 2001
From: Mark Grondona <mgrondona@llnl.gov>
Date: Wed, 11 Jun 2003 18:16:04 +0000
Subject: [PATCH]  o use setproctitle() to attempt process title change in
 slurmd.  o add calls to interconnect_node_init/fini() in slurmd.  o move
 interconnect_postfini() above wait_for_io()

---
 src/slurmd/Makefile.am    |  31 ++---
 src/slurmd/mgr.c          |  52 ++++++-
 src/slurmd/setproctitle.c | 275 ++++++++++++++++++++++++++++++++++++++
 src/slurmd/setproctitle.h |  41 ++++++
 src/slurmd/slurmd.c       |  12 +-
 5 files changed, 389 insertions(+), 22 deletions(-)
 create mode 100644 src/slurmd/setproctitle.c
 create mode 100644 src/slurmd/setproctitle.h

diff --git a/src/slurmd/Makefile.am b/src/slurmd/Makefile.am
index f84f534fe18..5a42ad5db44 100644
--- a/src/slurmd/Makefile.am
+++ b/src/slurmd/Makefile.am
@@ -22,21 +22,22 @@ slurmd_LDADD = 					   \
 	$(SSL_LIBS) 
 
 
-common_sources = 	        \
-	slurmd.c slurmd.h       \
-	req.c req.h		\
-	mgr.c mgr.h		\
-	smgr.c smgr.h		\
-	get_mach_stat.c         \
-	get_mach_stat.h		\
-	read_proc.c 	        \
-	job.c job.h		\
-	io.c io.h		\
-	semaphore.c semaphore.h	\
-	shm.c shm.h		\
-	fname.c fname.h		\
-	ulimits.c ulimits.h     \
-	setenvpf.c setenvpf.h   \
+common_sources = 	        	\
+	slurmd.c slurmd.h       	\
+	req.c req.h			\
+	mgr.c mgr.h			\
+	smgr.c smgr.h			\
+	get_mach_stat.c         	\
+	get_mach_stat.h			\
+	read_proc.c 	        	\
+	job.c job.h			\
+	io.c io.h			\
+	semaphore.c semaphore.h		\
+	shm.c shm.h			\
+	fname.c fname.h			\
+	ulimits.c ulimits.h     	\
+	setenvpf.c setenvpf.h   	\
+	setproctitle.c setproctitle.h 	\
 	interconnect.h
 
 slurmd_SOURCES = $(common_sources) $(interconnect_sources)
diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c
index 31bc1e3e63e..ed4a32f9107 100644
--- a/src/slurmd/mgr.c
+++ b/src/slurmd/mgr.c
@@ -62,6 +62,7 @@
 
 #include "src/slurmd/slurmd.h"
 #include "src/slurmd/setenvpf.h"
+#include "src/slurmd/setproctitle.h"
 #include "src/slurmd/smgr.h"
 #include "src/slurmd/io.h"
 #include "src/slurmd/shm.h"
@@ -112,7 +113,9 @@ static int  _send_exit_msg(slurmd_job_t *job, int tid[], int n, int status);
 static void _set_unexited_task_status(slurmd_job_t *job, int status);
 static int  _send_pending_exit_msgs(slurmd_job_t *job);
 
-static void _setargs(slurmd_job_t *job, char **argv, int argc);
+static void _setargs(slurmd_job_t *job);
+
+static void _random_sleep(slurmd_job_t *job);
 
 /*
  * Batch job mangement prototypes:
@@ -140,7 +143,7 @@ mgr_launch_tasks(launch_tasks_request_msg_t *msg, slurm_addr *cli)
 
 	_set_job_log_prefix(job);
 
-	_setargs(job, *conf->argv, *conf->argc);
+	_setargs(job);
 
 	if (_job_mgr(job) < 0)
 		return SLURM_ERROR;
@@ -164,7 +167,7 @@ mgr_launch_batch_job(batch_job_launch_msg_t *msg, slurm_addr *cli)
 
 	_set_job_log_prefix(job);
 
-	_setargs(job, *conf->argv, *conf->argc);
+	_setargs(job);
 
 	if ((batchdir = _make_batch_dir(job)) == NULL) 
 		goto cleanup1;
@@ -308,6 +311,18 @@ _setup_io(slurmd_job_t *job)
 	return SLURM_SUCCESS;
 }
 
+/* Stagger exit traffic: sleep a pseudo-random 0..(3 * nnodes) ms. */
+static void
+_random_sleep(slurmd_job_t *job)
+{
+	long int max = (3 * job->nnodes);
+	long int delay;
+	/* reseeded from jobid + nodeid, so a node repeats the same delay */
+	srand48((long int) (job->jobid + job->nodeid));
+	delay = lrand48() % (max + 1);
+	debug3("delaying %ldms", delay);	/* %ld: delay is a long int */
+	poll(NULL, 0, (int) delay);		/* poll() takes an int timeout */
+}
 
 /*
  * Send task exit message for n tasks. tid is the list of _global_
@@ -329,6 +344,13 @@ _send_exit_msg(slurmd_job_t *job, int tid[], int n, int status)
 	resp.data        = &msg;
 	resp.msg_type    = MESSAGE_TASK_EXIT;
 
+	/*
+	 *  XXX Hack for TCP timeouts on exit of large, synchronized
+	 *  jobs. Delay a random amount if job->nnodes > 100
+	 */
+	if (job->nnodes > 100) 
+		_random_sleep(job);
+
 	/*
 	 * XXX: Should srun_list be associated with each task?
 	 */
@@ -418,14 +440,24 @@ _job_mgr(slurmd_job_t *job)
 
 	job_update_state(job, SLURMD_JOB_ENDING);
     fail2:
+
+	/*
+	 *  First call interconnect_postfini() - In at least one case,
+	 *    this will clean up any straggling processes. If this call
+	 *    is moved behind wait_for_io(), we may block waiting for IO
+	 *    on a hung process.
+	 */
+	if (!job->batch && (interconnect_postfini(job) < 0))
+		error("interconnect_postfini: %m");
+
 	/*
 	 * Wait for io thread to complete
 	 */
 	_wait_for_io(job);
+
+
 	job_update_state(job, SLURMD_JOB_COMPLETE);
 
-	if (!job->batch && (interconnect_postfini(job) < 0))
-		error("interconnect_postfini: %m");
     fail1:
 	job_delete_shm(job);
 	shm_fini();
@@ -1040,11 +1072,20 @@ _slurmd_job_log_init(slurmd_job_t *job)
 
 
 
+static void
+_setargs(slurmd_job_t *job)
+{	/* retitle this process: "slurmd jobid.stepid", or "slurmd [jobid]" */
+	if (job->stepid != NO_VAL)
+		setproctitle("%s %d.%d", "slurmd", job->jobid, job->stepid);
+	else
+		setproctitle("%s [%d]", "slurmd", job->jobid);
+}
 
 /*
  * Attempt to change the cmdline argument list for slurmd
  * to denote the job/job step that this process is managing.
  */
+#if 0
 static void
 _setargs(slurmd_job_t *job, char **argv, int argc)
 {
@@ -1071,5 +1112,6 @@ _setargs(slurmd_job_t *job, char **argv, int argc)
 	xfree(arg);
 	return;
 }
+#endif
 
 
diff --git a/src/slurmd/setproctitle.c b/src/slurmd/setproctitle.c
new file mode 100644
index 00000000000..da32c7ac6b7
--- /dev/null
+++ b/src/slurmd/setproctitle.c
@@ -0,0 +1,275 @@
+/*****************************************************************************\
+ * src/slurmd/setproctitle.c - argv manipulation 
+ * $Id$
+ *****************************************************************************
+ *  Copyright (C) 2002 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Mark Grondona <mgrondona@llnl.gov>.
+ *  UCRL-CODE-2002-040.
+ *  
+ *  This file is part of SLURM, a resource management program.
+ *  For details, see <http://www.llnl.gov/linux/slurm/>.
+ *  
+ *  SLURM is free software; you can redistribute it and/or modify it under
+ *  the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *  
+ *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ *  details.
+ *  
+ *  You should have received a copy of the GNU General Public License along
+ *  with SLURM; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
+\*****************************************************************************/
+
+/*
+ * Based on src/backend/utils/misc/pg_status.c from 
+ * PostgreSQL Database Management System
+ * 
+ * Portions Copyright (c) 1996-2001, The PostgreSQL Global Development Group
+ * 
+ * Portions Copyright (c) 1994, The Regents of the University of California
+ * 
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation for any purpose, without fee, and without a written agreement
+ * is hereby granted, provided that the above copyright notice and this
+ * paragraph and the following two paragraphs appear in all copies.
+ * 
+ * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
+ * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
+ * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
+ * DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ * 
+ * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
+ * ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO
+ * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+ */
+
+/*--------------------------------------------------------------------
+ * ps_status.c
+ *
+ * Routines to support changing the ps display of PostgreSQL backends
+ * to contain some useful information. Mechanism differs wildly across
+ * platforms.
+ *
+ * $Header: /var/cvs/openssh/openbsd-compat/setproctitle.c,v 1.5 2003/01/20 02:1
+ *
+ * Copyright 2000 by PostgreSQL Global Development Group
+ * various details abducted from various places
+ *--------------------------------------------------------------------
+ */
+
+#if HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#ifndef HAVE_SETPROCTITLE
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>		/* malloc, free */
+#include <string.h>
+#include <unistd.h>
+#ifdef HAVE_SYS_PSTAT_H
+#include <sys/pstat.h>		/* for HP-UX */
+#endif
+#ifdef HAVE_PS_STRINGS
+#include <machine/vmparam.h>	/* for old BSD */
+#include <sys/exec.h>
+#endif
+
+/*------
+ * Alternative ways of updating ps display:
+ *
+ * SETPROCTITLE_STRATEGY == PS_USE_PSTAT
+ *	   use the pstat(PSTAT_SETCMD, )
+ *	   (HPUX)
+ * SETPROCTITLE_STRATEGY == PS_USE_PS_STRINGS
+ *	   assign PS_STRINGS->ps_argvstr = "string"
+ *	   (some BSD systems)
+ * SETPROCTITLE_STRATEGY == PS_USE_CHANGE_ARGV
+ *	   assign argv[0] = "string"
+ *	   (some other BSD systems)
+ * SETPROCTITLE_STRATEGY == PS_USE_CLOBBER_ARGV
+ *	   write over the argv and environment area
+ *	   (most SysV-like systems)
+ * SETPROCTITLE_STRATEGY == PS_USE_NONE
+ *	   don't update ps display
+ *	   (Used when SETPROCTITLE_STRATEGY is not predefined; it is safest.)
+ */
+
+#define PS_USE_NONE			0
+#define PS_USE_PSTAT			1
+#define PS_USE_PS_STRINGS		2
+#define PS_USE_CHANGE_ARGV		3
+#define PS_USE_CLOBBER_ARGV		4
+
+#ifndef SETPROCTITLE_STRATEGY
+# define SETPROCTITLE_STRATEGY	PS_USE_NONE 
+#endif
+
+#ifndef SETPROCTITLE_PS_PADDING
+# define SETPROCTITLE_PS_PADDING	' '	/* fills the unused tail when clobbering */
+#endif
+#endif /* !HAVE_SETPROCTITLE */
+
+extern char **environ;		/* repointed by init_setproctitle() when clobbering */
+
+/*
+ * argv clobbering reuses the original argv space; other strategies use a buffer
+ */
+#if SETPROCTITLE_STRATEGY != PS_USE_CLOBBER_ARGV
+static char ps_buffer[256];
+static const size_t ps_buffer_size = sizeof(ps_buffer);
+#else
+static char *ps_buffer;			/* will point to argv area */
+static size_t ps_buffer_size;		/* space determined at run time */
+#endif /* SETPROCTITLE_STRATEGY != PS_USE_CLOBBER_ARGV */
+
+/* original argc/argv, stored by init_setproctitle(); NULL until it is called */
+static int	save_argc;
+static char **save_argv;
+
+extern char *__progname;	/* program name used as the title prefix */
+
+#ifndef HAVE_SETPROCTITLE
+/*
+ * Set the ps title to "<__progname>: <formatted fmt>", or to <__progname>
+ * alone when fmt is NULL.  Compiles to an empty body under PS_USE_NONE.
+ */
+void
+setproctitle(const char *fmt, ...)
+{
+#if SETPROCTITLE_STRATEGY == PS_USE_PSTAT
+	union pstun pst;
+#endif
+#if SETPROCTITLE_STRATEGY != PS_USE_NONE
+	ssize_t used;
+	va_list ap;
+
+	/* no ps display if you didn't call init_setproctitle() */
+	if (save_argv == NULL)
+		return;
+#if SETPROCTITLE_STRATEGY == PS_USE_CLOBBER_ARGV
+	/* here ps_buffer is a pointer that init_setproctitle() may leave NULL */
+	if (ps_buffer == NULL)
+		return;
+#endif /* PS_USE_CLOBBER_ARGV */
+
+	/*
+	 * Cut argv[] down to one entry; CHANGE_ARGV also repoints argv[0]
+	 */
+#if SETPROCTITLE_STRATEGY == PS_USE_CHANGE_ARGV
+	save_argv[0] = ps_buffer;
+	save_argv[1] = NULL;
+#endif /* PS_USE_CHANGE_ARGV */
+
+#if SETPROCTITLE_STRATEGY == PS_USE_CLOBBER_ARGV
+	save_argv[1] = NULL;
+#endif /* PS_USE_CLOBBER_ARGV */
+
+	/*
+	 * Format the "<progname>: " prefix, then the caller's text after it.
+	 */
+
+	va_start(ap, fmt);
+	if (fmt == NULL)
+		snprintf(ps_buffer, ps_buffer_size, "%s", __progname);
+	else {
+		used = snprintf(ps_buffer, ps_buffer_size, "%s: ", __progname);
+		if (used == -1 || used >= ps_buffer_size)
+			used = ps_buffer_size;	/* prefix filled it: no room for fmt */
+		vsnprintf(ps_buffer + used, ps_buffer_size - used, fmt, ap);
+	}
+	va_end(ap);
+
+#if SETPROCTITLE_STRATEGY == PS_USE_PSTAT
+	pst.pst_command = ps_buffer;
+	pstat(PSTAT_SETCMD, pst, strlen(ps_buffer), 0, 0);
+#endif   /* PS_USE_PSTAT */
+
+#if SETPROCTITLE_STRATEGY == PS_USE_PS_STRINGS
+	PS_STRINGS->ps_nargvstr = 1;
+	PS_STRINGS->ps_argvstr = ps_buffer;
+#endif   /* PS_USE_PS_STRINGS */
+
+#if SETPROCTITLE_STRATEGY == PS_USE_CLOBBER_ARGV
+	/* fill from the terminating NUL to the end of the area with padding */
+	used = strlen(ps_buffer);
+	memset(ps_buffer + used, SETPROCTITLE_PS_PADDING, 
+	    ps_buffer_size - used);
+#endif   /* PS_USE_CLOBBER_ARGV */
+
+#endif /* SETPROCTITLE_STRATEGY != PS_USE_NONE */
+}
+
+#endif /* HAVE_SETPROCTITLE */
+
+/*
+ * Call this early in startup to save the original argc/argv values.
+ *
+ * argv[] is not overwritten here, but may be overwritten later by
+ * setproctitle().  In the argv-clobbering build environ is replaced by a
+ * heap copy, so call this before any code that might hang onto a getenv()
+ * result.  If that copy cannot be allocated, environ is left untouched and
+ * ps_buffer stays NULL, which makes setproctitle() return without writing.
+ */
+void
+init_setproctitle(int argc, char *argv[])
+{
+#if defined(SETPROCTITLE_STRATEGY) && SETPROCTITLE_STRATEGY == PS_USE_CLOBBER_ARGV
+	char *end_of_area = NULL;
+	char **new_environ;
+	int i, nenv;
+#endif
+
+	save_argc = argc;
+	save_argv = argv;
+
+	/*
+	 * This file defines the strategy macros only when HAVE_SETPROCTITLE is
+	 * unset; a bare == test on undefined macros would evaluate as 0 == 0.
+	 */
+#if defined(SETPROCTITLE_STRATEGY) && SETPROCTITLE_STRATEGY == PS_USE_CLOBBER_ARGV
+	/* stay disabled until the environment has been moved out of the way */
+	ps_buffer = NULL;
+	ps_buffer_size = 0;
+	/* find the end of the contiguous run of argv strings */
+	for (i = 0; i < argc; i++) {
+		if (i == 0 || end_of_area + 1 == argv[i])
+			end_of_area = argv[i] + strlen(argv[i]);
+	}
+	if (end_of_area == NULL)	/* argc == 0: nothing to overwrite */
+		return;
+
+	/* extend the area over environ strings that directly follow it */
+	for (nenv = 0; environ[nenv] != NULL; nenv++) {
+		if (end_of_area + 1 == environ[nenv])
+			end_of_area = environ[nenv] + strlen(environ[nenv]);
+	}
+
+	/* duplicate the environment; on failure free the partial copy */
+	new_environ = malloc(sizeof(*new_environ) * (nenv + 1));
+	if (new_environ == NULL)
+		return;
+	for (i = 0; i < nenv; i++) {
+		if ((new_environ[i] = strdup(environ[i])) == NULL) {
+			while (i > 0)
+				free(new_environ[--i]);
+			free(new_environ);
+			return;
+		}
+	}
+	new_environ[nenv] = NULL;
+	environ = new_environ;
+
+	ps_buffer = argv[0];
+	ps_buffer_size = end_of_area - argv[0] - 1;
+#endif /* PS_USE_CLOBBER_ARGV */
+}
+
diff --git a/src/slurmd/setproctitle.h b/src/slurmd/setproctitle.h
new file mode 100644
index 00000000000..fe4d0f7c43e
--- /dev/null
+++ b/src/slurmd/setproctitle.h
@@ -0,0 +1,41 @@
+/*****************************************************************************\
+ * src/slurmd/setproctitle.h - Emulation of BSD setproctitle()
+ * $Id$
+ *****************************************************************************
+ *  Copyright (C) 2002 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Mark Grondona <mgrondona@llnl.gov>.
+ *  UCRL-CODE-2002-040.
+ *  
+ *  This file is part of SLURM, a resource management program.
+ *  For details, see <http://www.llnl.gov/linux/slurm/>.
+ *  
+ *  SLURM is free software; you can redistribute it and/or modify it under
+ *  the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *  
+ *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ *  details.
+ *  
+ *  You should have received a copy of the GNU General Public License along
+ *  with SLURM; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
+\*****************************************************************************/
+
+#ifndef _BSD_SETPROCTITLE_H
+#define _BSD_SETPROCTITLE_H
+
+#if HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+/* Defined unconditionally in setproctitle.c, so declare it unconditionally. */
+void init_setproctitle(int argc, char *argv[]);
+#ifndef HAVE_SETPROCTITLE
+void setproctitle(const char *fmt, ...);	/* emulation in setproctitle.c */
+#endif
+#endif /* _BSD_SETPROCTITLE_H */
+
diff --git a/src/slurmd/slurmd.c b/src/slurmd/slurmd.c
index 5ea4be860c0..9748eaf1ac8 100644
--- a/src/slurmd/slurmd.c
+++ b/src/slurmd/slurmd.c
@@ -58,6 +58,7 @@
 #include "src/slurmd/slurmd.h"
 #include "src/slurmd/req.h"
 #include "src/slurmd/shm.h"
+#include "src/slurmd/setproctitle.h"
 #include "src/slurmd/get_mach_stat.h"
 
 #define GETOPT_ARGS	"L:f:Dvhc"
@@ -130,6 +131,8 @@ main (int argc, char *argv[])
 	conf->argv = &argv;
 	conf->argc = &argc;
 
+	init_setproctitle(argc, argv);
+
 	log_init(argv[0], conf->log_opts, LOG_DAEMON, conf->logfile);
 
 	/* 
@@ -155,6 +158,9 @@ main (int argc, char *argv[])
 
 	_kill_old_slurmd();
 
+	if (interconnect_node_init() < 0)
+		fatal("Unable to initialize interconnect.");
+
 	_create_msg_socket();
 
 	conf->pid = getpid();
@@ -182,8 +188,9 @@ main (int argc, char *argv[])
 
 	_wait_for_all_threads();
 
-	_slurmd_fini();
+	interconnect_node_fini();
 
+	_slurmd_fini();
 
 	return 0;
 }
@@ -785,7 +792,8 @@ _usage()
 			"\tPrint this help message.\n");
 }
 
-/* create spool directory as needed and "cd" to it 
+/* 
+ * create spool directory as needed and "cd" to it 
  */
 static int
 _set_slurmd_spooldir(void)
-- 
GitLab