From cf519f6252aaaec21355de1e9365bd69d3d6e55a Mon Sep 17 00:00:00 2001 From: Mark Grondona <mgrondona@llnl.gov> Date: Wed, 11 Jun 2003 18:16:04 +0000 Subject: [PATCH] o use setproctitle() to attempt process title change in slurmd. o add calls to interconnect_node_init/fini() in slurmd. o move interconnect_postfini() above wait_for_io() --- src/slurmd/Makefile.am | 31 ++--- src/slurmd/mgr.c | 52 ++++++- src/slurmd/setproctitle.c | 275 ++++++++++++++++++++++++++++++++++++++ src/slurmd/setproctitle.h | 41 ++++++ src/slurmd/slurmd.c | 12 +- 5 files changed, 389 insertions(+), 22 deletions(-) create mode 100644 src/slurmd/setproctitle.c create mode 100644 src/slurmd/setproctitle.h diff --git a/src/slurmd/Makefile.am b/src/slurmd/Makefile.am index f84f534fe18..5a42ad5db44 100644 --- a/src/slurmd/Makefile.am +++ b/src/slurmd/Makefile.am @@ -22,21 +22,22 @@ slurmd_LDADD = \ $(SSL_LIBS) -common_sources = \ - slurmd.c slurmd.h \ - req.c req.h \ - mgr.c mgr.h \ - smgr.c smgr.h \ - get_mach_stat.c \ - get_mach_stat.h \ - read_proc.c \ - job.c job.h \ - io.c io.h \ - semaphore.c semaphore.h \ - shm.c shm.h \ - fname.c fname.h \ - ulimits.c ulimits.h \ - setenvpf.c setenvpf.h \ +common_sources = \ + slurmd.c slurmd.h \ + req.c req.h \ + mgr.c mgr.h \ + smgr.c smgr.h \ + get_mach_stat.c \ + get_mach_stat.h \ + read_proc.c \ + job.c job.h \ + io.c io.h \ + semaphore.c semaphore.h \ + shm.c shm.h \ + fname.c fname.h \ + ulimits.c ulimits.h \ + setenvpf.c setenvpf.h \ + setproctitle.c setproctitle.h \ interconnect.h slurmd_SOURCES = $(common_sources) $(interconnect_sources) diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c index 31bc1e3e63e..ed4a32f9107 100644 --- a/src/slurmd/mgr.c +++ b/src/slurmd/mgr.c @@ -62,6 +62,7 @@ #include "src/slurmd/slurmd.h" #include "src/slurmd/setenvpf.h" +#include "src/slurmd/setproctitle.h" #include "src/slurmd/smgr.h" #include "src/slurmd/io.h" #include "src/slurmd/shm.h" @@ -112,7 +113,9 @@ static int _send_exit_msg(slurmd_job_t *job, int tid[], int 
n, int status);
 static void _set_unexited_task_status(slurmd_job_t *job, int status);
 static int _send_pending_exit_msgs(slurmd_job_t *job);
-static void _setargs(slurmd_job_t *job, char **argv, int argc);
+static void _setargs(slurmd_job_t *job);
+
+static void _random_sleep(slurmd_job_t *job);
 
 /*
  * Batch job mangement prototypes:
@@ -140,7 +143,7 @@ mgr_launch_tasks(launch_tasks_request_msg_t *msg, slurm_addr *cli)
 
 	_set_job_log_prefix(job);
 
-	_setargs(job, *conf->argv, *conf->argc);
+	_setargs(job);
 
 	if (_job_mgr(job) < 0)
 		return SLURM_ERROR;
@@ -164,7 +167,7 @@ mgr_launch_batch_job(batch_job_launch_msg_t *msg, slurm_addr *cli)
 
 	_set_job_log_prefix(job);
 
-	_setargs(job, *conf->argv, *conf->argc);
+	_setargs(job);
 
 	if ((batchdir = _make_batch_dir(job)) == NULL)
 		goto cleanup1;
@@ -308,6 +311,18 @@ _setup_io(slurmd_job_t *job)
 
 	return SLURM_SUCCESS;
 }
+/* Sleep 0..3*nnodes ms; the (jobid + nodeid) seed fixes the delay per node. */
+static void
+_random_sleep(slurmd_job_t *job)
+{
+	long int delay = 0;
+	long int max   = (3 * job->nnodes);
+
+	srand48((long int) (job->jobid + job->nodeid));
+	delay = lrand48() % ( max + 1 );
+	debug3("delaying %ldms", delay);	/* delay is long: %ld, not %d */
+	poll(NULL, 0, delay);		/* no fds: the timeout is a ms sleep */
+}
 
 /*
  * Send task exit message for n tasks. tid is the list of _global_
@@ -329,6 +344,13 @@ _send_exit_msg(slurmd_job_t *job, int tid[], int n, int status)
 	resp.data = &msg;
 	resp.msg_type = MESSAGE_TASK_EXIT;
 
+	/*
+	 * XXX Hack for TCP timeouts on exit of large, synchronized
+	 * jobs. Delay a random amount if job->nnodes > 100
+	 */
+	if (job->nnodes > 100)
+		_random_sleep(job);
+
 	/*
 	 * XXX: Should srun_list be associated with each task?
 	 */
@@ -418,14 +440,24 @@ _job_mgr(slurmd_job_t *job)
 	job_update_state(job, SLURMD_JOB_ENDING);
 
     fail2:
+
+	/*
+	 * First call interconnect_postfini() - In at least one case,
+	 * this will clean up any straggling processes. If this call
+	 * is moved behind wait_for_io(), we may block waiting for IO
+	 * on a hung process.
+	 */
+	if (!job->batch && (interconnect_postfini(job) < 0))
+		error("interconnect_postfini: %m");
+
 	/*
 	 * Wait for io thread to complete
 	 */
 	_wait_for_io(job);
+
+
 	job_update_state(job, SLURMD_JOB_COMPLETE);
 
-	if (!job->batch && (interconnect_postfini(job) < 0))
-		error("interconnect_postfini: %m");
     fail1:
 	job_delete_shm(job);
 	shm_fini();
@@ -1040,11 +1072,20 @@ _slurmd_job_log_init(slurmd_job_t *job)
 
 
 
+static void
+_setargs(slurmd_job_t *job)
+{
+	if (job->stepid != NO_VAL)	/* step id set: show "jobid.stepid" */
+		setproctitle("%s %d.%d", "slurmd", job->jobid, job->stepid);
+	else				/* no step id: show "[jobid]" */
+		setproctitle("%s [%d]", "slurmd", job->jobid);
+}
 
 /*
  * Attempt to change the cmdline argument list for slurmd
  * to denote the job/job step that this process is managing.
  */
+#if 0
 static void
 _setargs(slurmd_job_t *job, char **argv, int argc)
 {
@@ -1071,5 +1112,6 @@ _setargs(slurmd_job_t *job, char **argv, int argc)
 	xfree(arg);
 	return;
 }
+#endif
 
 
diff --git a/src/slurmd/setproctitle.c b/src/slurmd/setproctitle.c
new file mode 100644
index 00000000000..da32c7ac6b7
--- /dev/null
+++ b/src/slurmd/setproctitle.c
@@ -0,0 +1,275 @@
+/*****************************************************************************\
+ * src/slurmd/setproctitle.c - argv manipulation
+ * $Id$
+ *****************************************************************************
+ * Copyright (C) 2002 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Mark Grondona <mgrondona@llnl.gov>.
+ * UCRL-CODE-2002-040.
+ *
+ * This file is part of SLURM, a resource management program.
+ * For details, see <http://www.llnl.gov/linux/slurm/>.
+ *
+ * SLURM is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +\*****************************************************************************/ + +/* + * Based on src/backend/utils/misc/pg_status.c from + * PostgreSQL Database Management System + * + * Portions Copyright (c) 1996-2001, The PostgreSQL Global Development Group + * + * Portions Copyright (c) 1994, The Regents of the University of California + * + * Permission to use, copy, modify, and distribute this software and its + * documentation for any purpose, without fee, and without a written agreement + * is hereby granted, provided that the above copyright notice and this + * paragraph and the following two paragraphs appear in all copies. + * + * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR + * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING + * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS + * DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS + * ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO + * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
+ */
+
+/*--------------------------------------------------------------------
+ * ps_status.c
+ *
+ * Routines to support changing the ps display of PostgreSQL backends
+ * to contain some useful information. Mechanism differs wildly across
+ * platforms.
+ *
+ * $Header: /var/cvs/openssh/openbsd-compat/setproctitle.c,v 1.5 2003/01/20 02:1
+ *
+ * Copyright 2000 by PostgreSQL Global Development Group
+ * various details abducted from various places
+ *--------------------------------------------------------------------
+ */
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#ifdef HAVE_SYS_PSTAT_H
+#include <sys/pstat.h> /* for HP-UX */
+#endif
+#ifdef HAVE_PS_STRINGS
+#include <machine/vmparam.h> /* for old BSD */
+#include <sys/exec.h>
+#endif
+
+/*------
+ * Alternative ways of updating ps display:
+ *
+ * SETPROCTITLE_STRATEGY == PS_USE_PSTAT
+ *	use the pstat(PSTAT_SETCMD, )
+ *	(HPUX)
+ * SETPROCTITLE_STRATEGY == PS_USE_PS_STRINGS
+ *	assign PS_STRINGS->ps_argvstr = "string"
+ *	(some BSD systems)
+ * SETPROCTITLE_STRATEGY == PS_USE_CHANGE_ARGV
+ *	assign argv[0] = "string"
+ *	(some other BSD systems)
+ * SETPROCTITLE_STRATEGY == PS_USE_CLOBBER_ARGV
+ *	write over the argv and environment area
+ *	(most SysV-like systems)
+ * SETPROCTITLE_STRATEGY == PS_USE_NONE
+ *	don't update ps display
+ *	(This is the default, as it is safest.)
+ */
+
+#define PS_USE_NONE		0
+#define PS_USE_PSTAT		1
+#define PS_USE_PS_STRINGS	2
+#define PS_USE_CHANGE_ARGV	3
+#define PS_USE_CLOBBER_ARGV	4
+
+#ifndef SETPROCTITLE_STRATEGY
+# define SETPROCTITLE_STRATEGY	PS_USE_NONE
+#endif
+
+#ifndef SETPROCTITLE_PS_PADDING
+# define SETPROCTITLE_PS_PADDING	' '
+#endif
+
+extern char **environ;
+
+/*
+ * argv clobbering uses existing argv space, all other methods need a buffer
+ */
+#if SETPROCTITLE_STRATEGY != PS_USE_CLOBBER_ARGV
+static char ps_buffer[256];
+static const size_t ps_buffer_size = sizeof(ps_buffer);
+#else
+static char *ps_buffer;		/* will point to argv area */
+static size_t ps_buffer_size;	/* space determined at run time */
+#endif
+
+/* save the original argv[] location here */
+static int save_argc;
+static char **save_argv;
+
+extern char *__progname;
+
+#ifndef HAVE_SETPROCTITLE
+/*
+ * Set the title shown by ps(1) to "<progname>: " followed by the
+ * printf-style expansion of fmt, or to just "<progname>" when fmt
+ * is NULL.  The title is silently truncated to the available space.
+ * Does nothing if init_setproctitle() has not been called or if
+ * SETPROCTITLE_STRATEGY is PS_USE_NONE.
+ */
+void
+setproctitle(const char *fmt, ...)
+{
+#if SETPROCTITLE_STRATEGY == PS_USE_PSTAT
+	union pstun pst;
+#endif
+#if SETPROCTITLE_STRATEGY != PS_USE_NONE
+	ssize_t used;
+	va_list ap;
+
+	/* no ps display if you didn't call init_setproctitle() */
+	if (save_argv == NULL)
+		return;
+#if SETPROCTITLE_STRATEGY == PS_USE_CLOBBER_ARGV
+	/* If ps_buffer is a pointer, it might still be null */
+	if (ps_buffer == NULL)
+		return;
+#endif /* PS_USE_CLOBBER_ARGV */
+
+	/* Overwrite argv[] to point at appropriate space, if needed */
+#if SETPROCTITLE_STRATEGY == PS_USE_CHANGE_ARGV
+	save_argv[0] = ps_buffer;
+	save_argv[1] = NULL;
+#endif /* PS_USE_CHANGE_ARGV */
+
+#if SETPROCTITLE_STRATEGY == PS_USE_CLOBBER_ARGV
+	save_argv[1] = NULL;
+#endif /* PS_USE_CLOBBER_ARGV */
+
+	/* Make fixed prefix of ps display, then append the caller's text */
+
+	va_start(ap, fmt);
+	if (fmt == NULL)
+		snprintf(ps_buffer, ps_buffer_size, "%s", __progname);
+	else {
+		used = snprintf(ps_buffer, ps_buffer_size, "%s: ", __progname);
+		if (used < 0 || (size_t) used >= ps_buffer_size)
+			used = ps_buffer_size;
+		vsnprintf(ps_buffer + used, ps_buffer_size - used, fmt, ap);
+	}
+	va_end(ap);
+
+#if SETPROCTITLE_STRATEGY == PS_USE_PSTAT
+	pst.pst_command = ps_buffer;
+	pstat(PSTAT_SETCMD, pst, strlen(ps_buffer), 0, 0);
+#endif /* PS_USE_PSTAT */
+
+#if SETPROCTITLE_STRATEGY == PS_USE_PS_STRINGS
+	PS_STRINGS->ps_nargvstr = 1;
+	PS_STRINGS->ps_argvstr = ps_buffer;
+#endif /* PS_USE_PS_STRINGS */
+
+#if SETPROCTITLE_STRATEGY == PS_USE_CLOBBER_ARGV
+	/* pad unused memory */
+	used = strlen(ps_buffer);
+	memset(ps_buffer + used, SETPROCTITLE_PS_PADDING,
+	       ps_buffer_size - used);
+#endif /* PS_USE_CLOBBER_ARGV */
+
+#endif /* PS_USE_NONE */
+}
+
+#endif /* HAVE_SETPROCTITLE */
+
+/*
+ * Call this early in startup to save the original argc/argv values.
+ *
+ * argv[] will not be overwritten by this routine, but may be overwritten
+ * during setproctitle. Also, the physical location of the environment
+ * strings may be moved, so this should be called before any code that
+ * might try to hang onto a getenv() result.
+ */
+void
+init_setproctitle(int argc, char *argv[])
+{
+#if SETPROCTITLE_STRATEGY == PS_USE_CLOBBER_ARGV
+	char *end_of_area = NULL;
+	char **new_environ;
+	int i;
+#endif
+
+	save_argc = argc;
+	save_argv = argv;
+
+#if SETPROCTITLE_STRATEGY == PS_USE_CLOBBER_ARGV
+	/*
+	 * If we're going to overwrite the argv area, count the available
+	 * space. Also move the environment to make additional room.
+	 * ps_buffer stays NULL (setproctitle() does nothing) on any failure.
+	 */
+	ps_buffer = NULL;
+	ps_buffer_size = 0;
+
+	/* check for contiguous argv strings */
+	for (i = 0; i < argc; i++) {
+		if (i == 0 || end_of_area + 1 == argv[i])
+			end_of_area = argv[i] + strlen(argv[i]);
+	}
+	if (end_of_area == NULL)	/* probably can't happen? */
+		return;
+
+	/* check for contiguous environ strings following argv */
+	for (i = 0; environ[i] != NULL; i++) {
+		if (end_of_area + 1 == environ[i])
+			end_of_area = environ[i] + strlen(environ[i]);
+	}
+	/* under 2 bytes, the usable size below would be 0 or wrap around */
+	if (end_of_area - argv[0] < 2)
+		return;
+
+	/* Duplicate and move the environment out of the way */
+	new_environ = malloc(sizeof(char *) * (i + 1));
+	if (new_environ == NULL)
+		return;	/* environ not moved: clobbering would destroy it */
+	for (i = 0; environ[i] != NULL; i++) {
+		if ((new_environ[i] = strdup(environ[i])) == NULL) {
+			while (i > 0)
+				free(new_environ[--i]);
+			free(new_environ);
+			return;
+		}
+	}
+	new_environ[i] = NULL;
+	environ = new_environ;
+
+	ps_buffer = argv[0];
+	ps_buffer_size = end_of_area - argv[0] - 1;
+#endif /* PS_USE_CLOBBER_ARGV */
+}
+
diff --git a/src/slurmd/setproctitle.h b/src/slurmd/setproctitle.h
new file mode 100644
index 00000000000..fe4d0f7c43e
--- /dev/null
+++ b/src/slurmd/setproctitle.h
@@ -0,0 +1,41 @@
+/*****************************************************************************\
+ * src/slurmd/setproctitle.h - Emulation of BSD setproctitle()
+ * $Id$
+ *****************************************************************************
+ * Copyright (C) 2002 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Mark Grondona <mgrondona@llnl.gov>.
+ * UCRL-CODE-2002-040.
+ *
+ * This file is part of SLURM, a resource management program.
+ * For details, see <http://www.llnl.gov/linux/slurm/>.
+ *
+ * SLURM is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with SLURM; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+\*****************************************************************************/
+
+#ifndef _BSD_SETPROCTITLE_H
+#define _BSD_SETPROCTITLE_H
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#ifndef HAVE_SETPROCTITLE
+void setproctitle(const char *fmt, ...);
+#endif
+/* Unguarded: defined in setproctitle.c and called by main() in all builds */
+void init_setproctitle(int argc, char *argv[]);
+
+#endif /* _BSD_SETPROCTITLE_H */
diff --git a/src/slurmd/slurmd.c b/src/slurmd/slurmd.c
index 5ea4be860c0..9748eaf1ac8 100644
--- a/src/slurmd/slurmd.c
+++ b/src/slurmd/slurmd.c
@@ -58,6 +58,7 @@
 #include "src/slurmd/slurmd.h"
 #include "src/slurmd/req.h"
 #include "src/slurmd/shm.h"
+#include "src/slurmd/setproctitle.h"
 #include "src/slurmd/get_mach_stat.h"
 
 #define GETOPT_ARGS "L:f:Dvhc"
@@ -130,6 +131,8 @@ main (int argc, char *argv[])
 	conf->argv = &argv;
 	conf->argc = &argc;
 
+	init_setproctitle(argc, argv);
+
 	log_init(argv[0], conf->log_opts, LOG_DAEMON, conf->logfile);
 
 	/*
@@ -155,6 +158,9 @@ main (int argc, char *argv[])
 
 	_kill_old_slurmd();
 
+	if (interconnect_node_init() < 0)
+		fatal("Unable to initialize interconnect.");
+
 	_create_msg_socket();
 
 	conf->pid = getpid();
@@ -182,8 +188,9 @@ main (int argc, char *argv[])
 
 	_wait_for_all_threads();
 
-	_slurmd_fini();
+	interconnect_node_fini();
 
+	_slurmd_fini();
 	return 0;
 }
 
@@ -785,7 +792,8 @@ _usage()
 			"\tPrint this help message.\n");
 }
 
-/* create spool directory as needed and "cd" to it
+/*
+ * create spool directory as needed and "cd" to it
  */
 static int
 _set_slurmd_spooldir(void)
-- 
GitLab