diff --git a/configure.ac b/configure.ac index a0c10159e3b31de6b2386760f7bf33e8ad3bb593..03b9ab09c5a8df6945119f4a531e023d59ac6e42 100644 --- a/configure.ac +++ b/configure.ac @@ -214,6 +214,7 @@ AC_CONFIG_FILES([Makefile src/plugins/proctrack/aix/Makefile src/plugins/proctrack/pgid/Makefile src/plugins/proctrack/linuxproc/Makefile + src/plugins/proctrack/rms/Makefile src/plugins/sched/Makefile src/plugins/sched/backfill/Makefile src/plugins/sched/builtin/Makefile diff --git a/src/plugins/proctrack/Makefile.am b/src/plugins/proctrack/Makefile.am index 8945c1faf94fd91b159cafef176a4be837d6f0cc..738cdc2cf9f844bc3550b8b12665089716bef1ec 100644 --- a/src/plugins/proctrack/Makefile.am +++ b/src/plugins/proctrack/Makefile.am @@ -2,7 +2,7 @@ # Makefile for proctrack plugins if HAVE_AIX_PROCTRACK -SUBDIRS = aix pgid linuxproc +SUBDIRS = aix pgid linuxproc rms else -SUBDIRS = pgid linuxproc +SUBDIRS = pgid linuxproc rms endif diff --git a/src/plugins/proctrack/aix/proctrack_aix.c b/src/plugins/proctrack/aix/proctrack_aix.c index fa3777e66ab9ec73141f999e7890b3c7652a467c..11d49667a5a25ce6299f6ef83a64ffb3b48d5572 100644 --- a/src/plugins/proctrack/aix/proctrack_aix.c +++ b/src/plugins/proctrack/aix/proctrack_aix.c @@ -106,11 +106,16 @@ extern int fini ( void ) return SLURM_SUCCESS; } +extern int slurm_container_create ( slurmd_job_t *job ) +{ + return SLURM_SUCCESS; +} + /* * Uses job step process group id as a unique identifier. Job id * and step id are not unique by themselves. 
*/ -extern uint32_t slurm_container_create ( slurmd_job_t *job ) +extern int slurm_container_add ( slurmd_job_t *job, pid_t pid ) { int pgid = (int) job->pgid; int i; @@ -118,19 +123,12 @@ extern uint32_t slurm_container_create ( slurmd_job_t *job ) xassert(job); xassert(pgid > 1); - for (i = 0; i < job->ntasks; i++) { - if (proctrack_job_reg_pid(&pgid, &job->task[i]->pid) != 0) { - error("proctrack_job_reg(%d): %m", pgid); - return (uint32_t) 0; - } + if (proctrack_job_reg_pid(&pgid, &pid) != 0) { + error("proctrack_job_reg_pid(%d, %d): %m", pgid, (int)pid); + return SLURM_ERROR; } - return (uint32_t) pgid; - -} -extern int slurm_container_add ( uint32_t id, pid_t pid ) -{ - debug("slurm_container_add not supported"); + job->cont_id = (uint32_t)pgid; return SLURM_SUCCESS; } diff --git a/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c b/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c index f737f86bc0d1337e066c1d84a1334ad75bd10b73..5807609b24c89cab7401fb6bd8ec9b928523746d 100644 --- a/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c +++ b/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c @@ -92,12 +92,13 @@ extern int fini ( void ) /* * Uses slurmd job-step manager's pid as the unique container id. 
*/ -extern uint32_t slurm_container_create ( slurmd_job_t *job ) +extern int slurm_container_create ( slurmd_job_t *job ) { - return (uint32_t) job->jmgr_pid; + job->cont_id = (uint32_t)job->jmgr_pid; + return SLURM_SUCCESS; } -extern int slurm_container_add ( uint32_t id, pid_t pid ) +extern int slurm_container_add ( slurmd_job_t *job, pid_t pid ) { return SLURM_SUCCESS; } diff --git a/src/plugins/proctrack/pgid/proctrack_pgid.c b/src/plugins/proctrack/pgid/proctrack_pgid.c index e0c2f73fb982143f445c1dd36ca8c1c99ca9c5bc..c522e0c4b182050c7ee17c22326a034814a633bb 100644 --- a/src/plugins/proctrack/pgid/proctrack_pgid.c +++ b/src/plugins/proctrack/pgid/proctrack_pgid.c @@ -91,16 +91,17 @@ extern int fini ( void ) return SLURM_SUCCESS; } -/* - * Uses job step process group id. - */ -extern uint32_t slurm_container_create ( slurmd_job_t *job ) +extern int slurm_container_create ( slurmd_job_t *job ) { - return (uint32_t) job->pgid; + return SLURM_SUCCESS; } -extern int slurm_container_add ( uint32_t id, pid_t pid ) +/* + * Uses job step process group id. 
+ */ +extern int slurm_container_add ( slurmd_job_t *job, pid_t pid ) +{ + job->cont_id = (uint32_t)job->pgid; return SLURM_SUCCESS; } @@ -124,8 +125,7 @@ extern int slurm_container_destroy ( uint32_t id ) return SLURM_SUCCESS; } -extern uint32_t -slurm_container_find(pid_t pid) +extern uint32_t slurm_container_find(pid_t pid) { pid_t rc = getpgid(pid); diff --git a/src/plugins/proctrack/rms/Makefile.am b/src/plugins/proctrack/rms/Makefile.am new file mode 100644 index 0000000000000000000000000000000000000000..829c7a40704ba57d0ebc964aedfedb5c0d9ce37c --- /dev/null +++ b/src/plugins/proctrack/rms/Makefile.am @@ -0,0 +1,18 @@ +# $Id: Makefile.am,v 1.3 2005/05/26 22:04:39 morrone Exp $ +# Makefile for proctrack/rms plugin + +AUTOMAKE_OPTIONS = foreign + +PLUGIN_FLAGS = -module -avoid-version --export-dynamic + +INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common + +if HAVE_ELAN +pkglib_LTLIBRARIES = proctrack_rms.la +else +pkglib_LTLIBRARIES = +endif + +proctrack_rms_la_SOURCES = proctrack_rms.c +proctrack_rms_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) +proctrack_rms_la_LIBADD = -lrmscall diff --git a/src/plugins/proctrack/rms/proctrack_rms.c b/src/plugins/proctrack/rms/proctrack_rms.c new file mode 100644 index 0000000000000000000000000000000000000000..729314a44ee28e29a39dbb914cdc6ffbf5ab5ecd --- /dev/null +++ b/src/plugins/proctrack/rms/proctrack_rms.c @@ -0,0 +1,297 @@ +/*****************************************************************************\ + * proctrack_rms.c - process tracking via QsNet rms kernel module + ***************************************************************************** + * Copyright (C) 2005 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * UCRL-CODE-2002-040. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. 
+ * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +\*****************************************************************************/ +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#if HAVE_STDINT_H +# include <stdint.h> +#endif +#if HAVE_INTTYPES_H +# include <inttypes.h> +#endif + +#include <rms/rmscall.h> + +#include <sys/types.h> +#include <signal.h> +#include <stdlib.h> +#include <unistd.h> +#include <slurm/slurm.h> +#include <slurm/slurm_errno.h> +#include "src/common/log.h" +#include "src/slurmd/proctrack.h" + +const char plugin_name[] = "Process tracking for QsNet via the rms module"; +const char plugin_type[] = "proctrack/rms"; +const uint32_t plugin_version = 1; + +static int _prg_destructor_fork(void); +static void _prg_destructor_send(int fd, int prgid); + +#define MAX_IDS 512 + +extern int init (void) +{ + /* close librmscall's internal fd to /proc/rms/control */ + pthread_atfork(NULL, NULL, rmsmod_fini); + return SLURM_SUCCESS; +} + +extern int fini (void) +{ + return SLURM_SUCCESS; +} + + +/* + * When proctrack/rms is used in conjunction with switch/elan, + * slurm_container_create will not normally create the program description. + * It just retrieves the prgid created in switch/elan. 
+ * + * When the program description cannot be retrieved (switch/elan is not + * being used, the job step is a batch script, etc.) then rms_prgcreate() + * is called here. + */ +extern int slurm_container_create (slurmd_job_t *job) +{ + int prgid; + /* + * Return a handle to an existing prgid or create a new one + */ + if (rms_getprgid (job->jmgr_pid, &prgid) < 0) { + int fd = _prg_destructor_fork(); + /* Use slurmd job-step manager's pid as a unique identifier */ + prgid = job->jmgr_pid; + if ((rms_prgcreate (prgid, job->uid, 1)) < 0) { + error ("proctrack/rms: rms_prgcreate: %m"); + _prg_destructor_send(fd, -1); + return SLURM_ERROR; + } + _prg_destructor_send(fd, prgid); + } + debug3("proctrack/rms: prgid = %d", prgid); + + job->cont_id = (uint32_t)prgid; + return SLURM_SUCCESS; +} + +extern int slurm_container_add (slurmd_job_t *job, pid_t pid) +{ + return SLURM_SUCCESS; +} + +/* + * slurm_container_signal assumes that the slurmd jobstep manager + * is always the last process in the rms program description. + * No signals are sent to the last process. + */ +extern int slurm_container_signal (uint32_t id, int signal) +{ + pid_t *pids; + int nids = 0; + int i; + int rc; + int ids[MAX_IDS]; + bool cont_exists = false; + + if (id <= 0) + return -1; + + pids = malloc(MAX_IDS * sizeof(pid_t)); + if (!pids) { + error("proctrack/rms container signal: malloc failed: %m"); + return -1; + } + if ((rc = rms_prginfo((int)id, MAX_IDS, pids, &nids)) < 0) { + error("proctrack/rms rms_prginfo failed %d: %m", rc); + free(pids); + /* + * Ignore errors, program desc has probably already + * been cleaned up. 
+ */ + return -1; + } + + rc = -1; + for (i = nids-2; i >= 0 ; i--) { + debug3("proctrack/rms(pid %d) Sending signal %d to process %d", + getpid(), signal, pids[i]); + rc &= kill(pids[i], signal); + debug("rc = %d", rc); + } + free(pids); + debug3("proctrack/rms signal container returning %d", rc); + return rc; +} + + +/* + * The switch/elan plugin is really responsible for creating and + * destroying rms program descriptions. slurm_container_destroy simply + * returns SLURM_SUCCESS when the program description contains one and + * only one process, assumed to be the slurmd jobstep manager. + */ +extern int slurm_container_destroy (uint32_t id) +{ + pid_t pids[8]; + int nids = 0; + int i; + + debug2("proctrack/rms: destroying container %u\n", id); + if (id == 0) + return SLURM_SUCCESS; + + if (slurm_container_signal(id, 0) == -1) + return SLURM_SUCCESS; + + return SLURM_ERROR; +} + +extern uint32_t slurm_container_find (pid_t pid) +{ + int prgid = 0; + + if (rms_getprgid ((int) pid, &prgid) < 0) + return (uint32_t) 0; + return (uint32_t) prgid; +} + + +static void +_close_all_fd_except(int fd) +{ + int openmax; + int i; + + openmax = sysconf(_SC_OPEN_MAX); + for (i = 0; i <= openmax; i++) { + if (i != fd) + close(i); + } +} + + +/* + * Fork a child process that waits for a pipe to close, signalling that the + * parent process has exited. Then call rms_prgdestroy. + * + * Returns the write end of the pipe on success, or -1 if the pipe or the + * child process could not be created. 
+ */ +static int +_prg_destructor_fork() +{ + pid_t pid; + int fdpair[2]; + int prgid; + int i; + int dummy; + + if (pipe(fdpair) < 0) { + error("_prg_destructor_fork: failed creating pipe"); + return -1; + } + + pid = fork(); + if (pid < 0) { + error("_prg_destructor_fork: failed to fork program destructor"); + /* no child exists: must not fall into the child-only code below */ + close(fdpair[0]); + close(fdpair[1]); + return -1; + } else if (pid > 0) { + /* parent */ + close(fdpair[0]); + return fdpair[1]; + } + + /****************************************/ + /* child */ + close(fdpair[1]); + + /* close librmscall's internal fd to /proc/rms/control */ + rmsmod_fini(); + + _close_all_fd_except(fdpair[0]); + /* Wait for the program description id from the parent */ + if (read(fdpair[0], &prgid, sizeof(prgid)) != sizeof(prgid)) { + error("_prg_destructor_fork read failed: %m"); + exit(1); + } + + if (prgid == -1) + exit(1); + + /* + * Wait for the pipe to close, signalling that the parent + * has exited. + */ + while (read(fdpair[0], &dummy, sizeof(dummy)) > 0) {} + + /* + * Verify that program description is empty. If not, send a SIGKILL. + */ + for (i = 0; i < 30; i++) { + int maxids = 8; + pid_t pids[8]; + int nids = 0; + + if (rms_prginfo(prgid, maxids, pids, &nids) < 0) { + error("_prg_destructor_fork: rms_prginfo: %m"); + } + if (nids == 0) + break; + if (rms_prgsignal(prgid, SIGKILL) < 0) { + error("_prg_destructor_fork: rms_prgsignal: %m"); + } + sleep(1); + } + + if (rms_prgdestroy(prgid) < 0) { + error("rms_prgdestroy"); + } + exit(0); +} + + + +/* + * Send the prgid of the newly created program description to the process + * forked earlier by _prg_destructor_fork(), using the file descriptor + * "fd" which was returned by the call to _prg_destructor_fork(). 
+ */ +static void +_prg_destructor_send(int fd, int prgid) +{ + debug3("_prg_destructor_send %d", prgid); + if (write (fd, &prgid, sizeof(prgid)) != sizeof(prgid)) { + error ("_prg_destructor_send failed: %m"); + } + /* Deliberately avoid closing fd. 
When this process exits, it + will close fd signalling to the child process that it is + time to call rms_prgdestroy */ + /*close(fd);*/ +} diff --git a/src/plugins/switch/elan/qsw.c b/src/plugins/switch/elan/qsw.c index 9b726914971660cc6de35efbd7a20808e1ea1281..44eaf73fc58a8ad454d02c07ddafaf60308284bf 100644 --- a/src/plugins/switch/elan/qsw.c +++ b/src/plugins/switch/elan/qsw.c @@ -949,7 +949,6 @@ _prg_destructor_fork() int i; int dummy; -/* debug3("Entering _prg_destructor_fork"); */ if (pipe(fdpair) < 0) { error("switch/elan: failed creating pipe"); return -1; @@ -968,6 +967,9 @@ _prg_destructor_fork() /* child */ close(fdpair[1]); + /* close librmscall's internal fd to /proc/rms/control */ + rmsmod_fini(); + _close_all_fd_except(fdpair[0]); /* Wait for the program description id from the child */ if (read(fdpair[0], &prgid, sizeof(prgid)) != sizeof(prgid)) { @@ -975,11 +977,8 @@ _prg_destructor_fork() exit(1); } -/* debug3("_prg_destructor program is %d, waiting on process %d", */ -/* prgid, pid); */ - if (prgid == -1) - exit(-1); + exit(1); /* * Wait for the pipe to close, signalling that the parent @@ -1000,15 +999,12 @@ _prg_destructor_fork() } if (nids == 0) break; -/* error("_prg_destructor_fork program desc is not empty %d", */ -/* nids); */ if (rms_prgsignal(prgid, SIGKILL) < 0) { error("switch/elan: rms_prgsignal: %m"); } sleep(1); } -/* debug3("_prg_desctrutor attempting to call rms_prgdestroy"); */ if (rms_prgdestroy(prgid) < 0) { error("rms_prgdestroy"); } diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c index e3cab987fce04d41b91baa3888798f18116085ff..241f4260bb38eaf661b3d3ab337e7547dad18cd1 100644 --- a/src/slurmd/mgr.c +++ b/src/slurmd/mgr.c @@ -615,6 +615,11 @@ _fork_all_tasks(slurmd_job_t *job) xassert(job != NULL); + if (slurm_container_create(job) == SLURM_ERROR) { + error("slurm_container_create: %m"); + exit(3); + } + /* * Pre-allocate a pipe for each of the tasks */ @@ -666,7 +671,8 @@ _fork_all_tasks(slurmd_job_t *job) exec_task(job, i, 
readfds[i]); } - /* Parent continues: + /* + * Parent continues: */ close(readfds[i]); verbose ("task %lu (%lu) started %M", @@ -683,6 +689,11 @@ _fork_all_tasks(slurmd_job_t *job) error ("Unable to put task %d (pid %ld) into pgrp %ld", i, pid, job->pgid); + if (slurm_container_add(job, pid) == SLURM_ERROR) { + error("slurm_container_add: %m"); + exit(3); + } + task.id = i; task.global_id = job->task[i]->gtid; task.pid = job->task[i]->pid; @@ -696,12 +707,7 @@ _fork_all_tasks(slurmd_job_t *job) * will wait for our signal before calling exec. */ shm_update_step_pgid(job->jobid, job->stepid, job->pgid); - cont_id = slurm_container_create(job); - if (cont_id == 0) { - error("slurm_container_create: %m"); - exit(3); - } - shm_update_step_cont_id(job->jobid, job->stepid, cont_id); + shm_update_step_cont_id(job->jobid, job->stepid, job->cont_id); /* * Now it's ok to unblock the tasks, so they may call exec. diff --git a/src/slurmd/proctrack.c b/src/slurmd/proctrack.c index a7daa31efd625e0194f15220920dc3f910beeafd..d07aaafcd075b69ce78fce9ace60b38cad78521b 100644 --- a/src/slurmd/proctrack.c +++ b/src/slurmd/proctrack.c @@ -37,8 +37,8 @@ /* TAG( slurm_proctrack_ops_t ) */ /* ************************************************************************ */ typedef struct slurm_proctrack_ops { - uint32_t (*create) ( slurmd_job_t *job ); - int (*add) ( uint32_t id, pid_t pid ); + int (*create) ( slurmd_job_t *job ); + int (*add) ( slurmd_job_t *job, pid_t pid ); int (*signal) ( uint32_t id, int signal ); int (*destroy) ( uint32_t id ); uint32_t (*find_cont) ( pid_t pid ); @@ -218,11 +218,13 @@ slurm_proctrack_fini( void ) /* * Create a container - * job_id IN - SLURM job ID + * job IN - slurmd_job_t structure + * job->cont_id OUT - Plugin must fill in job->cont_id either here + * or in slurm_container_add() * - * Returns container ID or zero on error + * Returns a SLURM errno. 
*/ -extern uint32_t +extern int slurm_container_create(slurmd_job_t *job) { if ( slurm_proctrack_init() < 0 ) @@ -233,18 +235,20 @@ slurm_container_create(slurmd_job_t *job) /* * Add a process to the specified container - * cont_id IN - container ID as returned by slurm_container_create() + * job IN - slurmd_job_t structure * pid IN - process ID to be added to the container + * job->cont_id OUT - Plugin must fill in job->cont_id either here + * or in slurm_container_create() * * Returns a SLURM errno. */ extern int -slurm_container_add(uint32_t cont_id, pid_t pid) +slurm_container_add(slurmd_job_t *job, pid_t pid) { if ( slurm_proctrack_init() < 0 ) return SLURM_ERROR; - return (*(g_proctrack_context->ops.add))( cont_id , pid ); + return (*(g_proctrack_context->ops.add))( job , pid ); } /* diff --git a/src/slurmd/proctrack.h b/src/slurmd/proctrack.h index f104cf067d0842ff38d613db82680fb939967aaa..c0e652f91711a7f851c23f49a9c9be5d7281ae07 100644 --- a/src/slurmd/proctrack.h +++ b/src/slurmd/proctrack.h @@ -28,6 +28,7 @@ #define __PROC_TRACK_H__ #include <slurm/slurm.h> +#include "src/slurmd/slurmd_job.h" /* * Initialize the process tracking plugin. @@ -51,20 +52,24 @@ extern int slurm_proctrack_fini(void); /* * Create a container - * job_id IN - SLURM job ID + * job IN - slurmd_job_t structure + * job->cont_id OUT - Plugin must fill in job->cont_id either here + * or in slurm_container_add() * - * Returns container ID or zero on error + * Returns a SLURM errno. */ -extern uint32_t slurm_container_create(slurmd_job_t *job); +extern int slurm_container_create(slurmd_job_t *job); /* * Add a process to the specified container - * cont_id IN - container ID as returned by slurm_container_create() + * job IN - slurmd_job_t structure * pid IN - process ID to be added to the container + * job->cont_id OUT - Plugin must fill in job->cont_id either here + * or in slurm_container_create() * * Returns a SLURM errno. 
*/ -extern int slurm_container_add(uint32_t cont_id, pid_t pid); +extern int slurm_container_add(slurmd_job_t *job, pid_t pid); /* * Signal all processes within a container diff --git a/src/slurmd/slurmd_job.h b/src/slurmd/slurmd_job.h index 7272a34991bfe54a5c7346d15e2a3d66819f1a66..277f444d3abecce17d6ff43ac632d58e6e6df8f5 100644 --- a/src/slurmd/slurmd_job.h +++ b/src/slurmd/slurmd_job.h @@ -133,6 +133,7 @@ typedef struct slurmd_job { uint16_t task_flags; env_t *envtp; + uint32_t cont_id; } slurmd_job_t;