From b2ec589808d52876e71f8d45982e00d21c59fc12 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@llnl.gov> Date: Fri, 17 Nov 2006 17:10:30 +0000 Subject: [PATCH] merged global_srun.* into srun proper --- NEWS | 1 + src/common/Makefile.am | 2 - src/common/global_srun.c | 188 ---------------------------- src/common/global_srun.h | 160 ----------------------- src/common/mpi.h | 4 +- src/plugins/mpi/mpichgm/Makefile.am | 2 +- src/plugins/mpi/mpichgm/mpichgm.c | 4 +- src/plugins/mpi/mvapich/Makefile.am | 2 +- src/plugins/mpi/mvapich/mvapich.c | 2 +- src/srun/fname.h | 4 +- src/srun/msg.c | 2 +- src/srun/signals.c | 7 +- src/srun/signals.h | 2 +- src/srun/srun_job.c | 124 +++++++++++++++++- src/srun/srun_job.h | 119 +++++++++++++++++- 15 files changed, 254 insertions(+), 369 deletions(-) delete mode 100644 src/common/global_srun.c delete mode 100644 src/common/global_srun.h diff --git a/NEWS b/NEWS index 958115ac905..6fb8a4db4a2 100644 --- a/NEWS +++ b/NEWS @@ -10,6 +10,7 @@ documents those changes that are of interest to users and admins. by Don Albert, Bull). -- Fixed bug with aix not looking in the correct dir for the proctrack include files + -- Removed global_srun.* from common merged it into srun proper * Changes in SLURM 1.2.0-pre6 ============================= diff --git a/src/common/Makefile.am b/src/common/Makefile.am index 77ef3470f43..26f6ab8f326 100644 --- a/src/common/Makefile.am +++ b/src/common/Makefile.am @@ -24,8 +24,6 @@ libcommon_la_SOURCES = \ xstring.c xstring.h \ xsignal.c xsignal.h \ forward.c forward.h \ - global_srun.c \ - global_srun.h \ strlcpy.c strlcpy.h \ list.c list.h \ net.c net.h \ diff --git a/src/common/global_srun.c b/src/common/global_srun.c deleted file mode 100644 index 8270e30953e..00000000000 --- a/src/common/global_srun.c +++ /dev/null @@ -1,188 +0,0 @@ -/*****************************************************************************\ - * src/common/global_srun.c - functions needed by more than just srun - ***************************************************************************** - * Copyright (C) 2002 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Mark Grondona <mgrondona@llnl.gov>, and - * Morris Jette <jette1@llnl.gov> - * UCRL-CODE-217948. - * - * This file is part of SLURM, a resource management program. - * For details, see <http://www.llnl.gov/linux/slurm/>. - * - * SLURM is free software; you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under - * certain conditions as described in each individual source file, and - * distribute linked combinations including the two. You must obey the GNU - * General Public License in all respects for all of the code used other than - * OpenSSL. If you modify file(s) with this exception, you may extend this - * exception to your version of the file(s), but you are not obligated to do - * so. If you do not wish to do so, delete this exception statement from your - * version. If you delete this exception statement from all source files in - * the program, then also delete it here. - * - * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along - * with SLURM; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -\*****************************************************************************/ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#if HAVE_PTHREAD -#include <pthread.h> -#endif - -#include <signal.h> -#include <string.h> - -#include <slurm/slurm_errno.h> -#include <stdlib.h> - -#include "src/common/log.h" -#include "src/common/macros.h" -#include "src/common/slurm_protocol_api.h" -#include "src/common/slurm_protocol_defs.h" -#include "src/common/xmalloc.h" -#include "src/common/xstring.h" -#include "src/common/xsignal.h" -#include "src/common/forward.h" -#include "src/common/global_srun.h" - -typedef enum {DSH_NEW, DSH_ACTIVE, DSH_DONE, DSH_FAILED} state_t; - -typedef struct thd { - pthread_t thread; /* thread ID */ - pthread_attr_t attr; /* thread attributes */ - state_t state; /* thread state */ -} thd_t; - -int message_thread = 0; - -void -fwd_signal(srun_job_t *job, int signo, int max_threads) -{ - int i; - slurm_msg_t req; - kill_tasks_msg_t msg; - static pthread_mutex_t sig_mutex = PTHREAD_MUTEX_INITIALIZER; - pipe_enum_t pipe_enum = PIPE_SIGNALED; - hostlist_t hl; - char *name = NULL; - char buf[8192]; - List ret_list = NULL; - ListIterator itr; - ret_data_info_t *ret_data_info = NULL; - int rc = SLURM_SUCCESS; - - slurm_mutex_lock(&sig_mutex); - - if (signo == SIGKILL || signo == SIGINT || signo == SIGTERM) { - slurm_mutex_lock(&job->state_mutex); - job->signaled = true; - slurm_mutex_unlock(&job->state_mutex); - if(message_thread) { - write(job->forked_msg->par_msg->msg_pipe[1], - &pipe_enum,sizeof(int)); - write(job->forked_msg->par_msg->msg_pipe[1], - &job->signaled,sizeof(int)); - } - } - - debug2("forward signal %d to job", signo); - - /* common to all tasks */ - msg.job_id = job->jobid; - msg.job_step_id = job->stepid; - msg.signal = (uint32_t) signo; - - hl = hostlist_create(""); - for (i = 0; i < job->nhosts; i++) { - if (job->host_state[i] != SRUN_HOST_REPLIED) { - name = nodelist_nth_host( - job->step_layout->node_list, i); - debug2("%s has not yet replied\n", name); - free(name); - continue; - } - if (job_active_tasks_on_host(job, i) == 0) - continue; - name = nodelist_nth_host(job->step_layout->node_list, i); - hostlist_push(hl, name); - free(name); - } - if(!hostlist_count(hl)) { - hostlist_destroy(hl); - goto nothing_left; - } - hostlist_ranged_string(hl, sizeof(buf), buf); - hostlist_destroy(hl); - name = xstrdup(buf); - - slurm_msg_t_init(&req); - req.msg_type = REQUEST_SIGNAL_TASKS; - req.data = &msg; - - debug3("sending signal to host %s", name); - - if (!(ret_list = slurm_send_recv_msgs(name, &req, 0))) { - error("fwd_signal: slurm_send_recv_msgs really failed bad"); - xfree(name); - slurm_mutex_unlock(&sig_mutex); - return; - } - xfree(name); - itr = list_iterator_create(ret_list); - while((ret_data_info = list_next(itr))) { - rc = slurm_get_return_code(ret_data_info->type, - ret_data_info->data); - /* - * Report error unless it is "Invalid job id" which - * probably just means the tasks exited in the meanwhile. - */ - if ((rc != 0) && (rc != ESLURM_INVALID_JOB_ID) - && (rc != ESLURMD_JOB_NOTRUNNING) && (rc != ESRCH)) { - error("%s: signal: %s", - ret_data_info->node_name, - slurm_strerror(rc)); - destroy_data_info(ret_data_info); - } - } - list_iterator_destroy(itr); - list_destroy(ret_list); -nothing_left: - debug2("All tasks have been signalled"); - - slurm_mutex_unlock(&sig_mutex); -} - -int -job_active_tasks_on_host(srun_job_t *job, int hostid) -{ - int i; - int retval = 0; - - slurm_mutex_lock(&job->task_mutex); - for (i = 0; i < job->step_layout->tasks[hostid]; i++) { - uint32_t *tids = job->step_layout->tids[hostid]; - xassert(tids != NULL); - debug("Task %d state: %d", tids[i], job->task_state[tids[i]]); - if (job->task_state[tids[i]] == SRUN_TASK_RUNNING) - retval++; - } - slurm_mutex_unlock(&job->task_mutex); - return retval; -} - - diff --git a/src/common/global_srun.h b/src/common/global_srun.h deleted file mode 100644 index bc6eba242c3..00000000000 --- a/src/common/global_srun.h +++ /dev/null @@ -1,160 +0,0 @@ -/*****************************************************************************\ - * src/common/global_srun.c - functions needed by more than just srun - ***************************************************************************** - * Copyright (C) 2002-2006 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Mark Grodnona <mgrondona@llnl.gov>. - * UCRL-CODE-217948. - * - * This file is part of SLURM, a resource management program. - * For details, see <http://www.llnl.gov/linux/slurm/>. - * - * SLURM is free software; you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under - * certain conditions as described in each individual source file, and - * distribute linked combinations including the two. You must obey the GNU - * General Public License in all respects for all of the code used other than - * OpenSSL. If you modify file(s) with this exception, you may extend this - * exception to your version of the file(s), but you are not obligated to do - * so. If you do not wish to do so, delete this exception statement from your - * version. If you delete this exception statement from all source files in - * the program, then also delete it here. - * - * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along - * with SLURM; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -\*****************************************************************************/ - -#ifndef _GLOBAL_SRUN_H -#define _GLOBAL_SRUN_H - -#include <slurm/slurm.h> -#include "src/common/slurm_protocol_common.h" -#include "src/api/step_io.h" - -typedef enum { - SRUN_JOB_INIT = 0, /* Job's initial state */ - SRUN_JOB_LAUNCHING, /* Launch thread is running */ - SRUN_JOB_STARTING, /* Launch thread is complete */ - SRUN_JOB_RUNNING, /* Launch thread complete */ - SRUN_JOB_TERMINATING, /* Once first task terminates */ - SRUN_JOB_TERMINATED, /* All tasks terminated (may have IO) */ - SRUN_JOB_WAITING_ON_IO, /* All tasks terminated; waiting for IO */ - SRUN_JOB_DONE, /* tasks and IO complete */ - SRUN_JOB_DETACHED, /* Detached IO from job (Not used now) */ - SRUN_JOB_FAILED, /* Job failed for some reason */ - SRUN_JOB_CANCELLED, /* CTRL-C cancelled */ - SRUN_JOB_FORCETERM /* Forced termination of IO thread */ -} srun_job_state_t; - -typedef enum { - SRUN_HOST_INIT = 0, - SRUN_HOST_CONTACTED, - SRUN_HOST_UNREACHABLE, - SRUN_HOST_REPLIED -} srun_host_state_t; - -typedef enum { - SRUN_TASK_INIT = 0, - SRUN_TASK_RUNNING, - SRUN_TASK_FAILED, - SRUN_TASK_IO_WAIT, /* this state deprecated with new eio stdio engine */ - SRUN_TASK_EXITED, - SRUN_TASK_ABNORMAL_EXIT -} srun_task_state_t; - -typedef enum { - PIPE_NONE = 0, - PIPE_JOB_STATE, - PIPE_TASK_STATE, - PIPE_TASK_EXITCODE, - PIPE_HOST_STATE, - PIPE_SIGNALED, - PIPE_MPIR_DEBUG_STATE, - PIPE_UPDATE_MPIR_PROCTABLE, - PIPE_UPDATE_STEP_LAYOUT, - PIPE_NODE_FAIL -} pipe_enum_t; - -/* For Message thread */ -typedef struct forked_msg_pipe { - int msg_pipe[2]; - int pid; -} forked_msg_pipe_t; - -typedef struct forked_message { - forked_msg_pipe_t * par_msg; - forked_msg_pipe_t * msg_par; - enum job_states * job_state; -} forked_msg_t; - -typedef struct io_filename io_filename_t; - -typedef struct srun_job { - slurm_step_layout_t *step_layout; /* holds info about how the task is - laid out */ - uint32_t jobid; /* assigned job id */ - uint32_t stepid; /* assigned step id */ - bool old_job; /* run job step under previous allocation */ - bool removed; /* job has been removed from SLURM */ - - uint32_t nhosts; /* node count */ - uint32_t ntasks; /* task count */ - srun_job_state_t state; /* job state */ - pthread_mutex_t state_mutex; - pthread_cond_t state_cond; - - bool signaled; /* True if user generated signal to job */ - int rc; /* srun return code */ - - slurm_cred_t cred; /* Slurm job credential */ - char *nodelist; /* nodelist in string form */ - - pthread_t sigid; /* signals thread tid */ - - pthread_t jtid; /* job control thread id */ - slurm_fd *jfd; /* job control info fd */ - - pthread_t lid; /* launch thread id */ - - client_io_t *client_io; - time_t ltimeout; /* Time by which all tasks must be running */ - time_t etimeout; /* exit timeout (see opt.max_wait */ - - srun_host_state_t *host_state; /* nhost host states */ - - int *tstatus; /* ntask exit statii */ - srun_task_state_t *task_state; /* ntask task states */ - - switch_jobinfo_t switch_job; - io_filename_t *ifname; - io_filename_t *ofname; - io_filename_t *efname; - forked_msg_t *forked_msg; - char *task_epilog; /* task-epilog */ - char *task_prolog; /* task-prolog */ - pthread_mutex_t task_mutex; - int njfds; /* number of job control info fds */ - slurm_addr *jaddr; /* job control info ports */ - int thr_count; /* count of threads in job launch */ - - /* Output streams and stdin fileno */ - select_jobinfo_t select_jobinfo; - -} srun_job_t; - - -void fwd_signal(srun_job_t *job, int signal, int max_threads); -int job_active_tasks_on_host(srun_job_t *job, int hostid); - -#endif /* !_GLOBAL_SRUN_H */ diff --git a/src/common/mpi.h b/src/common/mpi.h index 854170baa03..5243c2d077a 100644 --- a/src/common/mpi.h +++ b/src/common/mpi.h @@ -15,7 +15,7 @@ * any later version. * * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than @@ -42,7 +42,7 @@ # include "config.h" #endif -#include "src/common/global_srun.h" +#include "src/srun/srun_job.h" #include "src/slurmd/slurmstepd/slurmstepd_job.h" typedef struct slurm_mpi_context *slurm_mpi_context_t; diff --git a/src/plugins/mpi/mpichgm/Makefile.am b/src/plugins/mpi/mpichgm/Makefile.am index a91adb0aeb4..06317f6f21d 100644 --- a/src/plugins/mpi/mpichgm/Makefile.am +++ b/src/plugins/mpi/mpichgm/Makefile.am @@ -10,6 +10,6 @@ pkglib_LTLIBRARIES = mpi_mpichgm.la # Null switch plugin. mpi_mpichgm_la_SOURCES = mpi_mpichgm.c mpichgm.c mpichgm.h\ - $(top_srcdir)/src/common/global_srun.c \ + $(top_srcdir)/src/srun/srun_job.c \ $(top_srcdir)/src/common/net.c mpi_mpichgm_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) diff --git a/src/plugins/mpi/mpichgm/mpichgm.c b/src/plugins/mpi/mpichgm/mpichgm.c index 15276b5ca02..f9ea791bc52 100644 --- a/src/plugins/mpi/mpichgm/mpichgm.c +++ b/src/plugins/mpi/mpichgm/mpichgm.c @@ -16,7 +16,7 @@ * any later version. * * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than @@ -54,7 +54,7 @@ #include "src/common/xmalloc.h" #include "src/common/xstring.h" #include "src/common/net.h" -#include "src/common/global_srun.h" +#include "src/srun/srun_job.h" #include "src/srun/opt.h" #include "src/plugins/mpi/mpichgm/mpichgm.h" diff --git a/src/plugins/mpi/mvapich/Makefile.am b/src/plugins/mpi/mvapich/Makefile.am index a918f231b18..08ada9e55a2 100644 --- a/src/plugins/mpi/mvapich/Makefile.am +++ b/src/plugins/mpi/mvapich/Makefile.am @@ -11,6 +11,6 @@ pkglib_LTLIBRARIES = mpi_mvapich.la # Null switch plugin. mpi_mvapich_la_SOURCES = mpi_mvapich.c mvapich.c mvapich.h\ $(top_srcdir)/src/common/net.c \ - $(top_srcdir)/src/common/global_srun.c + $(top_srcdir)/src/srun/srun_job.c mpi_mvapich_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) diff --git a/src/plugins/mpi/mvapich/mvapich.c b/src/plugins/mpi/mvapich/mvapich.c index 3b455fb3638..7ec2c92bc08 100644 --- a/src/plugins/mpi/mvapich/mvapich.c +++ b/src/plugins/mpi/mvapich/mvapich.c @@ -56,7 +56,7 @@ #include "src/common/xstring.h" #include "src/common/net.h" #include "src/common/fd.h" -#include "src/common/global_srun.h" +#include "src/srun/srun_job.h" #include "src/srun/opt.h" /* NOTE: MVAPICH has changed protocols without changing version numbers. diff --git a/src/srun/fname.h b/src/srun/fname.h index cd3b939f607..a8f919a7d99 100644 --- a/src/srun/fname.h +++ b/src/srun/fname.h @@ -15,7 +15,7 @@ * any later version. * * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than @@ -42,7 +42,7 @@ # include "config.h" #endif -#include "src/common/global_srun.h" +#include "src/srun/srun_job.h" enum io_t { IO_ALL = 0, /* multiplex output from all/bcast stdin to all */ diff --git a/src/srun/msg.c b/src/srun/msg.c index 61d08f01f9c..c08d41d9f8e 100644 --- a/src/srun/msg.c +++ b/src/srun/msg.c @@ -67,7 +67,6 @@ #include "src/common/xmalloc.h" #include "src/common/mpi.h" #include "src/common/forward.h" -#include "src/common/global_srun.h" #include "src/api/pmi_server.h" #include "src/srun/srun_job.h" @@ -77,6 +76,7 @@ #include "src/srun/attach.h" #include "src/srun/allocate.h" #include "src/srun/multi_prog.h" +#include "src/srun/signals.h" #include "src/common/xstring.h" diff --git a/src/srun/signals.c b/src/srun/signals.c index 89c6884c73f..cc796cf9d0e 100644 --- a/src/srun/signals.c +++ b/src/srun/signals.c @@ -16,7 +16,7 @@ * any later version. * * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than @@ -54,12 +54,12 @@ #include "src/common/slurm_protocol_api.h" #include "src/common/slurm_protocol_defs.h" #include "src/common/xmalloc.h" +#include "src/common/xstring.h" #include "src/common/xsignal.h" -#include "src/common/global_srun.h" #include "src/srun/opt.h" #include "src/srun/srun_job.h" -#include "src/srun/opt.h" +#include "src/srun/signals.h" #define MAX_RETRIES 3 @@ -128,7 +128,6 @@ sig_thr_create(srun_job_t *job) } - static void _sigterm_handler(int signum) { diff --git a/src/srun/signals.h b/src/srun/signals.h index 14f6412d557..aa2472409dc 100644 --- a/src/srun/signals.h +++ b/src/srun/signals.h @@ -15,7 +15,7 @@ * any later version. * * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than diff --git a/src/srun/srun_job.c b/src/srun/srun_job.c index a2656288f9b..210d0f82a78 100644 --- a/src/srun/srun_job.c +++ b/src/srun/srun_job.c @@ -60,7 +60,7 @@ #include "src/common/xmalloc.h" #include "src/common/xstring.h" #include "src/common/io_hdr.h" -#include "src/common/global_srun.h" +#include "src/common/forward.h" #include "src/srun/srun_job.h" #include "src/srun/opt.h" @@ -68,6 +68,7 @@ #include "src/srun/attach.h" #include "src/srun/msg.h" +typedef enum {DSH_NEW, DSH_ACTIVE, DSH_DONE, DSH_FAILED} state_t; /* * allocation information structure used to store general information @@ -84,6 +85,13 @@ typedef struct allocation_info { select_jobinfo_t select_jobinfo; } allocation_info_t; +typedef struct thd { + pthread_t thread; /* thread ID */ + pthread_attr_t attr; /* thread attributes */ + state_t state; /* thread state */ +} thd_t; + +int message_thread = 0; /* * Prototypes: */ @@ -596,6 +604,120 @@ report_task_status(srun_job_t *job) } +void +fwd_signal(srun_job_t *job, int signo, int max_threads) +{ + int i; + slurm_msg_t req; + kill_tasks_msg_t msg; + static pthread_mutex_t sig_mutex = PTHREAD_MUTEX_INITIALIZER; + pipe_enum_t pipe_enum = PIPE_SIGNALED; + hostlist_t hl; + char *name = NULL; + char buf[8192]; + List ret_list = NULL; + ListIterator itr; + ret_data_info_t *ret_data_info = NULL; + int rc = SLURM_SUCCESS; + + slurm_mutex_lock(&sig_mutex); + + if (signo == SIGKILL || signo == SIGINT || signo == SIGTERM) { + slurm_mutex_lock(&job->state_mutex); + job->signaled = true; + slurm_mutex_unlock(&job->state_mutex); + if(message_thread) { + write(job->forked_msg->par_msg->msg_pipe[1], + &pipe_enum,sizeof(int)); + write(job->forked_msg->par_msg->msg_pipe[1], + &job->signaled,sizeof(int)); + } + } + + debug2("forward signal %d to job", signo); + + /* common to all tasks */ + msg.job_id = job->jobid; + msg.job_step_id = job->stepid; + msg.signal = (uint32_t) signo; + + hl = hostlist_create(""); + for (i = 0; i < job->nhosts; i++) { + if (job->host_state[i] != SRUN_HOST_REPLIED) { + name = nodelist_nth_host( + job->step_layout->node_list, i); + debug2("%s has not yet replied\n", name); + free(name); + continue; + } + if (job_active_tasks_on_host(job, i) == 0) + continue; + name = nodelist_nth_host(job->step_layout->node_list, i); + hostlist_push(hl, name); + free(name); + } + if(!hostlist_count(hl)) { + hostlist_destroy(hl); + goto nothing_left; + } + hostlist_ranged_string(hl, sizeof(buf), buf); + hostlist_destroy(hl); + name = xstrdup(buf); + + slurm_msg_t_init(&req); + req.msg_type = REQUEST_SIGNAL_TASKS; + req.data = &msg; + + debug3("sending signal to host %s", name); + + if (!(ret_list = slurm_send_recv_msgs(name, &req, 0))) { + error("fwd_signal: slurm_send_recv_msgs really failed bad"); + xfree(name); + slurm_mutex_unlock(&sig_mutex); + return; + } + xfree(name); + itr = list_iterator_create(ret_list); + while((ret_data_info = list_next(itr))) { + rc = slurm_get_return_code(ret_data_info->type, + ret_data_info->data); + /* + * Report error unless it is "Invalid job id" which + * probably just means the tasks exited in the meanwhile. + */ + if ((rc != 0) && (rc != ESLURM_INVALID_JOB_ID) + && (rc != ESLURMD_JOB_NOTRUNNING) && (rc != ESRCH)) { + error("%s: signal: %s", + ret_data_info->node_name, + slurm_strerror(rc)); + destroy_data_info(ret_data_info); + } + } + list_iterator_destroy(itr); + list_destroy(ret_list); +nothing_left: + debug2("All tasks have been signalled"); + + slurm_mutex_unlock(&sig_mutex); +} + +int +job_active_tasks_on_host(srun_job_t *job, int hostid) +{ + int i; + int retval = 0; + + slurm_mutex_lock(&job->task_mutex); + for (i = 0; i < job->step_layout->tasks[hostid]; i++) { + uint32_t *tids = job->step_layout->tids[hostid]; + xassert(tids != NULL); + debug("Task %d state: %d", tids[i], job->task_state[tids[i]]); + if (job->task_state[tids[i]] == SRUN_TASK_RUNNING) + retval++; + } + slurm_mutex_unlock(&job->task_mutex); + return retval; +} static inline int _estimate_nports(int nclients, int cli_per_port) diff --git a/src/srun/srun_job.h b/src/srun/srun_job.h index 7d1e993ef4a..ed2db1ffb26 100644 --- a/src/srun/srun_job.h +++ b/src/srun/srun_job.h @@ -16,7 +16,7 @@ * any later version. * * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than @@ -51,10 +51,120 @@ #include "src/common/macros.h" #include "src/common/node_select.h" #include "src/common/slurm_protocol_defs.h" -#include "src/common/global_srun.h" #include "src/api/step_io.h" -#include "src/srun/fname.h" +//#include "src/srun/fname.h" + +typedef enum { + SRUN_JOB_INIT = 0, /* Job's initial state */ + SRUN_JOB_LAUNCHING, /* Launch thread is running */ + SRUN_JOB_STARTING, /* Launch thread is complete */ + SRUN_JOB_RUNNING, /* Launch thread complete */ + SRUN_JOB_TERMINATING, /* Once first task terminates */ + SRUN_JOB_TERMINATED, /* All tasks terminated (may have IO) */ + SRUN_JOB_WAITING_ON_IO, /* All tasks terminated; waiting for IO */ + SRUN_JOB_DONE, /* tasks and IO complete */ + SRUN_JOB_DETACHED, /* Detached IO from job (Not used now) */ + SRUN_JOB_FAILED, /* Job failed for some reason */ + SRUN_JOB_CANCELLED, /* CTRL-C cancelled */ + SRUN_JOB_FORCETERM /* Forced termination of IO thread */ +} srun_job_state_t; + +typedef enum { + SRUN_HOST_INIT = 0, + SRUN_HOST_CONTACTED, + SRUN_HOST_UNREACHABLE, + SRUN_HOST_REPLIED +} srun_host_state_t; + +typedef enum { + SRUN_TASK_INIT = 0, + SRUN_TASK_RUNNING, + SRUN_TASK_FAILED, + SRUN_TASK_IO_WAIT,/* this state deprecated with new eio stdio engine */ + SRUN_TASK_EXITED, + SRUN_TASK_ABNORMAL_EXIT +} srun_task_state_t; + +typedef enum { + PIPE_NONE = 0, + PIPE_JOB_STATE, + PIPE_TASK_STATE, + PIPE_TASK_EXITCODE, + PIPE_HOST_STATE, + PIPE_SIGNALED, + PIPE_MPIR_DEBUG_STATE, + PIPE_UPDATE_MPIR_PROCTABLE, + PIPE_UPDATE_STEP_LAYOUT, + PIPE_NODE_FAIL +} pipe_enum_t; + +/* For Message thread */ +typedef struct forked_msg_pipe { + int msg_pipe[2]; + int pid; +} forked_msg_pipe_t; + +typedef struct forked_message { + forked_msg_pipe_t * par_msg; + forked_msg_pipe_t * msg_par; + enum job_states * job_state; +} forked_msg_t; + +typedef struct io_filename io_filename_t; + +typedef struct srun_job { + slurm_step_layout_t *step_layout; /* holds info about how the task is + laid out */ + uint32_t jobid; /* assigned job id */ + uint32_t stepid; /* assigned step id */ + bool old_job; /* run job step under previous allocation */ + bool removed; /* job has been removed from SLURM */ + + uint32_t nhosts; /* node count */ + uint32_t ntasks; /* task count */ + srun_job_state_t state; /* job state */ + pthread_mutex_t state_mutex; + pthread_cond_t state_cond; + + bool signaled; /* True if user generated signal to job */ + int rc; /* srun return code */ + + slurm_cred_t cred; /* Slurm job credential */ + char *nodelist; /* nodelist in string form */ + + pthread_t sigid; /* signals thread tid */ + + pthread_t jtid; /* job control thread id */ + slurm_fd *jfd; /* job control info fd */ + + pthread_t lid; /* launch thread id */ + + client_io_t *client_io; + time_t ltimeout; /* Time by which all tasks must be running */ + time_t etimeout; /* exit timeout (see opt.max_wait */ + + srun_host_state_t *host_state; /* nhost host states */ + + int *tstatus; /* ntask exit statii */ + srun_task_state_t *task_state; /* ntask task states */ + + switch_jobinfo_t switch_job; + io_filename_t *ifname; + io_filename_t *ofname; + io_filename_t *efname; + forked_msg_t *forked_msg; + char *task_epilog; /* task-epilog */ + char *task_prolog; /* task-prolog */ + pthread_mutex_t task_mutex; + int njfds; /* number of job control info fds */ + slurm_addr *jaddr; /* job control info ports */ + int thr_count; /* count of threads in job launch */ + + /* Output streams and stdin fileno */ + select_jobinfo_t select_jobinfo; + +} srun_job_t; extern int message_thread; @@ -105,4 +215,7 @@ void report_job_status(srun_job_t *job); */ int job_rc(srun_job_t *job); +void fwd_signal(srun_job_t *job, int signal, int max_threads); +int job_active_tasks_on_host(srun_job_t *job, int hostid); + #endif /* !_HAVE_JOB_H */ -- GitLab