From b2ec589808d52876e71f8d45982e00d21c59fc12 Mon Sep 17 00:00:00 2001
From: Danny Auble <da@llnl.gov>
Date: Fri, 17 Nov 2006 17:10:30 +0000
Subject: [PATCH] merged global_srun.* into srun proper

---
 NEWS                                |   1 +
 src/common/Makefile.am              |   2 -
 src/common/global_srun.c            | 188 ----------------------------
 src/common/global_srun.h            | 160 -----------------------
 src/common/mpi.h                    |   4 +-
 src/plugins/mpi/mpichgm/Makefile.am |   2 +-
 src/plugins/mpi/mpichgm/mpichgm.c   |   4 +-
 src/plugins/mpi/mvapich/Makefile.am |   2 +-
 src/plugins/mpi/mvapich/mvapich.c   |   2 +-
 src/srun/fname.h                    |   4 +-
 src/srun/msg.c                      |   2 +-
 src/srun/signals.c                  |   7 +-
 src/srun/signals.h                  |   2 +-
 src/srun/srun_job.c                 | 124 +++++++++++++++++-
 src/srun/srun_job.h                 | 119 +++++++++++++++++-
 15 files changed, 254 insertions(+), 369 deletions(-)
 delete mode 100644 src/common/global_srun.c
 delete mode 100644 src/common/global_srun.h

diff --git a/NEWS b/NEWS
index 958115ac905..6fb8a4db4a2 100644
--- a/NEWS
+++ b/NEWS
@@ -10,6 +10,7 @@ documents those changes that are of interest to users and admins.
     by Don Albert, Bull).
  -- Fixed bug with aix not looking in the correct dir for the proctrack
     include files
+ -- Removed global_srun.* from common merged it into srun proper
 
 * Changes in SLURM 1.2.0-pre6
 =============================
diff --git a/src/common/Makefile.am b/src/common/Makefile.am
index 77ef3470f43..26f6ab8f326 100644
--- a/src/common/Makefile.am
+++ b/src/common/Makefile.am
@@ -24,8 +24,6 @@ libcommon_la_SOURCES = 			\
 	xstring.c xstring.h		\
 	xsignal.c xsignal.h		\
 	forward.c forward.h     	\
-	global_srun.c                   \
-	global_srun.h                   \
 	strlcpy.c strlcpy.h		\
 	list.c list.h 			\
 	net.c net.h                     \
diff --git a/src/common/global_srun.c b/src/common/global_srun.c
deleted file mode 100644
index 8270e30953e..00000000000
--- a/src/common/global_srun.c
+++ /dev/null
@@ -1,188 +0,0 @@
-/*****************************************************************************\
- * src/common/global_srun.c - functions needed by more than just srun
- *****************************************************************************
- *  Copyright (C) 2002 The Regents of the University of California.
- *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- *  Written by Mark Grondona <mgrondona@llnl.gov>, and
- *             Morris Jette  <jette1@llnl.gov>
- *  UCRL-CODE-217948.
- *  
- *  This file is part of SLURM, a resource management program.
- *  For details, see <http://www.llnl.gov/linux/slurm/>.
- *  
- *  SLURM is free software; you can redistribute it and/or modify it under
- *  the terms of the GNU General Public License as published by the Free
- *  Software Foundation; either version 2 of the License, or (at your option)
- *  any later version.
- *
- *  In addition, as a special exception, the copyright holders give permission 
- *  to link the code of portions of this program with the OpenSSL library under 
- *  certain conditions as described in each individual source file, and 
- *  distribute linked combinations including the two. You must obey the GNU 
- *  General Public License in all respects for all of the code used other than 
- *  OpenSSL. If you modify file(s) with this exception, you may extend this 
- *  exception to your version of the file(s), but you are not obligated to do 
- *  so. If you do not wish to do so, delete this exception statement from your
- *  version.  If you delete this exception statement from all source files in 
- *  the program, then also delete it here.
- *  
- *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
- *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- *  details.
- *  
- *  You should have received a copy of the GNU General Public License along
- *  with SLURM; if not, write to the Free Software Foundation, Inc.,
- *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
-\*****************************************************************************/
-
-#ifdef HAVE_CONFIG_H
-#  include "config.h"
-#endif
-
-#if HAVE_PTHREAD
-#include <pthread.h>
-#endif
-
-#include <signal.h>
-#include <string.h>
-
-#include <slurm/slurm_errno.h>
-#include <stdlib.h>
-
-#include "src/common/log.h"
-#include "src/common/macros.h"
-#include "src/common/slurm_protocol_api.h"
-#include "src/common/slurm_protocol_defs.h"
-#include "src/common/xmalloc.h"
-#include "src/common/xstring.h"
-#include "src/common/xsignal.h"
-#include "src/common/forward.h"
-#include "src/common/global_srun.h"
-
-typedef enum {DSH_NEW, DSH_ACTIVE, DSH_DONE, DSH_FAILED} state_t;
-
-typedef struct thd {
-        pthread_t	thread;			/* thread ID */
-        pthread_attr_t	attr;			/* thread attributes */
-        state_t		state;      		/* thread state */
-} thd_t;
-
-int message_thread = 0;
-
-void 
-fwd_signal(srun_job_t *job, int signo, int max_threads)
-{
-	int i;
-	slurm_msg_t req;
-	kill_tasks_msg_t msg;
-	static pthread_mutex_t sig_mutex = PTHREAD_MUTEX_INITIALIZER;
-	pipe_enum_t pipe_enum = PIPE_SIGNALED;
-	hostlist_t hl;
-	char *name = NULL;
-	char buf[8192];
-	List ret_list = NULL;
-	ListIterator itr;
-	ret_data_info_t *ret_data_info = NULL;
-	int rc = SLURM_SUCCESS;
-
-	slurm_mutex_lock(&sig_mutex);
-
-	if (signo == SIGKILL || signo == SIGINT || signo == SIGTERM) {
-		slurm_mutex_lock(&job->state_mutex);
-		job->signaled = true;
-		slurm_mutex_unlock(&job->state_mutex);
-		if(message_thread) {
-			write(job->forked_msg->par_msg->msg_pipe[1],
-			      &pipe_enum,sizeof(int));
-			write(job->forked_msg->par_msg->msg_pipe[1],
-			      &job->signaled,sizeof(int));
-		}
-	}
-
-	debug2("forward signal %d to job", signo);
-
-	/* common to all tasks */
-	msg.job_id      = job->jobid;
-	msg.job_step_id = job->stepid;
-	msg.signal      = (uint32_t) signo;
-	
-	hl = hostlist_create("");
-	for (i = 0; i < job->nhosts; i++) {
-		if (job->host_state[i] != SRUN_HOST_REPLIED) {
-			name = nodelist_nth_host(
-				job->step_layout->node_list, i);
-			debug2("%s has not yet replied\n", name);
-			free(name);
-			continue;
-		}
-		if (job_active_tasks_on_host(job, i) == 0)
-			continue;
-		name = nodelist_nth_host(job->step_layout->node_list, i);
-		hostlist_push(hl, name);
-		free(name);
-	}
-	if(!hostlist_count(hl)) {
-		hostlist_destroy(hl);
-		goto nothing_left;
-	}
-	hostlist_ranged_string(hl, sizeof(buf), buf);
-	hostlist_destroy(hl);
-	name = xstrdup(buf);
-
-	slurm_msg_t_init(&req);	
-	req.msg_type = REQUEST_SIGNAL_TASKS;
-	req.data     = &msg;
-	
-	debug3("sending signal to host %s", name);
-	
-	if (!(ret_list = slurm_send_recv_msgs(name, &req, 0))) { 
-		error("fwd_signal: slurm_send_recv_msgs really failed bad");
-		xfree(name);
-		slurm_mutex_unlock(&sig_mutex);
-		return;
-	}
-	xfree(name);
-	itr = list_iterator_create(ret_list);		
-	while((ret_data_info = list_next(itr))) {
-		rc = slurm_get_return_code(ret_data_info->type, 
-					   ret_data_info->data);
-		/*
-		 *  Report error unless it is "Invalid job id" which 
-		 *    probably just means the tasks exited in the meanwhile.
-		 */
-		if ((rc != 0) && (rc != ESLURM_INVALID_JOB_ID)
-		    &&  (rc != ESLURMD_JOB_NOTRUNNING) && (rc != ESRCH)) {
-			error("%s: signal: %s", 
-			      ret_data_info->node_name, 
-			      slurm_strerror(rc));
-			destroy_data_info(ret_data_info);
-		}
-	}
-	list_iterator_destroy(itr);
-	list_destroy(ret_list);
-nothing_left:
-	debug2("All tasks have been signalled");
-	
-	slurm_mutex_unlock(&sig_mutex);
-}
-
-int
-job_active_tasks_on_host(srun_job_t *job, int hostid)
-{
-	int i;
-	int retval = 0;
-
-	slurm_mutex_lock(&job->task_mutex);
-	for (i = 0; i < job->step_layout->tasks[hostid]; i++) {
-		uint32_t *tids = job->step_layout->tids[hostid];
-		xassert(tids != NULL);
-		debug("Task %d state: %d", tids[i], job->task_state[tids[i]]);
-		if (job->task_state[tids[i]] == SRUN_TASK_RUNNING) 
-			retval++;
-	}
-	slurm_mutex_unlock(&job->task_mutex);
-	return retval;
-}
-
-
diff --git a/src/common/global_srun.h b/src/common/global_srun.h
deleted file mode 100644
index bc6eba242c3..00000000000
--- a/src/common/global_srun.h
+++ /dev/null
@@ -1,160 +0,0 @@
-/*****************************************************************************\
- *  src/common/global_srun.c - functions needed by more than just srun
- *****************************************************************************
- *  Copyright (C) 2002-2006 The Regents of the University of California.
- *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
- *  Written by Mark Grodnona <mgrondona@llnl.gov>.
- *  UCRL-CODE-217948.
- *  
- *  This file is part of SLURM, a resource management program.
- *  For details, see <http://www.llnl.gov/linux/slurm/>.
- *  
- *  SLURM is free software; you can redistribute it and/or modify it under
- *  the terms of the GNU General Public License as published by the Free
- *  Software Foundation; either version 2 of the License, or (at your option)
- *  any later version.
- *
- *  In addition, as a special exception, the copyright holders give permission 
- *  to link the code of portions of this program with the OpenSSL library under 
- *  certain conditions as described in each individual source file, and 
- *  distribute linked combinations including the two. You must obey the GNU 
- *  General Public License in all respects for all of the code used other than 
- *  OpenSSL. If you modify file(s) with this exception, you may extend this 
- *  exception to your version of the file(s), but you are not obligated to do 
- *  so. If you do not wish to do so, delete this exception statement from your
- *  version.  If you delete this exception statement from all source files in 
- *  the program, then also delete it here.
- *  
- *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
- *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
- *  details.
- *  
- *  You should have received a copy of the GNU General Public License along
- *  with SLURM; if not, write to the Free Software Foundation, Inc.,
- *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
-\*****************************************************************************/
-
-#ifndef _GLOBAL_SRUN_H
-#define _GLOBAL_SRUN_H
-
-#include <slurm/slurm.h>
-#include "src/common/slurm_protocol_common.h"
-#include "src/api/step_io.h"
-
-typedef enum {
-	SRUN_JOB_INIT = 0,         /* Job's initial state                   */
-	SRUN_JOB_LAUNCHING,        /* Launch thread is running              */
-	SRUN_JOB_STARTING,         /* Launch thread is complete             */
-	SRUN_JOB_RUNNING,          /* Launch thread complete                */
-	SRUN_JOB_TERMINATING,      /* Once first task terminates            */
-	SRUN_JOB_TERMINATED,       /* All tasks terminated (may have IO)    */
-	SRUN_JOB_WAITING_ON_IO,    /* All tasks terminated; waiting for IO  */
-	SRUN_JOB_DONE,             /* tasks and IO complete                 */
-	SRUN_JOB_DETACHED,         /* Detached IO from job (Not used now)   */
-	SRUN_JOB_FAILED,           /* Job failed for some reason            */
-	SRUN_JOB_CANCELLED,        /* CTRL-C cancelled                      */
-	SRUN_JOB_FORCETERM         /* Forced termination of IO thread       */
-} srun_job_state_t;
-
-typedef enum {
-	SRUN_HOST_INIT = 0,
-	SRUN_HOST_CONTACTED,
-	SRUN_HOST_UNREACHABLE,
-	SRUN_HOST_REPLIED
-} srun_host_state_t;
-
-typedef enum {
-	SRUN_TASK_INIT = 0,
-	SRUN_TASK_RUNNING,
-	SRUN_TASK_FAILED,
-	SRUN_TASK_IO_WAIT, /* this state deprecated with new eio stdio engine */
-	SRUN_TASK_EXITED,
-	SRUN_TASK_ABNORMAL_EXIT
-} srun_task_state_t;
-
-typedef enum { 
-	PIPE_NONE = 0, 
-	PIPE_JOB_STATE, 
-	PIPE_TASK_STATE, 
-	PIPE_TASK_EXITCODE,
-	PIPE_HOST_STATE, 
-	PIPE_SIGNALED,
-	PIPE_MPIR_DEBUG_STATE,
-	PIPE_UPDATE_MPIR_PROCTABLE,
-	PIPE_UPDATE_STEP_LAYOUT,
-	PIPE_NODE_FAIL
-} pipe_enum_t;
-
-/* For Message thread */
-typedef struct forked_msg_pipe {
-	int msg_pipe[2];
-	int pid;
-} forked_msg_pipe_t;
-
-typedef struct forked_message {
-	forked_msg_pipe_t *          par_msg;
-	forked_msg_pipe_t *          msg_par;
-	enum job_states	*	     job_state;
-} forked_msg_t;
-
-typedef struct io_filename io_filename_t;
-
-typedef struct srun_job {
-	slurm_step_layout_t *step_layout; /* holds info about how the task is 
-					     laid out */
-	uint32_t jobid;		/* assigned job id 	                  */
-	uint32_t stepid;	/* assigned step id 	                  */
-	bool old_job;           /* run job step under previous allocation */
-	bool removed;       /* job has been removed from SLURM */
-
-	uint32_t nhosts;	/* node count */
-	uint32_t ntasks;	/* task count */
-	srun_job_state_t state;	/* job state	   	                  */
-	pthread_mutex_t state_mutex; 
-	pthread_cond_t  state_cond;
-
-	bool signaled;          /* True if user generated signal to job   */
-	int  rc;                /* srun return code                       */
-
-	slurm_cred_t  cred;     /* Slurm job credential    */
-	char *nodelist;		/* nodelist in string form */
-
-	pthread_t sigid;	/* signals thread tid		  */
-
-	pthread_t jtid;		/* job control thread id 	  */
-	slurm_fd *jfd;		/* job control info fd   	  */
-	
-	pthread_t lid;		  /* launch thread id */
-
-	client_io_t *client_io;
-	time_t    ltimeout;       /* Time by which all tasks must be running */
-	time_t    etimeout;       /* exit timeout (see opt.max_wait          */
-
-	srun_host_state_t *host_state; /* nhost host states */
-
-	int *tstatus;	          /* ntask exit statii */
-	srun_task_state_t *task_state; /* ntask task states */
-	
-	switch_jobinfo_t switch_job;
-	io_filename_t *ifname;
-	io_filename_t *ofname;
-	io_filename_t *efname;
-	forked_msg_t *forked_msg;
-	char *task_epilog;	/* task-epilog */
-	char *task_prolog;	/* task-prolog */
-	pthread_mutex_t task_mutex;
-	int njfds;		/* number of job control info fds */
-	slurm_addr *jaddr;	/* job control info ports 	  */
-	int thr_count;  	/* count of threads in job launch */
-
-	/* Output streams and stdin fileno */
-	select_jobinfo_t select_jobinfo;
-	
-} srun_job_t;
-
-
-void fwd_signal(srun_job_t *job, int signal, int max_threads);
-int job_active_tasks_on_host(srun_job_t *job, int hostid);
-
-#endif /* !_GLOBAL_SRUN_H */
diff --git a/src/common/mpi.h b/src/common/mpi.h
index 854170baa03..5243c2d077a 100644
--- a/src/common/mpi.h
+++ b/src/common/mpi.h
@@ -15,7 +15,7 @@
  *  any later version.
  *
  *  In addition, as a special exception, the copyright holders give permission 
- *  to link the code of portions of this program with the OpenSSL library under 
+ *  to link the code of portions of this program with the OpenSSL library under
  *  certain conditions as described in each individual source file, and 
  *  distribute linked combinations including the two. You must obey the GNU 
  *  General Public License in all respects for all of the code used other than 
@@ -42,7 +42,7 @@
 # include "config.h"
 #endif 
 
-#include "src/common/global_srun.h"
+#include "src/srun/srun_job.h"
 #include "src/slurmd/slurmstepd/slurmstepd_job.h"
 
 typedef struct slurm_mpi_context *slurm_mpi_context_t;
diff --git a/src/plugins/mpi/mpichgm/Makefile.am b/src/plugins/mpi/mpichgm/Makefile.am
index a91adb0aeb4..06317f6f21d 100644
--- a/src/plugins/mpi/mpichgm/Makefile.am
+++ b/src/plugins/mpi/mpichgm/Makefile.am
@@ -10,6 +10,6 @@ pkglib_LTLIBRARIES = mpi_mpichgm.la
 
 # Null switch plugin.
 mpi_mpichgm_la_SOURCES = mpi_mpichgm.c mpichgm.c mpichgm.h\
-                         $(top_srcdir)/src/common/global_srun.c \
+                         $(top_srcdir)/src/srun/srun_job.c \
 			  $(top_srcdir)/src/common/net.c
 mpi_mpichgm_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS)
diff --git a/src/plugins/mpi/mpichgm/mpichgm.c b/src/plugins/mpi/mpichgm/mpichgm.c
index 15276b5ca02..f9ea791bc52 100644
--- a/src/plugins/mpi/mpichgm/mpichgm.c
+++ b/src/plugins/mpi/mpichgm/mpichgm.c
@@ -16,7 +16,7 @@
  *  any later version.
  *
  *  In addition, as a special exception, the copyright holders give permission 
- *  to link the code of portions of this program with the OpenSSL library under 
+ *  to link the code of portions of this program with the OpenSSL library under
  *  certain conditions as described in each individual source file, and 
  *  distribute linked combinations including the two. You must obey the GNU 
  *  General Public License in all respects for all of the code used other than 
@@ -54,7 +54,7 @@
 #include "src/common/xmalloc.h"
 #include "src/common/xstring.h"
 #include "src/common/net.h"
-#include "src/common/global_srun.h"
+#include "src/srun/srun_job.h"
 #include "src/srun/opt.h"
 
 #include "src/plugins/mpi/mpichgm/mpichgm.h"
diff --git a/src/plugins/mpi/mvapich/Makefile.am b/src/plugins/mpi/mvapich/Makefile.am
index a918f231b18..08ada9e55a2 100644
--- a/src/plugins/mpi/mvapich/Makefile.am
+++ b/src/plugins/mpi/mvapich/Makefile.am
@@ -11,6 +11,6 @@ pkglib_LTLIBRARIES = mpi_mvapich.la
 # Null switch plugin.
 mpi_mvapich_la_SOURCES = mpi_mvapich.c mvapich.c mvapich.h\
 	$(top_srcdir)/src/common/net.c \
-	$(top_srcdir)/src/common/global_srun.c
+	$(top_srcdir)/src/srun/srun_job.c
 
 mpi_mvapich_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS)
diff --git a/src/plugins/mpi/mvapich/mvapich.c b/src/plugins/mpi/mvapich/mvapich.c
index 3b455fb3638..7ec2c92bc08 100644
--- a/src/plugins/mpi/mvapich/mvapich.c
+++ b/src/plugins/mpi/mvapich/mvapich.c
@@ -56,7 +56,7 @@
 #include "src/common/xstring.h"
 #include "src/common/net.h"
 #include "src/common/fd.h"
-#include "src/common/global_srun.h"
+#include "src/srun/srun_job.h"
 #include "src/srun/opt.h"
 
 /* NOTE: MVAPICH has changed protocols without changing version numbers.
diff --git a/src/srun/fname.h b/src/srun/fname.h
index cd3b939f607..a8f919a7d99 100644
--- a/src/srun/fname.h
+++ b/src/srun/fname.h
@@ -15,7 +15,7 @@
  *  any later version.
  *
  *  In addition, as a special exception, the copyright holders give permission 
- *  to link the code of portions of this program with the OpenSSL library under 
+ *  to link the code of portions of this program with the OpenSSL library under
  *  certain conditions as described in each individual source file, and 
  *  distribute linked combinations including the two. You must obey the GNU 
  *  General Public License in all respects for all of the code used other than 
@@ -42,7 +42,7 @@
 # include "config.h"
 #endif 
 
-#include "src/common/global_srun.h"
+#include "src/srun/srun_job.h"
 
 enum io_t {
 	IO_ALL          = 0, /* multiplex output from all/bcast stdin to all */
diff --git a/src/srun/msg.c b/src/srun/msg.c
index 61d08f01f9c..c08d41d9f8e 100644
--- a/src/srun/msg.c
+++ b/src/srun/msg.c
@@ -67,7 +67,6 @@
 #include "src/common/xmalloc.h"
 #include "src/common/mpi.h"
 #include "src/common/forward.h"
-#include "src/common/global_srun.h"
 #include "src/api/pmi_server.h"
 
 #include "src/srun/srun_job.h"
@@ -77,6 +76,7 @@
 #include "src/srun/attach.h"
 #include "src/srun/allocate.h"
 #include "src/srun/multi_prog.h"
+#include "src/srun/signals.h"
 
 #include "src/common/xstring.h"
 
diff --git a/src/srun/signals.c b/src/srun/signals.c
index 89c6884c73f..cc796cf9d0e 100644
--- a/src/srun/signals.c
+++ b/src/srun/signals.c
@@ -16,7 +16,7 @@
  *  any later version.
  *
  *  In addition, as a special exception, the copyright holders give permission 
- *  to link the code of portions of this program with the OpenSSL library under 
+ *  to link the code of portions of this program with the OpenSSL library under
  *  certain conditions as described in each individual source file, and 
  *  distribute linked combinations including the two. You must obey the GNU 
  *  General Public License in all respects for all of the code used other than 
@@ -54,12 +54,12 @@
 #include "src/common/slurm_protocol_api.h"
 #include "src/common/slurm_protocol_defs.h"
 #include "src/common/xmalloc.h"
+#include "src/common/xstring.h"
 #include "src/common/xsignal.h"
-#include "src/common/global_srun.h"
 
 #include "src/srun/opt.h"
 #include "src/srun/srun_job.h"
-#include "src/srun/opt.h"
+#include "src/srun/signals.h"
 
 #define MAX_RETRIES 3
 
@@ -128,7 +128,6 @@ sig_thr_create(srun_job_t *job)
 }
 
 
-
 static void
 _sigterm_handler(int signum)
 {
diff --git a/src/srun/signals.h b/src/srun/signals.h
index 14f6412d557..aa2472409dc 100644
--- a/src/srun/signals.h
+++ b/src/srun/signals.h
@@ -15,7 +15,7 @@
  *  any later version.
  *
  *  In addition, as a special exception, the copyright holders give permission 
- *  to link the code of portions of this program with the OpenSSL library under 
+ *  to link the code of portions of this program with the OpenSSL library under
  *  certain conditions as described in each individual source file, and 
  *  distribute linked combinations including the two. You must obey the GNU 
  *  General Public License in all respects for all of the code used other than 
diff --git a/src/srun/srun_job.c b/src/srun/srun_job.c
index a2656288f9b..210d0f82a78 100644
--- a/src/srun/srun_job.c
+++ b/src/srun/srun_job.c
@@ -60,7 +60,7 @@
 #include "src/common/xmalloc.h"
 #include "src/common/xstring.h"
 #include "src/common/io_hdr.h"
-#include "src/common/global_srun.h"
+#include "src/common/forward.h"
 
 #include "src/srun/srun_job.h"
 #include "src/srun/opt.h"
@@ -68,6 +68,7 @@
 #include "src/srun/attach.h"
 #include "src/srun/msg.h"
 
+typedef enum {DSH_NEW, DSH_ACTIVE, DSH_DONE, DSH_FAILED} state_t;
 
 /*
  * allocation information structure used to store general information
@@ -84,6 +85,13 @@ typedef struct allocation_info {
 	select_jobinfo_t select_jobinfo;
 } allocation_info_t;
 
+typedef struct thd {
+        pthread_t	thread;			/* thread ID */
+        pthread_attr_t	attr;			/* thread attributes */
+        state_t		state;      		/* thread state */
+} thd_t;
+
+int message_thread = 0;
 /*
  * Prototypes:
  */
@@ -596,6 +604,120 @@ report_task_status(srun_job_t *job)
 
 }
 
+void 
+fwd_signal(srun_job_t *job, int signo, int max_threads)
+{
+	int i;
+	slurm_msg_t req;
+	kill_tasks_msg_t msg;
+	static pthread_mutex_t sig_mutex = PTHREAD_MUTEX_INITIALIZER;
+	pipe_enum_t pipe_enum = PIPE_SIGNALED;
+	hostlist_t hl;
+	char *name = NULL;
+	char buf[8192];
+	List ret_list = NULL;
+	ListIterator itr;
+	ret_data_info_t *ret_data_info = NULL;
+	int rc = SLURM_SUCCESS;
+
+	slurm_mutex_lock(&sig_mutex);
+
+	if (signo == SIGKILL || signo == SIGINT || signo == SIGTERM) {
+		slurm_mutex_lock(&job->state_mutex);
+		job->signaled = true;
+		slurm_mutex_unlock(&job->state_mutex);
+		if(message_thread) {
+			write(job->forked_msg->par_msg->msg_pipe[1],
+			      &pipe_enum,sizeof(int));
+			write(job->forked_msg->par_msg->msg_pipe[1],
+			      &job->signaled,sizeof(int));
+		}
+	}
+
+	debug2("forward signal %d to job", signo);
+
+	/* common to all tasks */
+	msg.job_id      = job->jobid;
+	msg.job_step_id = job->stepid;
+	msg.signal      = (uint32_t) signo;
+	
+	hl = hostlist_create("");
+	for (i = 0; i < job->nhosts; i++) {
+		if (job->host_state[i] != SRUN_HOST_REPLIED) {
+			name = nodelist_nth_host(
+				job->step_layout->node_list, i);
+			debug2("%s has not yet replied\n", name);
+			free(name);
+			continue;
+		}
+		if (job_active_tasks_on_host(job, i) == 0)
+			continue;
+		name = nodelist_nth_host(job->step_layout->node_list, i);
+		hostlist_push(hl, name);
+		free(name);
+	}
+	if(!hostlist_count(hl)) {
+		hostlist_destroy(hl);
+		goto nothing_left;
+	}
+	hostlist_ranged_string(hl, sizeof(buf), buf);
+	hostlist_destroy(hl);
+	name = xstrdup(buf);
+
+	slurm_msg_t_init(&req);	
+	req.msg_type = REQUEST_SIGNAL_TASKS;
+	req.data     = &msg;
+	
+	debug3("sending signal to host %s", name);
+	
+	if (!(ret_list = slurm_send_recv_msgs(name, &req, 0))) { 
+		error("fwd_signal: slurm_send_recv_msgs really failed bad");
+		xfree(name);
+		slurm_mutex_unlock(&sig_mutex);
+		return;
+	}
+	xfree(name);
+	itr = list_iterator_create(ret_list);		
+	while((ret_data_info = list_next(itr))) {
+		rc = slurm_get_return_code(ret_data_info->type, 
+					   ret_data_info->data);
+		/*
+		 *  Report error unless it is "Invalid job id" which 
+		 *    probably just means the tasks exited in the meanwhile.
+		 */
+		if ((rc != 0) && (rc != ESLURM_INVALID_JOB_ID)
+		    &&  (rc != ESLURMD_JOB_NOTRUNNING) && (rc != ESRCH)) {
+			error("%s: signal: %s", 
+			      ret_data_info->node_name, 
+			      slurm_strerror(rc));
+			destroy_data_info(ret_data_info);
+		}
+	}
+	list_iterator_destroy(itr);
+	list_destroy(ret_list);
+nothing_left:
+	debug2("All tasks have been signalled");
+	
+	slurm_mutex_unlock(&sig_mutex);
+}
+
+int
+job_active_tasks_on_host(srun_job_t *job, int hostid)
+{
+	int i;
+	int retval = 0;
+
+	slurm_mutex_lock(&job->task_mutex);
+	for (i = 0; i < job->step_layout->tasks[hostid]; i++) {
+		uint32_t *tids = job->step_layout->tids[hostid];
+		xassert(tids != NULL);
+		debug("Task %d state: %d", tids[i], job->task_state[tids[i]]);
+		if (job->task_state[tids[i]] == SRUN_TASK_RUNNING) 
+			retval++;
+	}
+	slurm_mutex_unlock(&job->task_mutex);
+	return retval;
+}
 
 static inline int
 _estimate_nports(int nclients, int cli_per_port)
diff --git a/src/srun/srun_job.h b/src/srun/srun_job.h
index 7d1e993ef4a..ed2db1ffb26 100644
--- a/src/srun/srun_job.h
+++ b/src/srun/srun_job.h
@@ -16,7 +16,7 @@
  *  any later version.
  *
  *  In addition, as a special exception, the copyright holders give permission 
- *  to link the code of portions of this program with the OpenSSL library under 
+ *  to link the code of portions of this program with the OpenSSL library under
  *  certain conditions as described in each individual source file, and 
  *  distribute linked combinations including the two. You must obey the GNU 
  *  General Public License in all respects for all of the code used other than 
@@ -51,10 +51,120 @@
 #include "src/common/macros.h"
 #include "src/common/node_select.h"
 #include "src/common/slurm_protocol_defs.h"
-#include "src/common/global_srun.h"
 #include "src/api/step_io.h"
 
-#include "src/srun/fname.h"
+//#include "src/srun/fname.h"
+
+typedef enum {
+	SRUN_JOB_INIT = 0,         /* Job's initial state                   */
+	SRUN_JOB_LAUNCHING,        /* Launch thread is running              */
+	SRUN_JOB_STARTING,         /* Launch thread is complete             */
+	SRUN_JOB_RUNNING,          /* Launch thread complete                */
+	SRUN_JOB_TERMINATING,      /* Once first task terminates            */
+	SRUN_JOB_TERMINATED,       /* All tasks terminated (may have IO)    */
+	SRUN_JOB_WAITING_ON_IO,    /* All tasks terminated; waiting for IO  */
+	SRUN_JOB_DONE,             /* tasks and IO complete                 */
+	SRUN_JOB_DETACHED,         /* Detached IO from job (Not used now)   */
+	SRUN_JOB_FAILED,           /* Job failed for some reason            */
+	SRUN_JOB_CANCELLED,        /* CTRL-C cancelled                      */
+	SRUN_JOB_FORCETERM         /* Forced termination of IO thread       */
+} srun_job_state_t;
+
+typedef enum {
+	SRUN_HOST_INIT = 0,
+	SRUN_HOST_CONTACTED,
+	SRUN_HOST_UNREACHABLE,
+	SRUN_HOST_REPLIED
+} srun_host_state_t;
+
+typedef enum {
+	SRUN_TASK_INIT = 0,
+	SRUN_TASK_RUNNING,
+	SRUN_TASK_FAILED,
+	SRUN_TASK_IO_WAIT,/* this state deprecated with new eio stdio engine */
+	SRUN_TASK_EXITED,
+	SRUN_TASK_ABNORMAL_EXIT
+} srun_task_state_t;
+
+typedef enum { 
+	PIPE_NONE = 0, 
+	PIPE_JOB_STATE, 
+	PIPE_TASK_STATE, 
+	PIPE_TASK_EXITCODE,
+	PIPE_HOST_STATE, 
+	PIPE_SIGNALED,
+	PIPE_MPIR_DEBUG_STATE,
+	PIPE_UPDATE_MPIR_PROCTABLE,
+	PIPE_UPDATE_STEP_LAYOUT,
+	PIPE_NODE_FAIL
+} pipe_enum_t;
+
+/* For Message thread */
+typedef struct forked_msg_pipe {
+	int msg_pipe[2];
+	int pid;
+} forked_msg_pipe_t;
+
+typedef struct forked_message {
+	forked_msg_pipe_t *          par_msg;
+	forked_msg_pipe_t *          msg_par;
+	enum job_states	*	     job_state;
+} forked_msg_t;
+
+typedef struct io_filename io_filename_t;
+
+typedef struct srun_job {
+	slurm_step_layout_t *step_layout; /* holds info about how the task is 
+					     laid out */
+	uint32_t jobid;		/* assigned job id 	                  */
+	uint32_t stepid;	/* assigned step id 	                  */
+	bool old_job;           /* run job step under previous allocation */
+	bool removed;       /* job has been removed from SLURM */
+
+	uint32_t nhosts;	/* node count */
+	uint32_t ntasks;	/* task count */
+	srun_job_state_t state;	/* job state	   	                  */
+	pthread_mutex_t state_mutex; 
+	pthread_cond_t  state_cond;
+
+	bool signaled;          /* True if user generated signal to job   */
+	int  rc;                /* srun return code                       */
+
+	slurm_cred_t  cred;     /* Slurm job credential    */
+	char *nodelist;		/* nodelist in string form */
+
+	pthread_t sigid;	/* signals thread tid		  */
+
+	pthread_t jtid;		/* job control thread id 	  */
+	slurm_fd *jfd;		/* job control info fd   	  */
+	
+	pthread_t lid;		  /* launch thread id */
+
+	client_io_t *client_io;
+	time_t    ltimeout;       /* Time by which all tasks must be running */
+	time_t    etimeout;       /* exit timeout (see opt.max_wait          */
+
+	srun_host_state_t *host_state; /* nhost host states */
+
+	int *tstatus;	          /* ntask exit statii */
+	srun_task_state_t *task_state; /* ntask task states */
+	
+	switch_jobinfo_t switch_job;
+	io_filename_t *ifname;
+	io_filename_t *ofname;
+	io_filename_t *efname;
+	forked_msg_t *forked_msg;
+	char *task_epilog;	/* task-epilog */
+	char *task_prolog;	/* task-prolog */
+	pthread_mutex_t task_mutex;
+	int njfds;		/* number of job control info fds */
+	slurm_addr *jaddr;	/* job control info ports 	  */
+	int thr_count;  	/* count of threads in job launch */
+
+	/* Output streams and stdin fileno */
+	select_jobinfo_t select_jobinfo;
+	
+} srun_job_t;
 
 extern int message_thread;
 
@@ -105,4 +215,7 @@ void    report_job_status(srun_job_t *job);
  */
 int    job_rc(srun_job_t *job);
 
+void   fwd_signal(srun_job_t *job, int signal, int max_threads);
+int    job_active_tasks_on_host(srun_job_t *job, int hostid);
+
 #endif /* !_HAVE_JOB_H */
-- 
GitLab