From 3fe018ae6d127506a7aa25c442bad00c9a2a2c52 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Sat, 5 Jun 2004 01:05:39 +0000
Subject: [PATCH] Added srun option --debugger-test to confirm that slurm's
 debugger infrastructure is operational.

---
 NEWS           |  2 ++
 src/srun/msg.c | 30 +++++++++++++++++++++++++++---
 src/srun/opt.c | 13 ++++++++++++-
 src/srun/opt.h |  1 +
 4 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/NEWS b/NEWS
index 83d91d43de6..32bede9be7b 100644
--- a/NEWS
+++ b/NEWS
@@ -8,6 +8,8 @@ documents those changes that are of interest to users and admins.
  -- Fix sinfo -R, --list-reasons to list all relevant node states.
  -- Fix slurmctld to honor srun's node configuration specifications 
     with FastSchedule==0 configuration.
+ -- Added srun option --debugger-test to confirm that slurm's debugger 
+    infrastructure is operational.
  -- Removed debugging hacks for srun.wrapper.c. Temporarily use 
     RPM's debugedit utility if available for similar effect.
 
diff --git a/src/srun/msg.c b/src/srun/msg.c
index 04e120d4b56..cbb9d7d4c94 100644
--- a/src/srun/msg.c
+++ b/src/srun/msg.c
@@ -74,6 +74,7 @@ static slurm_fd slurmctld_fd   = (slurm_fd) NULL;
  */
 static void	_accept_msg_connection(job_t *job, int fdnum);
 static void	_confirm_launch_complete(job_t *job);
+static void	_dump_proctable(job_t *job);
 static void 	_exit_handler(job_t *job, slurm_msg_t *exit_msg);
 static void	_handle_msg(job_t *job, slurm_msg_t *msg);
 static inline bool _job_msg_done(job_t *job);
@@ -131,11 +132,34 @@ _build_proctable(job_t *job, char *host, int nodeid, int ntasks, uint32_t *pid)
 
 	if (tasks_recorded == opt.nprocs) {
 		MPIR_debug_state = MPIR_DEBUG_SPAWNED;
-		MPIR_Breakpoint(); 
+		MPIR_Breakpoint();
+		if (opt.debugger_test)
+			_dump_proctable(job); 
 	}
 }
 
-
+/* --debugger-test: log task id, host name and pid of each task in job->tids */
+static void _dump_proctable(job_t *job)
+{
+	int node_inx, task_inx, taskid;
+	int max_task = opt.nprocs;	/* loop bound used when overcommitted */
+	MPIR_PROCDESC *tv;
+
+	for (node_inx=0; node_inx<job->nhosts; node_inx++) {
+		if (!opt.overcommit)
+			max_task = job->cpus[node_inx];
+		/* bounded scan; a task id of 0 past slot 0 ends the list */
+		for (task_inx=0; task_inx<max_task; task_inx++) {
+			taskid = job->tids[node_inx][task_inx];
+			if ((task_inx > 0) && (taskid == 0))
+				break;
+			tv = &MPIR_proctable[taskid];
+			info("task:%d, host:%s, pid:%d",
+				taskid, tv->host_name, tv->pid);
+		}
+	}
+}
+	
 void debugger_launch_failure(void)
 {
 	if (opt.parallel_debug) {
@@ -148,7 +172,7 @@ void MPIR_Breakpoint(void)
 {
 	debug("In MPIR_Breakpoint");
 	/* This just notifies parallel 
-         * debugger that some event of interest occured */ 
+         * debugger that some event of interest occurred */ 
 }
 
 /*
diff --git a/src/srun/opt.c b/src/srun/opt.c
index d3fb2a83d55..6e3f2b80231 100644
--- a/src/srun/opt.c
+++ b/src/srun/opt.c
@@ -103,7 +103,7 @@
 #define LONG_OPT_MPI      0x10c
 #define LONG_OPT_CORE	  0x10e
 #define LONG_OPT_NOSHELL  0x10f
-
+#define LONG_OPT_DEBUG_TS 0x110
 /*---- forward declarations of static functions  ----*/
 
 typedef struct env_vars env_vars_t;
@@ -651,6 +651,7 @@ static void _opt_args(int argc, char **argv)
 		{"max-exit-timeout", required_argument, 0, LONG_OPT_XTO},
 		{"uid",              required_argument, 0, LONG_OPT_UID},
 		{"gid",              required_argument, 0, LONG_OPT_GID},
+		{"debugger-test",    no_argument,       0, LONG_OPT_DEBUG_TS},
 		{"help",             no_argument,       0, LONG_OPT_HELP},
 		{"usage",            no_argument,       0, LONG_OPT_USAGE},
 		{NULL,               0,                 0, 0}
@@ -882,6 +883,16 @@ static void _opt_args(int argc, char **argv)
 			if (opt.egid == (gid_t) -1)
 				fatal ("--gid=\"%s\" invalid", optarg);
 			break;
+		case LONG_OPT_DEBUG_TS:
+			opt.debugger_test    = true;
+			/* make other parameters look like debugger 
+			 * is really attached */
+			opt.parallel_debug   = true;
+			MPIR_being_debugged  = 1;
+			opt.max_launch_time = 120;
+			opt.max_threads     = 1;
+			opt.msg_timeout     = 15;
+			break;
 		case LONG_OPT_HELP:
 			_help();
 			exit(0);
diff --git a/src/srun/opt.h b/src/srun/opt.h
index 54b0f3b8d19..281429f9246 100644
--- a/src/srun/opt.h
+++ b/src/srun/opt.h
@@ -135,6 +135,7 @@ typedef struct srun_options {
 	bool quit_on_intr;      /* --quit-on-interrupt, -q      */
 	int  quiet;
 	bool parallel_debug;	/* srun controlled by debugger	*/
+	bool debugger_test;	/* --debugger-test		*/
 
 	/* constraint options */
 	int mincpus;		/* --mincpus=n			*/
-- 
GitLab