From 3fe018ae6d127506a7aa25c442bad00c9a2a2c52 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Sat, 5 Jun 2004 01:05:39 +0000 Subject: [PATCH] Added srun option --debugger-test to confirm that slurm's debugger infrastructure is operational. --- NEWS | 2 ++ src/srun/msg.c | 30 +++++++++++++++++++++++++++--- src/srun/opt.c | 13 ++++++++++++- src/srun/opt.h | 1 + 4 files changed, 42 insertions(+), 4 deletions(-) diff --git a/NEWS b/NEWS index 83d91d43de6..32bede9be7b 100644 --- a/NEWS +++ b/NEWS @@ -8,6 +8,8 @@ documents those changes that are of interest to users and admins. -- Fix sinfo -R, --list-reasons to list all relevant node states. -- Fix slurmctld to honor srun's node configuration specifications with FastSchedule==0 configuration. + -- Added srun option --debugger-test to confirm that slurm's debugger + infrastructure is operational. -- Removed debugging hacks for srun.wrapper.c. Temporarily use RPM's debugedit utility if available for similar effect. diff --git a/src/srun/msg.c b/src/srun/msg.c index 04e120d4b56..cbb9d7d4c94 100644 --- a/src/srun/msg.c +++ b/src/srun/msg.c @@ -74,6 +74,7 @@ static slurm_fd slurmctld_fd = (slurm_fd) NULL; */ static void _accept_msg_connection(job_t *job, int fdnum); static void _confirm_launch_complete(job_t *job); +static void _dump_proctable(job_t *job); static void _exit_handler(job_t *job, slurm_msg_t *exit_msg); static void _handle_msg(job_t *job, slurm_msg_t *msg); static inline bool _job_msg_done(job_t *job); @@ -131,11 +132,34 @@ _build_proctable(job_t *job, char *host, int nodeid, int ntasks, uint32_t *pid) if (tasks_recorded == opt.nprocs) { MPIR_debug_state = MPIR_DEBUG_SPAWNED; - MPIR_Breakpoint(); + MPIR_Breakpoint(); + if (opt.debugger_test) + _dump_proctable(job); } } - +static void _dump_proctable(job_t *job) +{ + int node_inx, task_inx, taskid, max_task; + MPIR_PROCDESC *tv; + + if (opt.overcommit) + max_task = opt.nprocs; + + for (node_inx=0; node_inx<job->nhosts; node_inx++) { + if (!opt.overcommit) + max_task = job->cpus[node_inx]; + for (task_inx=0; ; task_inx++) { + taskid = job->tids[node_inx][task_inx]; + if ((task_inx > 0) && (taskid == 0)) + break; + tv = &MPIR_proctable[taskid]; + info("task:%d, host:%s, pid:%d", + taskid, tv->host_name, tv->pid); + } + } +} + void debugger_launch_failure(void) { if (opt.parallel_debug) { @@ -148,7 +172,7 @@ void MPIR_Breakpoint(void) { debug("In MPIR_Breakpoint"); /* This just notifies parallel - * debugger that some event of interest occured */ + * debugger that some event of interest occurred */ } /* diff --git a/src/srun/opt.c b/src/srun/opt.c index d3fb2a83d55..6e3f2b80231 100644 --- a/src/srun/opt.c +++ b/src/srun/opt.c @@ -103,7 +103,7 @@ #define LONG_OPT_MPI 0x10c #define LONG_OPT_CORE 0x10e #define LONG_OPT_NOSHELL 0x10f - +#define LONG_OPT_DEBUG_TS 0x110 /*---- forward declarations of static functions ----*/ typedef struct env_vars env_vars_t; @@ -651,6 +651,7 @@ static void _opt_args(int argc, char **argv) {"max-exit-timeout", required_argument, 0, LONG_OPT_XTO}, {"uid", required_argument, 0, LONG_OPT_UID}, {"gid", required_argument, 0, LONG_OPT_GID}, + {"debugger-test", no_argument, 0, LONG_OPT_DEBUG_TS}, {"help", no_argument, 0, LONG_OPT_HELP}, {"usage", no_argument, 0, LONG_OPT_USAGE}, {NULL, 0, 0, 0} @@ -882,6 +883,16 @@ static void _opt_args(int argc, char **argv) if (opt.egid == (gid_t) -1) fatal ("--gid=\"%s\" invalid", optarg); break; + case LONG_OPT_DEBUG_TS: + opt.debugger_test = true; + /* make other parameters look like debugger + * is really attached */ + opt.parallel_debug = true; + MPIR_being_debugged = 1; + opt.max_launch_time = 120; + opt.max_threads = 1; + opt.msg_timeout = 15; + break; case LONG_OPT_HELP: _help(); exit(0); diff --git a/src/srun/opt.h b/src/srun/opt.h index 54b0f3b8d19..281429f9246 100644 --- a/src/srun/opt.h +++ b/src/srun/opt.h @@ -135,6 +135,7 @@ typedef struct srun_options { bool quit_on_intr; /* --quit-on-interrupt, -q */ int quiet; bool parallel_debug; /* srun controlled by debugger */ + bool debugger_test; /* --debugger-test */ /* constraint options */ int mincpus; /* --mincpus=n */ -- GitLab