Skip to content
Snippets Groups Projects
Commit 3fe018ae authored by Moe Jette
Browse files

Added srun option --debugger-test to confirm that slurm's debugger

infrastructure is operational.
parent 60aadd1e
No related branches found
No related tags found
No related merge requests found
...@@ -8,6 +8,8 @@ documents those changes that are of interest to users and admins. ...@@ -8,6 +8,8 @@ documents those changes that are of interest to users and admins.
-- Fix sinfo -R, --list-reasons to list all relevant node states. -- Fix sinfo -R, --list-reasons to list all relevant node states.
-- Fix slurmctld to honor srun's node configuration specifications -- Fix slurmctld to honor srun's node configuration specifications
with FastSchedule==0 configuration. with FastSchedule==0 configuration.
-- Added srun option --debugger-test to confirm that slurm's debugger
infrastructure is operational.
-- Removed debugging hacks for srun.wrapper.c. Temporarily use -- Removed debugging hacks for srun.wrapper.c. Temporarily use
RPM's debugedit utility if available for similar effect. RPM's debugedit utility if available for similar effect.
......
...@@ -74,6 +74,7 @@ static slurm_fd slurmctld_fd = (slurm_fd) NULL; ...@@ -74,6 +74,7 @@ static slurm_fd slurmctld_fd = (slurm_fd) NULL;
*/ */
static void _accept_msg_connection(job_t *job, int fdnum); static void _accept_msg_connection(job_t *job, int fdnum);
static void _confirm_launch_complete(job_t *job); static void _confirm_launch_complete(job_t *job);
static void _dump_proctable(job_t *job);
static void _exit_handler(job_t *job, slurm_msg_t *exit_msg); static void _exit_handler(job_t *job, slurm_msg_t *exit_msg);
static void _handle_msg(job_t *job, slurm_msg_t *msg); static void _handle_msg(job_t *job, slurm_msg_t *msg);
static inline bool _job_msg_done(job_t *job); static inline bool _job_msg_done(job_t *job);
...@@ -131,11 +132,34 @@ _build_proctable(job_t *job, char *host, int nodeid, int ntasks, uint32_t *pid) ...@@ -131,11 +132,34 @@ _build_proctable(job_t *job, char *host, int nodeid, int ntasks, uint32_t *pid)
if (tasks_recorded == opt.nprocs) { if (tasks_recorded == opt.nprocs) {
MPIR_debug_state = MPIR_DEBUG_SPAWNED; MPIR_debug_state = MPIR_DEBUG_SPAWNED;
MPIR_Breakpoint(); MPIR_Breakpoint();
if (opt.debugger_test)
_dump_proctable(job);
} }
} }
/*
 * _dump_proctable - log one line (task id, host name, pid) for each
 *	task found in the job's per-node task id lists, using the matching
 *	MPIR_proctable entry. Invoked from _build_proctable() when
 *	opt.debugger_test is set, once all tasks have been recorded.
 * IN job - job whose nhosts, cpus[] and tids[][] describe the task layout
 */
static void _dump_proctable(job_t *job)
{
	int node_inx, task_inx, taskid, max_task;
	MPIR_PROCDESC *tv;

	for (node_inx = 0; node_inx < job->nhosts; node_inx++) {
		/*
		 * Upper bound on entries examined for this node: every task
		 * of the job when overcommitting, else the node's cpu count.
		 * Without this bound the scan below could run past the end
		 * of tids[node_inx] when no zero entry follows the last id.
		 */
		if (opt.overcommit)
			max_task = opt.nprocs;
		else
			max_task = job->cpus[node_inx];

		for (task_inx = 0; task_inx < max_task; task_inx++) {
			taskid = job->tids[node_inx][task_inx];
			/*
			 * A zero task id after the first slot is treated as
			 * the end of this node's list (presumably unused
			 * slots are zero-filled; confirm against the code
			 * that allocates job->tids).
			 */
			if ((task_inx > 0) && (taskid == 0))
				break;
			tv = &MPIR_proctable[taskid];
			info("task:%d, host:%s, pid:%d",
			     taskid, tv->host_name, tv->pid);
		}
	}
}
void debugger_launch_failure(void) void debugger_launch_failure(void)
{ {
if (opt.parallel_debug) { if (opt.parallel_debug) {
...@@ -148,7 +172,7 @@ void MPIR_Breakpoint(void) ...@@ -148,7 +172,7 @@ void MPIR_Breakpoint(void)
{ {
debug("In MPIR_Breakpoint"); debug("In MPIR_Breakpoint");
/* This just notifies parallel /* This just notifies parallel
* debugger that some event of interest occured */ * debugger that some event of interest occurred */
} }
/* /*
......
...@@ -103,7 +103,7 @@ ...@@ -103,7 +103,7 @@
#define LONG_OPT_MPI 0x10c #define LONG_OPT_MPI 0x10c
#define LONG_OPT_CORE 0x10e #define LONG_OPT_CORE 0x10e
#define LONG_OPT_NOSHELL 0x10f #define LONG_OPT_NOSHELL 0x10f
#define LONG_OPT_DEBUG_TS 0x110
/*---- forward declarations of static functions ----*/ /*---- forward declarations of static functions ----*/
typedef struct env_vars env_vars_t; typedef struct env_vars env_vars_t;
...@@ -651,6 +651,7 @@ static void _opt_args(int argc, char **argv) ...@@ -651,6 +651,7 @@ static void _opt_args(int argc, char **argv)
{"max-exit-timeout", required_argument, 0, LONG_OPT_XTO}, {"max-exit-timeout", required_argument, 0, LONG_OPT_XTO},
{"uid", required_argument, 0, LONG_OPT_UID}, {"uid", required_argument, 0, LONG_OPT_UID},
{"gid", required_argument, 0, LONG_OPT_GID}, {"gid", required_argument, 0, LONG_OPT_GID},
{"debugger-test", no_argument, 0, LONG_OPT_DEBUG_TS},
{"help", no_argument, 0, LONG_OPT_HELP}, {"help", no_argument, 0, LONG_OPT_HELP},
{"usage", no_argument, 0, LONG_OPT_USAGE}, {"usage", no_argument, 0, LONG_OPT_USAGE},
{NULL, 0, 0, 0} {NULL, 0, 0, 0}
...@@ -882,6 +883,16 @@ static void _opt_args(int argc, char **argv) ...@@ -882,6 +883,16 @@ static void _opt_args(int argc, char **argv)
if (opt.egid == (gid_t) -1) if (opt.egid == (gid_t) -1)
fatal ("--gid=\"%s\" invalid", optarg); fatal ("--gid=\"%s\" invalid", optarg);
break; break;
case LONG_OPT_DEBUG_TS:
opt.debugger_test = true;
/* make other parameters look like debugger
* is really attached */
opt.parallel_debug = true;
MPIR_being_debugged = 1;
opt.max_launch_time = 120;
opt.max_threads = 1;
opt.msg_timeout = 15;
break;
case LONG_OPT_HELP: case LONG_OPT_HELP:
_help(); _help();
exit(0); exit(0);
......
...@@ -135,6 +135,7 @@ typedef struct srun_options { ...@@ -135,6 +135,7 @@ typedef struct srun_options {
bool quit_on_intr; /* --quit-on-interrupt, -q */ bool quit_on_intr; /* --quit-on-interrupt, -q */
int quiet; int quiet;
bool parallel_debug; /* srun controlled by debugger */ bool parallel_debug; /* srun controlled by debugger */
bool debugger_test; /* --debugger-test */
/* constraint options */ /* constraint options */
int mincpus; /* --mincpus=n */ int mincpus; /* --mincpus=n */
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment