diff --git a/src/srun/Makefile.am b/src/srun/Makefile.am index 7659b6de51cef702adb455f4a58b1ecb9713063b..1aa928c589e672456163a06c6e1ae99429fdae96 100644 --- a/src/srun/Makefile.am +++ b/src/srun/Makefile.am @@ -12,7 +12,7 @@ bin_PROGRAMS = srun srun_SOURCES = srun.c opt.c env.c opt.h env.h job.c job.h net.c net.h \ msg.c msg.h io.c io.h launch.h launch.c attach.h \ - reattach.c reattach.h fname.h fname.c + reattach.c reattach.h fname.h fname.c srun.wrapper.c srun_LDADD = $(top_builddir)/src/common/libcommon.la \ $(top_builddir)/src/api/libslurm.la \ - $(POPT_LIBS) $(TOTALVIEW_LIBS) + $(POPT_LIBS) diff --git a/src/srun/attach.h b/src/srun/attach.h index 1512ad285a226a2dffb7fc81805c22067c80b38a..b9537e9fa5312e7b2915318195a7d94e577dd0fc 100644 --- a/src/srun/attach.h +++ b/src/srun/attach.h @@ -71,4 +71,12 @@ extern int MPIR_being_debugged; /* Cause extra info on internal state #define MPIR_DEBUG_SPAWNED 1 #define MPIR_DEBUG_ABORTING 2 +/* SLURM specific declarations */ +extern int MPIR_i_am_starter; +extern int MPIR_acquired_pre_main; +extern VOLATILE int MPIR_debug_gate; + +extern void MPIR_Breakpoint(void); +extern void tv_launch_failure(void); + #endif diff --git a/src/srun/msg.c b/src/srun/msg.c index d4a0bb9bb3f185eaabdda121de9d7bb0fb79b8ac..5989684d63da7e40a5be1e4e7db6e4c80f175744 100644 --- a/src/srun/msg.c +++ b/src/srun/msg.c @@ -34,6 +34,7 @@ #include <errno.h> #include <fcntl.h> +#include <string.h> #include <sys/poll.h> #include <time.h> @@ -87,22 +88,83 @@ static char * _taskid2hostname(int task_id, job_t * job); #ifdef HAVE_TOTALVIEW +/* Convert node name to address string, eg. 
"123.45.67.8", + * also return the index in the job table (-1 if not found); the caller owns the returned xmalloc'd string */ +static char * +_node_name_to_addr(const char *name, job_t *job, int *inx) +{ + int i; + char *buf; + char *colon; + + for (i=0; i<job->nhosts; i++) { + if (strcmp(name, job->host[i])) + continue; + buf = xmalloc(128); /* allocated only on a match; size must equal the length passed below */ + slurm_print_slurm_addr(&job->slurmd_addr[i], buf, 128); + /* This returns address:port, we need to remove ":port" */ + colon = strchr(buf, (int)':'); + if (colon) + colon[0] = '\0'; + *inx = i; + return buf; + } + + error("_node_name_to_addr error on %s", name); + *inx = -1; + return NULL; +} + static void -_build_tv_list(launch_tasks_response_msg_t *msg) +_build_tv_list(launch_tasks_response_msg_t *msg, job_t *job) { MPIR_PROCDESC * tv_tasks; - int i; + int i, node_inx, task_id; + char *node_addr; + static int tasks_recorded = 0; if (!opt.totalview) return; + node_addr = _node_name_to_addr(msg->node_name, job, &node_inx); + if ((node_addr == NULL) || (node_inx < 0)) + return; + + if (MPIR_proctable_size == 0) { + MPIR_proctable_size = opt.nprocs; + MPIR_proctable = xmalloc(sizeof(MPIR_PROCDESC) * opt.nprocs); + } + for (i=0; i<msg->count_of_pids; i++) { - tv_tasks = &MPIR_proctable[MPIR_proctable_size++]; - tv_tasks->host_name = msg->node_name; - tv_tasks->executable_name = opt.progname; + tasks_recorded++; + task_id = job->tids[node_inx][i]; + tv_tasks = &MPIR_proctable[task_id]; + tv_tasks->host_name = node_addr; /* proctable keeps node_addr, not msg->node_name */ + tv_tasks->executable_name = remote_argv[0]; tv_tasks->pid = msg->local_pids[i]; + debug("task=%d host=%s executable=%s pid=%d", task_id, + tv_tasks->host_name, tv_tasks->executable_name, + tv_tasks->pid); } - msg->node_name = NULL; /* nothing to free */ + + if (tasks_recorded == opt.nprocs) { + MPIR_debug_state = MPIR_DEBUG_SPAWNED; + MPIR_Breakpoint(); + } +} + +void tv_launch_failure(void) +{ + if (opt.totalview) { + MPIR_debug_state = MPIR_DEBUG_ABORTING; + MPIR_Breakpoint(); + } +} + +void MPIR_Breakpoint(void) +{ + debug("In MPIR_Breakpoint"); + /* This just 
notifies TotalView that some event of interest occurred */ } #endif @@ -118,7 +180,7 @@ _process_launch_resp(job_t *job, launch_tasks_response_msg_t *msg) job->host_state[msg->srun_node_id] = SRUN_HOST_REPLIED; pthread_mutex_unlock(&job->task_mutex); #ifdef HAVE_TOTALVIEW - _build_tv_list(msg); + _build_tv_list(msg, job); #endif if (_verbose) { hostlist_t pids = hostlist_create(NULL); @@ -133,9 +195,13 @@ _process_launch_resp(job_t *job, launch_tasks_response_msg_t *msg) verbose("%s: %s", msg->node_name, buf); } - } else + } else { error("launch resp from %s has bad task_id %d", msg->node_name, msg->srun_node_id); +#ifdef HAVE_TOTALVIEW + tv_launch_failure(); +#endif + } } static void @@ -182,6 +248,9 @@ _launch_handler(job_t *job, slurm_msg_t *resp) update_job_state(job, SRUN_JOB_FAILED); else update_failed_tasks(job, msg->srun_node_id); +#ifdef HAVE_TOTALVIEW + tv_launch_failure(); +#endif return; } else _process_launch_resp(job, msg); diff --git a/src/srun/opt.c b/src/srun/opt.c index 0d978d7b2c2e522cf11755373e781b3d9cb6f679..50053fd1c66a104303b86939d2405f0ea5c9d9b0 100644 --- a/src/srun/opt.c +++ b/src/srun/opt.c @@ -48,12 +48,23 @@ #include "src/common/xmalloc.h" #include "src/common/xstring.h" -#ifdef HAVE_TOTALVIEW -#include "src/srun/attach.h" -#endif #include "src/srun/env.h" #include "src/srun/opt.h" +#ifdef HAVE_TOTALVIEW +# include "src/srun/attach.h" + + /* Exactly one definition per MPIR_* global; qualifiers must match attach.h */ + MPIR_PROCDESC *MPIR_proctable; + int MPIR_proctable_size; + VOLATILE int MPIR_debug_state; + VOLATILE int MPIR_debug_gate; + char * MPIR_debug_abort_string; + int MPIR_being_debugged; + int MPIR_i_am_starter; + int MPIR_acquired_pre_main; +#endif + #define __DEBUG 0 /*---[ popt definitions ]------------------------------------------------*/ @@ -298,7 +309,7 @@ static bool _opt_verify(poptContext); static char * _base_name(char* command); #ifdef HAVE_TOTALVIEW -static bool _under_totalview(void); + static bool _under_totalview(void); #endif /* list known options and their settings 
@@ -1107,10 +1118,6 @@ void _opt_list() /* Determine if srun is under the control of a TotalView debugger or not */ static bool _under_totalview(void) { - if (MPIR_being_debugged) { - debug("Being executed under totalview"); - return true; - } else - return false; + return (MPIR_being_debugged != 0); } #endif diff --git a/src/srun/srun.c b/src/srun/srun.c index c301d2bfca59fdb8939781fa80209dcfcce74b11..d69ac3e9989f4a1be070a1bc5d643876aa8b6c37 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -67,6 +67,10 @@ #include "src/srun/msg.h" #include "src/srun/io.h" +#ifdef HAVE_TOTALVIEW +# include "src/srun/attach.h" +#endif + #define MAX_RETRIES 20 typedef resource_allocation_response_msg_t allocation_resp; @@ -126,7 +130,7 @@ _int_handler(int signal) { pthread_cancel(pthread_self());} int -main(int ac, char **av) +srun(int ac, char **av) { sigset_t sigset; allocation_resp *resp; @@ -380,6 +384,9 @@ _sig_kill_alloc(int signum) if (signum == SIGINT) { /* <Control-C> */ slurm_complete_job (job_id, 0, 0); +#ifdef HAVE_TOTALVIEW + tv_launch_failure(); /* sets MPIR_DEBUG_ABORTING and calls MPIR_Breakpoint() when opt.totalview is set, before exiting */ +#endif exit (0); } else if (signum < 0) job_id = (uint32_t) (0 - signum); /* kluge to pass job id */ @@ -443,6 +450,9 @@ _create_job_step(job_t *job) if (slurm_job_step_create(&req, &resp) || (resp == NULL)) { error("unable to create job step: %s", slurm_strerror(errno)); slurm_complete_job(job->jobid, 0, errno); +#ifdef HAVE_TOTALVIEW + tv_launch_failure(); /* sets MPIR_DEBUG_ABORTING and calls MPIR_Breakpoint() when opt.totalview is set, before exiting */ +#endif exit(1); } diff --git a/src/srun/srun.wrapper.c b/src/srun/srun.wrapper.c new file mode 100644 index 0000000000000000000000000000000000000000..886c127a1629081aace9f3b1bfae0e1f679e4353 --- /dev/null +++ b/src/srun/srun.wrapper.c @@ -0,0 +1,15 @@ +/* + * srun.wrapper.c - srun command wrapper for use with the TotalView debugger + * srun is the SLURM parallel job initiator and resource allocator; main() in this file only forwards argc/argv to srun() + * TotalView is a parallel job debugger from Etnus <http://www.etnus.com> + * + * Type "<ctrl-a>" to specify arguments for srun + * Type "g" to start the program + */ + 
+extern int srun(int argc, char **argv); /* the former main() of srun.c, renamed to srun() */ + +int main(int argc, char **argv) +{ + return srun(argc, argv); /* srun()'s return value becomes the process exit status */ +}