diff --git a/src/srun/launch.c b/src/srun/launch.c
index 633a3cce72813185e8c5bc58aef4949710ce4c24..beb32631d855bb157bd35b0bc76e9816dc4d42db 100644
--- a/src/srun/launch.c
+++ b/src/srun/launch.c
@@ -204,6 +204,44 @@ static int _check_pending_threads(thd_t *thd, int count)
 	return 0;
 }
 
+/*
+ * Make launch threads detached unless srun is running under a parallel
+ * debugger. Detached threads seem to confuse TotalView specifically, so
+ * in that case the attribute is left untouched and the threads are
+ * reaped later by _join_attached_threads().
+ *
+ * attr - already-initialized thread attribute object to modify
+ */
+static void _set_attr_detached (pthread_attr_t *attr)
+{
+	int err;
+	if (opt.parallel_debug)
+		return;
+	if ((err = pthread_attr_setdetachstate(attr, PTHREAD_CREATE_DETACHED)))
+		error ("pthread_attr_setdetachstate: %s", slurm_strerror(err));
+	return;
+}
+
+/*
+ * Join the launch threads that _set_attr_detached() left joinable.
+ * This is a no-op unless running under a parallel debugger, because
+ * otherwise the threads are detached and must not be joined.
+ *
+ * nthreads - number of entries in th
+ * th       - array of launch thread descriptors
+ */
+static void _join_attached_threads (int nthreads, thd_t *th)
+{
+	int i, err;
+	if (!opt.parallel_debug)
+		return;
+	for (i = 0; i < nthreads; i++) {
+		if ((err = pthread_join(th[i].thread, NULL)))
+			error ("pthread_join: %s", slurm_strerror(err));
+	}
+	return;
+}
+
 static void
 _spawn_launch_thr(thd_t *th)
 {
@@ -211,9 +249,7 @@ static void _spawn_launch_thr(thd_t *th)
 	int err = 0;
 
 	slurm_attr_init (&attr);
-	err = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
-	if (err)
-		error ("pthread_attr_setdetachstate: %s", slurm_strerror(err));
+	_set_attr_detached (&attr);
 
 	err = pthread_create(&th->thread, &attr, _p_launch_task, (void *)th);
 	if (err) {
@@ -297,6 +333,12 @@ static void _p_launch(slurm_msg_t *req, job_t *job)
 	_wait_on_active(thd, job);
 	pthread_mutex_unlock(&active_mutex);
 
+	/*
+	 * Join the launch threads left joinable when running under a
+	 * parallel debugger (no-op otherwise)
+	 */
+	_join_attached_threads (job->nhosts, thd);
+
 	/*
 	 * xsignal_restore_mask(&set);
 	 * xsignal(SIGALRM, oldh);
diff --git a/src/srun/signals.c b/src/srun/signals.c
index 20035699a9fd46c2573bab5562692037fb941d9a..6947bcf7d0a4beb759be60866353182e5177319d 100644
--- a/src/srun/signals.c
+++ b/src/srun/signals.c
@@ -234,7 +234,17 @@ _sig_thr(void *arg)
 
 		xsignal_sigset_create(srun_sigarray, &set);
 
-		sigwait(&set, &signo);
+		/*
+		 * sigwait() returns 0 on success or a positive error
+		 * number on failure; it does not set errno. Store the
+		 * result in errno so that "%m" reports it.
+		 */
+		if ((errno = sigwait(&set, &signo)) != 0) {
+			if (errno != EINTR)
+				error ("sigwait: %m");
+			continue;
+		}
+
 		debug2("recvd signal %d", signo);
 		switch (signo) {
 		case SIGINT: