diff --git a/src/srun/launch.c b/src/srun/launch.c index b466bca8fb3f32192b548118732b39764c7ed4dd..c7b4f6016bb2a1a779fa6ebf8c4d3ad8ea3b4886 100644 --- a/src/srun/launch.c +++ b/src/srun/launch.c @@ -51,9 +51,10 @@ extern char **environ; static pthread_mutex_t active_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t active_cond = PTHREAD_COND_INITIALIZER; static int active = 0; +static int joinable = 0; static int fail_launch_cnt = 0; -typedef enum {DSH_NEW, DSH_ACTIVE, DSH_DONE, DSH_FAILED} state_t; +typedef enum {DSH_NEW, DSH_ACTIVE, DSH_DONE, DSH_FAILED, DSH_JOINED} state_t; typedef struct task_info { slurm_msg_t *req; @@ -208,23 +209,33 @@ static int _check_pending_threads(thd_t *thd, int count) static void _set_attr_detached (pthread_attr_t *attr) { int err; - if (!opt.parallel_debug) + if (opt.parallel_debug) { return; + } if ((err = pthread_attr_setdetachstate(attr, PTHREAD_CREATE_DETACHED))) error ("pthread_attr_setdetachstate: %s", slurm_strerror(err)); return; } + +/* + * Need to join with all attached threads if running + * under parallel debugger + */ static void _join_attached_threads (int nthreads, thd_t *th) { int i; void *retval; - if (!opt.parallel_debug) - return; - for (i = 0; i < nthreads; i++) { - if (th[i].thread != (pthread_t) NULL) - pthread_join (th[i].thread, &retval); + if (opt.parallel_debug) { + for (i = 0; i < nthreads; i++) { + if (th[i].thread != (pthread_t) NULL + && th[i].state == DSH_DONE) { + pthread_join (th[i].thread, &retval); + th[i].state = DSH_JOINED; + } + } } + return; } @@ -302,7 +313,8 @@ static void _p_launch(slurm_msg_t *req, srun_job_t *job) pthread_mutex_lock(&active_mutex); while (active >= opt.max_threads || rc < 0) rc = _wait_on_active(thd, job); - + if (joinable >= (opt.max_threads/2)) + _join_attached_threads(job->nhosts, thd); active++; pthread_mutex_unlock(&active_mutex); @@ -320,10 +332,6 @@ static void _p_launch(slurm_msg_t *req, srun_job_t *job) _wait_on_active(thd, job); pthread_mutex_unlock(&active_mutex); - /* - * Need to join with all attached threads if running - * under parallel debugger - */ _join_attached_threads (job->nhosts, thd); /* @@ -454,10 +462,11 @@ static void * _p_launch_task(void *arg) } else _update_contacted_node(job, nodeid); - pthread_mutex_lock(&active_mutex); th->state = DSH_DONE; active--; + if (opt.parallel_debug) + joinable++; fail_launch_cnt += failure; pthread_cond_signal(&active_cond); pthread_mutex_unlock(&active_mutex);