diff --git a/NEWS b/NEWS index 050551dcb0bcb276b84f14ebfebcba90be114a1b..d97e473b06c978b9a5ab2c75767afe05a2d25cdd 100644 --- a/NEWS +++ b/NEWS @@ -12,6 +12,7 @@ documents those changes that are of interest to users and administrators. -- When resuming node only send one message to the slurmdbd. -- Modify srun --pty option to use configured SrunPortRange range. -- Fix issue with whole gres not being printed out with Slurm tools. + -- Fix issue preventing multiple jobs from a job array from starting. * Changes in Slurm 17.02.6 ========================== diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index 50500acc6b1cc10299273511df148ed837eff234..523b15d13d4876cdf30105fdb4a3037334e9c089 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -6449,8 +6449,8 @@ _remove_starting_step(uint16_t type, void *req) starting_step.job_id, starting_step.step_id); rc = SLURM_FAILURE; - } else - slurm_cond_broadcast(&conf->starting_steps_cond); + } + slurm_cond_broadcast(&conf->starting_steps_cond); fail: return rc; } @@ -6476,7 +6476,10 @@ static int _compare_starting_steps(void *listentry, void *key) static int _wait_for_starting_step(uint32_t job_id, uint32_t step_id) { - pthread_mutex_t dummy_lock = PTHREAD_MUTEX_INITIALIZER; + static pthread_mutex_t dummy_lock = PTHREAD_MUTEX_INITIALIZER; + struct timespec ts = {0, 0}; + struct timeval now; + starting_step_t starting_step; int num_passes = 0; @@ -6496,8 +6499,13 @@ static int _wait_for_starting_step(uint32_t job_id, uint32_t step_id) } num_passes++; + gettimeofday(&now, NULL); + ts.tv_sec = now.tv_sec+1; + ts.tv_nsec = now.tv_usec * 1000; + slurm_mutex_lock(&dummy_lock); - slurm_cond_wait(&conf->starting_steps_cond, &dummy_lock); + slurm_cond_timedwait(&conf->starting_steps_cond, + &dummy_lock, &ts); slurm_mutex_unlock(&dummy_lock); } if (num_passes > 0) { @@ -6560,8 +6568,7 @@ static void _remove_job_running_prolog(uint32_t job_id) if (!list_delete_all(conf->prolog_running_jobs, _match_jobid, &job_id))
error("_remove_job_running_prolog: job not found"); - else - slurm_cond_broadcast(&conf->prolog_running_cond); + slurm_cond_broadcast(&conf->prolog_running_cond); } static int _match_jobid(void *listentry, void *key) @@ -6584,13 +6591,21 @@ static int _prolog_is_running (uint32_t jobid) /* Wait for the job's prolog to complete */ static void _wait_for_job_running_prolog(uint32_t job_id) { - pthread_mutex_t dummy_lock = PTHREAD_MUTEX_INITIALIZER; + static pthread_mutex_t dummy_lock = PTHREAD_MUTEX_INITIALIZER; + struct timespec ts = {0, 0}; + struct timeval now; debug( "Waiting for job %d's prolog to complete", job_id); while (_prolog_is_running (job_id)) { + + gettimeofday(&now, NULL); + ts.tv_sec = now.tv_sec+1; + ts.tv_nsec = now.tv_usec * 1000; + slurm_mutex_lock(&dummy_lock); - slurm_cond_wait(&conf->prolog_running_cond, &dummy_lock); + slurm_cond_timedwait(&conf->prolog_running_cond, + &dummy_lock, &ts); slurm_mutex_unlock(&dummy_lock); }