diff --git a/NEWS b/NEWS
index 2a1c015964c5f4f0b9079b1e2464fb231989bff3..d3f0927562cb23b63b8ed81c1ae75e990d9059c4 100644
--- a/NEWS
+++ b/NEWS
@@ -34,6 +34,8 @@ documents those changes that are of interest to users and administrators.
     on device name instead of brand name.
  -- cli_filter - fix logic error with option lookup functions.
  -- Fix bad testing of NodeFeatures debug flag in contribs/cray.
+ -- Cleanup track_script code to avoid race conditions and invalid memory
+    access.
 
 * Changes in Slurm 19.05.1-2
 ============================
diff --git a/src/common/track_script.c b/src/common/track_script.c
index c8de12b96c660dffbff57e527fa2ac2fc3d121f8..9398534aead632b7eb227f759ff487b0a65c89d0 100644
--- a/src/common/track_script.c
+++ b/src/common/track_script.c
@@ -38,12 +38,31 @@
 #include <sys/time.h>
 #include <sys/wait.h>
 
+#include "slurm/slurm_errno.h"
+
 #include "src/common/macros.h"
 #include "src/common/xmalloc.h"
 #include "src/common/list.h"
 #include "src/common/track_script.h"
 
 static List track_script_thd_list = NULL;
+static pthread_mutex_t flush_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t flush_cond = PTHREAD_COND_INITIALIZER;
+static int flush_cnt = 0;
+
+typedef struct {
+	uint32_t job_id;
+	pid_t cpid;
+	pthread_t tid;
+	pthread_mutex_t timer_mutex;
+	pthread_cond_t timer_cond;
+} track_script_rec_t;
+
+typedef struct {
+	pthread_t tid;
+	int status;
+	bool rc;
+} foreach_broadcast_rec_t;
 
 static void _track_script_rec_destroy(void *arg)
 {
@@ -72,11 +91,12 @@ static void _kill_script(track_script_rec_t *r)
  * this will make the caller thread to finalize, so wait also for it to
  * avoid any zombies.
  */
-static void _track_script_rec_cleanup(track_script_rec_t *r)
+static void *_track_script_rec_cleanup(void *arg)
 {
 	int rc = 1;
 	struct timeval tvnow;
 	struct timespec abs;
+	track_script_rec_t *r = (track_script_rec_t *)arg;
 
 	debug("script for jobid=%u found running, tid=%lu, force ending.",
 	      r->job_id, (unsigned long)r->tid);
@@ -105,6 +125,18 @@ static void _track_script_rec_cleanup(track_script_rec_t *r)
 
 	pthread_cancel(r->tid);
 	pthread_join(r->tid, NULL);
+
+	slurm_mutex_lock(&flush_mutex);
+	flush_cnt++;
+	slurm_cond_signal(&flush_cond);
+	slurm_mutex_unlock(&flush_mutex);
+
+	return NULL;
+}
+
+static void _make_cleanup_thread(track_script_rec_t *r)
+{
+	slurm_thread_create_detached(NULL, _track_script_rec_cleanup, r);
 }
 
 static int _flush_job(void *r, void *x)
@@ -145,9 +177,7 @@ static int _reset_cpid(void *object, void *key)
 	((track_script_rec_t *)object)->cpid =
 		((track_script_rec_t *)key)->cpid;
 
-	/*
-	 * When we find the one we care about we can break out of the for_each
-	 */
+	/* Exit for_each after we found the one we care about. */
 	return -1;
 }
 
@@ -159,10 +189,38 @@ extern void track_script_init(void)
 
 extern void track_script_flush(void)
 {
+	int count;
+	List tmp_list = list_create(_track_script_rec_destroy);
+
+	/*
+	 * Transfer list within mutex and work off of copy to prevent race
+	 * condition of track_script_remove() removing track_script_rec_t while
+	 * in cleanup thread.
+	 */
+	slurm_mutex_lock(&flush_mutex);
+
+	list_transfer(tmp_list, track_script_thd_list);
+
+	count = list_count(tmp_list);
+	if (!count) {
+		slurm_mutex_unlock(&flush_mutex);
+		return;
+	}
+
+	flush_cnt = 0;
 	/* kill all scripts we are tracking */
-	(void) list_for_each(track_script_thd_list,
-			     (ListForF)_track_script_rec_cleanup, NULL);
-	(void) list_flush(track_script_thd_list);
+	(void) list_for_each(tmp_list,
+			     (ListForF)_make_cleanup_thread, NULL);
+
+	while (flush_cnt < count) {
+		slurm_cond_wait(&flush_cond, &flush_mutex);
+		debug("%s: got %d scripts out of %d flushed",
+		      __func__, flush_cnt, count);
+	}
+
+	FREE_NULL_LIST(tmp_list);
+	slurm_mutex_unlock(&flush_mutex);
+
 }
 
 extern void track_script_flush_job(uint32_t job_id)
@@ -177,8 +235,7 @@ extern void track_script_fini(void)
 	FREE_NULL_LIST(track_script_thd_list);
 }
 
-extern track_script_rec_t *track_script_rec_add(
-	uint32_t job_id, pid_t cpid, pthread_t tid)
+extern void track_script_rec_add(uint32_t job_id, pid_t cpid, pthread_t tid)
 {
 	track_script_rec_t *track_script_rec =
 		xmalloc(sizeof(track_script_rec_t));
@@ -189,25 +246,47 @@ extern track_script_rec_t *track_script_rec_add(
 	slurm_mutex_init(&track_script_rec->timer_mutex);
 	slurm_cond_init(&track_script_rec->timer_cond, NULL);
 	list_append(track_script_thd_list, track_script_rec);
-
-	return track_script_rec;
 }
 
-extern bool track_script_broadcast(track_script_rec_t *track_script_rec,
-				   int status)
+static int _script_broadcast(void *object, void *key)
 {
+	track_script_rec_t *track_script_rec = (track_script_rec_t *)object;
+	foreach_broadcast_rec_t *tmp_rec = (foreach_broadcast_rec_t *)key;
+
+	if (!_match_tid(object, &tmp_rec->tid))
+		return 0;
+
 	bool rc = false;
 
 	/* I was killed by slurmtrack, bail out right now */
 	slurm_mutex_lock(&track_script_rec->timer_mutex);
-	if (WIFSIGNALED(status) && (WTERMSIG(status) == SIGKILL)
-	    && track_script_rec->cpid == -1) {
+	if (WIFSIGNALED(tmp_rec->status) &&
+	    (WTERMSIG(tmp_rec->status) == SIGKILL) &&
+	    (track_script_rec->cpid == -1)) {
 		slurm_cond_broadcast(&track_script_rec->timer_cond);
 		rc = true;
 	}
 	slurm_mutex_unlock(&track_script_rec->timer_mutex);
 
-	return rc;
+	tmp_rec->rc = rc;
+
+	/* Exit for_each after we found the one we care about. */
+	return -1;
+}
+
+extern bool track_script_broadcast(pthread_t tid, int status)
+{
+	foreach_broadcast_rec_t tmp_rec;
+
+	memset(&tmp_rec, 0, sizeof(tmp_rec));
+	tmp_rec.tid = tid;
+	tmp_rec.status = status;
+
+	if (list_for_each(track_script_thd_list, _script_broadcast, &tmp_rec))
+		return tmp_rec.rc;
+
+	debug("%s: didn't find track_script for tid %ld", __func__, tid);
+	return true;
 }
 
 /* Remove this job from the list of jobs currently running a script */
diff --git a/src/common/track_script.h b/src/common/track_script.h
index 12aaebd7bd3ee67f271a79d991f59bf342d31de4..55e004bb70248b96c85d1fe389b8876a9b158319 100644
--- a/src/common/track_script.h
+++ b/src/common/track_script.h
@@ -41,14 +41,6 @@
 
 #include <inttypes.h>
 
-typedef struct {
-	uint32_t job_id;
-	pid_t cpid;
-	pthread_t tid;
-	pthread_mutex_t timer_mutex;
-	pthread_cond_t timer_cond;
-} track_script_rec_t;
-
 /* Init track_script system */
 extern void track_script_init(void);
 
@@ -67,16 +59,13 @@ extern void track_script_flush_job(uint32_t job_id);
  * IN cpid - If non-zero this track_script_rec_t will implicitly call
  *           track_script_add
  * IN tid - thread id of thread we are tracking
- * Returns track_script_rec_t
  */
-extern track_script_rec_t *track_script_rec_add(
-	uint32_t job_id, pid_t cpid, pthread_t tid);
+extern void track_script_rec_add(uint32_t job_id, pid_t cpid, pthread_t tid);
 
 /*
  * Signal script thread to end
 */
-extern bool track_script_broadcast(track_script_rec_t *track_script_rec,
-				   int status);
+extern bool track_script_broadcast(pthread_t tid, int status);
 
 /* Remove this thread from the track_script system */
 extern void track_script_remove(pthread_t tid);
diff --git a/src/plugins/burst_buffer/datawarp/burst_buffer_datawarp.c b/src/plugins/burst_buffer/datawarp/burst_buffer_datawarp.c
index 194d72fd12a57c694bdbafedf0ef125b8d22a4b8..6318cbc315fdd1871c9eb97aaff08573fbca0ba8 100644
--- a/src/plugins/burst_buffer/datawarp/burst_buffer_datawarp.c
+++ b/src/plugins/burst_buffer/datawarp/burst_buffer_datawarp.c
@@ -1574,8 +1574,7 @@ static void *_start_stage_in(void *x)
 	bb_job_t *bb_job;
 	bool get_real_size = false;
 	DEF_TIMERS;
-	track_script_rec_t *track_script_rec =
-		track_script_rec_add(stage_args->job_id, 0, pthread_self());
+	track_script_rec_add(stage_args->job_id, 0, pthread_self());
 
 	setup_argv = stage_args->args1;
 	data_in_argv = stage_args->args2;
@@ -1594,7 +1593,7 @@
 
 	info("%s: setup for job JobId=%u ran for %s",
 	     __func__, stage_args->job_id, TIME_STR);
-	if (track_script_broadcast(track_script_rec, status)) {
+	if (track_script_broadcast(pthread_self(), status)) {
 		/* I was killed by slurmtrack, bail out right now */
 		info("%s: setup for JobId=%u terminated by slurmctld",
 		     __func__, stage_args->job_id);
@@ -1603,10 +1602,7 @@
 		xfree(resp_msg);
 		xfree(stage_args->pool);
 		xfree(stage_args);
-		/*
-		 * Don't need to free track_script_rec here,
-		 * it is handled elsewhere since it still being tracked.
-		 */
+		track_script_remove(pthread_self());
 		return NULL;
 	}
 	track_script_reset_cpid(pthread_self(), 0);
@@ -1675,7 +1671,7 @@
 	END_TIMER;
 	info("%s: dws_data_in for JobId=%u ran for %s",
 	     __func__, stage_args->job_id, TIME_STR);
-	if (track_script_broadcast(track_script_rec, status)) {
+	if (track_script_broadcast(pthread_self(), status)) {
 		/* I was killed by slurmtrack, bail out right now */
 		info("%s: dws_data_in for JobId=%u terminated by slurmctld",
 		     __func__, stage_args->job_id);
@@ -1733,7 +1729,7 @@
 		    bb_state.bb_config.debug_flag)
 			info("%s: real_size ran for %s",
 			     __func__, TIME_STR);
-		if (track_script_broadcast(track_script_rec, status)) {
+		if (track_script_broadcast(pthread_self(), status)) {
 			/* I was killed by slurmtrack, bail out right now */
 			info("%s: real_size for JobId=%u terminated by slurmctld",
 			     __func__, stage_args->job_id);
@@ -1904,8 +1900,7 @@ static void *_start_stage_out(void *x)
 	bb_alloc_t *bb_alloc = NULL;
 	bb_job_t *bb_job = NULL;
 	DEF_TIMERS
-	track_script_rec_t *track_script_rec =
-		track_script_rec_add(stage_args->job_id, 0, pthread_self());
+	track_script_rec_add(stage_args->job_id, 0, pthread_self());
 
 	data_out_argv = stage_args->args1;
 	post_run_argv = stage_args->args2;
@@ -1927,7 +1922,7 @@
 		     __func__, stage_args->job_id, TIME_STR);
 	}
 
-	if (track_script_broadcast(track_script_rec, status)) {
+	if (track_script_broadcast(pthread_self(), status)) {
 		/* I was killed by slurmtrack, bail out right now */
 		info("%s: dws_post_run for JobId=%u terminated by slurmctld",
 		     __func__, stage_args->job_id);
@@ -1936,10 +1931,7 @@
 		xfree(resp_msg);
 		xfree(stage_args->pool);
 		xfree(stage_args);
-		/*
-		 * Don't need to free track_script_rec here,
-		 * it is handled elsewhere since it still being tracked.
-		 */
+		track_script_remove(pthread_self());
 		return NULL;
 	}
 	track_script_reset_cpid(pthread_self(), 0);
@@ -1992,7 +1984,7 @@
 		     __func__, stage_args->job_id, TIME_STR);
 	}
 
-	if (track_script_broadcast(track_script_rec, status)) {
+	if (track_script_broadcast(pthread_self(), status)) {
 		/* I was killed by slurmtrack, bail out right now */
 		info("%s: dws_data_out for JobId=%u terminated by slurmctld",
 		     __func__, stage_args->job_id);
@@ -2001,10 +1993,7 @@
 		xfree(resp_msg);
 		xfree(stage_args->pool);
 		xfree(stage_args);
-		/*
-		 * Don't need to free track_script_rec here,
-		 * it is handled elsewhere since it still being tracked.
-		 */
+		track_script_remove(pthread_self());
 		return NULL;
 	}
 	track_script_reset_cpid(pthread_self(), 0);
@@ -2165,9 +2154,7 @@ static void *_start_teardown(void *x)
 		NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
 	DEF_TIMERS;
 	bool hurry;
-	track_script_rec_t *track_script_rec =
-		track_script_rec_add(teardown_args->job_id, 0,
-				     pthread_self());
+	track_script_rec_add(teardown_args->job_id, 0, pthread_self());
 
 	teardown_argv = teardown_args->args1;
 
@@ -2188,17 +2175,14 @@
 	info("%s: teardown for JobId=%u ran for %s",
 	     __func__, teardown_args->job_id, TIME_STR);
 
-	if (track_script_broadcast(track_script_rec, status)) {
+	if (track_script_broadcast(pthread_self(), status)) {
 		/* I was killed by slurmtrack, bail out right now */
 		info("%s: teardown for JobId=%u terminated by slurmctld",
 		     __func__, teardown_args->job_id);
 		xfree(resp_msg);
 		free_command_argv(teardown_argv);
 		xfree(teardown_args);
-		/*
-		 * Don't need to free track_script_rec here,
-		 * it is handled elsewhere since it still being tracked.
-		 */
+		track_script_remove(pthread_self());
 		return NULL;
 	}
 	/* track_script_reset_cpid(pthread_self(), 0); */
@@ -4185,8 +4169,7 @@ static void *_start_pre_run(void *x)
 	uint32_t timeout;
 	bool hold_job = false, nodes_ready = false;
 	DEF_TIMERS;
-	track_script_rec_t *track_script_rec =
-		track_script_rec_add(pre_run_args->job_id, 0, pthread_self());
+	track_script_rec_add(pre_run_args->job_id, 0, pthread_self());
 
 	/* Wait for node boot to complete */
 	while (!nodes_ready) {
@@ -4216,17 +4199,14 @@
 			     &status);
 	END_TIMER;
 
-	if (track_script_broadcast(track_script_rec, status)) {
+	if (track_script_broadcast(pthread_self(), status)) {
 		/* I was killed by slurmtrack, bail out right now */
 		info("%s: dws_pre_run for JobId=%u terminated by slurmctld",
 		     __func__, pre_run_args->job_id);
 		xfree(resp_msg);
 		free_command_argv(pre_run_args->args);
 		xfree(pre_run_args);
-		/*
-		 * Don't need to free track_script_rec here,
-		 * it is handled elsewhere since it still being tracked.
-		 */
+		track_script_remove(pthread_self());
 		return NULL;
 	}
 	/* track_script_reset_cpid(pthread_self(), 0); */
@@ -4770,8 +4750,7 @@ static void *_create_persistent(void *x)
 	int i, status = 0;
 	uint32_t timeout;
 	DEF_TIMERS;
-	track_script_rec_t *track_script_rec =
-		track_script_rec_add(create_args->job_id, 0, pthread_self());
+	track_script_rec_add(create_args->job_id, 0, pthread_self());
 
 	script_argv = xcalloc(20, sizeof(char *));	/* NULL terminated */
 	script_argv[0] = xstrdup("dw_wlm_cli");
@@ -4815,16 +4794,13 @@
 		info("create_persistent of %s ran for %s",
 		     create_args->name, TIME_STR);
 
-	if (track_script_broadcast(track_script_rec, status)) {
+	if (track_script_broadcast(pthread_self(), status)) {
 		/* I was killed by slurmtrack, bail out right now */
 		info("%s: create_persistent for JobId=%u terminated by slurmctld",
 		     __func__, create_args->job_id);
 		xfree(resp_msg);
 		_free_create_args(create_args);
-		/*
-		 * Don't need to free track_script_rec here,
-		 * it is handled elsewhere since it still being tracked.
-		 */
+		track_script_remove(pthread_self());
 		return NULL;
 	}
 	/* track_script_reset_cpid(pthread_self(), 0); */
@@ -4940,8 +4916,7 @@ static void *_destroy_persistent(void *x)
 	int status = 0;
 	uint32_t timeout;
 	DEF_TIMERS;
-	track_script_rec_t *track_script_rec =
-		track_script_rec_add(destroy_args->job_id, 0, pthread_self());
+	track_script_rec_add(destroy_args->job_id, 0, pthread_self());
 
 	slurm_mutex_lock(&bb_state.bb_mutex);
 	bb_alloc = bb_find_name_rec(destroy_args->name, destroy_args->user_id,
@@ -4978,16 +4953,14 @@
 		info("destroy_persistent of %s ran for %s",
 		     destroy_args->name, TIME_STR);
 
-	if (track_script_broadcast(track_script_rec, status)) {
+	if (track_script_broadcast(pthread_self(), status)) {
 		/* I was killed by slurmtrack, bail out right now */
 		info("%s: destroy_persistent for JobId=%u terminated by slurmctld",
 		     __func__, destroy_args->job_id);
 		xfree(resp_msg);
 		_free_create_args(destroy_args);
-		/*
-		 * Don't need to free track_script_rec here,
-		 * it is handled elsewhere since it still being tracked.
-		 */
+
+		track_script_remove(pthread_self());
 		return NULL;
 	}
 	/* track_script_reset_cpid(pthread_self(), 0); */
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index 564309418285d770e6d44425a1ef4a7565d824fe..6dd62cd61ac0fd966aca7e15dac1402feca49f33 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -3824,7 +3824,6 @@ static void *_run_epilog(void *arg)
 	int i, status, wait_rc;
 	char *argv[2];
 	uint16_t tm;
-	track_script_rec_t *track_script_rec;
 
 	argv[0] = epilog_arg->epilog_slurmctld;
 	argv[1] = NULL;
@@ -3842,8 +3841,7 @@
 	}
 
 	/* Start tracking this new process */
-	track_script_rec = track_script_rec_add(epilog_arg->job_id, cpid,
-						pthread_self());
+	track_script_rec_add(epilog_arg->job_id, cpid, pthread_self());
 
 	/* Prolog and epilog use the same timeout */
 
@@ -3860,7 +3858,7 @@
 		}
 	}
 
-	if (track_script_broadcast(track_script_rec, status)) {
+	if (track_script_broadcast(pthread_self(), status)) {
 		info("epilog_slurmctld JobId=%u epilog killed by signal %u",
 		     epilog_arg->job_id, WTERMSIG(status));
 
@@ -3869,6 +3867,7 @@
 			xfree(epilog_arg->my_env[i]);
 		xfree(epilog_arg->my_env);
 		xfree(epilog_arg);
+		track_script_remove(pthread_self());
 		return NULL;
 	} else if (status != 0) {
 		error("epilog_slurmctld JobId=%u epilog exit status %u:%u",
@@ -4276,7 +4275,6 @@ static void *_run_prolog(void *arg)
 	time_t now = time(NULL);
 	uint16_t resume_timeout = slurm_get_resume_timeout();
 	uint16_t tm;
-	track_script_rec_t *track_script_rec;
 
 	lock_slurmctld(config_read_lock);
 	job_id = job_ptr->job_id;
@@ -4310,8 +4308,7 @@
 	}
 
 	/* Start tracking this new process */
-	track_script_rec = track_script_rec_add(job_ptr->job_id,
-						cpid, pthread_self());
+	track_script_rec_add(job_ptr->job_id, cpid, pthread_self());
 
 	tm = slurm_get_prolog_timeout();
 	while (1) {
@@ -4326,7 +4323,7 @@
 		}
 	}
 
-	if (track_script_broadcast(track_script_rec, status)) {
+	if (track_script_broadcast(pthread_self(), status)) {
 		info("prolog_slurmctld JobId=%u prolog killed by signal %u",
 		     job_id, WTERMSIG(status));
 
@@ -4335,6 +4332,7 @@
 			xfree(my_env[i]);
 		xfree(my_env);
 		FREE_NULL_BITMAP(node_bitmap);
+		track_script_remove(pthread_self());
 		return NULL;
 	} else if (status != 0) {
 		bool kill_job = false;
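Note on the synchronization pattern introduced above: track_script_flush() now transfers the tracked-script list under flush_mutex, spawns one detached cleanup thread per entry, and then waits on flush_cond until flush_cnt reaches the number of entries, so no cleanup thread can touch a record that track_script_remove() has already freed. Below is a minimal standalone sketch of that flush/condition-variable handshake, assuming plain pthreads in place of Slurm's slurm_mutex_, slurm_cond_ and slurm_thread_create_detached() wrappers; the names cleanup_worker and N_SCRIPTS are illustrative only and are not part of the patch.

/* Minimal sketch of the flush handshake used by track_script_flush();
 * plain pthreads stand in for the Slurm wrappers. */
#include <pthread.h>
#include <stdio.h>

#define N_SCRIPTS 4	/* stand-in for list_count(tmp_list) */

static pthread_mutex_t flush_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t flush_cond = PTHREAD_COND_INITIALIZER;
static int flush_cnt = 0;

/* Stand-in for _track_script_rec_cleanup(): do the per-script cleanup,
 * then report completion under flush_mutex so the waiter cannot miss it. */
static void *cleanup_worker(void *arg)
{
	(void) arg;
	/* ... pthread_cancel()/pthread_join() of the script thread here ... */

	pthread_mutex_lock(&flush_mutex);
	flush_cnt++;
	pthread_cond_signal(&flush_cond);
	pthread_mutex_unlock(&flush_mutex);
	return NULL;
}

int main(void)
{
	pthread_attr_t attr;
	pthread_t tid;
	int i;

	pthread_attr_init(&attr);
	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);

	pthread_mutex_lock(&flush_mutex);
	flush_cnt = 0;

	/* Stand-in for list_for_each(tmp_list, _make_cleanup_thread):
	 * one detached cleanup thread per tracked script. */
	for (i = 0; i < N_SCRIPTS; i++)
		pthread_create(&tid, &attr, cleanup_worker, NULL);

	/* Workers block on flush_mutex until pthread_cond_wait() releases
	 * it; the predicate is re-tested after every wakeup, so spurious
	 * wakeups are harmless. */
	while (flush_cnt < N_SCRIPTS)
		pthread_cond_wait(&flush_cond, &flush_mutex);
	pthread_mutex_unlock(&flush_mutex);

	printf("flushed %d of %d scripts\n", flush_cnt, N_SCRIPTS);
	return 0;
}

Because the waiter re-checks flush_cnt < count after every wakeup while holding flush_mutex, a lost or spurious wakeup cannot make track_script_flush() return before every cleanup thread has checked in.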