From 03a92da4ac02313d0cade8ad3935a57379b714a8 Mon Sep 17 00:00:00 2001 From: jette <jette@schedmd.com> Date: Thu, 11 Jul 2013 13:13:56 -0700 Subject: [PATCH] Minor restructuring of process signaling logic with deferral for core The most significant change is that we don't signal individual processes unless there are some processes in the step which are currently core dumping. This change signals the processes closer together in time when possible. --- src/slurmd/common/proctrack.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/slurmd/common/proctrack.c b/src/slurmd/common/proctrack.c index 30f7f573933..5d170548aa9 100644 --- a/src/slurmd/common/proctrack.c +++ b/src/slurmd/common/proctrack.c @@ -264,7 +264,6 @@ static void *_sig_agent(void *args) int i, npids = 0, hung_pids = 0; char *stat_fname = NULL; - sleep(5); if (slurm_container_get_pids(agent_arg_ptr->cont_id, &pids, &npids) == SLURM_SUCCESS) { hung_pids = 0; @@ -277,14 +276,15 @@ static void *_sig_agent(void *args) (int) pids[i]); hung_pids++; } else { + /* Kill processes that we can now */ kill(pids[i], agent_arg_ptr->signal); - pids[i] = 0; } xfree(stat_fname); } } if (hung_pids == 0) break; + sleep(5); } (void) (*(ops.signal)) (agent_arg_ptr->cont_id, agent_arg_ptr->signal); @@ -347,7 +347,7 @@ extern int slurm_container_signal(uint64_t cont_id, int signal) (int) pids[i]); hung_pids++; } else { - kill(pids[i], signal); + /* Don't test this PID again */ pids[i] = 0; } xfree(stat_fname); @@ -357,8 +357,8 @@ extern int slurm_container_signal(uint64_t cont_id, int signal) } xfree(pids); if (hung_pids) { - info("Defering sending signal to processes " - "currently core dumping"); + info("Defering sending signal, processes in " + "job are currently core dumping"); _spawn_signal_thread(cont_id, signal); return SLURM_SUCCESS; } -- GitLab