From f9d8de6f623a0530710b50ab86b67fdf31e7f8b6 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Tue, 24 Aug 2010 22:00:29 +0000 Subject: [PATCH] revert try_lock code in controller.c since it can lead to starvation under heavy load --- src/slurmctld/controller.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 69c1f62e484..3f412fed948 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -1307,25 +1307,25 @@ static void *_slurmctld_background(void *no_data) break; } - if ((difftime(now, last_resv_time) >= 2) && - (try_lock_slurmctld(node_write_lock) == 0)) { + if (difftime(now, last_resv_time) >= 2) { last_resv_time = now; + lock_slurmctld(node_write_lock); set_node_maint_mode(); unlock_slurmctld(node_write_lock); } - if ((difftime(now, last_no_resp_msg_time) >= - no_resp_msg_interval) && - (try_lock_slurmctld(node_write_lock2) == 0)) { + if (difftime(now, last_no_resp_msg_time) >= + no_resp_msg_interval) { last_no_resp_msg_time = now; + lock_slurmctld(node_write_lock2); node_no_resp_msg(); unlock_slurmctld(node_write_lock2); } - if ((difftime(now, last_timelimit_time) >= PERIODIC_TIMEOUT) && - (try_lock_slurmctld(job_write_lock) == 0)) { + if (difftime(now, last_timelimit_time) >= PERIODIC_TIMEOUT) { last_timelimit_time = now; debug2("Testing job time limits and checkpoints"); + lock_slurmctld(job_write_lock); job_time_limit(); step_checkpoint(); unlock_slurmctld(job_write_lock); @@ -1334,8 +1334,8 @@ static void *_slurmctld_background(void *no_data) if (slurmctld_conf.health_check_interval && (difftime(now, last_health_check_time) >= slurmctld_conf.health_check_interval) && - is_ping_done() && - (try_lock_slurmctld(node_write_lock) == 0)) { + is_ping_done()) { + lock_slurmctld(node_write_lock); last_health_check_time = now; run_health_check(); #ifdef HAVE_CRAY_XT @@ -1344,12 +1344,11 @@ static void *_slurmctld_background(void *no_data) unlock_slurmctld(node_write_lock); } if (((difftime(now, last_ping_node_time) >= ping_interval) || - ping_nodes_now) && - is_ping_done() && - (try_lock_slurmctld(node_write_lock) == 0)) { + ping_nodes_now) && is_ping_done()) { ping_msg_sent = false; last_ping_node_time = now; ping_nodes_now = false; + lock_slurmctld(node_write_lock); ping_nodes(); unlock_slurmctld(node_write_lock); } else if ((difftime(now, last_ping_node_time) >= @@ -1366,10 +1365,10 @@ static void *_slurmctld_background(void *no_data) if (slurmctld_conf.inactive_limit && (difftime(now, last_ping_srun_time) >= - (slurmctld_conf.inactive_limit / 3)) && - (try_lock_slurmctld(job_read_lock) == 0)) { + (slurmctld_conf.inactive_limit / 3))) { last_ping_srun_time = now; debug2("Performing srun ping"); + lock_slurmctld(job_read_lock); srun_ping(); unlock_slurmctld(job_read_lock); } @@ -1379,21 +1378,21 @@ static void *_slurmctld_background(void *no_data) group_time = slurmctld_conf.group_info & GROUP_TIME_MASK; if (group_time && - (difftime(now, last_group_time) >= group_time) && - (try_lock_slurmctld(part_write_lock) == 0)) { + (difftime(now, last_group_time) >= group_time)) { if (slurmctld_conf.group_info & GROUP_FORCE) group_force = 1; else group_force = 0; last_group_time = now; + lock_slurmctld(part_write_lock); load_part_uid_allow_list(group_force); unlock_slurmctld(part_write_lock); } - if ((difftime(now, last_purge_job_time) >=purge_job_interval)&& - (try_lock_slurmctld(job_write_lock) == 0)) { + if (difftime(now, last_purge_job_time) >= purge_job_interval) { last_purge_job_time = now; debug2("Performing purge of old job records"); + lock_slurmctld(job_write_lock); purge_old_job(); unlock_slurmctld(job_write_lock); } -- GitLab