From 4e8545b66e3879aa9f70bf9dcf77fcd801ae7e26 Mon Sep 17 00:00:00 2001 From: Didier GAZEN <didier.gazen@aero.obs-mip.fr> Date: Wed, 10 Jun 2015 16:33:53 -0700 Subject: [PATCH] Fix for node reboot/down state In your node_mgr fix to keep rebooted nodes down (commit 9cd15dfe96b54), you forgot to consider the case of nodes that are powered up but respond only after ResumeTimeout seconds (the maximum time permitted). Such nodes are marked DOWN (because they didn't respond within ResumeTimeout seconds) and should then become silently available when ReturnToService=1 (as stated in the slurm.conf manual). With your modification, when such nodes finally respond, they are seen as rebooted nodes and remain in the DOWN state (with the new reason: Node unexpectedly rebooted) even when ReturnToService=1! Correction of commit 3c2b46af01c50bd03c85235d1aaeb75acd62bb20 --- src/slurmctld/node_mgr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 76baa611946..afb68c4974c 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -2226,6 +2226,8 @@ extern int validate_node_specs(slurm_node_registration_status_msg_t *reg_msg, } if (IS_NODE_NO_RESPOND(node_ptr)) { + if (IS_NODE_POWER_UP(node_ptr)) + node_ptr->last_response = now; node_ptr->node_state &= (~NODE_STATE_NO_RESPOND); node_ptr->node_state &= (~NODE_STATE_POWER_UP); last_node_update = time (NULL); -- GitLab