diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 51a84e74f66c45d3bcc197b02e27843544b178f0..852a3fbcca783adac5dde14a3a409c234ac49eae 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -70,6 +70,8 @@ #define BUF_SIZE 1024 /* Temporary buffer size */ #define DEFAULT_DAEMONIZE 1 /* Run as daemon by default if set */ #define DEFAULT_RECOVER 1 /* Recover state by default if set */ +#define MIN_CHECKIN_TIME 3 /* Nodes have this number of seconds to + * check-in before we ping them */ #define MAX_SERVER_THREADS 20 /* Max threads to service RPCs */ #define MEM_LEAK_TEST 0 /* Running memory leak test if set */ @@ -501,8 +503,11 @@ static void *_slurmctld_background(void *no_data) }; /* Let the dust settle before doing work */ - last_sched_time = last_checkpoint_time = last_timelimit_time = - last_ping_time = last_rpc_retry_time = time(NULL); + now = time(NULL); + last_sched_time = last_checkpoint_time = now; + last_timelimit_time = last_rpc_retry_time = now; + last_ping_time = now + (time_t)MIN_CHECKIN_TIME - + (time_t)slurmctld_conf.heartbeat_interval; (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); (void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); debug3("_slurmctld_background pid = %u", getpid()); diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 75ecf27518af231d54646dd727a703a7c9894da6..fcc25e22dd2e029b1610075a36584b797d9abc96 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -95,7 +95,8 @@ char * bitmap2node_name (bitstr_t *bitmap) char prefix[MAX_NAME_LEN], suffix[MAX_NAME_LEN]; char format[MAX_NAME_LEN], temp[MAX_NAME_LEN]; char last_prefix[MAX_NAME_LEN], last_suffix[MAX_NAME_LEN]; - int first_index = 0, last_index = 0, index, first_digits, last_digits; + int first_index = 0, last_index = 0, index; + int first_digits = 0, last_digits = 0; if (bitmap == NULL) { node_list_ptr = xmalloc (1); /* returns ptr to "\0" */ @@ -484,7 +485,7 @@ int load_all_node_state ( void ) node_ptr->cpus = cpus; node_ptr->real_memory = real_memory; node_ptr->tmp_disk = tmp_disk; - node_ptr->last_response = time (NULL); + node_ptr->last_response = (time_t) 0; } else { error ("Node %s has vanished from configuration", node_name); @@ -1258,7 +1259,8 @@ void ping_nodes (void) if (age < slurmctld_conf.heartbeat_interval) continue; - if ((age >= slurmctld_conf.slurmd_timeout) && + if ((node_record_table_ptr[i].last_response != (time_t)0) && + (age >= slurmctld_conf.slurmd_timeout) && (base_state != NODE_STATE_DOWN)) { error ("Node %s not responding, setting DOWN", node_record_table_ptr[i].name); @@ -1294,6 +1296,9 @@ void ping_nodes (void) } debug3 ("ping %s now", node_record_table_ptr[i].name); + if (node_record_table_ptr[i].last_response == (time_t)0) + node_record_table_ptr[i].last_response = now; + if ((ping_agent_args->node_count+1) > ping_buf_rec_size) { ping_buf_rec_size += 32; xrealloc ((ping_agent_args->slurm_addr), diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index d75e429e84c1a6a93e4d9f154372c304c2b57b78..538709067b7659b346a621e856e1a264c9ef820b 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -377,7 +377,7 @@ static int _parse_node_spec(char *in_line) if ((state_val != NO_VAL) && (state_val != NODE_STATE_UNKNOWN)) node_record_point->node_state = state_val; - node_record_point->last_response = time(NULL); + node_record_point->last_response = (time_t) 0; if (node_addr) this_node_addr = hostlist_shift(addr_list); else