Skip to content
Snippets Groups Projects
Commit dc5cf7cb authored by Moe Jette
Browse files

Load HeartbeatInterval from config file (was ignored).

Remove non-agent-based credential revocation.
Minor clean-up of the code that handles non-responsive nodes.
parent ea597062
No related branches found
No related tags found
No related merge requests found
......@@ -1152,16 +1152,21 @@ ping_nodes (void)
debug3 ("ping %s now", node_record_table_ptr[i].name);
if ((agent_args->addr_count+1) > buf_rec_size) {
buf_rec_size += 32;
xrealloc ((agent_args->slurm_addr), (sizeof (struct sockaddr_in) * buf_rec_size));
xrealloc ((agent_args->node_names), (MAX_NAME_LEN * buf_rec_size));
xrealloc ((agent_args->slurm_addr),
(sizeof (struct sockaddr_in) * buf_rec_size));
xrealloc ((agent_args->node_names),
(MAX_NAME_LEN * buf_rec_size));
}
agent_args->slurm_addr[agent_args->addr_count] = node_record_table_ptr[i].slurm_addr;
agent_args->slurm_addr[agent_args->addr_count] =
node_record_table_ptr[i].slurm_addr;
strncpy (&agent_args->node_names[MAX_NAME_LEN*agent_args->addr_count],
node_record_table_ptr[i].name, MAX_NAME_LEN);
agent_args->addr_count++;
if (age >= slurmctld_conf.slurmd_timeout) {
error ("node %s not responding", node_record_table_ptr[i].name);
if ((age >= slurmctld_conf.slurmd_timeout) &&
(node_record_table_ptr[i].node_state != NODE_STATE_DOWN)) {
error ("Node %s not responding, setting DOWN",
node_record_table_ptr[i].name);
last_node_update = time (NULL);
bit_clear (up_node_bitmap, i);
bit_clear (idle_node_bitmap, i);
......@@ -1174,6 +1179,7 @@ ping_nodes (void)
xfree (agent_args);
return;
}
debug ("Spawning ping agent");
if (pthread_attr_init (&attr_agent))
fatal ("pthread_attr_init error %m");
......
......@@ -42,7 +42,6 @@
#include <src/slurmctld/agent.h>
#include <src/slurmctld/slurmctld.h>
#define AGENT_TEST 1
#define BUF_SIZE 1024
struct node_set { /* set of nodes with same configuration */
......@@ -115,7 +114,6 @@ deallocate_nodes (struct job_record * job_ptr)
{
int i;
revoke_credential_msg_t *revoke_job_cred;
#if AGENT_TEST
agent_arg_t *agent_args;
pthread_attr_t attr_agent;
pthread_t thread_agent;
......@@ -123,7 +121,6 @@ deallocate_nodes (struct job_record * job_ptr)
agent_args = xmalloc (sizeof (agent_arg_t));
agent_args->msg_type = REQUEST_REVOKE_JOB_CREDENTIAL;
#endif
revoke_job_cred = xmalloc (sizeof (revoke_credential_msg_t));
last_node_update = time (NULL);
revoke_job_cred->job_id = job_ptr->job_id;
......@@ -133,7 +130,6 @@ deallocate_nodes (struct job_record * job_ptr)
for (i = 0; i < node_record_count; i++) {
if (bit_test (job_ptr->node_bitmap, i) == 0)
continue;
#if AGENT_TEST
if ((agent_args->addr_count+1) > buf_rec_size) {
buf_rec_size += 32;
xrealloc ((agent_args->slurm_addr), (sizeof (struct sockaddr_in) * buf_rec_size));
......@@ -143,14 +139,10 @@ deallocate_nodes (struct job_record * job_ptr)
strncpy (&agent_args->node_names[MAX_NAME_LEN*agent_args->addr_count],
node_record_table_ptr[i].name, MAX_NAME_LEN);
agent_args->addr_count++;
#else
slurm_revoke_job_cred (&node_record_table_ptr[i], revoke_job_cred);
#endif
node_record_table_ptr[i].node_state = NODE_STATE_IDLE;
bit_set (idle_node_bitmap, i);
}
#if AGENT_TEST
agent_args->msg_args = revoke_job_cred;
debug ("Spawning revoke credential agent");
if (pthread_attr_init (&attr_agent))
......@@ -167,9 +159,6 @@ deallocate_nodes (struct job_record * job_ptr)
if (pthread_create (&thread_agent, &attr_agent, agent, (void *)agent_args))
fatal ("pthread_create error %m");
}
#else
xfree (revoke_job_cred);
#endif
return;
}
......
......@@ -297,6 +297,9 @@ parse_config_spec (char *in_line)
if ( hash_base )
slurmctld_conf.hash_base = hash_base;
if ( heartbeat_interval )
slurmctld_conf.heartbeat_interval = heartbeat_interval;
if ( kill_wait )
slurmctld_conf.kill_wait = kill_wait;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment