Skip to content
Snippets Groups Projects
Commit 411bb1a8 authored by Moe Jette's avatar Moe Jette
Browse files

If the checkpointed job and node states are out of sync, set a node's state to busy when a job is associated with it.
parent f47a0930
No related branches found
No related tags found
No related merge requests found
...@@ -956,6 +956,9 @@ read_slurm_conf (int recover) { ...@@ -956,6 +956,9 @@ read_slurm_conf (int recover) {
unlock_slurmctld (config_write_lock); unlock_slurmctld (config_write_lock);
return error_code; return error_code;
} }
if (recover) {
(void) sync_nodes_to_jobs ();
}
/* sort config_list by weight for scheduling */ /* sort config_list by weight for scheduling */
list_sort (config_list, &list_compare_config); list_sort (config_list, &list_compare_config);
...@@ -967,3 +970,45 @@ read_slurm_conf (int recover) { ...@@ -967,3 +970,45 @@ read_slurm_conf (int recover) {
unlock_slurmctld (config_write_lock); unlock_slurmctld (config_write_lock);
return SLURM_SUCCESS; return SLURM_SUCCESS;
} }
/*
 * sync_nodes_to_jobs - sync the node state to job states on slurmctld restart.
 *	we perform "lazy" updates on node states due to their number (assumes
 *	number of jobs is much smaller than the number of nodes). This routine
 *	walks job_list and, for every job that is not pending, complete, failed
 *	or timed out and that has a node bitmap, marks each node set in that
 *	bitmap as NODE_STATE_ALLOCATED no matter what the node's last saved
 *	state was. The NODE_STATE_NO_RESPOND flag is preserved if it was set.
 * output: returns count of nodes having state changed
 * globals: job_list - read
 *	node_record_table_ptr, node_record_count - node_state fields written
 */
int
sync_nodes_to_jobs (void)
{
	struct job_record *job_ptr;
	ListIterator job_record_iterator;
	int i, update_cnt = 0;
	uint16_t new_state;

	job_record_iterator = list_iterator_create (job_list);
	while ((job_ptr = (struct job_record *) list_next (job_record_iterator))) {
		/* only jobs that may currently hold nodes are of interest */
		if ((job_ptr->job_state == JOB_PENDING) ||
		    (job_ptr->job_state == JOB_COMPLETE) ||
		    (job_ptr->job_state == JOB_FAILED) ||
		    (job_ptr->job_state == JOB_TIMEOUT))
			continue;
		if (job_ptr->node_bitmap == NULL)
			continue;

		for (i = 0; i < node_record_count; i++) {
			if (bit_test (job_ptr->node_bitmap, i) == 0)
				continue;

			/* target state: allocated, keeping the no-respond flag */
			new_state = NODE_STATE_ALLOCATED;
			if (node_record_table_ptr[i].node_state & NODE_STATE_NO_RESPOND)
				new_state |= NODE_STATE_NO_RESPOND;

			/* count only real changes, so a node already marked
			 * allocated (with or without no-respond) is skipped */
			if (node_record_table_ptr[i].node_state == new_state)
				continue;	/* already in proper state */

			node_record_table_ptr[i].node_state = new_state;
			update_cnt++;
		}
	}
	/* release the iterator; it was previously leaked on every call */
	list_iterator_destroy (job_record_iterator);

	if (update_cnt)
		info ("sync_nodes_to_jobs updated state of %d nodes", update_cnt);
	return update_cnt;
}
...@@ -95,7 +95,8 @@ extern time_t last_node_update; /* time of last update to node records */ ...@@ -95,7 +95,8 @@ extern time_t last_node_update; /* time of last update to node records */
struct node_record { struct node_record {
uint32_t magic; /* magic cookie to test data integrity */ uint32_t magic; /* magic cookie to test data integrity */
char name[MAX_NAME_LEN]; /* name of the node. a null name indicates defunct node */ char name[MAX_NAME_LEN]; /* name of the node. a null name indicates defunct node */
uint16_t node_state; /* enum node_states, ORed with STATE_NO_RESPOND if down */ uint16_t node_state; /* enum node_states, ORed with
NODE_STATE_NO_RESPOND if not responding */
time_t last_response; /* last response from the node */ time_t last_response; /* last response from the node */
uint32_t cpus; /* actual count of cpus running on the node */ uint32_t cpus; /* actual count of cpus running on the node */
uint32_t real_memory; /* actual megabytes of real memory on the node */ uint32_t real_memory; /* actual megabytes of real memory on the node */
...@@ -448,10 +449,10 @@ void purge_old_job (void); ...@@ -448,10 +449,10 @@ void purge_old_job (void);
extern int read_slurm_conf (int recover); extern int read_slurm_conf (int recover);
/* rehash - build a hash table of the node_record entries */ /* rehash - build a hash table of the node_record entries */
extern void rehash (); extern void rehash (void);
/* reset_job_bitmaps - reestablish bitmaps for existing jobs */ /* reset_job_bitmaps - reestablish bitmaps for existing jobs */
extern void reset_job_bitmaps (); extern void reset_job_bitmaps (void);
/* rmdir2 - issues system call to rmdir (if root) */ /* rmdir2 - issues system call to rmdir (if root) */
extern int rmdir2 (char * path); extern int rmdir2 (char * path);
...@@ -475,10 +476,13 @@ extern void set_slurmd_addr (void); ...@@ -475,10 +476,13 @@ extern void set_slurmd_addr (void);
extern int step_create ( step_specs *step_specs, struct step_record** ); extern int step_create ( step_specs *step_specs, struct step_record** );
/* step_lock - lock the step information */ /* step_lock - lock the step information */
extern void step_lock (); extern void step_lock (void);
/* step_unlock - unlock the step information */ /* step_unlock - unlock the step information */
extern void step_unlock (); extern void step_unlock (void);
/* sync_nodes_to_jobs - sync the node state to job states on slurmctld restart */
extern int sync_nodes_to_jobs (void);
/* update_job - update a job's parameters per the supplied specification */ /* update_job - update a job's parameters per the supplied specification */
extern int update_job (job_desc_msg_t * job_specs); extern int update_job (job_desc_msg_t * job_specs);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment