diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 362e71b2e3191eb61a9f451757e61517fbb9f992..d763e945c216ea443f6a1fad4c5a51504d75f5d7 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -584,7 +584,14 @@ enum node_states { }; #define NODE_STATE_BASE 0x000f #define NODE_STATE_FLAGS 0xfff0 -#define NODE_STATE_RES 0x0020 +#define NODE_STATE_NET 0x0010 /* If a node is using Cray's + * Network Performance + * Counters but isn't in an + * allocation. */ +#define NODE_STATE_RES 0x0020 /* If a node is in a + * reservation (used primarily + * to note a node isn't idle + * for non-reservation jobs) */ #define NODE_STATE_UNDRAIN 0x0040 /* Clear DRAIN flag for a node */ #define NODE_STATE_CLOUD 0x0080 /* node comes from cloud */ #define NODE_RESUME 0x0100 /* Restore a DRAINED, DRAINING, DOWN diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index e241abbe985358245a659079b31bb3c6bdf5e77d..a2f3b3c72939087cb71b6fc452cf10d0637d2349 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -1506,6 +1506,7 @@ extern char *node_state_string(uint16_t inx) bool drain_flag = (inx & NODE_STATE_DRAIN); bool fail_flag = (inx & NODE_STATE_FAIL); bool maint_flag = (inx & NODE_STATE_MAINT); + bool net_flag = (inx & NODE_STATE_NET); bool res_flag = (inx & NODE_STATE_RES); bool resume_flag = (inx & NODE_RESUME); bool no_resp_flag = (inx & NODE_STATE_NO_RESPOND); @@ -1585,6 +1586,8 @@ extern char *node_state_string(uint16_t inx) return "IDLE~"; if (no_resp_flag) return "IDLE*"; + if (net_flag) + return "NETPERF"; if (res_flag) return "RESERVED"; return "IDLE"; @@ -1628,8 +1631,9 @@ extern char *node_state_string_compact(uint16_t inx) bool drain_flag = (inx & NODE_STATE_DRAIN); bool fail_flag = (inx & NODE_STATE_FAIL); bool maint_flag = (inx & NODE_STATE_MAINT); - bool resume_flag = (inx & NODE_RESUME); + bool net_flag = (inx & NODE_STATE_NET); bool res_flag = (inx & NODE_STATE_RES); + bool resume_flag = (inx & NODE_RESUME); bool
no_resp_flag = (inx & NODE_STATE_NO_RESPOND); bool power_down_flag = (inx & NODE_STATE_POWER_SAVE); bool power_up_flag = (inx & NODE_STATE_POWER_UP); @@ -1708,6 +1712,8 @@ extern char *node_state_string_compact(uint16_t inx) return "IDLE~"; if (no_resp_flag) return "IDLE*"; + if (net_flag) + return "NPC"; if (res_flag) return "RESV"; return "IDLE"; diff --git a/src/plugins/select/cray/select_cray.c b/src/plugins/select/cray/select_cray.c index a727d0ed44db12717a67ef03cbf5850585d57bf6..9032eab88162e302b9ea49ce63dc0119354534a0 100644 --- a/src/plugins/select/cray/select_cray.c +++ b/src/plugins/select/cray/select_cray.c @@ -142,12 +142,14 @@ int bg_recover __attribute__((weak_import)) = NOT_FROM_CONTROLLER; slurmdb_cluster_rec_t *working_cluster_rec __attribute__((weak_import)) = NULL; struct node_record *node_record_table_ptr __attribute__((weak_import)); int node_record_count __attribute__((weak_import)); +time_t last_node_update __attribute__((weak_import)); #else slurm_ctl_conf_t slurmctld_conf; int bg_recover = NOT_FROM_CONTROLLER; slurmdb_cluster_rec_t *working_cluster_rec = NULL; struct node_record *node_record_table_ptr; int node_record_count; +time_t last_node_update; #endif static blade_info_t *blade_array = NULL; @@ -155,6 +157,7 @@ static bitstr_t *blades_running_jobs = NULL; static bitstr_t *blades_running_npc = NULL; static uint32_t blade_cnt = 0; static pthread_mutex_t blade_mutex = PTHREAD_MUTEX_INITIALIZER; +static time_t last_npc_update; #ifdef HAVE_NATIVE_CRAY @@ -743,6 +746,10 @@ static void _remove_job_from_blades(select_jobinfo_t *jobinfo) bit_not(blades_running_jobs); } } + + if (jobinfo->npc) + last_npc_update = time(NULL); + slurm_mutex_unlock(&blade_mutex); } @@ -802,6 +809,9 @@ static void _set_job_running(struct job_record *job_ptr) node_bitmap); } } + + if (jobinfo->npc) + last_npc_update = time(NULL); } /* job_write and blade_mutex must be locked before calling */ @@ -824,6 +834,9 @@ static void _set_job_running_restore(select_jobinfo_t 
*jobinfo) if (jobinfo->npc) bit_or(blades_running_npc, blade_array[i].node_bitmap); } + + if (jobinfo->npc) + last_npc_update = time(NULL); } static void *_job_fini(void *args) @@ -1804,6 +1817,34 @@ unpack_error: extern int select_p_select_nodeinfo_set_all(void) { + int i; + static time_t last_set_all = 0; + + /* only set this once when last_npc_update is newer than + the last time we set things up. */ + if (last_set_all && (last_npc_update-1 < last_set_all)) { + debug3("Node select info for set all hasn't " + "changed since %ld", + (long) last_set_all); + return SLURM_NO_CHANGE_IN_DATA; + } + last_set_all = last_npc_update; + + /* set this here so we know things have changed */ + last_node_update = time(NULL); + + slurm_mutex_lock(&blade_mutex); + /* set or clear the NODE_STATE_NET mark on every node */ + for (i=0; i<node_record_count; i++) { + struct node_record *node_ptr = &(node_record_table_ptr[i]); + if (bit_test(blades_running_npc, i)) + node_ptr->node_state |= NODE_STATE_NET; + else + node_ptr->node_state &= (~NODE_STATE_NET); + } + + slurm_mutex_unlock(&blade_mutex); + return other_select_nodeinfo_set_all(); }