From 89500f1b18d0e9d083350691e62d2b90da9428b5 Mon Sep 17 00:00:00 2001
From: Danny Auble <da@schedmd.com>
Date: Wed, 15 Jan 2014 16:04:11 -0800
Subject: [PATCH] CRAY - Add node flag noting it is in a blade using Network
 performance counters

---
 slurm/slurm.h.in                      |  9 +++++-
 src/common/slurm_protocol_defs.c      |  8 +++++-
 src/plugins/select/cray/select_cray.c | 41 +++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 362e71b2e31..d763e945c21 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -584,7 +584,14 @@ enum node_states {
 };
 #define NODE_STATE_BASE    0x000f
 #define NODE_STATE_FLAGS   0xfff0
-#define NODE_STATE_RES     0x0020
+#define NODE_STATE_NET     0x0010 /* If a node is using Cray's
+                                   * Network Performance
+                                   * Counters but isn't in an
+                                   * allocation. */
+#define NODE_STATE_RES     0x0020 /* If a node is in a
+                                   * reservation (used primarily
+                                   * to note a node isn't idle
+                                   * for non-reservation jobs) */
 #define NODE_STATE_UNDRAIN 0x0040 /* Clear DRAIN flag for a node */
 #define NODE_STATE_CLOUD   0x0080 /* node comes from cloud */
 #define NODE_RESUME        0x0100 /* Restore a DRAINED, DRAINING, DOWN
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index e241abbe985..a2f3b3c7293 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -1506,6 +1506,7 @@ extern char *node_state_string(uint16_t inx)
 	bool drain_flag   = (inx & NODE_STATE_DRAIN);
 	bool fail_flag    = (inx & NODE_STATE_FAIL);
 	bool maint_flag   = (inx & NODE_STATE_MAINT);
+	bool net_flag     = (inx & NODE_STATE_NET);
 	bool res_flag     = (inx & NODE_STATE_RES);
 	bool resume_flag  = (inx & NODE_RESUME);
 	bool no_resp_flag = (inx & NODE_STATE_NO_RESPOND);
@@ -1585,6 +1586,8 @@
 			return "IDLE~";
 		if (no_resp_flag)
 			return "IDLE*";
+		if (net_flag)
+			return "NETPERF";
 		if (res_flag)
 			return "RESERVED";
 		return "IDLE";
@@ -1628,8 +1631,9 @@ extern char *node_state_string_compact(uint16_t inx)
 	bool drain_flag   = (inx & NODE_STATE_DRAIN);
 	bool fail_flag    = (inx & NODE_STATE_FAIL);
 	bool maint_flag   = (inx & NODE_STATE_MAINT);
-	bool resume_flag  = (inx & NODE_RESUME);
+	bool net_flag     = (inx & NODE_STATE_NET);
 	bool res_flag     = (inx & NODE_STATE_RES);
+	bool resume_flag  = (inx & NODE_RESUME);
 	bool no_resp_flag = (inx & NODE_STATE_NO_RESPOND);
 	bool power_down_flag = (inx & NODE_STATE_POWER_SAVE);
 	bool power_up_flag = (inx & NODE_STATE_POWER_UP);
@@ -1708,6 +1712,8 @@
 			return "IDLE~";
 		if (no_resp_flag)
 			return "IDLE*";
+		if (net_flag)
+			return "NPC";
 		if (res_flag)
 			return "RESV";
 		return "IDLE";
diff --git a/src/plugins/select/cray/select_cray.c b/src/plugins/select/cray/select_cray.c
index a727d0ed44d..9032eab8816 100644
--- a/src/plugins/select/cray/select_cray.c
+++ b/src/plugins/select/cray/select_cray.c
@@ -142,12 +142,14 @@ int bg_recover __attribute__((weak_import)) = NOT_FROM_CONTROLLER;
 slurmdb_cluster_rec_t *working_cluster_rec __attribute__((weak_import)) = NULL;
 struct node_record *node_record_table_ptr __attribute__((weak_import));
 int node_record_count __attribute__((weak_import));
+time_t last_node_update __attribute__((weak_import));
 #else
 slurm_ctl_conf_t slurmctld_conf;
 int bg_recover = NOT_FROM_CONTROLLER;
 slurmdb_cluster_rec_t *working_cluster_rec = NULL;
 struct node_record *node_record_table_ptr;
 int node_record_count;
+time_t last_node_update;
 #endif
 
 static blade_info_t *blade_array = NULL;
@@ -155,6 +157,7 @@ static bitstr_t *blades_running_jobs = NULL;
 static bitstr_t *blades_running_npc = NULL;
 static uint32_t blade_cnt = 0;
 static pthread_mutex_t blade_mutex = PTHREAD_MUTEX_INITIALIZER;
+static time_t last_npc_update;
 
 #ifdef HAVE_NATIVE_CRAY
 
@@ -743,6 +746,10 @@ static void _remove_job_from_blades(select_jobinfo_t *jobinfo)
 			bit_not(blades_running_jobs);
 		}
 	}
+
+	if (jobinfo->npc)
+		last_npc_update = time(NULL);
+
 	slurm_mutex_unlock(&blade_mutex);
 }
 
@@ -802,6 +809,9 @@ static void _set_job_running(struct job_record *job_ptr)
 				       node_bitmap);
 		}
 	}
+
+	if (jobinfo->npc)
+		last_npc_update = time(NULL);
 }
 
 /* job_write and blade_mutex must be locked before calling */
@@ -824,6 +834,9 @@ static void _set_job_running_restore(select_jobinfo_t *jobinfo)
 		if (jobinfo->npc)
 			bit_or(blades_running_npc, blade_array[i].node_bitmap);
 	}
+
+	if (jobinfo->npc)
+		last_npc_update = time(NULL);
 }
 
 static void *_job_fini(void *args)
@@ -1804,6 +1817,34 @@ unpack_error:
 
 extern int select_p_select_nodeinfo_set_all(void)
 {
+	int i;
+	static time_t last_set_all = 0;
+
+	/* Only set this when last_npc_update is newer than the
+	   last time we set things up. */
+	if (last_set_all && (last_npc_update-1 < last_set_all)) {
+		debug3("Node select info for set all hasn't "
+		       "changed since %ld",
+		       last_set_all);
+		return SLURM_NO_CHANGE_IN_DATA;
+	}
+	last_set_all = last_npc_update;
+
+	/* set this here so we know things have changed */
+	last_node_update = time(NULL);
+
+	slurm_mutex_lock(&blade_mutex);
+	/* clear all marks */
+	for (i=0; i<node_record_count; i++) {
+		struct node_record *node_ptr = &(node_record_table_ptr[i]);
+		if (bit_test(blades_running_npc, i))
+			node_ptr->node_state |= NODE_STATE_NET;
+		else
+			node_ptr->node_state &= (~NODE_STATE_NET);
+	}
+
+	slurm_mutex_unlock(&blade_mutex);
+
 	return other_select_nodeinfo_set_all();
 }
--
GitLab
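
For reviewers, a minimal standalone sketch of the state-flag logic this patch introduces. This is not SLURM code: the NODE_STATE_BASE/NET/RES values are copied from the slurm.h.in hunk above, while the NODE_STATE_IDLE value and idle_state_string() are simplified stand-ins assumed for the demo (the real base states live in enum node_states, and the real function is node_state_string()).

```c
/*
 * Standalone sketch (not part of the patch): shows how the new
 * NODE_STATE_NET flag composes with a base state in the upper bits
 * and why the patched node_state_string() tests it before
 * NODE_STATE_RES.
 */
#include <stdio.h>
#include <stdint.h>

#define NODE_STATE_IDLE 0x0002  /* assumed base-state value, demo only */
#define NODE_STATE_BASE 0x000f  /* from the patch */
#define NODE_STATE_NET  0x0010  /* blade running network perf counters */
#define NODE_STATE_RES  0x0020  /* node is in a reservation */

/* Trimmed stand-in for the IDLE branch of node_state_string():
 * NET wins over RES, so an idle node on an NPC blade reports
 * NETPERF even if it also sits in a reservation. */
static const char *idle_state_string(uint16_t inx)
{
	if (inx & NODE_STATE_NET)
		return "NETPERF";
	if (inx & NODE_STATE_RES)
		return "RESERVED";
	return "IDLE";
}

int main(void)
{
	/* Flags occupy the upper bits; masking recovers the base state. */
	uint16_t state = NODE_STATE_IDLE | NODE_STATE_NET | NODE_STATE_RES;

	printf("base state: 0x%04x\n",
	       (unsigned)(state & NODE_STATE_BASE));             /* 0x0002 */
	printf("displayed : %s\n", idle_state_string(state));    /* NETPERF */
	return 0;
}
```

The ordering matters because NODE_STATE_NET (0x0010) and NODE_STATE_RES (0x0020) are independent bits that can be set together; testing net_flag first is what makes an otherwise-idle NPC blade show NETPERF (NPC in the compact form) rather than RESERVED. The same flag pairing is what select_p_select_nodeinfo_set_all() maintains: it sets or clears NODE_STATE_NET across all nodes, and short-circuits with SLURM_NO_CHANGE_IN_DATA unless last_npc_update has advanced past the previous pass.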