diff --git a/contribs/cray/capmc_resume.c b/contribs/cray/capmc_resume.c index b6496fb7058214cced98bc6cab81c5a6b5ded47c..1d84b80f8933f58c3de6f8559e2e23594ea83f6e 100644 --- a/contribs/cray/capmc_resume.c +++ b/contribs/cray/capmc_resume.c @@ -416,7 +416,9 @@ static int _update_all_nodes(char *host_list) } error("%s: capmc(%s,%s,%s): %d %s", prog_name, argv[1], argv[2], argv[3], status, resp_msg); - if (resp_msg && strstr(resp_msg, "Could not lookup") && + if (resp_msg && + (strstr(resp_msg, "Could not lookup") || + strstr(resp_msg, "Internal server error")) && (retry <= capmc_retries)) { /* State Manager is down. Sleep and retry */ sleep(1); diff --git a/src/plugins/node_features/knl_cray/node_features_knl_cray.c b/src/plugins/node_features/knl_cray/node_features_knl_cray.c index 44ba897331fe278fedc37708499f51d660c8fb19..994172fc324bb8a2ac664fcbcd655eecba4dedc7 100644 --- a/src/plugins/node_features/knl_cray/node_features_knl_cray.c +++ b/src/plugins/node_features/knl_cray/node_features_knl_cray.c @@ -232,6 +232,8 @@ typedef struct numa_cfg2 { char *numa_cfg; } numa_cfg2_t; +static void _check_node_disabled(void); +static void _check_node_status(void); static s_p_hashtbl_t *_config_make_tbl(char *filename); static void _free_script_argv(char **script_argv); static mcdram_cap_t *_json_parse_mcdram_cap_array(json_object *jobj, char *key, @@ -1751,7 +1753,7 @@ extern int node_features_p_reconfig(void) return SLURM_SUCCESS; } -/* Put any node NOT found by "capmc node_status" into DRAIN state */ +/* Put any nodes NOT found by "capmc node_status" into DRAIN state */ static void _check_node_status(void) { json_object *j_obj; @@ -1861,6 +1863,33 @@ static void _check_node_status(void) } } +/* Put any disabled nodes into DRAIN state */ +static void _check_node_disabled(void) +{ +/* FIXME: To be added + * + * STEP 0 (for testing), disable/enable nodes: + * > xtcli disable ${TARGET_NODE} + * > xtcli enable ${TARGET_NODE} + * + * STEP 1: Identify disabled compute nodes + * > xtshow --compute --disabled + * L1s ... + * L0s ... + * Nodes ... + * c0-0c0s7n0: -| disabled [noflags|] + * SeaStars ... + * Links ... + * c1-0c2s1s1l1: -| disabled [noflags|] + * + * STEP 2: Map cname to nid name + * > rtr -Im ${TARGET_BLADE} + * + * STEP 3: Drain the disabled compute nodes + * See logic in _check_node_status() above. + */ +} + /* Update only the current MCDRAM and NUMA mode for identified nodes */ static int _update_current_mode(char *node_list) { @@ -1930,7 +1959,8 @@ extern int node_features_p_get_node(char *node_list) } slurm_mutex_unlock(&config_mutex); - _check_node_status(); /* Flag nodes not found by capmc */ + _check_node_status(); /* Drain nodes not found by capmc */ + _check_node_disabled(); /* Drain disabled nodes */ if (mcdram_per_node && node_list && /* Selected node updated and */ (mcdram_pct[0] != -1)) /* have needd global info */ diff --git a/testsuite/expect/test1.113 b/testsuite/expect/test1.113 index 1a4a98d4359609ed2503f974e3a225008d379002..0e852e2be57ff2d5df2eb8f3f28d9127bcd0566a 100755 --- a/testsuite/expect/test1.113 +++ b/testsuite/expect/test1.113 @@ -73,7 +73,7 @@ if {$matches < $needed_nodes} { set nodes 0 set timeout $max_job_delay -set srun_pid [spawn $srun -p $partition -N1-2 --use-min-nodes -n$ncpus --ntasks-per-thread=1--cpu-bind=thread $bin_printenv SLURM_NNODES] +set srun_pid [spawn $srun -p $partition -N1-2 --use-min-nodes -n$ncpus $bin_printenv SLURM_NNODES] expect { -re "($number)" { set nodes $expect_out(1,string) diff --git a/testsuite/expect/test15.39 b/testsuite/expect/test15.39 index a7e6218beddf6b2f5c573b645e5f93393f1d52f7..de2d751a98d1c8ed4694e03ee84232878254b67b 100755 --- a/testsuite/expect/test15.39 +++ b/testsuite/expect/test15.39 @@ -73,7 +73,7 @@ if {$matches < $needed_nodes} { set job_id 0 set timeout $max_job_delay -set salloc_pid [spawn $salloc -p $partition -N1-2 --use-min-nodes -n$ncpus --hint=nomultithread $bin_printenv SLURM_NNODES] +set salloc_pid [spawn $salloc -p $partition -N1-2 --use-min-nodes -n$ncpus $bin_printenv SLURM_NNODES] #set salloc_pid [spawn $salloc -p $partition -N1-2 -n$ncpus $bin_printenv SLURM_NNODES] expect { -re "Granted job allocation ($number)" { diff --git a/testsuite/expect/test17.63 b/testsuite/expect/test17.63 index ffaf3f381daf3ce1f52c7665eaaa9785911b317b..046bfa37285024b4529f82b192c6ca129921f889 100755 --- a/testsuite/expect/test17.63 +++ b/testsuite/expect/test17.63 @@ -87,7 +87,7 @@ make_bash_script $file_in " set cwd "[$bin_pwd]" make_bash_script $file_script " - $sbatch -p $partition -N1-2 --use-min-nodes -n$ncpus --hint=nomultithread --output=$file_out --error=$file_err $cwd/$file_in + $sbatch -p $partition -N1-2 --use-min-nodes -n$ncpus --output=$file_out --error=$file_err $cwd/$file_in exit 0 "