Skip to content
Snippets Groups Projects
Commit 922281a7 authored by Morris Jette's avatar Morris Jette
Browse files

Merge branch 'slurm-16.05'

parents 6b556606 8ff2bf25
No related branches found
No related tags found
No related merge requests found
......@@ -416,7 +416,9 @@ static int _update_all_nodes(char *host_list)
}
error("%s: capmc(%s,%s,%s): %d %s", prog_name,
argv[1], argv[2], argv[3], status, resp_msg);
if (resp_msg && strstr(resp_msg, "Could not lookup") &&
if (resp_msg &&
(strstr(resp_msg, "Could not lookup") ||
strstr(resp_msg, "Internal server error")) &&
(retry <= capmc_retries)) {
/* State Manager is down. Sleep and retry */
sleep(1);
......
......@@ -232,6 +232,8 @@ typedef struct numa_cfg2 {
char *numa_cfg;
} numa_cfg2_t;
static void _check_node_disabled(void);
static void _check_node_status(void);
static s_p_hashtbl_t *_config_make_tbl(char *filename);
static void _free_script_argv(char **script_argv);
static mcdram_cap_t *_json_parse_mcdram_cap_array(json_object *jobj, char *key,
......@@ -1751,7 +1753,7 @@ extern int node_features_p_reconfig(void)
return SLURM_SUCCESS;
}
/* Put any node NOT found by "capmc node_status" into DRAIN state */
/* Put any nodes NOT found by "capmc node_status" into DRAIN state */
static void _check_node_status(void)
{
json_object *j_obj;
......@@ -1861,6 +1863,33 @@ static void _check_node_status(void)
}
}
/* Drain any compute nodes that the hardware manager reports as disabled.
 * Currently a placeholder: the body is intentionally empty until the
 * Cray-specific discovery steps below are implemented. */
static void _check_node_disabled(void)
{
	/* FIXME: Implementation outline (not yet written):
	 *
	 * STEP 0 (testing aid) - toggle a node's disabled state:
	 *   > xtcli disable ${TARGET_NODE}
	 *   > xtcli enable  ${TARGET_NODE}
	 *
	 * STEP 1 - list the disabled compute nodes:
	 *   > xtshow --compute --disabled
	 *   L1s ...
	 *   L0s ...
	 *   Nodes ...
	 *   c0-0c0s7n0:    -|  disabled  [noflags|]
	 *   SeaStars ...
	 *   Links ...
	 *   c1-0c2s1s1l1:  -|  disabled  [noflags|]
	 *
	 * STEP 2 - translate each cname to its nid name:
	 *   > rtr -Im ${TARGET_BLADE}
	 *
	 * STEP 3 - put those nodes into DRAIN state,
	 *   reusing the approach in _check_node_status() above.
	 */
}
/* Update only the current MCDRAM and NUMA mode for identified nodes */
static int _update_current_mode(char *node_list)
{
......@@ -1930,7 +1959,8 @@ extern int node_features_p_get_node(char *node_list)
}
slurm_mutex_unlock(&config_mutex);
_check_node_status(); /* Flag nodes not found by capmc */
_check_node_status(); /* Drain nodes not found by capmc */
_check_node_disabled(); /* Drain disabled nodes */
if (mcdram_per_node && node_list && /* Selected node updated and */
	    (mcdram_pct[0] != -1))	/* have needed global info */
......
......@@ -73,7 +73,7 @@ if {$matches < $needed_nodes} {
set nodes 0
set timeout $max_job_delay
set srun_pid [spawn $srun -p $partition -N1-2 --use-min-nodes -n$ncpus --ntasks-per-thread=1--cpu-bind=thread $bin_printenv SLURM_NNODES]
set srun_pid [spawn $srun -p $partition -N1-2 --use-min-nodes -n$ncpus $bin_printenv SLURM_NNODES]
expect {
-re "($number)" {
set nodes $expect_out(1,string)
......
......@@ -73,7 +73,7 @@ if {$matches < $needed_nodes} {
set job_id 0
set timeout $max_job_delay
set salloc_pid [spawn $salloc -p $partition -N1-2 --use-min-nodes -n$ncpus --hint=nomultithread $bin_printenv SLURM_NNODES]
set salloc_pid [spawn $salloc -p $partition -N1-2 --use-min-nodes -n$ncpus $bin_printenv SLURM_NNODES]
#set salloc_pid [spawn $salloc -p $partition -N1-2 -n$ncpus $bin_printenv SLURM_NNODES]
expect {
-re "Granted job allocation ($number)" {
......
......@@ -87,7 +87,7 @@ make_bash_script $file_in "
set cwd "[$bin_pwd]"
make_bash_script $file_script "
$sbatch -p $partition -N1-2 --use-min-nodes -n$ncpus --hint=nomultithread --output=$file_out --error=$file_err $cwd/$file_in
$sbatch -p $partition -N1-2 --use-min-nodes -n$ncpus --output=$file_out --error=$file_err $cwd/$file_in
exit 0
"
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment