Skip to content
Snippets Groups Projects
Commit 57e5f01c authored by Moe Jette
Browse files

Report changes in node's processor count on "scontrol reconfig"

parent 65dcea8e
No related branches found
No related tags found
No related merge requests found
...@@ -155,6 +155,7 @@ enum { ...@@ -155,6 +155,7 @@ enum {
ESLURM_INVALID_TASK_MEMORY, ESLURM_INVALID_TASK_MEMORY,
ESLURM_INVALID_ACCOUNT, ESLURM_INVALID_ACCOUNT,
ESLURM_INVALID_LICENSES, ESLURM_INVALID_LICENSES,
ESLURM_NEED_RESTART,
/* switch specific error codes, specific values defined in plugin module */ /* switch specific error codes, specific values defined in plugin module */
ESLURM_SWITCH_MIN = 3000, ESLURM_SWITCH_MIN = 3000,
......
...@@ -170,17 +170,23 @@ static slurm_errtab_t slurm_errtab[] = { ...@@ -170,17 +170,23 @@ static slurm_errtab_t slurm_errtab[] = {
{ ESLURM_INVALID_FEATURE, { ESLURM_INVALID_FEATURE,
"Invalid feature specification" }, "Invalid feature specification" },
{ ESLURM_INVALID_AUTHTYPE_CHANGE, { ESLURM_INVALID_AUTHTYPE_CHANGE,
"AuthType change requires restart of all SLURM daemons and commands"}, "AuthType change requires restart of all SLURM daemons and "
"commands to take effect"},
{ ESLURM_INVALID_CHECKPOINT_TYPE_CHANGE, { ESLURM_INVALID_CHECKPOINT_TYPE_CHANGE,
"Invalid change in CheckpointType requested" }, "CheckpointType change requires restart of all SLURM daemons "
"to take effect" },
{ ESLURM_INVALID_CRYPTO_TYPE_CHANGE, { ESLURM_INVALID_CRYPTO_TYPE_CHANGE,
"Invalid change in CryptoType requested" }, "CryptoType change requires restart of all SLURM daemons "
"to take effect" },
{ ESLURM_INVALID_SCHEDTYPE_CHANGE, { ESLURM_INVALID_SCHEDTYPE_CHANGE,
"Invalid change in SchedulerType requested" }, "SchedulerType change requires restart of the slurmctld daemon "
"to take effect" },
{ ESLURM_INVALID_SELECTTYPE_CHANGE, { ESLURM_INVALID_SELECTTYPE_CHANGE,
"Invalid change in SelectType requested" }, "SelectType change requires restart of the slurmctld daemon "
"to take effect" },
{ ESLURM_INVALID_SWITCHTYPE_CHANGE, { ESLURM_INVALID_SWITCHTYPE_CHANGE,
"SwitchType change requires restart of all SLURM daemons and jobs"}, "SwitchType change requires restart of all SLURM daemons and "
"jobs to take effect" },
{ ESLURM_FRAGMENTATION, { ESLURM_FRAGMENTATION,
"Immediate execution impossible, " "Immediate execution impossible, "
"resources too fragmented for allocation" }, "resources too fragmented for allocation" },
...@@ -206,6 +212,9 @@ static slurm_errtab_t slurm_errtab[] = { ...@@ -206,6 +212,9 @@ static slurm_errtab_t slurm_errtab[] = {
"Job has invalid account" }, "Job has invalid account" },
{ ESLURM_INVALID_LICENSES, { ESLURM_INVALID_LICENSES,
"Job has invalid license specification" }, "Job has invalid license specification" },
/* Set when a reconfiguration finds that a node's configured CPU count
 * has changed. The two literals below are concatenated by the compiler,
 * so the first must end with a space to keep "restart of" separated. */
{ ESLURM_NEED_RESTART,
"The node configuration changes that were made require restart "
"of the slurmctld daemon to take effect"},
/* slurmd error codes */ /* slurmd error codes */
......
...@@ -82,7 +82,7 @@ static int _init_all_slurm_conf(void); ...@@ -82,7 +82,7 @@ static int _init_all_slurm_conf(void);
static void _purge_old_node_state(struct node_record *old_node_table_ptr, static void _purge_old_node_state(struct node_record *old_node_table_ptr,
int old_node_record_count); int old_node_record_count);
static int _restore_job_dependencies(void); static int _restore_job_dependencies(void);
static void _restore_node_state(struct node_record *old_node_table_ptr, static int _restore_node_state(struct node_record *old_node_table_ptr,
int old_node_record_count); int old_node_record_count);
static int _preserve_select_type_param(slurm_ctl_conf_t * ctl_conf_ptr, static int _preserve_select_type_param(slurm_ctl_conf_t * ctl_conf_ptr,
select_type_plugin_info_t old_select_type_p); select_type_plugin_info_t old_select_type_p);
...@@ -708,7 +708,7 @@ static int _build_all_partitionline_info() ...@@ -708,7 +708,7 @@ static int _build_all_partitionline_info()
int read_slurm_conf(int recover) int read_slurm_conf(int recover)
{ {
DEF_TIMERS; DEF_TIMERS;
int error_code, i; int error_code, i, rc;
int old_node_record_count; int old_node_record_count;
struct node_record *old_node_table_ptr; struct node_record *old_node_table_ptr;
char *old_auth_type = xstrdup(slurmctld_conf.authtype); char *old_auth_type = xstrdup(slurmctld_conf.authtype);
...@@ -740,6 +740,11 @@ int read_slurm_conf(int recover) ...@@ -740,6 +740,11 @@ int read_slurm_conf(int recover)
xfree(old_node_table_ptr[i].os); xfree(old_node_table_ptr[i].os);
old_node_table_ptr[i].features = xstrdup( old_node_table_ptr[i].features = xstrdup(
old_node_table_ptr[i].config_ptr->feature); old_node_table_ptr[i].config_ptr->feature);
/* Store the original configured CPU count somewhere
* (port is reused here for that purpose) so we can
* report changes in its configuration. */
old_node_table_ptr[i].port = old_node_table_ptr[i].
config_ptr->cpus;
} }
node_record_table_ptr = NULL; node_record_table_ptr = NULL;
node_record_count = 0; node_record_count = 0;
...@@ -782,9 +787,10 @@ int read_slurm_conf(int recover) ...@@ -782,9 +787,10 @@ int read_slurm_conf(int recover)
(void) load_all_job_state(); (void) load_all_job_state();
} else { /* Load no info, preserve all state */ } else { /* Load no info, preserve all state */
if (old_node_table_ptr) { if (old_node_table_ptr) {
debug("restoring original state of nodes"); info("restoring original state of nodes");
_restore_node_state(old_node_table_ptr, rc = _restore_node_state(old_node_table_ptr,
old_node_record_count); old_node_record_count);
error_code = MAX(error_code, rc); /* not fatal */
} }
reset_first_job_id(); reset_first_job_id();
(void) slurm_sched_reconfig(); (void) slurm_sched_reconfig();
...@@ -807,8 +813,8 @@ int read_slurm_conf(int recover) ...@@ -807,8 +813,8 @@ int read_slurm_conf(int recover)
(void) sync_job_files(); (void) sync_job_files();
_purge_old_node_state(old_node_table_ptr, old_node_record_count); _purge_old_node_state(old_node_table_ptr, old_node_record_count);
if ((error_code = _build_bitmaps())) if ((rc = _build_bitmaps()))
return error_code; return rc; /* fatal error */
license_free(); license_free();
if (license_init(slurmctld_conf.licenses) != SLURM_SUCCESS) if (license_init(slurmctld_conf.licenses) != SLURM_SUCCESS)
...@@ -829,17 +835,16 @@ int read_slurm_conf(int recover) ...@@ -829,17 +835,16 @@ int read_slurm_conf(int recover)
list_sort(config_list, &list_compare_config); list_sort(config_list, &list_compare_config);
/* Update plugins as possible */ /* Update plugins as possible */
error_code = _preserve_plugins(&slurmctld_conf, rc = _preserve_plugins(&slurmctld_conf,
old_auth_type, old_checkpoint_type, old_auth_type, old_checkpoint_type,
old_crypto_type, old_sched_type, old_crypto_type, old_sched_type,
old_select_type, old_switch_type); old_select_type, old_switch_type);
if (error_code) error_code = MAX(error_code, rc); /* not fatal */
return error_code;
/* Update plugin parameters as possible */ /* Update plugin parameters as possible */
error_code = _preserve_select_type_param( rc = _preserve_select_type_param(&slurmctld_conf,
&slurmctld_conf, old_select_type_p);
old_select_type_p); error_code = MAX(error_code, rc); /* not fatal */
slurmctld_conf.last_update = time(NULL); slurmctld_conf.last_update = time(NULL);
END_TIMER2("read_slurm_conf"); END_TIMER2("read_slurm_conf");
...@@ -849,11 +854,11 @@ int read_slurm_conf(int recover) ...@@ -849,11 +854,11 @@ int read_slurm_conf(int recover)
/* Restore node state and size information from saved records. /* Restore node state and size information from saved records.
* If a node was re-configured to be down or drained, we set those states */ * If a node was re-configured to be down or drained, we set those states */
static void _restore_node_state(struct node_record *old_node_table_ptr, static int _restore_node_state(struct node_record *old_node_table_ptr,
int old_node_record_count) int old_node_record_count)
{ {
struct node_record *node_ptr; struct node_record *node_ptr;
int i; int i, rc = SLURM_SUCCESS;
for (i = 0; i < old_node_record_count; i++) { for (i = 0; i < old_node_record_count; i++) {
uint16_t drain_flag = false, down_flag = false; uint16_t drain_flag = false, down_flag = false;
...@@ -874,6 +879,12 @@ static void _restore_node_state(struct node_record *old_node_table_ptr, ...@@ -874,6 +879,12 @@ static void _restore_node_state(struct node_record *old_node_table_ptr,
node_ptr->node_state |= NODE_STATE_DRAIN; node_ptr->node_state |= NODE_STATE_DRAIN;
node_ptr->last_response = old_node_table_ptr[i].last_response; node_ptr->last_response = old_node_table_ptr[i].last_response;
if (old_node_table_ptr[i].port != node_ptr->config_ptr->cpus) {
rc = ESLURM_NEED_RESTART;
error("Configured cpu count change on %s (%u to %u)",
node_ptr->name, old_node_table_ptr[i].port,
node_ptr->config_ptr->cpus);
}
node_ptr->cpus = old_node_table_ptr[i].cpus; node_ptr->cpus = old_node_table_ptr[i].cpus;
node_ptr->sockets = old_node_table_ptr[i].sockets; node_ptr->sockets = old_node_table_ptr[i].sockets;
node_ptr->cores = old_node_table_ptr[i].cores; node_ptr->cores = old_node_table_ptr[i].cores;
...@@ -901,6 +912,7 @@ static void _restore_node_state(struct node_record *old_node_table_ptr, ...@@ -901,6 +912,7 @@ static void _restore_node_state(struct node_record *old_node_table_ptr,
old_node_table_ptr[i].os = NULL; old_node_table_ptr[i].os = NULL;
} }
} }
return rc;
} }
/* Purge old node state information */ /* Purge old node state information */
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment