Skip to content
Snippets Groups Projects
Commit 5de9389b authored by Moe Jette's avatar Moe Jette
Browse files

Permit change in SwitchType configuration only when daemons are restarted.

  Attempted "scontrol reconfig" or SIGHUP requests result in error and
  ignoring change to SwitchType. Document how to change a cluster's SwitchType.
parent 8677dab8
Branches
No related tags found
No related merge requests found
.TH "Slurm API" "3" "October 2003" "Morris Jette" "Slurm administrative calls"
.TH "Slurm API" "3" "February 2004" "Morris Jette" "Slurm administrative calls"
.SH "NAME"
slurm_delete_partition, slurm_init_part_desc_msg,
slurm_reconfigure, slurm_shutdown, slurm_update_job,
......@@ -122,14 +122,24 @@ code is set appropriately.
\fBSLURM_PROTOCOL_VERSION_ERROR\fR Protocol version has changed, re-link your
code.
.LP
\fBESLURM_INVALID_NODE_NAME\fR the requested node name(s) is/are not valid.
\fBESLURM_INVALID_NODE_NAME\fR The requested node name(s) is/are not valid.
.LP
\fBESLURM_INVALID_NODE_STATE\fR the specified state node state or requested
\fBESLURM_INVALID_NODE_STATE\fR The specified node state or requested
node state transition is not valid.
.LP
\fBESLURM_INVALID_PARTITION_NAME\fR the requested partition name is not valid.
\fBESLURM_INVALID_PARTITION_NAME\fR The requested partition name is not valid.
.LP
\fBESLURM_ACCESS_DENIED\fR the requesting user lacks authorization for
\fBESLURM_INVALID_SCHEDTYPE_CHANGE\fR The \fBSchedulerType\fR parameter cannot
be changed using the \fBslurm_reconfigure\fR function; the
\fBslurmctld\fR daemon must be restarted instead. Manual changes to existing job
parameters may also be required. See \fBslurm.conf\fR(5) for more information.
.LP
\fBESLURM_INVALID_SWITCHTYPE_CHANGE\fR The \fBSwitchType\fR parameter cannot
be changed using the \fBslurm_reconfigure\fR function; all
SLURM daemons and commands must be restarted instead. All previously running
jobs will be lost. See \fBslurm.conf\fR(5) for more information.
.LP
\fBESLURM_ACCESS_DENIED\fR The requesting user lacks authorization for
the requested action (e.g. trying to delete or modify another user's job).
.LP
\fBSLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT\fR Timeout in communicating with
......@@ -236,4 +246,4 @@ details.
.SH "SEE ALSO"
.LP
\fBscontrol\fR(1), \fBslurm_get_errno\fR(3), \fBslurm_init_job_desc_msg\fR(3),
\fBslurm_perror\fR(3), \fBslurm_strerror\fR(3)
\fBslurm_perror\fR(3), \fBslurm_strerror\fR(3), \fBslurm.conf\fR(5)
......@@ -194,6 +194,8 @@ must have their priority set to zero (held).
When changing the value from "sched/wiki", all pending jobs
should have their priority change from zero to some large number.
The \fBscontrol\fR command can be used to change job priorities.
The \fBslurmctld\fR daemon must be restarted for a change in
scheduler type to become effective.
.TP
\fBSlurmUser\fR
The name of the user that the \fBslurmctld\fR daemon executes as.
......@@ -285,9 +287,9 @@ Acceptable values include
or termination (Myrinet, Ethernet, and InfiniBand),
"switch/elan" for Quadrics Elan 3 or Elan 4 interconnect.
The default value is "switch/none".
All SLURM daemons and running jobs must be restarted for a change in
\fBSwitchType\fR to take effect.
If running jobs exist at the time slurmctld is restarted with a new
All SLURM daemons, commands and running jobs must be restarted for a
change in \fBSwitchType\fR to take effect.
If running jobs exist at the time \fBslurmctld\fR is restarted with a new
value of \fBSwitchType\fR, records of all jobs in any state may be lost.
.TP
\fBTmpFS\fR
......
......
......@@ -124,6 +124,8 @@ enum {
ESLURM_IN_STANDBY_MODE,
ESLURM_INVALID_NODE_STATE,
ESLURM_INVALID_FEATURE,
ESLURM_INVALID_SCHEDTYPE_CHANGE,
ESLURM_INVALID_SWITCHTYPE_CHANGE,
/* Quadrics Elan routine error codes */
ENOSLURM = 3000,
......
......
......@@ -149,6 +149,10 @@ static slurm_errtab_t slurm_errtab[] = {
"Invalid node state specified" },
{ ESLURM_INVALID_FEATURE,
"Invalid feature specification" },
{ ESLURM_INVALID_SCHEDTYPE_CHANGE,
"Invalid change in SchedulerType requested" },
{ ESLURM_INVALID_SWITCHTYPE_CHANGE,
"SwitchType change requires restart of all SLURM daemons and jobs"},
/* Quadrics Elan routine error codes */
......
......
......@@ -397,7 +397,7 @@ static void *_slurmctld_signal_hand(void *no_data)
error_code = read_slurm_conf(0);
unlock_slurmctld(config_write_lock);
if (error_code)
error("read_slurm_conf error %s",
error("read_slurm_conf: %s",
slurm_strerror(error_code));
else {
_update_cred_key();
......
......
......@@ -64,8 +64,8 @@ static void _purge_old_node_state(struct node_record *old_node_table_ptr,
int old_node_record_count);
static void _restore_node_state(struct node_record *old_node_table_ptr,
int old_node_record_count);
static void _preserve_plugins(slurm_ctl_conf_t * ctl_conf_ptr,
char *old_auth_type,
static int _preserve_plugins(slurm_ctl_conf_t * ctl_conf_ptr,
char *old_auth_type, char *old_sched_type,
char *old_switch_type);
static int _sync_nodes_to_comp_job(void);
static int _sync_nodes_to_jobs(void);
......@@ -687,6 +687,7 @@ int read_slurm_conf(int recover)
int old_node_record_count;
struct node_record *old_node_table_ptr;
char *old_auth_type = xstrdup(slurmctld_conf.authtype);
char *old_sched_type = xstrdup(slurmctld_conf.schedtype);
char *old_switch_type = xstrdup(slurmctld_conf.switch_type);
/* initialization */
......@@ -769,8 +770,6 @@ int read_slurm_conf(int recover)
}
fclose(slurm_spec_file);
_preserve_plugins(&slurmctld_conf,
old_auth_type, old_switch_type);
validate_config(&slurmctld_conf);
update_logging();
g_slurm_jobcomp_init(slurmctld_conf.job_comp_loc);
......@@ -822,12 +821,16 @@ int read_slurm_conf(int recover)
/* sort config_list by weight for scheduling */
list_sort(config_list, &list_compare_config);
/* Update plugins as possible */
error_code = _preserve_plugins(&slurmctld_conf,
old_auth_type, old_sched_type, old_switch_type);
slurmctld_conf.last_update = time(NULL);
END_TIMER;
debug("read_slurm_conf: finished loading configuration %s",
TIME_STR);
return SLURM_SUCCESS;
return error_code;
}
......@@ -867,21 +870,42 @@ static void _purge_old_node_state(struct node_record *old_node_table_ptr,
}
/*
* _preserve_plugins - either load new plugins (if possible) or preserve
* original plugin values over restart. slurmctld must restart for some
* _preserve_plugins - preserve original plugin values over reconfiguration
* as required. daemons and/or commands must be restarted for some
* plugin value changes to take effect.
* RET zero or error code
*/
static void _preserve_plugins(slurm_ctl_conf_t * ctl_conf_ptr,
char *old_auth_type, char *old_switch_type)
static int _preserve_plugins(slurm_ctl_conf_t * ctl_conf_ptr,
		char *old_auth_type,
		char *old_sched_type, char *old_switch_type)
{
	/*
	 * IN/OUT ctl_conf_ptr  - freshly re-read configuration; its authtype,
	 *	schedtype and switch_type fields may be overwritten with the
	 *	old_* values passed in
	 * IN old_auth_type, old_sched_type, old_switch_type - values in effect
	 *	before the re-read. This function takes ownership of each
	 *	pointer: it is either stored in ctl_conf_ptr or released with
	 *	xfree(), so the caller must not use or free them afterwards.
	 * RET SLURM_SUCCESS, or the error code of the LAST rejected change
	 *	(a SwitchType error overrides a SchedulerType error when both
	 *	values changed).
	 */
	int rc = SLURM_SUCCESS;

	/* AuthType is unconditionally reverted to its previous value and
	 * never reported as an error */
	xfree(ctl_conf_ptr->authtype);
	ctl_conf_ptr->authtype = old_auth_type;

	if (old_sched_type) {
		/* A missing new value counts as a change: strcmp() must not
		 * be handed a NULL pointer */
		if ((ctl_conf_ptr->schedtype == NULL) ||
		    strcmp(old_sched_type, ctl_conf_ptr->schedtype)) {
			xfree(ctl_conf_ptr->schedtype);
			ctl_conf_ptr->schedtype = old_sched_type;
			rc = ESLURM_INVALID_SCHEDTYPE_CHANGE;
		} else {	/* free duplicate value */
			xfree(old_sched_type);
		}
	}

	if (old_switch_type) {
		/* Same NULL guard as for SchedulerType above */
		if ((ctl_conf_ptr->switch_type == NULL) ||
		    strcmp(old_switch_type, ctl_conf_ptr->switch_type)) {
			xfree(ctl_conf_ptr->switch_type);
			ctl_conf_ptr->switch_type = old_switch_type;
			rc = ESLURM_INVALID_SWITCHTYPE_CHANGE;
		} else {	/* free duplicate value */
			xfree(old_switch_type);
		}
	}

	if (ctl_conf_ptr->backup_controller == NULL)
		info("read_slurm_conf: backup_controller not specified.");

	return rc;
}
......@@ -954,7 +978,8 @@ static int _sync_nodes_to_active_job(struct job_record *job_ptr)
node_record_table_ptr[i].run_job_cnt++; /* NOTE:
* This counter moved to comp_job_cnt
* by _sync_nodes_to_comp_job() */
if ((job_ptr->job_state == JOB_RUNNING) &&
if (((job_ptr->job_state == JOB_RUNNING) ||
(job_ptr->job_state & JOB_COMPLETING)) &&
(job_ptr->details) && (job_ptr->details->shared == 0))
node_record_table_ptr[i].no_share_job_cnt++;
......
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment