diff --git a/NEWS b/NEWS index b45677413c4b9dde2e14a51920de9ce63df11b35..e8b3505728ce6b055c823e5b3b2865877c102837 100644 --- a/NEWS +++ b/NEWS @@ -7,6 +7,7 @@ documents those changes that are of interest to users and admins. -- NOTE: "startclean" when transitioning from version 0.3, JOBS ARE LOST -- Added support for job account information (arbitrary string) -- Added support for job dependencies (start job X after job Y completes) + -- Added support for configuration parameter CheckpointType * Changes in SLURM 0.3.7 ======================== diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index bb44bc0fce7658f766836027b82b6a4faac5ecb8..2da94ce898fd465d29ee949e359c207f28f6bbdb 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -1,4 +1,4 @@ -.TH "slurm.conf" "5" "July 2004" "Morris Jette" "Slurm configuration file" +.TH "slurm.conf" "5" "August 2004" "slurm.conf 0.4" "Slurm configuration file" .SH "NAME" slurm.conf \- Slurm configuration file .SH "DESCRIPTION" @@ -54,6 +54,11 @@ to a "standby" mode when the ControlMachine becomes available once again. This should be a node name without the full domain name (e.g. "lx0002"). While not essential, it is recommended that you specify a backup controller. .TP +\fBCheckpointType\fR +Define the system-initiated checkpoint method to be used for user jobs. +The slurmctld daemon must be restarted for a change in CheckpointType +to take effect. +.TP \fBControlAddr\fR Name that \fBControlMachine\fR should be referred to in establishing a communications path. 
This name will diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index b3ca0d82b11d9229c129a01cd1057b85d1690dde..232ccec277a1943b733804ad923f684667cb393e 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -407,6 +407,7 @@ typedef struct slurm_ctl_conf { char *authtype; /* authentication type */ char *backup_addr; /* comm path of slurmctld secondary server */ char *backup_controller;/* name of slurmctld secondary server */ + char *checkpoint_type; /* checkpoint plugin type */ char *control_addr; /* comm path of slurmctld primary server */ char *control_machine; /* name of slurmctld primary server */ char *epilog; /* pathname of job epilog */ diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h index 2702920a2033faa1a8d59b823bdecb6e0c87d963..befbbdd24dbfb463f51fba4011e4e9a8a7cf307c 100644 --- a/slurm/slurm_errno.h +++ b/slurm/slurm_errno.h @@ -125,6 +125,7 @@ enum { ESLURM_INVALID_NODE_STATE, ESLURM_INVALID_FEATURE, ESLURM_INVALID_AUTHTYPE_CHANGE, + ESLURM_INVALID_CHECKPOINT_TYPE_CHANGE, ESLURM_INVALID_SCHEDTYPE_CHANGE, ESLURM_INVALID_SWITCHTYPE_CHANGE, ESLURM_FRAGMENTATION, diff --git a/src/api/config_info.c b/src/api/config_info.c index 040d861c97090667a10fd475a2638d5e3bd90ecd..31caf68fe919ac49c005fe5bb30837e493307e53 100644 --- a/src/api/config_info.c +++ b/src/api/config_info.c @@ -4,7 +4,7 @@ ***************************************************************************** * Copyright (C) 2002 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Moe Jette <jette1@llnl.gov> and Kevin Tew <tew1@llnl.gov>. + * Written by Morris Jette <jette1@llnl.gov> and Kevin Tew <tew1@llnl.gov>. * UCRL-CODE-2002-040. * * This file is part of SLURM, a resource management program. 
@@ -71,6 +71,8 @@ void slurm_print_ctl_conf ( FILE* out, slurm_ctl_conf_ptr->backup_addr); fprintf(out, "BackupController = %s\n", slurm_ctl_conf_ptr->backup_controller); + fprintf(out, "CheckpointType = %s\n", + slurm_ctl_conf_ptr->checkpoint_type); fprintf(out, "ControlAddr = %s\n", slurm_ctl_conf_ptr->control_addr); fprintf(out, "ControlMachine = %s\n", diff --git a/src/common/checkpoint.c b/src/common/checkpoint.c index cadb31a1cf322874bcfd9e6108c843e39ad62b7c..fa906fad3dc83bf59019f44072973b50442dbb2b 100644 --- a/src/common/checkpoint.c +++ b/src/common/checkpoint.c @@ -180,20 +180,14 @@ _slurm_checkpoint_get_ops( slurm_checkpoint_context_t c ) /* initialize checkpoint plugin */ extern int -checkpoint_init(void) +checkpoint_init(char *checkpoint_type) { int retval = SLURM_SUCCESS; - char *checkpoint_type; slurm_mutex_lock( &context_lock ); if ( g_context ) _slurm_checkpoint_context_destroy(g_context); -#if 0 - checkpoint_type = slurm_get_checkpoint_type(); -#else - checkpoint_type = xstrdup("checkpoint/none"); -#endif g_context = _slurm_checkpoint_context_create( checkpoint_type ); if ( g_context == NULL ) { error( "cannot create a context for %s", checkpoint_type ); @@ -209,7 +203,6 @@ checkpoint_init(void) retval = SLURM_ERROR; } verbose("Checkpoint plugin loaded: %s", checkpoint_type); - xfree(checkpoint_type); done: slurm_mutex_unlock( &context_lock ); diff --git a/src/common/checkpoint.h b/src/common/checkpoint.h index c983b222a9fb1b941e76d66844fd5a1c3faaa29c..90dfd06984d18e3673ba43bbbd2f0c8c97ac78a2 100644 --- a/src/common/checkpoint.h +++ b/src/common/checkpoint.h @@ -53,7 +53,7 @@ enum check_opts { typedef struct slurm_checkpoint_context * slurm_checkpoint_context_t; /* initialize checkpoint plugin */ -extern int checkpoint_init(void); +extern int checkpoint_init(char *checkpoint_type); /* shutdown checkpoint plugin */ extern void checkpoint_fini(void); diff --git a/src/common/read_config.c b/src/common/read_config.c index 
6f85f4bc894b15c59256d25ff2fe848fb9a745f9..0420ddce738e70706a32405021bb796df360e362 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -3,7 +3,7 @@ ***************************************************************************** * Copyright (C) 2002 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Moe Mette <jette1@llnl.gov>. + * Written by Morris Jette <jette1@llnl.gov>. * UCRL-CODE-2002-040. * * This file is part of SLURM, a resource management program. @@ -94,6 +94,7 @@ void free_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr) { xfree (ctl_conf_ptr->authtype); + xfree (ctl_conf_ptr->checkpoint_type); xfree (ctl_conf_ptr->backup_addr); xfree (ctl_conf_ptr->backup_controller); xfree (ctl_conf_ptr->control_addr); @@ -130,6 +131,7 @@ init_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr) { ctl_conf_ptr->last_update = time(NULL); xfree (ctl_conf_ptr->authtype); + xfree (ctl_conf_ptr->checkpoint_type); xfree (ctl_conf_ptr->backup_addr); xfree (ctl_conf_ptr->backup_controller); xfree (ctl_conf_ptr->control_addr); @@ -199,7 +201,8 @@ parse_config_spec (char *in_line, slurm_ctl_conf_t *ctl_conf_ptr) int max_job_cnt = -1, min_job_age = -1, wait_time = -1; int slurmctld_port = -1, slurmd_port = -1; char *backup_addr = NULL, *backup_controller = NULL; - char *control_addr = NULL, *control_machine = NULL, *epilog = NULL; + char *checkpoint_type = NULL, *control_addr = NULL; + char *control_machine = NULL, *epilog = NULL; char *prolog = NULL; char *sched_type = NULL, *sched_auth = NULL; char *state_save_location = NULL, *tmp_fs = NULL; @@ -215,6 +218,7 @@ parse_config_spec (char *in_line, slurm_ctl_conf_t *ctl_conf_ptr) error_code = slurm_parser (in_line, "AuthType=", 's', &auth_type, + "CheckpointType=", 's', &checkpoint_type, "BackupAddr=", 's', &backup_addr, "BackupController=", 's', &backup_controller, "ControlAddr=", 's', &control_addr, @@ -268,6 +272,14 @@ parse_config_spec (char 
*in_line, slurm_ctl_conf_t *ctl_conf_ptr) ctl_conf_ptr->authtype = auth_type; } + if ( checkpoint_type ) { + if ( ctl_conf_ptr->checkpoint_type ) { + error( MULTIPLE_VALUE_MSG, "CheckpointType" ); + xfree( ctl_conf_ptr->checkpoint_type ); + } + ctl_conf_ptr->checkpoint_type = checkpoint_type; + } + if ( backup_addr ) { if ( ctl_conf_ptr->backup_addr ) { error (MULTIPLE_VALUE_MSG, "BackupAddr"); @@ -821,6 +833,10 @@ validate_config (slurm_ctl_conf_t *ctl_conf_ptr) if (ctl_conf_ptr->authtype == NULL) ctl_conf_ptr->authtype = xstrdup(DEFAULT_AUTH_TYPE); + if (ctl_conf_ptr->checkpoint_type == NULL) + ctl_conf_ptr->checkpoint_type = + xstrdup(DEFAULT_CHECKPOINT_TYPE); + if (ctl_conf_ptr->fast_schedule == (uint16_t) NO_VAL) ctl_conf_ptr->fast_schedule = DEFAULT_FAST_SCHEDULE; diff --git a/src/common/read_config.h b/src/common/read_config.h index 65deae016d2d0061830149577aac0a4a8f4d4ac0..dbde82090d0d167751069e0e634fe28c3da50d06 100644 --- a/src/common/read_config.h +++ b/src/common/read_config.h @@ -4,7 +4,7 @@ ***************************************************************************** * Copyright (C) 2002 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Moe Mette <jette1@llnl.gov>. + * Written by Morris Jette <jette1@llnl.gov>. * UCRL-CODE-2002-040. * * This file is part of SLURM, a resource management program. 
@@ -31,6 +31,7 @@ #include "src/common/slurm_protocol_defs.h" #define DEFAULT_AUTH_TYPE "auth/none" +#define DEFAULT_CHECKPOINT_TYPE "checkpoint/none" #define DEFAULT_FAST_SCHEDULE 1 #define DEFAULT_FIRST_JOB_ID 1 #define DEFAULT_HASH_BASE 10 diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index 6290cca0ff8ea5296902e9b198d5dc6a486fdb6a..35967e9f6dae4667198e673af05247be1468d92d 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -154,6 +154,8 @@ static slurm_errtab_t slurm_errtab[] = { "Invalid feature specification" }, { ESLURM_INVALID_AUTHTYPE_CHANGE, "AuthType change requires restart of all SLURM daemons and commands"}, + { ESLURM_INVALID_CHECKPOINT_TYPE_CHANGE, + "Invalid change in CheckpointType requested" }, { ESLURM_INVALID_SCHEDTYPE_CHANGE, "Invalid change in SchedulerType requested" }, { ESLURM_INVALID_SWITCHTYPE_CHANGE, diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 8548998fced128fcdd0f4db550ef5bd7ee34a513..61ae7d4d4a3215544aefd9b845b0addd5b61724c 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -1677,6 +1677,7 @@ _pack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t * build_ptr, Buf buffer) packstr(build_ptr->authtype, buffer); packstr(build_ptr->backup_addr, buffer); packstr(build_ptr->backup_controller, buffer); + packstr(build_ptr->checkpoint_type, buffer); packstr(build_ptr->control_addr, buffer); packstr(build_ptr->control_machine, buffer); packstr(build_ptr->epilog, buffer); @@ -1736,6 +1737,8 @@ _unpack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t ** safe_unpackstr_xmalloc(&build_ptr->backup_addr, &uint16_tmp, buffer); safe_unpackstr_xmalloc(&build_ptr->backup_controller, &uint16_tmp, buffer); + safe_unpackstr_xmalloc(&build_ptr->checkpoint_type, &uint16_tmp, + buffer); safe_unpackstr_xmalloc(&build_ptr->control_addr, &uint16_tmp, buffer); safe_unpackstr_xmalloc(&build_ptr->control_machine, &uint16_tmp, buffer); diff --git 
a/src/slurmctld/controller.c b/src/slurmctld/controller.c index ff2272ccfe3e35ca8a08fc324a23e4320c695e8f..f433581615848760efefdae96f079ad681197672 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -214,7 +214,8 @@ int main(int argc, char *argv[]) */ if ( slurm_sched_init() != SLURM_SUCCESS ) fatal( "failed to initialize scheduling plugin" ); - if ( checkpoint_init() != SLURM_SUCCESS ) + if ( checkpoint_init(slurmctld_conf.checkpoint_type) != + SLURM_SUCCESS ) fatal( "failed to initialize checkpoint plugin" ); while (1) { diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 3787f84f4f7d77f90f0384ad4ebe5b2603a7bf58..b11fb86ce9419d1ba66c5f23af4b762eb8cf7a78 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -63,7 +63,6 @@ #define BUF_SIZE 1024 /* Temporary buffer size */ static void _fill_ctld_conf(slurm_ctl_conf_t * build_ptr); -static void _free_ctld_conf(slurm_ctl_conf_t * build_ptr); static inline bool _is_super_user(uid_t uid); static void _kill_job_on_msg_fail(uint32_t job_id); static int _make_step_cred(struct step_record *step_rec, @@ -245,6 +244,7 @@ void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr) conf_ptr->backup_addr = xstrdup(slurmctld_conf.backup_addr); conf_ptr->backup_controller = xstrdup(slurmctld_conf. backup_controller); + conf_ptr->checkpoint_type = xstrdup(slurmctld_conf.checkpoint_type); conf_ptr->control_addr = xstrdup(slurmctld_conf.control_addr); conf_ptr->control_machine = xstrdup(slurmctld_conf. 
control_machine); @@ -298,35 +298,6 @@ void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr) return; } -/* _free_ctld_conf - free memory allocated by _fill_ctld_conf */ -static void _free_ctld_conf(slurm_ctl_conf_t * conf_ptr) -{ - xfree(conf_ptr->authtype); - xfree(conf_ptr->backup_addr); - xfree(conf_ptr->backup_controller); - xfree(conf_ptr->control_addr); - xfree(conf_ptr->control_machine); - xfree(conf_ptr->epilog); - xfree(conf_ptr->job_comp_loc); - xfree(conf_ptr->job_comp_type); - xfree(conf_ptr->job_credential_private_key); - xfree(conf_ptr->job_credential_public_certificate); - xfree(conf_ptr->plugindir); - xfree(conf_ptr->prolog); - xfree(conf_ptr->schedauth); - xfree(conf_ptr->schedtype); - xfree(conf_ptr->slurm_user_name); - xfree(conf_ptr->slurmctld_logfile); - xfree(conf_ptr->slurmctld_pidfile); - xfree(conf_ptr->slurmd_logfile); - xfree(conf_ptr->slurmd_pidfile); - xfree(conf_ptr->slurmd_spooldir); - xfree(conf_ptr->slurm_conf); - xfree(conf_ptr->state_save_location); - xfree(conf_ptr->switch_type); - xfree(conf_ptr->tmp_fs); -} - /* return true if supplied uid is a super-user: root, self, or SlurmUser */ static inline bool _is_super_user(uid_t uid) { @@ -608,7 +579,7 @@ static void _slurm_rpc_dump_conf(slurm_msg_t * msg) /* send message */ slurm_send_node_msg(msg->conn_fd, &response_msg); - _free_ctld_conf(&config_tbl); + free_slurm_conf(&config_tbl); } } diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index e6712281fb81b7f81f188debd2155103bae9d9d5..6b0c132e058f57e676543ae285d6a6e1bc8871b4 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -65,8 +65,8 @@ static void _purge_old_node_state(struct node_record *old_node_table_ptr, static void _restore_node_state(struct node_record *old_node_table_ptr, int old_node_record_count); static int _preserve_plugins(slurm_ctl_conf_t * ctl_conf_ptr, - char *old_auth_type, char *old_sched_type, - char *old_switch_type); + char *old_auth_type, char 
*old_checkpoint_type, + char *old_sched_type, char *old_switch_type); static int _sync_nodes_to_comp_job(void); static int _sync_nodes_to_jobs(void); static int _sync_nodes_to_active_job(struct job_record *job_ptr); @@ -725,9 +725,10 @@ int read_slurm_conf(int recover) int i, j, error_code; int old_node_record_count; struct node_record *old_node_table_ptr; - char *old_auth_type = xstrdup(slurmctld_conf.authtype); - char *old_sched_type = xstrdup(slurmctld_conf.schedtype); - char *old_switch_type = xstrdup(slurmctld_conf.switch_type); + char *old_auth_type = xstrdup(slurmctld_conf.authtype); + char *old_checkpoint_type = xstrdup(slurmctld_conf.checkpoint_type); + char *old_sched_type = xstrdup(slurmctld_conf.schedtype); + char *old_switch_type = xstrdup(slurmctld_conf.switch_type); /* initialization */ START_TIMER; @@ -862,7 +863,8 @@ int read_slurm_conf(int recover) /* Update plugins as possible */ error_code = _preserve_plugins(&slurmctld_conf, - old_auth_type, old_sched_type, old_switch_type); + old_auth_type, old_checkpoint_type, + old_sched_type, old_switch_type); slurmctld_conf.last_update = time(NULL); END_TIMER; @@ -915,8 +917,8 @@ static void _purge_old_node_state(struct node_record *old_node_table_ptr, * RET zero or error code */ static int _preserve_plugins(slurm_ctl_conf_t * ctl_conf_ptr, - char *old_auth_type, char *old_sched_type, - char *old_switch_type) + char *old_auth_type, char *old_checkpoint_type, + char *old_sched_type, char *old_switch_type) { int rc = SLURM_SUCCESS; @@ -929,6 +931,16 @@ static int _preserve_plugins(slurm_ctl_conf_t * ctl_conf_ptr, xfree(old_auth_type); } + if (old_checkpoint_type) { + if (strcmp(old_checkpoint_type, + ctl_conf_ptr->checkpoint_type)) { + xfree(ctl_conf_ptr->checkpoint_type); + ctl_conf_ptr->checkpoint_type = old_checkpoint_type; + rc = ESLURM_INVALID_CHECKPOINT_TYPE_CHANGE; + } else /* free duplicate value */ + xfree(old_checkpoint_type); + } + if (old_sched_type) { if (strcmp(old_sched_type, 
ctl_conf_ptr->schedtype)) { xfree(ctl_conf_ptr->schedtype);