From 99cd7e74ed84cb1b5794fdfd93e2d24e15c59ae0 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Mon, 6 Oct 2008 23:26:12 +0000 Subject: [PATCH] svn merge -r15285:15315 https://eris.llnl.gov/svn/slurm/branches/slurm-1.3 --- NEWS | 10 +++++ doc/man/man1/sbatch.1 | 3 ++ doc/man/man1/srun.1 | 5 ++- doc/man/man5/slurm.conf.5 | 29 +++++++++++---- src/common/read_config.c | 1 + src/common/slurm_protocol_defs.c | 3 ++ src/plugins/jobcomp/filetxt/jobcomp_filetxt.c | 37 +++++++++++++++++-- src/sbatch/opt.c | 2 + src/slurmctld/node_mgr.c | 9 +++-- src/slurmctld/proc_req.c | 9 +++-- src/srun/opt.c | 2 + 11 files changed, 90 insertions(+), 20 deletions(-) diff --git a/NEWS b/NEWS index 86db691a896..b28b7dc93dc 100644 --- a/NEWS +++ b/NEWS @@ -107,6 +107,16 @@ documents those changes that are of interest to users and admins. -- Fix bug in logic to remove whitespace from plugstack.conf. -- Add new configuration parameter SallocDefaultCommand to control what shell that salloc launches by default. + -- When enforcing PrivateData configuration parameter, failures return + "Access/permission denied" rather than "Invalid user id". + -- From sbatch and srun, if the --dependency option is specified then set + the environment variable SLURM_JOB_DEPENDENCY to the same value. + -- In plugin jobcomp/filetxt, use ISO8601 formats for time by default (e.g. + YYYY-MM-DDTHH:MM:SS rather than MM/DD-HH:MM:SS). This restores the default + behavior from Slurm version 1.2. Change the value of USE_ISO8601 in + src/plugins/jobcomp/filetxt/jobcomp_filetxt.c to revert the behavior. + -- Add support for configuration option of ReturnToService=2, which will + return a DOWN node to use if the node was previously set DOWN for any reason. 
* Changes in SLURM 1.3.8 ======================== diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index 838ac2bfd6c..e8283f90c6f 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -783,6 +783,9 @@ The select/cons_res plugin allocates individual processors to jobs, so this number indicates the number of processors on this node allocated to the job. .TP +\fBSLURM_JOB_DEPENDENCY\fR +Set to value of the \-\-dependency option. +.TP \fBSLURM_JOB_NAME\fR Name of the job. .TP diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index cb709e6200e..cb967334742 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -1310,10 +1310,13 @@ on this node allocated to the job. \fBSLURM_GTIDS\fR Global task IDs running on this node. Zero origin and comma separated. - +.TP +\fBSLURM_JOB_DEPENDENCY\fR +Set to value of the \-\-dependency option. .TP \fBSLURM_JOBID\fR Job id of the executing job + .TP \fBSLURM_LAUNCH_NODE_IPADDR\fR IP address of the node from which the task launch was diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 6cd295e2023..ca4df80e56e 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -843,14 +843,27 @@ Related configuration options include \fBResumeProgram\fR, \fBSuspendRate\fR, .TP \fBReturnToService\fR -If set to 1, then a non\-responding (DOWN) node will become available -for use upon registration. Note that DOWN node's state will be changed -only if it was set DOWN due to being non\-responsive. If the node was -set DOWN for any other reason (low memory, prolog failure, epilog -failure, etc.), its state will not automatically be changed. The -default value is 0, which means that a node will remain in the -DOWN state until a system administrator explicitly changes its state -(even if the slurmd daemon registers and resumes communications). +Controls when a DOWN node will be returned to service. +The default value is 0. 
+Supported values include +.RS +.TP 4 +\fB0\fR +A node will remain in the DOWN state until a system administrator +explicitly changes its state (even if the slurmd daemon registers +and resumes communications). +.TP +\fB1\fR +A non\-responding (DOWN) node will become available for use upon +registration. Note that DOWN node's state will be changed only if +it was set DOWN due to being non\-responsive. If the node was +set DOWN for any other reason (low memory, prolog failure, epilog +failure, etc.), its state will not automatically be changed. +.TP +\fB2\fR +A DOWN node will become available for use upon registration with a +valid configuration. The node could have been set DOWN for any reason. +.RE .TP \fBSallocDefaultCommand\fR diff --git a/src/common/read_config.c b/src/common/read_config.c index 492b8ba517a..40ac4b37888 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -1271,6 +1271,7 @@ init_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr) xfree (ctl_conf_ptr->resume_program); ctl_conf_ptr->resume_rate = (uint16_t) NO_VAL; ctl_conf_ptr->ret2service = (uint16_t) NO_VAL; + xfree( ctl_conf_ptr->salloc_default_command); xfree( ctl_conf_ptr->sched_params ); ctl_conf_ptr->sched_time_slice = (uint16_t) NO_VAL; xfree( ctl_conf_ptr->schedtype ); diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index fc37eacce36..bac3161262c 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -755,6 +755,9 @@ private_data_string(uint16_t private_data, char *str, int str_len) strcat(str, "accounts"); //9 len } // total len 42 + + if (str[0] == '\0') + strcat(str, "none"); } char *job_state_string(enum job_states inx) diff --git a/src/plugins/jobcomp/filetxt/jobcomp_filetxt.c b/src/plugins/jobcomp/filetxt/jobcomp_filetxt.c index 77b558162e3..6180f24d1a9 100644 --- a/src/plugins/jobcomp/filetxt/jobcomp_filetxt.c +++ b/src/plugins/jobcomp/filetxt/jobcomp_filetxt.c @@ -56,6 +56,8 @@ #include 
"src/common/uid.h" #include "filetxt_jobcomp_process.h" +#define USE_ISO8601 1 + /* * These variables are required by the generic plugin interface. If they * are not found in the plugin, the plugin loader will ignore it. @@ -208,6 +210,36 @@ extern int slurm_jobcomp_set_location ( char * location ) return rc; } +/* This is a variation of slurm_make_time_str() in src/common/parse_time.h + * This version uses ISO8601 format by default. */ +static void _make_time_str (time_t *time, char *string, int size) +{ + struct tm time_tm; + + localtime_r(time, &time_tm); + if ( *time == (time_t) 0 ) { + snprintf(string, size, "Unknown"); + } else { +#if USE_ISO8601 + /* Format YYYY-MM-DDTHH:MM:SS, ISO8601 standard format, + * NOTE: This is expected to break Maui, Moab and LSF + * schedulers management of SLURM. */ + snprintf(string, size, + "%4.4u-%2.2u-%2.2uT%2.2u:%2.2u:%2.2u", + (time_tm.tm_year + 1900), (time_tm.tm_mon+1), + time_tm.tm_mday, time_tm.tm_hour, time_tm.tm_min, + time_tm.tm_sec); +#else + /* Format MM/DD-HH:MM:SS */ + snprintf(string, size, + "%2.2u/%2.2u-%2.2u:%2.2u:%2.2u", + (time_tm.tm_mon+1), time_tm.tm_mday, + time_tm.tm_hour, time_tm.tm_min, time_tm.tm_sec); + +#endif + } +} + extern int slurm_jobcomp_log_record ( struct job_record *job_ptr ) { int rc = SLURM_SUCCESS; @@ -236,9 +268,8 @@ extern int slurm_jobcomp_log_record ( struct job_record *job_ptr ) * JOB_FAILED, JOB_TIMEOUT, etc. 
*/ job_state = job_ptr->job_state & (~JOB_COMPLETING); - slurm_make_time_str(&(job_ptr->start_time), - start_str, sizeof(start_str)); - slurm_make_time_str(&(job_ptr->end_time), end_str, sizeof(end_str)); + _make_time_str(&(job_ptr->start_time), start_str, sizeof(start_str)); + _make_time_str(&(job_ptr->end_time), end_str, sizeof(end_str)); select_g_sprint_jobinfo(job_ptr->select_jobinfo, select_buf, sizeof(select_buf), SELECT_PRINT_MIXED); diff --git a/src/sbatch/opt.c b/src/sbatch/opt.c index 6fa693ccbe9..37355194653 100644 --- a/src/sbatch/opt.c +++ b/src/sbatch/opt.c @@ -1893,6 +1893,8 @@ static bool _opt_verify(void) error( "--propagate=%s is not valid.", opt.propagate ); verified = false; } + if (opt.dependency) + setenvfs("SLURM_JOB_DEPENDENCY=%s", opt.dependency); if (opt.acctg_freq >= 0) setenvf(NULL, "SLURM_ACCTG_FREQ", "%d", opt.acctg_freq); diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 5864b3ae378..2b81c4a87e7 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -1631,10 +1631,11 @@ extern int validate_node_specs(slurm_node_registration_status_msg_t *reg_msg) slurmctld_cluster_name, node_ptr, now); } else if ((base_state == NODE_STATE_DOWN) && - (slurmctld_conf.ret2service == 1) && - (node_ptr->reason != NULL) && - (strncmp(node_ptr->reason, "Not responding", 14) - == 0)) { + ((slurmctld_conf.ret2service == 2) || + ((slurmctld_conf.ret2service == 1) && + (node_ptr->reason != NULL) && + (strncmp(node_ptr->reason, "Not responding", 14) + == 0)))) { last_node_update = time (NULL); if (reg_msg->job_count) { node_ptr->node_state = NODE_STATE_ALLOCATED | diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index fb0f842bc29..298f2a6b556 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -434,7 +434,8 @@ void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr) conf_ptr->resume_rate = conf->resume_rate; conf_ptr->ret2service = conf->ret2service; - conf_ptr->salloc_default_command = 
xstrdup(conf->salloc_default_command); + conf_ptr->salloc_default_command = xstrdup(conf-> + salloc_default_command); if (conf->sched_params) conf_ptr->sched_params = xstrdup(conf->sched_params); else @@ -871,7 +872,7 @@ static void _slurm_rpc_dump_nodes(slurm_msg_t * msg) && (!validate_super_user(uid))) { unlock_slurmctld(node_read_lock); error("Security violation, REQUEST_NODE_INFO RPC from uid=%d", uid); - slurm_send_rc_msg(msg, ESLURM_USER_ID_MISSING); + slurm_send_rc_msg(msg, ESLURM_ACCESS_DENIED); } else if ((node_req_msg->last_update - 1) >= last_node_update) { unlock_slurmctld(node_read_lock); debug2("_slurm_rpc_dump_nodes, no change"); @@ -921,7 +922,7 @@ static void _slurm_rpc_dump_partitions(slurm_msg_t * msg) && (!validate_super_user(uid))) { unlock_slurmctld(part_read_lock); debug2("Security violation, PARTITION_INFO RPC from uid=%d", uid); - slurm_send_rc_msg(msg, ESLURM_USER_ID_MISSING); + slurm_send_rc_msg(msg, ESLURM_ACCESS_DENIED); } else if ((part_req_msg->last_update - 1) >= last_part_update) { unlock_slurmctld(part_read_lock); debug2("_slurm_rpc_dump_partitions, no change"); @@ -2335,7 +2336,7 @@ static void _slurm_rpc_node_select_info(slurm_msg_t * msg) lock_slurmctld(config_read_lock); if ((slurmctld_conf.private_data & PRIVATE_DATA_NODES) && (!validate_super_user(uid))) { - error_code = ESLURM_USER_ID_MISSING; + error_code = ESLURM_ACCESS_DENIED; error("Security violation, NODE_SELECT_INFO RPC from uid=u", (unsigned int) uid); } diff --git a/src/srun/opt.c b/src/srun/opt.c index 4f567e29089..8c57987720b 100644 --- a/src/srun/opt.c +++ b/src/srun/opt.c @@ -1655,6 +1655,8 @@ static void _opt_args(int argc, char **argv) setenv("SLURM_NETWORK", opt.network, 1); } #endif + if (opt.dependency) + setenvfs("SLURM_JOB_DEPENDENCY=%s", opt.dependency); if (opt.nodelist && (!opt.test_only)) { #ifdef HAVE_BG -- GitLab