From 5b4282bebba604945e48a3696501a92bfbc3d64d Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Thu, 14 Feb 2008 22:51:27 +0000 Subject: [PATCH] Add JobRequeue configuration parameter and --requeue option to the sbatch command. --- NEWS | 2 ++ RELEASE_NOTES | 2 ++ doc/man/man1/sbatch.1 | 15 +++++++++++-- doc/man/man1/srun.1 | 10 +-------- doc/man/man5/slurm.conf.5 | 13 +++++++++++- slurm/slurm.h.in | 4 +++- src/api/config_info.c | 4 +++- src/api/init_msg.c | 2 +- src/common/read_config.c | 15 ++++++++++--- src/common/slurm_protocol_pack.c | 9 ++++---- src/sbatch/opt.c | 29 ++++++++++++++++++++----- src/sbatch/opt.h | 2 +- src/sbatch/sbatch.c | 3 ++- src/slurmctld/job_mgr.c | 36 +++++++++++++++++--------------- src/slurmctld/proc_req.c | 4 ++-- src/slurmctld/slurmctld.h | 5 ++--- 16 files changed, 104 insertions(+), 51 deletions(-) diff --git a/NEWS b/NEWS index c74d3c3ef53..ce5bd4c794a 100644 --- a/NEWS +++ b/NEWS @@ -13,6 +13,8 @@ documents those changes that are of interest to users and admins. -- In sched/wiki and sched/wiki2: add HostFormat and HidePartitionJobs to "scontrol show config" SCHEDULER_CONF output. -- In sched/wiki2: accept hostname expression as input for GETNODES command. + -- Add JobRequeue configuration parameter and --requeue option to the sbatch + command. * Changes in SLURM 1.3.0-pre9 ============================= diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 5725db06c63..026b2126418 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -66,6 +66,8 @@ CONFIGURATION FILE CHANGES time slices. * Added new parameters "DefMemPerTask" and "MaxMemPerTask" any task that exceeds the specified size will be terminated. +* Added new parameter "JobRequeue" to control default job behavior after a node + failure (requeue or kill the job). * Added new partition parameter "Priority". A job's scheduling priority is based upon two factors. First the priority of its partition and the job's priority. 
Since nodes can be configured in multiple partitions, this can be used to configure diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index 18062f2bbd2..f23c3635c84 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -1,4 +1,4 @@ -.TH "sbatch" "1" "SLURM 1.3" "January 2008" "SLURM Commands" +.TH "sbatch" "1" "SLURM 1.3" "February 2008" "SLURM Commands" .SH "NAME" .LP sbatch \- Submit a batch script to SLURM. @@ -409,10 +409,13 @@ ignored if \fISchedulerType=sched/wiki\fR or .TP \fB\-\-no\-requeue\fR -Specifies that the batch job should not be requeued. +Specifies that the batch job should not be requeued after node failure. Setting this option will prevent system administrators from being able to restart the job (for example, after a scheduled downtime). When a job is requeued, the batch script is initiated from its beginning. +Also see the \fB\-\-requeue\fR option. +The \fIJobRequeue\fR configuration parameter controls the default +behavior on the cluster. .TP \fB\-\-ntasks\-per\-core\fR=\fIntasks\fR @@ -521,6 +524,14 @@ The maximum stack size \fB\-q\fR, \fB\-\-quiet\fR Suppress informational messages from sbatch. Errors will still be displayed. +.TP +\fB\-\-requeue\fR +Specifies that the batch job should be requeued after node failure. +When a job is requeued, the batch script is initiated from its beginning. +Also see the \fB\-\-no\-requeue\fR option. +The \fIJobRequeue\fR configuration parameter controls the default +behavior on the cluster. + .TP \fB\-s\fR, \fB\-\-share\fR The job allocation can share nodes with other running jobs. (The default diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index 0555879a49b..3dfe06e23d9 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -1,6 +1,6 @@ \." $Id$ .\" -.TH SRUN "1" "January 2008" "srun 1.3" "slurm components" +.TH SRUN "1" "February 2008" "srun 1.3" "slurm components" .SH "NAME" srun \- run parallel jobs @@ -611,14 +611,6 @@ a negative adjustment. 
NOTE: This option is presently ignored if \fISchedulerType=sched/wiki\fR or \fISchedulerType=sched/wiki2\fR. -.TP -\fB\-\-no\-requeue\fR -Specifies that the batch job is not requeue. -Setting this option will prevent system administrators from being able -to restart the job (for example, after a scheduled downtime). -When a job is requeued, the batch script is initiated from its beginning. -This option is only applicable to batch job submission (see \fB\-\-batch\fR). - .TP \fB\-\-ntasks\-per\-core\fR=\fIntasks\fR Request that no more than \fIntasks\fR be invoked on each core. diff --git a/doc/man/man5/slurm.conf.5 b/doc/man/man5/slurm.conf.5 index 4d9ce85a09e..532b6164471 100644 --- a/doc/man/man5/slurm.conf.5 +++ b/doc/man/man5/slurm.conf.5 @@ -1,4 +1,4 @@ -.TH "slurm.conf" "5" "January 2008" "slurm.conf 1.3" "Slurm configuration file" +.TH "slurm.conf" "5" "February 2008" "slurm.conf 1.3" "Slurm configuration file" .SH "NAME" slurm.conf \- Slurm configuration file .SH "DESCRIPTION" @@ -322,6 +322,17 @@ If \fBJobFileAppend\fR is set to a value of 1, then append to the existing file. By default, any existing file is truncated. +.TP +\fBJobRequeue\fR +This option controls what to do by default after a node failure. +If \fBJobRequeue\fR is set to a value of 1, then any job running +on the failed node will be requeued for execution on different nodes. +If \fBJobRequeue\fR is set to a value of 0, then any job running +on the failed node will be terminated. +Use the \fBsbatch\fR \fI\-\-no\-requeue\fR or \fI\-\-requeue\fR +option to change the default behavior for individual jobs. +The default value is 1. + .TP \fBKillTree\fR This option is mapped to "ProctrackType=proctrack/linuxproc". 
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 9a7b543e406..73818bac409 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -2,6 +2,7 @@ * slurm.h - Definitions for all of the SLURM RPCs ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. + * Copyright (C) 2008 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov>, * Joey Ekstrom <ekstrom1@llnl.gov> et. al. @@ -549,7 +550,7 @@ typedef struct job_descriptor { /* For submit, allocate, and update requests */ time_t begin_time; /* delay initiation until this time */ uint16_t mail_type; /* see MAIL_JOB_ definitions above */ char *mail_user; /* user to receive notification */ - uint16_t no_requeue; /* disable job requeue option */ + uint16_t requeue; /* enable or disable job requeue option */ /* * The following parameters are only meaningful on a Blue Gene * system at present. Some will be of value on other system. 
Don't remove these @@ -958,6 +959,7 @@ typedef struct slurm_ctl_conf { char *job_credential_private_key; /* path to private key */ char *job_credential_public_certificate;/* path to public certificate*/ uint16_t job_file_append; /* if set, append to stdout/err file */ + uint16_t job_requeue; /* If set, jobs get requeued on node failure */ uint16_t kill_wait; /* seconds between SIGXCPU to SIGKILL * on job termination */ char *mail_prog; /* pathname of mail program */ diff --git a/src/api/config_info.c b/src/api/config_info.c index 5f625e78599..3ccb4cf5e18 100644 --- a/src/api/config_info.c +++ b/src/api/config_info.c @@ -1,8 +1,8 @@ /****************************************************************************\ * config_info.c - get/print the system configuration information of slurm - * $Id$ ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. + * Copyright (C) 2008 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> and Kevin Tew <tew1@llnl.gov>. * UCRL-CODE-226842. 
@@ -185,6 +185,8 @@ void slurm_print_ctl_conf ( FILE* out, slurm_ctl_conf_ptr->job_credential_public_certificate); fprintf(out, "JobFileAppend = %u\n", slurm_ctl_conf_ptr->job_file_append); + fprintf(out, "JobRequeue = %u\n", + slurm_ctl_conf_ptr->job_requeue); fprintf(out, "KillWait = %u\n", slurm_ctl_conf_ptr->kill_wait); fprintf(out, "MailProg = %s\n", diff --git a/src/api/init_msg.c b/src/api/init_msg.c index bc1e929dd92..6043ae4e573 100644 --- a/src/api/init_msg.c +++ b/src/api/init_msg.c @@ -118,7 +118,7 @@ void slurm_init_job_desc_msg(job_desc_msg_t * job_desc_msg) job_desc_msg->mail_type = 0; job_desc_msg->mail_user = NULL; job_desc_msg->begin_time = 0; - job_desc_msg->no_requeue = (uint16_t) NO_VAL; + job_desc_msg->requeue = (uint16_t) NO_VAL; #if SYSTEM_DIMENSIONS { int i; diff --git a/src/common/read_config.c b/src/common/read_config.c index 6ab20683535..2b43f284f62 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -2,6 +2,7 @@ * read_config.c - read the overall slurm configuration file ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. + * Copyright (C) 2008 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov>. * UCRL-CODE-226842. 
@@ -156,6 +157,7 @@ s_p_options_t slurm_conf_options[] = { {"JobCredentialPrivateKey", S_P_STRING}, {"JobCredentialPublicCertificate", S_P_STRING}, {"JobFileAppend", S_P_UINT16}, + {"JobRequeue", S_P_UINT16}, {"GetEnvTimeout", S_P_UINT16}, {"KillTree", S_P_UINT16, defunct_option}, {"KillWait", S_P_UINT16}, @@ -1147,10 +1149,10 @@ init_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr) xfree (ctl_conf_ptr->job_comp_host); xfree (ctl_conf_ptr->job_comp_pass); ctl_conf_ptr->job_comp_port = 0; - xfree (ctl_conf_ptr->job_credential_private_key); xfree (ctl_conf_ptr->job_credential_public_certificate); ctl_conf_ptr->job_file_append = (uint16_t) NO_VAL; + ctl_conf_ptr->job_requeue = (uint16_t) NO_VAL; ctl_conf_ptr->kill_wait = (uint16_t) NO_VAL; xfree (ctl_conf_ptr->mail_prog); ctl_conf_ptr->max_job_cnt = (uint16_t) NO_VAL; @@ -1616,6 +1618,11 @@ validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl) if (!s_p_get_uint16(&conf->job_file_append, "JobFileAppend", hashtbl)) conf->job_file_append = 0; + if (!s_p_get_uint16(&conf->job_requeue, "JobRequeue", hashtbl)) + conf->job_requeue = 1; + else if (conf->job_requeue > 1) + conf->job_requeue = 1; + if (!s_p_get_uint16(&conf->get_env_timeout, "GetEnvTimeout", hashtbl)) conf->get_env_timeout = DEFAULT_GET_ENV_TIMEOUT; @@ -1633,8 +1640,10 @@ validate_and_set_defaults(slurm_ctl_conf_t *conf, s_p_hashtbl_t *hashtbl) if (!s_p_get_uint16(&conf->msg_timeout, "MessageTimeout", hashtbl)) conf->msg_timeout = DEFAULT_MSG_TIMEOUT; - else if (conf->msg_timeout > 100) - info("WARNING: MessageTimeout is too high for effective fault-tolerance"); + else if (conf->msg_timeout > 100) { + info("WARNING: MessageTimeout is too high for effective " + "fault-tolerance"); + } if (!s_p_get_uint16(&conf->min_job_age, "MinJobAge", hashtbl)) conf->min_job_age = DEFAULT_MIN_JOB_AGE; diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 4ce27d6b70f..c7950967708 100644 --- a/src/common/slurm_protocol_pack.c +++ 
b/src/common/slurm_protocol_pack.c @@ -1,9 +1,8 @@ /****************************************************************************\ * slurm_protocol_pack.c - functions to pack and unpack structures for RPCs - * - * $Id$ ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. + * Copyright (C) 2008 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Kevin Tew <tew1@llnl.gov>, et. al. * UCRL-CODE-226842. @@ -2238,6 +2237,7 @@ _pack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t * build_ptr, Buf buffer) packstr(build_ptr->job_credential_private_key, buffer); packstr(build_ptr->job_credential_public_certificate, buffer); pack16(build_ptr->job_file_append, buffer); + pack16(build_ptr->job_requeue, buffer); pack16(build_ptr->kill_wait, buffer); @@ -2381,6 +2381,7 @@ _unpack_slurm_ctl_conf_msg(slurm_ctl_conf_info_msg_t ** job_credential_public_certificate, &uint32_tmp, buffer); safe_unpack16(&build_ptr->job_file_append, buffer); + safe_unpack16(&build_ptr->job_requeue, buffer); safe_unpack16(&build_ptr->kill_wait, buffer); @@ -2581,7 +2582,7 @@ _pack_job_desc_msg(job_desc_msg_t * job_desc_ptr, Buf buffer) packstr(job_desc_ptr->work_dir, buffer); pack16(job_desc_ptr->immediate, buffer); - pack16(job_desc_ptr->no_requeue, buffer); + pack16(job_desc_ptr->requeue, buffer); pack16(job_desc_ptr->shared, buffer); pack16(job_desc_ptr->cpus_per_task, buffer); pack16(job_desc_ptr->ntasks_per_node, buffer); @@ -2712,7 +2713,7 @@ _unpack_job_desc_msg(job_desc_msg_t ** job_desc_buffer_ptr, Buf buffer) safe_unpackstr_xmalloc(&job_desc_ptr->work_dir, &uint32_tmp, buffer); safe_unpack16(&job_desc_ptr->immediate, buffer); - safe_unpack16(&job_desc_ptr->no_requeue, buffer); + safe_unpack16(&job_desc_ptr->requeue, buffer); safe_unpack16(&job_desc_ptr->shared, buffer); safe_unpack16(&job_desc_ptr->cpus_per_task, buffer); 
safe_unpack16(&job_desc_ptr->ntasks_per_node, buffer); diff --git a/src/sbatch/opt.c b/src/sbatch/opt.c index be1014d50ab..21c741871c5 100644 --- a/src/sbatch/opt.c +++ b/src/sbatch/opt.c @@ -2,6 +2,7 @@ * opt.c - options processing for sbatch ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. + * Copyright (C) 2008 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Mark Grondona <grondona1@llnl.gov>, et. al. * UCRL-CODE-226842. @@ -87,6 +88,8 @@ #define OPT_OVERCOMMIT 0x11 #define OPT_OPEN_MODE 0x12 #define OPT_ACCTG_FREQ 0x13 +#define OPT_NO_REQUEUE 0x14 +#define OPT_REQUEUE 0x15 /* generic getopt_long flags, integers and *not* valid characters */ #define LONG_OPT_PROPAGATE 0x100 @@ -110,6 +113,7 @@ #define LONG_OPT_NO_REQUEUE 0x116 #define LONG_OPT_COMMENT 0x117 #define LONG_OPT_WRAP 0x118 +#define LONG_OPT_REQUEUE 0x119 #define LONG_OPT_SOCKETSPERNODE 0x130 #define LONG_OPT_CORESPERSOCKET 0x131 #define LONG_OPT_THREADSPERCORE 0x132 @@ -252,7 +256,7 @@ static void _opt_default() opt.no_kill = false; opt.immediate = false; - opt.no_requeue = false; + opt.requeue = NO_VAL; opt.overcommit = false; opt.quiet = 0; @@ -323,7 +327,8 @@ env_vars_t env_vars[] = { {"SBATCH_JOB_NAME", OPT_STRING, &opt.job_name, NULL }, {"SBATCH_LINUX_IMAGE", OPT_STRING, &opt.linuximage, NULL }, {"SBATCH_MLOADER_IMAGE", OPT_STRING, &opt.mloaderimage, NULL }, - {"SBATCH_NO_REQUEUE", OPT_BOOL, &opt.no_requeue, NULL }, + {"SBATCH_NO_REQUEUE", OPT_NO_REQUEUE, NULL, NULL }, + {"SBATCH_REQUEUE", OPT_REQUEUE, NULL, NULL }, {"SBATCH_NO_ROTATE", OPT_BOOL, &opt.no_rotate, NULL }, {"SBATCH_OVERCOMMIT", OPT_OVERCOMMIT, NULL, NULL }, {"SBATCH_PARTITION", OPT_STRING, &opt.partition, NULL }, @@ -444,6 +449,14 @@ _process_env_var(env_vars_t *e, const char *val) error("Invalid SBATCH_OPEN_MODE: %s. 
Ignored", val); break; + case OPT_NO_REQUEUE: + opt.requeue = 0; + break; + + case OPT_REQUEUE: + opt.requeue = 1; + break; + default: /* do nothing */ break; @@ -509,6 +522,7 @@ static struct option long_options[] = { {"mail-user", required_argument, 0, LONG_OPT_MAIL_USER}, {"nice", optional_argument, 0, LONG_OPT_NICE}, {"no-requeue", no_argument, 0, LONG_OPT_NO_REQUEUE}, + {"requeue", no_argument, 0, LONG_OPT_REQUEUE}, {"comment", required_argument, 0, LONG_OPT_COMMENT}, {"sockets-per-node", required_argument, 0, LONG_OPT_SOCKETSPERNODE}, {"cores-per-socket", required_argument, 0, LONG_OPT_CORESPERSOCKET}, @@ -1183,7 +1197,10 @@ static void _set_options(int argc, char **argv) } break; case LONG_OPT_NO_REQUEUE: - opt.no_requeue = true; + opt.requeue = 0; + break; + case LONG_OPT_REQUEUE: + opt.requeue = 1; break; case LONG_OPT_COMMENT: xfree(opt.comment); @@ -2023,7 +2040,8 @@ static void _opt_list() info("plane size : %u", opt.plane_size); info("verbose : %d", opt.verbose); info("immediate : %s", tf_(opt.immediate)); - info("no-requeue : %s", tf_(opt.no_requeue)); + if (opt.requeue != NO_VAL) + info("requeue : %u", opt.requeue); info("overcommit : %s", tf_(opt.overcommit)); if (opt.time_limit == INFINITE) info("time_limit : INFINITE"); @@ -2097,7 +2115,7 @@ static void _usage(void) " [--mloader-image=path] [--ramdisk-image=path]\n" #endif " [--mail-type=type] [--mail-user=user][--nice[=value]]\n" -" [--no-requeue] [--ntasks-per-node=n] [--propagate]\n" +" [--requeue] [--no-requeue] [--ntasks-per-node=n] [--propagate]\n" " [--nodefile=file] [--nodelist=hosts] [--exclude=hosts]\n" " executable [args...]\n"); } @@ -2141,6 +2159,7 @@ static void _help(void) " --uid=user_id user ID to run job as (user root only)\n" " --get-user-env used by Moab. 
See srun man page.\n" " --no-requeue if set, do not permit the job to be requeued\n" +" --requeue if set, permit the job to be requeued\n" " --propagate[=rlimits] propagate all [or specific list of] rlimits\n" "\n" "Constraint options:\n" diff --git a/src/sbatch/opt.h b/src/sbatch/opt.h index 4989beb7f5d..ab6f9751d98 100644 --- a/src/sbatch/opt.h +++ b/src/sbatch/opt.h @@ -97,7 +97,7 @@ typedef struct sbatch_options { bool hold; /* --hold, -H */ bool no_kill; /* --no-kill, -k */ - bool no_requeue; /* --no-requeue */ + int requeue; /* --requeue and --no-requeue */ uint8_t open_mode; /* --open-mode */ int acctg_freq; /* --acctg-freq=secs */ bool overcommit; /* --overcommit -O */ diff --git a/src/sbatch/sbatch.c b/src/sbatch/sbatch.c index c1e103911f1..a0dbb1bf387 100644 --- a/src/sbatch/sbatch.c +++ b/src/sbatch/sbatch.c @@ -277,7 +277,8 @@ static int fill_job_desc_from_opts(job_desc_msg_t *desc) desc->in = opt.ifname; desc->out = opt.ofname; desc->work_dir = opt.cwd; - desc->no_requeue = opt.no_requeue; + if (opt.requeue != NO_VAL) + desc->requeue = opt.requeue; if (opt.open_mode) desc->open_mode = opt.open_mode; if (opt.acctg_freq >= 0) diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index e92b110d8a7..02871653d25 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -749,7 +749,7 @@ void _dump_job_details(struct job_details *detail_ptr, Buf buffer) pack16(detail_ptr->contiguous, buffer); pack16(detail_ptr->cpus_per_task, buffer); pack16(detail_ptr->ntasks_per_node, buffer); - pack16(detail_ptr->no_requeue, buffer); + pack16(detail_ptr->requeue, buffer); pack16(detail_ptr->acctg_freq, buffer); pack8(detail_ptr->open_mode, buffer); @@ -787,7 +787,7 @@ static int _load_job_details(struct job_record *job_ptr, Buf buffer) uint32_t job_min_memory, job_min_tmp_disk; uint32_t num_tasks, name_len, argc = 0; uint16_t shared, contiguous, ntasks_per_node; - uint16_t acctg_freq, cpus_per_task, no_requeue; + uint16_t acctg_freq, cpus_per_task, 
requeue; uint8_t open_mode, overcommit; time_t begin_time, submit_time; int i; @@ -802,7 +802,7 @@ static int _load_job_details(struct job_record *job_ptr, Buf buffer) safe_unpack16(&contiguous, buffer); safe_unpack16(&cpus_per_task, buffer); safe_unpack16(&ntasks_per_node, buffer); - safe_unpack16(&no_requeue, buffer); + safe_unpack16(&requeue, buffer); safe_unpack16(&acctg_freq, buffer); safe_unpack8(&open_mode, buffer); @@ -834,9 +834,9 @@ static int _load_job_details(struct job_record *job_ptr, Buf buffer) job_ptr->job_id, contiguous); goto unpack_error; } - if ((no_requeue > 1) || (overcommit > 1)) { - error("Invalid data for job %u: no_requeue=%u overcommit=%u", - no_requeue, overcommit); + if ((requeue > 1) || (overcommit > 1)) { + error("Invalid data for job %u: requeue=%u overcommit=%u", + requeue, overcommit); goto unpack_error; } @@ -866,7 +866,7 @@ static int _load_job_details(struct job_record *job_ptr, Buf buffer) job_ptr->details->job_min_procs = job_min_procs; job_ptr->details->job_min_memory = job_min_memory; job_ptr->details->job_min_tmp_disk = job_min_tmp_disk; - job_ptr->details->no_requeue = no_requeue; + job_ptr->details->requeue = requeue; job_ptr->details->open_mode = open_mode; job_ptr->details->overcommit = overcommit; job_ptr->details->begin_time = begin_time; @@ -1063,7 +1063,7 @@ extern int kill_running_job_by_node_name(char *node_name, bool step_test) node_name, job_ptr->job_id); _excise_node_from_job(job_ptr, node_ptr); } else if (job_ptr->batch_flag && job_ptr->details && - (job_ptr->details->no_requeue == 0)) { + (job_ptr->details->requeue > 0)) { uint16_t save_state; char requeue_msg[128]; @@ -1149,7 +1149,7 @@ void dump_job_desc(job_desc_msg_t * job_specs) long job_min_memory, job_min_tmp_disk, num_procs; long time_limit, priority, contiguous, acctg_freq; long kill_on_node_fail, shared, immediate; - long cpus_per_task, no_requeue, num_tasks, overcommit; + long cpus_per_task, requeue, num_tasks, overcommit; long ntasks_per_node, 
ntasks_per_socket, ntasks_per_core; char buf[100]; @@ -1274,10 +1274,10 @@ void dump_job_desc(job_desc_msg_t * job_specs) slurm_make_time_str(&job_specs->begin_time, buf, sizeof(buf)); cpus_per_task = (job_specs->cpus_per_task != (uint16_t) NO_VAL) ? (long) job_specs->cpus_per_task : -1L; - no_requeue = (job_specs->no_requeue != (uint16_t) NO_VAL) ? - (long) job_specs->no_requeue : -1L; - debug3(" network=%s begin=%s cpus_per_task=%ld no_requeue=%ld", - job_specs->network, buf, cpus_per_task, no_requeue); + requeue = (job_specs->requeue != (uint16_t) NO_VAL) ? + (long) job_specs->requeue : -1L; + debug3(" network=%s begin=%s cpus_per_task=%ld requeue=%ld", + job_specs->network, buf, cpus_per_task, requeue); ntasks_per_node = (job_specs->ntasks_per_node != (uint16_t) NO_VAL) ? (long) job_specs->ntasks_per_node : -1L; @@ -2570,11 +2570,13 @@ _copy_job_desc_to_job_record(job_desc_msg_t * job_desc, if (job_desc->task_dist != (uint16_t) NO_VAL) detail_ptr->task_dist = job_desc->task_dist; if (job_desc->cpus_per_task != (uint16_t) NO_VAL) - detail_ptr->cpus_per_task = job_desc->cpus_per_task; + detail_ptr->cpus_per_task = MIN(job_desc->cpus_per_task, 1); if (job_desc->ntasks_per_node != (uint16_t) NO_VAL) detail_ptr->ntasks_per_node = job_desc->ntasks_per_node; - if (job_desc->no_requeue != (uint16_t) NO_VAL) - detail_ptr->no_requeue = job_desc->no_requeue; + if (job_desc->requeue != (uint16_t) NO_VAL) + detail_ptr->requeue = MIN(job_desc->requeue, 1); + else + detail_ptr->requeue = slurmctld_conf.job_requeue; if (job_desc->job_min_procs != (uint16_t) NO_VAL) detail_ptr->job_min_procs = job_desc->job_min_procs; detail_ptr->job_min_procs = MAX(detail_ptr->job_min_procs, @@ -4892,7 +4894,7 @@ extern int job_requeue (uid_t uid, uint32_t job_id, slurm_fd conn_fd) rc = ESLURM_ALREADY_DONE; goto reply; } - if ((job_ptr->details == NULL) || job_ptr->details->no_requeue) { + if ((job_ptr->details == NULL) || (job_ptr->details->requeue == 0)) { rc = ESLURM_DISABLED; goto reply; } 
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 1bfacdbbd63..b55af125113 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -1,9 +1,8 @@ /*****************************************************************************\ * proc_req.c - process incomming messages to slurmctld - * - * $Id$ ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. + * Copyright (C) 2008 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette@llnl.gov>, Kevin Tew * <tew1@llnl.gov>, et. al. @@ -352,6 +351,7 @@ void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr) conf_ptr->job_credential_public_certificate = xstrdup(conf-> job_credential_public_certificate); conf_ptr->job_file_append = conf->job_file_append; + conf_ptr->job_requeue = conf->job_requeue; conf_ptr->get_env_timeout = conf->get_env_timeout; conf_ptr->kill_wait = conf->kill_wait; conf_ptr->mail_prog = xstrdup(conf->mail_prog); diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 9fe8e4998ca..a2289063330 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -1,9 +1,8 @@ /*****************************************************************************\ * slurmctld.h - definitions of functions and structures for slurmcltd use - * - * $Id$ ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. + * Copyright (C) 2008 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette <jette1@llnl.gov> et. al. * UCRL-CODE-226842. 
@@ -331,7 +330,7 @@ struct job_details { char *work_dir; /* pathname of working directory */ char **argv; /* arguments for a batch job script */ uint16_t argc; /* count of argv elements */ - uint16_t no_requeue; /* don't requeue job if set */ + uint16_t requeue; /* controls ability to requeue job */ multi_core_data_t *mc_ptr; /* multi-core specific data */ char *dependency; /* wait for other jobs */ List depend_list; /* list of job_ptr:state pairs */ -- GitLab