diff --git a/NEWS b/NEWS
index a7e84f1064555e2b50b9d48cfa1b20cee8f56054..18e9a895e2999f5bdffc3945de0e0b2464b123a8 100644
--- a/NEWS
+++ b/NEWS
@@ -65,7 +65,11 @@ documents those changes that are of interest to users and admins.
  -- Add srun support for SLURM_THREADS and PMI_FANOUT environment variables.
  -- Fix support in squeue for output format with left justification of
     reason (%r) and reason/node_list (%R) output.
-
+ -- Automatically requeue a batch job when a node allocated to it fails
+    or the prolog fails (unless --no-requeue or --no-kill option used).
+ -- In sched/wiki, enable use of wiki.conf parameter ExcludePartitions to
+    directly schedule selected partitions without Maui control.
+
 * Changes in SLURM 1.2.16
 =========================
  -- Add --overcommit option to the salloc command.
diff --git a/doc/html/maui.shtml b/doc/html/maui.shtml
index 3331360df00e7e88149cdf313732b808d7880b86..a76aef47205db4df9293de784b08df67b613b115 100644
--- a/doc/html/maui.shtml
+++ b/doc/html/maui.shtml
@@ -108,9 +108,11 @@ includes a description of keywords presently
 only supported by the sched/wiki2 plugin for use with the
 Moab Scheduler.</p>
 
-<p>They only wiki.conf keyword used by the sched/wiki plugin
-is <b>AuthKey</b>, which should match the key used to configure
-Maui at build time.
+<p>Only two wiki.conf parameters are used by the sched/wiki plugin:
+<b>AuthKey</b> should match the key used to configure
+Maui at build time and
+<b>ExcludePartitions</b> can be used for SLURM to directly
+schedule jobs in select partitions without Maui control.
 Note that SLURM's wiki plugin does not include a mechanism
 to submit new jobs, so even without this key nobody could
 run jobs as another user.
@@ -130,11 +132,14 @@ configuration.</p>
 #
 # Matches Maui's --with-key configuration parameter
 AuthKey=42
+#
+# SLURM to directly schedule "debug" partition
+ExcludePartitions=debug
 </pre>
 </p>
 <p class="footer"><a href="#top">top</a></p>
 
-<p style="text-align:center;">Last modified 21 August 2007</p>
+<p style="text-align:center;">Last modified 17 September 2007</p>
 
 <!--#include virtual="footer.txt"-->
diff --git a/doc/man/man5/wiki.conf.5 b/doc/man/man5/wiki.conf.5
index 5b6fce9dd6bbb5368a96a4b17b5dc59036958739..536327fb2ccfe54e0cebf066bebf5197b04b95df 100644
--- a/doc/man/man5/wiki.conf.5
+++ b/doc/man/man5/wiki.conf.5
@@ -49,19 +49,18 @@
 Not applicable to wiki plugin, only the wiki2 plugin.
 .TP
 \fBExcludePartitions\fR
 Identifies partitions whose jobs are to be scheduled directly
-by SLURM rather than Moab.
-This only effects jobs which are submitted using Slurm
+by SLURM rather than Moab/Maui.
+This only affects jobs which are submitted using SLURM
 commands (i.e. srun, salloc or sbatch, NOT msub from Moab).
 These jobs will be scheduled on a First\-Come\-First\-Served
-basis.
-This may provide faster response times than Moab scheduling.
-Moab will account for and report the jobs, but their initiation
-will be outside of Moab's control.
-Note that Moab controls for resource reservation, fair share
+basis directly by SLURM.
+Moab/Maui controls for resource reservation, fair share
 scheduling, etc. will not apply to the initiation of these jobs.
+While Moab/Maui will not control the initiation of jobs in these
+partitions, it will account for and report the jobs.
 If more than one partition is to be scheduled directly by
-Slurm, use a comma separator between their names.
-Not applicable to wiki plugin, only the wiki2 plugin.
+SLURM, use a comma separator between their names.
+This may provide faster response times than Moab/Maui scheduling.
 .TP
 \fBHostFormat\fR
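(Illustration, not part of the patch: the man page text above describes a comma-separated list of partition names, so a wiki.conf for the sched/wiki plugin could combine both parameters as in the sketch below; the partition names are hypothetical.)

# Matches Maui's --with-key configuration parameter
AuthKey=42
#
# SLURM directly schedules jobs in these partitions, bypassing Maui
ExcludePartitions=debug,interactive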
diff --git a/src/plugins/sched/wiki/msg.c b/src/plugins/sched/wiki/msg.c
index 01e0c5fb43a8efd2e1a05dc03df9fe0a8e904a83..0f9b9e9b303a5b348d20bdfe30c86a1fb01f5acc 100644
--- a/src/plugins/sched/wiki/msg.c
+++ b/src/plugins/sched/wiki/msg.c
@@ -54,6 +54,7 @@ char auth_key[KEY_SIZE] = "";
 char e_host[E_HOST_SIZE] = "";
 char e_host_bu[E_HOST_SIZE] = "";
 uint16_t e_port = 0;
+struct part_record *exclude_part_ptr[EXC_PART_CNT];
 uint16_t job_aggregation_time = 10;	/* Default value is 10 seconds */
 int init_prio_mode = PRIO_HOLD;
 uint16_t kill_wait;
@@ -228,15 +229,20 @@ extern int parse_wiki_config(void)
 		{"EHost", S_P_STRING},
 		{"EHostBackup", S_P_STRING},
 		{"EPort", S_P_UINT16},
+		{"ExcludePartitions", S_P_STRING},
 		{"JobAggregationTime", S_P_UINT16},
 		{"JobPriority", S_P_STRING},
 		{NULL} };
 	s_p_hashtbl_t *tbl;
+	char *exclude_partitions;
 	char *key = NULL, *priority_mode = NULL, *wiki_conf;
 	struct stat buf;
 	slurm_ctl_conf_t *conf;
+	int i;
 
 	/* Set default values */
+	for (i=0; i<EXC_PART_CNT; i++)
+		exclude_part_ptr[i] = NULL;
 	conf = slurm_conf_lock();
 	strncpy(e_host, conf->control_addr, sizeof(e_host));
 	if (conf->backup_addr) {
@@ -276,6 +282,25 @@ extern int parse_wiki_config(void)
 	s_p_get_uint16(&e_port, "EPort", tbl);
 	s_p_get_uint16(&job_aggregation_time, "JobAggregationTime", tbl);
 
+	if (s_p_get_string(&exclude_partitions, "ExcludePartitions", tbl)) {
+		char *tok, *tok_p;
+		tok = strtok_r(exclude_partitions, ",", &tok_p);
+		i = 0;
+		while (tok) {
+			if (i >= EXC_PART_CNT) {
+				error("ExcludePartitions has too many entries, "
+				      "skipping %s and later entries", tok);
+				break;
+			}
+			exclude_part_ptr[i] = find_part_record(tok);
+			if (exclude_part_ptr[i])
+				i++;
+			else
+				error("ExcludePartitions %s not found", tok);
+			tok = strtok_r(NULL, ",", &tok_p);
+		}
+	}
+
 	if (s_p_get_string(&priority_mode, "JobPriority", tbl)) {
 		if (strcasecmp(priority_mode, "hold") == 0)
 			init_prio_mode = PRIO_HOLD;
diff --git a/src/plugins/sched/wiki/msg.h b/src/plugins/sched/wiki/msg.h
index 12fdec816b4485cd13cd517fb65d7bce78dc867c..76458521a45c5bac36e9b5eb5a8a03fb6946eaa4 100644
--- a/src/plugins/sched/wiki/msg.h
+++ b/src/plugins/sched/wiki/msg.h
@@ -79,9 +79,11 @@
 #include "src/common/xmalloc.h"
 #include "src/common/xsignal.h"
 #include "src/common/xstring.h"
+#include "src/slurmctld/slurmctld.h"
 
 /* Global configuration parameters */
 #define E_HOST_SIZE 256
+#define EXC_PART_CNT 10
 #define KEY_SIZE 32
 #define PRIO_HOLD 0
 #define PRIO_DECREMENT 1
@@ -90,6 +92,7 @@ extern char auth_key[KEY_SIZE];
 extern char e_host[E_HOST_SIZE];
 extern char e_host_bu[E_HOST_SIZE];
 extern uint16_t e_port;
+extern struct part_record *exclude_part_ptr[EXC_PART_CNT];
 extern uint16_t job_aggregation_time;
 extern uint16_t kill_wait;
 extern uint16_t use_host_exp;
diff --git a/src/plugins/sched/wiki/sched_wiki.c b/src/plugins/sched/wiki/sched_wiki.c
index f01f1badfb77b877e556d4a4b3b9643d7a9ad4f5..64606a366d998632fb2c9ef1e3945a77ae943151 100644
--- a/src/plugins/sched/wiki/sched_wiki.c
+++ b/src/plugins/sched/wiki/sched_wiki.c
@@ -84,13 +84,30 @@ extern uint32_t slurm_sched_plugin_initial_priority(
 					uint32_t last_prio,
 					struct job_record *job_ptr )
 {
+	if (exclude_part_ptr[0]) {
+		/* Interactive job (initiated by srun) in partition
+		 * excluded from Moab scheduling */
+		int i;
+		static int exclude_prio = 100000000;
+		for (i=0; i<EXC_PART_CNT; i++) {
+			if (exclude_part_ptr[i] == NULL)
+				break;
+			if (exclude_part_ptr[i] == job_ptr->part_ptr) {
+				debug("Scheduling job %u directly (no Maui)",
+				      job_ptr->job_id);
+				return (exclude_prio--);
+			}
+		}
+		return 0;
+	}
+
 	if (init_prio_mode == PRIO_DECREMENT) {
 		if (last_prio >= 2)
 			return (last_prio - 1);
 		else
 			return 1;
-	} else
-		return 0;
+	}
+	return 0;
 }
 
 /**************************************************************************/
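(Illustration, not part of the patch: the priority scheme added to slurm_sched_plugin_initial_priority() above can be shown in isolation. A static counter starts at 100000000 and is decremented for each directly scheduled job, so jobs in an excluded partition start in first-come-first-served order. The sketch below is a standalone C program, not SLURM source.)

#include <stdio.h>

/* Each directly scheduled job receives a slightly lower priority than
 * the previous one, preserving FIFO order within excluded partitions. */
static int exclude_prio = 100000000;

static int next_direct_priority(void)
{
	return exclude_prio--;
}

int main(void)
{
	int i;
	for (i = 1; i <= 3; i++)
		printf("job %d gets priority %d\n", i, next_direct_priority());
	return 0;
}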
diff --git a/src/slurmctld/agent.c b/src/slurmctld/agent.c
index e6e44d17dc2829ade4fbe5b0afc999d22f13e767..64eb83f83af7d9a950940525da2457177b328b68 100644
--- a/src/slurmctld/agent.c
+++ b/src/slurmctld/agent.c
@@ -871,9 +871,10 @@ static void *_thread_per_group_rpc(void *args)
 			run_scheduler = true;
 			unlock_slurmctld(job_write_lock);
 		}
-
-		/* SPECIAL CASE: Kill non-startable batch job */
-		if ((msg_type == REQUEST_BATCH_JOB_LAUNCH) && rc &&
+		/* SPECIAL CASE: Kill non-startable batch job,
+		 * Requeue the job on ESLURMD_PROLOG_FAILED */
+		if ((msg_type == REQUEST_BATCH_JOB_LAUNCH) &&
+		    (rc != SLURM_SUCCESS) && (rc != ESLURMD_PROLOG_FAILED) &&
 		    (ret_data_info->type != RESPONSE_FORWARD_FAILED)) {
 			batch_job_launch_msg_t *launch_msg_ptr =
 				task_ptr->msg_args_ptr;
@@ -890,9 +891,9 @@
 #endif
 
-		if (((msg_type == REQUEST_SIGNAL_TASKS)
-		    || (msg_type == REQUEST_TERMINATE_TASKS))
-		    && (rc == ESRCH)) {
+		if (((msg_type == REQUEST_SIGNAL_TASKS) ||
+		     (msg_type == REQUEST_TERMINATE_TASKS)) &&
+		    (rc == ESRCH)) {
 			/* process is already dead, not a real error */
 			rc = SLURM_SUCCESS;
 		}
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index a472e70acb64fa1119cec38d81542b2d219a2aad..70bee45dc9882f91d5366096a0d493f358612739 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -1016,6 +1016,7 @@ extern int kill_running_job_by_node_name(char *node_name, bool step_test)
 	struct node_record *node_ptr;
 	int bit_position;
 	int job_count = 0;
+	time_t now = time(NULL);
 
 	node_ptr = find_node_record(node_name);
 	if (node_ptr == NULL)	/* No such node */
@@ -1055,10 +1056,28 @@ extern int kill_running_job_by_node_name(char *node_name, bool step_test)
 				continue;
 
 			job_count++;
-			srun_node_fail(job_ptr->job_id, node_name);
-			if ((job_ptr->details == NULL) ||
-			    (job_ptr->kill_on_node_fail) ||
-			    (job_ptr->node_cnt <= 1)) {
+			if ((job_ptr->details) &&
+			    (job_ptr->kill_on_node_fail == 0) &&
+			    (job_ptr->node_cnt > 1)) {
+				/* keep job running on remaining nodes */
+				srun_node_fail(job_ptr->job_id, node_name);
+				error("Removing failed node %s from job_id %u",
+				      node_name, job_ptr->job_id);
+				_excise_node_from_job(job_ptr, node_ptr);
+			} else if (job_ptr->batch_flag && job_ptr->details &&
+				   (job_ptr->details->no_requeue == 0)) {
+				info("requeue job %u due to failure of node %s",
+				     job_ptr->job_id, node_name);
+				_set_job_prio(job_ptr);
+				job_ptr->time_last_active = now;
+				job_ptr->job_state = JOB_PENDING | JOB_COMPLETING;
+				if (suspended)
+					job_ptr->end_time = job_ptr->suspend_time;
+				else
+					job_ptr->end_time = now;
+				deallocate_nodes(job_ptr, false, suspended);
+				job_completion_logger(job_ptr);
+			} else {
 				info("Killing job_id %u on failed node %s",
 				     job_ptr->job_id, node_name);
 				job_ptr->job_state = JOB_NODE_FAIL |
@@ -1073,17 +1092,13 @@ extern int kill_running_job_by_node_name(char *node_name, bool step_test)
 				job_ptr->end_time = time(NULL);
 				job_completion_logger(job_ptr);
 				deallocate_nodes(job_ptr, false, suspended);
-			} else {
-				error("Removing failed node %s from job_id %u",
-				      node_name, job_ptr->job_id);
-				_excise_node_from_job(job_ptr, node_ptr);
 			}
 		}
 	}
 	list_iterator_destroy(job_iterator);
 
 	if (job_count)
-		last_job_update = time(NULL);
+		last_job_update = now;
 	return job_count;
 }
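(Illustration, not part of the patch: the node-failure handling added to kill_running_job_by_node_name() above reduces to a three-way decision. The struct and enum below are hypothetical simplifications of the job_record fields the patch tests, not SLURM data structures.)

#include <stdbool.h>
#include <stdio.h>

enum node_fail_action { EXCISE_NODE, REQUEUE_JOB, KILL_JOB };

struct job_info {
	bool has_details;	/* job_ptr->details != NULL */
	bool kill_on_node_fail;	/* cleared by the --no-kill option */
	int  node_cnt;		/* nodes currently allocated to the job */
	bool batch_flag;	/* job was submitted as a batch script */
	bool no_requeue;	/* set by the --no-requeue option */
};

/* Mirrors the branch order the patch adds to kill_running_job_by_node_name() */
static enum node_fail_action on_node_failure(const struct job_info *job)
{
	if (job->has_details && !job->kill_on_node_fail && job->node_cnt > 1)
		return EXCISE_NODE;	/* keep running on the remaining nodes */
	if (job->batch_flag && job->has_details && !job->no_requeue)
		return REQUEUE_JOB;	/* batch job goes back to JOB_PENDING */
	return KILL_JOB;		/* no safe way to continue the job */
}

int main(void)
{
	struct job_info batch_job = { true, true, 1, true, false };
	printf("action=%d (1 means requeue)\n", on_node_failure(&batch_job));
	return 0;
}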