diff --git a/NEWS b/NEWS index 19bc21a4f1267c3f91afe73e5e10110f780199a6..2337486492576411c6b6e6feef0386c6486cf805 100644 --- a/NEWS +++ b/NEWS @@ -41,6 +41,7 @@ documents those changes that are of interest to users and admins. by the resp is NULL. -- Fix switch/federation plugin so backup controller can assume control repeatedly without leaking or corrupting memory. + -- Add new error code (for Maui/Moab scheduler): ESLURM_JOB_HELD * Changes in SLURM 1.1.9 ======================== diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h index 3fcfe7642181280a5cb4f9f090bac7018298f71b..e20de3359598f50a1eb7047c095f7a2a1b7f84a7 100644 --- a/slurm/slurm_errno.h +++ b/slurm/slurm_errno.h @@ -1,7 +1,7 @@ /*****************************************************************************\ * slurm_errno.h - error codes and functions for slurm ****************************************************************************** - * Copyright (C) 2002 The Regents of the University of California. + * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Kevin Tew <tew1@llnl.gov>, * Jim Garlick <garlick@llnl.gov>, et. al. @@ -136,6 +136,7 @@ enum { ESLURM_BATCH_ONLY, ESLURM_TASKDIST_ARBITRARY_UNSUPPORTED, ESLURM_TASKDIST_REQUIRES_OVERCOMMIT, + ESLURM_JOB_HELD, /* switch specific error codes, specific values defined in plugin module */ ESLURM_SWITCH_MIN = 3000, diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index e84004bc5e03fac573829eb67d76aa153d249fd6..b1dc992d722002b47c125e27658fc5a486783587 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -1,7 +1,7 @@ /*****************************************************************************\ * slurm_errno.c - error codes and functions for slurm ****************************************************************************** - * Copyright (C) 2002 The Regents of the University of California. + * Copyright (C) 2002-2006 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Jim Garlick <garlick@llnl.gov>, et. al. * UCRL-CODE-217948. @@ -179,6 +179,8 @@ static slurm_errtab_t slurm_errtab[] = { "Current SwitchType does not permit arbitrary task distribution"}, { ESLURM_TASKDIST_REQUIRES_OVERCOMMIT, "Requested more tasks than available processors" }, + { ESLURM_JOB_HELD, + "Job is in held state, pending scheduler release" }, /* slurmd error codes */ diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 64b66c54e186d80515f07be0f14dad3a86081238..c479cfd876960e02400d3d1451aeb51308d4e62a 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -1411,8 +1411,9 @@ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, int will_run, error_code = select_nodes(job_ptr, no_alloc); - if ((error_code == ESLURM_NODES_BUSY) || - (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE)) { + if ((error_code == ESLURM_NODES_BUSY) + || (error_code == ESLURM_JOB_HELD) + || (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE)) { /* Not fatal error, but job can't be scheduled right now */ if (immediate) { job_ptr->job_state = JOB_FAILED; @@ -2514,7 +2515,7 @@ static int _validate_job_desc(job_desc_msg_t * job_desc_msg, int allocate, if ((submit_uid != 0) && (submit_uid != slurmctld_conf.slurm_user_id)) { info("attempt by uid %u to set job_id", submit_uid); - return ESLURM_DUPLICATE_JOB_ID; + return ESLURM_INVALID_JOB_ID; } if (job_desc_msg->job_id == 0) { info("attempt by uid %u to set zero job_id", submit_uid); diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 0df933363f2f4dfe2d8aca76d76e260f1d8890a8..1e7c68945e9fedf58983c521d3e24646181fd349 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -838,9 +838,10 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only) if (fail_reason != WAIT_NO_REASON) { if (detail_ptr) detail_ptr->wait_reason = fail_reason; - if (job_ptr->priority != 0) /* not user/admin hold */ - job_ptr->priority = 1; /* sys hold, move to end of queue */ last_job_update = time(NULL); + if (job_ptr->priority == 0) /* user/admin hold */ + return ESLURM_JOB_HELD; + job_ptr->priority = 1; /* sys hold, move to end of queue */ return ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; } diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 1b223b1634b90da6fc1457847c991cac68239182..dd59b3a2813b11d6dd740affaf4bd9de1f5edfcb 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -458,6 +458,7 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg) uid_t uid; int immediate = job_desc_msg->immediate; bool do_unlock = false; + bool job_waiting = false; struct job_record *job_ptr; START_TIMER; @@ -483,9 +484,12 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg) } /* return result */ - if ((error_code == SLURM_SUCCESS) || - ((immediate == 0) && - (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE))) { + if ((error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) + || (error_code == ESLURM_JOB_HELD)) + job_waiting = true; + + if ((error_code == SLURM_SUCCESS) + || ((immediate == 0) && job_waiting)) { xassert(job_ptr); info("_slurm_rpc_allocate_resources JobId=%u NodeList=%s %s", job_ptr->job_id, job_ptr->nodes, TIME_STR); @@ -1767,8 +1771,9 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg) } /* return result */ - if ((error_code != SLURM_SUCCESS) && - (error_code != ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE)) { + if ((error_code != SLURM_SUCCESS) + && (error_code != ESLURM_JOB_HELD) + && (error_code != ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE)) { info("_slurm_rpc_submit_batch_job: %s", slurm_strerror(error_code)); slurm_send_rc_msg(msg, error_code); diff --git a/src/srun/allocate.c b/src/srun/allocate.c index 95ee43ee11c1e1fd95f2da59b88b6a8ea2d9516d..7062b56676833cc44f5d476617ed867611cd298a 100644 --- a/src/srun/allocate.c +++ b/src/srun/allocate.c @@ -121,7 +121,7 @@ allocate_nodes(void) if ((rc == 0) && (resp->node_list == NULL)) { if (resp->error_code) - info("Warning: %s", slurm_strerror(resp->error_code)); + verbose("Warning: %s", slurm_strerror(resp->error_code)); _wait_for_resources(&resp); } /* For diagnosing a node problem, administrators need to sometimes