diff --git a/NEWS b/NEWS index 16e83326c24f521377a611ae4c045ec64ac10902..4703a2762dafbb2a4233c11ce2c71a82637676f9 100644 --- a/NEWS +++ b/NEWS @@ -473,6 +473,7 @@ documents those changes that are of interest to users and admins. -- If an invalid assoc_ptr comes in don't use the id to verify it. -- Sched/backfill modified to avoid using nodes in completing state. -- Correct support for job --profile=none option and related documentation. + -- Properly enforce job --requeue and --norequeue options. * Changes in Slurm 2.6.9 ======================== diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 1ca0f829c4242d598d02eb6bbc5d1659899d5c3d..c398154801e6f46073f6849882ac1e7f4cc0c3b7 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -2618,8 +2618,7 @@ extern int kill_running_job_by_node_name(char *node_name) excise_node_from_job(job_ptr, node_ptr); job_post_resize_acctg(job_ptr); } else if (job_ptr->batch_flag && job_ptr->details && - slurmctld_conf.job_requeue && - (job_ptr->details->requeue > 0)) { + job_ptr->details->requeue) { char requeue_msg[128]; srun_node_fail(job_ptr->job_id, node_name); @@ -8471,7 +8470,7 @@ int update_job(job_desc_msg_t * job_specs, uid_t uid) goto fini; if ((job_specs->requeue != (uint16_t) NO_VAL) && detail_ptr) { - detail_ptr->requeue = job_specs->requeue; + detail_ptr->requeue = MIN(job_specs->requeue, 1); info("sched: update_job: setting requeue to %u for job_id %u", job_specs->requeue, job_specs->job_id); } @@ -9614,8 +9613,8 @@ static void _purge_missing_jobs(int node_inx, time_t now) (job_ptr->start_time < startup_time) && (node_inx == bit_ffs(job_ptr->node_bitmap))) { bool requeue = false; - if (slurmctld_conf.job_requeue && - (job_ptr->start_time < node_ptr->boot_time)) + if ((job_ptr->start_time < node_ptr->boot_time) && + (job_ptr->details && job_ptr->details->requeue)) requeue = true; info("Batch JobId=%u missing from node 0", job_ptr->job_id); diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index e467aee6b417f6c58f3c037664e0adb356cac698..12785dc329d4b7da101fc642c685e56a9902ec12 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -1668,7 +1668,6 @@ static void _slurm_rpc_complete_job_allocation(slurm_msg_t * msg) NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; uid_t uid = g_slurm_auth_get_uid(msg->auth_cred, NULL); - bool job_requeue = false; /* init */ START_TIMER; @@ -1682,7 +1681,7 @@ static void _slurm_rpc_complete_job_allocation(slurm_msg_t * msg) /* do RPC call */ /* Mark job and/or job step complete */ error_code = job_complete(comp_msg->job_id, uid, - job_requeue, false, comp_msg->job_rc); + false, false, comp_msg->job_rc); unlock_slurmctld(job_write_lock); _throttle_fini(&active_rpc_cnt); END_TIMER2("_slurm_rpc_complete_job_allocation"); @@ -1892,7 +1891,8 @@ static void _slurm_rpc_complete_batch_script(slurm_msg_t * msg) getuid()); #endif /* !HAVE_FRONT_END */ #endif /* !HAVE_BG */ - if (comp_msg->job_rc != SLURM_SUCCESS) + if ((comp_msg->job_rc != SLURM_SUCCESS) && job_ptr && + job_ptr->details && job_ptr->details->requeue) job_requeue = true; dump_job = true; dump_node = true; @@ -2740,7 +2740,6 @@ static void _slurm_rpc_step_complete(slurm_msg_t *msg) slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; uid_t uid = g_slurm_auth_get_uid(msg->auth_cred, NULL); - bool job_requeue = false; bool dump_job = false, dump_node = false; /* init */ @@ -2767,7 +2766,7 @@ static void _slurm_rpc_step_complete(slurm_msg_t *msg) if (req->job_step_id == SLURM_BATCH_SCRIPT) { /* FIXME: test for error, possibly cause batch job requeue */ - error_code = job_complete(req->job_id, uid, job_requeue, + error_code = job_complete(req->job_id, uid, false, false, step_rc); unlock_slurmctld(job_write_lock); _throttle_fini(&active_rpc_cnt); @@ -2786,7 +2785,7 @@ static void _slurm_rpc_step_complete(slurm_msg_t *msg) } } else { error_code = job_step_complete(req->job_id, req->job_step_id, - uid, job_requeue, step_rc); + uid, false, step_rc); unlock_slurmctld(job_write_lock); _throttle_fini(&active_rpc_cnt); END_TIMER2("_slurm_rpc_step_complete");