diff --git a/src/common/job_resources.h b/src/common/job_resources.h
index 83a4a991bbf191246f44737b8c23d7a01ad0ba62..d1d5691e6975bf47b7fe856ce19fb5e342339980 100644
--- a/src/common/job_resources.h
+++ b/src/common/job_resources.h
@@ -154,6 +154,15 @@ extern int build_job_resources_cpu_array(job_resources_t *job_resrcs_ptr);
  * Return total CPU count or -1 on error */
 extern int build_job_resources_cpus_array(job_resources_t *job_resrcs_ptr);
 
+/*
+ * Given that one batch job just completed, attempt to launch a suitable
+ * replacement batch job in a response message of type
+ * REQUEST_BATCH_JOB_LAUNCH, otherwise send a return code of SLURM_SUCCESS
+ * msg IN - The original message from slurmd
+ * fini_job_ptr IN - Pointer to job that just completed and needs replacement
+ */
+extern void replace_batch_job(slurm_msg_t * msg, void *fini_job_ptr);
+
 /* Validate a job_resources data structure originally built using
  * build_job_resources() is still valid based upon slurmctld state.
  * NOTE: Reset the node_bitmap field before calling this function.
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index d336d05e22bf13b6b21d3bda26ac49f178e5d8da..cc5745841210c91e7f8d450839bd14efc146c4f7 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -344,6 +344,63 @@ static void do_diag_stats(struct timeval tv1, struct timeval tv2)
 }
 
+/*
+ * Given that one batch job just completed, attempt to launch a suitable
+ * replacement batch job in a response message of type
+ * REQUEST_BATCH_JOB_LAUNCH, otherwise send a return code of SLURM_SUCCESS
+ * msg IN - The original message from slurmd
+ * fini_job IN - Pointer to job that just completed and needs replacement
+ */
+extern void replace_batch_job(slurm_msg_t * msg, void *fini_job)
+{
+	/* Locks: Read config, write job, write node, read partition */
+	slurmctld_lock_t job_write_lock =
+	    { READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK };
+	struct job_record *job_ptr;
+	struct job_record *fini_job_ptr = (struct job_record *) fini_job;
+	ListIterator job_iterator;
+	batch_job_launch_msg_t *launch_msg = NULL;
+	int error_code;
+
+	lock_slurmctld(job_write_lock);
+	if (!avail_front_end())
+		goto no_test;
+	job_iterator = list_iterator_create(job_list);
+	if (job_iterator == NULL)
+		fatal("list_iterator_create memory allocation failure");
+	while ((job_ptr = (struct job_record *) list_next(job_iterator))) {
+		if (!IS_JOB_PENDING(job_ptr) || (job_ptr->priority == 0))
+			continue;
+/* FIXME: LOTS OF JOB VALIDATION, PROLOG, ETC. NEEDS TO HAPPEN HERE */
+/* FIXME: select_nodes() can be vastly streamlined to only use the resources
+ *	  just released by fini_job */
+		if (!job_ptr->batch_flag)
+			continue;
+		error_code = select_nodes(job_ptr, false, NULL);
+		if (error_code == SLURM_SUCCESS) {
+			last_job_update = time(NULL);
+			info("sched: Allocate JobId=%u NodeList=%s #CPUs=%u",
+			     job_ptr->job_id, job_ptr->nodes,
+			     job_ptr->total_cpus);
+			launch_msg = build_launch_job_msg(job_ptr);
+		}
+		break;
+	}
+no_test: unlock_slurmctld(job_write_lock);
+
+	if (launch_msg) {
+		slurm_msg_t response_msg;
+		slurm_msg_t_init(&response_msg);
+		response_msg.flags = msg->flags;
+		response_msg.protocol_version = msg->protocol_version;
+		response_msg.address = msg->address;
+		response_msg.msg_type = REQUEST_BATCH_JOB_LAUNCH;
+		response_msg.data = launch_msg;
+		slurm_send_node_msg(msg->conn_fd, &response_msg);
+	} else
+		slurm_send_rc_msg(msg, SLURM_SUCCESS);
+}
+
 /*
  * schedule - attempt to schedule all pending jobs
  *	pending jobs for each partition will be scheduled in priority
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index 7065dcf4ee14037ad2c2bb66f645fbcd46262ae9..444742407adcd5c683d196937733d3955f6ddb98 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -1627,13 +1627,12 @@ static void _slurm_rpc_complete_batch_script(slurm_msg_t * msg)
 	} else {
 		debug2("_slurm_rpc_complete_batch_script JobId=%u %s",
 		       comp_msg->job_id, TIME_STR);
-		slurm_send_rc_msg(msg, SLURM_SUCCESS);
 		slurmctld_diag_stats.jobs_completed++;
 		dump_job = true;
+		if (msg->msg_type == REQUEST_COMPLETE_BATCH_JOB)
+			replace_batch_job(msg, job_ptr);
 	}
 
-	if (msg->msg_type == REQUEST_COMPLETE_BATCH_JOB)
-		(void) schedule(0);
 	if (dump_job)
 		(void) schedule_job_save();	/* Has own locking */
 	if (dump_node)
diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index ed9005d543a1ec7aed028dd723a9ce41c77f86b4..829556961e91eb17c92ac54c42cf56edb9f91949 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -140,7 +140,7 @@ static int _step_limits_match(void *x, void *key);
 static int _terminate_all_steps(uint32_t jobid, bool batch);
 static void _rpc_launch_tasks(slurm_msg_t *);
 static void _rpc_abort_job(slurm_msg_t *);
-static void _rpc_batch_job(slurm_msg_t *);
+static void _rpc_batch_job(slurm_msg_t *msg, bool new_msg);
 static void _rpc_job_notify(slurm_msg_t *);
 static void _rpc_signal_tasks(slurm_msg_t *);
 static void _rpc_checkpoint_tasks(slurm_msg_t *);
@@ -245,7 +245,7 @@ slurmd_req(slurm_msg_t *msg)
 		/* Mutex locking moved into _rpc_batch_job() due to
 		 * very slow prolog on Blue Gene system. Only batch
 		 * jobs are supported on Blue Gene (no job steps). */
-		_rpc_batch_job(msg);
+		_rpc_batch_job(msg, true);
 		last_slurmctld_msg = time(NULL);
 		slurm_free_job_launch_msg(msg->data);
 		break;
@@ -1255,21 +1255,23 @@ _set_batch_job_limits(slurm_msg_t *msg)
 }
 
 static void
-_rpc_batch_job(slurm_msg_t *msg)
+_rpc_batch_job(slurm_msg_t *msg, bool new_msg)
 {
 	batch_job_launch_msg_t *req = (batch_job_launch_msg_t *)msg->data;
 	bool     first_job_run = true;
 	int      rc = SLURM_SUCCESS;
-	uid_t    req_uid = g_slurm_auth_get_uid(msg->auth_cred, NULL);
 	char    *resv_id = NULL;
 	bool	 replied = false;
 	slurm_addr_t *cli = &msg->orig_addr;
 
-	if (!_slurm_authorized_user(req_uid)) {
-		error("Security violation, batch launch RPC from uid %d",
-		      req_uid);
-		rc = ESLURM_USER_ID_MISSING;	/* or bad in this case */
-		goto done;
+	if (new_msg) {
+		uid_t req_uid = g_slurm_auth_get_uid(msg->auth_cred, NULL);
+		if (!_slurm_authorized_user(req_uid)) {
+			error("Security violation, batch launch RPC from uid %d",
+			      req_uid);
+			rc = ESLURM_USER_ID_MISSING;	/* or bad in this case */
+			goto done;
+		}
 	}
 	slurm_cred_handle_reissue(conf->vctx, req->cred);
 	if (slurm_cred_revoked(conf->vctx, req->cred)) {
@@ -1295,7 +1297,7 @@ _rpc_batch_job(slurm_msg_t *msg)
 	 * Just reply now and send a separate kill job request if the
 	 * prolog or launch fail. */
 	replied = true;
-	if (slurm_send_rc_msg(msg, rc) < 1) {
+	if (new_msg && (slurm_send_rc_msg(msg, rc) < 1)) {
 		/* The slurmctld is no longer waiting for a reply.
 		 * This typically indicates that the slurmd was
 		 * blocked from memory and/or CPUs and the slurmctld
@@ -1381,7 +1383,7 @@ _rpc_batch_job(slurm_msg_t *msg)
 
     done:
 	if (!replied) {
-		if (slurm_send_rc_msg(msg, rc) < 1) {
+		if (new_msg && (slurm_send_rc_msg(msg, rc) < 1)) {
 			/* The slurmctld is no longer waiting for a reply.
 			 * This typically indicates that the slurmd was
 			 * blocked from memory and/or CPUs and the slurmctld
@@ -3536,7 +3538,7 @@ _rpc_complete_batch(slurm_msg_t *msg)
 	/* (resp_msg.msg_type == REQUEST_BATCH_JOB_LAUNCH) */
 	debug2("Processing RPC: REQUEST_BATCH_JOB_LAUNCH");
 	last_slurmctld_msg = time(NULL);
-	_rpc_batch_job(&resp_msg);
+	_rpc_batch_job(&resp_msg, false);
 	slurm_free_job_launch_msg(resp_msg.data);
 }