diff --git a/doc/html/heterogeneous_jobs.shtml b/doc/html/heterogeneous_jobs.shtml
index ff57932a3f56368543dc206393339685eeff8ae1..5b37a64a3a8c3affae68a944f2276a9842bce643 100644
--- a/doc/html/heterogeneous_jobs.shtml
+++ b/doc/html/heterogeneous_jobs.shtml
@@ -46,7 +46,7 @@ A list of propogated options follows.</p>
 <li>--clusters</li>
 <li>--comment</li>
 <li>--deadline</li>
-<li>delay-boot</li>
+<li>--delay-boot</li>
 <li>--dependency</li>
 <li>--error</li>
 <li>--export</li>
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 382cf7c6ef59811ec5fb8aab481400b1cbcbad4b..31338e2fb3d1e51e8ec39a0400d458c4c935fc39 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -3172,7 +3172,7 @@ extern void slurm_free_resource_allocation_response_msg(resource_allocation_resp
  * allocate resources for a list of job requests. This call will block
  * until the entire allocation is granted, or the specified timeout limit
  * is reached.
- * IN req - List of resource allocation requests
+ * IN job_req_list - List of resource allocation requests, type job_desc_msg_t
  * IN timeout - amount of time, in seconds, to wait for a response before
  *	giving up.
  *	A timeout of zero will wait indefinitely.
@@ -3198,17 +3198,18 @@ List slurm_allocate_pack_job_blocking(List job_req_list, time_t timeout,
  * NOTE: free the response using slurm_free_resource_allocation_response_msg()
  */
 extern int slurm_allocation_lookup(uint32_t job_id,
-				   resource_allocation_response_msg_t **info);
+				   resource_allocation_response_msg_t **resp);
 
 /*
  * slurm_pack_job_lookup - retrieve info for an existing heterogeneous job
  *			   allocation without the addrs and such
  * IN jobid - job allocation identifier
- * OUT info - job allocation information
+ * OUT resp - list of job allocation information, type
+ *	      resource_allocation_response_msg_t
  * RET 0 on success, otherwise return -1 and set errno to indicate the error
  * NOTE: free the response using list_destroy()
  */
-extern int slurm_pack_job_lookup(uint32_t jobid, List* info);
+extern int slurm_pack_job_lookup(uint32_t jobid, List *resp);
 
 /*
  * slurm_read_hostfile - Read a SLURM hostfile specified by "filename".
@@ -3257,6 +3258,17 @@ extern void slurm_allocation_msg_thr_destroy(allocation_msg_thread_t *msg_thr);
 extern int slurm_submit_batch_job(job_desc_msg_t *job_desc_msg,
 				  submit_response_msg_t **slurm_alloc_msg);
 
+/*
+ * slurm_submit_batch_pack_job - issue RPC to submit a heterogeneous job for
+ *				 later execution
+ * NOTE: free the response using slurm_free_submit_response_response_msg
+ * IN job_req_list - List of resource allocation requests, type job_desc_msg_t
+ * OUT slurm_alloc_msg - response to request
+ * RET 0 on success, otherwise return -1 and set errno to indicate the error
+ */
+extern int slurm_submit_batch_pack_job(List job_req_list,
+				       submit_response_msg_t **slurm_alloc_msg);
+
 /*
  * slurm_free_submit_response_response_msg - free slurm
  *	job submit response message
diff --git a/src/api/submit.c b/src/api/submit.c
index 0d53ff5f342ee648a8e51281944626d136b44fc5..29cc815d5f49773d10a9392a790cdb7ec67dabab 100644
--- a/src/api/submit.c
+++ b/src/api/submit.c
@@ -49,56 +49,106 @@ extern pid_t getsid(pid_t pid); /* missing from <unistd.h> */
 #include "src/common/read_config.h"
 #include "src/common/slurm_protocol_api.h"
+#include "src/common/xmalloc.h"
+#include "src/common/xstring.h"
 
 /*
  * slurm_submit_batch_job - issue RPC to submit a job for later execution
  * NOTE: free the response using slurm_free_submit_response_response_msg
  * IN job_desc_msg - description of batch job request
- * OUT slurm_alloc_msg - response to request
+ * OUT resp - response to request
  * RET 0 on success, otherwise return -1 and set errno to indicate the error
  */
-int
-slurm_submit_batch_job (job_desc_msg_t *req,
-			submit_response_msg_t **resp)
+extern int slurm_submit_batch_job(job_desc_msg_t *req,
+				  submit_response_msg_t **resp)
 {
-	int rc;
-	slurm_msg_t req_msg;
-	slurm_msg_t resp_msg;
-	bool host_set = false;
-	char host[64];
+	int rc;
+	slurm_msg_t req_msg;
+	slurm_msg_t resp_msg;
+	char *local_hostname = NULL;
 
 	slurm_msg_t_init(&req_msg);
 	slurm_msg_t_init(&resp_msg);
-
 	/*
 	 * set Node and session id for this request
 	 */
 	if (req->alloc_sid == NO_VAL)
 		req->alloc_sid = getsid(0);
 
-	if ( (req->alloc_node == NULL)
-	    && (gethostname_short(host, sizeof(host)) == 0) ) {
-		req->alloc_node = host;
-		host_set = true;
+	if (req->alloc_node == NULL) {
+		local_hostname = xshort_hostname();
+		req->alloc_node = local_hostname;
 	}
 
-	req_msg.msg_type = REQUEST_SUBMIT_BATCH_JOB ;
+	req_msg.msg_type = REQUEST_SUBMIT_BATCH_JOB;
 	req_msg.data = req;
 
 	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
 					    working_cluster_rec);
+	xfree(local_hostname);
+	if (rc == SLURM_SOCKET_ERROR)
+		return SLURM_ERROR;
+
+	switch (resp_msg.msg_type) {
+	case RESPONSE_SLURM_RC:
+		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
+		if (rc)
+			slurm_seterrno_ret(rc);
+		*resp = NULL;
+		break;
+	case RESPONSE_SUBMIT_BATCH_JOB:
+		*resp = (submit_response_msg_t *) resp_msg.data;
+		break;
+	default:
+		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
+	}
+
+	return SLURM_PROTOCOL_SUCCESS;
+}
+
+/*
+ * slurm_submit_batch_pack_job - issue RPC to submit a heterogeneous job for
+ *				 later execution
+ * NOTE: free the response using slurm_free_submit_response_response_msg
+ * IN job_req_list - List of resource allocation requests, type job_desc_msg_t
+ * OUT resp - response to request
+ * RET 0 on success, otherwise return -1 and set errno to indicate the error
+ */
+extern int slurm_submit_batch_pack_job(List job_req_list,
+				       submit_response_msg_t **resp)
+{
+	int rc;
+	job_desc_msg_t *req;
+	slurm_msg_t req_msg;
+	slurm_msg_t resp_msg;
+	char *local_hostname = NULL;
+	ListIterator iter;
+
+	slurm_msg_t_init(&req_msg);
+	slurm_msg_t_init(&resp_msg);
 
 	/*
-	 * Clear this hostname if set internally to this function
-	 * (memory is on the stack)
+	 * set Node and session id for this request
 	 */
-	if (host_set)
-		req->alloc_node = NULL;
+	local_hostname = xshort_hostname();
+	iter = list_iterator_create(job_req_list);
+	while ((req = (job_desc_msg_t *) list_next(iter))) {
+		if (req->alloc_sid == NO_VAL)
+			req->alloc_sid = getsid(0);
+		if (!req->alloc_node)
+			req->alloc_node = local_hostname;
+	}
+	list_iterator_destroy(iter);
+	req_msg.msg_type = REQUEST_SUBMIT_BATCH_JOB_PACK;
+	req_msg.data = job_req_list;
+
+	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
+					    working_cluster_rec);
+	xfree(local_hostname);
 
 	if (rc == SLURM_SOCKET_ERROR)
 		return SLURM_ERROR;
-
 	switch (resp_msg.msg_type) {
 	case RESPONSE_SLURM_RC:
 		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index 0fe67fe986c49968d9dee17e7920ab37eb26a20f..e8a98d672bf0e75ec774c4ce2457ecce1a8b7e10 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -4300,6 +4300,7 @@ extern int slurm_free_msg_data(slurm_msg_type_t type, void *data)
 		slurm_free_job_info(data);
 		break;
 	case REQUEST_JOB_PACK_ALLOCATION:
+	case REQUEST_SUBMIT_BATCH_JOB_PACK:
 	case RESPONSE_JOB_PACK_ALLOCATION:
 		FREE_NULL_LIST(data);
 		break;
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index cb814f987a4da28cc634b10938a90a38f13eea99..99c91d5bbbbee45ee455569d157d80978dfee31d 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -308,6 +308,7 @@ typedef enum {
 	REQUEST_CTLD_MULT_MSG,
 	RESPONSE_CTLD_MULT_MSG,
 	REQUEST_JOB_PACK_ALLOC_INFO,
+	REQUEST_SUBMIT_BATCH_JOB_PACK,
 
 	REQUEST_JOB_STEP_CREATE = 5001,
 	RESPONSE_JOB_STEP_CREATE,
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index 49df8c0627b3334167b959c99cc49a19b59a00e3..7ddf1fabf22676d87a1cb7585518e57241883826 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -943,6 +943,7 @@ pack_msg(slurm_msg_t const *msg, Buf buffer)
 					 msg->protocol_version);
 		break;
 	case REQUEST_JOB_PACK_ALLOCATION:
+	case REQUEST_SUBMIT_BATCH_JOB_PACK:
 		_pack_job_desc_list_msg((List) msg->data, buffer,
 					msg->protocol_version);
 		break;
@@ -1633,6 +1634,7 @@ unpack_msg(slurm_msg_t * msg, Buf buffer)
 				       buffer, msg->protocol_version);
 		break;
 	case REQUEST_JOB_PACK_ALLOCATION:
+	case REQUEST_SUBMIT_BATCH_JOB_PACK:
 		rc = _unpack_job_desc_list_msg((List *) &(msg->data),
 					       buffer, msg->protocol_version);
 		break;
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index 78be24585176dd12b3b8fdba719cbc81488d5061..d7050aba909f37dd98980e043f272a2fa052970e 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -213,6 +213,7 @@ inline static void _slurm_rpc_step_layout(slurm_msg_t * msg);
 inline static void _slurm_rpc_step_update(slurm_msg_t * msg);
 inline static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg,
 						bool is_sib_job);
+inline static void _slurm_rpc_submit_batch_pack_job(slurm_msg_t * msg);
 inline static void _slurm_rpc_suspend(slurm_msg_t * msg);
 inline static void _slurm_rpc_top_job(slurm_msg_t * msg);
 inline static void _slurm_rpc_trigger_clear(slurm_msg_t * msg);
@@ -445,6 +446,9 @@ void slurmctld_req(slurm_msg_t *msg, connection_arg_t *arg)
 	case REQUEST_SUBMIT_BATCH_JOB:
 		_slurm_rpc_submit_batch_job(msg, false);
 		break;
+	case REQUEST_SUBMIT_BATCH_JOB_PACK:
+		_slurm_rpc_submit_batch_pack_job(msg);
+		break;
 	case REQUEST_UPDATE_FRONT_END:
 		_slurm_rpc_update_front_end(msg);
 		break;
@@ -3687,7 +3691,7 @@ static void _slurm_rpc_step_update(slurm_msg_t *msg)
 }
 
 /* _slurm_rpc_submit_batch_job - process RPC to submit a batch job */
-static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg, bool is_sib_job)
+static void _slurm_rpc_submit_batch_job(slurm_msg_t *msg, bool is_sib_job)
 {
 	static int active_rpc_cnt = 0;
 	int error_code = SLURM_SUCCESS;
@@ -3710,7 +3714,7 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg, bool is_sib_job)
 	bool reject_job = false;
 
 	START_TIMER;
-
+	debug2("Processing RPC: REQUEST_SUBMIT_BATCH_JOB from uid=%d", uid);
 	if (slurmctld_config.submissions_disabled) {
 		info("Submissions disabled on system");
 		error_code = ESLURM_SUBMISSIONS_DISABLED;
@@ -3718,8 +3722,6 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg, bool is_sib_job)
 		goto send_msg;
 	}
 
-	debug2("Processing RPC: REQUEST_SUBMIT_BATCH_JOB from uid=%d", uid);
-
 	slurm_msg_t_init(&response_msg);
 	response_msg.flags = msg->flags;
 	response_msg.protocol_version = msg->protocol_version;
@@ -3734,7 +3736,8 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg, bool is_sib_job)
 	if ((job_desc_msg->alloc_node == NULL) ||
 	    (job_desc_msg->alloc_node[0] == '\0')) {
 		error_code = ESLURM_INVALID_NODE_NAME;
-		error("REQUEST_SUBMIT_BATCH_JOB lacks alloc_node from uid=%d", uid);
+		error("REQUEST_SUBMIT_BATCH_JOB lacks alloc_node from uid=%d",
+		      uid);
 	}
 
 	dump_job_desc(job_desc_msg);
@@ -3786,15 +3789,13 @@ send_msg:
 	END_TIMER2("_slurm_rpc_submit_batch_job");
 
 	if (reject_job) {
-		info("_slurm_rpc_submit_batch_job: %s",
-		     slurm_strerror(error_code));
+		info("%s: %s", __func__, slurm_strerror(error_code));
 		if (err_msg)
 			slurm_send_rc_err_msg(msg, error_code, err_msg);
 		else
 			slurm_send_rc_msg(msg, error_code);
 	} else {
-		info("_slurm_rpc_submit_batch_job JobId=%u %s",
-		     job_id, TIME_STR);
+		info("%s: JobId=%u %s", __func__, job_id, TIME_STR);
 		/* send job_ID */
 		submit_msg.job_id = job_id;
 		submit_msg.step_id = step_id;
@@ -3813,6 +3814,41 @@ send_msg:
 	xfree(err_msg);
 }
 
+/* _slurm_rpc_submit_batch_pack_job - process RPC to submit a batch pack job */
+static void _slurm_rpc_submit_batch_pack_job(slurm_msg_t *msg)
+{
+	ListIterator iter;
+	int error_code = SLURM_SUCCESS;
+	job_desc_msg_t *job_desc_msg;
+	List job_req_list = (List) msg->data;
+	uid_t uid = g_slurm_auth_get_uid(msg->auth_cred,
+					 slurmctld_config.auth_info);
+
+	info("Processing RPC: REQUEST_SUBMIT_BATCH_JOB_PACK from uid=%d", uid);
+	if (!job_req_list || (list_count(job_req_list) == 0)) {
+		info("REQUEST_SUBMIT_BATCH_JOB_PACK from uid=%d with empty job list",
+		     uid);
+		error_code = SLURM_ERROR;
+		goto send_msg;
+	}
+	if (slurmctld_config.submissions_disabled) {
+		info("Submissions disabled on system");
+		error_code = ESLURM_SUBMISSIONS_DISABLED;
+		goto send_msg;
+	}
+
+	iter = list_iterator_create(job_req_list);
+	while ((job_desc_msg = (job_desc_msg_t *) list_next(iter))) {
+//FIXME: Flesh out the logic here
+		dump_job_desc(job_desc_msg);
+	}
+	list_iterator_destroy(iter);
+
+	error_code = SLURM_ERROR;
+send_msg:
+	slurm_send_rc_msg(msg, error_code);
+}
+
 /* _slurm_rpc_update_job - process RPC to update the configuration of a
  *	job (e.g. priority) */
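
Usage note (not part of the patch): a minimal sketch of how a client could drive the new slurm_submit_batch_pack_job() API, assuming it is built inside the Slurm source tree with access to the internal list primitives list_create()/list_append() from "src/common/list.h", the same way sbatch is expected to build job_req_list. The helper name _submit_pack_example and the two descriptor arguments are illustrative only; note that with this patch the slurmctld handler is still a stub that rejects the request, so the call is expected to fail until the server side is fleshed out.

#include <stdio.h>
#include "slurm/slurm.h"
#include "src/common/list.h"

/* Build a two-component pack job request and submit it; error handling is
 * abbreviated.  desc1/desc2 are hypothetical job_desc_msg_t structures that
 * the caller has already filled in (see slurm_init_job_desc_msg()). */
static int _submit_pack_example(job_desc_msg_t *desc1, job_desc_msg_t *desc2)
{
	submit_response_msg_t *resp = NULL;
	List job_req_list = list_create(NULL);	/* no destructor: descriptors
						 * remain owned by the caller */
	int rc;

	/* Components are submitted in list order */
	list_append(job_req_list, desc1);
	list_append(job_req_list, desc2);

	rc = slurm_submit_batch_pack_job(job_req_list, &resp);
	if (rc == SLURM_SUCCESS) {
		printf("submitted pack job %u\n", resp->job_id);
		slurm_free_submit_response_response_msg(resp);
	}
	FREE_NULL_LIST(job_req_list);
	return rc;
}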