diff --git a/NEWS b/NEWS index 190276e67b2f478afb75577b9b0ea9add757a619..1d78083a8c65965b5180ef1daf215329f36cbd7f 100644 --- a/NEWS +++ b/NEWS @@ -45,6 +45,7 @@ documents those changes that are of interest to users and administrators. tasks in a job array independently from the maximum task ID (MaxArraySize). -- Fix issue where number of nodes is not properly allocated when sbatch and salloc are requested with -n tasks < hosts from -w hostlist or from -N. + -- Add infrastructure for submitting federated jobs. * Changes in Slurm 17.02.0pre2 ============================== diff --git a/RELEASE_NOTES b/RELEASE_NOTES index a84a8543fef81c48bcf7b10097c6dc53bf403c26..668dc70e1a291869126447bb1d4f868c0ce56206 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -85,6 +85,11 @@ In slurmctld_lock_t: Added federation In will_run_response_msg_t: Added double sys_usage_per to report back how busy a cluster is. In slurm_ctl_conf: Added mail_domain. +In slurm_msg_t: Added buffer to keep received message buffer to use for later + purposes. +In job_desc_msg_t: Added fed_siblings to track which clusters have sibling jobs. +In slurm_job_info_t: Added fed_origin_str, fed_siblings, fed_siblings_str to + display job federation information. Added the following struct definitions ====================================== @@ -93,6 +98,7 @@ Added slurmdb_cluster_fed_t to store federation information on Added slurmdb_federation_cond_t for selecting federations from db. Added slurmdb_federation_rec_t to represent federation objects. Added job_fed_details_t for storing federated job information. +Added sib_msg_t for sending messages to siblings. Removed members from the following struct definitions ===================================================== @@ -106,6 +112,8 @@ Changed DEFAULT_MAX_JOB_ID from 0x7fff0000 to 0x03ff0000. Added SELECT_NODEDATA_TRES_ALLOC_FMT_STR to select_nodedata_type. Added SELECT_NODEDATA_TRES_ALLOC_WEIGHTED to select_nodedata_type. Changed MEM_PER_CPU flag to 0x8000000000000000 from 0x80000000. +Added SLURM_MSG_KEEP_BUFFER msg flag to instruct slurm_receive_msg() to save the + buffer ptr. Added the following API's ========================= diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1 index 6913d29e9783caf199db2aea0085ead7bc3ead08..7d0347840b79a9a5fadd0a6295964658b6aff5dd 100644 --- a/doc/man/man1/salloc.1 +++ b/doc/man/man1/salloc.1 @@ -608,6 +608,16 @@ License names can be followed by a colon and count Multiple license names should be comma separated (e.g. "\-\-licenses=foo:4,bar"). +.TP +\fB\-M\fR, \fB\-\-clusters\fR=<\fIstring\fR> +Clusters to issue commands to. Multiple cluster names may be comma separated. +The job will be submitted to the one cluster providing the earliest expected +job initiation time. The default value is the current cluster. A value of +\(aq\fIall\fR' will query to run on all clusters. Note the +\fB\-\-export\fR option to control environment variables exported +between clusters. +Note that the SlurmDBD must be up for this option to work properly. 
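The slurm_msg_t buffer field and the SLURM_MSG_KEEP_BUFFER flag noted in the RELEASE_NOTES above work as a pair: a caller sets the flag before slurm_receive_msg() and the packed body is kept on the message instead of being freed after unpacking, so it can later be forwarded to sibling clusters verbatim. A minimal sketch of that pattern, assuming an already-connected socket fd (the helper name and the zero timeout are illustrative, not part of the patch):

#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_protocol_defs.h"

static void _recv_keeping_buffer(int fd)
{
	slurm_msg_t msg;

	slurm_msg_t_init(&msg);
	/* Ask slurm_receive_msg() to hang the received Buf off msg.buffer
	 * instead of freeing it once unpack_msg() is done. */
	msg.flags |= SLURM_MSG_KEEP_BUFFER;

	if (slurm_receive_msg(fd, &msg, 0) != SLURM_SUCCESS)
		return;

	/* msg.data is the unpacked request; msg.buffer still holds the
	 * packed bytes with its offset rewound to the message body, so the
	 * request can be re-sent elsewhere without re-packing it. */

	slurm_free_msg_members(&msg);	/* frees msg.buffer as well */
}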
+ .TP \fB\-m\fR, \fB\-\-distribution\fR= \fIarbitrary\fR|<\fIblock\fR|\fIcyclic\fR|\fIplane=<options>\fR[:\fIblock\fR|\fIcyclic\fR|\fIfcyclic\fR]> @@ -1476,6 +1486,9 @@ Same as \fB\-\-bell\fR \fBSALLOC_BURST_BUFFER\fR Same as \fB\-\-bb\fR .TP +\fBSALLOC_CLUSTERS\fR or \fBSLURM_CLUSTERS\fR +Same as \fB\-\-clusters\fR +.TP \fBSALLOC_CONN_TYPE\fR Same as \fB\-\-conn\-type\fR .TP diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1 index 87e6220b0fbf9d21a3d3b21baa31cb495a19dc27..5a80fd8c3e4d16b5efe17694906c566e6e7c8987 100644 --- a/doc/man/man1/sbatch.1 +++ b/doc/man/man1/sbatch.1 @@ -701,6 +701,7 @@ job initiation time. The default value is the current cluster. A value of \(aq\fIall\fR' will query to run on all clusters. Note the \fB\-\-export\fR option to control environment variables exported between clusters. +Note that the SlurmDBD must be up for this option to work properly. .TP \fB\-m\fR, \fB\-\-distribution\fR= diff --git a/doc/man/man1/scancel.1 b/doc/man/man1/scancel.1 index 1419ab53ffee5786557b04ce14a7e1e62fd2adc1..0eec081b375d9410b77f5ba622e4ed247a12310f 100644 --- a/doc/man/man1/scancel.1 +++ b/doc/man/man1/scancel.1 @@ -61,7 +61,8 @@ Interactive mode. Confirm each job_id.step_id before performing the cancel opera .TP \fB\-M\fR, \fB\-\-clusters\fR=<\fIstring\fR> -Cluster to issue commands to. +Clusters to issue commands to. +Note that the SlurmDBD must be up for this option to work properly. .TP \fB\-n\fR, \fB\-\-jobname\fR=\fIjob_name\fR, \fB\-\-name\fR=\fIjob_name\fR diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1 index ff3a6f8670a5543cafd50ebe5209497c484b1f96..8e91a7e42a8c5f5d74fc7b342ebc2407d50e029a 100644 --- a/doc/man/man1/scontrol.1 +++ b/doc/man/man1/scontrol.1 @@ -48,6 +48,7 @@ unavailable to user's group will be displayed (i.e. this is the default behavior .TP \fB\-M\fR, \fB\-\-clusters\fR=<\fIstring\fR> The cluster to issue commands to. Only one cluster name may be specified. +Note that the SlurmDBD must be up for this option to work properly. .TP \fB\-o\fR, \fB\-\-oneliner\fR diff --git a/doc/man/man1/sinfo.1 b/doc/man/man1/sinfo.1 index 8186484bac8b5f58620836410cee9527843912ac..37f3bccb0ebbad82c2965ed9e24b0e37b42727f5 100644 --- a/doc/man/man1/sinfo.1 +++ b/doc/man/man1/sinfo.1 @@ -62,6 +62,7 @@ This is ignored if the \fB\-\-format\fR option is specified. \fB\-M\fR, \fB\-\-clusters\fR=<\fIstring\fR> Clusters to issue commands to. Multiple cluster names may be comma separated. A value of of '\fIall\fR' will query to run on all clusters. +Note that the SlurmDBD must be up for this option to work properly. .TP \fB\-n <nodes>\fR, \fB\-\-nodes=<nodes>\fR diff --git a/doc/man/man1/smap.1 b/doc/man/man1/smap.1 index 6a927b20e7e6ce0e40781128c4c8989b25f8f4b7..26570ebc31173ff75308547b0b5dc390f4f26734 100644 --- a/doc/man/man1/smap.1 +++ b/doc/man/man1/smap.1 @@ -79,6 +79,7 @@ name with the '\-n' option. .TP \fB\-M\fR, \fB\-\-clusters\fR=<\fIstring\fR> Clusters to issue commands to. +Note that the SlurmDBD must be up for this option to work properly. .TP \fB\-n\fR, \fB\-\-nodes\fR diff --git a/doc/man/man1/sprio.1 b/doc/man/man1/sprio.1 index d25e42f7129b25131c5852956d84a0cc7d1438db..2101006aef6d06532151e00576d947fd46cfbf63 100644 --- a/doc/man/man1/sprio.1 +++ b/doc/man/man1/sprio.1 @@ -39,6 +39,7 @@ Report more of the available information for the selected jobs. .TP \fB\-M\fR, \fB\-\-clusters\fR=<\fIstring\fR> The cluster to issue commands to. Only one cluster name may be specified. +Note that the SlurmDBD must be up for this option to work properly. 
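Each of the -M/--clusters notes above exists because the option is resolved through the database: the commands look the cluster list up via slurmdb and, for job submission, pick the cluster reporting the earliest start time. A rough sketch of that client-side path, modeled on the salloc/sbatch changes later in this patch (the helper name and error handling are illustrative; the real commands store the result in the global working_cluster_rec):

#include <stdlib.h>
#include "slurm/slurm.h"
#include "slurm/slurmdb.h"
#include "src/common/proc_args.h"
#include "src/common/xstring.h"

/* desc is an already initialized job_desc_msg_t; cluster_opt is the
 * -M/--clusters (or SALLOC_CLUSTERS/SLURM_CLUSTERS) value. */
static void _route_to_cluster(job_desc_msg_t *desc, char *cluster_opt)
{
	slurmdb_cluster_rec_t *cluster_rec = NULL;

	desc->clusters = xstrdup(cluster_opt);
	if (cluster_opt &&
	    (slurmdb_get_first_avail_cluster(desc, cluster_opt, &cluster_rec)
	     != SLURM_SUCCESS)) {
		/* slurmdbd unreachable or no listed cluster responded */
		print_db_notok(cluster_opt, 0);
		exit(1);
	}
}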
.TP \fB\-n\fR, \fB\-\-norm\fR diff --git a/doc/man/man1/squeue.1 b/doc/man/man1/squeue.1 index 43bffaa58bedf6c8dd6bc0745544fa076aaa4bb3..ef7aa80bfa74f3c08b5caaa639e9d056c5377d28 100644 --- a/doc/man/man1/squeue.1 +++ b/doc/man/man1/squeue.1 @@ -579,6 +579,22 @@ The exit code for the job. Features required by the job. (Valid for jobs only) .TP +\fBfedorigin\fR +Cluster name where federated job originated from. +(Valid for federated jobs only) +.TP +\fBfedoriginraw\fR +Cluster ID where federated job originated from. +(Valid for federated jobs only) +.TP +\fBfedsiblings\fR +Cluster names of where federated job can run. +(Valid for federated jobs only) +.TP +\fBfedsiblingsraw\fR +Cluster IDs of where federated job can run. +(Valid for federated jobs only) +.TP \fBgres\fR Generic resources (gres) required by the job or step. (Valid for jobs and job steps) diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index 6b497a0242f84ab13ff31ce86d00ddcd2a91df20..8a657adf4ba5b59083d7b8d9447cc7822c7e2a1e 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -911,6 +911,17 @@ License names can be followed by a colon and count Multiple license names should be comma separated (e.g. "\-\-licenses=foo:4,bar"). This option applies to job allocations. +.TP +\fB\-M\fR, \fB\-\-clusters\fR=<\fIstring\fR> +Clusters to issue commands to. Multiple cluster names may be comma separated. +The job will be submitted to the one cluster providing the earliest expected +job initiation time. The default value is the current cluster. A value of +\(aq\fIall\fR' will query to run on all clusters. Note the +\fB\-\-export\fR option to control environment variables exported +between clusters. +This option applies only to job allocations. +Note that the SlurmDBD must be up for this option to work properly. + .TP .na \fB\-m\fR, \fB\-\-distribution\fR= diff --git a/doc/man/man1/sshare.1 b/doc/man/man1/sshare.1 index dcd35297db7bc1ae6cac4518ba1f87d2f4ae43cb..39ca0ce5aa1a8717a24e75b5377335ba4903cf1f 100644 --- a/doc/man/man1/sshare.1 +++ b/doc/man/man1/sshare.1 @@ -39,6 +39,7 @@ Long listing - includes the normalized usage information. .TP \fB\-M\fR, \fB\-\-clusters\fR=<\fIstring\fR> Clusters to issue commands to. +Note that the SlurmDBD must be up for this option to work properly. .TP \fB\-m\fR, \fB\-\-partition\fR diff --git a/doc/man/man1/strigger.1 b/doc/man/man1/strigger.1 index 9686621ff6c106c324a4032144cf7b872a4e01fc..3520baa3b3e28da81ab5bf9fbbddf12973c8e0af 100644 --- a/doc/man/man1/strigger.1 +++ b/doc/man/man1/strigger.1 @@ -167,6 +167,7 @@ trigger event. .TP \fB\-M\fR, \fB\-\-clusters\fR=<\fIstring\fR> Clusters to issue commands to. +Note that the SlurmDBD must be up for this option to work properly. 
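The new squeue fields above (fedorigin, fedoriginraw, fedsiblings, fedsiblingsraw) are backed by the fed_origin_str, fed_siblings and fed_siblings_str members added to slurm_job_info_t later in this patch. A small sketch of how a client could read them, assuming the usual libslurm headers (the function name is illustrative):

#include <inttypes.h>
#include <stdio.h>
#include "slurm/slurm.h"

static void _print_fed_jobs(void)
{
	job_info_msg_t *jobs = NULL;
	uint32_t i;

	/* SHOW_FED_TRACK (added below in slurm.h.in) also returns jobs that
	 * are only federation tracking records on this cluster. */
	if (slurm_load_jobs((time_t) NULL, &jobs,
			    (SHOW_ALL | SHOW_FED_TRACK)) != SLURM_SUCCESS)
		return;

	for (i = 0; i < jobs->record_count; i++) {
		slurm_job_info_t *job = &jobs->job_array[i];

		if (!job->fed_siblings)		/* not a federated job */
			continue;
		printf("%u origin=%s siblings=%s (bitmap=0x%"PRIx64")\n",
		       job->job_id, job->fed_origin_str,
		       job->fed_siblings_str, job->fed_siblings);
	}
	slurm_free_job_info_msg(jobs);
}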
.TP \fB\-n\fR, \fB\-\-node\fR[=\fIhost\fR] diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 2135399a56ed66e68de701e705c35935793160b6..4244f108282c5e951e542f91178d0b89430d0d9a 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -878,6 +878,7 @@ enum node_states { #define SHOW_DETAIL 0x0002 /* Show detailed resource information */ #define SHOW_DETAIL2 0x0004 /* Show batch script listing */ #define SHOW_MIXED 0x0008 /* Automatically set node MIXED state */ +#define SHOW_FED_TRACK 0x0010 /* Show tracking only federated jobs */ /* Define keys for ctx_key argument of slurm_step_ctx_get() */ enum ctx_keys { @@ -1334,6 +1335,12 @@ typedef struct power_mgmt_data { } power_mgmt_data_t; #define CORE_SPEC_THREAD 0x8000 /* If set, this is a thread count not core count */ + +/* + * Update: + * _copy_job_desc_to_job_record() + * slurm_free_job_desc_msg() + */ typedef struct job_descriptor { /* For submit, allocate, and update requests */ char *account; /* charge to specified account */ char *acctg_freq; /* accounting polling intervals (seconds) */ @@ -1386,6 +1393,7 @@ typedef struct job_descriptor { /* For submit, allocate, and update requests */ * from job's allocation, default NONE */ char *features; /* required feature specification, * default NONE */ + uint64_t fed_siblings; /* Bitmap of federation siblings */ char *gres; /* comma separated list of required generic * resources, default NONE */ uint32_t group_id; /* group to assume, if run as root. */ @@ -1559,6 +1567,9 @@ typedef struct job_info { * start_range_2, .., -1 */ uint32_t exit_code; /* exit code for job (status from wait call) */ char *features; /* comma separated list of required features */ + char *fed_origin_str; /* Origin cluster's name */ + uint64_t fed_siblings; /* bitmap of sibling cluster ids */ + char *fed_siblings_str; /* string of sibling cluster names */ char *gres; /* comma separated list of generic resources */ uint32_t group_id; /* group job submitted as */ uint32_t job_id; /* job ID */ diff --git a/src/api/job_info.c b/src/api/job_info.c index 0337fefe9be1bf48b0fd24d0bd8dc2c668895d4c..70c0acb725d7a3f8eb5aaec4dbf855f7d0e1a6cf 100644 --- a/src/api/job_info.c +++ b/src/api/job_info.c @@ -556,6 +556,13 @@ slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner ) xstrcat(out, line_end); } + /****** Line 14a (optional) ******/ + if (job_ptr->fed_siblings) { + xstrfmtcat(out, "FedOrigin=%s FedSiblings=%s", + job_ptr->fed_origin_str, job_ptr->fed_siblings_str); + xstrcat(out, line_end); + } + /****** Line 15 ******/ if (cluster_flags & CLUSTER_FLAG_BG) { select_g_select_jobinfo_get(job_ptr->select_jobinfo, diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c index 7c6c988e820b118b75b99a09a60216559c267d53..ac8f32ac92b73eba435c28ace699193d8adddd98 100644 --- a/src/common/slurm_protocol_api.c +++ b/src/common/slurm_protocol_api.c @@ -3128,6 +3128,7 @@ extern int slurm_unpack_received_msg(slurm_msg_t *msg, int fd, Buf buffer) header_t header; int rc; void *auth_cred = NULL; + uint32_t body_offset = 0; if (unpack_header(&header, buffer) == SLURM_ERROR) { rc = SLURM_COMMUNICATIONS_RECEIVE_ERROR; @@ -3199,6 +3200,8 @@ extern int slurm_unpack_received_msg(slurm_msg_t *msg, int fd, Buf buffer) msg->msg_type = header.msg_type; msg->flags = header.flags; + body_offset = get_buf_offset(buffer); + if ((header.body_length > remaining_buf(buffer)) || (unpack_msg(msg, buffer) != SLURM_SUCCESS)) { rc = ESLURM_PROTOCOL_INCOMPLETE_PACKET; @@ -3206,6 +3209,8 @@ extern int slurm_unpack_received_msg(slurm_msg_t 
*msg, int fd, Buf buffer) goto total_return; } + set_buf_offset(buffer, body_offset); + msg->auth_cred = (void *)auth_cred; rc = SLURM_SUCCESS; @@ -3243,6 +3248,10 @@ int slurm_receive_msg(int fd, slurm_msg_t *msg, int timeout) size_t buflen = 0; int rc; Buf buffer; + bool keep_buffer = false; + + if (msg->flags & SLURM_MSG_KEEP_BUFFER) + keep_buffer = true; if (msg->conn) { persist_msg_t persist_msg; @@ -3255,7 +3264,13 @@ int slurm_receive_msg(int fd, slurm_msg_t *msg, int timeout) } memset(&persist_msg, 0, sizeof(persist_msg_t)); rc = slurm_persist_msg_unpack(msg->conn, &persist_msg, buffer); - free_buf(buffer); + + if (keep_buffer) { + set_buf_offset(buffer, 0); + msg->buffer = buffer; + } else { + free_buf(buffer); + } if (rc) { error("%s: Failed to unpack persist msg", __func__); @@ -3302,7 +3317,10 @@ int slurm_receive_msg(int fd, slurm_msg_t *msg, int timeout) rc = slurm_unpack_received_msg(msg, fd, buffer); - free_buf(buffer); + if (keep_buffer) + msg->buffer = buffer; + else + free_buf(buffer); endit: slurm_seterrno(rc); @@ -4671,6 +4689,7 @@ extern void slurm_free_msg_members(slurm_msg_t *msg) if (msg) { if (msg->auth_cred) (void) g_slurm_auth_destroy(msg->auth_cred); + free_buf(msg->buffer); slurm_free_msg_data(msg->msg_type, msg->data); FREE_NULL_LIST(msg->ret_list); } diff --git a/src/common/slurm_protocol_common.h b/src/common/slurm_protocol_common.h index e9e6d7c155c068d06a196319be3cc7a6aaba5c12..8042a15e5cd479978932bdaad69bd2e692d299d2 100644 --- a/src/common/slurm_protocol_common.h +++ b/src/common/slurm_protocol_common.h @@ -104,6 +104,7 @@ #define SLURM_PROTOCOL_NO_FLAGS 0 #define SLURM_GLOBAL_AUTH_KEY 0x0001 #define SLURMDBD_CONNECTION 0x0002 +#define SLURM_MSG_KEEP_BUFFER 0x0004 #include "src/common/slurm_protocol_socket_common.h" diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c index 62372161c5dfe244b66349eb2c19934bda575444..b01f92f06f37183fa0ee9e7c76bb74460034a2ad 100644 --- a/src/common/slurm_protocol_defs.c +++ b/src/common/slurm_protocol_defs.c @@ -788,6 +788,15 @@ extern void slurm_free_job_desc_msg(job_desc_msg_t * msg) } } +extern void slurm_free_sib_msg(sib_msg_t *msg) +{ + if (msg) { + free_buf(msg->data_buffer); + slurm_free_msg_data(msg->data_type, msg->data); + xfree(msg); + } +} + extern void slurm_free_event_log_msg(slurm_event_log_msg_t * msg) { if (msg) { @@ -3830,6 +3839,17 @@ extern int slurm_free_msg_data(slurm_msg_type_t type, void *data) case REQUEST_UPDATE_JOB: slurm_free_job_desc_msg(data); break; + case REQUEST_SIB_JOB_WILL_RUN: + case REQUEST_SIB_SUBMIT_BATCH_JOB: + case REQUEST_SIB_RESOURCE_ALLOCATION: + slurm_free_sib_msg(data); + break; + case RESPONSE_JOB_WILL_RUN: + slurm_free_will_run_response_msg(data); + break; + case RESPONSE_SUBMIT_BATCH_JOB: + slurm_free_submit_response_response_msg(data); + break; case RESPONSE_ACCT_GATHER_UPDATE: slurm_free_acct_gather_node_resp_msg(data); break; @@ -4425,6 +4445,12 @@ rpc_num2string(uint16_t opcode) return "RESPONSE_JOB_ATTACH"; case REQUEST_JOB_WILL_RUN: return "REQUEST_JOB_WILL_RUN"; + case REQUEST_SIB_JOB_WILL_RUN: + return "REQUEST_SIB_JOB_WILL_RUN"; + case REQUEST_SIB_SUBMIT_BATCH_JOB: + return "REQUEST_SIB_SUBMIT_BATCH_JOB"; + case REQUEST_SIB_RESOURCE_ALLOCATION: + return "REQUEST_SIB_RESOURCE_ALLOCATION"; case RESPONSE_JOB_WILL_RUN: return "RESPONSE_JOB_WILL_RUN"; case REQUEST_JOB_ALLOCATION_INFO: diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index 
635421fb3d5730403ebb4b7108418aaaf919a0b1..e88a1ebe2517e6f61798cb02991350704a713a23 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -286,6 +286,9 @@ typedef enum { REQUEST_JOB_NOTIFY, REQUEST_JOB_SBCAST_CRED, RESPONSE_JOB_SBCAST_CRED, + REQUEST_SIB_JOB_WILL_RUN, + REQUEST_SIB_SUBMIT_BATCH_JOB, + REQUEST_SIB_RESOURCE_ALLOCATION, REQUEST_JOB_STEP_CREATE = 5001, RESPONSE_JOB_STEP_CREATE, @@ -433,6 +436,7 @@ typedef struct slurm_protocol_config { typedef struct slurm_msg { slurm_addr_t address; void *auth_cred; + Buf buffer; /* DON't PACK! ptr to buffer that msg was unpacked from. */ slurm_persist_conn_t *conn; /* DON'T PACK OR FREE! this is here to * distinquish a persistant connection from * a normal connection it should be filled @@ -1200,6 +1204,18 @@ typedef struct slurm_event_log_msg { char * string; /* String for slurmctld to log */ } slurm_event_log_msg_t; +typedef struct { + void *data; /* Unpacked buffer + * Only populated on the receiving side. */ + Buf data_buffer; /* Buffer that holds an unpacked data type. + * Only populated on the sending side. */ + uint16_t data_type; /* date type to unpack */ + uint16_t data_version; /* Version that data is packed with */ + uint64_t fed_siblings; /* sibling bitmap of job */ + uint32_t job_id; /* job_id of job - set in job_desc on receiving + * side */ +} sib_msg_t; + /*****************************************************************************\ * ACCOUNTING PUSHS \*****************************************************************************/ @@ -1263,6 +1279,7 @@ extern void slurm_free_front_end_info_request_msg( extern void slurm_free_node_info_request_msg(node_info_request_msg_t *msg); extern void slurm_free_node_info_single_msg(node_info_single_msg_t *msg); extern void slurm_free_part_info_request_msg(part_info_request_msg_t *msg); +extern void slurm_free_sib_msg(sib_msg_t *msg); extern void slurm_free_stats_info_request_msg(stats_info_request_msg_t *msg); extern void slurm_free_stats_response_msg(stats_info_response_msg_t *msg); extern void slurm_free_step_alloc_info_msg(step_alloc_info_msg_t * msg); diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index 2fc12bb5fcdeaba28ebe65d194ee3dff59ad4d0d..38087582b3e12241da851d87a8316891df6795b0 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -628,6 +628,11 @@ static int _unpack_will_run_response_msg(will_run_response_msg_t ** msg_ptr, Buf buffer, uint16_t protocol_version); +static void _pack_sib_msg(sib_msg_t *sib_msg_ptr, Buf buffer, + uint16_t protocol_version); +static int _unpack_sib_msg(sib_msg_t **sib_msg_buffer_ptr, Buf buffer, + uint16_t protocol_version); + static void _pack_accounting_update_msg(accounting_update_msg_t *msg, Buf buffer, uint16_t protocol_version); @@ -1085,6 +1090,12 @@ pack_msg(slurm_msg_t const *msg, Buf buffer) msg->data, buffer, msg->protocol_version); break; + case REQUEST_SIB_JOB_WILL_RUN: + case REQUEST_SIB_SUBMIT_BATCH_JOB: + case REQUEST_SIB_RESOURCE_ALLOCATION: + _pack_sib_msg((sib_msg_t *)msg->data, buffer, + msg->protocol_version); + break; case REQUEST_UPDATE_JOB_STEP: _pack_update_job_step_msg((step_update_request_msg_t *) msg->data, buffer, @@ -1756,6 +1767,12 @@ unpack_msg(slurm_msg_t * msg, Buf buffer) buffer, msg->protocol_version); break; + case REQUEST_SIB_JOB_WILL_RUN: + case REQUEST_SIB_SUBMIT_BATCH_JOB: + case REQUEST_SIB_RESOURCE_ALLOCATION: + rc = _unpack_sib_msg((sib_msg_t **)&(msg->data), buffer, + msg->protocol_version); + break; 
case REQUEST_UPDATE_JOB_STEP: rc = _unpack_update_job_step_msg( (step_update_request_msg_t **) & (msg->data), @@ -6279,6 +6296,12 @@ _unpack_job_info_members(job_info_t * job, Buf buffer, safe_unpackstr_xmalloc(&job->tres_req_str, &uint32_tmp, buffer); safe_unpack16(&job->start_protocol_ver, buffer); + + safe_unpackstr_xmalloc(&job->fed_origin_str, &uint32_tmp, + buffer); + safe_unpack64(&job->fed_siblings, buffer); + safe_unpackstr_xmalloc(&job->fed_siblings_str, &uint32_tmp, + buffer); } else if (protocol_version >= SLURM_16_05_PROTOCOL_VERSION) { uint32_t tmp_mem; safe_unpack32(&job->array_job_id, buffer); @@ -8664,6 +8687,77 @@ unpack_error: return SLURM_ERROR; } +static void +_pack_sib_msg(sib_msg_t *sib_msg_ptr, Buf buffer, uint16_t protocol_version) +{ + xassert(sib_msg_ptr); + + if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { + pack16(sib_msg_ptr->data_type, buffer); + pack16(sib_msg_ptr->data_version, buffer); + pack64(sib_msg_ptr->fed_siblings, buffer); + pack32(sib_msg_ptr->job_id, buffer); + + /* add already packed data_buffer to buffer */ + if (size_buf(sib_msg_ptr->data_buffer)) { + Buf dbuf = sib_msg_ptr->data_buffer; + uint32_t grow_size = + size_buf(dbuf) - get_buf_offset(dbuf); + + grow_buf(buffer, grow_size); + memcpy(&buffer->head[get_buf_offset(buffer)], + &dbuf->head[get_buf_offset(dbuf)], grow_size); + set_buf_offset(buffer, + get_buf_offset(buffer) + grow_size); + } + } else { + error("_pack_sib_msg: protocol_version " + "%hu not supported", protocol_version); + } +} + +static int +_unpack_sib_msg(sib_msg_t **sib_msg_buffer_ptr, Buf buffer, + uint16_t protocol_version) +{ + sib_msg_t *sib_msg_ptr = NULL; + slurm_msg_t tmp_msg; + + xassert(sib_msg_buffer_ptr); + + /* alloc memory for structure */ + if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { + sib_msg_ptr = xmalloc(sizeof(sib_msg_t)); + *sib_msg_buffer_ptr = sib_msg_ptr; + + /* load the data values */ + safe_unpack16(&sib_msg_ptr->data_type, buffer); + safe_unpack16(&sib_msg_ptr->data_version, buffer); + safe_unpack64(&sib_msg_ptr->fed_siblings, buffer); + safe_unpack32(&sib_msg_ptr->job_id, buffer); + + if (remaining_buf(buffer)) { + slurm_msg_t_init(&tmp_msg); + tmp_msg.msg_type = sib_msg_ptr->data_type; + tmp_msg.protocol_version = sib_msg_ptr->data_version; + + if (unpack_msg(&tmp_msg, buffer)) + goto unpack_error; + + sib_msg_ptr->data = tmp_msg.data; + tmp_msg.data = NULL; + slurm_free_msg_members(&tmp_msg); + } + } + + return SLURM_SUCCESS; + +unpack_error: + slurm_free_sib_msg(sib_msg_ptr); + *sib_msg_buffer_ptr = NULL; + return SLURM_ERROR; +} + /* _pack_job_desc_msg * packs a job_desc struct * IN job_desc_ptr - pointer to the job descriptor to pack @@ -8684,6 +8778,7 @@ _pack_job_desc_msg(job_desc_msg_t * job_desc_ptr, Buf buffer, pack32(job_desc_ptr->task_dist, buffer); pack16(job_desc_ptr->kill_on_node_fail, buffer); packstr(job_desc_ptr->features, buffer); + pack64(job_desc_ptr->fed_siblings, buffer); packstr(job_desc_ptr->gres, buffer); pack32(job_desc_ptr->job_id, buffer); packstr(job_desc_ptr->job_id_str, buffer); @@ -9204,6 +9299,7 @@ _unpack_job_desc_msg(job_desc_msg_t ** job_desc_buffer_ptr, Buf buffer, safe_unpack16(&job_desc_ptr->kill_on_node_fail, buffer); safe_unpackstr_xmalloc(&job_desc_ptr->features, &uint32_tmp, buffer); + safe_unpack64(&job_desc_ptr->fed_siblings, buffer); safe_unpackstr_xmalloc(&job_desc_ptr->gres, &uint32_tmp,buffer); safe_unpack32(&job_desc_ptr->job_id, buffer); safe_unpackstr_xmalloc(&job_desc_ptr->job_id_str, diff --git a/src/common/slurmdb_defs.c 
b/src/common/slurmdb_defs.c index 52c2ef95f14118534d72558eed91c47318cdd990..cb0a1fc678e341a6ee54d13943cbfed38cac20ae 100644 --- a/src/common/slurmdb_defs.c +++ b/src/common/slurmdb_defs.c @@ -2997,6 +2997,23 @@ extern char *slurmdb_get_selected_step_id( return job_id_str; } +static int _find_char_in_list(void *name, void *key) +{ + char *name_str = (char *)name; + char *key_str = (char *)key; + + if (!xstrcmp(name_str,key_str)) + return 1; + + return 0; +} + +/* Return the cluster with the fastest start_time. + * + * Note: The will_runs are not threaded. Currently it relies on the + * working_cluster_rec to pack the job_desc's jobinfo. See previous commit for + * an example of how to thread this. + */ extern int slurmdb_get_first_avail_cluster(job_desc_msg_t *req, char *cluster_names, slurmdb_cluster_rec_t **cluster_rec) { @@ -3007,6 +3024,7 @@ extern int slurmdb_get_first_avail_cluster(job_desc_msg_t *req, ListIterator itr; List cluster_list = NULL; List ret_list = NULL; + List tried_feds = list_create(NULL); *cluster_rec = NULL; cluster_list = slurmdb_get_info_cluster(cluster_names); @@ -3032,13 +3050,25 @@ extern int slurmdb_get_first_avail_cluster(job_desc_msg_t *req, ret_list = list_create(_destroy_local_cluster_rec); itr = list_iterator_create(cluster_list); while ((working_cluster_rec = list_next(itr))) { - if ((local_cluster = _job_will_run(req))) + + /* only try one cluster from each federation */ + if (working_cluster_rec->fed.id && + list_find_first(tried_feds, _find_char_in_list, + working_cluster_rec->fed.name)) + continue; + + if ((local_cluster = _job_will_run(req))) { list_append(ret_list, local_cluster); - else + if (working_cluster_rec->fed.id) + list_append(tried_feds, + working_cluster_rec->fed.name); + } else { error("Problem with submit to cluster %s: %m", working_cluster_rec->name); + } } list_iterator_destroy(itr); + FREE_NULL_LIST(tried_feds); /* restore working_cluster_rec in case it was already set */ if (*cluster_rec) { diff --git a/src/plugins/job_submit/pbs/job_submit_pbs.c b/src/plugins/job_submit/pbs/job_submit_pbs.c index d50a3d5171bd319da83d9015e67563bb14676c5e..b9098b912c11a1a6cbac7e89107fec8f0072286b 100644 --- a/src/plugins/job_submit/pbs/job_submit_pbs.c +++ b/src/plugins/job_submit/pbs/job_submit_pbs.c @@ -302,7 +302,7 @@ extern int job_submit(struct job_descriptor *job_desc, uint32_t submit_uid) char *std_out, *tok; uint32_t my_job_id; - my_job_id = get_next_job_id(); + my_job_id = get_next_job_id(true); _xlate_dependency(job_desc, submit_uid, my_job_id); if (job_desc->account) diff --git a/src/salloc/opt.c b/src/salloc/opt.c index 9151518f15d546b31044707cd90d3ef9b517b72c..089f5727b7268afecc7f115408e2d3200e362a54 100644 --- a/src/salloc/opt.c +++ b/src/salloc/opt.c @@ -286,6 +286,7 @@ static void _opt_default() opt.uid = uid; opt.gid = getgid(); + opt.clusters = NULL; opt.cwd = NULL; opt.progname = NULL; @@ -408,6 +409,8 @@ env_vars_t env_vars[] = { {"SALLOC_ACCTG_FREQ", OPT_STRING, &opt.acctg_freq, NULL }, {"SALLOC_BELL", OPT_BELL, NULL, NULL }, {"SALLOC_BURST_BUFFER", OPT_STRING, &opt.burst_buffer, NULL }, + {"SALLOC_CLUSTERS", OPT_STRING, &opt.clusters, NULL }, + {"SLURM_CLUSTERS", OPT_STRING, &opt.clusters, NULL }, {"SALLOC_CONN_TYPE", OPT_CONN_TYPE, NULL, NULL }, {"SALLOC_CORE_SPEC", OPT_INT, &opt.core_spec, NULL }, {"SALLOC_CPU_FREQ_REQ", OPT_CPU_FREQ, NULL, NULL }, @@ -682,6 +685,8 @@ void set_options(const int argc, char **argv) {"kill-command", optional_argument, 0, 'K'}, {"licenses", required_argument, 0, 'L'}, {"distribution", 
required_argument, 0, 'm'}, + {"cluster", required_argument, 0, 'M'}, + {"clusters", required_argument, 0, 'M'}, + {"tasks", required_argument, 0, 'n'}, + {"ntasks", required_argument, 0, 'n'}, + {"nodes", required_argument, 0, 'N'}, @@ -763,7 +768,7 @@ void set_options(const int argc, char **argv) {NULL, 0, 0, 0} }; char *opt_string = - "+A:B:c:C:d:D:F:g:hHI::J:kK::L:m:n:N:Op:P:QRsS:t:uU:vVw:W:x:"; + "+A:B:c:C:d:D:F:g:hHI::J:kK::L:m:M:n:N:Op:P:QRsS:t:uU:vVw:W:x:"; char *pos_delimit; struct option *optz = spank_option_table_create(long_options); @@ -882,6 +887,10 @@ void set_options(const int argc, char **argv) exit(error_exit); } break; + case 'M': + xfree(opt.clusters); + opt.clusters = xstrdup(optarg); + break; case 'n': opt.ntasks_set = true; opt.ntasks = @@ -2076,6 +2085,7 @@ static void _usage(void) " [--immediate[=secs]] [--no-kill] [--overcommit] [-D path]\n" " [--oversubscribe] [-J jobname] [--jobid=id]\n" " [--verbose] [--gid=group] [--uid=user] [--licenses=names]\n" +" [--clusters=cluster_names]\n" " [--contiguous] [--mincpus=n] [--mem=MB] [--tmp=MB] [-C list]\n" " [--account=name] [--dependency=type:jobid] [--comment=name]\n" #ifdef HAVE_BG /* Blue gene specific options */ @@ -2128,6 +2138,10 @@ static void _help(void) " -k, --no-kill do not kill job on node failure\n" " -K, --kill-command[=signal] signal to send terminating job\n" " -L, --licenses=names required license, comma separated\n" +" -M, --clusters=names Comma separated list of clusters to issue\n" +" commands to. Default is current cluster.\n" +" Name of 'all' will submit to run on all clusters.\n" +" NOTE: SlurmDBD must be up.\n" " -m, --distribution=type distribution method for processes to nodes\n" " (type = block|cyclic|arbitrary)\n" " --mail-type=type notify on state change: BEGIN, END, FAIL or ALL\n" diff --git a/src/salloc/opt.h b/src/salloc/opt.h index 8b96d15441f123c42b53ce583f079dc26376645a..21bc42504ffbc6201bf519b31e1e60cf518aabd4 100644 --- a/src/salloc/opt.h +++ b/src/salloc/opt.h @@ -60,7 +60,7 @@ typedef enum {BELL_NEVER, BELL_AFTER_DELAY, BELL_ALWAYS} bell_flag_t; typedef struct salloc_options { - + char *clusters; /* cluster to run this on.
*/ char *progname; /* argv[0] of this program or * configuration file if multi_prog */ char* user; /* local username */ diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c index 817f90ab0b76ea94c5363ad5e4274911aef6891b..a772993dc4d4644b7bf19cbd380a8d9a5e97f52d 100644 --- a/src/salloc/salloc.c +++ b/src/salloc/salloc.c @@ -59,6 +59,7 @@ #include "src/common/cpu_frequency.h" #include "src/common/env.h" #include "src/common/plugstack.h" +#include "src/common/proc_args.h" #include "src/common/read_config.h" #include "src/common/slurm_rlimits_info.h" #include "src/common/slurm_time.h" @@ -305,6 +306,16 @@ int main(int argc, char *argv[]) } } + /* If can run on multiple clusters find the earliest run time + * and run it there */ + desc.clusters = xstrdup(opt.clusters); + if (opt.clusters && + slurmdb_get_first_avail_cluster(&desc, opt.clusters, + &working_cluster_rec) != SLURM_SUCCESS) { + print_db_notok(opt.clusters, 0); + exit(error_exit); + } + callbacks.ping = _ping_handler; callbacks.timeout = _timeout_handler; callbacks.job_complete = _job_complete_handler; @@ -560,6 +571,8 @@ relinquish: } } } + + xfree(desc.clusters); return rc; } diff --git a/src/sbatch/opt.c b/src/sbatch/opt.c index 036937bc24054b8233c727e1e5787c70eabf136e..dabcfaf84aa3829a30fbdc291dcdd93081a1a16f 100644 --- a/src/sbatch/opt.c +++ b/src/sbatch/opt.c @@ -3350,9 +3350,6 @@ static void _help(void) " --bb=<spec> burst buffer specifications\n" " --bbf=<file_name> burst buffer specification file\n" " --begin=time defer job until HH:MM MM/DD/YY\n" -" -M, --clusters=names Comma separated list of clusters to issue\n" -" commands to. Default is current cluster.\n" -" Name of 'all' will submit to run on all clusters.\n" " --comment=name arbitrary comment\n" " --cpu-freq=min[-max[:gov]] requested cpu frequency (and governor)\n" " -c, --cpus-per-task=ncpus number of cpus required per task\n" @@ -3378,9 +3375,12 @@ static void _help(void) " -J, --job-name=jobname name of job\n" " -k, --no-kill do not kill job on node failure\n" " -L, --licenses=names required license, comma separated\n" +" -M, --clusters=names Comma separated list of clusters to issue\n" +" commands to. 
Default is current cluster.\n" +" Name of 'all' will submit to run on all clusters.\n" +" NOTE: SlurmDBD must be up.\n" " -m, --distribution=type distribution method for processes to nodes\n" " (type = block|cyclic|arbitrary)\n" - " --mail-type=type notify on state change: BEGIN, END, FAIL or ALL\n" " --mail-user=user who to send email notification for job state\n" " changes\n" diff --git a/src/sbatch/sbatch.c b/src/sbatch/sbatch.c index 8992a4c5a29f4b6c7e45190fa9a2b4afe1cb6ad5..f0a7fe784fd1d3a6136a913e2a3c4e36ea1feda5 100644 --- a/src/sbatch/sbatch.c +++ b/src/sbatch/sbatch.c @@ -161,6 +161,7 @@ int main(int argc, char *argv[]) /* If can run on multiple clusters find the earliest run time * and run it there */ + desc.clusters = xstrdup(opt.clusters); if (opt.clusters && slurmdb_get_first_avail_cluster(&desc, opt.clusters, &working_cluster_rec) != SLURM_SUCCESS) { @@ -168,7 +169,6 @@ int main(int argc, char *argv[]) exit(error_exit); } - if (_check_cluster_specific_settings(&desc) != SLURM_SUCCESS) exit(error_exit); @@ -221,6 +221,7 @@ int main(int argc, char *argv[]) if (opt.wait) rc = _job_wait(resp->job_id); + xfree(desc.clusters); xfree(desc.name); xfree(desc.script); env_array_free(desc.environment); diff --git a/src/scancel/opt.c b/src/scancel/opt.c index bbe4f783c361261186975f8aa189b24404e568be..18011629d8d4a7729eb3594c42b256956b9875d8 100644 --- a/src/scancel/opt.c +++ b/src/scancel/opt.c @@ -701,6 +701,8 @@ static void _help(void) /* printf(" --ctld send request directly to slurmctld\n"); */ printf(" -f, --full signal batch shell and all steps for specified job\n"); printf(" -i, --interactive require response from user for each job\n"); + printf(" -M, --clusters clusters to issue commands to.\n"); + printf(" NOTE: SlurmDBD must be up.\n"); printf(" -n, --name=job_name act only on jobs with this name\n"); printf(" -p, --partition=partition act only on jobs in this partition\n"); printf(" -Q, --quiet disable warnings\n"); diff --git a/src/scancel/scancel.c b/src/scancel/scancel.c index e9db52e6acb8731475e10a295fbf6de67fc98e92..e96c83fb84ee6f1236b42d69a2eeb06d5052fe30 100644 --- a/src/scancel/scancel.c +++ b/src/scancel/scancel.c @@ -183,7 +183,8 @@ _load_job_records (void) /* We need the fill job array string representation for identifying * and killing job arrays */ setenv("SLURM_BITSTR_LEN", "0", 1); - error_code = slurm_load_jobs ((time_t) NULL, &job_buffer_ptr, 1); + error_code = slurm_load_jobs ((time_t) NULL, &job_buffer_ptr, + (SHOW_ALL | SHOW_FED_TRACK)); if (error_code) { slurm_perror ("slurm_load_jobs error"); diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c index f9da84f77d35309e43f41f752e7756c2fb159238..f7a3ab94fc3ebc53ec7fedbae2261bcf0845b146 100644 --- a/src/scontrol/scontrol.c +++ b/src/scontrol/scontrol.c @@ -1917,6 +1917,7 @@ scontrol [<OPTION>] [<COMMAND>] \n\ -h or --help: equivalent to \"help\" command \n\ --hide: equivalent to \"hide\" command \n\ -M or --cluster: equivalent to \"cluster\" command \n\ + NOTE: SlurmDBD must be up. \n\ -o or --oneliner: equivalent to \"oneliner\" command \n\ -Q or --quiet: equivalent to \"quiet\" command \n\ -v or --verbose: equivalent to \"verbose\" command \n\ @@ -1934,6 +1935,7 @@ scontrol [<OPTION>] [<COMMAND>] \n\ cluster cluster to issue commands to. Default is \n\ current cluster. cluster with no name will \n\ reset to default. \n\ + NOTE: SlurmDBD must be up.
\n\ checkpoint <CH_OP><ID> perform a checkpoint operation on identified \n\ job or job step \n\ completing display jobs in completing state along with \n\ diff --git a/src/sinfo/opts.c b/src/sinfo/opts.c index c3766d5307c4e0124889e57aac7f5eda14385d38..4b5181b6b6c579eae5214aab84c44fe8a464f268 100644 --- a/src/sinfo/opts.c +++ b/src/sinfo/opts.c @@ -1311,6 +1311,8 @@ Usage: sinfo [OPTIONS]\n\ --hide do not show hidden or non-accessible partitions\n\ -i, --iterate=seconds specify an iteration period\n\ -l, --long long output - displays more information\n\ + -M, --clusters=names clusters to issue commands to.\n\ + NOTE: SlurmDBD must be up.\n\ -n, --nodes=NODES report on specific node(s)\n\ --noconvert don't convert units from their original type\n\ (e.g. 2048M won't be converted to 2G).\n\ diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 60945950b633f8fdd158b9d9478485500bfeda06..8762e12e6d6b70370bf7ff3e45e1f6fe00482d2e 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -1094,6 +1094,7 @@ static void *_service_connection(void *arg) } #endif slurm_msg_t_init(&msg); + msg.flags |= SLURM_MSG_KEEP_BUFFER; /* * slurm_receive_msg sets msg connection fd to accepted fd. This allows * possibility for slurmctld_req() to close accepted connection. diff --git a/src/slurmctld/fed_mgr.c b/src/slurmctld/fed_mgr.c index dea283cb5badf07c1934b97327eac9d4668dc52f..cc917b8eecb5b9b52e208d2d2df88a228dbbe97a 100644 --- a/src/slurmctld/fed_mgr.c +++ b/src/slurmctld/fed_mgr.c @@ -45,6 +45,7 @@ #include "src/common/list.h" #include "src/common/macros.h" +#include "src/common/parse_time.h" #include "src/common/slurm_protocol_api.h" #include "src/common/slurmdbd_defs.h" #include "src/common/xmalloc.h" @@ -57,6 +58,8 @@ #define FED_MGR_STATE_FILE "fed_mgr_state" #define FED_MGR_CLUSTER_ID_BEGIN 26 +#define FED_SIBLING_BIT(x) ((uint64_t)1 << (x - 1)) + slurmdb_federation_rec_t *fed_mgr_fed_rec = NULL; static slurmdb_cluster_rec_t *fed_mgr_cluster_rec = NULL; @@ -66,6 +69,31 @@ static pthread_mutex_t open_send_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t update_mutex = PTHREAD_MUTEX_INITIALIZER; +/* structs to pass to threads */ +typedef struct { + will_run_response_msg_t *resp; + slurmdb_cluster_rec_t *sibling; + sib_msg_t *sib_msg; + uid_t uid; + pthread_t thread_id; + int thread_rc; +} sib_willrun_t; + +typedef struct { + slurmdb_cluster_rec_t *sibling; + sib_msg_t *sib_msg; + pthread_t thread_id; + int thread_rc; +} sib_submit_t; + +typedef struct { + job_desc_msg_t *job_desc; + slurmdb_cluster_rec_t *sibling; + pthread_t thread_id; + int thread_rc; +} sib_update_t; + + static int _close_controller_conn(slurmdb_cluster_rec_t *cluster) { int rc = SLURM_SUCCESS; @@ -438,8 +466,8 @@ static void _persist_callback_fini(void *arg) } if (!(cluster = list_find_first(fed_mgr_fed_rec->cluster_list, - slurmdb_find_cluster_in_list, - persist_conn->cluster_name))) { + slurmdb_find_cluster_in_list, + persist_conn->cluster_name))) { info("Couldn't find cluster %s?", persist_conn->cluster_name); unlock_slurmctld(fed_write_lock); @@ -485,6 +513,168 @@ static void _join_federation(slurmdb_federation_rec_t *fed, _create_ping_thread(); } +static int _persist_job_will_run(slurmdb_cluster_rec_t *conn, + sib_msg_t *sib_msg, + will_run_response_msg_t **will_run_resp) +{ + int rc = SLURM_PROTOCOL_SUCCESS; + slurm_msg_t req_msg, resp_msg; + + slurm_msg_t_init(&req_msg); + slurm_msg_t_init(&resp_msg); + + req_msg.msg_type 
= REQUEST_SIB_JOB_WILL_RUN; + req_msg.data = sib_msg; + + rc = _send_recv_msg(conn, &req_msg, &resp_msg, false); + if (rc < 0) { + rc = SLURM_PROTOCOL_ERROR; + goto end_it; + } + + switch (resp_msg.msg_type) { + case RESPONSE_SLURM_RC: + if ((rc = slurm_get_return_code(resp_msg.msg_type, + resp_msg.data))) { + info("persistent will_run failed/resources not avail: %d", rc); + slurm_seterrno(rc); + rc = SLURM_PROTOCOL_ERROR; + } + break; + case RESPONSE_JOB_WILL_RUN: + *will_run_resp = (will_run_response_msg_t *) resp_msg.data; + resp_msg.data = NULL; + break; + default: + slurm_seterrno(SLURM_UNEXPECTED_MSG_ERROR); + rc = SLURM_PROTOCOL_ERROR; + break; + } + +end_it: + slurm_free_msg_members(&resp_msg); + + return rc; +} + +static int _persist_submit_batch_job(slurmdb_cluster_rec_t *conn, + sib_msg_t *sib_msg, + submit_response_msg_t **resp) +{ + int rc = SLURM_PROTOCOL_SUCCESS; + slurm_msg_t req_msg, resp_msg; + + *resp = NULL; + + slurm_msg_t_init(&req_msg); + slurm_msg_t_init(&resp_msg); + + req_msg.msg_type = REQUEST_SIB_SUBMIT_BATCH_JOB; + req_msg.data = sib_msg; + + rc = _send_recv_msg(conn, &req_msg, &resp_msg, false); + if (rc) { + rc = SLURM_PROTOCOL_ERROR; + goto end_it; + } + + switch (resp_msg.msg_type) { + case RESPONSE_SLURM_RC: + if ((rc = ((return_code_msg_t *) resp_msg.data)->return_code)) { + slurm_seterrno(rc); + rc = SLURM_PROTOCOL_ERROR; + } + break; + case RESPONSE_SUBMIT_BATCH_JOB: + *resp = (submit_response_msg_t *) resp_msg.data; + resp_msg.data = NULL; + break; + default: + slurm_seterrno(SLURM_UNEXPECTED_MSG_ERROR); + rc = SLURM_PROTOCOL_ERROR; + } + +end_it: + slurm_free_msg_members(&resp_msg); + + return rc; +} + +static int _persist_allocte_resources(slurmdb_cluster_rec_t *conn, + sib_msg_t *sib_msg, + resource_allocation_response_msg_t **resp) +{ + int rc = SLURM_PROTOCOL_SUCCESS; + slurm_msg_t req_msg, resp_msg; + + *resp = NULL; + + slurm_msg_t_init(&req_msg); + slurm_msg_t_init(&resp_msg); + + req_msg.msg_type = REQUEST_SIB_RESOURCE_ALLOCATION; + req_msg.data = sib_msg; + + rc = _send_recv_msg(conn, &req_msg, &resp_msg, false); + if (rc) { + rc = SLURM_PROTOCOL_ERROR; + goto end_it; + } + + switch (resp_msg.msg_type) { + case RESPONSE_SLURM_RC: + if ((rc = ((return_code_msg_t *) resp_msg.data)->return_code)) { + slurm_seterrno(rc); + rc = SLURM_PROTOCOL_ERROR; + } + break; + case RESPONSE_RESOURCE_ALLOCATION: + *resp = (resource_allocation_response_msg_t *) resp_msg.data; + resp_msg.data = NULL; + break; + default: + slurm_seterrno(SLURM_UNEXPECTED_MSG_ERROR); + rc = SLURM_PROTOCOL_ERROR; + } + +end_it: + slurm_free_msg_members(&resp_msg); + + return rc; +} + +static int _persist_update_job(slurmdb_cluster_rec_t *conn, + job_desc_msg_t *data) +{ + int rc; + slurm_msg_t req_msg; + slurm_msg_t resp_msg; + + slurm_msg_t_init(&req_msg); + req_msg.msg_type = REQUEST_UPDATE_JOB; + req_msg.data = data; + + rc = _send_recv_msg(conn, &req_msg, &resp_msg, false); + if (rc == SLURM_SOCKET_ERROR) + return SLURM_ERROR; + + switch (resp_msg.msg_type) { + case RESPONSE_SLURM_RC: + rc = ((return_code_msg_t *) resp_msg.data)->return_code; + if (rc) { + slurm_free_msg_members(&resp_msg); + slurm_seterrno_ret(rc); + } + break; + default: + slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR); + } + + slurm_free_msg_members(&resp_msg); + + return SLURM_PROTOCOL_SUCCESS; +} + extern int fed_mgr_init(void *db_conn) { int rc = SLURM_SUCCESS; @@ -567,10 +757,14 @@ extern int fed_mgr_fini() lock_slurmctld(fed_write_lock); - slurm_persist_conn_recv_server_fini(); - + /* Call 
_leave_federation() before slurm_persist_conn_recv_server_fini() + * as this will NULL out the cluster's recv persistent connection before + * _server_fini() actually destroy's it. That way the cluster's recv + * connection won't be pointing to bad memory. */ _leave_federation(); + slurm_persist_conn_recv_server_fini(); + unlock_slurmctld(fed_write_lock); return SLURM_SUCCESS; @@ -771,7 +965,7 @@ extern slurmdb_federation_rec_t *fed_mgr_state_load(char *state_save_location) !list_count(ret_fed->cluster_list)) { slurmdb_destroy_federation_rec(ret_fed); ret_fed = NULL; - error("No feds retrieved"); + debug("No feds to retrieve from state"); } else { /* We want to free the connections here since they don't exist * anymore, but they were packed when state was saved. */ @@ -797,34 +991,17 @@ unpack_error: return NULL; } -extern int _find_sibling_by_ip(void *x, void *key) +static int _find_sibling_by_id(void *x, void *key) { slurmdb_cluster_rec_t *object = (slurmdb_cluster_rec_t *)x; - char *ip = (char *)key; + int id = (intptr_t)key; - if (!xstrcmp(object->control_host, ip)) + if (object->fed.id == id) return 1; return 0; } -extern char *fed_mgr_find_sibling_name_by_ip(char *ip) -{ - char *name = NULL; - slurmdb_cluster_rec_t *sibling = NULL; - slurmctld_lock_t fed_read_lock = { - NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK }; - - lock_slurmctld(fed_read_lock); - if (fed_mgr_fed_rec && fed_mgr_fed_rec->cluster_list && - (sibling = list_find_first(fed_mgr_fed_rec->cluster_list, - _find_sibling_by_ip, ip))) - name = xstrdup(sibling->name); - unlock_slurmctld(fed_read_lock); - - return name; -} - /* * Returns true if the cluster is part of a federation. */ @@ -950,3 +1127,752 @@ extern int fed_mgr_add_sibling_conn(slurm_persist_conn_t *persist_conn, return rc; } + +static void _destroy_sib_willrun(void *object) +{ + sib_willrun_t *resp = (sib_willrun_t *)object; + if (resp) { + slurm_free_will_run_response_msg(resp->resp); + xfree(resp); + } +} + +static void _xfree_f(void* p) +{ + xfree(p); +} + +static void *_sib_will_run(void *arg) +{ + int rc = SLURM_SUCCESS; + sib_willrun_t *sib_willrun = (sib_willrun_t *)arg; + + if (sib_willrun->sibling == fed_mgr_cluster_rec) { + char *err_msg = NULL; + struct job_record *job_ptr = NULL; + job_desc_msg_t *job_desc; + sib_msg_t *sib_msg = sib_willrun->sib_msg; + slurmctld_lock_t job_write_lock = { + NO_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK }; + + lock_slurmctld(job_write_lock); + job_desc = sib_msg->data; + + if (job_desc->job_id == NO_VAL) { + /* Get a job_id now without incrementing the job_id + * count. 
This prevents burning job_ids on will_runs */ + job_desc->job_id = get_next_job_id(true); + } + + rc = job_allocate(sib_msg->data, false, true, + &sib_willrun->resp, true, sib_willrun->uid, + &job_ptr, &err_msg, sib_msg->data_version); + unlock_slurmctld(job_write_lock); + + if (rc) + debug2("%s: %s", __func__, slurm_strerror(rc)); + } else if ((rc = _persist_job_will_run(sib_willrun->sibling, + sib_willrun->sib_msg, + &sib_willrun->resp))) { + error("Failed to get will_run response from sibling %s", + sib_willrun->sibling->name); + } + + sib_willrun->thread_rc = rc; + + return NULL; +} + +static int _sort_sib_will_runs(void *x, void *y) +{ + int idle_rc = 0; + sib_willrun_t *run1 = *(sib_willrun_t **)x; + sib_willrun_t *run2 = *(sib_willrun_t **)y; + + if (!run1->resp) + return 1; + if (!run2->resp) + return -1; + + if (run1->sibling->fed.weight < run2->sibling->fed.weight) + return -1; + if (run1->sibling->fed.weight > run2->sibling->fed.weight) + return 1; + + /* pack jobs onto clusters with most avail resources. */ + if (run1->resp->sys_usage_per < run2->resp->sys_usage_per) + idle_rc = 1; + if (run1->resp->sys_usage_per > run2->resp->sys_usage_per) + idle_rc = -1; + + /* spread jobs across clusters */ + if (fed_mgr_fed_rec->flags & FEDERATION_FLAG_LLC) + idle_rc *= -1; + + return idle_rc; +} + +/* + * Convert comma separated list of cluster names to bitmap of cluster ids. + */ +static uint64_t _cluster_names_to_ids(char *clusters) +{ + uint64_t cluster_ids = 0; + List cluster_names = list_create(slurm_destroy_char); + + xassert(clusters); + + if (!xstrcasecmp(clusters, "all")) + return INFINITE64; + + if (slurm_addto_char_list(cluster_names, clusters)) { + ListIterator itr = list_iterator_create(cluster_names); + char *cluster_name; + slurmdb_cluster_rec_t *sibling; + + while ((cluster_name = list_next(itr))) { + if ((sibling = + list_find_first(fed_mgr_fed_rec->cluster_list, + slurmdb_find_cluster_in_list, + cluster_name))) { + cluster_ids |= FED_SIBLING_BIT(sibling->fed.id); + } + } + list_iterator_destroy(itr); + } + FREE_NULL_LIST(cluster_names); + + return cluster_ids; +} + +/* + * Get will_run responses from all clusters in a federation. + * IN msg - contains the original job_desc buffer to send to the siblings and to + * be able to create a job_desc copy to willrun itself. + * IN job_desc - original job_desc. It contains the federated job_id to put on + * the unpacked job_desc. This is not used for the actual will_run because + * job_allocate will modify the job_desc. + * IN uid - uid of user submitting the job + * RET returns a list of will_run_response_msg_t*'s. + */ +static List _get_sib_will_runs(slurm_msg_t *msg, job_desc_msg_t *job_desc, + uid_t uid) +{ + sib_willrun_t *sib_willrun = NULL; + slurmdb_cluster_rec_t *sibling = NULL; + ListIterator sib_itr, resp_itr; + List sib_willruns = NULL; + pthread_attr_t attr; + sib_msg_t sib_msg; + uint32_t buf_offset; + uint64_t cluster_list = INFINITE64; /* all clusters available */ + slurm_msg_t tmp_msg; + + xassert(job_desc); + xassert(msg); + + slurm_attr_init(&attr); + sib_willruns = list_create(_destroy_sib_willrun); + + /* Create copy of submitted job_desc since job_allocate() can modify the + * original job_desc. 
*/ + buf_offset = get_buf_offset(msg->buffer); + slurm_msg_t_init(&tmp_msg); + tmp_msg.flags = msg->flags; + tmp_msg.msg_type = msg->msg_type; + tmp_msg.protocol_version = msg->protocol_version; + + unpack_msg(&tmp_msg, msg->buffer); + set_buf_offset(msg->buffer, buf_offset); + + ((job_desc_msg_t *)tmp_msg.data)->job_id = job_desc->job_id; + sib_msg.data = tmp_msg.data; + sib_msg.data_buffer = msg->buffer; + sib_msg.data_version = msg->protocol_version; + sib_msg.data_type = msg->msg_type; + + if (job_desc->clusters) + cluster_list = _cluster_names_to_ids(job_desc->clusters); + + /* willrun the sibling clusters */ + sib_itr = list_iterator_create(fed_mgr_fed_rec->cluster_list); + while ((sibling = list_next(sib_itr))) { + if (!(cluster_list & FED_SIBLING_BIT(sibling->fed.id))) { + if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR) + info("skipping cluster %s -- not in cluster list to submit job to", + sibling->name); + + continue; + } + + sib_willrun = xmalloc(sizeof(sib_willrun_t)); + sib_willrun->sibling = sibling; + sib_willrun->uid = uid; + sib_willrun->sib_msg = &sib_msg; + + if (pthread_create(&sib_willrun->thread_id, &attr, + _sib_will_run, sib_willrun) != 0) { + error("failed to create sib_will_run thread for sib %s", + sibling->name); + _destroy_sib_willrun(sib_willrun); + continue; + } + + list_append(sib_willruns, sib_willrun); + } + list_iterator_destroy(sib_itr); + + slurm_attr_destroy(&attr); + + resp_itr = list_iterator_create(sib_willruns); + while ((sib_willrun = list_next(resp_itr))) { + pthread_join(sib_willrun->thread_id, NULL); + + if (sib_willrun->resp && + (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR)) { + char buf[64]; + slurm_make_time_str(&sib_willrun->resp->start_time, + buf, sizeof(buf)); + info("will_run_resp for %s: " + "start:%s sys_usage:%-6.2f weight:%d", + sib_willrun->sibling->name, buf, + sib_willrun->resp->sys_usage_per, + sib_willrun->sibling->fed.weight); + } + } + + list_iterator_destroy(resp_itr); + + /* Free unpacked job_desc data */ + slurm_free_msg_members(&tmp_msg); + + return sib_willruns; +} + +/* + * Find a sibling that can start the job now. + * IN msg - contains the original job_desc buffer to send to the siblings and to + * be able to create a job_desc copy to willrun itself. + * IN job_desc - original job_desc. It contains the federated job_id to put on + * the unpacked job_desc. This is not used for the actual will_run because + * job_allocate will modify the job_desc. + * IN uid - uid of user submitting the job + * OUT avail_sibs - bitmap of cluster ids that returned a will_run_response. + * RET returns a ptr to a cluster_rec that can or start the job now or NULL if + * no cluster can start the job now. + */ +static slurmdb_cluster_rec_t *_find_start_now_sib(slurm_msg_t *msg, + job_desc_msg_t *job_desc, + uid_t uid, + uint64_t *avail_sibs) +{ + ListIterator itr; + List sib_willruns; + sib_willrun_t *sib_willrun = NULL; + sib_willrun_t *start_now_sib = NULL; + slurmdb_cluster_rec_t *ret_sib = NULL; + time_t now = 0; + + xassert(avail_sibs); + xassert(job_desc); + xassert(msg); + + if (!(sib_willruns = _get_sib_will_runs(msg, job_desc, uid))) { + error("Failed to get any will_run responses from any sibs"); + return NULL; + } + + list_sort(sib_willruns, (ListCmpF)_sort_sib_will_runs); + + now = time(NULL); + + itr = list_iterator_create(sib_willruns); + while ((sib_willrun = list_next(itr))) { + if (!sib_willrun->resp) /* no response if job couldn't run? 
*/ + continue; + + *avail_sibs |= FED_SIBLING_BIT(sib_willrun->sibling->fed.id); + + /* Pick first sibling that can start the job now. siblings are + * sorted by weight and resources. */ + if (sib_willrun->resp->start_time <= now) { + start_now_sib = sib_willrun; + break; + } + } + if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR) { + if (start_now_sib) + info("Earliest cluster:%s time:%ld now:%ld", + start_now_sib->sibling->name, + start_now_sib->resp->start_time, now); + else + info("No siblings can start the job now (%ld))", now); + } + list_iterator_destroy(itr); + + if (start_now_sib) + ret_sib = start_now_sib->sibling; + + FREE_NULL_LIST(sib_willruns); + + return ret_sib; +} + +static void *_submit_sibling_allocation(void *arg) +{ + int rc = SLURM_SUCCESS; + resource_allocation_response_msg_t *alloc_resp = NULL; + sib_submit_t *sub = (sib_submit_t *)arg; + slurmdb_cluster_rec_t *sibling = sub->sibling; + sib_msg_t *sib_msg = sub->sib_msg; + + if ((rc = _persist_allocte_resources(sibling, sib_msg, &alloc_resp))) { + error("Failed to submit job to sibling %s: %m", sibling->name); + } else if (!alloc_resp) { + error("Got a success back without a resp. This shouldn't happen"); + rc = SLURM_ERROR; + } else if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR) { + info("Submitted federated allocation %u to %s", + alloc_resp->job_id, sibling->name); + } + sub->thread_rc = rc; + + slurm_free_resource_allocation_response_msg(alloc_resp); + + return NULL; +} + +static void *_submit_sibling_batch_job(void *arg) +{ + int rc = SLURM_SUCCESS; + submit_response_msg_t *resp = NULL; + sib_submit_t *sub = (sib_submit_t *)arg; + slurmdb_cluster_rec_t *sibling = sub->sibling; + sib_msg_t *sib_msg = sub->sib_msg; + + if ((rc = _persist_submit_batch_job(sibling, sib_msg, &resp))) { + error("Failed to submit job to sibling %s: %m", sibling->name); + } else if (!resp) { + error("Got a success back without a resp. This shouldn't happen"); + rc = SLURM_ERROR; + } else if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR) { + info("Submitted federated job %u to %s", + resp->job_id, sibling->name); + } + sub->thread_rc = rc; + + slurm_free_submit_response_response_msg(resp); + + return NULL; +} + +static void *_update_sibling_job(void *arg) +{ + sib_update_t *sub = (sib_update_t *)arg; + sub->thread_rc = _persist_update_job(sub->sibling, sub->job_desc); + + return NULL; +} + +/* + * Submit sibling jobs to designated (job_desc->fed_siblings) siblings. + * + * Will update job_desc->fed_siblings if a sibling fails to submit a job. + * + * IN job_desc - job_desc containing job_id and fed_siblings of job to be. + * IN msg - contains the original job_desc buffer to send to the siblings. + * IN alloc_only - true if just an allocation. false if a batch job. + * RET returns SLURM_SUCCESS if all siblings recieved the job sucessfully or + * SLURM_ERROR if any siblings failed to receive the job. If a sibling + * fails, then the sucessful siblings will be updated with the correct + * sibling bitmap. 
+ */ +static int _submit_sibling_jobs(job_desc_msg_t *job_desc, slurm_msg_t *msg, + bool alloc_only) +{ + int rc = SLURM_SUCCESS; + ListIterator sib_itr, thread_itr; + List submit_threads = NULL; + sib_submit_t *tmp_sub = NULL; + sib_msg_t sib_msg; + slurmdb_cluster_rec_t *sibling = NULL; + pthread_attr_t attr; + + xassert(job_desc); + xassert(msg); + + slurm_attr_init(&attr); + submit_threads = list_create(_xfree_f); + + sib_msg.data_buffer = msg->buffer; + sib_msg.data_type = msg->msg_type; + sib_msg.data_version = msg->protocol_version; + sib_msg.fed_siblings = job_desc->fed_siblings; + sib_msg.job_id = job_desc->job_id; + + sib_itr = list_iterator_create(fed_mgr_fed_rec->cluster_list); + while ((sibling = list_next(sib_itr))) { + pthread_t thread_id = 0; + sib_submit_t *sub; + + if (sibling == fed_mgr_cluster_rec) + continue; + + /* fed_siblings is set prior to siblings that responded */ + if (!(job_desc->fed_siblings & + FED_SIBLING_BIT(sibling->fed.id))) + continue; + + sub = xmalloc(sizeof(sib_submit_t)); + sub->sibling = sibling; + sub->sib_msg = &sib_msg; + if (pthread_create(&thread_id, &attr, + ((alloc_only) ? + _submit_sibling_allocation : + _submit_sibling_batch_job), sub) != 0) { + error("failed to create submit_sibling_job_thread"); + xfree(sub); + continue; + } + sub->thread_id = thread_id; + + list_append(submit_threads, sub); + } + + thread_itr = list_iterator_create(submit_threads); + while ((tmp_sub = list_next(thread_itr))) { + pthread_join(tmp_sub->thread_id, NULL); + rc |= tmp_sub->thread_rc; + + /* take out the job from the siblings bitmap if there was an + * error. The local host should stay in it if it's there. */ + if (tmp_sub->thread_rc) + job_desc->fed_siblings &= + (~FED_SIBLING_BIT(tmp_sub->sibling->fed.id)); + } + list_iterator_destroy(thread_itr); + + if (rc && job_desc->fed_siblings) { + /* failed to submit a job to sibling. Need to update all of the + * job's fed_siblings bitmaps */ + List update_threads = list_create(_xfree_f); + job_desc_msg_t *job_update_msg = + xmalloc(sizeof(job_desc_msg_t)); + + slurm_init_job_desc_msg(job_update_msg); + job_update_msg->job_id = job_desc->job_id; + job_update_msg->fed_siblings = job_desc->fed_siblings; + + list_iterator_reset(sib_itr); + while ((sibling = list_next(sib_itr))) { + pthread_t thread_id = 0; + sib_update_t *sub; + + /* Local is handled outside */ + if (sibling == fed_mgr_cluster_rec) + continue; + + if (!(job_desc->fed_siblings & + FED_SIBLING_BIT(sibling->fed.id))) + continue; + + sub = xmalloc(sizeof(sib_submit_t)); + sub->job_desc = job_update_msg; + sub->sibling = sibling; + if (pthread_create(&thread_id, &attr, + _update_sibling_job, sub) != 0) { + error("failed to create submit_sibling_job_thread"); + xfree(sub); + continue; + } + sub->thread_id = thread_id; + + list_append(update_threads, sub); + } + + thread_itr = list_iterator_create(update_threads); + while ((tmp_sub = list_next(thread_itr))) { + pthread_join(tmp_sub->thread_id, NULL); + if (tmp_sub->thread_rc) { + error("failed to update sibling job with updated sibling bitmap on sibling %s", + tmp_sub->sibling->name); + /* other cluster should get update when it syncs + * up */ + } + } + list_iterator_destroy(thread_itr); + } + + slurm_attr_destroy(&attr); + list_iterator_destroy(sib_itr); + FREE_NULL_LIST(submit_threads); + + return rc; +} + +/* Determine how to submit a federated a job. + * + * First tries to find a cluster that can start the job now. If a cluster can + * start the job now, then a sibling job is submitted to that cluster. 
If no + * cluster can start the job now, then siblings jobs are submitted to each + * sibling. + * + * Does its own locking (job and fed). Doesn't have a job write lock when + * communicating with siblings to prevent blocking on sibling communications. + * + * IN msg - msg that contains packed job_desc msg to send to siblings. + * IN job_desc - original job_desc msg. + * IN alloc_only - true if requesting just an allocation (srun/salloc). + * IN uid - uid of user requesting allocation. + * IN protocol_version - version of the code the caller is using + * OUT job_id_ptr - job_id of allocated job + * OUT alloc_code - error_code returned from job_allocate + * OUT err_msg - error message returned if any + * RET returns SLURM_SUCCESS if the allocation was successful, SLURM_ERROR + * otherwise. + */ +extern int fed_mgr_job_allocate(slurm_msg_t *msg, job_desc_msg_t *job_desc, + bool alloc_only, uid_t uid, + uint16_t protocol_version, + uint32_t *job_id_ptr, int *alloc_code, + char **err_msg) +{ + int rc = SLURM_SUCCESS; + slurmdb_cluster_rec_t *start_now_sib; + uint64_t avail_sibs = 0; + struct job_record *job_ptr = NULL; + slurmctld_lock_t fed_read_lock = { + NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK }; + slurmctld_lock_t job_write_lock = { + NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK, NO_LOCK }; + + xassert(msg); + xassert(job_desc); + xassert(job_id_ptr); + xassert(alloc_code); + xassert(err_msg); + + lock_slurmctld(fed_read_lock); + + lock_slurmctld(job_write_lock); + /* get job_id now. Can't submit job to get job_id as job_allocate will + * change the job_desc. */ + job_desc->job_id = get_next_job_id(false); + unlock_slurmctld(job_write_lock); + + /* Don't job/node write lock on _find_start_now_sib. It locks inside + * _sib_will_run */ + start_now_sib = _find_start_now_sib(msg, job_desc, uid, &avail_sibs); + + if (!avail_sibs) { + debug("No cluster responded to sibling will_runs, submitting to self"); + avail_sibs = FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id); + } + + if (start_now_sib == NULL) { + job_desc->fed_siblings = avail_sibs; + } else if (start_now_sib == fed_mgr_cluster_rec) { + job_desc->fed_siblings |= + FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id); + } else { + job_desc->fed_siblings |= + FED_SIBLING_BIT(start_now_sib->fed.id); + } + + /* Submit local job first. Then submit to all siblings. If the local job + * fails, then don't worry about sending to the siblings. */ + lock_slurmctld(job_write_lock); + *alloc_code = job_allocate(job_desc, job_desc->immediate, false, NULL, + alloc_only, uid, &job_ptr, err_msg, + protocol_version); + + if (!job_ptr || (*alloc_code && job_ptr->job_state == JOB_FAILED)) { + unlock_slurmctld(job_write_lock); + rc = SLURM_ERROR; + /* There may be an rc but the job won't be failed. Will sit in + * qeueue */ + info("failed to submit federated job to local cluster"); + goto end_it; + } + + *job_id_ptr = job_ptr->job_id; + + info("Submitted %sfederated job %u to %s(self)", + (!(job_ptr->fed_details->siblings & + FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id)) ? + "tracking " : ""), + job_ptr->job_id, fed_mgr_cluster_rec->name); + + unlock_slurmctld(job_write_lock); + + if (_submit_sibling_jobs(job_desc, msg, alloc_only)) { + /* failed to submit a sibling job to a sibling. Need to update + * the local job's sibling bitmap */ + + lock_slurmctld(job_write_lock); + if ((job_ptr->magic == JOB_MAGIC) && + (job_ptr->job_id == *job_id_ptr)) { + + if (!job_desc->fed_siblings) { + /* we know that we already have a job_ptr so + * just make it a scheduleable job. 
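+			 * Without any sibling bits set the local record
+			 * would look like a tracker-only job (per
+			 * fed_mgr_is_tracker_only_job() below, a
+			 * locally-originated job whose siblings bitmap lacks
+			 * the local cluster's bit) and _job_runnable_test1()
+			 * would then skip it, so OR in
+			 * FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id) to
+			 * keep the job schedulable here.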
*/ + error("Failed to submit fed job to siblings, submitting to local cluster"); + job_desc->fed_siblings |= + FED_SIBLING_BIT( + fed_mgr_cluster_rec->fed.id); + } + set_job_fed_details(job_ptr, job_desc->fed_siblings); + } else { + error("%s: job got messed up. this should never happen", + __func__); + } + + unlock_slurmctld(job_write_lock); + } + +end_it: + unlock_slurmctld(fed_read_lock); + + return rc; +} + +/* Tests if the job is a tracker only federated job. + * Tracker only job: a job that shouldn't run on the local cluster but should be + * kept around to facilitate communications for it's sibling jobs on other + * clusters. + */ +extern bool fed_mgr_is_tracker_only_job(struct job_record *job_ptr) +{ + bool rc = false; + slurmctld_lock_t fed_read_lock = { + NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK }; + + xassert(job_ptr); + + lock_slurmctld(fed_read_lock); + + if (job_ptr->fed_details && + fed_mgr_cluster_rec && + (fed_mgr_get_cluster_id(job_ptr->job_id) == + fed_mgr_cluster_rec->fed.id) && + (!(job_ptr->fed_details->siblings & + FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id)))) + rc = true; + + unlock_slurmctld(fed_read_lock); + + return rc; +} + +/* Return the cluster name for the given cluster id. + * Must xfree returned string + */ +extern char *fed_mgr_get_cluster_name(uint32_t id) +{ + slurmdb_cluster_rec_t *sibling; + char *name = NULL; + + if ((sibling = + list_find_first(fed_mgr_fed_rec->cluster_list, + _find_sibling_by_id, + (void *)(intptr_t)id))) { + name = xstrdup(sibling->name); + } + + return name; +} + + +/* Convert cluster ids to cluster names. + * + * RET: return string of comma-separated clsuter names. + * Must free returned string. + */ +extern char *fed_mgr_cluster_ids_to_names(uint64_t cluster_ids) +{ + int bit = 1; + char *names = NULL; + + if (!fed_mgr_fed_rec || !fed_mgr_fed_rec->cluster_list) + return names; + + while (cluster_ids) { + if (cluster_ids & 1) { + slurmdb_cluster_rec_t *sibling; + if ((sibling = + list_find_first(fed_mgr_fed_rec->cluster_list, + _find_sibling_by_id, + (void *)(intptr_t)bit))){ + xstrfmtcat(names, "%s%s", + (names) ? "," : "", sibling->name); + } else { + error("Couldn't find a sibling cluster with id %d", + bit); + } + } + + cluster_ids >>= 1; + bit++; + } + + return names; +} + +/* Find the earliest time a job can start by doing willruns to all clusters in + * the federation and returning the fastest time. + * + * IN msg - msg that contains packed job_desc msg to send to siblings. + * IN job_desc - original job_desc msg. + * IN uid - uid of user requesting will_run. + * OUT resp - will_run_response to return + * RET returns a SLURM_SUCCESS if a will_run_response is found, SLURM_ERROR + * otherwise. + */ +extern int fed_mgr_sib_will_run(slurm_msg_t *msg, job_desc_msg_t *job_desc, + uid_t uid, will_run_response_msg_t **resp) +{ + int rc = SLURM_SUCCESS; + ListIterator itr; + List sib_willruns; + sib_willrun_t *sib_willrun; + sib_willrun_t *earliest_willrun = NULL; + slurmctld_lock_t fed_read_lock = { + NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK }; + + xassert(msg); + xassert(job_desc); + xassert(resp); + + *resp = NULL; + + lock_slurmctld(fed_read_lock); + + if (!(sib_willruns = _get_sib_will_runs(msg, job_desc, uid))) { + error("Failed to get any will_run responses from any sibs"); + return SLURM_ERROR; + } + + itr = list_iterator_create(sib_willruns); + while ((sib_willrun = list_next(itr))) { + if (!sib_willrun->resp) /* no response if job couldn't run? 
*/ + continue; + + if ((earliest_willrun == NULL) || + (sib_willrun->resp->start_time < + earliest_willrun->resp->start_time)) + earliest_willrun = sib_willrun; + } + list_iterator_destroy(itr); + + if (earliest_willrun) { + *resp = earliest_willrun->resp; + earliest_willrun->resp = NULL; + } else { + rc = SLURM_ERROR; + } + + FREE_NULL_LIST(sib_willruns); + unlock_slurmctld(fed_read_lock); + + return rc; +} diff --git a/src/slurmctld/fed_mgr.h b/src/slurmctld/fed_mgr.h index b8fa7935cffe9f99c5dda575fa00e27f7bb8ae87..f85782df39d342082c8155b3a3f68f00a3a6efd4 100644 --- a/src/slurmctld/fed_mgr.h +++ b/src/slurmctld/fed_mgr.h @@ -42,17 +42,26 @@ extern slurmdb_federation_rec_t *fed_mgr_fed_rec; -extern int fed_mgr_init(void *db_conn); -extern int fed_mgr_fini(); -extern int fed_mgr_update_feds(slurmdb_update_object_t *update); -extern int fed_mgr_state_save(char *state_save_location); +extern int fed_mgr_add_sibling_conn(slurm_persist_conn_t *persist_conn, + char **out_buffer); +extern char *fed_mgr_cluster_ids_to_names(uint64_t cluster_ids); +extern int fed_mgr_fini(); +extern uint32_t fed_mgr_get_cluster_id(uint32_t id); +extern char *fed_mgr_get_cluster_name(uint32_t id); +extern uint32_t fed_mgr_get_job_id(uint32_t orig); +extern uint32_t fed_mgr_get_local_id(uint32_t id); +extern int fed_mgr_init(void *db_conn); +extern bool fed_mgr_is_active(); +extern bool fed_mgr_is_tracker_only_job(struct job_record *job_ptr); +extern int fed_mgr_job_allocate(slurm_msg_t *msg, + job_desc_msg_t *job_desc, bool alloc_only, + uid_t uid, uint16_t protocol_version, + uint32_t *job_id_ptr, int *alloc_code, + char **err_msg); +extern int fed_mgr_sib_will_run(slurm_msg_t *msg, + job_desc_msg_t *job_desc, uid_t uid, + will_run_response_msg_t **resp); extern slurmdb_federation_rec_t *fed_mgr_state_load(char *state_save_location); -extern char *fed_mgr_find_sibling_name_by_ip(char *ip); -extern bool fed_mgr_is_active(); -extern uint32_t fed_mgr_get_job_id(uint32_t orig); -extern uint32_t fed_mgr_get_local_id(uint32_t id); -extern uint32_t fed_mgr_get_cluster_id(uint32_t id); -extern int fed_mgr_add_sibling_conn(slurm_persist_conn_t *persist_conn, - char **out_buffer); - +extern int fed_mgr_state_save(char *state_save_location); +extern int fed_mgr_update_feds(slurmdb_update_object_t *update); #endif /* _SLURM_FED_MGR_H */ diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index b18b0120f120c16ec3b566417af0b000b4fc239f..ad3d9491befc891b4eabe08789eb91229549ac4a 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -176,6 +176,7 @@ static slurmdb_qos_rec_t *_determine_and_validate_qos( bool admin, slurmdb_qos_rec_t *qos_rec, int *error_code, bool locked); static void _dump_job_details(struct job_details *detail_ptr, Buf buffer); static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer); +static void _free_job_fed_details(job_fed_details_t **fed_details_pptr); static void _get_batch_job_dir_ids(List batch_dirs); static time_t _get_last_state_write_time(void); static void _job_array_comp(struct job_record *job_ptr, bool was_running); @@ -3531,6 +3532,9 @@ void dump_job_desc(job_desc_msg_t * job_specs) int spec_count; char *mem_type, buf[100], *signal_flags, *spec_type, *job_id; + if (get_log_level() < LOG_LEVEL_DEBUG3) + return; + if (job_specs == NULL) return; @@ -6535,6 +6539,9 @@ extern int validate_job_create_req(job_desc_msg_t * job_desc, uid_t submit_uid, if (rc != SLURM_SUCCESS) return rc; + if (job_desc->array_inx && fed_mgr_is_active()) + return 
ESLURM_NOT_SUPPORTED; + if (!_valid_array_inx(job_desc)) return ESLURM_INVALID_ARRAY; @@ -7338,6 +7345,8 @@ _copy_job_desc_to_job_record(job_desc_msg_t * job_desc, } if (job_desc->features) detail_ptr->features = xstrdup(job_desc->features); + if (job_desc->fed_siblings) + set_job_fed_details(job_ptr, job_desc->fed_siblings); if ((job_desc->shared == JOB_SHARED_NONE) && (select_serial == 0)) { detail_ptr->share_res = 0; detail_ptr->whole_node = 1; @@ -8240,6 +8249,7 @@ static void _list_delete_job(void *job_entry) xfree(job_ptr->burst_buffer); checkpoint_free_jobinfo(job_ptr->check_job); xfree(job_ptr->comment); + _free_job_fed_details(&job_ptr->fed_details); free_job_resources(&job_ptr->job_resrcs); xfree(job_ptr->gres); xfree(job_ptr->gres_alloc); @@ -8419,8 +8429,13 @@ static bool _all_parts_hidden(struct job_record *job_ptr) } /* Determine if a given job should be seen by a specific user */ -static bool _hide_job(struct job_record *job_ptr, uid_t uid) +static bool _hide_job(struct job_record *job_ptr, uid_t uid, + uint16_t show_flags) { + if (!(show_flags & SHOW_FED_TRACK) && + job_ptr->fed_details && fed_mgr_is_tracker_only_job(job_ptr)) + return true; + if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS) && (job_ptr->user_id != uid) && !validate_operator(uid) && (((slurm_mcs_get_privatedata() == 0) && @@ -8474,7 +8489,7 @@ extern void pack_all_jobs(char **buffer_ptr, int *buffer_size, _all_parts_hidden(job_ptr)) continue; - if (_hide_job(job_ptr, uid)) + if (_hide_job(job_ptr, uid, show_flags)) continue; if ((filter_uid != NO_VAL) && (filter_uid != job_ptr->user_id)) @@ -8529,7 +8544,7 @@ extern int pack_one_job(char **buffer_ptr, int *buffer_size, job_ptr = find_job_record(job_id); if (job_ptr && (job_ptr->array_task_id == NO_VAL) && !job_ptr->array_recs) { - if (!_hide_job(job_ptr, uid)) { + if (!_hide_job(job_ptr, uid, show_flags)) { pack_job(job_ptr, show_flags, buffer, protocol_version, uid); jobs_packed++; @@ -8540,7 +8555,7 @@ extern int pack_one_job(char **buffer_ptr, int *buffer_size, /* Either the job is not found or it is a job array */ if (job_ptr) { packed_head = true; - if (!_hide_job(job_ptr, uid)) { + if (!_hide_job(job_ptr, uid, show_flags)) { pack_job(job_ptr, show_flags, buffer, protocol_version, uid); jobs_packed++; @@ -8552,7 +8567,7 @@ extern int pack_one_job(char **buffer_ptr, int *buffer_size, if ((job_ptr->job_id == job_id) && packed_head) { ; /* Already packed */ } else if (job_ptr->array_job_id == job_id) { - if (_hide_job(job_ptr, uid)) + if (_hide_job(job_ptr, uid, show_flags)) break; pack_job(job_ptr, show_flags, buffer, protocol_version, uid); @@ -8777,6 +8792,17 @@ void pack_job(struct job_record *dump_job_ptr, uint16_t show_flags, Buf buffer, packstr(dump_job_ptr->tres_fmt_alloc_str, buffer); packstr(dump_job_ptr->tres_fmt_req_str, buffer); pack16(dump_job_ptr->start_protocol_ver, buffer); + + if (dump_job_ptr->fed_details) { + packstr(dump_job_ptr->fed_details->origin_str, buffer); + pack64(dump_job_ptr->fed_details->siblings, buffer); + packstr(dump_job_ptr->fed_details->siblings_str, + buffer); + } else { + packnull(buffer); + pack64((uint64_t)0, buffer); + packnull(buffer); + } } else if (protocol_version >= SLURM_16_05_PROTOCOL_VERSION) { detail_ptr = dump_job_ptr->details; pack32(dump_job_ptr->array_job_id, buffer); @@ -9871,18 +9897,44 @@ void reset_first_job_id(void) } /* - * get_next_job_id - return the job_id to be used by default for - * the next job + * Return the next available job_id to be used. 
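+ *
+ * When a federation is active the id returned here has already been mapped
+ * through fed_mgr_get_job_id(). A minimal sketch of the apparent layout,
+ * inferred from the 26-bit shift used by squeue's "fedoriginraw" format and
+ * by test37.2 (an assumption, not taken from fed_mgr_get_job_id() itself):
+ *
+ *   fed_job_id        = (origin_cluster_id << 26) | local_job_id;
+ *   origin_cluster_id = fed_job_id >> 26;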
+ * + * Must have job_write and fed_read locks when grabbing a job_id + * + * IN test_only - if true, doesn't advance the job_id sequence, just returns + * what the next job id will be. + * RET a valid job_id or SLURM_ERROR if all job_ids are exhausted. */ -extern uint32_t get_next_job_id(void) +extern uint32_t get_next_job_id(bool test_only) { - uint32_t next_id; + int i; + uint32_t new_id, max_jobs, tmp_id_sequence; - job_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id); - next_id = job_id_sequence + 1; - if (next_id >= slurmctld_conf.max_job_id) - next_id = slurmctld_conf.first_job_id; - return next_id; + max_jobs = slurmctld_conf.max_job_id - slurmctld_conf.first_job_id; + tmp_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id); + + /* Insure no conflict in job id if we roll over 32 bits */ + for (i = 0; i < max_jobs; i++) { + if (++tmp_id_sequence >= slurmctld_conf.max_job_id) + tmp_id_sequence = slurmctld_conf.first_job_id; + + new_id = fed_mgr_get_job_id(tmp_id_sequence); + + if (find_job_record(new_id)) + continue; + if (_dup_job_file_test(new_id)) + continue; + + if (!test_only) + job_id_sequence = tmp_id_sequence; + + return new_id; + } + + error("We have exhausted our supply of valid job id values. " + "FirstJobId=%u MaxJobId=%u", slurmctld_conf.first_job_id, + slurmctld_conf.max_job_id); + return SLURM_ERROR; } /* @@ -9891,38 +9943,20 @@ extern uint32_t get_next_job_id(void) */ static int _set_job_id(struct job_record *job_ptr) { - int i; - uint32_t new_id, max_jobs; + uint32_t new_id; xassert(job_ptr); xassert (job_ptr->magic == JOB_MAGIC); - max_jobs = slurmctld_conf.max_job_id - slurmctld_conf.first_job_id; - job_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id); - - /* Insure no conflict in job id if we roll over 32 bits */ - for (i = 0; i < max_jobs; i++) { - if (++job_id_sequence >= slurmctld_conf.max_job_id) - job_id_sequence = slurmctld_conf.first_job_id; - new_id = job_id_sequence; - if (find_job_record(new_id)) - continue; - if (_dup_job_file_test(new_id)) - continue; - - if (fed_mgr_is_active()) - job_ptr->job_id = fed_mgr_get_job_id(new_id); - else - job_ptr->job_id = new_id; + if ((new_id = get_next_job_id(false)) != SLURM_ERROR) { + job_ptr->job_id = new_id; /* When we get a new job id might as well make sure * the db_index is 0 since there is no way it will be * correct otherwise :). */ job_ptr->db_index = 0; return SLURM_SUCCESS; } - error("We have exhausted our supply of valid job id values. 
" - "FirstJobId=%u MaxJobId=%u", slurmctld_conf.first_job_id, - slurmctld_conf.max_job_id); + job_ptr->job_id = NO_VAL; return EAGAIN; } @@ -12038,6 +12072,23 @@ static int _update_job(struct job_record *job_ptr, job_desc_msg_t * job_specs, } } + if (job_specs->fed_siblings) { + slurmctld_lock_t fed_read_lock = { + NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK }; + if (job_ptr->fed_details) + info("update_job: setting fed_siblings from %"PRIu64" to %"PRIu64" for job_id %u", + job_ptr->fed_details->siblings, + job_specs->fed_siblings, + job_ptr->job_id); + else + info("update_job: setting fed_siblings to %"PRIu64" for job_id %u", + job_specs->fed_siblings, + job_ptr->job_id); + lock_slurmctld(fed_read_lock); + set_job_fed_details(job_ptr, job_specs->fed_siblings); + unlock_slurmctld(fed_read_lock); + } + fini: /* This was a local variable, so set it back to NULL */ job_specs->tres_req_cnt = NULL; @@ -16072,3 +16123,37 @@ _kill_dependent(struct job_record *job_ptr) last_job_update = now; srun_allocate_abort(job_ptr); } + +static void _free_job_fed_details(job_fed_details_t **fed_details_pptr) +{ + job_fed_details_t *fed_details_ptr = *fed_details_pptr; + + if (fed_details_ptr) { + xfree(fed_details_ptr->origin_str); + xfree(fed_details_ptr->siblings_str); + xfree(fed_details_ptr); + *fed_details_pptr = NULL; + } +} + + +extern void set_job_fed_details(struct job_record *job_ptr, + uint64_t fed_siblings) +{ + xassert(job_ptr); + + if (!job_ptr->fed_details) { + job_ptr->fed_details = + xmalloc(sizeof(job_fed_details_t)); + } else { + xfree(job_ptr->fed_details->siblings_str); + xfree(job_ptr->fed_details->origin_str); + } + + job_ptr->fed_details->siblings = fed_siblings; + job_ptr->fed_details->siblings_str = + fed_mgr_cluster_ids_to_names(fed_siblings); + job_ptr->fed_details->origin_str = + fed_mgr_get_cluster_name( + fed_mgr_get_cluster_id(job_ptr->job_id)); +} diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index 103f9a04140be7926637fd23b0ceee10421219e7..5b9be28c8fa7fb3eef73a0d545ff710f0580e229 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -75,6 +75,7 @@ #include "src/slurmctld/acct_policy.h" #include "src/slurmctld/agent.h" #include "src/slurmctld/burst_buffer.h" +#include "src/slurmctld/fed_mgr.h" #include "src/slurmctld/front_end.h" #include "src/slurmctld/job_scheduler.h" #include "src/slurmctld/licenses.h" @@ -278,6 +279,9 @@ static bool _job_runnable_test1(struct job_record *job_ptr, bool sched_plugin) if (!IS_JOB_PENDING(job_ptr) || IS_JOB_COMPLETING(job_ptr)) return false; + if (job_ptr->fed_details && fed_mgr_is_tracker_only_job(job_ptr)) + return false; + select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_CLEANING, &cleaning); @@ -3146,7 +3150,8 @@ static void _delayed_job_start_time(struct job_record *job_ptr) if (!IS_JOB_PENDING(job_q_ptr) || !job_q_ptr->details || (job_q_ptr->part_ptr != job_ptr->part_ptr) || (job_q_ptr->priority < job_ptr->priority) || - (job_q_ptr->job_id == job_ptr->job_id)) + (job_q_ptr->job_id == job_ptr->job_id) || + (fed_mgr_is_tracker_only_job(job_q_ptr))) continue; if (job_q_ptr->details->min_nodes == NO_VAL) job_size_nodes = 1; diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index f5b9a38bf4d3c6e3b7b7b710f1ac291071cb6a9f..6f9219d3d856465b0e82085142cd619486b15714 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -165,7 +165,7 @@ inline static void _slurm_rpc_job_sbcast_cred(slurm_msg_t * msg); inline static void 
_slurm_rpc_job_step_kill(slurm_msg_t * msg); inline static void _slurm_rpc_job_step_create(slurm_msg_t * msg); inline static void _slurm_rpc_job_step_get_info(slurm_msg_t * msg); -inline static void _slurm_rpc_job_will_run(slurm_msg_t * msg); +inline static void _slurm_rpc_job_will_run(slurm_msg_t * msg, bool allow_sibs); inline static void _slurm_rpc_job_alloc_info(slurm_msg_t * msg); inline static void _slurm_rpc_job_alloc_info_lite(slurm_msg_t * msg); inline static void _slurm_rpc_kill_job2(slurm_msg_t *msg); @@ -356,8 +356,61 @@ void slurmctld_req(slurm_msg_t *msg, connection_arg_t *arg) _slurm_rpc_job_step_get_info(msg); break; case REQUEST_JOB_WILL_RUN: - _slurm_rpc_job_will_run(msg); + _slurm_rpc_job_will_run(msg, true); break; + case REQUEST_SIB_JOB_WILL_RUN: + { + sib_msg_t *sib_msg = msg->data; + job_desc_msg_t *job_desc = sib_msg->data; + + msg->data = job_desc; + _slurm_rpc_job_will_run(msg, false); + msg->data = sib_msg; + + break; + } + case REQUEST_SIB_SUBMIT_BATCH_JOB: + { + uint16_t tmp_version = msg->protocol_version; + sib_msg_t *sib_msg = msg->data; + job_desc_msg_t *job_desc = sib_msg->data; + job_desc->job_id = sib_msg->job_id; + job_desc->fed_siblings = sib_msg->fed_siblings; + + /* set protocol version to that of the client's version so that + * the job's start_protocol_version is that of the client's and + * not the calling controllers. */ + msg->protocol_version = sib_msg->data_version; + msg->data = job_desc; + + _slurm_rpc_submit_batch_job(msg); + + msg->data = sib_msg; + msg->protocol_version = tmp_version; + + break; + } + case REQUEST_SIB_RESOURCE_ALLOCATION: + { + uint16_t tmp_version = msg->protocol_version; + sib_msg_t *sib_msg = msg->data; + job_desc_msg_t *job_desc = sib_msg->data; + job_desc->job_id = sib_msg->job_id; + job_desc->fed_siblings = sib_msg->fed_siblings; + + /* set protocol version to that of the client's version so that + * the job's start_protocol_version is that of the client's and + * not the calling controllers. 
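+		 * (sib_msg->data_version is the protocol version the origin
+		 * cluster received from the submitting client; see
+		 * sib_msg.data_version in _submit_sibling_jobs().) The
+		 * controller-to-controller version saved in tmp_version is
+		 * restored once the handler returns.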
*/ + msg->protocol_version = sib_msg->data_version; + msg->data = job_desc; + + _slurm_rpc_allocate_resources(msg); + + msg->data = sib_msg; + msg->protocol_version = tmp_version; + + break; + } case MESSAGE_NODE_REGISTRATION_STATUS: _slurm_rpc_node_registration(msg, 0); break; @@ -605,8 +658,20 @@ static void _throttle_fini(int *active_rpc_cnt) */ static void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr) { - char *licenses_used = get_licenses_used(); /* Do before config lock */ - slurm_ctl_conf_t *conf = slurm_conf_lock(); + slurm_ctl_conf_t *conf; + char *licenses_used; + uint32_t next_job_id; + slurmctld_lock_t job_write_lock = { + NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, READ_LOCK }; + + /* Do before config lock */ + licenses_used = get_licenses_used(); + + lock_slurmctld(job_write_lock); + next_job_id = get_next_job_id(true); + unlock_slurmctld(job_write_lock); + + conf = slurm_conf_lock(); memset(conf_ptr, 0, sizeof(slurm_ctl_conf_t)); @@ -736,7 +801,7 @@ static void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr) conf_ptr->msg_aggr_params = xstrdup(conf->msg_aggr_params); conf_ptr->msg_timeout = conf->msg_timeout; - conf_ptr->next_job_id = get_next_job_id(); + conf_ptr->next_job_id = next_job_id; conf_ptr->node_features_plugins = xstrdup(conf->node_features_plugins); conf_ptr->node_prefix = xstrdup(conf->node_prefix); @@ -993,10 +1058,10 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg) resource_allocation_response_msg_t alloc_msg; /* Locks: Read config, read job, read node, read partition */ slurmctld_lock_t job_read_lock = { - READ_LOCK, READ_LOCK, READ_LOCK, READ_LOCK, NO_LOCK }; + READ_LOCK, READ_LOCK, READ_LOCK, READ_LOCK, READ_LOCK }; /* Locks: Read config, write job, write node, read partition */ slurmctld_lock_t job_write_lock = { - READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK }; + READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; uid_t uid = g_slurm_auth_get_uid(msg->auth_cred, slurmctld_config.auth_info); int immediate = job_desc_msg->immediate; @@ -1056,24 +1121,60 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg) if (error_code == SLURM_SUCCESS) { do_unlock = true; _throttle_start(&active_rpc_cnt); - lock_slurmctld(job_write_lock); - error_code = job_allocate(job_desc_msg, immediate, - false, NULL, - true, uid, &job_ptr, - &err_msg, - msg->protocol_version); - /* unlock after finished using the job structure data */ + if (job_desc_msg->job_id == SLURM_BATCH_SCRIPT && + fed_mgr_is_active()) { + uint32_t job_id; + if (fed_mgr_job_allocate( + msg, job_desc_msg, true, + uid, + msg->protocol_version, + &job_id, &error_code, + &err_msg)) { + do_unlock = false; + _throttle_fini(&active_rpc_cnt); + reject_job = true; + } else { + /* fed_mgr_job_allocate grabs and + * releases job_write_lock on its own to + * prevent waiting/locking on siblings + * to reply. Now grab the lock and grab + * the jobid. */ + lock_slurmctld(job_write_lock); + if (!(job_ptr = + find_job_record(job_id))) { + error("%s: can't find fed job that was just created. 
this should never happen", + __func__); + reject_job = true; + error_code = SLURM_ERROR; + } + } + } else { + lock_slurmctld(job_write_lock); + + error_code = job_allocate( + job_desc_msg, immediate, false, + NULL, true, uid, &job_ptr, + &err_msg, + msg->protocol_version); + /* unlock after finished using the job structure + * data */ + + /* return result */ + if (!job_ptr || + (error_code && + job_ptr->job_state == JOB_FAILED)) + reject_job = true; + } END_TIMER2("_slurm_rpc_allocate_resources"); } - } else if (errno) - error_code = errno; - else - error_code = SLURM_ERROR; - - /* return result */ - if (!job_ptr || (error_code && job_ptr->job_state == JOB_FAILED)) + } else { reject_job = true; + if (errno) + error_code = errno; + else + error_code = SLURM_ERROR; + } if (!reject_job) { xassert(job_ptr); @@ -1161,6 +1262,7 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg) } slurm_msg_t_init(&response_msg); + response_msg.conn = msg->conn; response_msg.flags = msg->flags; response_msg.protocol_version = msg->protocol_version; response_msg.msg_type = RESPONSE_RESOURCE_ALLOCATION; @@ -2465,7 +2567,7 @@ static bool _is_valid_will_run_user(job_desc_msg_t *job_desc_msg, uid_t uid) /* _slurm_rpc_job_will_run - process RPC to determine if job with given * configuration can be initiated */ -static void _slurm_rpc_job_will_run(slurm_msg_t * msg) +static void _slurm_rpc_job_will_run(slurm_msg_t * msg, bool allow_sibs) { /* init */ DEF_TIMERS; @@ -2474,10 +2576,10 @@ static void _slurm_rpc_job_will_run(slurm_msg_t * msg) job_desc_msg_t *job_desc_msg = (job_desc_msg_t *) msg->data; /* Locks: Read config, read job, read node, read partition */ slurmctld_lock_t job_read_lock = { - READ_LOCK, READ_LOCK, READ_LOCK, READ_LOCK, NO_LOCK }; + READ_LOCK, READ_LOCK, READ_LOCK, READ_LOCK, READ_LOCK }; /* Locks: Write job, Write node, read partition */ slurmctld_lock_t job_write_lock = { - NO_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK }; + NO_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; uid_t uid = g_slurm_auth_get_uid(msg->auth_cred, slurmctld_config.auth_info); uint16_t port; /* dummy value */ @@ -2512,18 +2614,37 @@ static void _slurm_rpc_job_will_run(slurm_msg_t * msg) job_desc_msg->resp_host, 16); dump_job_desc(job_desc_msg); if (error_code == SLURM_SUCCESS) { - lock_slurmctld(job_write_lock); if (job_desc_msg->job_id == NO_VAL) { - error_code = job_allocate(job_desc_msg, false, - true, &resp, - true, uid, &job_ptr, - &err_msg, - msg->protocol_version); + if (allow_sibs && fed_mgr_is_active()) { + /* don't job_write lock here. fed_mgr + * locks around the job_allocate when + * doing a will_run to itself. */ + error_code = + fed_mgr_sib_will_run( + msg, job_desc_msg, uid, + &resp); + } else { + lock_slurmctld(job_write_lock); + + /* Get a job_id now without incrementing + * the job_id count. 
This prevents + * burning job_ids on will_runs */ + job_desc_msg->job_id = + get_next_job_id(true); + + error_code = job_allocate( + job_desc_msg, false, + true, &resp, true, uid, + &job_ptr, &err_msg, + msg->protocol_version); + unlock_slurmctld(job_write_lock); + } } else { /* existing job test */ + lock_slurmctld(job_write_lock); error_code = job_start_data(job_desc_msg, &resp); + unlock_slurmctld(job_write_lock); } - unlock_slurmctld(job_write_lock); END_TIMER2("_slurm_rpc_job_will_run"); } } else if (errno) @@ -3380,17 +3501,17 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg) static int active_rpc_cnt = 0; int error_code = SLURM_SUCCESS; DEF_TIMERS; - uint32_t step_id = 0; + uint32_t step_id = SLURM_BATCH_SCRIPT, job_id = 0; struct job_record *job_ptr = NULL; slurm_msg_t response_msg; submit_response_msg_t submit_msg; job_desc_msg_t *job_desc_msg = (job_desc_msg_t *) msg->data; /* Locks: Read config, read job, read node, read partition */ slurmctld_lock_t job_read_lock = { - READ_LOCK, READ_LOCK, READ_LOCK, READ_LOCK, NO_LOCK }; + READ_LOCK, READ_LOCK, READ_LOCK, READ_LOCK, READ_LOCK }; /* Locks: Write job, read node, read partition */ slurmctld_lock_t job_write_lock = { - NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK, NO_LOCK }; + NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK, READ_LOCK }; uid_t uid = g_slurm_auth_get_uid(msg->auth_cred, slurmctld_config.auth_info); char *err_msg = NULL; @@ -3416,6 +3537,8 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg) error("REQUEST_SUBMIT_BATCH_JOB lacks alloc_node from uid=%d", uid); } + dump_job_desc(job_desc_msg); + if (error_code == SLURM_SUCCESS) { /* Locks are for job_submit plugin use */ lock_slurmctld(job_read_lock); @@ -3423,23 +3546,32 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg) unlock_slurmctld(job_read_lock); } - dump_job_desc(job_desc_msg); - if (error_code == SLURM_SUCCESS) { - _throttle_start(&active_rpc_cnt); + if (error_code) { + reject_job = true; + goto send_msg; + } + + _throttle_start(&active_rpc_cnt); + if (job_desc_msg->job_id == SLURM_BATCH_SCRIPT && + fed_mgr_is_active()) { /* make sure it's not a submitted sib job. 
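+		 * A sibling submission arrives via
+		 * REQUEST_SIB_SUBMIT_BATCH_JOB with job_desc->job_id already
+		 * copied from sib_msg->job_id, so it fails this
+		 * SLURM_BATCH_SCRIPT test and takes the plain local-submission
+		 * path below instead of fanning out through
+		 * fed_mgr_job_allocate() again.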
*/ + + if (fed_mgr_job_allocate(msg, job_desc_msg, false, uid, + msg->protocol_version, &job_id, + &error_code, &err_msg)) + reject_job = true; + } else { lock_slurmctld(job_write_lock); START_TIMER; /* Restart after we have locks */ + if (job_desc_msg->job_id != SLURM_BATCH_SCRIPT) { job_ptr = find_job_record(job_desc_msg->job_id); if (job_ptr && IS_JOB_FINISHED(job_ptr)) { if (IS_JOB_COMPLETING(job_ptr)) { info("Attempt to re-use active " "job id %u", job_ptr->job_id); - slurm_send_rc_msg( - msg, - ESLURM_DUPLICATE_JOB_ID); - unlock_slurmctld(job_write_lock); - _throttle_fini(&active_rpc_cnt); - goto fini; + reject_job = true; + error_code = ESLURM_DUPLICATE_JOB_ID; + goto unlock; } job_ptr = NULL; /* OK to re-use job id */ } @@ -3455,10 +3587,9 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg) if (!validate_slurm_user(uid)) { info("Attempt to execute batch job step by " "uid=%d", uid); - slurm_send_rc_msg(msg, ESLURM_NO_STEPS); - unlock_slurmctld(job_write_lock); - _throttle_fini(&active_rpc_cnt); - goto fini; + error_code = ESLURM_NO_STEPS; + reject_job = true; + goto unlock; } #endif @@ -3468,63 +3599,56 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg) "by user %u", uid, job_ptr->job_id, job_ptr->user_id); - slurm_send_rc_msg(msg, ESLURM_USER_ID_MISSING); - unlock_slurmctld(job_write_lock); - _throttle_fini(&active_rpc_cnt); - goto fini; + error_code = ESLURM_USER_ID_MISSING; + reject_job = true; + goto unlock; } if (job_ptr->details && job_ptr->details->prolog_running) { - slurm_send_rc_msg(msg, EAGAIN); - unlock_slurmctld(job_write_lock); - _throttle_fini(&active_rpc_cnt); - goto fini; + error_code = EAGAIN; + reject_job = true; + goto unlock; } error_code = _launch_batch_step(job_desc_msg, uid, &step_id, msg->protocol_version); - unlock_slurmctld(job_write_lock); - _throttle_fini(&active_rpc_cnt); - END_TIMER2("_slurm_rpc_submit_batch_job"); - if (error_code != SLURM_SUCCESS) { info("_launch_batch_step: %s", slurm_strerror(error_code)); - slurm_send_rc_msg(msg, error_code); - } else { - info("_launch_batch_step StepId=%u.%u %s", - job_desc_msg->job_id, step_id, - TIME_STR); - submit_msg.job_id = job_desc_msg->job_id; - submit_msg.step_id = step_id; - submit_msg.error_code = error_code; - response_msg.msg_type = - RESPONSE_SUBMIT_BATCH_JOB; - - response_msg.data = &submit_msg; - slurm_send_node_msg(msg->conn_fd, - &response_msg); - schedule_job_save(); + reject_job = true; + goto unlock; } - goto fini; - } - /* Create new job allocation */ - error_code = job_allocate(job_desc_msg, - job_desc_msg->immediate, false, - NULL, 0, uid, &job_ptr, &err_msg, - msg->protocol_version); + job_id = job_desc_msg->job_id; + + info("_launch_batch_step StepId=%u.%u %s", + job_id, step_id, TIME_STR); + } else { + /* Create new job allocation */ + error_code = job_allocate(job_desc_msg, + job_desc_msg->immediate, + false, NULL, 0, uid, &job_ptr, + &err_msg, + msg->protocol_version); + if (!job_ptr || + (error_code && job_ptr->job_state == JOB_FAILED)) + reject_job = true; + else + job_id = job_ptr->job_id; + + if (job_desc_msg->immediate && + (error_code != SLURM_SUCCESS)) + error_code = ESLURM_CAN_NOT_START_IMMEDIATELY; + } +unlock: unlock_slurmctld(job_write_lock); - _throttle_fini(&active_rpc_cnt); - END_TIMER2("_slurm_rpc_submit_batch_job"); - if (job_desc_msg->immediate && (error_code != SLURM_SUCCESS)) - error_code = ESLURM_CAN_NOT_START_IMMEDIATELY; } - /* return result */ - if (!job_ptr || (error_code && job_ptr->job_state == JOB_FAILED)) - reject_job = true; + 
_throttle_fini(&active_rpc_cnt); + +send_msg: + END_TIMER2("_slurm_rpc_submit_batch_job"); if (reject_job) { info("_slurm_rpc_submit_batch_job: %s", @@ -3535,21 +3659,23 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg) slurm_send_rc_msg(msg, error_code); } else { info("_slurm_rpc_submit_batch_job JobId=%u %s", - job_ptr->job_id, TIME_STR); + job_id, TIME_STR); /* send job_ID */ - submit_msg.job_id = job_ptr->job_id; - submit_msg.step_id = SLURM_BATCH_SCRIPT; + submit_msg.job_id = job_id; + submit_msg.step_id = step_id; submit_msg.error_code = error_code; response_msg.msg_type = RESPONSE_SUBMIT_BATCH_JOB; response_msg.data = &submit_msg; slurm_send_node_msg(msg->conn_fd, &response_msg); schedule_job_save(); /* Has own locks */ - schedule_node_save(); /* Has own locks */ - queue_job_scheduler(); + if (step_id == SLURM_BATCH_SCRIPT) { + schedule_node_save(); /* Has own locks */ + queue_job_scheduler(); + } } -fini: xfree(err_msg); + xfree(err_msg); } /* _slurm_rpc_update_job - process RPC to update the configuration of a diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 14f99303c75fe814a471e808ecaa3dd66ccf38dc..ff36a96570a8eaf0bfbf3f7f0291ca66ac3e7bef 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -553,10 +553,10 @@ typedef struct { uint16_t *tres; } acct_policy_limit_set_t; - typedef struct { - bitstr_t *siblings; /* bitmap of sibling cluster ids where - * sibling jobs exist */ + char *origin_str; /* origin cluster name */ + uint64_t siblings; /* bitmap of sibling cluster ids */ + char *siblings_str; /* comma separated list of sibling names */ } job_fed_details_t; /* @@ -1108,10 +1108,12 @@ extern char **get_job_env (struct job_record *job_ptr, uint32_t *env_size); extern char *get_job_script (struct job_record *job_ptr); /* - * get_next_job_id - return the job_id to be used by default for - * the next job + * Return the next available job_id to be used. + * IN test_only - if true, doesn't advance the job_id sequence, just returns + * what the next job id will be. + * RET a valid job_id or SLURM_ERROR if all job_ids are exhausted. */ -extern uint32_t get_next_job_id(void); +extern uint32_t get_next_job_id(bool test_only); /* * get_part_list - find record for named partition(s) @@ -2411,4 +2413,9 @@ waitpid_timeout(const char *, pid_t, int *, int); */ extern void set_partition_tres(); +/* + * Set job's siblings and make sibling strings + */ +extern void set_job_fed_details(struct job_record *job_ptr, + uint64_t fed_siblings); #endif /* !_HAVE_SLURMCTLD_H */ diff --git a/src/smap/opts.c b/src/smap/opts.c index b3ea19fed1d0a6b2902b41c9cbedd3a98065f79f..337005224d74d28d4f992ddebe83972406741147 100644 --- a/src/smap/opts.c +++ b/src/smap/opts.c @@ -335,6 +335,7 @@ Usage: smap [OPTIONS]\n\ -M, --cluster=cluster_name cluster to issue commands to. Default is\n\ current cluster. cluster with no name will\n\ reset to default.\n\ + NOTE: SlurmDBD must be up.\n\ -n, --nodes=[nodes] only show objects with these nodes.\n\ If querying to the ionode level use the -I\n\ option in conjunction with this option.\n\ diff --git a/src/sprio/opts.c b/src/sprio/opts.c index aef26c8ee30683ecc9ff162bc6bb54e02f44a10f..8811e3b7607b34a7ae35a45e559d2ff7c2ddd23e 100644 --- a/src/sprio/opts.c +++ b/src/sprio/opts.c @@ -489,6 +489,7 @@ Usage: sprio [OPTIONS]\n\ -M, --cluster=cluster_name cluster to issue commands to. Default is\n\ current cluster. 
cluster with no name will\n\ reset to default.\n\ + NOTE: SlurmDBD must be up.\n\ -n, --norm display normalized values\n\ -o, --format=format format specification\n\ -u, --user=user_name comma separated list of users to view\n\ diff --git a/src/squeue/opts.c b/src/squeue/opts.c index d115a0032e553c4a866f9f87a3e172d11b89d623..ae7e4de028ef49fa5d9fbe366445cf132cfc195d 100644 --- a/src/squeue/opts.c +++ b/src/squeue/opts.c @@ -60,6 +60,7 @@ #define OPT_LONG_HIDE 0x102 #define OPT_LONG_START 0x103 #define OPT_LONG_NOCONVERT 0x104 +#define OPT_LONG_FEDTRACK 0x105 /* FUNCTIONS */ static List _build_job_list( char* str ); @@ -94,6 +95,7 @@ parse_command_line( int argc, char* argv[] ) {"accounts", required_argument, 0, 'A'}, {"all", no_argument, 0, 'a'}, {"array", no_argument, 0, 'r'}, + {"fedtrack", no_argument, 0, OPT_LONG_FEDTRACK}, {"Format", required_argument, 0, 'O'}, {"format", required_argument, 0, 'o'}, {"help", no_argument, 0, OPT_LONG_HELP}, @@ -292,6 +294,9 @@ parse_command_line( int argc, char* argv[] ) exit(1); } break; + case OPT_LONG_FEDTRACK: + params.show_fedtrack = true; + break; case OPT_LONG_HELP: _help(); exit(0); @@ -1371,6 +1376,28 @@ extern int parse_long_format( char* format_long ) field_size, right_justify, suffix ); + else if (!xstrcasecmp(token, "fedorigin")) + job_format_add_fed_origin(params.format_list, + field_size, + right_justify, + suffix ); + else if (!xstrcasecmp(token, "fedoriginraw")) + job_format_add_fed_origin_raw( + params.format_list, + field_size, + right_justify, + suffix ); + else if (!xstrcasecmp(token, "fedsiblings")) + job_format_add_fed_siblings(params.format_list, + field_size, + right_justify, + suffix ); + else if (!xstrcasecmp(token, "fedsiblingsraw")) + job_format_add_fed_siblings_raw( + params.format_list, + field_size, + right_justify, + suffix ); else if (!xstrcasecmp(token, "maxcpus")) job_format_add_max_cpus(params.format_list, field_size, @@ -1952,6 +1979,7 @@ Usage: squeue [OPTIONS]\n\ --noconvert don't convert units from their original type\n\ (e.g. 
2048M won't be converted to 2G).\n\ -o, --format=format format specification\n\ + -O, --Format=format format specification\n\ -p, --partition=partition(s) comma separated list of partitions\n\ to view, default is all partitions\n\ -q, --qos=qos(s) comma separated list of qos's\n\ diff --git a/src/squeue/print.c b/src/squeue/print.c index 39780418d0a6b02991bc5941fbfc1cad1e130912..47f8183f0faf37457c5db1c689eecaa64fd8c683 100644 --- a/src/squeue/print.c +++ b/src/squeue/print.c @@ -1652,6 +1652,87 @@ int _print_job_exit_code(job_info_t * job, int width, bool right_justify, return SLURM_SUCCESS; } +int _print_job_fed_origin(job_info_t * job, int width, bool right_justify, + char* suffix) +{ + if (job == NULL) + _print_str("FED_ORIGIN", width, right_justify, true); + else { + if (job->fed_origin_str) + _print_str(job->fed_origin_str, width, right_justify, + true); + else + _print_str("NA", width, right_justify, true); + } + + if (suffix) + printf("%s", suffix); + return SLURM_SUCCESS; +} + +int _print_job_fed_origin_raw(job_info_t * job, int width, bool right_justify, + char* suffix) +{ + if (job == NULL) + _print_str("FED_ORIGIN_RAW", width, right_justify, true); + else { + int id = job->job_id >> 26; + if (id) + _print_int(id, width, right_justify, true); + else + _print_str("NA", width, right_justify, true); + } + + if (suffix) + printf("%s", suffix); + return SLURM_SUCCESS; +} + +int _print_job_fed_siblings(job_info_t * job, int width, bool right_justify, + char* suffix) +{ + if (job == NULL) + _print_str("FED_SIBLINGS", width, right_justify, true); + else { + if (job->fed_siblings_str) + _print_str(job->fed_siblings_str, width, right_justify, + true); + else + _print_str("NA", width, right_justify, true); + } + + if (suffix) + printf("%s", suffix); + return SLURM_SUCCESS; +} + +int _print_job_fed_siblings_raw(job_info_t * job, int width, bool right_justify, + char* suffix) +{ + if (job == NULL) + _print_str("FED_SIBLINGS_RAW", width, right_justify, true); + else { + int bit = 1; + char *ids = NULL; + uint64_t tmp_sibs = job->fed_siblings; + while (tmp_sibs) { + if (tmp_sibs & 1) + xstrfmtcat(ids, "%s%d", (ids) ? 
"," : "", bit); + + tmp_sibs >>= 1; + bit++; + } + if (ids) + _print_str(ids, width, right_justify, true); + else + _print_str("NA", width, right_justify, true); + } + + if (suffix) + printf("%s", suffix); + return SLURM_SUCCESS; +} + int _print_job_max_cpus(job_info_t * job, int width, bool right_justify, char* suffix) { diff --git a/src/squeue/print.h b/src/squeue/print.h index 92bb85df81cfaf0f4a0b35f3e9ef7d97b1e40874..4df2e0a35c71ab1f17dc1681be09ae65ff91f907 100644 --- a/src/squeue/print.h +++ b/src/squeue/print.h @@ -236,6 +236,16 @@ int job_format_add_function(List list, int width, bool right_justify, _print_job_eligible_time) #define job_format_add_exit_code(list,wid,right,suffix) \ job_format_add_function(list,wid,right,suffix,_print_job_exit_code) +#define job_format_add_fed_origin(list,wid,right,suffix) \ + job_format_add_function(list,wid,right,suffix, _print_job_fed_origin) +#define job_format_add_fed_origin_raw(list,wid,right,suffix) \ + job_format_add_function(list,wid,right,suffix, \ + _print_job_fed_origin_raw) +#define job_format_add_fed_siblings(list,wid,right,suffix) \ + job_format_add_function(list,wid,right,suffix, _print_job_fed_siblings) +#define job_format_add_fed_siblings_raw(list,wid,right,suffix) \ + job_format_add_function(list,wid,right,suffix, \ + _print_job_fed_siblings_raw) #define job_format_add_max_cpus(list,wid,right,suffix) \ job_format_add_function(list,wid,right,suffix,_print_job_max_cpus) #define job_format_add_max_nodes(list,wid,right,suffix) \ @@ -433,6 +443,14 @@ int _print_job_eligible_time(job_info_t * job, int width, bool right_justify, char* suffix); int _print_job_exit_code(job_info_t * job, int width, bool right_justify, char* suffix); +int _print_job_fed_origin(job_info_t * job, int width, bool right_justify, + char* suffix); +int _print_job_fed_origin_raw(job_info_t * job, int width, bool right_justify, + char* suffix); +int _print_job_fed_siblings(job_info_t * job, int width, bool right_justify, + char* suffix); +int _print_job_fed_siblings_raw(job_info_t * job, int width, bool right_justify, + char* suffix); int _print_job_max_cpus(job_info_t * job, int width, bool right_justify, char* suffix); int _print_job_max_nodes(job_info_t * job, int width, bool right_justify, diff --git a/src/squeue/squeue.c b/src/squeue/squeue.c index fd4e3a8c60c4835212bafb604fe93345e1602f84..990feb8fafc8cb7c74c8848c91aa15904ea3abd8 100644 --- a/src/squeue/squeue.c +++ b/src/squeue/squeue.c @@ -175,6 +175,9 @@ _print_job ( bool clear_old ) if (params.all_flag || (params.job_list && list_count(params.job_list))) show_flags |= SHOW_ALL; + if (params.show_fedtrack) + show_flags |= SHOW_FED_TRACK; + /* We require detail data when CPUs are requested */ if (params.format && strstr(params.format, "C")) show_flags |= SHOW_DETAIL; diff --git a/src/squeue/squeue.h b/src/squeue/squeue.h index 5901d047ca6481df2b4a96eebcdcf21246103fc8..d2dc6ac3f688b04636d623db972c99d9c9b722db 100644 --- a/src/squeue/squeue.h +++ b/src/squeue/squeue.h @@ -69,6 +69,7 @@ struct squeue_parameters { bool array_flag; int iterate; bool job_flag; + bool show_fedtrack; bool start_flag; bool step_flag; bool long_format; diff --git a/src/srun/libsrun/allocate.c b/src/srun/libsrun/allocate.c index d409eb98658c42f838395f6c18d7cd57a5151bc4..45db52f08cc858ecad47a9cbfdf0b8d9a97928e8 100644 --- a/src/srun/libsrun/allocate.c +++ b/src/srun/libsrun/allocate.c @@ -50,6 +50,7 @@ #include "src/common/forward.h" #include "src/common/log.h" #include "src/common/macros.h" +#include "src/common/proc_args.h" #include 
"src/common/slurm_auth.h" #include "src/common/slurm_protocol_api.h" #include "src/common/slurm_time.h" @@ -869,6 +870,16 @@ job_desc_msg_create_from_opts (void) if (opt.mcs_label) j->mcs_label = opt.mcs_label; + /* If can run on multiple clusters find the earliest run time + * and run it there */ + j->clusters = xstrdup(opt.clusters); + if (opt.clusters && + slurmdb_get_first_avail_cluster(j, opt.clusters, + &working_cluster_rec) != SLURM_SUCCESS) { + print_db_notok(opt.clusters, 0); + exit(error_exit); + } + return j; } diff --git a/src/srun/libsrun/opt.c b/src/srun/libsrun/opt.c index f60f7831ceda7eae2003aee56c3fff3be3ab964c..7c5a801f9f69d69f5a34aee073cf04d00290d797 100644 --- a/src/srun/libsrun/opt.c +++ b/src/srun/libsrun/opt.c @@ -394,6 +394,7 @@ static void _opt_default(void) opt.cwd = xstrdup(buf); opt.cwd_set = false; + opt.clusters = NULL; opt.progname = NULL; opt.ntasks = 1; @@ -579,6 +580,7 @@ env_vars_t env_vars[] = { {"SLURM_BCAST", OPT_BCAST, NULL, NULL }, {"SLURM_BLRTS_IMAGE", OPT_STRING, &opt.blrtsimage, NULL }, {"SLURM_BURST_BUFFER", OPT_STRING, &opt.burst_buffer, NULL }, +{"SLURM_CLUSTERS", OPT_STRING, &opt.clusters, NULL }, {"SLURM_CHECKPOINT", OPT_STRING, &opt.ckpt_interval_str, NULL }, {"SLURM_CHECKPOINT_DIR",OPT_STRING, &opt.ckpt_dir, NULL }, {"SLURM_CNLOAD_IMAGE", OPT_STRING, &opt.linuximage, NULL }, @@ -935,6 +937,8 @@ static void _set_options(const int argc, char **argv) {"kill-on-bad-exit", optional_argument, 0, 'K'}, {"label", no_argument, 0, 'l'}, {"licenses", required_argument, 0, 'L'}, + {"cluster", required_argument, 0, 'M'}, + {"clusters", required_argument, 0, 'M'}, {"distribution", required_argument, 0, 'm'}, {"ntasks", required_argument, 0, 'n'}, {"nodes", required_argument, 0, 'N'}, @@ -1045,7 +1049,7 @@ static void _set_options(const int argc, char **argv) {"wckey", required_argument, 0, LONG_OPT_WCKEY}, {NULL, 0, 0, 0} }; - char *opt_string = "+A:B:c:C:d:D:e:Eg:hHi:I::jJ:kK::lL:m:n:N:" + char *opt_string = "+A:B:c:C:d:D:e:Eg:hHi:I::jJ:kK::lL:m:M:n:N:" "o:Op:P:qQr:RsS:t:T:uU:vVw:W:x:XZ"; char *pos_delimit; bool ntasks_set_opt = false; @@ -1185,6 +1189,10 @@ static void _set_options(const int argc, char **argv) xfree(opt.licenses); opt.licenses = xstrdup(optarg); break; + case 'M': + xfree(opt.clusters); + opt.clusters = xstrdup(optarg); + break; case (int)'m': opt.distribution = verify_dist_type(optarg, &opt.plane_size); @@ -2742,7 +2750,7 @@ static void _usage(void) " [--oversubscribe] [--label] [--unbuffered] [-m dist] [-J jobname]\n" " [--jobid=id] [--verbose] [--slurmd_debug=#] [--gres=list]\n" " [-T threads] [-W sec] [--checkpoint=time] [--gres-flags=opts]\n" -" [--checkpoint-dir=dir] [--licenses=names]\n" +" [--checkpoint-dir=dir] [--licenses=names] [--clusters=cluster_names]\n" " [--restart-dir=dir] [--qos=qos] [--time-min=minutes]\n" " [--contiguous] [--mincpus=n] [--mem=MB] [--tmp=MB] [-C list]\n" " [--mpi=type] [--account=name] [--dependency=type:jobid]\n" @@ -2819,10 +2827,14 @@ static void _help(void) " -K, --kill-on-bad-exit kill the job if any task terminates with a\n" " non-zero exit code\n" " -l, --label prepend task number to lines of stdout/err\n" -" -L, --licenses=names required license, comma separated\n" " --launch-cmd print external launcher command line if not SLURM\n" " --launcher-opts= options for the external launcher command if not\n" " SLURM\n" +" -L, --licenses=names required license, comma separated\n" +" -M, --clusters=names Comma separated list of clusters to issue\n" +" commands to. 
Default is current cluster.\n" +" Name of 'all' will submit to run on all clusters.\n" +" NOTE: SlurmDBD must up.\n" " -m, --distribution=type distribution method for processes to nodes\n" " (type = block|cyclic|arbitrary)\n" " --mail-type=type notify on state change: BEGIN, END, FAIL or ALL\n" diff --git a/src/srun/libsrun/opt.h b/src/srun/libsrun/opt.h index 1380867a8726dbd69c877bf1ce5b4b287eab8398..1c8e9aa0fe6df4e0794992fc5e20dc94ca301d1a 100644 --- a/src/srun/libsrun/opt.h +++ b/src/srun/libsrun/opt.h @@ -66,7 +66,7 @@ extern int _verbose; extern enum modes mode; typedef struct srun_options { - + char *clusters; /* cluster to run this on. */ char *progname; /* argv[0] of this program or * configuration file if multi_prog */ bool multi_prog; /* multiple programs to execute */ diff --git a/src/sshare/sshare.c b/src/sshare/sshare.c index 6a8dc3c7bc63441ce7156457440c7367478a9cde..24926ce167b8f1734f93d037f57f2b3e81b8425a 100644 --- a/src/sshare/sshare.c +++ b/src/sshare/sshare.c @@ -456,9 +456,8 @@ Usage: sshare [OPTION] \n\ with the '--format' option \n\ -l or --long include normalized usage in output \n\ -m or --partition print the partition part of the association \n\ - -M or --cluster=name cluster to issue commands to. Default is \n\ - current cluster. cluster with no name will \n\ - reset to default. \n\ + -M or --cluster=names clusters to issue commands to. \n\ + NOTE: SlurmDBD must be up. \n\ -n or --noheader omit header from output \n\ -o or --format= Comma separated list of fields. (use \n\ (\"--helpformat\" for a list of available fields).\n\ diff --git a/src/strigger/opts.c b/src/strigger/opts.c index 81236614cfdafcf247f93a553de8536120b15b28..979258b4c69476a152b9424546affc0dedea4204 100644 --- a/src/strigger/opts.c +++ b/src/strigger/opts.c @@ -536,6 +536,7 @@ Usage: strigger [--set | --get | --clear] [OPTIONS]\n\ -M, --cluster=name cluster to issue commands to. Default is\n\ current cluster. cluster with no name will\n\ reset to default.\n\ + NOTE: SlurmDBD must up.\n\ -n, --node[=host] trigger related to specific node, all nodes by default\n\ -N, --noheader Do not print the message header\n\ -o, --offset=# trigger's offset time from event, negative to precede\n\ diff --git a/testsuite/expect/Makefile.am b/testsuite/expect/Makefile.am index 7d4adb5e8698741df8296f0a41e0aec75c64c2dd..e970e8fc33b68e997fc896e2208d38b0fc605c9a 100644 --- a/testsuite/expect/Makefile.am +++ b/testsuite/expect/Makefile.am @@ -614,7 +614,8 @@ EXTRA_DIST = \ test36.4 \ test37.1 \ test37.2 \ - test37.3 + test37.3 \ + test37.4 distclean-local: rm -rf *error *output diff --git a/testsuite/expect/Makefile.in b/testsuite/expect/Makefile.in index 1cc42cb20a865b86bae79f34a4358735fe4ce6d0..cfcdc093d505ba87c4010eaf5a7f13b37c364c24 100644 --- a/testsuite/expect/Makefile.in +++ b/testsuite/expect/Makefile.in @@ -1028,7 +1028,8 @@ EXTRA_DIST = \ test36.4 \ test37.1 \ test37.2 \ - test37.3 + test37.3 \ + test37.4 all: all-am diff --git a/testsuite/expect/README b/testsuite/expect/README index 4bc34d8e3a1ac93bfe83ffa9d4615d3751e0f464..0380564e49106bea3bb00c136e69b7ffc85a30a8 100644 --- a/testsuite/expect/README +++ b/testsuite/expect/README @@ -804,3 +804,4 @@ test37.# Testing of federations. test37.1 sacctmgr operations on clusters and federations. test37.2 Validate federated clusters return federated job ids. 
test37.3 scontrol show federations +test37.4 federated job submission diff --git a/testsuite/expect/globals_federation b/testsuite/expect/globals_federation index d27b9c6a5a5a2cd2a051cd5aa21006e631ace1fb..50fe0d72a3326104bd502c56334db86c3efb1d5c 100644 --- a/testsuite/expect/globals_federation +++ b/testsuite/expect/globals_federation @@ -59,45 +59,77 @@ proc test_federation_setup { } { proc setup_federation { fed_name } { global sacctmgr fedc1 fedc2 fedc3 eol set rc 0 - set my_pid [spawn $sacctmgr -i add federation $fed_name cluster=$fedc1,$fedc2,$fedc3] + + set my_pid [spawn $sacctmgr -i add federation $fed_name] set matches 0 expect { -re "Adding Federation\\(s\\)$eol" { incr matches - exp_continue + exp_continue } -re "$fed_name$eol" { incr matches - exp_continue - } - -re "Settings$eol" { - incr matches - exp_continue - } - -re "\\s+Cluster\\s+=\\s+$fedc1$eol" { - incr matches - exp_continue - } - -re "\\s+Cluster\\s+=\\s+$fedc2$eol" { - incr matches - exp_continue - } - -re "\\s+Cluster\\s+=\\s+$fedc3$eol" { - incr matches - exp_continue + exp_continue } timeout { send_user "\nFAILURE: sacctmgr add not responding\n" - slow_kill $my_pid - set rc 1 + slow_kill $my_pid + set rc 1 } eof { wait } } - if {!$rc && $matches != 6} { + if {!$rc && $matches != 2} { send_user "$matches FAILURE: failed to create federation.\n" set rc 1 + return $rc + } + + set count 0 + foreach cluster [list $fedc1 $fedc2 $fedc3] { + incr count + set my_pid [spawn $sacctmgr -i mod cluster $cluster set federation=$fed_name weight=1] + set matches 0 + expect { + -re "Setting$eol" { + incr matches + exp_continue + } + -re "^\\s+Federation\\s+=\\s+$fed_name$eol" { + incr matches + exp_continue + } + -re "^\\s+Weight\\s+=\\s+1$eol" { + incr matches + exp_continue + } + -re "Modified cluster...$eol" { + incr matches + exp_continue + } + -re "^\\s+$cluster$eol" { + incr matches + exp_continue + } + timeout { + send_user "\nFAILURE: sacctmgr add not responding\n" + slow_kill $my_pid + set rc 1 + } + eof { + wait + } + } + if {!$rc && $matches != 5} { + send_user "$matches FAILURE: failed to add $cluster to federation.\n" + set rc 1 + break; + } + + if {$count > 1} { + sleep 5; + } } return $rc } @@ -311,3 +343,90 @@ proc remove_cluster_from_fed {cname fed_name} { return $rc } + + +proc modify_federation_flags {fed_name mode flags} { + global sacctmgr eol + set matches 0 + set my_pid [spawn $sacctmgr -i modify federation $fed_name set flags$mode$flags] + expect { + -re "Setting$eol" { + incr matches + exp_continue + } + -re "^\\s+Flags\\s+\\$mode\\s+$flags$eol" { + incr matches + exp_continue + } + -re "^\\s+Modified federation...$eol" { + incr matches + exp_continue + } + -re "^\\s+$fed_name$eol" { + incr matches + exp_continue + } + timeout { + send_user "\nFAILURE: sacctmgr add not responding\n" + slow_kill $my_pid + end_it 1 + } + eof { + wait + } + } + if {$matches != 4} { + send_user "$matches FAILURE: unexpected error.\n" + end_it 1 + } +} + +proc modify_cluster_weight {cname weight} { + global sacctmgr eol + set matches 0 + set my_pid [spawn $sacctmgr -i mod cluster $cname set weight=$weight] + expect { + -re "Setting$eol" { + incr matches + exp_continue + } + -re "^\\s+Weight\\s+=\\s+$weight$eol" { + incr matches + exp_continue + } + -re "Modified cluster...$eol" { + incr matches + exp_continue + } + -re "^\\s+$cname$eol" { + incr matches + exp_continue + } + timeout { + send_user "\nFAILURE: sacctmgr add not responding\n" + slow_kill $my_pid + end_it 1 + } + eof { + wait + } + } + if {$matches != 4} { + 
send_user "$matches FAILURE: failed to set weight for $cname\n" + end_it 1 + } +} + + +proc log_error {msg} { + send_user "\nFAILURE: $msg\n" +} + +proc log_warn {msg} { + send_user "\nWARNING: $msg\n" +} + +proc log_info {msg} { + send_user "INFO: $msg\n" +} + diff --git a/testsuite/expect/test1.43 b/testsuite/expect/test1.43 index 50177be27739c005bfbe3d551bbf4f92999c38bf..de6932260fe04e934033291ed2aba87fe0a6b667 100755 --- a/testsuite/expect/test1.43 +++ b/testsuite/expect/test1.43 @@ -58,7 +58,7 @@ for {set node_cnt 1} {$node_cnt > 0} {set node_cnt [expr $node_cnt * 2]} { incr jobs_run exp_continue } - -re "allocation failure" { + -re "allocation failure:.*?\r\n" { set alloc_fail 1 set node_cnt 0 exp_continue diff --git a/testsuite/expect/test17.36 b/testsuite/expect/test17.36 index b9cf62dac3fd6020d8275fa195d39eaed6cb6aa8..d6e470754068bf0f54b9850d0b83afa8c58a1348 100755 --- a/testsuite/expect/test17.36 +++ b/testsuite/expect/test17.36 @@ -350,7 +350,7 @@ cancel_job $job_id # Test partition with oversubscribe=NO # #################################### -send_user "\n\nTest partition with overoubscribe=NO\n" +send_user "\n\nTest partition with oversubscribe=NO\n" # Determine the number of cores or CPUs set num_jobs [cr_core_cpu $node_name] diff --git a/testsuite/expect/test37.1 b/testsuite/expect/test37.1 index 97e29835bc88835b7eb8b62ede07460315835b07..84342fde3f71691461a861c2e429cbb8bd13f7d6 100755 --- a/testsuite/expect/test37.1 +++ b/testsuite/expect/test37.1 @@ -2617,137 +2617,143 @@ expect { } expect -re $ -for {set i 1} {$i <= $max_federations} {incr i} { - set matches 0 - set tmpc "max${i}_$test_id_2" - set my_pid [spawn $sacctmgr -i add cluster $tmpc federation=$fed1] - if {$i < $max_federations} { - expect { - -re "Adding Cluster\\(s\\)$eol" { - incr matches - exp_continue - } - -re "\\s+$tmpc$eol" { - incr matches - exp_continue - } - -re "\\s+Setting$eol" { - incr matches - exp_continue - } - -re "\\s+Federation\\s+=\\s+$fed1$eol" { - incr matches - exp_continue - } - timeout { - send_user "\nFAILURE: sacctmgr add not responding\n" - slow_kill $my_pid - set exit_code 1 - } - eof { - wait - } - } - if {$exit_code || $matches != 4} { - send_user "$matches FAILURE: unexpected error.\n" - end_it 1 - } - } else { - expect { - -re "Adding Cluster\\(s\\)$eol" { - incr matches - exp_continue - } - -re "\\s+$tmpc$eol" { - incr matches - exp_continue - } - -re "\\s+Setting$eol" { - incr matches - exp_continue - } - -re "\\s+Federation\\s+=\\s+$fed1$eol" { - incr matches - exp_continue - } - -re "\\s+Problem adding clusters: Too many clusters in federation?" 
{ - incr matches - exp_continue - } - timeout { - send_user "\nFAILURE: sacctmgr add not responding\n" - slow_kill $my_pid - set exit_code 1 - } - eof { - wait - } - } - if {$exit_code || $matches != 5} { - send_user "$matches FAILURE: unexpected error.\n" - end_it 1 - } +set matches 0 +set tmp_clusters "" +for {set i 1} {$i < $max_federations} {incr i} { + if {$i > 1} { + append tmp_clusters "," + } + append tmp_clusters "max${i}_$test_id_2" +} +set timeout 300 +set my_pid [spawn $sacctmgr -i add cluster $tmp_clusters federation=$fed1] +expect { + -re "Adding Cluster\\(s\\)$eol" { + incr matches + exp_continue + } + -re "\\s+max\[1-6\]{0,1}\\d{1}_$test_id_2$eol" { + incr matches + exp_continue + } + -re "\\s+Setting$eol" { + incr matches + exp_continue + } + -re "\\s+Federation\\s+=\\s+$fed1$eol" { + incr matches + exp_continue + } + timeout { + send_user "\nFAILURE: sacctmgr add not responding\n" + slow_kill $my_pid + set exit_code 1 + } + eof { + wait + } +} +if {$exit_code || $matches != 66} { + send_user "$matches FAILURE: unexpected error.\n" + end_it 1 +} - set matches 0 - ##################################### - # TEST: modify cluster to exceed max clusters in federation - ##################################### - #add last cluster without federation - set my_pid [spawn $sacctmgr -i add cluster $tmpc] - expect { - -re "Adding Cluster\\(s\\)$eol" { - incr matches - exp_continue - } - -re "\\s+$tmpc$eol" { - incr matches - exp_continue - } - timeout { - send_user "\nFAILURE: sacctmgr add not responding\n" - slow_kill $my_pid - set exit_code 1 - } - eof { - wait - } - } - if {$exit_code || $matches != 2} { - send_user "$matches FAILURE: unexpected error.\n" - end_it 1 - } +set matches 0 +set tmpc "max${i}_$test_id_2" +set my_pid [spawn $sacctmgr -i add cluster $tmpc federation=$fed1] +expect { + -re "Adding Cluster\\(s\\)$eol" { + incr matches + exp_continue + } + -re "\\s+$tmpc$eol" { + incr matches + exp_continue + } + -re "\\s+Setting$eol" { + incr matches + exp_continue + } + -re "\\s+Federation\\s+=\\s+$fed1$eol" { + incr matches + exp_continue + } + -re "\\s+Problem adding clusters: Too many clusters in federation?" 
{ + incr matches + exp_continue + } + timeout { + send_user "\nFAILURE: sacctmgr add not responding\n" + slow_kill $my_pid + set exit_code 1 + } + eof { + wait + } +} +if {$exit_code || $matches != 5} { + send_user "$matches FAILURE: unexpected error.\n" + end_it 1 +} - set matches 0 - set my_pid [spawn $sacctmgr -i modify cluster $tmpc set federation=$fed1] - expect { - -re "Setting$eol" { - incr matches - exp_continue - } - -re "^\\s+Federation\\s+=\\s+$fed1$eol" { - incr matches - exp_continue - } - -re "sacctmgr: error: Too many clusters in federation$eol" { - incr matches - exp_continue - } - timeout { - send_user "\nFAILURE: sacctmgr add not responding\n" - slow_kill $my_pid - set exit_code 1 - } - eof { - wait - } - } - if {$exit_code || $matches != 3} { - send_user "$matches FAILURE: unexpected error.\n" - end_it 1 - } +set matches 0 +##################################### +# TEST: modify cluster to exceed max clusters in federation +##################################### +#add last cluster without federation +set my_pid [spawn $sacctmgr -i add cluster $tmpc] +expect { + -re "Adding Cluster\\(s\\)$eol" { + incr matches + exp_continue + } + -re "\\s+$tmpc$eol" { + incr matches + exp_continue } + timeout { + send_user "\nFAILURE: sacctmgr add not responding\n" + slow_kill $my_pid + set exit_code 1 + } + eof { + wait + } +} +if {$exit_code || $matches != 2} { + send_user "$matches FAILURE: unexpected error.\n" + end_it 1 +} +set matches 0 +set my_pid [spawn $sacctmgr -i modify cluster $tmpc set federation=$fed1] +expect { + -re "Setting$eol" { + incr matches + exp_continue + } + -re "^\\s+Federation\\s+=\\s+$fed1$eol" { + incr matches + exp_continue + } + -re "sacctmgr: error: Too many clusters in federation$eol" { + incr matches + exp_continue + } + timeout { + send_user "\nFAILURE: sacctmgr add not responding\n" + slow_kill $my_pid + set exit_code 1 + } + eof { + wait + } } +if {$exit_code || $matches != 3} { + send_user "$matches FAILURE: unexpected error.\n" + end_it 1 +} + set matches 0 set my_pid [spawn $sacctmgr show federation $fed1 format="federation%20,cluster%20"] expect { diff --git a/testsuite/expect/test37.2 b/testsuite/expect/test37.2 index e1939c745d26aba77450c488fd58e89cc9a276d6..80bf10b98cf515ff120e3c6d2a0c450c91bc3f79 100755 --- a/testsuite/expect/test37.2 +++ b/testsuite/expect/test37.2 @@ -124,7 +124,7 @@ proc test_fed_job_id { cname cid} { set clust_id [expr $job_id >> 26] send_user "Fed JobID:$job_id Local JobID:$local_id Cluster ID:$clust_id\n" - if {$clust_id != $cid} { + if {!$rc && ($clust_id != $cid)} { send_user "\nFAILURE: jobid($job_id) from $cname didn't give\ correct partition id ($part_id != $cid)\n" incr rc diff --git a/testsuite/expect/test37.3 b/testsuite/expect/test37.3 index df1e88402e514b7107d12ab27d8cd97cf127b883..2a5daef8823e76d74311cb5ec74b0d29ac719e4b 100755 --- a/testsuite/expect/test37.3 +++ b/testsuite/expect/test37.3 @@ -119,7 +119,7 @@ proc test_fed_status {cname fed_flags cluster_list} { send_user "matched self: $name\n" incr matches } - } elseif {[regexp {Sibling:\s+(\S+):(\S+):(\d+) ID:(\d+) FedState:(\S*) Weight:(\d+) PersistConn:(\S+)} \ + } elseif {[regexp {Sibling:\s+(\S+):(\S+):(\d+) ID:(\d+) FedState:(\S*) Weight:(\d+) PersistConnSend/Recv:(\S+)} \ $line match name host port id state weight conn]} { send_user "matched: $match\n" if {$expected_matches && @@ -136,7 +136,7 @@ proc test_fed_status {cname fed_flags cluster_list} { exp_continue } timeout { - send_user "\nFAILURE: sacctmgr add not responding\n" + send_user "\nFAILURE: 
scontrol not responding\n" slow_kill $my_pid end_it 1 } @@ -187,19 +187,19 @@ set fed_flags "None" send_user "\n\ntest from $fedc1\n" dict set clusters($fedc1) conn Self -dict set clusters($fedc2) conn Connected -dict set clusters($fedc3) conn Connected +dict set clusters($fedc2) conn "Yes/Yes" +dict set clusters($fedc3) conn "Yes/Yes" test_fed_status $fedc1 $fed_flags [array get clusters] send_user "\n\ntest from $fedc2\n" -dict set clusters($fedc1) conn Connected +dict set clusters($fedc1) conn "Yes/Yes" dict set clusters($fedc2) conn Self -dict set clusters($fedc3) conn Connected +dict set clusters($fedc3) conn "Yes/Yes" test_fed_status $fedc2 $fed_flags [array get clusters] send_user "\n\ntest from $fedc3\n" -dict set clusters($fedc1) conn Connected -dict set clusters($fedc2) conn Connected +dict set clusters($fedc1) conn "Yes/Yes" +dict set clusters($fedc2) conn "Yes/Yes" dict set clusters($fedc3) conn Self test_fed_status $fedc3 $fed_flags [array get clusters] @@ -209,10 +209,10 @@ if {[remove_cluster_from_fed $fedc3 $fed_name]} { } array unset clusters $fedc3 dict set clusters($fedc1) conn Self -dict set clusters($fedc2) conn Connected +dict set clusters($fedc2) conn "Yes/Yes" test_fed_status $fedc1 $fed_flags [array get clusters] -dict set clusters($fedc1) conn Connected +dict set clusters($fedc1) conn "Yes/Yes" dict set clusters($fedc2) conn Self test_fed_status $fedc2 $fed_flags [array get clusters] @@ -243,19 +243,21 @@ send_user "\n\nadd $fedc2 and test from $fedc1\n" if {[add_cluster_to_fed $fedc2 $fed_name]} { end_it 1 } +sleep 5 array set clusters [get_clusterfed_info $fed_name] dict set clusters($fedc1) conn Self -dict set clusters($fedc2) conn Connected +dict set clusters($fedc2) conn "Yes/Yes" test_fed_status $fedc1 $fed_flags [array get clusters] send_user "\n\nadd $fedc3 and test from $fedc1\n" if {[add_cluster_to_fed $fedc3 $fed_name]} { end_it 1 } +sleep 5 array set clusters [get_clusterfed_info $fed_name] dict set clusters($fedc1) conn Self -dict set clusters($fedc2) conn Connected -dict set clusters($fedc3) conn Connected +dict set clusters($fedc2) conn "Yes/Yes" +dict set clusters($fedc3) conn "Yes/Yes" test_fed_status $fedc1 $fed_flags [array get clusters] @@ -296,8 +298,8 @@ if {$exit_code || $matches != 4} { set fed_flags "LLC" array set clusters [get_clusterfed_info $fed_name] dict set clusters($fedc1) conn Self -dict set clusters($fedc2) conn Connected -dict set clusters($fedc3) conn Connected +dict set clusters($fedc2) conn "Yes/Yes" +dict set clusters($fedc3) conn "Yes/Yes" test_fed_status $fedc1 $fed_flags [array get clusters] @@ -338,8 +340,8 @@ if {$exit_code || $matches != 4} { set fed_flags "None" array set clusters [get_clusterfed_info $fed_name] dict set clusters($fedc1) conn Self -dict set clusters($fedc2) conn Connected -dict set clusters($fedc3) conn Connected +dict set clusters($fedc2) conn "Yes/Yes" +dict set clusters($fedc3) conn "Yes/Yes" test_fed_status $fedc1 $fed_flags [array get clusters] diff --git a/testsuite/expect/test37.4 b/testsuite/expect/test37.4 new file mode 100755 index 0000000000000000000000000000000000000000..11b8959967e15149997b870ff64387b48a59bcc8 --- /dev/null +++ b/testsuite/expect/test37.4 @@ -0,0 +1,442 @@ +#!/usr/bin/expect +############################################################################ +# Purpose: Test federated submissions +# +# Reqs: 1. Using slurmdbd accounting storage type and is up +# 2. 
fed_slurm_base is defined in globals.local - set to directory that +# has access to each federation configure (fedc1, fedc2, fedc3). +# Eg. +# fedr/slurm/ (src) +# fedr/fed1/bin +# fedr/fed1/sbin +# fedr/fed1/etc +# fedr/fed1/... +# fedr/fed2/... +# fedr/fed3/... +# 3. controllers are up and running. +# +# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR +# "FAILURE: ..." otherwise with an explanation of the failure, OR +# anything else indicates a failure mode that must be investigated. +############################################################################ +# Copyright (C) 2016 SchedMD LLC. +# Written by Brian Christiansen <brian@schedmd.com> +# +# This file is part of SLURM, a resource management program. +# For details, see <http://slurm.schedmd.com/>. +# Please also read the included file: DISCLAIMER. +# +# SLURM is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free +# Software Foundation; either version 2 of the License, or (at your option) +# any later version. +# +# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License along +# with SLURM; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +############################################################################ + +source ./globals +source ./globals_accounting +source ./globals_federation + +set test_id "37.4" +set exit_code 0 +set fed_name "feda" +set file_in "test$test_id.in" +set user_name "" + +set eol "\r\n" + +set timeout 5 +print_header $test_id + +# +# Check accounting config and bail if not found. +# +if { [test_account_storage] == 0 } { + log_warn "This test can't be run without a usable AccountStorageType" + exit 0 +} + +if { [string compare [check_accounting_admin_level] "Administrator"] } { + log_warn "This test can't be run without being an Accounting administrator.\n \ + Use: sacctmgr mod user \$USER set admin=admin." 
+ exit 0 +} + +proc cancel_all_jobs { } { + global scancel user_name fedc1 fedc2 fedc3 + + spawn $scancel -M$fedc1,$fedc2,$fedc3 --user $user_name + expect { + eof { + wait + } + } + sleep 5 +} + +proc cleanup { } { + global scancel fed_name user_name bin_rm file_in fedc1 fedc2 fedc3 + + cancel_all_jobs + exec $bin_rm -f $file_in + + return [delete_federations $fed_name]; +} + +proc end_it { exit_code } { + global test_id + cleanup + if {$exit_code == 0} { + print_success $test_id + } + exit $exit_code +} + +proc submit_fed_job { cname expected_origin expected_sib spec_clusters } { + global fed_slurm_base file_in node_count number squeue + + set submit_cluster "" + set origin "" + set sibling "" + set job_id 0 + set my_sbatch "${fed_slurm_base}/$cname/bin/sbatch" + set command "$my_sbatch -N$node_count --exclusive --output=/dev/null --error=/dev/null -t3" + if {$spec_clusters ne ""} { + append command " -M$spec_clusters" + } + append command " $file_in" + set sbatch_pid [spawn {*}$command] + expect { + -re "Submitted batch job ($number)" { + set job_id $expect_out(1,string) + exp_continue + } + -re "on cluster (\\S+)" { + set submit_cluster $expect_out(1,string) + exp_continue + } + timeout { + log_error "sbatch not responding" + slow_kill $sbatch_pid + end_it 1 + } + eof { + wait + } + } + if {$job_id == 0} { + log_error "batch submit failure" + end_it 1 + } + + sleep 3 + + set my_squeue "${fed_slurm_base}/$cname/bin/squeue" + if {$submit_cluster ne ""} { + set my_squeue "${fed_slurm_base}/$submit_cluster/bin/squeue" + } + spawn $my_squeue --jobs=$job_id --noheader -Ofedorigin,fedsiblings --fedtrack + expect { + -re "(\\S+)\\s+(\\S+)" { + set origin $expect_out(1,string) + set sibling $expect_out(2,string) + } + } + + log_info "origin:$origin sibling:$sibling" + + if {($expected_origin ne "") && ($origin ne $expected_origin)} { + log_error "origin:$origin != expected_origin:$expected_origin" + end_it 1 + } + + if {($expected_sib ne "") && ($sibling ne $expected_sib)} { + log_error "sibling:$sibling != expected_sib:$expected_sib" + end_it 1 + } + + + # Verify that siblings have the job as well. + foreach tmp_sib [split $sibling ","] { + if {$tmp_sib eq $origin} { + continue + } + set my_squeue "${fed_slurm_base}/$tmp_sib/bin/squeue" + spawn $my_squeue --jobs=$job_id --noheader -Ofedorigin,fedsiblings + set match 0 + expect { + -re "(\\S+)\\s+(\\S+)" { + set match 1 + if {$origin ne $expect_out(1,string)} { + log_error "origin not the same on $sibling" + } + if {$sibling ne $expect_out(2,string)} { + log_error "sibling not the same on $sibling" + } + } + timeout { + log_error "$my_squeue not responding" + end_it 1 + } + eof { + wait + } + } + + if {!$match} { + log_error "didn't find origin or sibling from job" + end_it 1 + } + } + + return $sibling +} + +if {[test_federation_setup]} { + log_warn "WARNING: This test can't be run without fed_slurm_base,\ + fedc1, fedc2, fedc3 setup in globals.local." 
+	exit 0
+}
+
+if {[test_cluster_up $fedc1] ||
+    [test_cluster_up $fedc2] ||
+    [test_cluster_up $fedc3]} {
+	end_it 1
+}
+
+spawn $bin_id -un
+expect {
+	-re "($alpha_numeric_under)" {
+		set user_name $expect_out(1,string)
+	}
+	eof {
+		wait
+	}
+}
+
+# Remove existing setup
+if {[cleanup] != 0} {
+	log_error "failed to cleanup"
+	end_it 1
+}
+
+# Add clusters to federation
+if {[setup_federation $fed_name]} {
+	log_error "failed to setup federation"
+	end_it 1
+}
+
+# Get number of nodes per cluster.
+# Divide by 2 to get 2 jobs per cluster.
+set node_count [expr [available_nodes "" ""] / 2]
+
+make_bash_script $file_in "$bin_sleep 300"
+
+
+###############################################################################
+send_user "\n\n"
+send_user "Test packing across clusters\n"
+###############################################################################
+
+# Submit first job and get a sibling
+set first_sib [submit_fed_job $fedc1 $fedc1 "" ""]
+# Second job should have same sibling as first
+submit_fed_job $fedc1 $fedc1 $first_sib ""
+
+
+# Third job should get a different sib
+set second_sib [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$second_sib eq $first_sib} {
+	log_error "$second_sib == $first_sib"
+	end_it 1
+}
+submit_fed_job $fedc1 $fedc1 $second_sib ""
+
+
+# Fifth job should be on a different sib than the first two
+set third_sib [submit_fed_job $fedc1 $fedc1 "" ""]
+if {($third_sib eq $first_sib) || ($third_sib eq $second_sib)} {
+	log_error "$third_sib == ($first_sib || $second_sib)"
+	end_it 1
+}
+submit_fed_job $fedc1 $fedc1 $third_sib ""
+
+
+# Last job should be submitted to all siblings
+submit_fed_job $fedc1 $fedc1 "$fedc1,$fedc2,$fedc3" ""
+
+
+
+###############################################################################
+send_user "\n\n"
+send_user "Test packing across clusters with weights\n\n"
+# Set fed1's weight to 2. Should pack on fed2 and fed3 before getting to fed1
+###############################################################################
+cancel_all_jobs
+modify_cluster_weight $fedc1 2
+
+# Submit first job and get a sibling -- not fed1
+set first_sib [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$first_sib eq $fedc1} {
+	log_error "$first_sib == $fedc1"
+	end_it 1
+}
+# Second job should have same sibling as first
+submit_fed_job $fedc1 $fedc1 $first_sib ""
+
+
+# Third job should get a different sib
+set second_sib [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$second_sib eq $fedc1 || $second_sib eq $first_sib} {
+	log_error "$second_sib == $first_sib"
+	end_it 1
+}
+submit_fed_job $fedc1 $fedc1 $second_sib ""
+
+
+# Fifth job should be on fed1
+set third_sib [submit_fed_job $fedc1 $fedc1 $fedc1 ""]
+submit_fed_job $fedc1 $fedc1 $third_sib ""
+
+
+# Last job should be submitted to all siblings
+submit_fed_job $fedc1 $fedc1 "$fedc1,$fedc2,$fedc3" ""
+
+
+# Reset fed1's weight
+modify_cluster_weight $fedc1 1
+
+
+###############################################################################
+send_user "\n\n"
+send_user "Test -M<clusters> with federated jobs\n"
+###############################################################################
+cancel_all_jobs
+
+# Submit jobs to only fed1
+submit_fed_job $fedc1 $fedc1 $fedc1 $fedc1
+submit_fed_job $fedc1 $fedc1 $fedc1 $fedc1
+submit_fed_job $fedc1 $fedc1 $fedc1 $fedc1
+
+# Submit jobs to only fed1,fed2.
+# The first two will go to fed2 since fed1 is full; the third should go to both
+submit_fed_job $fedc1 $fedc1 $fedc2 "$fedc1,$fedc2"
+submit_fed_job $fedc1 $fedc1 $fedc2 "$fedc1,$fedc2"
+submit_fed_job $fedc1 $fedc1 "$fedc1,$fedc2" "$fedc1,$fedc2"
+
+# Submit jobs to fed2,fed3.
+# Should choose fed2 to be the origin and submit
+submit_fed_job $fedc1 $fedc2 $fedc3 "$fedc2,$fedc3"
+submit_fed_job $fedc1 $fedc2 $fedc3 "$fedc2,$fedc3"
+submit_fed_job $fedc1 $fedc2 "$fedc2,$fedc3" "$fedc2,$fedc3"
+
+
+###############################################################################
+send_user "\n\n"
+send_user "Test spreading across clusters with LLC flag\n"
+###############################################################################
+cancel_all_jobs
+# Now set the federation flags to LLC and make sure that jobs spread across
+# the clusters.
+modify_federation_flags $fed_name "=" "LLC"
+
+
+# Submit first job and get a sibling
+set sib1 [submit_fed_job $fedc1 $fedc1 "" ""]
+
+# Second job shouldn't have same sibling as first
+set sib2 [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$sib2 eq $sib1} {
+	log_error "$sib1 == $sib2"
+	end_it 1
+}
+
+# Third job shouldn't have same sibling as first or second
+set sib3 [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$sib3 eq $sib1 || $sib3 eq $sib2} {
+	log_error "$sib3 == ($sib1 || $sib2)"
+	end_it 1
+}
+
+# Repeat
+# Fourth job could get any sib but I would expect it to get sib1
+set sib1 [submit_fed_job $fedc1 $fedc1 $sib1 ""]
+
+# Second job shouldn't have same sibling as first
+set sib2 [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$sib2 eq $sib1} {
+	log_error "$sib1 == $sib2"
+	end_it 1
+}
+
+# Third job shouldn't have same sibling as first or second
+set sib3 [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$sib3 eq $sib1 || $sib3 eq $sib2} {
+	log_error "$sib3 == ($sib1 || $sib2)"
+	end_it 1
+}
+
+# Last job should be submitted to all siblings
+submit_fed_job $fedc1 $fedc1 "$fedc1,$fedc2,$fedc3" ""
+
+
+###############################################################################
+send_user "\n\n"
+send_user "Test spreading across clusters with LLC flag with weights\n"
+# Set fed1's weight to 2. Should spread between fed2 and fed3 before
+# going to fed1
+###############################################################################
+cancel_all_jobs
+modify_cluster_weight $fedc1 2
+
+# Submit first job and get a sibling
+set sib1 [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$sib1 eq $fedc1} {
+	log_error "$sib1 == $fedc1"
+	end_it 1
+}
+
+# Second job shouldn't have same sibling as first
+set sib2 [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$sib2 eq $fedc1 || $sib2 eq $sib1} {
+	log_error "$sib1 == $sib2"
+	end_it 1
+}
+
+# Repeat
+# Job could get any sib but I would expect it to get sib1
+set sib1 [submit_fed_job $fedc1 $fedc1 $sib1 ""]
+if {$sib1 eq $fedc1} {
+	log_error "$sib1 == $fedc1"
+	end_it 1
+}
+
+# Second job shouldn't have same sibling as first
+set sib2 [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$sib2 eq $fedc1 || $sib2 eq $sib1} {
+	log_error "$sib1 == $sib2"
+	end_it 1
+}
+
+# Third job shouldn't have same sibling as first or second (expected on fed1)
+set sib3 [submit_fed_job $fedc1 $fedc1 $fedc1 ""]
+submit_fed_job $fedc1 $fedc1 $sib3 ""
+
+# Last job should be submitted to all siblings
+submit_fed_job $fedc1 $fedc1 "$fedc1,$fedc2,$fedc3" ""
+
+# Reset fed1's weight
+modify_cluster_weight $fedc1 1
+
+
+
+# All Done
+end_it 0
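
For reference when reading the test37.2 hunk above: the test recovers the submitting cluster from a federated job ID by shifting the reported ID right by 26 bits. A minimal Tcl sketch of that decoding follows; the 0x3FFFFFF mask (local ID assumed to occupy the low 26 bits) and the composed example ID are illustrative assumptions, not values taken from this patch.

# Sketch only: decode a federated job ID the way test37.2's ">> 26" check
# implies. The low-26-bit mask for the local ID is an assumption.
proc decode_fed_job_id { job_id } {
	set clust_id [expr {$job_id >> 26}]
	set local_id [expr {$job_id & 0x3FFFFFF}]
	return [list $clust_id $local_id]
}

# Usage: an ID composed as (cluster_id << 26) + local_id round-trips cleanly.
lassign [decode_fed_job_id [expr {(2 << 26) + 1234}]] cid lid
puts "cluster:$cid local:$lid"    ;# prints "cluster:2 local:1234"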
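
The globals_federation helpers added in this patch are meant to be composed the way test37.4 composes them: create the federation, let setup_federation join fedc1-fedc3 at weight 1, adjust flags and weights between sections, and tear the federation down afterwards. The condensed sketch below only strings those helpers together to show the intended call order; it assumes a globals.local that defines fedc1, fedc2 and fedc3, and it is not an additional test.

source ./globals
source ./globals_federation

# The modify_* helpers call end_it on failure, so a caller must provide it
# (each test defines its own; this stub just exits).
proc end_it { code } {
	exit $code
}

set fed_name "feda"

# Add the federation record, then join fedc1-fedc3 with weight=1.
if {[setup_federation $fed_name]} {
	log_error "failed to setup federation $fed_name"
	exit 1
}

# Spread jobs across clusters for one section (LLC flag, as in test37.4)...
modify_federation_flags $fed_name "=" "LLC"

# ...and bias submissions away from fedc1 by doubling its weight.
modify_cluster_weight $fedc1 2

log_info "federation $fed_name ready for submission tests"

# Restore the weight and remove the federation when done, as cleanup does.
modify_cluster_weight $fedc1 1
delete_federations $fed_name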