diff --git a/NEWS b/NEWS
index 190276e67b2f478afb75577b9b0ea9add757a619..1d78083a8c65965b5180ef1daf215329f36cbd7f 100644
--- a/NEWS
+++ b/NEWS
@@ -45,6 +45,7 @@ documents those changes that are of interest to users and administrators.
     tasks in a job array independently from the maximum task ID (MaxArraySize).
  -- Fix issue where number of nodes is not properly allocated when sbatch and
     salloc are requested with -n tasks < hosts from -w hostlist or from -N.
+ -- Add infrastructure for submitting federated jobs.
 
 * Changes in Slurm 17.02.0pre2
 ==============================
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index a84a8543fef81c48bcf7b10097c6dc53bf403c26..668dc70e1a291869126447bb1d4f868c0ce56206 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -85,6 +85,11 @@ In slurmctld_lock_t: Added federation
 In will_run_response_msg_t: Added double sys_usage_per to report back how busy a
 	cluster is.
 In slurm_ctl_conf: Added mail_domain.
+In slurm_msg_t: Added buffer to retain the received message buffer for later
+		      use.
+In job_desc_msg_t: Added fed_siblings to track which clusters have sibling jobs.
+In slurm_job_info_t: Added fed_origin_str, fed_siblings, fed_siblings_str to
+		      display job federation information.
 
 Added the following struct definitions
 ======================================
@@ -93,6 +98,7 @@ Added slurmdb_cluster_fed_t to store federation information on
 Added slurmdb_federation_cond_t for selecting federations from db.
 Added slurmdb_federation_rec_t to represent federation objects.
 Added job_fed_details_t for storing federated job information.
+Added sib_msg_t for sending messages to siblings.
 
 Removed members from the following struct definitions
 =====================================================
@@ -106,6 +112,8 @@ Changed DEFAULT_MAX_JOB_ID from 0x7fff0000 to 0x03ff0000.
 Added SELECT_NODEDATA_TRES_ALLOC_FMT_STR to select_nodedata_type.
 Added SELECT_NODEDATA_TRES_ALLOC_WEIGHTED to select_nodedata_type.
 Changed MEM_PER_CPU flag to 0x8000000000000000 from 0x80000000.
+Added SLURM_MSG_KEEP_BUFFER msg flag to instruct slurm_receive_msg() to save the
+	buffer ptr.
 
 Added the following API's
 =========================
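
A minimal sketch of how the SLURM_MSG_KEEP_BUFFER flag described above might be
used on a receive path (not part of this patch; _recv_keep_packed() is a
hypothetical wrapper, and only slurm_msg_t_init(), slurm_receive_msg() and
slurm_free_msg_members() from the existing API are assumed):

#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_protocol_defs.h"

/* Sketch: receive one RPC and keep its packed form for later forwarding. */
static int _recv_keep_packed(int fd, int timeout, slurm_msg_t *msg)
{
	slurm_msg_t_init(msg);
	msg->flags |= SLURM_MSG_KEEP_BUFFER;	/* new flag from this patch */

	if (slurm_receive_msg(fd, msg, timeout) != SLURM_SUCCESS)
		return SLURM_ERROR;

	/* msg->data holds the unpacked request as usual; msg->buffer now
	 * also holds the packed request, left at the start of the message
	 * body, so it can be forwarded verbatim (e.g. to federation
	 * siblings in fed_mgr.c below). */
	return SLURM_SUCCESS;
}

The caller remains responsible for slurm_free_msg_members(), which with this
patch also frees msg->buffer; src/slurmctld/controller.c below sets the flag in
exactly this way.
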
diff --git a/doc/man/man1/salloc.1 b/doc/man/man1/salloc.1
index 6913d29e9783caf199db2aea0085ead7bc3ead08..7d0347840b79a9a5fadd0a6295964658b6aff5dd 100644
--- a/doc/man/man1/salloc.1
+++ b/doc/man/man1/salloc.1
@@ -608,6 +608,16 @@ License names can be followed by a colon and count
 Multiple license names should be comma separated (e.g.
 "\-\-licenses=foo:4,bar").
 
+.TP
+\fB\-M\fR, \fB\-\-clusters\fR=<\fIstring\fR>
+Clusters to issue commands to.  Multiple cluster names may be comma separated.
+The job will be submitted to the one cluster providing the earliest expected
+job initiation time. The default value is the current cluster. A value of
+\(aq\fIall\fR' will query to run on all clusters.  Note the
+\fB\-\-export\fR option to control environment variables exported
+between clusters.
+Note that the SlurmDBD must be up for this option to work properly.
+
 .TP
 \fB\-m\fR, \fB\-\-distribution\fR=
 \fIarbitrary\fR|<\fIblock\fR|\fIcyclic\fR|\fIplane=<options>\fR[:\fIblock\fR|\fIcyclic\fR|\fIfcyclic\fR]>
@@ -1476,6 +1486,9 @@ Same as \fB\-\-bell\fR
 \fBSALLOC_BURST_BUFFER\fR
 Same as \fB\-\-bb\fR
 .TP
+\fBSALLOC_CLUSTERS\fR or \fBSLURM_CLUSTERS\fR
+Same as \fB\-\-clusters\fR
+.TP
 \fBSALLOC_CONN_TYPE\fR
 Same as \fB\-\-conn\-type\fR
 .TP
diff --git a/doc/man/man1/sbatch.1 b/doc/man/man1/sbatch.1
index 87e6220b0fbf9d21a3d3b21baa31cb495a19dc27..5a80fd8c3e4d16b5efe17694906c566e6e7c8987 100644
--- a/doc/man/man1/sbatch.1
+++ b/doc/man/man1/sbatch.1
@@ -701,6 +701,7 @@ job initiation time. The default value is the current cluster. A value of
 \(aq\fIall\fR' will query to run on all clusters.  Note the
 \fB\-\-export\fR option to control environment variables exported
 between clusters.
+Note that the SlurmDBD must be up for this option to work properly.
 
 .TP
 \fB\-m\fR, \fB\-\-distribution\fR=
diff --git a/doc/man/man1/scancel.1 b/doc/man/man1/scancel.1
index 1419ab53ffee5786557b04ce14a7e1e62fd2adc1..0eec081b375d9410b77f5ba622e4ed247a12310f 100644
--- a/doc/man/man1/scancel.1
+++ b/doc/man/man1/scancel.1
@@ -61,7 +61,8 @@ Interactive mode. Confirm each job_id.step_id before performing the cancel opera
 
 .TP
 \fB\-M\fR, \fB\-\-clusters\fR=<\fIstring\fR>
-Cluster to issue commands to.
+Clusters to issue commands to.
+Note that the SlurmDBD must be up for this option to work properly.
 
 .TP
 \fB\-n\fR, \fB\-\-jobname\fR=\fIjob_name\fR, \fB\-\-name\fR=\fIjob_name\fR
diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1
index ff3a6f8670a5543cafd50ebe5209497c484b1f96..8e91a7e42a8c5f5d74fc7b342ebc2407d50e029a 100644
--- a/doc/man/man1/scontrol.1
+++ b/doc/man/man1/scontrol.1
@@ -48,6 +48,7 @@ unavailable to user's group will be displayed (i.e. this is the default behavior
 .TP
 \fB\-M\fR, \fB\-\-clusters\fR=<\fIstring\fR>
 The cluster to issue commands to. Only one cluster name may be specified.
+Note that the SlurmDBD must be up for this option to work properly.
 
 .TP
 \fB\-o\fR, \fB\-\-oneliner\fR
diff --git a/doc/man/man1/sinfo.1 b/doc/man/man1/sinfo.1
index 8186484bac8b5f58620836410cee9527843912ac..37f3bccb0ebbad82c2965ed9e24b0e37b42727f5 100644
--- a/doc/man/man1/sinfo.1
+++ b/doc/man/man1/sinfo.1
@@ -62,6 +62,7 @@ This is ignored if the \fB\-\-format\fR option is specified.
 \fB\-M\fR, \fB\-\-clusters\fR=<\fIstring\fR>
 Clusters to issue commands to.  Multiple cluster names may be comma separated.
 A value of of '\fIall\fR' will query to run on all clusters.
+Note that the SlurmDBD must be up for this option to work properly.
 
 .TP
 \fB\-n <nodes>\fR, \fB\-\-nodes=<nodes>\fR
diff --git a/doc/man/man1/smap.1 b/doc/man/man1/smap.1
index 6a927b20e7e6ce0e40781128c4c8989b25f8f4b7..26570ebc31173ff75308547b0b5dc390f4f26734 100644
--- a/doc/man/man1/smap.1
+++ b/doc/man/man1/smap.1
@@ -79,6 +79,7 @@ name with the '\-n' option.
 .TP
 \fB\-M\fR, \fB\-\-clusters\fR=<\fIstring\fR>
 Clusters to issue commands to.
+Note that the SlurmDBD must be up for this option to work properly.
 
 .TP
 \fB\-n\fR, \fB\-\-nodes\fR
diff --git a/doc/man/man1/sprio.1 b/doc/man/man1/sprio.1
index d25e42f7129b25131c5852956d84a0cc7d1438db..2101006aef6d06532151e00576d947fd46cfbf63 100644
--- a/doc/man/man1/sprio.1
+++ b/doc/man/man1/sprio.1
@@ -39,6 +39,7 @@ Report more of the available information for the selected jobs.
 .TP
 \fB\-M\fR, \fB\-\-clusters\fR=<\fIstring\fR>
 The cluster to issue commands to. Only one cluster name may be specified.
+Note that the SlurmDBD must be up for this option to work properly.
 
 .TP
 \fB\-n\fR, \fB\-\-norm\fR
diff --git a/doc/man/man1/squeue.1 b/doc/man/man1/squeue.1
index 43bffaa58bedf6c8dd6bc0745544fa076aaa4bb3..ef7aa80bfa74f3c08b5caaa639e9d056c5377d28 100644
--- a/doc/man/man1/squeue.1
+++ b/doc/man/man1/squeue.1
@@ -579,6 +579,22 @@ The exit code for the job.
 Features required by the job.
 (Valid for jobs only)
 .TP
+\fBfedorigin\fR
+Cluster name where the federated job originated.
+(Valid for federated jobs only)
+.TP
+\fBfedoriginraw\fR
+Cluster ID where the federated job originated.
+(Valid for federated jobs only)
+.TP
+\fBfedsiblings\fR
+Cluster names where the federated job can run.
+(Valid for federated jobs only)
+.TP
+\fBfedsiblingsraw\fR
+Cluster IDs where the federated job can run.
+(Valid for federated jobs only)
+.TP
 \fBgres\fR
 Generic resources (gres) required by the job or step.
 (Valid for jobs and job steps)
diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1
index 6b497a0242f84ab13ff31ce86d00ddcd2a91df20..8a657adf4ba5b59083d7b8d9447cc7822c7e2a1e 100644
--- a/doc/man/man1/srun.1
+++ b/doc/man/man1/srun.1
@@ -911,6 +911,17 @@ License names can be followed by a colon and count
 Multiple license names should be comma separated (e.g.
 "\-\-licenses=foo:4,bar"). This option applies to job allocations.
 
+.TP
+\fB\-M\fR, \fB\-\-clusters\fR=<\fIstring\fR>
+Clusters to issue commands to.  Multiple cluster names may be comma separated.
+The job will be submitted to the one cluster providing the earliest expected
+job initiation time. The default value is the current cluster. A value of
+\(aq\fIall\fR' will query to run on all clusters.  Note the
+\fB\-\-export\fR option to control environment variables exported
+between clusters.
+This option applies only to job allocations.
+Note that the SlurmDBD must be up for this option to work properly.
+
 .TP
 .na
 \fB\-m\fR, \fB\-\-distribution\fR=
diff --git a/doc/man/man1/sshare.1 b/doc/man/man1/sshare.1
index dcd35297db7bc1ae6cac4518ba1f87d2f4ae43cb..39ca0ce5aa1a8717a24e75b5377335ba4903cf1f 100644
--- a/doc/man/man1/sshare.1
+++ b/doc/man/man1/sshare.1
@@ -39,6 +39,7 @@ Long listing - includes the normalized usage information.
 .TP
 \fB\-M\fR, \fB\-\-clusters\fR=<\fIstring\fR>
 Clusters to issue commands to.
+Note that the SlurmDBD must be up for this option to work properly.
 
 .TP
 \fB\-m\fR, \fB\-\-partition\fR
diff --git a/doc/man/man1/strigger.1 b/doc/man/man1/strigger.1
index 9686621ff6c106c324a4032144cf7b872a4e01fc..3520baa3b3e28da81ab5bf9fbbddf12973c8e0af 100644
--- a/doc/man/man1/strigger.1
+++ b/doc/man/man1/strigger.1
@@ -167,6 +167,7 @@ trigger event.
 .TP
 \fB\-M\fR, \fB\-\-clusters\fR=<\fIstring\fR>
 Clusters to issue commands to.
+Note that the SlurmDBD must be up for this option to work properly.
 
 .TP
 \fB\-n\fR, \fB\-\-node\fR[=\fIhost\fR]
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 2135399a56ed66e68de701e705c35935793160b6..4244f108282c5e951e542f91178d0b89430d0d9a 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -878,6 +878,7 @@ enum node_states {
 #define SHOW_DETAIL	0x0002	/* Show detailed resource information */
 #define SHOW_DETAIL2	0x0004	/* Show batch script listing */
 #define SHOW_MIXED	0x0008	/* Automatically set node MIXED state */
+#define SHOW_FED_TRACK	0x0010	/* Show federation tracking-only jobs */
 
 /* Define keys for ctx_key argument of slurm_step_ctx_get() */
 enum ctx_keys {
@@ -1334,6 +1335,12 @@ typedef struct power_mgmt_data {
 } power_mgmt_data_t;
 
 #define CORE_SPEC_THREAD 0x8000	/* If set, this is a thread count not core count */
+
+/*
+ * When adding fields to this struct, also update:
+ * _copy_job_desc_to_job_record()
+ * slurm_free_job_desc_msg()
+ */
 typedef struct job_descriptor {	/* For submit, allocate, and update requests */
 	char *account;		/* charge to specified account */
 	char *acctg_freq;	/* accounting polling intervals (seconds) */
@@ -1386,6 +1393,7 @@ typedef struct job_descriptor {	/* For submit, allocate, and update requests */
 				 * from job's allocation, default NONE */
 	char *features;		/* required feature specification,
 				 * default NONE */
+	uint64_t fed_siblings;	/* Bitmap of federation siblings */
 	char *gres;		/* comma separated list of required generic
 				 * resources, default NONE */
 	uint32_t group_id;	/* group to assume, if run as root. */
@@ -1559,6 +1567,9 @@ typedef struct job_info {
 				 * start_range_2, .., -1  */
 	uint32_t exit_code;	/* exit code for job (status from wait call) */
 	char *features;		/* comma separated list of required features */
+	char *fed_origin_str;	/* Origin cluster's name */
+	uint64_t fed_siblings;	/* bitmap of sibling cluster ids */
+	char *fed_siblings_str;	/* string of sibling cluster names */
 	char *gres;		/* comma separated list of generic resources */
 	uint32_t group_id;	/* group job submitted as */
 	uint32_t job_id;	/* job ID */
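
A minimal sketch, not part of this patch, of reading the new federation fields
and the SHOW_FED_TRACK flag through the job info API; _print_federated_jobs()
is a hypothetical helper, while slurm_load_jobs(), slurm_free_job_info_msg(),
SHOW_ALL and the fed_* members come from slurm.h as modified above:

#include <stdio.h>
#include <time.h>
#include <slurm/slurm.h>

/* Sketch: list jobs that carry federation information. */
static void _print_federated_jobs(void)
{
	job_info_msg_t *jobs = NULL;
	uint32_t i;

	/* SHOW_FED_TRACK also requests federation tracking-only jobs,
	 * as scancel now does below. */
	if (slurm_load_jobs((time_t) NULL, &jobs, SHOW_ALL | SHOW_FED_TRACK))
		return;

	for (i = 0; i < jobs->record_count; i++) {
		slurm_job_info_t *job = &jobs->job_array[i];

		if (!job->fed_siblings)		/* not a federated job */
			continue;
		printf("job %u origin=%s siblings=%s\n", job->job_id,
		       job->fed_origin_str, job->fed_siblings_str);
	}
	slurm_free_job_info_msg(jobs);
}

squeue exposes the same information through the new fedorigin/fedsiblings
format types documented in doc/man/man1/squeue.1 below.
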
diff --git a/src/api/job_info.c b/src/api/job_info.c
index 0337fefe9be1bf48b0fd24d0bd8dc2c668895d4c..70c0acb725d7a3f8eb5aaec4dbf855f7d0e1a6cf 100644
--- a/src/api/job_info.c
+++ b/src/api/job_info.c
@@ -556,6 +556,13 @@ slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner )
 		xstrcat(out, line_end);
 	}
 
+	/****** Line 14a (optional) ******/
+	if (job_ptr->fed_siblings) {
+		xstrfmtcat(out, "FedOrigin=%s FedSiblings=%s",
+			   job_ptr->fed_origin_str, job_ptr->fed_siblings_str);
+		xstrcat(out, line_end);
+	}
+
 	/****** Line 15 ******/
 	if (cluster_flags & CLUSTER_FLAG_BG) {
 		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
diff --git a/src/common/slurm_protocol_api.c b/src/common/slurm_protocol_api.c
index 7c6c988e820b118b75b99a09a60216559c267d53..ac8f32ac92b73eba435c28ace699193d8adddd98 100644
--- a/src/common/slurm_protocol_api.c
+++ b/src/common/slurm_protocol_api.c
@@ -3128,6 +3128,7 @@ extern int slurm_unpack_received_msg(slurm_msg_t *msg, int fd, Buf buffer)
 	header_t header;
 	int rc;
 	void *auth_cred = NULL;
+	uint32_t body_offset = 0;
 
 	if (unpack_header(&header, buffer) == SLURM_ERROR) {
 		rc = SLURM_COMMUNICATIONS_RECEIVE_ERROR;
@@ -3199,6 +3200,8 @@ extern int slurm_unpack_received_msg(slurm_msg_t *msg, int fd, Buf buffer)
 	msg->msg_type = header.msg_type;
 	msg->flags = header.flags;
 
+	body_offset = get_buf_offset(buffer);
+
 	if ((header.body_length > remaining_buf(buffer)) ||
 	    (unpack_msg(msg, buffer) != SLURM_SUCCESS)) {
 		rc = ESLURM_PROTOCOL_INCOMPLETE_PACKET;
@@ -3206,6 +3209,8 @@ extern int slurm_unpack_received_msg(slurm_msg_t *msg, int fd, Buf buffer)
 		goto total_return;
 	}
 
+	set_buf_offset(buffer, body_offset);
+
 	msg->auth_cred = (void *)auth_cred;
 
 	rc = SLURM_SUCCESS;
@@ -3243,6 +3248,10 @@ int slurm_receive_msg(int fd, slurm_msg_t *msg, int timeout)
 	size_t buflen = 0;
 	int rc;
 	Buf buffer;
+	bool keep_buffer = false;
+
+	if (msg->flags & SLURM_MSG_KEEP_BUFFER)
+		keep_buffer = true;
 
 	if (msg->conn) {
 		persist_msg_t persist_msg;
@@ -3255,7 +3264,13 @@ int slurm_receive_msg(int fd, slurm_msg_t *msg, int timeout)
 		}
 		memset(&persist_msg, 0, sizeof(persist_msg_t));
 		rc = slurm_persist_msg_unpack(msg->conn, &persist_msg, buffer);
-		free_buf(buffer);
+
+		if (keep_buffer) {
+			set_buf_offset(buffer, 0);
+			msg->buffer = buffer;
+		} else {
+			free_buf(buffer);
+		}
 
 		if (rc) {
 			error("%s: Failed to unpack persist msg", __func__);
@@ -3302,7 +3317,10 @@ int slurm_receive_msg(int fd, slurm_msg_t *msg, int timeout)
 
 	rc = slurm_unpack_received_msg(msg, fd, buffer);
 
-	free_buf(buffer);
+	if (keep_buffer)
+		msg->buffer = buffer;
+	else
+		free_buf(buffer);
 
 endit:
 	slurm_seterrno(rc);
@@ -4671,6 +4689,7 @@ extern void slurm_free_msg_members(slurm_msg_t *msg)
 	if (msg) {
 		if (msg->auth_cred)
 			(void) g_slurm_auth_destroy(msg->auth_cred);
+		free_buf(msg->buffer);
 		slurm_free_msg_data(msg->msg_type, msg->data);
 		FREE_NULL_LIST(msg->ret_list);
 	}
diff --git a/src/common/slurm_protocol_common.h b/src/common/slurm_protocol_common.h
index e9e6d7c155c068d06a196319be3cc7a6aaba5c12..8042a15e5cd479978932bdaad69bd2e692d299d2 100644
--- a/src/common/slurm_protocol_common.h
+++ b/src/common/slurm_protocol_common.h
@@ -104,6 +104,7 @@
 #define SLURM_PROTOCOL_NO_FLAGS 0
 #define SLURM_GLOBAL_AUTH_KEY   0x0001
 #define SLURMDBD_CONNECTION     0x0002
+#define SLURM_MSG_KEEP_BUFFER   0x0004
 
 #include "src/common/slurm_protocol_socket_common.h"
 
diff --git a/src/common/slurm_protocol_defs.c b/src/common/slurm_protocol_defs.c
index 62372161c5dfe244b66349eb2c19934bda575444..b01f92f06f37183fa0ee9e7c76bb74460034a2ad 100644
--- a/src/common/slurm_protocol_defs.c
+++ b/src/common/slurm_protocol_defs.c
@@ -788,6 +788,15 @@ extern void slurm_free_job_desc_msg(job_desc_msg_t * msg)
 	}
 }
 
+extern void slurm_free_sib_msg(sib_msg_t *msg)
+{
+	if (msg) {
+		free_buf(msg->data_buffer);
+		slurm_free_msg_data(msg->data_type, msg->data);
+		xfree(msg);
+	}
+}
+
 extern void slurm_free_event_log_msg(slurm_event_log_msg_t * msg)
 {
 	if (msg) {
@@ -3830,6 +3839,17 @@ extern int slurm_free_msg_data(slurm_msg_type_t type, void *data)
 	case REQUEST_UPDATE_JOB:
 		slurm_free_job_desc_msg(data);
 		break;
+	case REQUEST_SIB_JOB_WILL_RUN:
+	case REQUEST_SIB_SUBMIT_BATCH_JOB:
+	case REQUEST_SIB_RESOURCE_ALLOCATION:
+		slurm_free_sib_msg(data);
+		break;
+	case RESPONSE_JOB_WILL_RUN:
+		slurm_free_will_run_response_msg(data);
+		break;
+	case RESPONSE_SUBMIT_BATCH_JOB:
+		slurm_free_submit_response_response_msg(data);
+		break;
 	case RESPONSE_ACCT_GATHER_UPDATE:
 		slurm_free_acct_gather_node_resp_msg(data);
 		break;
@@ -4425,6 +4445,12 @@ rpc_num2string(uint16_t opcode)
 		return "RESPONSE_JOB_ATTACH";
 	case REQUEST_JOB_WILL_RUN:
 		return "REQUEST_JOB_WILL_RUN";
+	case REQUEST_SIB_JOB_WILL_RUN:
+		return "REQUEST_SIB_JOB_WILL_RUN";
+	case REQUEST_SIB_SUBMIT_BATCH_JOB:
+		return "REQUEST_SIB_SUBMIT_BATCH_JOB";
+	case REQUEST_SIB_RESOURCE_ALLOCATION:
+		return "REQUEST_SIB_RESOURCE_ALLOCATION";
 	case RESPONSE_JOB_WILL_RUN:
 		return "RESPONSE_JOB_WILL_RUN";
 	case REQUEST_JOB_ALLOCATION_INFO:
diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h
index 635421fb3d5730403ebb4b7108418aaaf919a0b1..e88a1ebe2517e6f61798cb02991350704a713a23 100644
--- a/src/common/slurm_protocol_defs.h
+++ b/src/common/slurm_protocol_defs.h
@@ -286,6 +286,9 @@ typedef enum {
 	REQUEST_JOB_NOTIFY,
 	REQUEST_JOB_SBCAST_CRED,
 	RESPONSE_JOB_SBCAST_CRED,
+	REQUEST_SIB_JOB_WILL_RUN,
+	REQUEST_SIB_SUBMIT_BATCH_JOB,
+	REQUEST_SIB_RESOURCE_ALLOCATION,
 
 	REQUEST_JOB_STEP_CREATE = 5001,
 	RESPONSE_JOB_STEP_CREATE,
@@ -433,6 +436,7 @@ typedef struct slurm_protocol_config {
 typedef struct slurm_msg {
 	slurm_addr_t address;
 	void *auth_cred;
+	Buf buffer; /* DON'T PACK! ptr to buffer that msg was unpacked from. */
 	slurm_persist_conn_t *conn; /* DON'T PACK OR FREE! this is here to
 				     * distinquish a persistant connection from
 				     * a normal connection it should be filled
@@ -1200,6 +1204,18 @@ typedef struct slurm_event_log_msg {
 	char *   string;	/* String for slurmctld to log */
 } slurm_event_log_msg_t;
 
+typedef struct {
+	void    *data;		/* Unpacked data of the given data_type.
+				 * Only populated on the receiving side. */
+	Buf      data_buffer;	/* Buffer holding the packed data.
+				 * Only populated on the sending side. */
+	uint16_t data_type;	/* data type to unpack */
+	uint16_t data_version;	/* Version that data is packed with */
+	uint64_t fed_siblings;	/* sibling bitmap of job */
+	uint32_t job_id;	/* job_id of job - set in job_desc on receiving
+				 * side */
+} sib_msg_t;
+
 /*****************************************************************************\
  *      ACCOUNTING PUSHS
 \*****************************************************************************/
@@ -1263,6 +1279,7 @@ extern void slurm_free_front_end_info_request_msg(
 extern void slurm_free_node_info_request_msg(node_info_request_msg_t *msg);
 extern void slurm_free_node_info_single_msg(node_info_single_msg_t *msg);
 extern void slurm_free_part_info_request_msg(part_info_request_msg_t *msg);
+extern void slurm_free_sib_msg(sib_msg_t *msg);
 extern void slurm_free_stats_info_request_msg(stats_info_request_msg_t *msg);
 extern void slurm_free_stats_response_msg(stats_info_response_msg_t *msg);
 extern void slurm_free_step_alloc_info_msg(step_alloc_info_msg_t * msg);
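
A minimal sketch, not part of this patch, of how the sending side might fill
the new sib_msg_t: data_buffer carries the already-packed request (retained via
SLURM_MSG_KEEP_BUFFER) while data_type/data_version tell the sibling how to
unpack it. _build_sib_submit_msg() is hypothetical and simply mirrors what
_submit_sibling_jobs() does in the src/slurmctld/fed_mgr.c hunk further below:

#include <string.h>

#include "src/common/slurm_protocol_defs.h"

/* Sketch: wrap a received, still-packed job_desc in a sib_msg_t. */
static void _build_sib_submit_msg(slurm_msg_t *msg, job_desc_msg_t *job_desc,
				  sib_msg_t *sib_msg, slurm_msg_t *req_msg)
{
	memset(sib_msg, 0, sizeof(*sib_msg));
	sib_msg->data_buffer  = msg->buffer;	/* packed original request */
	sib_msg->data_type    = msg->msg_type;	/* how the sibling unpacks it */
	sib_msg->data_version = msg->protocol_version;
	sib_msg->fed_siblings = job_desc->fed_siblings;
	sib_msg->job_id       = job_desc->job_id;

	slurm_msg_t_init(req_msg);
	req_msg->msg_type = REQUEST_SIB_SUBMIT_BATCH_JOB;
	req_msg->data     = sib_msg;	/* sent over the sibling's persistent
					 * connection by the caller */
}

On the receiving side, _unpack_sib_msg() (added in slurm_protocol_pack.c below)
uses data_type and data_version to unpack the embedded message into
sib_msg->data.
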
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index 2fc12bb5fcdeaba28ebe65d194ee3dff59ad4d0d..38087582b3e12241da851d87a8316891df6795b0 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -628,6 +628,11 @@ static int  _unpack_will_run_response_msg(will_run_response_msg_t ** msg_ptr,
 					  Buf buffer,
 					  uint16_t protocol_version);
 
+static void _pack_sib_msg(sib_msg_t *sib_msg_ptr, Buf buffer,
+			  uint16_t protocol_version);
+static int _unpack_sib_msg(sib_msg_t **sib_msg_buffer_ptr, Buf buffer,
+			   uint16_t protocol_version);
+
 static void _pack_accounting_update_msg(accounting_update_msg_t *msg,
 					Buf buffer,
 					uint16_t protocol_version);
@@ -1085,6 +1090,12 @@ pack_msg(slurm_msg_t const *msg, Buf buffer)
 				   msg->data, buffer,
 				   msg->protocol_version);
 		break;
+	case REQUEST_SIB_JOB_WILL_RUN:
+	case REQUEST_SIB_SUBMIT_BATCH_JOB:
+	case REQUEST_SIB_RESOURCE_ALLOCATION:
+		_pack_sib_msg((sib_msg_t *)msg->data, buffer,
+			      msg->protocol_version);
+		break;
 	case REQUEST_UPDATE_JOB_STEP:
 		_pack_update_job_step_msg((step_update_request_msg_t *)
 					  msg->data, buffer,
@@ -1756,6 +1767,12 @@ unpack_msg(slurm_msg_t * msg, Buf buffer)
 					  buffer,
 					  msg->protocol_version);
 		break;
+	case REQUEST_SIB_JOB_WILL_RUN:
+	case REQUEST_SIB_SUBMIT_BATCH_JOB:
+	case REQUEST_SIB_RESOURCE_ALLOCATION:
+		rc = _unpack_sib_msg((sib_msg_t **)&(msg->data), buffer,
+				     msg->protocol_version);
+		break;
 	case REQUEST_UPDATE_JOB_STEP:
 		rc = _unpack_update_job_step_msg(
 			(step_update_request_msg_t **) & (msg->data),
@@ -6279,6 +6296,12 @@ _unpack_job_info_members(job_info_t * job, Buf buffer,
 		safe_unpackstr_xmalloc(&job->tres_req_str,
 				       &uint32_tmp, buffer);
 		safe_unpack16(&job->start_protocol_ver, buffer);
+
+		safe_unpackstr_xmalloc(&job->fed_origin_str, &uint32_tmp,
+				       buffer);
+		safe_unpack64(&job->fed_siblings, buffer);
+		safe_unpackstr_xmalloc(&job->fed_siblings_str, &uint32_tmp,
+				       buffer);
 	} else if (protocol_version >= SLURM_16_05_PROTOCOL_VERSION) {
 		uint32_t tmp_mem;
 		safe_unpack32(&job->array_job_id, buffer);
@@ -8664,6 +8687,77 @@ unpack_error:
 	return SLURM_ERROR;
 }
 
+static void
+_pack_sib_msg(sib_msg_t *sib_msg_ptr, Buf buffer, uint16_t protocol_version)
+{
+	xassert(sib_msg_ptr);
+
+	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
+		pack16(sib_msg_ptr->data_type, buffer);
+		pack16(sib_msg_ptr->data_version, buffer);
+		pack64(sib_msg_ptr->fed_siblings, buffer);
+		pack32(sib_msg_ptr->job_id, buffer);
+
+		/* add already packed data_buffer to buffer */
+		if (size_buf(sib_msg_ptr->data_buffer)) {
+			Buf dbuf = sib_msg_ptr->data_buffer;
+			uint32_t grow_size =
+				size_buf(dbuf) - get_buf_offset(dbuf);
+
+			grow_buf(buffer, grow_size);
+			memcpy(&buffer->head[get_buf_offset(buffer)],
+			       &dbuf->head[get_buf_offset(dbuf)], grow_size);
+			set_buf_offset(buffer,
+				       get_buf_offset(buffer) + grow_size);
+		}
+	} else {
+		error("_pack_sib_msg: protocol_version "
+		      "%hu not supported", protocol_version);
+	}
+}
+
+static int
+_unpack_sib_msg(sib_msg_t **sib_msg_buffer_ptr, Buf buffer,
+		uint16_t protocol_version)
+{
+	sib_msg_t *sib_msg_ptr = NULL;
+	slurm_msg_t tmp_msg;
+
+	xassert(sib_msg_buffer_ptr);
+
+	/* alloc memory for structure */
+	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
+		sib_msg_ptr = xmalloc(sizeof(sib_msg_t));
+		*sib_msg_buffer_ptr = sib_msg_ptr;
+
+		/* load the data values */
+		safe_unpack16(&sib_msg_ptr->data_type, buffer);
+		safe_unpack16(&sib_msg_ptr->data_version, buffer);
+		safe_unpack64(&sib_msg_ptr->fed_siblings, buffer);
+		safe_unpack32(&sib_msg_ptr->job_id, buffer);
+
+		if (remaining_buf(buffer)) {
+			slurm_msg_t_init(&tmp_msg);
+			tmp_msg.msg_type         = sib_msg_ptr->data_type;
+			tmp_msg.protocol_version = sib_msg_ptr->data_version;
+
+			if (unpack_msg(&tmp_msg, buffer))
+				goto unpack_error;
+
+			sib_msg_ptr->data = tmp_msg.data;
+			tmp_msg.data = NULL;
+			slurm_free_msg_members(&tmp_msg);
+		}
+	}
+
+	return SLURM_SUCCESS;
+
+unpack_error:
+	slurm_free_sib_msg(sib_msg_ptr);
+	*sib_msg_buffer_ptr = NULL;
+	return SLURM_ERROR;
+}
+
 /* _pack_job_desc_msg
  * packs a job_desc struct
  * IN job_desc_ptr - pointer to the job descriptor to pack
@@ -8684,6 +8778,7 @@ _pack_job_desc_msg(job_desc_msg_t * job_desc_ptr, Buf buffer,
 		pack32(job_desc_ptr->task_dist, buffer);
 		pack16(job_desc_ptr->kill_on_node_fail, buffer);
 		packstr(job_desc_ptr->features, buffer);
+		pack64(job_desc_ptr->fed_siblings, buffer);
 		packstr(job_desc_ptr->gres, buffer);
 		pack32(job_desc_ptr->job_id, buffer);
 		packstr(job_desc_ptr->job_id_str, buffer);
@@ -9204,6 +9299,7 @@ _unpack_job_desc_msg(job_desc_msg_t ** job_desc_buffer_ptr, Buf buffer,
 		safe_unpack16(&job_desc_ptr->kill_on_node_fail, buffer);
 		safe_unpackstr_xmalloc(&job_desc_ptr->features,
 				       &uint32_tmp, buffer);
+		safe_unpack64(&job_desc_ptr->fed_siblings, buffer);
 		safe_unpackstr_xmalloc(&job_desc_ptr->gres, &uint32_tmp,buffer);
 		safe_unpack32(&job_desc_ptr->job_id, buffer);
 		safe_unpackstr_xmalloc(&job_desc_ptr->job_id_str,
diff --git a/src/common/slurmdb_defs.c b/src/common/slurmdb_defs.c
index 52c2ef95f14118534d72558eed91c47318cdd990..cb0a1fc678e341a6ee54d13943cbfed38cac20ae 100644
--- a/src/common/slurmdb_defs.c
+++ b/src/common/slurmdb_defs.c
@@ -2997,6 +2997,23 @@ extern char *slurmdb_get_selected_step_id(
 	return job_id_str;
 }
 
+static int _find_char_in_list(void *name, void *key)
+{
+	char *name_str = (char *)name;
+	char *key_str  = (char *)key;
+
+	if (!xstrcmp(name_str,key_str))
+		return 1;
+
+	return 0;
+}
+
+/* Return the cluster with the earliest start_time.
+ *
+ * Note: The will_runs are not threaded. Currently it relies on the
+ * working_cluster_rec to pack the job_desc's jobinfo. See previous commit for
+ * an example of how to thread this.
+ */
 extern int slurmdb_get_first_avail_cluster(job_desc_msg_t *req,
 	char *cluster_names, slurmdb_cluster_rec_t **cluster_rec)
 {
@@ -3007,6 +3024,7 @@ extern int slurmdb_get_first_avail_cluster(job_desc_msg_t *req,
 	ListIterator itr;
 	List cluster_list = NULL;
 	List ret_list = NULL;
+	List tried_feds = list_create(NULL);
 
 	*cluster_rec = NULL;
 	cluster_list = slurmdb_get_info_cluster(cluster_names);
@@ -3032,13 +3050,25 @@ extern int slurmdb_get_first_avail_cluster(job_desc_msg_t *req,
 	ret_list = list_create(_destroy_local_cluster_rec);
 	itr = list_iterator_create(cluster_list);
 	while ((working_cluster_rec = list_next(itr))) {
-		if ((local_cluster = _job_will_run(req)))
+
+		/* only try one cluster from each federation */
+		if (working_cluster_rec->fed.id &&
+		    list_find_first(tried_feds, _find_char_in_list,
+				    working_cluster_rec->fed.name))
+			continue;
+
+		if ((local_cluster = _job_will_run(req))) {
 			list_append(ret_list, local_cluster);
-		else
+			if (working_cluster_rec->fed.id)
+				list_append(tried_feds,
+					    working_cluster_rec->fed.name);
+		} else {
 			error("Problem with submit to cluster %s: %m",
 			      working_cluster_rec->name);
+		}
 	}
 	list_iterator_destroy(itr);
+	FREE_NULL_LIST(tried_feds);
 
 	/* restore working_cluster_rec in case it was already set */
 	if (*cluster_rec) {
diff --git a/src/plugins/job_submit/pbs/job_submit_pbs.c b/src/plugins/job_submit/pbs/job_submit_pbs.c
index d50a3d5171bd319da83d9015e67563bb14676c5e..b9098b912c11a1a6cbac7e89107fec8f0072286b 100644
--- a/src/plugins/job_submit/pbs/job_submit_pbs.c
+++ b/src/plugins/job_submit/pbs/job_submit_pbs.c
@@ -302,7 +302,7 @@ extern int job_submit(struct job_descriptor *job_desc, uint32_t submit_uid)
 	char *std_out, *tok;
 	uint32_t my_job_id;
 
-	my_job_id = get_next_job_id();
+	my_job_id = get_next_job_id(true);
 	_xlate_dependency(job_desc, submit_uid, my_job_id);
 
 	if (job_desc->account)
diff --git a/src/salloc/opt.c b/src/salloc/opt.c
index 9151518f15d546b31044707cd90d3ef9b517b72c..089f5727b7268afecc7f115408e2d3200e362a54 100644
--- a/src/salloc/opt.c
+++ b/src/salloc/opt.c
@@ -286,6 +286,7 @@ static void _opt_default()
 	opt.uid = uid;
 	opt.gid = getgid();
 
+	opt.clusters = NULL;
 	opt.cwd = NULL;
 	opt.progname = NULL;
 
@@ -408,6 +409,8 @@ env_vars_t env_vars[] = {
   {"SALLOC_ACCTG_FREQ",    OPT_STRING,     &opt.acctg_freq,    NULL          },
   {"SALLOC_BELL",          OPT_BELL,       NULL,               NULL          },
   {"SALLOC_BURST_BUFFER",  OPT_STRING,     &opt.burst_buffer,  NULL          },
+  {"SALLOC_CLUSTERS",      OPT_STRING,     &opt.clusters,      NULL          },
+  {"SLURM_CLUSTERS",       OPT_STRING,     &opt.clusters,      NULL          },
   {"SALLOC_CONN_TYPE",     OPT_CONN_TYPE,  NULL,               NULL          },
   {"SALLOC_CORE_SPEC",     OPT_INT,        &opt.core_spec,     NULL          },
   {"SALLOC_CPU_FREQ_REQ",  OPT_CPU_FREQ,   NULL,               NULL          },
@@ -682,6 +685,8 @@ void set_options(const int argc, char **argv)
 		{"kill-command",  optional_argument, 0, 'K'},
 		{"licenses",      required_argument, 0, 'L'},
 		{"distribution",  required_argument, 0, 'm'},
+		{"cluster",       required_argument, 0, 'M'},
+		{"clusters",      required_argument, 0, 'M'},
 		{"tasks",         required_argument, 0, 'n'},
 		{"ntasks",        required_argument, 0, 'n'},
 		{"nodes",         required_argument, 0, 'N'},
@@ -763,7 +768,7 @@ void set_options(const int argc, char **argv)
 		{NULL,            0,                 0, 0}
 	};
 	char *opt_string =
-		"+A:B:c:C:d:D:F:g:hHI::J:kK::L:m:n:N:Op:P:QRsS:t:uU:vVw:W:x:";
+		"+A:B:c:C:d:D:F:g:hHI::J:kK::L:m:M:n:N:Op:P:QRsS:t:uU:vVw:W:x:";
 	char *pos_delimit;
 
 	struct option *optz = spank_option_table_create(long_options);
@@ -882,6 +887,10 @@ void set_options(const int argc, char **argv)
 				exit(error_exit);
 			}
 			break;
+		case 'M':
+			xfree(opt.clusters);
+			opt.clusters = xstrdup(optarg);
+			break;
 		case 'n':
 			opt.ntasks_set = true;
 			opt.ntasks =
@@ -2076,6 +2085,7 @@ static void _usage(void)
 "              [--immediate[=secs]] [--no-kill] [--overcommit] [-D path]\n"
 "              [--oversubscribe] [-J jobname] [--jobid=id]\n"
 "              [--verbose] [--gid=group] [--uid=user] [--licenses=names]\n"
+"              [--clusters=cluster_names]\n"
 "              [--contiguous] [--mincpus=n] [--mem=MB] [--tmp=MB] [-C list]\n"
 "              [--account=name] [--dependency=type:jobid] [--comment=name]\n"
 #ifdef HAVE_BG		/* Blue gene specific options */
@@ -2128,6 +2138,10 @@ static void _help(void)
 "  -k, --no-kill               do not kill job on node failure\n"
 "  -K, --kill-command[=signal] signal to send terminating job\n"
 "  -L, --licenses=names        required license, comma separated\n"
+"  -M, --clusters=names        Comma separated list of clusters to issue\n"
+"                              commands to.  Default is current cluster.\n"
+"                              Name of 'all' will submit to run on all clusters.\n"
+"                              NOTE: SlurmDBD must up.\n"
 "  -m, --distribution=type     distribution method for processes to nodes\n"
 "                              (type = block|cyclic|arbitrary)\n"
 "      --mail-type=type        notify on state change: BEGIN, END, FAIL or ALL\n"
diff --git a/src/salloc/opt.h b/src/salloc/opt.h
index 8b96d15441f123c42b53ce583f079dc26376645a..21bc42504ffbc6201bf519b31e1e60cf518aabd4 100644
--- a/src/salloc/opt.h
+++ b/src/salloc/opt.h
@@ -60,7 +60,7 @@
 typedef enum {BELL_NEVER, BELL_AFTER_DELAY, BELL_ALWAYS} bell_flag_t;
 
 typedef struct salloc_options {
-
+	char *clusters;		/* clusters to run the job on */
 	char *progname;		/* argv[0] of this program or
 				 * configuration file if multi_prog */
 	char* user;		/* local username		*/
diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c
index 817f90ab0b76ea94c5363ad5e4274911aef6891b..a772993dc4d4644b7bf19cbd380a8d9a5e97f52d 100644
--- a/src/salloc/salloc.c
+++ b/src/salloc/salloc.c
@@ -59,6 +59,7 @@
 #include "src/common/cpu_frequency.h"
 #include "src/common/env.h"
 #include "src/common/plugstack.h"
+#include "src/common/proc_args.h"
 #include "src/common/read_config.h"
 #include "src/common/slurm_rlimits_info.h"
 #include "src/common/slurm_time.h"
@@ -305,6 +306,16 @@ int main(int argc, char *argv[])
 		}
 	}
 
+	/* If can run on multiple clusters find the earliest run time
+	 * and run it there */
+	desc.clusters = xstrdup(opt.clusters);
+	if (opt.clusters &&
+	    slurmdb_get_first_avail_cluster(&desc, opt.clusters,
+			&working_cluster_rec) != SLURM_SUCCESS) {
+		print_db_notok(opt.clusters, 0);
+		exit(error_exit);
+	}
+
 	callbacks.ping = _ping_handler;
 	callbacks.timeout = _timeout_handler;
 	callbacks.job_complete = _job_complete_handler;
@@ -560,6 +571,8 @@ relinquish:
 			}
 		}
 	}
+
+	xfree(desc.clusters);
 	return rc;
 }
 
diff --git a/src/sbatch/opt.c b/src/sbatch/opt.c
index 036937bc24054b8233c727e1e5787c70eabf136e..dabcfaf84aa3829a30fbdc291dcdd93081a1a16f 100644
--- a/src/sbatch/opt.c
+++ b/src/sbatch/opt.c
@@ -3350,9 +3350,6 @@ static void _help(void)
 "      --bb=<spec>             burst buffer specifications\n"
 "      --bbf=<file_name>       burst buffer specification file\n"
 "      --begin=time            defer job until HH:MM MM/DD/YY\n"
-"  -M, --clusters=names        Comma separated list of clusters to issue\n"
-"                              commands to.  Default is current cluster.\n"
-"                              Name of 'all' will submit to run on all clusters.\n"
 "      --comment=name          arbitrary comment\n"
 "      --cpu-freq=min[-max[:gov]] requested cpu frequency (and governor)\n"
 "  -c, --cpus-per-task=ncpus   number of cpus required per task\n"
@@ -3378,9 +3375,12 @@ static void _help(void)
 "  -J, --job-name=jobname      name of job\n"
 "  -k, --no-kill               do not kill job on node failure\n"
 "  -L, --licenses=names        required license, comma separated\n"
+"  -M, --clusters=names        Comma separated list of clusters to issue\n"
+"                              commands to.  Default is current cluster.\n"
+"                              Name of 'all' will submit to run on all clusters.\n"
+"                              NOTE: SlurmDBD must up.\n"
 "  -m, --distribution=type     distribution method for processes to nodes\n"
 "                              (type = block|cyclic|arbitrary)\n"
-
 "      --mail-type=type        notify on state change: BEGIN, END, FAIL or ALL\n"
 "      --mail-user=user        who to send email notification for job state\n"
 "                              changes\n"
diff --git a/src/sbatch/sbatch.c b/src/sbatch/sbatch.c
index 8992a4c5a29f4b6c7e45190fa9a2b4afe1cb6ad5..f0a7fe784fd1d3a6136a913e2a3c4e36ea1feda5 100644
--- a/src/sbatch/sbatch.c
+++ b/src/sbatch/sbatch.c
@@ -161,6 +161,7 @@ int main(int argc, char *argv[])
 
 	/* If can run on multiple clusters find the earliest run time
 	 * and run it there */
+	desc.clusters = xstrdup(opt.clusters);
 	if (opt.clusters &&
 	    slurmdb_get_first_avail_cluster(&desc, opt.clusters,
 			&working_cluster_rec) != SLURM_SUCCESS) {
@@ -168,7 +169,6 @@ int main(int argc, char *argv[])
 		exit(error_exit);
 	}
 
-
 	if (_check_cluster_specific_settings(&desc) != SLURM_SUCCESS)
 		exit(error_exit);
 
@@ -221,6 +221,7 @@ int main(int argc, char *argv[])
 	if (opt.wait)
 		rc = _job_wait(resp->job_id);
 
+	xfree(desc.clusters);
 	xfree(desc.name);
 	xfree(desc.script);
 	env_array_free(desc.environment);
diff --git a/src/scancel/opt.c b/src/scancel/opt.c
index bbe4f783c361261186975f8aa189b24404e568be..18011629d8d4a7729eb3594c42b256956b9875d8 100644
--- a/src/scancel/opt.c
+++ b/src/scancel/opt.c
@@ -701,6 +701,8 @@ static void _help(void)
 /*	printf("      --ctld                      send request directly to slurmctld\n"); */
 	printf("  -f, --full                      signal batch shell and all steps for specified job\n");
 	printf("  -i, --interactive               require response from user for each job\n");
+	printf("  -M, --clusters                  clusters to issue commands to.\n");
+	printf("                                  NOTE: SlurmDBD must be up.\n");
 	printf("  -n, --name=job_name             act only on jobs with this name\n");
 	printf("  -p, --partition=partition       act only on jobs in this partition\n");
 	printf("  -Q, --quiet                     disable warnings\n");
diff --git a/src/scancel/scancel.c b/src/scancel/scancel.c
index e9db52e6acb8731475e10a295fbf6de67fc98e92..e96c83fb84ee6f1236b42d69a2eeb06d5052fe30 100644
--- a/src/scancel/scancel.c
+++ b/src/scancel/scancel.c
@@ -183,7 +183,8 @@ _load_job_records (void)
 	/* We need the fill job array string representation for identifying
 	 * and killing job arrays */
 	setenv("SLURM_BITSTR_LEN", "0", 1);
-	error_code = slurm_load_jobs ((time_t) NULL, &job_buffer_ptr, 1);
+	error_code = slurm_load_jobs ((time_t) NULL, &job_buffer_ptr,
+				      (SHOW_ALL | SHOW_FED_TRACK));
 
 	if (error_code) {
 		slurm_perror ("slurm_load_jobs error");
diff --git a/src/scontrol/scontrol.c b/src/scontrol/scontrol.c
index f9da84f77d35309e43f41f752e7756c2fb159238..f7a3ab94fc3ebc53ec7fedbae2261bcf0845b146 100644
--- a/src/scontrol/scontrol.c
+++ b/src/scontrol/scontrol.c
@@ -1917,6 +1917,7 @@ scontrol [<OPTION>] [<COMMAND>]                                            \n\
      -h or --help: equivalent to \"help\" command                          \n\
      --hide: equivalent to \"hide\" command                                \n\
      -M or --cluster: equivalent to \"cluster\" command                    \n\
+             NOTE: SlurmDBD must be up.                                    \n\
      -o or --oneliner: equivalent to \"oneliner\" command                  \n\
      -Q or --quiet: equivalent to \"quiet\" command                        \n\
      -v or --verbose: equivalent to \"verbose\" command                    \n\
@@ -1934,6 +1935,7 @@ scontrol [<OPTION>] [<COMMAND>]                                            \n\
      cluster                  cluster to issue commands to.  Default is    \n\
 			      current cluster.  cluster with no name will  \n\
 			      reset to default.                            \n\
+                              NOTE: SlurmDBD must be up.                   \n\
      checkpoint <CH_OP><ID>   perform a checkpoint operation on identified \n\
 			      job or job step \n\
      completing               display jobs in completing state along with  \n\
diff --git a/src/sinfo/opts.c b/src/sinfo/opts.c
index c3766d5307c4e0124889e57aac7f5eda14385d38..4b5181b6b6c579eae5214aab84c44fe8a464f268 100644
--- a/src/sinfo/opts.c
+++ b/src/sinfo/opts.c
@@ -1311,6 +1311,8 @@ Usage: sinfo [OPTIONS]\n\
   --hide                     do not show hidden or non-accessible partitions\n\
   -i, --iterate=seconds      specify an iteration period\n\
   -l, --long                 long output - displays more information\n\
+  -M, --clusters=names       clusters to issue commands to.\n\
+                             NOTE: SlurmDBD must be up.\n\
   -n, --nodes=NODES          report on specific node(s)\n\
   --noconvert                don't convert units from their original type\n\
 			     (e.g. 2048M won't be converted to 2G).\n\
diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c
index 60945950b633f8fdd158b9d9478485500bfeda06..8762e12e6d6b70370bf7ff3e45e1f6fe00482d2e 100644
--- a/src/slurmctld/controller.c
+++ b/src/slurmctld/controller.c
@@ -1094,6 +1094,7 @@ static void *_service_connection(void *arg)
 	}
 #endif
 	slurm_msg_t_init(&msg);
+	msg.flags |= SLURM_MSG_KEEP_BUFFER;
 	/*
 	 * slurm_receive_msg sets msg connection fd to accepted fd. This allows
 	 * possibility for slurmctld_req() to close accepted connection.
diff --git a/src/slurmctld/fed_mgr.c b/src/slurmctld/fed_mgr.c
index dea283cb5badf07c1934b97327eac9d4668dc52f..cc917b8eecb5b9b52e208d2d2df88a228dbbe97a 100644
--- a/src/slurmctld/fed_mgr.c
+++ b/src/slurmctld/fed_mgr.c
@@ -45,6 +45,7 @@
 
 #include "src/common/list.h"
 #include "src/common/macros.h"
+#include "src/common/parse_time.h"
 #include "src/common/slurm_protocol_api.h"
 #include "src/common/slurmdbd_defs.h"
 #include "src/common/xmalloc.h"
@@ -57,6 +58,8 @@
 #define FED_MGR_STATE_FILE       "fed_mgr_state"
 #define FED_MGR_CLUSTER_ID_BEGIN 26
 
+#define FED_SIBLING_BIT(x) ((uint64_t)1 << (x - 1))
+
 slurmdb_federation_rec_t     *fed_mgr_fed_rec      = NULL;
 static slurmdb_cluster_rec_t *fed_mgr_cluster_rec  = NULL;
 
@@ -66,6 +69,31 @@ static pthread_mutex_t open_send_mutex = PTHREAD_MUTEX_INITIALIZER;
 static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;
 static pthread_mutex_t update_mutex = PTHREAD_MUTEX_INITIALIZER;
 
+/* structs to pass to threads */
+typedef struct {
+	will_run_response_msg_t *resp;
+	slurmdb_cluster_rec_t  	*sibling;
+	sib_msg_t               *sib_msg;
+	uid_t                    uid;
+	pthread_t                thread_id;
+	int                      thread_rc;
+} sib_willrun_t;
+
+typedef struct {
+	slurmdb_cluster_rec_t *sibling;
+	sib_msg_t             *sib_msg;
+	pthread_t              thread_id;
+	int                    thread_rc;
+} sib_submit_t;
+
+typedef struct {
+	job_desc_msg_t        *job_desc;
+	slurmdb_cluster_rec_t *sibling;
+	pthread_t              thread_id;
+	int                    thread_rc;
+} sib_update_t;
+
+
 static int _close_controller_conn(slurmdb_cluster_rec_t *cluster)
 {
 	int rc = SLURM_SUCCESS;
@@ -438,8 +466,8 @@ static void _persist_callback_fini(void *arg)
 	}
 
 	if (!(cluster = list_find_first(fed_mgr_fed_rec->cluster_list,
-				       slurmdb_find_cluster_in_list,
-				       persist_conn->cluster_name))) {
+					slurmdb_find_cluster_in_list,
+					persist_conn->cluster_name))) {
 		info("Couldn't find cluster %s?",
 		     persist_conn->cluster_name);
 		unlock_slurmctld(fed_write_lock);
@@ -485,6 +513,168 @@ static void _join_federation(slurmdb_federation_rec_t *fed,
 	_create_ping_thread();
 }
 
+static int _persist_job_will_run(slurmdb_cluster_rec_t *conn,
+				 sib_msg_t *sib_msg,
+				 will_run_response_msg_t **will_run_resp)
+{
+	int rc = SLURM_PROTOCOL_SUCCESS;
+	slurm_msg_t req_msg, resp_msg;
+
+	slurm_msg_t_init(&req_msg);
+	slurm_msg_t_init(&resp_msg);
+
+	req_msg.msg_type = REQUEST_SIB_JOB_WILL_RUN;
+	req_msg.data     = sib_msg;
+
+	rc = _send_recv_msg(conn, &req_msg, &resp_msg, false);
+	if (rc < 0) {
+		rc = SLURM_PROTOCOL_ERROR;
+		goto end_it;
+	}
+
+	switch (resp_msg.msg_type) {
+	case RESPONSE_SLURM_RC:
+		if ((rc = slurm_get_return_code(resp_msg.msg_type,
+						resp_msg.data))) {
+			info("persistent will_run failed/resources not avail: %d", rc);
+			slurm_seterrno(rc);
+			rc = SLURM_PROTOCOL_ERROR;
+		}
+		break;
+	case RESPONSE_JOB_WILL_RUN:
+		*will_run_resp = (will_run_response_msg_t *) resp_msg.data;
+		resp_msg.data = NULL;
+		break;
+	default:
+		slurm_seterrno(SLURM_UNEXPECTED_MSG_ERROR);
+		rc = SLURM_PROTOCOL_ERROR;
+		break;
+	}
+
+end_it:
+	slurm_free_msg_members(&resp_msg);
+
+	return rc;
+}
+
+static int _persist_submit_batch_job(slurmdb_cluster_rec_t *conn,
+				     sib_msg_t *sib_msg,
+				     submit_response_msg_t **resp)
+{
+	int rc = SLURM_PROTOCOL_SUCCESS;
+	slurm_msg_t req_msg, resp_msg;
+
+	*resp = NULL;
+
+	slurm_msg_t_init(&req_msg);
+	slurm_msg_t_init(&resp_msg);
+
+	req_msg.msg_type = REQUEST_SIB_SUBMIT_BATCH_JOB;
+	req_msg.data     = sib_msg;
+
+	rc = _send_recv_msg(conn, &req_msg, &resp_msg, false);
+	if (rc) {
+		rc = SLURM_PROTOCOL_ERROR;
+		goto end_it;
+	}
+
+	switch (resp_msg.msg_type) {
+	case RESPONSE_SLURM_RC:
+		if ((rc = ((return_code_msg_t *) resp_msg.data)->return_code)) {
+			slurm_seterrno(rc);
+			rc = SLURM_PROTOCOL_ERROR;
+		}
+		break;
+	case RESPONSE_SUBMIT_BATCH_JOB:
+		*resp = (submit_response_msg_t *) resp_msg.data;
+		resp_msg.data = NULL;
+		break;
+	default:
+		slurm_seterrno(SLURM_UNEXPECTED_MSG_ERROR);
+		rc = SLURM_PROTOCOL_ERROR;
+	}
+
+end_it:
+	slurm_free_msg_members(&resp_msg);
+
+	return rc;
+}
+
+static int _persist_allocte_resources(slurmdb_cluster_rec_t *conn,
+				      sib_msg_t *sib_msg,
+				      resource_allocation_response_msg_t **resp)
+{
+	int rc = SLURM_PROTOCOL_SUCCESS;
+	slurm_msg_t req_msg, resp_msg;
+
+	*resp = NULL;
+
+	slurm_msg_t_init(&req_msg);
+	slurm_msg_t_init(&resp_msg);
+
+	req_msg.msg_type = REQUEST_SIB_RESOURCE_ALLOCATION;
+	req_msg.data     = sib_msg;
+
+	rc = _send_recv_msg(conn, &req_msg, &resp_msg, false);
+	if (rc) {
+		rc = SLURM_PROTOCOL_ERROR;
+		goto end_it;
+	}
+
+	switch (resp_msg.msg_type) {
+	case RESPONSE_SLURM_RC:
+		if ((rc = ((return_code_msg_t *) resp_msg.data)->return_code)) {
+			slurm_seterrno(rc);
+			rc = SLURM_PROTOCOL_ERROR;
+		}
+		break;
+	case RESPONSE_RESOURCE_ALLOCATION:
+		*resp = (resource_allocation_response_msg_t *) resp_msg.data;
+		resp_msg.data = NULL;
+		break;
+	default:
+		slurm_seterrno(SLURM_UNEXPECTED_MSG_ERROR);
+		rc = SLURM_PROTOCOL_ERROR;
+	}
+
+end_it:
+	slurm_free_msg_members(&resp_msg);
+
+	return rc;
+}
+
+static int _persist_update_job(slurmdb_cluster_rec_t *conn,
+			       job_desc_msg_t *data)
+{
+	int rc;
+	slurm_msg_t req_msg;
+	slurm_msg_t resp_msg;
+
+	slurm_msg_t_init(&req_msg);
+	req_msg.msg_type = REQUEST_UPDATE_JOB;
+	req_msg.data     = data;
+
+	rc = _send_recv_msg(conn, &req_msg, &resp_msg, false);
+	if (rc == SLURM_SOCKET_ERROR)
+		return SLURM_ERROR;
+
+	switch (resp_msg.msg_type) {
+	case RESPONSE_SLURM_RC:
+		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
+		if (rc) {
+			slurm_free_msg_members(&resp_msg);
+			slurm_seterrno_ret(rc);
+		}
+		break;
+	default:
+		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
+	}
+
+	slurm_free_msg_members(&resp_msg);
+
+	return SLURM_PROTOCOL_SUCCESS;
+}
+
 extern int fed_mgr_init(void *db_conn)
 {
 	int rc = SLURM_SUCCESS;
@@ -567,10 +757,14 @@ extern int fed_mgr_fini()
 
 	lock_slurmctld(fed_write_lock);
 
-	slurm_persist_conn_recv_server_fini();
-
+	/* Call _leave_federation() before slurm_persist_conn_recv_server_fini()
+	 * as this will NULL out the cluster's recv persistent connection before
+	 * _server_fini() actually destroys it. That way the cluster's recv
+	 * connection won't be pointing to bad memory. */
 	_leave_federation();
 
+	slurm_persist_conn_recv_server_fini();
+
 	unlock_slurmctld(fed_write_lock);
 
 	return SLURM_SUCCESS;
@@ -771,7 +965,7 @@ extern slurmdb_federation_rec_t *fed_mgr_state_load(char *state_save_location)
 		 !list_count(ret_fed->cluster_list)) {
 		slurmdb_destroy_federation_rec(ret_fed);
 		ret_fed = NULL;
-		error("No feds retrieved");
+		debug("No feds to retrieve from state");
 	} else {
 		/* We want to free the connections here since they don't exist
 		 * anymore, but they were packed when state was saved. */
@@ -797,34 +991,17 @@ unpack_error:
 	return NULL;
 }
 
-extern int _find_sibling_by_ip(void *x, void *key)
+static int _find_sibling_by_id(void *x, void *key)
 {
 	slurmdb_cluster_rec_t *object = (slurmdb_cluster_rec_t *)x;
-	char *ip = (char *)key;
+	int id = (intptr_t)key;
 
-	if (!xstrcmp(object->control_host, ip))
+	if (object->fed.id == id)
 		return 1;
 
 	return 0;
 }
 
-extern char *fed_mgr_find_sibling_name_by_ip(char *ip)
-{
-	char *name = NULL;
-	slurmdb_cluster_rec_t *sibling = NULL;
-	slurmctld_lock_t fed_read_lock = {
-		NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK };
-
-	lock_slurmctld(fed_read_lock);
-	if (fed_mgr_fed_rec && fed_mgr_fed_rec->cluster_list &&
-	    (sibling = list_find_first(fed_mgr_fed_rec->cluster_list,
-				       _find_sibling_by_ip, ip)))
-		name = xstrdup(sibling->name);
-	unlock_slurmctld(fed_read_lock);
-
-	return name;
-}
-
 /*
  * Returns true if the cluster is part of a federation.
  */
@@ -950,3 +1127,752 @@ extern int fed_mgr_add_sibling_conn(slurm_persist_conn_t *persist_conn,
 
 	return rc;
 }
+
+static void _destroy_sib_willrun(void *object)
+{
+	sib_willrun_t *resp = (sib_willrun_t *)object;
+	if (resp) {
+		slurm_free_will_run_response_msg(resp->resp);
+		xfree(resp);
+	}
+}
+
+static void _xfree_f(void* p)
+{
+	xfree(p);
+}
+
+static void *_sib_will_run(void *arg)
+{
+	int rc = SLURM_SUCCESS;
+	sib_willrun_t *sib_willrun = (sib_willrun_t *)arg;
+
+	if (sib_willrun->sibling == fed_mgr_cluster_rec) {
+		char *err_msg = NULL;
+		struct job_record *job_ptr = NULL;
+		job_desc_msg_t *job_desc;
+		sib_msg_t *sib_msg = sib_willrun->sib_msg;
+		slurmctld_lock_t job_write_lock = {
+			NO_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK };
+
+		lock_slurmctld(job_write_lock);
+		job_desc = sib_msg->data;
+
+		if (job_desc->job_id == NO_VAL) {
+			/* Get a job_id now without incrementing the job_id
+			 * count. This prevents burning job_ids on will_runs */
+			job_desc->job_id = get_next_job_id(true);
+		}
+
+		rc = job_allocate(sib_msg->data, false, true,
+				  &sib_willrun->resp, true, sib_willrun->uid,
+				  &job_ptr, &err_msg, sib_msg->data_version);
+		unlock_slurmctld(job_write_lock);
+
+		if (rc)
+			debug2("%s: %s", __func__, slurm_strerror(rc));
+	} else if ((rc = _persist_job_will_run(sib_willrun->sibling,
+					       sib_willrun->sib_msg,
+					       &sib_willrun->resp))) {
+		error("Failed to get will_run response from sibling %s",
+		      sib_willrun->sibling->name);
+	}
+
+	sib_willrun->thread_rc = rc;
+
+	return NULL;
+}
+
+static int _sort_sib_will_runs(void *x, void *y)
+{
+	int idle_rc = 0;
+	sib_willrun_t *run1 = *(sib_willrun_t **)x;
+	sib_willrun_t *run2 = *(sib_willrun_t **)y;
+
+	if (!run1->resp)
+		return 1;
+	if (!run2->resp)
+		return -1;
+
+	if (run1->sibling->fed.weight < run2->sibling->fed.weight)
+		return -1;
+	if (run1->sibling->fed.weight > run2->sibling->fed.weight)
+		return 1;
+
+	/* pack jobs onto clusters with most avail resources. */
+	if (run1->resp->sys_usage_per < run2->resp->sys_usage_per)
+		idle_rc = 1;
+	if (run1->resp->sys_usage_per > run2->resp->sys_usage_per)
+		idle_rc = -1;
+
+	/* spread jobs across clusters */
+	if (fed_mgr_fed_rec->flags & FEDERATION_FLAG_LLC)
+		idle_rc *= -1;
+
+	return idle_rc;
+}
+
+/*
+ * Convert comma separated list of cluster names to bitmap of cluster ids.
+ */
+static uint64_t _cluster_names_to_ids(char *clusters)
+{
+	uint64_t cluster_ids = 0;
+	List cluster_names = list_create(slurm_destroy_char);
+
+	xassert(clusters);
+
+	if (!xstrcasecmp(clusters, "all"))
+		return INFINITE64;
+
+	if (slurm_addto_char_list(cluster_names, clusters)) {
+		ListIterator itr = list_iterator_create(cluster_names);
+		char *cluster_name;
+		slurmdb_cluster_rec_t *sibling;
+
+		while ((cluster_name = list_next(itr))) {
+			if ((sibling =
+			     list_find_first(fed_mgr_fed_rec->cluster_list,
+					     slurmdb_find_cluster_in_list,
+					     cluster_name))) {
+				cluster_ids |= FED_SIBLING_BIT(sibling->fed.id);
+			}
+		}
+		list_iterator_destroy(itr);
+	}
+	FREE_NULL_LIST(cluster_names);
+
+	return cluster_ids;
+}
+
+/*
+ * Get will_run responses from all clusters in a federation.
+ * IN msg - contains the original job_desc buffer to send to the siblings and
+ * 	to create a local job_desc copy for this cluster's own will_run.
+ * IN job_desc - original job_desc. It contains the federated job_id to put on
+ * 	the unpacked job_desc. This is not used for the actual will_run because
+ * 	job_allocate will modify the job_desc.
+ * IN uid - uid of user submitting the job
+ * RET returns a List of sib_willrun_t's, one per queried sibling.
+ */
+static List _get_sib_will_runs(slurm_msg_t *msg, job_desc_msg_t *job_desc,
+			       uid_t uid)
+{
+	sib_willrun_t *sib_willrun     = NULL;
+	slurmdb_cluster_rec_t *sibling = NULL;
+	ListIterator sib_itr, resp_itr;
+	List sib_willruns = NULL;
+	pthread_attr_t attr;
+	sib_msg_t sib_msg;
+	uint32_t buf_offset;
+	uint64_t cluster_list = INFINITE64; /* all clusters available */
+	slurm_msg_t tmp_msg;
+
+	xassert(job_desc);
+	xassert(msg);
+
+	slurm_attr_init(&attr);
+	sib_willruns = list_create(_destroy_sib_willrun);
+
+	/* Create copy of submitted job_desc since job_allocate() can modify the
+	 * original job_desc. */
+	buf_offset = get_buf_offset(msg->buffer);
+	slurm_msg_t_init(&tmp_msg);
+	tmp_msg.flags            = msg->flags;
+	tmp_msg.msg_type         = msg->msg_type;
+	tmp_msg.protocol_version = msg->protocol_version;
+
+	unpack_msg(&tmp_msg, msg->buffer);
+	set_buf_offset(msg->buffer, buf_offset);
+
+	((job_desc_msg_t *)tmp_msg.data)->job_id = job_desc->job_id;
+	sib_msg.data         = tmp_msg.data;
+	sib_msg.data_buffer  = msg->buffer;
+	sib_msg.data_version = msg->protocol_version;
+	sib_msg.data_type    = msg->msg_type;
+
+	if (job_desc->clusters)
+		cluster_list = _cluster_names_to_ids(job_desc->clusters);
+
+	/* willrun the sibling clusters */
+	sib_itr = list_iterator_create(fed_mgr_fed_rec->cluster_list);
+	while ((sibling = list_next(sib_itr))) {
+		if (!(cluster_list & FED_SIBLING_BIT(sibling->fed.id))) {
+			if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR)
+				info("skipping cluster %s -- not in cluster list to submit job to",
+				     sibling->name);
+
+			continue;
+		}
+
+		sib_willrun = xmalloc(sizeof(sib_willrun_t));
+		sib_willrun->sibling = sibling;
+		sib_willrun->uid     = uid;
+		sib_willrun->sib_msg = &sib_msg;
+
+		if (pthread_create(&sib_willrun->thread_id, &attr,
+				   _sib_will_run, sib_willrun) != 0) {
+			error("failed to create sib_will_run thread for sib %s",
+			      sibling->name);
+			_destroy_sib_willrun(sib_willrun);
+			continue;
+		}
+
+		list_append(sib_willruns, sib_willrun);
+	}
+	list_iterator_destroy(sib_itr);
+
+	slurm_attr_destroy(&attr);
+
+	resp_itr = list_iterator_create(sib_willruns);
+	while ((sib_willrun = list_next(resp_itr))) {
+		pthread_join(sib_willrun->thread_id, NULL);
+
+		if (sib_willrun->resp &&
+		    (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR)) {
+			char buf[64];
+			slurm_make_time_str(&sib_willrun->resp->start_time,
+					    buf, sizeof(buf));
+			info("will_run_resp for %s: "
+			     "start:%s sys_usage:%-6.2f weight:%d",
+			     sib_willrun->sibling->name, buf,
+			     sib_willrun->resp->sys_usage_per,
+			     sib_willrun->sibling->fed.weight);
+		}
+	}
+
+	list_iterator_destroy(resp_itr);
+
+	/* Free unpacked job_desc data */
+	slurm_free_msg_members(&tmp_msg);
+
+	return sib_willruns;
+}
+
+/*
+ * Find a sibling that can start the job now.
+ * IN msg - contains the original job_desc buffer to send to the siblings and
+ * 	to create a local job_desc copy for this cluster's own will_run.
+ * IN job_desc - original job_desc. It contains the federated job_id to put on
+ * 	the unpacked job_desc. This is not used for the actual will_run because
+ * 	job_allocate will modify the job_desc.
+ * IN uid - uid of user submitting the job
+ * OUT avail_sibs - bitmap of cluster ids that returned a will_run_response.
+ * RET returns a ptr to a cluster_rec that can start the job now or NULL if
+ * 	no cluster can start the job now.
+ */
+static slurmdb_cluster_rec_t *_find_start_now_sib(slurm_msg_t *msg,
+						  job_desc_msg_t *job_desc,
+						  uid_t uid,
+						  uint64_t *avail_sibs)
+{
+	ListIterator itr;
+	List sib_willruns;
+	sib_willrun_t *sib_willrun     = NULL;
+	sib_willrun_t *start_now_sib   = NULL;
+	slurmdb_cluster_rec_t *ret_sib = NULL;
+	time_t now = 0;
+
+	xassert(avail_sibs);
+	xassert(job_desc);
+	xassert(msg);
+
+	if (!(sib_willruns = _get_sib_will_runs(msg, job_desc, uid))) {
+		error("Failed to get any will_run responses from any sibs");
+		return NULL;
+	}
+
+	list_sort(sib_willruns, (ListCmpF)_sort_sib_will_runs);
+
+	now = time(NULL);
+
+	itr = list_iterator_create(sib_willruns);
+	while ((sib_willrun = list_next(itr))) {
+		if (!sib_willrun->resp) /* no response if job couldn't run? */
+			continue;
+
+		*avail_sibs |= FED_SIBLING_BIT(sib_willrun->sibling->fed.id);
+
+		/* Pick first sibling that can start the job now. siblings are
+		 * sorted by weight and resources. */
+		if (sib_willrun->resp->start_time <= now) {
+			start_now_sib = sib_willrun;
+			break;
+		}
+	}
+	if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR) {
+		if (start_now_sib)
+			info("Earliest cluster:%s time:%ld now:%ld",
+			     start_now_sib->sibling->name,
+			     start_now_sib->resp->start_time, now);
+		else
+			info("No siblings can start the job now (%ld))", now);
+	}
+	list_iterator_destroy(itr);
+
+	if (start_now_sib)
+		ret_sib = start_now_sib->sibling;
+
+	FREE_NULL_LIST(sib_willruns);
+
+	return ret_sib;
+}
+
+static void *_submit_sibling_allocation(void *arg)
+{
+	int rc = SLURM_SUCCESS;
+	resource_allocation_response_msg_t *alloc_resp = NULL;
+	sib_submit_t *sub = (sib_submit_t *)arg;
+	slurmdb_cluster_rec_t *sibling = sub->sibling;
+	sib_msg_t *sib_msg             = sub->sib_msg;
+
+	if ((rc = _persist_allocte_resources(sibling, sib_msg, &alloc_resp))) {
+		error("Failed to submit job to sibling %s: %m", sibling->name);
+	} else if (!alloc_resp) {
+		error("Got a success back without a resp. This shouldn't happen");
+		rc = SLURM_ERROR;
+	} else if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR) {
+		info("Submitted federated allocation %u to %s",
+		     alloc_resp->job_id, sibling->name);
+	}
+	sub->thread_rc = rc;
+
+	slurm_free_resource_allocation_response_msg(alloc_resp);
+
+	return NULL;
+}
+
+static void *_submit_sibling_batch_job(void *arg)
+{
+	int rc = SLURM_SUCCESS;
+	submit_response_msg_t *resp = NULL;
+	sib_submit_t *sub = (sib_submit_t *)arg;
+	slurmdb_cluster_rec_t *sibling = sub->sibling;
+	sib_msg_t *sib_msg             = sub->sib_msg;
+
+	if ((rc = _persist_submit_batch_job(sibling, sib_msg, &resp))) {
+		error("Failed to submit job to sibling %s: %m", sibling->name);
+	} else if (!resp) {
+		error("Got a success back without a resp. This shouldn't happen");
+		rc = SLURM_ERROR;
+	} else if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR) {
+		info("Submitted federated job %u to %s",
+		     resp->job_id, sibling->name);
+	}
+	sub->thread_rc = rc;
+
+	slurm_free_submit_response_response_msg(resp);
+
+	return NULL;
+}
+
+static void *_update_sibling_job(void *arg)
+{
+	sib_update_t *sub = (sib_update_t *)arg;
+	sub->thread_rc = _persist_update_job(sub->sibling, sub->job_desc);
+
+	return NULL;
+}
+
+/*
+ * Submit sibling jobs to designated (job_desc->fed_siblings) siblings.
+ *
+ * Will update job_desc->fed_siblings if a sibling fails to submit a job.
+ *
+ * IN job_desc - job_desc containing the job_id and fed_siblings of the job
+ * 	to be submitted.
+ * IN msg - contains the original job_desc buffer to send to the siblings.
+ * IN alloc_only - true if just an allocation. false if a batch job.
+ * RET returns SLURM_SUCCESS if all siblings received the job successfully or
+ * 	SLURM_ERROR if any sibling failed to receive the job. If a sibling
+ * 	fails, then the successful siblings will be updated with the correct
+ * 	sibling bitmap.
+ */
+static int _submit_sibling_jobs(job_desc_msg_t *job_desc, slurm_msg_t *msg,
+				bool alloc_only)
+{
+	int rc = SLURM_SUCCESS;
+	ListIterator sib_itr, thread_itr;
+	List submit_threads = NULL;
+	sib_submit_t *tmp_sub = NULL;
+	sib_msg_t sib_msg;
+	slurmdb_cluster_rec_t *sibling = NULL;
+	pthread_attr_t attr;
+
+	xassert(job_desc);
+	xassert(msg);
+
+	slurm_attr_init(&attr);
+	submit_threads = list_create(_xfree_f);
+
+	sib_msg.data_buffer  = msg->buffer;
+	sib_msg.data_type    = msg->msg_type;
+	sib_msg.data_version = msg->protocol_version;
+	sib_msg.fed_siblings = job_desc->fed_siblings;
+	sib_msg.job_id       = job_desc->job_id;
+
+	sib_itr = list_iterator_create(fed_mgr_fed_rec->cluster_list);
+	while ((sibling = list_next(sib_itr))) {
+		pthread_t thread_id = 0;
+		sib_submit_t *sub;
+
+		if (sibling == fed_mgr_cluster_rec)
+			continue;
+
+		/* fed_siblings was already set to the siblings that responded */
+		if (!(job_desc->fed_siblings &
+		      FED_SIBLING_BIT(sibling->fed.id)))
+			continue;
+
+		sub = xmalloc(sizeof(sib_submit_t));
+		sub->sibling = sibling;
+		sub->sib_msg = &sib_msg;
+		if (pthread_create(&thread_id, &attr,
+				   ((alloc_only) ?
+				    _submit_sibling_allocation :
+				    _submit_sibling_batch_job), sub) != 0) {
+			error("failed to create submit_sibling_job_thread");
+			xfree(sub);
+			continue;
+		}
+		sub->thread_id = thread_id;
+
+		list_append(submit_threads, sub);
+	}
+
+	thread_itr = list_iterator_create(submit_threads);
+	while ((tmp_sub = list_next(thread_itr))) {
+		pthread_join(tmp_sub->thread_id, NULL);
+		rc |= tmp_sub->thread_rc;
+
+		/* Remove the sibling from the job's sibling bitmap if there
+		 * was an error. The local cluster should stay in it if it's
+		 * there. */
+		if (tmp_sub->thread_rc)
+			job_desc->fed_siblings &=
+				(~FED_SIBLING_BIT(tmp_sub->sibling->fed.id));
+	}
+	list_iterator_destroy(thread_itr);
+
+	if (rc && job_desc->fed_siblings) {
+		/* failed to submit the job to a sibling. Need to update all of
+		 * the job's fed_siblings bitmaps */
+		List update_threads = list_create(_xfree_f);
+		job_desc_msg_t *job_update_msg =
+			xmalloc(sizeof(job_desc_msg_t));
+
+		slurm_init_job_desc_msg(job_update_msg);
+		job_update_msg->job_id       = job_desc->job_id;
+		job_update_msg->fed_siblings = job_desc->fed_siblings;
+
+		list_iterator_reset(sib_itr);
+		while ((sibling = list_next(sib_itr))) {
+			pthread_t thread_id = 0;
+			sib_update_t *sub;
+
+			/* Local is handled outside */
+			if (sibling == fed_mgr_cluster_rec)
+				continue;
+
+			if (!(job_desc->fed_siblings &
+			      FED_SIBLING_BIT(sibling->fed.id)))
+				continue;
+
+			sub = xmalloc(sizeof(sib_update_t));
+			sub->job_desc = job_update_msg;
+			sub->sibling  = sibling;
+			if (pthread_create(&thread_id, &attr,
+					   _update_sibling_job, sub) != 0) {
+				error("failed to create update_sibling_job thread");
+				xfree(sub);
+				continue;
+			}
+			sub->thread_id = thread_id;
+
+			list_append(update_threads, sub);
+		}
+
+		thread_itr = list_iterator_create(update_threads);
+		while ((tmp_sub = list_next(thread_itr))) {
+			pthread_join(tmp_sub->thread_id, NULL);
+			if (tmp_sub->thread_rc) {
+				error("failed to update sibling job with updated sibling bitmap on sibling %s",
+				      tmp_sub->sibling->name);
+				/* other cluster should get update when it syncs
+				 * up */
+			}
+		}
+		list_iterator_destroy(thread_itr);
+	}
+
+	slurm_attr_destroy(&attr);
+	list_iterator_destroy(sib_itr);
+	FREE_NULL_LIST(submit_threads);
+
+	return rc;
+}
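+
+/*
+ * Example of the failure path above (hypothetical fed ids, with
+ * FED_SIBLING_BIT(id) taken as bit id-1): with job_desc->fed_siblings == 0x7
+ * (siblings 1, 2 and 3), a failed submission to the sibling with fed id 2
+ * clears its bit, leaving 0x5, and the surviving siblings are then sent a job
+ * update carrying the corrected bitmap.
+ */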
+
+/* Determine how to submit a federated job.
+ *
+ * First tries to find a cluster that can start the job now. If a cluster can
+ * start the job now, then a sibling job is submitted to that cluster. If no
+ * cluster can start the job now, then sibling jobs are submitted to each
+ * sibling.
+ *
+ * Does its own locking (job and fed). Doesn't have a job write lock when
+ * communicating with siblings to prevent blocking on sibling communications.
+ *
+ * IN msg - msg that contains packed job_desc msg to send to siblings.
+ * IN job_desc - original job_desc msg.
+ * IN alloc_only - true if requesting just an allocation (srun/salloc).
+ * IN uid - uid of user requesting allocation.
+ * IN protocol_version - version of the code the caller is using
+ * OUT job_id_ptr - job_id of allocated job
+ * OUT alloc_code - error_code returned from job_allocate
+ * OUT err_msg - error message returned if any
+ * RET returns SLURM_SUCCESS if the allocation was successful, SLURM_ERROR
+ * 	otherwise.
+ */
+extern int fed_mgr_job_allocate(slurm_msg_t *msg, job_desc_msg_t *job_desc,
+				bool alloc_only, uid_t uid,
+				uint16_t protocol_version,
+				uint32_t *job_id_ptr, int *alloc_code,
+				char **err_msg)
+{
+	int rc = SLURM_SUCCESS;
+	slurmdb_cluster_rec_t *start_now_sib;
+	uint64_t avail_sibs = 0;
+	struct job_record *job_ptr = NULL;
+	slurmctld_lock_t fed_read_lock = {
+		NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK };
+	slurmctld_lock_t job_write_lock = {
+		NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK, NO_LOCK };
+
+	xassert(msg);
+	xassert(job_desc);
+	xassert(job_id_ptr);
+	xassert(alloc_code);
+	xassert(err_msg);
+
+	lock_slurmctld(fed_read_lock);
+
+	lock_slurmctld(job_write_lock);
+	/* get job_id now. Can't submit job to get job_id as job_allocate will
+	 * change the job_desc. */
+	job_desc->job_id = get_next_job_id(false);
+	unlock_slurmctld(job_write_lock);
+
+	/* Don't hold the job/node write lock around _find_start_now_sib().
+	 * It locks inside _sib_will_run(). */
+	start_now_sib = _find_start_now_sib(msg, job_desc, uid, &avail_sibs);
+
+	if (!avail_sibs) {
+		debug("No cluster responded to sibling will_runs, submitting to self");
+		avail_sibs = FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id);
+	}
+
+	if (start_now_sib == NULL) {
+		job_desc->fed_siblings = avail_sibs;
+	} else if (start_now_sib == fed_mgr_cluster_rec) {
+		job_desc->fed_siblings |=
+			FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id);
+	} else {
+		job_desc->fed_siblings |=
+			FED_SIBLING_BIT(start_now_sib->fed.id);
+	}
+
+	/* Submit local job first. Then submit to all siblings. If the local job
+	 * fails, then don't worry about sending to the siblings. */
+	lock_slurmctld(job_write_lock);
+	*alloc_code = job_allocate(job_desc, job_desc->immediate, false, NULL,
+				   alloc_only, uid, &job_ptr, err_msg,
+				   protocol_version);
+
+	if (!job_ptr || (*alloc_code && job_ptr->job_state == JOB_FAILED)) {
+		unlock_slurmctld(job_write_lock);
+		rc = SLURM_ERROR;
+		/* There may be an rc but the job won't be failed. It will sit
+		 * in the queue */
+		info("failed to submit federated job to local cluster");
+		goto end_it;
+	}
+
+	*job_id_ptr = job_ptr->job_id;
+
+	info("Submitted %sfederated job %u to %s(self)",
+	     (!(job_ptr->fed_details->siblings &
+		FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id)) ?
+	      "tracking " : ""),
+	     job_ptr->job_id, fed_mgr_cluster_rec->name);
+
+	unlock_slurmctld(job_write_lock);
+
+	if (_submit_sibling_jobs(job_desc, msg, alloc_only)) {
+		/* failed to submit a sibling job to a sibling. Need to update
+		 * the local job's sibling bitmap */
+
+		lock_slurmctld(job_write_lock);
+		if ((job_ptr->magic  == JOB_MAGIC) &&
+		    (job_ptr->job_id == *job_id_ptr)) {
+
+			if (!job_desc->fed_siblings) {
+				/* we know that we already have a job_ptr so
+				 * just make it a schedulable job. */
+				error("Failed to submit fed job to siblings, submitting to local cluster");
+				job_desc->fed_siblings |=
+					FED_SIBLING_BIT(
+						fed_mgr_cluster_rec->fed.id);
+			}
+			set_job_fed_details(job_ptr, job_desc->fed_siblings);
+		} else {
+			error("%s: job got messed up. this should never happen",
+			      __func__);
+		}
+
+		unlock_slurmctld(job_write_lock);
+	}
+
+end_it:
+	unlock_slurmctld(fed_read_lock);
+
+	return rc;
+}
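+
+/*
+ * Sketch of the sibling sets fed_mgr_job_allocate() produces (assuming
+ * job_desc->fed_siblings starts out empty for a new submission): if no
+ * sibling can start the job now, fed_siblings == avail_sibs (every cluster
+ * that answered the will_run); if the local cluster can start it now, only
+ * the local bit is set; otherwise only the start-now sibling's bit is set.
+ */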
+
+/* Tests if the job is a tracker-only federated job.
+ * Tracker-only job: a job that shouldn't run on the local cluster but should
+ * be kept around to facilitate communications for its sibling jobs on other
+ * clusters.
+ */
+extern bool fed_mgr_is_tracker_only_job(struct job_record *job_ptr)
+{
+	bool rc = false;
+	slurmctld_lock_t fed_read_lock = {
+		NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK };
+
+	xassert(job_ptr);
+
+	lock_slurmctld(fed_read_lock);
+
+	if (job_ptr->fed_details &&
+	    fed_mgr_cluster_rec &&
+	    (fed_mgr_get_cluster_id(job_ptr->job_id) ==
+	     fed_mgr_cluster_rec->fed.id) &&
+	    (!(job_ptr->fed_details->siblings &
+	      FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id))))
+		rc = true;
+
+	unlock_slurmctld(fed_read_lock);
+
+	return rc;
+}
+
+/* Return the cluster name for the given cluster id.
+ * Caller must xfree the returned string.
+ */
+extern char *fed_mgr_get_cluster_name(uint32_t id)
+{
+	slurmdb_cluster_rec_t *sibling;
+	char *name = NULL;
+
+	if (!fed_mgr_fed_rec || !fed_mgr_fed_rec->cluster_list)
+		return NULL;
+
+	if ((sibling =
+	     list_find_first(fed_mgr_fed_rec->cluster_list,
+			     _find_sibling_by_id,
+			     (void *)(intptr_t)id))) {
+		name = xstrdup(sibling->name);
+	}
+
+	return name;
+}
+
+/* Convert cluster ids to cluster names.
+ *
+ * RET: string of comma-separated cluster names.
+ *      Caller must xfree the returned string.
+ */
+extern char *fed_mgr_cluster_ids_to_names(uint64_t cluster_ids)
+{
+	int bit = 1;
+	char *names = NULL;
+
+	if (!fed_mgr_fed_rec || !fed_mgr_fed_rec->cluster_list)
+		return names;
+
+	while (cluster_ids) {
+		if (cluster_ids & 1) {
+			slurmdb_cluster_rec_t *sibling;
+			if ((sibling =
+			     list_find_first(fed_mgr_fed_rec->cluster_list,
+					     _find_sibling_by_id,
+					     (void *)(intptr_t)bit))){
+				xstrfmtcat(names, "%s%s",
+					   (names) ? "," : "", sibling->name);
+			} else {
+				error("Couldn't find a sibling cluster with id %d",
+				      bit);
+			}
+		}
+
+		cluster_ids >>= 1;
+		bit++;
+	}
+
+	return names;
+}
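+
+/*
+ * Example (hypothetical cluster names): with cluster_ids == 0x5 and siblings
+ * "fed1" (fed id 1) and "fed3" (fed id 3) in the federation,
+ * fed_mgr_cluster_ids_to_names(0x5) returns "fed1,fed3".
+ */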
+
+/* Find the earliest time a job can start by sending will_run requests to all
+ * clusters in the federation and returning the earliest start time.
+ *
+ * IN msg - msg that contains packed job_desc msg to send to siblings.
+ * IN job_desc - original job_desc msg.
+ * IN uid - uid of user requesting will_run.
+ * OUT resp - will_run_response to return
+ * RET SLURM_SUCCESS if a will_run_response is found, SLURM_ERROR
+ * 	otherwise.
+ */
+extern int fed_mgr_sib_will_run(slurm_msg_t *msg, job_desc_msg_t *job_desc,
+				uid_t uid, will_run_response_msg_t **resp)
+{
+	int rc = SLURM_SUCCESS;
+	ListIterator itr;
+	List sib_willruns;
+	sib_willrun_t *sib_willrun;
+	sib_willrun_t *earliest_willrun = NULL;
+	slurmctld_lock_t fed_read_lock = {
+		NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK };
+
+	xassert(msg);
+	xassert(job_desc);
+	xassert(resp);
+
+	*resp = NULL;
+
+	lock_slurmctld(fed_read_lock);
+
+	if (!(sib_willruns = _get_sib_will_runs(msg, job_desc, uid))) {
+		error("Failed to get any will_run responses from any sibs");
+		unlock_slurmctld(fed_read_lock);
+		return SLURM_ERROR;
+	}
+
+	itr = list_iterator_create(sib_willruns);
+	while ((sib_willrun = list_next(itr))) {
+		if (!sib_willrun->resp) /* no response if job couldn't run? */
+			continue;
+
+		if ((earliest_willrun == NULL) ||
+		    (sib_willrun->resp->start_time <
+		     earliest_willrun->resp->start_time))
+			earliest_willrun = sib_willrun;
+	}
+	list_iterator_destroy(itr);
+
+	if (earliest_willrun) {
+		*resp = earliest_willrun->resp;
+		earliest_willrun->resp = NULL;
+	} else {
+		rc = SLURM_ERROR;
+	}
+
+	FREE_NULL_LIST(sib_willruns);
+	unlock_slurmctld(fed_read_lock);
+
+	return rc;
+}
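+
+/*
+ * Example for fed_mgr_sib_will_run() above (hypothetical times): if the
+ * sibling will_run responses report start_time values of 1500000300,
+ * 1500000060 and 1500000180, the response carrying 1500000060 is detached
+ * and handed back to the caller; the rest are freed with the list.
+ */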
diff --git a/src/slurmctld/fed_mgr.h b/src/slurmctld/fed_mgr.h
index b8fa7935cffe9f99c5dda575fa00e27f7bb8ae87..f85782df39d342082c8155b3a3f68f00a3a6efd4 100644
--- a/src/slurmctld/fed_mgr.h
+++ b/src/slurmctld/fed_mgr.h
@@ -42,17 +42,26 @@
 
 extern slurmdb_federation_rec_t *fed_mgr_fed_rec;
 
-extern int fed_mgr_init(void *db_conn);
-extern int fed_mgr_fini();
-extern int fed_mgr_update_feds(slurmdb_update_object_t *update);
-extern int fed_mgr_state_save(char *state_save_location);
+extern int       fed_mgr_add_sibling_conn(slurm_persist_conn_t *persist_conn,
+					  char **out_buffer);
+extern char     *fed_mgr_cluster_ids_to_names(uint64_t cluster_ids);
+extern int       fed_mgr_fini();
+extern uint32_t  fed_mgr_get_cluster_id(uint32_t id);
+extern char     *fed_mgr_get_cluster_name(uint32_t id);
+extern uint32_t  fed_mgr_get_job_id(uint32_t orig);
+extern uint32_t  fed_mgr_get_local_id(uint32_t id);
+extern int       fed_mgr_init(void *db_conn);
+extern bool      fed_mgr_is_active();
+extern bool      fed_mgr_is_tracker_only_job(struct job_record *job_ptr);
+extern int       fed_mgr_job_allocate(slurm_msg_t *msg,
+				      job_desc_msg_t *job_desc, bool alloc_only,
+				      uid_t uid, uint16_t protocol_version,
+				      uint32_t *job_id_ptr, int *alloc_code,
+				      char **err_msg);
+extern int       fed_mgr_sib_will_run(slurm_msg_t *msg,
+				      job_desc_msg_t *job_desc, uid_t uid,
+				      will_run_response_msg_t **resp);
 extern slurmdb_federation_rec_t *fed_mgr_state_load(char *state_save_location);
-extern char *fed_mgr_find_sibling_name_by_ip(char *ip);
-extern bool fed_mgr_is_active();
-extern uint32_t fed_mgr_get_job_id(uint32_t orig);
-extern uint32_t fed_mgr_get_local_id(uint32_t id);
-extern uint32_t fed_mgr_get_cluster_id(uint32_t id);
-extern int fed_mgr_add_sibling_conn(slurm_persist_conn_t *persist_conn,
-				    char **out_buffer);
-
+extern int       fed_mgr_state_save(char *state_save_location);
+extern int       fed_mgr_update_feds(slurmdb_update_object_t *update);
 #endif /* _SLURM_FED_MGR_H */
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index b18b0120f120c16ec3b566417af0b000b4fc239f..ad3d9491befc891b4eabe08789eb91229549ac4a 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -176,6 +176,7 @@ static slurmdb_qos_rec_t *_determine_and_validate_qos(
 	bool admin, slurmdb_qos_rec_t *qos_rec,	int *error_code, bool locked);
 static void _dump_job_details(struct job_details *detail_ptr, Buf buffer);
 static void _dump_job_state(struct job_record *dump_job_ptr, Buf buffer);
+static void _free_job_fed_details(job_fed_details_t **fed_details_pptr);
 static void _get_batch_job_dir_ids(List batch_dirs);
 static time_t _get_last_state_write_time(void);
 static void _job_array_comp(struct job_record *job_ptr, bool was_running);
@@ -3531,6 +3532,9 @@ void dump_job_desc(job_desc_msg_t * job_specs)
 	int spec_count;
 	char *mem_type, buf[100], *signal_flags, *spec_type, *job_id;
 
+	if (get_log_level() < LOG_LEVEL_DEBUG3)
+		return;
+
 	if (job_specs == NULL)
 		return;
 
@@ -6535,6 +6539,9 @@ extern int validate_job_create_req(job_desc_msg_t * job_desc, uid_t submit_uid,
 	if (rc != SLURM_SUCCESS)
 		return rc;
 
+	if (job_desc->array_inx && fed_mgr_is_active())
+		return ESLURM_NOT_SUPPORTED;
+
 	if (!_valid_array_inx(job_desc))
 		return ESLURM_INVALID_ARRAY;
 
@@ -7338,6 +7345,8 @@ _copy_job_desc_to_job_record(job_desc_msg_t * job_desc,
 	}
 	if (job_desc->features)
 		detail_ptr->features = xstrdup(job_desc->features);
+	if (job_desc->fed_siblings)
+		set_job_fed_details(job_ptr, job_desc->fed_siblings);
 	if ((job_desc->shared == JOB_SHARED_NONE) && (select_serial == 0)) {
 		detail_ptr->share_res  = 0;
 		detail_ptr->whole_node = 1;
@@ -8240,6 +8249,7 @@ static void _list_delete_job(void *job_entry)
 	xfree(job_ptr->burst_buffer);
 	checkpoint_free_jobinfo(job_ptr->check_job);
 	xfree(job_ptr->comment);
+	_free_job_fed_details(&job_ptr->fed_details);
 	free_job_resources(&job_ptr->job_resrcs);
 	xfree(job_ptr->gres);
 	xfree(job_ptr->gres_alloc);
@@ -8419,8 +8429,13 @@ static bool _all_parts_hidden(struct job_record *job_ptr)
 }
 
 /* Determine if a given job should be seen by a specific user */
-static bool _hide_job(struct job_record *job_ptr, uid_t uid)
+static bool _hide_job(struct job_record *job_ptr, uid_t uid,
+		      uint16_t show_flags)
 {
+	if (!(show_flags & SHOW_FED_TRACK) &&
+	    job_ptr->fed_details && fed_mgr_is_tracker_only_job(job_ptr))
+		return true;
+
 	if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS) &&
 	    (job_ptr->user_id != uid) && !validate_operator(uid) &&
 	    (((slurm_mcs_get_privatedata() == 0) &&
@@ -8474,7 +8489,7 @@ extern void pack_all_jobs(char **buffer_ptr, int *buffer_size,
 		    _all_parts_hidden(job_ptr))
 			continue;
 
-		if (_hide_job(job_ptr, uid))
+		if (_hide_job(job_ptr, uid, show_flags))
 			continue;
 
 		if ((filter_uid != NO_VAL) && (filter_uid != job_ptr->user_id))
@@ -8529,7 +8544,7 @@ extern int pack_one_job(char **buffer_ptr, int *buffer_size,
 	job_ptr = find_job_record(job_id);
 	if (job_ptr && (job_ptr->array_task_id == NO_VAL) &&
 	    !job_ptr->array_recs) {
-		if (!_hide_job(job_ptr, uid)) {
+		if (!_hide_job(job_ptr, uid, show_flags)) {
 			pack_job(job_ptr, show_flags, buffer, protocol_version,
 				 uid);
 			jobs_packed++;
@@ -8540,7 +8555,7 @@ extern int pack_one_job(char **buffer_ptr, int *buffer_size,
 		/* Either the job is not found or it is a job array */
 		if (job_ptr) {
 			packed_head = true;
-			if (!_hide_job(job_ptr, uid)) {
+			if (!_hide_job(job_ptr, uid, show_flags)) {
 				pack_job(job_ptr, show_flags, buffer,
 					 protocol_version, uid);
 				jobs_packed++;
@@ -8552,7 +8567,7 @@ extern int pack_one_job(char **buffer_ptr, int *buffer_size,
 			if ((job_ptr->job_id == job_id) && packed_head) {
 				;	/* Already packed */
 			} else if (job_ptr->array_job_id == job_id) {
-				if (_hide_job(job_ptr, uid))
+				if (_hide_job(job_ptr, uid, show_flags))
 					break;
 				pack_job(job_ptr, show_flags, buffer,
 					 protocol_version, uid);
@@ -8777,6 +8792,17 @@ void pack_job(struct job_record *dump_job_ptr, uint16_t show_flags, Buf buffer,
 		packstr(dump_job_ptr->tres_fmt_alloc_str, buffer);
 		packstr(dump_job_ptr->tres_fmt_req_str, buffer);
 		pack16(dump_job_ptr->start_protocol_ver, buffer);
+
+		if (dump_job_ptr->fed_details) {
+			packstr(dump_job_ptr->fed_details->origin_str, buffer);
+			pack64(dump_job_ptr->fed_details->siblings, buffer);
+			packstr(dump_job_ptr->fed_details->siblings_str,
+				buffer);
+		} else {
+			packnull(buffer);
+			pack64((uint64_t)0, buffer);
+			packnull(buffer);
+		}
 	} else if (protocol_version >= SLURM_16_05_PROTOCOL_VERSION) {
 		detail_ptr = dump_job_ptr->details;
 		pack32(dump_job_ptr->array_job_id, buffer);
@@ -9871,18 +9897,44 @@ void reset_first_job_id(void)
 }
 
 /*
- * get_next_job_id - return the job_id to be used by default for
- *	the next job
+ * Return the next available job_id to be used.
+ *
+ * Must have job_write and fed_read locks when grabbing a job_id
+ *
+ * IN test_only - if true, doesn't advance the job_id sequence, just returns
+ * 	what the next job id will be.
+ * RET a valid job_id or SLURM_ERROR if all job_ids are exhausted.
  */
-extern uint32_t get_next_job_id(void)
+extern uint32_t get_next_job_id(bool test_only)
 {
-	uint32_t next_id;
+	int i;
+	uint32_t new_id, max_jobs, tmp_id_sequence;
 
-	job_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id);
-	next_id = job_id_sequence + 1;
-	if (next_id >= slurmctld_conf.max_job_id)
-		next_id = slurmctld_conf.first_job_id;
-	return next_id;
+	max_jobs = slurmctld_conf.max_job_id - slurmctld_conf.first_job_id;
+	tmp_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id);
+
+	/* Ensure no conflict in job id if we roll over 32 bits */
+	for (i = 0; i < max_jobs; i++) {
+		if (++tmp_id_sequence >= slurmctld_conf.max_job_id)
+			tmp_id_sequence = slurmctld_conf.first_job_id;
+
+		new_id = fed_mgr_get_job_id(tmp_id_sequence);
+
+		if (find_job_record(new_id))
+			continue;
+		if (_dup_job_file_test(new_id))
+			continue;
+
+		if (!test_only)
+			job_id_sequence = tmp_id_sequence;
+
+		return new_id;
+	}
+
+	error("We have exhausted our supply of valid job id values. "
+	      "FirstJobId=%u MaxJobId=%u", slurmctld_conf.first_job_id,
+	      slurmctld_conf.max_job_id);
+	return SLURM_ERROR;
 }
 
 /*
@@ -9891,38 +9943,20 @@ extern uint32_t get_next_job_id(void)
  */
 static int _set_job_id(struct job_record *job_ptr)
 {
-	int i;
-	uint32_t new_id, max_jobs;
+	uint32_t new_id;
 
 	xassert(job_ptr);
 	xassert (job_ptr->magic == JOB_MAGIC);
 
-	max_jobs = slurmctld_conf.max_job_id - slurmctld_conf.first_job_id;
-	job_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id);
-
-	/* Insure no conflict in job id if we roll over 32 bits */
-	for (i = 0; i < max_jobs; i++) {
-		if (++job_id_sequence >= slurmctld_conf.max_job_id)
-			job_id_sequence = slurmctld_conf.first_job_id;
-		new_id = job_id_sequence;
-		if (find_job_record(new_id))
-			continue;
-		if (_dup_job_file_test(new_id))
-			continue;
-
-		if (fed_mgr_is_active())
-			job_ptr->job_id = fed_mgr_get_job_id(new_id);
-		else
-			job_ptr->job_id = new_id;
+	if ((new_id = get_next_job_id(false)) != SLURM_ERROR) {
+		job_ptr->job_id = new_id;
 		/* When we get a new job id might as well make sure
 		 * the db_index is 0 since there is no way it will be
 		 * correct otherwise :). */
 		job_ptr->db_index = 0;
 		return SLURM_SUCCESS;
 	}
-	error("We have exhausted our supply of valid job id values. "
-	      "FirstJobId=%u MaxJobId=%u", slurmctld_conf.first_job_id,
-	      slurmctld_conf.max_job_id);
+
 	job_ptr->job_id = NO_VAL;
 	return EAGAIN;
 }
@@ -12038,6 +12072,23 @@ static int _update_job(struct job_record *job_ptr, job_desc_msg_t * job_specs,
 		}
 	}
 
+	if (job_specs->fed_siblings) {
+		slurmctld_lock_t fed_read_lock = {
+			NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK };
+		if (job_ptr->fed_details)
+			info("update_job: setting fed_siblings from %"PRIu64" to %"PRIu64" for job_id %u",
+			     job_ptr->fed_details->siblings,
+			     job_specs->fed_siblings,
+			     job_ptr->job_id);
+		else
+			info("update_job: setting fed_siblings to %"PRIu64" for job_id %u",
+			     job_specs->fed_siblings,
+			     job_ptr->job_id);
+		lock_slurmctld(fed_read_lock);
+		set_job_fed_details(job_ptr, job_specs->fed_siblings);
+		unlock_slurmctld(fed_read_lock);
+	}
+
 fini:
 	/* This was a local variable, so set it back to NULL */
 	job_specs->tres_req_cnt = NULL;
@@ -16072,3 +16123,37 @@ _kill_dependent(struct job_record *job_ptr)
 	last_job_update = now;
 	srun_allocate_abort(job_ptr);
 }
+
+static void _free_job_fed_details(job_fed_details_t **fed_details_pptr)
+{
+	job_fed_details_t *fed_details_ptr = *fed_details_pptr;
+
+	if (fed_details_ptr) {
+		xfree(fed_details_ptr->origin_str);
+		xfree(fed_details_ptr->siblings_str);
+		xfree(fed_details_ptr);
+		*fed_details_pptr = NULL;
+	}
+}
+
+extern void set_job_fed_details(struct job_record *job_ptr,
+				uint64_t fed_siblings)
+{
+	xassert(job_ptr);
+
+	if (!job_ptr->fed_details) {
+		job_ptr->fed_details =
+			xmalloc(sizeof(job_fed_details_t));
+	} else {
+		xfree(job_ptr->fed_details->siblings_str);
+		xfree(job_ptr->fed_details->origin_str);
+	}
+
+	job_ptr->fed_details->siblings = fed_siblings;
+	job_ptr->fed_details->siblings_str =
+		fed_mgr_cluster_ids_to_names(fed_siblings);
+	job_ptr->fed_details->origin_str =
+		fed_mgr_get_cluster_name(
+				fed_mgr_get_cluster_id(job_ptr->job_id));
+}
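+
+/*
+ * Example for set_job_fed_details() (hypothetical values): for a job whose
+ * job_id carries origin cluster id 1 in its upper bits (squeue's fedoriginraw
+ * prints job_id >> 26) and whose fed_siblings == 0x3, origin_str becomes that
+ * origin cluster's name and siblings_str a list such as "fed1,fed2".
+ */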
diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c
index 103f9a04140be7926637fd23b0ceee10421219e7..5b9be28c8fa7fb3eef73a0d545ff710f0580e229 100644
--- a/src/slurmctld/job_scheduler.c
+++ b/src/slurmctld/job_scheduler.c
@@ -75,6 +75,7 @@
 #include "src/slurmctld/acct_policy.h"
 #include "src/slurmctld/agent.h"
 #include "src/slurmctld/burst_buffer.h"
+#include "src/slurmctld/fed_mgr.h"
 #include "src/slurmctld/front_end.h"
 #include "src/slurmctld/job_scheduler.h"
 #include "src/slurmctld/licenses.h"
@@ -278,6 +279,9 @@ static bool _job_runnable_test1(struct job_record *job_ptr, bool sched_plugin)
 	if (!IS_JOB_PENDING(job_ptr) || IS_JOB_COMPLETING(job_ptr))
 		return false;
 
+	if (job_ptr->fed_details && fed_mgr_is_tracker_only_job(job_ptr))
+		return false;
+
 	select_g_select_jobinfo_get(job_ptr->select_jobinfo,
 				    SELECT_JOBDATA_CLEANING,
 				    &cleaning);
@@ -3146,7 +3150,8 @@ static void _delayed_job_start_time(struct job_record *job_ptr)
 		if (!IS_JOB_PENDING(job_q_ptr) || !job_q_ptr->details ||
 		    (job_q_ptr->part_ptr != job_ptr->part_ptr) ||
 		    (job_q_ptr->priority < job_ptr->priority) ||
-		    (job_q_ptr->job_id == job_ptr->job_id))
+		    (job_q_ptr->job_id == job_ptr->job_id) ||
+		    (fed_mgr_is_tracker_only_job(job_q_ptr)))
 			continue;
 		if (job_q_ptr->details->min_nodes == NO_VAL)
 			job_size_nodes = 1;
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index f5b9a38bf4d3c6e3b7b7b710f1ac291071cb6a9f..6f9219d3d856465b0e82085142cd619486b15714 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -165,7 +165,7 @@ inline static void  _slurm_rpc_job_sbcast_cred(slurm_msg_t * msg);
 inline static void  _slurm_rpc_job_step_kill(slurm_msg_t * msg);
 inline static void  _slurm_rpc_job_step_create(slurm_msg_t * msg);
 inline static void  _slurm_rpc_job_step_get_info(slurm_msg_t * msg);
-inline static void  _slurm_rpc_job_will_run(slurm_msg_t * msg);
+inline static void  _slurm_rpc_job_will_run(slurm_msg_t * msg, bool allow_sibs);
 inline static void  _slurm_rpc_job_alloc_info(slurm_msg_t * msg);
 inline static void  _slurm_rpc_job_alloc_info_lite(slurm_msg_t * msg);
 inline static void  _slurm_rpc_kill_job2(slurm_msg_t *msg);
@@ -356,8 +356,61 @@ void slurmctld_req(slurm_msg_t *msg, connection_arg_t *arg)
 		_slurm_rpc_job_step_get_info(msg);
 		break;
 	case REQUEST_JOB_WILL_RUN:
-		_slurm_rpc_job_will_run(msg);
+		_slurm_rpc_job_will_run(msg, true);
 		break;
+	case REQUEST_SIB_JOB_WILL_RUN:
+	{
+		sib_msg_t *sib_msg       = msg->data;
+		job_desc_msg_t *job_desc = sib_msg->data;
+
+		msg->data = job_desc;
+		_slurm_rpc_job_will_run(msg, false);
+		msg->data = sib_msg;
+
+		break;
+	}
+	case REQUEST_SIB_SUBMIT_BATCH_JOB:
+	{
+		uint16_t tmp_version     = msg->protocol_version;
+		sib_msg_t *sib_msg       = msg->data;
+		job_desc_msg_t *job_desc = sib_msg->data;
+		job_desc->job_id         = sib_msg->job_id;
+		job_desc->fed_siblings   = sib_msg->fed_siblings;
+
+		/* set protocol version to that of the client's version so that
+		 * the job's start_protocol_version is that of the client's and
+		 * not the calling controller's. */
+		msg->protocol_version = sib_msg->data_version;
+		msg->data = job_desc;
+
+		_slurm_rpc_submit_batch_job(msg);
+
+		msg->data = sib_msg;
+		msg->protocol_version = tmp_version;
+
+		break;
+	}
+	case REQUEST_SIB_RESOURCE_ALLOCATION:
+	{
+		uint16_t tmp_version     = msg->protocol_version;
+		sib_msg_t *sib_msg       = msg->data;
+		job_desc_msg_t *job_desc = sib_msg->data;
+		job_desc->job_id         = sib_msg->job_id;
+		job_desc->fed_siblings   = sib_msg->fed_siblings;
+
+		/* set protocol version to that of the client's version so that
+		 * the job's start_protocol_version is that of the client's and
+		 * not the calling controller's. */
+		msg->protocol_version = sib_msg->data_version;
+		msg->data = job_desc;
+
+		_slurm_rpc_allocate_resources(msg);
+
+		msg->data = sib_msg;
+		msg->protocol_version = tmp_version;
+
+		break;
+	}
 	case MESSAGE_NODE_REGISTRATION_STATUS:
 		_slurm_rpc_node_registration(msg, 0);
 		break;
@@ -605,8 +658,20 @@ static void _throttle_fini(int *active_rpc_cnt)
  */
 static void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr)
 {
-	char *licenses_used = get_licenses_used();  /* Do before config lock */
-	slurm_ctl_conf_t *conf = slurm_conf_lock();
+	slurm_ctl_conf_t *conf;
+	char *licenses_used;
+	uint32_t next_job_id;
+	slurmctld_lock_t job_write_lock = {
+		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, READ_LOCK };
+
+	/* Do before config lock */
+	licenses_used = get_licenses_used();
+
+	lock_slurmctld(job_write_lock);
+	next_job_id   = get_next_job_id(true);
+	unlock_slurmctld(job_write_lock);
+
+	conf = slurm_conf_lock();
 
 	memset(conf_ptr, 0, sizeof(slurm_ctl_conf_t));
 
@@ -736,7 +801,7 @@ static void _fill_ctld_conf(slurm_ctl_conf_t * conf_ptr)
 	conf_ptr->msg_aggr_params     = xstrdup(conf->msg_aggr_params);
 	conf_ptr->msg_timeout         = conf->msg_timeout;
 
-	conf_ptr->next_job_id         = get_next_job_id();
+	conf_ptr->next_job_id         = next_job_id;
 	conf_ptr->node_features_plugins = xstrdup(conf->node_features_plugins);
 	conf_ptr->node_prefix         = xstrdup(conf->node_prefix);
 
@@ -993,10 +1058,10 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
 	resource_allocation_response_msg_t alloc_msg;
 	/* Locks: Read config, read job, read node, read partition */
 	slurmctld_lock_t job_read_lock = {
-		READ_LOCK, READ_LOCK, READ_LOCK, READ_LOCK, NO_LOCK };
+		READ_LOCK, READ_LOCK, READ_LOCK, READ_LOCK, READ_LOCK };
 	/* Locks: Read config, write job, write node, read partition */
 	slurmctld_lock_t job_write_lock = {
-		READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK };
+		READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
 	uid_t uid = g_slurm_auth_get_uid(msg->auth_cred,
 					 slurmctld_config.auth_info);
 	int immediate = job_desc_msg->immediate;
@@ -1056,24 +1121,60 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
 		if (error_code == SLURM_SUCCESS) {
 			do_unlock = true;
 			_throttle_start(&active_rpc_cnt);
-			lock_slurmctld(job_write_lock);
 
-			error_code = job_allocate(job_desc_msg, immediate,
-						  false, NULL,
-						  true, uid, &job_ptr,
-						  &err_msg,
-						  msg->protocol_version);
-			/* unlock after finished using the job structure data */
+			if (job_desc_msg->job_id == SLURM_BATCH_SCRIPT &&
+			    fed_mgr_is_active()) {
+				uint32_t job_id;
+				if (fed_mgr_job_allocate(
+							msg, job_desc_msg, true,
+							uid,
+							msg->protocol_version,
+							&job_id, &error_code,
+							&err_msg)) {
+					do_unlock = false;
+					_throttle_fini(&active_rpc_cnt);
+					reject_job = true;
+				} else {
+					/* fed_mgr_job_allocate grabs and
+					 * releases job_write_lock on its own to
+					 * prevent waiting/locking on siblings
+					 * to reply. Now grab the lock and grab
+					 * the jobid. */
+					lock_slurmctld(job_write_lock);
+					if (!(job_ptr =
+					      find_job_record(job_id))) {
+						error("%s: can't find fed job that was just created. this should never happen",
+						      __func__);
+						reject_job = true;
+						error_code = SLURM_ERROR;
+					}
+				}
+			} else {
+				lock_slurmctld(job_write_lock);
+
+				error_code = job_allocate(
+						job_desc_msg, immediate, false,
+						NULL, true, uid, &job_ptr,
+						&err_msg,
+						msg->protocol_version);
+				/* unlock after finished using the job structure
+				 * data */
+
+				/* return result */
+				if (!job_ptr ||
+				    (error_code &&
+				     job_ptr->job_state == JOB_FAILED))
+					reject_job = true;
+			}
 			END_TIMER2("_slurm_rpc_allocate_resources");
 		}
-	} else if (errno)
-		error_code = errno;
-	else
-		error_code = SLURM_ERROR;
-
-	/* return result */
-	if (!job_ptr || (error_code && job_ptr->job_state == JOB_FAILED))
+	} else {
 		reject_job = true;
+		if (errno)
+			error_code = errno;
+		else
+			error_code = SLURM_ERROR;
+	}
 
 	if (!reject_job) {
 		xassert(job_ptr);
@@ -1161,6 +1262,7 @@ static void _slurm_rpc_allocate_resources(slurm_msg_t * msg)
 		}
 
 		slurm_msg_t_init(&response_msg);
+		response_msg.conn = msg->conn;
 		response_msg.flags = msg->flags;
 		response_msg.protocol_version = msg->protocol_version;
 		response_msg.msg_type = RESPONSE_RESOURCE_ALLOCATION;
@@ -2465,7 +2567,7 @@ static bool _is_valid_will_run_user(job_desc_msg_t *job_desc_msg, uid_t uid)
 
 /* _slurm_rpc_job_will_run - process RPC to determine if job with given
  *	configuration can be initiated */
-static void _slurm_rpc_job_will_run(slurm_msg_t * msg)
+static void _slurm_rpc_job_will_run(slurm_msg_t * msg, bool allow_sibs)
 {
 	/* init */
 	DEF_TIMERS;
@@ -2474,10 +2576,10 @@ static void _slurm_rpc_job_will_run(slurm_msg_t * msg)
 	job_desc_msg_t *job_desc_msg = (job_desc_msg_t *) msg->data;
 	/* Locks: Read config, read job, read node, read partition */
 	slurmctld_lock_t job_read_lock = {
-		READ_LOCK, READ_LOCK, READ_LOCK, READ_LOCK, NO_LOCK };
+		READ_LOCK, READ_LOCK, READ_LOCK, READ_LOCK, READ_LOCK };
 	/* Locks: Write job, Write node, read partition */
 	slurmctld_lock_t job_write_lock = {
-		NO_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK };
+		NO_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
 	uid_t uid = g_slurm_auth_get_uid(msg->auth_cred,
 					 slurmctld_config.auth_info);
 	uint16_t port;	/* dummy value */
@@ -2512,18 +2614,37 @@ static void _slurm_rpc_job_will_run(slurm_msg_t * msg)
 				 job_desc_msg->resp_host, 16);
 		dump_job_desc(job_desc_msg);
 		if (error_code == SLURM_SUCCESS) {
-			lock_slurmctld(job_write_lock);
 			if (job_desc_msg->job_id == NO_VAL) {
-				error_code = job_allocate(job_desc_msg, false,
-							  true, &resp,
-							  true, uid, &job_ptr,
-							  &err_msg,
-							  msg->protocol_version);
+				if (allow_sibs && fed_mgr_is_active()) {
+					/* don't job_write lock here. fed_mgr
+					 * locks around the job_allocate when
+					 * doing a will_run to itself. */
+					error_code =
+						fed_mgr_sib_will_run(
+							msg, job_desc_msg, uid,
+							&resp);
+				} else {
+					lock_slurmctld(job_write_lock);
+
+					/* Get a job_id now without incrementing
+					 * the job_id count. This prevents
+					 * burning job_ids on will_runs */
+					job_desc_msg->job_id =
+						get_next_job_id(true);
+
+					error_code = job_allocate(
+							job_desc_msg, false,
+							true, &resp, true, uid,
+							&job_ptr, &err_msg,
+							msg->protocol_version);
+					unlock_slurmctld(job_write_lock);
+				}
 			} else {	/* existing job test */
+				lock_slurmctld(job_write_lock);
 				error_code = job_start_data(job_desc_msg,
 							    &resp);
+				unlock_slurmctld(job_write_lock);
 			}
-			unlock_slurmctld(job_write_lock);
 			END_TIMER2("_slurm_rpc_job_will_run");
 		}
 	} else if (errno)
@@ -3380,17 +3501,17 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 	static int active_rpc_cnt = 0;
 	int error_code = SLURM_SUCCESS;
 	DEF_TIMERS;
-	uint32_t step_id = 0;
+	uint32_t step_id = SLURM_BATCH_SCRIPT, job_id = 0;
 	struct job_record *job_ptr = NULL;
 	slurm_msg_t response_msg;
 	submit_response_msg_t submit_msg;
 	job_desc_msg_t *job_desc_msg = (job_desc_msg_t *) msg->data;
 	/* Locks: Read config, read job, read node, read partition */
 	slurmctld_lock_t job_read_lock = {
-		READ_LOCK, READ_LOCK, READ_LOCK, READ_LOCK, NO_LOCK };
+		READ_LOCK, READ_LOCK, READ_LOCK, READ_LOCK, READ_LOCK };
 	/* Locks: Write job, read node, read partition */
 	slurmctld_lock_t job_write_lock = {
-		NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK, NO_LOCK };
+		NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK, READ_LOCK };
 	uid_t uid = g_slurm_auth_get_uid(msg->auth_cred,
 					 slurmctld_config.auth_info);
 	char *err_msg = NULL;
@@ -3416,6 +3537,8 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 		error("REQUEST_SUBMIT_BATCH_JOB lacks alloc_node from uid=%d", uid);
 	}
 
+	dump_job_desc(job_desc_msg);
+
 	if (error_code == SLURM_SUCCESS) {
 		/* Locks are for job_submit plugin use */
 		lock_slurmctld(job_read_lock);
@@ -3423,23 +3546,32 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 		unlock_slurmctld(job_read_lock);
 	}
 
-	dump_job_desc(job_desc_msg);
-	if (error_code == SLURM_SUCCESS) {
-		_throttle_start(&active_rpc_cnt);
+	if (error_code) {
+		reject_job = true;
+		goto send_msg;
+	}
+
+	_throttle_start(&active_rpc_cnt);
+	if (job_desc_msg->job_id == SLURM_BATCH_SCRIPT &&
+	    fed_mgr_is_active()) { /* make sure it's not a submitted sib job. */
+
+		if (fed_mgr_job_allocate(msg, job_desc_msg, false, uid,
+					 msg->protocol_version, &job_id,
+					 &error_code, &err_msg))
+			reject_job = true;
+	} else {
 		lock_slurmctld(job_write_lock);
 		START_TIMER;	/* Restart after we have locks */
+
 		if (job_desc_msg->job_id != SLURM_BATCH_SCRIPT) {
 			job_ptr = find_job_record(job_desc_msg->job_id);
 			if (job_ptr && IS_JOB_FINISHED(job_ptr)) {
 				if (IS_JOB_COMPLETING(job_ptr)) {
 					info("Attempt to re-use active "
 					     "job id %u", job_ptr->job_id);
-					slurm_send_rc_msg(
-						msg,
-						ESLURM_DUPLICATE_JOB_ID);
-					unlock_slurmctld(job_write_lock);
-					_throttle_fini(&active_rpc_cnt);
-					goto fini;
+					reject_job = true;
+					error_code = ESLURM_DUPLICATE_JOB_ID;
+					goto unlock;
 				}
 				job_ptr = NULL;	/* OK to re-use job id */
 			}
@@ -3455,10 +3587,9 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 			if (!validate_slurm_user(uid)) {
 				info("Attempt to execute batch job step by "
 				     "uid=%d", uid);
-				slurm_send_rc_msg(msg, ESLURM_NO_STEPS);
-				unlock_slurmctld(job_write_lock);
-				_throttle_fini(&active_rpc_cnt);
-				goto fini;
+				error_code = ESLURM_NO_STEPS;
+				reject_job = true;
+				goto unlock;
 			}
 #endif
 
@@ -3468,63 +3599,56 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 				      "by user %u",
 				      uid, job_ptr->job_id,
 				      job_ptr->user_id);
-				slurm_send_rc_msg(msg, ESLURM_USER_ID_MISSING);
-				unlock_slurmctld(job_write_lock);
-				_throttle_fini(&active_rpc_cnt);
-				goto fini;
+				error_code = ESLURM_USER_ID_MISSING;
+				reject_job = true;
+				goto unlock;
 			}
 			if (job_ptr->details &&
 			    job_ptr->details->prolog_running) {
-				slurm_send_rc_msg(msg, EAGAIN);
-				unlock_slurmctld(job_write_lock);
-				_throttle_fini(&active_rpc_cnt);
-				goto fini;
+				error_code = EAGAIN;
+				reject_job = true;
+				goto unlock;
 			}
 
 			error_code = _launch_batch_step(job_desc_msg, uid,
 							&step_id,
 							msg->protocol_version);
-			unlock_slurmctld(job_write_lock);
-			_throttle_fini(&active_rpc_cnt);
-			END_TIMER2("_slurm_rpc_submit_batch_job");
-
 			if (error_code != SLURM_SUCCESS) {
 				info("_launch_batch_step: %s",
 				     slurm_strerror(error_code));
-				slurm_send_rc_msg(msg, error_code);
-			} else {
-				info("_launch_batch_step StepId=%u.%u %s",
-				     job_desc_msg->job_id, step_id,
-				     TIME_STR);
-				submit_msg.job_id     = job_desc_msg->job_id;
-				submit_msg.step_id    = step_id;
-				submit_msg.error_code = error_code;
-				response_msg.msg_type =
-					RESPONSE_SUBMIT_BATCH_JOB;
-
-				response_msg.data = &submit_msg;
-				slurm_send_node_msg(msg->conn_fd,
-						    &response_msg);
-				schedule_job_save();
+				reject_job = true;
+				goto unlock;
 			}
-			goto fini;
-		}
 
-		/* Create new job allocation */
-		error_code = job_allocate(job_desc_msg,
-					  job_desc_msg->immediate, false,
-					  NULL, 0, uid, &job_ptr, &err_msg,
-					  msg->protocol_version);
+			job_id = job_desc_msg->job_id;
+
+			info("_launch_batch_step StepId=%u.%u %s",
+			     job_id, step_id, TIME_STR);
+		} else {
+			/* Create new job allocation */
+			error_code = job_allocate(job_desc_msg,
+						  job_desc_msg->immediate,
+						  false, NULL, 0, uid, &job_ptr,
+						  &err_msg,
+						  msg->protocol_version);
+			if (!job_ptr ||
+			    (error_code && job_ptr->job_state == JOB_FAILED))
+				reject_job = true;
+			else
+				job_id = job_ptr->job_id;
+
+			if (job_desc_msg->immediate &&
+			    (error_code != SLURM_SUCCESS))
+				error_code = ESLURM_CAN_NOT_START_IMMEDIATELY;
+		}
+unlock:
 		unlock_slurmctld(job_write_lock);
-		_throttle_fini(&active_rpc_cnt);
-		END_TIMER2("_slurm_rpc_submit_batch_job");
-		if (job_desc_msg->immediate && (error_code != SLURM_SUCCESS))
-			error_code = ESLURM_CAN_NOT_START_IMMEDIATELY;
 	}
 
-	/* return result */
-	if (!job_ptr || (error_code && job_ptr->job_state == JOB_FAILED))
-		reject_job = true;
+	_throttle_fini(&active_rpc_cnt);
+
+send_msg:
+	END_TIMER2("_slurm_rpc_submit_batch_job");
 
 	if (reject_job) {
 		info("_slurm_rpc_submit_batch_job: %s",
@@ -3535,21 +3659,23 @@ static void _slurm_rpc_submit_batch_job(slurm_msg_t * msg)
 			slurm_send_rc_msg(msg, error_code);
 	} else {
 		info("_slurm_rpc_submit_batch_job JobId=%u %s",
-		     job_ptr->job_id, TIME_STR);
+		     job_id, TIME_STR);
 		/* send job_ID */
-		submit_msg.job_id     = job_ptr->job_id;
-		submit_msg.step_id    = SLURM_BATCH_SCRIPT;
+		submit_msg.job_id     = job_id;
+		submit_msg.step_id    = step_id;
 		submit_msg.error_code = error_code;
 		response_msg.msg_type = RESPONSE_SUBMIT_BATCH_JOB;
 		response_msg.data = &submit_msg;
 		slurm_send_node_msg(msg->conn_fd, &response_msg);
 
 		schedule_job_save();	/* Has own locks */
-		schedule_node_save();	/* Has own locks */
-		queue_job_scheduler();
+		if (step_id == SLURM_BATCH_SCRIPT) {
+			schedule_node_save();	/* Has own locks */
+			queue_job_scheduler();
+		}
 	}
 
-fini:	xfree(err_msg);
+	xfree(err_msg);
 }
 
 /* _slurm_rpc_update_job - process RPC to update the configuration of a
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 14f99303c75fe814a471e808ecaa3dd66ccf38dc..ff36a96570a8eaf0bfbf3f7f0291ca66ac3e7bef 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -553,10 +553,10 @@ typedef struct {
 	uint16_t *tres;
 } acct_policy_limit_set_t;
 
-
 typedef struct {
-	bitstr_t *siblings;		/* bitmap of sibling cluster ids where
-					 * sibling jobs exist */
+	char    *origin_str;	/* origin cluster name */
+	uint64_t siblings;	/* bitmap of sibling cluster ids */
+	char    *siblings_str;	/* comma separated list of sibling names */
 } job_fed_details_t;
 
 /*
@@ -1108,10 +1108,12 @@ extern char **get_job_env (struct job_record *job_ptr, uint32_t *env_size);
 extern char *get_job_script (struct job_record *job_ptr);
 
 /*
- * get_next_job_id - return the job_id to be used by default for
- *	the next job
+ * Return the next available job_id to be used.
+ * IN test_only - if true, doesn't advance the job_id sequence, just returns
+ * 	what the next job id will be.
+ * RET a valid job_id or SLURM_ERROR if all job_ids are exhausted.
  */
-extern uint32_t get_next_job_id(void);
+extern uint32_t get_next_job_id(bool test_only);
 
 /*
  * get_part_list - find record for named partition(s)
@@ -2411,4 +2413,9 @@ waitpid_timeout(const char *, pid_t, int *, int);
  */
 extern void set_partition_tres();
 
+/*
+ * Set job's siblings and make sibling strings
+ */
+extern void set_job_fed_details(struct job_record *job_ptr,
+				uint64_t fed_siblings);
 #endif /* !_HAVE_SLURMCTLD_H */
diff --git a/src/smap/opts.c b/src/smap/opts.c
index b3ea19fed1d0a6b2902b41c9cbedd3a98065f79f..337005224d74d28d4f992ddebe83972406741147 100644
--- a/src/smap/opts.c
+++ b/src/smap/opts.c
@@ -335,6 +335,7 @@ Usage: smap [OPTIONS]\n\
   -M, --cluster=cluster_name cluster to issue commands to.  Default is\n\
                              current cluster.  cluster with no name will\n\
                              reset to default.\n\
+                             NOTE: SlurmDBD must be up.\n\
   -n, --nodes=[nodes]        only show objects with these nodes.\n\
                              If querying to the ionode level use the -I\n\
                              option in conjunction with this option.\n\
diff --git a/src/sprio/opts.c b/src/sprio/opts.c
index aef26c8ee30683ecc9ff162bc6bb54e02f44a10f..8811e3b7607b34a7ae35a45e559d2ff7c2ddd23e 100644
--- a/src/sprio/opts.c
+++ b/src/sprio/opts.c
@@ -489,6 +489,7 @@ Usage: sprio [OPTIONS]\n\
   -M, --cluster=cluster_name      cluster to issue commands to.  Default is\n\
                                   current cluster.  cluster with no name will\n\
                                   reset to default.\n\
+                                  NOTE: SlurmDBD must be up.\n\
   -n, --norm                      display normalized values\n\
   -o, --format=format             format specification\n\
   -u, --user=user_name            comma separated list of users to view\n\
diff --git a/src/squeue/opts.c b/src/squeue/opts.c
index d115a0032e553c4a866f9f87a3e172d11b89d623..ae7e4de028ef49fa5d9fbe366445cf132cfc195d 100644
--- a/src/squeue/opts.c
+++ b/src/squeue/opts.c
@@ -60,6 +60,7 @@
 #define OPT_LONG_HIDE      0x102
 #define OPT_LONG_START     0x103
 #define OPT_LONG_NOCONVERT 0x104
+#define OPT_LONG_FEDTRACK  0x105
 
 /* FUNCTIONS */
 static List  _build_job_list( char* str );
@@ -94,6 +95,7 @@ parse_command_line( int argc, char* argv[] )
 		{"accounts",   required_argument, 0, 'A'},
 		{"all",        no_argument,       0, 'a'},
 		{"array",      no_argument,       0, 'r'},
+		{"fedtrack",   no_argument,       0, OPT_LONG_FEDTRACK},
 		{"Format",     required_argument, 0, 'O'},
 		{"format",     required_argument, 0, 'o'},
 		{"help",       no_argument,       0, OPT_LONG_HELP},
@@ -292,6 +294,9 @@ parse_command_line( int argc, char* argv[] )
 				exit(1);
 			}
 			break;
+		case OPT_LONG_FEDTRACK:
+			params.show_fedtrack = true;
+			break;
 		case OPT_LONG_HELP:
 			_help();
 			exit(0);
@@ -1371,6 +1376,28 @@ extern int parse_long_format( char* format_long )
 							 field_size,
 							 right_justify,
 							 suffix );
+			else if (!xstrcasecmp(token, "fedorigin"))
+				job_format_add_fed_origin(params.format_list,
+							  field_size,
+							  right_justify,
+							  suffix );
+			else if (!xstrcasecmp(token, "fedoriginraw"))
+				job_format_add_fed_origin_raw(
+							params.format_list,
+							field_size,
+							right_justify,
+							suffix );
+			else if (!xstrcasecmp(token, "fedsiblings"))
+				job_format_add_fed_siblings(params.format_list,
+							    field_size,
+							    right_justify,
+							    suffix );
+			else if (!xstrcasecmp(token, "fedsiblingsraw"))
+				job_format_add_fed_siblings_raw(
+							params.format_list,
+							field_size,
+							right_justify,
+							suffix );
 			else if (!xstrcasecmp(token, "maxcpus"))
 				job_format_add_max_cpus(params.format_list,
 							 field_size,
@@ -1952,6 +1979,7 @@ Usage: squeue [OPTIONS]\n\
   --noconvert                     don't convert units from their original type\n\
 				  (e.g. 2048M won't be converted to 2G).\n\
   -o, --format=format             format specification\n\
+  -O, --Format=format             format specification\n\
   -p, --partition=partition(s)    comma separated list of partitions\n\
 				  to view, default is all partitions\n\
   -q, --qos=qos(s)                comma separated list of qos's\n\
diff --git a/src/squeue/print.c b/src/squeue/print.c
index 39780418d0a6b02991bc5941fbfc1cad1e130912..47f8183f0faf37457c5db1c689eecaa64fd8c683 100644
--- a/src/squeue/print.c
+++ b/src/squeue/print.c
@@ -1652,6 +1652,87 @@ int _print_job_exit_code(job_info_t * job, int width, bool right_justify,
 	return SLURM_SUCCESS;
 }
 
+int _print_job_fed_origin(job_info_t * job, int width, bool right_justify,
+			    char* suffix)
+{
+	if (job == NULL)
+		_print_str("FED_ORIGIN", width, right_justify, true);
+	else {
+		if (job->fed_origin_str)
+			_print_str(job->fed_origin_str, width, right_justify,
+				   true);
+		else
+			_print_str("NA", width, right_justify, true);
+	}
+
+	if (suffix)
+		printf("%s", suffix);
+	return SLURM_SUCCESS;
+}
+
+int _print_job_fed_origin_raw(job_info_t * job, int width, bool right_justify,
+			      char* suffix)
+{
+	if (job == NULL)
+		_print_str("FED_ORIGIN_RAW", width, right_justify, true);
+	else {
+		int id = job->job_id >> 26;
+		if (id)
+			_print_int(id, width, right_justify, true);
+		else
+			_print_str("NA", width, right_justify, true);
+	}
+
+	if (suffix)
+		printf("%s", suffix);
+	return SLURM_SUCCESS;
+}
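+
+/*
+ * Example: a federated job_id of 0x0400ABCD carries its origin cluster id in
+ * the upper bits, so 0x0400ABCD >> 26 == 1 and "1" is printed; a job_id below
+ * 0x04000000 shifts to 0 and prints "NA".
+ */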
+
+int _print_job_fed_siblings(job_info_t * job, int width, bool right_justify,
+			    char* suffix)
+{
+	if (job == NULL)
+		_print_str("FED_SIBLINGS", width, right_justify, true);
+	else {
+		if (job->fed_siblings_str)
+			_print_str(job->fed_siblings_str, width, right_justify,
+				   true);
+		else
+			_print_str("NA", width, right_justify, true);
+	}
+
+	if (suffix)
+		printf("%s", suffix);
+	return SLURM_SUCCESS;
+}
+
+int _print_job_fed_siblings_raw(job_info_t * job, int width, bool right_justify,
+				char* suffix)
+{
+	if (job == NULL)
+		_print_str("FED_SIBLINGS_RAW", width, right_justify, true);
+	else {
+		int bit = 1;
+		char *ids = NULL;
+		uint64_t tmp_sibs = job->fed_siblings;
+		while (tmp_sibs) {
+			if (tmp_sibs & 1)
+				xstrfmtcat(ids, "%s%d", (ids) ? "," : "", bit);
+
+			tmp_sibs >>= 1;
+			bit++;
+		}
+		if (ids)
+			_print_str(ids, width, right_justify, true);
+		else
+			_print_str("NA", width, right_justify, true);
+	}
+
+	if (suffix)
+		printf("%s", suffix);
+	return SLURM_SUCCESS;
+}
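+
+/*
+ * Example: fed_siblings == 0x5 walks bits 1 and 3 and prints "1,3"; a job
+ * with no sibling bits set prints "NA".
+ */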
+
 int _print_job_max_cpus(job_info_t * job, int width, bool right_justify,
 		    char* suffix)
 {
diff --git a/src/squeue/print.h b/src/squeue/print.h
index 92bb85df81cfaf0f4a0b35f3e9ef7d97b1e40874..4df2e0a35c71ab1f17dc1681be09ae65ff91f907 100644
--- a/src/squeue/print.h
+++ b/src/squeue/print.h
@@ -236,6 +236,16 @@ int job_format_add_function(List list, int width, bool right_justify,
 				_print_job_eligible_time)
 #define job_format_add_exit_code(list,wid,right,suffix) \
 	job_format_add_function(list,wid,right,suffix,_print_job_exit_code)
+#define job_format_add_fed_origin(list,wid,right,suffix) \
+	job_format_add_function(list,wid,right,suffix, _print_job_fed_origin)
+#define job_format_add_fed_origin_raw(list,wid,right,suffix) \
+	job_format_add_function(list,wid,right,suffix, \
+				_print_job_fed_origin_raw)
+#define job_format_add_fed_siblings(list,wid,right,suffix) \
+	job_format_add_function(list,wid,right,suffix, _print_job_fed_siblings)
+#define job_format_add_fed_siblings_raw(list,wid,right,suffix) \
+	job_format_add_function(list,wid,right,suffix, \
+				_print_job_fed_siblings_raw)
 #define job_format_add_max_cpus(list,wid,right,suffix) \
 	job_format_add_function(list,wid,right,suffix,_print_job_max_cpus)
 #define job_format_add_max_nodes(list,wid,right,suffix) \
@@ -433,6 +443,14 @@ int _print_job_eligible_time(job_info_t * job, int width, bool right_justify,
 			     char* suffix);
 int _print_job_exit_code(job_info_t * job, int width, bool right_justify,
 			 char* suffix);
+int _print_job_fed_origin(job_info_t * job, int width, bool right_justify,
+			  char* suffix);
+int _print_job_fed_origin_raw(job_info_t * job, int width, bool right_justify,
+			      char* suffix);
+int _print_job_fed_siblings(job_info_t * job, int width, bool right_justify,
+			    char* suffix);
+int _print_job_fed_siblings_raw(job_info_t * job, int width, bool right_justify,
+				char* suffix);
 int _print_job_max_cpus(job_info_t * job, int width, bool right_justify,
 			char* suffix);
 int _print_job_max_nodes(job_info_t * job, int width, bool right_justify,
diff --git a/src/squeue/squeue.c b/src/squeue/squeue.c
index fd4e3a8c60c4835212bafb604fe93345e1602f84..990feb8fafc8cb7c74c8848c91aa15904ea3abd8 100644
--- a/src/squeue/squeue.c
+++ b/src/squeue/squeue.c
@@ -175,6 +175,9 @@ _print_job ( bool clear_old )
 	if (params.all_flag || (params.job_list && list_count(params.job_list)))
 		show_flags |= SHOW_ALL;
 
+	if (params.show_fedtrack)
+		show_flags |= SHOW_FED_TRACK;
+
 	/* We require detail data when CPUs are requested */
 	if (params.format && strstr(params.format, "C"))
 		show_flags |= SHOW_DETAIL;
diff --git a/src/squeue/squeue.h b/src/squeue/squeue.h
index 5901d047ca6481df2b4a96eebcdcf21246103fc8..d2dc6ac3f688b04636d623db972c99d9c9b722db 100644
--- a/src/squeue/squeue.h
+++ b/src/squeue/squeue.h
@@ -69,6 +69,7 @@ struct squeue_parameters {
 	bool array_flag;
 	int  iterate;
 	bool job_flag;
+	bool show_fedtrack;
 	bool start_flag;
 	bool step_flag;
 	bool long_format;
diff --git a/src/srun/libsrun/allocate.c b/src/srun/libsrun/allocate.c
index d409eb98658c42f838395f6c18d7cd57a5151bc4..45db52f08cc858ecad47a9cbfdf0b8d9a97928e8 100644
--- a/src/srun/libsrun/allocate.c
+++ b/src/srun/libsrun/allocate.c
@@ -50,6 +50,7 @@
 #include "src/common/forward.h"
 #include "src/common/log.h"
 #include "src/common/macros.h"
+#include "src/common/proc_args.h"
 #include "src/common/slurm_auth.h"
 #include "src/common/slurm_protocol_api.h"
 #include "src/common/slurm_time.h"
@@ -869,6 +870,16 @@ job_desc_msg_create_from_opts (void)
 	if (opt.mcs_label)
 		j->mcs_label = opt.mcs_label;
 
+	/* If the job can run on multiple clusters, find the cluster offering
+	 * the earliest run time and run it there */
+	j->clusters = xstrdup(opt.clusters);
+	if (opt.clusters &&
+	    slurmdb_get_first_avail_cluster(j, opt.clusters,
+				&working_cluster_rec) != SLURM_SUCCESS) {
+		print_db_notok(opt.clusters, 0);
+		exit(error_exit);
+	}
+
 	return j;
 }
 
diff --git a/src/srun/libsrun/opt.c b/src/srun/libsrun/opt.c
index f60f7831ceda7eae2003aee56c3fff3be3ab964c..7c5a801f9f69d69f5a34aee073cf04d00290d797 100644
--- a/src/srun/libsrun/opt.c
+++ b/src/srun/libsrun/opt.c
@@ -394,6 +394,7 @@ static void _opt_default(void)
 	opt.cwd = xstrdup(buf);
 	opt.cwd_set = false;
 
+	opt.clusters = NULL;
 	opt.progname = NULL;
 
 	opt.ntasks = 1;
@@ -579,6 +580,7 @@ env_vars_t env_vars[] = {
 {"SLURM_BCAST",         OPT_BCAST,      NULL,               NULL             },
 {"SLURM_BLRTS_IMAGE",   OPT_STRING,     &opt.blrtsimage,    NULL             },
 {"SLURM_BURST_BUFFER",  OPT_STRING,     &opt.burst_buffer,  NULL             },
+{"SLURM_CLUSTERS",      OPT_STRING,     &opt.clusters,      NULL             },
 {"SLURM_CHECKPOINT",    OPT_STRING,     &opt.ckpt_interval_str, NULL         },
 {"SLURM_CHECKPOINT_DIR",OPT_STRING,     &opt.ckpt_dir,      NULL             },
 {"SLURM_CNLOAD_IMAGE",  OPT_STRING,     &opt.linuximage,    NULL             },
@@ -935,6 +937,8 @@ static void _set_options(const int argc, char **argv)
 		{"kill-on-bad-exit", optional_argument, 0, 'K'},
 		{"label",         no_argument,       0, 'l'},
 		{"licenses",      required_argument, 0, 'L'},
+		{"cluster",       required_argument, 0, 'M'},
+		{"clusters",      required_argument, 0, 'M'},
 		{"distribution",  required_argument, 0, 'm'},
 		{"ntasks",        required_argument, 0, 'n'},
 		{"nodes",         required_argument, 0, 'N'},
@@ -1045,7 +1049,7 @@ static void _set_options(const int argc, char **argv)
 		{"wckey",            required_argument, 0, LONG_OPT_WCKEY},
 		{NULL,               0,                 0, 0}
 	};
-	char *opt_string = "+A:B:c:C:d:D:e:Eg:hHi:I::jJ:kK::lL:m:n:N:"
+	char *opt_string = "+A:B:c:C:d:D:e:Eg:hHi:I::jJ:kK::lL:m:M:n:N:"
 		"o:Op:P:qQr:RsS:t:T:uU:vVw:W:x:XZ";
 	char *pos_delimit;
 	bool ntasks_set_opt = false;
@@ -1185,6 +1189,10 @@ static void _set_options(const int argc, char **argv)
 			xfree(opt.licenses);
 			opt.licenses = xstrdup(optarg);
 			break;
+		case 'M':
+			xfree(opt.clusters);
+			opt.clusters = xstrdup(optarg);
+			break;
 		case (int)'m':
 			opt.distribution = verify_dist_type(optarg,
 							     &opt.plane_size);
@@ -2742,7 +2750,7 @@ static void _usage(void)
 "            [--oversubscribe] [--label] [--unbuffered] [-m dist] [-J jobname]\n"
 "            [--jobid=id] [--verbose] [--slurmd_debug=#] [--gres=list]\n"
 "            [-T threads] [-W sec] [--checkpoint=time] [--gres-flags=opts]\n"
-"            [--checkpoint-dir=dir]  [--licenses=names]\n"
+"            [--checkpoint-dir=dir] [--licenses=names] [--clusters=cluster_names]\n"
 "            [--restart-dir=dir] [--qos=qos] [--time-min=minutes]\n"
 "            [--contiguous] [--mincpus=n] [--mem=MB] [--tmp=MB] [-C list]\n"
 "            [--mpi=type] [--account=name] [--dependency=type:jobid]\n"
@@ -2819,10 +2827,14 @@ static void _help(void)
 "  -K, --kill-on-bad-exit      kill the job if any task terminates with a\n"
 "                              non-zero exit code\n"
 "  -l, --label                 prepend task number to lines of stdout/err\n"
-"  -L, --licenses=names        required license, comma separated\n"
 "      --launch-cmd            print external launcher command line if not SLURM\n"
 "      --launcher-opts=        options for the external launcher command if not\n"
 "                              SLURM\n"
+"  -L, --licenses=names        required license, comma separated\n"
+"  -M, --clusters=names        Comma separated list of clusters to issue\n"
+"                              commands to.  Default is current cluster.\n"
+"                              Name of 'all' will submit to run on all clusters.\n"
+"                              NOTE: SlurmDBD must up.\n"
 "  -m, --distribution=type     distribution method for processes to nodes\n"
 "                              (type = block|cyclic|arbitrary)\n"
 "      --mail-type=type        notify on state change: BEGIN, END, FAIL or ALL\n"
diff --git a/src/srun/libsrun/opt.h b/src/srun/libsrun/opt.h
index 1380867a8726dbd69c877bf1ce5b4b287eab8398..1c8e9aa0fe6df4e0794992fc5e20dc94ca301d1a 100644
--- a/src/srun/libsrun/opt.h
+++ b/src/srun/libsrun/opt.h
@@ -66,7 +66,7 @@ extern int _verbose;
 extern enum modes mode;
 
 typedef struct srun_options {
-
+	char *clusters;		/* cluster to run this on. */
 	char *progname;		/* argv[0] of this program or
 				 * configuration file if multi_prog */
 	bool multi_prog;	/* multiple programs to execute */
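
The two hunks above wire the new -M/--clusters option and the SLURM_CLUSTERS
environment variable into srun's option table and option struct. As a rough
illustration of how the option might be exercised from the expect testsuite,
the sketch below spawns srun against a single federation cluster; the $srun
and $bin_sleep globals and the $fedc1 name are assumed to come from the
existing test framework and globals_federation, and the snippet is an
illustrative sketch rather than part of this patch:

	# Hypothetical check: run a one-node job on a specific cluster via -M
	# and flag the test as failed if srun reports an error.
	set srun_pid [spawn $srun -M$fedc1 -N1 -t1 $bin_sleep 1]
	expect {
		-re "error" {
			send_user "\nFAILURE: srun -M$fedc1 reported an error\n"
			set exit_code 1
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: srun not responding\n"
			slow_kill $srun_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}
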
diff --git a/src/sshare/sshare.c b/src/sshare/sshare.c
index 6a8dc3c7bc63441ce7156457440c7367478a9cde..24926ce167b8f1734f93d037f57f2b3e81b8425a 100644
--- a/src/sshare/sshare.c
+++ b/src/sshare/sshare.c
@@ -456,9 +456,8 @@ Usage:  sshare [OPTION]                                                    \n\
                            with the '--format' option                      \n\
     -l or --long           include normalized usage in output              \n\
     -m or --partition      print the partition part of the association     \n\
-    -M or --cluster=name   cluster to issue commands to.  Default is       \n\
-                           current cluster.  cluster with no name will     \n\
-                           reset to default.                               \n\
+    -M or --cluster=names  clusters to issue commands to.                  \n\
+                           NOTE: SlurmDBD must be up.                      \n\
     -n or --noheader       omit header from output                         \n\
     -o or --format=        Comma separated list of fields. (use            \n\
                            (\"--helpformat\" for a list of available fields).\n\
diff --git a/src/strigger/opts.c b/src/strigger/opts.c
index 81236614cfdafcf247f93a553de8536120b15b28..979258b4c69476a152b9424546affc0dedea4204 100644
--- a/src/strigger/opts.c
+++ b/src/strigger/opts.c
@@ -536,6 +536,7 @@ Usage: strigger [--set | --get | --clear] [OPTIONS]\n\
   -M, --cluster=name  cluster to issue commands to.  Default is\n\
                       current cluster.  cluster with no name will\n\
                       reset to default.\n\
+                      NOTE: SlurmDBD must be up.\n\
   -n, --node[=host]   trigger related to specific node, all nodes by default\n\
   -N, --noheader      Do not print the message header\n\
   -o, --offset=#      trigger's offset time from event, negative to precede\n\
diff --git a/testsuite/expect/Makefile.am b/testsuite/expect/Makefile.am
index 7d4adb5e8698741df8296f0a41e0aec75c64c2dd..e970e8fc33b68e997fc896e2208d38b0fc605c9a 100644
--- a/testsuite/expect/Makefile.am
+++ b/testsuite/expect/Makefile.am
@@ -614,7 +614,8 @@ EXTRA_DIST = \
 	test36.4			\
 	test37.1			\
 	test37.2			\
-	test37.3
+	test37.3			\
+	test37.4
 
 distclean-local:
 	rm -rf *error *output
diff --git a/testsuite/expect/Makefile.in b/testsuite/expect/Makefile.in
index 1cc42cb20a865b86bae79f34a4358735fe4ce6d0..cfcdc093d505ba87c4010eaf5a7f13b37c364c24 100644
--- a/testsuite/expect/Makefile.in
+++ b/testsuite/expect/Makefile.in
@@ -1028,7 +1028,8 @@ EXTRA_DIST = \
 	test36.4			\
 	test37.1			\
 	test37.2			\
-	test37.3
+	test37.3			\
+	test37.4
 
 all: all-am
 
diff --git a/testsuite/expect/README b/testsuite/expect/README
index 4bc34d8e3a1ac93bfe83ffa9d4615d3751e0f464..0380564e49106bea3bb00c136e69b7ffc85a30a8 100644
--- a/testsuite/expect/README
+++ b/testsuite/expect/README
@@ -804,3 +804,4 @@ test37.#   Testing of federations.
 test37.1   sacctmgr operations on clusters and federations.
 test37.2   Validate federated clusters return federated job ids.
 test37.3   scontrol show federations
+test37.4   federated job submission
diff --git a/testsuite/expect/globals_federation b/testsuite/expect/globals_federation
index d27b9c6a5a5a2cd2a051cd5aa21006e631ace1fb..50fe0d72a3326104bd502c56334db86c3efb1d5c 100644
--- a/testsuite/expect/globals_federation
+++ b/testsuite/expect/globals_federation
@@ -59,45 +59,77 @@ proc test_federation_setup { } {
 proc setup_federation { fed_name } {
 	global sacctmgr fedc1 fedc2 fedc3 eol
 	set rc 0
-	set my_pid [spawn $sacctmgr -i add federation $fed_name cluster=$fedc1,$fedc2,$fedc3]
+
+	set my_pid [spawn $sacctmgr -i add federation $fed_name]
 	set matches 0
 	expect {
 		-re "Adding Federation\\(s\\)$eol" {
 			incr matches
 			exp_continue
 		}
 		-re "$fed_name$eol" {
 			incr matches
-			exp_continue
-		}
-		-re "Settings$eol" {
-			incr matches
-			exp_continue
-		}
-		-re "\\s+Cluster\\s+=\\s+$fedc1$eol" {
-			incr matches
-			exp_continue
-		}
-		-re "\\s+Cluster\\s+=\\s+$fedc2$eol" {
-			incr matches
-			exp_continue
-		}
-		-re "\\s+Cluster\\s+=\\s+$fedc3$eol" {
-			incr matches
-			exp_continue
+			exp_continue
 		}
 		timeout {
 			send_user "\nFAILURE: sacctmgr add not responding\n"
 			slow_kill $my_pid
 			set rc 1
 		}
 		eof {
 			wait
 		}
 	}
-	if {!$rc && $matches != 6} {
+	if {!$rc && $matches != 2} {
 		send_user "$matches FAILURE: failed to create federation.\n"
 		set rc 1
+		return $rc
+	}
+
+	set count 0
+	foreach cluster [list $fedc1 $fedc2 $fedc3] {
+		incr count
+		set my_pid [spawn $sacctmgr -i mod cluster $cluster set federation=$fed_name weight=1]
+		set matches 0
+		expect {
+			-re "Setting$eol" {
+				incr matches
+				exp_continue
+			}
+			-re "^\\s+Federation\\s+=\\s+$fed_name$eol" {
+				incr matches
+				exp_continue
+			}
+			-re "^\\s+Weight\\s+=\\s+1$eol" {
+				incr matches
+				exp_continue
+			}
+			-re "Modified cluster...$eol" {
+				incr matches
+				exp_continue
+			}
+			-re "^\\s+$cluster$eol" {
+				incr matches
+				exp_continue
+			}
+			timeout {
+				send_user "\nFAILURE: sacctmgr add not responding\n"
+				slow_kill $my_pid
+				set rc 1
+			}
+			eof {
+				wait
+			}
+		}
+		if {!$rc && $matches != 5} {
+			send_user "$matches FAILURE: failed to add $cluster to federation.\n"
+			set rc 1
+			break;
+		}
+
+		if {$count > 1} {
+			sleep 5;
+		}
 	}
 	return $rc
 }
@@ -311,3 +343,90 @@ proc remove_cluster_from_fed {cname fed_name} {
 
 	return $rc
 }
+
+
+proc modify_federation_flags {fed_name mode flags} {
+	global sacctmgr eol
+	set matches 0
+	set my_pid [spawn $sacctmgr -i modify federation $fed_name set flags$mode$flags]
+	expect {
+		-re "Setting$eol" {
+			incr matches
+			exp_continue
+		}
+		-re "^\\s+Flags\\s+\\$mode\\s+$flags$eol" {
+			incr matches
+			exp_continue
+		}
+		-re "^\\s+Modified federation...$eol" {
+			incr matches
+			exp_continue
+		}
+		-re "^\\s+$fed_name$eol" {
+			incr matches
+			exp_continue
+		}
+		timeout {
+			send_user "\nFAILURE: sacctmgr add not responding\n"
+			slow_kill $my_pid
+			end_it 1
+		}
+		eof {
+			wait
+		}
+	}
+	if {$matches != 4} {
+		send_user "$matches FAILURE: unexpected error.\n"
+		end_it 1
+	}
+}
+
+proc modify_cluster_weight {cname weight} {
+	global sacctmgr eol
+	set matches 0
+	set my_pid [spawn $sacctmgr -i mod cluster $cname set weight=$weight]
+	expect {
+		-re "Setting$eol" {
+			incr matches
+			exp_continue
+		}
+		-re "^\\s+Weight\\s+=\\s+$weight$eol" {
+			incr matches
+			exp_continue
+		}
+		-re "Modified cluster...$eol" {
+			incr matches
+			exp_continue
+		}
+		-re "^\\s+$cname$eol" {
+			incr matches
+			exp_continue
+		}
+		timeout {
+			send_user "\nFAILURE: sacctmgr add not responding\n"
+			slow_kill $my_pid
+			end_it 1
+		}
+		eof {
+			wait
+		}
+	}
+	if {$matches != 4} {
+		send_user "$matches FAILURE: failed to set weight for $cname\n"
+		end_it 1
+	}
+}
+
+
+proc log_error {msg} {
+	send_user "\nFAILURE: $msg\n"
+}
+
+proc log_warn {msg} {
+	send_user "\nWARNING:    $msg\n"
+}
+
+proc log_info {msg} {
+	send_user "INFO:    $msg\n"
+}
+
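
The globals_federation additions above give tests a set of small building
blocks: setup_federation creates the federation and then adds fedc1-fedc3 to
it one at a time, modify_federation_flags and modify_cluster_weight adjust the
federation afterwards, and log_error/log_warn/log_info wrap send_user. A
minimal usage sketch follows; the fed_name value is a placeholder, and end_it
is expected to be supplied by the calling test (as test37.3 and test37.4 do),
since the modify_* helpers call it on failure:

	source ./globals
	source ./globals_federation

	set fed_name "feda"
	# Build the federation from fedc1, fedc2 and fedc3.
	if {[setup_federation $fed_name]} {
		log_error "failed to set up federation $fed_name"
		exit 1
	}
	# Prefer least-loaded clusters and make fedc1 a last resort.
	modify_federation_flags $fed_name "=" "LLC"
	modify_cluster_weight $fedc1 2
	log_info "federation $fed_name configured"
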
diff --git a/testsuite/expect/test1.43 b/testsuite/expect/test1.43
index 50177be27739c005bfbe3d551bbf4f92999c38bf..de6932260fe04e934033291ed2aba87fe0a6b667 100755
--- a/testsuite/expect/test1.43
+++ b/testsuite/expect/test1.43
@@ -58,7 +58,7 @@ for {set node_cnt 1} {$node_cnt > 0} {set node_cnt [expr $node_cnt * 2]} {
 			incr jobs_run
 			exp_continue
 		}
-		-re "allocation failure" {
+		-re "allocation failure:.*?\r\n" {
 			set alloc_fail 1
 			set node_cnt 0
 			exp_continue
diff --git a/testsuite/expect/test17.36 b/testsuite/expect/test17.36
index b9cf62dac3fd6020d8275fa195d39eaed6cb6aa8..d6e470754068bf0f54b9850d0b83afa8c58a1348 100755
--- a/testsuite/expect/test17.36
+++ b/testsuite/expect/test17.36
@@ -350,7 +350,7 @@ cancel_job $job_id
 # Test partition with oversubscribe=NO
 #
 ####################################
-send_user "\n\nTest partition with overoubscribe=NO\n"
+send_user "\n\nTest partition with oversubscribe=NO\n"
 
 # Determine the number of cores or CPUs
 set num_jobs [cr_core_cpu $node_name]
diff --git a/testsuite/expect/test37.1 b/testsuite/expect/test37.1
index 97e29835bc88835b7eb8b62ede07460315835b07..84342fde3f71691461a861c2e429cbb8bd13f7d6 100755
--- a/testsuite/expect/test37.1
+++ b/testsuite/expect/test37.1
@@ -2617,137 +2617,143 @@ expect {
 }
 expect -re $
 
-for {set i 1} {$i <= $max_federations} {incr i} {
-	set matches 0
-	set tmpc "max${i}_$test_id_2"
-	set my_pid [spawn $sacctmgr -i add cluster $tmpc federation=$fed1]
-	if {$i < $max_federations} {
-		expect {
-			-re "Adding Cluster\\(s\\)$eol" {
-				incr matches
-				exp_continue
-			}
-			-re "\\s+$tmpc$eol" {
-				incr matches
-				exp_continue
-			}
-			-re "\\s+Setting$eol" {
-				incr matches
-				exp_continue
-			}
-			-re "\\s+Federation\\s+=\\s+$fed1$eol" {
-				incr matches
-				exp_continue
-			}
-			timeout {
-				send_user "\nFAILURE: sacctmgr add not responding\n"
-				slow_kill $my_pid
-				set exit_code 1
-			}
-			eof {
-				wait
-			}
-		}
-		if {$exit_code || $matches != 4} {
-			send_user "$matches FAILURE: unexpected error.\n"
-			end_it 1
-		}
-	} else {
-		expect {
-			-re "Adding Cluster\\(s\\)$eol" {
-				incr matches
-				exp_continue
-			}
-			-re "\\s+$tmpc$eol" {
-				incr matches
-				exp_continue
-			}
-			-re "\\s+Setting$eol" {
-				incr matches
-				exp_continue
-			}
-			-re "\\s+Federation\\s+=\\s+$fed1$eol" {
-				incr matches
-				exp_continue
-			}
-			-re "\\s+Problem adding clusters: Too many clusters in federation?" {
-				incr matches
-				exp_continue
-			}
-			timeout {
-				send_user "\nFAILURE: sacctmgr add not responding\n"
-				slow_kill $my_pid
-				set exit_code 1
-			}
-			eof {
-				wait
-			}
-		}
-		if {$exit_code || $matches != 5} {
-			send_user "$matches FAILURE: unexpected error.\n"
-			end_it 1
-		}
+set matches 0
+set tmp_clusters ""
+for {set i 1} {$i < $max_federations} {incr i} {
+	if {$i > 1} {
+		append tmp_clusters ","
+	}
+	append tmp_clusters "max${i}_$test_id_2"
+}
+set timeout 300
+set my_pid [spawn $sacctmgr -i add cluster $tmp_clusters federation=$fed1]
+expect {
+	-re "Adding Cluster\\(s\\)$eol" {
+		incr matches
+		exp_continue
+	}
+	-re "\\s+max\[1-6\]{0,1}\\d{1}_$test_id_2$eol" {
+		incr matches
+		exp_continue
+	}
+	-re "\\s+Setting$eol" {
+		incr matches
+		exp_continue
+	}
+	-re "\\s+Federation\\s+=\\s+$fed1$eol" {
+		incr matches
+		exp_continue
+	}
+	timeout {
+		send_user "\nFAILURE: sacctmgr add not responding\n"
+		slow_kill $my_pid
+		set exit_code 1
+	}
+	eof {
+		wait
+	}
+}
+if {$exit_code || $matches != 66} {
+	send_user "$matches FAILURE: unexpected error.\n"
+	end_it 1
+}
 
-		set matches 0
-		#####################################
-		# TEST: modify cluster to exceed max clusters in federation
-		#####################################
-		#add last cluster without federation
-		set my_pid [spawn $sacctmgr -i add cluster $tmpc]
-		expect {
-			-re "Adding Cluster\\(s\\)$eol" {
-				incr matches
-				exp_continue
-			}
-			-re "\\s+$tmpc$eol" {
-				incr matches
-				exp_continue
-			}
-			timeout {
-				send_user "\nFAILURE: sacctmgr add not responding\n"
-				slow_kill $my_pid
-				set exit_code 1
-			}
-			eof {
-				wait
-			}
-		}
-		if {$exit_code || $matches != 2} {
-			send_user "$matches FAILURE: unexpected error.\n"
-			end_it 1
-		}
+set matches 0
+set tmpc "max${i}_$test_id_2"
+set my_pid [spawn $sacctmgr -i add cluster $tmpc federation=$fed1]
+expect {
+	-re "Adding Cluster\\(s\\)$eol" {
+		incr matches
+		exp_continue
+	}
+	-re "\\s+$tmpc$eol" {
+		incr matches
+		exp_continue
+	}
+	-re "\\s+Setting$eol" {
+		incr matches
+		exp_continue
+	}
+	-re "\\s+Federation\\s+=\\s+$fed1$eol" {
+		incr matches
+		exp_continue
+	}
+	-re "\\s+Problem adding clusters: Too many clusters in federation?" {
+		incr matches
+		exp_continue
+	}
+	timeout {
+		send_user "\nFAILURE: sacctmgr add not responding\n"
+		slow_kill $my_pid
+		set exit_code 1
+	}
+	eof {
+		wait
+	}
+}
+if {$exit_code || $matches != 5} {
+	send_user "$matches FAILURE: unexpected error.\n"
+	end_it 1
+}
 
-		set matches 0
-		set my_pid [spawn $sacctmgr -i modify cluster $tmpc set federation=$fed1]
-		expect {
-			-re "Setting$eol" {
-				incr matches
-				exp_continue
-			}
-			-re "^\\s+Federation\\s+=\\s+$fed1$eol" {
-				incr matches
-				exp_continue
-			}
-			-re "sacctmgr: error: Too many clusters in federation$eol" {
-				incr matches
-				exp_continue
-			}
-			timeout {
-				send_user "\nFAILURE: sacctmgr add not responding\n"
-				slow_kill $my_pid
-				set exit_code 1
-			}
-			eof {
-				wait
-			}
-		}
-		if {$exit_code || $matches != 3} {
-			send_user "$matches FAILURE: unexpected error.\n"
-			end_it 1
-		}
+set matches 0
+#####################################
+# TEST: modify cluster to exceed max clusters in federation
+#####################################
+#add last cluster without federation
+set my_pid [spawn $sacctmgr -i add cluster $tmpc]
+expect {
+	-re "Adding Cluster\\(s\\)$eol" {
+		incr matches
+		exp_continue
+	}
+	-re "\\s+$tmpc$eol" {
+		incr matches
+		exp_continue
 	}
+	timeout {
+		send_user "\nFAILURE: sacctmgr add not responding\n"
+		slow_kill $my_pid
+		set exit_code 1
+	}
+	eof {
+		wait
+	}
+}
+if {$exit_code || $matches != 2} {
+	send_user "$matches FAILURE: unexpected error.\n"
+	end_it 1
+}
 
+set matches 0
+set my_pid [spawn $sacctmgr -i modify cluster $tmpc set federation=$fed1]
+expect {
+	-re "Setting$eol" {
+		incr matches
+		exp_continue
+	}
+	-re "^\\s+Federation\\s+=\\s+$fed1$eol" {
+		incr matches
+		exp_continue
+	}
+	-re "sacctmgr: error: Too many clusters in federation$eol" {
+		incr matches
+		exp_continue
+	}
+	timeout {
+		send_user "\nFAILURE: sacctmgr add not responding\n"
+		slow_kill $my_pid
+		set exit_code 1
+	}
+	eof {
+		wait
+	}
 }
+if {$exit_code || $matches != 3} {
+	send_user "$matches FAILURE: unexpected error.\n"
+	end_it 1
+}
+
 set matches 0
 set my_pid [spawn $sacctmgr show federation $fed1 format="federation%20,cluster%20"]
 expect {
diff --git a/testsuite/expect/test37.2 b/testsuite/expect/test37.2
index e1939c745d26aba77450c488fd58e89cc9a276d6..80bf10b98cf515ff120e3c6d2a0c450c91bc3f79 100755
--- a/testsuite/expect/test37.2
+++ b/testsuite/expect/test37.2
@@ -124,7 +124,7 @@ proc test_fed_job_id { cname cid} {
 	set clust_id [expr $job_id >> 26]
 
 	send_user "Fed JobID:$job_id Local JobID:$local_id Cluster ID:$clust_id\n"
-	if {$clust_id != $cid} {
+	if {!$rc && ($clust_id != $cid)} {
 		send_user "\nFAILURE: jobid($job_id) from $cname didn't give\
 			correct partition id ($part_id != $cid)\n"
 		incr rc
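
The tweak above touches test37.2's federated job id check, which relies on the
origin cluster id living in the upper bits of the job id ($job_id >> 26 in the
surrounding context). As a worked example of that decomposition, the helper
below splits an id into cluster id and local id; the 26-bit width of the local
id is inferred from the shift and is an assumption, not something stated in
this hunk:

	# Hypothetical helper: split a federated job id into its cluster id
	# and local job id, assuming the local id occupies the low 26 bits.
	proc split_fed_job_id { job_id } {
		set clust_id [expr {$job_id >> 26}]
		set local_id [expr {$job_id & 0x3FFFFFF}]
		return [list $clust_id $local_id]
	}

	# Example: cluster id 2, local id 1000
	# split_fed_job_id [expr {(2 << 26) | 1000}]  =>  2 1000
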
diff --git a/testsuite/expect/test37.3 b/testsuite/expect/test37.3
index df1e88402e514b7107d12ab27d8cd97cf127b883..2a5daef8823e76d74311cb5ec74b0d29ac719e4b 100755
--- a/testsuite/expect/test37.3
+++ b/testsuite/expect/test37.3
@@ -119,7 +119,7 @@ proc test_fed_status {cname fed_flags cluster_list} {
 					send_user "matched self: $name\n"
 					incr matches
 				}
-			} elseif {[regexp {Sibling:\s+(\S+):(\S+):(\d+) ID:(\d+) FedState:(\S*) Weight:(\d+) PersistConn:(\S+)} \
+			} elseif {[regexp {Sibling:\s+(\S+):(\S+):(\d+) ID:(\d+) FedState:(\S*) Weight:(\d+) PersistConnSend/Recv:(\S+)} \
 					  $line match name host port id state weight conn]} {
 				send_user "matched: $match\n"
 				if {$expected_matches &&
@@ -136,7 +136,7 @@ proc test_fed_status {cname fed_flags cluster_list} {
 			exp_continue
 		}
 		timeout {
-			send_user "\nFAILURE: sacctmgr add not responding\n"
+			send_user "\nFAILURE: scontrol not responding\n"
 			slow_kill $my_pid
 			end_it 1
 		}
@@ -187,19 +187,19 @@ set fed_flags "None"
 
 send_user "\n\ntest from $fedc1\n"
 dict set clusters($fedc1) conn Self
-dict set clusters($fedc2) conn Connected
-dict set clusters($fedc3) conn Connected
+dict set clusters($fedc2) conn "Yes/Yes"
+dict set clusters($fedc3) conn "Yes/Yes"
 test_fed_status $fedc1 $fed_flags [array get clusters]
 
 send_user "\n\ntest from $fedc2\n"
-dict set clusters($fedc1) conn Connected
+dict set clusters($fedc1) conn "Yes/Yes"
 dict set clusters($fedc2) conn Self
-dict set clusters($fedc3) conn Connected
+dict set clusters($fedc3) conn "Yes/Yes"
 test_fed_status $fedc2 $fed_flags [array get clusters]
 
 send_user "\n\ntest from $fedc3\n"
-dict set clusters($fedc1) conn Connected
-dict set clusters($fedc2) conn Connected
+dict set clusters($fedc1) conn "Yes/Yes"
+dict set clusters($fedc2) conn "Yes/Yes"
 dict set clusters($fedc3) conn Self
 test_fed_status $fedc3 $fed_flags [array get clusters]
 
@@ -209,10 +209,10 @@ if {[remove_cluster_from_fed $fedc3 $fed_name]} {
 }
 array unset clusters $fedc3
 dict set clusters($fedc1) conn Self
-dict set clusters($fedc2) conn Connected
+dict set clusters($fedc2) conn "Yes/Yes"
 test_fed_status $fedc1 $fed_flags [array get clusters]
 
-dict set clusters($fedc1) conn Connected
+dict set clusters($fedc1) conn "Yes/Yes"
 dict set clusters($fedc2) conn Self
 test_fed_status $fedc2 $fed_flags [array get clusters]
 
@@ -243,19 +243,21 @@ send_user "\n\nadd $fedc2 and test from $fedc1\n"
 if {[add_cluster_to_fed $fedc2 $fed_name]} {
 	end_it 1
 }
+sleep 5
 array set clusters [get_clusterfed_info $fed_name]
 dict set clusters($fedc1) conn Self
-dict set clusters($fedc2) conn Connected
+dict set clusters($fedc2) conn "Yes/Yes"
 test_fed_status $fedc1 $fed_flags [array get clusters]
 
 send_user "\n\nadd $fedc3 and test from $fedc1\n"
 if {[add_cluster_to_fed $fedc3 $fed_name]} {
 	end_it 1
 }
+sleep 5
 array set clusters [get_clusterfed_info $fed_name]
 dict set clusters($fedc1) conn Self
-dict set clusters($fedc2) conn Connected
-dict set clusters($fedc3) conn Connected
+dict set clusters($fedc2) conn "Yes/Yes"
+dict set clusters($fedc3) conn "Yes/Yes"
 test_fed_status $fedc1 $fed_flags [array get clusters]
 
 
@@ -296,8 +298,8 @@ if {$exit_code || $matches != 4} {
 set fed_flags "LLC"
 array set clusters [get_clusterfed_info $fed_name]
 dict set clusters($fedc1) conn Self
-dict set clusters($fedc2) conn Connected
-dict set clusters($fedc3) conn Connected
+dict set clusters($fedc2) conn "Yes/Yes"
+dict set clusters($fedc3) conn "Yes/Yes"
 test_fed_status $fedc1 $fed_flags [array get clusters]
 
 
@@ -338,8 +340,8 @@ if {$exit_code || $matches != 4} {
 set fed_flags "None"
 array set clusters [get_clusterfed_info $fed_name]
 dict set clusters($fedc1) conn Self
-dict set clusters($fedc2) conn Connected
-dict set clusters($fedc3) conn Connected
+dict set clusters($fedc2) conn "Yes/Yes"
+dict set clusters($fedc3) conn "Yes/Yes"
 test_fed_status $fedc1 $fed_flags [array get clusters]
 
 
diff --git a/testsuite/expect/test37.4 b/testsuite/expect/test37.4
new file mode 100755
index 0000000000000000000000000000000000000000..11b8959967e15149997b870ff64387b48a59bcc8
--- /dev/null
+++ b/testsuite/expect/test37.4
@@ -0,0 +1,442 @@
+#!/usr/bin/expect
+############################################################################
+# Purpose: Test federated submissions
+#
+# Reqs:    1. The slurmdbd accounting storage type is in use and slurmdbd is up.
+#          2. fed_slurm_base is defined in globals.local - set to the directory
+#             that has access to each federation's config (fedc1, fedc2, fedc3).
+#          Eg.
+#          fedr/slurm/ (src)
+#          fedr/fed1/bin
+#          fedr/fed1/sbin
+#          fedr/fed1/etc
+#          fedr/fed1/...
+#          fedr/fed2/...
+#          fedr/fed3/...
+#          3. controllers are up and running.
+#
+# Output:  "TEST: #.#" followed by "SUCCESS" if test was successful, OR
+#          "FAILURE: ..." otherwise with an explanation of the failure, OR
+#          anything else indicates a failure mode that must be investigated.
+############################################################################
+# Copyright (C) 2016 SchedMD LLC.
+# Written by Brian Christiansen <brian@schedmd.com>
+#
+# This file is part of SLURM, a resource management program.
+# For details, see <http://slurm.schedmd.com/>.
+# Please also read the included file: DISCLAIMER.
+#
+# SLURM is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 2 of the License, or (at your option)
+# any later version.
+#
+# SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License along
+# with SLURM; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
+############################################################################
+
+source ./globals
+source ./globals_accounting
+source ./globals_federation
+
+set test_id     "37.4"
+set exit_code   0
+set fed_name    "feda"
+set file_in     "test$test_id.in"
+set user_name   ""
+
+set eol "\r\n"
+
+set timeout	5
+print_header $test_id
+
+#
+# Check accounting config and bail if not found.
+#
+if { [test_account_storage] == 0 } {
+	log_warn "This test can't be run without a usable AccountStorageType"
+	exit 0
+}
+
+if { [string compare [check_accounting_admin_level] "Administrator"] } {
+	log_warn "This test can't be run without being an Accounting administrator.\n \
+	 	  Use: sacctmgr mod user \$USER set admin=admin."
+	exit 0
+}
+
+proc cancel_all_jobs { } {
+	global scancel user_name fedc1 fedc2 fedc3
+
+	spawn $scancel -M$fedc1,$fedc2,$fedc3 --user $user_name
+	expect {
+		eof {
+			wait
+		}
+	}
+	sleep 5
+}
+
+proc cleanup { } {
+	global scancel fed_name user_name bin_rm file_in fedc1 fedc2 fedc3
+
+	cancel_all_jobs
+	exec $bin_rm -f $file_in
+
+	return [delete_federations $fed_name];
+}
+
+proc end_it { exit_code } {
+	global test_id
+	cleanup
+	if {$exit_code == 0} {
+		print_success $test_id
+	}
+	exit $exit_code
+}
+
+proc submit_fed_job { cname expected_origin expected_sib spec_clusters } {
+	global fed_slurm_base file_in node_count number squeue
+
+	set submit_cluster ""
+	set origin  ""
+	set sibling ""
+	set job_id 0
+	set my_sbatch "${fed_slurm_base}/$cname/bin/sbatch"
+	set command "$my_sbatch -N$node_count --exclusive --output=/dev/null --error=/dev/null -t3"
+	if {$spec_clusters ne ""} {
+		append command " -M$spec_clusters"
+	}
+	append command " $file_in"
+	set sbatch_pid [spawn {*}$command]
+	expect {
+		-re "Submitted batch job ($number)" {
+			set job_id $expect_out(1,string)
+			exp_continue
+		}
+		-re "on cluster (\\S+)" {
+			set submit_cluster $expect_out(1,string)
+			exp_continue
+		}
+		timeout {
+			log_error "sbatch not responding"
+			slow_kill $sbatch_pid
+			end_it 1
+		}
+		eof {
+			wait
+		}
+	}
+	if {$job_id == 0} {
+		log_error "batch submit failure"
+		end_it 1
+	}
+
+	sleep 3
+
+	set my_squeue "${fed_slurm_base}/$cname/bin/squeue"
+	if {$submit_cluster ne ""} {
+		set my_squeue "${fed_slurm_base}/$submit_cluster/bin/squeue"
+	}
+	spawn $my_squeue --jobs=$job_id --noheader -Ofedorigin,fedsiblings --fedtrack
+	expect {
+		-re "(\\S+)\\s+(\\S+)" {
+			set origin  $expect_out(1,string)
+			set sibling $expect_out(2,string)
+		}
+	}
+
+	log_info "origin:$origin sibling:$sibling"
+
+	if {($expected_origin ne "") && ($origin ne $expected_origin)} {
+		log_error "origin:$origin != expected_origin:$expected_origin"
+		end_it 1
+	}
+
+	if {($expected_sib ne "") && ($sibling ne $expected_sib)} {
+		log_error "sibling:$sibling != expected_sib:$expected_sib"
+		end_it 1
+	}
+
+
+	# Verify that siblings have the job as well.
+	foreach tmp_sib [split $sibling ","] {
+		if {$tmp_sib eq  $origin} {
+			continue
+		}
+		set my_squeue "${fed_slurm_base}/$tmp_sib/bin/squeue"
+		spawn $my_squeue --jobs=$job_id --noheader -Ofedorigin,fedsiblings
+		set match 0
+		expect {
+			-re "(\\S+)\\s+(\\S+)" {
+				set match 1
+				if {$origin ne $expect_out(1,string)} {
+					log_error "origin not the same on $tmp_sib"
+				}
+				if {$sibling ne $expect_out(2,string)} {
+					log_error "siblings not the same on $tmp_sib"
+				}
+			}
+			timeout {
+				log_error "$my_squeue not responding"
+				end_it 1
+			}
+			eof {
+				wait
+			}
+		}
+
+		if {!$match} {
+			log_error "didn't find origin or sibling from job"
+			end_it 1
+		}
+	}
+
+	return $sibling
+}
+
+if {[test_federation_setup]} {
+	log_warn "This test can't be run without fed_slurm_base,\
+		fedc1, fedc2, fedc3 set up in globals.local."
+	exit 0
+}
+
+if {[test_cluster_up $fedc1] ||
+    [test_cluster_up $fedc2] ||
+    [test_cluster_up $fedc3]} {
+	end_it 1
+}
+
+spawn $bin_id -un
+expect {
+	-re "($alpha_numeric_under)" {
+		set user_name $expect_out(1,string)
+	}
+	eof {
+		wait
+	}
+}
+
+# Remove existing setup
+if {[cleanup] != 0} {
+	log_error "failed to cleanup"
+	end_it 1
+}
+
+# add clusters to federation
+if {[setup_federation $fed_name]} {
+	log_error "failed to setup federation"
+	end_it 1
+}
+
+# get number of nodes per cluster
+# devide by 2 to get 2 jobs per clusters
+set node_count [expr [available_nodes "" ""] / 2]
+
+make_bash_script $file_in "$bin_sleep 300"
+
+
+###############################################################################
+send_user "\n\n"
+send_user "Test packing across clusters\n"
+###############################################################################
+
+# Submit first job and get a sibling
+set first_sib [submit_fed_job $fedc1 $fedc1 "" ""]
+# Second job should have same sibling as first
+submit_fed_job $fedc1  $fedc1 $first_sib ""
+
+
+# Third job should get a different sib
+set second_sib [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$second_sib eq $first_sib} {
+	log_error "$second_sib == $first_sib"
+	end_it 1
+}
+submit_fed_job $fedc1  $fedc1 $second_sib ""
+
+
+# Fifth job should be on a different sib than the first two
+set third_sib [submit_fed_job $fedc1 $fedc1 "" ""]
+if {($third_sib eq $first_sib) || ($third_sib eq $second_sib)} {
+	log_error "$third_sib == ($first_sib || $second_sib)"
+	end_it 1
+}
+submit_fed_job $fedc1  $fedc1 $third_sib ""
+
+
+# last job should be submitted to all siblings
+submit_fed_job $fedc1  $fedc1 "$fedc1,$fedc2,$fedc3" ""
+
+
+
+###############################################################################
+send_user "\n\n"
+send_user "Test packing across clusters with weights\n\n"
+# Set fed1's weight to 2. Should pack on fed2 and fed3 before getting to fed1
+###############################################################################
+cancel_all_jobs
+modify_cluster_weight $fedc1 2
+
+# Submit first job and get a sibling -- not fed1
+set first_sib [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$first_sib eq $fedc1} {
+	log_error "$first_sib == $fedc1"
+	end_it 1
+}
+# Second job should have same sibling as first
+submit_fed_job $fedc1  $fedc1 $first_sib ""
+
+
+# Third job should get a different sib
+set second_sib [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$second_sib eq $fedc1 || $second_sib eq $first_sib} {
+	log_error "$second_sib == ($fedc1 || $first_sib)"
+	end_it 1
+}
+submit_fed_job $fedc1  $fedc1 $second_sib ""
+
+
+# Fifth job should be on fed1
+set third_sib [submit_fed_job $fedc1 $fedc1 $fedc1 ""]
+submit_fed_job $fedc1  $fedc1 $third_sib ""
+
+
+# last job should be submitted to all siblings
+submit_fed_job $fedc1  $fedc1 "$fedc1,$fedc2,$fedc3" ""
+
+
+# reset fed1's weight
+modify_cluster_weight $fedc1 1
+
+
+###############################################################################
+send_user "\n\n"
+send_user "Test -M<clusters> with federated jobs\n"
+###############################################################################
+cancel_all_jobs
+
+# Submit jobs to only fed1
+submit_fed_job $fedc1 $fedc1 $fedc1 $fedc1
+submit_fed_job $fedc1 $fedc1 $fedc1 $fedc1
+submit_fed_job $fedc1 $fedc1 $fedc1 $fedc1
+
+# Submit jobs to only fed1,fed2
+# First two will go to fed2 since fed1 is full and the third job should go to both
+submit_fed_job $fedc1 $fedc1 $fedc2 "$fedc1,$fedc2"
+submit_fed_job $fedc1 $fedc1 $fedc2 "$fedc1,$fedc2"
+submit_fed_job $fedc1 $fedc1 "$fedc1,$fedc2" "$fedc1,$fedc2"
+
+# Submit jobs to fed2,fed3.
+# Should choose fed2 to be the origin and submit the jobs from there
+submit_fed_job $fedc1 $fedc2 $fedc3 "$fedc2,$fedc3"
+submit_fed_job $fedc1 $fedc2 $fedc3 "$fedc2,$fedc3"
+submit_fed_job $fedc1 $fedc2 "$fedc2,$fedc3" "$fedc2,$fedc3"
+
+
+###############################################################################
+send_user "\n\n"
+send_user "Test spreading across clusters with LLC flag\n"
+###############################################################################
+cancel_all_jobs
+# Now set the federation flags to LLC and make sure that jobs spread across
+# the clusters.
+modify_federation_flags $fed_name "=" "LLC"
+
+
+# Submit first job and get a sibling
+set sib1 [submit_fed_job $fedc1 $fedc1 "" ""]
+
+# Second job shouldn't have same sibling as first
+set sib2 [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$sib2 eq $sib1} {
+	log_error "$sib1 == $sib2"
+	end_it 1
+}
+
+# Third job shouldn't have same sibling as first or second
+set sib3 [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$sib3 eq $sib1 || $sib3 eq $sib2} {
+	log_error "$sib3 == ($sib1 || $sib2)"
+	end_it 1
+}
+
+# Repeat
+# Fourth job could get any sib but I would expect it to get sib1
+set sib1 [submit_fed_job $fedc1 $fedc1 $sib1 ""]
+
+# Second job shouldn't have same sibling as first
+set sib2 [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$sib2 eq $sib1} {
+	log_error "$sib1 == $sib2"
+	end_it 1
+}
+
+# Third job shouldn't have same sibling as first or second
+set sib3 [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$sib3 eq $sib1 || $sib3 eq $sib2} {
+	log_error "$sib3 == ($sib1 || $sib2)"
+	end_it 1
+}
+
+# last job should be submitted to all siblings
+submit_fed_job $fedc1  $fedc1 "$fedc1,$fedc2,$fedc3" ""
+
+
+###############################################################################
+send_user "\n\n"
+send_user "Test spreading across clusters with LLC flag with weights\n"
+# Set fed1's weight to 2. Should spread between fed2 and fed3 before
+# going to fed1
+###############################################################################
+cancel_all_jobs
+modify_cluster_weight $fedc1 2
+
+# Submit first job and get a sibling
+set sib1 [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$sib1 eq $fedc1} {
+	log_error "$sib1 == $fedc1"
+	end_it 1
+}
+
+# Second job shouldn't have same sibling as first
+set sib2 [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$sib2 eq $fedc1 || $sib2 eq $sib1} {
+	log_error "$sib2 == ($fedc1 || $sib1)"
+	end_it 1
+}
+
+# Repeat
+# job could get any sib but I would expect it to get sib1
+set sib1 [submit_fed_job $fedc1 $fedc1 $sib1 ""]
+if {$sib1 eq $fedc1} {
+	log_error "$sib1 == $fedc1"
+	end_it 1
+}
+
+# Second job shouldn't have same sibling as first
+set sib2 [submit_fed_job $fedc1 $fedc1 "" ""]
+if {$sib2 eq $fedc1 || $sib2 eq $sib1} {
+	log_error "$sib2 == ($fedc1 || $sib1)"
+	end_it 1
+}
+
+# Next job should go to fed1 now that fed2 and fed3 are full
+set sib3 [submit_fed_job $fedc1 $fedc1 $fedc1 ""]
+submit_fed_job $fedc1  $fedc1 $sib3 ""
+
+# last job should be submitted to all siblings
+submit_fed_job $fedc1  $fedc1 "$fedc1,$fedc2,$fedc3" ""
+
+# reset fed1's weight
+modify_cluster_weight $fedc1 1
+
+
+
+# All Done
+end_it 0
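
test37.4 above depends on the globals.local settings described in its header
comment (fed_slurm_base plus the fedc1/fedc2/fedc3 cluster names). A sketch of
what those entries could look like follows; the path and cluster names are
placeholders chosen to match the fedr/fed1, fedr/fed2, fedr/fed3 layout in the
header, not values taken from this patch:

	# Hypothetical globals.local fragment for the federation tests.
	# fed_slurm_base points at the directory holding one installation
	# per cluster (fedr/fed1/bin, fedr/fed2/bin, fedr/fed3/bin, ...).
	set fed_slurm_base "/home/slurm/fedr"
	set fedc1 "fed1"
	set fedc2 "fed2"
	set fedc3 "fed3"
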