From d80db48ffcc90d551ce95ef7fd39daff95cd4d38 Mon Sep 17 00:00:00 2001
From: Brian Christiansen <brian@schedmd.com>
Date: Wed, 15 Mar 2017 22:20:36 -0600
Subject: [PATCH] Distinguish viable vs active siblings

Viable siblings -- clusters where a sibling job could run (e.g. after the
requested clusters and cluster features have been applied) -- are now
distinguished from active siblings -- clusters where a sibling job
actually exists. The remote sibling jobs only need to know about the
viable siblings, not the active ones. This simplifies things a bit: the
remote sibling jobs no longer have to be updated when the active siblings
change (e.g. a cluster rejects the submission), only when the viable
siblings change (e.g. scontrol update clusterfeatures).
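
For reference, a minimal standalone sketch (not part of the patch) of how
the two bitmaps relate; FED_SIBLING_BIT mirrors the macro used in
fed_mgr.c, and the cluster ids here are hypothetical:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define FED_SIBLING_BIT(id) ((uint64_t)1 << ((id) - 1))

    int main(void)
    {
            uint64_t viable, active = 0;

            /* Viable: clusters the job could run on once the requested
             * clusters and cluster features have been applied. */
            viable = FED_SIBLING_BIT(1) | FED_SIBLING_BIT(2) |
                     FED_SIBLING_BIT(3);

            /* Active: clusters where a sibling job actually exists; bits
             * are set only as submissions succeed. Here cluster 3
             * rejected the submission, so only 1 and 2 become active. */
            active |= FED_SIBLING_BIT(1);
            active |= FED_SIBLING_BIT(2);

            printf("viable=0x%"PRIx64" active=0x%"PRIx64"\n",
                   viable, active);
            return 0;
    }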
---
 RELEASE_NOTES                    |   7 +-
 doc/man/man1/squeue.1            |  24 ++--
 slurm/slurm.h.in                 |   9 +-
 slurm/slurm_errno.h              |   1 +
 src/api/job_info.c               |   8 +-
 src/common/slurm_errno.c         |   2 +
 src/common/slurm_protocol_pack.c |  23 ++--
 src/slurmctld/fed_mgr.c          | 209 ++++++++++++++-----------------
 src/slurmctld/job_mgr.c          | 125 +++++++++++-------
 src/slurmctld/proc_req.c         |   8 +-
 src/slurmctld/slurmctld.h        |  19 +--
 src/squeue/opts.c                |  25 ++--
 src/squeue/print.c               |  63 ++++++++--
 src/squeue/print.h               |  27 ++--
 14 files changed, 329 insertions(+), 221 deletions(-)

diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index a0680bcf792..944cea72916 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -81,11 +81,16 @@ Added members to the following struct definitions
 In slurmbdb_cluster_fed_t: Added feature_list to hold cluster features.
 In job_desc_msg_t: Added cluster_features for passing cluster features to
 	controller.
+		   Renamed fed_siblings to fed_siblings_active.
+		   Added fed_siblings_viable.
 In job_info_t: Added cluster_features for passing back a job's cluster features
 	from the controller.
+               Renamed fed_siblings[_str] to fed_siblings_active[_str].
+	       Added fed_siblings_viable[_str].
 In struct job_details: Added cluster_features to hold requestsed cluster
 	features.
-
+In job_fed_details_t: Renamed siblings to siblings_active.
+		      Added siblings_viable.
 
 Added the following struct definitions
 ======================================
diff --git a/doc/man/man1/squeue.1 b/doc/man/man1/squeue.1
index 5eab7e3ef19..88498eb7e54 100644
--- a/doc/man/man1/squeue.1
+++ b/doc/man/man1/squeue.1
@@ -592,14 +592,6 @@ Cluster name where federated job originated from.
 Cluster ID where federated job originated from.
 (Valid for federated jobs only)
 .TP
-\fBfedsiblings\fR
-Cluster names of where federated job can run.
-(Valid for federated jobs only)
-.TP
-\fBfedsiblingsraw\fR
-Cluster IDs of where federated job can run.
-(Valid for federated jobs only)
-.TP
 \fBgres\fR
 Generic resources (gres) required by the job or step.
 (Valid for jobs and job steps)
@@ -826,6 +818,22 @@ Permit rotation of geometry (yes or no),
 Node use (VIRTUAL or COPROCESSOR), etc.
 (Valid for jobs only)
 .TP
+\fBsiblingsactive\fR
+Cluster names of where federated sibling jobs exist.
+(Valid for federated jobs only)
+.TP
+\fBsiblingsactiveraw\fR
+Cluster IDs of where federated sibling jobs exist.
+(Valid for federated jobs only)
+.TP
+\fBsiblingsviable\fR
+Cluster names of where federated sibling jobs are viable to run.
+(Valid for federated jobs only)
+.TP
+\fBsiblingsviableraw\fR
+Cluster IDs of where federated sibling jobs are viable to run.
+(Valid for federated jobs only)
+.TP
 \fBsockets\fR
 Number of sockets per node requested by the job.
 This reports the value of the \fBsrun \-\-sockets\-per\-node\fR option.
diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in
index 71c6d749a34..e6336f42012 100644
--- a/slurm/slurm.h.in
+++ b/slurm/slurm.h.in
@@ -1417,7 +1417,8 @@ typedef struct job_descriptor {	/* For submit, allocate, and update requests */
 				 * from job's allocation, default NONE */
 	char *features;		/* required feature specification,
 				 * default NONE */
-	uint64_t fed_siblings;	/* Bitmap of federation siblings */
+	uint64_t fed_siblings_active; /* Bitmap of active fed sibling ids */
+	uint64_t fed_siblings_viable; /* Bitmap of viable fed sibling ids */
 	char *gres;		/* comma separated list of required generic
 				 * resources, default NONE */
 	uint32_t group_id;	/* group to assume, if run as root. */
@@ -1596,8 +1597,10 @@ typedef struct job_info {
 	uint32_t exit_code;	/* exit code for job (status from wait call) */
 	char *features;		/* comma separated list of required features */
 	char *fed_origin_str;	/* Origin cluster's name */
-	uint64_t fed_siblings;	/* bitmap of sibling cluster ids */
-	char *fed_siblings_str;	/* string of sibling cluster names */
+	uint64_t fed_siblings_active;  /* bitmap of active fed sibling ids */
+	char *fed_siblings_active_str; /* string of active sibling names */
+	uint64_t fed_siblings_viable;  /* bitmap of viable fed sibling ids */
+	char *fed_siblings_viable_str; /* string of viable sibling names */
 	char *gres;		/* comma separated list of generic resources */
 	uint32_t gres_detail_cnt; /* Count of gres_detail_str records,
 				 * one per allocated node */
diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h
index aa102b6d8ac..ea361f9c79a 100644
--- a/slurm/slurm_errno.h
+++ b/slurm/slurm_errno.h
@@ -265,6 +265,7 @@ enum {
 	ESLURM_FED_CLUSTER_MAX_CNT              = 7100,
 	ESLURM_FED_CLUSTER_MULTIPLE_ASSIGNMENT,
 	ESLURM_INVALID_CLUSTER_FEATURE,
+	ESLURM_JOB_NOT_FEDERATED,
 
 	/* plugin and custom errors */
 	ESLURM_MISSING_TIME_LIMIT       = 8000,
diff --git a/src/api/job_info.c b/src/api/job_info.c
index d7854972ccb..c995ed4010e 100644
--- a/src/api/job_info.c
+++ b/src/api/job_info.c
@@ -563,9 +563,11 @@ slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner )
 	}
 
 	/****** Line 14a (optional) ******/
-	if (job_ptr->fed_siblings) {
-		xstrfmtcat(out, "FedOrigin=%s FedSiblings=%s",
-			   job_ptr->fed_origin_str, job_ptr->fed_siblings_str);
+	if (job_ptr->fed_siblings_active || job_ptr->fed_siblings_viable) {
+		xstrfmtcat(out, "FedOrigin=%s ViableSiblings=%s ActiveSiblings=%s",
+			   job_ptr->fed_origin_str,
+			   job_ptr->fed_siblings_viable_str,
+			   job_ptr->fed_siblings_active_str);
 		xstrcat(out, line_end);
 	}
 
diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c
index d55e048a853..f28d3f2cf16 100644
--- a/src/common/slurm_errno.c
+++ b/src/common/slurm_errno.c
@@ -446,6 +446,8 @@ static slurm_errtab_t slurm_errtab[] = {
 	  "Clusters can only be assigned to one federation" 	},
 	{ ESLURM_INVALID_CLUSTER_FEATURE,
 	  "Invalid cluster feature specification"		},
+	{ ESLURM_JOB_NOT_FEDERATED,
+	  "Not a valid federated job"				},
 
 	/* plugin and custom errors */
 	{ ESLURM_MISSING_TIME_LIMIT,
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index a565310c59b..309f965f3ac 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -5635,9 +5635,12 @@ _unpack_job_info_members(job_info_t * job, Buf buffer,
 
 		safe_unpackstr_xmalloc(&job->fed_origin_str, &uint32_tmp,
 				       buffer);
-		safe_unpack64(&job->fed_siblings, buffer);
-		safe_unpackstr_xmalloc(&job->fed_siblings_str, &uint32_tmp,
+		safe_unpack64(&job->fed_siblings_active, buffer);
+		safe_unpackstr_xmalloc(&job->fed_siblings_active_str, &uint32_tmp,
 				       buffer);
+		safe_unpack64(&job->fed_siblings_viable, buffer);
+		safe_unpackstr_xmalloc(&job->fed_siblings_viable_str,
+				       &uint32_tmp, buffer);
 	} else if (protocol_version >= SLURM_17_02_PROTOCOL_VERSION) {
 		safe_unpack32(&job->array_job_id, buffer);
 		safe_unpack32(&job->array_task_id, buffer);
@@ -5781,9 +5784,9 @@ _unpack_job_info_members(job_info_t * job, Buf buffer,
 
 		safe_unpackstr_xmalloc(&job->fed_origin_str, &uint32_tmp,
 				       buffer);
-		safe_unpack64(&job->fed_siblings, buffer);
-		safe_unpackstr_xmalloc(&job->fed_siblings_str, &uint32_tmp,
-				       buffer);
+		safe_unpack64(&job->fed_siblings_viable, buffer);
+		safe_unpackstr_xmalloc(&job->fed_siblings_viable_str,
+				       &uint32_tmp, buffer);
 	} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
 		char *node_inx_str;
 		uint32_t tmp_mem;
@@ -7528,7 +7531,8 @@ _pack_job_desc_msg(job_desc_msg_t * job_desc_ptr, Buf buffer,
 		pack32(job_desc_ptr->task_dist, buffer);
 		pack16(job_desc_ptr->kill_on_node_fail, buffer);
 		packstr(job_desc_ptr->features, buffer);
-		pack64(job_desc_ptr->fed_siblings, buffer);
+		pack64(job_desc_ptr->fed_siblings_active, buffer);
+		pack64(job_desc_ptr->fed_siblings_viable, buffer);
 		packstr(job_desc_ptr->gres, buffer);
 		pack32(job_desc_ptr->job_id, buffer);
 		packstr(job_desc_ptr->job_id_str, buffer);
@@ -7699,7 +7703,7 @@ _pack_job_desc_msg(job_desc_msg_t * job_desc_ptr, Buf buffer,
 		pack32(job_desc_ptr->task_dist, buffer);
 		pack16(job_desc_ptr->kill_on_node_fail, buffer);
 		packstr(job_desc_ptr->features, buffer);
-		pack64(job_desc_ptr->fed_siblings, buffer);
+		pack64(job_desc_ptr->fed_siblings_viable, buffer);
 		packstr(job_desc_ptr->gres, buffer);
 		pack32(job_desc_ptr->job_id, buffer);
 		packstr(job_desc_ptr->job_id_str, buffer);
@@ -8055,7 +8059,8 @@ _unpack_job_desc_msg(job_desc_msg_t ** job_desc_buffer_ptr, Buf buffer,
 		safe_unpack16(&job_desc_ptr->kill_on_node_fail, buffer);
 		safe_unpackstr_xmalloc(&job_desc_ptr->features,
 				       &uint32_tmp, buffer);
-		safe_unpack64(&job_desc_ptr->fed_siblings, buffer);
+		safe_unpack64(&job_desc_ptr->fed_siblings_active, buffer);
+		safe_unpack64(&job_desc_ptr->fed_siblings_viable, buffer);
 		safe_unpackstr_xmalloc(&job_desc_ptr->gres, &uint32_tmp,buffer);
 		safe_unpack32(&job_desc_ptr->job_id, buffer);
 		safe_unpackstr_xmalloc(&job_desc_ptr->job_id_str,
@@ -8228,7 +8233,7 @@ _unpack_job_desc_msg(job_desc_msg_t ** job_desc_buffer_ptr, Buf buffer,
 		safe_unpack16(&job_desc_ptr->kill_on_node_fail, buffer);
 		safe_unpackstr_xmalloc(&job_desc_ptr->features,
 				       &uint32_tmp, buffer);
-		safe_unpack64(&job_desc_ptr->fed_siblings, buffer);
+		safe_unpack64(&job_desc_ptr->fed_siblings_viable, buffer);
 		safe_unpackstr_xmalloc(&job_desc_ptr->gres, &uint32_tmp,buffer);
 		safe_unpack32(&job_desc_ptr->job_id, buffer);
 		safe_unpackstr_xmalloc(&job_desc_ptr->job_id_str,
diff --git a/src/slurmctld/fed_mgr.c b/src/slurmctld/fed_mgr.c
index 8491ba7ffb8..6209a5681bf 100644
--- a/src/slurmctld/fed_mgr.c
+++ b/src/slurmctld/fed_mgr.c
@@ -1008,7 +1008,7 @@ static void _revoke_sibling_jobs(struct job_record *job_ptr,
 				 uint32_t cluster_id, time_t start_time)
 {
 	int id = 1;
-	uint64_t tmp_sibs = job_ptr->fed_details->siblings;
+	uint64_t tmp_sibs = job_ptr->fed_details->siblings_active;
 	while (tmp_sibs) {
 		if ((tmp_sibs & 1) &&
 		    (id != fed_mgr_cluster_rec->fed.id) &&
@@ -1596,7 +1596,7 @@ static uint64_t _cluster_names_to_ids(char *clusters)
  *
  * Must have fed_read_lock before entering and NO job locks.
  *
- * Will send willruns to the clusters set in job_desc->fed.siblings.
+ * Will send willruns to the clusters set in job_desc->fed_siblings_viable.
  *
  * IN msg - contains the original job_desc buffer to send to the siblings and to
  * 	be able to create a job_desc copy to willrun itself.
@@ -1645,7 +1645,7 @@ static List _get_sib_will_runs(slurm_msg_t *msg, job_desc_msg_t *job_desc,
 	/* willrun the sibling clusters */
 	sib_itr = list_iterator_create(fed_mgr_fed_rec->cluster_list);
 	while ((sibling = list_next(sib_itr))) {
-		if (!(job_desc->fed_siblings &
+		if (!(job_desc->fed_siblings_viable &
 		      FED_SIBLING_BIT(sibling->fed.id))) {
 			if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR)
 				info("skipping cluster %s -- not in cluster list to submit job to",
@@ -1711,8 +1711,7 @@ static List _get_sib_will_runs(slurm_msg_t *msg, job_desc_msg_t *job_desc,
  */
 static slurmdb_cluster_rec_t *_find_start_now_sib(slurm_msg_t *msg,
 						  job_desc_msg_t *job_desc,
-						  uid_t uid,
-						  uint64_t *avail_sibs)
+						  uid_t uid)
 {
 	ListIterator itr;
 	List sib_willruns;
@@ -1721,7 +1720,6 @@ static slurmdb_cluster_rec_t *_find_start_now_sib(slurm_msg_t *msg,
 	slurmdb_cluster_rec_t *ret_sib = NULL;
 	time_t now = 0;
 
-	xassert(avail_sibs);
 	xassert(job_desc);
 	xassert(msg);
 
@@ -1739,8 +1737,6 @@ static slurmdb_cluster_rec_t *_find_start_now_sib(slurm_msg_t *msg,
 		if (!sib_willrun->resp) /* no response if job couldn't run? */
 			continue;
 
-		*avail_sibs |= FED_SIBLING_BIT(sib_willrun->sibling->fed.id);
-
 		/* Pick first sibling that can start the job now. siblings are
 		 * sorted by weight and resources. */
 		if (sib_willrun->resp->start_time <= now) {
@@ -1836,8 +1832,8 @@ static void _update_sib_job_siblings(job_desc_msg_t *job_desc, uint64_t sibs)
 	slurm_attr_init(&attr);
 
 	slurm_init_job_desc_msg(&job_update_msg);
-	job_update_msg.job_id       = job_desc->job_id;
-	job_update_msg.fed_siblings = job_desc->fed_siblings;
+	job_update_msg.job_id              = job_desc->job_id;
+	job_update_msg.fed_siblings_viable = job_desc->fed_siblings_viable;
 
 	sib_itr = list_iterator_create(fed_mgr_fed_rec->cluster_list);
 	while ((sibling = list_next(sib_itr))) {
@@ -1882,11 +1878,12 @@ static void _update_sib_job_siblings(job_desc_msg_t *job_desc, uint64_t sibs)
 }
 
 /*
- * Submit sibling jobs to designated (job_desc->fed_siblings) siblings.
+ * Submit sibling jobs to designated siblings (job_desc->fed_siblings_viable).
  *
- * Will update job_desc->fed_siblings if a sibling fails to submit a job.
+ * Will update job_desc->fed_siblings_active with the successful submissions.
  *
- * IN job_desc - job_desc containing job_id and fed_siblings of job to be.
+ * IN job_desc - job_desc containing job_id and fed_siblings_viable of job to be
+ * 	submitted.
  * IN msg - contains the original job_desc buffer to send to the siblings.
  * IN alloc_only - true if just an allocation. false if a batch job.
  * RET returns SLURM_SUCCESS if all siblings recieved the job sucessfully or
@@ -1914,7 +1911,7 @@ static int _submit_sibling_jobs(job_desc_msg_t *job_desc, slurm_msg_t *msg,
 	sib_msg.data_buffer  = msg->buffer;
 	sib_msg.data_type    = msg->msg_type;
 	sib_msg.data_version = msg->protocol_version;
-	sib_msg.fed_siblings = job_desc->fed_siblings;
+	sib_msg.fed_siblings = job_desc->fed_siblings_viable;
 	sib_msg.job_id       = job_desc->job_id;
 	sib_msg.resp_host    = job_desc->resp_host;
 
@@ -1926,11 +1923,16 @@ static int _submit_sibling_jobs(job_desc_msg_t *job_desc, slurm_msg_t *msg,
 		if (sibling == fed_mgr_cluster_rec)
 			continue;
 
-		/* fed_siblings is set prior to siblings that responded */
-		if (!(job_desc->fed_siblings &
+		/* Only send to viable siblings */
+		if (!(job_desc->fed_siblings_viable &
 		      FED_SIBLING_BIT(sibling->fed.id)))
 			continue;
 
+		/* skip sibling if the sibling already has a job */
+		if (job_desc->fed_siblings_active &
+		    FED_SIBLING_BIT(sibling->fed.id))
+			continue;
+
 		sub = xmalloc(sizeof(sib_submit_t));
 		sub->sibling = sibling;
 		sub->sib_msg = &sib_msg;
@@ -1953,18 +1955,13 @@ static int _submit_sibling_jobs(job_desc_msg_t *job_desc, slurm_msg_t *msg,
 		pthread_join(tmp_sub->thread_id, NULL);
 		rc |= tmp_sub->thread_rc;
 
-		/* take out the job from the siblings bitmap if there was an
-		 * error. The local host should stay in it if it's there. */
-		if (tmp_sub->thread_rc)
-			job_desc->fed_siblings &=
-				(~FED_SIBLING_BIT(tmp_sub->sibling->fed.id));
+		/* Record successful submissions in fed_siblings_active */
+		if (!tmp_sub->thread_rc)
+			job_desc->fed_siblings_active |=
+				FED_SIBLING_BIT(tmp_sub->sibling->fed.id);
 	}
 	list_iterator_destroy(thread_itr);
 
-	if (rc && job_desc->fed_siblings) {
-		_update_sib_job_siblings(job_desc, INFINITE64);
-	}
-
 	slurm_attr_destroy(&attr);
 	FREE_NULL_LIST(submit_threads);
 
@@ -2078,9 +2075,10 @@ extern int fed_mgr_job_allocate(slurm_msg_t *msg, job_desc_msg_t *job_desc,
 {
 	int rc = SLURM_SUCCESS;
 	slurmdb_cluster_rec_t *start_now_sib = NULL;
-	uint64_t avail_sibs = 0, feature_sibs = 0;
+	uint64_t feature_sibs = 0;
 	struct job_record *job_ptr = NULL;
 	time_t now = time(NULL);
+	bool job_held = false;
 	slurmctld_lock_t fed_read_lock = {
 		NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK };
 	slurmctld_lock_t job_write_lock = {
@@ -2106,6 +2104,9 @@ extern int fed_mgr_job_allocate(slurm_msg_t *msg, job_desc_msg_t *job_desc,
 		return SLURM_ERROR;
 	}
 
+	if (job_desc->priority == 0)
+		job_held = true;
+
 	lock_slurmctld(job_write_lock);
 	/* get job_id now. Can't submit job to get job_id as job_allocate will
 	 * change the job_desc. */
@@ -2115,41 +2116,25 @@ extern int fed_mgr_job_allocate(slurm_msg_t *msg, job_desc_msg_t *job_desc,
 	lock_slurmctld(fed_read_lock);
 
 	/* Set potential siblings */
-	job_desc->fed_siblings = _get_all_sibling_bits();
+	job_desc->fed_siblings_viable = _get_all_sibling_bits();
 	if (job_desc->clusters)
-		job_desc->fed_siblings &=
+		job_desc->fed_siblings_viable &=
 			_cluster_names_to_ids(job_desc->clusters);
 	if (feature_sibs)
-		job_desc->fed_siblings &= feature_sibs;
-	/* Set avail_sibs to fed.siblings in case job can't start now or is
-	 * being held. */
-	avail_sibs = job_desc->fed_siblings;
+		job_desc->fed_siblings_viable &= feature_sibs;
 
-	if ((job_desc->priority != 0) && (job_desc->begin_time <= now)) {
+	if (!job_held && (job_desc->begin_time <= now)) {
 		/* Don't job/node write lock on _find_start_now_sib. It locks
 		 * inside _sib_will_run */
-		start_now_sib = _find_start_now_sib(msg, job_desc, uid,
-						    &avail_sibs);
-
-		if (!avail_sibs) {
-			debug("No cluster responded to sibling will_runs");
-			avail_sibs = job_desc->fed_siblings;
-		}
+		start_now_sib = _find_start_now_sib(msg, job_desc, uid);
 	}
 
-	if (job_desc->priority == 0) {
-		/* don't submit siblings if the job held, siblings will be
-		 * submitted when the job is released. */
-		job_desc->fed_siblings = 0;
-	} else if (start_now_sib == NULL) {
-		job_desc->fed_siblings = avail_sibs;
-	} else if (start_now_sib == fed_mgr_cluster_rec) {
-		job_desc->fed_siblings =
-			FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id);
-	} else {
-		job_desc->fed_siblings =
+	if (start_now_sib)
+		job_desc->fed_siblings_viable =
 			FED_SIBLING_BIT(start_now_sib->fed.id);
-	}
+
+	/* ensure that fed_siblings_active is clear since this is a new job */
+	job_desc->fed_siblings_active = 0;
 
 	/* Submit local job first. Then submit to all siblings. If the local job
 	 * fails, then don't worry about sending to the siblings. */
@@ -2167,46 +2152,32 @@ extern int fed_mgr_job_allocate(slurm_msg_t *msg, job_desc_msg_t *job_desc,
 		goto end_it;
 	}
 
+	/* mark this cluster as an active sibling if it's in the viable list */
+	if (job_desc->fed_siblings_viable &
+	    FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id))
+		job_desc->fed_siblings_active |=
+			FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id);
+
 	*job_id_ptr = job_ptr->job_id;
 
-	if (job_desc->priority == 0) {
-		job_ptr->fed_details = xmalloc(sizeof(job_fed_details_t));
+	if (job_held) {
 		info("Submitted held federated job %u to %s(self)",
 		     job_ptr->job_id, fed_mgr_cluster_rec->name);
 	} else {
 		info("Submitted %sfederated job %u to %s(self)",
-		     (!(job_ptr->fed_details->siblings &
+		     (!(job_ptr->fed_details->siblings_viable &
 			FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id)) ?
 		      "tracking " : ""),
 		     job_ptr->job_id, fed_mgr_cluster_rec->name);
 	}
 
-	unlock_slurmctld(job_write_lock);
-
-	if (_submit_sibling_jobs(job_desc, msg, alloc_only)) {
-		/* failed to submit a sibling job to a sibling. Need to update
-		 * the local job's sibling bitmap */
+	if (!job_held && _submit_sibling_jobs(job_desc, msg, alloc_only))
+		info("failed to submit sibling job to one or more siblings");
 
-		lock_slurmctld(job_write_lock);
-		if ((job_ptr->magic  == JOB_MAGIC) &&
-		    (job_ptr->job_id == *job_id_ptr)) {
-
-			if (!job_desc->fed_siblings) {
-				/* we know that we already have a job_ptr so
-				 * just make it a locally scheduleable job. */
-				error("Failed to submit fed job to siblings, submitting to local cluster");
-				job_desc->fed_siblings |=
-					FED_SIBLING_BIT(
-						fed_mgr_cluster_rec->fed.id);
-			}
-			set_job_fed_details(job_ptr, job_desc->fed_siblings);
-		} else {
-			error("%s: job got messed up. this should never happen",
-			      __func__);
-		}
+	job_ptr->fed_details->siblings_active = job_desc->fed_siblings_active;
+	update_job_fed_details(job_ptr);
 
-		unlock_slurmctld(job_write_lock);
-	}
+	unlock_slurmctld(job_write_lock);
 
 end_it:
 	unlock_slurmctld(fed_read_lock);
@@ -2231,8 +2202,8 @@ extern bool fed_mgr_is_tracker_only_job(struct job_record *job_ptr)
 
 	if (job_ptr->fed_details &&
 	    (origin_id == fed_mgr_cluster_rec->fed.id) &&
-	    job_ptr->fed_details->siblings &&
-	    (!(job_ptr->fed_details->siblings &
+	    job_ptr->fed_details->siblings_active &&
+	    (!(job_ptr->fed_details->siblings_active &
 	      FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id))))
 		rc = true;
 
@@ -2303,8 +2274,10 @@ extern int fed_mgr_job_lock(struct job_record *job_ptr, uint32_t cluster_id)
 		     job_ptr->job_id, cluster_id);
 
 	/* if this cluster is the only sibling, then just assume the lock */
-	if ((job_ptr->fed_details->siblings & FED_SIBLING_BIT(cluster_id)) &&
-	    (!(job_ptr->fed_details->siblings & ~FED_SIBLING_BIT(cluster_id))))
+	if ((job_ptr->fed_details->siblings_viable &
+	     FED_SIBLING_BIT(cluster_id)) &&
+	    (!(job_ptr->fed_details->siblings_viable &
+	       ~FED_SIBLING_BIT(cluster_id))))
 		return SLURM_SUCCESS;
 
 	if (origin_id != fed_mgr_cluster_rec->fed.id) {
@@ -2358,8 +2331,10 @@ extern int fed_mgr_job_unlock(struct job_record *job_ptr, uint32_t cluster_id)
 		     job_ptr->job_id, cluster_id);
 
 	/* if this cluster is the only sibling, then dont worry */
-	if ((job_ptr->fed_details->siblings & FED_SIBLING_BIT(cluster_id)) &&
-	    (!(job_ptr->fed_details->siblings & ~FED_SIBLING_BIT(cluster_id))))
+	if ((job_ptr->fed_details->siblings_viable &
+	     FED_SIBLING_BIT(cluster_id)) &&
+	    (!(job_ptr->fed_details->siblings_viable &
+	       ~FED_SIBLING_BIT(cluster_id))))
 		return SLURM_SUCCESS;
 
 	if (origin_id != fed_mgr_cluster_rec->fed.id) {
@@ -2422,15 +2397,19 @@ extern int fed_mgr_job_start(struct job_record *job_ptr, uint32_t cluster_id,
 			return SLURM_ERROR;
 		}
 
-		set_job_fed_details(job_ptr, FED_SIBLING_BIT(cluster_id));
+		job_ptr->fed_details->siblings_active =
+			FED_SIBLING_BIT(cluster_id);
+		update_job_fed_details(job_ptr);
 
 		return _persist_fed_job_start(origin_cluster, job_ptr->job_id,
 					      cluster_id, job_ptr->start_time);
 	}
 
 	/* Origin Cluster: */
-	if ((job_ptr->fed_details->siblings & FED_SIBLING_BIT(cluster_id)) &&
-	    (!(job_ptr->fed_details->siblings & ~FED_SIBLING_BIT(cluster_id))))
+	if ((job_ptr->fed_details->siblings_viable &
+	     FED_SIBLING_BIT(cluster_id)) &&
+	    (!(job_ptr->fed_details->siblings_viable &
+	       ~FED_SIBLING_BIT(cluster_id))))
 	{
 		/* if this cluster is the only sibling, then just assume the
 		 * lock */
@@ -2444,7 +2423,7 @@ extern int fed_mgr_job_start(struct job_record *job_ptr, uint32_t cluster_id,
 		error("attempt to start sib job %d by cluster %d which doesn't have job lock",
 		     job_ptr->job_id, cluster_id);
 		rc = SLURM_ERROR;
-	} else if (job_ptr->fed_details->siblings &
+	} else if (job_ptr->fed_details->siblings_active &
 		   ~FED_SIBLING_BIT(cluster_id)) {
 		/* cancel all sibling jobs if there are more siblings than just
 		 * the cluster that it came from */
@@ -2453,7 +2432,9 @@ extern int fed_mgr_job_start(struct job_record *job_ptr, uint32_t cluster_id,
 
 	if (!rc) {
 		/* Update where sibling jobs are running */
-		set_job_fed_details(job_ptr, FED_SIBLING_BIT(cluster_id));
+		job_ptr->fed_details->siblings_active =
+			FED_SIBLING_BIT(cluster_id);
+		update_job_fed_details(job_ptr);
 
 		if (cluster_id != fed_mgr_cluster_rec->fed.id) {
 			/* leave as pending so that it will stay around */
@@ -2658,14 +2639,15 @@ extern int fed_mgr_sib_will_run(slurm_msg_t *msg, job_desc_msg_t *job_desc,
 		unlock_slurmctld(job_write_lock);
 	}
 
-	if (!job_desc->fed_siblings) { /* may have been set to existing job's */
+	if (!job_desc->fed_siblings_viable) { /* may have been set to existing job's */
 		/* Set potential siblings */
-		job_desc->fed_siblings = _get_all_sibling_bits();
 		if (job_desc->clusters)
-			job_desc->fed_siblings &=
+			job_desc->fed_siblings_viable &=
 				_cluster_names_to_ids(job_desc->clusters);
+		else
+			job_desc->fed_siblings_viable = _get_all_sibling_bits();
 		if (feature_sibs)
-			job_desc->fed_siblings &= feature_sibs;
+			job_desc->fed_siblings_viable &= feature_sibs;
 	}
 
 	if (!(sib_willruns = _get_sib_will_runs(msg, job_desc, uid))) {
@@ -2807,12 +2789,14 @@ extern int fed_mgr_job_requeue(struct job_record *job_ptr)
 	if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR)
 		info("requeueing fed job %d", job_ptr->job_id);
 
+	/* clear where active sibling jobs were */
+	job_ptr->fed_details->siblings_active = 0;
+
 	/* don't submit siblings for jobs that are held */
 	if (job_ptr->priority == 0) {
 		job_ptr->job_state &= (~JOB_REQUEUE_FED);
 
-		/* clear siblings */
-		set_job_fed_details(job_ptr, 0);
+		update_job_fed_details(job_ptr);
 
 		/* clear cluster lock */
 		job_ptr->fed_details->cluster_lock = 0;
@@ -2829,12 +2813,12 @@ extern int fed_mgr_job_requeue(struct job_record *job_ptr)
 					  &feature_sibs);
 
 	if (job_ptr->clusters)
-		job_desc->fed_siblings =
+		job_desc->fed_siblings_viable =
 			_cluster_names_to_ids(job_ptr->clusters);
 	else
-		job_desc->fed_siblings = _get_all_sibling_bits();
+		job_desc->fed_siblings_viable = _get_all_sibling_bits();
 	if (feature_sibs)
-		job_desc->fed_siblings &= feature_sibs;
+		job_desc->fed_siblings_viable &= feature_sibs;
 
 	/* have to pack job_desc into a buffer */
 	slurm_msg_t_init(&msg);
@@ -2847,21 +2831,18 @@ extern int fed_mgr_job_requeue(struct job_record *job_ptr)
 	set_buf_offset(buffer, 0);
 	msg.buffer           = buffer;
 
-	if (_submit_sibling_jobs(job_desc, &msg, false)) {
-		/* failed to submit a sibling job to a sibling. Need to update
-		 * the local job's sibling bitmap */
-		if (!job_desc->fed_siblings) {
-			/* we know that we already have a job_ptr so
-			 * just make it a locallly scheduleable job. */
-			error("Failed to submit fed job to siblings, submitting to local cluster");
-			job_desc->fed_siblings |=
-				FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id);
-		}
-	}
+	if (_submit_sibling_jobs(job_desc, &msg, false))
+		if (!job_desc->fed_siblings_active)
+			error("Failed to submit fed job to any siblings");
+
+	/* mark this cluster as an active sibling */
+	if (job_desc->fed_siblings_viable &
+	    FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id))
+		job_desc->fed_siblings_active |=
+			FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id);
 
-	/* set local job's fed_siblings. Could have been modified in
-	 * _submit_sibling_jobs() */
-	set_job_fed_details(job_ptr, job_desc->fed_siblings);
+	job_ptr->fed_details->siblings_active = job_desc->fed_siblings_active;
+	update_job_fed_details(job_ptr);
 
 	free_buf(buffer);
 	/* free the environment since all strings are stored in one
@@ -2887,7 +2868,7 @@ static int _cancel_sibling_jobs(struct job_record *job_ptr, uint16_t signal,
 				uint16_t flags, uid_t uid)
 {
 	int id = 1;
-	uint64_t tmp_sibs = job_ptr->fed_details->siblings;
+	uint64_t tmp_sibs = job_ptr->fed_details->siblings_active;
 	while (tmp_sibs) {
 		if ((tmp_sibs & 1) &&
 		    (id != fed_mgr_cluster_rec->fed.id)) {
diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c
index 6a1a6bbbbfb..d4ea6ba2e82 100644
--- a/src/slurmctld/job_mgr.c
+++ b/src/slurmctld/job_mgr.c
@@ -7327,8 +7327,12 @@ _copy_job_desc_to_job_record(job_desc_msg_t * job_desc,
 	if (job_desc->cluster_features)
 		detail_ptr->cluster_features =
 			xstrdup(job_desc->cluster_features);
-	if (job_desc->fed_siblings)
-		set_job_fed_details(job_ptr, job_desc->fed_siblings);
+	if (job_desc->fed_siblings_viable) {
+		job_ptr->fed_details = xmalloc(sizeof(job_fed_details_t));
+		job_ptr->fed_details->siblings_viable =
+			job_desc->fed_siblings_viable;
+		update_job_fed_details(job_ptr);
+	}
 	if ((job_desc->shared == JOB_SHARED_NONE) && (select_serial == 0)) {
 		detail_ptr->share_res  = 0;
 		detail_ptr->whole_node = WHOLE_NODE_REQUIRED;
@@ -8916,13 +8920,20 @@ void pack_job(struct job_record *dump_job_ptr, uint16_t show_flags, Buf buffer,
 
 		if (dump_job_ptr->fed_details) {
 			packstr(dump_job_ptr->fed_details->origin_str, buffer);
-			pack64(dump_job_ptr->fed_details->siblings, buffer);
-			packstr(dump_job_ptr->fed_details->siblings_str,
+			pack64(dump_job_ptr->fed_details->siblings_active,
+			       buffer);
+			packstr(dump_job_ptr->fed_details->siblings_active_str,
+				buffer);
+			pack64(dump_job_ptr->fed_details->siblings_viable,
+			       buffer);
+			packstr(dump_job_ptr->fed_details->siblings_viable_str,
 				buffer);
 		} else {
 			packnull(buffer);
 			pack64((uint64_t)0, buffer);
 			packnull(buffer);
+			pack64((uint64_t)0, buffer);
+			packnull(buffer);
 		}
 	} else if (protocol_version >= SLURM_17_02_PROTOCOL_VERSION) {
 		detail_ptr = dump_job_ptr->details;
@@ -9104,8 +9115,9 @@ void pack_job(struct job_record *dump_job_ptr, uint16_t show_flags, Buf buffer,
 
 		if (dump_job_ptr->fed_details) {
 			packstr(dump_job_ptr->fed_details->origin_str, buffer);
-			pack64(dump_job_ptr->fed_details->siblings, buffer);
-			packstr(dump_job_ptr->fed_details->siblings_str,
+			pack64(dump_job_ptr->fed_details->siblings_active,
+			       buffer);
+			packstr(dump_job_ptr->fed_details->siblings_active_str,
 				buffer);
 		} else {
 			packnull(buffer);
@@ -12156,21 +12168,20 @@ static int _update_job(struct job_record *job_ptr, job_desc_msg_t * job_specs,
 		}
 	}
 
-	if (job_specs->fed_siblings) {
-		slurmctld_lock_t fed_read_lock = {
-			NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK };
-		if (job_ptr->fed_details)
-			info("update_job: setting fed_siblings from %"PRIu64" to %"PRIu64" for job_id %u",
-			     job_ptr->fed_details->siblings,
-			     job_specs->fed_siblings,
-			     job_ptr->job_id);
-		else
-			info("update_job: setting fed_siblings to %"PRIu64" for job_id %u",
-			     job_specs->fed_siblings,
-			     job_ptr->job_id);
-		lock_slurmctld(fed_read_lock);
-		set_job_fed_details(job_ptr, job_specs->fed_siblings);
-		unlock_slurmctld(fed_read_lock);
+	if (job_specs->fed_siblings_viable) {
+		if (!job_ptr->fed_details) {
+			error_code = ESLURM_JOB_NOT_FEDERATED;
+			goto fini;
+		}
+
+		info("update_job: setting fed_siblings_viable from %"PRIu64" to %"PRIu64" for job_id %u",
+		     job_ptr->fed_details->siblings_viable,
+		     job_specs->fed_siblings_viable,
+		     job_ptr->job_id);
+
+		job_ptr->fed_details->siblings_viable =
+			job_specs->fed_siblings_viable;
+		update_job_fed_details(job_ptr);
 	}
 
 fini:
@@ -15658,8 +15669,12 @@ extern job_desc_msg_t *copy_job_record_to_job_desc(struct job_record *job_ptr)
 	job_desc->ntasks_per_socket = mc_ptr->ntasks_per_socket;
 	job_desc->ntasks_per_core   = mc_ptr->ntasks_per_core;
 
-	if (job_ptr->fed_details)
-		job_desc->fed_siblings = job_ptr->fed_details->siblings;
+	if (job_ptr->fed_details) {
+		job_desc->fed_siblings_active =
+			job_ptr->fed_details->siblings_active;
+		job_desc->fed_siblings_viable =
+			job_ptr->fed_details->siblings_viable;
+	}
 #if 0
 	/* select_jobinfo is unused at job submit time, only it's
 	 * components are set. We recover those from the structure below.
@@ -16282,7 +16297,8 @@ static void _free_job_fed_details(job_fed_details_t **fed_details_pptr)
 
 	if (fed_details_ptr) {
 		xfree(fed_details_ptr->origin_str);
-		xfree(fed_details_ptr->siblings_str);
+		xfree(fed_details_ptr->siblings_active_str);
+		xfree(fed_details_ptr->siblings_viable_str);
 		xfree(fed_details_ptr);
 		*fed_details_pptr = NULL;
 	}
@@ -16295,8 +16311,10 @@ static void _dump_job_fed_details(job_fed_details_t *fed_details_ptr,
 		pack16(1, buffer);
 		pack32(fed_details_ptr->cluster_lock, buffer);
 		packstr(fed_details_ptr->origin_str, buffer);
-		pack64(fed_details_ptr->siblings, buffer);
-		packstr(fed_details_ptr->siblings_str, buffer);
+		pack64(fed_details_ptr->siblings_active, buffer);
+		packstr(fed_details_ptr->siblings_active_str, buffer);
+		pack64(fed_details_ptr->siblings_viable, buffer);
+		packstr(fed_details_ptr->siblings_viable_str, buffer);
 	} else {
 		pack16(0, buffer);
 	}
@@ -16320,9 +16338,16 @@ static int _load_job_fed_details(job_fed_details_t **fed_details_pptr,
 			safe_unpack32(&fed_details_ptr->cluster_lock, buffer);
 			safe_unpackstr_xmalloc(&fed_details_ptr->origin_str,
 					       &tmp_uint32, buffer);
-			safe_unpack64(&fed_details_ptr->siblings, buffer);
-			safe_unpackstr_xmalloc(&fed_details_ptr->siblings_str,
-					       &tmp_uint32, buffer);
+			safe_unpack64(&fed_details_ptr->siblings_active,
+				      buffer);
+			safe_unpackstr_xmalloc(
+					&fed_details_ptr->siblings_active_str,
+					&tmp_uint32, buffer);
+			safe_unpack64(&fed_details_ptr->siblings_viable,
+				      buffer);
+			safe_unpackstr_xmalloc(
+					&fed_details_ptr->siblings_viable_str,
+					&tmp_uint32, buffer);
 		}
 	} else if (protocol_version >= SLURM_17_02_PROTOCOL_VERSION) {
 		safe_unpack16(&tmp_uint16, buffer);
@@ -16332,9 +16357,11 @@ static int _load_job_fed_details(job_fed_details_t **fed_details_pptr,
 			safe_unpack32(&fed_details_ptr->cluster_lock, buffer);
 			safe_unpackstr_xmalloc(&fed_details_ptr->origin_str,
 					       &tmp_uint32, buffer);
-			safe_unpack64(&fed_details_ptr->siblings, buffer);
-			safe_unpackstr_xmalloc(&fed_details_ptr->siblings_str,
-					       &tmp_uint32, buffer);
+			safe_unpack64(&fed_details_ptr->siblings_viable,
+				      buffer);
+			safe_unpackstr_xmalloc(
+					&fed_details_ptr->siblings_viable_str,
+					&tmp_uint32, buffer);
 		}
 	}
 
@@ -16347,24 +16374,26 @@ unpack_error:
 	return SLURM_ERROR;
 }
 
-extern void set_job_fed_details(struct job_record *job_ptr,
-				uint64_t fed_siblings)
+/* Set federated job's sibling strings. */
+extern void update_job_fed_details(struct job_record *job_ptr)
 {
 	xassert(job_ptr);
-
-	if (!job_ptr->fed_details) {
-		job_ptr->fed_details =
-			xmalloc(sizeof(job_fed_details_t));
-	} else {
-		xfree(job_ptr->fed_details->siblings_str);
-		xfree(job_ptr->fed_details->origin_str);
-	}
-
-	job_ptr->fed_details->siblings = fed_siblings;
-	job_ptr->fed_details->siblings_str =
-		fed_mgr_cluster_ids_to_names(fed_siblings);
-	job_ptr->fed_details->origin_str =
-		fed_mgr_get_cluster_name(
+	xassert(job_ptr->fed_details);
+
+	xfree(job_ptr->fed_details->siblings_active_str);
+	xfree(job_ptr->fed_details->siblings_viable_str);
+
+	job_ptr->fed_details->siblings_active_str =
+		fed_mgr_cluster_ids_to_names(
+					job_ptr->fed_details->siblings_active);
+	job_ptr->fed_details->siblings_viable_str =
+		fed_mgr_cluster_ids_to_names(
+					job_ptr->fed_details->siblings_viable);
+
+	/* only set once */
+	if (!job_ptr->fed_details->origin_str)
+		job_ptr->fed_details->origin_str =
+			fed_mgr_get_cluster_name(
 				fed_mgr_get_cluster_id(job_ptr->job_id));
 }
 
diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c
index 3c503d19ddf..63ba8d39b69 100644
--- a/src/slurmctld/proc_req.c
+++ b/src/slurmctld/proc_req.c
@@ -2588,8 +2588,8 @@ static void _slurm_rpc_job_will_run(slurm_msg_t * msg, bool is_sib_job)
 				if ((job_ptr =
 				     find_job_record(job_desc_msg->job_id)) &&
 				    job_ptr->fed_details)
-					job_desc_msg->fed_siblings =
-						job_ptr->fed_details->siblings;
+					job_desc_msg->fed_siblings_viable =
+					job_ptr->fed_details->siblings_active;
 				else if (!is_sib_job)
 					error_code = ESLURM_INVALID_JOB_ID;
 				unlock_slurmctld(job_read_lock);
@@ -6084,7 +6084,7 @@ static void _slurm_rpc_sib_submit_batch_job(uint32_t uid, slurm_msg_t *msg)
 	sib_msg_t *sib_msg       = msg->data;
 	job_desc_msg_t *job_desc = sib_msg->data;
 	job_desc->job_id         = sib_msg->job_id;
-	job_desc->fed_siblings   = sib_msg->fed_siblings;
+	job_desc->fed_siblings_viable = sib_msg->fed_siblings;
 
 	slurmctld_lock_t job_write_lock = {
 		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, READ_LOCK };
@@ -6122,8 +6122,8 @@ static void _slurm_rpc_sib_resource_allocation(uint32_t uid, slurm_msg_t *msg)
 	sib_msg_t *sib_msg       = msg->data;
 	job_desc_msg_t *job_desc = sib_msg->data;
 	job_desc->job_id         = sib_msg->job_id;
-	job_desc->fed_siblings   = sib_msg->fed_siblings;
 	job_desc->resp_host      = xstrdup(sib_msg->resp_host);
+	job_desc->fed_siblings_viable = sib_msg->fed_siblings;
 
 	if (!msg->conn) {
 		error("Security violation, SIB_RESOURCE_ALLOCATION RPC from uid=%d",
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
index 872d243023f..c1fb580c2fb 100644
--- a/src/slurmctld/slurmctld.h
+++ b/src/slurmctld/slurmctld.h
@@ -562,10 +562,14 @@ typedef struct {
 } acct_policy_limit_set_t;
 
 typedef struct {
-	uint32_t cluster_lock;	/* sibling that has lock on job */
-	char    *origin_str;	/* origin cluster name */
-	uint64_t siblings;	/* bitmap of sibling cluster ids */
-	char    *siblings_str;	/* comma separated list of sibling names */
+	uint32_t cluster_lock;		/* sibling that has lock on job */
+	char    *origin_str;		/* origin cluster name */
+	uint64_t siblings_active;	/* bitmap of active sibling ids. */
+	char    *siblings_active_str;	/* comma separated list of active
+					   sibling names */
+	uint64_t siblings_viable;	/* bitmap of viable sibling ids. */
+	char    *siblings_viable_str;	/* comma separated list of viable
+					   sibling names */
 } job_fed_details_t;
 
 /*
@@ -2442,10 +2446,11 @@ waitpid_timeout(const char *, pid_t, int *, int);
 extern void set_partition_tres();
 
 /*
- * Set job's siblings and make sibling strings
+ * Update job's federated siblings strings.
+ *
+ * IN job_ptr - job_ptr to update
  */
-extern void set_job_fed_details(struct job_record *job_ptr,
-				uint64_t fed_siblings);
+extern void update_job_fed_details(struct job_record *job_ptr);
 
 /*
  * purge_job_record - purge specific job record. No testing is performed to
diff --git a/src/squeue/opts.c b/src/squeue/opts.c
index 0b674cb1a3d..a4c1a72d61f 100644
--- a/src/squeue/opts.c
+++ b/src/squeue/opts.c
@@ -1395,13 +1395,24 @@ extern int parse_long_format( char* format_long )
 							field_size,
 							right_justify,
 							suffix );
-			else if (!xstrcasecmp(token, "fedsiblings"))
-				job_format_add_fed_siblings(params.format_list,
-							    field_size,
-							    right_justify,
-							    suffix );
-			else if (!xstrcasecmp(token, "fedsiblingsraw"))
-				job_format_add_fed_siblings_raw(
+			else if (!xstrcasecmp(token, "siblingsactive"))
+				job_format_add_fed_siblings_active(
+							params.format_list,
+							field_size,
+							right_justify, suffix );
+			else if (!xstrcasecmp(token, "siblingsactiveraw"))
+				job_format_add_fed_siblings_active_raw(
+							params.format_list,
+							field_size,
+							right_justify,
+							suffix );
+			else if (!xstrcasecmp(token, "siblingsviable"))
+				job_format_add_fed_siblings_viable(
+							params.format_list,
+							field_size,
+							right_justify, suffix );
+			else if (!xstrcasecmp(token, "siblingsviableraw"))
+				job_format_add_fed_siblings_viable_raw(
 							params.format_list,
 							field_size,
 							right_justify,
diff --git a/src/squeue/print.c b/src/squeue/print.c
index e3155d56ae3..20b588a3bdc 100644
--- a/src/squeue/print.c
+++ b/src/squeue/print.c
@@ -1695,14 +1695,14 @@ int _print_job_fed_origin_raw(job_info_t * job, int width, bool right_justify,
 	return SLURM_SUCCESS;
 }
 
-int _print_job_fed_siblings(job_info_t * job, int width, bool right_justify,
-			    char* suffix)
+int _print_job_fed_siblings_active(job_info_t * job, int width,
+				   bool right_justify, char* suffix)
 {
 	if (job == NULL)
-		_print_str("FED_SIBLINGS", width, right_justify, true);
+		_print_str("ACTIVE_SIBLINGS", width, right_justify, true);
 	else {
-		if (job->fed_siblings_str)
-			_print_str(job->fed_siblings_str, width, right_justify,
+		if (job->fed_siblings_active_str)
+			_print_str(job->fed_siblings_active_str, width, right_justify,
 				   true);
 		else
 			_print_str("NA", width, right_justify, true);
@@ -1713,15 +1713,60 @@ int _print_job_fed_siblings(job_info_t * job, int width, bool right_justify,
 	return SLURM_SUCCESS;
 }
 
-int _print_job_fed_siblings_raw(job_info_t * job, int width, bool right_justify,
-				char* suffix)
+int _print_job_fed_siblings_active_raw(job_info_t * job, int width,
+				       bool right_justify, char* suffix)
+{
+	if (job == NULL)
+		_print_str("ACTIVE_SIBLINGS_RAW", width, right_justify, true);
+	else {
+		int bit = 1;
+		char *ids = NULL;
+		uint64_t tmp_sibs = job->fed_siblings_active;
+		while (tmp_sibs) {
+			if (tmp_sibs & 1)
+				xstrfmtcat(ids, "%s%d", (ids) ? "," : "", bit);
+
+			tmp_sibs >>= 1;
+			bit++;
+		}
+		if (ids)
+			_print_str(ids, width, right_justify, true);
+		else
+			_print_str("NA", width, right_justify, true);
+	}
+
+	if (suffix)
+		printf("%s", suffix);
+	return SLURM_SUCCESS;
+}
+
+int _print_job_fed_siblings_viable(job_info_t * job, int width,
+				   bool right_justify, char* suffix)
+{
+	if (job == NULL)
+		_print_str("VIABLE_SIBLINGS", width, right_justify, true);
+	else {
+		if (job->fed_siblings_viable_str)
+			_print_str(job->fed_siblings_viable_str, width,
+				   right_justify, true);
+		else
+			_print_str("NA", width, right_justify, true);
+	}
+
+	if (suffix)
+		printf("%s", suffix);
+	return SLURM_SUCCESS;
+}
+
+int _print_job_fed_siblings_viable_raw(job_info_t * job, int width,
+				       bool right_justify, char* suffix)
 {
 	if (job == NULL)
-		_print_str("FED_SIBLINGS_RAW", width, right_justify, true);
+		_print_str("VIABLE_SIBLINGS_RAW", width, right_justify, true);
 	else {
 		int bit = 1;
 		char *ids = NULL;
-		uint64_t tmp_sibs = job->fed_siblings;
+		uint64_t tmp_sibs = job->fed_siblings_viable;
 		while (tmp_sibs) {
 			if (tmp_sibs & 1)
 				xstrfmtcat(ids, "%s%d", (ids) ? "," : "", bit);
diff --git a/src/squeue/print.h b/src/squeue/print.h
index 80d14bc1436..a8ef101c0d5 100644
--- a/src/squeue/print.h
+++ b/src/squeue/print.h
@@ -244,11 +244,18 @@ int job_format_add_function(List list, int width, bool right_justify,
 #define job_format_add_fed_origin_raw(list,wid,right,suffix) \
 	job_format_add_function(list,wid,right,suffix, \
 				_print_job_fed_origin_raw)
-#define job_format_add_fed_siblings(list,wid,right,suffix) \
-	job_format_add_function(list,wid,right,suffix, _print_job_fed_siblings)
-#define job_format_add_fed_siblings_raw(list,wid,right,suffix) \
+#define job_format_add_fed_siblings_active(list,wid,right,suffix) \
 	job_format_add_function(list,wid,right,suffix, \
-				_print_job_fed_siblings_raw)
+				_print_job_fed_siblings_active)
+#define job_format_add_fed_siblings_active_raw(list,wid,right,suffix) \
+	job_format_add_function(list,wid,right,suffix, \
+				_print_job_fed_siblings_active_raw)
+#define job_format_add_fed_siblings_viable(list,wid,right,suffix) \
+	job_format_add_function(list,wid,right,suffix, \
+				_print_job_fed_siblings_viable)
+#define job_format_add_fed_siblings_viable_raw(list,wid,right,suffix) \
+	job_format_add_function(list,wid,right,suffix, \
+				_print_job_fed_siblings_viable_raw)
 #define job_format_add_max_cpus(list,wid,right,suffix) \
 	job_format_add_function(list,wid,right,suffix,_print_job_max_cpus)
 #define job_format_add_max_nodes(list,wid,right,suffix) \
@@ -454,10 +461,14 @@ int _print_job_fed_origin(job_info_t * job, int width, bool right_justify,
 			  char* suffix);
 int _print_job_fed_origin_raw(job_info_t * job, int width, bool right_justify,
 			      char* suffix);
-int _print_job_fed_siblings(job_info_t * job, int width, bool right_justify,
-			    char* suffix);
-int _print_job_fed_siblings_raw(job_info_t * job, int width, bool right_justify,
-				char* suffix);
+int _print_job_fed_siblings_active(job_info_t * job, int width,
+				   bool right_justify, char* suffix);
+int _print_job_fed_siblings_active_raw(job_info_t * job, int width,
+				       bool right_justify, char* suffix);
+int _print_job_fed_siblings_viable(job_info_t * job, int width,
+				   bool right_justify, char* suffix);
+int _print_job_fed_siblings_viable_raw(job_info_t * job, int width,
+				       bool right_justify, char* suffix);
 int _print_job_max_cpus(job_info_t * job, int width, bool right_justify,
 			char* suffix);
 int _print_job_max_nodes(job_info_t * job, int width, bool right_justify,
-- 
GitLab