diff --git a/RELEASE_NOTES b/RELEASE_NOTES index a0680bcf792d4dbb543f48a8d76757a78ff89a40..944cea729162fbdde2c785aa47771329c2704e2a 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -81,11 +81,16 @@ Added members to the following struct definitions In slurmbdb_cluster_fed_t: Added feature_list to hold cluster features. In job_desc_msg_t: Added cluster_features for passing cluster features to controller. + Renamed fed_siblings to fed_siblings_active. + Added fed_siblings_viable. In job_info_t: Added cluster_features for passing back a job's cluster features from the controller. + Renamed fed_siblings[_str] to fed_siblings_active[_str]. + Added fed_siblings_viable[_str]. In struct job_details: Added cluster_features to hold requested cluster features. - +In job_fed_details_t: Renamed siblings to siblings_active. + Added siblings_viable. Added the following struct definitions ====================================== diff --git a/doc/man/man1/squeue.1 b/doc/man/man1/squeue.1 index 5eab7e3ef19446515eb8625ae697f08bc2b5933e..88498eb7e5444a5a139096656429fa944dd92402 100644 --- a/doc/man/man1/squeue.1 +++ b/doc/man/man1/squeue.1 @@ -592,14 +592,6 @@ Cluster name where federated job originated from. Cluster ID where federated job originated from. (Valid for federated jobs only) .TP -\fBfedsiblings\fR -Cluster names of where federated job can run. -(Valid for federated jobs only) -.TP -\fBfedsiblingsraw\fR -Cluster IDs of where federated job can run. -(Valid for federated jobs only) -.TP \fBgres\fR Generic resources (gres) required by the job or step. (Valid for jobs and job steps) @@ -826,6 +818,22 @@ Permit rotation of geometry (yes or no), Node use (VIRTUAL or COPROCESSOR), etc. (Valid for jobs only) .TP +\fBsiblingsactive\fR +Cluster names of where federated sibling jobs exist. +(Valid for federated jobs only) +.TP +\fBsiblingsactiveraw\fR +Cluster IDs of where federated sibling jobs exist. +(Valid for federated jobs only) +.TP +\fBsiblingsviable\fR +Cluster names of where federated sibling jobs are viable to run. +(Valid for federated jobs only) +.TP +\fBsiblingsviableraw\fR +Cluster IDs of where federated sibling jobs are viable to run. +(Valid for federated jobs only) +.TP \fBsockets\fR Number of sockets per node requested by the job. This reports the value of the \fBsrun \-\-sockets\-per\-node\fR option. diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index 71c6d749a34b3249863a0f377c19dc6a0df54f30..e6336f420123f35bd17941f13595d74823c9e1c1 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -1417,7 +1417,8 @@ typedef struct job_descriptor { /* For submit, allocate, and update requests */ * from job's allocation, default NONE */ char *features; /* required feature specification, * default NONE */ - uint64_t fed_siblings; /* Bitmap of federation siblings */ + uint64_t fed_siblings_active; /* Bitmap of active fed sibling ids */ + uint64_t fed_siblings_viable; /* Bitmap of viable fed sibling ids */ char *gres; /* comma separated list of required generic * resources, default NONE */ uint32_t group_id; /* group to assume, if run as root. 
*/ @@ -1596,8 +1597,10 @@ typedef struct job_info { uint32_t exit_code; /* exit code for job (status from wait call) */ char *features; /* comma separated list of required features */ char *fed_origin_str; /* Origin cluster's name */ - uint64_t fed_siblings; /* bitmap of sibling cluster ids */ - char *fed_siblings_str; /* string of sibling cluster names */ + uint64_t fed_siblings_active; /* bitmap of active fed sibling ids */ + char *fed_siblings_active_str; /* string of active sibling names */ + uint64_t fed_siblings_viable; /* bitmap of viable fed sibling ids */ + char *fed_siblings_viable_str; /* string of viable sibling names */ char *gres; /* comma separated list of generic resources */ uint32_t gres_detail_cnt; /* Count of gres_detail_str records, * one per allocated node */ diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h index aa102b6d8ac5bdd236a193587b0856fa982dfcfc..ea361f9c79ab2013e4371c699548dc8ce06506c8 100644 --- a/slurm/slurm_errno.h +++ b/slurm/slurm_errno.h @@ -265,6 +265,7 @@ enum { ESLURM_FED_CLUSTER_MAX_CNT = 7100, ESLURM_FED_CLUSTER_MULTIPLE_ASSIGNMENT, ESLURM_INVALID_CLUSTER_FEATURE, + ESLURM_JOB_NOT_FEDERATED, /* plugin and custom errors */ ESLURM_MISSING_TIME_LIMIT = 8000, diff --git a/src/api/job_info.c b/src/api/job_info.c index d7854972ccbf50cb3d3ec531b03d3a2d7b0a6fa9..c995ed4010e37cef760758b42d9525133a1d8f13 100644 --- a/src/api/job_info.c +++ b/src/api/job_info.c @@ -563,9 +563,11 @@ slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner ) } /****** Line 14a (optional) ******/ - if (job_ptr->fed_siblings) { - xstrfmtcat(out, "FedOrigin=%s FedSiblings=%s", - job_ptr->fed_origin_str, job_ptr->fed_siblings_str); + if (job_ptr->fed_siblings_active || job_ptr->fed_siblings_viable) { + xstrfmtcat(out, "FedOrigin=%s ViableSiblings=%s ActiveSiblings=%s", + job_ptr->fed_origin_str, + job_ptr->fed_siblings_viable_str, + job_ptr->fed_siblings_active_str); xstrcat(out, line_end); } diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index d55e048a85309fbf276fcfef43665ec796493f04..f28d3f2cf1629bffab8cffd31cfd8394edd0ebf1 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -446,6 +446,8 @@ static slurm_errtab_t slurm_errtab[] = { "Clusters can only be assigned to one federation" }, { ESLURM_INVALID_CLUSTER_FEATURE, "Invalid cluster feature specification" }, + { ESLURM_JOB_NOT_FEDERATED, + "Not a valid federated job" }, /* plugin and custom errors */ { ESLURM_MISSING_TIME_LIMIT, diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index a565310c59b9eba5f60bb6dcf697d9f53e9aa471..309f965f3acdf3e3cea4abe28b7437a7bdd655ac 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -5635,9 +5635,12 @@ _unpack_job_info_members(job_info_t * job, Buf buffer, safe_unpackstr_xmalloc(&job->fed_origin_str, &uint32_tmp, buffer); - safe_unpack64(&job->fed_siblings, buffer); - safe_unpackstr_xmalloc(&job->fed_siblings_str, &uint32_tmp, + safe_unpack64(&job->fed_siblings_active, buffer); + safe_unpackstr_xmalloc(&job->fed_siblings_active_str, &uint32_tmp, buffer); + safe_unpack64(&job->fed_siblings_viable, buffer); + safe_unpackstr_xmalloc(&job->fed_siblings_viable_str, + &uint32_tmp, buffer); } else if (protocol_version >= SLURM_17_02_PROTOCOL_VERSION) { safe_unpack32(&job->array_job_id, buffer); safe_unpack32(&job->array_task_id, buffer); @@ -5781,9 +5784,9 @@ _unpack_job_info_members(job_info_t * job, Buf buffer, safe_unpackstr_xmalloc(&job->fed_origin_str, &uint32_tmp, buffer); - 
safe_unpack64(&job->fed_siblings, buffer); - safe_unpackstr_xmalloc(&job->fed_siblings_str, &uint32_tmp, - buffer); + safe_unpack64(&job->fed_siblings_viable, buffer); + safe_unpackstr_xmalloc(&job->fed_siblings_viable_str, + &uint32_tmp, buffer); } else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { char *node_inx_str; uint32_t tmp_mem; @@ -7528,7 +7531,8 @@ _pack_job_desc_msg(job_desc_msg_t * job_desc_ptr, Buf buffer, pack32(job_desc_ptr->task_dist, buffer); pack16(job_desc_ptr->kill_on_node_fail, buffer); packstr(job_desc_ptr->features, buffer); - pack64(job_desc_ptr->fed_siblings, buffer); + pack64(job_desc_ptr->fed_siblings_active, buffer); + pack64(job_desc_ptr->fed_siblings_viable, buffer); packstr(job_desc_ptr->gres, buffer); pack32(job_desc_ptr->job_id, buffer); packstr(job_desc_ptr->job_id_str, buffer); @@ -7699,7 +7703,7 @@ _pack_job_desc_msg(job_desc_msg_t * job_desc_ptr, Buf buffer, pack32(job_desc_ptr->task_dist, buffer); pack16(job_desc_ptr->kill_on_node_fail, buffer); packstr(job_desc_ptr->features, buffer); - pack64(job_desc_ptr->fed_siblings, buffer); + pack64(job_desc_ptr->fed_siblings_viable, buffer); packstr(job_desc_ptr->gres, buffer); pack32(job_desc_ptr->job_id, buffer); packstr(job_desc_ptr->job_id_str, buffer); @@ -8055,7 +8059,8 @@ _unpack_job_desc_msg(job_desc_msg_t ** job_desc_buffer_ptr, Buf buffer, safe_unpack16(&job_desc_ptr->kill_on_node_fail, buffer); safe_unpackstr_xmalloc(&job_desc_ptr->features, &uint32_tmp, buffer); - safe_unpack64(&job_desc_ptr->fed_siblings, buffer); + safe_unpack64(&job_desc_ptr->fed_siblings_active, buffer); + safe_unpack64(&job_desc_ptr->fed_siblings_viable, buffer); safe_unpackstr_xmalloc(&job_desc_ptr->gres, &uint32_tmp,buffer); safe_unpack32(&job_desc_ptr->job_id, buffer); safe_unpackstr_xmalloc(&job_desc_ptr->job_id_str, @@ -8228,7 +8233,7 @@ _unpack_job_desc_msg(job_desc_msg_t ** job_desc_buffer_ptr, Buf buffer, safe_unpack16(&job_desc_ptr->kill_on_node_fail, buffer); safe_unpackstr_xmalloc(&job_desc_ptr->features, &uint32_tmp, buffer); - safe_unpack64(&job_desc_ptr->fed_siblings, buffer); + safe_unpack64(&job_desc_ptr->fed_siblings_viable, buffer); safe_unpackstr_xmalloc(&job_desc_ptr->gres, &uint32_tmp,buffer); safe_unpack32(&job_desc_ptr->job_id, buffer); safe_unpackstr_xmalloc(&job_desc_ptr->job_id_str, diff --git a/src/slurmctld/fed_mgr.c b/src/slurmctld/fed_mgr.c index 8491ba7ffb8f24bc35c055fd60464c4f0d1f0827..6209a5681bfefe441fabde82e16a3f91bf51e090 100644 --- a/src/slurmctld/fed_mgr.c +++ b/src/slurmctld/fed_mgr.c @@ -1008,7 +1008,7 @@ static void _revoke_sibling_jobs(struct job_record *job_ptr, uint32_t cluster_id, time_t start_time) { int id = 1; - uint64_t tmp_sibs = job_ptr->fed_details->siblings; + uint64_t tmp_sibs = job_ptr->fed_details->siblings_active; while (tmp_sibs) { if ((tmp_sibs & 1) && (id != fed_mgr_cluster_rec->fed.id) && @@ -1596,7 +1596,7 @@ static uint64_t _cluster_names_to_ids(char *clusters) * * Must have fed_read_lock before entering and NO job locks. * - * Will send willruns to the clusters set in job_desc->fed.siblings. + * Will send willruns to the clusters set in job_desc->fed_siblings_viable. * * IN msg - contains the original job_desc buffer to send to the siblings and to * be able to create a job_desc copy to willrun itself. 
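As an illustrative aside (not part of the patch): the willrun and submit fan-out below tests membership with job_desc->fed_siblings_viable & FED_SIBLING_BIT(sibling->fed.id). The following minimal standalone sketch shows that bitmap convention, assuming cluster id N maps to bit N-1 of a uint64_t (which is how the siblings*raw printers in src/squeue/print.c decode it); SIB_BIT is a hypothetical stand-in for FED_SIBLING_BIT.

/* Illustrative sketch only -- SIB_BIT is a hypothetical stand-in for
 * FED_SIBLING_BIT, assuming cluster id N occupies bit N-1. */
#include <stdint.h>
#include <stdio.h>

#define SIB_BIT(id) ((uint64_t)1 << ((id) - 1))

int main(void)
{
	uint64_t viable = SIB_BIT(1) | SIB_BIT(3);	/* clusters 1 and 3 are viable */

	/* membership test, as in _get_sib_will_runs()/_submit_sibling_jobs() */
	if (!(viable & SIB_BIT(2)))
		printf("skipping cluster 2 -- not in viable list\n");

	/* decode ids back out, as the siblings*raw printers do */
	for (int id = 1; viable; viable >>= 1, id++) {
		if (viable & 1)
			printf("viable sibling id: %d\n", id);
	}
	return 0;
}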
@@ -1645,7 +1645,7 @@ static List _get_sib_will_runs(slurm_msg_t *msg, job_desc_msg_t *job_desc, /* willrun the sibling clusters */ sib_itr = list_iterator_create(fed_mgr_fed_rec->cluster_list); while ((sibling = list_next(sib_itr))) { - if (!(job_desc->fed_siblings & + if (!(job_desc->fed_siblings_viable & FED_SIBLING_BIT(sibling->fed.id))) { if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR) info("skipping cluster %s -- not in cluster list to submit job to", @@ -1711,8 +1711,7 @@ static List _get_sib_will_runs(slurm_msg_t *msg, job_desc_msg_t *job_desc, */ static slurmdb_cluster_rec_t *_find_start_now_sib(slurm_msg_t *msg, job_desc_msg_t *job_desc, - uid_t uid, - uint64_t *avail_sibs) + uid_t uid) { ListIterator itr; List sib_willruns; @@ -1721,7 +1720,6 @@ static slurmdb_cluster_rec_t *_find_start_now_sib(slurm_msg_t *msg, slurmdb_cluster_rec_t *ret_sib = NULL; time_t now = 0; - xassert(avail_sibs); xassert(job_desc); xassert(msg); @@ -1739,8 +1737,6 @@ static slurmdb_cluster_rec_t *_find_start_now_sib(slurm_msg_t *msg, if (!sib_willrun->resp) /* no response if job couldn't run? */ continue; - *avail_sibs |= FED_SIBLING_BIT(sib_willrun->sibling->fed.id); - /* Pick first sibling that can start the job now. siblings are * sorted by weight and resources. */ if (sib_willrun->resp->start_time <= now) { @@ -1836,8 +1832,8 @@ static void _update_sib_job_siblings(job_desc_msg_t *job_desc, uint64_t sibs) slurm_attr_init(&attr); slurm_init_job_desc_msg(&job_update_msg); - job_update_msg.job_id = job_desc->job_id; - job_update_msg.fed_siblings = job_desc->fed_siblings; + job_update_msg.job_id = job_desc->job_id; + job_update_msg.fed_siblings_viable = job_desc->fed_siblings_viable; sib_itr = list_iterator_create(fed_mgr_fed_rec->cluster_list); while ((sibling = list_next(sib_itr))) { @@ -1882,11 +1878,12 @@ static void _update_sib_job_siblings(job_desc_msg_t *job_desc, uint64_t sibs) } /* - * Submit sibling jobs to designated (job_desc->fed_siblings) siblings. + * Submit sibling jobs to designated siblings (job_desc->fed_siblings_viable). * - * Will update job_desc->fed_siblings if a sibling fails to submit a job. + * Will update job_desc->fed_siblings_active with the successful submissions. * - * IN job_desc - job_desc containing job_id and fed_siblings of job to be. + * IN job_desc - job_desc containing job_id and fed_siblings_viable of job to be + * submitted. * IN msg - contains the original job_desc buffer to send to the siblings. * IN alloc_only - true if just an allocation. false if a batch job. 
* RET returns SLURM_SUCCESS if all siblings recieved the job sucessfully or @@ -1914,7 +1911,7 @@ static int _submit_sibling_jobs(job_desc_msg_t *job_desc, slurm_msg_t *msg, sib_msg.data_buffer = msg->buffer; sib_msg.data_type = msg->msg_type; sib_msg.data_version = msg->protocol_version; - sib_msg.fed_siblings = job_desc->fed_siblings; + sib_msg.fed_siblings = job_desc->fed_siblings_viable; sib_msg.job_id = job_desc->job_id; sib_msg.resp_host = job_desc->resp_host; @@ -1926,11 +1923,16 @@ static int _submit_sibling_jobs(job_desc_msg_t *job_desc, slurm_msg_t *msg, if (sibling == fed_mgr_cluster_rec) continue; - /* fed_siblings is set prior to siblings that responded */ - if (!(job_desc->fed_siblings & + /* Only send to available siblings */ + if (!(job_desc->fed_siblings_viable & FED_SIBLING_BIT(sibling->fed.id))) continue; + /* skip sibling if the sibling already has a job */ + if (job_desc->fed_siblings_active & + FED_SIBLING_BIT(sibling->fed.id)) + continue; + sub = xmalloc(sizeof(sib_submit_t)); sub->sibling = sibling; sub->sib_msg = &sib_msg; @@ -1953,18 +1955,13 @@ static int _submit_sibling_jobs(job_desc_msg_t *job_desc, slurm_msg_t *msg, pthread_join(tmp_sub->thread_id, NULL); rc |= tmp_sub->thread_rc; - /* take out the job from the siblings bitmap if there was an - * error. The local host should stay in it if it's there. */ - if (tmp_sub->thread_rc) - job_desc->fed_siblings &= - (~FED_SIBLING_BIT(tmp_sub->sibling->fed.id)); + /* Mark successful submission as active in fed_siblings */ + if (!tmp_sub->thread_rc) + job_desc->fed_siblings_active |= + FED_SIBLING_BIT(tmp_sub->sibling->fed.id); } list_iterator_destroy(thread_itr); - if (rc && job_desc->fed_siblings) { - _update_sib_job_siblings(job_desc, INFINITE64); - } - slurm_attr_destroy(&attr); FREE_NULL_LIST(submit_threads); @@ -2078,9 +2075,10 @@ extern int fed_mgr_job_allocate(slurm_msg_t *msg, job_desc_msg_t *job_desc, { int rc = SLURM_SUCCESS; slurmdb_cluster_rec_t *start_now_sib = NULL; - uint64_t avail_sibs = 0, feature_sibs = 0; + uint64_t feature_sibs = 0; struct job_record *job_ptr = NULL; time_t now = time(NULL); + bool job_held = false; slurmctld_lock_t fed_read_lock = { NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK }; slurmctld_lock_t job_write_lock = { @@ -2106,6 +2104,9 @@ extern int fed_mgr_job_allocate(slurm_msg_t *msg, job_desc_msg_t *job_desc, return SLURM_ERROR; } + if (job_desc->priority == 0) + job_held = true; + lock_slurmctld(job_write_lock); /* get job_id now. Can't submit job to get job_id as job_allocate will * change the job_desc. */ @@ -2115,41 +2116,25 @@ extern int fed_mgr_job_allocate(slurm_msg_t *msg, job_desc_msg_t *job_desc, lock_slurmctld(fed_read_lock); /* Set potential siblings */ - job_desc->fed_siblings = _get_all_sibling_bits(); + job_desc->fed_siblings_viable = _get_all_sibling_bits(); if (job_desc->clusters) - job_desc->fed_siblings &= + job_desc->fed_siblings_viable &= _cluster_names_to_ids(job_desc->clusters); if (feature_sibs) - job_desc->fed_siblings &= feature_sibs; - /* Set avail_sibs to fed.siblings in case job can't start now or is - * being held. */ - avail_sibs = job_desc->fed_siblings; + job_desc->fed_siblings_viable &= feature_sibs; - if ((job_desc->priority != 0) && (job_desc->begin_time <= now)) { + if (!job_held && (job_desc->begin_time <= now)) { /* Don't job/node write lock on _find_start_now_sib. 
It locks * inside _sib_will_run */ - start_now_sib = _find_start_now_sib(msg, job_desc, uid, - &avail_sibs); - - if (!avail_sibs) { - debug("No cluster responded to sibling will_runs"); - avail_sibs = job_desc->fed_siblings; - } + start_now_sib = _find_start_now_sib(msg, job_desc, uid); } - if (job_desc->priority == 0) { - /* don't submit siblings if the job held, siblings will be - * submitted when the job is released. */ - job_desc->fed_siblings = 0; - } else if (start_now_sib == NULL) { - job_desc->fed_siblings = avail_sibs; - } else if (start_now_sib == fed_mgr_cluster_rec) { - job_desc->fed_siblings = - FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id); - } else { - job_desc->fed_siblings = + if (start_now_sib) + job_desc->fed_siblings_viable = FED_SIBLING_BIT(start_now_sib->fed.id); - } + + /* ensure that fed_siblings_active is clear since this is a new job */ + job_desc->fed_siblings_active = 0; /* Submit local job first. Then submit to all siblings. If the local job * fails, then don't worry about sending to the siblings. */ @@ -2167,46 +2152,32 @@ extern int fed_mgr_job_allocate(slurm_msg_t *msg, job_desc_msg_t *job_desc, goto end_it; } + /* mark this cluster as an active sibling if it's in the viable list */ + if (job_desc->fed_siblings_viable & + FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id)) + job_desc->fed_siblings_active |= + FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id); + *job_id_ptr = job_ptr->job_id; - if (job_desc->priority == 0) { - job_ptr->fed_details = xmalloc(sizeof(job_fed_details_t)); + if (job_held) { info("Submitted held federated job %u to %s(self)", job_ptr->job_id, fed_mgr_cluster_rec->name); } else { info("Submitted %sfederated job %u to %s(self)", - (!(job_ptr->fed_details->siblings & + (!(job_ptr->fed_details->siblings_viable & FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id)) ? "tracking " : ""), job_ptr->job_id, fed_mgr_cluster_rec->name); } - unlock_slurmctld(job_write_lock); - - if (_submit_sibling_jobs(job_desc, msg, alloc_only)) { - /* failed to submit a sibling job to a sibling. Need to update - * the local job's sibling bitmap */ + if (!job_held && _submit_sibling_jobs(job_desc, msg, alloc_only)) + info("failed to submit sibling job to one or more siblings"); - lock_slurmctld(job_write_lock); - if ((job_ptr->magic == JOB_MAGIC) && - (job_ptr->job_id == *job_id_ptr)) { - - if (!job_desc->fed_siblings) { - /* we know that we already have a job_ptr so - * just make it a locally scheduleable job. */ - error("Failed to submit fed job to siblings, submitting to local cluster"); - job_desc->fed_siblings |= - FED_SIBLING_BIT( - fed_mgr_cluster_rec->fed.id); - } - set_job_fed_details(job_ptr, job_desc->fed_siblings); - } else { - error("%s: job got messed up. 
this should never happen", - __func__); - } + job_ptr->fed_details->siblings_active = job_desc->fed_siblings_active; + update_job_fed_details(job_ptr); - unlock_slurmctld(job_write_lock); - } + unlock_slurmctld(job_write_lock); end_it: unlock_slurmctld(fed_read_lock); @@ -2231,8 +2202,8 @@ extern bool fed_mgr_is_tracker_only_job(struct job_record *job_ptr) if (job_ptr->fed_details && (origin_id == fed_mgr_cluster_rec->fed.id) && - job_ptr->fed_details->siblings && - (!(job_ptr->fed_details->siblings & + job_ptr->fed_details->siblings_active && + (!(job_ptr->fed_details->siblings_active & FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id)))) rc = true; @@ -2303,8 +2274,10 @@ extern int fed_mgr_job_lock(struct job_record *job_ptr, uint32_t cluster_id) job_ptr->job_id, cluster_id); /* if this cluster is the only sibling, then just assume the lock */ - if ((job_ptr->fed_details->siblings & FED_SIBLING_BIT(cluster_id)) && - (!(job_ptr->fed_details->siblings & ~FED_SIBLING_BIT(cluster_id)))) + if ((job_ptr->fed_details->siblings_viable & + FED_SIBLING_BIT(cluster_id)) && + (!(job_ptr->fed_details->siblings_viable & + ~FED_SIBLING_BIT(cluster_id)))) return SLURM_SUCCESS; if (origin_id != fed_mgr_cluster_rec->fed.id) { @@ -2358,8 +2331,10 @@ extern int fed_mgr_job_unlock(struct job_record *job_ptr, uint32_t cluster_id) job_ptr->job_id, cluster_id); /* if this cluster is the only sibling, then dont worry */ - if ((job_ptr->fed_details->siblings & FED_SIBLING_BIT(cluster_id)) && - (!(job_ptr->fed_details->siblings & ~FED_SIBLING_BIT(cluster_id)))) + if ((job_ptr->fed_details->siblings_viable & + FED_SIBLING_BIT(cluster_id)) && + (!(job_ptr->fed_details->siblings_viable & + ~FED_SIBLING_BIT(cluster_id)))) return SLURM_SUCCESS; if (origin_id != fed_mgr_cluster_rec->fed.id) { @@ -2422,15 +2397,19 @@ extern int fed_mgr_job_start(struct job_record *job_ptr, uint32_t cluster_id, return SLURM_ERROR; } - set_job_fed_details(job_ptr, FED_SIBLING_BIT(cluster_id)); + job_ptr->fed_details->siblings_active = + FED_SIBLING_BIT(cluster_id); + update_job_fed_details(job_ptr); return _persist_fed_job_start(origin_cluster, job_ptr->job_id, cluster_id, job_ptr->start_time); } /* Origin Cluster: */ - if ((job_ptr->fed_details->siblings & FED_SIBLING_BIT(cluster_id)) && - (!(job_ptr->fed_details->siblings & ~FED_SIBLING_BIT(cluster_id)))) + if ((job_ptr->fed_details->siblings_viable & + FED_SIBLING_BIT(cluster_id)) && + (!(job_ptr->fed_details->siblings_viable & + ~FED_SIBLING_BIT(cluster_id)))) { /* if this cluster is the only sibling, then just assume the * lock */ @@ -2444,7 +2423,7 @@ extern int fed_mgr_job_start(struct job_record *job_ptr, uint32_t cluster_id, error("attempt to start sib job %d by cluster %d which doesn't have job lock", job_ptr->job_id, cluster_id); rc = SLURM_ERROR; - } else if (job_ptr->fed_details->siblings & + } else if (job_ptr->fed_details->siblings_active & ~FED_SIBLING_BIT(cluster_id)) { /* cancel all sibling jobs if there are more siblings than just * the cluster that it came from */ @@ -2453,7 +2432,9 @@ extern int fed_mgr_job_start(struct job_record *job_ptr, uint32_t cluster_id, if (!rc) { /* Update where sibling jobs are running */ - set_job_fed_details(job_ptr, FED_SIBLING_BIT(cluster_id)); + job_ptr->fed_details->siblings_active = + FED_SIBLING_BIT(cluster_id); + update_job_fed_details(job_ptr); if (cluster_id != fed_mgr_cluster_rec->fed.id) { /* leave as pending so that it will stay around */ @@ -2658,14 +2639,15 @@ extern int fed_mgr_sib_will_run(slurm_msg_t *msg, job_desc_msg_t 
*job_desc, unlock_slurmctld(job_write_lock); } - if (!job_desc->fed_siblings) { /* may have been set to existing job's */ + if (!job_desc->fed_siblings_viable) { /* may have been set to existing job's */ /* Set potential siblings */ - job_desc->fed_siblings = _get_all_sibling_bits(); if (job_desc->clusters) - job_desc->fed_siblings &= + job_desc->fed_siblings_viable = _cluster_names_to_ids(job_desc->clusters); + else + job_desc->fed_siblings_viable = _get_all_sibling_bits(); if (feature_sibs) - job_desc->fed_siblings &= feature_sibs; + job_desc->fed_siblings_viable &= feature_sibs; } if (!(sib_willruns = _get_sib_will_runs(msg, job_desc, uid))) { @@ -2807,12 +2789,14 @@ extern int fed_mgr_job_requeue(struct job_record *job_ptr) if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR) info("requeueing fed job %d", job_ptr->job_id); + /* clear where actual siblings were */ + job_ptr->fed_details->siblings_active = 0; + /* don't submit siblings for jobs that are held */ if (job_ptr->priority == 0) { job_ptr->job_state &= (~JOB_REQUEUE_FED); - /* clear siblings */ - set_job_fed_details(job_ptr, 0); + update_job_fed_details(job_ptr); /* clear cluster lock */ job_ptr->fed_details->cluster_lock = 0; @@ -2829,12 +2813,12 @@ extern int fed_mgr_job_requeue(struct job_record *job_ptr) &feature_sibs); if (job_ptr->clusters) - job_desc->fed_siblings = + job_desc->fed_siblings_viable = _cluster_names_to_ids(job_ptr->clusters); else - job_desc->fed_siblings = _get_all_sibling_bits(); + job_desc->fed_siblings_viable = _get_all_sibling_bits(); if (feature_sibs) - job_desc->fed_siblings &= feature_sibs; + job_desc->fed_siblings_viable &= feature_sibs; /* have to pack job_desc into a buffer */ slurm_msg_t_init(&msg); @@ -2847,21 +2831,18 @@ extern int fed_mgr_job_requeue(struct job_record *job_ptr) set_buf_offset(buffer, 0); msg.buffer = buffer; - if (_submit_sibling_jobs(job_desc, &msg, false)) { - /* failed to submit a sibling job to a sibling. Need to update - * the local job's sibling bitmap */ - if (!job_desc->fed_siblings) { - /* we know that we already have a job_ptr so - * just make it a locallly scheduleable job. */ - error("Failed to submit fed job to siblings, submitting to local cluster"); - job_desc->fed_siblings |= - FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id); - } - } + if (_submit_sibling_jobs(job_desc, &msg, false)) + if (!job_desc->fed_siblings_active) + error("Failed to submit fed job to any siblings"); + + /* mark this cluster as an active sibling */ + if (job_desc->fed_siblings_viable & + FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id)) + job_desc->fed_siblings_active |= + FED_SIBLING_BIT(fed_mgr_cluster_rec->fed.id); - /* set local job's fed_siblings. 
Could have been modified in - * _submit_sibling_jobs() */ - set_job_fed_details(job_ptr, job_desc->fed_siblings); + job_ptr->fed_details->siblings_active = job_desc->fed_siblings_active; + update_job_fed_details(job_ptr); free_buf(buffer); /* free the environment since all strings are stored in one @@ -2887,7 +2868,7 @@ static int _cancel_sibling_jobs(struct job_record *job_ptr, uint16_t signal, uint16_t flags, uid_t uid) { int id = 1; - uint64_t tmp_sibs = job_ptr->fed_details->siblings; + uint64_t tmp_sibs = job_ptr->fed_details->siblings_active; while (tmp_sibs) { if ((tmp_sibs & 1) && (id != fed_mgr_cluster_rec->fed.id)) { diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 6a1a6bbbbfb8f12123ae671a275c1e572ffb8de0..d4ea6ba2e828e404245fd06a9545ef8a3721b077 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -7327,8 +7327,12 @@ _copy_job_desc_to_job_record(job_desc_msg_t * job_desc, if (job_desc->cluster_features) detail_ptr->cluster_features = xstrdup(job_desc->cluster_features); - if (job_desc->fed_siblings) - set_job_fed_details(job_ptr, job_desc->fed_siblings); + if (job_desc->fed_siblings_viable) { + job_ptr->fed_details = xmalloc(sizeof(job_fed_details_t)); + job_ptr->fed_details->siblings_viable = + job_desc->fed_siblings_viable; + update_job_fed_details(job_ptr); + } if ((job_desc->shared == JOB_SHARED_NONE) && (select_serial == 0)) { detail_ptr->share_res = 0; detail_ptr->whole_node = WHOLE_NODE_REQUIRED; @@ -8916,13 +8920,20 @@ void pack_job(struct job_record *dump_job_ptr, uint16_t show_flags, Buf buffer, if (dump_job_ptr->fed_details) { packstr(dump_job_ptr->fed_details->origin_str, buffer); - pack64(dump_job_ptr->fed_details->siblings, buffer); - packstr(dump_job_ptr->fed_details->siblings_str, + pack64(dump_job_ptr->fed_details->siblings_active, + buffer); + packstr(dump_job_ptr->fed_details->siblings_active_str, + buffer); + pack64(dump_job_ptr->fed_details->siblings_viable, + buffer); + packstr(dump_job_ptr->fed_details->siblings_viable_str, buffer); } else { packnull(buffer); pack64((uint64_t)0, buffer); packnull(buffer); + pack64((uint64_t)0, buffer); + packnull(buffer); } } else if (protocol_version >= SLURM_17_02_PROTOCOL_VERSION) { detail_ptr = dump_job_ptr->details; @@ -9104,8 +9115,9 @@ void pack_job(struct job_record *dump_job_ptr, uint16_t show_flags, Buf buffer, if (dump_job_ptr->fed_details) { packstr(dump_job_ptr->fed_details->origin_str, buffer); - pack64(dump_job_ptr->fed_details->siblings, buffer); - packstr(dump_job_ptr->fed_details->siblings_str, + pack64(dump_job_ptr->fed_details->siblings_active, + buffer); + packstr(dump_job_ptr->fed_details->siblings_active_str, buffer); } else { packnull(buffer); @@ -12156,21 +12168,20 @@ static int _update_job(struct job_record *job_ptr, job_desc_msg_t * job_specs, } } - if (job_specs->fed_siblings) { - slurmctld_lock_t fed_read_lock = { - NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK, READ_LOCK }; - if (job_ptr->fed_details) - info("update_job: setting fed_siblings from %"PRIu64" to %"PRIu64" for job_id %u", - job_ptr->fed_details->siblings, - job_specs->fed_siblings, - job_ptr->job_id); - else - info("update_job: setting fed_siblings to %"PRIu64" for job_id %u", - job_specs->fed_siblings, - job_ptr->job_id); - lock_slurmctld(fed_read_lock); - set_job_fed_details(job_ptr, job_specs->fed_siblings); - unlock_slurmctld(fed_read_lock); + if (job_specs->fed_siblings_viable) { + if (!job_ptr->fed_details) { + error_code = ESLURM_JOB_NOT_FEDERATED; + goto fini; + } + + info("update_job: setting 
fed_siblings from %"PRIu64" to %"PRIu64" for job_id %u", + job_ptr->fed_details->siblings_viable, + job_specs->fed_siblings_viable, + job_ptr->job_id); + + job_ptr->fed_details->siblings_viable = + job_specs->fed_siblings_viable; + update_job_fed_details(job_ptr); } fini: @@ -15658,8 +15669,12 @@ extern job_desc_msg_t *copy_job_record_to_job_desc(struct job_record *job_ptr) job_desc->ntasks_per_socket = mc_ptr->ntasks_per_socket; job_desc->ntasks_per_core = mc_ptr->ntasks_per_core; - if (job_ptr->fed_details) - job_desc->fed_siblings = job_ptr->fed_details->siblings; + if (job_ptr->fed_details) { + job_desc->fed_siblings_active = + job_ptr->fed_details->siblings_active; + job_desc->fed_siblings_viable = + job_ptr->fed_details->siblings_viable; + } #if 0 /* select_jobinfo is unused at job submit time, only it's * components are set. We recover those from the structure below. @@ -16282,7 +16297,8 @@ static void _free_job_fed_details(job_fed_details_t **fed_details_pptr) if (fed_details_ptr) { xfree(fed_details_ptr->origin_str); - xfree(fed_details_ptr->siblings_str); + xfree(fed_details_ptr->siblings_active_str); + xfree(fed_details_ptr->siblings_viable_str); xfree(fed_details_ptr); *fed_details_pptr = NULL; } @@ -16295,8 +16311,10 @@ static void _dump_job_fed_details(job_fed_details_t *fed_details_ptr, pack16(1, buffer); pack32(fed_details_ptr->cluster_lock, buffer); packstr(fed_details_ptr->origin_str, buffer); - pack64(fed_details_ptr->siblings, buffer); - packstr(fed_details_ptr->siblings_str, buffer); + pack64(fed_details_ptr->siblings_active, buffer); + packstr(fed_details_ptr->siblings_active_str, buffer); + pack64(fed_details_ptr->siblings_viable, buffer); + packstr(fed_details_ptr->siblings_viable_str, buffer); } else { pack16(0, buffer); } @@ -16320,9 +16338,16 @@ static int _load_job_fed_details(job_fed_details_t **fed_details_pptr, safe_unpack32(&fed_details_ptr->cluster_lock, buffer); safe_unpackstr_xmalloc(&fed_details_ptr->origin_str, &tmp_uint32, buffer); - safe_unpack64(&fed_details_ptr->siblings, buffer); - safe_unpackstr_xmalloc(&fed_details_ptr->siblings_str, - &tmp_uint32, buffer); + safe_unpack64(&fed_details_ptr->siblings_active, + buffer); + safe_unpackstr_xmalloc( + &fed_details_ptr->siblings_active_str, + &tmp_uint32, buffer); + safe_unpack64(&fed_details_ptr->siblings_viable, + buffer); + safe_unpackstr_xmalloc( + &fed_details_ptr->siblings_viable_str, + &tmp_uint32, buffer); } } else if (protocol_version >= SLURM_17_02_PROTOCOL_VERSION) { safe_unpack16(&tmp_uint16, buffer); @@ -16332,9 +16357,11 @@ static int _load_job_fed_details(job_fed_details_t **fed_details_pptr, safe_unpack32(&fed_details_ptr->cluster_lock, buffer); safe_unpackstr_xmalloc(&fed_details_ptr->origin_str, &tmp_uint32, buffer); - safe_unpack64(&fed_details_ptr->siblings, buffer); - safe_unpackstr_xmalloc(&fed_details_ptr->siblings_str, - &tmp_uint32, buffer); + safe_unpack64(&fed_details_ptr->siblings_viable, + buffer); + safe_unpackstr_xmalloc( + &fed_details_ptr->siblings_viable_str, + &tmp_uint32, buffer); } } @@ -16347,24 +16374,26 @@ unpack_error: return SLURM_ERROR; } -extern void set_job_fed_details(struct job_record *job_ptr, - uint64_t fed_siblings) +/* Set federated job's sibling strings. 
*/ +extern void update_job_fed_details(struct job_record *job_ptr) { xassert(job_ptr); - - if (!job_ptr->fed_details) { - job_ptr->fed_details = - xmalloc(sizeof(job_fed_details_t)); - } else { - xfree(job_ptr->fed_details->siblings_str); - xfree(job_ptr->fed_details->origin_str); - } - - job_ptr->fed_details->siblings = fed_siblings; - job_ptr->fed_details->siblings_str = - fed_mgr_cluster_ids_to_names(fed_siblings); - job_ptr->fed_details->origin_str = - fed_mgr_get_cluster_name( + xassert(job_ptr->fed_details); + + xfree(job_ptr->fed_details->siblings_active_str); + xfree(job_ptr->fed_details->siblings_viable_str); + + job_ptr->fed_details->siblings_active_str = + fed_mgr_cluster_ids_to_names( + job_ptr->fed_details->siblings_active); + job_ptr->fed_details->siblings_viable_str = + fed_mgr_cluster_ids_to_names( + job_ptr->fed_details->siblings_viable); + + /* only set once */ + if (!job_ptr->fed_details->origin_str) + job_ptr->fed_details->origin_str = + fed_mgr_get_cluster_name( fed_mgr_get_cluster_id(job_ptr->job_id)); } diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 3c503d19ddf29a48ca8b152638594545b22932ce..63ba8d39b693b5331f37fe8940b5266a70925a44 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -2588,8 +2588,8 @@ static void _slurm_rpc_job_will_run(slurm_msg_t * msg, bool is_sib_job) if ((job_ptr = find_job_record(job_desc_msg->job_id)) && job_ptr->fed_details) - job_desc_msg->fed_siblings = - job_ptr->fed_details->siblings; + job_desc_msg->fed_siblings_viable = + job_ptr->fed_details->siblings_active; else if (!is_sib_job) error_code = ESLURM_INVALID_JOB_ID; unlock_slurmctld(job_read_lock); @@ -6084,7 +6084,7 @@ static void _slurm_rpc_sib_submit_batch_job(uint32_t uid, slurm_msg_t *msg) sib_msg_t *sib_msg = msg->data; job_desc_msg_t *job_desc = sib_msg->data; job_desc->job_id = sib_msg->job_id; - job_desc->fed_siblings = sib_msg->fed_siblings; + job_desc->fed_siblings_viable = sib_msg->fed_siblings; slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, READ_LOCK }; @@ -6122,8 +6122,8 @@ static void _slurm_rpc_sib_resource_allocation(uint32_t uid, slurm_msg_t *msg) sib_msg_t *sib_msg = msg->data; job_desc_msg_t *job_desc = sib_msg->data; job_desc->job_id = sib_msg->job_id; - job_desc->fed_siblings = sib_msg->fed_siblings; job_desc->resp_host = xstrdup(sib_msg->resp_host); + job_desc->fed_siblings_viable = sib_msg->fed_siblings; if (!msg->conn) { error("Security violation, SIB_RESOURCE_ALLOCATION RPC from uid=%d", diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index 872d243023f98d97a1452437e86dcb689c859ecb..c1fb580c2fb156543a8f6c8721bc1b9577b094ae 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -562,10 +562,14 @@ typedef struct { } acct_policy_limit_set_t; typedef struct { - uint32_t cluster_lock; /* sibling that has lock on job */ - char *origin_str; /* origin cluster name */ - uint64_t siblings; /* bitmap of sibling cluster ids */ - char *siblings_str; /* comma separated list of sibling names */ + uint32_t cluster_lock; /* sibling that has lock on job */ + char *origin_str; /* origin cluster name */ + uint64_t siblings_active; /* bitmap of active sibling ids. */ + char *siblings_active_str; /* comma separated list of actual + sibling names */ + uint64_t siblings_viable; /* bitmap of viable sibling ids. 
*/ + char *siblings_viable_str; /* comma separated list of viable + sibling names */ } job_fed_details_t; /* @@ -2442,10 +2446,11 @@ waitpid_timeout(const char *, pid_t, int *, int); extern void set_partition_tres(); /* - * Set job's siblings and make sibling strings + * Update job's federated siblings strings. + * + * IN job_ptr - job_ptr to update */ -extern void set_job_fed_details(struct job_record *job_ptr, - uint64_t fed_siblings); +extern void update_job_fed_details(struct job_record *job_ptr); /* * purge_job_record - purge specific job record. No testing is performed to diff --git a/src/squeue/opts.c b/src/squeue/opts.c index 0b674cb1a3d73d5239b736141f707479354b1392..a4c1a72d61f393ace442d748be45b15d5c106fac 100644 --- a/src/squeue/opts.c +++ b/src/squeue/opts.c @@ -1395,13 +1395,24 @@ extern int parse_long_format( char* format_long ) field_size, right_justify, suffix ); - else if (!xstrcasecmp(token, "fedsiblings")) - job_format_add_fed_siblings(params.format_list, - field_size, - right_justify, - suffix ); - else if (!xstrcasecmp(token, "fedsiblingsraw")) - job_format_add_fed_siblings_raw( + else if (!xstrcasecmp(token, "siblingsactive")) + job_format_add_fed_siblings_active( + params.format_list, + field_size, + right_justify, suffix ); + else if (!xstrcasecmp(token, "siblingsactiveraw")) + job_format_add_fed_siblings_active_raw( + params.format_list, + field_size, + right_justify, + suffix ); + else if (!xstrcasecmp(token, "siblingsviable")) + job_format_add_fed_siblings_viable( + params.format_list, + field_size, + right_justify, suffix ); + else if (!xstrcasecmp(token, "siblingsviableraw")) + job_format_add_fed_siblings_viable_raw( params.format_list, field_size, right_justify, diff --git a/src/squeue/print.c b/src/squeue/print.c index e3155d56ae32aa8227bec35849f88427b24ef647..20b588a3bdc4abf1aa8ceb61afadb94143d5fd71 100644 --- a/src/squeue/print.c +++ b/src/squeue/print.c @@ -1695,14 +1695,14 @@ int _print_job_fed_origin_raw(job_info_t * job, int width, bool right_justify, return SLURM_SUCCESS; } -int _print_job_fed_siblings(job_info_t * job, int width, bool right_justify, - char* suffix) +int _print_job_fed_siblings_active(job_info_t * job, int width, + bool right_justify, char* suffix) { if (job == NULL) - _print_str("FED_SIBLINGS", width, right_justify, true); + _print_str("ACTIVE_SIBLINGS", width, right_justify, true); else { - if (job->fed_siblings_str) - _print_str(job->fed_siblings_str, width, right_justify, + if (job->fed_siblings_active_str) + _print_str(job->fed_siblings_active_str, width, right_justify, true); else _print_str("NA", width, right_justify, true); @@ -1713,15 +1713,60 @@ int _print_job_fed_siblings(job_info_t * job, int width, bool right_justify, return SLURM_SUCCESS; } -int _print_job_fed_siblings_raw(job_info_t * job, int width, bool right_justify, - char* suffix) +int _print_job_fed_siblings_active_raw(job_info_t * job, int width, + bool right_justify, char* suffix) +{ + if (job == NULL) + _print_str("ACTIVE_SIBLINGS_RAW", width, right_justify, true); + else { + int bit = 1; + char *ids = NULL; + uint64_t tmp_sibs = job->fed_siblings_active; + while (tmp_sibs) { + if (tmp_sibs & 1) + xstrfmtcat(ids, "%s%d", (ids) ? 
"," : "", bit); + + tmp_sibs >>= 1; + bit++; + } + if (ids) + _print_str(ids, width, right_justify, true); + else + _print_str("NA", width, right_justify, true); + } + + if (suffix) + printf("%s", suffix); + return SLURM_SUCCESS; +} + +int _print_job_fed_siblings_viable(job_info_t * job, int width, + bool right_justify, char* suffix) +{ + if (job == NULL) + _print_str("VIABLE_SIBLINGS", width, right_justify, true); + else { + if (job->fed_siblings_viable_str) + _print_str(job->fed_siblings_viable_str, width, + right_justify, true); + else + _print_str("NA", width, right_justify, true); + } + + if (suffix) + printf("%s", suffix); + return SLURM_SUCCESS; +} + +int _print_job_fed_siblings_viable_raw(job_info_t * job, int width, + bool right_justify, char* suffix) { if (job == NULL) - _print_str("FED_SIBLINGS_RAW", width, right_justify, true); + _print_str("VIALBLE_SIBLINGS_RAW", width, right_justify, true); else { int bit = 1; char *ids = NULL; - uint64_t tmp_sibs = job->fed_siblings; + uint64_t tmp_sibs = job->fed_siblings_viable; while (tmp_sibs) { if (tmp_sibs & 1) xstrfmtcat(ids, "%s%d", (ids) ? "," : "", bit); diff --git a/src/squeue/print.h b/src/squeue/print.h index 80d14bc14364f0d1d519e739ad22025dcad3b09d..a8ef101c0d5195652493c901b919fa7c51065a73 100644 --- a/src/squeue/print.h +++ b/src/squeue/print.h @@ -244,11 +244,18 @@ int job_format_add_function(List list, int width, bool right_justify, #define job_format_add_fed_origin_raw(list,wid,right,suffix) \ job_format_add_function(list,wid,right,suffix, \ _print_job_fed_origin_raw) -#define job_format_add_fed_siblings(list,wid,right,suffix) \ - job_format_add_function(list,wid,right,suffix, _print_job_fed_siblings) -#define job_format_add_fed_siblings_raw(list,wid,right,suffix) \ +#define job_format_add_fed_siblings_active(list,wid,right,suffix) \ job_format_add_function(list,wid,right,suffix, \ - _print_job_fed_siblings_raw) + _print_job_fed_siblings_active) +#define job_format_add_fed_siblings_active_raw(list,wid,right,suffix) \ + job_format_add_function(list,wid,right,suffix, \ + _print_job_fed_siblings_active_raw) +#define job_format_add_fed_siblings_viable(list,wid,right,suffix) \ + job_format_add_function(list,wid,right,suffix, \ + _print_job_fed_siblings_viable) +#define job_format_add_fed_siblings_viable_raw(list,wid,right,suffix) \ + job_format_add_function(list,wid,right,suffix, \ + _print_job_fed_siblings_viable_raw) #define job_format_add_max_cpus(list,wid,right,suffix) \ job_format_add_function(list,wid,right,suffix,_print_job_max_cpus) #define job_format_add_max_nodes(list,wid,right,suffix) \ @@ -454,10 +461,14 @@ int _print_job_fed_origin(job_info_t * job, int width, bool right_justify, char* suffix); int _print_job_fed_origin_raw(job_info_t * job, int width, bool right_justify, char* suffix); -int _print_job_fed_siblings(job_info_t * job, int width, bool right_justify, - char* suffix); -int _print_job_fed_siblings_raw(job_info_t * job, int width, bool right_justify, - char* suffix); +int _print_job_fed_siblings_active(job_info_t * job, int width, + bool right_justify, char* suffix); +int _print_job_fed_siblings_active_raw(job_info_t * job, int width, + bool right_justify, char* suffix); +int _print_job_fed_siblings_viable(job_info_t * job, int width, + bool right_justify, char* suffix); +int _print_job_fed_siblings_viable_raw(job_info_t * job, int width, + bool right_justify, char* suffix); int _print_job_max_cpus(job_info_t * job, int width, bool right_justify, char* suffix); int _print_job_max_nodes(job_info_t * job, 
int width, bool right_justify,
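Taken as a whole, the patch splits the old fed_siblings bitmap into two: fed_siblings_viable (clusters a sibling job may be submitted to: all federation siblings, narrowed by the job's requested clusters and by cluster features) and fed_siblings_active (clusters where a sibling job actually exists after submission). The following rough standalone sketch of that narrowing and marking is illustrative only; SIB_BIT and the input bitmaps are hypothetical stand-ins for FED_SIBLING_BIT, _get_all_sibling_bits() and _cluster_names_to_ids(), not the patch's implementation.

/* Illustrative sketch only -- values and helper names are assumptions. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SIB_BIT(id) ((uint64_t)1 << ((id) - 1))

int main(void)
{
	/* assumed federation of 4 clusters; job limited to clusters 1-3 and
	 * its cluster features satisfied only on clusters 2-4 */
	uint64_t all_sibs     = SIB_BIT(1) | SIB_BIT(2) | SIB_BIT(3) | SIB_BIT(4);
	uint64_t named_sibs   = SIB_BIT(1) | SIB_BIT(2) | SIB_BIT(3);
	uint64_t feature_sibs = SIB_BIT(2) | SIB_BIT(3) | SIB_BIT(4);

	/* viable = where the job could be submitted (clusters 2 and 3 here) */
	uint64_t viable = all_sibs & named_sibs & feature_sibs;
	uint64_t active = 0;

	/* pretend the sibling-submit RPC succeeded only on cluster 2 */
	for (int id = 1; id <= 4; id++) {
		bool submitted_ok = (id == 2);
		if ((viable & SIB_BIT(id)) && submitted_ok)
			active |= SIB_BIT(id);	/* successful submissions become active */
	}

	printf("viable=0x%llx active=0x%llx\n",
	       (unsigned long long)viable, (unsigned long long)active);
	return 0;
}

Both bitmaps are surfaced by squeue through the new siblingsviable[raw] and siblingsactive[raw] long-format fields added to parse_long_format() (e.g. squeue --Format=jobid,siblingsviable,siblingsactive), which replace the removed fedsiblings[raw] fields.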