diff --git a/NEWS b/NEWS index 197fed2866db258a62104f0f164136a161a4f574..134e17276493ef81f1424774d4e7dc3514234ead 100644 --- a/NEWS +++ b/NEWS @@ -4,6 +4,9 @@ documents those changes that are of interest to users and admins. * Changes in SLURM 1.4.0-pre5 ============================= -- Correction inf setting of SLURM_CPU_BIND environment variable. + -- Rebuild slurmctld's job select_jobinfo->node_bitmap on restart/reconfigure + of the daemon rather than restoring the bitmap since the nodes in a system + can change (be added or removed). * Changes in SLURM 1.4.0-pre4 ============================= diff --git a/src/common/select_job_res.c b/src/common/select_job_res.c index 6ef494c5fd6c5cb0bba1ac904f9ad2414de7ebc1..758eecc095d46e702e8e0eb386852aab4d835f16 100644 --- a/src/common/select_job_res.c +++ b/src/common/select_job_res.c @@ -56,7 +56,18 @@ extern select_job_res_t create_select_job_res(void) return select_job_res; } - +/* Set the socket and core counts associated with a set of selected + * nodes of a select_job_res data structure based upon slurmctld state. + * (sets cores_per_socket, sockets_per_node, and sock_core_rep_count based + * upon the value of node_bitmap, also creates core_bitmap based upon + * the total number of cores in the allocation). Call this ONLY from + * slurmctld. 
Example of use: + * + * select_job_res_t select_job_res_ptr = create_select_job_res(); + * node_name2bitmap("dummy[2,5,12,16]", true, &(select_job_res_ptr->node_bitmap)); + * rc = build_select_job_res(select_job_res_ptr, node_record_table_ptr, + * slurmctld_conf.fast_schedule); + */ extern int build_select_job_res(select_job_res_t select_job_res, void *node_rec_table, uint16_t fast_schedule) @@ -198,6 +209,23 @@ extern int build_select_job_res_cpus_array(select_job_res_t select_job_res_ptr) return SLURM_SUCCESS; } +/* Reset the node_bitmap in a select_job_res data structure. + * This is needed after a restart/reconfiguration since nodes can + * be added or removed from the system, changing the bitmap size or + * bit positions. The old bitmap is freed; a NULL input leaves it NULL. */ +extern void reset_node_bitmap(select_job_res_t select_job_res_ptr, + bitstr_t *new_node_bitmap) +{ + if (select_job_res_ptr) { + if (select_job_res_ptr->node_bitmap) + bit_free(select_job_res_ptr->node_bitmap); + /* Clear the pointer so a NULL new_node_bitmap never + * leaves a dangling reference to the freed bitmap */ + select_job_res_ptr->node_bitmap = new_node_bitmap ? + bit_copy(new_node_bitmap) : NULL; + } +} + extern int valid_select_job_res(select_job_res_t select_job_res, void *node_rec_table, uint16_t fast_schedule) @@ -482,7 +510,7 @@ extern void pack_select_job_res(select_job_res_t select_job_res_ptr, Buf buffer) { int i; - uint32_t core_cnt = 0, host_cnt = 0, sock_recs = 0; + uint32_t core_cnt = 0, sock_recs = 0; if (select_job_res_ptr == NULL) { uint32_t empty = NO_VAL; @@ -555,18 +583,15 @@ extern void pack_select_job_res(select_job_res_t select_job_res_ptr, pack_bit_fmt(select_job_res_ptr->core_bitmap, buffer); xassert(core_cnt == bit_size(select_job_res_ptr->core_bitmap_used)); pack_bit_fmt(select_job_res_ptr->core_bitmap_used, buffer); - host_cnt = bit_size(select_job_res_ptr->node_bitmap); - /* FIXME: don't pack the node_bitmap, but rebuild it based upon - * select_job_res_ptr->node_list */ - pack32(host_cnt, buffer); - pack_bit_fmt(select_job_res_ptr->node_bitmap, buffer); + /* Do not pack the node_bitmap, but
rebuild it in reset_node_bitmap() + * based upon job_ptr->nodes and the current node table */ } extern int unpack_select_job_res(select_job_res_t *select_job_res_pptr, Buf buffer) { char *bit_fmt = NULL; - uint32_t core_cnt, empty, host_cnt, tmp32; + uint32_t core_cnt, empty, tmp32; select_job_res_t select_job_res; xassert(select_job_res_pptr); @@ -626,15 +651,9 @@ extern int unpack_select_job_res(select_job_res_t *select_job_res_pptr, if (bit_unfmt(select_job_res->core_bitmap_used, bit_fmt)) goto unpack_error; xfree(bit_fmt); + /* node_bitmap is not packed, but rebuilt in reset_node_bitmap() + * based upon job_ptr->nodes and the current node table */ - /* FIXME: but recreate node_bitmap based upon - * select_job_res_ptr->node_list */ - safe_unpack32(&host_cnt, buffer); /* NOTE: Not part of struct */ - safe_unpackstr_xmalloc(&bit_fmt, &tmp32, buffer); - select_job_res->node_bitmap = bit_alloc((bitoff_t) host_cnt); - if (bit_unfmt(select_job_res->node_bitmap, bit_fmt)) - goto unpack_error; - xfree(bit_fmt); *select_job_res_pptr = select_job_res; return SLURM_SUCCESS; diff --git a/src/common/select_job_res.h b/src/common/select_job_res.h index cc0a10011f31683f4b9578468db250e2fe8f8c05..f2db10bafe7cf8d96ba84f3ec5f8f15c33d8cd78 100644 --- a/src/common/select_job_res.h +++ b/src/common/select_job_res.h @@ -176,6 +176,13 @@ extern void pack_select_job_res(select_job_res_t select_job_res_ptr, extern int unpack_select_job_res(select_job_res_t *select_job_res_pptr, Buf buffer); +/* Reset the node_bitmap in a select_job_res data structure. + * This is needed after a restart/reconfiguration since nodes can + * be added or removed from the system, which changes the bitmap + * size or bit positions */ +extern void reset_node_bitmap(select_job_res_t select_job_res_ptr, + bitstr_t *new_node_bitmap); + /* For a given node_id, socket_id and core_id, get it's offset within * the core bitmap */ extern int get_select_job_res_offset(select_job_res_t select_job_res_ptr, diff
--git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 0d985ad95e4ba071b9e0ab2577d23b0d184f3c4b..208efb4deb6ac355a9fbd14dbf24ba05c9ec131e 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -3746,6 +3746,8 @@ void reset_job_bitmaps(void) job_ptr->nodes, job_ptr->job_id); job_fail = true; } + reset_node_bitmap(job_ptr->select_job, + job_ptr->node_bitmap); _reset_step_bitmaps(job_ptr); build_node_details(job_ptr); /* set node_addr */