diff --git a/src/api/job_info.c b/src/api/job_info.c
index 6e7b8e75f6d7dda793b015aae6eb15192280e66d..704820dc521877a4949d0a0002b56fd58d2dd68b 100644
--- a/src/api/job_info.c
+++ b/src/api/job_info.c
@@ -1111,15 +1111,27 @@ extern int slurm_job_cpus_allocated_on_node_id(
 	return job_resrcs_ptr->cpu_array_value[i];
 }
 
-extern int slurm_job_cpus_allocated_on_node(
-	job_resources_t *job_resrcs_ptr, const char *node)
+/*
+ * Look up "node" in the job's allocated node list and return the CPU count
+ * reported by slurm_job_cpus_allocated_on_node_id() for its position.
+ * Returns 0 if the node is not in the list. Bad arguments or an unusable
+ * node list are reported through slurm_seterrno_ret(EINVAL).
+ */
+extern int slurm_job_cpus_allocated_on_node(job_resources_t *job_resrcs_ptr,
+					    const char *node)
 {
+	hostlist_t node_hl;
 	int node_id;
 
-	if (!job_resrcs_ptr || !node || !job_resrcs_ptr->node_hl)
+	if (!job_resrcs_ptr || !node || !job_resrcs_ptr->nodes)
 		slurm_seterrno_ret(EINVAL);
 
-	if ((node_id = hostlist_find(job_resrcs_ptr->node_hl, node)) == -1)
+	node_hl = hostlist_create(job_resrcs_ptr->nodes);
+	if (!node_hl)	/* do not search a hostlist that was not built */
+		slurm_seterrno_ret(EINVAL);
+	node_id = hostlist_find(node_hl, node);
+	hostlist_destroy(node_hl);
+	if (node_id == -1)
 		return (0); /* No cpus allocated on this node */
 
 	return slurm_job_cpus_allocated_on_node_id(job_resrcs_ptr, node_id);
diff --git a/src/common/job_resources.c b/src/common/job_resources.c
index df5f777ab910954d63de1415f37cb3b3cd68cbeb..da6f5828507d99faa19314357fcb8b7bc8411a27 100644
--- a/src/common/job_resources.c
+++ b/src/common/job_resources.c
@@ -2,7 +2,7 @@
  * job_resources.c - functions to manage data structure identifying specific
  * CPUs allocated to a job, step or partition
  *****************************************************************************
- * Copyright (C) 2008 Lawrence Livermore National Security.
+ * Copyright (C) 2008-2010 Lawrence Livermore National Security.
  * Written by Morris Jette <jette1@llnl.gov>.
  * CODE-OCEC-09-009. All rights reserved.
*
@@ -41,10 +41,11 @@
 #include <slurm/slurm_errno.h>
 
 #include "src/common/hostlist.h"
-#include "src/common/log.h"
 #include "src/common/job_resources.h"
-#include "src/common/xmalloc.h"
+#include "src/common/log.h"
+#include "src/common/pack.h"
 #include "src/common/xassert.h"
+#include "src/common/xmalloc.h"
 
 #include "src/slurmctld/slurmctld.h"
 
@@ -219,19 +220,37 @@ extern int build_job_resources_cpus_array(job_resources_t *job_resrcs_ptr)
  * This is needed after a restart/reconfiguration since nodes can
  * be added or removed from the system resulting in changing in
  * the bitmap size or bit positions */
-extern void reset_node_bitmap(job_resources_t *job_resrcs_ptr,
-			      bitstr_t *new_node_bitmap)
+extern int reset_node_bitmap(job_resources_t *job_resrcs_ptr, uint32_t job_id)
 {
-	if (job_resrcs_ptr) {
-		if (job_resrcs_ptr->node_bitmap)
-			bit_free(job_resrcs_ptr->node_bitmap);
-		if (new_node_bitmap) {
-			job_resrcs_ptr->node_bitmap =
-				bit_copy(new_node_bitmap);
-			job_resrcs_ptr->nhosts = bit_set_count(new_node_bitmap);
-		} else
-			job_resrcs_ptr->nhosts = 0;
+	/* Returns SLURM_ERROR on an invalid node list or host count change */
+	int i = 0;
+
+	if (!job_resrcs_ptr)
+		return SLURM_SUCCESS;
+
+	/* Clear the pointer once freed so that a record without a node
+	 * list can not hand a stale bitmap to bit_set_count() below */
+	if (job_resrcs_ptr->node_bitmap) {
+		bit_free(job_resrcs_ptr->node_bitmap);
+		job_resrcs_ptr->node_bitmap = NULL;
+	}
+	if (job_resrcs_ptr->nodes &&
+	    (node_name2bitmap(job_resrcs_ptr->nodes, false,
+			      &job_resrcs_ptr->node_bitmap))) {
+		error("Invalid nodes (%s) for job_id %u",
+		      job_resrcs_ptr->nodes, job_id);
+		return SLURM_ERROR;
+	}
+	/* Records unpacked in the pre-2.2 format carry no node list, so
+	 * no bitmap gets built; treat that as zero hosts */
+	if (job_resrcs_ptr->node_bitmap)
+		i = bit_set_count(job_resrcs_ptr->node_bitmap);
+	if (job_resrcs_ptr->nhosts != i) {
+		error("Invalid change in resource allocation node count for "
+		      "job %u, %u to %d", job_id, job_resrcs_ptr->nhosts, i);
+		return SLURM_ERROR;
 	}
+	return SLURM_SUCCESS;
 }
 
 extern int valid_job_resources(job_resources_t *job_resrcs,
@@ -287,8 +306,7 @@ extern int valid_job_resources(job_resources_t *job_resrcs,
 	return SLURM_SUCCESS;
 }
 
-extern job_resources_t *copy_job_resources(
-	job_resources_t *job_resrcs_ptr)
+extern job_resources_t *copy_job_resources(job_resources_t
*job_resrcs_ptr) { int i, sock_inx = 0; job_resources_t *new_layout = xmalloc(sizeof(struct job_resources)); @@ -405,8 +415,7 @@ extern void free_job_resources(job_resources_t **job_resrcs_pptr) xfree(job_resrcs_ptr->memory_used); if (job_resrcs_ptr->node_bitmap) bit_free(job_resrcs_ptr->node_bitmap); - if (job_resrcs_ptr->node_hl) - hostlist_destroy(job_resrcs_ptr->node_hl); + xfree(job_resrcs_ptr->nodes); xfree(job_resrcs_ptr->sock_core_rep_count); xfree(job_resrcs_ptr->sockets_per_node); xfree(job_resrcs_ptr); @@ -428,9 +437,9 @@ extern void log_job_resources(uint32_t job_id, } info("===================="); - info("job_id:%u nhosts:%u ncpus:%u node_req:%u", + info("job_id:%u nhosts:%u ncpus:%u node_req:%u nodes=%s", job_id, job_resrcs_ptr->nhosts, job_resrcs_ptr->ncpus, - job_resrcs_ptr->node_req); + job_resrcs_ptr->node_req, job_resrcs_ptr->nodes); if (job_resrcs_ptr->cpus == NULL) { error("log_job_resources: cpus array is NULL"); @@ -520,7 +529,87 @@ extern void log_job_resources(uint32_t job_id, extern void pack_job_resources(job_resources_t *job_resrcs_ptr, Buf buffer, uint16_t protocol_version) { - if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + if(protocol_version >= SLURM_2_2_PROTOCOL_VERSION) { + if (job_resrcs_ptr == NULL) { + uint32_t empty = NO_VAL; + pack32(empty, buffer); + return; + } + + xassert(job_resrcs_ptr->nhosts); + + pack32(job_resrcs_ptr->nhosts, buffer); + pack32(job_resrcs_ptr->ncpus, buffer); + pack8(job_resrcs_ptr->node_req, buffer); + packstr(job_resrcs_ptr->nodes, buffer); + + if (job_resrcs_ptr->cpu_array_reps) + pack32_array(job_resrcs_ptr->cpu_array_reps, + job_resrcs_ptr->cpu_array_cnt, buffer); + else + pack32_array(job_resrcs_ptr->cpu_array_reps, 0, buffer); + + if (job_resrcs_ptr->cpu_array_value) + pack16_array(job_resrcs_ptr->cpu_array_value, + job_resrcs_ptr->cpu_array_cnt, buffer); + else + pack16_array(job_resrcs_ptr->cpu_array_value, 0, buffer); + + if (job_resrcs_ptr->cpus) + pack16_array(job_resrcs_ptr->cpus, + 
job_resrcs_ptr->nhosts, buffer); + else + pack16_array(job_resrcs_ptr->cpus, 0, buffer); + + if (job_resrcs_ptr->cpus_used) + pack16_array(job_resrcs_ptr->cpus_used, + job_resrcs_ptr->nhosts, buffer); + else + pack16_array(job_resrcs_ptr->cpus_used, 0, buffer); + + if (job_resrcs_ptr->memory_allocated) + pack32_array(job_resrcs_ptr->memory_allocated, + job_resrcs_ptr->nhosts, buffer); + else + pack32_array(job_resrcs_ptr->memory_allocated, 0, buffer); + + if (job_resrcs_ptr->memory_used) + pack32_array(job_resrcs_ptr->memory_used, + job_resrcs_ptr->nhosts, buffer); + else + pack32_array(job_resrcs_ptr->memory_used, 0, buffer); +#ifndef HAVE_BG + { + int i; + uint32_t core_cnt = 0, sock_recs = 0; + xassert(job_resrcs_ptr->cores_per_socket); + xassert(job_resrcs_ptr->sock_core_rep_count); + xassert(job_resrcs_ptr->sockets_per_node); + + for (i=0; i<job_resrcs_ptr->nhosts; i++) { + core_cnt += job_resrcs_ptr->sockets_per_node[i] * + job_resrcs_ptr->cores_per_socket[i] * + job_resrcs_ptr->sock_core_rep_count[i]; + sock_recs += job_resrcs_ptr-> + sock_core_rep_count[i]; + if (sock_recs >= job_resrcs_ptr->nhosts) + break; + } + i++; + pack16_array(job_resrcs_ptr->sockets_per_node, + (uint32_t) i, buffer); + pack16_array(job_resrcs_ptr->cores_per_socket, + (uint32_t) i, buffer); + pack32_array(job_resrcs_ptr->sock_core_rep_count, + (uint32_t) i, buffer); + + xassert(job_resrcs_ptr->core_bitmap); + xassert(job_resrcs_ptr->core_bitmap_used); + pack_bit_str(job_resrcs_ptr->core_bitmap, buffer); + pack_bit_str(job_resrcs_ptr->core_bitmap_used, buffer); + } +#endif + } else { if (job_resrcs_ptr == NULL) { uint32_t empty = NO_VAL; pack32(empty, buffer); @@ -581,7 +670,8 @@ extern void pack_job_resources(job_resources_t *job_resrcs_ptr, Buf buffer, core_cnt += job_resrcs_ptr->sockets_per_node[i] * job_resrcs_ptr->cores_per_socket[i] * job_resrcs_ptr->sock_core_rep_count[i]; - sock_recs += job_resrcs_ptr->sock_core_rep_count[i]; + sock_recs += job_resrcs_ptr-> + 
sock_core_rep_count[i]; if (sock_recs >= job_resrcs_ptr->nhosts) break; } @@ -597,24 +687,20 @@ extern void pack_job_resources(job_resources_t *job_resrcs_ptr, Buf buffer, xassert(job_resrcs_ptr->core_bitmap_used); pack_bit_str(job_resrcs_ptr->core_bitmap, buffer); pack_bit_str(job_resrcs_ptr->core_bitmap_used, buffer); - /* Do not pack the node_bitmap, but rebuild it in - * reset_node_bitmap() based upon job_ptr->nodes and - * the current node table */ } #endif } } extern int unpack_job_resources(job_resources_t **job_resrcs_pptr, - char *nodelist, Buf buffer, - uint16_t protocol_version) + Buf buffer, uint16_t protocol_version) { char *bit_fmt = NULL; uint32_t empty, tmp32; job_resources_t *job_resrcs; xassert(job_resrcs_pptr); - if(protocol_version >= SLURM_2_1_PROTOCOL_VERSION) { + if(protocol_version >= SLURM_2_2_PROTOCOL_VERSION) { safe_unpack32(&empty, buffer); if (empty == NO_VAL) { *job_resrcs_pptr = NULL; @@ -625,6 +711,7 @@ extern int unpack_job_resources(job_resources_t **job_resrcs_pptr, job_resrcs->nhosts = empty; safe_unpack32(&job_resrcs->ncpus, buffer); safe_unpack8(&job_resrcs->node_req, buffer); + safe_unpackstr_xmalloc(&job_resrcs->nodes, &tmp32, buffer); safe_unpack32_array(&job_resrcs->cpu_array_reps, &tmp32, buffer); @@ -673,13 +760,68 @@ extern int unpack_job_resources(job_resources_t **job_resrcs_pptr, unpack_bit_str(&job_resrcs->core_bitmap, buffer); unpack_bit_str(&job_resrcs->core_bitmap_used, buffer); - /* node_bitmap is not packed, but rebuilt in reset_node_bitmap() - * based upon job_ptr->nodes and the current node table */ #endif - } - if(nodelist) - job_resrcs->node_hl = hostlist_create(nodelist); + } else { + safe_unpack32(&empty, buffer); + if (empty == NO_VAL) { + *job_resrcs_pptr = NULL; + return SLURM_SUCCESS; + } + job_resrcs = xmalloc(sizeof(struct job_resources)); + job_resrcs->nhosts = empty; + safe_unpack32(&job_resrcs->ncpus, buffer); + safe_unpack8(&job_resrcs->node_req, buffer); + + 
safe_unpack32_array(&job_resrcs->cpu_array_reps, + &tmp32, buffer); + if (tmp32 == 0) + xfree(job_resrcs->cpu_array_reps); + job_resrcs->cpu_array_cnt = tmp32; + + safe_unpack16_array(&job_resrcs->cpu_array_value, + &tmp32, buffer); + if (tmp32 == 0) + xfree(job_resrcs->cpu_array_value); + + if (tmp32 != job_resrcs->cpu_array_cnt) + goto unpack_error; + + safe_unpack16_array(&job_resrcs->cpus, &tmp32, buffer); + if (tmp32 == 0) + xfree(job_resrcs->cpus); + if (tmp32 != job_resrcs->nhosts) + goto unpack_error; + safe_unpack16_array(&job_resrcs->cpus_used, &tmp32, buffer); + if (tmp32 == 0) + xfree(job_resrcs->cpus_used); + + safe_unpack32_array(&job_resrcs->memory_allocated, + &tmp32, buffer); + if (tmp32 == 0) + xfree(job_resrcs->memory_allocated); + safe_unpack32_array(&job_resrcs->memory_used, &tmp32, buffer); + if (tmp32 == 0) + xfree(job_resrcs->memory_used); + +#ifndef HAVE_BG + safe_unpack16_array(&job_resrcs->sockets_per_node, + &tmp32, buffer); + if (tmp32 == 0) + xfree(job_resrcs->sockets_per_node); + safe_unpack16_array(&job_resrcs->cores_per_socket, + &tmp32, buffer); + if (tmp32 == 0) + xfree(job_resrcs->cores_per_socket); + safe_unpack32_array(&job_resrcs->sock_core_rep_count, + &tmp32, buffer); + if (tmp32 == 0) + xfree(job_resrcs->sock_core_rep_count); + + unpack_bit_str(&job_resrcs->core_bitmap, buffer); + unpack_bit_str(&job_resrcs->core_bitmap_used, buffer); +#endif + } *job_resrcs_pptr = job_resrcs; return SLURM_SUCCESS; diff --git a/src/common/job_resources.h b/src/common/job_resources.h index 561e828ad6594a2ec21e146ee2bf7ea10c4f26f1..5159b600a505c41484042e89440fee1f2b05997d 100644 --- a/src/common/job_resources.h +++ b/src/common/job_resources.h @@ -79,6 +79,7 @@ * here do NOT get cleared as the job completes on a * node * node_req - NODE_CR_RESERVED|NODE_CR_ONE_ROW|NODE_CR_AVAILABLE + * nodes - Names of nodes in original job allocation * ncpus - Number of processors in the allocation * sock_core_rep_count - How many consecutive nodes that 
sockets_per_node
 * and cores_per_socket apply to, build by
@@ -118,11 +119,8 @@ struct job_resources {
 	uint32_t *	memory_used;
 	uint32_t	nhosts;
 	bitstr_t *	node_bitmap;
-	hostlist_t	node_hl;	/* will be set on unpack if
-					   a nodelist is given to create.
-					   Used primarily for api
-					   functions */
 	uint8_t		node_req;
+	char *		nodes;
 	uint32_t	ncpus;
 	uint32_t *	sock_core_rep_count;
 	uint16_t *	sockets_per_node;
@@ -186,15 +184,13 @@ extern void log_job_resources(uint32_t job_id,
 extern void pack_job_resources(job_resources_t *job_resrcs_ptr, Buf buffer,
 			       uint16_t protocol_version);
 extern int unpack_job_resources(job_resources_t **job_resrcs_pptr,
-				char *nodelist, Buf buffer,
-				uint16_t protocol_version);
+				Buf buffer, uint16_t protocol_version);
 
 /* Reset the node_bitmap in a job_resources data structure
  * This is needed after a restart/reconfiguration since nodes can
  * be added or removed from the system resulting in changing in
  * the bitmap size or bit positions */
-extern void reset_node_bitmap(job_resources_t *job_resrcs_ptr,
-			      bitstr_t *new_node_bitmap);
+extern int reset_node_bitmap(job_resources_t *job_resrcs_ptr, uint32_t job_id);
 
 /* For a given node_id, socket_id and core_id, get it's offset within
  * the core bitmap */
diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c
index 199ce8ae1efb86f1ca99a5bb01f03af15c9e9ab0..387979e6eb29c9f71fa055e8658e2008cfa17d42 100644
--- a/src/common/slurm_protocol_pack.c
+++ b/src/common/slurm_protocol_pack.c
@@ -3570,7 +3570,7 @@ _unpack_job_info_members(job_info_t * job, Buf buffer,
 		safe_unpackstr_xmalloc(&job->resv_name, &uint32_tmp, buffer);
 
 		safe_unpack32(&job->exit_code, buffer);
-		unpack_job_resources(&job->job_resrcs, job->nodes, buffer,
+		unpack_job_resources(&job->job_resrcs, buffer,
 				     protocol_version);
 
 		safe_unpackstr_xmalloc(&job->name, &uint32_tmp, buffer);
@@ -3673,8 +3673,12 @@ _unpack_job_info_members(job_info_t * job, Buf buffer,
 		safe_unpackstr_xmalloc(&job->resv_name, &uint32_tmp, buffer);
 
 		safe_unpack32(&job->exit_code, buffer);
-		unpack_job_resources(&job->job_resrcs, job->nodes, buffer,
+		unpack_job_resources(&job->job_resrcs, buffer,
 				     protocol_version);
+		/* Kludge for lack of resource node list in SLURM version 2.1.
+		 * job_resrcs is NULL when no resources were packed (NO_VAL) */
+		if (job->job_resrcs)
+			job->job_resrcs->nodes = xstrdup(job->nodes);
 
 		safe_unpackstr_xmalloc(&job->name, &uint32_tmp, buffer);
 		safe_unpackstr_xmalloc(&job->wckey, &uint32_tmp, buffer);
diff --git a/src/plugins/select/bluegene/plugin/bg_job_place.c b/src/plugins/select/bluegene/plugin/bg_job_place.c
index e856010156122383e0b81615f16244eec84ae3eb..b7c657a22fe7c0cc2f3dbf3bb080c9fb537f8472 100644
--- a/src/plugins/select/bluegene/plugin/bg_job_place.c
+++ b/src/plugins/select/bluegene/plugin/bg_job_place.c
@@ -3,7 +3,7 @@
  * functions.
  *****************************************************************************
  * Copyright (C) 2004-2007 The Regents of the University of California.
- * Copyright (C) 2008 Lawrence Livermore National Security.
+ * Copyright (C) 2008-2010 Lawrence Livermore National Security.
  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Dan Phung <phung4@llnl.gov> and Morris Jette <jette1@llnl.gov> * @@ -1310,6 +1310,7 @@ static void _build_select_struct(struct job_record *job_ptr, job_resrcs_ptr->nhosts = bit_set_count(bitmap); job_resrcs_ptr->ncpus = job_ptr->details->min_cpus; job_resrcs_ptr->node_bitmap = bit_copy(bitmap); + job_resrcs_ptr->nodes = bitmap2node_name(bitmap); if (job_resrcs_ptr->node_bitmap == NULL) fatal("bit_copy malloc failure"); diff --git a/src/plugins/select/cons_res/job_test.c b/src/plugins/select/cons_res/job_test.c index 9c1b5280f3a42de425b25adf83bcc4e67b374d71..78022ebb4ea6b474b52665079d4ca86d69862828 100644 --- a/src/plugins/select/cons_res/job_test.c +++ b/src/plugins/select/cons_res/job_test.c @@ -2216,13 +2216,14 @@ alloc_job: /** create the struct_job_res **/ job_res = create_job_resources(); job_res->node_bitmap = bit_copy(bitmap); + job_res->nodes = bitmap2node_name(bitmap); if (job_res->node_bitmap == NULL) fatal("bit_copy malloc failure"); job_res->nhosts = bit_set_count(bitmap); - job_res->ncpus = job_res->nhosts; + job_res->ncpus = job_res->nhosts; if (job_ptr->details->ntasks_per_node) - job_res->ncpus *= job_ptr->details->ntasks_per_node; - job_res->ncpus = MAX(job_res->ncpus, + job_res->ncpus *= job_ptr->details->ntasks_per_node; + job_res->ncpus = MAX(job_res->ncpus, job_ptr->details->min_cpus); job_res->node_req = job_node_req; job_res->cpus = cpu_count; diff --git a/src/plugins/select/linear/select_linear.c b/src/plugins/select/linear/select_linear.c index 970e57b200622c8f0779ba33a1a9c356e34af3d0..93c2f22cf75ca71d31eabeac5116c680c687b885 100644 --- a/src/plugins/select/linear/select_linear.c +++ b/src/plugins/select/linear/select_linear.c @@ -564,6 +564,7 @@ static void _build_select_struct(struct job_record *job_ptr, bitstr_t *bitmap) job_resrcs_ptr->memory_used = xmalloc(sizeof(uint32_t) * node_cnt); job_resrcs_ptr->nhosts = node_cnt; job_resrcs_ptr->node_bitmap = bit_copy(bitmap); + job_resrcs_ptr->nodes = bitmap2node_name(bitmap); if 
(job_resrcs_ptr->node_bitmap == NULL) fatal("bit_copy malloc failure"); job_resrcs_ptr->ncpus = job_ptr->total_cpus; diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 492c2e451ab4ac54ab498a286da0ae6259be8f0f..517b90f15bb58a4af175469e999d528b4d2d5293 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -913,7 +913,7 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) if (select_g_select_jobinfo_unpack(&select_jobinfo, buffer, protocol_version)) goto unpack_error; - if (unpack_job_resources(&job_resources, NULL, buffer, + if (unpack_job_resources(&job_resources, buffer, protocol_version)) goto unpack_error; @@ -1043,7 +1043,7 @@ static int _load_job_state(Buf buffer, uint16_t protocol_version) if (select_g_select_jobinfo_unpack(&select_jobinfo, buffer, protocol_version)) goto unpack_error; - if (unpack_job_resources(&job_resources, NULL, buffer, + if (unpack_job_resources(&job_resources, buffer, protocol_version)) goto unpack_error; @@ -4983,7 +4983,8 @@ void reset_job_bitmaps(void) job_ptr->nodes, job_ptr->job_id); job_fail = true; } - reset_node_bitmap(job_ptr->job_resrcs, job_ptr->node_bitmap); + if (reset_node_bitmap(job_ptr->job_resrcs, job_ptr->job_id)) + job_fail = true; if (!job_fail && !IS_JOB_FINISHED(job_ptr) && job_ptr->job_resrcs && (cr_flag || gang_flag) && valid_job_resources(job_ptr->job_resrcs, diff --git a/src/slurmctld/node_scheduler.c b/src/slurmctld/node_scheduler.c index 9b5411c0e81d679f887ef1d4de5400daff5956f9..ad9ac24317b1276296a7fd8ced69f1c6a8eb8efe 100644 --- a/src/slurmctld/node_scheduler.c +++ b/src/slurmctld/node_scheduler.c @@ -1220,7 +1220,12 @@ extern int select_nodes(struct job_record *job_ptr, bool test_only, job_ptr->state_reason = WAIT_NO_REASON; xfree(job_ptr->state_desc); - job_ptr->nodes = bitmap2node_name(select_bitmap); + if (job_ptr->job_resrcs && job_ptr->job_resrcs->nodes) + job_ptr->nodes = xstrdup(job_ptr->job_resrcs->nodes); + else { + error("Select plugin failed to 
set job resources, nodes"); + job_ptr->nodes = bitmap2node_name(select_bitmap); + } select_bitmap = NULL; /* nothing left to free */ allocate_nodes(job_ptr); build_node_details(job_ptr);