From c2444b3d146768de5db43498efe17582f2960b3f Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Wed, 7 Apr 2010 22:31:59 +0000 Subject: [PATCH] In the job_resources structure, if a job changes size (relinquishes nodes), the node_bitmap will remain unchanged, but cpus, cpus_used, cpu_array_*, and memory_used will be updated (e.g. cpus and mem_used on that node cleared). --- src/common/job_resources.c | 33 +++++----- src/common/job_resources.h | 10 ++- src/plugins/select/cons_res/select_cons_res.c | 11 ++-- src/scontrol/update_job.c | 21 +++---- src/slurmctld/job_mgr.c | 7 +-- src/slurmctld/proc_req.c | 63 ++++++++++++++----- 6 files changed, 86 insertions(+), 59 deletions(-) diff --git a/src/common/job_resources.c b/src/common/job_resources.c index e5a8f90e2e1..8ce8c82fbeb 100644 --- a/src/common/job_resources.c +++ b/src/common/job_resources.c @@ -968,47 +968,46 @@ extern void add_job_to_cores(job_resources_t *job_resrcs_ptr, } /* Given a job pointer and a global node index, return the index of that - * node in the job_resrcs_ptr->cpu_array_value. Return -1 if invalid */ -extern int job_resources_node_inx_to_cpu_array_inx( - job_resources_t *job_resrcs_ptr, int node_inx) + * node in the job_resrcs_ptr->cpus. 
Return -1 if invalid */ +extern int job_resources_node_inx_to_cpu_inx(job_resources_t *job_resrcs_ptr, + int node_inx) { - int first_inx, i, node_cnt, node_sum; + int first_inx, i, node_offset; /* Test for error cases */ if (!job_resrcs_ptr || !job_resrcs_ptr->node_bitmap) { - error("job_resources_node_inx_to_cpu_array_inx: " + error("job_resources_node_inx_to_cpu_inx: " "no job_resrcs or node_bitmap"); return -1; } if (!bit_test(job_resrcs_ptr->node_bitmap, node_inx)) { - error("job_resources_node_inx_to_cpu_array_inx: " + error("job_resources_node_inx_to_cpu_inx: " "Invalid node_inx"); return -1; } if (job_resrcs_ptr->cpu_array_cnt == 0) { - error("job_resources_node_inx_to_cpu_array_inx: " + error("job_resources_node_inx_to_cpu_inx: " "Invalid cpu_array_cnt"); return -1; } /* Only one record, no need to search */ - if (job_resrcs_ptr->cpu_array_cnt == 1) + if (job_resrcs_ptr->nhosts == 1) return 0; /* Scan bitmap, convert node_inx to node_cnt within job's allocation */ first_inx = bit_ffs(job_resrcs_ptr->node_bitmap); - for (i=first_inx, node_cnt=0; i<node_inx; i++) { + for (i=first_inx, node_offset=-1; i<=node_inx; i++) { if (bit_test(job_resrcs_ptr->node_bitmap, i)) - node_cnt++; + node_offset++; } - /* if (bit_test(job_resrcs_ptr->node_bitmap, node_inx)) */ - node_cnt++; - for (i=0, node_sum=0; i<job_resrcs_ptr->cpu_array_cnt; i++) { - node_sum += job_resrcs_ptr->cpu_array_reps[i]; - if (node_sum >= node_cnt) - return i; + if (node_offset >= job_resrcs_ptr->nhosts) { + error("job_resources_node_inx_to_cpu_inx: " + "Found %d of %d nodes", + job_resrcs_ptr->nhosts, node_offset); + return -1; } - return -1; + return node_offset; } diff --git a/src/common/job_resources.h b/src/common/job_resources.h index 6f812b3fd97..80f70781871 100644 --- a/src/common/job_resources.h +++ b/src/common/job_resources.h @@ -100,6 +100,10 @@ * | Sock_0 | Sock_1 | Sock_0 | Sock_1 | * | Core_0 | Core_1 | Core_0 | Core_1 | Core_0 | Core_1 | Core_0 | Core_1 | * | Bit_0 | Bit_1 | Bit_2 | 
Bit_3 | Bit_4 | Bit_5 | Bit_6 | Bit_7 | + * + * If a job changes size (relinquishes nodes), the node_bitmap will remain + * unchanged, but cpus, cpus_used, cpu_array_*, and memory_used will be + * updated (e.g. cpus and mem_used on that node cleared). */ struct job_resources { bitstr_t * core_bitmap; @@ -237,8 +241,8 @@ extern void add_job_to_cores(job_resources_t *job_resrcs_ptr, const uint32_t *core_rep_count); /* Given a job pointer and a global node index, return the index of that - * node in the job_resrcs_ptr->cpu_array_value. Return -1 if invalid */ -extern int job_resources_node_inx_to_cpu_array_inx( - job_resources_t *job_resrcs_ptr, int node_inx); + * node in the job_resrcs_ptr->cpus. Return -1 if invalid */ +extern int job_resources_node_inx_to_cpu_inx(job_resources_t *job_resrcs_ptr, + int node_inx); #endif /* !_JOB_RESOURCES_H */ diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index a2d5768ef49..014b3d0fbaf 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -1013,9 +1013,8 @@ static int _rm_job_from_res(struct part_res_record *part_record_ptr, return SLURM_ERROR; } - if (!p_ptr->row) { + if (!p_ptr->row) return SLURM_SUCCESS; - } /* remove the job from the job_list */ n = 0; @@ -1104,6 +1103,11 @@ static int _rm_job_from_one_node(struct job_record *job_ptr, if (i != node_inx) continue; + if (job->cpus[n] == 0) { + info("attempt to remove node %s from job %u again", + node_ptr->name, job_ptr->job_id); + return SLURM_SUCCESS; + } job->cpus[n] = 0; job->nprocs = build_job_resources_cpu_array(job); clear_job_resources_node(job, n); @@ -1139,9 +1143,8 @@ static int _rm_job_from_one_node(struct job_record *job_ptr, return SLURM_ERROR; } - if (!p_ptr->row) { + if (!p_ptr->row) return SLURM_SUCCESS; - } /* look for the job in the partition's job_list */ n = 0; diff --git a/src/scontrol/update_job.c b/src/scontrol/update_job.c index 
2ea03f68f19..37116d83be7 100644 --- a/src/scontrol/update_job.c +++ b/src/scontrol/update_job.c @@ -629,6 +629,9 @@ static void _update_job_size(uint32_t job_id) char *fname_csh = NULL, *fname_sh = NULL; FILE *resize_csh = NULL, *resize_sh = NULL; + if (!getenv("SLURM_JOBID")) + return; /*No job environment here to update */ + if (slurm_allocation_lookup_lite(job_id, &alloc_info) != SLURM_SUCCESS) { slurm_perror("slurm_allocation_lookup_lite"); @@ -649,8 +652,8 @@ static void _update_job_size(uint32_t job_id) strerror(errno)); goto fini; } - chmod(fname_csh, 0500); /* Make file executable */ - chmod(fname_sh, 0500); + chmod(fname_csh, 0700); /* Make file executable */ + chmod(fname_sh, 0700); if (getenv("SLURM_NODELIST")) { fprintf(resize_sh, "export SLURM_NODELIST=\"%s\"\n", @@ -677,15 +680,6 @@ static void _update_job_size(uint32_t job_id) alloc_info->node_cnt); } if (getenv("SLURM_JOB_CPUS_PER_NODE")) { -#if 1 - fprintf(resize_sh, "unset SLURM_JOB_CPUS_PER_NODE\n"); - fprintf(resize_csh, "unsetenv SLURM_JOB_CPUS_PER_NODE\n"); -#else - /* The job resource structure is currently based upon the - * original job allocation, so we don't have sufficient - * information to recreate this environment variable today. - * This is a possible future enhancement if/when job resource - * information is corrected after job resize */ char *tmp; tmp = uint32_compressed_to_str(alloc_info->num_cpu_groups, alloc_info->cpus_per_node, @@ -695,7 +689,6 @@ static void _update_job_size(uint32_t job_id) fprintf(resize_csh, "setenv SLURM_JOB_CPUS_PER_NODE \"%s\"\n", tmp); xfree(tmp); -#endif } if (getenv("SLURM_TASKS_PER_NODE")) { /* We don't have sufficient information to recreate this */ @@ -704,8 +697,8 @@ static void _update_job_size(uint32_t job_id) } printf("To reset SLURM environment variables, execute\n"); - printf(" For bash or sh shells: . ./%s\n", fname_sh); - printf(" For csh shells: source ./%s\n", fname_csh); + printf(" For bash or sh shells: . 
./%s\n", fname_sh); + printf(" For csh shells: source ./%s\n", fname_csh); fini: slurm_free_resource_allocation_response_msg(alloc_info); xfree(fname_csh); diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index f80a2cd41ea..a0038894d02 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -4147,18 +4147,15 @@ extern int job_update_cpu_cnt(struct job_record *job_ptr, int node_inx) * cpu count isn't set up on that system. */ return SLURM_SUCCESS; #endif - if((offset = job_resources_node_inx_to_cpu_array_inx( + if ((offset = job_resources_node_inx_to_cpu_inx( job_ptr->job_resrcs, node_inx)) < 0) { error("job_update_cpu_cnt: problem getting offset of job %u", job_ptr->job_id); job_ptr->cpu_cnt = 0; return SLURM_ERROR; } - /* info("cpu for %d is %d out of %d", */ - /* node_inx, job_ptr->job_resrcs->cpu_array_value[offset], */ - /* job_ptr->cpu_cnt); */ - cnt = job_ptr->job_resrcs->cpu_array_value[offset]; + cnt = job_ptr->job_resrcs->cpus[offset]; if (cnt > job_ptr->cpu_cnt) { error("job_update_cpu_cnt: cpu_cnt underflow on job_id %u", job_ptr->job_id); diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 824c4240165..624ffc7f7f7 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -1743,7 +1743,7 @@ static void _slurm_rpc_job_alloc_info(slurm_msg_t * msg) on existing job */ static void _slurm_rpc_job_alloc_info_lite(slurm_msg_t * msg) { - int error_code = SLURM_SUCCESS; + int error_code = SLURM_SUCCESS, i, j; slurm_msg_t response_msg; struct job_record *job_ptr; DEF_TIMERS; @@ -1777,21 +1777,52 @@ static void _slurm_rpc_job_alloc_info_lite(slurm_msg_t * msg) debug("_slurm_rpc_job_alloc_info_lite JobId=%u NodeList=%s %s", job_info_msg->job_id, job_ptr->nodes, TIME_STR); - /* send job_ID and node_name_ptr */ - job_info_resp_msg.num_cpu_groups = job_ptr->job_resrcs-> - cpu_array_cnt; - job_info_resp_msg.cpu_count_reps = - xmalloc(sizeof(uint32_t) * - job_ptr->job_resrcs->cpu_array_cnt); - 
memcpy(job_info_resp_msg.cpu_count_reps, - job_ptr->job_resrcs->cpu_array_reps, - (sizeof(uint32_t) * job_ptr->job_resrcs->cpu_array_cnt)); - job_info_resp_msg.cpus_per_node = - xmalloc(sizeof(uint16_t) * - job_ptr->job_resrcs->cpu_array_cnt); - memcpy(job_info_resp_msg.cpus_per_node, - job_ptr->job_resrcs->cpu_array_value, - (sizeof(uint16_t) * job_ptr->job_resrcs->cpu_array_cnt)); + /* send job_ID and node_name_ptr */ + if (bit_equal(job_ptr->node_bitmap, + job_ptr->job_resrcs->node_bitmap)) { + job_info_resp_msg.num_cpu_groups = job_ptr->job_resrcs-> + cpu_array_cnt; + job_info_resp_msg.cpu_count_reps = + xmalloc(sizeof(uint32_t) * + job_ptr->job_resrcs-> + cpu_array_cnt); + memcpy(job_info_resp_msg.cpu_count_reps, + job_ptr->job_resrcs->cpu_array_reps, + (sizeof(uint32_t) * + job_ptr->job_resrcs->cpu_array_cnt)); + job_info_resp_msg.cpus_per_node = + xmalloc(sizeof(uint16_t) * + job_ptr->job_resrcs-> + cpu_array_cnt); + memcpy(job_info_resp_msg.cpus_per_node, + job_ptr->job_resrcs->cpu_array_value, + (sizeof(uint16_t) * + job_ptr->job_resrcs->cpu_array_cnt)); + } else { + /* Job has changed size, rebuild CPU count info */ + job_info_resp_msg.num_cpu_groups = job_ptr->node_cnt; + job_info_resp_msg.cpu_count_reps = + xmalloc(sizeof(uint32_t) * + job_ptr->node_cnt); + job_info_resp_msg.cpus_per_node = + xmalloc(sizeof(uint32_t) * + job_ptr->node_cnt); + for (i=0, j=-1; i<job_ptr->job_resrcs->nhosts; i++) { + if (job_ptr->job_resrcs->cpus[i] == 0) + continue; + if ((j == -1) || + (job_info_resp_msg.cpus_per_node[j] != + job_ptr->job_resrcs->cpus[i])) { + j++; + job_info_resp_msg.cpus_per_node[j] = + job_ptr->job_resrcs->cpus[i]; + job_info_resp_msg.cpu_count_reps[j] = 1; + } else { + job_info_resp_msg.cpu_count_reps[j]++; + } + } + job_info_resp_msg.num_cpu_groups = j + 1; + } job_info_resp_msg.error_code = error_code; job_info_resp_msg.job_id = job_info_msg->job_id; job_info_resp_msg.node_cnt = job_ptr->node_cnt; -- GitLab