From fbc606288e4b0a4d8e11b74036e2ce8d11c352e9 Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Mon, 13 Jun 2016 15:37:23 -0700 Subject: [PATCH] Recalculate a job's memory size Recalculate a job's memory allocation after node reboot if job requests all of a node's memory and FastSchedule=0 is configured. Intel KNL memory size can change on reboot with various MCDRAM modes. bug 2822 --- NEWS | 3 ++ doc/html/selectplugins.shtml | 16 ++++++++- slurm/slurm.h.in | 2 ++ src/common/node_select.c | 16 +++++++++ src/common/node_select.h | 10 ++++++ src/plugins/select/alps/select_alps.c | 5 +++ src/plugins/select/bluegene/select_bluegene.c | 5 +++ src/plugins/select/cons_res/job_test.c | 4 +++ src/plugins/select/cons_res/select_cons_res.c | 36 +++++++++++++++++++ src/plugins/select/cray/select_cray.c | 7 ++++ src/plugins/select/linear/select_linear.c | 5 +++ src/plugins/select/other/other_select.c | 15 +++++++- src/plugins/select/other/other_select.h | 6 ++++ src/plugins/select/serial/select_serial.c | 5 +++ src/slurmctld/job_mgr.c | 36 +++++++++++++++++++ src/slurmctld/job_scheduler.c | 9 +++-- src/slurmctld/power_save.c | 1 + src/slurmctld/slurmctld.h | 6 ++++ 18 files changed, 183 insertions(+), 4 deletions(-) diff --git a/NEWS b/NEWS index 37cd101b6a2..4f31eac5d83 100644 --- a/NEWS +++ b/NEWS @@ -32,6 +32,9 @@ documents those changes that are of interest to users and administrators. as needed. -- Correct task affinity support for FreeBSD. -- Fix for task affinity on KNL in SNC2/Flat mode. + -- Recalculate a job's memory allocation after node reboot if job requests all + of a node's memory and FastSchedule=0 is configured. Intel KNL memory size + can change on reboot with various MCDRAM modes. 
* Changes in Slurm 16.05.0 ========================== diff --git a/doc/html/selectplugins.shtml b/doc/html/selectplugins.shtml index ab5b697c5ab..5381727b14a 100644 --- a/doc/html/selectplugins.shtml +++ b/doc/html/selectplugins.shtml @@ -564,6 +564,20 @@ be sent to the job.</p> <p style="margin-left:.2in"><b>Returns</b>: SLURM_SUCCESS if successful. On failure, the plugin should return a Slurm error code.</p> + +<p class="commandline">int select_p_job_mem_confirm (struct job_record *job_ptr);</p> +<p style="margin-left:.2in"><b>Description</b>: Confirm that a job's memory +allocation is still valid after a node is restarted. This is an issue if the +job is allocated all of the memory on a node and that node is restarted with a +different memory size than at the time it is allocated to the job. This would +mostly be an issue on an Intel KNL node where the memory size would vary with +the MCDRAM cache mode.</p> +<p style="margin-left:.2in"><b>Arguments</b>:<br> +<span class="commandline"> job_ptr</span> (input) pointer +to the job to be validated.</p> +<p style="margin-left:.2in"><b>Returns</b>: SLURM_SUCCESS if successful. On +failure, the plugin should return a Slurm error code.</p> + <p class="commandline">int select_p_job_suspend (struct job_record *job_ptr, bool indf_susp);</p> <p style="margin-left:.2in"><b>Description</b>: Suspend the specified job. @@ -779,6 +793,6 @@ cnodelist (e.g. 
on a BGQ it would look something like '[00000x11331]').</br> <p class="footer"><a href="#top">top</a></p> -<p style="text-align:center;">Last modified 10 December 2015</p> +<p style="text-align:center;">Last modified 11 June 2016</p> <!--#include virtual="footer.txt"--> diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index f5ef5ee1c3a..19a7f71f4e9 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -996,6 +996,8 @@ enum ctx_keys { #define BACKFILL_TEST 0x00000008 /* Backfill test in progress */ #define GRES_ENFORCE_BIND 0x00000010 /* Enforce CPU/GRES binding */ #define TEST_NOW_ONLY 0x00000020 /* Test for immediately start only */ +#define NODE_MEM_CALC 0x00000040 /* Per-node memory limit calculated */ +#define NODE_REBOOT 0x00000080 /* Waiting for node reboot */ /*****************************************************************************\ * SLURM HOSTLIST FUNCTIONS diff --git a/src/common/node_select.c b/src/common/node_select.c index 84906bdeba3..a93a258cb21 100644 --- a/src/common/node_select.c +++ b/src/common/node_select.c @@ -80,6 +80,7 @@ const char *node_select_syms[] = { "select_p_job_expand", "select_p_job_resized", "select_p_job_signal", + "select_p_job_mem_confirm", "select_p_job_fini", "select_p_job_suspend", "select_p_job_resume", @@ -679,6 +680,21 @@ extern int select_g_job_signal(struct job_record *job_ptr, int signal) (job_ptr, signal); } +/* + * Confirm that a job's memory allocation is still valid after a node is + * restarted. This is an issue if the job is allocated all of the memory on a + * node and that node is restarted with a different memory size than at the time + * it is allocated to the job. This would mostly be an issue on an Intel KNL + * node where the memory size would vary with the MCDRAM cache mode. 
+ */ +extern int select_g_job_mem_confirm(struct job_record *job_ptr) +{ + if (slurm_select_init(0) < 0) + return SLURM_ERROR; + + return (*(ops[select_context_default].job_mem_confirm)) (job_ptr); +} + /* * Note termination of job is starting. Executed from slurmctld. * IN job_ptr - pointer to job being terminated diff --git a/src/common/node_select.h b/src/common/node_select.h index d682071018d..af7be2a6381 100644 --- a/src/common/node_select.h +++ b/src/common/node_select.h @@ -157,6 +157,7 @@ typedef struct slurm_select_ops { struct node_record *node_ptr); int (*job_signal) (struct job_record *job_ptr, int signal); + int (*job_mem_confirm) (struct job_record *job_ptr); int (*job_fini) (struct job_record *job_ptr); int (*job_suspend) (struct job_record *job_ptr, bool indf_susp); @@ -606,6 +607,15 @@ extern int select_g_job_fini(struct job_record *job_ptr); */ extern int select_g_job_signal(struct job_record *job_ptr, int signal); +/* + * Confirm that a job's memory allocation is still valid after a node is + * restarted. This is an issue if the job is allocated all of the memory on a + * node and that node is restarted with a different memory size than at the time + * it is allocated to the job. This would mostly be an issue on an Intel KNL + * node where the memory size would vary with the MCDRAM cache mode. + */ +extern int select_g_job_mem_confirm(struct job_record *job_ptr); + /* * Suspend a job. Executed from slurmctld. 
* IN job_ptr - pointer to job being suspended diff --git a/src/plugins/select/alps/select_alps.c b/src/plugins/select/alps/select_alps.c index 68c5330164e..43cdd2e8d04 100644 --- a/src/plugins/select/alps/select_alps.c +++ b/src/plugins/select/alps/select_alps.c @@ -437,6 +437,11 @@ extern int select_p_job_signal(struct job_record *job_ptr, int signal) return other_job_signal(job_ptr, signal); } +extern int select_p_job_mem_confirm(struct job_record *job_ptr) +{ + return SLURM_SUCCESS; +} + extern int select_p_job_fini(struct job_record *job_ptr) { if (job_ptr == NULL) diff --git a/src/plugins/select/bluegene/select_bluegene.c b/src/plugins/select/bluegene/select_bluegene.c index f9eb2bf0fc1..7560831a283 100644 --- a/src/plugins/select/bluegene/select_bluegene.c +++ b/src/plugins/select/bluegene/select_bluegene.c @@ -1752,6 +1752,11 @@ extern int select_p_job_signal(struct job_record *job_ptr, int signal) return SLURM_SUCCESS; } +extern int select_p_job_mem_confirm(struct job_record *job_ptr) +{ + return SLURM_SUCCESS; +} + extern int select_p_job_fini(struct job_record *job_ptr) { int rc = SLURM_ERROR; diff --git a/src/plugins/select/cons_res/job_test.c b/src/plugins/select/cons_res/job_test.c index 9a8718a2779..2ba7a1c048a 100644 --- a/src/plugins/select/cons_res/job_test.c +++ b/src/plugins/select/cons_res/job_test.c @@ -3081,6 +3081,10 @@ extern int cr_job_test(struct job_record *job_ptr, bitstr_t *node_bitmap, job_ptr->job_id, bit_set_count(node_bitmap)); } + if ((details_ptr->pn_min_memory == 0) && + (select_fast_schedule == 0)) + job_ptr->bit_flags |= NODE_MEM_CALC; /* To be calculated */ + orig_map = bit_copy(node_bitmap); avail_cores = _make_core_bitmap(node_bitmap, job_ptr->details->core_spec); diff --git a/src/plugins/select/cons_res/select_cons_res.c b/src/plugins/select/cons_res/select_cons_res.c index 3ae2e0652da..589159cd3af 100644 --- a/src/plugins/select/cons_res/select_cons_res.c +++ b/src/plugins/select/cons_res/select_cons_res.c @@ -2334,6 
+2334,42 @@ extern int select_p_job_signal(struct job_record *job_ptr, int signal) return SLURM_SUCCESS; } +extern int select_p_job_mem_confirm(struct job_record *job_ptr) +{ + int i_first, i_last, i, offset; + uint32_t avail_mem, lowest_mem = 0; + + xassert(job_ptr); + + if (((job_ptr->bit_flags & NODE_MEM_CALC) == 0) || + (select_fast_schedule != 0)) + return SLURM_SUCCESS; + if ((job_ptr->details == NULL) || + (job_ptr->job_resrcs == NULL) || + (job_ptr->job_resrcs->node_bitmap == NULL) || + (job_ptr->job_resrcs->memory_allocated == NULL)) + return SLURM_ERROR; + i_first = bit_ffs(job_ptr->job_resrcs->node_bitmap); + if (i_first >= 0) + i_last = bit_fls(job_ptr->job_resrcs->node_bitmap); + else + i_last = i_first - 1; + for (i = i_first, offset = 0; i <= i_last; i++) { + if (!bit_test(job_ptr->job_resrcs->node_bitmap, i)) + continue; + avail_mem = select_node_record[i].real_memory - + select_node_record[i].mem_spec_limit; + job_ptr->job_resrcs->memory_allocated[offset] = avail_mem; + select_node_usage[i].alloc_memory = avail_mem; + if ((offset == 0) || (lowest_mem > avail_mem)) + lowest_mem = avail_mem; + offset++; + } + job_ptr->details->pn_min_memory = lowest_mem; + + return SLURM_SUCCESS; +} + extern int select_p_job_fini(struct job_record *job_ptr) { xassert(job_ptr); diff --git a/src/plugins/select/cray/select_cray.c b/src/plugins/select/cray/select_cray.c index 03449e86649..d5d7fac05d1 100644 --- a/src/plugins/select/cray/select_cray.c +++ b/src/plugins/select/cray/select_cray.c @@ -2057,6 +2057,13 @@ extern int select_p_job_signal(struct job_record *job_ptr, int signal) return other_job_signal(job_ptr, signal); } +extern int select_p_job_mem_confirm(struct job_record *job_ptr) +{ + xassert(job_ptr); + + return other_job_mem_confirm(job_ptr); +} + extern int select_p_job_fini(struct job_record *job_ptr) { select_jobinfo_t *jobinfo = job_ptr->select_jobinfo->data; diff --git a/src/plugins/select/linear/select_linear.c 
b/src/plugins/select/linear/select_linear.c index c02e36fa25b..cc16cdc824d 100644 --- a/src/plugins/select/linear/select_linear.c +++ b/src/plugins/select/linear/select_linear.c @@ -3743,6 +3743,11 @@ extern int select_p_job_signal(struct job_record *job_ptr, int signal) return SLURM_SUCCESS; } +extern int select_p_job_mem_confirm(struct job_record *job_ptr) +{ + return SLURM_SUCCESS; +} + /* * Note termination of job is starting. Executed from slurmctld. * IN job_ptr - pointer to job being terminated diff --git a/src/plugins/select/other/other_select.c b/src/plugins/select/other/other_select.c index 68babf73a45..0fb3286f0fb 100644 --- a/src/plugins/select/other/other_select.c +++ b/src/plugins/select/other/other_select.c @@ -77,6 +77,7 @@ const char *node_select_syms[] = { "select_p_job_expand", "select_p_job_resized", "select_p_job_signal", + "select_p_job_mem_confirm", "select_p_job_fini", "select_p_job_suspend", "select_p_job_resume", @@ -356,11 +357,23 @@ extern int other_job_resized(struct job_record *job_ptr, extern int other_job_signal(struct job_record *job_ptr, int signal) { if (other_select_init() < 0) - return -1; + return SLURM_ERROR; return (*(ops.job_signal))(job_ptr, signal); } +/* + * Pass job memory allocation confirmation request to other plugin. + * IN job_ptr - job whose memory allocation is to be confirmed + */ +extern int other_job_mem_confirm(struct job_record *job_ptr) +{ + if (other_select_init() < 0) + return SLURM_ERROR; + + return (*(ops.job_mem_confirm))(job_ptr); +} + /* * Note termination of job is starting. Executed from slurmctld. 
* IN job_ptr - pointer to job being terminated diff --git a/src/plugins/select/other/other_select.h b/src/plugins/select/other/other_select.h index d53eb226bec..51b080fef78 100644 --- a/src/plugins/select/other/other_select.h +++ b/src/plugins/select/other/other_select.h @@ -212,6 +212,12 @@ extern int other_job_resized(struct job_record *job_ptr, */ extern int other_job_signal(struct job_record *job_ptr, int signal); +/* + * Pass job memory allocation confirmation request to other plugin. + * IN job_ptr - job whose memory allocation is to be confirmed + */ +extern int other_job_mem_confirm(struct job_record *job_ptr); + /* * Note termination of job is starting. Executed from slurmctld. * IN job_ptr - pointer to job being terminated diff --git a/src/plugins/select/serial/select_serial.c b/src/plugins/select/serial/select_serial.c index 9e98b28c66f..6a044d24dd3 100644 --- a/src/plugins/select/serial/select_serial.c +++ b/src/plugins/select/serial/select_serial.c @@ -1711,6 +1711,11 @@ extern int select_p_job_signal(struct job_record *job_ptr, int signal) return SLURM_SUCCESS; } +extern int select_p_job_mem_confirm(struct job_record *job_ptr) +{ + return SLURM_SUCCESS; +} + extern int select_p_job_fini(struct job_record *job_ptr) { xassert(job_ptr); diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 2523b0d047d..6c40c99316e 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -7355,6 +7355,39 @@ static bool _test_nodes_ready(struct job_record *job_ptr) } #endif +/* + * Modify a job's memory limit if allocated all memory on a node and the node + * reboots, possibly with a different memory size (e.g. 
KNL MCDRAM mode changed) + */ +extern void job_validate_mem(struct job_record *job_ptr) +{ + uint64_t tres_count; + + if ((job_ptr->bit_flags & NODE_MEM_CALC) && + (slurmctld_conf.fast_schedule == 0)) { + select_g_job_mem_confirm(job_ptr); + tres_count = (uint64_t)job_ptr->details->pn_min_memory; + if (tres_count & MEM_PER_CPU) { + tres_count &= (~MEM_PER_CPU); + tres_count *= job_ptr->tres_alloc_cnt[TRES_ARRAY_CPU]; + } else { + tres_count *= job_ptr->tres_alloc_cnt[TRES_ARRAY_NODE]; + } + job_ptr->tres_alloc_cnt[TRES_ARRAY_MEM] = tres_count; + /* Release any previously built TRES strings so they are not leaked */ + xfree(job_ptr->tres_alloc_str); + job_ptr->tres_alloc_str = + assoc_mgr_make_tres_str_from_array( + job_ptr->tres_alloc_cnt, TRES_STR_FLAG_SIMPLE, true); + + xfree(job_ptr->tres_fmt_alloc_str); + job_ptr->tres_fmt_alloc_str = + assoc_mgr_make_tres_str_from_array( + job_ptr->tres_alloc_cnt, TRES_STR_CONVERT_UNITS, true); + jobacct_storage_job_start_direct(acct_db_conn, job_ptr); + } +} + /* * job_time_limit - terminate jobs which have exceeded their time limit * global: job_list - pointer global job list @@ -7399,6 +7432,12 @@ void job_time_limit(void) info("%s: Configuration for job %u is complete", __func__, job_ptr->job_id); job_config_fini(job_ptr); + if (job_ptr->bit_flags & NODE_REBOOT) { + job_ptr->bit_flags &= (~NODE_REBOOT); + job_validate_mem(job_ptr); + if (job_ptr->batch_flag) + launch_job(job_ptr); + } } #endif /* This needs to be near the top of the loop, checks every diff --git a/src/slurmctld/job_scheduler.c b/src/slurmctld/job_scheduler.c index d8d3e7b972e..94cdeb72d45 100644 --- a/src/slurmctld/job_scheduler.c +++ b/src/slurmctld/job_scheduler.c @@ -835,7 +835,8 @@ next_part: part_ptr = (struct part_record *) info("sched: Allocate JobId=%u Partition=%s NodeList=%s #CPUs=%u", job_ptr->job_id, job_ptr->part_ptr->name, job_ptr->nodes, job_ptr->total_cpus); - if (job_ptr->details->prolog_running == 0) { + if 
((job_ptr->details->prolog_running == 0) && + ((job_ptr->bit_flags & NODE_REBOOT) == 0)) { launch_msg = build_launch_job_msg(job_ptr, msg->protocol_version); } @@ -1821,8 
+1822,10 @@ next_task: #endif if (job_ptr->batch_flag == 0) srun_allocate(job_ptr->job_id); - else if (job_ptr->details->prolog_running == 0) + else if ((job_ptr->details->prolog_running == 0) && + ((job_ptr->bit_flags & NODE_REBOOT) == 0)) { launch_job(job_ptr); + } rebuild_job_part_list(job_ptr); job_cnt++; if (is_job_array_head && @@ -3618,6 +3621,7 @@ static void *_wait_boot(void *arg) lock_slurmctld(job_write_lock); prolog_running_decr(job_ptr); + job_validate_mem(job_ptr); unlock_slurmctld(job_write_lock); return NULL; @@ -3800,6 +3804,7 @@ extern void prolog_running_decr(struct job_record *job_ptr) job_ptr->job_state &= ~JOB_CONFIGURING; if (job_ptr->batch_flag && + ((job_ptr->bit_flags & NODE_REBOOT) == 0) && (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) { launch_job(job_ptr); } diff --git a/src/slurmctld/power_save.c b/src/slurmctld/power_save.c index ac2fc312e9e..1ac60064fa3 100644 --- a/src/slurmctld/power_save.c +++ b/src/slurmctld/power_save.c @@ -285,6 +285,7 @@ extern int power_job_reboot(struct job_record *job_ptr) if (nodes) { job_ptr->job_state |= JOB_CONFIGURING; job_ptr->wait_all_nodes = 1; + job_ptr->bit_flags |= NODE_REBOOT; if (job_ptr->details && job_ptr->details->features && node_features_g_user_update(job_ptr->user_id)) { features = node_features_g_job_xlate( diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index dba1df43fcb..ca7d93bc329 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -1555,6 +1555,12 @@ extern void job_set_alloc_tres( */ extern int job_update_tres_cnt(struct job_record *job_ptr, int node_inx); +/* + * Modify a job's memory limit if allocated all memory on a node and that node + * reboots, possibly with a different memory size (e.g. KNL MCDRAM mode changed) + */ +extern void job_validate_mem(struct job_record *job_ptr); + /* * check_job_step_time_limit - terminate jobsteps which have exceeded * their time limit -- GitLab