diff --git a/src/plugins/select/bluegene/plugin/Makefile.am b/src/plugins/select/bluegene/plugin/Makefile.am index 817ccd600c50122c42b8dd7a83dd9a1bae840cd2..fde0b65dc943164a9f92b0e59d239eee2fed685d 100644 --- a/src/plugins/select/bluegene/plugin/Makefile.am +++ b/src/plugins/select/bluegene/plugin/Makefile.am @@ -11,6 +11,7 @@ pkglib_LTLIBRARIES = select_bluegene.la libsched_if64.la # Blue Gene node selection plugin. select_bluegene_la_SOURCES = select_bluegene.c \ + bg_boot_time.h \ bg_job_place.c bg_job_place.h \ bg_job_run.c bg_job_run.h \ bg_block_info.c bg_block_info.h \ diff --git a/src/plugins/select/bluegene/plugin/Makefile.in b/src/plugins/select/bluegene/plugin/Makefile.in index 6f705c3df54b92a9e9918151f7f0448f4c79858d..e21b8b361a2eff20a88ace3cbdb67413756b81eb 100644 --- a/src/plugins/select/bluegene/plugin/Makefile.in +++ b/src/plugins/select/bluegene/plugin/Makefile.in @@ -310,6 +310,7 @@ pkglib_LTLIBRARIES = select_bluegene.la libsched_if64.la # Blue Gene node selection plugin. select_bluegene_la_SOURCES = select_bluegene.c \ + bg_boot_time.h \ bg_job_place.c bg_job_place.h \ bg_job_run.c bg_job_run.h \ bg_block_info.c bg_block_info.h \ diff --git a/src/plugins/select/bluegene/plugin/bg_boot_time.h b/src/plugins/select/bluegene/plugin/bg_boot_time.h new file mode 100644 index 0000000000000000000000000000000000000000..dd53c74c36c8502ff1c6ee5a66fcd631691c4fe4 --- /dev/null +++ b/src/plugins/select/bluegene/plugin/bg_boot_time.h @@ -0,0 +1,54 @@ +/*****************************************************************************\ + * bg_boot_time.h - Block boot time parameters for use by slurm_prolog + * and slurmctld + ***************************************************************************** + * Copyright (C) 2008 Lawrence Livermore National Security. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Morris Jette <jette1@llnl.gov> + * + * This file is part of SLURM, a resource management program. 
+ * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +\*****************************************************************************/ + +#ifndef _BG_BOOT_TIME_H_ +#define _BG_BOOT_TIME_H_ + +/* + * Total time to boot a bglblock should not exceed + * BG_FREE_PREVIOUS_BLOCK + BG_MIN_BLOCK_BOOT + + * (BG_INCR_BLOCK_BOOT * base partition count). + * For example, if BG_FREE_PREVIOUS_BLOCK=300, BG_MIN_BLOCK_BOOT=200, + * BG_INCR_BLOCK_BOOT=20 and there are 4 base partitions being booted, + * wait up to 580 seconds (300 + 200 + (20 * 4)). 
+ */ + +#define BG_FREE_PREVIOUS_BLOCK 300 /* time in seconds */ +#define BG_MIN_BLOCK_BOOT 300 /* time in seconds */ +#define BG_INCR_BLOCK_BOOT 20 /* time in seconds per BP */ + +#endif /* _BG_BOOT_TIME_H_ */ diff --git a/src/plugins/select/bluegene/plugin/slurm_prolog.c b/src/plugins/select/bluegene/plugin/slurm_prolog.c index dbe8a93328074e7aa174110588617de0587e4e20..af652b7879cab947ecc2e5ab908d64b8c9d1e4bd 100644 --- a/src/plugins/select/bluegene/plugin/slurm_prolog.c +++ b/src/plugins/select/bluegene/plugin/slurm_prolog.c @@ -54,23 +54,12 @@ #include "src/common/hostlist.h" #include "src/common/node_select.h" #include "src/api/node_select_info.h" +#include "src/plugins/select/bluegene/plugin/bg_boot_time.h" #define _DEBUG 0 - -/* - * Check the bgblock's status every POLL_SLEEP seconds. - * Retry for a period of - * MIN_FREE_PERVIOUS_BLOCK_DELAY + MIN_DELAY + (INCR_DELAY * base partition count) - * For example if MIN_FREE_PERVIOUS_BLOCK_DELAY=300 and MIN_DELAY=600 and - * INCR_DELAY=20 and job_size=4 base partitions then wait up to 980 seconds - * 300 + 600 + (20 * 4) - */ #define POLL_SLEEP 3 /* retry interval in seconds */ -#define MIN_FREE_PERVIOUS_BLOCK_DELAY 300 /* time in seconds */ -#define MIN_DELAY 600 /* time in seconds */ -#define INCR_DELAY 20 /* time in seconds per BP */ -int max_delay = MIN_DELAY + MIN_FREE_PERVIOUS_BLOCK_DELAY; +int max_delay = BG_FREE_PREVIOUS_BLOCK + BG_MIN_BLOCK_BOOT; int cur_delay = 0; enum rm_partition_state {RM_PARTITION_FREE, @@ -113,8 +102,8 @@ static int _wait_part_ready(uint32_t job_id) { int is_ready = 0, i, rc; - max_delay = MIN_FREE_PERVIOUS_BLOCK_DELAY + MIN_DELAY + - (INCR_DELAY * _get_job_size(job_id)); + max_delay = BG_FREE_PREVIOUS_BLOCK + BG_MIN_BLOCK_BOOT + + (BG_INCR_BLOCK_BOOT * _get_job_size(job_id)); #if _DEBUG printf("Waiting for job %u to become ready.", job_id); diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 
834ecc65ad60589c253498f7937432dbf8a3c39c..2b3060b7a3a3acc89ccd85c0858e685eaa5d4e29 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -70,6 +70,7 @@ #include "src/slurmctld/sched_plugin.h" #include "src/slurmctld/slurmctld.h" #include "src/slurmctld/trigger_mgr.h" +#include "src/plugins/select/bluegene/plugin/bg_boot_time.h" #define _DEBUG 0 #define MAX_RETRIES 10 @@ -1720,7 +1721,9 @@ extern int validate_nodes_via_front_end( * completes which waits for bgblock boot to complete. * This can take several minutes on BlueGene. */ if (difftime(now, job_ptr->time_last_active) <= - (1400 + 5 * job_ptr->node_cnt)) + + (BG_FREE_PREVIOUS_BLOCK + BG_MIN_BLOCK_BOOT + + BG_INCR_BLOCK_BOOT * job_ptr->node_cnt)) continue; #else if (difftime(now, job_ptr->time_last_active) <= 5)