diff --git a/NEWS b/NEWS index 1ddc651fa0a737302bd80a0cd1ed15212871d20d..2558a95e1c52b0bc33e9261f290f1a0a839e3e1e 100644 --- a/NEWS +++ b/NEWS @@ -36,6 +36,9 @@ documents those changes that are of interest to users and administrators. -- CRAY - Collect energy using a uint64_t instead of uint32_t. -- Fix incorrect if statements when determining if the user has a default account or wckey. + -- Prevent job stuck in configuring state if slurmctld daemon restarted while + PrologSlurmctld is running. Also re-issue burst_buffer/pre-load operation + as needed. * Changes in Slurm 16.05.0 ========================== @@ -288,6 +291,13 @@ documents those changes that are of interest to users and administrators. -- Increase buffer size to read /proc/*/stat files. -- Fix hostrange_hn_within to allow hostname lookup within a set of nodes with different length prefixes. + -- MYSQL - Handle ER_HOST_IS_BLOCKED better by failing when it occurs instead + of continuously printing the message over and over as the problem will + most likely not resolve itself. + -- Add --disable-bluegene to configure. This will make it so Slurm + can work on a BGAS node. + -- Prevent job stuck in configuring state if slurmctld daemon restarted while + PrologSlurmctld is running. * Changes in Slurm 15.08.12 =========================== diff --git a/auxdir/x_ac_bluegene.m4 b/auxdir/x_ac_bluegene.m4 index 34837a403b98919f3d08c107eb8229e498122904..207c07d4af864cf484c8cd66b1e98de49bc3fea9 100644 --- a/auxdir/x_ac_bluegene.m4 +++ b/auxdir/x_ac_bluegene.m4 @@ -10,12 +10,28 @@ # If found define HAVE_BG and HAVE_FRONT_END and others ##***************************************************************************** - -AC_DEFUN([X_AC_BGL], +AC_DEFUN([X_AC_BG], [ ac_real_bluegene_loaded=no ac_bluegene_loaded=no + AC_MSG_CHECKING([whether BG is explicitly disabled]) + AC_ARG_ENABLE( + [bluegene], + AS_HELP_STRING(--disable-bluegene,Disable Bluegene support for BGAS nodes (or wherever you run a Slurm on a bluegene system not wanting it to act like a Bluegene)), + [ case "$enableval" in + yes) ac_bluegene_loaded=no ;; + no) ac_bluegene_loaded=yes ;; + *) AC_MSG_RESULT([doh!]) + AC_MSG_ERROR([bad value "$enableval" for --disable-bluegene]) ;; + esac ] + ) + + AC_MSG_RESULT([${ac_bluegene_loaded=yes}]) +]) + +AC_DEFUN([X_AC_BGL], +[ AC_ARG_WITH(db2-dir, AS_HELP_STRING(--with-db2-dir=PATH,Specify path to parent directory of DB2 library), [ trydb2dir=$withval ]) # test for bluegene emulation mode @@ -34,7 +50,10 @@ AC_DEFUN([X_AC_BGL], *) AC_MSG_ERROR([bad value "$enableval" for --enable-bgl-emulation]) ;; esac ]) - if test "x$bluegene_emulation" = "xyes" -o "x$bgl_emulation" = "xyes"; then + # Skip if already set + if test "x$ac_bluegene_loaded" = "xyes" ; then + bg_default_dirs="" + elif test "x$bluegene_emulation" = "xyes" -o "x$bgl_emulation" = "xyes"; then AC_DEFINE(HAVE_3D, 1, [Define to 1 if 3-dimensional architecture]) AC_DEFINE(SYSTEM_DIMENSIONS, 3, [3-dimensional architecture]) AC_DEFINE(HAVE_BG, 1, [Define to 1 if emulating or running on Blue Gene system]) diff --git a/configure b/configure index ea6656c5a74a4cbea481eded9aa7f26d277f61dd..792faefb0359fd230e8552389e731663cb578ab5 100755 --- a/configure +++ b/configure @@ -977,6 +977,7 @@ enable_option_checking enable_silent_rules enable_maintainer_mode with_rpath +enable_bluegene with_db2_dir enable_bluegene_emulation enable_bgl_emulation @@ -1695,6 +1696,9 @@ Optional Features: --enable-maintainer-mode enable make rules and dependencies not useful (and sometimes confusing) to the casual installer + --disable-bluegene Disable Bluegene support for BGAS nodes (or wherever + you run a Slurm on a bluegene system not wanting it + to act like a Bluegene) --enable-bluegene-emulation deprecated use --enable-bgl-emulation --enable-bgl-emulation Run SLURM in BGL mode on a non-bluegene system @@ -3683,6 +3687,30 @@ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_with_rpath" >&5 $as_echo "$ac_with_rpath" >&6; } + + + ac_real_bluegene_loaded=no + ac_bluegene_loaded=no + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether BG is explicitly disabled" >&5 +$as_echo_n "checking whether BG is explicitly disabled... " >&6; } + # Check whether --enable-bluegene was given. +if test "${enable_bluegene+set}" = set; then : + enableval=$enable_bluegene; case "$enableval" in + yes) ac_bluegene_loaded=no ;; + no) ac_bluegene_loaded=yes ;; + *) { $as_echo "$as_me:${as_lineno-$LINENO}: result: doh!" >&5 +$as_echo "doh!" >&6; } + as_fn_error $? "bad value \"$enableval\" for --disable-bluegene" "$LINENO" 5 ;; + esac + +fi + + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${ac_bluegene_loaded=yes}" >&5 +$as_echo "${ac_bluegene_loaded=yes}" >&6; } + + DEPDIR="${am__leading_dot}deps" ac_config_commands="$ac_config_commands depfiles" @@ -4724,9 +4752,6 @@ fi - ac_real_bluegene_loaded=no - ac_bluegene_loaded=no - # Check whether --with-db2-dir was given. if test "${with_db2_dir+set}" = set; then : @@ -4756,7 +4781,10 @@ if test "${enable_bgl_emulation+set}" = set; then : fi - if test "x$bluegene_emulation" = "xyes" -o "x$bgl_emulation" = "xyes"; then + # Skip if already set + if test "x$ac_bluegene_loaded" = "xyes" ; then + bg_default_dirs="" + elif test "x$bluegene_emulation" = "xyes" -o "x$bgl_emulation" = "xyes"; then $as_echo "#define HAVE_3D 1" >>confdefs.h @@ -19907,9 +19935,10 @@ char *malloc (); int main () { -return ! malloc (0); - ; - return 0; +char *a = malloc(0); +int b = !a; +free(a); +return b; } _ACEOF if ac_fn_c_try_run "$LINENO"; then : diff --git a/configure.ac b/configure.ac index e0ac4ed279033014c92e3236bf4a5f21d21c6229..43deb44c073d2ee0303c5a6054df1802b7308839 100644 --- a/configure.ac +++ b/configure.ac @@ -30,6 +30,9 @@ AC_CONFIG_HEADERS([slurm/slurm.h]) dnl This needs to be close to the front to set CFLAGS=-m64 X_AC_RPATH + +X_AC_BG + X_AC_BGL dnl we need to know if this is a bgl in the Makefile.am to do diff --git a/contribs/torque/qstat.pl b/contribs/torque/qstat.pl index a040d6db9e614209738aa288b2b0e9049b8b2701..b108f871814cee01cea3d35ff3a6f47c94cfbe6a 100755 --- a/contribs/torque/qstat.pl +++ b/contribs/torque/qstat.pl @@ -227,8 +227,13 @@ if(defined($queueList)) { print_job_brief($job, $line); $line++; } + $rc = 0; + } + + # return 0 even if no records printed when using -u flag + if (@userIds) { + $rc = 0; } - $rc = 0; } # Exit with status code diff --git a/src/database/mysql_common.c b/src/database/mysql_common.c index 54ed9e95ba561a566be78d91efac8f062c34c410..89394b956002dadbd18eee4857fb9c7dd4e569af 100644 --- a/src/database/mysql_common.c +++ b/src/database/mysql_common.c @@ -153,17 +153,21 @@ static int _mysql_query_internal(MYSQL *db_conn, char *query) } error("mysql_query failed: %d %s\n%s", errno, err_str, query); if (errno == ER_LOCK_WAIT_TIMEOUT) { + /* FIXME: If we get ER_LOCK_WAIT_TIMEOUT here we need + * to restart the connections, but it appears restarting + * the calling program is the only way to handle this. + * If anyone in the future figures out a way to handle + * this, super. Until then we will need to restart the + * calling program if you ever get this error. + */ fatal("mysql gave ER_LOCK_WAIT_TIMEOUT as an error. " "The only way to fix this is restart the " "calling program"); + } else if (errno == ER_HOST_IS_BLOCKED) { + fatal("MySQL gave ER_HOST_IS_BLOCKED as an error. " + "You will need to call 'mysqladmin flush-hosts' " + "to regain connectivity."); } - /* FIXME: If we get ER_LOCK_WAIT_TIMEOUT here we need - * to restart the connections, but it appears restarting - * the calling program is the only way to handle this. - * If anyone in the future figures out a way to handle - * this, super. Until then we will need to restart the - * calling program if you ever get this error. - */ rc = SLURM_ERROR; } end_it: diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index d17d1d2cec96b6fb4f190b21f6a500a0240854ab..7689349ae7563175a645df81bc17534929ef00fa 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -126,7 +126,7 @@ static int _restore_part_state(List old_part_list, char *old_def_part_name, uint16_t flags); static void _stat_slurm_dirs(void); static int _sync_nodes_to_comp_job(void); -static int _sync_nodes_to_jobs(void); +static int _sync_nodes_to_jobs(bool reconfig); static int _sync_nodes_to_active_job(struct job_record *job_ptr); static void _sync_nodes_to_suspended_job(struct job_record *job_ptr); static void _sync_part_prio(void); @@ -1062,7 +1062,7 @@ int read_slurm_conf(int recover, bool reconfig) _gres_reconfig(reconfig); reset_job_bitmaps(); /* must follow select_g_job_init() */ - (void) _sync_nodes_to_jobs(); + (void) _sync_nodes_to_jobs(reconfig); (void) sync_job_files(); _purge_old_node_state(old_node_table_ptr, old_node_record_count); _purge_old_part_state(old_part_list, old_def_part_name); @@ -2021,7 +2021,7 @@ static int _preserve_plugins(slurm_ctl_conf_t * ctl_conf_ptr, * RET count of nodes having state changed * Note: Operates on common variables, no arguments */ -static int _sync_nodes_to_jobs(void) +static int _sync_nodes_to_jobs(bool reconfig) { struct job_record *job_ptr; ListIterator job_iterator; @@ -2029,9 +2029,14 @@ static int _sync_nodes_to_jobs(void) job_iterator = list_iterator_create(job_list); while ((job_ptr = (struct job_record *) list_next(job_iterator))) { - if (job_ptr->details && job_ptr->details->prolog_running && - !IS_JOB_CONFIGURING(job_ptr)) + if (!reconfig && + job_ptr->details && job_ptr->details->prolog_running) { job_ptr->details->prolog_running = 0; + if (IS_JOB_CONFIGURING(job_ptr)) { + (void) prolog_slurmctld(job_ptr); + (void) bb_g_job_begin(job_ptr); + } + } if (job_ptr->node_bitmap == NULL) ;