diff --git a/NEWS b/NEWS index 974491b62ca5e4ec5961d86c5bda97b0042a3dbf..980b8ba6c35fb5754f81189ec05c3f8d4bddc103 100644 --- a/NEWS +++ b/NEWS @@ -90,6 +90,9 @@ documents those changes that are of interest to users and admins. -- Sched/backfill - Change default max_job_bf parameter from 50 to 100. -- Added -I|--item-extract option to sh5util to extract data item from series. +* Changes in Slurm 2.6.4 +======================== + * Changes in Slurm 2.6.3 ======================== -- Add support for some new #PBS options in sbatch scripts and qsub wrapper: @@ -137,6 +140,10 @@ documents those changes that are of interest to users and admins. updated the srun man page. -- BLUEGENE/CRAY - Don't set env variables that pertain to a node when Slurm isn't doing the launching. + -- gres/gpu and gres/mic - Do not treat the existence of an empty gres.conf + file as a fatal error. + -- Fixed incorrect parsing of the time specification days-0:min when the + hours are specified as 0. * Changes in Slurm 2.6.2 ======================== diff --git a/doc/man/man1/srun.1 b/doc/man/man1/srun.1 index b028fb2640623e498e1bd2397e2f15e5ac58e584..fe1cb17fa07b7148cfd57a5d01ef8b8697ac709b 100644 --- a/doc/man/man1/srun.1 +++ b/doc/man/man1/srun.1 @@ -2096,7 +2096,7 @@ Number of processes in the step. \fBSLURM_STEP_TASKS_PER_NODE\fR Number of processes per node within the step. 
.TP -\fBSLURM_STEPID\fR (and \fBSLURM_STEP_ID\fR for backwards compatibility) +\fBSLURM_STEP_ID\fR (and \fBSLURM_STEPID\fR for backwards compatibility) The step ID of the current job .TP \fBSLURM_SUBMIT_DIR\fR diff --git a/src/common/slurm_acct_gather.c b/src/common/slurm_acct_gather.c index 612d880f8f57e118cc2eadde9e59a629f91803e4..ad8799c282cb9beb1df1f72fce07278d34a0b39b 100644 --- a/src/common/slurm_acct_gather.c +++ b/src/common/slurm_acct_gather.c @@ -118,8 +118,9 @@ extern int acct_gather_conf_init(void) acct_gather_profile_g_conf_set(tbl); acct_gather_infiniband_g_conf_set(tbl); acct_gather_filesystem_g_conf_set(tbl); - /* ADD MORE HERE */ - /******************************************/ + /*********************************************************************/ + /* ADD MORE HERE AND FREE MEMORY IN acct_gather_conf_destroy() BELOW */ + /*********************************************************************/ s_p_hashtbl_destroy(tbl); @@ -128,10 +129,16 @@ extern int acct_gather_conf_init(void) extern int acct_gather_conf_destroy(void) { + int rc; + if (!inited) return SLURM_SUCCESS; - return SLURM_SUCCESS; + rc = acct_gather_energy_fini(); + rc = MAX(rc, acct_gather_filesystem_fini()); + rc = MAX(rc, acct_gather_infiniband_fini()); + rc = MAX(rc, acct_gather_profile_fini()); + return rc; } extern List acct_gather_conf_values(void) diff --git a/src/common/slurm_acct_gather_profile.c b/src/common/slurm_acct_gather_profile.c index 008e3aa56eb5b08e44ab0e0d2eb285110c1cee62..ed19e02ed6b0b40ccbf3597e74a756f2dcfd78bf 100644 --- a/src/common/slurm_acct_gather_profile.c +++ b/src/common/slurm_acct_gather_profile.c @@ -202,7 +202,7 @@ extern int acct_gather_profile_fini(void) slurm_mutex_lock(&g_context_lock); - if (g_context) + if (!g_context) goto done; init_run = false; diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index 82b49bbd796c6e2f7d9e89daa21d5509f6cfdced..a15a17f74fd916f495f006e28ab068f662688ede 100644 --- a/src/common/slurm_errno.c +++ 
b/src/common/slurm_errno.c @@ -399,7 +399,7 @@ static slurm_errtab_t slurm_errtab[] = { /* plugin and custom errors */ { ESLURM_MISSING_TIME_LIMIT, - "Missing time limit" } + "Missing time limit" } }; /* diff --git a/src/plugins/gres/gpu/gres_gpu.c b/src/plugins/gres/gpu/gres_gpu.c index 755ea4c71a14860b3486e3bc20904fa80db0d58f..ca7100d9e98f45906c9fd71a36e2db372006882f 100644 --- a/src/plugins/gres/gpu/gres_gpu.c +++ b/src/plugins/gres/gpu/gres_gpu.c @@ -133,7 +133,7 @@ extern int fini(void) */ extern int node_config_load(List gres_conf_list) { - int i, rc = SLURM_ERROR; + int i, rc = SLURM_SUCCESS; ListIterator iter; gres_slurmd_conf_t *gres_slurmd_conf; int nb_gpu = 0; /* Number of GPUs in the list */ @@ -144,7 +144,6 @@ extern int node_config_load(List gres_conf_list) while ((gres_slurmd_conf = list_next(iter))) { if (strcmp(gres_slurmd_conf->name, gres_name)) continue; - rc = SLURM_SUCCESS; if (gres_slurmd_conf->file) nb_gpu++; } @@ -299,7 +298,7 @@ extern void step_set_env(char ***job_env_ptr, void *gres_ptr) } } -/* Send GRES information to slurmstepd on the specified file descriptor*/ +/* Send GRES information to slurmstepd on the specified file descriptor */ extern void send_stepd(int fd) { int i; @@ -312,7 +311,7 @@ extern void send_stepd(int fd) rwfail: error("gres_plugin_send_stepd failed"); } -/* Receive GRES information from slurmd on the specified file descriptor*/ +/* Receive GRES information from slurmd on the specified file descriptor */ extern void recv_stepd(int fd) { int i; diff --git a/src/plugins/gres/mic/gres_mic.c b/src/plugins/gres/mic/gres_mic.c index 8c072369428aa78b4679db59b565b6e0b87c8569..b98f465359e63c8a35fc685225225beb3b9adf86 100644 --- a/src/plugins/gres/mic/gres_mic.c +++ b/src/plugins/gres/mic/gres_mic.c @@ -1,298 +1,297 @@ -/*****************************************************************************\ - * gres_mic.c - Support MICs as a generic resources. 
- ***************************************************************************** - * Copyright (C) 2012 CSC-IT Center for Science Ltd. - * Written by Olli-Pekka Lehto - * Based upon gres_gpu.c with the copyright notice shown below: - * Copyright (C) 2010 Lawrence Livermore National Security. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Morris Jette <jette1@llnl.gov> - * - * This file is part of SLURM, a resource management program. - * For details, see <http://slurm.schedmd.com/>. - * Please also read the included file: DISCLAIMER. - * - * SLURM is free software; you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under - * certain conditions as described in each individual source file, and - * distribute linked combinations including the two. You must obey the GNU - * General Public License in all respects for all of the code used other than - * OpenSSL. If you modify file(s) with this exception, you may extend this - * exception to your version of the file(s), but you are not obligated to do - * so. If you do not wish to do so, delete this exception statement from your - * version. If you delete this exception statement from all source files in - * the program, then also delete it here. - * - * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY - * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. 
- * - * You should have received a copy of the GNU General Public License along - * with SLURM; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -\*****************************************************************************/ - -#if HAVE_CONFIG_H -# include "config.h" -# if STDC_HEADERS -# include <string.h> -# endif -# if HAVE_SYS_TYPES_H -# include <sys/types.h> -# endif /* HAVE_SYS_TYPES_H */ -# if HAVE_UNISTD_H -# include <unistd.h> -# endif -# if HAVE_INTTYPES_H -# include <inttypes.h> -# else /* ! HAVE_INTTYPES_H */ -# if HAVE_STDINT_H -# include <stdint.h> -# endif -# endif /* HAVE_INTTYPES_H */ -#else /* ! HAVE_CONFIG_H */ -# include <sys/types.h> -# include <unistd.h> -# include <stdint.h> -# include <string.h> -#endif /* HAVE_CONFIG_H */ - -#ifdef HAVE_HWLOC -# include <hwloc.h> -#endif /* HAVE_HWLOC */ - -#include <stdio.h> -#include <stdlib.h> -#include <ctype.h> - -#include "slurm/slurm.h" -#include "slurm/slurm_errno.h" - -#include "src/common/slurm_xlator.h" -#include "src/common/bitstring.h" -#include "src/common/env.h" -#include "src/common/gres.h" -#include "src/common/list.h" -#include "src/common/xstring.h" - -/* - * These variables are required by the generic plugin interface. If they - * are not found in the plugin, the plugin loader will ignore it. - * - * plugin_name - A string giving a human-readable description of the - * plugin. There is no maximum length, but the symbol must refer to - * a valid string. - * - * plugin_type - A string suggesting the type of the plugin or its - * applicability to a particular form of data or method of data handling. - * If the low-level plugin API is used, the contents of this string are - * unimportant and may be anything. 
SLURM uses the higher-level plugin - * interface which requires this string to be of the form - * - * <application>/<method> - * - * where <application> is a description of the intended application of - * the plugin (e.g., "auth" for SLURM authentication) and <method> is a - * description of how this plugin satisfies that application. SLURM will - * only load authentication plugins if the plugin_type string has a prefix - * of "auth/". - * - * plugin_version - Specifies the version number of the plugin. This would - * typically be the same for all plugins. - */ -const char plugin_name[] = "Gres MIC plugin"; -const char plugin_type[] = "gres/mic"; -const uint32_t plugin_version = 120; - -static char gres_name[] = "mic"; - -static int *mic_devices = NULL; -static int nb_available_files; - -/* - * We could load gres state or validate it using various mechanisms here. - * This only validates that the configuration was specified in gres.conf. - * In the general case, no code would need to be changed. 
- */ -extern int node_config_load(List gres_conf_list) -{ - int i, rc = SLURM_ERROR; - ListIterator iter; - gres_slurmd_conf_t *gres_slurmd_conf; - int nb_mic = 0; /* Number of MICs in the list */ - int available_files_index = 0; - - xassert(gres_conf_list); - iter = list_iterator_create(gres_conf_list); - while ((gres_slurmd_conf = list_next(iter))) { - if (strcmp(gres_slurmd_conf->name, gres_name)) - continue; - rc = SLURM_SUCCESS; - if (gres_slurmd_conf->file) - nb_mic++; - } - list_iterator_destroy(iter); - mic_devices = NULL; - nb_available_files = -1; - - /* (Re-)Allocate memory if number of files changed */ - if (nb_mic != nb_available_files) { - xfree(mic_devices); /* No-op if NULL */ - mic_devices = (int *) xmalloc(sizeof(int) * nb_mic); - nb_available_files = nb_mic; - for (i = 0; i < nb_available_files; i++) - mic_devices[i] = -1; - } - - iter = list_iterator_create(gres_conf_list); - while ((gres_slurmd_conf = list_next(iter))) { - if ((strcmp(gres_slurmd_conf->name, gres_name) == 0) && - gres_slurmd_conf->file) { - /* Populate mic_devices array with number - * at end of the file name */ - for (i = 0; gres_slurmd_conf->file[i]; i++) { - if (!isdigit(gres_slurmd_conf->file[i])) - continue; - mic_devices[available_files_index] = - atoi(gres_slurmd_conf->file + i); - break; - } - available_files_index++; - } - } - list_iterator_destroy(iter); - - if (rc != SLURM_SUCCESS) - fatal("%s failed to load configuration", plugin_name); - - for (i = 0; i < nb_available_files; i++) - info("mic %d is device number %d", i, mic_devices[i]); - - return rc; -} - -/* - * Set environment variables as appropriate for a job (i.e. all tasks) based - * upon the job's GRES state. 
- */ -extern void job_set_env(char ***job_env_ptr, void *gres_ptr) -{ - int i, len; - char *dev_list = NULL; - gres_job_state_t *gres_job_ptr = (gres_job_state_t *) gres_ptr; - - if ((gres_job_ptr != NULL) && - (gres_job_ptr->node_cnt == 1) && - (gres_job_ptr->gres_bit_alloc != NULL) && - (gres_job_ptr->gres_bit_alloc[0] != NULL)) { - len = bit_size(gres_job_ptr->gres_bit_alloc[0]); - for (i=0; i<len; i++) { - if (!bit_test(gres_job_ptr->gres_bit_alloc[0], i)) - continue; - if (!dev_list) - dev_list = xmalloc(128); - else - xstrcat(dev_list, ","); - if (mic_devices && (mic_devices[i] >= 0)) - xstrfmtcat(dev_list, "%d", mic_devices[i]); - else - xstrfmtcat(dev_list, "%d", i); - } - } - if (dev_list) { - env_array_overwrite(job_env_ptr,"OFFLOAD_DEVICES", - dev_list); - xfree(dev_list); - } else { - /* The gres.conf file must identify specific device files - * in order to set the OFFLOAD_DEVICES env var */ - error("gres/mic unable to set OFFLOAD_DEVICES, " - "no device files configured"); - } -} - -/* - * Set environment variables as appropriate for a job (i.e. all tasks) based - * upon the job step's GRES state. 
- */ -extern void step_set_env(char ***job_env_ptr, void *gres_ptr) -{ - int i, len; - char *dev_list = NULL; - gres_step_state_t *gres_step_ptr = (gres_step_state_t *) gres_ptr; - - if ((gres_step_ptr != NULL) && - (gres_step_ptr->node_cnt == 1) && - (gres_step_ptr->gres_bit_alloc != NULL) && - (gres_step_ptr->gres_bit_alloc[0] != NULL)) { - len = bit_size(gres_step_ptr->gres_bit_alloc[0]); - for (i=0; i<len; i++) { - if (!bit_test(gres_step_ptr->gres_bit_alloc[0], i)) - continue; - if (!dev_list) - dev_list = xmalloc(128); - else - xstrcat(dev_list, ","); - if (mic_devices && (mic_devices[i] >= 0)) - xstrfmtcat(dev_list, "%d", mic_devices[i]); - else - xstrfmtcat(dev_list, "%d", i); - } - } - if (dev_list) { - env_array_overwrite(job_env_ptr,"OFFLOAD_DEVICES", - dev_list); - xfree(dev_list); - } else { - /* The gres.conf file must identify specific device files - * in order to set the OFFLOAD_DEVICES env var */ - error("gres/mic unable to set OFFLOAD_DEVICES, " - "no device files configured"); - } -} - -/* Send GRES information to slurmstepd on the specified file descriptor*/ -extern void send_stepd(int fd) -{ - int i; - - safe_write(fd, &nb_available_files, sizeof(int)); - for (i = 0; i < nb_available_files; i++) - safe_write(fd, &mic_devices[i], sizeof(int)); - return; - -rwfail: error("gres_plugin_send_stepd failed"); -} - -/* Receive GRES information from slurmd on the specified file descriptor*/ -extern void recv_stepd(int fd) -{ - int i; - - safe_read(fd, &nb_available_files, sizeof(int)); - if (nb_available_files > 0) - mic_devices = xmalloc(sizeof(int) * nb_available_files); - for (i = 0; i < nb_available_files; i++) - safe_read(fd, &mic_devices[i], sizeof(int)); - return; - -rwfail: error("gres_plugin_recv_stepd failed"); -} - -extern int job_info(gres_job_state_t *job_gres_data, uint32_t node_inx, - enum gres_job_data_type data_type, void *data) -{ - return EINVAL; -} - -extern int step_info(gres_step_state_t *step_gres_data, uint32_t node_inx, - enum 
gres_step_data_type data_type, void *data) -{ - return EINVAL; -} +/*****************************************************************************\ + * gres_mic.c - Support MICs as a generic resources. + ***************************************************************************** + * Copyright (C) 2012 CSC-IT Center for Science Ltd. + * Written by Olli-Pekka Lehto + * Based upon gres_gpu.c with the copyright notice shown below: + * Copyright (C) 2010 Lawrence Livermore National Security. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Morris Jette <jette1@llnl.gov> + * + * This file is part of SLURM, a resource management program. + * For details, see <http://slurm.schedmd.com/>. + * Please also read the included file: DISCLAIMER. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +\*****************************************************************************/ + +#if HAVE_CONFIG_H +# include "config.h" +# if STDC_HEADERS +# include <string.h> +# endif +# if HAVE_SYS_TYPES_H +# include <sys/types.h> +# endif /* HAVE_SYS_TYPES_H */ +# if HAVE_UNISTD_H +# include <unistd.h> +# endif +# if HAVE_INTTYPES_H +# include <inttypes.h> +# else /* ! HAVE_INTTYPES_H */ +# if HAVE_STDINT_H +# include <stdint.h> +# endif +# endif /* HAVE_INTTYPES_H */ +#else /* ! HAVE_CONFIG_H */ +# include <sys/types.h> +# include <unistd.h> +# include <stdint.h> +# include <string.h> +#endif /* HAVE_CONFIG_H */ + +#ifdef HAVE_HWLOC +# include <hwloc.h> +#endif /* HAVE_HWLOC */ + +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> + +#include "slurm/slurm.h" +#include "slurm/slurm_errno.h" + +#include "src/common/slurm_xlator.h" +#include "src/common/bitstring.h" +#include "src/common/env.h" +#include "src/common/gres.h" +#include "src/common/list.h" +#include "src/common/xstring.h" + +/* + * These variables are required by the generic plugin interface. If they + * are not found in the plugin, the plugin loader will ignore it. + * + * plugin_name - A string giving a human-readable description of the + * plugin. There is no maximum length, but the symbol must refer to + * a valid string. + * + * plugin_type - A string suggesting the type of the plugin or its + * applicability to a particular form of data or method of data handling. + * If the low-level plugin API is used, the contents of this string are + * unimportant and may be anything. 
SLURM uses the higher-level plugin + * interface which requires this string to be of the form + * + * <application>/<method> + * + * where <application> is a description of the intended application of + * the plugin (e.g., "auth" for SLURM authentication) and <method> is a + * description of how this plugin satisfies that application. SLURM will + * only load authentication plugins if the plugin_type string has a prefix + * of "auth/". + * + * plugin_version - Specifies the version number of the plugin. This would + * typically be the same for all plugins. + */ +const char plugin_name[] = "Gres MIC plugin"; +const char plugin_type[] = "gres/mic"; +const uint32_t plugin_version = 120; + +static char gres_name[] = "mic"; + +static int *mic_devices = NULL; +static int nb_available_files; + +/* + * We could load gres state or validate it using various mechanisms here. + * This only validates that the configuration was specified in gres.conf. + * In the general case, no code would need to be changed. 
+ */ +extern int node_config_load(List gres_conf_list) +{ + int i, rc = SLURM_SUCCESS; + ListIterator iter; + gres_slurmd_conf_t *gres_slurmd_conf; + int nb_mic = 0; /* Number of MICs in the list */ + int available_files_index = 0; + + xassert(gres_conf_list); + iter = list_iterator_create(gres_conf_list); + while ((gres_slurmd_conf = list_next(iter))) { + if (strcmp(gres_slurmd_conf->name, gres_name)) + continue; + if (gres_slurmd_conf->file) + nb_mic++; + } + list_iterator_destroy(iter); + mic_devices = NULL; + nb_available_files = -1; + + /* (Re-)Allocate memory if number of files changed */ + if (nb_mic != nb_available_files) { + xfree(mic_devices); /* No-op if NULL */ + mic_devices = (int *) xmalloc(sizeof(int) * nb_mic); + nb_available_files = nb_mic; + for (i = 0; i < nb_available_files; i++) + mic_devices[i] = -1; + } + + iter = list_iterator_create(gres_conf_list); + while ((gres_slurmd_conf = list_next(iter))) { + if ((strcmp(gres_slurmd_conf->name, gres_name) == 0) && + gres_slurmd_conf->file) { + /* Populate mic_devices array with number + * at end of the file name */ + for (i = 0; gres_slurmd_conf->file[i]; i++) { + if (!isdigit(gres_slurmd_conf->file[i])) + continue; + mic_devices[available_files_index] = + atoi(gres_slurmd_conf->file + i); + break; + } + available_files_index++; + } + } + list_iterator_destroy(iter); + + if (rc != SLURM_SUCCESS) + fatal("%s failed to load configuration", plugin_name); + + for (i = 0; i < nb_available_files; i++) + info("mic %d is device number %d", i, mic_devices[i]); + + return rc; +} + +/* + * Set environment variables as appropriate for a job (i.e. all tasks) based + * upon the job's GRES state. 
+ */ +extern void job_set_env(char ***job_env_ptr, void *gres_ptr) +{ + int i, len; + char *dev_list = NULL; + gres_job_state_t *gres_job_ptr = (gres_job_state_t *) gres_ptr; + + if ((gres_job_ptr != NULL) && + (gres_job_ptr->node_cnt == 1) && + (gres_job_ptr->gres_bit_alloc != NULL) && + (gres_job_ptr->gres_bit_alloc[0] != NULL)) { + len = bit_size(gres_job_ptr->gres_bit_alloc[0]); + for (i=0; i<len; i++) { + if (!bit_test(gres_job_ptr->gres_bit_alloc[0], i)) + continue; + if (!dev_list) + dev_list = xmalloc(128); + else + xstrcat(dev_list, ","); + if (mic_devices && (mic_devices[i] >= 0)) + xstrfmtcat(dev_list, "%d", mic_devices[i]); + else + xstrfmtcat(dev_list, "%d", i); + } + } + if (dev_list) { + env_array_overwrite(job_env_ptr,"OFFLOAD_DEVICES", + dev_list); + xfree(dev_list); + } else { + /* The gres.conf file must identify specific device files + * in order to set the OFFLOAD_DEVICES env var */ + error("gres/mic unable to set OFFLOAD_DEVICES, " + "no device files configured"); + } +} + +/* + * Set environment variables as appropriate for a job (i.e. all tasks) based + * upon the job step's GRES state. 
+ */ +extern void step_set_env(char ***job_env_ptr, void *gres_ptr) +{ + int i, len; + char *dev_list = NULL; + gres_step_state_t *gres_step_ptr = (gres_step_state_t *) gres_ptr; + + if ((gres_step_ptr != NULL) && + (gres_step_ptr->node_cnt == 1) && + (gres_step_ptr->gres_bit_alloc != NULL) && + (gres_step_ptr->gres_bit_alloc[0] != NULL)) { + len = bit_size(gres_step_ptr->gres_bit_alloc[0]); + for (i=0; i<len; i++) { + if (!bit_test(gres_step_ptr->gres_bit_alloc[0], i)) + continue; + if (!dev_list) + dev_list = xmalloc(128); + else + xstrcat(dev_list, ","); + if (mic_devices && (mic_devices[i] >= 0)) + xstrfmtcat(dev_list, "%d", mic_devices[i]); + else + xstrfmtcat(dev_list, "%d", i); + } + } + if (dev_list) { + env_array_overwrite(job_env_ptr,"OFFLOAD_DEVICES", + dev_list); + xfree(dev_list); + } else { + /* The gres.conf file must identify specific device files + * in order to set the OFFLOAD_DEVICES env var */ + error("gres/mic unable to set OFFLOAD_DEVICES, " + "no device files configured"); + } +} + +/* Send GRES information to slurmstepd on the specified file descriptor */ +extern void send_stepd(int fd) +{ + int i; + + safe_write(fd, &nb_available_files, sizeof(int)); + for (i = 0; i < nb_available_files; i++) + safe_write(fd, &mic_devices[i], sizeof(int)); + return; + +rwfail: error("gres_plugin_send_stepd failed"); +} + +/* Receive GRES information from slurmd on the specified file descriptor */ +extern void recv_stepd(int fd) +{ + int i; + + safe_read(fd, &nb_available_files, sizeof(int)); + if (nb_available_files > 0) + mic_devices = xmalloc(sizeof(int) * nb_available_files); + for (i = 0; i < nb_available_files; i++) + safe_read(fd, &mic_devices[i], sizeof(int)); + return; + +rwfail: error("gres_plugin_recv_stepd failed"); +} + +extern int job_info(gres_job_state_t *job_gres_data, uint32_t node_inx, + enum gres_job_data_type data_type, void *data) +{ + return EINVAL; +} + +extern int step_info(gres_step_state_t *step_gres_data, uint32_t node_inx, + enum 
gres_step_data_type data_type, void *data) +{ + return EINVAL; +} diff --git a/src/slurmd/common/setproctitle.c b/src/slurmd/common/setproctitle.c index 9cc07b9bbc4ffdc103257ba4cb90c702a8337e07..0b838934e8b97c33eb30461570af4fc1da7ca59b 100644 --- a/src/slurmd/common/setproctitle.c +++ b/src/slurmd/common/setproctitle.c @@ -154,6 +154,7 @@ static const size_t ps_buffer_size = sizeof(ps_buffer); #else static char *ps_buffer; /* will point to argv area */ static size_t ps_buffer_size; /* space determined at run time */ +static char **new_environ = (char **) NULL; #endif /* save the original argv[] location here */ @@ -261,7 +262,6 @@ init_setproctitle(int argc, char *argv[]) { #if SETPROCTITLE_STRATEGY == PS_USE_CLOBBER_ARGV char *end_of_area = NULL; - char **new_environ; int i; #endif @@ -318,10 +318,7 @@ init_setproctitle(int argc, char *argv[]) } for (i = 0; environ[i] != NULL; i++) { new_environ[i] = strdup(environ[i]); - //free(environ[i]); } - /* if (environ) */ -/* free(environ); */ new_environ[i] = NULL; environ = new_environ; #endif /* PS_USE_CLOBBER_ARGV */ @@ -334,11 +331,15 @@ void fini_setproctitle(void) #if SETPROCTITLE_STRATEGY == PS_USE_CLOBBER_ARGV int i; - for (i = 0; environ[i] != NULL; i++) { - free(environ[i]); + if (!new_environ) + return; + + for (i = 0; new_environ[i] != NULL; i++) { + free(new_environ[i]); } - free(environ); - environ = (char **) NULL; + free(new_environ); + new_environ = (char **) NULL; + environ = new_environ; #endif /* PS_USE_CLOBBER_ARGV */ } diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c index 6e3007c60814831e3663a8c136422bfa93f104a3..9fc7a59bc25ea0c982b2102debdc441fe7ec6c28 100644 --- a/src/slurmd/slurmd/slurmd.c +++ b/src/slurmd/slurmd/slurmd.c @@ -171,7 +171,7 @@ static void _term_handler(int); static void _update_logging(void); static void _update_nice(void); static void _usage(void); -static void _wait_for_all_threads(void); +static void _wait_for_all_threads(int secs); int @@ -336,11 +336,7 @@ 
main (int argc, char *argv[]) if (unlink(conf->pidfile) < 0) error("Unable to remove pidfile `%s': %m", conf->pidfile); - _wait_for_all_threads(); - - switch_g_node_fini(); - jobacct_gather_fini(); - + _wait_for_all_threads(120); _slurmd_fini(); _destroy_conf(); slurm_crypto_fini(); /* must be after _destroy_conf() */ @@ -412,6 +408,7 @@ _msg_engine(void) while (!_shutdown) { if (_reconfig) { verbose("got reconfigure request"); + _wait_for_all_threads(5); /* Wait for RPCs to finish */ _reconfigure(); } @@ -461,15 +458,16 @@ _increment_thd_count(void) slurm_mutex_unlock(&active_mutex); } +/* secs IN - wait up to this number of seconds for all threads to complete */ static void -_wait_for_all_threads(void) +_wait_for_all_threads(int secs) { struct timespec ts; int rc; ts.tv_sec = time(NULL); ts.tv_nsec = 0; - ts.tv_sec += 120; /* 2 minutes allowed for shutdown */ + ts.tv_sec += secs; slurm_mutex_lock(&active_mutex); while (active_threads > 0) { @@ -798,8 +796,6 @@ _read_config(void) xfree(conf->block_map); xfree(conf->block_map_inv); - conf->block_map_size = 0; - _update_logging(); _update_nice(); @@ -1573,6 +1569,9 @@ cleanup: static int _slurmd_fini(void) { + switch_g_node_fini(); + jobacct_gather_fini(); + acct_gather_profile_fini(); save_cred_state(conf->vctx); switch_fini(); slurmd_task_fini(); diff --git a/testsuite/expect/test1.84 b/testsuite/expect/test1.84 index b106fce072d581cbcb08c25ff213dc98426e3f57..7250902020a4966cd89f8c0f96066b2eb93c7a80 100755 --- a/testsuite/expect/test1.84 +++ b/testsuite/expect/test1.84 @@ -116,6 +116,8 @@ if {$part_share_force != 0} { exit 0 } +# Add sleep for any epilog clean up of previous jobs +sleep 2 set fd [open "|$scontrol --oneliner show node $def_hostlist"] exp_internal 1 while {[gets $fd line] != -1} {