diff --git a/NEWS b/NEWS index a00fcfa9fbbf2fa6de637fc1efd574ba606282aa..3d14c4bce15c6cd15f5d0be9794b17f18a9ff55c 100644 --- a/NEWS +++ b/NEWS @@ -245,6 +245,14 @@ documents those changes that are of interest to users and administrators. variable "PMI_CRAY_NO_SMP_ENV=1" -- Fix invalid memory reference in SlurmDBD when putting a node up. -- Allow opening of plugstack.conf even when a symlink. + -- Fix scontrol reboot so that rebooted nodes will not be set down with reason + 'Node xyz unexpectedly rebooted' but will be correctly put back to service. + -- CRAY - Throttle the post NHC operations as to not hog the job write lock + if many steps/jobs finish at once. + -- Disable changes to GRES count while jobs are running on the node. + -- CRAY - Fix issue with scontrol reconfig. + -- slurmd: Remove wrong reporting of "Error reading step ... memory limit". + The logic was treating success as an error. * Changes in Slurm 14.11.5 ========================== diff --git a/src/common/gres.c b/src/common/gres.c index b4348d2529a98bc14ff1d0c6020bbda8c9772b19..da3ab68ebe406a632680bcd7a3154506b619c6bc 100644 --- a/src/common/gres.c +++ b/src/common/gres.c @@ -1631,11 +1631,23 @@ extern int _node_config_validate(char *node_name, char *orig_config, context_ptr->gres_type, node_name, gres_data->gres_cnt_found, gres_cnt); } - gres_data->gres_cnt_found = gres_cnt; - updated_config = true; + if ((gres_data->gres_cnt_found != NO_VAL) && + (gres_data->gres_cnt_alloc != 0)) { + if (reason_down && (*reason_down == NULL)) { + xstrfmtcat(*reason_down, + "%s count changed and jobs are " + "using them (%u != %u)", + context_ptr->gres_type, + gres_data->gres_cnt_found, gres_cnt); + } + rc = EINVAL; + } else { + gres_data->gres_cnt_found = gres_cnt; + updated_config = true; + } } if (updated_config == false) - return SLURM_SUCCESS; + return rc; if ((set_cnt == 0) && (set_cnt != gres_data->topo_cnt)) { /* Need to clear topology info */ diff --git a/src/common/slurm_protocol_defs.h 
b/src/common/slurm_protocol_defs.h index f13e9f4783a4d13dc0b3450f17eead6682b76ddf..bc0de744fd27a6352283a5664bda6cb5b1bda49a 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -1351,9 +1351,11 @@ extern char *rpc_num2string(uint16_t opcode); int rc; \ while (remaining > 0) { \ rc = read(fd, ptr, remaining); \ - if ((rc == 0) && (remaining == size)) \ + if ((rc == 0) && (remaining == size)) { \ + debug("%s:%d: %s: safe_read EOF", \ + __FILE__, __LINE__, __CURRENT_FUNC__); \ goto rwfail; \ - else if (rc == 0) { \ + } else if (rc == 0) { \ debug("%s:%d: %s: safe_read (%d of %d) EOF", \ __FILE__, __LINE__, __CURRENT_FUNC__, \ remaining, (int)size); \ diff --git a/src/plugins/select/cray/select_cray.c b/src/plugins/select/cray/select_cray.c index c9a3c620e4d4a9eaa7000b5207defeaea59ab4f3..f7fe589d1df14e58df92f1c5c97ba063421d1307 100644 --- a/src/plugins/select/cray/select_cray.c +++ b/src/plugins/select/cray/select_cray.c @@ -151,6 +151,13 @@ static uint32_t blade_cnt = 0; static pthread_mutex_t blade_mutex = PTHREAD_MUTEX_INITIALIZER; static time_t last_npc_update; +static alpsc_topology_t *topology = NULL; +static size_t topology_num_nodes = 0; + +static int active_post_nhc_cnt = 0; +static pthread_mutex_t throttle_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t throttle_cond = PTHREAD_COND_INITIALIZER; + #ifdef HAVE_NATIVE_CRAY @@ -865,6 +872,32 @@ static void _set_job_running_restore(select_jobinfo_t *jobinfo) last_npc_update = time(NULL); } +/* These functions prevent the fini's of jobs and steps from keeping + * the slurmctld write locks constantly set after the nhc is run, + * which can prevent other RPCs and system functions from being + * processed. For example, a steady stream of step or job completions + * can prevent squeue from responding or jobs from being scheduled. 
*/ +static void _throttle_start(void) +{ + slurm_mutex_lock(&throttle_mutex); + while (1) { + if (active_post_nhc_cnt == 0) { + active_post_nhc_cnt++; + break; + } + pthread_cond_wait(&throttle_cond, &throttle_mutex); + } + slurm_mutex_unlock(&throttle_mutex); + usleep(100); +} +static void _throttle_fini(void) +{ + slurm_mutex_lock(&throttle_mutex); + active_post_nhc_cnt--; + pthread_cond_broadcast(&throttle_cond); + slurm_mutex_unlock(&throttle_mutex); +} + static void *_job_fini(void *args) { struct job_record *job_ptr = (struct job_record *)args; @@ -896,6 +929,7 @@ static void *_job_fini(void *args) /***********/ xfree(nhc_info.nodelist); + _throttle_start(); lock_slurmctld(job_write_lock); if (job_ptr->magic == JOB_MAGIC) { select_jobinfo_t *jobinfo = NULL; @@ -911,6 +945,7 @@ static void *_job_fini(void *args) "this should never happen", nhc_info.jobid); unlock_slurmctld(job_write_lock); + _throttle_fini(); return NULL; } @@ -957,6 +992,7 @@ static void *_step_fini(void *args) xfree(nhc_info.nodelist); + _throttle_start(); lock_slurmctld(job_write_lock); if (!step_ptr->job_ptr) { error("For some reason we don't have a job_ptr for " @@ -988,6 +1024,7 @@ static void *_step_fini(void *args) post_job_step(step_ptr); } unlock_slurmctld(job_write_lock); + _throttle_fini(); return NULL; } @@ -1114,6 +1151,9 @@ extern int fini ( void ) _free_blade(&blade_array[i]); xfree(blade_array); + if (topology) + free(topology); + slurm_mutex_unlock(&blade_mutex); return other_select_fini(); @@ -1440,23 +1480,27 @@ extern int select_p_node_init(struct node_record *node_ptr, int node_cnt) #if defined(HAVE_NATIVE_CRAY_GA) && !defined(HAVE_CRAY_NETWORK) int nn, end_nn, last_nn = 0; bool found = 0; - alpsc_topology_t *topology = NULL; - size_t num_nodes; char *err_msg = NULL; - if (alpsc_get_topology(&err_msg, &topology, &num_nodes)) { - if (err_msg) { - error("(%s: %d: %s) Could not get system " - "topology info: %s", - THIS_FILE, __LINE__, __FUNCTION__, err_msg); - 
free(err_msg); - } else { - error("(%s: %d: %s) Could not get system " - "topology info: No error message present.", - THIS_FILE, __LINE__, __FUNCTION__); + if (!topology) { + if (alpsc_get_topology(&err_msg, &topology, + &topology_num_nodes)) { + if (err_msg) { + error("(%s: %d: %s) Could not get system " + "topology info: %s", + THIS_FILE, __LINE__, + __FUNCTION__, err_msg); + free(err_msg); + } else { + error("(%s: %d: %s) Could not get system " + "topology info: No error " + "message present.", + THIS_FILE, __LINE__, __FUNCTION__); + } + return SLURM_ERROR; } - return SLURM_ERROR; } + #endif slurm_mutex_lock(&blade_mutex); @@ -1490,7 +1534,7 @@ extern int select_p_node_init(struct node_record *node_ptr, int node_cnt) } #if defined(HAVE_NATIVE_CRAY_GA) && !defined(HAVE_CRAY_NETWORK) - end_nn = num_nodes; + end_nn = topology_num_nodes; start_again: @@ -1506,7 +1550,7 @@ extern int select_p_node_init(struct node_record *node_ptr, int node_cnt) break; } } - if (end_nn != num_nodes) { + if (end_nn != topology_num_nodes) { /* already looped */ fatal("Node %s(%d) isn't found on the system", node_ptr->name, nodeinfo->nid); @@ -1545,10 +1589,6 @@ extern int select_p_node_init(struct node_record *node_ptr, int node_cnt) /* give back the memory */ xrealloc(blade_array, sizeof(blade_info_t) * blade_cnt); -#if defined(HAVE_NATIVE_CRAY_GA) && !defined(HAVE_CRAY_NETWORK) - free(topology); -#endif - slurm_mutex_unlock(&blade_mutex); return other_node_init(node_ptr, node_cnt); diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index 9e368a126df8f91b88986cd9a9a7c2718b50d76e..33e944c8600bede0227c1c30c314f2a3fbc859f4 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -1350,7 +1350,7 @@ static void _queue_reboot_msg(void) node_ptr->reason = xstrdup("Scheduled reboot"); bit_clear(avail_node_bitmap, i); bit_clear(idle_node_bitmap, i); - node_ptr->last_response = now; + node_ptr->last_response = now + slurm_get_resume_timeout(); } if 
(reboot_agent_args != NULL) { hostlist_uniq(reboot_agent_args->hostlist); diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 078d66c757e96d2d6a324040ba86bfd002e04c9b..3060f8b298343ea099f9d0c83ae132c9bcba05bc 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -2325,8 +2325,11 @@ extern int validate_node_specs(slurm_node_registration_status_msg_t *reg_msg, node_ptr->reason = xstrdup( "Node unexpectedly rebooted"); } - info("Node %s unexpectedly rebooted", - reg_msg->node_name); + info("%s: Node %s unexpectedly rebooted boot_time %d " + "last response %d", + __func__, reg_msg->node_name, + (int)node_ptr->boot_time, + (int)node_ptr->last_response); _make_node_down(node_ptr, now); kill_running_job_by_node_name(reg_msg->node_name); last_node_update = now; diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index a075ce01c95168a09399b5fb42cafa6d35d8861e..a4136edf6c6ad4b8aa08f1d8fef0605a5493f2ee 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -1990,9 +1990,10 @@ _load_job_limits(void) if (fd == -1) continue; /* step completed */ - if (!stepd_get_mem_limits(fd, stepd->protocol_version, - &stepd_mem_info)) { - error("Error reading step %u.%u memory limits", + if (stepd_get_mem_limits(fd, stepd->protocol_version, + &stepd_mem_info) != SLURM_SUCCESS) { + error("Error reading step %u.%u memory limits from " + "slurmstepd", stepd->jobid, stepd->stepid); close(fd); continue; diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c index ea5c9e68798056b8e444c872a7bdc606306e06a5..f652d842f4825a9b215f5f2459c2eb0b74848df9 100644 --- a/src/slurmd/slurmd/slurmd.c +++ b/src/slurmd/slurmd/slurmd.c @@ -2051,8 +2051,8 @@ static int _memory_spec_init(void) pid_t pid; if (conf->mem_spec_limit == 0) { - info ("Resource spec: system memory limit not configured " - "for this node"); + info ("Resource spec: Reserved system memory limit not " + "configured for this node"); return SLURM_SUCCESS; } if 
(init_system_memory_cgroup() != SLURM_SUCCESS) {