diff --git a/NEWS b/NEWS index a6ca65311b10ac952fe2366e476705b55740970c..3f38ab60dbb5eea811e6a13b4dc8318c811bbc66 100644 --- a/NEWS +++ b/NEWS @@ -57,6 +57,9 @@ documents those changes that are of interest to users and administrators. * Changes in Slurm 14.11.2 ========================== -- Fix Centos5 compile errors. + -- Fix issue with association hash not getting the correct index which + could result in seg fault. + -- Fix salloc/sbatch -B segfault. * Changes in Slurm 14.11.1 ========================== @@ -409,6 +412,8 @@ documents those changes that are of interest to users and administrators. -- Double max string that Slurm can pack from 16MB to 32MB to support larger MPI2 configurations. -- Fix Centos5 compile issues. + -- Log Cray MPI job calling exit() without mpi_fini(), but do not treat it as + a fatal error. This partially reverts logic added in version 14.03.9. * Changes in Slurm 14.03.10 =========================== diff --git a/src/common/assoc_mgr.c b/src/common/assoc_mgr.c index 662866833fc10e7fe246864b6230ed87221bafb9..2e2b4a45739915875e965a8f1c01b9dfedd887f2 100644 --- a/src/common/assoc_mgr.c +++ b/src/common/assoc_mgr.c @@ -249,19 +249,21 @@ static slurmdb_assoc_rec_t *_find_assoc_rec( * assoc_count - count of assoc list entries * assoc_hash - hash table into assoc records */ -static void _delete_assoc_hash(void *assoc) +static void _delete_assoc_hash(slurmdb_assoc_rec_t *assoc) { - slurmdb_assoc_rec_t *assoc_ptr = - (slurmdb_assoc_rec_t *) assoc; + slurmdb_assoc_rec_t *assoc_ptr = assoc; slurmdb_assoc_rec_t **assoc_pptr; xassert(assoc); /* Remove the record from assoc hash table */ assoc_pptr = &assoc_hash_id[ASSOC_HASH_ID_INX(assoc_ptr->id)]; - while (assoc_pptr && ((assoc_ptr = *assoc_pptr) != - (slurmdb_assoc_rec_t *) assoc)) - assoc_pptr = &assoc_ptr->assoc_next_id; + while (assoc_pptr && ((assoc_ptr = *assoc_pptr) != assoc)) { + if (!assoc_ptr->assoc_next_id) + assoc_pptr = NULL; + else + assoc_pptr = &assoc_ptr->assoc_next_id; + } if (!assoc_pptr) { fatal("assoc id hash error"); @@ -269,11 +271,14 @@ static void _delete_assoc_hash(void *assoc) } else *assoc_pptr = assoc_ptr->assoc_next_id; - assoc_ptr = (slurmdb_assoc_rec_t *) assoc; + assoc_ptr = assoc; assoc_pptr = &assoc_hash[_assoc_hash_index(assoc_ptr)]; - while (assoc_pptr && ((assoc_ptr = *assoc_pptr) != - (slurmdb_assoc_rec_t *) assoc)) - assoc_pptr = &assoc_ptr->assoc_next; + while (assoc_pptr && ((assoc_ptr = *assoc_pptr) != assoc)) { + if (!assoc_ptr->assoc_next) + assoc_pptr = NULL; + else + assoc_pptr = &assoc_ptr->assoc_next; + } if (!assoc_pptr) { fatal("assoc hash error"); @@ -456,13 +461,16 @@ static int _change_user_name(slurmdb_user_rec_t *user) if (!assoc->user) continue; if (!strcmp(user->old_name, assoc->user)) { - xfree(assoc->user); - assoc->user = xstrdup(user->name); - assoc->uid = user->uid; /* Since the uid changed the - hash as well will change. + hash as well will change. Remove + the assoc from the hash before the + change or you won't find it. */ _delete_assoc_hash(assoc); + + xfree(assoc->user); + assoc->user = xstrdup(user->name); + assoc->uid = user->uid; _add_assoc_hash(assoc); debug3("changing assoc %d", assoc->id); } @@ -4576,11 +4584,14 @@ extern int assoc_mgr_set_missing_uids() "couldn't get a uid for user %s", object->user); } else { - object->uid = pw_uid; /* Since the uid changed the - hash as well will change. + hash as well will change. Remove + the assoc from the hash before the + change or you won't find it. */ _delete_assoc_hash(object); + + object->uid = pw_uid; _add_assoc_hash(object); } } diff --git a/src/common/proc_args.c b/src/common/proc_args.c index b067cdf582ba403b33c4ea0b599669fbda1f8813..feeb6c7cf506299430ee01e53a0310ac742e0d20 100644 --- a/src/common/proc_args.c +++ b/src/common/proc_args.c @@ -608,7 +608,8 @@ bool verify_socket_core_thread_count(const char *arg, int *min_sockets, /* if cpu_bind_type doesn't already have a auto preference, choose * the level based on the level of the -E specification */ - if (!(*cpu_bind_type & (CPU_BIND_TO_SOCKETS | + if (cpu_bind_type && + !(*cpu_bind_type & (CPU_BIND_TO_SOCKETS | CPU_BIND_TO_CORES | CPU_BIND_TO_THREADS))) { if (j == 0) { diff --git a/src/plugins/task/cray/task_cray.c b/src/plugins/task/cray/task_cray.c index 7ababe4af2ac31ded372ac331b84179855b7995e..86ba6ee7d129dd93c260111fb291c1ac128ea011 100644 --- a/src/plugins/task/cray/task_cray.c +++ b/src/plugins/task/cray/task_cray.c @@ -609,7 +609,6 @@ static int _check_status_file(stepd_step_rec_t *job, return SLURM_SUCCESS; } - // Cancel the job step, since we didn't find the mpi_fini msg // srun only gets the error() messages by default, send one // per compute node, but log all other events with info(). if (terminated) { @@ -622,8 +621,6 @@ static int _check_status_file(stepd_step_rec_t *job, job->jobid, job->stepid, task->gtid); terminated = 1; } - info("reset estatus from %d to %d", task->estatus, SIGKILL); - task->estatus = SIGKILL; } return SLURM_SUCCESS; }