diff --git a/NEWS b/NEWS index 06d59042973448df20ca04c13bb06c92a0bfb71f..7bf4d4be2a9d842829da7b6454a1f32783b3541c 100644 --- a/NEWS +++ b/NEWS @@ -217,6 +217,11 @@ documents those changes that are of interest to users and admins. the code) -- Added support for OSX build. +* Changes in SLURM 1.1.27 +========================= + - Fix possible race condition for two simultaneous "scontrol show config" + calls resulting in slurm_xfree() Error: from read_config.c:642 + * Changes in SLURM 1.1.26 ========================= - In sched/wiki2, fixes for support of job features. @@ -669,6 +674,9 @@ documents those changes that are of interest to users and admins. * Changes in SLURM 1.0.17 ========================= -- Set correct user groups for task epilogs. + -- Set SLURM_DIST_CYCLIC = 1 (needed for HP MPI, slurm.hp.env.patch). + -- Add more debugging for tracking slow slurmd job initiations + (slurm.hp.replaydebug.patch). * Changes in SLURM 1.0.16 ========================= diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index c3026b5c28d93db9ec8c4459a02b3d57bf3e1b4a..cc9d67d1638f402efd0b59b00adf8831b8809506 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -309,7 +309,7 @@ enum jobacct_data_type { /* Possible task distributions across the nodes */ typedef enum task_dist_states { - SLURM_DIST_CYCLIC, /* distribute tasks 1 per node, round robin */ + SLURM_DIST_CYCLIC = 1, /* distribute tasks 1 per node, round robin */ SLURM_DIST_BLOCK, /* distribute tasks filling node by node */ SLURM_DIST_ARBITRARY, /* arbitrary task distribution */ SLURM_DIST_PLANE, /* distribute tasks by filling up diff --git a/src/common/read_config.c b/src/common/read_config.c index 947637efca4bd3b74c678c244c5602c66a9f238d..0a69ee59668942355ce3be154216a861d95a9b75 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -1023,9 +1023,11 @@ gethostname_short (char *name, size_t len) /* * free_slurm_conf - free all storage associated with a slurm_ctl_conf_t. * IN/OUT ctl_conf_ptr - pointer to data structure to be freed + * IN purge_node_hash - purge system-wide node hash table if set, + * set to zero if clearing private copy of config data */ -void -free_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr) +extern void +free_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr, bool purge_node_hash) { xfree (ctl_conf_ptr->authtype); xfree (ctl_conf_ptr->checkpoint_type); @@ -1067,8 +1069,9 @@ free_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr) xfree (ctl_conf_ptr->srun_prolog); xfree (ctl_conf_ptr->srun_epilog); xfree (ctl_conf_ptr->node_prefix); - - _free_name_hashtbl(); + + if (purge_node_hash) + _free_name_hashtbl(); } /* @@ -1189,7 +1192,7 @@ _destroy_slurm_conf() s_p_hashtbl_destroy(default_partition_tbl); default_partition_tbl = NULL; } - free_slurm_conf(conf_ptr); + free_slurm_conf(conf_ptr, true); conf_initialized = false; /* xfree(conf_ptr); */ diff --git a/src/common/read_config.h b/src/common/read_config.h index 471eb7ef15e2222779df6c144b72d58fd524469b..8263651cc959a3c1828d45eb98859b40c344fc66 100644 --- a/src/common/read_config.h +++ b/src/common/read_config.h @@ -281,8 +281,11 @@ extern void init_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr); /* * free_slurm_conf - free all storage associated with a slurm_ctl_conf_t. * IN/OUT ctl_conf_ptr - pointer to data structure to be freed + * IN purge_node_hash - purge system-wide node hash table if set, + * set to zero if clearing private copy of config data */ -extern void free_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr); +extern void free_slurm_conf (slurm_ctl_conf_t *ctl_conf_ptr, + bool purge_node_hash); /* * gethostname_short - equivalent to gethostname(), but return only the first diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index a42326e434cfb7c5990cf12e7cd8cb5bd4f01753..a627eacdbcbd298292070b7df314ab0b84447a2e 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -548,7 +548,7 @@ static void _slurm_rpc_dump_conf(slurm_msg_t * msg) /* send message */ slurm_send_node_msg(msg->conn_fd, &response_msg); - free_slurm_conf(&config_tbl); + free_slurm_conf(&config_tbl, false); } } diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index 5195a78e5b7751b2aeb4245220af442b3f614952..ba733bd60502cd12cf3a7db93ea58070119ad390 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -389,12 +389,15 @@ _send_slurmstepd_init(int fd, slurmd_step_type_t type, void *req, free_buf(buffer); /* send cached group ids array for the relevant uid */ + debug3("_send_slurmstepd_init: call to getpwuid"); if (!(pw = getpwuid(uid))) { error("_send_slurmstepd_init getpwuid: %m"); len = 0; safe_write(fd, &len, sizeof(int)); return -1; } + debug3("_send_slurmstepd_init: return from getpwuid"); + if ((gids = _gids_cache_lookup(pw->pw_name, pw->pw_gid))) { int i; uint32_t tmp32; @@ -709,8 +712,10 @@ _rpc_launch_tasks(slurm_msg_t *msg) adlen = sizeof(self); _slurm_getsockname(msg->conn_fd, (struct sockaddr *)&self, &adlen); + debug3("_rpc_launch_tasks: call to _forkexec_slurmstepd"); errnum = _forkexec_slurmstepd(LAUNCH_TASKS, (void *)req, cli, &self, step_hset); + debug3("_rpc_launch_tasks: return from _forkexec_slurmstepd"); done: if (step_hset) @@ -846,8 +851,10 @@ _rpc_batch_job(slurm_msg_t *msg) info("Launching batch job %u.%u for UID %d", req->job_id, req->step_id, req->uid); + debug3("_rpc_batch_job: call to _forkexec_slurmstepd"); rc = _forkexec_slurmstepd(LAUNCH_BATCH_JOB, (void *)req, cli, NULL, (hostset_t)NULL); + debug3("_rpc_batch_job: return from _forkexec_slurmstepd"); slurm_mutex_unlock(&launch_mutex); diff --git a/src/srun/allocate.c b/src/srun/allocate.c index 8401bfdcc95661d86c2a002eb3a8bb3940c3a24f..2a5c598d180b85e571864d8100714b14512f0314 100644 --- a/src/srun/allocate.c +++ b/src/srun/allocate.c @@ -667,6 +667,7 @@ _step_req_create(srun_job_t *j) break; } + opt.distribution = r->task_dist; if (slurmctld_comm_addr.port) { r->host = xstrdup(slurmctld_comm_addr.hostname);