Commit 235d6029 authored by Moe Jette
parent 7356fc33
@@ -79,6 +79,10 @@ documents those changes that are of interest to users and admins.
  -- Make select/cons_res interoperate with mpi/lam plugin for task counts.
  -- Fix race condition where srun could seg-fault due to use of logging functions
     within pthread after calling log_fini.
+ -- Code changes for clean build with gcc 2.96 (gcc_2_96.patch, Takao Hatazaki, HP).
+ -- Add CacheGroups configuration support in configurator.html (configurator.patch,
+    Takao Hatazaki, HP).
+ -- Fix bug preventing use of mpich-gm plugin (mpichgm.patch, Takao Hatazaki, HP).
 * Changes in SLURM 1.0.9
 ========================
@@ -77,7 +77,7 @@ function displayfile()
 		"SlurmdPidFile=" + document.config.slurmd_pid_file.value + "<br>" +
 		"ProctrackType=proctrack/" + get_radio_value(document.config.proctrack_type) + "<br>" +
 		"#PluginDir= <br>" +
-		"#CacheGroups= <br>" +
+		"CacheGroups=" + get_radio_value(document.config.cache_groups) + "<br>" +
 		"#FirstJobId= <br>" +
 		"ReturnToService=" + get_radio_value(document.config.return_to_service) + "<br>" +
 		"#MaxJobCount= <br>" +
@@ -215,6 +215,26 @@ purposes any user name can be used.
 <input type="text" name="slurm_user" value="slurm"> <B>SlurmUser</B>
 <P>
+<H2>Group ID Caching</H2>
+If you have a slow NIS environment, big parallel jobs take a long time
+to start up (and may eventually time out) because the NIS server(s)
+may not be able to respond quickly to simultaneous requests from
+multiple slurmd's. You can instruct slurmd to cache /etc/group
+entries to prevent this from happening by setting
+<B>CacheGroups</B>=1. Reconfiguring ("scontrol reconfig") with
+<B>CacheGroups</B>=0 will cause slurmd to purge the cache. Select one
+value for <B>CacheGroups</B>:<BR>
+<input type="radio" name="cache_groups" value="0" checked>
+<B>0</B>: for a normal environment.<BR>
+<input type="radio" name="cache_groups" value="1">
+<B>1</B>: for a slow NIS environment.
+<P>
+WARNING: The group ID cache does not try to keep itself in sync with
+the system. You MUST run "scontrol reconfig" to update the cache
+after making any changes to the system password or group databases.
+<P>
 <H2>SLURM Port Numbers</H2>
 The SLURM controller (slurmctld) requires a unique port for communications
 as do the SLURM compute node daemons (slurmd). If not set, SLURM ports
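For reference, the radio button wired in above feeds straight into the generated slurm.conf. A sketch of what the tool emits with the "slow NIS" option selected, plus the purge sequence the help text describes (values illustrative, not captured output):

    # slurm.conf fragment from configurator.html with cache_groups=1:
    CacheGroups=1

    # To purge the cache after changing /etc/group or the NIS maps:
    # set CacheGroups=0, then have slurmd re-read the configuration:
    scontrol reconfig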
@@ -278,8 +278,8 @@ int setup_env(env_t *env)
 	}
 	if (env->cpu_bind_type) {
-		unsetenvp(env->env, "SLURM_CPU_BIND"); /* don't propagate SLURM_CPU_BIND */
 		int setstat = 0;
+		unsetenvp(env->env, "SLURM_CPU_BIND"); /* don't propagate SLURM_CPU_BIND */
 		if (env->cpu_bind_type & CPU_BIND_VERBOSE) {
 			setstat |= setenvf(&env->env, "SLURM_CPU_BIND_VERBOSE", "verbose");
 		} else {
@@ -320,9 +320,9 @@ int setup_env(env_t *env)
 			rc = SLURM_FAILURE;
 		}
 	} else {
+		int setstat = 0;
 		unsetenvp(env->env, "SLURM_CPU_BIND"); /* don't propagate SLURM_CPU_BIND */
 		/* set SLURM_CPU_BIND_* env vars to defaults */
-		int setstat = 0;
 		setstat |= setenvf(&env->env, "SLURM_CPU_BIND_VERBOSE", "quiet");
 		setstat |= setenvf(&env->env, "SLURM_CPU_BIND_TYPE", "");
 		setstat |= setenvf(&env->env, "SLURM_CPU_BIND_LIST", "");
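Both hunks above (and the SLURM_IO_STDIN hunk at the end of this commit) make the same mechanical change: hoist the declaration of setstat above the first statement in its block. A minimal standalone sketch of the constraint being worked around, with hypothetical function bodies, since C89 compilers such as the gcc 2.96 named in the NEWS entry reject declarations that follow a statement:

    #include <stdio.h>

    /* Rejected by C89 (and thus by gcc 2.96): a declaration follows a
     * statement inside the block. */
    static void broken(void)
    {
    	puts("unsetenvp() call would go here");
    	int setstat = 0;	/* C89 error: declaration after statement */
    	printf("setstat = %d\n", setstat);
    }

    /* Accepted everywhere: the declaration precedes the first statement,
     * which is exactly the reordering the hunks above perform. */
    static void fixed(void)
    {
    	int setstat = 0;
    	puts("unsetenvp() call would go here");
    	printf("setstat = %d\n", setstat);
    }

    int main(void)
    {
    	broken();	/* this file compiles only under C99 or later */
    	fixed();
    	return 0;
    }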
@@ -60,13 +60,10 @@ typedef struct {
 #define GMPI_RECV_BUF_LEN 65536
-static int _gmpi_parse_init_recv_msg(srun_job_t *job, char *rbuf,
-				     gm_slave_t *slave_data);
 static int gmpi_fd = -1;
 static int _gmpi_parse_init_recv_msg(srun_job_t *job, char *rbuf,
-				     gm_slave_t *slave_data)
+				     gm_slave_t *slave_data, int *ii)
 {
 	unsigned int magic, id, port_board_id, unique_high_id,
 		unique_low_id, numanode, remote_pid, remote_port;
@@ -76,6 +73,7 @@ static int _gmpi_parse_init_recv_msg(srun_job_t *job, char *rbuf,
 	got = sscanf(rbuf, "<<<%u:%u:%u:%u:%u:%u:%u::%u>>>",
 		     &magic, &id, &port_board_id, &unique_high_id,
 		     &unique_low_id, &numanode, &remote_pid, &remote_port);
+	*ii = id;
 	if (got != 8) {
 		error("GMPI master received invalid init message");
 		return -1;
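The init-message format is easiest to see with a concrete message. A standalone sketch of the parse performed above, with made-up field values; only the format string is taken from the code:

    #include <stdio.h>

    int main(void)
    {
    	/* <<<magic:id:port_board_id:unique_high_id:unique_low_id:
    	 *    numanode:remote_pid::remote_port>>> -- values invented */
    	const char *rbuf = "<<<1234:7:2:0:42:1:31337::5000>>>";
    	unsigned int magic, id, port_board_id, unique_high_id,
    		unique_low_id, numanode, remote_pid, remote_port;
    	int got = sscanf(rbuf, "<<<%u:%u:%u:%u:%u:%u:%u::%u>>>",
    			 &magic, &id, &port_board_id, &unique_high_id,
    			 &unique_low_id, &numanode, &remote_pid,
    			 &remote_port);
    	/* prints: parsed 8 fields; task id 7, port 5000 */
    	printf("parsed %d fields; task id %u, port %u\n",
    	       got, id, remote_port);
    	return 0;
    }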
@@ -113,8 +111,9 @@ static int _gmpi_parse_init_recv_msg(srun_job_t *job, char *rbuf,
 static int _gmpi_establish_map(srun_job_t *job)
 {
 	struct sockaddr_in addr;
+	in_addr_t *iaddrs;
 	socklen_t addrlen;
-	int accfd, newfd, rlen, nprocs, i, j;
+	int accfd, newfd, rlen, nprocs, i, j, id;
 	size_t gmaplen, lmaplen, maplen;
 	char *p, *rbuf = NULL, *gmap = NULL, *lmap = NULL, *map = NULL;
 	char tmp[128];
@@ -127,6 +126,7 @@ static int _gmpi_establish_map(srun_job_t *job)
 	accfd = gmpi_fd;
 	addrlen = sizeof(addr);
 	nprocs = opt.nprocs;
+	iaddrs = (in_addr_t *)xmalloc(sizeof(*iaddrs)*nprocs);
 	slave_data = (gm_slave_t *)xmalloc(sizeof(*slave_data)*nprocs);
 	for (i=0; i<nprocs; i++)
 		slave_data[i].defined = 0;
@@ -147,8 +147,11 @@ static int _gmpi_establish_map(srun_job_t *job)
 		} else {
 			rbuf[rlen] = 0;
 		}
-		if (_gmpi_parse_init_recv_msg(job, rbuf, slave_data) == 0)
+		if (_gmpi_parse_init_recv_msg(job, rbuf, slave_data,
+					      &id) == 0) {
 			i++;
+			iaddrs[id] = ntohl(addr.sin_addr.s_addr);
+		}
 		close(newfd);
 	}
 	xfree(rbuf);
@@ -184,9 +187,7 @@ static int _gmpi_establish_map(srun_job_t *job)
 		dp = &slave_data[i];
 		p = lmap;
 		for (j=0; j<nprocs; j++) {
-			int jhostid = step_layout_host_id (job->step_layout, j);
-			if ((ihostid == jhostid) &&
+			if (iaddrs[i] == iaddrs[j] &&
 			    (dp->numanode == slave_data[j].numanode)) {
 				sprintf(tmp, "<%u>", j);
 				strcpy(p, tmp);
@@ -212,8 +213,7 @@ static int _gmpi_establish_map(srun_job_t *job)
 			error("setsockopt in GMPI master: %m");
 		bzero(&addr, sizeof(addr));
 		addr.sin_family = AF_INET;
-		addr.sin_addr.s_addr
-			= job->slurmd_addr[ihostid].sin_addr.s_addr;
+		addr.sin_addr.s_addr = htonl(iaddrs[i]);
 		addr.sin_port = htons(dp->remote_port);
 		if (connect(newfd, (struct sockaddr *)&addr, sizeof(addr)))
 			fatal("GMPI master failed to connect");
@@ -224,6 +224,7 @@ static int _gmpi_establish_map(srun_job_t *job)
 	xfree(slave_data);
 	xfree(lmap);
 	xfree(gmap);
+	xfree(iaddrs);
 	debug2("GMPI master responded to all GMPI processes");
 	return 0;
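The net effect of the mpichgm.patch hunks: instead of resolving each task's node through the step layout (the code path that broke the plugin), the GMPI master now remembers the address each slave connected from, keyed by the task id parsed from its init message, and treats two tasks as co-located when the cached addresses match. A distilled sketch of that pattern, with hypothetical helper names:

    #include <netinet/in.h>
    #include <arpa/inet.h>

    /* Remember where task `id` connected from; addr comes from accept(). */
    static void record_slave(in_addr_t *iaddrs, const struct sockaddr_in *addr,
    			 int id)
    {
    	iaddrs[id] = ntohl(addr->sin_addr.s_addr);	/* host byte order */
    }

    /* Tasks i and j share a node exactly when they connected from the
     * same address; combined with a matching numanode, this drives the
     * local-map construction above. */
    static int same_node(const in_addr_t *iaddrs, int i, int j)
    {
    	return iaddrs[i] == iaddrs[j];
    }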
@@ -728,9 +728,9 @@ again:
 			list_enqueue(server->msg_queue, msg);
 		}
 	} else if (header.type == SLURM_IO_STDIN) {
-		debug("SLURM_IO_STDIN");
 		int nodeid;
 		struct server_io_info *server;
+		debug("SLURM_IO_STDIN");
 		msg->ref_count = 1;
 		nodeid = step_layout_host_id(info->job->step_layout,
 					     header.gtaskid);