diff --git a/contribs/mpich1.slurm.patch b/contribs/mpich1.slurm.patch index 6ecb186d331ddfd2ac128a5a16b731524d9f0f5a..738fb9eda602fcff51d52ad967d28c47274fa087 100644 --- a/contribs/mpich1.slurm.patch +++ b/contribs/mpich1.slurm.patch @@ -282,8 +282,8 @@ Index: README +DETAILS: The srun command opens two socket connections and passes +their ports to all tasks via the SLURM_MPICH1_P4_PORT1 and +SLURM_MPICH1_P4_PORT2 environment variables. Task zero connects to -+SLURM_MPICH1_P4_PORT1 and writes port number. The other tasks connect -+to SLURM_MPICH1_P4_PORT2 and that port number. This avoid the requirement ++SLURM_MPICH1_P4_PORT1 and writes its port number. The other tasks connect to ++SLURM_MPICH1_P4_PORT2 and read that port number. This avoids the requirement +of having task zero launch all subsequent tasks and also launches +all tasks under the direct control of SLURM (for process management +and accounting). SLURM only launches one task per node and that diff --git a/src/api/slurm_pmi.c b/src/api/slurm_pmi.c index d6403a47cf6e96e4eb9ad5242d92d114519c009b..8a5fb2b2156e6bb1c784e891f7720b048030a6f7 100644 --- a/src/api/slurm_pmi.c +++ b/src/api/slurm_pmi.c @@ -197,12 +197,19 @@ int slurm_get_kvs_comm_set(struct kvs_comm_set **kvs_set_ptr, /* Send the RPC to the local srun communcation manager. * Since the srun can be sent thousands of messages at * the same time and refuse some connections, retry as - * needed. Spread out messages by task's rank. Also - * increase the timeout if many tasks since the srun - * command is very overloaded. - * We also increase the timeout (default timeout is - * 10 secs). */ - usleep(pmi_rank * pmi_time); + * needed. Wait until all key-pairs have been sent by + * all tasks then spread out messages by task's rank. + * Also increase the message timeout if many tasks + * since the srun command can get very overloaded (the + * default timeout is 10 secs). 
+ * + * TaskID SendTime GetTime (Units are PMI_TIME, default=500 usec) + * 0 0 N+0 + * 1 1 N+1 + * 2 2 N+2 + * N-1 N-1 N+N-1 + */ + usleep(pmi_size * pmi_time); if (pmi_size > 1000) /* 100 secs */ timeout = slurm_get_msg_timeout() * 10000; else if (pmi_size > 100) /* 50 secs */ diff --git a/src/api/step_ctx.c b/src/api/step_ctx.c index 356c66de2cf7aa02370b88afe443444b353cbfd7..9ea6885c08c339afb8918fb259f911df27f936e0 100644 --- a/src/api/step_ctx.c +++ b/src/api/step_ctx.c @@ -329,7 +329,7 @@ extern void slurm_step_ctx_params_t_init (slurm_step_ctx_params_t *ptr) char *jobid_str; /* zero the entire structure */ - memset(ptr, 0, sizeof(job_step_create_request_msg_t)); + memset(ptr, 0, sizeof(slurm_step_ctx_params_t)); /* now set anything that shouldn't be 0 or NULL by default */ ptr->relative = (uint16_t)NO_VAL; diff --git a/src/sattach/opt.c b/src/sattach/opt.c index 3ba573782578b66ba572f5b2232b1428c4268728..dd0ccfb3e53f3c466cab7512a2db21567adcc94a 100644 --- a/src/sattach/opt.c +++ b/src/sattach/opt.c @@ -305,7 +305,7 @@ void set_options(const int argc, char **argv) switch (opt_char) { case '?': - fprintf(stderr, "Try \"sbatch --help\" for more " + fprintf(stderr, "Try \"sattach --help\" for more " "information\n"); exit(1); break;