Skip to content
Snippets Groups Projects
Commit 765600fd authored by Moe Jette's avatar Moe Jette
Browse files
parent 8c939577
No related branches found
No related tags found
No related merge requests found
...@@ -282,8 +282,8 @@ Index: README ...@@ -282,8 +282,8 @@ Index: README
+DETAILS: The srun command opens two socket connections and passes +DETAILS: The srun command opens two socket connections and passes
+their ports to all tasks via the SLURM_MPICH1_P4_PORT1 and +their ports to all tasks via the SLURM_MPICH1_P4_PORT1 and
+SLURM_MPICH1_P4_PORT2 environment variables. Task zero connects to +SLURM_MPICH1_P4_PORT2 environment variables. Task zero connects to
+SLURM_MPICH1_P4_PORT1 and writes port number. The other tasks connect +SLURM_MPICH1_P4_PORT1 and writes its port number. The other tasks connect to
+to SLURM_MPICH1_P4_PORT2 and that port number. This avoids the requirement +SLURM_MPICH1_P4_PORT2 and read that port number. This avoids the requirement
+of having task zero launch all subsequent tasks and also launches +of having task zero launch all subsequent tasks and also launches
+all tasks under the direct control of SLURM (for process management +all tasks under the direct control of SLURM (for process management
+and accounting). SLURM only launches one task per node and that +and accounting). SLURM only launches one task per node and that
......
...@@ -197,12 +197,19 @@ int slurm_get_kvs_comm_set(struct kvs_comm_set **kvs_set_ptr, ...@@ -197,12 +197,19 @@ int slurm_get_kvs_comm_set(struct kvs_comm_set **kvs_set_ptr,
/* Send the RPC to the local srun communication manager. /* Send the RPC to the local srun communication manager.
* Since the srun can be sent thousands of messages at * Since the srun can be sent thousands of messages at
* the same time and refuse some connections, retry as * the same time and refuse some connections, retry as
* needed. Spread out messages by task's rank. Also * needed. Wait until all key-pairs have been sent by
* increase the timeout if many tasks since the srun * all tasks then spread out messages by task's rank.
* command is very overloaded. * Also increase the message timeout if many tasks
* We also increase the timeout (default timeout is * since the srun command can get very overloaded (the
* 10 secs). */ * default timeout is 10 secs).
usleep(pmi_rank * pmi_time); *
* TaskID SendTime GetTime (Units are PMI_TIME, default=500 usec)
* 0 0 N+0
* 1 1 N+1
* 2 2 N+2
* N-1 N-1 N+N-1
*/
usleep(pmi_size * pmi_time);
if (pmi_size > 1000) /* 100 secs */ if (pmi_size > 1000) /* 100 secs */
timeout = slurm_get_msg_timeout() * 10000; timeout = slurm_get_msg_timeout() * 10000;
else if (pmi_size > 100) /* 50 secs */ else if (pmi_size > 100) /* 50 secs */
......
...@@ -329,7 +329,7 @@ extern void slurm_step_ctx_params_t_init (slurm_step_ctx_params_t *ptr) ...@@ -329,7 +329,7 @@ extern void slurm_step_ctx_params_t_init (slurm_step_ctx_params_t *ptr)
char *jobid_str; char *jobid_str;
/* zero the entire structure */ /* zero the entire structure */
memset(ptr, 0, sizeof(job_step_create_request_msg_t)); memset(ptr, 0, sizeof(slurm_step_ctx_params_t));
/* now set anything that shouldn't be 0 or NULL by default */ /* now set anything that shouldn't be 0 or NULL by default */
ptr->relative = (uint16_t)NO_VAL; ptr->relative = (uint16_t)NO_VAL;
......
...@@ -305,7 +305,7 @@ void set_options(const int argc, char **argv) ...@@ -305,7 +305,7 @@ void set_options(const int argc, char **argv)
switch (opt_char) { switch (opt_char) {
case '?': case '?':
fprintf(stderr, "Try \"sbatch --help\" for more " fprintf(stderr, "Try \"sattach --help\" for more "
"information\n"); "information\n");
exit(1); exit(1);
break; break;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment