diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 60bfc43ef8735236234efa30c7cd199f62d69c58..7681096d6818be6d73c368a4f3ea8829f7838112 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -1087,7 +1087,9 @@ job_complete (uint32_t job_id) if ((job_ptr->job_state == JOB_STAGE_IN) || (job_ptr->job_state == JOB_RUNNING) || (job_ptr->job_state == JOB_STAGE_OUT)) { + unlock_slurmctld (job_write_lock); deallocate_nodes (job_ptr); + lock_slurmctld(job_write_lock); verbose ("job_complete for job id %u successful", job_id); } else { diff --git a/src/slurmd/elan_interconnect.c b/src/slurmd/elan_interconnect.c index 7a114f5767a509dadd96afaa60a88bd639a16a18..39d19e956ac299eb830147a5e8af1ee57796383d 100644 --- a/src/slurmd/elan_interconnect.c +++ b/src/slurmd/elan_interconnect.c @@ -16,15 +16,17 @@ #include <string.h> #include <stdarg.h> +#include <src/common/xmalloc.h> +#include <src/common/xstring.h> #include <src/common/bitstring.h> #include <src/common/qsw.h> #include <src/common/slurm_errno.h> #include <src/common/slurm_protocol_api.h> #include <src/slurmd/task_mgr.h> #include <src/slurmd/interconnect.h> +#include <src/slurmd/setenvpf.h> + -static int setenvf(const char *fmt, ...) ; -static int do_env(int nodeid, int nnodes, int procid, int nprocs) ; /* exported module funtion to launch tasks */ /*launch_tasks should really be named launch_job_step*/ int launch_tasks ( launch_tasks_request_msg_t * launch_msg ) @@ -61,6 +63,7 @@ int interconnect_init ( launch_tasks_request_msg_t * launch_msg ) } /* Process 2: */ + info("qsw_prog_init called from process %ld", getpid()); if (qsw_prog_init(launch_msg->qsw_job, launch_msg->uid) < 0) { slurm_perror("qsw_prog_init"); @@ -81,14 +84,15 @@ int interconnect_set_capabilities ( task_start_t * task_start ) nnodes = task_start->launch_msg->nnodes; procid = task_start->local_task_id; nprocs = task_start->launch_msg->nprocs; + + info("nodeid=%d nnodes=%d procid=%d nprocs=%d", + nodeid, nnodes, procid, nprocs); + info("setting capability in process %ld", getpid()); + if (qsw_setcap( task_start -> launch_msg -> qsw_job, procid) < 0) { slurm_perror("qsw_setcap"); return SLURM_ERROR ; } - if (do_env(nodeid, nnodes, procid, nprocs) < 0) { - slurm_perror("do_env"); - return SLURM_ERROR ; - } pid = fork(); switch (pid) { @@ -108,41 +112,21 @@ int interconnect_set_capabilities ( task_start_t * task_start ) } } -/* - * Set a variable in the callers environment. Args are printf style. - * XXX Space is allocated on the heap and will never be reclaimed. - * Example: setenvf("RMS_RANK=%d", rank); - */ -static int setenvf(const char *fmt, ...) -{ - va_list ap; - char buf[BUFSIZ]; - char *bufcpy; - - va_start(ap, fmt); - vsnprintf(buf, sizeof(buf), fmt, ap); - va_end(ap); - - bufcpy = strdup(buf); - if (bufcpy == NULL) - return -1; - return putenv(bufcpy); -} - /* * Set environment variables needed by QSW MPICH / libelan. */ -static int do_env(int nodeid, int nnodes, int procid, int nprocs) +int interconnect_env(char ***env, int *envc, int nodeid, int nnodes, + int procid, int nprocs) { - if (setenvf("RMS_RANK=%d", procid) < 0) + if (setenvpf(env, envc, "RMS_RANK=%d", procid) < 0) return -1; - if (setenvf("RMS_NODEID=%d", nodeid) < 0) + if (setenvpf(env, envc, "RMS_NODEID=%d", nodeid) < 0) return -1; - if (setenvf("RMS_PROCID=%d", procid) < 0) + if (setenvpf(env, envc, "RMS_PROCID=%d", procid) < 0) return -1; - if (setenvf("RMS_NNODES=%d", nnodes) < 0) + if (setenvpf(env, envc, "RMS_NNODES=%d", nnodes) < 0) return -1; - if (setenvf("RMS_NPROCS=%d", nprocs) < 0) + if (setenvpf(env, envc, "RMS_NPROCS=%d", nprocs) < 0) return -1; return 0; } diff --git a/src/slurmd/task_mgr.c b/src/slurmd/task_mgr.c index 8fa1577a4e23a217a18b869e84bce5d8bb880efd..ef3aae65f752b4d5fdc27f061b2f0474f4a51046 100644 --- a/src/slurmd/task_mgr.c +++ b/src/slurmd/task_mgr.c @@ -59,6 +59,8 @@ int fan_out_task_launch ( launch_tasks_request_msg_t * launch_msg ) * launched*/ task_start_t * task_start[launch_msg->tasks_to_launch]; + debug("msg->job_step_id = %d", launch_msg->job_step_id); + if ( ( session_id = setsid () ) == SLURM_ERROR ) { info ( "set sid failed" ); @@ -143,8 +145,13 @@ void * task_exec_thread ( void * arg ) posix_signal_ignore (SIGTTOU); /* ignore tty output */ posix_signal_ignore (SIGTTIN); /* ignore tty input */ posix_signal_ignore (SIGTSTP); /* ignore user */ - - /* setup std stream pipes */ + + interconnect_env(&launch_msg->env, &launch_msg->envc, + launch_msg->srun_node_id, /* setup std stream pipes */ + launch_msg->nnodes, + task_start->local_task_id, + launch_msg->nprocs); + setup_child_pipes ( pipes ) ; /* get passwd file info */ @@ -239,6 +246,7 @@ int kill_tasks ( kill_tasks_msg_t * kill_task_msg ) task_t * task_ptr ; /* find job step */ job_step_t * job_step_ptr = find_job_step ( shmem_ptr , kill_task_msg -> job_id , kill_task_msg -> job_step_id ) ; + debug("request to kill step %d.%d", kill_task_msg -> job_id , kill_task_msg -> job_step_id); debug3 ( "entering kill_tasks" ) ; if ( job_step_ptr == (void * ) SLURM_ERROR ) { diff --git a/src/srun/launch.c b/src/srun/launch.c index c5250b515241a50724dea54f88f22ed8c213e5f3..437ffa8fbe3c970823ea0ac25967abff7f847a94 100644 --- a/src/srun/launch.c +++ b/src/srun/launch.c @@ -63,17 +63,19 @@ launch(void *arg) msg.argc = remote_argc; msg.argv = remote_argv; msg.credential = job->cred; - msg.job_step_id = 0; + msg.job_step_id = job->stepid; msg.envc = envcount(environ); msg.env = environ; msg.cwd = opt.cwd; + msg.nnodes = job->nhosts; + msg.nprocs = opt.nprocs; slurm_set_addr_char(&msg.response_addr, ntohs(job->jaddr.sin_port), hostname); #if HAVE_LIBELAN3 msg.qsw_job = job->qsw_job; #endif - debug("setting iopart to %s:%d", hostname, ntohs(job->ioport)); + debug("setting ioport to %s:%d", hostname, ntohs(job->ioport)); slurm_set_addr_char(&msg.streams , ntohs(job->ioport), hostname); debug("sending to slurmd port %d", slurm_get_slurmd_port()); diff --git a/src/srun/srun.c b/src/srun/srun.c index 8d6f48ae46d1896cca7a511d8b4aa7de6ac31028..37a84a76c4b44aef98db89be4ecf2cffc277b01c 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -301,6 +301,9 @@ create_job_step(job_t *job) job->stepid = resp->job_step_id; job->cred = resp->credentials; +#if HAVE_LIBELAN3 + job->qsw_job= resp->qsw_job; +#endif }