diff --git a/NEWS b/NEWS
index 1b3640b0b4bec6c178015f391d2f56f6adc5342f..89340f5d9dfedbeae193bf917f1691890233a1e2 100644
--- a/NEWS
+++ b/NEWS
@@ -42,6 +42,7 @@ documents those changes that are of interest to users and admins.
  -- Fix problem in switch/elan error handling that could hang a slurmd step
     manager process.
  -- Build on AIX with -bmaxdata:0x70000000 for memory limit more than 256MB.
+ -- Restore srun's return code support.
 
 * Changes in SLURM 0.6.6
 ========================
diff --git a/src/srun/msg.c b/src/srun/msg.c
index c1ecadbe566ba63bb4bfd5fb3dead55558fdb33d..14edd64d73eaf3cd57bfd3c15187f0c57024fd64 100644
--- a/src/srun/msg.c
+++ b/src/srun/msg.c
@@ -537,6 +537,37 @@ _die_if_signaled(srun_job_t *job, int status)
 	}
 }
 
+/*
+ * Pass a task's exit status on to par_thr().
+ *
+ * Sends one record of three ints (PIPE_TASK_EXITCODE, taskid,
+ * job->tstatus[taskid]) down job->forked_msg->par_msg->msg_pipe, which
+ * par_thr() reads back one int at a time.  Does nothing unless
+ * message_thread is set.
+ *
+ * The record goes out in a single write() so the three values always
+ * arrive together: POSIX keeps pipe writes of at most PIPE_BUF bytes
+ * atomic.  A failed or short write is reported instead of being ignored.
+ */
+static void
+_update_task_exitcode(srun_job_t *job, int taskid)
+{
+	int record[3];
+	ssize_t rc;
+
+	if (!message_thread)
+		return;
+
+	record[0] = (int) PIPE_TASK_EXITCODE;
+	record[1] = taskid;
+	record[2] = (int) job->tstatus[taskid];
+
+	rc = write(job->forked_msg->par_msg->msg_pipe[1],
+		   record, sizeof(record));
+	if (rc != (ssize_t) sizeof(record))
+		error("unable to send exit code of task %d over msg_pipe", taskid);
+}
+
 static void
 _exit_handler(srun_job_t *job, slurm_msg_t *exit_msg)
 {
@@ -564,6 +595,7 @@ _exit_handler(srun_job_t *job, slurm_msg_t *exit_msg)
 
 		slurm_mutex_lock(&job->task_mutex);
 		job->tstatus[taskid] = status;
+		_update_task_exitcode(job, taskid);
 		if (status)
 			job->task_state[taskid] = SRUN_TASK_ABNORMAL_EXIT;
 		else {
@@ -877,7 +909,7 @@ par_thr(void *arg)
 	//slurm_uid = (uid_t) slurm_get_slurm_user_id();
 	close(msg_par->msg_pipe[0]); // close read end of pipe
 	close(par_msg->msg_pipe[1]); // close write end of pipe
-	while(read(par_msg->msg_pipe[0],&c,sizeof(int))>0) {
+	while(read(par_msg->msg_pipe[0], &c, sizeof(int)) == sizeof(int)) {
 		// getting info from msg thread
 		if(type == PIPE_NONE) {
 			debug2("got type %d\n",c);
@@ -886,8 +918,10 @@ par_thr(void *arg)
 		}
 
 		if(type == PIPE_JOB_STATE) {
+			debug("PIPE_JOB_STATE, c = %d", c);
 			update_job_state(job, c);
 		} else if(type == PIPE_TASK_STATE) {
+			debug("PIPE_TASK_STATE");
 			if(tid == -1) {
 				tid = c;
 				continue;
@@ -902,6 +936,20 @@ par_thr(void *arg)
 				update_job_state(job, SRUN_JOB_TERMINATED);
 			}
 			tid = -1;
+		} else if(type == PIPE_TASK_EXITCODE) {
+			/* the first int after the type is the task id,
+			 * the second is that task's exit status */
+			debug("PIPE_TASK_EXITCODE");
+			if(tid == -1) {
+				debug(" setting tid");
+				tid = c;
+				continue;
+			}
+			slurm_mutex_lock(&job->task_mutex);
+			debug(" setting task %d exitcode %d", tid, c);
+			job->tstatus[tid] = c;
+			slurm_mutex_unlock(&job->task_mutex);
+			tid = -1;
 		} else if(type == PIPE_HOST_STATE) {
 			if(tid == -1) {
 				tid = c;
diff --git a/src/srun/srun_job.h b/src/srun/srun_job.h
index 802476d7d81de7b886b3ae429baaa19fda1fbea9..bd0ee56e866e0d98053536354183730e1fcbc717 100644
--- a/src/srun/srun_job.h
+++ b/src/srun/srun_job.h
@@ -49,6 +49,7 @@ typedef enum {
 	PIPE_NONE = 0,
 	PIPE_JOB_STATE,
 	PIPE_TASK_STATE,
+	PIPE_TASK_EXITCODE,
 	PIPE_HOST_STATE,
 	PIPE_SIGNALED,
 	PIPE_MPIR_PROCTABLE_SIZE,