diff --git a/NEWS b/NEWS index edf0d067c75bb0c8b7f35c862decd0671c0c77eb..325cf7dcd02439ddcd381d5399a5f79f2b74d27d 100644 --- a/NEWS +++ b/NEWS @@ -62,6 +62,8 @@ documents those changes that are of interest to users and admins. ======================== -- Fix bug in mpi plugin to set the ID correctly -- Accounting bug causing segv fixed (Andy Riebs, 14oct.jobacct.patch) + -- Fix for failed launch of a debugged job (e.g. bad executable name). + -- Wiki plugin fix for tracking allocated nodes (Ernest Artiaga, BSC). * Changes in SLURM 0.6.8 ======================== diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index c2f9c1f8d256195f2aa68d6ab383849489c338cf..1ada2647222f70f1643279475634f9cb503ee285 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -211,9 +211,9 @@ static slurm_errtab_t slurm_errtab[] = { { ESLURMD_CANNOT_SPAWN_IO_THREAD, "Slurmd could not spawn I/O thread" }, { ESLURMD_FORK_FAILED, - "Slurmd could not fork batch job" }, + "Slurmd could not fork job" }, { ESLURMD_EXECVE_FAILED, - "Slurmd could not execve batch job" }, + "Slurmd could not execve job" }, { ESLURMD_IO_ERROR, "Slurmd could not connect IO" }, { ESLURMD_PROLOG_FAILED, diff --git a/src/plugins/sched/maui/wiki/wiki_request.cpp b/src/plugins/sched/maui/wiki/wiki_request.cpp index 7bf14331e104c3abfc37fb7eb00e9ef41e972b7f..bd1cc79ef10463b4d64e13db4cb5238b61a7e108 100644 --- a/src/plugins/sched/maui/wiki/wiki_request.cpp +++ b/src/plugins/sched/maui/wiki/wiki_request.cpp @@ -370,6 +370,7 @@ wiki_request_t::postproc_string( char * const field, char * (*post_processor)( const char * const val ); } post_proc_map [] = { { JOB_FIELD_REQ_NODES, wiki_request_t::colonify_commas }, + { JOB_FIELD_ALLOC_NODES,wiki_request_t::colonify_commas }, { NULL, NULL } }; @@ -472,6 +473,7 @@ wiki_getnodes_t::map_enum( char * const field, { NODE_STATE_LABEL_IDLE, "Idle" }, { NODE_STATE_LABEL_DRAINED, "Draining" }, { NODE_STATE_LABEL_DRAINING, "Draining" }, + { NODE_STATE_LABEL_ALLOCATED, "Running" }, { NODE_STATE_LABEL_COMPLETING, "Busy" }, { NULL, NULL } diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index a3fe9c416f2d3a21d4dccce7c4933fd9c473eb04..defbe0f4f5354b25ae656944ff2a6c201ec4a047 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -470,11 +470,13 @@ job_manager(slurmd_job_t *job) if (!job->batch && (interconnect_init(job->switch_job, job->uid) < 0)) { /* error("interconnect_init: %m"); already logged */ + rc = ESLURM_INTERCONNECT_FAILURE; goto fail2; } if (_fork_all_tasks(job) < 0) { debug("_fork_all_tasks failed"); + rc = ESLURMD_EXECVE_FAILED; goto fail2; } @@ -544,6 +546,7 @@ job_manager(slurmd_job_t *job) static int _fork_all_tasks(slurmd_job_t *job) { + int rc = SLURM_SUCCESS; int i; int *writefds; /* array of write file descriptors */ int *readfds; /* array of read file descriptors */ @@ -659,12 +662,15 @@ _fork_all_tasks(slurmd_job_t *job) * Prepare process for attach by parallel debugger * (if specified and able) */ + if (pdebug_trace_process(job, job->task[i]->pid) + == SLURM_ERROR) + rc = SLURM_ERROR; pdebug_trace_process(job, job->task[i]->pid); } xfree(writefds); xfree(readfds); - return SLURM_SUCCESS; + return rc; } diff --git a/src/slurmd/slurmstepd/pdebug.c b/src/slurmd/slurmstepd/pdebug.c index 67a1bd43df2afee3f9ed7aabd4618d2fe28aaf11..e84ac80ed98d6f78a2cbdacc9e74f37f40a1897c 100644 --- a/src/slurmd/slurmstepd/pdebug.c +++ b/src/slurmd/slurmstepd/pdebug.c @@ -27,8 +27,9 @@ /* * Prepare task for parallel debugger attach + * Returns SLURM_SUCCESS or SLURM_ERROR. */ -void +int pdebug_trace_process(slurmd_job_t *job, pid_t pid) { /* If task to be debugged, wait for it to stop via @@ -41,15 +42,34 @@ pdebug_trace_process(slurmd_job_t *job, pid_t pid) if (job->task_flags & TASK_PARALLEL_DEBUG) { int status; waitpid(pid, &status, WUNTRACED); - if ((pid > (pid_t) 0) && (kill(pid, SIGSTOP) < 0)) + if (!WIFSTOPPED(status)) { + debug("pdebug_trace_process WIFSTOPPED false" + " for pid %lu", pid); + if (WIFEXITED(status)) { + debug("Process %lu exited \"normally\"" + " with return code %d", + pid, WEXITSTATUS(status)); + } else if (WIFSIGNALED(status)) { + debug("Process %lu kill by signal %d", + pid, WTERMSIG(status)); + } + return SLURM_ERROR; + } + if ((pid > (pid_t) 0) && (kill(pid, SIGSTOP) < 0)) { error("kill(%lu): %m", (unsigned long) pid); + return SLURM_ERROR; + } + #ifdef HAVE_AIX - if (_PTRACE(PT_DETACH, pid, NULL, 0)) + if (_PTRACE(PT_DETACH, pid, NULL, 0)) { #else - if (_PTRACE(PTRACE_DETACH, pid, NULL, 0)) + if (_PTRACE(PTRACE_DETACH, pid, NULL, 0)) { #endif error("ptrace(%lu): %m", (unsigned long) pid); + return SLURM_ERROR; + } } + return SLURM_SUCCESS; } /* diff --git a/src/slurmd/slurmstepd/pdebug.h b/src/slurmd/slurmstepd/pdebug.h index f92af015b47570a723b99c58bf56250f91f6056a..a4c62c25178ec3441f98cae45d2de3a231826a92 100644 --- a/src/slurmd/slurmstepd/pdebug.h +++ b/src/slurmd/slurmstepd/pdebug.h @@ -36,8 +36,9 @@ void pdebug_stop_current(slurmd_job_t *job); /* * Prepare task for parallel debugger attach + * Returns SLURM_SUCCESS or SLURM_ERROR. */ -void pdebug_trace_process(slurmd_job_t *job, pid_t pid); +int pdebug_trace_process(slurmd_job_t *job, pid_t pid); #ifdef HAVE_PTRACE64 # define _PTRACE(r,p,a,d) ptrace64((r),(long long)(p),(long long)(a),(d),NULL) diff --git a/src/srun/msg.c b/src/srun/msg.c index af25de6878ca19a5bfb9eedece237cb19cd23378..795fa4fb3b50b9314e50d58d21369aa1f8e94ed6 100644 --- a/src/srun/msg.c +++ b/src/srun/msg.c @@ -183,6 +183,8 @@ static void _dump_proctable(srun_job_t *job) for (task_inx=0; task_inx<job->step_layout->tasks[node_inx]; task_inx++) { taskid = job->step_layout->tids[node_inx][task_inx]; tv = &MPIR_proctable[taskid]; + if (!tv) + break; info("task:%d, host:%s, pid:%d", taskid, tv->host_name, tv->pid); }