Skip to content
Snippets Groups Projects
Commit 07f73ff7 authored by Moe Jette
Browse files

Improve the handling of task launch failure.

Previously, a task launch failure could hang the srun command with no error message.
parent c1d0fd01
No related branches found
No related tags found
No related merge requests found
......@@ -96,6 +96,7 @@ enum {
SLURM_PROTOCOL_INSANE_MSG_LENGTH,
SLURM_MPI_PLUGIN_NAME_INVALID,
SLURM_MPI_PLUGIN_PRELAUNCH_SETUP_FAILED,
SLURM_PLUGIN_NAME_INVALID,
/* communication failures to/from slurmctld */
SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR = 1800,
......
/*****************************************************************************\
* step_launch.c - launch a parallel job step
*
* $Id$
*****************************************************************************
* Copyright (C) 2006 The Regents of the University of California.
* Copyright (C) 2006-2007 The Regents of the University of California.
* Copyright (C) 2008 Lawrence Livermore National Security.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Christopher J. Morrone <morrone2@llnl.gov>
* LLNL-CODE-402394.
......@@ -741,10 +740,18 @@ _launch_handler(struct step_launch_state *sls, slurm_msg_t *resp)
pthread_mutex_lock(&sls->lock);
for (i = 0; i < msg->count_of_pids; i++) {
bit_set(sls->tasks_started, msg->task_ids[i]);
if (msg->return_code) {
for (i = 0; i < msg->count_of_pids; i++) {
error("task %u launch failed: %s",
msg->task_ids[i],
slurm_strerror(msg->return_code));
bit_set(sls->tasks_started, msg->task_ids[i]);
bit_set(sls->tasks_exited, msg->task_ids[i]);
}
} else {
for (i = 0; i < msg->count_of_pids; i++)
bit_set(sls->tasks_started, msg->task_ids[i]);
}
if (sls->callback.task_start != NULL)
(sls->callback.task_start)(msg);
......@@ -770,7 +777,7 @@ _exit_handler(struct step_launch_state *sls, slurm_msg_t *exit_msg)
pthread_mutex_lock(&sls->lock);
for (i = 0; i < msg->num_tasks; i++) {
debug("task %d done", msg->task_id_list[i]);
debug("task %u done", msg->task_id_list[i]);
bit_set(sls->tasks_exited, msg->task_id_list[i]);
}
......
......@@ -91,6 +91,8 @@ static slurm_errtab_t slurm_errtab[] = {
"Invalid MPI plugin name" },
{ SLURM_MPI_PLUGIN_PRELAUNCH_SETUP_FAILED,
"MPI plugin's pre-launch setup failed" },
/*
 * Keyed by the enum constant, like the neighbouring entries. A quoted
 * name here would place a string pointer in the error-number field, so
 * the entry could never match the SLURM_PLUGIN_NAME_INVALID code.
 */
{ SLURM_PLUGIN_NAME_INVALID,
"Plugin initialization failed" },
/* communication failures to/from slurmctld */
{ SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR,
......
......@@ -3,6 +3,7 @@
* $Id$
*****************************************************************************
* Copyright (C) 2002-2007 The Regents of the University of California.
* Copyright (C) 2008 Lawrence Livermore National Security.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Mark Grondona <mgrondona@llnl.gov>.
* LLNL-CODE-402394.
......@@ -669,10 +670,13 @@ job_manager(slurmd_job_t *job)
*/
if (switch_init() != SLURM_SUCCESS
|| slurmd_task_init() != SLURM_SUCCESS
|| mpi_hook_slurmstepd_init(&job->env) != SLURM_SUCCESS
|| slurm_proctrack_init() != SLURM_SUCCESS
|| slurm_jobacct_gather_init() != SLURM_SUCCESS) {
rc = SLURM_FAILURE;
rc = SLURM_PLUGIN_NAME_INVALID;
goto fail1;
}
if (mpi_hook_slurmstepd_init(&job->env) != SLURM_SUCCESS) {
rc = SLURM_MPI_PLUGIN_NAME_INVALID;
goto fail1;
}
......
/*****************************************************************************\
* slurmd/slurmstepd/task.c - task launching functions for slurmstepd
* $Id$
*****************************************************************************
* Copyright (C) 2002-2006 The Regents of the University of California.
* Copyright (C) 2002-2007 The Regents of the University of California.
* Copyright (C) 2008 Lawrence Livermore National Security.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* Written by Mark A. Grondona <mgrondona@llnl.gov>.
* LLNL-CODE-402394.
......@@ -266,7 +266,7 @@ _build_path(char* fname, char **prog_env)
return file_path;
}
static void
static int
_setup_mpi(slurmd_job_t *job, int ltaskid)
{
mpi_plugin_task_info_t info[1];
......@@ -282,7 +282,7 @@ _setup_mpi(slurmd_job_t *job, int ltaskid)
info->self = job->envtp->self;
info->client = job->envtp->cli;
mpi_hook_slurmstepd_task(info, &job->env);
return mpi_hook_slurmstepd_task(info, &job->env);
}
......@@ -374,8 +374,12 @@ exec_task(slurmd_job_t *job, int i, int waitfd)
exit(1);
}
_setup_mpi(job, i);
if (_setup_mpi(job, i) != SLURM_SUCCESS) {
error("Unable to configure MPI plugin: %m");
log_fini();
exit(1);
}
pdebug_stop_current(job);
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment