diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h index 3e9b7e9b7a2466ca387a85cecf236e5a58d873d1..1895d696f2bf1e75d7167c3eaae10bc84d31ff38 100644 --- a/slurm/slurm_errno.h +++ b/slurm/slurm_errno.h @@ -96,6 +96,7 @@ enum { SLURM_PROTOCOL_INSANE_MSG_LENGTH, SLURM_MPI_PLUGIN_NAME_INVALID, SLURM_MPI_PLUGIN_PRELAUNCH_SETUP_FAILED, + SLURM_PLUGIN_NAME_INVALID, /* communication failures to/from slurmctld */ SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR = 1800, diff --git a/src/api/step_launch.c b/src/api/step_launch.c index 98a3223450a4ef155cdab58bed77633473f6c65a..82227e56e4f10fb3e9f532ae7dffd25d671b9aac 100644 --- a/src/api/step_launch.c +++ b/src/api/step_launch.c @@ -1,9 +1,8 @@ /*****************************************************************************\ * step_launch.c - launch a parallel job step - * - * $Id$ ***************************************************************************** - * Copyright (C) 2006 The Regents of the University of California. + * Copyright (C) 2006-2007 The Regents of the University of California. + * Copyright (C) 2008 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Christopher J. Morrone <morrone2@llnl.gov> * LLNL-CODE-402394. 
@@ -741,10 +740,18 @@ _launch_handler(struct step_launch_state *sls, slurm_msg_t *resp) pthread_mutex_lock(&sls->lock); - for (i = 0; i < msg->count_of_pids; i++) { - bit_set(sls->tasks_started, msg->task_ids[i]); + if (msg->return_code) { + for (i = 0; i < msg->count_of_pids; i++) { + error("task %u launch failed: %s", + msg->task_ids[i], + slurm_strerror(msg->return_code)); + bit_set(sls->tasks_started, msg->task_ids[i]); + bit_set(sls->tasks_exited, msg->task_ids[i]); + } + } else { + for (i = 0; i < msg->count_of_pids; i++) + bit_set(sls->tasks_started, msg->task_ids[i]); } - if (sls->callback.task_start != NULL) (sls->callback.task_start)(msg); @@ -770,7 +777,7 @@ _exit_handler(struct step_launch_state *sls, slurm_msg_t *exit_msg) pthread_mutex_lock(&sls->lock); for (i = 0; i < msg->num_tasks; i++) { - debug("task %d done", msg->task_id_list[i]); + debug("task %u done", msg->task_id_list[i]); bit_set(sls->tasks_exited, msg->task_id_list[i]); } diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index 4720b21895c488ad3cc9fd90f1d5a96065a08eee..0f0c1b19827983eea138fb7cddb22f1afbb5855b 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -91,6 +91,8 @@ static slurm_errtab_t slurm_errtab[] = { "Invalid MPI plugin name" }, { SLURM_MPI_PLUGIN_PRELAUNCH_SETUP_FAILED, "MPI plugin's pre-launch setup failed" }, + { SLURM_PLUGIN_NAME_INVALID, + "Plugin initialization failed" }, /* communication failures to/from slurmctld */ { SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR, diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index 85a82d94a2795940ba67bfe9334555e8cfdbc7cd..a97b9eae62cb3c84ff17f6ffe4614cd4f54dba11 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -3,6 +3,7 @@ * $Id$ ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. + * Copyright (C) 2008 Lawrence Livermore National Security. 
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Mark Grondona <mgrondona@llnl.gov>. * LLNL-CODE-402394. @@ -669,10 +670,13 @@ job_manager(slurmd_job_t *job) */ if (switch_init() != SLURM_SUCCESS || slurmd_task_init() != SLURM_SUCCESS - || mpi_hook_slurmstepd_init(&job->env) != SLURM_SUCCESS || slurm_proctrack_init() != SLURM_SUCCESS || slurm_jobacct_gather_init() != SLURM_SUCCESS) { - rc = SLURM_FAILURE; + rc = SLURM_PLUGIN_NAME_INVALID; + goto fail1; + } + if (mpi_hook_slurmstepd_init(&job->env) != SLURM_SUCCESS) { + rc = SLURM_MPI_PLUGIN_NAME_INVALID; goto fail1; } diff --git a/src/slurmd/slurmstepd/task.c b/src/slurmd/slurmstepd/task.c index 0ebe57a570dfeddc00d473d36f8fb4af937def93..339ff6c4aa533266134e2ba2358073b280343bb2 100644 --- a/src/slurmd/slurmstepd/task.c +++ b/src/slurmd/slurmstepd/task.c @@ -1,8 +1,8 @@ /*****************************************************************************\ * slurmd/slurmstepd/task.c - task launching functions for slurmstepd - * $Id$ ***************************************************************************** - * Copyright (C) 2002-2006 The Regents of the University of California. + * Copyright (C) 2002-2007 The Regents of the University of California. + * Copyright (C) 2008 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Mark A. Grondona <mgrondona@llnl.gov>. * LLNL-CODE-402394. 
@@ -266,7 +266,7 @@ _build_path(char* fname, char **prog_env) return file_path; } -static void +static int _setup_mpi(slurmd_job_t *job, int ltaskid) { mpi_plugin_task_info_t info[1]; @@ -282,7 +282,7 @@ _setup_mpi(slurmd_job_t *job, int ltaskid) info->self = job->envtp->self; info->client = job->envtp->cli; - mpi_hook_slurmstepd_task(info, &job->env); + return mpi_hook_slurmstepd_task(info, &job->env); } @@ -374,8 +374,12 @@ exec_task(slurmd_job_t *job, int i, int waitfd) exit(1); } - _setup_mpi(job, i); - + if (_setup_mpi(job, i) != SLURM_SUCCESS) { + error("Unable to configure MPI plugin: %m"); + log_fini(); + exit(1); + } + pdebug_stop_current(job); }