diff --git a/NEWS b/NEWS index 0fd54fdc9de7702aabe8f052ed4d9fc292918a23..09616ec15765ba4e5c73c6075c240804d7b975f7 100644 --- a/NEWS +++ b/NEWS @@ -40,6 +40,7 @@ documents those changes that are of interest to users and admins. -- Add job_submit/all_partitions plugin to set a job's default partition to ALL available partitions in the cluster. -- Modify switch/nrt logic to permit build without libnrt.so library. + -- Handle srun task launch failure without duplicate error messages or abort. * Changes in SLURM 2.5.1 ======================== diff --git a/src/api/step_launch.c b/src/api/step_launch.c index 494e2346b414765e7f75bf9582a83c72accc896b..8d49b60d66bbf2c9d71a2d8f4f92cd91f09e18b1 100644 --- a/src/api/step_launch.c +++ b/src/api/step_launch.c @@ -647,8 +647,10 @@ void slurm_step_launch_wait_finish(slurm_step_ctx_t *ctx) pthread_mutex_lock(&sls->lock); pmi_kvs_free(); - if (sls->msg_handle) + if (sls->msg_handle) { eio_handle_destroy(sls->msg_handle); + sls->msg_handle = NULL; + } /* Shutdown the io timeout thread, if one exists */ if (sls->io_timeout_thread_created) { @@ -667,6 +669,7 @@ void slurm_step_launch_wait_finish(slurm_step_ctx_t *ctx) pthread_mutex_lock(&sls->lock); client_io_handler_destroy(sls->io.normal); + sls->io.normal = NULL; } mpi_hook_client_fini(sls->mpi_state); diff --git a/src/plugins/launch/slurm/launch_slurm.c b/src/plugins/launch/slurm/launch_slurm.c index 3190eb34278c551d6d66259189c3b081e539ba9f..ad08198de2ca324a92aa25551373c46c10c3aed8 100644 --- a/src/plugins/launch/slurm/launch_slurm.c +++ b/src/plugins/launch/slurm/launch_slurm.c @@ -556,21 +556,22 @@ extern int launch_p_step_launch( update_job_state(job, SRUN_JOB_LAUNCHING); launch_start_time = time(NULL); if (first_launch) { - if (slurm_step_launch( - job->step_ctx, &launch_params, &callbacks) != - SLURM_SUCCESS) { + if (slurm_step_launch(job->step_ctx, &launch_params, + &callbacks) != SLURM_SUCCESS) { + rc = errno; + *local_global_rc = errno; error("Application launch failed: %m"); - *local_global_rc = 1; slurm_step_launch_abort(job->step_ctx); slurm_step_launch_wait_finish(job->step_ctx); goto cleanup; } } else { if (slurm_step_launch_add(job->step_ctx, &launch_params, - job->nodelist, job->fir_nodeid) != - SLURM_SUCCESS) { + job->nodelist, job->fir_nodeid) + != SLURM_SUCCESS) { + rc = errno; + *local_global_rc = errno; error("Application launch add failed: %m"); - *local_global_rc = 1; slurm_step_launch_abort(job->step_ctx); slurm_step_launch_wait_finish(job->step_ctx); goto cleanup;