From 36f59fb3c71e3262953860328d1867e2788d9296 Mon Sep 17 00:00:00 2001 From: Artem Polyakov <artpol84@gmail.com> Date: Wed, 9 Aug 2017 00:28:35 +0700 Subject: [PATCH] mpi/pmix: Fix UCX connection error case handling Signed-off-by: Artem Polyakov <artpol84@gmail.com> --- src/plugins/mpi/pmix/pmixp_dconn.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/plugins/mpi/pmix/pmixp_dconn.h b/src/plugins/mpi/pmix/pmixp_dconn.h index 624634556e9..8dbc37bc012 100644 --- a/src/plugins/mpi/pmix/pmixp_dconn.h +++ b/src/plugins/mpi/pmix/pmixp_dconn.h @@ -228,9 +228,9 @@ static inline int pmixp_dconn_connect( if (SLURM_SUCCESS == rc){ dconn->state = PMIXP_DIRECT_CONNECTED; } else { - /* drop the state to INIT so we will try again later - * if it will always be failing - we will always use - * SLURM's protocol + /* + * Abort the application - we can't do what user requested. + * Make sure to provide enough info */ char *nodename = pmixp_info_job_host(dconn->nodeid); xassert(nodename); @@ -239,10 +239,12 @@ static inline int pmixp_dconn_connect( dconn->nodeid); abort(); } - dconn->state = PMIXP_DIRECT_INIT; PMIXP_ERROR("Cannot establish direct connection to %s (%d)", nodename, dconn->nodeid); xfree(nodename); + pmixp_debug_hang(0); /* enable hang to debug this! */ + slurm_kill_job_step(pmixp_info_jobid(), + pmixp_info_stepid(), SIGKILL); } return rc; } -- GitLab