From 36f59fb3c71e3262953860328d1867e2788d9296 Mon Sep 17 00:00:00 2001
From: Artem Polyakov <artpol84@gmail.com>
Date: Wed, 9 Aug 2017 00:28:35 +0700
Subject: [PATCH] mpi/pmix: Fix UCX connection error case handling

Signed-off-by: Artem Polyakov <artpol84@gmail.com>
---
 src/plugins/mpi/pmix/pmixp_dconn.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/plugins/mpi/pmix/pmixp_dconn.h b/src/plugins/mpi/pmix/pmixp_dconn.h
index 624634556e9..8dbc37bc012 100644
--- a/src/plugins/mpi/pmix/pmixp_dconn.h
+++ b/src/plugins/mpi/pmix/pmixp_dconn.h
@@ -228,9 +228,9 @@ static inline int pmixp_dconn_connect(
 	if (SLURM_SUCCESS == rc){
 		dconn->state = PMIXP_DIRECT_CONNECTED;
 	} else {
-		/* drop the state to INIT so we will try again later
-		 * if it will always be failing - we will always use
-		 * SLURM's protocol
+		/*
+		 * Abort the application - we can't do what the user requested.
+		 * Make sure to provide enough info first.
 		 */
 		char *nodename = pmixp_info_job_host(dconn->nodeid);
 		xassert(nodename);
@@ -239,10 +239,12 @@ static inline int pmixp_dconn_connect(
 				    dconn->nodeid);
 			abort();
 		}
-		dconn->state = PMIXP_DIRECT_INIT;
 		PMIXP_ERROR("Cannot establish direct connection to %s (%d)",
 			    nodename, dconn->nodeid);
 		xfree(nodename);
+		pmixp_debug_hang(0); /* enable hang to debug this! */
+		slurm_kill_job_step(pmixp_info_jobid(),
+				    pmixp_info_stepid(), SIGKILL);
 	}
 	return rc;
 }
-- 
GitLab