From 056f9ce259780476267cd0173e0fcbc554e05556 Mon Sep 17 00:00:00 2001 From: Artem Polyakov <artpol84@gmail.com> Date: Tue, 8 Aug 2017 13:21:36 +0700 Subject: [PATCH] mpi/pmix: Fix the case where UCX fails to connect There were segmentation faults because of a double free of a pending list when the UCX component was trying to connect multiple times. Signed-off-by: Artem Polyakov <artpol84@gmail.com> --- src/plugins/mpi/pmix/pmixp_dconn_ucx.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/plugins/mpi/pmix/pmixp_dconn_ucx.c b/src/plugins/mpi/pmix/pmixp_dconn_ucx.c index 3c150705630..783f845cd53 100644 --- a/src/plugins/mpi/pmix/pmixp_dconn_ucx.c +++ b/src/plugins/mpi/pmix/pmixp_dconn_ucx.c @@ -637,8 +637,7 @@ static void _ucx_fini(void *_priv) slurm_mutex_unlock(&_ucx_worker_lock); } else { slurm_mutex_lock(&_ucx_worker_lock); - pmixp_rlist_init(&priv->pending, &_free_list, - PMIXP_UCX_LIST_PREALLOC); + pmixp_rlist_fini(&priv->pending); slurm_mutex_unlock(&_ucx_worker_lock); } xfree(priv); @@ -664,8 +663,9 @@ static int _ucx_connect(void *_priv, void *ep_data, size_t ep_len, if (status != UCS_OK) { PMIXP_ERROR("ucp_ep_create failed: %s", ucs_status_string(status)); - rc = SLURM_ERROR; - goto exit; + xfree(priv->ucx_addr); + slurm_mutex_unlock(&_ucx_worker_lock); + return SLURM_ERROR; } priv->connected = true; @@ -673,7 +673,6 @@ static int _ucx_connect(void *_priv, void *ep_data, size_t ep_len, if (init_msg) { pmixp_rlist_push(&priv->pending, init_msg); } -exit: slurm_mutex_unlock(&_ucx_worker_lock); /* we need to send data while being unlocked */ -- GitLab