Skip to content
Snippets Groups Projects
Commit 4d4c8c17 authored by Artem Polyakov's avatar Artem Polyakov Committed by David Bigagli
Browse files

1. Prepare for the new state machine. 2. Fail with SIGKILL if the state machine misbehaves.

parent 195ae448
No related branches found
No related tags found
No related merge requests found
......@@ -99,6 +99,26 @@ static int _pack_ranges(pmixp_coll_t *coll)
return SLURM_SUCCESS;
}
/*
 * Complete the fan-in phase of a collective and set it up for fan-out.
 *
 * The collective is expected to be in PMIXP_COLL_FAN_IN (checked with
 * xassert). The function then:
 *   - switches the state to PMIXP_COLL_FAN_OUT;
 *   - zeroes the first children_cnt entries of ch_contribs;
 *   - clears the contrib_cntr and contrib_local counters;
 *   - moves the offset of coll->buf to coll->serv_offs;
 *   - calls _pack_ranges() for the collective.
 *
 * A _pack_ranges() failure is only logged: this function returns void,
 * so the error is not propagated to the caller.
 */
static void _fan_in_finished(pmixp_coll_t *coll)
{
	xassert(PMIXP_COLL_FAN_IN == coll->state);
	coll->state = PMIXP_COLL_FAN_OUT;

	/*
	 * Size the wipe from the array's own element type rather than a
	 * hard-coded sizeof(int), so it stays correct if the type of
	 * ch_contribs ever changes. Skip the call when there are no
	 * children: passing an invalid (e.g. NULL) pointer to memset() is
	 * undefined even for a zero length.
	 * NOTE(review): presumably children_cnt can be 0 for nodes without
	 * children - confirm against the allocation site.
	 */
	if (coll->children_cnt > 0) {
		memset(coll->ch_contribs, 0,
		       sizeof(*coll->ch_contribs) * coll->children_cnt);
	}
	coll->contrib_cntr = 0;
	coll->contrib_local = 0;

	/* continue packing from the saved serv_offs position of the buffer */
	set_buf_offset(coll->buf, coll->serv_offs);
	if (SLURM_SUCCESS != _pack_ranges(coll)) {
		PMIXP_ERROR("Cannot pack ranges to coll message header!");
	}
}
/*
 * Close the fan-out phase of a collective.
 *
 * The collective must currently be in PMIXP_COLL_FAN_OUT (enforced with
 * xassert). It is switched to PMIXP_COLL_SYNC and its sequence number is
 * advanced by one, i.e. it moves on to the next collective.
 */
static void _fan_out_finished(pmixp_coll_t *coll)
{
	/*
	 * NOTE(review): the original author's remark suggests a "fan_out_in"
	 * state might also be accepted here in the future - confirm.
	 */
	xassert(PMIXP_COLL_FAN_OUT == coll->state);

	/* move to the next collective */
	coll->seq += 1;
	coll->state = PMIXP_COLL_SYNC;
}
static void _reset_coll(pmixp_coll_t *coll)
{
switch (coll->state) {
......@@ -535,8 +555,7 @@ static void _progress_fan_in(pmixp_coll_t *coll)
}
/* transit to the next state */
coll->state = PMIXP_COLL_FAN_OUT;
set_buf_offset(coll->buf, 0);
_fan_in_finished(coll);
/* if we are root - push data to PMIx here.
* Originally there was a homogeneous solution: root nodename was in the hostlist.
......@@ -577,7 +596,7 @@ void _progres_fan_out(pmixp_coll_t *coll, Buf buf)
pmixp_free_Buf, (void *)buf);
}
/* Prepare for the next collective operation */
_reset_coll(coll);
_fan_out_finished(coll);
PMIXP_DEBUG("%s:%d: collective is prepared for the next use",
pmixp_info_namespace(), pmixp_info_nodeid());
......
......@@ -170,8 +170,6 @@ static inline int pmixp_coll_check_seq(pmixp_coll_t *coll, uint32_t seq,
* want to discard this message */
return SLURM_ERROR;
}
PMIXP_ERROR("Bad collective seq. #%d from %s, current is %d", seq,
nodename, coll->seq);
/* maybe need more sophisticated handling in presence of
* several steps. However maybe it's enough to just ignore */
/* slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(), SIGKILL); */
......
......@@ -411,16 +411,22 @@ static void _process_server_request(recv_header_t *_hdr, void *payload)
coll = pmixp_state_coll_get(type, procs, nprocs);
xfree(procs);
PMIXP_DEBUG(
"FENCE collective message from node \"%s\", type = %s",
nodename,
(PMIXP_MSG_FAN_IN == hdr->type) ?
"fan-in" : "fan-out");
if (SLURM_SUCCESS
!= pmixp_coll_check_seq(coll, hdr->seq,
PMIXP_DEBUG("FENCE collective message from node \"%s\", type = %s",
nodename,
(PMIXP_MSG_FAN_IN == hdr->type) ? "fan-in" : "fan-out");
if (SLURM_SUCCESS != pmixp_coll_check_seq(coll, hdr->seq,
nodename)) {
/* stop processing and discard this message */
/* This is an unacceptable event: either something went
* really wrong or the state machine is incorrect.
* This will 100% lead to an application hang.
*/
PMIXP_ERROR("Bad collective seq. #%d from %s, current is %d",
hdr->seq, nodename, coll->seq);
pmixp_debug_hang(0); /* enable hang to debug this! */
slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(),
SIGKILL);
break;
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment