Skip to content
Snippets Groups Projects
Commit 1d2a6f8a authored by Danny Auble
Browse files

BGQ - fix the runjob_mux plugin so it does not put cnodes in an error state

whenever any message comes through. Also, since the status threads already
track the states of the cnodes, we no longer need to keep retrying to send the
failed-cnode message when the slurmctld is down.
parent 96338f25
No related branches found
No related tags found
No related merge requests found
...@@ -115,17 +115,14 @@ static void _destroy_runjob_job(void *object) ...@@ -115,17 +115,14 @@ static void _destroy_runjob_job(void *object)
static void _send_failed_cnodes(block_fail_cnode_t *block_fail_cnode) static void _send_failed_cnodes(block_fail_cnode_t *block_fail_cnode)
{ {
int rc, count = 0; int rc;
if (!block_fail_cnode) if (!block_fail_cnode)
return; return;
while ((rc = slurm_fail_cnode(block_fail_cnode))) { if ((rc = slurm_fail_cnode(block_fail_cnode))) {
std::cerr << "Trying to fail cnodes, but slurmctld is " std::cerr << "Trying to fail cnodes, but slurmctld is "
"not responding, trying for " << count * 5 << "not responding, not sending." << std::endl;
" seconds." << std::endl;
sleep(5);
count++;
} }
} }
...@@ -430,18 +427,18 @@ void Plugin::execute(const bgsched::runjob::Terminated& data) ...@@ -430,18 +427,18 @@ void Plugin::execute(const bgsched::runjob::Terminated& data)
_send_failed_cnodes(&block_fail_cnode); _send_failed_cnodes(&block_fail_cnode);
xfree(block_fail_cnode.cnodes); xfree(block_fail_cnode.cnodes);
} else if (!data.message().empty()) { } // else if (!data.message().empty()) {
std::cerr << runjob_job->job_id << "." << runjob_job->step_id // std::cerr << runjob_job->job_id << "." << runjob_job->step_id
<< " had a message of '" << data.message() // << " had a message of '" << data.message()
<< "'. Failing the cnodes on the job. (" // << "'. Failing the cnodes on the job. ("
<< runjob_job->total_cnodes << ")" << std::endl; // << runjob_job->total_cnodes << ")" << std::endl;
memset(&block_fail_cnode, 0, sizeof(block_fail_cnode_t)); // memset(&block_fail_cnode, 0, sizeof(block_fail_cnode_t));
block_fail_cnode.bg_block_id = runjob_job->bg_block_id; // block_fail_cnode.bg_block_id = runjob_job->bg_block_id;
block_fail_cnode.cnodes = runjob_job->total_cnodes; // block_fail_cnode.cnodes = runjob_job->total_cnodes;
block_fail_cnode.job_id = runjob_job->job_id; // block_fail_cnode.job_id = runjob_job->job_id;
block_fail_cnode.step_id = runjob_job->step_id; // block_fail_cnode.step_id = runjob_job->step_id;
_send_failed_cnodes(&block_fail_cnode); // _send_failed_cnodes(&block_fail_cnode);
} // }
_destroy_runjob_job(runjob_job); _destroy_runjob_job(runjob_job);
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment