Commit b72aac4c authored by Danny Auble

BGQ - Added first stage of logic to send the slurmctld notice that a cnode has failed

during runjob. The controller isn't notified just yet since the RPC hasn't
been created.
parent 85ca3ac7
@@ -1536,6 +1536,17 @@ typedef struct partition_info_msg {

 /* BLUEGENE specific information */
+typedef struct {
+	char *bg_block_id;	/* name of block that had the failing */
+	char *cnodes;		/* cnodes that failed */
+	uint32_t job_id;	/* jobid of job that was involved in
+				   the failing */
+	uint16_t relative;	/* if the cnodes are relative or
+				   absolute */
+	uint32_t step_id;	/* stepid of job that was involved in
+				 * the failing */
+} block_cnode_fail_t;
+
 typedef struct {
 	char *cnodes;		/* used for sub-block jobs */
 	int *cnode_inx;		/* list index pairs for cnodes in the
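For reference, a minimal sketch of how a caller might fill the new block_cnode_fail_t once the controller-side RPC exists. The helper name _send_block_cnode_fail() and the parameter names are hypothetical illustrations, not part of this commit, which only defines the structure.

	/* Sketch only: populate block_cnode_fail_t for a failed-cnode report.
	 * _send_block_cnode_fail() is a hypothetical placeholder; the RPC to
	 * the slurmctld has not been written yet in this commit. */
	static void _report_failed_cnodes(char *bg_block_id, char *cnodes,
					  uint32_t job_id, uint32_t step_id)
	{
		block_cnode_fail_t fail_msg;

		memset(&fail_msg, 0, sizeof(block_cnode_fail_t));
		fail_msg.bg_block_id = bg_block_id;	/* block holding the cnodes */
		fail_msg.cnodes = cnodes;		/* e.g. "00000" if relative */
		fail_msg.job_id = job_id;
		fail_msg.step_id = step_id;
		fail_msg.relative = 1;			/* coords relative to the block */

		/* _send_block_cnode_fail(&fail_msg);	FIXME: RPC not written yet */
	}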
@@ -46,6 +46,8 @@ extern "C" {
 # include "config.h"
 #endif
 #include "src/common/xmalloc.h"
+#include "src/common/list.h"
+#include "src/common/hostlist.h"
 #include <slurm/slurm.h>
 }
@@ -86,18 +88,49 @@ private:
 	boost::mutex _mutex;
 };

+typedef struct {
+	char *bg_block_id;
+	pid_t pid;		/* The only way we can track things
+				   since we don't have a jobid from
+				   mmcs in the verify state.
+				*/
+	uint32_t job_id;
+	uint32_t step_id;
+	char *total_cnodes;
+} runjob_job_t;
+
+static List runjob_list = NULL;
+static pthread_mutex_t runjob_list_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static void _destroy_runjob_job(void *object)
+{
+	runjob_job_t *runjob_job = (runjob_job_t *)object;
+	if (runjob_job) {
+		xfree(runjob_job->bg_block_id);
+		xfree(runjob_job->total_cnodes);
+		xfree(runjob_job);
+	}
+}
+
 Plugin::Plugin() :
 	bgsched::runjob::Plugin(),
 	_mutex()
 {
 	assert(HIGHEST_DIMENSIONS >= Dimension::NodeDims);

+	runjob_list = list_create(_destroy_runjob_job);
+
 	std::cout << "Slurm runjob plugin loaded" << std::endl;
 }

 Plugin::~Plugin()
 {
 	std::cout << "Slurm runjob plugin finished" << std::endl;
+	slurm_mutex_lock(&runjob_list_lock);
+	list_destroy(runjob_list);
+	runjob_list = NULL;
+	slurm_mutex_unlock(&runjob_list_lock);
 }
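The hunk above introduces the tracking pattern used by the rest of the commit: each verified runjob is appended to runjob_list keyed by its pid, because mmcs gives no job id back when the job terminates. A minimal sketch of the lookup-and-remove step, assuming SLURM's src/common/list.h API; the helper name _pop_runjob_by_pid() is illustrative only and mirrors what the Terminated handler further down does inline.

	/* Sketch, not committed code: detach the runjob tracked for this pid. */
	static runjob_job_t *_pop_runjob_by_pid(pid_t pid)
	{
		runjob_job_t *found = NULL, *job;
		ListIterator itr;

		slurm_mutex_lock(&runjob_list_lock);
		if (runjob_list) {
			itr = list_iterator_create(runjob_list);
			while ((job = (runjob_job_t *)list_next(itr))) {
				if (job->pid == pid) {
					list_remove(itr);	/* detach from the list */
					found = job;
					break;
				}
			}
			list_iterator_destroy(itr);
		}
		slurm_mutex_unlock(&runjob_list_lock);
		return found;	/* caller must _destroy_runjob_job() it */
	}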
@@ -112,22 +145,25 @@ void Plugin::execute(bgsched::runjob::Verify& verify)
 	bool sub_block_job = 0;
 	job_step_info_response_msg_t * step_resp = NULL;
 	job_step_info_t *step_ptr = NULL;
-	uint32_t job_id = NO_VAL, step_id = NO_VAL;
-	char *bg_block_id = NULL;
+	runjob_job_t *runjob_job = NULL;

 	geo[0] = NO_VAL;
 	start_coords[0] = NO_VAL;

+	runjob_job = (runjob_job_t *)xmalloc(sizeof(runjob_job_t));
+	runjob_job->job_id = NO_VAL;
+	runjob_job->step_id = NO_VAL;
+
 	/* Get the job/step id's from the environment and then go
 	 * verify with the slurmctld where this step should be running.
 	 */
 	BOOST_FOREACH(const bgsched::runjob::Environment& env_var,
		      verify.envs()) {
 		if (env_var.getKey() == "SLURM_JOB_ID") {
-			job_id = atoi(env_var.getValue().c_str());
+			runjob_job->job_id = atoi(env_var.getValue().c_str());
 			found++;
 		} else if (env_var.getKey() == "SLURM_STEP_ID") {
-			step_id = atoi(env_var.getValue().c_str());
+			runjob_job->step_id = atoi(env_var.getValue().c_str());
 			found++;
 		}
@@ -138,7 +174,8 @@ void Plugin::execute(bgsched::runjob::Verify& verify)
 	if (found != looking_for)
 		goto deny_job;

-	if (slurm_get_job_steps((time_t) 0, job_id, step_id,
+	if (slurm_get_job_steps((time_t) 0, runjob_job->job_id,
+				runjob_job->step_id,
 				&step_resp, SHOW_ALL)) {
 		slurm_perror((char *)"slurm_get_job_steps error");
 		goto deny_job;
@@ -146,7 +183,8 @@ void Plugin::execute(bgsched::runjob::Verify& verify)
 	if (!step_resp->job_step_count) {
 		std::cerr << "No steps match this id "
-			  << job_id << "." << step_id << std::endl;
+			  << runjob_job->job_id << "."
+			  << runjob_job->step_id << std::endl;
 		goto deny_job;
 	}
@@ -156,7 +194,8 @@ void Plugin::execute(bgsched::runjob::Verify& verify)
 	   supposed to be running.
 	*/
 	if (verify.user().uid() != step_ptr->user_id) {
-		std::cerr << "Jobstep " << job_id << "." << step_id
+		std::cerr << "Jobstep " << runjob_job->job_id << "."
+			  << runjob_job->step_id
 			  << " should be ran by uid " << step_ptr->user_id
 			  << " but it is trying to be ran by "
 			  << verify.user().uid() << std::endl;
@@ -165,12 +204,18 @@ void Plugin::execute(bgsched::runjob::Verify& verify)
 	if (slurm_get_select_jobinfo(step_ptr->select_jobinfo,
				     SELECT_JOBDATA_BLOCK_ID,
-				     &bg_block_id)) {
+				     &runjob_job->bg_block_id)) {
 		std::cerr << "Can't get the block id!" << std::endl;
 		goto deny_job;
 	}
-	verify.block(bg_block_id);
-	xfree(bg_block_id);
+	verify.block(runjob_job->bg_block_id);
+
+	if (slurm_get_select_jobinfo(step_ptr->select_jobinfo,
+				     SELECT_JOBDATA_IONODES,
+				     &runjob_job->total_cnodes)) {
+		std::cerr << "Can't get the cnode string!" << std::endl;
+		goto deny_job;
+	}

 	if (slurm_get_select_jobinfo(step_ptr->select_jobinfo,
				     SELECT_JOBDATA_BLOCK_NODE_CNT,
@@ -269,10 +314,18 @@ void Plugin::execute(bgsched::runjob::Verify& verify)
 	// const ProcessTree tree( verify.pid() );
 	// std::cout << tree << std::endl;

+	runjob_job->pid = verify.pid();
+
+	slurm_mutex_lock(&runjob_list_lock);
+	if (runjob_list)
+		list_append(runjob_list, runjob_job);
+	slurm_mutex_unlock(&runjob_list_lock);
+
 	slurm_free_job_step_info_response_msg(step_resp);
 	return;

 deny_job:
+	_destroy_runjob_job(runjob_job);
 	slurm_free_job_step_info_response_msg(step_resp);
 	verify.deny_job(bgsched::runjob::Verify::DenyJob::Yes);
 	return;
@@ -287,6 +340,10 @@ void Plugin::execute(const bgsched::runjob::Started& data)

 void Plugin::execute(const bgsched::runjob::Terminated& data)
 {
+	ListIterator itr = NULL;
+	runjob_job_t *runjob_job = NULL;
+	block_cnode_fail_t block_cnode_fail;
+
 	boost::lock_guard<boost::mutex> lock( _mutex );
 	// std::cout << "runjob " << data.pid() << " shadowing job "
 	//	  << data.job() << " finished with status "
@@ -295,16 +352,82 @@ void Plugin::execute(const bgsched::runjob::Terminated& data)
 	// output failed nodes
 	const bgsched::runjob::Terminated::Nodes& nodes =
 		data.software_error_nodes();
-	if (!nodes.empty()) {
-		/* FIXME: We sould tell the slurmctld about this
-		   instead of just printing it out.
-		*/
+
+	slurm_mutex_lock(&runjob_list_lock);
+	if (runjob_list) {
+		itr = list_iterator_create(runjob_list);
+		while ((runjob_job = (runjob_job_t *)list_next(itr))) {
+			if (runjob_job->pid == data.pid()) {
+				list_remove(itr);
+				break;
+			}
+		}
+		list_iterator_destroy(itr);
+	}
+	slurm_mutex_unlock(&runjob_list_lock);
+
+	if (!runjob_job) {
+		if (runjob_list)
+			std::cerr << "Couldn't find job running with pid "
+				  << data.pid() << std::endl;
+	} else if (data.kill_timeout()) {
+		std::cerr << runjob_job->job_id << "." << runjob_job->step_id
+			  << " had a kill_timeout()" << std::endl;
+		memset(&block_cnode_fail, 0, sizeof(block_cnode_fail_t));
+		block_cnode_fail.bg_block_id = runjob_job->bg_block_id;
+		block_cnode_fail.cnodes = runjob_job->total_cnodes;
+		block_cnode_fail.job_id = runjob_job->job_id;
+		block_cnode_fail.step_id = runjob_job->step_id;
+		/* FIXME: send to the slurmctld here */
+	} else if (!nodes.empty()) {
+		hostlist_t hl = hostlist_create_dims(NULL, 5);
+		char tmp_char[6];
+
 		std::cerr << nodes.size() << " failed nodes" << std::endl;
 		BOOST_FOREACH(const bgsched::runjob::Node& i, nodes) {
+			sprintf(tmp_char, "%u%u%u%u%u",
+				i.coordinates().a(),
+				i.coordinates().b(),
+				i.coordinates().c(),
+				i.coordinates().d(),
+				i.coordinates().e());
+			hostlist_push_host_dims(hl, tmp_char, 5);
 			std::cerr << i.location() << ": "
-				  << i.coordinates() << std::endl;
+				  << i.coordinates()
+				  << tmp_char << std::endl;
 		}
+
+		memset(&block_cnode_fail, 0, sizeof(block_cnode_fail_t));
+		block_cnode_fail.bg_block_id = runjob_job->bg_block_id;
+		block_cnode_fail.cnodes =
+			hostlist_ranged_string_xmalloc_dims(hl, 5, 0);
+		hostlist_destroy(hl);
+		hl = NULL;
+		block_cnode_fail.job_id = runjob_job->job_id;
+		block_cnode_fail.step_id = runjob_job->step_id;
+		block_cnode_fail.relative = 1;
+		std::cerr << "total was "
+			  << block_cnode_fail.cnodes << std::endl;
+		/* FIXME: send to the slurmctld here */
+		xfree(block_cnode_fail.cnodes);
+	} else if (!data.message().empty()) {
+		std::cerr << runjob_job->job_id << "." << runjob_job->step_id
+			  << " had a message of '" << data.message()
+			  << "'. Failing the cnodes on the job."
+			  << std::endl;
+		memset(&block_cnode_fail, 0, sizeof(block_cnode_fail_t));
+		block_cnode_fail.bg_block_id = runjob_job->bg_block_id;
+		block_cnode_fail.cnodes = runjob_job->total_cnodes;
+		block_cnode_fail.job_id = runjob_job->job_id;
+		block_cnode_fail.step_id = runjob_job->step_id;
+		/* FIXME: send to the slurmctld here */
 	}
+
+	_destroy_runjob_job(runjob_job);
 }

 extern "C" bgsched::runjob::Plugin* create()
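As a rough illustration of the coordinate handling in the Terminated hunk above (a sketch, not the committed code): each failed cnode's five-dimensional coordinates are packed into a five-character name such as "00231", pushed onto a 5-dimension hostlist, and then collapsed into a single ranged string for the cnodes field; the exact ranged form depends on SLURM's hostlist implementation, so the output shown in the comment is only indicative.

	/* Sketch: pack A/B/C/D/E coordinates into 5-digit names and collect
	 * them into one ranged string, as the Terminated handler does.
	 * Coordinate values are assumed to be single digits (0-9). */
	hostlist_t hl = hostlist_create_dims(NULL, 5);
	char name[6];

	snprintf(name, sizeof(name), "%u%u%u%u%u", 0u, 0u, 2u, 3u, 0u);
	hostlist_push_host_dims(hl, name, 5);	/* cnode at (0,0,2,3,0) */
	snprintf(name, sizeof(name), "%u%u%u%u%u", 0u, 0u, 2u, 3u, 1u);
	hostlist_push_host_dims(hl, name, 5);	/* cnode at (0,0,2,3,1) */

	/* Collapse into one ranged string, e.g. something like "0023[0-1]". */
	char *cnodes = hostlist_ranged_string_xmalloc_dims(hl, 5, 0);
	hostlist_destroy(hl);
	/* ... later: block_cnode_fail.cnodes = cnodes; xfree(cnodes); */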