Skip to content
Snippets Groups Projects
Commit e98910cc authored by Danny Auble
Browse files

Fixed a bug in the bluegene plugin: the job is now failed if its partition is freed while the job is running

parent c93312af
No related branches found
No related tags found
No related merge requests found
......@@ -79,10 +79,11 @@ static int _block_is_deallocating(bg_record_t *bg_record)
bg_record->user_name)) {
error("Partition %s was in a ready state "
"for user %s but is being freed. "
"Job was lost.",
"Job %s was lost.",
bg_record->bg_block_id,
bg_record->user_name);
term_jobs_on_block(bg_record->bg_block_id);
bg_record->user_name,
bg_record->job_running);
(void) slurm_fail_job(bg_record->job_running);
} else {
debug("Partition %s was in a ready state "
"but is being freed. No job running.",
......
......@@ -112,7 +112,7 @@ static int _find_best_block_match(struct job_record* job_ptr,
we want to fall through to tell the scheduler that it is runnable
just not right now.
*/
if(full_system_block->job_running && checked<2) {
if((full_system_block->job_running != -1) && checked<2) {
checked++;
select_g_set_jobinfo(job_ptr->select_jobinfo,
SELECT_DATA_CHECKED, &checked);
......@@ -149,7 +149,7 @@ static int _find_best_block_match(struct job_record* job_ptr,
*/
slurm_mutex_lock(&block_state_mutex);
debug3("job_running = %d", record->job_running);
if(record->job_running && checked<2) {
if((record->job_running != -1) && checked<2) {
job_running++;
debug("block %s in use by %s",
record->bg_block_id,
......
......@@ -186,7 +186,9 @@ static void _sync_agent(bg_update_t *bg_update_ptr)
error("No block %s", bg_update_ptr->bg_block_id);
return;
}
slurm_mutex_lock(&block_state_mutex);
bg_record->job_running = bg_update_ptr->job_id;
slurm_mutex_unlock(&block_state_mutex);
if(bg_record->state==RM_PARTITION_READY) {
if(bg_record->user_uid != bg_update_ptr->uid) {
......@@ -301,7 +303,7 @@ static void _start_agent(bg_update_t *bg_update_ptr)
sleep(1);
}
if(bg_record->job_running == 0)
if(bg_record->job_running == -1)
return;
if((rc = boot_block(bg_record))
!= SLURM_SUCCESS) {
......@@ -447,7 +449,7 @@ static void _term_agent(bg_update_t *bg_update_ptr)
}
slurm_mutex_lock(&block_state_mutex);
bg_record->job_running = 0;
bg_record->job_running = -1;
/*remove user from list */
if(bg_record->target_name) {
......@@ -755,7 +757,7 @@ int term_job(struct job_record *job_ptr)
job_ptr->job_id,
bg_record->bg_block_id);
bg_record->state = RM_PARTITION_FREE;
bg_record->job_running = 0;
bg_record->job_running = -1;
last_bg_update = time(NULL);
xfree(block_id);
}
......
......@@ -286,8 +286,8 @@ int read_bg_blocks()
break;
}
} else {
if ((rc = rm_get_data(block_list, RM_PartListFirstPart,
&block_ptr)) != STATUS_OK) {
if ((rc = rm_get_data(block_list, RM_PartListFirstPart,
&block_ptr)) != STATUS_OK) {
error("rm_get_data(RM_PartListFirstPart): %s",
bg_err_str(rc));
break;
......@@ -330,6 +330,7 @@ int read_bg_blocks()
bg_record->state = -1;
bg_record->quarter = -1;
bg_record->job_running = -1;
if ((rc = rm_get_data(block_ptr,
RM_PartitionBPNum,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment