diff --git a/src/plugins/select/bluegene/plugin/bg_block_info.c b/src/plugins/select/bluegene/plugin/bg_block_info.c index 4a8850c9a2d91c4a667d60b44a681d699025ed76..ab37841f62f97ea7446f4cfa34a7b6e2678c3ae4 100644 --- a/src/plugins/select/bluegene/plugin/bg_block_info.c +++ b/src/plugins/select/bluegene/plugin/bg_block_info.c @@ -79,10 +79,11 @@ static int _block_is_deallocating(bg_record_t *bg_record) bg_record->user_name)) { error("Partition %s was in a ready state " "for user %s but is being freed. " - "Job was lost.", + "Job %d was lost.", bg_record->bg_block_id, - bg_record->user_name); - term_jobs_on_block(bg_record->bg_block_id); + bg_record->user_name, + bg_record->job_running); + (void) slurm_fail_job(bg_record->job_running); } else { debug("Partition %s was in a ready state " "but is being freed. No job running.", diff --git a/src/plugins/select/bluegene/plugin/bg_job_place.c b/src/plugins/select/bluegene/plugin/bg_job_place.c index 9560d383590d9c249aa182f37bc6ca8a84a7ce6c..059ae0819e95ddd4f0e5716053bea82faff24f6b 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_place.c +++ b/src/plugins/select/bluegene/plugin/bg_job_place.c @@ -112,7 +112,7 @@ static int _find_best_block_match(struct job_record* job_ptr, we want to fall through to tell the scheduler that it is runnable just not right now. 
*/ - if(full_system_block->job_running && checked<2) { + if((full_system_block->job_running != -1) && checked<2) { checked++; select_g_set_jobinfo(job_ptr->select_jobinfo, SELECT_DATA_CHECKED, &checked); @@ -149,7 +149,7 @@ static int _find_best_block_match(struct job_record* job_ptr, */ slurm_mutex_lock(&block_state_mutex); debug3("job_running = %d", record->job_running); - if(record->job_running && checked<2) { + if((record->job_running != -1) && checked<2) { job_running++; debug("block %s in use by %s", record->bg_block_id, diff --git a/src/plugins/select/bluegene/plugin/bg_job_run.c b/src/plugins/select/bluegene/plugin/bg_job_run.c index e476c31d31ab2c8cae1e415163bb6ffa3de0caba..4ecbf415255190b85b0cb2e2eca54204b7dc773a 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_run.c +++ b/src/plugins/select/bluegene/plugin/bg_job_run.c @@ -186,7 +186,9 @@ static void _sync_agent(bg_update_t *bg_update_ptr) error("No block %s", bg_update_ptr->bg_block_id); return; } + slurm_mutex_lock(&block_state_mutex); bg_record->job_running = bg_update_ptr->job_id; + slurm_mutex_unlock(&block_state_mutex); if(bg_record->state==RM_PARTITION_READY) { if(bg_record->user_uid != bg_update_ptr->uid) { @@ -301,7 +303,7 @@ static void _start_agent(bg_update_t *bg_update_ptr) sleep(1); } - if(bg_record->job_running == 0) + if(bg_record->job_running == -1) return; if((rc = boot_block(bg_record)) != SLURM_SUCCESS) { @@ -447,7 +449,7 @@ static void _term_agent(bg_update_t *bg_update_ptr) } slurm_mutex_lock(&block_state_mutex); - bg_record->job_running = 0; + bg_record->job_running = -1; /*remove user from list */ if(bg_record->target_name) { @@ -755,7 +757,7 @@ int term_job(struct job_record *job_ptr) job_ptr->job_id, bg_record->bg_block_id); bg_record->state = RM_PARTITION_FREE; - bg_record->job_running = 0; + bg_record->job_running = -1; last_bg_update = time(NULL); xfree(block_id); } diff --git a/src/plugins/select/bluegene/plugin/block_sys.c 
b/src/plugins/select/bluegene/plugin/block_sys.c index 1d9e4988800483ef0ba67eb6739e3250d1127215..5c5eca1e24dec03da21521c657150985341c32b6 100755 --- a/src/plugins/select/bluegene/plugin/block_sys.c +++ b/src/plugins/select/bluegene/plugin/block_sys.c @@ -286,8 +286,8 @@ int read_bg_blocks() break; } } else { - if ((rc = rm_get_data(block_list, RM_PartListFirstPart, - &block_ptr)) != STATUS_OK) { + if ((rc = rm_get_data(block_list, RM_PartListFirstPart, + &block_ptr)) != STATUS_OK) { error("rm_get_data(RM_PartListFirstPart): %s", bg_err_str(rc)); break; @@ -330,6 +330,7 @@ int read_bg_blocks() bg_record->state = -1; bg_record->quarter = -1; + bg_record->job_running = -1; if ((rc = rm_get_data(block_ptr, RM_PartitionBPNum,