diff --git a/src/plugins/select/bluegene/bgl_job_run.c b/src/plugins/select/bluegene/bgl_job_run.c index e224ea55417c33a08371840b4b2326893b666a03..21786550cbbd411ef243d9f22dc6ad7b5f362e55 100644 --- a/src/plugins/select/bluegene/bgl_job_run.c +++ b/src/plugins/select/bluegene/bgl_job_run.c @@ -135,7 +135,7 @@ static int _remove_job(db_job_id_t job_id) if ((rc = rm_free_job(job_rec)) != STATUS_OK) error("rm_free_job: %s", bgl_err_str(rc)); - debug("job %d is in state %d", job_id, job_state); + debug2("job %d is in state %d", job_id, job_state); /* check the state and process accordingly */ if(job_state == RM_JOB_TERMINATED) @@ -339,7 +339,7 @@ static void _term_agent(bgl_update_t *bgl_update_ptr) db_job_id_t job_id; bgl_record_t *bgl_record = NULL; - //debug("getting the job info"); + debug2("getting the job info"); live_states = JOB_ALL_FLAG & (~JOB_TERMINATED_FLAG) & (~JOB_KILLED_FLAG); @@ -353,7 +353,7 @@ static void _term_agent(bgl_update_t *bgl_update_ptr) jobs = 0; } else if (jobs > 300) fatal("Active job count (%d) invalid, restart MMCS", jobs); - //debug("job count %d",jobs); + debug2("job count %d",jobs); for (i=0; i<jobs; i++) { if (i) { @@ -383,7 +383,7 @@ static void _term_agent(bgl_update_t *bgl_update_ptr) part_id, bgl_err_str(rc)); continue; } - debug("looking at partition %s looking for %s\n", + debug2("looking at partition %s looking for %s\n", part_id, bgl_update_ptr->bgl_part_id); if (strcmp(part_id, bgl_update_ptr->bgl_part_id) != 0) continue; @@ -393,7 +393,7 @@ static void _term_agent(bgl_update_t *bgl_update_ptr) bgl_err_str(rc)); continue; } - debug("got job_id %d",job_id); + debug2("got job_id %d",job_id); if((rc = _remove_job(job_id)) == INTERNAL_ERROR) goto not_removed; @@ -404,7 +404,7 @@ static void _term_agent(bgl_update_t *bgl_update_ptr) bgl_record = find_bgl_record(bgl_update_ptr->bgl_part_id); if(bgl_record) { - debug("got the record %s user is %s", + debug2("got the record %s user is %s", bgl_record->bgl_part_id, 
bgl_record->user_name); @@ -600,7 +600,7 @@ int term_jobs_on_part(pm_partition_id_t bgl_part_id) bgl_update_ptr = xmalloc(sizeof(bgl_update_t)); bgl_update_ptr->op = TERM_OP; bgl_update_ptr->bgl_part_id = xstrdup(bgl_part_id); - _term_agent(bgl_update_ptr); + _part_op(bgl_update_ptr); return rc; } diff --git a/src/plugins/select/bluegene/bgl_part_info.c b/src/plugins/select/bluegene/bgl_part_info.c index f002702a2ef2876ded77af5b72129111035a0750..6ce2194f5ff8440848a14f6684b2cf815cf2b7c8 100644 --- a/src/plugins/select/bluegene/bgl_part_info.c +++ b/src/plugins/select/bluegene/bgl_part_info.c @@ -56,6 +56,61 @@ #define _DEBUG 0 #define RETRY_BOOT_COUNT 3 +static int _partition_is_deallocating(bgl_record_t *bgl_record); +/* Fix up the user bookkeeping of bgl_record when its partition is being freed. update_partition_list() calls this when the state becomes RM_PARTITION_DEALLOCATING, or when it reaches RM_PARTITION_FREE without having been seen in RM_PARTITION_DEALLOCATING. Steps: remove all users with remove_all_users(bgl_part_id, NULL) and log an error on REMOVE_USER_ERR; then, if target_name and user_name are both set and target_name equals slurmctld_conf.slurm_user_name, either log the lost job and replace target_name with a copy of user_name (names differ) or log at debug level that no job was running (names match); if target_name is some other user, log that the state went to free on a boot; if only user_name is set, copy it into target_name; if user_name is unset, set user_name to a copy of slurm_user_name and target_name to a copy of user_name. Always returns SLURM_SUCCESS, including when remove_all_users failed. NOTE(review): the last branch only establishes that user_name is NULL; target_name may still be set there and is overwritten without xfree - possible leak, confirm. NOTE(review): the error message in the only-user_name branch repeats the words not set. */ +static int _partition_is_deallocating(bgl_record_t *bgl_record) +{ + if(remove_all_users(bgl_record->bgl_part_id, NULL) + == REMOVE_USER_ERR) { + error("Something happened removing " + "users from partition %s", + bgl_record->bgl_part_id); + } + + if(bgl_record->target_name + && bgl_record->user_name) { + if(!strcmp(bgl_record->target_name, + slurmctld_conf.slurm_user_name)) { + if(strcmp(bgl_record->target_name, + bgl_record->user_name)) { + error("Partition %s was in a ready state " + "for user %s but is being freed. " + "Job was lost.", + bgl_record->bgl_part_id, + bgl_record->user_name); + xfree(bgl_record->target_name); + bgl_record->target_name = + xstrdup(bgl_record->user_name); + //term_jobs_on_part(bgl_record->bgl_part_id); + } else { + debug("Partition %s was in a ready state " + "but is being freed. 
No job running.", + bgl_record->bgl_part_id); + } + } else { + error("State went to free on a boot " + "for partition %s.", + bgl_record->bgl_part_id); + } + + } else if(bgl_record->user_name) { + error("Target Name was not set " + "not set for partition %s.", + bgl_record->bgl_part_id); + bgl_record->target_name = + xstrdup(bgl_record->user_name); + } else { + error("Target Name and User Name are " + "not set for partition %s.", + bgl_record->bgl_part_id); + bgl_record->user_name = + xstrdup(slurmctld_conf.slurm_user_name); + bgl_record->target_name = + xstrdup(bgl_record->user_name); + } + return SLURM_SUCCESS; +} + /* * check to see if partition is ready to execute. Meaning * User is added to the list of users able to run, and no one @@ -124,7 +179,8 @@ extern int update_partition_list() time_t now; struct tm *time_ptr; char reason[128]; - + int skipped_dealloc = 0; + if(bgl_list == NULL && !last_bgl_update) return 0; @@ -212,62 +268,19 @@ extern int update_partition_list() } else if(bgl_record->state != state) { debug("state of Partition %s was %d and now is %d", name, bgl_record->state, state); + /* + check to make sure partition went + through freeing correctly + */ + if(bgl_record->state != RM_PARTITION_DEALLOCATING + && state == RM_PARTITION_FREE) + skipped_dealloc = 1; bgl_record->state = state; - error("Yeah, the state changed"); - if(bgl_record->state == RM_PARTITION_FREE) { - if((rc = remove_all_users( - bgl_record->bgl_part_id, - NULL)) - == REMOVE_USER_ERR) { - error("Something happened removing " - "users from partition %s", - bgl_record->bgl_part_id); - } - - if(bgl_record->target_name - && bgl_record->user_name) { - if(!strcmp(bgl_record->target_name, - slurmctld_conf. - slurm_user_name)) { - if(strcmp(bgl_record->target_name, - bgl_record->user_name)) { - error("Partition %s was in a " - "ready state for user %s but got " - "freed. 
Job was probably lost.", - bgl_record->user_name, - bgl_record->bgl_part_id); - xfree(bgl_record-> - user_name); - bgl_record->user_name = - xstrdup(bgl_record-> - target_name); - } else { - error("Partition %s was in a " - "ready state for user %s but got " - "freed. No job running.", - bgl_record->bgl_part_id); - } - } else { - error("State went to free on a boot " - "for partition %s.", - bgl_record->bgl_part_id); - } - - } else if(bgl_record->user_name) { - error("Target Name was not set " - "not set for partition %s.", - bgl_record->bgl_part_id); - bgl_record->target_name = - xstrdup(bgl_record->user_name); - } else { - error("Target Name and User Name are " - "not set for partition %s.", - bgl_record->bgl_part_id); - bgl_record->user_name = - xstrdup(slurmctld_conf.slurm_user_name); - bgl_record->target_name = - xstrdup(bgl_record->user_name); - } + if(bgl_record->state == RM_PARTITION_DEALLOCATING) { + _partition_is_deallocating(bgl_record); + } else if(skipped_dealloc) { + _partition_is_deallocating(bgl_record); + skipped_dealloc = 0; } else if(bgl_record->state == RM_PARTITION_CONFIGURING) bgl_record->boot_state = 1; diff --git a/src/plugins/select/bluegene/bluegene.c b/src/plugins/select/bluegene/bluegene.c index 8d4d93d30ae4645cd2b1ff97a6cb6947a21bba92..e2b32e645489df16ea5aaa7f661981e83c2debea 100644 --- a/src/plugins/select/bluegene/bluegene.c +++ b/src/plugins/select/bluegene/bluegene.c @@ -383,7 +383,7 @@ extern int remove_all_users(char *bgl_part_id, char *user_name) returnc = REMOVE_USER_ERR; user_count = 0; } else - debug("got %d users for %s",user_count, bgl_part_id); + debug2("got %d users for %s",user_count, bgl_part_id); for(i=0; i<user_count; i++) { if(i) { if ((rc = rm_get_data(part_ptr, diff --git a/src/slurmctld/proc_req.c b/src/slurmctld/proc_req.c index 3c9a29d6352f1fe1b5889337a392a1428a2b7a16..2029d942980e44e359dd752dee827270a71fdeed 100644 --- a/src/slurmctld/proc_req.c +++ b/src/slurmctld/proc_req.c @@ -1792,11 +1792,11 @@ static void 
_slurm_rpc_job_ready(slurm_msg_t * msg) END_TIMER; if (error_code) { - debug("_slurm_rpc_job_ready: %s", + debug2("_slurm_rpc_job_ready: %s", slurm_strerror(error_code)); slurm_send_rc_msg(msg, error_code); } else { - debug("_slurm_rpc_job_ready(%u)=%d %s", id_msg->job_id, + debug2("_slurm_rpc_job_ready(%u)=%d %s", id_msg->job_id, result, TIME_STR); rc_msg.return_code = result; response_msg.address = msg->address;