Skip to content
Snippets Groups Projects
Commit 583e6782 authored by Danny Auble's avatar Danny Auble
Browse files

fixed some issues with dynamic bgl

parent 1f4c8d62
No related branches found
No related tags found
No related merge requests found
...@@ -161,15 +161,14 @@ static void _drain_as_needed(char *node_list, char *reason) ...@@ -161,15 +161,14 @@ static void _drain_as_needed(char *node_list, char *reason)
extern int block_ready(struct job_record *job_ptr) extern int block_ready(struct job_record *job_ptr)
{ {
int rc = 1; int rc = 1;
#ifdef HAVE_BG_FILES
char *block_id = NULL; char *block_id = NULL;
bg_record_t *bg_record = NULL; bg_record_t *bg_record = NULL;
rc = select_g_get_jobinfo(job_ptr->select_jobinfo, rc = select_g_get_jobinfo(job_ptr->select_jobinfo,
SELECT_DATA_BLOCK_ID, &block_id); SELECT_DATA_BLOCK_ID, &block_id);
if (rc == SLURM_SUCCESS) { if (rc == SLURM_SUCCESS) {
bg_record = find_bg_record_in_list(bg_list, block_id);
slurm_mutex_lock(&block_state_mutex); slurm_mutex_lock(&block_state_mutex);
bg_record = find_bg_record(block_id);
if(bg_record) { if(bg_record) {
if ((bg_record->user_uid == job_ptr->user_id) if ((bg_record->user_uid == job_ptr->user_id)
...@@ -188,7 +187,6 @@ extern int block_ready(struct job_record *job_ptr) ...@@ -188,7 +187,6 @@ extern int block_ready(struct job_record *job_ptr)
xfree(block_id); xfree(block_id);
} else } else
rc = READY_JOB_ERROR; rc = READY_JOB_ERROR;
#endif
return rc; return rc;
} }
...@@ -273,10 +271,13 @@ extern int update_block_list() ...@@ -273,10 +271,13 @@ extern int update_block_list()
free(name); free(name);
continue; continue;
} }
bg_record = find_bg_record(name); bg_record = find_bg_record_in_list(bg_list, name);
if(bg_record == NULL) { if(bg_record == NULL) {
error("Block %s not found in bg_list " if(find_bg_record_in_list(bg_freeing_list, name)) {
break;
}
debug("Block %s not found in bg_list "
"removing from database", name); "removing from database", name);
term_jobs_on_block(name); term_jobs_on_block(name);
if ((rc = rm_get_data(block_ptr, if ((rc = rm_get_data(block_ptr,
...@@ -311,9 +312,15 @@ extern int update_block_list() ...@@ -311,9 +312,15 @@ extern int update_block_list()
|| (state == RM_PARTITION_ERROR)) { || (state == RM_PARTITION_ERROR)) {
rc = rm_remove_partition(name); rc = rm_remove_partition(name);
if (rc != STATUS_OK) { if (rc != STATUS_OK) {
error("rm_remove_partition(%s): %s", if(rc == PARTITION_NOT_FOUND) {
name, debug("1 block %s not found",
bg_err_str(rc)); name);
} else {
error("1 rm_remove_partition"
"(%s): %s",
name,
bg_err_str(rc));
}
} else } else
debug("done\n"); debug("done\n");
} }
......
...@@ -64,6 +64,8 @@ static void _rotate_geo(uint16_t *req_geometry, int rot_cnt) ...@@ -64,6 +64,8 @@ static void _rotate_geo(uint16_t *req_geometry, int rot_cnt)
} }
} }
pthread_mutex_t create_dynamic_mutex = PTHREAD_MUTEX_INITIALIZER;
/* /*
* finds the best match for a given job request * finds the best match for a given job request
* *
...@@ -126,6 +128,7 @@ static int _find_best_block_match(struct job_record* job_ptr, ...@@ -126,6 +128,7 @@ static int _find_best_block_match(struct job_record* job_ptr,
*found_bg_record = NULL; *found_bg_record = NULL;
try_again: try_again:
debug("got here");
slurm_mutex_lock(&block_state_mutex); slurm_mutex_lock(&block_state_mutex);
debug("number of blocks to check: %d state %d", debug("number of blocks to check: %d state %d",
list_count(bg_list), list_count(bg_list),
...@@ -135,7 +138,8 @@ try_again: ...@@ -135,7 +138,8 @@ try_again:
/* If test_only we want to fall through to tell the /* If test_only we want to fall through to tell the
scheduler that it is runnable just not right now. scheduler that it is runnable just not right now.
*/ */
debug3("job_running = %d", record->job_running); debug3("%s job_running = %d",
record->bg_block_id, record->job_running);
/*partition is being destroyed, ignore it*/ /*partition is being destroyed, ignore it*/
if(record->job_running == -2) if(record->job_running == -2)
continue; continue;
...@@ -228,12 +232,8 @@ try_again: ...@@ -228,12 +232,8 @@ try_again:
LAYOUT_DYNAMIC) { LAYOUT_DYNAMIC) {
temp_list = list_create(NULL); temp_list = list_create(NULL);
list_push(temp_list, record); list_push(temp_list, record);
num_block_to_free++;
slurm_mutex_unlock(
&block_state_mutex);
free_block_list(temp_list); free_block_list(temp_list);
slurm_mutex_lock( num_block_to_free++;
&block_state_mutex);
list_destroy(temp_list); list_destroy(temp_list);
} }
break; break;
...@@ -422,6 +422,8 @@ extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap, ...@@ -422,6 +422,8 @@ extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap,
buf, buf,
min_nodes, min_nodes,
max_nodes); max_nodes);
if(bluegene_layout_mode == LAYOUT_DYNAMIC)
slurm_mutex_lock(&create_dynamic_mutex);
rc = _find_best_block_match(job_ptr, slurm_block_bitmap, min_nodes, rc = _find_best_block_match(job_ptr, slurm_block_bitmap, min_nodes,
max_nodes, spec, &record, test_only); max_nodes, spec, &record, test_only);
...@@ -432,8 +434,12 @@ extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap, ...@@ -432,8 +434,12 @@ extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap,
select_g_set_jobinfo(job_ptr->select_jobinfo, select_g_set_jobinfo(job_ptr->select_jobinfo,
SELECT_DATA_BLOCK_ID, SELECT_DATA_BLOCK_ID,
"unassigned"); "unassigned");
/*FIX ME: isn't correct for small blocks */ if(job_ptr->num_procs < bluegene_bp_node_cnt) {
min_nodes *= bluegene_bp_node_cnt; i = procs_per_node/job_ptr->num_procs;
info("divide by %d",i);
} else
i = 1;
min_nodes *= bluegene_bp_node_cnt/i;
select_g_set_jobinfo(job_ptr->select_jobinfo, select_g_set_jobinfo(job_ptr->select_jobinfo,
SELECT_DATA_NODE_CNT, SELECT_DATA_NODE_CNT,
&min_nodes); &min_nodes);
...@@ -472,6 +478,8 @@ extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap, ...@@ -472,6 +478,8 @@ extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap,
"unassigned"); "unassigned");
} }
} }
if(bluegene_layout_mode == LAYOUT_DYNAMIC)
slurm_mutex_unlock(&create_dynamic_mutex);
return rc; return rc;
} }
...@@ -186,14 +186,16 @@ static void _sync_agent(bg_update_t *bg_update_ptr) ...@@ -186,14 +186,16 @@ static void _sync_agent(bg_update_t *bg_update_ptr)
{ {
bg_record_t * bg_record = NULL; bg_record_t * bg_record = NULL;
bg_record = find_bg_record(bg_update_ptr->bg_block_id); bg_record =
find_bg_record_in_list(bg_list, bg_update_ptr->bg_block_id);
if(!bg_record) { if(!bg_record) {
error("No block %s", bg_update_ptr->bg_block_id); error("No block %s", bg_update_ptr->bg_block_id);
return; return;
} }
slurm_mutex_lock(&block_state_mutex); slurm_mutex_lock(&block_state_mutex);
bg_record->job_running = bg_update_ptr->job_id; bg_record->job_running = bg_update_ptr->job_id;
list_push(bg_job_block_list, bg_record); if(!block_exist_in_list(bg_job_block_list, bg_record))
list_push(bg_job_block_list, bg_record);
slurm_mutex_unlock(&block_state_mutex); slurm_mutex_unlock(&block_state_mutex);
if(bg_record->state == RM_PARTITION_READY) { if(bg_record->state == RM_PARTITION_READY) {
...@@ -235,7 +237,8 @@ static void _start_agent(bg_update_t *bg_update_ptr) ...@@ -235,7 +237,8 @@ static void _start_agent(bg_update_t *bg_update_ptr)
slurm_mutex_lock(&job_start_mutex); slurm_mutex_lock(&job_start_mutex);
bg_record = find_bg_record(bg_update_ptr->bg_block_id); bg_record =
find_bg_record_in_list(bg_list, bg_update_ptr->bg_block_id);
if(!bg_record) { if(!bg_record) {
error("block %s not found in bg_list", error("block %s not found in bg_list",
bg_update_ptr->bg_block_id); bg_update_ptr->bg_block_id);
...@@ -282,6 +285,9 @@ static void _start_agent(bg_update_t *bg_update_ptr) ...@@ -282,6 +285,9 @@ static void _start_agent(bg_update_t *bg_update_ptr)
/* wait for all necessary blocks to be freed */ /* wait for all necessary blocks to be freed */
while(num_block_to_free != num_block_freed) { while(num_block_to_free != num_block_freed) {
sleep(1); sleep(1);
debug("got %d of %d freed",
num_block_freed,
num_block_to_free);
} }
if(bg_record->job_running == -1) { if(bg_record->job_running == -1) {
...@@ -420,7 +426,8 @@ static void _term_agent(bg_update_t *bg_update_ptr) ...@@ -420,7 +426,8 @@ static void _term_agent(bg_update_t *bg_update_ptr)
} }
#endif #endif
/* remove the block's users */ /* remove the block's users */
bg_record = find_bg_record(bg_update_ptr->bg_block_id); bg_record =
find_bg_record_in_list(bg_list, bg_update_ptr->bg_block_id);
if(bg_record) { if(bg_record) {
debug("got the record %s user is %s", debug("got the record %s user is %s",
bg_record->bg_block_id, bg_record->bg_block_id,
...@@ -442,7 +449,8 @@ static void _term_agent(bg_update_t *bg_update_ptr) ...@@ -442,7 +449,8 @@ static void _term_agent(bg_update_t *bg_update_ptr)
} }
slurm_mutex_lock(&block_state_mutex); slurm_mutex_lock(&block_state_mutex);
bg_record->job_running = -1; if(bg_record->job_running != -2)
bg_record->job_running = -1;
/*remove user from list */ /*remove user from list */
if(bg_record->target_name) { if(bg_record->target_name) {
...@@ -667,14 +675,23 @@ extern int start_job(struct job_record *job_ptr) ...@@ -667,14 +675,23 @@ extern int start_job(struct job_record *job_ptr)
SELECT_DATA_BLOCK_ID, &(bg_update_ptr->bg_block_id)); SELECT_DATA_BLOCK_ID, &(bg_update_ptr->bg_block_id));
select_g_get_jobinfo(job_ptr->select_jobinfo, select_g_get_jobinfo(job_ptr->select_jobinfo,
SELECT_DATA_NODE_USE, &(bg_update_ptr->node_use)); SELECT_DATA_NODE_USE, &(bg_update_ptr->node_use));
bg_record = find_bg_record(bg_update_ptr->bg_block_id); bg_record =
find_bg_record_in_list(bg_list, bg_update_ptr->bg_block_id);
if (bg_record) { if (bg_record) {
job_ptr->num_procs = (bg_record->cpus_per_bp * job_ptr->num_procs = (bg_record->cpus_per_bp *
bg_record->bp_count); bg_record->bp_count);
slurm_mutex_lock(&block_state_mutex); slurm_mutex_lock(&block_state_mutex);
bg_record->job_running = bg_update_ptr->job_id; bg_record->job_running = bg_update_ptr->job_id;
list_push(bg_job_block_list, bg_record); if(!block_exist_in_list(bg_job_block_list, bg_record))
list_push(bg_job_block_list, bg_record);
if(!block_exist_in_list(bg_booted_block_list, bg_record))
list_push(bg_booted_block_list, bg_record);
slurm_mutex_unlock(&block_state_mutex); slurm_mutex_unlock(&block_state_mutex);
} else {
error("bg_record %s does exist, requested for job (%d)",
bg_update_ptr->bg_block_id, job_ptr->job_id);
_bg_list_del(bg_update_ptr);
return SLURM_ERROR;
} }
info("Queue start of job %u in BG block %s", info("Queue start of job %u in BG block %s",
job_ptr->job_id, job_ptr->job_id,
...@@ -839,8 +856,11 @@ extern int boot_block(bg_record_t *bg_record) ...@@ -839,8 +856,11 @@ extern int boot_block(bg_record_t *bg_record)
return SLURM_ERROR; return SLURM_ERROR;
} }
slurm_mutex_unlock(&api_file_mutex); slurm_mutex_unlock(&api_file_mutex);
list_push(bg_booted_block_list, bg_record); slurm_mutex_lock(&block_state_mutex);
if(!block_exist_in_list(bg_booted_block_list, bg_record))
list_push(bg_booted_block_list, bg_record);
slurm_mutex_unlock(&block_state_mutex);
rc = 0; rc = 0;
while(rc < 10) { while(rc < 10) {
if(bg_record->state == RM_PARTITION_CONFIGURING) { if(bg_record->state == RM_PARTITION_CONFIGURING) {
...@@ -862,8 +882,9 @@ extern int boot_block(bg_record_t *bg_record) ...@@ -862,8 +882,9 @@ extern int boot_block(bg_record_t *bg_record)
last_bg_update = time(NULL); last_bg_update = time(NULL);
slurm_mutex_unlock(&block_state_mutex); slurm_mutex_unlock(&block_state_mutex);
#else #else
list_push(bg_booted_block_list, bg_record);
slurm_mutex_lock(&block_state_mutex); slurm_mutex_lock(&block_state_mutex);
if(!block_exist_in_list(bg_booted_block_list, bg_record))
list_push(bg_booted_block_list, bg_record);
bg_record->state = RM_PARTITION_READY; bg_record->state = RM_PARTITION_READY;
last_bg_update = time(NULL); last_bg_update = time(NULL);
slurm_mutex_unlock(&block_state_mutex); slurm_mutex_unlock(&block_state_mutex);
......
...@@ -145,7 +145,7 @@ static int _post_allocate(bg_record_t *bg_record) ...@@ -145,7 +145,7 @@ static int _post_allocate(bg_record_t *bg_record)
pm_partition_id_t block_id; pm_partition_id_t block_id;
struct passwd *pw_ent = NULL; struct passwd *pw_ent = NULL;
/* Add partition record to the DB */ /* Add partition record to the DB */
debug("adding partition\n"); debug2("adding partition\n");
slurm_mutex_lock(&api_file_mutex); slurm_mutex_lock(&api_file_mutex);
for(i=0;i<MAX_ADD_RETRY; i++) { for(i=0;i<MAX_ADD_RETRY; i++) {
...@@ -168,7 +168,7 @@ static int _post_allocate(bg_record_t *bg_record) ...@@ -168,7 +168,7 @@ static int _post_allocate(bg_record_t *bg_record)
} }
slurm_mutex_unlock(&api_file_mutex); slurm_mutex_unlock(&api_file_mutex);
debug("done adding\n"); debug2("done adding\n");
/* Get back the new partition id */ /* Get back the new partition id */
if ((rc = rm_get_data(bg_record->bg_block, RM_PartitionID, &block_id)) if ((rc = rm_get_data(bg_record->bg_block, RM_PartitionID, &block_id))
......
...@@ -29,7 +29,7 @@ ...@@ -29,7 +29,7 @@
#include <stdio.h> #include <stdio.h>
#define BUFSIZE 4096 #define BUFSIZE 4096
#define BITSZE 128 #define BITSIZE 128
#define MMCS_POLL_TIME 120 /* poll MMCS for down switches and nodes #define MMCS_POLL_TIME 120 /* poll MMCS for down switches and nodes
* every 120 secs */ * every 120 secs */
#define BG_POLL_TIME 0 /* poll bg blocks every 3 secs */ #define BG_POLL_TIME 0 /* poll bg blocks every 3 secs */
...@@ -46,6 +46,7 @@ List bg_curr_block_list = NULL; /* current bg blocks in bluegene.conf*/ ...@@ -46,6 +46,7 @@ List bg_curr_block_list = NULL; /* current bg blocks in bluegene.conf*/
List bg_found_block_list = NULL; /* found bg blocks already on system */ List bg_found_block_list = NULL; /* found bg blocks already on system */
List bg_job_block_list = NULL; /* jobs running in these blocks */ List bg_job_block_list = NULL; /* jobs running in these blocks */
List bg_booted_block_list = NULL; /* blocks that are booted */ List bg_booted_block_list = NULL; /* blocks that are booted */
List bg_freeing_list = NULL; /* blocks that are being freed */
char *bluegene_blrts = NULL, *bluegene_linux = NULL, *bluegene_mloader = NULL; char *bluegene_blrts = NULL, *bluegene_linux = NULL, *bluegene_mloader = NULL;
char *bluegene_ramdisk = NULL, *bridge_api_file = NULL; char *bluegene_ramdisk = NULL, *bridge_api_file = NULL;
...@@ -243,6 +244,30 @@ extern void destroy_bg_record(void *object) ...@@ -243,6 +244,30 @@ extern void destroy_bg_record(void *object)
} }
} }
/*
 * block_exist_in_list - report whether my_list already holds a block that
 *	covers the same resources as bg_record.
 *
 * Two records are treated as the same block when their node bitmaps are
 * equal and both their quarter and segment fields match; the block id is
 * not compared.
 *
 * IN my_list   - list of bg_record_t pointers to search
 * IN bg_record - record to look for
 * RET 1 if an equivalent record is found, 0 otherwise (including when
 *	either argument is NULL)
 *
 * NOTE: this function takes no lock itself.
 */
extern int block_exist_in_list(List my_list, bg_record_t *bg_record)
{
	ListIterator itr = NULL;
	bg_record_t *found_record = NULL;
	int rc = 0;

	/* Without both a list and a record there is nothing to match. */
	if (!my_list || !bg_record)
		return rc;

	itr = list_iterator_create(my_list);
	while ((found_record = (bg_record_t *) list_next(itr)) != NULL) {
		if (bit_equal(bg_record->bitmap, found_record->bitmap)
		    && (bg_record->quarter == found_record->quarter)
		    && (bg_record->segment == found_record->segment)) {
			/* The literals are concatenated by the compiler, so
			 * the first one must end with a space. */
			debug("This partition %s %d %d "
			      "already exists here %s",
			      bg_record->nodes,
			      bg_record->quarter,
			      bg_record->segment,
			      found_record->bg_block_id);
			rc = 1;
			break;
		}
	}
	list_iterator_destroy(itr);

	return rc;
}
extern void process_nodes(bg_record_t *bg_record) extern void process_nodes(bg_record_t *bg_record)
{ {
#ifdef HAVE_BG #ifdef HAVE_BG
...@@ -398,7 +423,7 @@ extern void copy_bg_record(bg_record_t *fir_record, bg_record_t *sec_record) ...@@ -398,7 +423,7 @@ extern void copy_bg_record(bg_record_t *fir_record, bg_record_t *sec_record)
sec_record->segment = fir_record->segment; sec_record->segment = fir_record->segment;
} }
extern bg_record_t *find_bg_record(char *bg_block_id) extern bg_record_t *find_bg_record_in_list(List my_list, char *bg_block_id)
{ {
ListIterator itr; ListIterator itr;
bg_record_t *bg_record = NULL; bg_record_t *bg_record = NULL;
...@@ -406,8 +431,9 @@ extern bg_record_t *find_bg_record(char *bg_block_id) ...@@ -406,8 +431,9 @@ extern bg_record_t *find_bg_record(char *bg_block_id)
if(!bg_block_id) if(!bg_block_id)
return NULL; return NULL;
if(bg_list) { if(my_list) {
itr = list_iterator_create(bg_list); slurm_mutex_lock(&block_state_mutex);
itr = list_iterator_create(my_list);
while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) { while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) {
if(bg_record->bg_block_id) if(bg_record->bg_block_id)
if (!strcmp(bg_record->bg_block_id, if (!strcmp(bg_record->bg_block_id,
...@@ -415,12 +441,13 @@ extern bg_record_t *find_bg_record(char *bg_block_id) ...@@ -415,12 +441,13 @@ extern bg_record_t *find_bg_record(char *bg_block_id)
break; break;
} }
list_iterator_destroy(itr); list_iterator_destroy(itr);
slurm_mutex_unlock(&block_state_mutex);
if(bg_record) if(bg_record)
return bg_record; return bg_record;
else else
return NULL; return NULL;
} else { } else {
error("find_bg_record: no bg_list"); error("find_bg_record_in_list: no list");
return NULL; return NULL;
} }
...@@ -443,10 +470,7 @@ extern int update_block_user(bg_record_t *bg_record, int set) ...@@ -443,10 +470,7 @@ extern int update_block_user(bg_record_t *bg_record, int set)
if((rc = remove_all_users(bg_record->bg_block_id, if((rc = remove_all_users(bg_record->bg_block_id,
bg_record->target_name)) bg_record->target_name))
== REMOVE_USER_ERR) { == REMOVE_USER_ERR) {
if(rc == INCONSISTENT_DATA error("1 Something happened removing "
&& bluegene_layout_mode == LAYOUT_DYNAMIC)
return 0;
error("Something happened removing "
"users from block %s", "users from block %s",
bg_record->bg_block_id); bg_record->bg_block_id);
return -1; return -1;
...@@ -545,6 +569,10 @@ extern int remove_all_users(char *bg_block_id, char *user_name) ...@@ -545,6 +569,10 @@ extern int remove_all_users(char *bg_block_id, char *user_name)
slurm_mutex_lock(&api_file_mutex); slurm_mutex_lock(&api_file_mutex);
if ((rc = rm_get_partition(bg_block_id, &block_ptr)) != STATUS_OK) { if ((rc = rm_get_partition(bg_block_id, &block_ptr)) != STATUS_OK) {
slurm_mutex_unlock(&api_file_mutex); slurm_mutex_unlock(&api_file_mutex);
if(rc == INCONSISTENT_DATA
&& bluegene_layout_mode == LAYOUT_DYNAMIC)
return REMOVE_USER_FOUND;
error("rm_get_partition(%s): %s", error("rm_get_partition(%s): %s",
bg_block_id, bg_block_id,
bg_err_str(rc)); bg_err_str(rc));
...@@ -601,9 +629,7 @@ extern int remove_all_users(char *bg_block_id, char *user_name) ...@@ -601,9 +629,7 @@ extern int remove_all_users(char *bg_block_id, char *user_name)
} }
} }
info("Removing user %s from Block %s", info("Removing user %s from Block %s", user, bg_block_id);
user,
bg_block_id);
if ((rc = rm_remove_part_user(bg_block_id, user)) if ((rc = rm_remove_part_user(bg_block_id, user))
!= STATUS_OK) { != STATUS_OK) {
debug("user %s isn't on block %s", debug("user %s isn't on block %s",
...@@ -905,13 +931,12 @@ extern int create_defined_blocks(bg_layout_t overlapped) ...@@ -905,13 +931,12 @@ extern int create_defined_blocks(bg_layout_t overlapped)
convert_node_use(bg_record->node_use)); convert_node_use(bg_record->node_use));
} }
list_iterator_destroy(itr); list_iterator_destroy(itr);
slurm_mutex_unlock(&block_state_mutex);
} else { } else {
error("create_defined_blocks: no bg_list 4"); error("create_defined_blocks: no bg_list 4");
slurm_mutex_unlock(&block_state_mutex);
return SLURM_ERROR; return SLURM_ERROR;
} }
slurm_mutex_unlock(&block_state_mutex);
#endif /* not have HAVE_BG_FILES */ #endif /* not have HAVE_BG_FILES */
...@@ -956,7 +981,7 @@ extern int create_dynamic_block(ba_request_t *request, List my_block_list) ...@@ -956,7 +981,7 @@ extern int create_dynamic_block(ba_request_t *request, List my_block_list)
bitstr_t *my_bitmap = NULL; bitstr_t *my_bitmap = NULL;
int geo[BA_SYSTEM_DIMENSIONS]; int geo[BA_SYSTEM_DIMENSIONS];
int i; int i;
slurm_mutex_lock(&block_state_mutex); slurm_mutex_lock(&block_state_mutex);
reset_ba_system(); reset_ba_system();
...@@ -967,13 +992,14 @@ extern int create_dynamic_block(ba_request_t *request, List my_block_list) ...@@ -967,13 +992,14 @@ extern int create_dynamic_block(ba_request_t *request, List my_block_list)
my_bitmap = my_bitmap =
bit_alloc(bit_size(bg_record->bitmap)); bit_alloc(bit_size(bg_record->bitmap));
} }
if(bg_record->bp_count>0
&& !bit_super_set(bg_record->bitmap, if(bg_record->job_running != -2
my_bitmap)) { && !bit_super_set(bg_record->bitmap, my_bitmap)) {
bit_and(my_bitmap, bg_record->bitmap); bit_or(my_bitmap, bg_record->bitmap);
for(i=0; i<BA_SYSTEM_DIMENSIONS; i++) for(i=0; i<BA_SYSTEM_DIMENSIONS; i++)
geo[i] = bg_record->geo[i]; geo[i] = bg_record->geo[i];
debug("adding %s %d%d%d %d%d%d", debug2("adding %s %d%d%d %d%d%d",
bg_record->nodes, bg_record->nodes,
bg_record->start[X], bg_record->start[X],
bg_record->start[Y], bg_record->start[Y],
...@@ -988,7 +1014,7 @@ extern int create_dynamic_block(ba_request_t *request, List my_block_list) ...@@ -988,7 +1014,7 @@ extern int create_dynamic_block(ba_request_t *request, List my_block_list)
if(!name) { if(!name) {
debug("I was unable to make the " debug("I was unable to make the "
"requested block."); "requested block.");
FREE_NULL_BITMAP(my_bitmap); bit_free(my_bitmap);
slurm_mutex_unlock(&block_state_mutex); slurm_mutex_unlock(&block_state_mutex);
return SLURM_ERROR; return SLURM_ERROR;
} }
...@@ -997,7 +1023,7 @@ extern int create_dynamic_block(ba_request_t *request, List my_block_list) ...@@ -997,7 +1023,7 @@ extern int create_dynamic_block(ba_request_t *request, List my_block_list)
} }
list_iterator_destroy(itr); list_iterator_destroy(itr);
if(my_bitmap) if(my_bitmap)
FREE_NULL_BITMAP(my_bitmap); bit_free(my_bitmap);
} else { } else {
debug("No list was given"); debug("No list was given");
} }
...@@ -1027,29 +1053,41 @@ extern int create_dynamic_block(ba_request_t *request, List my_block_list) ...@@ -1027,29 +1053,41 @@ extern int create_dynamic_block(ba_request_t *request, List my_block_list)
rc = SLURM_ERROR; rc = SLURM_ERROR;
goto finished; goto finished;
} }
if(!list_count(bg_list) || !my_block_list) { if(!list_count(bg_list) || !my_block_list) {
bg_record = NULL; bg_record = NULL;
goto no_list; goto no_list;
} }
/*Try to put block starting in the smallest of the exisiting blocks*/
itr = list_iterator_create(bg_list); itr = list_iterator_create(bg_list);
while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) { while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) {
request->rotate_count = 0; request->rotate_count = 0;
request->elongate_count = 1; request->elongate_count = 1;
if((bg_record->job_running == -1) if(!my_bitmap) {
&& (bg_record->cpus_per_bp == procs_per_node my_bitmap = bit_alloc(bit_size(bg_record->bitmap));
}
if(bg_record->job_running == -1
&& (bg_record->quarter == (uint16_t) NO_VAL
|| (bg_record->quarter == 0 || (bg_record->quarter == 0
&& (bg_record->segment == 0 && (bg_record->segment == (uint16_t) NO_VAL
|| bg_record->segment == (uint16_t)NO_VAL)))) { || bg_record->segment == 0)))) {
debug2("allocating %d%d%d %d",
bg_record->nodes,
request->start[X],
request->start[Y],
request->start[Z],
request->size);
for(i=0; i<BA_SYSTEM_DIMENSIONS; i++) for(i=0; i<BA_SYSTEM_DIMENSIONS; i++)
request->start[i] = bg_record->start[i]; request->start[i] = bg_record->start[i];
request->start_req = 1; request->start_req = 1;
rc = SLURM_SUCCESS; rc = SLURM_SUCCESS;
if (!allocate_block(request, results)){ if (!allocate_block(request, results)){
debug2("allocate failure for size %d " debug2("allocate failure for size %d "
"midplanes", "base partitions",
request->size); request->size);
rc = SLURM_ERROR; rc = SLURM_ERROR;
} else } else
...@@ -1063,7 +1101,7 @@ no_list: ...@@ -1063,7 +1101,7 @@ no_list:
request->start_req = 0; request->start_req = 0;
rc = SLURM_SUCCESS; rc = SLURM_SUCCESS;
if (!allocate_block(request, results)){ if (!allocate_block(request, results)){
debug("allocate failure for size %d midplanes", debug("allocate failure for size %d base partitions",
request->size); request->size);
rc = SLURM_ERROR; rc = SLURM_ERROR;
} }
...@@ -1079,28 +1117,9 @@ no_list: ...@@ -1079,28 +1117,9 @@ no_list:
request->conn_type, num_segment, num_quarter); request->conn_type, num_segment, num_quarter);
while((bg_record = (bg_record_t *) list_pop(results)) != NULL) { while((bg_record = (bg_record_t *) list_pop(results)) != NULL) {
itr = list_iterator_create(bg_list); if(block_exist_in_list(bg_list, bg_record))
while ((found_record = destroy_bg_record(bg_record);
(bg_record_t *) list_next(itr)) != NULL) { else {
if(bit_equal(bg_record->bitmap,
found_record->bitmap)
&& (bg_record->quarter
== found_record->quarter)
&& (bg_record->segment
== found_record->segment)){
debug("This partition %s %d %d"
"already exists here %s",
bg_record->nodes,
bg_record->quarter,
bg_record->segment,
found_record->bg_block_id);
destroy_bg_record(bg_record);
break;
}
}
list_iterator_destroy(itr);
if(!found_record) {
if(_add_block_db(bg_record, &block_inx) == SLURM_ERROR) if(_add_block_db(bg_record, &block_inx) == SLURM_ERROR)
goto finished; goto finished;
list_push(bg_list, bg_record); list_push(bg_list, bg_record);
...@@ -1205,6 +1224,7 @@ extern int create_full_system_block() ...@@ -1205,6 +1224,7 @@ extern int create_full_system_block()
error("I was unable to make the " error("I was unable to make the "
"requested block."); "requested block.");
rc = SLURM_ERROR; rc = SLURM_ERROR;
destroy_bg_record(bg_record);
goto no_total; goto no_total;
} }
xfree(name); xfree(name);
...@@ -1212,6 +1232,7 @@ extern int create_full_system_block() ...@@ -1212,6 +1232,7 @@ extern int create_full_system_block()
#ifdef HAVE_BG_FILES #ifdef HAVE_BG_FILES
if((rc = configure_block(bg_record)) == SLURM_ERROR) { if((rc = configure_block(bg_record)) == SLURM_ERROR) {
error("unable to configure block in api"); error("unable to configure block in api");
destroy_bg_record(bg_record);
goto no_total; goto no_total;
} }
#endif /* HAVE_BG_FILES */ #endif /* HAVE_BG_FILES */
...@@ -1256,19 +1277,23 @@ extern int bg_free_block(bg_record_t *bg_record) ...@@ -1256,19 +1277,23 @@ extern int bg_free_block(bg_record_t *bg_record)
} }
while (1) { while (1) {
if(!bg_record) {
error("bg_free_block: there was no bg_record");
return SLURM_ERROR;
}
if (bg_record->state != NO_VAL if (bg_record->state != NO_VAL
&& bg_record->state != RM_PARTITION_FREE && bg_record->state != RM_PARTITION_FREE
&& bg_record->state != RM_PARTITION_DEALLOCATING) { && bg_record->state != RM_PARTITION_DEALLOCATING) {
#ifdef HAVE_BG_FILES #ifdef HAVE_BG_FILES
debug("pm_destroy %s",bg_record->bg_block_id); debug2("pm_destroy %s",bg_record->bg_block_id);
slurm_mutex_lock(&api_file_mutex); slurm_mutex_lock(&api_file_mutex);
rc = pm_destroy_partition(bg_record->bg_block_id); rc = pm_destroy_partition(bg_record->bg_block_id);
slurm_mutex_unlock(&api_file_mutex);
if (rc != STATUS_OK) { if (rc != STATUS_OK) {
if(rc == PARTITION_NOT_FOUND) { if(rc == PARTITION_NOT_FOUND) {
debug("block %s is not found", debug("block %s is not found",
bg_record->bg_block_id); bg_record->bg_block_id);
slurm_mutex_unlock(&api_file_mutex);
break; break;
} else if(rc == INCOMPATIBLE_STATE) { } else if(rc == INCOMPATIBLE_STATE) {
debug2("pm_destroy_partition(%s): %s " debug2("pm_destroy_partition(%s): %s "
...@@ -1276,7 +1301,6 @@ extern int bg_free_block(bg_record_t *bg_record) ...@@ -1276,7 +1301,6 @@ extern int bg_free_block(bg_record_t *bg_record)
bg_record->bg_block_id, bg_record->bg_block_id,
bg_err_str(rc), bg_err_str(rc),
bg_record->state); bg_record->state);
continue; continue;
} }
error("pm_destroy_partition(%s): %s " error("pm_destroy_partition(%s): %s "
...@@ -1284,7 +1308,6 @@ extern int bg_free_block(bg_record_t *bg_record) ...@@ -1284,7 +1308,6 @@ extern int bg_free_block(bg_record_t *bg_record)
bg_record->bg_block_id, bg_record->bg_block_id,
bg_err_str(rc), bg_record->state); bg_err_str(rc), bg_record->state);
} }
slurm_mutex_unlock(&api_file_mutex);
#else #else
bg_record->state = RM_PARTITION_FREE; bg_record->state = RM_PARTITION_FREE;
#endif #endif
...@@ -1306,6 +1329,12 @@ extern void *mult_free_block(void *args) ...@@ -1306,6 +1329,12 @@ extern void *mult_free_block(void *args)
{ {
bg_record_t *bg_record = NULL; bg_record_t *bg_record = NULL;
slurm_mutex_lock(&freed_cnt_mutex);
if ((bg_freeing_list == NULL)
&& ((bg_freeing_list = list_create(destroy_bg_record)) == NULL))
fatal("malloc failure in bg_freeing_list");
slurm_mutex_unlock(&freed_cnt_mutex);
/* /*
* Don't just exit when there is no work left. Creating * Don't just exit when there is no work left. Creating
* pthreads from within a dynamically linked object (plugin) * pthreads from within a dynamically linked object (plugin)
...@@ -1329,6 +1358,8 @@ extern void *mult_free_block(void *args) ...@@ -1329,6 +1358,8 @@ extern void *mult_free_block(void *args)
} }
slurm_mutex_lock(&freed_cnt_mutex); slurm_mutex_lock(&freed_cnt_mutex);
free_cnt--; free_cnt--;
if(bg_freeing_list)
list_destroy(bg_freeing_list);
slurm_mutex_unlock(&freed_cnt_mutex); slurm_mutex_unlock(&freed_cnt_mutex);
return NULL; return NULL;
} }
...@@ -1337,8 +1368,15 @@ extern void *mult_free_block(void *args) ...@@ -1337,8 +1368,15 @@ extern void *mult_free_block(void *args)
extern void *mult_destroy_block(void *args) extern void *mult_destroy_block(void *args)
{ {
bg_record_t *bg_record = NULL; bg_record_t *bg_record = NULL;
bg_record_t *found_record = NULL;
int rc; int rc;
char *temp_name = NULL;
slurm_mutex_lock(&freed_cnt_mutex);
if ((bg_freeing_list == NULL)
&& ((bg_freeing_list = list_create(destroy_bg_record)) == NULL))
fatal("malloc failure in bg_freeing_list");
slurm_mutex_unlock(&freed_cnt_mutex);
/* /*
* Don't just exit when there is no work left. Creating * Don't just exit when there is no work left. Creating
* pthreads from within a dynamically linked object (plugin) * pthreads from within a dynamically linked object (plugin)
...@@ -1354,40 +1392,62 @@ extern void *mult_destroy_block(void *args) ...@@ -1354,40 +1392,62 @@ extern void *mult_destroy_block(void *args)
continue; continue;
} }
slurm_mutex_lock(&block_state_mutex); slurm_mutex_lock(&block_state_mutex);
if(bg_record->job_running == -2) if(bg_record->job_running == -2) {
slurm_mutex_unlock(&block_state_mutex);
goto already_here; goto already_here;
}
bg_record->job_running = -2; bg_record->job_running = -2;
slurm_mutex_unlock(&block_state_mutex); slurm_mutex_unlock(&block_state_mutex);
slurm_mutex_lock(&freed_cnt_mutex);
if(find_bg_record_in_list(bg_freeing_list,
bg_record->bg_block_id)) {
slurm_mutex_unlock(&freed_cnt_mutex);
goto already_here;
}
found_record = xmalloc(sizeof(bg_record_t));
found_record->bg_block_id = xstrdup(bg_record->bg_block_id);
list_push(bg_freeing_list, found_record);
slurm_mutex_unlock(&freed_cnt_mutex);
debug("removing the jobs on block %s\n", debug("removing the jobs on block %s\n",
bg_record->bg_block_id); bg_record->bg_block_id);
term_jobs_on_block(bg_record->bg_block_id); term_jobs_on_block(bg_record->bg_block_id);
debug2("destroying %s", (char *)bg_record->bg_block_id); debug2("destroying %s", (char *)bg_record->bg_block_id);
bg_free_block(bg_record); if(bg_free_block(bg_record) == SLURM_ERROR)
remove_from_bg_list(bg_list, bg_record);
if(!bg_record->bg_block_id) {
error("This one didn't have anything");
goto already_here; goto already_here;
} remove_from_bg_list(bg_list, bg_record);
#ifdef HAVE_BG_FILES #ifdef HAVE_BG_FILES
debug("removing from database %s", debug("removing from database %s",
(char *)bg_record->bg_block_id); (char *)found_record->bg_block_id);
slurm_mutex_lock(&api_file_mutex); slurm_mutex_lock(&api_file_mutex);
rc = rm_remove_partition(bg_record->bg_block_id); rc = rm_remove_partition(found_record->bg_block_id);
if (rc != STATUS_OK) { if (rc != STATUS_OK) {
error("rm_remove_partition(%s): %s", if(rc == PARTITION_NOT_FOUND) {
bg_record->bg_block_id, debug("block %s is not found",
bg_err_str(rc)); found_record->bg_block_id);
} else {
error("rm_remove_partition(%s): %s",
found_record->bg_block_id,
bg_err_str(rc));
}
} else } else
debug("done\n"); debug("done\n");
slurm_mutex_unlock(&api_file_mutex); slurm_mutex_unlock(&api_file_mutex);
#endif #endif
slurm_mutex_lock(&block_state_mutex); slurm_mutex_lock(&block_state_mutex);
if(blocks_are_created) if(blocks_are_created)
destroy_bg_record(bg_record); destroy_bg_record(bg_record);
destroy_bg_record(found_record);
slurm_mutex_unlock(&block_state_mutex); slurm_mutex_unlock(&block_state_mutex);
slurm_mutex_lock(&freed_cnt_mutex);
remove_from_bg_list(bg_freeing_list, found_record);
slurm_mutex_unlock(&freed_cnt_mutex);
already_here: already_here:
slurm_mutex_lock(&freed_cnt_mutex); slurm_mutex_lock(&freed_cnt_mutex);
num_block_freed++; num_block_freed++;
...@@ -1395,6 +1455,8 @@ extern void *mult_destroy_block(void *args) ...@@ -1395,6 +1455,8 @@ extern void *mult_destroy_block(void *args)
} }
slurm_mutex_lock(&freed_cnt_mutex); slurm_mutex_lock(&freed_cnt_mutex);
destroy_cnt--; destroy_cnt--;
if(bg_freeing_list)
list_destroy(bg_freeing_list);
slurm_mutex_unlock(&freed_cnt_mutex); slurm_mutex_unlock(&freed_cnt_mutex);
return NULL; return NULL;
} }
...@@ -1408,7 +1470,6 @@ extern int free_block_list(List delete_list) ...@@ -1408,7 +1470,6 @@ extern int free_block_list(List delete_list)
pthread_attr_t attr_agent; pthread_attr_t attr_agent;
pthread_t thread_agent; pthread_t thread_agent;
slurm_mutex_lock(&freed_cnt_mutex);
/* set up which list to push onto */ /* set up which list to push onto */
if(bluegene_layout_mode == LAYOUT_DYNAMIC) { if(bluegene_layout_mode == LAYOUT_DYNAMIC) {
block_list = &bg_destroy_block_list; block_list = &bg_destroy_block_list;
...@@ -1417,6 +1478,7 @@ extern int free_block_list(List delete_list) ...@@ -1417,6 +1478,7 @@ extern int free_block_list(List delete_list)
block_list = &bg_free_block_list; block_list = &bg_free_block_list;
count = &free_cnt; count = &free_cnt;
} }
slurm_mutex_lock(&freed_cnt_mutex);
if ((*block_list == NULL) if ((*block_list == NULL)
&& ((*block_list = list_create(NULL)) == NULL)) && ((*block_list = list_create(NULL)) == NULL))
fatal("malloc failure in free_block_list"); fatal("malloc failure in free_block_list");
...@@ -1811,124 +1873,104 @@ static int _validate_config_nodes(void) ...@@ -1811,124 +1873,104 @@ static int _validate_config_nodes(void)
if(!bg_recover) if(!bg_recover)
return SLURM_ERROR; return SLURM_ERROR;
if(bg_list) { itr_conf = list_iterator_create(bg_list);
itr_conf = list_iterator_create(bg_list); while ((bg_record = (bg_record_t*) list_next(itr_conf))) {
while ((bg_record = (bg_record_t*) list_next(itr_conf))) { /* translate hostlist to ranged
/* translate hostlist to ranged string for consistent format
string for consistent format search here
search here */
*/ node_use = SELECT_COPROCESSOR_MODE;
node_use = SELECT_COPROCESSOR_MODE; itr_curr = list_iterator_create(bg_curr_block_list);
if(bg_curr_block_list) { while ((init_bg_record = (bg_record_t*)
itr_curr = list_iterator_create( list_next(itr_curr))
bg_curr_block_list); != NULL) {
while ((init_bg_record = (bg_record_t*) if (strcasecmp(bg_record->nodes,
list_next(itr_curr)) init_bg_record->nodes))
!= NULL) { continue; /* wrong nodes */
if (strcasecmp(bg_record->nodes, if (bg_record->conn_type
init_bg_record->nodes)) != init_bg_record->conn_type)
continue; /* wrong nodes */ continue; /* wrong conn_type */
if (bg_record->conn_type if(bg_record->quarter !=
!= init_bg_record->conn_type) init_bg_record->quarter)
continue; /* wrong conn_type */ continue; /* wrong quart */
if(bg_record->quarter != if(bg_record->segment !=
init_bg_record->quarter) init_bg_record->segment)
continue; /* wrong quart */ continue; /* wrong segment */
if(bg_record->segment != copy_bg_record(init_bg_record,
init_bg_record->segment) bg_record);
continue; /* wrong segment */ break;
copy_bg_record(init_bg_record, }
bg_record); list_iterator_destroy(itr_curr);
break;
} if (!bg_record->bg_block_id) {
list_iterator_destroy(itr_curr); format_node_name(bg_record, tmp_char);
} else {
error("_validate_config_nodes: "
"no bg_curr_block_list");
}
if (!bg_record->bg_block_id) {
format_node_name(bg_record, tmp_char);
info("Block found in bluegene.conf to be "
"created: Nodes:%s",
tmp_char);
rc = SLURM_ERROR;
} else {
list_append(bg_found_block_list, bg_record);
format_node_name(bg_record, tmp_char);
info("Found existing BG BlockID:%s " info("Block found in bluegene.conf to be "
"Nodes:%s Conn:%s", "created: Nodes:%s",
bg_record->bg_block_id, tmp_char);
tmp_char, rc = SLURM_ERROR;
convert_conn_type(bg_record->conn_type));
if((bg_record->state == RM_PARTITION_READY)
|| (bg_record->state
== RM_PARTITION_CONFIGURING))
list_push(bg_booted_block_list,
bg_record);
}
}
list_iterator_destroy(itr_conf);
if(bluegene_layout_mode == LAYOUT_DYNAMIC)
goto finished;
if(bg_curr_block_list) {
itr_curr = list_iterator_create(bg_curr_block_list);
while ((init_bg_record = (bg_record_t*)
list_next(itr_curr))
!= NULL) {
debug3("%s %d %d%d%d %d%d%d",
init_bg_record->bg_block_id,
init_bg_record->bp_count,
init_bg_record->geo[X],
init_bg_record->geo[Y],
init_bg_record->geo[Z],
DIM_SIZE[X],
DIM_SIZE[Y],
DIM_SIZE[Z]);
if ((init_bg_record->geo[X] == DIM_SIZE[X])
&& (init_bg_record->geo[Y] == DIM_SIZE[Y])
&& (init_bg_record->geo[Z] == DIM_SIZE[Z]))
{
bg_record = (bg_record_t*)
xmalloc(sizeof(bg_record_t));
list_append(bg_list, bg_record);
list_append(bg_found_block_list,
bg_record);
copy_bg_record(init_bg_record,
bg_record);
bg_record->full_block = 1;
debug("full system %s",
bg_record->bg_block_id);
format_node_name(bg_record, tmp_char);
info("Found existing BG "
"BlockID:%s "
"Nodes:%s Conn:%s",
bg_record->bg_block_id,
tmp_char,
convert_conn_type(
bg_record->conn_type));
if((bg_record->state
== RM_PARTITION_READY)
|| (bg_record->state
== RM_PARTITION_CONFIGURING))
list_push(bg_booted_block_list,
bg_record);
break;
}
}
list_iterator_destroy(itr_curr);
} else { } else {
error("_validate_config_nodes: " list_push(bg_found_block_list, bg_record);
"no bg_curr_block_list 2"); format_node_name(bg_record, tmp_char);
info("Found existing BG BlockID:%s Nodes:%s Conn:%s",
bg_record->bg_block_id,
tmp_char,
convert_conn_type(bg_record->conn_type));
if(((bg_record->state == RM_PARTITION_READY)
|| (bg_record->state == RM_PARTITION_CONFIGURING))
&& !block_exist_in_list(bg_booted_block_list,
bg_record))
list_push(bg_booted_block_list, bg_record);
}
}
list_iterator_destroy(itr_conf);
if(bluegene_layout_mode == LAYOUT_DYNAMIC)
goto finished;
itr_curr = list_iterator_create(bg_curr_block_list);
while ((init_bg_record = (bg_record_t*) list_next(itr_curr))
!= NULL) {
debug3("%s %d %d%d%d %d%d%d",
init_bg_record->bg_block_id,
init_bg_record->bp_count,
init_bg_record->geo[X],
init_bg_record->geo[Y],
init_bg_record->geo[Z],
DIM_SIZE[X],
DIM_SIZE[Y],
DIM_SIZE[Z]);
if ((init_bg_record->geo[X] == DIM_SIZE[X])
&& (init_bg_record->geo[Y] == DIM_SIZE[Y])
&& (init_bg_record->geo[Z] == DIM_SIZE[Z]))
{
bg_record = (bg_record_t*)
xmalloc(sizeof(bg_record_t));
list_push(bg_list, bg_record);
list_push(bg_found_block_list, bg_record);
copy_bg_record(init_bg_record, bg_record);
bg_record->full_block = 1;
debug("full system %s",
bg_record->bg_block_id);
format_node_name(bg_record, tmp_char);
info("Found existing BG BlockID:%s Nodes:%s Conn:%s",
bg_record->bg_block_id,
tmp_char,
convert_conn_type(bg_record->conn_type));
if(((bg_record->state == RM_PARTITION_READY)
|| (bg_record->state == RM_PARTITION_CONFIGURING))
&& !block_exist_in_list(bg_booted_block_list,
bg_record))
list_push(bg_booted_block_list, bg_record);
break;
} }
finished:
if(list_count(bg_list) == list_count(bg_curr_block_list))
rc = SLURM_SUCCESS;
} else {
error("_validate_config_nodes: no bg_list");
rc = SLURM_ERROR;
} }
list_iterator_destroy(itr_curr);
finished:
if(list_count(bg_list) == list_count(bg_curr_block_list))
rc = SLURM_SUCCESS;
#endif #endif
return rc; return rc;
...@@ -2037,7 +2079,7 @@ static int _delete_old_blocks(void) ...@@ -2037,7 +2079,7 @@ static int _delete_old_blocks(void)
if ((bg_destroy_block_list == NULL) if ((bg_destroy_block_list == NULL)
&& ((bg_destroy_block_list = list_create(NULL)) == NULL)) && ((bg_destroy_block_list = list_create(NULL)) == NULL))
fatal("malloc failure in block_list"); fatal("malloc failure in block_list");
itr_curr = list_iterator_create(bg_destroy_list); itr_curr = list_iterator_create(bg_destroy_list);
while ((init_record = (bg_record_t*) list_next(itr_curr))) { while ((init_record = (bg_record_t*) list_next(itr_curr))) {
list_push(bg_destroy_block_list, init_record); list_push(bg_destroy_block_list, init_record);
...@@ -2069,7 +2111,7 @@ static int _delete_old_blocks(void) ...@@ -2069,7 +2111,7 @@ static int _delete_old_blocks(void)
} }
list_iterator_destroy(itr_curr); list_iterator_destroy(itr_curr);
slurm_mutex_unlock(&freed_cnt_mutex); slurm_mutex_unlock(&freed_cnt_mutex);
retries=30; retries=30;
while(num_block_to_free != num_block_freed) { while(num_block_to_free != num_block_freed) {
_update_bg_record_state(bg_destroy_list); _update_bg_record_state(bg_destroy_list);
...@@ -2166,7 +2208,7 @@ static int _split_block(bg_record_t *bg_record, int procs, int *block_inx) ...@@ -2166,7 +2208,7 @@ static int _split_block(bg_record_t *bg_record, int procs, int *block_inx)
error("you asked for something that was already this size"); error("you asked for something that was already this size");
return SLURM_ERROR; return SLURM_ERROR;
} }
debug("asking for %d 32s from a %d block", debug2("asking for %d 32s from a %d block",
num_segment, bg_record->node_cnt); num_segment, bg_record->node_cnt);
small_count = num_segment+num_quarter; small_count = num_segment+num_quarter;
...@@ -2191,11 +2233,15 @@ static int _split_block(bg_record_t *bg_record, int procs, int *block_inx) ...@@ -2191,11 +2233,15 @@ static int _split_block(bg_record_t *bg_record, int procs, int *block_inx)
found_record = _create_small_record(bg_record, found_record = _create_small_record(bg_record,
quarter, quarter,
segment); segment);
if(_add_block_db(found_record, block_inx) == SLURM_ERROR) if(block_exist_in_list(bg_list, found_record)) {
return SLURM_ERROR; destroy_bg_record(found_record);
} else {
list_push(bg_list, found_record); if(_add_block_db(found_record, block_inx)
print_bg_record(found_record); == SLURM_ERROR)
return SLURM_ERROR;
list_push(bg_list, found_record);
print_bg_record(found_record);
}
node_cnt += bluegene_bp_node_cnt/small_size; node_cnt += bluegene_bp_node_cnt/small_size;
if(node_cnt == 128) { if(node_cnt == 128) {
node_cnt = 0; node_cnt = 0;
...@@ -2217,174 +2263,160 @@ static int _breakup_blocks(ba_request_t *request, List my_block_list, ...@@ -2217,174 +2263,160 @@ static int _breakup_blocks(ba_request_t *request, List my_block_list,
uint16_t last_quarter = (uint16_t) NO_VAL; uint16_t last_quarter = (uint16_t) NO_VAL;
char tmp_char[256]; char tmp_char[256];
debug("proc count = %d size = %d", debug2("proc count = %d size = %d",
request->procs, request->size); request->procs, request->size);
if(bg_list) { itr = list_iterator_create(bg_list);
itr = list_iterator_create(bg_list); while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) {
if(bg_record->job_running != -1)
while ((bg_record = (bg_record_t *) list_next(itr)) continue;
!= NULL) { if(bg_record->state != RM_PARTITION_FREE)
if(bg_record->job_running > -1) continue;
continue; proc_cnt = bg_record->bp_count *
if(bg_record->state != RM_PARTITION_FREE) bg_record->cpus_per_bp;
continue; if(proc_cnt == request->procs) {
proc_cnt = bg_record->bp_count * debug2("found it here %s, %s",
bg_record->cpus_per_bp; bg_record->bg_block_id,
if(proc_cnt == request->procs) { bg_record->nodes);
debug2("found it here %s, %s", request->save_name = xmalloc(sizeof(char) * 4);
bg_record->bg_block_id, sprintf(request->save_name, "%d%d%d\0",
bg_record->nodes); bg_record->start[X],
list_iterator_destroy(itr); bg_record->start[Y],
request->save_name = bg_record->start[Z]);
xmalloc(sizeof(char) * 4); rc = SLURM_SUCCESS;
sprintf(request->save_name, goto finished;
"%d%d%d\0", }
if(bg_record->node_cnt > bluegene_bp_node_cnt)
continue;
if(proc_cnt < request->procs) {
if(last_quarter != bg_record->quarter){
last_quarter = bg_record->quarter;
total_proc_cnt = proc_cnt;
} else {
total_proc_cnt += proc_cnt;
}
debug2("1 got %d on quarter %d",
total_proc_cnt, last_quarter);
if(total_proc_cnt == request->procs) {
request->save_name = xmalloc(sizeof(char) * 4);
sprintf(request->save_name, "%d%d%d\0",
bg_record->start[X], bg_record->start[X],
bg_record->start[Y], bg_record->start[Y],
bg_record->start[Z]); bg_record->start[Z]);
rc = SLURM_SUCCESS; if(!my_block_list) {
goto finished; rc = SLURM_SUCCESS;
} goto finished;
if(bg_record->node_cnt > bluegene_bp_node_cnt)
continue;
if(proc_cnt < request->procs) {
if(last_quarter != bg_record->quarter){
last_quarter =
bg_record->quarter;
total_proc_cnt = proc_cnt;
} else {
total_proc_cnt += proc_cnt;
} }
debug2("1 got %d on quarter %d",
total_proc_cnt, last_quarter);
if(total_proc_cnt == request->procs) {
request->save_name =
xmalloc(sizeof(char) * 4);
sprintf(request->save_name,
"%d%d%d\0",
bg_record->start[X],
bg_record->start[Y],
bg_record->start[Z]);
list_iterator_destroy(itr);
if(!my_block_list) {
rc = SLURM_SUCCESS;
goto finished;
}
bg_record = _create_small_record( bg_record = _create_small_record(
bg_record, bg_record,
last_quarter, last_quarter,
(uint16_t) NO_VAL); (uint16_t) NO_VAL);
if(block_exist_in_list(bg_list, bg_record))
destroy_bg_record(bg_record);
else {
if(_add_block_db(bg_record, block_inx) if(_add_block_db(bg_record, block_inx)
== SLURM_ERROR) == SLURM_ERROR)
return SLURM_ERROR; return SLURM_ERROR;
list_push(bg_list, bg_record); list_push(bg_list, bg_record);
print_bg_record(bg_record); print_bg_record(bg_record);
rc = SLURM_SUCCESS;
goto finished;
} }
continue; rc = SLURM_SUCCESS;
goto finished;
} }
break; continue;
}
if(bg_record) {
debug("got one on the first pass");
goto found_one;
} }
list_iterator_reset(itr); break;
last_quarter = (uint16_t) NO_VAL; }
while ((bg_record = (bg_record_t *) list_next(itr)) if(bg_record) {
!= NULL) { debug2("got one on the first pass");
if(bg_record->job_running > -1) goto found_one;
continue; }
proc_cnt = bg_record->bp_count * list_iterator_reset(itr);
bg_record->cpus_per_bp; last_quarter = (uint16_t) NO_VAL;
if(proc_cnt == request->procs) { while ((bg_record = (bg_record_t *) list_next(itr))
debug2("found it here %s, %s", != NULL) {
bg_record->bg_block_id, if(bg_record->job_running != -1)
bg_record->nodes); continue;
list_iterator_destroy(itr); proc_cnt = bg_record->bp_count * bg_record->cpus_per_bp;
request->save_name = if(proc_cnt == request->procs) {
xmalloc(sizeof(char) * 4); debug2("found it here %s, %s",
sprintf(request->save_name, bg_record->bg_block_id,
"%d%d%d\0", bg_record->nodes);
request->save_name = xmalloc(sizeof(char) * 4);
sprintf(request->save_name, "%d%d%d\0",
bg_record->start[X],
bg_record->start[Y],
bg_record->start[Z]);
rc = SLURM_SUCCESS;
goto finished;
}
if(bg_record->node_cnt > bluegene_bp_node_cnt)
continue;
if(proc_cnt < request->procs) {
if(last_quarter != bg_record->quarter){
last_quarter = bg_record->quarter;
total_proc_cnt = proc_cnt;
} else {
total_proc_cnt += proc_cnt;
}
debug2("got %d on quarter %d",
total_proc_cnt, last_quarter);
if(total_proc_cnt == request->procs) {
request->save_name = xmalloc(sizeof(char) * 4);
sprintf(request->save_name, "%d%d%d\0",
bg_record->start[X], bg_record->start[X],
bg_record->start[Y], bg_record->start[Y],
bg_record->start[Z]); bg_record->start[Z]);
rc = SLURM_SUCCESS; if(!my_block_list) {
goto finished; rc = SLURM_SUCCESS;
} goto finished;
if(bg_record->node_cnt > bluegene_bp_node_cnt)
continue;
if(proc_cnt < request->procs) {
if(last_quarter != bg_record->quarter){
last_quarter =
bg_record->quarter;
total_proc_cnt = proc_cnt;
} else {
total_proc_cnt += proc_cnt;
} }
debug2("got %d on quarter %d", bg_record = _create_small_record(
total_proc_cnt, last_quarter); bg_record,
if(total_proc_cnt == request->procs) { last_quarter,
request->save_name = (uint16_t) NO_VAL);
xmalloc(sizeof(char) * 4); if(block_exist_in_list(bg_list, bg_record))
sprintf(request->save_name, destroy_bg_record(bg_record);
"%d%d%d\0", else {
bg_record->start[X],
bg_record->start[Y],
bg_record->start[Z]);
list_iterator_destroy(itr);
if(!my_block_list) {
rc = SLURM_SUCCESS;
goto finished;
}
bg_record = _create_small_record(
bg_record,
last_quarter,
(uint16_t) NO_VAL);
if(_add_block_db(bg_record, block_inx) if(_add_block_db(bg_record, block_inx)
== SLURM_ERROR) == SLURM_ERROR)
return SLURM_ERROR; return SLURM_ERROR;
list_push(bg_list, bg_record); list_push(bg_list, bg_record);
print_bg_record(bg_record); print_bg_record(bg_record);
rc = SLURM_SUCCESS;
goto finished;
} }
continue; rc = SLURM_SUCCESS;
goto finished;
} }
continue;
break; }
} break;
found_one: }
if(bg_record) { found_one:
format_node_name(bg_record, tmp_char); if(bg_record) {
format_node_name(bg_record, tmp_char);
debug("going to split %s, %s", debug2("going to split %s, %s",
bg_record->bg_block_id, bg_record->bg_block_id,
tmp_char); tmp_char);
if(_split_block(bg_record, request->procs, request->save_name = xmalloc(sizeof(char) * 4);
block_inx) sprintf(request->save_name, "%d%d%d\0",
== SLURM_SUCCESS) { bg_record->start[X],
request->save_name = bg_record->start[Y],
xmalloc(sizeof(char) * 4); bg_record->start[Z]);
sprintf(request->save_name, if(!my_block_list) {
"%d%d%d\0",
bg_record->start[X],
bg_record->start[Y],
bg_record->start[Z]);
}
list_iterator_destroy(itr);
rc = SLURM_SUCCESS; rc = SLURM_SUCCESS;
goto finished; goto finished;
} }
list_iterator_destroy(itr); _split_block(bg_record, request->procs, block_inx);
rc = SLURM_SUCCESS;
goto finished;
} }
finished: finished:
list_iterator_destroy(itr);
return rc; return rc;
} }
...@@ -2451,7 +2483,7 @@ static int _add_bg_record(List records, char *nodes, ...@@ -2451,7 +2483,7 @@ static int _add_bg_record(List records, char *nodes,
bg_record->segment = (uint16_t)NO_VAL; bg_record->segment = (uint16_t)NO_VAL;
/* bg_record->boot_state = 0; Implicit */ /* bg_record->boot_state = 0; Implicit */
/* bg_record->state = 0; Implicit */ /* bg_record->state = 0; Implicit */
debug("asking for %s %d %d",nodes, num_quarter, num_segment); debug2("asking for %s %d %d", nodes, num_quarter, num_segment);
len = strlen(nodes); len = strlen(nodes);
i=0; i=0;
while((nodes[i] != '[' && (nodes[i] > 57 || nodes[i] < 48)) while((nodes[i] != '[' && (nodes[i] > 57 || nodes[i] < 48))
...@@ -2479,7 +2511,7 @@ static int _add_bg_record(List records, char *nodes, ...@@ -2479,7 +2511,7 @@ static int _add_bg_record(List records, char *nodes,
bg_record->job_running = -1; bg_record->job_running = -1;
if(bg_record->conn_type != SELECT_SMALL) if(bg_record->conn_type != SELECT_SMALL)
list_append(records, bg_record); list_push(records, bg_record);
else { else {
if(num_segment==0 && num_quarter==0) { if(num_segment==0 && num_quarter==0) {
info("No specs given for this small block, " info("No specs given for this small block, "
...@@ -2526,7 +2558,7 @@ static int _add_bg_record(List records, char *nodes, ...@@ -2526,7 +2558,7 @@ static int _add_bg_record(List records, char *nodes,
quarter, quarter,
segment); segment);
list_append(records, found_record); list_push(records, found_record);
node_cnt += bluegene_bp_node_cnt/small_size; node_cnt += bluegene_bp_node_cnt/small_size;
if(node_cnt == 128) { if(node_cnt == 128) {
node_cnt = 0; node_cnt = 0;
......
...@@ -130,6 +130,7 @@ extern List bg_curr_block_list; /* Initial bg block state */ ...@@ -130,6 +130,7 @@ extern List bg_curr_block_list; /* Initial bg block state */
extern List bg_list; /* List of configured BG blocks */ extern List bg_list; /* List of configured BG blocks */
extern List bg_job_block_list; /* jobs running in these blocks */ extern List bg_job_block_list; /* jobs running in these blocks */
extern List bg_booted_block_list; /* blocks that are booted */ extern List bg_booted_block_list; /* blocks that are booted */
extern List bg_freeing_list; /* blocks that are being freed */
extern bool agent_fini; extern bool agent_fini;
extern pthread_mutex_t block_state_mutex; extern pthread_mutex_t block_state_mutex;
...@@ -158,11 +159,12 @@ extern void fini_bg(void); ...@@ -158,11 +159,12 @@ extern void fini_bg(void);
/* Log a bg_record's contents */ /* Log a bg_record's contents */
extern void print_bg_record(bg_record_t *record); extern void print_bg_record(bg_record_t *record);
extern void destroy_bg_record(void *object); extern void destroy_bg_record(void *object);
extern int block_exist_in_list(List my_list, bg_record_t *bg_record);
extern void process_nodes(bg_record_t *bg_record); extern void process_nodes(bg_record_t *bg_record);
extern void copy_bg_record(bg_record_t *fir_record, bg_record_t *sec_record); extern void copy_bg_record(bg_record_t *fir_record, bg_record_t *sec_record);
/* return bg_record from bg_list */ /* return bg_record from a bg_list */
extern bg_record_t *find_bg_record(char *bg_block_id); extern bg_record_t *find_bg_record_in_list(List my_list, char *bg_block_id);
/* change username of a block bg_record_t target_name needs to be /* change username of a block bg_record_t target_name needs to be
updated before call of function. updated before call of function.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment