From 6e0a4c1c76805f0d8b765a31b6ddda470cecfd4f Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Fri, 26 Jun 2009 17:49:50 +0000 Subject: [PATCH] svn merge -r17976:17985 https://eris.llnl.gov/svn/slurm/branches/slurm-2.0 --- NEWS | 7 +++++++ .../multifactor/priority_multifactor.c | 14 ++++++++++++-- src/plugins/select/bluegene/plugin/bluegene.c | 19 ++----------------- src/sacctmgr/sacctmgr.c | 3 +++ src/salloc/salloc.c | 13 +++++++------ src/srun/allocate.c | 13 +++++++------ 6 files changed, 38 insertions(+), 31 deletions(-) diff --git a/NEWS b/NEWS index dee3d412526..67e4f5f4247 100644 --- a/NEWS +++ b/NEWS @@ -68,6 +68,13 @@ documents those changes that are of interest to users and admins. -- Better logging for when job's request bad output file locations. -- Fix issue where if user specified non-existant file to write to slurmstepd will regain privileges before sending batch script ended to the controller. + -- Fix bug when using the priority_multifactor plugin with no associations yet. + -- BLUEGENE - we no longer check for the images to sync state. This was + needed long ago when rebooting blocks wasn't a possibility and should + have been removed when that functionality was available. + -- Added message about no connection with the database for sacctmgr. + -- On BlueGene, let srun or salloc exit on SIGINT if slurmctld dies while + booting its block. 
* Changes in SLURM 2.0.3 ======================== diff --git a/src/plugins/priority/multifactor/priority_multifactor.c b/src/plugins/priority/multifactor/priority_multifactor.c index b7940509650..1f5ee0b56cf 100644 --- a/src/plugins/priority/multifactor/priority_multifactor.c +++ b/src/plugins/priority/multifactor/priority_multifactor.c @@ -138,6 +138,7 @@ static int _apply_decay(double decay_factor) return SLURM_SUCCESS; xassert(assoc_mgr_association_list); + xassert(assoc_mgr_qos_list); slurm_mutex_lock(&assoc_mgr_association_lock); itr = list_iterator_create(assoc_mgr_association_list); @@ -938,7 +939,14 @@ int init ( void ) temp); calc_fairshare = 0; weight_fs = 0; - } else { + } else if(weight_fs) { + if(!assoc_mgr_root_assoc) + fatal("It appears you don't have any association " + "data from your database. " + "The priority/multifactor plugin requires " + "this information to run correctly. Please " + "check your database connection and try again."); + if(!cluster_procs) fatal("We need to have a cluster cpu count " "before we can init the priority/multifactor " @@ -960,7 +968,9 @@ int init ( void ) fatal("pthread_create error %m"); slurm_attr_destroy(&thread_attr); - } + } else + calc_fairshare = 0; + xfree(temp); verbose("%s loaded", plugin_name); diff --git a/src/plugins/select/bluegene/plugin/bluegene.c b/src/plugins/select/bluegene/plugin/bluegene.c index 074805a2111..1eedebf3614 100644 --- a/src/plugins/select/bluegene/plugin/bluegene.c +++ b/src/plugins/select/bluegene/plugin/bluegene.c @@ -1416,7 +1416,8 @@ static int _validate_config_nodes(List curr_block_list, while ((bg_record = list_next(itr_conf))) { list_iterator_reset(itr_curr); while ((init_bg_record = list_next(itr_curr))) { - if (strcasecmp(bg_record->nodes, init_bg_record->nodes)) + if (strcasecmp(bg_record->nodes, + init_bg_record->nodes)) continue; /* wrong nodes */ if(!bit_equal(bg_record->ionode_bitmap, init_bg_record->ionode_bitmap)) @@ -1424,28 +1425,12 @@ static int 
_validate_config_nodes(List curr_block_list, #ifdef HAVE_BGL if (bg_record->conn_type != init_bg_record->conn_type) continue; /* wrong conn_type */ - if(bg_record->blrtsimage && - strcasecmp(bg_record->blrtsimage, - init_bg_record->blrtsimage)) - continue; #else if ((bg_record->conn_type != init_bg_record->conn_type) && ((bg_record->conn_type < SELECT_SMALL) && (init_bg_record->conn_type < SELECT_SMALL))) continue; /* wrong conn_type */ #endif - if(bg_record->linuximage && - strcasecmp(bg_record->linuximage, - init_bg_record->linuximage)) - continue; - if(bg_record->mloaderimage && - strcasecmp(bg_record->mloaderimage, - init_bg_record->mloaderimage)) - continue; - if(bg_record->ramdiskimage && - strcasecmp(bg_record->ramdiskimage, - init_bg_record->ramdiskimage)) - continue; copy_bg_record(init_bg_record, bg_record); /* remove from the curr list since we just diff --git a/src/sacctmgr/sacctmgr.c b/src/sacctmgr/sacctmgr.c index b62c43033dc..32841d82a7d 100644 --- a/src/sacctmgr/sacctmgr.c +++ b/src/sacctmgr/sacctmgr.c @@ -193,11 +193,14 @@ main (int argc, char *argv[]) errno = 0; db_conn = acct_storage_g_get_connection(false, 0, 1); if(errno != SLURM_SUCCESS) { + int tmp_errno = errno; if((input_field_count == 2) && (!strncasecmp(argv[2], "Configuration", strlen(argv[1]))) && ((!strncasecmp(argv[1], "list", strlen(argv[0]))) || (!strncasecmp(argv[1], "show", strlen(argv[0]))))) sacctmgr_list_config(false); + errno = tmp_errno; + fprintf(stderr, "Problem talking to the database: %m\n"); exit(1); } my_uid = getuid(); diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c index 68fee323429..5f3a77243df 100644 --- a/src/salloc/salloc.c +++ b/src/salloc/salloc.c @@ -102,7 +102,7 @@ static void _node_fail_handler(srun_node_fail_msg_t *msg); #define POLL_SLEEP 3 /* retry interval in seconds */ static int _wait_bluegene_block_ready( resource_allocation_response_msg_t *alloc); -static int _blocks_dealloc(); +static int _blocks_dealloc(void); #endif #ifdef HAVE_CRAY_XT @@ 
-689,7 +689,7 @@ static int _wait_bluegene_block_ready(resource_allocation_response_msg_t *alloc) &block_id); for (i=0; (cur_delay < max_delay); i++) { - if(i == 1) + if (i == 1) info("Waiting for block %s to become ready for job", block_id); if (i) { @@ -712,13 +712,14 @@ static int _wait_bluegene_block_ready(resource_allocation_response_msg_t *alloc) is_ready = 1; break; } + if (allocation_interrupted) + break; } if (is_ready) info("Block %s is ready for job", block_id); - else if(!allocation_interrupted) + else if (!allocation_interrupted) error("Block %s still not ready", block_id); - else /* this should never happen, but if allocation_intrrupted - send back not ready */ + else /* allocation_interrupted and slurmctld not responding */ is_ready = 0; xfree(block_id); @@ -734,7 +735,7 @@ static int _wait_bluegene_block_ready(resource_allocation_response_msg_t *alloc) * 0: no deallocate in progress * -1: error occurred */ -static int _blocks_dealloc() +static int _blocks_dealloc(void) { static node_select_info_msg_t *bg_info_ptr = NULL, *new_bg_ptr = NULL; int rc = 0, error_code = 0, i; diff --git a/src/srun/allocate.c b/src/srun/allocate.c index a52a56b9725..0b57e6fcf3a 100644 --- a/src/srun/allocate.c +++ b/src/srun/allocate.c @@ -102,7 +102,7 @@ static void _intr_handler(int signo); #define POLL_SLEEP 3 /* retry interval in seconds */ static int _wait_bluegene_block_ready( resource_allocation_response_msg_t *alloc); -static int _blocks_dealloc(); +static int _blocks_dealloc(void); #endif #ifdef HAVE_CRAY_XT @@ -237,7 +237,7 @@ static int _wait_bluegene_block_ready(resource_allocation_response_msg_t *alloc) &block_id); for (i=0; (cur_delay < max_delay); i++) { - if(i == 1) + if (i == 1) debug("Waiting for block %s to become ready for job", block_id); if (i) { @@ -260,13 +260,14 @@ static int _wait_bluegene_block_ready(resource_allocation_response_msg_t *alloc) is_ready = 1; break; } + if (destroy_job) + break; } if (is_ready) debug("Block %s is ready for job", 
block_id); - else if(!destroy_job) + else if (!destroy_job) error("Block %s still not ready", block_id); - else /* this should never happen, but if destroy_job - send back not ready */ + else /* destroy_job set and slurmctld not responding */ is_ready = 0; xfree(block_id); @@ -282,7 +283,7 @@ static int _wait_bluegene_block_ready(resource_allocation_response_msg_t *alloc) * 0: no deallocate in progress * -1: error occurred */ -static int _blocks_dealloc() +static int _blocks_dealloc(void) { static node_select_info_msg_t *bg_info_ptr = NULL, *new_bg_ptr = NULL; int rc = 0, error_code = 0, i; -- GitLab