From 64542be0340cf2ba813f96debb80e340f9b0a788 Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Wed, 25 Oct 2006 23:00:04 +0000 Subject: [PATCH] svn merge -r9908:9923 https://eris.llnl.gov/svn/slurm/branches/slurm-1.1 --- NEWS | 6 + doc/man/man5/wiki.conf.5 | 6 +- .../slurm_protocol_socket_implementation.c | 23 +++- src/plugins/sched/wiki2/event.c | 8 +- src/plugins/sched/wiki2/msg.c | 4 + .../select/bluegene/plugin/bg_block_info.c | 47 +++++++- .../select/bluegene/plugin/bg_job_run.c | 7 +- .../bluegene/plugin/bg_switch_connections.c | 37 ++++-- src/plugins/select/bluegene/plugin/bluegene.c | 106 ++++++++++++++++-- src/plugins/select/bluegene/plugin/bluegene.h | 2 +- src/slurmctld/controller.c | 8 +- testsuite/expect/test9.7.bash | 19 ++-- 12 files changed, 227 insertions(+), 46 deletions(-) diff --git a/NEWS b/NEWS index c63efe418a8..9b386ff3ba5 100644 --- a/NEWS +++ b/NEWS @@ -89,6 +89,12 @@ documents those changes that are of interest to users and admins. - In sched/wiki2, fix memory management bug for JOBWILLRUN command. - In sched/wiki2, consider job Busy while in Completing state for KillWait+10 seconds (used to be 30 seconds). + - BLUEGENE - Fixes to allow full block creation on the system and not to add + passthrough nodes to the allocation when creating a block. + - BLUEGENE - Fix deadlock issue with starting and failing jobs at the same + time + - Make connect() non-blocking and poll to avoid possibly very long default + timeout. * Changes in SLURM 1.1.17 ========================= diff --git a/doc/man/man5/wiki.conf.5 b/doc/man/man5/wiki.conf.5 index 772ff3c0f46..8c311088f38 100644 --- a/doc/man/man5/wiki.conf.5 +++ b/doc/man/man5/wiki.conf.5 @@ -25,15 +25,15 @@ This numeric value should match KEY configured in the \fBEHost\fR Name the computer on which Moab server executes. It is used in establishing a communications path for event notification. -By default the \fBEHost\fR will be identical in value to the +By default \fBEHost\fR will be identical in value to the \fBControlAddr\fR configured in slurm.conf. .TP \fBEHostBackup\fR Name the computer on which the backup Moab server executes. It is used in establishing a communications path for event notification. -There is no default value for \fBEHostBackup\fR (no backup -controller is configured). +By default \fBEHostBackup\fR will be identical in value to the +\fBBackupAddr\fR configured in slurm.conf. .TP \fBEPort\fR diff --git a/src/common/slurm_protocol_socket_implementation.c b/src/common/slurm_protocol_socket_implementation.c index a179ac0733f..c6b0980d0e1 100644 --- a/src/common/slurm_protocol_socket_implementation.c +++ b/src/common/slurm_protocol_socket_implementation.c @@ -565,7 +565,28 @@ extern int _slurm_getsockname (int __fd, struct sockaddr * __addr, extern int _slurm_connect (int __fd, struct sockaddr const * __addr, socklen_t __len) { - return connect ( __fd , __addr , __len ) ; + /* From "man connect": Note that for IP sockets the timeout + * may be very long when syncookies are enabled on the server. + * + * Timeouts in excess of 3 minutes have been observed, resulting + * in serious problems for slurmctld. Making the connect call + * non-blocking and polling seems to fix the problem. */ + int rc = -1, flags; + + flags = fcntl(__fd, F_GETFL); + fcntl(__fd, F_SETFL, flags | O_NONBLOCK); + rc = connect ( __fd , __addr , __len ) ; + if ((rc == -1) + && ((errno == EINPROGRESS) || (errno == EALREADY))) { + struct pollfd ufds; + ufds.fd = __fd; + ufds.events = POLLOUT; + ufds. revents = 0; + poll(&ufds, 1, 5000); /* 5 sec max wait */ + rc = connect ( __fd , __addr , __len ) ; + } + fcntl(__fd, F_SETFL, flags); + return rc; } /* Put the address of the peer connected to socket FD into *ADDR diff --git a/src/plugins/sched/wiki2/event.c b/src/plugins/sched/wiki2/event.c index 86b9a65bcc9..39d4eb0efc4 100644 --- a/src/plugins/sched/wiki2/event.c +++ b/src/plugins/sched/wiki2/event.c @@ -77,8 +77,11 @@ extern int event_notify(char *msg) } } event_fd = slurm_open_msg_conn(&moab_event_addr); - if ((event_fd == -1) && (event_addr_set == 2)) + if ((event_fd == -1) && (event_addr_set == 2)) { + debug("Unable to open wiki event port %s:%u: %m", + e_host, e_port); event_fd = slurm_open_msg_conn(&moab_event_addr_bu); + } if (event_fd == -1) { char *host_name; if (event_addr_set == 2) @@ -88,6 +91,9 @@ extern int event_notify(char *msg) error("Unable to open wiki event port %s:%u: %m", host_name, e_port); pthread_mutex_unlock(&event_mutex); + /* Don't retry again for a while (10 mins) + * to avoid long delays from ETIMEDOUT */ + last_notify_time = now + 600; return -1; } diff --git a/src/plugins/sched/wiki2/msg.c b/src/plugins/sched/wiki2/msg.c index 34d1f92dd66..c6a151edb23 100644 --- a/src/plugins/sched/wiki2/msg.c +++ b/src/plugins/sched/wiki2/msg.c @@ -211,6 +211,10 @@ static void _parse_wiki_config(void) /* Set default values */ conf = slurm_conf_lock(); strncpy(e_host, conf->control_addr, sizeof(e_host)); + if (conf->backup_addr) { + strncpy(e_host_bu, conf->backup_addr, + sizeof(e_host)); + } kill_wait = conf->kill_wait; slurm_conf_unlock(); diff --git a/src/plugins/select/bluegene/plugin/bg_block_info.c b/src/plugins/select/bluegene/plugin/bg_block_info.c index 9878b3f0fb8..f2103a91b9a 100644 --- a/src/plugins/select/bluegene/plugin/bg_block_info.c +++ b/src/plugins/select/bluegene/plugin/bg_block_info.c @@ -70,7 +70,15 @@ #define RETRY_BOOT_COUNT 3 #ifdef HAVE_BG_FILES -static int _block_is_deallocating(bg_record_t *bg_record); + +typedef struct { + int jobid; +} kill_job_struct_t; + +List kill_job_list = NULL; + +static int _block_is_deallocating(bg_record_t *bg_record); +static void _destroy_kill_struct(void *object); static int _block_is_deallocating(bg_record_t *bg_record) { @@ -86,11 +94,17 @@ static int _block_is_deallocating(bg_record_t *bg_record) } slurm_conf_unlock(); + if(jobid > -1) { + kill_job_struct_t *freeit = xmalloc(sizeof(kill_job_struct_t)); + freeit->jobid = jobid; + list_push(kill_job_list, freeit); + } if(bg_record->target_name && bg_record->user_name) { if(!strcmp(bg_record->target_name, user_name)) { if(strcmp(bg_record->target_name, - bg_record->user_name)) { + bg_record->user_name) + || (jobid > -1)) { error("Block %s was in a ready state " "for user %s but is being freed. " "Job %d was lost.", @@ -98,8 +112,6 @@ static int _block_is_deallocating(bg_record_t *bg_record) bg_record->user_name, jobid); - if(jobid > -1) - slurm_fail_job(jobid); if(remove_from_bg_list(bg_job_block_list, bg_record) == SLURM_SUCCESS) { @@ -136,6 +148,15 @@ static int _block_is_deallocating(bg_record_t *bg_record) return SLURM_SUCCESS; } +static void _destroy_kill_struct(void *object) +{ + kill_job_struct_t *freeit = (kill_job_struct_t *)object; + + if(freeit) { + xfree(freeit); + } +} + #endif @@ -210,8 +231,12 @@ extern int update_block_list() bg_record_t *bg_record = NULL; time_t now; int skipped_dealloc = 0; + kill_job_struct_t *freeit = NULL; ListIterator itr = NULL; + if(!kill_job_list) + kill_job_list = list_create(_destroy_kill_struct); + if(!bg_list) return updated; @@ -363,7 +388,12 @@ extern int update_block_list() case RM_PARTITION_READY: debug("block %s is ready.", bg_record->bg_block_id); - set_block_user(bg_record); + if(set_block_user(bg_record) == SLURM_ERROR) { + freeit = xmalloc( + sizeof(kill_job_struct_t)); + freeit->jobid = bg_record->job_running; + list_push(kill_job_list, freeit); + } break; default: debug("Hey the state of the " @@ -381,6 +411,13 @@ extern int update_block_list() } list_iterator_destroy(itr); slurm_mutex_unlock(&block_state_mutex); + + /* kill all the jobs from unexpectedly freed blocks */ + while((freeit = list_pop(kill_job_list))) { + debug2("killing job %d", freeit->jobid); + (void) slurm_fail_job(freeit->jobid); + _destroy_kill_struct(freeit); + } #endif return updated; diff --git a/src/plugins/select/bluegene/plugin/bg_job_run.c b/src/plugins/select/bluegene/plugin/bg_job_run.c index e65a45cc3d6..107ae095a90 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_run.c +++ b/src/plugins/select/bluegene/plugin/bg_job_run.c @@ -253,15 +253,14 @@ static void _start_agent(bg_update_t *bg_update_ptr) bg_record = find_bg_record_in_list(bg_list, bg_update_ptr->bg_block_id); - slurm_mutex_lock(&block_state_mutex); if(!bg_record) { - slurm_mutex_unlock(&block_state_mutex); error("block %s not found in bg_list", bg_update_ptr->bg_block_id); (void) slurm_fail_job(bg_update_ptr->job_id); slurm_mutex_unlock(&job_start_mutex); return; } + slurm_mutex_lock(&block_state_mutex); if(bg_record->job_running <= NO_JOB_RUNNING) { slurm_mutex_unlock(&block_state_mutex); slurm_mutex_unlock(&job_start_mutex); @@ -545,7 +544,7 @@ static void _term_agent(bg_update_t *bg_update_ptr) static void *_block_agent(void *args) { bg_update_t *bg_update_ptr = NULL; - + /* * Don't just exit when there is no work left. Creating * pthreads from within a dynamically linked object (plugin) @@ -562,7 +561,7 @@ static void *_block_agent(void *args) } if (bg_update_ptr->op == START_OP) _start_agent(bg_update_ptr); - else if (bg_update_ptr->op == TERM_OP) + else if (bg_update_ptr->op == TERM_OP) _term_agent(bg_update_ptr); else if (bg_update_ptr->op == SYNC_OP) _sync_agent(bg_update_ptr); diff --git a/src/plugins/select/bluegene/plugin/bg_switch_connections.c b/src/plugins/select/bluegene/plugin/bg_switch_connections.c index 2cb58413e5e..641a329b506 100644 --- a/src/plugins/select/bluegene/plugin/bg_switch_connections.c +++ b/src/plugins/select/bluegene/plugin/bg_switch_connections.c @@ -228,7 +228,7 @@ static int _lookat_path(bg_bp_t *bg_bp, ba_switch_t *curr_switch, if(port_tar == curr_switch->ext_wire[port_tar].port_tar) { //list_delete(conn_itr); //continue; - debug3("I found these %d %d",port_tar, + debug3("I found these %d %d", port_tar, curr_switch->ext_wire[port_tar].port_tar); } if(((bg_conn->source == port_tar) @@ -261,16 +261,16 @@ static int _lookat_path(bg_bp_t *bg_bp, ba_switch_t *curr_switch, /* set source to the node you are on */ node_src = curr_switch->ext_wire[0].node_tar; - debug("dim %d trying from %d%d%d %d -> %d%d%d %d", - dim, - node_src[X], - node_src[Y], - node_src[Z], - port_tar1, - node_tar[X], - node_tar[Y], - node_tar[Z], - port_tar); + debug2("dim %d trying from %d%d%d %d -> %d%d%d %d", + dim, + node_src[X], + node_src[Y], + node_src[Z], + port_tar1, + node_tar[X], + node_tar[Y], + node_tar[Z], + port_tar); bg_itr = list_iterator_create(bg_bp_list); @@ -520,13 +520,26 @@ extern int configure_block_switches(bg_record_t * bg_record) int first_bp=1; int first_switch=1; + if(!bg_record->bg_block_list) { + error("There was no block_list given, can't create block"); + return SLURM_ERROR; + } + bg_bp_list = list_create(NULL); bg_record->switch_count = 0; bg_record->bp_count = 0; itr = list_iterator_create(bg_record->bg_block_list); while ((ba_node = (ba_node_t *) list_next(itr)) != NULL) { - debug2("node %d%d%d", + if(!ba_node->used) { + debug3("%d%d%d is a passthrough, " + "not including in request", + ba_node->coord[X], + ba_node->coord[Y], + ba_node->coord[Z]); + continue; + } + debug2("using node %d%d%d", ba_node->coord[X], ba_node->coord[Y], ba_node->coord[Z]); diff --git a/src/plugins/select/bluegene/plugin/bluegene.c b/src/plugins/select/bluegene/plugin/bluegene.c index 2cf46e14b6f..8e7c4deadcf 100644 --- a/src/plugins/select/bluegene/plugin/bluegene.c +++ b/src/plugins/select/bluegene/plugin/bluegene.c @@ -100,6 +100,7 @@ static int _addto_node_list(bg_record_t *bg_record, int *start, int *end); static void _set_bg_lists(); static int _validate_config_nodes(void); static int _bg_record_cmpf_inc(bg_record_t *rec_a, bg_record_t *rec_b); +static int _ba_node_cmpf_inc(ba_node_t *node_a, ba_node_t *node_b); static int _delete_old_blocks(void); static char *_get_bg_conf(void); static int _add_block_db(bg_record_t *bg_record, int *block_inx); @@ -358,9 +359,16 @@ extern void process_nodes(bg_record_t *bg_record) end[X] = -1; end[Y] = -1; end[Z] = -1; - + + list_sort(bg_record->bg_block_list, (ListCmpF) _ba_node_cmpf_inc); + itr = list_iterator_create(bg_record->bg_block_list); while ((ba_node = list_next(itr)) != NULL) { + debug4("%d%d%d is included in this block", + ba_node->coord[X], + ba_node->coord[Y], + ba_node->coord[Z]); + if(ba_node->coord[X]>end[X]) { bg_record->geo[X]++; end[X] = ba_node->coord[X]; @@ -375,10 +383,12 @@ extern void process_nodes(bg_record_t *bg_record) } } list_iterator_destroy(itr); - debug3("geo = %d%d%d\n", + debug3("geo = %d%d%d bp count is %d\n", bg_record->geo[X], bg_record->geo[Y], - bg_record->geo[Z]); + bg_record->geo[Z], + bg_record->bp_count); + if ((bg_record->geo[X] == DIM_SIZE[X]) && (bg_record->geo[Y] == DIM_SIZE[Y]) && (bg_record->geo[Z] == DIM_SIZE[Z])) @@ -425,6 +435,7 @@ extern void copy_bg_record(bg_record_t *fir_record, bg_record_t *sec_record) sec_record->switch_count = fir_record->switch_count; sec_record->boot_state = fir_record->boot_state; sec_record->boot_count = fir_record->boot_count; + sec_record->full_block = fir_record->full_block; for(i=0;i<BA_SYSTEM_DIMENSIONS;i++) { sec_record->geo[i] = fir_record->geo[i]; @@ -755,7 +766,11 @@ extern int remove_all_users(char *bg_block_id, char *user_name) return returnc; } -extern void set_block_user(bg_record_t *bg_record) +/* if SLURM_ERROR you will need to fail the job with + slurm_fail_job(bg_record->job_running); +*/ + +extern int set_block_user(bg_record_t *bg_record) { int rc = 0; debug("resetting the boot state flag and " @@ -766,16 +781,18 @@ extern void set_block_user(bg_record_t *bg_record) slurm_conf_lock(); if((rc = update_block_user(bg_record, 1)) == 1) { last_bg_update = time(NULL); + rc = SLURM_SUCCESS; } else if (rc == -1) { error("Unable to add user name to block %s. " "Cancelling job.", bg_record->bg_block_id); - (void) slurm_fail_job(bg_record->job_running); + rc = SLURM_ERROR; } xfree(bg_record->target_name); bg_record->target_name = xstrdup(slurmctld_conf.slurm_user_name); - slurm_conf_unlock(); + slurm_conf_unlock(); + return rc; } extern char* convert_lifecycle(lifecycle_type_t lifecycle) @@ -937,6 +954,7 @@ extern int create_defined_blocks(bg_layout_t overlapped) "no bg_found_block_list 1"); } if(bg_record->bp_count>0 + && !bg_record->full_block && bg_record->cpus_per_bp == procs_per_node) { char *name = NULL; if(overlapped == LAYOUT_OVERLAP) @@ -996,6 +1014,22 @@ extern int create_defined_blocks(bg_layout_t overlapped) } } if(found_record == NULL) { + if(bg_record->full_block) { + /* if this is defined we need + to remove it since we are + going to try to create it + later on overlap systems + this doesn't matter, but + since we don't clear the + table on static mode we + can't do it here or it just + won't work since other + wires will be or are + already set + */ + list_remove(itr); + continue; + } #ifdef HAVE_BG_FILES if((rc = configure_block(bg_record)) == SLURM_ERROR) { @@ -1952,6 +1986,8 @@ static int _validate_config_nodes(void) #ifdef HAVE_BG_FILES bg_record_t* bg_record = NULL; bg_record_t* init_bg_record = NULL; + bg_record_t* full_system_bg_record = NULL; + int full_created = 0; ListIterator itr_conf; ListIterator itr_curr; rm_partition_mode_t node_use; @@ -1962,7 +1998,15 @@ static int _validate_config_nodes(void) if(!bg_recover) return SLURM_ERROR; - + + if(!bg_curr_block_list) + return SLURM_ERROR; + + itr_curr = list_iterator_create(bg_curr_block_list); + while ((init_bg_record = list_next(itr_curr))) + if(init_bg_record->full_block) + full_system_bg_record = init_bg_record; + itr_conf = list_iterator_create(bg_list); while ((bg_record = (bg_record_t*) list_next(itr_conf))) { /* translate hostlist to ranged @@ -1970,7 +2014,7 @@ static int _validate_config_nodes(void) search here */ node_use = SELECT_COPROCESSOR_MODE; - itr_curr = list_iterator_create(bg_curr_block_list); + list_iterator_reset(itr_curr); while ((init_bg_record = list_next(itr_curr))) { if (strcasecmp(bg_record->nodes, init_bg_record->nodes)) @@ -1988,7 +2032,6 @@ static int _validate_config_nodes(void) bg_record); break; } - list_iterator_destroy(itr_curr); if (!bg_record->bg_block_id) { format_node_name(bg_record, tmp_char); @@ -1997,6 +2040,9 @@ static int _validate_config_nodes(void) tmp_char); rc = SLURM_ERROR; } else { + if(bg_record->full_block) + full_created = 1; + list_push(bg_found_block_list, bg_record); format_node_name(bg_record, tmp_char); info("Existing: BlockID:%s Nodes:%s Conn:%s", @@ -2011,8 +2057,26 @@ static int _validate_config_nodes(void) } } list_iterator_destroy(itr_conf); + list_iterator_destroy(itr_curr); if(bluegene_layout_mode == LAYOUT_DYNAMIC) goto finished; + + if(!full_created && full_system_bg_record) { + bg_record = xmalloc(sizeof(bg_record_t)); + copy_bg_record(full_system_bg_record, bg_record); + list_push(bg_list, bg_record); + list_push(bg_found_block_list, bg_record); + format_node_name(bg_record, tmp_char); + info("Existing: BlockID:%s Nodes:%s Conn:%s", + bg_record->bg_block_id, + tmp_char, + convert_conn_type(bg_record->conn_type)); + if(((bg_record->state == RM_PARTITION_READY) + || (bg_record->state == RM_PARTITION_CONFIGURING)) + && !block_exist_in_list(bg_booted_block_list, + bg_record)) + list_push(bg_booted_block_list, bg_record); + } finished: if(list_count(bg_list) == list_count(bg_curr_block_list)) @@ -2056,6 +2120,30 @@ static int _bg_record_cmpf_inc(bg_record_t* rec_a, bg_record_t* rec_b) return 0; } +static int _ba_node_cmpf_inc(ba_node_t *node_a, ba_node_t *node_b) +{ + if (node_a->coord[X] < node_b->coord[X]) + return -1; + else if (node_a->coord[X] > node_b->coord[X]) + return 1; + + if (node_a->coord[Y] < node_b->coord[Y]) + return -1; + else if (node_a->coord[Y] > node_b->coord[Y]) + return 1; + + if (node_a->coord[Z] < node_b->coord[Z]) + return -1; + else if (node_a->coord[Z] > node_b->coord[Z]) + return 1; + + error("You have the node %d%d%d in the list twice", + node_a->coord[X], + node_a->coord[Y], + node_a->coord[Z]); + return 0; +} + static int _delete_old_blocks(void) { #ifdef HAVE_BG_FILES diff --git a/src/plugins/select/bluegene/plugin/bluegene.h b/src/plugins/select/bluegene/plugin/bluegene.h index 5f31a3a38a8..fd586d77d95 100644 --- a/src/plugins/select/bluegene/plugin/bluegene.h +++ b/src/plugins/select/bluegene/plugin/bluegene.h @@ -198,7 +198,7 @@ extern bool blocks_overlap(bg_record_t *rec_a, bg_record_t *rec_b); #define REMOVE_USER_NONE 0 #define REMOVE_USER_FOUND 2 extern int remove_all_users(char *bg_block_id, char *user_name); -extern void set_block_user(bg_record_t *bg_record); +extern int set_block_user(bg_record_t *bg_record); /* Return strings representing blue gene data types */ extern char *convert_lifecycle(lifecycle_type_t lifecycle); diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index b2b5ee498b2..4504205280e 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -1031,6 +1031,7 @@ static int _shutdown_backup_controller(int wait_time) { int rc; slurm_msg_t req; + DEF_TIMERS; slurm_msg_t_init(&req); if ((slurmctld_conf.backup_addr == NULL) || @@ -1045,13 +1046,14 @@ static int _shutdown_backup_controller(int wait_time) /* send request message */ req.msg_type = REQUEST_CONTROL; + START_TIMER; if (slurm_send_recv_rc_msg_only_one(&req, &rc, CONTROL_TIMEOUT) < 0) { - error("shutdown_backup:send/recv: %m"); + END_TIMER; + error("_shutdown_backup_controller:send/recv: %m, %s", TIME_STR); return SLURM_ERROR; } - if (rc) { - error("shutdown_backup: %s", slurm_strerror(rc)); + error("_shutdown_backup_controller: %s", slurm_strerror(rc)); return SLURM_ERROR; } debug("backup controller has relinquished control"); diff --git a/testsuite/expect/test9.7.bash b/testsuite/expect/test9.7.bash index 25dfb8198ba..102349ed754 100755 --- a/testsuite/expect/test9.7.bash +++ b/testsuite/expect/test9.7.bash @@ -53,14 +53,15 @@ else iterations=3 fi -if [ $5 ]; then - inx=512 -else - inx=1 -fi +bluegene=0 +if [ $# -gt 5 ]; then + if [ $5 ]; then + bluegene=1 + fi +fi exit_code=0 - +inx=1 log="test9.7.$$.output" touch $log while [ $inx -le $iterations ] @@ -73,7 +74,11 @@ do exit_code=$rc fi sleep $sleep_time - $exec2 -N1-$inx -n$inx -O -s -l hostname >>$log 2>&1 + if [ $bluegene ]; then + $exec2 -N1-512 -n1 -s -l hostname >>$log 2>&1 + else + $exec2 -N1-$inx -n$inx -O -s -l hostname >>$log 2>&1 + fi rc=$? if [ $rc -ne 0 ]; then echo "exec2 rc=$rc" >> $log -- GitLab