diff --git a/NEWS b/NEWS index a120c1fdea265fa166bf52f3ffcb63231c1d26c2..0671c1bb8c71187e39fb89846b5af9722a6c6861 100644 --- a/NEWS +++ b/NEWS @@ -188,6 +188,7 @@ documents those changes that are of interest to users and admins. * Changes in SLURM 1.1.24 ========================= - In sched/wiki2, add support for direct "srun --dependency=" use. + - mpi/mvapich: Add support for MVAPICH protocol version 6. * Changes in SLURM 1.1.23 ========================= diff --git a/src/plugins/mpi/mvapich/mvapich.c b/src/plugins/mpi/mvapich/mvapich.c index cbdf7a21c8190cbe60615d8fe2e2cacc02898c93..5891018d44604941624a5b657666f84117a26c3a 100644 --- a/src/plugins/mpi/mvapich/mvapich.c +++ b/src/plugins/mpi/mvapich/mvapich.c @@ -126,7 +126,7 @@ static struct mvapich_info **mvarray = NULL; static int mvapich_fd = -1; static int nprocs = -1; static int protocol_version = -1; -static int v5_phase = 0; +static int protocol_phase = 0; static int connect_once = 1; static int mvapich_verbose = 0; static int do_timing = 0; @@ -164,11 +164,20 @@ static void mvapich_info_destroy (struct mvapich_info *mvi) static int mvapich_requires_pids (void) { if ( protocol_version == MVAPICH_VERSION_REQUIRES_PIDS - || protocol_version == 5) + || protocol_version == 5 + || protocol_version == 6 ) return (1); return (0); } +/* + * Return non-zero if protocol version has two phases. + */ +static int mvapich_dual_phase (void) +{ + return (protocol_version == 5 || protocol_version == 6); +} + static int mvapich_abort_sends_rank (void) { if (protocol_version >= 3) @@ -230,9 +239,9 @@ static int mvapich_get_hostid (struct mvapich_info *mvi) static int mvapich_get_task_header (int fd, int *version, int *rank) { /* - * V5 only sends version on first pass + * dual phase only sends version on first pass */ - if (protocol_version != 5 || v5_phase == 0) { + if (!mvapich_dual_phase () || protocol_phase == 0) { if (fd_read_n (fd, version, sizeof (int)) < 0) return error ("mvapich: Unable to read version from task: %m"); } @@ -240,7 +249,7 @@ static int mvapich_get_task_header (int fd, int *version, int *rank) if (fd_read_n (fd, rank, sizeof (int)) < 0) return error ("mvapich: Unable to read task rank: %m"); - if (protocol_version == 5 && v5_phase > 0) + if (mvapich_dual_phase () && protocol_phase > 0) return (0); if (protocol_version == -1) @@ -264,7 +273,8 @@ static int mvapich_handle_task (int fd, struct mvapich_info *mvi) case 3: return mvapich_get_task_info (mvi); case 5: - if (v5_phase == 0) + case 6: + if (protocol_phase == 0) return mvapich_get_hostid (mvi); else return mvapich_get_task_info (mvi); @@ -367,7 +377,7 @@ static void mvapich_bcast_hostids (void) static void mvapich_bcast (void) { - if (protocol_version < 5 || v5_phase > 0) + if (!mvapich_dual_phase () || protocol_phase > 0) return mvapich_bcast_addrs (); else return mvapich_bcast_hostids (); @@ -405,7 +415,8 @@ static void mvapich_barrier (void) return; } -static void mvapich_print_abort_message (slurm_step_layout_t *sl, int rank) +static void +mvapich_print_abort_message (slurm_step_layout_t *sl, int rank, int dest) { char *host; @@ -417,8 +428,15 @@ static void mvapich_print_abort_message (slurm_step_layout_t *sl, int rank) host = slurm_step_layout_host_name( sl, slurm_step_layout_host_id(sl, rank)); - info ("mvapich: Received ABORT message from MPI rank %d [on %s]", - rank, host); + if (dest >= 0) { + info ("mvapich: %M: ABORT from MPI rank %d [on %s] dest rank %d [on %s]", + rank, host, dest, + slurm_step_layout_host_name (sl, dest)); + } + else { + info ("mvapich: %M: ABORT from MPI rank %d [on %s]", + rank, host); + } return; } @@ -427,6 +445,7 @@ static void mvapich_wait_for_abort(srun_job_t *job) { int rlen; char rbuf[1024]; + int *p = (int *) rbuf; /* * Wait for abort notification from any process. @@ -452,7 +471,10 @@ static void mvapich_wait_for_abort(srun_job_t *job) } close(newfd); - mvapich_print_abort_message(job->step_layout, *((int *) rbuf)); + if (rlen > sizeof (int)) + mvapich_print_abort_message (job->step_layout, p[1], p[0]); + else + mvapich_print_abort_message (job->step_layout, p[0], -1); fwd_signal(job, SIGKILL, opt.max_threads); } @@ -489,10 +511,12 @@ static int mvapich_handle_connection (int fd) { int version, rank; - if (v5_phase == 0 || !connect_once) { + if (protocol_phase == 0 || !connect_once) { if (mvapich_get_task_header (fd, &version, &rank) < 0) return (-1); + mvarray [rank]->rank = rank; + if (rank > nprocs - 1) return (error ("mvapich: task reported invalid rank (%d)", rank)); } else { @@ -542,7 +566,7 @@ static int mvapich_get_next_connection (int listenfd) slurm_addr addr; int fd; - if (connect_once && v5_phase > 0) { + if (connect_once && protocol_phase > 0) { return (poll_mvapich_fds ()); } @@ -620,8 +644,8 @@ again: mvapich_debug ("bcasting mvapich info to %d tasks", nprocs); mvapich_bcast (); - if (protocol_version == 5 && v5_phase == 0) { - v5_phase = 1; + if (mvapich_dual_phase () && protocol_phase == 0) { + protocol_phase = 1; goto again; } diff --git a/src/plugins/select/bluegene/plugin/bg_job_place.c b/src/plugins/select/bluegene/plugin/bg_job_place.c index 0498c1a266501c2dddb0856596b1d61b1b792d69..a18322bfdb3f03ed53d50e793cd913f1fba2baf7 100644 --- a/src/plugins/select/bluegene/plugin/bg_job_place.c +++ b/src/plugins/select/bluegene/plugin/bg_job_place.c @@ -111,7 +111,6 @@ static int _find_best_block_match(struct job_record* job_ptr, int i; int rot_cnt = 0; int created = 0; - int found = 0; int allow = 0; int check_image = 1; uint32_t max_procs = NO_VAL; @@ -449,15 +448,16 @@ try_again: debug3("%s job_running = %d", record->bg_block_id, record->job_running); /*block is messed up some how (BLOCK_ERROR_STATE) ignore it*/ - if(record->job_running == BLOCK_ERROR_STATE) + if(record->job_running == BLOCK_ERROR_STATE) { + debug("block %s is in an error state (can't use)", + record->bg_block_id); continue; - else if((record->job_running != NO_JOB_RUNNING) - && !test_only) { + } else if((record->job_running != NO_JOB_RUNNING) + && !test_only) { debug("block %s in use by %s job %d", record->bg_block_id, record->user_name, record->job_running); - found = 1; continue; } @@ -574,8 +574,8 @@ try_again: } } list_iterator_destroy(itr2); + if(found_record) { - found = 1; continue; } @@ -641,7 +641,6 @@ try_again: continue; /* Not usable */ } *found_bg_record = record; - found = 1; debug2("we found one! %s", (*found_bg_record)->bg_block_id); break; } diff --git a/src/srun/opt.c b/src/srun/opt.c index 4d15b05d4c2c562798487e8222c56af38601163f..93baa9bad37ca52a6e3f5c2bbf2f19d05d886c04 100644 --- a/src/srun/opt.c +++ b/src/srun/opt.c @@ -1868,7 +1868,7 @@ void set_options(const int argc, char **argv, int first) opt.begin = parse_time(optarg); break; case LONG_OPT_MAIL_TYPE: - opt.mail_type = _parse_mail_type(optarg); + opt.mail_type |= _parse_mail_type(optarg); if (opt.mail_type == 0) fatal("--mail-type=%s invalid", optarg); break; @@ -2375,6 +2375,7 @@ static char *_print_mail_type(const uint16_t type) { if (type == 0) return "NONE"; + if (type == MAIL_JOB_BEGIN) return "BEGIN"; if (type == MAIL_JOB_END) @@ -2384,7 +2385,7 @@ static char *_print_mail_type(const uint16_t type) if (type == (MAIL_JOB_BEGIN | MAIL_JOB_END | MAIL_JOB_FAIL)) return "ALL"; - return "UNKNOWN"; + return "MULTIPLE"; } static void