diff --git a/src/slurmctld/controller.c b/src/slurmctld/controller.c index b329153f2d775c25783fb45fe894fc73f8786fbd..4ffeb161dccd33cc9c70d01e917fc19b5b6e4da0 100644 --- a/src/slurmctld/controller.c +++ b/src/slurmctld/controller.c @@ -87,6 +87,7 @@ #include "src/slurmctld/licenses.h" #include "src/slurmctld/locks.h" #include "src/slurmctld/ping_nodes.h" +#include "src/slurmctld/port_mgr.h" #include "src/slurmctld/proc_req.h" #include "src/slurmctld/read_config.h" #include "src/slurmctld/reservation.h" @@ -574,6 +575,7 @@ int main(int argc, char *argv[]) resv_fini(); trigger_fini(); assoc_mgr_fini(slurmctld_conf.state_save_location); + reserve_port_config(NULL); /* Some plugins are needed to purge job/node data structures, * unplug after other data structures are purged */ diff --git a/src/slurmctld/port_mgr.c b/src/slurmctld/port_mgr.c index a87d3ece2fef6b41c2408a3e53231a009383975d..4a7f50a5aea9e7350c5ecd1edfc99775fd81873b 100644 --- a/src/slurmctld/port_mgr.c +++ b/src/slurmctld/port_mgr.c @@ -41,6 +41,9 @@ # include "config.h" #endif +#include <string.h> +#include <stdlib.h> + #include "src/common/bitstring.h" #include "src/common/hostlist.h" #include "src/common/xmalloc.h" @@ -53,13 +56,63 @@ int port_resv_cnt = 0; int port_resv_min = 0; int port_resv_max = 0; +/* Configure reserved ports. 
+ * Call with mpi_params==NULL (or with no ports: entry) to free the port table. RET SLURM_SUCCESS, or SLURM_ERROR if the ports:MIN-MAX range is invalid */ +extern int reserve_port_config(char *mpi_params) +{ + char *tmp_e=NULL, *tmp_p=NULL; + int i, p_min, p_max; + + if (mpi_params) + tmp_p = strstr(mpi_params, "ports:"); + if (tmp_p == NULL) { + if (port_resv_table) { + info("Clearing port reservations"); + xfree(port_resv_table); + port_resv_cnt = 0; + port_resv_min = port_resv_max = 0; + } + return SLURM_SUCCESS; + } + + tmp_p += 6; /* skip past the 6 characters of ports: */ + p_min = strtol(tmp_p, &tmp_e, 10); + if ((p_min < 1) || (tmp_e[0] != '-')) { + info("invalid MpiParams: %s", mpi_params); + return SLURM_ERROR; + } + tmp_e++; + p_max = strtol(tmp_e, NULL, 10); + if (p_max < p_min) { + info("invalid MpiParams: %s", mpi_params); + return SLURM_ERROR; + } + + if ((p_min == port_resv_min) && (p_max == port_resv_max)) + return SLURM_SUCCESS; /* No change */ + + port_resv_min = p_min; + port_resv_max = p_max; + port_resv_cnt = p_max - p_min + 1; + debug("Ports available for reservation %u-%u", + port_resv_min, port_resv_max); /* NOTE(review): both values are int but are printed with %u */ + + xfree(port_resv_table); + port_resv_table = xmalloc(sizeof(bitstr_t *) * port_resv_cnt); + for (i=0; i<port_resv_cnt; i++) + port_resv_table[i] = bit_alloc(node_record_count); + +/* FIXME: Rebuild record of reserved ports. NOTE(review): every xfree(port_resv_table) in this function releases only the pointer array, so the per-port bitmaps from bit_alloc() appear to be leaked on clear and on reconfigure - confirm */ + return SLURM_SUCCESS; +} + /* Reserve ports for a job step * RET SLURM_SUCCESS or an error code */ -extern int reserve_ports(struct step_record *step_ptr) +extern int resv_port_alloc(struct step_record *step_ptr) { int i, port_inx; int *port_array = NULL; - char port_str[16]; + char port_str[16], *tmp_str; hostlist_t hl; if (step_ptr->resv_port_cnt > port_resv_cnt) { @@ -95,8 +148,8 @@ extern int reserve_ports(struct step_record *step_ptr) for (i=0; i<port_inx; i++) { bit_or(port_resv_table[port_array[i]], step_ptr->step_node_bitmap); - snprintf(port_str, sizeof(port_str), - "%d", (port_array[i] + port_resv_min)); + port_array[i] += port_resv_min; /* table index becomes the port number from here on */ + snprintf(port_str, sizeof(port_str), "%d", port_array[i]); hostlist_push(hl, port_str); } hostlist_sort(hl); @@ 
-107,11 +160,47 @@ extern int reserve_ports(struct step_record *step_ptr) xfree(step_ptr->resv_ports); } hostlist_destroy(hl); - xfree(port_array); - info("reserved ports %s for step %u.%u", - step_ptr->resv_ports, - step_ptr->job_ptr->job_id, step_ptr->step_id); + step_ptr->resv_port_array = port_array; /* array is kept in the step record instead of being freed here */ + + if (step_ptr->resv_ports[0] == '[') { + /* Remove brackets from hostlist; the i-byte copy holds the i-2 kept characters plus the NUL */ + i = strlen(step_ptr->resv_ports); + step_ptr->resv_ports[i-1] = '\0'; + tmp_str = xmalloc(i); + strcpy(tmp_str, step_ptr->resv_ports + 1); + xfree(step_ptr->resv_ports); + step_ptr->resv_ports = tmp_str; + } + + debug("reserved ports %s for step %u.%u", + step_ptr->resv_ports, + step_ptr->job_ptr->job_id, step_ptr->step_id); return SLURM_SUCCESS; } +/* Release reserved ports for a job step + * No return value; returns at once if the step has no resv_port_array */ +extern void resv_port_free(struct step_record *step_ptr) +{ + int i, j; + + if (step_ptr->resv_port_array == NULL) + return; + + bit_not(step_ptr->step_node_bitmap); /* presumably inverts the node bitmap so that bit_and below clears only the nodes of this step - confirm */ + for (i=0; i<step_ptr->resv_port_cnt; i++) { + if ((step_ptr->resv_port_array[i] < port_resv_min) || + (step_ptr->resv_port_array[i] > port_resv_max)) + continue; /* port lies outside the currently configured range */ + j = step_ptr->resv_port_array[i] - port_resv_min; /* port number back to table index */ + bit_and(port_resv_table[j], step_ptr->step_node_bitmap); + + } + bit_not(step_ptr->step_node_bitmap); /* presumably restores the original bitmap - confirm */ + xfree(step_ptr->resv_port_array); + + debug("freed ports %s for step %u.%u", + step_ptr->resv_ports, + step_ptr->job_ptr->job_id, step_ptr->step_id); +} diff --git a/src/slurmctld/port_mgr.h b/src/slurmctld/port_mgr.h index f9ebb209c794207083d456f42b7c973222459eef..af4a87a418531da210dae065c4e3f9049677ddc5 100644 --- a/src/slurmctld/port_mgr.h +++ b/src/slurmctld/port_mgr.h @@ -42,8 +42,16 @@ #include "src/slurmctld/slurmctld.h" +/* Configure reserved ports. 
+ * Call with mpi_params==NULL (or with no ports: entry) to free the port table. RET SLURM_SUCCESS, or SLURM_ERROR if the ports:MIN-MAX range is invalid */ +extern int reserve_port_config(char *mpi_params); + /* Reserve ports for a job step * RET SLURM_SUCCESS or an error code */ -extern int reserve_ports(struct step_record *step_ptr); +extern int resv_port_alloc(struct step_record *step_ptr); + +/* Release reserved ports for a job step + * No return value; returns at once if the step has no resv_port_array */ +extern void resv_port_free(struct step_record *step_ptr); #endif /* !_HAVE_PORT_MGR_H */ diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index 226dca999eec013d378346233c1e82f2845888f7..db9e746b26c48291f670586baa41a60a2bdcfe11 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -75,6 +75,7 @@ #include "src/slurmctld/licenses.h" #include "src/slurmctld/locks.h" #include "src/slurmctld/node_scheduler.h" +#include "src/slurmctld/port_mgr.h" #include "src/slurmctld/proc_req.h" #include "src/slurmctld/read_config.h" #include "src/slurmctld/reservation.h" @@ -851,6 +852,7 @@ int read_slurm_conf(int recover) if ((rc = _build_bitmaps())) fatal("_build_bitmaps failure"); + reserve_port_config(conf->mpi_params); license_free(); if (license_init(slurmctld_conf.licenses) != SLURM_SUCCESS) diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h index f0992f8979fcc56ca0a2ca01fe61c354743887a6..28243c66ba896091f8d42d096f1a4eaf68f64cb5 100644 --- a/src/slurmctld/slurmctld.h +++ b/src/slurmctld/slurmctld.h @@ -555,6 +555,7 @@ struct step_record { uint8_t no_kill; /* 1 if no kill on node failure */ uint16_t port; /* port for srun communications */ time_t pre_sus_time; /* time step ran prior to last suspend */ + int *resv_port_array; /* reserved port numbers, not table indexes */ uint16_t resv_port_cnt; /* count of ports reserved per node */ char *resv_ports; /* ports reserved for job */ time_t start_time; /* step allocation time */ diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 
cde3faa1a51adda7ea2b276c17a1f7906aece783..d36d5fb8750c5957141d6c51b549ee66400adaaa 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -164,6 +164,7 @@ static void _free_step_rec(struct step_record *step_ptr) FREE_NULL_BITMAP(step_ptr->core_bitmap_job); FREE_NULL_BITMAP(step_ptr->exit_node_bitmap); FREE_NULL_BITMAP(step_ptr->step_node_bitmap); + xfree(step_ptr->resv_port_array); xfree(step_ptr->resv_ports); xfree(step_ptr->network); xfree(step_ptr->ckpt_path); @@ -201,6 +202,7 @@ delete_step_record (struct job_record *job_ptr, uint32_t step_id) step_ptr->step_layout->node_list); switch_free_jobinfo (step_ptr->switch_job); } + resv_port_free(step_ptr); checkpoint_free_jobinfo (step_ptr->check_job); _free_step_rec(step_ptr); error_code = 0; @@ -1347,7 +1349,7 @@ step_create(job_step_create_request_msg_t *step_specs, } if (step_specs->resv_port_cnt != (uint16_t) NO_VAL) { step_ptr->resv_port_cnt = step_specs->resv_port_cnt; - i = reserve_ports(step_ptr); + i = resv_port_alloc(step_ptr); if (i != SLURM_SUCCESS) { delete_step_record (job_ptr, step_ptr->step_id); return i;