Skip to content
Snippets Groups Projects
Commit 62a781fb authored by Moe Jette's avatar Moe Jette
Browse files

get logic in place to allocate and deallocate ports for job steps

parent f1981eab
No related branches found
No related tags found
No related merge requests found
......@@ -87,6 +87,7 @@
#include "src/slurmctld/licenses.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/ping_nodes.h"
#include "src/slurmctld/port_mgr.h"
#include "src/slurmctld/proc_req.h"
#include "src/slurmctld/read_config.h"
#include "src/slurmctld/reservation.h"
......@@ -574,6 +575,7 @@ int main(int argc, char *argv[])
resv_fini();
trigger_fini();
assoc_mgr_fini(slurmctld_conf.state_save_location);
reserve_port_config(NULL);
/* Some plugins are needed to purge job/node data structures,
* unplug after other data structures are purged */
......
......@@ -41,6 +41,9 @@
# include "config.h"
#endif
#include <string.h>
#include <stdlib.h>
#include "src/common/bitstring.h"
#include "src/common/hostlist.h"
#include "src/common/xmalloc.h"
......@@ -53,13 +56,63 @@ int port_resv_cnt = 0;
int port_resv_min = 0;
int port_resv_max = 0;
/* Configure reserved ports.
* Call with mpi_params==NULL to free memory */
extern int reserve_port_config(char *mpi_params)
{
char *tmp_e=NULL, *tmp_p=NULL;
int i, p_min, p_max;
if (mpi_params)
tmp_p = strstr(mpi_params, "ports:");
if (tmp_p == NULL) {
if (port_resv_table) {
info("Clearing port reservations");
xfree(port_resv_table);
port_resv_cnt = 0;
port_resv_min = port_resv_max = 0;
}
return SLURM_SUCCESS;
}
tmp_p += 6;
p_min = strtol(tmp_p, &tmp_e, 10);
if ((p_min < 1) || (tmp_e[0] != '-')) {
info("invalid MpiParams: %s", mpi_params);
return SLURM_ERROR;
}
tmp_e++;
p_max = strtol(tmp_e, NULL, 10);
if (p_max < p_min) {
info("invalid MpiParams: %s", mpi_params);
return SLURM_ERROR;
}
if ((p_min == port_resv_min) && (p_max == port_resv_max))
return SLURM_SUCCESS; /* No change */
port_resv_min = p_min;
port_resv_max = p_max;
port_resv_cnt = p_max - p_min + 1;
debug("Ports available for reservation %u-%u",
port_resv_min, port_resv_max);
xfree(port_resv_table);
port_resv_table = xmalloc(sizeof(bitstr_t *) * port_resv_cnt);
for (i=0; i<port_resv_cnt; i++)
port_resv_table[i] = bit_alloc(node_record_count);
/* FIXME: Rebuild record of reserved ports */
return SLURM_SUCCESS;
}
/* Reserve ports for a job step
* RET SLURM_SUCCESS or an error code */
extern int reserve_ports(struct step_record *step_ptr)
extern int resv_port_alloc(struct step_record *step_ptr)
{
int i, port_inx;
int *port_array = NULL;
char port_str[16];
char port_str[16], *tmp_str;
hostlist_t hl;
if (step_ptr->resv_port_cnt > port_resv_cnt) {
......@@ -95,8 +148,8 @@ extern int reserve_ports(struct step_record *step_ptr)
for (i=0; i<port_inx; i++) {
bit_or(port_resv_table[port_array[i]],
step_ptr->step_node_bitmap);
snprintf(port_str, sizeof(port_str),
"%d", (port_array[i] + port_resv_min));
port_array[i] += port_resv_min;
snprintf(port_str, sizeof(port_str), "%d", port_array[i]);
hostlist_push(hl, port_str);
}
hostlist_sort(hl);
......@@ -107,11 +160,47 @@ extern int reserve_ports(struct step_record *step_ptr)
xfree(step_ptr->resv_ports);
}
hostlist_destroy(hl);
xfree(port_array);
info("reserved ports %s for step %u.%u",
step_ptr->resv_ports,
step_ptr->job_ptr->job_id, step_ptr->step_id);
step_ptr->resv_port_array = port_array;
if (step_ptr->resv_ports[0] == '[') {
/* Remove brackets from hostlist */
i = strlen(step_ptr->resv_ports);
step_ptr->resv_ports[i-1] = '\0';
tmp_str = xmalloc(i);
strcpy(tmp_str, step_ptr->resv_ports + 1);
xfree(step_ptr->resv_ports);
step_ptr->resv_ports = tmp_str;
}
debug("reserved ports %s for step %u.%u",
step_ptr->resv_ports,
step_ptr->job_ptr->job_id, step_ptr->step_id);
return SLURM_SUCCESS;
}
/* Release the ports reserved for a job step.
 * No value is returned. Does nothing when the step has no
 * resv_port_array. Entries outside the currently configured
 * port_resv_min..port_resv_max range are ignored. */
extern void resv_port_free(struct step_record *step_ptr)
{
	int inx;

	if (!step_ptr->resv_port_array)
		return;

	/* The step's node bitmap is inverted for the duration of the loop
	 * and inverted again afterwards, so it is unchanged on return.
	 * NOTE(review): presumably bit_and() with the inverted map clears
	 * this step's nodes from each port's bitmap - confirm against
	 * the bitstring API */
	bit_not(step_ptr->step_node_bitmap);
	for (inx = 0; inx < step_ptr->resv_port_cnt; inx++) {
		int port = step_ptr->resv_port_array[inx];

		if ((port >= port_resv_min) && (port <= port_resv_max)) {
			bit_and(port_resv_table[port - port_resv_min],
				step_ptr->step_node_bitmap);
		}
	}
	bit_not(step_ptr->step_node_bitmap);

	xfree(step_ptr->resv_port_array);

	debug("freed ports %s for step %u.%u",
	      step_ptr->resv_ports,
	      step_ptr->job_ptr->job_id, step_ptr->step_id);
}
......@@ -42,8 +42,16 @@
#include "src/slurmctld/slurmctld.h"
/* Configure reserved ports.
* Call with mpi_params==NULL to free memory */
extern int reserve_port_config(char *mpi_params);
/* Reserve ports for a job step
* RET SLURM_SUCCESS or an error code */
extern int reserve_ports(struct step_record *step_ptr);
extern int resv_port_alloc(struct step_record *step_ptr);
/* Release reserved ports for a job step
 * No value is returned (the function is declared void) */
extern void resv_port_free(struct step_record *step_ptr);
#endif /* !_HAVE_PORT_MGR_H */
......@@ -75,6 +75,7 @@
#include "src/slurmctld/licenses.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/node_scheduler.h"
#include "src/slurmctld/port_mgr.h"
#include "src/slurmctld/proc_req.h"
#include "src/slurmctld/read_config.h"
#include "src/slurmctld/reservation.h"
......@@ -851,6 +852,7 @@ int read_slurm_conf(int recover)
if ((rc = _build_bitmaps()))
fatal("_build_bitmaps failure");
reserve_port_config(conf->mpi_params);
license_free();
if (license_init(slurmctld_conf.licenses) != SLURM_SUCCESS)
......
......@@ -555,6 +555,7 @@ struct step_record {
uint8_t no_kill; /* 1 if no kill on node failure */
uint16_t port; /* port for srun communications */
time_t pre_sus_time; /* time step ran prior to last suspend */
int *resv_port_array; /* reserved port indexes */
uint16_t resv_port_cnt; /* count of ports reserved per node */
char *resv_ports; /* ports reserved for job */
time_t start_time; /* step allocation time */
......
......@@ -164,6 +164,7 @@ static void _free_step_rec(struct step_record *step_ptr)
FREE_NULL_BITMAP(step_ptr->core_bitmap_job);
FREE_NULL_BITMAP(step_ptr->exit_node_bitmap);
FREE_NULL_BITMAP(step_ptr->step_node_bitmap);
xfree(step_ptr->resv_port_array);
xfree(step_ptr->resv_ports);
xfree(step_ptr->network);
xfree(step_ptr->ckpt_path);
......@@ -201,6 +202,7 @@ delete_step_record (struct job_record *job_ptr, uint32_t step_id)
step_ptr->step_layout->node_list);
switch_free_jobinfo (step_ptr->switch_job);
}
resv_port_free(step_ptr);
checkpoint_free_jobinfo (step_ptr->check_job);
_free_step_rec(step_ptr);
error_code = 0;
......@@ -1347,7 +1349,7 @@ step_create(job_step_create_request_msg_t *step_specs,
}
if (step_specs->resv_port_cnt != (uint16_t) NO_VAL) {
step_ptr->resv_port_cnt = step_specs->resv_port_cnt;
i = reserve_ports(step_ptr);
i = resv_port_alloc(step_ptr);
if (i != SLURM_SUCCESS) {
delete_step_record (job_ptr, step_ptr->step_id);
return i;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment