Skip to content
Snippets Groups Projects
Commit 6f375805 authored by Christopher J. Morrone
Browse files

Move the SLURM_HOSTFILE parsing out of slurm_allocate_resources. This

leaves hostfile parsing broken, but really it was already broken.
parent 6256fd2c
No related branches found
No related tags found
No related merge requests found
......@@ -201,7 +201,7 @@ if the number of tasks is no larger than the number of nodes requested.
.TP
.B hostfile
The hostfile method of distribution will allocate processes in-order as
listed in file designated by the environment variable MP_HOSTFILE. If
listed in file designated by the environment variable SLURM_HOSTFILE. If
this variable is listed it will override any other method specified.
If not set the method will default to block.
.RE
......
......@@ -732,6 +732,23 @@ extern int slurm_job_step_create PARAMS((
job_step_create_request_msg_t * slurm_step_alloc_req_msg,
job_step_create_response_msg_t ** slurm_step_alloc_resp_msg));
/*
* slurm_read_hostfile - Read a SLURM hostfile specified by "filename".
* "filename" must contain a list of SLURM NodeNames, one per line.
* Reads up to "n" number of hostnames from the file. Returns a
* string representing a hostlist ranged string of the contents of
* the file. This is a helper function, it does not contact any
* SLURM daemons.
*
* IN filename - name of SLURM Hostlist file to be read.
* IN n - number of NodeNames required
* RET - a string representing the hostlist. Returns NULL if there are
* fewer than "n" hostnames in the file, or if an error occurs.
*
* NOTE: Returned string must be freed with free().
*/
extern char *slurm_read_hostfile PARAMS((char *filename, int n));
/*
* slurm_free_job_step_create_response_msg - free slurm
* job step create response message
......
......@@ -50,7 +50,6 @@ extern pid_t getsid(pid_t pid); /* missing from <unistd.h> */
#define BUFFER_SIZE 1024
static int _handle_rc_msg(slurm_msg_t *msg);
static int _nodelist_from_hostfile(job_step_create_request_msg_t *req);
/*
* slurm_allocate_resources - allocate resources for a job request
......@@ -224,9 +223,6 @@ slurm_job_step_create (job_step_create_request_msg_t *req,
forward_init(&resp_msg.forward, NULL);
resp_msg.ret_list = NULL;
if(_nodelist_from_hostfile(req) == 0)
debug("nodelist was NULL");
if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
return SLURM_ERROR;
......@@ -349,105 +345,103 @@ _handle_rc_msg(slurm_msg_t *msg)
return SLURM_SUCCESS;
}
/* FIXME - This should be moved out of the API (into slurm_ll_api and srun),
* or made into a separate helper-function in the API. It should
* NOT run as a side effect of the slurm_job_step_create() function
* call. If made into a helper function, it should take a filename
* as a parameter; it should not check an environment variable on its
* own.
/*
* Read a SLURM hostfile specified by "filename". "filename" must contain
* a list of SLURM NodeNames, one per line. Reads up to "n" number of hostnames
* from the file. Returns a string representing a hostlist ranged string of
* the contents of the file. This is a helper function, it does not
* contact any SLURM daemons.
*
* Returns a string representing the hostlist. Returns NULL if there are fewer
* than "n" hostnames in the file, or if an error occurs.
*
* Returned string must be freed with free().
*/
/*
 * NOTE(review): this span is taken from a rendered diff without +/- markers.
 * Lines of the old _nodelist_from_hostfile() and of the new
 * slurm_read_hostfile() appear interleaved (two signatures, duplicate
 * declarations, a repeated cleanup_hostfile label), so it is not compilable
 * as shown. The notes below refer to the filename/fp/n variant.
 */
static int _nodelist_from_hostfile(job_step_create_request_msg_t *req)
char *slurm_read_hostfile(char *filename, int n)
{
char *hostfile = NULL;
FILE *hostfilep = NULL;
FILE *fp = NULL;
char in_line[BUFFER_SIZE]; /* input line */
int i, j;
int line_size;
hostlist_t hostlist = NULL;
int count = 0;
int line_num = 0;
hostlist_t hostlist = NULL;
char *nodelist = NULL;
if ((hostfile = getenv("SLURM_HOSTFILE"))) {
if(strlen(hostfile)<1 || !strcmp(hostfile,"NULL"))
goto no_hostfile;
if((hostfilep = fopen(hostfile, "r")) == NULL) {
error("slurm_allocate_resources "
"error opening file %s, %m",
hostfile);
goto no_hostfile;
/* Reject a missing or empty filename before trying to open it. */
if (filename == NULL || strlen(filename) == 0)
return NULL;
if((fp = fopen(filename, "r")) == NULL) {
/* NOTE(review): the message names slurm_allocate_resources, not this function. */
error("slurm_allocate_resources error opening file %s, %m",
filename);
return NULL;
}
hostlist = hostlist_create(NULL);
/* NOTE(review): fp is still open on this early return. */
if (hostlist == NULL)
return NULL;
while (fgets(in_line, BUFFER_SIZE, fp) != NULL) {
line_num++;
line_size = strlen(in_line);
/* A line whose length is exactly BUFFER_SIZE - 1 is reported as too long. */
if (line_size == (BUFFER_SIZE - 1)) {
error ("Line %d, of hostfile %s too long",
line_num, filename);
/* NOTE(review): hostlist is not destroyed on this early-return path. */
fclose (fp);
return NULL;
}
hostlist = hostlist_create(NULL);
while (fgets (in_line, BUFFER_SIZE, hostfilep) != NULL) {
line_num++;
line_size = strlen(in_line);
if (line_size >= (BUFFER_SIZE - 1)) {
error ("Line %d, of hostfile %s too long",
line_num, hostfile);
fclose (hostfilep);
goto no_hostfile;
}
for (i = 0; i < line_size; i++) {
if (in_line[i] == '\n') {
in_line[i] = '\0';
break;
}
if (in_line[i] == '\0')
break;
if (in_line[i] != '#')
continue;
if ((i > 0) && (in_line[i - 1] == '\\')) {
for (j = i; j < line_size; j++) {
in_line[j - 1] = in_line[j];
}
line_size--;
continue;
}
for (i = 0; i < line_size; i++) {
if (in_line[i] == '\n') {
in_line[i] = '\0';
break;
}
hostlist_push(hostlist,in_line);
if(req->num_tasks && (line_num+1)>req->num_tasks)
break;
}
fclose (hostfilep);
nodelist = (char *)xmalloc(0xffff);
if (!nodelist) {
error("Nodelist xmalloc failed");
goto cleanup_hostfile;
}
if (hostlist_ranged_string(hostlist, 0xffff, nodelist) == -1) {
error("Hostlist is too long for the allocate RPC!");
xfree(nodelist);
nodelist = NULL;
goto cleanup_hostfile;
if (in_line[i] == '\0')
break;
if (in_line[i] != '#')
continue;
/*
 * A '#' preceded by a backslash is kept: the rest of the line is shifted
 * left by one position, which overwrites the backslash.
 */
if ((i > 0) && (in_line[i - 1] == '\\')) {
for (j = i; j < line_size; j++) {
in_line[j - 1] = in_line[j];
}
line_size--;
continue;
}
/* An unescaped '#' ends the line: everything from it onward is discarded. */
in_line[i] = '\0';
break;
}
hostlist_push(hostlist, in_line);
/* Stop reading once the hostlist holds exactly n entries. */
if(hostlist_count(hostlist) == n)
break;
}
fclose(fp);
count = hostlist_count(hostlist);
if (count <= 0) {
error("Hostlist is empty!\n");
xfree(nodelist);
goto cleanup_hostfile;
}
debug2("Hostlist from SLURM_HOSTFILE = %s\n",
nodelist);
cleanup_hostfile:
hostlist_destroy(hostlist);
if (hostlist_count(hostlist) <= 0) {
error("Hostlist is empty!\n");
goto cleanup_hostfile;
}
no_hostfile:
if(nodelist) {
if(req->node_list)
xfree(req->node_list);
req->node_list = nodelist;
req->num_tasks = count;
req->task_dist = SLURM_DIST_ARBITRARY;
/* Fewer than n entries were read: treated as an error. */
if (hostlist_count(hostlist) < n) {
error("Too few NodeNames in SLURM Hostfile");
goto cleanup_hostfile;
}
nodelist = (char *)malloc(0xffff);
if (!nodelist) {
/* NOTE(review): message says xmalloc, but the allocation above uses malloc(). */
error("Nodelist xmalloc failed");
goto cleanup_hostfile;
}
if (hostlist_ranged_string(hostlist, 0xffff, nodelist) == -1) {
error("Hostlist is too long for the allocate RPC!");
free(nodelist);
nodelist = NULL;
goto cleanup_hostfile;
}
return count;
debug2("Hostlist from SLURM_HOSTFILE = %s\n", nodelist);
cleanup_hostfile:
/* Shared exit: release the hostlist and hand back nodelist (NULL on the failure paths above). */
hostlist_destroy(hostlist);
return nodelist;
}
......@@ -106,6 +106,25 @@ allocate_nodes(void)
j->job_id = NO_VAL;
}
/* FIXME - this is an ugly place to check SLURM_HOSTFILE */
if (j->req_nodes == NULL) {
char *nodelist = NULL;
char *hostfile = getenv("SLURM_HOSTFILE");
if (hostfile != NULL) {
nodelist = slurm_read_hostfile(hostfile, j->num_tasks);
if (nodelist == NULL) {
error("Failure getting NodeNames from hostfile");
/* FIXME - need to fail somehow */
goto done;
} else {
j->req_nodes = xstrdup(nodelist);
free(nodelist);
j->task_dist = SLURM_DIST_ARBITRARY;
}
}
}
while ((rc = slurm_allocate_resources(j, &resp) < 0) && _retry()) {
if (destroy_job)
goto done;
......@@ -544,6 +563,27 @@ create_job_step(srun_job_t *job,
error ("Unable to allocate step request message");
return -1;
}
/* FIXME - this is also an ugly place to check SLURM_HOSTFILE,
* and does not quite work.
*/
if (req->node_list == NULL) {
char *nodelist = NULL;
char *hostfile = getenv("SLURM_HOSTFILE");
if (hostfile != NULL) {
nodelist = slurm_read_hostfile(hostfile, req->num_tasks);
if (nodelist == NULL) {
error("Error reading SLURM hostfile");
return -1;
}
req->node_list = xstrdup(nodelist);
free(nodelist);
req->task_dist = SLURM_DIST_ARBITRARY;
}
}
if ((slurm_job_step_create(req, &resp) < 0) || (resp == NULL)) {
error ("Unable to create job step: %m");
return -1;
......
......@@ -84,7 +84,7 @@ for {set i 0} {$i<2} {incr i} {
previous poe run\n"
exit 1
}
set env(MP_HOSTFILE) $hostfile
set env(SLURM_HOSTFILE) $hostfile
set 1node0 $node0
set 1node1 $node1
set file [open $hostfile "w"]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment