Skip to content
Snippets Groups Projects
Commit f2b890a3 authored by Moe Jette's avatar Moe Jette
Browse files

major restructuring of wiki will_run command logic.

parent dc73dca7
No related branches found
No related tags found
No related merge requests found
......@@ -41,17 +41,30 @@
#include "src/slurmctld/node_scheduler.h"
#include "src/slurmctld/state_save.h"
static char * _copy_nodelist_no_dup(char *node_list);
static int _will_run_test(uint32_t jobid, char *hostlist,
int *err_code, char **err_msg);
static char * _will_run_test(uint32_t jobid, char *job_list,
char *exclude_list, int *err_code, char **err_msg);
/* RET 0 on success, -1 on failure */
/*
* get_jobs - get information on specific job(s) changed since some time
* cmd_ptr IN - CMD=JOBWILLRUN ARG=<JOBID> AFTER=<JOBID>[:<JOBID>...]
* [EXCLUDE=<node_list>]
* err_code OUT - 0 on success or some error code
* err_msg OUT - error message or the JOBID from ordered list after
* which the specified job can start (no JOBID if job
* can start immediately) and the assigned node list.
* ARG=<JOBID> [AFTER=<JOBID>] NODES=<node_list>
* NOTE: xfree() err_msg if err_code is zero
* RET 0 on success, -1 on failure
*/
extern int job_will_run(char *cmd_ptr, int *err_code, char **err_msg)
{
char *arg_ptr, *task_ptr, *node_ptr, *tmp_char;
int i;
char *arg_ptr, *tmp_char, *job_list, *exclude_list;
char *buf, *tmp_buf;
int buf_size;
uint32_t jobid;
char host_string[MAXHOSTRANGELEN];
/* Locks: write job, read node and partition info */
slurmctld_lock_t job_write_lock = {
NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
arg_ptr = strstr(cmd_ptr, "ARG=");
if (arg_ptr == NULL) {
......@@ -68,114 +81,120 @@ extern int job_will_run(char *cmd_ptr, int *err_code, char **err_msg)
return -1;
}
task_ptr = strstr(cmd_ptr, "TASKLIST=");
if (task_ptr) {
hostlist_t hl;
node_ptr = task_ptr + 9;
for (i=0; node_ptr[i]!='\0'; i++) {
if (node_ptr[i] == ':')
node_ptr[i] = ',';
}
hl = hostlist_create(node_ptr);
i = hostlist_ranged_string(hl, sizeof(host_string), host_string);
hostlist_destroy(hl);
if (i < 0) {
*err_code = -300;
*err_msg = "JOBWILLRUN has invalid TASKLIST";
error("wiki: JOBWILLRUN has invalid TASKLIST");
return -1;
}
job_list = strstr(cmd_ptr, "AFTER=");
if (job_list) {
job_list += 6;
null_term(job_list);
} else {
/* no restrictions on nodes available for use */
strcpy(host_string, "");
*err_code = -300;
*err_msg = "Invalid AFTER value";
error("wiki: JOBWILLRUN has invalid jobid");
return -1;
}
if (_will_run_test(jobid, host_string, err_code, err_msg) != 0)
exclude_list = strstr(cmd_ptr, "EXCLUDE=");
if (exclude_list) {
exclude_list += 8;
null_term(exclude_list);
}
lock_slurmctld(job_write_lock);
buf = _will_run_test(jobid, job_list, exclude_list, err_code,
err_msg);
unlock_slurmctld(job_write_lock);
if (!buf) {
info("wiki: JOBWILLRUN failed for job %u", jobid);
return -1;
}
buf_size = strlen(buf);
tmp_buf = xmalloc(buf_size + 32);
sprintf(tmp_buf, "SC=0 ARG=%s", buf);
xfree(buf);
*err_code = 0;
*err_msg = tmp_buf;
return 0;
}
static int _will_run_test(uint32_t jobid, char *hostlist,
int *err_code, char **err_msg)
static char * _will_run_test(uint32_t jobid, char *job_list,
char *exclude_list, int *err_code, char **err_msg)
{
int rc = 0, i;
struct job_record *job_ptr;
/* Write lock on job info, read lock on node info */
slurmctld_lock_t job_write_lock = {
NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK };
char *new_node_list, *picked_node_list = NULL;
bitstr_t *new_bitmap, *save_exc_bitmap, *save_req_bitmap;
uint32_t save_prio;
bitstr_t *picked_node_bitmap = NULL;
/* Just create a big static message buffer to avoid dealing with
* xmalloc/xfree. We'll switch to compressed node naming soon
* and this buffer can be set smaller then. */
static char reply_msg[16384];
bitstr_t *save_exc_bitmap = NULL, *new_bitmap = NULL;
uint32_t save_prio, *jobid_list = NULL;
struct job_record **job_ptr_list;
int i, job_list_size;
char *tmp_char;
lock_slurmctld(job_write_lock);
job_ptr = find_job_record(jobid);
if (job_ptr == NULL) {
*err_code = -700;
*err_msg = "No such job";
error("wiki: Failed to find job %u", jobid);
rc = -1;
unlock_slurmctld(job_write_lock);
return rc;
return NULL;
}
if ((job_ptr->details == NULL)
|| (job_ptr->job_state != JOB_PENDING)) {
if ((job_ptr->details == NULL) ||
(job_ptr->job_state != JOB_PENDING)) {
*err_code = -700;
*err_msg = "Job not pending, can't test will_run";
error("wiki: Attempt to test will_run of non-pending job %u",
jobid);
rc = -1;
unlock_slurmctld(job_write_lock);
return rc;
}
new_node_list = _copy_nodelist_no_dup(hostlist);
if (hostlist && (new_node_list == NULL)) {
*err_code = -700;
*err_msg = "Invalid TASKLIST";
error("wiki: Attempt to set invalid node list for job %u, %s",
jobid, hostlist);
rc = -1;
unlock_slurmctld(job_write_lock);
return rc;
return NULL;
}
if (node_name2bitmap(new_node_list, false, &new_bitmap) != 0) {
*err_code = -700;
*err_msg = "Invalid TASKLIST";
error("wiki: Attempt to set invalid node list for job %u, %s",
jobid, hostlist);
rc = -1;
xfree(new_node_list);
unlock_slurmctld(job_write_lock);
return rc;
/* parse the job list */
job_list_size = strlen(job_list) + 1;
jobid_list = xmalloc(job_list_size * sizeof(uint32_t));
job_ptr_list = xmalloc(job_list_size * sizeof (struct job_record *));
tmp_char = job_list;
for (i=0; i<job_list_size; ) {
jobid_list[i] = strtoul(tmp_char, &tmp_char, 10);
if ((tmp_char[0] != '\0') && (!isspace(tmp_char[0])) &&
(tmp_char[0] != ':')) {
*err_code = -300;
*err_msg = "Invalid AFTER value";
error("wiki: Invalid AFTER value of %s", job_list);
xfree(jobid_list);
xfree(job_ptr_list);
return NULL;
}
job_ptr_list[i] = find_job_record(jobid_list[i]);
if (job_ptr_list[i])
i++;
else {
error("wiki: willrun AFTER job %u not found",
jobid_list[i]);
jobid_list[i] = 0;
}
if (tmp_char[0] == ':')
tmp_char++;
else
break;
}
/* Put the inverse of this on the excluded node list,
* Remove any required nodes, and test */
save_exc_bitmap = job_ptr->details->exc_node_bitmap;
if (hostlist[0]) { /* empty hostlist, all nodes usable */
bit_not(new_bitmap);
if (exclude_list) {
if (node_name2bitmap(exclude_list, false, &new_bitmap) != 0) {
*err_code = -700;
*err_msg = "Invalid EXCLUDE value";
error("wiki: Attempt to set invalid exclude node "
"list for job %u, %s",
jobid, exclude_list);
return NULL;
}
save_exc_bitmap = job_ptr->details->exc_node_bitmap;
job_ptr->details->exc_node_bitmap = new_bitmap;
}
save_req_bitmap = job_ptr->details->req_node_bitmap;
job_ptr->details->req_node_bitmap = bit_alloc(node_record_count);
/* test when the job can execute */
save_prio = job_ptr->priority;
job_ptr->priority = 1;
#if 0
/* execute will_run logic here */
/* Note that last jobid_list entry has a value of zero */
rc = select_nodes(job_ptr, true, &picked_node_bitmap);
if (picked_node_bitmap) {
picked_node_list = bitmap2wiki_node_name(picked_node_bitmap);
i = strlen(picked_node_list);
if ((i + 64) > sizeof(reply_msg))
error("wiki: will_run buffer overflow");
}
if (rc == SLURM_SUCCESS) {
*err_code = 0;
......@@ -186,13 +205,13 @@ static int _will_run_test(uint32_t jobid, char *hostlist,
} else if (rc == ESLURM_NODES_BUSY) {
*err_code = 1;
snprintf(reply_msg, sizeof(reply_msg),
"SC=1 Job %d runnable later TASKLIST:%s",
"SC=1 Job %u runnable later TASKLIST:%s",
jobid, picked_node_list);
*err_msg = reply_msg;
} else if (rc == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) {
*err_code = 1;
snprintf(reply_msg, sizeof(reply_msg),
"SC=1 Job %d not runnable with current configuration",
"SC=1 Job %u not runnable with current configuration",
jobid);
*err_msg = reply_msg;
} else {
......@@ -205,37 +224,21 @@ static int _will_run_test(uint32_t jobid, char *hostlist,
jobid, err_str);
*err_msg = reply_msg;
}
#endif
/* Restore job's state, release memory */
xfree(picked_node_list);
FREE_NULL_BITMAP(picked_node_bitmap);
xfree(new_node_list);
bit_free(new_bitmap);
FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
job_ptr->details->exc_node_bitmap = save_exc_bitmap;
job_ptr->details->req_node_bitmap = save_req_bitmap;
/* Restore job's state, release allocated memory */
if (save_exc_bitmap)
job_ptr->details->exc_node_bitmap = save_exc_bitmap;
FREE_NULL_BITMAP(new_bitmap);
job_ptr->priority = save_prio;
unlock_slurmctld(job_write_lock);
return rc;
}
xfree(jobid_list);
xfree(job_ptr_list);
static char * _copy_nodelist_no_dup(char *node_list)
{
int new_size = 128;
char *new_str;
hostlist_t hl = hostlist_create(node_list);
if (hl == NULL)
return NULL;
hostlist_uniq(hl);
new_str = xmalloc(new_size);
while (hostlist_ranged_string(hl, new_size, new_str) == -1) {
new_size *= 2;
xrealloc(new_str, new_size);
}
hostlist_destroy(hl);
return new_str;
#if 1
*err_code = -810;
*err_msg = "JOBWILLRUN not yet supported";
return NULL;
#endif
}
/*
......
......@@ -626,7 +626,8 @@ static void _proc_msg(slurm_fd new_fd, char *msg)
job_release_task(cmd_ptr, &err_code, &err_msg);
} else if (strncmp(cmd_ptr, "JOBWILLRUN", 10) == 0) {
msg_type = "wiki:JOBWILLRUN";
job_will_run(cmd_ptr, &err_code, &err_msg);
if (!job_will_run(cmd_ptr, &err_code, &err_msg))
goto free_resp_msg;
} else if (strncmp(cmd_ptr, "MODIFYJOB", 9) == 0) {
msg_type = "wiki:MODIFYJOB";
job_modify_wiki(cmd_ptr, &err_code, &err_msg);
......
Loading… If the content fails to load, please reload the page or try again.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment