diff --git a/NEWS b/NEWS index 6eb7606bedb21a13c04d1a05557696b1d1acfc81..5a71847bf9a09ce35fbbc43644726ea6953a6658 100644 --- a/NEWS +++ b/NEWS @@ -239,6 +239,8 @@ documents those changes that are of interest to users and admins. ========================= - If a job's stdout/err file names are unusable (bad path), use the default names. + - sched/wiki - Fix logic to be compatible with select/cons_res plugin + for allocating individual processors within nodes. * Changes in SLURM 1.1.31 ========================= diff --git a/src/plugins/sched/wiki2/get_jobs.c b/src/plugins/sched/wiki2/get_jobs.c index b8fe8ce52d7e8e8a2540da7c996c24bc95b037cd..a6ff6706b360dae58dcdc62878a38b1f4ff69aea 100644 --- a/src/plugins/sched/wiki2/get_jobs.c +++ b/src/plugins/sched/wiki2/get_jobs.c @@ -58,6 +58,7 @@ static uint32_t _get_job_submit_time(struct job_record *job_ptr); static uint32_t _get_job_suspend_time(struct job_record *job_ptr); static uint32_t _get_job_tasks(struct job_record *job_ptr); static uint32_t _get_job_time_limit(struct job_record *job_ptr); +static char * _full_task_list(struct job_record *job_ptr); #define SLURM_INFO_ALL 0 #define SLURM_INFO_VOLITILE 1 @@ -232,8 +233,13 @@ static char * _dump_job(struct job_record *job_ptr, int state_info) xstrcat(buf, tmp); } } else if (!IS_JOB_FINISHED(job_ptr)) { - char *hosts = bitmap2wiki_node_name( - job_ptr->node_bitmap); + char *hosts; + if (job_ptr->cr_enabled) { + hosts = _full_task_list(job_ptr); + } else { + hosts = bitmap2wiki_node_name( + job_ptr->node_bitmap); + } snprintf(tmp, sizeof(tmp), "TASKLIST=%s;", hosts); xstrcat(buf, tmp); @@ -457,3 +463,38 @@ static uint32_t _get_job_suspend_time(struct job_record *job_ptr) } return (uint32_t) 0; } + +/* Return a job's task list. + * List hostname once for each allocated CPU on that node. + * NOTE: xfree the return value. 
*/ +static char * _full_task_list(struct job_record *job_ptr) +{ + int i, j; + char *buf = NULL, *host; + hostlist_t hl = hostlist_create(job_ptr->nodes); + + if (hl == NULL) { + error("hostlist_create error for job %u, %s", + job_ptr->job_id, job_ptr->nodes); + return buf; + } + + for (i=0; i<job_ptr->alloc_lps_cnt; i++) { + host = hostlist_shift(hl); + if (host == NULL) { + error("bad alloc_lps_cnt for job %u (%s, %d)", + job_ptr->job_id, job_ptr->nodes, + job_ptr->alloc_lps_cnt); + break; + } + for (j=0; j<job_ptr->alloc_lps[i]; j++) { + if (buf) + xstrcat(buf, ":"); + xstrcat(buf, host); + } + free(host); + } + hostlist_destroy(hl); + return buf; +} + diff --git a/src/plugins/sched/wiki2/job_will_run.c b/src/plugins/sched/wiki2/job_will_run.c index dd9a96e5c6c27bc9d598a517a7d6c8f31d153a05..d591074bb646b940b21af06f682acd9cf37817c2 100644 --- a/src/plugins/sched/wiki2/job_will_run.c +++ b/src/plugins/sched/wiki2/job_will_run.c @@ -245,7 +245,7 @@ static char * _copy_nodelist_no_dup(char *node_list) */ extern char * bitmap2wiki_node_name(bitstr_t *bitmap) { - int i, first = 1; + int i; char *buf = NULL; if (use_host_exp) @@ -257,9 +257,8 @@ extern char * bitmap2wiki_node_name(bitstr_t *bitmap) for (i = 0; i < node_record_count; i++) { if (bit_test (bitmap, i) == 0) continue; - if (first == 0) + if (buf) xstrcat(buf, ":"); - first = 0; xstrcat(buf, node_record_table_ptr[i].name); } return buf; diff --git a/src/plugins/sched/wiki2/start_job.c b/src/plugins/sched/wiki2/start_job.c index 20259fe92d9563f5943541f78bd1e26a3826a206..eac1eddb6763bac2531c8bc8deb08ca19a0ba95d 100644 --- a/src/plugins/sched/wiki2/start_job.c +++ b/src/plugins/sched/wiki2/start_job.c @@ -41,14 +41,14 @@ #include "src/slurmctld/state_save.h" static char * _copy_nodelist_no_dup(char *node_list); -static int _start_job(uint32_t jobid, char *hostlist, +static int _start_job(uint32_t jobid, int task_cnt, char *hostlist, int *err_code, char **err_msg); /* RET 0 on success, -1 on failure */ extern 
int start_job(char *cmd_ptr, int *err_code, char **err_msg) { char *arg_ptr, *task_ptr, *node_ptr, *tmp_char; - int i; + int i, task_cnt = 1; uint32_t jobid; hostlist_t hl; char host_string[MAXHOSTRANGELEN]; @@ -78,8 +78,10 @@ extern int start_job(char *cmd_ptr, int *err_code, char **err_msg) } node_ptr = task_ptr + 9; for (i=0; node_ptr[i]!='\0'; i++) { - if (node_ptr[i] == ':') + if (node_ptr[i] == ':') { node_ptr[i] = ','; + task_cnt++; + } } hl = hostlist_create(node_ptr); if (hl == NULL) { @@ -100,7 +102,7 @@ extern int start_job(char *cmd_ptr, int *err_code, char **err_msg) host_string); return -1; } - if (_start_job(jobid, host_string, err_code, err_msg) != 0) + if (_start_job(jobid, task_cnt, host_string, err_code, err_msg) != 0) return -1; snprintf(reply_msg, sizeof(reply_msg), @@ -109,10 +111,10 @@ extern int start_job(char *cmd_ptr, int *err_code, char **err_msg) return 0; } -static int _start_job(uint32_t jobid, char *hostlist, +static int _start_job(uint32_t jobid, int task_cnt, char *hostlist, int *err_code, char **err_msg) { - int rc = 0; + int rc = 0, old_task_cnt; struct job_record *job_ptr; /* Write lock on job info, read lock on node info */ slurmctld_lock_t job_write_lock = { @@ -161,12 +163,12 @@ static int _start_job(uint32_t jobid, char *hostlist, goto fini; } - /* Remove any excluded nodes, incompatable with Wiki */ - if (job_ptr->details->exc_nodes) { - error("wiki: clearing exc_nodes for job %u", jobid); - xfree(job_ptr->details->exc_nodes); - if (job_ptr->details->exc_node_bitmap) - FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); + /* User excluded node list incompatible with Wiki + * Exclude all nodes not explicitly requested */ + if (job_ptr->cr_enabled) { + FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); + job_ptr->details->exc_node_bitmap = bit_copy(new_bitmap); + bit_not(job_ptr->details->exc_node_bitmap); } /* start it now */ @@ -174,6 +176,8 @@ static int _start_job(uint32_t jobid, char *hostlist, 
job_ptr->details->req_nodes = new_node_list; FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); job_ptr->details->req_node_bitmap = new_bitmap; + old_task_cnt = job_ptr->num_procs; + job_ptr->num_procs = MAX(task_cnt, old_task_cnt); job_ptr->priority = 100000000; fini: unlock_slurmctld(job_write_lock); @@ -189,6 +193,7 @@ static int _start_job(uint32_t jobid, char *hostlist, /* restore job state */ job_ptr->priority = 0; + job_ptr->num_procs = old_task_cnt; if (job_ptr->details) { /* Details get cleared on job abort; happens * if the request is sufficiently messed up.