Move the task id => host id map from srun_job_t into slurm_step_layout_t.

  * slurm_step_layout_t gains "hostids" (one entry per task).  It is
    allocated in task_layout(), filled in by the hostfile, block and cyclic
    layout routines, and released in step_layout_destroy().
  * New accessors step_layout_host_id() and step_layout_host_name() replace
    direct use of job->hostid[] in srun (io.c, msg.c) and in mpi/mpichgm.
  * slurmd rejects a launch only when tasks_to_launch exceeds the
    credential's task count for the host, instead of requiring equality.
  * srun_job_t.hostid and its allocations are removed.

Review changes relative to the first revision of this patch:

  * step_layout_host_id(): range check rewritten so that a negative taskid
    or num_tasks == 0 cannot slip past the unsigned comparison.
  * _reattach_handler(): info("ntasks = %d\n") was missing its argument.
  * reattach(): allocate step_layout->hostids in place of the removed
    job->hostid allocation (NOTE(review): assumes nothing else allocates
    it on the reattach path -- confirm).
  * _exit_handler(): look the host index up once and do not hand
    SLURM_ERROR to update_tasks_state() as a host index.
  * dist_tasks.h: second parameter of step_layout_host_name() is a task id.

NOTE(review): leading whitespace and blank context lines were rebuilt by
hand and the "index" lines still carry the original blob ids; apply with
whitespace-insensitive matching (e.g. "git apply --ignore-whitespace").

diff --git a/NEWS b/NEWS
index 350eacc6da6ac00847b4689b22c827ca30b9c093..ab9473fe2abe6fe461057f4a7004b7a7d35b9bd9 100644
--- a/NEWS
+++ b/NEWS
@@ -74,6 +74,7 @@ documents those changes that are of interest to users and admins.
 =========================
  -- Fix for job accounting logic submitted from Andy Riebs to handle issues
     with suspending jobs and such. patch file named requeue.patch
+ -- Make select/cons_res interoperate with mpi/lam plugin for task counts.
 
 * Changes in SLURM 1.0.9
 ========================
diff --git a/src/common/dist_tasks.c b/src/common/dist_tasks.c
index 313a02184f81455e0b27607877ae0750ee2183bb..cfa9314eb3907cd1210640708bbbce79f75131c1 100644
--- a/src/common/dist_tasks.c
+++ b/src/common/dist_tasks.c
@@ -251,6 +251,7 @@ extern int step_layout_destroy(slurm_step_layout_t *step_layout)
 	xfree(step_layout->tids);
 	xfree(step_layout->cpus);
 	xfree(step_layout->tasks);
+	xfree(step_layout->hostids);
 	hostlist_destroy(step_layout->hl);
 	xfree(step_layout);
 
@@ -274,11 +275,10 @@ extern int task_layout(slurm_step_layout_t *step_layout)
 				     * step_layout->num_hosts);
 	step_layout->host = xmalloc(sizeof(char *)
 				    * step_layout->num_hosts);
-	if ((step_layout->cpus == NULL) || (step_layout->tasks == NULL) ||
-	    (step_layout->host == NULL)) {
-		slurm_seterrno(ENOMEM);
-		return SLURM_ERROR;
-	}
+	step_layout->tids = xmalloc(sizeof(uint32_t *)
+				    * step_layout->num_hosts);
+	step_layout->hostids = xmalloc(sizeof(uint32_t)
+				       * step_layout->num_tasks);
 
 	for (i=0; i<step_layout->num_hosts; i++) {
 		step_layout->host[i] = hostlist_shift(step_layout->hl);
@@ -289,14 +289,6 @@ extern int task_layout(slurm_step_layout_t *step_layout)
 			cpu_cnt = 0;
 		}
 	}
-	step_layout->tasks = xmalloc(sizeof(uint32_t)
-				     * step_layout->num_hosts);
-	step_layout->tids = xmalloc(sizeof(uint32_t *)
-				    * step_layout->num_hosts);
-	if ((step_layout->tasks == NULL) || (step_layout->tids == NULL)) {
-		slurm_seterrno(ENOMEM);
-		return SLURM_ERROR;
-	}
 
 	if (step_layout->task_dist == SLURM_DIST_CYCLIC)
 		return _task_layout_cyclic(step_layout);
@@ -308,6 +300,36 @@ extern int task_layout(slurm_step_layout_t *step_layout)
 		return _task_layout_block(step_layout);
 }
 
+/*
+ * Return the index in s->host[] of the host that runs task "taskid",
+ * or SLURM_ERROR when taskid is negative or not below s->num_tasks.
+ */
+int
+step_layout_host_id (slurm_step_layout_t *s, int taskid)
+{
+	/* Reject negatives before the unsigned compare, and test with ">="
+	 * so that num_tasks == 0 cannot wrap "num_tasks - 1" around. */
+	if ((taskid < 0) || ((uint32_t) taskid >= s->num_tasks))
+		return SLURM_ERROR;
+
+	return (s->hostids[taskid]);
+}
+
+/*
+ * Return the name of the host that runs task "taskid", or NULL when
+ * taskid is out of range.  The result points into s->host[], not a copy.
+ */
+char *
+step_layout_host_name (slurm_step_layout_t *s, int taskid)
+{
+	int hostid = step_layout_host_id (s, taskid);
+
+	if (hostid < 0)
+		return NULL;
+
+	return (s->host[hostid]);
+}
+
 #ifndef HAVE_FRONT_END
 /* use specific set run tasks on each host listed in hostfile
  * XXX: Need to handle over-subscribe.
@@ -345,6 +367,7 @@ static int _task_layout_hostfile(slurm_step_layout_t *step_layout)
 		while((host_task = hostlist_next(itr_task))) {
 			if(!strcmp(host, host_task)) {
 				step_layout->tids[i][j] = taskid;
+				step_layout->hostids[taskid] = i;
 				j++;
 			}
 			taskid++;
@@ -397,8 +420,11 @@ static int _task_layout_block(slurm_step_layout_t *step_layout)
 			slurm_seterrno(ENOMEM);
 			return SLURM_ERROR;
 		}
-		for (j=0; j<step_layout->tasks[i]; j++)
-			step_layout->tids[i][j] = taskid++;
+		for (j=0; j<step_layout->tasks[i]; j++) {
+			step_layout->tids[i][j] = taskid;
+			step_layout->hostids[taskid] = i;
+			taskid++;
+		}
 	}
 	return SLURM_SUCCESS;
 }
@@ -435,7 +461,9 @@ static int _task_layout_cyclic(slurm_step_layout_t *step_layout)
 			     && (taskid<step_layout->num_tasks)); i++) {
 			if ((j<step_layout->cpus[i]) || over_subscribe) {
 				step_layout->tids[i][step_layout->tasks[i]] =
-					taskid++;
+					taskid;
+				step_layout->hostids[taskid] = i;
+				taskid++;
 				step_layout->tasks[i]++;
 				if ((j+1) < step_layout->cpus[i])
 					space_remaining = true;
diff --git a/src/common/dist_tasks.h b/src/common/dist_tasks.h
index 3d2d6c6933ba03be2ccae1bfb3daab0299e46370..1e0f3d52a7bc8a63f9501c9f7f34598413064f5a 100644
--- a/src/common/dist_tasks.h
+++ b/src/common/dist_tasks.h
@@ -56,6 +56,7 @@ typedef struct slurm_step_layout {
 	uint32_t *tasks;	/* number of tasks on each host */
 
 	uint32_t **tids;	/* host id => task id mapping */
+	uint32_t *hostids;	/* task id => host id mapping */
 
 	uint32_t num_hosts;	/* node count */
 	uint32_t num_tasks;	/* number of tasks to execute */
@@ -94,5 +95,11 @@ extern slurm_step_layout_t *step_layout_create(
 extern int step_layout_destroy(slurm_step_layout_t *step_layout);
 /* build maps for task layout on nodes */
 extern int task_layout(slurm_step_layout_t *step_layout);
+
+/* host index of task "taskid", or SLURM_ERROR if taskid is out of range */
+extern int step_layout_host_id (slurm_step_layout_t *s, int taskid);
+
+/* host name of task "taskid", or NULL if taskid is out of range */
+extern char * step_layout_host_name (slurm_step_layout_t *s, int taskid);
 
 #endif /* !_DIST_TASKS_H */
diff --git a/src/plugins/mpi/mpichgm/mpichgm.c b/src/plugins/mpi/mpichgm/mpichgm.c
index 65a1ce89d8022811feafc42fada2572590870381..3d1052054f430c2a6b25af497676629b1552583f 100644
--- a/src/plugins/mpi/mpichgm/mpichgm.c
+++ b/src/plugins/mpi/mpichgm/mpichgm.c
@@ -177,14 +177,17 @@ static int _gmpi_establish_map(srun_job_t *job)
 	 */
 	lmap = (char *)xmalloc(128*nprocs);
 	for (i=0; i<nprocs; i++) {
+		int ihostid = step_layout_host_id(job->step_layout, i);
 		/*
 		 * Compose the string to send.
 		 */
 		dp = &slave_data[i];
 		p = lmap;
 		for (j=0; j<nprocs; j++) {
-			if (job->hostid[i] == job->hostid[j] &&
-			    dp->numanode == slave_data[j].numanode) {
+			int jhostid = step_layout_host_id (job->step_layout, j);
+
+			if ((ihostid == jhostid) &&
+			    (dp->numanode == slave_data[j].numanode)) {
 				sprintf(tmp, "<%u>", j);
 				strcpy(p, tmp);
 				p += strlen(tmp);
@@ -210,7 +213,7 @@ static int _gmpi_establish_map(srun_job_t *job)
 		bzero(&addr, sizeof(addr));
 		addr.sin_family = AF_INET;
 		addr.sin_addr.s_addr
-			= job->slurmd_addr[job->hostid[i]].sin_addr.s_addr;
+			= job->slurmd_addr[ihostid].sin_addr.s_addr;
 		addr.sin_port = htons(dp->remote_port);
 		if (connect(newfd, (struct sockaddr *)&addr, sizeof(addr)))
 			fatal("GMPI master failed to connect");
diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c
index 219e10eb7e3ae93107aa8c906c81c64ac6228fbc..7f5457b604d91045be5f5b582f694ce98020bda4 100644
--- a/src/slurmd/slurmd/req.c
+++ b/src/slurmd/slurmd/req.c
@@ -584,9 +584,9 @@ _check_job_credential(slurm_cred_t cred, uint32_t jobid,
 			goto fail;
 		}
 
-		if (!(arg.ntask[host_index] == tasks_to_launch)) {
-			error("job cr credential (%d != %d) invalid for this host [%d.%d %ld %s]",
-			      arg.ntask[host_index], tasks_to_launch, arg.jobid, arg.stepid,
+		if (tasks_to_launch > arg.ntask[host_index]) {
+			error("job cr credential (%d > %d) invalid for this host [%d.%d %ld %s]",
+			      tasks_to_launch, arg.ntask[host_index], arg.jobid, arg.stepid,
 			      (long) arg.uid, arg.hostlist);
 			goto fail;
 		}
diff --git a/src/srun/io.c b/src/srun/io.c
index 2da00953ae906229efdc2dc8fc225bcaab3c0950..c95658a71474f94bb653c0f6fce8f73d8c88fb5f 100644
--- a/src/srun/io.c
+++ b/src/srun/io.c
@@ -50,6 +50,7 @@
 #include "src/common/xsignal.h"
 #include "src/common/io_hdr.h"
 #include "src/common/net.h"
+#include "src/common/dist_tasks.h"
 
 #include "src/srun/io.h"
 #include "src/srun/srun_job.h"
@@ -731,7 +732,8 @@ again:
 			int nodeid;
 			struct server_io_info *server;
 			msg->ref_count = 1;
-			nodeid = info->job->hostid[header.gtaskid];
+			nodeid = step_layout_host_id(info->job->step_layout,
+						     header.gtaskid);
 			debug3(" taskid %d maps to nodeid %d", header.gtaskid, nodeid);
 			server = info->job->ioserver[nodeid]->arg;
 			list_enqueue(server->msg_queue, msg);
diff --git a/src/srun/msg.c b/src/srun/msg.c
index 22c84bed197afea01ea09dbada9cbbb4f031d1ad..5a3b582501858c3ec6a2b9a23ad8cc6e79d0ed5e 100644
--- a/src/srun/msg.c
+++ b/src/srun/msg.c
@@ -603,9 +603,12 @@ _reattach_handler(srun_job_t *job, slurm_msg_t *msg)
 
 	job->step_layout->tasks[resp->srun_node_id] = resp->ntasks;
 
+	info ("ntasks = %d\n", resp->ntasks);
+
 	for (i = 0; i < resp->ntasks; i++) {
 		job->step_layout->tids[resp->srun_node_id][i] = resp->gtids[i];
-		job->hostid[resp->gtids[i]] = resp->srun_node_id;
+		job->step_layout->hostids[resp->gtids[i]] = resp->srun_node_id;
+		info ("setting task%d on hostid %d\n", resp->gtids[i], resp->srun_node_id);
 	}
 	_update_step_layout(job->forked_msg->par_msg->msg_pipe[1],
 			    job->step_layout, resp->srun_node_id);
@@ -708,12 +711,16 @@ _exit_handler(srun_job_t *job, slurm_msg_t *exit_msg)
 {
 	task_exit_msg_t *msg = (task_exit_msg_t *) exit_msg->data;
 	hostlist_t hl = hostlist_create(NULL);
-	int hostid = exit_msg->srun_node_id;
-	char *host = job->step_layout->host[hostid];
+	int task0 = msg->task_id_list[0];
+	int hostid = step_layout_host_id(job->step_layout, task0);
+	char *host = NULL;
 	int status = msg->return_code;
 	int i;
 	char buf[1024];
 
+	if (!(host = step_layout_host_name(job->step_layout, task0)))
+		host = "Unknown host";
+
 	if (!job->etimeout && !tasks_exited)
 		job->etimeout = time(NULL) + opt.max_exit_timeout;
 
@@ -749,7 +756,9 @@ _exit_handler(srun_job_t *job, slurm_msg_t *exit_msg)
 		}
 	}
 
-	update_tasks_state(job, hostid);
+	/* hostid is SLURM_ERROR when task0 is not in the layout */
+	if (hostid >= 0)
+		update_tasks_state(job, hostid);
 
 	_print_exit_status(job, hl, host, status);
 
diff --git a/src/srun/reattach.c b/src/srun/reattach.c
index f304d29c6b4fd353b76321f16ae9bf92d1f342a4..cefaed2a82bbb704a40891b2a722b82d22e0d173 100644
--- a/src/srun/reattach.c
+++ b/src/srun/reattach.c
@@ -443,7 +443,7 @@ int reattach()
 	job->jobid = s->jobid;
 	job->stepid = s->stepid;
 	job->step_layout->tids = xmalloc(job->nhosts * sizeof(uint32_t *));
-	job->hostid = xmalloc(s->ntasks * sizeof(uint32_t *));
+	job->step_layout->hostids = xmalloc(s->ntasks * sizeof(uint32_t));
 
 	if (job->stepid == NO_VAL) {
 		char *new_argv0 = NULL;
diff --git a/src/srun/srun_job.c b/src/srun/srun_job.c
index 600b4173f9947ceab125cb137e1e817b897539f7..a33c4650da23b51c091cb7bc97fd7f4f0dd9f687 100644
--- a/src/srun/srun_job.c
+++ b/src/srun/srun_job.c
@@ -244,8 +244,6 @@ _job_create_structure(allocation_info_t *info)
 
 	job->listenport = (int *) xmalloc(job->num_listen * sizeof(int));
 
-	job->hostid = xmalloc(opt.nprocs * sizeof(uint32_t));
-
 	slurm_mutex_init(&job->task_mutex);
 
 	job->old_job = false;
diff --git a/src/srun/srun_job.h b/src/srun/srun_job.h
index c2faafadfde484b1de2cbd702cbe65d8a01d81a1..556d31fdd36f9d98569e1a8f454f9bb1b09239af 100644
--- a/src/srun/srun_job.h
+++ b/src/srun/srun_job.h
@@ -107,7 +107,6 @@ typedef struct srun_job {
 	slurm_cred_t cred;	/* Slurm job credential */
 	char *nodelist;		/* nodelist in string form */
 
-	uint32_t *hostid;	/* task id => host id mapping */
 	slurm_addr *slurmd_addr;/* slurm_addr vector to slurmd's */