diff --git a/NEWS b/NEWS index ae679b8be85626be4206bde244817a9103ca6667..b6ae20383e500606b237e308d5970c33616b9301 100644 --- a/NEWS +++ b/NEWS @@ -12,6 +12,9 @@ documents those changes that are of interest to users and admins. -- Remove sched/wiki plugin (use sched/wiki2 for now) -- Disable pthread_create() for PMI_send when TotalView is running for better performance. + -- fixed certain tests in test suite to not run with bluegene or front-end + systems + -- removed addresses from slurm_step_layout_t * Changes in SLURM 1.2.0-pre2 ============================= diff --git a/slurm/slurm.h.in b/slurm/slurm.h.in index a3a8c12ae5cc9c1b77d0c84ee60a7531eb9dd504..b4d0ced47ebd307b612f4a11e77e8235b1491eff 100644 --- a/slurm/slurm.h.in +++ b/slurm/slurm.h.in @@ -506,8 +506,6 @@ typedef struct slurm_step_layout { uint16_t node_cnt; /* node count */ uint32_t task_cnt; /* total number of tasks in the step */ char *node_list; /* list of nodes in step */ - slurm_addr *node_addr; /* corresponding addresses */ - /* Array of length "node_cnt". Each element of the array is the number of tasks assigned to the corresponding node */ uint32_t *tasks; diff --git a/src/api/job_info.c b/src/api/job_info.c index 74481eb5ab9a1cd3d3dfd69cd86c2864ac3cac65..9f3ffc7f31b39da5c26cf74a7c8da3a19b618c52 100644 --- a/src/api/job_info.c +++ b/src/api/job_info.c @@ -16,7 +16,7 @@ * any later version. * * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than @@ -345,7 +345,6 @@ slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner ) xstrcat(out, "\n "); xstrcat(out, select_buf); } - xstrcat(out, select_buf); xstrcat(out, "\n\n"); return out; diff --git a/src/api/spawn.c b/src/api/spawn.c index 596a7a59578968ed6ca0ebaca97fe3b990e7010f..2f737b9a76f4e954d7beca0a3384962e08a34faa 100644 --- a/src/api/spawn.c +++ b/src/api/spawn.c @@ -59,6 +59,7 @@ #include "src/common/hostlist.h" #include "src/common/slurm_protocol_api.h" #include "src/common/slurm_protocol_defs.h" +#include "src/common/read_config.h" #include "src/common/xmalloc.h" #include "src/common/xstring.h" @@ -109,7 +110,9 @@ extern int slurm_spawn (slurm_step_ctx ctx, int *fd_array) int task_cnt = 0; uint32_t *cpus = NULL; slurm_step_layout_t *step_layout = ctx->step_resp->step_layout; - + hostlist_t hl = NULL; + char *name = NULL; + if ((ctx == NULL) || (ctx->magic != STEP_CTX_MAGIC) || (fd_array == NULL)) { @@ -152,6 +155,7 @@ extern int slurm_spawn (slurm_step_ctx ctx, int *fd_array) req_array_ptr = xmalloc(sizeof(slurm_msg_t) * step_layout->node_cnt); + hl = hostlist_create(step_layout->node_list); for (i=0; i<step_layout->node_cnt; i++) { spawn_task_request_msg_t *r = &msg_array_ptr[i]; slurm_msg_t *m = &req_array_ptr[i]; @@ -179,15 +183,28 @@ extern int slurm_spawn (slurm_step_ctx ctx, int *fd_array) m->msg_type = REQUEST_SPAWN_TASK; m->data = r; - - memcpy(&m->address, &step_layout->node_addr[i], - sizeof(slurm_addr)); + name = hostlist_shift(hl); + if(!name) { + error("hostlist incomplete for this job request"); + hostlist_destroy(hl); + return SLURM_ERROR; + } + if(slurm_conf_get_addr(name, &m->address) + == SLURM_ERROR) { + error("_init_task_layout: can't get addr for " + "host %s", name); + free(name); + hostlist_destroy(hl); + return SLURM_ERROR; + } + free(name); #if _DEBUG printf("tid=%d, fd=%d, port=%u, node_id=%u\n", step_layout->tids[i][0], fd_array[i], r->io_port, i); #endif } + hostlist_destroy(hl); rc = _p_launch(req_array_ptr, ctx); xfree(msg_array_ptr); diff --git a/src/common/read_config.c b/src/common/read_config.c index 16b5368bd152ae96e6410132fef058b58fb9acfe..169592ad676db4ff759310f97f5f72df7a22af09 100644 --- a/src/common/read_config.c +++ b/src/common/read_config.c @@ -714,6 +714,7 @@ static int _register_conf_node_aliases(slurm_conf_node_t *node_ptr) free(hostname); free(address); #endif + } /* free allocated storage */ @@ -795,8 +796,8 @@ extern char *slurm_conf_get_nodename(const char *node_hostname) slurm_conf_lock(); _init_slurmd_nodehash(); - idx = _get_hash_idx(node_hostname); + p = host_to_node_hashtbl[idx]; while (p) { if (strcmp(p->hostname, node_hostname) == 0) { diff --git a/src/common/slurm_step_layout.c b/src/common/slurm_step_layout.c index 12595caeadaa2e8906a65257d493e443f9e3556c..c9687fc5fc1046b50cd0a6bb7e2896dbc2b76018 100644 --- a/src/common/slurm_step_layout.c +++ b/src/common/slurm_step_layout.c @@ -158,7 +158,7 @@ slurm_step_layout_t *fake_slurm_step_layout_create( { uint32_t cpn = 1; int cpu_cnt = 0, cpu_inx = 0, i, j; - char *name = NULL; +/* char *name = NULL; */ hostlist_t hl = NULL; slurm_step_layout_t *step_layout = xmalloc(sizeof(slurm_step_layout_t)); @@ -185,8 +185,8 @@ slurm_step_layout_t *fake_slurm_step_layout_create( step_layout->node_cnt = node_cnt; step_layout->tasks = xmalloc(sizeof(uint32_t) * node_cnt); step_layout->tids = xmalloc(sizeof(uint32_t *) * node_cnt); - step_layout->node_addr = - xmalloc(sizeof(slurm_addr) * node_cnt); +/* step_layout->node_addr = */ +/* xmalloc(sizeof(slurm_addr) * node_cnt); */ step_layout->task_cnt = 0; for (i=0; i<step_layout->node_cnt; i++) { @@ -223,26 +223,26 @@ slurm_step_layout_t *fake_slurm_step_layout_create( } } } - name = hostlist_shift(hl); - if(!name) { - error("fake_slurm_step_layout_create: " - "We don't have the correct nodelist."); - goto error; - } - if(slurm_conf_get_addr(name, &step_layout->node_addr[i]) == - SLURM_ERROR) { - error("fake_slurm_step_layout_create: " - "we didn't get an addr for host %s.", name); +/* name = hostlist_shift(hl); */ +/* if(!name) { */ +/* error("fake_slurm_step_layout_create: " */ +/* "We don't have the correct nodelist."); */ +/* goto error; */ +/* } */ +/* if(slurm_conf_get_addr(name, &step_layout->node_addr[i]) == */ +/* SLURM_ERROR) { */ +/* error("fake_slurm_step_layout_create: " */ +/* "we didn't get an addr for host %s.", name); */ - } - free(name); +/* } */ +/* free(name); */ } hostlist_destroy(hl); return step_layout; -error: - hostlist_destroy(hl); - slurm_step_layout_destroy(step_layout); - return NULL; +/* error: */ +/* hostlist_destroy(hl); */ +/* slurm_step_layout_destroy(step_layout); */ +/* return NULL; */ } @@ -261,9 +261,9 @@ extern slurm_step_layout_t *slurm_step_layout_copy( layout->node_cnt = step_layout->node_cnt; layout->task_cnt = step_layout->task_cnt; - layout->node_addr = xmalloc(sizeof(slurm_addr) * layout->node_cnt); - memcpy(layout->node_addr, step_layout->node_addr, - (sizeof(slurm_addr) * layout->node_cnt)); +/* layout->node_addr = xmalloc(sizeof(slurm_addr) * layout->node_cnt); */ +/* memcpy(layout->node_addr, step_layout->node_addr, */ +/* (sizeof(slurm_addr) * layout->node_cnt)); */ layout->tasks = xmalloc(sizeof(uint32_t) * layout->node_cnt); memcpy(layout->tasks, step_layout->tasks, @@ -292,8 +292,8 @@ extern void pack_slurm_step_layout(slurm_step_layout_t *step_layout, packstr(step_layout->node_list, buffer); pack16(step_layout->node_cnt, buffer); pack32(step_layout->task_cnt, buffer); - slurm_pack_slurm_addr_array(step_layout->node_addr, - step_layout->node_cnt, buffer); +/* slurm_pack_slurm_addr_array(step_layout->node_addr, */ +/* step_layout->node_cnt, buffer); */ for(i=0; i<step_layout->node_cnt; i++) { pack32_array(step_layout->tids[i], step_layout->tasks[i], @@ -323,11 +323,11 @@ extern int unpack_slurm_step_layout(slurm_step_layout_t **layout, Buf buffer) safe_unpack16(&step_layout->node_cnt, buffer); safe_unpack32(&step_layout->task_cnt, buffer); - if (slurm_unpack_slurm_addr_array(&(step_layout->node_addr), - &uint16_tmp, buffer)) - goto unpack_error; - if (uint16_tmp != step_layout->node_cnt) - goto unpack_error; +/* if (slurm_unpack_slurm_addr_array(&(step_layout->node_addr), */ +/* &uint16_tmp, buffer)) */ +/* goto unpack_error; */ +/* if (uint16_tmp != step_layout->node_cnt) */ +/* goto unpack_error; */ step_layout->tasks = xmalloc(sizeof(uint32_t) * step_layout->node_cnt); step_layout->tids = xmalloc(sizeof(uint32_t *) @@ -353,7 +353,7 @@ extern int slurm_step_layout_destroy(slurm_step_layout_t *step_layout) int i=0; if(step_layout) { xfree(step_layout->node_list); - xfree(step_layout->node_addr); +/* xfree(step_layout->node_addr); */ xfree(step_layout->tasks); for (i = 0; i < step_layout->node_cnt; i++) { xfree(step_layout->tids[i]); @@ -397,7 +397,7 @@ static int _init_task_layout(slurm_step_layout_t *step_layout, { int cpu_cnt = 0, cpu_inx = 0, i; hostlist_t hl = NULL; - char *name = NULL; +/* char *name = NULL; */ uint32_t cpus[step_layout->node_cnt]; if (step_layout->node_cnt == 0) @@ -405,8 +405,8 @@ static int _init_task_layout(slurm_step_layout_t *step_layout, if (step_layout->tasks) /* layout already completed */ return SLURM_SUCCESS; - step_layout->node_addr = xmalloc(sizeof(slurm_addr) - * step_layout->node_cnt); +/* step_layout->node_addr = xmalloc(sizeof(slurm_addr) */ +/* * step_layout->node_cnt); */ step_layout->tasks = xmalloc(sizeof(uint32_t) * step_layout->node_cnt); step_layout->tids = xmalloc(sizeof(uint32_t *) @@ -428,22 +428,22 @@ static int _init_task_layout(slurm_step_layout_t *step_layout, } for (i=0; i<step_layout->node_cnt; i++) { - name = hostlist_shift(hl); - if(!name) { - error("hostlist incomplete for this job request"); - hostlist_destroy(hl); - return SLURM_ERROR; - } - if(slurm_conf_get_addr(name, &step_layout->node_addr[i]) - == SLURM_ERROR) { - error("_init_task_layout: can't get addr for " - "host %s", name); - free(name); - continue; - } +/* name = hostlist_shift(hl); */ +/* if(!name) { */ +/* error("hostlist incomplete for this job request"); */ +/* hostlist_destroy(hl); */ +/* return SLURM_ERROR; */ +/* } */ +/* if(slurm_conf_get_addr(name, &step_layout->node_addr[i]) */ +/* == SLURM_ERROR) { */ +/* error("_init_task_layout: can't get addr for " */ +/* "host %s", name); */ +/* free(name); */ +/* continue; */ +/* } */ - debug2("host %d = %s", i, name); - free(name); +/* debug2("host %d = %s", i, name); */ +/* free(name); */ cpus[i] = cpus_per_node[cpu_inx]; if ((++cpu_cnt) >= cpu_count_reps[cpu_inx]) { diff --git a/src/plugins/jobacct/common/common_slurmctld.c b/src/plugins/jobacct/common/common_slurmctld.c index 76be8ee50f72e49da10244b21b4e3f229e365451..8a6dd5fd3f46722efbc879198993aa435db9d08f 100644 --- a/src/plugins/jobacct/common/common_slurmctld.c +++ b/src/plugins/jobacct/common/common_slurmctld.c @@ -287,14 +287,14 @@ extern int common_step_start_slurmctld(struct step_record *step) if(quarter != (uint16_t)NO_VAL && nodecard != (uint16_t)NO_VAL) snprintf(node_list, BUFFER_SIZE, - "%s.%d.%d", step->step_layout->node_list, + "%s.%d.%d", step->job_ptr->nodes, quarter, nodecard); else if(quarter != (uint16_t)NO_VAL) snprintf(node_list, BUFFER_SIZE, - "%s.%d", step->step_layout->node_list, quarter); + "%s.%d", step->job_ptr->nodes, quarter); else snprintf(node_list, BUFFER_SIZE, "%s", - step->step_layout->node_list); + step->job_ptr->nodes); #else if(!step->step_layout || !step->step_layout->task_cnt) { @@ -405,14 +405,14 @@ extern int common_step_complete_slurmctld(struct step_record *step) if(quarter != (uint16_t)NO_VAL && nodecard != (uint16_t)NO_VAL) snprintf(node_list, BUFFER_SIZE, - "%s.%d.%d", step->step_layout->node_list, + "%s.%d.%d", step->job_ptr->nodes, quarter, nodecard); else if(quarter != (uint16_t)NO_VAL) snprintf(node_list, BUFFER_SIZE, - "%s.%d", step->step_layout->node_list, quarter); + "%s.%d", step->job_ptr->nodes, quarter); else snprintf(node_list, BUFFER_SIZE, "%s", - step->step_layout->node_list); + step->job_ptr->nodes); #else if(!step->step_layout || !step->step_layout->task_cnt) { @@ -511,7 +511,7 @@ extern int common_suspend_slurmctld(struct job_record *job_ptr) debug("jobacct init was not called or it failed"); return SLURM_ERROR; } - + /* tell what time has passed */ if(!now) now = job_ptr->start_time; diff --git a/src/sacct/print.c b/src/sacct/print.c index f2fb7e1f7d231534e473ac76d356243e1e7165d9..5ba577be4abaf7f7640b6e7a04d70232b1faf75d 100644 --- a/src/sacct/print.c +++ b/src/sacct/print.c @@ -40,7 +40,7 @@ #include "sacct.h" #include "src/common/parse_time.h" #include "slurm.h" -#define FORMAT_STRING_SIZE 32 +#define FORMAT_STRING_SIZE 50 void _elapsed_time(long secs, long usecs, char *str); @@ -713,10 +713,10 @@ void print_pages(type_t type, void *object) switch(type) { case HEADLINE: - printf("%-35s", "MaxPages/Node:Task - Ave"); + printf("%-50s", "MaxPages/Node:Task - Ave"); break; case UNDERSCORE: - printf("%-35s", "----------------------------------"); + printf("%-50s", "----------------------------------"); break; case JOB: sacct = job->sacct; @@ -736,7 +736,7 @@ void print_pages(type_t type, void *object) sacct.max_pages_id.taskid, buf2); } - printf("%-35s", outbuf); + printf("%-50s", outbuf); break; case JOBSTEP: sacct = step->sacct; @@ -750,7 +750,7 @@ void print_pages(type_t type, void *object) buf3, sacct.max_pages_id.taskid, buf2); - printf("%-35s", outbuf); + printf("%-50s", outbuf); break; } } @@ -769,10 +769,10 @@ void print_rss(type_t type, void *object) switch(type) { case HEADLINE: - printf("%-32s", "MaxRSS/Node:Task - Ave"); + printf("%-50s", "MaxRSS/Node:Task - Ave"); break; case UNDERSCORE: - printf("%-32s", "--------------------------------"); + printf("%-50s", "--------------------------------"); break; case JOB: sacct = job->sacct; @@ -792,7 +792,7 @@ void print_rss(type_t type, void *object) sacct.max_rss_id.taskid, buf2); } - printf("%-32s", outbuf); + printf("%-50s", outbuf); break; case JOBSTEP: sacct = step->sacct; @@ -806,7 +806,7 @@ void print_rss(type_t type, void *object) buf3, sacct.max_rss_id.taskid, buf2); - printf("%-32s", outbuf); + printf("%-50s", outbuf); break; } } @@ -1044,10 +1044,10 @@ void print_vsize(type_t type, void *object) switch(type) { case HEADLINE: - printf("%-34s", "MaxVSIZE/Node:Task - Ave"); + printf("%-50s", "MaxVSIZE/Node:Task - Ave"); break; case UNDERSCORE: - printf("%-34s", "----------------------------------"); + printf("%-50s", "----------------------------------"); break; case JOB: sacct = job->sacct; @@ -1066,7 +1066,7 @@ void print_vsize(type_t type, void *object) sacct.max_vsize_id.taskid, buf2); } - printf("%-34s", outbuf); + printf("%-50s", outbuf); break; case JOBSTEP: sacct = step->sacct; @@ -1080,7 +1080,7 @@ void print_vsize(type_t type, void *object) buf3, sacct.max_vsize_id.taskid, buf2); - printf("%-34s", outbuf); + printf("%-50s", outbuf); break; } } @@ -1099,10 +1099,10 @@ void print_cputime(type_t type, void *object) switch(type) { case HEADLINE: - printf("%-36s", "MinCPUtime/Node:Task - Ave"); + printf("%-50s", "MinCPUtime/Node:Task - Ave"); break; case UNDERSCORE: - printf("%-36s", "------------------------------------"); + printf("%-50s", "------------------------------------"); break; case JOB: sacct = job->sacct; @@ -1122,7 +1122,7 @@ void print_cputime(type_t type, void *object) sacct.min_cpu_id.taskid, buf2); } - printf("%-36s", outbuf); + printf("%-50s", outbuf); break; case JOBSTEP: sacct = step->sacct; @@ -1137,7 +1137,7 @@ void print_cputime(type_t type, void *object) buf3, sacct.min_cpu_id.taskid, buf2); - printf("%-36s", outbuf); + printf("%-50s", outbuf); break; } } diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index e428c4c21d0ad97613efbadb21050a435f102105..57d84879cd306b66bb4fd87857f504ee4e7dfef3 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -4212,7 +4212,8 @@ extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid, job_ptr->suspend_time = now; reply: - jobacct_g_suspend_slurmctld(job_ptr); + if(job_ptr) + jobacct_g_suspend_slurmctld(job_ptr); if (conn_fd >= 0) { slurm_msg_t_init(&resp_msg); diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c index c90ec6f7a7b4b2e26d6339e1bee4f92ab752af62..853ee85a94b3dfd35b907120f65b7c84d568bb01 100644 --- a/src/slurmd/slurmd/slurmd.c +++ b/src/slurmd/slurmd/slurmd.c @@ -16,7 +16,7 @@ * any later version. * * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index 61e9b54c99f097527e68ad7018c5cfdba8f66892..7010dc84ed9273521d86572ea2d034ca0578ee89 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -16,7 +16,7 @@ * any later version. * * In addition, as a special exception, the copyright holders give permission - * to link the code of portions of this program with the OpenSSL library under + * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than @@ -1327,8 +1327,15 @@ _send_launch_failure (launch_tasks_request_msg_t *msg, slurm_addr *cli, int rc) { slurm_msg_t resp_msg; launch_tasks_response_msg_t resp; - int nodeid = nodelist_find(msg->complete_nodelist, conf->node_name); - + int nodeid = 0; + char *name = NULL; +#ifndef HAVE_FRONT_END + nodeid = nodelist_find(msg->complete_nodelist, conf->node_name); + name = xstrdup(conf->node_name); +#else + name = xstrdup(msg->complete_nodelist); + +#endif debug ("sending launch failure message: %s", slurm_strerror (rc)); slurm_msg_t_init(&resp_msg); @@ -1339,12 +1346,12 @@ _send_launch_failure (launch_tasks_request_msg_t *msg, slurm_addr *cli, int rc) resp_msg.data = &resp; resp_msg.msg_type = RESPONSE_LAUNCH_TASKS; - resp.node_name = conf->node_name; + resp.node_name = name; resp.return_code = rc ? rc : -1; resp.count_of_pids = 0; slurm_send_only_node_msg(&resp_msg); - + xfree(name); return; } @@ -1366,7 +1373,7 @@ _send_launch_resp(slurmd_job_t *job, int rc) resp_msg.data = &resp; resp_msg.msg_type = RESPONSE_LAUNCH_TASKS; - resp.node_name = conf->node_name; + resp.node_name = xstrdup(job->node_name); resp.return_code = rc; resp.count_of_pids = job->ntasks; @@ -1381,6 +1388,7 @@ _send_launch_resp(slurmd_job_t *job, int rc) xfree(resp.local_pids); xfree(resp.task_ids); + xfree(resp.node_name); } @@ -1396,7 +1404,7 @@ _complete_batch_script(slurmd_job_t *job, int err, int status) req.slurm_rc = err; slurm_msg_t_init(&req_msg); - req.node_name = conf->node_name; + req.node_name = job->node_name; req_msg.msg_type= REQUEST_COMPLETE_BATCH_SCRIPT; req_msg.data = &req; diff --git a/src/slurmd/slurmstepd/slurmstepd.c b/src/slurmd/slurmstepd/slurmstepd.c index 08682cecbb15727f77abd5d603926b0468197a40..c022c0242264eadde9dd178ea694377e9533bbd8 100644 --- a/src/slurmd/slurmstepd/slurmstepd.c +++ b/src/slurmd/slurmstepd/slurmstepd.c @@ -346,6 +346,9 @@ _step_setup(slurm_addr *cli, slurm_addr *self, slurm_msg_t *msg) fatal("handle_launch_message: Unrecognized launch/spawn RPC"); break; } + if(!job) { + fatal("_step_setup: no job returned"); + } job->jmgr_pid = getpid(); job->jobacct = jobacct_g_alloc(NULL); diff --git a/src/slurmd/slurmstepd/slurmstepd_job.c b/src/slurmd/slurmstepd/slurmstepd_job.c index c9129641f8dff862825c7a63ccd19a49c7c9cb64..28bddef37247c53c95ef9202b59081c16cc74853 100644 --- a/src/slurmd/slurmstepd/slurmstepd_job.c +++ b/src/slurmd/slurmstepd/slurmstepd_job.c @@ -157,15 +157,7 @@ job_create(launch_tasks_request_msg_t *msg, slurm_addr *cli_addr) xassert(msg != NULL); xassert(msg->complete_nodelist != NULL); - - nodeid = nodelist_find(msg->complete_nodelist, conf->node_name); - debug3("entering job_create on node %d", nodeid); - - if(nodeid < 0) { - error("couldn't find node %s in %s", - conf->node_name, msg->complete_nodelist); - return NULL; - } + debug3("entering job_create"); if ((pwd = _pwd_create((uid_t)msg->uid)) == NULL) { error("uid %ld not found on system", (long) msg->uid); slurm_seterrno (ESLURMD_UID_NOT_FOUND); @@ -177,7 +169,20 @@ job_create(launch_tasks_request_msg_t *msg, slurm_addr *cli_addr) return NULL; } job = xmalloc(sizeof(slurmd_job_t)); - +#ifndef HAVE_FRONT_END + nodeid = nodelist_find(msg->complete_nodelist, conf->node_name); + job->node_name = xstrdup(conf->node_name); +#else + nodeid = 0; + job->node_name = xstrdup(msg->complete_nodelist); +#endif + if(nodeid < 0) { + error("couldn't find node %s in %s", + job->node_name, msg->complete_nodelist); + job_destroy(job); + return NULL; + } + job->state = SLURMSTEPD_STEP_STARTING; job->pwd = pwd; job->ntasks = msg->tasks_to_launch[nodeid]; @@ -266,18 +271,29 @@ job_spawn_create(spawn_task_request_msg_t *msg, slurm_addr *cli_addr) int nodeid = NO_VAL; xassert(msg != NULL); + xassert(msg->complete_nodelist != NULL); debug3("entering job_spawn_create"); - nodeid = nodelist_find(msg->complete_nodelist, conf->node_name); - if ((pwd = _pwd_create((uid_t)msg->uid)) == NULL) { error("uid %ld not found on system", (long) msg->uid); slurm_seterrno (ESLURMD_UID_NOT_FOUND); return NULL; } job = xmalloc(sizeof(slurmd_job_t)); - +#ifndef HAVE_FRONT_END + nodeid = nodelist_find(msg->complete_nodelist, conf->node_name); + job->node_name = xstrdup(conf->node_name); +#else + nodeid = 0; + job->node_name = xstrdup(msg->complete_nodelist); +#endif + if(nodeid < 0) { + error("couldn't find node %s in %s", + job->node_name, msg->complete_nodelist); + job_destroy(job); + return NULL; + } job->state = SLURMSTEPD_STEP_STARTING; job->pwd = pwd; job->ntasks = 1; /* tasks to launch always one */ @@ -516,6 +532,7 @@ job_destroy(slurmd_job_t *job) task_info_destroy(job->task[i]); list_destroy(job->sruns); xfree(job->envtp); + xfree(job->node_name); xfree(job->task_prolog); xfree(job->task_epilog); xfree(job); diff --git a/src/slurmd/slurmstepd/slurmstepd_job.h b/src/slurmd/slurmstepd/slurmstepd_job.h index 16cd1233bbd7ed6f9f85e940224f7dc300dafcb4..1b119a31166c88855d7d0a1baef25c5f40443d01 100644 --- a/src/slurmd/slurmstepd/slurmstepd_job.h +++ b/src/slurmd/slurmstepd/slurmstepd_job.h @@ -118,6 +118,8 @@ typedef struct slurmd_job { char **env; /* job environment */ char **argv; /* job argument vector */ char *cwd; /* path to current working directory */ + char *node_name; /* node name of node running job + * needed for front-end systems */ cpu_bind_type_t cpu_bind_type; /* --cpu_bind= */ char *cpu_bind; /* binding map for map/mask_cpu */ mem_bind_type_t mem_bind_type; /* --mem_bind= */ diff --git a/src/srun/msg.c b/src/srun/msg.c index 660689ff056251dfd52731e86c3850e96f879510..90020d8ad6030c5a22207e7dcdd3e651b17f7dcf 100644 --- a/src/srun/msg.c +++ b/src/srun/msg.c @@ -363,8 +363,9 @@ static void _process_launch_resp(srun_job_t *job, launch_tasks_response_msg_t *msg) { pipe_enum_t pipe_enum = PIPE_HOST_STATE; - int nodeid = nodelist_find(job->step_layout->node_list, + int nodeid = nodelist_find(job->step_layout->node_list, msg->node_name); + if ((nodeid < 0) || (nodeid >= job->nhosts)) { error ("Bad launch response from %s", msg->node_name); return; diff --git a/src/srun/reattach.c b/src/srun/reattach.c index 9a727315b532aebd3ea3564da077a2d3cd571888..7b32f31f76d9d8d41e655b32c7afe3f68c2c1a88 100644 --- a/src/srun/reattach.c +++ b/src/srun/reattach.c @@ -319,12 +319,15 @@ _attach_to_job(srun_job_t *job) int i; reattach_tasks_request_msg_t *req = NULL; slurm_msg_t *msg = NULL; - + hostlist_t hl = NULL; + char *name = NULL; + req = xmalloc(job->nhosts * sizeof(reattach_tasks_request_msg_t)); msg = xmalloc(job->nhosts * sizeof(slurm_msg_t)); debug("Going to attach to job %u.%u", job->jobid, job->stepid); + hl = hostlist_create(job->step_layout->node_list); for (i = 0; i < job->nhosts; i++) { reattach_tasks_request_msg_t *r = &req[i]; slurm_msg_t *m = &msg[i]; @@ -342,13 +345,27 @@ _attach_to_job(srun_job_t *job) slurm_msg_t_init(m); m->data = r; m->msg_type = REQUEST_REATTACH_TASKS; - - memcpy(&m->address, &job->step_layout->node_addr[i], - sizeof(slurm_addr)); + name = hostlist_shift(hl); + if(!name) { + error("hostlist incomplete for this job request"); + hostlist_destroy(hl); + return SLURM_ERROR; + } + if(slurm_conf_get_addr(name, &m->address) + == SLURM_ERROR) { + error("_init_task_layout: can't get addr for " + "host %s", name); + free(name); + hostlist_destroy(hl); + return SLURM_ERROR; + } + free(name); + /* memcpy(&m->address, &job->step_layout->node_addr[i], */ +/* sizeof(slurm_addr)); */ } - + hostlist_destroy(hl); _p_reattach(msg, job); - + return SLURM_SUCCESS; } diff --git a/testsuite/expect/test1.80 b/testsuite/expect/test1.80 index 6248842a8a0e275a076e140f320a4d4080d8eb22..8715e68e62e568455cc8db89a81561796fd02fdf 100755 --- a/testsuite/expect/test1.80 +++ b/testsuite/expect/test1.80 @@ -38,8 +38,8 @@ set exit_code 0 print_header $test_id -if { [test_xcpu] } { - send_user "\nWARNING: This test is incompatable with XCPU systems\n" +if { [test_front_end] } { + send_user "\nWARNING: This test is incompatable with front end systems\n" exit 0 } diff --git a/testsuite/expect/test12.2 b/testsuite/expect/test12.2 index fdc73d59865a8f21551da974302c68b684c476cb..a8691d8071e164949de06eecada43a498908e77a 100755 --- a/testsuite/expect/test12.2 +++ b/testsuite/expect/test12.2 @@ -48,6 +48,10 @@ set ret_code 42 print_header $test_id +if {[test_bluegene] != 0} { + send_user "\nWARNING: This test is incompatable with bluegene systems\n" + exit 0 +} # # Check if accounting is enabled # diff --git a/testsuite/expect/test18.27 b/testsuite/expect/test18.27 index 8bed364f38caff9a8134a6bec8fc8bb269d82007..e20136ce8ea373913442b413331153a0e6b895e3 100755 --- a/testsuite/expect/test18.27 +++ b/testsuite/expect/test18.27 @@ -38,9 +38,9 @@ set exit_code 0 print_header $test_id -if { [test_xcpu] } { - send_user "\nWARNING: This test is incompatable with XCPU systems\n" - exit 0 +if {[test_front_end] != 0} { + send_user "\nWARNING: Additional testing is incompatable with front-end systems\n" + exit $exit_code } # @@ -91,11 +91,6 @@ if {[string compare $expected_layout $tested_layout] != 0} { set exit_code 1 } -if {[test_front_end] != 0} { - send_user "\nWARNING: Additional testing is incompatable with front-end systems\n" - exit $exit_code -} - # # Submit a two node job with cyclic distribution # diff --git a/testsuite/expect/test3.7 b/testsuite/expect/test3.7 index abcc862766783ec59c5974d9b17aaf5f059418b2..659cbcd3f2fe28e027c649450f7c6d566147fba0 100755 --- a/testsuite/expect/test3.7 +++ b/testsuite/expect/test3.7 @@ -204,7 +204,6 @@ sleep 5 suspend_job $job_id1 suspend if {$not_supported == 1} { exec $scancel $job_id1 - exec $scancel $job_id2 exit 0 } if {$not_supported == 0} {