diff --git a/NEWS b/NEWS index 545e9a65d895f44eeab47d7ae8231fc4cbd541d5..8578390dea5dcfa4a83dae6ea086ac85dbcca3bc 100644 --- a/NEWS +++ b/NEWS @@ -1,10 +1,12 @@ This file describes changes in recent versions of SLURM. It primarily documents those changes that are of interest to users and admins. -* Changes in SLURM 1.1.0-pre6 +* Changes in SLURM 1.1.0-pre7 ============================= -- Accounting works for aix systems, use jobacct/aix -- Support large (over 2GB) files on 32-bit linux systems + -- changed all writes to safe_write in srun + -- added $float to globals.example in the testsuite * Changes in SLURM 1.1.0-pre6 ============================= diff --git a/src/sacct/options.c b/src/sacct/options.c index d95830b69c3363d8971c38ab2f4ea4a866c838a0..83e315bb130b26d05fa4b1237862797ce515ec41 100644 --- a/src/sacct/options.c +++ b/src/sacct/options.c @@ -472,12 +472,12 @@ int get_data(void) show_full = 1; list_iterator_destroy(itr); goto foundjob; - } else if (rec_type == JOB_STEP + } else if (rec_type != JOB_STEP || !strcmp(f[F_JOBSTEP], selected_step->step)) { list_iterator_destroy(itr); goto foundjob; - } + } } list_iterator_destroy(itr); continue; /* no match */ @@ -914,7 +914,6 @@ void parse_command_line(int argc, char **argv) selected_step = xmalloc(sizeof(selected_step_t)); list_append(selected_steps, selected_step); - selected_step->job = xstrdup(start); dot = strstr(start, "."); if (dot == NULL) { debug2("No jobstep requested"); @@ -923,6 +922,7 @@ void parse_command_line(int argc, char **argv) *dot++ = 0; selected_step->step = xstrdup(dot); } + selected_step->job = xstrdup(start); start = end + 1; } if (params.opt_verbose) { @@ -1715,7 +1715,7 @@ void do_list(void) print_fields(JOB, job); } - if (do_jobsteps && job->track_steps) { + if (do_jobsteps && (job->track_steps || !job->show_full)) { itr_step = list_iterator_create(job->steps); while((step = list_next(itr_step))) { if (step->status == JOB_RUNNING diff --git a/src/srun/io.c b/src/srun/io.c index 3e5b153144fcac263d4eaf66993f615a0a0ee994..5bf4e663c5f030b9a5188fda5c6aaa98698c6ef3 100644 --- a/src/srun/io.c +++ b/src/srun/io.c @@ -380,7 +380,8 @@ _server_write(eio_obj_t *obj, List objs) debug3("_server_write: nothing in the queue"); return SLURM_SUCCESS; } - debug3(" dequeue successful, s->out_msg->length = %d", s->out_msg->length); + debug3(" dequeue successful, s->out_msg->length = %d", + s->out_msg->length); s->out_remaining = s->out_msg->length; } @@ -420,7 +421,6 @@ again: s->out_msg = NULL; return SLURM_SUCCESS; - } /********************************************************************** @@ -451,7 +451,10 @@ static void _write_label(int fd, int taskid) snprintf(buf, 16, "%0*d: ", fmt_width, taskid); /* FIXME - Need to handle return code */ - write(fd, buf, fmt_width+2); + safe_write(fd, buf, fmt_width+2); + return; +rwfail: + error("_write_label: write from io process failed"); } static void _write_newline(int fd) @@ -468,6 +471,7 @@ again: } /* FIXME handle error */ } + return; } /* diff --git a/src/srun/launch.c b/src/srun/launch.c index a7970702c2d975f49fd830cf280aab59666d4ad6..1e809b2500cf8c862b4121fe0eef16d3957e2e6d 100644 --- a/src/srun/launch.c +++ b/src/srun/launch.c @@ -431,11 +431,11 @@ _update_failed_node(srun_job_t *j, int id) j->host_state[id] = SRUN_HOST_UNREACHABLE; if(message_thread) { - write(j->forked_msg->par_msg->msg_pipe[1], + safe_write(j->forked_msg->par_msg->msg_pipe[1], &pipe_enum,sizeof(int)); - write(j->forked_msg->par_msg->msg_pipe[1], + safe_write(j->forked_msg->par_msg->msg_pipe[1], &id,sizeof(int)); - write(j->forked_msg->par_msg->msg_pipe[1], + safe_write(j->forked_msg->par_msg->msg_pipe[1], &j->host_state[id],sizeof(int)); } } @@ -445,18 +445,23 @@ _update_failed_node(srun_job_t *j, int id) j->task_state[j->step_layout->tids[id][i]] = SRUN_TASK_FAILED; if(message_thread) { - write(j->forked_msg->par_msg->msg_pipe[1], - &pipe_enum,sizeof(int)); - write(j->forked_msg->par_msg->msg_pipe[1], - &j->step_layout->tids[id][i],sizeof(int)); - write(j->forked_msg->par_msg->msg_pipe[1], - &j->task_state[j->step_layout->tids[id][i]], - sizeof(int)); + safe_write(j->forked_msg->par_msg->msg_pipe[1], + &pipe_enum, sizeof(int)); + safe_write(j->forked_msg->par_msg->msg_pipe[1], + &j->step_layout->tids[id][i], sizeof(int)); + safe_write(j->forked_msg->par_msg->msg_pipe[1], + &j->task_state[j->step_layout->tids[id][i]], + sizeof(int)); } } pthread_mutex_unlock(&j->task_mutex); /* update_failed_tasks(j, id); */ + return; +rwfail: + pthread_mutex_unlock(&j->task_mutex); + error("_update_failed_node: " + "write from srun message-handler process failed"); } static void @@ -468,15 +473,20 @@ _update_contacted_node(srun_job_t *j, int id) if (j->host_state[id] == SRUN_HOST_INIT) { j->host_state[id] = SRUN_HOST_CONTACTED; if(message_thread) { - write(j->forked_msg->par_msg->msg_pipe[1], - &pipe_enum,sizeof(int)); - write(j->forked_msg->par_msg->msg_pipe[1], - &id,sizeof(int)); - write(j->forked_msg->par_msg->msg_pipe[1], - &j->host_state[id],sizeof(int)); + safe_write(j->forked_msg->par_msg->msg_pipe[1], + &pipe_enum, sizeof(int)); + safe_write(j->forked_msg->par_msg->msg_pipe[1], + &id, sizeof(int)); + safe_write(j->forked_msg->par_msg->msg_pipe[1], + &j->host_state[id], sizeof(int)); } } pthread_mutex_unlock(&j->task_mutex); + return; +rwfail: + pthread_mutex_unlock(&j->task_mutex); + error("_update_contacted_node: " + "write from srun message-handler process failed"); } diff --git a/src/srun/msg.c b/src/srun/msg.c index c79c6b9da5bc94b67bec839917f576c0721ba8e6..e422addbc060e0c901b72962ba2ff5f42d729a29 100644 --- a/src/srun/msg.c +++ b/src/srun/msg.c @@ -140,8 +140,7 @@ static void _update_mpir_proctable(int fd, srun_job_t *job, return; rwfail: - error("write to srun main process failed"); - return; + error("_update_mpir_proctable: write to srun main process failed"); } static void _handle_update_mpir_proctable(int fd, srun_job_t *job) @@ -205,11 +204,12 @@ static void _handle_update_mpir_proctable(int fd, srun_job_t *job) return; rwfail: - error("read from srun message-handler process failed"); - return; + error("_handle_update_mpir_proctable: " + "read from srun message-handler process failed"); } -static void _update_step_layout(int fd, slurm_step_layout_t *layout, int nodeid) +static void _update_step_layout(int fd, slurm_step_layout_t *layout, + int nodeid) { int msg_type = PIPE_UPDATE_STEP_LAYOUT; int dummy = 0xdeadbeef; @@ -234,8 +234,7 @@ static void _update_step_layout(int fd, slurm_step_layout_t *layout, int nodeid) return; rwfail: - error("write to srun main process failed"); - return; + error("_update_step_layout: write to srun main process failed"); } static void _handle_update_step_layout(int fd, slurm_step_layout_t *layout) @@ -270,8 +269,8 @@ static void _handle_update_step_layout(int fd, slurm_step_layout_t *layout) return; rwfail: - error("read from srun message-handler process failed"); - return; + error("_handle_update_step_layout: " + "read from srun message-handler process failed"); } static void _dump_proctable(srun_job_t *job) @@ -302,15 +301,20 @@ void debugger_launch_failure(srun_job_t *job) if (opt.parallel_debug) { if(message_thread && job) { i = MPIR_DEBUG_ABORTING; - write(job->forked_msg->par_msg->msg_pipe[1], - &pipe_enum,sizeof(int)); - write(job->forked_msg->par_msg->msg_pipe[1], - &i,sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &pipe_enum, sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &i, sizeof(int)); } else if(!job) { error("Hey I don't have a job to write to on the " "failure of the debugger launch."); } } + return; +rwfail: + error("debugger_launch_failure: " + "write from srun message-handler process failed"); + } /* @@ -328,7 +332,7 @@ static void _timeout_handler(time_t timeout) if (timeout != last_timeout) { last_timeout = timeout; verbose("job time limit to be reached at %s", - ctime(&timeout)); + ctime(&timeout)); } } @@ -374,12 +378,12 @@ _process_launch_resp(srun_job_t *job, launch_tasks_response_msg_t *msg) pthread_mutex_unlock(&job->task_mutex); if(message_thread) { - write(job->forked_msg->par_msg->msg_pipe[1], - &pipe_enum, sizeof(int)); - write(job->forked_msg->par_msg->msg_pipe[1], - &msg->srun_node_id, sizeof(int)); - write(job->forked_msg->par_msg->msg_pipe[1], - &job->host_state[msg->srun_node_id], sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &pipe_enum, sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &msg->srun_node_id, sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &job->host_state[msg->srun_node_id], sizeof(int)); } _update_mpir_proctable(job->forked_msg->par_msg->msg_pipe[1], job, @@ -387,6 +391,10 @@ _process_launch_resp(srun_job_t *job, launch_tasks_response_msg_t *msg) msg->local_pids, remote_argv[0]); _print_pid_list( msg->node_name, msg->count_of_pids, msg->local_pids, remote_argv[0] ); + return; +rwfail: + error("_process_launch_resp: " + "write from srun message-handler process failed"); } @@ -396,21 +404,27 @@ update_tasks_state(srun_job_t *job, uint32_t nodeid) int i; pipe_enum_t pipe_enum = PIPE_TASK_STATE; debug2("updating %d running tasks for node %d", - job->step_layout->tasks[nodeid], nodeid); + job->step_layout->tasks[nodeid], nodeid); slurm_mutex_lock(&job->task_mutex); for (i = 0; i < job->step_layout->tasks[nodeid]; i++) { uint32_t tid = job->step_layout->tids[nodeid][i]; if(message_thread) { - write(job->forked_msg->par_msg->msg_pipe[1], - &pipe_enum,sizeof(int)); - write(job->forked_msg->par_msg->msg_pipe[1], - &tid,sizeof(int)); - write(job->forked_msg->par_msg->msg_pipe[1], - &job->task_state[tid],sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &pipe_enum,sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &tid,sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &job->task_state[tid],sizeof(int)); } } slurm_mutex_unlock(&job->task_mutex); + return; +rwfail: + slurm_mutex_unlock(&job->task_mutex); + error("update_tasks_state: " + "write from srun message-handler process failed"); + } static void @@ -419,22 +433,28 @@ update_running_tasks(srun_job_t *job, uint32_t nodeid) int i; pipe_enum_t pipe_enum = PIPE_TASK_STATE; debug2("updating %d running tasks for node %d", - job->step_layout->tasks[nodeid], nodeid); + job->step_layout->tasks[nodeid], nodeid); slurm_mutex_lock(&job->task_mutex); for (i = 0; i < job->step_layout->tasks[nodeid]; i++) { uint32_t tid = job->step_layout->tids[nodeid][i]; job->task_state[tid] = SRUN_TASK_RUNNING; if(message_thread) { - write(job->forked_msg-> - par_msg->msg_pipe[1],&pipe_enum,sizeof(int)); - write(job->forked_msg-> - par_msg->msg_pipe[1],&tid,sizeof(int)); - write(job->forked_msg->par_msg->msg_pipe[1], - &job->task_state[tid],sizeof(int)); + safe_write(job->forked_msg-> + par_msg->msg_pipe[1], + &pipe_enum,sizeof(int)); + safe_write(job->forked_msg-> + par_msg->msg_pipe[1],&tid, sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &job->task_state[tid], sizeof(int)); } } slurm_mutex_unlock(&job->task_mutex); + return; +rwfail: + slurm_mutex_unlock(&job->task_mutex); + error("update_running_tasks: " + "write from srun message-handler process failed"); } static void @@ -449,12 +469,12 @@ update_failed_tasks(srun_job_t *job, uint32_t nodeid) job->task_state[tid] = SRUN_TASK_FAILED; if(message_thread) { - write(job->forked_msg->par_msg->msg_pipe[1], - &pipe_enum,sizeof(int)); - write(job->forked_msg-> - par_msg->msg_pipe[1],&tid,sizeof(int)); - write(job->forked_msg->par_msg->msg_pipe[1], - &job->task_state[tid],sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &pipe_enum, sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &tid, sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &job->task_state[tid], sizeof(int)); } tasks_exited++; } @@ -464,6 +484,11 @@ update_failed_tasks(srun_job_t *job, uint32_t nodeid) debug2("all tasks exited"); update_job_state(job, SRUN_JOB_TERMINATED); } +rwfail: + slurm_mutex_unlock(&job->task_mutex); + error("update_failed_tasks: " + "write from srun message-handler process failed"); + } static void @@ -479,28 +504,29 @@ _launch_handler(srun_job_t *job, slurm_msg_t *resp) if (msg->return_code != 0) { error("%s: launch failed: %s", - msg->node_name, slurm_strerror(msg->return_code)); + msg->node_name, slurm_strerror(msg->return_code)); slurm_mutex_lock(&job->task_mutex); job->host_state[msg->srun_node_id] = SRUN_HOST_REPLIED; slurm_mutex_unlock(&job->task_mutex); if(message_thread) { - write(job->forked_msg->par_msg->msg_pipe[1], - &pipe_enum,sizeof(int)); - write(job->forked_msg->par_msg->msg_pipe[1], - &msg->srun_node_id,sizeof(int)); - write(job->forked_msg->par_msg->msg_pipe[1], - &job->host_state[msg->srun_node_id],sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &pipe_enum, sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &msg->srun_node_id, sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &job->host_state[msg->srun_node_id], + sizeof(int)); } update_failed_tasks(job, msg->srun_node_id); /* - if (!opt.no_kill) { - job->rc = 124; - update_job_state(job, SRUN_JOB_WAITING_ON_IO); - } else - update_failed_tasks(job, msg->srun_node_id); + if (!opt.no_kill) { + job->rc = 124; + update_job_state(job, SRUN_JOB_WAITING_ON_IO); + } else + update_failed_tasks(job, msg->srun_node_id); */ debugger_launch_failure(job); return; @@ -508,6 +534,11 @@ _launch_handler(srun_job_t *job, slurm_msg_t *resp) _process_launch_resp(job, msg); update_running_tasks(job, msg->srun_node_id); } + return; +rwfail: + error("_launch_handler: " + "write from srun message-handler process failed"); + } /* _confirm_launch_complete @@ -555,12 +586,12 @@ _reattach_handler(srun_job_t *job, slurm_msg_t *msg) if(message_thread) { pipe_enum_t pipe_enum = PIPE_HOST_STATE; - write(job->forked_msg->par_msg->msg_pipe[1], - &pipe_enum, sizeof(int)); - write(job->forked_msg->par_msg->msg_pipe[1], - &resp->srun_node_id, sizeof(int)); - write(job->forked_msg->par_msg->msg_pipe[1], - &job->host_state[resp->srun_node_id], sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &pipe_enum, sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &resp->srun_node_id, sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &job->host_state[resp->srun_node_id], sizeof(int)); } if (resp->return_code != 0) { @@ -589,8 +620,9 @@ _reattach_handler(srun_job_t *job, slurm_msg_t *msg) for (i = 0; i < resp->ntasks; i++) { job->step_layout->tids[resp->srun_node_id][i] = resp->gtids[i]; - job->step_layout->hostids[resp->gtids[i]] = resp->srun_node_id; - info ("setting task%d on hostid %d\n", resp->gtids[i], resp->srun_node_id); + job->step_layout->hostids[resp->gtids[i]] = resp->srun_node_id; + info ("setting task%d on hostid %d\n", + resp->gtids[i], resp->srun_node_id); } _update_step_layout(job->forked_msg->par_msg->msg_pipe[1], job->step_layout, resp->srun_node_id); @@ -612,7 +644,10 @@ _reattach_handler(srun_job_t *job, slurm_msg_t *msg) remote_argv[0]); update_running_tasks(job, resp->srun_node_id); - + return; +rwfail: + error("_reattach_handler: " + "write from srun message-handler process failed"); } @@ -679,13 +714,17 @@ _update_task_exitcode(srun_job_t *job, int taskid) pipe_enum_t pipe_enum = PIPE_TASK_EXITCODE; if(message_thread) { - write(job->forked_msg->par_msg->msg_pipe[1], - &pipe_enum, sizeof(int)); - write(job->forked_msg->par_msg->msg_pipe[1], - &taskid, sizeof(int)); - write(job->forked_msg->par_msg->msg_pipe[1], - &job->tstatus[taskid], sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &pipe_enum, sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &taskid, sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &job->tstatus[taskid], sizeof(int)); } + return; +rwfail: + error("_update_task_exitcode: " + "write from srun message-handler process failed"); } static void @@ -729,7 +768,7 @@ _exit_handler(srun_job_t *job, slurm_msg_t *exit_msg) tasks_exited++; if ((tasks_exited == opt.nprocs) - || (slurm_mpi_single_task_per_node () + || (slurm_mpi_single_task_per_node () && (tasks_exited == job->nhosts))) { debug2("All tasks exited"); eio_signal_shutdown(job->eio); @@ -782,56 +821,56 @@ _handle_msg(srun_job_t *job, slurm_msg_t *msg) switch (msg->msg_type) { - case RESPONSE_LAUNCH_TASKS: - _launch_handler(job, msg); - slurm_free_launch_tasks_response_msg(msg->data); - break; - case MESSAGE_TASK_EXIT: - _exit_handler(job, msg); - slurm_free_task_exit_msg(msg->data); - break; - case RESPONSE_REATTACH_TASKS: - debug2("recvd reattach response"); - _reattach_handler(job, msg); - slurm_free_reattach_tasks_response_msg(msg->data); - break; - case SRUN_PING: - debug3("slurmctld ping received"); - slurm_send_rc_msg(msg, SLURM_SUCCESS); - slurm_free_srun_ping_msg(msg->data); - break; - case SRUN_TIMEOUT: - to = msg->data; - _timeout_handler(to->timeout); - slurm_send_rc_msg(msg, SLURM_SUCCESS); - slurm_free_srun_timeout_msg(msg->data); - break; - case SRUN_NODE_FAIL: - nf = msg->data; - _node_fail_handler(nf->nodelist, job); - slurm_send_rc_msg(msg, SLURM_SUCCESS); - slurm_free_srun_node_fail_msg(msg->data); - break; - case RESPONSE_RESOURCE_ALLOCATION: - debug3("resource allocation response received"); - slurm_send_rc_msg(msg, SLURM_SUCCESS); - slurm_free_resource_allocation_response_msg(msg->data); - break; - case PMI_KVS_PUT_REQ: - debug3("PMI_KVS_PUT_REQ received"); - rc = pmi_kvs_put((struct kvs_comm_set *) msg->data); - slurm_send_rc_msg(msg, rc); - break; - case PMI_KVS_GET_REQ: - debug3("PMI_KVS_GET_REQ received"); - rc = pmi_kvs_get((kvs_get_msg_t *) msg->data); - slurm_send_rc_msg(msg, rc); - slurm_free_get_kvs_msg((kvs_get_msg_t *) msg->data); - break; - default: - error("received spurious message type: %d\n", - msg->msg_type); - break; + case RESPONSE_LAUNCH_TASKS: + _launch_handler(job, msg); + slurm_free_launch_tasks_response_msg(msg->data); + break; + case MESSAGE_TASK_EXIT: + _exit_handler(job, msg); + slurm_free_task_exit_msg(msg->data); + break; + case RESPONSE_REATTACH_TASKS: + debug2("recvd reattach response"); + _reattach_handler(job, msg); + slurm_free_reattach_tasks_response_msg(msg->data); + break; + case SRUN_PING: + debug3("slurmctld ping received"); + slurm_send_rc_msg(msg, SLURM_SUCCESS); + slurm_free_srun_ping_msg(msg->data); + break; + case SRUN_TIMEOUT: + to = msg->data; + _timeout_handler(to->timeout); + slurm_send_rc_msg(msg, SLURM_SUCCESS); + slurm_free_srun_timeout_msg(msg->data); + break; + case SRUN_NODE_FAIL: + nf = msg->data; + _node_fail_handler(nf->nodelist, job); + slurm_send_rc_msg(msg, SLURM_SUCCESS); + slurm_free_srun_node_fail_msg(msg->data); + break; + case RESPONSE_RESOURCE_ALLOCATION: + debug3("resource allocation response received"); + slurm_send_rc_msg(msg, SLURM_SUCCESS); + slurm_free_resource_allocation_response_msg(msg->data); + break; + case PMI_KVS_PUT_REQ: + debug3("PMI_KVS_PUT_REQ received"); + rc = pmi_kvs_put((struct kvs_comm_set *) msg->data); + slurm_send_rc_msg(msg, rc); + break; + case PMI_KVS_GET_REQ: + debug3("PMI_KVS_GET_REQ received"); + rc = pmi_kvs_get((kvs_get_msg_t *) msg->data); + slurm_send_rc_msg(msg, rc); + slurm_free_get_kvs_msg((kvs_get_msg_t *) msg->data); + break; + default: + error("received spurious message type: %d\n", + msg->msg_type); + break; } return; } @@ -875,7 +914,7 @@ _accept_msg_connection(srun_job_t *job, int fdnum) * to 5 seconds for no_alloc option only */ if (opt.no_alloc) timeout = 5; - again: +again: ret_list = slurm_receive_msg(fd, msg, timeout); if(!ret_list || errno != SLURM_SUCCESS) { if (errno == EINTR) { @@ -935,7 +974,7 @@ _do_poll(srun_job_t *job, struct pollfd *fds, int timeout) case EINVAL: case EFAULT: fatal("poll: %m"); default: error("poll: %m. Continuing..."); - continue; + continue; } } @@ -960,8 +999,8 @@ _get_next_timeout(srun_job_t *job) timeout = job->ltimeout - time(NULL); else timeout = job->ltimeout < job->etimeout ? - job->ltimeout - time(NULL) : - job->etimeout - time(NULL); + job->ltimeout - time(NULL) : + job->etimeout - time(NULL); return timeout; } @@ -1064,7 +1103,8 @@ par_thr(void *arg) //slurm_uid = (uid_t) slurm_get_slurm_user_id(); close(msg_par->msg_pipe[0]); // close read end of pipe close(par_msg->msg_pipe[1]); // close write end of pipe - while(read(par_msg->msg_pipe[0], &c, sizeof(int)) == sizeof(int)) { + while(read(par_msg->msg_pipe[0], &c, sizeof(int)) + == sizeof(int)) { // getting info from msg thread if(type == PIPE_NONE) { debug2("got type %d\n",c); @@ -1216,7 +1256,7 @@ msg_thr_create(srun_job_t *job) slurm_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); while ((errno = pthread_create(&job->jtid, &attr, &msg_thr, - (void *)job))) { + (void *)job))) { if (++retries > MAX_RETRIES) fatal("Can't create pthread"); sleep(1); @@ -1231,8 +1271,7 @@ msg_thr_create(srun_job_t *job) * close. */ while(read(job->forked_msg->msg_par->msg_pipe[0], - &c, sizeof(int)) - > 0) + &c, sizeof(int)) > 0) ; /* do nothing */ close(job->forked_msg->msg_par->msg_pipe[0]); @@ -1251,7 +1290,7 @@ msg_thr_create(srun_job_t *job) slurm_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); while ((errno = pthread_create(&job->jtid, &attr, &par_thr, - (void *)job))) { + (void *)job))) { if (++retries > MAX_RETRIES) fatal("Can't create pthread"); sleep(1); /* sleep and try again */ @@ -1307,13 +1346,13 @@ extern slurm_fd slurmctld_msg_init(void) fatal("slurm_get_stream_addr error %m"); fd_set_nonblocking(slurmctld_fd); /* hostname is not set, so slurm_get_addr fails - slurm_get_addr(&slurm_address, &port, hostname, sizeof(hostname)); */ + slurm_get_addr(&slurm_address, &port, hostname, sizeof(hostname)); */ port = ntohs(slurm_address.sin_port); slurmctld_comm_addr.hostname = xstrdup(opt.ctrl_comm_ifhn); slurmctld_comm_addr.port = port; debug2("slurmctld messages to host=%s,port=%u", - slurmctld_comm_addr.hostname, - slurmctld_comm_addr.port); + slurmctld_comm_addr.hostname, + slurmctld_comm_addr.port); return slurmctld_fd; } diff --git a/src/srun/srun_job.c b/src/srun/srun_job.c index 3ae218f8c09d0960eaec66deb0cbf55c190c2fbf..e90f270675f5b2fc41ffceef0a817aabb6869024 100644 --- a/src/srun/srun_job.c +++ b/src/srun/srun_job.c @@ -304,15 +304,21 @@ update_job_state(srun_job_t *job, srun_job_state_t state) if (job->state < state) { job->state = state; if(message_thread) { - write(job->forked_msg-> - par_msg->msg_pipe[1],&pipe_enum,sizeof(int)); - write(job->forked_msg-> - par_msg->msg_pipe[1],&job->state,sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &pipe_enum, sizeof(int)); + safe_write(job->forked_msg->par_msg->msg_pipe[1], + &job->state, sizeof(int)); } pthread_cond_signal(&job->state_cond); } pthread_mutex_unlock(&job->state_mutex); + return; +rwfail: + pthread_mutex_unlock(&job->state_mutex); + error("update_job_state: " + "write from srun message-handler process failed"); + } srun_job_state_t diff --git a/testsuite/expect/globals.example b/testsuite/expect/globals.example index 2535af3d4313550296a56b144a99be6fc7583c8a..651cacea652ad511e1d6afe44b1676db50bd5e28 100755 --- a/testsuite/expect/globals.example +++ b/testsuite/expect/globals.example @@ -116,6 +116,7 @@ set alpha_numeric_under "\[a-zA-Z0-9_\]+" set alpha_under "\[A-Z_\]+" set end_of_line "\[\r\n\]" set number "\[0-9\]+" +set float "\[0-9\]+\\.?\[0-9\]+" # # Cache SlurmUser to check for SuperUser requests diff --git a/testsuite/expect/test12.2 b/testsuite/expect/test12.2 index 358d4a725d745901177960ea44b0f9bfaac7e53c..f949dcb81bbc4732107cd90d16c2cb3d96c05bf2 100755 --- a/testsuite/expect/test12.2 +++ b/testsuite/expect/test12.2 @@ -39,7 +39,7 @@ set matches 0 # job paramters set mem_size 100000 -set sleep_time 25 +set sleep_time 5 set ret_code 42 print_header $test_id @@ -51,7 +51,11 @@ set supported 0 log_user 0 spawn $scontrol show config expect { - -re "jobacct/log" { + -re "jobacct/linux" { + set supported 1 + exp_continue + } + -re "jobacct/aix" { set supported 1 exp_continue } @@ -103,7 +107,7 @@ exec $bin_sleep 5 # Report basic sacct info # -spawn $sacct --noheader --jobstep=$job_id.0 --fields jobstep,jobname,status,error +spawn $sacct --noheader --job=$job_id.0 --fields jobid,jobname,status,exitcode expect { -re "$job_id\.0" { incr matches @@ -138,17 +142,13 @@ if {$matches != 4} { # Report the sacct accouting info # set elapsed_time 0 -spawn $sacct --noheader --jobstep=$job_id.0 --fields elapsed +spawn $sacct --noheader --job=$job_id.0 --fields elapsed expect { - -re "($number):($number)\.($number)" { + -re "($number):($number)" { set mins $expect_out(1,string) set secs $expect_out(2,string) set elapsed_time [expr $mins * 60 + $secs] } - -re "($number)\.($number)" { - set elapsed_time $expect_out(1,string) - exp_continue - } timeout { send_user "\nFAILURE: sacct not responding\n" set exit_code 1 @@ -167,11 +167,15 @@ if {$error_time > 5} { send_user "\nSUCCESS: sacct elapsed time error of $error_time\n" } -set mem_used 0 -spawn $sacct --noheader --jobstep=$job_id.0 --fields vsize +set mem_used -1 +set mem_task -1 +set ave_used -1 +spawn $sacct --noheader --job=$job_id.0 --fields vsize expect { - -re "($number)" { + -re "($float).*/($number) - ($float)" { set mem_used $expect_out(1,string) + set mem_task $expect_out(2,string) + set ave_used $expect_out(3,string) exp_continue } timeout { @@ -183,6 +187,19 @@ expect { } } +if { $mem_used == -1 } { + send_user "\nFAILURE: sacct memory not found\n" + set exit_code 1 +} elseif { $mem_task != 0 } { + send_user "\nFAILURE: sacct memory task not found\n" + set exit_code 1 +} elseif { $ave_used != $mem_used } { + send_user "\nFAILURE: sacct memory task not equal to ave memory\n" + set exit_code 1 +} + +set mem_used [expr $mem_used * 1024] + set diff_mem [expr $mem_used - $mem_size] set error_mem [expr abs($diff_mem)] if {$error_mem > 4000} { diff --git a/testsuite/expect/test14.7 b/testsuite/expect/test14.7 index 46983d4df2ea0a9d092ad82b98b159e55ec0061d..ca30f15c90eec4e8203f9aec86367e90f0ca73c2 100755 --- a/testsuite/expect/test14.7 +++ b/testsuite/expect/test14.7 @@ -118,7 +118,12 @@ if {[wait_for_file $file_err] == 0} { incr matches exp_continue } - -re "No such file" | "does not exist" { + -re "No such file" { + send_user "These errors are expected, no worries\n" + incr matches + exp_continue + } + -re "does not exist" { send_user "These errors are expected, no worries\n" incr matches exp_continue