diff --git a/NEWS b/NEWS index 317fffdde695d083fd265d525aafd906d346705c..29656be96955690672c7dabb5b87ba741e0c463e 100644 --- a/NEWS +++ b/NEWS @@ -110,6 +110,10 @@ documents those changes that are of interest to users and admins. -- Initialize job->mail_type to 0 (NONE) for job submission. -- Fix for stalled task stdout/stderr when buffered I/O is used, and a single line exceeds 4096 bytes. + -- Fix for spinning srun when the terminal to which srun is talking + goes away. + -- Don't set avail_node_bitmap for DRAINED nodes on slurmctld reconfig + (can schedule a job on drained node after reconfig). * Changes in SLURM 1.0.11 ========================= diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c index e5a62b31697e0155b415eb788db5897ead4d83d3..6adfe083831867b97328d32e1f163fbd8cb12a51 100644 --- a/src/slurmctld/read_config.c +++ b/src/slurmctld/read_config.c @@ -165,12 +165,14 @@ static int _build_bitmaps(void) /* scan all nodes and identify which are up, idle and * their configuration, resync DRAINED vs. 
DRAINING state */ for (i = 0; i < node_record_count; i++) { - uint16_t base_state, no_resp_flag, job_cnt; + uint16_t base_state, drain_flag, no_resp_flag, job_cnt; if (node_record_table_ptr[i].name[0] == '\0') continue; /* defunct */ base_state = node_record_table_ptr[i].node_state & - NODE_STATE_BASE; + NODE_STATE_BASE; + drain_flag = node_record_table_ptr[i].node_state & + NODE_STATE_DRAIN; no_resp_flag = node_record_table_ptr[i].node_state & NODE_STATE_NO_RESPOND; job_cnt = node_record_table_ptr[i].run_job_cnt + @@ -179,9 +181,10 @@ static int _build_bitmaps(void) if (((base_state == NODE_STATE_IDLE) && (job_cnt == 0)) || (base_state == NODE_STATE_DOWN)) bit_set(idle_node_bitmap, i); - if (( (base_state == NODE_STATE_IDLE) - || (base_state == NODE_STATE_ALLOCATED) ) - && (no_resp_flag == 0)) + if (((base_state == NODE_STATE_IDLE) + || (base_state == NODE_STATE_ALLOCATED)) + && (drain_flag == 0) + && (no_resp_flag == 0)) bit_set(avail_node_bitmap, i); if (node_record_table_ptr[i].config_ptr) bit_set(node_record_table_ptr[i].config_ptr-> diff --git a/src/slurmd/slurmstepd/io.c b/src/slurmd/slurmstepd/io.c index a0a963737f2f941e8bf8802e7f563da484a69760..f0db845176b01e8cf0a0a1bc62adfd266535b313 100644 --- a/src/slurmd/slurmstepd/io.c +++ b/src/slurmd/slurmstepd/io.c @@ -505,8 +505,15 @@ again: if ((n = write(obj->fd, buf, in->remaining)) < 0) { if (errno == EINTR) goto again; - /* FIXME handle error */ - return SLURM_ERROR; + else if (errno == EAGAIN || errno == EWOULDBLOCK) + return SLURM_SUCCESS; + else { + close(obj->fd); + obj->fd = -1; + _free_incoming_msg(in->msg, in->job); + in->msg = NULL; + return SLURM_ERROR; + } } in->remaining -= n; if (in->remaining > 0) diff --git a/src/srun/io.c b/src/srun/io.c index 5bf4e663c5f030b9a5188fda5c6aaa98698c6ef3..76cdebd773c536c6018c9555baaf5790bdbb6cf3 100644 --- a/src/srun/io.c +++ b/src/srun/io.c @@ -293,10 +293,11 @@ _server_read(eio_obj_t *obj, List objs) if ((n = read(obj->fd, buf, s->in_remaining)) < 0) { if (errno 
== EINTR) goto again; - /* FIXME handle error */ - return SLURM_ERROR; + if (errno == EAGAIN || errno == EWOULDBLOCK) + return SLURM_SUCCESS; + debug3("_server_read error: %m"); } - if (n == 0) { /* got eof */ + if (n <= 0) { /* got eof or unhandled error */ debug3( "got eof on _server_read body"); s->in_eof = true; list_enqueue(s->job->free_outgoing, s->in_msg); @@ -327,7 +328,11 @@ _server_read(eio_obj_t *obj, List objs) else obj = s->job->stderr_obj; info = (struct file_write_info *) obj->arg; - list_enqueue(info->msg_queue, s->in_msg); + if (info->eof) + /* this output is closed, discard message */ + list_enqueue(s->job->free_outgoing, s->in_msg); + else + list_enqueue(info->msg_queue, s->in_msg); s->in_msg = NULL; } @@ -445,19 +450,34 @@ create_file_write_eio_obj(int fd, srun_job_t *job) return eio; } -static void _write_label(int fd, int taskid) +static int _write_label(int fd, int taskid) { + int n; + int left = fmt_width + 2; char buf[16]; + void *ptr = buf; snprintf(buf, 16, "%0*d: ", fmt_width, taskid); - /* FIXME - Need to handle return code */ - safe_write(fd, buf, fmt_width+2); - return; -rwfail: - error("_write_label: write from io process failed"); + while (left > 0) { + again: + if ((n = write(fd, ptr, left)) < 0) { /* request only the bytes still unwritten; a fixed fmt_width+2 would overrun the label after a short write */ + if (errno == EINTR) + goto again; + if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) { + debug3(" got EAGAIN in _write_label"); + goto again; + } + error("In _write_label: %m"); + return SLURM_ERROR; + } + left -= n; + ptr += n; + } + + return SLURM_SUCCESS; } -static void _write_newline(int fd) +static int _write_newline(int fd) { int n; @@ -469,9 +489,10 @@ again: || errno == EWOULDBLOCK) { goto again; } - /* FIXME handle error */ + error("In _write_newline: %m"); + return SLURM_ERROR; } - return; + return SLURM_SUCCESS; } /* @@ -482,21 +503,22 @@ static int _write_line(int fd, void *buf, int len) { int n; int left = len; + void *ptr = buf; debug2("Called _write_line"); while (left > 0) { again: - if ((n = write(fd, buf, 
left)) < 0) { + if ((n = write(fd, ptr, left)) < 0) { if (errno == EINTR) goto again; if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) { debug3(" got EAGAIN in _write_line"); goto again; } - /* FIXME handle error */ return -1; } left -= n; + ptr += n; } return len; @@ -528,22 +550,30 @@ static int _write_msg(int fd, void *buf, int len, int taskid) start = buf + written; end = memchr(start, '\n', remaining); if (opt.labelio) - _write_label(fd, taskid); + if (_write_label(fd, taskid) != SLURM_SUCCESS) + goto done; if (end == NULL) { /* no newline found */ rc = _write_line(fd, start, remaining); + if (rc <= 0) { + goto done; + } else { + remaining -= rc; + written += rc; + } if (opt.labelio) - _write_newline(fd); + if (_write_newline(fd) != SLURM_SUCCESS) + goto done; } else { line_len = (int)(end - start) + 1; rc = _write_line(fd, start, line_len); + if (rc <= 0) { + goto done; + } else { + remaining -= rc; + written += rc; + } } - if (rc <= 0) { - goto done; - } else { - remaining -= rc; - written += rc; - } } done: if (written > 0) @@ -595,6 +625,8 @@ static int _file_write(eio_obj_t *obj, List objs) if ((n = _write_msg(obj->fd, ptr, info->out_remaining, info->out_msg->header.gtaskid)) < 0) { + list_enqueue(info->job->free_outgoing, info->out_msg); + info->eof = true; return SLURM_ERROR; } debug3(" wrote %d bytes", n); @@ -819,8 +851,6 @@ io_thr_create(srun_job_t *job) eio_new_initial_obj(job->eio, obj); } - /* FIXME - Need to open files here (or perhaps earlier) */ - xsignal(SIGTTIN, SIG_IGN); slurm_attr_init(&attr);