diff --git a/src/common/eio.c b/src/common/eio.c index 9c8548b8961523fa50bd64e13d419fbd203169f6..4f989f22eafa2d0a87328f02c6b0432e60205a9f 100644 --- a/src/common/eio.c +++ b/src/common/eio.c @@ -45,6 +45,7 @@ struct eio_handle_components { int magic; #endif int fds[2]; + List obj_list; }; @@ -53,13 +54,13 @@ struct eio_handle_components { static int _poll_loop_internal(eio_t eio, List objs); static int _poll_internal(struct pollfd *pfds, unsigned int nfds); -static unsigned int _poll_setup_pollfds(struct pollfd *, io_obj_t **, List); -static void _poll_dispatch(struct pollfd *, unsigned int, io_obj_t **, +static unsigned int _poll_setup_pollfds(struct pollfd *, eio_obj_t **, List); +static void _poll_dispatch(struct pollfd *, unsigned int, eio_obj_t **, List objList); -static void _poll_handle_event(short revents, io_obj_t *obj, +static void _poll_handle_event(short revents, eio_obj_t *obj, List objList); -eio_t eio_handle_create(void) +eio_t eio_handle_create(List eio_obj_list) { eio_t eio = xmalloc(sizeof(*eio)); @@ -73,6 +74,8 @@ eio_t eio_handle_create(void) xassert(eio->magic = EIO_MAGIC); + eio->obj_list = eio_obj_list; + return eio; } @@ -82,36 +85,60 @@ void eio_handle_destroy(eio_t eio) xassert(eio->magic == EIO_MAGIC); close(eio->fds[0]); close(eio->fds[1]); + /* FIXME - Destroy the eio object list here ? 
*/ xassert(eio->magic = ~EIO_MAGIC); xfree(eio); } -int eio_handle_signal(eio_t eio) +int eio_signal_shutdown(eio_t eio) +{ + char c = 1; + if (write(eio->fds[1], &c, sizeof(char)) != 1) + return error("eio_handle_signal_shutdown: write; %m"); + return 0; +} + +int eio_signal_wakeup(eio_t eio) { char c = 0; if (write(eio->fds[1], &c, sizeof(char)) != 1) - return error("eio_signal: write; %m"); + return error("eio_handle_signal_wake: write; %m"); return 0; } +static void _mark_shutdown_true(List obj_list) +{ + ListIterator objs; + eio_obj_t *obj; + + objs = list_iterator_create(obj_list); + while (obj = list_next(objs)) { + obj->shutdown = true; + } + list_iterator_destroy(objs); +} + static int _eio_clear(eio_t eio) { - char buf[1024]; + char c = 0; int rc = 0; - while ((rc = (read(eio->fds[0], buf, 1024)) > 0)) {;} + while ((rc = (read(eio->fds[0], &c, 1)) > 0)) { + if (c == 1) + _mark_shutdown_true(eio->obj_list); + } if (rc < 0) return error("eio_clear: read: %m"); return 0; } -int io_handle_events(eio_t eio, List objs) +int eio_handle_mainloop(eio_t eio) { xassert (eio != NULL); xassert (eio->magic == EIO_MAGIC); - return _poll_loop_internal(eio, objs); + return _poll_loop_internal(eio, eio->obj_list); } static int @@ -119,7 +146,7 @@ _poll_loop_internal(eio_t eio, List objs) { int retval = 0; struct pollfd *pollfds = NULL; - io_obj_t **map = NULL; + eio_obj_t **map = NULL; unsigned int maxnfds = 0, nfds = 0; unsigned int n = 0; @@ -129,7 +156,7 @@ _poll_loop_internal(eio_t eio, List objs) if (maxnfds < (n = list_count(objs))) { maxnfds = n; xrealloc(pollfds, (maxnfds+1) * sizeof(struct pollfd)); - xrealloc(map, maxnfds * sizeof(io_obj_t * )); + xrealloc(map, maxnfds * sizeof(eio_obj_t * )); /* * Note: xrealloc() also handles initial malloc */ @@ -188,22 +215,22 @@ _poll_internal(struct pollfd *pfds, unsigned int nfds) } static bool -_is_writable(io_obj_t *obj) +_is_writable(eio_obj_t *obj) { return (obj->ops->writable && (*obj->ops->writable)(obj)); } static 
bool -_is_readable(io_obj_t *obj) +_is_readable(eio_obj_t *obj) { return (obj->ops->readable && (*obj->ops->readable)(obj)); } static unsigned int -_poll_setup_pollfds(struct pollfd *pfds, io_obj_t *map[], List l) +_poll_setup_pollfds(struct pollfd *pfds, eio_obj_t *map[], List l) { ListIterator i = list_iterator_create(l); - io_obj_t *obj = NULL; + eio_obj_t *obj = NULL; unsigned int nfds = 0; bool readable, writable; @@ -232,12 +259,12 @@ _poll_setup_pollfds(struct pollfd *pfds, io_obj_t *map[], List l) } static void -_poll_dispatch(struct pollfd *pfds, unsigned int nfds, io_obj_t *map[], +_poll_dispatch(struct pollfd *pfds, unsigned int nfds, eio_obj_t *map[], List objList) { int i; ListIterator iter; - io_obj_t *obj; + eio_obj_t *obj; for (i = 0; i < nfds; i++) { if (pfds[i].revents > 0) @@ -253,18 +280,18 @@ _poll_dispatch(struct pollfd *pfds, unsigned int nfds, io_obj_t *map[], } static void -_poll_handle_event(short revents, io_obj_t *obj, List objList) +_poll_handle_event(short revents, eio_obj_t *obj, List objList) { if ((revents & POLLERR ) && obj->ops->handle_error) { if ((*obj->ops->handle_error) (obj, objList) < 0) return; } - if (((revents & POLLIN) || (revents & POLLHUP)) + if ((revents & POLLHUP) && obj->ops->handle_close) { + (*obj->ops->handle_close) (obj, objList); + } else if (((revents & POLLIN) || (revents & POLLHUP)) && obj->ops->handle_read ) { (*obj->ops->handle_read ) (obj, objList); - } else if ((revents & POLLHUP) && obj->ops->handle_close) { - (*obj->ops->handle_close) (obj, objList); } if ((revents & POLLOUT) && obj->ops->handle_write) { @@ -272,4 +299,33 @@ _poll_handle_event(short revents, io_obj_t *obj, List objList) } } +static struct io_operations * +_ops_copy(struct io_operations *ops) +{ + struct io_operations *ret = xmalloc(sizeof(*ops)); + + /* Copy initial client_ops */ + *ret = *ops; + return ret; +} + +eio_obj_t * +eio_obj_create(int fd, struct io_operations *ops, void *arg) +{ + eio_obj_t *obj = xmalloc(sizeof(*obj)); + 
obj->fd = fd; + obj->arg = arg; + obj->ops = _ops_copy(ops); + obj->shutdown = false; + return obj; +} +void eio_obj_destroy(eio_obj_t *obj) +{ + if (obj) { + if (obj->ops) { + xfree(obj->ops); + } + xfree(obj); + } +} diff --git a/src/common/eio.h b/src/common/eio.h index 60862faae65cfea5f0939bc6a88f21c7273bbb2d..5fcffdbaf269659e8b369b5de1b68202c9238d28 100644 --- a/src/common/eio.h +++ b/src/common/eio.h @@ -30,7 +30,7 @@ #include "src/common/list.h" #include "src/common/macros.h" -typedef struct io_obj io_obj_t; +typedef struct eio_obj eio_obj_t; typedef struct eio_handle_components * eio_t; @@ -41,33 +41,38 @@ typedef struct eio_handle_components * eio_t; * */ struct io_operations { - bool (*readable )(io_obj_t *); - bool (*writable )(io_obj_t *); - int (*handle_read )(io_obj_t *, List); - int (*handle_write)(io_obj_t *, List); - int (*handle_error)(io_obj_t *, List); - int (*handle_close)(io_obj_t *, List); + bool (*readable )(eio_obj_t *); + bool (*writable )(eio_obj_t *); + int (*handle_read )(eio_obj_t *, List); + int (*handle_write)(eio_obj_t *, List); + int (*handle_error)(eio_obj_t *, List); + int (*handle_close)(eio_obj_t *, List); }; -struct io_obj { +struct eio_obj { int fd; /* fd to operate on */ void *arg; /* application-specific data */ struct io_operations *ops; /* pointer to ops struct for obj */ + bool shutdown; }; -/* passed a list of struct io_obj's, this routine will watch for activtiy - * on the fd's as long as obj->readable() or obj->writable() returns >0 +eio_t eio_handle_create(List eio_obj_list); +void eio_handle_destroy(eio_t eio); + +/* This routine will watch for activtiy on the fd's as long + * as obj->readable() or obj->writable() returns >0 * * routine returns 0 when either list is empty or no objects in list are * readable() or writable(). * * returns -1 on error. 
*/ -int io_handle_events(eio_t eio, List io_obj_list); +int eio_handle_mainloop(eio_t eio); +int eio_signal_wakeup(eio_t eio); +int eio_signal_shutdown(eio_t eio); -eio_t eio_handle_create(void); -void eio_handle_destroy(eio_t eio); -int eio_handle_signal(eio_t eio); +eio_obj_t *eio_obj_create(int fd, struct io_operations *ops, void *arg); +void eio_obj_destroy(eio_obj_t *obj); #endif /* !_EIO_H */ diff --git a/src/common/io_hdr.c b/src/common/io_hdr.c index 40a8c4cd5a60c6135ec593b5646750050dd53efa..425dbc37c947f73fddbea0e98be3c03bf37bda9c 100644 --- a/src/common/io_hdr.c +++ b/src/common/io_hdr.c @@ -32,7 +32,7 @@ #include "src/common/io_hdr.h" #include "src/common/slurm_protocol_defs.h" -#define IO_HDR_VERSION 0xa001 +#define IO_PROTOCOL_VERSION 0xb001 /* static void @@ -49,96 +49,226 @@ _print_data(char *data, int datalen) } */ - -static void +void io_hdr_pack(io_hdr_t *hdr, Buf buffer) { - pack16(hdr->version, buffer); - packmem((char *) hdr->key, (uint16_t) SLURM_IO_KEY_SIZE, buffer); - pack32(hdr->taskid, buffer); - pack16(hdr->type, buffer); + pack16(hdr->type, buffer); + pack16(hdr->gtaskid, buffer); + pack16(hdr->ltaskid, buffer); + pack32(hdr->length, buffer); } -static int +int io_hdr_unpack(io_hdr_t *hdr, Buf buffer) { uint16_t val; - safe_unpack16(&hdr->version, buffer); - - safe_unpackmem((char *) hdr->key, &val, buffer); - - if (val != SLURM_IO_KEY_SIZE) - goto unpack_error; - - safe_unpack32(&hdr->taskid, buffer); safe_unpack16(&hdr->type, buffer); - + safe_unpack16(&hdr->gtaskid, buffer); + safe_unpack16(&hdr->ltaskid, buffer); + safe_unpack32(&hdr->length, buffer); return SLURM_SUCCESS; unpack_error: + error("io_hdr_unpack error: %m"); return SLURM_ERROR; } int io_hdr_packed_size() { - return sizeof(uint32_t) + 3*sizeof(uint16_t) + SLURM_IO_KEY_SIZE; + return sizeof(uint32_t) + 3*sizeof(uint16_t); } -int -io_hdr_write_cb(cbuf_t cb, io_hdr_t *hdr) +/* + * Only return when the all of the bytes have been read, or an unignorable + * error has 
occurred. + */ +static int _full_read(int fd, void *buf, size_t count) { - int retval = SLURM_SUCCESS; + int n; + int left; + void *ptr; - Buf buffer = init_buf(1024); - hdr->version = IO_HDR_VERSION; + left = count; + ptr = buf; + while (left > 0) { + again: + if ((n = read(fd, (void *) ptr, left)) < 0) { + if (errno == EINTR + || errno == EAGAIN + || errno == EWOULDBLOCK) + goto again; + debug3("Leaving _full_read on error!"); + return -1; + } else if (n == 0) { /* got eof */ + debug3(" _full_read (_client_read) got eof"); + return 0; + } + left -= n; + ptr += n; + } - io_hdr_pack(hdr, buffer); + return count; +} - xassert(buffer->processed == io_hdr_packed_size()); +/* + * Read and unpack an io_hdr_t from a file descriptor (socket). + */ +int io_hdr_read_fd(int fd, io_hdr_t *hdr) +{ + Buf buffer; + int n = 0; + int rc; + + debug3("Entering io_hdr_read_fd"); + buffer = init_buf(io_hdr_packed_size()); + n = _full_read(fd, buffer->head, io_hdr_packed_size()); + if (n <= 0) + goto fail; + if (io_hdr_unpack(hdr, buffer) == SLURM_ERROR) { + n = -1; + goto fail; + } - retval = cbuf_write(cb, buffer->head, buffer->processed, NULL); +fail: + debug3("Leaving io_hdr_read_fd"); free_buf(buffer); + return n; +} + + - return retval; -} int -io_hdr_read_cb(cbuf_t cb, io_hdr_t *hdr) +io_init_msg_validate(struct slurm_io_init_msg *msg, const char *sig) { - Buf buffer = init_buf(4096); - int rc = SLURM_SUCCESS; + debug2("Entering io_init_msg_validate"); - cbuf_read(cb, buffer->head, io_hdr_packed_size()); - - rc = io_hdr_unpack(hdr, buffer); + debug3(" msg->version = %x", msg->version); + debug3(" msg->nodeid = %u", msg->nodeid); - free_buf(buffer); - return rc; + if (msg->version != IO_PROTOCOL_VERSION) { + error("Invalid IO init header version"); + return SLURM_ERROR; + } + + if (memcmp((void *)sig, (void *)msg->cred_signature, + SLURM_CRED_SIGLEN)) { + error("Invalid IO init header signature"); + return SLURM_ERROR; + } + + debug2("Leaving io_init_msg_validate"); + return 
SLURM_SUCCESS; } -int -io_hdr_validate(io_hdr_t *hdr, const char *key, int len) + +static int +io_init_msg_packed_size(void) { + int len; + + len = sizeof(uint16_t) /* version */ + + sizeof(uint32_t) /* nodeid */ + + (SLURM_CRED_SIGLEN + sizeof(uint16_t)); /* signature */ + + return len; +} + +static void +io_init_msg_pack(struct slurm_io_init_msg *hdr, Buf buffer) +{ + pack16(hdr->version, buffer); + pack32(hdr->nodeid, buffer); + packmem((char *) hdr->cred_signature, + (uint16_t) SLURM_CRED_SIGLEN, buffer); +} + + +static int +io_init_msg_unpack(struct slurm_io_init_msg *hdr, Buf buffer) +{ + uint16_t val; + + safe_unpack16(&hdr->version, buffer); + safe_unpack32(&hdr->nodeid, buffer); + safe_unpackmem((char *) hdr->cred_signature, &val, buffer); + if (val != SLURM_CRED_SIGLEN) + goto unpack_error; + + return SLURM_SUCCESS; + + unpack_error: + error("unpack error in io_init_msg_unpack"); + return SLURM_ERROR; +} + + +int +io_init_msg_write_to_fd(int fd, struct slurm_io_init_msg *msg) +{ + Buf buf; + void *ptr; + int n; - if (hdr->version != IO_HDR_VERSION) { - error("Invalid IO header version"); + xassert(msg); + + debug2("Entering io_init_msg_write_to_fd"); + msg->version = IO_PROTOCOL_VERSION; + buf = init_buf(io_init_msg_packed_size()); + debug2(" msg->nodeid = %d", msg->nodeid); + io_init_msg_pack(msg, buf); + + ptr = get_buf_data(buf); +again: + if ((n = write(fd, ptr, io_init_msg_packed_size())) < 0) { + if (errno == EINTR) + goto again; + free_buf(buf); return SLURM_ERROR; } - - if ((hdr->type != SLURM_IO_STDOUT) && (hdr->type != SLURM_IO_STDERR)) { - error("Invalid IO header type: %d", hdr->type); + if (n != io_init_msg_packed_size()) { + error("io init msg write too small"); + free_buf(buf); return SLURM_ERROR; } - len = len < SLURM_IO_KEY_SIZE ? 
len : SLURM_IO_KEY_SIZE; + free_buf(buf); + debug2("Leaving io_init_msg_write_to_fd"); + return SLURM_SUCCESS; +} - if (memcmp((void *) key, (void *) hdr->key, len)) { - error("Invalid IO header signature"); + +int +io_init_msg_read_from_fd(int fd, struct slurm_io_init_msg *msg) +{ + Buf buf; + void *ptr; + int n; + + xassert(msg); + + debug2("Entering io_init_msg_read_from_fd"); + buf = init_buf(io_init_msg_packed_size()); + ptr = get_buf_data(buf); +again: + if ((n = read(fd, ptr, io_init_msg_packed_size())) < 0) { + if (errno == EINTR) + goto again; + free_buf(buf); return SLURM_ERROR; } + if (n != io_init_msg_packed_size()) { + error("io init msg read too small"); + free_buf(buf); + return SLURM_ERROR; + } + debug3(" read %d bytes", n); + io_init_msg_unpack(msg, buf); + free_buf(buf); + + debug2("Leaving io_init_msg_read_from_fd"); return SLURM_SUCCESS; } - diff --git a/src/common/io_hdr.h b/src/common/io_hdr.h index 17fe75fef1d40a599ffec22b2f3604c694200904..604d62b841e6f7c7a7ca24b185a6eff2fc93d257 100644 --- a/src/common/io_hdr.h +++ b/src/common/io_hdr.h @@ -40,54 +40,45 @@ # include <inttypes.h> #endif +#include "src/common/macros.h" /* Containes SLURM_CRED_SIGLEN */ #include "src/common/pack.h" #include "src/common/cbuf.h" +#include "src/common/xmalloc.h" -#define SLURM_IO_KEY_SIZE 8 /* IO key is 64 bits */ +#define MAX_MSG_LEN 1024 -/* - * Slurm IO stream types: - * - * STDOUT = stdout/stdin - * STDERR = stderr/signals - */ -#define SLURM_IO_STDOUT 0x00 -#define SLURM_IO_STDERR 0x01 +#define SLURM_IO_STDIN 0 +#define SLURM_IO_STDOUT 1 +#define SLURM_IO_STDERR 2 +#define SLURM_IO_ALLSTDIN 3 -typedef struct slurm_io_header { - unsigned char key[SLURM_IO_KEY_SIZE]; - uint32_t taskid; +struct slurm_io_init_msg { uint16_t version; + unsigned char cred_signature[SLURM_CRED_SIGLEN]; + uint32_t nodeid; +}; + + +typedef struct slurm_io_header { uint16_t type; + uint16_t gtaskid; + uint16_t ltaskid; + uint32_t length; } io_hdr_t; - /* * Return the packed size of an 
IO header in bytes; */ int io_hdr_packed_size(); - - -/* - * Write an io header into the cbuf in packed form - */ -int io_hdr_write_cb(cbuf_t cb, io_hdr_t *hdr); - -/* - * Read an io header from the cbuf into hdr - */ -int io_hdr_read_cb(cbuf_t cb, io_hdr_t *hdr); +void io_hdr_pack(io_hdr_t *hdr, Buf buffer); +int io_hdr_unpack(io_hdr_t *hdr, Buf buffer); +int io_hdr_read_fd(int fd, io_hdr_t *hdr); /* - * Validate io header hdr against len bytes of the data in key - * - * Returns 0 on success, -1 if any of the following is not true - * - * version != internal version - * type != (SLURM_IO_STDOUT or SLURM_IO_STDERR) - * len bytes of key != hdr->key - * + * Validate io init msg */ -int io_hdr_validate(io_hdr_t *hdr, const char *key, int len); +int io_init_msg_validate(struct slurm_io_init_msg *msg, const char *sig); +int io_init_msg_write_to_fd(int fd, struct slurm_io_init_msg *msg); +int io_init_msg_read_from_fd(int fd, struct slurm_io_init_msg *msg); #endif /* !_HAVE_IO_HDR_H */ diff --git a/src/common/macros.h b/src/common/macros.h index c6d6aa1a99491bd9de403c5e2b8c1955f7858f8a..ada19676f5715b6d886b1dbd3afaf173787759d7 100644 --- a/src/common/macros.h +++ b/src/common/macros.h @@ -61,7 +61,7 @@ typedef enum {false, true} bool; # define MIN(a,b) ((a) < (b) ? 
(a) : (b)) #endif -#define SLURM_IO_KEY_SIZE 8 +#define SLURM_CRED_SIGLEN 8 /* Avoid going over 32 bits for a constant to avoid warnings on some systems */ # define UINT64_SWAP_LE_BE(val) ((uint64_t) ( \ diff --git a/src/common/slurm_cred.c b/src/common/slurm_cred.c index 92071daced55c41976f4152c753b731be00fb9d8..b84bfca532c4d7ca59c5b2f9e13244cf2a3f0151 100644 --- a/src/common/slurm_cred.c +++ b/src/common/slurm_cred.c @@ -410,7 +410,7 @@ slurm_cred_faker(slurm_cred_arg_t *arg) memcpy(cred->ntask, arg->ntask, cred->ntask_cnt * sizeof(int)); } cred->ctime = time(NULL); - cred->siglen = SLURM_IO_KEY_SIZE; + cred->siglen = SLURM_CRED_SIGLEN; cred->signature = xmalloc(cred->siglen * sizeof(char)); diff --git a/src/common/slurm_protocol_defs.h b/src/common/slurm_protocol_defs.h index 5d56280cc3a0e81e2f668117f4aff901b0c14d77..0faae815cb3a494c9ea54ebecaeb3531449625bc 100644 --- a/src/common/slurm_protocol_defs.h +++ b/src/common/slurm_protocol_defs.h @@ -279,6 +279,8 @@ typedef struct launch_tasks_request_msg { char *ofname; char *efname; char *ifname; + /* buffered stdio flag: 1 for line-buffered, 0 for unbuffered */ + uint8_t buffered_stdio; char *task_prolog; char *task_epilog; diff --git a/src/common/slurm_protocol_pack.c b/src/common/slurm_protocol_pack.c index b5f7ce22173d286eb04b9d18ec6f32d55ba6477a..aedd703717bf17af720c23525584cf60b54081de 100644 --- a/src/common/slurm_protocol_pack.c +++ b/src/common/slurm_protocol_pack.c @@ -2434,6 +2434,7 @@ _pack_launch_tasks_request_msg(launch_tasks_request_msg_t * msg, Buf buffer) packstr(msg->ofname, buffer); packstr(msg->efname, buffer); packstr(msg->ifname, buffer); + pack8(msg->buffered_stdio, buffer); packstr(msg->task_prolog, buffer); packstr(msg->task_epilog, buffer); pack32(msg->slurmd_debug, buffer); @@ -2474,6 +2475,7 @@ _unpack_launch_tasks_request_msg(launch_tasks_request_msg_t ** safe_unpackstr_xmalloc(&msg->ofname, &uint16_tmp, buffer); safe_unpackstr_xmalloc(&msg->efname, &uint16_tmp, buffer); 
safe_unpackstr_xmalloc(&msg->ifname, &uint16_tmp, buffer); + safe_unpack8(&msg->buffered_stdio, buffer); safe_unpackstr_xmalloc(&msg->task_prolog, &uint16_tmp, buffer); safe_unpackstr_xmalloc(&msg->task_epilog, &uint16_tmp, buffer); safe_unpack32(&msg->slurmd_debug, buffer); diff --git a/src/slurmd/fname.c b/src/slurmd/fname.c index a701e951a14081b122a0bea3585b762240c1ad28..bcace0f765ab4df53476355f6f1d1ddd5cef2ed3 100644 --- a/src/slurmd/fname.c +++ b/src/slurmd/fname.c @@ -174,9 +174,9 @@ int fname_single_task_io (const char *fmt) taskid = strtoul (fmt, &p, 10); if (*p == '\0') - return ((int) taskid); + return (int)taskid; - return (-1); + return -1; } int diff --git a/src/slurmd/io.c b/src/slurmd/io.c index bd267ef1d3dbe0de44f4701c26d7f3a87cbc8581..2374ae4774091c7eaad6cfc4aeff148516d35fca 100644 --- a/src/slurmd/io.c +++ b/src/slurmd/io.c @@ -63,1745 +63,1056 @@ #include "src/slurmd/fname.h" #include "src/slurmd/slurmd.h" -typedef enum slurmd_io_tupe { - TASK_STDERR = 0, - TASK_STDOUT, - TASK_STDIN, - CLIENT_STDERR, - CLIENT_STDOUT, - CLIENT_STDIN, -} slurmd_io_type_t; - -static char *_io_str[] = -{ - "task stderr", - "task stdout", - "task stdin", - "client stderr", - "client stdout", - "client stdin", +struct incoming_client_info { + struct slurm_io_header header; + struct io_buf *msg; + int32_t remaining; + bool eof; }; -enum error_type { - E_NONE, - E_WRITE, - E_READ, - E_POLL +struct outgoing_fd_info { + List msg_queue; + struct io_buf *msg; + int32_t remaining; }; -struct error_state { - enum error_type e_type; - int e_last; - int e_count; - time_t e_time; -}; +/********************************************************************** + * IO client socket declarations + **********************************************************************/ +static bool _client_readable(eio_obj_t *); +static bool _client_writable(eio_obj_t *); +static int _client_read(eio_obj_t *, List); +static int _client_write(eio_obj_t *, List); +struct io_operations client_ops = { + 
readable: &_client_readable, + writable: &_client_writable, + handle_read: &_client_read, + handle_write: &_client_write, +}; -/* The IO information structure - */ -struct io_info { +struct client_io_info { #ifndef NDEBUG -#define IO_MAGIC 0x10101 - int magic; +#define CLIENT_IO_MAGIC 0x10102 + int magic; #endif - uint32_t id; /* global task id */ - io_obj_t *obj; /* pointer back to eio object */ slurmd_job_t *job; /* pointer back to job data */ - slurmd_task_info_t *task; /* pointer back to task data */ - cbuf_t buf; /* IO buffer */ - List readers; /* list of current readers */ - List writers; /* list of current writers */ - slurmd_io_type_t type; /* type of IO object */ - - struct error_state err; /* error state information */ - unsigned eof:1; /* obj recvd or generated EOF */ - - unsigned disconnected:1; /* signifies that fd is not - * connected to anything - * (e.g. A "ghost" client attached - * to a task.) - */ - - unsigned rw:1; /* 1 if client is read-write - * capable, 0 otherwize - */ + struct incoming_client_info in; + struct outgoing_fd_info out; }; +/********************************************************************** + * Task write declarations + **********************************************************************/ +static bool _task_writable(eio_obj_t *); +static int _task_write(eio_obj_t *, List); -static void _fatal_cleanup(void *); -static int find_obj(void *obj, void *key); -/* static int find_fd(void *obj, void *key); */ -static int _io_init_pipes(slurmd_task_info_t *t); -static int _io_prepare_tasks(slurmd_job_t *); -static void * _io_thr(void *); -static int _io_write_header(struct io_info *, srun_info_t *); -static void _io_client_attach(io_obj_t *, io_obj_t *, io_obj_t *, - List objList); -static void _io_connect_objs(io_obj_t *, io_obj_t *); -static int _shutdown_task_obj(struct io_info *t); -static bool _isa_client(struct io_info *io); -static int _open_output_file(slurmd_job_t *job, slurmd_task_info_t *t, - char *fname, slurmd_io_type_t 
type); -static int _open_stdin_file(slurmd_job_t *job, slurmd_task_info_t *t, - srun_info_t *srun); - -static struct io_obj * _io_obj_create(int fd, void *arg); -static struct io_info * _io_info_create(uint32_t id); -static struct io_obj * _io_obj(slurmd_job_t *, slurmd_task_info_t *, int, int); -static void * _io_thr(void *arg); - -static void _clear_error_state(struct io_info *io); -static int _update_error_state(struct io_info *, enum error_type, int); +struct io_operations task_in_ops = { + writable: &_task_writable, + handle_write: &_task_write, +}; +struct task_in_info { #ifndef NDEBUG -static bool _isa_task(struct io_info *io); +#define TASK_IN_MAGIC 0x10103 + int magic; #endif + slurmd_job_t *job; /* pointer back to job data */ -static struct io_operations * _ops_copy(struct io_operations *ops); + struct outgoing_fd_info out; +}; -/* Slurmd I/O objects: - * N task stderr, stdout objs (read-only) - * N*M client stderr, stdout objs (read-write) (possibly a file) - * N task stdin objs (write only) (possibly a file) - */ +/********************************************************************** + * Task read declarations + **********************************************************************/ +static bool _task_readable(eio_obj_t *); +static int _task_read(eio_obj_t *, List); -static bool _readable(io_obj_t *); -static bool _writable(io_obj_t *); -static int _write(io_obj_t *, List); -static int _task_read(io_obj_t *, List); -static int _client_read(io_obj_t *, List); -static int _task_error(io_obj_t *, List); -static int _client_error(io_obj_t *, List); -static int _connecting_write(io_obj_t *, List); -static int _obj_close(io_obj_t *, List); - -/* Task Output operations (TASK_STDOUT, TASK_STDERR) - * These objects are never writable -- - * therefore no need for writeable and handle_write methods - */ struct io_operations task_out_ops = { - readable: &_readable, + readable: &_task_readable, handle_read: &_task_read, - handle_error: &_task_error, - handle_close: 
&_obj_close }; -/* Task Input operations (TASK_STDIN) - * input objects are never readable - */ -struct io_operations task_in_ops = { - writable: &_writable, - handle_write: &_write, - handle_error: &_task_error, - handle_close: &_obj_close -}; - -/* Normal client operations (CLIENT_STDOUT, CLIENT_STDERR, CLIENT_STDIN) - * these methods apply to clients which are considered - * "connected" i.e. in the case of srun, they've read - * the so-called IO-header data - */ -struct io_operations client_ops = { - readable: &_readable, - writable: &_writable, - handle_read: &_client_read, - handle_write: &_write, - handle_error: &_client_error, - handle_close: &_obj_close +struct task_out_info { +#ifndef NDEBUG +#define TASK_OUT_MAGIC 0x10103 + int magic; +#endif + uint16_t type; /* type of IO object */ + uint16_t gtaskid; + uint16_t ltaskid; + slurmd_job_t *job; /* pointer back to job data */ + cbuf_t buf; + bool eof; + bool eof_msg_sent; }; +/********************************************************************** + * General declarations + **********************************************************************/ +static void *_io_thr(void *); +static int _send_io_init_msg(int sock, srun_key_t *key, int nodeid); +static void _send_eof_msg(struct task_out_info *out); +static struct io_buf *_task_build_message(struct task_out_info *out, + slurmd_job_t *job, cbuf_t cbuf); +static struct io_obj *_io_obj(slurmd_job_t *, slurmd_task_info_t *, int, int); +static void *_io_thr(void *arg); +static void _route_msg_task_to_client(eio_obj_t *obj); + +/********************************************************************** + * IO client socket functions + **********************************************************************/ +static bool +_client_readable(eio_obj_t *obj) +{ + struct client_io_info *client = (struct client_io_info *) obj->arg; + + debug3("Called _client_readable"); + xassert(client->magic == CLIENT_IO_MAGIC); -/* Connecting client operations -- - * clients use a connecting 
write until they've - * written out the IO header data. Not until that - * point will clients be able to read regular - * stdout/err data, so we treat them special - */ -struct io_operations connecting_client_ops = { - writable: &_writable, - handle_write: &_connecting_write, - handle_error: &_client_error, - handle_close: &_obj_close -}; + if (client->in.eof) { + debug3(" false"); + return false; + } + if (obj->shutdown) { + debug3(" false, shutdown"); + shutdown(obj->fd, SHUT_RD); + client->in.eof = true; + } -#ifndef NDEBUG -static int _validate_io_list(List objList); -#endif /* NDEBUG */ + if (client->in.msg != NULL + || !list_is_empty(client->job->free_incoming)) + return true; -int -io_spawn_handler(slurmd_job_t *job) -{ - pthread_attr_t attr; + debug3(" false"); + return false; +} - if (io_init_pipes(job) == SLURM_FAILURE) { - error("io_handler: init_pipes failed: %m"); - return SLURM_FAILURE; - } +static bool +_client_writable(eio_obj_t *obj) +{ + struct client_io_info *client = (struct client_io_info *) obj->arg; - /* create task IO objects and append these to the objs list - */ - if (_io_prepare_tasks(job) < 0) - return SLURM_FAILURE; + debug3("Called _client_writable"); + xassert(client->magic == CLIENT_IO_MAGIC); - slurm_attr_init(&attr); - xassert(_validate_io_list(job->objs)); + if (client->out.msg != NULL) + debug3(" client->out.msg != NULL"); - if (pthread_create(&job->ioid, &attr, &_io_thr, (void *)job) != 0) - fatal("pthread_create: %m"); - - fatal_add_cleanup(&_fatal_cleanup, (void *) job); + if (!list_is_empty(client->out.msg_queue)) + debug3(" client->out.msg_queue queue length = %d", + list_count(client->out.msg_queue)); - return 0; -} + if (client->out.msg != NULL + || !list_is_empty(client->out.msg_queue)) + return true; -static int -_xclose(int fd) -{ - int rc; - do { - rc = close(fd); - } while (rc == -1 && errno == EINTR); - return rc; + debug3(" false"); + return false; } - -/* - * Close child fds in parent as well as - * any stdin io 
objs in job->objs - * - */ -static void -_io_finalize(slurmd_task_info_t *t) +static int +_client_read(eio_obj_t *obj, List objs) { - struct io_info *in = t->in->arg; - - if (_xclose(t->pin[0] ) < 0) - error("close(stdin) : %m"); - if (_xclose(t->pout[1]) < 0) - error("close(stdout): %m"); - if (_xclose(t->perr[1]) < 0) - error("close(stderr): %m"); + struct client_io_info *client = (struct client_io_info *) obj->arg; + struct incoming_client_info *in; + void *buf; + int n; - in->disconnected = 1; + debug2("Entering _client_read"); + xassert(client->magic == CLIENT_IO_MAGIC); - /* Need to close all stdin writers - * - * We effectively close these writers by - * forcing them to be unreadable. This will - * prevent the IO thread from hanging waiting - * for stdin data. (While also not forcing the - * close of a pipe that is also writable) + in = &client->in; + /* + * Read the header, if a message read is not already in progress */ + if (in->msg == NULL) { + in->msg = list_dequeue(client->job->free_incoming); + if (in->msg == NULL) { + debug3(" _client_read free_incoming is empty"); + return SLURM_SUCCESS; + } + n = io_hdr_read_fd(obj->fd, &in->header); + if (n <= 0) { /* got eof or fatal error */ + debug3(" got eof or error _client_read header, n=%d", n); + in->eof = true; + list_enqueue(client->job->free_incoming, in->msg); + in->msg = NULL; + return SLURM_SUCCESS; + } + debug3("in->header.length = %d", in->header.length); + if (in->header.length > MAX_MSG_LEN) + fatal("Message length of %d exceeds maximum of %d", + in->header.length, MAX_MSG_LEN); + in->remaining = in->header.length; + in->msg->length = in->header.length; + } - - if (in->writers) { - ListIterator i; - struct io_info *io; + /* + * Read the body + */ + if (in->header.length == 0) { /* zero length is an eof message */ + debug3(" got stdin eof message!"); + } else { + buf = in->msg->data + (in->msg->length - in->remaining); + again: + if ((n = read(obj->fd, buf, in->remaining)) < 0) { + if (errno == 
EINTR) + goto again; + /* FIXME handle error */ + return SLURM_ERROR; + } + if (n == 0) { /* got eof */ + debug3(" got eof on _client_read body"); + in->eof = true; + list_enqueue(client->job->free_incoming, in->msg); + in->msg = NULL; + return SLURM_SUCCESS; + } + in->remaining -= n; + if (in->remaining > 0) + return SLURM_SUCCESS; + *(char *)(buf + n) = '\0'; + debug3("\"%s\"", buf); + } - i = list_iterator_create(in->writers); - while ((io = list_next(i))) { - if (io->obj->fd > 0) { - io->obj->ops->readable = NULL; + /* + * Route the message to its destination(s) + */ + if (in->header.type != SLURM_IO_STDIN + && in->header.type != SLURM_IO_ALLSTDIN) { + error("Input in->header.type is not valid!"); + in->msg = NULL; + return SLURM_ERROR; + } else { + int i; + slurmd_task_info_t *task; + struct task_in_info *io; + + in->msg->ref_count = 0; + if (in->header.type == SLURM_IO_ALLSTDIN) { + for (i = 0; i < client->job->ntasks; i++) { + task = client->job->task[i]; + io = (struct task_in_info *)(task->in->arg); + in->msg->ref_count++; + list_enqueue(io->out.msg_queue, in->msg); + } + debug3(" message ref_count = %d", in->msg->ref_count); + } else { + for (i = 0; i < client->job->ntasks; i++) { + task = client->job->task[i]; + io = (struct task_in_info *)task->in->arg; + if (task->gtid != in->header.gtaskid) + continue; + in->msg->ref_count++; + list_enqueue(io->out.msg_queue, in->msg); + break; } } - list_iterator_destroy(i); } + in->msg = NULL; + debug2("Leaving _client_read"); + return SLURM_SUCCESS; } -void -io_close_all(slurmd_job_t *job) +/* + * Write outgoing packed messages to the client socket. 
+ */ +static int +_client_write(eio_obj_t *obj, List objs) { - int i; + struct client_io_info *client = (struct client_io_info *) obj->arg; + struct outgoing_fd_info *out; + void *buf; + int n; - for (i = 0; i < job->ntasks; i++) - _io_finalize(job->task[i]); + xassert(client->magic == CLIENT_IO_MAGIC); - /* No more debug info will be received by client after this point + debug2("Entering _client_write"); + out = &client->out; + + /* + * If we aren't already in the middle of sending a message, get the + * next message from the queue. */ - debug("Closing debug channel"); - close(STDERR_FILENO); + if (out->msg == NULL) { + out->msg = list_dequeue(out->msg_queue); + if (out->msg == NULL) { + debug3("_client_write: nothing in the queue"); + return SLURM_SUCCESS; + } + debug3(" dequeue successful, out->msg->length = %d", out->msg->length); + out->remaining = out->msg->length; + } - /* Signal IO thread to close appropriate - * client connections + debug3(" out->remaining = %d", out->remaining); + + /* + * Write message to socket. 
*/ - eio_handle_signal(job->eio); + buf = out->msg->data + (out->msg->length - out->remaining); +again: + if ((n = write(obj->fd, buf, out->remaining)) < 0) { + if (errno == EINTR) + goto again; + /* FIXME handle error */ + return SLURM_ERROR; + } + debug3("Wrote %d bytes to socket", n); + out->remaining -= n; + if (out->remaining > 0) + return SLURM_SUCCESS; + + _free_outgoing_msg(out->msg, client->job); + out->msg = NULL; + + return SLURM_SUCCESS; } -static void -_fatal_cleanup(void *arg) +/********************************************************************** + * Task write functions + **********************************************************************/ +/* + * Create an eio_obj_t for handling a task's stdin traffic + */ +static eio_obj_t * +_create_task_in_eio(int fd, slurmd_job_t *job) { - slurmd_job_t *job = (slurmd_job_t *) arg; - ListIterator i; - io_obj_t *obj; - struct io_info *io; + struct task_in_info *in = NULL; + eio_obj_t *eio = NULL; - error("in fatal_cleanup"); + in = (struct task_in_info *)xmalloc(sizeof(struct task_in_info)); +#ifndef NDEBUG + in->magic = TASK_IN_MAGIC; +#endif + in->job = job; + in->out.msg_queue = list_create(NULL); /* FIXME! 
Add destructor */ + in->out.msg = NULL; + in->out.remaining = 0; - _task_read(job->task[0]->err, job->objs); + eio = eio_obj_create(fd, &task_in_ops, (void *)in); - i = list_iterator_create(job->objs); - while((obj = list_next(i))) { - io = (struct io_info *) obj->arg; - if (obj->ops->writable && (*obj->ops->writable)(obj)) - _write(obj, job->objs); - } - list_iterator_destroy(i); + return eio; } -static void -_handle_unprocessed_output(slurmd_job_t *job) +static bool +_task_writable(eio_obj_t *obj) { - int i; - slurmd_task_info_t *t; - struct io_info *io; - List readers; - size_t n = 0; - /* XXX Do something with unwritten IO */ - for (i = 0; i < job->ntasks; i++) { - if (!(t = job->task[i])) - continue; - if (!(readers = ((struct io_info *)t->out->arg)->readers)) - continue; - if (!(io = list_peek(readers))) - continue; - - if (io->buf && (n = cbuf_used(io->buf))) - error("task %d: %ld bytes of stdout unprocessed", - io->id, (long) n); - - if (!(readers = ((struct io_info *)t->err->arg)->readers)) - continue; - if (!(io = list_peek(readers))) - continue; - - if (io->buf && (n = cbuf_used(io->buf))) - error("task %d: %ld bytes of stderr unprocessed", - io->id, (long) n); - } -} + struct task_in_info *in = (struct task_in_info *) obj->arg; + struct outgoing_fd_info *out = &in->out; -static void * -_io_thr(void *arg) -{ - slurmd_job_t *job = (slurmd_job_t *) arg; - sigset_t set; + debug3("Called _task_writable"); - /* A SIGHUP signal signals a reattach to the mgr thread. We need - * to block SIGHUP from being delivered to this thread so the mgr - * thread will see the signal. - * - * FIXME! It is conceivable that a SIGHUP could be delivered to - * this thread before we get a chance to block it. 
- */ - sigemptyset(&set); - sigaddset(&set, SIGHUP); - pthread_sigmask(SIG_BLOCK, &set, NULL); + if (out->msg != NULL || list_count(out->msg_queue) > 0) { + debug3(" true, list_count = %d", list_count(out->msg_queue)); + return true; + } - debug("IO handler started pid=%lu", (unsigned long) getpid()); - io_handle_events(job->eio, job->objs); - debug("IO handler exited"); - _handle_unprocessed_output(job); - return (void *)1; + debug3(" false (list_count = %d)", list_count(out->msg_queue)); + return false; } static int -_io_prepare_tasks(slurmd_job_t *job) +_task_write(eio_obj_t *obj, List objs) { - int i; - slurmd_task_info_t *t; - io_obj_t *obj; - - for (i = 0; i < job->ntasks; i++) { - t = job->task[i]; - - t->in = _io_obj(job, t, t->pin[1], TASK_STDIN ); - list_append(job->objs, (void *)t->in ); + struct task_in_info *in = (struct task_in_info *) obj->arg; + struct outgoing_fd_info *out; + void *buf; + int n; - t->out = _io_obj(job, t, t->pout[0], TASK_STDOUT); - list_append(job->objs, (void *)t->out); + debug2("Entering _task_write"); + xassert(in->magic == TASK_IN_MAGIC); - /* "ghost" stdout client buffers task data without sending - * it anywhere - */ - obj = _io_obj(job, t, -1, CLIENT_STDOUT); - _io_client_attach(obj, t->out, NULL, job->objs); + out = &in->out; - t->err = _io_obj(job, t, t->perr[0], TASK_STDERR); - list_append(job->objs, (void *)t->err); + /* + * If we aren't already in the middle of sending a message, get the + * next message from the queue. 
+ */ + if (out->msg == NULL) { + out->msg = list_dequeue(out->msg_queue); + if (out->msg == NULL) { + debug3("_task_write: nothing in the queue"); + return SLURM_SUCCESS; + } + if (out->msg->length == 0) { /* eof message */ + close(obj->fd); + obj->fd = -1; + _free_incoming_msg(out->msg, in->job); + out->msg = NULL; + return SLURM_SUCCESS; + } + out->remaining = out->msg->length; + } - /* "fake" stderr client buffers task data without sending - * it anywhere - */ - obj = _io_obj(job, t, -1, CLIENT_STDERR); - _io_client_attach(obj, t->err, NULL, job->objs); + /* + * Write message to socket. + */ + buf = out->msg->data + (out->msg->length - out->remaining); +again: + if ((n = write(obj->fd, buf, out->remaining)) < 0) { + if (errno == EINTR) + goto again; + /* FIXME handle error */ + return SLURM_ERROR; } + out->remaining -= n; + if (out->remaining > 0) + return SLURM_SUCCESS; - xassert(_validate_io_list(job->objs)); + _free_incoming_msg(out->msg, in->job); + out->msg = NULL; return SLURM_SUCCESS; } +/********************************************************************** + * Task read functions + **********************************************************************/ /* - * Turn off obj's readable() function such that it is never - * checked for readability + * Create an eio_obj_t for handling a task's stdout or stderr traffic */ -static inline void -_obj_set_unreadable(io_obj_t *obj) +static eio_obj_t * +_create_task_out_eio(int fd, uint16_t type, + slurmd_job_t *job, slurmd_task_info_t *task) { - obj->ops->readable = NULL; -} + struct task_out_info *out = NULL; + eio_obj_t *eio = NULL; -static inline void -_obj_set_unwritable(io_obj_t *obj) -{ - obj->ops->writable = NULL; + out = (struct task_out_info *)xmalloc(sizeof(struct task_out_info)); +#ifndef NDEBUG + out->magic = TASK_OUT_MAGIC; +#endif + out->type = type; + out->gtaskid = task->gtid; + out->ltaskid = task->id; + out->job = job; + out->buf = cbuf_create(MAX_MSG_LEN, MAX_MSG_LEN*4); + out->eof = false; + 
out->eof_msg_sent = false; + if (cbuf_opt_set(out->buf, CBUF_OPT_OVERWRITE, CBUF_NO_DROP) == -1) + error("setting cbuf options"); + + eio = eio_obj_create(fd, &task_out_ops, (void *)out); + + return eio; } -static char * -_local_filename (char *fname, int taskid) +static bool +_task_readable(eio_obj_t *obj) { - int id; - - if (fname == NULL) - return (NULL); + struct task_out_info *out = (struct task_out_info *)obj->arg; - if ((id = fname_single_task_io (fname)) < 0) - return (fname); + debug3("Called _task_readable, task %d, %s", out->gtaskid, + out->type == SLURM_IO_STDOUT ? "STDOUT" : "STDERR"); - if (id != taskid) - return ("/dev/null"); + if (out->eof_msg_sent) { + debug3(" false, eof message sent"); + return false; + } + if (cbuf_free(out->buf) > 0) { + debug3(" cbuf_free = %d", cbuf_free(out->buf)); + return true; + } - return (NULL); + debug3(" false"); + return false; } +/* + * Read output (stdout or stderr) from a task into a cbuf. The cbuf + * allows whole lines to be packed into messages if line buffering + * is requested. 
+ */ static int -_io_add_connecting(slurmd_job_t *job, slurmd_task_info_t *t, srun_info_t *srun, - slurmd_io_type_t type) +_task_read(eio_obj_t *obj, List objs) { - io_obj_t *obj = NULL; - int sock = -1; + struct task_out_info *out = (struct task_out_info *)obj->arg; + struct client_io_info *client; + struct io_buf *msg = NULL; + eio_obj_t *eio; + ListIterator clients; + int len; + int rc = -1; + + xassert(out->magic == TASK_OUT_MAGIC); + + debug2("Entering _task_read"); + len = cbuf_free(out->buf); + if (len > 0) { +again: + if ((rc = cbuf_write_from_fd(out->buf, obj->fd, len, NULL)) + < 0) { + if (errno == EINTR) + goto again; + if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) { + error("_task_read returned EAGAIN"); + return SLURM_SUCCESS; + } + /* FIXME add error message */ + debug3(" error in _task_read"); + return SLURM_ERROR; + } + if (rc == 0) { /* got eof */ + debug3(" got eof on task"); + out->eof = true; + } + } - debug2 ("adding connecting %s for task %d", _io_str[type], t->gtid); + debug3("************************ %d bytes read from task %s", rc, + out->type == SLURM_IO_STDOUT ? "STDOUT" : "STDERR"); - if ((sock = (int) slurm_open_stream(&srun->ioaddr)) < 0) { - error("connect io: %m"); - /* XXX retry or silently fail? - * fail for now. 
- */ - return SLURM_ERROR; - } - - fd_set_nonblocking(sock); - fd_set_close_on_exec(sock); - obj = _io_obj(job, t, sock, type); - obj->ops = _ops_copy(&connecting_client_ops); - _io_write_header(obj->arg, srun); + /* + * Put the message in client outgoing queues + */ + _route_msg_task_to_client(obj); - if ((type == CLIENT_STDOUT) - && (!_local_filename(srun->ifname, t->gtid))) { - struct io_info *io = obj->arg; - /* This is the only read-write capable client - * at this time: a connected CLIENT_STDOUT - */ - io->rw = 1; + /* + * Send the eof message + */ + if (cbuf_used(out->buf) == 0 && out->eof) { + _send_eof_msg(out); } - list_append(job->objs, (void *)obj); + return SLURM_SUCCESS; +} - debug3("Now handling %d IO objects", list_count(job->objs)); +/********************************************************************** + * General fuctions + **********************************************************************/ +static char * +_local_filename (char *fname, int taskid) +{ + int id; - return SLURM_SUCCESS; + if (fname == NULL) + return NULL; + + if ((id = fname_single_task_io(fname)) < 0) + return fname; + + if (id != taskid) + return "/dev/null"; + + return (NULL); } -/* - * If filename is given for stdout/err/in, open appropriate file, - * otherwise create a connecting client back to srun process. 
- */ static int -_io_prepare_one(slurmd_job_t *j, slurmd_task_info_t *t, srun_info_t *s) +_init_task_stdio_fds(slurmd_job_t *job, slurmd_task_info_t *task, + srun_info_t *srun) { - int retval = SLURM_SUCCESS; - char *fname = NULL; + char *name; + int single; + int fd; + struct passwd *spwd = NULL; - /* Try hard to get stderr connected to something + /* + * Initialize stdin */ - if ( (_open_output_file(j, t, s->efname, CLIENT_STDERR) < 0) - && (_io_add_connecting(j, t, s, CLIENT_STDERR) < 0) ) - retval = SLURM_FAILURE; - - if ((fname = _local_filename (s->ofname, t->gtid))) { - if (_open_output_file(j, t, fname, CLIENT_STDOUT) < 0) - retval = SLURM_FAILURE; + if ((name = _local_filename(srun->ifname, task->gtid)) != NULL) { + /* open file "name" on task's stdin */ + name = fname_create(job, srun->ifname, task->gtid); + debug3(" stdin file name = %s", name); + if ((task->stdin = open(name, O_RDONLY)) == -1) { + error("Could not open stdin file: %m"); + return SLURM_ERROR; + } + task->to_stdin = -1; /* not used */ } else { - _io_add_connecting(j, t, s, CLIENT_STDOUT); + /* create pipe and eio object */ + int pin[2]; + if (pipe(pin) < 0) { + error("stdin pipe: %m"); + return SLURM_ERROR; + } + task->stdin = pin[0]; + task->to_stdin = pin[1]; + fd_set_close_on_exec(task->to_stdin); + fd_set_nonblocking(task->to_stdin); + task->in = _create_task_in_eio(task->to_stdin, job); + list_append(job->objs, (void *)task->in); } - - if ((fname = _local_filename (s->ifname, t->gtid))) { - if (_open_stdin_file(j, t, s) < 0) - retval = SLURM_FAILURE; - } else if (_local_filename (s->ofname, t->gtid)) { - _io_add_connecting(j, t, s, CLIENT_STDIN); + + /* + * Initialize stdout + */ + if ((name = _local_filename(srun->ofname, task->gtid)) != NULL) { + /* open file "name" on task's stdout */ + name = fname_create(job, srun->ofname, task->gtid); + debug3(" stdout file name = %s", name); + task->stdout = open(name, O_CREAT|O_WRONLY|O_TRUNC|O_APPEND, + 0666); + if (task->stdout == -1) { + 
error("Could not open stdout file: %m"); + return SLURM_ERROR; + } + task->from_stdout == -1; /* not used */ + } else { + /* create pipe and eio object */ + int pout[2]; + if (pipe(pout) < 0) { + error("stdout pipe: %m"); + return SLURM_ERROR; + } + task->stdout = pout[1]; + task->from_stdout = pout[0]; + fd_set_close_on_exec(task->from_stdout); + fd_set_nonblocking(task->from_stdout); + task->out = _create_task_out_eio(task->from_stdout, + SLURM_IO_STDOUT, job, task); + list_append(job->objs, (void *)task->out); } - if (!list_find_first(t->srun_list, (ListFindF) find_obj, s)) { - debug3("appending new client to srun_list for task %d", t->gtid); - list_append(t->srun_list, (void *) s); + /* + * Initialize stderr + */ + if ((name = _local_filename(srun->efname, task->gtid)) != NULL) { + /* open file "name" on task's stdout */ + name = fname_create(job, srun->efname, task->gtid); + debug3(" stderr file name = %s", name); + task->stderr = open(name, O_CREAT|O_WRONLY|O_TRUNC|O_APPEND, + 0666); + if (task->stderr == -1) { + error("Could not open stderr file: %m"); + return SLURM_ERROR; + } + task->from_stderr == -1; /* not used */ + } else { + /* create pipe and eio object */ + int perr[2]; + if (pipe(perr) < 0) { + error("stderr pipe: %m"); + return SLURM_ERROR; + } + task->stderr = perr[1]; + task->from_stderr = perr[0]; + fd_set_close_on_exec(task->from_stderr); + fd_set_nonblocking(task->from_stderr); + task->err = _create_task_out_eio(task->from_stderr, + SLURM_IO_STDERR, job, task); + list_append(job->objs, (void *)task->err); } - - return retval; } -/* - * create initial client objs for N tasks - */ int -io_prepare_clients(slurmd_job_t *job) +io_init_tasks_stdio(slurmd_job_t *job) { - int i; - int retval = SLURM_SUCCESS; srun_info_t *srun; + int i; srun = list_peek(job->sruns); xassert(srun != NULL); - if (srun->ofname && (fname_trunc_all(job, srun->ofname) < 0)) - goto error; - - if ( srun->efname - && (!srun->ofname || (strcmp(srun->ofname, srun->efname) != 
0))) { - if (fname_trunc_all(job, srun->efname) < 0) - goto error; - } - - if (srun->ioaddr.sin_addr.s_addr) { - char host[256]; - uint16_t port; - slurmd_get_addr(&srun->ioaddr, &port, host, sizeof(host)); - debug2("connecting IO back to %s:%d", host, ntohs(port)); - } - - /* Connect stdin/out/err to either a remote srun or - * local file - */ for (i = 0; i < job->ntasks; i++) { - if (_io_prepare_one(job, job->task[i], srun) < 0) - retval = SLURM_ERROR; - - /* kick IO thread */ - eio_handle_signal(job->eio); + _init_task_stdio_fds(job, job->task[i], srun); } - - return retval; - - error: - /* - * Try to open stderr connection for errors - */ - _io_add_connecting(job, job->task[0], srun, CLIENT_STDERR); - eio_handle_signal(job->eio); - return SLURM_FAILURE; } int -io_new_clients(slurmd_job_t *job) +io_thread_start(slurmd_job_t *job) { - return io_prepare_clients(job); + pthread_attr_t attr; + + slurm_attr_init(&attr); + + if (pthread_create(&job->ioid, &attr, &_io_thr, (void *)job) != 0) + fatal("pthread_create: %m"); + + /*fatal_add_cleanup(&_fatal_cleanup, (void *) job);*/ + + return 0; } static int -_open_task_file(char *filename, int flags) +_xclose(int fd) { - int fd; - if (filename == NULL) - return -1; - if ((fd = open(filename, flags, 0644))< 0) { - error( "Unable to open `%s': %s", - filename, slurm_strerror(errno) ); - return -1; - } - fd_set_nonblocking(fd); - fd_set_close_on_exec(fd); - return fd; + int rc; + do { + rc = close(fd); + } while (rc == -1 && errno == EINTR); + return rc; } -static int -_open_output_file(slurmd_job_t *job, slurmd_task_info_t *t, char *fmt, - slurmd_io_type_t type) +static void +_route_msg_task_to_client(eio_obj_t *obj) { - int fd = -1; - io_obj_t *obj = NULL; - int flags = O_APPEND|O_WRONLY; - char *fname = NULL; - - xassert((type == CLIENT_STDOUT) || (type == CLIENT_STDERR)); - - if (fmt == NULL) - return SLURM_ERROR; - - if (!_local_filename (fmt, t->gtid)) - return SLURM_ERROR; - - fname = fname_create(job, fmt, t->gtid); 
- if ((fd = _open_task_file(fname, flags)) > 0) { - debug2 ("opened `%s' for task %d %s fd %d", - fname, t->gtid, _io_str[type], fd); - obj = _io_obj(job, t, fd, type); - _obj_set_unreadable(obj); - xassert(obj->ops->writable != NULL); - if (type == CLIENT_STDOUT) - _io_client_attach(obj, t->out, NULL, job->objs); - else - _io_client_attach(obj, t->err, NULL, job->objs); - } - xfree(fname); + struct task_out_info *out = (struct task_out_info *)obj->arg; + struct client_io_info *client; + struct io_buf *msg = NULL; + eio_obj_t *eio; + ListIterator clients; + + /* Pack task output into messages for transfer to a client */ + while (cbuf_used(out->buf) > 0 + && !list_is_empty(out->job->free_outgoing)) { + debug3("cbuf_used = %d", cbuf_used(out->buf)); + msg = _task_build_message(out, out->job, out->buf); + if (msg == NULL) + return; - xassert(_validate_io_list(job->objs)); + debug3("\"%s\"", msg->data + io_hdr_packed_size()); - return fd; -} - -static int -_open_stdin_file(slurmd_job_t *job, slurmd_task_info_t *t, srun_info_t *srun) -{ - int fd = -1; - io_obj_t *obj = NULL; - int flags = O_RDONLY; - char *fname = fname_create(job, srun->ifname, t->gtid); - - if (!strcmp(fname, "/dev/null")) { - /* AIX returns POLLERR when a file descriptor for /dev/null is - * polled, so we bypass the normal eio handling of stdin, and - * instead connect the task's stdin directly to /dev/null. - * - * Without eio the pin pipe is no longer useful so we close - * both ends. We reuse pin[0] to pass the file descriptor for - * /dev/null to io_prepare_child. - */ - close(t->pin[0]); - close(t->pin[1]); - /* io_prepare_child will do close(t->pin[1]), so set it to a - * number unlikely to conflict with new file descriptors. 
- */ - t->pin[1] = -1; - if ((fd = open("/dev/null", flags)) < 0) { - error("Unable to open /dev/null: %s", - slurm_strerror(errno)); - return -1; + /* Add message to the msg_queue of all clients */ + clients = list_iterator_create(out->job->clients); + while(eio = list_next(clients)) { + client = (struct client_io_info *)eio->arg; + debug3("======================== Enqueued message"); + xassert(client->magic == CLIENT_IO_MAGIC); + if (list_enqueue(client->out.msg_queue, msg)) + msg->ref_count++; } - debug("opened /dev/null for direct stdin use fd %d", fd); - t->pin[0] = fd; - } else if ((fd = _open_task_file(fname, flags)) > 0) { - debug("opened `%s' for %s fd %d", fname, "stdin", fd); - obj = _io_obj(job, t, fd, CLIENT_STDIN); - _io_client_attach(obj, NULL, t->in, job->objs); + list_iterator_destroy(clients); } - xfree(fname); - return fd; } - -/* Attach io obj "client" as a reader of 'writer' and a writer to 'reader' - * if 'reader' is NULL client will have no readers. - * - */ static void -_io_client_attach(io_obj_t *client, io_obj_t *writer, - io_obj_t *reader, List objList) +_free_incoming_msg(struct io_buf *msg, slurmd_job_t *job) { - struct io_info *src = writer ? writer->arg : NULL; - struct io_info *dst = reader ? reader->arg : NULL; - struct io_info *cli = client->arg; - struct io_info *io; - struct io_operations *opsptr = NULL; - - xassert((src != NULL) || (dst != NULL)); - xassert((src == NULL) || (src->magic == IO_MAGIC)); - xassert((dst == NULL) || (dst->magic == IO_MAGIC)); - - if (writer == NULL) { - /* - * Only connect new client to reader if the - * reader is still available. 
- * - */ - if (reader->fd < 0 || dst->disconnected) { - debug3("can't attach %s to closed %s", - _io_str[cli->type], _io_str[dst->type]); - _obj_close(client, objList); - return; - } - - _io_connect_objs(client, reader); - if (!list_find_first(objList, (ListFindF) find_obj, client)) - list_append(objList, client); - return; - } - - io = list_peek(src->readers); - xassert((io == NULL) || (io->magic == IO_MAGIC)); - - /* Check to see if src's first reader has disconnected, - * if so, replace the object with this client, if not, - * append client to readers list - */ - if ((io != NULL) && (io->disconnected)) { - /* Resurrect the ghost: - * Attached client inherits ghost client's cbuf - * and eof, as well as place in reader list and - * master objList. However, we need to reset the - * file descriptor, operations structure, and - * disconnected flag. - */ - xassert(io->obj->fd == -1); - xassert(io->obj->ops->writable); - - io->obj->fd = client->fd; - io->disconnected = 0; - - opsptr = io->obj->ops; - io->obj->ops = _ops_copy(client->ops); - xfree(opsptr); - - /* - * Delete old client which is now an empty vessel - */ - list_delete_all(objList, (ListFindF)find_obj, client); + int i; - /* - * Rewind a few lines if possible - */ - cbuf_rewind_line(io->buf, 256, -1); + msg->ref_count--; + if (msg->ref_count == 0) { + /* Put the message back on the free List */ + list_enqueue(job->free_incoming, msg); - /* - * connect resurrected client ("io") to reader - * if (reader != NULL). - */ - if (reader != NULL) - _io_connect_objs(io->obj, reader); - xassert(io->obj->ops->writable == &_writable); - } else { - char buf[1024]; - /* Append new client into readers list and master objList - * client still copies existing eof bit, though. 
- */ - if (io) { - int n; - cli->eof = io->eof; - - if ((n = cbuf_replay_line(io->buf, buf, 256, -1)) > 0) - cbuf_write(cli->buf, buf, n, NULL); - } - _io_connect_objs(writer, client); - if (reader != NULL) - _io_connect_objs(client, reader); - - /* Only append to objList if client is not already present. - * (connecting client would already be in objList) - */ - if (!list_find_first(objList, (ListFindF) find_obj, client)) - list_append(objList, client); + /* Kick the event IO engine */ + eio_signal_wakeup(job->eio); } - - xassert(_validate_io_list(objList)); } static void -_io_connect_objs(io_obj_t *obj1, io_obj_t *obj2) +_free_outgoing_msg(struct io_buf *msg, slurmd_job_t *job) { - struct io_info *src = (struct io_info *) obj1->arg; - struct io_info *dst = (struct io_info *) obj2->arg; - xassert(src->magic == IO_MAGIC); - xassert(dst->magic == IO_MAGIC); - - if (!list_find_first(src->readers, (ListFindF)find_obj, dst)) - list_append(src->readers, dst); - else - debug3("%s already in %s readers list!", - _io_str[dst->type], _io_str[src->type]); - - if (!list_find_first(dst->writers, (ListFindF)find_obj, src)) - list_append(dst->writers, src); - else - debug3("%s already in %s writers list!", - _io_str[src->type], _io_str[dst->type]); -} + int i; -/* -static int -find_fd(void *obj, void *key) -{ - xassert(obj != NULL); - xassert(key != NULL); + msg->ref_count--; + if (msg->ref_count == 0) { + /* Put the message back on the free List */ + list_enqueue(job->free_outgoing, msg); - return (((io_obj_t *)obj)->fd == *((int *)key)); + /* Try packing messages from tasks' output cbufs */ + if (job->task == NULL) + return; + for (i = 0; i < job->ntasks; i++) { + if (job->task[i]->err != NULL) { + _route_msg_task_to_client(job->task[i]->err); + if (list_is_empty(job->free_outgoing)) + break; + } + if (job->task[i]->out != NULL) { + _route_msg_task_to_client(job->task[i]->out); + if (list_is_empty(job->free_outgoing)) + break; + } + } + /* Kick the event IO engine */ + 
eio_signal_wakeup(job->eio); + } } -*/ -static int -find_obj(void *obj, void *key) +extern void +io_close_task_fds(slurmd_job_t *job) { - xassert(obj != NULL); - xassert(key != NULL); - - return (obj == key); -} + int i; -/* delete the connection from src to dst, i.e. remove src - * from dst->writers, and dst from src->readers - */ -static void -_io_disconnect(struct io_info *src, struct io_info *dst) -{ - char *a, *b; - xassert(src->magic == IO_MAGIC); - xassert(src->readers != NULL); - xassert(dst->magic == IO_MAGIC); - xassert(dst->writers != NULL); - a = _io_str[dst->type]; - b = _io_str[src->type]; - - if (list_delete_all(src->readers, (ListFindF)find_obj, dst) <= 0) - error("Unable to delete %s from %s readers list", a, b); - - if (list_delete_all(dst->writers, (ListFindF)find_obj, src) <= 0) - error("Unable to delete %s from %s writers list", b, a); + for (i = 0; i < job->ntasks; i++) { + close(job->task[i]->stdin); + close(job->task[i]->stdout); + close(job->task[i]->stderr); + } } -static void -_io_disconnect_client(struct io_info *client, List objs) +void +io_close_all(slurmd_job_t *job) { - bool destroy = true; - struct io_info *t; - ListIterator i; + int i; - xassert(client->magic == IO_MAGIC); - xassert(_isa_client(client)); - xassert(client == client->obj->arg); +#if 0 + for (i = 0; i < job->ntasks; i++) + _io_finalize(job->task[i]); +#endif - /* Our client becomes a ghost + /* No more debug info will be received by client after this point */ - client->disconnected = 1; - - if (client->writers) { - /* delete client from its writer->readers list - */ - i = list_iterator_create(client->writers); - while ((t = list_next(i))) { - if (list_count(t->readers) > 1) - _io_disconnect(t, client); - else - destroy = false; - } - list_iterator_destroy(i); - } - - if (client->readers) { - /* delete client from its reader->writers list - */ - i = list_iterator_create(client->readers); - while ((t = list_next(i))) - _io_disconnect(client, t); - 
list_iterator_destroy(i); - } + debug("Closing debug channel"); + close(STDERR_FILENO); - xassert(client == client->obj->arg); + /* Signal IO thread to close appropriate + * client connections + */ + eio_signal_wakeup(job->eio); +} - if (!destroy) - return; +static void * +_io_thr(void *arg) +{ + slurmd_job_t *job = (slurmd_job_t *) arg; + sigset_t set; - debug3("Going to destroy %s %d", _io_str[client->type], client->id); - if (list_delete_all(objs, (ListFindF)find_obj, client->obj) <= 0) - error("Unable to destroy %s %d (%p)", - _io_str[client->type], client->id, client); + /* A SIGHUP signal signals a reattach to the mgr thread. We need + * to block SIGHUP from being delivered to this thread so the mgr + * thread will see the signal. + * + * FIXME! It is conceivable that a SIGHUP could be delivered to + * this thread before we get a chance to block it. + */ + sigemptyset(&set); + sigaddset(&set, SIGHUP); + pthread_sigmask(SIG_BLOCK, &set, NULL); - return; + debug("IO handler started pid=%lu", (unsigned long) getpid()); + eio_handle_mainloop(job->eio); + debug("IO handler exited"); + return (void *)1; } -#ifndef NDEBUG -static bool -_isa_task(struct io_info *io) +/* + * create initial client obj for this job step + */ +int +io_client_connect(slurmd_job_t *job) { - xassert(io->magic == IO_MAGIC); - return ((io->type == TASK_STDOUT) - || (io->type == TASK_STDERR) - || (io->type == TASK_STDIN )); -} -#endif + int i; + srun_info_t *srun; + int sock = -1; + struct client_io_info *client; + eio_obj_t *obj; -static bool -_isa_client(struct io_info *io) -{ - xassert(io->magic == IO_MAGIC); - return ((io->type == CLIENT_STDOUT) - || (io->type == CLIENT_STDERR) - || (io->type == CLIENT_STDIN )); -} + debug2 ("adding IO connection (logical node rank %d)", job->nodeid); -static struct io_operations * -_ops_copy(struct io_operations *ops) -{ - struct io_operations *ret = xmalloc(sizeof(*ops)); - /* - * Copy initial client_ops - */ - *ret = *ops; - return ret; -} + srun = 
list_peek(job->sruns); + xassert(srun != NULL); + if (srun->ioaddr.sin_addr.s_addr) { + char host[256]; + uint16_t port; + slurmd_get_addr(&srun->ioaddr, &port, host, sizeof(host)); + debug2("connecting IO back to %s:%d", host, ntohs(port)); + } -io_obj_t * -_io_obj(slurmd_job_t *job, slurmd_task_info_t *t, int fd, int type) -{ - struct io_info *io = _io_info_create(t->gtid); - struct io_obj *obj = _io_obj_create(fd, (void *)io); - - xassert(io->magic == IO_MAGIC); - xassert(type >= 0); - - io->type = type; - switch (type) { - case TASK_STDERR: - case TASK_STDOUT: - obj->ops = &task_out_ops; - io->readers = list_create(NULL); - break; - case TASK_STDIN: - obj->ops = &task_in_ops; - io->buf = cbuf_create(512, 4096); - io->writers = list_create(NULL); - - /* Never overwrite stdin data - */ - cbuf_opt_set(io->buf, CBUF_OPT_OVERWRITE, 0); - break; - case CLIENT_STDOUT: - io->readers = list_create(NULL); - case CLIENT_STDERR: - xfree(obj->ops); - obj->ops = _ops_copy(&client_ops); - io->buf = cbuf_create(1024, 1048576); - io->writers = list_create(NULL); - - cbuf_opt_set(io->buf, CBUF_OPT_OVERWRITE, CBUF_WRAP_ONCE); - break; - case CLIENT_STDIN: - xfree(obj->ops); - obj->ops = _ops_copy(&client_ops); - _obj_set_unwritable(obj); - io->readers = list_create(NULL); - /* - * Connected stdin still needs output buffer - * (for connection header) - */ - io->buf = cbuf_create(256, 1024); - break; - default: - error("io: unknown I/O obj type %d", type); + if ((sock = (int) slurm_open_stream(&srun->ioaddr)) < 0) { + error("connect io: %m"); + /* XXX retry or silently fail? + * fail for now. + */ + return SLURM_ERROR; } - io->disconnected = fd < 0 ? 1 : 0; + fd_set_blocking(sock); /* just in case... 
*/ - /* io info pointers back to eio object, - * job, and task information - */ - io->obj = obj; - io->job = job; - io->task = t; + _send_io_init_msg(sock, srun->key, job->nodeid); - xassert(io->task->gtid == io->id); + debug3(" back from _send_io_init_msg"); + fd_set_nonblocking(sock); + fd_set_close_on_exec(sock); - return obj; -} + /* Now set up the eio object */ + client = xmalloc(sizeof(struct client_io_info)); +#ifndef NDEBUG + client->magic = CLIENT_IO_MAGIC; +#endif + client->job = job; + client->out.msg_queue = list_create(NULL); /* FIXME! Need desctructor */ -void -io_obj_destroy(io_obj_t *obj) -{ - struct io_info *io = (struct io_info *) obj->arg; - - xassert(obj != NULL); - xassert(io != NULL); - xassert(io->magic == IO_MAGIC); - - switch (io->type) { - case TASK_STDERR: - case TASK_STDOUT: - list_destroy(io->readers); - break; - case TASK_STDIN: - cbuf_destroy(io->buf); - list_destroy(io->writers); - break; - case CLIENT_STDOUT: - list_destroy(io->readers); - xfree(obj->ops); - case CLIENT_STDERR: - cbuf_destroy(io->buf); - list_destroy(io->writers); - xfree(obj->ops); - break; - case CLIENT_STDIN: - cbuf_destroy(io->buf); - xfree(obj->ops); - list_destroy(io->readers); - break; - default: - error("unknown IO object type: %ld", (long) io->type); - } + obj = eio_obj_create(sock, &client_ops, (void *)client); + list_append(job->clients, (void *)obj); + list_append(job->objs, (void *)obj); - xassert(io->magic = ~IO_MAGIC); - xfree(io); - xfree(obj); -} + debug3("Now handling %d IO Client object(s)", list_count(job->clients)); -static io_obj_t * -_io_obj_create(int fd, void *arg) -{ - io_obj_t *obj = xmalloc(sizeof(*obj)); - obj->fd = fd; - obj->arg = arg; - obj->ops = NULL; - return obj; -} + /* kick IO thread */ + eio_signal_wakeup(job->eio); + debug3(" test 3"); -static struct io_info * -_io_info_create(uint32_t id) -{ - struct io_info *io = (struct io_info *) xmalloc(sizeof(*io)); - io->id = id; - io->job = NULL; - io->task = NULL; - io->obj = NULL; - 
io->buf = NULL; - io->type = -1; - io->readers = NULL; - io->writers = NULL; - io->eof = 0; - io->disconnected = 0; - io->rw = 0; - xassert(io->magic = IO_MAGIC); - return io; + return SLURM_SUCCESS; } int -io_init_pipes(slurmd_job_t *job) +io_new_clients(slurmd_job_t *job) { - int i; - for (i = 0; i < job->ntasks; i++) { - if (_io_init_pipes(job->task[i]) == SLURM_FAILURE) { - error("init_pipes <task %d> failed", i); - return SLURM_FAILURE; - } - } - return SLURM_SUCCESS; + return SLURM_ERROR; +#if 0 + return io_prepare_clients(job); +#endif } static int -_io_write_header(struct io_info *client, srun_info_t *srun) +_send_io_init_msg(int sock, srun_key_t *key, int nodeid) { - io_hdr_t hdr; + struct slurm_io_init_msg msg; - memcpy(hdr.key, srun->key->data, SLURM_IO_KEY_SIZE); - hdr.taskid = client->id; + memcpy(msg.cred_signature, key->data, SLURM_CRED_SIGLEN); + msg.nodeid = nodeid; - if ((client->type == CLIENT_STDOUT) || (client->type == CLIENT_STDIN)) - hdr.type = SLURM_IO_STDOUT; - else - hdr.type = SLURM_IO_STDERR; - - if (io_hdr_write_cb(client->buf, &hdr) < 0) { - error ("Unable to write io header: %m"); + if (io_init_msg_write_to_fd(sock, &msg) != SLURM_SUCCESS) { + error("Couldn't sent slurm_io_init_msg"); return SLURM_ERROR; } return SLURM_SUCCESS; } -static int -_io_init_pipes(slurmd_task_info_t *t) -{ - if ( (pipe(t->pin) < 0) - || (pipe(t->pout) < 0) - || (pipe(t->perr) < 0) ) { - error("io_init_pipes: pipe: %m"); - return SLURM_FAILURE; - } - - fd_set_close_on_exec(t->pin[1]); - fd_set_close_on_exec(t->pout[0]); - fd_set_close_on_exec(t->perr[0]); - - fd_set_nonblocking(t->pin[1]); - fd_set_nonblocking(t->pout[0]); - fd_set_nonblocking(t->perr[0]); - - return SLURM_SUCCESS; -} - -/* prepare for child I/O: - * dup stdin,stdout,stderr onto appropriate pipes and - * close write end of stdin, and read end of stdout/err +/* + * dup the appropriate file descriptors onto the task's + * stdin, stdout, and stderr. 
+ * + * Close the server's end of the stdio pipes. */ -int -io_prepare_child(slurmd_task_info_t *t) +int +io_dup_stdio(slurmd_task_info_t *t) { - if (dup2(t->pin[0], STDIN_FILENO ) < 0) { + if (dup2(t->stdin, STDIN_FILENO ) < 0) { error("dup2(stdin): %m"); return SLURM_FAILURE; } - if (dup2(t->pout[1], STDOUT_FILENO) < 0) { + if (dup2(t->stdout, STDOUT_FILENO) < 0) { error("dup2(stdout): %m"); return SLURM_FAILURE; } - if (dup2(t->perr[1], STDERR_FILENO) < 0) { + if (dup2(t->stderr, STDERR_FILENO) < 0) { error("dup2(stderr): %m"); return SLURM_FAILURE; } /* ignore errors on close */ - close(t->pin[1] ); - close(t->pout[0]); - close(t->perr[0]); - return SLURM_SUCCESS; -} - -static int -_obj_close(io_obj_t *obj, List objs) -{ - struct io_info *io = (struct io_info *) obj->arg; - - xassert(io->magic == IO_MAGIC); - xassert(_validate_io_list(objs)); - - debug3("Need to close %d %s", io->id, _io_str[io->type]); - - if (close(obj->fd) < 0) - error("close: %m"); - obj->fd = -1; - - if (_isa_client(io)) - _io_disconnect_client(io, objs); - else - _shutdown_task_obj(io); - - xassert(_validate_io_list(objs)); - + close(t->to_stdin ); + close(t->from_stdout); + close(t->from_stderr); return SLURM_SUCCESS; } -static int -_min_free (struct io_info *reader, int *lenp) -{ - int nfree = cbuf_free (reader->buf); - if (nfree < *lenp) - *lenp = nfree; - return (0); -} - -static int -_max_readable (struct io_info *io, int max) -{ - if (!io->readers) - return (0); - /* - * Determine the maximum amount of data we will - * safely be able to read (starting at max) - */ - list_for_each (io->readers, (ListForF) _min_free, (void *) &max); - return (max); -} - -static bool -_readable(io_obj_t *obj) -{ - struct io_info *io = (struct io_info *) obj->arg; - - xassert(io->magic == IO_MAGIC); - - if (io->disconnected || io->eof || (obj->fd < 0)) - return (false); - - if (_max_readable(io, 1024) == 0) - return (false); - - return (true); -} - -static bool -_writable(io_obj_t *obj) -{ - bool rc; - 
struct io_info *io = (struct io_info *) obj->arg; - - xassert(io->magic == IO_MAGIC); - - debug3("_writable(): [%p] task %d fd %d %s [%d %d %d]", io, - io->id, obj->fd, _io_str[io->type], io->disconnected, - cbuf_used(io->buf), io->eof); - - rc = ((io->obj->fd > 0) - && !io->disconnected - && ((cbuf_used(io->buf) > 0) || io->eof)); - - if ((io->type == CLIENT_STDERR) && (io->id == 0)) - rc = rc || (log_has_data() && !io->disconnected); - - if (rc) - debug3("%d %s is writable", io->id, _io_str[io->type]); - - return rc; -} - -static int -_write(io_obj_t *obj, List objs) -{ - struct io_info *io = (struct io_info *) obj->arg; - int n = 0; - - xassert(io->magic == IO_MAGIC); - xassert(io->type >= 0); - - if (io->disconnected) - return 0; - - if (io->id == 0) - log_flush(); - - debug3("Need to write %d bytes to %s %d", - cbuf_used(io->buf), _io_str[io->type], io->id); - - /* - * If obj has recvd EOF, and there is no more data to write - * (or there are many pending errors for this object), - * close the descriptor and remove object from event lists. 
- */ - if ( io->eof - && ( (cbuf_used(io->buf) == 0) - || (io->err.e_count > 1) ) ) { - _obj_close(obj, objs); - return 0; - } - - while ((n = cbuf_read_to_fd(io->buf, obj->fd, -1)) < 0) { - switch (errno) { - case EAGAIN: - return 0; - break; - case EPIPE: - case EINVAL: - case EBADF: - case ECONNRESET: - _obj_close(obj, objs); - break; - default: - _update_error_state(io, E_WRITE, errno); - } - return -1; - } - - debug3("Wrote %d bytes to %s %d", n, _io_str[io->type], io->id); - - return 0; -} - static void -_do_attach(struct io_info *io) -{ - slurmd_task_info_t *t; - struct io_operations *opsptr; - - xassert(io != NULL); - xassert(io->magic == IO_MAGIC); - xassert(_isa_client(io)); - - opsptr = io->obj->ops; - io->obj->ops = _ops_copy(&client_ops); - xfree(opsptr); - - t = io->task; - - switch (io->type) { - case CLIENT_STDOUT: - if (io->rw) { - debug3("attaching task %d client stdout read-write", - io->id); - _io_client_attach( io->obj, t->out, t->in, - io->job->objs ); - } else { - debug3("attaching task %d client stdout write-only", - io->id); - _io_client_attach( io->obj, t->out, NULL, - io->job->objs ); - } - break; - case CLIENT_STDERR: - _io_client_attach(io->obj, t->err, NULL, io->job->objs); - break; - case CLIENT_STDIN: - _io_client_attach(io->obj, NULL, t->in, io->job->objs); - break; - default: - error("Unknown client type %d in do_attach()", io->type); - - } -} - -/* Write method for client objects which are connecting back to the - * remote host - */ -static int -_connecting_write(io_obj_t *obj, List objs) -{ - struct io_info *io = (struct io_info *) obj->arg; - int n; - - xassert(io->magic == IO_MAGIC); - xassert(_isa_client(io)); - - debug3("Need to write %d bytes to connecting %s %d", - cbuf_used(io->buf), _io_str[io->type], io->id); - while ((n = cbuf_read_to_fd(io->buf, obj->fd, -1)) < 0) { - if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) - continue; - if ((errno == EPIPE) || (errno == EINVAL) || (errno == EBADF)) - _obj_close(obj, objs); - 
else - error("write failed: <task %d>: %m", io->id); - return -1; - } - debug3("Wrote %d bytes to %s %d", n, _io_str[io->type], io->id); - - /* If we've written the contents of the buffer, this is - * a connecting client no longer -- it may now be attached - * to the appropriate task. - */ - if (cbuf_used(io->buf) == 0) - _do_attach(io); - - return 0; -} - - -static int -_shutdown_task_obj(struct io_info *t) -{ - ListIterator i; - struct io_info *r; - - xassert(_isa_task(t)); - - debug3("shutdown_task_obj: %d %s [%d readers, %d writers]", - t->id, _io_str[t->type], - (t->readers ? list_count(t->readers) : 0), - (t->writers ? list_count(t->writers) : 0)); - - t->disconnected = 1; - - if (!t->readers) - return 0; - - /* Task objects do not get destroyed. - * Simply propagate the EOF to the clients - * - * Only propagate EOF to readers - * - */ - i = list_iterator_create(t->readers); - while ((r = list_next(i))) - r->eof = 1; - list_iterator_destroy(i); - - xassert(_validate_io_list(t->job->objs)); - - return 0; -} - -static int -_task_read(io_obj_t *obj, List objs) -{ - struct io_info *r, *t; - char buf[4096]; /* XXX Configurable? 
*/ - ssize_t n, len = sizeof(buf); - ListIterator i; - - t = (struct io_info *) obj->arg; - - xassert(t->magic == IO_MAGIC); - xassert((t->type == TASK_STDOUT) || (t->type == TASK_STDERR)); - xassert(_validate_io_list(objs)); - - len = _max_readable (t, len); - - again: - if ((n = read(obj->fd, (void *) buf, len)) < 0) { - if (errno == EINTR) - goto again; - if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) { - error("%s %d: read returned EAGAIN", - _io_str[t->type], t->id); - return 0; - } - _update_error_state(t, E_READ, errno); - return -1; - } - debug3("read %d bytes from %s %d", n, _io_str[t->type], t->id); - - if (n == 0) { /* got eof */ - debug3("got eof on task %ld", (long) t->id); - _obj_close(obj, objs); - return 0; - } - - /* copy buf to all readers */ - i = list_iterator_create(t->readers); - while((r = list_next(i))) { - int dropped; - xassert(r->magic == IO_MAGIC); - n = cbuf_write(r->buf, (void *) buf, n, &dropped); - debug3("wrote %ld bytes into %s buf (fd=%d)", - (long) n, _io_str[r->type], r->obj->fd); - if (dropped > 0) { - debug3("dropped %d bytes from %s buf", - dropped, _io_str[r->type]); - } - } - list_iterator_destroy(i); - - return 0; -} - -static int -_task_error(io_obj_t *obj, List objs) -{ - int err; - socklen_t size; - struct io_info *t = (struct io_info *) obj->arg; - xassert(t->magic == IO_MAGIC); - - if (getsockopt(obj->fd, SOL_SOCKET, SO_ERROR, &err, &size) < 0) - error ("_task_error getsockopt: %m"); - else - _update_error_state(t, E_POLL, err); - _obj_close(obj, objs); - return -1; -} -static int -_client_read(io_obj_t *obj, List objs) +_send_eof_msg(struct task_out_info *out) { - struct io_info *client = (struct io_info *) obj->arg; - struct io_info *reader; - char buf[4096]; - int dropped = 0; - ssize_t n = 0; - ssize_t len = sizeof(buf); - ListIterator i = NULL; - - xassert(client->magic == IO_MAGIC); - xassert(_validate_io_list(objs)); - xassert(_isa_client(client)); - - len = _max_readable (client, len); - - again: - if ((n = 
read(obj->fd, (void *) buf, len)) < 0) { - if (errno == EINTR) - goto again; - _update_error_state(client, E_READ, errno); - return -1; - } - - debug3("read %d bytes from %s %d", n, _io_str[client->type], - client->id); - - if (n == 0) { /* got eof, pass this eof to readers */ - debug3("task %d [%s fd %d] read closed", - client->id, _io_str[client->type], obj->fd); - /* - * Do not read from this stdin any longer - */ - _obj_set_unreadable(obj); - - /* - * Loop through this client's readers, - * noting that EOF was recvd only if this - * client is the only writer - */ - if (client->readers) { - i = list_iterator_create(client->readers); - while((reader = list_next(i))) { - if (list_count(reader->writers) == 1) - reader->eof = 1; - else - debug3("can't send EOF to stdin"); - - } - list_iterator_destroy(i); - } - - /* It is unsafe to close CLIENT_STDOUT - */ - if (client->type == CLIENT_STDIN) - _obj_close(obj, client->job->objs); - - return 0; - } - - if (client->type == CLIENT_STDERR) { - /* unsigned long int signo = strtoul(buf, NULL, 10); */ - /* return kill(client->id, signo); */ - return 0; + struct client_io_info *client; + struct io_buf *msg = NULL; + eio_obj_t *eio; + ListIterator clients; + struct slurm_io_header header; + Buf packbuf; + + debug2("Entering _send_eof_msg"); + msg = list_dequeue(out->job->free_outgoing); + if (msg == NULL) { + debug3(" free_outgoing msg list empty, can't send eof_msg"); + return; } - /* - * Copy cbuf to all readers - */ - i = list_iterator_create(client->readers); - while((reader = list_next(i))) { - n = cbuf_write(reader->buf, (void *) buf, n, &dropped); - if (dropped > 0) - error("Dropped %d bytes stdin data to task %d", - dropped, client->id); + header.type = out->type; + header.ltaskid = out->ltaskid; + header.gtaskid = out->gtaskid; + header.length = 0; /* eof */ + + packbuf = create_buf(msg->data, io_hdr_packed_size()); + io_hdr_pack(&header, packbuf); + msg->length = io_hdr_packed_size() + header.length; + 
msg->ref_count = 0; /* make certain it is initialized */ + + /* free the Buf packbuf, but not the memory to which it points */ + packbuf->head = NULL; + free_buf(packbuf); + + /* Add eof message to the msg_queue of all clients */ + clients = list_iterator_create(out->job->clients); + while(eio = list_next(clients)) { + client = (struct client_io_info *)eio->arg; + debug3("======================== Enqueued message"); + xassert(client->magic == CLIENT_IO_MAGIC); + if (list_enqueue(client->out.msg_queue, msg)) + msg->ref_count++; } - list_iterator_destroy(i); + list_iterator_destroy(clients); - return 0; + out->eof_msg_sent = true; + debug2("Leaving _send_eof_msg"); } -static int -_client_error(io_obj_t *obj, List objs) -{ - struct io_info *io = (struct io_info *) obj->arg; - socklen_t size = sizeof(int); - int err = 0; - - xassert(io->magic == IO_MAGIC); - - if (getsockopt(obj->fd, SOL_SOCKET, SO_ERROR, &err, &size) < 0) - error ("_client_error getsockopt: %m"); - else if (err != ECONNRESET) /* Do not log connection resets */ - _update_error_state(io, E_POLL, err); - return 0; -} - -static char * -err_string(enum error_type type) +static struct io_buf * +_task_build_message(struct task_out_info *out, slurmd_job_t *job, cbuf_t cbuf) { - switch (type) { - case E_NONE: - return ""; - case E_WRITE: - return "write failed"; - case E_READ: - return "read failed"; - case E_POLL: - return "poll error"; - } - - return ""; -} - -static void -_clear_error_state(struct io_info *io) -{ - io->err.e_time = time(NULL); - io->err.e_count = 0; -} - -static void -_error_print(struct io_info *io) -{ - struct error_state *err = &io->err; - - if (!err->e_count) { - error("%s: <task %d> %s: %s", - err_string(err->e_type), io->id, _io_str[io->type], - slurm_strerror(err->e_last)); - } else { - error("%s: <task %d> %s: %s (repeated %d times)", - err_string(err->e_type), io->id, _io_str[io->type], - slurm_strerror(err->e_last), err->e_count); - } -} + struct io_buf *msg; + char *ptr; + Buf packbuf; + bool must_truncate = false; + int avail; + struct slurm_io_header header; + int n; + debug2("Entering
_task_build_message"); + msg = list_dequeue(job->free_outgoing); + if (msg == NULL) + return NULL; + ptr = msg->data + io_hdr_packed_size(); -static int -_update_error_state(struct io_info *io, enum error_type type, int err) -{ - xassert(io != NULL); - xassert(io->magic == IO_MAGIC); - - /* getsockopt(,,SO_ERROR,&err,) returns err value of -1 - * under AIX under some circumstances */ - if (err <= 0) { - error("Unspecified I/O error <task %d>", io->id); - return 0; + if (job->buffered_stdio) { + avail = cbuf_peek_line(cbuf, ptr, MAX_MSG_LEN, 1); + if (avail >= MAX_MSG_LEN) + must_truncate = true; } - if ( (io->err.e_type == type) - && (io->err.e_last == err ) ) { - /* - * If the current and last error were the same, - * update the error counter - */ - io->err.e_count++; - - /* - * If it has been less than 5 seconds since the - * original error, don't print anything. - */ - if ( ((io->err.e_time + 5) > time(NULL)) - && (io->err.e_count < 65000) ) - return 0; + debug3(" buffered_stdio is %s", job->buffered_stdio ? "true" : "false"); + debug3(" must_truncate is %s", must_truncate ? 
"true" : "false"); + if (must_truncate || !job->buffered_stdio) { + n = cbuf_read(cbuf, ptr, MAX_MSG_LEN); } else { - /* - * Update error values - */ - io->err.e_count = 0; - io->err.e_type = type; - io->err.e_last = err; - io->err.e_time = time(NULL); - } - - _error_print(io); - - if (io->err.e_count > 0) - _clear_error_state(io); - - return 0; -} - -#ifndef NDEBUG -static void -_validate_task_out(struct io_info *t, int type) -{ - ListIterator i; - struct io_info *r; - - xassert(t->magic == IO_MAGIC); - xassert(!t->writers); - i = list_iterator_create(t->readers); - while ((r = list_next(i))) { - xassert(r->magic == IO_MAGIC); - xassert(r->type == type); - } - list_iterator_destroy(i); -} - -static void -_validate_task_in(struct io_info *t) -{ - ListIterator i; - struct io_info *r; - - xassert(t->magic == IO_MAGIC); - xassert(!t->readers); - i = list_iterator_create(t->writers); - while ((r = list_next(i))) { - xassert(r->magic == IO_MAGIC); - xassert((r->type == CLIENT_STDOUT) - || (r->type == CLIENT_STDIN)); + n = cbuf_read_line(cbuf, ptr, MAX_MSG_LEN, -1); + if (n == 0) { + debug3(" partial line in buffer, ignoring"); + debug2("Leaving _task_build_message"); + list_enqueue(job->free_outgoing, msg); + return NULL; + } } - list_iterator_destroy(i); -} + header.type = out->type; + header.ltaskid = out->ltaskid; + header.gtaskid = out->gtaskid; + header.length = n; -static void -_validate_client_stdout(struct io_info *client) -{ - ListIterator i; - struct io_info *t; + debug3(" header.length = %d", n); + packbuf = create_buf(msg->data, io_hdr_packed_size()); + io_hdr_pack(&header, packbuf); + msg->length = io_hdr_packed_size() + header.length; + msg->ref_count = 0; /* make certain it is initialized */ - xassert(client->magic == IO_MAGIC); - xassert(client->obj->ops->writable != NULL); + /* free the Buf packbuf, but not the memory to which it points */ + packbuf->head = NULL; + free_buf(packbuf); - i = list_iterator_create(client->readers); - while ((t = 
list_next(i))) { - xassert(t->magic == IO_MAGIC); - xassert(t->type == TASK_STDIN); - } - list_iterator_destroy(i); - - i = list_iterator_create(client->writers); - while ((t = list_next(i))) { - xassert(t->magic == IO_MAGIC); - xassert(t->type == TASK_STDOUT); - } - list_iterator_destroy(i); + debug2("Leaving _task_build_message"); + return msg; } -static void -_validate_client_stderr(struct io_info *client) +struct io_buf * +alloc_io_buf(void) { - ListIterator i; - struct io_info *t; - - xassert(client->magic == IO_MAGIC); - xassert(!client->readers); - xassert(client->obj->ops->writable != NULL); - - i = list_iterator_create(client->writers); - while ((t = list_next(i))) { - xassert(t->magic == IO_MAGIC); - xassert(t->type == TASK_STDERR); + struct io_buf *buf; + + buf = (struct io_buf *)xmalloc(sizeof(struct io_buf)); + if (!buf) + return NULL; + buf->ref_count = 0; + buf->length = 0; + /* The following "+ 1" is just temporary so I can stick a \0 at + the end and do a printf of the data pointer */ + buf->data = xmalloc(MAX_MSG_LEN + io_hdr_packed_size() + 1); + if (!buf->data) { + xfree(buf); + return NULL; } - list_iterator_destroy(i); -} -static void -_validate_client_stdin(struct io_info *client) -{ - ListIterator i; - struct io_info *t; - - xassert(client->magic == IO_MAGIC); - xassert(!client->writers); - i = list_iterator_create(client->readers); - while ((t = list_next(i))) { - xassert(t->magic == IO_MAGIC); - xassert(t->type == TASK_STDIN); - } - list_iterator_destroy(i); + return buf; } -static int -_validate_io_list(List objList) +void +free_io_buf(struct io_buf *buf) { - io_obj_t *obj; - int retval = 1; - ListIterator i = list_iterator_create(objList); - while ((obj = list_next(i))) { - struct io_info *io = (struct io_info *) obj->arg; - - xassert(io->obj == obj); - - switch (io->type) { - case TASK_STDOUT: - _validate_task_out(io, CLIENT_STDOUT); - break; - case TASK_STDERR: - _validate_task_out(io, CLIENT_STDERR); - break; - case TASK_STDIN: - 
_validate_task_in(io); - break; - case CLIENT_STDERR: - _validate_client_stderr(io); - break; - case CLIENT_STDOUT: - _validate_client_stdout(io); - break; - case CLIENT_STDIN: - _validate_client_stdin(io); - } + if (buf) { + if (buf->data) + xfree(buf->data); + xfree(buf); } - list_iterator_destroy(i); - return retval; } -#endif /* NDEBUG */ - diff --git a/src/slurmd/io.h b/src/slurmd/io.h index 9b3f09ff213a316896f0730f91b35f5445d2a538..2de8e7c780a1bca09d4d2f5a173eb9145ae48d59 100644 --- a/src/slurmd/io.h +++ b/src/slurmd/io.h @@ -31,13 +31,28 @@ #include "src/slurmd/slurmd_job.h" #include "src/common/eio.h" +struct io_buf { + int ref_count; + uint32_t length; + void *data; +}; + +struct io_buf *alloc_io_buf(void); +void free_io_buf(struct io_buf *buf); + +/* + * Initialize each task's standard I/O file descriptors. The file descriptors + * may be files, or may be the end of a pipe which is handled by an eio_obj_t. + */ +int io_init_tasks_stdio(slurmd_job_t *job); + /* - * Spawn IO handling thread. + * Start IO handling thread. * Initializes IO pipes, creates IO objects and appends them to job->objs, * and opens 2*ntask initial connections for stdout/err, also appending these * to job->objs list. */ -int io_spawn_handler(slurmd_job_t *job); +int io_thread_start(slurmd_job_t *job); /* * Create a set of new connecting clients for the running job @@ -46,65 +61,15 @@ int io_spawn_handler(slurmd_job_t *job); */ int io_new_clients(slurmd_job_t *job); -/* - * Frees memory associated with the given IO object - */ -void io_obj_destroy(io_obj_t *obj); - -int io_init_pipes(slurmd_job_t *job); -int io_prepare_child(slurmd_task_info_t *t); - -void io_close_all(slurmd_job_t *job); +int io_dup_stdio(slurmd_task_info_t *t); /* - * Connect initial N tasks to their stdio + * Close the tasks' ends of the stdio pipes. + * Presumably the tasks have already been started, and + * have their copies of these file descriptors. 
*/ -int io_prepare_clients(slurmd_job_t *job); +void io_close_task_fds(slurmd_job_t *job); -/* Notes: - * - * slurmd <-+---> client (e.g. srun, file) - * `---> client - * - * slurmd can handle multiple client connections. Each task writes - * stdout and stderr data to the client and reads stdin and signals - * from the client streams. - * - * I/O objects: - * task stdout: R/0 pipe created by slurmd - * - buffer is null - * - readers list has at least one client reader (may be a file obj) - * - writers list is empty - * - * task stderr: R/O pipe created by slurmd - * - buffer is null - * - readers list has at least one client reader (may be a file obj) - * - writers list is empty - * - * task stdin: W/O pipe created by slurmd - * - circular buffer - * - readers list is empty - * - writers list contains only one client (may be a file obj) - * - * client stdout/in socket: - * - circular buffer for stdout data - * - readers list is one task stdin obj or empty - * - writers list is one task stdout obj - * - * client stderr/sig socket: - * - circular buffer for stderr data - * - readers list is null (data read is converted to signal) - * - writers list is one task stderr obj - * - * stdout/err file obj: - * - circular buffer for stdout/err data - * - readers list is empty - * - writers list is one task stdout/err obj - * - * stdin file obj - * - buffer is null - * - readers list is one or more task stdin obj's - * - writers list is empty - */ +void io_close_all(slurmd_job_t *job); #endif /* !_IO_H */ diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c index b36a561061982a88452758a1d89cde9134d1c8e2..1bb86b5c05108873e8bcc964fad57f71129952df 100644 --- a/src/slurmd/mgr.c +++ b/src/slurmd/mgr.c @@ -367,35 +367,38 @@ _setup_io(slurmd_job_t *job) int rc = 0; struct passwd *spwd = NULL; - /* - * Save current UID/GID - */ + debug2("Entering _setup_io"); + + /* Save current UID/GID */ if (!(spwd = getpwuid(geteuid()))) { error("getpwuid: %m"); return ESLURMD_IO_ERROR; } - if 
(io_spawn_handler(job) < 0) - return ESLURMD_IO_ERROR; - /* - * Initialize log facility to copy errors back to srun - */ - _slurmd_job_log_init(job); - - /* - * Temporarily drop permissions, initialize IO clients - * (open files/connections for IO, etc), then reclaim privileges. + * Temporarily drop permissions, initialize task stdio file + * descriptors (which may be connected to files), then + * reclaim privileges. */ if (_drop_privileges(job->pwd) < 0) return ESLURMD_SET_UID_OR_GID_ERROR; - rc = io_prepare_clients(job); + io_init_tasks_stdio(job); if (_reclaim_privileges(spwd) < 0) error("sete{u/g}id(%lu/%lu): %m", (u_long) spwd->pw_uid, (u_long) spwd->pw_gid); + if (io_thread_start(job) < 0) + return ESLURMD_IO_ERROR; + + /* + * Initialize log facility to copy errors back to srun + */ + _slurmd_job_log_init(job); + + rc = io_client_connect(job); + #ifndef NDEBUG # ifdef PR_SET_DUMPABLE if (prctl(PR_SET_DUMPABLE, 1) < 0) @@ -406,6 +409,7 @@ _setup_io(slurmd_job_t *job) if (rc < 0) return ESLURMD_IO_ERROR; + debug2("Leaving _setup_io"); return SLURM_SUCCESS; } @@ -540,6 +544,8 @@ _job_mgr(slurmd_job_t *job) goto fail2; } + io_close_task_fds(job); + xsignal_block(mgr_sigarray); reattach_job = job; xsignal(SIGHUP, _hup_handler); @@ -580,8 +586,10 @@ _job_mgr(slurmd_job_t *job) /* * Wait for io thread to complete (if there is one) */ - if (!job->spawn_task) + if (!job->spawn_task) { + eio_signal_shutdown(job->eio); _wait_for_io(job); + } job_update_state(job, SLURMD_JOB_COMPLETE); g_slurmd_jobacct_jobstep_terminated(job); @@ -738,7 +746,7 @@ _fork_all_tasks(slurmd_job_t *job) /* * Loop once through tasks looking for all tasks that have exited with * the same exit status (and whose statuses have not been sent back to - * the client) Aggregrate these tasks into a single task exit message. + * the client) Aggregate these tasks into a single task exit message.
* */ static int @@ -1208,7 +1216,7 @@ _slurmd_job_log_init(slurmd_job_t *job) /* Connect slurmd stderr to job's stderr */ if ((!job->spawn_task) && - (dup2(job->task[0]->perr[1], STDERR_FILENO) < 0)) { + (dup2(job->task[0]->stderr, STDERR_FILENO) < 0)) { error("job_log_init: dup2(stderr): %m"); return; } diff --git a/src/slurmd/shm.c b/src/slurmd/shm.c index ec807bba79bed2015a75ef4b61623e50e21ecf4e..4934e2902a7c129195fc66f70cb115b5ca6eed75 100644 --- a/src/slurmd/shm.c +++ b/src/slurmd/shm.c @@ -649,7 +649,7 @@ shm_update_step_addrs(uint32_t jobid, uint32_t stepid, if (!s->io_update) { s->ioaddr = *ioaddr; s->respaddr = *respaddr; - memcpy(s->key.data, keydata, SLURM_IO_KEY_SIZE); + memcpy(s->key.data, keydata, SLURM_CRED_SIGLEN); s->io_update = true; debug3("Going to send shm update signal to %ld", @@ -695,7 +695,7 @@ shm_step_addrs(uint32_t jobid, uint32_t stepid, } else { *ioaddr = s->ioaddr; *respaddr = s->respaddr; - memcpy(key->data, s->key.data, SLURM_IO_KEY_SIZE); + memcpy(key->data, s->key.data, SLURM_CRED_SIGLEN); s->io_update = false; } } else { diff --git a/src/slurmd/slurmd_job.c b/src/slurmd/slurmd_job.c index c9b587af618bfb44174d6443f255bad3689fe416..146698c5126104d899e6ed39959a74f8c6d1f2e7 100644 --- a/src/slurmd/slurmd_job.c +++ b/src/slurmd/slurmd_job.c @@ -142,6 +142,7 @@ job_create(launch_tasks_request_msg_t *msg, slurm_addr *cli_addr) srun_info_t *srun; slurm_addr resp_addr; slurm_addr io_addr; + int i; xassert(msg != NULL); @@ -170,9 +171,19 @@ job_create(launch_tasks_request_msg_t *msg, slurm_addr *cli_addr) job->cwd = xstrdup(msg->cwd); job->env = _array_copy(msg->envc, msg->env); - job->eio = eio_handle_create(); - job->objs = list_create((ListDelF) io_obj_destroy); + job->objs = list_create(NULL); /* FIXME! Needs destructor */ + job->eio = eio_handle_create(job->objs); job->sruns = list_create((ListDelF) _srun_info_destructor); + job->clients = list_create(NULL); /* FIXME! 
Needs destructor */ + job->free_incoming = list_create(NULL); /* FIXME! Needs destructor */ + for (i = 0; i < 10; i++) { + list_enqueue(job->free_incoming, alloc_io_buf()); + } + job->free_outgoing = list_create(NULL); /* FIXME! Needs destructor */ + for (i = 0; i < 10; i++) { + list_enqueue(job->free_outgoing, alloc_io_buf()); + } + job->envtp = xmalloc(sizeof(env_t)); job->envtp->jobid = -1; job->envtp->stepid = -1; @@ -190,10 +201,10 @@ job_create(launch_tasks_request_msg_t *msg, slurm_addr *cli_addr) srun->ofname = xstrdup(msg->ofname); srun->efname = xstrdup(msg->efname); srun->ifname = xstrdup(msg->ifname); + job->buffered_stdio = msg->buffered_stdio; job->argc = msg->argc; job->argv = _array_copy(job->argc, msg->argv); - job->nnodes = msg->nnodes; job->nodeid = msg->srun_node_id; @@ -243,8 +254,8 @@ job_spawn_create(spawn_task_request_msg_t *msg, slurm_addr *cli_addr) job->cwd = xstrdup(msg->cwd); job->env = _array_copy(msg->envc, msg->env); - job->eio = eio_handle_create(); - job->objs = list_create((ListDelF) io_obj_destroy); + job->objs = list_create(NULL); /* Need destructor */ + job->eio = eio_handle_create(job->objs); job->sruns = list_create((ListDelF) _srun_info_destructor); job->envtp = xmalloc(sizeof(env_t)); job->envtp->jobid = -1; @@ -326,8 +337,8 @@ job_batch_job_create(batch_job_launch_msg_t *msg) job->cwd = xstrdup(msg->work_dir); job->env = _array_copy(msg->envc, msg->environment); - job->eio = eio_handle_create(); - job->objs = list_create((ListDelF) io_obj_destroy); + job->objs = list_create(NULL); /* FIXME - Need destructor */ + job->eio = eio_handle_create(job->objs); job->sruns = list_create((ListDelF) _srun_info_destructor); job->envtp = xmalloc(sizeof(env_t)); job->envtp->jobid = -1; @@ -476,14 +487,14 @@ srun_info_create(slurm_cred_t cred, slurm_addr *resp_addr, slurm_addr *ioaddr) slurm_cred_get_signature(cred, &data, &len); - len = len > SLURM_IO_KEY_SIZE ? SLURM_IO_KEY_SIZE : len; + len = len > SLURM_CRED_SIGLEN ?
SLURM_CRED_SIGLEN : len; if (data != NULL) { memcpy((void *) key->data, data, len); - if (len < SLURM_IO_KEY_SIZE) + if (len < SLURM_CRED_SIGLEN) memset( (void *) (key->data + len), 0, - SLURM_IO_KEY_SIZE - len ); + SLURM_CRED_SIGLEN - len); } if (ioaddr != NULL) @@ -518,21 +529,20 @@ task_info_create(int taskid, int gtaskid) slurm_mutex_init(&t->mutex); slurm_mutex_lock(&t->mutex); - t->state = SLURMD_TASK_INIT; - t->id = taskid; - t->gtid = gtaskid; - t->pid = (pid_t) -1; - t->pin[0] = -1; - t->pin[1] = -1; - t->pout[0] = -1; - t->pout[1] = -1; - t->perr[0] = -1; - t->perr[1] = -1; - t->estatus = -1; - t->in = NULL; - t->out = NULL; - t->err = NULL; - t->srun_list = list_create(NULL); + t->state = SLURMD_TASK_INIT; + t->id = taskid; + t->gtid = gtaskid; + t->pid = (pid_t) -1; + t->stdin = -1; + t->to_stdin = -1; + t->stdout = -1; + t->from_stdout = -1; + t->stderr = -1; + t->from_stderr = -1; + t->estatus = -1; + t->in = NULL; + t->out = NULL; + t->err = NULL; slurm_mutex_unlock(&t->mutex); return t; } @@ -542,7 +552,6 @@ void task_info_destroy(slurmd_task_info_t *t) { slurm_mutex_lock(&t->mutex); - list_destroy(t->srun_list); slurm_mutex_unlock(&t->mutex); slurm_mutex_destroy(&t->mutex); xfree(t); diff --git a/src/slurmd/slurmd_job.h b/src/slurmd/slurmd_job.h index e8125f197e68325b5c82d85d7f0f8fd19a61914b..d34c755d698bde359e62cb253d36797eca5043c1 100644 --- a/src/slurmd/slurmd_job.h +++ b/src/slurmd/slurmd_job.h @@ -47,7 +47,7 @@ #endif typedef struct srun_key { - unsigned char data[SLURM_IO_KEY_SIZE]; + unsigned char data[SLURM_CRED_SIGLEN]; } srun_key_t; typedef struct srun_info { @@ -78,24 +78,26 @@ typedef enum job_state { } slurmd_job_state_t; typedef struct task_info { - pthread_mutex_t mutex; /* mutex to protect task state */ - slurmd_task_state_t state; /* task state */ + pthread_mutex_t mutex; /* mutex to protect task state */ + slurmd_task_state_t state; /* task state */ - int id; /* local task id */ - uint32_t gtid; /* global task id */ - pid_t pid; 
/* task pid */ - int pin[2]; /* stdin pipe */ - int pout[2]; /* stdout pipe */ - int perr[2]; /* stderr pipe */ - io_obj_t *in, - *out, /* I/O objects used in IO event loop */ - *err; - - bool esent; /* true if exit status has been sent */ - bool exited; /* true if task has exited */ - int estatus; /* this task's exit status */ - - List srun_list; /* List of srun objs for this task */ + int id; /* local task id */ + uint32_t gtid; /* global task id */ + pid_t pid; /* task pid */ + + int stdin; /* standard input file descriptor */ + int stdout; /* standard output file descriptor */ + int stderr; /* standard error file descriptor */ + int to_stdin; /* write file descriptor for task stdin */ + int from_stdout;/* read file descriptor from task stdout*/ + int from_stderr;/* read file descriptor from task stderr*/ + eio_obj_t *in; /* standard input event IO object */ + eio_obj_t *out; /* standard output event IO object */ + eio_obj_t *err; /* standard error event IO object */ + + bool esent; /* true if exit status has been sent */ + bool exited; /* true if task has exited */ + int estatus; /* this task's exit status */ } slurmd_task_info_t; typedef struct slurmd_job { @@ -121,10 +123,22 @@ typedef struct slurmd_job { time_t timelimit; /* time at which job must stop */ struct passwd *pwd; /* saved passwd struct for user job */ - slurmd_task_info_t **task; /* list of task information pointers */ + slurmd_task_info_t **task; /* array of task information pointers */ eio_t eio; - List objs; /* list of IO objects */ - List sruns; /* List of sruns */ + List objs; /* List of io_obj_t pointers (see eio.h) */ + List sruns; /* List of srun_info_t pointers */ + List clients; /* List of struct client_io_info pointers */ + List free_incoming; /* List of free struct io_buf * for incoming + * traffic. "incoming" means traffic from srun + * to the tasks. 
+ */ + List free_outgoing; /* List of free struct io_buf * for outgoing + * traffic "outgoing" means traffic from the + * tasks to srun. + */ + uint8_t buffered_stdio; /* stdio buffering flag, 1 for line-buffering, + * 0 for no buffering + */ pthread_t ioid; /* pthread id of IO thread */ diff --git a/src/slurmd/task.c b/src/slurmd/task.c index d946ca9f4da7174651387b689546891075b49a38..0d36ec9429a46f34313dd5cfd06f2486c3b32ef5 100644 --- a/src/slurmd/task.c +++ b/src/slurmd/task.c @@ -116,9 +116,9 @@ _cleanup_file_descriptors(slurmd_job_t *j) /* * Ignore errors on close() */ - close(t->pin[1]); - close(t->pout[0]); - close(t->perr[0]); + close(t->to_stdin); + close(t->from_stdout); + close(t->from_stderr); /* read descriptor for task stderr */ } } @@ -199,7 +199,7 @@ exec_task(slurmd_job_t *job, int i, int waitfd) if (job->spawn_task) _setup_spawn_io(job); else - io_prepare_child(job->task[i]); + io_dup_stdio(job->task[i]); execve(job->argv[0], job->argv, job->env); diff --git a/src/srun/io.c b/src/srun/io.c index 730b3a01fca7bb719cabdbdde4a0a015f2d22f2f..cff169cd91a80841082aea1c653c0b507690bb08 100644 --- a/src/srun/io.c +++ b/src/srun/io.c @@ -55,517 +55,692 @@ #include "src/srun/srun_job.h" #include "src/srun/opt.h" -static int fmt_width = 0; - -/* fd_info struct used in poll() loop to map fds back to task number, - * appropriate output type (stdout/err), and original fd - */ -typedef struct fd_info { - int taskid; /* corresponding task id */ - int *fd; /* pointer to fd in job->out/err array */ - FILE *fp; /* fp on which to write output */ - cbuf_t buf; -} fd_info_t; - -static void _accept_io_stream(srun_job_t *job, int i); -static void _bcast_stdin(int fd, srun_job_t *job); -static int _close_stream(int *fd, FILE *out, int tasknum); -static int _do_task_output(int *fd, FILE *out, cbuf_t buf, int tasknum); -static int _do_task_output_poll(fd_info_t *info); -static int _do_task_input(srun_job_t *job, int taskid); -static int _do_task_input_poll(srun_job_t *job, fd_info_t *info); -static inline bool
_io_thr_done(srun_job_t *job); -static int _handle_pollerr(fd_info_t *info); -static ssize_t _readx(int fd, char *buf, size_t maxbytes); -static int _read_io_header(int fd, srun_job_t *job, char *host); -static void _terminate_node_io(int node_inx, srun_job_t *job); -#define _poll_set_rd(_pfd, _fd) do { \ - (_pfd).fd = _fd; \ - (_pfd).events = POLLIN; \ - } while (0) - -#define _poll_set_wr(_pfd, _fd) do { \ - (_pfd).fd = _fd; \ - (_pfd).events = POLLOUT; \ - } while (0) - -#define _poll_rd_isset(pfd) ((pfd).revents & POLLIN ) -#define _poll_wr_isset(pfd) ((pfd).revents & POLLOUT) -#define _poll_err(pfd) ((pfd).revents & POLLERR) -#define _poll_hup(pfd) ((pfd).revents & POLLHUP) - #define MAX_RETRIES 3 -/* True if an EOF needs to be broadcast to all tasks - */ -static bool stdin_got_eof = false; -static bool stdin_open = true; -static uint32_t nbytes = 0; -static uint32_t nwritten = 0; - -static int -_do_task_output_poll(fd_info_t *info) -{ - return _do_task_output(info->fd, info->fp, info->buf, info->taskid); -} +static int fmt_width = 0; -static int -_do_task_input_poll(srun_job_t *job, fd_info_t *info) +static void _handle_io_init_msg(int fd, srun_job_t *job); +static ssize_t _readx(int fd, char *buf, size_t maxbytes); +static int _read_io_init_msg(int fd, srun_job_t *job, char *host); +static int _wid(int n); + +/********************************************************************** + * Listening socket declarations + **********************************************************************/ +static bool _listening_socket_readable(eio_obj_t *obj); +static int _listening_socket_read(eio_obj_t *obj, List objs); + +struct io_operations listening_socket_ops = { + readable: &_listening_socket_readable, + handle_read: &_listening_socket_read +}; + +/********************************************************************** + * IO server socket declarations + **********************************************************************/ +static bool _server_readable(eio_obj_t 
*obj); +static int _server_read(eio_obj_t *obj, List objs); +static bool _server_writable(eio_obj_t *obj); +static int _server_write(eio_obj_t *obj, List objs); + +struct io_operations server_ops = { + readable: &_server_readable, + handle_read: &_server_read, + writable: &_server_writable, + handle_write: &_server_write +}; + +struct server_io_info { + srun_job_t *job; + + /* incoming variables */ + struct slurm_io_header header; + struct io_buf *in_msg; + int32_t in_remaining; + bool in_eof; + + /* outgoing variables */ + List msg_queue; + struct io_buf *out_msg; + int32_t out_remaining; + bool out_eof; +}; + +/********************************************************************** + * File write declarations + **********************************************************************/ +static bool _file_writable(eio_obj_t *obj); +static int _file_write(eio_obj_t *obj, List objs); + +struct io_operations file_write_ops = { + writable: &_file_writable, + handle_write: &_file_write, +}; + +struct file_write_info { + srun_job_t *job; + + /* outgoing variables */ + List msg_queue; + struct io_buf *out_msg; + int32_t out_remaining; + bool eof; +}; + +/********************************************************************** + * File read declarations + **********************************************************************/ +static bool _file_readable(eio_obj_t *obj); +static int _file_read(eio_obj_t *obj, List objs); + +struct io_operations file_read_ops = { + readable: &_file_readable, + handle_read: &_file_read, +}; + +struct file_read_info { + srun_job_t *job; + + /* header contains destination of file input */ + struct slurm_io_header header; + + bool eof; +}; + + +/********************************************************************** + * Listening socket functions + **********************************************************************/ +static bool +_listening_socket_readable(eio_obj_t *obj) { - return _do_task_input(job, info->taskid); + debug3("Called 
_listening_socket_readable"); + if (obj->shutdown == true) { + debug2(" false, shutdown"); + return false; + } + return true; } static int -_handle_pollerr(fd_info_t *info) +_listening_socket_read(eio_obj_t *obj, List objs) { - int fd = *info->fd; - int err; - socklen_t size = sizeof(int); - if (getsockopt(fd, SOL_SOCKET, SO_ERROR, (void *)&err, &size) < 0) - error("_handle_error_poll: getsockopt: %m"); - - if (err > 0) - debug3("%d: poll error on fd %d: %s", - info->taskid, fd, slurm_strerror(err)); - else - debug3("%d: fd %d got hangup", info->taskid, fd); + srun_job_t *job = (srun_job_t *)obj->arg; - /* _do_task_output() should read EOF and close output - * stream if necessary. This way, any remaining data - * is read. - */ - _do_task_output(info->fd, info->fp, info->buf, info->taskid); - - return 0; + debug3("Called _listening_socket_read"); + _handle_io_init_msg(obj->fd, job); } static void -_set_iofds_nonblocking(srun_job_t *job) +_set_listensocks_nonblocking(srun_job_t *job) { int i; - for (i = 0; i < job->niofds; i++) - fd_set_nonblocking(job->iofd[i]); - /* - * Do not do this. Setting stdin nonblocking has the side - * effect of setting stdout/stderr nonblocking, which is - * not what we want. We should have similar functionality - * with blocking stdin. 
- */ - /* fd_set_nonblocking(job->stdinfd); */ + for (i = 0; i < job->num_listen; i++) + fd_set_nonblocking(job->listensock[i]); } -static void -_update_task_io_state(srun_job_t *job, int taskid) -{ - pipe_enum_t pipe_enum = PIPE_TASK_STATE; - - slurm_mutex_lock(&job->task_mutex); - if (job->task_state[taskid] == SRUN_TASK_IO_WAIT) { - job->task_state[taskid] = SRUN_TASK_EXITED; - if(message_thread) { - write(job->forked_msg->par_msg->msg_pipe[1], - &pipe_enum,sizeof(int)); - write(job->forked_msg->par_msg->msg_pipe[1], - &taskid,sizeof(int)); - write(job->forked_msg->par_msg->msg_pipe[1], - &job->task_state[taskid],sizeof(int)); - } - } - slurm_mutex_unlock(&job->task_mutex); -} - - -static void -_do_output_line(cbuf_t buf, FILE *out, int tasknum) +/********************************************************************** + * IO server socket functions + **********************************************************************/ +static eio_obj_t * +_create_server_eio_obj(int fd, srun_job_t *job) { - int len = 0; - int tot = 0; - char line[4096]; + struct server_io_info *info = NULL; + eio_obj_t *eio = NULL; - while ((len = cbuf_read_line(buf, line, sizeof(line), 1))) { - int n = 0; + info = (struct server_io_info *)xmalloc(sizeof(struct server_io_info)); + info->job = job; + info->in_msg = NULL; + info->in_remaining = 0; + info->in_eof = false; + info->msg_queue = list_create(NULL); /* FIXME! Add destructor */ + info->out_msg = NULL; + info->out_remaining = 0; + info->out_eof = false; - if (opt.labelio) - fprintf(out, "%0*d: ", fmt_width, tasknum); + eio = eio_obj_create(fd, &server_ops, (void *)info); - if ((n = fprintf(out, "%s", line)) < len) { - int rewind = (n < 0) ? 
len : len-n; - error("Rewinding %d of %d bytes: %m", rewind, len); - cbuf_rewind(buf, rewind); - if (ferror(out)) - clearerr(out); - goto done; - } else - tot += n; - } - done: - if(fflush(out)) { - error ("fflush error: %m"); - if (ferror(out)) - clearerr(out); - } - - debug3("do_output: [%d %d %d]", tot, cbuf_used(buf), cbuf_size(buf)); - - nwritten += tot; - return; + return eio; } -static void -_do_output(cbuf_t buf, FILE *out, int tasknum) -{ - if (opt.unbuffered) - cbuf_read_to_fd(buf, fileno(out), -1); - else - _do_output_line(buf, out, tasknum); -} - -static void -_flush_io(srun_job_t *job) +static bool +_server_readable(eio_obj_t *obj) { + struct server_io_info *s = (struct server_io_info *) obj->arg; + struct file_write_info *fout, *ferr; int i; - debug3("flushing all io"); - for (i = 0; i < opt.nprocs; i++) { - /* - * Ensure remaining output is written - */ - if (cbuf_used(job->outbuf[i])) - cbuf_write(job->outbuf[i], "\n", 1, NULL); - if (cbuf_used(job->errbuf[i])) - cbuf_write(job->errbuf[i], "\n", 1, NULL); + debug2("Called _server_readable"); - _do_output(job->outbuf[i], job->outstream, i); - if (job->out[i] != IO_DONE) - _close_stream(&job->out[i], stdout, i); + if (list_is_empty(s->job->free_outgoing)) { + debug3(" false, free_io_buf is empty"); + return false; + } - _do_output(job->errbuf[i], job->errstream, i); - if (job->err[i] != IO_DONE) - _close_stream(&job->err[i], stderr, i); + if (s->in_eof) { + debug3(" false, eof"); + return false; } - debug3("Read %dB from tasks, wrote %dB", nbytes, nwritten); -} + if (s->job->stdout) { + fout = (struct file_write_info *)s->job->stdout->arg; + if (fout->eof == false) { + debug3(" stdout no eof"); + return true; + } + } -static int -_initial_fd_state (io_filename_t *f, int task) -{ - if (f->type == IO_ALL) - return (WAITING_FOR_IO); - if (f->type == IO_ONE && f->taskid == task) - return (WAITING_FOR_IO); + if (s->job->stderr) { + ferr = (struct file_write_info *)s->job->stderr->arg; + if (ferr->eof == 
false) { + debug3(" stderr no eof"); + return true; + } + } - return (IO_DONE); + debug3(" false"); + return false; } -static void -_io_thr_init(srun_job_t *job, struct pollfd *fds) +static int +_server_read(eio_obj_t *obj, List objs) { - int i; - sigset_t set; + struct server_io_info *s = (struct server_io_info *) obj->arg; + void *buf; + int n; - xassert(job != NULL); + debug3("Entering _server_read"); + if (s->in_msg == NULL) { + s->in_msg = list_dequeue(s->job->free_outgoing); + if (s->in_msg == NULL) { + debug("List free_outgoing is empty!"); + return SLURM_ERROR; + } - /* Block SIGHUP because it is interrupting file stream functions - * (fprintf, fflush, etc.) and causing data loss on stdout. - */ - sigemptyset(&set); - sigaddset(&set, SIGHUP); - pthread_sigmask(SIG_BLOCK, &set, NULL); + n = io_hdr_read_fd(obj->fd, &s->header); + if (n == 0) { /* got eof on socket read */ + debug3( "got eof on _server_read header"); + s->in_eof = true; + list_enqueue(s->job->free_outgoing, s->in_msg); + s->in_msg = NULL; + return SLURM_SUCCESS; + } + s->in_remaining = s->header.length; + s->in_msg->length = s->header.length; + s->in_msg->header = s->header; + } - _set_iofds_nonblocking(job); + /* + * Read the body + */ + if (s->header.length != 0) { + buf = s->in_msg->data + (s->in_msg->length - s->in_remaining); + again: + if ((n = read(obj->fd, buf, s->in_remaining)) < 0) { + if (errno == EINTR) + goto again; + /* FIXME handle error */ + return SLURM_ERROR; + } + if (n == 0) { /* got eof */ + debug3( "got eof on _server_read body"); + s->in_eof = true; + list_enqueue(s->job->free_outgoing, s->in_msg); + s->in_msg = NULL; + return SLURM_SUCCESS; + } - for (i = 0; i < opt.nprocs; i++) { - int instate = _initial_fd_state (job->ifname, i); - job->out[i] = _initial_fd_state (job->ofname, i); - job->err[i] = _initial_fd_state (job->efname, i); + *(char *)(buf + n) = '\0'; + debug3("\"%s\"", buf); + s->in_remaining -= n; + if (s->in_remaining > 0) + return SLURM_SUCCESS; + } - if 
(job->out[i] != WAITING_FOR_IO) - job->out[i] = instate; - + /* + * Route the message to the proper output + */ + { + eio_obj_t *obj; + struct file_write_info *info; + + s->in_msg->ref_count = 1; + if (s->in_msg->header.type == SLURM_IO_STDOUT) + obj = s->job->stdout; + else + obj = s->job->stderr; + info = (struct file_write_info *) obj->arg; + list_enqueue(info->msg_queue, s->in_msg); + + s->in_msg = NULL; } - for (i = 0; i < job->niofds; i++) - _poll_set_rd(fds[i], job->iofd[i]); + return SLURM_SUCCESS; } -static void -_fd_info_init(fd_info_t *info, int taskid, int *pfd, FILE *fp, cbuf_t buf) +static bool +_server_writable(eio_obj_t *obj) { - info->taskid = taskid; - info->fd = pfd; - info->fp = fp; - info->buf = buf; -} + struct server_io_info *s = (struct server_io_info *) obj->arg; -static int -_stdin_buffer_space (srun_job_t *job) -{ - int i, nfree, len = 0; - for (i = 0; i < opt.nprocs; i++) { - if ((nfree = cbuf_free (job->inbuf[i])) == 0) - return (0); - if ((len == 0) || (nfree < len)) - len = nfree; - } - return (len); + debug3("Called _server_writable"); + + if (s->out_eof) { + debug3(" false, eof"); + return false; + } + if (obj->shutdown == true) { + debug3(" false, shutdown"); + return false; + } + if (s->out_msg != NULL + || !list_is_empty(s->msg_queue)) { + debug3(" true, s->msg_queue length = %d", + list_count(s->msg_queue)); + return true; + } + + debug3(" false"); + return false; } -static nfds_t -_setup_pollfds(srun_job_t *job, struct pollfd *fds, fd_info_t *map) +static int +_server_write(eio_obj_t *obj, List objs) { - int eofcnt = 0; - int i; - nfds_t nfds = job->niofds; /* already have n ioport fds + stdin */ + struct server_io_info *s = (struct server_io_info *) obj->arg; + void *buf; + int n; - /* set up reader for the io thread signalling pipe */ - if (job->io_thr_pipe[0] >= 0) { - _poll_set_rd(fds[nfds], job->io_thr_pipe[0]); - nfds++; - } + debug2("Entering _server_write"); - if ((job->stdinfd >= 0) && stdin_open && 
_stdin_buffer_space(job)) { - _poll_set_rd(fds[nfds], job->stdinfd); - nfds++; + /* + * If we aren't already in the middle of sending a message, get the + * next message from the queue. + */ + if (s->out_msg == NULL) { + s->out_msg = list_dequeue(s->msg_queue); + if (s->out_msg == NULL) { + debug3("_server_write: nothing in the queue"); + return SLURM_SUCCESS; + } + debug3(" dequeue successful, s->out_msg->length = %d", s->out_msg->length); + s->out_remaining = s->out_msg->length; } - for (i = 0; i < opt.nprocs; i++) { - - if (job->task_state[i] == SRUN_TASK_FAILED) { - job->out[i] = IO_DONE; - if ((job->err[i] == WAITING_FOR_IO)) - job->err[i] = IO_DONE; + debug3(" s->out_remaining = %d", s->out_remaining); + + /* + * Write message to socket. + */ + buf = s->out_msg->data + (s->out_msg->length - s->out_remaining); +again: + if ((n = write(obj->fd, buf, s->out_remaining)) < 0) { + if (errno == EINTR) { + goto again; + } else if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) { + debug3(" got EAGAIN in _server_write"); + return SLURM_SUCCESS; + } else { + error("_server_write write failed: %m"); + s->out_eof = true; + /* FIXME - perhaps we should free the message here? */ + return SLURM_ERROR; } + } + debug3("Wrote %d bytes to socket", n); + s->out_remaining -= n; + if (s->out_remaining > 0) + return SLURM_SUCCESS; - if (job->out[i] >= 0) { + /* + * Free the message and prepare to send the next one. 
+ */ + s->out_msg->ref_count--; + if (s->out_msg->ref_count == 0) + list_enqueue(s->job->free_incoming, s->out_msg); + else + debug3(" Could not free msg!!"); + s->out_msg = NULL; - _poll_set_rd(fds[nfds], job->out[i]); + return SLURM_SUCCESS; - if ( (cbuf_used(job->inbuf[i]) > 0) - || (stdin_got_eof && !job->stdin_eof[i])) - fds[nfds].events |= POLLOUT; +} - _fd_info_init( map + nfds, i, &job->out[i], - job->outstream, job->outbuf[i] ); - nfds++; - } +/********************************************************************** + * File write functions + **********************************************************************/ +extern eio_obj_t * +create_file_write_eio_obj(int fd, srun_job_t *job) +{ + struct file_write_info *info = NULL; + eio_obj_t *eio = NULL; - if (job->err[i] >= 0) { - _poll_set_rd(fds[nfds], job->err[i]); + info = (struct file_write_info *) + xmalloc(sizeof(struct file_write_info)); + info->job = job; + info->msg_queue = list_create(NULL); /* FIXME! Add destructor */ + info->out_msg = NULL; + info->out_remaining = 0; + info->eof = false; - _fd_info_init( map + nfds, i, &job->err[i], - job->errstream, job->errbuf[i] ); - nfds++; - } + eio = eio_obj_create(fd, &file_write_ops, (void *)info); + return eio; +} - if ( (job->out[i] == IO_DONE) - && (job->err[i] == IO_DONE) ) { - eofcnt++; - _update_task_io_state(job, i); - } +static void _write_label(int fd, int taskid) +{ + char buf[16]; + snprintf(buf, 16, "%0*d: ", fmt_width, taskid); + /* FIXME - Need to handle return code */ + write(fd, buf, fmt_width+2); +} - } +static void _write_newline(int fd) +{ + int n; - /* exit if we have received EOF on all streams */ - if (eofcnt) { - if ((eofcnt == opt.nprocs) - || (slurm_mpi_single_task_per_node() - && (eofcnt == job->nhosts))) { - debug("got EOF on all streams"); - _flush_io(job); - pthread_exit(0); - } + debug2("Called _write_newline"); +again: + if ((n = write(fd, "\n", 1)) < 0) { + if (errno == EINTR + || errno == EAGAIN + || errno == EWOULDBLOCK) { + 
goto again; + } + /* FIXME handle error */ } - return nfds; } -static void * -_io_thr_poll(void *job_arg) +/* + * Blocks until write is complete, regardless of the file + * descriptor being in non-blocking mode. + */ +static int _write_line(int fd, void *buf, int len) { - int i, rc; - srun_job_t *job = (srun_job_t *) job_arg; - int numfds = (opt.nprocs*2) + job->niofds + 3; - nfds_t nfds = 0; - struct pollfd fds[numfds]; - fd_info_t map[numfds]; /* map fd in pollfd array to fd info */ + int n; + int left = len; - xassert(job != NULL); - - debug3("IO thread pid = %lu", (unsigned long) getpid()); - - _io_thr_init(job, fds); - - while (!_io_thr_done(job)) { - - nfds = _setup_pollfds(job, fds, map); - if ((rc = poll(fds, nfds, -1)) <= 0) { - switch(errno) { - case EINTR: - case EAGAIN: - continue; - case ENOMEM: - case EFAULT: - fatal("poll: %m"); - break; - default: - error("poll: %m. trying again."); - continue; + debug2("Called _write_line"); + while (left > 0) { + again: + if ((n = write(fd, buf, left)) < 0) { + if (errno == EINTR) + goto again; + if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) { + debug3(" got EAGAIN in _write_line"); + goto again; } + /* FIXME handle error */ + return -1; } + left -= n; + } + + return len; +} +static int _write_msg(int fd, void *buf, int len, int taskid) +{ + void *start; + void *end; + int line_len; + int rc; + + /* FIXME - should loop here, write as many lines as in the message */ + start = buf; + end = memchr(start, '\n', len); + if (opt.labelio) + _write_label(fd, taskid); + if (end == NULL) { /* no newline found */ + rc = _write_line(fd, start, len); + if (opt.labelio) + _write_newline(fd); + } else { + line_len = (int)(end - start) + 1; + rc = _write_line(fd, start, line_len); + } - for (i = 0; i < job->niofds; i++) { - if (fds[i].revents) { - if (_poll_err(fds[i])) - error("poll error on io fd %d", i); - else - _accept_io_stream(job, i); - } - } + return rc; +} - /* Check for wake-up signal from other srun pthreads */ - 
if (fds[i].fd == job->io_thr_pipe[0]) { - if ((job->io_thr_pipe[0] >=0) - && fds[i].revents) { - char c; - int n; - debug3("I/O thread received wake-up message"); - n = read(job->io_thr_pipe[0], &c, 1); - if (n < 0) { - error("Unable to read from io_thr_pipe: %m"); - } else if (n == 0) { - close(job->io_thr_pipe[0]); - job->io_thr_pipe[0] = IO_DONE; - } - } - ++i; - } +static bool _file_writable(eio_obj_t *obj) +{ + struct file_write_info *info = (struct file_write_info *) obj->arg; - if ( (fds[i].fd == job->stdinfd) - && (job->stdinfd >= 0) - && stdin_open - && fds[i].revents ) { - _bcast_stdin(job->stdinfd, job); - ++i; - } + debug2("Called _file_writable"); + if (info->out_msg != NULL + || !list_is_empty(info->msg_queue)) + return true; - for ( ; i < nfds; i++) { - unsigned short revents = fds[i].revents; - xassert(!(revents & POLLNVAL)); - if ((revents & POLLERR) || (revents & POLLHUP)) - _handle_pollerr(&map[i]); + debug3(" false"); + debug3(" eof is %s", info->eof ? "true" : "false"); + return false; +} - if ((revents & POLLIN) && (*map[i].fd >= 0)) - _do_task_output_poll(&map[i]); +static int _file_write(eio_obj_t *obj, List objs) +{ + struct file_write_info *info = (struct file_write_info *) obj->arg; + void *ptr; + int n; - if ((revents & POLLOUT) && (*map[i].fd >= 0)) - _do_task_input_poll(job, &map[i]); + debug2("Entering _file_write"); + /* + * If we aren't already in the middle of sending a message, get the + * next message from the queue. + */ + if (info->out_msg == NULL) { + info->out_msg = list_dequeue(info->msg_queue); + if (info->out_msg == NULL) { + debug3("_file_write: nothing in the queue"); + return SLURM_SUCCESS; + } + info->out_remaining = info->out_msg->length; + if (info->out_msg->length == 0) /* eof */ + info->eof = true; + } + + /* + * Write message to file. 
+ */ + if (!info->eof) { + ptr = info->out_msg->data + (info->out_msg->length + - info->out_remaining); + if ((n = _write_msg(obj->fd, ptr, + info->out_remaining, + info->out_msg->header.gtaskid)) < 0) { + return SLURM_ERROR; } + debug3(" wrote %d bytes", n); + info->out_remaining -= n; + if (info->out_remaining > 0) + return SLURM_SUCCESS; } - debug("IO thread exiting"); + /* + * Free the message. + */ + info->out_msg->ref_count--; + if (info->out_msg->ref_count == 0) + list_enqueue(info->job->free_outgoing, info->out_msg); + info->out_msg = NULL; + debug2("Leaving _file_write"); - return NULL; + return SLURM_SUCCESS; } -static inline bool -_io_thr_done(srun_job_t *job) +/********************************************************************** + * File read functions + **********************************************************************/ +extern eio_obj_t * +create_file_read_eio_obj(int fd, srun_job_t *job, + uint16_t type, uint16_t gtaskid) { - bool retval; - slurm_mutex_lock(&job->state_mutex); - retval = (job->state >= SRUN_JOB_FORCETERM); - slurm_mutex_unlock(&job->state_mutex); - return retval; + struct file_read_info *info = NULL; + eio_obj_t *eio = NULL; + + info = (struct file_read_info *) + xmalloc(sizeof(struct file_read_info)); + info->job = job; + info->header.type = type; + info->header.gtaskid = gtaskid; + /* FIXME! 
Need to set ltaskid based on gtaskid */ + info->header.ltaskid = (uint16_t)-1; + info->eof = false; + + eio = eio_obj_create(fd, &file_read_ops, (void *)info); + + return eio; } -static int -_stdin_open(char *filename) +static bool _file_readable(eio_obj_t *obj) { - int fd; - int flags = O_RDONLY; - - xassert(filename != NULL); + struct file_read_info *info = (struct file_read_info *) obj->arg; - if ((fd = open(filename, flags, 0644)) < 0) { - error ("Unable to open `%s' for stdin: %m", filename); - return -1; + debug2("Called _file_readable"); + + if (info->job->ioservers_ready < info->job->nhosts) { + debug3(" false, all ioservers not yet initialized"); + return false; + } + + if (info->eof) { + debug3(" false, eof"); + return false; } - fd_set_nonblocking(fd); - fd_set_close_on_exec(fd); - return fd; + if (obj->shutdown == true) { + debug3(" false, shutdown"); + close(obj->fd); + obj->fd = -1; + info->eof = true; + return false; + } + if (!list_is_empty(info->job->free_incoming)) + return true; + + debug3(" false"); + return false; } -static FILE * -_fopen(char *filename) +static int _file_read(eio_obj_t *obj, List objs) { - FILE *fp; + struct file_read_info *info = (struct file_read_info *) obj->arg; + struct io_buf *msg; + io_hdr_t header; + void *ptr; + Buf packbuf; + int len; - xassert(filename != NULL); + debug2("Entering _file_read"); + msg = list_dequeue(info->job->free_incoming); + if (msg == NULL) { + debug3(" List free_incoming is empty, no file read"); + return SLURM_SUCCESS; + } - if (!(fp = fopen(filename, "w"))) - error ("Unable to open `%s' for writing: %m", filename); + ptr = msg->data + io_hdr_packed_size(); - return fp; -} +again: + if ((len = read(obj->fd, ptr, MAX_MSG_LEN)) < 0) { + if (errno == EINTR) + goto again; + if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) { + error("_file_read returned EAGAIN"); + goto again; + } + } + if (len == 0) { /* got eof */ + debug3("got eof on _file_read"); + info->eof = true; + /* send eof message, 
message with payload length 0 */ + } -static int -_is_local_file (io_filename_t *fname) -{ - if (fname->name == NULL) - return (0); + debug3(" read %d bytes from file", len); + /* + * Pack header and build msg + */ + header = info->header; + header.length = len; + packbuf = create_buf(msg->data, io_hdr_packed_size()); + io_hdr_pack(&header, packbuf); + msg->length = io_hdr_packed_size() + header.length; + msg->ref_count = 0; /* make certain it is initialized */ + /* free the Buf packbuf, but not the memory to which it points */ + packbuf->head = NULL; + free_buf(packbuf); + debug3(" msg->length = %d", msg->length); - return ((fname->type != IO_PER_TASK) && (fname->type != IO_ONE)); + /* + * Route the message to the correct IO servers + */ + if (header.type == SLURM_IO_ALLSTDIN) { + int i; + struct server_io_info *server; + for (i = 0; i < info->job->nhosts; i++) { + msg->ref_count++; + if (info->job->ioserver[i] == NULL) + fatal("ioserver stream not yet initialized"); + server = info->job->ioserver[i]->arg; + list_enqueue(server->msg_queue, msg); + } + } else if (header.type == SLURM_IO_STDIN) { + fatal("Not yet implemented"); +#if 0 + int nodeid; + struct server_io_info *server; + msg->ref_count = 1; + nodeid = info->job->taskid_to_nodeid[header.gtaskid]; + server = info->job->ioserver[nodeid]->arg; + list_enqueue(server->msg_queue, msg); +#endif + } else { + fatal("Unsupported header.type"); + } + msg = NULL; + return SLURM_SUCCESS; } -int -open_streams(srun_job_t *job) -{ - if (_is_local_file (job->ifname)) - job->stdinfd = _stdin_open(job->ifname->name); - else - job->stdinfd = STDIN_FILENO; +/********************************************************************** + * General fuctions + **********************************************************************/ - if (_is_local_file (job->ofname)) - job->outstream = _fopen(job->ofname->name); - else - job->outstream = stdout; +static void * +_io_thr_internal(void *job_arg) +{ + srun_job_t *job = (srun_job_t *) 
job_arg; + sigset_t set; - if (_is_local_file (job->efname)) - job->errstream = _fopen(job->efname->name); - else - job->errstream = stderr; + xassert(job != NULL); - if (!job->outstream || !job->errstream || (job->stdinfd < 0)) - return -1; + debug3("IO thread pid = %lu", (unsigned long) getpid()); - /* - * Turn off buffering of output stream, since we're doing it - * with our own buffers. (Also, stdio buffering seems to - * causing some problems with loss of output) + /* Block SIGHUP because it is interrupting file stream functions + * (fprintf, fflush, etc.) and causing data loss on stdout. */ - /* setvbuf(job->outstream, NULL, _IONBF, 0); */ + sigemptyset(&set); + sigaddset(&set, SIGHUP); + pthread_sigmask(SIG_BLOCK, &set, NULL); - return 0; -} + _set_listensocks_nonblocking(job); + /* start the eio engine */ + eio_handle_mainloop(job->eio); -void * -io_thr(void *arg) -{ - return _io_thr_poll(arg); + debug("IO thread exiting"); + + return NULL; } -static int -_wid(int n) +static eio_obj_t * +_create_listensock_eio(int fd, srun_job_t *job) { - int width = 1; - n--; /* For zero origin */ - while (n /= 10) - width++; - return width; + eio_obj_t *eio = NULL; + + eio = eio_obj_create(fd, &listening_socket_ops, (void *)job); + + return eio; } int @@ -577,26 +752,26 @@ io_thr_create(srun_job_t *job) if (opt.labelio) fmt_width = _wid(opt.nprocs); - for (i = 0; i < job->niofds; i++) { - if (net_stream_listen(&job->iofd[i], &job->ioport[i]) < 0) - fatal("unable to initialize stdio server port: %m"); - debug("initialized stdio server port %d\n", - ntohs(job->ioport[i])); - net_set_low_water(job->iofd[i], 140); + for (i = 0; i < job->num_listen; i++) { + eio_obj_t *obj; + + if (net_stream_listen(&job->listensock[i], + &job->listenport[i]) < 0) + fatal("unable to initialize stdio listen socket: %m"); + debug("initialized stdio listening socket, port %d\n", + ntohs(job->listenport[i])); + /*net_set_low_water(job->listensock[i], 140);*/ + obj = 
_create_listensock_eio(job->listensock[i], job); + list_enqueue(job->eio_objs, obj); } - if (open_streams(job) < 0) { - return SLURM_ERROR; - } + /* FIXME - Need to open files here (or perhaps earlier) */ xsignal(SIGTTIN, SIG_IGN); - if (pipe(job->io_thr_pipe) < 0) - error("io_thr_create: pipe: %m"); - slurm_attr_init(&attr); - while ((errno = pthread_create(&job->ioid, &attr, &io_thr, - (void *) job))) { + while (errno = pthread_create(&job->ioid, &attr, + &_io_thr_internal, (void *) job)) { if (++retries > MAX_RETRIES) { error ("pthread_create error %m"); return SLURM_ERROR; @@ -608,81 +783,64 @@ io_thr_create(srun_job_t *job) return SLURM_SUCCESS; } -static bool -_is_fd_ready(int fd) -{ - struct pollfd pfd[1]; - int rc; - - pfd[0].fd = fd; - pfd[0].events = POLLIN; - - rc = poll(pfd, 1, 10); - - return ((rc == 1) && (pfd[0].revents & POLLIN)); -} - - static int -_read_io_header(int fd, srun_job_t *job, char *host) +_read_io_init_msg(int fd, srun_job_t *job, char *host) { - int size = io_hdr_packed_size(); - cbuf_t cb = cbuf_create(size, size); - char *key = NULL; - int len = 0; - io_hdr_t hdr; - - if (cbuf_write_from_fd(cb, fd, size, NULL) < 0) { - error ("Bad stream header write: %m"); - goto fail; - } + struct slurm_io_init_msg msg; + char *sig; + int siglen; - if (io_hdr_read_cb(cb, &hdr) < 0) { - error ("Unable to unpack io header: %m"); + if (io_init_msg_read_from_fd(fd, &msg) != SLURM_SUCCESS) { + error("failed reading io init message"); goto fail; } - - if (slurm_cred_get_signature(job->cred, &key, &len) < 0) { + if (slurm_cred_get_signature(job->cred, &sig, &siglen) < 0) { error ("Couldn't get existing cred signature"); goto fail; } - - if (io_hdr_validate(&hdr, key, len) < 0) /* check key */ - goto fail; - - /* - * validate reality of hdr.taskid - */ - if ((hdr.taskid < 0) || (hdr.taskid >= opt.nprocs)) { - error ("Invalid taskid %d from %s", hdr.taskid, host); + if (io_init_msg_validate(&msg, sig) < 0) { + goto fail; + } + if (msg.nodeid >= 
job->nhosts) { + error ("Invalid nodeid %d from %s", msg.nodeid, host); goto fail; } + debug2("Validated IO connection from %s, node rank %u, sd=%d", + host, msg.nodeid, fd); + + net_set_low_water(fd, 1); + job->ioserver[msg.nodeid] = _create_server_eio_obj(fd, job); + list_enqueue(job->eio_objs, job->ioserver[msg.nodeid]); + job->ioservers_ready++; - if (hdr.type == SLURM_IO_STDOUT) - job->out[hdr.taskid] = fd; - else - job->err[hdr.taskid] = fd; - - debug2("accepted %s connection from %s task %u, sd=%d", - (hdr.type == SLURM_IO_STDERR ? "stderr" : "stdout"), - host, hdr.taskid, fd ); - - cbuf_destroy(cb); return SLURM_SUCCESS; fail: - cbuf_destroy(cb); close(fd); return SLURM_ERROR; } +static bool +_is_fd_ready(int fd) +{ + struct pollfd pfd[1]; + int rc; + + pfd[0].fd = fd; + pfd[0].events = POLLIN; + + rc = poll(pfd, 1, 10); + + return ((rc == 1) && (pfd[0].revents & POLLIN)); +} + + static void -_accept_io_stream(srun_job_t *job, int i) +_handle_io_init_msg(int fd, srun_job_t *job) { int j; - int fd = job->iofd[i]; - debug2("Activity on IO server port %d fd %d", i, fd); + debug2("Activity on IO server socket %d", fd); for (j = 0; j < 15; j++) { int sd; @@ -690,7 +848,7 @@ _accept_io_stream(srun_job_t *job, int i) struct sockaddr_in *sin; socklen_t size = sizeof(addr); char buf[INET_ADDRSTRLEN]; - + /* * Return early if fd is not now ready */ @@ -727,7 +885,7 @@ _accept_io_stream(srun_job_t *job, int i) /* * Read IO header and update job structure appropriately */ - if (_read_io_header(sd, job, buf) < 0) + if (_read_io_init_msg(sd, job, buf) < 0) continue; fd_set_nonblocking(sd); @@ -735,79 +893,6 @@ _accept_io_stream(srun_job_t *job, int i) } -static int -_close_stream(int *fd, FILE *out, int tasknum) -{ - int retval; - debug2("%d: <%s disconnected>", tasknum, - out == stdout ? 
"stdout" : "stderr"); - retval = shutdown(*fd, SHUT_RDWR); - if ((retval >= 0) || (errno != EBADF)) - close(*fd); - *fd = IO_DONE; - return retval; -} - -static int -_do_task_output(int *fd, FILE *out, cbuf_t buf, int tasknum) -{ - int len = 0; - int dropped = 0; - - again: - if ((len = cbuf_write_from_fd(buf, *fd, -1, &dropped)) < 0) { - - /* - * If output buffer is full, flush all output to - * output stream - */ - if (errno == ENOSPC) { - cbuf_read_to_fd(buf, fileno(out), -1); - goto again; - } - - if (errno == EAGAIN) - return 0; - - error("Error task %d IO: %m", tasknum); - _close_stream(fd, out, tasknum); - return len; - - } else if (len == 0) { - _close_stream(fd, out, tasknum); - return len; - } - - nbytes += len; - - _do_output(buf, out, tasknum); - - return len; -} - -static int -_do_task_input(srun_job_t *job, int taskid) -{ - int len = 0; - cbuf_t buf = job->inbuf[taskid]; - int fd = job->out[taskid]; - - if ( stdin_got_eof - && !job->stdin_eof[taskid] - && (cbuf_used(buf) == 0) ) { - job->stdin_eof[taskid] = true; - shutdown(job->out[taskid], SHUT_WR); - return 0; - } - - if ((len = cbuf_read_to_fd(buf, fd, -1)) < 0) - error ("writing stdin data: %m"); - - debug3("wrote %d bytes to task %d stdin", len, taskid); - - return len; -} - static ssize_t _readx(int fd, char *buf, size_t maxbytes) { @@ -826,94 +911,6 @@ _readx(int fd, char *buf, size_t maxbytes) } -static void -_write_all(srun_job_t *job, cbuf_t cb, char *buf, size_t len, int taskid) -{ - int n = 0; - int dropped = 0; - - again: - n = cbuf_write(cb, buf, len, &dropped); - if ((n < len) && (job->out[taskid] >= 0)) { - error("cbuf_write returned %d", n); - _do_task_input(job, taskid); - goto again; - } - - if (dropped) - error ("Dropped %d bytes stdin data", dropped); -} - -static void -_close_stdin(srun_job_t *j) -{ - close(j->stdinfd); - j->stdinfd = IO_DONE; - stdin_got_eof = true; - stdin_open = false; -} - -static void -_bcast_stdin(int fd, srun_job_t *job) -{ - int i; - char buf[4096]; - 
ssize_t len = sizeof(buf); - ssize_t n = 0; - - if (job->ifname->type == IO_ONE) { - i = job->ifname->taskid; - if (cbuf_free(job->inbuf[i]) < len) - len = cbuf_free(job->inbuf[i]); - } else { - for (i = 0; i < opt.nprocs; i++) { - if (cbuf_free(job->inbuf[i]) < len) - len = cbuf_free(job->inbuf[i]); - } - } - - if (len == 0) - return; - - if ((n = _readx(fd, buf, len)) < 0) { - if (errno == EIO) { - stdin_open = false; - debug2("disabling stdin"); - } else if (errno != EINTR) - error("error reading stdin. %m"); - return; - } - - if (n == 0) { - _close_stdin(job); - return; - } - - if (job->ifname->type == IO_ONE) { - i = job->ifname->taskid; - _write_all(job, job->inbuf[i], buf, n, i); - } else { - for (i = 0; i < opt.nprocs; i++) - _write_all(job, job->inbuf[i], buf, n, i); - } - - return; -} - - -/* - * io_thr_wake - Wake the I/O thread if it is blocking in poll(). - */ -void -io_thr_wake(srun_job_t *job) -{ - char c; - - debug3("Sending wake-up message to the I/O thread."); - if (write(job->io_thr_pipe[1], &c, 1) == -1) - error("Failed sending wakeup signal to io thread: %m"); -} - /* * io_node_fail - Some nodes have failed. Identify affected I/O streams. * Flag them as done and signal the I/O thread. 
@@ -934,26 +931,52 @@ io_node_fail(char *nodelist, srun_job_t *job) for (node_inx=0; node_inx<job->nhosts; node_inx++) { if (strcmp(node_name, job->host[node_inx])) continue; - _terminate_node_io(node_inx, job); break; } } - io_thr_wake(job); + eio_signal_wakeup(job->eio); hostlist_destroy(fail_list); return SLURM_SUCCESS; } -static void -_terminate_node_io(int node_inx, srun_job_t *job) +static int +_wid(int n) /* Returns the number of decimal digits in n - 1 (1 when n - 1 is 0). */ { - int i; + int width = 1; + n--; /* For zero origin */ + while (n /= 10) /* one more digit for each further division by ten that leaves a nonzero value */ + width++; + return width; +} - for (i=0; i<opt.nprocs; i++) { - if (job->hostid[i] != node_inx) - continue; - job->out[i] = IO_DONE; - job->err[i] = IO_DONE; +struct io_buf * +alloc_io_buf(void) /* Allocates a struct io_buf with ref_count and length set to 0 and a data area of MAX_MSG_LEN + io_hdr_packed_size() + 1 bytes. Returns the new buffer, or NULL if either xmalloc() result is NULL (the partially built buffer is released first). The header field is not explicitly initialized here. */ +{ + struct io_buf *buf; + + buf = (struct io_buf *)xmalloc(sizeof(struct io_buf)); + if (!buf) + return NULL; + buf->ref_count = 0; + buf->length = 0; + /* The following "+ 1" is just temporary so I can stick a \0 at + the end and do a printf of the data pointer */ + buf->data = xmalloc(MAX_MSG_LEN + io_hdr_packed_size() + 1); + if (!buf->data) { + xfree(buf); /* do not leak the struct when the data area could not be obtained */ + return NULL; } + + return buf; } +void +free_io_buf(struct io_buf *buf) /* Releases buf->data (when set) and then buf itself; a NULL buf is ignored. */ +{ + if (buf) { + if (buf->data) + xfree(buf->data); + xfree(buf); + } +} diff --git a/src/srun/io.h b/src/srun/io.h index 4b41c58db0f86b2d3a53b8a854eddacd19335bcf..468e3987e22399b25172e378491039607251e39a 100644 --- a/src/srun/io.h +++ b/src/srun/io.h @@ -27,15 +27,23 @@ #ifndef _HAVE_IO_H #define _HAVE_IO_H +#include "src/common/io_hdr.h" #include "src/srun/srun_job.h" -#define WAITING_FOR_IO -1 -#define IO_DONE -9 +struct io_buf { + int ref_count; /* set to 0 by alloc_io_buf(); presumably the number of holders of this buffer - TODO confirm */ + uint32_t length; /* set to 0 by alloc_io_buf(); presumably the number of valid bytes in data - TODO confirm */ + void *data; /* storage obtained in alloc_io_buf() and released in free_io_buf() */ + io_hdr_t header; /* io_hdr_t presumably comes from src/common/io_hdr.h - confirm; not written by alloc_io_buf() */ +}; + +struct io_buf *alloc_io_buf(void); +void free_io_buf(struct io_buf *buf); int io_node_fail(char *nodelist, srun_job_t *job); -void *io_thr(void *arg); int io_thr_create(srun_job_t *job); -void io_thr_wake(srun_job_t *job); -int open_streams(srun_job_t *job); +eio_obj_t *create_file_write_eio_obj(int fd, srun_job_t *job); +eio_obj_t 
*create_file_read_eio_obj(int fd, srun_job_t *job, + uint16_t type, uint16_t gtaskid); #endif /* !_HAVE_IO_H */ diff --git a/src/srun/launch.c b/src/srun/launch.c index 4787882a8259b2671309739dde0c371d148a27c3..c5f73bd4f083fc97307bde72b342da6681b9f924 100644 --- a/src/srun/launch.c +++ b/src/srun/launch.c @@ -141,6 +141,7 @@ launch(void *arg) r->ofname = fname_remote_string (job->ofname); r->efname = fname_remote_string (job->efname); r->ifname = fname_remote_string (job->ifname); + r->buffered_stdio = !opt.unbuffered; if (opt.parallel_debug) r->task_flags |= TASK_PARALLEL_DEBUG; @@ -153,7 +154,7 @@ launch(void *arg) r->global_task_ids = job->tids[i]; r->cpus_allocated = job->cpus[i]; r->srun_node_id = (uint32_t)i; - r->io_port = ntohs(job->ioport[i%job->niofds]); + r->io_port = ntohs(job->listenport[i%job->num_listen]); r->resp_port = ntohs(job->jaddr[i%job->njfds].sin_port); m->msg_type = REQUEST_LAUNCH_TASKS; m->data = &msg_array_ptr[i]; diff --git a/src/srun/msg.c b/src/srun/msg.c index b619af72b42e4363a70f8842299d7f8ac3c93fd1..f291a08d6231b791684f52033a490b25fc323c6d 100644 --- a/src/srun/msg.c +++ b/src/srun/msg.c @@ -253,7 +253,7 @@ static void _node_fail_handler(char *nodelist, srun_job_t *job) info("sending Ctrl-C to remaining tasks"); fwd_signal(job, SIGINT); if (job->ioid) - io_thr_wake(job); + eio_signal_wakeup(job->eio); } static bool _job_msg_done(srun_job_t *job) @@ -572,11 +572,11 @@ _exit_handler(srun_job_t *job, slurm_msg_t *exit_msg) if (status) job->task_state[taskid] = SRUN_TASK_ABNORMAL_EXIT; else { - if ( (job->err[taskid] != IO_DONE) - || (job->out[taskid] != IO_DONE) ) - job->task_state[taskid] = SRUN_TASK_IO_WAIT; - else - job->task_state[taskid] = SRUN_TASK_EXITED; +/* if ( (job->err[taskid] != IO_DONE) */ +/* || (job->out[taskid] != IO_DONE) ) */ +/* job->task_state[taskid] = SRUN_TASK_IO_WAIT; */ +/* else */ +/* job->task_state[taskid] = SRUN_TASK_EXITED; */ } slurm_mutex_unlock(&job->task_mutex); @@ -586,9 +586,9 @@ 
_exit_handler(srun_job_t *job, slurm_msg_t *exit_msg) || (slurm_mpi_single_task_per_node () && (tasks_exited == job->nhosts))) { debug2("All tasks exited"); + eio_signal_shutdown(job->eio); update_job_state(job, SRUN_JOB_TERMINATED); } - } _print_exit_status(job, hl, host, status); diff --git a/src/srun/reattach.c b/src/srun/reattach.c index d4add6acfda4638d418819b57027c3fddbdcdbe5..0234c2f0967b36bdd12df933b0423bc4876d19f8 100644 --- a/src/srun/reattach.c +++ b/src/srun/reattach.c @@ -306,7 +306,7 @@ _attach_to_job(srun_job_t *job) r->job_id = job->jobid; r->job_step_id = job->stepid; r->srun_node_id = (uint32_t) i; - r->io_port = ntohs(job->ioport[i%job->niofds]); + r->io_port = ntohs(job->listenport[i%job->num_listen]); r->resp_port = ntohs(job->jaddr[i%job->njfds].sin_port); r->cred = job->cred; diff --git a/src/srun/srun.c b/src/srun/srun.c index 8906859e262e583d90fdd860b76e393e23fe03d9..f8e01a4645ce9618394ec7ad3d3bb58b70e29ffd 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -309,7 +309,7 @@ int srun(int ac, char **av) * wait for all output to complete */ debug("Waiting for IO thread"); - io_thr_wake(job); + eio_signal_wakeup(job->eio); if (pthread_join(job->ioid, NULL) < 0) error ("Waiting on IO: %m"); diff --git a/src/srun/srun_job.c b/src/srun/srun_job.c index 788c7afe70384c05a46dadc4379eda22c243c2fb..c2e096a9cb032dd60a75247a65c9967e26d96da3 100644 --- a/src/srun/srun_job.c +++ b/src/srun/srun_job.c @@ -45,6 +45,7 @@ #include "src/common/slurm_cred.h" #include "src/common/xmalloc.h" #include "src/common/xstring.h" +#include "src/common/io_hdr.h" #include "src/srun/srun_job.h" #include "src/srun/opt.h" @@ -261,7 +262,7 @@ job_force_termination(srun_job_t *job) update_job_state(job, SRUN_JOB_FORCETERM); } - io_thr_wake(job); + eio_signal_wakeup(job->eio); } @@ -365,9 +366,9 @@ report_task_status(srun_job_t *job) for (i = 0; i < opt.nprocs; i++) { int state = job->task_state[i]; - if ((state == SRUN_TASK_EXITED) - && ((job->err[i] >= 0) || (job->out[i] >= 
0))) - state = 4; +/* if ((state == SRUN_TASK_EXITED) */ +/* && ((job->err[i] >= 0) || (job->out[i] >= 0))) */ +/* state = 4; */ snprintf(buf, 256, "task%d", i); hostlist_push(hl[state], buf); } @@ -415,6 +416,100 @@ _set_nprocs(allocation_info_t *info) } } +static int +_is_local_file (io_filename_t *fname) /* Returns 1 when fname->name is NULL; otherwise nonzero only if fname->type is neither IO_PER_TASK nor IO_ONE. */ +{ + if (fname->name == NULL) + return 1; + + return ((fname->type != IO_PER_TASK) && (fname->type != IO_ONE)); +} + +static void +_init_stdio_eio_objs(srun_job_t *job) /* Builds filename descriptors from opt.ifname, opt.ofname and opt.efname with fname_create() and, for each one that _is_local_file() accepts, creates an eio object on the chosen descriptor, stores it in job->stdin, job->stdout or job->stderr and enqueues it on job->eio_objs. When stdout and stderr name the same file, job->stderr reuses the stdout object. A failed open() is reported through fatal(). */ +{ + io_filename_t *inname, *outname, *errname; + int infd, outfd, errfd; + bool err_shares_out = false; + + inname = fname_create(job, opt.ifname); /* NOTE(review): inname, outname and errname are neither stored nor released in this function - confirm who owns them */ + outname = fname_create(job, opt.ofname); + errname = fname_create(job, opt.efname); + + /* + * build stdin eio_obj_t + */ + if (_is_local_file(inname)) { + uint16_t type, destid; + if (inname->name == NULL) { + infd = STDIN_FILENO; /* no file named: use the process's standard input */ + } else { + infd = open(inname->name, O_RDONLY); + if (infd == -1) + fatal("Could not open stdin file: %m"); + } + fd_set_nonblocking(infd); + fd_set_close_on_exec(infd); + if (inname->type == IO_ONE) { /* NOTE(review): _is_local_file() rejects IO_ONE unless name is NULL, so this arm is reachable only when inname->name == NULL - confirm intent */ + type = SLURM_IO_STDIN; + destid = inname->taskid; + } else { + type = SLURM_IO_ALLSTDIN; + destid = -1; /* destid is a uint16_t, so the stored value is 65535 */ + } + job->stdin = create_file_read_eio_obj(infd, job, type, destid); + list_enqueue(job->eio_objs, job->stdin); + } + + /* + * build stdout eio_obj_t + */ + if (_is_local_file(outname)) { + int refcount; /* assigned below but only referenced by the commented-out call */ + if (outname->name == NULL) { + outfd = STDOUT_FILENO; + } else { + outfd = open(outname->name, + O_CREAT|O_WRONLY|O_TRUNC, 0644); + if (outfd == -1) + fatal("Could not open stdout file: %m"); + } + if (outname->name != NULL + && errname->name != NULL + && !strcmp(outname->name, errname->name)) { + refcount = job->ntasks * 2; + err_shares_out = true; /* stdout and stderr name the same file */ + } else { + refcount = job->ntasks; + } + /*job->stdout = create_file_write_eio_obj(outfd, job, refcount);*/ + job->stdout = create_file_write_eio_obj(outfd, job); + list_enqueue(job->eio_objs, job->stdout); + } + + /* + * build a separate stderr eio_obj_t only if 
stderr is not sharing + * the stdout eio_obj_t + */ + if (err_shares_out) { + error("Doh, sharing"); /* NOTE(review): looks like leftover debug output - confirm */ + job->stderr = job->stdout; /* reuse the stdout object; it is not enqueued a second time */ + } else if (_is_local_file(errname)) { + int refcount; /* assigned below but only referenced by the commented-out call */ + if (errname->name == NULL) { + errfd = STDERR_FILENO; + } else { + errfd = open(errname->name, + O_CREAT|O_WRONLY|O_TRUNC, 0644); + if (errfd == -1) + fatal("Could not open stderr file: %m"); + } + refcount = job->ntasks; + /*job->stderr = create_file_write_eio_obj(errfd, job, refcount);*/ + job->stderr = create_file_write_eio_obj(errfd, job); + list_enqueue(job->eio_objs, job->stderr); + } +} static srun_job_t * _job_create_internal(allocation_info_t *info) @@ -424,6 +519,7 @@ _job_create_internal(allocation_info_t *info) int cpu_inx = 0; hostlist_t hl; srun_job_t *job; + eio_obj_t *obj; /* Reset nprocs if necessary */ @@ -453,7 +549,7 @@ _job_create_internal(allocation_info_t *info) job->nhosts = hostlist_count(hl); #endif - job->select_jobinfo = info->select_jobinfo; + job->select_jobinfo = info->select_jobinfo; job->jobid = info->jobid; job->stepid = info->stepid; job->old_job = false; @@ -482,23 +578,27 @@ _job_create_internal(allocation_info_t *info) debug3("njfds = %d", job->njfds); - /* Compute number of IO file descriptors needed and allocate - * memory for them + /* Compute number of listening sockets needed to allow + * all of the slurmds to establish IO streams with srun, without + * overstressing the TCP/IP backoff/retry algorithm */ - job->niofds = _estimate_nports(opt.nprocs, 64); - job->iofd = (int *) xmalloc(job->niofds * sizeof(int)); - job->ioport = (int *) xmalloc(job->niofds * sizeof(int)); - - /* ntask stdout and stderr fds */ - job->out = (int *) xmalloc(opt.nprocs * sizeof(int)); - job->err = (int *) xmalloc(opt.nprocs * sizeof(int)); - - /* ntask cbufs for stdout and stderr */ - job->outbuf = (cbuf_t *) xmalloc(opt.nprocs * sizeof(cbuf_t)); - job->errbuf = (cbuf_t *) xmalloc(opt.nprocs * sizeof(cbuf_t)); - job->inbuf = (cbuf_t *) xmalloc(opt.nprocs * 
sizeof(cbuf_t)); - job->stdin_eof = (bool *) xmalloc(opt.nprocs * sizeof(bool)); - + job->num_listen = _estimate_nports(opt.nprocs, 64); + job->listensock = (int *) xmalloc(job->num_listen * sizeof(int)); + job->listenport = (int *) xmalloc(job->num_listen * sizeof(int)); + + job->eio_objs = list_create(NULL); /* FIXME - needs destructor */ + job->eio = eio_handle_create(job->eio_objs); + job->ioservers_ready = 0; + /* "nhosts" number of IO protocol sockets */ + job->ioserver = (eio_obj_t **)xmalloc(job->nhosts*sizeof(eio_obj_t *)); + job->free_incoming = list_create(NULL); /* FIXME! Needs destructor */ + for (i = 0; i < 10; i++) { + list_enqueue(job->free_incoming, alloc_io_buf()); + } + job->free_outgoing = list_create(NULL); /* FIXME! Needs destructor */ + for (i = 0; i < 10; i++) { + list_enqueue(job->free_outgoing, alloc_io_buf()); + } /* nhost host states */ job->host_state = xmalloc(job->nhosts * sizeof(srun_host_state_t)); @@ -507,20 +607,6 @@ _job_create_internal(allocation_info_t *info) job->task_state = xmalloc(opt.nprocs * sizeof(srun_task_state_t)); job->tstatus = xmalloc(opt.nprocs * sizeof(int)); - for (i = 0; i < opt.nprocs; i++) { - job->task_state[i] = SRUN_TASK_INIT; - - job->outbuf[i] = cbuf_create(4096, 1048576); - job->errbuf[i] = cbuf_create(4096, 1048576); - job->inbuf[i] = cbuf_create(4096, 4096); - - cbuf_opt_set(job->outbuf[i], CBUF_OPT_OVERWRITE, CBUF_NO_DROP); - cbuf_opt_set(job->errbuf[i], CBUF_OPT_OVERWRITE, CBUF_NO_DROP); - cbuf_opt_set(job->inbuf[i], CBUF_OPT_OVERWRITE, CBUF_NO_DROP); - - job->stdin_eof[i] = false; - } - slurm_mutex_init(&job->task_mutex); for(i = 0; i < job->nhosts; i++) { @@ -543,11 +629,15 @@ _job_create_internal(allocation_info_t *info) job->nodelist, opt.nprocs); #endif + job->ntasks = 0; for (i = 0; i < job->nhosts; i++) { debug3("distribute_tasks placed %d tasks on host %d", - job->ntask[i], i); + job->ntask[i], i); + job->ntasks += job->ntask[i]; } + _init_stdio_eio_objs(job); + /* Build task id list for 
each host */ job->tids = xmalloc(job->nhosts * sizeof(uint32_t *)); job->hostid = xmalloc(opt.nprocs * sizeof(uint32_t)); @@ -566,7 +656,7 @@ _job_create_internal(allocation_info_t *info) else _dist_cyclic(job); - job_update_io_fnames(job); +/* job_update_io_fnames(job); */ hostlist_destroy(hl); diff --git a/src/srun/srun_job.h b/src/srun/srun_job.h index c5207d8bbde97e95ec7c4715c26fe4ffa16fa46d..648b361ca5262f624546e2ee25618d1781fae4a3 100644 --- a/src/srun/srun_job.h +++ b/src/srun/srun_job.h @@ -35,6 +35,7 @@ #include <slurm/slurm.h> +#include "src/common/eio.h" #include "src/common/cbuf.h" #include "src/common/macros.h" #include "src/common/node_select.h" @@ -105,6 +106,7 @@ typedef struct srun_job { char **host; /* hostname vector */ int *cpus; /* number of processors on each host */ int *ntask; /* number of tasks to run on each host */ + int ntasks; /* total number of tasks in the job step */ uint32_t **tids; /* host id => task ids mapping */ uint32_t *hostid; /* task id => host id mapping */ @@ -118,20 +120,24 @@ typedef struct srun_job { slurm_addr *jaddr; /* job control info ports */ pthread_t ioid; /* stdio thread id */ - int io_thr_pipe[2]; /* pipe for waking stdio thread */ - int niofds; /* Number of IO fds */ - int *iofd; /* stdio listen fds */ - int *ioport; /* stdio listen ports */ - - int *out; /* ntask stdout fds */ - int *err; /* ntask stderr fds */ - - /* XXX Need long term solution here: - * Quickfix: ntask*2 cbufs for buffering job output - */ - cbuf_t *outbuf; - cbuf_t *errbuf; - cbuf_t *inbuf; /* buffer for stdin data */ + int num_listen; /* Number of stdio listen sockets */ + int *listensock; /* Array of stdio listen sockets */ + int *listenport; /* Array of stdio listen ports */ + eio_t eio; /* Event IO handle */ + List eio_objs; /* List of eio_obj_t pointers */ + int ioservers_ready; /* Number of servers that established contact */ + eio_obj_t **ioserver; /* Array of nhosts pointers to eio_obj_t */ + eio_obj_t *stdout; /* stdout eio_obj_t 
*/ + eio_obj_t *stderr; /* stderr eio_obj_t */ + eio_obj_t *stdin; /* stdin eio_obj_t */ + List free_incoming; /* List of free struct io_buf * for incoming + * traffic. "incoming" means traffic from srun + * to the tasks. + */ + List free_outgoing; /* List of free struct io_buf * for outgoing + * traffic "outgoing" means traffic from the + * tasks to srun. + */ pthread_t lid; /* launch thread id */ @@ -150,10 +156,6 @@ typedef struct srun_job { io_filename_t *efname; /* Output streams and stdin fileno */ - FILE *outstream; - FILE *errstream; - int stdinfd; - bool *stdin_eof; /* true if task i processed stdin eof */ forked_msg_t *forked_msg; select_jobinfo_t select_jobinfo; } srun_job_t;