diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index 1c886b9120fb8d80e37acb77fb34c0f411d68b65..0192c43563322f342a93c2369733f796459d3225 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -318,7 +318,7 @@ _forkexec_slurmstepd(slurmd_step_type_t type, void *req, if (pipe(to_stepd) < 0 || pipe(to_slurmd) < 0) { error("_forkexec_slurmstepd pipe failed: %m"); - return -1; + return SLURM_FAILURE; } if ((pid = fork()) < 0) { @@ -327,13 +327,12 @@ _forkexec_slurmstepd(slurmd_step_type_t type, void *req, close(to_stepd[1]); close(to_slurmd[0]); close(to_slurmd[1]); - return -1; + return SLURM_FAILURE; } else if (pid > 0) { - int ok; int rc = 0; /* * Parent sends initialization data to the slurmstepd - * over the to_stepd pipe, and waits for an "ok" + * over the to_stepd pipe, and waits for the return code * reply on the to_slurmd pipe. */ if (close(to_stepd[0]) < 0) @@ -344,14 +343,16 @@ _forkexec_slurmstepd(slurmd_step_type_t type, void *req, if ((rc = _send_slurmstepd_init(to_stepd[1], type, req, cli, self)) < 0) { error("Unable to init slurmstepd"); - rc = -1; + rc = SLURM_FAILURE; + goto done; } - if (read(to_slurmd[0], &ok, sizeof(int)) != sizeof(int)) { - error("Error reading \"ok\" message " + if (read(to_slurmd[0], &rc, sizeof(int)) != sizeof(int)) { + error("Error reading return code message " " from slurmstepd: %m"); - rc = -2; + rc = SLURM_FAILURE; } + done: /* Reap child */ if (waitpid(pid, NULL, 0) < 0) error("Unable to reap slurmd child process"); diff --git a/src/slurmd/slurmstepd/req.c b/src/slurmd/slurmstepd/req.c index d15f02a5ca7a8159620903ee2e1436358328f152..74a2e9187bb73a9761323a4368ad1daf2228c13f 100644 --- a/src/slurmd/slurmstepd/req.c +++ b/src/slurmd/slurmstepd/req.c @@ -95,8 +95,6 @@ _create_socket(const char *name) return -1; fd_set_close_on_exec(fd); - unlink(name); /* in case it already exists */ - memset(&addr, 0, sizeof(addr)); addr.sun_family = AF_UNIX; strcpy(addr.sun_path, name); @@ -123,15 +121,29 @@ _domain_socket_create(const char *dir, const char *nodename, /* * Make sure that "dir" exists and is a directory. */ - if (stat(dir, &stat_buf) < 0) - fatal("Domain socket directory %s: %m", dir); - else if (!S_ISDIR(stat_buf.st_mode)) - fatal("%s is not a directory", dir); + if (stat(dir, &stat_buf) < 0) { + error("Domain socket directory %s: %m", dir); + return -1; + } else if (!S_ISDIR(stat_buf.st_mode)) { + error("%s is not a directory", dir); + return -1; + } /* * Now build the the name of socket, and create the socket. */ xstrfmtcat(name, "%s/%s_%u.%u", dir, nodename, jobid, stepid); + + /* + * First check to see if the named socket already exists. + */ + if (stat(name, &stat_buf) == 0) { + error("Socket %s already exists", name); + xfree(name); + errno = ESLURMD_STEP_EXISTS; + return -1; + } + fd = _create_socket(name); if (fd < 0) fatal("Could not create domain socket: %m"); @@ -161,15 +173,19 @@ _msg_thr_internal(void *job_arg) debug("Message thread exited"); } -void +int msg_thr_create(slurmd_job_t *job) { int fd; eio_obj_t *eio_obj; pthread_attr_t attr; + errno = 0; fd = _domain_socket_create(conf->spooldir, "nodename", job->jobid, job->stepid); + if (fd == -1) + return SLURM_ERROR; + fd_set_nonblocking(fd); eio_obj = eio_obj_create(fd, &msg_socket_ops, (void *)job); @@ -178,8 +194,12 @@ msg_thr_create(slurmd_job_t *job) slurm_attr_init(&attr); if (pthread_create(&job->msgid, &attr, - &_msg_thr_internal, (void *)job) != 0) - fatal("pthread_create: %m"); + &_msg_thr_internal, (void *)job) != 0) { + error("pthread_create: %m"); + return SLURM_ERROR; + } + + return SLURM_SUCCESS; } static bool diff --git a/src/slurmd/slurmstepd/req.h b/src/slurmd/slurmstepd/req.h index cf72b9d273034d0d1ca90fa17854ca3af539306d..5dc51bc580b684f2c7a4af39a5d61226741d25eb 100644 --- a/src/slurmd/slurmstepd/req.h +++ b/src/slurmd/slurmstepd/req.h @@ -30,6 +30,6 @@ #include "src/slurmd/slurmstepd/slurmstepd_job.h" -void msg_thr_create(slurmd_job_t *job); +int msg_thr_create(slurmd_job_t *job); #endif /* _STEP_REQ_H */ diff --git a/src/slurmd/slurmstepd/slurmstepd.c b/src/slurmd/slurmstepd/slurmstepd.c index 0b13e968c89c9b74962fcb311d7c5f909bd6f8a3..5e96d6cdb92ab1bf69860ce8a88f25b80c6aa873 100644 --- a/src/slurmd/slurmstepd/slurmstepd.c +++ b/src/slurmd/slurmstepd/slurmstepd.c @@ -43,6 +43,7 @@ static int _init_from_slurmd(int sock, char **argv, slurm_addr **_cli, slurm_addr **_self, slurm_msg_t **_msg); static void _send_ok_to_slurmd(int sock); +static void _send_fail_to_slurmd(int sock); static slurmd_job_t *_step_setup(slurm_addr *cli, slurm_addr *self, slurm_msg_t *msg); static void _step_cleanup(slurmd_job_t *job, slurm_msg_t *msg, int rc); @@ -71,7 +72,11 @@ main (int argc, char *argv[]) job = _step_setup(cli, self, msg); - msg_thr_create(job); /* sets job->msg_handle and job->msgid */ + /* sets job->msg_handle and job->msgid */ + if (msg_thr_create(job) == SLURM_ERROR) { + _send_fail_to_slurmd(STDOUT_FILENO); + return -1; + } _send_ok_to_slurmd(STDOUT_FILENO); close(STDOUT_FILENO); @@ -99,13 +104,25 @@ main (int argc, char *argv[]) static void _send_ok_to_slurmd(int sock) { - int ok = 42; + int ok = SLURM_SUCCESS; safe_write(sock, &ok, sizeof(int)); return; rwfail: error("Unable to send \"ok\" to slurmd"); } +static void +_send_fail_to_slurmd(int sock) +{ + int fail = SLURM_FAILURE; + + if (errno) + fail = errno; + safe_write(sock, &fail, sizeof(int)); + return; +rwfail: + error("Unable to send \"fail\" to slurmd"); +} static int _init_from_slurmd(int sock, char **argv,