Skip to content
Snippets Groups Projects
Commit 7dadcde6 authored by Christopher J. Morrone
Browse files

When the slurmd is started with the "-c" parameter and it finds stray

slurmstepd unix domain sockets, it will attempt to send a SIGKILL using
stepd_signal_container before unlinking the socket file.
parent 85a27c75
No related branches found
No related tags found
No related merge requests found
...@@ -36,6 +36,7 @@ ...@@ -36,6 +36,7 @@
#include <regex.h> #include <regex.h>
#include <string.h> #include <string.h>
#include <inttypes.h> #include <inttypes.h>
#include <signal.h>
#include "src/common/xmalloc.h" #include "src/common/xmalloc.h"
#include "src/common/xstring.h" #include "src/common/xstring.h"
...@@ -398,8 +399,8 @@ done: ...@@ -398,8 +399,8 @@ done:
} }
/* /*
* Unlink all of the unix domain socket files for a given directory * Send the termination signal to all of the unix domain socket files
* and nodename. * for a given directory and nodename, and then unlink the files.
* Returns SLURM_ERROR if any sockets could not be unlinked. * Returns SLURM_ERROR if any sockets could not be unlinked.
*/ */
int int
...@@ -433,10 +434,24 @@ stepd_cleanup_sockets(const char *directory, const char *nodename) ...@@ -433,10 +434,24 @@ stepd_cleanup_sockets(const char *directory, const char *nodename)
uint32_t jobid, stepid; uint32_t jobid, stepid;
if (_sockname_regex(&re, ent->d_name, &jobid, &stepid) == 0) { if (_sockname_regex(&re, ent->d_name, &jobid, &stepid) == 0) {
char *path; char *path;
int fd;
path = NULL; path = NULL;
xstrfmtcat(path, "%s/%s", directory, ent->d_name); xstrfmtcat(path, "%s/%s", directory, ent->d_name);
verbose("Unlinking stray socket %s", path); verbose("Cleaning up stray job step %u.%u",
if (unlink(path) == -1) { jobid, stepid);
/* signal the slurmstepd to terminate its step */
fd = stepd_connect(directory, nodename, jobid, stepid);
if (fd == -1) {
debug("Unable to connect to socket %s", path);
} else {
stepd_signal_container(fd, SIGKILL);
close(fd);
}
/* make sure that the socket has been removed */
if (unlink(path) == -1 && errno != ENOENT) {
error("Unable to clean up stray socket %s: %m", error("Unable to clean up stray socket %s: %m",
path); path);
rc = SLURM_ERROR; rc = SLURM_ERROR;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment