diff --git a/NEWS b/NEWS index 58fb8a0800f3bf6fdafde14321faf7db13b4471b..b616251356b6c2d70a8223b0441c30f2e1da115e 100644 --- a/NEWS +++ b/NEWS @@ -6,7 +6,10 @@ documents those changes that are of interest to users and admins. -- select/cray: Add support for Accelerator information including model and memory options. -- Cray systems: Add support to suspend/resume salloc command to insure that - aprun does not get initiated when the job is suspended. + aprun does not get initiated when the job is suspended. Processes suspended + and resumed are determined by using process group ID and parent process ID, + so some processes may be missed. Since salloc runs as a normal user, it's + ability to identify processes associated with a job is limited. -- Cray systems: Modify smap and sview to display all nodes even if multiple nodes exist at each coordinate. -- Improve efficiency of select/linear plugin with topology/tree plugin diff --git a/src/salloc/salloc.c b/src/salloc/salloc.c index 32554ac61eba6343a02f98532348d67f578a522a..f05c0b87da1595f71bd0e36221b8edf30a3d0dee 100644 --- a/src/salloc/salloc.c +++ b/src/salloc/salloc.c @@ -42,6 +42,8 @@ # include "config.h" #endif +#include <dirent.h> +#include <fcntl.h> #include <pwd.h> #include <stdbool.h> #include <stdio.h> @@ -83,6 +85,7 @@ extern uint64_t job_getjid(pid_t pid); #endif #endif +#define HASH_RECS 100 #define MAX_RETRIES 10 #define POLL_SLEEP 3 /* retry interval in seconds */ @@ -129,6 +132,14 @@ static int _blocks_dealloc(void); static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc); #endif +typedef struct xppid_s { + int pid; + int ppid; + int pgid; + struct xppid_s *next; +} xppid_t; +static xppid_t *pid_hash[HASH_RECS]; + bool salloc_shutdown = false; /* Signals that are considered terminal before resource allocation. */ int sig_array[] = { @@ -864,19 +875,153 @@ static void _job_complete_handler(srun_job_complete_msg_t *comp) } } +/* Build a hash table of process ID information */ +static void _push_to_hash(int pid, int ppid, int pgid) +{ + int hash_inx = pid % HASH_RECS; + xppid_t *ppid_rec = xmalloc(sizeof(xppid_t)); + ppid_rec->pid = pid; + ppid_rec->ppid = ppid; + ppid_rec->pgid = pgid; + ppid_rec->next = pid_hash[hash_inx]; + pid_hash[hash_inx] = ppid_rec; +} + +/* Find a specific process ID's record in the hash table, NULL if not found */ +static xppid_t * _find_hash_rec(int pid) +{ + int hash_inx = pid % HASH_RECS; + xppid_t *this_hash = pid_hash[hash_inx]; + + while (this_hash) { + if (this_hash->pid == pid) + return this_hash; + this_hash = this_hash->next; + } + return NULL; +} + +/* Determine if s specific process ID is a child of the spawed process + * (command_pid) */ +static bool _is_child_of(int command_pid, xppid_t *this_hash) +{ + while (this_hash) { + if (this_hash->ppid == command_pid) + return true; + if (this_hash->ppid <= 1) + return false; + this_hash = _find_hash_rec(this_hash->ppid); + } + return false; +} + +/* Scan the entire process hash table and signal all children of a specific + * process ID */ +static void _kill_by_hash(int command_pid, int sig) +{ + int i; + xppid_t *this_hash; + + for (i = 0; i < HASH_RECS; i++) { + this_hash = pid_hash[i]; + while (this_hash) { + if ((this_hash->pgid != command_pid) && + _is_child_of(command_pid, this_hash)) + kill(this_hash->pid, sig); + this_hash = this_hash->next; + } + } +} + +/* Free all memory allocated to the process hash table */ +static void _free_hash(void) +{ + int i; + xppid_t *this_hash, *next_hash; + + for (i = 0; i < HASH_RECS; i++) { + this_hash = pid_hash[i]; + while (this_hash) { + next_hash = this_hash->next; + xfree(this_hash); + this_hash = next_hash; + } + pid_hash[i] = NULL; + } +} + +/* Signal all child processses. Since salloc executes a the user, it is unable + * to create most job containers (e.g. cgroups), so it identifies child + * processes using process group ID and parent process IDs. This means that + * child processes can become children of the init process and avoid being + * suspended. */ +static void _signal_children(int sig) +{ + int fd; + DIR *dir; + struct dirent *de; + char path[PATH_MAX], *endptr, *num, rbuf[1024]; + char cmd[1024]; + char state; + int pid, ppid, pgid; + long ret_l; + + if (command_pid <= 0) + return; + + killpg(command_pid, sig); + + if ((dir = opendir("/proc")) == NULL) { + error("opendir(/proc): %m"); + return; + } + while ((de = readdir(dir)) != NULL) { + num = de->d_name; + if ((num[0] < '0') || (num[0] > '9')) + continue; + ret_l = strtol(num, &endptr, 10); + if ((ret_l == LONG_MIN) || (ret_l == LONG_MAX) || + (errno == ERANGE)) { + error("couldn't do a strtol on str %s(%ld): %m", + num, ret_l); + } + if ((endptr == NULL) || (*endptr != 0)) + continue; + sprintf(path, "/proc/%s/stat", num); + if ((fd = open(path, O_RDONLY)) < 0) { + continue; + } + if (read(fd, rbuf, 1024) <= 0) { + close(fd); + continue; + } + if (sscanf(rbuf, "%d %s %c %d %d", &pid, cmd, &state, &ppid, + &pgid) != 5) { + close(fd); + continue; + } + _push_to_hash(pid, ppid, pgid); + close(fd); + } + closedir(dir); + + _kill_by_hash(command_pid, sig); + _free_hash(); + + return; +} + static void _job_suspend_handler(suspend_msg_t *msg) { pthread_mutex_lock(&allocation_state_lock); if (msg->op == SUSPEND_JOB) { verbose("job has been suspended"); suspend_flag = true; - if (command_pid > 0) - killpg(command_pid, SIGSTOP); + _signal_children(SIGSTOP); } else if (msg->op == RESUME_JOB) { verbose("job has been resumed"); suspend_flag = false; - if (command_pid > 0) - killpg(command_pid, SIGCONT); + _signal_children(SIGCONT); } pthread_cond_broadcast(&allocation_state_cond); pthread_mutex_unlock(&allocation_state_lock); diff --git a/src/slurmctld/job_mgr.c b/src/slurmctld/job_mgr.c index 2b95cebb52d0e1cd864e7212d882a837909623b5..0062793c9451599d36204965c1b1c24ddc944284 100644 --- a/src/slurmctld/job_mgr.c +++ b/src/slurmctld/job_mgr.c @@ -8962,7 +8962,7 @@ extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid, } /* Send message to salloc, required on Crays with ALPS */ -#if defined(HAVE_CRAY) || 1 +#if defined HAVE_CRAY srun_job_suspend(job_ptr, sus_ptr->op); #endif diff --git a/src/slurmctld/srun_comm.c b/src/slurmctld/srun_comm.c index e3ee5ea4e23f8e77038c226a4e68ef48a817846c..7a0a9093220720290fcc68fa1d440b9b5616b7a9 100644 --- a/src/slurmctld/srun_comm.c +++ b/src/slurmctld/srun_comm.c @@ -374,12 +374,14 @@ extern void srun_job_complete (struct job_record *job_ptr) /* * srun_job_suspend - notify salloc of suspend/resume operation * IN job_ptr - pointer to the slurmctld job record - * IN op - + * IN op - SUSPEND_JOB or RESUME_JOB (enum suspend_opts from slurm.h) + * RET - true if message send, otherwise false */ -extern void srun_job_suspend (struct job_record *job_ptr, uint16_t op) +extern bool srun_job_suspend (struct job_record *job_ptr, uint16_t op) { slurm_addr_t * addr; suspend_msg_t *msg_arg; + bool msg_sent = false; xassert(job_ptr); @@ -391,7 +393,9 @@ extern void srun_job_suspend (struct job_record *job_ptr, uint16_t op) msg_arg->op = op; _srun_agent_launch(addr, job_ptr->alloc_node, SRUN_REQUEST_SUSPEND, msg_arg); + msg_sent = true; } + return msg_sent; } /* diff --git a/src/slurmctld/srun_comm.h b/src/slurmctld/srun_comm.h index 226a2b163f95142ec1af7b85d007dfab96d454ba..692606babbd68a3749b19028a8d539ee4a7683ac 100644 --- a/src/slurmctld/srun_comm.h +++ b/src/slurmctld/srun_comm.h @@ -75,9 +75,10 @@ extern void srun_job_complete (struct job_record *job_ptr); /* * srun_job_suspend - notify salloc of suspend/resume operation * IN job_ptr - pointer to the slurmctld job record - * IN op - + * IN op - SUSPEND_JOB or RESUME_JOB (enum suspend_opts from slurm.h) + * RET - true if message send, otherwise false */ -extern void srun_job_suspend (struct job_record *job_ptr, uint16_t op); +extern bool srun_job_suspend (struct job_record *job_ptr, uint16_t op); /* * srun_step_complete - notify srun of a job step's termination