diff --git a/src/plugins/proctrack/aix/proctrack_aix.c b/src/plugins/proctrack/aix/proctrack_aix.c index 2617757ff1641c0c31d76b02d5efb36da967a52b..a2bbc27c85faf5021681bb3d939ecd640a266495 100644 --- a/src/plugins/proctrack/aix/proctrack_aix.c +++ b/src/plugins/proctrack/aix/proctrack_aix.c @@ -43,10 +43,6 @@ #include <slurm/slurm_errno.h> #include "src/common/log.h" -#ifndef __USE_XOPEN_EXTENDED -extern pid_t getsid(pid_t pid); /* missing from <unistd.h> */ -#endif - extern int proctrack_job_reg(int *jobid); /* register a job, include this proc */ extern int proctrack_job_unreg(int *jobid); /* unregister a job */ extern int proctrack_job_kill(int *jobid, int *signal); /* signal a job */ @@ -102,13 +98,12 @@ extern int fini ( void ) /* * For this plugin, we ignore the job_id. - * To generate a unique container ID, we use setsid. + * To generate a unique container ID, we use getpid. */ extern uint32_t slurm_create_container ( uint32_t job_id ) { - pid_t pid = setsid(); + pid_t pid = getpid(); int jobid = (int) pid; - (void) setpgrp(); if (pid < 0) { error("slurm_create_container: setsid: %m"); diff --git a/src/plugins/proctrack/sid/proctrack_sid.c b/src/plugins/proctrack/sid/proctrack_sid.c index c6f38222b5e1281d511b0aa9a9da23db8f1fb7ae..ee2d81703c00796b9289bc9294cabf93a3f58823 100644 --- a/src/plugins/proctrack/sid/proctrack_sid.c +++ b/src/plugins/proctrack/sid/proctrack_sid.c @@ -1,5 +1,5 @@ /*****************************************************************************\ - * proctrack_sid.c - process tracking via session ID plugin. + * proctrack_sid.c - process tracking via process group ID plugin. ***************************************************************************** * Copyright (C) 2005 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). @@ -43,11 +43,6 @@ #include <slurm/slurm_errno.h> #include "src/common/log.h" -#ifndef __USE_XOPEN_EXTENDED -extern pid_t getsid(pid_t pid); /* missing from <unistd.h> */ -extern pid_t setsid(void); /* missing from <unistd.h> */ -#endif - /* * These variables are required by the generic plugin interface. If they * are not found in the plugin, the plugin loader will ignore it. @@ -97,18 +92,16 @@ extern int fini ( void ) /* * For this plugin, we ignore the job_id. - * To generate a unique container ID, we use setsid. + * + * FIXME! - This is basically a no-op. We return -1 because + * only returning a 0 disturbs the caller. The caller throws + * away the return code anyway. This needs to be redone + * for slurm-0.6 when the task-creation code is rewritten to + * eliminate the user-owned session manager slurmd. */ extern uint32_t slurm_create_container ( uint32_t job_id ) { - pid_t pid = setsid(); - (void) setpgrp(); - - if (pid < 0) { - error("slurm_create_container: setsid: %m"); - return (uint32_t) 0; - } - return (uint32_t) pid; + return (uint32_t) -1; } extern int slurm_add_container ( uint32_t id ) @@ -123,6 +116,11 @@ extern int slurm_signal_container ( uint32_t id, int signal ) if (!id) /* no container ID */ return ESRCH; + if (id == getpid() || id == getpgid(0)) { + error("slurm_signal_container would kill caller!"); + return ESRCH; + } + return killpg(pid, signal); } @@ -134,7 +132,7 @@ extern int slurm_destroy_container ( uint32_t id ) extern uint32_t slurm_find_container(pid_t pid) { - pid_t rc = getsid(pid); + pid_t rc = getpgid(pid); if (rc == -1) return (uint32_t) 0; diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c index f75b45815aaf90622ece63106675d650ead7271e..8d5d081efa58a2429b2c37e0df67853f13186dae 100644 --- a/src/slurmd/mgr.c +++ b/src/slurmd/mgr.c @@ -655,6 +655,8 @@ _create_job_session(slurmd_job_t *job) */ if (shm_update_step_mpid(job->jobid, job->stepid, getpid()) < 0) debug("shm_update_step_mpid: %m"); + if (shm_update_step_spid(job->jobid, job->stepid, spid) < 0) + debug("shm_update_step_spid: %m"); job->smgr_pid = spid; @@ -924,8 +926,8 @@ _kill_running_tasks(slurmd_job_t *job) while ((s = list_next(i))) { if ((s->jobid != job->jobid) || (s->stepid != job->stepid)) continue; - if (s->task_list && s->task_list->pid) - killpg(s->task_list->pid, SIGKILL); +/* if (s->task_list && s->task_list->pid) */ +/* killpg(s->task_list->pid, SIGKILL); */ if (s->cont_id) slurm_signal_container(s->cont_id, SIGKILL); } diff --git a/src/slurmd/req.c b/src/slurmd/req.c index 9667b6f9d808e5398596ef7cb9950777ab106152..5e8ad327492f56272f670792b9d31d603fef8885 100644 --- a/src/slurmd/req.c +++ b/src/slurmd/req.c @@ -138,6 +138,7 @@ slurmd_req(slurm_msg_t *msg, slurm_addr *cli) slurm_free_reattach_tasks_request_msg(msg->data); break; case REQUEST_KILL_JOB: + debug2("RPC: REQUEST_KILL_JOB"); _rpc_kill_job(msg, cli); slurm_free_kill_job_msg(msg->data); break; @@ -757,14 +758,16 @@ _rpc_kill_tasks(slurm_msg_t *msg, slurm_addr *cli_addr) kill_proc_tree((pid_t) step->cont_id, req->signal); rc = SLURM_SUCCESS; } else { - if (slurm_signal_container(step->cont_id, req->signal) < 0) - rc = errno; - - if (step->task_list - && (step->task_list->pid > (pid_t) 0) - && (killpg(step->task_list->pid, req->signal) < 0)) - rc = errno; - + if ((req->signal == SIGKILL) || (req->signal == 0)) { + if (slurm_signal_container(step->cont_id, + req->signal) < 0) + rc = errno; + } else { + if (step->task_list + && (step->task_list->pid > (pid_t) 0) + && (killpg(step->task_list->pid, req->signal) < 0)) + rc = errno; + } if (rc == SLURM_SUCCESS) verbose("Sent signal %d to %u.%u", req->signal, req->job_id, req->job_step_id); @@ -790,7 +793,8 @@ _kill_running_session_mgrs(uint32_t jobid, int signum, char *signame) while ((s = list_next(i))) { if ((s->jobid == jobid) && s->cont_id) { - slurm_signal_container(s->cont_id, signum); + kill(s->spid, signum); + /* slurm_signal_container(s->cont_id, signum); */ cnt++; } } @@ -1028,11 +1032,11 @@ _kill_all_active_steps(uint32_t jobid, int sig, bool batch) if (slurm_signal_container(s->cont_id, sig) < 0) error("kill jid %d cont_id %u: %m", s->jobid, s->cont_id); - if (s->task_list - && (s->task_list->pid > (pid_t) 0) - && (killpg(s->task_list->pid, sig) < 0)) - error("kill jid %d pgrp %d: %m", - s->jobid, s->task_list->pid); +/* if (s->task_list */ +/* && (s->task_list->pid > (pid_t) 0) */ +/* && (killpg(s->task_list->pid, sig) < 0)) */ +/* error("kill jid %d pgrp %d: %m", */ +/* s->jobid, s->task_list->pid); */ } } list_iterator_destroy(i); diff --git a/src/slurmd/shm.c b/src/slurmd/shm.c index 47256bb04b255c511558c67c36d35fba4365019c..514f7aa601d02d5ddf601b8b1f68af34002509d4 100644 --- a/src/slurmd/shm.c +++ b/src/slurmd/shm.c @@ -526,6 +526,21 @@ shm_update_step_mpid(uint32_t jobid, uint32_t stepid, int mpid) return retval; } +int +shm_update_step_spid(uint32_t jobid, uint32_t stepid, int spid) +{ + int i, retval = SLURM_SUCCESS; + _shm_lock(); + if ((i = _shm_find_step(jobid, stepid)) >= 0) + slurmd_shm->step[i].spid = spid; + else { + slurm_seterrno(ESRCH); + retval = SLURM_FAILURE; + } + _shm_unlock(); + return retval; +} + int shm_update_step_cont_id(uint32_t jobid, uint32_t stepid, uint32_t cont_id) { diff --git a/src/slurmd/shm.h b/src/slurmd/shm.h index f8687caa3a342974fbd9014f614d0cbafc63033e..78e0d4e220609f8d6570c735465af09430430e7f 100644 --- a/src/slurmd/shm.h +++ b/src/slurmd/shm.h @@ -76,6 +76,7 @@ struct job_step { uint32_t sw_id; /* Switch/Interconnect specific id */ int ntasks; /* number of tasks in this job */ pid_t mpid; /* Job manager pid */ + pid_t spid; /* Session manager pid */ uint32_t cont_id; /* Job container id */ /* Executable's pathname */ @@ -198,6 +199,12 @@ int shm_update_step_cont_id(uint32_t jobid, uint32_t stepid, uint32_t cont_id); int shm_update_step_mpid(uint32_t jobid, uint32_t stepid, int mpid); +/* + * update job step "session manager" pid + */ +int shm_update_step_spid(uint32_t jobid, uint32_t stepid, int spid); + + /* * update job step state */ diff --git a/src/slurmd/smgr.c b/src/slurmd/smgr.c index dc0ff900fed38cd66742362bddfa994ae8f2c08a..7770f45b89a0dd5fa6edf7ab9df928ed925d7367 100644 --- a/src/slurmd/smgr.c +++ b/src/slurmd/smgr.c @@ -324,14 +324,6 @@ _exec_all_tasks(slurmd_job_t *job) (unsigned long) job->task[i]->gtid, (unsigned long) pid); - /* - * Send pid to job manager - */ - if (fd_write_n(fd, (char *)&pid, sizeof(pid_t)) < 0) { - error("unable to update task pid!: %m"); - return SLURM_ERROR; - } - job->task[i]->pid = pid; /* @@ -341,6 +333,14 @@ _exec_all_tasks(slurmd_job_t *job) error ("Unable to put task %d (pid %ld) into pgrp %ld", i, pid, job->task[0]->pid); + /* + * Send pid to job manager + */ + if (fd_write_n(fd, (char *)&pid, sizeof(pid_t)) < 0) { + error("unable to update task pid!: %m"); + return SLURM_ERROR; + } + /* * Now it's ok to unblock this child, so it may call exec */