diff --git a/NEWS b/NEWS index dc52c856fe8201e55cfa50905eedca17a66d4aff..b655765b693c62035565670dde91675a302a2e51 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,9 @@ This file describes changes in recent versions of SLURM. It primarily documents those changes that are of interest to users and admins. +* Changes in SLURM 0.6.0-pre2 +============================= + * Changes in SLURM 0.6.0-pre1 ============================= -- Added bgl/partition_allocator/smap changes from 0.5.7. diff --git a/src/plugins/proctrack/aix/proctrack_aix.c b/src/plugins/proctrack/aix/proctrack_aix.c index 75f7060c041fb65681d76fdffefc0b125b658037..fa3777e66ab9ec73141f999e7890b3c7652a467c 100644 --- a/src/plugins/proctrack/aix/proctrack_aix.c +++ b/src/plugins/proctrack/aix/proctrack_aix.c @@ -110,7 +110,7 @@ extern int fini ( void ) * Uses job step process group id as a unique identifier. Job id * and step id are not unique by themselves. */ -extern uint32_t slurm_create_container ( slurmd_job_t *job ) +extern uint32_t slurm_container_create ( slurmd_job_t *job ) { int pgid = (int) job->pgid; int i; @@ -128,13 +128,13 @@ extern uint32_t slurm_create_container ( slurmd_job_t *job ) } -extern int slurm_add_container ( uint32_t id ) +extern int slurm_container_add ( uint32_t id, pid_t pid ) { - error("slurm_add_container not supported"); - return SLURM_ERROR; + debug("slurm_container_add not supported"); + return SLURM_SUCCESS; } -extern int slurm_signal_container ( uint32_t id, int signal ) +extern int slurm_container_signal ( uint32_t id, int signal ) { int jobid = (int) id; if (!id) /* no container ID */ @@ -143,7 +143,7 @@ extern int slurm_signal_container ( uint32_t id, int signal ) return proctrack_job_kill(&jobid, &signal); } -extern int slurm_destroy_container ( uint32_t id ) +extern int slurm_container_destroy ( uint32_t id ) { int jobid = (int) id; @@ -158,7 +158,7 @@ extern int slurm_destroy_container ( uint32_t id ) } extern uint32_t -slurm_find_container(pid_t pid) +slurm_container_find(pid_t pid) { int local_pid = (int) pid; int cont_id = proctrack_get_job_id(&local_pid); diff --git a/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c b/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c index c4301bcca8c2c175895a897c0dae8b8d5ee2d7ed..f737f86bc0d1337e066c1d84a1334ad75bd10b73 100644 --- a/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c +++ b/src/plugins/proctrack/linuxproc/proctrack_linuxproc.c @@ -92,27 +92,27 @@ extern int fini ( void ) /* * Uses slurmd job-step manager's pid as the unique container id. */ -extern uint32_t slurm_create_container ( slurmd_job_t *job ) +extern uint32_t slurm_container_create ( slurmd_job_t *job ) { return (uint32_t) job->jmgr_pid; } -extern int slurm_add_container ( uint32_t id ) +extern int slurm_container_add ( uint32_t id, pid_t pid ) { return SLURM_SUCCESS; } -extern int slurm_signal_container ( uint32_t id, int signal ) +extern int slurm_container_signal ( uint32_t id, int signal ) { return kill_proc_tree_not_top((pid_t)id, signal); } -extern int slurm_destroy_container ( uint32_t id ) +extern int slurm_container_destroy ( uint32_t id ) { return SLURM_SUCCESS; } -extern uint32_t slurm_find_container(pid_t pid) +extern uint32_t slurm_container_find(pid_t pid) { return (uint32_t) find_ancestor(pid, "slurmd"); } diff --git a/src/plugins/proctrack/pgid/proctrack_pgid.c b/src/plugins/proctrack/pgid/proctrack_pgid.c index bab31b769d5c47f5801ba400bdd1eb4cf70f0851..e0c2f73fb982143f445c1dd36ca8c1c99ca9c5bc 100644 --- a/src/plugins/proctrack/pgid/proctrack_pgid.c +++ b/src/plugins/proctrack/pgid/proctrack_pgid.c @@ -94,17 +94,17 @@ extern int fini ( void ) /* * Uses job step process group id. */ -extern uint32_t slurm_create_container ( slurmd_job_t *job ) +extern uint32_t slurm_container_create ( slurmd_job_t *job ) { return (uint32_t) job->pgid; } -extern int slurm_add_container ( uint32_t id ) +extern int slurm_container_add ( uint32_t id, pid_t pid ) { return SLURM_SUCCESS; } -extern int slurm_signal_container ( uint32_t id, int signal ) +extern int slurm_container_signal ( uint32_t id, int signal ) { pid_t pid = (pid_t) id; @@ -119,13 +119,13 @@ extern int slurm_signal_container ( uint32_t id, int signal ) return (int)killpg(pid, signal); } -extern int slurm_destroy_container ( uint32_t id ) +extern int slurm_container_destroy ( uint32_t id ) { return SLURM_SUCCESS; } extern uint32_t -slurm_find_container(pid_t pid) +slurm_container_find(pid_t pid) { pid_t rc = getpgid(pid); diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c index 9a8cf9d73a1384346622eba4f1455321364c76d6..e3cab987fce04d41b91baa3888798f18116085ff 100644 --- a/src/slurmd/mgr.c +++ b/src/slurmd/mgr.c @@ -696,9 +696,9 @@ _fork_all_tasks(slurmd_job_t *job) * will wait for our signal before calling exec. */ shm_update_step_pgid(job->jobid, job->stepid, job->pgid); - cont_id = slurm_create_container(job); + cont_id = slurm_container_create(job); if (cont_id == 0) { - error("slurm_create_container: %m"); + error("slurm_container_create: %m"); exit(3); } shm_update_step_cont_id(job->jobid, job->stepid, cont_id); @@ -876,12 +876,12 @@ _kill_running_tasks(slurmd_job_t *job) return; if (s->cont_id) { - slurm_signal_container(s->cont_id, SIGKILL); + slurm_container_signal(s->cont_id, SIGKILL); /* Try destroying the container up to 30 times */ - while (slurm_destroy_container(s->cont_id) != SLURM_SUCCESS + while (slurm_container_destroy(s->cont_id) != SLURM_SUCCESS && limit < 30) { - slurm_signal_container(s->cont_id, SIGKILL); + slurm_container_signal(s->cont_id, SIGKILL); sleep(1); limit++; } diff --git a/src/slurmd/proctrack.c b/src/slurmd/proctrack.c index e3615ed814742930d59049326b4fb750e8b088c8..a7daa31efd625e0194f15220920dc3f910beeafd 100644 --- a/src/slurmd/proctrack.c +++ b/src/slurmd/proctrack.c @@ -38,7 +38,7 @@ /* ************************************************************************ */ typedef struct slurm_proctrack_ops { uint32_t (*create) ( slurmd_job_t *job ); - int (*add) ( uint32_t id ); + int (*add) ( uint32_t id, pid_t pid ); int (*signal) ( uint32_t id, int signal ); int (*destroy) ( uint32_t id ); uint32_t (*find_cont) ( pid_t pid ); @@ -70,11 +70,11 @@ _proctrack_get_ops( slurm_proctrack_context_t *c ) * Must be synchronized with slurm_proctrack_ops_t above. */ static const char *syms[] = { - "slurm_create_container", - "slurm_add_container", - "slurm_signal_container", - "slurm_destroy_container", - "slurm_find_container" + "slurm_container_create", + "slurm_container_add", + "slurm_container_signal", + "slurm_container_destroy", + "slurm_container_find" }; int n_syms = sizeof( syms ) / sizeof( char * ); @@ -223,7 +223,7 @@ slurm_proctrack_fini( void ) * Returns container ID or zero on error */ extern uint32_t -slurm_create_container(slurmd_job_t *job) +slurm_container_create(slurmd_job_t *job) { if ( slurm_proctrack_init() < 0 ) return 0; @@ -232,30 +232,31 @@ slurm_create_container(slurmd_job_t *job) } /* - * Add this process to the specified container - * cont_id IN - container ID as returned by slurm_create_container() + * Add a process to the specified container + * cont_id IN - container ID as returned by slurm_container_create() + * pid IN - process ID to be added to the container * * Returns a SLURM errno. */ extern int -slurm_add_container(uint32_t cont_id) +slurm_container_add(uint32_t cont_id, pid_t pid) { if ( slurm_proctrack_init() < 0 ) return SLURM_ERROR; - return (*(g_proctrack_context->ops.add))( cont_id ); + return (*(g_proctrack_context->ops.add))( cont_id , pid ); } /* * Signal all processes within a container - * cont_id IN - container ID as returned by slurm_create_container() + * cont_id IN - container ID as returned by slurm_container_create() * signal IN - signal to send, if zero then perform error checking * but do not send signal * * Returns a SLURM errno. */ extern int -slurm_signal_container(uint32_t cont_id, int signal) +slurm_container_signal(uint32_t cont_id, int signal) { if ( slurm_proctrack_init() < 0 ) return SLURM_ERROR; @@ -265,12 +266,12 @@ slurm_signal_container(uint32_t cont_id, int signal) /* * Destroy a container, any processes within the container are not effected - * cont_id IN - container ID as returned by slurm_create_container() + * cont_id IN - container ID as returned by slurm_container_create() * * Returns a SLURM errno. */ extern int -slurm_destroy_container(uint32_t cont_id) +slurm_container_destroy(uint32_t cont_id) { if ( slurm_proctrack_init() < 0 ) return SLURM_ERROR; @@ -284,7 +285,7 @@ slurm_destroy_container(uint32_t cont_id) * Returns a SLURM errno. */ extern uint32_t -slurm_find_container(pid_t pid) +slurm_container_find(pid_t pid) { if ( slurm_proctrack_init() < 0 ) return SLURM_ERROR; diff --git a/src/slurmd/proctrack.h b/src/slurmd/proctrack.h index 6d04643ed62239886f5c393f62ea8ceeced6f7a8..f104cf067d0842ff38d613db82680fb939967aaa 100644 --- a/src/slurmd/proctrack.h +++ b/src/slurmd/proctrack.h @@ -55,41 +55,42 @@ extern int slurm_proctrack_fini(void); * * Returns container ID or zero on error */ -extern uint32_t slurm_create_container(slurmd_job_t *job); +extern uint32_t slurm_container_create(slurmd_job_t *job); /* - * Add this process to the specified container - * cont_id IN - container ID as returned by slurm_create_container() + * Add a process to the specified container + * cont_id IN - container ID as returned by slurm_container_create() + * pid IN - process ID to be added to the container * * Returns a SLURM errno. */ -extern int slurm_add_container(uint32_t cont_id); +extern int slurm_container_add(uint32_t cont_id, pid_t pid); /* * Signal all processes within a container - * cont_id IN - container ID as returned by slurm_create_container() + * cont_id IN - container ID as returned by slurm_container_create() * signal IN - signal to send, if zero then perform error checking * but do not send signal * * Returns a SLURM errno. */ -extern int slurm_signal_container(uint32_t cont_id, int signal); +extern int slurm_container_signal(uint32_t cont_id, int signal); /* * Destroy a container, any processes within the container are not effected - * cont_id IN - container ID as returned by slurm_create_container() + * cont_id IN - container ID as returned by slurm_container_create() * * Returns a SLURM errno. */ -extern int slurm_destroy_container(uint32_t cont_id); +extern int slurm_container_destroy(uint32_t cont_id); /* * Get container ID for give process ID * * Returns a SLURM errno. */ -extern uint32_t slurm_find_container(pid_t pid); +extern uint32_t slurm_container_find(pid_t pid); /* Wait for all processes within a container to exit */ /* Add process to a container */ diff --git a/src/slurmd/req.c b/src/slurmd/req.c index dcd4546ef8ae1c120197da60300cd3d94bf7e6d8..1cfce6be5943043bc575e2ce768372fc45ba16fc 100644 --- a/src/slurmd/req.c +++ b/src/slurmd/req.c @@ -760,11 +760,11 @@ _rpc_kill_tasks(slurm_msg_t *msg, slurm_addr *cli_addr) * Assume step termination request. * Send SIGCONT just in case the processes are stopped. */ - slurm_signal_container(step->cont_id, SIGCONT); - if (slurm_signal_container(step->cont_id, req->signal) < 0) + slurm_container_signal(step->cont_id, SIGCONT); + if (slurm_container_signal(step->cont_id, req->signal) < 0) rc = errno; } else if (req->signal == 0) { - if (slurm_signal_container(step->cont_id, req->signal) < 0) + if (slurm_container_signal(step->cont_id, req->signal) < 0) rc = errno; /* SIGMIGRATE and SIGSOUND are used to initiate job checkpoint on AIX. * These signals are not sent to the entire process group, but just a @@ -839,10 +839,10 @@ static void _rpc_pid2jid(slurm_msg_t *msg, slurm_addr *cli) slurm_msg_t resp_msg; job_id_response_msg_t resp; bool found = false; - uint32_t my_cont = slurm_find_container(req->job_pid); + uint32_t my_cont = slurm_container_find(req->job_pid); if (my_cont == 0) { - verbose("slurm_find_container(%u): process not found", + verbose("slurm_container_find(%u): process not found", (uint32_t) req->job_pid); } else { List steps = shm_get_steps(); @@ -1012,7 +1012,7 @@ _kill_all_active_steps(uint32_t jobid, int sig, bool batch) debug2("signal %d to job %u (cont_id:%u)", sig, jobid, s->cont_id); - if (slurm_signal_container(s->cont_id, sig) < 0) + if (slurm_container_signal(s->cont_id, sig) < 0) error("kill jid %d cont_id %u: %m", s->jobid, s->cont_id); } diff --git a/src/slurmd/shm.c b/src/slurmd/shm.c index 7e47c9c59d568f9af5b355fa5042b4ccbef9f6f8..fbf0f3086621a93f4df86cbbc21d2bde22bfc3b7 100644 --- a/src/slurmd/shm.c +++ b/src/slurmd/shm.c @@ -880,7 +880,7 @@ _shm_clear_stale_entries(void) if ((s->state == SLURMD_JOB_UNUSED) /* unused */ || (s->cont_id == 0) /* empty */ - || (slurm_signal_container(s->cont_id, 0) == 0)) /* active */ + || (slurm_container_signal(s->cont_id, 0) == 0)) /* active */ continue; while (t && !active_tasks) { @@ -1135,7 +1135,7 @@ static bool _valid_slurmd_cont_id(uint32_t cont_id) { /* Check if container has processes */ - if (slurm_signal_container(cont_id, 0) != 0) + if (slurm_container_signal(cont_id, 0) != 0) return false; return true;