diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h index 2a976cd2b6f20be6a94913721a6c9d7f06436e57..2173a6dcd0ac4d3d3eacc0b6ce8b24edf44c848a 100644 --- a/slurm/slurm_errno.h +++ b/slurm/slurm_errno.h @@ -158,6 +158,8 @@ enum { ESLURMD_PROLOG_FAILED, ESLURMD_EPILOG_FAILED, ESLURMD_SESSION_KILLED, + ESLURMD_TOOMANYSTEPS, + ESLURMD_STEP_EXISTS, /* slurmd errors in user batch job */ ESCRIPT_CHDIR_FAILED = 4100, diff --git a/src/common/io_hdr.c b/src/common/io_hdr.c index 5bebe568d918c05385e44f8f1b95eed794de0a11..c855c28e992dc6b697bade6db533479c23082b75 100644 --- a/src/common/io_hdr.c +++ b/src/common/io_hdr.c @@ -34,6 +34,7 @@ #define IO_HDR_VERSION 0xa001 +/* static void _print_data(char *data, int datalen) { @@ -46,6 +47,7 @@ _print_data(char *data, int datalen) info("data: %s", buf); } +*/ static void diff --git a/src/common/slurm_auth.c b/src/common/slurm_auth.c index 7f45a7dbe7e502af149f83bc69fb4d0c81b1f5f2..465533a0776c76a665b42d24b8e0ade8e8b27ed0 100644 --- a/src/common/slurm_auth.c +++ b/src/common/slurm_auth.c @@ -104,7 +104,7 @@ get_plugin_dir( void ) read_slurm_conf_ctl( &conf ); } if ( conf.plugindir == NULL ) { - conf.plugindir = xstrdup( "/usr/local/lib" ); + conf.plugindir = xstrdup( SLURM_PLUGIN_PATH ); } slurm_mutex_unlock( &config_lock ); diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index ca1413abcef7c998039fd4438116396fb96f1aeb..23e96bf8e382ad2494cd3a3c9d8022468dbe5c7f 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -218,6 +218,10 @@ static slurm_errtab_t slurm_errtab[] = { "Job epilog failed" }, { ESLURMD_SESSION_KILLED, "Session manager killed" }, + { ESLURMD_TOOMANYSTEPS, + "Too many job steps on node" }, + { ESLURMD_STEP_EXISTS, + "Job step already in shared memory" }, /* slurmd errors in user batch job */ { ESCRIPT_CHDIR_FAILED, diff --git a/src/plugins/auth/auth_munge.c b/src/plugins/auth/auth_munge.c index dd48818827cd95f41585066b7605d4d5c9141fd5..cd5a55bd0a55b900b2d2bc2eca92df564d933dbe 100644 --- a/src/plugins/auth/auth_munge.c +++ b/src/plugins/auth/auth_munge.c @@ -171,6 +171,7 @@ slurm_auth_free( slurm_auth_credential_t *cred ) */ if (cred->m_str) free(cred->m_str); xfree(cred); + return SLURM_SUCCESS; } /* @@ -381,6 +382,7 @@ slurm_auth_print( slurm_auth_credential_t *cred, FILE *fp ) fprintf(fp, "BEGIN SLURM MUNGE AUTHENTICATION CREDENTIAL\n" ); fprintf(fp, "%s\n", cred->m_str ); fprintf(fp, "END SLURM MUNGE AUTHENTICATION CREDENTIAL\n" ); + return SLURM_SUCCESS; } int diff --git a/src/slurmd/job.c b/src/slurmd/job.c index ac53d461749a26a0d9858982d436c733e152814e..117fb5fec5c1ab991cd2ac6ef92a5e4743e4c7b3 100644 --- a/src/slurmd/job.c +++ b/src/slurmd/job.c @@ -422,7 +422,7 @@ task_info_destroy(task_info_t *t) xfree(t); } -void +int job_update_shm(slurmd_job_t *job) { job_step_t s; @@ -436,13 +436,15 @@ job_update_shm(slurmd_job_t *job) s.sw_id = 0; s.io_update = false; - if (shm_insert_step(&s) < 0) - error("Updating shm with new step info: %m"); + if (shm_insert_step(&s) < 0) + return SLURM_ERROR; if (job->stepid == NO_VAL) debug("updated shm with job %d", job->jobid); else debug("updated shm with step %d.%d", job->jobid, job->stepid); + + return SLURM_SUCCESS; } void diff --git a/src/slurmd/job.h b/src/slurmd/job.h index 735f9d5eb9be1e3d74f49046befb68ba370493ab..f337bc472511b7a8ec235c5f2954e0d3507b7d7e 100644 --- a/src/slurmd/job.h +++ b/src/slurmd/job.h @@ -141,7 +141,7 @@ struct task_info * task_info_create(int taskid, int gtaskid); void task_info_destroy(struct task_info *t); -void job_update_shm(slurmd_job_t *job); +int job_update_shm(slurmd_job_t *job); void job_delete_shm(slurmd_job_t *job); diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c index 376df11a5307bbf4bb8ee52d0f173c3a3b26ac15..3a77bed2173ec5809a728c17654f12027e9dfefb 100644 --- a/src/slurmd/mgr.c +++ b/src/slurmd/mgr.c @@ -362,7 +362,13 @@ _job_mgr(slurmd_job_t *job) if (shm_init() < 0) goto fail0; - job_update_shm(job); + if (job_update_shm(job) < 0) { + if (errno == ENOSPC) + rc = ESLURMD_TOOMANYSTEPS; + else if (errno == EEXIST) + rc = ESLURMD_STEP_EXISTS; + goto fail0; + } if (!job->batch && (interconnect_preinit(job) < 0)) { rc = ESLURM_INTERCONNECT_FAILURE; diff --git a/src/slurmd/req.c b/src/slurmd/req.c index 65a363db0acfb3df641aa7262606803bd7fb5eb1..fd0b9471a6f4aa5f1549c8da408e033dc37c17c7 100644 --- a/src/slurmd/req.c +++ b/src/slurmd/req.c @@ -490,13 +490,21 @@ _rpc_timelimit(slurm_msg_t *msg, slurm_addr *cli_addr) */ _kill_running_session_mgrs(req->job_id, SIGXCPU); - step_cnt = _kill_all_active_steps(req->job_id, SIGTERM); + if ((step_cnt = _kill_all_active_steps(req->job_id, SIGTERM))) + found_job = true; - info("Timeout for job=%u, step_cnt=%d, kill_wait=%u", - req->job_id, step_cnt, conf->cf.kill_wait); + verbose( "Job %u: timeout: sent SIGTERM to %d active steps", + req->job_id, step_cnt ); + + sleep(1); + /* + * Check to see if any processes are still around + */ + if (found_job && _kill_all_active_steps(req->job_id, 0)) { + + verbose( "Job %u: waiting %d secs for SIGKILL", + req->job_id, conf->cf.kill_wait ); - if (step_cnt) { - found_job = true; sleep(conf->cf.kill_wait); } @@ -653,8 +661,7 @@ _kill_all_active_steps(uint32_t jobid, int sig) } list_destroy(steps); if (step_cnt == 0) - debug2("No steps in jobid %d to send signal %d", - jobid, sig); + debug2("No steps in jobid %d to send signal %d", jobid, sig); return step_cnt; } diff --git a/src/slurmd/shm.c b/src/slurmd/shm.c index 05fbcee25e436eb333aaa20b80a6e9e331f05098..1f41896aa78f0c87d5323da800e83f93209805fa 100644 --- a/src/slurmd/shm.c +++ b/src/slurmd/shm.c @@ -278,7 +278,7 @@ _is_valid_ipc_name(const char *name) static char * _create_ipc_name(const char *name) { - char *dst, *dir, *slash; + char *dst = NULL, *dir = NULL, *slash = NULL; int rc; if ((rc = _is_valid_ipc_name(name)) != 1) @@ -419,11 +419,10 @@ shm_signal_step(uint32_t jobid, uint32_t stepid, uint32_t signal) if ((i = _shm_find_step(jobid, stepid)) >= 0) { s = &slurmd_shm->step[i]; for (t = _taskp(s->task_list); t; t = _taskp(t->next)) { + pid_t sid = getsid(t->pid); - if (getsid(t->pid) != s->sid) { - error ("Task pid is not in my session!"); + if ((sid < (pid_t) 0) || (sid != s->sid)) continue; - } if (t->pid > 0 && kill(t->pid, signo) < 0) { error("kill %d.%d task %d pid %ld: %m", diff --git a/src/slurmd/smgr.c b/src/slurmd/smgr.c index 6a9d910c2fc4782dc36191522155e2c7939a0cc9..67162b9cd5dea549d3ced70910bf4bb021a45ef8 100644 --- a/src/slurmd/smgr.c +++ b/src/slurmd/smgr.c @@ -282,7 +282,7 @@ _exec_task(slurmd_job_t *job, int i) } static sig_atomic_t timelimit_exceeded = 0; -static +static void _xcpu_handler() { timelimit_exceeded = 1;