From 808f912a69557d6c0d59cd8df10360ba56aaa47c Mon Sep 17 00:00:00 2001 From: Mark Grondona <mgrondona@llnl.gov> Date: Fri, 21 Mar 2003 01:00:57 +0000 Subject: [PATCH] o src/slurmd : fail job launch if shared memory is filled. o src/slurmd : do not bother to sleep full timeout in rpc_timelimit if the job step goes away on SIGTERM. o src/common/slurm_auth.c : get default plugin dir from SLURM_PLUGIN_DIR rather than always using "/usr/local/lib/slurm" o other small fixes --- slurm/slurm_errno.h | 2 ++ src/common/io_hdr.c | 2 ++ src/common/slurm_auth.c | 2 +- src/common/slurm_errno.c | 4 ++++ src/plugins/auth/auth_munge.c | 2 ++ src/slurmd/job.c | 8 +++++--- src/slurmd/job.h | 2 +- src/slurmd/mgr.c | 8 +++++++- src/slurmd/req.c | 21 ++++++++++++++------- src/slurmd/shm.c | 7 +++---- src/slurmd/smgr.c | 2 +- 11 files changed, 42 insertions(+), 18 deletions(-) diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h index 2a976cd2b6f..2173a6dcd0a 100644 --- a/slurm/slurm_errno.h +++ b/slurm/slurm_errno.h @@ -158,6 +158,8 @@ enum { ESLURMD_PROLOG_FAILED, ESLURMD_EPILOG_FAILED, ESLURMD_SESSION_KILLED, + ESLURMD_TOOMANYSTEPS, + ESLURMD_STEP_EXISTS, /* slurmd errors in user batch job */ ESCRIPT_CHDIR_FAILED = 4100, diff --git a/src/common/io_hdr.c b/src/common/io_hdr.c index 5bebe568d91..c855c28e992 100644 --- a/src/common/io_hdr.c +++ b/src/common/io_hdr.c @@ -34,6 +34,7 @@ #define IO_HDR_VERSION 0xa001 +/* static void _print_data(char *data, int datalen) { @@ -46,6 +47,7 @@ _print_data(char *data, int datalen) info("data: %s", buf); } +*/ static void diff --git a/src/common/slurm_auth.c b/src/common/slurm_auth.c index 7f45a7dbe7e..465533a0776 100644 --- a/src/common/slurm_auth.c +++ b/src/common/slurm_auth.c @@ -104,7 +104,7 @@ get_plugin_dir( void ) read_slurm_conf_ctl( &conf ); } if ( conf.plugindir == NULL ) { - conf.plugindir = xstrdup( "/usr/local/lib" ); + conf.plugindir = xstrdup( SLURM_PLUGIN_PATH ); } slurm_mutex_unlock( &config_lock ); diff --git 
a/src/common/slurm_errno.c b/src/common/slurm_errno.c index ca1413abcef..23e96bf8e38 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -218,6 +218,10 @@ static slurm_errtab_t slurm_errtab[] = { "Job epilog failed" }, { ESLURMD_SESSION_KILLED, "Session manager killed" }, + { ESLURMD_TOOMANYSTEPS, + "Too many job steps on node" }, + { ESLURMD_STEP_EXISTS, + "Job step already in shared memory" }, /* slurmd errors in user batch job */ { ESCRIPT_CHDIR_FAILED, diff --git a/src/plugins/auth/auth_munge.c b/src/plugins/auth/auth_munge.c index dd48818827c..cd5a55bd0a5 100644 --- a/src/plugins/auth/auth_munge.c +++ b/src/plugins/auth/auth_munge.c @@ -171,6 +171,7 @@ slurm_auth_free( slurm_auth_credential_t *cred ) */ if (cred->m_str) free(cred->m_str); xfree(cred); + return SLURM_SUCCESS; } /* @@ -381,6 +382,7 @@ slurm_auth_print( slurm_auth_credential_t *cred, FILE *fp ) fprintf(fp, "BEGIN SLURM MUNGE AUTHENTICATION CREDENTIAL\n" ); fprintf(fp, "%s\n", cred->m_str ); fprintf(fp, "END SLURM MUNGE AUTHENTICATION CREDENTIAL\n" ); + return SLURM_SUCCESS; } int diff --git a/src/slurmd/job.c b/src/slurmd/job.c index ac53d461749..117fb5fec5c 100644 --- a/src/slurmd/job.c +++ b/src/slurmd/job.c @@ -422,7 +422,7 @@ task_info_destroy(task_info_t *t) xfree(t); } -void +int job_update_shm(slurmd_job_t *job) { job_step_t s; @@ -436,13 +436,15 @@ job_update_shm(slurmd_job_t *job) s.sw_id = 0; s.io_update = false; - if (shm_insert_step(&s) < 0) - error("Updating shm with new step info: %m"); + if (shm_insert_step(&s) < 0) + return SLURM_ERROR; if (job->stepid == NO_VAL) debug("updated shm with job %d", job->jobid); else debug("updated shm with step %d.%d", job->jobid, job->stepid); + + return SLURM_SUCCESS; } void diff --git a/src/slurmd/job.h b/src/slurmd/job.h index 735f9d5eb9b..f337bc47251 100644 --- a/src/slurmd/job.h +++ b/src/slurmd/job.h @@ -141,7 +141,7 @@ struct task_info * task_info_create(int taskid, int gtaskid); void task_info_destroy(struct task_info 
*t); -void job_update_shm(slurmd_job_t *job); +int job_update_shm(slurmd_job_t *job); void job_delete_shm(slurmd_job_t *job); diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c index 376df11a530..3a77bed2173 100644 --- a/src/slurmd/mgr.c +++ b/src/slurmd/mgr.c @@ -362,7 +362,13 @@ _job_mgr(slurmd_job_t *job) if (shm_init() < 0) goto fail0; - job_update_shm(job); + if (job_update_shm(job) < 0) { + if (errno == ENOSPC) + rc = ESLURMD_TOOMANYSTEPS; + else if (errno == EEXIST) + rc = ESLURMD_STEP_EXISTS; + goto fail0; + } if (!job->batch && (interconnect_preinit(job) < 0)) { rc = ESLURM_INTERCONNECT_FAILURE; diff --git a/src/slurmd/req.c b/src/slurmd/req.c index 65a363db0ac..fd0b9471a6f 100644 --- a/src/slurmd/req.c +++ b/src/slurmd/req.c @@ -490,13 +490,21 @@ _rpc_timelimit(slurm_msg_t *msg, slurm_addr *cli_addr) */ _kill_running_session_mgrs(req->job_id, SIGXCPU); - step_cnt = _kill_all_active_steps(req->job_id, SIGTERM); + if ((step_cnt = _kill_all_active_steps(req->job_id, SIGTERM))) + found_job = true; - info("Timeout for job=%u, step_cnt=%d, kill_wait=%u", - req->job_id, step_cnt, conf->cf.kill_wait); + verbose( "Job %u: timeout: sent SIGTERM to %d active steps", + req->job_id, step_cnt ); + + sleep(1); + /* + * Check to see if any processes are still around + */ + if (found_job && _kill_all_active_steps(req->job_id, 0)) { + + verbose( "Job %u: waiting %d secs for SIGKILL", + req->job_id, conf->cf.kill_wait ); - if (step_cnt) { - found_job = true; sleep(conf->cf.kill_wait); } @@ -653,8 +661,7 @@ _kill_all_active_steps(uint32_t jobid, int sig) } list_destroy(steps); if (step_cnt == 0) - debug2("No steps in jobid %d to send signal %d", - jobid, sig); + debug2("No steps in jobid %d to send signal %d", jobid, sig); return step_cnt; } diff --git a/src/slurmd/shm.c b/src/slurmd/shm.c index 05fbcee25e4..1f41896aa78 100644 --- a/src/slurmd/shm.c +++ b/src/slurmd/shm.c @@ -278,7 +278,7 @@ _is_valid_ipc_name(const char *name) static char * _create_ipc_name(const char *name) 
{ - char *dst, *dir, *slash; + char *dst = NULL, *dir = NULL, *slash = NULL; int rc; if ((rc = _is_valid_ipc_name(name)) != 1) @@ -419,11 +419,10 @@ shm_signal_step(uint32_t jobid, uint32_t stepid, uint32_t signal) if ((i = _shm_find_step(jobid, stepid)) >= 0) { s = &slurmd_shm->step[i]; for (t = _taskp(s->task_list); t; t = _taskp(t->next)) { + pid_t sid = getsid(t->pid); - if (getsid(t->pid) != s->sid) { - error ("Task pid is not in my session!"); + if ((sid < (pid_t) 0) || (sid != s->sid)) continue; - } if (t->pid > 0 && kill(t->pid, signo) < 0) { error("kill %d.%d task %d pid %ld: %m", diff --git a/src/slurmd/smgr.c b/src/slurmd/smgr.c index 6a9d910c2fc..67162b9cd5d 100644 --- a/src/slurmd/smgr.c +++ b/src/slurmd/smgr.c @@ -282,7 +282,7 @@ _exec_task(slurmd_job_t *job, int i) } static sig_atomic_t timelimit_exceeded = 0; -static +static void _xcpu_handler() { timelimit_exceeded = 1; -- GitLab