Skip to content
Snippets Groups Projects
Commit 808f912a authored by Mark Grondona's avatar Mark Grondona
Browse files

o src/slurmd : fail job launch if shared memory is filled.

 o src/slurmd : do not bother to sleep full timeout in rpc_timelimit
   if the job step goes away on SIGTERM.
 o src/common/slurm_auth.c : get default plugin dir from SLURM_PLUGIN_DIR
   rather than always "/usr/local/lib/slurm"
 o other small fixes
parent 30f48a4a
No related branches found
No related tags found
No related merge requests found
......@@ -158,6 +158,8 @@ enum {
ESLURMD_PROLOG_FAILED,
ESLURMD_EPILOG_FAILED,
ESLURMD_SESSION_KILLED,
ESLURMD_TOOMANYSTEPS,
ESLURMD_STEP_EXISTS,
/* slurmd errors in user batch job */
ESCRIPT_CHDIR_FAILED = 4100,
......
......@@ -34,6 +34,7 @@
#define IO_HDR_VERSION 0xa001
/*
static void
_print_data(char *data, int datalen)
{
......@@ -46,6 +47,7 @@ _print_data(char *data, int datalen)
info("data: %s", buf);
}
*/
static void
......
......@@ -104,7 +104,7 @@ get_plugin_dir( void )
read_slurm_conf_ctl( &conf );
}
if ( conf.plugindir == NULL ) {
conf.plugindir = xstrdup( "/usr/local/lib" );
conf.plugindir = xstrdup( SLURM_PLUGIN_PATH );
}
slurm_mutex_unlock( &config_lock );
......
......@@ -218,6 +218,10 @@ static slurm_errtab_t slurm_errtab[] = {
"Job epilog failed" },
{ ESLURMD_SESSION_KILLED,
"Session manager killed" },
{ ESLURMD_TOOMANYSTEPS,
"Too many job steps on node" },
{ ESLURMD_STEP_EXISTS,
"Job step already in shared memory" },
/* slurmd errors in user batch job */
{ ESCRIPT_CHDIR_FAILED,
......
......@@ -171,6 +171,7 @@ slurm_auth_free( slurm_auth_credential_t *cred )
*/
if (cred->m_str) free(cred->m_str);
xfree(cred);
return SLURM_SUCCESS;
}
/*
......@@ -381,6 +382,7 @@ slurm_auth_print( slurm_auth_credential_t *cred, FILE *fp )
fprintf(fp, "BEGIN SLURM MUNGE AUTHENTICATION CREDENTIAL\n" );
fprintf(fp, "%s\n", cred->m_str );
fprintf(fp, "END SLURM MUNGE AUTHENTICATION CREDENTIAL\n" );
return SLURM_SUCCESS;
}
int
......
......@@ -422,7 +422,7 @@ task_info_destroy(task_info_t *t)
xfree(t);
}
void
int
job_update_shm(slurmd_job_t *job)
{
job_step_t s;
......@@ -436,13 +436,15 @@ job_update_shm(slurmd_job_t *job)
s.sw_id = 0;
s.io_update = false;
if (shm_insert_step(&s) < 0)
error("Updating shm with new step info: %m");
if (shm_insert_step(&s) < 0)
return SLURM_ERROR;
if (job->stepid == NO_VAL)
debug("updated shm with job %d", job->jobid);
else
debug("updated shm with step %d.%d", job->jobid, job->stepid);
return SLURM_SUCCESS;
}
void
......
......@@ -141,7 +141,7 @@ struct task_info * task_info_create(int taskid, int gtaskid);
void task_info_destroy(struct task_info *t);
void job_update_shm(slurmd_job_t *job);
int job_update_shm(slurmd_job_t *job);
void job_delete_shm(slurmd_job_t *job);
......
......@@ -362,7 +362,13 @@ _job_mgr(slurmd_job_t *job)
if (shm_init() < 0)
goto fail0;
job_update_shm(job);
if (job_update_shm(job) < 0) {
if (errno == ENOSPC)
rc = ESLURMD_TOOMANYSTEPS;
else if (errno == EEXIST)
rc = ESLURMD_STEP_EXISTS;
goto fail0;
}
if (!job->batch && (interconnect_preinit(job) < 0)) {
rc = ESLURM_INTERCONNECT_FAILURE;
......
......@@ -490,13 +490,21 @@ _rpc_timelimit(slurm_msg_t *msg, slurm_addr *cli_addr)
*/
_kill_running_session_mgrs(req->job_id, SIGXCPU);
step_cnt = _kill_all_active_steps(req->job_id, SIGTERM);
if ((step_cnt = _kill_all_active_steps(req->job_id, SIGTERM)))
found_job = true;
info("Timeout for job=%u, step_cnt=%d, kill_wait=%u",
req->job_id, step_cnt, conf->cf.kill_wait);
verbose( "Job %u: timeout: sent SIGTERM to %d active steps",
req->job_id, step_cnt );
sleep(1);
/*
* Check to see if any processes are still around
*/
if (found_job && _kill_all_active_steps(req->job_id, 0)) {
verbose( "Job %u: waiting %d secs for SIGKILL",
req->job_id, conf->cf.kill_wait );
if (step_cnt) {
found_job = true;
sleep(conf->cf.kill_wait);
}
......@@ -653,8 +661,7 @@ _kill_all_active_steps(uint32_t jobid, int sig)
}
list_destroy(steps);
if (step_cnt == 0)
debug2("No steps in jobid %d to send signal %d",
jobid, sig);
debug2("No steps in jobid %d to send signal %d", jobid, sig);
return step_cnt;
}
......
......@@ -278,7 +278,7 @@ _is_valid_ipc_name(const char *name)
static char *
_create_ipc_name(const char *name)
{
char *dst, *dir, *slash;
char *dst = NULL, *dir = NULL, *slash = NULL;
int rc;
if ((rc = _is_valid_ipc_name(name)) != 1)
......@@ -419,11 +419,10 @@ shm_signal_step(uint32_t jobid, uint32_t stepid, uint32_t signal)
if ((i = _shm_find_step(jobid, stepid)) >= 0) {
s = &slurmd_shm->step[i];
for (t = _taskp(s->task_list); t; t = _taskp(t->next)) {
pid_t sid = getsid(t->pid);
if (getsid(t->pid) != s->sid) {
error ("Task pid is not in my session!");
if ((sid < (pid_t) 0) || (sid != s->sid))
continue;
}
if (t->pid > 0 && kill(t->pid, signo) < 0) {
error("kill %d.%d task %d pid %ld: %m",
......
......@@ -282,7 +282,7 @@ _exec_task(slurmd_job_t *job, int i)
}
static sig_atomic_t timelimit_exceeded = 0;
static
static void
_xcpu_handler()
{
timelimit_exceeded = 1;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment