Skip to content
Snippets Groups Projects
Commit 5eea178d authored by Mark Grondona's avatar Mark Grondona
Browse files

 o slurmd reports early job launch errors (such as a missing user id on the node)
   back to srun or slurmctld.
 o new errno ESLURMD_UID_NOT_FOUND
parent 43714500
No related branches found
No related tags found
No related merge requests found
...@@ -195,6 +195,8 @@ static slurm_errtab_t slurm_errtab[] = { ...@@ -195,6 +195,8 @@ static slurm_errtab_t slurm_errtab[] = {
"Pipe error on task spawn" }, "Pipe error on task spawn" },
{ ESLURMD_KILL_TASK_FAILED, { ESLURMD_KILL_TASK_FAILED,
"Kill task failed" }, "Kill task failed" },
{ ESLURMD_UID_NOT_FOUND,
"User not found on host" },
{ ESLURMD_KILL_JOB_FAILED, { ESLURMD_KILL_JOB_FAILED,
"Attempt to kill job failed or timed out" }, "Attempt to kill job failed or timed out" },
{ ESLURMD_INVALID_JOB_CREDENTIAL, { ESLURMD_INVALID_JOB_CREDENTIAL,
......
...@@ -106,6 +106,7 @@ job_create(launch_tasks_request_msg_t *msg, slurm_addr *cli_addr) ...@@ -106,6 +106,7 @@ job_create(launch_tasks_request_msg_t *msg, slurm_addr *cli_addr)
if ((pwd = _pwd_create((uid_t)msg->uid)) == NULL) { if ((pwd = _pwd_create((uid_t)msg->uid)) == NULL) {
error("uid %ld not found on system", (long) msg->uid); error("uid %ld not found on system", (long) msg->uid);
slurm_seterrno (ESLURMD_UID_NOT_FOUND);
return NULL; return NULL;
} }
job = xmalloc(sizeof(*job)); job = xmalloc(sizeof(*job));
...@@ -190,6 +191,7 @@ job_batch_job_create(batch_job_launch_msg_t *msg) ...@@ -190,6 +191,7 @@ job_batch_job_create(batch_job_launch_msg_t *msg)
if ((pwd = _pwd_create((uid_t)msg->uid)) == NULL) { if ((pwd = _pwd_create((uid_t)msg->uid)) == NULL) {
error("uid %ld not found on system", (long) msg->uid); error("uid %ld not found on system", (long) msg->uid);
slurm_seterrno (ESLURMD_UID_NOT_FOUND);
return NULL; return NULL;
} }
......
...@@ -107,6 +107,8 @@ static int mgr_sigarray[] = { ...@@ -107,6 +107,8 @@ static int mgr_sigarray[] = {
/* /*
* Job manager related prototypes * Job manager related prototypes
*/ */
static void _send_launch_failure(launch_tasks_request_msg_t *,
slurm_addr *, int);
static int _job_mgr(slurmd_job_t *job); static int _job_mgr(slurmd_job_t *job);
static void _set_job_log_prefix(slurmd_job_t *job); static void _set_job_log_prefix(slurmd_job_t *job);
static int _setup_io(slurmd_job_t *job); static int _setup_io(slurmd_job_t *job);
...@@ -137,7 +139,7 @@ static void _random_sleep(slurmd_job_t *job); ...@@ -137,7 +139,7 @@ static void _random_sleep(slurmd_job_t *job);
static char * _make_batch_dir(slurmd_job_t *job); static char * _make_batch_dir(slurmd_job_t *job);
static char * _make_batch_script(batch_job_launch_msg_t *msg, char *path); static char * _make_batch_script(batch_job_launch_msg_t *msg, char *path);
static int _setup_batch_env(slurmd_job_t *job, char *nodes); static int _setup_batch_env(slurmd_job_t *job, char *nodes);
static int _complete_job(slurmd_job_t *job, int err, int status); static int _complete_job(uint32_t jobid, int err, int status);
/* SIGHUP (empty) signal handler /* SIGHUP (empty) signal handler
...@@ -152,8 +154,10 @@ mgr_launch_tasks(launch_tasks_request_msg_t *msg, slurm_addr *cli) ...@@ -152,8 +154,10 @@ mgr_launch_tasks(launch_tasks_request_msg_t *msg, slurm_addr *cli)
{ {
slurmd_job_t *job = NULL; slurmd_job_t *job = NULL;
if (!(job = job_create(msg, cli))) if (!(job = job_create(msg, cli))) {
_send_launch_failure (msg, cli, errno);
return SLURM_ERROR; return SLURM_ERROR;
}
_set_job_log_prefix(job); _set_job_log_prefix(job);
...@@ -177,11 +181,17 @@ mgr_launch_batch_job(batch_job_launch_msg_t *msg, slurm_addr *cli) ...@@ -177,11 +181,17 @@ mgr_launch_batch_job(batch_job_launch_msg_t *msg, slurm_addr *cli)
{ {
int rc = 0; int rc = 0;
int status = 0; int status = 0;
uint32_t jobid = msg->job_id;
slurmd_job_t *job; slurmd_job_t *job;
char *batchdir; char *batchdir;
if (!(job = job_batch_job_create(msg))) if (!(job = job_batch_job_create(msg))) {
/*
* Set "job" status to returned errno and cleanup job.
*/
status = errno;
goto cleanup; goto cleanup;
}
_set_job_log_prefix(job); _set_job_log_prefix(job);
...@@ -209,8 +219,8 @@ mgr_launch_batch_job(batch_job_launch_msg_t *msg, slurm_addr *cli) ...@@ -209,8 +219,8 @@ mgr_launch_batch_job(batch_job_launch_msg_t *msg, slurm_addr *cli)
xfree(batchdir); xfree(batchdir);
cleanup : cleanup :
verbose("job %u completed with slurm_rc = %d, job_rc = %d", verbose("job %u completed with slurm_rc = %d, job_rc = %d",
job->jobid, rc, status); jobid, rc, status);
_complete_job(job, rc, status); _complete_job(jobid, rc, status);
return 0; return 0;
} }
...@@ -914,6 +924,28 @@ _setup_batch_env(slurmd_job_t *job, char *nodes) ...@@ -914,6 +924,28 @@ _setup_batch_env(slurmd_job_t *job, char *nodes)
return 0; return 0;
} }
/*
 * Send a RESPONSE_LAUNCH_TASKS message reporting a failed launch back to
 * the client that issued the launch request.
 *
 *  msg  launch request that could not be serviced; supplies the response
 *       port and the srun node id echoed back in the reply
 *  cli  address of the requesting client
 *  rc   error code to report; a value of 0 is reported as -1 so the reply
 *       never carries a zero return code
 *
 * The message is sent with slurm_send_only_node_msg() and its result is
 * not checked, so a failed send is not reported to the caller.
 */
static void
_send_launch_failure (launch_tasks_request_msg_t *msg, slurm_addr *cli, int rc)
{
	slurm_msg_t                 resp_msg;
	launch_tasks_response_msg_t resp;

	/*
	 * Only some members of each struct are assigned below.  Zero-fill
	 * both first so no member is handed to the send path with an
	 * indeterminate value.
	 */
	memset(&resp_msg, 0, sizeof(resp_msg));
	memset(&resp,     0, sizeof(resp));

	debug ("sending launch failure message: %s", slurm_strerror (rc));

	/*
	 * Start from the client's address, then apply the response port
	 * from the request (host argument is NULL).
	 * NOTE(review): presumably a NULL host leaves the copied host part
	 * unchanged -- confirm against slurm_set_addr().
	 */
	memcpy(&resp_msg.address, cli, sizeof(slurm_addr));
	slurm_set_addr(&resp_msg.address, msg->resp_port, NULL);

	resp_msg.data     = &resp;
	resp_msg.msg_type = RESPONSE_LAUNCH_TASKS;

	resp.node_name     = conf->hostname;
	resp.srun_node_id  = msg->srun_node_id;
	resp.return_code   = rc ? rc : -1;	/* never report 0 on failure */
	resp.count_of_pids = 0;			/* no tasks were started     */

	slurm_send_only_node_msg(&resp_msg);

	return;
}
static void static void
_send_launch_resp(slurmd_job_t *job, int rc) _send_launch_resp(slurmd_job_t *job, int rc)
...@@ -945,15 +977,15 @@ _send_launch_resp(slurmd_job_t *job, int rc) ...@@ -945,15 +977,15 @@ _send_launch_resp(slurmd_job_t *job, int rc)
static int static int
_complete_job(slurmd_job_t *job, int err, int status) _complete_job(uint32_t jobid, int err, int status)
{ {
int rc; int rc;
slurm_msg_t req_msg; slurm_msg_t req_msg;
complete_job_step_msg_t req; complete_job_step_msg_t req;
req.job_id = job->jobid; req.job_id = jobid;
req.job_step_id = NO_VAL; req.job_step_id = NO_VAL;
req.job_rc = status; req.job_rc = status;
req.slurm_rc = err; req.slurm_rc = err;
req.node_name = conf->hostname; req.node_name = conf->hostname;
req_msg.msg_type= REQUEST_COMPLETE_JOB_STEP; req_msg.msg_type= REQUEST_COMPLETE_JOB_STEP;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment