From 5eea178ddad47c55007e32c08a89d89bd783ebad Mon Sep 17 00:00:00 2001
From: Mark Grondona <mgrondona@llnl.gov>
Date: Fri, 12 Dec 2003 20:25:33 +0000
Subject: [PATCH] o slurmd reports back early job launch error (such as
 missing user id on node) to srun or slurmctld. o new errno
 ESLURMD_UID_NOT_FOUND

---
Review notes:
 - _send_launch_failure() now zeroes the response structs before filling
   them in, and logs a failed send instead of ignoring the result.
 - The line structure of this patch was reconstructed from a copy whose
   newlines had been collapsed. Whitespace inside context lines is
   approximate, so apply with "git apply --ignore-whitespace".

 src/common/slurm_errno.c |  2 ++
 src/slurmd/job.c         |  2 ++
 src/slurmd/mgr.c         | 61 ++++++++++++++++++++++++++++++++++------
 3 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c
index 57076ecb355..7410cc27adb 100644
--- a/src/common/slurm_errno.c
+++ b/src/common/slurm_errno.c
@@ -195,6 +195,8 @@ static slurm_errtab_t slurm_errtab[] = {
 	  "Pipe error on task spawn" },
 	{ ESLURMD_KILL_TASK_FAILED,
 	  "Kill task failed" },
+	{ ESLURMD_UID_NOT_FOUND,
+	  "User not found on host" },
 	{ ESLURMD_KILL_JOB_FAILED,
 	  "Attempt to kill job failed or timed out" },
 	{ ESLURMD_INVALID_JOB_CREDENTIAL,
diff --git a/src/slurmd/job.c b/src/slurmd/job.c
index 1b075360f3c..047d2116a47 100644
--- a/src/slurmd/job.c
+++ b/src/slurmd/job.c
@@ -106,6 +106,7 @@ job_create(launch_tasks_request_msg_t *msg, slurm_addr *cli_addr)
 
 	if ((pwd = _pwd_create((uid_t)msg->uid)) == NULL) {
 		error("uid %ld not found on system", (long) msg->uid);
+		slurm_seterrno (ESLURMD_UID_NOT_FOUND);
 		return NULL;
 	}
 	job = xmalloc(sizeof(*job));
@@ -190,6 +191,7 @@ job_batch_job_create(batch_job_launch_msg_t *msg)
 
 	if ((pwd = _pwd_create((uid_t)msg->uid)) == NULL) {
 		error("uid %ld not found on system", (long) msg->uid);
+		slurm_seterrno (ESLURMD_UID_NOT_FOUND);
 		return NULL;
 	}
 
diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c
index 514d1f11edf..47db96942c3 100644
--- a/src/slurmd/mgr.c
+++ b/src/slurmd/mgr.c
@@ -107,6 +107,8 @@ static int mgr_sigarray[] = {
 /*
  * Job manager related prototypes
  */
+static void _send_launch_failure(launch_tasks_request_msg_t *,
+                                 slurm_addr *, int);
 static int _job_mgr(slurmd_job_t *job);
 static void _set_job_log_prefix(slurmd_job_t *job);
 static int _setup_io(slurmd_job_t *job);
@@ -137,7 +139,7 @@ static void _random_sleep(slurmd_job_t *job);
 static char * _make_batch_dir(slurmd_job_t *job);
 static char * _make_batch_script(batch_job_launch_msg_t *msg, char *path);
 static int _setup_batch_env(slurmd_job_t *job, char *nodes);
-static int _complete_job(slurmd_job_t *job, int err, int status);
+static int _complete_job(uint32_t jobid, int err, int status);
 
 
 /* SIGHUP (empty) signal handler
@@ -152,8 +154,10 @@ mgr_launch_tasks(launch_tasks_request_msg_t *msg, slurm_addr *cli)
 {
 	slurmd_job_t *job = NULL;
 
-	if (!(job = job_create(msg, cli)))
+	if (!(job = job_create(msg, cli))) {
+		_send_launch_failure (msg, cli, errno);
 		return SLURM_ERROR;
+	}
 
 	_set_job_log_prefix(job);
 
@@ -177,11 +181,17 @@ mgr_launch_batch_job(batch_job_launch_msg_t *msg, slurm_addr *cli)
 {
 	int rc = 0;
 	int status = 0;
+	uint32_t jobid = msg->job_id;
 	slurmd_job_t *job;
 	char *batchdir;
 
-	if (!(job = job_batch_job_create(msg)))
+	if (!(job = job_batch_job_create(msg))) {
+		/*
+		 * Set "job" status to returned errno and cleanup job.
+		 */
+		status = errno;
 		goto cleanup;
+	}
 
 	_set_job_log_prefix(job);
 
@@ -209,8 +219,8 @@ mgr_launch_batch_job(batch_job_launch_msg_t *msg, slurm_addr *cli)
 	xfree(batchdir);
     cleanup :
 	verbose("job %u completed with slurm_rc = %d, job_rc = %d",
-		job->jobid, rc, status);
-	_complete_job(job, rc, status);
+		jobid, rc, status);
+	_complete_job(jobid, rc, status);
 	return 0;
 }
 
@@ -914,6 +924,41 @@ _setup_batch_env(slurmd_job_t *job, char *nodes)
 	return 0;
 }
 
+/*
+ * Report a launch failure that happened before a slurmd_job_t existed
+ * (e.g. job_create() failed) to the client at "cli", on the response
+ * port named in the launch request "msg".  A zero "rc" is sent as -1 so
+ * the reply always carries a nonzero return code.
+ */
+static void
+_send_launch_failure (launch_tasks_request_msg_t *msg, slurm_addr *cli, int rc)
+{
+	slurm_msg_t resp_msg;
+	launch_tasks_response_msg_t resp;
+
+	debug ("sending launch failure message: %s", slurm_strerror (rc));
+
+	/*
+	 * Zero both structs first so that any field not explicitly set
+	 * below is not handed to the send path as stack garbage.
+	 */
+	memset(&resp_msg, 0, sizeof(resp_msg));
+	memset(&resp, 0, sizeof(resp));
+
+	memcpy(&resp_msg.address, cli, sizeof(slurm_addr));
+	slurm_set_addr(&resp_msg.address, msg->resp_port, NULL);
+	resp_msg.data = &resp;
+	resp_msg.msg_type = RESPONSE_LAUNCH_TASKS;
+
+	resp.node_name = conf->hostname;
+	resp.srun_node_id = msg->srun_node_id;
+	resp.return_code = rc ? rc : -1;
+	resp.count_of_pids = 0;
+
+	/* Best effort: nothing to retry here, but do not fail silently */
+	if (slurm_send_only_node_msg(&resp_msg) < 0)
+		error ("failed to send launch failure message: %m");
+}
 
 static void
 _send_launch_resp(slurmd_job_t *job, int rc)
@@ -945,15 +990,15 @@ _send_launch_resp(slurmd_job_t *job, int rc)
 
 
 static int
-_complete_job(slurmd_job_t *job, int err, int status)
+_complete_job(uint32_t jobid, int err, int status)
 {
 	int rc;
 	slurm_msg_t req_msg;
 	complete_job_step_msg_t req;
 
-	req.job_id = job->jobid;
+	req.job_id = jobid;
 	req.job_step_id = NO_VAL;
-	req.job_rc	= status;
+	req.job_rc      = status;
 	req.slurm_rc = err;
 	req.node_name = conf->hostname;
 	req_msg.msg_type= REQUEST_COMPLETE_JOB_STEP;
-- 
GitLab