From ecaaf8075d9bcf1d4bfa15d9cf0c12560515abcd Mon Sep 17 00:00:00 2001 From: Mark Grondona <mgrondona@llnl.gov> Date: Fri, 28 Feb 2003 23:15:12 +0000 Subject: [PATCH] o Cleanup of some unused slurm_errno's o src/slurmd : if session manager process is killed, ensure that entire session is also dead, and make sure all task exit msgs are sent to clients o src/slurmd/reattach.c : initialize job->hostid in reattach code --- slurm/slurm_errno.h | 14 +------ src/common/credential_utils.c | 4 +- src/common/signature_utils.c | 29 +++++++++------ src/common/slurm_errno.c | 32 +--------------- src/slurmd/job.c | 9 +++++ src/slurmd/job.h | 5 ++- src/slurmd/mgr.c | 70 +++++++++++++++++++++++++++++------ src/slurmd/req.c | 9 ++++- src/slurmd/smgr.c | 12 +++--- src/srun/msg.c | 7 +++- src/srun/reattach.c | 1 + 11 files changed, 113 insertions(+), 79 deletions(-) diff --git a/slurm/slurm_errno.h b/slurm/slurm_errno.h index df3f6103ade..a6381640517 100644 --- a/slurm/slurm_errno.h +++ b/slurm/slurm_errno.h @@ -137,22 +137,9 @@ enum { /* slurmd error codes */ ESLRUMD_PIPE_ERROR_ON_TASK_SPAWN = 4000, ESLURMD_KILL_TASK_FAILED, - ESLURMD_OPENSSL_ERROR, - ESLURMD_NO_AVAILABLE_JOB_STEP_SLOTS_IN_SHMEM, - ESLURMD_NO_AVAILABLE_TASK_SLOTS_IN_SHMEM, ESLURMD_INVALID_JOB_CREDENTIAL, - ESLURMD_NODE_NAME_NOT_PRESENT_IN_CREDENTIAL, ESLURMD_CREDENTIAL_EXPIRED, ESLURMD_CREDENTIAL_REVOKED, - ESLURMD_CREDENTIAL_TO_EXPIRE_DOESNOT_EXIST, - ESLURMD_ERROR_SIGNING_CREDENTIAL, - ESLURMD_ERROR_FINDING_JOB_STEP_IN_SHMEM, - ESLURMD_CIRBUF_POINTER_0, - ESLURMD_PIPE_DISCONNECT, - ESLURMD_EOF_ON_SOCKET, - ESLURMD_SOCKET_DISCONNECT, - ESLURMD_UNKNOWN_SOCKET_ERROR, - ESLURMD_SIGNATURE_FIELD_TOO_SMALL, ESLURMD_CREATE_BATCH_DIR_ERROR, ESLURMD_MODIFY_BATCH_DIR_ERROR, ESLURMD_CREATE_BATCH_SCRIPT_ERROR, @@ -167,6 +154,7 @@ enum { ESLURMD_IO_ERROR, ESLURMD_PROLOG_FAILED, ESLURMD_EPILOG_FAILED, + ESLURMD_SESSION_KILLED, /* slurmd errors in user batch job */ ESCRIPT_CHDIR_FAILED = 4100, diff --git a/src/common/credential_utils.c b/src/common/credential_utils.c index fd0dd524525..ba554e11808 100644 --- a/src/common/credential_utils.c +++ b/src/common/credential_utils.c @@ -76,8 +76,8 @@ sign_credential(slurm_ssl_key_ctx_t * ctx, slurm_job_credential_t * cred) cred->signature, &sigsize); free_buf(buffer); - if (rc) - slurm_seterrno_ret(ESLURMD_ERROR_SIGNING_CREDENTIAL); + if (rc != 0) + return SLURM_ERROR; if (sigsize != SLURM_SSL_SIGNATURE_LENGTH) error("signature size not correct in ssl_sign!"); diff --git a/src/common/signature_utils.c b/src/common/signature_utils.c index fae4a6137dc..881b4353fd2 100644 --- a/src/common/signature_utils.c +++ b/src/common/signature_utils.c @@ -62,26 +62,35 @@ int slurm_ssl_destroy() int slurm_init_signer(slurm_ssl_key_ctx_t * ctx, char *path) { - FILE *fp; + FILE *fp = NULL; + EVP_PKEY *pk = NULL; + int rc = SLURM_SUCCESS; if (!(fp = fopen(path, "r"))) { error ("can't open key file '%s' : %m", path); return SLURM_ERROR; }; - ctx->key.private = NULL; - if (!PEM_read_PrivateKey(fp, &ctx->key.private, NULL, NULL)) { + if (PEM_read_PrivateKey(fp, &pk, NULL, NULL)) + ctx->key.private = pk; + else { error ("PEM_read_PrivateKey [%s]: %m", path); - slurm_seterrno_ret(ESLURMD_OPENSSL_ERROR); + rc = SLURM_ERROR; } fclose(fp); - return SLURM_SUCCESS; + if (pk && (EVP_PKEY_size(pk) > SLURM_SSL_SIGNATURE_LENGTH)) { + error ("slurm_ssl_sign: key size too large"); + rc = SLURM_ERROR; + } + + return rc; } int slurm_init_verifier(slurm_ssl_key_ctx_t * ctx, char *path) { FILE *fp = NULL; + int rc = SLURM_SUCCESS; if ((fp = fopen(path, "r")) == NULL) { error ("can't open certificate file '%s' : %m ", path); @@ -91,11 +100,11 @@ int slurm_init_verifier(slurm_ssl_key_ctx_t * ctx, char *path) ctx->key.public = NULL; if (!PEM_read_PUBKEY(fp, &ctx->key.public, NULL, NULL)) { error("PEM_read_PUBKEY[%s]: %m",path); - slurm_seterrno_ret(ESLURMD_OPENSSL_ERROR); + rc = SLURM_ERROR; } fclose(fp); - return SLURM_SUCCESS; + return rc; } int slurm_destroy_ssl_key_ctx(slurm_ssl_key_ctx_t * ctx) @@ -111,8 +120,6 @@ slurm_ssl_sign(slurm_ssl_key_ctx_t *ctx, { EVP_MD_CTX ectx; - if (EVP_PKEY_size(ctx->key.private) > SLURM_SSL_SIGNATURE_LENGTH) - slurm_seterrno_ret(ESLURMD_SIGNATURE_FIELD_TOO_SMALL); EVP_SignInit(&ectx, EVP_sha1()); @@ -120,7 +127,7 @@ slurm_ssl_sign(slurm_ssl_key_ctx_t *ctx, if (!EVP_SignFinal(&ectx, sig, siglen, ctx->key.private)) { ERR_print_errors_fp(log_fp()); - slurm_seterrno_ret(ESLURMD_OPENSSL_ERROR); + return SLURM_ERROR; } return SLURM_SUCCESS; @@ -139,7 +146,7 @@ slurm_ssl_verify(slurm_ssl_key_ctx_t * ctx, if (!EVP_VerifyFinal(&ectx, sig, siglen, ctx->key.public)) { error("EVP_VerifyFinal: %s", ERR_error_string(ERR_get_error(), NULL)); - slurm_seterrno_ret(ESLURMD_OPENSSL_ERROR); + return SLURM_ERROR; } return SLURM_SUCCESS; } diff --git a/src/common/slurm_errno.c b/src/common/slurm_errno.c index f1444c41102..7be6a0bd2a8 100644 --- a/src/common/slurm_errno.c +++ b/src/common/slurm_errno.c @@ -176,38 +176,8 @@ static slurm_errtab_t slurm_errtab[] = { "Pipe error on task spawn" }, { ESLURMD_KILL_TASK_FAILED, "Kill task failed" }, - { ESLURMD_OPENSSL_ERROR, - "Openssl error" }, - { ESLURMD_NO_AVAILABLE_JOB_STEP_SLOTS_IN_SHMEM, - "No available job step slots in shmem" }, - { ESLURMD_NO_AVAILABLE_TASK_SLOTS_IN_SHMEM, - "No available task slots in shmem" }, { ESLURMD_INVALID_JOB_CREDENTIAL, "Invalid job credential" }, - { ESLURMD_NODE_NAME_NOT_PRESENT_IN_CREDENTIAL, - "Job credential not valid for this node" }, - { ESLURMD_CREDENTIAL_EXPIRED, - "Job credential has expired" }, - { ESLURMD_CREDENTIAL_REVOKED, - "Job credential has been revoked" }, - { ESLURMD_CREDENTIAL_TO_EXPIRE_DOESNOT_EXIST, - "Credential requested to expire doesn't exist" }, - { ESLURMD_ERROR_SIGNING_CREDENTIAL, - "SSL crypto error signing job credential" }, - { ESLURMD_ERROR_FINDING_JOB_STEP_IN_SHMEM, - "Job step not found in shmem" }, - { ESLURMD_CIRBUF_POINTER_0, - "Circular read or write buffer size is 0. not good" }, - { ESLURMD_PIPE_DISCONNECT, - "Task has closed or dropped its stdio pipes" }, - { ESLURMD_EOF_ON_SOCKET, - "Socket returned EOF, it was closed" }, - { ESLURMD_SOCKET_DISCONNECT, - "Socket disconnected" }, - { ESLURMD_UNKNOWN_SOCKET_ERROR, - "Unknown socket error" }, - { ESLURMD_SIGNATURE_FIELD_TOO_SMALL, - "Credential signature field is too small" }, { ESLURMD_CREATE_BATCH_DIR_ERROR, "Slurmd could not create a batch directory" }, { ESLURMD_MODIFY_BATCH_DIR_ERROR, @@ -236,6 +206,8 @@ static slurm_errtab_t slurm_errtab[] = { "Job prolog failed" }, { ESLURMD_EPILOG_FAILED, "Job epilog failed" }, + { ESLURMD_SESSION_KILLED, + "Session manager killed" }, /* slurmd errors in user batch job */ { ESCRIPT_CHDIR_FAILED, diff --git a/src/slurmd/job.c b/src/slurmd/job.c index 7209cca8ab7..b52d009a105 100644 --- a/src/slurmd/job.c +++ b/src/slurmd/job.c @@ -158,6 +158,11 @@ job_create(launch_tasks_request_msg_t *msg, slurm_addr *cli_addr) return NULL; } + fd_set_close_on_exec(job->fdpair[0]); + fd_set_close_on_exec(job->fdpair[1]); + + job->smgr_status = -1; + return job; } @@ -218,6 +223,10 @@ job_batch_job_create(batch_job_launch_msg_t *msg) error("pipe: %m"); return NULL; } + fd_set_close_on_exec(job->fdpair[0]); + fd_set_close_on_exec(job->fdpair[1]); + + job->smgr_status = -1; _job_init_task_info(job, &global_taskid); diff --git a/src/slurmd/job.h b/src/slurmd/job.h index 5a4922ee048..0e171af53da 100644 --- a/src/slurmd/job.h +++ b/src/slurmd/job.h @@ -115,8 +115,9 @@ typedef struct slurmd_job { List sruns; /* List of sruns */ pthread_t ioid; /* pthread id of IO thread */ - pid_t jmgr_pid; /* job manager pid */ - pid_t smgr_pid; /* session manager pid */ + pid_t jmgr_pid; /* job manager pid */ + pid_t smgr_pid; /* session manager pid */ + int smgr_status; /* session manager status */ int fdpair[2]; /* file descriptor pair for */ /* communication between slurmds */ diff --git a/src/slurmd/mgr.c b/src/slurmd/mgr.c index 65409fe363c..376df11a530 100644 --- a/src/slurmd/mgr.c +++ b/src/slurmd/mgr.c @@ -106,8 +106,11 @@ static int _create_job_session(slurmd_job_t *job); static int _wait_for_task_exit(slurmd_job_t *job); static int _wait_for_session(slurmd_job_t *job); static void _wait_for_io(slurmd_job_t *job); +static void _set_unexited_task_status(slurmd_job_t *job, int status); static void _handle_attach_req(slurmd_job_t *job); static int _send_exit_msg(slurmd_job_t *job, int tid[], int n, int status); +static void _set_unexited_task_status(slurmd_job_t *job, int status); +static int _send_pending_exit_msgs(slurmd_job_t *job); static void _setargs(slurmd_job_t *job, char **argv, int argc); @@ -396,6 +399,14 @@ _job_mgr(slurmd_job_t *job) */ _wait_for_session(job); + /* + * Set status of any unexited tasks to that of + * the session manager. Then send any pending + * exit messages back to clients. + */ + _set_unexited_task_status(job, job->smgr_status); + while (_send_pending_exit_msgs(job)) {;} + fail2: /* * Wait for io thread to complete @@ -527,6 +538,7 @@ _handle_task_exit(slurmd_job_t *job) * read at most ntask task exit codes from session manager */ for (i = 0; i < job->ntasks; i++) { + task_info_t *t; if ((len = read(job->fdpair[0], &e, sizeof(e))) < 0) { if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) @@ -538,13 +550,15 @@ _handle_task_exit(slurmd_job_t *job) if (len == 0) /* EOF */ break; - job->task[e.taskid]->estatus = e.status; - job->task[e.taskid]->exited = true; - job->task[e.taskid]->esent = false; + t = job->task[e.taskid]; + + t->estatus = e.status; + t->exited = true; + t->esent = false; nexited++; debug2("global task %d exited with status %d", - e.taskid, e.status); + t->gid, t->estatus); } return nexited; @@ -594,6 +608,7 @@ _send_pending_exit_msgs(slurmd_job_t *job) return nsent; } + /* * Wait for tasks to exit by reading task exit codes from session manger. * @@ -619,10 +634,11 @@ _wait_for_task_exit(slurmd_job_t *job) int nsent = 0; if ((rc = poll(pfd, 1, timeout)) < 0) { - if (errno == EINTR) { + if (errno == EINTR) _handle_attach_req(job); - continue; - } + else + error("wait_for_task_exit: poll: %m"); + continue; } revents = pfd[0].revents; @@ -653,13 +669,27 @@ _wait_for_task_exit(slurmd_job_t *job) } while (waiting); + close(rfd); return SLURM_SUCCESS; done: - while (_send_pending_exit_msgs(job)) {;} + close(rfd); return SLURM_FAILURE; } +static void +_set_unexited_task_status(slurmd_job_t *job, int status) +{ + int i; + for (i = 0; i < job->ntasks; i++) { + task_info_t *t = job->task[i]; + + if (t->exited) continue; + + t->exited = true; + t->estatus = status; + } +} /* * read task exit status from slurmd session manager process, @@ -668,8 +698,12 @@ _wait_for_task_exit(slurmd_job_t *job) static int _wait_for_session(slurmd_job_t *job) { - int status = -1; - pid_t pid; + int status = job->smgr_status; + int rc = 0; + pid_t pid; + + if (status != -1) + goto done; while ((pid = waitpid(job->smgr_pid, &status, 0)) < (pid_t) 0) { if (errno == EINTR) @@ -680,9 +714,21 @@ _wait_for_session(slurmd_job_t *job) } } - status = WEXITSTATUS(status); + job->smgr_status = status; + + done: + if (WIFSIGNALED(status)) { + /* + * Make sure all processes in session are dead + */ + killpg(job->smgr_pid, SIGKILL); + return ESLURMD_SESSION_KILLED; + } + + if (!WIFEXITED(status)) + rc = WEXITSTATUS(status); - return (status < MAX_SMGR_EXIT_STATUS) ? exit_errno[status] : status; + return (rc <= MAX_SMGR_EXIT_STATUS) ? exit_errno[rc] : rc; } /* diff --git a/src/slurmd/req.c b/src/slurmd/req.c index d0a933b19fa..3c24fca3062 100644 --- a/src/slurmd/req.c +++ b/src/slurmd/req.c @@ -349,7 +349,7 @@ _rpc_kill_tasks(slurm_msg_t *msg, slurm_addr *cli_addr) goto done; } - /* Special case some signals to avoid harming job's slurmd shepherd */ + /* Special case some signals to avoid harming job's slurmd shepherd if ((req->signal == SIGSTOP) || (req->signal == SIGCONT) || (req->signal == SIGKILL)) rc = shm_signal_step(req->job_id, req->job_step_id, @@ -357,7 +357,12 @@ _rpc_kill_tasks(slurm_msg_t *msg, slurm_addr *cli_addr) else { if (killpg(step->sid, req->signal) < 0) rc = errno; - } + } + */ + + if (killpg(step->sid, req->signal) < 0) + rc = errno; + shm_free_step(step); if (rc == SLURM_SUCCESS) verbose("Sent signal %d to %u.%u", diff --git a/src/slurmd/smgr.c b/src/slurmd/smgr.c index 0c28518a777..9eba4c5d4ef 100644 --- a/src/slurmd/smgr.c +++ b/src/slurmd/smgr.c @@ -162,15 +162,17 @@ _cleanup_file_descriptors(slurmd_job_t *j) { int i; for (i = 0; i < j->ntasks; i++) { - close(j->task[i]->pin[1]); /* Ignore errors */ - close(j->task[i]->pout[0]); - - /* Leave stderr open for slurmd error logging + task_info_t *t = j->task[i]; + /* + * Ignore errors on close() */ + close(t->pin[1]); + close(t->pout[0]); + close(t->perr[0]); } } -static int + static int _become_user(slurmd_job_t *job) { if (setgid(job->pwd->pw_gid) < 0) { diff --git a/src/srun/msg.c b/src/srun/msg.c index 80af484ff70..7d6f2056ffc 100644 --- a/src/srun/msg.c +++ b/src/srun/msg.c @@ -319,11 +319,14 @@ _reattach_handler(job_t *job, slurm_msg_t *msg) /* * store global task id information as returned from slurmd */ - job->tids[resp->srun_node_id] = xmalloc( resp->ntasks * - sizeof(uint32_t) ); + job->tids[resp->srun_node_id] = + xmalloc( resp->ntasks * sizeof(uint32_t) ); + job->ntask[resp->srun_node_id] = resp->ntasks; + for (i = 0; i < resp->ntasks; i++) { job->tids[resp->srun_node_id][i] = resp->gids[i]; + job->hostid[resp->gids[i]] = resp->srun_node_id; } #if HAVE_TOTALVIEW diff --git a/src/srun/reattach.c b/src/srun/reattach.c index 955a557749d..7088138735c 100644 --- a/src/srun/reattach.c +++ b/src/srun/reattach.c @@ -459,6 +459,7 @@ int reattach() job->jobid = s->jobid; job->stepid = s->stepid; job->tids = xmalloc(job->nhosts * sizeof(uint32_t *)); + job->hostid = xmalloc(s->ntasks * sizeof(uint32_t *)); if (job->stepid == NO_VAL) { char *new_argv0 = NULL; -- GitLab