From 042fe16d4280430f6f649664161f2cbb706dff4e Mon Sep 17 00:00:00 2001 From: Mark Grondona <mgrondona@llnl.gov> Date: Mon, 5 Aug 2002 22:01:01 +0000 Subject: [PATCH] o more changes in srun related to signals --- src/srun/launch.c | 1 + src/srun/msg.c | 8 ++++++-- src/srun/srun.c | 36 +++++++++++++++++++++++------------- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/src/srun/launch.c b/src/srun/launch.c index 33d112f8590..b69633265f8 100644 --- a/src/srun/launch.c +++ b/src/srun/launch.c @@ -82,6 +82,7 @@ launch(void *arg) msg.tasks_to_launch = job->ntask[i]; msg.global_task_ids = (uint32_t *) xmalloc(job->ntask[i]*sizeof(uint32_t)); + msg.srun_node_id = (uint32_t)i; for (j = 0; j < job->ntask[i]; j++) msg.global_task_ids[j] = taskid++; diff --git a/src/srun/msg.c b/src/srun/msg.c index 9d489460114..2a50f88a829 100644 --- a/src/srun/msg.c +++ b/src/srun/msg.c @@ -18,14 +18,18 @@ _launch_handler(job_t *job, slurm_msg_t *resp) launch_tasks_response_msg_t *msg = (launch_tasks_response_msg_t *) resp->data; - debug2("recieved launch resp from %s", msg->node_name); + debug2("recieved launch resp from %s nodeid=%d", msg->node_name, + msg->srun_node_id); if (msg->return_code != 0) { error("recvd return code %d from %s", msg->return_code, msg->node_name); return; } else { - /* job->host_state[msg->host_id] = SRUN_HOST_REPLIED; */ + + if (msg->srun_node_id > 0 && msg->srun_node_id < job->nhosts) + job->host_state[msg->srun_node_id] = + SRUN_HOST_REPLIED; } } diff --git a/src/srun/srun.c b/src/srun/srun.c index 348cccdf2e9..d12424b1214 100644 --- a/src/srun/srun.c +++ b/src/srun/srun.c @@ -180,8 +180,10 @@ main(int ac, char **av) pthread_kill(job->lid, SIGTERM); pthread_kill(job->jtid, SIGTERM); - pthread_kill(job->ioid, SIGTERM); pthread_kill(job->sigid, SIGTERM); + fflush(stderr); + fflush(stdout); + pthread_kill(job->ioid, SIGTERM); exit(0); } @@ -227,7 +229,7 @@ allocate_nodes(void) if (rc == SLURM_FAILURE) { error("Unable to allocate resources: %s", slurm_strerror(errno)); - exit(1); + return NULL; } return resp; @@ -320,26 +322,34 @@ sig_thr(void *arg) { job_t *job = (job_t *)arg; sigset_t set; + static time_t last_intr = 0; int signo; struct sigaction action; while (1) { sigfillset(&set); - pthread_sigmask(SIG_UNBLOCK, &set, NULL); + pthread_sigmask(SIG_BLOCK, &set, NULL); sigwait(&set, &signo); debug2("recvd signal %d", signo); switch (signo) { - case SIGINT: - fwd_signal(job, SIGINT); - pthread_mutex_lock(&job->state_mutex); - job->state = SRUN_JOB_OVERDONE; - pthread_cond_signal(&job->state_cond); - pthread_mutex_unlock(&job->state_mutex); - break; case SIGTERM: pthread_exit(0); break; + case SIGINT: + if (time(NULL) - last_intr > 1) { + info("sending Ctrl-C to remote tasks"); + last_intr = time(NULL); + fwd_signal(job, signo); + } else { /* second Ctrl-C in half as many seconds */ + /* terminate job */ + info("forcing termination"); + pthread_mutex_lock(&job->state_mutex); + job->state = SRUN_JOB_OVERDONE; + pthread_cond_signal(&job->state_cond); + pthread_mutex_unlock(&job->state_mutex); + } + break; default: fwd_signal(job, signo); break; @@ -349,14 +359,14 @@ sig_thr(void *arg) pthread_exit(0); } -void + void fwd_signal(job_t *job, int signo) { int i; slurm_msg_t req; slurm_msg_t resp; kill_tasks_msg_t msg; - + debug("forward signal %d to job", signo); req.msg_type = REQUEST_KILL_TASKS; @@ -368,7 +378,7 @@ fwd_signal(job_t *job, int signo) for (i = 0; i < job->nhosts; i++) { slurm_set_addr_uint(&req.address, slurm_get_slurmd_port(), - ntohl(job->iaddr[i])); + ntohl(job->iaddr[i])); debug("sending kill req to %s", job->host[i]); if (slurm_send_recv_node_msg(&req, &resp) < 0) error("Unable to send signal to host %s", -- GitLab