From 7484b82883de8519a13e22a29cb22ad5c755dc0e Mon Sep 17 00:00:00 2001 From: Mark Grondona <mgrondona@llnl.gov> Date: Tue, 8 Jun 2004 17:32:22 +0000 Subject: [PATCH] o Fix bug where srun could not be interrupted by ctrl-c while waiting for a queued job to receive an allocation. --- NEWS | 7 +++++- src/srun/allocate.c | 53 ++++++++++++++++++++------------------------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/NEWS b/NEWS index 32bede9be7b..963ee216695 100644 --- a/NEWS +++ b/NEWS @@ -1,7 +1,12 @@ This file describes changes in recent versions of SLURM. It primarily documents those changes that are of interest to users and admins. -* Changes in SLURM 0.3.3 (not tagged yet) +* Changes in SLURM 0.3.4 +======================== + -- Allow srun to respond to ctrl-c and kill queued job while waiting + for allocation from controller. + +* Changes in SLURM 0.3.3 ======================== -- Fix slurmctld handling of heterogeneous processor count on elan switch (was setting DRAINED nodes in state DRAINING). diff --git a/src/srun/allocate.c b/src/srun/allocate.c index 0a4b66246c3..b04815ee19d 100644 --- a/src/srun/allocate.c +++ b/src/srun/allocate.c @@ -149,40 +149,37 @@ existing_allocation(void) static void _wait_for_resources(resource_allocation_response_msg_t **resp) { - old_job_alloc_msg_t old_job; + old_job_alloc_msg_t old; resource_allocation_response_msg_t *r = *resp; int sleep_time = MIN_ALLOC_WAIT; info ("job %u queued and waiting for resources", r->job_id); - old_job.job_id = r->job_id; - old_job.uid = (uint32_t) getuid(); + old.job_id = r->job_id; + old.uid = (uint32_t) getuid(); slurm_free_resource_allocation_response_msg(r); /* Keep polling until the job is allocated resources */ - while (1) { - if (_wait_for_alloc_rpc(sleep_time, resp) > 0) - break; - if (slurm_confirm_allocation(&old_job, resp) >= 0) + while (_wait_for_alloc_rpc(sleep_time, resp) <= 0) { + + if (slurm_confirm_allocation(&old, resp) >= 0) break; - if (slurm_get_errno() == ESLURM_JOB_PENDING) { - debug3("Still waiting for allocation"); - if (sleep_time < MAX_ALLOC_WAIT) - ++sleep_time; - sleep(sleep_time); - } else { - error("Unable to confirm resource allocation for " - "job %u: %m", old_job.job_id); - exit (1); - } + + if (slurm_get_errno() == ESLURM_JOB_PENDING) + debug3 ("Still waiting for allocation"); + else + fatal ("Unable to confirm allocation for job %u: %m", + old.job_id); if (destroy_job) { - verbose("cancelling job %u", old_job.job_id); - slurm_complete_job(old_job.job_id, 0, 0); + verbose("cancelling job %u", old.job_id); + slurm_complete_job(old.job_id, 0, 0); debugger_launch_failure(); exit(0); } + if (sleep_time < MAX_ALLOC_WAIT) + sleep_time++; } info ("job %u has been allocated resources", (*resp)->job_id); } @@ -197,22 +194,20 @@ _wait_for_alloc_rpc(int sleep_time, resource_allocation_response_msg_t **resp) { struct pollfd fds[1]; slurm_fd slurmctld_fd; - int rc, wait_msec = sleep_time * 1000; - slurmctld_fd = slurmctld_msg_init(); - if (slurmctld_fd < 0) { - sleep(sleep_time); - return 0; + if ((slurmctld_fd = slurmctld_msg_init()) < 0) { + sleep (sleep_time); + return (0); } fds[0].fd = slurmctld_fd; fds[0].events = POLLIN; - while ((rc = poll(fds, 1, wait_msec)) < 0) { + while (poll (fds, 1, (sleep_time * 1000)) < 0) { switch (errno) { case EAGAIN: case EINTR: - continue; + return (-1); case ENOMEM: case EINVAL: case EFAULT: @@ -222,10 +217,10 @@ _wait_for_alloc_rpc(int sleep_time, resource_allocation_response_msg_t **resp) } } - rc = 0; if (fds[0].revents & POLLIN) - rc = _accept_msg_connection(slurmctld_fd, resp); - return rc; + return (_accept_msg_connection(slurmctld_fd, resp)); + + return (0); } /* Accept RPC from slurmctld and process it. -- GitLab