Skip to content
Snippets Groups Projects
Commit 5583cc9f authored by Moe Jette
Browse files

Start deallocating nodes and put the job in the completing state as soon as
job_cancel arrives. Job/step complete RPCs will now return a "job completed"
error, but the immediate turn-around is required for DPCS.
parent 5fb4450a
No related branches found
No related tags found
No related merge requests found
...@@ -461,7 +461,7 @@ static int _load_job_state(Buf buffer) ...@@ -461,7 +461,7 @@ static int _load_job_state(Buf buffer)
job_id, job_state, batch_flag); job_id, job_state, batch_flag);
goto unpack_error; goto unpack_error;
} }
if (kill_on_step_done > KILL_IN_PROGRESS) { if (kill_on_step_done > KILL_ON_STEP_DONE) {
error("Invalid data for job %u: kill_on_step_done=%u", error("Invalid data for job %u: kill_on_step_done=%u",
job_id, kill_on_step_done); job_id, kill_on_step_done);
goto unpack_error; goto unpack_error;
...@@ -1169,8 +1169,7 @@ int job_signal(uint32_t job_id, uint16_t signal, uid_t uid) ...@@ -1169,8 +1169,7 @@ int job_signal(uint32_t job_id, uint16_t signal, uid_t uid)
return ESLURM_USER_ID_MISSING; return ESLURM_USER_ID_MISSING;
} }
if ((IS_JOB_FINISHED(job_ptr)) || if (IS_JOB_FINISHED(job_ptr))
(job_ptr->kill_on_step_done & KILL_IN_PROGRESS))
return ESLURM_ALREADY_DONE; return ESLURM_ALREADY_DONE;
if ((job_ptr->job_state == JOB_PENDING) && if ((job_ptr->job_state == JOB_PENDING) &&
...@@ -1198,14 +1197,9 @@ int job_signal(uint32_t job_id, uint16_t signal, uid_t uid) ...@@ -1198,14 +1197,9 @@ int job_signal(uint32_t job_id, uint16_t signal, uid_t uid)
list_iterator_destroy (step_record_iterator); list_iterator_destroy (step_record_iterator);
if (signal == SIGKILL) { if (signal == SIGKILL) {
job_ptr->kill_on_step_done |= KILL_IN_PROGRESS;
job_ptr->time_last_active = now; job_ptr->time_last_active = now;
last_job_update = now; last_job_update = now;
}
if ((signal == SIGKILL) && (step_cnt == 0)) {
/* kill job with no active steps */
job_ptr->job_state = JOB_COMPLETE | JOB_COMPLETING; job_ptr->job_state = JOB_COMPLETE | JOB_COMPLETING;
job_ptr->end_time = now;
deallocate_nodes(job_ptr, false); deallocate_nodes(job_ptr, false);
} }
verbose("job_signal of running job %u successful", job_id); verbose("job_signal of running job %u successful", job_id);
...@@ -1878,21 +1872,6 @@ void job_time_limit(void) ...@@ -1878,21 +1872,6 @@ void job_time_limit(void)
if (job_ptr->job_state != JOB_RUNNING) if (job_ptr->job_state != JOB_RUNNING)
continue; continue;
if (job_ptr->kill_on_step_done & KILL_IN_PROGRESS) {
if (difftime(now, job_ptr->time_last_active) <=
JOB_KILL_TIMEOUT)
continue;
last_job_update = now;
info("Job_id %u not properly terminating, forcing it",
job_ptr->job_id);
last_job_update = now;
job_ptr->end_time = time(NULL);
job_ptr->job_state = JOB_TIMEOUT | JOB_COMPLETING;
deallocate_nodes(job_ptr, false);
delete_all_step_records(job_ptr);
continue;
}
if (slurmctld_conf.inactive_limit) { if (slurmctld_conf.inactive_limit) {
if (job_ptr->step_list && if (job_ptr->step_list &&
(list_count(job_ptr->step_list) > 0)) (list_count(job_ptr->step_list) > 0))
...@@ -1928,7 +1907,6 @@ static void _job_timed_out(struct job_record *job_ptr) ...@@ -1928,7 +1907,6 @@ static void _job_timed_out(struct job_record *job_ptr)
job_ptr->end_time = now; job_ptr->end_time = now;
job_ptr->time_last_active = now; job_ptr->time_last_active = now;
job_ptr->job_state = JOB_TIMEOUT | JOB_COMPLETING; job_ptr->job_state = JOB_TIMEOUT | JOB_COMPLETING;
job_ptr->kill_on_step_done &= KILL_IN_PROGRESS;
deallocate_nodes(job_ptr, true); deallocate_nodes(job_ptr, true);
} else } else
job_signal(job_ptr->job_id, SIGKILL, 0); job_signal(job_ptr->job_id, SIGKILL, 0);
......
...@@ -91,10 +91,6 @@ ...@@ -91,10 +91,6 @@
/* Check for jobs reaching their time limit every PERIODIC_TIMEOUT seconds */ /* Check for jobs reaching their time limit every PERIODIC_TIMEOUT seconds */
#define PERIODIC_TIMEOUT 60 #define PERIODIC_TIMEOUT 60
/* Release a job's allocation if it does not terminate gracefully in
* JOB_KILL_TIMEOUT seconds, leave time for large core file write */
#define JOB_KILL_TIMEOUT 300
/* Pathname of group file record for checking update times */ /* Pathname of group file record for checking update times */
#define GROUP_FILE "/etc/group" #define GROUP_FILE "/etc/group"
...@@ -206,7 +202,6 @@ extern time_t last_job_update; /* time of last update to part records */ ...@@ -206,7 +202,6 @@ extern time_t last_job_update; /* time of last update to part records */
#define JOB_MAGIC 0xf0b7392c #define JOB_MAGIC 0xf0b7392c
#define STEP_MAGIC 0xce593bc1 #define STEP_MAGIC 0xce593bc1
#define KILL_ON_STEP_DONE 1 #define KILL_ON_STEP_DONE 1
#define KILL_IN_PROGRESS 2
extern int job_count; /* number of jobs in the system */ extern int job_count; /* number of jobs in the system */
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment