diff --git a/NEWS b/NEWS index 9e8e7ff68b121af15ef76d0001465ffcf1edd3a9..80b7d1b63d5c59ca1804e49143163d99a996ee25 100644 --- a/NEWS +++ b/NEWS @@ -12,6 +12,7 @@ documents those changes that are of interest to users and admins. -- Fix for srun -n and -O options when paired with -b. -- Added logic for fanout to failover to forward list if main node is unreachable + -- sacct also now keeps track of submitted, started and ending times of jobs * Changes in SLURM 1.1.0-pre8 ============================= diff --git a/src/plugins/jobacct/common/common_slurmctld.c b/src/plugins/jobacct/common/common_slurmctld.c index d0f460128619561c426933532b4bc020a7b8f62e..595bab33a944c334feb2ab98fa507855401f0bc3 100644 --- a/src/plugins/jobacct/common/common_slurmctld.c +++ b/src/plugins/jobacct/common/common_slurmctld.c @@ -87,7 +87,10 @@ static int _print_record(struct job_record *job_ptr, { static int rc=SLURM_SUCCESS; char *block_id = NULL; - + if(!job_ptr->details) { /* details is dereferenced for submit_time when the record is written below, so refuse to log a job that has none */ + error("job_acct: job=%u doesn't exist", job_ptr->job_id); + return SLURM_ERROR; + } debug2("_print_record, job=%u, \"%s\"", job_ptr->job_id, data); #ifdef HAVE_BG @@ -104,7 +107,7 @@ static int _print_record(struct job_record *job_ptr, if (fprintf(LOGFILE, "%u %s %u %u %d %d %s - %s\n", job_ptr->job_id, job_ptr->partition, - (int)job_ptr->start_time, (int)time, + (int)job_ptr->details->submit_time, (int)time, /* this field now carries details->submit_time instead of start_time */ job_ptr->user_id, job_ptr->group_id, block_id, data) < 0) rc=SLURM_ERROR; diff --git a/src/sacct/options.c b/src/sacct/options.c index fd38223ef56e96155db54deebcc24d3e27d1c63c..4e14c57493b2f34b606fd5c5c601324144563947 100644 --- a/src/sacct/options.c +++ b/src/sacct/options.c @@ -1100,6 +1100,9 @@ void do_dump(void) step->exitcode=1; } _dump_header(step->header); + if(step->end == 0) /* step->end is still 0: substitute job->end before it is converted with gmtime_r() below */ + step->end = job->end; + gmtime_r(&step->end, &ts); printf("JOB_STEP 1 50 %u %04d%02d%02d%02d%02d%02d ", step->stepnum, @@ -1746,6 +1749,8 @@ void do_list(void) if(!selected_status[step->status]) continue; } + if(step->end == 0) + 
step->end = job->end; /* same fallback as in do_dump: a step whose end is still 0 is printed with job->end */ print_fields(JOBSTEP, step); } list_iterator_destroy(itr_step); diff --git a/src/sacct/process.c b/src/sacct/process.c index eb38f3c1ecef15e055f663d9aef559085b46b65b..996969cd5fe4d3da61a32c50aec39a54fef902ac 100644 --- a/src/sacct/process.c +++ b/src/sacct/process.c @@ -28,19 +28,27 @@ #include "sacct.h" -job_rec_t *_find_job_record(acct_header_t header); +job_rec_t *_find_job_record(acct_header_t header, int type); +int _remove_job_record(uint32_t jobnum); step_rec_t *_find_step_record(job_rec_t *job, long jobstep); job_rec_t *_init_job_rec(acct_header_t header); step_rec_t *_init_step_rec(acct_header_t header); int _parse_line(char *f[], void **data); -job_rec_t *_find_job_record(acct_header_t header) +job_rec_t *_find_job_record(acct_header_t header, int type) /* type: kind of record the caller is processing; the call sites in this patch pass JOB_START, JOB_STEP, JOB_SUSPEND or JOB_TERMINATED */ { job_rec_t *job = NULL; ListIterator itr = list_iterator_create(jobs); while((job = (job_rec_t *)list_next(itr)) != NULL) { if (job->header.jobnum == header.jobnum) { + if(job->header.job_submit == 0 && type == JOB_START) { /* a JOB_START lookup matched a record whose job_submit is still 0: unlink and free that record, then leave the loop with job == NULL */ + list_remove(itr); + destroy_job(job); + job = NULL; + break; + } + if(job->header.job_submit == BATCH_JOB_TIMESTAMP) { job->header.job_submit = header.job_submit; break; @@ -64,6 +72,23 @@ job_rec_t *_find_job_record(acct_header_t header) return job; } +int _remove_job_record(uint32_t jobnum) /* Unlink and destroy every entry of the jobs list whose header.jobnum equals jobnum. Returns SLURM_SUCCESS if at least one entry was removed, SLURM_ERROR if none matched. */ +{ + job_rec_t *job = NULL; + int rc = SLURM_ERROR; + ListIterator itr = list_iterator_create(jobs); + + while((job = (job_rec_t *)list_next(itr)) != NULL) { + if (job->header.jobnum == jobnum) { + list_remove(itr); + destroy_job(job); + rc = SLURM_SUCCESS; /* no break here: the scan continues, so any further records with the same jobnum are removed as well */ + } + } + list_iterator_destroy(itr); + return rc; +} + step_rec_t *_find_step_record(job_rec_t *job, long stepnum) { step_rec_t *step = NULL; @@ -229,15 +254,20 @@ void process_start(char *f[], int lc, int show_full) job_rec_t *temp = NULL; _parse_line(f, (void **)&temp); - job = _find_job_record(temp->header); + job = _find_job_record(temp->header, JOB_START); if (job) { /* Hmmm... 
that's odd */ - fprintf(stderr, - "Conflicting JOB_START for job %u at" - " line %d -- ignoring it\n", - job->header.jobnum, lc); - input_error++; - destroy_job(temp); - return; + printf("job->header.job_submit = %d", (int)job->header.job_submit); /* NOTE(review): writes to stdout with no trailing newline; looks like leftover debugging output -- confirm it is meant to ship */ + if(job->header.job_submit == 0) /* NOTE(review): _find_job_record() called with JOB_START already removes a record whose job_submit is 0 and sets job to NULL, so this branch may never run -- confirm against the part of that function not shown in this patch */ + _remove_job_record(job->header.jobnum); + else { + fprintf(stderr, + "Conflicting JOB_START for job %u at" + " line %d -- ignoring it\n", + job->header.jobnum, lc); + input_error++; + destroy_job(temp); + return; + } } job = temp; @@ -256,7 +286,7 @@ void process_step(char *f[], int lc, int show_full) _parse_line(f, (void **)&temp); - job = _find_job_record(temp->header); + job = _find_job_record(temp->header, JOB_STEP); if (temp->stepnum == -2) { destroy_step(temp); @@ -308,7 +338,8 @@ void process_step(char *f[], int lc, int show_full) step = temp; temp = NULL; list_append(job->steps, step); - + if(job->header.timestamp == 0) /* job header timestamp is still 0: adopt the timestamp of the step that was just appended */ + job->header.timestamp = step->header.timestamp; job->job_step_seen = 1; job->ntasks += step->ntasks; if(!job->nodes || !strcmp(job->nodes, "(unknown)")) { @@ -371,7 +402,7 @@ void process_suspend(char *f[], int lc, int show_full) job_rec_t *temp = NULL; _parse_line(f, (void **)&temp); - job = _find_job_record(temp->header); + job = _find_job_record(temp->header, JOB_SUSPEND); if (!job) job = _init_job_rec(temp->header); @@ -390,7 +421,7 @@ void process_terminated(char *f[], int lc, int show_full) job_rec_t *temp = NULL; _parse_line(f, (void **)&temp); - job = _find_job_record(temp->header); + job = _find_job_record(temp->header, JOB_TERMINATED); if (!job) { /* fake it for now */ job = _init_job_rec(temp->header); if (params.opt_verbose > 1)