diff --git a/src/sacct/options.c b/src/sacct/options.c index 99bc87ab484d4e18a6938028d7ade6dd3f938c86..2bf72742775165a317da0e6e390278dad7407909 100644 --- a/src/sacct/options.c +++ b/src/sacct/options.c @@ -128,7 +128,7 @@ void _dump_header(acct_header_t header) (int)header.timestamp, header.uid, header.gid, - "-", /* reserved 2 */ + header.blockid, /* block id */ "-"); /* reserved 1 */ } /* _open_log_file() -- find the current or specified log file, and open it @@ -1449,7 +1449,7 @@ void do_fdump(char* f[], int lc) "timestamp", /* F_TIMESTAMP */ "uid", /* F_UIDGID */ "gid", /* F_UIDGID */ - "reserved-1",/* F_RESERVED1 */ + "BlockID", /* F_BLOCKID */ "reserved-2",/* F_RESERVED1 */ "recordType",/* F_RECTYPE */ NULL}; diff --git a/src/sacct/print.c b/src/sacct/print.c index ad26054c8c9ee9b1d4229c57df1b2a129d2766c7..d3011ee137acd4b75136ac0b1a22d27b42b29c84 100644 --- a/src/sacct/print.c +++ b/src/sacct/print.c @@ -644,6 +644,35 @@ void print_partition(type_t type, void *object) } } +void print_blockid(type_t type, void *object) +{ + job_rec_t *job = (job_rec_t *)object; + step_rec_t *step = (step_rec_t *)object; + + switch(type) { + case HEADLINE: + printf("%-16s", "BlockID"); + break; + case UNDERSCORE: + printf("%-16s", "----------------"); + break; + case JOB: + if(strlen(job->header.partition)<17) + printf("%-16s", job->header.partition); + else + printf("%-13.13s...", job->header.partition); + + break; + case JOBSTEP: + if(strlen(step->header.partition)<17) + printf("%-16s", step->header.blockid); + else + printf("%-13.13s...", step->header.blockid); + + break; + } +} + void print_psize(type_t type, void *object) { job_rec_t *job = (job_rec_t *)object; diff --git a/src/sacct/process.c b/src/sacct/process.c index d9affb9db42d1e8da402dad4c2326dbed0a27df5..1e5f5f8665a86fa22ac9b3e1c528e37080a410d4 100644 --- a/src/sacct/process.c +++ b/src/sacct/process.c @@ -30,7 +30,8 @@ job_rec_t *_find_job_record(acct_header_t header); step_rec_t *_find_step_record(job_rec_t *job, long jobstep); -job_rec_t *_init_job_rec(acct_header_t header, int lc); +job_rec_t *_init_job_rec(acct_header_t header); +step_rec_t *_init_step_rec(acct_header_t header); int _parse_line(char *f[], void **data); job_rec_t *_find_job_record(acct_header_t header) @@ -80,50 +81,55 @@ step_rec_t *_find_step_record(job_rec_t *job, long stepnum) return step; } -job_rec_t *_init_job_rec(acct_header_t header, int lc) +job_rec_t *_init_job_rec(acct_header_t header) { job_rec_t *job = xmalloc(sizeof(job_rec_t)); - - job->header.jobnum = header.jobnum; - job->header.partition = xstrdup(header.partition); - job->header.job_start = header.job_start; - job->header.timestamp = header.timestamp; - job->header.uid = header.uid; - job->header.gid = header.gid; + memcpy(&job->header, &header, sizeof(acct_header_t)); + memset(&job->rusage, 0, sizeof(struct rusage)); job->job_start_seen = 0; job->job_step_seen = 0; job->job_terminated_seen = 0; job->jobnum_superseded = 0; job->jobname = xstrdup("(unknown)"); job->status = JOB_PENDING; + job->nodes = NULL; + job->jobname = NULL; + job->exitcode = 0; + job->priority = 0; + job->ntasks = 0; + job->ncpus = 0; + job->elapsed = 0; job->tot_cpu_sec = 0; job->tot_cpu_usec = 0; - job->rusage.ru_utime.tv_sec = 0; - job->rusage.ru_utime.tv_usec += 0; - job->rusage.ru_stime.tv_sec += 0; - job->rusage.ru_stime.tv_usec += 0; - job->rusage.ru_inblock += 0; - job->rusage.ru_oublock += 0; - job->rusage.ru_msgsnd += 0; - job->rusage.ru_msgrcv += 0; - job->rusage.ru_nsignals += 0; - job->rusage.ru_nvcsw += 0; - job->rusage.ru_nivcsw += 0; - job->rusage.ru_maxrss = 0; - job->rusage.ru_ixrss = 0; - job->rusage.ru_idrss = 0; - job->rusage.ru_isrss = 0; - job->rusage.ru_minflt = 0; - job->rusage.ru_majflt = 0; - job->rusage.ru_nswap = 0; job->vsize = 0; - job->psize = 0; - job->exitcode = 0; + job->psize = 0; job->steps = list_create(destroy_step); job->nodes = NULL; + job->track_steps = 0; + return job; } +step_rec_t *_init_step_rec(acct_header_t header) +{ + step_rec_t *step = xmalloc(sizeof(job_rec_t)); + memcpy(&step->header, &header, sizeof(acct_header_t)); + memset(&step->rusage, 0, sizeof(struct rusage)); + step->stepnum = (uint32_t)NO_VAL; + step->nodes = NULL; + step->stepname = NULL; + step->status = NO_VAL; + step->exitcode = NO_VAL; + step->ntasks = (uint32_t)NO_VAL; + step->ncpus = (uint32_t)NO_VAL; + step->elapsed = (uint32_t)NO_VAL; + step->tot_cpu_sec = (uint32_t)NO_VAL; + step->tot_cpu_usec = (uint32_t)NO_VAL; + step->vsize = (uint32_t)NO_VAL; + step->psize = (uint32_t)NO_VAL; + + return step; +} int _parse_header(char *f[], acct_header_t *header) { header->jobnum = atoi(f[F_JOB]); @@ -132,6 +138,7 @@ int _parse_header(char *f[], acct_header_t *header) header->timestamp = atoi(f[F_TIMESTAMP]); header->uid = atoi(f[F_UID]); header->gid = atoi(f[F_GID]); + header->blockid = xstrdup(f[F_BLOCKID]); return SLURM_SUCCESS; } @@ -140,11 +147,12 @@ int _parse_line(char *f[], void **data) int i = atoi(f[F_RECTYPE]); job_rec_t **job = (job_rec_t **)data; step_rec_t **step = (step_rec_t **)data; - + acct_header_t header; + _parse_header(f, &header); + switch(i) { case JOB_START: - *job = xmalloc(sizeof(job_rec_t)); - _parse_header(f, &(*job)->header); + *job = _init_job_rec(header); (*job)->jobname = xstrdup(f[F_JOBNAME]); (*job)->track_steps = atoi(f[F_TRACK_STEPS]); (*job)->priority = atoi(f[F_PRIORITY]); @@ -159,8 +167,7 @@ int _parse_line(char *f[], void **data) } break; case JOB_STEP: - *step = xmalloc(sizeof(step_rec_t)); - _parse_header(f, &(*step)->header); + *step = _init_step_rec(header); (*step)->stepnum = atoi(f[F_JOBSTEP]); (*step)->status = atoi(f[F_STATUS]); (*step)->exitcode = atoi(f[F_EXITCODE]); @@ -191,11 +198,11 @@ int _parse_line(char *f[], void **data) (*step)->psize = atoi(f[F_PSIZE]); (*step)->stepname = xstrdup(f[F_STEPNAME]); (*step)->nodes = xstrdup(f[F_STEPNODES]); + break; case JOB_SUSPEND: case JOB_TERMINATED: - *job = xmalloc(sizeof(job_rec_t)); - _parse_header(f, &(*job)->header); + *job = _init_job_rec(header); (*job)->elapsed = atoi(f[F_TOT_ELAPSED]); (*job)->status = atoi(f[F_STATUS]); break; @@ -223,18 +230,11 @@ void process_start(char *f[], int lc) return; } - job = _init_job_rec(temp->header, lc); + job = temp; + list_append(jobs, job); job->job_start_seen = 1; - job->header.uid = temp->header.uid; - job->header.gid = temp->header.gid; - xfree(job->jobname); - job->jobname = xstrdup(temp->jobname); - job->priority = temp->priority; - job->track_steps = temp->track_steps; - job->ncpus = temp->ncpus; - job->nodes = xstrdup(temp->nodes); - destroy_job(temp); + } void process_step(char *f[], int lc) @@ -253,7 +253,7 @@ void process_step(char *f[], int lc) return; } if (!job) { /* fake it for now */ - job = _init_job_rec(temp->header, lc); + job = _init_job_rec(temp->header); if ((params.opt_verbose > 1) && (params.opt_jobstep_list==NULL)) fprintf(stderr, @@ -291,6 +291,7 @@ void process_step(char *f[], int lc) step->psize = temp->psize; xfree(step->stepname); step->stepname = xstrdup(temp->stepname); + destroy_step(temp); goto got_step; } step = temp; @@ -305,7 +306,7 @@ void process_step(char *f[], int lc) } got_step: - destroy_step(temp); + if (job->job_terminated_seen == 0) { /* If the job is still running, this is the most recent @@ -359,7 +360,7 @@ void process_suspend(char *f[], int lc) _parse_line(f, (void **)&temp); job = _find_job_record(temp->header); if (!job) - job = _init_job_rec(temp->header, lc); + job = _init_job_rec(temp->header); if (job->status == JOB_SUSPENDED) job->elapsed -= temp->elapsed; @@ -377,7 +378,7 @@ void process_terminated(char *f[], int lc) _parse_line(f, (void **)&temp); job = _find_job_record(temp->header); if (!job) { /* fake it for now */ - job = _init_job_rec(temp->header, lc); + job = _init_job_rec(temp->header); if (params.opt_verbose > 1) fprintf(stderr, "Note: JOB_TERMINATED record for job " "%u preceded " @@ -418,13 +419,21 @@ finished: destroy_job(temp); } +void destroy_acct_header(void *object) +{ + acct_header_t *header = (acct_header_t *)object; + if(header) { + xfree(header->partition); + xfree(header->blockid); + } +} void destroy_job(void *object) { job_rec_t *job = (job_rec_t *)object; if (job) { if(job->steps) list_destroy(job->steps); - xfree(job->header.partition); + destroy_acct_header(&job->header); xfree(job->jobname); xfree(job->nodes); xfree(job); @@ -435,7 +444,7 @@ void destroy_step(void *object) { step_rec_t *step = (step_rec_t *)object; if (step) { - xfree(step->header.partition); + destroy_acct_header(&step->header); xfree(step->stepname); xfree(step->nodes); xfree(step); diff --git a/src/sacct/sacct.c b/src/sacct/sacct.c index 9dad019a36243e66c079e6552dc9d52c48a2730e..cc25862a7ce729ba5499d1b7ef07ecd24c4fa81a 100644 --- a/src/sacct/sacct.c +++ b/src/sacct/sacct.c @@ -165,6 +165,7 @@ fields_t fields[] = {{"cpu", print_cpu}, {"nvcsw", print_nvcsw}, {"outblocks", print_outblocks}, {"partition", print_partition}, + {"blockid", print_blockid}, {"psize", print_psize}, {"rss", print_rss}, {"status", print_status}, diff --git a/src/sacct/sacct.h b/src/sacct/sacct.h index 94b1c52ebf26619aa72f315c77a4fd2aef3c6e40..b94044bbaf88beb51ae930c01fdf50ef659d818f 100644 --- a/src/sacct/sacct.h +++ b/src/sacct/sacct.h @@ -83,7 +83,7 @@ enum { F_JOB = 0, F_TIMESTAMP, F_UID, F_GID, - F_RESERVED1, + F_BLOCKID, F_RESERVED2, F_RECTYPE, HEADER_LENGTH @@ -159,6 +159,7 @@ enum { CANCELLED, typedef struct header { uint32_t jobnum; char *partition; + char *blockid; time_t job_start; time_t timestamp; uint32_t uid; @@ -181,24 +182,28 @@ typedef struct job_rec { int32_t status; int32_t exitcode; uint32_t elapsed; - uint32_t tot_cpu_sec, tot_cpu_usec; - uint32_t vsize, psize; + uint32_t tot_cpu_sec; + uint32_t tot_cpu_usec; + uint32_t vsize; + uint32_t psize; struct rusage rusage; List steps; } job_rec_t; typedef struct step_rec { - acct_header_t header; + acct_header_t header; uint32_t stepnum; /* job's step number */ - uint32_t next; /* linked list of job steps */ char *nodes; char *stepname; int32_t status; int32_t exitcode; - uint32_t ntasks, ncpus; + uint32_t ntasks; + uint32_t ncpus; uint32_t elapsed; - uint32_t tot_cpu_sec, tot_cpu_usec; - uint32_t vsize, psize; + uint32_t tot_cpu_sec; + uint32_t tot_cpu_usec; + uint32_t vsize; + uint32_t psize; struct rusage rusage; } step_rec_t; @@ -251,6 +256,7 @@ void process_start(char *f[], int lc); void process_step(char *f[], int lc); void process_suspend(char *f[], int lc); void process_terminated(char *f[], int lc); +void destroy_acct_header(void *object); void destroy_job(void *object); void destroy_step(void *object); @@ -281,6 +287,7 @@ void print_ntasks(type_t type, void *object); void print_nvcsw(type_t type, void *object); void print_outblocks(type_t type, void *object); void print_partition(type_t type, void *object); +void print_blockid(type_t type, void *object); void print_psize(type_t type, void *object); void print_rss(type_t type, void *object); void print_status(type_t type, void *object); diff --git a/src/slurmctld/jobacct.c b/src/slurmctld/jobacct.c index a70fc6f487e0b7094731a66cac4608c210017931..798c000d393fe6de5e77ad6e342b1abe411ec8ff 100644 --- a/src/slurmctld/jobacct.c +++ b/src/slurmctld/jobacct.c @@ -79,24 +79,33 @@ const char *_jobstep_format = static int _print_record(struct job_record *job_ptr, time_t time, char *data) { - struct tm *ts; /* timestamp decoder */ static int rc=SLURM_SUCCESS; + char *block_id = NULL; - ts = xmalloc(sizeof(struct tm)); - gmtime_r(&time, ts); debug3("_print_record, job=%u, \"%s\"", job_ptr->job_id, data); +#ifdef HAVE_BG + select_g_get_jobinfo(job_ptr->select_jobinfo, + SELECT_DATA_BLOCK_ID, + &block_id); + +#endif + if(!block_id) + block_id = xstrdup("-"); + slurm_mutex_lock( &logfile_lock ); + if (fprintf(LOGFILE, - "%u %s %u %u %d %d - - %s\n", + "%u %s %u %u %d %d %s - %s\n", job_ptr->job_id, job_ptr->partition, (int)job_ptr->start_time, (int)time, - job_ptr->user_id, job_ptr->group_id, data) + job_ptr->user_id, job_ptr->group_id, block_id, data) < 0) rc=SLURM_ERROR; fdatasync(LOGFILE_FD); slurm_mutex_unlock( &logfile_lock ); - xfree(ts); + xfree(block_id); + return rc; } @@ -146,6 +155,7 @@ int jobacct_job_start(struct job_record *job_ptr) debug("jobacct init was not called or it failed"); return SLURM_ERROR; } + debug2("jobacct_job_start() called"); for (i=0; i < job_ptr->num_cpu_groups; i++) ncpus += (job_ptr->cpus_per_node[i]) @@ -185,17 +195,39 @@ int jobacct_step_start(struct step_record *step) { char buf[BUFFER_SIZE]; int cpus = 0; + char node_list[BUFFER_SIZE]; +#ifdef HAVE_BG + uint16_t quarter = (uint16_t)NO_VAL; + uint16_t nodecard = (uint16_t)NO_VAL; +#endif if(!init) { debug("jobacct init was not called or it failed"); return SLURM_ERROR; } + #ifdef HAVE_BG - cpus = step->job_ptr->num_procs; + select_g_get_jobinfo(step->job_ptr->select_jobinfo, + SELECT_DATA_QUARTER, + &quarter); + select_g_get_jobinfo(step->job_ptr->select_jobinfo, + SELECT_DATA_NODECARD, + &nodecard); + if(quarter != (uint16_t)NO_VAL + && nodecard != (uint16_t)NO_VAL) + snprintf(node_list, BUFFER_SIZE, + "%s.%d.%d", step->step_node_list, quarter, nodecard); + else if(quarter != (uint16_t)NO_VAL) + snprintf(node_list, BUFFER_SIZE, + "%s.%d", step->step_node_list, quarter); + else + snprintf(node_list, BUFFER_SIZE, "%s", step->step_node_list); + #else cpus = step->num_cpus; + snprintf(node_list, BUFFER_SIZE, "%s", step->step_node_list); + block_id = xstrdup("-"); #endif - snprintf(buf, BUFFER_SIZE, _jobstep_format, JOB_STEP, step->step_id, /* stepid */ @@ -226,8 +258,9 @@ int jobacct_step_start(struct step_record *step) 0, /* total nivcsw */ 0, /* max vsize */ 0, /* max psize */ - step->name, /* step exe name */ - step->step_node_list); /* name of nodes step running on */ + step->name, /* step exe name */ + node_list); /* name of nodes step running on */ + return _print_record(step->job_ptr, step->start_time, buf); } @@ -238,6 +271,11 @@ int jobacct_step_complete(struct step_record *step) int elapsed; int comp_status; int cpus = 0; + char node_list[BUFFER_SIZE]; +#ifdef HAVE_BG + uint16_t quarter = (uint16_t)NO_VAL; + uint16_t nodecard = (uint16_t)NO_VAL; +#endif if(!init) { debug("jobacct init was not called or it failed"); @@ -254,9 +292,26 @@ int jobacct_step_complete(struct step_record *step) comp_status = JOB_COMPLETE; #ifdef HAVE_BG - cpus = step->job_ptr->num_procs; + select_g_get_jobinfo(step->job_ptr->select_jobinfo, + SELECT_DATA_QUARTER, + &quarter); + select_g_get_jobinfo(step->job_ptr->select_jobinfo, + SELECT_DATA_NODECARD, + &nodecard); + if(quarter != (uint16_t)NO_VAL + && nodecard != (uint16_t)NO_VAL) + snprintf(node_list, BUFFER_SIZE, + "%s.%d.%d", step->step_node_list, quarter, nodecard); + else if(quarter != (uint16_t)NO_VAL) + snprintf(node_list, BUFFER_SIZE, + "%s.%d", step->step_node_list, quarter); + else + snprintf(node_list, BUFFER_SIZE, "%s", step->step_node_list); + #else cpus = step->num_cpus; + snprintf(node_list, BUFFER_SIZE, "%s", step->step_node_list); + block_id = xstrdup("-"); #endif snprintf(buf, BUFFER_SIZE, _jobstep_format, @@ -294,8 +349,8 @@ int jobacct_step_complete(struct step_record *step) step->max_vsize, /* max vsize */ step->max_psize, /* max psize */ step->name, /* step exe name */ - step->step_node_list); /* name of nodes step running on */ - + node_list); /* name of nodes step running on */ + return _print_record(step->job_ptr, now, buf); } diff --git a/src/slurmctld/jobacct.h b/src/slurmctld/jobacct.h index 94bd1b6c0f555953dcbe5c9c5a027a68e0ea8174..a5e6f0a38b2e41922adb4fcaa2fe95f4a37555ba 100644 --- a/src/slurmctld/jobacct.h +++ b/src/slurmctld/jobacct.h @@ -35,6 +35,7 @@ #include <slurm/slurm_errno.h> #include <sys/stat.h> #include "src/common/xstring.h" +#include "src/common/node_select.h" #include "slurmctld.h" int jobacct_init(char *job_acct_log);