From 31de3b36b44ca782e349886dad69bba2785d03e8 Mon Sep 17 00:00:00 2001 From: Danny Auble <da@llnl.gov> Date: Thu, 6 Apr 2006 15:36:32 +0000 Subject: [PATCH] even more files not added --- src/sacct/options.c | 1661 +++++++++++++++++++++++++++++++++++++++++++ src/sacct/print.c | 855 ++++++++++++++++++++++ src/sacct/process.c | 442 ++++++++++++ src/sacct/sacct.h | 307 ++++++++ 4 files changed, 3265 insertions(+) create mode 100644 src/sacct/options.c create mode 100644 src/sacct/print.c create mode 100644 src/sacct/process.c create mode 100644 src/sacct/sacct.h diff --git a/src/sacct/options.c b/src/sacct/options.c new file mode 100644 index 00000000000..9264c7f2fd1 --- /dev/null +++ b/src/sacct/options.c @@ -0,0 +1,1661 @@ +/*****************************************************************************\ + * options.c - option functions for sacct + * + * $Id: options.c 7541 2006-03-18 01:44:58Z da $ + ***************************************************************************** + * Copyright (C) 2006 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Danny Auble <da@llnl.gov>. + * UCRL-CODE-217948. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. 
+ * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +\*****************************************************************************/ + +#include "sacct.h" + +typedef struct expired_rec { /* table of expired jobs */ + uint32_t job; + time_t job_start; + char *line; +} expired_rec_t; + +void _destroy_parts(void *object); +void _destroy_steps(void *object); +void _destroy_exp(void *object); +char *_convert_type(int rec_type); +int _cmp_jrec(const void *a1, const void *a2); +void _dump_header(acct_header_t header); +FILE *_open_log_file(void); +void _help_fields_msg(void); +void _help_msg(void); +void _usage(void); +void _init_params(); +char *_prefix_filename(char *path, char *prefix); + +int selected_status[STATUS_COUNT]; +List selected_parts = NULL; +List selected_steps = NULL; + +void _destroy_parts(void *object) +{ + char *part = (char *)object; + xfree(part); +} + +void _destroy_steps(void *object) +{ + selected_step_t *step = (selected_step_t *)object; + if(step) { + xfree(step->job); + xfree(step->step); + xfree(step); + } +} +void _destroy_exp(void *object) +{ + expired_rec_t *exp_rec = (expired_rec_t *)object; + if(exp_rec) { + xfree(exp_rec->line); + xfree(exp_rec); + } +} + +char *_convert_type(int rec_type) +{ + switch(rec_type) { + case JOB_START: + return "JOB_START"; + case JOB_STEP: + return "JOB_STEP"; + case JOB_TERMINATED: + return "JOB_TERMINATED"; + default: + return "UNKNOWN"; + } +} + +void _show_rec(char *f[]) +{ + int i; + fprintf(stderr, "rec>"); + for (i=0; f[i]; i++) + fprintf(stderr, " %s", f[i]); + fprintf(stderr, "\n"); + return; +} + +int _cmp_jrec(const void *a1, const void *a2) { + expired_rec_t *j1 = (expired_rec_t *) a1; + expired_rec_t *j2 = (expired_rec_t *) a2; + + if (j1->job < j2->job) + return -1; + else if (j1->job == j2->job) { + if(j1->job_start == j2->job_start) + 
return 0; + else + return 1; + } + return 1; +} + +/* _dump_header() -- dump the common fields of a record + * + * In: Index into the jobs table + * Out: Nothing. + */ +void _dump_header(acct_header_t header) +{ + printf("%d %s %d %d %d %d %s %s ", + header.jobnum, + header.partition, + (int)header.job_start, + (int)header.timestamp, + header.uid, + header.gid, + "-", /* reserved 2 */ + "-"); /* reserved 1 */ +} +/* _open_log_file() -- find the current or specified log file, and open it + * + * IN: Nothing + * RETURNS: Nothing + * + * Side effects: + * - Sets opt_filein to the current system accounting log unless + * the user specified another file. + */ + +FILE *_open_log_file(void) +{ + FILE *fd = fopen(params.opt_filein, "r"); + if (fd == NULL) { + perror(params.opt_filein); + exit(1); + } + return fd; +} + +void _help_fields_msg(void) +{ + int i; + + for (i = 0; fields[i].name; i++) { + if (i & 3) + printf(" "); + else + printf("\n"); + printf("%-10s", fields[i].name); + } + printf("\n"); + return; +} + +void _help_msg(void) +{ + printf("\n" + "By default, sacct displays accounting data for all jobs and job\n" + "steps that are present in the log.\n" + "\n" + "Notes:\n" + "\n" + " * If --dump is specified,\n" + " * The field selection options (--brief, --fields, ...)\n" + " have no effect\n" + " * Elapsed time fields are presented as 2 fields, integral\n" + " seconds and integral microseconds\n" + " * If --dump is not specified, elapsed time fields are presented\n" + " as [[days-]hours:]minutes:seconds.hundredths\n" + " * The default input file is the file named in the \"jobacct_logfile\"\n" + " parameter in " SLURM_CONFIG_FILE ".\n" + "\n" + "Options:\n" + "\n" + "-a, --all\n" + " Display job accounting data for all users. By default, only\n" + " data for the current user is displayed for users other than\n" + " root.\n" + "-b, --brief\n" + " Equivalent to \"--fields=jobstep,status,error\". 
This option\n" + " has no effect if --dump is specified.\n" + "-d, --dump\n" + " Dump the raw data records\n" + "--duplicates\n" + " If SLURM job ids are reset, but the job accounting log file\n" + " isn't reset at the same time (with -e, for example), some\n" + " job numbers will probably appear more than once in the\n" + " accounting log file to refer to different jobs; such jobs\n" + " can be distinguished by the \"job_start\" time stamp in the\n" + " data records.\n" + " When data for specific jobs or jobsteps are requested with\n" + " the --jobs or --jobsteps options, we assume that the user\n" + " wants to see only the most recent job with that number. This\n" + " behavior can be overridden by specifying --duplicates, in\n" + " which case all records that match the selection criteria\n" + " will be returned.\n" + " When neither --jobs or --jobsteps is specified, we report\n" + " data for all jobs that match the selection criteria, even if\n" + " some of the job numbers are reused. Specify that you only\n" + " want the most recent job for each selected job number with\n" + " the --noduplicates option.\n" + "-e <timespec>, --expire=<timespec>\n" + " Remove jobs from SLURM's current accounting log file (or the\n" + " file specified with --file) that completed more than <timespec>\n" + " ago. If <timespec> is an integer, it is interpreted as\n" + " minutes. If <timespec> is an integer followed by \"h\", it is\n" + " interpreted as a number of hours. If <timespec> is an integer\n" + " followed by \"d\", it is interpreted as number of days. For\n" + " example, \"--expire=14d\" means that you wish to purge the job\n" + " accounting log of all jobs that completed more than 14 days ago.\n" + "-F <field-list>, --fields=<field-list>\n" + " Display the specified data (use \"--help-fields\" for a\n" + " list of available fields). 
If no field option is specified,\n" + " we use \"--fields=jobstep,jobname,partition,ncpus,status,error\".\n" + "-f<file>, --file=<file>\n" + " Read data from the specified file, rather than SLURM's current\n" + " accounting log file.\n" + "-l, --long\n" + " Equivalent to specifying\n" + " \"--fields=jobstep,usercpu,systemcpu,minflt,majflt,nprocs,\n" + " ncpus,elapsed,status,exitcode\"\n" + "-O, --formatted_dump\n" + " Dump accounting records in an easy-to-read format, primarily\n" + " for debugging.\n" + "-g <gid>, --gid <gid>\n" + " Select only jobs submitted from the <gid> group.\n" + "-h, --help\n" + " Print a general help message.\n" + "--help-fields\n" + " Print a list of fields that can be specified with the\n" + " \"--fields\" option\n" + "-j <job_list>, --jobs=<job_list>\n" + " Display information about this job or comma-separated\n" + " list of jobs. The default is all jobs.\n" + "-J <job.step>, --jobstep=<job.step>\n" + " Show data only for the specified step of the specified job.\n" + "--noduplicates\n" + " See the discussion under --duplicates.\n" + "--noheader\n" + " Print (or don't print) a header. The default is to print a\n" + " header; the option has no effect if --dump is specified\n" + "-p <part_list>, --partition=<part_list>\n" + " Display or purge information about jobs and job steps in the\n" + " <part_list> partition(s). The default is all partitions.\n" + "-P --purge\n" + " Used in conjunction with --expire to remove invalid data\n" + " from the job accounting log.\n" + "-s <state-list>, --state=<state-list>\n" + " Select jobs based on their current status: running (r),\n" + " completed (cd), failed (f), timeout (to), and node_fail (nf).\n" + "-t, --total\n" + " Only show cumulative statistics for each job, not the\n" + " intermediate steps\n" + "-u <uid>, --uid <uid>\n" + " Select only jobs submitted by the user with uid <uid>. 
/* _prefix_filename() -- insert a filename prefix into a path
 *
 * IN:	path   = path+file name
 *	prefix = text to insert directly in front of the file name, i.e.
 *		 after the last '/' (or at the very start if path has no '/')
 * RETURNS: pointer to a new string obtained from xmalloc()
 */
char *_prefix_filename(char *path, char *prefix)
{
	char *last_slash = strrchr(path, '/');
	size_t dir_len = last_slash ? (size_t)(last_slash - path) + 1 : 0;
	size_t path_len = strlen(path);
	size_t prefix_len = strlen(prefix);
	char *result = xmalloc(path_len + prefix_len + 1);

	/* directory part (including the trailing '/'), if any */
	memcpy(result, path, dir_len);
	/* the prefix */
	memcpy(result + dir_len, prefix, prefix_len);
	/* the file name together with its terminating NUL */
	memcpy(result + dir_len + prefix_len, path + dir_len,
	       path_len - dir_len + 1);

	return result;
}
JOB_RUNNING; + else if (!strcasecmp(status, "su")) + return JOB_SUSPENDED; + else if (!strcasecmp(status, "cd")) + return JOB_COMPLETE; + else if (!strcasecmp(status, "ca")) + return JOB_CANCELLED; + else if (!strcasecmp(status, "f")) + return JOB_FAILED; + else if (!strcasecmp(status, "to")) + return JOB_TIMEOUT; + else if (!strcasecmp(status, "nf")) + return JOB_NODE_FAIL; + else if (!strcasecmp(status, "je")) + return JOB_END; + else + return -1; // unknown +} + +char *decode_status_int(int status) +{ + switch(status) { + case JOB_PENDING: + return "PENDING"; /* we should never see this */ + case JOB_RUNNING: + return "RUNNING"; + case JOB_SUSPENDED: + return "SUSPENDED"; + case JOB_COMPLETE: + return "COMPLETED"; + case JOB_CANCELLED: + return "CANCELLED"; + case JOB_FAILED: + return "FAILED"; + case JOB_TIMEOUT: + return "TIMEOUT"; + case JOB_NODE_FAIL: + return "NODE_FAILED"; + case JOB_END: + return "JOB_END"; + default: + return "UNKNOWN"; + } + /* if (!strcasecmp(cs, "ca")) */ +/* return "CANCELLED"; */ +/* else if (strcasecmp(cs, "cd")==0) */ +/* return "COMPLETED"; */ +/* else if (strcasecmp(cs, "cg")==0) */ +/* return "COMPLETING"; /\* we should never see this *\/ */ +/* else if (strcasecmp(cs, "f")==0) */ +/* return "FAILED"; */ +/* else if (strcasecmp(cs, "nf")==0) */ +/* return "NODEFAILED"; */ +/* else if (strcasecmp(cs, "p")==0) */ +/* return "PENDING"; /\* we should never see this *\/ */ +/* else if (strcasecmp(cs, "r")==0) */ +/* return "RUNNING"; */ +/* else if (strcasecmp(cs, "to")==0) */ +/* return "TIMEDOUT"; */ +} + +int get_data(void) +{ + char line[BUFFER_SIZE]; + char *f[MAX_RECORD_FIELDS]; /* End list with null entry and, + possibly, more data than we + expected */ + char *fptr; + int i; + FILE *fd = NULL; + int lc = 0; + int rec_type = -1; + selected_step_t *selected_step = NULL; + char *selected_part = NULL; + ListIterator itr = NULL; + + fd = _open_log_file(); + + while (fgets(line, BUFFER_SIZE, fd)) { + lc++; + fptr = line; /* break 
the record into NULL- + terminated strings */ + + for (i = 0; i < MAX_RECORD_FIELDS; i++) { + f[i] = fptr; + fptr = strstr(fptr, " "); + if (fptr == NULL) { + fptr = strstr(f[i], "\n"); + if (fptr) + *fptr = 0; + break; + } else + *fptr++ = 0; + } + f[++i] = 0; + + if(i < HEADER_LENGTH) { + continue; + } + + rec_type = atoi(f[F_RECTYPE]); + + if (list_count(selected_steps)) { + itr = list_iterator_create(selected_steps); + while((selected_step = list_next(itr))) { + if (strcmp(selected_step->job, f[F_JOB])) + continue; + /* job matches; does the step> */ + if (selected_step->step == NULL + || rec_type == JOB_STEP + || !strcmp(f[F_JOBSTEP], + selected_step->step)) { + list_iterator_destroy(itr); + goto foundjob; + } + } + list_iterator_destroy(itr); + continue; /* no match */ + } + foundjob: + + if (list_count(selected_parts)) { + itr = list_iterator_create(selected_parts); + while((selected_part = list_next(itr))) + if (!strcasecmp(f[F_PARTITION], + selected_part)) { + list_iterator_destroy(itr); + goto foundp; + } + list_iterator_destroy(itr); + continue; /* no match */ + } + foundp: + + if (params.opt_fdump) { + do_fdump(f, lc); + continue; + } + + /* Build suitable tables with all the data */ + switch(rec_type) { + case JOB_START: + if(i < JOB_START_LENGTH) { + printf("Bad data on a Job Start\n"); + _show_rec(f); + } else + process_start(f, lc); + break; + case JOB_STEP: + if(i < JOB_STEP_LENGTH) { + printf("Bad data on a Step entry\n"); + _show_rec(f); + } else + process_step(f, lc); + break; + case JOB_SUSPEND: + if(i < JOB_TERM_LENGTH) { + printf("Bad data on a Suspend entry\n"); + _show_rec(f); + } else + process_suspend(f, lc); + break; + case JOB_TERMINATED: + if(i < JOB_TERM_LENGTH) { + printf("Bad data on a Job Term\n"); + _show_rec(f); + } else + process_terminated(f, lc); + break; + default: + if (params.opt_verbose > 1) + fprintf(stderr, + "Invalid record at line %d of " + "input file\n", + lc); + if (params.opt_verbose > 2) + _show_rec(f); + 
input_error++; + break; + } + } + + if (ferror(fd)) { + perror(params.opt_filein); + exit(1); + } + fclose(fd); + + return SLURM_SUCCESS; +} + +void parse_command_line(int argc, char **argv) +{ + extern int optind; + int c, i, optionIndex = 0; + char *end, *start, *acct_type; + selected_step_t *selected_step = NULL; + ListIterator itr = NULL; + struct stat stat_buf; + static struct option long_options[] = { + {"all", 0,0, 'a'}, + {"brief", 0, 0, 'b'}, + {"duplicates", 0, ¶ms.opt_dup, 1}, + {"dump", 0, 0, 'd'}, + {"expire", 1, 0, 'e'}, + {"fields", 1, 0, 'F'}, + {"file", 1, 0, 'f'}, + {"formatted_dump", 0, 0, 'O'}, + {"gid", 1, 0, 'g'}, + {"group", 1, 0, 'g'}, + {"help", 0, ¶ms.opt_help, 1}, + {"help-fields", 0, ¶ms.opt_help, 2}, + {"jobs", 1, 0, 'j'}, + {"jobstep", 1, 0, 'J'}, + {"long", 0, 0, 'l'}, + {"big_logfile", 0, ¶ms.opt_lowmem, 1}, + {"noduplicates", 0, ¶ms.opt_dup, 0}, + {"noheader", 0, ¶ms.opt_header, 0}, + {"partition", 1, 0, 'p'}, + {"purge", 0, 0, 'P'}, + {"state", 1, 0, 's'}, + {"total", 0, 0, 't'}, + {"uid", 1, 0, 'u'}, + {"usage", 0, ¶ms.opt_help, 3}, + {"user", 1, 0, 'u'}, + {"verbose", 0, 0, 'v'}, + {"version", 0, 0, 'V'}, + {0, 0, 0, 0}}; + + _init_params(); + + if ((i=getuid())) /* default to current user unless root*/ + params.opt_uid = i; + + opterr = 1; /* Let getopt report problems to the user */ + + while (1) { /* now cycle through the command line */ + c = getopt_long(argc, argv, "abde:F:f:g:hj:J:lOPp:s:tUu:Vv", + long_options, &optionIndex); + if (c == -1) + break; + switch (c) { + case 'a': + params.opt_uid = -1; + break; + case 'b': + params.opt_field_list = + xrealloc(params.opt_field_list, + (params.opt_field_list==NULL? 
0 : + sizeof(params.opt_field_list)) + + sizeof(BRIEF_FIELDS)+1); + strcat(params.opt_field_list, BRIEF_FIELDS); + strcat(params.opt_field_list, ","); + break; + + case 'd': + params.opt_dump = 1; + break; + + case 'e': + { /* decode the time spec */ + long acc=0; + params.opt_expire_timespec = strdup(optarg); + for (i=0; params.opt_expire_timespec[i]; i++) { + char c = params.opt_expire_timespec[i]; + if (isdigit(c)) { + acc = (acc*10)+(c-'0'); + continue; + } + switch (c) { + case 'D': + case 'd': + params.opt_expire += + acc*SECONDS_IN_DAY; + acc=0; + break; + case 'H': + case 'h': + params.opt_expire += + acc*SECONDS_IN_HOUR; + acc=0; + break; + case 'M': + case 'm': + params.opt_expire += + acc*SECONDS_IN_MINUTE; + acc=0; + break; + default: + params.opt_expire = -1; + goto bad_timespec; + } + } + params.opt_expire += acc*SECONDS_IN_MINUTE; + bad_timespec: + if (params.opt_expire <= 0) { + fprintf(stderr, + "Invalid timspec for " + "--expire: \"%s\"\n", + params.opt_expire_timespec); + exit(1); + } + } + params.opt_uid = -1; /* fix default; can't purge by uid */ + break; + + case 'F': + params.opt_field_list = + xrealloc(params.opt_field_list, + (params.opt_field_list==NULL? 
0 : + strlen(params.opt_field_list)) + + strlen(optarg) + 1); + strcat(params.opt_field_list, optarg); + strcat(params.opt_field_list, ","); + break; + + case 'f': + params.opt_filein = + xrealloc(params.opt_filein, strlen(optarg)+1); + strcpy(params.opt_filein, optarg); + break; + + case 'g': + if (isdigit((int) *optarg)) + params.opt_gid = atoi(optarg); + else { + struct group *grp; + if ((grp=getgrnam(optarg))==NULL) { + fprintf(stderr, + "Invalid group id: %s\n", + optarg); + exit(1); + } + params.opt_gid=grp->gr_gid; + } + break; + + case 'h': + params.opt_help = 1; + break; + + case 'j': + if (strspn(optarg, "0123456789, ") < strlen(optarg)) { + fprintf(stderr, "Invalid jobs list: %s\n", + optarg); + exit(1); + } + params.opt_job_list = + xrealloc(params.opt_job_list, + (params.opt_job_list==NULL? 0 : + strlen(params.opt_job_list)) + + strlen(optarg) + 1); + strcat(params.opt_job_list, optarg); + strcat(params.opt_job_list, ","); + break; + + case 'J': + if (strspn(optarg, ".0123456789, ") < strlen(optarg)) { + fprintf(stderr, "Invalid jobstep list: %s\n", + optarg); + exit(1); + } + params.opt_jobstep_list = + xrealloc(params.opt_jobstep_list, + (params.opt_jobstep_list==NULL? 0 : + strlen(params.opt_jobstep_list)) + + strlen(optarg) + 1); + strcat(params.opt_jobstep_list, optarg); + strcat(params.opt_jobstep_list, ","); + break; + + case 'l': + params.opt_field_list = + xrealloc(params.opt_field_list, + (params.opt_field_list==NULL? 0 : + strlen(params.opt_field_list)) + + sizeof(LONG_FIELDS)+1); + strcat(params.opt_field_list, LONG_FIELDS); + strcat(params.opt_field_list, ","); + break; + + case 'O': + params.opt_fdump = 1; + break; + + case 'P': + params.opt_purge = 1; + break; + + case 'p': + params.opt_partition_list = + xrealloc(params.opt_partition_list, + (params.opt_partition_list==NULL? 
0 : + strlen(params.opt_partition_list)) + + strlen(optarg) + 1); + strcat(params.opt_partition_list, optarg); + strcat(params.opt_partition_list, ","); + break; + + case 's': + params.opt_state_list = + xrealloc(params.opt_state_list, + (params.opt_state_list==NULL? 0 : + strlen(params.opt_state_list)) + + strlen(optarg) + 1); + strcat(params.opt_state_list, optarg); + strcat(params.opt_state_list, ","); + break; + + case 't': + params.opt_total = 1; + break; + + case 'U': + params.opt_help = 3; + break; + + case 'u': + if (isdigit((int) *optarg)) + params.opt_uid = atoi(optarg); + else { + struct passwd *pwd; + if ((pwd=getpwnam(optarg))==NULL) { + fprintf(stderr, + "Invalid user id: %s\n", + optarg); + exit(1); + } + params.opt_uid=pwd->pw_uid; + } + break; + + case 'v': + /* Handle -vvv thusly... + * 0 - report only normal messages and errors + * 1 - report options selected and major operations + * 2 - report data anomalies probably not errors + * 3 - blather on and on + */ + params.opt_verbose++; + break; + + case 'V': + { + char obuf[20]; /* should be long enough */ + char *rev="$Revision: 7267 $"; + char *s; + + s=strstr(rev, " ")+1; + for (i=0; s[i]!=' '; i++) + obuf[i]=s[i]; + obuf[i] = 0; + printf("%s: %s\n", argv[0], obuf); + exit(0); + } + + case ':': + case '?': /* getopt() has explained it */ + exit(1); + } + } + + /* Now set params.opt_dup, unless they've already done so */ + if (params.opt_dup < 0) /* not already set explicitly */ + if (params.opt_job_list || params.opt_jobstep_list) + /* They probably want the most recent job N if + * they requested specific jobs or steps. 
*/ + params.opt_dup = 0; + + if (params.opt_verbose) { + fprintf(stderr, "Options selected:\n" + "\topt_dump=%d\n" + "\topt_dup=%d\n" + "\topt_expire=%s (%lu seconds)\n" + "\topt_fdump=%d\n" + "\topt_field_list=%s\n" + "\topt_filein=%s\n" + "\topt_header=%d\n" + "\topt_help=%d\n" + "\topt_job_list=%s\n" + "\topt_jobstep_list=%s\n" + "\topt_long=%d\n" + "\topt_lowmem=%d\n" + "\topt_partition_list=%s\n" + "\topt_purge=%d\n" + "\topt_state_list=%s\n" + "\topt_total=%d\n" + "\topt_uid=%d\n" + "\topt_verbose=%d\n", + params.opt_dump, + params.opt_dup, + params.opt_expire_timespec, params.opt_expire, + params.opt_fdump, + params.opt_field_list, + params.opt_filein, + params.opt_header, + params.opt_help, + params.opt_job_list, + params.opt_jobstep_list, + params.opt_long, + params.opt_lowmem, + params.opt_partition_list, + params.opt_purge, + params.opt_state_list, + params.opt_total, + params.opt_uid, + params.opt_verbose); + } + + /* check if we have accounting data to view */ + if (params.opt_filein == NULL) + params.opt_filein = slurm_get_jobacct_loc(); + acct_type = slurm_get_jobacct_type(); + if ((strcmp(acct_type, "jobacct/none") == 0) + && (stat(params.opt_filein, &stat_buf) != 0)) { + fprintf(stderr, "SLURM accounting is disabled\n"); + exit(1); + } + xfree(acct_type); + + /* specific partitions requested? */ + if (params.opt_partition_list) { + + start = params.opt_partition_list; + while ((end = strstr(start, ","))) { + *end = 0; + acct_type = xstrdup(start); + list_append(selected_parts, acct_type); + start = end + 1; + } + if (params.opt_verbose) { + fprintf(stderr, "Partitions requested:\n"); + itr = list_iterator_create(selected_parts); + while((start = list_next(itr))) + fprintf(stderr, "\t: %s\n", start); + list_iterator_destroy(itr); + } + } + + /* specific jobsteps requested? 
*/ + if (params.opt_jobstep_list) { + char *dot; + + start = params.opt_jobstep_list; + while ((end = strstr(start, ","))) { + *end = 0;; + while (isspace(*start)) + start++; /* discard whitespace */ + dot = strstr(start, "."); + if (dot == NULL) { + fprintf(stderr, "Invalid jobstep: %s\n", + start); + exit(1); + } + *dot++ = 0; + selected_step = xmalloc(sizeof(selected_step_t)); + list_append(selected_steps, selected_step); + + selected_step->job = xstrdup(start); + selected_step->step = xstrdup(dot); + start = end + 1; + } + if (params.opt_verbose) { + fprintf(stderr, "Job steps requested:\n"); + itr = list_iterator_create(selected_steps); + while((selected_step = list_next(itr))) + fprintf(stderr, "\t: %s.%s\n", + selected_step->job, + selected_step->step); + list_iterator_destroy(itr); + + } + } + + /* specific jobs requested? */ + if (params.opt_job_list) { + start = params.opt_job_list; + while ((end = strstr(start, ","))) { + while (isspace(*start)) + start++; /* discard whitespace */ + *end = 0; + selected_step = xmalloc(sizeof(selected_step_t)); + list_append(selected_steps, selected_step); + + selected_step->job = xstrdup(start); + selected_step->step = NULL; + start = end + 1; + } + if (params.opt_verbose) { + fprintf(stderr, "Jobs requested:\n"); + itr = list_iterator_create(selected_steps); + while((selected_step = list_next(itr))) + fprintf(stderr, "\t: %s\n", + selected_step->job); + list_iterator_destroy(itr); + } + } + + /* specific states (completion status) requested? 
*/ + if (params.opt_state_list) { + start = params.opt_state_list; + while ((end = strstr(start, ","))) { + *end = 0; + selected_status[decode_status_char(start)] = 1; + start = end + 1; + } + if (params.opt_verbose) { + fprintf(stderr, "States requested:\n"); + for(i=0; i< STATUS_COUNT; i++) { + if(selected_status[i]) { + fprintf(stderr, "\t: %s\n", + decode_status_int(i)); + break; + } + } + } + } + + /* select the output fields */ + if (params.opt_field_list==NULL) { + if (params.opt_dump || params.opt_expire) + goto endopt; + params.opt_field_list = xmalloc(sizeof(DEFAULT_FIELDS)+1); + strcpy(params.opt_field_list, DEFAULT_FIELDS); + strcat(params.opt_field_list, ","); + } + start = params.opt_field_list; + while ((end = strstr(start, ","))) { + *end = 0; + for (i = 0; fields[i].name; i++) { + if (!strcasecmp(fields[i].name, start)) + goto foundfield; + } + fprintf(stderr, + "Invalid field requested: \"%s\"\n", + start); + exit(1); + foundfield: + printfields[nprintfields++] = i; + start = end + 1; + } + if (params.opt_verbose) { + fprintf(stderr, "%d field%s selected:\n", + nprintfields, + (nprintfields==1? 
"" : "s")); + for (i = 0; i < nprintfields; i++) + fprintf(stderr, + "\t%s\n", + fields[printfields[i]].name); + } +endopt: + if (optind < argc) { + fprintf(stderr, "Error: Unknown arguments:"); + for (i=optind; i<argc; i++) + fprintf(stderr, " %s", argv[i]); + fprintf(stderr, "\n"); + exit(1); + } + return; +} + +void do_dump(void) +{ + ListIterator itr = NULL; + ListIterator itr_step = NULL; + job_rec_t *job = NULL; + step_rec_t *step = NULL; + + itr = list_iterator_create(jobs); + while((job = list_next(itr))) { + if (!params.opt_dup) + if (job->jobnum_superseded) { + if (params.opt_verbose > 1) + fprintf(stderr, + "Note: Skipping older" + " job %d dated %d\n", + job->header.jobnum, + (int)job->header.job_start); + continue; + } + if (params.opt_uid>=0) + if (job->header.uid != params.opt_uid) + continue; + /* JOB_START */ + if (params.opt_jobstep_list == NULL) { + if (!job->job_start_seen && job->job_step_seen) { + /* If we only saw JOB_TERMINATED, the + * job was probably canceled. 
*/ + fprintf(stderr, + "Error: No JOB_START record for " + "job %d\n", + job->header.jobnum); + } + _dump_header(job->header); + printf("JOB_START %s %d %d %d %s\n", + job->jobname, + job->track_steps, + job->priority, + job->ncpus, + job->nodes); + } + /* JOB_STEP */ + itr_step = list_iterator_create(job->steps); + while((step = list_next(itr_step))) { + if (step->status == JOB_RUNNING && + job->job_terminated_seen) { + step->status = JOB_FAILED; + step->exitcode=1; + } + _dump_header(step->header); + printf("JOB_STEP %d %s %s ", + step->stepnum, + step->stepname, + step->nodes); + printf("%s %d %d %d %d ", + decode_status_int(step->status), + step->exitcode, + step->ntasks, + step->ncpus, + step->elapsed); + printf("%d %d %d %d %d %d ", + step->tot_cpu_sec, + step->tot_cpu_usec, + (int)step->rusage.ru_utime.tv_sec, + (int)step->rusage.ru_utime.tv_usec, + (int)step->rusage.ru_stime.tv_sec, + (int)step->rusage.ru_stime.tv_usec); + printf("%d %d %d %d %d %d %d %d %d " + "%d %d %d %d %d %d %d\n", + (int)step->rusage.ru_maxrss, + (int)step->rusage.ru_ixrss, + (int)step->rusage.ru_idrss, + (int)step->rusage.ru_isrss, + (int)step->rusage.ru_minflt, + (int)step->rusage.ru_majflt, + (int)step->rusage.ru_nswap, + (int)step->rusage.ru_inblock, + (int)step->rusage.ru_oublock, + (int)step->rusage.ru_msgsnd, + (int)step->rusage.ru_msgrcv, + (int)step->rusage.ru_nsignals, + (int)step->rusage.ru_nvcsw, + (int)step->rusage.ru_nivcsw, + step->vsize, + step->psize); + } + list_iterator_destroy(itr_step); + /* JOB_TERMINATED */ + if (params.opt_jobstep_list == NULL) { + _dump_header(job->header); + printf("JOB_TERMINATED %d ", + job->elapsed); + printf("%s %d %d %d %d ", + decode_status_int(job->status), + job->exitcode, + job->ntasks, + job->ncpus, + job->elapsed); + printf("%d %d %d %d %d %d ", + job->tot_cpu_sec, + job->tot_cpu_usec, + (int)job->rusage.ru_utime.tv_sec, + (int)job->rusage.ru_utime.tv_usec, + (int)job->rusage.ru_stime.tv_sec, + (int)job->rusage.ru_stime.tv_usec); + 
printf("%d %d %d %d %d %d ", + (int)job->rusage.ru_maxrss, + (int)job->rusage.ru_ixrss, + (int)job->rusage.ru_idrss, + (int)job->rusage.ru_isrss, + (int)job->rusage.ru_minflt, + (int)job->rusage.ru_majflt); + printf("%d %d %d %d %d %d %d %d %d %d\n", + (int)job->rusage.ru_nswap, + (int)job->rusage.ru_inblock, + (int)job->rusage.ru_oublock, + (int)job->rusage.ru_msgsnd, + (int)job->rusage.ru_msgrcv, + (int)job->rusage.ru_nsignals, + (int)job->rusage.ru_nvcsw, + (int)job->rusage.ru_nivcsw, + job->vsize, + job->psize); + } + } + list_iterator_destroy(itr); +} + +/* do_expire() -- purge expired data from the accounting log file + * + * What we're doing: + * 1. Open logfile.orig + * 2. stat logfile.orig + * - confirm that it's not a sym link + * - capture the ownership and permissions + * 3. scan logfile.orig for JOB_TERMINATED records with F_TIMESTAMP dates + * that precede the specified expiration date. Build exp_table as + * a list of expired jobs. + * 4. Open logfile.expired for append + * 5. Create logfile.new as ".new.<logfile>" (output with line buffering) + * 6. Re-scan logfile.orig, writing + * - Expired job records to logfile.expired + * - Other job records to logfile.new + * 7. Rename logfile.orig as ".old.<logfile>" + * 8. Rename logfile.new as "<logfile>" + * 9. Execute "scontrol reconfigure" which will cause slurmctld to + * start writing to logfile.new + * 10. fseek(ftell(logfile.orig)) to clear EOF + * 11. Copy any new records from logfile.orig to logfile.new + * 12. Close logfile.expired, logfile.new + * 13. 
Unlink .old.<logfile>
+ *
+ * The .old.<logfile> copy is only unlinked when every step that follows
+ * the renames succeeded (file_err == 0); on any later failure it is left
+ * in place so that no accounting data is lost.
+ */
+
+void do_expire(void)
+{
+	char	line[BUFFER_SIZE],
+		*f[EXPIRE_READ_LENGTH],
+		*fptr = NULL,
+		*logfile_name = NULL,
+		*old_logfile_name = NULL;
+	int	file_err=0,
+		new_file,
+		i;
+	expired_rec_t *exp_rec = NULL;
+	expired_rec_t *exp_rec2 = NULL;
+	List keep_list = list_create(_destroy_exp);
+	List exp_list = list_create(_destroy_exp);
+	List other_list = list_create(_destroy_exp);
+	pid_t	pid;
+	struct	stat statbuf;
+	mode_t	prot = 0600;
+	uid_t	uid;
+	gid_t	gid;
+	FILE	*expired_logfile = NULL,
+		*new_logfile = NULL;
+	FILE *fd = NULL;
+	int lc=0;
+	int rec_type = -1;
+	ListIterator itr = NULL;
+	ListIterator itr2 = NULL;
+	char *temp = NULL;
+
+	/* Figure out our expiration date */
+	time_t		expiry;
+	expiry = time(NULL)-params.opt_expire;
+	if (params.opt_verbose)
+		fprintf(stderr, "Purging jobs completed prior to %d\n",
+			(int)expiry);
+
+	/* Open the current or specified logfile, or quit */
+	fd = _open_log_file();
+	/* lstat(), not stat(): stat() follows symbolic links, so the
+	 * symbolic-link test below could never succeed with it. */
+	if (lstat(params.opt_filein, &statbuf)) {
+		perror("stat'ing logfile");
+		goto finished;
+	}
+	if (S_ISLNK(statbuf.st_mode)) {
+		fprintf(stderr, "%s is a symbolic link; --expire requires "
+			"a hard-linked file name\n", params.opt_filein);
+		goto finished;
+	}
+	if (!S_ISREG(statbuf.st_mode)) {
+		fprintf(stderr, "%s is not a regular file; --expire "
+			"only works on accounting log files\n",
+			params.opt_filein);
+		goto finished;
+	}
+	prot = statbuf.st_mode & 0777;
+	gid  = statbuf.st_gid;
+	uid  = statbuf.st_uid;
+	old_logfile_name = _prefix_filename(params.opt_filein, ".old.");
+	if (stat(old_logfile_name, &statbuf)) {
+		if (errno != ENOENT) {
+			fprintf(stderr,"Error checking for %s: ",
+				old_logfile_name);
+			perror("");
+			goto finished;
+		}
+	} else {
+		fprintf(stderr, "Warning! %s exists -- please remove "
+			"or rename it before proceeding",
+			old_logfile_name);
+		goto finished;
+	}
+
+	/* create our initial buffer */
+	while (fgets(line, BUFFER_SIZE, fd)) {
+		lc++;
+		fptr = line;	/* break the record into NULL-
+				   terminated strings */
+		exp_rec = xmalloc(sizeof(expired_rec_t));
+		exp_rec->line = xstrdup(line);
+
+		/* Clear the field table first so that fields missing from
+		 * a short line read as NULL rather than as stale pointers
+		 * from the previous line (or uninitialized memory). */
+		for (i = 0; i < EXPIRE_READ_LENGTH; i++)
+			f[i] = NULL;
+		for (i = 0; i < EXPIRE_READ_LENGTH; i++) {
+			f[i] = fptr;
+			fptr = strstr(fptr, " ");
+			if (fptr == NULL)
+				break;
+			else
+				*fptr++ = 0;
+		}
+
+		if (!f[F_JOB] || !f[F_PARTITION] || !f[F_JOB_START]
+		    || !f[F_TIMESTAMP] || !f[F_RECTYPE]) {
+			/* Too few fields to classify: carry the line over
+			 * to the new log untouched instead of parsing
+			 * garbage. */
+			fprintf(stderr,
+				"Warning: malformed record at line %d "
+				"kept as is\n", lc);
+			exp_rec->job = 0;
+			exp_rec->job_start = 0;
+			list_append(other_list, exp_rec);
+			continue;
+		}
+
+		exp_rec->job = atoi(f[F_JOB]);
+		exp_rec->job_start = atoi(f[F_JOB_START]);
+
+		rec_type = atoi(f[F_RECTYPE]);
+		/* Odd, but complain some other time */
+		if (rec_type == JOB_TERMINATED) {
+			if (expiry < atoi(f[F_TIMESTAMP])) {
+				list_append(keep_list, exp_rec);
+				continue;
+			}
+			if (list_count(selected_parts)) {
+				itr = list_iterator_create(selected_parts);
+				while((temp = list_next(itr)))
+					if(!strcasecmp(f[F_PARTITION], temp))
+						break;
+				list_iterator_destroy(itr);
+				if(!temp) {
+					list_append(keep_list, exp_rec);
+					continue;
+				}	/* no match */
+			}
+			list_append(exp_list, exp_rec);
+			if (params.opt_verbose > 2)
+				fprintf(stderr, "Selected: %8d %d\n",
+					exp_rec->job,
+					(int)exp_rec->job_start);
+		} else {
+			list_append(other_list, exp_rec);
+		}
+	}
+	if (!list_count(exp_list)) {
+		printf("No job records were purged.\n");
+		goto finished;
+	}
+	logfile_name = xmalloc(strlen(params.opt_filein)+sizeof(".expired"));
+	sprintf(logfile_name, "%s.expired", params.opt_filein);
+	new_file = stat(logfile_name, &statbuf);
+	if ((expired_logfile = fopen(logfile_name, "a"))==NULL) {
+		fprintf(stderr, "Error while opening %s",
+			logfile_name);
+		perror("");
+		xfree(logfile_name);
+		goto finished;
+	}
+
+	if (new_file) {  /* By default, the expired file looks like the log */
+		chmod(logfile_name, prot);
+		chown(logfile_name, uid, gid);
+	}
+	xfree(logfile_name);
+
+	logfile_name = _prefix_filename(params.opt_filein, ".new.");
+	if ((new_logfile = fopen(logfile_name, "w"))==NULL) {
+		fprintf(stderr, "Error while opening %s",
+			logfile_name);
+		perror("");
+		fclose(expired_logfile);
+		goto finished;
+	}
+	chmod(logfile_name, prot);     /* preserve file protection */
+	chown(logfile_name, uid, gid); /* and ownership */
+	/* Use line buffering to allow us to safely write
+	 * to the log file at the same time as slurmctld. */
+	if (setvbuf(new_logfile, NULL, _IOLBF, 0)) {
+		perror("setvbuf()");
+		fclose(expired_logfile);
+		goto finished2;
+	}
+
+	list_sort(exp_list, (ListCmpF) _cmp_jrec);
+	list_sort(keep_list, (ListCmpF) _cmp_jrec);
+
+	if (params.opt_verbose > 2) {
+		fprintf(stderr, "--- contents of exp_list ---");
+		itr = list_iterator_create(exp_list);
+		i = 0;		/* five job numbers per output line */
+		while((exp_rec = list_next(itr))) {
+			if (!(i%5))
+				fprintf(stderr, "\n");
+			else
+				fprintf(stderr, "\t");
+			fprintf(stderr, "%d", exp_rec->job);
+			i++;
+		}
+		fprintf(stderr, "\n---- end of exp_list ---\n");
+		list_iterator_destroy(itr);
+	}
+	/* write the expired file */
+	itr = list_iterator_create(exp_list);
+	while((exp_rec = list_next(itr))) {
+		itr2 = list_iterator_create(other_list);
+		while((exp_rec2 = list_next(itr2))) {
+			if((exp_rec2->job != exp_rec->job)
+			   || (exp_rec2->job_start != exp_rec->job_start))
+				continue;
+			if (fputs(exp_rec2->line, expired_logfile)<0) {
+				perror("writing expired_logfile");
+				list_iterator_destroy(itr2);
+				list_iterator_destroy(itr);
+				fclose(expired_logfile);
+				goto finished2;
+			}
+			list_remove(itr2);
+			_destroy_exp(exp_rec2);
+		}
+		list_iterator_destroy(itr2);
+		if (fputs(exp_rec->line, expired_logfile)<0) {
+			perror("writing expired_logfile");
+			list_iterator_destroy(itr);
+			fclose(expired_logfile);
+			goto finished2;
+		}
+	}
+	list_iterator_destroy(itr);
+	fclose(expired_logfile);
+
+	/* write the new log */
+	itr = list_iterator_create(keep_list);
+	while((exp_rec = list_next(itr))) {
+		itr2 = list_iterator_create(other_list);
+		while((exp_rec2 = list_next(itr2))) {
+			if(exp_rec2->job != exp_rec->job)
+				continue;
+			if (fputs(exp_rec2->line, new_logfile)<0) {
+				perror("writing keep_logfile");
+				list_iterator_destroy(itr2);
+				list_iterator_destroy(itr);
+				goto finished2;
+			}
+			list_remove(itr2);
+			_destroy_exp(exp_rec2);
+		}
+		list_iterator_destroy(itr2);
+		if (fputs(exp_rec->line, new_logfile)<0) {
+			perror("writing keep_logfile");
+			list_iterator_destroy(itr);
+			goto finished2;
+		}
+	}
+	list_iterator_destroy(itr);
+
+	/* Whatever is still in other_list belongs to jobs that have no
+	 * JOB_TERMINATED record yet (e.g. jobs that are still running) or
+	 * could not be classified.  These are "other job records" and
+	 * must reach the new log too, otherwise they would be lost when
+	 * the old log is unlinked. */
+	itr2 = list_iterator_create(other_list);
+	while((exp_rec2 = list_next(itr2))) {
+		if (fputs(exp_rec2->line, new_logfile)<0) {
+			perror("writing keep_logfile");
+			list_iterator_destroy(itr2);
+			goto finished2;
+		}
+	}
+	list_iterator_destroy(itr2);
+
+	if (rename(params.opt_filein, old_logfile_name)) {
+		perror("renaming logfile to .old.");
+		goto finished2;
+	}
+	if (rename(logfile_name, params.opt_filein)) {
+		perror("renaming new logfile");
+		/* undo it? */
+		if (!rename(old_logfile_name, params.opt_filein))
+			fprintf(stderr, "Please correct the problem "
+				"and try again");
+		else {
+			/* The .old. file is now the only copy of the
+			 * log: make sure the cleanup does not unlink it. */
+			file_err = 1;
+			fprintf(stderr, "SEVERE ERROR: Current accounting "
+				"log may have been renamed %s;\n"
+				"please rename it to \"%s\" if necessary, "
+				"and try again\n",
+				old_logfile_name, params.opt_filein);
+		}
+		goto finished2;
+	}
+	fflush(new_logfile);	/* Flush the buffers before forking */
+	fflush(fd);
+	pid = fork();
+	if (pid < 0) {
+		perror("forking scontrol");
+		file_err = 1;	/* late records may still be in .old. */
+		goto finished2;
+	}
+	if (pid == 0) {
+		execlp("scontrol", "scontrol", "reconfigure", (char *)NULL);
+		perror("attempting to run \"scontrol reconfigure\"");
+		/* Still in the child: leave at once.  Falling through to
+		 * the parent's cleanup would unlink the old log and keep
+		 * running sacct as a second process. */
+		_exit(127);
+	}
+	if (waitpid(pid, &i, 0) < 1) {
+		perror("forking scontrol");
+		file_err = 1;
+		goto finished2;
+	}
+	if (!WIFEXITED(i) || WEXITSTATUS(i)) {
+		file_err = 1;
+		fprintf(stderr, "Error: Attempt to execute \"scontrol "
+			"reconfigure\" failed. If SLURM is\n"
+			"running, please rename the file \"%s\"\n"
+			" to \"%s\" and try again.\n",
+			old_logfile_name, params.opt_filein);
+	}
+	if (fseek(fd, 0, SEEK_CUR)) {	/* clear EOF */
+		perror("looking for late-arriving records");
+		file_err = 1;	/* uncopied records remain in .old. */
+		goto finished2;
+	}
+	while (fgets(line, BUFFER_SIZE, fd)) {
+		if (fputs(line, new_logfile)<0) {
+			perror("writing final records");
+			file_err = 1;
+			goto finished2;
+		}
+	}
+
+	printf("%d jobs expired.\n", list_count(exp_list));
+finished2:
+	fclose(new_logfile);
+	if (!file_err)
+		unlink(old_logfile_name);
+finished:
+	fclose(fd);
+	list_destroy(exp_list);
+	list_destroy(keep_list);
+	list_destroy(other_list);
+	xfree(old_logfile_name);
+	xfree(logfile_name);
+}
+
+/* do_fdump() -- print one parsed log record as "name: value" lines
+ *
+ * IN f:  fields of the record; the first HEADER_LENGTH entries are the
+ *        common header, the rest depend on the record type.
+ * IN lc: number of the line in the log, used in the banner only.
+ *
+ * Records of an unknown type have their remaining fields printed by
+ * index until a NULL entry of f is reached.
+ */
+void do_fdump(char* f[], int lc)
+{
+	int	i=0, j=0;
+	char **type;
+	char    *header[] = {"job",       /* F_JOB */
+			     "partition", /* F_PARTITION */
+			     "job_start", /* F_JOB_START */
+			     "timestamp", /* F_TIMESTAMP */
+			     "uid",	  /* F_UID */
+			     "gid",	  /* F_GID */
+			     "reserved-1",/* F_RESERVED1 */
+			     "reserved-2",/* F_RESERVED1 */
+			     "recordType",/* F_RECTYPE */
+			     NULL};
+
+	char	*start[] = {"jobName",	 /* F_JOBNAME */
+			    "TrackSteps", /* F_TRACK_STEPS */
+			    "priority",	 /* F_PRIORITY */
+			    "ncpus",	 /* F_NCPUS */
+			    "nodeList", /* F_NODES */
+			    NULL};
+
+	char	*step[] = {"jobStep",	 /* F_JOBSTEP */
+			   "status",	 /* F_STATUS */
+			   "exitcode",	 /* F_EXITCODE */
+			   "ntasks",	 /* F_NTASKS */
+			   "ncpus",	 /* F_STEPNCPUS */
+			   "elapsed",	 /* F_ELAPSED */
+			   "cpu_sec",	 /* F_CPU_SEC */
+			   "cpu_usec",	 /* F_CPU_USEC */
+			   "user_sec",	 /* F_USER_SEC */
+			   "user_usec",	 /* F_USER_USEC */
+			   "sys_sec",	 /* F_SYS_SEC */
+			   "sys_usec",	 /* F_SYS_USEC */
+			   "rss",	 /* F_RSS */
+			   "ixrss",	 /* F_IXRSS */
+			   "idrss",	 /* F_IDRSS */
+			   "isrss",	 /* F_ISRSS */
+			   "minflt",	 /* F_MINFLT */
+			   "majflt",	 /* F_MAJFLT */
+			   "nswap",	 /* F_NSWAP */
+			   "inblocks",	 /* F_INBLOCKS */
+			   "oublocks",	 /* F_OUBLOCKS */
+			   "msgsnd",	 /* F_MSGSND */
+			   "msgrcv",	 /* F_MSGRCV */
+			   "nsignals",	 /* F_NSIGNALS */
+			   "nvcsw",	 /* F_NVCSW */
+			   "nivcsw",	 /* F_NIVCSW */
+			   "vsize",	 /* F_VSIZE */
+			   "psize",	 /* F_PSIZE */
+			   "StepName",   /* F_STEPNAME */
+			   "StepNodes",  /* F_STEPNODES */
+			   NULL};
+
+	char	*suspend[] = {"Suspend/Run time", /* F_TOT_ELAPSED */
+			      "status",	 /* F_STATUS */
+			      NULL};
+
+	char	*term[] = {"totElapsed", /* F_TOT_ELAPSED */
+			   "status",	 /* F_STATUS */
+			   NULL};
+
+	i = atoi(f[F_RECTYPE]);
+	printf("\n------- Line %d %s -------\n", lc, _convert_type(i));
+
+	for(j=0; j < HEADER_LENGTH; j++)
+		printf("%12s: %s\n", header[j], f[j]);
+	switch(i) {
+	case JOB_START:
+		type = start;
+		j = JOB_START_LENGTH;
+		break;
+	case JOB_STEP:
+		type = step;
+		j = JOB_STEP_LENGTH;
+		break;
+	case JOB_SUSPEND:
+		type = suspend;
+		j = JOB_TERM_LENGTH;
+		/* without this break the suspend record fell through
+		 * and was labelled with the JOB_TERMINATED names */
+		break;
+	case JOB_TERMINATED:
+		type = term;
+		j = JOB_TERM_LENGTH;
+		break;
+	default:
+		while(f[j]) {
+			printf("      Field[%02d]: %s\n", j, f[j]);
+			j++;
+		}
+		return;
+	}
+
+	for(i=HEADER_LENGTH; i < j; i++)
+		printf("%12s: %s\n", type[i-HEADER_LENGTH], f[i]);
+}
+
+/* do_help() -- print the help text selected by params.opt_help
+ * (1: general help, 2: field list, 3: usage); any other value is
+ * reported on stderr as an internal error.
+ */
+void do_help(void)
+{
+	switch (params.opt_help) {
+	case 1:
+		_help_msg();
+		break;
+	case 2:
+		_help_fields_msg();
+		break;
+	case 3:
+		_usage();
+		break;
+	default:
+		fprintf(stderr, "sacct bug: params.opt_help=%d\n",
+			params.opt_help);
+	}
+}
+
+/* do_list() -- List the assembled data
+ *
+ * In:	Nothing explicit.
+ * Out:	void.
+ *
+ * At this point, we have already selected the desired data,
+ * so we just need to print it for the user.
+ */
+void do_list(void)
+{
+	int	do_jobs=1,
+		do_jobsteps=1;
+	int	status;
+
+	ListIterator itr = NULL;
+	ListIterator itr_step = NULL;
+	job_rec_t *job = NULL;
+	step_rec_t *step = NULL;
+
+	if (params.opt_total)
+		do_jobsteps = 0;
+	else if (params.opt_jobstep_list)
+		do_jobs = 0;
+	itr = list_iterator_create(jobs);
+	while((job = list_next(itr))) {
+		if (!params.opt_dup)
+			if (job->jobnum_superseded) {
+				if (params.opt_verbose > 1)
+					fprintf(stderr,
+						"Note: Skipping older"
+						" job %d dated %d\n",
+						job->header.jobnum,
+						(int)job->header.job_start);
+				continue;
+			}
+		if (!job->job_start_seen && job->job_step_seen) {
+			/* If we only saw JOB_TERMINATED, the job was
+			 * probably canceled. */
+			fprintf(stderr,
+				"Error: No JOB_START record for job %d\n",
+				job->header.jobnum);
+		}
+		if (params.opt_verbose > 1) {
+			if (!job->job_start_seen)
+				fprintf(stderr,
+					"Note: No JOB_START record for "
+					"job %d\n",
+					job->header.jobnum);
+			if (!job->job_step_seen)
+				fprintf(stderr,
+					"Note: No JOB_STEP record for "
+					"job %d\n",
+					job->header.jobnum);
+			if (!job->job_terminated_seen)
+				fprintf(stderr,
+					"Note: No JOB_TERMINATED record for "
+					"job %d\n",
+					job->header.jobnum);
+		}
+		if (params.opt_uid >= 0 && (job->header.uid != params.opt_uid))
+			continue;
+		if (params.opt_gid >= 0 && (job->header.gid != params.opt_gid))
+			continue;
+		if (do_jobs) {
+			if (params.opt_state_list) {
+				/* The status comes from the log file; never
+				 * index selected_status[] with a value
+				 * outside 0..STATUS_COUNT-1.  An unknown
+				 * status is treated as "not selected". */
+				status = (int)job->status;
+				if (status < 0 || status >= STATUS_COUNT
+				    || !selected_status[status])
+					continue;
+			}
+			print_fields(JOB, job);
+		}
+		if (do_jobsteps && job->track_steps) {
+			itr_step = list_iterator_create(job->steps);
+			while((step = list_next(itr_step))) {
+				/* a step still marked running after its
+				 * job terminated is reported as failed */
+				if (step->status == JOB_RUNNING
+				    && job->job_terminated_seen) {
+					step->status = JOB_FAILED;
+				}
+				if (params.opt_state_list) {
+					status = (int)step->status;
+					if (status < 0
+					    || status >= STATUS_COUNT
+					    || !selected_status[status])
+						continue;
+				}
+				print_fields(JOBSTEP, step);
+			}
+			list_iterator_destroy(itr_step);
+		}
+	}
+	list_iterator_destroy(itr);
+}
+
+/* sacct_init() -- create the global job list and the partition and
+ * step selection lists, and clear every entry of selected_status[].
+ */
+void sacct_init()
+{
+	int i=0;
+	jobs = list_create(destroy_job);
+	selected_parts = list_create(_destroy_parts);
+	selected_steps = list_create(_destroy_steps);
+	for(i=0; i<STATUS_COUNT; i++)
+		selected_status[i] = 0;
+}
+
+/* sacct_fini() -- destroy the lists created by sacct_init() */
+void sacct_fini()
+{
+	list_destroy(jobs);
+	list_destroy(selected_parts);
+	list_destroy(selected_steps);
+}
diff --git a/src/sacct/print.c b/src/sacct/print.c
new file mode 100644
index 00000000000..786bd3c0050
--- /dev/null
+++ b/src/sacct/print.c
@@ -0,0 +1,855 @@
+/*****************************************************************************\
+ *  print.c - print functions for sacct
+ *
+ *  $Id: print.c 7541 2006-03-18 01:44:58Z da $
+ *****************************************************************************
+ *  Copyright (C) 2006 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Danny Auble <da@llnl.gov>.
+ *  UCRL-CODE-217948.
+ *
+ *  This file is part of SLURM, a resource management program.
+ *  For details, see <http://www.llnl.gov/linux/slurm/>.
+ *
+ *  SLURM is free software; you can redistribute it and/or modify it under
+ *  the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *
+ *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ *  details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with SLURM; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
+\*****************************************************************************/
+
+#include "sacct.h"
+
+char *_decode_status(int status);
+char *_elapsed_time(long secs, long usecs);
+
+/*
+ * _elapsed_time() -- format a time span as [days-][hh:][mm:]ss.hh
+ *
+ * IN secs:  whole seconds.
+ * IN usecs: microseconds to add to secs.
+ * RET pointer to a static buffer holding the text; it is overwritten by
+ *     the next call, so use or copy the result before calling again.
+ *
+ * The value is rounded to the nearest 1/100 second.  Both the printed
+ * hundredths and the carry into the seconds are taken from the rounded
+ * microsecond count, so e.g. 995000 usec adds one second and prints ".00".
+ */
+char *_elapsed_time(long secs, long usecs)
+{
+	int	days, hours, minutes, seconds;
+	long	hundredths;
+	char	daybuf[10],
+		hourbuf[4],
+		minbuf[4];
+	static char outbuf[20];  /* this holds LOTS of time! */
+	ldiv_t	res;
+
+	daybuf[0] = 0;
+	hourbuf[0] = 0;
+	minbuf[0] = 0;
+
+	res = ldiv(usecs+5000, 1000000L);	/* round up the usecs, then */
+	hundredths = res.rem / 10000;		/* truncate to .00's */
+
+	res = ldiv(secs+res.quot, 60L*60L*24L);	/* 1 day is 24 hours of 60
+						   minutes of 60 seconds */
+	days = (int)res.quot;
+	res = ldiv(res.rem, 60L*60L);
+	hours = (int)res.quot;
+	res = ldiv(res.rem, 60L);
+	minutes = (int)res.quot;
+	seconds = (int)res.rem;
+	if (days) {
+		snprintf(daybuf, sizeof(daybuf), "%d-", days);
+		snprintf(hourbuf, sizeof(hourbuf), "%02d:", hours);
+	} else if (hours)
+		snprintf(hourbuf, sizeof(hourbuf), "%2d:", hours);
+	if (days || hours)
+		snprintf(minbuf, sizeof(minbuf), "%02d:", minutes);
+	else if (minutes)
+		snprintf(minbuf, sizeof(minbuf), "%2d:", minutes);
+	if (days || hours || minutes)
+		snprintf(outbuf, sizeof(outbuf), "%s%s%s%02d.%02ld",
+			 daybuf, hourbuf, minbuf, seconds, hundredths);
+	else
+		snprintf(outbuf, sizeof(outbuf), "%2d.%02ld", seconds,
+			 hundredths);
+	return(outbuf);
+}
+
+/*
+ * print_fields() -- print one output line: every selected field, in
+ * printfields[] order, separated by single spaces and ended by a newline.
+ */
+void print_fields(type_t type, void *object)
+{
+	int f, pf;
+	for (f=0; f<nprintfields; f++) {
+		pf = printfields[f];
+		if (f)
+			printf(" ");
+		(fields[pf].print_routine)(type, object);
+	}
+	printf("\n");
+}
+
+/* Field-specific print routines
+ *
+ * Every routine below prints one column and takes the same arguments:
+ *   type   HEADLINE prints the column title, UNDERSCORE prints the rule
+ *          under the title, JOB and JOBSTEP print the value of a record.
+ *   object read as a job_rec_t for JOB and as a step_rec_t for JOBSTEP;
+ *          it is not dereferenced for HEADLINE and UNDERSCORE.
+ *
+ * The record types are declared in sacct.h.  Where a field is printed
+ * with a conversion whose expected type may differ from the field's
+ * declared type, the argument is cast explicitly so that the argument
+ * always matches the conversion.
+ */
+
+void print_cpu(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%15s", "Cpu");
+		break;
+	case UNDERSCORE:
+		printf("%15s", "---------------");
+		break;
+	case JOB:
+		printf("%15s",
+		       _elapsed_time(job->tot_cpu_sec,
+				     job->tot_cpu_usec));
+		break;
+	case JOBSTEP:
+		printf("%15s",
+		       _elapsed_time(step->tot_cpu_sec,
+				     step->tot_cpu_usec));
+		break;
+	}
+}
+
+void print_elapsed(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%15s", "Elapsed");
+		break;
+	case UNDERSCORE:
+		printf("%15s", "---------------");
+		break;
+	case JOB:
+		printf("%15s", _elapsed_time(job->elapsed,0));
+		break;
+	case JOBSTEP:
+		printf("%15s", _elapsed_time(step->elapsed,0));
+		break;
+	}
+}
+
+void print_exitcode(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%8s", "ExitCode");
+		break;
+	case UNDERSCORE:
+		printf("%8s", "--------");
+		break;
+	case JOB:
+		printf("%8d", job->exitcode);
+		break;
+	case JOBSTEP:
+		printf("%8d", step->exitcode);
+		break;
+	}
+}
+
+void print_gid(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%5s", "Gid");
+		break;
+	case UNDERSCORE:
+		printf("%5s", "-----");
+		break;
+	case JOB:
+		printf("%5d", job->header.gid);
+		break;
+	case JOBSTEP:
+		/* was "s%5d": the stray 's' shifted the whole row */
+		printf("%5d", step->header.gid);
+		break;
+	}
+}
+
+void print_group(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+	int gid = -1;
+	char	*tmp="(unknown)";
+	struct	group *gr = NULL;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%-9s", "Group");
+		break;
+	case UNDERSCORE:
+		printf("%-9s", "---------");
+		break;
+	case JOB:
+		gid = job->header.gid;
+		break;
+	case JOBSTEP:
+		gid = step->header.gid;
+		break;
+	}
+	/* only JOB and JOBSTEP set gid; print the group name, or
+	 * "(unknown)" when the gid has no group entry */
+	if(gid != -1) {
+		if ((gr=getgrgid(gid)))
+			tmp=gr->gr_name;
+		printf("%-9s", tmp);
+	}
+}
+
+void print_idrss(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%8s", "Idrss");
+		break;
+	case UNDERSCORE:
+		printf("%8s", "------");
+		break;
+	case JOB:
+		printf("%8ld", job->rusage.ru_idrss);
+		break;
+	case JOBSTEP:
+		printf("%8ld", step->rusage.ru_idrss);
+		break;
+	}
+}
+
+void print_inblocks(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%9s", "Inblocks");
+		break;
+	case UNDERSCORE:
+		printf("%9s", "---------");
+		break;
+	case JOB:
+		printf("%9ld", job->rusage.ru_inblock);
+		break;
+	case JOBSTEP:
+		printf("%9ld", step->rusage.ru_inblock);
+		break;
+	}
+}
+
+void print_isrss(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%8s", "Isrss");
+		break;
+	case UNDERSCORE:
+		printf("%8s", "------");
+		break;
+	case JOB:
+		printf("%8ld", job->rusage.ru_isrss);
+		break;
+	case JOBSTEP:
+		printf("%8ld", step->rusage.ru_isrss);
+		break;
+	}
+
+}
+
+void print_ixrss(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%8s", "Ixrss");
+		break;
+	case UNDERSCORE:
+		printf("%8s", "------");
+		break;
+	case JOB:
+		printf("%8ld", job->rusage.ru_ixrss);
+		break;
+	case JOBSTEP:
+		printf("%8ld", step->rusage.ru_ixrss);
+		break;
+	}
+
+}
+
+void print_job(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%8s", "Job");
+		break;
+	case UNDERSCORE:
+		printf("%8s", "--------");
+		break;
+	case JOB:
+		printf("%8ld", (long)job->header.jobnum);
+		break;
+	case JOBSTEP:
+		printf("%8ld", (long)step->header.jobnum);
+		break;
+	}
+}
+
+void print_name(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%-18s", "Jobname");
+		break;
+	case UNDERSCORE:
+		printf("%-18s", "------------------");
+		break;
+	case JOB:
+		/* names that do not fit are cut and marked with "..." */
+		if(strlen(job->jobname)<19)
+			printf("%-18s", job->jobname);
+		else
+			printf("%-15.15s...", job->jobname);
+
+		break;
+	case JOBSTEP:
+		if(strlen(step->stepname)<19)
+			printf("%-18s", step->stepname);
+		else
+			printf("%-15.15s...", step->stepname);
+		break;
+	}
+}
+
+void print_step(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+	char	outbuf[10];
+
+	switch(type) {
+	case HEADLINE:
+		printf("%-10s", "JobID");
+		break;
+	case UNDERSCORE:
+		printf("%-10s", "----------");
+		break;
+	case JOB:
+		printf("%-10d", (int)job->header.jobnum);
+		break;
+	case JOBSTEP:
+		/* "<job>.<step>", truncated to what fits in outbuf */
+		snprintf(outbuf, sizeof(outbuf), "%ld.%ld",
+			 (long)step->header.jobnum,
+			 (long)step->stepnum);
+		printf("%-10s", outbuf);
+		break;
+	}
+
+}
+
+void print_majflt(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%8s", "Majflt");
+		break;
+	case UNDERSCORE:
+		printf("%8s", "------");
+		break;
+	case JOB:
+		printf("%8ld", job->rusage.ru_majflt);
+		break;
+	case JOBSTEP:
+		printf("%8ld", step->rusage.ru_majflt);
+		break;
+	}
+}
+
+void print_minflt(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%8s", "Minflt");
+		break;
+	case UNDERSCORE:
+		printf("%8s", "------");
+		break;
+	case JOB:
+		printf("%8ld", job->rusage.ru_minflt);
+		break;
+	case JOBSTEP:
+		printf("%8ld", step->rusage.ru_minflt);
+		break;
+	}
+}
+
+void print_msgrcv(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%9s", "Msgrcv");
+		break;
+	case UNDERSCORE:
+		printf("%9s", "---------");
+		break;
+	case JOB:
+		printf("%9ld", job->rusage.ru_msgrcv);
+		break;
+	case JOBSTEP:
+		printf("%9ld", step->rusage.ru_msgrcv);
+		break;
+	}
+}
+
+void print_msgsnd(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%9s", "Msgsnd");
+		break;
+	case UNDERSCORE:
+		printf("%9s", "---------");
+		break;
+	case JOB:
+		printf("%9ld", job->rusage.ru_msgsnd);
+		break;
+	case JOBSTEP:
+		printf("%9ld", step->rusage.ru_msgsnd);
+		break;
+	}
+}
+
+void print_ncpus(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%7s", "Ncpus");
+		break;
+	case UNDERSCORE:
+		printf("%7s", "-------");
+		break;
+	case JOB:
+		printf("%7ld", (long)job->ncpus);
+		break;
+	case JOBSTEP:
+		printf("%7ld", (long)step->ncpus);
+		break;
+	}
+}
+
+void print_nivcsw(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%9s", "Nivcsw");
+		break;
+	case UNDERSCORE:
+		printf("%9s", "---------");
+		break;
+	case JOB:
+		printf("%9ld", job->rusage.ru_nivcsw);
+		break;
+	case JOBSTEP:
+		printf("%9ld", step->rusage.ru_nivcsw);
+		break;
+	}
+}
+
+void print_nodes(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%-30s", "Nodes");
+		break;
+	case UNDERSCORE:
+		printf("%-30s", "------------------------------");
+		break;
+	case JOB:
+		printf("%-30s", job->nodes);
+		break;
+	case JOBSTEP:
+		/* the column is left blank for job steps */
+		printf("%-30s", " ");
+		break;
+	}
+}
+
+void print_nsignals(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%9s", "Nsignals");
+		break;
+	case UNDERSCORE:
+		printf("%9s", "---------");
+		break;
+	case JOB:
+		printf("%9ld", job->rusage.ru_nsignals);
+		break;
+	case JOBSTEP:
+		printf("%9ld", step->rusage.ru_nsignals);
+		break;
+	}
+}
+
+void print_nswap(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%8s", "Nswap");
+		break;
+	case UNDERSCORE:
+		printf("%8s", "------");
+		break;
+	case JOB:
+		printf("%8ld", job->rusage.ru_nswap);
+		break;
+	case JOBSTEP:
+		printf("%8ld", step->rusage.ru_nswap);
+		break;
+	}
+}
+
+void print_ntasks(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%7s", "Ntasks");
+		break;
+	case UNDERSCORE:
+		printf("%7s", "-------");
+		break;
+	case JOB:
+		printf("%7ld", (long)job->ntasks);
+		break;
+	case JOBSTEP:
+		printf("%7ld", (long)step->ntasks);
+		break;
+	}
+}
+
+void print_nvcsw(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%9s", "Nvcsw");
+		break;
+	case UNDERSCORE:
+		printf("%9s", "---------");
+		break;
+	case JOB:
+		printf("%9ld", job->rusage.ru_nvcsw);
+		break;
+	case JOBSTEP:
+		printf("%9ld", step->rusage.ru_nvcsw);
+		break;
+	}
+}
+
+void print_outblocks(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%9s", "Outblocks");
+		break;
+	case UNDERSCORE:
+		printf("%9s", "---------");
+		break;
+	case JOB:
+		printf("%9ld", job->rusage.ru_oublock);
+		break;
+	case JOBSTEP:
+		printf("%9ld", step->rusage.ru_oublock);
+		break;
+	}
+}
+
+void print_partition(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%-10s", "Partition");
+		break;
+	case UNDERSCORE:
+		printf("%-10s", "----------");
+		break;
+	case JOB:
+		/* names that do not fit are cut and marked with "..." */
+		if(strlen(job->header.partition)<11)
+			printf("%-10s", job->header.partition);
+		else
+			printf("%-7.7s...", job->header.partition);
+
+		break;
+	case JOBSTEP:
+		if(strlen(step->header.partition)<11)
+			printf("%-10s", step->header.partition);
+		else
+			printf("%-7.7s...", step->header.partition);
+
+		break;
+	}
+}
+
+void print_psize(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%10s", "Psize");
+		break;
+	case UNDERSCORE:
+		printf("%10s", "------");
+		break;
+	case JOB:
+		/* was "%10.ld": the empty precision printed a zero
+		 * value as blanks */
+		printf("%10ld", (long)job->psize);
+		break;
+	case JOBSTEP:
+		printf("%10ld", (long)step->psize);
+		break;
+	}
+}
+
+void print_rss(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%8s", "Rss");
+		break;
+	case UNDERSCORE:
+		printf("%8s", "------");
+		break;
+	case JOB:
+		printf("%8ld", job->rusage.ru_maxrss);
+		break;
+	case JOBSTEP:
+		printf("%8ld", step->rusage.ru_maxrss);
+		break;
+	}
+}
+
+void print_status(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%-10s", "Status");
+		break;
+	case UNDERSCORE:
+		printf("%-10s", "----------");
+		break;
+	case JOB:
+		printf("%-10s", decode_status_int(job->status));
+		break;
+	case JOBSTEP:
+		printf("%-10s", decode_status_int(step->status));
+		break;
+	}
+}
+
+void print_submitted(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%-14s", "Submitted");
+		break;
+	case UNDERSCORE:
+		printf("%-14s", "--------------");
+		break;
+	case JOB:
+		printf("%-14d", (int)job->header.job_start);
+		break;
+	case JOBSTEP:
+		printf("%-14d", (int)step->header.job_start);
+		break;
+	}
+}
+
+void print_systemcpu(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%15s", "SystemCpu");
+		break;
+	case UNDERSCORE:
+		printf("%15s", "---------------");
+		break;
+	case JOB:
+		printf("%15s",
+		       _elapsed_time(job->rusage.ru_stime.tv_sec,
+				     job->rusage.ru_stime.tv_usec));
+		break;
+	case JOBSTEP:
+		printf("%15s",
+		       _elapsed_time(step->rusage.ru_stime.tv_sec,
+				     step->rusage.ru_stime.tv_usec));
+		break;
+	}
+
+}
+
+void print_uid(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%5s", "Uid");
+		break;
+	case UNDERSCORE:
+		printf("%5s", "-----");
+		break;
+	case JOB:
+		printf("%5d", job->header.uid);
+		break;
+	case JOBSTEP:
+		printf("%5d", step->header.uid);
+		break;
+	}
+}
+
+void print_user(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+	int uid = -1;
+	char	*tmp="(unknown)";
+	struct	passwd *pw = NULL;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%-9s", "User");
+		break;
+	case UNDERSCORE:
+		printf("%-9s", "---------");
+		break;
+	case JOB:
+		uid = job->header.uid;
+		break;
+	case JOBSTEP:
+		uid = step->header.uid;
+		break;
+	}
+	/* only JOB and JOBSTEP set uid; print the user name, or
+	 * "(unknown)" when the uid has no passwd entry */
+	if(uid != -1) {
+		if ((pw=getpwuid(uid)))
+			tmp=pw->pw_name;
+		printf("%-9s", tmp);
+	}
+}
+
+void print_usercpu(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%15s", "UserCpu");
+		break;
+	case UNDERSCORE:
+		printf("%15s", "---------------");
+		break;
+	case JOB:
+		printf("%15s",
+		       _elapsed_time(job->rusage.ru_utime.tv_sec,
+				     job->rusage.ru_utime.tv_usec));
+		break;
+	case JOBSTEP:
+		printf("%15s",
+		       _elapsed_time(step->rusage.ru_utime.tv_sec,
+				     step->rusage.ru_utime.tv_usec));
+		break;
+	}
+
+}
+
+void print_vsize(type_t type, void *object)
+{
+	job_rec_t *job = (job_rec_t *)object;
+	step_rec_t *step = (step_rec_t *)object;
+
+	switch(type) {
+	case HEADLINE:
+		printf("%10s", "Vsize");
+		break;
+	case UNDERSCORE:
+		printf("%10s", "------");
+		break;
+	case JOB:
+		printf("%10ld", (long)job->vsize);
+		break;
+	case JOBSTEP:
+		printf("%10ld", (long)step->vsize);
+		break;
+	}
+}
+
+
diff --git a/src/sacct/process.c b/src/sacct/process.c
new file mode 100644
index 00000000000..a1997816fab
--- /dev/null
+++ b/src/sacct/process.c
@@ -0,0 +1,442 @@
+/*****************************************************************************\
+ *  process.c - process functions for sacct
+ *
+ *  $Id: process.c 7541 2006-03-18 01:44:58Z da $
+ *****************************************************************************
+ *  Copyright (C) 2006 The Regents of the University of California.
+ *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ *  Written by Danny Auble <da@llnl.gov>.
+ *  UCRL-CODE-217948.
+ *
+ *  This file is part of SLURM, a resource management program.
+ *  For details, see <http://www.llnl.gov/linux/slurm/>.
+ *
+ *  SLURM is free software; you can redistribute it and/or modify it under
+ *  the terms of the GNU General Public License as published by the Free
+ *  Software Foundation; either version 2 of the License, or (at your option)
+ *  any later version.
+ *
+ *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ *  details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with SLURM; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
+\*****************************************************************************/
+
+#include "sacct.h"
+
+job_rec_t *_find_job_record(acct_header_t header);
+step_rec_t *_find_step_record(job_rec_t *job, long jobstep);
+job_rec_t *_init_job_rec(acct_header_t header, int lc);
+int _parse_line(char *f[], void **data);
+
+/*
+ * _find_job_record() -- look up the job a record header belongs to
+ *
+ * IN header: header of the record being processed.
+ * RET the entry of the global jobs list with the same job number and
+ *     the same job_start, or NULL when there is none.
+ *
+ * Side effects: an entry whose job_start is BATCH_JOB_TIMESTAMP adopts
+ * header.job_start and is returned; every entry with the same job
+ * number but a different job_start that is visited before a match is
+ * flagged as superseded.
+ */
+job_rec_t *_find_job_record(acct_header_t header)
+{
+	job_rec_t *job = NULL;
+	ListIterator itr = list_iterator_create(jobs);
+
+	while((job = (job_rec_t *)list_next(itr)) != NULL) {
+		if (job->header.jobnum == header.jobnum) {
+			if(job->header.job_start == BATCH_JOB_TIMESTAMP) {
+				job->header.job_start = header.job_start;
+				break;
+			}
+
+			if(job->header.job_start == header.job_start)
+				break;
+			else {
+				/* If we're looking for a later
+				 * record with this job number, we
+				 * know that this one is an older,
+				 * duplicate record.
+				 *   We assume that the newer record
+				 * will be created if it doesn't
+				 * already exist. */
+				job->jobnum_superseded = 1;
+			}
+		}
+	}
+	list_iterator_destroy(itr);
+	return job;
+}
+
+/*
+ * _find_step_record() -- return the step of job whose number is
+ * stepnum, or NULL when the job has no such step.
+ */
+step_rec_t *_find_step_record(job_rec_t *job, long stepnum)
+{
+	step_rec_t *step = NULL;
+	ListIterator itr = NULL;
+
+	if(!list_count(job->steps))
+		return step;
+
+	itr = list_iterator_create(job->steps);
+	while((step = (step_rec_t *)list_next(itr)) != NULL) {
+		if (step->stepnum == stepnum)
+			break;
+	}
+	list_iterator_destroy(itr);
+	return step;
+}
+
+/*
+ * _init_job_rec() -- allocate a job record for the given header
+ *
+ * IN header: identification copied into the record; the partition
+ *            string is duplicated.
+ * IN lc:     not used by this function.
+ * RET a new record with status JOB_PENDING, the name "(unknown)", all
+ *     "seen" flags and counters set to zero and an empty step list.
+ *     The record is not added to the jobs list here.
+ */
+job_rec_t *_init_job_rec(acct_header_t header, int lc)
+{
+	job_rec_t *job = xmalloc(sizeof(job_rec_t));
+
+	job->header.jobnum = header.jobnum;
+	job->header.partition = xstrdup(header.partition);
+	job->header.job_start = header.job_start;
+	job->header.timestamp = header.timestamp;
+	job->header.uid = header.uid;
+	job->header.gid = header.gid;
+	job->job_start_seen = 0;
+	job->job_step_seen = 0;
+	job->job_terminated_seen = 0;
+	job->jobnum_superseded = 0;
+	job->jobname = xstrdup("(unknown)");
+	job->status = JOB_PENDING;
+	job->tot_cpu_sec = 0;
+	job->tot_cpu_usec = 0;
+	/* plain assignments: "+= 0" on a freshly allocated record only
+	 * happened to work if the allocator cleared the memory */
+	job->rusage.ru_utime.tv_sec = 0;
+	job->rusage.ru_utime.tv_usec = 0;
+	job->rusage.ru_stime.tv_sec = 0;
+	job->rusage.ru_stime.tv_usec = 0;
+	job->rusage.ru_inblock = 0;
+	job->rusage.ru_oublock = 0;
+	job->rusage.ru_msgsnd = 0;
+	job->rusage.ru_msgrcv = 0;
+	job->rusage.ru_nsignals = 0;
+	job->rusage.ru_nvcsw = 0;
+	job->rusage.ru_nivcsw = 0;
+	job->rusage.ru_maxrss = 0;
+	job->rusage.ru_ixrss = 0;
+	job->rusage.ru_idrss = 0;
+	job->rusage.ru_isrss = 0;
+	job->rusage.ru_minflt = 0;
+	job->rusage.ru_majflt = 0;
+	job->rusage.ru_nswap = 0;
+	job->vsize = 0;
+	job->psize = 0;
+	job->exitcode = 0;
+	job->steps = list_create(destroy_step);
+	return job;
+}
+
+/*
+ * _parse_header() -- fill header from the common leading fields of a
+ * record.  The partition string is duplicated.  Always returns
+ * SLURM_SUCCESS.
+ */
+int _parse_header(char *f[], acct_header_t *header)
+{
+	header->jobnum = atoi(f[F_JOB]);
+	header->partition = xstrdup(f[F_PARTITION]);
+	header->job_start = atoi(f[F_JOB_START]);
+	header->timestamp = atoi(f[F_TIMESTAMP]);
+	header->uid = atoi(f[F_UID]);
+	header->gid = atoi(f[F_GID]);
+	return SLURM_SUCCESS;
+}
+
+/*
+ * _parse_line() -- convert the fields of one log record into a record
+ *
+ * IN  f:    fields of the record; f[F_RECTYPE] selects the layout.
+ * OUT data: set to a newly allocated job_rec_t (JOB_START, JOB_SUSPEND,
+ *           JOB_TERMINATED) or step_rec_t (JOB_STEP).  It is left
+ *           untouched for an unknown record type, which is reported on
+ *           stderr.
+ * RET always SLURM_SUCCESS.
+ */
+int _parse_line(char *f[], void **data)
+{
+	int i = atoi(f[F_RECTYPE]);
+	job_rec_t **job = (job_rec_t **)data;
+	step_rec_t **step = (step_rec_t **)data;
+
+	switch(i) {
+	case JOB_START:
+		*job = xmalloc(sizeof(job_rec_t));
+		_parse_header(f, &(*job)->header);
+		(*job)->jobname = xstrdup(f[F_JOBNAME]);
+		(*job)->track_steps = atoi(f[F_TRACK_STEPS]);
+		(*job)->priority = atoi(f[F_PRIORITY]);
+		(*job)->ncpus = atoi(f[F_NCPUS]);
+		(*job)->nodes = xstrdup(f[F_NODES]);
+		/* isspace() is only defined for unsigned char values */
+		for (i=0; (*job)->nodes[i]; i++) /* discard trailing <CR> */
+			if (isspace((unsigned char)(*job)->nodes[i]))
+				(*job)->nodes[i] = 0;
+		if (!strcmp((*job)->nodes, "(null)")) {
+			xfree((*job)->nodes);
+			(*job)->nodes = xstrdup("unknown");
+		}
+		break;
+	case JOB_STEP:
+		*step = xmalloc(sizeof(step_rec_t));
+		_parse_header(f, &(*step)->header);
+		(*step)->stepnum = atoi(f[F_JOBSTEP]);
+		(*step)->status = atoi(f[F_STATUS]);
+		(*step)->exitcode = atoi(f[F_EXITCODE]);
+		(*step)->ntasks = atoi(f[F_NTASKS]);
+		/* NOTE(review): F_NCPUS is also the index used for
+		 * JOB_START records above -- confirm against sacct.h
+		 * that it is the right index for step records too. */
+		(*step)->ncpus = atoi(f[F_NCPUS]);
+		(*step)->elapsed = atoi(f[F_ELAPSED]);
+		(*step)->tot_cpu_sec = atoi(f[F_CPU_SEC]);
+		(*step)->tot_cpu_usec = atoi(f[F_CPU_USEC]);
+		(*step)->rusage.ru_utime.tv_sec = atoi(f[F_USER_SEC]);
+		(*step)->rusage.ru_utime.tv_usec = atoi(f[F_USER_USEC]);
+		(*step)->rusage.ru_stime.tv_sec = atoi(f[F_SYS_SEC]);
+		(*step)->rusage.ru_stime.tv_usec = atoi(f[F_SYS_USEC]);
+		(*step)->rusage.ru_maxrss = atoi(f[F_RSS]);
+		(*step)->rusage.ru_ixrss = atoi(f[F_IXRSS]);
+		(*step)->rusage.ru_idrss = atoi(f[F_IDRSS]);
+		(*step)->rusage.ru_isrss = atoi(f[F_ISRSS]);
+		(*step)->rusage.ru_minflt = atoi(f[F_MINFLT]);
+		(*step)->rusage.ru_majflt = atoi(f[F_MAJFLT]);
+		(*step)->rusage.ru_nswap = atoi(f[F_NSWAP]);
+		(*step)->rusage.ru_inblock = atoi(f[F_INBLOCKS]);
+		(*step)->rusage.ru_oublock = atoi(f[F_OUBLOCKS]);
+		(*step)->rusage.ru_msgsnd = atoi(f[F_MSGSND]);
+		(*step)->rusage.ru_msgrcv = atoi(f[F_MSGRCV]);
+		(*step)->rusage.ru_nsignals = atoi(f[F_NSIGNALS]);
+		(*step)->rusage.ru_nvcsw = atoi(f[F_NVCSW]);
+		(*step)->rusage.ru_nivcsw = atoi(f[F_NIVCSW]);
+		(*step)->vsize = atoi(f[F_VSIZE]);
+		(*step)->psize = atoi(f[F_PSIZE]);
+		(*step)->stepname = xstrdup(f[F_STEPNAME]);
+		(*step)->nodes = xstrdup(f[F_STEPNODES]);
+		break;
+	case JOB_SUSPEND:
+	case JOB_TERMINATED:
+		*job = xmalloc(sizeof(job_rec_t));
+		_parse_header(f, &(*job)->header);
+		(*job)->elapsed = atoi(f[F_TOT_ELAPSED]);
+		(*job)->status = atoi(f[F_STATUS]);
+		break;
+	default:
+		/* a diagnostic, so it goes to stderr on its own line
+		 * instead of into the middle of the report on stdout */
+		fprintf(stderr, "UNKNOWN TYPE %d\n", i);
+		break;
+	}
+	return SLURM_SUCCESS;
+}
+
+/*
+ * process_start() -- handle a JOB_START record
+ *
+ * IN f:  fields of the record.
+ * IN lc: number of the line in the log, used in messages.
+ *
+ * A new job record is appended to the jobs list.  If a record for the
+ * same job already exists the line is reported on stderr, counted in
+ * input_error and ignored.
+ */
+void process_start(char *f[], int lc)
+{
+	job_rec_t *job = NULL;
+	job_rec_t *temp = NULL;
+
+	_parse_line(f, (void **)&temp);
+	job = _find_job_record(temp->header);
+	if (job) {	/* Hmmm... that's odd */
+		fprintf(stderr,
+			"Conflicting JOB_START for job %d at"
+			" line %d -- ignoring it\n",
+			job->header.jobnum, lc);
+		input_error++;
+		destroy_job(temp);
+		return;
+	}
+
+	job = _init_job_rec(temp->header, lc);
+	list_append(jobs, job);
+	job->job_start_seen = 1;
+	job->header.uid = temp->header.uid;
+	job->header.gid = temp->header.gid;
+	xfree(job->jobname);
+	job->jobname = xstrdup(temp->jobname);
+	job->priority = temp->priority;
+	job->track_steps = temp->track_steps;
+	job->ncpus = temp->ncpus;
+	xfree(job->nodes);
+	job->nodes = xstrdup(temp->nodes);
+	destroy_job(temp);
+}
+
+void process_step(char *f[], int lc)
+{
+	job_rec_t *job = NULL;
+
+	step_rec_t *step = NULL;
+	step_rec_t *temp = NULL;
+
+	_parse_line(f, (void **)&temp);
+
+	job = _find_job_record(temp->header);
+
+	if (temp->stepnum == -2) {
+		destroy_step(temp);
+		return;
+	}
+	if (!job) {	/* fake it for now */
+		job = _init_job_rec(temp->header, lc);
+		if ((params.opt_verbose > 1)
+		    && (params.opt_jobstep_list==NULL))
+			fprintf(stderr,
+				"Note: JOB_STEP record %d.%d preceded "
+				"JOB_START record at line %d\n",
+				temp->header.jobnum, temp->stepnum, lc);
+	}
+	if ((step = _find_step_record(job, temp->stepnum))) {
+
+		if (temp->status == JOB_RUNNING) {
+			destroy_step(temp);
+			return;/* if "R" record preceded by F or CD; unusual */
+		}
+		if (step->status != JOB_RUNNING) { /* if not JOB_RUNNING */
+			fprintf(stderr,
+				"Conflicting JOB_STEP record for "
+				"jobstep %d.%d at line %d "
+				"-- ignoring it\n",
+				step->header.jobnum,
+				step->stepnum, lc);
+			input_error++;
+
+			destroy_step(temp);
+			return;
+		}
+		step->status = temp->status;
+		step->exitcode = temp->exitcode;
+		step->ntasks = temp->ntasks;
+		step->ncpus = temp->ncpus;
+		step->elapsed = temp->elapsed;
+		step->tot_cpu_sec = temp->tot_cpu_sec;
+		step->tot_cpu_usec = temp->tot_cpu_usec;
+		memcpy(&step->rusage, &temp->rusage, sizeof(struct rusage));
+		step->vsize = temp->vsize;
+		step->psize = temp->psize;
+		xfree(step->stepname);
+
step->stepname = xstrdup(temp->stepname); + goto got_step; + } + step = temp; + temp = NULL; + list_append(job->steps, step); + + job->job_step_seen = 1; + job->ntasks += step->ntasks; + if(!strcmp(job->nodes, "(unknown)")) { + xfree(job->nodes); + job->nodes = xstrdup(step->nodes); + } + +got_step: + destroy_step(temp); + + if (job->job_terminated_seen == 0) { /* If the job is still running, + this is the most recent + status */ + if ( job->exitcode == 0 ) + job->exitcode = step->exitcode; + job->header.timestamp = step->header.timestamp; + job->status = JOB_RUNNING; + job->elapsed = time(NULL) - job->header.timestamp; + } + /* now aggregate the aggregatable */ + job->tot_cpu_sec += step->tot_cpu_sec; + job->tot_cpu_usec += step->tot_cpu_usec; + job->rusage.ru_utime.tv_sec += step->rusage.ru_utime.tv_sec; + job->rusage.ru_utime.tv_usec += step->rusage.ru_utime.tv_usec; + job->rusage.ru_stime.tv_sec += step->rusage.ru_stime.tv_sec; + job->rusage.ru_stime.tv_usec += step->rusage.ru_stime.tv_usec; + job->rusage.ru_inblock += step->rusage.ru_inblock; + job->rusage.ru_oublock += step->rusage.ru_oublock; + job->rusage.ru_msgsnd += step->rusage.ru_msgsnd; + job->rusage.ru_msgrcv += step->rusage.ru_msgrcv; + job->rusage.ru_nsignals += step->rusage.ru_nsignals; + job->rusage.ru_nvcsw += step->rusage.ru_nvcsw; + job->rusage.ru_nivcsw += step->rusage.ru_nivcsw; + + /* and finally the maximums for any process */ + job->rusage.ru_maxrss = MAX(job->rusage.ru_maxrss, + step->rusage.ru_maxrss); + job->rusage.ru_ixrss = MAX(job->rusage.ru_ixrss, + step->rusage.ru_ixrss); + job->rusage.ru_idrss = MAX(job->rusage.ru_idrss, + step->rusage.ru_idrss); + job->rusage.ru_isrss = MAX(job->rusage.ru_isrss, + step->rusage.ru_isrss); + job->rusage.ru_minflt = MAX(job->rusage.ru_minflt, + step->rusage.ru_minflt); + job->rusage.ru_majflt = MAX(job->rusage.ru_majflt, + step->rusage.ru_majflt); + job->rusage.ru_nswap = MAX(job->rusage.ru_nswap, + step->rusage.ru_nswap); + job->psize = 
MAX(job->psize, step->psize); + job->vsize = MAX(job->vsize, step->vsize); + job->ncpus = MAX(job->ncpus, step->ncpus); +} + +void process_suspend(char *f[], int lc) +{ + job_rec_t *job = NULL; + job_rec_t *temp = NULL; + + _parse_line(f, (void **)&temp); + job = _find_job_record(temp->header); + if (!job) + job = _init_job_rec(temp->header, lc); + + if (job->status == JOB_SUSPENDED) + job->elapsed -= temp->elapsed; + + job->header.timestamp = temp->header.timestamp; + job->status = temp->status; + destroy_job(temp); +} + +void process_terminated(char *f[], int lc) +{ + job_rec_t *job = NULL; + job_rec_t *temp = NULL; + + _parse_line(f, (void **)&temp); + job = _find_job_record(temp->header); + if (!job) { /* fake it for now */ + job = _init_job_rec(temp->header, lc); + if (params.opt_verbose > 1) + fprintf(stderr, "Note: JOB_TERMINATED record for job " + "%d preceded " + "other job records at line %d\n", + temp->header.jobnum, lc); + } else if (job->job_terminated_seen) { + if (temp->status == JOB_NODE_FAIL) { + /* multiple node failures - extra TERMINATED records */ + if (params.opt_verbose > 1) + fprintf(stderr, + "Note: Duplicate JOB_TERMINATED " + "record (nf) for job %d at " + "line %d\n", + temp->header.jobnum, lc); + /* JOB_TERMINATED/NF records may be preceded + * by a JOB_TERMINATED/CA record; NF is much + * more interesting. 
+ */ + job->status = temp->status; + goto finished; + } + + fprintf(stderr, + "Conflicting JOB_TERMINATED record (%s) for " + "job %d at line %d -- ignoring it\n", + decode_status_int(temp->status), job, lc); + input_error++; + goto finished; + } + job->job_terminated_seen = 1; + job->elapsed = temp->elapsed; + job->header.timestamp = temp->header.timestamp; + job->status = temp->status; + if(list_count(job->steps) > 1) + job->track_steps = 1; +finished: + destroy_job(temp); +} + +void destroy_job(void *object) +{ + job_rec_t *job = (job_rec_t *)object; + if (job) { + if(job->steps) + list_destroy(job->steps); + xfree(job->header.partition); + xfree(job->jobname); + xfree(job->nodes); + xfree(job); + } +} + +void destroy_step(void *object) +{ + step_rec_t *step = (step_rec_t *)object; + if (step) { + xfree(step->header.partition); + xfree(step->stepname); + xfree(step->nodes); + xfree(step); + } +} diff --git a/src/sacct/sacct.h b/src/sacct/sacct.h new file mode 100644 index 00000000000..94b1c52ebf2 --- /dev/null +++ b/src/sacct/sacct.h @@ -0,0 +1,307 @@ +/*****************************************************************************\ + * sacct.h - header file for sacct + * + * $Id: sacct.h 7541 2006-03-18 01:44:58Z da $ + ***************************************************************************** + * Copyright (C) 2006 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Danny Auble <da@llnl.gov>. + * UCRL-CODE-217948. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +\*****************************************************************************/ +#ifndef _SACCT_H +#define _SACCT_H + +#include <ctype.h> +#include <errno.h> +#include <grp.h> +#include <pwd.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/wait.h> +#include <string.h> +#include <time.h> +#include <unistd.h> + +#include "src/common/getopt.h" +#include "src/common/slurm_protocol_api.h" +#include "src/common/xmalloc.h" +#include "src/common/xstring.h" +#include "src/common/list.h" + +#define ERROR 2 + +/* slurmd uses "(uint32_t) -2" to track data for batch allocations + * which have no logical jobsteps. 
*/ +#define BATCH_JOB_TIMESTAMP 0 + +#define BRIEF_FIELDS "jobstep,status,exitcode" +#define DEFAULT_FIELDS "jobstep,jobname,partition,ncpus,status,exitcode" +#define LONG_FIELDS "jobstep,jobname,partition,usercpu,systemcpu,minflt,majflt,ntasks,ncpus,elapsed,status,exitcode" + +#define BUFFER_SIZE 4096 +#define STATUS_COUNT 10 + +#define MAX_PRINTFIELDS 100 +#define EXPIRE_READ_LENGTH 10 +#define MAX_RECORD_FIELDS 100 + +#define SECONDS_IN_MINUTE 60 +#define SECONDS_IN_HOUR (60*SECONDS_IN_MINUTE) +#define SECONDS_IN_DAY (24*SECONDS_IN_HOUR) + +#define TIMESTAMP_LENGTH 15 + +#ifndef SLURM_CONFIG_FILE +#define SLURM_CONFIG_FILE "sacct was built with no default slurm.conf path" +#endif + +/* Map field names to positions */ + +/* Fields common to all records */ +enum { F_JOB = 0, + F_PARTITION, + F_JOB_START, + F_TIMESTAMP, + F_UID, + F_GID, + F_RESERVED1, + F_RESERVED2, + F_RECTYPE, + HEADER_LENGTH +}; + +/* JOB_START fields */ +enum { F_JOBNAME = HEADER_LENGTH, + F_TRACK_STEPS, + F_PRIORITY, + F_NCPUS, + F_NODES, + JOB_START_LENGTH +}; + +/* JOB_STEP fields */ +enum { F_JOBSTEP = HEADER_LENGTH, + F_STATUS, + F_EXITCODE, + F_NTASKS, + F_STEPNCPUS, + F_ELAPSED, + F_CPU_SEC, + F_CPU_USEC, + F_USER_SEC, + F_USER_USEC, + F_SYS_SEC, + F_SYS_USEC, + F_RSS, + F_IXRSS, + F_IDRSS, + F_ISRSS, + F_MINFLT, + F_MAJFLT, + F_NSWAP, + F_INBLOCKS, + F_OUBLOCKS, + F_MSGSND, + F_MSGRCV, + F_NSIGNALS, + F_NVCSW, + F_NIVCSW, + F_VSIZE, + F_PSIZE, + F_STEPNAME, + F_STEPNODES, + JOB_STEP_LENGTH +}; + +/* JOB_TERM / JOB_SUSPEND fields */ +enum { F_TOT_ELAPSED = HEADER_LENGTH, + F_TERM_STATUS, + JOB_TERM_LENGTH +}; + +/* On output, use fields 12-37 from JOB_STEP */ + +typedef enum { HEADLINE, + UNDERSCORE, + JOB, + JOBSTEP +} type_t; + +enum { CANCELLED, + COMPLETED, + COMPLETING, + FAILED, + NODEFAILED, + PENDING, + RUNNING, + TIMEDOUT +}; + +typedef struct header { + uint32_t jobnum; + char *partition; + time_t job_start; + time_t timestamp; + uint32_t uid; + uint32_t gid; + uint16_t 
rec_type; +} acct_header_t; + +typedef struct job_rec { + uint32_t job_start_seen, /* useful flags */ + job_step_seen, + job_terminated_seen, + jobnum_superseded; /* older jobnum was reused */ + acct_header_t header; + char *nodes; + char *jobname; + uint16_t track_steps; + int32_t priority; + uint32_t ncpus; + uint32_t ntasks; + int32_t status; + int32_t exitcode; + uint32_t elapsed; + uint32_t tot_cpu_sec, tot_cpu_usec; + uint32_t vsize, psize; + struct rusage rusage; + List steps; +} job_rec_t; + +typedef struct step_rec { + acct_header_t header; + uint32_t stepnum; /* job's step number */ + uint32_t next; /* linked list of job steps */ + char *nodes; + char *stepname; + int32_t status; + int32_t exitcode; + uint32_t ntasks, ncpus; + uint32_t elapsed; + uint32_t tot_cpu_sec, tot_cpu_usec; + uint32_t vsize, psize; + struct rusage rusage; +} step_rec_t; + +typedef struct selected_step_t { + char *job; + char *step; +} selected_step_t; + +typedef struct fields { + char *name; /* Specified in --fields= */ + void (*print_routine) (); /* Who gets to print it? 
*/ +} fields_t; + +/* Input parameters */ +typedef struct sacct_parameters { + int opt_dump; /* --dump */ + int opt_dup; /* --duplicates; +1 = explicitly set */ + int opt_fdump; /* --formattted_dump */ + int opt_gid; /* --gid (-1=wildcard, 0=root) */ + int opt_header; /* can only be cleared */ + int opt_help; /* --help */ + int opt_long; /* --long */ + int opt_lowmem; /* --low_memory */ + int opt_purge; /* --purge */ + int opt_total; /* --total */ + int opt_uid; /* --uid (-1=wildcard, 0=root) */ + int opt_verbose; /* --verbose */ + long opt_expire; /* --expire= */ + char *opt_expire_timespec; /* --expire= */ + char *opt_field_list; /* --fields= */ + char *opt_filein; /* --file */ + char *opt_job_list; /* --jobs */ + char *opt_jobstep_list; /* --jobstep */ + char *opt_partition_list;/* --partitions */ + char *opt_state_list; /* --states */ +} sacct_parameters_t; + +extern fields_t fields[]; +extern sacct_parameters_t params; + +extern long input_error; /* Muddle through bad data, but complain! 
*/ + +extern List jobs; + +extern int printfields[MAX_PRINTFIELDS], /* Indexed into fields[] */ + nprintfields; + +/* process.c */ +void process_start(char *f[], int lc); +void process_step(char *f[], int lc); +void process_suspend(char *f[], int lc); +void process_terminated(char *f[], int lc); +void destroy_job(void *object); +void destroy_step(void *object); + +/* print.c */ +void print_fields(type_t type, void *object); +void print_cpu(type_t type, void *object); +void print_elapsed(type_t type, void *object); +void print_exitcode(type_t type, void *object); +void print_gid(type_t type, void *object); +void print_group(type_t type, void *object); +void print_idrss(type_t type, void *object); +void print_inblocks(type_t type, void *object); +void print_isrss(type_t type, void *object); +void print_ixrss(type_t type, void *object); +void print_job(type_t type, void *object); +void print_name(type_t type, void *object); +void print_step(type_t type, void *object); +void print_majflt(type_t type, void *object); +void print_minflt(type_t type, void *object); +void print_msgrcv(type_t type, void *object); +void print_msgsnd(type_t type, void *object); +void print_ncpus(type_t type, void *object); +void print_nivcsw(type_t type, void *object); +void print_nodes(type_t type, void *object); +void print_nsignals(type_t type, void *object); +void print_nswap(type_t type, void *object); +void print_ntasks(type_t type, void *object); +void print_nvcsw(type_t type, void *object); +void print_outblocks(type_t type, void *object); +void print_partition(type_t type, void *object); +void print_psize(type_t type, void *object); +void print_rss(type_t type, void *object); +void print_status(type_t type, void *object); +void print_submitted(type_t type, void *object); +void print_systemcpu(type_t type, void *object); +void print_uid(type_t type, void *object); +void print_user(type_t type, void *object); +void print_usercpu(type_t type, void *object); +void print_vsize(type_t type, 
void *object); + +/* options.c */ +int decode_status_char(char *status); +char *decode_status_int(int status); +int get_data(void); +void parse_command_line(int argc, char **argv); +void do_dump(void); +void do_expire(void); +void do_fdump(char* fields[], int lc); +void do_help(void); +void do_list(void); +void sacct_init(); +void sacct_fini(); + +#endif /* !_SACCT_H */ -- GitLab