Skip to content
Snippets Groups Projects
Commit cbcb167a authored by Danny Auble's avatar Danny Auble
Browse files

support for bluegene systems on sacct

parent 15962166
No related branches found
No related tags found
No related merge requests found
......@@ -128,7 +128,7 @@ void _dump_header(acct_header_t header)
(int)header.timestamp,
header.uid,
header.gid,
"-", /* reserved 2 */
header.blockid, /* block id */
"-"); /* reserved 1 */
}
/* _open_log_file() -- find the current or specified log file, and open it
......@@ -1449,7 +1449,7 @@ void do_fdump(char* f[], int lc)
"timestamp", /* F_TIMESTAMP */
"uid", /* F_UIDGID */
"gid", /* F_UIDGID */
"reserved-1",/* F_RESERVED1 */
"BlockID", /* F_BLOCKID */
"reserved-2",/* F_RESERVED1 */
"recordType",/* F_RECTYPE */
NULL};
......
......@@ -644,6 +644,35 @@ void print_partition(type_t type, void *object)
}
}
/*
 * print_blockid() -- print the "BlockID" column for one output row.
 *
 * type   - which kind of row is being printed (HEADLINE, UNDERSCORE,
 *          JOB or JOBSTEP); any other value prints nothing.
 * object - a job_rec_t * for JOB, a step_rec_t * for JOBSTEP; it is not
 *          dereferenced for HEADLINE or UNDERSCORE.
 *
 * The column is 16 characters wide.  A block id longer than 16 characters
 * is cut to its first 13 characters followed by "...".  A missing (NULL)
 * block id is shown as "-", the same placeholder the accounting log writer
 * uses when no block id is available.
 */
void print_blockid(type_t type, void *object)
{
	job_rec_t *job = (job_rec_t *)object;
	step_rec_t *step = (step_rec_t *)object;
	const char *blockid = NULL;

	switch(type) {
	case HEADLINE:
		printf("%-16s", "BlockID");
		return;
	case UNDERSCORE:
		printf("%-16s", "----------------");
		return;
	case JOB:
		/* the block id, not the partition, belongs in this column */
		blockid = job->header.blockid;
		break;
	case JOBSTEP:
		blockid = step->header.blockid;
		break;
	default:
		return;
	}

	/* strlen() on a NULL pointer is undefined; fall back to "-" */
	if (!blockid)
		blockid = "-";

	/* decide on truncation from the string that is actually printed */
	if (strlen(blockid) < 17)
		printf("%-16s", blockid);
	else
		printf("%-13.13s...", blockid);
}
void print_psize(type_t type, void *object)
{
job_rec_t *job = (job_rec_t *)object;
......
......@@ -30,7 +30,8 @@
job_rec_t *_find_job_record(acct_header_t header);
step_rec_t *_find_step_record(job_rec_t *job, long jobstep);
job_rec_t *_init_job_rec(acct_header_t header, int lc);
job_rec_t *_init_job_rec(acct_header_t header);
step_rec_t *_init_step_rec(acct_header_t header);
int _parse_line(char *f[], void **data);
job_rec_t *_find_job_record(acct_header_t header)
......@@ -80,50 +81,55 @@ step_rec_t *_find_step_record(job_rec_t *job, long stepnum)
return step;
}
job_rec_t *_init_job_rec(acct_header_t header, int lc)
job_rec_t *_init_job_rec(acct_header_t header)
{
job_rec_t *job = xmalloc(sizeof(job_rec_t));
job->header.jobnum = header.jobnum;
job->header.partition = xstrdup(header.partition);
job->header.job_start = header.job_start;
job->header.timestamp = header.timestamp;
job->header.uid = header.uid;
job->header.gid = header.gid;
memcpy(&job->header, &header, sizeof(acct_header_t));
memset(&job->rusage, 0, sizeof(struct rusage));
job->job_start_seen = 0;
job->job_step_seen = 0;
job->job_terminated_seen = 0;
job->jobnum_superseded = 0;
job->jobname = xstrdup("(unknown)");
job->status = JOB_PENDING;
job->nodes = NULL;
job->jobname = NULL;
job->exitcode = 0;
job->priority = 0;
job->ntasks = 0;
job->ncpus = 0;
job->elapsed = 0;
job->tot_cpu_sec = 0;
job->tot_cpu_usec = 0;
job->rusage.ru_utime.tv_sec = 0;
job->rusage.ru_utime.tv_usec += 0;
job->rusage.ru_stime.tv_sec += 0;
job->rusage.ru_stime.tv_usec += 0;
job->rusage.ru_inblock += 0;
job->rusage.ru_oublock += 0;
job->rusage.ru_msgsnd += 0;
job->rusage.ru_msgrcv += 0;
job->rusage.ru_nsignals += 0;
job->rusage.ru_nvcsw += 0;
job->rusage.ru_nivcsw += 0;
job->rusage.ru_maxrss = 0;
job->rusage.ru_ixrss = 0;
job->rusage.ru_idrss = 0;
job->rusage.ru_isrss = 0;
job->rusage.ru_minflt = 0;
job->rusage.ru_majflt = 0;
job->rusage.ru_nswap = 0;
job->vsize = 0;
job->psize = 0;
job->exitcode = 0;
job->psize = 0;
job->steps = list_create(destroy_step);
job->nodes = NULL;
job->track_steps = 0;
return job;
}
/*
 * _init_step_rec() -- allocate a step record and give it initial values.
 *
 * header - parsed record header; it is copied into the new record with
 *          memcpy(), i.e. a shallow copy.  The partition and blockid
 *          pointers inside the header are therefore shared with the new
 *          record rather than duplicated.
 *          NOTE(review): destroy_step() releases those strings through
 *          destroy_acct_header(), so the caller presumably must not free
 *          them itself -- confirm at each call site.
 *
 * Returns the new record.  The rusage block is zeroed, nodes and stepname
 * are NULL, and every numeric field is set to NO_VAL so that "never filled
 * in" can be told apart from a real value.
 */
step_rec_t *_init_step_rec(acct_header_t header)
{
	/* size the allocation for a step record, not a job record */
	step_rec_t *step = xmalloc(sizeof(step_rec_t));

	memcpy(&step->header, &header, sizeof(acct_header_t));
	memset(&step->rusage, 0, sizeof(struct rusage));

	step->stepnum = (uint32_t)NO_VAL;
	step->nodes = NULL;
	step->stepname = NULL;
	step->status = NO_VAL;
	step->exitcode = NO_VAL;
	step->ntasks = (uint32_t)NO_VAL;
	step->ncpus = (uint32_t)NO_VAL;
	step->elapsed = (uint32_t)NO_VAL;
	step->tot_cpu_sec = (uint32_t)NO_VAL;
	step->tot_cpu_usec = (uint32_t)NO_VAL;
	step->vsize = (uint32_t)NO_VAL;
	step->psize = (uint32_t)NO_VAL;

	return step;
}
int _parse_header(char *f[], acct_header_t *header)
{
header->jobnum = atoi(f[F_JOB]);
......@@ -132,6 +138,7 @@ int _parse_header(char *f[], acct_header_t *header)
header->timestamp = atoi(f[F_TIMESTAMP]);
header->uid = atoi(f[F_UID]);
header->gid = atoi(f[F_GID]);
header->blockid = xstrdup(f[F_BLOCKID]);
return SLURM_SUCCESS;
}
......@@ -140,11 +147,12 @@ int _parse_line(char *f[], void **data)
int i = atoi(f[F_RECTYPE]);
job_rec_t **job = (job_rec_t **)data;
step_rec_t **step = (step_rec_t **)data;
acct_header_t header;
_parse_header(f, &header);
switch(i) {
case JOB_START:
*job = xmalloc(sizeof(job_rec_t));
_parse_header(f, &(*job)->header);
*job = _init_job_rec(header);
(*job)->jobname = xstrdup(f[F_JOBNAME]);
(*job)->track_steps = atoi(f[F_TRACK_STEPS]);
(*job)->priority = atoi(f[F_PRIORITY]);
......@@ -159,8 +167,7 @@ int _parse_line(char *f[], void **data)
}
break;
case JOB_STEP:
*step = xmalloc(sizeof(step_rec_t));
_parse_header(f, &(*step)->header);
*step = _init_step_rec(header);
(*step)->stepnum = atoi(f[F_JOBSTEP]);
(*step)->status = atoi(f[F_STATUS]);
(*step)->exitcode = atoi(f[F_EXITCODE]);
......@@ -191,11 +198,11 @@ int _parse_line(char *f[], void **data)
(*step)->psize = atoi(f[F_PSIZE]);
(*step)->stepname = xstrdup(f[F_STEPNAME]);
(*step)->nodes = xstrdup(f[F_STEPNODES]);
break;
case JOB_SUSPEND:
case JOB_TERMINATED:
*job = xmalloc(sizeof(job_rec_t));
_parse_header(f, &(*job)->header);
*job = _init_job_rec(header);
(*job)->elapsed = atoi(f[F_TOT_ELAPSED]);
(*job)->status = atoi(f[F_STATUS]);
break;
......@@ -223,18 +230,11 @@ void process_start(char *f[], int lc)
return;
}
job = _init_job_rec(temp->header, lc);
job = temp;
list_append(jobs, job);
job->job_start_seen = 1;
job->header.uid = temp->header.uid;
job->header.gid = temp->header.gid;
xfree(job->jobname);
job->jobname = xstrdup(temp->jobname);
job->priority = temp->priority;
job->track_steps = temp->track_steps;
job->ncpus = temp->ncpus;
job->nodes = xstrdup(temp->nodes);
destroy_job(temp);
}
void process_step(char *f[], int lc)
......@@ -253,7 +253,7 @@ void process_step(char *f[], int lc)
return;
}
if (!job) { /* fake it for now */
job = _init_job_rec(temp->header, lc);
job = _init_job_rec(temp->header);
if ((params.opt_verbose > 1)
&& (params.opt_jobstep_list==NULL))
fprintf(stderr,
......@@ -291,6 +291,7 @@ void process_step(char *f[], int lc)
step->psize = temp->psize;
xfree(step->stepname);
step->stepname = xstrdup(temp->stepname);
destroy_step(temp);
goto got_step;
}
step = temp;
......@@ -305,7 +306,7 @@ void process_step(char *f[], int lc)
}
got_step:
destroy_step(temp);
if (job->job_terminated_seen == 0) { /* If the job is still running,
this is the most recent
......@@ -359,7 +360,7 @@ void process_suspend(char *f[], int lc)
_parse_line(f, (void **)&temp);
job = _find_job_record(temp->header);
if (!job)
job = _init_job_rec(temp->header, lc);
job = _init_job_rec(temp->header);
if (job->status == JOB_SUSPENDED)
job->elapsed -= temp->elapsed;
......@@ -377,7 +378,7 @@ void process_terminated(char *f[], int lc)
_parse_line(f, (void **)&temp);
job = _find_job_record(temp->header);
if (!job) { /* fake it for now */
job = _init_job_rec(temp->header, lc);
job = _init_job_rec(temp->header);
if (params.opt_verbose > 1)
fprintf(stderr, "Note: JOB_TERMINATED record for job "
"%u preceded "
......@@ -418,13 +419,21 @@ finished:
destroy_job(temp);
}
/*
 * destroy_acct_header() -- release the strings held by a record header.
 *
 * object - pointer to an acct_header_t, or NULL (in which case nothing
 *          is done).
 *
 * Only the partition and blockid strings are freed; the header structure
 * itself is left alone.
 */
void destroy_acct_header(void *object)
{
	acct_header_t *hdr = (acct_header_t *)object;

	if (!hdr)
		return;

	xfree(hdr->blockid);
	xfree(hdr->partition);
}
void destroy_job(void *object)
{
job_rec_t *job = (job_rec_t *)object;
if (job) {
if(job->steps)
list_destroy(job->steps);
xfree(job->header.partition);
destroy_acct_header(&job->header);
xfree(job->jobname);
xfree(job->nodes);
xfree(job);
......@@ -435,7 +444,7 @@ void destroy_step(void *object)
{
step_rec_t *step = (step_rec_t *)object;
if (step) {
xfree(step->header.partition);
destroy_acct_header(&step->header);
xfree(step->stepname);
xfree(step->nodes);
xfree(step);
......
......@@ -165,6 +165,7 @@ fields_t fields[] = {{"cpu", print_cpu},
{"nvcsw", print_nvcsw},
{"outblocks", print_outblocks},
{"partition", print_partition},
{"blockid", print_blockid},
{"psize", print_psize},
{"rss", print_rss},
{"status", print_status},
......
......@@ -83,7 +83,7 @@ enum { F_JOB = 0,
F_TIMESTAMP,
F_UID,
F_GID,
F_RESERVED1,
F_BLOCKID,
F_RESERVED2,
F_RECTYPE,
HEADER_LENGTH
......@@ -159,6 +159,7 @@ enum { CANCELLED,
typedef struct header {
uint32_t jobnum;
char *partition;
char *blockid;
time_t job_start;
time_t timestamp;
uint32_t uid;
......@@ -181,24 +182,28 @@ typedef struct job_rec {
int32_t status;
int32_t exitcode;
uint32_t elapsed;
uint32_t tot_cpu_sec, tot_cpu_usec;
uint32_t vsize, psize;
uint32_t tot_cpu_sec;
uint32_t tot_cpu_usec;
uint32_t vsize;
uint32_t psize;
struct rusage rusage;
List steps;
} job_rec_t;
typedef struct step_rec {
acct_header_t header;
acct_header_t header;
uint32_t stepnum; /* job's step number */
uint32_t next; /* linked list of job steps */
char *nodes;
char *stepname;
int32_t status;
int32_t exitcode;
uint32_t ntasks, ncpus;
uint32_t ntasks;
uint32_t ncpus;
uint32_t elapsed;
uint32_t tot_cpu_sec, tot_cpu_usec;
uint32_t vsize, psize;
uint32_t tot_cpu_sec;
uint32_t tot_cpu_usec;
uint32_t vsize;
uint32_t psize;
struct rusage rusage;
} step_rec_t;
......@@ -251,6 +256,7 @@ void process_start(char *f[], int lc);
void process_step(char *f[], int lc);
void process_suspend(char *f[], int lc);
void process_terminated(char *f[], int lc);
void destroy_acct_header(void *object);
void destroy_job(void *object);
void destroy_step(void *object);
......@@ -281,6 +287,7 @@ void print_ntasks(type_t type, void *object);
void print_nvcsw(type_t type, void *object);
void print_outblocks(type_t type, void *object);
void print_partition(type_t type, void *object);
void print_blockid(type_t type, void *object);
void print_psize(type_t type, void *object);
void print_rss(type_t type, void *object);
void print_status(type_t type, void *object);
......
......@@ -79,24 +79,33 @@ const char *_jobstep_format =
static int _print_record(struct job_record *job_ptr,
time_t time, char *data)
{
struct tm *ts; /* timestamp decoder */
static int rc=SLURM_SUCCESS;
char *block_id = NULL;
ts = xmalloc(sizeof(struct tm));
gmtime_r(&time, ts);
debug3("_print_record, job=%u, \"%s\"",
job_ptr->job_id, data);
#ifdef HAVE_BG
select_g_get_jobinfo(job_ptr->select_jobinfo,
SELECT_DATA_BLOCK_ID,
&block_id);
#endif
if(!block_id)
block_id = xstrdup("-");
slurm_mutex_lock( &logfile_lock );
if (fprintf(LOGFILE,
"%u %s %u %u %d %d - - %s\n",
"%u %s %u %u %d %d %s - %s\n",
job_ptr->job_id, job_ptr->partition,
(int)job_ptr->start_time, (int)time,
job_ptr->user_id, job_ptr->group_id, data)
job_ptr->user_id, job_ptr->group_id, block_id, data)
< 0)
rc=SLURM_ERROR;
fdatasync(LOGFILE_FD);
slurm_mutex_unlock( &logfile_lock );
xfree(ts);
xfree(block_id);
return rc;
}
......@@ -146,6 +155,7 @@ int jobacct_job_start(struct job_record *job_ptr)
debug("jobacct init was not called or it failed");
return SLURM_ERROR;
}
debug2("jobacct_job_start() called");
for (i=0; i < job_ptr->num_cpu_groups; i++)
ncpus += (job_ptr->cpus_per_node[i])
......@@ -185,17 +195,39 @@ int jobacct_step_start(struct step_record *step)
{
char buf[BUFFER_SIZE];
int cpus = 0;
char node_list[BUFFER_SIZE];
#ifdef HAVE_BG
uint16_t quarter = (uint16_t)NO_VAL;
uint16_t nodecard = (uint16_t)NO_VAL;
#endif
if(!init) {
debug("jobacct init was not called or it failed");
return SLURM_ERROR;
}
#ifdef HAVE_BG
cpus = step->job_ptr->num_procs;
select_g_get_jobinfo(step->job_ptr->select_jobinfo,
SELECT_DATA_QUARTER,
&quarter);
select_g_get_jobinfo(step->job_ptr->select_jobinfo,
SELECT_DATA_NODECARD,
&nodecard);
if(quarter != (uint16_t)NO_VAL
&& nodecard != (uint16_t)NO_VAL)
snprintf(node_list, BUFFER_SIZE,
"%s.%d.%d", step->step_node_list, quarter, nodecard);
else if(quarter != (uint16_t)NO_VAL)
snprintf(node_list, BUFFER_SIZE,
"%s.%d", step->step_node_list, quarter);
else
snprintf(node_list, BUFFER_SIZE, "%s", step->step_node_list);
#else
cpus = step->num_cpus;
snprintf(node_list, BUFFER_SIZE, "%s", step->step_node_list);
block_id = xstrdup("-");
#endif
snprintf(buf, BUFFER_SIZE, _jobstep_format,
JOB_STEP,
step->step_id, /* stepid */
......@@ -226,8 +258,9 @@ int jobacct_step_start(struct step_record *step)
0, /* total nivcsw */
0, /* max vsize */
0, /* max psize */
step->name, /* step exe name */
step->step_node_list); /* name of nodes step running on */
step->name, /* step exe name */
node_list); /* name of nodes step running on */
return _print_record(step->job_ptr, step->start_time, buf);
}
......@@ -238,6 +271,11 @@ int jobacct_step_complete(struct step_record *step)
int elapsed;
int comp_status;
int cpus = 0;
char node_list[BUFFER_SIZE];
#ifdef HAVE_BG
uint16_t quarter = (uint16_t)NO_VAL;
uint16_t nodecard = (uint16_t)NO_VAL;
#endif
if(!init) {
debug("jobacct init was not called or it failed");
......@@ -254,9 +292,26 @@ int jobacct_step_complete(struct step_record *step)
comp_status = JOB_COMPLETE;
#ifdef HAVE_BG
cpus = step->job_ptr->num_procs;
select_g_get_jobinfo(step->job_ptr->select_jobinfo,
SELECT_DATA_QUARTER,
&quarter);
select_g_get_jobinfo(step->job_ptr->select_jobinfo,
SELECT_DATA_NODECARD,
&nodecard);
if(quarter != (uint16_t)NO_VAL
&& nodecard != (uint16_t)NO_VAL)
snprintf(node_list, BUFFER_SIZE,
"%s.%d.%d", step->step_node_list, quarter, nodecard);
else if(quarter != (uint16_t)NO_VAL)
snprintf(node_list, BUFFER_SIZE,
"%s.%d", step->step_node_list, quarter);
else
snprintf(node_list, BUFFER_SIZE, "%s", step->step_node_list);
#else
cpus = step->num_cpus;
snprintf(node_list, BUFFER_SIZE, "%s", step->step_node_list);
block_id = xstrdup("-");
#endif
snprintf(buf, BUFFER_SIZE, _jobstep_format,
......@@ -294,8 +349,8 @@ int jobacct_step_complete(struct step_record *step)
step->max_vsize, /* max vsize */
step->max_psize, /* max psize */
step->name, /* step exe name */
step->step_node_list); /* name of nodes step running on */
node_list); /* name of nodes step running on */
return _print_record(step->job_ptr, now, buf);
}
......
......@@ -35,6 +35,7 @@
#include <slurm/slurm_errno.h>
#include <sys/stat.h>
#include "src/common/xstring.h"
#include "src/common/node_select.h"
#include "slurmctld.h"
int jobacct_init(char *job_acct_log);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment