diff --git a/NEWS b/NEWS index d2a1d3fb1cd3288534b6d05576a05e22e9a282b1..dd820164c5193c37759dff2eb051e7f8b97d8f07 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,10 @@ This file describes changes in recent versions of SLURM. It primarily documents those changes that are of interest to users and admins. +* Changes in SLURM 1.1.0-pre6 +============================= + -- Accounting works for aix systems use jobacct/aix + * Changes in SLURM 1.1.0-pre6 ============================= -- Added logic to "stat" a running job with sacct option -S use -j to specify diff --git a/src/api/job_info.c b/src/api/job_info.c index 7395dd66d17d9c2f276d1d1748bcc0008853bb83..4c65ceeb833374fea01d823704487acc5641a356 100644 --- a/src/api/job_info.c +++ b/src/api/job_info.c @@ -269,7 +269,7 @@ slurm_print_job_info ( FILE* out, job_info_t * job_ptr, int one_liner ) strncpy(time_str, "None", sizeof(time_str)); } fprintf ( out, "SuspendTime=%s PreSusTime=%ld", - time_str, job_ptr->pre_sus_time); + time_str, (long int)job_ptr->pre_sus_time); /****** Line 13 (optional) ******/ select_g_sprint_jobinfo(job_ptr->select_jobinfo, diff --git a/src/common/env.c b/src/common/env.c index 97ca68e7ca3485a6406b56ba8862a98ae382edd6..c7cac5a28dc581ef43918cca0af28c74b7f4b518 100644 --- a/src/common/env.c +++ b/src/common/env.c @@ -502,7 +502,7 @@ int setup_env(env_t *env) #ifdef HAVE_AIX { - char res_env[128], tmp_env[32]; + char res_env[128]; char *debug_env = (char *)getenv("SLURM_LL_API_DEBUG"); int debug_num = 0; diff --git a/src/plugins/checkpoint/aix/checkpoint_aix.c b/src/plugins/checkpoint/aix/checkpoint_aix.c index 873ee38c7ce0af7f2fdf7e634d9fa743bade8b02..e0f18ae8c6c00570c48060304937105f71ef9b7a 100644 --- a/src/plugins/checkpoint/aix/checkpoint_aix.c +++ b/src/plugins/checkpoint/aix/checkpoint_aix.c @@ -412,7 +412,7 @@ static void *_ckpt_agent_thr(void *arg) iter = list_iterator_create(ckpt_timeout_list); slurm_mutex_lock(&ckpt_agent_mutex); /* look for and process any timeouts */ - while (rec = list_next(iter)) { + while ((rec = list_next(iter))) { if (rec->end_time > now) continue; info("checkpoint timeout for %u.%u", @@ -479,7 +479,7 @@ static void _ckpt_dequeue_timeout(uint32_t job_id, uint32_t step_id, if (!ckpt_timeout_list) goto fini; iter = list_iterator_create(ckpt_timeout_list); - while (rec = list_next(iter)) { + while ((rec = list_next(iter))) { if ((rec->job_id != job_id) || (rec->step_id != step_id) || (start_time && (rec->start_time != start_time))) continue; diff --git a/src/plugins/jobacct/aix/jobacct_aix.c b/src/plugins/jobacct/aix/jobacct_aix.c index cdf23807b32c2c166b6d3111ad541d3199c9155b..c9394869d4faebd4a0dd0ddb8439c334da6b0e08 100644 --- a/src/plugins/jobacct/aix/jobacct_aix.c +++ b/src/plugins/jobacct/aix/jobacct_aix.c @@ -30,6 +30,12 @@ #include "src/plugins/jobacct/common/jobacct_common.h" +#ifdef HAVE_AIX +#include <procinfo.h> +#include <sys/types.h> +#define NPROCS 5000 +#endif + /* * These variables are required by the generic plugin interface. If they @@ -61,28 +67,37 @@ * matures. */ const char plugin_name[] = -"Job accounting LINUX plugin"; -const char plugin_type[] = "jobacct/linux"; +"Job accounting AIX plugin"; +const char plugin_type[] = "jobacct/aix"; const uint32_t plugin_version = 100; /* Other useful declarations */ -#if 0 +#ifdef HAVE_AIX typedef struct prec { /* process record */ - pid_t pid; - pid_t ppid; - int psize; /* maxrss */ - int vsize; /* max virtual size */ + pid_t pid; + pid_t ppid; + int usec; /* user cpu time */ + int ssec; /* system cpu time */ + int pages; /* pages */ + float rss; /* maxrss */ + float vsize; /* max virtual size */ } prec_t; static int freq = 0; -static List prec_list = NULL; /* Finally, pre-define all the routines. */ -static void _get_offspring_data(prec_t *ancestor, pid_t pid); -static void _get_process_data(pid_t pid); -static int _get_process_data_line(FILE *in, prec_t *prec); +static void _get_offspring_data(List prec_list, prec_t *ancestor, pid_t pid); +static void _get_process_data(); static void *_watch_tasks(void *arg); static void _destroy_prec(void *object); + +/* system call to get process table */ +extern int getprocs(struct procsinfo *procinfo, int, struct fdsinfo *, + int, pid_t *, int); + /* procinfo: pointer to array of procinfo struct */ + /* nproc: number of user procinfo struct */ + /* sizproc: size of expected procinfo structure */ + #endif /* * The following routine is called by the slurmd mainline @@ -181,29 +196,22 @@ int jobacct_p_startpoll(int frequency) { int rc = SLURM_SUCCESS; -#if 0 pthread_attr_t attr; pthread_t _watch_tasks_thread_id; -#endif - debug("jobacct AIX plugin loaded"); - return rc; -#if 0 - /* FIXME!!!!!!!!!!!!!!!!!!!! - This was written for linux systems doesn't to anything on AIX */ - - /* Parse the JobAcctParameters */ + debug("jobacct AIX plugin loaded"); debug("jobacct: frequency = %d", frequency); fini = false; if (frequency == 0) { /* don't want dynamic monitoring? */ - debug2("jobacct LINUX dynamic logging disabled"); + debug2("jobacct AIX dynamic logging disabled"); return rc; } freq = frequency; + task_list = list_create(common_free_jobacct); /* create polling thread */ slurm_attr_init(&attr); @@ -217,9 +225,8 @@ int jobacct_p_startpoll(int frequency) frequency = 0; } else - debug3("jobacct LINUX dynamic logging enabled"); + debug3("jobacct AIX dynamic logging enabled"); slurm_attr_destroy(&attr); -#endif return rc; } @@ -248,7 +255,8 @@ void jobacct_p_suspendpoll() common_suspendpoll(); } -#if 0 +#ifdef HAVE_AIX + /* * _get_offspring_data() -- collect memory usage data for the offspring * @@ -256,7 +264,8 @@ void jobacct_p_suspendpoll() * usage data to the ancestor's <prec> record. Recurse to gather data * for *all* subsequent generations. * - * IN: ancestor The entry in precTable[] to which the data + * IN: prec_list list of prec's + * ancestor The entry in prec_list to which the data * should be added. Even as we recurse, this will * always be the prec for the base of the family * tree. @@ -269,9 +278,27 @@ void jobacct_p_suspendpoll() * * THREADSAFE! Only one thread ever gets here. */ -static void _get_offspring_data(prec_t *ancestor, pid_t pid) +static void _get_offspring_data(List prec_list, prec_t *ancestor, pid_t pid) { - + ListIterator itr; + prec_t *prec = NULL; + + itr = list_iterator_create(prec_list); + while((prec = list_next(itr))) { + if (prec->ppid == pid) { + _get_offspring_data(prec_list, ancestor, prec->pid); + debug2("adding %d to %d rss = %f vsize = %f", + prec->pid, ancestor->pid, + prec->rss, prec->vsize); + ancestor->usec += prec->usec; + ancestor->ssec += prec->ssec; + ancestor->pages += prec->pages; + ancestor->rss += prec->rss; + ancestor->vsize += prec->vsize; + } + } + list_iterator_destroy(itr); + return; } @@ -289,31 +316,89 @@ static void _get_offspring_data(prec_t *ancestor, pid_t pid) * is a Linux-style stat entry. We disregard the data if they look * wrong. */ -static void _get_process_data(pid_t pid) +static void _get_process_data() { + struct procsinfo proc; + int pid = 0; + static int processing = 0; + prec_t *prec = NULL; + struct jobacctinfo *jobacct = NULL; + List prec_list = NULL; + ListIterator itr; + ListIterator itr2; + + if(processing) { + debug("already running, returning"); + return; + } + + processing = 1; + prec_list = list_create(_destroy_prec); + /* get the whole process table */ + while(getprocs(&proc, sizeof(proc), 0, 0, &pid, 1) == 1) { + prec = xmalloc(sizeof(prec_t)); + list_append(prec_list, prec); + prec->pid = proc.pi_pid; + prec->ppid = proc.pi_ppid; + prec->usec = proc.pi_ru.ru_utime.tv_sec + + proc.pi_ru.ru_utime.tv_usec * 1e-6; + prec->ssec = proc.pi_ru.ru_stime.tv_sec + + proc.pi_ru.ru_stime.tv_usec * 1e-6; + prec->pages = proc.pi_majflt; + prec->rss = (proc.pi_trss + proc.pi_drss) * 4; + prec->rss *= 1024; + prec->vsize = (proc.pi_tsize / 1024); + prec->vsize += (proc.pi_dvm * 4); + prec->vsize *= 1024; + /* debug("vsize = %f = %d/1024+%d", */ +/* prec->vsize, proc.pi_tsize, proc.pi_dvm * 4); */ + } + if(!list_count(prec_list)) + goto finished; + + slurm_mutex_lock(&jobacct_lock); + if(!task_list || !list_count(task_list)) { + slurm_mutex_unlock(&jobacct_lock); + goto finished; + } + itr = list_iterator_create(task_list); + while((jobacct = list_next(itr))) { + itr2 = list_iterator_create(prec_list); + while((prec = list_next(itr2))) { + //debug2("pid %d ? %d", prec->ppid, jobacct->pid); + if (prec->pid == jobacct->pid) { + /* find all my descendents */ + _get_offspring_data(prec_list, prec, + prec->pid); + + /* tally their usage */ + jobacct->max_rss = jobacct->tot_rss = + MAX(jobacct->max_rss, (int)prec->rss); + jobacct->max_vsize = jobacct->tot_vsize = + MAX(jobacct->max_vsize, + (int)prec->vsize); + jobacct->max_pages = jobacct->tot_pages + = MAX(jobacct->max_pages, prec->pages); + jobacct->min_cpu = jobacct->tot_cpu = + (prec->usec + prec->ssec); + debug2("%d size now %d %d time %d", + jobacct->pid, jobacct->max_rss, + jobacct->max_vsize, jobacct->tot_cpu); + + break; + } + } + list_iterator_destroy(itr2); + } + list_iterator_destroy(itr); + slurm_mutex_unlock(&jobacct_lock); +finished: + list_destroy(prec_list); + processing = 0; return; } -/* _get_process_data_line() - get line of data from /proc/<pid>/stat - * - * IN: in - input file channel - * OUT: prec - the destination for the data - * - * RETVAL: ==0 - no valid data - * !=0 - data are valid - * - * Note: It seems a bit wasteful to do all those atoi() and - * atol() conversions that are implicit in the scanf(), - * but they help to ensure that we really are looking at the - * expected type of record. - */ -static int _get_process_data_line(FILE *in, prec_t *prec) -{ - /* discardable data */ - - return 1; -} /* _watch_tasks() -- monitor slurm jobs and track their memory usage * @@ -323,10 +408,9 @@ static int _get_process_data_line(FILE *in, prec_t *prec) static void *_watch_tasks(void *arg) { - pid_t pid = getpid(); while(!fini) { /* Do this until slurm_jobacct_task_exit() stops us */ if(!suspended) { - _get_process_data(pid); /* Update the data */ + _get_process_data(); /* Update the data */ } sleep(freq); } diff --git a/src/plugins/jobacct/common/common_slurmctld.c b/src/plugins/jobacct/common/common_slurmctld.c index 0607c16dc2d9707e4f73012e0b473633f31a7535..d0f460128619561c426933532b4bc020a7b8f62e 100644 --- a/src/plugins/jobacct/common/common_slurmctld.c +++ b/src/plugins/jobacct/common/common_slurmctld.c @@ -371,13 +371,18 @@ extern int common_step_complete_slurmctld(struct step_record *step) #endif /* figure out the ave of the totals sent */ if(step->num_tasks > 0) { - ave_vsize = jobacct->tot_vsize/step->num_tasks; - ave_rss = jobacct->tot_rss/step->num_tasks; - ave_pages = jobacct->tot_pages/step->num_tasks; - ave_cpu = jobacct->tot_cpu/step->num_tasks; + ave_vsize = jobacct->tot_vsize; + ave_vsize /= step->num_tasks; + ave_rss = jobacct->tot_rss; + ave_rss /= step->num_tasks; + ave_pages = jobacct->tot_pages; + ave_pages /= step->num_tasks; + ave_cpu = jobacct->tot_cpu; + ave_cpu /= step->num_tasks; ave_cpu /= 100; } - ave_cpu2 = jobacct->min_cpu/100; + ave_cpu2 = jobacct->min_cpu; + ave_cpu2 /= 100; snprintf(buf, BUFFER_SIZE, _jobstep_format, JOB_STEP, diff --git a/src/plugins/jobacct/linux/jobacct_linux.c b/src/plugins/jobacct/linux/jobacct_linux.c index dbe2b2803c9db0e09ea51ee69317dc627beac435..acff1f2b1d66415a3149056ac803ad273375105e 100644 --- a/src/plugins/jobacct/linux/jobacct_linux.c +++ b/src/plugins/jobacct/linux/jobacct_linux.c @@ -397,7 +397,7 @@ static void _get_process_data() { while((jobacct = list_next(itr))) { itr2 = list_iterator_create(prec_list); while((prec = list_next(itr2))) { - if (prec->ppid == jobacct->pid) { + if (prec->pid == jobacct->pid) { /* find all my descendents */ _get_offspring_data(prec_list, prec, prec->pid); diff --git a/src/plugins/select/bluegene/block_allocator/block_allocator.c b/src/plugins/select/bluegene/block_allocator/block_allocator.c index ac166789b2ee145ba7543902b33d8180f6bf5c99..f8bbe2c8569dccc9342151a04af733d5d05c99da 100644 --- a/src/plugins/select/bluegene/block_allocator/block_allocator.c +++ b/src/plugins/select/bluegene/block_allocator/block_allocator.c @@ -2738,13 +2738,13 @@ static int _find_x_path(List results, ba_node_t *ba_node, ba_switch_t *curr_switch = NULL; ba_switch_t *next_switch = NULL; - int port_tar; + int port_tar = 0; int source_port=0; int target_port=0; int broke = 0, not_first = 0; int ports_to_try[2] = {3,5}; int *node_tar = NULL; - int i; + int i = 0; ba_node_t *next_node = NULL; ba_node_t *check_node = NULL; int highest_phys_x = geometry[X] - start[X]; @@ -3054,7 +3054,7 @@ static int _find_x_path2(List results, ba_node_t *ba_node, int broke = 0, not_first = 0; int ports_to_try[2] = {3,5}; int *node_tar = NULL; - int i; + int i = 0; ba_node_t *next_node = NULL; ba_node_t *check_node = NULL; diff --git a/src/plugins/switch/federation/federation.c b/src/plugins/switch/federation/federation.c index 9a06b268c7b208ebcd3e941686b4534f0c42a823..8be66221d6e021d356730a3be4fac0238e2b624e 100644 --- a/src/plugins/switch/federation/federation.c +++ b/src/plugins/switch/federation/federation.c @@ -1454,10 +1454,10 @@ static int _window_state_set(int adapter_cnt, fed_tableinfo_t *tableinfo, char *hostname, int task_id, enum NTBL_RC state) { - fed_nodeinfo_t *node; + fed_nodeinfo_t *node = NULL; fed_adapter_t *adapter = NULL; - fed_window_t *window; - NTBL *table; + fed_window_t *window = NULL; + NTBL *table = NULL; int i, j; bool adapter_found; @@ -1668,7 +1668,7 @@ fed_build_jobinfo(fed_jobinfo_t *jp, hostlist_t hl, int nprocs, { int nnodes; hostlist_iterator_t hi; - char *host; + char *host = NULL; int proc_cnt = 0; int i, j; fed_nodeinfo_t *node; diff --git a/src/plugins/switch/federation/federation.h b/src/plugins/switch/federation/federation.h index a3848ad5fff7cc0f9d63ea55d96a34db92cf7db0..9e2a1108a146dacd430901ca1ef021b3018ec6c3 100644 --- a/src/plugins/switch/federation/federation.h +++ b/src/plugins/switch/federation/federation.h @@ -66,6 +66,9 @@ enum { #define FED_MAXADAPTERS 2 #define FED_LIBSTATE_LEN (1024 * 1024 * 1) +int fed_slurmctld_init(void); +int fed_slurmd_init(void); +int fed_slurmd_step_init(void); int fed_alloc_nodeinfo(fed_nodeinfo_t **nh); int fed_build_nodeinfo(fed_nodeinfo_t *np, char *hostname); char *fed_print_nodeinfo(fed_nodeinfo_t *np, char *buf, size_t size); diff --git a/src/plugins/switch/federation/switch_federation.c b/src/plugins/switch/federation/switch_federation.c index a4de81c9acf277da7e70595bef820d047b8b9888..3468808d7bcd3f73abf03608ba9829e1a341c433 100644 --- a/src/plugins/switch/federation/switch_federation.c +++ b/src/plugins/switch/federation/switch_federation.c @@ -35,6 +35,7 @@ #include <signal.h> #include <sys/types.h> #include <regex.h> +#include <stdlib.h> #include <slurm/slurm_errno.h> #include "src/common/slurm_xlator.h" @@ -222,7 +223,7 @@ int switch_p_libstate_restore ( char * dir_name, bool recover ) data = xmalloc(data_allocated); while (1) { data_read = read (state_fd, &data[data_size], - FED_BUF_SIZE); + FED_BUF_SIZE); if ((data_read < 0) && (errno == EINTR)) continue; if (data_read < 0) { diff --git a/src/sacct/options.c b/src/sacct/options.c index 5efe7b0462c8488523ef27fdc17f015e92aa3d20..84013f1efd4649be7fcd9d17aaa853ae14d4a8a9 100644 --- a/src/sacct/options.c +++ b/src/sacct/options.c @@ -28,6 +28,7 @@ #include "src/common/read_config.h" #include "sacct.h" +#include <time.h> typedef struct expired_rec { /* table of expired jobs */ uint32_t job; @@ -122,13 +123,18 @@ int _cmp_jrec(const void *a1, const void *a2) { */ void _dump_header(acct_header_t header) { - printf("%u %s %d %d %d %d %s %s ", + struct tm ts; + gmtime_r(&header.timestamp, &ts); + printf("%u %s %04d%02d%02d%02d%02d%02d %d %s %s ", header.jobnum, header.partition, + 1900+(ts.tm_year), + 1+(ts.tm_mon), + ts.tm_mday, + ts.tm_hour, + ts.tm_min, + ts.tm_sec, (int)header.job_start, - (int)header.timestamp, - header.uid, - header.gid, header.blockid, /* block id */ "-"); /* reserved 1 */ } @@ -364,7 +370,7 @@ int decode_status_char(char *status) char *decode_status_int(int status) { - switch(status) { + switch(status & ~JOB_COMPLETING) { case JOB_PENDING: return "PENDING"; /* we should never see this */ case JOB_RUNNING: @@ -386,23 +392,33 @@ char *decode_status_int(int status) default: return "UNKNOWN"; } - /* if (!strcasecmp(cs, "ca")) */ -/* return "CANCELLED"; */ -/* else if (strcasecmp(cs, "cd")==0) */ -/* return "COMPLETED"; */ -/* else if (strcasecmp(cs, "cg")==0) */ -/* return "COMPLETING"; /\* we should never see this *\/ */ -/* else if (strcasecmp(cs, "f")==0) */ -/* return "FAILED"; */ -/* else if (strcasecmp(cs, "nf")==0) */ -/* return "NODEFAILED"; */ -/* else if (strcasecmp(cs, "p")==0) */ -/* return "PENDING"; /\* we should never see this *\/ */ -/* else if (strcasecmp(cs, "r")==0) */ -/* return "RUNNING"; */ -/* else if (strcasecmp(cs, "to")==0) */ -/* return "TIMEDOUT"; */ -} +} + +char *decode_status_int_abbrev(int status) +{ + switch(status & ~JOB_COMPLETING) { + case JOB_PENDING: + return "PD"; /* we should never see this */ + case JOB_RUNNING: + return "R"; + case JOB_SUSPENDED: + return "S"; + case JOB_COMPLETE: + return "CD"; + case JOB_CANCELLED: + return "CA"; + case JOB_FAILED: + return "F"; + case JOB_TIMEOUT: + return "TO"; + case JOB_NODE_FAIL: + return "NF"; + case JOB_END: + return "JOB_END"; + default: + return "UNKNOWN"; + } +} int get_data(void) { @@ -986,14 +1002,29 @@ endopt: return; } +/* Note: do_dump() strives to present data in an upward-compatible + * manner so that apps written to use data from `sacct -d` in slurm + * v1.0 will continue to work in v1.1 and later. + * + * To help ensure this compatibility, + * a. The meaning of an existing field never changes + * b. New fields are appended to the end of a record + * + * The "numfields" field of the record can be used as a sub-version + * number, as it will never decrease for the life of the current + * record version number (currently 1). For example, if your app needs + * to use field 28, a record with numfields<28 is too old a version + * for you, while numfields>=28 will provide what you are expecting. + */ void do_dump(void) { ListIterator itr = NULL; ListIterator itr_step = NULL; job_rec_t *job = NULL; step_rec_t *step = NULL; - float tempf; - + struct tm ts; + time_t finished; + itr = list_iterator_create(jobs); while((job = list_next(itr))) { if (!params.opt_dup) @@ -1009,18 +1040,14 @@ void do_dump(void) if (params.opt_uid>=0) if (job->header.uid != params.opt_uid) continue; - if(job->sacct.min_cpu == NO_VAL) + if(job->sacct.min_cpu == (float)NO_VAL) job->sacct.min_cpu = 0; - + if(list_count(job->steps)) { - tempf = job->sacct.ave_cpu/list_count(job->steps); - job->sacct.ave_cpu = (uint32_t)tempf; - tempf = job->sacct.ave_rss/list_count(job->steps); - job->sacct.ave_rss = (uint32_t)tempf; - tempf = job->sacct.ave_vsize/list_count(job->steps); - job->sacct.ave_vsize = (uint32_t)tempf; - tempf = job->sacct.ave_pages/list_count(job->steps); - job->sacct.ave_pages = (uint32_t)tempf; + job->sacct.ave_cpu /= list_count(job->steps); + job->sacct.ave_rss /= list_count(job->steps); + job->sacct.ave_vsize /= list_count(job->steps); + job->sacct.ave_pages /= list_count(job->steps); } /* JOB_START */ @@ -1034,7 +1061,9 @@ void do_dump(void) job->header.jobnum); } _dump_header(job->header); - printf("JOB_START %s %d %d %d %s\n", + printf("JOB_START 1 16 %d %d %s %d %d %d %s\n", + job->header.uid, + job->header.gid, job->jobname, job->track_steps, job->priority, @@ -1050,12 +1079,14 @@ void do_dump(void) step->exitcode=1; } _dump_header(step->header); - printf("JOB_STEP %u %s %s ", + finished=step->header.job_start+step->elapsed; + gmtime_r(&finished, &ts); + printf("JOB_STEP 1 50 %u %04d%02d%02d%02d%02d%02d ", step->stepnum, - step->stepname, - step->nodes); + 1900+(ts.tm_year), 1+(ts.tm_mon), ts.tm_mday, + ts.tm_hour, ts.tm_min, ts.tm_sec); printf("%s %d %d %d %d ", - decode_status_int(step->status), + decode_status_int_abbrev(step->status), step->exitcode, step->ntasks, step->ncpus, @@ -1068,7 +1099,7 @@ void do_dump(void) (int)step->rusage.ru_stime.tv_sec, (int)step->rusage.ru_stime.tv_usec); printf("%d %d %d %d %d %d %d %d %d " - "%d %d %d %d %d ", + "%d %d %d %d %d %d %d ", (int)step->rusage.ru_maxrss, (int)step->rusage.ru_ixrss, (int)step->rusage.ru_idrss, @@ -1082,13 +1113,14 @@ void do_dump(void) (int)step->rusage.ru_msgrcv, (int)step->rusage.ru_nsignals, (int)step->rusage.ru_nvcsw, - (int)step->rusage.ru_nivcsw); - printf("%d %d %.2f %d %d %.2f " - "%d %d %.2f %.2f %d %.2f\n", + (int)step->rusage.ru_nivcsw, step->sacct.max_vsize, + step->sacct.max_rss); + /* Data added in Slurm v1.1 */ + printf("%d %.2f %d %.2f %d %d %.2f " + "%.2f %d %.2f %s %s\n", step->sacct.max_vsize_task, step->sacct.ave_vsize, - step->sacct.max_rss, step->sacct.max_rss_task, step->sacct.ave_rss, step->sacct.max_pages, @@ -1096,16 +1128,23 @@ void do_dump(void) step->sacct.ave_pages, step->sacct.min_cpu, step->sacct.min_cpu_task, - step->sacct.ave_cpu); + step->sacct.ave_cpu, + step->stepname, + step->nodes); } list_iterator_destroy(itr_step); /* JOB_TERMINATED */ if (job->show_full) { _dump_header(job->header); - printf("JOB_TERMINATED %d ", + finished=job->header.job_start+job->elapsed; + gmtime_r(&finished, &ts); + printf("JOB_TERMINATED 1 50 %d ", job->elapsed); + printf("%04d%02d%02d%02d%02d%02d ", + 1900+(ts.tm_year), 1+(ts.tm_mon), ts.tm_mday, + ts.tm_hour, ts.tm_min, ts.tm_sec); printf("%s %d %d %d %d ", - decode_status_int(job->status), + decode_status_int_abbrev(job->status), job->exitcode, job->ntasks, job->ncpus, @@ -1117,14 +1156,14 @@ void do_dump(void) (int)job->rusage.ru_utime.tv_usec, (int)job->rusage.ru_stime.tv_sec, (int)job->rusage.ru_stime.tv_usec); - printf("%d %d %d %d %d %d ", + printf("%d %d %d %d %d %d %d %d %d " + "%d %d %d %d %d %d %d ", (int)job->rusage.ru_maxrss, (int)job->rusage.ru_ixrss, (int)job->rusage.ru_idrss, (int)job->rusage.ru_isrss, (int)job->rusage.ru_minflt, - (int)job->rusage.ru_majflt); - printf("%d %d %d %d %d %d %d %d ", + (int)job->rusage.ru_majflt, (int)job->rusage.ru_nswap, (int)job->rusage.ru_inblock, (int)job->rusage.ru_oublock, @@ -1132,13 +1171,14 @@ void do_dump(void) (int)job->rusage.ru_msgrcv, (int)job->rusage.ru_nsignals, (int)job->rusage.ru_nvcsw, - (int)job->rusage.ru_nivcsw); - printf("%d %d %.2f %d %d %.2f " - "%d %d %.2f %.2f %d %.2f\n", + (int)job->rusage.ru_nivcsw, job->sacct.max_vsize, + job->sacct.max_rss); + /* Data added in Slurm v1.1 */ + printf("%d %.2f %d %.2f %d %d %.2f " + "%.2f %d %.2f %s %s\n", job->sacct.max_vsize_task, job->sacct.ave_vsize, - job->sacct.max_rss, job->sacct.max_rss_task, job->sacct.ave_rss, job->sacct.max_pages, @@ -1146,7 +1186,9 @@ void do_dump(void) job->sacct.ave_pages, job->sacct.min_cpu, job->sacct.min_cpu_task, - job->sacct.ave_cpu); + job->sacct.ave_cpu, + "-", + job->nodes); } } list_iterator_destroy(itr); @@ -1607,8 +1649,7 @@ void do_list(void) ListIterator itr_step = NULL; job_rec_t *job = NULL; step_rec_t *step = NULL; - float tempf; - + if (params.opt_total) do_jobsteps = 0; @@ -1658,14 +1699,10 @@ void do_list(void) job->sacct.min_cpu = 0; if(list_count(job->steps)) { - tempf = job->sacct.ave_cpu/list_count(job->steps); - job->sacct.ave_cpu = (uint32_t)tempf; - tempf = job->sacct.ave_rss/list_count(job->steps); - job->sacct.ave_rss = (uint32_t)tempf; - tempf = job->sacct.ave_vsize/list_count(job->steps); - job->sacct.ave_vsize = (uint32_t)tempf; - tempf = job->sacct.ave_pages/list_count(job->steps); - job->sacct.ave_pages = (uint32_t)tempf; + job->sacct.ave_cpu /= list_count(job->steps); + job->sacct.ave_rss /= list_count(job->steps); + job->sacct.ave_vsize /= list_count(job->steps); + job->sacct.ave_pages /= list_count(job->steps); } if (job->show_full) { diff --git a/src/sacct/process.c b/src/sacct/process.c index 78f8476f4f8cebb427a1254924a2af202a544c2f..5f56460b50efff4dee5f591455ca98ae9aeab4ba 100644 --- a/src/sacct/process.c +++ b/src/sacct/process.c @@ -203,7 +203,7 @@ int _parse_line(char *f[], void **data) (*step)->sacct.max_pages = atoi(f[F_MAX_PAGES]); (*step)->sacct.max_pages_task = atoi(f[F_MAX_PAGES_TASK]); (*step)->sacct.ave_pages = atof(f[F_AVE_PAGES]); - (*step)->sacct.min_cpu = atoi(f[F_MIN_CPU]); + (*step)->sacct.min_cpu = atof(f[F_MIN_CPU]); (*step)->sacct.min_cpu_task = atoi(f[F_MIN_CPU_TASK]); (*step)->sacct.ave_cpu = atof(f[F_AVE_CPU]); (*step)->stepname = xstrdup(f[F_STEPNAME]); @@ -328,6 +328,9 @@ got_step: job->elapsed = time(NULL) - job->header.timestamp; } /* now aggregate the aggregatable */ + job->ncpus = MAX(job->ncpus, step->ncpus); + if(step->status < JOB_COMPLETE) + return; job->tot_cpu_sec += step->tot_cpu_sec; job->tot_cpu_usec += step->tot_cpu_usec; job->rusage.ru_utime.tv_sec += step->rusage.ru_utime.tv_sec; @@ -360,10 +363,6 @@ got_step: /* get the max for all the sacct_t struct */ aggregate_sacct(&job->sacct, &step->sacct); - - /* job->psize = MAX(job->psize, step->psize); */ -/* job->vsize = MAX(job->vsize, step->vsize); */ - job->ncpus = MAX(job->ncpus, step->ncpus); } void process_suspend(char *f[], int lc, int show_full) diff --git a/src/sacct/sacct_stat.c b/src/sacct/sacct_stat.c index f5d68bccca33b13edb03270d28461024183b847c..824448a5441d892059003d131a73c0b339dc534e 100644 --- a/src/sacct/sacct_stat.c +++ b/src/sacct/sacct_stat.c @@ -239,7 +239,10 @@ int _sacct_query(resource_allocation_response_msg_t *job, uint32_t step_id) slurm_mutex_destroy(&stat_mutex); if(step.ntasks) { tempf = step.sacct.ave_cpu/step.ntasks; + tempf /= 100; step.sacct.ave_cpu = (uint32_t)tempf; + tempf = step.sacct.min_cpu/100; + step.sacct.min_cpu = (uint32_t)tempf; tempf = step.sacct.ave_rss/step.ntasks; step.sacct.ave_rss = (uint32_t)tempf; tempf = step.sacct.ave_vsize/step.ntasks; diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c index 2df1a4eb823a171c351cb0ebad14425fa8e3f3c6..4d30065250b9e91935f8b722018cb14b381a8e2a 100644 --- a/src/slurmctld/node_mgr.c +++ b/src/slurmctld/node_mgr.c @@ -792,9 +792,9 @@ void set_slurmd_addr (void) */ int update_node ( update_node_msg_t * update_node_msg ) { - int error_code = 0, base_state, node_inx; - struct node_record *node_ptr; - char *this_node_name ; + int error_code = 0, base_state = 0, node_inx; + struct node_record *node_ptr = NULL; + char *this_node_name = NULL; hostlist_t host_list; uint16_t node_flags = 0, state_val; diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index 150ec5e378a8489fea6d29933167e65f8428a9c5..e9f7673c11114c1892a3755a34bb2615d735ea99 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -37,6 +37,7 @@ #include <stdlib.h> #include <sys/types.h> #include <string.h> +#include <strings.h> #include <unistd.h> #include <slurm/slurm_errno.h> diff --git a/src/slurmd/slurmd/req.c b/src/slurmd/slurmd/req.c index 920240d5c91d43e2b6c8641296bbcf71748eec32..e4677e8052093b75f307fae40cc3de12b71f2a6f 100644 --- a/src/slurmd/slurmd/req.c +++ b/src/slurmd/slurmd/req.c @@ -1141,7 +1141,8 @@ done2: close(fd); done: slurm_send_rc_msg(msg, rc); - return SLURM_SUCCESS; + + return rc; } static int diff --git a/src/slurmd/slurmd/slurmd.c b/src/slurmd/slurmd/slurmd.c index daeee2a856814743ebc3850ec9647933503a4a9e..6bd27b1302ee236fbbeda36068c53267dc3971ea 100644 --- a/src/slurmd/slurmd/slurmd.c +++ b/src/slurmd/slurmd/slurmd.c @@ -336,7 +336,6 @@ _handle_connection(slurm_fd fd, slurm_addr *cli) _service_connection((void *) arg); return; } - slurm_attr_destroy(&attr); return; } diff --git a/src/slurmd/slurmstepd/mgr.c b/src/slurmd/slurmstepd/mgr.c index 7084c3bd034522dd7be324e135485f0eeb66e4bd..de80f6242eedebb22e2a9675b6cf2da18405d0b0 100644 --- a/src/slurmd/slurmstepd/mgr.c +++ b/src/slurmd/slurmstepd/mgr.c @@ -37,6 +37,14 @@ # include <sys/prctl.h> #endif +#ifdef HAVE_AIX +# undef HAVE_UNSETENV +# include <sys/checkpnt.h> +#endif +#ifndef HAVE_UNSETENV +# include "src/common/unsetenv.h" +#endif + #include <sys/wait.h> #include <sys/stat.h> #include <sys/param.h>