From a600e2cf94fd75b121ff3b70160a9160ff16acbe Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Mon, 14 Nov 2005 19:15:30 +0000 Subject: [PATCH] svn merge -r6526:6609 https://eris.llnl.gov/svn/slurm/branches/slurm-0-6-branch --- NEWS | 2 +- doc/html/quickstart_admin.html | 3 +- src/plugins/jobacct/log/jobacct_log.c | 50 +++++++++++++++++---------- 3 files changed, 34 insertions(+), 21 deletions(-) diff --git a/NEWS b/NEWS index 2ff623f6e7a..27427d7c936 100644 --- a/NEWS +++ b/NEWS @@ -57,7 +57,7 @@ documents those changes that are of interest to users and admins. * Changes in SLURM 0.6.9 ======================== -- Fix bug in mpi plugin to set the ID correctly - + -- Accounting bug causing segv fixed (Andy Riebs, 14oct.jobacct.patch) * Changes in SLURM 0.6.8 ======================== diff --git a/doc/html/quickstart_admin.html b/doc/html/quickstart_admin.html index a359a25442f..1443a15a9eb 100644 --- a/doc/html/quickstart_admin.html +++ b/doc/html/quickstart_admin.html @@ -102,7 +102,8 @@ are denoted below. package.) The switch/elan plugin also requires the presence of the libelanosts library and /etc/elanhosts configuration file. (See elanhosts(5) man page in that - package for more details) + package for more details). Finally, the "ptrack" kernel + patch is required for process tracking. </ul> Please see the <a href=download.html>Download</a> page for references to required software to build these plugins.</p> diff --git a/src/plugins/jobacct/log/jobacct_log.c b/src/plugins/jobacct/log/jobacct_log.c index 1ba5f46bf8f..9e5b0658d87 100644 --- a/src/plugins/jobacct/log/jobacct_log.c +++ b/src/plugins/jobacct/log/jobacct_log.c @@ -725,24 +725,34 @@ int slurmd_jobacct_smgr(void) int slurmd_jobacct_task_exit(slurmd_job_t *job, pid_t pid, int status, struct rusage *rusage) { - _jrec_t *jrec; - int rc=SLURM_SUCCESS; + _jrec_t *jrec; + int rc=SLURM_SUCCESS; + static int active=0; + + if (active==0) + active = job->ntasks-1; + else + active--; - debug2("slurmd_jobacct_task_exit for job %u.%u," - " node %d, status=%d", - job->jobid, job->stepid, job->nodeid, status/256); + debug2("slurmd_jobacct_task_exit(%d) for job %u.%u," + " node %d, status=%d, nprocs %d, active %d", + getpid(), + job->jobid, job->stepid, job->nodeid, status/256, + job->nprocs, active); jrec = _alloc_jrec(job); jrec->nodeid = job->nodeid; memcpy(&jrec->rusage, rusage, sizeof(struct rusage)); jrec->status = status/256; - if (prec_frequency) { /* if dynamic monitoring */ - slurm_mutex_lock(&precTable_lock); /* let watcher finish loop */ - pthread_cancel(_watch_tasks_thread_id); - pthread_join(_watch_tasks_thread_id,NULL); - slurm_mutex_unlock(&precTable_lock); - jrec->max_psize = max_psize; - jrec->max_vsize = max_vsize; - } + if (prec_frequency) /* if dynamic monitoring */ + if (active==0) { + debug3("slurmd_jobacct_task_exit(%d) cancelling " + "_watch_tasks", + getpid(), job->jobid, job->stepid); + pthread_cancel(_watch_tasks_thread_id); + pthread_join(_watch_tasks_thread_id,NULL); + } + jrec->max_psize = max_psize; + jrec->max_vsize = max_vsize; rc = _send_data_to_mynode(TASKDATA, jrec); xfree(jrec); return rc; @@ -856,6 +866,7 @@ static _jrec_t *_get_jrec_by_jobstep(List jrecs, uint32_t jobid, uint32_t stepid) { _jrec_t *jrec = NULL; ListIterator i; + if (jrecs==NULL) { error("no accounting job list"); return jrec; @@ -1360,11 +1371,6 @@ static int _send_data_to_node_0(_jrec_t *jrec) { int rc=SLURM_SUCCESS, retry; - if (!strcmp(jrec->node0, NOT_FOUND)) { - error("jobacct(%d): job %d has no node0"); - return SLURM_SUCCESS; /* can't do anything here */ - } - debug2("jobacct(%d): in _send_data_to_node_0(job %u), nodes0,1=%s,%s" ", utime=%d.%06d", getpid(), jrec->jobid, jrec->node0, jrec->node1, @@ -1376,6 +1382,12 @@ static int _send_data_to_node_0(_jrec_t *jrec) { return rc; } + if (strcmp(jrec->node0, NOT_FOUND)==0) { + error("jobacct(%d): job %d has no node0", + getpid(), jrec->jobid); + return SLURM_SUCCESS; /* can't do anything here */ + } + /* make a stats_msg */ stats.msg_type = htonl(TO_NODE0); stats.jobid = htonl(jrec->jobid); @@ -1602,8 +1614,8 @@ static void *_watch_tasks(void *arg) { while(1) { /* Do this until slurm_jobacct_task_exit() stops us */ sleep(prec_frequency); pthread_testcancel(); - slurm_mutex_lock(&precTable_lock); pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &tmp); + slurm_mutex_lock(&precTable_lock); _get_process_data(); /* Update the data */ slurm_mutex_unlock(&precTable_lock); pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &tmp); -- GitLab