Skip to content
Snippets Groups Projects
Commit a600e2cf authored by Moe Jette
Browse files
parent 7d76fdaa
No related branches found
No related tags found
No related merge requests found
...@@ -57,7 +57,7 @@ documents those changes that are of interest to users and admins. ...@@ -57,7 +57,7 @@ documents those changes that are of interest to users and admins.
* Changes in SLURM 0.6.9 * Changes in SLURM 0.6.9
======================== ========================
-- Fix bug in mpi plugin to set the ID correctly -- Fix bug in mpi plugin to set the ID correctly
-- Accounting bug causing segv fixed (Andy Riebs, 14oct.jobacct.patch)
* Changes in SLURM 0.6.8 * Changes in SLURM 0.6.8
======================== ========================
......
...@@ -102,7 +102,8 @@ are denoted below. ...@@ -102,7 +102,8 @@ are denoted below.
package.) The switch/elan plugin also requires the package.) The switch/elan plugin also requires the
presence of the libelanhosts library and /etc/elanhosts presence of the libelanhosts library and /etc/elanhosts
configuration file. (See elanhosts(5) man page in that configuration file. (See elanhosts(5) man page in that
package for more details) package for more details). Finally, the "ptrack" kernel
patch is required for process tracking.
</ul> </ul>
Please see the <a href=download.html>Download</a> page for references to Please see the <a href=download.html>Download</a> page for references to
required software to build these plugins.</p> required software to build these plugins.</p>
......
...@@ -725,24 +725,34 @@ int slurmd_jobacct_smgr(void) ...@@ -725,24 +725,34 @@ int slurmd_jobacct_smgr(void)
int slurmd_jobacct_task_exit(slurmd_job_t *job, pid_t pid, int status, struct rusage *rusage) int slurmd_jobacct_task_exit(slurmd_job_t *job, pid_t pid, int status, struct rusage *rusage)
{ {
_jrec_t *jrec; _jrec_t *jrec;
int rc=SLURM_SUCCESS; int rc=SLURM_SUCCESS;
static int active=0;
if (active==0)
active = job->ntasks-1;
else
active--;
debug2("slurmd_jobacct_task_exit for job %u.%u," debug2("slurmd_jobacct_task_exit(%d) for job %u.%u,"
" node %d, status=%d", " node %d, status=%d, nprocs %d, active %d",
job->jobid, job->stepid, job->nodeid, status/256); getpid(),
job->jobid, job->stepid, job->nodeid, status/256,
job->nprocs, active);
jrec = _alloc_jrec(job); jrec = _alloc_jrec(job);
jrec->nodeid = job->nodeid; jrec->nodeid = job->nodeid;
memcpy(&jrec->rusage, rusage, sizeof(struct rusage)); memcpy(&jrec->rusage, rusage, sizeof(struct rusage));
jrec->status = status/256; jrec->status = status/256;
if (prec_frequency) { /* if dynamic monitoring */ if (prec_frequency) /* if dynamic monitoring */
slurm_mutex_lock(&precTable_lock); /* let watcher finish loop */ if (active==0) {
pthread_cancel(_watch_tasks_thread_id); debug3("slurmd_jobacct_task_exit(%d) cancelling "
pthread_join(_watch_tasks_thread_id,NULL); "_watch_tasks",
slurm_mutex_unlock(&precTable_lock); getpid(), job->jobid, job->stepid);
jrec->max_psize = max_psize; pthread_cancel(_watch_tasks_thread_id);
jrec->max_vsize = max_vsize; pthread_join(_watch_tasks_thread_id,NULL);
} }
jrec->max_psize = max_psize;
jrec->max_vsize = max_vsize;
rc = _send_data_to_mynode(TASKDATA, jrec); rc = _send_data_to_mynode(TASKDATA, jrec);
xfree(jrec); xfree(jrec);
return rc; return rc;
...@@ -856,6 +866,7 @@ static _jrec_t *_get_jrec_by_jobstep(List jrecs, uint32_t jobid, ...@@ -856,6 +866,7 @@ static _jrec_t *_get_jrec_by_jobstep(List jrecs, uint32_t jobid,
uint32_t stepid) { uint32_t stepid) {
_jrec_t *jrec = NULL; _jrec_t *jrec = NULL;
ListIterator i; ListIterator i;
if (jrecs==NULL) { if (jrecs==NULL) {
error("no accounting job list"); error("no accounting job list");
return jrec; return jrec;
...@@ -1360,11 +1371,6 @@ static int _send_data_to_node_0(_jrec_t *jrec) { ...@@ -1360,11 +1371,6 @@ static int _send_data_to_node_0(_jrec_t *jrec) {
int rc=SLURM_SUCCESS, int rc=SLURM_SUCCESS,
retry; retry;
if (!strcmp(jrec->node0, NOT_FOUND)) {
error("jobacct(%d): job %d has no node0");
return SLURM_SUCCESS; /* can't do anything here */
}
debug2("jobacct(%d): in _send_data_to_node_0(job %u), nodes0,1=%s,%s" debug2("jobacct(%d): in _send_data_to_node_0(job %u), nodes0,1=%s,%s"
", utime=%d.%06d", ", utime=%d.%06d",
getpid(), jrec->jobid, jrec->node0, jrec->node1, getpid(), jrec->jobid, jrec->node0, jrec->node1,
...@@ -1376,6 +1382,12 @@ static int _send_data_to_node_0(_jrec_t *jrec) { ...@@ -1376,6 +1382,12 @@ static int _send_data_to_node_0(_jrec_t *jrec) {
return rc; return rc;
} }
if (strcmp(jrec->node0, NOT_FOUND)==0) {
error("jobacct(%d): job %d has no node0",
getpid(), jrec->jobid);
return SLURM_SUCCESS; /* can't do anything here */
}
/* make a stats_msg */ /* make a stats_msg */
stats.msg_type = htonl(TO_NODE0); stats.msg_type = htonl(TO_NODE0);
stats.jobid = htonl(jrec->jobid); stats.jobid = htonl(jrec->jobid);
...@@ -1602,8 +1614,8 @@ static void *_watch_tasks(void *arg) { ...@@ -1602,8 +1614,8 @@ static void *_watch_tasks(void *arg) {
while(1) { /* Do this until slurm_jobacct_task_exit() stops us */ while(1) { /* Do this until slurm_jobacct_task_exit() stops us */
sleep(prec_frequency); sleep(prec_frequency);
pthread_testcancel(); pthread_testcancel();
slurm_mutex_lock(&precTable_lock);
pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &tmp); pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &tmp);
slurm_mutex_lock(&precTable_lock);
_get_process_data(); /* Update the data */ _get_process_data(); /* Update the data */
slurm_mutex_unlock(&precTable_lock); slurm_mutex_unlock(&precTable_lock);
pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &tmp); pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &tmp);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment