From e0d92b8a1a0ed87c3fbbbc529f33aecf40f44144 Mon Sep 17 00:00:00 2001
From: Moe Jette <jette1@llnl.gov>
Date: Sat, 9 Apr 2011 23:46:35 +0000
Subject: [PATCH] slurmstepd: avoid coredump in case of NULL job

We build slurm with --enable-memory-leak-debug and twice encountered the same
core dump when user 'root' was trying to run jobs during a maintenance session.

The root user is not in the accounting database, which explains the errors seen
below. The gdb session shows that in this invocation _step_cleanup() was called
with a NULL job pointer, which it then dereferenced:

palu7:0 log>stat /var/crash/palu7-slurmstepd-6602.core
...
Modify: 2011-04-04 19:34:44.000000000 +0200

slurmctld.log
[2011-04-04T19:34:44] _slurm_rpc_submit_batch_job JobId=3254 usec=1773
[2011-04-04T19:34:44] ALPS RESERVATION #5, JobId 3254: BASIL -n 1920 -N 0 -d 1 -m 1333
[2011-04-04T19:34:44] sched: Allocate JobId=3254 NodeList=nid000[03-13,18-29,32-88] #CPUs=1920
[2011-04-04T19:34:44] error: slurmd error 4005 running JobId=3254 on front_end=palu7: User not found on host
[2011-04-04T19:34:44] update_front_end: set state of palu7 to DRAINING
[2011-04-04T19:34:44] completing job 3254
[2011-04-04T19:34:44] Requeue JobId=3254 due to node failure
[2011-04-04T19:34:44] sched: job_complete for JobId=3254 successful
[2011-04-04T19:34:44] requeue batch job 3254
[2011-04-04T20:28:43] sched: Cancel of JobId=3254 by UID=0, usec=57285

(gdb) core-file palu7-slurmstepd-6602.core
[New Thread 6604]
Core was generated by `/opt/slurm/2.3.0/sbin/slurmstepd'.
Program terminated with signal 11, Segmentation fault.
#0  main (argc=1, argv=0x7fffd65a1fd8) at slurmstepd.c:413
413             jobacct_gather_g_destroy(job->jobacct);
(gdb) print job
$1 = (slurmd_job_t *) 0x0
(gdb) list
408
409     #ifdef MEMORY_LEAK_DEBUG
410     static void
411     _step_cleanup(slurmd_job_t *job, slurm_msg_t *msg, int rc)
412     {
413             jobacct_gather_g_destroy(job->jobacct);
414             if (!job->batch)
415                     job_destroy(job);
416             /*
417              * The message cannot be freed until the jobstep is complete
(gdb) print msg
$2 = (slurm_msg_t *) 0x916008
(gdb) print rc
$3 = -1
(gdb)

The patch tests for a NULL job argument for the calls that need to dereference the job pointer.
---
 src/slurmd/slurmstepd/slurmstepd.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/slurmd/slurmstepd/slurmstepd.c b/src/slurmd/slurmstepd/slurmstepd.c
index eead6f4ca7d..54e5967aedf 100644
--- a/src/slurmd/slurmstepd/slurmstepd.c
+++ b/src/slurmd/slurmstepd/slurmstepd.c
@@ -410,9 +410,11 @@ _step_setup(slurm_addr_t *cli, slurm_addr_t *self, slurm_msg_t *msg)
 static void
 _step_cleanup(slurmd_job_t *job, slurm_msg_t *msg, int rc)
 {
-	jobacct_gather_g_destroy(job->jobacct);
-	if (!job->batch)
-		job_destroy(job);
+	if (job) {
+		jobacct_gather_g_destroy(job->jobacct);
+		if (!job->batch)
+			job_destroy(job);
+	}
 	/*
 	 * The message cannot be freed until the jobstep is complete
 	 * because the job struct has pointers into the msg, such
-- 
GitLab