From a0e3e5de36e94b3878970ea0ee8766881be30400 Mon Sep 17 00:00:00 2001
From: Morris Jette <jette@schedmd.com>
Date: Tue, 23 Feb 2016 16:37:25 -0800
Subject: [PATCH] Improve some step allocation logs

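Include the job node index in the step alloc/dealloc logs, and report
the used and requested CPU counts in the CPU underflow error.

Include warning for Cray simulation as reminder for developers to
change code as needed: when neither HAVE_NATIVE_CRAY nor
HAVE_CRAY_NETWORK is defined, the first CPU underflow also logs a
one-time reminder to comment out the post_job_step() call in
slurmctld/step_mgr.c.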
---
 src/slurmctld/step_mgr.c | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c
index f8770f527d9..9c7a3b08b5b 100644
--- a/src/slurmctld/step_mgr.c
+++ b/src/slurmctld/step_mgr.c
@@ -1925,8 +1925,8 @@ extern void step_alloc_lps(struct step_record *step_ptr)
 		if (slurmctld_conf.debug_flags & DEBUG_FLAG_CPU_BIND)
 			_dump_step_layout(step_ptr);
 		if (slurmctld_conf.debug_flags & DEBUG_FLAG_STEPS) {
-			info("step alloc of %s procs: %u of %u",
-			     node_record_table_ptr[i_node].name,
+			info("step alloc on job node %d (%s) used %u of %u CPUs",
+			     job_node_inx, node_record_table_ptr[i_node].name,
 			     job_resrcs_ptr->cpus_used[job_node_inx],
 			     job_resrcs_ptr->cpus[job_node_inx]);
 		}
@@ -1979,6 +1979,9 @@ static void _dump_step_layout(struct step_record *step_ptr)
 
 static void _step_dealloc_lps(struct step_record *step_ptr)
 {
+#if !defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
+	static bool cray_simulate_logged = false;
+#endif
 	struct job_record  *job_ptr = step_ptr->job_ptr;
 	job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs;
 	int cpus_alloc;
@@ -2023,12 +2026,21 @@ static void _step_dealloc_lps(struct step_record *step_ptr)
 		cpus_alloc = step_ptr->step_layout->tasks[step_node_inx] *
 			     step_ptr->cpus_per_task;
 #endif
-		if (job_resrcs_ptr->cpus_used[job_node_inx] >= cpus_alloc)
+		if (job_resrcs_ptr->cpus_used[job_node_inx] >= cpus_alloc) {
 			job_resrcs_ptr->cpus_used[job_node_inx] -= cpus_alloc;
-		else {
-			error("_step_dealloc_lps: cpu underflow for %u.%u",
-				job_ptr->job_id, step_ptr->step_id);
+		} else {
+			error("%s: CPU underflow for %u.%u (%u<%u on job node %d)",
+			      __func__, job_ptr->job_id, step_ptr->step_id,
+			      job_resrcs_ptr->cpus_used[job_node_inx],
+			      cpus_alloc, job_node_inx);
 			job_resrcs_ptr->cpus_used[job_node_inx] = 0;
+#if !defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
+			if (!cray_simulate_logged) {
+				error("Remember to comment out post_job_step() "
+				      "call in slurmctld/step_mgr.c");
+				cray_simulate_logged = true;
+			}
+#endif
 		}
 		if (step_ptr->pn_min_memory && _is_mem_resv()) {
 			uint32_t mem_use = step_ptr->pn_min_memory;
@@ -2048,8 +2060,8 @@ static void _step_dealloc_lps(struct step_record *step_ptr)
 			}
 		}
 		if (slurmctld_conf.debug_flags & DEBUG_FLAG_STEPS) {
-			info("step dealloc of %s procs: %u of %u",
-			     node_record_table_ptr[i_node].name,
+			info("step dealloc on job node %d (%s) used: %u of %u CPUs",
+			     job_node_inx, node_record_table_ptr[i_node].name,
 			     job_resrcs_ptr->cpus_used[job_node_inx],
 			     job_resrcs_ptr->cpus[job_node_inx]);
 		}
-- 
GitLab