From a0e3e5de36e94b3878970ea0ee8766881be30400 Mon Sep 17 00:00:00 2001 From: Morris Jette <jette@schedmd.com> Date: Tue, 23 Feb 2016 16:37:25 -0800 Subject: [PATCH] Improve some step allocation logs Include warning for Cray simulation as reminder for developers to change code as needed. --- src/slurmctld/step_mgr.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/src/slurmctld/step_mgr.c b/src/slurmctld/step_mgr.c index f8770f527d9..9c7a3b08b5b 100644 --- a/src/slurmctld/step_mgr.c +++ b/src/slurmctld/step_mgr.c @@ -1925,8 +1925,8 @@ extern void step_alloc_lps(struct step_record *step_ptr) if (slurmctld_conf.debug_flags & DEBUG_FLAG_CPU_BIND) _dump_step_layout(step_ptr); if (slurmctld_conf.debug_flags & DEBUG_FLAG_STEPS) { - info("step alloc of %s procs: %u of %u", - node_record_table_ptr[i_node].name, + info("step alloc on job node %d (%s) used %u of %u CPUs", + job_node_inx, node_record_table_ptr[i_node].name, job_resrcs_ptr->cpus_used[job_node_inx], job_resrcs_ptr->cpus[job_node_inx]); } @@ -1979,6 +1979,9 @@ static void _dump_step_layout(struct step_record *step_ptr) static void _step_dealloc_lps(struct step_record *step_ptr) { +#if !defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK) + static bool cray_simulate_logged = false; +#endif struct job_record *job_ptr = step_ptr->job_ptr; job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs; int cpus_alloc; @@ -2023,12 +2026,21 @@ static void _step_dealloc_lps(struct step_record *step_ptr) cpus_alloc = step_ptr->step_layout->tasks[step_node_inx] * step_ptr->cpus_per_task; #endif - if (job_resrcs_ptr->cpus_used[job_node_inx] >= cpus_alloc) + if (job_resrcs_ptr->cpus_used[job_node_inx] >= cpus_alloc) { job_resrcs_ptr->cpus_used[job_node_inx] -= cpus_alloc; - else { - error("_step_dealloc_lps: cpu underflow for %u.%u", - job_ptr->job_id, step_ptr->step_id); + } else { + error("%s: CPU underflow for %u.%u (%u<%u on job node %d)", + __func__, job_ptr->job_id, step_ptr->step_id, + job_resrcs_ptr->cpus_used[job_node_inx], + cpus_alloc, job_node_inx); job_resrcs_ptr->cpus_used[job_node_inx] = 0; +#if !defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK) + if (!cray_simulate_logged) { + error("Remember to comment out post_job_step() " + "call in slurmctld/step_mgr.c"); + cray_simulate_logged = true; + } +#endif } if (step_ptr->pn_min_memory && _is_mem_resv()) { uint32_t mem_use = step_ptr->pn_min_memory; @@ -2048,8 +2060,8 @@ static void _step_dealloc_lps(struct step_record *step_ptr) } } if (slurmctld_conf.debug_flags & DEBUG_FLAG_STEPS) { - info("step dealloc of %s procs: %u of %u", - node_record_table_ptr[i_node].name, + info("step dealloc on job node %d (%s) used: %u of %u CPUs", + job_node_inx, node_record_table_ptr[i_node].name, job_resrcs_ptr->cpus_used[job_node_inx], job_resrcs_ptr->cpus[job_node_inx]); } -- GitLab