diff --git a/src/plugins/switch/cray/Makefile.am b/src/plugins/switch/cray/Makefile.am index 7f3c0f2280ddb70ce96966a495f2a118c5cc7d3a..be9f651df6a8fabe452bd24843977836c2495d37 100644 --- a/src/plugins/switch/cray/Makefile.am +++ b/src/plugins/switch/cray/Makefile.am @@ -11,6 +11,7 @@ switch_cray_la_SOURCES = \ gpu.c \ pe_info.c \ ports.c \ + scaling.c \ switch_cray.c \ switch_cray.h \ util.c diff --git a/src/plugins/switch/cray/Makefile.in b/src/plugins/switch/cray/Makefile.in index b9cb9a18a26b72499852d24861b700cfc687f2d7..e7d79572ea81f52bbf34e33995a054402543c778 100644 --- a/src/plugins/switch/cray/Makefile.in +++ b/src/plugins/switch/cray/Makefile.in @@ -163,7 +163,8 @@ LTLIBRARIES = $(pkglib_LTLIBRARIES) switch_cray_la_LIBADD = am_switch_cray_la_OBJECTS = switch_cray_la-gpu.lo \ switch_cray_la-pe_info.lo switch_cray_la-ports.lo \ - switch_cray_la-switch_cray.lo switch_cray_la-util.lo + switch_cray_la-scaling.lo switch_cray_la-switch_cray.lo \ + switch_cray_la-util.lo switch_cray_la_OBJECTS = $(am_switch_cray_la_OBJECTS) AM_V_lt = $(am__v_lt_@AM_V@) am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) @@ -479,6 +480,7 @@ switch_cray_la_SOURCES = \ gpu.c \ pe_info.c \ ports.c \ + scaling.c \ switch_cray.c \ switch_cray.h \ util.c @@ -575,6 +577,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/switch_cray_la-gpu.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/switch_cray_la-pe_info.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/switch_cray_la-ports.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/switch_cray_la-scaling.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/switch_cray_la-switch_cray.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/switch_cray_la-util.Plo@am__quote@ @@ -620,6 +623,13 @@ switch_cray_la-ports.lo: ports.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(switch_cray_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o switch_cray_la-ports.lo `test -f 'ports.c' || echo '$(srcdir)/'`ports.c +switch_cray_la-scaling.lo: scaling.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(switch_cray_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT switch_cray_la-scaling.lo -MD -MP -MF $(DEPDIR)/switch_cray_la-scaling.Tpo -c -o switch_cray_la-scaling.lo `test -f 'scaling.c' || echo '$(srcdir)/'`scaling.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/switch_cray_la-scaling.Tpo $(DEPDIR)/switch_cray_la-scaling.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='scaling.c' object='switch_cray_la-scaling.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(switch_cray_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o switch_cray_la-scaling.lo `test -f 'scaling.c' || echo '$(srcdir)/'`scaling.c + switch_cray_la-switch_cray.lo: switch_cray.c @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(switch_cray_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT switch_cray_la-switch_cray.lo -MD -MP -MF $(DEPDIR)/switch_cray_la-switch_cray.Tpo -c -o switch_cray_la-switch_cray.lo `test -f 'switch_cray.c' || echo '$(srcdir)/'`switch_cray.c @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/switch_cray_la-switch_cray.Tpo $(DEPDIR)/switch_cray_la-switch_cray.Plo diff --git a/src/plugins/switch/cray/scaling.c b/src/plugins/switch/cray/scaling.c new file mode 100644 index 0000000000000000000000000000000000000000..09a7700d055cefaba50d33cc259c4de61dab99f9 --- /dev/null +++ b/src/plugins/switch/cray/scaling.c @@ -0,0 +1,264 @@ +/*****************************************************************************\ + * scaling.c - Library for managing a switch on a Cray system. + ***************************************************************************** + * Copyright (C) 2014 SchedMD LLC + * Copyright 2014 Cray Inc. All Rights Reserved. + * Written by David Gloe <c16817@cray.com> + * + * This file is part of SLURM, a resource management program. + * For details, see <http://slurm.schedmd.com/>. + * Please also read the included file: DISCLAIMER. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +\*****************************************************************************/ + +#define _GNU_SOURCE + +#include "switch_cray.h" + +#ifdef HAVE_NATIVE_CRAY + +#include <stdio.h> +#include <stdlib.h> + +// Static functions +static int _get_cpu_total(void); +static uint32_t _get_mem_total(void); + +/* + * Determines the cpu scaling amount to use. + * Returns -1 on failure. + */ +int get_cpu_scaling(stepd_step_rec_t *job) +{ + int total_cpus, num_app_cpus, cpu_scaling; + + /* + * Get the number of CPUs on the node + */ + total_cpus = _get_cpu_total(); + if (total_cpus <= 0) { + CRAY_ERR("total_cpus <= 0: %d", total_cpus); + return -1; + } + + /* + * If the submission didn't come from srun (API style) + * perhaps they didn't fill in things correctly. + */ + if (!job->cpus_per_task) { + job->cpus_per_task = 1; + } + + /* + * Determine number of CPUs requested for the step + */ + num_app_cpus = job->cpus; + if (num_app_cpus <= 0) { + num_app_cpus = job->node_tasks * job->cpus_per_task; + if (num_app_cpus <= 0) { + CRAY_ERR("num_app_cpus <= 0: %d", num_app_cpus); + return -1; + } + } + + /* + * Determine what percentage of the CPUs were requested + */ + cpu_scaling = (((double) num_app_cpus / (double) total_cpus) * + (double) 100) + 0.5; + if (cpu_scaling > MAX_SCALING) { + debug("Cpu scaling out of bounds: %d. Reducing to %d%%", + cpu_scaling, MAX_SCALING); + cpu_scaling = MAX_SCALING; + } else if (cpu_scaling < MIN_SCALING) { + CRAY_ERR("Cpu scaling out of bounds: %d. Increasing to %d%%", + cpu_scaling, MIN_SCALING); + cpu_scaling = MIN_SCALING; + } + return cpu_scaling; +} + +/* + * Determines the memory scaling amount to use. + * Returns -1 on failure. + */ +int get_mem_scaling(stepd_step_rec_t *job) +{ + int mem_scaling; + uint32_t total_mem; + + /* + * Get the memory amount + */ + total_mem = _get_mem_total(); + if (total_mem == 0) { + CRAY_ERR("Scanning /proc/meminfo results in MemTotal=0"); + return -1; + } + + /* + * Scale total_mem, which is in kilobytes, to megabytes because + * app_mem is in megabytes. + * Round to the nearest integer. + * If the memory request is greater than 100 percent, then scale + * it to 100%. + * If the memory request is zero, then return an error. + * + * Note: Because this has caused some confusion in the past, + * The MEM_PER_CPU flag is used to indicate that job->step_mem + * is the amount of memory per CPU, not total. However, this + * flag is read and cleared in slurmd prior to passing this + * value to slurmstepd. + * The value comes to slurmstepd already properly scaled. + * Thus, this function does not need to check the MEM_PER_CPU + * flag. + */ + mem_scaling = ((((double) job->step_mem / + ((double) total_mem / 1024)) * (double) 100)) + + 0.5; + if (mem_scaling > MAX_SCALING) { + CRAY_INFO("Memory scaling out of bounds: %d. " + "Reducing to %d%%.", + mem_scaling, MAX_SCALING); + mem_scaling = MAX_SCALING; + } + + if (mem_scaling < MIN_SCALING) { + CRAY_ERR("Memory scaling out of bounds: %d. " + "Increasing to %d%%", + mem_scaling, MIN_SCALING); + mem_scaling = MIN_SCALING; + } + + return mem_scaling; +} + +/* + * Get the total amount of memory on the node. + * Returns 0 on failure. + */ +static uint32_t _get_mem_total(void) +{ + FILE *f = NULL; + size_t sz = 0; + ssize_t lsz = 0; + char *lin = NULL; + int meminfo_value; + char meminfo_str[1024]; + uint32_t total_mem = 0; + + f = fopen("/proc/meminfo", "r"); + if (f == NULL ) { + CRAY_ERR("Failed to open /proc/meminfo: %m"); + return 0; + } + + while (!feof(f)) { + lsz = getline(&lin, &sz, f); + if (lsz > 0) { + sscanf(lin, "%s %d", meminfo_str, + &meminfo_value); + if (!strcmp(meminfo_str, "MemTotal:")) { + total_mem = meminfo_value; + break; + } + } + } + free(lin); + TEMP_FAILURE_RETRY(fclose(f)); + return total_mem; +} + +/* + * Function: get_cpu_total + * Description: + * Get the total number of online cpus on the node. + * + * RETURNS + * Returns the number of online cpus on the node. On error, it returns -1. + * + * TODO: + * Danny suggests using xcgroup_get_param to read the CPU values instead of + * this function. Look at the way task/cgroup/task_cgroup_cpuset.c or + * jobacct_gather/cgroup/jobacct_gather_cgroup.c does it. + */ +static int _get_cpu_total(void) +{ + FILE *f = NULL; + char *token = NULL, *lin = NULL, *saveptr = NULL; + int total = 0; + ssize_t lsz; + size_t sz; + int matches; + long int number1, number2; + + f = fopen("/sys/devices/system/cpu/online", "r"); + + if (!f) { + CRAY_ERR("Failed to open file" + " /sys/devices/system/cpu/online: %m"); + return -1; + } + + while (!feof(f)) { + lsz = getline(&lin, &sz, f); + if (lsz > 0) { + // Split into comma-separated tokens + token = strtok_r(lin, ",", &saveptr); + while (token) { + // Check each token for a range + matches = sscanf(token, "%ld-%ld", + &number1, &number2); + if (matches <= 0) { + // This token isn't numeric + CRAY_ERR("Error parsing %s: %m", token); + free(lin); + TEMP_FAILURE_RETRY(fclose(f)); + return -1; + } else if (matches == 1) { + // Single entry + total++; + } else if (number2 > number1) { + // Range + total += number2 - number1 + 1; + } else { + // Invalid range + CRAY_ERR("Invalid range %s", token); + free(lin); + TEMP_FAILURE_RETRY(fclose(f)); + return -1; + } + token = strtok_r(NULL, ",", &saveptr); + } + } + } + free(lin); + TEMP_FAILURE_RETRY(fclose(f)); + return total; +} + +#endif diff --git a/src/plugins/switch/cray/switch_cray.c b/src/plugins/switch/cray/switch_cray.c index 0c95795972bc5d424dd560fbf95a68a465528255..b66124b6fd123d46aa5ee8c11720339a14bd4663 100644 --- a/src/plugins/switch/cray/switch_cray.c +++ b/src/plugins/switch/cray/switch_cray.c @@ -62,43 +62,14 @@ #include "src/common/slurm_xlator.h" #include "src/common/pack.h" #include "src/common/gres.h" +#include "switch_cray.h" #ifdef HAVE_NATIVE_CRAY #include <job.h> /* Cray's job module component */ -#include "switch_cray.h" -#endif - - -/* This allows for a BUILD time definition of LEGACY_SPOOL_DIR on the compile - * line. - * LEGACY_SPOOL_DIR can be customized to wherever the builder desires. - * This customization could be important because the default is a hard-coded - * path that does not vary regardless of where Slurm is installed. - */ -#ifndef LEGACY_SPOOL_DIR -#define LEGACY_SPOOL_DIR "/var/spool/alps/" #endif -/* - * CRAY_JOBINFO_MAGIC: The switch_jobinfo was not NULL. The packed data - * is good and can be safely unpacked. - * CRAY_NULL_JOBINFO_MAGIC: The switch_jobinfo was NULL. No data was packed. - * Do not attempt to unpack any data. - */ -#define CRAY_JOBINFO_MAGIC 0xCAFECAFE -#define CRAY_NULL_JOBINFO_MAGIC 0xDEAFDEAF -#define MIN_PORT 20000 -#define MAX_PORT 30000 -#define ATTEMPTS 2 -#define PORT_CNT (MAX_PORT - MIN_PORT + 1) #define SWITCH_BUF_SIZE (PORT_CNT + 128) - -#ifdef HAVE_NATIVE_CRAY #define SWITCH_CRAY_STATE_VERSION "PROTOCOL_VERSION" -bitstr_t *port_resv = NULL; -uint32_t last_alloc_port = 0; -pthread_mutex_t port_mutex = PTHREAD_MUTEX_INITIALIZER; -#endif uint32_t debug_flags = 0; @@ -133,80 +104,6 @@ const char plugin_name[] = "switch CRAY plugin"; const char plugin_type[] = "switch/cray"; const uint32_t plugin_version = 100; -/* opaque data structures - no peeking! */ -typedef struct slurm_cray_jobinfo { - uint32_t magic; - uint32_t num_cookies; /* The number of cookies sent to configure the - HSN */ - /* Double pointer to an array of cookie strings. - * cookie values here as NULL-terminated strings. - * There are num_cookies elements in the array. - * The caller is responsible for free()ing - * the array contents and the array itself. */ - char **cookies; - /* The array itself must be free()d when this struct is destroyed. */ - uint32_t *cookie_ids; - uint32_t port;/* Port for PMI Communications */ - uint32_t jobid; /* Current SLURM job id */ - uint32_t stepid; /* Current step id */ - /* Cray Application ID; A unique combination of the job id and step id*/ - uint64_t apid; -} slurm_cray_jobinfo_t; - -static void _print_jobinfo(slurm_cray_jobinfo_t *job); -#ifdef HAVE_NATIVE_CRAY -static int _get_cpu_total(void); -static void _free_alpsc_pe_info(alpsc_peInfo_t alpsc_pe_info); - -static void _print_alpsc_pe_info(alpsc_peInfo_t alps_info) -{ - int i; - info("*************************alpsc_peInfo Start" - "*************************"); - info("totalPEs: %d\nfirstPeHere: %d\npesHere: %d\npeDepth: %d\n", - alps_info.totalPEs, alps_info.firstPeHere, alps_info.pesHere, - alps_info.peDepth); - for (i = 0; i < alps_info.totalPEs; i++) { - info("Task: %d\tNode: %d", i, alps_info.peNidArray[i]); - } - info("*************************alpsc_peInfo Stop" - "*************************"); -} -#endif - -static void _print_jobinfo(slurm_cray_jobinfo_t *job) -{ - int i; - - if (!job || (job->magic == CRAY_NULL_JOBINFO_MAGIC)) { - error("(%s: %d: %s) job pointer was NULL", THIS_FILE, __LINE__, - __FUNCTION__); - return; - } - - xassert(job->magic == CRAY_JOBINFO_MAGIC); - - info("Program: %s", slurm_prog_name); - info("Address of slurm_cray_jobinfo_t structure: %p", job); - info("--Begin Jobinfo--"); - info(" Magic: %" PRIx32, job->magic); - info(" Job ID: %" PRIu32, job->jobid); - info(" Step ID: %" PRIu32, job->stepid); - info(" APID: %" PRIu64, job->apid); - info(" PMI Port: %" PRIu32, job->port); - info(" num_cookies: %" PRIu32, job->num_cookies); - info(" --- cookies ---"); - for (i = 0; i < job->num_cookies; i++) { - info(" cookies[%d]: %s", i, job->cookies[i]); - } - info(" --- cookie_ids ---"); - for (i = 0; i < job->num_cookies; i++) { - info(" cookie_ids[%d]: %" PRIu32, i, job->cookie_ids[i]); - } - info(" ------"); - info("--END Jobinfo--"); -} - /* * init() is called when the plugin is loaded, before any other functions * are called. Put global initialization here. @@ -463,7 +360,7 @@ int switch_p_alloc_jobinfo(switch_jobinfo_t **switch_job, uint32_t job_id, if (debug_flags & DEBUG_FLAG_SWITCH) { info("(%s: %d: %s) switch_jobinfo_t contents", THIS_FILE, __LINE__, __FUNCTION__); - _print_jobinfo(new); + print_jobinfo(new); } return SLURM_SUCCESS; @@ -669,7 +566,7 @@ int switch_p_pack_jobinfo(switch_jobinfo_t *switch_job, Buf buffer, if (debug_flags & DEBUG_FLAG_SWITCH) { info("(%s: %d: %s) switch_jobinfo_t contents", THIS_FILE, __LINE__, __FUNCTION__); - _print_jobinfo(job); + print_jobinfo(job); } pack32(job->magic, buffer); @@ -734,7 +631,7 @@ int switch_p_unpack_jobinfo(switch_jobinfo_t *switch_job, Buf buffer, if (debug_flags & DEBUG_FLAG_SWITCH) { info("(%s:%d: %s) switch_jobinfo_t contents:", THIS_FILE, __LINE__, __FUNCTION__); - _print_jobinfo(job); + print_jobinfo(job); } return SLURM_SUCCESS; @@ -918,135 +815,25 @@ extern int switch_p_job_init(stepd_step_rec_t *job) */ if (job->ntasks > 1) { - /* - * Get the number of CPUs. - */ - total_cpus = _get_cpu_total(); - if (total_cpus <= 0) { - error("(%s: %d: %s) total_cpus <=0: %d", - THIS_FILE, __LINE__, __FUNCTION__, total_cpus); - return SLURM_ERROR; + cpu_scaling = get_cpu_scaling(job); + if (cpu_scaling == -1) { + return SLURM_ERROR; } - /* - * Use /proc/meminfo to get the total amount of memory on the - * node - */ - f = fopen("/proc/meminfo", "r"); - if (!f) { - error("(%s: %d: %s) Failed to open /proc/meminfo: %m", - THIS_FILE, __LINE__, __FUNCTION__); - return SLURM_ERROR; - } - - while (!feof(f)) { - lsz = getline(&lin, &sz, f); - if (lsz > 0) { - sscanf(lin, "%s %d", meminfo_str, - &meminfo_value); - if (!strcmp(meminfo_str, "MemTotal:")) { - total_mem = meminfo_value; - break; - } - } - } - free(lin); - TEMP_FAILURE_RETRY(fclose(f)); - - if (total_mem == 0) { - error("(%s: %d: %s) Scanning /proc/meminfo results in" - " MemTotal=0", - THIS_FILE, __LINE__, __FUNCTION__); - return SLURM_ERROR; - } - - /* If the submission didn't come from srun (API style) - * perhaps they didn't fill in things correctly. - */ - if (!job->cpus_per_task) - job->cpus_per_task = 1; - /* - * Scaling - * For the CPUS round the scaling to the nearest integer. - * If the scaling is greater than 100 percent, then scale it to - * 100%. - * If the scaling is zero, then return an error. - */ - num_app_cpus = job->node_tasks * job->cpus_per_task; - if (num_app_cpus <= 0) { - error("(%s: %d: %s) num_app_cpus <=0: %d", - THIS_FILE, __LINE__, __FUNCTION__, num_app_cpus); - return SLURM_ERROR; - } - - cpu_scaling = (((double) num_app_cpus / (double) total_cpus) * - (double) 100) + 0.5; - if (cpu_scaling > 100) { - error("(%s: %d: %s) Cpu scaling out of bounds: %d." - " Reducing to 100%%", - THIS_FILE, __LINE__, __FUNCTION__, cpu_scaling); - cpu_scaling = 100; - } - if (cpu_scaling <= 0) { - error("(%s: %d: %s) Cpu scaling out of bounds: %d." - " Increasing to 1%%", - THIS_FILE, __LINE__, __FUNCTION__, cpu_scaling); - cpu_scaling = 1; - } - - /* - * Scale total_mem, which is in kilobytes, to megabytes because - * app_mem is in megabytes. - * Round to the nearest integer. - * If the memory request is greater than 100 percent, then scale - * it to 100%. - * If the memory request is zero, then return an error. - * - * Note: Because this has caused some confusion in the past, - * The MEM_PER_CPU flag is used to indicate that job->step_mem - * is the amount of memory per CPU, not total. However, this - * flag is read and cleared in slurmd prior to passing this - * value to slurmstepd. - * The value comes to slurmstepd already properly scaled. - * Thus, this function does not need to check the MEM_PER_CPU - * flag. - */ - mem_scaling = ((((double) job->step_mem / - ((double) total_mem / 1024)) * (double) 100)) - + 0.5; - if (mem_scaling > 100) { - info("(%s: %d: %s) Memory scaling out of bounds: %d. " - "Reducing to 100%%.", - THIS_FILE, __LINE__, __FUNCTION__, mem_scaling); - mem_scaling = 100; - } - - if (mem_scaling <= 0) { - error("(%s: %d: %s) Memory scaling out of bounds: %d. " - " Increasing to 1%%", - THIS_FILE, __LINE__, __FUNCTION__, mem_scaling); - mem_scaling = 1; + mem_scaling = get_mem_scaling(job); + if (mem_scaling == -1) { + return SLURM_ERROR; } if (debug_flags & DEBUG_FLAG_SWITCH) { - info("(%s:%d: %s) --Network Scaling Start--", - THIS_FILE, __LINE__, __FUNCTION__); - info("(%s:%d: %s) --CPU Scaling: %d--", - THIS_FILE, __LINE__, __FUNCTION__, cpu_scaling); - - info("(%s:%d: %s) --Memory Scaling: %d--", - THIS_FILE, __LINE__, __FUNCTION__, mem_scaling); - info("(%s:%d: %s) --Network Scaling End--", - THIS_FILE, __LINE__, __FUNCTION__); - - info("(%s:%d: %s) --PAGG Job Container ID: %"PRIx64"--", - THIS_FILE, __LINE__, __FUNCTION__, job->cont_id); + CRAY_INFO("--Network Scaling Start--"); + CRAY_INFO("--CPU Scaling: %d--", cpu_scaling); + CRAY_INFO("--Memory Scaling: %d--", mem_scaling); + CRAY_INFO("--Network Scaling End--"); + CRAY_INFO("--PAGG Job Container ID: %"PRIx64"--", + job->cont_id); } - rc = alpsc_configure_nic(&err_msg, 0, cpu_scaling, mem_scaling, - job->cont_id, sw_job->num_cookies, - (const char **) sw_job->cookies, - &num_ptags, &ptags, ntt_desc_ptr); /* * We don't use the ptags because Cray's LLI acquires them * itself, so they can be immediately discarded. @@ -1487,132 +1274,4 @@ extern int switch_p_slurmd_step_init(void) return SLURM_SUCCESS; } -#ifdef HAVE_NATIVE_CRAY - -/* - * Function: get_cpu_total - * Description: - * Get the total number of online cpus on the node. - * - * RETURNS - * Returns the number of online cpus on the node. On error, it returns -1. - * - * TODO: - * Danny suggests using xcgroup_get_param to read the CPU values instead of - * this function. Look at the way task/cgroup/task_cgroup_cpuset.c or - * jobacct_gather/cgroup/jobacct_gather_cgroup.c does it. - */ -static int _get_cpu_total(void) -{ - FILE *f = NULL; - char * token = NULL, *token1 = NULL, *token2 = NULL, *lin = NULL; - char *saveptr = NULL, *saveptr1 = NULL, *endptr = NULL; - int total = 0; - ssize_t lsz; - size_t sz; - long int number1, number2; - - f = fopen("/sys/devices/system/cpu/online", "r"); - - if (!f) { - error("(%s: %d: %s) Failed to open file" - " /sys/devices/system/cpu/online: %m", - THIS_FILE, __LINE__, __FUNCTION__); - return -1; - } - - while (!feof(f)) { - lsz = getline(&lin, &sz, f); - if (lsz > 0) { - token = strtok_r(lin, ",", &saveptr); - while (token) { - // Check for ranged sub-list - token1 = strtok_r(token, "-", &saveptr1); - if (token1) { - number1 = strtol(token1, &endptr, 10); - if ((number1 == LONG_MIN) || - (number1 == LONG_MAX)) { - error("(%s: %d: %s) Error: %m", - THIS_FILE, __LINE__, - __FUNCTION__); - free(lin); - TEMP_FAILURE_RETRY(fclose(f)); - return -1; - } else if (endptr == token1) { - error("(%s: %d: %s) Error:" - " Not a number: %s\n", - THIS_FILE, __LINE__, - __FUNCTION__, endptr); - free(lin); - TEMP_FAILURE_RETRY(fclose(f)); - return -1; - } - - token2 = strtok_r(NULL, "-", &saveptr1); - if (token2) { - number2 = strtol(token2, - &endptr, 10); - if ((number2 == LONG_MIN) || - (number2 == LONG_MAX)) { - error("(%s: %d: %s)" - " Error: %m", - THIS_FILE, - __LINE__, - __FUNCTION__); - free(lin); - TEMP_FAILURE_RETRY( - fclose(f)); - return -1; - } else if (endptr == token2) { - error("(%s: %d: %s)" - " Error: Not a" - " number: '%s'\n", - THIS_FILE, - __LINE__, - __FUNCTION__, - endptr); - free(lin); - TEMP_FAILURE_RETRY( - fclose(f)); - return -1; - } - - total += number2 - number1 + 1; - } else { - total += 1; - } - } - token = strtok_r(NULL, ",", &saveptr); - } - } - } - free(lin); - TEMP_FAILURE_RETRY(fclose(f)); - return total; -} - -/* - * Function: _free_alpsc_pe_info - * Description: - * Frees any allocated members of alpsc_pe_info. - * Parameters: - * IN alpsc_pe_info: alpsc_peInfo_t structure needing to be freed - * - * Returns - * Void. - */ -static void _free_alpsc_pe_info(alpsc_peInfo_t alpsc_pe_info) -{ - if (alpsc_pe_info.peNidArray) { - xfree(alpsc_pe_info.peNidArray); - } - if (alpsc_pe_info.peCmdMapArray) { - xfree(alpsc_pe_info.peCmdMapArray); - } - if (alpsc_pe_info.nodeCpuArray) { - xfree(alpsc_pe_info.nodeCpuArray); - } - return; -} -#endif #endif diff --git a/src/plugins/switch/cray/switch_cray.h b/src/plugins/switch/cray/switch_cray.h index 7d7c3ef73ac0757cfa9bbfa383020de2e0aca595..848815b5e8ec535aa7d3050a06b2efb692987f27 100644 --- a/src/plugins/switch/cray/switch_cray.h +++ b/src/plugins/switch/cray/switch_cray.h @@ -42,8 +42,6 @@ #include "config.h" #endif -#ifdef HAVE_NATIVE_CRAY - #include <stdint.h> #include "src/common/bitstring.h" @@ -51,13 +49,20 @@ #include "src/common/slurm_protocol_defs.h" #include "src/slurmd/slurmstepd/slurmstepd_job.h" +#ifdef HAVE_NATIVE_CRAY #include "alpscomm_cn.h" #include "alpscomm_sn.h" +#endif /********************************************************** * Constants **********************************************************/ -// Legacy ALPS spool directory (can be redefined at compile time) +/* This allows for a BUILD time definition of LEGACY_SPOOL_DIR on the compile + * line. + * LEGACY_SPOOL_DIR can be customized to wherever the builder desires. + * This customization could be important because the default is a hard-coded + * path that does not vary regardless of where Slurm is installed. + */ #ifndef LEGACY_SPOOL_DIR #define LEGACY_SPOOL_DIR "/var/spool/alps/" #endif @@ -104,28 +109,21 @@ **********************************************************/ // Opaque Cray jobinfo structure typedef struct slurm_cray_jobinfo { - // Magic value uint32_t magic; - - // Number of cookies sent to configure the HSN - uint32_t num_cookies; - - // Array of cookies - char **cookies; - - // Array of cookie IDs + uint32_t num_cookies; /* The number of cookies sent to configure the + HSN */ + /* Double pointer to an array of cookie strings. + * cookie values here as NULL-terminated strings. + * There are num_cookies elements in the array. + * The caller is responsible for free()ing + * the array contents and the array itself. */ + char **cookies; + /* The array itself must be free()d when this struct is destroyed. */ uint32_t *cookie_ids; - - // Port for PMI communications - uint32_t port; - - // Slurm job id - uint32_t jobid; - - // Slurm step id - uint32_t stepid; - - // Cray Application ID (Slurm hash) + uint32_t port;/* Port for PMI Communications */ + uint32_t jobid; /* Current SLURM job id */ + uint32_t stepid; /* Current step id */ + /* Cray Application ID; A unique combination of the job id and step id*/ uint64_t apid; } slurm_cray_jobinfo_t; @@ -147,11 +145,13 @@ extern uint32_t debug_flags; /********************************************************** * Function declarations **********************************************************/ + +#ifdef HAVE_NATIVE_CRAY // Implemented in ports.c extern int assign_port(uint32_t *ret_port); extern int release_port(uint32_t real_port); -// Implemented in pe_info.c +// Implemented in pe_info.c extern int build_alpsc_pe_info(stepd_step_rec_t *job, slurm_cray_jobinfo_t *sw_job, alpsc_peInfo_t *alpsc_pe_info); @@ -170,13 +170,14 @@ extern int list_str_to_array(char *list, int *cnt, int32_t **numbers); extern void alpsc_debug(const char *file, int line, const char *func, int rc, int expected_rc, const char *alpsc_func, char *err_msg); -extern void print_jobinfo(slurm_cray_jobinfo_t *job); extern int create_apid_dir(uint64_t apid, uid_t uid, gid_t gid); extern int set_job_env(stepd_step_rec_t *job, slurm_cray_jobinfo_t *sw_job); extern void recursive_rmdir(const char *dirnm); +#endif /* HAVE_NATIVE_CRAY */ +extern void print_jobinfo(slurm_cray_jobinfo_t *job); /********************************************************** - * Macros + * Macros **********************************************************/ #define ALPSC_CN_DEBUG(f) alpsc_debug(THIS_FILE, __LINE__, __FUNCTION__, \ rc, 1, f, err_msg); @@ -189,6 +190,4 @@ extern void recursive_rmdir(const char *dirnm); #define CRAY_INFO(fmt, ...) info("(%s: %d: %s) "fmt, THIS_FILE, __LINE__, \ __FUNCTION__, ##__VA_ARGS__); -#endif /* HAVE_NATIVE_CRAY */ - #endif /* SWITCH_CRAY_H */ diff --git a/src/plugins/switch/cray/util.c b/src/plugins/switch/cray/util.c index be53dd7b1575cfb306ef0d59bd9c3b978587c6d9..85e3f0efff55d64778db2fa12f389e8f3c53321d 100644 --- a/src/plugins/switch/cray/util.c +++ b/src/plugins/switch/cray/util.c @@ -284,6 +284,8 @@ fileDel: st = unlink(dirnm); } } +#endif /* HAVE_NATIVE_CRAY */ + void print_jobinfo(slurm_cray_jobinfo_t *job) { int i; @@ -295,7 +297,10 @@ void print_jobinfo(slurm_cray_jobinfo_t *job) xassert(job->magic == CRAY_JOBINFO_MAGIC); - info("Jobinfo magic %"PRIx32, job->magic); + info("Program: %s", slurm_prog_name); + info("Address of slurm_cray_jobinfo_t structure: %p", job); + info("--Begin Jobinfo--"); + info(" Magic: %" PRIx32, job->magic); info(" Job ID: %" PRIu32, job->jobid); info(" Step ID: %" PRIu32, job->stepid); info(" APID: %" PRIu64, job->apid); @@ -309,7 +314,6 @@ void print_jobinfo(slurm_cray_jobinfo_t *job) for (i = 0; i < job->num_cookies; i++) { info(" cookie_ids[%d]: %" PRIu32, i, job->cookie_ids[i]); } + info(" ------"); info("--END Jobinfo--"); } - -#endif /* HAVE_NATIVE_CRAY */