diff --git a/src/plugins/switch/cray/Makefile.am b/src/plugins/switch/cray/Makefile.am index 560c747d8a86ca5bc3d5e59632c3411ac26a4185..83025c67d8eec5c7b77d399b6238f7afe0c77cc2 100644 --- a/src/plugins/switch/cray/Makefile.am +++ b/src/plugins/switch/cray/Makefile.am @@ -4,11 +4,19 @@ AUTOMAKE_OPTIONS = foreign PLUGIN_FLAGS = -module -avoid-version --export-dynamic -AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/src/common \ - $(CRAY_SWITCH_CPPFLAGS) - pkglib_LTLIBRARIES = switch_cray.la -# Null switch plugin. -switch_cray_la_SOURCES = switch_cray.c -switch_cray_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) $(CRAY_SWITCH_LDFLAGS) +# Cray switch plugin. +switch_cray_la_SOURCES = \ + pe_info.c \ + switch_cray.c \ + switch_cray.h \ + util.c +switch_cray_la_LDFLAGS = \ + $(SO_LDFLAGS) \ + $(PLUGIN_FLAGS) \ + $(CRAY_SWITCH_LDFLAGS) +switch_cray_la_CPPFLAGS = \ + -I$(top_srcdir) \ + -I$(top_srcdir)/src/common \ + $(CRAY_SWITCH_CPPFLAGS) diff --git a/src/plugins/switch/cray/Makefile.in b/src/plugins/switch/cray/Makefile.in index 9f3fabb7935fac6d6b0cd73afac3d17ea79b40ac..9f00c93d82212cfb68a5bd82ffe8f49866c7bdfd 100644 --- a/src/plugins/switch/cray/Makefile.in +++ b/src/plugins/switch/cray/Makefile.in @@ -161,7 +161,8 @@ am__uninstall_files_from_dir = { \ am__installdirs = "$(DESTDIR)$(pkglibdir)" LTLIBRARIES = $(pkglib_LTLIBRARIES) switch_cray_la_LIBADD = -am_switch_cray_la_OBJECTS = switch_cray.lo +am_switch_cray_la_OBJECTS = switch_cray_la-pe_info.lo \ + switch_cray_la-switch_cray.lo switch_cray_la-util.lo switch_cray_la_OBJECTS = $(am_switch_cray_la_OBJECTS) AM_V_lt = $(am__v_lt_@AM_V@) am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) @@ -470,14 +471,25 @@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ AUTOMAKE_OPTIONS = foreign PLUGIN_FLAGS = -module -avoid-version --export-dynamic -AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/src/common \ - $(CRAY_SWITCH_CPPFLAGS) - pkglib_LTLIBRARIES = switch_cray.la -# Null switch plugin. 
-switch_cray_la_SOURCES = switch_cray.c -switch_cray_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) $(CRAY_SWITCH_LDFLAGS) +# Cray switch plugin. +switch_cray_la_SOURCES = \ + pe_info.c \ + switch_cray.c \ + switch_cray.h \ + util.c + +switch_cray_la_LDFLAGS = \ + $(SO_LDFLAGS) \ + $(PLUGIN_FLAGS) \ + $(CRAY_SWITCH_LDFLAGS) + +switch_cray_la_CPPFLAGS = \ + -I$(top_srcdir) \ + -I$(top_srcdir)/src/common \ + $(CRAY_SWITCH_CPPFLAGS) + all: all-am .SUFFIXES: @@ -557,7 +569,9 @@ mostlyclean-compile: distclean-compile: -rm -f *.tab.c -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/switch_cray.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/switch_cray_la-pe_info.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/switch_cray_la-switch_cray.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/switch_cray_la-util.Plo@am__quote@ .c.o: @am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< @@ -580,6 +594,27 @@ distclean-compile: @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $< +switch_cray_la-pe_info.lo: pe_info.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(switch_cray_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT switch_cray_la-pe_info.lo -MD -MP -MF $(DEPDIR)/switch_cray_la-pe_info.Tpo -c -o switch_cray_la-pe_info.lo `test -f 'pe_info.c' || echo '$(srcdir)/'`pe_info.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/switch_cray_la-pe_info.Tpo $(DEPDIR)/switch_cray_la-pe_info.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='pe_info.c' object='switch_cray_la-pe_info.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) 
--tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(switch_cray_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o switch_cray_la-pe_info.lo `test -f 'pe_info.c' || echo '$(srcdir)/'`pe_info.c + +switch_cray_la-switch_cray.lo: switch_cray.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(switch_cray_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT switch_cray_la-switch_cray.lo -MD -MP -MF $(DEPDIR)/switch_cray_la-switch_cray.Tpo -c -o switch_cray_la-switch_cray.lo `test -f 'switch_cray.c' || echo '$(srcdir)/'`switch_cray.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/switch_cray_la-switch_cray.Tpo $(DEPDIR)/switch_cray_la-switch_cray.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='switch_cray.c' object='switch_cray_la-switch_cray.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(switch_cray_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o switch_cray_la-switch_cray.lo `test -f 'switch_cray.c' || echo '$(srcdir)/'`switch_cray.c + +switch_cray_la-util.lo: util.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(switch_cray_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT switch_cray_la-util.lo -MD -MP -MF $(DEPDIR)/switch_cray_la-util.Tpo -c -o switch_cray_la-util.lo `test -f 'util.c' || echo '$(srcdir)/'`util.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/switch_cray_la-util.Tpo $(DEPDIR)/switch_cray_la-util.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='util.c' object='switch_cray_la-util.lo' 
libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(switch_cray_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o switch_cray_la-util.lo `test -f 'util.c' || echo '$(srcdir)/'`util.c + mostlyclean-libtool: -rm -f *.lo diff --git a/src/plugins/switch/cray/pe_info.c b/src/plugins/switch/cray/pe_info.c new file mode 100644 index 0000000000000000000000000000000000000000..7b4fa74af5eb0e1adaca32fb95fc4255407d9638 --- /dev/null +++ b/src/plugins/switch/cray/pe_info.c @@ -0,0 +1,240 @@ +/*****************************************************************************\ + * pe_info.c - Library for managing a switch on a Cray system. + ***************************************************************************** + * Copyright (C) 2014 SchedMD LLC + * Copyright 2014 Cray Inc. All Rights Reserved. + * Written by David Gloe <c16817@cray.com> + * + * This file is part of SLURM, a resource management program. + * For details, see <http://slurm.schedmd.com/>. + * Please also read the included file: DISCLAIMER. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. 
If you modify file(s) with this exception, you may extend this
+ * exception to your version of the file(s), but you are not obligated to do
+ * so. If you do not wish to do so, delete this exception statement from your
+ * version. If you delete this exception statement from all source files in
+ * the program, then also delete it here.
+ *
+ * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with SLURM; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+\*****************************************************************************/
+
+#include "switch_cray.h"
+
+#ifdef HAVE_NATIVE_CRAY
+
+#include <stdio.h>
+#include <stdlib.h>
+
+static void _print_alpsc_pe_info(alpsc_peInfo_t *alps_info);
+static int _get_first_pe(uint32_t nodeid, uint32_t task_count,
+			 uint32_t **host_to_task_map, int32_t *first_pe);
+
+/*
+ * Fill in an alpsc_peInfo_t structure
+ *
+ * IN job            -- step record supplying task counts, the node list and
+ *                      the node-to-task map
+ * IN sw_job         -- switch jobinfo (not read by this function)
+ * OUT alpsc_pe_info -- structure to fill; peNidArray, peCmdMapArray and
+ *                      nodeCpuArray are allocated with xmalloc() here
+ *
+ * Returns SLURM_SUCCESS, or SLURM_ERROR on failure. The early failure paths
+ * return before anything has been allocated; the MPMD path releases what
+ * was allocated via free_alpsc_pe_info().
+ */
+int build_alpsc_pe_info(stepd_step_rec_t *job,
+			slurm_cray_jobinfo_t *sw_job,
+			alpsc_peInfo_t *alpsc_pe_info)
+{
+	int rc, i, j, cnt = 0;
+	int32_t *task_to_nodes_map = NULL;
+	int32_t *nodes = NULL;
+	int32_t first_pe_here;
+	uint32_t task;
+	size_t size;
+
+	/*
+	 * Start from NULL array members: the error path below hands the
+	 * structure to free_alpsc_pe_info() before every member has been
+	 * assigned, and that function frees any member that is non-NULL.
+	 */
+	alpsc_pe_info->peNidArray = NULL;
+	alpsc_pe_info->peCmdMapArray = NULL;
+	alpsc_pe_info->nodeCpuArray = NULL;
+
+	alpsc_pe_info->totalPEs = job->ntasks;
+	alpsc_pe_info->pesHere = job->node_tasks;
+	alpsc_pe_info->peDepth = job->cpus_per_task;
+
+	/*
+	 * Fill in alpsc_pe_info->firstPeHere
+	 */
+	rc = _get_first_pe(job->nodeid, job->node_tasks,
+			   job->msg->global_task_ids,
+			   &first_pe_here);
+	if (rc < 0) {
+		CRAY_ERR("get_first_pe failed");
+		return SLURM_ERROR;
+	}
+	alpsc_pe_info->firstPeHere = first_pe_here;
+
+	/*
+	 * Fill in alpsc_pe_info->peNidArray
+	 *
+	 * The peNidArray maps tasks to nodes.
+	 * Basically, reverse the tids variable which maps nodes to tasks.
+	 */
+	rc = list_str_to_array(job->msg->complete_nodelist, &cnt, &nodes);
+	if (rc < 0) {
+		CRAY_ERR("list_str_to_array failed");
+		return SLURM_ERROR;
+	}
+	if (cnt == 0) {
+		CRAY_ERR("list_str_to_array returned a node count of zero");
+		return SLURM_ERROR;
+	}
+	if (job->msg->nnodes != cnt) {
+		/* cnt is an int; the cast pins the second conversion. */
+		CRAY_ERR("list_str_to_array returned count %d"
+			 " does not match expected count %" PRIu32,
+			 cnt, (uint32_t) job->msg->nnodes);
+		/*
+		 * nodes[] holds only cnt entries while the loop below
+		 * indexes it up to nnodes - 1, so a short list cannot be
+		 * used. A longer list is tolerated as before.
+		 */
+		if ((uint32_t) cnt < job->msg->nnodes) {
+			xfree(nodes);
+			return SLURM_ERROR;
+		}
+	}
+
+	task_to_nodes_map = xmalloc(job->msg->ntasks * sizeof(int32_t));
+
+	for (i = 0; i < job->msg->nnodes; i++) {
+		for (j = 0; j < job->msg->tasks_to_launch[i]; j++) {
+			task = job->msg->global_task_ids[i][j];
+			task_to_nodes_map[task] = nodes[i];
+			if (debug_flags & DEBUG_FLAG_SWITCH) {
+				CRAY_INFO("peNidArray:\tTask: %d\tNode: %d",
+					  task, nodes[i]);
+			}
+		}
+	}
+	alpsc_pe_info->peNidArray = task_to_nodes_map;
+	xfree(nodes);
+
+	/*
+	 * Fill in alpsc_pe_info->peCmdMapArray
+	 *
+	 * If the job is an SPMD job, then the command index (cmd_index) is 0.
+	 * Otherwise, if the job is an MPMD job, then the command index
+	 * (cmd_index) is equal to the number of executables in the job minus 1.
+	 *
+	 * TODO: Add MPMD support once SchedMD provides the needed MPMD data.
+	 */
+
+	if (!job->multi_prog) {
+		/* SPMD Launch */
+		size = alpsc_pe_info->totalPEs * sizeof(int);
+		alpsc_pe_info->peCmdMapArray = xmalloc(size);
+		memset(alpsc_pe_info->peCmdMapArray, 0, size);
+	} else {
+		/* MPMD Launch */
+		CRAY_ERR("MPMD Applications are not currently supported.");
+		goto error_free_alpsc_pe_info_t;
+	}
+
+	/*
+	 * Fill in alpsc_pe_info->nodeCpuArray
+	 * I don't know how to get this information from SLURM.
+	 * Cray's PMI does not need the information.
+	 * It may be used by debuggers like ATP or lgdb. If so, then it will
+	 * have to be filled in when support for them is added.
+	 * Currently, it's all zeros.
+	 */
+	size = job->msg->nnodes * sizeof(int);
+	alpsc_pe_info->nodeCpuArray = xmalloc(size);
+	memset(alpsc_pe_info->nodeCpuArray, 0, size);
+
+	if (debug_flags & DEBUG_FLAG_SWITCH) {
+		_print_alpsc_pe_info(alpsc_pe_info);
+	}
+
+	return SLURM_SUCCESS;
+
+error_free_alpsc_pe_info_t:
+	free_alpsc_pe_info(alpsc_pe_info);
+	return SLURM_ERROR;
+}
+
+/*
+ * Print information about an alpsc_peInfo_t structure
+ *
+ * Logs the scalar fields, then one "Task/Node" line per entry of
+ * peNidArray (totalPEs entries), between start and stop banners.
+ */
+static void _print_alpsc_pe_info(alpsc_peInfo_t *alps_info)
+{
+	int i;
+	info("*************************alpsc_peInfo Start"
+	     "*************************");
+	info("totalPEs: %d\nfirstPeHere: %d\npesHere: %d\npeDepth: %d\n",
+	     alps_info->totalPEs, alps_info->firstPeHere, alps_info->pesHere,
+	     alps_info->peDepth);
+	for (i = 0; i < alps_info->totalPEs; i++) {
+		info("Task: %d\tNode: %d", i, alps_info->peNidArray[i]);
+	}
+	info("*************************alpsc_peInfo Stop"
+	     "*************************");
+}
+
+/*
+ * Function: get_first_pe
+ * Description:
+ * Returns the first (i.e. lowest) PE on the node.
+ *
+ * IN:
+ * nodeid -- Index of the node in the host_to_task_map
+ * task_count -- Number of tasks on the node
+ * host_to_task_map -- 2D array mapping the host to its tasks
+ *
+ * OUT:
+ * first_pe -- The first (i.e. lowest) PE on the node
+ *
+ * RETURN
+ * 0 on success and -1 on error
+ */
+static int _get_first_pe(uint32_t nodeid, uint32_t task_count,
+			 uint32_t **host_to_task_map, int32_t *first_pe)
+{
+	uint32_t i, lowest;
+
+	if (task_count == 0) {
+		CRAY_ERR("task_count == 0");
+		return -1;
+	}
+	if (!host_to_task_map) {
+		CRAY_ERR("host_to_task_map == NULL");
+		return -1;
+	}
+
+	/* Entry 0 seeds the minimum, so the scan starts at entry 1. */
+	lowest = host_to_task_map[nodeid][0];
+	for (i = 1; i < task_count; i++) {
+		if (host_to_task_map[nodeid][i] < lowest)
+			lowest = host_to_task_map[nodeid][i];
+	}
+	*first_pe = lowest;
+	return 0;
+}
+
+/*
+ * Function: _free_alpsc_pe_info
+ * Description:
+ * 	Frees any allocated members of alpsc_pe_info.
+ * Parameters: + * IN alpsc_pe_info: alpsc_peInfo_t structure needing to be freed + * + * Returns + * Void. + */ +void free_alpsc_pe_info(alpsc_peInfo_t *alpsc_pe_info) +{ + if (alpsc_pe_info->peNidArray) { + xfree(alpsc_pe_info->peNidArray); + } + if (alpsc_pe_info->peCmdMapArray) { + xfree(alpsc_pe_info->peCmdMapArray); + } + if (alpsc_pe_info->nodeCpuArray) { + xfree(alpsc_pe_info->nodeCpuArray); + } + return; +} + +#endif diff --git a/src/plugins/switch/cray/switch_cray.c b/src/plugins/switch/cray/switch_cray.c index a311399adc61b51cc4a746c0b674ab2fb8fa8beb..fa81eebf06fab9c9a7c5aabd3cb340f3d3f011e3 100644 --- a/src/plugins/switch/cray/switch_cray.c +++ b/src/plugins/switch/cray/switch_cray.c @@ -827,6 +827,7 @@ extern int switch_p_job_init(stepd_step_rec_t *job) } rc = alpsc_attach_cncu_container(&err_msg, sw_job->jobid, job->cont_id); + if (rc != 1) { if (err_msg) { error("(%s: %d: %s) alpsc_attach_cncu_container failed:" @@ -1055,124 +1056,19 @@ extern int switch_p_job_init(stepd_step_rec_t *job) //alpsc_config_gpcd(); /* - * The following section will fill in the alpsc_peInfo structure which - * is the key argument to the alpsc_write_placement_file() call. + * Fill in the alpsc_pe_info structure */ - alpsc_pe_info.totalPEs = job->ntasks; - alpsc_pe_info.pesHere = job->node_tasks; - alpsc_pe_info.peDepth = job->cpus_per_task; + rc = build_alpsc_pe_info(job, sw_job, &alpsc_pe_info); + if (rc != SLURM_SUCCESS) + return rc; /* - * Fill in alpsc_pe_info.firstPeHere + * Set the cmd_index */ - rc = _get_first_pe(job->nodeid, job->node_tasks, - job->msg->global_task_ids, - &first_pe_here); - if (rc < 0) { - error("(%s: %d: %s) get_first_pe failed", THIS_FILE, __LINE__, - __FUNCTION__); - return SLURM_ERROR; - } - alpsc_pe_info.firstPeHere = first_pe_here; - - /* - * Fill in alpsc_pe_info.peNidArray - * - * The peNidArray maps tasks to nodes. - * Basically, reverse the tids variable which maps nodes to tasks. 
- */ - rc = _list_str_to_array(job->msg->complete_nodelist, &cnt, &nodes); - if (rc < 0) { - error("(%s: %d: %s) list_str_to_array failed", - THIS_FILE, __LINE__, __FUNCTION__); - return SLURM_ERROR; - } - if (cnt == 0) { - error("(%s: %d: %s) list_str_to_array returned a node count of" - " zero.", - THIS_FILE, __LINE__, __FUNCTION__); - return SLURM_ERROR; - } - if (job->msg->nnodes != cnt) { - error("(%s: %d: %s) list_str_to_array returned count %" - PRIu32 "does not match expected count %d", - THIS_FILE, __LINE__, __FUNCTION__, cnt, - job->msg->nnodes); - } - - task_to_nodes_map = xmalloc(job->msg->ntasks * sizeof(int32_t)); - for (i = 0; i < job->msg->nnodes; i++) { - for (j = 0; j < job->msg->tasks_to_launch[i]; j++) { - task = job->msg->global_task_ids[i][j]; - task_to_nodes_map[task] = nodes[i]; - if (debug_flags & DEBUG_FLAG_SWITCH) { - info("(%s:%d: %s) peNidArray:\tTask: %d\tNode:" - " %d", THIS_FILE, - __LINE__, __FUNCTION__, task, nodes[i]); - } - } - } - alpsc_pe_info.peNidArray = task_to_nodes_map; - xfree(nodes); - - /* - * Fill in alpsc_pe_info.peCmdMapArray - * - * If the job is an SPMD job, then the command index (cmd_index) is 0. - * Otherwise, if the job is an MPMD job, then the command index - * (cmd_index) is equal to the number of executables in the job minus 1. - * - * TODO: Add MPMD support once SchedMD provides the needed MPMD data. - */ - - if (!job->multi_prog) { - /* SPMD Launch */ + if (!job->multi_prog) cmd_index = 0; - size = alpsc_pe_info.totalPEs * sizeof(int); - alpsc_pe_info.peCmdMapArray = xmalloc(size); - memset(alpsc_pe_info.peCmdMapArray, 0, size); - } else { - /* MPMD Launch */ - - // Deferred support - /* - ap.cmdIndex = ; - ap.peCmdMapArray = ; - ap.firstPeHere = ; - - ap.pesHere = pesHere; // These need to be MPMD specific. 
- */ - - error("(%s: %d: %s) MPMD Applications are not currently " - "supported.", - THIS_FILE, __LINE__, __FUNCTION__); - goto error_free_alpsc_pe_info_t; - } - - /* - * Fill in alpsc_pe_info.nodeCpuArray - * I don't know how to get this information from SLURM. - * Cray's PMI does not need the information. - * It may be used by debuggers like ATP or lgdb. If so, then it will - * have to be filled in when support for them is added. - * Currently, it's all zeros. - */ - size = job->msg->nnodes * sizeof(int); - alpsc_pe_info.nodeCpuArray = xmalloc(size); - memset(alpsc_pe_info.nodeCpuArray, 0, size); - if (job->msg->nnodes && !alpsc_pe_info.nodeCpuArray) { - free(alpsc_pe_info.peCmdMapArray); - error("(%s: %d: %s) failed to calloc nodeCpuArray.", THIS_FILE, - __LINE__, __FUNCTION__); - goto error_free_alpsc_pe_info_t; - } - - if (debug_flags & DEBUG_FLAG_SWITCH) { - _print_alpsc_pe_info(alpsc_pe_info); - } - /* * Some of the input parameters for alpsc_write_placement_file do not * apply for SLURM. These parameters will be given zero values. @@ -1194,70 +1090,22 @@ extern int switch_p_job_init(stepd_step_rec_t *job) &alpsc_pe_info, control_nid, control_soc, num_branches, &alpsc_branch_info); - if (rc != 1) { - if (err_msg) { - error("(%s: %d: %s) alpsc_write_placement_file failed:" - " %s", - THIS_FILE, __LINE__, __FUNCTION__, err_msg); - free(err_msg); - } else { - error("(%s: %d: %s) alpsc_write_placement_file failed:" - " No error message present.", - THIS_FILE, __LINE__, __FUNCTION__); - } - goto error_free_alpsc_pe_info_t; - } - if (err_msg) { - info("(%s: %d: %s) alpsc_write_placement_file: %s", - THIS_FILE, __LINE__, __FUNCTION__, err_msg); - free(err_msg); - } - - /* Clean up alpsc_pe_info*/ - _free_alpsc_pe_info(alpsc_pe_info); - - /* - * Write the CRAY_NUM_COOKIES and CRAY_COOKIES variables out, too. 
- */ - rc = env_array_overwrite_fmt(&job->env, "CRAY_NUM_COOKIES", "%" PRIu32, - sw_job->num_cookies); - if (rc == 0) { - info("Failed to set env variable CRAY_NUM_COOKIES"); + ALPSC_CN_DEBUG("alpsc_write_placement_file"); + if (rc != 1) { + free_alpsc_pe_info(&alpsc_pe_info); return SLURM_ERROR; } - /* - * Create the CRAY_COOKIES environment variable in the application's - * environment. - * Create one string containing a comma separated list of cookies. - */ - for (i = 0; i < sw_job->num_cookies; i++) { - if (i > 0) { - xstrfmtcat(buff, ",%s", sw_job->cookies[i]); - } else - xstrcat(buff, sw_job->cookies[i]); - } - - rc = env_array_overwrite(&job->env, "CRAY_COOKIES", buff); - if (rc == 0) { - info("Failed to set env variable CRAY_COOKIES = %s", buff); - xfree(buff); - return SLURM_ERROR; - } - xfree(buff); + /* Clean up alpsc_pe_info*/ + free_alpsc_pe_info(&alpsc_pe_info); /* - * Write the PMI_CONTROL_PORT - * Cray's PMI uses this is the port to communicate its control tree - * information. 
+ * Write some environment variables used by LLI and PMI */ - rc = env_array_overwrite_fmt(&job->env, "PMI_CONTROL_PORT", "%" PRIu32, - sw_job->port); - if (rc == 0) { - info("Failed to set env variable PMI_CONTROL_PORT"); - return SLURM_ERROR; - } + rc = set_job_env(job, sw_job); + if (rc != SLURM_SUCCESS) + return rc; /* * Query the generic resources to see if the GPU should be allocated @@ -1294,15 +1142,8 @@ extern int switch_p_job_init(stepd_step_rec_t *job) * Set the Job's APID */ job_setapid(getpid(), sw_job->apid); - - return SLURM_SUCCESS; - -error_free_alpsc_pe_info_t: - _free_alpsc_pe_info(alpsc_pe_info); - return SLURM_ERROR; -#else - return SLURM_SUCCESS; #endif + return SLURM_SUCCESS; } extern int switch_p_job_suspend_test(switch_jobinfo_t *jobinfo) diff --git a/src/plugins/switch/cray/switch_cray.h b/src/plugins/switch/cray/switch_cray.h new file mode 100644 index 0000000000000000000000000000000000000000..7d7c3ef73ac0757cfa9bbfa383020de2e0aca595 --- /dev/null +++ b/src/plugins/switch/cray/switch_cray.h @@ -0,0 +1,194 @@ +/*****************************************************************************\ + * switch_cray.h - Library for managing a switch on a Cray system. + ***************************************************************************** + * Copyright (C) 2013 SchedMD LLC + * Copyright 2013 Cray Inc. All Rights Reserved. + * Written by Danny Auble <da@schedmd.com> + * + * This file is part of SLURM, a resource management program. + * For details, see <http://slurm.schedmd.com/>. + * Please also read the included file: DISCLAIMER. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+\*****************************************************************************/ + +#ifndef SWITCH_CRAY_H +#define SWITCH_CRAY_H + +#if HAVE_CONFIG_H +#include "config.h" +#endif + +#ifdef HAVE_NATIVE_CRAY + +#include <stdint.h> + +#include "src/common/bitstring.h" +#include "src/common/log.h" +#include "src/common/slurm_protocol_defs.h" +#include "src/slurmd/slurmstepd/slurmstepd_job.h" + +#include "alpscomm_cn.h" +#include "alpscomm_sn.h" + +/********************************************************** + * Constants + **********************************************************/ +// Legacy ALPS spool directory (can be redefined at compile time) +#ifndef LEGACY_SPOOL_DIR +#define LEGACY_SPOOL_DIR "/var/spool/alps/" +#endif + +// Magic value signifying that jobinfo wasn't NULL +#define CRAY_JOBINFO_MAGIC 0xCAFECAFE + +// Magic value signifying that jobinfo was NULL, don't unpack +#define CRAY_NULL_JOBINFO_MAGIC 0xDEAFDEAF + +// File to save plugin state to +#define CRAY_SWITCH_STATE "/switch_cray_state" + +// Temporary file containing new plugin state +#define CRAY_SWITCH_STATE_NEW CRAY_SWITCH_STATE".new" + +// File containing previous plugin state +#define CRAY_SWITCH_STATE_OLD CRAY_SWITCH_STATE".old" + +// Minimum PMI port to allocate +#define MIN_PORT 20000 + +// Maximum PMI port to allocate +#define MAX_PORT 60000 + +// Number of ports to allocate +#define PORT_CNT (MAX_PORT - MIN_PORT + 1) + +// Length of bitmap in bytes (see _bitstr_words in src/common/bitstring.c) +#define PORT_BITMAP_LEN ((((PORT_CNT + BITSTR_MAXPOS) >> BITSTR_SHIFT) \ + + BITSTR_OVERHEAD) * sizeof(bitstr_t)) + +// Number of times to attempt allocating a port when none are available +#define ATTEMPTS 2 + +// Maximum network resource scaling (in percent) +#define MAX_SCALING 50 + +// Minimum network resource scaling (in percent) +#define MIN_SCALING 1 + +/********************************************************** + * Type definitions + **********************************************************/ 
+// Opaque Cray jobinfo structure +typedef struct slurm_cray_jobinfo { + // Magic value + uint32_t magic; + + // Number of cookies sent to configure the HSN + uint32_t num_cookies; + + // Array of cookies + char **cookies; + + // Array of cookie IDs + uint32_t *cookie_ids; + + // Port for PMI communications + uint32_t port; + + // Slurm job id + uint32_t jobid; + + // Slurm step id + uint32_t stepid; + + // Cray Application ID (Slurm hash) + uint64_t apid; +} slurm_cray_jobinfo_t; + +/********************************************************** + * Global variables + **********************************************************/ +// Which ports are reserved (holds PORT_CNT bits) +extern bitstr_t *port_resv; + +// Last allocated port index +extern uint32_t last_alloc_port; + +// Mutex controlling access to port variables +extern pthread_mutex_t port_mutex; + +// Debug flags +extern uint32_t debug_flags; + +/********************************************************** + * Function declarations + **********************************************************/ +// Implemented in ports.c +extern int assign_port(uint32_t *ret_port); +extern int release_port(uint32_t real_port); + +// Implemented in pe_info.c +extern int build_alpsc_pe_info(stepd_step_rec_t *job, + slurm_cray_jobinfo_t *sw_job, + alpsc_peInfo_t *alpsc_pe_info); +extern void free_alpsc_pe_info(alpsc_peInfo_t *alpsc_pe_info); + +// Implemented in gpu.c +extern int setup_gpu(stepd_step_rec_t *job); +extern int reset_gpu(stepd_step_rec_t *job); + +// Implemented in scaling.c +extern int get_cpu_scaling(stepd_step_rec_t *job); +extern int get_mem_scaling(stepd_step_rec_t *job); + +// Implemented in util.c +extern int list_str_to_array(char *list, int *cnt, int32_t **numbers); +extern void alpsc_debug(const char *file, int line, const char *func, + int rc, int expected_rc, const char *alpsc_func, + char *err_msg); +extern void print_jobinfo(slurm_cray_jobinfo_t *job); +extern int create_apid_dir(uint64_t apid, uid_t uid, 
gid_t gid); +extern int set_job_env(stepd_step_rec_t *job, slurm_cray_jobinfo_t *sw_job); +extern void recursive_rmdir(const char *dirnm); + +/********************************************************** + * Macros + **********************************************************/ +#define ALPSC_CN_DEBUG(f) alpsc_debug(THIS_FILE, __LINE__, __FUNCTION__, \ + rc, 1, f, err_msg); +#define ALPSC_SN_DEBUG(f) alpsc_debug(THIS_FILE, __LINE__, __FUNCTION__, \ + rc, 0, f, err_msg); +#define CRAY_ERR(fmt, ...) error("(%s: %d: %s) "fmt, THIS_FILE, __LINE__, \ + __FUNCTION__, ##__VA_ARGS__); +#define CRAY_DEBUG(fmt, ...) debug2("(%s: %d: %s) "fmt, THIS_FILE, __LINE__, \ + __FUNCTION__, ##__VA_ARGS__); +#define CRAY_INFO(fmt, ...) info("(%s: %d: %s) "fmt, THIS_FILE, __LINE__, \ + __FUNCTION__, ##__VA_ARGS__); + +#endif /* HAVE_NATIVE_CRAY */ + +#endif /* SWITCH_CRAY_H */ diff --git a/src/plugins/switch/cray/util.c b/src/plugins/switch/cray/util.c new file mode 100644 index 0000000000000000000000000000000000000000..be53dd7b1575cfb306ef0d59bd9c3b978587c6d9 --- /dev/null +++ b/src/plugins/switch/cray/util.c @@ -0,0 +1,315 @@ +/*****************************************************************************\ + * util.c - Library for managing a switch on a Cray system. + ***************************************************************************** + * Copyright (C) 2013 SchedMD LLC + * Copyright 2013 Cray Inc. All Rights Reserved. + * Written by Danny Auble <da@schedmd.com> + * + * This file is part of SLURM, a resource management program. + * For details, see <http://slurm.schedmd.com/>. + * Please also read the included file: DISCLAIMER. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +\*****************************************************************************/ + +#if HAVE_CONFIG_H +#include "config.h" +#endif + +#include <dirent.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> + +#include "src/common/xstring.h" +#include "switch_cray.h" + +#ifdef HAVE_NATIVE_CRAY + +/* + * Create APID directory with given uid/gid as the owner. 
+ *
+ * The path is LEGACY_SPOOL_DIR followed by the decimal apid; the directory
+ * is created with mode 0700 and then chown()ed to uid/gid.
+ *
+ * Returns SLURM_SUCCESS, or SLURM_ERROR if mkdir() or chown() fails.
+ */
+int create_apid_dir(uint64_t apid, uid_t uid, gid_t gid)
+{
+	int rc = SLURM_SUCCESS;
+	char *apid_dir = NULL;
+
+	apid_dir = xstrdup_printf(LEGACY_SPOOL_DIR "%" PRIu64, apid);
+
+	/*
+	 * Report failures before releasing apid_dir: the mkdir message
+	 * formats the path, and %m must see errno as the failing call left
+	 * it rather than after an intervening free.
+	 */
+	if (mkdir(apid_dir, 0700)) {
+		CRAY_ERR("mkdir failed to make directory %s: %m", apid_dir);
+		rc = SLURM_ERROR;
+	} else if (chown(apid_dir, uid, gid)) {
+		CRAY_ERR("chown failed: %m");
+		rc = SLURM_ERROR;
+	}
+
+	xfree(apid_dir);
+	return rc;
+}
+
+/*
+ * Set job environment variables used by LLI and PMI
+ */
+int set_job_env(stepd_step_rec_t *job, slurm_cray_jobinfo_t *sw_job)
+{
+	int rc, i;
+	char *buff = NULL;
+
+	/*
+	 * Write the CRAY_NUM_COOKIES and CRAY_COOKIES variables out
+	 */
+	rc = env_array_overwrite_fmt(&job->env, "CRAY_NUM_COOKIES", "%" PRIu32,
+				     sw_job->num_cookies);
+	if (rc == 0) {
+		CRAY_ERR("Failed to set env variable CRAY_NUM_COOKIES");
+		return SLURM_ERROR;
+	}
+
+	/*
+	 * Create the CRAY_COOKIES environment variable in the application's
+	 * environment.
+	 * Create one string containing a comma separated list of cookies.
+	 */
+	for (i = 0; i < sw_job->num_cookies; i++) {
+		if (i > 0) {
+			xstrfmtcat(buff, ",%s", sw_job->cookies[i]);
+		} else
+			xstrcat(buff, sw_job->cookies[i]);
+	}
+
+	rc = env_array_overwrite(&job->env, "CRAY_COOKIES", buff);
+	if (rc == 0) {
+		CRAY_ERR("Failed to set env variable CRAY_COOKIES");
+		xfree(buff);
+		return SLURM_ERROR;
+	}
+	xfree(buff);
+
+	/*
+	 * Write the PMI_CONTROL_PORT
+	 * Cray's PMI uses this as the port to communicate its control tree
+	 * information.
+ */ + rc = env_array_overwrite_fmt(&job->env, "PMI_CONTROL_PORT", "%" PRIu32, + sw_job->port); + if (rc == 0) { + CRAY_ERR("Failed to set env variable PMI_CONTROL_PORT"); + return SLURM_ERROR; + } + return SLURM_SUCCESS; +} + +/* + * Print the results of an alpscomm call + */ +void alpsc_debug(const char *file, int line, const char *func, + int rc, int expected_rc, const char *alpsc_func, + char *err_msg) +{ + if (rc != expected_rc) { + error("(%s: %d: %s) %s failed: %s", file, line, func, + alpsc_func, + err_msg ? err_msg : "No error message present"); + } else if (err_msg) { + info("(%s: %d: %s) %s: %s", file, line, func, + alpsc_func, err_msg); + } + free(err_msg); +} + + +/* + * Function: list_str_to_array + * Description: + * Convert the list string into an array of integers. + * + * IN list -- The list string + * OUT cnt -- The number of numbers in the list string + * OUT numbers -- Array of integers; Caller is responsible to xfree() + * this. + * + * N.B. Caller is responsible to xfree() numbers. + * + * RETURNS + * Returns 0 on success and -1 on failure. + */ + +int list_str_to_array(char *list, int *cnt, int32_t **numbers) +{ + + int32_t *item_ptr = NULL; + hostlist_t hl; + int i, ret = 0; + char *str, *cptr = NULL; + + /* + * Create a hostlist + */ + if (!(hl = hostlist_create(list))) { + CRAY_ERR("hostlist_create error on %s", list); + error("hostlist_create error on %s", list); + return -1; + } + + *cnt = hostlist_count(hl); + + if (!*cnt) { + *numbers = NULL; + return 0; + } + + /* + * Create an integer array of item_ptr in the same order as in the list. 
+ */ + i = 0; + item_ptr = *numbers = xmalloc((*cnt) * sizeof(int32_t)); + while ((str = hostlist_shift(hl))) { + if (!(cptr = strpbrk(str, "0123456789"))) { + CRAY_ERR("Error: Node was not recognizable: %s", str); + free(str); + xfree(item_ptr); + *numbers = NULL; + hostlist_destroy(hl); + return -1; + } + item_ptr[i] = atoll(cptr); + i++; + free(str); + } + + // Clean up + hostlist_destroy(hl); + + return ret; +} + +/* + * Recursive directory delete + * + * Call with a directory name and this function will delete + * all files and directories rooted in this name. Finally + * the named directory will be deleted. + * If called with a file name, only that file will be deleted. + * + * Stolen from the ALPS code base. I may need to write my own. + */ +void recursive_rmdir(const char *dirnm) +{ + int st; + size_t dirnm_len, fnm_len, name_len; + char *fnm = 0; + DIR *dirp; + struct dirent *dir; + struct stat st_buf; + + /* Don't do anything if there is no directory name */ + if (!dirnm) { + return; + } + dirp = opendir(dirnm); + if (!dirp) { + if (errno == ENOTDIR) + goto fileDel; + CRAY_ERR("Error opening directory %s", dirnm); + return; + } + + dirnm_len = strlen(dirnm); + if (dirnm_len == 0) + return; + while ((dir = readdir(dirp))) { + name_len = strlen(dir->d_name); + if (name_len == 1 && dir->d_name[0] == '.') + continue; + if (name_len == 2 && strcmp(dir->d_name, "..") == 0) + continue; + fnm_len = dirnm_len + name_len + 2; + free(fnm); + fnm = malloc(fnm_len); + snprintf(fnm, fnm_len, "%s/%s", dirnm, dir->d_name); + st = stat(fnm, &st_buf); + if (st < 0) { + CRAY_ERR("stat of %s", fnm); + continue; + } + if (st_buf.st_mode & S_IFDIR) { + recursive_rmdir(fnm); + } else { + + st = unlink(fnm); + if (st < 0 && errno == EISDIR) + st = rmdir(fnm); + if (st < 0 && errno != ENOENT) { + CRAY_ERR("Error removing %s", fnm); + } + } + } + free(fnm); + closedir(dirp); +fileDel: st = unlink(dirnm); + if (st < 0 && errno == EISDIR) + st = rmdir(dirnm); + if (st < 0 && errno 
!= ENOENT) { + CRAY_ERR("Error removing %s", dirnm); + } +} + +void print_jobinfo(slurm_cray_jobinfo_t *job) +{ + int i; + + if (!job || (job->magic == CRAY_NULL_JOBINFO_MAGIC)) { + CRAY_ERR("job pointer was NULL"); + return; + } + + xassert(job->magic == CRAY_JOBINFO_MAGIC); + + info("Jobinfo magic %"PRIx32, job->magic); + info(" Job ID: %" PRIu32, job->jobid); + info(" Step ID: %" PRIu32, job->stepid); + info(" APID: %" PRIu64, job->apid); + info(" PMI Port: %" PRIu32, job->port); + info(" num_cookies: %" PRIu32, job->num_cookies); + info(" --- cookies ---"); + for (i = 0; i < job->num_cookies; i++) { + info(" cookies[%d]: %s", i, job->cookies[i]); + } + info(" --- cookie_ids ---"); + for (i = 0; i < job->num_cookies; i++) { + info(" cookie_ids[%d]: %" PRIu32, i, job->cookie_ids[i]); + } + info("--END Jobinfo--"); +} + +#endif /* HAVE_NATIVE_CRAY */