From 6de1fbad921b9b37dfde2ef0f0aae3fc652ad9f7 Mon Sep 17 00:00:00 2001 From: phung4 <phung4@unknown> Date: Tue, 14 Sep 2004 16:25:25 +0000 Subject: [PATCH] checking in alpha version of bluegene plugin --- src/plugins/select/bluegene/Makefile.am | 11 +- src/plugins/select/bluegene/Manifest | 16 + src/plugins/select/bluegene/README | 45 + src/plugins/select/bluegene/README.dev | 43 + .../select/bluegene/bgl_switch_connections.c | 313 +++++ .../select/bluegene/bgl_switch_connections.h | 133 ++ src/plugins/select/bluegene/bluegene.c | 1052 ++++++++++++++++ src/plugins/select/bluegene/bluegene.conf | 9 + src/plugins/select/bluegene/bluegene.h | 100 ++ src/plugins/select/bluegene/partition_sys.c | 1112 +++++++++++++++++ src/plugins/select/bluegene/partition_sys.h | 73 ++ src/plugins/select/bluegene/select_bluegene.c | 86 +- 12 files changed, 2978 insertions(+), 15 deletions(-) create mode 100644 src/plugins/select/bluegene/Manifest create mode 100644 src/plugins/select/bluegene/README create mode 100644 src/plugins/select/bluegene/README.dev create mode 100644 src/plugins/select/bluegene/bgl_switch_connections.c create mode 100644 src/plugins/select/bluegene/bgl_switch_connections.h create mode 100644 src/plugins/select/bluegene/bluegene.c create mode 100644 src/plugins/select/bluegene/bluegene.conf create mode 100644 src/plugins/select/bluegene/bluegene.h create mode 100755 src/plugins/select/bluegene/partition_sys.c create mode 100644 src/plugins/select/bluegene/partition_sys.h diff --git a/src/plugins/select/bluegene/Makefile.am b/src/plugins/select/bluegene/Makefile.am index 94bc766a695..4af2ef50cbd 100644 --- a/src/plugins/select/bluegene/Makefile.am +++ b/src/plugins/select/bluegene/Makefile.am @@ -3,16 +3,19 @@ AUTOMAKE_OPTIONS = foreign -PLUGIN_FLAGS = -module -avoid-version --export-dynamic +PLUGIN_FLAGS = -module -avoid-version --export-dynamic -lm INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common pkglib_LTLIBRARIES = select_bluegene.la # Blue Gene node 
selection plugin. -select_bluegene_la_SOURCES = select_bluegene.c -select_bluegene_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) +select_bluegene_la_SOURCES = select_bluegene.c \ + bluegene.c bluegene.h \ + partition_sys.c partition_sys.h \ + rm_api.h +select_bluegene_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) if HAVE_AIX -select_bluegene_la_LIBADD = $(top_builddir)/src/common/libcommon.la + select_bluegene_la_LIBADD = $(top_builddir)/src/common/libcommon.la endif diff --git a/src/plugins/select/bluegene/Manifest b/src/plugins/select/bluegene/Manifest new file mode 100644 index 00000000000..72b82a60980 --- /dev/null +++ b/src/plugins/select/bluegene/Manifest @@ -0,0 +1,16 @@ +Filename : description +README : description of the plugin and + configuration file +Makefile.am : autoconf Makefile +bluegene.conf : sample configuration file +select_bluegene.c : API for the select_plugin +bluegene.c : component used for parsing config + file, determining where jobs will + and other plugin logic. +bluegene.h : header file +partition_sys.c : component used for wiring up the + partitions +partition_sys.h : header file +bgl_switch_connections.c : interface for connecting wires in + the BGL system +bgl_switch_connections.h : header file diff --git a/src/plugins/select/bluegene/README b/src/plugins/select/bluegene/README new file mode 100644 index 00000000000..133fb21cef9 --- /dev/null +++ b/src/plugins/select/bluegene/README @@ -0,0 +1,45 @@ +DESCRIPTION +----------- + +This directory contains the select plugin specific for the Bluegene/L +(BGL) machine. The task of this plugin is to select the appropriate +nodes (base partitions) required for running jobs. + +CONFIGURATION +------------- + +You may specify the type of bgl partitions that were specified in the +slurm partition file in a bluegene.conf file. Otherwise, the default +behavior is to sort in decreasing order and try to fit the largest +size partitions as tori. 
+ +Each entry in the configuration file is a set of nodes and their +desired type. The set of nodes must match a set of nodes specified +for a BGL partition from within the SLURM partition. E.g. for a SLURM +partition entry in slurm.conf: + + PartitionName=batch222 Nodes=bgl[200x311] MinNodes=8 MaxNodes=8 + PartitionName=batch222 Nodes=bgl[220x331] # Add more nodes + PartitionName=batch222 Nodes=bgl[202x313] # Add more nodes + PartitionName=batch222 Nodes=bgl[222x333] # Add more nodes + +there may be a corresponding BGL partition specification: + + Nodes=bgl[200x311] Type=Mesh + Nodes=bgl[220x331] Type=Torus + Nodes=bgl[202x313] Type=Torus + Nodes=bgl[222x333] Type=Mesh + +Note that the configuration specified in this file may not be +physically possible due to the wiring constraints of the BGL system. A +tool will be provided that allows an admin to determine what +configurations are feasible. + + +PROBLEMS -------- + +If you experience problems compiling, installing, or running this +plugin please send email to either Dan Phung <phung4@llnl.gov> or +Morris Jette <jette@llnl.gov>. + +$Id$ diff --git a/src/plugins/select/bluegene/README.dev b/src/plugins/select/bluegene/README.dev new file mode 100644 index 00000000000..e0839bb91ea --- /dev/null +++ b/src/plugins/select/bluegene/README.dev @@ -0,0 +1,43 @@ +DESCRIPTION +----------- + +This file contains some (hopefully) helpful information when modifying +the bluegene select plugin. + +Here's the flow of the plugin + +init + doesn't do anything (now, Mon Sep 13 09:54:57 PDT 2004) + +select_p_part_init + reads in bluegene.conf and creates the BGL partitions. + +select_p_state_save + FIXME - saves bluegene state + +select_p_state_restore + FIXME - restores former bluegene state + +select_p_node_init + NO-OP + +select_p_job_test + tests if a partition can satisfy the required resources and assigns + them accordingly.
+ +select_p_job_init +select_p_job_fini + calls CMCS API to initialize/remove job related structures, as + appropriate. + +FIXME - DAN, put more info about what's happening in bluegene.c and +partition_sys here. + + +PROBLEMS -------- + +If you experience problems compiling, installing, or running this +plugin please send email to either Dan Phung <phung4@llnl.gov> or +Morris Jette <jette@llnl.gov>. + +$Id$ diff --git a/src/plugins/select/bluegene/bgl_switch_connections.c b/src/plugins/select/bluegene/bgl_switch_connections.c new file mode 100644 index 00000000000..2d23da858c5 --- /dev/null +++ b/src/plugins/select/bluegene/bgl_switch_connections.c @@ -0,0 +1,313 @@ +/*****************************************************************************\ + * bgl_switch_connections.c + * + ***************************************************************************** + * Copyright (C) 2004 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Dan Phung <phung4@llnl.gov> + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+\*****************************************************************************/
+
+/**
+ * Store three port connections on a switch and attach the switch to a
+ * partition.
+ *
+ * my_part   - partition the switch is attached to
+ * my_switch - switch that receives the three connections
+ * conn1..3  - connections written as the switch's first, second and third
+ *             connection
+ * first     - non-zero: attach the switch with RM_PartFirstSwitch;
+ *             zero: attach it with RM_PartNextSwitch
+ *
+ * NOTE(review): anything rm_set_data() may return is ignored here, so a
+ * failed call goes unreported -- confirm whether callers need to know.
+ */
+void _connect(rm_partition_t *my_part, rm_switch_t *my_switch,
+	      rm_connection_t *conn1, rm_connection_t *conn2, rm_connection_t *conn3,
+	      int first)
+{
+	rm_set_data(my_switch,RM_SwitchFirstConnection,conn1);
+	rm_set_data(my_switch,RM_SwitchSecondConnection,conn2);
+	rm_set_data(my_switch,RM_SwitchThirdConnection,conn3);
+
+	/* the only difference between the two cases is how the switch is
+	 * registered with the partition */
+	if (first){
+		rm_set_data(my_part,RM_PartFirstSwitch,my_switch);
+	} else {
+		rm_set_data(my_part,RM_PartNextSwitch,my_switch);
+	}
+}
+
+/**
+ * connect the given switch up in the "A" pattern
+ * (ports 0-2, 1-5 and 3-4, all marked used)
+ *      0  1
+ *   /--|--|--\
+ *   |  /  \  |
+ * 2 --/    \-- 5
+ *   |  /--\  |
+ *   \__|__|__/
+ *      3  4
+ */
+void connect_switch_A(rm_partition_t *my_part, rm_switch_t *my_switch,
+		      int first)
+{
+	rm_connection_t conn1, conn2, conn3;
+
+	conn1.p1 = RM_PORT_S0;
+	conn1.p2 = RM_PORT_S2;
+	conn1.part_id = NULL;
+	conn1.usage = RM_CONNECTION_USED;
+
+	conn2.p1 = RM_PORT_S1;
+	conn2.p2 = RM_PORT_S5;
+	conn2.part_id = NULL;
+	conn2.usage = RM_CONNECTION_USED;
+
+	conn3.p1 = RM_PORT_S3;
+	conn3.p2 = RM_PORT_S4;
+	conn3.part_id = NULL;
+	conn3.usage = RM_CONNECTION_USED;
+
+	/* must be the local helper, not the socket call connect() */
+	_connect(my_part, my_switch, &conn1, &conn2, &conn3, first);
+}
+
+/**
+ * connect the given switch up in the "B" pattern
+ * (ports 0-4, 1-3 and 2-5, all marked used)
+ *      0  1
+ *   /--|--|--\
+ *   |  \  /  |
+ * 2 ----\/---- 5
+ *   |   /\   |
+ *   \__|__|__/
+ *      3  4
+ */
+void connect_switch_B(rm_partition_t *my_part, rm_switch_t *my_switch,
+		      int first)
+{
+	rm_connection_t conn1, conn2, conn3;
+
+	conn1.p1 = RM_PORT_S0;
+	conn1.p2 = RM_PORT_S4;
+	conn1.part_id = NULL;
+	conn1.usage = RM_CONNECTION_USED;
+
+	conn2.p1 = RM_PORT_S1;
+	conn2.p2 = RM_PORT_S3;
+	conn2.part_id = NULL;
+	conn2.usage = RM_CONNECTION_USED;
+
+	conn3.p1 = RM_PORT_S2;
+	conn3.p2 = RM_PORT_S5;
+	conn3.part_id = NULL;
+	conn3.usage = RM_CONNECTION_USED;
+
+	_connect(my_part, my_switch, &conn1, &conn2, &conn3, first);
+}
+
+/**
+ * connect the given switch up in the "C" pattern
+ * (ports 0-4, 1-5 and 2-3, all marked used)
+ *      0  1
+ *   /--|--|--\
+ *   |  \  \  |
+ * 5 --\ \  \-- 2
+ *   |    \ \ |
+ *   \__|__|__/
+ *      3  4
+ */
+void connect_switch_C(rm_partition_t *my_part, rm_switch_t *my_switch,
+		      int first)
+{
+	rm_connection_t conn1, conn2, conn3;
+
+	conn1.p1 = RM_PORT_S0;
+	conn1.p2 = RM_PORT_S4;
+	conn1.part_id = NULL;
+	conn1.usage = RM_CONNECTION_USED;
+
+	conn2.p1 = RM_PORT_S1;
+	conn2.p2 = RM_PORT_S5;
+	conn2.part_id = NULL;
+	conn2.usage = RM_CONNECTION_USED;
+
+	conn3.p1 = RM_PORT_S2;
+	conn3.p2 = RM_PORT_S3;
+	conn3.part_id = NULL;
+	conn3.usage = RM_CONNECTION_USED;
+
+	_connect(my_part, my_switch, &conn1, &conn2, &conn3, first);
+}
+
+/**
+ * connect the given switch up in the "D" pattern
+ * (ports 0-2, 1-3 and 4-5, all marked used)
+ *      0  1
+ *   /--|--|--\
+ *   |  /  /  |
+ * 2 --/  / /-- 5
+ *   |   / /  |
+ *   \__|__|__/
+ *      3  4
+ */
+void connect_switch_D(rm_partition_t *my_part, rm_switch_t *my_switch,
+		      int first)
+{
+	rm_connection_t conn1, conn2, conn3;
+
+	conn1.p1 = RM_PORT_S0;
+	conn1.p2 = RM_PORT_S2;
+	conn1.part_id = NULL;
+	conn1.usage = RM_CONNECTION_USED;
+
+	conn2.p1 = RM_PORT_S1;
+	conn2.p2 = RM_PORT_S3;
+	conn2.part_id = NULL;
+	conn2.usage = RM_CONNECTION_USED;
+
+	conn3.p1 = RM_PORT_S4;
+	conn3.p2 = RM_PORT_S5;
+	conn3.part_id = NULL;
+	conn3.usage = RM_CONNECTION_USED;
+
+	_connect(my_part, my_switch, &conn1, &conn2, &conn3, first);
+}
+
+/**
+ * connect the given switch up in the "E" pattern (loopback)
+ * (ports 0-1, 2-5 and 3-4, all marked used)
+ *      0  1
+ *   /--|--|--\
+ *   |  \__/  |
+ * 2 ---------- 5
+ *   |  /--\  |
+ *   \__|__|__/
+ *      3  4
+ */
+void connect_switch_E(rm_partition_t *my_part, rm_switch_t *my_switch,
+		      int first)
+{
+	rm_connection_t conn1, conn2, conn3;
+
+	conn1.p1 = RM_PORT_S0;
+	conn1.p2 = RM_PORT_S1;
+	conn1.part_id = NULL;
+	conn1.usage = RM_CONNECTION_USED;
+
+	conn2.p1 = RM_PORT_S2;
+	conn2.p2 = RM_PORT_S5;
+	conn2.part_id = NULL;
+	conn2.usage = RM_CONNECTION_USED;
+
+	conn3.p1 = RM_PORT_S3;
+	conn3.p2 = RM_PORT_S4;
+	conn3.part_id = NULL;
+	conn3.usage = RM_CONNECTION_USED;
+
+	_connect(my_part, my_switch, &conn1, &conn2, &conn3, first);
+}
+
+/**
+ * connect the given switch up in the "F" pattern (loopback)
+ * (ports 0-1, 2-3 and 4-5, all marked used)
+ *      0  1
+ *   /--|--|--\
+ *   |  \__/  |
+ * 2 --\    /-- 5
+ *   |  \  /  |
+ *   \__|__|__/
+ *      3  4
+ */
+void connect_switch_F(rm_partition_t *my_part, rm_switch_t *my_switch,
+		      int first)
+{
+	rm_connection_t conn1, conn2, conn3;
+
+	conn1.p1 = RM_PORT_S0;
+	conn1.p2 = RM_PORT_S1;
+	conn1.part_id = NULL;
+	conn1.usage = RM_CONNECTION_USED;
+
+	conn2.p1 = RM_PORT_S2;
+	conn2.p2 = RM_PORT_S3;
+	conn2.part_id = NULL;
+	conn2.usage = RM_CONNECTION_USED;
+
+	conn3.p1 = RM_PORT_S4;
+	conn3.p2 = RM_PORT_S5;
+	conn3.part_id = NULL;
+	conn3.usage = RM_CONNECTION_USED;
+
+	_connect(my_part, my_switch, &conn1, &conn2, &conn3, first);
+}
+
+
+/**
+ * connect the node to the next node (higher up number)
+ * (only 1-3 is marked used; 0-2 and 4-5 are stored as not used, and the
+ * switch is always attached as a "next" switch)
+ *      0  1
+ *   /--|--|--\
+ *   |     /  |
+ * 2 -    /   - 5
+ *   |   /    |
+ *   \__|__|__/
+ *      3  4
+ */
+void connect_next(rm_partition_t *my_part, rm_switch_t *my_switch)
+{
+	rm_connection_t conn1, conn2, conn3;
+	int first = 0;
+
+	conn1.p1 = RM_PORT_S1;
+	conn1.p2 = RM_PORT_S3;
+	conn1.part_id = NULL;
+	conn1.usage = RM_CONNECTION_USED;
+
+	conn2.p1 = RM_PORT_S0;
+	conn2.p2 = RM_PORT_S2;
+	conn2.part_id = NULL;
+	conn2.usage = RM_CONNECTION_NOT_USED;
+
+	conn3.p1 = RM_PORT_S4;
+	conn3.p2 = RM_PORT_S5;
+	conn3.part_id = NULL;
+	conn3.usage = RM_CONNECTION_NOT_USED;
+
+	_connect(my_part, my_switch, &conn1, &conn2, &conn3, first);
+}
+
+/**
+ * connect the given switch up to the previous node
+ * (only 0-4 is marked used; 2-3 and 1-5 are stored as not used, and the
+ * switch is always attached as a "next" switch)
+ *      0  1
+ *   /--|--|--\
+ *   |  \     |
+ * 2 -   \    - 5
+ *   |    \   |
+ *   \__|__|__/
+ *      3  4
+ */
+void connect_prev(rm_partition_t *my_part, rm_switch_t *my_switch)
+{
+	rm_connection_t conn1, conn2, conn3;
+	int first = 0;
+
+	conn1.p1 = RM_PORT_S0;
+	conn1.p2 = RM_PORT_S4;
+	conn1.part_id = NULL;
+	conn1.usage = RM_CONNECTION_USED;
+
+	conn2.p1 = RM_PORT_S2;
+	conn2.p2 = RM_PORT_S3;
+	conn2.part_id = NULL;
+	conn2.usage = RM_CONNECTION_NOT_USED;
+
+	conn3.p1 = RM_PORT_S1;
+	conn3.p2 = RM_PORT_S5;
+	conn3.part_id = NULL;
+	conn3.usage = RM_CONNECTION_NOT_USED;
+
+	_connect(my_part, my_switch, &conn1, &conn2, &conn3, first);
+}
diff --git a/src/plugins/select/bluegene/bgl_switch_connections.h b/src/plugins/select/bluegene/bgl_switch_connections.h
new file mode 100644
index 00000000000..3952a53fa50
--- /dev/null
+++ b/src/plugins/select/bluegene/bgl_switch_connections.h
@@ -0,0 +1,133 @@
+/*****************************************************************************\
+ * bgl_switch_connections.h
+ *
+ *****************************************************************************
+ * Copyright (C) 2004 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Dan Phung <phung4@llnl.gov>
+ *
+ * This file is part of SLURM, a resource management program.
+ * For details, see <http://www.llnl.gov/linux/slurm/>.
+ *
+ * SLURM is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with SLURM; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+\*****************************************************************************/
+
+#ifndef _BGL_SWITCH_CONNECTIONS_H_
+#define _BGL_SWITCH_CONNECTIONS_H_
+
+/*
+ * NOTE(review): the connect_switch_A..F prototypes below take a leading
+ * "rm_BGL_t *bgl" argument, while the definitions in
+ * bgl_switch_connections.c take only (my_part, my_switch, first).  One side
+ * has to change before that file can include this header.
+ */
+
+/**
+ * connect the given switch up in the "A" pattern
+ *      0  1
+ *   /--|--|--\
+ *   |  /  \  |
+ * 2 --/    \-- 5
+ *   |  /--\  |
+ *   \__|__|__/
+ *      3  4
+ */
+void connect_switch_A(rm_BGL_t *bgl, rm_partition_t *my_part,
+		      rm_switch_t *my_switch,
+		      int first);
+
+/**
+ * connect the given switch up in the "B" pattern
+ *      0  1
+ *   /--|--|--\
+ *   |  \  /  |
+ * 2 ----\/---- 5
+ *   |   /\   |
+ *   \__|__|__/
+ *      3  4
+ */
+void connect_switch_B(rm_BGL_t *bgl, rm_partition_t *my_part, rm_switch_t *my_switch,
+		      int first);
+
+/**
+ * connect the given switch up in the "C" pattern
+ *      0  1
+ *   /--|--|--\
+ *   |  \  \  |
+ * 2 --\ \  \-- 5
+ *   |    \ \ |
+ *   \__|__|__/
+ *      3  4
+ */
+void connect_switch_C(rm_BGL_t *bgl, rm_partition_t *my_part, rm_switch_t *my_switch,
+		      int first);
+
+/**
+ * connect the given switch up in the "D" pattern
+ *      0  1
+ *   /--|--|--\
+ *   |  /  /  |
+ * 2 --/  / /-- 5
+ *   |   / /  |
+ *   \__|__|__/
+ *      3  4
+ */
+void connect_switch_D(rm_BGL_t *bgl, rm_partition_t *my_part, rm_switch_t *my_switch,
+		      int first);
+
+/**
+ * connect the given switch up in the "E" pattern (loopback)
+ *      0  1
+ *   /--|--|--\
+ *   |  \__/  |
+ * 2 ---------- 5
+ *   |  /--\  |
+ *   \__|__|__/
+ *      3  4
+ */
+void connect_switch_E(rm_BGL_t *bgl, rm_partition_t *my_part, rm_switch_t *my_switch,
+		      int first);
+
+/**
+ * connect the given switch up in the "F" pattern (loopback)
+ *      0  1
+ *   /--|--|--\
+ *   |  \__/  |
+ * 2 --\    /-- 5
+ *   |  \  /  |
+ *   \__|__|__/
+ *      3  4
+ */
+void connect_switch_F(rm_BGL_t *bgl, rm_partition_t *my_part, rm_switch_t *my_switch,
+		      int first);
+
+/**
+ * connect the node to the next node (higher up number)
+ *      0  1
+ *   /--|--|--\
+ *   |     /  |
+ * 2 -    /   - 5
+ *   |   /    |
+ *   \__|__|__/
+ *      3  4
+ */
+void connect_next(rm_partition_t *my_part, rm_switch_t *my_switch);
+
+/**
+ * connect the given switch up to the previous node
+ *      0  1
+ *   /--|--|--\
+ *   |  \     |
+ * 2 -   \    - 5
+ *   |    \   |
+ *   \__|__|__/
+ *      3  4
+ */
+void connect_prev(rm_partition_t *my_part, rm_switch_t *my_switch);
+
+#endif /* _BGL_SWITCH_CONNECTIONS_H_ */
diff --git a/src/plugins/select/bluegene/bluegene.c b/src/plugins/select/bluegene/bluegene.c
new file mode 100644
index 00000000000..b030ca60779
--- /dev/null
+++ b/src/plugins/select/bluegene/bluegene.c
@@ -0,0 +1,1052 @@
+/*****************************************************************************\
+ * bluegene.c - bgl node allocation plugin.
+ *****************************************************************************
+ * Copyright (C) 2003 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Dan Phung <phung4@llnl.gov>
+ *
+ * This file is part of SLURM, a resource management program.
+ * For details, see <http://www.llnl.gov/linux/slurm/>.
+ *
+ * SLURM is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with SLURM; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+\*****************************************************************************/ + +#include <stdlib.h> +#include "src/slurmctld/proc_req.h" +#include "src/common/list.h" +#include "src/common/read_config.h" +#include "src/common/parse_spec.h" +#include "bluegene.h" +#include "partition_sys.h" +#include "src/common/hostlist.h" +#define SYSTEM_DIMENSIONS 3 + +#define RANGE_MAX 8192 +#define BUF_SIZE 4096 + +static char* bgl_conf = "/home/phung4/root/etc/bluegene.conf"; + +/** some internally used functions */ + +/** */ +int _find_best_partition_match(struct job_record* job_ptr, bitstr_t* slurm_part_bitmap, + int min_nodes, int max_nodes, + int spec, bgl_record_t* found_bgl_record); +/** */ +int _parse_request(char* request_string, partition_t** request); +/** */ +int _get_request_dimensions(int* bl, int* tr, int** dim); +/** */ +int _extract_range(char* request, char** result); +/** */ +int _create_bgl_partitions(); +/** */ +void _get_bitmap(hostlist_t* hostlist, bitstr_t* bitmap); +/** */ +int _bgl_record_cmpf_inc(bgl_record_t* A, bgl_record_t* B); +/** */ +int _bgl_record_cmpf_dec(bgl_record_t* A, bgl_record_t* B); +/** + * to be used by list object to destroy the array elements + */ +void _bgl_record_destroy(void* object); +/** */ +void _bgl_conf_record_destroy(void* object); + +/** */ +void _print_bitmap(bitstr_t* bitmap); + +/** */ +void _process_config(); +/** */ +static int _parse_bgl_spec(char *in_line); +/** */ +static int _copy_slurm_partition_list(); +/** */ +int _find_part_type(char* nodes, rm_partition_t** return_part_type); +/** */ +static int _ListFindF_conf_part_record(bgl_conf_record_t* record, char *nodes); +/** */ +int _compute_part_size(char* nodes); + +/** + * create_static_partitions - create the static partitions that will be used + * for scheduling. 
+ * IN - (global, from slurmctld): the system and desired partition configurations
+ * OUT - (global, to slurmctld): Table of partitionIDs to geometries
+ * RET - SLURM_ERROR if the slurm partition list could not be copied,
+ *       SLURM_SUCCESS otherwise
+ */
+int create_static_partitions()
+{
+	/** purge the old list.  Later on, it may be more efficient just to amend the list */
+	if (bgl_list){
+		list_destroy(bgl_list);
+	}
+	bgl_list = list_create(_bgl_record_destroy);
+
+	/** copy the slurm conf partition info, this will fill in bgl_list */
+	if (_copy_slurm_partition_list()){
+		return SLURM_ERROR;
+	}
+
+	_process_config();
+	/* after reading in the configuration, we have a list of partition requests (List <int*>)
+	 * that we can use to partition up the system
+	 */
+	_create_bgl_partitions();
+
+	return SLURM_SUCCESS;
+}
+
+/**
+ * Walk bgl_list and call configure_switches() on the allocation request
+ * attached to each record.  A failure for one record is logged and the
+ * walk continues.
+ *
+ * IN - (global) bgl_list: list of bgl_record(s)
+ * RET - always SLURM_SUCCESS
+ */
+int _create_bgl_partitions()
+{
+	bgl_record_t* cur_record;
+	partition_t* cur_partition;
+	printf("bluegene::create_bgl_partitions\n");
+
+	ListIterator itr = list_iterator_create(bgl_list);
+	while ((cur_record = (bgl_record_t*) list_next(itr))) {
+		cur_partition = (partition_t*) cur_record->alloc_part;
+		/* no request is attached when its node range failed to parse */
+		if (cur_partition == NULL){
+			error("error on cur_record %s", cur_record->nodes);
+			continue;
+		}
+		if (configure_switches(cur_partition)){
+			error("error on cur_record %s", cur_record->nodes);
+		}
+	}
+	list_iterator_destroy(itr);
+
+	printf("create_bgl_partitions done\n");
+	return SLURM_SUCCESS;
+}
+
+/**
+ * process the slurm configuration to interpret BGL specific semantics:
+ * if MaxNodes == MinNodes == size (Nodes), = static partition, otherwise
+ *
+ * creates a List of allocation requests made up of partition_t's (see partition_sys.h)
+ *
+ * For every record in bgl_list the node range is parsed into a
+ * partition_t, which is then linked to the record in both directions.
+ * A record whose range cannot be parsed is logged and left with
+ * alloc_part == NULL.
+ */
+void _process_config()
+{
+	ListIterator itr;
+	bgl_record_t *bgl_part;
+	partition_t* request;
+
+	itr = list_iterator_create(bgl_list);
+	while ((bgl_part = (bgl_record_t*) list_next(itr))) {
+		request = NULL;
+		/**
+		 * parse request will fill up the partition_t's
+		 * bl_coord, tr_coord, dimensions, and size
+		 */
+		if (_parse_request(bgl_part->nodes, &request) || request == NULL){
+			error("_process_config: error parsing request %s\n", bgl_part->nodes);
+			/* nothing valid to attach: do not touch "request" */
+			bgl_part->alloc_part = NULL;
+			continue;
+		}
+
+		/**
+		 * bgl_part->part_type should have been extracted in
+		 * copy_slurm_partition_list
+		 */
+		request->part_type = (rm_partition_t*) bgl_part->part_type;
+		request->bgl_record_ptr = bgl_part;
+		bgl_part->alloc_part = request;
+	}
+	list_iterator_destroy(itr);
+}
+
+/* copy the current partition info that was read in from slurm.conf so
+ * that we can maintain our own separate table of bgl_part_id to
+ * slurm_part_id.
+ *
+ * Each comma separated node range of each slurm partition becomes one
+ * bgl_record_t pushed onto bgl_list.
+ *
+ * RET - SLURM_SUCCESS, or SLURM_ERROR if slurm_part_list is not set up or
+ *       memory for a record could not be obtained
+ */
+static int _copy_slurm_partition_list()
+{
+	struct part_record* slurm_part;
+	bgl_record_t* bgl_record;
+	ListIterator itr;
+	char* cur_nodes, *delimiter=",", *nodes_tmp, *next_ptr;
+	int rc = SLURM_SUCCESS;
+
+	if (!slurm_part_list){
+		error("_copy_slurm_partition_list: slurm_part_list is not initialized yet\n");
+		return SLURM_ERROR;
+	}
+	itr = list_iterator_create(slurm_part_list);
+	/**
+	 * try to find the corresponding bgl_conf_record for the
+	 * nodes specified in the slurm_part_list, but if not
+	 * found, _find_part_type will default to RM_MESH
+	 */
+	while (rc == SLURM_SUCCESS &&
+	       (slurm_part = (struct part_record *) list_next(itr))) {
+		/* a partition without a node list has nothing to wire up */
+		if (!slurm_part->nodes)
+			continue;
+
+		/* strtok_r() writes into its input, so work on a private copy */
+		nodes_tmp = strdup(slurm_part->nodes);
+		if (!nodes_tmp){
+			error("_copy_slurm_partition_list: not enough memory for bgl_record "
+			      "for node %s", slurm_part->nodes);
+			rc = SLURM_ERROR;
+			break;
+		}
+
+		/** debugging info */
+		{
+			debug("current slurm nodes to parse <%s>\n", slurm_part->nodes);
+			debug("slurm_part->node_bitmap");
+			_print_bitmap(slurm_part->node_bitmap);
+		}
+		/**
+		 * for each of the slurm partitions, there may be
+		 * several bgl partitions, so we need to find how to
+		 * wire each of those bgl partitions.
+		 */
+		for (cur_nodes = strtok_r(nodes_tmp, delimiter, &next_ptr);
+		     cur_nodes != NULL;
+		     cur_nodes = strtok_r(NULL, delimiter, &next_ptr)) {
+			bgl_record = (bgl_record_t*) xmalloc(sizeof(bgl_record_t));
+			if (!bgl_record){
+				error("_copy_slurm_partition_list: not enough memory for bgl_record "
+				      "for node %s", cur_nodes);
+				rc = SLURM_ERROR;
+				break;
+			}
+
+			bgl_record->nodes = strdup(cur_nodes);
+			/* borrowed: this string stays owned by the slurm partition record */
+			bgl_record->slurm_part_id = slurm_part->name;
+			bgl_record->used = 0;
+
+			/* _find_part_type() allocates part_type itself, so it must
+			 * not be pre-allocated here (that copy would be leaked) */
+			if (_find_part_type(cur_nodes, &bgl_record->part_type)){
+				error("_copy_slurm_partition_list: not enough memory for bgl_record->part_type");
+				free(bgl_record->nodes);
+				xfree(bgl_record);
+				rc = SLURM_ERROR;
+				break;
+			}
+
+			bgl_record->hostlist = (hostlist_t *) xmalloc(sizeof(hostlist_t));
+			*(bgl_record->hostlist) = hostlist_create(cur_nodes);
+			bgl_record->size = hostlist_count(*(bgl_record->hostlist));
+			if (node_name2bitmap(cur_nodes, false, &(bgl_record->bitmap))){
+				error("unable to convert nodes %s to bitmap", cur_nodes);
+			}
+
+			/* a slurm partition pinned to exactly this many nodes is static */
+			if (slurm_part->min_nodes == slurm_part->max_nodes &&
+			    bgl_record->size == slurm_part->max_nodes)
+				bgl_record->part_lifecycle = STATIC;
+			else
+				bgl_record->part_lifecycle = DYNAMIC;
+
+			/* the list now owns the record (see the list's destroy function) */
+			list_push(bgl_list, bgl_record);
+		}
+		/* every token has been copied, so the scratch buffer can go */
+		free(nodes_tmp);
+	}
+	list_iterator_destroy(itr);
+	return rc;
+}
+
+/*
+ * read_bgl_conf - read the bluegene configuration file line by line and
+ *	hand every line, with comments stripped, to _parse_bgl_spec().
+ * RET - E2BIG if a line does not fit in the line buffer, otherwise the
+ *	result of parsing the last line (SLURM_SUCCESS for an empty file).
+ *	A file that cannot be opened is reported through fatal().
+ */
+int read_bgl_conf()
+{
+	DEF_TIMERS;
+	FILE *bgl_spec_file;	/* pointer to input data file */
+	int line_num;		/* line number in input file */
+	char in_line[BUF_SIZE];	/* input line */
+	int i, j, error_code = SLURM_SUCCESS;
+
+	/* initialization */
+	START_TIMER;
+	/* bgl_conf is the file-scope path defined near the top of this file */
+	bgl_spec_file = fopen(bgl_conf, "r");
+	if (bgl_spec_file == NULL)
+		fatal("read_bgl_conf error opening file %s, %m",
+		      bgl_conf);
+
+	/* process the data file */
+	line_num = 0;
+	while (fgets(in_line, BUF_SIZE, bgl_spec_file) != NULL) {
+		line_num++;
+		if (strlen(in_line) >= (BUF_SIZE - 1)) {
+			error("_read_bgl_config line %d, of input file %s "
+			      "too long",
+			      line_num, bgl_conf);
+			fclose(bgl_spec_file);
+			return E2BIG;
+		}
+
+		/* everything after a non-escaped "#" is a comment */
+		/* replace comment flag "#" with an end of string (NUL) */
+		/* escape sequence "\#" translated to "#" */
+		for (i = 0; i < BUF_SIZE; i++) {
+			if (in_line[i] == '\0')
+				break;
+			if (in_line[i] != '#')
+				continue;
+			if ((i > 0) && (in_line[i - 1] == '\\')) {
+				for (j = i; j < BUF_SIZE; j++) {
+					in_line[j - 1] = in_line[j];
+				}
+				/* the text moved left by one: look at this
+				 * index again so no character is skipped */
+				i--;
+				continue;
+			}
+			in_line[i] = '\0';
+			break;
+		}
+
+		/* parse what is left, non-comments */
+		/* partition configuration parameters */
+		if ((error_code = _parse_bgl_spec(in_line))) {
+			error("_parse_bgl_spec error, skipping this line\n");
+
+		}
+
+		/* report any leftover strings on input line */
+		report_leftover(in_line, line_num);
+	}
+	fclose(bgl_spec_file);
+
+	END_TIMER;
+	debug("select_bluegene _read_bgl_conf: finished loading configuration %s",
+	      TIME_STR);
+
+	return error_code;
+}
+
+/*
+ * phung: edited to piggy back on this function to also allow configuration
+ * option of the partition (ie, you can specify the config to be a 2x2x2
+ * partition.
+ *
+ * _parse_bgl_spec - parse one "Nodes=... Type=..." line of the bluegene
+ *	configuration and append the result to bgl_conf_list
+ * IN/OUT in_line - line from the configuration file, parsed keywords
+ *	and values replaced by blanks
+ * RET 0 if no error (including a line that names no nodes or no type,
+ *	which is ignored), error code otherwise
+ * Note: Operates on common variables
+ * global: bgl_conf_list - list of bgl_conf_record_t
+ */
+static int _parse_bgl_spec(char *in_line)
+{
+	int error_code = SLURM_SUCCESS;
+	char *nodes = NULL, *part_type = NULL;
+	bgl_conf_record_t* new_record = NULL;
+
+	error_code = slurm_parser(in_line,
+				  "Nodes=", 's', &nodes,
+				  "Type=", 's', &part_type,
+				  "END");
+
+	/** error if you're not specifying nodes or partition type on
+	    this line */
+	if (error_code || !nodes || !part_type)
+		goto cleanup;
+
+	new_record = (bgl_conf_record_t*) xmalloc(sizeof(bgl_conf_record_t));
+	if (!new_record){
+		error("_parse_bgl_spec: not enough memory for new_record");
+		error_code = SLURM_ERROR;
+		goto cleanup;
+	}
+
+	/* the record keeps its own copies; the parsed strings are released below */
+	new_record->nodes = strdup(nodes);
+	new_record->part_type = malloc(sizeof(rm_partition_t));
+	if (!new_record->nodes || !new_record->part_type){
+		error("_parse_bgl_spec: not enough memory for new_record");
+		free(new_record->nodes);
+		free(new_record->part_type);
+		xfree(new_record);
+		error_code = SLURM_ERROR;
+		goto cleanup;
+	}
+
+	if (strcasecmp(part_type, "TORUS") == 0){
+		/** FIXME */
+		*(new_record->part_type) = RM_TORUS;
+	} else if (strcasecmp(part_type, "MESH") == 0) {
+		*(new_record->part_type) = RM_MESH;
+	} else {
+		error("_parse_bgl_spec: partition type %s invalid for nodes %s",
+		      part_type, nodes);
+		error("defaulting to type: MESH");
+		*(new_record->part_type) = RM_MESH;
+	}
+	list_push(bgl_conf_list, new_record);
+
+ cleanup:
+	/* released on every path; previously they leaked on success */
+	xfree(nodes);
+	xfree(part_type);
+	return error_code;
+}
+
+/**
+ * list destructor for bgl_record_t (registered on bgl_list)
+ *
+ * Members are released with the allocator that produced them in this
+ * file: free() for strdup()/malloc() memory, xfree() for xmalloc() memory.
+ */
+void _bgl_record_destroy(void* object)
+{
+	bgl_record_t* this_record = (bgl_record_t*) object;
+	if (!this_record)
+		return;
+
+	/* slurm_part_id is not freed: _copy_slurm_partition_list() stores the
+	 * slurm partition's own name pointer there.
+	 * NOTE(review): confirm no other producer hands over an owned copy. */
+
+	free(this_record->nodes);		/* strdup() */
+	free(this_record->part_type);		/* malloc() in _find_part_type() */
+	if (this_record->hostlist){
+		hostlist_destroy(*(this_record->hostlist));
+		xfree(this_record->hostlist);	/* the xmalloc()'ed holder */
+	}
+	if (this_record->bitmap)
+		bit_free(this_record->bitmap);
+
+	/* NOTE(review): alloc_part (malloc()'ed in _parse_request()) is not
+	 * released here because it is unclear from this file whether
+	 * configure_switches() keeps a reference to it. */
+
+#ifdef _RM_API_H__
+	if (this_record->bgl_part_id)
+		xfree(this_record->bgl_part_id);
+#endif
+
+	xfree(this_record);
+}
+
+/**
+ * list destructor for bgl_conf_record_t (registered on bgl_conf_list)
+ */
+void _bgl_conf_record_destroy(void* object)
+{
+	bgl_conf_record_t* this_record = (bgl_conf_record_t*) object;
+	if (!this_record)
+		return;
+
+	free(this_record->nodes);	/* strdup() in _parse_bgl_spec() */
+	free(this_record->part_type);	/* malloc() in _parse_bgl_spec() */
+	xfree(this_record);
+}
+
+/**
+ * search through the list of nodes,types to find the partition type
+ * for the given nodes
+ *
+ * OUT return_part_type - set to a newly malloc()'ed rm_partition_t holding
+ *	the configured type, or RM_MESH when the nodes have no entry in
+ *	bgl_conf_list; the caller owns it and releases it with free()
+ * RET SLURM_SUCCESS, or SLURM_ERROR if the allocation failed
+ */
+int _find_part_type(char* nodes, rm_partition_t** return_part_type)
+{
+	bgl_conf_record_t* record = NULL;
+
+	/* without a configuration list every lookup falls back to the default */
+	if (bgl_conf_list)
+		record = (bgl_conf_record_t*) list_find_first(bgl_conf_list,
+					(ListFindF) _ListFindF_conf_part_record,
+					nodes);
+
+	*return_part_type = (rm_partition_t*) malloc(sizeof(rm_partition_t));
+	if (!(*return_part_type)) {
+		error("_find_part_type: not enough memory for return_part_type");
+		return SLURM_ERROR;
+	}
+
+	if (record != NULL && record->part_type != NULL){
+		**return_part_type = *(record->part_type);
+	} else {
+		**return_part_type = RM_MESH;
+	}
+
+	return SLURM_SUCCESS;
+}
+
+/** list match function: true when record->nodes equals nodes, ignoring
+ *  case.  nodes example: 000x111 */
+static int _ListFindF_conf_part_record(bgl_conf_record_t* record, char *nodes)
+{
+	return (!strcasecmp(record->nodes, nodes));
+}
+
+/** number of hosts named by the given node expression */
+int _compute_part_size(char* nodes)
+{
+	int size;
+	/* nhosts is stored as int, hopefully unsigned 32-bit */
+	hostlist_t hosts = hostlist_create(nodes);
+	size = (int) hostlist_count(hosts);
+	hostlist_destroy(hosts);
+	return size;
+}
+
+/**
+ * converts a request of form ABCxXYZ to two int*'s
+ * of bl[ABC] and tr[XYZ].
+ *
+ * OUT bl, tr - on success each points to an xmalloc()'ed array of
+ *	SYSTEM_DIMENSIONS ints that the caller releases with xfree();
+ *	on failure both are set to NULL
+ * RET SLURM_SUCCESS, or SLURM_ERROR if memory ran out or either half of
+ *	the request is missing or shorter than SYSTEM_DIMENSIONS characters
+ */
+int char2intptr(char* request, int** bl, int** tr)
+{
+	int i;
+	char *request_tmp, *delimit = "x,", *next_ptr;
+	char zero = '0';
+	char* token;
+
+	if (!request || !bl || !tr){
+		error("char2intptr: not enough memory for char2intptr");
+		return SLURM_ERROR;
+	}
+	*bl = NULL;
+	*tr = NULL;
+
+	/* strtok_r() writes into its input, so tokenize a private copy */
+	request_tmp = strdup(request);
+	if (!request_tmp){
+		error("char2intptr: not enough memory for char2intptr");
+		return SLURM_ERROR;
+	}
+	(*bl) = (int*) xmalloc(sizeof(int) * SYSTEM_DIMENSIONS);
+	(*tr) = (int*) xmalloc(sizeof(int) * SYSTEM_DIMENSIONS);
+	if (!(*bl) || !(*tr)){
+		error("char2intptr: not enough memory for char2intptr");
+		free(request_tmp);
+		xfree(*bl);
+		xfree(*tr);
+		*bl = NULL; *tr = NULL;
+		return SLURM_ERROR;
+	}
+
+	token = strtok_r(request_tmp, delimit, &next_ptr);
+	/* a short token would be read past its terminating NUL below */
+	if (token == NULL || strlen(token) < SYSTEM_DIMENSIONS)
+		goto cleanup;
+
+	for (i=0; i<SYSTEM_DIMENSIONS; i++){
+		(*bl)[i] = (int)(token[i]-zero);
+	}
+
+	/* continue in the same buffer; request_tmp must keep pointing at
+	 * the start of the allocation so it can be freed */
+	token = strtok_r(NULL, delimit, &next_ptr);
+	if (token == NULL || strlen(token) < SYSTEM_DIMENSIONS)
+		goto cleanup;
+
+	for (i=0; i<SYSTEM_DIMENSIONS; i++){
+		(*tr)[i] = (int)(token[i]-zero);
+	}
+	free(request_tmp);
+	return SLURM_SUCCESS;
+
+ cleanup:
+	error("char2intptr request string insufficient dimensions");
+	free(request_tmp);
+	/* release the arrays, not the caller's pointer variables */
+	xfree(*bl);
+	xfree(*tr);
+	*bl = NULL; *tr = NULL;
+	return SLURM_ERROR;
+}
+
+/**
+ * Build a partition_t from a node expression whose range is of the form
+ * ABCxXYZ.
+ *
+ * OUT request - on success points to a malloc()'ed partition_t with
+ *	bl_coord, tr_coord, dimensions and size filled in; on failure it is
+ *	set to NULL
+ * RET SLURM_SUCCESS or SLURM_ERROR
+ */
+int _parse_request(char* request_string, partition_t** request)
+{
+	char* range;
+	int *bl=NULL, *tr=NULL, *dim=NULL;
+	int i;
+	(*request) = (partition_t*) malloc(sizeof(partition_t));
+	if (!(*request)) {
+		error("parse_request: not enough memory for request");
+		return SLURM_ERROR;
+	}
+
+	/** token needs to be of the form 000x000 */
+	/* NOTE(review): _extract_range() is not fully visible here, so it is
+	 * unknown whether "range" is allocated and must be released. */
+	if(_extract_range(request_string, &range))
+		goto cleanup;
+
+	if (char2intptr(range, &bl, &tr) || bl == NULL || tr == NULL ||
+	    _get_request_dimensions(bl, tr, &dim)){
+		goto cleanup;
+	}
+
+	/** place all the correct values into the request */
+	for (i=0; i<SYSTEM_DIMENSIONS; i++){
+		(*request)->bl_coord[i] = bl[i];
+		(*request)->tr_coord[i] = tr[i];
+		(*request)->dimensions[i] = dim[i];
+	}
+
+	(*request)->size = intArray_size(dim);
+
+	/* the values were copied into the request; drop the scratch arrays */
+	xfree(bl);
+	xfree(tr);
+	free(dim);
+	return SLURM_SUCCESS;
+
+ cleanup:
+	xfree(bl);	/* xmalloc()'ed by char2intptr() */
+	xfree(tr);
+	free(dim);	/* malloc()'ed by _get_request_dimensions() */
+	/* release the partition itself, not the caller's pointer variable */
+	free(*request);
+	*request = NULL;
+	return SLURM_ERROR;
+}
+
+/**
+ * Compute the extent of a request in every dimension.
+ *
+ * IN bl, tr - bottom-left and top-right coordinates
+ * OUT dim - on success a malloc()'ed array of SYSTEM_DIMENSIONS ints
+ *	holding tr[i] - bl[i] + 1; the caller releases it with free()
+ * RET SLURM_SUCCESS, or SLURM_ERROR for NULL input, lack of memory or an
+ *	extent that is not positive
+ */
+int _get_request_dimensions(int* bl, int* tr, int** dim)
+{
+	int i;
+	if (bl == NULL || tr == NULL){
+		return SLURM_ERROR;
+	}
+
+	*dim = (int*) malloc(sizeof(int)*SYSTEM_DIMENSIONS);
+	if (!(*dim)) {
+		error("get_request_dimensions: not enough memory for dim");
+		return SLURM_ERROR;
+	}
+	for (i=0; i<SYSTEM_DIMENSIONS; i++){
+		(*dim)[i] = tr[i] - bl[i] + 1; /* plus one because we're
+						  counting current
+						  number, so 0 to 1 = 2
+					       */
+		if ((*dim)[i] <= 0){
+			error("_get_request_dimensions: tr dimension less than bl dimension.");
+			goto cleanup;
+		}
+	}
+	return SLURM_SUCCESS;
+
+ cleanup:
+	free(*dim);
+	*dim = NULL;
+	return SLURM_ERROR;
+}
+
+/**
+ * Create the (global) list of configuration records and, when the
+ * resource manager API is compiled in, fetch the BGL handle.
+ */
+int init_bgl()
+{
+	/** global variable */
+	bgl_conf_list = (List) list_create(_bgl_conf_record_destroy);
+
+#ifdef _RM_API_H__
+	int rc = rm_get_BGL(&bgl);
+	if (rc != STATUS_OK){
+		error("init_bgl: rm_get_BGL failed\n");
+		return SLURM_ERROR;
+	}
+#endif
+
+	return SLURM_SUCCESS;
+}
+
+int _extract_range(char* request, char** result)
+{
+ int RANGE_SIZE = 7; /* expecting something of the size: 000x000 = 7 chars */ + int i, my_i, request_length; + int start = 0, end = 0; + + if (!request) + return 1; + if (!(*result)) { + *result = (char*) malloc(sizeof(RANGE_SIZE)); + if (!(*result)) { + error("_extract_range: not enough memory for *result"); + return SLURM_ERROR; + } + } + + request_length = strlen(request); + + for(i=0, my_i=0; i<request_length; i++){ + if (request[i] == ']'){ + (*result)[ (my_i) ] = '\0'; + end = 1; + break; + } + + if (start) + (*result)[ (my_i++) ] = request[i]; + + if (request[i] == '[') + start = 1; + } + + if (!start || !end) + goto cleanup; + + return SLURM_SUCCESS; + + cleanup: + free(*result); + *result = NULL; + return SLURM_ERROR; +} + +void print_bgl_record(bgl_record_t* record) +{ + if (!record){ + error("print_bgl_record, record given is null"); + } + + debug(" bgl_record:"); + debug(" \tslurm_part_id: %s", record->slurm_part_id); + if (record->bgl_part_id) + debug(" \tbgl_part_id: %d", *(record->bgl_part_id)); + debug(" \tnodes: %s", record->nodes); + // debug(" size: %d", record->size); + debug(" \tlifecycle: %s", convert_lifecycle(record->part_lifecycle)); + debug(" \tpart_type: %s", convert_part_type(record->part_type)); + + if (record->hostlist){ + char buffer[RANGE_MAX]; + hostlist_ranged_string(*(record->hostlist), RANGE_MAX, buffer); + debug(" \thostlist %s", buffer); + } + + if (record->alloc_part){ + debug(" \talloc_part:"); + printPartition(record->alloc_part); + } else { + debug(" \talloc_part: NULL"); + } + + if (record->bitmap){ + int bitsize = 128; + char* bitstring = (char*) malloc(sizeof(char)*bitsize); + bit_fmt(bitstring, bitsize, record->bitmap); + debug("\tbitmap: %s", bitstring); + } +} + +char* convert_lifecycle(lifecycle_type_t lifecycle) +{ + if (lifecycle == DYNAMIC) + return "DYNAMIC"; + else + return "STATIC"; +} + +char* convert_part_type(rm_partition_t* pt) +{ + switch(*pt) { + case (RM_MESH): + return "RM_MESH"; + case (RM_TORUS): + 
return "RM_TORUS"; + case (RM_NAV): + return "RM_NAV"; + + default: + break; + } + return ""; +} + +/** + * finds the best match for a given job request + * + * IN - int spec right now holds the place for some type of + * specification as to the importance of certain job params, for + * instance, geometry, type, size, etc. + * + * OUT - part_id of matched partition, NULL otherwise + * returns 1 for error (no match) + * + */ +int _find_best_partition_match(struct job_record* job_ptr, bitstr_t* slurm_part_bitmap, + int min_nodes, int max_nodes, + int spec, bgl_record_t* found_bgl_record) +{ + /** FIXME, need to get all the partition_t's in a list, or a common data structure + * that holds all that info I need!!! + */ + ListIterator itr; + bgl_record_t* record; + int i, num_dim_best, cur_dim_match; + int* geometry = NULL; + bitstr_t* bitcpy; + sort_bgl_record_inc_size(bgl_list); + + /** this is where we should have the control flow depending on + the spec arguement*/ + num_dim_best = 0; + itr = list_iterator_create(bgl_list); + found_bgl_record = NULL; + /* NEED TO PUT THIS LOGIC IN: + * if RM_NAV, then the partition with both the TORUS and the + * dims should be favored over the MESH and the dims, but + * foremost is the correct num of dims. + */ + while ((record = (bgl_record_t*) list_next(itr))) { + if (!record){ + error("FIXME: well, bad bad bad..."); + continue; + } + if (record->used){ + debug("this record used"); + continue; + } + /** + * first we check against the bitmap to see + * if this partition can be used for this job. + * + * the slurm partition bitmap is a superset of the bgl part bitmap + * + * - if we AND the incoming slurm bitmap with the bgl + * bitmap, and the bgl bitmap is different that should + * mean that some nodes in the slurm bitmap have been + * "drained" or set otherwise unusable. 
+ */ + debug("- - - - - - - - - - - - -"); + debug("check partition bitmap"); + debug("- - - - - - - - - - - - -"); + bitcpy = bit_copy(record->bitmap); + debug("copy before"); + // 0000 0011 + _print_bitmap(bitcpy); + /* this fxn mutates first argument */ + // 0000 0011 & 0000 0000 => 0000 0000 + // 0000 0011 | 0000 0000 => 0000 0011 + bit_and(bitcpy, slurm_part_bitmap); + debug("slurm bit"); + _print_bitmap(slurm_part_bitmap); + debug("copy after"); + _print_bitmap(bitcpy); + debug("bgl"); + _print_bitmap(record->bitmap); + debug("equals? %d", (bit_equal(bitcpy, record->bitmap))); + if (!bit_equal(bitcpy, record->bitmap)){ + debug("bgl partition %s unusable", record->nodes); + continue; + } + /** ?? FIXME */ + bit_free(bitcpy); + + debug("This partition matched!!!"); + debug("- - - - - - - - - - - - -"); + /*******************************************/ + /** check that the number of nodes match */ + /*******************************************/ + debug("nodes num match: max %d min %d record_num_nodes %d", + max_nodes, min_nodes, record->size); + if (record->size < min_nodes || (max_nodes != 0 && record->size > max_nodes)){ + error("debug request num nodes doesn't fit"); + continue; + } + + /***********************************************/ + /* check the connection type specified matches */ + /***********************************************/ + debug("part type match %s ? 
%s", convert_part_type(&job_ptr->type), + convert_part_type(record->part_type)); + if (!record->part_type){ + error("find_best_partition_match record->part_type is NULL"); + continue; + } + if (job_ptr->type != *(record->part_type) && + job_ptr->type != RM_NAV){ + continue; + } + + /*****************************************/ + /** match up geometry as "best" possible */ + /*****************************************/ + if (job_ptr->geometry[0] == 0){ + debug("find_best_partitionmatch: we don't care about geometry"); + found_bgl_record = record; + break; + } + if (job_ptr->rotate) + rotate_part(job_ptr->geometry, &geometry); + + cur_dim_match = 0; + for (i=0; i<SYSTEM_DIMENSIONS; i++){ + if (!record->alloc_part) { + error("warning, bgl_record %s has not found a home...", + record->nodes); + continue; + } + + /** + * we should distinguish between an exact match and a + * fuzzy match (being greater than + */ + if (record->alloc_part->dimensions[i] >= job_ptr->geometry[i]){ + cur_dim_match++; + } + } + + if (cur_dim_match > num_dim_best){ + found_bgl_record = record; + num_dim_best = cur_dim_match; + if (num_dim_best == SYSTEM_DIMENSIONS) + break; + } + } + + /** set the bitmap and do other allocation activities */ + if (found_bgl_record){ + bitcpy = bit_copy(found_bgl_record->bitmap); + debug("phung: SUCCESS! found partition %s <%s>", + found_bgl_record->slurm_part_id, found_bgl_record->nodes); + found_bgl_record->used = 1; + debug("- - - - - - - - - - - - -"); + debug("setting return bitmap"); + debug("- - - - - - - - - - - - -"); + // bit_not(bitcpy); + bit_not(slurm_part_bitmap); + debug("not copy: "); + _print_bitmap(bitcpy); + debug("slurm before: "); + _print_bitmap(slurm_part_bitmap); + bit_or(slurm_part_bitmap, bitcpy); + debug("slurm after: "); + _print_bitmap(slurm_part_bitmap); + + debug("- - - - - - - - - - - - -"); + /** ?? FIXME */ + bit_free(bitcpy); + return SLURM_SUCCESS; + } + + debug("phung: FAILURE! 
no bgl record found"); + return SLURM_ERROR; +} + +/** + * Comparator used for sorting partitions smallest to largest + * + * returns: -1: A greater than B 0: A equal to B 1: A less than B + * + */ +int _bgl_record_cmpf_inc(bgl_record_t* A, bgl_record_t* B) +{ + if (A->size < B->size) + return -1; + else if (A->size > B->size) + return 1; + else + return 0; +} + +/** + * Comparator used for sorting partitions largest to smallest + * + * returns: -1: A greater than B 0: A equal to B 1: A less than B + * + */ +int _bgl_record_cmpf_dec(bgl_record_t* A, bgl_record_t* B) +{ + if (A->size > B->size) + return -1; + else if (A->size < B->size) + return 1; + else + return 0; +} + +/** + * sort the partitions by increasing size + */ +void sort_bgl_record_inc_size(List records){ + if (records == NULL) + return; + list_sort(records, (ListCmpF) _bgl_record_cmpf_inc); +} + +/** + * sort the partitions by decreasing size + */ +void sort_bgl_record_dec_size(List records){ + if (records == NULL) + return; + list_sort(records, (ListCmpF) _bgl_record_cmpf_dec); +} + +/** + * + */ +int submit_job(struct job_record *job_ptr, bitstr_t *slurm_part_bitmap, + int min_nodes, int max_nodes) +{ + int spec = 1; // this will be like, keep TYPE a priority, etc, blah blah. 
+ + ListIterator itr; + bgl_record_t* record; + debug("bluegene::submit_job"); + + itr = list_iterator_create(bgl_list); + while ((record = (bgl_record_t*) list_next(itr))) { + print_bgl_record(record); + } + + debug("job request"); + debug("geometry:\t%d %d %d", job_ptr->geometry[0], job_ptr->geometry[1], job_ptr->geometry[2]); + debug("type:\t%s", convert_part_type(&job_ptr->type)); + debug("rotate:\t%d", job_ptr->rotate); + debug("min_nodes:\t %d", min_nodes); + debug("max_nodes:\t%d", max_nodes); + _print_bitmap(slurm_part_bitmap); + + if (_find_best_partition_match(job_ptr, slurm_part_bitmap, min_nodes, max_nodes, + spec, record)){ + return SLURM_ERROR; + } else { + ; // now we place the part_id into the env of the script to run + } + + /** we should do the BGL stuff here like, init BGL job stuff... */ + debug("return slurm partition bitmap"); + _print_bitmap(slurm_part_bitmap); + return SLURM_SUCCESS; +} + +/** + * for my debugging purposes, I occasionally print out the bitmap + */ +void _print_bitmap(bitstr_t* bitmap) +{ + int bitsize = 128; + char* bitstring = (char*) malloc(sizeof(char)*bitsize); + bit_fmt(bitstring, bitsize, bitmap); + debug("bitmap:\t%s", bitstring); + free(bitstring); +} + +/** + * global - bgl: + */ +void update_bgl_node_bitmap(bitstr_t* bitmap) +{ +#ifdef _RM_API_H__ + int bp_num,wire_num,switch_num,i; + rm_BP_t *my_bp; + rm_switch_t *my_switch; + rm_wire_t *my_wire; + rm_size3D_t bp_size,size_in_bp,m_size; + + if (!bgl){ + error("error, BGL is not initialized"); + } + + printf("---------rm_get_BGL------------\n"); + rm_get_data(bgl,RM_BPsize,&bp_size); + rm_get_data(bgl,RM_Msize,&m_size); + + printf("BP Size = (%d x %d x %d)\n",bp_size.X,bp_size.Y,bp_size.Z); + + rm_get_data(bgl,RM_BPNum,&bp_num); + printf("- - - - - BPS (%d) - - - - - -\n",bp_num); + + for(i=0;i<bp_num;i++){ + if(i==0) + rm_get_data(bgl,RM_FirstBP,&my_bp); + else + rm_get_data(bgl,RM_NextBP,&my_bp); + rm_BP_state_t bp_state; + 
rm_get_data(my_bp,RM_BPState,&bp_state); + /* from here we either update the node or bitmap + entry */ + } + +#endif +} + + +#ifdef _RM_API_H__ +/** */ +char *convert_bp_state(rm_BP_state_t state){ + switch(state){ + case RM_BP_UP: + return "RM_BP_UP"; + break; + case RM_BP_DOWN: + return "RM_BP_DOWN"; + break; + case RM_BP_NAV: + return "RM_BP_NAV"; + defalt: + return "BP_STATE_UNIDENTIFIED!"; + } +}; +#endif + +int _bitmap_notequals(bitstr_t* A, bitstr_t* B) +{ + return !(bit_super_set(A, B) && bit_super_set(B, A)); +} diff --git a/src/plugins/select/bluegene/bluegene.conf b/src/plugins/select/bluegene/bluegene.conf new file mode 100644 index 00000000000..7ddcd23119f --- /dev/null +++ b/src/plugins/select/bluegene/bluegene.conf @@ -0,0 +1,9 @@ + +# Sample Bluegene/L plugin configuration file. See README for +# description. the BGL parititon nodes must match preconfigured SLURM +# partitions. + +Nodes=bgl[400x511] Type=Mesh +Nodes=bgl[600x711] Type=Torus +Nodes=bgl[422x531] Type=Torus +Nodes=bgl[622x731] Type=Mesh diff --git a/src/plugins/select/bluegene/bluegene.h b/src/plugins/select/bluegene/bluegene.h new file mode 100644 index 00000000000..c7dab7272cb --- /dev/null +++ b/src/plugins/select/bluegene/bluegene.h @@ -0,0 +1,100 @@ +#ifndef _BLUEGENE_H_ +#define _BLUEGENE_H_ + +/*****************************************************************************\ + * bluegene.h - header for bgl node allocation plugin. + ***************************************************************************** + * Copyright (C) 2003 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Dan Phung <phung4@llnl.gov> + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. 
/**
 * bgl_record describes one Blue Gene partition known to the plugin.
 * Records are kept in bgl_list; submit_job() prints every record and
 * _find_best_partition_match() picks one of them for a job.
 */
typedef struct bgl_record {
	int used;			/* non-zero once selected for a job;
					 * used records are skipped when
					 * matching */
	char* slurm_part_id;		/* ID specified by admins	*/
	pm_partition_id_t* bgl_part_id;	/* ID returned from CMCS	*/
	char* nodes;			/* String of nodes in partition */
	lifecycle_type_t part_lifecycle;/* either STATIC or DYNAMIC	*/
	hostlist_t* hostlist;		/* expanded form of hosts */
	bitstr_t *bitmap;		/* bitmap of nodes for this partition */
	struct partition* alloc_part;	/* the allocated partition   */
	int size;			/* node count for the partitions,
					 * compared against a job's min/max
					 * node limits */
	rm_partition_t* part_type;	/* Type=Mesh/Torus/	*/
} bgl_record_t;
+ */ +typedef struct bgl_conf_record{ + char* nodes; + rm_partition_t* part_type; +} bgl_conf_record_t; + +/** + * process the configuration file so to interpret what partitions are + * static, dynamic, etc. + * + */ +int read_bgl_conf(); +/** */ +int init_bgl(); + +int create_static_partitions(); +/** */ +int submit_job(struct job_record *job_ptr, bitstr_t *bitmap, + int min_nodes, int max_nodes); +/** */ +void sort_bgl_record_inc_size(List records); +/** */ +void sort_bgl_record_dec_size(List records); + +/** */ +void print_bgl_record(bgl_record_t* record); +/** */ +char* convert_lifecycle(lifecycle_type_t lifecycle); +/** */ +char* convert_part_type(rm_partition_t* pt); + +/** */ +void update_bgl_node_bitmap(bitstr_t* bitmap); + + +#endif /* _BLUEGENE_H_ */ diff --git a/src/plugins/select/bluegene/partition_sys.c b/src/plugins/select/bluegene/partition_sys.c new file mode 100755 index 00000000000..670d6b55b2d --- /dev/null +++ b/src/plugins/select/bluegene/partition_sys.c @@ -0,0 +1,1112 @@ +/*****************************************************************************\ + * partition_sys.c + * + ***************************************************************************** + * Copyright (C) 2004 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Dan Phung <phung4@llnl.gov> + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +\*****************************************************************************/ + +// #define DEBUG_ALLOCATE +// #define DEBUG_PART +// #define _RM_API_H__ + +/** need this to have it compile with the BGL header*/ +// typedef int MPIR_PROCDESC; + +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <math.h> +#include "src/common/list.h" +#include "src/common/xmalloc.h" +#include "partition_sys.h" +#include "bluegene.h" +// #include "rm_api.h" + +#ifdef _RM_API_H__ +#include "bgl_switch_connections.h" +#endif + +/****************************/ +/* for testing purposes */ +int BGL_PARTITION_NUMBER = 0; +/****************************/ + +#ifdef _UNIT_TEST_ +extern void * lsd_fatal_error(char *file, int line, char *mesg){} +extern void * lsd_nomem_error(char *file, int line, char *mesg){} +#endif + +/** */ +#ifdef _RM_API_H__ +char *BGL_MLOADER_IMAGE = "/bgl/edi/build/bglsys/bin/mmcs-mloader.rts"; +char *BGL_BLRTS_IMAGE = "/bgl/edi/build/bglsys/bin//rts_hw.rts"; +char *BGL_LINUX_IMAGE = "/bgl/edi/build/bglsys/bin/zImage.elf"; +char *BGL_RAMDISK_IMAGE = "/bgl/edi/build/bglsys/bin/ramdisk.elf"; +#endif + +/** these are used in the dynamic partitioning algorithm */ +/* global system = list of free partitions */ +List bgl_sys_free = NULL; +/* global system = list of allocated partitions */ +List bgl_sys_allocated = NULL; + +void _init_sys(partition_t*); +int _isNotEqualsAllCoord(int* A, int* B); +int _isNotEqualsSomeCoord(int* A, int* B); + +#ifdef _RM_API_H__ +/** + * _get_BP: get the BP at location loc + * + * IN - bgl: pointer to preinitialized bgl pointer + * IN - bp: pointer to preinitailized rm_element_t that will + * hold the BP that we resolve to. 
+ * IN - loc: location of the desired BP + * OUT - bp: will point to BP at location loc + * OUT - rc: error code (0 = success) + */ +int _get_BP(rm_element_t *bp, rm_location_t *loc); +int check_BP_status(rm_location_t *loc); +void pre_allocate(rm_partition_t* my_part, rm_connection_type_t* part_conn); +int post_allocate(rm_partition_t *my_part, pm_partition_id_t *part_id); +int get_switch(partition_t* partition, List switch_list); +int get_BP_by_location(int* cur_coord, rm_BP_t* BP); +void rm_switch_t_destroy(void* object); + +#else +int create_bgl_partitions(List requests); + +#endif + +int break_up_partition(List sys, partition_t* partition_to_break, int index); +int fit_request(List sys, List allocated, int* request); + + +void int_array_destroy(void* object); +int intArray_CmpF(int* A, int* B); + +int partition_CmpF_inc(struct partition* A, struct partition* B); +int partition_CmpF_dec(struct partition* A, struct partition* B); + +#ifdef _RM_API_H__ +void preallocate(rm_BGL_t* bgl, rm_partition_t* my_part, + char* username, rm_connection_type_t* part_conn); +int postallocate(rm_BGL_t *bgl, rm_partition_t *my_part, + pm_partition_id_t *part_id); +#endif + +List get_bgl_sys_free(); +List get_bgl_sys_allocated(); +#ifdef _UNIT_TESTS_ +void debug(const char *fmt, ...); +#endif + + +/** + * partition_sys: + * + * partition the system according to the given configuration. We're + * assuming that the input config array is only one dimension + * (eg. only X configurations) and is sorted in decreasing order. + * + * example usage: admin wants to partition system as such: 4x4x4, + * 2x4x4, 2x4x4 to do this, we would run partition_sys three times with + * the config as {4,2,2} (X-direction), then {4,4,4}(for Y) and + * finally {4,4,4}( for Z). 
+ * + * we should really just have all the config stuff in one struct + * and then have each element in the configs be of type part_config + * + * IN - config: one of the following system configurations + * IN - BGL_system: pointer to the bgl system functions..or something + * OUT - partid_list: list of BGL partition id's + * OUT - return code of success + * + * SIDE EFFECT: calls BGL CMCS API that changes the DB2 and + * essentially wires up the system + */ + +int partition_sys(List requests) +{ + + ListIterator itr; + partition_t part; + + /* 1) we sort in decreasing order by size */ + sortIntArrayByDecSize(requests); + /* initialize the starting system */ + _init_sys(&part); + + if (bgl_sys_allocated == NULL) + error("list_create failed for bgl_sys_allocated\n"); + + /* 2) for each partition configuration, place them in + * order + */ + itr = list_iterator_create(requests); + +#ifdef DEBUG_PART + debug("REQUESTS: \n"); + printList(requests); +#endif + + int* request; + int all_success = 0; // 0 = yes, 1 = no + while ((request = (int*) list_next(itr))) { + if (fit_request(bgl_sys_free, bgl_sys_allocated, request)){ +#ifdef DEBUG_PART + debug("failure in allocation!!!\n"); +#endif + all_success = 1; + } else { +#ifdef DEBUG_PART + debug("success in allocation\n"); +#endif + } + } + list_iterator_destroy(itr); + + create_bgl_partitions(bgl_sys_allocated); + + return all_success; +} + +/** + * IN - requests: List <partition_t*> to wire up. + * + */ +int create_bgl_partitions(List requests) +{ + partition_t* cur_partition; + + printf("partition_sys::create_bgl_partitions\n"); + ListIterator itr = list_iterator_create(requests); + while ((cur_partition = (partition_t*) list_next(itr))) { + configure_switches(cur_partition); + } + list_iterator_destroy(itr); + return 0; +} + +/** + * assign a list of nodes to the configuration + * + * since we *know* that the configuration will + * fit in somewhere in a power of two in the system + * we can always ensure a perfect fit. 
/**
 * Try to place one request (an int array of SYSTEM_DIMENSIONS extents)
 * into the system.
 *
 * The free list is scanned for a partition whose dimensions match the
 * request (as judged by isNotCorrectDimension()); such a partition is
 * pushed onto "allocated" and removed from "sys".  If none matches, the
 * candidate chosen by the selection policy below is split in half with
 * break_up_partition() and the function calls itself again.
 *
 * IN - sys: list of free partition_t's
 * IN - allocated: list that receives the partition chosen for the request
 * IN - request: requested extents.
 *      NOTE(review): the array passed in is free()d below and replaced by
 *      the output of rotate_part(); the caller's pointer (e.g. the list
 *      element in partition_sys) is left dangling and the last rotated
 *      copy is never released -- confirm the intended ownership.
 * RET - 0 if the request was placed, 1 otherwise (also for NULL arguments)
 */
int fit_request(List sys, List allocated, int* request)
{
	if (sys == NULL || allocated == NULL || request == NULL)
		return 1;

	int i, rc = 1;
	/** print out the request */
#ifdef DEBUG_PART
	debug("\nTrying to fit [ %d", request[0]);
	for (i=1; i<SYSTEM_DIMENSIONS; i++){
		debug(" x %d", request[i]);
	}
	// debug(" ]\n");
	debug("current system list\n");
	printSysList(sys);
#endif
	/* ??? FIXME wtf, if rotate_part doesn't have something printed before it....it segfaults */
	debug("");
	int* new_request;
	rotate_part(request, &new_request);
	free(request);
	request = new_request;

	/** */
	ListIterator itr = list_iterator_create(sys);
	int request_size = intArray_size(request);
	partition_t* cur_partition;

	/** this stuff is for knowing which partition we want to select to break */
	partition_t* partition_to_break = NULL;
	int partition_dim_max = -1;
	int max_index = SYSTEM_DIMENSIONS; /* we want the earliest, so we'll
					    * set a good high point as the farthest
					    * dimension */
	while ((cur_partition = (partition_t*) list_next(itr))) {
		/* isNotCorrectDimension() returns 0 on a match */
		if (!isNotCorrectDimension(cur_partition->dimensions, request)){
#ifdef DEBUG_PART
			debug("\n!!!!!!!!!!!!!!!!!\n! FOUND FIT !\n!!!!!!!!!!!!!!!!!\n");
			printPartition(cur_partition);
#endif

			/* move the partition from the free list to the
			 * allocated list */
			list_push(allocated, cur_partition);
			list_remove(itr);
			rc = 0;
			break;
		}

		/* this partition's too small to break up, so goto next. */
		if (cur_partition->size < request_size) {
			continue;

			/* big enough to break */
		} else {
			/* partition selection policy:
			 *
			 * the largest dimension that is larger than the request (in
			 * some dimension) that is earliest (dimension wise).
			 */
			for (i=0; i<SYSTEM_DIMENSIONS; i++){
				/* if the current partition's dimension is greater
				 * than that requested
				 */
				if (cur_partition->dimensions[i] > request[i] &&
				    cur_partition->dimensions[i] > partition_dim_max &&
				    i < max_index){

					partition_to_break = cur_partition;
					partition_dim_max = cur_partition->dimensions[i];
					max_index = i;
				}
			}
		}
	}
	list_iterator_destroy(itr);

	/* well, if we have a partition to break, then we break apart the
	 * partition and then call ourselves again.  otherwise, we've
	 * exhausted all possibilities so we can't fit this request :(
	 */
	if (rc != 0 && partition_to_break != NULL){
		/* break up the partition and do the RECURSIVE CALL! */
		/* NOTE(review): the return value of break_up_partition() is
		 * ignored here */
		break_up_partition(sys, partition_to_break, max_index);
		rc = fit_request(sys, allocated, request);
		/** ??? FIXME, if something is not printed, the program will segfault, looks like
		 * stdout just needs to be flushed or something */
		/** 999 */

		debug("");
#ifdef DEBUG_PART
		;
#endif
	}
	return rc;
}
+ * + */ +int break_up_partition(List sys, partition_t* partition_to_break, int index) +{ + if (sys == NULL || partition_to_break == NULL) + return 1; + + /* the two new partitions to create */ + partition_t *first_part, *second_part; + first_part = (partition_t*) xmalloc(sizeof(partition_t)); + second_part = (partition_t*) xmalloc(sizeof(partition_t)); + + if (!first_part || !second_part){ + error("break_up_partition: not enough memory to break up partitions"); + return 1; + } + + copyPartition(partition_to_break, first_part); + copyPartition(partition_to_break, second_part); + + first_part->size /= 2; + second_part->size /= 2; + first_part->dimensions[index] /= 2; + second_part->dimensions[index] /= 2; + + double diff = partition_to_break->tr_coord[index] - partition_to_break->bl_coord[index]; + first_part->tr_coord[index] = floor(diff/2); + second_part->bl_coord[index] = ceil(diff/2); + + ListIterator itr; + partition_t* next; + itr = list_iterator_create(sys); + while ((next = (partition_t*) list_next(itr))) { + if(!isNotCorrectDimension(next->dimensions, partition_to_break->dimensions)){ + /* next we remove the old partition */ + list_remove(itr); + + /* then we insert our new partitions */ + list_append(sys, first_part); + list_append(sys, second_part); + break; + } + } + list_iterator_destroy(itr); + return 0; +} + +void printPartition(partition_t* part) +{ + if (part == NULL) + return; + + debug("\tdimensions: [ %d %d %d ]\n", part->dimensions[0], + part->dimensions[1], part->dimensions[2]); + debug("\tbl coord: [ %d %d %d ]\n", part->bl_coord[0], + part->bl_coord[1], part->bl_coord[2]); + debug("\ttr coord: [ %d %d %d ]\n", part->tr_coord[0], + part->tr_coord[1], part->tr_coord[2]); + debug("\tsize: %d\n", part->size); + debug("\tbgl_record_ptr addr: %d\n", part->bgl_record_ptr); +} + +void copyPartition(partition_t* src, partition_t* dest) +{ + if (src == NULL || dest == NULL) + return; + + int i; + for (i=0; i<SYSTEM_DIMENSIONS; i++){ + dest->bl_coord[i] 
= src->bl_coord[i]; + dest->tr_coord[i] = src->tr_coord[i]; + dest->dimensions[i] = src->dimensions[i]; + } + dest->size = src->size; +} + +/** + * returns 0 for equals, 1 for not equals + */ +int isPartitionNotEquals(partition_t* A, partition_t* B) +{ + if (A == NULL || B == NULL) + return 1; + + if (A->bl_coord == B->bl_coord && + A->tr_coord == B->tr_coord) + return 0; + else + return 1; +} + +/** + * return - the int array's size + */ +int intArray_size(int* part_geometry){ + if (part_geometry == NULL) + return 0; + + int size = 1; + int i; + + for(i=0; i<SYSTEM_DIMENSIONS; i++){ + size *= part_geometry[i]; + } + + return size; +} + +/** + * print out a list + */ +void printList(List list) +{ + if (list == NULL) + return; + + debug("trying to get the list iterator\n"); + ListIterator itr = list_iterator_create(list); + debug("doen\n"); + int* stuff = NULL, i = 0; + debug("printing list\n"); + while ((stuff = (int*) list_next(itr))) { + debug("stuff %d", stuff); + if (stuff == NULL){ + break; + } + + debug("[ %d", stuff[0]); + for (i=1; i<SYSTEM_DIMENSIONS; i++){ + debug(" x %d", stuff[i]); + } + debug(" ]\n"); + } + list_iterator_destroy(itr); +} + +/** + * print out list of the system partitions + */ +void printSysList(List list) +{ + if (list == NULL){ + debug("List is empty (NULL)\n"); + return; + } + + ListIterator itr = list_iterator_create(list); + int i, part_count=0; + partition_t* stuff; + while ((stuff = (partition_t*) list_next(itr))) { + if (stuff == NULL){ + break; + } + + debug("part %d: dimensions [ %d", part_count++, stuff->dimensions[0]); + for (i=1; i<SYSTEM_DIMENSIONS; i++){ + debug(" x %d", stuff->dimensions[i]); + } + debug(" ]\n"); + + debug("bl coord [ %d", stuff->bl_coord[0]); + for (i=1; i<SYSTEM_DIMENSIONS; i++){ + debug(" x %d", stuff->bl_coord[i]); + } + debug(" ]\n"); + + debug("tr coord [ %d", stuff->tr_coord[0]); + for (i=1; i<SYSTEM_DIMENSIONS; i++){ + debug(" x %d", stuff->tr_coord[i]); + } + debug(" ]\n"); + + } + 
list_iterator_destroy(itr); +} + +/** + * sort the configurations by decreasing size + */ +void sortIntArrayByDecSize(List configs){ + if (configs == NULL) + return; + list_sort(configs, (ListCmpF) intArray_CmpF); +} + +/** + * Comparator used for sorting int arrays + * + * returns: -1: A greater than B 0: A equal to B 1: A less than B + * + * Note: return values are "reversed" so that we can have the list + * sorted in decreasing order (largest to smallest) + */ +int intArray_CmpF(int* A, int* B) +{ + if (A == NULL || B == NULL) + return -9; + + int volA = intArray_size(A); + int volB = intArray_size(B); + if (volA > volB) + return -1; + else if (volA < volB) + return 1; + else + return 0; +} + +/** + * configure_switches = wire the partition as a mesh (pre_0_1 implementation) + * + * returns 0 for success, 1 for failure + */ +#ifdef _RM_API_H__ +int configure_switches(rm_partition_t* partition, partition_t* partition) +#else + int configure_switches(partition_t* partition) +#endif +{ + if (partition == NULL){ + return 1; + } + + int cur_coord[SYSTEM_DIMENSIONS]; + pm_partition_id_t* bgl_part_id; +#ifdef _RM_API_H__ + BGL_switch_t* bgl_switch; + rm_partition_t *bgl_part; + pre_allocate(bgl_part, cur_partition->part_type); +#endif + + /** FIXME + * right now the loop is (for example): + * for bl: 102 to tr 323 (dim = 3x3x2 volume = 18) + * 102, 103 + * 112, 113 + * 122, 123 + * + * 202, 203 + * 212, 213 + * 222, 223 + * + * 302, 303 + * 312, 313 + * 322, 323 + */ + + /* for each of the dimensions */ + // int first = 1; + for (cur_coord[0] = partition->bl_coord[0]; + cur_coord[0] <= partition->tr_coord[0]; + cur_coord[0]++){ + + for (cur_coord[1] = partition->bl_coord[1]; + cur_coord[1] <= partition->tr_coord[1]; + cur_coord[1]++){ + + for (cur_coord[2] = partition->bl_coord[2]; + cur_coord[2] <= partition->tr_coord[2]; + cur_coord[2]++){ + +#ifdef _RM_API_H__ + /***** BGL SPECIFIC ******/ + /** below, we wire up each all three switches of each BP **/ + /* SPECIAL 
CASE FIRST BP */ + if (!_isNotEqualsSomeCoord(cur_coord, partition->bl_coord)){ + List switch_list; + ListIterator itr; + rm_switch_t* cur_switch; + if (get_switch(cur_coord, switch_list)){ + error("configure_switches, error in getting bgl switch"); + } + + itr = list_iterator_create(switch_list); + while ((cur_switch = (rm_switch_t*) list_next(itr))) { + rm_dimension_t dim; + rm_get_data(cur_switch,RM_SwitchDim,&dim); + /** why is the X dim such a bizach */ + if (dim == RM_DIM_X){ + /** + * FIXME, well see, here + * we should be hooking up + * BP's 0 and 1 as in the "middle", + * that is, having both the + * "next and prev" connections + * and of course, this depends + * on the size of the BP. if size=2, + * we're all right, but if size is + * greater, that it's the one's + * in the physical middle that + * must be wired so. + */ + if (first == 1){ + connect_next(bgl_part, cur_switch, first); + first = 0; + } + } else { + if (first == 1){ + connect_next(bgl_part, cur_switch, first); + first = 0; + } + } + } + + /* now we have a valide BGL + * partition ID with which to + * submit jobs */ + + /** FIXME now go get the BGL record and insert the new + * bgl_part_id */ + + /* SPECIAL CASE END BP */ + } else if (!_isNotEqualsSomeCoord(cur_coord, partition->tr_coord)){ + ; + + /* NORMAL CASE, IN BETWEEN */ + } else { + bgl_switch = get_switch(); + } + +#else /* FOR DEBUGGING PURPOSES */ +#ifdef DEBUG_ALLOCATE + /***** DEBUG SPECIFIC (PRINT OUT RESULTS) ******/ + /* SPECIAL CASE FIRST BP */ + if (!_isNotEqualsSomeCoord(cur_coord, partition->bl_coord)){ + debug("allocate: connecting 1-3 of BP %d", cur_coord[0]); + for (i=1; i<SYSTEM_DIMENSIONS; i++){ + debug(" x %d", cur_coord[i]); + } + debug("\n"); + + + /* SPECIAL CASE END BP */ + } else if (!_isNotEqualsSomeCoord(cur_coord, partition->tr_coord)){ + debug("allocate: connecting 0-4 of BP %d", cur_coord[0]); + for (i=1; i<SYSTEM_DIMENSIONS; i++){ + debug(" x %d", cur_coord[i]); + } + debug("\n"); + + /* NORMAL CASE, IN 
BETWEEN */ + } else { + debug("allocate: connecting 0-4,1-3 of BP %d", cur_coord[0]); + for (i=1; i<SYSTEM_DIMENSIONS; i++){ + debug(" x %d", cur_coord[i]); + } + debug("\n"); + } +#endif +#endif + + } /* end of cur_coord[2]*/ + } /* end of cur_coord[1]*/ + } /* end of cur_coord[1]*/ + +#ifdef _RM_API_H__ + post_allocate(bgl_part, bgl_part_id); + bgl_record_t* bgl_rec = (bgl_record_t*) partition->bgl_record_ptr; + bgl_rec->bgl_part_id = bgl_part_id; + partition->bgl_part_id = bgl_part_id; + +#else + + bgl_part_id = (pm_partition_id_t*) malloc(sizeof(pm_partition_id_t)); + // *bgl_part_id = (int)(rand()%100); + *bgl_part_id = BGL_PARTITION_NUMBER++; + bgl_record_t* bgl_rec = (bgl_record_t*) partition->bgl_record_ptr; + bgl_rec->bgl_part_id = bgl_part_id; + partition->bgl_part_id = bgl_part_id; + // debug("999 partition id %d\n", *bgl_part_id); + +#endif + return 0; +} + +/** + * find if the cur_part fits the same dimensions as the given request + * return 0 for affirmative (correct dimension), and 1 for negative (not correct dimension) + */ +int isNotCorrectDimension(int* cur_part, int* request) +{ + if (cur_part == NULL || request == NULL) + return 1; + + int i, j; + int cur_part_tmp[SYSTEM_DIMENSIONS]; + + /* copy over arrays into temporary arrays */ + for(i=0; i<SYSTEM_DIMENSIONS; i++){ + cur_part_tmp[i] = cur_part[i]; + } + + int end_of_array = SYSTEM_DIMENSIONS; + int tmp, found_match; + for(i=0; i<SYSTEM_DIMENSIONS; i++){ + found_match = 0; + for(j=0; j<end_of_array; j++){ + if (request[i] == cur_part_tmp[j]){ + /* swap out end of array */ + tmp = cur_part_tmp[end_of_array-1]; + cur_part_tmp[end_of_array-1] = cur_part_tmp[j]; + cur_part_tmp[j] = tmp; + --end_of_array; + found_match = 1; + break; + } + } + if (!found_match){ + /* debug("didn't find match for %d\n", request[i]); */ + break; + } + } + + /* if we've found all the elements, then the "end of array" + * should be 0 + */ + if (!end_of_array){ + /* debug("success, all elements found!\n"); */ + return 
/**
 * isNotCorrectDimension - check whether a partition has the requested
 * dimensions, in any order
 * IN cur_part - dimensions of the partition (SYSTEM_DIMENSIONS entries)
 * IN request - requested dimensions (SYSTEM_DIMENSIONS entries)
 *
 * returns 0 for affirmative (request is a permutation of cur_part) and
 * 1 for negative (also 1 when either pointer is NULL)
 */
int isNotCorrectDimension(int* cur_part, int* request)
{
	int remaining[SYSTEM_DIMENSIONS];
	int n_left = SYSTEM_DIMENSIONS;
	int i, j;

	if (cur_part == NULL || request == NULL)
		return 1;

	/* work on a copy so that the caller's array is left untouched */
	for (i = 0; i < SYSTEM_DIMENSIONS; i++)
		remaining[i] = cur_part[i];

	for (i = 0; i < SYSTEM_DIMENSIONS; i++) {
		for (j = 0; j < n_left; j++) {
			if (request[i] == remaining[j])
				break;
		}
		if (j == n_left)
			return 1;	/* request[i] has no partner left */

		/* consume the matched entry by overwriting it with the last
		 * unmatched one */
		remaining[j] = remaining[--n_left];
	}

	return 0;
}

/**
 * factorial - compute numb!
 * IN numb - value to take the factorial of
 *
 * returns numb!, or 1 when numb <= 1.  The result is an int, so it
 * overflows for numb > 12 where int is 32 bits wide.
 */
int factorial(int numb)
{
	int i, fact = 1;

	/* count down: the previous loop incremented i and never terminated
	 * before overflowing */
	for (i = numb; i > 1; i--)
		fact *= i;

	return fact;
}

/**
 * max_dim_index - find the position of the largest entry
 * IN array - SYSTEM_DIMENSIONS values
 *
 * returns the index of the first entry holding the largest value; 0 when
 * no entry is greater than -1
 */
int max_dim_index(int* array)
{
	int i, max = -1, max_index = 0;

	for (i = 0; i < SYSTEM_DIMENSIONS; i++) {
		if (array[i] > max) {
			max = array[i];
			max_index = i;
		}
	}

	return max_index;
}

/**
 * rotate_part - rotate the given partition configuration into decreasing
 * order (ie, 2,1,4 -> 4,2,1)
 * IN config - SYSTEM_DIMENSIONS values to reorder
 * OUT new_config - *new_config is released with free() and replaced by a
 *	freshly calloc'ed array holding the reordered values; it is left
 *	NULL when the allocation fails.  The caller owns the new array.
 *
 * Nothing is done when config or new_config is NULL.
 */
void rotate_part(const int* config, int** new_config)
{
	int sorted[SYSTEM_DIMENSIONS];
	int i, j, key;

	if (config == NULL || new_config == NULL)
		return;

	/* insertion sort into a local copy, largest value first.  This is
	 * done before *new_config is released so that the result is correct
	 * even when config points at that very array, and it also covers the
	 * already-sorted input, which used to produce an all-zero result. */
	for (i = 0; i < SYSTEM_DIMENSIONS; i++) {
		key = config[i];
		for (j = i; j > 0 && sorted[j - 1] < key; j--)
			sorted[j] = sorted[j - 1];
		sorted[j] = key;
	}

	free(*new_config);
	*new_config = calloc(SYSTEM_DIMENSIONS, sizeof(int));
	if (*new_config == NULL)
		return;

	for (i = 0; i < SYSTEM_DIMENSIONS; i++)
		(*new_config)[i] = sorted[i];
}
+ * + * this should really go out and get BGL specific information + * + */ +void _init_sys(partition_t *part) +{ + /* initialize the system wide partition */ + bgl_sys_free = list_create((ListDelF) int_array_destroy); + + part->bl_coord[0] = 0; + part->bl_coord[1] = 0; + part->bl_coord[2] = 0; + part->tr_coord[0] = 7; + part->tr_coord[1] = 3; + part->tr_coord[2] = 3; + part->dimensions[0] = 8; + part->dimensions[1] = 4; + part->dimensions[2] = 4; + part->size = 128; + + /** ??? FIXME, I get a segfault if I have the list_create before the use of bgl_sys_free (swapping + * the following two statements. WHY??? + */ + list_push(bgl_sys_free, part); + bgl_sys_allocated = list_create((ListDelF) int_array_destroy); +} + +/** + * to be used by list object to destroy the array elements + */ +void int_array_destroy(void* object) +{ + xfree(object); +} + +#ifdef _RM_API_H__ +/** + * initialize the BGL partition in the resource manager + */ +void pre_allocate(rm_partition_t *my_part, rm_connection_type_t *part_conn) +{ + rm_new_partition(&my_part); //here we go... new partition to be added + rm_set_data(my_part,RM_PartitionMloaderImg, BGL_MLOADER_IMAGE); + rm_set_data(my_part,RM_PartitionBlrtsImg, BGL_BLRTS_IMAGE); + rm_set_data(my_part,RM_PartitionLinuxImg, BGL_LINUX_IMAGE); + rm_set_data(my_part,RM_PartitionRamdiskImg, BGL_RAMDISK_IMAGE); + rm_set_data(my_part,RM_PartitionConnection, part_conn); +} + +/** + * add the partition record to the DB and boot it up! 
#ifdef _RM_API_H__
/**
 * post_allocate - add the partition record to the DB and boot it up!
 * IN my_part - partition prepared by pre_allocate
 * IN part_id - partition id; it is overwritten locally with the id read
 *	back from the resource manager
 *
 * returns 0 on success, -1 when adding, booting or re-reading the
 * partition fails
 *
 * NOTE(review): part_id is passed by value, so the id read back with
 * RM_PartitionID does not appear to reach the caller - confirm.
 * NOTE(review): nothing here waits for the boot to finish; the state is
 * read and logged once.
 */
int post_allocate(rm_partition_t *my_part, pm_partition_id_t *part_id)
{
	int rc;
	rm_partition_state_t state;

	/* Add partition record to the DB */
	rc = rm_add_partition(my_part);
	if (rc != STATUS_OK) {
		error("Error adding partition\n");
		return(-1);
	}

	/* Get back the new partition id */
	rm_get_data(my_part, RM_PartitionID, &part_id);

	/* Initiate boot of the partition */
	debug("Booting Partition %s\n", part_id);
	rc = pm_create_partition(part_id);
	if (rc != STATUS_OK) {
		error("Error booting_partition partition\n");
		return(-1);
	}

	/* Re-read the partition to report the state it is in */
	rc = rm_get_partition(part_id, &my_part);
	if (rc != STATUS_OK) {
		error("Error in GetPartition\n");
		return(-1);
	}

	rm_get_data(my_part, RM_PartitionState, &state);
	error("Partition %s state = %s. Waiting...", part_id, convert_partition_state(state));
	fflush(stdout);

	/* the success path used to fall off the end of this non-void
	 * function, leaving the return value undefined */
	return 0;
}
#endif
#ifdef _RM_API_H__
/**
 * get_switch - get the switches of the BP at these coordinates
 * IN cur_coord - X, Y, Z coordinates of the base partition
 * IN switch_list - list that receives the switches; an existing list is
 *	destroyed and recreated
 *
 * returns SLURM_SUCCESS when the first switch of the BP was found,
 * SLURM_ERROR otherwise
 *
 * NOTE(review): switch_list and bp are passed by value, so neither the
 * new list nor the BP located by get_BP_by_location() appears to reach
 * the code that needs it - the interface probably has to change.
 * NOTE(review): the machine handle was spelled both "bgl" and "my_bgl";
 * "bgl" is used throughout here - confirm which global is meant.
 */
int get_switch(int* cur_coord, List switch_list)
{
	int switch_num = 0, i;
	rm_BP_t *bp = NULL;
	rm_BP_id_t *cur_bpid = NULL, *bpid = NULL;
	rm_switch_t* bgl_switch = NULL;
	int found_bpid = 0;

	if (switch_list != NULL)
		list_destroy(switch_list);
	switch_list = list_create(rm_switch_t_destroy);

	if (get_BP_by_location(cur_coord, bp))
		return SLURM_ERROR;

	rm_get_data(bp, RM_BPID, &bpid);

	/* walk all switches until the first one owned by our BP shows up */
	rm_get_data(bgl, RM_SwitchNum, &switch_num);
	rm_get_data(bgl, RM_FirstSwitch, &bgl_switch);
	for (i = 0; i < switch_num; i++) {
		rm_get_data(bgl_switch, RM_SwitchBPID, &cur_bpid);
		/** FIXME is there an equality function for BPID? */
		if (*bpid == *cur_bpid) {
			found_bpid = 1;
			break;
		}

		rm_get_data(bgl, RM_NextSwitch, &bgl_switch);
	}

	if (!found_bpid)
		return SLURM_ERROR;

	/**
	 * according to the mpirun example, I should be able to simply
	 * get these three switches in a row, and they should be XYZ
	 */
	list_push(switch_list, bgl_switch);
	for (i = 0; i < 2; i++) {
		rm_get_data(bgl, RM_NextSwitch, &bgl_switch);
		/* re-read the owner of the switch just fetched; comparing
		 * the stale id from the search loop could never fail */
		rm_get_data(bgl_switch, RM_SwitchBPID, &cur_bpid);
		if (*bpid != *cur_bpid)
			break;

		list_push(switch_list, bgl_switch);
	}

	return SLURM_SUCCESS;
}

/**
 * rm_switch_t_destroy - element destructor handed to list_create
 * IN object - list element to release
 */
void rm_switch_t_destroy(void* object)
{
	/** FIXME, find out how to destroy one of these */
	xfree(object);
}

/**
 * get_BP_by_location - linear search for the BP at the given coordinates
 * IN cur_coord - X, Y, Z coordinates to look for
 * IN BP - cursor used while walking the BPs
 *
 * returns SLURM_SUCCESS when a BP with matching coordinates exists,
 * SLURM_ERROR otherwise
 *
 * There are some implicit rules for where "NextBP" goes to, but we don't
 * know them, so every BP has to be visited.
 *
 * NOTE(review): BP is passed by value, so the BP that was found does not
 * appear to reach the caller - confirm.
 * NOTE(review): loc is declared as a structure because its members are
 * read with "loc.X"; it used to be declared as a pointer - confirm that
 * RM_BPLoc fills in an rm_location_t.
 */
int get_BP_by_location(int* cur_coord, rm_BP_t* BP)
{
	int BP_num = 0;
	int i;
	rm_location_t loc;

	rm_get_data(bgl, RM_BPNum, &BP_num);
	rm_get_data(bgl, RM_FirstBP, &BP);
	for (i = 0; i < BP_num; i++) {
		rm_get_data(BP, RM_BPLoc, &loc);
		if (loc.X == cur_coord[0] &&
		    loc.Y == cur_coord[1] &&
		    loc.Z == cur_coord[2])
			return SLURM_SUCCESS;

		rm_get_data(bgl, RM_NextBP, &BP);
	}

	error("get_BP_by_location: could not find specified BP.\n");
	return SLURM_ERROR;
}
#endif

/**
 * _isNotEqualsSomeCoord - negation of "A equals B in some coordinate"
 * IN A, B - coordinate arrays of SYSTEM_DIMENSIONS entries
 *
 * returns 0 as soon as A and B agree in at least one coordinate,
 * 1 when they differ in every coordinate
 */
int _isNotEqualsSomeCoord(int* A, int* B)
{
	int i;

	for (i = 0; i < SYSTEM_DIMENSIONS; i++) {
		if (A[i] == B[i])
			return 0;
	}
	return 1;
}

/**
 * _isNotEqualsAllCoord - negation of "A equals B in all coordinates"
 * IN A, B - coordinate arrays of SYSTEM_DIMENSIONS entries
 *
 * returns 0 when A and B agree in every coordinate,
 * 1 when they differ in at least one coordinate
 */
int _isNotEqualsAllCoord(int* A, int* B)
{
	int i;

	for (i = 0; i < SYSTEM_DIMENSIONS; i++) {
		if (A[i] != B[i])
			return 1;
	}
	return 0;
}
list_sort(parts, (ListCmpF) partition_CmpF_inc); +} + +/** + * sort the partitions by decreasing size + */ +void sortPartitionsByDecSize(List parts){ + if (parts == NULL) + return; + list_sort(parts, (ListCmpF) partition_CmpF_dec); +} + + +/** + * Comparator used for sorting partitions smallest to largest + * + * returns: -1: A greater than B 0: A equal to B 1: A less than B + * + */ +int partition_CmpF_inc(struct partition* A, struct partition* B) +{ + if (A->size < B->size) + return -1; + else if (A->size > B->size) + return 1; + else + return 0; +} + +/** + * Comparator used for sorting partitions largest to smallest + * + * returns: -1: A greater than B 0: A equal to B 1: A less than B + * + */ +int partition_CmpF_dec(struct partition* A, struct partition* B) +{ + if (A->size > B->size) + return -1; + else if (A->size < B->size) + return 1; + else + return 0; +} + + +List get_bgl_sys_allocated() +{ + return bgl_sys_allocated; +} + +List get_bgl_sys_free() +{ + return bgl_sys_free; +} + +#ifdef _UNIT_TESTS_ +void debug(const char *fmt, ...) +{ + printf(fmt, ...); +} +#endif diff --git a/src/plugins/select/bluegene/partition_sys.h b/src/plugins/select/bluegene/partition_sys.h new file mode 100644 index 00000000000..1b86321bb8c --- /dev/null +++ b/src/plugins/select/bluegene/partition_sys.h @@ -0,0 +1,73 @@ +/*****************************************************************************\ + * partition_sys.h + * + ***************************************************************************** + * Copyright (C) 2004 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Dan Phung <phung4@llnl.gov> + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. 
+ * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. +\*****************************************************************************/ +#ifndef _PARTITION_SYS_H_ +#define _PARTITION_SYS_H_ + +#include <math.h> + +#define SYSTEM_DIMENSIONS 3 +#define X_DIMENSION 8 +#define Y_DIMENSION 4 +#define Z_DIMENSION 4 + +/** + * structure for use by partitioning algorithm to refer to the + * structural elements of the BGL partition system. 
+ */ +typedef struct partition{ + int bl_coord[SYSTEM_DIMENSIONS]; /* bottom left coordinates */ + int tr_coord[SYSTEM_DIMENSIONS]; /* top right coordinates */ + int dimensions[SYSTEM_DIMENSIONS]; /* X,Y,Z dimensions */ + void* bgl_record_ptr; /* pointer to referring bgl_record */ + int size; +#ifdef _RM_API_H__ + pm_partition_id_t* bgl_part_id; /* ID returned from CMCS */ + rm_partition_t* part_type; /* Type=Mesh/Torus/ */ +#else + int* bgl_part_id; /* ID returned from CMCS */ + int* part_type; /* Type=Mesh/Torus/ */ +#endif +} partition_t; + +int configure_switches(partition_t* partition); +int partition_sys(List requests); + +void copyPartition(partition_t* A, partition_t* B); +void printPartition(partition_t* part); +void printList(List list); +void printSysList(List list); + +int isNotCorrectDimension(int* cur_part, int* req); +int isPartitionNotEquals(partition_t* A, partition_t* B); +void rotate_part(const int* config, int** new_config); + +int intArray_size(int* part_geometry); +void sortIntArrayByDecSize(List configs); +void sortPartitionsByIncSize(List partitions); +void sortPartitionsByDecSize(List partitions); + + +#endif /* _PARTITION_SYS_H_ */ diff --git a/src/plugins/select/bluegene/select_bluegene.c b/src/plugins/select/bluegene/select_bluegene.c index a1b3e5ce2a1..5d562c3152a 100644 --- a/src/plugins/select/bluegene/select_bluegene.c +++ b/src/plugins/select/bluegene/select_bluegene.c @@ -40,9 +40,20 @@ #include <slurm/slurm_errno.h> #include "src/common/list.h" +#include "src/common/plugin.h" + +#include "src/common/slurm_xlator.h" #include "src/common/log.h" +#include "src/common/macros.h" #include "src/slurmctld/slurmctld.h" +#ifdef WITH_PTHREADS +# include <pthread.h> +#endif /* WITH_PTHREADS */ + +#include "slurm/slurm_errno.h" +#include "bluegene.h" + /* * These variables are required by the generic plugin interface. If they * are not found in the plugin, the plugin loader will ignore it. 
@@ -81,11 +92,19 @@ const uint32_t plugin_version = 90; */ int init ( void ) { + debug("init"); + verbose("%s loading...", plugin_name); + + if (init_bgl()) + return SLURM_ERROR; + + verbose("%s done loading, system ready for use.", plugin_name); return SLURM_SUCCESS; } int fini ( void ) { + debug("fini"); return SLURM_SUCCESS; } @@ -94,18 +113,49 @@ int fini ( void ) * node selection API. */ +/** + * this is called periodically by slurmctld when + * - new nodes are added + * - new configuration file is loaded + */ +extern int select_p_part_init(List part_list) +{ + debug("select_p_part_init"); + /** isn't the part_list already accessible to me? */ + slurm_part_list = part_list; + + if (read_bgl_conf()) + return SLURM_ERROR; + + /* create_static_partitions */ + if (create_static_partitions()){ + /* error in creating the static partitions, so + * partitions referenced by submitted jobs won't + * correspond to actual slurm partitions/bgl + * partitions. + */ + fatal("Error, could not create the static partitions"); + return 1; + } + + return SLURM_SUCCESS; +} + extern int select_p_state_save(char *dir_name) { + debug("select_p_state_save"); return SLURM_SUCCESS; } extern int select_p_state_restore(char *dir_name) { + debug("select_p_state_restore"); return SLURM_SUCCESS; } extern int select_p_node_init(struct node_record *node_ptr, int node_cnt) { + debug("select_p_node_init"); if (node_ptr == NULL) { error("select_p_node_init: node_ptr == NULL"); return SLURM_ERROR; @@ -116,13 +166,10 @@ extern int select_p_node_init(struct node_record *node_ptr, int node_cnt) return SLURM_ERROR; } - error("select/bluegene plugin not yet functional"); - return SLURM_ERROR; -} - - -extern int select_p_part_init(List part_list) -{ + // error("select/bluegene plugin not yet functional"); + debug("select_p_node_init should be doing a system wide status " + "check on all the nodes to updated the system bitmap."); + // return SLURM_ERROR; return SLURM_SUCCESS; } @@ -145,19 +192,36 @@ 
extern int select_p_part_init(List part_list) * select_p_job_test is called */ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap, - int min_nodes, int max_nodes) + int min_nodes, int max_nodes) { - error("select/bluegene plugin not yet functional"); - return SLURM_ERROR; + debug("select_p_job_test"); + debug("select/bluegene plugin in alpha development"); + + + /* bgl partition test - is there a partition where we have: + * 1) geometery requested + * 2) min/max nodes (BPs) requested + * 3) type? (TORUS is harder than MESH to fulfill)...HOW TO TEST?!?!! + * + * note: we don't have to worry about security at this level + * b/c the SLURM partition logic will handle access rights. + */ + + if (submit_job(job_ptr, bitmap, min_nodes, max_nodes)){ + return SLURM_ERROR; + } else { + return SLURM_SUCCESS; + } } extern int select_p_job_init(struct job_record *job_ptr) { + debug("select_p_job_init"); return SLURM_SUCCESS; } extern int select_p_job_fini(struct job_record *job_ptr) { + debug("select_p_job_fini"); return SLURM_SUCCESS; } - -- GitLab