From 8504a2a624fcb514b58ceb4c1dbac0b804bd09ed Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Wed, 7 Jan 2009 00:17:26 +0000 Subject: [PATCH] Start of BASIL interface, needed for Cray --- src/slurmctld/basil_interface.c | 259 ++++++++++++++++++++++++++++++++ 1 file changed, 259 insertions(+) create mode 100644 src/slurmctld/basil_interface.c diff --git a/src/slurmctld/basil_interface.c b/src/slurmctld/basil_interface.c new file mode 100644 index 00000000000..7bf0b7eed10 --- /dev/null +++ b/src/slurmctld/basil_interface.c @@ -0,0 +1,259 @@ +/*****************************************************************************\ + * basil_interface.c - slurmctld interface to BASIL, Cray's Batch Application + * Scheduler Interface Layer (BASIL) + ***************************************************************************** + * Copyright (C) 2009 Lawrence Livermore National Security. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Morris Jette <jette1@llnl.gov> + * LLNL-CODE-402394. + * + * This file is part of SLURM, a resource management program. + * For details, see <http://www.llnl.gov/linux/slurm/>. + * + * SLURM is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * In addition, as a special exception, the copyright holders give permission + * to link the code of portions of this program with the OpenSSL library under + * certain conditions as described in each individual source file, and + * distribute linked combinations including the two. You must obey the GNU + * General Public License in all respects for all of the code used other than + * OpenSSL. If you modify file(s) with this exception, you may extend this + * exception to your version of the file(s), but you are not obligated to do + * so. If you do not wish to do so, delete this exception statement from your + * version. If you delete this exception statement from all source files in + * the program, then also delete it here. + * + * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with SLURM; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +\*****************************************************************************/ + +/* FIXME: In slurmctld/slurmctld.h, add node_ptr->basil_node_id, init to NO_VAL */ +/* FIXME: In slurmctld/node_mgr.c, make _sync_bitmaps() extern */ +/* FIXME: In common/node_select.c, add reservation_id to select_job */ +/* FIXME: Document, ALPS must be started before SLURM */ + +#include <slurm/slurm_errno.h> +#include <string.h> + +#include "src/common/log.h" +#include "src/common/xmalloc.h" +#include "src/common/xstring.h" +#include "src/slurmctld/slurmctld.h" + +#define BASIL_DEBUG 1 + +#ifndef HAVE_BASIL +static int last_res_id = 0; +#endif + +#ifdef HAVE_BASIL +/* Make sure that each SLURM node has a BASIL node ID */ +static void _validate_basil_node_id(void) +{ + uint16_t base_state; + int i; + struct node_record *node_ptr = node_record_table_ptr; + + for (i=0; i<node_record_cnt; i++, node_ptr++) + if (node_ptr->basil_node_id != NO_VAL) + continue; + base_state = node_ptr->state & NODE_STATE_BASE; + if (base_state == NODE_STATE_DOWN) + continue; + + error("Node %s has no basil node_id", node_ptr->name); + last_node_update = time(NULL); + set_node_down(node_ptr->name, "No BASIL node_id"); + _sync_bitmaps(node_ptr, 0); + } +} +#endif /* HAVE_BASIL */ + +/* + * basil_query - Query BASIL for node and reservation state. + * Execute once at slurmctld startup and periodically thereafter. + * RET 0 or error code + */ +extern int basil_query(void) +{ + int error_code = SLURM_SUCCESS; +#ifdef HAVE_BASIL + struct config_record *config_ptr; + struct node_record *node_ptr; + struct job_record *job_ptr; + ListIterator job_iterator; + uint16_t base_state; + char *reason, *res_id; + + /* Issue the BASIL QUERY request */ + if (request_failure) { + fatal("basil query error: %s", "TBD"); + return SLURM_ERROR; + } + debug("basil query initiated"); + + /* Validate configuration for each node that BASIL reports */ + for (each_basil_node) { +#if BASIL_DEBUG + /* Log node state according to BASIL */ + info("basil query: name=%s arch=%s", + basil_node_name, basil_node_arch, etc.); +#endif /* BASIL_DEBUG */ + + /* NOTE: Cray should provide X-, Y- and Z-coordinates + * in the future. When that happens, we'll want to use + * those numbers to generate the hostname: + * slurm_host_name = xmalloc(sizeof(conf->node_prefix) + 4); + * sprintf(slurm_host_name: %s%d%d%d", basil_node_name, X, Y, Z); + */ + node_ptr = find_node_record(basil_node_name); + if (node_ptr == NULL) { + error("basil node %s not found in slurm", + basil_node_name); + continue; + } + + /* Record BASIL's node_id for use in reservations */ + node_ptr->basil_node_id = basil_node_id; + + /* Update slurmctld's node architecture */ + if (node_ptr->arch == NULL) { + xfree(node_ptr->arch); + node_ptr->arch = xstrdup(basil_node_arch); + } + + /* Update slurmctld's node state if necessary */ + reason = NULL; + base_state = node_ptr->state & NODE_STATE_BASE; + if (base_state != NODE_STATE_DOWN) { + if (strcmp(basil_state, "UP")) + reason = "basil state not UP"; + else if (strcmp(basil_role, "BATCH")) + reason = "basil role not BATCH"; + } + + /* Calculate the total count of processors and + * MB of memory on the node */ + config_ptr = node_ptr->config_ptr; + if ((slurmctld_conf.fast_schedule != 2) && + (basil_cpus < config_ptr->cpus)) { + error("Node %s has low cpu count %d", + node_ptr->name, basil_cpus); + reason = "Low CPUs"; + } + node_ptr->cpus = basil_cpus; + if ((slurmctld_conf.fast_schedule != 2) && + (basil_memory < config_ptr->real_memory)) { + error("Node %s has low real_memory size %d", + node_ptr->name, basil_memory); + reason = "Low RealMemory"; + } + node_ptr->real_memory = basil_memory; + + if (reason) { + last_node_update = time(NULL); + set_node_down(node_ptr->name, reason); + _sync_bitmaps(node_ptr, 0); + } + } + _validate_basil_node_id(); + + /* Validate that each BASIL reservation is still valid, + * purge vestigial reservations */ + for (each_basil_reservation) { + bool found = false; + job_iterator = list_iterator_create(job_list); + while ((job_ptr = (struct job_record *) list_next(job_iterator))) { + select_g_get_jobinfo(job_ptr->select_jobinfo, + SELECT_DATA_BLOCK_ID, &res_id); + found = !strcmp(res_id, basil_reservation_id); + xfree(res_id); + if (found) + break; + } + list_iterator_destroy(job_iterator); + if (found) { + error("vestigial basil reservation %s being removed", + basil_reservation_id); + basil_dealloc(basil_reservation_id); + } + } +#else + struct job_record *job_ptr; + ListIterator job_iterator; + char *res_id, *tmp; + + job_iterator = list_iterator_create(job_list); + while ((job_ptr = (struct job_record *) list_next(job_iterator))) { + select_g_get_jobinfo(job_ptr->select_jobinfo, + SELECT_DATA_BLOCK_ID, &res_id); + tmp = strchr(res_id, '_'); + if (tmp) { + job_res_id = atoi(tmp+1); + last_res_id = MAX(last_res_id, job_res_id); + } + xfree(res_id); + } + list_iterator_destroy(job_iterator); + debug("basil_query() executed, last_res_id=%d", last_res_id); +#endif /* HAVE_BASIL */ + + return error_code; +} + +/* + * basil_reserve - create a BASIL reservation. + * IN job_ptr - pointer to job which has just been allocated resources + * RET 0 or error code + */ +extern int basil_reserve(struct job_record *job_ptr) +{ + int error_code = SLURM_SUCCESS; +#ifdef HAVE_BASIL + /* Issue the BASIL RESERVE request */ + if (request_failure) { + error("basil reserve error: %s", "TBD"); + return SLURM_ERROR; + } + debug("basil reservation made job_id=%u res_id=%s", + job_ptr->job_id, reservation_id); + /* FIXME: add reservation_id to select_job_struct */ +#else + char *reservation_id; + xstrfmtcat(reservation_id, "RES_%d", ++last_res_id); + debug("basil reservation made job_id=%u res_id=%s", + job_ptr->job_id, reservation_id); + /* FIXME: add reservation_id to select_job_struct */ +#endif /* HAVE_BASIL */ + return error_code; +} + +/* + * basil_release - release a BASIL reservation. + * IN reservation_id - ID of reservation to release + * RET 0 or error code + */ +extern int basil_release(char *reservation_id) +{ + int error_code = SLURM_SUCCESS; +#ifdef HAVE_BASIL + /* Issue the BASIL RELEASE request */ + if (request_failure) { + error("basil release of %s error: %s", reservation_id, "TBD"); + return SLURM_ERROR; + } + debug("basil release of %s complete", reservation_id); +#else + debug("basil release of %s complete", reservation_id); +#endif /* HAVE_BASIL */ + return error_code; +} -- GitLab