From 832898b7a6d44eecfa5b0f11a9a612da9380c62c Mon Sep 17 00:00:00 2001 From: Moe Jette <jette1@llnl.gov> Date: Sat, 29 Jan 2011 04:47:16 +0000 Subject: [PATCH] Cray BASIL API: basic support added to the select/cray plugin. 01_Cray-BASIL-basic-support.diff plus 01_changes-from-first-revision-of-patch-01.diff --- NEWS | 3 + src/plugins/select/cray/Makefile.am | 7 +- src/plugins/select/cray/Makefile.in | 12 +- src/plugins/select/cray/basil_interface.c | 353 ++++++++++++++++++++++ src/plugins/select/cray/basil_interface.h | 28 ++ src/plugins/select/cray/select_cray.c | 13 +- 6 files changed, 411 insertions(+), 5 deletions(-) create mode 100644 src/plugins/select/cray/basil_interface.c create mode 100644 src/plugins/select/cray/basil_interface.h diff --git a/NEWS b/NEWS index 80ba33557f1..3c9a7b68d26 100644 --- a/NEWS +++ b/NEWS @@ -18,6 +18,9 @@ documents those changes that are of interest to users and admins. tab. This change dramatically improves scalability of sview. -- Do not attempt to read the batch script for non-batch jobs. This patch eliminates some inappropriate error messages. 01_interactive-no-script.diff + -- Cray BASIL API: basic support added to the select/cray plugin. + 01_Cray-BASIL-basic-support.diff plus + 01_changes-from-first-revision-of-patch-01.diff * Changes in SLURM 2.3.0.pre1 ============================= diff --git a/src/plugins/select/cray/Makefile.am b/src/plugins/select/cray/Makefile.am index 89731109b3e..be48ffc9561 100644 --- a/src/plugins/select/cray/Makefile.am +++ b/src/plugins/select/cray/Makefile.am @@ -9,6 +9,11 @@ INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common pkglib_LTLIBRARIES = select_cray.la # Cray node selection plugin. 
-select_cray_la_SOURCES = select_cray.c other_select.c other_select.h +select_cray_la_SOURCES = \ + basil_interface.c \ + basil_interface.h \ + other_select.c \ + other_select.h \ + select_cray.c select_cray_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) diff --git a/src/plugins/select/cray/Makefile.in b/src/plugins/select/cray/Makefile.in index 2e9bfa2e7d7..86bfa8ce17c 100644 --- a/src/plugins/select/cray/Makefile.in +++ b/src/plugins/select/cray/Makefile.in @@ -105,7 +105,8 @@ am__base_list = \ am__installdirs = "$(DESTDIR)$(pkglibdir)" LTLIBRARIES = $(pkglib_LTLIBRARIES) select_cray_la_LIBADD = -am_select_cray_la_OBJECTS = select_cray.lo other_select.lo +am_select_cray_la_OBJECTS = basil_interface.lo other_select.lo \ + select_cray.lo select_cray_la_OBJECTS = $(am_select_cray_la_OBJECTS) select_cray_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \ $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ @@ -325,7 +326,13 @@ INCLUDES = -I$(top_srcdir) -I$(top_srcdir)/src/common pkglib_LTLIBRARIES = select_cray.la # Cray node selection plugin. -select_cray_la_SOURCES = select_cray.c other_select.c other_select.h +select_cray_la_SOURCES = \ + basil_interface.c \ + basil_interface.h \ + other_select.c \ + other_select.h \ + select_cray.c + select_cray_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) all: all-am @@ -401,6 +408,7 @@ mostlyclean-compile: distclean-compile: -rm -f *.tab.c +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/basil_interface.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/other_select.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/select_cray.Plo@am__quote@ diff --git a/src/plugins/select/cray/basil_interface.c b/src/plugins/select/cray/basil_interface.c new file mode 100644 index 00000000000..b59199b60b1 --- /dev/null +++ b/src/plugins/select/cray/basil_interface.c @@ -0,0 +1,353 @@ +/* + * Interface between lower-level ALPS XML-RPC functions and SLURM. 
+ * + * Copyright (c) 2010-11 Centro Svizzero di Calcolo Scientifico (CSCS) + * Licensed under GPLv2. + */ +#include "basil_interface.h" +#include "basil_alps.h" +#include "basil_torus.h" +#include "basil_mysql.h" + +/* + * Following routines are from src/plugins/select/bluegene/plugin/jobinfo.c + */ +static int set_select_jobinfo(select_jobinfo_t *jobinfo, + enum select_jobdata_type data_type, void *data) +{ + uint32_t *uint32 = (uint32_t *) data; + + if (jobinfo == NULL) { + error("cray/set_select_jobinfo: jobinfo not set"); + return SLURM_ERROR; + } + if (jobinfo->magic != JOBINFO_MAGIC) { + error("cray/set_select_jobinfo: jobinfo magic bad"); + return SLURM_ERROR; + } + + switch (data_type) { + case SELECT_JOBDATA_RESV_ID: + jobinfo->reservation_id = *uint32; + break; + default: + error("cray/set_select_jobinfo: data_type %d invalid", + data_type); /* NOTE(review): only logged, SLURM_SUCCESS is still returned below */ + } + + return SLURM_SUCCESS; +} + +static int get_select_jobinfo(select_jobinfo_t *jobinfo, + enum select_jobdata_type data_type, void *data) +{ + uint32_t *uint32 = (uint32_t *) data; + + if (jobinfo == NULL) { + error("cray/get_select_jobinfo: jobinfo not set"); + return SLURM_ERROR; + } + if (jobinfo->magic != JOBINFO_MAGIC) { + error("cray/get_select_jobinfo: jobinfo magic bad"); + return SLURM_ERROR; + } + + switch (data_type) { + case SELECT_JOBDATA_RESV_ID: + *uint32 = jobinfo->reservation_id; + break; + default: + error("cray/get_select_jobinfo: data_type %d invalid", + data_type); /* NOTE(review): only logged, *data is left untouched and SLURM_SUCCESS is returned */ + } + + return SLURM_SUCCESS; +} + +/** Look up the slurm node record of a Cray NID via its "nid%05u" node name */ +static struct node_record *find_node_by_basil_id(uint32_t node_id) +{ + char nid[9]; /* holds "nid" + 5 digits + NUL; snprintf truncates longer ids */ + + snprintf(nid, sizeof(nid), "nid%05u", node_id); + + return find_node_record(nid); +} + +/** + * basil_inventory - Periodic node-state query via ALPS XML-RPC. + * This should be run immediately before each scheduling cycle. 
+ * Returns non-SLURM_SUCCESS if + * - INVENTORY method failed (error) + * - no nodes are available (no point in scheduling) + * - orphaned ALPS reservation exists (wait until ALPS resynchronizes) + */ +extern int basil_inventory(void) +{ + enum basil_version version = get_basil_version(); + struct basil_inventory *inv; + struct basil_node *node; + struct basil_rsvn *rsvn; + int rc = SLURM_SUCCESS; + + inv = get_full_inventory(version); + if (inv == NULL) { + error("BASIL %s INVENTORY failed", bv_names_long[version]); + return SLURM_ERROR; + } + + debug("BASIL %s INVENTORY: %d/%d batch nodes available", + bv_names_long[version], inv->batch_avail, inv->batch_total); + + if (!inv->f->node_head || !inv->batch_avail || !inv->batch_total) + rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; + + for (node = inv->f->node_head; node; node = node->next) { + struct node_record *node_ptr; + char *reason = NULL; + + node_ptr = find_node_by_basil_id(node->node_id); + if (node_ptr == NULL) { + error("nid%05u (%s node in state %s) not in slurm.conf", + node->node_id, nam_noderole[node->role], + nam_nodestate[node->state]); + continue; + } + + if (node->state == BNS_DOWN) { + reason = "ALPS marked it DOWN"; + } else if (node->state == BNS_UNAVAIL) { + reason = "node is UNAVAILABLE"; + } else if (node->state == BNS_ROUTE) { + reason = "node does ROUTING"; + } else if (node->state == BNS_SUSPECT) { + reason = "entered SUSPECT mode"; + } else if (node->state == BNS_ADMINDOWN) { + reason = "node is ADMINDOWN"; + } else if (node->state != BNS_UP) { + reason = "state not UP"; + } else if (node->role != BNR_BATCH) { + reason = "mode not BATCH"; + } else if (node->arch != BNA_XT) { + reason = "arch not XT/XE"; + } + + if (reason) { + xfree(node_ptr->reason); + + if (IS_NODE_DOWN(node_ptr)) { + node_ptr->reason = xstrdup(reason); + } else { + debug("MARKING %s DOWN (%s)", + node_ptr->name, reason); + set_node_down(node_ptr->name, reason); + } + } else if (IS_NODE_DOWN(node_ptr)) { + 
xfree(node_ptr->reason); + + /* Reset state, make_node_idle figures out the rest */ + node_ptr->node_state &= NODE_STATE_FLAGS; + node_ptr->node_state |= NODE_STATE_UNKNOWN; + + make_node_idle(node_ptr, NULL); + } + } + + /* + * Check that each ALPS reservation corresponds to a SLURM job. + * Purge orphaned reservations, which may result from stale or + * messed up system state, or are indicative of ALPS problems + * (stuck in pending cancel calls). + * Finding an orphan makes this call return an error code (rc is + * set below), since ALPS needs time to release the reservation. + */ + for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) { + ListIterator job_iter = list_iterator_create(job_list); + struct job_record *job_ptr; + uint32_t resv_id; + + if (job_iter == NULL) + fatal("list_iterator_create: malloc failure"); + + while ((job_ptr = (struct job_record *)list_next(job_iter))) { + + select_g_select_jobinfo_get(job_ptr->select_jobinfo, + SELECT_JOBDATA_RESV_ID, + &resv_id); + if (resv_id == rsvn->rsvn_id) + break; + } + list_iterator_destroy(job_iter); + + if (job_ptr == NULL) { + error("orphaned ALPS reservation %u, trying to remove", + rsvn->rsvn_id); + basil_release(rsvn->rsvn_id); + /* + * ALPS will take some time, do not schedule now. + */ + rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; + } + } + free_inv(inv); + return rc; +} + +/** + * do_basil_reserve - create a BASIL reservation. 
+ * IN job_ptr - pointer to job which has just been allocated resources + * RET 0 or error code, job will abort or be requeued on failure + */ +extern int do_basil_reserve(struct job_record *job_ptr) +{ + struct nodespec *ns_head = NULL; + uint16_t mppwidth = 0, mppdepth, mppnppn; + uint32_t mppmem = 0, node_min_mem = 0; + uint32_t resv_id; + int i, first_bit, last_bit; + hostlist_t hl; + long rc; + char *user, batch_id[16]; + + if (!job_ptr->job_resrcs || job_ptr->job_resrcs->nhosts == 0) + return SLURM_SUCCESS; + + debug3("job #%u: %u nodes = %s, cpus=%u" , job_ptr->job_id, + job_ptr->job_resrcs->nhosts, + job_ptr->job_resrcs->nodes, + job_ptr->job_resrcs->ncpus + ); + + if (job_ptr->job_resrcs->node_bitmap == NULL) { + error("job %u node_bitmap not set", job_ptr->job_id); + return SLURM_SUCCESS; + } + + first_bit = bit_ffs(job_ptr->job_resrcs->node_bitmap); + last_bit = bit_fls(job_ptr->job_resrcs->node_bitmap); + if (first_bit == -1 || last_bit == -1) + return SLURM_SUCCESS; /* no nodes allocated */ + + mppdepth = MAX(1, job_ptr->details->cpus_per_task); + mppnppn = job_ptr->details->ntasks_per_node; + + /* mppmem: taken from --mem-per-cpu here, or derived from the per-node minimum in the loop below */ + if (job_ptr->details->pn_min_memory & MEM_PER_CPU) { + /* Only honour --mem-per-cpu if --ntasks has been given */ + if (job_ptr->details->num_tasks) + mppmem = job_ptr->details->pn_min_memory & ~MEM_PER_CPU; + } else if (job_ptr->details->pn_min_memory) { + node_min_mem = job_ptr->details->pn_min_memory; + } + + hl = hostlist_create(""); /* NOTE(review): hl is never used or destroyed in this function */ + if (hl == NULL) + fatal("hostlist_create: malloc error"); + + for (i = first_bit; i <= last_bit; i++) { + struct node_record *node_ptr = node_record_table_ptr + i; + uint32_t basil_node_id; + + if (!bit_test(job_ptr->job_resrcs->node_bitmap, i)) + continue; + + if (!node_ptr->name) + continue; + if (sscanf(node_ptr->name, "nid%05u", &basil_node_id) != 1) + fatal("can not read basil_node_id from %s", node_ptr->name); + + if (ns_add_node(&ns_head, basil_node_id) != 0) { + error("can not add node %s (nid%05u)", 
node_ptr->name, basil_node_id); + free_nodespec(ns_head); + return SLURM_ERROR; + } + + if (node_min_mem) { + uint32_t node_cpus, node_mem; + + if (slurmctld_conf.fast_schedule) { + node_cpus = node_ptr->config_ptr->cpus; + node_mem = node_ptr->config_ptr->real_memory; + } else { + node_cpus = node_ptr->cpus; + node_mem = node_ptr->real_memory; + } + /* + * ALPS 'Processing Elements per Node' value (aprun -N), + * which in slurm is --ntasks-per-node and 'mppnppn' in + * PBS: if --ntasks-per-node is not set (mppnppn == 0), use the + * number of cores per node (also the default for 'aprun -N'). + */ + node_mem /= mppnppn ? mppnppn : node_cpus; + + mppmem = node_min_mem = MIN(node_mem, node_min_mem); + } + } + + /* mppwidth: sum over the allocated hosts of cpus / mppdepth, capped at mppnppn when set */ + for (i = 0; i < job_ptr->job_resrcs->nhosts; i++) { + uint16_t node_tasks = job_ptr->job_resrcs->cpus[i] / mppdepth; + + if (mppnppn && mppnppn < node_tasks) + node_tasks = mppnppn; + mppwidth += node_tasks; + } + + snprintf(batch_id, sizeof(batch_id), "%u", job_ptr->job_id); + user = uid_to_string(job_ptr->user_id); + rc = basil_reserve(user, batch_id, mppwidth, + mppdepth, mppnppn, mppmem, ns_head); /* NOTE(review): ns_head is not freed here; confirm basil_reserve() takes ownership */ + xfree(user); + if (rc <= 0) { + /* errno value will be resolved by select_g_job_begin() */ + errno = is_transient_error(rc) ? EAGAIN : ECONNABORTED; + return SLURM_ERROR; + } + + resv_id = rc; + set_select_jobinfo(job_ptr->select_jobinfo->data, + SELECT_JOBDATA_RESV_ID, &resv_id); + + info("ALPS RESERVATION #%u, JobId %u: BASIL -n %d -N %d -d %d -m %d", + resv_id, job_ptr->job_id, mppwidth, mppnppn, mppdepth, mppmem); /* NOTE(review): mppmem is uint32_t but formatted with %d */ + + return SLURM_SUCCESS; +} + +/** + * do_basil_confirm - confirm an existing BASIL reservation. 
+ * This requires the alloc_sid to equal the session ID (getsid()) of the process + * executing the aprun/mpirun commands + */ +extern int do_basil_confirm(struct job_record *job_ptr) +{ + uint32_t resv_id; /* NOTE(review): stays uninitialized if get_select_jobinfo() fails; its return value is ignored */ + + get_select_jobinfo(job_ptr->select_jobinfo->data, + SELECT_JOBDATA_RESV_ID, &resv_id); + + debug2("confirming ALPS resId %u for JobId %u with pagg %u", resv_id, + job_ptr->job_id, job_ptr->alloc_sid); + + return basil_confirm(resv_id, job_ptr->job_id, job_ptr->alloc_sid); +} + +/** + * do_basil_release - release an (unconfirmed) BASIL reservation + * IN job_ptr - pointer to job which has just been deallocated resources + * RET SLURM_SUCCESS, or SLURM_ERROR if basil_release() reports a failure + */ +extern int do_basil_release(struct job_record *job_ptr) +{ + uint32_t resv_id; /* NOTE(review): stays uninitialized if get_select_jobinfo() fails; its return value is ignored */ + + get_select_jobinfo(job_ptr->select_jobinfo->data, + SELECT_JOBDATA_RESV_ID, &resv_id); + + if (resv_id && basil_release(resv_id) < 0) + return SLURM_ERROR; + + debug("released ALPS resId %u for JobId %u", resv_id, job_ptr->job_id); /* also logged when resv_id is 0, i.e. nothing was released */ + + return SLURM_SUCCESS; +} diff --git a/src/plugins/select/cray/basil_interface.h b/src/plugins/select/cray/basil_interface.h new file mode 100644 index 00000000000..9201afc6ee3 --- /dev/null +++ b/src/plugins/select/cray/basil_interface.h @@ -0,0 +1,28 @@ +/* + * Interface between lower-level ALPS XML-RPC functions and SLURM. + * + * Copyright (c) 2010-11 Centro Svizzero di Calcolo Scientifico (CSCS) + * Licensed under GPLv2. 
+ */ +#ifndef __CRAY_BASIL_INTERFACE_H +#define __CRAY_BASIL_INTERFACE_H + +#if HAVE_CONFIG_H +# include "config.h" +#endif /* HAVE_CONFIG_H */ + +#include <slurm/slurm.h> +#include <slurm/slurm_errno.h> + +#include "src/common/log.h" +#include "src/common/uid.h" +#include "src/common/xstring.h" +#include "src/common/node_select.h" +#include "src/slurmctld/slurmctld.h" + +extern int basil_inventory(void); +extern int do_basil_reserve(struct job_record *job_ptr); +extern int do_basil_confirm(struct job_record *job_ptr); +extern int do_basil_release(struct job_record *job_ptr); + +#endif /* __CRAY_BASIL_INTERFACE_H */ diff --git a/src/plugins/select/cray/select_cray.c b/src/plugins/select/cray/select_cray.c index a8242e0967d..6223ca2cb59 100644 --- a/src/plugins/select/cray/select_cray.c +++ b/src/plugins/select/cray/select_cray.c @@ -53,6 +53,7 @@ #include <unistd.h> #include "other_select.h" +#include "basil_interface.h" #define NOT_FROM_CONTROLLER -2 /* These are defined here so when we link with something other than @@ -241,7 +242,11 @@ extern int select_p_job_test(struct job_record *job_ptr, bitstr_t *bitmap, extern int select_p_job_begin(struct job_record *job_ptr) { - + if (do_basil_reserve(job_ptr) != SLURM_SUCCESS) { + job_ptr->state_reason = WAIT_RESOURCES; + xfree(job_ptr->state_desc); + return SLURM_ERROR; + } return other_job_begin(job_ptr); } @@ -250,7 +255,6 @@ extern int select_p_job_ready(struct job_record *job_ptr) return other_job_ready(job_ptr); } - extern int select_p_job_resized(struct job_record *job_ptr, struct node_record *node_ptr) { @@ -259,6 +263,9 @@ extern int select_p_job_resized(struct job_record *job_ptr, extern int select_p_job_fini(struct job_record *job_ptr) { + /* Reservations of batch jobs are released by the stepdmanager */ + if (!job_ptr->batch_flag && do_basil_release(job_ptr) != SLURM_SUCCESS) + return SLURM_ERROR; return other_job_fini(job_ptr); } @@ -654,5 +661,7 @@ extern int select_p_alter_node_cnt(enum select_node_cnt 
type, void *data) extern int select_p_reconfigure(void) { + if (basil_inventory()) + return SLURM_ERROR; return other_reconfigure(); } -- GitLab