Skip to content
Snippets Groups Projects
Commit 2243bf33 authored by Marlys Kohnke's avatar Marlys Kohnke Committed by Danny Auble
Browse files

Initial commit for changes needed to make CCM work on a Cray XT.

parent 3644504c
No related branches found
No related tags found
No related merge requests found
...@@ -10,6 +10,6 @@ AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/src/common \ ...@@ -10,6 +10,6 @@ AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/src/common \
pkglib_LTLIBRARIES = select_cray.la pkglib_LTLIBRARIES = select_cray.la
# Cray node selection plugin. # Cray node selection plugin.
select_cray_la_SOURCES = select_cray.c select_cray_la_SOURCES = ccm.c ccm.h select_cray.c
select_cray_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) $(CRAY_SELECT_LDFLAGS) select_cray_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) $(CRAY_SELECT_LDFLAGS)
select_cray_la_LIBADD = ../other/libother_select.la select_cray_la_LIBADD = ../other/libother_select.la
...@@ -174,7 +174,7 @@ am__uninstall_files_from_dir = { \ ...@@ -174,7 +174,7 @@ am__uninstall_files_from_dir = { \
am__installdirs = "$(DESTDIR)$(pkglibdir)" am__installdirs = "$(DESTDIR)$(pkglibdir)"
LTLIBRARIES = $(pkglib_LTLIBRARIES) LTLIBRARIES = $(pkglib_LTLIBRARIES)
select_cray_la_DEPENDENCIES = ../other/libother_select.la select_cray_la_DEPENDENCIES = ../other/libother_select.la
am_select_cray_la_OBJECTS = select_cray.lo am_select_cray_la_OBJECTS = ccm.lo select_cray.lo
select_cray_la_OBJECTS = $(am_select_cray_la_OBJECTS) select_cray_la_OBJECTS = $(am_select_cray_la_OBJECTS)
AM_V_lt = $(am__v_lt_@AM_V@) AM_V_lt = $(am__v_lt_@AM_V@)
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
...@@ -509,7 +509,7 @@ AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/src/common \ ...@@ -509,7 +509,7 @@ AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/src/common \
pkglib_LTLIBRARIES = select_cray.la pkglib_LTLIBRARIES = select_cray.la
# Cray node selection plugin. # Cray node selection plugin.
select_cray_la_SOURCES = select_cray.c select_cray_la_SOURCES = ccm.c ccm.h select_cray.c
select_cray_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) $(CRAY_SELECT_LDFLAGS) select_cray_la_LDFLAGS = $(SO_LDFLAGS) $(PLUGIN_FLAGS) $(CRAY_SELECT_LDFLAGS)
select_cray_la_LIBADD = ../other/libother_select.la select_cray_la_LIBADD = ../other/libother_select.la
all: all-am all: all-am
...@@ -590,6 +590,7 @@ mostlyclean-compile: ...@@ -590,6 +590,7 @@ mostlyclean-compile:
distclean-compile: distclean-compile:
-rm -f *.tab.c -rm -f *.tab.c
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccm.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/select_cray.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/select_cray.Plo@am__quote@
.c.o: .c.o:
......
/*****************************************************************************\
* ccm.c - Cray CCM app ssh launch over the Aries interconnect; node
* selection plugin for cray systems.
*****************************************************************************
* Copyright (C) 2016 SchedMD LLC
* Copyright 2016 Cray Inc. All Rights Reserved.
* Written by Marlys Kohnke
*
* This file is part of SLURM, a resource management program.
* For details, see <http://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#define _GNU_SOURCE 1
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include "src/common/pack.h"
#include "src/slurmctld/locks.h"
#include "select_cray.h"
#if defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
/* CCM use */
#include <string.h>
#include "src/slurmctld/job_scheduler.h"
#include "src/slurmctld/srun_comm.h"
const char *ccm_prolog_path;
const char *ccm_epilog_path;
ccm_config_t ccm_config;
static int *_ccm_convert_nodelist(char *nodelist, int *node_cnt);
static char *_ccm_create_nidlist_file(ccm_info_t *ccm_info);
static char * _ccm_create_unique_file(char *uniqnm, int *fd,
ccm_info_t *ccm_info);
static int _parse_ccm_config(char *entry, char **ccm_partition);
static int _run_ccm_prolog_epilog(ccm_info_t *ccm_info, char *ccm_type,
const char *ccm_script);
/*
* Get the CCM configuration information.
*
*/
void _ccm_get_config()
{
char *err_msg = NULL, *ccm_env;
/* Alternate paths for testing purposes */
ccm_env = getenv("CCM_PROLOG");
if (ccm_env) {
ccm_prolog_path = xstrdup(ccm_env);
} else {
ccm_prolog_path = xstrdup(CCM_PROLOG_PATH);
}
ccm_env = getenv("CCM_EPILOG");
if (ccm_env) {
ccm_epilog_path = xstrdup(ccm_env);
} else {
ccm_epilog_path = xstrdup(CCM_EPILOG_PATH);
}
ccm_config.ccm_enabled = 0;
err_msg = _get_ccm_partition(&ccm_config);
if (err_msg) {
info("CCM ssh launch disabled, %s", err_msg);
} else {
if (ccm_config.num_ccm_partitions > 0) {
ccm_config.ccm_enabled = 1;
info("CCM prolog %s, epilog %s",
ccm_prolog_path, ccm_epilog_path);
}
}
return;
}
/*
 * Open the CCM config file and read the CCM_QUEUES list of partition
 * name(s).  Store the values in the global ccm_partition array.
 * This is done once per slurmctld startup.
 *
 * Upon success, returns NULL; otherwise, an error string (a pointer to a
 * static buffer, do not free) is returned.
 */
char *_get_ccm_partition(ccm_config_t *ccm_config)
{
	FILE *fp;
	ssize_t num_read;	/* getline() returns ssize_t, not size_t */
	size_t len = 0;		/* must be initialized for getline() */
	char *entry = NULL, *err_mesg = NULL, extra[2];
	char *ccm_file_name = "/etc/opt/cray/ccm/ccm.conf";
	static char err_buf[256];
	int i, num_ents = 0;

	ccm_config->num_ccm_partitions = 0;
	fp = fopen(ccm_file_name, "r");
	if (fp == NULL) {
		snprintf(err_buf, sizeof(err_buf),
			 "CCM unable to open %s, %m\n", ccm_file_name);
		err_mesg = err_buf;
		return err_mesg;
	}
	while ((num_read = getline(&entry, &len, fp)) != -1) {
		if (entry) {
			/* Strip the trailing newline, if any */
			if ((num_read > 0) && (entry[num_read - 1] == '\n')) {
				entry[num_read - 1] = '\0';
			}
			/* Only the CCM_QUEUES entry is of interest */
			if (strcasestr(entry, "CCM_QUEUES") == NULL) {
				continue;
			}
			/* Ignore a comment line */
			if (sscanf(entry, " %1[""#""]", extra) == 1) {
				continue;
			}
			num_ents = _parse_ccm_config(entry,
						     ccm_config->ccm_partition);
			if (num_ents <= 0) {
				snprintf(err_buf, sizeof(err_buf),
					 "CCM bad CCM_QUEUES %s in %s\n",
					 entry, ccm_file_name);
				err_mesg = err_buf;
				free(entry);
				fclose(fp);	/* was leaked on this path */
				return err_mesg;
			}
			ccm_config->num_ccm_partitions = num_ents;
			break;
		}
	}
	debug2("CCM _get_ccm_partition num_ents %d",
	       ccm_config->num_ccm_partitions);
	if (ccm_config->num_ccm_partitions > 0) {
		for (i = 0; i < ccm_config->num_ccm_partitions; i++) {
			debug2("CCM ccm_config->ccm_partition[%d] %s", i,
			       ccm_config->ccm_partition[i]);
		}
	}
	free(entry);
	fclose(fp);	/* was leaked on the success path as well */
	return err_mesg;
}
/*
 * Parse the CCM_QUEUES entry within the CCM config file.
 * CCM_QUEUES value is a string containing one or more partition names,
 * such as CCM_QUEUES="ccm_queue, ccm_queue_2"
 *
 * Upon success, the ccm_partition array is filled in (entries are
 * xmalloc'd; ownership passes to the caller) and the number of partition
 * names is returned.  For any error, -1 is returned.
 */
static int _parse_ccm_config(char *entry, char **ccm_partition)
{
	int num_ents = -1, i = 0, token_sz = 0;
	char *part_list, *token, *saveptr, *close_quote;
	char const *delims = " \t\n\v\f\r,";	/* whitespace + comma */

	/* Find start of tokens indicated by first " */
	part_list = strchr(entry, '"');
	if (part_list == NULL) {
		debug("CCM part_list invalid config entry %s", entry);
		return num_ents;
	}
	/* Move past initial " and ignore anything after the end matching " */
	close_quote = strchr(part_list + 1, '"');
	if (close_quote == NULL) {
		debug("CCM tmp invalid config entry %s", part_list + 1);
		return num_ents;
	}
	*close_quote = '\0';
	token = strtok_r(part_list + 1, delims, &saveptr);
	while ((token != NULL) && (i < CCM_PARTITION_MAX)) {
		token_sz = strlen(token);
		/* Defensively strip off an ending " */
		if (token[token_sz - 1] == '"') {
			token[token_sz - 1] = '\0';
		}
		if (strlen(token) > 0) {
			/* xstrdup instead of hand-rolled xmalloc+strcpy */
			ccm_partition[i] = xstrdup(token);
			i++;
		}
		token = strtok_r(NULL, delims, &saveptr);
	}
	num_ents = i;
	return num_ents;
}
/*
 * Release the dynamically allocated fields inside a ccm_info structure.
 * The structure itself belongs to the caller.
 */
static void free_ccm_info(ccm_info_t *ccm_info)
{
	xfree(ccm_info->nodelist);
	xfree(ccm_info->cpus_per_node);
	xfree(ccm_info->cpu_count_reps);
}
/*
 * Take an input string of hostnames (i.e. nid00050) and convert that string
 * into an array of integers (i.e. 50).  For success, returns an xmalloc'd
 * integer array (caller must xfree) and sets nid_cnt to the number of
 * array entries; for an error, returns NULL and nid_cnt is set to -1.
 */
static int * _ccm_convert_nodelist(char *nodelist, int *nid_cnt)
{
	hostlist_t hl;
	int i, cnt = -1, *nid_array = NULL;
	char *nidname, *nidstr = NULL;

	/*
	 * The documented contract is that nid_cnt is -1 on error; the
	 * original never actually set it on the error paths.
	 */
	*nid_cnt = -1;
	if (!(hl = hostlist_create(nodelist))) {
		CRAY_ERR("CCM hostlist_create error on %s", nodelist);
		return NULL;
	}
	if (!(cnt = hostlist_count(hl))) {
		CRAY_ERR("CCM nodelist %s hostlist_count cnt %d",
			 nodelist, cnt);
		hostlist_destroy(hl);
		return NULL;
	}
	i = 0;
	/* Size by the array's actual element type, not sizeof(int32_t) */
	nid_array = xmalloc(cnt * sizeof(*nid_array));
	while ((nidname = hostlist_shift(hl))) {
		if (!(nidstr = strpbrk(nidname, "0123456789"))) {
			CRAY_ERR("CCM unexpected format nidname %s", nidname);
			free(nidname);	/* hostlist_shift result uses free() */
			xfree(nid_array);
			hostlist_destroy(hl);
			return NULL;
		}
		nid_array[i++] = atoi(nidstr);
		free(nidname);
	}
	hostlist_destroy(hl);
	*nid_cnt = cnt;
	return nid_array;
}
/*
 * Create a unique nidlist file from the mkstemp template in uniqnm.
 * On success, returns the xstrdup'd file name (caller frees) and stores
 * the open descriptor in *uniq_fd; on any error, returns NULL and
 * *uniq_fd is -1.
 */
static char * _ccm_create_unique_file(char *uniqnm, int *uniq_fd,
				      ccm_info_t *ccm_info)
{
	mode_t prev_umask;
	int fd;
	char *filenm;

	/*
	 * mkstemp() rewrites its template in place, so work on a private
	 * copy; its name is later handed to the CCM prolog/epilog via an
	 * environment variable.
	 */
	*uniq_fd = -1;
	filenm = xstrdup(uniqnm);
	prev_umask = umask(0);
	fd = mkstemp(filenm);
	(void)umask(prev_umask);
	if (fd < 0) {
		CRAY_ERR("CCM job %u unable to mkstemp %s, %m",
			 ccm_info->job_id, uniqnm);
		xfree(filenm);		/* xfree NULLs filenm */
		return filenm;
	}
	/* World-readable so the script run as the user can read it */
	if (fchmod(fd, 0644) < 0) {
		CRAY_ERR("CCM job %u file %s, fd %d, fchmod error, %m",
			 ccm_info->job_id, uniqnm, fd);
		close(fd);
		xfree(filenm);		/* xfree NULLs filenm */
		return filenm;
	}
	*uniq_fd = fd;
	return filenm;
}
/*
 * Fill in a nodelist file with one nid entry per PE (exec'd app process).
 * For 2 PEs running on nid 36 and 1 on nid 100, the file contents are:
 * 36
 * 36
 * 100
 *
 * The name of the unique nodelist file is returned; NULL is returned
 * for any error.  The caller is responsible to free the returned file name.
 */
static char *_ccm_create_nidlist_file(ccm_info_t *ccm_info)
{
	int i, j, fd = -1, nodecnt = 0, *nodes = NULL;
	char *unique_filenm = NULL;
	FILE *tmp_fp = NULL;
	slurm_step_layout_t *step_layout = NULL;

	/*
	 * Create a unique temp file; name of the file will be passed
	 * in an env variable to the CCM prolog/epilog.
	 */
	unique_filenm = _ccm_create_unique_file(CCM_CRAY_UNIQUE_FILENAME, &fd,
						ccm_info);
	if (!unique_filenm) {
		return NULL;
	}
	if ((tmp_fp = fdopen(fd, "w")) == NULL) {
		CRAY_ERR("CCM job %u file %s, fd %d, fdopen error %m",
			 ccm_info->job_id, unique_filenm, fd);
		close(fd);
		/* Don't leave the temp file behind (original leaked it) */
		(void) unlink(unique_filenm);
		xfree(unique_filenm);
		return NULL;
	}
	/* Convert the nodelist into an array of nids */
	nodes = _ccm_convert_nodelist(ccm_info->nodelist, &nodecnt);
	debug("CCM job %u nodelist %s, nodecnt %d", ccm_info->job_id,
	      ccm_info->nodelist, nodecnt);
	if (!nodes) {
		fclose(tmp_fp);
		(void) unlink(unique_filenm);
		xfree(unique_filenm);
		return NULL;
	}
	for (i = 0; i < nodecnt; i++) {
		debug3("CCM job %u nodes[%d] is %d",
		       ccm_info->job_id, i, nodes[i]);
	}
	/* Determine how many PEs(tasks) will be run on each node */
	step_layout = slurm_step_layout_create(ccm_info->nodelist,
					       ccm_info->cpus_per_node,
					       ccm_info->cpu_count_reps,
					       ccm_info->node_cnt,
					       ccm_info->num_tasks,
					       ccm_info->cpus_per_task,
					       ccm_info->task_dist,
					       ccm_info->plane_size);
	if (!step_layout) {
		CRAY_ERR("CCM job %u slurm_step_layout_create failure",
			 ccm_info->job_id);
		fclose(tmp_fp);
		(void) unlink(unique_filenm);
		xfree(unique_filenm);
		xfree(nodes);
		return NULL;
	}
	debug2("CCM job %u step_layout node_cnt %d", ccm_info->job_id,
	       step_layout->node_cnt);
	/* Fill in the nodelist file with an entry per PE */
	for (i = 0; i < step_layout->node_cnt; i++) {
		debug2("CCM job %u step_layout nodes[%d] %d, tasks[%d] %d",
		       ccm_info->job_id, i, nodes[i], i,
		       step_layout->tasks[i]);
		for (j = 0; j < step_layout->tasks[i]; j++) {
			fprintf(tmp_fp, "%d\n", nodes[i]);
			debug3("CCM job %u nodelist file step tasks[%d] %d, "
			       "j %d nodes[%d] %d", ccm_info->job_id, i,
			       step_layout->tasks[i], j, i, nodes[i]);
		}
	}
	slurm_step_layout_destroy(step_layout);
	fclose(tmp_fp);
	xfree(nodes);
	/* "unique_filemn" typo fixed in the message below */
	debug2("CCM job %u unique_filenm %s", ccm_info->job_id, unique_filenm);
	return unique_filenm;
}
/*
* Create a detached pthread to handle CCM prolog and epilog
* activities. This is only called if CCM is enabled and the batch job has
* been identified as coming from a CCM partition.
*/
void _spawn_ccm_thread(
void *obj_ptr, void *(*start_routine) (void *))
{
pthread_attr_t attr_agent;
pthread_t thread_agent;
int retries;
struct job_record *job_ptr = (struct job_record *)obj_ptr;
/* spawn a pthread to start CCM prolog or epilog activities */
slurm_attr_init(&attr_agent);
if (pthread_attr_setdetachstate(&attr_agent, PTHREAD_CREATE_DETACHED)) {
CRAY_ERR("CCM job %u pthread_attr_setdetachstate error %m",
job_ptr->job_id);
}
retries = 0;
while (pthread_create(&thread_agent, &attr_agent,
start_routine, obj_ptr)) {
CRAY_ERR("CCM job_id %u pthread_create error %m",
job_ptr->job_id);
if (++retries > CCM_MAX_PTHREAD_RETRIES) {
if (strcasecmp((char *)start_routine,
"_ccm_begin") == 0) {
/* Decrement so job launch can continue */
debug("CCM job %u prolog_running_decr, cur %d",
job_ptr->job_id,
job_ptr->details->prolog_running);
prolog_running_decr(job_ptr);
}
fatal("CCM job %u _spawn_ccm_thread can't create "
"pthread", job_ptr->job_id);
}
usleep(100000); /* sleep 1/10th second and retry */
}
slurm_attr_destroy(&attr_agent);
return;
}
/*
 * Set up the appropriate environment to run the CCM prolog and epilog
 * scripts.  Fork a child to exec the script; wait for the child to
 * complete.  Returns 0 when the script exits successfully, 1 otherwise,
 * so the caller can decide what further action to take (e.g. kill the
 * job after a failed prolog).
 */
static int _run_ccm_prolog_epilog(ccm_info_t *ccm_info, char *ccm_type,
				  const char *ccm_script)
{
	int cpid, ret, status, wait_rc, kill = 1;
	char strval[128], *nid_list_file = NULL;
	const char *argv[4];
	DEF_TIMERS;

	START_TIMER;
	/* Only the prolog needs the per-PE nidlist file */
	if (strcasecmp(ccm_type, "prolog") == 0) {
		nid_list_file = _ccm_create_nidlist_file(ccm_info);
		if (nid_list_file == NULL) {
			CRAY_ERR("CCM job %u unable to create nidlist file",
				 ccm_info->job_id);
			return kill;
		}
	}
	/* Fork a child to exec the CCM script */
	cpid = fork();
	if (cpid < 0) {
		CRAY_ERR("CCM job %u %s fork failed, %m", ccm_info->job_id,
			 ccm_type);
		if (nid_list_file) {
			ret = unlink(nid_list_file);
			if (ret == -1) {
				CRAY_ERR("CCM job %u unable to unlink %s",
					 ccm_info->job_id, nid_list_file);
			}
			xfree(nid_list_file);
		}
		return kill;
	} else if (cpid == 0) {
		/* child: detach from the controlling session */
		setsid();
		setpgrp();
		/* Pass job info to the script through the environment */
		snprintf(strval, sizeof(strval), "%u", ccm_info->job_id);
		setenv("ALPS_PREP_BATCHID", strval, 1);
		snprintf(strval, sizeof(strval), "%u", ccm_info->user_id);
		setenv("ALPS_PREP_UID", strval, 1);
		if (nid_list_file) {
			setenv("ALPS_PREP_NIDFILE",
			       (const char *)nid_list_file, 1);
		}
		argv[0] = "sh";
		argv[1] = "-c";
		argv[2] = ccm_script;
		argv[3] = NULL;
		debug("CCM job %u invoking %s %s", ccm_info->job_id, ccm_type,
		      ccm_script);
		execv("/bin/sh", (char *const *)argv);
		CRAY_ERR("CCM job %u %s %s execv failed, %m",
			 ccm_info->job_id, ccm_type, ccm_script);
		/*
		 * _exit(), not exit(): a forked child must not flush the
		 * parent's duplicated stdio buffers or run the parent's
		 * atexit() handlers.
		 */
		_exit(127);
	} else {
		/* parent: reap the child, retrying on EINTR */
		while (1) {
			wait_rc = waitpid(cpid, &status, 0);
			if (wait_rc > 0) {
				if (WIFEXITED(status)) {
					ret = WEXITSTATUS(status);
					if (ret != 0) {
						info("CCM job %u %s waitpid "
						     "ret %d",
						     ccm_info->job_id,
						     ccm_type, ret);
					} else {
						kill = 0;
					}
				} else if (WIFSIGNALED(status)) {
					info("CCM job %u %s received "
					     "signal %d", ccm_info->job_id,
					     ccm_type, WTERMSIG(status));
				} else {
					/* Success */
					kill = 0;
				}
				break;
			} else if (wait_rc < 0) {
				if (errno == EINTR) {
					continue;
				}
				CRAY_ERR("CCM job %u %s waitpid error %m",
					 ccm_info->job_id, ccm_type);
				break;
			}
		}
		/* Remove the nidlist file now that the script is done */
		if (nid_list_file) {
			ret = unlink(nid_list_file);
			if (ret == -1) {
				info("CCM job %u unable to unlink %s, %m",
				     ccm_info->job_id, nid_list_file);
			}
			xfree(nid_list_file);
		}
	}
	END_TIMER;
	debug("CCM job %u %s completed in %s", ccm_info->job_id, ccm_type,
	      TIME_STR);
	return kill;
}
/*
 * Check if this batch job is being started from a CCM partition.
 * Returns 1 if so, otherwise 0.
 */
int _ccm_check_partitions(struct job_record *job_ptr)
{
	int i;
	char *partition = job_ptr->partition;

	debug2("CCM job %u _ccm_check_partitions partition %s",
	       job_ptr->job_id, partition);
	/* Case-insensitive match against the configured CCM partitions */
	for (i = 0; i < ccm_config.num_ccm_partitions; i++) {
		if (strcasecmp(partition, ccm_config.ccm_partition[i]) == 0)
			return 1;
	}
	return 0;
}
/*
 * As applicable, this is run at batch job start to gather info for the
 * CCM prolog activities. If the CCM prolog fails, the job will be
 * killed.
 *
 * Runs as a detached pthread (see _spawn_ccm_thread); args is the
 * job_record.  Snapshots the job info into a local ccm_info_t under the
 * job read lock, runs the CCM prolog outside the lock, then decrements
 * prolog_running so the delayed launch can proceed.
 */
void *_ccm_begin(void *args)
{
	int i, j, num_ents, kill = 1;
	size_t copysz;
	ccm_info_t ccm_info;
	char err_str_buf[128], srun_msg_buf[256];
	struct job_record *job_ptr = (struct job_record *)args;
	slurmctld_lock_t job_read_lock =
		{NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };

	debug2("CCM job %u_ccm_begin partition %s", job_ptr->job_id,
	       job_ptr->partition);
	memset(&ccm_info, 0, sizeof(ccm_info_t));
	/* Copy everything needed out of job_ptr while holding the lock */
	lock_slurmctld(job_read_lock);
	ccm_info.job_id = job_ptr->job_id;
	ccm_info.user_id = job_ptr->user_id;
	ccm_info.nodelist = xstrdup(job_ptr->nodes);
	ccm_info.node_cnt = job_ptr->node_cnt;
	ccm_info.cpus_per_task = job_ptr->details->cpus_per_task;
	/* 0 means unspecified; treat as one cpu per task */
	if (ccm_info.cpus_per_task == 0) {
		ccm_info.cpus_per_task = 1;
	}
	ccm_info.num_cpu_groups = job_ptr->job_resrcs->cpu_array_cnt;
	copysz = ccm_info.num_cpu_groups * sizeof(uint16_t);
	ccm_info.cpus_per_node = xmalloc(copysz);
	memcpy(ccm_info.cpus_per_node, job_ptr->job_resrcs->cpu_array_value,
	       copysz);
	copysz = ccm_info.num_cpu_groups * sizeof(uint32_t);
	ccm_info.cpu_count_reps = xmalloc(copysz);
	memcpy(ccm_info.cpu_count_reps, job_ptr->job_resrcs->cpu_array_reps,
	       copysz);
	ccm_info.num_tasks = job_ptr->details->num_tasks;
	/* If no task count was given, derive one from the cpu count */
	if (ccm_info.num_tasks == 0) {
		ccm_info.num_tasks =
			job_ptr->cpu_cnt / ccm_info.cpus_per_task;
		debug("CCM job %u ccm_info.num_tasks was 0; now %d",
		      job_ptr->job_id, ccm_info.num_tasks);
	}
	/*
	 * When task_dist is set to PLANE, the plane_size is still 0.
	 * This causes a failure later with the slurm_step_layout_create()
	 * call. Both task_dist and plane_size are arguments to that
	 * procedure call used to get the number of tasks for each node.
	 * I don't think either value impacts that, but leave this code
	 * here.
	 */
	if ((job_ptr->details->task_dist == 0) ||
	    (job_ptr->details->task_dist > SLURM_DIST_UNKNOWN) ||
	    (job_ptr->details->task_dist == SLURM_DIST_PLANE)) {
		ccm_info.task_dist = SLURM_DIST_BLOCK;
		debug("CCM job %u job task_dist %d, CCM using SLURM_DIST_BLOCK",
		      job_ptr->job_id, job_ptr->details->task_dist);
	} else {
		ccm_info.task_dist = job_ptr->details->task_dist;
	}
	ccm_info.plane_size = job_ptr->details->plane_size;
	unlock_slurmctld(job_read_lock);
	debug("CCM job %u, user_id %u, nodelist %s, node_cnt %d, "
	      "num_tasks %d", ccm_info.job_id, ccm_info.user_id,
	      ccm_info.nodelist, ccm_info.node_cnt, ccm_info.num_tasks);
	debug("CCM job %u cpus_per_task %d, task_dist %u, plane_size %d",
	      ccm_info.job_id, ccm_info.cpus_per_task, ccm_info.task_dist,
	      ccm_info.plane_size);
	/* Sanity check: the rep counts must expand to exactly node_cnt */
	for (i = num_ents = 0; i < ccm_info.num_cpu_groups; i++) {
		for (j = 0; j < ccm_info.cpu_count_reps[i]; j++) {
			debug3("CCM job %u cpus_per_node[%d] %d, i %d, j %d",
			       ccm_info.job_id, num_ents,
			       ccm_info.cpus_per_node[i], i, j);
			num_ents++;
		}
	}
	if (ccm_info.node_cnt != num_ents) {
		CRAY_ERR("CCM job %u ccm_info.node_cnt %d doesn't match the "
			 "number of cpu_count_reps entries %d",
			 job_ptr->job_id, ccm_info.node_cnt, num_ents);
		snprintf(err_str_buf, sizeof(err_str_buf),
			 "node_cnt %d != cpu_count_reps %d, prolog not run",
			 ccm_info.node_cnt, num_ents);
	} else {
		kill = _run_ccm_prolog_epilog(&ccm_info, "prolog",
					      ccm_prolog_path);
		/* err_str_buf is only reported when kill is still set */
		snprintf(err_str_buf, sizeof(err_str_buf),
			 "prolog failed");
	}
	/* Paired with the increment in select_p_job_begin(); allows launch */
	debug("CCM _ccm_begin job %u prolog_running_decr, cur %d",
	      ccm_info.job_id, job_ptr->details->prolog_running);
	prolog_running_decr(job_ptr);
	if (kill) {
		/* Stop the launch */
		CRAY_ERR("CCM %s, job %u killed", err_str_buf, job_ptr->job_id);
		snprintf(srun_msg_buf, sizeof(srun_msg_buf),
			 "CCM %s, job %u killed", err_str_buf, ccm_info.job_id);
		srun_user_message(job_ptr, srun_msg_buf);
		(void) job_signal(job_ptr->job_id, SIGKILL, 0, 0, false);
	}
	/* Free the malloc'd fields within this structure */
	free_ccm_info(&ccm_info);
	return NULL;
}
/*
 * As applicable, this is run at batch job exit to provide info for the
 * CCM epilog activities. The epilog only needs the job id and user id.
 * If the CCM prolog is still executing, delay starting the CCM epilog
 * to prevent bad interactions between the two. Delay up to
 * CCM_MAX_EPILOG_DELAY seconds.
 *
 * Runs as a detached pthread (see _spawn_ccm_thread); args is the
 * job_record.
 */
void *_ccm_fini(void *args)
{
	int rc;
	ccm_info_t ccm_info;
	time_t delay;
	struct job_record *job_ptr = (struct job_record *)args;
	slurmctld_lock_t job_read_lock =
		{NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };

	/* Snapshot the ids the epilog needs under the job read lock */
	memset(&ccm_info, 0, sizeof(ccm_info_t));
	lock_slurmctld(job_read_lock);
	ccm_info.job_id = job_ptr->job_id;
	ccm_info.user_id = job_ptr->user_id;
	unlock_slurmctld(job_read_lock);
	/*
	 * Delay starting the CCM epilog if the CCM prolog may still be
	 * running.
	 *
	 * NOTE(review): prolog_running is polled here without holding the
	 * job lock — presumably tolerable as a heuristic delay since it is
	 * bounded by CCM_MAX_EPILOG_DELAY; confirm against slurmctld
	 * locking conventions.
	 */
	if (job_ptr->details && (job_ptr->details->prolog_running > 0)) {
		delay = time(0) + CCM_MAX_EPILOG_DELAY;
		info("CCM job %u epilog delayed; prolog_running %d",
		     ccm_info.job_id, job_ptr->details->prolog_running);
		while (job_ptr->details->prolog_running > 0) {
			usleep(100000);	/* poll every 1/10th second */
			if (time(0) >= delay) {
				info("CCM job %u epilog max delay; "
				     "running epilog", ccm_info.job_id);
				break;
			}
		}
	}
	debug2("CCM epilog job %u, user_id %u", ccm_info.job_id,
	       ccm_info.user_id);
	rc = _run_ccm_prolog_epilog(&ccm_info, "epilog", ccm_epilog_path);
	if (rc) {
		/* Log a failure, no further action to take */
		CRAY_ERR("CCM job %u epilog failed", ccm_info.job_id);
	}
	return NULL;
}
#endif
/*****************************************************************************\
* ccm.h
*****************************************************************************
* Copyright (C) 2016 SchedMD LLC
* Copyright 2016 Cray Inc. All Rights Reserved.
* Written by Marlys Kohnke
*
* This file is part of SLURM, a resource management program.
* For details, see <http://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* SLURM is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with SLURM; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#ifndef SELECT_CRAY_CCM_H
#define SELECT_CRAY_CCM_H

#include "src/common/slurm_xlator.h"

#ifdef HAVE_CONFIG_H
#  include "config.h"
#  if HAVE_STDINT_H
#    include <stdint.h>
#  endif
#  if HAVE_INTTYPES_H
#    include <inttypes.h>
#  endif
#  if WITH_PTHREADS
#    include <pthread.h>
#  endif
#endif

#if defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
/* CCM use */
#include <string.h>
#include "src/slurmctld/job_scheduler.h"
#include "src/slurmctld/srun_comm.h"

/*
 * CCM will be used to provide app ssh launch using the Cray network
 * interconnect.
 */
#define CCM_MAX_EPILOG_DELAY	30	/* secs to wait for a running prolog */
#define CCM_MAX_PTHREAD_RETRIES	6	/* pthread_create() attempts */
#define CCM_PARTITION_MAX	32	/* max CCM_QUEUES partition names */
#define CCM_CRAY_UNIQUE_FILENAME "/tmp/crayCCMXXXXXX"	/* mkstemp template */
#define CCM_PROLOG_PATH "/opt/cray/ccm/default/etc/ccm-prologue"
#define CCM_EPILOG_PATH "/opt/cray/ccm/default/etc/ccm-epilogue"

/* CCM configuration, read once at slurmctld startup from ccm.conf */
typedef struct {
	char *ccm_partition[CCM_PARTITION_MAX];
	int num_ccm_partitions;	/* entries used in ccm_partition[] */
	int ccm_enabled;	/* 1 when at least one CCM partition exists */
} ccm_config_t;

extern ccm_config_t ccm_config;
extern const char *ccm_prolog_path;
extern const char *ccm_epilog_path;

/* Per-job information passed to the CCM prolog/epilog handling */
typedef struct {
	uint32_t job_id;
	uint32_t user_id;
	uint32_t node_cnt;	/* Number of allocated nodes */
	uint32_t num_tasks;	/* Number of app PEs/tasks to exec */
	uint32_t num_cpu_groups;	/* Number of entries in cpus arrays */
	uint32_t *cpu_count_reps;	/* Number of reps of each cpu count */
	uint16_t *cpus_per_node;	/* Number of cpus per node */
	uint16_t cpus_per_task;	/* Number of cpus per app task/PE */
	uint16_t task_dist;
	uint16_t plane_size;
	char *nodelist;		/* Allocated node hostname list */
} ccm_info_t;

/*
 * Log an error with file/line/function context.  Deliberately no trailing
 * semicolon (the original macro carried one, which expands to an extra
 * empty statement and breaks unbraced if/else bodies); callers supply
 * their own ';'.
 */
#define CRAY_ERR(fmt, ...) error("(%s: %d: %s) "fmt, THIS_FILE, __LINE__, \
				 __FUNCTION__, ##__VA_ARGS__)

extern void _ccm_get_config();
extern int _ccm_check_partitions(struct job_record *job_ptr);
extern void *_ccm_begin(void *args);
extern void *_ccm_fini(void *args);
extern char *_get_ccm_partition(ccm_config_t *ccm_config);
extern void _spawn_ccm_thread(void *obj_ptr, void *(*start_routine) (void *));
#endif

#endif /* SELECT_CRAY_CCM_H */
...@@ -37,19 +37,6 @@ ...@@ -37,19 +37,6 @@
#define _GNU_SOURCE 1 #define _GNU_SOURCE 1
#ifdef HAVE_CONFIG_H
# include "config.h"
# if HAVE_STDINT_H
# include <stdint.h>
# endif
# if HAVE_INTTYPES_H
# include <inttypes.h>
# endif
# if WITH_PTHREADS
# include <pthread.h>
# endif
#endif
#include <stdio.h> #include <stdio.h>
#include <poll.h> #include <poll.h>
#include <sys/types.h> #include <sys/types.h>
...@@ -63,6 +50,7 @@ ...@@ -63,6 +50,7 @@
#include "src/slurmctld/burst_buffer.h" #include "src/slurmctld/burst_buffer.h"
#include "src/slurmctld/locks.h" #include "src/slurmctld/locks.h"
#include "other_select.h" #include "other_select.h"
#include "ccm.h"
#ifdef HAVE_NATIVE_CRAY #ifdef HAVE_NATIVE_CRAY
#include "alpscomm_sn.h" #include "alpscomm_sn.h"
...@@ -626,21 +614,21 @@ static void _initialize_event(alpsc_ev_app_t *event, ...@@ -626,21 +614,21 @@ static void _initialize_event(alpsc_ev_app_t *event,
*/ */
static void _copy_event(alpsc_ev_app_t *dest, alpsc_ev_app_t *src) static void _copy_event(alpsc_ev_app_t *dest, alpsc_ev_app_t *src)
{ {
dest->apid = src->apid; dest->apid = src->apid;
dest->uid = src->uid; dest->uid = src->uid;
dest->app_name = xstrdup(src->app_name); dest->app_name = xstrdup(src->app_name);
dest->batch_id = xstrdup(src->batch_id); dest->batch_id = xstrdup(src->batch_id);
dest->state = src->state; dest->state = src->state;
if (src->num_nodes > 0 && src->nodes != NULL) { if (src->num_nodes > 0 && src->nodes != NULL) {
dest->nodes = xmalloc(src->num_nodes * sizeof(int32_t)); dest->nodes = xmalloc(src->num_nodes * sizeof(int32_t));
memcpy(dest->nodes, src->nodes, memcpy(dest->nodes, src->nodes,
src->num_nodes * sizeof(int32_t)); src->num_nodes * sizeof(int32_t));
dest->num_nodes = src->num_nodes; dest->num_nodes = src->num_nodes;
} else { } else {
dest->nodes = NULL; dest->nodes = NULL;
dest->num_nodes = 0; dest->num_nodes = 0;
} }
return; return;
} }
/* /*
...@@ -1249,6 +1237,13 @@ extern int init ( void ) ...@@ -1249,6 +1237,13 @@ extern int init ( void )
plugin_id = SELECT_PLUGIN_CRAY_CONS_RES; plugin_id = SELECT_PLUGIN_CRAY_CONS_RES;
debug_flags = slurm_get_debug_flags(); debug_flags = slurm_get_debug_flags();
#if defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
/* Read and store the CCM configured partition name(s). */
if (run_in_daemon("slurmctld")) {
/* Get any CCM configuration information */
_ccm_get_config();
}
#endif
if (!slurmctld_primary && run_in_daemon("slurmctld")) { if (!slurmctld_primary && run_in_daemon("slurmctld")) {
START_TIMER; START_TIMER;
if (slurmctld_config.scheduling_disabled) { if (slurmctld_config.scheduling_disabled) {
...@@ -1494,7 +1489,7 @@ extern int select_p_state_restore(char *dir_name) ...@@ -1494,7 +1489,7 @@ extern int select_p_state_restore(char *dir_name)
for (j=0; j<blade_cnt; j++) { for (j=0; j<blade_cnt; j++) {
if (blade_info.id == blade_array[j].id) { if (blade_info.id == blade_array[j].id) {
/* blade_array[j].job_cnt = */ /* blade_array[j].job_cnt = */
/* blade_info.job_cnt; */ /* blade_info.job_cnt; */
if (!bit_equal(blade_array[j]. if (!bit_equal(blade_array[j].
node_bitmap, node_bitmap,
blade_info.node_bitmap)) blade_info.node_bitmap))
...@@ -1879,13 +1874,52 @@ extern int select_p_job_begin(struct job_record *job_ptr) ...@@ -1879,13 +1874,52 @@ extern int select_p_job_begin(struct job_record *job_ptr)
/* xfree(tmp3); */ /* xfree(tmp3); */
slurm_mutex_unlock(&blade_mutex); slurm_mutex_unlock(&blade_mutex);
#if defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
if (ccm_config.ccm_enabled) {
/*
* Create a thread to do setup activity before running the
* CCM prolog for a CCM partition.
*/
if (_ccm_check_partitions(job_ptr)) {
if (job_ptr->details == NULL) {
/* This info is required; abort the launch */
CRAY_ERR("CCM prolog missing job details, "
"job %u killed", job_ptr->job_id);
srun_user_message(job_ptr,
"CCM prolog missing job details, killed");
(void) job_signal(job_ptr->job_id, SIGKILL, 0,
0, false);
} else {
/* Delay job launch until CCM prolog is done */
debug("CCM job %u increment prolog_running, "
"current %d", job_ptr->job_id,
job_ptr->details->prolog_running);
job_ptr->details->prolog_running++;
/* Cleared in prolog_running_decr() */
debug("CCM job %u setting JOB_CONFIGURING",
job_ptr->job_id);
job_ptr->job_state |= JOB_CONFIGURING;
_spawn_ccm_thread(job_ptr, _ccm_begin);
}
}
}
#endif
return other_job_begin(job_ptr); return other_job_begin(job_ptr);
} }
extern int select_p_job_ready(struct job_record *job_ptr) extern int select_p_job_ready(struct job_record *job_ptr)
{ {
xassert(job_ptr); xassert(job_ptr);
#if defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
if (_ccm_check_partitions(job_ptr)) {
/* Delay CCM job launch until CCM prolog is done */
if (IS_JOB_CONFIGURING(job_ptr)) {
debug("CCM job %u job configuring set; job not ready",
job_ptr->job_id);
return READY_JOB_ERROR;
}
}
#endif
return other_job_ready(job_ptr); return other_job_ready(job_ptr);
} }
...@@ -1917,7 +1951,14 @@ extern int select_p_job_fini(struct job_record *job_ptr) ...@@ -1917,7 +1951,14 @@ extern int select_p_job_fini(struct job_record *job_ptr)
{ {
select_jobinfo_t *jobinfo = job_ptr->select_jobinfo->data; select_jobinfo_t *jobinfo = job_ptr->select_jobinfo->data;
#if defined(HAVE_NATIVE_CRAY) && !defined(HAVE_CRAY_NETWORK)
/* Create a thread to run the CCM epilog for a CCM partition */
if (ccm_config.ccm_enabled) {
if (_ccm_check_partitions(job_ptr)) {
_spawn_ccm_thread(job_ptr, _ccm_fini);
}
}
#endif
if (slurmctld_conf.select_type_param & CR_NHC_NO) { if (slurmctld_conf.select_type_param & CR_NHC_NO) {
debug3("NHC_No set, not running NHC after allocations"); debug3("NHC_No set, not running NHC after allocations");
other_job_fini(job_ptr); other_job_fini(job_ptr);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment