Commit ed73d371 authored by Robert Dietrich

added generic prolog and epilog scripts

parent ccaf213e
#!/bin/bash
#if [ "$SLURM_JOB_USER" != "rdietric" ] && [ "$SLURM_JOB_USER" != "fwinkler" ]; then
# exit 0
#fi
#if [[ $HOSTNAME = taurusi7* ]] && [ "$SLURM_JOB_USER" != "rdietric" ]; then
# exit 0
#fi
# (1) Get PIKA environment variables
source /sw/taurus/tools/pika/pika-current.conf
export PIKA_JOB_ID=${SLURM_JOB_ID}
export PIKA_JOB_NODELIST=${SLURM_NODELIST}
# (2.1) Check if prolog script was called
if [ -e ${LOCAL_STORE}/pika_prolog_${PIKA_JOB_ID} ]; then
rm -f ${LOCAL_STORE}/pika_prolog_${PIKA_JOB_ID}
else
exit 0
fi
# (2.2) Check for debug file
if [ "${PIKA_DEBUG}" == "1" ]; then
mkdir -p /tmp/pika_debug
DEBUG_PATH=/tmp/pika_debug/pika_${PIKA_JOB_ID}
echo -e "\nStart epilog debugging..." >> $DEBUG_PATH 2>&1
else
DEBUG_PATH=/dev/null
fi
# (3) Read SLURM environment from file (created during the prolog)
BATCHSYSTEM_ENV_FILE=${LOCAL_STORE}/pika_batchsystem_env_${PIKA_JOB_ID}
# (4) Determine master node
if [ -x "$(command -v ${PYTHON_ROOT}/bin/nodeset)" ]; then
MASTER_NODE=`echo ${PIKA_JOB_NODELIST} | ${PYTHON_ROOT}/bin/nodeset -e | cut -d ' ' -f 1 | cut -d. -f1`
else
echo -e "Error: PIKA nodeset is NOT available!" >> $DEBUG_PATH 2>&1
if [ -x "$(command -v nodeset)" ]; then
echo "Try system default nodeset." >> $DEBUG_PATH 2>&1
save_pyhome=$PYTHONHOME
save_pypath=$PYTHONPATH
unset PYTHONHOME
unset PYTHONPATH
MASTER_NODE=`echo ${PIKA_JOB_NODELIST} | nodeset -e | cut -d ' ' -f 1 | cut -d. -f1`
PYTHONHOME=$save_pyhome
PYTHONPATH=$save_pypath
else
echo -e "Error: nodeset not available!" >> $DEBUG_PATH 2>&1
fi
fi
if [ "$MASTER_NODE" = "" ]; then
echo "PIKA_JOB_NODELIST=${PIKA_JOB_NODELIST}" >> $DEBUG_PATH 2>&1
fi
echo -e "\nMASTER_NODE=$MASTER_NODE" >> $DEBUG_PATH 2>&1
# this node's name
PIKA_HOSTNAME=$(hostname | cut -d. -f1)
echo "PIKA_HOSTNAME=$PIKA_HOSTNAME" >> $DEBUG_PATH 2>&1
# (5) Update job metadata
epilog_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
source ${epilog_dir}/pika_update_metadata_epilog_include.sh >> $DEBUG_PATH 2>&1
# (6) Set LIKWID counters if LIKWID is used with direct access mode
if [ "${PIKA_LIKWID_MODE}" = "direct" ]; then
# Reset counters for exclusive jobs AND monitoring enabled
if [ $PIKA_MONITORING -eq 1 ] && [ $PIKA_JOB_EXCLUSIVE -eq 1 ]; then
echo "PUTNOTIF severity=okay time=$(date +%s) plugin=likwid message=rstCtrs" | nc -U ${PIKA_COLLECTD_SOCKET}
fi
fi
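# Note: the PUTNOTIF line above talks to collectd's unixsock plugin, which accepts plain-text
# commands (PUTVAL, PUTNOTIF, FLUSH, ...) on ${PIKA_COLLECTD_SOCKET}; the likwid plugin
# presumably interprets the "rstCtrs" notification message as a request to reset its counters.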
# (7) Cleanup local data
rm -f ${BATCHSYSTEM_ENV_FILE}
echo -e "\nEpilog finished sucessfully!" >> $DEBUG_PATH 2>&1
exit 0
#!/bin/bash
DAEMON="${PIKA_INSTALL_PATH}/collectd/${COLLECTD_VERSION}/sbin/collectd -C ${PIKA_COLLECTD_CONF} -P ${PIKA_COLLECTD_PID_FILE}"
# check if any jobs are already running on the local node
if [ ${LOCAL_JOBS_RUNNING} -eq 0 ]; then
echo -e "\nNo jobs are currently running on this node." >> $DEBUG_PATH 2>&1
fi
# we stop collectd if:
# the user requested to disable monitoring (PIKA_MONITORING=0) OR
# the monitoring flag could not be determined (PIKA_MONITORING=-1) and no other job is running on the local node
if [[ $PIKA_MONITORING -eq 0 || ($PIKA_MONITORING -eq -1 && ${LOCAL_JOBS_RUNNING} -eq 0) ]]; then
echo -e "\nExclusive and no_monitoring set! Stop collectd." >> $DEBUG_PATH 2>&1
# get collectd pid and kill the process
COLLECTD_PID=`ps -eo pid,cmd | grep -v grep | grep "$DAEMON" | awk '{print $1}'`
echo -e "\nCOLLECTD_PID=$COLLECTD_PID" >> $DEBUG_PATH 2>&1
# check if COLLECTD_PID is empty
if [ -z "$COLLECTD_PID" ]; then
echo -e "\ncollectd is not running." >> $DEBUG_PATH 2>&1
else
echo -e "\nkill -TERM $COLLECTD_PID" >> $DEBUG_PATH 2>&1
kill -TERM $COLLECTD_PID >> $DEBUG_PATH 2>&1 # flushes metric buffer? send extra signal?
sleep 1
# make sure collectd gets killed
COLLECTD_PID=`ps -eo pid,cmd | grep -v grep | grep "$DAEMON" | awk '{print $1}'`
if [ -n "$COLLECTD_PID" ]; then
echo -e "\nkill -KILL $COLLECTD_PID" >> $DEBUG_PATH 2>&1
kill -KILL $COLLECTD_PID >> $DEBUG_PATH 2>&1
fi
fi
fi
##### only one prolog should care about getting collectd to start ####
# try to access the lock
#exec {lock_fd}>${LOCAL_STORE}/pika_collectd_lock || exit 1
#flock -n "$lock_fd"
#trap "rm -f ${LOCAL_STORE}/pika_collectd_lock" 0 #make sure that the lock is removed on exit
mkdir $lock_collectd 2> /dev/null
if [ $? == 0 ] || [ "$have_setup_lock" = true ]; then
trap "rm -rf ${lock_collectd}" QUIT TERM EXIT INT #make sure that the lock is removed on exit
# determine number of PIKA collectd processes
active_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "\nActive PIKA collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
# if we have more than one collectd process running, kill all but the last
if [ $active_procs -gt 1 ]; then
COLLECTD_PIDS=`ps -eo pid,cmd | grep -v grep | grep "pika_collectd" | awk '{print $1}'`
KILL_COLLECTD_PIDS=`echo $COLLECTD_PIDS | sed s/'\w*$'//` # strip the last word/PID (keep the newest collectd process running)
echo -e "\nError: More than one PIKA collectd active: ${COLLECTD_PIDS}. kill -TERM ${KILL_COLLECTD_PIDS}" >> $DEBUG_PATH 2>&1
kill -TERM ${KILL_COLLECTD_PIDS} >> $DEBUG_PATH 2>&1
fi
# we only check for collectd if $PIKA_MONITORING=1
if [ "$PIKA_MONITORING" -eq 1 ]; then
# if collectd is not running yet
if [ "$active_procs" -eq 0 ]; then
source ${prolog_dir}/pika_start_collectd.sh >> $DEBUG_PATH 2>&1
fi
fi
rm -rf ${lock_collectd} # remove the lock
fi
#flock -u "$lock_fd"
# end of locked region
#!/bin/bash
source /sw/taurus/tools/pika/pika-current.conf
DEBUG_PATH=/tmp/pika_control.out
function pika_clean() {
rm -rf /tmp/pika_*
rm -rf /opt/pika/*
}
function pika_install() {
mkdir -p ${PIKA_INSTALL_PATH}
echo -e "tar xzf ${PIKA_PACKAGE_PATH} -C ${PIKA_INSTALL_PATH}/.."
tar xzf ${PIKA_PACKAGE_PATH} -C ${PIKA_INSTALL_PATH}/..
}
function pika_start() {
DAEMON="${PIKA_INSTALL_PATH}/collectd/${COLLECTD_VERSION}/sbin/collectd -C ${PIKA_COLLECTD_CONF} -P ${PIKA_COLLECTD_PID_FILE}"
touch $DEBUG_PATH
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_start_collectd.sh
cat $DEBUG_PATH
rm -f $DEBUG_PATH
}
function pika_stop() {
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_stop_collectd.sh
}
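# Usage sketch (hypothetical invocation on a compute node, run with sufficient privileges
# to manage /opt/pika and the collectd daemon):
#   ./pika_control install   # stop collectd, wipe /opt/pika and /tmp/pika_*, unpack the PIKA package
#   ./pika_control start     # install first if needed, then start collectd
#   ./pika_control stop      # stop collectd
#   ./pika_control purge     # stop collectd and remove all local PIKA files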
if [ ! $# -eq 1 ]; then
echo "One argument required!"
echo "./pika_control [purge|install|start|stop]"
else
case "$1" in
purge)
pika_stop
pika_clean
;;
install)
pika_stop
pika_clean
pika_install
;;
start)
pika_stop
if [ ! -d "${PIKA_INSTALL_PATH}" ]; then
pika_install
fi
pika_start
;;
stop)
pika_stop
;;
*)
echo "$1 is not supported."
;;
esac
fi
#!/bin/bash
# check if user wants to disable collectd monitoring
# PIKA_MONITORING has three values:
# 1 -> monitoring enabled
# 0 -> request to disable monitoring in exclusive job
# -1 -> redis server down or error in python script
# master node retrieves additional job information
if [ "${PIKA_HOSTNAME}" = "${MASTER_NODE}" ]; then
echo -e "\nGet job meta data (master node)" >> $DEBUG_PATH 2>&1
PIKA_MONITORING=`python3 ${prolog_dir}/pika_slurm_env_redis.py --jobid=${PIKA_JOB_ID} --env_file=${BATCHSYSTEM_ENV_FILE} --force 2>&1`
else
PIKA_MONITORING=`python3 ${prolog_dir}/pika_slurm_env_redis.py --jobid=${PIKA_JOB_ID} 2>&1`
fi
# check if variable PIKA_MONITORING contains error output
if [[ $PIKA_MONITORING != '0' && $PIKA_MONITORING != '1' && $PIKA_MONITORING != '-1' ]]; then
echo -e "\nError: $PIKA_MONITORING" >> $DEBUG_PATH 2>&1
echo -e "\nVariable PIKA_MONITORING is corrupt, set it to -1" >> $DEBUG_PATH 2>&1
PIKA_MONITORING=-1
fi
echo -e "\nPIKA_MONITORING=$PIKA_MONITORING" >> $DEBUG_PATH 2>&1
# write monitoring flag into file for master node
if [ "${PIKA_HOSTNAME}" = "${MASTER_NODE}" ]; then
echo $PIKA_MONITORING > ${LOCAL_STORE}/pika_monitoring_${PIKA_JOB_ID}
fi
#!/bin/bash
# install the PIKA software stack (Python, LIKWID, collectd) into /opt/pika if it is not already installed
if [ ! -d "${PIKA_INSTALL_PATH}" ]; then
echo -e "\n${PIKA_INSTALL_PATH} does not exist" >> $DEBUG_PATH 2>&1
# create a simple lock via mkdir
mkdir $lock_collectd 2> /dev/null #returns 1, if directory already exists
# check if directory could be successfully created
# ensures that only one prolog instance is installing PIKA
if [ $? == 0 ]; then
trap "rm -rf ${lock_collectd}" QUIT TERM EXIT INT # ensure that lock is released on exit
have_setup_lock=true
echo -e "\nInstall/Replace PIKA software stack to ${PIKA_INSTALL_PATH}" >> $DEBUG_PATH 2>&1
# check if an old collectd daemon is still running, if so kill it
echo -e "\nCheck if an old PIKA Collectd is still running" >> $DEBUG_PATH 2>&1
DAEMON="pika_collectd"
COLLECTD_PID=`ps -eo pid,cmd | grep -v grep | grep "$DAEMON" | awk '{print $1}'`
if [ -z "$COLLECTD_PID" ]; then
echo -e "Collectd is not running." >> $DEBUG_PATH 2>&1
else
echo -e "\nTerminate old PIKA Collectd processes: $COLLECTD_PID" >> $DEBUG_PATH 2>&1
kill -TERM $COLLECTD_PID
sleep 1 # give the terminate a chance
# make sure that collectd processes are killed
old_pika_collectd_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
if [ $old_pika_collectd_procs -gt 0 ]; then
echo -e "\nkill -KILL $COLLECTD_PID" >> $DEBUG_PATH 2>&1
kill -KILL $COLLECTD_PID >> $DEBUG_PATH 2>&1
fi
# check if the hard kill worked
old_pika_collectd_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
if [ $old_pika_collectd_procs -gt 0 ]; then
echo -e "\nError: Could not terminate old PIKA Collectd processes. ${old_pika_collectd_procs} are still running." >> $DEBUG_PATH 2>&1
fi
fi
# delete old PIKA installation, except for the new version
#cd ${PIKA_INSTALL_PATH}/..
#old_pika_count=`ls | wc -l`
#if [ $old_pika_count -gt 1 ]; then
# echo -e "\nDelete $old_pika_count old /opt/pika installations." >> $DEBUG_PATH 2>&1
# rm -rf $(ls | grep -v ${PIKA_VERSION})
#fi
if [ -d "/opt/pika" ]; then
echo -e "rm -rf /opt/pika" >> $DEBUG_PATH 2>&1
rm -rf /opt/pika
fi
# temporary: delete old prope installations
if [ -d "/opt/prope" ]; then
echo -e "rm -rf /opt/prope" >> $DEBUG_PATH 2>&1
rm -rf /opt/prope
echo -e "rm -f /tmp/diamond.*" >> $DEBUG_PATH 2>&1
rm -f /tmp/diamond.*
echo -e "rm -rf /tmp/prope_debug" >> $DEBUG_PATH 2>&1
rm -rf /tmp/prope_debug
fi
# create the install directory
mkdir -p ${PIKA_INSTALL_PATH}
echo -e "tar xzf ${PIKA_PACKAGE_PATH} -C ${PIKA_INSTALL_PATH}/.." >> $DEBUG_PATH 2>&1
tar xzf ${PIKA_PACKAGE_PATH} -C ${PIKA_INSTALL_PATH}/.. >> $DEBUG_PATH 2>&1
# unlock (collectd is ready to be started)
# rm -rf ${lock_collectd}
fi
fi
#!/bin/bash
if [ "${PIKA_HOSTNAME}" = "${MASTER_NODE}" ]; then
# get utility functions
source ${PIKA_ROOT}/pika_utils.sh >> $DEBUG_PATH 2>&1
LOCAL_TIME_STORE=${LOCAL_STORE}/pika_local_time_${PIKA_JOB_ID} # store
# save local start time
JOB_START=`date +%s`
# store local start time into file
mkdir -p ${LOCAL_TIME_STORE}
echo "${JOB_START}" > ${LOCAL_TIME_STORE}/START_${PIKA_HOSTNAME}
echo -e "\nJOB_START=${JOB_START}" >> $DEBUG_PATH 2>&1
# if Redis script worked overwrite metadata
if [[ $PIKA_MONITORING -ge 0 ]]; then
source ${BATCHSYSTEM_ENV_FILE} >> $DEBUG_PATH 2>&1
fi
#echo -e "\nenv | grep \"PIKA_\"" >> $DEBUG_PATH 2>&1
env | grep "PIKA_" >> $DEBUG_PATH 2>&1
# use start time from redis if available
if [ "${PIKA_JOB_START}" -gt "0" ]; then
JOB_START=${PIKA_JOB_START}
fi
# save all metadata in job table
echo -e "\nGet number of nodes" >> $DEBUG_PATH 2>&1
JOB_NUM_NODES=$(nodeset -c ${PIKA_JOB_NODELIST} 2>> $DEBUG_PATH)
if [ "$JOB_NUM_NODES" == "" ]; then
echo -e "\nCould not determine number of nodes. Set to 0." >> $DEBUG_PATH 2>&1
JOB_NUM_NODES=0
else
echo "JOB_NUM_NODES=$JOB_NUM_NODES" >> $DEBUG_PATH 2>&1
fi
#echo -e "\nCheck for GPUs" >> $DEBUG_PATH 2>&1
if [ -z "${SLURM_JOB_GPUS}" ]; then
echo -e "\nNo GPUs on this node" >> $DEBUG_PATH 2>&1
SLURM_JOB_GPUS=""
else
echo -e "\nSLURM_JOB_GPUS=$SLURM_JOB_GPUS" >> $DEBUG_PATH 2>&1
fi
# check if job is part of an array job
JOB_ARRAY_ID=0
if [ "${PIKA_JOB_ARRAY_ID}" != "None" ]; then
JOB_ARRAY_ID=${PIKA_JOB_ARRAY_ID}
fi
# check if job name is too long (maximum length is 256)
chrlen=${#PIKA_JOB_NAME}
echo "Length of job name: $chrlen" >> $DEBUG_PATH 2>&1
if [ $chrlen -gt 252 ]; then
echo "Truncate job name string..." >> $DEBUG_PATH 2>&1
PIKA_JOB_NAME="${PIKA_JOB_NAME:0:252}..."
fi
# for exclusive jobs spanning more than one node, store the full node list in the CPU list so that jobs can be searched by node
if [ $PIKA_JOB_EXCLUSIVE -eq 1 ] && [ $JOB_NUM_NODES -gt 1 ]; then
PIKA_JOB_CPUS_ALLOCATED=`echo ${PIKA_JOB_NODELIST} | nodeset -e`
fi
# create sql statement
SQL_QUERY="INSERT INTO Job_Data "
SQL_QUERY+="(JID,USER,PROJECT,STATUS,NUM_NODES,NODELIST,CPULIST,NUM_CORES,START,NAME,WALLTIME,P_PARTITION,EXCLUSIVE,ARRAY_ID) "
SQL_QUERY+="VALUES ('${PIKA_JOB_ID}','${PIKA_JOB_USER}','${PIKA_JOB_ACCOUNT}','running','${JOB_NUM_NODES}','${PIKA_JOB_NODELIST}','${PIKA_JOB_CPUS_ALLOCATED}','${PIKA_JOB_NUM_CORES}','${JOB_START}','${PIKA_JOB_NAME}','${PIKA_JOB_WALLTIME}','${PIKA_JOB_PARTITION}','${PIKA_JOB_EXCLUSIVE}','${JOB_ARRAY_ID}')"
# check if mysql is installed
MYSQL_CHECK=`command -v mysql`
if [ -z "${MYSQL_CHECK}" ]; then
echo -e "\nMYSQL client not found! Cannot write into database!" >> $DEBUG_PATH 2>&1
else
mysql_command "${SQL_QUERY}"
fi
fi
#!/bin/bash
export PIKA_JOB_START=0 #`date +%s`
export PIKA_JOB_ID=${SLURM_JOB_ID}
export PIKA_JOB_NODELIST=${SLURM_NODELIST}
export PIKA_JOB_USER=${SLURM_JOB_USER}
export PIKA_JOB_EXCLUSIVE=0
export PIKA_JOB_PARTITION=${SLURM_JOB_PARTITION}
export PIKA_JOB_NAME='n/a'
export PIKA_JOB_WALLTIME=0
export PIKA_JOB_CPUS_ALLOCATED='n/a'
export PIKA_JOB_ARRAY_ID='None'
export PIKA_JOB_ACCOUNT='n/a'
export PIKA_JOB_NUM_CORES=0
#!/usr/bin/env python
import redis
import _pickle as cPickle
import sys
import ast
import os
import time
import pwd
import argparse
from time import sleep
from itertools import count, groupby
def list_to_ranges(L):
G=(list(x) for _,x in groupby(L, lambda x,c=count(): next(c)-x))
return ",".join("-".join(map(str,(g[0],g[-1])[:len(g)])) for g in G)
def main(job_id, debug_path, env_file, force):
debug_file = None
if debug_path:
debug_file_path = debug_path + "/memcache_" + str(job_id)
debug_file = open(debug_file_path,'w')
debug_file.write("debug before: {0} {1}\n".format(job_id, time.time()))
redis_host = os.environ.get('REDIS_HOST')
if redis_host is None:
if debug_file:
debug_file.write("REDIS_HOST is is not set!\n")
sys.exit("REDIS_HOST is is not set!")
connection = redis.StrictRedis(host=redis_host, port=6379, socket_timeout=10, socket_connect_timeout=10)
slurm_env_string = None
haveConnectionError = False
try:
slurm_env_string = connection.get("prope_" + str(job_id))
except: # redis.exceptions.TimeoutError:
haveConnectionError = True
t = 0
while slurm_env_string == None and t < 10:
try:
slurm_env_string = connection.get("prope_" + str(job_id))
except: # redis.exceptions.TimeoutError:
haveConnectionError = True
# no 'continue' here: always sleep and count the attempt, otherwise a persistent connection error would retry forever
sleep(1)
t = t + 1
#pprint(slurm_env_string)
if debug_file:
if haveConnectionError:
debug_file.write("Redis connection error ocurred!\n")
debug_file.write("Connection attemps = {:d}\n".format(t))
debug_file.write("debug after: {0} {1}\n".format(job_id, time.time()))
#f.close()
# decode the job info
slurm_env = {}
if slurm_env_string and len(str(slurm_env_string)) > 0:
try:
slurm_env = cPickle.loads(slurm_env_string)
#print(slurm_env)
except:
slurm_env = {}
if debug_path and debug_file:
if slurm_env:
debug_file.write(str(slurm_env))
else:
debug_file.write(str(slurm_env_string))
# check if this job is exclusive and no monitoring is requested
monitoring_on = -1
if slurm_env:
nodes_shared = str(slurm_env['shared'])
if nodes_shared == "OK":
monitoring_on = 1
else:
#check no_monitoring comment
slurm_comment = str(slurm_env['comment'])
if 'no_monitoring' in slurm_comment:
monitoring_on = 0
else:
monitoring_on = 1
# if env file is given and (forced or job should be monitored)
if env_file and (force or monitoring_on != 0):
save_job_env(env_file, slurm_env, connection, debug_file)
# print result as return value of the script
print(monitoring_on)
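# Note: this print is the script's "return value" -- the prolog/epilog shell code captures
# stdout via command substitution (PIKA_MONITORING=`python3 ...`), which is why any
# unexpected output (e.g. a traceback) is treated there as a corrupt value and mapped to -1.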
def save_job_env(env_file, slurm_env, connection, debug_file):
# set default values
start_time = 0
work_dir = "'n/a'"
exclusive = 0
partition_name = "'n/a'"
job_name = "'n/a'"
walltime = 0
walltime_formatted = "'n/a'"
cpu_allocated = "'n/a'"
job_array_id = "None"
account = "'n/a'"
num_cores = 0
job_user = None
if "SLURM_JOB_USER" in os.environ:
job_user = os.environ["SLURM_JOB_USER"]
# get selected job information
if slurm_env:
start_time = str(slurm_env['start_time'])
partition_name = str(slurm_env['partition'])
nodes_shared = str(slurm_env['shared'])
work_dir = str(slurm_env['work_dir'])
if nodes_shared == "OK":
total_cpus_allocated = int(slurm_env['num_cpus'])
node_count = int(slurm_env['num_nodes'])
#print "Shared: " + str(total_cpus_allocated / node_count) + " cpus per node on partition " + partition_name
# get partition data
partition_data_string = connection.get(partition_name)
partition_data = cPickle.loads(partition_data_string)
#partition_data = ast.literal_eval(str(partition_data_string))
try:
cpus_avail = int(partition_data['max_cpus_per_node'])
#print(str(partition_data))
except:
cpus_avail = -1
#print(str(partition_data))
if (total_cpus_allocated / node_count) == cpus_avail:
#print "Exclusive with " + str(total_cpus_allocated / node_count) + " cpus per node on partition " + partition_name
exclusive = 1
else:
#print "Exclusive flag set by user"
exclusive = 1
cpu_allocated = ''
if exclusive == 0:
for key, value in slurm_env['cpus_alloc_layout'].items():
cpu_allocated += str(key) + str('[') + str(list_to_ranges(value)) + str('],')
#remove last comma from string
cpu_allocated = cpu_allocated[:-1]
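# Illustrative result for a hypothetical allocation layout: "taurusi7001[0-11],taurusi7002[0-5,12-17]"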
try:
job_name = str(slurm_env['name'])
except:
job_name = 'corrupt'
#determine job user
if not job_user:
job_user = pwd.getpwuid(slurm_env['user_id']).pw_name
try:
walltime = slurm_env['time_limit']
except:
walltime = 0
#convert walltime from minutes to seconds
walltime *= 60
walltime_formatted = slurm_env['time_limit_str']
job_array_id = slurm_env['array_job_id']
#determine account
account = str(slurm_env['account'])
#determine number of cores
num_cores = str(slurm_env['num_cpus'])
if env_file:
f = open(env_file,'w')
print( "#!/bin/bash", file=f )
print( "export PIKA_JOB_START=" + str(start_time), file=f )
print( "export PIKA_JOB_USER=" + str(job_user), file=f )
print( "export PIKA_JOB_EXCLUSIVE=" + str(exclusive), file=f )
print( "export PIKA_JOB_PARTITION=" + partition_name, file=f )
print( "export PIKA_JOB_NAME='" + job_name + "'", file=f )
print( "export PIKA_JOB_WALLTIME=" + str(walltime), file=f )
print( "export PIKA_JOB_WALLTIME_FORMATTED=" + str(walltime_formatted), file=f )
print( "export PIKA_JOB_CPUS_ALLOCATED=" + str(cpu_allocated), file=f )
print( "export PIKA_JOB_ARRAY_ID=" + str(job_array_id), file=f )
print( "export PIKA_JOB_ACCOUNT=" + str(account), file=f )
print( "export PIKA_JOB_NUM_CORES=" + str(num_cores), file=f )
print( "export PIKA_JOB_WORK_DIR=" + str(work_dir), file=f )
f.close()
if debug_file:
debug_file.write("\nReservation: " + str(exclusive))
debug_file.write("\nPartition: " + partition_name)
debug_file.write("\nJob Name: " + str(slurm_env['name']))
debug_file.write("\nWalltime: " + str(slurm_env['time_limit']))
#debug_file.write("\n" + str(cpu_allocated))
debug_file.write("\n\n")
debug_file.close()
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--jobid', type=int, required=True)
parser.add_argument('--debug_path', type=str, required=False)
parser.add_argument('--env_file', type=str, required=False)
parser.add_argument('--force', action="store_true", default=False, help='force request for job meta data')
return parser.parse_args()
if __name__ == '__main__':
args = parse_args()
main(job_id=args.jobid,
debug_path=args.debug_path,
env_file=args.env_file,
force=args.force)
#!/bin/bash
# requires /sw/taurus/tools/pika/pika-current.conf to have been sourced beforehand
#source /sw/taurus/tools/pika/pika-current.conf
CD_INST_PATH=$PIKA_INSTALL_PATH/collectd/$COLLECTD_VERSION
CUSTOM_TYPES_DIR=$PIKA_INSTALL_PATH/collectd/$COLLECTD_VERSION/share/collectd
COLLECTD_PYTHON_PLUGIN_PATH=$PIKA_ROOT/daemon/collectd/collectd-plugins/python