...
 
Commits (6)
......@@ -2,15 +2,21 @@
source ../pika_install.conf
# Default to the final installation prefix; when not running as root
# (effective UID != 0) use the build prefix instead.
install_path=${PIKA_INSTALL_PATH}
if [[ "$(id -u)" -ne 0 ]]; then
  install_path=${PIKA_BUILD_PATH}
fi
# set collectd install path
COLLECTD_INST_PATH=$PIKA_INSTALL_PATH/collectd/${COLLECTD_VERSION}
PYTHON_ROOT=$PIKA_INSTALL_PATH/python/$PYTHON_VERSION
COLLECTD_INST_PATH=$install_path/collectd/${COLLECTD_VERSION}
PYTHON_ROOT=$install_path/python/$PYTHON_VERSION
# expose Python3 path
export PATH=$PYTHON_ROOT/bin:$PATH
mkdir -p $PIKA_INSTALL_PATH/../sources
cd $PIKA_INSTALL_PATH/../sources
mkdir -p $install_path/../sources
cd $install_path/../sources
# download, unpack, remove source package
if [ $COLLECTD_VERSION == 'GIT' ]; then
......@@ -69,7 +75,7 @@ export COLLECTD_ROOT=${COLLECTD_INST_PATH}
#export LIKWID_ROOT=$PIKA_ROOT/sw/pika/$PIKA_VERSION/likwid/$LIKWID_VERSION
export LIKWID_ROOT=${COLLECTD_INST_PATH}/../../likwid/$LIKWID_VERSION
cd ${PIKA_ROOT}/daemon/collectd/collectd-plugins/c
make
make likwid
# copy custom types into collectd installation
cp $PIKA_ROOT/daemon/collectd/custom_types.db ${COLLECTD_INST_PATH}/share/collectd/
......@@ -2,6 +2,12 @@
source ../pika_install.conf
# Default to the final installation prefix; when not running as root
# (effective UID != 0) use the build prefix instead.
install_path=${PIKA_INSTALL_PATH}
if [[ "$(id -u)" -ne 0 ]]; then
  install_path=${PIKA_BUILD_PATH}
fi
# set compiler
if [ -n "${PIKA_TARGET}" ] && [ ${PIKA_TARGET} == 'power' ]; then
COMPILER=GCCPOWER
......@@ -9,11 +15,11 @@ else
COMPILER=GCC #GCCPOWER for IBM Power systems
fi
mkdir -p $PIKA_INSTALL_PATH/../sources
cd $PIKA_INSTALL_PATH/../sources
mkdir -p $install_path/../sources
cd $install_path/../sources
PYTHON_ROOT=$PIKA_INSTALL_PATH/python/${PYTHON_VERSION}
LIKWID_INST_PATH=$PIKA_INSTALL_PATH/likwid/${LIKWID_VERSION}
PYTHON_ROOT=$install_path/python/${PYTHON_VERSION}
LIKWID_INST_PATH=$install_path/likwid/${LIKWID_VERSION}
### Build Likwid
......@@ -65,8 +71,8 @@ cp config.mk config.mk.backup
sed -i "/^PREFIX .*/ s|.*|PREFIX = $LIKWID_INST_PATH|" config.mk
# set access mode
sed -i "/^ACCESSMODE = .*/ s|.*|ACCESSMODE = direct|" config.mk
#sed -i "/^ACCESSMODE = .*/ s|.*|ACCESSMODE = perf_event|" config.mk
#sed -i "/^ACCESSMODE = .*/ s|.*|ACCESSMODE = direct|" config.mk
sed -i "/^ACCESSMODE = .*/ s|.*|ACCESSMODE = perf_event|" config.mk
# do not build access daemon or frequency changer
sed -i "/^BUILDDAEMON = .*/ s|.*|BUILDDAEMON = false|" config.mk
......
......@@ -3,16 +3,31 @@
source ../pika_install.conf
#delete old installation
rm -rf /opt/pika/
# Delete the previous installation.
# Non-root: remove the sources directory and the ${PIKA_VERSION} directory
# that sit next to PIKA_BUILD_PATH. Root: remove /opt/pika/.
if [ "$(id -u)" -ne 0 ]; then
  if [ -n "$PIKA_BUILD_PATH" ] && [ -d "$PIKA_BUILD_PATH" ]; then
    rm -rf -- "$PIKA_BUILD_PATH/../sources"
    # Address the version directory by its full path instead of cd'ing first:
    # a failed cd would otherwise make rm -rf act on the current directory.
    # An empty PIKA_VERSION must never expand to the parent directory itself.
    if [ -n "$PIKA_VERSION" ]; then
      rm -rf -- "$PIKA_BUILD_PATH/../$PIKA_VERSION"
    fi
  else
    echo "Error with build path $PIKA_BUILD_PATH" >&2
  fi
else
  rm -rf /opt/pika/
fi
./install_python3.sh
./install_likwid.sh
./install_python3.sh 2>&1 | tee python_install.log
./install_likwid.sh 2>&1 | tee likwid_install.log
# collectd requires likwid and python
./install_collectd.sh
./install_collectd.sh 2>&1 | tee collectd_install.log
# go to PIKA install root folder
cd $PIKA_INSTALL_PATH/..
# Change into the parent of the build prefix (non-root) or of the install
# prefix (root). Abort if the directory cannot be entered, so that the
# following packaging step never runs from an unexpected working directory.
if [ "$(id -u)" -ne 0 ]; then
  cd "$PIKA_BUILD_PATH/.." || exit 1
else
  cd "$PIKA_INSTALL_PATH/.." || exit 1
fi
#create tarball in /sw/taurus/tools/pika/archives
tar czf ${PIKA_PACKAGE_PATH} ${PIKA_VERSION}
......@@ -5,10 +5,16 @@
source ../pika_install.conf
mkdir -p $PIKA_INSTALL_PATH/../sources
cd $PIKA_INSTALL_PATH/../sources
install_path=$PIKA_INSTALL_PATH
DEST_INST=${PIKA_INSTALL_PATH}/python/${PYTHON_VERSION}
# When not running as root (effective UID != 0), use the build prefix.
if [[ "$(id -u)" -ne 0 ]]; then
  install_path=${PIKA_BUILD_PATH}
fi
mkdir -p $install_path/../sources
cd $install_path/../sources
DEST_INST=${install_path}/python/${PYTHON_VERSION}
# download python
wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tar.xz
......@@ -28,11 +34,15 @@ rm -rf Python-${PYTHON_VERSION}.tar.xz
export PATH=${DEST_INST}/bin:$PATH
export LD_LIBRARY_PATH=${DEST_INST}/lib:${DEST_INST}/lib/python3.7:${DEST_INST}/lib/python3.6:$LD_LIBRARY_PATH
PYTHONHOME=${DEST_INST}
PYTHONROOT=${DEST_INST}
pip3 install --upgrade pip
# install influxdb and mysql client
pip3 install influxdb
pip3 install mysql-connector #==2.1.4
#pip3 install mysql-connector #==2.1.4
pip3 install PyMySQL
#pip3 install nvidia-ml-py
pip3 install ClusterShell
#pip install python-memcached
......
......@@ -9,7 +9,7 @@ COMPUTE_NODES=$(sinfo -o %N --noheader)
clush -t 30 -B -u 30 -w $COMPUTE_NODES "ls /opt/slurm/prolog.d/07_pika;ls /opt/slurm/epilog.d/03_pika"
#purge pika on all compute nodes
clush -t 30 -B -u 30 -w $COMPUTE_NODES "sudo /sw/taurus/tools/pika/job_control/slurm/taurus/pika_control.sh purge"
clush -t 30 -B -u 90 -w $COMPUTE_NODES "sudo /sw/taurus/tools/pika/job_control/slurm/taurus/pika_control.sh purge" 2>&1 | tee pika_purge.txt
#install pika on all compute nodes
clush -t 30 -B -u 30 -w $COMPUTE_NODES "sudo /sw/taurus/tools/pika/job_control/slurm/taurus/pika_control.sh install"
clush -t 30 -B -u 90 -w $COMPUTE_NODES "sudo /sw/taurus/tools/pika/job_control/slurm/taurus/pika_control.sh install" 2>&1 | tee pika_install.txt
#!/bin/bash
if [ "$SLURM_JOB_USER" != "rdietric" ] && [ "$SLURM_JOB_USER" != "fwinkler" ]; then
#if [ "$SLURM_JOB_USER" != "rdietric" ] && [ "$SLURM_JOB_USER" != "fwinkler" ]; then
# exit 0
#fi
# Exit successfully right away on hosts whose name starts with "taurusi7".
case $HOSTNAME in
  taurusi7*) exit 0 ;;
esac
......@@ -29,18 +33,38 @@ fi
BATCHSYSTEM_ENV_FILE=${LOCAL_STORE}/pika_batchsystem_env_${PIKA_JOB_ID}
# determine master node
MASTER_NODE=`echo ${PIKA_JOB_NODELIST} | nodeset -e | cut -d ' ' -f 1 | cut -d. -f1`
# Determine the master node: expand PIKA_JOB_NODELIST with nodeset, take the
# first expanded name and strip everything from the first dot onwards.
# The nodeset shipped under PYTHON_ROOT is preferred; the one found on PATH
# is the fallback.
if [ -x "$(command -v "${PYTHON_ROOT}/bin/nodeset")" ]; then
  MASTER_NODE=$(echo "${PIKA_JOB_NODELIST}" | "${PYTHON_ROOT}/bin/nodeset" -e | cut -d ' ' -f 1 | cut -d. -f1)
else
  echo -e "Error: PIKA nodeset is NOT available!" >> "$DEBUG_PATH" 2>&1
  if [ -x "$(command -v nodeset)" ]; then
    echo "Try system default nodeset." >> "$DEBUG_PATH" 2>&1
    # Run the fallback nodeset without PYTHONHOME/PYTHONPATH. The unset
    # happens inside the command substitution (a subshell), so both variables
    # keep their values and export state in this shell; no manual
    # save/restore is required.
    MASTER_NODE=$(
      unset PYTHONHOME PYTHONPATH
      echo "${PIKA_JOB_NODELIST}" | nodeset -e | cut -d ' ' -f 1 | cut -d. -f1
    )
  else
    echo -e "Error: nodeset not available!" >> "$DEBUG_PATH" 2>&1
  fi
fi
if [ "$MASTER_NODE" = "" ]; then
echo "PIKA_JOB_NODELIST=${PIKA_JOB_NODELIST}" >> $DEBUG_PATH 2>&1
fi
echo -e "\nMASTER_NODE=$MASTER_NODE" >> $DEBUG_PATH 2>&1
# this node's name
PIKA_HOSTNAME=$(hostname | cut -d. -f1)
echo "PIKA_HOSTNAME=$PIKA_HOSTNAME" >> $DEBUG_PATH 2>&1
# update job metadata
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_update_metadata_epilog_include.sh >> $DEBUG_PATH 2>&1
# Reset counters for exclusive jobs AND monitoring enabled
if [ $PIKA_MONITORING -eq 1 ] && [ $PIKA_JOB_EXCLUSIVE -eq 1 ]; then
echo "PUTNOTIF severity=okay time=$(date +%s) plugin=likwid message=rstCtrs" | nc -U ${PIKA_COLLECTD_SOCKET}
fi
# Reset counters for exclusive jobs AND monitoring enabled (needed for LIKWID direct MSR access)
#if [ $PIKA_MONITORING -eq 1 ] && [ $PIKA_JOB_EXCLUSIVE -eq 1 ]; then
# echo "PUTNOTIF severity=okay time=$(date +%s) plugin=likwid message=rstCtrs" | nc -U ${PIKA_COLLECTD_SOCKET}
#fi
# cleanup local data
rm -f ${BATCHSYSTEM_ENV_FILE}
......
......@@ -7,7 +7,7 @@
# -1 -> redis server down or error in python script
# master node retrieves additional job information
if [[ ${PIKA_HOSTNAME} = *"${MASTER_NODE}"* ]]; then
if [ "${PIKA_HOSTNAME}" = "${MASTER_NODE}" ]; then
echo -e "\nGet job meta data (master node)" >> $DEBUG_PATH 2>&1
PIKA_MONITORING=`python3 ${PIKA_ROOT}/job_control/slurm/taurus/pika_slurm_env_redis.py --jobid=${PIKA_JOB_ID} --env_file=${BATCHSYSTEM_ENV_FILE} --force 2>&1`
else
......@@ -24,6 +24,6 @@ fi
echo -e "\nPIKA_MONITORING=$PIKA_MONITORING" >> $DEBUG_PATH 2>&1
# write monitoring flag into file for master node
if [[ ${PIKA_HOSTNAME} = *"${MASTER_NODE}"* ]]; then
if [ "${PIKA_HOSTNAME}" = "${MASTER_NODE}" ]; then
echo $PIKA_MONITORING > ${LOCAL_STORE}/pika_monitoring_${PIKA_JOB_ID}
fi
#!/bin/bash
if [ ${PIKA_HOSTNAME} = ${MASTER_NODE} ]; then
if [ "${PIKA_HOSTNAME}" = "${MASTER_NODE}" ]; then
# get utility functions
source ${PIKA_ROOT}/pika_utils.sh >> $DEBUG_PATH 2>&1
......@@ -38,12 +38,12 @@ if [ ${PIKA_HOSTNAME} = ${MASTER_NODE} ]; then
echo "JOB_NUM_NODES=$JOB_NUM_NODES" >> $DEBUG_PATH 2>&1
fi
echo -e "\nCheck for GPUs" >> $DEBUG_PATH 2>&1
#echo -e "\nCheck for GPUs" >> $DEBUG_PATH 2>&1
if [ -z "${SLURM_JOB_GPUS}" ]; then
echo "No GPUs on this node" >> $DEBUG_PATH 2>&1
echo -e "\nNo GPUs on this node" >> $DEBUG_PATH 2>&1
SLURM_JOB_GPUS=""
else
echo "SLURM_JOB_GPUS=$SLURM_JOB_GPUS" >> $DEBUG_PATH 2>&1
echo -e "\nSLURM_JOB_GPUS=$SLURM_JOB_GPUS" >> $DEBUG_PATH 2>&1
fi
# check if job is part of an array job
......
......@@ -28,18 +28,29 @@ def main(job_id, debug_path, env_file, force):
debug_file = open(debug_file_path,'w')
debug_file.write("debug before: {0} {1}\n".format(job_id, time.time()))
t = 0
# Fetch the value stored under the key "prope_<job_id>".
# One immediate attempt is made; while no value was obtained, up to 10
# retries follow, each preceded by a one-second pause.
# NOTE(review): `connection` is presumably a Redis client - confirm.
slurm_env_string = None
haveConnectionError = False
try:
    slurm_env_string = connection.get("prope_" + str(job_id))
except Exception:  # e.g. redis.exceptions.TimeoutError
    haveConnectionError = True

t = 0
while slurm_env_string is None and t < 10:
    # Count and pace every retry, including those that raise. Skipping the
    # counter on an exception would make a persistent connection error loop
    # forever without any delay.
    t = t + 1
    sleep(1)
    try:
        slurm_env_string = connection.get("prope_" + str(job_id))
    except Exception:  # e.g. redis.exceptions.TimeoutError
        haveConnectionError = True
#pprint(slurm_env_string)
if debug_path and debug_file:
time_attemps = "Time attemps = " + str(t) + str("\n")
debug_file.write(time_attemps)
if debug_file:
if haveConnectionError:
debug_file.write("Redis connection error ocurred!\n")
debug_file.write("Connection attemps = {:d}\n".format(t))
debug_file.write("debug after: {0} {1}\n".format(job_id, time.time()))
#f.close()
......
#!/bin/bash
if [ ${PIKA_HOSTNAME} = ${MASTER_NODE} ]; then
if [ "${PIKA_HOSTNAME}" = "${MASTER_NODE}" ]; then
# get utility functions
source ${PIKA_ROOT}/pika_utils.sh >> $DEBUG_PATH 2>&1
......@@ -10,8 +10,13 @@ if [ ${PIKA_HOSTNAME} = ${MASTER_NODE} ]; then
# get start time from prolog
LOCAL_TIME_STORE=${LOCAL_STORE}/pika_local_time_${PIKA_JOB_ID}
# Read the start time stored for this host, then remove the local time store.
# If no start-time file exists for this host, remove the store and exit 0.
# NOTE(review): if this file is sourced, `exit` also ends the sourcing
# script - confirm that this is intended.
if [ -d "${LOCAL_TIME_STORE}" ] && [ -f "${LOCAL_TIME_STORE}/START_${PIKA_HOSTNAME}" ]; then
  JOB_START=$(cat "${LOCAL_TIME_STORE}/START_${PIKA_HOSTNAME}")
  rm -rf -- "${LOCAL_TIME_STORE}"
else
  rm -rf -- "${LOCAL_TIME_STORE}"
  exit 0
fi
echo -e "\nJOB_START=${JOB_START}" >> $DEBUG_PATH 2>&1
# save local end time
......@@ -31,7 +36,11 @@ if [ ${PIKA_HOSTNAME} = ${MASTER_NODE} ]; then
# if Redis script worked overwrite metadata
if [ "${PIKA_MONITORING}" -ge 0 ]; then
if [ -f ${BATCHSYSTEM_ENV_FILE} ]; then
source ${BATCHSYSTEM_ENV_FILE} >> $DEBUG_PATH 2>&1
else
echo "${BATCHSYSTEM_ENV_FILE} does not exist!" >> $DEBUG_PATH 2>&1
fi
else
echo -e "\nNo job metadata from redis available." >> $DEBUG_PATH 2>&1
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_slurm_env.sh >> $DEBUG_PATH 2>&1
......
#!/bin/bash
if [ "$SLURM_JOB_USER" != "rdietric" ] && [ "$SLURM_JOB_USER" != "fwinkler" ]; then
#if [ "$SLURM_JOB_USER" != "rdietric" ] && [ "$SLURM_JOB_USER" != "fwinkler" ]; then
# exit 0
#fi
# Exit successfully right away on hosts whose name starts with "taurusi7".
case $HOSTNAME in
  taurusi7*) exit 0 ;;
esac
......@@ -38,6 +42,15 @@ else
DEBUG_PATH=/dev/null
fi
#### Developer Debugging ####
# Only for jobs of user "rdietric": prepend the LIKWID library directory of
# the build prefix to LD_LIBRARY_PATH and append the output of the `topo`
# helper, framed by marker lines, to the debug log.
if [[ ${SLURM_JOB_USER} == "rdietric" ]]; then
  echo -e "\n### $SLURM_JOB_USER ###" >> $DEBUG_PATH 2>&1
  LD_LIBRARY_PATH=${PIKA_BUILD_PATH}/likwid/${LIKWID_VERSION}/lib:$LD_LIBRARY_PATH
  export LD_LIBRARY_PATH
  /sw/taurus/tools/pika/daemon/collectd/collectd-plugins/c/topo >> $DEBUG_PATH 2>&1
  echo "### End $SLURM_JOB_USER ###" >> $DEBUG_PATH 2>&1
fi
#############################
# print date
date >> $DEBUG_PATH 2>&1
......@@ -70,21 +83,6 @@ if [ ! -f "/etc/cron.daily/pika_logrotate.sh" ]; then
cp ${PIKA_ROOT}/daemon/logrotate/pika_logrotate.sh /etc/cron.daily >> $DEBUG_PATH 2>&1
fi
# determine master node
MASTER_NODE=`echo ${PIKA_JOB_NODELIST} | nodeset -e | cut -d ' ' -f 1 | cut -d. -f1`
# this node's name
PIKA_HOSTNAME=$(hostname | cut -d. -f1)
echo -e "\nMASTER_NODE=$MASTER_NODE" >> $DEBUG_PATH 2>&1
echo -e "\nPIKA_HOSTNAME=$PIKA_HOSTNAME" >> $DEBUG_PATH 2>&1
# file where job information is stored (can be sourced later)
BATCHSYSTEM_ENV_FILE=${LOCAL_STORE}/pika_batchsystem_env_${PIKA_JOB_ID}
# set defaults for all pika metadata provided by SLURM
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_slurm_env.sh >> $DEBUG_PATH 2>&1
# check for Python installation
echo -e "\nCheck PIKA python3:" >> $DEBUG_PATH 2>&1
pika_python_bin=${PYTHON_ROOT}/bin/python3
......@@ -101,6 +99,35 @@ else
done
fi
# determine master node
# Determine the master node: expand PIKA_JOB_NODELIST with nodeset, take the
# first expanded name and strip everything from the first dot onwards.
# The nodeset shipped under PYTHON_ROOT is preferred; the one found on PATH
# is the fallback.
if [ -x "$(command -v "${PYTHON_ROOT}/bin/nodeset")" ]; then
  MASTER_NODE=$(echo "${PIKA_JOB_NODELIST}" | "${PYTHON_ROOT}/bin/nodeset" -e | cut -d ' ' -f 1 | cut -d. -f1)
else
  echo "Error: PIKA nodeset is NOT available!" >> "$DEBUG_PATH" 2>&1
  if [ -x "$(command -v nodeset)" ]; then
    echo "Try system default nodeset." >> "$DEBUG_PATH" 2>&1
    # Run the fallback nodeset without PYTHONHOME/PYTHONPATH. The unset
    # happens inside the command substitution (a subshell), so both variables
    # keep their values and export state in this shell; no manual
    # save/restore is required.
    MASTER_NODE=$(
      unset PYTHONHOME PYTHONPATH
      echo "${PIKA_JOB_NODELIST}" | nodeset -e | cut -d ' ' -f 1 | cut -d. -f1
    )
  else
    echo -e "Error: nodeset not available!" >> "$DEBUG_PATH" 2>&1
  fi
fi
echo -e "\nMASTER_NODE=$MASTER_NODE" >> $DEBUG_PATH 2>&1
# this node's name
PIKA_HOSTNAME=$(hostname | cut -d. -f1)
echo "PIKA_HOSTNAME=$PIKA_HOSTNAME" >> $DEBUG_PATH 2>&1
# file where job information is stored (can be sourced later)
BATCHSYSTEM_ENV_FILE=${LOCAL_STORE}/pika_batchsystem_env_${PIKA_JOB_ID}
# set defaults for all pika metadata provided by SLURM
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_slurm_env.sh >> $DEBUG_PATH 2>&1
##### (7) get additional job metadata from redis
PIKA_MONITORING=1
if [ -x "$(command -v ${pika_python_bin})" ]; then
......
......@@ -2,16 +2,21 @@
export PIKA_VERSION=1.1
export COLLECTD_VERSION=5.10.0
export LIKWID_VERSION=5.0.1
export LIKWID_VERSION=git #5.0.1
export PIKA_ROOT=/sw/taurus/tools/pika
if [[ $(hostname -s) = taurusml* ]]; then
export PYTHON_VERSION=3.6.10
export PIKA_PACKAGE_PATH=${PIKA_ROOT}/archives/pika-${PIKA_VERSION}-ml.tar.gz
#export PIKA_PACKAGE_PATH=${PIKA_ROOT}/archives/pika-${PIKA_VERSION}-direct-ml.tar.gz
export PIKA_COLLECTD_BATCH_SIZE=500
else
export PYTHON_VERSION=3.7.6
export PYTHON_VERSION=3.7.7
export PIKA_PACKAGE_PATH=${PIKA_ROOT}/archives/pika-${PIKA_VERSION}.tar.gz
export PIKA_COLLECTD_BATCH_SIZE=200
fi
......