Commit 26d1ec04 authored by Robert Dietrich

added locking for install and collectd start

parent 1cd21bb2
......@@ -27,27 +27,48 @@ if [[ $PIKA_MONITORING -eq 0 || ($PIKA_MONITORING -eq -1 && ${LOCAL_JOBS_RUNNING
echo -e "\ncollectd is not running." >> $DEBUG_PATH 2>&1
else
echo -e "\nkill -TERM $COLLECTD_PID" >> $DEBUG_PATH 2>&1
kill -TERM $COLLECTD_PID
kill -TERM $COLLECTD_PID >> $DEBUG_PATH 2>&1 # flushes metric buffer? send extra signal?
sleep 1
# Make sure collectd really gets killed: after the one-second grace period
# that follows SIGTERM, look the daemon up again and send SIGKILL only if a
# matching process is still listed (non-empty PID list). Testing for an
# empty list here would run "kill -KILL" without any PID and would never
# touch a daemon that survived SIGTERM.
COLLECTD_PID=$(ps -eo pid,cmd | grep -v grep | grep "$DAEMON" | awk '{print $1}')
if [ -n "$COLLECTD_PID" ]; then
  echo -e "\nkill -KILL $COLLECTD_PID" >> $DEBUG_PATH 2>&1
  # $COLLECTD_PID stays unquoted on purpose: it may hold several PIDs.
  kill -KILL $COLLECTD_PID >> $DEBUG_PATH 2>&1
fi
fi
fi
# determine number of PIKA collectd processes
active_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "\nActive PIKA collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
##### only one prolog should care about getting collectd to start ####
# try to access the lock
#exec {lock_fd}>${LOCAL_STORE}/pika_collectd_lock || exit 1
#flock -n "$lock_fd"
#trap "rm -f ${LOCAL_STORE}/pika_collectd_lock" 0 #make sure that the lock is removed on exit
mkdir $lock_collectd 2> /dev/null
if [ $? == 0 ]; then
trap "rm -rf ${lock_collectd}" QUIT TERM EXIT INT #make sure that the lock is removed on exit
# enter locked region (the lock is the directory created by mkdir above; the flock variant is commented out)
# if we have more than one collectd process running, kill all but the last
if [ $active_procs -gt 1 ]; then
# determine number of PIKA collectd processes
active_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "\nActive PIKA collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
# if we have more than one collectd process running, kill all but the last
if [ $active_procs -gt 1 ]; then
# List the PID of every PIKA collectd process, then build the kill list from
# that same list with its last word (PID) removed, so that all but the last
# listed daemon receive SIGTERM. The kill list has to come from COLLECTD_PIDS
# (plural); COLLECTD_PID is a separate variable that this section never
# assigns.
COLLECTD_PIDS=$(ps -eo pid,cmd | grep -v grep | grep "pika_collectd" | awk '{print $1}')
# Unquoted on purpose: echo joins the newline-separated PIDs into one line
# before sed strips the trailing PID.
KILL_COLLECTD_PIDS=$(echo $COLLECTD_PIDS | sed 's/\w*$//')
# NOTE(review): both log lines are present in this view; they look like the
# before/after variants of one message - confirm which one should stay.
echo -e "\nMore than one PIKA collectd active: ${COLLECTD_PIDS}. kill -TERM ${KILL_COLLECTD_PIDS}" >> $DEBUG_PATH 2>&1
echo -e "\nError: More than one PIKA collectd active: ${COLLECTD_PIDS}. kill -TERM ${KILL_COLLECTD_PIDS}" >> $DEBUG_PATH 2>&1
# Unquoted on purpose: the list may contain several PIDs.
kill -TERM ${KILL_COLLECTD_PIDS} >> $DEBUG_PATH 2>&1
fi
fi
# we only check for collectd if $PIKA_MONITORING=1
if [[ $PIKA_MONITORING -eq 1 ]]; then
# we only check for collectd if $PIKA_MONITORING=1
if [ "$PIKA_MONITORING" -eq 1 ]; then
# if collectd is not running yet
if [ $active_procs -eq 0 ]; then
if [ "$active_procs" -eq 0 ]; then
CD_INST_PATH=$PIKA_INSTALL_PATH/collectd/$COLLECTD_VERSION
CUSTOM_TYPES_DIR=$PIKA_INSTALL_PATH/collectd/$COLLECTD_VERSION/share/collectd
......@@ -71,10 +92,15 @@ if [[ $PIKA_MONITORING -eq 1 ]]; then
# Check for lustre and, if available reset counters
LUSTRE_PATH=/proc/fs/lustre
if [ -d "${LUSTRE_PATH}" ]; then
echo -e "\nLustre is available. Reset Lustre counters." >> $DEBUG_PATH 2>&1
# Reset the Lustre client counters when llstat is installed; otherwise only
# log that the counters cannot be reset.
if [ -x "$(command -v llstat)" ]; then
  echo -e "\nLustre is available. Reset Lustre counters for " >> $DEBUG_PATH 2>&1
  for fs in "${LUSTRE_PATH}"/llite/*; do
    # An unmatched glob stays a literal pattern; skip entries that have no
    # stats file instead of handing a bogus path to llstat.
    [ -e "$fs/stats" ] || continue
    echo -e "$fs " >> $DEBUG_PATH 2>&1
    llstat -c "$fs/stats"
  done
else
  echo -e "\nLustre is available, but llstat is missing. Cannot reset Lustre counters. " >> $DEBUG_PATH 2>&1
fi
else
echo -e "Disable Lustre plugin. ${LUSTRE_PATH} not found." >> $DEBUG_PATH 2>&1
sed -i "/Import \"lustre_bw\"/,/<\/Module>/"' s/^/#/' "${COLLECTD_CONF}"
......@@ -91,26 +117,26 @@ if [[ $PIKA_MONITORING -eq 1 ]]; then
# disable (comment out) the NVML plugin if nvidia-smi is not available or CUDA driver not installed
if ! [ -x "$(command -v nvidia-smi)" ] || [ "$(nvidia-smi | grep -c failed)" -gt 0 ]; then
echo -e "Disable GPU NVIDIA plugin (no nvidia-smi or CUDA driver)." >> $DEBUG_PATH 2>&1
echo -e "\nDisable GPU NVIDIA plugin (no nvidia-smi or CUDA driver)." >> $DEBUG_PATH 2>&1
sed -i "/<LoadPlugin gpu_nvidia>/,/gpu_nvidia_end/"' s/^/#/' "${COLLECTD_CONF}"
fi
# get the architecture directory of Likwid perfgroup
export LIKWID_INST_PATH=$PIKA_INSTALL_PATH/likwid/${LIKWID_VERSION}
echo -e "Check for Likwid install path: $LIKWID_INST_PATH" >> $DEBUG_PATH 2>&1
echo -e "\nCheck for Likwid install path: $LIKWID_INST_PATH" >> $DEBUG_PATH 2>&1
arch_dir=`$LIKWID_INST_PATH/bin/likwid-perfctr -i | grep "CPU short:" | awk '{print $3}'`
#echo -e Architecture: $arch_dir >> $DEBUG_PATH 2>&1
group_count=`ls -l $LIKWID_INST_PATH/share/likwid/perfgroups/$arch_dir | grep -c "pika_metrics_"`
#echo Groups: $group_count
# determine measurement time and align second
if [ $group_count -gt 0 ]; then
if [ "$group_count" -gt 0 ]; then
# assuming that not more than 5 groups are measured
if [ $group_count -eq 1 ]; then
if [ "$group_count" -eq 1 ]; then
mtime=50
elif [ $group_count -eq 2 ]; then
elif [ "$group_count" -eq 2 ]; then
mtime=25
elif [ $group_count -eq 3 ]; then
elif [ "$group_count" -eq 3 ]; then
mtime=15
else
mtime=10
......@@ -141,7 +167,7 @@ if [[ $PIKA_MONITORING -eq 1 ]]; then
chmod 640 ${COLLECTD_CONF}
# start collectd
echo -e "Starting Collectd: $DAEMON" >> $DEBUG_PATH 2>&1
echo -e "\nStarting Collectd: $DAEMON" >> $DEBUG_PATH 2>&1
export LIKWID_PERF_PID=-1 # workaround for root access with perf to counter registers
export HOME="workaroundLIKWIDbug"
......@@ -152,15 +178,19 @@ if [[ $PIKA_MONITORING -eq 1 ]]; then
# check if collectd is up and running
active_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "\nDaemon started? Active collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
#echo -e "Daemon started? Active collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
if [ $active_procs -eq 0 ]; then
sleep 1
active_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "\nDaemon started? Active collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
echo -e "Daemon started? Active collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
if [ $active_procs -eq 0 ]; then
echo -e "\nError: Collectd could not be started!" >> $DEBUG_PATH 2>&1
echo -e "Error: Collectd could not be started!" >> $DEBUG_PATH 2>&1
fi
fi
fi
fi
rm -rf ${lock_collectd} # remove the lock
fi
#flock -u "$lock_fd"
# end of locked region
......@@ -11,7 +11,7 @@ if [[ ${PIKA_HOSTNAME} = *"${MASTER_NODE}"* ]]; then
echo -e "\nGet job meta data (master node)" >> $DEBUG_PATH 2>&1
PIKA_MONITORING=`python3 ${PIKA_ROOT}/job_control/slurm/taurus/pika_slurm_env_redis.py --jobid=${PIKA_JOB_ID} --env_file=${BATCHSYSTEM_ENV_FILE} --force 2>&1`
else
PIKA_MONITORING=`python3 ${PIKA_ROOT}/job_control/slurm/taurus/pika_slurm_env_redis.py --jobid=${PIKA_JOB_ID}`
PIKA_MONITORING=`python3 ${PIKA_ROOT}/job_control/slurm/taurus/pika_slurm_env_redis.py --jobid=${PIKA_JOB_ID} 2>&1`
fi
# check if variable PIKA_MONITORING contains error output
......
......@@ -21,7 +21,7 @@ if [ ${PIKA_HOSTNAME} = ${MASTER_NODE} ]; then
# check if monitoring is enabled or disabled
PIKA_MONITORING=1
if [ -e ${LOCAL_STORE}/pika_monitoring_${PIKA_JOB_ID} ]; then
if [ -e "${LOCAL_STORE}/pika_monitoring_${PIKA_JOB_ID}" ]; then
echo -e "\nCheck if monitoring is enabled or disabled" >> $DEBUG_PATH 2>&1
PIKA_MONITORING=`cat ${LOCAL_STORE}/pika_monitoring_${PIKA_JOB_ID}`
rm -f ${LOCAL_STORE}/pika_monitoring_${PIKA_JOB_ID}
......@@ -30,7 +30,7 @@ if [ ${PIKA_HOSTNAME} = ${MASTER_NODE} ]; then
echo -e "\nPIKA_MONITORING=${PIKA_MONITORING}" >> $DEBUG_PATH 2>&1
# if Redis script worked overwrite metadata
if [[ ${PIKA_MONITORING} -ge 0 ]]; then
if [ "${PIKA_MONITORING}" -ge 0 ]; then
source ${BATCHSYSTEM_ENV_FILE} >> $DEBUG_PATH 2>&1
else
echo -e "\nNo job metadata from redis available." >> $DEBUG_PATH 2>&1
......@@ -41,7 +41,7 @@ if [ ${PIKA_HOSTNAME} = ${MASTER_NODE} ]; then
env | grep "PIKA_" >> $DEBUG_PATH 2>&1
# use start time from redis if available
if [ "${PIKA_JOB_START}" -gt "0" ]; then
if [ "${PIKA_JOB_START}" -gt 0 ]; then
JOB_START=${PIKA_JOB_START}
fi
......
......@@ -37,6 +37,11 @@ fi
# print date
date >> $DEBUG_PATH 2>&1
# The local store (/dev/shm) has to exist; log an error when it is missing.
# TODO: if LOCAL_STORE does not exist, everything will break
[ -d "${LOCAL_STORE}" ] ||
  echo -e "\nError:LOCAL_STORE=${LOCAL_STORE} does not exist." >> $DEBUG_PATH 2>&1
##### (4) if SLURM_NODELIST is of length zero
if [[ -z "${SLURM_NODELIST}" ]]; then
......@@ -48,61 +53,18 @@ fi
##### (5) pika package installation
# used for install and collectd start
lock_collectd=${LOCAL_STORE}/pika_collectd_setup.lock
# install pika python and likwid in /opt/pika if it is not already there
if [ ! -d "${PIKA_INSTALL_PATH}" ]; then
echo -e "\nInstall/Replace PIKA software stack to ${PIKA_INSTALL_PATH}" >> $DEBUG_PATH 2>&1
# check if an old collectd daemon is still running, if so kill it
echo -e "\nCheck if an old PIKA collectd is still running" >> $DEBUG_PATH 2>&1
DAEMON="pika_collectd"
COLLECTD_PID=`ps -eo pid,cmd | grep -v grep | grep "$DAEMON" | awk '{print $1}'`
echo -e "\nCOLLECTD_PID=$COLLECTD_PID" >> $DEBUG_PATH 2>&1
if [ -z "$COLLECTD_PID" ]; then
echo -e "\ncollectd is not running." >> $DEBUG_PATH 2>&1
else
old_pika_collectd_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "\nNumber of active old PIKA Collectd processes: ${old_pika_collectd_procs}. Try to terminate them." >> $DEBUG_PATH 2>&1
kill -TERM $COLLECTD_PID
sleep 1
old_pika_collectd_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
if [ $old_pika_collectd_procs -gt 0 ]; then
echo -e "\nkill -KILL $COLLECTD_PID" >> $DEBUG_PATH 2>&1
kill -KILL $COLLECTD_PID >> $DEBUG_PATH 2>&1
fi
old_pika_collectd_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "\nError: Could not terminate old PIKA Collectd processes. ${old_pika_collectd_procs} are still running." >> $DEBUG_PATH 2>&1
fi
# delete old installation if it is still there
if [ -d "/opt/pika" ]; then
rm -rf /opt/pika
fi
# temporary: delete old prope installations
if [ -d "/opt/prope" ]; then
rm -rf /opt/prope
fi
mkdir -p ${PIKA_INSTALL_PATH}
echo -e "tar xzf ${PIKA_PACKAGE_PATH} -C ${PIKA_INSTALL_PATH}/.." >> $DEBUG_PATH 2>&1
tar xzf ${PIKA_PACKAGE_PATH} -C ${PIKA_INSTALL_PATH}/.. >> $DEBUG_PATH 2>&1
fi
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_install_prolog_include.sh >> $DEBUG_PATH 2>&1
##### (6) pika presetup
# setup logrotate
# check if pika_logrotate.sh is in /etc/cron.daily
# check if pika_logrotate.sh is in /etc/cron.daily
if [ ! -f "/etc/cron.daily/pika_logrotate.sh" ]; then
echo -e "\nSetup logrotate" >> $DEBUG_PATH 2>&1
cp ${PIKA_ROOT}/daemon/logrotate/pika_logrotate.sh /etc/cron.daily >> $DEBUG_PATH 2>&1
fi
# check python
echo -e "\nCheck python3 path:" >> $DEBUG_PATH 2>&1
which python3 >> $DEBUG_PATH 2>&1
# determine master node
MASTER_NODE=`echo ${PIKA_JOB_NODELIST} | nodeset -e | cut -d ' ' -f 1 | cut -d. -f1`
......@@ -118,18 +80,39 @@ BATCHSYSTEM_ENV_FILE=${LOCAL_STORE}/pika_batchsystem_env_${PIKA_JOB_ID}
# set defaults for all pika metadata provided by SLURM
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_slurm_env.sh >> $DEBUG_PATH 2>&1
# Check for a Python installation and log where python3 was found.
# If python3 is not on PATH yet, poll once per second for at most 5 seconds
# and log as soon as it shows up. Nothing more is logged here when it never
# appears.
echo -e "\nCheck python3 path:" >> $DEBUG_PATH 2>&1
if [ -x "$(command -v python3)" ]; then
  # command -v is a shell builtin, so the path can be printed even on nodes
  # where the external `which` tool is not installed.
  command -v python3 >> $DEBUG_PATH 2>&1
else
  # sleep until we have a python (at most 5 seconds)
  for i in 1 2 3 4 5; do
    sleep 1
    if [ -x "$(command -v python3)" ]; then
      echo -e "python3 is now available!" >> $DEBUG_PATH 2>&1
      break
    fi
  done
fi
##### (7) get additional job metadata from redis
PIKA_MONITORING=1
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_get_metadata_prolog_include.sh >> $DEBUG_PATH 2>&1
if [ -x "$(command -v python3)" ]; then
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_get_metadata_prolog_include.sh >> $DEBUG_PATH 2>&1
else
echo -e "Error: python3 is NOT available!" >> $DEBUG_PATH 2>&1
fi
##### (8) based on the PIKA_MONITORING value, start or stop collectd
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_collectd_prolog_include.sh >> $DEBUG_PATH 2>&1
#/opt/pika/1.0/likwid/5.0.1/bin/likwid-topology >> $DEBUG_PATH 2>&1
#export OMP_NUM_THREADS=176
#export HOME="workaroundLIKWIDbug"
#/opt/pika/1.0/likwid/5.0.1/bin/likwid-perfctr -V 3 -g pika_metrics_1 ~rdietric/examples/bash_scripts/laplace2d-taurusml >> $DEBUG_PATH 2>&1
##### (9) save job metadata
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_save_metadata_prolog_include.sh >> $DEBUG_PATH 2>&1
echo -e "\nProlog finished sucessfully!" >> $DEBUG_PATH 2>&1
echo -e "\nProlog finished successfully!" >> $DEBUG_PATH 2>&1
exit 0
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment