Commit 26d1ec04 authored by Robert Dietrich

added locking for install and collectd start

parent 1cd21bb2
@@ -27,140 +27,170 @@ if [[ $PIKA_MONITORING -eq 0 || ($PIKA_MONITORING -eq -1 && ${LOCAL_JOBS_RUNNING
echo -e "\ncollectd is not running." >> $DEBUG_PATH 2>&1
else
echo -e "\nkill -TERM $COLLECTD_PID" >> $DEBUG_PATH 2>&1
kill -TERM $COLLECTD_PID
kill -TERM $COLLECTD_PID >> $DEBUG_PATH 2>&1 # flushes metric buffer? send extra signal?
sleep 1
# make sure collectd gets killed
COLLECTD_PID=`ps -eo pid,cmd | grep -v grep | grep "$DAEMON" | awk '{print $1}'`
if [ -z "$COLLECTD_PID" ]; then
echo -e "\nkill -KILL $COLLECTD_PID" >> $DEBUG_PATH 2>&1
kill -KILL $COLLECTD_PID >> $DEBUG_PATH 2>&1
fi
fi
fi
# determine number of PIKA collectd processes
active_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "\nActive PIKA collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
# if we have more than one collectd process running, kill all but the last
if [ $active_procs -gt 1 ]; then
COLLECTD_PIDS=`ps -eo pid,cmd | grep -v grep | grep "pika_collectd" | awk '{print $1}'`
KILL_COLLECTD_PIDS=`echo $COLLECTD_PIDS | sed s/'\w*$'//` # strip the last word/pid so the newest process is kept
echo -e "\nMore than one PIKA collectd active: ${COLLECTD_PIDS}. kill -TERM ${KILL_COLLECTD_PIDS}" >> $DEBUG_PATH 2>&1
kill -TERM ${KILL_COLLECTD_PIDS} >> $DEBUG_PATH 2>&1
fi
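To make the cleanup above easier to follow: the intent is to keep only the most recently started collectd instance, terminate the others with SIGTERM, and fall back to SIGKILL for anything that survives. A minimal sketch of that logic, assuming (as the script does) that the last PID printed by `ps` belongs to the newest process:

```bash
DAEMON="pika_collectd"
# All matching PIDs, one per line, in the order ps prints them.
PIDS=$(ps -eo pid,cmd | grep -v grep | grep "$DAEMON" | awk '{print $1}')
KEEP=$(echo "$PIDS" | tail -n 1)   # assumed to be the newest instance

for pid in $PIDS; do
  [ "$pid" = "$KEEP" ] && continue
  kill -TERM "$pid" 2>/dev/null    # polite shutdown first
done
sleep 1
for pid in $PIDS; do
  [ "$pid" = "$KEEP" ] && continue
  kill -0 "$pid" 2>/dev/null && kill -KILL "$pid"   # force-kill survivors
done
```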
##### only one prolog should care about getting collectd to start ####
# we only check for collectd if $PIKA_MONITORING=1
if [[ $PIKA_MONITORING -eq 1 ]]; then
# try to access the lock
#exec {lock_fd}>${LOCAL_STORE}/pika_collectd_lock || exit 1
#flock -n "$lock_fd"
#trap "rm -f ${LOCAL_STORE}/pika_collectd_lock" 0 #make sure that the lock is removed on exit
# if collectd is not running yet
if [ $active_procs -eq 0 ]; then
CD_INST_PATH=$PIKA_INSTALL_PATH/collectd/$COLLECTD_VERSION
CUSTOM_TYPES_DIR=$PIKA_INSTALL_PATH/collectd/$COLLECTD_VERSION/share/collectd
COLLECTD_PYTHON_PLUGIN_PATH=$PIKA_ROOT/daemon/collectd/collectd-plugins/python
mkdir $lock_collectd 2> /dev/null
if [ $? == 0 ]; then
trap "rm -rf ${lock_collectd}" QUIT TERM EXIT INT #make sure that the lock is removed on exit
# enter locked region (via flock)
# determine number of PIKA collectd processes
active_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "\nActive PIKA collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
# set hostname to avoid a system call for every metric dispatch and copy collectd.conf into /tmp
hostshort=`hostname -s`
sed "/#HostnameReplace/cHostname ${hostshort}" \
${PIKA_ROOT}/daemon/collectd/collectd_template.conf > ${COLLECTD_CONF}
# if we have more than one collectd process running, kill all but the last
if [ $active_procs -gt 1 ]; then
COLLECTD_PIDS=`ps -eo pid,cmd | grep -v grep | grep "pika_collectd" | awk '{print $1}'`
KILL_COLLECTD_PIDS=`echo $COLLECTD_PIDS | sed s/'\w*$'//` # strip the last word/pid so the newest process is kept
echo -e "\nError: More than one PIKA collectd active: ${COLLECTD_PIDS}. kill -TERM ${KILL_COLLECTD_PIDS}" >> $DEBUG_PATH 2>&1
kill -TERM ${KILL_COLLECTD_PIDS} >> $DEBUG_PATH 2>&1
fi
# we only check for collectd if $PIKA_MONITORING=1
if [ "$PIKA_MONITORING" -eq 1 ]; then
# if collectd is not running yet
if [ "$active_procs" -eq 0 ]; then
# set the path and name of the collectd logfile
sed -i '/<Plugin logfile>/,/Plugin>/'" s|File.*|File \"${COLLECTD_LOGFILE}\"|" ${COLLECTD_CONF}
CD_INST_PATH=$PIKA_INSTALL_PATH/collectd/$COLLECTD_VERSION
CUSTOM_TYPES_DIR=$PIKA_INSTALL_PATH/collectd/$COLLECTD_VERSION/share/collectd
COLLECTD_PYTHON_PLUGIN_PATH=$PIKA_ROOT/daemon/collectd/collectd-plugins/python
# set hostname to avoid a system call for every metric dispatch and copy collectd.conf into /tmp
hostshort=`hostname -s`
sed "/#HostnameReplace/cHostname ${hostshort}" \
${PIKA_ROOT}/daemon/collectd/collectd_template.conf > ${COLLECTD_CONF}
# use + as sed separator
sed -i -e "s+CD_INST_PATH+${CD_INST_PATH}+" ${COLLECTD_CONF}
sed -i -e "s+CUSTOM_TYPES_DIR+${CUSTOM_TYPES_DIR}+" ${COLLECTD_CONF}
# set the path and name of the collectd logfile
sed -i '/<Plugin logfile>/,/Plugin>/'" s|File.*|File \"${COLLECTD_LOGFILE}\"|" ${COLLECTD_CONF}
# use + as sed separator
sed -i -e "s+CD_INST_PATH+${CD_INST_PATH}+" ${COLLECTD_CONF}
sed -i -e "s+CUSTOM_TYPES_DIR+${CUSTOM_TYPES_DIR}+" ${COLLECTD_CONF}
# set python module path
sed -i "/ModulePath/c \ \ ModulePath \"${COLLECTD_PYTHON_PLUGIN_PATH}\"" ${COLLECTD_CONF}
# Check for Lustre and, if available, reset counters
LUSTRE_PATH=/proc/fs/lustre
if [ -d "${LUSTRE_PATH}" ]; then
echo -e "\nLustre is available. Reset Lustre counters." >> $DEBUG_PATH 2>&1
for fs in ${LUSTRE_PATH}/llite/*; do
llstat -c $fs/stats
done
else
echo -e "Disable Lustre plugin. ${LUSTRE_PATH} not found." >> $DEBUG_PATH 2>&1
sed -i "/Import \"lustre_bw\"/,/<\/Module>/"' s/^/#/' "${COLLECTD_CONF}"
fi
# set python module path
sed -i "/ModulePath/c \ \ ModulePath \"${COLLECTD_PYTHON_PLUGIN_PATH}\"" ${COLLECTD_CONF}
# Check for Lustre and, if available, reset counters
LUSTRE_PATH=/proc/fs/lustre
if [ -d "${LUSTRE_PATH}" ]; then
if [ -x "$(command -v llstat)" ]; then
echo -e "\nLustre is available. Reset Lustre counters for " >> $DEBUG_PATH 2>&1
for fs in ${LUSTRE_PATH}/llite/*; do
echo -e "$fs " >> $DEBUG_PATH 2>&1
llstat -c $fs/stats
done
else
echo -e "\nLustre is available, but llstat is missing. Cannot reset Lustre counters. " >> $DEBUG_PATH 2>&1
fi
else
echo -e "Disable Lustre plugin. ${LUSTRE_PATH} not found." >> $DEBUG_PATH 2>&1
sed -i "/Import \"lustre_bw\"/,/<\/Module>/"' s/^/#/' "${COLLECTD_CONF}"
fi
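For context on the Lustre block: `llstat -c` clears the counters in the stats file it is given, so iterating over the llite mount directories resets the client-side statistics before the job starts. A self-contained sketch of the same check-and-reset, with no PIKA-specific variables:

```bash
LUSTRE_PATH=/proc/fs/lustre
if [ -d "$LUSTRE_PATH" ] && command -v llstat >/dev/null 2>&1; then
  for fs in "$LUSTRE_PATH"/llite/*; do
    [ -e "$fs/stats" ] || continue   # skip if the glob matched nothing
    llstat -c "$fs/stats"            # -c clears the counters of this mount
  done
fi
```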
# set InfluxDB access settings
sed -i "/#INFLUXHOST/c \ \ \ \ host \"${INFLUXDB_HOST}\"" ${COLLECTD_CONF}
sed -i "/#INFLUXPORT/c \ \ \ \ port \"${INFLUXDB_PORT}\"" ${COLLECTD_CONF}
sed -i "/#INFLUXUSER/c \ \ \ \ user \"${INFLUXDB_USER}\"" ${COLLECTD_CONF}
sed -i "/#INFLUXPWD/c \ \ \ \ pwd \"${INFLUXDB_PASSWORD}\"" ${COLLECTD_CONF}
sed -i "/#INFLUXDBNAME/c \ \ \ \ database \"${INFLUXDB_DATABASE}\"" ${COLLECTD_CONF}
# set InfluxDB access settings
sed -i "/#INFLUXHOST/c \ \ \ \ host \"${INFLUXDB_HOST}\"" ${COLLECTD_CONF}
sed -i "/#INFLUXPORT/c \ \ \ \ port \"${INFLUXDB_PORT}\"" ${COLLECTD_CONF}
sed -i "/#INFLUXUSER/c \ \ \ \ user \"${INFLUXDB_USER}\"" ${COLLECTD_CONF}
sed -i "/#INFLUXPWD/c \ \ \ \ pwd \"${INFLUXDB_PASSWORD}\"" ${COLLECTD_CONF}
sed -i "/#INFLUXDBNAME/c \ \ \ \ database \"${INFLUXDB_DATABASE}\"" ${COLLECTD_CONF}
which nvidia-smi >> $DEBUG_PATH 2>&1
# disable (comment out) the NVML plugin if nvidia-smi is not available or CUDA driver not installed
if ! [ -x "$(command -v nvidia-smi)" ] || [ "$(nvidia-smi | grep -c failed)" -gt 0 ]; then
echo -e "Disable GPU NVIDIA plugin (no nvidia-smi or CUDA driver)." >> $DEBUG_PATH 2>&1
sed -i "/<LoadPlugin gpu_nvidia>/,/gpu_nvidia_end/"' s/^/#/' "${COLLECTD_CONF}"
fi
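Plugins such as gpu_nvidia, likwid, and unixsock are disabled by commenting out their whole configuration section; sed's two-address form `/start/,/end/` applies `s/^/#/` to every line of that range, inclusive. A small demonstration on a toy file with marker comments modelled on the ones used above:

```bash
cat > /tmp/plugins.conf <<'EOF'
<LoadPlugin gpu_nvidia>
  Interval 30
</LoadPlugin> #gpu_nvidia_end
LoadPlugin cpu
EOF

# Prefix every line between the two markers (inclusive) with '#'.
sed -i '/<LoadPlugin gpu_nvidia>/,/gpu_nvidia_end/ s/^/#/' /tmp/plugins.conf
cat /tmp/plugins.conf   # the gpu_nvidia section is commented, 'LoadPlugin cpu' is not
```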
# get the architecture directory of Likwid perfgroup
export LIKWID_INST_PATH=$PIKA_INSTALL_PATH/likwid/${LIKWID_VERSION}
echo -e "Check for Likwid install path: $LIKWID_INST_PATH" >> $DEBUG_PATH 2>&1
arch_dir=`$LIKWID_INST_PATH/bin/likwid-perfctr -i | grep "CPU short:" | awk '{print $3}'`
#echo -e Architecture: $arch_dir >> $DEBUG_PATH 2>&1
group_count=`ls -l $LIKWID_INST_PATH/share/likwid/perfgroups/$arch_dir | grep -c "pika_metrics_"`
#echo Groups: $group_count
# determine measurement time and align second
if [ $group_count -gt 0 ]; then
# assuming that not more than 5 groups are measured
if [ $group_count -eq 1 ]; then
mtime=50
elif [ $group_count -eq 2 ]; then
mtime=25
elif [ $group_count -eq 3 ]; then
mtime=15
else
mtime=10
which nvidia-smi >> $DEBUG_PATH 2>&1
# disable (comment out) the NVML plugin if nvidia-smi is not available or CUDA driver not installed
if ! [ -x "$(command -v nvidia-smi)" ] || [ "$(nvidia-smi | grep -c failed)" -gt 0 ]; then
echo -e "\nDisable GPU NVIDIA plugin (no nvidia-smi or CUDA driver)." >> $DEBUG_PATH 2>&1
sed -i "/<LoadPlugin gpu_nvidia>/,/gpu_nvidia_end/"' s/^/#/' "${COLLECTD_CONF}"
fi
startsecond=$((60-(group_count*mtime)))
echo -e "Set Likwid align second to $startsecond and measurement time to $mtime (arch: ${arch_dir})" >> $DEBUG_PATH 2>&1
sed -i '/<LoadPlugin likwid>/,/LoadPlugin/'" s/AlignRead.*/AlignRead $startsecond/" "${COLLECTD_CONF}"
sed -i "/Mtime/c \ \ Mtime \"$mtime\"" ${COLLECTD_CONF}
# get the architecture directory of Likwid perfgroup
export LIKWID_INST_PATH=$PIKA_INSTALL_PATH/likwid/${LIKWID_VERSION}
echo -e "\nCheck for Likwid install path: $LIKWID_INST_PATH" >> $DEBUG_PATH 2>&1
arch_dir=`$LIKWID_INST_PATH/bin/likwid-perfctr -i | grep "CPU short:" | awk '{print $3}'`
#echo -e Architecture: $arch_dir >> $DEBUG_PATH 2>&1
group_count=`ls -l $LIKWID_INST_PATH/share/likwid/perfgroups/$arch_dir | grep -c "pika_metrics_"`
#echo Groups: $group_count
# determine measurement time and align second
if [ "$group_count" -gt 0 ]; then
# assuming that not more than 5 groups are measured
if [ "$group_count" -eq 1 ]; then
mtime=50
elif [ "$group_count" -eq 2 ]; then
mtime=25
elif [ "$group_count" -eq 3 ]; then
mtime=15
else
mtime=10
fi
startsecond=$((60-(group_count*mtime)))
echo -e "Set Likwid align second to $startsecond and measurement time to $mtime (arch: ${arch_dir})" >> $DEBUG_PATH 2>&1
sed -i '/<LoadPlugin likwid>/,/LoadPlugin/'" s/AlignRead.*/AlignRead $startsecond/" "${COLLECTD_CONF}"
sed -i "/Mtime/c \ \ Mtime \"$mtime\"" ${COLLECTD_CONF}
group_string=`ls $LIKWID_INST_PATH/share/likwid/perfgroups/$arch_dir | grep "pika_metrics" | tr '\n' ',' | sed 's/.txt,/,/g' | sed 's/,*$//g'`
sed -i "/Groups/c \ \ Groups \"$group_string\"" ${COLLECTD_CONF}
#expose path to likwid library (needed to load collectd likwid plugin)
export LD_LIBRARY_PATH=${LIKWID_INST_PATH}/lib:${LD_LIBRARY_PATH}
# set the socket file from PIKA configuration
sed -i '/<Plugin unixsock>/,/Plugin>/'" s|SocketFile.*|SocketFile \"${PIKA_COLLECTD_SOCKET}\"|" ${COLLECTD_CONF}
else
echo -e "Error: No PIKA group definitions for LIKWID found! Disable LIKWID and unixsock plugins." >> $DEBUG_PATH 2>&1
sed -i "/likwid/,/likwid_end/"' s/^/#/' "${COLLECTD_CONF}"
sed -i "/LoadPlugin unixsock/,/Plugin>/"' s/^/#/' "${COLLECTD_CONF}"
fi
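To make the timing arithmetic concrete: with group_count Likwid groups and a per-group measurement time mtime, the groups occupy group_count * mtime seconds of each minute, and AlignRead is set so the measurement sequence ends exactly on the full minute. A worked sketch of the mapping used above:

```bash
for group_count in 1 2 3 4; do
  case $group_count in
    1) mtime=50 ;;
    2) mtime=25 ;;
    3) mtime=15 ;;
    *) mtime=10 ;;
  esac
  startsecond=$((60 - (group_count * mtime)))
  echo "groups=$group_count mtime=${mtime}s -> AlignRead $startsecond"
done
# e.g. 2 groups * 25 s = 50 s of measurement, so reading is aligned to second 10.
```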
# disallow users to read collectd config file
chmod 640 ${COLLECTD_CONF}
# start collectd
echo -e "\nStarting Collectd: $DAEMON" >> $DEBUG_PATH 2>&1
export LIKWID_PERF_PID=-1 # workaround for root access with perf to counter registers
group_string=`ls $LIKWID_INST_PATH/share/likwid/perfgroups/$arch_dir | grep "pika_metrics" | tr '\n' ',' | sed 's/.txt,/,/g' | sed 's/,*$//g'`
export HOME="workaroundLIKWIDbug"
sed -i "/Groups/c \ \ Groups \"$group_string\"" ${COLLECTD_CONF}
echo -e "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> $DEBUG_PATH 2>&1
#expose path to likwid library (needed to load collectd likwid plugin)
export LD_LIBRARY_PATH=${LIKWID_INST_PATH}/lib:${LD_LIBRARY_PATH}
$DAEMON >> $DEBUG_PATH 2>&1
# set the socket file from PIKA configuration
sed -i '/<Plugin unixsock>/,/Plugin>/'" s|SocketFile.*|SocketFile \"${PIKA_COLLECTD_SOCKET}\"|" ${COLLECTD_CONF}
else
echo -e "Error: No PIKA group definitions for LIKWID found! Disable LIKWID and unixsock plugins." >> $DEBUG_PATH 2>&1
sed -i "/likwid/,/likwid_end/"' s/^/#/' "${COLLECTD_CONF}"
sed -i "/LoadPlugin unixsock/,/Plugin>/"' s/^/#/' "${COLLECTD_CONF}"
fi
# disallow users to read collectd config file
chmod 640 ${COLLECTD_CONF}
# start collectd
echo -e "Starting Collectd: $DAEMON" >> $DEBUG_PATH 2>&1
export LIKWID_PERF_PID=-1 # workaround for root access with perf to counter registers
export HOME="workaroundLIKWIDbug"
echo -e "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> $DEBUG_PATH 2>&1
$DAEMON >> $DEBUG_PATH 2>&1
# check if collectd is up and running
active_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "\nDaemon started? Active collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
if [ $active_procs -eq 0 ]; then
sleep 1
# check if collectd is up and running
active_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "\nDaemon started? Active collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
#echo -e "Daemon started? Active collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
if [ $active_procs -eq 0 ]; then
echo -e "\nError: Collectd could not be started!" >> $DEBUG_PATH 2>&1
sleep 1
active_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "Daemon started? Active collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
if [ $active_procs -eq 0 ]; then
echo -e "Error: Collectd could not be started!" >> $DEBUG_PATH 2>&1
fi
fi
fi
fi
rm -rf ${lock_collectd} # remove the lock
fi
#flock -u "$lock_fd"
# end of locked region
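The locking introduced in this commit uses `mkdir` as an atomic test-and-set: on each node, only the first prolog instance succeeds in creating the lock directory and proceeds to configure and start collectd, and a trap guarantees the lock is removed on every exit path (the commented flock lines sketch an equivalent file-descriptor-based variant). A minimal sketch of the pattern with a placeholder critical section:

```bash
LOCAL_STORE=/dev/shm
lock_collectd=${LOCAL_STORE}/pika_collectd_setup.lock

if mkdir "$lock_collectd" 2>/dev/null; then
  # We own the lock; release it on any exit path.
  trap 'rm -rf "$lock_collectd"' QUIT TERM EXIT INT
  # ---- critical section: generate config, start collectd (placeholder) ----
  echo "lock acquired, starting collectd here"
  rm -rf "$lock_collectd"   # explicit release; the trap covers error exits
else
  echo "another prolog holds the lock, skipping collectd start"
fi
```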
@@ -11,7 +11,7 @@ if [[ ${PIKA_HOSTNAME} = *"${MASTER_NODE}"* ]]; then
echo -e "\nGet job meta data (master node)" >> $DEBUG_PATH 2>&1
PIKA_MONITORING=`python3 ${PIKA_ROOT}/job_control/slurm/taurus/pika_slurm_env_redis.py --jobid=${PIKA_JOB_ID} --env_file=${BATCHSYSTEM_ENV_FILE} --force 2>&1`
else
PIKA_MONITORING=`python3 ${PIKA_ROOT}/job_control/slurm/taurus/pika_slurm_env_redis.py --jobid=${PIKA_JOB_ID}`
PIKA_MONITORING=`python3 ${PIKA_ROOT}/job_control/slurm/taurus/pika_slurm_env_redis.py --jobid=${PIKA_JOB_ID} 2>&1`
fi
# check if variable PIKA_MONITORING contains error output
@@ -26,4 +26,4 @@ echo -e "\nPIKA_MONITORING=$PIKA_MONITORING" >> $DEBUG_PATH 2>&1
# write monitoring flag into file for master node
if [[ ${PIKA_HOSTNAME} = *"${MASTER_NODE}"* ]]; then
echo $PIKA_MONITORING > ${LOCAL_STORE}/pika_monitoring_${PIKA_JOB_ID}
fi
\ No newline at end of file
fi
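The master node hands the PIKA_MONITORING value over to a later prolog stage through a per-job flag file in LOCAL_STORE; the reader consumes and deletes it so it cannot leak into the next job. A hedged sketch of both sides of that handoff (job id and paths are placeholders):

```bash
LOCAL_STORE=/dev/shm
PIKA_JOB_ID=123456                      # placeholder job id

# Writer side (metadata stage on the master node).
PIKA_MONITORING=1
echo "$PIKA_MONITORING" > "${LOCAL_STORE}/pika_monitoring_${PIKA_JOB_ID}"

# Reader side (later stage); default to monitoring enabled.
PIKA_MONITORING=1
flag_file="${LOCAL_STORE}/pika_monitoring_${PIKA_JOB_ID}"
if [ -e "$flag_file" ]; then
  PIKA_MONITORING=$(cat "$flag_file")
  rm -f "$flag_file"                    # consume the flag
fi
echo "PIKA_MONITORING=$PIKA_MONITORING"
```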
@@ -21,7 +21,7 @@ if [ ${PIKA_HOSTNAME} = ${MASTER_NODE} ]; then
# check if monitoring is enabled or disabled
PIKA_MONITORING=1
if [ -e ${LOCAL_STORE}/pika_monitoring_${PIKA_JOB_ID} ]; then
if [ -e "${LOCAL_STORE}/pika_monitoring_${PIKA_JOB_ID}" ]; then
echo -e "\nCheck if monitoring is enabled or disabled" >> $DEBUG_PATH 2>&1
PIKA_MONITORING=`cat ${LOCAL_STORE}/pika_monitoring_${PIKA_JOB_ID}`
rm -f ${LOCAL_STORE}/pika_monitoring_${PIKA_JOB_ID}
@@ -30,7 +30,7 @@ if [ ${PIKA_HOSTNAME} = ${MASTER_NODE} ]; then
echo -e "\nPIKA_MONITORING=${PIKA_MONITORING}" >> $DEBUG_PATH 2>&1
# if Redis script worked overwrite metadata
if [[ ${PIKA_MONITORING} -ge 0 ]]; then
if [ "${PIKA_MONITORING}" -ge 0 ]; then
source ${BATCHSYSTEM_ENV_FILE} >> $DEBUG_PATH 2>&1
else
echo -e "\nNo job metadata from redis available." >> $DEBUG_PATH 2>&1
@@ -41,7 +41,7 @@ if [ ${PIKA_HOSTNAME} = ${MASTER_NODE} ]; then
env | grep "PIKA_" >> $DEBUG_PATH 2>&1
# use start time from redis if available
if [ "${PIKA_JOB_START}" -gt "0" ]; then
if [ "${PIKA_JOB_START}" -gt 0 ]; then
JOB_START=${PIKA_JOB_START}
fi
@@ -37,6 +37,11 @@ fi
# print date
date >> $DEBUG_PATH 2>&1
# check that local store (/dev/shm) exists
# TODO: if LOCAL_STORE does not exist, everything will break
if ! [ -d "${LOCAL_STORE}" ]; then
echo -e "\nError:LOCAL_STORE=${LOCAL_STORE} does not exist." >> $DEBUG_PATH 2>&1
fi
##### (4) if SLURM_NODELIST is of length zero
if [[ -z "${SLURM_NODELIST}" ]]; then
@@ -48,61 +53,18 @@ fi
##### (5) pika package installation
# used for install and collectd start
lock_collectd=${LOCAL_STORE}/pika_collectd_setup.lock
# install pika python and likwid in /opt/pika if it is not already there
if [ ! -d "${PIKA_INSTALL_PATH}" ]; then
echo -e "\nInstall/Replace PIKA software stack to ${PIKA_INSTALL_PATH}" >> $DEBUG_PATH 2>&1
# check if an old collectd daemon is still running, if so kill it
echo -e "\nCheck if an old PIKA collectd is still running" >> $DEBUG_PATH 2>&1
DAEMON="pika_collectd"
COLLECTD_PID=`ps -eo pid,cmd | grep -v grep | grep "$DAEMON" | awk '{print $1}'`
echo -e "\nCOLLECTD_PID=$COLLECTD_PID" >> $DEBUG_PATH 2>&1
if [ -z "$COLLECTD_PID" ]; then
echo -e "\ncollectd is not running." >> $DEBUG_PATH 2>&1
else
old_pika_collectd_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "\nNumber of active old PIKA Collectd processes: ${old_pika_collectd_procs}. Try to terminate them." >> $DEBUG_PATH 2>&1
kill -TERM $COLLECTD_PID
sleep 1
old_pika_collectd_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
if [ $old_pika_collectd_procs -gt 0 ]; then
echo -e "\nkill -KILL $COLLECTD_PID" >> $DEBUG_PATH 2>&1
kill -KILL $COLLECTD_PID >> $DEBUG_PATH 2>&1
fi
old_pika_collectd_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
if [ $old_pika_collectd_procs -gt 0 ]; then
echo -e "\nError: Could not terminate old PIKA Collectd processes. ${old_pika_collectd_procs} are still running." >> $DEBUG_PATH 2>&1
fi
fi
# delete old installation if it is still there
if [ -d "/opt/pika" ]; then
rm -rf /opt/pika
fi
# temporary: delete old prope installations
if [ -d "/opt/prope" ]; then
rm -rf /opt/prope
fi
mkdir -p ${PIKA_INSTALL_PATH}
echo -e "tar xzf ${PIKA_PACKAGE_PATH} -C ${PIKA_INSTALL_PATH}/.." >> $DEBUG_PATH 2>&1
tar xzf ${PIKA_PACKAGE_PATH} -C ${PIKA_INSTALL_PATH}/.. >> $DEBUG_PATH 2>&1
fi
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_install_prolog_include.sh >> $DEBUG_PATH 2>&1
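The inline installation block removed above now lives in pika_install_prolog_include.sh; its essence is an idempotent install that extracts the packaged software stack only when the install directory is missing. A reduced sketch with illustrative paths:

```bash
PIKA_INSTALL_PATH=/opt/pika/1.0                  # illustrative
PIKA_PACKAGE_PATH=/sw/pika/pika-1.0.tar.gz       # illustrative

if [ ! -d "$PIKA_INSTALL_PATH" ]; then
  mkdir -p "$PIKA_INSTALL_PATH"
  # The tarball contains the versioned directory, so extract into the parent.
  tar xzf "$PIKA_PACKAGE_PATH" -C "$PIKA_INSTALL_PATH/.."
fi
```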
##### (6) pika presetup
# setup logrotate
# check if pika_logrotate.sh is in /etc/cron.daily
# check if pika_lograte.sh is in /etc/cron.daily
if [ ! -f "/etc/cron.daily/pika_logrotate.sh" ]; then
echo -e "\nSetup logrotate" >> $DEBUG_PATH 2>&1
cp ${PIKA_ROOT}/daemon/logrotate/pika_logrotate.sh /etc/cron.daily >> $DEBUG_PATH 2>&1
fi
# check python
echo -e "\nCheck python3 path:" >> $DEBUG_PATH 2>&1
which python3 >> $DEBUG_PATH 2>&1
# determine master node
MASTER_NODE=`echo ${PIKA_JOB_NODELIST} | nodeset -e | cut -d ' ' -f 1 | cut -d. -f1`
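The master node is the first host of the expanded job node list; `nodeset -e` (ClusterShell) expands a folded Slurm node set into individual host names, and the cut calls take the first one and strip the domain. A small example with a made-up nodelist:

```bash
PIKA_JOB_NODELIST="taurusi[6001-6003]"   # made-up nodelist
MASTER_NODE=$(nodeset -e "$PIKA_JOB_NODELIST" | cut -d ' ' -f 1 | cut -d. -f1)
echo "$MASTER_NODE"                      # -> taurusi6001
```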
@@ -118,18 +80,39 @@ BATCHSYSTEM_ENV_FILE=${LOCAL_STORE}/pika_batchsystem_env_${PIKA_JOB_ID}
# set defaults for all pika metadata provided by SLURM
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_slurm_env.sh >> $DEBUG_PATH 2>&1
# check for Python installation
echo -e "\nCheck python3 path:" >> $DEBUG_PATH 2>&1
if [ -x "$(command -v python3)" ]; then
which python3 >> $DEBUG_PATH 2>&1
else
# sleep until we have a python (at most 5 seconds)
for i in 1 2 3 4 5 ; do
sleep 1
if [ -x "$(command -v python3)" ]; then
echo -e "python3 is now available!" >> $DEBUG_PATH 2>&1
break
fi
done
fi
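Waiting briefly for python3 covers the case where the node's software environment is not fully up when the prolog starts. The same bounded retry can be written as a small reusable helper; a sketch (function name and timeout are illustrative):

```bash
# Wait up to $2 seconds for a command to appear in PATH; return 0 on success.
wait_for_command() {
  local cmd=$1 max_tries=${2:-5} i
  for ((i = 0; i < max_tries; i++)); do
    command -v "$cmd" >/dev/null 2>&1 && return 0
    sleep 1
  done
  return 1
}

if wait_for_command python3 5; then
  echo "python3 is available: $(command -v python3)"
else
  echo "Error: python3 is NOT available!"
fi
```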
##### (7) get additional job metadata from redis
PIKA_MONITORING=1
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_get_metadata_prolog_include.sh >> $DEBUG_PATH 2>&1
if [ -x "$(command -v python3)" ]; then
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_get_metadata_prolog_include.sh >> $DEBUG_PATH 2>&1
else
echo -e "Error: python3 is NOT available!" >> $DEBUG_PATH 2>&1
fi
##### (8) based on the PIKA_MONITORING value, start or stop collectd
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_collectd_prolog_include.sh >> $DEBUG_PATH 2>&1
#/opt/pika/1.0/likwid/5.0.1/bin/likwid-topology >> $DEBUG_PATH 2>&1
#export OMP_NUM_THREADS=176
#export HOME="workaroundLIKWIDbug"
#/opt/pika/1.0/likwid/5.0.1/bin/likwid-perfctr -V 3 -g pika_metrics_1 ~rdietric/examples/bash_scripts/laplace2d-taurusml >> $DEBUG_PATH 2>&1
##### (9) save job metadata
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_save_metadata_prolog_include.sh >> $DEBUG_PATH 2>&1
echo -e "\nProlog finished sucessfully!" >> $DEBUG_PATH 2>&1
echo -e "\nProlog finished successfully!" >> $DEBUG_PATH 2>&1
exit 0