Commit 46862dcd authored by fwinkler

New scripts to control collectd.

parent e7c91d83
......@@ -32,7 +32,7 @@ if [[ $PIKA_MONITORING -eq 0 || ($PIKA_MONITORING -eq -1 && ${LOCAL_JOBS_RUNNING
# make sure collectd gets killed
COLLECTD_PID=`ps -eo pid,cmd | grep -v grep | grep "$DAEMON" | awk '{print $1}'`
if [ -z "$COLLECTD_PID" ]; then
if [ -n "$COLLECTD_PID" ]; then
echo -e "\nkill -KILL $COLLECTD_PID" >> $DEBUG_PATH 2>&1
kill -KILL $COLLECTD_PID >> $DEBUG_PATH 2>&1
fi
......@@ -67,124 +67,7 @@ if [ $? == 0 ] || [ "$have_setup_lock" = true ]; then
# if collectd is not running yet
if [ "$active_procs" -eq 0 ]; then
CD_INST_PATH=$PIKA_INSTALL_PATH/collectd/$COLLECTD_VERSION
CUSTOM_TYPES_DIR=$PIKA_INSTALL_PATH/collectd/$COLLECTD_VERSION/share/collectd
COLLECTD_PYTHON_PLUGIN_PATH=$PIKA_ROOT/daemon/collectd/collectd-plugins/python
# set hostname to avoid systemcall for every metric dispatch and copy collectd.conf into /tmp
hostshort=`hostname -s`
sed "/#HostnameReplace/cHostname ${hostshort}" \
${PIKA_ROOT}/daemon/collectd/collectd_template.conf > ${COLLECTD_CONF}
# set the path and name of the collectd logfile
sed -i '/<Plugin logfile>/,/Plugin>/'" s|File.*|File \"${COLLECTD_LOGFILE}\"|" ${COLLECTD_CONF}
# use + as sed separator
sed -i -e "s+CD_INST_PATH+${CD_INST_PATH}+" ${COLLECTD_CONF}
sed -i -e "s+CUSTOM_TYPES_DIR+${CUSTOM_TYPES_DIR}+" ${COLLECTD_CONF}
# set python module path
sed -i "/ModulePath/c \ \ ModulePath \"${COLLECTD_PYTHON_PLUGIN_PATH}\"" ${COLLECTD_CONF}
# Check for lustre and, if available reset counters
LUSTRE_PATH=/proc/fs/lustre
if [ -d "${LUSTRE_PATH}" ]; then
if [ -x "$(command -v llstat)" ]; then
echo -e "\nLustre is available. Reset Lustre counters for " >> $DEBUG_PATH 2>&1
for fs in ${LUSTRE_PATH}/llite/*; do
echo -e "$fs " >> $DEBUG_PATH 2>&1
llstat -c $fs/stats
done
else
echo -e "\nLustre is available, but llstat is missing. Cannot reset Lustre counters. " >> $DEBUG_PATH 2>&1
fi
else
echo -e "Disable Lustre plugin. ${LUSTRE_PATH} not found." >> $DEBUG_PATH 2>&1
sed -i "/Import \"lustre_bw\"/,/<\/Module>/"' s/^/#/' "${COLLECTD_CONF}"
fi
# set InfluxDB access settings
sed -i "/#INFLUXHOST/c \ \ \ \ host \"${INFLUXDB_HOST}\"" ${COLLECTD_CONF}
sed -i "/#INFLUXPORT/c \ \ \ \ port \"${INFLUXDB_PORT}\"" ${COLLECTD_CONF}
sed -i "/#INFLUXUSER/c \ \ \ \ user \"${INFLUXDB_USER}\"" ${COLLECTD_CONF}
sed -i "/#INFLUXPWD/c \ \ \ \ pwd \"${INFLUXDB_PASSWORD}\"" ${COLLECTD_CONF}
sed -i "/#INFLUXDBNAME/c \ \ \ \ database \"${INFLUXDB_DATABASE}\"" ${COLLECTD_CONF}
which nvidia-smi >> $DEBUG_PATH 2>&1
# disable (comment out) the NVML plugin if nvidia-smi is not available or CUDA driver not installed
if ! [ -x "$(command -v nvidia-smi)" ] || [ "$(nvidia-smi | grep -c failed)" -gt 0 ]; then
echo -e "\nDisable GPU NVIDIA plugin (no nvidia-smi or CUDA driver)." >> $DEBUG_PATH 2>&1
sed -i "/<LoadPlugin gpu_nvidia>/,/gpu_nvidia_end/"' s/^/#/' "${COLLECTD_CONF}"
fi
# get the architecture directory of Likwid perfgroup
export LIKWID_INST_PATH=$PIKA_INSTALL_PATH/likwid/${LIKWID_VERSION}
echo -e "\nCheck for Likwid install path: $LIKWID_INST_PATH" >> $DEBUG_PATH 2>&1
arch_dir=`$LIKWID_INST_PATH/bin/likwid-perfctr -i | grep "CPU short:" | awk '{print $3}'`
#echo -e Architecture: $arch_dir >> $DEBUG_PATH 2>&1
group_count=`ls -l $LIKWID_INST_PATH/share/likwid/perfgroups/$arch_dir | grep -c "pika_metrics_"`
#echo Groups: $group_count
# determine measurement time and align second
if [ "$group_count" -gt 0 ]; then
# assuming that not more than 5 groups are measured
if [ "$group_count" -eq 1 ]; then
mtime=50
elif [ "$group_count" -eq 2 ]; then
mtime=25
elif [ "$group_count" -eq 3 ]; then
mtime=15
else
mtime=10
fi
startsecond=$((60-(group_count*mtime)))
echo -e "Set Likwid align second to $startsecond and measurement time to $mtime (arch: ${arch_dir})" >> $DEBUG_PATH 2>&1
sed -i '/<LoadPlugin likwid>/,/LoadPlugin/'" s/AlignRead.*/AlignRead $startsecond/" "${COLLECTD_CONF}"
sed -i "/Mtime/c \ \ Mtime \"$mtime\"" ${COLLECTD_CONF}
group_string=`ls $LIKWID_INST_PATH/share/likwid/perfgroups/$arch_dir | grep "pika_metrics" | tr '\n' ',' | sed 's/.txt,/,/g' | sed 's/,*$//g'`
sed -i "/Groups/c \ \ Groups \"$group_string\"" ${COLLECTD_CONF}
#expose path to likwid library (needed to load collectd likwid plugin)
export LD_LIBRARY_PATH=${LIKWID_INST_PATH}/lib:${LD_LIBRARY_PATH}
# set the socket file from PIKA configuration
sed -i '/<Plugin unixsock>/,/Plugin>/'" s|SocketFile.*|SocketFile \"${PIKA_COLLECTD_SOCKET}\"|" ${COLLECTD_CONF}
else
echo -e "Error: No PIKA group definitions for LIKWID found! Disable LIKWID and unixsock plugins." >> $DEBUG_PATH 2>&1
sed -i "/likwid/,/likwid_end/"' s/^/#/' "${COLLECTD_CONF}"
sed -i "/LoadPlugin unixsock/,/Plugin>/"' s/^/#/' "${COLLECTD_CONF}"
fi
# disallow users to read collectd config file
chmod 640 ${COLLECTD_CONF}
# start collectd
echo -e "\nStarting Collectd: $DAEMON" >> $DEBUG_PATH 2>&1
export LIKWID_PERF_PID=-1 # workaround for root access with perf to counter registers
export HOME="workaroundLIKWIDbug"
echo -e "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> $DEBUG_PATH 2>&1
$DAEMON >> $DEBUG_PATH 2>&1
# check if collectd is up and running
active_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
#echo -e "Daemon started? Active collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
if [ $active_procs -eq 0 ]; then
sleep 1
active_procs=`ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON"`
echo -e "Daemon started? Active collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
if [ $active_procs -eq 0 ]; then
echo -e "Error: Collectd could not be started!" >> $DEBUG_PATH 2>&1
fi
fi
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_start_collectd.sh >> $DEBUG_PATH 2>&1
fi
fi
......
#!/bin/bash
# Control script for the PIKA collectd daemon. It takes exactly one argument
# (purge, install, start or stop) that selects the action in the dispatch at
# the end of this file.
# Load the PIKA configuration.
# NOTE(review): presumably this file defines PIKA_ROOT, PIKA_INSTALL_PATH,
# PIKA_PACKAGE_PATH and COLLECTD_VERSION, which are read below but never
# assigned in this script - confirm against the configuration file.
source /sw/taurus/tools/pika/pika-current.conf
# Log file that pika_start creates, prints to stdout and removes again.
DEBUG_PATH=/tmp/pika_control.out
# Delete everything matching /tmp/pika_* and everything below /opt/pika.
# Outputs: nothing (rm runs with -f, so missing files are not reported)
# Returns: exit status of the second rm call
pika_clean() {
  local -a doomed

  # Temporary PIKA files first ...
  doomed=(/tmp/pika_*)
  rm -rf "${doomed[@]}"

  # ... then the contents of the installation root.
  doomed=(/opt/pika/*)
  rm -rf "${doomed[@]}"
}
# Create PIKA_INSTALL_PATH and extract the PIKA package into its parent
# directory.
# Globals:
#   PIKA_INSTALL_PATH (read) - directory that is created; the archive is
#                              unpacked into "${PIKA_INSTALL_PATH}/.."
#   PIKA_PACKAGE_PATH (read) - gzip-compressed tar archive to unpack
# Outputs: the tar command line on stdout
# Returns: exit status of tar
pika_install() {
  # All expansions are quoted so that paths containing whitespace or glob
  # characters reach mkdir and tar as a single argument each.
  mkdir -p "${PIKA_INSTALL_PATH}"
  # printf instead of "echo -e": backslashes in a path must be shown verbatim.
  printf '%s\n' "tar xzf ${PIKA_PACKAGE_PATH} -C ${PIKA_INSTALL_PATH}/.."
  tar xzf "${PIKA_PACKAGE_PATH}" -C "${PIKA_INSTALL_PATH}/.."
}
# Configure and start collectd by sourcing pika_start_collectd.sh, then print
# the debug log that run produced and remove the log file.
# Globals:
#   PIKA_ROOT, PIKA_INSTALL_PATH, COLLECTD_VERSION, DEBUG_PATH (read)
#   COLLECTD_CONF, COLLECTD_PID_FILE, DAEMON (written; left global because the
#   sourced script reads COLLECTD_CONF and DAEMON)
# Outputs: contents of DEBUG_PATH on stdout
# Returns: exit status of the final rm
pika_start() {
  # local collectd config file
  COLLECTD_CONF=/tmp/pika_collectd.conf

  # collectd call; kept as one string because the sourced script executes it
  # via an unquoted $DAEMON expansion
  COLLECTD_PID_FILE=/tmp/pika_collectd.pid
  DAEMON="${PIKA_INSTALL_PATH}/collectd/${COLLECTD_VERSION}/sbin/collectd -C ${COLLECTD_CONF} -P ${COLLECTD_PID_FILE}"

  # Paths are quoted so that a DEBUG_PATH or PIKA_ROOT containing whitespace
  # is handled as one file name instead of being split into several words.
  touch -- "$DEBUG_PATH"
  source "${PIKA_ROOT}/job_control/slurm/taurus/pika_start_collectd.sh"

  cat -- "$DEBUG_PATH"
  rm -f -- "$DEBUG_PATH"
}
# Stop collectd by sourcing pika_stop_collectd.sh from the PIKA tree.
# Globals: PIKA_ROOT (read)
# Outputs: whatever the sourced script prints
# Returns: exit status of the sourced script
pika_stop() {
  # Quoted so that a PIKA_ROOT containing whitespace is still passed to
  # "source" as a single file name.
  source "${PIKA_ROOT}/job_control/slurm/taurus/pika_stop_collectd.sh"
}
# Dispatch on the single sub-command. Every action stops a running collectd
# first; anything other than exactly one argument prints the usage text.
if [ $# -eq 1 ]; then
  case "$1" in
    stop)
      pika_stop
      ;;
    purge)
      pika_stop
      pika_clean
      ;;
    install)
      pika_stop
      pika_clean
      pika_install
      ;;
    start)
      pika_stop
      # Install on demand when the installation directory does not exist yet.
      [ -d "${PIKA_INSTALL_PATH}" ] || pika_install
      pika_start
      ;;
    *)
      echo "$1 is not supported."
      ;;
  esac
else
  echo "One argument required!"
  echo "./pika_control [purge|install|start|stop]"
fi
\ No newline at end of file
#!/bin/bash
# Builds the collectd configuration file from the PIKA template and starts
# collectd. This first part copies the template and fills in host name, log
# file, installation paths and the Python module path.
# Requires that the caller has sourced /sw/taurus/tools/pika/pika-current.conf
# and has set COLLECTD_CONF, DAEMON and DEBUG_PATH; none of them is assigned
# in this script.
#source /sw/taurus/tools/pika/pika-current.conf
#COLLECTD_CONF=/tmp/pika_collectd.conf
# Paths derived from the PIKA installation and source trees.
CD_INST_PATH=$PIKA_INSTALL_PATH/collectd/$COLLECTD_VERSION
CUSTOM_TYPES_DIR=$PIKA_INSTALL_PATH/collectd/$COLLECTD_VERSION/share/collectd
COLLECTD_PYTHON_PLUGIN_PATH=$PIKA_ROOT/daemon/collectd/collectd-plugins/python
# Set the hostname to avoid a system call for every metric dispatch. The line
# containing #HostnameReplace is replaced while the template is copied to
# COLLECTD_CONF.
hostshort=`hostname -s`
sed "/#HostnameReplace/cHostname ${hostshort}" \
${PIKA_ROOT}/daemon/collectd/collectd_template.conf > ${COLLECTD_CONF}
# Set the path and name of the collectd logfile: only "File ..." entries
# inside the <Plugin logfile> section are rewritten.
sed -i '/<Plugin logfile>/,/Plugin>/'" s|File.*|File \"${COLLECTD_LOGFILE}\"|" ${COLLECTD_CONF}
# Replace the placeholder tokens CD_INST_PATH and CUSTOM_TYPES_DIR (first
# occurrence per line). "+" is used as sed separator instead of "/".
sed -i -e "s+CD_INST_PATH+${CD_INST_PATH}+" ${COLLECTD_CONF}
sed -i -e "s+CUSTOM_TYPES_DIR+${CUSTOM_TYPES_DIR}+" ${COLLECTD_CONF}
# Set the Python module path: every line containing "ModulePath" is replaced.
sed -i "/ModulePath/c \ \ ModulePath \"${COLLECTD_PYTHON_PLUGIN_PATH}\"" ${COLLECTD_CONF}
# Lustre handling: without ${LUSTRE_PATH} the lustre_bw module section of the
# configuration is commented out; with Lustre present the counters of every
# llite entry are reset through llstat, provided llstat is executable.
LUSTRE_PATH=/proc/fs/lustre
if [ ! -d "${LUSTRE_PATH}" ]; then
  echo -e "Disable Lustre plugin. ${LUSTRE_PATH} not found." >> $DEBUG_PATH 2>&1
  sed -i "/Import \"lustre_bw\"/,/<\/Module>/"' s/^/#/' "${COLLECTD_CONF}"
elif [ ! -x "$(command -v llstat)" ]; then
  echo -e "\nLustre is available, but llstat is missing. Cannot reset Lustre counters. " >> $DEBUG_PATH 2>&1
else
  echo -e "\nLustre is available. Reset Lustre counters for " >> $DEBUG_PATH 2>&1
  for fs in ${LUSTRE_PATH}/llite/*; do
    echo -e "$fs " >> $DEBUG_PATH 2>&1
    llstat -c $fs/stats
  done
fi
# Set InfluxDB access settings. Each command replaces the whole line that
# carries the named placeholder (#INFLUXHOST, #INFLUXPORT, ...) with the
# corresponding option line.
# NOTE(review): the values are interpolated into the sed program text, so they
# are passed as command-line arguments (the password included) and characters
# that are special to sed, such as backslashes, would change the replacement
# text - confirm that the configured values are safe for this.
sed -i "/#INFLUXHOST/c \ \ \ \ host \"${INFLUXDB_HOST}\"" ${COLLECTD_CONF}
sed -i "/#INFLUXPORT/c \ \ \ \ port \"${INFLUXDB_PORT}\"" ${COLLECTD_CONF}
sed -i "/#INFLUXUSER/c \ \ \ \ user \"${INFLUXDB_USER}\"" ${COLLECTD_CONF}
sed -i "/#INFLUXPWD/c \ \ \ \ pwd \"${INFLUXDB_PASSWORD}\"" ${COLLECTD_CONF}
sed -i "/#INFLUXDBNAME/c \ \ \ \ database \"${INFLUXDB_DATABASE}\"" ${COLLECTD_CONF}
# Record in the debug log where nvidia-smi is found (or that it is missing).
which nvidia-smi >> $DEBUG_PATH 2>&1
# Comment out the gpu_nvidia plugin section when nvidia-smi is not executable
# or when its output contains "failed" (taken as: no usable CUDA driver).
if [ ! -x "$(command -v nvidia-smi)" ] || nvidia-smi | grep -q failed; then
  echo -e "\nDisable GPU NVIDIA plugin (no nvidia-smi or CUDA driver)." >> $DEBUG_PATH 2>&1
  sed -i "/<LoadPlugin gpu_nvidia>/,/gpu_nvidia_end/"' s/^/#/' "${COLLECTD_CONF}"
fi
# LIKWID: find the perfgroup directory for this CPU (the "CPU short:" value
# reported by likwid-perfctr -i) and count the PIKA group definitions in it.
export LIKWID_INST_PATH=$PIKA_INSTALL_PATH/likwid/${LIKWID_VERSION}
echo -e "\nCheck for Likwid install path: $LIKWID_INST_PATH" >> $DEBUG_PATH 2>&1
arch_dir=$($LIKWID_INST_PATH/bin/likwid-perfctr -i | grep "CPU short:" | awk '{print $3}')
group_count=$(ls -l $LIKWID_INST_PATH/share/likwid/perfgroups/$arch_dir | grep -c "pika_metrics_")
if [ "$group_count" -gt 0 ]; then
  # Measurement time per group; assumes that not more than 5 groups are
  # measured, so that all groups fit into one minute.
  case "$group_count" in
    1) mtime=50 ;;
    2) mtime=25 ;;
    3) mtime=15 ;;
    *) mtime=10 ;;
  esac
  # Second of the minute at which reading starts.
  startsecond=$((60-(group_count*mtime)))
  echo -e "Set Likwid align second to $startsecond and measurement time to $mtime (arch: ${arch_dir})" >> $DEBUG_PATH 2>&1
  sed -i '/<LoadPlugin likwid>/,/LoadPlugin/'" s/AlignRead.*/AlignRead $startsecond/" "${COLLECTD_CONF}"
  sed -i "/Mtime/c \ \ Mtime \"$mtime\"" ${COLLECTD_CONF}
  # Comma-separated group names: file names with the .txt suffix stripped.
  group_string=$(ls $LIKWID_INST_PATH/share/likwid/perfgroups/$arch_dir | grep "pika_metrics" | tr '\n' ',' | sed 's/.txt,/,/g' | sed 's/,*$//g')
  sed -i "/Groups/c \ \ Groups \"$group_string\"" ${COLLECTD_CONF}
  # Expose the LIKWID library path (needed to load the collectd likwid plugin).
  export LD_LIBRARY_PATH=${LIKWID_INST_PATH}/lib:${LD_LIBRARY_PATH}
  # Take the socket file from the PIKA configuration.
  sed -i '/<Plugin unixsock>/,/Plugin>/'" s|SocketFile.*|SocketFile \"${PIKA_COLLECTD_SOCKET}\"|" ${COLLECTD_CONF}
else
  # No group definitions: comment out the likwid and unixsock plugin sections.
  echo -e "Error: No PIKA group definitions for LIKWID found! Disable LIKWID and unixsock plugins." >> $DEBUG_PATH 2>&1
  sed -i "/likwid/,/likwid_end/"' s/^/#/' "${COLLECTD_CONF}"
  sed -i "/LoadPlugin unixsock/,/Plugin>/"' s/^/#/' "${COLLECTD_CONF}"
fi
# Keep other users from reading the generated configuration file.
chmod 640 ${COLLECTD_CONF}

# Launch collectd; its output goes to the debug log.
echo -e "\nStarting Collectd: $DAEMON" >> $DEBUG_PATH 2>&1
# Workaround for root access with perf to counter registers.
export LIKWID_PERF_PID=-1
export HOME="workaroundLIKWIDbug"
echo -e "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> $DEBUG_PATH 2>&1
$DAEMON >> $DEBUG_PATH 2>&1

# Verify that a process with the daemon's command line exists; if none is
# found, look once more after one second before reporting a failure.
active_procs=$(ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON")
if [ $active_procs -eq 0 ]; then
  sleep 1
  active_procs=$(ps -eo pid,cmd | grep -v grep | grep -c "$DAEMON")
  echo -e "Daemon started? Active collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
  if [ $active_procs -eq 0 ]; then
    echo -e "Error: Collectd could not be started!" >> $DEBUG_PATH 2>&1
  fi
fi
#!/bin/bash
# Stop the PIKA collectd daemon: send SIGTERM, wait for the process to
# disappear and send SIGKILL if it is still present after the waiting period.

# Print the PIDs (one per line) of all processes whose command line contains
# "pika_collectd", excluding the grep processes of this pipeline.
_pika_collectd_pids() {
  ps -eo pid,cmd | grep -v grep | grep "pika_collectd" | awk '{print $1}'
}

# Terminate collectd, escalating from SIGTERM to SIGKILL.
# Arguments: $1 - seconds to wait after SIGTERM before SIGKILL (default: 50)
# Globals:   COLLECTD_PID (written) - PIDs found by the most recent lookup
# Outputs:   progress messages on stdout
# Returns:   0 if nothing had to be force-killed, otherwise the status of
#            the final kill
_pika_stop_collectd() {
  local max_wait=${1:-50}
  local waited=0

  COLLECTD_PID=$(_pika_collectd_pids)
  if [ -z "$COLLECTD_PID" ]; then
    echo "collectd is not running."
    return 0
  fi

  echo "kill -TERM $COLLECTD_PID"
  # Unquoted on purpose: several PIDs may have been found, one per line.
  kill -TERM $COLLECTD_PID # flushes metric buffer? send extra signal?

  # Poll once per second until the process is gone or the time is up.
  while [ "$waited" -lt "$max_wait" ]; do
    COLLECTD_PID=$(_pika_collectd_pids)
    if [ -z "$COLLECTD_PID" ]; then
      break
    fi
    sleep 1
    waited=$((waited + 1))
  done

  # Make sure collectd gets killed: SIGKILL is sent only when a PID is still
  # present (-n); with an empty PID list there is nothing to kill.
  COLLECTD_PID=$(_pika_collectd_pids)
  if [ -n "$COLLECTD_PID" ]; then
    echo "kill -KILL $COLLECTD_PID"
    kill -KILL $COLLECTD_PID
  fi
}

_pika_stop_collectd
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment