Commit e6b4c87d authored by Frank Winkler
Browse files

Modified prolog scripts.

parent 9278e106
#!/bin/bash
export PIKA_ROOT=/root/packages/pika-control
export PIKA_ROOT=/etc/pika/pika-control
### global configuration
......@@ -8,8 +8,5 @@ export PIKA_ROOT=/root/packages/pika-control
export PIKA_DEBUG=1
export PIKA_LOGPATH=/tmp
# install path of PIKA package (on compute node), root required
export PIKA_INSTALL_PATH=
# access parameters for databases
source ${PIKA_ROOT}/pika_access
#!/bin/bash
# (1) Get PIKA environment variables
source /root/packages/pika-control/pika.conf
source /etc/pika/pika-control/pika.conf
PIKA_JOB_ID=${SLURM_JOB_ID}
PIKA_JOB_NODELIST=${SLURM_NODELIST}
......
......@@ -2,7 +2,7 @@
####################################
# (1) Get PIKA environment variables
source /root/packages/pika-control/pika.conf
source /etc/pika/pika-control/pika.conf
# if SLURM_JOB_ID is of length zero
if [[ -z "${SLURM_JOB_ID}" ]]; then
......@@ -52,7 +52,22 @@ echo "PIKA_HOSTNAME=$PIKA_HOSTNAME" >> $DEBUG_PATH 2>&1
####################################
# (4) Start/stop collectd
# source ${PIKA_ROOT}/slurm/utils/... >> $DEBUG_PATH 2>&1
# Bring the collectd service in line with the monitoring request of the job.
# SPANK_PIKA_MONITORING == "1": make sure collectd is running.
# SPANK_PIKA_MONITORING == "0": make sure collectd is stopped.
# Any other value leaves the service untouched.
case "${SPANK_PIKA_MONITORING}" in
  1)
    echo "PIKA monitoring enabled" >> $DEBUG_PATH 2>&1
    if systemctl -q is-active collectd; then
      echo "collectd is already running." >> $DEBUG_PATH 2>&1
    else
      # log the command before executing it
      echo "systemctl start collectd" >> $DEBUG_PATH 2>&1
      systemctl start collectd >> $DEBUG_PATH 2>&1
    fi
    ;;
  0)
    echo "PIKA monitoring disabled" >> $DEBUG_PATH 2>&1
    # only stop the service if it is currently active
    if systemctl -q is-active collectd; then
      echo "systemctl stop collectd" >> $DEBUG_PATH 2>&1
      systemctl stop collectd >> $DEBUG_PATH 2>&1
    fi
    ;;
esac
# (5) Send job metadata to MariaDB
source ${PIKA_ROOT}/slurm/utils/pika_save_metadata_prolog_include.sh >> $DEBUG_PATH 2>&1
......
......@@ -41,8 +41,8 @@ if [ "${PIKA_HOSTNAME}" = "${MASTER_NODE}" ]; then
# create sql statement
SQL_QUERY="INSERT INTO Job_Data "
SQL_QUERY+="(JID,USER,PROJECT,STATUS,NUM_NODES,NODELIST,CPULIST,NUM_CORES,SUBMIT,START,NAME,WALLTIME,P_PARTITION,EXCLUSIVE,ARRAY_ID) "
SQL_QUERY+="VALUES ('${PIKA_JOB_ID}','${PIKA_JOB_USER}','${PIKA_JOB_ACCOUNT}','running','${JOB_NUM_NODES}','${PIKA_JOB_NODELIST}','${PIKA_JOB_CPUS_ALLOCATED}','${PIKA_JOB_NUM_CORES}','${PIKA_JOB_SUBMIT}','${PIKA_JOB_START}','${PIKA_JOB_NAME}','${PIKA_JOB_WALLTIME}','${PIKA_JOB_PARTITION}','${PIKA_JOB_EXCLUSIVE}','${JOB_ARRAY_ID}')"
SQL_QUERY+="(JID,USER,PROJECT,STATUS,NUM_NODES,NODELIST,CPULIST,NUM_CORES,SUBMIT,START,NAME,WALLTIME,P_PARTITION,EXCLUSIVE,PROPERTY_ID,ARRAY_ID) "
SQL_QUERY+="VALUES ('${PIKA_JOB_ID}','${PIKA_JOB_USER}','${PIKA_JOB_ACCOUNT}','running','${JOB_NUM_NODES}','${PIKA_JOB_NODELIST}','${PIKA_JOB_CPUS_ALLOCATED}','${PIKA_JOB_NUM_CORES}','${PIKA_JOB_SUBMIT}','${PIKA_JOB_START}','${PIKA_JOB_NAME}','${PIKA_JOB_WALLTIME}','${PIKA_JOB_PARTITION}','${PIKA_JOB_EXCLUSIVE}','${SPANK_PIKA_MONITORING}','${JOB_ARRAY_ID}')"
# check if mysql is installed
MYSQL_CHECK=`command -v mysql`
......
......@@ -11,6 +11,10 @@ PIKA_JOB_ARRAY_ID='None'
# Query SLURM once for the detailed job record (scontrol show job <id> -d);
# the remaining job attributes are extracted from this text.
slurm_data=$(scontrol show job ${SLURM_JOB_ID} -d)

# Print the first whitespace-delimited token that follows the first
# occurrence of the marker "$1" (e.g. 'JobName=') in slurm_data.
# slurm_data is expanded unquoted, so word splitting collapses the
# multi-line scontrol output into a single line before awk sees it.
slurm_field() {
  echo $slurm_data | awk -F "$1" '{print $2}' | awk '{print $1}'
}

# the array job ID is only extracted when scontrol reports one
if [[ $slurm_data == *"ArrayJobId="* ]]; then
  PIKA_JOB_ARRAY_ID=$(slurm_field 'ArrayJobId=')
fi

PIKA_JOB_NAME=$(slurm_field 'JobName=')
PIKA_JOB_STATUS=$(slurm_field 'JobState=')
PIKA_JOB_ACCOUNT=$(slurm_field 'Account=')
......
#!/bin/bash
# Generates the collectd configuration file ${PIKA_COLLECTD_CONF} from the
# template ${PIKA_COLLECTD_CONF_TEMPLATE} and fills in host name, log file,
# install paths and the Python plugin path.
#
# Reads: PIKA_INSTALL_PATH, COLLECTD_VERSION, PIKA_ROOT,
#        PIKA_COLLECTD_CONF_TEMPLATE, PIKA_COLLECTD_CONF,
#        PIKA_COLLECTD_LOGFILE, DEBUG_PATH
#requires a source of /sw/taurus/tools/pika/pika-current.conf
#source /sw/taurus/tools/pika/pika-current.conf

CD_INST_PATH=${PIKA_INSTALL_PATH}/collectd/${COLLECTD_VERSION}
CUSTOM_TYPES_DIR=${CD_INST_PATH}/share/collectd
COLLECTD_PYTHON_PLUGIN_PATH=${PIKA_ROOT}/daemon/collectd/collectd-plugins/python

# collectd template and target file have to be set, exit otherwise
# (exit status stays 0, only the debug log records the problem)
if [[ -z "${PIKA_COLLECTD_CONF_TEMPLATE}" || -z "${PIKA_COLLECTD_CONF}" ]]; then
  echo -e "PIKA collectd configuration template or target file is not set! PIKA_COLLECTD_CONF_TEMPLATE=${PIKA_COLLECTD_CONF_TEMPLATE};PIKA_COLLECTD_CONF=${PIKA_COLLECTD_CONF}" >> "${DEBUG_PATH}" 2>&1
  exit 0
fi

# set hostname to avoid systemcall for every metric dispatch and write the
# resulting configuration to the target file
hostshort=$(hostname -s)
sed "/#HostnameReplace/cHostname ${hostshort}" \
  "${PIKA_COLLECTD_CONF_TEMPLATE}" > "${PIKA_COLLECTD_CONF}"

# Restrict access right away: database credentials are written into this
# file further below, and it must not be world-readable in the meantime.
# (GNU sed -i keeps the file mode of the edited file.)
chmod 640 "${PIKA_COLLECTD_CONF}"

# set the path and name of the collectd logfile
sed -i '/<Plugin logfile>/,/Plugin>/'" s|File.*|File \"${PIKA_COLLECTD_LOGFILE}\"|" "${PIKA_COLLECTD_CONF}"

# use + as sed separator (the replacement values are paths containing '/')
sed -i -e "s+CD_INST_PATH+${CD_INST_PATH}+" "${PIKA_COLLECTD_CONF}"
sed -i -e "s+CUSTOM_TYPES_DIR+${CUSTOM_TYPES_DIR}+" "${PIKA_COLLECTD_CONF}"

# set python module path
sed -i "/ModulePath/c \ \ ModulePath \"${COLLECTD_PYTHON_PLUGIN_PATH}\"" "${PIKA_COLLECTD_CONF}"
# Check for Lustre stats directories. If none of the known locations exists,
# the lustre_bw Python module section of the collectd configuration is
# commented out.
lustre_paths=(/proc/fs/lustre /sys/kernel/debug/lustre)
lustre_avail=false
for lustre_path in "${lustre_paths[@]}"; do
  if [[ -d "${lustre_path}" ]]; then
    echo -e "\nLustre stats directory: ${lustre_path}" >> "${DEBUG_PATH}" 2>&1
    # Resetting the Lustre counters is currently disabled:
    #if [ -x "$(command -v llstat)" ]; then
    #  echo -e "\nReset Clear stats file " >> $DEBUG_PATH 2>&1
    #  for fs in ${lustre_path}/llite/*; do
    #    echo -e "$fs " >> $DEBUG_PATH 2>&1
    #    #llstat -c $fs/stats
    #  done
    #else
    #  echo -e "\nLustre is available, but llstat is missing. Cannot reset Lustre counters. " >> $DEBUG_PATH 2>&1
    #fi
    lustre_avail=true
  fi
done

if [[ "${lustre_avail}" == false ]]; then
  # name the locations that were actually probed
  echo -e "Disable Lustre plugin. ${lustre_paths[*]} not found." >> "${DEBUG_PATH}" 2>&1
  # prefix every line from the lustre_bw import up to </Module> with '#'
  sed -i "/Import \"lustre_bw\"/,/<\/Module>/"' s/^/#/' "${PIKA_COLLECTD_CONF}"
fi
# set InfluxDB access settings: every placeholder line (#INFLUX...) is
# replaced as a whole by the corresponding key/value line
# NOTE(review): the values are interpolated verbatim into the sed script; a
# backslash or double quote in e.g. the password would not survive intact.
# Confirm that the credentials never contain such characters.
sed -i "/#INFLUXHOST/c \ \ \ \ host \"${INFLUXDB_HOST}\"" "${PIKA_COLLECTD_CONF}"
sed -i "/#INFLUXPORT/c \ \ \ \ port \"${INFLUXDB_PORT}\"" "${PIKA_COLLECTD_CONF}"
sed -i "/#INFLUXUSER/c \ \ \ \ user \"${INFLUXDB_USER}\"" "${PIKA_COLLECTD_CONF}"
sed -i "/#INFLUXPWD/c \ \ \ \ pwd \"${INFLUXDB_PASSWORD}\"" "${PIKA_COLLECTD_CONF}"
sed -i "/#INFLUXDBNAME/c \ \ \ \ database \"${INFLUXDB_DATABASE}\"" "${PIKA_COLLECTD_CONF}"

# set the batch size inside the influx_write module section
sed -i '/<Module influx_write>/,/Module>/'" s|batch_size.*|batch_size ${PIKA_COLLECTD_BATCH_SIZE}|" "${PIKA_COLLECTD_CONF}"

# log the location of nvidia-smi (nothing is logged if it is not installed)
command -v nvidia-smi >> "${DEBUG_PATH}" 2>&1

# disable (comment out) the NVML plugin if nvidia-smi is not available or CUDA driver not installed
# (a nvidia-smi output containing "failed" is treated as a missing driver)
if ! [ -x "$(command -v nvidia-smi)" ] || [ "$(nvidia-smi | grep -c failed)" -gt 0 ]; then
  echo -e "\nDisable GPU NVIDIA plugin (no nvidia-smi or CUDA driver)." >> "${DEBUG_PATH}" 2>&1
  sed -i "/<LoadPlugin gpu_nvidia>/,/gpu_nvidia_end/"' s/^/#/' "${PIKA_COLLECTD_CONF}"
fi
# Configure the collectd LIKWID plugin.
# Reads: PIKA_INSTALL_PATH, LIKWID_VERSION, PIKA_VERSION, PIKA_COLLECTD_CONF,
#        PIKA_COLLECTD_SOCKET, DEBUG_PATH
# Exports: LIKWID_INST_PATH, LD_LIBRARY_PATH (only if PIKA groups were found)

# get the architecture directory of Likwid perfgroup
export LIKWID_INST_PATH=$PIKA_INSTALL_PATH/likwid/${LIKWID_VERSION}
echo -e "\nCheck for Likwid install path: $LIKWID_INST_PATH" >> "${DEBUG_PATH}" 2>&1
arch_dir=$("${LIKWID_INST_PATH}/bin/likwid-perfctr" -i | grep "CPU short:" | awk '{print $3}')
#echo -e Architecture: $arch_dir >> $DEBUG_PATH 2>&1

# Collect the PIKA performance groups of this architecture once, so that the
# group count used for the timing and the configured group list always agree.
# Group names are the file names without the .txt suffix.
perfgroup_dir="${LIKWID_INST_PATH}/share/likwid/perfgroups/${arch_dir}"
group_names=()
for group_file in "${perfgroup_dir}"/*pika_metrics_*; do
  # an unmatched glob stays literal, skip it
  [[ -e "${group_file}" ]] || continue
  group_name=${group_file##*/}
  group_names+=("${group_name%.txt}")
done
group_count=${#group_names[@]}
#echo Groups: $group_count

# determine measurement time and align offset
if (( group_count > 0 )); then
  ##############################################################################
  # Hard-coded for a sampling interval of 60s for the collectd LIKWID plugin!
  interval=60 # TODO: use awk to extract it from ${PIKA_COLLECTD_CONF_TEMPLATE}

  # total measurement time has to be smaller than interval
  mtime=$((interval/group_count-1))

  # use a read offset to get round timestamps
  # (LIKWID plugin takes the timestamp after reading all groups)
  readoffset=$((interval-(group_count*mtime)))
  ##############################################################################

  echo -e "Set Likwid read offset to $readoffset and measurement time to $mtime (arch: ${arch_dir})" >> "${DEBUG_PATH}" 2>&1

  # PIKA 1.1 used a different AlignRead option
  if [[ "${PIKA_VERSION}" == "1.1" ]]; then
    sed -i '/<LoadPlugin likwid>/,/LoadPlugin/'" s/AlignRead.*/AlignRead $readoffset/" "${PIKA_COLLECTD_CONF}"
  else
    sed -i '/<LoadPlugin likwid>/,/LoadPlugin/'" s/AlignReadOffset.*/AlignReadOffset $readoffset/" "${PIKA_COLLECTD_CONF}"
  fi

  # set the measurement time option
  sed -i "/Mtime/c \ \ Mtime \"$mtime\"" "${PIKA_COLLECTD_CONF}"

  # set the comma-separated list of performance groups
  group_string=$(IFS=,; printf '%s' "${group_names[*]}")
  sed -i "/Groups/c \ \ Groups \"$group_string\"" "${PIKA_COLLECTD_CONF}"

  #expose path to likwid library (needed to load collectd likwid plugin)
  export LD_LIBRARY_PATH=${LIKWID_INST_PATH}/lib:${LD_LIBRARY_PATH}

  # set the socket file from PIKA configuration
  sed -i '/<Plugin unixsock>/,/Plugin>/'" s|SocketFile.*|SocketFile \"${PIKA_COLLECTD_SOCKET}\"|" "${PIKA_COLLECTD_CONF}"
else
  echo -e "Error: No PIKA group definitions for LIKWID found! Disable LIKWID and unixsock plugins." >> "${DEBUG_PATH}" 2>&1
  # comment out both plugin sections
  sed -i "/LoadPlugin likwid/,/\/Plugin>/"' s/^/#/' "${PIKA_COLLECTD_CONF}"
  sed -i "/LoadPlugin unixsock/,/Plugin>/"' s/^/#/' "${PIKA_COLLECTD_CONF}"
fi
# disallow users to read collectd config file
chmod 640 "${PIKA_COLLECTD_CONF}"

# start collectd
echo -e "\nStarting Collectd: $DAEMON" >> "${DEBUG_PATH}" 2>&1
export LIKWID_PERF_PID=-1 # workaround for root access with perf to counter registers
export HOME="workaroundLIKWIDbug"
echo -e "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> "${DEBUG_PATH}" 2>&1

# add Likwid bin path to PATH (required by InfluxDB write plugin, which calls likwid-topology)
export PATH=${LIKWID_INST_PATH}/bin:${PATH}

# DAEMON is expanded unquoted so that a value containing arguments is split
# into command and arguments. NOTE(review): DAEMON is defined outside this
# section; confirm its format.
$DAEMON >> "${DEBUG_PATH}" 2>&1

# Print the number of running processes whose command line contains the
# DAEMON string (matched as a fixed string, not as a regular expression).
count_daemon_procs() {
  ps -eo pid,cmd | grep -v grep | grep -cF -- "${DAEMON}"
}

# check if collectd is up and running; give it one second before reporting
# an error
active_procs=$(count_daemon_procs)
#echo -e "Daemon started? Active collectd processes: ${active_procs}" >> $DEBUG_PATH 2>&1
if [[ "${active_procs:-0}" -eq 0 ]]; then
  sleep 1
  active_procs=$(count_daemon_procs)
  echo -e "Daemon started? Active collectd processes: ${active_procs}" >> "${DEBUG_PATH}" 2>&1
  if [[ "${active_procs:-0}" -eq 0 ]]; then
    echo -e "Error: Collectd could not be started!" >> "${DEBUG_PATH}" 2>&1
  fi
fi
#!/bin/bash
# Stops the PIKA collectd daemon: sends SIGTERM, waits up to 50 seconds for
# the process to disappear and sends SIGKILL if it is still present afterwards.

# Print the PIDs of all processes whose command line contains "pika_collectd".
find_collectd_pids() {
  ps -eo pid,cmd | grep -v grep | grep "pika_collectd" | awk '{print $1}'
}

COLLECTD_PID=$(find_collectd_pids)

if [ -n "$COLLECTD_PID" ]; then
  echo "kill -TERM $COLLECTD_PID"
  # unquoted on purpose: several PIDs are passed as separate arguments
  kill -TERM $COLLECTD_PID # flushes metric buffer? send extra signal?

  # poll once per second until the process is gone (at most 50 times)
  for ((wsecs = 0; wsecs < 50; wsecs++)); do
    COLLECTD_PID=$(find_collectd_pids)
    [ -z "$COLLECTD_PID" ] && break
    sleep 1
  done

  # make sure collectd gets killed
  COLLECTD_PID=$(find_collectd_pids)
  if [ -n "$COLLECTD_PID" ]; then
    echo "kill -KILL $COLLECTD_PID"
    kill -KILL $COLLECTD_PID
  fi
else
  echo "collectd is not running."
fi
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment