...
 
...@@ -125,7 +125,8 @@ LoadPlugin logfile ...@@ -125,7 +125,8 @@ LoadPlugin logfile
</Module> </Module>
Import "lustre_bw" Import "lustre_bw"
<Module lustre_bw> <Module lustre_bw>
#path "Path to lustre file systems (comma separated)" #path "Path to lustre instance paths (comma separated)"
fsname_and_mount "*:/ws" # for all file systems AND mount points that end with '/ws'
recheck_limit 1440 recheck_limit 1440
</Module> </Module>
</Plugin> </Plugin>
......
...@@ -22,19 +22,25 @@ sed -i -e "s+CUSTOM_TYPES_DIR+${CUSTOM_TYPES_DIR}+" ${PIKA_COLLECTD_CONF} ...@@ -22,19 +22,25 @@ sed -i -e "s+CUSTOM_TYPES_DIR+${CUSTOM_TYPES_DIR}+" ${PIKA_COLLECTD_CONF}
# set python module path # set python module path
sed -i "/ModulePath/c \ \ ModulePath \"${COLLECTD_PYTHON_PLUGIN_PATH}\"" ${PIKA_COLLECTD_CONF} sed -i "/ModulePath/c \ \ ModulePath \"${COLLECTD_PYTHON_PLUGIN_PATH}\"" ${PIKA_COLLECTD_CONF}
# Check for lustre and, if available reset counters # Check for Lustre paths and, if available, clean stats files
LUSTRE_PATH=/proc/fs/lustre lustre_avail=false
if [ -d "${LUSTRE_PATH}" ]; then for lustre_path in /proc/fs/lustre /sys/kernel/debug/lustre; do
if [ -x "$(command -v llstat)" ]; then if [ -d "${lustre_path}" ]; then
echo -e "\nLustre is available. Reset Lustre counters for " >> $DEBUG_PATH 2>&1 echo -e "\nLustre stats directory: ${lustre_path}" >> $DEBUG_PATH 2>&1
for fs in ${LUSTRE_PATH}/llite/*; do #if [ -x "$(command -v llstat)" ]; then
echo -e "$fs " >> $DEBUG_PATH 2>&1 # echo -e "\nClear stats file " >> $DEBUG_PATH 2>&1
llstat -c $fs/stats # for fs in ${lustre_path}/llite/*; do
done # echo -e "$fs " >> $DEBUG_PATH 2>&1
else # #llstat -c $fs/stats
echo -e "\nLustre is available, but llstat is missing. Cannot reset Lustre counters. " >> $DEBUG_PATH 2>&1 # done
#else
# echo -e "\nLustre is available, but llstat is missing. Cannot reset Lustre counters. " >> $DEBUG_PATH 2>&1
#fi
lustre_avail=true
fi fi
else done
if [ "$lustre_avail" = false ]; then
echo -e "Disable Lustre plugin. ${LUSTRE_PATH} not found." >> $DEBUG_PATH 2>&1 echo -e "Disable Lustre plugin. No Lustre stats directory found." >> $DEBUG_PATH 2>&1
sed -i "/Import \"lustre_bw\"/,/<\/Module>/"' s/^/#/' "${PIKA_COLLECTD_CONF}" sed -i "/Import \"lustre_bw\"/,/<\/Module>/"' s/^/#/' "${PIKA_COLLECTD_CONF}"
fi fi
...@@ -45,6 +51,7 @@ sed -i "/#INFLUXPORT/c \ \ \ \ port \"${INFLUXDB_PORT}\"" ${PIKA_COLLECTD_CONF} ...@@ -45,6 +51,7 @@ sed -i "/#INFLUXPORT/c \ \ \ \ port \"${INFLUXDB_PORT}\"" ${PIKA_COLLECTD_CONF}
sed -i "/#INFLUXUSER/c \ \ \ \ user \"${INFLUXDB_USER}\"" ${PIKA_COLLECTD_CONF} sed -i "/#INFLUXUSER/c \ \ \ \ user \"${INFLUXDB_USER}\"" ${PIKA_COLLECTD_CONF}
sed -i "/#INFLUXPWD/c \ \ \ \ pwd \"${INFLUXDB_PASSWORD}\"" ${PIKA_COLLECTD_CONF} sed -i "/#INFLUXPWD/c \ \ \ \ pwd \"${INFLUXDB_PASSWORD}\"" ${PIKA_COLLECTD_CONF}
sed -i "/#INFLUXDBNAME/c \ \ \ \ database \"${INFLUXDB_DATABASE}\"" ${PIKA_COLLECTD_CONF} sed -i "/#INFLUXDBNAME/c \ \ \ \ database \"${INFLUXDB_DATABASE}\"" ${PIKA_COLLECTD_CONF}
sed -i '/<Module influx_write>/,/Module>/'" s|batch_size.*|batch_size ${PIKA_COLLECTD_BATCH_SIZE}|" ${PIKA_COLLECTD_CONF}
which nvidia-smi >> $DEBUG_PATH 2>&1 which nvidia-smi >> $DEBUG_PATH 2>&1
......
...@@ -22,7 +22,7 @@ else ...@@ -22,7 +22,7 @@ else
# make sure collectd gets killed # make sure collectd gets killed
COLLECTD_PID=`ps -eo pid,cmd | grep -v grep | grep "pika_collectd" | awk '{print $1}'` COLLECTD_PID=`ps -eo pid,cmd | grep -v grep | grep "pika_collectd" | awk '{print $1}'`
if [ -z "$COLLECTD_PID" ]; then if [ -n "$COLLECTD_PID" ]; then
echo "kill -KILL $COLLECTD_PID" echo "kill -KILL $COLLECTD_PID"
kill -KILL $COLLECTD_PID kill -KILL $COLLECTD_PID
fi fi
......
...@@ -86,15 +86,16 @@ BATCHSYSTEM_ENV_FILE=${LOCAL_STORE}/pika_batchsystem_env_${PIKA_JOB_ID} ...@@ -86,15 +86,16 @@ BATCHSYSTEM_ENV_FILE=${LOCAL_STORE}/pika_batchsystem_env_${PIKA_JOB_ID}
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_slurm_env.sh >> $DEBUG_PATH 2>&1 source ${PIKA_ROOT}/job_control/slurm/taurus/pika_slurm_env.sh >> $DEBUG_PATH 2>&1
# check for Python installation # check for Python installation
echo -e "\nCheck python3 path:" >> $DEBUG_PATH 2>&1 echo -e "\nCheck PIKA python3:" >> $DEBUG_PATH 2>&1
if [ -x "$(command -v python3)" ]; then pika_python_bin=${PYTHON_ROOT}/bin/python3
which python3 >> $DEBUG_PATH 2>&1 if [ -x "$(command -v ${pika_python_bin})" ]; then
echo -e Using ${pika_python_bin} >> $DEBUG_PATH 2>&1
else else
# sleep until we have a python (at most 5 seconds) # sleep until we have a python (at most 5 seconds)
for i in 1 2 3 4 5 ; do for i in 1 2 3 4 5 ; do
sleep 1 sleep 1
if [ -x "$(command -v python3)" ]; then if [ -x "$(command -v ${pika_python_bin})" ]; then
echo -e "python3 is now available!" >> $DEBUG_PATH 2>&1 echo -e "PIKA python3 is now available (${pika_python_bin})" >> $DEBUG_PATH 2>&1
break break
fi fi
done done
...@@ -102,10 +103,11 @@ fi ...@@ -102,10 +103,11 @@ fi
##### (7) get additional job metadata from redis ##### (7) get additional job metadata from redis
PIKA_MONITORING=1 PIKA_MONITORING=1
if [ -x "$(command -v python3)" ]; then if [ -x "$(command -v ${pika_python_bin})" ]; then
which python3 >> $DEBUG_PATH 2>&1
source ${PIKA_ROOT}/job_control/slurm/taurus/pika_get_metadata_prolog_include.sh >> $DEBUG_PATH 2>&1 source ${PIKA_ROOT}/job_control/slurm/taurus/pika_get_metadata_prolog_include.sh >> $DEBUG_PATH 2>&1
else else
echo -e "Error: python3 is NOT available!" >> $DEBUG_PATH 2>&1 echo -e "Error: PIKA python3 is NOT available!" >> $DEBUG_PATH 2>&1
fi fi
##### (8) based on the PIKA_MONITORING value, start or stop collectd ##### (8) based on the PIKA_MONITORING value, start or stop collectd
......