Commit 69b5f295 authored by Robert Dietrich's avatar Robert Dietrich

added overhead test

parent 7c934fee
# Monitoring Overhead
We use five benchmarks from [Likwid Bench](https://github.com/RRZE-HPC/likwid/wiki/Likwid-Bench) to determine the monitoring overhead on the compute nodes: peakflops_avx, triad_avx_fma, load_avx, store_avx and stream_avx_fma. The version (avx, fma) with the highest bandwidth or FLOPS rate was selected. The benchmark iteration counts are chosen so that each benchmark runs for about ten minutes.
The [run.sh](run.sh) script performs ten passes. Each pass executes all benchmarks for the following monitoring setups:
* no monitoring (no PIKA collectd active)
* PIKA collectd with 60 seconds interval
* PIKA collectd with 10 seconds interval
* PIKA collectd with 1 second interval
* PIKA collectd with 1 second interval, but without LIKWID plugin
The results are written into a csv file. The [collectd](collectd) folder contains the respective collectd configuration files and the [start_collectd.sh](collectd/start_collectd.sh) script that is used by [run.sh](run.sh).
The [collectd](collectd) folder also provides the python script [delete_series.py](collectd/delete_series.py), which can be used to delete all data from the database for a specific host.
\ No newline at end of file
#!/usr/bin/env python
# Delete InfluxDB series for given hosts (see __main__ below).
import os
import sys

# influxdb is an optional third-party dependency; fall back to None so a
# missing package surfaces later as a connection failure, not an ImportError.
try:
    from influxdb.client import InfluxDBClient
except ImportError:
    InfluxDBClient = None

# Module-level connection state, set by _connect() and cleared by _close().
influx = None
database = None
"""
Connect to the InfluxDB server
"""
def _connect():
try:
global influx
global database
host = os.environ["INFLUXDB_HOST"]
port = os.environ["INFLUXDB_PORT"]
user = os.environ["INFLUXDB_USER"]
password = os.environ["INFLUXDB_PASSWORD"]
database = os.environ["INFLUXDB_DATABASE"]
influx = InfluxDBClient(host=host, port=port, username=user,
password=password, database=database)
print("InfluxDB: established connection to %s:%s/%s." % (host, port, database) )
except Exception as ex:
# Log Error
print("InfluxDB: failed to connect to %s:%s/%s. (%s:%s) - %s" % (host, port, database, username, password, ex) )
_close()
"""
Close the socket = do nothing for influx which is http stateless
"""
def _close():
global influx
influx = None
if __name__ == '__main__':
    # Expect exactly one argument: a comma-separated list of host names.
    if len(sys.argv) == 2:
        host_list = sys.argv[1].split(",")
    else:
        sys.exit("Only one argument, a comma separated list of host names is allowed")
    _connect()
    # _connect() swallows connection errors and leaves influx as None; abort
    # with a clear message instead of crashing on the attribute access below.
    if influx is None:
        sys.exit("InfluxDB: no connection available, aborting")
    measurements = influx.get_list_measurements()
    for host in host_list:
        tag = {"hostname": host}
        for mdict in measurements:
            measurement = mdict['name']
            print("Delete from %s where hostname='%s'" % (measurement, host))
            # Remove every series of this measurement tagged with the host.
            influx.delete_series(database, measurement, tag)
# Environment setup for the PIKA monitoring stack used by the overhead test.
export PIKA_VERSION=1.1
# Python interpreter version shipped with this PIKA installation
export PYTHON_VERSION=3.7.7
export PIKA_ROOT=/sw/taurus/tools/pika
export PIKA_INSTALL_PATH=${PIKA_ROOT}/sw/${PIKA_VERSION}
export PYTHON_ROOT=${PIKA_INSTALL_PATH}/python/${PYTHON_VERSION}
# Point the bundled Python at its own stdlib and put it first in PATH;
# NOTE(review): PYTHONHOME affects every python started from this shell.
export PYTHONHOME=${PYTHON_ROOT}
export PATH=$PYTHON_ROOT/bin:$PATH
export LD_LIBRARY_PATH=${PYTHON_ROOT}/lib:${PYTHON_ROOT}/lib/python3.7:${LD_LIBRARY_PATH}
# Load site-specific access settings (file contents not visible here)
source /sw/taurus/tools/pika/.pika_access
#
# Config file for collectd(1).
# Please read collectd.conf(5) for a list of options.
# http://collectd.org/
#
##############################################################################
# Global settings for the daemon. #
##############################################################################
# Report all metrics under the fixed name of the test node.
Hostname taurusi5396
TypesDB "/opt/pika/1.1/collectd/5.10.0/share/collectd/types.db"
TypesDB "/opt/pika/1.1/collectd/5.10.0/share/collectd/custom_types.db"
# Default read interval in seconds; individual plugins override it below.
Interval 30
# AlignRead is a non-standard (PIKA-patched) option -- presumably aligns
# read cycles to wall-clock boundaries; exact semantics not documented here.
AlignRead 30.012
ReadThreads 2
WriteThreads 1
##############################################################################
# Logging #
#----------------------------------------------------------------------------#
# Plugins which provide logging functions should be loaded first, so log #
# messages generated when loading or configuring other plugins can be #
# accessed. #
##############################################################################
LoadPlugin logfile
<Plugin logfile>
LogLevel info
File "/tmp/pika_collectd.log"
Timestamp true
PrintSeverity false
</Plugin>
##############################################################################
# LoadPlugin section #
##############################################################################
# plugin read functions are executed in reverse order?
<LoadPlugin memory>
Interval 30
AlignRead 0.00
</LoadPlugin>
<Plugin memory>
ValuesAbsolute true
ValuesPercentage false
</Plugin>
<LoadPlugin cpu>
Interval 30
AlignRead 0.02
</LoadPlugin>
<Plugin cpu>
ReportByCpu true
ReportByState false
ValuesPercentage true
ReportNumCpu false
ReportGuestState false
SubtractGuestState false
</Plugin>
<LoadPlugin disk>
Interval 30
AlignRead 0.04
</LoadPlugin>
<Plugin disk>
Disk "sda"
IgnoreSelected false
</Plugin>
#<LoadPlugin gpu_nvidia>
# Interval 30
# AlignRead 0.06
#</LoadPlugin>
#<Plugin gpu_nvidia>
## InstanceByGPUIndex false
# InstanceByGPUName false
#</Plugin> #gpu_nvidia_end
<LoadPlugin likwid>
# Hardware counter metrics once per minute, offset 2 s from the boundary.
Interval 60
AlignRead 2
</LoadPlugin>
<Plugin likwid>
NormalizeFlops flops_any
AccessMode 0 # 1 for accessdaemon, 0 for direct access (only as root or with perf)
# Measurement time per metric group; given as a string here, while the
# 1 s-interval config uses a bare number (Mtime 0.9) -- TODO confirm
# which form the plugin expects.
Mtime "58"
Groups "pika_metrics_1"
# by default metrics are reported per hardware thread
PerSocketMetrics "mem_bw,rapl_power"
# Clamp implausible outlier values per metric.
MaxValues "ipc:10,flops:1e11,mem_bw:1e12"
PerCore true
Verbose 1
</Plugin> #likwid_end
<LoadPlugin python>
AlignRead 0.06
</LoadPlugin>
<Plugin python>
ModulePath "/sw/taurus/tools/pika/daemon/collectd/collectd-plugins/python"
LogTraces true
Interactive false
# Writer module: batches samples and pushes them to InfluxDB over HTTP.
Import "influx_write"
<Module influx_write>
host "172.24.146.84"
port "8086"
# SECURITY NOTE(review): plaintext credentials in a config file -- ensure
# restrictive file permissions / move secrets out of version control.
user "admin"
pwd "prope18!"
database "prope"
batch_size 200
cache_size 4000
StoreRates true
PerCore "cpu:avg" #"likwid_cpu:sum" #plugin1:aggregate,plugin2.aggregate
ssl false
</Module>
# InfiniBand bandwidth reader.
Import "ib_bw"
<Module ib_bw>
#devices "/sys/class/infiniband/mlx4_0"
#directory "/sys/class/infiniband"
recheck_limit 1440
</Module>
# Lustre file-system bandwidth reader.
Import "lustre_bw"
<Module lustre_bw>
#path "Lustre instance paths (comma separated)"
fsname_and_mount "*:/ws" # for all file systems AND mount points that end with '/ws'
recheck_limit 360 # every 3h
</Module>
</Plugin>
LoadPlugin unixsock
<Plugin unixsock>
SocketFile "/opt/pika/1.1/pika_collectd.sock"
SocketGroup "root"
SocketPerms "0770"
DeleteSocket true
</Plugin>
#LoadPlugin write_log
##############################################################################
# Filter configuration #
##############################################################################
# Load required matches:
LoadPlugin match_regex
LoadPlugin target_scale
LoadPlugin target_set
PreCacheChain "pika"
<Chain "pika">
### ignore other than memory used
<Rule "mem_used_only">
<Match "regex">
Plugin "^memory$"
TypeInstance "^[f|s|c|b]"
</Match>
Target "stop"
</Rule>
# for the disk plugin, ignore other than disc_octets and disk_ops
<Rule "disk_o_only">
<Match "regex">
Plugin "^disk$"
Type "^(p|disk_[t|m|i])" #starts with p or disk_t|i|m
#Type "^(?!disk_o).+" # do not start with "disk_o" # does not work with collectd
</Match>
Target "stop"
</Rule>
# rename "disc_octets" to "bytes"
<Rule "rename_disk_octets">
<Match "regex">
Plugin "^disk$"
Type "^disk_octets$"
</Match>
<Target "set">
TypeInstance "bytes"
</Target>
Target "write"
Target "stop"
</Rule>
# no need to have an additional "disk" in the field name
<Rule "rename_disk_ops">
<Match "regex">
Plugin "^disk$"
Type "^disk_ops$"
</Match>
<Target "set">
TypeInstance "ops"
</Target>
Target "write"
Target "stop"
</Rule>
# rename CPU "active" to "used" and multiply each value by 0.01
<Rule "handle_cpu_active">
<Match "regex">
Plugin "^cpu$"
TypeInstance "^active$"
</Match>
<Target "scale">
Factor 0.01
</Target>
<Target "set">
TypeInstance "used"
</Target>
Target "write"
Target "stop"
</Rule>
# handle all rules for the gpu_nvidia plugin
<Rule "handle_gpu_nvidia">
<Match "regex">
Plugin "^gpu_nvidia$"
</Match>
<Target jump>
Chain "handle_gpu_nvidia"
</Target>
# set plugin name to nvml for metrics not handled in chain
<Target "set">
Plugin "nvml"
</Target>
</Rule>
</Chain>
<Chain "handle_gpu_nvidia">
<Rule "nvml_no_freq">
<Match "regex">
Type "^freq" #frequency for multiprocessor and memory
</Match>
Target "stop"
</Rule>
<Rule "nvml_no_freemem">
<Match "regex">
TypeInstance "^free"
</Match>
Target "stop"
</Rule>
<Rule "rename_temperature">
<Match "regex">
Type "^temp"
</Match>
<Target "set">
Plugin "nvml"
TypeInstance "temp"
</Target>
Target "write"
Target "stop"
</Rule>
<Rule "rename_memory">
<Match "regex">
Type "^memory$"
TypeInstance "^used$"
</Match>
<Target "set">
Plugin "nvml"
TypeInstance "mem_used"
</Target>
Target "write"
Target "stop"
</Rule>
<Rule "handle_gpu_used">
<Match "regex">
TypeInstance "gpu_used$"
</Match>
<Target "scale">
Factor 0.01
</Target>
<Target "set">
Plugin "nvml"
</Target>
Target "write"
Target "stop"
</Rule>
</Chain>
#
# Config file for collectd(1).
# Please read collectd.conf(5) for a list of options.
# http://collectd.org/
#
##############################################################################
# Global settings for the daemon. #
##############################################################################
Hostname taurusi5396
TypesDB "/opt/pika/1.2/collectd/5.11.0/share/collectd/types.db"
TypesDB "/opt/pika/1.2/collectd/5.11.0/share/collectd/custom_types.db"
Interval 1
AlignRead true
ReadThreads 2
WriteThreads 1
##############################################################################
# Logging #
#----------------------------------------------------------------------------#
# Plugins which provide logging functions should be loaded first, so log #
# messages generated when loading or configuring other plugins can be #
# accessed. #
##############################################################################
LoadPlugin logfile
<Plugin logfile>
LogLevel info
File "/tmp/pika_collectd.log"
Timestamp true
PrintSeverity false
</Plugin>
##############################################################################
# LoadPlugin section #
##############################################################################
# plugin read functions are executed in reverse order?
LoadPlugin memory
<Plugin memory>
ValuesAbsolute true
ValuesPercentage false
</Plugin>
<LoadPlugin cpu>
AlignReadOffset 0.02
</LoadPlugin>
<Plugin cpu>
ReportByCpu true
ReportByState false
ValuesPercentage true
ReportNumCpu false
ReportGuestState false
SubtractGuestState false
</Plugin>
<LoadPlugin disk>
AlignReadOffset 0.04
</LoadPlugin>
<Plugin disk>
Disk "sda"
IgnoreSelected false
</Plugin>
#<LoadPlugin gpu_nvidia>
# Interval 30
# AlignReadOffset 0.06
#</LoadPlugin>
#<Plugin gpu_nvidia>
## InstanceByGPUIndex false
# InstanceByGPUName false
#</Plugin> #gpu_nvidia_end
<LoadPlugin likwid>
AlignReadOffset 0.1
</LoadPlugin>
<Plugin likwid>
NormalizeFlops flops_any
AccessMode 0 # 1 for accessdaemon, 0 for direct access (only as root or with perf)
Mtime 0.9
Groups "pika_metrics_1"
# by default metrics are reported per hardware thread
PerSocketMetrics "mem_bw,rapl_power"
MaxValues "ipc:10,flops:1e11,mem_bw:1e12"
PerCore true
Verbose 1
</Plugin> #likwid_end
<LoadPlugin python>
AlignReadOffset 0.06
</LoadPlugin>
<Plugin python>
ModulePath "/sw/taurus/tools/pika/daemon/collectd/collectd-plugins/python"
LogTraces true
Interactive false
Import "influx_write"
<Module influx_write>
host "172.24.146.84"
port "8086"
user "admin"
pwd "prope18!"
database "prope"
batch_size 1000
cache_size 4000
StoreRates true
PerCore "cpu:avg" #"likwid_cpu:sum" #plugin1:aggregate,plugin2.aggregate
ssl false
</Module>
Import "ib_bw"
<Module ib_bw>
#devices "/sys/class/infiniband/mlx4_0"
#directory "/sys/class/infiniband"
recheck_limit 0
</Module>
Import "lustre_bw"
<Module lustre_bw>
#path "Lustre instance paths (comma separated)"
fsname_and_mount "*:/ws" # for all file systems AND mount points that end with '/ws'
recheck_limit 0 # NOTE(review): stale comment removed -- "every 3h" matched the value 360 used in the 30 s config, not 0; confirm intended semantics of 0
</Module>
</Plugin>
LoadPlugin unixsock
<Plugin unixsock>
SocketFile "/opt/pika/1.2/pika_collectd.sock"
SocketGroup "root"
SocketPerms "0770"
DeleteSocket true
</Plugin>
#LoadPlugin write_log
##############################################################################
# Filter configuration #
##############################################################################
# Load required matches:
LoadPlugin match_regex
LoadPlugin target_scale
LoadPlugin target_set
PreCacheChain "pika"
<Chain "pika">
### ignore other than memory used
<Rule "mem_used_only">
<Match "regex">
Plugin "^memory$"
TypeInstance "^[f|s|c|b]"
</Match>
Target "stop"
</Rule>
# for the disk plugin, ignore other than disc_octets and disk_ops
<Rule "disk_o_only">
<Match "regex">
Plugin "^disk$"
Type "^(p|disk_[t|m|i])" #starts with p or disk_t|i|m
#Type "^(?!disk_o).+" # do not start with "disk_o" # does not work with collectd
</Match>
Target "stop"
</Rule>
# rename "disc_octets" to "bytes"
<Rule "rename_disk_octets">
<Match "regex">
Plugin "^disk$"
Type "^disk_octets$"
</Match>
<Target "set">
TypeInstance "bytes"
</Target>
Target "write"
Target "stop"
</Rule>
# no need to have an additional "disk" in the field name
<Rule "rename_disk_ops">
<Match "regex">
Plugin "^disk$"
Type "^disk_ops$"
</Match>
<Target "set">
TypeInstance "ops"
</Target>
Target "write"
Target "stop"
</Rule>
# rename CPU "active" to "used" and multiply each value by 0.01
<Rule "handle_cpu_active">
<Match "regex">
Plugin "^cpu$"
TypeInstance "^active$"
</Match>
<Target "scale">
Factor 0.01
</Target>
<Target "set">
TypeInstance "used"
</Target>
Target "write"
Target "stop"
</Rule>
# handle all rules for the gpu_nvidia plugin
<Rule "handle_gpu_nvidia">
<Match "regex">
Plugin "^gpu_nvidia$"
</Match>
<Target jump>
Chain "handle_gpu_nvidia"
</Target>
# set plugin name to nvml for metrics not handled in chain
<Target "set">
Plugin "nvml"
</Target>
</Rule>
</Chain>
<Chain "handle_gpu_nvidia">
<Rule "nvml_no_freq">
<Match "regex">
Type "^freq" #frequency for multiprocessor and memory
</Match>
Target "stop"
</Rule>
<Rule "nvml_no_freemem">
<Match "regex">
TypeInstance "^free"
</Match>
Target "stop"
</Rule>
<Rule "rename_temperature">
<Match "regex">
Type "^temp"
</Match>
<Target "set">
Plugin "nvml"
TypeInstance "temp"
</Target>
Target "write"
Target "stop"
</Rule>
<Rule "rename_memory">
<Match "regex">
Type "^memory$"
TypeInstance "^used$"
</Match>
<Target "set">
Plugin "nvml"
TypeInstance "mem_used"
</Target>
Target "write"
Target "stop"
</Rule>
<Rule "handle_gpu_used">
<Match "regex">
TypeInstance "gpu_used$"
</Match>
<Target "scale">
Factor 0.01
</Target>
<Target "set">
Plugin "nvml"
</Target>
Target "write"
Target "stop"
</Rule>
</Chain>
#
# Config file for collectd(1).
# Please read collectd.conf(5) for a list of options.
# http://collectd.org/
#
##############################################################################
# Global settings for the daemon. #
##############################################################################
Hostname taurusi5396
TypesDB "/opt/pika/1.2/collectd/5.11.0/share/collectd/types.db"
TypesDB "/opt/pika/1.2/collectd/5.11.0/share/collectd/custom_types.db"
Interval 10
AlignRead true
ReadThreads 2
WriteThreads 1
##############################################################################
# Logging #
#----------------------------------------------------------------------------#
# Plugins which provide logging functions should be loaded first, so log #
# messages generated when loading or configuring other plugins can be #
# accessed. #
##############################################################################
LoadPlugin logfile
<Plugin logfile>
LogLevel info
File "/tmp/pika_collectd.log"
Timestamp true
PrintSeverity false
</Plugin>
##############################################################################
# LoadPlugin section #
##############################################################################
# plugin read functions are executed in reverse order?
LoadPlugin memory
<Plugin memory>