Commit b68ef907 authored by Frank Winkler's avatar Frank Winkler

Initial commit.

parent 840cf6c2
#
# Config file for collectd(1).
# Please read collectd.conf(5) for a list of options.
# http://collectd.org/
#
##############################################################################
# Global settings for the daemon. #
##############################################################################
# NOTE(review): "#HostnameReplace" looks like a template marker, presumably
# replaced with a Hostname directive by deployment tooling — confirm.
#HostnameReplace
# Data-set definitions: the stock collectd types plus site-specific custom types.
# NOTE(review): CD_INST_PATH and CUSTOM_TYPES_DIR appear to be placeholders —
# confirm they are substituted before the daemon is started.
TypesDB "CD_INST_PATH/share/collectd/types.db"
TypesDB "CUSTOM_TYPES_DIR/custom_types.db"
# Default read interval in seconds for plugins that do not set their own Interval.
Interval 30
# NOTE(review): AlignRead is not listed in upstream collectd.conf(5); presumably
# it is provided by a patched collectd build — confirm its semantics there.
AlignRead 30.012
# Sizes of the thread pools used for read and write callbacks.
ReadThreads 2
WriteThreads 1
##############################################################################
# Logging #
#----------------------------------------------------------------------------#
# Plugins which provide logging functions should be loaded first, so log #
# messages generated when loading or configuring other plugins can be #
# accessed. #
##############################################################################
# Log messages of level "info" and above to a file, with timestamps and without
# the severity prefix.
# NOTE(review): the log file lives in /tmp — consider a dedicated log directory.
LoadPlugin logfile
<Plugin logfile>
LogLevel info
File "/tmp/pika_collectd.log" #STDOUT
Timestamp true
PrintSeverity false
</Plugin>
##############################################################################
# LoadPlugin section #
##############################################################################
# plugin read functions are executed in reverse order?
# memory plugin: read every 30 seconds.
# NOTE(review): AlignRead is not an upstream collectd option; the per-plugin
# values (0.00, 0.02, 0.04, ...) look like staggered offsets — confirm.
<LoadPlugin memory>
Interval 30
AlignRead 0.00
</LoadPlugin>
# Report memory as absolute values only, not as percentages.
<Plugin memory>
ValuesAbsolute true
ValuesPercentage false
</Plugin>
# cpu plugin: read every 30 seconds.
<LoadPlugin cpu>
Interval 30
AlignRead 0.02
</LoadPlugin>
# Report one value per CPU, in percent, without a per-state breakdown.
# With ReportByState false the plugin reports an aggregated "active" value
# (see collectd.conf(5)); rule "handle_cpu_active" in chain "pika" matches it.
<Plugin cpu>
ReportByCpu true
ReportByState false
ValuesPercentage true
ReportNumCpu false
ReportGuestState false
SubtractGuestState false
</Plugin>
# disk plugin: read every 30 seconds.
<LoadPlugin disk>
Interval 30
AlignRead 0.04
</LoadPlugin>
# Collect statistics for "sda" only: IgnoreSelected false means the listed
# disks are the ones that are collected.
<Plugin disk>
Disk "sda"
IgnoreSelected false
</Plugin>
# gpu_nvidia plugin: read every 30 seconds.
<LoadPlugin gpu_nvidia>
Interval 30
AlignRead 0.06
</LoadPlugin>
# Do not use the GPU name as the plugin instance.
# NOTE(review): the trailing "#gpu_nvidia_end" looks like a marker used by
# external tooling to cut this section out on hosts without GPUs — confirm
# before moving or removing it.
<Plugin gpu_nvidia>
# InstanceByGPUIndex false
InstanceByGPUName false
</Plugin> #gpu_nvidia_end
# likwid plugin: read every 60 seconds (twice the global Interval of 30).
# NOTE(review): this plugin is not part of upstream collectd; the option
# descriptions below are inferred from their names — confirm against the
# plugin's own documentation.
<LoadPlugin likwid>
Interval 60
AlignRead 30.1
</LoadPlugin>
<Plugin likwid>
NormalizeFlops flops_any
AccessMode 0 # 1 for accessdaemon, 0 for direct access (only as root or with perf)
# presumably the measurement time per group in seconds — TODO confirm
Mtime 15
# Comma-separated LIKWID performance groups to measure.
Groups "pika_metrics_1,pika_metrics_2"
# by default metrics are reported per core
PerSocketMetrics "mem_bw,rapl_power"
Verbose 2
</Plugin> #likwid_end
# python plugin: hosts the Python modules imported below; read every 30 seconds.
<LoadPlugin python>
Interval 30
AlignRead 30.3
</LoadPlugin>
<Plugin python>
# NOTE(review): CD_PLUGINS_PYTHON appears to be a placeholder for the module
# directory — confirm it is substituted before the daemon is started.
ModulePath "CD_PLUGINS_PYTHON"
LogTraces true
Interactive false
# influx_write: module configured with batch/cache sizes and connection options.
Import "influx_write"
<Module influx_write>
# NOTE(review): the five commented markers below look like template slots for
# the InfluxDB connection settings, filled in by deployment tooling — confirm.
#INFLUXHOST
#INFLUXPORT
#INFLUXUSER
#INFLUXPWD
#INFLUXDBNAME
batch_size 50
cache_size 2000
StoreRates true
ssl false
</Module>
# ib_bw: device/directory selection is commented out, so the module's own
# defaults apply.
Import "ib_bw"
<Module ib_bw>
#devices "/sys/class/infiniband/mlx4_0"
#directory "/sys/class/infiniband"
recheck_limit 1440
</Module>
# lustre_bw: no explicit path is configured.
Import "lustre_bw"
<Module lustre_bw>
#path "Path to lustre file systems (comma separated)"
recheck_limit 1440
</Module>
</Plugin>
# unixsock plugin: opens a UNIX domain socket owned by group "root" with mode
# 0770; DeleteSocket true removes a stale socket file on startup.
# NOTE(review): the socket lives in /tmp — consider a dedicated runtime directory.
LoadPlugin unixsock
<Plugin unixsock>
SocketFile "/tmp/pika_collectd_unixsock" #socket for notifications
SocketGroup "root"
SocketPerms "0770"
DeleteSocket true
</Plugin>
#LoadPlugin write_log
##############################################################################
# Filter configuration #
##############################################################################
# Load the match and target plugins required by the filter chains:
# match_regex provides <Match "regex">, target_scale provides <Target "scale">,
# and target_set provides <Target "set">.
LoadPlugin match_regex
LoadPlugin target_scale
LoadPlugin target_set
# Run chain "pika" on every value before it is added to the cache.
PreCacheChain "pika"
# Pre-cache filter chain: drops unwanted values and renames or scales the rest.
# Rules are evaluated in order; Target "stop" ends processing of a value.
<Chain "pika">
### memory plugin: keep "used" only by dropping every type instance that
### starts with f, s, c or b (e.g. free, slab_*, cached, buffered)
<Rule "mem_used_only">
<Match "regex">
Plugin "^memory$"
# "|" is a literal character inside a bracket expression, not alternation
TypeInstance "^[fscb]"
</Match>
Target "stop"
</Rule>
# for the disk plugin, ignore everything other than disk_octets and disk_ops
<Rule "disk_o_only">
<Match "regex">
Plugin "^disk$"
Type "^(p|disk_[tmi])" #starts with "p", "disk_t", "disk_m" or "disk_i"
#Type "^(?!disk_o).+" # do not start with "disk_o" # does not work with collectd
</Match>
Target "stop"
</Rule>
# rename "disk_octets" to "bytes"
<Rule "rename_disk_octets">
<Match "regex">
Plugin "^disk$"
Type "^disk_octets$"
</Match>
<Target "set">
TypeInstance "bytes"
</Target>
Target "write"
Target "stop"
</Rule>
# no need to have an additional "disk" in the field name
<Rule "rename_disk_ops">
<Match "regex">
Plugin "^disk$"
Type "^disk_ops$"
</Match>
<Target "set">
TypeInstance "ops"
</Target>
Target "write"
Target "stop"
</Rule>
# rename CPU "active" to "used" and multiply each value by 0.01
<Rule "handle_cpu_active">
<Match "regex">
Plugin "^cpu$"
TypeInstance "^active$"
</Match>
<Target "scale">
Factor 0.01
</Target>
<Target "set">
TypeInstance "used"
</Target>
Target "write"
Target "stop"
</Rule>
# handle all rules for the gpu_nvidia plugin
<Rule "handle_gpu_nvidia">
<Match "regex">
Plugin "^gpu_nvidia$"
</Match>
<Target "jump">
Chain "handle_gpu_nvidia"
</Target>
# set plugin name to nvml for metrics not handled in chain
<Target "set">
Plugin "nvml"
</Target>
</Rule>
</Chain>
# Sub-chain for gpu_nvidia values; it is entered through the "jump" target of
# rule "handle_gpu_nvidia" in chain "pika". Values that match no rule here fall
# through to the end of this chain.
<Chain "handle_gpu_nvidia">
# drop all values whose type starts with "freq"
<Rule "nvml_no_freq">
<Match "regex">
Type "^freq" #frequency for multiprocessor and memory
</Match>
Target "stop"
</Rule>
# drop all values whose type instance starts with "free"
<Rule "nvml_no_freemem">
<Match "regex">
TypeInstance "^free"
</Match>
Target "stop"
</Rule>
# types starting with "temp": write as plugin "nvml", type instance "temp"
<Rule "rename_temperature">
<Match "regex">
Type "^temp"
</Match>
<Target "set">
Plugin "nvml"
TypeInstance "temp"
</Target>
Target "write"
Target "stop"
</Rule>
# type "memory" / instance "used": write as plugin "nvml", instance "mem_used"
<Rule "rename_memory">
<Match "regex">
Type "^memory$"
TypeInstance "^used$"
</Match>
<Target "set">
Plugin "nvml"
TypeInstance "mem_used"
</Target>
Target "write"
Target "stop"
</Rule>
</Chain>
likwid value:GAUGE:0:U
pair value:GAUGE:0:U, value:GAUGE:0:U
SHORT Branch prediction miss rate/ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 BR_INST_RETIRED_ALL_BRANCHES
PMC1 BR_MISP_RETIRED_ALL_BRANCHES
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Branch rate PMC0/FIXC0
Branch misprediction rate PMC1/FIXC0
Branch misprediction ratio PMC1/PMC0
Instructions per branch FIXC0/PMC0
LONG
Formulas:
Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
Branch misprediction rate = BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
-
The rates state how often on average a branch or a mispredicted branch occurred
per instruction retired in total. The branch misprediction ratio states directly
what fraction of all branch instructions were mispredicted.
Instructions per branch is 1/branch rate.
SHORT Cache bandwidth in MBytes/s
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 L1D_REPLACEMENT
PMC1 L2_TRANS_L1D_WB
PMC2 L2_LINES_IN_ALL
PMC3 L2_TRANS_L2_WB
CBOX0C1 LLC_VICTIMS_M
CBOX1C1 LLC_VICTIMS_M
CBOX2C1 LLC_VICTIMS_M
CBOX3C1 LLC_VICTIMS_M
CBOX4C1 LLC_VICTIMS_M
CBOX5C1 LLC_VICTIMS_M
CBOX6C1 LLC_VICTIMS_M
CBOX7C1 LLC_VICTIMS_M
CBOX8C1 LLC_VICTIMS_M
CBOX9C1 LLC_VICTIMS_M
CBOX10C1 LLC_VICTIMS_M
CBOX11C1 LLC_VICTIMS_M
CBOX12C1 LLC_VICTIMS_M
CBOX13C1 LLC_VICTIMS_M
CBOX14C1 LLC_VICTIMS_M
CBOX15C1 LLC_VICTIMS_M
CBOX16C1 LLC_VICTIMS_M
CBOX17C1 LLC_VICTIMS_M
CBOX18C1 LLC_VICTIMS_M
CBOX19C1 LLC_VICTIMS_M
CBOX20C1 LLC_VICTIMS_M
CBOX21C1 LLC_VICTIMS_M
CBOX0C0 LLC_LOOKUP_DATA_READ
CBOX1C0 LLC_LOOKUP_DATA_READ
CBOX2C0 LLC_LOOKUP_DATA_READ
CBOX3C0 LLC_LOOKUP_DATA_READ
CBOX4C0 LLC_LOOKUP_DATA_READ
CBOX5C0 LLC_LOOKUP_DATA_READ
CBOX6C0 LLC_LOOKUP_DATA_READ
CBOX7C0 LLC_LOOKUP_DATA_READ
CBOX8C0 LLC_LOOKUP_DATA_READ
CBOX9C0 LLC_LOOKUP_DATA_READ
CBOX10C0 LLC_LOOKUP_DATA_READ
CBOX11C0 LLC_LOOKUP_DATA_READ
CBOX12C0 LLC_LOOKUP_DATA_READ
CBOX13C0 LLC_LOOKUP_DATA_READ
CBOX14C0 LLC_LOOKUP_DATA_READ
CBOX15C0 LLC_LOOKUP_DATA_READ
CBOX16C0 LLC_LOOKUP_DATA_READ
CBOX17C0 LLC_LOOKUP_DATA_READ
CBOX18C0 LLC_LOOKUP_DATA_READ
CBOX19C0 LLC_LOOKUP_DATA_READ
CBOX20C0 LLC_LOOKUP_DATA_READ
CBOX21C0 LLC_LOOKUP_DATA_READ
MBOX0C0 CAS_COUNT_RD
MBOX0C1 CAS_COUNT_WR
MBOX1C0 CAS_COUNT_RD
MBOX1C1 CAS_COUNT_WR
MBOX2C0 CAS_COUNT_RD
MBOX2C1 CAS_COUNT_WR
MBOX3C0 CAS_COUNT_RD
MBOX3C1 CAS_COUNT_WR
MBOX4C0 CAS_COUNT_RD
MBOX4C1 CAS_COUNT_WR
MBOX5C0 CAS_COUNT_RD
MBOX5C1 CAS_COUNT_WR
MBOX6C0 CAS_COUNT_RD
MBOX6C1 CAS_COUNT_WR
MBOX7C0 CAS_COUNT_RD
MBOX7C1 CAS_COUNT_WR
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
L2 to L1 load bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
L2 to L1 load data volume [GBytes] 1.0E-09*PMC0*64.0
L1 to L2 evict bandwidth [MBytes/s] 1.0E-06*PMC1*64.0/time
L1 to L2 evict data volume [GBytes] 1.0E-09*PMC1*64.0
L1 to/from L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
L1 to/from L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
L3 to L2 load bandwidth [MBytes/s] 1.0E-06*PMC2*64.0/time
L3 to L2 load data volume [GBytes] 1.0E-09*PMC2*64.0
L2 to L3 evict bandwidth [MBytes/s] 1.0E-06*PMC3*64.0/time
L2 to L3 evict data volume [GBytes] 1.0E-09*PMC3*64.0
L2 to/from L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
L2 to/from L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
System to L3 bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0)*64.0/time
System to L3 data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0)*64.0
L3 to system bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1)*64/time
L3 to system data volume [GBytes] 1.0E-09*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1)*64
L3 to/from system bandwidth [MBytes/s] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1)*64.0/time
L3 to/from system data volume [GBytes] 1.0E-09*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0+CBOX18C0+CBOX19C0+CBOX20C0+CBOX21C0+CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1+CBOX10C1+CBOX11C1+CBOX12C1+CBOX13C1+CBOX14C1+CBOX15C1+CBOX16C1+CBOX17C1+CBOX18C1+CBOX19C1+CBOX20C1+CBOX21C1)*64.0
Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
LONG
Formulas:
L2 to L1 load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
L2 to L1 load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64
L1 to L2 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64/time
L1 to L2 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64
L1 to/from L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB)*64/time
L1 to/from L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB)*64
L3 to L2 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time
L3 to L2 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64
L2 to L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64/time
L2 to L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64
L2 to/from L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time
L2 to/from L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64
System to L3 bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ))*64/time
System to L3 data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ))*64
L3 to system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_VICTIMS_M))*64/time
L3 to system data volume [GBytes] = 1.0E-09*(SUM(LLC_VICTIMS_M))*64
L3 to/from system bandwidth [MBytes/s] = 1.0E-06*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64/time
L3 to/from system data volume [GBytes] = 1.0E-09*(SUM(LLC_LOOKUP_DATA_READ)+SUM(LLC_VICTIMS_M))*64
Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/time
Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0
Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/time
Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0
Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/time
Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0
-
Group to measure cache transfers between L1 and Memory. Please note that the
L3 to/from system metrics contain any traffic to the system (memory,
Intel QPI, etc.) but do not seem to capture all of it, because commonly the
memory read bandwidth and the L3 to L2 bandwidth are higher than the memory to
L3 bandwidth.
SHORT Power and Energy consumption
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PWR0 PWR_PKG_ENERGY
UBOXFIX UNCORE_CLOCK
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
Uncore Clock [MHz] 1.E-06*UBOXFIX/time
CPI FIXC1/FIXC0
Energy [J] PWR0
Power [W] PWR0/time
LONG
Formulas:
Power = PWR_PKG_ENERGY / time
Uncore Clock [MHz] = 1.E-06 * UNCORE_CLOCK / time
-
Broadwell implements the new RAPL interface. This interface makes it possible to
monitor the consumed energy on the package (socket) level.
SHORT Cycle Activities
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 CYCLE_ACTIVITY_CYCLES_L2_PENDING
PMC1 CYCLE_ACTIVITY_CYCLES_LDM_PENDING
PMC2 CYCLE_ACTIVITY_CYCLES_L1D_PENDING
PMC3 CYCLE_ACTIVITY_CYCLES_NO_EXECUTE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Cycles without execution [%] (PMC3/FIXC1)*100
Cycles without execution due to L1D [%] (PMC2/FIXC1)*100
Cycles without execution due to L2 [%] (PMC0/FIXC1)*100
Cycles without execution due to memory loads [%] (PMC1/FIXC1)*100
LONG
Formulas:
Cycles without execution [%] = CYCLE_ACTIVITY_CYCLES_NO_EXECUTE/CPU_CLK_UNHALTED_CORE*100
Cycles without execution due to L1D [%] = CYCLE_ACTIVITY_CYCLES_L1D_PENDING/CPU_CLK_UNHALTED_CORE*100
Cycles without execution due to L2 [%] = CYCLE_ACTIVITY_CYCLES_L2_PENDING/CPU_CLK_UNHALTED_CORE*100
Cycles without execution due to memory loads [%] = CYCLE_ACTIVITY_CYCLES_LDM_PENDING/CPU_CLK_UNHALTED_CORE*100
--
This performance group measures the cycles while waiting for data from the cache
and memory hierarchy.
CYCLE_ACTIVITY_CYCLES_NO_EXECUTE: Counts the number of cycles in which nothing
is executed on any execution port.
CYCLE_ACTIVITY_CYCLES_L1D_PENDING: Cycles while L1 cache miss demand load is
outstanding.
CYCLE_ACTIVITY_CYCLES_L2_PENDING: Cycles while L2 cache miss demand load is
outstanding.
CYCLE_ACTIVITY_CYCLES_LDM_PENDING: Cycles while memory subsystem has an
outstanding load.
SHORT Cycle Activities (Stalls)
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 CYCLE_ACTIVITY_STALLS_L2_PENDING
PMC1 CYCLE_ACTIVITY_STALLS_LDM_PENDING
PMC2 CYCLE_ACTIVITY_STALLS_L1D_PENDING
PMC3 CYCLE_ACTIVITY_STALLS_TOTAL
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Total execution stalls PMC3
Stalls caused by L1D misses [%] (PMC2/PMC3)*100
Stalls caused by L2 misses [%] (PMC0/PMC3)*100
Stalls caused by memory loads [%] (PMC1/PMC3)*100
Execution stall rate [%] (PMC3/FIXC1)*100
Stalls caused by L1D misses rate [%] (PMC2/FIXC1)*100
Stalls caused by L2 misses rate [%] (PMC0/FIXC1)*100
Stalls caused by memory loads rate [%] (PMC1/FIXC1)*100
LONG
Formulas:
Total execution stalls = CYCLE_ACTIVITY_STALLS_TOTAL
Stalls caused by L1D misses [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
Stalls caused by L2 misses [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
Stalls caused by memory loads [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CYCLE_ACTIVITY_STALLS_TOTAL)*100
Execution stall rate [%] = (CYCLE_ACTIVITY_STALLS_TOTAL/CPU_CLK_UNHALTED_CORE)*100
Stalls caused by L1D misses rate [%] = (CYCLE_ACTIVITY_STALLS_L1D_PENDING/CPU_CLK_UNHALTED_CORE)*100
Stalls caused by L2 misses rate [%] = (CYCLE_ACTIVITY_STALLS_L2_PENDING/CPU_CLK_UNHALTED_CORE)*100
Stalls caused by memory loads rate [%] = (CYCLE_ACTIVITY_STALLS_LDM_PENDING/CPU_CLK_UNHALTED_CORE)*100
--
This performance group measures the stalls caused by data traffic in the cache
hierarchy.
CYCLE_ACTIVITY_STALLS_TOTAL: Total execution stalls.
CYCLE_ACTIVITY_STALLS_L1D_PENDING: Execution stalls while L1 cache miss demand
load is outstanding.
CYCLE_ACTIVITY_STALLS_L2_PENDING: Execution stalls while L2 cache miss demand
load is outstanding.
CYCLE_ACTIVITY_STALLS_LDM_PENDING: Execution stalls while memory subsystem has
an outstanding load.
SHORT Load to store ratio
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0 MEM_UOPS_RETIRED_LOADS_ALL
PMC1 MEM_UOPS_RETIRED_STORES_ALL
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
CPI FIXC1/FIXC0
Load to store ratio PMC0/PMC1
LONG
Formulas:
Load to store ratio = MEM_UOPS_RETIRED_LOADS_ALL/MEM_UOPS_RETIRED_STORES_ALL
-
This is a metric to determine your load to store ratio.
SHORT Divide unit information
EVENTSET
FIXC0 INSTR_RETIRED_ANY
FIXC1 CPU_CLK_UNHALTED_CORE
FIXC2 CPU_CLK_UNHALTED_REF
PMC0:EDGEDETECT ARITH_FPU_DIV_ACTIVE
PMC1 ARITH_FPU_DIV_ACTIVE
METRICS
Runtime (RDTSC) [s] time
Runtime unhalted [s] FIXC1*inverseClock
Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock