Commit 035549a8 authored by Julius Metz's avatar Julius Metz

rework merge, add filter functionality

parent 32565906
def hardvalue(filter_data, filtervalue):
    """Keep, per metric, the commands holding the biggest share of the total.

    For every metric key the commands are sorted from greatest to smallest
    percentage of the overall value; commands are then collected until the
    accumulated percentage reaches ``filtervalue`` (or a command's share
    rounds down to 0%), and the rest are filtered out.

    Arguments:
        filter_data {dict} -- per-command totals under ``'commands'`` plus
            the overall total for each metric key
        filtervalue {int} -- percentage of the interesting values

    Returns:
        dict -- dict key= collectl heads value=list of cmd that have not
            been filtered out (the list may be empty, but every metric key
            seen in the data is present, so callers can index it safely)
    """
    filter_info = {}
    shares = {}
    # Build (percentage-of-total, cmd) pairs per metric key.
    for cmd, cmd_data in filter_data['commands'].items():
        for metric, value in cmd_data.items():
            if metric == 'number_of_values':
                continue
            shares.setdefault(metric, []).append(
                (int(100 * value / filter_data[metric]), cmd),
            )
    for metric, pairs in shares.items():
        # Always create the entry so a fully-filtered metric still maps to
        # an empty list instead of raising KeyError downstream.
        filter_info[metric] = []
        accumulated = 0
        # Stable sort on the percentage only, preserving insertion order
        # between commands with equal shares.
        for percent, cmd in sorted(pairs, key=lambda pair: pair[0], reverse=True):
            if percent == 0 or accumulated >= filtervalue:
                break
            accumulated += percent
            filter_info[metric].append(cmd)
    return filter_info
def average(filter_data, filtervalue):
    """Filter out the commands whose per-command average falls below
    ``filtervalue`` percent of the overall average for the same metric.

    Arguments:
        filter_data {dict} -- per-command totals and counts under
            ``'commands'`` plus overall totals and ``'number_of_values'``
        filtervalue {int} -- percentage of min needed average in relation
            to the overall average

    Returns:
        dict -- dict key= collectl heads value=list of cmd that have not
            been filtered out
    """
    overall_averages = {}
    filter_info = {}
    for cmd, cmd_data in filter_data['commands'].items():
        for metric, total in cmd_data.items():
            if metric == 'number_of_values':
                continue
            # Compute the overall average lazily, once per metric key.
            if metric not in overall_averages:
                overall_averages[metric] = (
                    filter_data[metric] / filter_data['number_of_values']
                )
                filter_info[metric] = []
            cmd_average = total / cmd_data['number_of_values']
            share = int(100 * cmd_average / overall_averages[metric])
            if share >= filtervalue:
                filter_info[metric].append(cmd)
    return filter_info
......@@ -2,9 +2,9 @@ import plotly.io as pio
import datetime
def cpu_plot(values, title=None, xtitle=None, ytitle=None):
def build_plot(plot_data, title=None, xtitle=None, ytitle=None, showlegend=True, **kwargs):
plot = {
'data': [],
'data': plot_data,
'layout': {
'title': {
'text': title
......@@ -15,22 +15,45 @@ def cpu_plot(values, title=None, xtitle=None, ytitle=None):
'yaxis': {
'title': ytitle,
},
'showlegend': True,
'showlegend': showlegend,
},
}
for cmd, cmd_data in values.items():
CPU = cmd_data.get('PCT', [])
if not any(elem != '0' for elem in CPU):
continue
return plot
plot['data'].append(
def cpu_plot(cmds_data, filter_info, **kwargs):
needed_key = 'PCT'
plot_data = []
for cmd in filter_info[needed_key]:
plot_data.append(
{
'type': 'scatter',
'mode': 'markers',
'x': cmds_data[cmd].get('datetime', []),
'y': cmds_data[cmd].get(needed_key, []),
'name': cmd,
}
)
pio.show(build_plot(plot_data, **kwargs))
return build_plot(plot_data, **kwargs)
def ram_plot(cmds_data, filter_info, **kwargs):
plot_data = []
needed_key = 'VmRSS'
for cmd in filter_info[needed_key]:
cmd_needed_key_data = cmds_data[cmd].get(needed_key, [])
plot_data.append(
{
'type': 'scatter',
'mode': 'markers',
'x': cmd_data.get('datetime', []),
'y': CPU,
'x': cmds_data[cmd].get('datetime', []),
'y': [ value / 1024 / 1024 for value in cmd_needed_key_data],
'name': cmd,
}
)
pio.show(build_plot(plot_data, **kwargs))
return build_plot(plot_data, **kwargs)
pio.show(plot)
\ No newline at end of file
import re
import subprocess
import shlex
from pathlib import Path
import datetime
import click
import plots_generators
import filter_func
FILTER_FUNCTIONS = ['hardvalue', 'average']
NAME_SPEZIAL_PARAMETER_CONFIG = {
'java': ['-cp', '-classpath'],
'bash': [],
'sh': [],
'perl': [],
'python': ['-m', '-W', '-X', '--check-hash-based-pycs', '-c'],
}
SAME_COMMAND_REGEX = [
(r'.+ org.apache.spark.launcher.Main .+', 'org.apache.spark.launcher.Main'),
(r'.+ org.apache.spark.executor.CoarseGrainedExecutorBackend .+', 'org.apache.spark.executor.CoarseGrainedExecutorBackend'),
(r'.+ org.apache.spark.deploy.worker.Worker .+', 'org.apache.spark.deploy.worker.Worker'),
]
MERGE_VALUES = ['PCT']
MERGE_IDENTIFIER = 'datetime'
COMAND_BLACKLIST_REGEX = [r'^/usr/bin/perl .+collectl']
HEAD_BLACKLIST = ['Time', 'Date']
CONFIG = {
'cpu_plot':{
MERGE_VALUES = ['PCT', 'VmRSS']
COMAND_BLACKLIST_REGEX = [
r'^[^ ]+perl .+collectl',
]
PLOT_CONFIG = [{
'generator': 'cpu_plot',
'settings': {
'title': 'CPU load',
'xtitle': 'Date',
'ytitle': 'CPU load',
}
}
},
},
{
'generator': 'ram_plot',
'settings': {
'title': 'Memory Usage',
'xtitle': 'Date',
'ytitle': 'RAM usage GiB',
},
},
]
def datestr2date(datestr):
......@@ -42,93 +61,121 @@ def datestr2date(datestr):
)
def parse_file(path, collectl):
def get_cmdname(cmd, coarsest=False, config=None):
    """Search the complete command string for the name of the script or
    command that is actually being run.

    Arguments:
        cmd {str} -- complete command string

    Keyword Arguments:
        coarsest {bool} -- return only the calling interpreter
            (example: bash, python) if True (default: {False})
        config {dict} -- maps interpreter name to the list of its options
            that consume a following value; defaults to the module-level
            NAME_SPEZIAL_PARAMETER_CONFIG (default: {None})

    Returns:
        str -- new cmd name
    """
    if config is None:
        config = NAME_SPEZIAL_PARAMETER_CONFIG
    cmd_parts = shlex.split(cmd)
    # Basename of the executable, stripped to its leading word characters
    # (e.g. '/usr/bin/python' -> 'python').
    interpreter = cmd_parts[0].split('/')[-1]
    interpreter = re.search(r'[^\W\n]+', interpreter).group(0)
    value_options = config.get(interpreter)
    if coarsest or value_options is None:
        return interpreter
    skip_next = False
    for parameter in cmd_parts[1:]:
        if skip_next:
            # This token is the value of the preceding option.
            skip_next = False
            continue
        if parameter in value_options:
            skip_next = True
            continue
        # BUG FIX: the original test was `interpreter == 'bash' or
        # interpreter == 'sh' and parameter == '-c'`, which parses as
        # `'bash' or ('sh' and '-c')` and labelled EVERY bash command
        # 'bash -c'; only an actual '-c' should match.
        if interpreter in ('bash', 'sh') and parameter == '-c':
            return interpreter + ' -c'
        if parameter.startswith('-'):
            continue
        # First bare argument is assumed to be the script being executed.
        return parameter.split('/')[-1]
    return cmd
def parse_file(path, collectl, merge, coarsest):
process = subprocess.run(
[collectl, '-P', '-p', path, '-sZ'], capture_output=True,
)
output = process.stdout.decode().splitlines()
head = output.pop(0).split(' ')
for possible_head in output[:]:
if possible_head.startswith('#'):
head = possible_head.split(' ')
output.remove(possible_head)
else:
break
head[0] = head[0][1:]
head_indexes_dict = {head_title: index for index, head_title in enumerate(head)}
entrys_data = {}
tmp_date = None
tmp_time = None
for entry in output:
splited_entry = entry.split(' ', len(head)-1)
splited_entry = entry.split(' ', len(head_indexes_dict)-1)
cmd = splited_entry[-1]
for regexpr in COMAND_BLACKLIST_REGEX:
if re.search(regexpr, cmd):
break
else:
if merge:
cmd = get_cmdname(cmd, coarsest=coarsest)
if not cmd in entrys_data:
entrys_data[cmd] = {
head_elem : [] for head_elem in head[:-1] if head_elem not in HEAD_BLACKLIST
}
entrys_data[cmd]['datetime'] = []
for i, head_elem in enumerate(head[:-1]):
if head_elem == 'Date':
tmp_date = datestr2date(splited_entry[i])
if head_elem == 'Time':
tmp_time = datetime.time.fromisoformat(splited_entry[i])
if not head_elem in HEAD_BLACKLIST:
entrys_data[cmd][head_elem].append(splited_entry[i])
entrys_data[cmd]['datetime'].append(
datetime.datetime.combine(tmp_date, tmp_time),
entrys_data[cmd] = {}
tmp_datetime = datetime.datetime.combine(
datestr2date(splited_entry[head_indexes_dict['Date']]),
datetime.time.fromisoformat(splited_entry[head_indexes_dict['Time']]),
)
return entrys_data
def merge_same_commands(data):
"""checks which commands can be combined - this is done using SAME_COMMAND_REGEX
than combine the commands and remove the old.
MERGE_VALUES specify which variable keys are combined
MERGE_IDENTIFIER specifies the unique variable key where the merge is based on
all variable keys that are not in MERGE_VALUES or MERGE_IDENTIFIER are left out and not transmitted!
Arguments:
data {dict} -- data from collectl parsed
"""
for command, cmd_data in list(data.items()):
for regexpr, name in SAME_COMMAND_REGEX:
if re.search(regexpr, command):
if not name in data:
data[name] = {
key: cmd_data[key] for key in MERGE_VALUES + [MERGE_IDENTIFIER]
if not tmp_datetime in entrys_data[cmd]:
entrys_data[cmd][tmp_datetime] = {
key: 0.0 for key in MERGE_VALUES
}
else:
for i, identifier in enumerate(cmd_data[MERGE_IDENTIFIER]):
try:
index = data[name][MERGE_IDENTIFIER].index(identifier)
for key in MERGE_VALUES:
data[name][key][index] = float(data[name][key][index]) +\
float(cmd_data[key][i])
except ValueError:
for key in MERGE_VALUES + [MERGE_IDENTIFIER]:
data[name][key].append(cmd_data[key][i])
data.pop(command, None)
break
for head_title in MERGE_VALUES:
entrys_data[cmd][tmp_datetime][head_title] += float(splited_entry[head_indexes_dict[head_title]])
entry_data_plotfriendly = {}
plot_filter_data = {key: 0.0 for key in MERGE_VALUES}
plot_filter_data['number_of_values'] = 0
plot_filter_data['commands'] = {}
for cmd, cmd_data in entrys_data.items():
plot_filter_data['commands'][cmd] = {key: 0 for key in MERGE_VALUES}
plot_filter_data['commands'][cmd]['number_of_values'] = 0
entry_data_plotfriendly[cmd] = {key: [] for key in MERGE_VALUES}
entry_data_plotfriendly[cmd]['datetime'] = []
for cmd_data_time, cmd_data_values in cmd_data.items():
entry_data_plotfriendly[cmd]['datetime'].append(cmd_data_time)
for cmd_data_key, cmd_data_value in cmd_data_values.items():
entry_data_plotfriendly[cmd][cmd_data_key].append(cmd_data_value)
plot_filter_data['commands'][cmd][cmd_data_key] += cmd_data_value
plot_filter_data['commands'][cmd]['number_of_values'] += 1
plot_filter_data[cmd_data_key] += cmd_data_value
plot_filter_data['number_of_values'] += 1
return entry_data_plotfriendly, plot_filter_data
@click.command()
@click.option('--file', '-f', required=True)
@click.option('--collectl', '-c', required=False, default='collectl')
@click.option('--merge/--notmerge', default=True)
def main(file, collectl, merge):
@click.option('--coarsest/--notcoarsest', default=False)
@click.option('--filtercmd/--notfiltercmd', default=True)
@click.option('--filtervalue', '-v', type=int, default=90)
@click.option('--filtertype', '-t',
type=click.Choice(FILTER_FUNCTIONS, case_sensitive=False),
default=FILTER_FUNCTIONS[0])
def main(file, collectl, merge, coarsest, filtercmd, filtervalue, filtertype):
path = Path(file)
if path.exists():
data = parse_file(path, collectl)
if merge:
merge_same_commands(data)
for generator, settings in CONFIG.items():
getattr(plots_generators, generator)(data, **settings)
data, filter_data = parse_file(path, collectl, merge, coarsest)
if filtercmd:
filter_infos = getattr(filter_func, filtertype)(filter_data, filtervalue)
for plot_config in PLOT_CONFIG:
plot = getattr(plots_generators, plot_config['generator'])(data, filter_infos, **plot_config['settings'])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment