Commit 035549a8 authored by Julius Metz's avatar Julius Metz

rework merge, add filter functionality

parent 32565906
def hardvalue(filter_data, filtervalue):
    """Keep the biggest contributors until ``filtervalue`` percent is covered.

    Sorts commands from greatest to smallest value in relation to the
    overall value (percentage), then adds commands to the result until
    the accumulated percentage reaches ``filtervalue``.

    Arguments:
        filter_data {dict} -- data needed by the filter:
            'commands' maps cmd -> {head: total, 'number_of_values': n};
            each head key maps to the overall total for that head.
        filtervalue {int} -- percentage of the interesting values

    Returns:
        dict -- key = collectl head, value = list of commands that have
        not been filtered out
    """
    filter_info = {}
    tmp_sort_data = {}
    # Collect (percentage-of-overall, cmd) pairs per head key.
    for cmd, cmd_data in filter_data['commands'].items():
        for cmd_data_key, cmd_data_value in cmd_data.items():
            if cmd_data_key == 'number_of_values':
                continue
            overall = filter_data[cmd_data_key]
            if not overall:
                # Guard: an overall total of 0 would divide by zero; such a
                # head contributes nothing, same as a 0-percent entry.
                continue
            tmp_sort_data.setdefault(cmd_data_key, []).append((
                int(100 * cmd_data_value / overall),
                cmd,
            ))
    # Greedily take the largest contributors first until the threshold is hit.
    for key, values in tmp_sort_data.items():
        current_percent = 0
        for percent, cmd in sorted(values, key=lambda pair: pair[0], reverse=True):
            # Stop at zero-percent entries (everything after is 0 too) or
            # once the accumulated share already covers filtervalue.
            if percent == 0 or current_percent >= filtervalue:
                break
            current_percent += percent
            filter_info.setdefault(key, []).append(cmd)
    return filter_info
def average(filter_data, filtervalue):
    """Filter out commands whose average is below ``filtervalue`` percent
    of the overall average.

    Arguments:
        filter_data {dict} -- data needed by the filter:
            'commands' maps cmd -> {head: total, 'number_of_values': n};
            each head key maps to the overall total for that head;
            'number_of_values' is the overall sample count.
        filtervalue {int} -- minimum average a command needs, expressed as
            a percentage of the overall average, to be kept

    Returns:
        dict -- key = collectl head, value = list of commands that have
        not been filtered out
    """
    all_averages = {}
    filter_info = {}
    for cmd, cmd_data in filter_data['commands'].items():
        for cmd_data_key, cmd_data_value in cmd_data.items():
            if cmd_data_key == 'number_of_values':
                continue
            if cmd_data_key not in all_averages:
                # Lazily compute the overall average for this head once.
                all_averages[cmd_data_key] = (
                    filter_data[cmd_data_key] / filter_data['number_of_values']
                )
                filter_info[cmd_data_key] = []
            cmd_average = cmd_data_value / cmd_data['number_of_values']
            # Keep the command only when its average reaches the threshold
            # relative to the overall average.
            if int(100 * cmd_average / all_averages[cmd_data_key]) >= filtervalue:
                filter_info[cmd_data_key].append(cmd)
    return filter_info
...@@ -2,9 +2,9 @@ import plotly.io as pio ...@@ -2,9 +2,9 @@ import plotly.io as pio
import datetime import datetime
def cpu_plot(values, title=None, xtitle=None, ytitle=None): def build_plot(plot_data, title=None, xtitle=None, ytitle=None, showlegend=True, **kwargs):
plot = { plot = {
'data': [], 'data': plot_data,
'layout': { 'layout': {
'title': { 'title': {
'text': title 'text': title
...@@ -15,22 +15,45 @@ def cpu_plot(values, title=None, xtitle=None, ytitle=None): ...@@ -15,22 +15,45 @@ def cpu_plot(values, title=None, xtitle=None, ytitle=None):
'yaxis': { 'yaxis': {
'title': ytitle, 'title': ytitle,
}, },
'showlegend': True, 'showlegend': showlegend,
}, },
} }
for cmd, cmd_data in values.items(): return plot
CPU = cmd_data.get('PCT', [])
if not any(elem != '0' for elem in CPU):
continue
plot['data'].append(
def cpu_plot(cmds_data, filter_info, **kwargs):
needed_key = 'PCT'
plot_data = []
for cmd in filter_info[needed_key]:
plot_data.append(
{
'type': 'scatter',
'mode': 'markers',
'x': cmds_data[cmd].get('datetime', []),
'y': cmds_data[cmd].get(needed_key, []),
'name': cmd,
}
)
pio.show(build_plot(plot_data, **kwargs))
return build_plot(plot_data, **kwargs)
def ram_plot(cmds_data, filter_info, **kwargs):
plot_data = []
needed_key = 'VmRSS'
for cmd in filter_info[needed_key]:
cmd_needed_key_data = cmds_data[cmd].get(needed_key, [])
plot_data.append(
{ {
'type': 'scatter', 'type': 'scatter',
'mode': 'markers', 'mode': 'markers',
'x': cmd_data.get('datetime', []), 'x': cmds_data[cmd].get('datetime', []),
'y': CPU, 'y': [ value / 1024 / 1024 for value in cmd_needed_key_data],
'name': cmd, 'name': cmd,
} }
) )
pio.show(build_plot(plot_data, **kwargs))
return build_plot(plot_data, **kwargs)
pio.show(plot)
\ No newline at end of file
import re import re
import subprocess import subprocess
import shlex
from pathlib import Path from pathlib import Path
import datetime import datetime
import click import click
import plots_generators import plots_generators
import filter_func
SAME_COMMAND_REGEX = [
(r'.+ org.apache.spark.launcher.Main .+', 'org.apache.spark.launcher.Main'), FILTER_FUNCTIONS = ['hardvalue', 'average']
(r'.+ org.apache.spark.executor.CoarseGrainedExecutorBackend .+', 'org.apache.spark.executor.CoarseGrainedExecutorBackend'),
(r'.+ org.apache.spark.deploy.worker.Worker .+', 'org.apache.spark.deploy.worker.Worker'), NAME_SPEZIAL_PARAMETER_CONFIG = {
] 'java': ['-cp', '-classpath'],
MERGE_VALUES = ['PCT'] 'bash': [],
MERGE_IDENTIFIER = 'datetime' 'sh': [],
'perl': [],
COMAND_BLACKLIST_REGEX = [r'^/usr/bin/perl .+collectl'] 'python': ['-m', '-W', '-X', '--check-hash-based-pycs', '-c'],
HEAD_BLACKLIST = ['Time', 'Date']
CONFIG = {
'cpu_plot':{
'title': 'CPU load',
'xtitle': 'Date',
'ytitle': 'CPU load',
}
} }
MERGE_VALUES = ['PCT', 'VmRSS']
COMAND_BLACKLIST_REGEX = [
r'^[^ ]+perl .+collectl',
]
PLOT_CONFIG = [{
'generator': 'cpu_plot',
'settings': {
'title': 'CPU load',
'xtitle': 'Date',
'ytitle': 'CPU load',
},
},
{
'generator': 'ram_plot',
'settings': {
'title': 'Memory Usage',
'xtitle': 'Date',
'ytitle': 'RAM usage GiB',
},
},
]
def datestr2date(datestr): def datestr2date(datestr):
"""Converts a "datestring" to a date Object. """Converts a "datestring" to a date Object.
...@@ -42,93 +61,121 @@ def datestr2date(datestr): ...@@ -42,93 +61,121 @@ def datestr2date(datestr):
) )
def parse_file(path, collectl): def get_cmdname(cmd, coarsest=False):
"""search in complete commandstring the name of the skript or the command that is used
Arguments:
cmd {str} -- complete commandstring
Keyword Arguments:
coarsest {bool} -- return only the call function(example: bash, python) if True (default: {False})
Returns:
str -- new cmd name
"""
cmd_splited = shlex.split(cmd)
bash_function = cmd_splited[0].split('/')[-1]
bash_function = re.search(r'[^\W\n]+', bash_function).group(0)
spezial_parameter = NAME_SPEZIAL_PARAMETER_CONFIG.get(bash_function, None)
if coarsest or spezial_parameter == None:
return bash_function
skip = False
for position, parameter in enumerate(cmd_splited[1:]):
if skip:
skip = False
continue
if parameter in spezial_parameter:
skip = True
continue
if bash_function == 'bash' or bash_function == 'sh' and parameter == '-c':
return bash_function + ' -c'
#return shlex.join(cmd_splited[position+1:])
if parameter.startswith('-'):
continue
return parameter.split('/')[-1]
return cmd
def parse_file(path, collectl, merge, coarsest):
process = subprocess.run( process = subprocess.run(
[collectl, '-P', '-p', path, '-sZ'], capture_output=True, [collectl, '-P', '-p', path, '-sZ'], capture_output=True,
) )
output = process.stdout.decode().splitlines() output = process.stdout.decode().splitlines()
head = output.pop(0).split(' ') head = output.pop(0).split(' ')
for possible_head in output[:]:
if possible_head.startswith('#'):
head = possible_head.split(' ')
output.remove(possible_head)
else:
break
head[0] = head[0][1:] head[0] = head[0][1:]
head_indexes_dict = {head_title: index for index, head_title in enumerate(head)}
entrys_data = {} entrys_data = {}
tmp_date = None
tmp_time = None
for entry in output: for entry in output:
splited_entry = entry.split(' ', len(head)-1) splited_entry = entry.split(' ', len(head_indexes_dict)-1)
cmd = splited_entry[-1] cmd = splited_entry[-1]
for regexpr in COMAND_BLACKLIST_REGEX: for regexpr in COMAND_BLACKLIST_REGEX:
if re.search(regexpr, cmd): if re.search(regexpr, cmd):
break break
else: else:
if merge:
cmd = get_cmdname(cmd, coarsest=coarsest)
if not cmd in entrys_data: if not cmd in entrys_data:
entrys_data[cmd] = { entrys_data[cmd] = {}
head_elem : [] for head_elem in head[:-1] if head_elem not in HEAD_BLACKLIST tmp_datetime = datetime.datetime.combine(
} datestr2date(splited_entry[head_indexes_dict['Date']]),
entrys_data[cmd]['datetime'] = [] datetime.time.fromisoformat(splited_entry[head_indexes_dict['Time']]),
for i, head_elem in enumerate(head[:-1]):
if head_elem == 'Date':
tmp_date = datestr2date(splited_entry[i])
if head_elem == 'Time':
tmp_time = datetime.time.fromisoformat(splited_entry[i])
if not head_elem in HEAD_BLACKLIST:
entrys_data[cmd][head_elem].append(splited_entry[i])
entrys_data[cmd]['datetime'].append(
datetime.datetime.combine(tmp_date, tmp_time),
) )
return entrys_data if not tmp_datetime in entrys_data[cmd]:
entrys_data[cmd][tmp_datetime] = {
key: 0.0 for key in MERGE_VALUES
def merge_same_commands(data): }
"""checks which commands can be combined - this is done using SAME_COMMAND_REGEX for head_title in MERGE_VALUES:
than combine the commands and remove the old. entrys_data[cmd][tmp_datetime][head_title] += float(splited_entry[head_indexes_dict[head_title]])
MERGE_VALUES specify which variable keys are combined entry_data_plotfriendly = {}
MERGE_IDENTIFIER specifies the unique variable key where the merge is based on plot_filter_data = {key: 0.0 for key in MERGE_VALUES}
all variable keys that are not in MERGE_VALUES or MERGE_IDENTIFIER are left out and not transmitted! plot_filter_data['number_of_values'] = 0
plot_filter_data['commands'] = {}
Arguments: for cmd, cmd_data in entrys_data.items():
data {dict} -- data from collectl parsed plot_filter_data['commands'][cmd] = {key: 0 for key in MERGE_VALUES}
""" plot_filter_data['commands'][cmd]['number_of_values'] = 0
for command, cmd_data in list(data.items()): entry_data_plotfriendly[cmd] = {key: [] for key in MERGE_VALUES}
for regexpr, name in SAME_COMMAND_REGEX: entry_data_plotfriendly[cmd]['datetime'] = []
if re.search(regexpr, command): for cmd_data_time, cmd_data_values in cmd_data.items():
if not name in data: entry_data_plotfriendly[cmd]['datetime'].append(cmd_data_time)
data[name] = { for cmd_data_key, cmd_data_value in cmd_data_values.items():
key: cmd_data[key] for key in MERGE_VALUES + [MERGE_IDENTIFIER] entry_data_plotfriendly[cmd][cmd_data_key].append(cmd_data_value)
}
else: plot_filter_data['commands'][cmd][cmd_data_key] += cmd_data_value
for i, identifier in enumerate(cmd_data[MERGE_IDENTIFIER]): plot_filter_data['commands'][cmd]['number_of_values'] += 1
try: plot_filter_data[cmd_data_key] += cmd_data_value
index = data[name][MERGE_IDENTIFIER].index(identifier) plot_filter_data['number_of_values'] += 1
for key in MERGE_VALUES:
data[name][key][index] = float(data[name][key][index]) +\ return entry_data_plotfriendly, plot_filter_data
float(cmd_data[key][i])
except ValueError:
for key in MERGE_VALUES + [MERGE_IDENTIFIER]:
data[name][key].append(cmd_data[key][i])
data.pop(command, None)
break
@click.command() @click.command()
@click.option('--file', '-f', required=True) @click.option('--file', '-f', required=True)
@click.option('--collectl', '-c', required=False, default='collectl') @click.option('--collectl', '-c', required=False, default='collectl')
@click.option('--merge/--notmerge', default=True) @click.option('--merge/--notmerge', default=True)
def main(file, collectl, merge): @click.option('--coarsest/--notcoarsest', default=False)
@click.option('--filtercmd/--notfiltercmd', default=True)
@click.option('--filtervalue', '-v', type=int, default=90)
@click.option('--filtertype', '-t',
type=click.Choice(FILTER_FUNCTIONS, case_sensitive=False),
default=FILTER_FUNCTIONS[0])
def main(file, collectl, merge, coarsest, filtercmd, filtervalue, filtertype):
path = Path(file) path = Path(file)
if path.exists(): if path.exists():
data = parse_file(path, collectl) data, filter_data = parse_file(path, collectl, merge, coarsest)
if merge:
merge_same_commands(data)
for generator, settings in CONFIG.items():
getattr(plots_generators, generator)(data, **settings)
if filtercmd:
filter_infos = getattr(filter_func, filtertype)(filter_data, filtervalue)
for plot_config in PLOT_CONFIG:
plot = getattr(plots_generators, plot_config['generator'])(data, filter_infos, **plot_config['settings'])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment