Aug-23-2018, 08:12 PM
Hi,
I was going over some data. To store results I manually added empty list attributes to a class as needed. In order to make this dynamic I added a dict attribute that would store the series name(key) and series data(value). This turned out to be slow for large data sets. Then I decided to add attributes dynamically with setattr. This was fast again, like manually setting attributes. But I don't understand why a dict would be that much slower. I think attributes are stored in a dict anyway, so storing my results in a dict would only add one more look-up to each cycle, perhaps doubling execution time. Is there a loop in a loop somewhere that I'm missing?
The examples below are on a 10.000 value data set. With 100.000 values the attr script runs in less than a second, the dict version might take ... an hour? I can't be bothered to wait till it's done.
I tried to make it all as short as possible, please bear with me.
The script reads commands from a text file and runs these for each new data value.
I was going over some data. To store results I manually added empty list attributes to a class as needed. In order to make this dynamic I added a dict attribute that would store the series name(key) and series data(value). This turned out to be slow for large data sets. Then I decided to add attributes dynamically with setattr. This was fast again, like manually setting attributes. But I don't understand why a dict would be that much slower. I think attributes are stored in a dict anyway, so storing my results in a dict would only add one more look-up to each cycle, perhaps doubling execution time. Is there a loop in a loop somewhere that I'm missing?
The examples below are on a 10.000 value data set. With 100.000 values the attr script runs in less than a second, the dict version might take ... an hour? I can't be bothered to wait till it's done.
I tried to make it all as short as possible, please bear with me.
The script reads commands from a text file and runs these for each new data value.
# Operation data source destination parameter simple_moving_average data sma250 250 simple_moving_average sma250 sma500 500Here is using a dict:
#!/usr/bin/env python3
class DataFrame:
def __init__(self):
self.data = {}
def load_cmd_register(self, register):
self.cmd_register = []
labels = ['operation', 'source', 'destination', 'parameter']
with open(register) as reg:
for line in reg:
if line.startswith('#'):
continue
elif not line.strip():
continue
reg = dict(zip(labels, line.split()))
reg['parameter'] = int(reg['parameter'])
self.cmd_register.append(reg)
def cycle_cmd_register(self):
for reg in self.cmd_register:
op = getattr(self, reg['operation'])
src = reg['source']
dest = reg['destination']
param = reg['parameter']
op(src, dest, param)
def add_value(self, label, value):
try:
self.data[label].append(value)
except KeyError:
self.data[label] = [value]
def simple_moving_average(self, src, dest, window):
try:
cumsum = self.data[dest+'_cumsum'][-1] + self.data[src][-1]
except KeyError:
cumsum = self.data[src][0]
self.add_value(dest+'_cumsum', cumsum)
len_cumsum = len(self.data[dest+'_cumsum'][:])
if len_cumsum <= window:
sma = self.data[dest+'_cumsum'][-1] / len_cumsum
else:
sma = (self.data[dest+'_cumsum'][-1] \
-self.data[dest+'_cumsum'][-window - 1]) / window
self.add_value(dest, sma)
def read_line(fname):
with open(fname) as f:
for data in f:
yield float(data)
register = 'cmd_register.conf'
df = DataFrame()
df.load_cmd_register(register)
fname = 'data2.txt'
for value in read_line(fname):
df.add_value('data', value)
df.cycle_cmd_register()$ python3 -m cProfile with_dict.py
180054 function calls in 2.803 seconds
Ordered by: standard name
ncalls tottime percall cumtime percall filename:lineno(function)
2 0.000 0.000 0.000 0.000 _bootlocale.py:23(getpreferredencoding)
2 0.000 0.000 0.000 0.000 codecs.py:259(__init__)
2 0.000 0.000 0.000 0.000 codecs.py:308(__init__)
14 0.000 0.000 0.000 0.000 codecs.py:318(decode)
10000 0.052 0.000 2.732 0.000 with_dict.py:20(cycle_cmd_register)
50000 0.050 0.000 0.059 0.000 with_dict.py:28(add_value)
1 0.024 0.024 2.803 2.803 with_dict.py:3(<module>)
1 0.000 0.000 0.000 0.000 with_dict.py:3(DataFrame)
20000 2.607 0.000 2.668 0.000 with_dict.py:34(simple_moving_average)
1 0.000 0.000 0.000 0.000 with_dict.py:4(__init__)
10001 0.039 0.000 0.039 0.000 with_dict.py:50(read_line)
1 0.000 0.000 0.000 0.000 with_dict.py:7(load_cmd_register)
14 0.000 0.000 0.000 0.000 {built-in method _codecs.utf_8_decode}
2 0.000 0.000 0.000 0.000 {built-in method _locale.nl_langinfo}
1 0.000 0.000 0.000 0.000 {built-in method builtins.__build_class__}
1 0.000 0.000 2.803 2.803 {built-in method builtins.exec}
20000 0.012 0.000 0.012 0.000 {built-in method builtins.getattr}
20000 0.011 0.000 0.011 0.000 {built-in method builtins.len}
2 0.000 0.000 0.000 0.000 {built-in method io.open}
49997 0.009 0.000 0.009 0.000 {method 'append' of 'list' objects}
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
2 0.000 0.000 0.000 0.000 {method 'split' of 'str' objects}
5 0.000 0.000 0.000 0.000 {method 'startswith' of 'str' objects}
4 0.000 0.000 0.000 0.000 {method 'strip' of 'str' objects}Here is using attributes:#!/usr/bin/env python3
class DataFrame:
def load_cmd_register(self, register):
self.cmd_register = []
labels = ['operation', 'source', 'destination', 'parameter']
with open(register) as reg:
for line in reg:
if line.startswith('#'):
continue
elif not line.strip():
continue
reg = dict(zip(labels, line.split()))
reg['source'] = reg['source'].split(',')
reg['destination'] = reg['destination'].split(',')
reg['parameter'] = [int(p) for p in reg['parameter'].split(',')]
self.cmd_register.append(reg)
def cycle_cmd_register(self):
for cmd in self.cmd_register:
op = getattr(self, cmd['operation'])
src = cmd['source']
dest = cmd['destination']
param = cmd['parameter']
op(*src, *dest, *param)
def add_value(self, label, value):
try:
attr = getattr(self, label)
attr.append(value)
except AttributeError:
setattr(self, label, [value])
def simple_moving_average(self, src, dest, window):
try:
src_attr = getattr(self, src)
dest_attr = getattr(self, dest+'_cumsum')
cumsum = src_attr[-1] + dest_attr[-1]
except AttributeError:
cumsum = src_attr[0]
self.add_value(dest+'_cumsum', cumsum)
dest_attr = getattr(self, dest+'_cumsum')
len_cumsum = len(dest_attr)
if len_cumsum <= window:
sma = dest_attr[-1] / len_cumsum
else:
sma = (dest_attr[-1] - dest_attr[-window - 1]) / window
self.add_value(dest, sma)
def read_line(fname):
with open(fname) as f:
for data in f:
yield float(data)
register = 'cmd_register.conf'
df = DataFrame()
df.load_cmd_register(register)
fname = 'data2.txt'
for value in read_line(fname):
df.add_value('data', value)
df.cycle_cmd_register()$ python3 -m cProfile with_attr.py
290066 function calls in 0.199 seconds
Ordered by: standard name
ncalls tottime percall cumtime percall filename:lineno(function)
2 0.000 0.000 0.000 0.000 _bootlocale.py:23(getpreferredencoding)
2 0.000 0.000 0.000 0.000 codecs.py:259(__init__)
2 0.000 0.000 0.000 0.000 codecs.py:308(__init__)
14 0.000 0.000 0.000 0.000 codecs.py:318(decode)
2 0.000 0.000 0.000 0.000 with_attr.py:16(<listcomp>)
10000 0.031 0.000 0.166 0.000 with_attr.py:19(cycle_cmd_register)
50000 0.033 0.000 0.054 0.000 with_attr.py:27(add_value)
1 0.012 0.012 0.199 0.199 with_attr.py:3(<module>)
1 0.000 0.000 0.000 0.000 with_attr.py:3(DataFrame)
20000 0.064 0.000 0.129 0.000 with_attr.py:34(simple_moving_average)
1 0.000 0.000 0.000 0.000 with_attr.py:4(load_cmd_register)
10001 0.010 0.000 0.010 0.000 with_attr.py:53(read_line)
14 0.000 0.000 0.000 0.000 {built-in method _codecs.utf_8_decode}
2 0.000 0.000 0.000 0.000 {built-in method _locale.nl_langinfo}
1 0.000 0.000 0.000 0.000 {built-in method builtins.__build_class__}
1 0.000 0.000 0.199 0.199 {built-in method builtins.exec}
130000 0.039 0.000 0.039 0.000 {built-in method builtins.getattr}
20000 0.003 0.000 0.003 0.000 {built-in method builtins.len}
5 0.000 0.000 0.000 0.000 {built-in method builtins.setattr}
2 0.000 0.000 0.000 0.000 {built-in method io.open}
49997 0.007 0.000 0.007 0.000 {method 'append' of 'list' objects}
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
8 0.000 0.000 0.000 0.000 {method 'split' of 'str' objects}
5 0.000 0.000 0.000 0.000 {method 'startswith' of 'str' objects}
4 0.000 0.000 0.000 0.000 {method 'strip' of 'str' objects}
