Aug-27-2018, 07:34 PM
I have attached a csv file where this data is being stored as a nested dataframe in a main dataframe which i cannot include in here. main_col is the column from the main dataframe that has the data in this csv file stored in it as a nested df. what I want to achieve is to measure the data density but i am getting an index positional error. The code I am currently using looks like below and I am not sure what is causing the problem.
testdata.csv (Size: 310 bytes / Downloads: 32)
import pandas as pd
df = pd.read_csv('test_data.csv')
def data_density(thresh=None):
counter = 0
counter_1 = 0
ix = []
for ixn, data in df.iterrows():
counter = counter + 1
total_matrix = data['main_col'].loc[:, 'A1']['Game1'].shape[0] * \
data['main_col'].loc[:, 'A1']['Game2'].shape[1] + \
data['main_col'].loc[:, 'A2']['Game1'].shape[0] * \
data['main_col'].loc[:, 'A2']['Game2'].shape[1]
total_values = data['main_col'].loc[:, 'A1']['Game1'].count().sum() + \
data['main_col'].loc[:, 'A2']['Game1'].count().sum()
if total_values != 0:
data_density = float(total_values) / float(total_matrix)
if data_density > threshold:
counter_1 = counter_1 + 1
ix.append(ixn)
ratio = float(counter_1) / counter
return ix, ratio
df3 = pd.DataFrame()
for i in range(80, 100, 5):
i = float(i) / 100
ix, ratio = data_density(thresh=i)
print('data density for', ratio, 'when threshold is:', i)
print(len(ix))
df = pd.DataFrame()
for j in range(0, len(ix)):
df2 = df[(df.index == ix[j])]
df = df.append(df2)
print(df)
df3 = df3.append(df)
