I am building code that reads data from excel files into two different pandas dataframes, and performs a fast fuzzy match (fuzzy wuzzy and excel functions take far too long) comparing values from one dataframe to another. I am getting a "KeyError: 0" somewhere along the way that is preventing the rest of my code from running. I'm quite new to python and have no idea where the error is happening, any help would be much appreciated. If you have any other recommendations for optimizing my code I am open to other routes (comparing 1-20k items to a reference of 300k), the full code is only 160 lines if required to assess the issue.
I understand the error "KeyError: 0" means that I am trying to access the column 0 of a dataframe and that it doesn't exist. I'm just not sure where it's occurring, whether to use .loc to fix this or ensure that I am using the right column names everywhere..
Here is where I think the error is happening:
Full Code for Review:
I understand the error "KeyError: 0" means that I am trying to access the column 0 of a dataframe and that it doesn't exist. I'm just not sure where it's occurring, whether to use .loc to fix this or ensure that I am using the right column names everywhere..
Here is where I think the error is happening:
def get_matches_df(sparse_matrix, name_vector, top=100):
non_zeros = sparse_matrix.nonzero()
sparserows = non_zeros[0]
sparsecols = non_zeros[1]
if top:
nr_matches = top
else:
nr_matches = sparsecols.size
left_side = np.empty([nr_matches], dtype=object)
right_side = np.empty([nr_matches], dtype=object)
similarity = np.zeros(nr_matches)
for index in range(0, nr_matches):
left_side[index] = name_vector[sparserows[index]]
right_side[index] = name_vector[sparsecols[index]]
similarity[index] = sparse_matrix.data[index]
return pd.DataFrame({'left_side': left_side,
'right_side': right_side,
'similarity': similarity})
#########
matches_df = get_matches_df(matches, dfFindMatch, top=1000)
matches_df = matches_df[matches_df['similarity'] < 0.99999] # Remove all exact matches
# matches_df.sample(20)
matches_df.sort_values(['similarity'], ascending=False).head(20)
######################
product_desc_ref = dfReference['Product Description'].unique()
print('Vectorizing the data - this could take a few minutes for large datasets...')
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)
tfidf = vectorizer.fit_transform(product_desc_ref)
print('Vectorizing completed...')Here is the error:Error:C:...\Automation\venv\Scripts\python.exe C:.../Automation/main.py
SELF TIMED: 0.0
Traceback (most recent call last):
File "C:...\Automation\venv\lib\site-packages\pandas\core\indexes\base.py", line 3621, in get_loc
return self._engine.get_loc(casted_key)
File "pandas\_libs\index.pyx", line 136, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 163, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 5198, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 5206, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:...\Automation\main.py", line 113, in <module>
matches_df = get_matches_df(matches, dfFindMatch, top=1000)
File "C:...\Automation\main.py", line 102, in get_matches_df
left_side[index] = name_vector[sparserows[index]]
File "C:...\Automation\venv\lib\site-packages\pandas\core\frame.py", line 3505, in __getitem__
indexer = self.columns.get_loc(key)
File "C:...\Automation\venv\lib\site-packages\pandas\core\indexes\base.py", line 3623, in get_loc
raise KeyError(key) from err
KeyError: 0
Process finished with exit code 1Full Code for Review:
import pandas as pd
import numpy as np
import sparse_dot_topn.sparse_dot_topn as ct
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import time
from matplotlib import style
from ftfy import fix_text
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
pd.set_option('display.max_colwidth', 0)
style.use('fivethirtyeight')
cols = ['Product ID', 'Product Description']
# Pandas read xlsx
dfReference = pd.read_excel('Sample.xlsx', sheet_name='Reference', usecols=cols)
dfFindMatch = pd.read_excel('Sample.xlsx', sheet_name='Needs Product ID', usecols=cols)
######################
def ngrams(string, n=3):
string = str(string)
string = fix_text(string) # fix text
string = string.encode("ascii", errors="ignore").decode() # remove non ascii chars
string = string.lower()
chars_to_remove = [")", "(", ".", "|", "[", "]", "{", "}", "'"]
rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
string = re.sub(rx, '', string)
string = string.replace('&', 'and')
string = string.replace(',', ' ')
string = string.replace('-', ' ')
string = string.title() # normalise case - capital at start of each word
string = re.sub(' +', ' ', string).strip() # get rid of multiple spaces and replace with a single
string = ' ' + string + ' ' # pad names for ngrams...
string = re.sub(r'[,-./]|\sBD', r'', string)
ngrams = zip(*[string[i:] for i in range(n)])
return [''.join(ngram) for ngram in ngrams]
######################
def awesome_cossim_top(A, B, ntop, lower_bound=0):
# force A and B as a CSR matrix.
# If they have already been CSR, there is no overhead
A = A.tocsr()
B = B.tocsr()
M, _ = A.shape
_, N = B.shape
idx_dtype = np.int32
nnz_max = M * ntop
indptr = np.zeros(M + 1, dtype=idx_dtype)
indices = np.zeros(nnz_max, dtype=idx_dtype)
data = np.zeros(nnz_max, dtype=A.dtype)
ct.sparse_dot_topn(
M, N, np.asarray(A.indptr, dtype=idx_dtype),
np.asarray(A.indices, dtype=idx_dtype),
A.data,
np.asarray(B.indptr, dtype=idx_dtype),
np.asarray(B.indices, dtype=idx_dtype),
B.data,
ntop,
lower_bound,
indptr, indices, data)
return csr_matrix((data, indices, indptr), shape=(M, N))
product_desc = dfFindMatch['Product Description'].unique()
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(product_desc)
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.85)
t = time.time() - t1
print("SELF TIMED:", t)
#####################
def get_matches_df(sparse_matrix, name_vector, top=100):
non_zeros = sparse_matrix.nonzero()
sparserows = non_zeros[0]
sparsecols = non_zeros[1]
if top:
nr_matches = top
else:
nr_matches = sparsecols.size
left_side = np.empty([nr_matches], dtype=object)
right_side = np.empty([nr_matches], dtype=object)
similarity = np.zeros(nr_matches)
for index in range(0, nr_matches):
left_side[index] = name_vector[sparserows[index]]
right_side[index] = name_vector[sparsecols[index]]
similarity[index] = sparse_matrix.data[index]
return pd.DataFrame({'left_side': left_side,
'right_side': right_side,
'similarity': similarity})
#########
matches_df = get_matches_df(matches, dfFindMatch, top=1000)
matches_df = matches_df[matches_df['similarity'] < 0.99999] # Remove all exact matches
# matches_df.sample(20)
matches_df.sort_values(['similarity'], ascending=False).head(20)
######################
product_desc_ref = dfReference['Product Description'].unique()
print('Vectorizing the data - this could take a few minutes for large datasets...')
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)
tfidf = vectorizer.fit_transform(product_desc_ref)
print('Vectorizing completed...')
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)
product_column = 'Product Description' # column to match against in the messy data
unique_products = set(dfFindMatch[product_column].values) # set used for increased performance
# matching query:
def getNearestN(query):
queryTFIDF_ = vectorizer.transform(query)
distances, indices = nbrs.kneighbors(queryTFIDF_)
return distances, indices
t2 = time.time()
print('getting nearest n...')
distances, indices = getNearestN(unique_products)
t_2 = time.time() - t2
print("COMPLETED IN:", t_2)
unique_products = list(unique_products) # need to convert back to a list
print('finding matches...')
matches = []
for i, j in enumerate(indices):
temp = [round(distances[i][0], 2), dfReference.values[j][0][0], unique_products[i]]
matches.append(temp)
print('Building data frame...')
matches = pd.DataFrame(matches, columns=['Match Confidence (lower is better)', 'Matched Description',
'Original Description'])
print('Done')
matches.head(10)
