Nov-04-2019, 09:19 PM
I am new to Python. An example reviews code has single line reviews and runs well. Mine has multiple lines. I converted the csv file to tsv. The reviews file has 2 columns, Review and Liked. Liked contains 0 or 1, for 'not liked' or 'liked'. This is for natural language processing.
KeyError Traceback (most recent call last)
<ipython-input-8-0f0b9d7dcfd5> in <module>
21
22 # column : "Review", row ith
---> 23 review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
24
25 # convert all cases to lower cases
The rest of the code is
# Importing Libraries
import numpy as np
import pandas as pd
# Import dataset
dataset = pd.read_csv("../AfricanPride_b.txt", delimiter = '\t', error_bad_lines = False)
# library to clean data
import re
# Natural Language Tool Kit
import nltk
nltk.download('stopwords')
# to remove stopword
from nltk.corpus import stopwords
# for Stemming propose
from nltk.stem.porter import PorterStemmer
# Initialize empty array
# to append clean text
corpus = []
# 1000 (reviews) rows to clean
for i in range(0, 5000):
# column : "Review", row ith
review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
# convert all cases to lower cases
review = review.lower()
# split to array(default delimiter is " ")
review = review.split()
# creating PorterStemmer object to
# take main stem of each word
ps = PorterStemmer()
# loop for stemming each word
# in string array at ith row
review = [ps.stem(word) for word in review
if not word in set(stopwords.words('english'))]
# rejoin all string array elements
# to create back into a string
review = ' '.join(review)
# append each string to create
# array of clean text
corpus.append(review) This results in a KeyError. KeyError Traceback (most recent call last)
<ipython-input-8-0f0b9d7dcfd5> in <module>
21
22 # column : "Review", row ith
---> 23 review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
24
25 # convert all cases to lower cases
The rest of the code is
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# To extract max 1500 feature.
# "max_features" is attribute to
# experiment with to get better results
cv = CountVectorizer(max_features = 1500)
# X contains corpus (dependent variable)
X = cv.fit_transform(corpus).toarray()
# y contains answers if review
# is positive or negative
y = dataset.iloc[:, 1].values
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# To extract max 1500 feature.
# "max_features" is attribute to
# experiment with to get better results
cv = CountVectorizer(max_features = 1500)
# X contains corpus (dependent variable)
X = cv.fit_transform(corpus).toarray()
# y contains answers if review
# is positive or negative
y = dataset.iloc[:, 1].values
# Splitting the dataset into
# the Training set and Test set
from sklearn.model_selection import train_test_split
# experiment with "test_size"
# to get better results
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
# Fitting Random Forest Classification
# to the Training set
from sklearn.ensemble import RandomForestClassifier
# n_estimators can be said as number of
# trees, experiment with n_estimators
# to get better results
model = RandomForestClassifier(n_estimators = 501,
criterion = 'entropy')
model.fit(X_train, y_train)
# Predicting the Test set results
y_pred = model.predict(X_test)
y_pred
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm
