Mar-23-2020, 08:42 PM
I'm analyzing the speeches of several US presidents and I would like to make some sentiment analysis. However, the pre-processing part gets stuck on [*] in Google Colab and it doesn't pre-process the speeches. However I executed everything and everything worked well (no errors) before I added the
text_process(df['Text'])after at the top. However, I noticed that the text was not pre-processed so I decided to add that snippet of code at the top, but now it's stuck on [*] in Google Colab...
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from PIL import Image
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from google.colab import files
uploaded = files.upload()
df = pd.read_csv('train.csv')
df.head(10)
lemmatiser = WordNetLemmatizer()
# Defining a module for Text Processing
def text_process(tex):
# 1. Removal of Punctuation Marks
nopunct=[char for char in tex if char not in string.punctuation]
nopunct=''.join(nopunct)
# 2. Lemmatisation
a=''
i=0
for i in range(len(nopunct.split())):
b=lemmatiser.lemmatize(nopunct.split()[i], pos="v")
a=a+b+' '
tex.strip("[]") #res = str(test_list)[1:-1]
# 3. Removal of Stopwords
return [word for word in a.split() if word.lower() not
in stopwords.words('english')]
##################### STUCK HERE ############################
text_process(df['Text'])
y = df['Author']
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)
X = df['Text']
wordcloud1 = WordCloud().generate(X[0]) # for Andrew Jackson
wordcloud2 = WordCloud().generate(X[26]) # for Barack Obama
wordcloud3 = WordCloud().generate(X[75]) # for Bill Clinton
wordcloud4 = WordCloud().generate(X[114]) # for Donald Trump
wordcloud5 = WordCloud().generate(X[136]) # for Franklin D. Roosevelt
wordcloud6 = WordCloud().generate(X[185]) # for George H. W. Bush
wordcloud7 = WordCloud().generate(X[208]) # for George W. Bush
wordcloud8 = WordCloud().generate(X[247]) # for George Washington
wordcloud9 = WordCloud().generate(X[268]) # for Richard M. Nixon
wordcloud10 = WordCloud().generate(X[291]) # for Ronald Reagan
wordcloud11 = WordCloud().generate(X[350]) # for Thomas Jefferson
#print(X[0])
print(df['Author'][0])
plt.imshow(wordcloud1, interpolation='bilinear')
plt.show()
#print(X[1])
print(df['Author'][26])
plt.imshow(wordcloud2, interpolation='bilinear')
plt.show()
#print(X[3])
print(df['Author'][75])
plt.imshow(wordcloud3, interpolation='bilinear')
plt.show()
print(df['Author'][114])
plt.imshow(wordcloud4, interpolation='bilinear')
plt.show()
print(df['Author'][136])
plt.imshow(wordcloud5, interpolation='bilinear')
plt.show()
print(df['Author'][185])
plt.imshow(wordcloud6, interpolation='bilinear')
plt.show()
print(df['Author'][208])
plt.imshow(wordcloud7, interpolation='bilinear')
plt.show()
print(df['Author'][247])
plt.imshow(wordcloud8, interpolation='bilinear')
plt.show()
print(df['Author'][268])
plt.imshow(wordcloud9, interpolation='bilinear')
plt.show()
print(df['Author'][291])
plt.imshow(wordcloud10, interpolation='bilinear')
plt.show()
print(df['Author'][350])
plt.imshow(wordcloud11, interpolation='bilinear')
plt.show()
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
# 80-20 splitting the dataset (80%->Training and 20%->Validation)
X_train, X_test, y_train, y_test = train_test_split(X, y
,test_size=0.2, random_state=42)
# defining the bag-of-words transformer on the text-processed corpus # i.e., text_process() declared in II is executed...
bow_transformer=CountVectorizer(analyzer=text_process).fit(X_train)
# transforming into Bag-of-Words and hence textual data to numeric..
text_bow_train=bow_transformer.transform(X_train)#ONLY TRAINING DATA
# transforming into Bag-of-Words and hence textual data to numeric..
text_bow_test=bow_transformer.transform(X_test)#TEST DATA
# instantiating the model with Multinomial Naive Bayes..
model = MultinomialNB()
# training the model...
model = model.fit(text_bow_train, y_train)
# Training Accuracy
model.score(text_bow_train, y_train)
# Test Accuracy
model.score(text_bow_test, y_test)
# Getting the predictions of the Test Set...
predictions = model.predict(text_bow_test)
# Getting the Precision, Recall, F1-Score
print(classification_report(y_test,predictions))
# Defining a module for Confusion Matrix...
def plot_confusion_matrix(cm, classes,normalize=False,title='Confusion matrix',cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting \normalize=True`.`
"""
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')
# print(cm)
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, format(cm[i, j], fmt),
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
cm = confusion_matrix(y_test,predictions)
plt.figure(figsize=(20,10))
plot_confusion_matrix(cm, classes=[0,26,75,114,136,185,208,247,268,291,350], normalize=True,title='Confusion Matrix')
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
def sentiment_analyzer_scores(sentence):
score = analyser.polarity_scores(sentence)
return ("{:-<40} {}".format(sentence, str(score)))
df.loc[df['Author'].str.contains('George Washington'), 'Year'] = 1789
df.loc[df['Author'].str.contains('Thomas Jefferson'), 'Year'] = 1801
df.loc[df['Author'].str.contains('Andrew Jackson'), 'Year'] = 1829
df.loc[df['Author'].str.contains('Franklin D. Roosevelt'), 'Year'] = 1933
df.loc[df['Author'].str.contains('Richard M. Nixon"'), 'Year'] = 1969
df.loc[df['Author'].str.contains('Ronald Reagan'), 'Year'] = 1981
df.loc[df['Author'].str.contains('George H. W. Bush'), 'Year'] = 1989
df.loc[df['Author'].str.contains('Bill Clinton'), 'Year'] = 1993
df.loc[df['Author'].str.contains('George W. Bush'), 'Year'] = 2001
df.loc[df['Author'].str.contains('Barack Obama'), 'Year'] = 2009
df.loc[df['Author'].str.contains('Donald Trump'), 'Year'] = 2017
df.Year = df.Year.fillna(1969)
d = pd.Series(df.Author.values,index=df.Year).to_dict()
for key in sorted(d.keys()):
print("%s: %s" % (key, d[key]))
dfAndrewJackson = df[['Text']].copy()
# Delete row at index from 26 to 375
dfAndrewJackson_clean = dfAndrewJackson.drop(dfAndrewJackson.index[26:375])
years = ['1834/04/15', '1834/04/21', '1834/12/01', '1833/12/03', '1832/12/04',
'1836/12/05', '1830/12/06', '1831/12/06', '1832/12/06', '1835/12/07',
'1829/12/08', '1832/12/10', '1833/12/12', '1836/12/21', '1832/02/15',
'1831/02/22', '1833/01/16', '1832/07/10', '1829/03/04', '1833/03/04',
'1837/03/04', '1830/05/06', '1829/05/11', '1830/05/27', '1830/10/05',
'1833/09/18']
print(len(years))
dfAndrewJackson_clean['Speech_Date'] = years
print(dfAndrewJackson_clean.shape)
print(dfAndrewJackson_clean)
dict_AndrewJackson = pd.Series(dfAndrewJackson_clean.Text.values,index=dfAndrewJackson_clean.Speech_Date).to_dict()
for key in sorted(dict_AndrewJackson.keys()):
print("%s: %s" % (key, dict_AndrewJackson[key]))
l = []
for key in sorted(dict_AndrewJackson.keys()):
l.append(sentiment_analyzer_scores(dict_AndrewJackson[key]))
print(l)
print(len(l))
for i in range(len(l)):
print(l[i][-62:])
sentiments = pd.DataFrame(columns=['Neg', 'Neu', 'Pos'])
sentiments['Neg'] = [0.032, 0.033, 0.064, 0.011, 0.05, 0.014, 0.054, 0.049,
0.055, 0.021, 0.048, 0.066, 0.045, 0.094, 0.06, 0.046,
0.051, 0.055, 0.008, 0.063, 0.043, 0.07, 0.059, 0.071,
0.066, 0.092]
sentiments['Neu'] = [0.778, 0.842, 0.776, 0.873, 0.806, 0.853, 0.77, 0.864,
0.772, 0.83, 0.821, 0.767, 0.845, 0.792, 0.831, 0.768,
0.857, 0.782, 0.92, 0.834, 0.867, 0.785, 0.789, 0.805,
0.813, 0.711]
sentiments['Pos'] = [0.191, 0.126, 0.159, 0.117, 0.144, 0.133, 0.176, 0.087,
0.173, 0.149, 0.131, 0.167, 0.11, 0.114, 0.109, 0.186,
0.092, 0.163, 0.072, 0.103, 0.09, 0.145, 0.151, 0.124,
0.121, 0.198]
sentimentsWhat could be the issue? Any suggestions are appreciated. Thx. You can find my 'train.csv' file here: https://drive.google.com/file/d/1a5fbORQ...sp=sharing
