Apr-07-2020, 02:44 PM
Hi everyone;
I have this code, made for many parts of code, for preprocess and detected unique word on whatsapp group conversation:
Dictionary on .CSV
Word_Comp Replace
yronadura tronadura
vulzanizado vulcanizado
vomo como
viernescuántas viernes
via vía
venian venían
vel velocidad
vdfcon vdf
varuadores variadores
vamps vamos
vamiones camiones
I have this code, made for many parts of code, for preprocess and detected unique word on whatsapp group conversation:
import re
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.patches as patches
from PIL import Image
from collections import Counter
from matplotlib.offsetbox import AnchoredText
from mpl_toolkits.axes_grid1 import make_axes_locatable
# PROCESING DATA
def import_data(file, path = ''):
""" Import whatsapp data and transform it to a dataframe
Parameters:
-----------
file : str
Name of file including its extension.
path : str, default ''
Path to file without the file name.
Keep it empty if the file is in the
working directory.
Returns:
--------
df : dataframe
Dataframe of all messages
"""
with open(path + file, encoding = 'utf-8') as outfile:
raw_text = outfile.readlines()
messages = {}
for message in raw_text:
# Some messages are not sent by the user,
# but are simply comments and therefore need to be removed
try:
name = message.split(' - ')[1].split(':')[0]
except:
continue
# Add name to dictionary if it exists
if name in messages:
messages[name].append(message)
else:
messages[name] = [message]
# Convert dictionary to dataframe
df = pd.DataFrame(columns=['Message_Raw', 'User'])
for name in messages.keys():
df = df.append(pd.DataFrame({'Message_Raw': messages[name], 'User': name}))
df.reset_index(inplace=True)
return df
def clean_message(row):
"""
Try to extract name, if not possible then
somebody didn't write a message but changed
the avatar of the group.
"""
name = row.User + ': '
try:
return row.Message_Raw.split(name)[1][:-1]
except:
return row.Message_Raw
def remove_inactive_users(df, min_messages=10):
""" Removes inactive users or users that have
posted very few messages.
Parameters:
-----------
df : pandas dataframe
Dataframe of all messages
min_messages: int, default 10
Number of minimum messages that a user must have
Returns:
--------
df : pandas dataframe
Dataframe of all messages
"""
# Remove users that have not posted more than min_messages
to_keep = df.groupby('User').count().reset_index()
to_keep = to_keep.loc[to_keep['Message_Raw'] >= min_messages, 'User'].values
df = df[df.User.isin(to_keep)]
return df
def preprocess_data(df, min_messages=10):
""" Preprocesses the data by executing the following steps:
* Import data
* Create column with only message, not date/name etc.
* Create column with only text message, no smileys etc.
* Remove inactive users
* Remove indices of images
Parameters:
-----------
df : pandas dataframe
Raw data in pandas dataframe format
min_messages : int, default 10
Number of minimum messages each user needs
to have posted else they are removed.
Returns:
--------
df : pandas dataframe
Dataframe of all messages
"""
# Create column with only message, not date/name etc.
df['Message_Clean'] = df.apply(lambda row: clean_message(row), axis = 1)
# Create column with only text message, no smileys etc.
df['Message_Only_Text'] = df.apply(lambda row: re.sub(r'[^a-zA-Z ñáéíóúÑÁÉÍÓÚ]+', '',
row.Message_Clean.lower()),
axis = 1)
# Remove inactive users
df = remove_inactive_users(df, min_messages)
# Remove indices of images
indices_to_remove = list(df.loc[df.Message_Clean.str.contains('|'.join(['<', '>'])),
'Message_Clean'].index)
df = df.drop(indices_to_remove)
# Extract Time
df['Date'] = df.apply(lambda row: row['Message_Raw'].split(' - ')[0], axis = 1)
if '/' in str(df.iloc[df.index[0]].Date):
df['Date'] = pd.to_datetime(df['Date'], format="%d/%m/%y %H:%M")
else:
if ',' in str(df.iloc[df.index[0]].Date):
df['Date'] = pd.to_datetime(df['Date'], format="%d-%m-%y, %H:%M")
else:
df['Date'] = pd.to_datetime(df['Date'], format="%d-%m-%y %H:%M")
# Extact Day of the Week
df['Hour'] = df.apply(lambda row: row.Date.hour, axis = 1)
df['Day_of_Week'] = df.apply(lambda row: row.Date.dayofweek, axis = 1)
# Sort values by date to keep order
df.sort_values('Date', inplace=True)
return df
df = import_data('Chat de WhatsApp con Operaciones GCHC.txt')
df = preprocess_data(df)
# PROCESING COUNT WORD PER USER
def count_words_per_user(df, sentence_column = "Message_Only_Text", user_column = "User"):
""" Creates a count vector for each user in which
the occurence of each word is count over all
documents for that user.
Parameters:
-----------
df : pandas dataframe
Dataframe of all messages
sentence_column : string, default 'Message_Only_Text'
Name of the column of which you want to
create a word count
user_column : string, default 'User'
Name of the column that specifies the user
Returns:
--------
df : pandas dataframe
Dataframe counts per word per user
"""
# Creating a dataframe with all words
counts = list(Counter(" ".join(list(df[sentence_column])).split(" ")).items())
counts = [word[0] for word in counts]
counts = pd.DataFrame(counts, columns = ['Word'])
counts = counts.drop(0)
# Adding counts of each user to the dataframe
for user in df.User.unique():
count_temp = list(Counter(" ".join(list(df.loc[df[user_column] == user,
'Message_Only_Text'])).split(" ")).items())
counts[user] = 0
for word, count in count_temp:
counts.loc[counts['Word'] == word, user] = count
counts = counts[counts.Word.str.len() > 1]
return counts
def remove_stopwords(df, file, path='', column = "Word"):
""" Remove stopwords from a dataframe choosing
a specific column in which to remove those words
Parameters:
-----------
df : pandas dataframe
Dataframe of counts per word per user
file : string
Name of file that contains the stopwords
path : string, default ''
Path of the file that contains the stopwords
column : string, default 'Word'
Column to clean
Returns:
--------
df : pandas dataframe
Dataframe of counts per word per user
excluding the stopwords
"""
# Remove stopwords
with open(path + file) as stopwords:
stopwords = stopwords.readlines()
stopwords = [word[:-1] for word in stopwords]
df = df[~df[column].isin(stopwords)]
return df
def get_unique_words(counts, df_raw, version):
""" Get a list of unique words
The dataframe needs be structured as follows:
First column is called "Word" and contains a certain word
Any following columns are named as the users and contain the
count of each word.
| | Word | Tim | Nadia |
| 1 | pride | 0 | 1 |
| 2 | groceries | 2 | 9 |
etc.
Formulas:
t_user = Number of times word t said by user
t_all = Number of times word t said by all users
sum_messages = Number of all messages
messages_user = Number of messages user has send
sum_words = Number of all words
words_user = Number of words user has send
Version A
TF_IDF = ((t_user+1)^2 / t_all) * (sum_messages / messages_user)
Version B
TF_IDF = ((t_user+1)^2 / t_all) * (sum_words / words_user)
Version C
TF_IDF = (t_user + 1) / (words_user + 1) * log(sum_messages / t_all)
Parameters:
-----------
counts : pandas dataframe
Dataframe of counts per word per user
df_raw : pandas dataframe
Dataframe of raw messages
version : string
Which formula to use (A, B, C)
Returns:
--------
df_words : pandas dataframe
Dataframe tf_idf scores per word per user and unique value
"""
df_words = counts.copy()
# Number of messages by i
nr_messages = {user: len(df_raw[df_raw.User == user]) for user in df_words.columns[1:]}
nr_users = len(nr_messages.keys())
nr_words = {user: np.sum(df_words[user]) for user in df_words.columns[1:]}
total = sum(nr_messages.values())
# Calculate TF_IDF based on the version
for user in nr_messages.keys():
df_words[user+"_TF_IDF"] = df_words.apply(lambda row: tf_idf(row, user,
nr_users, nr_words,
nr_messages, version=version),
axis = 1)
# TF_IDF divided by each other so we can see the relative importance
for user in nr_messages.keys():
df_words[user+"_Unique"] = df_words.apply(lambda row: word_uniqueness(row,
nr_users,
user),
axis = 1)
return df_words
def tf_idf(row, user, nr_users, nr_words, nr_messages, version):
""" Used as a lambda function inside get_unique_words() to
get the tf_idf scores based on one of three formulas
Formulas:
t_user = Number of times word t said by user
t_all = Number of times word t said by all users
sum_messages = Number of all messages
messages_user = Number of messages user has send
sum_words = Number of all words
words_user = Number of words user has send
Version A
TF_IDF = ((t_user+1)^2 / t_all) * (sum_messages / messages_user)
Version B
TF_IDF = ((t_user+1)^2 / t_all) * (sum_words / words_user)
Version C
TF_IDF = (t_user + 1) / (words_user + 1) * log(sum_messages / t_all)
"""
# TF_IDF = (t_user^2 / t_all) * (sum of messages / messages by user)
if version == "A":
t_user = row[user]
t_all = np.sum(row.iloc[1:nr_users+1])
sum_messages = sum(nr_messages.values())
messages_user = nr_messages[user]
tf_idf = (np.square(t_user + 1) / (t_all)) * (sum_messages / messages_user)
return tf_idf
# TF_IDF = (t_user^2 / t_all) * (sum of words / words by user)
elif version == "B":
t_user = row[user]
t_all = np.sum(row.iloc[1:nr_users+1])
sum_words = sum(nr_words.values())
words_user = nr_words[user]
tf_idf = (np.square(t_user + 1) / (t_all)) * (sum_words / words_user)
return tf_idf
# TF_IDF = (t_user / words_user) * log(sum of messages / t_all)
elif version == "C":
t_user = row[user]
words_user = nr_words[user]
sum_messages = sum(nr_messages.values())
t_all = np.sum(row.iloc[1:nr_users+1])
tf_idf = (t_user + 1 / words_user + 1) * np.log(sum_messages / t_all)
return tf_idf
def word_uniqueness(row, nr_users, user):
""" Used as a lambda function in function get_unique_words()
Formula:
word_uniqueness = tf_idf_user / (tf_idf_all - tf_idf_user)
"""
tf_idf_user = row[user+"_TF_IDF"]
tf_idf_all = np.sum(row.iloc[nr_users+1: 2*nr_users+1])
with np.errstate(divide='ignore'):
unique_value_user = np.divide(tf_idf_user,
(tf_idf_all - tf_idf_user))
return unique_value_user
def plot_unique_words(df_unique, user, image_path=None, image_url=None, save_name=None, save_path="",
title=" ", title_color="white", title_background="black", font=None,
width=None, height=None):
"""
Parameters:
-----------
df_unique : dataframe
Dataframe containing a column "Word" and a column
user+"_Unique" that describes how unique a word is
by simply giving a floating value
user : string
The name of the user which is the user in the column user+"_Unique"
image_path : string with // to the path
Path to the picture you want to use
image_url : string
Url to the image you want to use
save_name : string
If you want to save the name then simply set a name without extension
save_path : string
Where you want to store the image
title : string
Title of the plot
title_color : string
Color of the title
title_background : string
Color of the background box of the title
font : string
Family font to use (make sure to check if you have it installed)
width : integer or float
Width of the plot (will also resize the image)
height : integer or float
Height of the plot (will also resize the image)
"""
# Set font to be used
if font:
font = {'fontname':font}
else:
font = {'fontname':'Comic Sans MS'}
# Background image to be used, black if nothing selected
if image_path:
img=mpimg.imread(image_path)
img = Image.open(image_path)
elif image_url:
img = Image.open(requests.get(url, stream=True).raw)
else:
img = np.zeros([100,100,3],dtype=np.uint8)
img.fill(0)
if width and height:
img = img.resize((width, height))
else:
# Get size of image
width = img.shape[1]
height = img.shape[0]
# Prepare data for plotting
# to_plot = get_unique_words(counts, df_raw, version = 'C')
to_plot = df_unique.sort_values(by=user+'_Unique', ascending=True)
to_plot = to_plot.tail(10)[['Word', user+'_Unique']].copy()
# Create left part of graph ('top') and right part which overlays
# the image ('bottom')
to_plot['top'] = (to_plot[user+'_Unique'] * (width*0.99) ) / max(to_plot[user+'_Unique'])
to_plot['bottom'] = width - to_plot['top']
# Create the steps of the bars based on the height of the image
steps = height/len(to_plot)
y_pos = [(height/len(to_plot)/2) + (i * steps) for i in range(0, len(to_plot))]
# Plot figure
fig, ax = plt.subplots()
# First plot the image
plt.imshow(img, extent=[0, width*0.99, 0, height], zorder=1)
# Then plot the right part which covers up the right part of the picture
ax.barh(y_pos, to_plot['bottom'], left=to_plot['top'],height=steps, color='w',align='center',
alpha=1,lw=2, edgecolor='w', zorder=2)
# Finally plot the bar which is fully transparent aside from its edges
ax.barh(y_pos, to_plot['top'], height=steps, fc=(1, 0, 0, 0.0), align='center',lw=2,
edgecolor='white',zorder=3)
# Remove ticks
ax.yaxis.set_ticks_position('none')
ax.xaxis.set_ticks_position('none')
# Set labels and location y-axis
ax.set_yticks(y_pos)
ax.set_yticklabels(list(to_plot['Word'].values), fontsize=18,**font)
ax.set_ylim(top=height)
# Make them with to remove any image line that may be left
ax.spines['top'].set_color('white')
ax.spines['right'].set_color('white')
# Remove the left and bottom axis
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)
# Add a small patch that removes some of the extra background at the top
ax.add_patch(patches.Rectangle((0,height),width, 20,facecolor='white',linewidth = 0, zorder=3))
# Add left and bottom lines
plt.axvline(0, color='black', ymax=1, lw=5, zorder=4)
plt.axvline(width, color='white', ymax=1, lw=5, zorder=5)
plt.axhline(0, color='black', xmax=1, lw=5, zorder=6)
plt.axhline(height, color=title_background, xmax=1, lw=3, zorder=7)
# Create Title Box
# This might be a temporary solution as
# makes_axes_locatable might lose its functionality
divider = make_axes_locatable(ax)
cax = divider.append_axes("top", size="9%", pad=None)
cax.get_xaxis().set_visible(False)
cax.get_yaxis().set_visible(False)
at = AnchoredText(title, loc=10, pad=0,
prop=dict(backgroundcolor=title_background,
size=23, color=title_color, **font))
cax.add_artist(at)
cax.set_facecolor(title_background)
cax.spines['left'].set_visible(False)
cax.spines['bottom'].set_visible(False)
cax.spines['right'].set_visible(False)
cax.spines['top'].set_visible(False)
fig.set_size_inches(10, 10)
if save_name:
plt.savefig(save_path+save_name+'.png', dpi = 300)
def print_users(df):
print("#" * (len('Users')+8))
print("## " + 'Users' + " ##" )
print("#" * (len('Users')+8))
print()
for user in df.User.unique():
print(user)Now i want to replace specific word with writte on .csv file, i try with this code:def check(df, file='dictionary.csv', column = "Word"):
dictionary = pd.read_csv(file)
read_words = pd.DataFrame(dictionary, columns=['Word_Comp'])
replace_word = pd.DataFrame(dictionary, columns=['Replace'])
if df[column] in read_words():
df.replace(to_replace=column, value=replace_word)
else:
returnbut dosent work. Please HelpDictionary on .CSV
Word_Comp Replace
yronadura tronadura
vulzanizado vulcanizado
vomo como
viernescuántas viernes
via vía
venian venían
vel velocidad
vdfcon vdf
varuadores variadores
vamps vamos
vamiones camiones
