Jun-12-2018, 05:55 PM
I'm running python in a jupyter notebook, and it keeps crashing and giving me this error message. This is code I ran successfully last week! I haven't changed anything, but suddenly it won't work. I've updated jupyter, reset my computer multiple times, and checked the input files, but nothing has helped.
This is not code I wrote, but is an edited version of a topic modeling code from Github...I don't have the knowledge or experience to know where the errors are, but everything runs fine until it gets to [btm.run()]
Any ideas, or suggestions, are very much appreciated.
This is not code I wrote, but is an edited version of a topic modeling code from Github...I don't have the knowledge or experience to know where the errors are, but everything runs fine until it gets to [btm.run()]
Any ideas, or suggestions, are very much appreciated.
from collections import defaultdict
import operator
import os
import random
import time
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import os
from itertools import chain
from glob import glob
file_content = open("nippur input.txt").read()
tokens = nltk.word_tokenize(file_content)
from nltk.corpus import stopwords
stop_words = set(stopwords.words("nippurstopwords.txt"))
file1 = open("nippur input.txt")
line = file1.read()
words = line.split()
for r in words:
if not r in stop_words:
appendFile = open("filteredtext.txt","a")
appendFile.write(" "+r)
appendFile.close()
class BTM(object):
def __init__(self, data_path, alpha, beta, num_iter, num_topic, output_dir):
self.data_path = data_path
self.alpha = alpha
self.beta = beta
self.num_iter = num_iter
self.num_topic = num_topic
self.output_dir = output_dir
self.word2Id = {}
self.Id2Word = {}
self.vocab_size = 0
self.wordId_corpus = []
self.biterms_in_doc = [] #list of dictionaries long->int
self.num_doc_biterm = defaultdict(int)
self.biterms = [] #List of numbers
self.topic_biterm = []
self.topic_word_num = [] #list of lists
self.num_topic_biterm = []
self.biterm_sum = {} #Map from long to double
def get_file_reader(self, path = None):
if path is None:
path = self.data_path
f = open(path, 'r')
return f
def get_file_writer(self,path, append = False):
if append:
read_mode = 'a'
else:
read_mode = 'w'
g = open(os.path.join(self.output_dir, path), read_mode)
return g
def print_params(self):
params = ['alpha','beta','num_iter','num_topic','topic_word_num','num_topic_biterm','topic_biterm']
for param in params:
print(param,':',getattr(self, param))
print('-'*40)
def load_data(self):
f = self.get_file_reader()
for line in f.readlines():
words = line.split()
curr_doc = []
for word in words:
if word not in self.word2Id:
index = len(self.word2Id)
self.word2Id[word] = index
self.Id2Word[index] = word
curr_doc.append(self.word2Id[word])
self.wordId_corpus.append(curr_doc)
f.close()
self.num_doc_biterm = [0]*len(self.wordId_corpus)
def init_model(self):
for doc_number, doc in enumerate(self.wordId_corpus):
oneCop = defaultdict(int)
for word1 in doc:
for word2 in doc:
if(word1<word2):
item_num = word1*1000000+word2 #encoding the biterms
oneCop[item_num] +=1
self.biterms.append(item_num)
self.num_doc_biterm[doc_number] +=1
self.biterms_in_doc.append(oneCop)
self.vocab_size = len(self.word2Id)
self.topic_biterm = [0]*len(self.biterms)
self.topic_word_num = {j: {i:0 for i in range(self.num_topic)} for j in range(self.vocab_size)}
print(len(self.topic_word_num), len(self.topic_word_num[0]))
self.num_topic_biterm = [1]*self.num_topic
for biterm_index, biterm in enumerate(self.biterms):
topic_id = random.randint(0, self.num_topic-1)
#if biterm_index 5:
#print(biterm, biterm%1000000, biterm//1000000)
#print(self.topic_word_num)
self.topic_word_num[biterm%1000000][topic_id] +=1
self.topic_word_num[biterm//1000000][topic_id] +=1
self.topic_biterm[biterm_index] = topic_id
def save_topic_words(self, topic_word_num = 10):
writer = self.get_file_writer(path = 'model-final-topic-words.txt')
for topic_id in range(self.num_topic):
topic_line = {}
for word_id, word in enumerate(self.word2Id):
topic_line[word_id] = self.topic_word_num[word_id][topic_id]/ self.num_topic_biterm[topic_id] / 2
sorted_topic_line = sorted(topic_line.items(), key = operator.itemgetter(1) )
writer.write("Topic:"+str(topic_id) + '\n')
for topic_word,score in sorted_topic_line[:topic_word_num]:
writer.write("\t"+str(self.Id2Word[topic_word])+"\t"+str(score) + '\n')
writer.close()
def save_wordIds(self):
writer = self.get_file_writer(path = 'model-final-wordIds.txt')
for key,value in self.word2Id.items():
writer.write(str(key) + ' ' + str(value) + '\n')
writer.close()
def get_sum(self, biterm):
if biterm not in self.biterm_sum:
word1 = biterm//1000000
word2 = biterm%1000000
sum = 0
for topic_id in range(self.num_topic):
calculation = (self.num_topic_biterm[topic_id] + self.alpha) * (self.topic_word_num[word1][topic_id] + self.beta) * (self.topic_word_num[word2][topic_id] + self.beta) / ((2 * self.num_topic_biterm[topic_id] ) + (self.vocab_size * self.beta))**2
sum += calculation
self.biterm_sum[biterm] = sum
return self.biterm_sum[biterm]
def save_theta(self):
writer = self.get_file_writer(path = 'model-final-theta.txt')
for doc_index, line in enumerate(self.biterms_in_doc):
for topic_id in range(self.num_topic):
one_sum = 0
for key in line:
word1 = key//1000000
word2 = key%1000000
one_sum += ((line[key]/self.num_doc_biterm[doc_index]) * ((self.num_topic_biterm[topic_id] + self.alpha) * (self.topic_word_num[word1][topic_id] + self.beta) * (self.topic_word_num[word2][topic_id] + self.beta) / ((2 * self.num_topic_biterm[topic_id] ) + (self.vocab_size * self.beta))**2)/(self.get_sum(key)))
writer.write(str(one_sum) + " ")
writer.write('\n')
writer.close()
def save_phi(self):
writer = self.get_file_writer(path = 'model-final-phi.txt')
for topic_id in range(self.num_topic):
for word_id in self.Id2Word:
calculation = (self.topic_word_num[word_id][topic_id] + self.beta) / ((self.num_topic_biterm[topic_id] * 2) + (self.vocab_size * self.beta))
writer.write(str(calculation) + ' ')
writer.write('\n')
writer.close()
def build_model(self):
for it in range(self.num_iter):
start_time = time.time()
for biterm_index, old_topic_id in enumerate(self.topic_biterm):
word1 = self.biterms[biterm_index]//1000000
word2 = self.biterms[biterm_index]%1000000
self.topic_word_num[word1][old_topic_id] -=1
self.topic_word_num[word2][old_topic_id] -=1
self.num_topic_biterm[old_topic_id] -=1
new_topic_id = -1
p = [0]*self.num_topic
for k in range(self.num_topic):
p[k] = (self.num_topic_biterm[k] + self.alpha) * (self.topic_word_num[word1][k] + self.beta) * (self.topic_word_num[word2][k] + self.beta) / ((2 * self.num_topic_biterm[k] ) + (self.vocab_size * self.beta))**2
for k in range(1,self.num_topic):
p[k] += p[k-1]
u = random.random() * p[-1]
for k in range(self.num_topic):
if u < p[k]:
new_topic_id = k
break
self.topic_word_num[word1][new_topic_id] +=1
self.topic_word_num[word2][new_topic_id] +=1
self.num_topic_biterm[new_topic_id] += 1
self.topic_biterm[biterm_index] = new_topic_id
print('Finished iteration:', it, 'Time taken:' + str(time.time()-start_time))
def save_result(self):
self.save_topic_words(20)
self.save_theta()
self.save_wordIds()
self.save_phi()
def run(self):
self.load_data()
self.init_model()
self.build_model()
self.save_result()
btm = BTM(data_path='../Topic Modeling/filteredtext.txt',alpha=2,beta=0.001, num_iter=10, num_topic=10, output_dir='.')
btm.run()
btm.save_result()
