Jupyter error - 'The kernel appears to have died, it will restart automatically'

meganhollie · Jun-12-2018, 05:55 PM

I'm running python in a jupyter notebook, and it keeps crashing and giving me this error message. This is code I ran successfully last week! I haven't changed anything, but suddenly it won't work. I've updated jupyter, reset my computer multiple times, and checked the input files, but nothing has helped.
This is not code I wrote, but is an edited version of a topic modeling code from Github...I don't have the knowledge or experience to know where the errors are, but everything runs fine until it gets to [btm.run()]

Any ideas, or suggestions, are very much appreciated.

from collections import defaultdict
import operator
import os
import random
import time
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import os
from itertools import chain
from glob import glob

file_content = open("nippur input.txt").read()
tokens = nltk.word_tokenize(file_content)

from nltk.corpus import stopwords
stop_words = set(stopwords.words("nippurstopwords.txt"))

file1 = open("nippur input.txt")
line = file1.read()
words = line.split()
for r in words:
    if not r in stop_words:
        appendFile = open("filteredtext.txt","a")
        appendFile.write(" "+r)
        appendFile.close()

class BTM(object):
    def __init__(self, data_path, alpha, beta, num_iter, num_topic, output_dir):
        self.data_path = data_path
        self.alpha = alpha
        self.beta = beta
        self.num_iter = num_iter
        self.num_topic = num_topic
        self.output_dir = output_dir
        
        self.word2Id = {}
        self.Id2Word = {}
        self.vocab_size = 0
        
        self.wordId_corpus = []
        
        self.biterms_in_doc = [] #list of dictionaries long->int
        self.num_doc_biterm = defaultdict(int)
        self.biterms = [] #List of numbers

        self.topic_biterm = []
        self.topic_word_num = [] #list of lists
        self.num_topic_biterm = []
        
        self.biterm_sum = {} #Map from long to double
        
    def get_file_reader(self, path = None):
        if path is None:
            path = self.data_path
        f = open(path, 'r')
        return f
    
    def get_file_writer(self,path, append = False):
        if append:
            read_mode = 'a'
        else:
            read_mode = 'w'
        g = open(os.path.join(self.output_dir, path), read_mode)
        return g
    
    def print_params(self):
        params = ['alpha','beta','num_iter','num_topic','topic_word_num','num_topic_biterm','topic_biterm']
        for param in params:
            print(param,':',getattr(self, param))
            print('-'*40)
    
    def load_data(self):
        f = self.get_file_reader()
        for line in f.readlines():
            words = line.split()
            curr_doc = []
            for word in words:
                if word not in self.word2Id:
                    index = len(self.word2Id)
                    self.word2Id[word] = index
                    self.Id2Word[index] = word
                curr_doc.append(self.word2Id[word])
            self.wordId_corpus.append(curr_doc)
        f.close()
        
        self.num_doc_biterm = [0]*len(self.wordId_corpus)
    
    def init_model(self):
        for doc_number, doc in enumerate(self.wordId_corpus):
            oneCop = defaultdict(int)
            for word1 in doc:
                for word2 in doc:
                    if(word1<word2):
                        item_num = word1*1000000+word2 #encoding the biterms
                        oneCop[item_num] +=1
                        self.biterms.append(item_num)
                        self.num_doc_biterm[doc_number] +=1
            self.biterms_in_doc.append(oneCop)
            
        self.vocab_size = len(self.word2Id)
        
        self.topic_biterm = [0]*len(self.biterms)
        self.topic_word_num = {j: {i:0 for i in range(self.num_topic)} for j in range(self.vocab_size)}
        print(len(self.topic_word_num), len(self.topic_word_num[0]))
        self.num_topic_biterm = [1]*self.num_topic
        
        for biterm_index, biterm in enumerate(self.biterms):
            topic_id = random.randint(0, self.num_topic-1)
            #if biterm_index  5:
                #print(biterm, biterm%1000000, biterm//1000000)
                #print(self.topic_word_num)
            self.topic_word_num[biterm%1000000][topic_id] +=1
            self.topic_word_num[biterm//1000000][topic_id] +=1
            self.topic_biterm[biterm_index] = topic_id
            
    def save_topic_words(self, topic_word_num = 10):
        writer = self.get_file_writer(path = 'model-final-topic-words.txt')
        for topic_id in range(self.num_topic):
            topic_line = {}
            for word_id, word in enumerate(self.word2Id):
                topic_line[word_id] = self.topic_word_num[word_id][topic_id]/ self.num_topic_biterm[topic_id] / 2
            sorted_topic_line = sorted(topic_line.items(), key = operator.itemgetter(1) )
            writer.write("Topic:"+str(topic_id) + '\n')
            for topic_word,score in sorted_topic_line[:topic_word_num]:
                writer.write("\t"+str(self.Id2Word[topic_word])+"\t"+str(score) + '\n')
        writer.close()
    
    def save_wordIds(self):
        writer = self.get_file_writer(path = 'model-final-wordIds.txt')
        for key,value in self.word2Id.items():
            writer.write(str(key) + ' ' + str(value) + '\n')
        writer.close()
        
    def get_sum(self, biterm):
        if biterm not in self.biterm_sum:
            word1 = biterm//1000000
            word2 = biterm%1000000
            sum = 0
            for topic_id in range(self.num_topic):
                calculation = (self.num_topic_biterm[topic_id] + self.alpha) * (self.topic_word_num[word1][topic_id] + self.beta) * (self.topic_word_num[word2][topic_id] + self.beta) / ((2 * self.num_topic_biterm[topic_id] ) + (self.vocab_size * self.beta))**2
                sum += calculation
            self.biterm_sum[biterm] = sum
        return self.biterm_sum[biterm]
        
    def save_theta(self):
        writer = self.get_file_writer(path = 'model-final-theta.txt')

        for doc_index, line in enumerate(self.biterms_in_doc):
            for topic_id in range(self.num_topic):
                one_sum = 0
                for key in line:
                    word1 = key//1000000
                    word2 = key%1000000
                    one_sum += ((line[key]/self.num_doc_biterm[doc_index]) * ((self.num_topic_biterm[topic_id] + self.alpha) * (self.topic_word_num[word1][topic_id] + self.beta) * (self.topic_word_num[word2][topic_id] + self.beta) / ((2 * self.num_topic_biterm[topic_id] ) + (self.vocab_size * self.beta))**2)/(self.get_sum(key)))
                writer.write(str(one_sum) + " ")
            writer.write('\n')
        writer.close()
        
    def save_phi(self):
        writer = self.get_file_writer(path = 'model-final-phi.txt')
        for topic_id in range(self.num_topic):
            for word_id in self.Id2Word:
                calculation = (self.topic_word_num[word_id][topic_id] + self.beta) / ((self.num_topic_biterm[topic_id] * 2) + (self.vocab_size * self.beta))
                writer.write(str(calculation) + ' ')
            writer.write('\n')
        writer.close()
        
    
    def build_model(self):
        for it in range(self.num_iter):
            start_time = time.time()
            for biterm_index, old_topic_id in enumerate(self.topic_biterm):
                word1 = self.biterms[biterm_index]//1000000
                word2 = self.biterms[biterm_index]%1000000
                self.topic_word_num[word1][old_topic_id] -=1
                self.topic_word_num[word2][old_topic_id] -=1
                self.num_topic_biterm[old_topic_id] -=1
                
                new_topic_id = -1
                
                p = [0]*self.num_topic
                for k in range(self.num_topic):
                    p[k] = (self.num_topic_biterm[k] + self.alpha) * (self.topic_word_num[word1][k] + self.beta) * (self.topic_word_num[word2][k] + self.beta) / ((2 * self.num_topic_biterm[k] ) + (self.vocab_size * self.beta))**2
                    
                for k in range(1,self.num_topic):
                    p[k] += p[k-1]
                
                u = random.random() * p[-1]
                for k in range(self.num_topic):
                    if u < p[k]:
                        new_topic_id = k
                        break
                
                self.topic_word_num[word1][new_topic_id] +=1
                self.topic_word_num[word2][new_topic_id] +=1
                self.num_topic_biterm[new_topic_id] += 1
                
                self.topic_biterm[biterm_index] = new_topic_id
                
            print('Finished iteration:', it, 'Time taken:' + str(time.time()-start_time))
    
    def save_result(self):
        self.save_topic_words(20)
        self.save_theta()
        self.save_wordIds()
        self.save_phi()
        
    def run(self):
        self.load_data()
        self.init_model()
        self.build_model()
        self.save_result()    

btm = BTM(data_path='../Topic Modeling/filteredtext.txt',alpha=2,beta=0.001, num_iter=10, num_topic=10, output_dir='.')

btm.run()

btm.save_result()

**Larz60+** · Jun-12-2018, 09:16 PM

Seems it be easier, and more logical to get the code working outside of Jupyter notebook first, and then adding it back.
You are complicating things by running it in Jupyter.
If you need help setting up an environment for this, ask.

volcano63 · (This post was last modified: Jun-12-2018, 09:41 PM by volcano63.)

(Jun-12-2018, 09:16 PM)Larz60+ Wrote: Seems it be easier, and more logical to get the code working outside of Jupyter notebook first, and then adding it back.

I am just curious - why? I use Jupyter all the time, locally and remotely - without a glitch. There are also free Azure notebooks

I am not sure what your problem is, but if your computer receives dynamic IP, Jupyter session may become un-available - it often happens to me when I open session in the office, and then try to access it at home. You must kill the old session and start a new one.

I usually open it in "no-browser mode" and then open the produced link in a browser (I work on Linux)

Output:>jupyter-notebook --no-browser --ip=$( hostname -I | awk '{print $1}' )
[I 00:40:04.786 NotebookApp] Writing notebook server cookie secret to /run/user/1000/jupyter/notebook_cookie_secret
[I 00:40:05.277 NotebookApp] Serving notebooks from local directory: /home/mark
[I 00:40:05.278 NotebookApp] 0 active kernels
[I 00:40:05.278 NotebookApp] The Jupyter Notebook is running at:
[I 00:40:05.278 NotebookApp] http://10.0.0.8:8888/?token=27296b15eb9c22614ecb6d8716cffa7a4f7d41e868d2d21a
[I 00:40:05.278 NotebookApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation).
[C 00:40:05.279 NotebookApp] 
    
    Copy/paste this URL into your browser when you connect for the first time,
    to login with a token:
        http://10.0.0.8:8888/?token=27296b15eb9c22614ecb6d8716cffa7a4f7d41e868d2d21a

I click the link at the bottom - and voila! - Jupyter session is ready

**Larz60+** · Jun-12-2018, 09:50 PM

Quote:why? I use Jupyter all the time, locally and remotely - without a glitch ... but if your computer receives dynamic IP, Jupyter session may become un-available

'nuff' said

volcano63 · (This post was last modified: Jun-12-2018, 09:55 PM by volcano63.)

(Jun-12-2018, 09:50 PM)Larz60+ Wrote:
Quote:why? I use Jupyter all the time, locally and remotely - without a glitch ... but if your computer receives dynamic IP, Jupyter session may become un-available
'nuff' said

So, if you don't know the answer - "don't use it" is a good answer?!

PS And since you obviously don't know what you are talking about - Jupyter sessions are auto-saved

**Larz60+** · Jun-12-2018, 10:11 PM

be nice, I use Jupyter all the time.

Possibly Related Threads…
Thread		Author	Replies	Views	Last Post
	error handler appears to be turned off. How do I turn it back on?	jpotter0	0	1,372	Nov-26-2022, 11:44 AM Last Post: jpotter0
	Joining two jupyter notebooks and getting an error!	Led_Zeppelin	1	2,606	Oct-20-2022, 04:28 PM Last Post: deanhystad
	Setting up new Python kernel for JupyterLab Desktop on M1 Mac	daler6	0	2,635	Jun-20-2022, 03:45 AM Last Post: daler6
	Jupyter kernel restarts	russellm10	0	2,979	Sep-14-2021, 04:24 AM Last Post: russellm10
	Problem: Restart kernel onPydev console when trying to install a python package	poppy2020	1	10,913	Nov-25-2020, 06:13 PM Last Post: Larz60+
	How a Mac OS software can restart itself with admin permission in Python 3.7?	Formationgrowthhacking	0	2,854	Sep-03-2020, 05:29 PM Last Post: Formationgrowthhacking
	Using a button to kill and restart a script	duckredbeard	3	5,946	Sep-01-2020, 12:53 AM Last Post: duckredbeard
	How to convert what appears to be a JSON file to CSV	NewBeie	4	4,126	Aug-28-2020, 04:45 PM Last Post: Larz60+
	Print a certain string only the first time it appears in a test file	buttercup	5	5,067	Jul-23-2020, 01:30 PM Last Post: palladium
	fileinput package appears to be zeroing files	rexrf	0	2,275	Jul-01-2020, 06:05 PM Last Post: rexrf

Jupyter error - 'The kernel appears to have died, it will restart automatically'

User Panel Messages

Announcements