So I have a piece of code that works with a single file that is in the root folder:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# (data-frame column, tweet key) pairs, in the column order of the result.
_COLUMNS = (
    ('Ids', 'id_str'),
    ('Text', 'text'),
    ('Lang', 'lang'),
    ('Geo', 'geo'),
    ('Place', 'place'),
)


def _read_json_lines(path):
    """Return the JSON objects stored one per line in the file at *path*.

    Lines that do not parse as JSON are skipped, and so are parsed values
    that are not JSON objects (they cannot carry any tweet field).
    """
    records = []
    # The context manager closes the handle even if reading fails part-way.
    with open(path) as handle:
        for line in handle:
            try:
                record = json.loads(line)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            if isinstance(record, dict):
                records.append(record)
    return records


def _tweets_to_frame(records):
    """Return a data frame with one row per record holding a field of interest.

    Tweets often have missing data. Extracting every field for the same
    record in one step keeps the columns aligned: a missing field becomes
    an empty cell instead of shifting the values of later tweets. Records
    with none of the fields are left out entirely.
    """
    rows = [
        {column: record.get(key) for column, key in _COLUMNS}
        for record in records
        if any(key in record for _, key in _COLUMNS)
    ]
    # Explicit columns keep the layout stable even when there are no rows.
    return pd.DataFrame(rows, columns=[column for column, _ in _COLUMNS])


# Read Twitter JSON
tweets = _read_json_lines('00.json')  # single JSON to work with

# Extract data of interest and save it in a pandas data frame
df = _tweets_to_frame(tweets)
df00 = df[(df['Lang']==('en')) & (df['Geo'].dropna())]

Now, I have about a thousand similar JSON files, each of which is in a separate sub-folder. The following coding elaborations are difficult for me, so please bear with me here. My current goal is to (1) look into each sub-folder, (2) locate the *.json, (3) perform data extraction on it, and (4) create a data frame with the extracted data for all JSONs read.

import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
# (data-frame column, tweet key) pairs, in the column order of the result.
_COLUMNS = (
    ('Ids', 'id_str'),
    ('Text', 'text'),
    ('Lang', 'lang'),
    ('Geo', 'geo'),
    ('Place', 'place'),
)


def _find_json_files(root):
    """Yield the path of every ``*.json`` file found anywhere below *root*.

    Directories and files are visited in sorted order so that repeated
    runs produce the same sequence.
    """
    for subdir, dirs, files in os.walk(root):
        dirs.sort()  # in-place sort steers the order os.walk descends in
        for name in sorted(files):
            if name.endswith(".json"):
                # os.walk yields bare file names; without the directory
                # part the file would be looked up in the working directory.
                yield os.path.join(subdir, name)


def _read_json_lines(path):
    """Return the JSON objects stored one per line in the file at *path*.

    Lines that do not parse as JSON are skipped, and so are parsed values
    that are not JSON objects (they cannot carry any tweet field).
    """
    records = []
    # The context manager closes the handle even if reading fails part-way,
    # which matters when about a thousand files are opened in one run.
    with open(path) as handle:
        for line in handle:
            try:
                record = json.loads(line)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            if isinstance(record, dict):
                records.append(record)
    return records


def _tweets_to_frame(records):
    """Return a data frame with one row per record holding a field of interest.

    Tweets often have missing data. Extracting every field for the same
    record in one step keeps the columns aligned: a missing field becomes
    an empty cell instead of shifting the values of later tweets. Records
    with none of the fields are left out entirely.
    """
    rows = [
        {column: record.get(key) for column, key in _COLUMNS}
        for record in records
        if any(key in record for _, key in _COLUMNS)
    ]
    # Explicit columns keep the layout stable even when there are no rows.
    return pd.DataFrame(rows, columns=[column for column, _ in _COLUMNS])


tweets = []
rootdir = '/Users/mymac/Documents/00/23'

# Collect the tweets of every JSON file in every sub-folder of rootdir.
for json_path in _find_json_files(rootdir):
    tweets.extend(_read_json_lines(json_path))

# One data frame for all JSON files read.
df = _tweets_to_frame(tweets)
df

Thanks to the suggestions provided below by wavic and zivoni, the code now works.
