Dec-19-2022, 11:11 PM
(Python) Pulling data from UA Google Analytics with more than 100k rows into csv. Works fine for 57k
My code just keeps outputting: len(rows): 100000
100000
None
len(rows): 200000
100000
None
len(rows): 300000
100000
None
len(rows): 400000
100000
None
len(rows): 500000
100000
None
looping forever rather than pulling the next page.
I appreciate I'm not asking very well, but what basic thing have I missed to make it pull the next page and append?
import pandas as pd
from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime
import datetime as datetime2
from datetime import date, timedelta
import time
from dates import StartDay, StartMonth, StartYear, EndDay,EndYear,EndMonth
SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
KEY_FILE_LOCATION = './XXXXXXX.json'
VIEW_ID = 'XXXXXXX'
PAGESIZE = 100000
#StartDate = '2022-10-13'
#EndDate = '2022-10-14'
#RangeStart = datetime2.date(2022, 9, 1)
#RangeEnd = datetime2.date(2022, 9, 1)
#RangeStart = datetime2.date(2022, 1, 3) # from and inluding this date
#RangeEnd = datetime2.date(2022, 1, 4) # up to this date (not including)
RangeStart = datetime2.date(StartYear, StartMonth, StartDay) # from and inluding this date
RangeEnd = datetime2.date(EndYear, EndMonth, EndDay) # up to this date (not including)
delta = datetime2.timedelta(days=1)
#main.RangeStart
def initialize_analyticsreporting():
credentials = ServiceAccountCredentials.from_json_keyfile_name(
KEY_FILE_LOCATION, SCOPES)
analytics = build('analyticsreporting', 'v4', credentials=credentials)
return analytics
# Get one report page
def get_report(analytics, pageTokenVar):
return analytics.reports().batchGet(
body={
##################################################################################
'reportRequests': [
{
'viewId': VIEW_ID,
'dateRanges': [{'startDate': startingDateFeed, 'endDate': singleDateFeed}],
'metrics': [
{'expression': 'ga:users'},
{'expression': 'ga:newUsers'},
#{'expression': 'ga:pageviews'},
#{'expression': 'ga:bounces'},
#{'expression': 'ga:sessions'},
#{'expression': 'ga:searchUniques'},
#{'expression': 'ga:timeOnPage'}
{'expression': 'ga:searchUniques'},
{'expression': 'ga:bounces'},
{'expression': 'ga:bounceRate'},
{'expression': 'ga:sessionDuration'},
{'expression': 'ga:avgSessionDuration'},
{'expression': 'ga:avgTimeOnPage'},
{'expression': 'ga:organicSearches'}
],
'dimensions': [
{'name': 'ga:userType'},
{'name': 'ga:sessionCount'},
{'name': 'ga:dateHourMinute'},
#{'name': 'ga:sourceMedium'},
#{'name': 'ga:deviceCategory'},
#{'name': 'ga:cityId'},
#{'name': 'ga:countryIsoCode'},
#{'name': 'ga:landingPagePath'},
#{'name': 'ga:pagePath'},
{'name': 'ga:campaign'},
#{'name' : 'ga:searchUniques'}
#{'name': 'ga:pagePathLevel1'},
#{'name': 'ga:pagePathLevel2'},
#{'name': 'ga:pagePathLevel3'},
#{'name': 'ga:pagePathLevel4'}
],
'samplingLevel': 'LARGE',
'pageSize': PAGESIZE
}]
##################################################################################
}
).execute()
def handle_report(analytics, pagetoken, rows):
response = get_report(analytics, pagetoken)
# Header, Dimentions Headers, Metric Headers
columnHeader = response.get("reports")[0].get('columnHeader', {})
dimensionHeaders = columnHeader.get('dimensions', [])
metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
# Pagination
pagetoken = response.get("reports")[0].get('nextPageToken', None)
# Rows
rowsNew = response.get("reports")[0].get('data', {}).get('rows', [])
rows = rows + rowsNew
print("len(rows): " + str(len(rows)))
print(pagetoken)
print(response.get('nextPageToken'))
#print(response.get('rows', []))
# Recursivly query next page
if pagetoken != None:
return handle_report(analytics, pagetoken, rows)
else:
# nicer results
nicerows = []
for row in rows:
dic = {}
dimensions = row.get('dimensions', [])
dateRangeValues = row.get('metrics', [])
for header, dimension in zip(dimensionHeaders, dimensions):
dic[header] = dimension
for i, values in enumerate(dateRangeValues):
for metric, value in zip(metricHeaders, values.get('values')):
if ',' in value or ',' in value:
dic[metric.get('name')] = float(value)
else:
dic[metric.get('name')] = float(value)
nicerows.append(dic)
return nicerows
# Start
def main():
#while (RangeStart <= RangeEnd):
analytics = initialize_analyticsreporting()
global dfanalytics
dfanalytics = []
rows = []
rows = handle_report(analytics, '0', rows)
dfanalytics = pd.DataFrame(list(rows))
YYYYmmddHHMMSS = datetime.today().strftime('%Y%m%d%H%M%S')
dfanalytics.to_csv("./outputRAW/behaviour28/behaviour28days"+singleDateFeed+"_PULLED_ON_{}.csv".format(YYYYmmddHHMMSS))
dfanalytics.to_csv("./outputRAW/behaviour28/backupCopy/behaviour28days" + singleDateFeed + "_PULLED_ON_{}.csv".format(YYYYmmddHHMMSS))
#RangeStart += delta
#if __name__ == '__main__':
# main()
def daterange(RangeStart,RangeEnd):
for n in range(int((RangeEnd - RangeStart).days)):
yield RangeStart + timedelta(n)
for singleDate in daterange(RangeStart,RangeEnd):
#print(singleDate.strftime("%Y-%m-%d"))
singleDateMinusSeven = singleDate + timedelta(days=-7)
#print(singleDate + timedelta(days=-7))
singleDateFeed = singleDate.strftime("%Y-%m-%d")
startingDateFeed = singleDateMinusSeven.strftime("%Y-%m-%d")
#endingDateFeed = startingDateFeed + timedelta(days=-7)
#endingDateFeed = singleDate.strftime("%Y-%m-%d") + timedelta(days=-7)
main()
print(startingDateFeed)
print(singleDateFeed)
#print(endingDateFeed)
#time.sleep(3) #Sleep for a number of secoonds before running the next day - to decrease number of api calls
#dateTest = datetime2.date(2022, 2, 1)
#print (dateTest, end="\n")Help!My code just keeps outputting: len(rows): 100000
100000
None
len(rows): 200000
100000
None
len(rows): 300000
100000
None
len(rows): 400000
100000
None
len(rows): 500000
100000
None
looping forever rather than pulling the next page.
I appreciate I'm not asking very well, but what basic thing have I missed to make it pull the next page and append?
#Recursivly query next page
if pagetoken != None:
return handle_report(analytics, pagetoken, rows)It's this bit that doesn't seem to work This runs fine when there are 57k rows, but when I expand the range, the above happens. Many thanks in advance, and thank you for your patience!
