Feb-23-2022, 05:35 PM
Hello again everyone. The following issue I have currently at hand. The script runs to the second page for example "https://www.startpage.com/lookup/?search=position%202&version=NUM2200" and then return back to the first page of https://www.startpage.com/lookup/?search=position%201&version=NUM2200. I used urljoin to for the pages of the base and relative page but it keep cycling back and forth from page 1 to page 2 and back. Why does it want to do that? What can I do to fix this? thanks
from lxml import etree
import html5lib
import requests
from bs4 import BeautifulSoup
url = "https://www.startpage.com/lookup/?search=position%201&version=NUM2200"
while True:
request = requests.get(url) #Get URL server status
soup = BeautifulSoup(request.content, 'html5lib') #Pass url content to Soup
dom = etree.HTML(str(soup)) #Ini etree
url = urljoin(BASE_URL, dom.xpath('/html/body/div[2]/div/section/div[3]/div/div[2]/section/div[1]/div[1]/div[1]/a')[0].get("href")) #Join Relative and Base for full URL of next Page URL
print('THis is Next url',url)
for a in soup.find_all("span", {'class': re.compile(r'^text')}): #Get Text in Span Class and Filter out specific words
bltext=a.text
if bltext == 'cook Book':
st = bltext.replace('cook Book','')
elif bltext == 'Study Tools':
st = bltext.replace('Study Tools','')
elif bltext == 'Explore More':
st = bltext.replace('Explore More','')
elif bltext == 'WayPlus':
st = bltext.replace('WayPlus','')
elif bltext == 'Explore More':
st = bltext.replace('Explore More','')
elif bltext == 'Store':
st = bltext.replace('Store','')
else:
print('\n',a.text)
#with open(f'{chp}.txt', 'w', encoding='utf-8') as f:
#f.write(chp+'\n'+i.text)
print('pages',url)
print('This is url', url)
if url in 'https://www.startpage.com/lookup/?search=position%207&version=NUM2203': #Page to Stop
break #Break out of loop
