Sep-18-2018, 08:42 AM
hi,
I am trying to scrape a website that has text and links
i am creating a web scraper that will scrape the data using beautilfulsoup and requests and links using selenium.
everything is working fine in requests part but not in selenium part.
In the selenium part it is required to click on the link and link will open and then get page url and then move to main page and then start same procedures for another links but when i run the code it gets only first links and then throw error
below are my codes:
I am trying to scrape a website that has text and links
i am creating a web scraper that will scrape the data using beautilfulsoup and requests and links using selenium.
everything is working fine in requests part but not in selenium part.
In the selenium part it is required to click on the link and link will open and then get page url and then move to main page and then start same procedures for another links but when i run the code it gets only first links and then throw error
Error:Traceback (most recent call last):
File "C:\Users\prince.bhatia\Desktop\Bihar_rera\Bihar_Rera.py", line 90, in <module>
main()
File "C:\Users\prince.bhatia\Desktop\Bihar_rera\Bihar_Rera.py", line 89, in main
parsedata()
File "C:\Users\prince.bhatia\Desktop\Bihar_rera\Bihar_Rera.py", line 80, in parsedata
geta = i.find_elements_by_tag_name("a")[1]
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 237, in find_elements_by_tag_name
return self.find_elements(by=By.TAG_NAME, value=name)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 527, in find_elements
{"using": by, "value": value})['value']
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 493, in _execute
return self._parent.execute(command, params)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 256, in execute
self.error_handler.check_response(response)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 194, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=67.0.3396.99)
(Driver info: chromedriver=2.39.562718 (9a2698cba08cf5a471a29d30c8b3e12becabb0e9),platform=Windows NT 6.1.7601 SP1 x86_64) below are my codes:
from bs4 import BeautifulSoup
import requests
import csv
from selenium import webdriver
from selenium.webdriver.common import keys
from selenium.webdriver.support.ui import Select
import time
import functools
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
url = "https://nagarseva.bihar.gov.in/rerabihar/ReraGetProjectStatus.aspx"
final_data = []
def writefiles(alldata, filename):
with open ("./"+ filename, "w") as csvfile:
csvfile = csv.writer(csvfile, delimiter=",")
csvfile.writerow("")
for i in range(0, len(alldata)):
csvfile.writerow(alldata[i])
def getbyGet(url, values):
res = requests.get(url, data=values)
text = res.text
return text
def parsedata():
payload = {}
global url, final_data
data = getbyGet(url, {})
soup = BeautifulSoup(data, "html.parser")
#EVENTTARGET = soup.select("#__EVENTTARGET")[0]['value']
EVENTVALIDATION = soup.select("#__EVENTVALIDATION")[0]['value']
#print(EVENTVALIDATION)
VIEWSTATE = soup.select("#__VIEWSTATE")[0]['value']
#print(VIEWSTATE)
#VIEWSTATEGENERATOR = soup.select("#__VIEWSTATEGENERATOR")[0]["value"]
headers= {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Content-Type':'application/x-www-form-urlencoded',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'}
formfields = {"__EVENTARGUMENT":"PrintIndicator$0",
'__EVENTTARGET':"ctl00$ContentPlaceHolder1$GV_Building",
'__EVENTVALIDATION':EVENTVALIDATION,
#'__EVENTTARGET':EVENTTARGET,
'__VIEWSTATE':VIEWSTATE,
"__VIEWSTATEENCRYPTED":"",
"__VIEWSTATEGENERATOR":"CE676888",
}
s = requests.session()
res = s.post(url, data=formfields, headers=headers).text
soup = BeautifulSoup(res, "html.parser")
getdata = soup.find_all("div", {"class":"col-lg-8 col-md-8 text-left"})
for i in getdata:
datas = i.find_all("h4")
for getspan in datas:
Buildername = getspan.find_all("span")[2].text
projectname = getspan.find_all("span")[3].text
getp = i.find_all("p")
for data in getp:
address = data.find_all("span")[2].text
area = data.find_all("span")[5].text
district = data.find_all("span")[8].text
stardate = data.find_all("span")[11].text
enddate = data.find_all("span")[12].text
status = data.find_all("span")[13].text
driver = webdriver.Chrome("./chromedriver")
driver.get('https://nagarseva.bihar.gov.in/rerabihar/ReraGetProjectStatus.aspx')
d = driver.find_element_by_xpath('/html/body/form/div[3]/div[2]/table/tbody/tr/td/table/tbody/tr[1]/td[1]/div/table/tbody/tr[2]/td[3]/input')
d.click()
getclass = driver.find_elements_by_css_selector(".col-lg-3.col-md-3")
for i in getclass:
sublist = []
time.sleep(2)
geta = i.find_elements_by_tag_name("a")[1]
geta.click()
window_before = driver.window_handles[0]
driver.switch_to_window(driver.window_handles[-1])
d = driver.current_url
print(d)
sublist.append(d)
driver.switch_to_window(window_before)
def main():
parsedata()
main()Please help on this one
