Aug-23-2023, 09:21 AM
Hello, I have to do webscraping of some articles from a website (pressreader).
My code is the following:
I am new to python and I still have a lot to learn, can someone help me? Thank you in advance
My code is the following:
from selenium import webdriver
import pandas as pd
import time
import json
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
import clipboard
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.chrome.service import Service
import pyautogui
import os.path
import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def starttoend(start, end, year, month, day):
s_year = start[0:4]
s_mon = start[4:6]
s_day = start[6:8]
e_year = end[0:4]
e_mon = end[4:6]
e_day = end[6:8]
ret = []
for i in range(year.index(s_year), year.index(e_year) + 1):
for j in range(month.index(s_mon), month.index(e_mon) + 1):
if i == year.index(s_year) and j == month.index(s_mon):
for k in range(day.index(s_day), 31):
ret.append(year[i] + month[j] + day[k])
elif i == year.index(e_year) and j == month.index(e_mon):
for k in range(0, day.index(e_day) + 1):
ret.append(year[i] + month[j] + day[k])
else:
for k in range(31):
ret.append(year[i] + month[j] + day[k])
return ret
# name of papers to find
papernames = ["libero"]
start = "20080101"
end = "20230821"
cont_fail: int = 0
dates = []
year = ["2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021",
"2022", "2023"]
months = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
days = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18",
"19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31"]
date_tul = starttoend(start, end, year, months, days)
dates.append(date_tul)
index = list(range(25))
# set up to save print as PDF file
settings = {
"appState": {
"recentDestinations": [{
"id": "Save as PDF",
"origin": "local"
}],
"selectedDestinationId": "Save as PDF",
"version": 2
}
}
prefs = {'printing.print_preview_sticky_settings': json.dumps(settings)}
service = Service(executable_path=r'C:\Users\cmosca\Desktop\python\packages\chromedriver_32\chromedriver.exe')
#service_obj = Service(r'C:\Users\cmosca\Desktop\python\packages\chromedriver_32\chromedriver.exe')
#service = Service(r'C:\Users\cmosca\Desktop\python\packages\chromedriver_32\chromedriver.exe')
#driver = webdriver.Chrome(service = service)
# change chrome printing option to minimize work.
chrome_options: Options = webdriver.ChromeOptions()
#chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--kiosk-printing')
driver = webdriver.Chrome(service = service,
options=chrome_options)
# traverse through all papers
for i in range(len(papernames)):
# traverse through dates
for j in dates[i]:
count = 1
dobreak = False
for k in index:
if (dobreak):
break
try:
driver.get("https://www.pressreader.com/ita/" + papernames[i] + "/" + j + "/page/1/textview")
actions1 = webdriver.common.action_chains.ActionChains(driver)
actions2 = webdriver.common.action_chains.ActionChains(driver)
WebDriverWait(driver, 60).until(
EC.presence_of_element_located((By.XPATH, '//*[@id="thumbsToolbarBottom_0"]/a')))
bottom_button = driver.find_element_by_xpath('//*[@id="thumbsToolbarBottom_0"]/a')
bottom_button.click()
time.sleep(2)
all_bottom = driver.find_element_by_xpath('//*[@id="thumbsToolbarBottomPreview_0"]')
all_news = all_bottom.find_elements_by_xpath('//a[@page-number="1"]')
news = all_news[k]
first = True
article_id = news.get_attribute("article-id")
print(article_id)
actions1.move_to_element(news).perform()
news.click()
WebDriverWait(driver, 20).until(
EC.presence_of_element_located((By.XPATH, '//article[@aid="' + str(article_id) + '"]')))
time.sleep(2)
arti = driver.find_element_by_xpath('//article[@aid="' + str(article_id) + '"]')
head = arti.find_element_by_tag_name("hgroup")
time.sleep(1)
actions2.move_to_element(head).perform()
time.sleep(1)
actions2.context_click(head).perform()
time.sleep(2)
printbutton = driver.find_element_by_xpath('/html/body/div[12]/div/section/div/div/ul/li[7]/a')
printbutton.click()
time.sleep(1)
printtext = driver.find_element_by_xpath('/html/body/div[12]/div/section/div/div/ul/li[1]/a')
printtext.click()
time.sleep(4)
name = ""
if (count < 10):
name = papernames[i] + "_" + j + "_" + "0" + str(count)
pyautogui.typewrite(papernames[i] + "_" + j + "_" + "0" + str(count))
else:
name = papernames[i] + "_" + j + "_" + str(count)
pyautogui.typewrite(papernames[i] + "_" + j + "_" + str(count))
time.sleep(1)
pyautogui.press('enter')
print("saved" + name)
time.sleep(10)
count += 1
cont_fail = 0
if k == len(all_news) - 1:
driver.quit()
dobreak = True
break
driver.quit()
time.sleep(1)
except:
cont_fail += 1
print("failed on" + papernames[i] + j + str(k))
driver.quit()
if cont_fail > 5:
break
continueI keep getting this error: C:\Users\cmosca\PycharmProjects\pythonProject\venv\Scripts\python.exe "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py"
Traceback (most recent call last):
File "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py", line 96, in <module>
driver = webdriver.Chrome(service = service,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\chrome\webdriver.py", line 45, in __init__
super().__init__(
File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\chromium\webdriver.py", line 51, in __init__
self.service.path = DriverFinder.get_path(self.service, options)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\common\driver_finder.py", line 44, in get_path
raise NoSuchDriverException(f"Unable to locate or obtain driver for {options.capabilities['browserName']}")
selenium.common.exceptions.NoSuchDriverException: Message: Unable to locate or obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location
Process finished with exit code 1I have installed the chrome driver path in the system, already tried the old version of the selenium package (since I think that the original code (link: https://github.com/asui1/Webautomation/b...%20test.py) might be using an older version of selenium) but it still doesn't work.I am new to python and I still have a lot to learn, can someone help me? Thank you in advance

![[Image: Capture-d-cran-2023-08-25-101601.png]](https://i.ibb.co/6sPKydm/Capture-d-cran-2023-08-25-101601.png)