Feb-03-2021, 10:48 AM
hello dear all - good day
i have some code where i do not know why this permanently runs into an error...
question :could this scaling down cause some issues="
look forward to hear from you
regards apollo
see below the errors
i have some code where i do not know why this permanently runs into an error...
import requests
from bs4 import BeautifulSoup
from concurrent.futures.thread import ThreadPoolExecutor
url = "https://wordpress.org/plugins/browse/popular/{}"
def main(url, num):
with requests.Session() as req:
print(f"Collecting Page# {num}")
r = req.get(url.format(num))
soup = BeautifulSoup(r.content, 'html.parser')
link = [item.get("href")
for item in soup.findAll("a", rel="bookmark")]
return set(link)
with ThreadPoolExecutor(max_workers=20) as executor:
futures = [executor.submit(main, url, num)
for num in [""]+[f"page/{x}/" for x in range(2, 5)]]
allin = []
for future in futures:
allin.extend(future.result())
def parser(url):
with requests.Session() as req:
print(f"Extracting {url}")
r = req.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
target = [item.get_text(strip=True, separator=" ") for item in soup.find(
"h3", class_="screen-reader-text").find_next("ul").findAll("li")[:8]]
head = [soup.find("h1", class_="plugin-title").text]
new = [x for x in target if x.startswith(
("V", "Las", "Ac", "W", "T", "P"))]
return head + new
with ThreadPoolExecutor(max_workers=50) as executor1:
futures1 = [executor1.submit(parser, url) for url in allin]
for future in futures1:
print(future.result())note: this fetches data (meta-data) from the WP-plugins. So i gather some meta information about the wordpress-plugins. Originally the code fetches the firts 50 pages like so: with ThreadPoolExecutor(max_workers=20) as executor:
futures = [executor.submit(main, url, num)
for num in [""]+[f"page/{x}/" for x in range(2, 50)]]
allin = []
for future in futures:see the page-range it originally should run over 50 pages. But - i have scaled doown the resuits for a smaller range - for only a few pages. question :could this scaling down cause some issues="
look forward to hear from you
regards apollo
see below the errors
Extracting https://wordpress.org/plugins/envira-gallery-lite/
Extracting https://wordpress.org/plugins/foobox-image-lightbox/
Extracting https://wordpress.org/plugins/all-in-one-favicon/
Extracting https://wordpress.org/plugins/wp-force-ssl/
Extracting https://wordpress.org/plugins/login-lockdown/
Extracting https://wordpress.org/plugins/post-type-switcher/Extracting https://wordpress.org/plugins/cyr3lat/
Extracting https://wordpress.org/plugins/wpfront-scroll-top/
Extracting https://wordpress.org/plugins/gdpr-cookie-compliance/Extracting https://wordpress.org/plugins/mw-wp-form/Extracting https://wordpress.org/plugins/cf7-conditional-fields/
Extracting https://wordpress.org/plugins/genesis-simple-edits/Extracting https://wordpress.org/plugins/user-switching/
Extracting https://wordpress.org/plugins/wp-rollback/
Extracting https://wordpress.org/plugins/woocommerce-google-analytics-integration/
Extracting https://wordpress.org/plugins/wp-crontrol/
Extracting https://wordpress.org/plugins/wp-retina-2x/
Extracting https://wordpress.org/plugins/https-redirection/
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-2-bcc70c1246ab> in <module>
42
43 for future in futures1:
---> 44 print(future.result())
~\devel\IDE\lib\concurrent\futures\_base.py in result(self, timeout)
430 raise CancelledError()
431 elif self._state == FINISHED:
--> 432 return self.__get_result()
433
434 self._condition.wait(timeout)
~\devel\IDE\lib\concurrent\futures\_base.py in __get_result(self)
386 def __get_result(self):
387 if self._exception:
--> 388 raise self._exception
389 else:
390 return self._result
~\devel\IDE\lib\concurrent\futures\thread.py in run(self)
55
56 try:
---> 57 result = self.fn(*self.args, **self.kwargs)
58 except BaseException as exc:
59 self.future.set_exception(exc)
<ipython-input-2-bcc70c1246ab> in parser(url)
30 r = req.get(url)
31 soup = BeautifulSoup(r.content, 'html.parser')
---> 32 target = [item.get_text(strip=True, separator=" ") for item in soup.find(
33 "h3", class_="screen-reader-text").find_next("ul").findAll("li")[:8]]
34 head = [soup.find("h1", class_="plugin-title").text]
AttributeError: 'NoneType' object has no attribute 'find_next'
