I'm probably jumping the gun here as my code is still pretty crappy, but I'm trying to develop my first working application. I've set up version control locally on my hard drive (no remotes yet) and started commenting my working code the way I've seen code commented in other people's Git repositories. Could you have a look at this and comment on the style as much as on the (admittedly inexpert) code?
obtainer.py
obtainer.py
from core import *
"""
Obtains the first page of Google search results for the specified search term
in html format and writes it to an html file in the working directory.
Imports the core module (pasted below).
"""
def user_input():
    """Prompt for a search term until a valid one is entered.

    A term is accepted when it is not blank and every character is an
    ASCII letter, a digit or a space.

    :returns: The search term string, exactly as typed by the user.
    """
    # Imported locally because nothing visible at module level brings
    # these two constants into scope.
    from string import ascii_letters, digits

    # A set gives constant-time membership checks for each character.
    allowed = set(digits + ascii_letters + " ")
    while True:
        search_term = input("Input the search term: ")
        if not search_term.strip():
            # A blank term would produce a query-less search URL.
            print("Empty search term--try again.")
        elif all(char in allowed for char in search_term):
            return search_term
        else:
            print("Invalid character--try again.")
def parse_url(search_term):
    """Build a Google search URL from the search term.

    The term is split on whitespace and the words are joined with '+',
    so leading, trailing and repeated spaces never reach the URL. The
    result is also printed to the console.

    :search_term: The search term input by the user.
    :returns: The url for the search term.
    """
    base_url = "https://www.google.com/search?q="
    # split() with no argument discards all surrounding/duplicate
    # whitespace; join() handles the zero-, one- and many-word cases alike.
    url = base_url + "+".join(search_term.split())
    print("{} parsed to {}.".format(search_term, url))
    return url
def get_and_write_page():
    """Download the Google results page for a user-supplied term and save it.

    Calls user_input() for the search term, parse_url() to build the URL
    and the imported check_and_get() to download the page. When data is
    returned it is written, in binary mode, to
    "Google search results for <term>.html" in the working directory;
    otherwise "No data obtained." is printed and nothing is written.

    :returns: None
    """
    search_term = user_input()
    url = parse_url(search_term)
    raw_html = check_and_get(url)
    if raw_html is None:
        print("No data obtained.")
        return
    # Report success only after the download has actually produced data.
    print("{} obtained from {}.".format(search_term, url))
    filename = "Google search results for " + search_term + ".html"
    # check_and_get() hands back resp.content, hence the binary write.
    with open(filename, "wb") as f:
        f.write(raw_html)
    print("File {} written.".format(filename))


# core.py
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from datetime import datetime
"""
This module was obtained, with minor modifications, from the article
'Practical Introduction to Web Scraping in Python', by Colin O'Keefe,
at https://realpython.com/blog/python/python-web-scraping-practical-introduction/.
It makes an HTTP GET request for a URL and verifies that the
response contains HTML/XML data. If the request itself fails, it displays an
error message and writes an error log to the working directory. If the
response is HTML/XML, the main function of the module returns its content;
otherwise it returns None.
"""
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML/XML, False otherwise.
    Called by check_and_get() below.

    :resp: Response object exposing ``status_code`` and a ``headers``
           mapping, as passed in by check_and_get().
    :returns: boolean; True only when the status code is 200 and the
              Content-Type header mentions 'html'.
    """
    # A missing (or empty) Content-Type header is treated as "not HTML"
    # instead of raising KeyError.
    content_type = (resp.headers.get("Content-Type") or "").lower()
    return resp.status_code == 200 and "html" in content_type
def log_error(e):
    """In case of error, writes a log file to the working directory and
    prints the error to the console.

    The log file is named "<YYYYmmdd-HHMMSS>-error.log" using the current
    local time.

    :e: The error message to record (check_and_get() passes a string).
    :returns: None
    """
    # strftime() yields a string that can be concatenated and contains no
    # characters (such as ':') that are illegal in file names on Windows.
    logstamp = datetime.today().strftime("%Y%m%d-%H%M%S") + "-error.log"
    # Append so that two errors within the same second do not overwrite
    # each other.
    with open(logstamp, "a", encoding="utf-8") as f:
        f.write("{}\n".format(e))
    print(e)
def check_and_get(url, timeout=10):
    """
    Attempts to get the contents of a webpage by making an HTTP GET request.
    If is_good_response() accepts the response, returns its raw content;
    otherwise returns None. A RequestException is passed to log_error()
    and also results in None.

    :url: Full website URL
    :timeout: Seconds to wait for the server before giving up; without a
              timeout the request could block indefinitely.
    :returns: Raw (bytes) content of the webpage, or None
    """
    try:
        # closing() guarantees the streamed connection is released.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None
    except RequestException as e:
        log_error("Error during request to {0} : {1}".format(url, str(e)))
        # Make the failure result explicit for callers that test for None.
        return None
# Run the interactive search-and-save flow only when executed as a script.
if __name__ == '__main__':
    get_and_write_page()
