Hello,
I need to loop through HTML files and make sure the charset line is the very first item in the head section, so that the title is displayed correctly.
Do you know of a simple way to do this?
Thank you.
Edit: A less elegant solution that does the job: Removing all the relevant meta lines, and inserting one at the top.
I need to loop through HTML files and make sure the charset line is the very first item in the head section, so that the title is displayed correctly.
Do you know of a simple way to do this?
Thank you.
# Goal: make sure
#   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
# is the first child of <head>, so the charset is declared before <title>.
look_for = re.compile("^Content-Type$", re.I)

# The context manager closes the file handle as soon as parsing is done.
with open(INPUTFILE, 'rb') as html_file:
    soup = BeautifulSoup(html_file, "lxml")

# NOTE(review): the code below assumes the parsed document has a <head>
# element (soup.head is not None) -- confirm for the input files.
meta = soup.find("meta", {"http-equiv": look_for})
if meta is None:
    # No charset declaration found, and BeautifulSoup does not add one itself.
    print("No meta")
    metatag = soup.new_tag('meta')
    metatag.attrs['http-equiv'] = 'Content-Type'
    metatag.attrs['content'] = 'text/html; charset=utf-8'
    # The head element is reached through the soup object; a bare `head`
    # name is not defined anywhere in this script.
    soup.head.insert(0, metatag)  # insert as first child of head
else:
    # Check for duplicates and remove all but the first match.
    print("Found meta(s)")
    metas = soup.find_all("meta", {"http-equiv": look_for})
    if len(metas) > 1:
        print("Dups")
        # A separate loop variable keeps `meta` bound to the tag we keep.
        for duplicate in metas[1:]:
            duplicate.decompose()
    # Only one tag is left: move it to the top of the head. Inserting a tag
    # that is already part of the tree relocates it (Beautiful Soup detaches
    # it from its previous position before inserting).
    soup.head.insert(0, meta)
"""
<head>
<title>
Blah
</title>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
"""
---
Edit: A less elegant solution that does the job: remove all the relevant meta lines, then insert a single one at the top of the head.
# Anchored, case-insensitive pattern for the http-equiv value "Content-Type".
look_for = re.compile("^Content-Type$", flags=re.IGNORECASE)
def insert_meta(soup):
    """Insert a UTF-8 ``Content-Type`` meta tag as the first child of ``<head>``.

    Builds ``<meta http-equiv="Content-Type" content="text/html; charset=utf-8">``
    with ``soup.new_tag`` and places it at index 0 of the head element. When
    the document has no ``<head>`` (``soup.head`` is ``None``), a head element
    is created first, as the first child of ``<html>`` or, failing that, of
    the document itself, rather than failing with ``AttributeError``.

    Args:
        soup: The parsed document, modified in place. Presumably a
            ``BeautifulSoup`` object, given the ``new_tag``/``head`` usage.

    Returns:
        None.
    """
    metatag = soup.new_tag('meta')
    metatag.attrs['http-equiv'] = 'Content-Type'
    metatag.attrs['content'] = 'text/html; charset=utf-8'

    head = soup.head
    if head is None:
        # Headless document: create the container before filling it.
        head = soup.new_tag('head')
        root = soup.html
        if root is None:
            root = soup
        root.insert(0, head)

    # Index 0 puts the charset declaration ahead of <title> and everything else.
    head.insert(0, metatag)
# Wanted as first child of <head>:
#   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
# Approach: drop every existing Content-Type meta tag, then insert a fresh one.
meta = soup.find("meta", {"http-equiv": look_for})
if meta:
    print("Found meta(s)")
    # Remove every match, duplicates included; a single tag is re-added below.
    metas = soup.find_all("meta", {"http-equiv": look_for})
    for meta in metas:
        meta.decompose()
else:
    # Nothing found; BeautifulSoup does not add the tag on its own.
    print("No meta")
# Both paths end the same way: one new tag inserted by insert_meta.
insert_meta(soup)
