I need to loop through HTML files, mostly Windows-encoded, and convert them all into UTF-8.
Since the files can be iso-8859-1, windows-1252, or utf-8, is there a way to help BeautifulSoup/UnicodeDammit by limiting the encodings it tries to those three? (I've sketched what I'm hoping for below my current code.)
Thank you.
ROOT = r"c:\temp\"
os.chdir(ROOT)
#fill list with sub-dirs that start with "Blah -"
dirs = [os.path.join(ROOT, dir) for dir in os.listdir(ROOT) if dir.startswith("Blah - ")]
#TODO how to try different, known possible charsets?
#
https://tedboy.github.io/bs4_doc/10_encodings.htmlencoding = ["iso-8859-1","windows-1252","utf8"]
for dir in dirs:
    for file in glob.glob(f"{dir}\\*.html"):
        # BAD: from_encoding expects a single encoding, not a list
        # soup = BeautifulSoup(open(file, mode='rb'), "lxml", from_encoding=encodings)
        with open(file, mode='rb') as fp:
            soup = BeautifulSoup(fp, "lxml")
        print("Orig encod:", soup.original_encoding)
        head = soup.head
        meta = head.find("meta", {"http-equiv": "content-type"})
        # if no meta, add one since BS doesn't
        if not meta:
            print("No meta")
            metatag = soup.new_tag('meta')
            metatag.attrs['http-equiv'] = 'Content-Type'
            metatag.attrs['content'] = 'text/html; charset=utf-8'
            head.append(metatag)
        else:
            print("Found meta(s)")
            # check for dups, remove if any
            metas = head.find_all("meta", {"http-equiv": "content-type"})
            for meta in metas[1:]:
                # TODO doesn't remove dups? (see note below the code)
                meta.decompose()
        print(head.prettify())
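
For reference, this is roughly what I'm hoping exists, based on the UnicodeDammit docs linked above (untested; I'm assuming the second argument, override_encodings, is a list of encodings to try first, the way the docs describe):

from bs4 import BeautifulSoup, UnicodeDammit

with open(file, mode='rb') as fp:
    raw = fp.read()

# have UnicodeDammit try the three known charsets first, in order
dammit = UnicodeDammit(raw, ["iso-8859-1", "windows-1252", "utf-8"])
print("Guessed:", dammit.original_encoding)

soup = BeautifulSoup(dammit.unicode_markup, "lxml")

# ... fix up the meta tags as above, then write back out as UTF-8
with open(file, mode='w', encoding='utf-8') as fp:
    fp.write(str(soup))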
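
On the dedup TODO: could the problem be that the attribute-value match is case-sensitive, so "content-type" never matches the files' http-equiv="Content-Type"? If so, a compiled regex in the attrs filter (which bs4 accepts) should make the match case-insensitive; a minimal sketch:

import re

# case-insensitive match on the http-equiv value
ct = re.compile("^content-type$", re.I)
metas = head.find_all("meta", attrs={"http-equiv": ct})
for meta in metas[1:]:
    meta.decompose()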