Hi everyone,
In various forums (Stack
Overflow, etc.), I've seen many people ask how to convert flat HTML to
hierarchical HTML based on heading levels.
As it turns out, I needed to solve this problem in my day job too.
Here's a test case showing the approach I used:
import re
from bs4 import BeautifulSoup, Tag, NavigableString
def is_non_whitespace(thing) -> bool:
    """Tell whether ``thing`` should count as real content.

    Anything that is not a ``NavigableString`` (e.g. a tag) always counts.
    A ``NavigableString`` counts unless its text consists solely of
    whitespace.  Note that ``str.isspace()`` is False for an empty string,
    so an empty text node is reported as non-whitespace.
    """
    if not isinstance(thing, NavigableString):
        return True
    only_whitespace = thing.text.isspace()
    return not only_whitespace
def find_previous_sibling_skip_ws(thing):
    """Return the first of ``thing.previous_siblings`` that is real content.

    Siblings rejected by ``is_non_whitespace`` are skipped.  Returns None
    when no sibling qualifies.
    """
    candidates = (
        sibling
        for sibling in thing.previous_siblings
        if is_non_whitespace(sibling)
    )
    return next(candidates, None)
def find_next_sibling_skip_ws(thing):
    """Return the first of ``thing.next_siblings`` that is real content.

    Siblings rejected by ``is_non_whitespace`` are skipped.  Returns None
    when no sibling qualifies.
    """
    candidates = (
        sibling
        for sibling in thing.next_siblings
        if is_non_whitespace(sibling)
    )
    return next(candidates, None)
# add <article> hierarchy to flat HTML
def wrap_html_hierarchy(soup):
    """Nest flat heading-delimited content inside <article> elements.

    The tree is modified in place; nothing is returned.

    Headings are processed bottom-up (<h6> first, <h1> last).  For each
    heading of the current level, the heading and its following siblings are
    collected until the sibling chain ends or a heading of the same or a
    higher rank (h1..h<level>) is reached.  The collected group is moved into
    a new <article> inserted where the heading was -- unless the group has no
    non-whitespace neighbours, in which case the existing parent already
    acts as its container.

    Args:
        soup: The parsed document.  It must provide ``find_all`` and
            ``new_tag``; the demo code passes a ``BeautifulSoup`` object.
    """
    # Snapshot the headings once; wrapping moves nodes around but never
    # creates or removes headings.
    all_h = soup.find_all(re.compile('^h[1-6]$'))

    # loop through heading levels bottom-up (<h6> to <h1>)
    for level in range(6, 0, -1):
        # A heading of this level or any higher rank ends the current group.
        stop_pattern = re.compile(f"^h[1-{level}]$")

        for this_h in [x for x in all_h if x.name == f"h{level}"]:
            # starting at each heading tag of "level", create a group of stuff
            # up to any heading tag of level 1 to "level"
            group = []
            current_elt = this_h
            while True:
                group.append(current_elt)
                current_elt = current_elt.next_sibling
                # Compare against None explicitly: an empty text node is a
                # falsy string and must not be mistaken for end-of-siblings.
                if current_elt is None:
                    break
                # Reached a grouping barrier: stop accumulating.
                if isinstance(current_elt, Tag) and stop_pattern.search(current_elt.name):
                    break

            # only create a container if stuff precedes or follows;
            # otherwise the current parent is already our container
            if find_previous_sibling_skip_ws(group[0]) or find_next_sibling_skip_ws(group[-1]):
                article = soup.new_tag('article')
                this_h.insert_before(article)
                # Moves every collected node out of the parent and into the
                # new <article>, preserving order.
                article.extend(group)
The comments explain the algorithm. When I run the following code:
# Sample input: a flat document whose heading levels imply a nested outline.
# (The <h3> element is now closed with a matching </h3>; the original sample
# closed it with </h1> and only worked because the parser recovered.)
html_doc = """
<html>
<body>
<h1>Top</h1>
<p>text1</p>
<h2>Mid-1</h2>
<p>text2</p>
<h3>Low</h3>
<p>text3</p>
<h2>Mid-2</h2>
<p>text4</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'lxml')
# Restructures the parsed tree in place.
wrap_html_hierarchy(soup)
print(soup)
I get the following output (manually cleaned up a bit):
<html>
<body>
<h1>Top</h1>
<p>text1</p>
<article>
<h2>Mid-1</h2>
<p>text2</p>
<article>
<h3>Low</h3>
<p>text3</p>
</article>
</article>
<article>
<h2>Mid-2</h2>
<p>text4</p>
</article>
</body>
</html>
An <article> container is created only when a content group has other content adjacent to it. For example, no container is created for the <h1> group because the existing <body> element is sufficient.
I created two helper functions to skip whitespace text objects when checking for adjacent content:
find_previous_sibling_skip_ws
find_next_sibling_skip_ws
These could be performed natively via the following enhancement request:
I hope someone finds this useful!
- Chris