Hello fellow Soupers!
I need to wrap each run of consecutive non-block nodes (inline tags and text strings) in a <p> element. For example, I need to turn this:
<body>
This is text 1.
<pre>stuff</pre>
This is text 2.
<pre>stuff</pre>
This is text 3.
</body>
into this:
<body>
<p>This is text 1.</p>
<pre>stuff</pre>
<p>This is text 2.</p>
<pre>stuff</pre>
<p>This is text 3.</p>
</body>
I wrote some code to do this:
# Names of the HTML tags treated as block-level: a direct child with one of
# these names is left alone and ends the current run of wrapped content.
block_elements = [
    "address", "article", "aside", "blockquote", "canvas",
    "dd", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form",
    "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr",
    "li", "main", "nav", "noscript", "ol",
    "p", "pre", "section", "table", "tfoot", "ul", "video",
]
import re
import bs4
def _is_plain_text(node) -> bool:
    """Return True only for ordinary text nodes.

    Comment, CData, Doctype and similar nodes are NavigableString subclasses
    (through PreformattedString). They must not be treated as text here:
    replacing one with a plain ``str`` would turn it into visible text.
    """
    return isinstance(node, bs4.NavigableString) and not isinstance(
        node, bs4.element.PreformattedString
    )


def _trim_edge(p: bs4.Tag, index: int, strip) -> None:
    """Strip whitespace from the text at one edge of *p*.

    *index* selects the edge (0 for the start, -1 for the end) and *strip* is
    the matching string method (``str.lstrip`` or ``str.rstrip``).
    Whitespace-only strings at that edge are removed one after another until
    a non-text node, a non-blank string, or an empty <p> is reached.
    """
    while p.contents:
        node = p.contents[index]
        if not _is_plain_text(node):
            return
        original = str(node)
        trimmed = strip(original)
        if trimmed:
            if trimmed != original:
                node.replace_with(trimmed)
            return
        # Nothing but whitespace: drop it and look at the new edge node.
        node.extract()


def add_p_tags(tag: bs4.Tag):
    """
    In given tag, wrap non-block-element content in <p> tags.

    Each run of consecutive direct children that are not block elements
    (i.e. not a tag whose name is listed in ``block_elements``) is moved into
    a newly created <p>, inserted where the run started. Leading and trailing
    whitespace is then trimmed from each new <p>, and a <p> left with no
    content at all is removed. The tree is modified in place; nothing is
    returned.
    """
    new_p_tags = []
    current_p = None
    # Iterate over a snapshot: children are moved while we loop.
    for child in list(tag.contents):
        if isinstance(child, bs4.Tag) and child.name in block_elements:
            # A block element ends the current run.
            current_p = None
            continue
        if current_p is None:
            # First node of a new run: create the <p> in its place.
            # NOTE(review): the Beautiful Soup docs recommend soup.new_tag();
            # direct construction is kept because only `tag` is available.
            current_p = bs4.Tag(name="p")
            new_p_tags.append(current_p)
            child.insert_before(current_p)
        current_p.append(child)

    for p in new_p_tags:
        _trim_edge(p, 0, str.lstrip)
        _trim_edge(p, -1, str.rstrip)
        # A run that was only whitespace leaves an empty <p>; discard it.
        if not p.contents:
            p.decompose()
# Demo: parse a small document, print it, wrap its loose text in <p> tags,
# then print the modified tree.
html = """
<body>
This is text 1.
<pre>stuff</pre>
This is text 2.
<pre>stuff</pre>
This is text 3.
</body>
"""
soup = bs4.BeautifulSoup(markup=html, features="lxml")
print(soup)  # before
add_p_tags(tag=soup.body)
print(soup)  # after
The code to trim the leading/trailing whitespace from the newly added <p> elements was more awkward than I expected. There did not seem to be an easy way to get the first/last child element and then operate on it if it existed, so I had to render the .children generator to a complete list and grab the first/last items. In fact, I had to do this twice: if the <p> contained a single string that was deleted while trimming the leading whitespace, the trailing-whitespace step needed to see the newly empty .children list.
If someone can think of any ways to make this code more elegant, feel free to share!
- Chris