Wrapping sequences of non-block elements in <p>

35 views
Skip to first unread message

Chris Papademetrious

unread,
Sep 4, 2024, 10:16:24 PM
to beautifulsoup
Hello fellow Soupers!

I need to wrap sequences of non-block tag/string elements in <p>. For example, I need to turn this:

<body>
  This is text 1.
  <pre>stuff</pre>
  This is text 2.
  <pre>stuff</pre>
  This is text 3.
</body>

into this:

<body>
  <p>This is text 1.</p>
  <pre>stuff</pre>
  <p>This is text 2.</p>
  <pre>stuff</pre>
  <p>This is text 3.</p>
</body>

I wrote some code to do this:

# Tag names that add_p_tags() treats as block elements: children with one of
# these names are left in place and end the current run of wrapped content.
block_elements = [
    "address", "article", "aside", "blockquote", "canvas",
    "dd", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form",
    "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr",
    "li", "main", "nav", "noscript",
    "ol", "p", "pre",
    "section", "table", "tfoot",
    "ul", "video",
]

import re
import bs4

def add_p_tags(tag: bs4.Tag):
    """
    In given tag, wrap non-block-element content in <p> tags.

    Each run of consecutive children of ``tag`` that are not block elements
    (strings, inline tags, ...) is moved into a newly created <p> inserted
    where the run started.  Children whose tag name is listed in the
    module-level ``block_elements`` stay in place and end the current run.

    Afterwards, leading and trailing whitespace is stripped from the first
    and last text node of every new <p>, and any <p> left without content
    is removed.  Comments, CDATA sections and similar special strings are
    never trimmed or rewritten.

    ``tag`` is modified in place; nothing is returned.
    """

    def is_plain_text(node):
        # Comment, CData, Doctype, ... all derive from NavigableString via
        # PreformattedString.  Replacing one of them with a trimmed str would
        # turn it into ordinary visible text, so only real text qualifies.
        return isinstance(node, bs4.NavigableString) and not isinstance(
            node, bs4.element.PreformattedString
        )

    def trim_edge(p, index, pattern):
        # Remove whitespace matching `pattern` from the child of `p` at
        # `index` (0 = first, -1 = last); drop the child if nothing remains.
        if not p.contents:
            return
        edge = p.contents[index]
        if not is_plain_text(edge):
            return
        text = re.sub(pattern, "", edge.string)
        if text:
            edge.replace_with(text)
        else:
            edge.extract()

    # wrap sequences of non-block elements in <p> tags
    new_p_tags = []
    current_p = None
    # iterate over a snapshot: children are moved out of `tag` as we go
    for e in list(tag.children):
        if isinstance(e, bs4.Tag) and e.name in block_elements:
            # block element - forget about the previous <p>
            current_p = None
            continue

        # non-block element - add to the current <p>
        if current_p is None:
            # no current <p>, so start a new one where this run begins
            current_p = bs4.Tag(name="p")
            new_p_tags.append(current_p)
            e.insert_before(current_p)

        # move this non-block element to the end of the current <p>
        current_p.append(e)

    # clean up any leading/trailing whitespace in our new <p> tags
    for p in new_p_tags:
        # Tag.contents is already a list, so the first/last child can be read
        # directly.  The trailing pass re-reads it, which covers the case of
        # a single string that the leading pass replaced or removed.
        trim_edge(p, 0, r"^\s+")
        trim_edge(p, -1, r"\s+$")

        # if we're left with a completely empty <p>, delete it
        if not p.contents:
            p.decompose()

# Sample document: bare text runs separated by <pre> block elements.
html = """
<body>
  This is text 1.
  <pre>stuff</pre>
  This is text 2.
  <pre>stuff</pre>
  This is text 3.
</body>
"""

# Parse with the lxml parser, then print the tree before and after
# add_p_tags() rewrites the <body> in place.
soup = bs4.BeautifulSoup(html, "lxml")
print(soup)
add_p_tags(soup.find("body"))
print(soup)

The code to trim the leading/trailing spaces from the newly added <p> elements was more awkward than I expected. There did not seem to be an easy way to get the first/last child element, then operate on it if it existed. I had to render the .children generator to a complete list, then grab the first/last items. Actually, I had to do it twice because if there was a single string that was deleted when trimming the leading space, I needed to detect a newly empty .children list for the trailing trimming.

If someone can think of any ways to make this code more elegant, feel free to share!

 - Chris


Chris Papademetrious

unread,
Sep 5, 2024, 1:18:29 PM
to beautifulsoup
I have an improved implementation that moves leading/trailing whitespace within the new <p> to be outside the <p> element instead of deleting it, which preserves any surrounding newlines/indenting:

def add_p_tags(tag: bs4.Tag):
    """
    In given tag, wrap non-block-element content in <p> tags.

    Each run of consecutive children of ``tag`` that are not block elements
    (strings, inline tags, ...) is moved into a newly created <p> inserted
    where the run started.  Children whose tag name is listed in the
    module-level ``block_elements`` stay in place and end the current run.

    Afterwards, leading whitespace of the first text node and trailing
    whitespace of the last text node of every new <p> is moved outside the
    <p> (before/after it) rather than deleted, which preserves surrounding
    newlines and indentation.  Any <p> left without content is removed.
    Comments, CDATA sections and similar special strings are never split
    or rewritten.

    ``tag`` is modified in place; nothing is returned.
    """

    def is_plain_text(node):
        # Comment, CData, Doctype, ... all derive from NavigableString via
        # PreformattedString.  Replacing one of them with a plain str would
        # turn it into ordinary visible text, so only real text qualifies.
        return isinstance(node, bs4.NavigableString) and not isinstance(
            node, bs4.element.PreformattedString
        )

    # wrap sequences of non-block elements in <p> tags
    new_p_tags = []
    current_p = None
    # iterate over a snapshot: children are moved out of `tag` as we go
    for e in list(tag.children):
        if isinstance(e, bs4.Tag) and e.name in block_elements:
            # block element - forget about the previous <p>
            current_p = None
            continue

        # non-block element - add to the current <p>
        if current_p is None:
            # no current <p>, so start a new one where this run begins
            current_p = bs4.Tag(name="p")
            new_p_tags.append(current_p)
            e.insert_before(current_p)

        # move this non-block element to the end of the current <p>
        current_p.append(e)

    # move any leading/trailing whitespace outside our new <p> tags
    for p in new_p_tags:
        # process leading whitespace of first text element
        # (Tag.contents is already a list, so index it directly)
        if p.contents and is_plain_text(p.contents[0]):
            first_child = p.contents[0]
            if match := re.search(r"^(\s+)(.*)$", first_child.string, flags=re.DOTALL):
                whitespace, content = match.groups()
                if content:
                    first_child.replace_with(content)
                else:
                    first_child.extract()
                # `\s+` guarantees the captured whitespace is non-empty
                p.insert_before(whitespace)

        # process trailing whitespace of last text element; contents is
        # re-read because the leading pass may have replaced or removed the
        # only child
        if p.contents and is_plain_text(p.contents[-1]):
            last_child = p.contents[-1]
            if match := re.search(r"^(.*?)(\s+)$", last_child.string, flags=re.DOTALL):
                content, whitespace = match.groups()
                if content:
                    last_child.replace_with(content)
                else:
                    last_child.extract()
                p.insert_after(whitespace)

        # if our <p> is now empty, delete it
        if not p.contents:
            p.decompose()

 - Chris
Reply all
Reply to author
Forward
0 new messages