Currently, if the DOCTYPE declaration doesn't parse correctly,
BeautifulSoup puts the entire document into a single NavigableString.
The problem is in the exception handler in parse_declaration(), which
passes the entire rest of the document to handle_data(). Here is a
version of parse_declaration() that attempts to skip just the
declaration itself.
Kent
def parse_declaration(self, i):
    """Treat a bogus SGML declaration as raw data. Treat a CDATA
    declaration as a CData object.

    `i` is the offset in self.rawdata at which the declaration starts.
    Returns the offset just past the text that was consumed.
    """
    rawdata = self.rawdata
    if rawdata[i:i+9] == '<![CDATA[':
        k = rawdata.find(']]>', i)
        if k == -1:
            # Unterminated CDATA section: take everything that is left.
            k = len(rawdata)
        data = rawdata[i+9:k]
        # Step over the closing ']]>'. For an unterminated section this
        # offset lies past the end of rawdata.
        j = k + 3
        self._toStringSubclass(data, CData)
    else:
        try:
            j = SGMLParser.parse_declaration(self, i)
        except SGMLParseError:
            # Could not parse the declaration. Try to hand over just the
            # DOCTYPE declaration that starts at offset i as data.
            #
            # The pattern is matched *at* i rather than searched for from
            # the start of rawdata: a DOCTYPE found before i would yield
            # an empty slice (no progress, j == i), and one found after i
            # would swallow everything in between.
            doctype = re.compile(r'<!DOCTYPE[^>]*>', re.IGNORECASE)
            match = doctype.match(rawdata, i)
            if match:
                toHandle = match.group()
            else:
                # Not a recognisable DOCTYPE: fall back to treating the
                # rest of the document as data.
                toHandle = rawdata[i:]
            self.handle_data(toHandle)
            j = i + len(toHandle)
    return j