Robin Becker
unread,Nov 2, 2021, 8:55:40 AM11/2/21You do not have permission to delete messages in this group
Either email addresses are anonymous for this group or you need the view member email addresses permission to view the original message
to
I'm having a problem using lxml.etree to make a tree-building (target) parser that also validates. In my test code,
invalid XML is detected and an error is raised only when the line target=ET.TreeBuilder(), below is commented out.
The validation error then looks as expected:
> python tlxml.py invalid.rml
> re.compile('^.*(?:\\W|\\b)(?P<fn>dynamic_rml\\.dtd|rml\\.dtd|rml_0_2\\.dtd|rml_0_3\\.dtd|rml_1_0\\.dtd)$', re.MULTILINE)
> Resolving url='../rml.dtd' context=<lxml.etree._ParserContext object at 0x7f66103273c0> dtdPath='rml.dtd'
> Traceback (most recent call last):
> File "/home/robin/devel/reportlab/REPOS/rlextra/tmp/tlxml.py", line 78, in <module>
> tree = ET.parse(sys.argv[1],parser)
> File "src/lxml/etree.pyx", line 3521, in lxml.etree.parse
> File "src/lxml/parser.pxi", line 1859, in lxml.etree._parseDocument
> File "src/lxml/parser.pxi", line 1885, in lxml.etree._parseDocumentFromURL
> File "src/lxml/parser.pxi", line 1789, in lxml.etree._parseDocFromFile
> File "src/lxml/parser.pxi", line 1177, in lxml.etree._BaseParser._parseDocFromFile
> File "src/lxml/parser.pxi", line 615, in lxml.etree._ParserContext._handleParseResultDoc
> File "src/lxml/parser.pxi", line 725, in lxml.etree._handleParseResult
> File "src/lxml/parser.pxi", line 654, in lxml.etree._raiseParseError
> File "invalid.rml", line 23
> lxml.etree.XMLSyntaxError: No declaration for attribute x of element place1, line 23, column 55
When target=ET.TreeBuilder() is active, the validation does not happen; the tree is built and passed to the
primitive tuple tree builder, so the output looks like:
> $ python tlxml.py invalid.rml
> Resolving url='../rml.dtd' context=<lxml.etree._TargetParserContext object at 0x7f73d7b159c0> dtdPath='rml.dtd'
> ('document',
> {'filename': 'test_000_simple.pdf', 'invariant': '1'},
> ['\n\n',
> ('stylesheet',
> ........
> None,
> 44),
> '\n \t\t\n \t\t'],
> 40),
> '\n'],
> 35),
> '\n\n'],
> 2)
If I use the standard EchoTarget example as the target, the invalid document is also not rejected, so I assume that
passing the target argument is what disables validation. Is there a way to get validation to work with a target?
The code is:
######################################################################################################
from pprint import pprint
from lxml import etree as ET
import sys, os, re
from rlextra.rml2pdf.rml2pdf import CompatibleDTDNames as rmlDTDPat
# Rebind rmlDTDPat: on entry it is the imported collection of DTD file names;
# afterwards it is a compiled pattern matching any string that ends in one of
# those names, with the matched file name captured as group 'fn'.
rmlDTDPat = re.compile(
    '^.*(?:\\W|\\b)(?P<fn>%s)$' % '|'.join(map(re.escape, rmlDTDPat)),
    re.MULTILINE,
)
class TT:
    """Convert an element tree into nested plain tuples.

    Calling an instance with an element returns
    ``(tag, attrib or None, content, sourceline)`` where ``content`` is the
    value produced by :meth:`content`.
    """

    def __call__(self, e):
        """Return the tuple form of element *e* (recursing into its children)."""
        return (e.tag, e.attrib or None, self.content(e), e.sourceline)

    def content(self, e):
        """Return the mixed content of *e*.

        Returns ``None`` when *e* has neither text nor children; otherwise a
        list holding, in document order, the element's leading text (if any),
        the tuple form of each child, and each child's tail text (if any).
        """
        text = e.text
        # Explicit len() check: the test is on the number of children, not on
        # the truth value of the element object.
        if len(e) == 0 and text is None:
            return None
        items = [] if text is None else [text]
        for child in e:
            items.append(self(child))
            tail = child.tail
            if tail is not None:
                items.append(tail)
        return items
class RMLDTDResolver(ET.Resolver):
    """Resolver that answers RML DTD requests from the copies shipped with rlextra.

    A URL that matches ``rmlDTDPat`` is answered with the text of ``rml.dtd``
    or ``dynamic_rml.dtd`` read from the directory of the ``rlextra.rml2pdf``
    module file; any other URL gets ``None``.
    """

    # DTD file name -> DTD text; stays None until the first matching request.
    __dtds = None

    def resolve(self, url, id, context):
        """Return the bundled DTD for *url*, or ``None`` if *url* is not an RML DTD.

        *id* is accepted for the resolver interface but not used here.
        *context* is passed through to ``resolve_string``.
        """
        m = rmlDTDPat.match(url)
        if not m:
            # NOTE(review): presumably lxml treats None as "not handled here"
            # and falls back to its other resolvers - confirm in lxml docs.
            return None

        if self.__dtds is None:
            from rlextra import rml2pdf
            dtd_dir = os.path.dirname(rml2pdf.__file__)
            loaded = {}
            for name in ('rml.dtd', 'dynamic_rml.dtd'):
                with open(os.path.join(dtd_dir, name), 'r') as fh:
                    loaded[name] = fh.read()
            # Publish the cache only once both files were read, so a failed
            # read cannot leave a half-filled dict behind.
            self.__dtds = loaded

        fn = m.group('fn')
        # Every 'rml*.dtd' alias maps to rml.dtd; the only other name the
        # cache holds is 'dynamic_rml.dtd' (the former 'dynamic.dtd' key was
        # never loaded and raised KeyError).
        dtdPath = 'rml.dtd' if fn.startswith('rml') else 'dynamic_rml.dtd'
        print(f"Resolving url={url!r} context={context!r} {dtdPath=}")
        return self.resolve_string(
            self.__dtds[dtdPath],
            context,
        )
# Build a DTD-validating parser whose events are fed to a TreeBuilder target.
parser = ET.XMLParser(
    target=ET.TreeBuilder(),  # if commented the parser validates
    load_dtd=True,
    dtd_validation=True,
    attribute_defaults=True,
    resolve_entities=True,
    no_network=True,
    remove_comments=True,
    remove_pis=True,
    strip_cdata=True,
)

# Serve the RML DTDs from the installed rlextra package.
parser.resolvers.add(RMLDTDResolver())

# Parse the file named on the command line and dump it as nested tuples.
tree = ET.parse(sys.argv[1], parser)
pprint(TT()(tree))
######################################################################################################