import urllib
import xml.dom.minidom
# When true, descend() additionally dumps every attribute of each node.
VERBOSE = 0
url = "http://rdfweb.org/people/danbri/rdfweb/danbri-foaf.rdf"
# url = "http://xmlns.com/foaf/0.1/index.rdf"

# Fetch the document, making sure the connection is released even if the
# read fails part-way through.
inf = urllib.urlopen(url)
try:
    R = inf.read()
finally:
    inf.close()

# Parse the raw text into a DOM tree, then echo the raw text followed by a
# blank line.
dom = xml.dom.minidom.parseString(R)
print(R)
print("")
def isTextNode(node):
    """Tell whether *node* is a DOM text node, judged by its ``nodeType``."""
    text_kind = xml.dom.minidom.Node.TEXT_NODE
    return node.nodeType == text_kind
def emptyTextNode(node):
    """Report whether *node* is a text node containing nothing but whitespace.

    Only the characters space, newline, carriage return and tab count as
    whitespace.  A text node whose data is the empty string is considered
    empty as well.

    Returns False for any node that is not a text node, otherwise True or
    False according to the node's data.
    """
    if node.nodeType != xml.dom.minidom.Node.TEXT_NODE:
        return False
    # Stripping the four whitespace characters leaves an empty string exactly
    # when every character was one of them.  Unlike folding with reduce(),
    # this also copes with zero-length data instead of raising TypeError.
    return not node.data.strip(" \n\r\t")
def indent(k):
    """Build the indentation prefix for nesting depth *k*: *k* spaces."""
    return " " * k
def descend(doc, k=0):
    """Print a recursive dump of the DOM node *doc* to stdout.

    *k* is the current nesting depth and controls the indentation of every
    line printed for this node.  Attributes are dumped two levels deeper
    than their element, children one level deeper.  Child text nodes that
    contain only whitespace are skipped.
    """
    kinds = xml.dom.minidom.Node
    pad = indent(k)

    # Identity of the node.
    print(pad + repr(doc))
    print(pad + doc.nodeName)

    # Payload, for the node types that carry one.
    if doc.nodeType == kinds.TEXT_NODE:
        print(pad + "data: " + repr(doc.data))
    if doc.nodeType == kinds.ATTRIBUTE_NODE:
        print(pad + "value: " + repr(doc.nodeValue))
    if doc.namespaceURI:
        print(pad + "namespace: " + repr(doc.namespaceURI))

    # Attributes, read from the node's _attrs mapping when it has one.
    if hasattr(doc, "_attrs"):
        attr_names = doc._attrs.keys()
        if attr_names:
            print(pad + "Attributes:\n")
            for attr_name in attr_names:
                descend(doc._attrs[attr_name], k + 2)
            print("")

    # Dump all kinds of extra info
    if VERBOSE:
        for member in dir(doc):
            print(pad + " " + member + ": " + repr(getattr(doc, member)))

    # Recurse into the children, leaving out whitespace-only text.
    for child in doc.childNodes:
        if emptyTextNode(child):
            continue
        descend(child, k + 1)
# Dump the whole parsed document, starting at nesting depth 0.
descend(dom)
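"""Fetch a FOAF RDF/XML document and dump its DOM tree or the triples in it.

Options:
  -d, --descend     print an indented dump of the DOM tree
  -t, --triples     pretty-print the (subject, predicate, object) triples
  -r, --readfile    with -d or -t, also print the raw document text
  -v, --verbose     print extra diagnostic detail
  -w, --whitespace  include whitespace-only text nodes in the DOM dump
  -?, -h, --help    print this text
"""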
import sys
import pprint
import urllib
import xml.dom.minidom
# When true, descend() and makeTriples() print extra diagnostic detail
# (switched on by -v / --verbose).
verbose = 0
# When true, descend() also shows text nodes that hold only whitespace
# (switched on by -w / --whitespace).
showWhitespace = 0
def init(blabR=0,
         url="http://rdfweb.org/people/danbri/rdfweb/danbri-foaf.rdf"):
    """Fetch an RDF/XML document and parse it into the global ``dom``.

    blabR -- when true, also print the raw document text followed by a
             blank line.
    url   -- address to fetch; defaults to the FOAF file this script was
             written against ("http://xmlns.com/foaf/0.1/index.rdf" is
             another document it has been tried with).

    The parsed document is stored in the module-level name ``dom``; nothing
    is returned.
    """
    global dom
    inf = urllib.urlopen(url)
    try:
        R = inf.read()
    finally:
        # Release the connection even if the read fails part-way through.
        inf.close()
    dom = xml.dom.minidom.parseString(R)
    if blabR:
        print(R)
        print("")
def isTextNode(node):
    """Return whether the ``nodeType`` of *node* marks it as a text node."""
    node_kind = node.nodeType
    return node_kind == xml.dom.minidom.Node.TEXT_NODE
def emptyTextNode(node):
    """Report whether *node* is a text node containing nothing but whitespace.

    Only the characters space, newline, carriage return and tab count as
    whitespace.  A text node whose data is the empty string is considered
    empty as well.

    Returns False for any node that is not a text node, otherwise True or
    False according to the node's data.
    """
    if node.nodeType != xml.dom.minidom.Node.TEXT_NODE:
        return False
    # Stripping the four whitespace characters leaves an empty string exactly
    # when every character was one of them.  Unlike folding with reduce(),
    # this also copes with zero-length data instead of raising TypeError.
    return not node.data.strip(" \n\r\t")
def descend(doc, k=0):
    """Print a recursive dump of the DOM node *doc* to stdout.

    *k* is the nesting depth; each line for this node is prefixed with *k*
    spaces.  Attributes are dumped two levels deeper than their element,
    children one level deeper.  Whitespace-only child text nodes are shown
    only when the module-level ``showWhitespace`` switch is on, and the
    module-level ``verbose`` switch adds a listing of every attribute of
    the node object.
    """
    node_kinds = xml.dom.minidom.Node
    prefix = " " * k  # indentation

    # Identity of the node.
    print(prefix + repr(doc))
    print(prefix + doc.nodeName)

    # Payload, for the node types that carry one.
    if doc.nodeType == node_kinds.TEXT_NODE:
        print(prefix + "data: " + repr(doc.data))
    if doc.nodeType == node_kinds.ATTRIBUTE_NODE:
        print(prefix + "value: " + repr(doc.nodeValue))
    if doc.namespaceURI:
        print(prefix + "namespace: " + repr(doc.namespaceURI))

    # Attributes, read from the node's _attrs mapping when it has one.
    if hasattr(doc, "_attrs"):
        names = doc._attrs.keys()
        if names:
            print(prefix + "Attributes:\n")
            for name in names:
                descend(doc._attrs[name], k + 2)
            print("")

    # Dump all kinds of extra info
    if verbose:
        for member in dir(doc):
            print(prefix + " " + member + ": " + repr(getattr(doc, member)))

    # Recurse into the children; whitespace-only text is skipped unless the
    # user asked to see it.
    for child in doc.childNodes:
        if not showWhitespace and emptyTextNode(child):
            continue
        descend(child, k + 1)
#############################################
# Translate DOM into triples
class Item:
    """One member of a triple: a name plus the namespace it belongs to.

    Created without a node, both ``name`` and ``namespace`` are None and the
    caller fills them in.  Created from a minidom node, they are derived
    from the node according to its type:

    document  -- name is the document's repr
    attribute -- name is the attribute's value
    element   -- name is the element's repr without the leading
                 "<DOM Element: " and the trailing ">" (in minidom that is
                 the tag name followed by the object's address)
    text      -- name is the node's text; namespace is None

    For every type but text, namespace is the node's ``namespaceURI``.
    Any other kind of node raises Exception carrying the node's repr.
    """

    def __init__(self, node=None):
        if node is None:
            self.name = None
            self.namespace = None
            return
        # Decide by nodeType rather than by matching prefixes of repr(node):
        # the repr of documents and attributes differs between Python
        # versions, while nodeType is part of the DOM interface.
        kinds = xml.dom.minidom.Node
        kind = node.nodeType
        if kind == kinds.DOCUMENT_NODE:
            self.name = repr(node)
            self.namespace = node.namespaceURI
        elif kind == kinds.ATTRIBUTE_NODE:
            self.name = node.value
            self.namespace = node.namespaceURI
        elif kind == kinds.ELEMENT_NODE:
            elemprequel = "<DOM Element: "
            rn = repr(node)
            if rn.startswith(elemprequel):
                # Keep the established naming: repr text between the
                # prefix and the closing ">".
                self.name = rn[len(elemprequel):-1]
            else:
                # Unfamiliar repr format; fall back to the tag name.
                self.name = node.tagName
            self.namespace = node.namespaceURI
        elif kind == kinds.TEXT_NODE:
            # Use the text itself.  Slicing it out of the repr would cut it
            # off after ten characters, and the old slice also started at
            # the wrong offset.
            self.name = node.data
            self.namespace = None
        else:
            raise Exception(repr(node))

    def __repr__(self):
        # til I think of something smarter
        if self.name is None:
            # repr() must return a string; a blank Item has no name yet.
            return "Item()"
        return self.name
def makeTriples(node, triplist=None):
    """Collect (subject, predicate, object) triples from *node* downwards.

    *node* becomes the subject.  Each child element is a predicate whose
    non-whitespace children are its objects, and each of those objects is
    in turn processed as a subject.  Each attribute of *node* is a predicate
    whose object is the attribute's value.

    node     -- minidom node to start from.
    triplist -- list to append to; a new list is created when omitted.

    Returns the list, holding tuples of three Item instances.
    """
    # Create the list per call.  A literal [] default is built once and
    # shared, so repeated calls would keep adding to the same list.
    if triplist is None:
        triplist = []

    def hackPredicate(subj, x, triplist=triplist):
        """Append the triples in which *x* is the predicate of *subj*."""
        if x.nodeType == xml.dom.minidom.Node.TEXT_NODE:
            # Text directly under a subject yields no triple.
            pass
        elif x.nodeType == xml.dom.minidom.Node.ATTRIBUTE_NODE:
            # The name is made up of two parts separated by
            # a colon. The first part refers to a namespace,
            # which was given in one of the namespace things
            # for the document "xmlns:foobar". The second
            # part is a section within that document.
            #
            # At least, that's my theory, but I can't chase
            # down rdf:about and rdf:resource that way; they
            # should be defined in
            # http://www.w3.org/1999/02/22-rdf-syntax-ns#
            # but they aren't.
            #
            # So the handling of attribute nodes below is not
            # known to be right.
            #
            if verbose:
                print("hacking predicate attribute %s" % (x.name,))
                print("value %s" % (x.value,))
            # Namespace URI followed by the part of the name after the
            # prefix.  Names without a prefix are used whole and a missing
            # namespace contributes nothing, so such attributes no longer
            # raise.
            local = x.name.split(":", 1)[-1]
            pred = Item()
            pred.name = (x.namespaceURI or "") + local
            pred.namespace = x.namespaceURI
            obj = Item()
            obj.name = x.value
            triplist.append((subj, pred, obj))
            #
            # Not handled yet: a predicate that itself has a bunch of
            # attributes. That should be handled in here somewhere,
            # and it's how you pick up the namespaces for your RDF
            # document. Presumably you'd pass a list of known
            # namespace prefixes (xmlns, rdf, rdfs, foaf, etc) down
            # thru the descent.
            #
        elif x.nodeType == xml.dom.minidom.Node.ELEMENT_NODE:
            if verbose:
                print("hacking predicate element %s" % (x,))
            pred = Item(x)
            for y in x.childNodes:
                if not emptyTextNode(y):
                    obj = Item(y)
                    triplist.append((subj, pred, obj))
                    # descend
                    makeTriples(y, triplist)
        else:
            # Report the node that could not be classified (x), not the
            # subject node it was found under.
            print("unknown predicate node %s" % (x,))

    if verbose:
        print("hacking %s" % (node,))
    subj = Item(node)
    for x in node.childNodes:
        hackPredicate(subj, x)
    # Attributes are read from the node's _attrs mapping; nodes without one,
    # or with it unset, simply contribute no attribute triples.
    attrs = getattr(node, "_attrs", None)
    if attrs:
        for at in attrs.keys():
            hackPredicate(subj, attrs[at])
    return triplist
#############################################
# Command-line switches: rflag echoes the raw document, dflag dumps the DOM
# tree, tflag prints the triples.
rflag = 0
dflag = 0
tflag = 0
if __name__ == "__main__":
    import getopt
    try:
        opts, remains = getopt.getopt(sys.argv[1:],
                                      "d?htvwr",
                                      ["descend", "help", "triples",
                                       "verbose", "whitespace", "readfile"])
    except getopt.GetoptError:
        # An unrecognised option is a usage error, not a crash: report it
        # on stderr and stop with the conventional exit status.
        sys.stderr.write("%s\n" % (sys.exc_info()[1],))
        sys.exit(2)
    for opt, _value in opts:
        if opt in ("-?", "-h", "--help"):
            print(__doc__)
        elif opt in ("-r", "--readfile"):
            rflag = 1
        elif opt in ("-d", "--descend"):
            dflag = 1
        elif opt in ("-t", "--triples"):
            tflag = 1
        elif opt in ("-v", "--verbose"):
            verbose = 1
        elif opt in ("-w", "--whitespace"):
            showWhitespace = 1
    # The document is fetched only when some output that needs it was
    # requested.
    if dflag or tflag:
        init(rflag)
    if dflag:
        descend(dom)
    if tflag:
        pprint.pprint(makeTriples(dom))