Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.

Dismiss

RDF hacking in Python

2 views

Skip to first unread message

Will Ware

unread,

Aug 26, 2003, 2:19:41 AM (8/26/03)

#!/usr/bin/python
"""RDF is an important part of the semantic web, which will some day
make the WWW machine-readable, hopefully. Smart agents, automated
scholarship, artificial intelligence, the Singularity, yadda yadda
yadda."""

import urllib
import xml.dom.minidom

# When true, descend() additionally dumps every attribute of every node.
VERBOSE = 0

# Document to fetch; the commented-out line is an alternative test input.
url = "http://rdfweb.org/people/danbri/rdfweb/danbri-foaf.rdf"
# url = "http://xmlns.com/foaf/0.1/index.rdf"

# Download the document.  The handle is closed in a finally clause so the
# connection is released even when read() fails part-way through.
inf = urllib.urlopen(url)
try:
    R = inf.read()
finally:
    inf.close()
dom = xml.dom.minidom.parseString(R)

# Echo the raw text, then a blank separator line, ahead of the tree dump.
print(R)
print("")

def isTextNode(node):
    """Return true when *node*'s nodeType is the DOM TEXT_NODE constant."""
    text_kind = xml.dom.minidom.Node.TEXT_NODE
    return node.nodeType == text_kind

def emptyTextNode(node):
    """Return true when *node* is a text node containing only whitespace.

    Whitespace here means exactly space, newline, carriage return and tab.
    Nodes of any other type yield a false result.  A text node whose data
    is the empty string counts as empty (folding the characters with
    reduce() and no initial value raised TypeError for that input).
    """
    if node.nodeType != xml.dom.minidom.Node.TEXT_NODE:
        return False
    # Stripping the four accepted characters leaves nothing behind exactly
    # when every character of the data is one of them (or there are none).
    return node.data.strip(" \n\r\t") == ""

def indent(k):
    """Return the indentation prefix for nesting depth *k*: one space per level."""
    return " " * k

def descend(doc, k=0):
    """Print a dump of the DOM node *doc* and, recursively, its subtree.

    doc -- an xml.dom.minidom node (document, element, attribute, text...).
    k   -- current nesting depth; every printed line is prefixed with
           indent(k).

    For each node this prints its repr and nodeName, then the data of text
    nodes, the value of attribute nodes and any non-empty namespace URI.
    Attribute nodes are dumped two levels deeper than their owner, child
    nodes one level deeper.  Whitespace-only text children are skipped.
    Nothing is returned.
    """
    ind = indent(k)
    kinds = xml.dom.minidom.Node

    print(ind + repr(doc))
    print(ind + doc.nodeName)

    if doc.nodeType == kinds.TEXT_NODE:
        print(ind + "data: " + repr(doc.data))
    if doc.nodeType == kinds.ATTRIBUTE_NODE:
        print(ind + "value: " + repr(doc.nodeValue))
    if doc.namespaceURI:
        print(ind + "namespace: " + repr(doc.namespaceURI))

    # Use the public DOM attribute map rather than minidom's private
    # _attrs storage; it is None (or absent) on nodes that cannot carry
    # attributes.
    attrs = getattr(doc, "attributes", None)
    if attrs is not None:
        attr_nodes = attrs.values()
        if attr_nodes:
            print(ind + "Attributes:\n")
            for at in attr_nodes:
                descend(at, k + 2)
            print("")

    # Dump all kinds of extra info
    if VERBOSE:
        for z in dir(doc):
            print(ind + " " + z
                  + ": " + repr(getattr(doc, z)))

    for x in doc.childNodes:
        if not emptyTextNode(x):
            descend(x, k + 1)

# Dump the whole parsed document, starting from the node returned by
# parseString() at depth 0.
descend(dom)

Will Ware

unread,

Aug 27, 2003, 12:15:06 AM (8/27/03)

#!/usr/bin/python

"""Put a useful docstring here.
"""

import sys
import pprint
import urllib
import xml.dom.minidom

# Debug switch: when true, descend() dumps every attribute of each node and
# makeTriples() prints a trace of the nodes it visits.  Set by -v/--verbose.
verbose = 0
# When true, descend() also shows whitespace-only text nodes.
# Set by -w/--whitespace.
showWhitespace = 0

def init(blabR=0,
         url="http://rdfweb.org/people/danbri/rdfweb/danbri-foaf.rdf"):
    """Download an XML document and parse it into the module-level ``dom``.

    blabR -- when true, print the raw downloaded text followed by a blank
             line.
    url   -- address of the document to fetch.  The default is the FOAF
             file this script was written against;
             "http://xmlns.com/foaf/0.1/index.rdf" is another test input.

    Side effect: rebinds the global ``dom`` to the parsed document.
    Returns None.
    """
    global dom

    inf = urllib.urlopen(url)
    try:
        R = inf.read()
    finally:
        # Release the connection even if the read fails.
        inf.close()
    dom = xml.dom.minidom.parseString(R)

    if blabR:
        print(R)
        print("")

def isTextNode(node):
    """Return true when *node*'s nodeType is the DOM TEXT_NODE constant."""
    text_kind = xml.dom.minidom.Node.TEXT_NODE
    return node.nodeType == text_kind

def emptyTextNode(node):
    """Return true when *node* is a text node containing only whitespace.

    Whitespace here means exactly space, newline, carriage return and tab.
    Nodes of any other type yield a false result.  A text node whose data
    is the empty string counts as empty (folding the characters with
    reduce() and no initial value raised TypeError for that input).
    """
    if node.nodeType != xml.dom.minidom.Node.TEXT_NODE:
        return False
    # Stripping the four accepted characters leaves nothing behind exactly
    # when every character of the data is one of them (or there are none).
    return node.data.strip(" \n\r\t") == ""

def descend(doc, k=0):
    """Print the repr and nodeName of *doc*, then recurse into its children.

    doc -- a DOM node exposing nodeName and childNodes.
    k   -- nesting depth; each printed line is prefixed with k spaces.

    With the global ``verbose`` set, every name in dir(doc) is printed with
    the repr of its value.  Whitespace-only text children are skipped
    unless the global ``showWhitespace`` is set.  Returns None.
    """
    prefix = " " * k  # indentation

    print(prefix + repr(doc))
    print(prefix + doc.nodeName)

    # Dump all kinds of extra info
    if verbose:
        for attr_name in dir(doc):
            detail = repr(getattr(doc, attr_name))
            print(prefix + " " + attr_name + ": " + detail)

    for child in doc.childNodes:
        # Blank text nodes are noise unless the caller asked to see them.
        if not showWhitespace and emptyTextNode(child):
            continue
        descend(child, k + 1)

#############################################
# Translate DOM into triples

class Item:
    """One member (subject, predicate or object) of a triple.

    Attributes:
        name      -- identifying text; None for an Item built without a node.
        namespace -- namespace URI of the originating node, or None.

    Item() with no argument creates a blank item whose fields the caller
    fills in.  Item(node) derives both fields from a DOM node:

        document   name is repr(node)
        attribute  name is the attribute value
        element    name is "<tagName> at <address>"; the address keeps two
                   elements with the same tag distinguishable
        text       name is the node's full character data, namespace None

    Any other node type raises Exception carrying repr(node).
    """

    def __init__(self, node=None):
        self.name = None
        self.namespace = None
        if node is None:
            return

        # Dispatch on the DOM node type rather than on the wording of
        # repr(node), which is an implementation detail of minidom.
        kinds = xml.dom.minidom.Node
        kind = node.nodeType
        if kind == kinds.DOCUMENT_NODE:
            self.name = repr(node)
            self.namespace = node.namespaceURI
        elif kind == kinds.ATTRIBUTE_NODE:
            self.name = node.value
            self.namespace = node.namespaceURI
        elif kind == kinds.ELEMENT_NODE:
            # The same text the element's repr carries between
            # "<DOM Element: " and the closing ">".
            self.name = "%s at %#x" % (node.tagName, id(node))
            self.namespace = node.namespaceURI
        elif kind in (kinds.TEXT_NODE, kinds.CDATA_SECTION_NODE):
            # Take the data itself: slicing it out of repr() used the wrong
            # prefix length and inherited repr's 10-character truncation.
            self.name = node.data
            self.namespace = None
        else:
            raise Exception(repr(node))

    def __repr__(self):
        # til I think of something smarter
        # str() keeps this legal for a blank Item whose name is still None.
        return str(self.name)

def makeTriples(node, triplist=None):
    """Collect (subject, predicate, object) triples from the DOM below *node*.

    node     -- DOM node treated as the subject.
    triplist -- list to append to; a new list is created when omitted, so
                separate top-level calls no longer share results.

    Each child element of *node* becomes a predicate whose non-blank
    children are the objects (and are then visited recursively as
    subjects).  Each attribute of *node* becomes a predicate whose object
    is the attribute value.  Returns the list of 3-tuples of Item.
    """
    if triplist is None:
        triplist = []

    kinds = xml.dom.minidom.Node

    def hackPredicate(subj, x):
        """Record the triples that predicate node *x* contributes to *subj*."""
        if x.nodeType == kinds.TEXT_NODE:
            # Text sitting directly under the subject is ignored.
            pass
        elif x.nodeType == kinds.ATTRIBUTE_NODE:
            # The name is made up of two parts separated by
            # a colon. The first part refers to a namespace,
            # which was given in one of the namespace things
            # for the document "xmlns:foobar". The second
            # part is a section within that document.
            #
            # At least, that's my theory, but I can't chase
            # down rdf:about and rdf:resource that way; they
            # should be defined in
            # http://www.w3.org/1999/02/22-rdf-syntax-ns#
            # but they aren't.
            #
            # So what follows may still be WRONG for handling
            # attribute nodes.
            #
            if verbose:
                print("hacking predicate attribute %s" % (x.name,))
                print("value %s" % (x.value,))
            pred = Item()
            # Namespace URI followed by the part of the name after the
            # prefix.  Unprefixed attributes have no colon and may have no
            # namespace; handle both instead of raising.
            local = x.name.split(":", 1)[-1]
            pred.name = (x.namespaceURI or "") + local
            pred.namespace = x.namespaceURI
            obj = Item()
            obj.name = x.value
            triplist.append((subj, pred, obj))
            #
            # As if that all weren't enough humiliation, there
            # are cases where a predicate has a bunch of
            # attributes. That should be handled in here
            # somewhere, and it's how you pick up the namespaces
            # for your RDF document. Presumably you'd pass a list
            # of known namespace prefixes (xmlns, rdf, rdfs, foaf,
            # etc) down through the descent.
            #
        elif x.nodeType == kinds.ELEMENT_NODE:
            if verbose:
                print("hacking predicate element %s" % (x,))
            pred = Item(x)
            for y in x.childNodes:
                if not emptyTextNode(y):
                    obj = Item(y)
                    triplist.append((subj, pred, obj))
                    # descend
                    makeTriples(y, triplist)
        else:
            # Report the node that could not be handled (x), not its parent.
            print("unknown predicate node %s" % (x,))

    if verbose:
        print("hacking %s" % (node,))
    subj = Item(node)
    for x in node.childNodes:
        hackPredicate(subj, x)
    # Public DOM attribute map instead of minidom's private _attrs; it is
    # None (or absent) on nodes that cannot carry attributes.
    attrs = getattr(node, "attributes", None)
    if attrs is not None:
        for at in attrs.values():
            hackPredicate(subj, at)
    return triplist

#############################################

rflag = 0  # -r/--readfile: passed to init() so the raw document is echoed
dflag = 0  # -d/--descend: dump the DOM tree with descend()
tflag = 0  # -t/--triples: pretty-print the result of makeTriples()

if __name__ == "__main__":
    import getopt

    try:
        opts, remains = getopt.getopt(sys.argv[1:],
                                      "d?htvwr",
                                      ["descend", "help", "triples",
                                       "verbose", "whitespace", "readfile"])
    except getopt.GetoptError:
        # Unknown or malformed option: report it briefly and exit with a
        # usage-error status instead of dying with a traceback.
        sys.stderr.write("%s\n" % (sys.exc_info()[1],))
        sys.exit(2)

    for opt, _value in opts:

        if opt in ("-?", "-h", "--help"):
            print(__doc__)

        elif opt in ("-r", "--readfile"):
            rflag = 1

        elif opt in ("-d", "--descend"):
            dflag = 1

        elif opt in ("-t", "--triples"):
            tflag = 1

        elif opt in ("-v", "--verbose"):
            verbose = 1

        elif opt in ("-w", "--whitespace"):
            showWhitespace = 1

    # The document is only fetched when a dump was requested.
    # NOTE(review): -r on its own therefore does nothing -- confirm that
    # this is intended.
    if dflag or tflag:
        init(rflag)
    if dflag:
        descend(dom)
    if tflag:
        pprint.pprint(makeTriples(dom))

0 new messages