Thanks Mike,
I eventually solved the problem as follows
(after installing the htmlentities gem; see
http://htmlentities.rubyforge.org/):
# Flattens the markup in raw/test.html and writes the result to
# processed/test.html:
#   * <p> elements are unwrapped (their children are kept in place),
#   * <span class="T2"> is replaced by its upper-cased text,
#   * <span class="T5"> is replaced by its text wrapped in <em>,
#   * every remaining <span> is unwrapped,
#   * HTML entities in the serialized result are decoded.
# Relies on Nokogiri and HTMLEntities already being loaded.
coder = HTMLEntities.new

# File.read closes the handle itself, even if reading raises.
source = File.read('raw/test.html').encode('UTF-8')
doc = Nokogiri::XML::DocumentFragment.parse(source)

# Unwrap by moving the existing child nodes instead of re-parsing
# inner_html: re-parsing creates fresh copies of nested elements, so a
# nested <p>/<span> already collected in the node set would be detached
# and its copy in the document would never be unwrapped.
doc.css('p').each { |para| para.replace(para.children) }

# NOTE(review): the replacement strings below are parsed as markup, so text
# containing "<" or "&" is re-interpreted rather than inserted literally.
doc.css('span.T2').each { |span| span.replace(span.content.upcase) }
doc.css('span.T5').each { |span| span.replace("<em>#{span.content}</em>") }

# Any span left at this point carries no special class; drop the wrapper only.
doc.css('span').each { |span| span.replace(span.children) }

# Block form guarantees the output file is flushed and closed on error too.
File.open('processed/test.html', 'w') do |out|
  out.write(coder.decode(doc.to_s))
end