I would like to share this awk-code for parsing of xml-files.
It was inspired by Steve Coile's xmlparse.awk
which I found at
ftp://ftp.freefriends.org/arnold/Awkstuff/xmlparser.awk
I wanted something more usable (not necessarily more readable) and
so this came out.
I have been using it at work for some time now, where I have to stick with the nawk shipped with Solaris.
There is no excessive documentation, but trying it should make things clear.
Best regards,
Jan
#!/usr/bin/nawk -f
# Demo driver: list every item of the XML file named by the first argument.
# Data sections are skipped (skipData = 1). Each item is printed as
# "XTYPE XITEM", followed by one tab-indented name=value line per attribute.
# If getXML stopped because of a parse error, the message is printed and the
# script exits with status 1.
BEGIN {
    xmlFile = ARGV[1];
    while ( getXML(xmlFile, 1) ) {
        print XTYPE, XITEM;
        for (name in XATTR) {
            print "\t" name "=" XATTR[name];
        }
    }
    if (XERROR) {
        print XERROR;
        exit 1;
    }
}
##
# getXML( file, skipData ): read the next XML item into XTYPE, XITEM, XATTR
#
# Pull-style reader: each call consumes input from `file` until one complete
# item has been recognised, then returns. The read position is remembered in
# _XMLIO, keyed by the file name, and picked up again by the next call.
#
# Parameters:
# file -- path to xml file (also the key of the saved parser state)
# skipData -- flag: do not read "DAT" (data between tags) sections
# External variables (reset or updated by every call):
# XTYPE -- type of item read:
#          "TAG" start tag              "END" end tag
#          "DAT" data between tags      "COM" comment <!-- ... -->
#          "CDA" <![CDATA[ ... ]]>      "PIN" processing instruction <? ... ?>
#          "DTB" <!DOCTYPE ... up to "[" or ">"
#          "DTE" end of the doctype: text from "]" up to ">"
#          "DEC" any other <! ... > declaration
# XITEM -- value of item: the tag name if type is "TAG" or "END", the
#          unescaped text for "DAT", the raw text between the delimiters
#          for all other types
# XATTR -- Map of attributes (name -> unescaped value), only set if
#          XTYPE=="TAG"; an attribute written without "=" gets its own name
#          as value
# XPATH -- Path to current tag, e.g. /TopLevelTag/SubTag1/SubTag2; a start
#          tag is appended before "TAG" is returned, and removed again
#          before the matching "END" is returned
# XLINE -- current line number in input file (lines read so far)
# XNODE -- XTYPE, XITEM, XATTR combined into a single string:
#          XTYPE "\001" XITEM "\001" followed by name=value "\001" for each
#          attribute, in the order the attributes were parsed
# XERROR -- error text ("file:line: message"), set on parse error
# Returns:
# 1 on successful read: XTYPE, XITEM, XATTR are set accordingly
# "" at end of file or parse error, XERROR is set on error; in both cases
#    the file is closed and its saved state is discarded
# Private Data:
# _XMLIO -- buffer, XLINE, XPATH and DOCTYPE flag for open files
##
function getXML( file, skipData ,end,p,q,tag,att,accu,mline,mode,S0,ex,dtd) {
# Everything after skipData in the parameter list is a local variable.
# mode is one of them and is not saved in _XMLIO, so every call starts
# "between items" (mode=="").

# Reset the result variables; split("",XATTR) empties the array.
XTYPE=XITEM=XERROR=XNODE=""; split("",XATTR);
# Restore the state saved for this file at the end of the previous call:
# S0 is the unconsumed rest of the input, dtd is set while inside a DOCTYPE.
S0=_XMLIO[file,"S0"]; XLINE=_XMLIO[file,"line"]; XPATH=_XMLIO[file,"path"]; dtd=_XMLIO[file,"dtd"];
# Loop until an item is complete (XTYPE set), the input ends, or an error
# breaks out of the loop.
while (!XTYPE) {
# Refill the buffer when it is empty; anything but a successful read (EOF or
# read error) ends the loop. RS is appended to the record that getline
# returned, so line breaks stay part of data and of multi-line constructs.
if (S0=="") { if (1!=(getline S0 <file)) break; XLINE++; S0=S0 RS; }
if ( mode == "" ) {
# Start of a new item: remember the line it starts on (used for the
# "unterminated" error) and choose the mode from the leading characters.
# The order of the tests matters: "<!--", "<!DOCTYPE" and "<![CDATA[" must be
# tried before the generic "<!". Text not starting with "<" is data, except
# for a "]" while inside a DOCTYPE, which ends the DOCTYPE (mode "DTE").
# `end` is set to the string that terminates the chosen construct.
mline=XLINE; accu=""; p=substr(S0,1,1);
if ( p!="<" && !(dtd && p=="]") ) mode="DAT";
else if ( p=="]" ) { S0=substr(S0,2); mode="DTE"; end=">"; dtd=0; }
else if ( substr(S0,1,4)=="<!--" ) { S0=substr(S0,5); mode="COM"; end="-->"; }
else if ( substr(S0,1,9)=="<!DOCTYPE" ) { S0=substr(S0,10); mode="DTB"; end=">"; }
else if ( substr(S0,1,9)=="<![CDATA[" ) { S0=substr(S0,10); mode="CDA"; end="]]>"; }
else if ( substr(S0,1,2)=="<!" ) { S0=substr(S0,3); mode="DEC"; end=">"; }
else if ( substr(S0,1,2)=="<?" ) { S0=substr(S0,3); mode="PIN"; end="?>"; }
else if ( substr(S0,1,2)=="</" ) { S0=substr(S0,3); mode="END"; end=">";
# End tag: cut the name off the buffer, remove the last component from XPATH
# (kept in ex) and require that it is the tag being closed.
tag=S0;sub(/[ \n\r\t>].*$/,"",tag);S0=substr(S0,length(tag)+1);
ex=XPATH;sub(/\/[^\/]*$/,"",XPATH);ex=substr(ex,length(XPATH)+2);
if (tag!=ex) { XERROR="unexpected close tag <" ex ">..</" tag ">"; break; } }
else{ S0=substr(S0,2); mode="TAG";
# Start tag: cut the name off the buffer, validate it and append it to XPATH.
# The attributes are read by the TAG/ATTR/EQ/VALUE modes below.
tag=S0;sub(/[ \n\r\t\/>].*$/,"",tag);S0=substr(S0,length(tag)+1);
if ( tag !~ /^[A-Za-z:_][0-9A-Za-z:_.-]*$/ ) { # /^[[:alpha:]:_][[:alnum:]:_.-]*$/
XERROR="invalid tag name '" tag "'"; break; }
XPATH = XPATH "/" tag; } }
else if ( mode == "DAT" ) { # terminated by "<" or EOF
# Inside a DOCTYPE a "]" ends the data as well, whichever comes first.
p=index(S0,"<"); if ( dtd && (q=index(S0,"]")) && (!p || q<p) ) p=q;
if (p) {
# Terminator found: report the data unless skipData is set; in either case
# drop it from the buffer and go back to "between items".
if (!skipData) { XTYPE="DAT"; XITEM=accu unescapeXML(substr(S0,1,p-1)); }
S0=substr(S0,p); mode=""; }
else{ if (!skipData) accu=accu unescapeXML(S0); S0=""; } }
else if ( mode == "TAG" ) { sub(/^[ \n\r\t]*/,"",S0); if (S0=="") continue;
# Inside a start tag, after the name or after a complete attribute.
if ( substr(S0,1,2)=="/>" ) {
# Empty element: report the TAG now and put a synthetic end tag back in
# front of the buffer, so that the next call returns the matching END.
S0=substr(S0,3); mode=""; XTYPE="TAG"; XITEM=tag; S0="</"tag">"S0; }
else if ( substr(S0,1,1)==">" ) {
S0=substr(S0,2); mode=""; XTYPE="TAG"; XITEM=tag; }
else{
# Anything else must be an attribute name; cut it off and validate it.
att=S0; sub(/[= \n\r\t\/>].*$/,"",att); S0=substr(S0,length(att)+1); mode="ATTR";
if ( att !~ /^[A-Za-z:_][0-9A-Za-z:_.-]*$/ ) { # /^[[:alpha:]:_][[:alnum:]:_.-]*$/
XERROR="invalid attribute name '" att "'"; break; } } }
else if ( mode == "ATTR" ) { sub(/^[ \n\r\t]*/,"",S0); if (S0=="") continue;
# After an attribute name: "=" introduces a value; without "=" the attribute
# is stored with its own name as value.
if ( substr(S0,1,1)=="=" ) { S0=substr(S0,2); mode="EQ"; }
else { XATTR[att]=att; mode="TAG"; XNODE=XNODE att"="att"\001"; } }
else if ( mode == "EQ" ) { sub(/^[ \n\r\t]*/,"",S0); if (S0=="") continue;
# After "=": a quoted value is read in VALUE mode with `end` holding the
# quote character; an unquoted value runs up to whitespace, "/" or ">".
end=substr(S0,1,1);
if ( end=="\"" || end=="'" ) {S0=substr(S0,2);accu="";mode="VALUE";}
else{
accu=S0; sub(/[ \n\r\t\/>].*$/,"",accu); S0=substr(S0,length(accu)+1);
XATTR[att]=unescapeXML(accu); mode="TAG"; XNODE=XNODE att"="XATTR[att]"\001"; } }
else if ( mode == "VALUE" ) { # terminated by end
# If the closing quote is not in the buffer, the whole buffer is accumulated
# and more input is read (the value spans several lines).
if ( p=index(S0,end) ) {
XATTR[att]=accu unescapeXML(substr(S0,1,p-1)); XNODE=XNODE att"="XATTR[att]"\001";
S0=substr(S0,p+length(end)); mode="TAG"; }
else{ accu=accu unescapeXML(S0); S0=""; } }
else if ( mode == "DTB" ) { # terminated by "[" or ">"
# "[" first: the DOCTYPE has an internal subset; its content is parsed as
# ordinary items with dtd set, until a "]" is met.
# ">" first: there is no internal subset; a "]" is put in front of the ">" so
# that the next item is the closing DTE, exactly as in the other case.
if ( (q=index(S0,"[")) && (!(p=index(S0,end)) || q<p ) ) {
XTYPE=mode; XITEM= accu substr(S0,1,q-1); S0=substr(S0,q+1); mode=""; dtd=1; }
else if ( p=index(S0,end) ) {
XTYPE=mode; XITEM= accu substr(S0,1,p-1); S0="]"substr(S0,p); mode=""; dtd=1; }
else{ accu=accu S0; S0=""; } }
else if ( p=index(S0,end) ) { # terminated by end
# All remaining modes (COM, CDA, DEC, PIN, DTE, END) simply run up to `end`.
# The item value is the raw text before it, or the tag name for END.
XTYPE=mode; XITEM= ( mode=="END" ? tag : accu substr(S0,1,p-1) );
S0=substr(S0,p+length(end)); mode=""; }
else{ accu=accu S0; S0=""; } }
# Save the state for the next call on this file.
_XMLIO[file,"S0"]=S0; _XMLIO[file,"line"]=XLINE; _XMLIO[file,"path"]=XPATH; _XMLIO[file,"dtd"]=dtd;
# The loop was left in DAT mode: the input ended inside a data section.
# Report what was accumulated, if anything (accu stays empty with skipData).
if (mode=="DAT") { mode=""; if (accu!="") XTYPE="DAT"; XITEM=accu; }
# An item was read: put type and value in front of the attribute list in XNODE.
if (XTYPE) { XNODE=XTYPE"\001"XITEM"\001"XNODE; return 1; }
# End of input or parse error: close the file and discard its saved state.
close(file);
delete _XMLIO[file,"S0"]; delete _XMLIO[file,"line"]; delete _XMLIO[file,"path"]; delete _XMLIO[file,"dtd"];
# Build the error text: an explicit parse error, a construct still open at
# end of input (reported with the line it started on), or tags left unclosed.
# Falling off the end without a return statement yields the empty result.
if (XERROR) XERROR=file ":" XLINE ": " XERROR;
else if (mode) XERROR=file ":" mline ": " "unterminated " mode;
else if (XPATH) XERROR=file ":" XLINE ": " "unclosed tag(s) " XPATH;
} # function getXML
# unescapeXML( text ): unescape data and attribute values, used by getXML
#
# Replaces the five predefined XML entity references by the characters they
# stand for:  &apos; -> '   &quot; -> "   &gt; -> >   &lt; -> <   &amp; -> &
# Other entity references and numeric character references are left as they
# are.
# Parameters:
#   text -- string that may contain entity references
# Returns:
#   the string with the predefined entity references resolved
function unescapeXML( text ) {
    # Nothing to do (and no regex work needed) without an ampersand.
    if ( index(text, "&") == 0 ) return text

    gsub( /&apos;/, "'",  text );
    gsub( /&quot;/, "\"", text );
    gsub( /&gt;/,   ">",  text );
    gsub( /&lt;/,   "<",  text );
    # &amp; must come last, otherwise "&amp;lt;" would be resolved twice and
    # end up as "<" instead of "&lt;". In a gsub replacement a bare "&" means
    # "the matched text", so a literal ampersand has to be written as \&.
    gsub( /&amp;/,  "\\&", text );
    return text
}
# closeXML( file ): close an xml file before its end has been reached
#
# Closes the input file and removes every entry kept for it in _XMLIO, so
# that a later getXML(file) starts again from the first line.
# Parameters:
#   file -- path to xml file, as passed to getXML
function closeXML( file ) {
    close(file)

    # Parser state saved by getXML between calls.
    delete _XMLIO[file, "S0"]
    delete _XMLIO[file, "line"]
    delete _XMLIO[file, "path"]
    delete _XMLIO[file, "dtd"]

    # These two keys are not set by the getXML in this listing; presumably
    # they belong to other code sharing _XMLIO.
    delete _XMLIO[file, "open"]
    delete _XMLIO[file, "IND"]
}
> I am using it at work for some time now, where i have to stick with nawk
> shipped with solaris.
What kind of problems are you working on with
this script ? How large are the files usually ?
I use this for several small tasks, like
extracting information from component- or datamodel-descriptions.
These files are usually a few KB up to several hundred KB in size.
> I use this for several small tasks, like
> extracting information from component- or datamodel-descriptions.
Interesting, are there any standards for such
descriptions ? For example a W3C recommendation ?
Or a Schema ?
> Interesting, are there any standards for such
> descriptions ? For example a W3C recommendation ?
> Or a Schema ?
I am pretty sure, these descriptions are following the xml standard
http://www.w3.org/TR/REC-xml/
And I am pretty sure, my script does not - but the main goal was usability,
not standard-conformity :-)
The DTDs for these descriptions are company internals,
which I will definitely not post in public.
http://home.vrweb.de/~juergen.kahrs/gawk/XML/
I wonder how your solution compares to using xmlgawk (which parses the
document using the Expat parser).
Regards,
Andy
Hello,
No, I was not familiar with this project (although i heard about it).
Very nice work!
Of course, I will not even try to compete with
expat in terms of correctness, completeness, or speed.
But a comparison could be interesting.
I translated one of the examples from the xmlgawk documentation page into
getXML-code.
-- xmlgawk: -----------
# Prints the element tree of the input, one start tag per line, indented by
# two spaces per nesting level and followed by the tag's attributes.
# NOTE(review): XMLMODE, XMLSTARTELEM, XMLENDELEM and XMLATTR are xmlgawk
# extension variables that are not defined in this listing; the comments on
# them below state their presumed meaning -- confirm against the xmlgawk manual.

# Presumably switches xmlgawk to XML reading mode before any input is read.
BEGIN { XMLMODE=1 }
# Runs whenever XMLSTARTELEM is non-empty; presumably that is at each start
# tag, with XMLSTARTELEM holding the element name.
XMLSTARTELEM {
# "%*s" takes its field width from 2*depth and pads an empty string to it,
# which produces the indentation; depth starts out unset, i.e. as 0.
printf("%*s%s", 2*depth, "", XMLSTARTELEM)
# Append each attribute as name='value'.
for (i in XMLATTR)
printf(" %s='%s'", i, XMLATTR[i])
# Finish the output line, then go one level deeper for the child elements.
print ""
depth++
}
# Runs whenever XMLENDELEM is non-empty (presumably at each end tag): go back
# up one level.
XMLENDELEM { depth-- }
-- getXML: -----------
# Print the element tree of the file named by the first argument: one start
# tag per line, indented by two spaces per nesting level and followed by the
# tag's attributes as name='value'. Data sections are skipped (skipData = 1);
# items other than start and end tags are ignored.
BEGIN {
    while ( getXML(ARGV[1], 1) ) {
        if (XTYPE == "END") {
            # leaving an element: back up one level
            depth--
            continue
        }
        if (XTYPE != "TAG")
            continue

        # "%*s" takes its width from 2*depth and pads an empty string to it
        printf("%*s%s", 2*depth, "", XITEM)
        for (attrName in XATTR)
            printf(" %s='%s'", attrName, XATTR[attrName])
        print ""
        depth++
    }
}
[include functions from initial post here]
----------------------
I tried both scripts with the xmlgawk win32 binary.
Input was a 400 kilobyte xml file, output piped into a file.
Running time for both scripts was less than a second.
But a speed difference was clearly noticeable.
I also tried the getXML version with mawk - the difference became less noticeable :-)
The output was identical, except for the fact
that entity-references were not resolved in the getXML version.
Best regards,
Jan
(please forgive bad grammar)
> But a comparison could be interesting.
> I translated one of the examples from the xmlgawk documentation page into
> getXML-code.
This was an interesting comparison.
If any other reader uses the getXML code, I would
be interested in his comments.
Thanks to Jan for posting this.
His code shows that AWK also has more than
one way to find solutions to XML problems.
Regards,
Andy
This is correct, it only has to be done.
There are many other details in the manual
that need to be corrected. For example the
handling of attributes in associative arrays.
I think I will do this after the code merge
with gawk-3.1.5 (when it is out).
> His code shows that AWK also has more than
> one way to find solutions to XML problems.
So you might consider this code as an alternative
for anyone who can not use xmlgawk for some reason.
A pulling streaming XML-parser
implemented in pure awk and thus highly portable
with parsing speeds up to 1 MB/s on modern processors.
;-)
> So you might consider this code as an alternative
> for anyone who can not use xmlgawk for some reason.
Yes. Maybe I could include your script into our xgawk distribution ?
It would also make sense to have a chapter about it in our manual.
> A pulling streaming XML-parser
> implemented in pure awk and thus highly portable
> with parsing speeds up to 1 MB/s on modern processors.
I agree that it could be good enough for a significant
number of users. Especially those who have the production
of their XML data under their own control.
> Yes. Maybe I could include your script into our xgawk distribution ?
> It would also make sense to have a chapter about it in our manual.
Yes, of course, this code can freely be used, distributed, or modified,
without any restriction, as long as the origin of this code is
not misrepresented.
--
Robert B. Peirce, Venetia, PA 724-941-6883
bob AT peirce-family.com [Mac]
rbp AT cooksonpeirce.com [Office]
> I missed the initial and early posts on this topic so I am not sure
> whether this is about creating XML using awk or stripping XML with awk
> to get text. I am interested in the latter if anybody has a solution.
We talked about reading XML data with some AWK interpreter.
Jan Weber wrote/improved a portable (nawk) script.
xgawk is a SourceForge project which extends gawk.
Both solutions focus on reading XML data.
Use Google to find out more.
Hmm, maybe this sounded somewhat unfriendly.
What I wanted to say, was:
Yes, Please feel free to include this into your distribution.
I'm highly interested in the distribution of this script,
so anyone who may have a use for it will find it.
But i'm afraid that i will not have the time to write a chapter for your manual.
If you have any further questions, just contact me per email.
Best regards,
Jan
>> Yes, of course, this code can freely be used, distributed, or modified,
>> without any restriction, as long as the origin of this code is
>> not misrepresented.
>
>
>
> Hmm, maybe this sounded somewhat unfriendly.
No, it didn't sound unfriendly to me.
> What I wanted to say, was:
>
> Yes, Please feel free to include this into your distribution.
> I'm highly interested in the distribution of this script,
> so anyone who may have a use for it will find it.
That's good. Yesterday evening I inserted an empty
chapter into the manual ("Reading XML files with POSIX AWK").
> But i'm afraid that i will not have the time to write a chapter for your
> manual.
It would have been a welcome idea if you had written
this chapter.
> If you have any further questions, just contact me per email.
I thought about making the script 'compatible' to xgawk
in the sense that variables have the same names and semantics.
I will come back to this when I return to changing the manual.
> I thought about making the script 'compatible' to xgawk
> in the sense that variables have the same names and semantics.
I would like to ask you to be careful with this idea.
You would probably have to rewrite the whole parser.
It would also destroy the simplicity, compactness, and consistency
of the getXML interface.
Having a type/value pair (XTYPE,XITEM) is not the same as having
a bunch of variables, one for each type of data (which is your solution).
It would also be much harder to build the next step (DOM like parsing) on top
of this (you might guess what the XNODE variable is meant for).
Best regards,
Jan
Regards,
Andy
I understand, someone should update the documentation ;-)
I thought about something like this:
function putXML( file, xtype_or_xnode, xitem, xattr ) {
code determines whether xtype_or_xnode is xtype or xnode
and uses xtype,xitem,xattr or xnode respectively.
}
or (currently my favorite):
function putXML( file, xnode ) {
code uses xnode, except if xnode is empty (not set) then it uses
global vars XTYPE,XITEM,XATTR
}
or even
function putXML( file, flagUseXNODE ) {
if flagUseXNODE is set, code uses global var XNODE
else it uses global vars XTYPE,XITEM,XATTR
}
with the last two versions it would be possible to write code like this:
# Example: increment the attribute named "bar" in all elements named "foo".
# Reads the file named by ARGV[1] and hands each modified "foo" start tag to
# putXML, which takes the item from the global XTYPE, XITEM and XATTR and
# writes it to the file named by ARGV[2].
# NOTE(review): as written, only the "foo" start tags reach putXML; a complete
# copy of the document would presumably call it for every item -- confirm intent.
BEGIN {
    while ( getXML(ARGV[1]) ) {
        if (XTYPE != "TAG" || XITEM != "foo")
            continue
        XATTR["bar"]++
        putXML(ARGV[2])
    }
}
And this would finally be "XML processing with AWK" instead of just parsing ;-)
Comments are welcome.
Regards,
Jan
> I'm also thinking about a function putXML(), but i am not sure with the
> interface.
Good idea. Last year, Manuel Collado and Stefan Tramm
told me that there is an interesting Perl solution for
this problem.
> And this would finally be "XML processing with AWK" instead of just
> parsing ;-)
Indeed. Keep us posted about your advances.
And be prepared to write a chapter about it
in our manual.
> Jan Weber wrote:
>
>> I'm also thinking about a function putXML(), but i am not sure with the
>> interface.
>
> Good idea. Last year, Manuel Collado and Stefan Tramm
> told me that there is an interesting Perl solution for
> this problem.
Can you give more information about the "interesting" part of this
solution, maybe a link or a short explanation?
>> And this would finally be "XML processing with AWK" instead of just
>> parsing ;-)
>
> Indeed. Keep us posted about your advances.
I will do, but maybe it will take a while, as I currently have
no need for such script.
> And be prepared to write a chapter about it
> in our manual.
I think about it.
>> Good idea. Last year, Manuel Collado and Stefan Tramm
>> told me that there is an interesting Perl solution for
>> this problem.
>
> Can you give more information about the "interesting" part of this
> solution, maybe a link or a short explanation?
The interesting part of it was some functions
for assembling the textual output. For example
one function surrounded the text to be printed
with markup tags and attributes. I'm sorry that
I can't remember the name of the Perl module.
Maybe someone else (Manuel, Stefan, Andrew) can
help with details ?
Here is the code I wrote for this purpose:
-----X---------------------------
# Extra functions for xmllib
#
# Author: Manuel Collado
# Date: June 2004
# Build the text of one XML attribute, including the space that separates it
# from whatever precedes it in the tag:
#    name="value"
# The value is inserted as given; it is not escaped.
function attr( name, value ) {
    return sprintf(" %s=\"%s\"", name, value)
}
# Build the start tags for a chain of nested elements.
# - tags lists the elements from outermost to innermost, separated by ">"
# - each entry may carry attributes after the element name
# example:   'one att="val">two>three'
# generates: '<one att="val"><two><three>'
# (n, t, k and s are local variables)
function xse( tags, n, t, k, s ) {
    s = ""
    n = split( tags, t, ">" )
    k = 1
    while (k <= n) {
        s = sprintf("%s<%s>", s, t[k])
        k++
    }
    return s
}
# Build the end tags for a chain of nested elements. Takes the same ">"
# separated list as xse, but
# - emits the tags in reverse order (innermost element first)
# - drops the attributes: everything from the first space of an entry on
# example:   'one att="val">two>three'
# generates: '</three></two></one>'
# (n, t, k and s are local variables)
function xee( tags, n, t, k, s ) {
    s = ""
    n = split( tags, t, ">" )
    k = n
    while (k >= 1) {
        # keep only the element name when the entry carries attributes
        s = s "</" (index(t[k], " ") ? substr(t[k], 1, index(t[k], " ") - 1) : t[k]) ">"
        k--
    }
    return s
}
# Build a complete XML element (or a chain of nested elements): the start
# tags produced by xse, then the content, then the end tags produced by xee.
#   tags    -- ">" separated element list, as accepted by xse and xee
#   content -- text placed between the start and end tags, inserted as given
function xml( tags, content ) {
    return sprintf("%s%s%s", xse(tags), content, xee(tags))
}
# Build a processing instruction: the given text wrapped in "<?" and "?>".
#   pi -- target and data of the instruction, inserted as given
function xpi( pi ) {
    return sprintf("<?%s?>", pi)
}
# Build a simple DOCTYPE declaration for the root element `root`.
# With a non-empty `public` the PUBLIC form is produced, otherwise the SYSTEM
# form:
#   <!DOCTYPE root PUBLIC public url >
#   <!DOCTYPE root SYSTEM url >
# public and url are inserted exactly as given; no quotes are added here.
function doctype( root, public, url ) {
    return "<!DOCTYPE " root (public ? " PUBLIC " public " " : " SYSTEM ") url " >"
}
# Build the processing instruction that associates a stylesheet with an XML
# document:
#   <?xml-stylesheet type="text/TYPE" href="URL"?>
# Parameters:
#   type -- stylesheet language, appended to "text/" (css or xsl)
#   url  -- location of the stylesheet, used as given for the href value
# Returns:
#   the processing instruction as a string
function xss( type, url ) {
    # The target defined for this purpose is "xml-stylesheet", for CSS and
    # XSL alike. The argument is kept on one line because awk ends a
    # statement at a newline between two concatenated operands.
    return xpi( "xml-stylesheet" attr("type", "text/" type) attr("href", url) )
}
# Build an XML comment: the given text between "<!-- " and " -->", with one
# space on each side. The text is inserted as given.
function xcomment( comment ) {
    return sprintf("<!-- %s -->", comment)
}
-----X---------------------------
Regards,
--
To reply by e-mail, please remove the extra dot
in the given address: m.collado -> mcollado