XML parsing with awk

Jan Weber

unread,

Jul 10, 2005, 12:29:01 PM7/10/05

to

Hello,

I would like to share this awk-code for parsing of xml-files.
It was inspired by Steve Coile's xmlparse.awk
which I found at
ftp://ftp.freefriends.org/arnold/Awkstuff/xmlparser.awk
I wanted something more usable (not necessarily more readable) and
so this came out.
I have been using it at work for some time now, where I have to stick with the nawk shipped with Solaris.
There is no excessive documentation, but trying it should make things clear.

Best regards,
Jan

#!/usr/bin/nawk -f

# Demo driver: pull every item from the XML file named by the first
# command-line argument (data sections skipped) and print its type and
# value, followed by one tab-indented name=value line per attribute.
# On a parse error the error text is printed and the exit status is 1.
BEGIN {
    for (;;) {
        if ( !getXML(ARGV[1],1) )
            break;
        print XTYPE, XITEM;
        for (attrName in XATTR) {
            print "\t" attrName "=" XATTR[attrName];
        }
    }
    if (XERROR) {
        print XERROR;
        exit 1;
    }
}

##
# getXML( file, skipData ): # read next xml-data into XTYPE,XITEM,XATTR
# Parameters:
# file -- path to xml file
# skipData -- flag: do not read "DAT" (data between tags) sections
# External variables:
# XTYPE -- type of item read, e.g. "TAG"(tag), "END"(end tag), "COM"(comment), "DAT"(data)
# XITEM -- value of item, e.g. tagname if type is "TAG" or "END"
# XATTR -- Map of attributes, only set if XTYPE=="TAG"
# XPATH -- Path to current tag, e.g. /TopLevelTag/SubTag1/SubTag2
# XLINE -- current line number in input file
# XNODE -- XTYPE, XITEM, XATTR combined into a single string
# XERROR -- error text, set on parse error
# Returns:
# 1 on successful read: XTYPE, XITEM, XATTR are set accordingly
# "" at end of file or parse error, XERROR is set on error
# Private Data:
# _XMLIO -- buffer, XLINE, XPATH for open files
##
function getXML( file, skipData ,end,p,q,tag,att,accu,mline,mode,S0,ex,dtd) {
    # Reset the per-call outputs, then restore the parser state saved for this file.
    XTYPE=XITEM=XERROR=XNODE=""; split("",XATTR);
    S0=_XMLIO[file,"S0"]; XLINE=_XMLIO[file,"line"]; XPATH=_XMLIO[file,"path"]; dtd=_XMLIO[file,"dtd"];
    while (!XTYPE) {
        # Refill the buffer with the next line; RS is re-appended so line ends stay in the data.
        if (S0=="") { if (1!=(getline S0 <file)) break; XLINE++; S0=S0 RS; }
        if ( mode == "" ) {
            # Start of a new item: classify it by its leading characters.
            # mline remembers where the item began, for "unterminated" errors.
            mline=XLINE; accu=""; p=substr(S0,1,1);
            if ( p!="<" && !(dtd && p=="]") ) mode="DAT";
            else if ( p=="]" ) { S0=substr(S0,2); mode="DTE"; end=">"; dtd=0; }
            else if ( substr(S0,1,4)=="<!--" ) { S0=substr(S0,5); mode="COM"; end="-->"; }
            else if ( substr(S0,1,9)=="<!DOCTYPE" ) { S0=substr(S0,10); mode="DTB"; end=">"; }
            else if ( substr(S0,1,9)=="<![CDATA[" ) { S0=substr(S0,10); mode="CDA"; end="]]>"; }
            else if ( substr(S0,1,2)=="<!" ) { S0=substr(S0,3); mode="DEC"; end=">"; }
            else if ( substr(S0,1,2)=="<?" ) { S0=substr(S0,3); mode="PIN"; end="?>"; }
            else if ( substr(S0,1,2)=="</" ) { S0=substr(S0,3); mode="END"; end=">";
                # Pop the last path component and check it matches the closing tag name.
                tag=S0;sub(/[ \n\r\t>].*$/,"",tag);S0=substr(S0,length(tag)+1);
                ex=XPATH;sub(/\/[^\/]*$/,"",XPATH);ex=substr(ex,length(XPATH)+2);
                if (tag!=ex) { XERROR="unexpected close tag <" ex ">..</" tag ">"; break; } }
            else{ S0=substr(S0,2); mode="TAG";
                # Start tag: validate the name and push it onto the path.
                tag=S0;sub(/[ \n\r\t\/>].*$/,"",tag);S0=substr(S0,length(tag)+1);
                if ( tag !~ /^[A-Za-z:_][0-9A-Za-z:_.-]*$/ ) { # /^[[:alpha:]:_][[:alnum:]:_.-]*$/
                    XERROR="invalid tag name '" tag "'"; break; }
                XPATH = XPATH "/" tag; } }
        else if ( mode == "DAT" ) { # terminated by "<" or EOF
            p=index(S0,"<"); if ( dtd && (q=index(S0,"]")) && (!p || q<p) ) p=q;
            if (p) {
                if (!skipData) { XTYPE="DAT"; XITEM=accu unescapeXML(substr(S0,1,p-1)); }
                S0=substr(S0,p); mode=""; }
            else{ if (!skipData) accu=accu unescapeXML(S0); S0=""; } }
        else if ( mode == "TAG" ) { sub(/^[ \n\r\t]*/,"",S0); if (S0=="") continue;
            if ( substr(S0,1,2)=="/>" ) {
                # Empty-element tag: report the TAG now and queue a synthetic end tag in the buffer.
                S0=substr(S0,3); mode=""; XTYPE="TAG"; XITEM=tag; S0="</"tag">"S0; }
            else if ( substr(S0,1,1)==">" ) {
                S0=substr(S0,2); mode=""; XTYPE="TAG"; XITEM=tag; }
            else{
                att=S0; sub(/[= \n\r\t\/>].*$/,"",att); S0=substr(S0,length(att)+1); mode="ATTR";
                if ( att !~ /^[A-Za-z:_][0-9A-Za-z:_.-]*$/ ) { # /^[[:alpha:]:_][[:alnum:]:_.-]*$/
                    XERROR="invalid attribute name '" att "'"; break; } } }
        else if ( mode == "ATTR" ) { sub(/^[ \n\r\t]*/,"",S0); if (S0=="") continue;
            if ( substr(S0,1,1)=="=" ) { S0=substr(S0,2); mode="EQ"; }
            else { XATTR[att]=att; mode="TAG"; XNODE=XNODE att"="att"\001"; } } # attribute without value: value is its own name
        else if ( mode == "EQ" ) { sub(/^[ \n\r\t]*/,"",S0); if (S0=="") continue;
            end=substr(S0,1,1);
            if ( end=="\"" || end=="'" ) {S0=substr(S0,2);accu="";mode="VALUE";}
            else{
                # Unquoted value: runs up to whitespace, "/" or ">".
                accu=S0; sub(/[ \n\r\t\/>].*$/,"",accu); S0=substr(S0,length(accu)+1);
                XATTR[att]=unescapeXML(accu); mode="TAG"; XNODE=XNODE att"="XATTR[att]"\001"; } }
        else if ( mode == "VALUE" ) { # terminated by end
            if ( p=index(S0,end) ) {
                XATTR[att]=accu unescapeXML(substr(S0,1,p-1)); XNODE=XNODE att"="XATTR[att]"\001";
                S0=substr(S0,p+length(end)); mode="TAG"; }
            else{ accu=accu unescapeXML(S0); S0=""; } }
        else if ( mode == "DTB" ) { # terminated by "[" or ">"
            if ( (q=index(S0,"[")) && (!(p=index(S0,end)) || q<p ) ) {
                XTYPE=mode; XITEM= accu substr(S0,1,q-1); S0=substr(S0,q+1); mode=""; dtd=1; }
            else if ( p=index(S0,end) ) {
                # No internal subset: push back "]" so the next call reports the DTE item.
                XTYPE=mode; XITEM= accu substr(S0,1,p-1); S0="]"substr(S0,p); mode=""; dtd=1; }
            else{ accu=accu S0; S0=""; } }
        else if ( p=index(S0,end) ) { # terminated by end
            XTYPE=mode; XITEM= ( mode=="END" ? tag : accu substr(S0,1,p-1) );
            S0=substr(S0,p+length(end)); mode=""; }
        else{ accu=accu S0; S0=""; } }
    # Save the parser state for the next call on this file.
    _XMLIO[file,"S0"]=S0; _XMLIO[file,"line"]=XLINE; _XMLIO[file,"path"]=XPATH; _XMLIO[file,"dtd"]=dtd;
    # Data still pending when the loop ended (end of input) is reported as a final DAT item.
    if (mode=="DAT") { mode=""; if (accu!="") XTYPE="DAT"; XITEM=accu; }
    if (XTYPE) { XNODE=XTYPE"\001"XITEM"\001"XNODE; return 1; }
    # End of file or parse error: release the file and its state, then build the error text.
    close(file);
    delete _XMLIO[file,"S0"]; delete _XMLIO[file,"line"]; delete _XMLIO[file,"path"]; delete _XMLIO[file,"dtd"];
    if (XERROR) XERROR=file ":" XLINE ": " XERROR;
    else if (mode) XERROR=file ":" mline ": " "unterminated " mode;
    else if (XPATH) XERROR=file ":" XLINE ": " "unclosed tag(s) " XPATH;
} # function getXML

# unescape data and attribute values, used by getXML
# Replace the five predefined XML entity references in text with the
# characters they stand for and return the result.
#   text -- string that may contain &apos; &quot; &gt; &lt; &amp;
# &amp; is handled last so that an escaped reference such as "&amp;lt;"
# yields the literal text "&lt;" and is not unescaped a second time.
function unescapeXML( text ) {
    gsub( "&apos;", "'", text );
    gsub( "&quot;", "\"", text );
    gsub( "&gt;", ">", text );
    gsub( "&lt;", "<", text );
    gsub( "&amp;", "\\&", text );   # "\\&" = literal ampersand in a gsub replacement
    return text
}

# close xml file
# Close an XML input file and drop every _XMLIO entry kept for it.
#   file -- path that was passed to getXML
function closeXML( file ) {
    delete _XMLIO[file,"IND"];
    delete _XMLIO[file,"open"];
    delete _XMLIO[file,"dtd"];
    delete _XMLIO[file,"path"];
    delete _XMLIO[file,"line"];
    delete _XMLIO[file,"S0"];
    close(file);
}

Jürgen Kahrs

unread,

Jul 10, 2005, 1:42:01 PM7/10/05

to

Jan Weber wrote:

> I am using it at work for some time now, where i have to stick with nawk
> shipped with solaris.

What kind of problems are you working on with
this script ? How large are the files usually ?

Jan Weber

unread,

Jul 10, 2005, 3:43:58 PM7/10/05

to

I use this for several small tasks, like
extracting information from component- or datamodel-descriptions.
These files are usually a few KB up to several hundreds KB in size.

Jürgen Kahrs

unread,

Jul 10, 2005, 4:36:06 PM7/10/05

to

Jan Weber wrote:

> I use this for several small tasks, like
> extracting information from component- or datamodel-descriptions.

Interesting, are there any standards for such
descriptions ? For example a W3C recommendation ?
Or a Schema ?

Jan Weber

unread,

Jul 10, 2005, 5:36:48 PM7/10/05

to

On Sun, 10 Jul 2005 22:36:06 +0200, Jürgen Kahrs <Juergen.Kahr...@vr-web.de> wrote:

> Interesting, are there any standards for such
> descriptions ? For example a W3C recommendation ?
> Or a Schema ?

I am pretty sure, these descriptions are following the xml standard
http://www.w3.org/TR/REC-xml/
And I am pretty sure, my script does not - but the main goal was usability,
not standard-conformity :-)

The DTDs for these descriptions are company internals,
which i will definitely not post in the public.

Andrew Schorr

unread,

Jul 11, 2005, 9:05:18 AM7/11/05

to

Just curious: are you familiar with the xmlgawk sourceforge project?
You can find more info here:

http://home.vrweb.de/~juergen.kahrs/gawk/XML/

I wonder how your solution compares to using xmlgawk (which parses the
document using the Expat parser).

Regards,
Andy

Jan Weber

unread,

Jul 11, 2005, 2:49:36 PM7/11/05

to

Hello,

No, I was not familiar with this project (although i heard about it).
Very nice work!
Of course, I will not even try to compete with
expat in terms of correctness, completeness, or speed.

But a comparison could be interesting.
I translated one of the examples from the xmlgawk documentation page into
getXML-code.

-- xmlgawk: -----------
# xmlgawk version: print an indented outline of the element tree.
# Switch the XML reader on.
BEGIN {
    XMLMODE = 1
}
# Start of an element: print its name indented by two spaces per level,
# followed by its attributes, then go one level deeper.
XMLSTARTELEM {
    printf("%*s%s", 2*depth, "", XMLSTARTELEM)
    for (i in XMLATTR) {
        printf(" %s='%s'", i, XMLATTR[i])
    }
    print ""
    depth += 1
}
# End of an element: go back up one level.
XMLENDELEM {
    depth -= 1
}

-- getXML: -----------

# getXML version: print an indented outline of the element tree.
# Start tags are printed with their attributes at two spaces per nesting
# level; end tags only decrease the level; all other items are ignored.
BEGIN {
    while ( getXML(ARGV[1],1) ) {
        if (XTYPE=="END") {
            depth--;
            continue
        }
        if (XTYPE!="TAG")
            continue
        printf("%*s%s", 2*depth, "", XITEM)
        for (i in XATTR) {
            printf(" %s='%s'", i, XATTR[i])
        }
        print ""
        depth++
    }
}
[include functions from initial post here]

----------------------

I tried both scripts with the xmlgawk win32 binary.
Input was a 400 kilobyte xml file, output piped into a file.

Running time for both scripts was less than a second.
But a speed difference was clearly noticeable.
I also tried the getXML version with mawk - the difference became less noticeable :-)

The output was identical, except for the fact
that entity-references were not resolved in the getXML version.

Best regards,
Jan

(please forgive bad grammar)

Jürgen Kahrs

unread,

Jul 12, 2005, 3:04:38 AM7/12/05

to

Jan Weber wrote:

> But a comparision could be interesting.
> I translated one of the examples from the xmlgawk documentation page into
> getXML-code.

This was an interesting comparison.
If any other reader uses the getXML code, I would
be interested in his comments.

Thanks to Jan for posting this.
His code shows that AWK also has more than
one way to find solutions to XML problems.

Andrew Schorr

unread,

Jul 12, 2005, 9:16:24 AM7/12/05

to

Thanks, that comparison is interesting. By the way, I'm not sure
whether we updated the
documentation, but xmlgawk now has an XMLDEPTH variable, so there's no
need for
the xmlgawk script to calculate the depth. I guess we should fix the
example.

Regards,
Andy

Jürgen Kahrs

unread,

Jul 12, 2005, 9:18:54 AM7/12/05

to

Andrew Schorr wrote:

This is correct, it only has to be done.
There are many other details in the manual
that need to be corrected. For example the
handling of attributes in associative arrays.
I think I will do this after the code merge
with gawk-3.1.5 (when it is out).

Jan Weber

unread,

Jul 12, 2005, 5:47:23 PM7/12/05

to

On Tue, 12 Jul 2005 09:04:38 +0200, Jürgen Kahrs <Juergen.Kah...@vr-web.de> wrote:

> His code shows that AWK also has more than
> one way to find solutions to XML problems.

So you might consider this code as an alternative
for anyone who can not use xmlgawk for some reason.

A pulling streaming XML-parser
implemented in pure awk and thus highly portable
with parsing speeds up to 1 MB/s on modern processors.

;-)

Jürgen Kahrs

unread,

Jul 13, 2005, 12:25:54 PM7/13/05

to

Jan Weber wrote:

> So you might consider this code as an alternative
> for anyone who can not use xmlgawk for some reason.

Yes. Maybe I could include your script into our xgawk distribution ?
It would also make sense to have a chapter about it in our manual.

> A pulling streaming XML-parser
> implemented in pure awk and thus highly portable
> with parsing speeds up to 1 MB/s on modern processors.

I agree that it could be good enough for a significant
number of users. Especially those who have the production
of their XML data under their own control.

Jan Weber

unread,

Jul 13, 2005, 6:22:16 PM7/13/05

to

On Wed, 13 Jul 2005 18:25:54 +0200, Jürgen Kahrs <Juergen.Kahr...@vr-web.de> wrote:

> Yes. Maybe I could include your script into our xgawk distribution ?
> It would also make sense to have a chapter about it in our manual.

Yes, of course, this code can freely be used, distributed, or modified,
without any restriction, as long as the origin of this code is
not misrepresented.

Robert Peirce

unread,

Jul 14, 2005, 10:17:48 AM7/14/05

to

I missed the initial and early posts on this topic so I am not sure
whether this is about creating XML using awk or stripping XML with awk
to get text. I am interested in the latter if anybody has a solution.

--
Robert B. Peirce, Venetia, PA 724-941-6883
bob AT peirce-family.com [Mac]
rbp AT cooksonpeirce.com [Office]

Jürgen Kahrs

unread,

Jul 14, 2005, 11:43:45 AM7/14/05

to

Robert Peirce wrote:

> I missed the initial and early posts on this topic so I am not sure
> whether this is about creating XML using awk or stripping XML with awk
> to get text. I am interested in the latter if anybody has a solution.

We talked about reading XML data with some AWK interpreter.
Jan Weber wrote/improved a portable (nawk) script.
xgawk is a SourceForge project which extends gawk.
Both solutions focus on reading XML data.
Use Google to find out more.

Jan Weber

unread,

Jul 14, 2005, 4:30:51 PM7/14/05

to

Hmm, maybe this sounded somewhat unfriendly.
What I wanted to say, was:

Yes, Please feel free to include this into your distribution.
I'm highly interested in the distribution of this script,
so anyone who may have a use for it will find it.

But i'm afraid that i will not have the time to write a chapter for your manual.

If you have any further questions, just contact me per email.

Best regards,
Jan

Jürgen Kahrs

unread,

Jul 15, 2005, 3:21:07 AM7/15/05

to

Jan Weber wrote:

>> Yes, of course, this code can freely be used, distributed, or modified,
>> without any restriction, as long as the origin of this code is
>> not misrepresented.
>
>
>
> Hmm, maybe this sounded somewhat unfriendly.

No, it didn't sound unfriendly to me.

> What I wanted to say, was:
>
> Yes, Please feel free to include this into your distribution.
> I'm highly interested in the distribution of this script,
> so anyone who may have a use for it will find it.

That's good. Yesterday evening I inserted an empty
chapter into the manual ("Reading XML files with POSIX AWK").

> But i'm afraid that i will not have the time to write a chapter for your
> manual.

It would have been a welcome idea if you had written
this chapter.

> If you have any further questions, just contact me per email.

I thought about making the script 'compatible' to xgawk
in the sense that variables have the same names and semantics.
I will come back to this when I return to changing the manual.

Jan Weber

unread,

Jul 15, 2005, 3:23:15 PM7/15/05

to

On Fri, 15 Jul 2005 09:21:07 +0200, Jürgen Kahrs <Juergen.Kah...@vr-web.de> wrote:

> I thought about making the script 'compatible' to xgawk
> in the sense that variables have the same names and semamtics.

I would like to ask you to be careful with this idea.
You would probably have to rewrite the whole parser.

It would also destroy the simplicity, compactness, and consistency
of the getXML interface.

Having a type/value pair (XTYPE,XITEM) is not the same as having
a bunch of variables, one for each type of data (which is your solution).

It would also be much harder to build the next step (DOM like parsing) on top
of this (you might guess what the XNODE variable is meant for).

Best regards,
Jan

Andrew Schorr

unread,

Jul 18, 2005, 10:33:53 AM7/18/05

to

Actually, the xml extension to xgawk currently supports both
type of interfaces. You can simply use the XMLEVENT, XMLNAME,
and XMLATTR variables to process each event, or you can use
the original interface that uses lots of different variables.

Regards,
Andy

Jan Weber

unread,

Jul 18, 2005, 5:52:04 PM7/18/05

to

I understand, someone should update the documentation ;-)

Jan Weber

unread,

Jul 19, 2005, 5:08:57 AM7/19/05

to

I'm also thinking about a function putXML(), but I am not sure about the interface.
It should either operate on the XTYPE,XITEM,XATTR triple for streaming processing or
on XNODE for the DOM style part.
And it should be as simple as possible.

I thought about something like this:

function putXML( file, xtype_or_xnode, xitem, xattr ) {
code determines whether xtype_or_xnode is xtype or xnode
and uses xtype,xitem,xattr or xnode respectively.
}

or (currently my favorite):

function putXML( file, xnode ) {
code uses xnode, except if xnode is empty (not set) then it uses
global vars XTYPE,XITEM,XATTR
}

or even

function putXML( file, flagUseXNODE ) {
if flagUseXNODE is set, code uses global var XNODE
else it uses global vars XTYPE,XITEM,XATTR
}

with the last two versions it would be possible to write code like this:

# Example: increment the attribute named "bar" in all elements named "foo"
# Copy the XML document ARGV[1] to ARGV[2], incrementing attribute "bar"
# on every <foo> start tag along the way.
# putXML() is the proposed (not yet written) counterpart of getXML();
# called with an empty xnode it is meant to write the current
# XTYPE/XITEM/XATTR triple.
BEGIN {
    while ( getXML(ARGV[1]) ) {
        if (XTYPE=="TAG" && XITEM=="foo") {
            XATTR["bar"]++;
        }
        # Write every item, not only the modified tags; otherwise the rest
        # of the document would be dropped from the output.
        putXML(ARGV[2]);
    }
}

And this would finally be "XML processing with AWK" instead of just parsing ;-)
Comments are welcome.

Regards,
Jan

Jan Weber

unread,

Jul 19, 2005, 5:20:36 AM7/19/05

to

Example corrected:

Jürgen Kahrs

unread,

Jul 23, 2005, 2:48:31 PM7/23/05

to

Jan Weber wrote:

> I'm also thinking about a function putXML(), but i am not sure with the
> interface.

Good idea. Last year, Manuel Collado and Stefan Tramm
told me that there is an interesting Perl solution for
this problem.

> And this would finally be "XML processing with AWK" instead of just
> parsing ;-)

Indeed. Keep us posted about your advances.
And be prepared to write a chapter about it
in our manual.

Jan Weber

unread,

Jul 24, 2005, 10:58:59 AM7/24/05

to

On Sat, 23 Jul 2005 20:48:31 +0200, Jürgen Kahrs <Juergen.Kahr...@vr-web.de> wrote:

> Jan Weber wrote:
>
>> I'm also thinking about a function putXML(), but i am not sure with the
>> interface.
>
> Good idea. Last year, Manuel Collado and Stefan Tramm
> told me that there is an interesting Perl solution for
> this problem.

Can you give more information about the "interesting" part of this
solution, maybe a link or a short explanation?

>> And this would finally be "XML processing with AWK" instead of just
>> parsing ;-)
>
> Indeed. Keep us posted about your advances.

I will do, but maybe it will take a while, as I currently have
no need for such script.

> And be prepared to write a chapter about it
> in our manual.

I think about it.

Jürgen Kahrs

unread,

Jul 24, 2005, 4:10:37 PM7/24/05

to

Jan Weber wrote:

>> Good idea. Last year, Manuel Collado and Stefan Tramm
>> told me that there is an interesting Perl solution for
>> this problem.
>
> Can you give more information about the "interesting" part of this
> solution, maybe a link or a short explanation?

The interesting part of it was some functions
for assembling the textual output. For example
one function surrounded the text to be printed
with markup tags and attributes. I'm sorry that
I can't remember the name of the Perl module.
Maybe someone else (Manuel, Stefan, Andrew) can
help with details ?

Manuel Collado

unread,

Jul 26, 2005, 6:27:37 AM7/26/05

to

Jürgen Kahrs escribió:

Here is the code I wrote for this purpose:

-----X---------------------------
# Extra functions for xmllib
#
# Author: Manuel Collado
# Date: June 2004

# generate the string for a XML attribute, with a leading space:
# ' name="value"'
# Build the text of one XML attribute, preceded by a single space.
#   name  -- attribute name
#   value -- attribute value, inserted as given (no escaping is applied)
# Returns ' name="value"'.
function attr( name, value ) {
    value = "\"" value "\""     # wrap the value in double quotes
    return " " name "=" value
}

# generate the string for starting element tags
# - several ">" delimited tags may be specified
# - every tag can include attributes
# example: 'one att="val">two>three'
# generates: '<one att="val"><two><three>'
# Build the start tags for a ">"-separated list of elements.
#   tags -- e.g. 'one att="val">two>three'; each entry may carry attributes
# Returns the start tags in list order, e.g. '<one att="val"><two><three>'.
# n, t, k, s are local variables.
function xse( tags, n, t, k, s ) {
    s = ""
    n = split( tags, t, ">" )
    k = 1
    while (k <= n) {
        s = s "<" t[k] ">"
        k++
    }
    return s
}

# generate the string for ending element tags, like xse, but
# - in reverse order
# - trimm attributes
# Build the end tags for a ">"-separated list of elements (see xse).
#   tags -- same list as passed to xse
# The tags are emitted in reverse list order; anything from the first
# space of an entry onwards (its attributes) is dropped.
# n, t, k, s are local variables.
function xee( tags, n, t, k, s ) {
    s = ""
    n = split( tags, t, ">" )
    k = n
    while (k >= 1) {
        s = s "</" (index(t[k], " ") ? substr(t[k], 1, index(t[k], " ")-1) : t[k]) ">"
        k--
    }
    return s
}

# generate the string for a full xml element, combine xse and xee
# Build a complete element (or nest of elements) around some content.
#   tags    -- ">"-separated element list, as accepted by xse and xee
#   content -- text placed between the start tags and the end tags
function xml( tags, content ) {
    content = xse(tags) content     # prepend the start tags
    return content xee(tags)        # append the matching end tags
}

# generate the string for a processing instruction
# Build a processing instruction.
#   pi -- target and data, placed verbatim between "<?" and "?>"
function xpi( pi ) {
    pi = "<?" pi "?>"
    return pi
}

# generate the string for a simple DOCTYPE declaration
# Build a simple DOCTYPE declaration.
#   root   -- name of the root element
#   public -- public identifier; when it tests false a SYSTEM declaration is built
#   url    -- system identifier
# public and url are inserted as given, so any quotes must be supplied by the caller.
function doctype( root, public, url ) {
    return "<!DOCTYPE " root (public ? " PUBLIC " public : " SYSTEM") " " url " >"
}

# generate the string for a stylesheet processing instruction
# (type = css/xsl)
# Build a stylesheet processing instruction.
#   type -- stylesheet kind, "css" or "xsl"; used as the subtype of text/...
#   url  -- location of the stylesheet
# Returns e.g. '<?xml-stylesheet type="text/css" href="style.css"?>'.
# The expression is kept on one line because awk does not allow a newline
# between the operands of a concatenation, and the PI target is the
# standard "xml-stylesheet".
function xss( type, url ) {
    return xpi( "xml-stylesheet" attr("type", "text/" type) attr("href", url) )
}

# generate the string for a XML comment
# Build an XML comment.
#   comment -- comment text, placed verbatim between "<!--" and "-->"
#              (the caller must make sure it does not contain "--")
function xcomment( comment ) {
    return "<!--" comment "-->"
}
-----X---------------------------

Regards,
--
To reply by e-mail, please remove the extra dot
in the given address: m.collado -> mcollado

Jan Weber

unread,

Jul 26, 2005, 6:52:04 PM7/26/05

to

Thanks!

kshji

unread,

Dec 30, 2016, 9:28:55 AM12/30/16

to

Jan: Is that okay that I publish this getXML on my github and tell that Jan Weber is father of this nice awk parser.

Luuk

unread,

Dec 31, 2016, 4:43:02 AM12/31/16

to

On 30-12-16 15:28, kshji wrote:
> Jan: Is that okay that I publish this getXML on my github and tell that Jan Weber is father of this nice awk parser.
>

(i did find the xmlparser.awk here:
ftp://ftp.freefriends.org/arnold/Awkstuff/xmlparser.awk )

While this seems to be nice, i think these lines are incorrect:

:~/tmp> grep 'upper\|lower' xmlparser.awk
tag = toupper( tolower( tag ));
attrib = tolower( attrib );

Not because of awk-syntax, but because in xml case does matter!

XML parsing with awk - getXML.awk

Jan Weber

Jürgen Kahrs

Jan Weber

Jürgen Kahrs

Jan Weber

Andrew Schorr

Jan Weber

Jürgen Kahrs

Andrew Schorr

Jürgen Kahrs

Jan Weber

Jürgen Kahrs

Jan Weber

Robert Peirce

Jürgen Kahrs

Jan Weber

Jürgen Kahrs

Jan Weber

Andrew Schorr

Jan Weber

Jan Weber

Jan Weber

Jürgen Kahrs

Jan Weber

Jürgen Kahrs

Manuel Collado

Jan Weber

kshji

Luuk