The problem is that I actually want the <def> tags. I am editing a
Perl script but don't know how Perl works properly. I have changed
it to my liking but need to strip all non-<def> tags out of the
document. I want to do a find and replace on all non-<def> tags.
Script below.
#!/usr/bin/perl -w
# This script performs a headword search within GCIDE_XML.
# Submitted by Alexei Puzikov (k...@validio.com.ua),
# and modified by Michael Dyck (jmd...@ibiblio.org).
use CGI qw (param escape);
use CGI::Carp qw (fatalsToBrowser);
# Search term comes from the CGI "query" parameter.
$query = param('query');
# Address of this script, read from the SCRIPT_URL environment variable.
$script_url = $ENV{"SCRIPT_URL"};
# Directory that holds the gcide_*.xml data files.
# The path must be a single-line string: a line break inside the quotes
# becomes part of the value, and glob() splits its argument on whitespace,
# so a wrapped path would never match the intended files.
$xml_file_dir = "/home/users/web/b1617/ipg.jackmatt2/dictionary/xml_files";
# Fall back to "x" when no query was supplied.  The defined() test avoids
# an "uninitialized value" warning when the parameter is absent.
if (!defined($query) || $query eq "") { $query = "x"; }
# Get the first character of the query.
$first = lc(substr($query, 0, 1));
if ($first =~ /[a-z]/)
{
    # That character is a letter,
    # so we only have to examine the file(s) for that letter.
    $file_pattern = "$xml_file_dir/gcide_${first}*.xml";
}
else
{
    # That character isn't a letter
    # (i.e., it's some sort of regexp metacharacter),
    # so we have to examine all files.
    $file_pattern = "$xml_file_dir/gcide_*.xml";
}
@files = glob($file_pattern);
# The query is interpolated into the pattern as a regular expression,
# so regexp metacharacters in it keep their special meaning.
$pattern = "<hw>$query</hw>";
# If the query is not a valid regular expression, match it literally
# instead of letting the later pattern match die.
if (!eval { qr/$pattern/i; 1 })
{
    $pattern = "<hw>" . quotemeta($query) . "</hw>";
}
# Scan every candidate file for entries whose <hw> headword matches
# $pattern, and append the cleaned-up text of each matching entry to
# @matches.  An "entry" runs from a matching <hw> line up to (but not
# including) the next line that contains <hw>, or to end of file.

# Tags whose opening and closing forms are replaced by " - ".
# Compiled with /x so the list can span several lines without the line
# breaks becoming part of the alternatives.
my $separator_tags = qr{
    qex|qau|source|xex|pos|fld|ets|etsep|au|src|altname|altnpluf|
    mark|ex|asp|cref|sd|contr|ant|spn|ord|gen|pluf|uex|stype|mathex|ratio|
    singf|xlati|iref|figref|ptcl|part|var|tr
}x;

foreach my $datafile (@files)
{
    # print "$datafile\n";
    # Three-argument open with a lexical handle; a file that cannot be
    # opened is reported and skipped.
    my $fh;
    if (!open($fh, '<', $datafile))
    {
        warn "Cannot open $datafile: $!";
        next;
    }

    # $line always holds the next line that has not been examined yet.
    # After an entry has been collected it already holds the <hw> line
    # that ended that entry, so that line is examined too instead of
    # being skipped.
    my $line = <$fh>;
    while (defined($line))
    {
        # Headwords appear with syllabification/stress marks * " `.
        # There also might be an initial "‖".
        # We need to remove those before comparing with $pattern.
        # Possible bug: what if those marks appear in $line
        # *outside* of the <hw> element?
        $line =~ s/["*`]//g;
        $line =~ s/‖//g;

        if ($line !~ /$pattern/i)
        {
            $line = <$fh>;
            next;
        }

        # We have found a headword element that matches the query!
        # Read lines to the end of this "entry".
        my $entry = '';
        while (1)
        {
            chomp($line);
            $entry .= "$line\n";
            $line = <$fh>;
            last if !defined($line);
            last if $line =~ /<hw>/;
        }

        # Clean up the entry text.  Tags not named here (for example
        # <hw> and <def>) are left untouched.
        $entry =~ s{<br/>}{}g;
        $entry =~ s{<pbr/>}{}g;
        # Removes every ampersand, so an entity reference such as
        # "&foo;" is reduced to "foo;".
        $entry =~ s{&}{}g;
        # Opening and closing forms of the listed tags both become " - ".
        $entry =~ s{</?(?:$separator_tags)>}{ - }g;

        push @matches, $entry;
        # No read here: $line is either undef (end of file) or the <hw>
        # line of the following entry, which the next iteration tests
        # against the query.
    }
    close($fh);
}
# Emit the CGI response: the header block, then the collected entries
# wrapped in a <definitions> element whose count attribute gives the
# number of matching entries.
$count = @matches;
# The header block must end with an empty line; without it the XML
# declaration would be read as another header line.
# NOTE(review): the body is XML but is declared as text/html -- confirm
# which content type the consumer of this output expects.
print "Content-type: text/html\n\n";
print qq{<?xml version="1.0" encoding="ISO-8859-1"?>\n};
print "<definitions count=\"$count\">\n";
# An empty @matches simply produces no lines here.
foreach $match (@matches)
{
    print "$match\n";
}
print "</definitions>";
On Nov 6, 3:52 pm, "Eugeny.Satt...@gmail.com" <eugeny.satt...@gmail.com> wrote:
> Hm... how about deleting all def tags and their contents and storing
> the remainder into the result variable?
> By the way, if you have nested def tags your regex may turn to be
> imperfect.
> On 5 ноя, 19:49, jax <jackma...@gmail.com> wrote:
> > I have a working regex that selects all <def>...</def> tags.
> > <def\b[^>]*>(.*?)</def>
> > The problem is that I want to select everything other than the def
> > tags and their contents. How would I go about this?