On Thursday, August 3, 2023 at 1:39:17 AM UTC+8, Fonntuggnio wrote:
> Sorry for the total OT, but I failed to build a RegEx with
> the "help" (rotfl) of three different so-called AIs, getting
> nowhere.
>
> I am scanning an HTML document (not in javascript, so I do
> not have access to DOM nodes from inside) and I need to
> match EVERY <p> whole tag.
>
> By "whole" I mean starting from the <p and ending with the
> corresponding </p>, but such paragraphs MAY (and may not)
> contain
>
> a long list of attributes, with or without zero or more \n
> \r \t characters, valid, before the >.
The following is an example program using class Regex (a C++ wrapper around the
regex(3) functions). The regular expression "<p>.*</p>" should do most of the job,
although real HTML also involves comments, nested tags, malformed markup, etc.:
[]a_grep "<p>.*</p>" *html
-------------------------------------------------
/* Copyright is licensed by GNU LGPL, see file COPYING. by I.J.Wang 2023
Simulate grep command (Extended regular expression, ERE)
Build: make a_grep
*/
#include <Wy.stdio.h>
#include <Wy.unistd.h>
#include <Wy.regex.h>
using namespace Wy;
// ANSI escape sequences (ESC '[' ... 'm') written around a match to colorize it.
constexpr char Red[]   = "\x1B[31m";  // ESC[31m: select red foreground
constexpr char Reset[] = "\x1B[0m";   // ESC[0m: restore default attributes
// Opens `fname` read-only and tests each string obtained from RdBuf::read
// (presumably one line per call -- confirm against the Wy documentation)
// against the compiled expression `rexpr`.
// For every string that matches, writes to cout: the file name, ": ", the text
// before the match, the match itself wrapped in Red/Reset, and the text after it.
// A read that does not return Ok is raised through WY_THROW; a string for which
// regexec does not return Ok is skipped silently.
void sim_grep(Regex& rexpr, const char* fname)
{
    Errno rc;
    String line;
    ::regmatch_t match[5];          // only match[0], the whole match, is used
    RegFile file(fname,O_RDONLY);
    RdBuf reader(file);

    while(!reader.is_eof()) {
        rc=reader.read(line);
        if(rc!=Ok) {
            WY_THROW(rc);
        }

        rc=rexpr.regexec(line.c_str(),match,WY_CARR_SIZE(match),0);
        if(rc!=Ok) {
            continue;               // nothing to report for this string
        }

        // rm_so/rm_eo are offsets of the match relative to the start of `line`.
        const auto hit_begin= line.begin()+match[0].rm_so;
        const auto hit_end= line.begin()+match[0].rm_eo;

        cout << fname << ": "
             << StrSeg(line.begin(), hit_begin)
             << Red << StrSeg(hit_begin, hit_end) << Reset
             << StrSeg(hit_end, line.end());
    }
}
int main(int argc, const char* argv[])
try {
static const char usage[]="a_grep <pattern> <file>+" WY_ENDL;
Errno r;
if(argc<3) {
cout << "Error: Invalid argument" WY_ENDL "Usage: "
<< usage << WY_ENDL;
return -1;
}
const char* ptn= argv[1];
Regex rexpr;
if((r=rexpr.regcomp(ptn,REG_EXTENDED))!=Ok) {
if(r!=EBADMSG) {
WY_THROW(r);
}
String str;
if((r=rexpr.regerror(str))!=Ok) {
WY_THROW(r);
}
cout << str << WY_ENDL;
return -1;
}
for(int i=2; i<argc; ++i) {
const char* fname= argv[i];
sim_grep(rexpr,fname);
}
cout << "OK" WY_ENDL;
return 0;
}
catch(const Errno& e) {
cerr << wrd(e) << WY_ENDL;
return -1;
}
catch(...) {
cerr << "main() caught(...)" WY_ENDL;
throw;
};