For comparison purposes, here are the C++ and C versions of my
"Substitute" function, as stand-alone C++ programs, dependent
only on std libraries (and the GNU C regex library in the case
of the C-style version).
First, the C++ version.
Total code lines: 38.
Total time spent writing & debugging: 60 minutes.
#include <iostream>
#include <regex>
using std::cout;
using std::cerr;
using std::endl;
std::string Substitute
(
std::string const & Pattern,
std::string const & Replacement,
std::string const & Text,
std::string const & Flags // If Flags contains 'g', do global replace;
) // otherwise, replace first occurrence only.
{
std::regex_constants::match_flag_type RegexFlags {};
if (std::string::npos == Flags.find_first_of("g"))
{
RegexFlags = std::regex_constants::format_first_only;
}
std::regex RegEx {Pattern};
std::string Result = std::regex_replace(Text, RegEx, Replacement, RegexFlags);
return Result;
} // end function rhregex::Substitute()
int main (int Beren, char ** Luthien)
{
std::string Pattern {Luthien[1]};
std::string Replacement {Luthien[2]};
std::string Text {Luthien[3]};
std::string Flags {Luthien[4]};
if (5 != Beren)
{
cerr
<< "Error in regex-c-test.cpp: this program takes 4 arguments," << endl
<< "but you only typed " << Beren << ". Aborting program." << endl;
exit(666);
}
std::string Result = Substitute(Pattern, Replacement, Text, Flags);
std::cout << Result << std::endl;
}
Next, the C-style version.
Total code lines: 322.
Total time spent writing & debugging: many hours
#include <iostream>
#include <iomanip>
#include <string>
#include <sstream>
#include <regex.h>
#undef NDEBUG
#include <assert.h>
#include <errno.h>
#undef BLAT_ENABLE
#ifdef BLAT_ENABLE
#define BLAT(X) std::cerr << X << std::endl;
#else
#define BLAT(X)
#endif
using std::cout;
using std::cerr;
using std::setw;
using std::setfill;
using std::endl;
struct RegExException
{
RegExException(void) : msg(std::string("")) {}
RegExException(char const * Msg) : msg(std::string(Msg)) {}
RegExException(std::string const & Msg) : msg(Msg) {}
std::string msg;
};
std::string Substitute
(
std::string const & Pattern,
std::string const & Replacement,
std::string const & Text,
char Flag
)
{
BLAT("\nJust entered Substitute() with following parameters:")
BLAT("Pattern = " << Pattern)
BLAT("Replacement = " << Replacement)
BLAT("Text = " << Text)
BLAT("Flag = " << Flag << "\n")
int Result {};
std::string ReturnString {};
regex_t * RegEx = NULL;
RegEx = reinterpret_cast<regex_t *>(malloc(sizeof(regex_t)));
assert(NULL != RegEx);
// Make variable ReplacementCopy to hold copy of Replacement.
// (We can't alter Replacement because it is a const reference.)
BLAT("\nIn Substitute(), about to copy Replacement to new string ReplacementCopy.")
std::string ReplacementCopy {Replacement};
BLAT("In Substitute(), finished copying Text to TextCopy.\n")
// Make variable TextCopy to hold copy of Text.
// (We can't alter Text because it is a const reference.)
BLAT("\nIn Substitute(), about to copy Text to new string TextCopy.")
std::string TextCopy {Text};
BLAT("In Substitute(), finished copying Text to TextCopy.\n")
// COMPILE REGULAR EXPRESSION:
BLAT("\nIn Substitute(), about to call regcomp().")
Result = regcomp(RegEx, Pattern.c_str(), REG_EXTENDED);
BLAT("In Substitute(), just returned from regcomp().\n")
// If regcomp() failed, throw exception:
if (0 != Result)
{
BLAT("In Substitute(), regcomp() failed so about to throw exception.")
char ErrorBuffer[505];
size_t ErrorSize = regerror(Result, RegEx, ErrorBuffer, 500);
std::string ErrorMessage =
std::string("regcomp() failed.\nError message from regerror():\n")
+ std::string(ErrorBuffer) + "\n";
if (ErrorSize > 500)
{
ErrorMessage += "(Message truncated to 500 characters.)\n";
}
RegExException up (ErrorMessage);
regfree(RegEx); // Free the (internal) dynamically-allocated memory associated with RegEx.
free(RegEx);
BLAT("In Substitute(), at bottom of \"regcomp() failed so throw exception\" section;")
BLAT("ErrorMessage = " << ErrorMessage)
throw up;
}
// Allocate Matches:
BLAT("\nIn Substitute(), about to allocate Matches.")
regmatch_t * Matches = NULL;
Matches = reinterpret_cast<regmatch_t*>(malloc((RegEx->re_nsub + 5)*sizeof(regmatch_t)));
assert(NULL != Matches);
BLAT("In Substitute(), just allocated Matches.")
// EXECUTE REGULAR EXPRESSION:
// Run regexec(), which will look for matches to the RE in Text:
BLAT("In Substitute(), about to call regexec().")
Result = regexec(RegEx, Text.c_str(), 10, Matches, 0);
BLAT("In Substitute(), called regexec().")
// Don't regfree(RegEx) here; wait until we see if there was an error,
// so that we can feed RegEx to regerror().
// If there was an error, get error message, free RegEx and Matches, and throw an exception:
if (REG_NOERROR != Result && REG_NOMATCH != Result)
{
BLAT("In Substitute(), at top of \"regexec() failed so throw exception\" section;")
BLAT("Result = " << Result)
char ErrorBuffer[505] = {'\0'};
size_t ErrorSize = regerror(Result, RegEx, ErrorBuffer, 500);
std::string ErrorMessage =
std::string("regexec() failed.\nError message from regerror():\n")
+ std::string(ErrorBuffer) + "\n";
if (ErrorSize > 500)
{
ErrorMessage += "(Message truncated to 500 characters.)\n";
}
RegExException up (ErrorMessage);
regfree(RegEx); // Free the (internal) dynamically-allocated memory associated with RegEx.
free(RegEx);
free(Matches);
BLAT("In Substitute(), at bottom of \"regexec() failed so throw exception\" section;")
BLAT("ErrorMessage = " << ErrorMessage)
throw up;
}
BLAT("In Substitute(), after \"if (exec error) throw exception\" section;")
BLAT("just above \"if (no match)\" section.")
// If no match, free all dynamically-allocated memory and return TextCopy.
if (REG_NOMATCH == Result)
{
BLAT("In Substitute(), at top of \"no-match\" section; about to free RegEx and Matches.")
regfree(RegEx); // Free the (internal) dynamically-allocated memory associated with RegEx.
free(RegEx);
free(Matches);
BLAT("In Substitute(), at bottom of \"no-match\" section; about to return TextCopy:")
BLAT("TextCopy = " << TextCopy)
return TextCopy;
}
// If we get to here, we have matches. Grab number of submatches (if any)
// from RegEx.re_nsub and free RegEx:
size_t NumMatches = RegEx->re_nsub;
BLAT("In Substitute, in \"we have matches\" section, NumMatches = " << NumMatches)
regfree(RegEx);
free(RegEx);
BLAT("In Substitute, at bottom of \"we have matches\" section, just ran regfree(RegEx) and free(RegEx).")
// Expand backreferences in Replacement; store expanded version in ReplacementCopy:
for ( size_t i = 1 ; i <= NumMatches ; ++i )
{
// While there are instances of backreference i in Replacement, expand those:
std::ostringstream SS ("");
std::string::size_type Index;
SS << "\\" << setw(1) << i;
BLAT("In Substitute(), inside top of backreference-expansion for loop;")
BLAT("SS.str() = " << SS.str())
while (std::string::npos != (Index = ReplacementCopy.find(SS.str())))
{
BLAT("In Substitute(), just inside top of while(backreferences exist) loop,")
BLAT("Index = " << Index)
if // If there was a match for parenthetical group i...
(
Matches[i].rm_so > -1
&&
Matches[i].rm_eo > -1
&&
Matches[i].rm_so < static_cast<long>(TextCopy.size())
&&
Matches[i].rm_eo <= static_cast<long>(TextCopy.size())
&&
Matches[i].rm_eo > Matches[i].rm_so
)
{
BLAT("In Substitute(), inside if (match exists to backreference).")
// Expand current instance of backreferrence i:
ReplacementCopy.replace
(
Index,
2,
Text.substr
(
std::string::size_type(Matches[i].rm_so),
std::string::size_type(Matches[i].rm_eo - Matches[i].rm_so)
)
);
} // end if (there was a match for parenthetical group i)
else // Otherwise, current backreference is unused, so erase it:
{
BLAT("In Substitute(), inside else (no match to backreference).")
ReplacementCopy.erase(Index, 2);
} // end else (current backreference is unused)
} // end while (unexpanded backreference i instances exist in ReplacementCopy)
} // end for (each submatch, i = 1 through n)
// Replace first match of Pattern in TextCopy:
std::string::size_type RepPos = std::string::size_type(Matches[0].rm_so);
std::string::size_type RepLen = std::string::size_type(Matches[0].rm_eo - Matches[0].rm_so);
BLAT("In Substitute(), about to do replacement on TextCopy with these parameters:")
BLAT("TextCopy = " << TextCopy)
BLAT("RepPos = " << RepPos)
BLAT("RepLen = " << RepLen)
BLAT("ReplacementCopy = " << ReplacementCopy)
TextCopy.replace(RepPos, RepLen, ReplacementCopy);
BLAT("TextCopy after replacement = " << TextCopy)
// If doing global replacement, recurse until ALL instances of Pattern in TextCopy are
// replaced with Replacement. But don't do global replacement if the character '^'
// appears in a context other than a litteral or a character-list inversion. Otherwise,
// we'd violate the user's request that a replacement be done ONLY at the beginning of a
// line.
std::string::size_type index = Pattern.find('^');
if
(
index < std::string::npos
&&
'[' != Pattern[index-1]
&&
'\\' != Pattern[index-1]
)
{
BLAT("In Substitute; about to set flag to 'h' because of unescaped ^.")
Flag = 'h';
}
// Similarly, if the character '$' appears in Pattern in a non-litteral context,
// don't do global replacement:
index = Pattern.find('$');
if
(
index < std::string::npos
&&
'\\' != Pattern[index-1]
)
{
BLAT("In Substitute; about to set flag to 'h' because of unescaped $.")
Flag = 'h';
}
BLAT("In Substitute, just above global recursion section. Flag = " << Flag)
// If Flag is still 'g', do global replacement by recursing this function:
if ('g' == Flag)
{
BLAT("In Substitute, just inside \"if ('g' == Flag)\" section;")
BLAT("TextCopy = " << TextCopy);
// Now here things get very, very tricky! It would be tempting to do this:
// return Substitute(Pattern, Replacement, TextCopy, 'g');
// Tempting... but disastrous! if Pattern matches a substring of Replacement,
// then this would recurse forever! (Actually, it would recurse till it overflows
// the stack and crashes the system.) So we must split TextCopy into "processed" and
// "unprocessed" chunks, and pass only the unprocessed chunk to Substitute at the
// next recursive level down, then re-glom the chucks and return the result:
std::string::size_type FirstChunkSize = Matches[0].rm_so + ReplacementCopy.size();
BLAT("In Substitute, about to recurse;")
BLAT("TextCopy.substr(0, FirstChunkSize ) = ")
BLAT(TextCopy.substr(0, FirstChunkSize))
BLAT("TextCopy.substr(FirstChunkSize , std::string::npos) = ")
BLAT(TextCopy.substr(FirstChunkSize , std::string::npos))
// RECURSE!!!
ReturnString =
// substring, starting at 0, size = FirstChunkSize:
TextCopy.substr(0, FirstChunkSize)
+
Substitute // RECURSE!!!
(
Pattern,
Replacement,
// substring, starting at FirstChunkSize, size = unlimited:
TextCopy.substr(FirstChunkSize, std::string::npos),
'g'
);
BLAT("In Substitute() at bottom of Global section;")
BLAT("just returned from recursion;")
BLAT("ReturnString = " << ReturnString)
}
// Otherwise, just replace the first match:
else
{
ReturnString = TextCopy;
BLAT("In Substitute(), at bottom of \"else ('g' != Flag\" section;")
BLAT("Didn't recurse;")
BLAT("ReturnString = " << ReturnString)
}
BLAT("In Substitute(), about to free Matches.")
// Free Matches:
free(Matches);
// Return ReturnString:
BLAT("In Substitute(), freed Matches, at bottom, about to return;")
BLAT("ReturnString = " << ReturnString)
return ReturnString;
} // end function Substitute()
int main (int Beren, char ** Luthien)
{
std::ios_base::sync_with_stdio();
std::string Pattern {Luthien[1]};
std::string Replacement {Luthien[2]};
std::string Text {Luthien[3]};
char Flag {Luthien[4][0]};
if (5 != Beren)
{
cerr << "Error in regex-c-test.cpp: this program takes 4 arguments," << endl;
cerr << "but you only typed " << Beren << ". Aborting program." << endl;
exit(666);
}
std::string Result = Substitute(Pattern, Replacement, Text, Flag);
std::cout << Result << std::endl;