Repository :
https://github.com/FarGroup/FarManager
On branch : master
Link :
https://github.com/FarGroup/FarManager/commit/c35657d740642acb9df436dcbfe1d93ed08fb767
>---------------------------------------------------------------
commit c35657d740642acb9df436dcbfe1d93ed08fb767
Author: w17 <vladimir....@gmail.com>
Date: Mon Nov 10 02:33:20 2025 +0300
revert 6590
>---------------------------------------------------------------
c35657d740642acb9df436dcbfe1d93ed08fb767
far/RegExp.cpp | 816 +++++++++++++++++++++++++++++++--------------------------
far/RegExp.hpp | 6 +-
far/changelog | 5 +
far/strmix.cpp | 11 +-
far/vbuild.m4 | 2 +-
5 files changed, 461 insertions(+), 379 deletions(-)
diff --git a/far/RegExp.cpp b/far/RegExp.cpp
index a2fd310ea..cfbc695f7 100644
--- a/far/RegExp.cpp
+++ b/far/RegExp.cpp
@@ -107,7 +107,7 @@ static const wchar_t* ops[]=
L"opAlternative",
L"opBackRef",
L"opNamedBracket",
- // L"opNamedBackRef",
+ L"opNamedBackRef",
L"opRangesBegin",
L"opRange",
L"opMinRange",
@@ -127,8 +127,8 @@ static const wchar_t* ops[]=
L"opBracketMinRange",
L"opBackRefRange",
L"opBackRefMinRange",
- // L"opNamedRefRange",
- // L"opNamedRefMinRange",
+ L"opNamedRefRange",
+ L"opNamedRefMinRange",
L"opRangesEnd",
L"opAssertionsBegin",
L"opLookAhead",
@@ -344,10 +344,10 @@ enum REOp
opAlternative, // |
- opBackRef, // \number \-number {number|-number|name} \g{number|-number|name} \p{number|-number|name}
+ opBackRef, // \1
- // opNamedBracket, // (?{name}
- // opNamedBackRef, // \p{name} \g{name} \{name} -- resolved as opBackRef
+ opNamedBracket, // (?{name}
+ opNamedBackRef, // \p{name}
opRangesBegin, // for op type check
@@ -378,8 +378,8 @@ enum REOp
opBackRefRange, // for backrefs
opBackRefMinRange,
-// opNamedRefRange,
-// opNamedRefMinRange,
+ opNamedRefRange,
+ opNamedRefMinRange,
opRangesEnd, // end of ranges
@@ -428,6 +428,10 @@ struct REOpCode_data
int min,max;
};
+ struct SNamedBracket: SBracket
+ {
+ const wchar_t* name;
+ };
struct SAssert
{
@@ -446,11 +450,13 @@ struct REOpCode_data
{
SRange range;
SBracket bracket;
+ SNamedBracket nbracket;
SAssert assert;
SAlternative alternative;
wchar_t symbol;
RegExp::UniSet *symbolclass;
int refindex;
+ const wchar_t* refname;
int type;
};
};
@@ -472,6 +478,8 @@ struct RegExp::REOpCode: public REOpCode_data
case opSymbolClass:delete symbolclass; break;
case opClassRange:
case opClassMinRange:delete range.symbolclass; break;
+ case opNamedBracket:delete[] nbracket.name; break;
+ case opNamedBackRef:delete[] refname; break;
}
}
};
@@ -485,126 +493,9 @@ RegExp::RegExp():
RegExp::~RegExp() = default;
RegExp::RegExp(RegExp&&) noexcept = default;
-static wchar_t get_next_char(string_view src, int& pos, const int shift=0, const bool do_throw=false)
-{
- ++pos;
- if (static_cast<size_t>(pos) < src.size())
- return src[pos];
- pos = static_cast<int>(src.size());
- if (do_throw)
- throw regex_exception(errSyntax, pos + shift);
- return L'\0';
-}
-
-// \h \{h} \hh \{hh} \hhh \{hhh} \hhhh \{hhhh}
-//
-static int get_HexChar(string_view src, int& pos, const int shift)
-{
- auto c = get_next_char(src, pos);
-
- const auto curve_pos = pos + shift;
- const auto curves = c == L'{';
- if (curves)
- c = get_next_char(src, pos);
- if (!isxdigit(c))
- {
- if (!curves || c) throw regex_exception(errSyntax, pos + shift);
- throw regex_exception(errBrackets, curve_pos);
- }
-
- int unicode_char = 0;
- for (int j = 1; j <= 4; ++j)
- {
- c = TOLOWER(c);
- unicode_char = (unicode_char << 4) | (c - (c > '9' ? 'a' - 10 : '0'));
- c = get_next_char(src, pos);
- if (!isxdigit(c))
- break;
- }
- if (!curves)
- --pos;
- else if (c != L'}')
- {
- if (static_cast<size_t>(pos) >= src.size()) throw regex_exception(errBrackets, curve_pos);
- else throw regex_exception(errSyntax, pos + shift);
- }
-
- return unicode_char;
-}
-
-// (?{name}... or (?<name>...
-// ^ ^
-static string_view get_NamedGroup(string_view src, int& pos, const int shift)
-{
- const auto close_bracket = src[pos] == L'{' ? L'}' : (src[pos] == L'<' ? L'>' : src[pos]);
- const auto start_bracket = pos + shift;
-
- wchar_t c;
- do { c = get_next_char(src, pos); } while (ISSPACE(c));
- if (!ISALPHA(c))
- if (c && c != close_bracket) throw regex_exception(errSyntax, pos + shift);
- else throw regex_exception(c ? errIncompleteGroupStructure : errBrackets, start_bracket);
-
- const auto b_pos = pos;
- do { c = get_next_char(src, pos); } while (ISWORD(c));
- const auto e_pos = pos;
-
- while (ISSPACE(c)) c = get_next_char(src, pos);
- if (c != close_bracket)
- throw regex_exception(c ? errSyntax : errBrackets, c ? pos + shift : start_bracket);
-
- return src.substr(b_pos, e_pos - b_pos);
-}
-
-// \num \{num} \-num \{-num} \{num} \{-num} \{name}
-// ^ ^ ^ ^ ^ ^ ^
-// \pnum \p{num} \p-num \p{-num} \p{name} \gnum ... \g{name}
-// ^ ^ ^ ^ ^ ^ ^
-static string_view get_BackRef(string_view src, int& pos, const int shift)
-{
- auto c = src[pos];
- if (c == L'p' || c == L'g')
- c = get_next_char(src, pos);
-
- const auto curve_pos = pos;
- const auto curves = c == L'{';
- if (curves)
- do { c = get_next_char(src, pos); } while (c == L' '); // {g1} { g2 } \g{ -5} ...
-
- const auto n_pos = pos;
- const auto minus = c == L'-';
- if (minus)
- c = get_next_char(src, pos);
-
- const auto number_mode = !curves || minus || ISDIGIT(c);
- if ((number_mode && !ISDIGIT(c)) || (!number_mode && !ISALPHA(c)))
- {
- if (curves)
- {
- if (!c) throw regex_exception(errBrackets, curve_pos + shift);
- if (c == L'}') throw regex_exception(errIncompleteGroupStructure, curve_pos + shift);
- }
- throw regex_exception(errSyntax, pos + shift);
- }
-
- do { c = get_next_char(src, pos); } while (ISDIGIT(c) || (!number_mode && ISWORD(c)));
- const auto e_pos = pos;
-
- if (curves)
- {
- while (c == L' ') { c = get_next_char(src, pos); }
- if (c != L'}')
- throw regex_exception(c ? errSyntax : errBrackets, (c ? pos : curve_pos) + shift);
- }
- else
- --pos;
-
- return src.substr(n_pos, e_pos - n_pos);
-}
-
constexpr auto MinCodeLength = 3; //global brackets
-int RegExp::CalcLength(string_view src, const int shift)
+int RegExp::CalcLength(string_view src)
{
const auto srclength = static_cast<int>(src.size());
int length = MinCodeLength;
@@ -614,31 +505,6 @@ int RegExp::CalcLength(string_view src, const int shift)
bracketscount=1;
int inquote=0;
- const auto next_char = [src, shift](int& pos, const bool do_throw = false)
- {
- return get_next_char(src, pos, shift, do_throw);
- };
-
- const auto test_char = [src](const int pos)
- {
- return static_cast<size_t>(pos) < src.size() ? src[pos] : L'\0';
- };
-
- const auto hex_char = [src, shift](int& pos)
- {
- return get_HexChar(src, pos, shift);
- };
-
- const auto named_group = [src, shift](int& pos)
- {
- return get_NamedGroup(src, pos, shift);
- };
-
- const auto back_ref = [src, shift](int& pos)
- {
- return get_BackRef(src, pos, shift);
- };
-
for (int i=0; i<srclength; i++,length++)
{
if (inquote && src[i]!=backslashChar && (i + 1 == srclength || src[i+1] != L'E'))
@@ -646,35 +512,57 @@ int RegExp::CalcLength(string_view src, const int shift)
continue;
}
- if (src[i] == backslashChar)
+ if (src[i]==backslashChar)
{
- auto c = next_char(i);
- if (i >= srclength)
+ i++;
+ if (i == srclength)
continue;
- if (c == L'Q')
- {
- inquote = 1; continue;
- }
+ if (src[i] == L'Q')inquote=1;
- if (c == L'E')
- {
- inquote = 0; continue;
- }
+ if (src[i] == L'E')inquote=0;
- if (c == L'x')
+ if (src[i] == L'x')
{
- hex_char(i);
- continue;
+ i++;
+ if(i != srclength && isxdigit(src[i]))
+ {
+ for(int j=1,k=i;j<4;j++)
+ {
+ if(k + j != srclength && isxdigit(src[k+j]))
+ {
+ i++;
+ }
+ else
+ {
+ break;
+ }
+ }
+ }
+ else
+ throw regex_exception(errSyntax, i);
}
- // \n \-n \{n} \{-n} \{name}
- // \pn \p-n \p{n} \p{-n} \p{name}
- // \gn \g-n \g{n} \g{-n} \g{name}
- if (c && wcschr(L"pg{-", c) || ISDIGIT(c))
+ if (src[i] == L'p')
{
- back_ref(i);
+ i++;
+
+ if (i == srclength || src[i] != L'{')
+ throw regex_exception(errSyntax, i);
+
+ i++;
+ const auto save2 = i;
+
+ while (i < srclength && (ISWORD(src[i]) || ISSPACE(src[i])) && src[i] != L'}')
+ i++;
+
+ if (i >= srclength)
+ throw regex_exception(errBrackets, save2);
+
+ if (src[i] != L'}' && !(ISWORD(src[i]) || ISSPACE(src[i])))
+ throw regex_exception(errSyntax, i);
}
+
continue;
}
@@ -684,15 +572,26 @@ int RegExp::CalcLength(string_view src, const int shift)
{
brackets[count++]=i;
if (count >= MAXDEPTH)
- throw regex_exception(errMaxDepth, i + shift);
+ throw regex_exception(errMaxDepth, i);
- if (test_char(i + 1) == L'?')
+ if (i + 1 != srclength && src[i + 1]==L'?')
{
- i += 2;
- auto c1 = test_char(i), c2 = test_char(i + 1);
- if (c1 == L'{' || (c1 == L'<' && c2 != L'=' && c2 != L'!'))
+ i+=2;
+
+ if (i != srclength && src[i] == L'{')
{
- named_group(i);
+ save = i;
+ i++;
+
+ while (i < srclength && (ISWORD(src[i]) || ISSPACE(src[i])) && src[i] != L'}')
+ i++;
+
+ if (i >= srclength)
+ throw regex_exception(errBrackets, save);
+
+ if (src[i] != L'}' && !(ISWORD(src[i]) || ISSPACE(src[i])))
+ throw regex_exception(errSyntax, i);
+
++bracketscount;
}
}
@@ -708,7 +607,7 @@ int RegExp::CalcLength(string_view src, const int shift)
count--;
if (count < 0)
- throw regex_exception(errBrackets,i + shift);
+ throw regex_exception(errBrackets,i);
break;
}
@@ -727,10 +626,10 @@ int RegExp::CalcLength(string_view src, const int shift)
++i;
if (i >= srclength)
- throw regex_exception(errBrackets, save + shift);
+ throw regex_exception(errBrackets,save);
}
- if (i + 1 != srclength && src[i + 1] == L'?')
+ if (i + 1 != srclength && src[i + 1] == '?')
++i;
break;
@@ -749,7 +648,7 @@ int RegExp::CalcLength(string_view src, const int shift)
i += (backslashChar == src[i] && src[i+1] ? 2 : 1);
if (i >= srclength)
- throw regex_exception(errBrackets, save + shift);
+ throw regex_exception(errBrackets,save);
break;
}
@@ -758,8 +657,9 @@ int RegExp::CalcLength(string_view src, const int shift)
if (count)
{
- throw regex_exception(errBrackets, brackets[0] + shift);
+ throw regex_exception(errBrackets, brackets[0]);
}
+
return length;
}
@@ -793,11 +693,11 @@ void RegExp::Compile(string_view const src, int options)
{
switch (*i)
{
- case L'i':options|=OP_IGNORECASE; break;
- case L's':options|=OP_SINGLELINE; break;
- case L'm':options|=OP_MULTILINE; break;
- case L'x':options|=OP_XTENDEDSYNTAX; break;
- case L'o':options|=OP_OPTIMIZE; break;
+ case 'i':options|=OP_IGNORECASE; break;
+ case 's':options|=OP_SINGLELINE; break;
+ case 'm':options|=OP_MULTILINE; break;
+ case 'x':options|=OP_XTENDEDSYNTAX; break;
+ case 'o':options|=OP_OPTIMIZE; break;
default: throw regex_exception(errOptions, 1 + Regex.size() + 1 + (i - Options.cbegin()));
}
}
@@ -807,16 +707,15 @@ void RegExp::Compile(string_view const src, int options)
Regex = src;
}
- ignorecase = options & OP_IGNORECASE ? 1 : 0;
+ ignorecase=options&OP_IGNORECASE?1:0;
- const auto shift = static_cast<int>(Regex.data() - src.data());
- code.resize(CalcLength(Regex, shift));
+ code.resize(CalcLength(Regex));
- InnerCompile(Regex.data(), static_cast<int>(Regex.size()), shift, options);
+ InnerCompile(src.data(), Regex.data(), static_cast<int>(Regex.size()), options);
minlength = 0;
- if (options & OP_OPTIMIZE)
+ if (options&OP_OPTIMIZE)
Optimize();
}
@@ -866,6 +765,7 @@ static int CalcPatternLength(const RegExp::REOpCode* from, const RegExp::REOpCod
altcnt++;
continue;
+ case opNamedBracket:
case opOpenBracket:
{
const auto l = CalcPatternLength(from + 1, from->bracket.pairindex - 1);
@@ -887,6 +787,7 @@ static int CalcPatternLength(const RegExp::REOpCode* from, const RegExp::REOpCod
altcnt=0;
continue;
case opBackRef:
+ case opNamedBackRef:
return -1;
case opRangesBegin:
case opRange:
@@ -925,6 +826,8 @@ static int CalcPatternLength(const RegExp::REOpCode* from, const RegExp::REOpCod
}
case opBackRefRange:
case opBackRefMinRange:
+ case opNamedRefRange:
+ case opNamedRefMinRange:
return -1;
case opRangesEnd:
case opAssertionsBegin:
@@ -945,7 +848,7 @@ static int CalcPatternLength(const RegExp::REOpCode* from, const RegExp::REOpCod
return altlen==-1?len:altlen;
}
-void RegExp::InnerCompile(const wchar_t* src, const int srclength, const int shift, int options)
+void RegExp::InnerCompile(const wchar_t* const start, const wchar_t* src, int srclength, int options)
{
REOpCode* brackets[MAXDEPTH];
// current brackets depth
@@ -970,32 +873,7 @@ void RegExp::InnerCompile(const wchar_t* src, const int srclength, const int shi
#endif
havelookahead=0;
- const auto next_char = [src, srclength, shift](int& pos, const bool do_throw = false)
- {
- return get_next_char({src, static_cast<size_t>(srclength)}, pos, shift, do_throw);
- };
-
- const auto peek_char = [src, srclength](const int pos)
- {
- return pos < srclength ? src[pos] : L'\0';
- };
-
- const auto hex_char = [src, srclength, shift](int& pos)
- {
- return get_HexChar({src, static_cast<size_t>(srclength)}, pos, shift);
- };
-
- const auto named_group = [src, srclength, shift](int& pos)
- {
- return get_NamedGroup({src, static_cast<size_t>(srclength)}, pos, shift);
- };
-
- const auto back_ref = [src, srclength, shift](int& pos)
- {
- return get_BackRef({ src, static_cast<size_t>(srclength) }, pos, shift);
- };
-
- for (int i = 0; i < srclength; i++)
+ for (int i=0; i<srclength; i++)
{
auto op = &code[pos];
pos++;
@@ -1003,107 +881,177 @@ void RegExp::InnerCompile(const wchar_t* src, const int srclength, const int shi
op->srcpos=i+1;
#endif
- if (inquote && src[i] != backslashChar)
+ if (inquote && src[i]!=backslashChar)
{
- op->op = ignorecase ? opSymbolIgnoreCase : opSymbol;
- op->symbol = ignorecase ? TOLOWER(src[i]) : src[i];
+ op->op=ignorecase?opSymbolIgnoreCase:opSymbol;
+ op->symbol=ignorecase?TOLOWER(src[i]):src[i];
- if (ignorecase && TOUPPER(op->symbol) == op->symbol) op->op = opSymbol;
+ if (ignorecase && TOUPPER(op->symbol)==op->symbol)op->op=opSymbol;
continue;
}
- if (src[i] == backslashChar)
+ if (src[i]==backslashChar)
{
- auto c = next_char(i);
+ i++;
+ if (i == srclength)
+ throw regex_exception(errSyntax, i);
- if (inquote && c != 'E')
+ if (inquote && src[i]!='E')
{
- op->op = opSymbol;
- op->symbol = backslashChar;
+ op->op=opSymbol;
+ op->symbol=backslashChar;
op = &code[pos];
pos++;
- op->op = ignorecase?opSymbolIgnoreCase:opSymbol;
- op->symbol = ignorecase ? TOLOWER(src[i]) : src[i];
+ op->op=ignorecase?opSymbolIgnoreCase:opSymbol;
+ op->symbol=ignorecase?TOLOWER(src[i]):src[i];
- if (ignorecase && TOUPPER(op->symbol) == op->symbol) op->op=opSymbol;
+ if (ignorecase && TOUPPER(op->symbol)==op->symbol)op->op=opSymbol;
continue;
}
op->op=opType;
- switch (c)
+ switch (src[i])
{
- case L'Q':inquote=1; pos--; continue;
- case L'E':inquote=0; pos--; continue;
- case L'b':op->op=opWordBound; continue;
- case L'B':op->op=opNotWordBound; continue;
- case L'D':op->op=opNotType; [[fallthrough]];
- case L'd':op->type=TYPE_DIGITCHAR; continue;
- case L'S':op->op=opNotType; [[fallthrough]];
- case L's':op->type=TYPE_SPACECHAR; continue;
- case L'W':op->op=opNotType; [[fallthrough]];
- case L'w':op->type=TYPE_WORDCHAR; continue;
- case L'U':op->op=opNotType; [[fallthrough]];
- case L'u':op->type=TYPE_UPCASE; continue;
- case L'L':op->op=opNotType; [[fallthrough]];
- case L'l':op->type=TYPE_LOWCASE; continue;
- case L'I':op->op=opNotType; [[fallthrough]];
- case L'i':op->type=TYPE_ALPHACHAR; continue;
- case L'A':op->op=opDataStart; continue;
- case L'Z':op->op=opDataEnd; continue;
- case L'n':op->op=opSymbol; op->symbol=L'\n'; continue;
- case L'r':op->op=opSymbol; op->symbol=L'\r'; continue;
- case L't':op->op=opSymbol; op->symbol=L'\t'; continue;
- case L'f':op->op=opSymbol; op->symbol=L'\f'; continue;
- case L'e':op->op=opSymbol; op->symbol=L'\x1B'; continue;
- case L'O':op->op=opNoReturn; continue;
-
- case L'x': // \xH \x{H} ... \xHHHH \x{HHHH}
- {
- const auto unicode_char = hex_char(i);
-
- op->op = ignorecase ? opSymbolIgnoreCase : opSymbol;
- op->symbol = ignorecase ? TOLOWER(unicode_char) : unicode_char;
- if (ignorecase && unicode_char == TOLOWER(unicode_char)) op->op = opSymbol;
+ case 'Q':inquote=1; pos--; continue;
+ case 'E':inquote=0; pos--; continue;
+ case 'b':op->op=opWordBound; continue;
+ case 'B':op->op=opNotWordBound; continue;
+ case 'D':op->op=opNotType; [[fallthrough]];
+ case 'd':op->type=TYPE_DIGITCHAR; continue;
+ case 'S':op->op=opNotType; [[fallthrough]];
+ case 's':op->type=TYPE_SPACECHAR; continue;
+ case 'W':op->op=opNotType; [[fallthrough]];
+ case 'w':op->type=TYPE_WORDCHAR; continue;
+ case 'U':op->op=opNotType; [[fallthrough]];
+ case 'u':op->type=TYPE_UPCASE; continue;
+ case 'L':op->op=opNotType; [[fallthrough]];
+ case 'l':op->type=TYPE_LOWCASE; continue;
+ case 'I':op->op=opNotType; [[fallthrough]];
+ case 'i':op->type=TYPE_ALPHACHAR; continue;
+ case 'A':op->op=opDataStart; continue;
+ case 'Z':op->op=opDataEnd; continue;
+ case 'n':op->op=opSymbol; op->symbol='\n'; continue;
+ case 'r':op->op=opSymbol; op->symbol='\r'; continue;
+ case 't':op->op=opSymbol; op->symbol='\t'; continue;
+ case 'f':op->op=opSymbol; op->symbol='\f'; continue;
+ case 'e':op->op=opSymbol; op->symbol=27; continue;
+ case 'O':op->op=opNoReturn; continue;
+ case 'p':
+ {
+ op->op = opNamedBackRef;
+ i++;
+
+ if (src[i] != L'{')
+ throw regex_exception(errSyntax, i + (src - start));
+
+ int len = 0; i++;
+
+ while (src[i + len] != L'}')len++;
+
+ if (len > 0)
+ {
+ const auto Name = new wchar_t[len + 1];
+ std::memcpy(Name, src + i, len*sizeof(wchar_t));
+ Name[len] = 0;
+ if (!NamedGroups.contains(Name))
+ {
+ delete[] Name;
+ throw regex_exception(errReferenceToUndefinedNamedBracket, i + (src - start));
+ }
+ op->refname = Name;
+
+ i += len;
+ }
+ else
+ {
+ throw regex_exception(errSyntax, i + (src - start));
+ }
+ } continue;
+
+ case 'x':
+ {
+ i++;
+
+ if (i >= srclength)
+ throw regex_exception(errSyntax, i + (src - start) - 1);
+
+ if(isxdigit(src[i]))
+ {
+ int c=TOLOWER(src[i])-'0';
+
+ if (c>9)c-='a'-'0'-10;
+
+ op->op=ignorecase?opSymbolIgnoreCase:opSymbol;
+ op->symbol=c;
+ for(int j=1,k=i;j<4 && k+j<srclength;j++)
+ {
+ if(isxdigit(src[k+j]))
+ {
+ i++;
+ c=TOLOWER(src[k+j])-'0';
+ if (c>9)c-='a'-'0'-10;
+ op->symbol<<=4;
+ op->symbol|=c;
+ }
+ else
+ {
+ break;
+ }
+ }
+ if (ignorecase)
+ {
+ op->symbol=TOLOWER(op->symbol);
+ if (TOUPPER(op->symbol)==TOLOWER(op->symbol))
+ {
+ op->op=opSymbol;
+ }
+ }
+ }
+ else
+ throw regex_exception(errSyntax, i + (src - start));
continue;
}
-
default:
{
- if (c && wcschr(L"pg{-", c) || ISDIGIT(c)) // \n \-n \{n} \{-n} \p{n} \p{-n} \p{name} \g{n} \g{-n} \g{name}
+ const auto curves = src[i] == L'{' ? 1 : 0;
+ if (ISDIGIT(src[curves+i]))
{
- const auto bref = back_ref(i);
- const auto b_pos = static_cast<int>(bref.data() - src);
+ int save=i;
+ op->op=opBackRef;
+ i += curves;
+ op->refindex=GetNum(src,i);
+ i -= (1 - curves);
- int number = -1;
- const auto number_mode = ISDIGIT(bref[0]) || bref[0] == L'-';
- if (number_mode)
+ if ((curves && src[i] != L'}') || op->refindex <= 0 || op->refindex>brcount || !closedbrackets[op->refindex])
{
- number = std::stoi(std::wstring(bref));
- if (number < 0) number = brcount + 1 + number; // -1 == brcount
- if (number <= 0 || number > brcount || !closedbrackets[number])
- throw regex_exception(errInvalidBackRef, b_pos + shift);
+ throw regex_exception(errInvalidBackRef, save + (src - start) - 1);
}
- else
+
+ if (op->refindex>maxbackref)maxbackref=op->refindex;
+ }
+ else
+ {
+ if (options&OP_STRICT && ISALPHA(src[i]))
{
- const auto found = NamedGroups.find(bref);
- if (found != NamedGroups.cend())
- number = static_cast<int>(found->second);
- if (number <= 0)
- throw regex_exception(errReferenceToUndefinedNamedBracket, b_pos + shift);
+ throw regex_exception(errInvalidEscape, i + (src - start) - 1);
}
- op->op = opBackRef;
- op->refindex = number;
+ op->op=ignorecase?opSymbolIgnoreCase:opSymbol;
+ op->symbol=ignorecase?TOLOWER(src[i]):src[i];
- continue;
+ if (TOLOWER(op->symbol)==TOUPPER(op->symbol))
+ {
+ op->op=opSymbol;
+ }
}
}
- break;
}
+
+ continue;
}
switch (src[i])
@@ -1166,7 +1114,7 @@ void RegExp::InnerCompile(const wchar_t* src, const int srclength, const int shi
}
if ((brdepth + 1) >= MAXDEPTH)
- throw regex_exception(errMaxDepth, i + shift);
+ throw regex_exception(errMaxDepth, i + (src - start));
brackets[brdepth++]=op;
op->op=opAlternative;
@@ -1176,50 +1124,53 @@ void RegExp::InnerCompile(const wchar_t* src, const int srclength, const int shi
{
op->op=opOpenBracket;
- if (src[i+1] == L'?')
+ if (src[i+1]=='?')
{
- i += 2;
- const char c1 = peek_char(i);
- const char c2 = peek_char(i+1);
+ i+=2;
- switch (c1)
+ switch (src[i])
{
- case L':': op->bracket.index=-1; break;
- case L'=': op->op=opLookAhead; havelookahead=1; break;
- case L'!': op->op=opNotLookAhead; havelookahead=1; break;
-
- case L'<':
+ case ':':op->bracket.index=-1; break;
+ case '=':op->op=opLookAhead; havelookahead=1; break;
+ case '!':op->op=opNotLookAhead; havelookahead=1; break;
+ case '<':
{
- if (c2 == L'=')
+ i++;
+
+ if (src[i]=='=')
{
- ++i;
- op->op = opLookBehind;
- break;
+ op->op=opLookBehind;
}
- else if (c2 == L'!')
+ else if (src[i]=='!')
{
- ++i;
- op->op = opNotLookBehind;
- break;
+ op->op=opNotLookBehind;
}
- } [[fallthrough]];
+ else
+ throw regex_exception(errSyntax, i + (src - start));
+ } break;
case L'{':
{
- auto group_name = named_group(i);
+ op->op = opNamedBracket;
+ int len = 0;
+ i++;
- op->op = opOpenBracket; // opNamedBracket;
- ++brcount;
- closedbrackets.push_back(false);
- op->bracket.index = brcount;
+ while (src[i + len] != L'}')len++;
- if (!NamedGroups.emplace(group_name, brcount).second)
- throw regex_exception(errSubpatternGroupNameMustBeUnique, (group_name.data() - src) + shift);
+ if (!len)
+ throw regex_exception(errIncompleteGroupStructure, i + (src - start));
+ const auto Name = new wchar_t[len + 1];
+ std::memcpy(Name, src + i, len * sizeof(wchar_t));
+ Name[len] = 0;
+ op->nbracket.name = Name;
+ ++brcount;
+ closedbrackets.push_back(false);
+ op->nbracket.index = brcount;
+ i += len;
} break;
-
default:
{
- throw regex_exception(errSyntax, i + shift);
+ throw regex_exception(errSyntax, i + (src - start));
}
}
}
@@ -1260,14 +1211,31 @@ void RegExp::InnerCompile(const wchar_t* src, const int srclength, const int shi
break;
}
+ case opNamedBracket:
+ {
+ op->nbracket.pairindex = brackets[brdepth];
+ brackets[brdepth]->nbracket.pairindex = op;
+ op->bracket.index = brackets[brdepth]->bracket.index;
+ if (op->bracket.index != -1)
+ {
+ closedbrackets[op->bracket.index] = true;
+ }
+
+ op->nbracket.name = brackets[brdepth]->nbracket.name;
+
+ if (!NamedGroups.emplace(op->nbracket.name, op->bracket.index).second)
+ throw regex_exception(errSubpatternGroupNameMustBeUnique, i + (src - start));
+
+ break;
+ }
case opLookBehind:
case opNotLookBehind:
{
int l=CalcPatternLength(brackets[brdepth] + 1, op - 1);
if (l == -1)
- throw regex_exception(errVariableLengthLookBehind, i + shift);
+ throw regex_exception(errVariableLengthLookBehind, i + (src - start));
brackets[brdepth]->assert.length=l;
}
@@ -1314,36 +1282,70 @@ void RegExp::InnerCompile(const wchar_t* src, const int srclength, const int shi
switch (src[i])
{
- case L'D':isnottype=1; [[fallthrough]];
- case L'd':type=TYPE_DIGITCHAR; break;
- case L'W':isnottype=1; [[fallthrough]];
- case L'w':type=TYPE_WORDCHAR; break;
- case L'S':isnottype=1; [[fallthrough]];
- case L's':type=TYPE_SPACECHAR; break;
- case L'L':isnottype=1; [[fallthrough]];
- case L'l':type=TYPE_LOWCASE; break;
- case L'U':isnottype=1; [[fallthrough]];
- case L'u':type=TYPE_UPCASE; break;
- case L'I':isnottype=1; [[fallthrough]];
- case L'i':type=TYPE_ALPHACHAR; break;
- case L'n':lastchar=L'\n'; break;
- case L'r':lastchar=L'\r'; break;
- case L't':lastchar=L'\t'; break;
- case L'f':lastchar=L'\f'; break;
- case L'e':lastchar=L'\x1B'; break;
- case L'x':
+ case 'D':isnottype=1; [[fallthrough]];
+ case 'd':type=TYPE_DIGITCHAR; break;
+ case 'W':isnottype=1; [[fallthrough]];
+ case 'w':type=TYPE_WORDCHAR; break;
+ case 'S':isnottype=1; [[fallthrough]];
+ case 's':type=TYPE_SPACECHAR; break;
+ case 'L':isnottype=1; [[fallthrough]];
+ case 'l':type=TYPE_LOWCASE; break;
+ case 'U':isnottype=1; [[fallthrough]];
+ case 'u':type=TYPE_UPCASE; break;
+ case 'I':isnottype=1; [[fallthrough]];
+ case 'i':type=TYPE_ALPHACHAR; break;
+ case 'n':lastchar='\n'; break;
+ case 'r':lastchar='\r'; break;
+ case 't':lastchar='\t'; break;
+ case 'f':lastchar='\f'; break;
+ case 'e':lastchar=27; break;
+ case 'x':
{
- lastchar = hex_char(i);
- dpf((L"Last char=%c(%04x)\n", lastchar, lastchar));
+ i++;
+
+ if (i >= srclength)
+ throw regex_exception(errSyntax, i + (src - start) - 1);
+
+ if (isxdigit(src[i]))
+ {
+ int c=TOLOWER(src[i])-'0';
+
+ if (c>9)c-='a'-'0'-10;
+
+ lastchar=c;
+
+ for(int j=1,k=i;j<4 && k+j<srclength;j++)
+ {
+ if (isxdigit(src[k+j]))
+ {
+ i++;
+ c=TOLOWER(src[k+j])-'0';
+
+ if (c>9)c-='a'-'0'-10;
+
+ lastchar<<=4;
+ lastchar|=c;
+ }
+ else
+ {
+ break;
+ }
+ }
+ dpf((L"Last char=%c(%02x)\n",lastchar,lastchar));
+ }
+ else
+ throw regex_exception(errSyntax, i + (src - start));
+
break;
}
-
default:
{
- if (options & OP_STRICT && ISALPHA(src[i]))
- throw regex_exception(errInvalidEscape, i + shift - 1);
+ if (options&OP_STRICT && ISALPHA(src[i]))
+ {
+ throw regex_exception(errInvalidEscape, i + (src - start) - 1);
+ }
- lastchar = src[i];
+ lastchar=src[i];
}
}
@@ -1438,7 +1440,7 @@ void RegExp::InnerCompile(const wchar_t* src, const int srclength, const int shi
}
}
else
- throw regex_exception(errSyntax, i + shift);
+ throw regex_exception(errSyntax, i + (src - start));
}
else
{
@@ -1525,6 +1527,7 @@ void RegExp::InnerCompile(const wchar_t* src, const int srclength, const int shi
case '*':min=0; max=-2; break;
case '?':
{
+ //if(src[i+1]=='?') return SetError(errInvalidQuantifiersCombination,i);
min=0; max=1;
break;
}
@@ -1536,8 +1539,9 @@ void RegExp::InnerCompile(const wchar_t* src, const int srclength, const int shi
max=min;
if (min<0)
- throw regex_exception(errInvalidRange, save + shift);
+ throw regex_exception(errInvalidRange, save + (src - start));
+// i++;
if (src[i]==',')
{
if (src[i+1]=='}')
@@ -1550,13 +1554,14 @@ void RegExp::InnerCompile(const wchar_t* src, const int srclength, const int shi
i++;
max=GetNum(src,i);
+// i++;
if (max<min)
- throw regex_exception(errInvalidRange, save + shift);
+ throw regex_exception(errInvalidRange, save + (src - start));
}
}
if (src[i] != '}')
- throw regex_exception(errInvalidRange, save + shift);
+ throw regex_exception(errInvalidRange, save + (src - start));
}
}
@@ -1577,7 +1582,10 @@ void RegExp::InnerCompile(const wchar_t* src, const int srclength, const int shi
case opWordBound:
case opNotWordBound:
{
- throw regex_exception(errInvalidQuantifiersCombination, i + shift);
+ throw regex_exception(errInvalidQuantifiersCombination, i + (src - start));
+// op->range.op=op->op;
+// op->op=opRange;
+// continue;
}
case opCharAny:
case opCharAnyAll:
@@ -1618,13 +1626,20 @@ void RegExp::InnerCompile(const wchar_t* src, const int srclength, const int shi
op->op=opBackRefRange;
break;
}
+ case opNamedBackRef:
+ {
+ op->op = opNamedRefRange;
+ break;
+ }
case opClosingBracket:
{
op=op->bracket.pairindex;
- if (op->op != opOpenBracket /* && op->op != opNamedBracket */)
- throw regex_exception(errInvalidQuantifiersCombination, i + shift);
+ if (op->op != opOpenBracket && op->op != opNamedBracket)
+ throw regex_exception(errInvalidQuantifiersCombination, i + (src - start));
+ if (op->op == opNamedBracket)
+ delete[] op->nbracket.name;
op->range.min=min;
op->range.max=max;
@@ -1634,7 +1649,7 @@ void RegExp::InnerCompile(const wchar_t* src, const int srclength, const int shi
default:
{
dpf((L"op->=%d\n",op->op));
- throw regex_exception(errInvalidQuantifiersCombination, i + shift);
+ throw regex_exception(errInvalidQuantifiersCombination, i + (src - start));
}
}//switch(code.op)
@@ -1758,7 +1773,7 @@ int RegExp::StrCmp(const wchar_t*& str, const wchar_t* start, const wchar_t* end
static constexpr RegExpMatch DefaultMatch{ -1, -1 };
-bool RegExp::InnerMatch(const wchar_t* start, const wchar_t* str, const wchar_t* strend, regex_match& RegexMatch, state_stack& StateStack) const
+bool RegExp::InnerMatch(const wchar_t* const start, const wchar_t* str, const wchar_t* strend, regex_match& RegexMatch, state_stack& StateStack) const
{
int i,j;
int minimizing;
@@ -2049,7 +2064,34 @@ bool RegExp::InnerMatch(const wchar_t* start, const wchar_t* str, const wchar_t*
continue;
}
+ case opNamedBracket:
+ {
+ if (op->bracket.index >= 0)
+ {
+ //if (inrangebracket) Mantis#1388
+ {
+ StateStackItem st;
+ st.op = opNamedBracket;
+ st.pos = op;
+ st.min = match[op->bracket.index].start;
+ st.max = match[op->bracket.index].end;
+ stack.emplace_back(st);
+ }
+
+ match[op->bracket.index].start = str - start;
+ }
+ if (op->bracket.nextalt)
+ {
+ StateStackItem st;
+ st.op = opAlternative;
+ st.pos = op->bracket.nextalt;
+ st.savestr = str;
+ stack.emplace_back(st);
+ }
+
+ continue;
+ }
case opClosingBracket:
{
switch (op->bracket.pairindex->op)
@@ -2060,9 +2102,18 @@ bool RegExp::InnerMatch(const wchar_t* start, const wchar_t* str, const wchar_t*
{
match[op->bracket.index].end = str - start;
}
+
continue;
}
+ case opNamedBracket:
+ {
+ if (op->nbracket.index >= 0)
+ {
+ match[op->nbracket.index].end = str - start;
+ }
+ continue;
+ }
case opBracketRange:
{
auto st = FindStateByPos(stack, op->bracket.pairindex,opBracketRange);
@@ -2259,43 +2310,53 @@ bool RegExp::InnerMatch(const wchar_t* start, const wchar_t* str, const wchar_t*
op = std::prev(op->alternative.endindex);
continue;
}
-
+ case opNamedBackRef:
case opBackRef:
{
- m = &match[op->refindex];
- if (m->start== - 1 || m->end == -1)
- break;
+ if (op->op == opBackRef)
+ {
+ m = &match[op->refindex];
+ }
+ else
+ {
+ const auto Iterator = NamedGroups.find(op->refname);
+ if (Iterator == NamedGroups.cend())
+ break;
+
+ m = &match[Iterator->second];
+ }
+
+ if (m->start==-1 || m->end==-1)break;
if (ignorecase)
{
- j = m->end;
- for (i = m->start; i < j; i++, str++)
+ j=m->end;
+
+ for (i=m->start; i<j; i++,str++)
{
- if (TOLOWER(start[i]) != TOLOWER(*str))
- break;
- if (str > strend)
- break;
+ if (TOLOWER(start[i])!=TOLOWER(*str))break;
+
+ if (str>strend)break;
}
- if (i < j)
- break;
+
+ if (i<j)break;
}
else
{
- j = m->end;
- for (i = m->start; i < j; i++, str++)
+ j=m->end;
+
+ for (i=m->start; i<j; i++,str++)
{
- if (start[i] != *str)
- break;
- if (str > strend)
- break;
+ if (start[i]!=*str)break;
+
+ if (str>strend)break;
}
- if (i < j)
- break;
+
+ if (i<j)break;
}
continue;
}
-
case opAnyRange:
case opAnyMinRange:
{
@@ -2604,12 +2665,14 @@ bool RegExp::InnerMatch(const wchar_t* start, const wchar_t* str, const wchar_t*
stack.emplace_back(st);
continue;
}
+ case opNamedRefRange:
+ case opNamedRefMinRange:
case opBackRefRange:
case opBackRefMinRange:
{
StateStackItem st;
st.op = op->op;
- minimizing = op->op == opBackRefMinRange; // || op->op == opNamedRefMinRange;
+ minimizing = op->op == opBackRefMinRange || op->op == opNamedRefMinRange;
j=op->range.min;
st.max=op->range.max-j;
if (op->op == opBackRefRange || op->op == opBackRefMinRange)
@@ -2817,6 +2880,7 @@ bool RegExp::InnerMatch(const wchar_t* start, const wchar_t* str, const wchar_t*
ps.savestr = str;
break;
}
+ case opNamedRefRange:
case opBackRefRange:
{
if (ps.op == opBackRefRange)
@@ -3007,6 +3071,7 @@ bool RegExp::InnerMatch(const wchar_t* start, const wchar_t* str, const wchar_t*
break;
}
+ case opNamedRefMinRange:
case opBackRefMinRange:
{
if (!(ps.max--))
@@ -3126,7 +3191,18 @@ bool RegExp::InnerMatch(const wchar_t* start, const wchar_t* str, const wchar_t*
continue;
}
+ case opNamedBracket:
+ {
+ j = ps.pos->nbracket.index;
+ if (j >= 0)
+ {
+ match[j].start = ps.min;
+ match[j].end = ps.max;
+ }
+
+ continue;
+ }
case opLookAhead:
case opLookBehind:
{
@@ -3252,6 +3328,7 @@ bool RegExp::Optimize()
case opClassMinRange:
minlength+=it->range.min;
break;
+ case opNamedBracket:
case opOpenBracket:
case opBracketRange:
case opBracketMinRange:
@@ -3360,6 +3437,7 @@ bool RegExp::Optimize()
break;
}
+ case opNamedBracket:
case opOpenBracket:
{
if (op->bracket.nextalt)
diff --git a/far/RegExp.hpp b/far/RegExp.hpp
index b9794a65c..01c1f92ac 100644
--- a/far/RegExp.hpp
+++ b/far/RegExp.hpp
@@ -179,10 +179,10 @@ private:
string resrc;
#endif
- int CalcLength(string_view src, const int shift);
- void InnerCompile(const wchar_t* src, const int srclength, const int shift, int options);
+ int CalcLength(string_view src);
+ void InnerCompile(const wchar_t* start, const wchar_t* src, int srclength, int options);
- bool InnerMatch(const wchar_t*start, const wchar_t* str, const wchar_t* strend, regex_match& RegexMatch, state_stack& Statetack) const;
+ bool InnerMatch(const wchar_t* start, const wchar_t* str, const wchar_t* strend, regex_match& RegexMatch, state_stack& Statetack) const;
void TrimTail(const wchar_t* start, const wchar_t*& strend) const;
diff --git a/far/changelog b/far/changelog
index 3860df22e..62aa9a5ee 100644
--- a/far/changelog
+++ b/far/changelog
@@ -1,3 +1,8 @@
+--------------------------------------------------------------------------------
+w17 2025-10-04 02:31:43+03:00 - build 6591
+
+1. Revert 6590 (not ready)
+
--------------------------------------------------------------------------------
w17 2025-10-04 00:59:47+03:00 - build 6590
diff --git a/far/strmix.cpp b/far/strmix.cpp
index f50b57b0a..e75c34506 100644
--- a/far/strmix.cpp
+++ b/far/strmix.cpp
@@ -1006,7 +1006,7 @@ string ReplaceBrackets(
{
const auto CurrentChar = Str[i];
- if (CurrentChar != L'$' || i + 1 >= length)
+ if (CurrentChar != L'$' || i + 1 == length)
{
result.push_back(CurrentChar);
continue;
@@ -1015,7 +1015,7 @@ string ReplaceBrackets(
auto NextPos = i;
string_view Replacement;
- if (const auto NextChar = Str[i+1], NextNextChar = i+2 >= length ? L'\0' : Str[i+2]; std::iswdigit(NextChar)) // $<digit>
+ if (const auto NextChar = Str[i + 1]; std::iswdigit(NextChar)) // $<digit>
{
// 0, 1, 2, ...
size_t NumberEnd;
@@ -1026,13 +1026,12 @@ string ReplaceBrackets(
Replacement = get_match(MatchData, Match[GroupNumber]);
NextPos += NumberEnd;
}
- else if (NextChar == L'{' || (NextChar == L'+' && NextNextChar == L'{')) // ${... or $+{...
+ else if (NextChar == L'{') // ${...
{
// {some text}
- const auto off = i + (NextChar == L'+' ? 3 : 2);
- if (const auto NameEnd = Str.find(L'}', off); NameEnd != Str.npos)
+ if (const auto NameEnd = Str.find(L'}', i + 2); NameEnd != Str.npos)
{
- const auto Name = Str.substr(off, NameEnd - off);
+ const auto Name = Str.substr(i + 2, NameEnd - i - 2);
auto GroupNumber = MAXSIZE_T;
const auto GroupIterator = NamedGroups.find(Name);
diff --git a/far/vbuild.m4 b/far/vbuild.m4
index 1e2ab85b5..a9555e7ab 100644
--- a/far/vbuild.m4
+++ b/far/vbuild.m4
@@ -1 +1 @@
-6590
+6591