mirror of
https://github.com/Tencent/rapidjson.git
synced 2025-03-10 03:29:59 +01:00
Add Search(), ^ and $ assertions to regex
This commit is contained in:
parent
a5ac3b5dbc
commit
3eb19ceaf9
@ -62,7 +62,7 @@ class GenericRegex {
|
|||||||
public:
|
public:
|
||||||
typedef typename Encoding::Ch Ch;
|
typedef typename Encoding::Ch Ch;
|
||||||
|
|
||||||
GenericRegex(const Ch* source, Allocator* allocator = 0) : states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(),rangeCount_() {
|
GenericRegex(const Ch* source, Allocator* allocator = 0) : states_(allocator, 256), ranges_(allocator, 256), root_(kRegexInvalidState), stateCount_(), rangeCount_(), anchorBegin_(), anchorEnd_() {
|
||||||
StringStream ss(source);
|
StringStream ss(source);
|
||||||
DecodedStream<StringStream> ds(ss);
|
DecodedStream<StringStream> ds(ss);
|
||||||
Parse(ds);
|
Parse(ds);
|
||||||
@ -77,51 +77,24 @@ public:
|
|||||||
|
|
||||||
template <typename InputStream>
|
template <typename InputStream>
|
||||||
bool Match(InputStream& is) const {
|
bool Match(InputStream& is) const {
|
||||||
RAPIDJSON_ASSERT(IsValid());
|
return SearchWithAnchoring(is, true, true);
|
||||||
DecodedStream<InputStream> ds(is);
|
|
||||||
|
|
||||||
Allocator allocator;
|
|
||||||
Stack<Allocator> state0(&allocator, stateCount_ * sizeof(SizeType));
|
|
||||||
Stack<Allocator> state1(&allocator, stateCount_ * sizeof(SizeType));
|
|
||||||
Stack<Allocator> *current = &state0, *next = &state1;
|
|
||||||
|
|
||||||
const size_t stateSetSize = (stateCount_ + 31) / 32 * 4;
|
|
||||||
unsigned* stateSet = static_cast<unsigned*>(allocator.Malloc(stateSetSize));
|
|
||||||
std::memset(stateSet, 0, stateSetSize);
|
|
||||||
AddState(stateSet, *current, root_);
|
|
||||||
|
|
||||||
unsigned codepoint;
|
|
||||||
while (!current->Empty() && (codepoint = ds.Take()) != 0) {
|
|
||||||
std::memset(stateSet, 0, stateSetSize);
|
|
||||||
next->Clear();
|
|
||||||
for (const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) {
|
|
||||||
const State& sr = GetState(*s);
|
|
||||||
if (sr.codepoint == codepoint ||
|
|
||||||
sr.codepoint == kAnyCharacterClass ||
|
|
||||||
(sr.codepoint == kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint)))
|
|
||||||
{
|
|
||||||
AddState(stateSet, *next, sr.out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Stack<Allocator>* temp = current;
|
|
||||||
current = next;
|
|
||||||
next = temp;
|
|
||||||
}
|
|
||||||
|
|
||||||
Allocator::Free(stateSet);
|
|
||||||
|
|
||||||
for (const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s)
|
|
||||||
if (GetState(*s).out == kRegexInvalidState)
|
|
||||||
return true;
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Match(const Ch* s) {
|
bool Match(const Ch* s) const {
|
||||||
StringStream is(s);
|
StringStream is(s);
|
||||||
return Match(is);
|
return Match(is);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename InputStream>
|
||||||
|
bool Search(InputStream& is) const {
|
||||||
|
return SearchWithAnchoring(is, anchorBegin_, anchorEnd_);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool Search(const Ch* s) const {
|
||||||
|
StringStream is(s);
|
||||||
|
return Search(is);
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
enum Operator {
|
enum Operator {
|
||||||
kZeroOrOne,
|
kZeroOrOne,
|
||||||
@ -193,32 +166,6 @@ private:
|
|||||||
return ranges_.template Bottom<Range>()[index];
|
return ranges_.template Bottom<Range>()[index];
|
||||||
}
|
}
|
||||||
|
|
||||||
void AddState(unsigned* stateSet, Stack<Allocator>& l, SizeType index) const {
|
|
||||||
if (index == kRegexInvalidState)
|
|
||||||
return;
|
|
||||||
|
|
||||||
const State& s = GetState(index);
|
|
||||||
if (s.out1 != kRegexInvalidState) { // Split
|
|
||||||
AddState(stateSet, l, s.out);
|
|
||||||
AddState(stateSet, l, s.out1);
|
|
||||||
}
|
|
||||||
else if (!(stateSet[index >> 5] & (1 << (index & 31)))) {
|
|
||||||
stateSet[index >> 5] |= (1 << (index & 31));
|
|
||||||
*l.template Push<SizeType>() = index;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
bool MatchRange(SizeType rangeIndex, unsigned codepoint) const {
|
|
||||||
bool yes = (GetRange(rangeIndex).start & kRangeNegationFlag) == 0;
|
|
||||||
while (rangeIndex != kRegexInvalidRange) {
|
|
||||||
const Range& r = GetRange(rangeIndex);
|
|
||||||
if (codepoint >= (r.start & ~kRangeNegationFlag) && codepoint <= r.end)
|
|
||||||
return yes;
|
|
||||||
rangeIndex = r.next;
|
|
||||||
}
|
|
||||||
return !yes;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename InputStream>
|
template <typename InputStream>
|
||||||
void Parse(DecodedStream<InputStream>& ds) {
|
void Parse(DecodedStream<InputStream>& ds) {
|
||||||
Allocator allocator;
|
Allocator allocator;
|
||||||
@ -231,6 +178,14 @@ private:
|
|||||||
unsigned codepoint;
|
unsigned codepoint;
|
||||||
while (ds.Peek() != 0) {
|
while (ds.Peek() != 0) {
|
||||||
switch (codepoint = ds.Take()) {
|
switch (codepoint = ds.Take()) {
|
||||||
|
case '^':
|
||||||
|
anchorBegin_ = true;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case '$':
|
||||||
|
anchorEnd_ = true;
|
||||||
|
break;
|
||||||
|
|
||||||
case '|':
|
case '|':
|
||||||
while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() < kAlternation)
|
while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() < kAlternation)
|
||||||
if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
|
if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
|
||||||
@ -567,6 +522,8 @@ private:
|
|||||||
bool CharacterEscape(DecodedStream<InputStream>& ds, unsigned* escapedCodepoint) {
|
bool CharacterEscape(DecodedStream<InputStream>& ds, unsigned* escapedCodepoint) {
|
||||||
unsigned codepoint;
|
unsigned codepoint;
|
||||||
switch (codepoint = ds.Take()) {
|
switch (codepoint = ds.Take()) {
|
||||||
|
case '^':
|
||||||
|
case '$':
|
||||||
case '|':
|
case '|':
|
||||||
case '(':
|
case '(':
|
||||||
case ')':
|
case ')':
|
||||||
@ -590,11 +547,87 @@ private:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename InputStream>
|
||||||
|
bool SearchWithAnchoring(InputStream& is, bool anchorBegin, bool anchorEnd) const {
|
||||||
|
RAPIDJSON_ASSERT(IsValid());
|
||||||
|
DecodedStream<InputStream> ds(is);
|
||||||
|
|
||||||
|
Allocator allocator;
|
||||||
|
Stack<Allocator> state0(&allocator, stateCount_ * sizeof(SizeType));
|
||||||
|
Stack<Allocator> state1(&allocator, stateCount_ * sizeof(SizeType));
|
||||||
|
Stack<Allocator> *current = &state0, *next = &state1;
|
||||||
|
|
||||||
|
const size_t stateSetSize = (stateCount_ + 31) / 32 * 4;
|
||||||
|
unsigned* stateSet = static_cast<unsigned*>(allocator.Malloc(stateSetSize));
|
||||||
|
std::memset(stateSet, 0, stateSetSize);
|
||||||
|
|
||||||
|
bool matched = false;
|
||||||
|
matched = AddState(stateSet, *current, root_);
|
||||||
|
|
||||||
|
unsigned codepoint;
|
||||||
|
while (!current->Empty() && (codepoint = ds.Take()) != 0) {
|
||||||
|
std::memset(stateSet, 0, stateSetSize);
|
||||||
|
next->Clear();
|
||||||
|
matched = false;
|
||||||
|
for (const SizeType* s = current->template Bottom<SizeType>(); s != current->template End<SizeType>(); ++s) {
|
||||||
|
const State& sr = GetState(*s);
|
||||||
|
if (sr.codepoint == codepoint ||
|
||||||
|
sr.codepoint == kAnyCharacterClass ||
|
||||||
|
(sr.codepoint == kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint)))
|
||||||
|
{
|
||||||
|
matched = AddState(stateSet, *next, sr.out) || matched;
|
||||||
|
if (!anchorEnd && matched)
|
||||||
|
goto exit;
|
||||||
|
}
|
||||||
|
if (!anchorBegin)
|
||||||
|
AddState(stateSet, *next, root_);
|
||||||
|
}
|
||||||
|
Stack<Allocator>* temp = current;
|
||||||
|
current = next;
|
||||||
|
next = temp;
|
||||||
|
}
|
||||||
|
|
||||||
|
exit:
|
||||||
|
Allocator::Free(stateSet);
|
||||||
|
return matched;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return whether the added states is a match state
|
||||||
|
bool AddState(unsigned* stateSet, Stack<Allocator>& l, SizeType index) const {
|
||||||
|
if (index == kRegexInvalidState)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
const State& s = GetState(index);
|
||||||
|
if (s.out1 != kRegexInvalidState) { // Split
|
||||||
|
bool matched = AddState(stateSet, l, s.out);
|
||||||
|
matched = AddState(stateSet, l, s.out1) || matched;
|
||||||
|
return matched;
|
||||||
|
}
|
||||||
|
else if (!(stateSet[index >> 5] & (1 << (index & 31)))) {
|
||||||
|
stateSet[index >> 5] |= (1 << (index & 31));
|
||||||
|
*l.template Push<SizeType>() = index;
|
||||||
|
return GetState(index).out == kRegexInvalidState;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Tests `codepoint` against the chain of ranges starting at `rangeIndex`.
// The negation flag is read from the first range's `start` field: for a
// non-negated class the result is true when some range contains the code
// point; for a negated class the result is inverted.
bool MatchRange(SizeType rangeIndex, unsigned codepoint) const {
    const bool negated = (GetRange(rangeIndex).start & kRangeNegationFlag) != 0;

    for (SizeType i = rangeIndex; i != kRegexInvalidRange; i = GetRange(i).next) {
        const Range& range = GetRange(i);
        const bool aboveLower = codepoint >= (range.start & ~kRangeNegationFlag);
        if (aboveLower && codepoint <= range.end)
            return !negated;
    }

    return negated;
}
|
||||||
|
|
||||||
Stack<Allocator> states_;
|
Stack<Allocator> states_;
|
||||||
Stack<Allocator> ranges_;
|
Stack<Allocator> ranges_;
|
||||||
SizeType root_;
|
SizeType root_;
|
||||||
SizeType stateCount_;
|
SizeType stateCount_;
|
||||||
SizeType rangeCount_;
|
SizeType rangeCount_;
|
||||||
|
bool anchorBegin_;
|
||||||
|
bool anchorEnd_;
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef GenericRegex<UTF8<> > Regex;
|
typedef GenericRegex<UTF8<> > Regex;
|
||||||
|
@ -432,11 +432,65 @@ TEST(Regex, CharacterRange8) {
|
|||||||
EXPECT_FALSE(re.Match("!"));
|
EXPECT_FALSE(re.Match("!"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Unanchored pattern: Search() succeeds when "abc" occurs anywhere in the input.
TEST(Regex, Search) {
    Regex re("abc");
    ASSERT_TRUE(re.IsValid());

    static const char* const kFound[] = { "abc", "_abc", "abc_", "_abc_", "__abc__", "abcabc" };
    for (size_t i = 0; i < sizeof(kFound) / sizeof(kFound[0]); ++i)
        EXPECT_TRUE(re.Search(kFound[i])) << "input: " << kFound[i];

    static const char* const kNotFound[] = { "a", "ab", "bc", "cba" };
    for (size_t i = 0; i < sizeof(kNotFound) / sizeof(kNotFound[0]); ++i)
        EXPECT_FALSE(re.Search(kNotFound[i])) << "input: " << kNotFound[i];
}
|
||||||
|
|
||||||
|
// '^' anchor: the match must start at the beginning of the input.
TEST(Regex, Search_BeginAnchor) {
    Regex re("^abc");
    ASSERT_TRUE(re.IsValid());

    static const char* const kFound[] = { "abc", "abc_", "abcabc" };
    for (size_t i = 0; i < sizeof(kFound) / sizeof(kFound[0]); ++i)
        EXPECT_TRUE(re.Search(kFound[i])) << "input: " << kFound[i];

    static const char* const kNotFound[] = { "_abc", "_abc_", "a", "ab", "bc", "cba" };
    for (size_t i = 0; i < sizeof(kNotFound) / sizeof(kNotFound[0]); ++i)
        EXPECT_FALSE(re.Search(kNotFound[i])) << "input: " << kNotFound[i];
}
|
||||||
|
|
||||||
|
// '$' anchor: the match must finish at the end of the input.
TEST(Regex, Search_EndAnchor) {
    Regex re("abc$");
    ASSERT_TRUE(re.IsValid());

    static const char* const kFound[] = { "abc", "_abc", "abcabc" };
    for (size_t i = 0; i < sizeof(kFound) / sizeof(kFound[0]); ++i)
        EXPECT_TRUE(re.Search(kFound[i])) << "input: " << kFound[i];

    static const char* const kNotFound[] = { "abc_", "_abc_", "a", "ab", "bc", "cba" };
    for (size_t i = 0; i < sizeof(kNotFound) / sizeof(kNotFound[0]); ++i)
        EXPECT_FALSE(re.Search(kNotFound[i])) << "input: " << kNotFound[i];
}
|
||||||
|
|
||||||
|
// '^' and '$' together: only the exact input "abc" is accepted.
TEST(Regex, Search_BothAnchor) {
    Regex re("^abc$");
    ASSERT_TRUE(re.IsValid());

    EXPECT_TRUE(re.Search("abc"));

    static const char* const kNotFound[] = { "", "a", "b", "ab", "abcd" };
    for (size_t i = 0; i < sizeof(kNotFound) / sizeof(kNotFound[0]); ++i)
        EXPECT_FALSE(re.Search(kNotFound[i])) << "input: \"" << kNotFound[i] << "\"";
}
|
||||||
|
|
||||||
TEST(Regex, Escape) {
|
TEST(Regex, Escape) {
|
||||||
const char* s = "\\|\\(\\)\\?\\*\\+\\.\\[\\]\\{\\}\\\\\\f\\n\\r\\t\\v[\\b][\\[][\\]]";
|
const char* s = "\\^\\$\\|\\(\\)\\?\\*\\+\\.\\[\\]\\{\\}\\\\\\f\\n\\r\\t\\v[\\b][\\[][\\]]";
|
||||||
Regex re(s);
|
Regex re(s);
|
||||||
ASSERT_TRUE(re.IsValid());
|
ASSERT_TRUE(re.IsValid());
|
||||||
EXPECT_TRUE(re.Match("|()?*+.[]{}\\\x0C\n\r\t\x0B\b[]"));
|
EXPECT_TRUE(re.Match("^$|()?*+.[]{}\\\x0C\n\r\t\x0B\b[]"));
|
||||||
EXPECT_FALSE(re.Match(s)); // Not escaping
|
EXPECT_FALSE(re.Match(s)); // Not escaping
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user