Add character class escapes

This commit is contained in:
miloyip 2015-05-27 09:56:06 +08:00
parent 92285bed44
commit 0dffe87551
2 changed files with 49 additions and 28 deletions

View File

@ -46,6 +46,7 @@ static const SizeType kRegexInvalidRange = ~SizeType(0);
- \c [a-z0-9_] Character class combination - \c [a-z0-9_] Character class combination
- \c [^abc] Negated character classes - \c [^abc] Negated character classes
- \c [^a-c] Negated character class range - \c [^a-c] Negated character class range
- \c [\b] Backspace (U+0008)
- \c \\| \\\\ ... Escape characters - \c \\| \\\\ ... Escape characters
- \c \\f Form feed (U+000C) - \c \\f Form feed (U+000C)
- \c \\n Line feed (U+000A) - \c \\n Line feed (U+000A)
@ -265,26 +266,8 @@ private:
case '\\': // Escape character case '\\': // Escape character
if (!Encoding::Decode(is, &codepoint) || codepoint == 0) if (!Encoding::Decode(is, &codepoint) || codepoint == 0)
return; // Expect an escape character return; // Expect an escape character
switch (codepoint) { if (!CharacterEscape(codepoint, &codepoint))
case '|': return; // Unsupported escape character
case '(':
case ')':
case '?':
case '*':
case '+':
case '.':
case '[':
case ']':
case '\\':
break; // use the codepoint as is
case 'f': codepoint = 0x000C; break;
case 'n': codepoint = 0x000A; break;
case 'r': codepoint = 0x000D; break;
case 't': codepoint = 0x0009; break;
case 'v': codepoint = 0x000B; break;
default:
return; // Unsupported escape character
}
// fall through to default // fall through to default
default: // Pattern character default: // Pattern character
@ -414,9 +397,16 @@ private:
SizeType current = kRegexInvalidRange; SizeType current = kRegexInvalidRange;
unsigned codepoint; unsigned codepoint;
while (Encoding::Decode(is, &codepoint) && codepoint != 0) { while (Encoding::Decode(is, &codepoint) && codepoint != 0) {
if (isBegin && codepoint == '^') if (isBegin) {
negate = true; isBegin = false;
else if (codepoint == ']') { if (codepoint == '^') {
negate = true;
continue;
}
}
switch (codepoint) {
case ']':
if (step == 2) { // Add trailing '-' if (step == 2) { // Add trailing '-'
SizeType r = NewRange('-'); SizeType r = NewRange('-');
RAPIDJSON_ASSERT(current != kRegexInvalidRange); RAPIDJSON_ASSERT(current != kRegexInvalidRange);
@ -426,8 +416,17 @@ private:
GetRange(start).start |= kRangeNegationFlag; GetRange(start).start |= kRangeNegationFlag;
*range = start; *range = start;
return true; return true;
}
else { case '\\':
if (!Encoding::Decode(is, &codepoint) || codepoint == 0)
return false; // Expect an escape character
if (codepoint == 'b')
codepoint = 0x0008; // Escape backspace character
else if (!CharacterEscape(codepoint, &codepoint))
return false;
// fall through to default
default:
switch (step) { switch (step) {
case 1: case 1:
if (codepoint == '-') { if (codepoint == '-') {
@ -454,7 +453,6 @@ private:
step = 0; step = 0;
} }
} }
isBegin = false;
} }
return false; return false;
} }
@ -466,6 +464,29 @@ private:
return rangeCount_++; return rangeCount_++;
} }
// Resolves the character that follows a backslash in a pattern.
//
// codepoint         The code point read immediately after the '\'.
// escapedCodepoint  Receives the code point the escape sequence stands for.
//                   It is written only when the escape is recognized.
//
// Returns true for a supported escape, false otherwise (in which case
// *escapedCodepoint is left untouched).
bool CharacterEscape(unsigned codepoint, unsigned* escapedCodepoint) {
    unsigned resolved;

    switch (codepoint) {
        // Control-character shorthands map to fixed code points.
        case 'f': resolved = 0x000C; break;   // form feed
        case 'n': resolved = 0x000A; break;   // line feed
        case 'r': resolved = 0x000D; break;   // carriage return
        case 't': resolved = 0x0009; break;   // horizontal tab
        case 'v': resolved = 0x000B; break;   // vertical tab

        // Escaped metacharacters (and the backslash itself) stand for themselves.
        case '|': case '(': case ')': case '?': case '*':
        case '+': case '.': case '[': case ']': case '\\':
            resolved = codepoint;
            break;

        default:
            return false;   // Unsupported escape character
    }

    *escapedCodepoint = resolved;
    return true;
}
Stack<Allocator> states_; Stack<Allocator> states_;
Stack<Allocator> ranges_; Stack<Allocator> ranges_;
SizeType root_; SizeType root_;

View File

@ -328,10 +328,10 @@ TEST(Regex, CharacterRange8) {
} }
TEST(Regex, Escape) { TEST(Regex, Escape) {
const char* s = "\\|\\(\\)\\?\\*\\+\\.\\[\\]\\\\\\f\\n\\r\\t\\v"; const char* s = "\\|\\(\\)\\?\\*\\+\\.\\[\\]\\\\\\f\\n\\r\\t\\v[\\b][\\[][\\]]";
Regex re(s); Regex re(s);
ASSERT_TRUE(re.IsValid()); ASSERT_TRUE(re.IsValid());
EXPECT_TRUE(re.Match("|()?*+.[]\\\x0C\n\r\t\x0B")); EXPECT_TRUE(re.Match("|()?*+.[]\\\x0C\n\r\t\x0B\b[]"));
EXPECT_FALSE(re.Match(s)); // Not escaping EXPECT_FALSE(re.Match(s)); // Not escaping
} }