Add character class escapes

2025-03-10 03:29:59 +01:00 · 2015-05-27 09:56:06 +08:00 · 2015-05-27 09:56:06 +08:00 · 0dffe87551
commit 0dffe87551
parent 92285bed44
2 changed files with 49 additions and 28 deletions
--- a/include/rapidjson/internal/regex.h
+++ b/include/rapidjson/internal/regex.h
@ -46,6 +46,7 @@ static const SizeType kRegexInvalidRange = ~SizeType(0);
    - \c [a-z0-9_] Character class combination
    - \c [^abc] Negated character classes
    - \c [^a-c] Negated character class range
    - \c [\\b]  Backspace (U+0008)
    - \c \\| \\\\ ...  Escape characters
    - \c \\f Form feed (U+000C)
    - \c \\n Line feed (U+000A)
@ -265,26 +266,8 @@ private:
                case '\\': // Escape character
                    if (!Encoding::Decode(is, &codepoint) || codepoint == 0)
                        return; // Expect an escape character
-                    switch (codepoint) {
+                    if (!CharacterEscape(codepoint, &codepoint))
-                        case '|':
+                        return; // Unsupported escape character
                        case '(':
                        case ')':
                        case '?':
                        case '*':
                        case '+':
                        case '.':
                        case '[':
                        case ']':
                        case '\\':
                            break; // use the codepoint as is
                        case 'f': codepoint = 0x000C; break;
                        case 'n': codepoint = 0x000A; break;
                        case 'r': codepoint = 0x000D; break;
                        case 't': codepoint = 0x0009; break;
                        case 'v': codepoint = 0x000B; break;
                        default:
                            return; // Unsupported escape character
                    }
                    // fall through to default
                default: // Pattern character
@ -414,9 +397,16 @@ private:
        SizeType current = kRegexInvalidRange;
        unsigned codepoint;
        while (Encoding::Decode(is, &codepoint) && codepoint != 0) {
-            if (isBegin && codepoint == '^')
+            if (isBegin) {
-                negate = true;
+                isBegin = false;
-            else if (codepoint == ']') {
+                if (codepoint == '^') {
                    negate = true;
                    continue;
                }
            }
            switch (codepoint) {
            case ']':
                if (step == 2) { // Add trailing '-'
                    SizeType r = NewRange('-');
                    RAPIDJSON_ASSERT(current != kRegexInvalidRange);
@ -426,8 +416,17 @@ private:
                    GetRange(start).start |= kRangeNegationFlag;
                *range = start;
                return true;
-            }
+
-            else {
+            case '\\':
                if (!Encoding::Decode(is, &codepoint) || codepoint == 0)
                    return false; // Expect an escape character
                if (codepoint == 'b')
                    codepoint = 0x0008; // Escape backspace character
                else if (!CharacterEscape(codepoint, &codepoint))
                    return false;
                // fall through to default
            default:
                switch (step) {
                case 1:
                    if (codepoint == '-') {
@ -454,7 +453,6 @@ private:
                    step = 0;
                }
            }
            isBegin = false;
        }
        return false;
    }
@ -466,6 +464,29 @@ private:
        return rangeCount_++;
    }
    // Resolves the character that follows a backslash in a pattern.
    //
    // codepoint         The codepoint read immediately after the backslash.
    // escapedCodepoint  Receives the resulting codepoint on success. It is
    //                   written only when the escape is supported.
    //
    // Returns true if the escape is supported, false otherwise.
    bool CharacterEscape(unsigned codepoint, unsigned* escapedCodepoint) {
        unsigned resolved;

        switch (codepoint) {
            // Control character shorthands map to fixed codepoints.
            case 'f': resolved = 0x000C; break; // form feed
            case 'n': resolved = 0x000A; break; // line feed
            case 'r': resolved = 0x000D; break; // carriage return
            case 't': resolved = 0x0009; break; // horizontal tab
            case 'v': resolved = 0x000B; break; // vertical tab

            // Escaped metacharacters (and the backslash itself) stand for themselves.
            case '|': case '(': case ')':
            case '?': case '*': case '+':
            case '.': case '[': case ']':
            case '\\':
                resolved = codepoint;
                break;

            default:
                return false; // Unsupported escape character
        }

        *escapedCodepoint = resolved;
        return true;
    }
    Stack<Allocator> states_;
    Stack<Allocator> ranges_;
    SizeType root_;
--- a/test/unittest/regextest.cpp
+++ b/test/unittest/regextest.cpp
@ -328,10 +328,10 @@ TEST(Regex, CharacterRange8) {
 }
 TEST(Regex, Escape) {
-    const char* s = "\\|\\(\\)\\?\\*\\+\\.\\[\\]\\\\\\f\\n\\r\\t\\v";
+    const char* s = "\\|\\(\\)\\?\\*\\+\\.\\[\\]\\\\\\f\\n\\r\\t\\v[\\b][\\[][\\]]";
    Regex re(s);
    ASSERT_TRUE(re.IsValid());
-    EXPECT_TRUE(re.Match("|()?*+.[]\\\x0C\n\r\t\x0B"));
+    EXPECT_TRUE(re.Match("|()?*+.[]\\\x0C\n\r\t\x0B\b[]"));
    EXPECT_FALSE(re.Match(s)); // Not escaping
 }