From 3d316bd5489446b7bc75d188992ad244252fdd09 Mon Sep 17 00:00:00 2001 From: cesar Date: Thu, 26 May 2022 10:11:05 -0400 Subject: [PATCH] Readded named substring support for regexes (#3569) Original merged PR #952 was merged in 2015, but has gone missing. This is adding it back in. --- Foundation/include/Poco/RegularExpression.h | 39 +++++++++++-------- Foundation/src/RegularExpression.cpp | 26 +++++++++++++ .../testsuite/src/RegularExpressionTest.cpp | 12 ++++++ .../testsuite/src/RegularExpressionTest.h | 1 + 4 files changed, 61 insertions(+), 17 deletions(-) diff --git a/Foundation/include/Poco/RegularExpression.h b/Foundation/include/Poco/RegularExpression.h index 5291aab6c..edc9731e3 100644 --- a/Foundation/include/Poco/RegularExpression.h +++ b/Foundation/include/Poco/RegularExpression.h @@ -23,6 +23,7 @@ #include "Poco/Foundation.h" #include <vector> +#include <map> namespace Poco { @@ -39,7 +40,7 @@ public: /// Some of the following options can only be passed to the constructor; /// some can be passed only to matching functions, and some can be used /// everywhere. - /// + /// /// * Options marked [ctor] can be passed to the constructor. /// * Options marked [match] can be passed to match, extract, split and subst. /// * Options marked [subst] can be passed to subst. 
@@ -61,10 +62,10 @@ public: RE_NO_AUTO_CAPTURE = 0x00001000, /// disable numbered capturing parentheses [ctor, match] RE_NO_UTF8_CHECK = 0x00002000, /// do not check validity of UTF-8 code sequences [match] RE_FIRSTLINE = 0x00040000, /// an unanchored pattern is required to match - /// before or at the first newline in the subject string, + /// before or at the first newline in the subject string, /// though the matched text may continue over the newline [ctor] RE_DUPNAMES = 0x00080000, /// names used to identify capturing subpatterns need not be unique [ctor] - RE_NEWLINE_CR = 0x00100000, /// assume newline is CR ('\r'), the default [ctor] + RE_NEWLINE_CR = 0x00100000, /// assume newline is CR ('\r'), the default [ctor] RE_NEWLINE_LF = 0x00200000, /// assume newline is LF ('\n') [ctor] RE_NEWLINE_CRLF = 0x00300000, /// assume newline is CRLF ("\r\n") [ctor] RE_NEWLINE_ANY = 0x00400000, /// assume newline is any valid Unicode newline character [ctor] @@ -72,21 +73,23 @@ public: RE_GLOBAL = 0x10000000, /// replace all occurences (/g) [subst] RE_NO_VARS = 0x20000000 /// treat dollar in replacement string as ordinary character [subst] }; - + struct Match { std::string::size_type offset; /// zero based offset (std::string::npos if subexpr does not match) std::string::size_type length; /// length of substring + std::string name; /// name of group }; using MatchVec = std::vector<Match>; - + using GroupMap = std::map<int, std::string>; + RegularExpression(const std::string& pattern, int options = 0, bool study = true); /// Creates a regular expression and parses the given pattern. /// If study is true, the pattern is analyzed and optimized. This /// is mainly useful if the pattern is used more than once. /// For a description of the options, please see the PCRE documentation. /// Throws a RegularExpressionException if the patter cannot be compiled. - + ~RegularExpression(); /// Destroys the regular expression. @@ -99,7 +102,7 @@ public: /// Returns the number of matches. 
int match(const std::string& subject, std::string::size_type offset, Match& mtch, int options = 0) const; - /// Matches the given subject string, starting at offset, against the pattern. + /// Matches the given subject string, starting at offset, against the pattern. /// Returns the position of the captured substring in mtch. /// If no part of the subject matches the pattern, mtch.offset is std::string::npos and /// mtch.length is 0. @@ -107,7 +110,7 @@ public: /// Returns the number of matches. int match(const std::string& subject, std::string::size_type offset, MatchVec& matches, int options = 0) const; - /// Matches the given subject string against the pattern. + /// Matches the given subject string against the pattern. /// The first entry in matches contains the position of the captured substring. /// The following entries identify matching subpatterns. See the PCRE documentation /// for a more detailed explanation. @@ -140,19 +143,19 @@ public: /// the pattern is treated as if it starts with a ^. int extract(const std::string& subject, std::string& str, int options = 0) const; - /// Matches the given subject string against the pattern. + /// Matches the given subject string against the pattern. /// Returns the captured string. /// Throws a RegularExpressionException in case of an error. /// Returns the number of matches. int extract(const std::string& subject, std::string::size_type offset, std::string& str, int options = 0) const; - /// Matches the given subject string, starting at offset, against the pattern. + /// Matches the given subject string, starting at offset, against the pattern. /// Returns the captured string. /// Throws a RegularExpressionException in case of an error. /// Returns the number of matches. int split(const std::string& subject, std::vector<std::string>& strings, int options = 0) const; - /// Matches the given subject string against the pattern. + /// Matches the given subject string against the pattern. 
/// The first entry in captured is the captured substring. /// The following entries contain substrings matching subpatterns. See the PCRE documentation /// for a more detailed explanation. @@ -161,14 +164,14 @@ public: /// Returns the number of matches. int split(const std::string& subject, std::string::size_type offset, std::vector<std::string>& strings, int options = 0) const; - /// Matches the given subject string against the pattern. + /// Matches the given subject string against the pattern. /// The first entry in captured is the captured substring. /// The following entries contain substrings matching subpatterns. See the PCRE documentation /// for a more detailed explanation. /// If no part of the subject matches the pattern, captured is empty. /// Throws a RegularExpressionException in case of an error. /// Returns the number of matches. - + int subst(std::string& subject, const std::string& replacement, int options = 0) const; /// Substitute in subject all matches of the pattern with replacement. /// If RE_GLOBAL is specified as option, all matches are replaced. Otherwise, @@ -183,7 +186,7 @@ public: /// If RE_GLOBAL is specified as option, all matches are replaced. Otherwise, /// only the first match is replaced. /// Unless RE_NO_VARS is specified, occurrences of $<n> (for example, $0, $1, $2, ... $9) - /// in replacement are replaced with the corresponding captured string. + /// in replacement are replaced with the corresponding captured string. /// $0 is the captured substring. $1 ... $n are the substrings matching the subpatterns. /// Returns the number of replaced occurrences. 
@@ -195,13 +198,15 @@ protected: std::string::size_type substOne(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const; private: - // Note: to avoid a dependency on the pcre.h header the following are + // Note: to avoid a dependency on the pcre.h header the following are // declared as void* and casted to the correct type in the implementation file. void* _pcre; // Actual type is pcre* void* _extra; // Actual type is struct pcre_extra* - + + GroupMap _groups; + static const int OVEC_SIZE; - + RegularExpression(); RegularExpression(const RegularExpression&); RegularExpression& operator = (const RegularExpression&); diff --git a/Foundation/src/RegularExpression.cpp b/Foundation/src/RegularExpression.cpp index 90f22b9f5..794246f6e 100644 --- a/Foundation/src/RegularExpression.cpp +++ b/Foundation/src/RegularExpression.cpp @@ -33,6 +33,10 @@ RegularExpression::RegularExpression(const std::string& pattern, int options, bo { const char* error; int offs; + unsigned nmcount; + unsigned nmentrysz; + unsigned char* nmtbl; + _pcre = pcre_compile(pattern.c_str(), options, &error, &offs, 0); if (!_pcre) { @@ -42,6 +46,19 @@ RegularExpression::RegularExpression(const std::string& pattern, int options, bo } if (study) _extra = pcre_study(reinterpret_cast<pcre*>(_pcre), 0, &error); + + const pcre* regex = reinterpret_cast<pcre*>(_pcre); + const pcre_extra* extra = reinterpret_cast<struct pcre_extra*>(_extra); + pcre_fullinfo(regex, extra, PCRE_INFO_NAMECOUNT, &nmcount); + pcre_fullinfo(regex, extra, PCRE_INFO_NAMEENTRYSIZE, &nmentrysz); + pcre_fullinfo(regex, extra, PCRE_INFO_NAMETABLE, &nmtbl); + + for (int i = 0; i < nmcount; i++) + { + unsigned char* group = nmtbl + 2 + (nmentrysz * i); + int n = pcre_get_stringnumber(regex, (char*) group); + _groups[n] = std::string((char*) group); + } } @@ -114,8 +131,17 @@ int RegularExpression::match(const std::string& subject, std::string::size_type for (int i = 0; i < rc; ++i) { Match m; + GroupMap::const_iterator it; + 
m.offset = ovec[i*2] < 0 ? std::string::npos : ovec[i*2] ; m.length = ovec[i*2 + 1] - m.offset; + + it = _groups.find(i); + if (it != _groups.end()) + { + m.name = (*it).second; + } + matches.push_back(m); } return rc; diff --git a/Foundation/testsuite/src/RegularExpressionTest.cpp b/Foundation/testsuite/src/RegularExpressionTest.cpp index 921b9a226..34a336a84 100644 --- a/Foundation/testsuite/src/RegularExpressionTest.cpp +++ b/Foundation/testsuite/src/RegularExpressionTest.cpp @@ -263,6 +263,17 @@ void RegularExpressionTest::testError() } +void RegularExpressionTest::testGroup() +{ + RegularExpression::MatchVec matches; + RegularExpression re("(?P<group1>[a-z]+) (?P<group2>[0-9]+)"); + assertTrue (re.match("abcd 1234", 0, matches) == 3); + assertTrue (matches[0].name == ""); + assertTrue (matches[1].name == "group1"); + assertTrue (matches[2].name == "group2"); +} + + void RegularExpressionTest::setUp() { } @@ -292,6 +303,7 @@ CppUnit::Test* RegularExpressionTest::suite() CppUnit_addTest(pSuite, RegularExpressionTest, testSubst3); CppUnit_addTest(pSuite, RegularExpressionTest, testSubst4); CppUnit_addTest(pSuite, RegularExpressionTest, testError); + CppUnit_addTest(pSuite, RegularExpressionTest, testGroup); return pSuite; } diff --git a/Foundation/testsuite/src/RegularExpressionTest.h b/Foundation/testsuite/src/RegularExpressionTest.h index 4779682e7..d2151780b 100644 --- a/Foundation/testsuite/src/RegularExpressionTest.h +++ b/Foundation/testsuite/src/RegularExpressionTest.h @@ -39,6 +39,7 @@ public: void testSubst3(); void testSubst4(); void testError(); + void testGroup(); void setUp(); void tearDown();