mirror of
https://github.com/pocoproject/poco.git
synced 2025-01-31 14:39:53 +01:00
Readded named substring support for regexes (#3569)
Original merged PR #952 was merged in 2015, but has gone missing. This is adding it back in.
This commit is contained in:
parent
5f1292a4d7
commit
3d316bd548
@ -23,6 +23,7 @@
|
||||
|
||||
#include "Poco/Foundation.h"
|
||||
#include <vector>
|
||||
#include <map>
|
||||
|
||||
|
||||
namespace Poco {
|
||||
@ -39,7 +40,7 @@ public:
|
||||
/// Some of the following options can only be passed to the constructor;
|
||||
/// some can be passed only to matching functions, and some can be used
|
||||
/// everywhere.
|
||||
///
|
||||
///
|
||||
/// * Options marked [ctor] can be passed to the constructor.
|
||||
/// * Options marked [match] can be passed to match, extract, split and subst.
|
||||
/// * Options marked [subst] can be passed to subst.
|
||||
@ -61,10 +62,10 @@ public:
|
||||
RE_NO_AUTO_CAPTURE = 0x00001000, /// disable numbered capturing parentheses [ctor, match]
|
||||
RE_NO_UTF8_CHECK = 0x00002000, /// do not check validity of UTF-8 code sequences [match]
|
||||
RE_FIRSTLINE = 0x00040000, /// an unanchored pattern is required to match
|
||||
/// before or at the first newline in the subject string,
|
||||
/// before or at the first newline in the subject string,
|
||||
/// though the matched text may continue over the newline [ctor]
|
||||
RE_DUPNAMES = 0x00080000, /// names used to identify capturing subpatterns need not be unique [ctor]
|
||||
RE_NEWLINE_CR = 0x00100000, /// assume newline is CR ('\r'), the default [ctor]
|
||||
RE_NEWLINE_CR = 0x00100000, /// assume newline is CR ('\r'), the default [ctor]
|
||||
RE_NEWLINE_LF = 0x00200000, /// assume newline is LF ('\n') [ctor]
|
||||
RE_NEWLINE_CRLF = 0x00300000, /// assume newline is CRLF ("\r\n") [ctor]
|
||||
RE_NEWLINE_ANY = 0x00400000, /// assume newline is any valid Unicode newline character [ctor]
|
||||
@ -72,21 +73,23 @@ public:
|
||||
RE_GLOBAL = 0x10000000, /// replace all occurrences (/g) [subst]
|
||||
RE_NO_VARS = 0x20000000 /// treat dollar in replacement string as ordinary character [subst]
|
||||
};
|
||||
|
||||
|
||||
struct Match
/// Describes one captured substring (the overall match or a subpattern)
/// by its position in the subject string and, if it has one, the name
/// of its capture group.
|
||||
{
|
||||
std::string::size_type offset; /// zero based offset (std::string::npos if subexpr does not match)
|
||||
std::string::size_type length; /// length of substring
|
||||
std::string name; /// name of group (empty for unnamed groups, e.g. the overall match)
|
||||
};
|
||||
using MatchVec = std::vector<Match>;
|
||||
|
||||
using GroupMap = std::map<int, std::string>;
|
||||
|
||||
RegularExpression(const std::string& pattern, int options = 0, bool study = true);
|
||||
/// Creates a regular expression and parses the given pattern.
|
||||
/// If study is true, the pattern is analyzed and optimized. This
|
||||
/// is mainly useful if the pattern is used more than once.
|
||||
/// For a description of the options, please see the PCRE documentation.
|
||||
/// Throws a RegularExpressionException if the pattern cannot be compiled.
|
||||
|
||||
|
||||
~RegularExpression();
|
||||
/// Destroys the regular expression.
|
||||
|
||||
@ -99,7 +102,7 @@ public:
|
||||
/// Returns the number of matches.
|
||||
|
||||
int match(const std::string& subject, std::string::size_type offset, Match& mtch, int options = 0) const;
|
||||
/// Matches the given subject string, starting at offset, against the pattern.
|
||||
/// Matches the given subject string, starting at offset, against the pattern.
|
||||
/// Returns the position of the captured substring in mtch.
|
||||
/// If no part of the subject matches the pattern, mtch.offset is std::string::npos and
|
||||
/// mtch.length is 0.
|
||||
@ -107,7 +110,7 @@ public:
|
||||
/// Returns the number of matches.
|
||||
|
||||
int match(const std::string& subject, std::string::size_type offset, MatchVec& matches, int options = 0) const;
|
||||
/// Matches the given subject string against the pattern.
|
||||
/// Matches the given subject string against the pattern.
|
||||
/// The first entry in matches contains the position of the captured substring.
|
||||
/// The following entries identify matching subpatterns. See the PCRE documentation
|
||||
/// for a more detailed explanation.
|
||||
@ -140,19 +143,19 @@ public:
|
||||
/// the pattern is treated as if it starts with a ^.
|
||||
|
||||
int extract(const std::string& subject, std::string& str, int options = 0) const;
|
||||
/// Matches the given subject string against the pattern.
|
||||
/// Matches the given subject string against the pattern.
|
||||
/// Returns the captured string.
|
||||
/// Throws a RegularExpressionException in case of an error.
|
||||
/// Returns the number of matches.
|
||||
|
||||
int extract(const std::string& subject, std::string::size_type offset, std::string& str, int options = 0) const;
|
||||
/// Matches the given subject string, starting at offset, against the pattern.
|
||||
/// Matches the given subject string, starting at offset, against the pattern.
|
||||
/// Returns the captured string.
|
||||
/// Throws a RegularExpressionException in case of an error.
|
||||
/// Returns the number of matches.
|
||||
|
||||
int split(const std::string& subject, std::vector<std::string>& strings, int options = 0) const;
|
||||
/// Matches the given subject string against the pattern.
|
||||
/// Matches the given subject string against the pattern.
|
||||
/// The first entry in strings is the captured substring.
|
||||
/// The following entries contain substrings matching subpatterns. See the PCRE documentation
|
||||
/// for a more detailed explanation.
|
||||
@ -161,14 +164,14 @@ public:
|
||||
/// Returns the number of matches.
|
||||
|
||||
int split(const std::string& subject, std::string::size_type offset, std::vector<std::string>& strings, int options = 0) const;
|
||||
/// Matches the given subject string against the pattern.
|
||||
/// Matches the given subject string against the pattern.
|
||||
/// The first entry in strings is the captured substring.
|
||||
/// The following entries contain substrings matching subpatterns. See the PCRE documentation
|
||||
/// for a more detailed explanation.
|
||||
/// If no part of the subject matches the pattern, strings is empty.
|
||||
/// Throws a RegularExpressionException in case of an error.
|
||||
/// Returns the number of matches.
|
||||
|
||||
|
||||
int subst(std::string& subject, const std::string& replacement, int options = 0) const;
|
||||
/// Substitute in subject all matches of the pattern with replacement.
|
||||
/// If RE_GLOBAL is specified as option, all matches are replaced. Otherwise,
|
||||
@ -183,7 +186,7 @@ public:
|
||||
/// If RE_GLOBAL is specified as option, all matches are replaced. Otherwise,
|
||||
/// only the first match is replaced.
|
||||
/// Unless RE_NO_VARS is specified, occurrences of $<n> (for example, $0, $1, $2, ... $9)
|
||||
/// in replacement are replaced with the corresponding captured string.
|
||||
/// in replacement are replaced with the corresponding captured string.
|
||||
/// $0 is the captured substring. $1 ... $n are the substrings matching the subpatterns.
|
||||
/// Returns the number of replaced occurrences.
|
||||
|
||||
@ -195,13 +198,15 @@ protected:
|
||||
std::string::size_type substOne(std::string& subject, std::string::size_type offset, const std::string& replacement, int options) const;
|
||||
|
||||
private:
|
||||
// Note: to avoid a dependency on the pcre.h header the following are
|
||||
// Note: to avoid a dependency on the pcre.h header the following are
|
||||
// declared as void* and cast to the correct type in the implementation file.
|
||||
void* _pcre; // Actual type is pcre*
|
||||
void* _extra; // Actual type is struct pcre_extra*
|
||||
|
||||
|
||||
GroupMap _groups;
|
||||
|
||||
static const int OVEC_SIZE;
|
||||
|
||||
|
||||
RegularExpression();
|
||||
RegularExpression(const RegularExpression&);
|
||||
RegularExpression& operator = (const RegularExpression&);
|
||||
|
@ -33,6 +33,10 @@ RegularExpression::RegularExpression(const std::string& pattern, int options, bo
|
||||
{
|
||||
const char* error;
|
||||
int offs;
|
||||
unsigned nmcount;
|
||||
unsigned nmentrysz;
|
||||
unsigned char* nmtbl;
|
||||
|
||||
_pcre = pcre_compile(pattern.c_str(), options, &error, &offs, 0);
|
||||
if (!_pcre)
|
||||
{
|
||||
@ -42,6 +46,19 @@ RegularExpression::RegularExpression(const std::string& pattern, int options, bo
|
||||
}
|
||||
if (study)
|
||||
_extra = pcre_study(reinterpret_cast<pcre*>(_pcre), 0, &error);
|
||||
|
||||
const pcre* regex = reinterpret_cast<pcre*>(_pcre);
|
||||
const pcre_extra* extra = reinterpret_cast<pcre_extra*>(_extra);
|
||||
pcre_fullinfo(regex, extra, PCRE_INFO_NAMECOUNT, &nmcount);
|
||||
pcre_fullinfo(regex, extra, PCRE_INFO_NAMEENTRYSIZE, &nmentrysz);
|
||||
pcre_fullinfo(regex, extra, PCRE_INFO_NAMETABLE, &nmtbl);
|
||||
|
||||
for (int i = 0; i < nmcount; i++)
|
||||
{
|
||||
unsigned char* group = nmtbl + 2 + (nmentrysz * i);
|
||||
int n = pcre_get_stringnumber(regex, (char*) group);
|
||||
_groups[n] = std::string((char*) group);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -114,8 +131,17 @@ int RegularExpression::match(const std::string& subject, std::string::size_type
|
||||
for (int i = 0; i < rc; ++i)
|
||||
{
|
||||
Match m;
|
||||
GroupMap::const_iterator it;
|
||||
|
||||
m.offset = ovec[i*2] < 0 ? std::string::npos : ovec[i*2] ;
|
||||
m.length = ovec[i*2 + 1] - m.offset;
|
||||
|
||||
it = _groups.find(i);
|
||||
if (it != _groups.end())
|
||||
{
|
||||
m.name = (*it).second;
|
||||
}
|
||||
|
||||
matches.push_back(m);
|
||||
}
|
||||
return rc;
|
||||
|
@ -263,6 +263,17 @@ void RegularExpressionTest::testError()
|
||||
}
|
||||
|
||||
|
||||
void RegularExpressionTest::testGroup()
|
||||
{
|
||||
// Verifies that named capture groups expose their names through Match::name.
RegularExpression::MatchVec matches;
|
||||
// Pattern with two named groups: letters ("group1") and digits ("group2").
RegularExpression re("(?P<group1>[a-z]+) (?P<group2>[0-9]+)");
|
||||
// Three entries are expected: the overall match plus the two groups.
assertTrue (re.match("abcd 1234", 0, matches) == 3);
|
||||
// Entry 0 is the overall match and carries no name.
assertTrue (matches[0].name == "");
|
||||
assertTrue (matches[1].name == "group1");
|
||||
assertTrue (matches[2].name == "group2");
|
||||
}
|
||||
|
||||
|
||||
void RegularExpressionTest::setUp()
|
||||
{
|
||||
// Empty body: no fixture state is initialized here.
}
|
||||
@ -292,6 +303,7 @@ CppUnit::Test* RegularExpressionTest::suite()
|
||||
CppUnit_addTest(pSuite, RegularExpressionTest, testSubst3);
|
||||
CppUnit_addTest(pSuite, RegularExpressionTest, testSubst4);
|
||||
CppUnit_addTest(pSuite, RegularExpressionTest, testError);
|
||||
CppUnit_addTest(pSuite, RegularExpressionTest, testGroup);
|
||||
|
||||
return pSuite;
|
||||
}
|
||||
|
@ -39,6 +39,7 @@ public:
|
||||
void testSubst3();
|
||||
void testSubst4();
|
||||
void testError();
|
||||
void testGroup();
|
||||
|
||||
void setUp();
|
||||
void tearDown();
|
||||
|
Loading…
x
Reference in New Issue
Block a user