mirror of
https://github.com/pocoproject/poco.git
synced 2025-01-19 00:46:03 +01:00
* update(pcre2): Version 10.44 (#4478) * update(pcre2): Add new file to VS project files (version 10.44) (#4478) * fix(RegEx): Use generic pcre2 function names (without suffix _8). * update(pcre2): Fix configuration (define PCRE2_STATIC) (#4478)
This commit is contained in:
parent
04fe04e3a4
commit
f3975eba96
@ -42,6 +42,7 @@ else()
|
||||
POCO_SOURCES(SRCS pcre2
|
||||
src/pcre2_auto_possess.c
|
||||
src/pcre2_chartables.c
|
||||
src/pcre2_chkdint.c
|
||||
src/pcre2_compile.c
|
||||
src/pcre2_config.c
|
||||
src/pcre2_context.c
|
||||
|
@ -1047,6 +1047,7 @@
|
||||
<ClCompile Include="src\PatternFormatter.cpp" />
|
||||
<ClCompile Include="src\pcre2_auto_possess.c" />
|
||||
<ClCompile Include="src\pcre2_chartables.c" />
|
||||
<ClCompile Include="src\pcre2_chkdint.c" />
|
||||
<ClCompile Include="src\pcre2_compile.c" />
|
||||
<ClCompile Include="src\pcre2_config.c" />
|
||||
<ClCompile Include="src\pcre2_context.c" />
|
||||
|
@ -846,6 +846,9 @@
|
||||
<ClCompile Include="src\pcre2_chartables.c">
|
||||
<Filter>RegularExpression\PCRE2 Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="src\pcre2_chkdint.c">
|
||||
<Filter>RegularExpression\PCRE2 Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="src\pcre2_compile.c">
|
||||
<Filter>RegularExpression\PCRE2 Source Files</Filter>
|
||||
</ClCompile>
|
||||
|
@ -1509,6 +1509,7 @@
|
||||
<ClCompile Include="src\PatternFormatter.cpp" />
|
||||
<ClCompile Include="src\pcre2_auto_possess.c" />
|
||||
<ClCompile Include="src\pcre2_chartables.c" />
|
||||
<ClCompile Include="src\pcre2_chkdint.c" />
|
||||
<ClCompile Include="src\pcre2_compile.c" />
|
||||
<ClCompile Include="src\pcre2_config.c" />
|
||||
<ClCompile Include="src\pcre2_context.c" />
|
||||
|
@ -846,6 +846,9 @@
|
||||
<ClCompile Include="src\pcre2_chartables.c">
|
||||
<Filter>RegularExpression\PCRE2 Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="src\pcre2_chkdint.c">
|
||||
<Filter>RegularExpression\PCRE2 Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="src\pcre2_compile.c">
|
||||
<Filter>RegularExpression\PCRE2 Source Files</Filter>
|
||||
</ClCompile>
|
||||
|
@ -35,7 +35,7 @@ objects = ArchiveStrategy Ascii ASCIIEncoding AsyncChannel AsyncNotificationCent
|
||||
zlib_objects = adler32 compress crc32 deflate \
|
||||
infback inffast inflate inftrees trees zutil
|
||||
|
||||
pcre_objects = pcre2_auto_possess pcre2_chartables pcre2_compile pcre2_config \
|
||||
pcre_objects = pcre2_auto_possess pcre2_chartables pcre2_chkdint pcre2_compile pcre2_config \
|
||||
pcre2_context pcre2_convert pcre2_dfa_match pcre2_error pcre2_extuni \
|
||||
pcre2_find_bracket pcre2_jit_compile pcre2_maketables pcre2_match \
|
||||
pcre2_match_data pcre2_newline pcre2_ord2utf pcre2_pattern_info \
|
||||
|
@ -29,34 +29,34 @@ namespace
|
||||
class MatchData
|
||||
{
|
||||
public:
|
||||
MatchData(pcre2_code_8* code):
|
||||
_match(pcre2_match_data_create_from_pattern_8(reinterpret_cast<pcre2_code_8*>(code), nullptr))
|
||||
MatchData(pcre2_code* code):
|
||||
_match(pcre2_match_data_create_from_pattern(reinterpret_cast<pcre2_code*>(code), nullptr))
|
||||
{
|
||||
if (!_match) throw Poco::RegularExpressionException("cannot create match data");
|
||||
}
|
||||
|
||||
~MatchData()
|
||||
{
|
||||
if (_match) pcre2_match_data_free_8(_match);
|
||||
if (_match) pcre2_match_data_free(_match);
|
||||
}
|
||||
|
||||
std::uint32_t count() const
|
||||
{
|
||||
return pcre2_get_ovector_count_8(_match);
|
||||
return pcre2_get_ovector_count(_match);
|
||||
}
|
||||
|
||||
const PCRE2_SIZE* data() const
|
||||
{
|
||||
return pcre2_get_ovector_pointer_8(_match);
|
||||
return pcre2_get_ovector_pointer(_match);
|
||||
}
|
||||
|
||||
operator pcre2_match_data_8*()
|
||||
operator pcre2_match_data*()
|
||||
{
|
||||
return _match;
|
||||
}
|
||||
|
||||
private:
|
||||
pcre2_match_data_8* _match;
|
||||
pcre2_match_data* _match;
|
||||
};
|
||||
}
|
||||
|
||||
@ -72,40 +72,40 @@ RegularExpression::RegularExpression(const std::string& pattern, int options, bo
|
||||
unsigned nameEntrySize;
|
||||
unsigned char* nameTable;
|
||||
|
||||
pcre2_compile_context_8* context = pcre2_compile_context_create_8(nullptr);
|
||||
pcre2_compile_context* context = pcre2_compile_context_create(nullptr);
|
||||
if (!context) throw Poco::RegularExpressionException("cannot create compile context");
|
||||
|
||||
if (options & RE_NEWLINE_LF)
|
||||
pcre2_set_newline_8(context, PCRE2_NEWLINE_LF);
|
||||
pcre2_set_newline(context, PCRE2_NEWLINE_LF);
|
||||
else if (options & RE_NEWLINE_CRLF)
|
||||
pcre2_set_newline_8(context, PCRE2_NEWLINE_CRLF);
|
||||
pcre2_set_newline(context, PCRE2_NEWLINE_CRLF);
|
||||
else if (options & RE_NEWLINE_ANY)
|
||||
pcre2_set_newline_8(context, PCRE2_NEWLINE_ANY);
|
||||
pcre2_set_newline(context, PCRE2_NEWLINE_ANY);
|
||||
else if (options & RE_NEWLINE_ANYCRLF)
|
||||
pcre2_set_newline_8(context, PCRE2_NEWLINE_ANYCRLF);
|
||||
pcre2_set_newline(context, PCRE2_NEWLINE_ANYCRLF);
|
||||
else // default RE_NEWLINE_CR
|
||||
pcre2_set_newline_8(context, PCRE2_NEWLINE_CR);
|
||||
pcre2_set_newline(context, PCRE2_NEWLINE_CR);
|
||||
|
||||
_pcre = pcre2_compile_8(reinterpret_cast<const PCRE2_SPTR>(pattern.c_str()), pattern.length(), compileOptions(options), &errorCode, &errorOffset, context);
|
||||
pcre2_compile_context_free_8(context);
|
||||
_pcre = pcre2_compile(reinterpret_cast<const PCRE2_SPTR>(pattern.c_str()), pattern.length(), compileOptions(options), &errorCode, &errorOffset, context);
|
||||
pcre2_compile_context_free(context);
|
||||
|
||||
if (!_pcre)
|
||||
{
|
||||
PCRE2_UCHAR buffer[256];
|
||||
pcre2_get_error_message_8(errorCode, buffer, sizeof(buffer));
|
||||
pcre2_get_error_message(errorCode, buffer, sizeof(buffer));
|
||||
std::ostringstream msg;
|
||||
msg << reinterpret_cast<char*>(buffer) << " (at offset " << errorOffset << ")";
|
||||
throw RegularExpressionException(msg.str());
|
||||
}
|
||||
|
||||
pcre2_pattern_info_8(reinterpret_cast<pcre2_code_8*>(_pcre), PCRE2_INFO_NAMECOUNT, &nameCount);
|
||||
pcre2_pattern_info_8(reinterpret_cast<pcre2_code_8*>(_pcre), PCRE2_INFO_NAMEENTRYSIZE, &nameEntrySize);
|
||||
pcre2_pattern_info_8(reinterpret_cast<pcre2_code_8*>(_pcre), PCRE2_INFO_NAMETABLE, &nameTable);
|
||||
pcre2_pattern_info(reinterpret_cast<pcre2_code*>(_pcre), PCRE2_INFO_NAMECOUNT, &nameCount);
|
||||
pcre2_pattern_info(reinterpret_cast<pcre2_code*>(_pcre), PCRE2_INFO_NAMEENTRYSIZE, &nameEntrySize);
|
||||
pcre2_pattern_info(reinterpret_cast<pcre2_code*>(_pcre), PCRE2_INFO_NAMETABLE, &nameTable);
|
||||
|
||||
for (int i = 0; i < nameCount; i++)
|
||||
{
|
||||
unsigned char* group = nameTable + 2 + (nameEntrySize * i);
|
||||
int n = pcre2_substring_number_from_name_8(reinterpret_cast<pcre2_code_8*>(_pcre), group);
|
||||
int n = pcre2_substring_number_from_name(reinterpret_cast<pcre2_code*>(_pcre), group);
|
||||
_groups[n] = std::string(reinterpret_cast<char*>(group));
|
||||
}
|
||||
}
|
||||
@ -113,7 +113,7 @@ RegularExpression::RegularExpression(const std::string& pattern, int options, bo
|
||||
|
||||
RegularExpression::~RegularExpression()
|
||||
{
|
||||
if (_pcre) pcre2_code_free_8(reinterpret_cast<pcre2_code_8*>(_pcre));
|
||||
if (_pcre) pcre2_code_free(reinterpret_cast<pcre2_code*>(_pcre));
|
||||
}
|
||||
|
||||
|
||||
@ -121,8 +121,8 @@ int RegularExpression::match(const std::string& subject, std::string::size_type
|
||||
{
|
||||
poco_assert (offset <= subject.length());
|
||||
|
||||
MatchData matchData(reinterpret_cast<pcre2_code_8*>(_pcre));
|
||||
int rc = pcre2_match_8(reinterpret_cast<pcre2_code_8*>(_pcre), reinterpret_cast<const PCRE2_SPTR>(subject.c_str()), subject.size(), offset, matchOptions(options), matchData, nullptr);
|
||||
MatchData matchData(reinterpret_cast<pcre2_code*>(_pcre));
|
||||
int rc = pcre2_match(reinterpret_cast<pcre2_code*>(_pcre), reinterpret_cast<const PCRE2_SPTR>(subject.c_str()), subject.size(), offset, matchOptions(options), matchData, nullptr);
|
||||
if (rc == PCRE2_ERROR_NOMATCH)
|
||||
{
|
||||
mtch.offset = std::string::npos;
|
||||
@ -140,7 +140,7 @@ int RegularExpression::match(const std::string& subject, std::string::size_type
|
||||
else if (rc < 0)
|
||||
{
|
||||
PCRE2_UCHAR buffer[256];
|
||||
pcre2_get_error_message_8(rc, buffer, sizeof(buffer));
|
||||
pcre2_get_error_message(rc, buffer, sizeof(buffer));
|
||||
throw RegularExpressionException(std::string(reinterpret_cast<char*>(buffer)));
|
||||
}
|
||||
const PCRE2_SIZE* ovec = matchData.data();
|
||||
@ -156,8 +156,8 @@ int RegularExpression::match(const std::string& subject, std::string::size_type
|
||||
|
||||
matches.clear();
|
||||
|
||||
MatchData matchData(reinterpret_cast<pcre2_code_8*>(_pcre));
|
||||
int rc = pcre2_match_8(reinterpret_cast<pcre2_code_8*>(_pcre), reinterpret_cast<const PCRE2_SPTR>(subject.c_str()), subject.size(), offset, options & 0xFFFF, matchData, nullptr);
|
||||
MatchData matchData(reinterpret_cast<pcre2_code*>(_pcre));
|
||||
int rc = pcre2_match(reinterpret_cast<pcre2_code*>(_pcre), reinterpret_cast<const PCRE2_SPTR>(subject.c_str()), subject.size(), offset, options & 0xFFFF, matchData, nullptr);
|
||||
if (rc == PCRE2_ERROR_NOMATCH)
|
||||
{
|
||||
return 0;
|
||||
@ -173,7 +173,7 @@ int RegularExpression::match(const std::string& subject, std::string::size_type
|
||||
else if (rc < 0)
|
||||
{
|
||||
PCRE2_UCHAR buffer[256];
|
||||
pcre2_get_error_message_8(rc, buffer, sizeof(buffer));
|
||||
pcre2_get_error_message(rc, buffer, sizeof(buffer));
|
||||
throw RegularExpressionException(std::string(reinterpret_cast<char*>(buffer)));
|
||||
}
|
||||
matches.reserve(rc);
|
||||
@ -279,8 +279,8 @@ std::string::size_type RegularExpression::substOne(std::string& subject, std::st
|
||||
{
|
||||
if (offset >= subject.length()) return std::string::npos;
|
||||
|
||||
MatchData matchData(reinterpret_cast<pcre2_code_8*>(_pcre));
|
||||
int rc = pcre2_match_8(reinterpret_cast<pcre2_code_8*>(_pcre), reinterpret_cast<const PCRE2_SPTR>(subject.c_str()), subject.size(), offset, matchOptions(options), matchData, nullptr);
|
||||
MatchData matchData(reinterpret_cast<pcre2_code*>(_pcre));
|
||||
int rc = pcre2_match(reinterpret_cast<pcre2_code*>(_pcre), reinterpret_cast<const PCRE2_SPTR>(subject.c_str()), subject.size(), offset, matchOptions(options), matchData, nullptr);
|
||||
if (rc == PCRE2_ERROR_NOMATCH)
|
||||
{
|
||||
return std::string::npos;
|
||||
@ -296,7 +296,7 @@ std::string::size_type RegularExpression::substOne(std::string& subject, std::st
|
||||
else if (rc < 0)
|
||||
{
|
||||
PCRE2_UCHAR buffer[256];
|
||||
pcre2_get_error_message_8(rc, buffer, sizeof(buffer));
|
||||
pcre2_get_error_message(rc, buffer, sizeof(buffer));
|
||||
throw RegularExpressionException(std::string(reinterpret_cast<char*>(buffer)));
|
||||
}
|
||||
const PCRE2_SIZE* ovec = matchData.data();
|
||||
|
@ -29,7 +29,7 @@ void Unicode::properties(int ch, CharacterProperties& props)
|
||||
{
|
||||
if (ch > UCP_MAX_CODEPOINT) ch = 0;
|
||||
const ucd_record* ucd = GET_UCD(ch);
|
||||
props.category = static_cast<CharacterCategory>(PRIV(ucp_gentype_8)[ucd->chartype]);
|
||||
props.category = static_cast<CharacterCategory>(PRIV(ucp_gentype)[ucd->chartype]);
|
||||
props.type = static_cast<CharacterType>(ucd->chartype);
|
||||
props.script = static_cast<Script>(ucd->script);
|
||||
}
|
||||
|
@ -5,7 +5,7 @@
|
||||
/* This is the public header file for the PCRE library, second API, to be
|
||||
#included by applications that call PCRE2 functions.
|
||||
|
||||
Copyright (c) 2016-2021 University of Cambridge
|
||||
Copyright (c) 2016-2024 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
/* The current PCRE version information. */
|
||||
|
||||
#define PCRE2_MAJOR 10
|
||||
#define PCRE2_MINOR 42
|
||||
#define PCRE2_MINOR 44
|
||||
#define PCRE2_PRERELEASE
|
||||
#define PCRE2_DATE 2022-12-11
|
||||
#define PCRE2_DATE 2024-06-07
|
||||
|
||||
/* When an application links to a PCRE DLL in Windows, the symbols that are
|
||||
imported have to be identified as such. When building PCRE2, the appropriate
|
||||
@ -153,6 +153,12 @@ D is inspected during pcre2_dfa_match() execution
|
||||
#define PCRE2_EXTRA_ESCAPED_CR_IS_LF 0x00000010u /* C */
|
||||
#define PCRE2_EXTRA_ALT_BSUX 0x00000020u /* C */
|
||||
#define PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK 0x00000040u /* C */
|
||||
#define PCRE2_EXTRA_CASELESS_RESTRICT 0x00000080u /* C */
|
||||
#define PCRE2_EXTRA_ASCII_BSD 0x00000100u /* C */
|
||||
#define PCRE2_EXTRA_ASCII_BSS 0x00000200u /* C */
|
||||
#define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */
|
||||
#define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */
|
||||
#define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */
|
||||
|
||||
/* These are for pcre2_jit_compile(). */
|
||||
|
||||
@ -180,11 +186,12 @@ pcre2_jit_match() ignores the latter since it bypasses all sanity checks). */
|
||||
#define PCRE2_SUBSTITUTE_UNSET_EMPTY 0x00000400u /* pcre2_substitute() only */
|
||||
#define PCRE2_SUBSTITUTE_UNKNOWN_UNSET 0x00000800u /* pcre2_substitute() only */
|
||||
#define PCRE2_SUBSTITUTE_OVERFLOW_LENGTH 0x00001000u /* pcre2_substitute() only */
|
||||
#define PCRE2_NO_JIT 0x00002000u /* Not for pcre2_dfa_match() */
|
||||
#define PCRE2_NO_JIT 0x00002000u /* not for pcre2_dfa_match() */
|
||||
#define PCRE2_COPY_MATCHED_SUBJECT 0x00004000u
|
||||
#define PCRE2_SUBSTITUTE_LITERAL 0x00008000u /* pcre2_substitute() only */
|
||||
#define PCRE2_SUBSTITUTE_MATCHED 0x00010000u /* pcre2_substitute() only */
|
||||
#define PCRE2_SUBSTITUTE_REPLACEMENT_ONLY 0x00020000u /* pcre2_substitute() only */
|
||||
#define PCRE2_DISABLE_RECURSELOOP_CHECK 0x00040000u /* not for pcre2_dfa_match() or pcre2_jit_match() */
|
||||
|
||||
/* Options for pcre2_pattern_convert(). */
|
||||
|
||||
@ -399,6 +406,7 @@ released, the numbers must not be changed. */
|
||||
#define PCRE2_ERROR_CONVERT_SYNTAX (-64)
|
||||
#define PCRE2_ERROR_INTERNAL_DUPMATCH (-65)
|
||||
#define PCRE2_ERROR_DFA_UINVALID_UTF (-66)
|
||||
#define PCRE2_ERROR_INVALIDOFFSET (-67)
|
||||
|
||||
|
||||
/* Request types for pcre2_pattern_info() */
|
||||
@ -575,7 +583,7 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION pcre2_config(uint32_t, void *);
|
||||
PCRE2_EXP_DECL pcre2_general_context *PCRE2_CALL_CONVENTION \
|
||||
pcre2_general_context_copy(pcre2_general_context *); \
|
||||
PCRE2_EXP_DECL pcre2_general_context *PCRE2_CALL_CONVENTION \
|
||||
pcre2_general_context_create(void *(*)(PCRE2_SIZE, void *), \
|
||||
pcre2_general_context_create(void *(*)(size_t, void *), \
|
||||
void (*)(void *, void *), void *); \
|
||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_general_context_free(pcre2_general_context *);
|
||||
@ -595,6 +603,10 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_compile_extra_options(pcre2_compile_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_max_pattern_length(pcre2_compile_context *, PCRE2_SIZE); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_max_pattern_compiled_length(pcre2_compile_context *, PCRE2_SIZE); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_max_varlookbehind(pcre2_compile_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_newline(pcre2_compile_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
@ -628,7 +640,7 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_recursion_limit(pcre2_match_context *, uint32_t); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_set_recursion_memory_management(pcre2_match_context *, \
|
||||
void *(*)(PCRE2_SIZE, void *), void (*)(void *, void *), void *);
|
||||
void *(*)(size_t, void *), void (*)(void *, void *), void *);
|
||||
|
||||
#define PCRE2_CONVERT_CONTEXT_FUNCTIONS \
|
||||
PCRE2_EXP_DECL pcre2_convert_context *PCRE2_CALL_CONVENTION \
|
||||
@ -687,6 +699,8 @@ PCRE2_EXP_DECL PCRE2_SPTR PCRE2_CALL_CONVENTION \
|
||||
pcre2_get_mark(pcre2_match_data *); \
|
||||
PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \
|
||||
pcre2_get_match_data_size(pcre2_match_data *); \
|
||||
PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \
|
||||
pcre2_get_match_data_heapframes_size(pcre2_match_data *); \
|
||||
PCRE2_EXP_DECL uint32_t PCRE2_CALL_CONVENTION \
|
||||
pcre2_get_ovector_count(pcre2_match_data *); \
|
||||
PCRE2_EXP_DECL PCRE2_SIZE *PCRE2_CALL_CONVENTION \
|
||||
@ -722,7 +736,7 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_substring_number_from_name(const pcre2_code *, PCRE2_SPTR); \
|
||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_substring_list_free(PCRE2_SPTR *); \
|
||||
pcre2_substring_list_free(PCRE2_UCHAR **); \
|
||||
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
pcre2_substring_list_get(pcre2_match_data *, PCRE2_UCHAR ***, PCRE2_SIZE **);
|
||||
|
||||
@ -771,7 +785,7 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
|
||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_jit_free_unused_memory(pcre2_general_context *); \
|
||||
PCRE2_EXP_DECL pcre2_jit_stack *PCRE2_CALL_CONVENTION \
|
||||
pcre2_jit_stack_create(PCRE2_SIZE, PCRE2_SIZE, pcre2_general_context *); \
|
||||
pcre2_jit_stack_create(size_t, size_t, pcre2_general_context *); \
|
||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
pcre2_jit_stack_assign(pcre2_match_context *, pcre2_jit_callback, void *); \
|
||||
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
|
||||
@ -851,6 +865,7 @@ pcre2_compile are called by application code. */
|
||||
#define pcre2_general_context_free PCRE2_SUFFIX(pcre2_general_context_free_)
|
||||
#define pcre2_get_error_message PCRE2_SUFFIX(pcre2_get_error_message_)
|
||||
#define pcre2_get_mark PCRE2_SUFFIX(pcre2_get_mark_)
|
||||
#define pcre2_get_match_data_heapframes_size PCRE2_SUFFIX(pcre2_get_match_data_heapframes_size_)
|
||||
#define pcre2_get_match_data_size PCRE2_SUFFIX(pcre2_get_match_data_size_)
|
||||
#define pcre2_get_ovector_pointer PCRE2_SUFFIX(pcre2_get_ovector_pointer_)
|
||||
#define pcre2_get_ovector_count PCRE2_SUFFIX(pcre2_get_ovector_count_)
|
||||
@ -886,7 +901,9 @@ pcre2_compile are called by application code. */
|
||||
#define pcre2_set_glob_separator PCRE2_SUFFIX(pcre2_set_glob_separator_)
|
||||
#define pcre2_set_heap_limit PCRE2_SUFFIX(pcre2_set_heap_limit_)
|
||||
#define pcre2_set_match_limit PCRE2_SUFFIX(pcre2_set_match_limit_)
|
||||
#define pcre2_set_max_varlookbehind PCRE2_SUFFIX(pcre2_set_max_varlookbehind_)
|
||||
#define pcre2_set_max_pattern_length PCRE2_SUFFIX(pcre2_set_max_pattern_length_)
|
||||
#define pcre2_set_max_pattern_compiled_length PCRE2_SUFFIX(pcre2_set_max_pattern_compiled_length_)
|
||||
#define pcre2_set_newline PCRE2_SUFFIX(pcre2_set_newline_)
|
||||
#define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_)
|
||||
#define pcre2_set_offset_limit PCRE2_SUFFIX(pcre2_set_offset_limit_)
|
||||
|
@ -556,6 +556,8 @@ matches to an empty string (also represented by a non-zero value). */
|
||||
|
||||
for(;;)
|
||||
{
|
||||
PCRE2_SPTR bracode;
|
||||
|
||||
/* All operations move the code pointer forward.
|
||||
Therefore infinite recursions are not possible. */
|
||||
|
||||
@ -613,7 +615,8 @@ for(;;)
|
||||
recursions. (This could be improved by keeping a list of group numbers that
|
||||
are called by recursion.) */
|
||||
|
||||
switch(*(code - GET(code, 1)))
|
||||
bracode = code - GET(code, 1);
|
||||
switch(*bracode)
|
||||
{
|
||||
case OP_CBRA:
|
||||
case OP_SCBRA:
|
||||
@ -632,16 +635,19 @@ for(;;)
|
||||
break;
|
||||
|
||||
/* Atomic sub-patterns and assertions can always auto-possessify their
|
||||
last iterator. However, if the group was entered as a result of checking
|
||||
a previous iterator, this is not possible. */
|
||||
last iterator except for variable length lookbehinds. However, if the
|
||||
group was entered as a result of checking a previous iterator, this is
|
||||
not possible. */
|
||||
|
||||
case OP_ASSERT:
|
||||
case OP_ASSERT_NOT:
|
||||
case OP_ASSERTBACK:
|
||||
case OP_ASSERTBACK_NOT:
|
||||
case OP_ONCE:
|
||||
return !entered_a_group;
|
||||
|
||||
case OP_ASSERTBACK:
|
||||
case OP_ASSERTBACK_NOT:
|
||||
return (bracode[1+LINK_SIZE] == OP_VREVERSE)? FALSE : !entered_a_group;
|
||||
|
||||
/* Non-atomic assertions - don't possessify last iterator. This needs
|
||||
more thought. */
|
||||
|
||||
|
@ -5,7 +5,8 @@
|
||||
/* This file was automatically written by the pcre2_dftables auxiliary
|
||||
program. It contains character tables that are used when no external
|
||||
tables are passed to PCRE2 by the application that calls it. The tables
|
||||
are used only for characters whose code values are less than 256. */
|
||||
are used only for characters whose code values are less than 256, and
|
||||
only relevant if not in UCP mode. */
|
||||
|
||||
/* This set of tables was written in the C locale. */
|
||||
|
||||
@ -160,7 +161,7 @@ graph, print, punct, and cntrl. Other classes are built from combinations. */
|
||||
0x02 letter
|
||||
0x04 lower case letter
|
||||
0x08 decimal digit
|
||||
0x10 alphanumeric or '_'
|
||||
0x10 word (alphanumeric or '_')
|
||||
*/
|
||||
|
||||
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
|
||||
|
93
Foundation/src/pcre2_chkdint.c
Normal file
93
Foundation/src/pcre2_chkdint.c
Normal file
@ -0,0 +1,93 @@
|
||||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 2023 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/* This file contains functions that implement checked integer operations. */
|
||||
|
||||
#ifndef PCRE2_PCRE2TEST
|
||||
#include "pcre2_config.h"
|
||||
#include "pcre2_internal.h"
|
||||
#endif
|
||||
|
||||
/*************************************************
|
||||
* Checked Integer Multiplication *
|
||||
*************************************************/
|
||||
|
||||
/*
|
||||
Arguments:
|
||||
r A pointer to PCRE2_SIZE to store the answer
|
||||
a, b Two integers
|
||||
|
||||
Returns: TRUE if the multiplication overflows, FALSE otherwise
|
||||
|
||||
It is modeled after C23's <stdckdint.h> interface
|
||||
The INT64_OR_DOUBLE type is a 64-bit integer type when available,
|
||||
otherwise double. */
|
||||
|
||||
/* Multiplies a by b and stores the product in *r, returning FALSE. Returns
TRUE, without writing *r, when an overflow is detected. */
BOOL
|
||||
PRIV(ckd_smul)(PCRE2_SIZE *r, int a, int b)
|
||||
{
|
||||
#ifdef HAVE_BUILTIN_MUL_OVERFLOW
|
||||
/* Compiler builtin path: the overflow test is made against the type of m,
i.e. PCRE2_SIZE. */
PCRE2_SIZE m;
|
||||
|
||||
if (__builtin_mul_overflow(a, b, &m)) return TRUE;
|
||||
|
||||
*r = m;
|
||||
#else
|
||||
/* Fallback path: form the product in INT64_OR_DOUBLE and range-check it
by hand. */
INT64_OR_DOUBLE m;
|
||||
|
||||
/* Negative operands are rejected only when PCRE2_DEBUG is defined.
NOTE(review): presumably callers always pass non-negative values - confirm. */
#ifdef PCRE2_DEBUG
|
||||
if (a < 0 || b < 0) abort();
|
||||
#endif
|
||||
|
||||
m = (INT64_OR_DOUBLE)a * (INT64_OR_DOUBLE)b;
|
||||
|
||||
/* Taken when INT64_MAX or int64_t is defined as a macro. The comparison
against PCRE2_SIZE_MAX is made only when m is wider than *r. */
#if defined INT64_MAX || defined int64_t
|
||||
if (sizeof(m) > sizeof(*r) && m > (INT64_OR_DOUBLE)PCRE2_SIZE_MAX) return TRUE;
|
||||
*r = (PCRE2_SIZE)m;
|
||||
#else
|
||||
/* Otherwise m is compared directly against PCRE2_SIZE_MAX. */
if (m > PCRE2_SIZE_MAX) return TRUE;
|
||||
*r = m;
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
/* Reaching here means no overflow was detected and *r holds the product. */
return FALSE;
|
||||
}
|
||||
|
||||
/* End of pcre2_chkdint.c */
|
File diff suppressed because it is too large
Load Diff
@ -57,9 +57,12 @@ sure both macros are undefined; an emulation function will then be used. */
|
||||
/* Define this if your compiler supports __attribute__((uninitialized)) */
|
||||
/* #undef HAVE_ATTRIBUTE_UNINITIALIZED */
|
||||
|
||||
/* Define to 1 if you have the `bcopy' function. */
|
||||
/* Define to 1 if you have the 'bcopy' function. */
|
||||
/* #undef HAVE_BCOPY */
|
||||
|
||||
/* Define this if your compiler provides __builtin_mul_overflow() */
|
||||
/* #undef HAVE_BUILTIN_MUL_OVERFLOW */
|
||||
|
||||
/* Define to 1 if you have the <bzlib.h> header file. */
|
||||
/* #undef HAVE_BZLIB_H */
|
||||
|
||||
@ -81,17 +84,17 @@ sure both macros are undefined; an emulation function will then be used. */
|
||||
/* Define to 1 if you have the <limits.h> header file. */
|
||||
/* #undef HAVE_LIMITS_H */
|
||||
|
||||
/* Define to 1 if you have the `memfd_create' function. */
|
||||
/* Define to 1 if you have the 'memfd_create' function. */
|
||||
/* #undef HAVE_MEMFD_CREATE */
|
||||
|
||||
/* Define to 1 if you have the `memmove' function. */
|
||||
/* Define to 1 if you have the 'memmove' function. */
|
||||
/* #undef HAVE_MEMMOVE */
|
||||
#define HAVE_MEMMOVE 1
|
||||
|
||||
/* Define to 1 if you have the <minix/config.h> header file. */
|
||||
/* #undef HAVE_MINIX_CONFIG_H */
|
||||
|
||||
/* Define to 1 if you have the `mkostemp' function. */
|
||||
/* Define to 1 if you have the 'mkostemp' function. */
|
||||
/* #undef HAVE_MKOSTEMP */
|
||||
|
||||
/* Define if you have POSIX threads libraries and header files. */
|
||||
@ -112,7 +115,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
||||
/* Define to 1 if you have the `realpath' function. */
|
||||
/* #undef HAVE_REALPATH */
|
||||
|
||||
/* Define to 1 if you have the `secure_getenv' function. */
|
||||
/* Define to 1 if you have the 'secure_getenv' function. */
|
||||
/* #undef HAVE_SECURE_GETENV */
|
||||
|
||||
/* Define to 1 if you have the <stdint.h> header file. */
|
||||
@ -124,7 +127,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
||||
/* Define to 1 if you have the <stdlib.h> header file. */
|
||||
/* #undef HAVE_STDLIB_H */
|
||||
|
||||
/* Define to 1 if you have the `strerror' function. */
|
||||
/* Define to 1 if you have the 'strerror' function. */
|
||||
/* #undef HAVE_STRERROR */
|
||||
|
||||
/* Define to 1 if you have the <strings.h> header file. */
|
||||
@ -184,7 +187,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
||||
matching attempt. The value is also used to limit a loop counter in
|
||||
pcre2_dfa_match(). There is a runtime interface for setting a different
|
||||
limit. The limit exists in order to catch runaway regular expressions that
|
||||
take for ever to determine that they do not match. The default is set very
|
||||
take forever to determine that they do not match. The default is set very
|
||||
large so that it does not accidentally catch legitimate cases. */
|
||||
#ifndef MATCH_LIMIT
|
||||
#define MATCH_LIMIT 10000000
|
||||
@ -215,7 +218,13 @@ sure both macros are undefined; an emulation function will then be used. */
|
||||
Care must be taken if it is increased, because it guards against integer
|
||||
overflow caused by enormously large patterns. */
|
||||
#ifndef MAX_NAME_SIZE
|
||||
#define MAX_NAME_SIZE 32
|
||||
#define MAX_NAME_SIZE 128
|
||||
#endif
|
||||
|
||||
/* The value of MAX_VARLOOKBEHIND specifies the default maximum length, in
|
||||
characters, for a variable-length lookbehind assertion. */
|
||||
#ifndef MAX_VARLOOKBEHIND
|
||||
#define MAX_VARLOOKBEHIND 255
|
||||
#endif
|
||||
|
||||
/* Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns. */
|
||||
@ -239,7 +248,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
||||
#define PACKAGE_NAME "PCRE2"
|
||||
|
||||
/* Define to the full name and version of this package. */
|
||||
#define PACKAGE_STRING "PCRE2 10.42"
|
||||
#define PACKAGE_STRING "PCRE2 10.44"
|
||||
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#define PACKAGE_TARNAME "pcre2"
|
||||
@ -248,7 +257,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
||||
#define PACKAGE_URL ""
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#define PACKAGE_VERSION "10.42"
|
||||
#define PACKAGE_VERSION "10.44"
|
||||
|
||||
/* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested
|
||||
parentheses (of any kind) in a pattern. This limits the amount of system
|
||||
@ -278,12 +287,16 @@ sure both macros are undefined; an emulation function will then be used. */
|
||||
/* Define to any value to include debugging code. */
|
||||
/* #undef PCRE2_DEBUG */
|
||||
|
||||
/* to make a symbol visible */
|
||||
#define PCRE2_EXPORT
|
||||
|
||||
/* If you are compiling for a system other than a Unix-like system or
|
||||
Win32, and it needs some magic to be inserted before the definition
|
||||
of a function that is exported by the library, define this macro to
|
||||
contain the relevant magic. If you do not define this macro, a suitable
|
||||
__declspec value is used for Windows systems; in other environments
|
||||
"extern" is used for a C compiler and "extern C" for a C++ compiler.
|
||||
a compiler relevant "extern" is used with any "visibility" related
|
||||
attributes from PCRE2_EXPORT included.
|
||||
This macro appears at the start of every exported function that is part
|
||||
of the external API. It does not appear on functions that are "external"
|
||||
in the C sense, but which are internal to the library. */
|
||||
@ -304,11 +317,14 @@ sure both macros are undefined; an emulation function will then be used. */
|
||||
unless SUPPORT_JIT is also defined. */
|
||||
/* #undef SLJIT_PROT_EXECUTABLE_ALLOCATOR */
|
||||
|
||||
/* Define to 1 if all of the C90 standard headers exist (not just the ones
|
||||
/* Define to 1 if all of the C89 standard headers exist (not just the ones
|
||||
required in a freestanding environment). This macro is provided for
|
||||
backward compatibility; new code need not use it. */
|
||||
/* #undef STDC_HEADERS */
|
||||
|
||||
/* Define to any value to enable differential fuzzing support. */
|
||||
/* #undef SUPPORT_DIFF_FUZZ */
|
||||
|
||||
/* Define to any value to enable support for Just-In-Time compiling. */
|
||||
/* #undef SUPPORT_JIT */
|
||||
|
||||
@ -357,7 +373,7 @@ sure both macros are undefined; an emulation function will then be used. */
|
||||
/* Define to any value for valgrind support to find invalid memory reads. */
|
||||
/* #undef SUPPORT_VALGRIND */
|
||||
|
||||
/* Enable extensions on AIX 3, Interix. */
|
||||
/* Enable extensions on AIX, Interix, z/OS. */
|
||||
#ifndef _ALL_SOURCE
|
||||
# define _ALL_SOURCE 1
|
||||
#endif
|
||||
@ -418,11 +434,15 @@ sure both macros are undefined; an emulation function will then be used. */
|
||||
#ifndef __STDC_WANT_IEC_60559_DFP_EXT__
|
||||
# define __STDC_WANT_IEC_60559_DFP_EXT__ 1
|
||||
#endif
|
||||
/* Enable extensions specified by C23 Annex F. */
|
||||
#ifndef __STDC_WANT_IEC_60559_EXT__
|
||||
# define __STDC_WANT_IEC_60559_EXT__ 1
|
||||
#endif
|
||||
/* Enable extensions specified by ISO/IEC TS 18661-4:2015. */
|
||||
#ifndef __STDC_WANT_IEC_60559_FUNCS_EXT__
|
||||
# define __STDC_WANT_IEC_60559_FUNCS_EXT__ 1
|
||||
#endif
|
||||
/* Enable extensions specified by ISO/IEC TS 18661-3:2015. */
|
||||
/* Enable extensions specified by C23 Annex H and ISO/IEC TS 18661-3:2015. */
|
||||
#ifndef __STDC_WANT_IEC_60559_TYPES_EXT__
|
||||
# define __STDC_WANT_IEC_60559_TYPES_EXT__ 1
|
||||
#endif
|
||||
@ -445,20 +465,26 @@ sure both macros are undefined; an emulation function will then be used. */
|
||||
#endif
|
||||
|
||||
/* Version number of package */
|
||||
#define VERSION "10.42"
|
||||
#define VERSION "10.44"
|
||||
|
||||
/* Number of bits in a file offset, on hosts where this is settable. */
|
||||
/* #undef _FILE_OFFSET_BITS */
|
||||
|
||||
/* Define for large files, on AIX-style hosts. */
|
||||
/* Define to 1 on platforms where this makes off_t a 64-bit type. */
|
||||
/* #undef _LARGE_FILES */
|
||||
|
||||
/* Define to empty if `const' does not conform to ANSI C. */
|
||||
/* Number of bits in time_t, on hosts where this is settable. */
|
||||
/* #undef _TIME_BITS */
|
||||
|
||||
/* Define to 1 on platforms where this makes time_t a 64-bit type. */
|
||||
/* #undef __MINGW_USE_VC2005_COMPAT */
|
||||
|
||||
/* Define to empty if 'const' does not conform to ANSI C. */
|
||||
/* #undef const */
|
||||
|
||||
/* Define to the type of a signed integer type of width exactly 64 bits if
|
||||
such a type exists and the standard includes do not define it. */
|
||||
/* #undef int64_t */
|
||||
|
||||
/* Define to `unsigned int' if <sys/types.h> does not define. */
|
||||
/* Define as 'unsigned int' if <stddef.h> doesn't define. */
|
||||
/* #undef size_t */
|
||||
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
New API code Copyright (c) 2016-2024 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -133,10 +133,13 @@ const pcre2_compile_context PRIV(default_compile_context) = {
|
||||
NULL, /* Stack guard data */
|
||||
PRIV(default_tables), /* Character tables */
|
||||
PCRE2_UNSET, /* Max pattern length */
|
||||
PCRE2_UNSET, /* Max pattern compiled length */
|
||||
BSR_DEFAULT, /* Backslash R default */
|
||||
NEWLINE_DEFAULT, /* Newline convention */
|
||||
PARENS_NEST_LIMIT, /* As it says */
|
||||
0 }; /* Extra options */
|
||||
0, /* Extra options */
|
||||
MAX_VARLOOKBEHIND /* As it says */
|
||||
};
|
||||
|
||||
/* The create function copies the default into the new memory, but must
|
||||
override the default memory handling functions if a gcontext was provided. */
|
||||
@ -225,49 +228,48 @@ return ccontext;
|
||||
PCRE2_EXP_DEFN pcre2_general_context * PCRE2_CALL_CONVENTION
|
||||
pcre2_general_context_copy(pcre2_general_context *gcontext)
|
||||
{
|
||||
pcre2_general_context *new =
|
||||
pcre2_general_context *newcontext =
|
||||
gcontext->memctl.malloc(sizeof(pcre2_real_general_context),
|
||||
gcontext->memctl.memory_data);
|
||||
if (new == NULL) return NULL;
|
||||
memcpy(new, gcontext, sizeof(pcre2_real_general_context));
|
||||
return new;
|
||||
if (newcontext == NULL) return NULL;
|
||||
memcpy(newcontext, gcontext, sizeof(pcre2_real_general_context));
|
||||
return newcontext;
|
||||
}
|
||||
|
||||
|
||||
PCRE2_EXP_DEFN pcre2_compile_context * PCRE2_CALL_CONVENTION
|
||||
pcre2_compile_context_copy(pcre2_compile_context *ccontext)
|
||||
{
|
||||
pcre2_compile_context *new =
|
||||
pcre2_compile_context *newcontext =
|
||||
ccontext->memctl.malloc(sizeof(pcre2_real_compile_context),
|
||||
ccontext->memctl.memory_data);
|
||||
if (new == NULL) return NULL;
|
||||
memcpy(new, ccontext, sizeof(pcre2_real_compile_context));
|
||||
return new;
|
||||
if (newcontext == NULL) return NULL;
|
||||
memcpy(newcontext, ccontext, sizeof(pcre2_real_compile_context));
|
||||
return newcontext;
|
||||
}
|
||||
|
||||
|
||||
PCRE2_EXP_DEFN pcre2_match_context * PCRE2_CALL_CONVENTION
|
||||
pcre2_match_context_copy(pcre2_match_context *mcontext)
|
||||
{
|
||||
pcre2_match_context *new =
|
||||
pcre2_match_context *newcontext =
|
||||
mcontext->memctl.malloc(sizeof(pcre2_real_match_context),
|
||||
mcontext->memctl.memory_data);
|
||||
if (new == NULL) return NULL;
|
||||
memcpy(new, mcontext, sizeof(pcre2_real_match_context));
|
||||
return new;
|
||||
if (newcontext == NULL) return NULL;
|
||||
memcpy(newcontext, mcontext, sizeof(pcre2_real_match_context));
|
||||
return newcontext;
|
||||
}
|
||||
|
||||
|
||||
|
||||
PCRE2_EXP_DEFN pcre2_convert_context * PCRE2_CALL_CONVENTION
|
||||
pcre2_convert_context_copy(pcre2_convert_context *ccontext)
|
||||
{
|
||||
pcre2_convert_context *new =
|
||||
pcre2_convert_context *newcontext =
|
||||
ccontext->memctl.malloc(sizeof(pcre2_real_convert_context),
|
||||
ccontext->memctl.memory_data);
|
||||
if (new == NULL) return NULL;
|
||||
memcpy(new, ccontext, sizeof(pcre2_real_convert_context));
|
||||
return new;
|
||||
if (newcontext == NULL) return NULL;
|
||||
memcpy(newcontext, ccontext, sizeof(pcre2_real_convert_context));
|
||||
return newcontext;
|
||||
}
|
||||
|
||||
|
||||
@ -348,6 +350,13 @@ ccontext->max_pattern_length = length;
|
||||
return 0;
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_max_pattern_compiled_length(pcre2_compile_context *ccontext, PCRE2_SIZE length)
|
||||
{
|
||||
ccontext->max_pattern_compiled_length = length;
|
||||
return 0;
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_newline(pcre2_compile_context *ccontext, uint32_t newline)
|
||||
{
|
||||
@ -367,6 +376,13 @@ switch(newline)
|
||||
}
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_max_varlookbehind(pcre2_compile_context *ccontext, uint32_t limit)
|
||||
{
|
||||
ccontext->max_varlookbehind = limit;
|
||||
return 0;
|
||||
}
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext, uint32_t limit)
|
||||
{
|
||||
|
@ -537,6 +537,14 @@ Returns: !0 => character is found in the class
|
||||
static BOOL
|
||||
convert_glob_char_in_class(int class_index, PCRE2_UCHAR c)
|
||||
{
|
||||
#if PCRE2_CODE_UNIT_WIDTH != 8
|
||||
if (c > 0xff)
|
||||
{
|
||||
/* ctype functions are not sane for c > 0xff */
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
switch (class_index)
|
||||
{
|
||||
case 1: return isalnum(c);
|
||||
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
New API code Copyright (c) 2016-2023 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -166,7 +166,7 @@ static const uint8_t coptable[] = {
|
||||
0, /* KetRmax */
|
||||
0, /* KetRmin */
|
||||
0, /* KetRpos */
|
||||
0, /* Reverse */
|
||||
0, 0, /* Reverse, Vreverse */
|
||||
0, /* Assert */
|
||||
0, /* Assert not */
|
||||
0, /* Assert behind */
|
||||
@ -185,7 +185,8 @@ static const uint8_t coptable[] = {
|
||||
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
|
||||
0, 0, /* COMMIT, COMMIT_ARG */
|
||||
0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
|
||||
0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
|
||||
0, 0, 0, /* CLOSE, SKIPZERO, DEFINE */
|
||||
0, 0 /* \B and \b in UCP mode */
|
||||
};
|
||||
|
||||
/* This table identifies those opcodes that inspect a character. It is used to
|
||||
@ -243,7 +244,7 @@ static const uint8_t poptable[] = {
|
||||
0, /* KetRmax */
|
||||
0, /* KetRmin */
|
||||
0, /* KetRpos */
|
||||
0, /* Reverse */
|
||||
0, 0, /* Reverse, Vreverse */
|
||||
0, /* Assert */
|
||||
0, /* Assert not */
|
||||
0, /* Assert behind */
|
||||
@ -262,7 +263,8 @@ static const uint8_t poptable[] = {
|
||||
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
|
||||
0, 0, /* COMMIT, COMMIT_ARG */
|
||||
0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
|
||||
0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
|
||||
0, 0, 0, /* CLOSE, SKIPZERO, DEFINE */
|
||||
1, 1 /* \B and \b in UCP mode */
|
||||
};
|
||||
|
||||
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
|
||||
@ -424,7 +426,7 @@ overflow. */
|
||||
|
||||
else
|
||||
{
|
||||
uint32_t newsize = (rws->size >= UINT32_MAX/2)? UINT32_MAX/2 : rws->size * 2;
|
||||
uint32_t newsize = (rws->size >= UINT32_MAX/(sizeof(int)*2))? UINT32_MAX/sizeof(int) : rws->size * 2;
|
||||
uint32_t newsizeK = newsize/(1024/sizeof(int));
|
||||
|
||||
if (newsizeK + mb->heap_used > mb->heap_limit)
|
||||
@ -587,7 +589,7 @@ if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
|
||||
end_code = this_start_code;
|
||||
do
|
||||
{
|
||||
size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
|
||||
size_t back = (size_t)GET2(end_code, 2+LINK_SIZE);
|
||||
if (back > max_back) max_back = back;
|
||||
end_code += GET(end_code, 1);
|
||||
}
|
||||
@ -631,8 +633,8 @@ if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
|
||||
end_code = this_start_code;
|
||||
do
|
||||
{
|
||||
uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0;
|
||||
size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE);
|
||||
uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + IMM2_SIZE : 0;
|
||||
size_t back = (revlen == 0)? 0 : (size_t)GET2(end_code, 2+LINK_SIZE);
|
||||
if (back <= gone_back)
|
||||
{
|
||||
int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
|
||||
@ -1098,6 +1100,8 @@ for (;;)
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_WORD_BOUNDARY:
|
||||
case OP_NOT_WORD_BOUNDARY:
|
||||
case OP_NOT_UCP_WORD_BOUNDARY:
|
||||
case OP_UCP_WORD_BOUNDARY:
|
||||
{
|
||||
int left_word, right_word;
|
||||
|
||||
@ -1110,13 +1114,13 @@ for (;;)
|
||||
#endif
|
||||
GETCHARTEST(d, temp);
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if ((mb->poptions & PCRE2_UCP) != 0)
|
||||
if (codevalue == OP_UCP_WORD_BOUNDARY ||
|
||||
codevalue == OP_NOT_UCP_WORD_BOUNDARY)
|
||||
{
|
||||
if (d == '_') left_word = TRUE; else
|
||||
{
|
||||
uint32_t cat = UCD_CATEGORY(d);
|
||||
left_word = (cat == ucp_L || cat == ucp_N);
|
||||
}
|
||||
int chartype = UCD_CHARTYPE(d);
|
||||
int category = PRIV(ucp_gentype)[chartype];
|
||||
left_word = (category == ucp_L || category == ucp_N ||
|
||||
chartype == ucp_Mn || chartype == ucp_Pc);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
@ -1135,13 +1139,13 @@ for (;;)
|
||||
mb->last_used_ptr = temp;
|
||||
}
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if ((mb->poptions & PCRE2_UCP) != 0)
|
||||
if (codevalue == OP_UCP_WORD_BOUNDARY ||
|
||||
codevalue == OP_NOT_UCP_WORD_BOUNDARY)
|
||||
{
|
||||
if (c == '_') right_word = TRUE; else
|
||||
{
|
||||
uint32_t cat = UCD_CATEGORY(c);
|
||||
right_word = (cat == ucp_L || cat == ucp_N);
|
||||
}
|
||||
int chartype = UCD_CHARTYPE(c);
|
||||
int category = PRIV(ucp_gentype)[chartype];
|
||||
right_word = (category == ucp_L || category == ucp_N ||
|
||||
chartype == ucp_Mn || chartype == ucp_Pc);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
@ -1149,7 +1153,9 @@ for (;;)
|
||||
}
|
||||
else right_word = FALSE;
|
||||
|
||||
if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
|
||||
if ((left_word == right_word) ==
|
||||
(codevalue == OP_NOT_WORD_BOUNDARY ||
|
||||
codevalue == OP_NOT_UCP_WORD_BOUNDARY))
|
||||
{ ADD_ACTIVE(state_offset + 1, 0); }
|
||||
}
|
||||
break;
|
||||
@ -1166,6 +1172,7 @@ for (;;)
|
||||
if (clen > 0)
|
||||
{
|
||||
BOOL OK;
|
||||
int chartype;
|
||||
const uint32_t *cp;
|
||||
const ucd_record * prop = GET_UCD(c);
|
||||
switch(code[1])
|
||||
@ -1175,8 +1182,9 @@ for (;;)
|
||||
break;
|
||||
|
||||
case PT_LAMP:
|
||||
OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
|
||||
prop->chartype == ucp_Lt;
|
||||
chartype = prop->chartype;
|
||||
OK = chartype == ucp_Lu || chartype == ucp_Ll ||
|
||||
chartype == ucp_Lt;
|
||||
break;
|
||||
|
||||
case PT_GC:
|
||||
@ -1199,8 +1207,9 @@ for (;;)
|
||||
/* These are specials for combination cases. */
|
||||
|
||||
case PT_ALNUM:
|
||||
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[prop->chartype] == ucp_N;
|
||||
chartype = prop->chartype;
|
||||
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[chartype] == ucp_N;
|
||||
break;
|
||||
|
||||
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
|
||||
@ -1223,12 +1232,20 @@ for (;;)
|
||||
break;
|
||||
|
||||
case PT_WORD:
|
||||
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
|
||||
c == CHAR_UNDERSCORE;
|
||||
chartype = prop->chartype;
|
||||
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[chartype] == ucp_N ||
|
||||
chartype == ucp_Mn || chartype == ucp_Pc;
|
||||
break;
|
||||
|
||||
case PT_CLIST:
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
if (c > MAX_UTF_CODE_POINT)
|
||||
{
|
||||
OK = FALSE;
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
cp = PRIV(ucd_caseless_sets) + code[2];
|
||||
for (;;)
|
||||
{
|
||||
@ -1438,6 +1455,7 @@ for (;;)
|
||||
if (clen > 0)
|
||||
{
|
||||
BOOL OK;
|
||||
int chartype;
|
||||
const uint32_t *cp;
|
||||
const ucd_record * prop = GET_UCD(c);
|
||||
switch(code[2])
|
||||
@ -1447,8 +1465,8 @@ for (;;)
|
||||
break;
|
||||
|
||||
case PT_LAMP:
|
||||
OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
|
||||
prop->chartype == ucp_Lt;
|
||||
chartype = prop->chartype;
|
||||
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
|
||||
break;
|
||||
|
||||
case PT_GC:
|
||||
@ -1471,8 +1489,9 @@ for (;;)
|
||||
/* These are specials for combination cases. */
|
||||
|
||||
case PT_ALNUM:
|
||||
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[prop->chartype] == ucp_N;
|
||||
chartype = prop->chartype;
|
||||
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[chartype] == ucp_N;
|
||||
break;
|
||||
|
||||
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
|
||||
@ -1495,12 +1514,20 @@ for (;;)
|
||||
break;
|
||||
|
||||
case PT_WORD:
|
||||
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
|
||||
c == CHAR_UNDERSCORE;
|
||||
chartype = prop->chartype;
|
||||
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[chartype] == ucp_N ||
|
||||
chartype == ucp_Mn || chartype == ucp_Pc;
|
||||
break;
|
||||
|
||||
case PT_CLIST:
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
if (c > MAX_UTF_CODE_POINT)
|
||||
{
|
||||
OK = FALSE;
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
cp = PRIV(ucd_caseless_sets) + code[3];
|
||||
for (;;)
|
||||
{
|
||||
@ -1693,6 +1720,7 @@ for (;;)
|
||||
if (clen > 0)
|
||||
{
|
||||
BOOL OK;
|
||||
int chartype;
|
||||
const uint32_t *cp;
|
||||
const ucd_record * prop = GET_UCD(c);
|
||||
switch(code[2])
|
||||
@ -1702,8 +1730,8 @@ for (;;)
|
||||
break;
|
||||
|
||||
case PT_LAMP:
|
||||
OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
|
||||
prop->chartype == ucp_Lt;
|
||||
chartype = prop->chartype;
|
||||
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
|
||||
break;
|
||||
|
||||
case PT_GC:
|
||||
@ -1726,8 +1754,9 @@ for (;;)
|
||||
/* These are specials for combination cases. */
|
||||
|
||||
case PT_ALNUM:
|
||||
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[prop->chartype] == ucp_N;
|
||||
chartype = prop->chartype;
|
||||
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[chartype] == ucp_N;
|
||||
break;
|
||||
|
||||
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
|
||||
@ -1750,12 +1779,20 @@ for (;;)
|
||||
break;
|
||||
|
||||
case PT_WORD:
|
||||
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
|
||||
c == CHAR_UNDERSCORE;
|
||||
chartype = prop->chartype;
|
||||
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[chartype] == ucp_N ||
|
||||
chartype == ucp_Mn || chartype == ucp_Pc;
|
||||
break;
|
||||
|
||||
case PT_CLIST:
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
if (c > MAX_UTF_CODE_POINT)
|
||||
{
|
||||
OK = FALSE;
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
cp = PRIV(ucd_caseless_sets) + code[3];
|
||||
for (;;)
|
||||
{
|
||||
@ -1973,6 +2010,7 @@ for (;;)
|
||||
if (clen > 0)
|
||||
{
|
||||
BOOL OK;
|
||||
int chartype;
|
||||
const uint32_t *cp;
|
||||
const ucd_record * prop = GET_UCD(c);
|
||||
switch(code[1 + IMM2_SIZE + 1])
|
||||
@ -1982,8 +2020,8 @@ for (;;)
|
||||
break;
|
||||
|
||||
case PT_LAMP:
|
||||
OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
|
||||
prop->chartype == ucp_Lt;
|
||||
chartype = prop->chartype;
|
||||
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
|
||||
break;
|
||||
|
||||
case PT_GC:
|
||||
@ -2007,8 +2045,9 @@ for (;;)
|
||||
/* These are specials for combination cases. */
|
||||
|
||||
case PT_ALNUM:
|
||||
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[prop->chartype] == ucp_N;
|
||||
chartype = prop->chartype;
|
||||
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[chartype] == ucp_N;
|
||||
break;
|
||||
|
||||
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
|
||||
@ -2031,12 +2070,20 @@ for (;;)
|
||||
break;
|
||||
|
||||
case PT_WORD:
|
||||
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
|
||||
c == CHAR_UNDERSCORE;
|
||||
chartype = prop->chartype;
|
||||
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[chartype] == ucp_N ||
|
||||
chartype == ucp_Mn || chartype == ucp_Pc;
|
||||
break;
|
||||
|
||||
case PT_CLIST:
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
if (c > MAX_UTF_CODE_POINT)
|
||||
{
|
||||
OK = FALSE;
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
|
||||
for (;;)
|
||||
{
|
||||
@ -2892,7 +2939,6 @@ for (;;)
|
||||
int *local_workspace;
|
||||
PCRE2_SIZE *local_offsets;
|
||||
RWS_anchor *rws = (RWS_anchor *)RWS;
|
||||
dfa_recursion_info *ri;
|
||||
PCRE2_SPTR callpat = start_code + GET(code, 1);
|
||||
uint32_t recno = (callpat == mb->start_code)? 0 :
|
||||
GET2(callpat, 1 + LINK_SIZE);
|
||||
@ -2909,18 +2955,24 @@ for (;;)
|
||||
rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
|
||||
|
||||
/* Check for repeating a recursion without advancing the subject
|
||||
pointer. This should catch convoluted mutual recursions. (Some simple
|
||||
cases are caught at compile time.) */
|
||||
pointer or last used character. This should catch convoluted mutual
|
||||
recursions. (Some simple cases are caught at compile time.) */
|
||||
|
||||
for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
|
||||
if (recno == ri->group_num && ptr == ri->subject_position)
|
||||
for (dfa_recursion_info *ri = mb->recursive;
|
||||
ri != NULL;
|
||||
ri = ri->prevrec)
|
||||
{
|
||||
if (recno == ri->group_num && ptr == ri->subject_position &&
|
||||
mb->last_used_ptr == ri->last_used_ptr)
|
||||
return PCRE2_ERROR_RECURSELOOP;
|
||||
}
|
||||
|
||||
/* Remember this recursion and where we started it so as to
|
||||
catch infinite loops. */
|
||||
|
||||
new_recursive.group_num = recno;
|
||||
new_recursive.subject_position = ptr;
|
||||
new_recursive.last_used_ptr = mb->last_used_ptr;
|
||||
new_recursive.prevrec = mb->recursive;
|
||||
mb->recursive = &new_recursive;
|
||||
|
||||
@ -3422,7 +3474,7 @@ anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
|
||||
where to start. */
|
||||
|
||||
startline = (re->flags & PCRE2_STARTLINE) != 0;
|
||||
firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
|
||||
firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0;
|
||||
bumpalong_limit = end_subject;
|
||||
|
||||
/* Initialize and set up the fixed fields in the callout block, with a pointer
|
||||
@ -3992,8 +4044,9 @@ for (;;)
|
||||
match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
|
||||
match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
|
||||
}
|
||||
match_data->subject_length = length;
|
||||
match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
|
||||
match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
|
||||
match_data->rightchar = (PCRE2_SIZE)(mb->last_used_ptr - subject);
|
||||
match_data->startchar = (PCRE2_SIZE)(start_match - subject);
|
||||
match_data->rc = rc;
|
||||
|
||||
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2021 University of Cambridge
|
||||
New API code Copyright (c) 2016-2024 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -79,7 +79,7 @@ static const unsigned char compile_error_texts[] =
|
||||
"missing closing parenthesis\0"
|
||||
/* 15 */
|
||||
"reference to non-existent subpattern\0"
|
||||
"pattern passed as NULL\0"
|
||||
"pattern passed as NULL with non-zero length\0"
|
||||
"unrecognised compile-time option bit(s)\0"
|
||||
"missing ) after (?# comment\0"
|
||||
"parentheses are too deeply nested\0"
|
||||
@ -90,7 +90,7 @@ static const unsigned char compile_error_texts[] =
|
||||
"internal error: code overflow\0"
|
||||
"missing closing parenthesis for condition\0"
|
||||
/* 25 */
|
||||
"lookbehind assertion is not fixed length\0"
|
||||
"length of lookbehind assertion is not limited\0"
|
||||
"a relative value of zero is not allowed\0"
|
||||
"conditional subpattern contains more than two branches\0"
|
||||
"assertion expected after (?( or (?(?C)\0"
|
||||
@ -184,6 +184,9 @@ static const unsigned char compile_error_texts[] =
|
||||
"too many capturing groups (maximum 65535)\0"
|
||||
"atomic assertion expected after (?( or (?(?C)\0"
|
||||
"\\K is not allowed in lookarounds (but see PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK)\0"
|
||||
/* 100 */
|
||||
"branch too long in variable-length lookbehind assertion\0"
|
||||
"compiled pattern would be longer than the limit set by the application\0"
|
||||
;
|
||||
|
||||
/* Match-time and UTF error texts are in the same format. */
|
||||
@ -269,6 +272,7 @@ static const unsigned char match_error_texts[] =
|
||||
/* 65 */
|
||||
"internal error - duplicate substitution match\0"
|
||||
"PCRE2_MATCH_INVALID_UTF is not supported for DFA matching\0"
|
||||
"INTERNAL ERROR: invalid substring offset\0"
|
||||
;
|
||||
|
||||
|
||||
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2021 University of Cambridge
|
||||
New API code Copyright (c) 2016-2024 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -71,7 +71,11 @@ return NULL;
|
||||
* Match an extended grapheme sequence *
|
||||
*************************************************/
|
||||
|
||||
/*
|
||||
/* NOTE: The logic contained in this function is replicated in three special-
|
||||
purpose functions in the pcre2_jit_compile.c module. If the logic below is
|
||||
changed, they must be kept in step so that the interpreter and the JIT have the
|
||||
same behaviour.
|
||||
|
||||
Arguments:
|
||||
c the first character
|
||||
eptr pointer to next character
|
||||
@ -88,6 +92,7 @@ PCRE2_SPTR
|
||||
PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject,
|
||||
PCRE2_SPTR end_subject, BOOL utf, int *xcount)
|
||||
{
|
||||
BOOL was_ep_ZWJ = FALSE;
|
||||
int lgb = UCD_GRAPHBREAK(c);
|
||||
|
||||
while (eptr < end_subject)
|
||||
@ -98,6 +103,12 @@ while (eptr < end_subject)
|
||||
rgb = UCD_GRAPHBREAK(c);
|
||||
if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
|
||||
|
||||
/* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was
|
||||
preceded by Extended Pictographic. */
|
||||
|
||||
if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ)
|
||||
break;
|
||||
|
||||
/* Not breaking between Regional Indicators is allowed only if there
|
||||
are an even number of preceding RIs. */
|
||||
|
||||
@ -125,12 +136,15 @@ while (eptr < end_subject)
|
||||
if ((ricount & 1) != 0) break; /* Grapheme break required */
|
||||
}
|
||||
|
||||
/* If Extend or ZWJ follows Extended_Pictographic, do not update lgb; this
|
||||
allows any number of them before a following Extended_Pictographic. */
|
||||
/* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in
|
||||
between; see next statement). */
|
||||
|
||||
if ((rgb != ucp_gbExtend && rgb != ucp_gbZWJ) ||
|
||||
lgb != ucp_gbExtended_Pictographic)
|
||||
lgb = rgb;
|
||||
was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ);
|
||||
|
||||
/* If Extend follows Extended_Pictographic, do not update lgb; this allows
|
||||
any number of them before a following ZWJ. */
|
||||
|
||||
if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic) lgb = rgb;
|
||||
|
||||
eptr += len;
|
||||
if (xcount != NULL) *xcount += 1;
|
||||
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2018 University of Cambridge
|
||||
New API code Copyright (c) 2016-2023 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -41,9 +41,9 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
/* This module contains a single function that scans through a compiled pattern
|
||||
until it finds a capturing bracket with the given number, or, if the number is
|
||||
negative, an instance of OP_REVERSE for a lookbehind. The function is called
|
||||
from pcre2_compile.c and also from pcre2_study.c when finding the minimum
|
||||
matching length. */
|
||||
negative, an instance of OP_REVERSE or OP_VREVERSE for a lookbehind. The
|
||||
function is called from pcre2_compile.c and also from pcre2_study.c when
|
||||
finding the minimum matching length. */
|
||||
|
||||
|
||||
#include "pcre2_config.h"
|
||||
@ -82,7 +82,7 @@ for (;;)
|
||||
|
||||
/* Handle lookbehind */
|
||||
|
||||
else if (c == OP_REVERSE)
|
||||
else if (c == OP_REVERSE || c == OP_VREVERSE)
|
||||
{
|
||||
if (number < 0) return (PCRE2_UCHAR *)code;
|
||||
code += PRIV(OP_lengths)[c];
|
||||
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
New API code Copyright (c) 2016-2023 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -51,6 +51,24 @@ pcre2test.c with CODE_UNIT_WIDTH == 0. */
|
||||
#error The use of both EBCDIC and SUPPORT_UNICODE is not supported.
|
||||
#endif
|
||||
|
||||
/* When compiling one of the libraries, the value of PCRE2_CODE_UNIT_WIDTH must
|
||||
be 8, 16, or 32. AutoTools and CMake ensure that this is always the case, but
|
||||
other building methods may not, so here is a check. It is cut out when
|
||||
building pcre2test, because that sets the value to zero. No other source should
|
||||
be including this file. There is no explicit way of forcing a compile to be
|
||||
abandoned, but trying to include a non-existent file seems cleanest. Otherwise
|
||||
there will be many irrelevant consequential errors. */
|
||||
|
||||
#if (!defined PCRE2_BUILDING_PCRE2TEST && !defined PCRE2_DFTABLES) && \
|
||||
(!defined PCRE2_CODE_UNIT_WIDTH || \
|
||||
(PCRE2_CODE_UNIT_WIDTH != 8 && \
|
||||
PCRE2_CODE_UNIT_WIDTH != 16 && \
|
||||
PCRE2_CODE_UNIT_WIDTH != 32))
|
||||
#error PCRE2_CODE_UNIT_WIDTH must be defined as 8, 16, or 32.
|
||||
#include <AbandonCompile>
|
||||
#endif
|
||||
|
||||
|
||||
/* Standard C headers */
|
||||
|
||||
#include <ctype.h>
|
||||
@ -116,20 +134,24 @@ special-purpose environments) might want to stick other stuff in front of
|
||||
exported symbols. That's why, in the non-Windows case, we set PCRE2_EXP_DEFN
|
||||
only if it is not already set. */
|
||||
|
||||
#if !defined(PCRE2_EXPORT)
|
||||
#define PCRE2_EXPORT
|
||||
#endif
|
||||
|
||||
#ifndef PCRE2_EXP_DECL
|
||||
# ifdef _WIN32
|
||||
# ifndef PCRE2_STATIC
|
||||
# define PCRE2_EXP_DECL extern __declspec(dllexport)
|
||||
# define PCRE2_EXP_DEFN __declspec(dllexport)
|
||||
# else
|
||||
# define PCRE2_EXP_DECL extern
|
||||
# define PCRE2_EXP_DECL extern PCRE2_EXPORT
|
||||
# define PCRE2_EXP_DEFN
|
||||
# endif
|
||||
# else
|
||||
# ifdef __cplusplus
|
||||
# define PCRE2_EXP_DECL extern "C"
|
||||
# define PCRE2_EXP_DECL extern "C" PCRE2_EXPORT
|
||||
# else
|
||||
# define PCRE2_EXP_DECL extern
|
||||
# define PCRE2_EXP_DECL extern PCRE2_EXPORT
|
||||
# endif
|
||||
# ifndef PCRE2_EXP_DEFN
|
||||
# define PCRE2_EXP_DEFN PCRE2_EXP_DECL
|
||||
@ -156,8 +178,8 @@ pcre2_match() because of the way it backtracks. */
|
||||
#define PCRE2_SPTR CUSTOM_SUBJECT_PTR
|
||||
#endif
|
||||
|
||||
/* When checking for integer overflow in pcre2_compile(), we need to handle
|
||||
large integers. If a 64-bit integer type is available, we can use that.
|
||||
/* When checking for integer overflow, we need to handle large integers.
|
||||
If a 64-bit integer type is available, we can use that.
|
||||
Otherwise we have to cast to double, which of course requires floating point
|
||||
arithmetic. Handle this by defining a macro for the appropriate type. */
|
||||
|
||||
@ -1281,7 +1303,7 @@ match. */
|
||||
#define PT_ALNUM 6 /* Alphanumeric - the union of L and N */
|
||||
#define PT_SPACE 7 /* Perl space - general category Z plus 9,10,12,13 */
|
||||
#define PT_PXSPACE 8 /* POSIX space - Z plus 9,10,11,12,13 */
|
||||
#define PT_WORD 9 /* Word - L plus N plus underscore */
|
||||
#define PT_WORD 9 /* Word - L, N, Mn, or Pc */
|
||||
#define PT_CLIST 10 /* Pseudo-property: match character list */
|
||||
#define PT_UCNC 11 /* Universal Character nameable character */
|
||||
#define PT_BIDICL 12 /* Specified bidi class */
|
||||
@ -1297,6 +1319,7 @@ table. */
|
||||
#define PT_PXGRAPH 14 /* [:graph:] - characters that mark the paper */
|
||||
#define PT_PXPRINT 15 /* [:print:] - [:graph:] plus non-control spaces */
|
||||
#define PT_PXPUNCT 16 /* [:punct:] - punctuation characters */
|
||||
#define PT_PXXDIGIT 17 /* [:xdigit:] - hex digits */
|
||||
|
||||
/* This value is used when parsing \p and \P escapes to indicate that neither
|
||||
\p{script:...} nor \p{scx:...} has been encountered. */
|
||||
@ -1327,6 +1350,12 @@ mode rather than an escape sequence. It is also used for [^] in JavaScript
|
||||
compatibility mode, and for \C in non-utf mode. In non-DOTALL mode, "." behaves
|
||||
like \N.
|
||||
|
||||
ESC_ub is a special return from check_escape() when, in BSUX mode, \u{ is not
|
||||
followed by hex digits and }, in which case it should mean a literal "u"
|
||||
followed by a literal "{". This hack is necessary for cases like \u{ 12}
|
||||
because without it, this is interpreted as u{12} now that spaces are allowed in
|
||||
quantifiers.
|
||||
|
||||
Negative numbers are used to encode a backreference (\1, \2, \3, etc.) in
|
||||
check_escape(). There are tests in the code for an escape greater than ESC_b
|
||||
and less than ESC_Z to detect the types that may be repeated. These are the
|
||||
@ -1336,7 +1365,7 @@ consume a character, that code will have to change. */
|
||||
enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
|
||||
ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
|
||||
ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z,
|
||||
ESC_E, ESC_Q, ESC_g, ESC_k };
|
||||
ESC_E, ESC_Q, ESC_g, ESC_k, ESC_ub };
|
||||
|
||||
|
||||
/********************** Opcode definitions ******************/
|
||||
@ -1372,8 +1401,8 @@ enum {
|
||||
OP_SOD, /* 1 Start of data: \A */
|
||||
OP_SOM, /* 2 Start of match (subject + offset): \G */
|
||||
OP_SET_SOM, /* 3 Set start of match (\K) */
|
||||
OP_NOT_WORD_BOUNDARY, /* 4 \B */
|
||||
OP_WORD_BOUNDARY, /* 5 \b */
|
||||
OP_NOT_WORD_BOUNDARY, /* 4 \B -- see also OP_NOT_UCP_WORD_BOUNDARY */
|
||||
OP_WORD_BOUNDARY, /* 5 \b -- see also OP_UCP_WORD_BOUNDARY */
|
||||
OP_NOT_DIGIT, /* 6 \D */
|
||||
OP_DIGIT, /* 7 \d */
|
||||
OP_NOT_WHITESPACE, /* 8 \S */
|
||||
@ -1547,78 +1576,85 @@ enum {
|
||||
/* The assertions must come before BRA, CBRA, ONCE, and COND. */
|
||||
|
||||
OP_REVERSE, /* 125 Move pointer back - used in lookbehind assertions */
|
||||
OP_ASSERT, /* 126 Positive lookahead */
|
||||
OP_ASSERT_NOT, /* 127 Negative lookahead */
|
||||
OP_ASSERTBACK, /* 128 Positive lookbehind */
|
||||
OP_ASSERTBACK_NOT, /* 129 Negative lookbehind */
|
||||
OP_ASSERT_NA, /* 130 Positive non-atomic lookahead */
|
||||
OP_ASSERTBACK_NA, /* 131 Positive non-atomic lookbehind */
|
||||
OP_VREVERSE, /* 126 Move pointer back - variable */
|
||||
OP_ASSERT, /* 127 Positive lookahead */
|
||||
OP_ASSERT_NOT, /* 128 Negative lookahead */
|
||||
OP_ASSERTBACK, /* 129 Positive lookbehind */
|
||||
OP_ASSERTBACK_NOT, /* 130 Negative lookbehind */
|
||||
OP_ASSERT_NA, /* 131 Positive non-atomic lookahead */
|
||||
OP_ASSERTBACK_NA, /* 132 Positive non-atomic lookbehind */
|
||||
|
||||
/* ONCE, SCRIPT_RUN, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come
|
||||
immediately after the assertions, with ONCE first, as there's a test for >=
|
||||
ONCE for a subpattern that isn't an assertion. The POS versions must
|
||||
immediately follow the non-POS versions in each case. */
|
||||
|
||||
OP_ONCE, /* 132 Atomic group, contains captures */
|
||||
OP_SCRIPT_RUN, /* 133 Non-capture, but check characters' scripts */
|
||||
OP_BRA, /* 134 Start of non-capturing bracket */
|
||||
OP_BRAPOS, /* 135 Ditto, with unlimited, possessive repeat */
|
||||
OP_CBRA, /* 136 Start of capturing bracket */
|
||||
OP_CBRAPOS, /* 137 Ditto, with unlimited, possessive repeat */
|
||||
OP_COND, /* 138 Conditional group */
|
||||
OP_ONCE, /* 133 Atomic group, contains captures */
|
||||
OP_SCRIPT_RUN, /* 134 Non-capture, but check characters' scripts */
|
||||
OP_BRA, /* 135 Start of non-capturing bracket */
|
||||
OP_BRAPOS, /* 136 Ditto, with unlimited, possessive repeat */
|
||||
OP_CBRA, /* 137 Start of capturing bracket */
|
||||
OP_CBRAPOS, /* 138 Ditto, with unlimited, possessive repeat */
|
||||
OP_COND, /* 139 Conditional group */
|
||||
|
||||
/* These five must follow the previous five, in the same order. There's a
|
||||
check for >= SBRA to distinguish the two sets. */
|
||||
|
||||
OP_SBRA, /* 139 Start of non-capturing bracket, check empty */
|
||||
OP_SBRAPOS, /* 140 Ditto, with unlimited, possessive repeat */
|
||||
OP_SCBRA, /* 141 Start of capturing bracket, check empty */
|
||||
OP_SCBRAPOS, /* 142 Ditto, with unlimited, possessive repeat */
|
||||
OP_SCOND, /* 143 Conditional group, check empty */
|
||||
OP_SBRA, /* 140 Start of non-capturing bracket, check empty */
|
||||
OP_SBRAPOS, /* 141 Ditto, with unlimited, possessive repeat */
|
||||
OP_SCBRA, /* 142 Start of capturing bracket, check empty */
|
||||
OP_SCBRAPOS, /* 143 Ditto, with unlimited, possessive repeat */
|
||||
OP_SCOND, /* 144 Conditional group, check empty */
|
||||
|
||||
/* The next two pairs must (respectively) be kept together. */
|
||||
|
||||
OP_CREF, /* 144 Used to hold a capture number as condition */
|
||||
OP_DNCREF, /* 145 Used to point to duplicate names as a condition */
|
||||
OP_RREF, /* 146 Used to hold a recursion number as condition */
|
||||
OP_DNRREF, /* 147 Used to point to duplicate names as a condition */
|
||||
OP_FALSE, /* 148 Always false (used by DEFINE and VERSION) */
|
||||
OP_TRUE, /* 149 Always true (used by VERSION) */
|
||||
OP_CREF, /* 145 Used to hold a capture number as condition */
|
||||
OP_DNCREF, /* 146 Used to point to duplicate names as a condition */
|
||||
OP_RREF, /* 147 Used to hold a recursion number as condition */
|
||||
OP_DNRREF, /* 148 Used to point to duplicate names as a condition */
|
||||
OP_FALSE, /* 149 Always false (used by DEFINE and VERSION) */
|
||||
OP_TRUE, /* 150 Always true (used by VERSION) */
|
||||
|
||||
OP_BRAZERO, /* 150 These two must remain together and in this */
|
||||
OP_BRAMINZERO, /* 151 order. */
|
||||
OP_BRAPOSZERO, /* 152 */
|
||||
OP_BRAZERO, /* 151 These two must remain together and in this */
|
||||
OP_BRAMINZERO, /* 152 order. */
|
||||
OP_BRAPOSZERO, /* 153 */
|
||||
|
||||
/* These are backtracking control verbs */
|
||||
|
||||
OP_MARK, /* 153 always has an argument */
|
||||
OP_PRUNE, /* 154 */
|
||||
OP_PRUNE_ARG, /* 155 same, but with argument */
|
||||
OP_SKIP, /* 156 */
|
||||
OP_SKIP_ARG, /* 157 same, but with argument */
|
||||
OP_THEN, /* 158 */
|
||||
OP_THEN_ARG, /* 159 same, but with argument */
|
||||
OP_COMMIT, /* 160 */
|
||||
OP_COMMIT_ARG, /* 161 same, but with argument */
|
||||
OP_MARK, /* 154 always has an argument */
|
||||
OP_PRUNE, /* 155 */
|
||||
OP_PRUNE_ARG, /* 156 same, but with argument */
|
||||
OP_SKIP, /* 157 */
|
||||
OP_SKIP_ARG, /* 158 same, but with argument */
|
||||
OP_THEN, /* 159 */
|
||||
OP_THEN_ARG, /* 160 same, but with argument */
|
||||
OP_COMMIT, /* 161 */
|
||||
OP_COMMIT_ARG, /* 162 same, but with argument */
|
||||
|
||||
/* These are forced failure and success verbs. FAIL and ACCEPT do accept an
|
||||
argument, but these cases can be compiled as, for example, (*MARK:X)(*FAIL)
|
||||
without the need for a special opcode. */
|
||||
|
||||
OP_FAIL, /* 162 */
|
||||
OP_ACCEPT, /* 163 */
|
||||
OP_ASSERT_ACCEPT, /* 164 Used inside assertions */
|
||||
OP_CLOSE, /* 165 Used before OP_ACCEPT to close open captures */
|
||||
OP_FAIL, /* 163 */
|
||||
OP_ACCEPT, /* 164 */
|
||||
OP_ASSERT_ACCEPT, /* 165 Used inside assertions */
|
||||
OP_CLOSE, /* 166 Used before OP_ACCEPT to close open captures */
|
||||
|
||||
/* This is used to skip a subpattern with a {0} quantifier */
|
||||
|
||||
OP_SKIPZERO, /* 166 */
|
||||
OP_SKIPZERO, /* 167 */
|
||||
|
||||
/* This is used to identify a DEFINE group during compilation so that it can
|
||||
be checked for having only one branch. It is changed to OP_FALSE before
|
||||
compilation finishes. */
|
||||
|
||||
OP_DEFINE, /* 167 */
|
||||
OP_DEFINE, /* 168 */
|
||||
|
||||
/* These opcodes replace their normal counterparts in UCP mode when
|
||||
PCRE2_EXTRA_ASCII_BSW is not set. */
|
||||
|
||||
OP_NOT_UCP_WORD_BOUNDARY, /* 169 */
|
||||
OP_UCP_WORD_BOUNDARY, /* 170 */
|
||||
|
||||
/* This is not an opcode, but is used to check that tables indexed by opcode
|
||||
are the correct length, in order to catch updating errors - there have been
|
||||
@ -1664,7 +1700,7 @@ some cases doesn't actually use these names at all). */
|
||||
"class", "nclass", "xclass", "Ref", "Refi", "DnRef", "DnRefi", \
|
||||
"Recurse", "Callout", "CalloutStr", \
|
||||
"Alt", "Ket", "KetRmax", "KetRmin", "KetRpos", \
|
||||
"Reverse", "Assert", "Assert not", \
|
||||
"Reverse", "VReverse", "Assert", "Assert not", \
|
||||
"Assert back", "Assert back not", \
|
||||
"Non-atomic assert", "Non-atomic assert back", \
|
||||
"Once", \
|
||||
@ -1679,7 +1715,7 @@ some cases doesn't actually use these names at all). */
|
||||
"*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \
|
||||
"*THEN", "*THEN", "*COMMIT", "*COMMIT", "*FAIL", \
|
||||
"*ACCEPT", "*ASSERT_ACCEPT", \
|
||||
"Close", "Skip zero", "Define"
|
||||
"Close", "Skip zero", "Define", "\\B (ucp)", "\\b (ucp)"
|
||||
|
||||
|
||||
/* This macro defines the length of fixed length operations in the compiled
|
||||
@ -1746,7 +1782,8 @@ in UTF-8 mode. The code that uses this table must know about such things. */
|
||||
1+LINK_SIZE, /* KetRmax */ \
|
||||
1+LINK_SIZE, /* KetRmin */ \
|
||||
1+LINK_SIZE, /* KetRpos */ \
|
||||
1+LINK_SIZE, /* Reverse */ \
|
||||
1+IMM2_SIZE, /* Reverse */ \
|
||||
1+2*IMM2_SIZE, /* VReverse */ \
|
||||
1+LINK_SIZE, /* Assert */ \
|
||||
1+LINK_SIZE, /* Assert not */ \
|
||||
1+LINK_SIZE, /* Assert behind */ \
|
||||
@ -1775,7 +1812,8 @@ in UTF-8 mode. The code that uses this table must know about such things. */
|
||||
1, 3, /* COMMIT, COMMIT_ARG */ \
|
||||
1, 1, 1, /* FAIL, ACCEPT, ASSERT_ACCEPT */ \
|
||||
1+IMM2_SIZE, 1, /* CLOSE, SKIPZERO */ \
|
||||
1 /* DEFINE */
|
||||
1, /* DEFINE */ \
|
||||
1, 1 /* \B and \b in UCP mode */
|
||||
|
||||
/* A magic value for OP_RREF to indicate the "any recursion" condition. */
|
||||
|
||||
@ -2042,6 +2080,9 @@ extern void * _pcre2_memmove(void *, const void *, size_t);
|
||||
#endif
|
||||
|
||||
#endif /* PCRE2_CODE_UNIT_WIDTH */
|
||||
|
||||
extern BOOL PRIV(ckd_smul)(PCRE2_SIZE *, int, int);
|
||||
|
||||
#endif /* PCRE2_INTERNAL_H_IDEMPOTENT_GUARD */
|
||||
|
||||
/* End of pcre2_internal.h */
|
||||
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
New API code Copyright (c) 2016-2024 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -568,10 +568,12 @@ typedef struct pcre2_real_compile_context {
|
||||
void *stack_guard_data;
|
||||
const uint8_t *tables;
|
||||
PCRE2_SIZE max_pattern_length;
|
||||
PCRE2_SIZE max_pattern_compiled_length;
|
||||
uint16_t bsr_convention;
|
||||
uint16_t newline_convention;
|
||||
uint32_t parens_nest_limit;
|
||||
uint32_t extra_options;
|
||||
uint32_t max_varlookbehind;
|
||||
} pcre2_real_compile_context;
|
||||
|
||||
/* The real match context structure. */
|
||||
@ -605,12 +607,12 @@ defined specially because it is required in pcre2_serialize_decode() when
|
||||
copying the size from possibly unaligned memory into a variable of the same
|
||||
type. Use a macro rather than a typedef to avoid compiler warnings when this
|
||||
file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the
|
||||
largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit
|
||||
argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field
|
||||
here.) */
|
||||
largest lookbehind that is supported. (OP_REVERSE and OP_VREVERSE in a pattern
|
||||
have 16-bit arguments in 8-bit and 16-bit modes, so we need no more than a
|
||||
16-bit field here.) */
|
||||
|
||||
#undef CODE_BLOCKSIZE_TYPE
|
||||
#define CODE_BLOCKSIZE_TYPE size_t
|
||||
#define CODE_BLOCKSIZE_TYPE PCRE2_SIZE
|
||||
|
||||
#undef LOOKBEHIND_MAX
|
||||
#define LOOKBEHIND_MAX UINT16_MAX
|
||||
@ -658,6 +660,7 @@ typedef struct pcre2_real_match_data {
|
||||
PCRE2_SPTR mark; /* Pointer to last mark */
|
||||
struct heapframe *heapframes; /* Backtracking frames heap memory */
|
||||
PCRE2_SIZE heapframes_size; /* Malloc-ed size */
|
||||
PCRE2_SIZE subject_length; /* Subject length */
|
||||
PCRE2_SIZE leftchar; /* Offset to leftmost code unit */
|
||||
PCRE2_SIZE rightchar; /* Offset to rightmost code unit */
|
||||
PCRE2_SIZE startchar; /* Offset to starting code unit */
|
||||
@ -675,8 +678,8 @@ typedef struct pcre2_real_match_data {
|
||||
|
||||
#ifndef PCRE2_PCRE2TEST
|
||||
|
||||
/* Structures for checking for mutual recursion when scanning compiled or
|
||||
parsed code. */
|
||||
/* Structures for checking for mutual function recursion when scanning compiled
|
||||
or parsed code. */
|
||||
|
||||
typedef struct recurse_check {
|
||||
struct recurse_check *prev;
|
||||
@ -688,7 +691,7 @@ typedef struct parsed_recurse_check {
|
||||
uint32_t *groupptr;
|
||||
} parsed_recurse_check;
|
||||
|
||||
/* Structure for building a cache when filling in recursion offsets. */
|
||||
/* Structure for building a cache when filling in pattern recursion offsets. */
|
||||
|
||||
typedef struct recurse_cache {
|
||||
PCRE2_SPTR group;
|
||||
@ -734,7 +737,6 @@ typedef struct compile_block {
|
||||
uint16_t name_entry_size; /* Size of each entry */
|
||||
uint16_t parens_depth; /* Depth of nested parentheses */
|
||||
uint16_t assert_depth; /* Depth of nested assertions */
|
||||
open_capitem *open_caps; /* Chain of open capture items */
|
||||
named_group *named_groups; /* Points to vector in pre-compile */
|
||||
uint32_t named_group_list_size; /* Number of entries in the list */
|
||||
uint32_t external_options; /* External (initial) options */
|
||||
@ -752,10 +754,11 @@ typedef struct compile_block {
|
||||
uint32_t class_range_end; /* Overall class range end */
|
||||
PCRE2_UCHAR nl[4]; /* Newline string when fixed length */
|
||||
uint32_t req_varyopt; /* "After variable item" flag for reqbyte */
|
||||
int max_lookbehind; /* Maximum lookbehind (characters) */
|
||||
uint32_t max_varlookbehind; /* Limit for variable lookbehinds */
|
||||
int max_lookbehind; /* Maximum lookbehind encountered (characters) */
|
||||
BOOL had_accept; /* (*ACCEPT) encountered */
|
||||
BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */
|
||||
BOOL had_recurse; /* Had a recursion or subroutine call */
|
||||
BOOL had_recurse; /* Had a pattern recursion or subroutine call */
|
||||
BOOL dupnames; /* Duplicate names exist */
|
||||
} compile_block;
|
||||
|
||||
@ -773,6 +776,7 @@ call within the pattern when running pcre2_dfa_match(). */
|
||||
typedef struct dfa_recursion_info {
|
||||
struct dfa_recursion_info *prevrec;
|
||||
PCRE2_SPTR subject_position;
|
||||
PCRE2_SPTR last_used_ptr;
|
||||
uint32_t group_num;
|
||||
} dfa_recursion_info;
|
||||
|
||||
@ -793,7 +797,7 @@ typedef struct heapframe {
|
||||
PCRE2_SIZE length; /* Used for character, string, or code lengths */
|
||||
PCRE2_SIZE back_frame; /* Amount to subtract on RRETURN */
|
||||
PCRE2_SIZE temp_size; /* Used for short-term PCRE2_SIZE values */
|
||||
uint32_t rdepth; /* "Recursion" depth */
|
||||
uint32_t rdepth; /* Function "recursion" depth within pcre2_match() */
|
||||
uint32_t group_frame_type; /* Type information for group frames */
|
||||
uint32_t temp_32[4]; /* Used for short-term 32-bit or BOOL values */
|
||||
uint8_t return_id; /* Where to go on in internal "return" */
|
||||
@ -829,7 +833,8 @@ typedef struct heapframe {
|
||||
PCRE2_SPTR eptr; /* MUST BE FIRST */
|
||||
PCRE2_SPTR start_match; /* Can be adjusted by \K */
|
||||
PCRE2_SPTR mark; /* Most recent mark on the success path */
|
||||
uint32_t current_recurse; /* Current (deepest) recursion number */
|
||||
PCRE2_SPTR recurse_last_used; /* Last character used at time of pattern recursion */
|
||||
uint32_t current_recurse; /* Group number of current (deepest) pattern recursion */
|
||||
uint32_t capture_last; /* Most recent capture */
|
||||
PCRE2_SIZE last_group_offset; /* Saved offset to most recent group frame */
|
||||
PCRE2_SIZE offset_top; /* Offset after highest capture */
|
||||
@ -858,7 +863,7 @@ doing traditional NFA matching (pcre2_match() and friends). */
|
||||
|
||||
typedef struct match_block {
|
||||
pcre2_memctl memctl; /* For general use */
|
||||
PCRE2_SIZE heap_limit; /* As it says */
|
||||
uint32_t heap_limit; /* As it says */
|
||||
uint32_t match_limit; /* As it says */
|
||||
uint32_t match_limit_depth; /* As it says */
|
||||
uint32_t match_call_count; /* Number of times a new frame is created */
|
||||
@ -875,10 +880,11 @@ typedef struct match_block {
|
||||
uint16_t name_count; /* Number of names in name table */
|
||||
uint16_t name_entry_size; /* Size of entry in names table */
|
||||
PCRE2_SPTR name_table; /* Table of group names */
|
||||
PCRE2_SPTR start_code; /* For use when recursing */
|
||||
PCRE2_SPTR start_code; /* For use in pattern recursion */
|
||||
PCRE2_SPTR start_subject; /* Start of the subject string */
|
||||
PCRE2_SPTR check_subject; /* Where UTF-checked from */
|
||||
PCRE2_SPTR end_subject; /* End of the subject string */
|
||||
PCRE2_SPTR end_subject; /* Usable end of the subject string */
|
||||
PCRE2_SPTR true_end_subject; /* Actual end of the subject string */
|
||||
PCRE2_SPTR end_match_ptr; /* Subject position at end match */
|
||||
PCRE2_SPTR start_used_ptr; /* Earliest consulted character */
|
||||
PCRE2_SPTR last_used_ptr; /* Latest consulted character */
|
||||
@ -886,7 +892,7 @@ typedef struct match_block {
|
||||
PCRE2_SPTR nomatch_mark; /* Mark pointer to pass back on failure */
|
||||
PCRE2_SPTR verb_ecode_ptr; /* For passing back info */
|
||||
PCRE2_SPTR verb_skip_ptr; /* For passing back a (*SKIP) name */
|
||||
uint32_t verb_current_recurse; /* Current recurse when (*VERB) happens */
|
||||
uint32_t verb_current_recurse; /* Current recursion group when (*VERB) happens */
|
||||
uint32_t moptions; /* Match options */
|
||||
uint32_t poptions; /* Pattern options */
|
||||
uint32_t skip_arg_count; /* For counting SKIP_ARGs */
|
||||
@ -911,7 +917,7 @@ typedef struct dfa_match_block {
|
||||
PCRE2_SPTR last_used_ptr; /* Latest consulted character */
|
||||
const uint8_t *tables; /* Character tables */
|
||||
PCRE2_SIZE start_offset; /* The start offset value */
|
||||
PCRE2_SIZE heap_limit; /* As it says */
|
||||
uint32_t heap_limit; /* As it says */
|
||||
PCRE2_SIZE heap_used; /* As it says */
|
||||
uint32_t match_limit; /* As it says */
|
||||
uint32_t match_limit_depth; /* As it says */
|
||||
@ -926,7 +932,7 @@ typedef struct dfa_match_block {
|
||||
pcre2_callout_block *cb; /* Points to a callout block */
|
||||
void *callout_data; /* To pass back to callouts */
|
||||
int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */
|
||||
dfa_recursion_info *recursive; /* Linked list of recursion data */
|
||||
dfa_recursion_info *recursive; /* Linked list of pattern recursion data */
|
||||
} dfa_match_block;
|
||||
|
||||
#endif /* PCRE2_PCRE2TEST */
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2018 University of Cambridge
|
||||
New API code Copyright (c) 2016-2023 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -42,6 +42,12 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
#error This file must be included from pcre2_jit_compile.c.
|
||||
#endif
|
||||
|
||||
#if defined(__has_feature)
|
||||
#if __has_feature(memory_sanitizer)
|
||||
#include <sanitizer/msan_interface.h>
|
||||
#endif /* __has_feature(memory_sanitizer) */
|
||||
#endif /* defined(__has_feature) */
|
||||
|
||||
#ifdef SUPPORT_JIT
|
||||
|
||||
static SLJIT_NOINLINE int jit_machine_stack_exec(jit_arguments *arguments, jit_function executable_func)
|
||||
@ -171,6 +177,7 @@ if (rc > (int)oveccount)
|
||||
rc = 0;
|
||||
match_data->code = re;
|
||||
match_data->subject = (rc >= 0 || rc == PCRE2_ERROR_PARTIAL)? subject : NULL;
|
||||
match_data->subject_length = length;
|
||||
match_data->rc = rc;
|
||||
match_data->startchar = arguments.startchar_ptr - subject;
|
||||
match_data->leftchar = 0;
|
||||
@ -178,6 +185,13 @@ match_data->rightchar = 0;
|
||||
match_data->mark = arguments.mark_ptr;
|
||||
match_data->matchedby = PCRE2_MATCHEDBY_JIT;
|
||||
|
||||
#if defined(__has_feature)
|
||||
#if __has_feature(memory_sanitizer)
|
||||
if (rc > 0)
|
||||
__msan_unpoison(match_data->ovector, 2 * rc * sizeof(match_data->ovector[0]));
|
||||
#endif /* __has_feature(memory_sanitizer) */
|
||||
#endif /* defined(__has_feature) */
|
||||
|
||||
return match_data->rc;
|
||||
|
||||
#endif /* SUPPORT_JIT */
|
||||
|
@ -141,8 +141,8 @@ if (startsize == 0 || maxsize == 0 || maxsize > SIZE_MAX - STACK_GROWTH_RATE)
|
||||
return NULL;
|
||||
if (startsize > maxsize)
|
||||
startsize = maxsize;
|
||||
startsize = (startsize + STACK_GROWTH_RATE - 1) & ~(STACK_GROWTH_RATE - 1);
|
||||
maxsize = (maxsize + STACK_GROWTH_RATE - 1) & ~(STACK_GROWTH_RATE - 1);
|
||||
startsize = (startsize + STACK_GROWTH_RATE - 1) & (size_t)(~(STACK_GROWTH_RATE - 1));
|
||||
maxsize = (maxsize + STACK_GROWTH_RATE - 1) & (size_t)(~(STACK_GROWTH_RATE - 1));
|
||||
|
||||
jit_stack = PRIV(memctl_malloc)(sizeof(pcre2_real_jit_stack), (pcre2_memctl *)gcontext);
|
||||
if (jit_stack == NULL) return NULL;
|
||||
|
@ -96,7 +96,11 @@ for (i = 0; i < 256; i++) *p++ = tolower(i);
|
||||
|
||||
/* Next the case-flipping table */
|
||||
|
||||
for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i);
|
||||
for (i = 0; i < 256; i++)
|
||||
{
|
||||
int c = islower(i)? toupper(i) : tolower(i);
|
||||
*p++ = (c < 256)? c : i;
|
||||
}
|
||||
|
||||
/* Then the character class tables. Don't try to be clever and save effort on
|
||||
exclusive ones - in some locales things may be different.
|
||||
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2015-2022 University of Cambridge
|
||||
New API code Copyright (c) 2015-2024 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -41,6 +41,8 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "pcre2_config.h"
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
/* These defines enable debugging code */
|
||||
|
||||
/* #define DEBUG_FRAMES_DISPLAY */
|
||||
@ -51,6 +53,10 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
#include <stdarg.h>
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG_SHOW_OPS
|
||||
static const char *OP_names[] = { OP_NAME_LIST };
|
||||
#endif
|
||||
|
||||
/* These defines identify the name of the block containing "static"
|
||||
information, and fields within it. */
|
||||
|
||||
@ -58,8 +64,6 @@ information, and fields within it. */
|
||||
#define PSSTART start_subject /* Field containing processed string start */
|
||||
#define PSEND end_subject /* Field containing processed string end */
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
#define RECURSE_UNSET 0xffffffffu /* Bigger than max group number */
|
||||
|
||||
/* Masks for identifying the public options that are permitted at match time. */
|
||||
@ -67,7 +71,8 @@ information, and fields within it. */
|
||||
#define PUBLIC_MATCH_OPTIONS \
|
||||
(PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
|
||||
PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
|
||||
PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT|PCRE2_COPY_MATCHED_SUBJECT)
|
||||
PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT|PCRE2_COPY_MATCHED_SUBJECT| \
|
||||
PCRE2_DISABLE_RECURSELOOP_CHECK)
|
||||
|
||||
#define PUBLIC_JIT_MATCH_OPTIONS \
|
||||
(PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\
|
||||
@ -148,7 +153,7 @@ changed, the code at RETURN_SWITCH below must be updated in sync. */
|
||||
enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
|
||||
RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
|
||||
RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
|
||||
RM31, RM32, RM33, RM34, RM35, RM36 };
|
||||
RM31, RM32, RM33, RM34, RM35, RM36, RM37 };
|
||||
|
||||
#ifdef SUPPORT_WIDE_CHARS
|
||||
enum { RM100=100, RM101 };
|
||||
@ -595,11 +600,12 @@ heapframe *P = NULL;
|
||||
|
||||
heapframe *frames_top; /* End of frames vector */
|
||||
heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */
|
||||
PCRE2_SIZE heapframes_size; /* Usable size of frames vector */
|
||||
PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */
|
||||
|
||||
/* Local variables that do not need to be preserved over calls to RRMATCH(). */
|
||||
|
||||
PCRE2_SPTR branch_end = NULL;
|
||||
PCRE2_SPTR branch_start;
|
||||
PCRE2_SPTR bracode; /* Temp pointer to start of group */
|
||||
PCRE2_SIZE offset; /* Used for group offsets */
|
||||
PCRE2_SIZE length; /* Used for various length calculations */
|
||||
@ -633,13 +639,10 @@ copied when a new frame is created. */
|
||||
|
||||
frame_copy_size = frame_size - offsetof(heapframe, eptr);
|
||||
|
||||
/* Set up the first frame and the end of the frames vector. We set the local
|
||||
heapframes_size to the usable amount of the vector, that is, a whole number of
|
||||
frames. */
|
||||
/* Set up the first frame and the end of the frames vector. */
|
||||
|
||||
F = match_data->heapframes;
|
||||
heapframes_size = (match_data->heapframes_size / frame_size) * frame_size;
|
||||
frames_top = (heapframe *)((char *)F + heapframes_size);
|
||||
frames_top = (heapframe *)((char *)F + match_data->heapframes_size);
|
||||
|
||||
Frdepth = 0; /* "Recursion" depth */
|
||||
Fcapture_last = 0; /* Number of most recent capture */
|
||||
@ -660,35 +663,54 @@ MATCH_RECURSE:
|
||||
doubling the size, but constrained by the heap limit (which is in KiB). */
|
||||
|
||||
N = (heapframe *)((char *)F + frame_size);
|
||||
if (N >= frames_top)
|
||||
if ((heapframe *)((char *)N + frame_size) >= frames_top)
|
||||
{
|
||||
heapframe *new;
|
||||
PCRE2_SIZE newsize = match_data->heapframes_size * 2;
|
||||
PCRE2_SIZE newsize;
|
||||
PCRE2_SIZE usedsize = (char *)N - (char *)(match_data->heapframes);
|
||||
|
||||
if (newsize > mb->heap_limit)
|
||||
if (match_data->heapframes_size >= PCRE2_SIZE_MAX / 2)
|
||||
{
|
||||
PCRE2_SIZE maxsize = (mb->heap_limit/frame_size) * frame_size;
|
||||
if (match_data->heapframes_size >= maxsize) return PCRE2_ERROR_HEAPLIMIT;
|
||||
newsize = maxsize;
|
||||
if (match_data->heapframes_size == PCRE2_SIZE_MAX - 1)
|
||||
return PCRE2_ERROR_NOMEMORY;
|
||||
newsize = PCRE2_SIZE_MAX - 1;
|
||||
}
|
||||
else
|
||||
newsize = match_data->heapframes_size * 2;
|
||||
|
||||
if (newsize / 1024 >= mb->heap_limit)
|
||||
{
|
||||
PCRE2_SIZE old_size = match_data->heapframes_size / 1024;
|
||||
if (mb->heap_limit <= old_size)
|
||||
return PCRE2_ERROR_HEAPLIMIT;
|
||||
else
|
||||
{
|
||||
PCRE2_SIZE max_delta = 1024 * (mb->heap_limit - old_size);
|
||||
int over_bytes = match_data->heapframes_size % 1024;
|
||||
if (over_bytes) max_delta -= (1024 - over_bytes);
|
||||
newsize = match_data->heapframes_size + max_delta;
|
||||
}
|
||||
}
|
||||
|
||||
/* With a heap limit set, the permitted additional size may not be enough for
|
||||
another frame, so do a final check. */
|
||||
|
||||
if (newsize - usedsize < frame_size) return PCRE2_ERROR_HEAPLIMIT;
|
||||
new = match_data->memctl.malloc(newsize, match_data->memctl.memory_data);
|
||||
if (new == NULL) return PCRE2_ERROR_NOMEMORY;
|
||||
memcpy(new, match_data->heapframes, heapframes_size);
|
||||
memcpy(new, match_data->heapframes, usedsize);
|
||||
|
||||
F = (heapframe *)((char *)new + ((char *)F - (char *)match_data->heapframes));
|
||||
N = (heapframe *)((char *)F + frame_size);
|
||||
N = (heapframe *)((char *)new + usedsize);
|
||||
F = (heapframe *)((char *)N - frame_size);
|
||||
|
||||
match_data->memctl.free(match_data->heapframes, match_data->memctl.memory_data);
|
||||
match_data->heapframes = new;
|
||||
match_data->heapframes_size = newsize;
|
||||
|
||||
heapframes_size = (newsize / frame_size) * frame_size;
|
||||
frames_top = (heapframe *)((char *)new + heapframes_size);
|
||||
frames_top = (heapframe *)((char *)new + newsize);
|
||||
}
|
||||
|
||||
#ifdef DEBUG_SHOW_RMATCH
|
||||
fprintf(stderr, "++ RMATCH %2d frame=%d", Freturn_id, Frdepth + 1);
|
||||
fprintf(stderr, "++ RMATCH %d frame=%d", Freturn_id, Frdepth + 1);
|
||||
if (group_frame_type != 0)
|
||||
{
|
||||
fprintf(stderr, " type=%x ", group_frame_type);
|
||||
@ -758,10 +780,16 @@ opcodes. */
|
||||
if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
|
||||
if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
|
||||
|
||||
#ifdef DEBUG_SHOW_OPS
|
||||
fprintf(stderr, "\n++ New frame: type=0x%x subject offset %ld\n",
|
||||
GF_IDMASK(Fgroup_frame_type), Feptr - mb->start_subject);
|
||||
#endif
|
||||
|
||||
for (;;)
|
||||
{
|
||||
#ifdef DEBUG_SHOW_OPS
|
||||
fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
|
||||
OP_names[*Fecode]);
|
||||
#endif
|
||||
|
||||
Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */
|
||||
@ -809,15 +837,16 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
assert_accept_frame = F;
|
||||
RRETURN(MATCH_ACCEPT);
|
||||
|
||||
/* If recursing, we have to find the most recent recursion. */
|
||||
/* For ACCEPT within a recursion, we have to find the most recent
|
||||
recursion. If not in a recursion, fall through to code that is common with
|
||||
OP_END. */
|
||||
|
||||
case OP_ACCEPT:
|
||||
case OP_END:
|
||||
|
||||
/* Handle end of a recursion. */
|
||||
|
||||
if (Fcurrent_recurse != RECURSE_UNSET)
|
||||
{
|
||||
#ifdef DEBUG_SHOW_OPS
|
||||
fprintf(stderr, "++ Accept within recursion\n");
|
||||
#endif
|
||||
offset = Flast_group_offset;
|
||||
for(;;)
|
||||
{
|
||||
@ -840,27 +869,49 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
Fecode += 1 + LINK_SIZE;
|
||||
continue;
|
||||
}
|
||||
/* Fall through */
|
||||
|
||||
/* Not a recursion. Fail for an empty string match if either PCRE2_NOTEMPTY
|
||||
is set, or if PCRE2_NOTEMPTY_ATSTART is set and we have matched at the
|
||||
start of the subject. In both cases, backtracking will then try other
|
||||
alternatives, if any. */
|
||||
/* OP_END itself can never be reached within a recursion because that is
|
||||
picked up when the OP_KET that always precedes OP_END is reached. */
|
||||
|
||||
case OP_END:
|
||||
|
||||
/* Fail for an empty string match if either PCRE2_NOTEMPTY is set, or if
|
||||
PCRE2_NOTEMPTY_ATSTART is set and we have matched at the start of the
|
||||
subject. In both cases, backtracking will then try other alternatives, if
|
||||
any. */
|
||||
|
||||
if (Feptr == Fstart_match &&
|
||||
((mb->moptions & PCRE2_NOTEMPTY) != 0 ||
|
||||
((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 &&
|
||||
Fstart_match == mb->start_subject + mb->start_offset)))
|
||||
{
|
||||
#ifdef DEBUG_SHOW_OPS
|
||||
fprintf(stderr, "++ Backtrack because empty string\n");
|
||||
#endif
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
}
|
||||
|
||||
/* Also fail if PCRE2_ENDANCHORED is set and the end of the match is not
|
||||
/* Fail if PCRE2_ENDANCHORED is set and the end of the match is not
|
||||
the end of the subject. After (*ACCEPT) we fail the entire match (at this
|
||||
position) but backtrack on reaching the end of the pattern. */
|
||||
position) but backtrack if we've reached the end of the pattern. This
|
||||
applies whether or not we are in a recursion. */
|
||||
|
||||
if (Feptr < mb->end_subject &&
|
||||
((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0)
|
||||
{
|
||||
if (Fop == OP_END) RRETURN(MATCH_NOMATCH);
|
||||
return MATCH_NOMATCH;
|
||||
if (Fop == OP_END)
|
||||
{
|
||||
#ifdef DEBUG_SHOW_OPS
|
||||
fprintf(stderr, "++ Backtrack because not at end (endanchored set)\n");
|
||||
#endif
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
}
|
||||
|
||||
#ifdef DEBUG_SHOW_OPS
|
||||
fprintf(stderr, "++ Failed ACCEPT not at end (endanchnored set)\n");
|
||||
#endif
|
||||
return MATCH_NOMATCH; /* (*ACCEPT) */
|
||||
}
|
||||
|
||||
/* We have a successful match of the whole pattern. Record the result and
|
||||
@ -2433,6 +2484,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
GETCHARINCTEST(fc, Feptr);
|
||||
{
|
||||
const uint32_t *cp;
|
||||
uint32_t chartype;
|
||||
const ucd_record *prop = GET_UCD(fc);
|
||||
BOOL notmatch = Fop == OP_NOTPROP;
|
||||
|
||||
@ -2443,9 +2495,10 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
break;
|
||||
|
||||
case PT_LAMP:
|
||||
if ((prop->chartype == ucp_Lu ||
|
||||
prop->chartype == ucp_Ll ||
|
||||
prop->chartype == ucp_Lt) == notmatch)
|
||||
chartype = prop->chartype;
|
||||
if ((chartype == ucp_Lu ||
|
||||
chartype == ucp_Ll ||
|
||||
chartype == ucp_Lt) == notmatch)
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
break;
|
||||
|
||||
@ -2475,8 +2528,9 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
/* These are specials */
|
||||
|
||||
case PT_ALNUM:
|
||||
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[prop->chartype] == ucp_N) == notmatch)
|
||||
chartype = prop->chartype;
|
||||
if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[chartype] == ucp_N) == notmatch)
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
break;
|
||||
|
||||
@ -2501,13 +2555,22 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
break;
|
||||
|
||||
case PT_WORD:
|
||||
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
|
||||
fc == CHAR_UNDERSCORE) == notmatch)
|
||||
chartype = prop->chartype;
|
||||
if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[chartype] == ucp_N ||
|
||||
chartype == ucp_Mn ||
|
||||
chartype == ucp_Pc) == notmatch)
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
break;
|
||||
|
||||
case PT_CLIST:
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
if (fc > MAX_UTF_CODE_POINT)
|
||||
{
|
||||
if (notmatch) break;;
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
}
|
||||
#endif
|
||||
cp = PRIV(ucd_caseless_sets) + Fecode[2];
|
||||
for (;;)
|
||||
{
|
||||
@ -2803,16 +2866,17 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
case PT_WORD:
|
||||
for (i = 1; i <= Lmin; i++)
|
||||
{
|
||||
int category;
|
||||
int chartype, category;
|
||||
if (Feptr >= mb->end_subject)
|
||||
{
|
||||
SCHECK_PARTIAL();
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
}
|
||||
GETCHARINCTEST(fc, Feptr);
|
||||
category = UCD_CATEGORY(fc);
|
||||
chartype = UCD_CHARTYPE(fc);
|
||||
category = PRIV(ucp_gentype)[chartype];
|
||||
if ((category == ucp_L || category == ucp_N ||
|
||||
fc == CHAR_UNDERSCORE) == notmatch)
|
||||
chartype == ucp_Mn || chartype == ucp_Pc) == notmatch)
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
}
|
||||
break;
|
||||
@ -2827,6 +2891,13 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
}
|
||||
GETCHARINCTEST(fc, Feptr);
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
if (fc > MAX_UTF_CODE_POINT)
|
||||
{
|
||||
if (notmatch) continue;
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
}
|
||||
#endif
|
||||
cp = PRIV(ucd_caseless_sets) + Lpropvalue;
|
||||
for (;;)
|
||||
{
|
||||
@ -3607,7 +3678,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
case PT_WORD:
|
||||
for (;;)
|
||||
{
|
||||
int category;
|
||||
int chartype, category;
|
||||
RMATCH(Fecode, RM215);
|
||||
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
||||
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
|
||||
@ -3617,10 +3688,12 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
}
|
||||
GETCHARINCTEST(fc, Feptr);
|
||||
category = UCD_CATEGORY(fc);
|
||||
chartype = UCD_CHARTYPE(fc);
|
||||
category = PRIV(ucp_gentype)[chartype];
|
||||
if ((category == ucp_L ||
|
||||
category == ucp_N ||
|
||||
fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
|
||||
chartype == ucp_Mn ||
|
||||
chartype == ucp_Pc) == (Lctype == OP_NOTPROP))
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
}
|
||||
/* Control never gets here */
|
||||
@ -3638,6 +3711,13 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
}
|
||||
GETCHARINCTEST(fc, Feptr);
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
if (fc > MAX_UTF_CODE_POINT)
|
||||
{
|
||||
if (Lctype == OP_NOTPROP) continue;
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
}
|
||||
#endif
|
||||
cp = PRIV(ucd_caseless_sets) + Lpropvalue;
|
||||
for (;;)
|
||||
{
|
||||
@ -4188,7 +4268,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
case PT_WORD:
|
||||
for (i = Lmin; i < Lmax; i++)
|
||||
{
|
||||
int category;
|
||||
int chartype, category;
|
||||
int len = 1;
|
||||
if (Feptr >= mb->end_subject)
|
||||
{
|
||||
@ -4196,9 +4276,12 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
break;
|
||||
}
|
||||
GETCHARLENTEST(fc, Feptr, len);
|
||||
category = UCD_CATEGORY(fc);
|
||||
if ((category == ucp_L || category == ucp_N ||
|
||||
fc == CHAR_UNDERSCORE) == notmatch)
|
||||
chartype = UCD_CHARTYPE(fc);
|
||||
category = PRIV(ucp_gentype)[chartype];
|
||||
if ((category == ucp_L ||
|
||||
category == ucp_N ||
|
||||
chartype == ucp_Mn ||
|
||||
chartype == ucp_Pc) == notmatch)
|
||||
break;
|
||||
Feptr+= len;
|
||||
}
|
||||
@ -4215,6 +4298,14 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
break;
|
||||
}
|
||||
GETCHARLENTEST(fc, Feptr, len);
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
if (fc > MAX_UTF_CODE_POINT)
|
||||
{
|
||||
if (!notmatch) goto GOT_MAX;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
cp = PRIV(ucd_caseless_sets) + Lpropvalue;
|
||||
for (;;)
|
||||
{
|
||||
@ -4223,6 +4314,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
if (fc == *cp++)
|
||||
{ if (notmatch) goto GOT_MAX; else break; }
|
||||
}
|
||||
}
|
||||
|
||||
Feptr += len;
|
||||
}
|
||||
GOT_MAX:
|
||||
@ -5320,9 +5413,11 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
|
||||
|
||||
/* ===================================================================== */
|
||||
/* Recursion either matches the current regex, or some subexpression. The
|
||||
offset data is the offset to the starting bracket from the start of the
|
||||
whole pattern. (This is so that it works from duplicated subpatterns.) */
|
||||
/* Pattern recursion either matches the current regex, or some
|
||||
subexpression. The offset data is the offset to the starting bracket from
|
||||
the start of the whole pattern. This is so that it works from duplicated
|
||||
subpatterns. For a whole-pattern recursion, we have to infer the number
|
||||
zero. */
|
||||
|
||||
#define Lframe_type F->temp_32[0]
|
||||
#define Lstart_branch F->temp_sptr[0]
|
||||
@ -5331,9 +5426,12 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
bracode = mb->start_code + GET(Fecode, 1);
|
||||
number = (bracode == mb->start_code)? 0 : GET2(bracode, 1 + LINK_SIZE);
|
||||
|
||||
/* If we are already in a recursion, check for repeating the same one
|
||||
without advancing the subject pointer. This should catch convoluted mutual
|
||||
recursions. (Some simple cases are caught at compile time.) */
|
||||
/* If we are already in a pattern recursion, check for repeating the same
|
||||
one without changing the subject pointer or the last referenced character
|
||||
in the subject. This should catch convoluted mutual recursions; some
|
||||
simple cases are caught at compile time. However, there are rare cases when
|
||||
this check needs to be turned off. In this case, actual recursion loops
|
||||
will be caught by the match or heap limits. */
|
||||
|
||||
if (Fcurrent_recurse != RECURSE_UNSET)
|
||||
{
|
||||
@ -5344,15 +5442,19 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
P = (heapframe *)((char *)N - frame_size);
|
||||
if (N->group_frame_type == (GF_RECURSE | number))
|
||||
{
|
||||
if (Feptr == P->eptr) return PCRE2_ERROR_RECURSELOOP;
|
||||
if (Feptr == P->eptr && mb->last_used_ptr == P->recurse_last_used &&
|
||||
(mb->moptions & PCRE2_DISABLE_RECURSELOOP_CHECK) == 0)
|
||||
return PCRE2_ERROR_RECURSELOOP;
|
||||
break;
|
||||
}
|
||||
offset = P->last_group_offset;
|
||||
}
|
||||
}
|
||||
|
||||
/* Now run the recursion, branch by branch. */
|
||||
/* Remember the current last referenced character and then run the
|
||||
recursion branch by branch. */
|
||||
|
||||
F->recurse_last_used = mb->last_used_ptr;
|
||||
Lstart_branch = bracode;
|
||||
Lframe_type = GF_RECURSE | number;
|
||||
|
||||
@ -5681,13 +5783,13 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
|
||||
|
||||
/* ===================================================================== */
|
||||
/* Move the subject pointer back. This occurs only at the start of each
|
||||
branch of a lookbehind assertion. If we are too close to the start to move
|
||||
back, fail. When working with UTF-8 we move back a number of characters,
|
||||
not bytes. */
|
||||
/* Move the subject pointer back by one fixed amount. This occurs at the
|
||||
start of each branch that has a fixed length in a lookbehind assertion. If
|
||||
we are too close to the start to move back, fail. When working with UTF-8
|
||||
we move back a number of characters, not bytes. */
|
||||
|
||||
case OP_REVERSE:
|
||||
number = GET(Fecode, 1);
|
||||
number = GET2(Fecode, 1);
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if (utf)
|
||||
{
|
||||
@ -5701,7 +5803,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
else
|
||||
#endif
|
||||
|
||||
/* No UTF-8 support, or not in UTF-8 mode: count is code unit count */
|
||||
/* No UTF support, or not in UTF mode: count is code unit count */
|
||||
|
||||
{
|
||||
if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH);
|
||||
@ -5711,15 +5813,84 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
/* Save the earliest consulted character, then skip to next opcode */
|
||||
|
||||
if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr;
|
||||
Fecode += 1 + LINK_SIZE;
|
||||
Fecode += 1 + IMM2_SIZE;
|
||||
break;
|
||||
|
||||
|
||||
/* ===================================================================== */
|
||||
/* Move the subject pointer back by a variable amount. This occurs at the
|
||||
start of each branch of a lookbehind assertion when the branch has a
|
||||
variable, but limited, length. A loop is needed to try matching the branch
|
||||
after moving back different numbers of characters. If we are too close to
|
||||
the start to move back even the minimum amount, fail. When working with
|
||||
UTF-8 we move back a number of characters, not bytes. */
|
||||
|
||||
#define Lmin F->temp_32[0]
|
||||
#define Lmax F->temp_32[1]
|
||||
#define Leptr F->temp_sptr[0]
|
||||
|
||||
case OP_VREVERSE:
|
||||
Lmin = GET2(Fecode, 1);
|
||||
Lmax = GET2(Fecode, 1 + IMM2_SIZE);
|
||||
Leptr = Feptr;
|
||||
|
||||
/* Move back by the maximum branch length and then work forwards. This
|
||||
ensures that items such as \d{3,5} get the maximum length, which is
|
||||
relevant for captures, and makes for Perl compatibility. */
|
||||
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if (utf)
|
||||
{
|
||||
for (i = 0; i < Lmax; i++)
|
||||
{
|
||||
if (Feptr == mb->start_subject)
|
||||
{
|
||||
if (i < Lmin) RRETURN(MATCH_NOMATCH);
|
||||
Lmax = i;
|
||||
break;
|
||||
}
|
||||
Feptr--;
|
||||
BACKCHAR(Feptr);
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
|
||||
/* No UTF support or not in UTF mode */
|
||||
|
||||
{
|
||||
ptrdiff_t diff = Feptr - mb->start_subject;
|
||||
uint32_t available = (diff > 65535)? 65535 : ((diff > 0)? (int)diff : 0);
|
||||
if (Lmin > available) RRETURN(MATCH_NOMATCH);
|
||||
if (Lmax > available) Lmax = available;
|
||||
Feptr -= Lmax;
|
||||
}
|
||||
|
||||
/* Now try matching, moving forward one character on failure, until we
|
||||
reach the minimum back length. */
|
||||
|
||||
for (;;)
|
||||
{
|
||||
RMATCH(Fecode + 1 + 2 * IMM2_SIZE, RM37);
|
||||
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
|
||||
if (Lmax-- <= Lmin) RRETURN(MATCH_NOMATCH);
|
||||
Feptr++;
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if (utf) { FORWARDCHARTEST(Feptr, mb->end_subject); }
|
||||
#endif
|
||||
}
|
||||
/* Control never reaches here */
|
||||
|
||||
#undef Lmin
|
||||
#undef Lmax
|
||||
#undef Leptr
|
||||
|
||||
/* ===================================================================== */
|
||||
/* An alternation is the end of a branch; scan along to find the end of the
|
||||
bracketed group. */
|
||||
|
||||
case OP_ALT:
|
||||
branch_end = Fecode;
|
||||
do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
|
||||
break;
|
||||
|
||||
@ -5727,7 +5898,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
/* ===================================================================== */
|
||||
/* The end of a parenthesized group. For all but OP_BRA and OP_COND, the
|
||||
starting frame was added to the chained frames in order to remember the
|
||||
starting subject position for the group. */
|
||||
starting subject position for the group. (Not true for OP_BRA when it's a
|
||||
whole pattern recursion, but that is handled separately below.) */
|
||||
|
||||
case OP_KET:
|
||||
case OP_KETRMIN:
|
||||
@ -5736,8 +5908,14 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
|
||||
bracode = Fecode - GET(Fecode, 1);
|
||||
|
||||
/* Point N to the frame at the start of the most recent group.
|
||||
Remember the subject pointer at the start of the group. */
|
||||
if (branch_end == NULL) branch_end = Fecode;
|
||||
branch_start = bracode;
|
||||
while (branch_start + GET(branch_start, 1) != branch_end)
|
||||
branch_start += GET(branch_start, 1);
|
||||
branch_end = NULL;
|
||||
|
||||
/* Point N to the frame at the start of the most recent group, and P to its
|
||||
predecessor. Remember the subject pointer at the start of the group. */
|
||||
|
||||
if (*bracode != OP_BRA && *bracode != OP_COND)
|
||||
{
|
||||
@ -5773,27 +5951,64 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
|
||||
switch (*bracode)
|
||||
{
|
||||
case OP_BRA: /* No need to do anything for these */
|
||||
case OP_COND:
|
||||
/* Whole pattern recursion is handled as a recursion into group 0, but
|
||||
the entire pattern is wrapped in OP_BRA/OP_KET rather than a capturing
|
||||
group - a design mistake: it should perhaps have been capture group 0.
|
||||
Anyway, that means the end of such recursion must be handled here. It is
|
||||
detected by checking for an immediately following OP_END when we are
|
||||
recursing in group 0. If this is not the end of a whole-pattern
|
||||
recursion, there is nothing to be done. */
|
||||
|
||||
case OP_BRA:
|
||||
if (Fcurrent_recurse != 0 || Fecode[1+LINK_SIZE] != OP_END) break;
|
||||
|
||||
/* It is the end of whole-pattern recursion. */
|
||||
|
||||
offset = Flast_group_offset;
|
||||
if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
|
||||
N = (heapframe *)((char *)match_data->heapframes + offset);
|
||||
P = (heapframe *)((char *)N - frame_size);
|
||||
Flast_group_offset = P->last_group_offset;
|
||||
|
||||
/* Reinstate the previous set of captures and then carry on after the
|
||||
recursion call. */
|
||||
|
||||
memcpy((char *)F + offsetof(heapframe, ovector), P->ovector,
|
||||
Foffset_top * sizeof(PCRE2_SIZE));
|
||||
Foffset_top = P->offset_top;
|
||||
Fcapture_last = P->capture_last;
|
||||
Fcurrent_recurse = P->current_recurse;
|
||||
Fecode = P->ecode + 1 + LINK_SIZE;
|
||||
continue; /* With next opcode */
|
||||
|
||||
case OP_COND: /* No need to do anything for these */
|
||||
case OP_SCOND:
|
||||
break;
|
||||
|
||||
/* Non-atomic positive assertions are like OP_BRA, except that the
|
||||
subject pointer must be put back to where it was at the start of the
|
||||
assertion. */
|
||||
assertion. For a variable lookbehind, check its end point. */
|
||||
|
||||
case OP_ASSERTBACK_NA:
|
||||
if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
/* Fall through */
|
||||
|
||||
case OP_ASSERT_NA:
|
||||
case OP_ASSERTBACK_NA:
|
||||
if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
|
||||
Feptr = P->eptr;
|
||||
break;
|
||||
|
||||
/* Atomic positive assertions are like OP_ONCE, except that in addition
|
||||
the subject pointer must be put back to where it was at the start of the
|
||||
assertion. */
|
||||
assertion. For a variable lookbehind, check its end point. */
|
||||
|
||||
case OP_ASSERTBACK:
|
||||
if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
/* Fall through */
|
||||
|
||||
case OP_ASSERT:
|
||||
case OP_ASSERTBACK:
|
||||
if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
|
||||
Feptr = P->eptr;
|
||||
/* Fall through */
|
||||
@ -5814,10 +6029,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
break;
|
||||
|
||||
/* A matching negative assertion returns MATCH, which is turned into
|
||||
NOMATCH at the assertion level. */
|
||||
NOMATCH at the assertion level. For a variable lookbehind, check its end
|
||||
point. */
|
||||
|
||||
case OP_ASSERTBACK_NOT:
|
||||
if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
/* Fall through */
|
||||
|
||||
case OP_ASSERT_NOT:
|
||||
case OP_ASSERTBACK_NOT:
|
||||
RRETURN(MATCH_MATCH);
|
||||
|
||||
/* At the end of a script run, apply the script-checking rules. This code
|
||||
@ -5828,9 +6048,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH);
|
||||
break;
|
||||
|
||||
/* Whole-pattern recursion is coded as a recurse into group 0, so it
|
||||
won't be picked up here. Instead, we catch it when the OP_END is reached.
|
||||
Other recursion is handled here. */
|
||||
/* Whole-pattern recursion is coded as a recurse into group 0, and is
|
||||
handled with OP_BRA above. Other recursion is handled here. */
|
||||
|
||||
case OP_CBRA:
|
||||
case OP_CBRAPOS:
|
||||
@ -5845,7 +6064,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
{
|
||||
P = (heapframe *)((char *)N - frame_size);
|
||||
memcpy((char *)F + offsetof(heapframe, ovector), P->ovector,
|
||||
P->offset_top * sizeof(PCRE2_SIZE));
|
||||
Foffset_top * sizeof(PCRE2_SIZE));
|
||||
Foffset_top = P->offset_top;
|
||||
Fcapture_last = P->capture_last;
|
||||
Fcurrent_recurse = P->current_recurse;
|
||||
@ -5928,10 +6147,10 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS;
|
||||
|
||||
/* Fall through */
|
||||
/* Unconditional end of subject assertion (\z) */
|
||||
/* Unconditional end of subject assertion (\z). */
|
||||
|
||||
case OP_EOD:
|
||||
if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH);
|
||||
if (Feptr < mb->true_end_subject) RRETURN(MATCH_NOMATCH);
|
||||
if (mb->partial != 0)
|
||||
{
|
||||
mb->hitend = TRUE;
|
||||
@ -6043,6 +6262,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
|
||||
case OP_NOT_WORD_BOUNDARY:
|
||||
case OP_WORD_BOUNDARY:
|
||||
case OP_NOT_UCP_WORD_BOUNDARY:
|
||||
case OP_UCP_WORD_BOUNDARY:
|
||||
if (Feptr == mb->check_subject) prev_is_word = FALSE; else
|
||||
{
|
||||
PCRE2_SPTR lastptr = Feptr - 1;
|
||||
@ -6057,13 +6278,12 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
fc = *lastptr;
|
||||
if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr;
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if ((mb->poptions & PCRE2_UCP) != 0)
|
||||
if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY)
|
||||
{
|
||||
if (fc == '_') prev_is_word = TRUE; else
|
||||
{
|
||||
int cat = UCD_CATEGORY(fc);
|
||||
prev_is_word = (cat == ucp_L || cat == ucp_N);
|
||||
}
|
||||
int chartype = UCD_CHARTYPE(fc);
|
||||
int category = PRIV(ucp_gentype)[chartype];
|
||||
prev_is_word = (category == ucp_L || category == ucp_N ||
|
||||
chartype == ucp_Mn || chartype == ucp_Pc);
|
||||
}
|
||||
else
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
@ -6091,13 +6311,12 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
fc = *Feptr;
|
||||
if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr;
|
||||
#ifdef SUPPORT_UNICODE
|
||||
if ((mb->poptions & PCRE2_UCP) != 0)
|
||||
if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY)
|
||||
{
|
||||
if (fc == '_') cur_is_word = TRUE; else
|
||||
{
|
||||
int cat = UCD_CATEGORY(fc);
|
||||
cur_is_word = (cat == ucp_L || cat == ucp_N);
|
||||
}
|
||||
int chartype = UCD_CHARTYPE(fc);
|
||||
int category = PRIV(ucp_gentype)[chartype];
|
||||
cur_is_word = (category == ucp_L || category == ucp_N ||
|
||||
chartype == ucp_Mn || chartype == ucp_Pc);
|
||||
}
|
||||
else
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
@ -6106,7 +6325,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
|
||||
|
||||
/* Now see if the situation is what we want */
|
||||
|
||||
if ((*Fecode++ == OP_WORD_BOUNDARY)?
|
||||
if ((*Fecode++ == OP_WORD_BOUNDARY || Fop == OP_UCP_WORD_BOUNDARY)?
|
||||
cur_is_word == prev_is_word : cur_is_word != prev_is_word)
|
||||
RRETURN(MATCH_NOMATCH);
|
||||
break;
|
||||
@ -6252,7 +6471,7 @@ F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */
|
||||
mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */
|
||||
|
||||
#ifdef DEBUG_SHOW_RMATCH
|
||||
fprintf(stderr, "++ RETURN %d to %d\n", rrc, Freturn_id);
|
||||
fprintf(stderr, "++ RETURN %d to RM%d\n", rrc, Freturn_id);
|
||||
#endif
|
||||
|
||||
switch (Freturn_id)
|
||||
@ -6261,7 +6480,7 @@ switch (Freturn_id)
|
||||
LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
|
||||
LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
|
||||
LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
|
||||
LBL(33) LBL(34) LBL(35) LBL(36)
|
||||
LBL(33) LBL(34) LBL(35) LBL(36) LBL(37)
|
||||
|
||||
#ifdef SUPPORT_WIDE_CHARS
|
||||
LBL(100) LBL(101)
|
||||
@ -6549,6 +6768,7 @@ if (use_jit)
|
||||
match_data, mcontext);
|
||||
if (rc != PCRE2_ERROR_JIT_BADOPTION)
|
||||
{
|
||||
match_data->subject_length = length;
|
||||
if (rc >= 0 && (options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
|
||||
{
|
||||
length = CU2BYTES(length + was_zero_terminated);
|
||||
@ -6717,7 +6937,7 @@ if (mcontext == NULL)
|
||||
else mb->memctl = mcontext->memctl;
|
||||
|
||||
anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0;
|
||||
firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
|
||||
firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0;
|
||||
startline = (re->flags & PCRE2_STARTLINE) != 0;
|
||||
bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)?
|
||||
true_end_subject : subject + mcontext->offset_limit;
|
||||
@ -6740,6 +6960,7 @@ mb->callout_data = mcontext->callout_data;
|
||||
mb->start_subject = subject;
|
||||
mb->start_offset = start_offset;
|
||||
mb->end_subject = end_subject;
|
||||
mb->true_end_subject = true_end_subject;
|
||||
mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
|
||||
mb->allowemptypartial = (re->max_lookbehind > 0) ||
|
||||
(re->flags & PCRE2_MATCH_EMPTY) != 0;
|
||||
@ -6799,7 +7020,7 @@ the pattern. It is not used at all if there are no capturing parentheses.
|
||||
|
||||
frame_size is the total size of each frame
|
||||
match_data->heapframes is the pointer to the frames vector
|
||||
match_data->heapframes_size is the total size of the vector
|
||||
match_data->heapframes_size is the allocated size of the vector
|
||||
|
||||
We must pad the frame_size for alignment to ensure subsequent frames are as
|
||||
aligned as heapframe. Whilst ovector is word-aligned due to being a PCRE2_SIZE
|
||||
@ -6814,7 +7035,7 @@ frame_size = (offsetof(heapframe, ovector) +
|
||||
smaller. */
|
||||
|
||||
mb->heap_limit = ((mcontext->heap_limit < re->limit_heap)?
|
||||
mcontext->heap_limit : re->limit_heap) * 1024;
|
||||
mcontext->heap_limit : re->limit_heap);
|
||||
|
||||
mb->match_limit = (mcontext->match_limit < re->limit_match)?
|
||||
mcontext->match_limit : re->limit_match;
|
||||
@ -6825,19 +7046,19 @@ mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)?
|
||||
/* If a pattern has very many capturing parentheses, the frame size may be very
|
||||
large. Set the initial frame vector size to ensure that there are at least 10
|
||||
available frames, but enforce a minimum of START_FRAMES_SIZE. If this is
|
||||
greater than the heap limit, get as large a vector as possible. Always round
|
||||
the size to a multiple of the frame size. */
|
||||
greater than the heap limit, get as large a vector as possible. */
|
||||
|
||||
heapframes_size = frame_size * 10;
|
||||
if (heapframes_size < START_FRAMES_SIZE) heapframes_size = START_FRAMES_SIZE;
|
||||
if (heapframes_size > mb->heap_limit)
|
||||
if (heapframes_size / 1024 > mb->heap_limit)
|
||||
{
|
||||
if (frame_size > mb->heap_limit ) return PCRE2_ERROR_HEAPLIMIT;
|
||||
heapframes_size = mb->heap_limit;
|
||||
PCRE2_SIZE max_size = 1024 * mb->heap_limit;
|
||||
if (max_size < frame_size) return PCRE2_ERROR_HEAPLIMIT;
|
||||
heapframes_size = max_size;
|
||||
}
|
||||
|
||||
/* If an existing frame vector in the match_data block is large enough, we can
|
||||
use it.Otherwise, free any pre-existing vector and get a new one. */
|
||||
use it. Otherwise, free any pre-existing vector and get a new one. */
|
||||
|
||||
if (match_data->heapframes_size < heapframes_size)
|
||||
{
|
||||
@ -7284,9 +7505,17 @@ for(;;)
|
||||
mb->end_offset_top = 0;
|
||||
mb->skip_arg_count = 0;
|
||||
|
||||
#ifdef DEBUG_SHOW_OPS
|
||||
fprintf(stderr, "++ Calling match()\n");
|
||||
#endif
|
||||
|
||||
rc = match(start_match, mb->start_code, re->top_bracket, frame_size,
|
||||
match_data, mb);
|
||||
|
||||
#ifdef DEBUG_SHOW_OPS
|
||||
fprintf(stderr, "++ match() returned %d\n\n", rc);
|
||||
#endif
|
||||
|
||||
if (mb->hitend && start_partial == NULL)
|
||||
{
|
||||
start_partial = mb->start_used_ptr;
|
||||
@ -7434,6 +7663,7 @@ if (utf && end_subject != true_end_subject &&
|
||||
if (start_match >= true_end_subject)
|
||||
{
|
||||
rc = MATCH_NOMATCH; /* In case it was partial */
|
||||
match_partial = NULL;
|
||||
break;
|
||||
}
|
||||
|
||||
@ -7483,6 +7713,7 @@ if (rc == MATCH_MATCH)
|
||||
{
|
||||
match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)?
|
||||
0 : (int)mb->end_offset_top/2 + 1;
|
||||
match_data->subject_length = length;
|
||||
match_data->startchar = start_match - subject;
|
||||
match_data->leftchar = mb->start_used_ptr - subject;
|
||||
match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)?
|
||||
@ -7497,6 +7728,7 @@ if (rc == MATCH_MATCH)
|
||||
match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
|
||||
}
|
||||
else match_data->subject = subject;
|
||||
|
||||
return match_data->rc;
|
||||
}
|
||||
|
||||
@ -7518,6 +7750,7 @@ PCRE2_ERROR_PARTIAL. */
|
||||
else if (match_partial != NULL)
|
||||
{
|
||||
match_data->subject = subject;
|
||||
match_data->subject_length = length;
|
||||
match_data->ovector[0] = match_partial - subject;
|
||||
match_data->ovector[1] = end_subject - subject;
|
||||
match_data->startchar = match_partial - subject;
|
||||
|
@ -167,4 +167,16 @@ return offsetof(pcre2_match_data, ovector) +
|
||||
2 * (match_data->oveccount) * sizeof(PCRE2_SIZE);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Get heapframes size *
|
||||
*************************************************/
|
||||
|
||||
/* Accessor for the heapframes_size field of a match data block.

Arguments:
  match_data   points to the match data block to query; it is dereferenced
               without a NULL check

Returns:       the current value of match_data->heapframes_size
*/
PCRE2_EXP_DEFN PCRE2_SIZE PCRE2_CALL_CONVENTION
|
||||
pcre2_get_match_data_heapframes_size(pcre2_match_data *match_data)
|
||||
{
|
||||
return match_data->heapframes_size;
|
||||
}
|
||||
|
||||
/* End of pcre2_match_data.c */
|
||||
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2021 University of Cambridge
|
||||
New API code Copyright (c) 2016-2023 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -253,6 +253,7 @@ for (;;)
|
||||
/* Skip over things that don't match chars */
|
||||
|
||||
case OP_REVERSE:
|
||||
case OP_VREVERSE:
|
||||
case OP_CREF:
|
||||
case OP_DNCREF:
|
||||
case OP_RREF:
|
||||
@ -270,6 +271,8 @@ for (;;)
|
||||
case OP_DOLLM:
|
||||
case OP_NOT_WORD_BOUNDARY:
|
||||
case OP_WORD_BOUNDARY:
|
||||
case OP_NOT_UCP_WORD_BOUNDARY:
|
||||
case OP_UCP_WORD_BOUNDARY:
|
||||
cc += PRIV(OP_lengths)[*cc];
|
||||
break;
|
||||
|
||||
@ -973,6 +976,7 @@ do
|
||||
while (try_next) /* Loop for items in this branch */
|
||||
{
|
||||
int rc;
|
||||
PCRE2_SPTR ncode;
|
||||
uint8_t *classmap = NULL;
|
||||
#ifdef SUPPORT_WIDE_CHARS
|
||||
PCRE2_UCHAR xclassflags;
|
||||
@ -1051,6 +1055,7 @@ do
|
||||
case OP_REF:
|
||||
case OP_REFI:
|
||||
case OP_REVERSE:
|
||||
case OP_VREVERSE:
|
||||
case OP_RREF:
|
||||
case OP_SCOND:
|
||||
case OP_SET_SOM:
|
||||
@ -1098,13 +1103,100 @@ do
|
||||
|
||||
case OP_WORD_BOUNDARY:
|
||||
case OP_NOT_WORD_BOUNDARY:
|
||||
case OP_UCP_WORD_BOUNDARY:
|
||||
case OP_NOT_UCP_WORD_BOUNDARY:
|
||||
tcode++;
|
||||
break;
|
||||
|
||||
/* If we hit a bracket or a positive lookahead assertion, recurse to set
|
||||
bits from within the subpattern. If it can't find anything, we have to
|
||||
give up. If it finds some mandatory character(s), we are done for this
|
||||
branch. Otherwise, carry on scanning after the subpattern. */
|
||||
/* For a positive lookahead assertion, inspect what immediately follows,
|
||||
ignoring intermediate assertions and callouts. If the next item is one
|
||||
that sets a mandatory character, skip this assertion. Otherwise, treat it
|
||||
the same as other bracket groups. */
|
||||
|
||||
case OP_ASSERT:
|
||||
case OP_ASSERT_NA:
|
||||
ncode = tcode + GET(tcode, 1);
|
||||
while (*ncode == OP_ALT) ncode += GET(ncode, 1);
|
||||
ncode += 1 + LINK_SIZE;
|
||||
|
||||
/* Skip irrelevant items */
|
||||
|
||||
for (BOOL done = FALSE; !done;)
|
||||
{
|
||||
switch (*ncode)
|
||||
{
|
||||
case OP_ASSERT:
|
||||
case OP_ASSERT_NOT:
|
||||
case OP_ASSERTBACK:
|
||||
case OP_ASSERTBACK_NOT:
|
||||
case OP_ASSERT_NA:
|
||||
case OP_ASSERTBACK_NA:
|
||||
ncode += GET(ncode, 1);
|
||||
while (*ncode == OP_ALT) ncode += GET(ncode, 1);
|
||||
ncode += 1 + LINK_SIZE;
|
||||
break;
|
||||
|
||||
case OP_WORD_BOUNDARY:
|
||||
case OP_NOT_WORD_BOUNDARY:
|
||||
case OP_UCP_WORD_BOUNDARY:
|
||||
case OP_NOT_UCP_WORD_BOUNDARY:
|
||||
ncode++;
|
||||
break;
|
||||
|
||||
case OP_CALLOUT:
|
||||
ncode += PRIV(OP_lengths)[OP_CALLOUT];
|
||||
break;
|
||||
|
||||
case OP_CALLOUT_STR:
|
||||
ncode += GET(ncode, 1 + 2*LINK_SIZE);
|
||||
break;
|
||||
|
||||
default:
|
||||
done = TRUE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Now check the next significant item. */
|
||||
|
||||
switch(*ncode)
|
||||
{
|
||||
default:
|
||||
break;
|
||||
|
||||
case OP_PROP:
|
||||
if (ncode[1] != PT_CLIST) break;
|
||||
/* Fall through */
|
||||
case OP_ANYNL:
|
||||
case OP_CHAR:
|
||||
case OP_CHARI:
|
||||
case OP_EXACT:
|
||||
case OP_EXACTI:
|
||||
case OP_HSPACE:
|
||||
case OP_MINPLUS:
|
||||
case OP_MINPLUSI:
|
||||
case OP_PLUS:
|
||||
case OP_PLUSI:
|
||||
case OP_POSPLUS:
|
||||
case OP_POSPLUSI:
|
||||
case OP_VSPACE:
|
||||
/* Note that these types will only be present in non-UCP mode. */
|
||||
case OP_DIGIT:
|
||||
case OP_NOT_DIGIT:
|
||||
case OP_WORDCHAR:
|
||||
case OP_NOT_WORDCHAR:
|
||||
case OP_WHITESPACE:
|
||||
case OP_NOT_WHITESPACE:
|
||||
tcode = ncode;
|
||||
continue; /* With the following significant opcode */
|
||||
}
|
||||
/* Fall through */
|
||||
|
||||
/* For a group bracket or a positive assertion without an immediately
|
||||
following mandatory setting, recurse to set bits from within the
|
||||
subpattern. If it can't find anything, we have to give up. If it finds
|
||||
some mandatory character(s), we are done for this branch. Otherwise,
|
||||
carry on scanning after the subpattern. */
|
||||
|
||||
case OP_BRA:
|
||||
case OP_SBRA:
|
||||
@ -1116,8 +1208,6 @@ do
|
||||
case OP_SCBRAPOS:
|
||||
case OP_ONCE:
|
||||
case OP_SCRIPT_RUN:
|
||||
case OP_ASSERT:
|
||||
case OP_ASSERT_NA:
|
||||
rc = set_start_bits(re, tcode, utf, ucp, depthptr);
|
||||
if (rc == SSB_DONE)
|
||||
{
|
||||
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2018 University of Cambridge
|
||||
New API code Copyright (c) 2016-2023 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -306,6 +306,7 @@ Returns: if successful: 0
|
||||
PCRE2_ERROR_NOSUBSTRING: no such substring
|
||||
PCRE2_ERROR_UNAVAILABLE: ovector is too small
|
||||
PCRE2_ERROR_UNSET: substring is not set
|
||||
PCRE2_ERROR_INVALIDOFFSET: internal error, should not occur
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
|
||||
@ -338,6 +339,8 @@ else /* Matched using pcre2_dfa_match() */
|
||||
|
||||
left = match_data->ovector[stringnumber*2];
|
||||
right = match_data->ovector[stringnumber*2+1];
|
||||
if (left > match_data->subject_length || right > match_data->subject_length)
|
||||
return PCRE2_ERROR_INVALIDOFFSET;
|
||||
if (sizeptr != NULL) *sizeptr = (left > right)? 0 : right - left;
|
||||
return 0;
|
||||
}
|
||||
@ -439,7 +442,7 @@ Returns: nothing
|
||||
*/
|
||||
|
||||
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
|
||||
pcre2_substring_list_free(PCRE2_SPTR *list)
|
||||
pcre2_substring_list_free(PCRE2_UCHAR **list)
|
||||
{
|
||||
if (list != NULL)
|
||||
{
|
||||
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2021 University of Cambridge
|
||||
New API code Copyright (c) 2016-2024 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -169,9 +169,9 @@ are implementing).
|
||||
6. Do not break after Prepend characters.
|
||||
|
||||
7. Do not break within emoji modifier sequences or emoji zwj sequences. That
|
||||
is, do not break between characters with the Extended_Pictographic property.
|
||||
Extend and ZWJ characters are allowed between the characters; this cannot be
|
||||
represented in this table, the code has to deal with it.
|
||||
is, do not break between characters with the Extended_Pictographic property
|
||||
if a ZWJ intervenes. Extend characters are allowed between the characters;
|
||||
this cannot be represented in this table, the code has to deal with it.
|
||||
|
||||
8. Do not break within emoji flag sequences. That is, do not break between
|
||||
regional indicator (RI) symbols if there are an odd number of RI characters
|
||||
@ -201,8 +201,8 @@ const uint32_t PRIV(ucp_gbtable)[] = {
|
||||
ESZ|(1u<<ucp_gbT), /* 10 LVT */
|
||||
(1u<<ucp_gbRegional_Indicator), /* 11 Regional Indicator */
|
||||
ESZ, /* 12 Other */
|
||||
ESZ, /* 13 ZWJ */
|
||||
ESZ|(1u<<ucp_gbExtended_Pictographic) /* 14 Extended Pictographic */
|
||||
ESZ|(1u<<ucp_gbExtended_Pictographic), /* 13 ZWJ */
|
||||
ESZ /* 14 Extended Pictographic */
|
||||
};
|
||||
|
||||
#undef ESZ
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -166,29 +166,29 @@ enum {
|
||||
/* These are the bidi class values. */
|
||||
|
||||
enum {
|
||||
ucp_bidiAL, /* Arabic letter */
|
||||
ucp_bidiAN, /* Arabic number */
|
||||
ucp_bidiB, /* Paragraph separator */
|
||||
ucp_bidiBN, /* Boundary neutral */
|
||||
ucp_bidiCS, /* Common separator */
|
||||
ucp_bidiEN, /* European number */
|
||||
ucp_bidiES, /* European separator */
|
||||
ucp_bidiET, /* European terminator */
|
||||
ucp_bidiFSI, /* First strong isolate */
|
||||
ucp_bidiL, /* Left to right */
|
||||
ucp_bidiLRE, /* Left to right embedding */
|
||||
ucp_bidiLRI, /* Left to right isolate */
|
||||
ucp_bidiLRO, /* Left to right override */
|
||||
ucp_bidiNSM, /* Non-spacing mark */
|
||||
ucp_bidiON, /* Other neutral */
|
||||
ucp_bidiPDF, /* Pop directional format */
|
||||
ucp_bidiPDI, /* Pop directional isolate */
|
||||
ucp_bidiR, /* Right to left */
|
||||
ucp_bidiRLE, /* Right to left embedding */
|
||||
ucp_bidiRLI, /* Right to left isolate */
|
||||
ucp_bidiRLO, /* Right to left override */
|
||||
ucp_bidiS, /* Segment separator */
|
||||
ucp_bidiWS, /* White space */
|
||||
ucp_bidiAL, /* Arabic_Letter */
|
||||
ucp_bidiAN, /* Arabic_Number */
|
||||
ucp_bidiB, /* Paragraph_Separator */
|
||||
ucp_bidiBN, /* Boundary_Neutral */
|
||||
ucp_bidiCS, /* Common_Separator */
|
||||
ucp_bidiEN, /* European_Number */
|
||||
ucp_bidiES, /* European_Separator */
|
||||
ucp_bidiET, /* European_Terminator */
|
||||
ucp_bidiFSI, /* First_Strong_Isolate */
|
||||
ucp_bidiL, /* Left_To_Right */
|
||||
ucp_bidiLRE, /* Left_To_Right_Embedding */
|
||||
ucp_bidiLRI, /* Left_To_Right_Isolate */
|
||||
ucp_bidiLRO, /* Left_To_Right_Override */
|
||||
ucp_bidiNSM, /* Nonspacing_Mark */
|
||||
ucp_bidiON, /* Other_Neutral */
|
||||
ucp_bidiPDF, /* Pop_Directional_Format */
|
||||
ucp_bidiPDI, /* Pop_Directional_Isolate */
|
||||
ucp_bidiR, /* Right_To_Left */
|
||||
ucp_bidiRLE, /* Right_To_Left_Embedding */
|
||||
ucp_bidiRLI, /* Right_To_Left_Isolate */
|
||||
ucp_bidiRLO, /* Right_To_Left_Override */
|
||||
ucp_bidiS, /* Segment_Separator */
|
||||
ucp_bidiWS, /* White_Space */
|
||||
};
|
||||
|
||||
/* These are grapheme break properties. The Extended Pictographic property
|
||||
@ -380,6 +380,8 @@ enum {
|
||||
ucp_Tangsa,
|
||||
ucp_Toto,
|
||||
ucp_Vithkuqi,
|
||||
ucp_Kawi,
|
||||
ucp_Nag_Mundari,
|
||||
|
||||
/* This must be last */
|
||||
ucp_Script_Count
|
||||
|
@ -265,6 +265,7 @@ the "loose matching" rules that Unicode advises and Perl uses. */
|
||||
#define STRING_kana0 STR_k STR_a STR_n STR_a "\0"
|
||||
#define STRING_kannada0 STR_k STR_a STR_n STR_n STR_a STR_d STR_a "\0"
|
||||
#define STRING_katakana0 STR_k STR_a STR_t STR_a STR_k STR_a STR_n STR_a "\0"
|
||||
#define STRING_kawi0 STR_k STR_a STR_w STR_i "\0"
|
||||
#define STRING_kayahli0 STR_k STR_a STR_y STR_a STR_h STR_l STR_i "\0"
|
||||
#define STRING_khar0 STR_k STR_h STR_a STR_r "\0"
|
||||
#define STRING_kharoshthi0 STR_k STR_h STR_a STR_r STR_o STR_s STR_h STR_t STR_h STR_i "\0"
|
||||
@ -347,6 +348,8 @@ the "loose matching" rules that Unicode advises and Perl uses. */
|
||||
#define STRING_mymr0 STR_m STR_y STR_m STR_r "\0"
|
||||
#define STRING_n0 STR_n "\0"
|
||||
#define STRING_nabataean0 STR_n STR_a STR_b STR_a STR_t STR_a STR_e STR_a STR_n "\0"
|
||||
#define STRING_nagm0 STR_n STR_a STR_g STR_m "\0"
|
||||
#define STRING_nagmundari0 STR_n STR_a STR_g STR_m STR_u STR_n STR_d STR_a STR_r STR_i "\0"
|
||||
#define STRING_nand0 STR_n STR_a STR_n STR_d "\0"
|
||||
#define STRING_nandinagari0 STR_n STR_a STR_n STR_d STR_i STR_n STR_a STR_g STR_a STR_r STR_i "\0"
|
||||
#define STRING_narb0 STR_n STR_a STR_r STR_b "\0"
|
||||
@ -753,6 +756,7 @@ const char PRIV(utt_names)[] =
|
||||
STRING_kana0
|
||||
STRING_kannada0
|
||||
STRING_katakana0
|
||||
STRING_kawi0
|
||||
STRING_kayahli0
|
||||
STRING_khar0
|
||||
STRING_kharoshthi0
|
||||
@ -835,6 +839,8 @@ const char PRIV(utt_names)[] =
|
||||
STRING_mymr0
|
||||
STRING_n0
|
||||
STRING_nabataean0
|
||||
STRING_nagm0
|
||||
STRING_nagmundari0
|
||||
STRING_nand0
|
||||
STRING_nandinagari0
|
||||
STRING_narb0
|
||||
@ -1241,280 +1247,283 @@ const ucp_type_table PRIV(utt)[] = {
|
||||
{ 1665, PT_SCX, ucp_Katakana },
|
||||
{ 1670, PT_SCX, ucp_Kannada },
|
||||
{ 1678, PT_SCX, ucp_Katakana },
|
||||
{ 1687, PT_SCX, ucp_Kayah_Li },
|
||||
{ 1695, PT_SC, ucp_Kharoshthi },
|
||||
{ 1687, PT_SC, ucp_Kawi },
|
||||
{ 1692, PT_SCX, ucp_Kayah_Li },
|
||||
{ 1700, PT_SC, ucp_Kharoshthi },
|
||||
{ 1711, PT_SC, ucp_Khitan_Small_Script },
|
||||
{ 1729, PT_SC, ucp_Khmer },
|
||||
{ 1735, PT_SC, ucp_Khmer },
|
||||
{ 1740, PT_SCX, ucp_Khojki },
|
||||
{ 1705, PT_SC, ucp_Kharoshthi },
|
||||
{ 1716, PT_SC, ucp_Khitan_Small_Script },
|
||||
{ 1734, PT_SC, ucp_Khmer },
|
||||
{ 1740, PT_SC, ucp_Khmer },
|
||||
{ 1745, PT_SCX, ucp_Khojki },
|
||||
{ 1752, PT_SCX, ucp_Khudawadi },
|
||||
{ 1762, PT_SC, ucp_Khitan_Small_Script },
|
||||
{ 1767, PT_SCX, ucp_Kannada },
|
||||
{ 1772, PT_SCX, ucp_Kaithi },
|
||||
{ 1777, PT_GC, ucp_L },
|
||||
{ 1779, PT_LAMP, 0 },
|
||||
{ 1782, PT_SC, ucp_Tai_Tham },
|
||||
{ 1787, PT_SC, ucp_Lao },
|
||||
{ 1791, PT_SC, ucp_Lao },
|
||||
{ 1796, PT_SCX, ucp_Latin },
|
||||
{ 1802, PT_SCX, ucp_Latin },
|
||||
{ 1807, PT_LAMP, 0 },
|
||||
{ 1810, PT_SC, ucp_Lepcha },
|
||||
{ 1750, PT_SCX, ucp_Khojki },
|
||||
{ 1757, PT_SCX, ucp_Khudawadi },
|
||||
{ 1767, PT_SC, ucp_Khitan_Small_Script },
|
||||
{ 1772, PT_SCX, ucp_Kannada },
|
||||
{ 1777, PT_SCX, ucp_Kaithi },
|
||||
{ 1782, PT_GC, ucp_L },
|
||||
{ 1784, PT_LAMP, 0 },
|
||||
{ 1787, PT_SC, ucp_Tai_Tham },
|
||||
{ 1792, PT_SC, ucp_Lao },
|
||||
{ 1796, PT_SC, ucp_Lao },
|
||||
{ 1801, PT_SCX, ucp_Latin },
|
||||
{ 1807, PT_SCX, ucp_Latin },
|
||||
{ 1812, PT_LAMP, 0 },
|
||||
{ 1815, PT_SC, ucp_Lepcha },
|
||||
{ 1822, PT_SCX, ucp_Limbu },
|
||||
{ 1820, PT_SC, ucp_Lepcha },
|
||||
{ 1827, PT_SCX, ucp_Limbu },
|
||||
{ 1833, PT_SCX, ucp_Linear_A },
|
||||
{ 1838, PT_SCX, ucp_Linear_B },
|
||||
{ 1843, PT_SCX, ucp_Linear_A },
|
||||
{ 1851, PT_SCX, ucp_Linear_B },
|
||||
{ 1859, PT_SC, ucp_Lisu },
|
||||
{ 1864, PT_PC, ucp_Ll },
|
||||
{ 1867, PT_PC, ucp_Lm },
|
||||
{ 1870, PT_PC, ucp_Lo },
|
||||
{ 1873, PT_BOOL, ucp_Logical_Order_Exception },
|
||||
{ 1877, PT_BOOL, ucp_Logical_Order_Exception },
|
||||
{ 1899, PT_BOOL, ucp_Lowercase },
|
||||
{ 1905, PT_BOOL, ucp_Lowercase },
|
||||
{ 1915, PT_PC, ucp_Lt },
|
||||
{ 1918, PT_PC, ucp_Lu },
|
||||
{ 1921, PT_SC, ucp_Lycian },
|
||||
{ 1832, PT_SCX, ucp_Limbu },
|
||||
{ 1838, PT_SCX, ucp_Linear_A },
|
||||
{ 1843, PT_SCX, ucp_Linear_B },
|
||||
{ 1848, PT_SCX, ucp_Linear_A },
|
||||
{ 1856, PT_SCX, ucp_Linear_B },
|
||||
{ 1864, PT_SC, ucp_Lisu },
|
||||
{ 1869, PT_PC, ucp_Ll },
|
||||
{ 1872, PT_PC, ucp_Lm },
|
||||
{ 1875, PT_PC, ucp_Lo },
|
||||
{ 1878, PT_BOOL, ucp_Logical_Order_Exception },
|
||||
{ 1882, PT_BOOL, ucp_Logical_Order_Exception },
|
||||
{ 1904, PT_BOOL, ucp_Lowercase },
|
||||
{ 1910, PT_BOOL, ucp_Lowercase },
|
||||
{ 1920, PT_PC, ucp_Lt },
|
||||
{ 1923, PT_PC, ucp_Lu },
|
||||
{ 1926, PT_SC, ucp_Lycian },
|
||||
{ 1933, PT_SC, ucp_Lydian },
|
||||
{ 1931, PT_SC, ucp_Lycian },
|
||||
{ 1938, PT_SC, ucp_Lydian },
|
||||
{ 1945, PT_GC, ucp_M },
|
||||
{ 1947, PT_SCX, ucp_Mahajani },
|
||||
{ 1956, PT_SCX, ucp_Mahajani },
|
||||
{ 1961, PT_SC, ucp_Makasar },
|
||||
{ 1943, PT_SC, ucp_Lydian },
|
||||
{ 1950, PT_GC, ucp_M },
|
||||
{ 1952, PT_SCX, ucp_Mahajani },
|
||||
{ 1961, PT_SCX, ucp_Mahajani },
|
||||
{ 1966, PT_SC, ucp_Makasar },
|
||||
{ 1974, PT_SCX, ucp_Malayalam },
|
||||
{ 1984, PT_SCX, ucp_Mandaic },
|
||||
{ 1971, PT_SC, ucp_Makasar },
|
||||
{ 1979, PT_SCX, ucp_Malayalam },
|
||||
{ 1989, PT_SCX, ucp_Mandaic },
|
||||
{ 1997, PT_SCX, ucp_Manichaean },
|
||||
{ 1994, PT_SCX, ucp_Mandaic },
|
||||
{ 2002, PT_SCX, ucp_Manichaean },
|
||||
{ 2013, PT_SC, ucp_Marchen },
|
||||
{ 2007, PT_SCX, ucp_Manichaean },
|
||||
{ 2018, PT_SC, ucp_Marchen },
|
||||
{ 2026, PT_SCX, ucp_Masaram_Gondi },
|
||||
{ 2039, PT_BOOL, ucp_Math },
|
||||
{ 2044, PT_PC, ucp_Mc },
|
||||
{ 2047, PT_PC, ucp_Me },
|
||||
{ 2050, PT_SC, ucp_Medefaidrin },
|
||||
{ 2062, PT_SC, ucp_Medefaidrin },
|
||||
{ 2067, PT_SC, ucp_Meetei_Mayek },
|
||||
{ 2079, PT_SC, ucp_Mende_Kikakui },
|
||||
{ 2023, PT_SC, ucp_Marchen },
|
||||
{ 2031, PT_SCX, ucp_Masaram_Gondi },
|
||||
{ 2044, PT_BOOL, ucp_Math },
|
||||
{ 2049, PT_PC, ucp_Mc },
|
||||
{ 2052, PT_PC, ucp_Me },
|
||||
{ 2055, PT_SC, ucp_Medefaidrin },
|
||||
{ 2067, PT_SC, ucp_Medefaidrin },
|
||||
{ 2072, PT_SC, ucp_Meetei_Mayek },
|
||||
{ 2084, PT_SC, ucp_Mende_Kikakui },
|
||||
{ 2097, PT_SC, ucp_Meroitic_Cursive },
|
||||
{ 2102, PT_SC, ucp_Meroitic_Hieroglyphs },
|
||||
{ 2107, PT_SC, ucp_Meroitic_Cursive },
|
||||
{ 2123, PT_SC, ucp_Meroitic_Hieroglyphs },
|
||||
{ 2143, PT_SC, ucp_Miao },
|
||||
{ 2148, PT_SCX, ucp_Malayalam },
|
||||
{ 2153, PT_PC, ucp_Mn },
|
||||
{ 2156, PT_SCX, ucp_Modi },
|
||||
{ 2161, PT_SCX, ucp_Mongolian },
|
||||
{ 2089, PT_SC, ucp_Mende_Kikakui },
|
||||
{ 2102, PT_SC, ucp_Meroitic_Cursive },
|
||||
{ 2107, PT_SC, ucp_Meroitic_Hieroglyphs },
|
||||
{ 2112, PT_SC, ucp_Meroitic_Cursive },
|
||||
{ 2128, PT_SC, ucp_Meroitic_Hieroglyphs },
|
||||
{ 2148, PT_SC, ucp_Miao },
|
||||
{ 2153, PT_SCX, ucp_Malayalam },
|
||||
{ 2158, PT_PC, ucp_Mn },
|
||||
{ 2161, PT_SCX, ucp_Modi },
|
||||
{ 2166, PT_SCX, ucp_Mongolian },
|
||||
{ 2176, PT_SC, ucp_Mro },
|
||||
{ 2180, PT_SC, ucp_Mro },
|
||||
{ 2185, PT_SC, ucp_Meetei_Mayek },
|
||||
{ 2190, PT_SCX, ucp_Multani },
|
||||
{ 2171, PT_SCX, ucp_Mongolian },
|
||||
{ 2181, PT_SC, ucp_Mro },
|
||||
{ 2185, PT_SC, ucp_Mro },
|
||||
{ 2190, PT_SC, ucp_Meetei_Mayek },
|
||||
{ 2195, PT_SCX, ucp_Multani },
|
||||
{ 2203, PT_SCX, ucp_Myanmar },
|
||||
{ 2211, PT_SCX, ucp_Myanmar },
|
||||
{ 2216, PT_GC, ucp_N },
|
||||
{ 2218, PT_SC, ucp_Nabataean },
|
||||
{ 2228, PT_SCX, ucp_Nandinagari },
|
||||
{ 2233, PT_SCX, ucp_Nandinagari },
|
||||
{ 2245, PT_SC, ucp_Old_North_Arabian },
|
||||
{ 2250, PT_SC, ucp_Nabataean },
|
||||
{ 2255, PT_BOOL, ucp_Noncharacter_Code_Point },
|
||||
{ 2261, PT_PC, ucp_Nd },
|
||||
{ 2264, PT_SC, ucp_Newa },
|
||||
{ 2269, PT_SC, ucp_New_Tai_Lue },
|
||||
{ 2279, PT_SCX, ucp_Nko },
|
||||
{ 2283, PT_SCX, ucp_Nko },
|
||||
{ 2288, PT_PC, ucp_Nl },
|
||||
{ 2291, PT_PC, ucp_No },
|
||||
{ 2294, PT_BOOL, ucp_Noncharacter_Code_Point },
|
||||
{ 2316, PT_SC, ucp_Nushu },
|
||||
{ 2321, PT_SC, ucp_Nushu },
|
||||
{ 2327, PT_SC, ucp_Nyiakeng_Puachue_Hmong },
|
||||
{ 2348, PT_SC, ucp_Ogham },
|
||||
{ 2353, PT_SC, ucp_Ogham },
|
||||
{ 2359, PT_SC, ucp_Ol_Chiki },
|
||||
{ 2367, PT_SC, ucp_Ol_Chiki },
|
||||
{ 2372, PT_SC, ucp_Old_Hungarian },
|
||||
{ 2385, PT_SC, ucp_Old_Italic },
|
||||
{ 2395, PT_SC, ucp_Old_North_Arabian },
|
||||
{ 2411, PT_SCX, ucp_Old_Permic },
|
||||
{ 2421, PT_SC, ucp_Old_Persian },
|
||||
{ 2432, PT_SC, ucp_Old_Sogdian },
|
||||
{ 2443, PT_SC, ucp_Old_South_Arabian },
|
||||
{ 2459, PT_SC, ucp_Old_Turkic },
|
||||
{ 2469, PT_SCX, ucp_Old_Uyghur },
|
||||
{ 2479, PT_SCX, ucp_Oriya },
|
||||
{ 2485, PT_SC, ucp_Old_Turkic },
|
||||
{ 2490, PT_SCX, ucp_Oriya },
|
||||
{ 2495, PT_SC, ucp_Osage },
|
||||
{ 2501, PT_SC, ucp_Osage },
|
||||
{ 2506, PT_SC, ucp_Osmanya },
|
||||
{ 2511, PT_SC, ucp_Osmanya },
|
||||
{ 2519, PT_SCX, ucp_Old_Uyghur },
|
||||
{ 2524, PT_GC, ucp_P },
|
||||
{ 2526, PT_SC, ucp_Pahawh_Hmong },
|
||||
{ 2538, PT_SC, ucp_Palmyrene },
|
||||
{ 2543, PT_SC, ucp_Palmyrene },
|
||||
{ 2553, PT_BOOL, ucp_Pattern_Syntax },
|
||||
{ 2560, PT_BOOL, ucp_Pattern_Syntax },
|
||||
{ 2574, PT_BOOL, ucp_Pattern_White_Space },
|
||||
{ 2592, PT_BOOL, ucp_Pattern_White_Space },
|
||||
{ 2598, PT_SC, ucp_Pau_Cin_Hau },
|
||||
{ 2603, PT_SC, ucp_Pau_Cin_Hau },
|
||||
{ 2613, PT_PC, ucp_Pc },
|
||||
{ 2616, PT_BOOL, ucp_Prepended_Concatenation_Mark },
|
||||
{ 2620, PT_PC, ucp_Pd },
|
||||
{ 2623, PT_PC, ucp_Pe },
|
||||
{ 2626, PT_SCX, ucp_Old_Permic },
|
||||
{ 2631, PT_PC, ucp_Pf },
|
||||
{ 2634, PT_SCX, ucp_Phags_Pa },
|
||||
{ 2639, PT_SCX, ucp_Phags_Pa },
|
||||
{ 2647, PT_SC, ucp_Inscriptional_Pahlavi },
|
||||
{ 2652, PT_SCX, ucp_Psalter_Pahlavi },
|
||||
{ 2657, PT_SC, ucp_Phoenician },
|
||||
{ 2662, PT_SC, ucp_Phoenician },
|
||||
{ 2673, PT_PC, ucp_Pi },
|
||||
{ 2676, PT_SC, ucp_Miao },
|
||||
{ 2681, PT_PC, ucp_Po },
|
||||
{ 2684, PT_BOOL, ucp_Prepended_Concatenation_Mark },
|
||||
{ 2711, PT_SC, ucp_Inscriptional_Parthian },
|
||||
{ 2716, PT_PC, ucp_Ps },
|
||||
{ 2719, PT_SCX, ucp_Psalter_Pahlavi },
|
||||
{ 2734, PT_SCX, ucp_Coptic },
|
||||
{ 2739, PT_SC, ucp_Inherited },
|
||||
{ 2744, PT_BOOL, ucp_Quotation_Mark },
|
||||
{ 2750, PT_BOOL, ucp_Quotation_Mark },
|
||||
{ 2764, PT_BOOL, ucp_Radical },
|
||||
{ 2772, PT_BOOL, ucp_Regional_Indicator },
|
||||
{ 2790, PT_SC, ucp_Rejang },
|
||||
{ 2797, PT_BOOL, ucp_Regional_Indicator },
|
||||
{ 2800, PT_SC, ucp_Rejang },
|
||||
{ 2805, PT_SCX, ucp_Hanifi_Rohingya },
|
||||
{ 2810, PT_SC, ucp_Runic },
|
||||
{ 2816, PT_SC, ucp_Runic },
|
||||
{ 2821, PT_GC, ucp_S },
|
||||
{ 2823, PT_SC, ucp_Samaritan },
|
||||
{ 2833, PT_SC, ucp_Samaritan },
|
||||
{ 2838, PT_SC, ucp_Old_South_Arabian },
|
||||
{ 2843, PT_SC, ucp_Saurashtra },
|
||||
{ 2848, PT_SC, ucp_Saurashtra },
|
||||
{ 2859, PT_PC, ucp_Sc },
|
||||
{ 2862, PT_BOOL, ucp_Soft_Dotted },
|
||||
{ 2865, PT_BOOL, ucp_Sentence_Terminal },
|
||||
{ 2882, PT_SC, ucp_SignWriting },
|
||||
{ 2887, PT_SCX, ucp_Sharada },
|
||||
{ 2895, PT_SC, ucp_Shavian },
|
||||
{ 2903, PT_SC, ucp_Shavian },
|
||||
{ 2200, PT_SCX, ucp_Multani },
|
||||
{ 2208, PT_SCX, ucp_Myanmar },
|
||||
{ 2216, PT_SCX, ucp_Myanmar },
|
||||
{ 2221, PT_GC, ucp_N },
|
||||
{ 2223, PT_SC, ucp_Nabataean },
|
||||
{ 2233, PT_SC, ucp_Nag_Mundari },
|
||||
{ 2238, PT_SC, ucp_Nag_Mundari },
|
||||
{ 2249, PT_SCX, ucp_Nandinagari },
|
||||
{ 2254, PT_SCX, ucp_Nandinagari },
|
||||
{ 2266, PT_SC, ucp_Old_North_Arabian },
|
||||
{ 2271, PT_SC, ucp_Nabataean },
|
||||
{ 2276, PT_BOOL, ucp_Noncharacter_Code_Point },
|
||||
{ 2282, PT_PC, ucp_Nd },
|
||||
{ 2285, PT_SC, ucp_Newa },
|
||||
{ 2290, PT_SC, ucp_New_Tai_Lue },
|
||||
{ 2300, PT_SCX, ucp_Nko },
|
||||
{ 2304, PT_SCX, ucp_Nko },
|
||||
{ 2309, PT_PC, ucp_Nl },
|
||||
{ 2312, PT_PC, ucp_No },
|
||||
{ 2315, PT_BOOL, ucp_Noncharacter_Code_Point },
|
||||
{ 2337, PT_SC, ucp_Nushu },
|
||||
{ 2342, PT_SC, ucp_Nushu },
|
||||
{ 2348, PT_SC, ucp_Nyiakeng_Puachue_Hmong },
|
||||
{ 2369, PT_SC, ucp_Ogham },
|
||||
{ 2374, PT_SC, ucp_Ogham },
|
||||
{ 2380, PT_SC, ucp_Ol_Chiki },
|
||||
{ 2388, PT_SC, ucp_Ol_Chiki },
|
||||
{ 2393, PT_SC, ucp_Old_Hungarian },
|
||||
{ 2406, PT_SC, ucp_Old_Italic },
|
||||
{ 2416, PT_SC, ucp_Old_North_Arabian },
|
||||
{ 2432, PT_SCX, ucp_Old_Permic },
|
||||
{ 2442, PT_SC, ucp_Old_Persian },
|
||||
{ 2453, PT_SC, ucp_Old_Sogdian },
|
||||
{ 2464, PT_SC, ucp_Old_South_Arabian },
|
||||
{ 2480, PT_SC, ucp_Old_Turkic },
|
||||
{ 2490, PT_SCX, ucp_Old_Uyghur },
|
||||
{ 2500, PT_SCX, ucp_Oriya },
|
||||
{ 2506, PT_SC, ucp_Old_Turkic },
|
||||
{ 2511, PT_SCX, ucp_Oriya },
|
||||
{ 2516, PT_SC, ucp_Osage },
|
||||
{ 2522, PT_SC, ucp_Osage },
|
||||
{ 2527, PT_SC, ucp_Osmanya },
|
||||
{ 2532, PT_SC, ucp_Osmanya },
|
||||
{ 2540, PT_SCX, ucp_Old_Uyghur },
|
||||
{ 2545, PT_GC, ucp_P },
|
||||
{ 2547, PT_SC, ucp_Pahawh_Hmong },
|
||||
{ 2559, PT_SC, ucp_Palmyrene },
|
||||
{ 2564, PT_SC, ucp_Palmyrene },
|
||||
{ 2574, PT_BOOL, ucp_Pattern_Syntax },
|
||||
{ 2581, PT_BOOL, ucp_Pattern_Syntax },
|
||||
{ 2595, PT_BOOL, ucp_Pattern_White_Space },
|
||||
{ 2613, PT_BOOL, ucp_Pattern_White_Space },
|
||||
{ 2619, PT_SC, ucp_Pau_Cin_Hau },
|
||||
{ 2624, PT_SC, ucp_Pau_Cin_Hau },
|
||||
{ 2634, PT_PC, ucp_Pc },
|
||||
{ 2637, PT_BOOL, ucp_Prepended_Concatenation_Mark },
|
||||
{ 2641, PT_PC, ucp_Pd },
|
||||
{ 2644, PT_PC, ucp_Pe },
|
||||
{ 2647, PT_SCX, ucp_Old_Permic },
|
||||
{ 2652, PT_PC, ucp_Pf },
|
||||
{ 2655, PT_SCX, ucp_Phags_Pa },
|
||||
{ 2660, PT_SCX, ucp_Phags_Pa },
|
||||
{ 2668, PT_SC, ucp_Inscriptional_Pahlavi },
|
||||
{ 2673, PT_SCX, ucp_Psalter_Pahlavi },
|
||||
{ 2678, PT_SC, ucp_Phoenician },
|
||||
{ 2683, PT_SC, ucp_Phoenician },
|
||||
{ 2694, PT_PC, ucp_Pi },
|
||||
{ 2697, PT_SC, ucp_Miao },
|
||||
{ 2702, PT_PC, ucp_Po },
|
||||
{ 2705, PT_BOOL, ucp_Prepended_Concatenation_Mark },
|
||||
{ 2732, PT_SC, ucp_Inscriptional_Parthian },
|
||||
{ 2737, PT_PC, ucp_Ps },
|
||||
{ 2740, PT_SCX, ucp_Psalter_Pahlavi },
|
||||
{ 2755, PT_SCX, ucp_Coptic },
|
||||
{ 2760, PT_SC, ucp_Inherited },
|
||||
{ 2765, PT_BOOL, ucp_Quotation_Mark },
|
||||
{ 2771, PT_BOOL, ucp_Quotation_Mark },
|
||||
{ 2785, PT_BOOL, ucp_Radical },
|
||||
{ 2793, PT_BOOL, ucp_Regional_Indicator },
|
||||
{ 2811, PT_SC, ucp_Rejang },
|
||||
{ 2818, PT_BOOL, ucp_Regional_Indicator },
|
||||
{ 2821, PT_SC, ucp_Rejang },
|
||||
{ 2826, PT_SCX, ucp_Hanifi_Rohingya },
|
||||
{ 2831, PT_SC, ucp_Runic },
|
||||
{ 2837, PT_SC, ucp_Runic },
|
||||
{ 2842, PT_GC, ucp_S },
|
||||
{ 2844, PT_SC, ucp_Samaritan },
|
||||
{ 2854, PT_SC, ucp_Samaritan },
|
||||
{ 2859, PT_SC, ucp_Old_South_Arabian },
|
||||
{ 2864, PT_SC, ucp_Saurashtra },
|
||||
{ 2869, PT_SC, ucp_Saurashtra },
|
||||
{ 2880, PT_PC, ucp_Sc },
|
||||
{ 2883, PT_BOOL, ucp_Soft_Dotted },
|
||||
{ 2886, PT_BOOL, ucp_Sentence_Terminal },
|
||||
{ 2903, PT_SC, ucp_SignWriting },
|
||||
{ 2908, PT_SCX, ucp_Sharada },
|
||||
{ 2913, PT_SC, ucp_Siddham },
|
||||
{ 2918, PT_SC, ucp_Siddham },
|
||||
{ 2926, PT_SC, ucp_SignWriting },
|
||||
{ 2938, PT_SCX, ucp_Khudawadi },
|
||||
{ 2943, PT_SCX, ucp_Sinhala },
|
||||
{ 2948, PT_SCX, ucp_Sinhala },
|
||||
{ 2956, PT_PC, ucp_Sk },
|
||||
{ 2959, PT_PC, ucp_Sm },
|
||||
{ 2962, PT_PC, ucp_So },
|
||||
{ 2965, PT_BOOL, ucp_Soft_Dotted },
|
||||
{ 2976, PT_SCX, ucp_Sogdian },
|
||||
{ 2981, PT_SCX, ucp_Sogdian },
|
||||
{ 2989, PT_SC, ucp_Old_Sogdian },
|
||||
{ 2994, PT_SC, ucp_Sora_Sompeng },
|
||||
{ 2999, PT_SC, ucp_Sora_Sompeng },
|
||||
{ 3011, PT_SC, ucp_Soyombo },
|
||||
{ 3016, PT_SC, ucp_Soyombo },
|
||||
{ 3024, PT_BOOL, ucp_White_Space },
|
||||
{ 3030, PT_BOOL, ucp_Sentence_Terminal },
|
||||
{ 3036, PT_SC, ucp_Sundanese },
|
||||
{ 3041, PT_SC, ucp_Sundanese },
|
||||
{ 3051, PT_SCX, ucp_Syloti_Nagri },
|
||||
{ 3056, PT_SCX, ucp_Syloti_Nagri },
|
||||
{ 3068, PT_SCX, ucp_Syriac },
|
||||
{ 3073, PT_SCX, ucp_Syriac },
|
||||
{ 3080, PT_SCX, ucp_Tagalog },
|
||||
{ 3088, PT_SCX, ucp_Tagbanwa },
|
||||
{ 3093, PT_SCX, ucp_Tagbanwa },
|
||||
{ 3102, PT_SCX, ucp_Tai_Le },
|
||||
{ 3108, PT_SC, ucp_Tai_Tham },
|
||||
{ 3116, PT_SC, ucp_Tai_Viet },
|
||||
{ 3124, PT_SCX, ucp_Takri },
|
||||
{ 3129, PT_SCX, ucp_Takri },
|
||||
{ 3135, PT_SCX, ucp_Tai_Le },
|
||||
{ 3140, PT_SC, ucp_New_Tai_Lue },
|
||||
{ 3145, PT_SCX, ucp_Tamil },
|
||||
{ 3151, PT_SCX, ucp_Tamil },
|
||||
{ 3156, PT_SC, ucp_Tangut },
|
||||
{ 3161, PT_SC, ucp_Tangsa },
|
||||
{ 3168, PT_SC, ucp_Tangut },
|
||||
{ 3175, PT_SC, ucp_Tai_Viet },
|
||||
{ 3180, PT_SCX, ucp_Telugu },
|
||||
{ 3185, PT_SCX, ucp_Telugu },
|
||||
{ 3192, PT_BOOL, ucp_Terminal_Punctuation },
|
||||
{ 3197, PT_BOOL, ucp_Terminal_Punctuation },
|
||||
{ 3217, PT_SC, ucp_Tifinagh },
|
||||
{ 3222, PT_SCX, ucp_Tagalog },
|
||||
{ 3227, PT_SCX, ucp_Thaana },
|
||||
{ 3232, PT_SCX, ucp_Thaana },
|
||||
{ 3239, PT_SC, ucp_Thai },
|
||||
{ 3244, PT_SC, ucp_Tibetan },
|
||||
{ 3252, PT_SC, ucp_Tibetan },
|
||||
{ 3257, PT_SC, ucp_Tifinagh },
|
||||
{ 3266, PT_SCX, ucp_Tirhuta },
|
||||
{ 3271, PT_SCX, ucp_Tirhuta },
|
||||
{ 3279, PT_SC, ucp_Tangsa },
|
||||
{ 3284, PT_SC, ucp_Toto },
|
||||
{ 3289, PT_SC, ucp_Ugaritic },
|
||||
{ 3294, PT_SC, ucp_Ugaritic },
|
||||
{ 3303, PT_BOOL, ucp_Unified_Ideograph },
|
||||
{ 3309, PT_BOOL, ucp_Unified_Ideograph },
|
||||
{ 3326, PT_SC, ucp_Unknown },
|
||||
{ 3334, PT_BOOL, ucp_Uppercase },
|
||||
{ 3340, PT_BOOL, ucp_Uppercase },
|
||||
{ 3350, PT_SC, ucp_Vai },
|
||||
{ 3354, PT_SC, ucp_Vai },
|
||||
{ 3359, PT_BOOL, ucp_Variation_Selector },
|
||||
{ 3377, PT_SC, ucp_Vithkuqi },
|
||||
{ 3382, PT_SC, ucp_Vithkuqi },
|
||||
{ 3391, PT_BOOL, ucp_Variation_Selector },
|
||||
{ 3394, PT_SC, ucp_Wancho },
|
||||
{ 3401, PT_SC, ucp_Warang_Citi },
|
||||
{ 3406, PT_SC, ucp_Warang_Citi },
|
||||
{ 3417, PT_SC, ucp_Wancho },
|
||||
{ 3422, PT_BOOL, ucp_White_Space },
|
||||
{ 3433, PT_BOOL, ucp_White_Space },
|
||||
{ 3440, PT_ALNUM, 0 },
|
||||
{ 3444, PT_BOOL, ucp_XID_Continue },
|
||||
{ 3449, PT_BOOL, ucp_XID_Continue },
|
||||
{ 3461, PT_BOOL, ucp_XID_Start },
|
||||
{ 3466, PT_BOOL, ucp_XID_Start },
|
||||
{ 3475, PT_SC, ucp_Old_Persian },
|
||||
{ 3480, PT_PXSPACE, 0 },
|
||||
{ 3484, PT_SPACE, 0 },
|
||||
{ 3488, PT_SC, ucp_Cuneiform },
|
||||
{ 3493, PT_UCNC, 0 },
|
||||
{ 3497, PT_WORD, 0 },
|
||||
{ 3501, PT_SCX, ucp_Yezidi },
|
||||
{ 3506, PT_SCX, ucp_Yezidi },
|
||||
{ 3513, PT_SCX, ucp_Yi },
|
||||
{ 3516, PT_SCX, ucp_Yi },
|
||||
{ 3521, PT_GC, ucp_Z },
|
||||
{ 3523, PT_SC, ucp_Zanabazar_Square },
|
||||
{ 3539, PT_SC, ucp_Zanabazar_Square },
|
||||
{ 3544, PT_SC, ucp_Inherited },
|
||||
{ 3549, PT_PC, ucp_Zl },
|
||||
{ 3552, PT_PC, ucp_Zp },
|
||||
{ 3555, PT_PC, ucp_Zs },
|
||||
{ 3558, PT_SC, ucp_Common },
|
||||
{ 3563, PT_SC, ucp_Unknown }
|
||||
{ 2916, PT_SC, ucp_Shavian },
|
||||
{ 2924, PT_SC, ucp_Shavian },
|
||||
{ 2929, PT_SCX, ucp_Sharada },
|
||||
{ 2934, PT_SC, ucp_Siddham },
|
||||
{ 2939, PT_SC, ucp_Siddham },
|
||||
{ 2947, PT_SC, ucp_SignWriting },
|
||||
{ 2959, PT_SCX, ucp_Khudawadi },
|
||||
{ 2964, PT_SCX, ucp_Sinhala },
|
||||
{ 2969, PT_SCX, ucp_Sinhala },
|
||||
{ 2977, PT_PC, ucp_Sk },
|
||||
{ 2980, PT_PC, ucp_Sm },
|
||||
{ 2983, PT_PC, ucp_So },
|
||||
{ 2986, PT_BOOL, ucp_Soft_Dotted },
|
||||
{ 2997, PT_SCX, ucp_Sogdian },
|
||||
{ 3002, PT_SCX, ucp_Sogdian },
|
||||
{ 3010, PT_SC, ucp_Old_Sogdian },
|
||||
{ 3015, PT_SC, ucp_Sora_Sompeng },
|
||||
{ 3020, PT_SC, ucp_Sora_Sompeng },
|
||||
{ 3032, PT_SC, ucp_Soyombo },
|
||||
{ 3037, PT_SC, ucp_Soyombo },
|
||||
{ 3045, PT_BOOL, ucp_White_Space },
|
||||
{ 3051, PT_BOOL, ucp_Sentence_Terminal },
|
||||
{ 3057, PT_SC, ucp_Sundanese },
|
||||
{ 3062, PT_SC, ucp_Sundanese },
|
||||
{ 3072, PT_SCX, ucp_Syloti_Nagri },
|
||||
{ 3077, PT_SCX, ucp_Syloti_Nagri },
|
||||
{ 3089, PT_SCX, ucp_Syriac },
|
||||
{ 3094, PT_SCX, ucp_Syriac },
|
||||
{ 3101, PT_SCX, ucp_Tagalog },
|
||||
{ 3109, PT_SCX, ucp_Tagbanwa },
|
||||
{ 3114, PT_SCX, ucp_Tagbanwa },
|
||||
{ 3123, PT_SCX, ucp_Tai_Le },
|
||||
{ 3129, PT_SC, ucp_Tai_Tham },
|
||||
{ 3137, PT_SC, ucp_Tai_Viet },
|
||||
{ 3145, PT_SCX, ucp_Takri },
|
||||
{ 3150, PT_SCX, ucp_Takri },
|
||||
{ 3156, PT_SCX, ucp_Tai_Le },
|
||||
{ 3161, PT_SC, ucp_New_Tai_Lue },
|
||||
{ 3166, PT_SCX, ucp_Tamil },
|
||||
{ 3172, PT_SCX, ucp_Tamil },
|
||||
{ 3177, PT_SC, ucp_Tangut },
|
||||
{ 3182, PT_SC, ucp_Tangsa },
|
||||
{ 3189, PT_SC, ucp_Tangut },
|
||||
{ 3196, PT_SC, ucp_Tai_Viet },
|
||||
{ 3201, PT_SCX, ucp_Telugu },
|
||||
{ 3206, PT_SCX, ucp_Telugu },
|
||||
{ 3213, PT_BOOL, ucp_Terminal_Punctuation },
|
||||
{ 3218, PT_BOOL, ucp_Terminal_Punctuation },
|
||||
{ 3238, PT_SC, ucp_Tifinagh },
|
||||
{ 3243, PT_SCX, ucp_Tagalog },
|
||||
{ 3248, PT_SCX, ucp_Thaana },
|
||||
{ 3253, PT_SCX, ucp_Thaana },
|
||||
{ 3260, PT_SC, ucp_Thai },
|
||||
{ 3265, PT_SC, ucp_Tibetan },
|
||||
{ 3273, PT_SC, ucp_Tibetan },
|
||||
{ 3278, PT_SC, ucp_Tifinagh },
|
||||
{ 3287, PT_SCX, ucp_Tirhuta },
|
||||
{ 3292, PT_SCX, ucp_Tirhuta },
|
||||
{ 3300, PT_SC, ucp_Tangsa },
|
||||
{ 3305, PT_SC, ucp_Toto },
|
||||
{ 3310, PT_SC, ucp_Ugaritic },
|
||||
{ 3315, PT_SC, ucp_Ugaritic },
|
||||
{ 3324, PT_BOOL, ucp_Unified_Ideograph },
|
||||
{ 3330, PT_BOOL, ucp_Unified_Ideograph },
|
||||
{ 3347, PT_SC, ucp_Unknown },
|
||||
{ 3355, PT_BOOL, ucp_Uppercase },
|
||||
{ 3361, PT_BOOL, ucp_Uppercase },
|
||||
{ 3371, PT_SC, ucp_Vai },
|
||||
{ 3375, PT_SC, ucp_Vai },
|
||||
{ 3380, PT_BOOL, ucp_Variation_Selector },
|
||||
{ 3398, PT_SC, ucp_Vithkuqi },
|
||||
{ 3403, PT_SC, ucp_Vithkuqi },
|
||||
{ 3412, PT_BOOL, ucp_Variation_Selector },
|
||||
{ 3415, PT_SC, ucp_Wancho },
|
||||
{ 3422, PT_SC, ucp_Warang_Citi },
|
||||
{ 3427, PT_SC, ucp_Warang_Citi },
|
||||
{ 3438, PT_SC, ucp_Wancho },
|
||||
{ 3443, PT_BOOL, ucp_White_Space },
|
||||
{ 3454, PT_BOOL, ucp_White_Space },
|
||||
{ 3461, PT_ALNUM, 0 },
|
||||
{ 3465, PT_BOOL, ucp_XID_Continue },
|
||||
{ 3470, PT_BOOL, ucp_XID_Continue },
|
||||
{ 3482, PT_BOOL, ucp_XID_Start },
|
||||
{ 3487, PT_BOOL, ucp_XID_Start },
|
||||
{ 3496, PT_SC, ucp_Old_Persian },
|
||||
{ 3501, PT_PXSPACE, 0 },
|
||||
{ 3505, PT_SPACE, 0 },
|
||||
{ 3509, PT_SC, ucp_Cuneiform },
|
||||
{ 3514, PT_UCNC, 0 },
|
||||
{ 3518, PT_WORD, 0 },
|
||||
{ 3522, PT_SCX, ucp_Yezidi },
|
||||
{ 3527, PT_SCX, ucp_Yezidi },
|
||||
{ 3534, PT_SCX, ucp_Yi },
|
||||
{ 3537, PT_SCX, ucp_Yi },
|
||||
{ 3542, PT_GC, ucp_Z },
|
||||
{ 3544, PT_SC, ucp_Zanabazar_Square },
|
||||
{ 3560, PT_SC, ucp_Zanabazar_Square },
|
||||
{ 3565, PT_SC, ucp_Inherited },
|
||||
{ 3570, PT_PC, ucp_Zl },
|
||||
{ 3573, PT_PC, ucp_Zp },
|
||||
{ 3576, PT_PC, ucp_Zs },
|
||||
{ 3579, PT_SC, ucp_Common },
|
||||
{ 3584, PT_SC, ucp_Unknown }
|
||||
};
|
||||
|
||||
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
|
||||
|
@ -169,7 +169,7 @@ for (p = string; length > 0; p++)
|
||||
|
||||
if (((d = *(++p)) & 0xc0) != 0x80)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 1;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - 1;
|
||||
return PCRE2_ERROR_UTF8_ERR6;
|
||||
}
|
||||
|
||||
@ -184,7 +184,7 @@ for (p = string; length > 0; p++)
|
||||
|
||||
case 1: if ((c & 0x3e) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 1;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - 1;
|
||||
return PCRE2_ERROR_UTF8_ERR15;
|
||||
}
|
||||
break;
|
||||
@ -196,17 +196,17 @@ for (p = string; length > 0; p++)
|
||||
case 2:
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - 2;
|
||||
return PCRE2_ERROR_UTF8_ERR7;
|
||||
}
|
||||
if (c == 0xe0 && (d & 0x20) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - 2;
|
||||
return PCRE2_ERROR_UTF8_ERR16;
|
||||
}
|
||||
if (c == 0xed && d >= 0xa0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - 2;
|
||||
return PCRE2_ERROR_UTF8_ERR14;
|
||||
}
|
||||
break;
|
||||
@ -218,22 +218,22 @@ for (p = string; length > 0; p++)
|
||||
case 3:
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - 2;
|
||||
return PCRE2_ERROR_UTF8_ERR7;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - 3;
|
||||
return PCRE2_ERROR_UTF8_ERR8;
|
||||
}
|
||||
if (c == 0xf0 && (d & 0x30) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - 3;
|
||||
return PCRE2_ERROR_UTF8_ERR17;
|
||||
}
|
||||
if (c > 0xf4 || (c == 0xf4 && d > 0x8f))
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - 3;
|
||||
return PCRE2_ERROR_UTF8_ERR13;
|
||||
}
|
||||
break;
|
||||
@ -249,22 +249,22 @@ for (p = string; length > 0; p++)
|
||||
case 4:
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - 2;
|
||||
return PCRE2_ERROR_UTF8_ERR7;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - 3;
|
||||
return PCRE2_ERROR_UTF8_ERR8;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 4;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - 4;
|
||||
return PCRE2_ERROR_UTF8_ERR9;
|
||||
}
|
||||
if (c == 0xf8 && (d & 0x38) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 4;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - 4;
|
||||
return PCRE2_ERROR_UTF8_ERR18;
|
||||
}
|
||||
break;
|
||||
@ -275,27 +275,27 @@ for (p = string; length > 0; p++)
|
||||
case 5:
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 2;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - 2;
|
||||
return PCRE2_ERROR_UTF8_ERR7;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 3;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - 3;
|
||||
return PCRE2_ERROR_UTF8_ERR8;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 4;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - 4;
|
||||
return PCRE2_ERROR_UTF8_ERR9;
|
||||
}
|
||||
if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 5;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - 5;
|
||||
return PCRE2_ERROR_UTF8_ERR10;
|
||||
}
|
||||
if (c == 0xfc && (d & 0x3c) == 0)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - 5;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - 5;
|
||||
return PCRE2_ERROR_UTF8_ERR19;
|
||||
}
|
||||
break;
|
||||
@ -307,7 +307,7 @@ for (p = string; length > 0; p++)
|
||||
|
||||
if (ab > 3)
|
||||
{
|
||||
*erroroffset = (int)(p - string) - ab;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - ab;
|
||||
return (ab == 4)? PCRE2_ERROR_UTF8_ERR11 : PCRE2_ERROR_UTF8_ERR12;
|
||||
}
|
||||
}
|
||||
@ -338,21 +338,21 @@ for (p = string; length > 0; p++)
|
||||
/* High surrogate. Must be followed by a low surrogate. */
|
||||
if (length == 0)
|
||||
{
|
||||
*erroroffset = p - string;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string);
|
||||
return PCRE2_ERROR_UTF16_ERR1;
|
||||
}
|
||||
p++;
|
||||
length--;
|
||||
if ((*p & 0xfc00) != 0xdc00)
|
||||
{
|
||||
*erroroffset = p - string - 1;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string) - 1;
|
||||
return PCRE2_ERROR_UTF16_ERR2;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Isolated low surrogate. Always an error. */
|
||||
*erroroffset = p - string;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string);
|
||||
return PCRE2_ERROR_UTF16_ERR3;
|
||||
}
|
||||
}
|
||||
@ -377,14 +377,14 @@ for (p = string; length > 0; length--, p++)
|
||||
/* Normal UTF-32 code point. Neither high nor low surrogate. */
|
||||
if (c > 0x10ffffu)
|
||||
{
|
||||
*erroroffset = p - string;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string);
|
||||
return PCRE2_ERROR_UTF32_ERR2;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* A surrogate */
|
||||
*erroroffset = p - string;
|
||||
*erroroffset = (PCRE2_SIZE)(p - string);
|
||||
return PCRE2_ERROR_UTF32_ERR1;
|
||||
}
|
||||
}
|
||||
|
@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2022 University of Cambridge
|
||||
New API code Copyright (c) 2016-2023 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -129,6 +129,7 @@ while ((t = *data++) != XCL_END)
|
||||
#ifdef SUPPORT_UNICODE
|
||||
else /* XCL_PROP & XCL_NOTPROP */
|
||||
{
|
||||
int chartype;
|
||||
const ucd_record *prop = GET_UCD(c);
|
||||
BOOL isprop = t == XCL_PROP;
|
||||
BOOL ok;
|
||||
@ -140,8 +141,9 @@ while ((t = *data++) != XCL_END)
|
||||
break;
|
||||
|
||||
case PT_LAMP:
|
||||
if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
|
||||
prop->chartype == ucp_Lt) == isprop) return !negated;
|
||||
chartype = prop->chartype;
|
||||
if ((chartype == ucp_Lu || chartype == ucp_Ll ||
|
||||
chartype == ucp_Lt) == isprop) return !negated;
|
||||
break;
|
||||
|
||||
case PT_GC:
|
||||
@ -164,8 +166,9 @@ while ((t = *data++) != XCL_END)
|
||||
break;
|
||||
|
||||
case PT_ALNUM:
|
||||
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[prop->chartype] == ucp_N) == isprop)
|
||||
chartype = prop->chartype;
|
||||
if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[chartype] == ucp_N) == isprop)
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
@ -190,9 +193,10 @@ while ((t = *data++) != XCL_END)
|
||||
break;
|
||||
|
||||
case PT_WORD:
|
||||
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
|
||||
== isprop)
|
||||
chartype = prop->chartype;
|
||||
if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
|
||||
PRIV(ucp_gentype)[chartype] == ucp_N ||
|
||||
chartype == ucp_Mn || chartype == ucp_Pc) == isprop)
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
@ -234,9 +238,10 @@ while ((t = *data++) != XCL_END)
|
||||
*/
|
||||
|
||||
case PT_PXGRAPH:
|
||||
if ((PRIV(ucp_gentype)[prop->chartype] != ucp_Z &&
|
||||
(PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
|
||||
(prop->chartype == ucp_Cf &&
|
||||
chartype = prop->chartype;
|
||||
if ((PRIV(ucp_gentype)[chartype] != ucp_Z &&
|
||||
(PRIV(ucp_gentype)[chartype] != ucp_C ||
|
||||
(chartype == ucp_Cf &&
|
||||
c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069))
|
||||
)) == isprop)
|
||||
return !negated;
|
||||
@ -246,10 +251,11 @@ while ((t = *data++) != XCL_END)
|
||||
not Zl and not Zp, and U+180E. */
|
||||
|
||||
case PT_PXPRINT:
|
||||
if ((prop->chartype != ucp_Zl &&
|
||||
prop->chartype != ucp_Zp &&
|
||||
(PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
|
||||
(prop->chartype == ucp_Cf &&
|
||||
chartype = prop->chartype;
|
||||
if ((chartype != ucp_Zl &&
|
||||
chartype != ucp_Zp &&
|
||||
(PRIV(ucp_gentype)[chartype] != ucp_C ||
|
||||
(chartype == ucp_Cf &&
|
||||
c != 0x061c && (c < 0x2066 || c > 0x2069))
|
||||
)) == isprop)
|
||||
return !negated;
|
||||
@ -260,8 +266,21 @@ while ((t = *data++) != XCL_END)
|
||||
compatibility (these are $+<=>^`|~). */
|
||||
|
||||
case PT_PXPUNCT:
|
||||
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P ||
|
||||
(c < 128 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop)
|
||||
chartype = prop->chartype;
|
||||
if ((PRIV(ucp_gentype)[chartype] == ucp_P ||
|
||||
(c < 128 && PRIV(ucp_gentype)[chartype] == ucp_S)) == isprop)
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
/* Perl has two sets of hex digits */
|
||||
|
||||
case PT_PXXDIGIT:
|
||||
if (((c >= CHAR_0 && c <= CHAR_9) ||
|
||||
(c >= CHAR_A && c <= CHAR_F) ||
|
||||
(c >= CHAR_a && c <= CHAR_f) ||
|
||||
(c >= 0xff10 && c <= 0xff19) || /* Fullwidth digits */
|
||||
(c >= 0xff21 && c <= 0xff26) || /* Fullwidth letters */
|
||||
(c >= 0xff41 && c <= 0xff46)) == isprop)
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user