pcre2: Version 10.44 (#4478) (#4678)

* update(pcre2): Version 10.44 (#4478)

* update(pcre2): Add new file to VS project files (version 10.44) (#4478)

* fix(RegEx): Use generic pcre2 function names (without suffix _8).

* update(pcre2): Fix configuration (define PCRE2_STATIC) (#4478)
This commit is contained in:
Matej Kenda 2024-10-03 10:49:27 +02:00 committed by GitHub
parent 04fe04e3a4
commit f3975eba96
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
36 changed files with 6060 additions and 4480 deletions

View File

@ -42,6 +42,7 @@ else()
POCO_SOURCES(SRCS pcre2
src/pcre2_auto_possess.c
src/pcre2_chartables.c
src/pcre2_chkdint.c
src/pcre2_compile.c
src/pcre2_config.c
src/pcre2_context.c

View File

@ -1047,6 +1047,7 @@
<ClCompile Include="src\PatternFormatter.cpp" />
<ClCompile Include="src\pcre2_auto_possess.c" />
<ClCompile Include="src\pcre2_chartables.c" />
<ClCompile Include="src\pcre2_chkdint.c" />
<ClCompile Include="src\pcre2_compile.c" />
<ClCompile Include="src\pcre2_config.c" />
<ClCompile Include="src\pcre2_context.c" />

View File

@ -846,6 +846,9 @@
<ClCompile Include="src\pcre2_chartables.c">
<Filter>RegularExpression\PCRE2 Source Files</Filter>
</ClCompile>
<ClCompile Include="src\pcre2_chkdint.c">
<Filter>RegularExpression\PCRE2 Source Files</Filter>
</ClCompile>
<ClCompile Include="src\pcre2_compile.c">
<Filter>RegularExpression\PCRE2 Source Files</Filter>
</ClCompile>

View File

@ -1509,6 +1509,7 @@
<ClCompile Include="src\PatternFormatter.cpp" />
<ClCompile Include="src\pcre2_auto_possess.c" />
<ClCompile Include="src\pcre2_chartables.c" />
<ClCompile Include="src\pcre2_chkdint.c" />
<ClCompile Include="src\pcre2_compile.c" />
<ClCompile Include="src\pcre2_config.c" />
<ClCompile Include="src\pcre2_context.c" />

View File

@ -846,6 +846,9 @@
<ClCompile Include="src\pcre2_chartables.c">
<Filter>RegularExpression\PCRE2 Source Files</Filter>
</ClCompile>
<ClCompile Include="src\pcre2_chkdint.c">
<Filter>RegularExpression\PCRE2 Source Files</Filter>
</ClCompile>
<ClCompile Include="src\pcre2_compile.c">
<Filter>RegularExpression\PCRE2 Source Files</Filter>
</ClCompile>

View File

@ -35,7 +35,7 @@ objects = ArchiveStrategy Ascii ASCIIEncoding AsyncChannel AsyncNotificationCent
zlib_objects = adler32 compress crc32 deflate \
infback inffast inflate inftrees trees zutil
pcre_objects = pcre2_auto_possess pcre2_chartables pcre2_compile pcre2_config \
pcre_objects = pcre2_auto_possess pcre2_chartables pcre2_chkdint pcre2_compile pcre2_config \
pcre2_context pcre2_convert pcre2_dfa_match pcre2_error pcre2_extuni \
pcre2_find_bracket pcre2_jit_compile pcre2_maketables pcre2_match \
pcre2_match_data pcre2_newline pcre2_ord2utf pcre2_pattern_info \

View File

@ -29,34 +29,34 @@ namespace
class MatchData
{
public:
MatchData(pcre2_code_8* code):
_match(pcre2_match_data_create_from_pattern_8(reinterpret_cast<pcre2_code_8*>(code), nullptr))
MatchData(pcre2_code* code):
_match(pcre2_match_data_create_from_pattern(reinterpret_cast<pcre2_code*>(code), nullptr))
{
if (!_match) throw Poco::RegularExpressionException("cannot create match data");
}
~MatchData()
{
if (_match) pcre2_match_data_free_8(_match);
if (_match) pcre2_match_data_free(_match);
}
std::uint32_t count() const
{
return pcre2_get_ovector_count_8(_match);
return pcre2_get_ovector_count(_match);
}
const PCRE2_SIZE* data() const
{
return pcre2_get_ovector_pointer_8(_match);
return pcre2_get_ovector_pointer(_match);
}
operator pcre2_match_data_8*()
operator pcre2_match_data*()
{
return _match;
}
private:
pcre2_match_data_8* _match;
pcre2_match_data* _match;
};
}
@ -72,40 +72,40 @@ RegularExpression::RegularExpression(const std::string& pattern, int options, bo
unsigned nameEntrySize;
unsigned char* nameTable;
pcre2_compile_context_8* context = pcre2_compile_context_create_8(nullptr);
pcre2_compile_context* context = pcre2_compile_context_create(nullptr);
if (!context) throw Poco::RegularExpressionException("cannot create compile context");
if (options & RE_NEWLINE_LF)
pcre2_set_newline_8(context, PCRE2_NEWLINE_LF);
pcre2_set_newline(context, PCRE2_NEWLINE_LF);
else if (options & RE_NEWLINE_CRLF)
pcre2_set_newline_8(context, PCRE2_NEWLINE_CRLF);
pcre2_set_newline(context, PCRE2_NEWLINE_CRLF);
else if (options & RE_NEWLINE_ANY)
pcre2_set_newline_8(context, PCRE2_NEWLINE_ANY);
pcre2_set_newline(context, PCRE2_NEWLINE_ANY);
else if (options & RE_NEWLINE_ANYCRLF)
pcre2_set_newline_8(context, PCRE2_NEWLINE_ANYCRLF);
pcre2_set_newline(context, PCRE2_NEWLINE_ANYCRLF);
else // default RE_NEWLINE_CR
pcre2_set_newline_8(context, PCRE2_NEWLINE_CR);
pcre2_set_newline(context, PCRE2_NEWLINE_CR);
_pcre = pcre2_compile_8(reinterpret_cast<const PCRE2_SPTR>(pattern.c_str()), pattern.length(), compileOptions(options), &errorCode, &errorOffset, context);
pcre2_compile_context_free_8(context);
_pcre = pcre2_compile(reinterpret_cast<const PCRE2_SPTR>(pattern.c_str()), pattern.length(), compileOptions(options), &errorCode, &errorOffset, context);
pcre2_compile_context_free(context);
if (!_pcre)
{
PCRE2_UCHAR buffer[256];
pcre2_get_error_message_8(errorCode, buffer, sizeof(buffer));
pcre2_get_error_message(errorCode, buffer, sizeof(buffer));
std::ostringstream msg;
msg << reinterpret_cast<char*>(buffer) << " (at offset " << errorOffset << ")";
throw RegularExpressionException(msg.str());
}
pcre2_pattern_info_8(reinterpret_cast<pcre2_code_8*>(_pcre), PCRE2_INFO_NAMECOUNT, &nameCount);
pcre2_pattern_info_8(reinterpret_cast<pcre2_code_8*>(_pcre), PCRE2_INFO_NAMEENTRYSIZE, &nameEntrySize);
pcre2_pattern_info_8(reinterpret_cast<pcre2_code_8*>(_pcre), PCRE2_INFO_NAMETABLE, &nameTable);
pcre2_pattern_info(reinterpret_cast<pcre2_code*>(_pcre), PCRE2_INFO_NAMECOUNT, &nameCount);
pcre2_pattern_info(reinterpret_cast<pcre2_code*>(_pcre), PCRE2_INFO_NAMEENTRYSIZE, &nameEntrySize);
pcre2_pattern_info(reinterpret_cast<pcre2_code*>(_pcre), PCRE2_INFO_NAMETABLE, &nameTable);
for (int i = 0; i < nameCount; i++)
{
unsigned char* group = nameTable + 2 + (nameEntrySize * i);
int n = pcre2_substring_number_from_name_8(reinterpret_cast<pcre2_code_8*>(_pcre), group);
int n = pcre2_substring_number_from_name(reinterpret_cast<pcre2_code*>(_pcre), group);
_groups[n] = std::string(reinterpret_cast<char*>(group));
}
}
@ -113,7 +113,7 @@ RegularExpression::RegularExpression(const std::string& pattern, int options, bo
RegularExpression::~RegularExpression()
{
if (_pcre) pcre2_code_free_8(reinterpret_cast<pcre2_code_8*>(_pcre));
if (_pcre) pcre2_code_free(reinterpret_cast<pcre2_code*>(_pcre));
}
@ -121,8 +121,8 @@ int RegularExpression::match(const std::string& subject, std::string::size_type
{
poco_assert (offset <= subject.length());
MatchData matchData(reinterpret_cast<pcre2_code_8*>(_pcre));
int rc = pcre2_match_8(reinterpret_cast<pcre2_code_8*>(_pcre), reinterpret_cast<const PCRE2_SPTR>(subject.c_str()), subject.size(), offset, matchOptions(options), matchData, nullptr);
MatchData matchData(reinterpret_cast<pcre2_code*>(_pcre));
int rc = pcre2_match(reinterpret_cast<pcre2_code*>(_pcre), reinterpret_cast<const PCRE2_SPTR>(subject.c_str()), subject.size(), offset, matchOptions(options), matchData, nullptr);
if (rc == PCRE2_ERROR_NOMATCH)
{
mtch.offset = std::string::npos;
@ -140,7 +140,7 @@ int RegularExpression::match(const std::string& subject, std::string::size_type
else if (rc < 0)
{
PCRE2_UCHAR buffer[256];
pcre2_get_error_message_8(rc, buffer, sizeof(buffer));
pcre2_get_error_message(rc, buffer, sizeof(buffer));
throw RegularExpressionException(std::string(reinterpret_cast<char*>(buffer)));
}
const PCRE2_SIZE* ovec = matchData.data();
@ -156,8 +156,8 @@ int RegularExpression::match(const std::string& subject, std::string::size_type
matches.clear();
MatchData matchData(reinterpret_cast<pcre2_code_8*>(_pcre));
int rc = pcre2_match_8(reinterpret_cast<pcre2_code_8*>(_pcre), reinterpret_cast<const PCRE2_SPTR>(subject.c_str()), subject.size(), offset, options & 0xFFFF, matchData, nullptr);
MatchData matchData(reinterpret_cast<pcre2_code*>(_pcre));
int rc = pcre2_match(reinterpret_cast<pcre2_code*>(_pcre), reinterpret_cast<const PCRE2_SPTR>(subject.c_str()), subject.size(), offset, options & 0xFFFF, matchData, nullptr);
if (rc == PCRE2_ERROR_NOMATCH)
{
return 0;
@ -173,7 +173,7 @@ int RegularExpression::match(const std::string& subject, std::string::size_type
else if (rc < 0)
{
PCRE2_UCHAR buffer[256];
pcre2_get_error_message_8(rc, buffer, sizeof(buffer));
pcre2_get_error_message(rc, buffer, sizeof(buffer));
throw RegularExpressionException(std::string(reinterpret_cast<char*>(buffer)));
}
matches.reserve(rc);
@ -279,8 +279,8 @@ std::string::size_type RegularExpression::substOne(std::string& subject, std::st
{
if (offset >= subject.length()) return std::string::npos;
MatchData matchData(reinterpret_cast<pcre2_code_8*>(_pcre));
int rc = pcre2_match_8(reinterpret_cast<pcre2_code_8*>(_pcre), reinterpret_cast<const PCRE2_SPTR>(subject.c_str()), subject.size(), offset, matchOptions(options), matchData, nullptr);
MatchData matchData(reinterpret_cast<pcre2_code*>(_pcre));
int rc = pcre2_match(reinterpret_cast<pcre2_code*>(_pcre), reinterpret_cast<const PCRE2_SPTR>(subject.c_str()), subject.size(), offset, matchOptions(options), matchData, nullptr);
if (rc == PCRE2_ERROR_NOMATCH)
{
return std::string::npos;
@ -296,7 +296,7 @@ std::string::size_type RegularExpression::substOne(std::string& subject, std::st
else if (rc < 0)
{
PCRE2_UCHAR buffer[256];
pcre2_get_error_message_8(rc, buffer, sizeof(buffer));
pcre2_get_error_message(rc, buffer, sizeof(buffer));
throw RegularExpressionException(std::string(reinterpret_cast<char*>(buffer)));
}
const PCRE2_SIZE* ovec = matchData.data();

View File

@ -29,7 +29,7 @@ void Unicode::properties(int ch, CharacterProperties& props)
{
if (ch > UCP_MAX_CODEPOINT) ch = 0;
const ucd_record* ucd = GET_UCD(ch);
props.category = static_cast<CharacterCategory>(PRIV(ucp_gentype_8)[ucd->chartype]);
props.category = static_cast<CharacterCategory>(PRIV(ucp_gentype)[ucd->chartype]);
props.type = static_cast<CharacterType>(ucd->chartype);
props.script = static_cast<Script>(ucd->script);
}

View File

@ -5,7 +5,7 @@
/* This is the public header file for the PCRE library, second API, to be
#included by applications that call PCRE2 functions.
Copyright (c) 2016-2021 University of Cambridge
Copyright (c) 2016-2024 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
/* The current PCRE version information. */
#define PCRE2_MAJOR 10
#define PCRE2_MINOR 42
#define PCRE2_MINOR 44
#define PCRE2_PRERELEASE
#define PCRE2_DATE 2022-12-11
#define PCRE2_DATE 2024-06-07
/* When an application links to a PCRE DLL in Windows, the symbols that are
imported have to be identified as such. When building PCRE2, the appropriate
@ -153,6 +153,12 @@ D is inspected during pcre2_dfa_match() execution
#define PCRE2_EXTRA_ESCAPED_CR_IS_LF 0x00000010u /* C */
#define PCRE2_EXTRA_ALT_BSUX 0x00000020u /* C */
#define PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK 0x00000040u /* C */
#define PCRE2_EXTRA_CASELESS_RESTRICT 0x00000080u /* C */
#define PCRE2_EXTRA_ASCII_BSD 0x00000100u /* C */
#define PCRE2_EXTRA_ASCII_BSS 0x00000200u /* C */
#define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */
#define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */
#define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */
/* These are for pcre2_jit_compile(). */
@ -180,11 +186,12 @@ pcre2_jit_match() ignores the latter since it bypasses all sanity checks). */
#define PCRE2_SUBSTITUTE_UNSET_EMPTY 0x00000400u /* pcre2_substitute() only */
#define PCRE2_SUBSTITUTE_UNKNOWN_UNSET 0x00000800u /* pcre2_substitute() only */
#define PCRE2_SUBSTITUTE_OVERFLOW_LENGTH 0x00001000u /* pcre2_substitute() only */
#define PCRE2_NO_JIT 0x00002000u /* Not for pcre2_dfa_match() */
#define PCRE2_NO_JIT 0x00002000u /* not for pcre2_dfa_match() */
#define PCRE2_COPY_MATCHED_SUBJECT 0x00004000u
#define PCRE2_SUBSTITUTE_LITERAL 0x00008000u /* pcre2_substitute() only */
#define PCRE2_SUBSTITUTE_MATCHED 0x00010000u /* pcre2_substitute() only */
#define PCRE2_SUBSTITUTE_REPLACEMENT_ONLY 0x00020000u /* pcre2_substitute() only */
#define PCRE2_DISABLE_RECURSELOOP_CHECK 0x00040000u /* not for pcre2_dfa_match() or pcre2_jit_match() */
/* Options for pcre2_pattern_convert(). */
@ -399,6 +406,7 @@ released, the numbers must not be changed. */
#define PCRE2_ERROR_CONVERT_SYNTAX (-64)
#define PCRE2_ERROR_INTERNAL_DUPMATCH (-65)
#define PCRE2_ERROR_DFA_UINVALID_UTF (-66)
#define PCRE2_ERROR_INVALIDOFFSET (-67)
/* Request types for pcre2_pattern_info() */
@ -575,7 +583,7 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION pcre2_config(uint32_t, void *);
PCRE2_EXP_DECL pcre2_general_context *PCRE2_CALL_CONVENTION \
pcre2_general_context_copy(pcre2_general_context *); \
PCRE2_EXP_DECL pcre2_general_context *PCRE2_CALL_CONVENTION \
pcre2_general_context_create(void *(*)(PCRE2_SIZE, void *), \
pcre2_general_context_create(void *(*)(size_t, void *), \
void (*)(void *, void *), void *); \
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
pcre2_general_context_free(pcre2_general_context *);
@ -595,6 +603,10 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
pcre2_set_compile_extra_options(pcre2_compile_context *, uint32_t); \
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
pcre2_set_max_pattern_length(pcre2_compile_context *, PCRE2_SIZE); \
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
pcre2_set_max_pattern_compiled_length(pcre2_compile_context *, PCRE2_SIZE); \
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
pcre2_set_max_varlookbehind(pcre2_compile_context *, uint32_t); \
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
pcre2_set_newline(pcre2_compile_context *, uint32_t); \
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
@ -628,7 +640,7 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
pcre2_set_recursion_limit(pcre2_match_context *, uint32_t); \
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
pcre2_set_recursion_memory_management(pcre2_match_context *, \
void *(*)(PCRE2_SIZE, void *), void (*)(void *, void *), void *);
void *(*)(size_t, void *), void (*)(void *, void *), void *);
#define PCRE2_CONVERT_CONTEXT_FUNCTIONS \
PCRE2_EXP_DECL pcre2_convert_context *PCRE2_CALL_CONVENTION \
@ -687,6 +699,8 @@ PCRE2_EXP_DECL PCRE2_SPTR PCRE2_CALL_CONVENTION \
pcre2_get_mark(pcre2_match_data *); \
PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \
pcre2_get_match_data_size(pcre2_match_data *); \
PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \
pcre2_get_match_data_heapframes_size(pcre2_match_data *); \
PCRE2_EXP_DECL uint32_t PCRE2_CALL_CONVENTION \
pcre2_get_ovector_count(pcre2_match_data *); \
PCRE2_EXP_DECL PCRE2_SIZE *PCRE2_CALL_CONVENTION \
@ -722,7 +736,7 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
pcre2_substring_number_from_name(const pcre2_code *, PCRE2_SPTR); \
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
pcre2_substring_list_free(PCRE2_SPTR *); \
pcre2_substring_list_free(PCRE2_UCHAR **); \
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
pcre2_substring_list_get(pcre2_match_data *, PCRE2_UCHAR ***, PCRE2_SIZE **);
@ -771,7 +785,7 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
pcre2_jit_free_unused_memory(pcre2_general_context *); \
PCRE2_EXP_DECL pcre2_jit_stack *PCRE2_CALL_CONVENTION \
pcre2_jit_stack_create(PCRE2_SIZE, PCRE2_SIZE, pcre2_general_context *); \
pcre2_jit_stack_create(size_t, size_t, pcre2_general_context *); \
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
pcre2_jit_stack_assign(pcre2_match_context *, pcre2_jit_callback, void *); \
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
@ -851,6 +865,7 @@ pcre2_compile are called by application code. */
#define pcre2_general_context_free PCRE2_SUFFIX(pcre2_general_context_free_)
#define pcre2_get_error_message PCRE2_SUFFIX(pcre2_get_error_message_)
#define pcre2_get_mark PCRE2_SUFFIX(pcre2_get_mark_)
#define pcre2_get_match_data_heapframes_size PCRE2_SUFFIX(pcre2_get_match_data_heapframes_size_)
#define pcre2_get_match_data_size PCRE2_SUFFIX(pcre2_get_match_data_size_)
#define pcre2_get_ovector_pointer PCRE2_SUFFIX(pcre2_get_ovector_pointer_)
#define pcre2_get_ovector_count PCRE2_SUFFIX(pcre2_get_ovector_count_)
@ -886,7 +901,9 @@ pcre2_compile are called by application code. */
#define pcre2_set_glob_separator PCRE2_SUFFIX(pcre2_set_glob_separator_)
#define pcre2_set_heap_limit PCRE2_SUFFIX(pcre2_set_heap_limit_)
#define pcre2_set_match_limit PCRE2_SUFFIX(pcre2_set_match_limit_)
#define pcre2_set_max_varlookbehind PCRE2_SUFFIX(pcre2_set_max_varlookbehind_)
#define pcre2_set_max_pattern_length PCRE2_SUFFIX(pcre2_set_max_pattern_length_)
#define pcre2_set_max_pattern_compiled_length PCRE2_SUFFIX(pcre2_set_max_pattern_compiled_length_)
#define pcre2_set_newline PCRE2_SUFFIX(pcre2_set_newline_)
#define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_)
#define pcre2_set_offset_limit PCRE2_SUFFIX(pcre2_set_offset_limit_)

View File

@ -556,6 +556,8 @@ matches to an empty string (also represented by a non-zero value). */
for(;;)
{
PCRE2_SPTR bracode;
/* All operations move the code pointer forward.
Therefore infinite recursions are not possible. */
@ -613,7 +615,8 @@ for(;;)
recursions. (This could be improved by keeping a list of group numbers that
are called by recursion.) */
switch(*(code - GET(code, 1)))
bracode = code - GET(code, 1);
switch(*bracode)
{
case OP_CBRA:
case OP_SCBRA:
@ -632,16 +635,19 @@ for(;;)
break;
/* Atomic sub-patterns and assertions can always auto-possessify their
last iterator. However, if the group was entered as a result of checking
a previous iterator, this is not possible. */
last iterator except for variable length lookbehinds. However, if the
group was entered as a result of checking a previous iterator, this is
not possible. */
case OP_ASSERT:
case OP_ASSERT_NOT:
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
case OP_ONCE:
return !entered_a_group;
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
return (bracode[1+LINK_SIZE] == OP_VREVERSE)? FALSE : !entered_a_group;
/* Non-atomic assertions - don't possessify last iterator. This needs
more thought. */

View File

@ -5,7 +5,8 @@
/* This file was automatically written by the pcre2_dftables auxiliary
program. It contains character tables that are used when no external
tables are passed to PCRE2 by the application that calls it. The tables
are used only for characters whose code values are less than 256. */
are used only for characters whose code values are less than 256, and
only relevant if not in UCP mode. */
/* This set of tables was written in the C locale. */
@ -160,7 +161,7 @@ graph, print, punct, and cntrl. Other classes are built from combinations. */
0x02 letter
0x04 lower case letter
0x08 decimal digit
0x10 alphanumeric or '_'
0x10 word (alphanumeric or '_')
*/
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */

View File

@ -0,0 +1,93 @@
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 2023 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This file contains functions to implement checked integer operation */
#ifndef PCRE2_PCRE2TEST
#include "pcre2_config.h"
#include "pcre2_internal.h"
#endif
/*************************************************
* Checked Integer Multiplication *
*************************************************/
/*
Arguments:
r A pointer to PCRE2_SIZE to store the answer
a, b Two integers
Returns: Bool indicating if the operation overflows
It is modeled after C23's <stdckdint.h> interface
The INT64_OR_DOUBLE type is a 64-bit integer type when available,
otherwise double. */
BOOL
PRIV(ckd_smul)(PCRE2_SIZE *r, int a, int b)
{
#ifdef HAVE_BUILTIN_MUL_OVERFLOW
PCRE2_SIZE m;
if (__builtin_mul_overflow(a, b, &m)) return TRUE;
*r = m;
#else
INT64_OR_DOUBLE m;
#ifdef PCRE2_DEBUG
if (a < 0 || b < 0) abort();
#endif
m = (INT64_OR_DOUBLE)a * (INT64_OR_DOUBLE)b;
#if defined INT64_MAX || defined int64_t
if (sizeof(m) > sizeof(*r) && m > (INT64_OR_DOUBLE)PCRE2_SIZE_MAX) return TRUE;
*r = (PCRE2_SIZE)m;
#else
if (m > PCRE2_SIZE_MAX) return TRUE;
*r = m;
#endif
#endif
return FALSE;
}
/* End of pcre_chkdint.c */

File diff suppressed because it is too large Load Diff

View File

@ -57,9 +57,12 @@ sure both macros are undefined; an emulation function will then be used. */
/* Define this if your compiler supports __attribute__((uninitialized)) */
/* #undef HAVE_ATTRIBUTE_UNINITIALIZED */
/* Define to 1 if you have the `bcopy' function. */
/* Define to 1 if you have the 'bcopy' function. */
/* #undef HAVE_BCOPY */
/* Define this if your compiler provides __builtin_mul_overflow() */
/* #undef HAVE_BUILTIN_MUL_OVERFLOW */
/* Define to 1 if you have the <bzlib.h> header file. */
/* #undef HAVE_BZLIB_H */
@ -81,17 +84,17 @@ sure both macros are undefined; an emulation function will then be used. */
/* Define to 1 if you have the <limits.h> header file. */
/* #undef HAVE_LIMITS_H */
/* Define to 1 if you have the `memfd_create' function. */
/* Define to 1 if you have the 'memfd_create' function. */
/* #undef HAVE_MEMFD_CREATE */
/* Define to 1 if you have the `memmove' function. */
/* Define to 1 if you have the 'memmove' function. */
/* #undef HAVE_MEMMOVE */
#define HAVE_MEMMOVE 1
/* Define to 1 if you have the <minix/config.h> header file. */
/* #undef HAVE_MINIX_CONFIG_H */
/* Define to 1 if you have the `mkostemp' function. */
/* Define to 1 if you have the 'mkostemp' function. */
/* #undef HAVE_MKOSTEMP */
/* Define if you have POSIX threads libraries and header files. */
@ -112,7 +115,7 @@ sure both macros are undefined; an emulation function will then be used. */
/* Define to 1 if you have the `realpath' function. */
/* #undef HAVE_REALPATH */
/* Define to 1 if you have the `secure_getenv' function. */
/* Define to 1 if you have the 'secure_getenv' function. */
/* #undef HAVE_SECURE_GETENV */
/* Define to 1 if you have the <stdint.h> header file. */
@ -124,7 +127,7 @@ sure both macros are undefined; an emulation function will then be used. */
/* Define to 1 if you have the <stdlib.h> header file. */
/* #undef HAVE_STDLIB_H */
/* Define to 1 if you have the `strerror' function. */
/* Define to 1 if you have the 'strerror' function. */
/* #undef HAVE_STRERROR */
/* Define to 1 if you have the <strings.h> header file. */
@ -184,7 +187,7 @@ sure both macros are undefined; an emulation function will then be used. */
matching attempt. The value is also used to limit a loop counter in
pcre2_dfa_match(). There is a runtime interface for setting a different
limit. The limit exists in order to catch runaway regular expressions that
take for ever to determine that they do not match. The default is set very
take forever to determine that they do not match. The default is set very
large so that it does not accidentally catch legitimate cases. */
#ifndef MATCH_LIMIT
#define MATCH_LIMIT 10000000
@ -215,7 +218,13 @@ sure both macros are undefined; an emulation function will then be used. */
Care must be taken if it is increased, because it guards against integer
overflow caused by enormously large patterns. */
#ifndef MAX_NAME_SIZE
#define MAX_NAME_SIZE 32
#define MAX_NAME_SIZE 128
#endif
/* The value of MAX_VARLOOKBEHIND specifies the default maximum length, in
characters, for a variable-length lookbehind assertion. */
#ifndef MAX_VARLOOKBEHIND
#define MAX_VARLOOKBEHIND 255
#endif
/* Defining NEVER_BACKSLASH_C locks out the use of \C in all patterns. */
@ -239,7 +248,7 @@ sure both macros are undefined; an emulation function will then be used. */
#define PACKAGE_NAME "PCRE2"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "PCRE2 10.42"
#define PACKAGE_STRING "PCRE2 10.44"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "pcre2"
@ -248,7 +257,7 @@ sure both macros are undefined; an emulation function will then be used. */
#define PACKAGE_URL ""
/* Define to the version of this package. */
#define PACKAGE_VERSION "10.42"
#define PACKAGE_VERSION "10.44"
/* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested
parentheses (of any kind) in a pattern. This limits the amount of system
@ -278,12 +287,16 @@ sure both macros are undefined; an emulation function will then be used. */
/* Define to any value to include debugging code. */
/* #undef PCRE2_DEBUG */
/* to make a symbol visible */
#define PCRE2_EXPORT
/* If you are compiling for a system other than a Unix-like system or
Win32, and it needs some magic to be inserted before the definition
of a function that is exported by the library, define this macro to
contain the relevant magic. If you do not define this macro, a suitable
__declspec value is used for Windows systems; in other environments
"extern" is used for a C compiler and "extern C" for a C++ compiler.
a compiler relevant "extern" is used with any "visibility" related
attributes from PCRE2_EXPORT included.
This macro apears at the start of every exported function that is part
of the external API. It does not appear on functions that are "external"
in the C sense, but which are internal to the library. */
@ -304,11 +317,14 @@ sure both macros are undefined; an emulation function will then be used. */
unless SUPPORT_JIT is also defined. */
/* #undef SLJIT_PROT_EXECUTABLE_ALLOCATOR */
/* Define to 1 if all of the C90 standard headers exist (not just the ones
/* Define to 1 if all of the C89 standard headers exist (not just the ones
required in a freestanding environment). This macro is provided for
backward compatibility; new code need not use it. */
/* #undef STDC_HEADERS */
/* Define to any value to enable differential fuzzing support. */
/* #undef SUPPORT_DIFF_FUZZ */
/* Define to any value to enable support for Just-In-Time compiling. */
/* #undef SUPPORT_JIT */
@ -357,7 +373,7 @@ sure both macros are undefined; an emulation function will then be used. */
/* Define to any value for valgrind support to find invalid memory reads. */
/* #undef SUPPORT_VALGRIND */
/* Enable extensions on AIX 3, Interix. */
/* Enable extensions on AIX, Interix, z/OS. */
#ifndef _ALL_SOURCE
# define _ALL_SOURCE 1
#endif
@ -418,11 +434,15 @@ sure both macros are undefined; an emulation function will then be used. */
#ifndef __STDC_WANT_IEC_60559_DFP_EXT__
# define __STDC_WANT_IEC_60559_DFP_EXT__ 1
#endif
/* Enable extensions specified by C23 Annex F. */
#ifndef __STDC_WANT_IEC_60559_EXT__
# define __STDC_WANT_IEC_60559_EXT__ 1
#endif
/* Enable extensions specified by ISO/IEC TS 18661-4:2015. */
#ifndef __STDC_WANT_IEC_60559_FUNCS_EXT__
# define __STDC_WANT_IEC_60559_FUNCS_EXT__ 1
#endif
/* Enable extensions specified by ISO/IEC TS 18661-3:2015. */
/* Enable extensions specified by C23 Annex H and ISO/IEC TS 18661-3:2015. */
#ifndef __STDC_WANT_IEC_60559_TYPES_EXT__
# define __STDC_WANT_IEC_60559_TYPES_EXT__ 1
#endif
@ -445,20 +465,26 @@ sure both macros are undefined; an emulation function will then be used. */
#endif
/* Version number of package */
#define VERSION "10.42"
#define VERSION "10.44"
/* Number of bits in a file offset, on hosts where this is settable. */
/* #undef _FILE_OFFSET_BITS */
/* Define for large files, on AIX-style hosts. */
/* Define to 1 on platforms where this makes off_t a 64-bit type. */
/* #undef _LARGE_FILES */
/* Define to empty if `const' does not conform to ANSI C. */
/* Number of bits in time_t, on hosts where this is settable. */
/* #undef _TIME_BITS */
/* Define to 1 on platforms where this makes time_t a 64-bit type. */
/* #undef __MINGW_USE_VC2005_COMPAT */
/* Define to empty if 'const' does not conform to ANSI C. */
/* #undef const */
/* Define to the type of a signed integer type of width exactly 64 bits if
such a type exists and the standard includes do not define it. */
/* #undef int64_t */
/* Define to `unsigned int' if <sys/types.h> does not define. */
/* Define as 'unsigned int' if <stddef.h> doesn't define. */
/* #undef size_t */

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2022 University of Cambridge
New API code Copyright (c) 2016-2024 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -133,10 +133,13 @@ const pcre2_compile_context PRIV(default_compile_context) = {
NULL, /* Stack guard data */
PRIV(default_tables), /* Character tables */
PCRE2_UNSET, /* Max pattern length */
PCRE2_UNSET, /* Max pattern compiled length */
BSR_DEFAULT, /* Backslash R default */
NEWLINE_DEFAULT, /* Newline convention */
PARENS_NEST_LIMIT, /* As it says */
0 }; /* Extra options */
0, /* Extra options */
MAX_VARLOOKBEHIND /* As it says */
};
/* The create function copies the default into the new memory, but must
override the default memory handling functions if a gcontext was provided. */
@ -225,49 +228,48 @@ return ccontext;
PCRE2_EXP_DEFN pcre2_general_context * PCRE2_CALL_CONVENTION
pcre2_general_context_copy(pcre2_general_context *gcontext)
{
pcre2_general_context *new =
pcre2_general_context *newcontext =
gcontext->memctl.malloc(sizeof(pcre2_real_general_context),
gcontext->memctl.memory_data);
if (new == NULL) return NULL;
memcpy(new, gcontext, sizeof(pcre2_real_general_context));
return new;
if (newcontext == NULL) return NULL;
memcpy(newcontext, gcontext, sizeof(pcre2_real_general_context));
return newcontext;
}
PCRE2_EXP_DEFN pcre2_compile_context * PCRE2_CALL_CONVENTION
pcre2_compile_context_copy(pcre2_compile_context *ccontext)
{
pcre2_compile_context *new =
pcre2_compile_context *newcontext =
ccontext->memctl.malloc(sizeof(pcre2_real_compile_context),
ccontext->memctl.memory_data);
if (new == NULL) return NULL;
memcpy(new, ccontext, sizeof(pcre2_real_compile_context));
return new;
if (newcontext == NULL) return NULL;
memcpy(newcontext, ccontext, sizeof(pcre2_real_compile_context));
return newcontext;
}
PCRE2_EXP_DEFN pcre2_match_context * PCRE2_CALL_CONVENTION
pcre2_match_context_copy(pcre2_match_context *mcontext)
{
pcre2_match_context *new =
pcre2_match_context *newcontext =
mcontext->memctl.malloc(sizeof(pcre2_real_match_context),
mcontext->memctl.memory_data);
if (new == NULL) return NULL;
memcpy(new, mcontext, sizeof(pcre2_real_match_context));
return new;
if (newcontext == NULL) return NULL;
memcpy(newcontext, mcontext, sizeof(pcre2_real_match_context));
return newcontext;
}
PCRE2_EXP_DEFN pcre2_convert_context * PCRE2_CALL_CONVENTION
pcre2_convert_context_copy(pcre2_convert_context *ccontext)
{
pcre2_convert_context *new =
pcre2_convert_context *newcontext =
ccontext->memctl.malloc(sizeof(pcre2_real_convert_context),
ccontext->memctl.memory_data);
if (new == NULL) return NULL;
memcpy(new, ccontext, sizeof(pcre2_real_convert_context));
return new;
if (newcontext == NULL) return NULL;
memcpy(newcontext, ccontext, sizeof(pcre2_real_convert_context));
return newcontext;
}
@ -348,6 +350,13 @@ ccontext->max_pattern_length = length;
return 0;
}
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_set_max_pattern_compiled_length(pcre2_compile_context *ccontext, PCRE2_SIZE length)
{
ccontext->max_pattern_compiled_length = length;
return 0;
}
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_set_newline(pcre2_compile_context *ccontext, uint32_t newline)
{
@ -367,6 +376,13 @@ switch(newline)
}
}
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_set_max_varlookbehind(pcre2_compile_context *ccontext, uint32_t limit)
{
ccontext->max_varlookbehind = limit;
return 0;
}
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext, uint32_t limit)
{

View File

@ -537,6 +537,14 @@ Returns: !0 => character is found in the class
static BOOL
convert_glob_char_in_class(int class_index, PCRE2_UCHAR c)
{
#if PCRE2_CODE_UNIT_WIDTH != 8
if (c > 0xff)
{
/* ctype functions are not sane for c > 0xff */
return 0;
}
#endif
switch (class_index)
{
case 1: return isalnum(c);

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2022 University of Cambridge
New API code Copyright (c) 2016-2023 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -166,7 +166,7 @@ static const uint8_t coptable[] = {
0, /* KetRmax */
0, /* KetRmin */
0, /* KetRpos */
0, /* Reverse */
0, 0, /* Reverse, Vreverse */
0, /* Assert */
0, /* Assert not */
0, /* Assert behind */
@ -185,7 +185,8 @@ static const uint8_t coptable[] = {
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
0, 0, /* COMMIT, COMMIT_ARG */
0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
0, 0, 0, /* CLOSE, SKIPZERO, DEFINE */
0, 0 /* \B and \b in UCP mode */
};
/* This table identifies those opcodes that inspect a character. It is used to
@ -243,7 +244,7 @@ static const uint8_t poptable[] = {
0, /* KetRmax */
0, /* KetRmin */
0, /* KetRpos */
0, /* Reverse */
0, 0, /* Reverse, Vreverse */
0, /* Assert */
0, /* Assert not */
0, /* Assert behind */
@ -262,7 +263,8 @@ static const uint8_t poptable[] = {
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
0, 0, /* COMMIT, COMMIT_ARG */
0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
0, 0, 0, /* CLOSE, SKIPZERO, DEFINE */
1, 1 /* \B and \b in UCP mode */
};
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
@ -424,7 +426,7 @@ overflow. */
else
{
uint32_t newsize = (rws->size >= UINT32_MAX/2)? UINT32_MAX/2 : rws->size * 2;
uint32_t newsize = (rws->size >= UINT32_MAX/(sizeof(int)*2))? UINT32_MAX/sizeof(int) : rws->size * 2;
uint32_t newsizeK = newsize/(1024/sizeof(int));
if (newsizeK + mb->heap_used > mb->heap_limit)
@ -587,7 +589,7 @@ if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
end_code = this_start_code;
do
{
size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
size_t back = (size_t)GET2(end_code, 2+LINK_SIZE);
if (back > max_back) max_back = back;
end_code += GET(end_code, 1);
}
@ -631,8 +633,8 @@ if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
end_code = this_start_code;
do
{
uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0;
size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE);
uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + IMM2_SIZE : 0;
size_t back = (revlen == 0)? 0 : (size_t)GET2(end_code, 2+LINK_SIZE);
if (back <= gone_back)
{
int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
@ -1098,6 +1100,8 @@ for (;;)
/*-----------------------------------------------------------------*/
case OP_WORD_BOUNDARY:
case OP_NOT_WORD_BOUNDARY:
case OP_NOT_UCP_WORD_BOUNDARY:
case OP_UCP_WORD_BOUNDARY:
{
int left_word, right_word;
@ -1110,13 +1114,13 @@ for (;;)
#endif
GETCHARTEST(d, temp);
#ifdef SUPPORT_UNICODE
if ((mb->poptions & PCRE2_UCP) != 0)
if (codevalue == OP_UCP_WORD_BOUNDARY ||
codevalue == OP_NOT_UCP_WORD_BOUNDARY)
{
if (d == '_') left_word = TRUE; else
{
uint32_t cat = UCD_CATEGORY(d);
left_word = (cat == ucp_L || cat == ucp_N);
}
int chartype = UCD_CHARTYPE(d);
int category = PRIV(ucp_gentype)[chartype];
left_word = (category == ucp_L || category == ucp_N ||
chartype == ucp_Mn || chartype == ucp_Pc);
}
else
#endif
@ -1135,13 +1139,13 @@ for (;;)
mb->last_used_ptr = temp;
}
#ifdef SUPPORT_UNICODE
if ((mb->poptions & PCRE2_UCP) != 0)
if (codevalue == OP_UCP_WORD_BOUNDARY ||
codevalue == OP_NOT_UCP_WORD_BOUNDARY)
{
if (c == '_') right_word = TRUE; else
{
uint32_t cat = UCD_CATEGORY(c);
right_word = (cat == ucp_L || cat == ucp_N);
}
int chartype = UCD_CHARTYPE(c);
int category = PRIV(ucp_gentype)[chartype];
right_word = (category == ucp_L || category == ucp_N ||
chartype == ucp_Mn || chartype == ucp_Pc);
}
else
#endif
@ -1149,7 +1153,9 @@ for (;;)
}
else right_word = FALSE;
if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
if ((left_word == right_word) ==
(codevalue == OP_NOT_WORD_BOUNDARY ||
codevalue == OP_NOT_UCP_WORD_BOUNDARY))
{ ADD_ACTIVE(state_offset + 1, 0); }
}
break;
@ -1166,6 +1172,7 @@ for (;;)
if (clen > 0)
{
BOOL OK;
int chartype;
const uint32_t *cp;
const ucd_record * prop = GET_UCD(c);
switch(code[1])
@ -1175,8 +1182,9 @@ for (;;)
break;
case PT_LAMP:
OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
prop->chartype == ucp_Lt;
chartype = prop->chartype;
OK = chartype == ucp_Lu || chartype == ucp_Ll ||
chartype == ucp_Lt;
break;
case PT_GC:
@ -1199,8 +1207,9 @@ for (;;)
/* These are specials for combination cases. */
case PT_ALNUM:
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
PRIV(ucp_gentype)[prop->chartype] == ucp_N;
chartype = prop->chartype;
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
PRIV(ucp_gentype)[chartype] == ucp_N;
break;
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
@ -1223,12 +1232,20 @@ for (;;)
break;
case PT_WORD:
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
c == CHAR_UNDERSCORE;
chartype = prop->chartype;
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
PRIV(ucp_gentype)[chartype] == ucp_N ||
chartype == ucp_Mn || chartype == ucp_Pc;
break;
case PT_CLIST:
#if PCRE2_CODE_UNIT_WIDTH == 32
if (c > MAX_UTF_CODE_POINT)
{
OK = FALSE;
break;
}
#endif
cp = PRIV(ucd_caseless_sets) + code[2];
for (;;)
{
@ -1438,6 +1455,7 @@ for (;;)
if (clen > 0)
{
BOOL OK;
int chartype;
const uint32_t *cp;
const ucd_record * prop = GET_UCD(c);
switch(code[2])
@ -1447,8 +1465,8 @@ for (;;)
break;
case PT_LAMP:
OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
prop->chartype == ucp_Lt;
chartype = prop->chartype;
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
break;
case PT_GC:
@ -1471,8 +1489,9 @@ for (;;)
/* These are specials for combination cases. */
case PT_ALNUM:
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
PRIV(ucp_gentype)[prop->chartype] == ucp_N;
chartype = prop->chartype;
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
PRIV(ucp_gentype)[chartype] == ucp_N;
break;
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
@ -1495,12 +1514,20 @@ for (;;)
break;
case PT_WORD:
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
c == CHAR_UNDERSCORE;
chartype = prop->chartype;
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
PRIV(ucp_gentype)[chartype] == ucp_N ||
chartype == ucp_Mn || chartype == ucp_Pc;
break;
case PT_CLIST:
#if PCRE2_CODE_UNIT_WIDTH == 32
if (c > MAX_UTF_CODE_POINT)
{
OK = FALSE;
break;
}
#endif
cp = PRIV(ucd_caseless_sets) + code[3];
for (;;)
{
@ -1693,6 +1720,7 @@ for (;;)
if (clen > 0)
{
BOOL OK;
int chartype;
const uint32_t *cp;
const ucd_record * prop = GET_UCD(c);
switch(code[2])
@ -1702,8 +1730,8 @@ for (;;)
break;
case PT_LAMP:
OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
prop->chartype == ucp_Lt;
chartype = prop->chartype;
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
break;
case PT_GC:
@ -1726,8 +1754,9 @@ for (;;)
/* These are specials for combination cases. */
case PT_ALNUM:
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
PRIV(ucp_gentype)[prop->chartype] == ucp_N;
chartype = prop->chartype;
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
PRIV(ucp_gentype)[chartype] == ucp_N;
break;
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
@ -1750,12 +1779,20 @@ for (;;)
break;
case PT_WORD:
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
c == CHAR_UNDERSCORE;
chartype = prop->chartype;
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
PRIV(ucp_gentype)[chartype] == ucp_N ||
chartype == ucp_Mn || chartype == ucp_Pc;
break;
case PT_CLIST:
#if PCRE2_CODE_UNIT_WIDTH == 32
if (c > MAX_UTF_CODE_POINT)
{
OK = FALSE;
break;
}
#endif
cp = PRIV(ucd_caseless_sets) + code[3];
for (;;)
{
@ -1973,6 +2010,7 @@ for (;;)
if (clen > 0)
{
BOOL OK;
int chartype;
const uint32_t *cp;
const ucd_record * prop = GET_UCD(c);
switch(code[1 + IMM2_SIZE + 1])
@ -1982,8 +2020,8 @@ for (;;)
break;
case PT_LAMP:
OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
prop->chartype == ucp_Lt;
chartype = prop->chartype;
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
break;
case PT_GC:
@ -2007,8 +2045,9 @@ for (;;)
/* These are specials for combination cases. */
case PT_ALNUM:
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
PRIV(ucp_gentype)[prop->chartype] == ucp_N;
chartype = prop->chartype;
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
PRIV(ucp_gentype)[chartype] == ucp_N;
break;
/* Perl space used to exclude VT, but from Perl 5.18 it is included,
@ -2031,12 +2070,20 @@ for (;;)
break;
case PT_WORD:
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
c == CHAR_UNDERSCORE;
chartype = prop->chartype;
OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
PRIV(ucp_gentype)[chartype] == ucp_N ||
chartype == ucp_Mn || chartype == ucp_Pc;
break;
case PT_CLIST:
#if PCRE2_CODE_UNIT_WIDTH == 32
if (c > MAX_UTF_CODE_POINT)
{
OK = FALSE;
break;
}
#endif
cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
for (;;)
{
@ -2892,7 +2939,6 @@ for (;;)
int *local_workspace;
PCRE2_SIZE *local_offsets;
RWS_anchor *rws = (RWS_anchor *)RWS;
dfa_recursion_info *ri;
PCRE2_SPTR callpat = start_code + GET(code, 1);
uint32_t recno = (callpat == mb->start_code)? 0 :
GET2(callpat, 1 + LINK_SIZE);
@ -2909,18 +2955,24 @@ for (;;)
rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
/* Check for repeating a recursion without advancing the subject
pointer. This should catch convoluted mutual recursions. (Some simple
cases are caught at compile time.) */
pointer or last used character. This should catch convoluted mutual
recursions. (Some simple cases are caught at compile time.) */
for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
if (recno == ri->group_num && ptr == ri->subject_position)
for (dfa_recursion_info *ri = mb->recursive;
ri != NULL;
ri = ri->prevrec)
{
if (recno == ri->group_num && ptr == ri->subject_position &&
mb->last_used_ptr == ri->last_used_ptr)
return PCRE2_ERROR_RECURSELOOP;
}
/* Remember this recursion and where we started it so as to
catch infinite loops. */
new_recursive.group_num = recno;
new_recursive.subject_position = ptr;
new_recursive.last_used_ptr = mb->last_used_ptr;
new_recursive.prevrec = mb->recursive;
mb->recursive = &new_recursive;
@ -3422,7 +3474,7 @@ anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
where to start. */
startline = (re->flags & PCRE2_STARTLINE) != 0;
firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0;
bumpalong_limit = end_subject;
/* Initialize and set up the fixed fields in the callout block, with a pointer
@ -3992,8 +4044,9 @@ for (;;)
match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
}
match_data->subject_length = length;
match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
match_data->rightchar = (PCRE2_SIZE)(mb->last_used_ptr - subject);
match_data->startchar = (PCRE2_SIZE)(start_match - subject);
match_data->rc = rc;

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2021 University of Cambridge
New API code Copyright (c) 2016-2024 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -79,7 +79,7 @@ static const unsigned char compile_error_texts[] =
"missing closing parenthesis\0"
/* 15 */
"reference to non-existent subpattern\0"
"pattern passed as NULL\0"
"pattern passed as NULL with non-zero length\0"
"unrecognised compile-time option bit(s)\0"
"missing ) after (?# comment\0"
"parentheses are too deeply nested\0"
@ -90,7 +90,7 @@ static const unsigned char compile_error_texts[] =
"internal error: code overflow\0"
"missing closing parenthesis for condition\0"
/* 25 */
"lookbehind assertion is not fixed length\0"
"length of lookbehind assertion is not limited\0"
"a relative value of zero is not allowed\0"
"conditional subpattern contains more than two branches\0"
"assertion expected after (?( or (?(?C)\0"
@ -184,6 +184,9 @@ static const unsigned char compile_error_texts[] =
"too many capturing groups (maximum 65535)\0"
"atomic assertion expected after (?( or (?(?C)\0"
"\\K is not allowed in lookarounds (but see PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK)\0"
/* 100 */
"branch too long in variable-length lookbehind assertion\0"
"compiled pattern would be longer than the limit set by the application\0"
;
/* Match-time and UTF error texts are in the same format. */
@ -269,6 +272,7 @@ static const unsigned char match_error_texts[] =
/* 65 */
"internal error - duplicate substitution match\0"
"PCRE2_MATCH_INVALID_UTF is not supported for DFA matching\0"
"INTERNAL ERROR: invalid substring offset\0"
;

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2021 University of Cambridge
New API code Copyright (c) 2016-2024 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -71,7 +71,11 @@ return NULL;
* Match an extended grapheme sequence *
*************************************************/
/*
/* NOTE: The logic contained in this function is replicated in three special-
purpose functions in the pcre2_jit_compile.c module. If the logic below is
changed, they must be kept in step so that the interpreter and the JIT have the
same behaviour.
Arguments:
c the first character
eptr pointer to next character
@ -88,6 +92,7 @@ PCRE2_SPTR
PRIV(extuni)(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject,
PCRE2_SPTR end_subject, BOOL utf, int *xcount)
{
BOOL was_ep_ZWJ = FALSE;
int lgb = UCD_GRAPHBREAK(c);
while (eptr < end_subject)
@ -98,6 +103,12 @@ while (eptr < end_subject)
rgb = UCD_GRAPHBREAK(c);
if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
/* ZWJ followed by Extended Pictographic is allowed only if the ZWJ was
preceded by Extended Pictographic. */
if (lgb == ucp_gbZWJ && rgb == ucp_gbExtended_Pictographic && !was_ep_ZWJ)
break;
/* Not breaking between Regional Indicators is allowed only if there
are an even number of preceding RIs. */
@ -125,12 +136,15 @@ while (eptr < end_subject)
if ((ricount & 1) != 0) break; /* Grapheme break required */
}
/* If Extend or ZWJ follows Extended_Pictographic, do not update lgb; this
allows any number of them before a following Extended_Pictographic. */
/* Set a flag when ZWJ follows Extended Pictographic (with optional Extend in
between; see next statement). */
if ((rgb != ucp_gbExtend && rgb != ucp_gbZWJ) ||
lgb != ucp_gbExtended_Pictographic)
lgb = rgb;
was_ep_ZWJ = (lgb == ucp_gbExtended_Pictographic && rgb == ucp_gbZWJ);
/* If Extend follows Extended_Pictographic, do not update lgb; this allows
any number of them before a following ZWJ. */
if (rgb != ucp_gbExtend || lgb != ucp_gbExtended_Pictographic) lgb = rgb;
eptr += len;
if (xcount != NULL) *xcount += 1;

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2018 University of Cambridge
New API code Copyright (c) 2016-2023 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -41,9 +41,9 @@ POSSIBILITY OF SUCH DAMAGE.
/* This module contains a single function that scans through a compiled pattern
until it finds a capturing bracket with the given number, or, if the number is
negative, an instance of OP_REVERSE for a lookbehind. The function is called
from pcre2_compile.c and also from pcre2_study.c when finding the minimum
matching length. */
negative, an instance of OP_REVERSE or OP_VREVERSE for a lookbehind. The
function is called from pcre2_compile.c and also from pcre2_study.c when
finding the minimum matching length. */
#include "pcre2_config.h"
@ -82,7 +82,7 @@ for (;;)
/* Handle lookbehind */
else if (c == OP_REVERSE)
else if (c == OP_REVERSE || c == OP_VREVERSE)
{
if (number < 0) return (PCRE2_UCHAR *)code;
code += PRIV(OP_lengths)[c];

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2022 University of Cambridge
New API code Copyright (c) 2016-2023 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -51,6 +51,24 @@ pcre2test.c with CODE_UNIT_WIDTH == 0. */
#error The use of both EBCDIC and SUPPORT_UNICODE is not supported.
#endif
/* When compiling one of the libraries, the value of PCRE2_CODE_UNIT_WIDTH must
be 8, 16, or 32. AutoTools and CMake ensure that this is always the case, but
other other building methods may not, so here is a check. It is cut out when
building pcre2test, bcause that sets the value to zero. No other source should
be including this file. There is no explicit way of forcing a compile to be
abandoned, but trying to include a non-existent file seems cleanest. Otherwise
there will be many irrelevant consequential errors. */
#if (!defined PCRE2_BUILDING_PCRE2TEST && !defined PCRE2_DFTABLES) && \
(!defined PCRE2_CODE_UNIT_WIDTH || \
(PCRE2_CODE_UNIT_WIDTH != 8 && \
PCRE2_CODE_UNIT_WIDTH != 16 && \
PCRE2_CODE_UNIT_WIDTH != 32))
#error PCRE2_CODE_UNIT_WIDTH must be defined as 8, 16, or 32.
#include <AbandonCompile>
#endif
/* Standard C headers */
#include <ctype.h>
@ -116,20 +134,24 @@ special-purpose environments) might want to stick other stuff in front of
exported symbols. That's why, in the non-Windows case, we set PCRE2_EXP_DEFN
only if it is not already set. */
#if !defined(PCRE2_EXPORT)
#define PCRE2_EXPORT
#endif
#ifndef PCRE2_EXP_DECL
# ifdef _WIN32
# ifndef PCRE2_STATIC
# define PCRE2_EXP_DECL extern __declspec(dllexport)
# define PCRE2_EXP_DEFN __declspec(dllexport)
# else
# define PCRE2_EXP_DECL extern
# define PCRE2_EXP_DECL extern PCRE2_EXPORT
# define PCRE2_EXP_DEFN
# endif
# else
# ifdef __cplusplus
# define PCRE2_EXP_DECL extern "C"
# define PCRE2_EXP_DECL extern "C" PCRE2_EXPORT
# else
# define PCRE2_EXP_DECL extern
# define PCRE2_EXP_DECL extern PCRE2_EXPORT
# endif
# ifndef PCRE2_EXP_DEFN
# define PCRE2_EXP_DEFN PCRE2_EXP_DECL
@ -156,8 +178,8 @@ pcre2_match() because of the way it backtracks. */
#define PCRE2_SPTR CUSTOM_SUBJECT_PTR
#endif
/* When checking for integer overflow in pcre2_compile(), we need to handle
large integers. If a 64-bit integer type is available, we can use that.
/* When checking for integer overflow, we need to handle large integers.
If a 64-bit integer type is available, we can use that.
Otherwise we have to cast to double, which of course requires floating point
arithmetic. Handle this by defining a macro for the appropriate type. */
@ -1281,7 +1303,7 @@ match. */
#define PT_ALNUM 6 /* Alphanumeric - the union of L and N */
#define PT_SPACE 7 /* Perl space - general category Z plus 9,10,12,13 */
#define PT_PXSPACE 8 /* POSIX space - Z plus 9,10,11,12,13 */
#define PT_WORD 9 /* Word - L plus N plus underscore */
#define PT_WORD 9 /* Word - L, N, Mn, or Pc */
#define PT_CLIST 10 /* Pseudo-property: match character list */
#define PT_UCNC 11 /* Universal Character nameable character */
#define PT_BIDICL 12 /* Specified bidi class */
@ -1297,6 +1319,7 @@ table. */
#define PT_PXGRAPH 14 /* [:graph:] - characters that mark the paper */
#define PT_PXPRINT 15 /* [:print:] - [:graph:] plus non-control spaces */
#define PT_PXPUNCT 16 /* [:punct:] - punctuation characters */
#define PT_PXXDIGIT 17 /* [:xdigit:] - hex digits */
/* This value is used when parsing \p and \P escapes to indicate that neither
\p{script:...} nor \p{scx:...} has been encountered. */
@ -1327,6 +1350,12 @@ mode rather than an escape sequence. It is also used for [^] in JavaScript
compatibility mode, and for \C in non-utf mode. In non-DOTALL mode, "." behaves
like \N.
ESC_ub is a special return from check_escape() when, in BSUX mode, \u{ is not
followed by hex digits and }, in which case it should mean a literal "u"
followed by a literal "{". This hack is necessary for cases like \u{ 12}
because without it, this is interpreted as u{12} now that spaces are allowed in
quantifiers.
Negative numbers are used to encode a backreference (\1, \2, \3, etc.) in
check_escape(). There are tests in the code for an escape greater than ESC_b
and less than ESC_Z to detect the types that may be repeated. These are the
@ -1336,7 +1365,7 @@ consume a character, that code will have to change. */
enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z,
ESC_E, ESC_Q, ESC_g, ESC_k };
ESC_E, ESC_Q, ESC_g, ESC_k, ESC_ub };
/********************** Opcode definitions ******************/
@ -1372,8 +1401,8 @@ enum {
OP_SOD, /* 1 Start of data: \A */
OP_SOM, /* 2 Start of match (subject + offset): \G */
OP_SET_SOM, /* 3 Set start of match (\K) */
OP_NOT_WORD_BOUNDARY, /* 4 \B */
OP_WORD_BOUNDARY, /* 5 \b */
OP_NOT_WORD_BOUNDARY, /* 4 \B -- see also OP_NOT_UCP_WORD_BOUNDARY */
OP_WORD_BOUNDARY, /* 5 \b -- see also OP_UCP_WORD_BOUNDARY */
OP_NOT_DIGIT, /* 6 \D */
OP_DIGIT, /* 7 \d */
OP_NOT_WHITESPACE, /* 8 \S */
@ -1547,78 +1576,85 @@ enum {
/* The assertions must come before BRA, CBRA, ONCE, and COND. */
OP_REVERSE, /* 125 Move pointer back - used in lookbehind assertions */
OP_ASSERT, /* 126 Positive lookahead */
OP_ASSERT_NOT, /* 127 Negative lookahead */
OP_ASSERTBACK, /* 128 Positive lookbehind */
OP_ASSERTBACK_NOT, /* 129 Negative lookbehind */
OP_ASSERT_NA, /* 130 Positive non-atomic lookahead */
OP_ASSERTBACK_NA, /* 131 Positive non-atomic lookbehind */
OP_VREVERSE, /* 126 Move pointer back - variable */
OP_ASSERT, /* 127 Positive lookahead */
OP_ASSERT_NOT, /* 128 Negative lookahead */
OP_ASSERTBACK, /* 129 Positive lookbehind */
OP_ASSERTBACK_NOT, /* 130 Negative lookbehind */
OP_ASSERT_NA, /* 131 Positive non-atomic lookahead */
OP_ASSERTBACK_NA, /* 132 Positive non-atomic lookbehind */
/* ONCE, SCRIPT_RUN, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come
immediately after the assertions, with ONCE first, as there's a test for >=
ONCE for a subpattern that isn't an assertion. The POS versions must
immediately follow the non-POS versions in each case. */
OP_ONCE, /* 132 Atomic group, contains captures */
OP_SCRIPT_RUN, /* 133 Non-capture, but check characters' scripts */
OP_BRA, /* 134 Start of non-capturing bracket */
OP_BRAPOS, /* 135 Ditto, with unlimited, possessive repeat */
OP_CBRA, /* 136 Start of capturing bracket */
OP_CBRAPOS, /* 137 Ditto, with unlimited, possessive repeat */
OP_COND, /* 138 Conditional group */
OP_ONCE, /* 133 Atomic group, contains captures */
OP_SCRIPT_RUN, /* 134 Non-capture, but check characters' scripts */
OP_BRA, /* 135 Start of non-capturing bracket */
OP_BRAPOS, /* 136 Ditto, with unlimited, possessive repeat */
OP_CBRA, /* 137 Start of capturing bracket */
OP_CBRAPOS, /* 138 Ditto, with unlimited, possessive repeat */
OP_COND, /* 139 Conditional group */
/* These five must follow the previous five, in the same order. There's a
check for >= SBRA to distinguish the two sets. */
OP_SBRA, /* 139 Start of non-capturing bracket, check empty */
OP_SBRAPOS, /* 149 Ditto, with unlimited, possessive repeat */
OP_SCBRA, /* 141 Start of capturing bracket, check empty */
OP_SCBRAPOS, /* 142 Ditto, with unlimited, possessive repeat */
OP_SCOND, /* 143 Conditional group, check empty */
OP_SBRA, /* 140 Start of non-capturing bracket, check empty */
OP_SBRAPOS, /* 141 Ditto, with unlimited, possessive repeat */
OP_SCBRA, /* 142 Start of capturing bracket, check empty */
OP_SCBRAPOS, /* 143 Ditto, with unlimited, possessive repeat */
OP_SCOND, /* 144 Conditional group, check empty */
/* The next two pairs must (respectively) be kept together. */
OP_CREF, /* 144 Used to hold a capture number as condition */
OP_DNCREF, /* 145 Used to point to duplicate names as a condition */
OP_RREF, /* 146 Used to hold a recursion number as condition */
OP_DNRREF, /* 147 Used to point to duplicate names as a condition */
OP_FALSE, /* 148 Always false (used by DEFINE and VERSION) */
OP_TRUE, /* 149 Always true (used by VERSION) */
OP_CREF, /* 145 Used to hold a capture number as condition */
OP_DNCREF, /* 146 Used to point to duplicate names as a condition */
OP_RREF, /* 147 Used to hold a recursion number as condition */
OP_DNRREF, /* 148 Used to point to duplicate names as a condition */
OP_FALSE, /* 149 Always false (used by DEFINE and VERSION) */
OP_TRUE, /* 150 Always true (used by VERSION) */
OP_BRAZERO, /* 150 These two must remain together and in this */
OP_BRAMINZERO, /* 151 order. */
OP_BRAPOSZERO, /* 152 */
OP_BRAZERO, /* 151 These two must remain together and in this */
OP_BRAMINZERO, /* 152 order. */
OP_BRAPOSZERO, /* 153 */
/* These are backtracking control verbs */
OP_MARK, /* 153 always has an argument */
OP_PRUNE, /* 154 */
OP_PRUNE_ARG, /* 155 same, but with argument */
OP_SKIP, /* 156 */
OP_SKIP_ARG, /* 157 same, but with argument */
OP_THEN, /* 158 */
OP_THEN_ARG, /* 159 same, but with argument */
OP_COMMIT, /* 160 */
OP_COMMIT_ARG, /* 161 same, but with argument */
OP_MARK, /* 154 always has an argument */
OP_PRUNE, /* 155 */
OP_PRUNE_ARG, /* 156 same, but with argument */
OP_SKIP, /* 157 */
OP_SKIP_ARG, /* 158 same, but with argument */
OP_THEN, /* 159 */
OP_THEN_ARG, /* 160 same, but with argument */
OP_COMMIT, /* 161 */
OP_COMMIT_ARG, /* 162 same, but with argument */
/* These are forced failure and success verbs. FAIL and ACCEPT do accept an
argument, but these cases can be compiled as, for example, (*MARK:X)(*FAIL)
without the need for a special opcode. */
OP_FAIL, /* 162 */
OP_ACCEPT, /* 163 */
OP_ASSERT_ACCEPT, /* 164 Used inside assertions */
OP_CLOSE, /* 165 Used before OP_ACCEPT to close open captures */
OP_FAIL, /* 163 */
OP_ACCEPT, /* 164 */
OP_ASSERT_ACCEPT, /* 165 Used inside assertions */
OP_CLOSE, /* 166 Used before OP_ACCEPT to close open captures */
/* This is used to skip a subpattern with a {0} quantifier */
OP_SKIPZERO, /* 166 */
OP_SKIPZERO, /* 167 */
/* This is used to identify a DEFINE group during compilation so that it can
be checked for having only one branch. It is changed to OP_FALSE before
compilation finishes. */
OP_DEFINE, /* 167 */
OP_DEFINE, /* 168 */
/* These opcodes replace their normal counterparts in UCP mode when
PCRE2_EXTRA_ASCII_BSW is not set. */
OP_NOT_UCP_WORD_BOUNDARY, /* 169 */
OP_UCP_WORD_BOUNDARY, /* 170 */
/* This is not an opcode, but is used to check that tables indexed by opcode
are the correct length, in order to catch updating errors - there have been
@ -1664,7 +1700,7 @@ some cases doesn't actually use these names at all). */
"class", "nclass", "xclass", "Ref", "Refi", "DnRef", "DnRefi", \
"Recurse", "Callout", "CalloutStr", \
"Alt", "Ket", "KetRmax", "KetRmin", "KetRpos", \
"Reverse", "Assert", "Assert not", \
"Reverse", "VReverse", "Assert", "Assert not", \
"Assert back", "Assert back not", \
"Non-atomic assert", "Non-atomic assert back", \
"Once", \
@ -1679,7 +1715,7 @@ some cases doesn't actually use these names at all). */
"*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \
"*THEN", "*THEN", "*COMMIT", "*COMMIT", "*FAIL", \
"*ACCEPT", "*ASSERT_ACCEPT", \
"Close", "Skip zero", "Define"
"Close", "Skip zero", "Define", "\\B (ucp)", "\\b (ucp)"
/* This macro defines the length of fixed length operations in the compiled
@ -1746,7 +1782,8 @@ in UTF-8 mode. The code that uses this table must know about such things. */
1+LINK_SIZE, /* KetRmax */ \
1+LINK_SIZE, /* KetRmin */ \
1+LINK_SIZE, /* KetRpos */ \
1+LINK_SIZE, /* Reverse */ \
1+IMM2_SIZE, /* Reverse */ \
1+2*IMM2_SIZE, /* VReverse */ \
1+LINK_SIZE, /* Assert */ \
1+LINK_SIZE, /* Assert not */ \
1+LINK_SIZE, /* Assert behind */ \
@ -1775,7 +1812,8 @@ in UTF-8 mode. The code that uses this table must know about such things. */
1, 3, /* COMMIT, COMMIT_ARG */ \
1, 1, 1, /* FAIL, ACCEPT, ASSERT_ACCEPT */ \
1+IMM2_SIZE, 1, /* CLOSE, SKIPZERO */ \
1 /* DEFINE */
1, /* DEFINE */ \
1, 1 /* \B and \b in UCP mode */
/* A magic value for OP_RREF to indicate the "any recursion" condition. */
@ -2042,6 +2080,9 @@ extern void * _pcre2_memmove(void *, const void *, size_t);
#endif
#endif /* PCRE2_CODE_UNIT_WIDTH */
extern BOOL PRIV(ckd_smul)(PCRE2_SIZE *, int, int);
#endif /* PCRE2_INTERNAL_H_IDEMPOTENT_GUARD */
/* End of pcre2_internal.h */

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2022 University of Cambridge
New API code Copyright (c) 2016-2024 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -568,10 +568,12 @@ typedef struct pcre2_real_compile_context {
void *stack_guard_data;
const uint8_t *tables;
PCRE2_SIZE max_pattern_length;
PCRE2_SIZE max_pattern_compiled_length;
uint16_t bsr_convention;
uint16_t newline_convention;
uint32_t parens_nest_limit;
uint32_t extra_options;
uint32_t max_varlookbehind;
} pcre2_real_compile_context;
/* The real match context structure. */
@ -605,12 +607,12 @@ defined specially because it is required in pcre2_serialize_decode() when
copying the size from possibly unaligned memory into a variable of the same
type. Use a macro rather than a typedef to avoid compiler warnings when this
file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the
largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit
argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field
here.) */
largest lookbehind that is supported. (OP_REVERSE and OP_VREVERSE in a pattern
have 16-bit arguments in 8-bit and 16-bit modes, so we need no more than a
16-bit field here.) */
#undef CODE_BLOCKSIZE_TYPE
#define CODE_BLOCKSIZE_TYPE size_t
#define CODE_BLOCKSIZE_TYPE PCRE2_SIZE
#undef LOOKBEHIND_MAX
#define LOOKBEHIND_MAX UINT16_MAX
@ -658,6 +660,7 @@ typedef struct pcre2_real_match_data {
PCRE2_SPTR mark; /* Pointer to last mark */
struct heapframe *heapframes; /* Backtracking frames heap memory */
PCRE2_SIZE heapframes_size; /* Malloc-ed size */
PCRE2_SIZE subject_length; /* Subject length */
PCRE2_SIZE leftchar; /* Offset to leftmost code unit */
PCRE2_SIZE rightchar; /* Offset to rightmost code unit */
PCRE2_SIZE startchar; /* Offset to starting code unit */
@ -675,8 +678,8 @@ typedef struct pcre2_real_match_data {
#ifndef PCRE2_PCRE2TEST
/* Structures for checking for mutual recursion when scanning compiled or
parsed code. */
/* Structures for checking for mutual function recursion when scanning compiled
or parsed code. */
typedef struct recurse_check {
struct recurse_check *prev;
@ -688,7 +691,7 @@ typedef struct parsed_recurse_check {
uint32_t *groupptr;
} parsed_recurse_check;
/* Structure for building a cache when filling in recursion offsets. */
/* Structure for building a cache when filling in pattern recursion offsets. */
typedef struct recurse_cache {
PCRE2_SPTR group;
@ -734,7 +737,6 @@ typedef struct compile_block {
uint16_t name_entry_size; /* Size of each entry */
uint16_t parens_depth; /* Depth of nested parentheses */
uint16_t assert_depth; /* Depth of nested assertions */
open_capitem *open_caps; /* Chain of open capture items */
named_group *named_groups; /* Points to vector in pre-compile */
uint32_t named_group_list_size; /* Number of entries in the list */
uint32_t external_options; /* External (initial) options */
@ -752,10 +754,11 @@ typedef struct compile_block {
uint32_t class_range_end; /* Overall class range end */
PCRE2_UCHAR nl[4]; /* Newline string when fixed length */
uint32_t req_varyopt; /* "After variable item" flag for reqbyte */
int max_lookbehind; /* Maximum lookbehind (characters) */
uint32_t max_varlookbehind; /* Limit for variable lookbehinds */
int max_lookbehind; /* Maximum lookbehind encountered (characters) */
BOOL had_accept; /* (*ACCEPT) encountered */
BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */
BOOL had_recurse; /* Had a recursion or subroutine call */
BOOL had_recurse; /* Had a pattern recursion or subroutine call */
BOOL dupnames; /* Duplicate names exist */
} compile_block;
@ -773,6 +776,7 @@ call within the pattern when running pcre2_dfa_match(). */
typedef struct dfa_recursion_info {
struct dfa_recursion_info *prevrec;
PCRE2_SPTR subject_position;
PCRE2_SPTR last_used_ptr;
uint32_t group_num;
} dfa_recursion_info;
@ -793,7 +797,7 @@ typedef struct heapframe {
PCRE2_SIZE length; /* Used for character, string, or code lengths */
PCRE2_SIZE back_frame; /* Amount to subtract on RRETURN */
PCRE2_SIZE temp_size; /* Used for short-term PCRE2_SIZE values */
uint32_t rdepth; /* "Recursion" depth */
uint32_t rdepth; /* Function "recursion" depth within pcre2_match() */
uint32_t group_frame_type; /* Type information for group frames */
uint32_t temp_32[4]; /* Used for short-term 32-bit or BOOL values */
uint8_t return_id; /* Where to go on in internal "return" */
@ -829,7 +833,8 @@ typedef struct heapframe {
PCRE2_SPTR eptr; /* MUST BE FIRST */
PCRE2_SPTR start_match; /* Can be adjusted by \K */
PCRE2_SPTR mark; /* Most recent mark on the success path */
uint32_t current_recurse; /* Current (deepest) recursion number */
PCRE2_SPTR recurse_last_used; /* Last character used at time of pattern recursion */
uint32_t current_recurse; /* Group number of current (deepest) pattern recursion */
uint32_t capture_last; /* Most recent capture */
PCRE2_SIZE last_group_offset; /* Saved offset to most recent group frame */
PCRE2_SIZE offset_top; /* Offset after highest capture */
@ -858,7 +863,7 @@ doing traditional NFA matching (pcre2_match() and friends). */
typedef struct match_block {
pcre2_memctl memctl; /* For general use */
PCRE2_SIZE heap_limit; /* As it says */
uint32_t heap_limit; /* As it says */
uint32_t match_limit; /* As it says */
uint32_t match_limit_depth; /* As it says */
uint32_t match_call_count; /* Number of times a new frame is created */
@ -875,10 +880,11 @@ typedef struct match_block {
uint16_t name_count; /* Number of names in name table */
uint16_t name_entry_size; /* Size of entry in names table */
PCRE2_SPTR name_table; /* Table of group names */
PCRE2_SPTR start_code; /* For use when recursing */
PCRE2_SPTR start_code; /* For use in pattern recursion */
PCRE2_SPTR start_subject; /* Start of the subject string */
PCRE2_SPTR check_subject; /* Where UTF-checked from */
PCRE2_SPTR end_subject; /* End of the subject string */
PCRE2_SPTR end_subject; /* Usable end of the subject string */
PCRE2_SPTR true_end_subject; /* Actual end of the subject string */
PCRE2_SPTR end_match_ptr; /* Subject position at end match */
PCRE2_SPTR start_used_ptr; /* Earliest consulted character */
PCRE2_SPTR last_used_ptr; /* Latest consulted character */
@ -886,7 +892,7 @@ typedef struct match_block {
PCRE2_SPTR nomatch_mark; /* Mark pointer to pass back on failure */
PCRE2_SPTR verb_ecode_ptr; /* For passing back info */
PCRE2_SPTR verb_skip_ptr; /* For passing back a (*SKIP) name */
uint32_t verb_current_recurse; /* Current recurse when (*VERB) happens */
uint32_t verb_current_recurse; /* Current recursion group when (*VERB) happens */
uint32_t moptions; /* Match options */
uint32_t poptions; /* Pattern options */
uint32_t skip_arg_count; /* For counting SKIP_ARGs */
@ -911,7 +917,7 @@ typedef struct dfa_match_block {
PCRE2_SPTR last_used_ptr; /* Latest consulted character */
const uint8_t *tables; /* Character tables */
PCRE2_SIZE start_offset; /* The start offset value */
PCRE2_SIZE heap_limit; /* As it says */
uint32_t heap_limit; /* As it says */
PCRE2_SIZE heap_used; /* As it says */
uint32_t match_limit; /* As it says */
uint32_t match_limit_depth; /* As it says */
@ -926,7 +932,7 @@ typedef struct dfa_match_block {
pcre2_callout_block *cb; /* Points to a callout block */
void *callout_data; /* To pass back to callouts */
int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */
dfa_recursion_info *recursive; /* Linked list of recursion data */
dfa_recursion_info *recursive; /* Linked list of pattern recursion data */
} dfa_match_block;
#endif /* PCRE2_PCRE2TEST */

File diff suppressed because it is too large Load Diff

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2018 University of Cambridge
New API code Copyright (c) 2016-2023 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -42,6 +42,12 @@ POSSIBILITY OF SUCH DAMAGE.
#error This file must be included from pcre2_jit_compile.c.
#endif
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#endif /* __has_feature(memory_sanitizer) */
#endif /* defined(__has_feature) */
#ifdef SUPPORT_JIT
static SLJIT_NOINLINE int jit_machine_stack_exec(jit_arguments *arguments, jit_function executable_func)
@ -171,6 +177,7 @@ if (rc > (int)oveccount)
rc = 0;
match_data->code = re;
match_data->subject = (rc >= 0 || rc == PCRE2_ERROR_PARTIAL)? subject : NULL;
match_data->subject_length = length;
match_data->rc = rc;
match_data->startchar = arguments.startchar_ptr - subject;
match_data->leftchar = 0;
@ -178,6 +185,13 @@ match_data->rightchar = 0;
match_data->mark = arguments.mark_ptr;
match_data->matchedby = PCRE2_MATCHEDBY_JIT;
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
if (rc > 0)
__msan_unpoison(match_data->ovector, 2 * rc * sizeof(match_data->ovector[0]));
#endif /* __has_feature(memory_sanitizer) */
#endif /* defined(__has_feature) */
return match_data->rc;
#endif /* SUPPORT_JIT */

View File

@ -141,8 +141,8 @@ if (startsize == 0 || maxsize == 0 || maxsize > SIZE_MAX - STACK_GROWTH_RATE)
return NULL;
if (startsize > maxsize)
startsize = maxsize;
startsize = (startsize + STACK_GROWTH_RATE - 1) & ~(STACK_GROWTH_RATE - 1);
maxsize = (maxsize + STACK_GROWTH_RATE - 1) & ~(STACK_GROWTH_RATE - 1);
startsize = (startsize + STACK_GROWTH_RATE - 1) & (size_t)(~(STACK_GROWTH_RATE - 1));
maxsize = (maxsize + STACK_GROWTH_RATE - 1) & (size_t)(~(STACK_GROWTH_RATE - 1));
jit_stack = PRIV(memctl_malloc)(sizeof(pcre2_real_jit_stack), (pcre2_memctl *)gcontext);
if (jit_stack == NULL) return NULL;

View File

@ -96,7 +96,11 @@ for (i = 0; i < 256; i++) *p++ = tolower(i);
/* Next the case-flipping table */
for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i);
for (i = 0; i < 256; i++)
{
int c = islower(i)? toupper(i) : tolower(i);
*p++ = (c < 256)? c : i;
}
/* Then the character class tables. Don't try to be clever and save effort on
exclusive ones - in some locales things may be different.

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2015-2022 University of Cambridge
New API code Copyright (c) 2015-2024 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -41,6 +41,8 @@ POSSIBILITY OF SUCH DAMAGE.
#include "pcre2_config.h"
#include "pcre2_internal.h"
/* These defines enable debugging code */
/* #define DEBUG_FRAMES_DISPLAY */
@ -51,6 +53,10 @@ POSSIBILITY OF SUCH DAMAGE.
#include <stdarg.h>
#endif
#ifdef DEBUG_SHOW_OPS
static const char *OP_names[] = { OP_NAME_LIST };
#endif
/* These defines identify the name of the block containing "static"
information, and fields within it. */
@ -58,8 +64,6 @@ information, and fields within it. */
#define PSSTART start_subject /* Field containing processed string start */
#define PSEND end_subject /* Field containing processed string end */
#include "pcre2_internal.h"
#define RECURSE_UNSET 0xffffffffu /* Bigger than max group number */
/* Masks for identifying the public options that are permitted at match time. */
@ -67,7 +71,8 @@ information, and fields within it. */
#define PUBLIC_MATCH_OPTIONS \
(PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT|PCRE2_COPY_MATCHED_SUBJECT)
PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT|PCRE2_COPY_MATCHED_SUBJECT| \
PCRE2_DISABLE_RECURSELOOP_CHECK)
#define PUBLIC_JIT_MATCH_OPTIONS \
(PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\
@ -148,7 +153,7 @@ changed, the code at RETURN_SWITCH below must be updated in sync. */
enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
RM31, RM32, RM33, RM34, RM35, RM36 };
RM31, RM32, RM33, RM34, RM35, RM36, RM37 };
#ifdef SUPPORT_WIDE_CHARS
enum { RM100=100, RM101 };
@ -595,11 +600,12 @@ heapframe *P = NULL;
heapframe *frames_top; /* End of frames vector */
heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */
PCRE2_SIZE heapframes_size; /* Usable size of frames vector */
PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */
/* Local variables that do not need to be preserved over calls to RRMATCH(). */
PCRE2_SPTR branch_end = NULL;
PCRE2_SPTR branch_start;
PCRE2_SPTR bracode; /* Temp pointer to start of group */
PCRE2_SIZE offset; /* Used for group offsets */
PCRE2_SIZE length; /* Used for various length calculations */
@ -633,13 +639,10 @@ copied when a new frame is created. */
frame_copy_size = frame_size - offsetof(heapframe, eptr);
/* Set up the first frame and the end of the frames vector. We set the local
heapframes_size to the usuable amount of the vector, that is, a whole number of
frames. */
/* Set up the first frame and the end of the frames vector. */
F = match_data->heapframes;
heapframes_size = (match_data->heapframes_size / frame_size) * frame_size;
frames_top = (heapframe *)((char *)F + heapframes_size);
frames_top = (heapframe *)((char *)F + match_data->heapframes_size);
Frdepth = 0; /* "Recursion" depth */
Fcapture_last = 0; /* Number of most recent capture */
@ -660,35 +663,54 @@ MATCH_RECURSE:
doubling the size, but constrained by the heap limit (which is in KiB). */
N = (heapframe *)((char *)F + frame_size);
if (N >= frames_top)
if ((heapframe *)((char *)N + frame_size) >= frames_top)
{
heapframe *new;
PCRE2_SIZE newsize = match_data->heapframes_size * 2;
PCRE2_SIZE newsize;
PCRE2_SIZE usedsize = (char *)N - (char *)(match_data->heapframes);
if (newsize > mb->heap_limit)
if (match_data->heapframes_size >= PCRE2_SIZE_MAX / 2)
{
PCRE2_SIZE maxsize = (mb->heap_limit/frame_size) * frame_size;
if (match_data->heapframes_size >= maxsize) return PCRE2_ERROR_HEAPLIMIT;
newsize = maxsize;
if (match_data->heapframes_size == PCRE2_SIZE_MAX - 1)
return PCRE2_ERROR_NOMEMORY;
newsize = PCRE2_SIZE_MAX - 1;
}
else
newsize = match_data->heapframes_size * 2;
if (newsize / 1024 >= mb->heap_limit)
{
PCRE2_SIZE old_size = match_data->heapframes_size / 1024;
if (mb->heap_limit <= old_size)
return PCRE2_ERROR_HEAPLIMIT;
else
{
PCRE2_SIZE max_delta = 1024 * (mb->heap_limit - old_size);
int over_bytes = match_data->heapframes_size % 1024;
if (over_bytes) max_delta -= (1024 - over_bytes);
newsize = match_data->heapframes_size + max_delta;
}
}
/* With a heap limit set, the permitted additional size may not be enough for
another frame, so do a final check. */
if (newsize - usedsize < frame_size) return PCRE2_ERROR_HEAPLIMIT;
new = match_data->memctl.malloc(newsize, match_data->memctl.memory_data);
if (new == NULL) return PCRE2_ERROR_NOMEMORY;
memcpy(new, match_data->heapframes, heapframes_size);
memcpy(new, match_data->heapframes, usedsize);
F = (heapframe *)((char *)new + ((char *)F - (char *)match_data->heapframes));
N = (heapframe *)((char *)F + frame_size);
N = (heapframe *)((char *)new + usedsize);
F = (heapframe *)((char *)N - frame_size);
match_data->memctl.free(match_data->heapframes, match_data->memctl.memory_data);
match_data->heapframes = new;
match_data->heapframes_size = newsize;
heapframes_size = (newsize / frame_size) * frame_size;
frames_top = (heapframe *)((char *)new + heapframes_size);
frames_top = (heapframe *)((char *)new + newsize);
}
#ifdef DEBUG_SHOW_RMATCH
fprintf(stderr, "++ RMATCH %2d frame=%d", Freturn_id, Frdepth + 1);
fprintf(stderr, "++ RMATCH %d frame=%d", Freturn_id, Frdepth + 1);
if (group_frame_type != 0)
{
fprintf(stderr, " type=%x ", group_frame_type);
@ -758,10 +780,16 @@ opcodes. */
if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
if (Frdepth >= mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
#ifdef DEBUG_SHOW_OPS
fprintf(stderr, "\n++ New frame: type=0x%x subject offset %ld\n",
GF_IDMASK(Fgroup_frame_type), Feptr - mb->start_subject);
#endif
for (;;)
{
#ifdef DEBUG_SHOW_OPS
fprintf(stderr, "++ op=%d\n", *Fecode);
fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
OP_names[*Fecode]);
#endif
Fop = (uint8_t)(*Fecode); /* Cast needed for 16-bit and 32-bit modes */
@ -809,15 +837,16 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
assert_accept_frame = F;
RRETURN(MATCH_ACCEPT);
/* If recursing, we have to find the most recent recursion. */
/* For ACCEPT within a recursion, we have to find the most recent
recursion. If not in a recursion, fall through to code that is common with
OP_END. */
case OP_ACCEPT:
case OP_END:
/* Handle end of a recursion. */
if (Fcurrent_recurse != RECURSE_UNSET)
{
#ifdef DEBUG_SHOW_OPS
fprintf(stderr, "++ Accept within recursion\n");
#endif
offset = Flast_group_offset;
for(;;)
{
@ -840,27 +869,49 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
Fecode += 1 + LINK_SIZE;
continue;
}
/* Fall through */
/* Not a recursion. Fail for an empty string match if either PCRE2_NOTEMPTY
is set, or if PCRE2_NOTEMPTY_ATSTART is set and we have matched at the
start of the subject. In both cases, backtracking will then try other
alternatives, if any. */
/* OP_END itself can never be reached within a recursion because that is
picked up when the OP_KET that always precedes OP_END is reached. */
case OP_END:
/* Fail for an empty string match if either PCRE2_NOTEMPTY is set, or if
PCRE2_NOTEMPTY_ATSTART is set and we have matched at the start of the
subject. In both cases, backtracking will then try other alternatives, if
any. */
if (Feptr == Fstart_match &&
((mb->moptions & PCRE2_NOTEMPTY) != 0 ||
((mb->moptions & PCRE2_NOTEMPTY_ATSTART) != 0 &&
Fstart_match == mb->start_subject + mb->start_offset)))
{
#ifdef DEBUG_SHOW_OPS
fprintf(stderr, "++ Backtrack because empty string\n");
#endif
RRETURN(MATCH_NOMATCH);
}
/* Also fail if PCRE2_ENDANCHORED is set and the end of the match is not
/* Fail if PCRE2_ENDANCHORED is set and the end of the match is not
the end of the subject. After (*ACCEPT) we fail the entire match (at this
position) but backtrack on reaching the end of the pattern. */
position) but backtrack if we've reached the end of the pattern. This
applies whether or not we are in a recursion. */
if (Feptr < mb->end_subject &&
((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0)
{
if (Fop == OP_END) RRETURN(MATCH_NOMATCH);
return MATCH_NOMATCH;
if (Fop == OP_END)
{
#ifdef DEBUG_SHOW_OPS
fprintf(stderr, "++ Backtrack because not at end (endanchored set)\n");
#endif
RRETURN(MATCH_NOMATCH);
}
#ifdef DEBUG_SHOW_OPS
fprintf(stderr, "++ Failed ACCEPT not at end (endanchnored set)\n");
#endif
return MATCH_NOMATCH; /* (*ACCEPT) */
}
/* We have a successful match of the whole pattern. Record the result and
@ -2433,6 +2484,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
GETCHARINCTEST(fc, Feptr);
{
const uint32_t *cp;
uint32_t chartype;
const ucd_record *prop = GET_UCD(fc);
BOOL notmatch = Fop == OP_NOTPROP;
@ -2443,9 +2495,10 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
break;
case PT_LAMP:
if ((prop->chartype == ucp_Lu ||
prop->chartype == ucp_Ll ||
prop->chartype == ucp_Lt) == notmatch)
chartype = prop->chartype;
if ((chartype == ucp_Lu ||
chartype == ucp_Ll ||
chartype == ucp_Lt) == notmatch)
RRETURN(MATCH_NOMATCH);
break;
@ -2475,8 +2528,9 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
/* These are specials */
case PT_ALNUM:
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
PRIV(ucp_gentype)[prop->chartype] == ucp_N) == notmatch)
chartype = prop->chartype;
if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
PRIV(ucp_gentype)[chartype] == ucp_N) == notmatch)
RRETURN(MATCH_NOMATCH);
break;
@ -2501,13 +2555,22 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
break;
case PT_WORD:
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
fc == CHAR_UNDERSCORE) == notmatch)
chartype = prop->chartype;
if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
PRIV(ucp_gentype)[chartype] == ucp_N ||
chartype == ucp_Mn ||
chartype == ucp_Pc) == notmatch)
RRETURN(MATCH_NOMATCH);
break;
case PT_CLIST:
#if PCRE2_CODE_UNIT_WIDTH == 32
if (fc > MAX_UTF_CODE_POINT)
{
if (notmatch) break;;
RRETURN(MATCH_NOMATCH);
}
#endif
cp = PRIV(ucd_caseless_sets) + Fecode[2];
for (;;)
{
@ -2803,16 +2866,17 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
case PT_WORD:
for (i = 1; i <= Lmin; i++)
{
int category;
int chartype, category;
if (Feptr >= mb->end_subject)
{
SCHECK_PARTIAL();
RRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(fc, Feptr);
category = UCD_CATEGORY(fc);
chartype = UCD_CHARTYPE(fc);
category = PRIV(ucp_gentype)[chartype];
if ((category == ucp_L || category == ucp_N ||
fc == CHAR_UNDERSCORE) == notmatch)
chartype == ucp_Mn || chartype == ucp_Pc) == notmatch)
RRETURN(MATCH_NOMATCH);
}
break;
@ -2827,6 +2891,13 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
RRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(fc, Feptr);
#if PCRE2_CODE_UNIT_WIDTH == 32
if (fc > MAX_UTF_CODE_POINT)
{
if (notmatch) continue;
RRETURN(MATCH_NOMATCH);
}
#endif
cp = PRIV(ucd_caseless_sets) + Lpropvalue;
for (;;)
{
@ -3607,7 +3678,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
case PT_WORD:
for (;;)
{
int category;
int chartype, category;
RMATCH(Fecode, RM215);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (Lmin++ >= Lmax) RRETURN(MATCH_NOMATCH);
@ -3617,10 +3688,12 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
RRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(fc, Feptr);
category = UCD_CATEGORY(fc);
chartype = UCD_CHARTYPE(fc);
category = PRIV(ucp_gentype)[chartype];
if ((category == ucp_L ||
category == ucp_N ||
fc == CHAR_UNDERSCORE) == (Lctype == OP_NOTPROP))
chartype == ucp_Mn ||
chartype == ucp_Pc) == (Lctype == OP_NOTPROP))
RRETURN(MATCH_NOMATCH);
}
/* Control never gets here */
@ -3638,6 +3711,13 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
RRETURN(MATCH_NOMATCH);
}
GETCHARINCTEST(fc, Feptr);
#if PCRE2_CODE_UNIT_WIDTH == 32
if (fc > MAX_UTF_CODE_POINT)
{
if (Lctype == OP_NOTPROP) continue;
RRETURN(MATCH_NOMATCH);
}
#endif
cp = PRIV(ucd_caseless_sets) + Lpropvalue;
for (;;)
{
@ -4188,7 +4268,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
case PT_WORD:
for (i = Lmin; i < Lmax; i++)
{
int category;
int chartype, category;
int len = 1;
if (Feptr >= mb->end_subject)
{
@ -4196,9 +4276,12 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
break;
}
GETCHARLENTEST(fc, Feptr, len);
category = UCD_CATEGORY(fc);
if ((category == ucp_L || category == ucp_N ||
fc == CHAR_UNDERSCORE) == notmatch)
chartype = UCD_CHARTYPE(fc);
category = PRIV(ucp_gentype)[chartype];
if ((category == ucp_L ||
category == ucp_N ||
chartype == ucp_Mn ||
chartype == ucp_Pc) == notmatch)
break;
Feptr+= len;
}
@ -4215,6 +4298,14 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
break;
}
GETCHARLENTEST(fc, Feptr, len);
#if PCRE2_CODE_UNIT_WIDTH == 32
if (fc > MAX_UTF_CODE_POINT)
{
if (!notmatch) goto GOT_MAX;
}
else
#endif
{
cp = PRIV(ucd_caseless_sets) + Lpropvalue;
for (;;)
{
@ -4223,6 +4314,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (fc == *cp++)
{ if (notmatch) goto GOT_MAX; else break; }
}
}
Feptr += len;
}
GOT_MAX:
@ -5320,9 +5413,11 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
/* ===================================================================== */
/* Recursion either matches the current regex, or some subexpression. The
offset data is the offset to the starting bracket from the start of the
whole pattern. (This is so that it works from duplicated subpatterns.) */
/* Pattern recursion either matches the current regex, or some
subexpression. The offset data is the offset to the starting bracket from
the start of the whole pattern. This is so that it works from duplicated
subpatterns. For a whole-pattern recursion, we have to infer the number
zero. */
#define Lframe_type F->temp_32[0]
#define Lstart_branch F->temp_sptr[0]
@ -5331,9 +5426,12 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
bracode = mb->start_code + GET(Fecode, 1);
number = (bracode == mb->start_code)? 0 : GET2(bracode, 1 + LINK_SIZE);
/* If we are already in a recursion, check for repeating the same one
without advancing the subject pointer. This should catch convoluted mutual
recursions. (Some simple cases are caught at compile time.) */
/* If we are already in a pattern recursion, check for repeating the same
one without changing the subject pointer or the last referenced character
in the subject. This should catch convoluted mutual recursions; some
simple cases are caught at compile time. However, there are rare cases when
this check needs to be turned off. In this case, actual recursion loops
will be caught by the match or heap limits. */
if (Fcurrent_recurse != RECURSE_UNSET)
{
@ -5344,15 +5442,19 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
P = (heapframe *)((char *)N - frame_size);
if (N->group_frame_type == (GF_RECURSE | number))
{
if (Feptr == P->eptr) return PCRE2_ERROR_RECURSELOOP;
if (Feptr == P->eptr && mb->last_used_ptr == P->recurse_last_used &&
(mb->moptions & PCRE2_DISABLE_RECURSELOOP_CHECK) == 0)
return PCRE2_ERROR_RECURSELOOP;
break;
}
offset = P->last_group_offset;
}
}
/* Now run the recursion, branch by branch. */
/* Remember the current last referenced character and then run the
recursion branch by branch. */
F->recurse_last_used = mb->last_used_ptr;
Lstart_branch = bracode;
Lframe_type = GF_RECURSE | number;
@ -5681,13 +5783,13 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
/* ===================================================================== */
/* Move the subject pointer back. This occurs only at the start of each
branch of a lookbehind assertion. If we are too close to the start to move
back, fail. When working with UTF-8 we move back a number of characters,
not bytes. */
/* Move the subject pointer back by one fixed amount. This occurs at the
start of each branch that has a fixed length in a lookbehind assertion. If
we are too close to the start to move back, fail. When working with UTF-8
we move back a number of characters, not bytes. */
case OP_REVERSE:
number = GET(Fecode, 1);
number = GET2(Fecode, 1);
#ifdef SUPPORT_UNICODE
if (utf)
{
@ -5701,7 +5803,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
else
#endif
/* No UTF-8 support, or not in UTF-8 mode: count is code unit count */
/* No UTF support, or not in UTF mode: count is code unit count */
{
if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH);
@ -5711,15 +5813,84 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
/* Save the earliest consulted character, then skip to next opcode */
if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr;
Fecode += 1 + LINK_SIZE;
Fecode += 1 + IMM2_SIZE;
break;
/* ===================================================================== */
/* Move the subject pointer back by a variable amount. This occurs at the
start of each branch of a lookbehind assertion when the branch has a
variable, but limited, length. A loop is needed to try matching the branch
after moving back different numbers of characters. If we are too close to
the start to move back even the minimum amount, fail. When working with
UTF-8 we move back a number of characters, not bytes. */
#define Lmin F->temp_32[0]
#define Lmax F->temp_32[1]
#define Leptr F->temp_sptr[0]
case OP_VREVERSE:
Lmin = GET2(Fecode, 1);
Lmax = GET2(Fecode, 1 + IMM2_SIZE);
Leptr = Feptr;
/* Move back by the maximum branch length and then work forwards. This
ensures that items such as \d{3,5} get the maximum length, which is
relevant for captures, and makes for Perl compatibility. */
#ifdef SUPPORT_UNICODE
if (utf)
{
for (i = 0; i < Lmax; i++)
{
if (Feptr == mb->start_subject)
{
if (i < Lmin) RRETURN(MATCH_NOMATCH);
Lmax = i;
break;
}
Feptr--;
BACKCHAR(Feptr);
}
}
else
#endif
/* No UTF support or not in UTF mode */
{
ptrdiff_t diff = Feptr - mb->start_subject;
uint32_t available = (diff > 65535)? 65535 : ((diff > 0)? (int)diff : 0);
if (Lmin > available) RRETURN(MATCH_NOMATCH);
if (Lmax > available) Lmax = available;
Feptr -= Lmax;
}
/* Now try matching, moving forward one character on failure, until we
reach the mimimum back length. */
for (;;)
{
RMATCH(Fecode + 1 + 2 * IMM2_SIZE, RM37);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (Lmax-- <= Lmin) RRETURN(MATCH_NOMATCH);
Feptr++;
#ifdef SUPPORT_UNICODE
if (utf) { FORWARDCHARTEST(Feptr, mb->end_subject); }
#endif
}
/* Control never reaches here */
#undef Lmin
#undef Lmax
#undef Leptr
/* ===================================================================== */
/* An alternation is the end of a branch; scan along to find the end of the
bracketed group. */
case OP_ALT:
branch_end = Fecode;
do Fecode += GET(Fecode,1); while (*Fecode == OP_ALT);
break;
@ -5727,7 +5898,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
/* ===================================================================== */
/* The end of a parenthesized group. For all but OP_BRA and OP_COND, the
starting frame was added to the chained frames in order to remember the
starting subject position for the group. */
starting subject position for the group. (Not true for OP_BRA when it's a
whole pattern recursion, but that is handled separately below.)*/
case OP_KET:
case OP_KETRMIN:
@ -5736,8 +5908,14 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
bracode = Fecode - GET(Fecode, 1);
/* Point N to the frame at the start of the most recent group.
Remember the subject pointer at the start of the group. */
if (branch_end == NULL) branch_end = Fecode;
branch_start = bracode;
while (branch_start + GET(branch_start, 1) != branch_end)
branch_start += GET(branch_start, 1);
branch_end = NULL;
/* Point N to the frame at the start of the most recent group, and P to its
predecessor. Remember the subject pointer at the start of the group. */
if (*bracode != OP_BRA && *bracode != OP_COND)
{
@ -5773,27 +5951,64 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
switch (*bracode)
{
case OP_BRA: /* No need to do anything for these */
case OP_COND:
/* Whole pattern recursion is handled as a recursion into group 0, but
the entire pattern is wrapped in OP_BRA/OP_KET rather than a capturing
group - a design mistake: it should perhaps have been capture group 0.
Anyway, that means the end of such recursion must be handled here. It is
detected by checking for an immediately following OP_END when we are
recursing in group 0. If this is not the end of a whole-pattern
recursion, there is nothing to be done. */
case OP_BRA:
if (Fcurrent_recurse != 0 || Fecode[1+LINK_SIZE] != OP_END) break;
/* It is the end of whole-pattern recursion. */
offset = Flast_group_offset;
if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
N = (heapframe *)((char *)match_data->heapframes + offset);
P = (heapframe *)((char *)N - frame_size);
Flast_group_offset = P->last_group_offset;
/* Reinstate the previous set of captures and then carry on after the
recursion call. */
memcpy((char *)F + offsetof(heapframe, ovector), P->ovector,
Foffset_top * sizeof(PCRE2_SIZE));
Foffset_top = P->offset_top;
Fcapture_last = P->capture_last;
Fcurrent_recurse = P->current_recurse;
Fecode = P->ecode + 1 + LINK_SIZE;
continue; /* With next opcode */
case OP_COND: /* No need to do anything for these */
case OP_SCOND:
break;
/* Non-atomic positive assertions are like OP_BRA, except that the
subject pointer must be put back to where it was at the start of the
assertion. */
assertion. For a variable lookbehind, check its end point. */
case OP_ASSERTBACK_NA:
if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)
RRETURN(MATCH_NOMATCH);
/* Fall through */
case OP_ASSERT_NA:
case OP_ASSERTBACK_NA:
if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
Feptr = P->eptr;
break;
/* Atomic positive assertions are like OP_ONCE, except that in addition
the subject pointer must be put back to where it was at the start of the
assertion. */
assertion. For a variable lookbehind, check its end point. */
case OP_ASSERTBACK:
if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)
RRETURN(MATCH_NOMATCH);
/* Fall through */
case OP_ASSERT:
case OP_ASSERTBACK:
if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
Feptr = P->eptr;
/* Fall through */
@ -5814,10 +6029,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
break;
/* A matching negative assertion returns MATCH, which is turned into
NOMATCH at the assertion level. */
NOMATCH at the assertion level. For a variable lookbehind, check its end
point. */
case OP_ASSERTBACK_NOT:
if (branch_start[1 + LINK_SIZE] == OP_VREVERSE && Feptr != P->eptr)
RRETURN(MATCH_NOMATCH);
/* Fall through */
case OP_ASSERT_NOT:
case OP_ASSERTBACK_NOT:
RRETURN(MATCH_MATCH);
/* At the end of a script run, apply the script-checking rules. This code
@ -5828,9 +6048,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH);
break;
/* Whole-pattern recursion is coded as a recurse into group 0, so it
won't be picked up here. Instead, we catch it when the OP_END is reached.
Other recursion is handled here. */
/* Whole-pattern recursion is coded as a recurse into group 0, and is
handled with OP_BRA above. Other recursion is handled here. */
case OP_CBRA:
case OP_CBRAPOS:
@ -5845,7 +6064,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
{
P = (heapframe *)((char *)N - frame_size);
memcpy((char *)F + offsetof(heapframe, ovector), P->ovector,
P->offset_top * sizeof(PCRE2_SIZE));
Foffset_top * sizeof(PCRE2_SIZE));
Foffset_top = P->offset_top;
Fcapture_last = P->capture_last;
Fcurrent_recurse = P->current_recurse;
@ -5928,10 +6147,10 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
if ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0) goto ASSERT_NL_OR_EOS;
/* Fall through */
/* Unconditional end of subject assertion (\z) */
/* Unconditional end of subject assertion (\z). */
case OP_EOD:
if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH);
if (Feptr < mb->true_end_subject) RRETURN(MATCH_NOMATCH);
if (mb->partial != 0)
{
mb->hitend = TRUE;
@ -6043,6 +6262,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
case OP_NOT_WORD_BOUNDARY:
case OP_WORD_BOUNDARY:
case OP_NOT_UCP_WORD_BOUNDARY:
case OP_UCP_WORD_BOUNDARY:
if (Feptr == mb->check_subject) prev_is_word = FALSE; else
{
PCRE2_SPTR lastptr = Feptr - 1;
@ -6057,13 +6278,12 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
fc = *lastptr;
if (lastptr < mb->start_used_ptr) mb->start_used_ptr = lastptr;
#ifdef SUPPORT_UNICODE
if ((mb->poptions & PCRE2_UCP) != 0)
if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY)
{
if (fc == '_') prev_is_word = TRUE; else
{
int cat = UCD_CATEGORY(fc);
prev_is_word = (cat == ucp_L || cat == ucp_N);
}
int chartype = UCD_CHARTYPE(fc);
int category = PRIV(ucp_gentype)[chartype];
prev_is_word = (category == ucp_L || category == ucp_N ||
chartype == ucp_Mn || chartype == ucp_Pc);
}
else
#endif /* SUPPORT_UNICODE */
@ -6091,13 +6311,12 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
fc = *Feptr;
if (nextptr > mb->last_used_ptr) mb->last_used_ptr = nextptr;
#ifdef SUPPORT_UNICODE
if ((mb->poptions & PCRE2_UCP) != 0)
if (Fop == OP_UCP_WORD_BOUNDARY || Fop == OP_NOT_UCP_WORD_BOUNDARY)
{
if (fc == '_') cur_is_word = TRUE; else
{
int cat = UCD_CATEGORY(fc);
cur_is_word = (cat == ucp_L || cat == ucp_N);
}
int chartype = UCD_CHARTYPE(fc);
int category = PRIV(ucp_gentype)[chartype];
cur_is_word = (category == ucp_L || category == ucp_N ||
chartype == ucp_Mn || chartype == ucp_Pc);
}
else
#endif /* SUPPORT_UNICODE */
@ -6106,7 +6325,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
/* Now see if the situation is what we want */
if ((*Fecode++ == OP_WORD_BOUNDARY)?
if ((*Fecode++ == OP_WORD_BOUNDARY || Fop == OP_UCP_WORD_BOUNDARY)?
cur_is_word == prev_is_word : cur_is_word != prev_is_word)
RRETURN(MATCH_NOMATCH);
break;
@ -6252,7 +6471,7 @@ F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */
mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */
#ifdef DEBUG_SHOW_RMATCH
fprintf(stderr, "++ RETURN %d to %d\n", rrc, Freturn_id);
fprintf(stderr, "++ RETURN %d to RM%d\n", rrc, Freturn_id);
#endif
switch (Freturn_id)
@ -6261,7 +6480,7 @@ switch (Freturn_id)
LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
LBL(33) LBL(34) LBL(35) LBL(36)
LBL(33) LBL(34) LBL(35) LBL(36) LBL(37)
#ifdef SUPPORT_WIDE_CHARS
LBL(100) LBL(101)
@ -6549,6 +6768,7 @@ if (use_jit)
match_data, mcontext);
if (rc != PCRE2_ERROR_JIT_BADOPTION)
{
match_data->subject_length = length;
if (rc >= 0 && (options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
{
length = CU2BYTES(length + was_zero_terminated);
@ -6717,7 +6937,7 @@ if (mcontext == NULL)
else mb->memctl = mcontext->memctl;
anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0;
firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0;
startline = (re->flags & PCRE2_STARTLINE) != 0;
bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)?
true_end_subject : subject + mcontext->offset_limit;
@ -6740,6 +6960,7 @@ mb->callout_data = mcontext->callout_data;
mb->start_subject = subject;
mb->start_offset = start_offset;
mb->end_subject = end_subject;
mb->true_end_subject = true_end_subject;
mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
mb->allowemptypartial = (re->max_lookbehind > 0) ||
(re->flags & PCRE2_MATCH_EMPTY) != 0;
@ -6799,7 +7020,7 @@ the pattern. It is not used at all if there are no capturing parentheses.
frame_size is the total size of each frame
match_data->heapframes is the pointer to the frames vector
match_data->heapframes_size is the total size of the vector
match_data->heapframes_size is the allocated size of the vector
We must pad the frame_size for alignment to ensure subsequent frames are as
aligned as heapframe. Whilst ovector is word-aligned due to being a PCRE2_SIZE
@ -6814,7 +7035,7 @@ frame_size = (offsetof(heapframe, ovector) +
smaller. */
mb->heap_limit = ((mcontext->heap_limit < re->limit_heap)?
mcontext->heap_limit : re->limit_heap) * 1024;
mcontext->heap_limit : re->limit_heap);
mb->match_limit = (mcontext->match_limit < re->limit_match)?
mcontext->match_limit : re->limit_match;
@ -6825,19 +7046,19 @@ mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)?
/* If a pattern has very many capturing parentheses, the frame size may be very
large. Set the initial frame vector size to ensure that there are at least 10
available frames, but enforce a minimum of START_FRAMES_SIZE. If this is
greater than the heap limit, get as large a vector as possible. Always round
the size to a multiple of the frame size. */
greater than the heap limit, get as large a vector as possible. */
heapframes_size = frame_size * 10;
if (heapframes_size < START_FRAMES_SIZE) heapframes_size = START_FRAMES_SIZE;
if (heapframes_size > mb->heap_limit)
if (heapframes_size / 1024 > mb->heap_limit)
{
if (frame_size > mb->heap_limit ) return PCRE2_ERROR_HEAPLIMIT;
heapframes_size = mb->heap_limit;
PCRE2_SIZE max_size = 1024 * mb->heap_limit;
if (max_size < frame_size) return PCRE2_ERROR_HEAPLIMIT;
heapframes_size = max_size;
}
/* If an existing frame vector in the match_data block is large enough, we can
use it.Otherwise, free any pre-existing vector and get a new one. */
use it. Otherwise, free any pre-existing vector and get a new one. */
if (match_data->heapframes_size < heapframes_size)
{
@ -7284,9 +7505,17 @@ for(;;)
mb->end_offset_top = 0;
mb->skip_arg_count = 0;
#ifdef DEBUG_SHOW_OPS
fprintf(stderr, "++ Calling match()\n");
#endif
rc = match(start_match, mb->start_code, re->top_bracket, frame_size,
match_data, mb);
#ifdef DEBUG_SHOW_OPS
fprintf(stderr, "++ match() returned %d\n\n", rc);
#endif
if (mb->hitend && start_partial == NULL)
{
start_partial = mb->start_used_ptr;
@ -7434,6 +7663,7 @@ if (utf && end_subject != true_end_subject &&
if (start_match >= true_end_subject)
{
rc = MATCH_NOMATCH; /* In case it was partial */
match_partial = NULL;
break;
}
@ -7483,6 +7713,7 @@ if (rc == MATCH_MATCH)
{
match_data->rc = ((int)mb->end_offset_top >= 2 * match_data->oveccount)?
0 : (int)mb->end_offset_top/2 + 1;
match_data->subject_length = length;
match_data->startchar = start_match - subject;
match_data->leftchar = mb->start_used_ptr - subject;
match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)?
@ -7497,6 +7728,7 @@ if (rc == MATCH_MATCH)
match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
}
else match_data->subject = subject;
return match_data->rc;
}
@ -7518,6 +7750,7 @@ PCRE2_ERROR_PARTIAL. */
else if (match_partial != NULL)
{
match_data->subject = subject;
match_data->subject_length = length;
match_data->ovector[0] = match_partial - subject;
match_data->ovector[1] = end_subject - subject;
match_data->startchar = match_partial - subject;

View File

@ -167,4 +167,16 @@ return offsetof(pcre2_match_data, ovector) +
2 * (match_data->oveccount) * sizeof(PCRE2_SIZE);
}
/*************************************************
* Get heapframes size *
*************************************************/
PCRE2_EXP_DEFN PCRE2_SIZE PCRE2_CALL_CONVENTION
pcre2_get_match_data_heapframes_size(pcre2_match_data *match_data)
{
return match_data->heapframes_size;
}
/* End of pcre2_match_data.c */

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2021 University of Cambridge
New API code Copyright (c) 2016-2023 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -253,6 +253,7 @@ for (;;)
/* Skip over things that don't match chars */
case OP_REVERSE:
case OP_VREVERSE:
case OP_CREF:
case OP_DNCREF:
case OP_RREF:
@ -270,6 +271,8 @@ for (;;)
case OP_DOLLM:
case OP_NOT_WORD_BOUNDARY:
case OP_WORD_BOUNDARY:
case OP_NOT_UCP_WORD_BOUNDARY:
case OP_UCP_WORD_BOUNDARY:
cc += PRIV(OP_lengths)[*cc];
break;
@ -973,6 +976,7 @@ do
while (try_next) /* Loop for items in this branch */
{
int rc;
PCRE2_SPTR ncode;
uint8_t *classmap = NULL;
#ifdef SUPPORT_WIDE_CHARS
PCRE2_UCHAR xclassflags;
@ -1051,6 +1055,7 @@ do
case OP_REF:
case OP_REFI:
case OP_REVERSE:
case OP_VREVERSE:
case OP_RREF:
case OP_SCOND:
case OP_SET_SOM:
@ -1098,13 +1103,100 @@ do
case OP_WORD_BOUNDARY:
case OP_NOT_WORD_BOUNDARY:
case OP_UCP_WORD_BOUNDARY:
case OP_NOT_UCP_WORD_BOUNDARY:
tcode++;
break;
/* If we hit a bracket or a positive lookahead assertion, recurse to set
bits from within the subpattern. If it can't find anything, we have to
give up. If it finds some mandatory character(s), we are done for this
branch. Otherwise, carry on scanning after the subpattern. */
/* For a positive lookahead assertion, inspect what immediately follows,
ignoring intermediate assertions and callouts. If the next item is one
that sets a mandatory character, skip this assertion. Otherwise, treat it
the same as other bracket groups. */
case OP_ASSERT:
case OP_ASSERT_NA:
ncode = tcode + GET(tcode, 1);
while (*ncode == OP_ALT) ncode += GET(ncode, 1);
ncode += 1 + LINK_SIZE;
/* Skip irrelevant items */
for (BOOL done = FALSE; !done;)
{
switch (*ncode)
{
case OP_ASSERT:
case OP_ASSERT_NOT:
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
case OP_ASSERT_NA:
case OP_ASSERTBACK_NA:
ncode += GET(ncode, 1);
while (*ncode == OP_ALT) ncode += GET(ncode, 1);
ncode += 1 + LINK_SIZE;
break;
case OP_WORD_BOUNDARY:
case OP_NOT_WORD_BOUNDARY:
case OP_UCP_WORD_BOUNDARY:
case OP_NOT_UCP_WORD_BOUNDARY:
ncode++;
break;
case OP_CALLOUT:
ncode += PRIV(OP_lengths)[OP_CALLOUT];
break;
case OP_CALLOUT_STR:
ncode += GET(ncode, 1 + 2*LINK_SIZE);
break;
default:
done = TRUE;
break;
}
}
/* Now check the next significant item. */
switch(*ncode)
{
default:
break;
case OP_PROP:
if (ncode[1] != PT_CLIST) break;
/* Fall through */
case OP_ANYNL:
case OP_CHAR:
case OP_CHARI:
case OP_EXACT:
case OP_EXACTI:
case OP_HSPACE:
case OP_MINPLUS:
case OP_MINPLUSI:
case OP_PLUS:
case OP_PLUSI:
case OP_POSPLUS:
case OP_POSPLUSI:
case OP_VSPACE:
/* Note that these types will only be present in non-UCP mode. */
case OP_DIGIT:
case OP_NOT_DIGIT:
case OP_WORDCHAR:
case OP_NOT_WORDCHAR:
case OP_WHITESPACE:
case OP_NOT_WHITESPACE:
tcode = ncode;
continue; /* With the following significant opcode */
}
/* Fall through */
/* For a group bracket or a positive assertion without an immediately
following mandatory setting, recurse to set bits from within the
subpattern. If it can't find anything, we have to give up. If it finds
some mandatory character(s), we are done for this branch. Otherwise,
carry on scanning after the subpattern. */
case OP_BRA:
case OP_SBRA:
@ -1116,8 +1208,6 @@ do
case OP_SCBRAPOS:
case OP_ONCE:
case OP_SCRIPT_RUN:
case OP_ASSERT:
case OP_ASSERT_NA:
rc = set_start_bits(re, tcode, utf, ucp, depthptr);
if (rc == SSB_DONE)
{

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2018 University of Cambridge
New API code Copyright (c) 2016-2023 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -306,6 +306,7 @@ Returns: if successful: 0
PCRE2_ERROR_NOSUBSTRING: no such substring
PCRE2_ERROR_UNAVAILABLE: ovector is too small
PCRE2_ERROR_UNSET: substring is not set
PCRE2_ERROR_INVALIDOFFSET: internal error, should not occur
*/
PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
@ -338,6 +339,8 @@ else /* Matched using pcre2_dfa_match() */
left = match_data->ovector[stringnumber*2];
right = match_data->ovector[stringnumber*2+1];
if (left > match_data->subject_length || right > match_data->subject_length)
return PCRE2_ERROR_INVALIDOFFSET;
if (sizeptr != NULL) *sizeptr = (left > right)? 0 : right - left;
return 0;
}
@ -439,7 +442,7 @@ Returns: nothing
*/
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_substring_list_free(PCRE2_SPTR *list)
pcre2_substring_list_free(PCRE2_UCHAR **list)
{
if (list != NULL)
{

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2021 University of Cambridge
New API code Copyright (c) 2016-2024 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -169,9 +169,9 @@ are implementing).
6. Do not break after Prepend characters.
7. Do not break within emoji modifier sequences or emoji zwj sequences. That
is, do not break between characters with the Extended_Pictographic property.
Extend and ZWJ characters are allowed between the characters; this cannot be
represented in this table, the code has to deal with it.
is, do not break between characters with the Extended_Pictographic property
if a ZWJ intervenes. Extend characters are allowed between the characters;
this cannot be represented in this table, the code has to deal with it.
8. Do not break within emoji flag sequences. That is, do not break between
regional indicator (RI) symbols if there are an odd number of RI characters
@ -201,8 +201,8 @@ const uint32_t PRIV(ucp_gbtable)[] = {
ESZ|(1u<<ucp_gbT), /* 10 LVT */
(1u<<ucp_gbRegional_Indicator), /* 11 Regional Indicator */
ESZ, /* 12 Other */
ESZ, /* 13 ZWJ */
ESZ|(1u<<ucp_gbExtended_Pictographic) /* 14 Extended Pictographic */
ESZ|(1u<<ucp_gbExtended_Pictographic), /* 13 ZWJ */
ESZ /* 14 Extended Pictographic */
};
#undef ESZ

File diff suppressed because it is too large Load Diff

View File

@ -166,29 +166,29 @@ enum {
/* These are the bidi class values. */
enum {
ucp_bidiAL, /* Arabic letter */
ucp_bidiAN, /* Arabic number */
ucp_bidiB, /* Paragraph separator */
ucp_bidiBN, /* Boundary neutral */
ucp_bidiCS, /* Common separator */
ucp_bidiEN, /* European number */
ucp_bidiES, /* European separator */
ucp_bidiET, /* European terminator */
ucp_bidiFSI, /* First strong isolate */
ucp_bidiL, /* Left to right */
ucp_bidiLRE, /* Left to right embedding */
ucp_bidiLRI, /* Left to right isolate */
ucp_bidiLRO, /* Left to right override */
ucp_bidiNSM, /* Non-spacing mark */
ucp_bidiON, /* Other neutral */
ucp_bidiPDF, /* Pop directional format */
ucp_bidiPDI, /* Pop directional isolate */
ucp_bidiR, /* Right to left */
ucp_bidiRLE, /* Right to left embedding */
ucp_bidiRLI, /* Right to left isolate */
ucp_bidiRLO, /* Right to left override */
ucp_bidiS, /* Segment separator */
ucp_bidiWS, /* White space */
ucp_bidiAL, /* Arabic_Letter */
ucp_bidiAN, /* Arabic_Number */
ucp_bidiB, /* Paragraph_Separator */
ucp_bidiBN, /* Boundary_Neutral */
ucp_bidiCS, /* Common_Separator */
ucp_bidiEN, /* European_Number */
ucp_bidiES, /* European_Separator */
ucp_bidiET, /* European_Terminator */
ucp_bidiFSI, /* First_Strong_Isolate */
ucp_bidiL, /* Left_To_Right */
ucp_bidiLRE, /* Left_To_Right_Embedding */
ucp_bidiLRI, /* Left_To_Right_Isolate */
ucp_bidiLRO, /* Left_To_Right_Override */
ucp_bidiNSM, /* Nonspacing_Mark */
ucp_bidiON, /* Other_Neutral */
ucp_bidiPDF, /* Pop_Directional_Format */
ucp_bidiPDI, /* Pop_Directional_Isolate */
ucp_bidiR, /* Right_To_Left */
ucp_bidiRLE, /* Right_To_Left_Embedding */
ucp_bidiRLI, /* Right_To_Left_Isolate */
ucp_bidiRLO, /* Right_To_Left_Override */
ucp_bidiS, /* Segment_Separator */
ucp_bidiWS, /* White_Space */
};
/* These are grapheme break properties. The Extended Pictographic property
@ -380,6 +380,8 @@ enum {
ucp_Tangsa,
ucp_Toto,
ucp_Vithkuqi,
ucp_Kawi,
ucp_Nag_Mundari,
/* This must be last */
ucp_Script_Count

View File

@ -265,6 +265,7 @@ the "loose matching" rules that Unicode advises and Perl uses. */
#define STRING_kana0 STR_k STR_a STR_n STR_a "\0"
#define STRING_kannada0 STR_k STR_a STR_n STR_n STR_a STR_d STR_a "\0"
#define STRING_katakana0 STR_k STR_a STR_t STR_a STR_k STR_a STR_n STR_a "\0"
#define STRING_kawi0 STR_k STR_a STR_w STR_i "\0"
#define STRING_kayahli0 STR_k STR_a STR_y STR_a STR_h STR_l STR_i "\0"
#define STRING_khar0 STR_k STR_h STR_a STR_r "\0"
#define STRING_kharoshthi0 STR_k STR_h STR_a STR_r STR_o STR_s STR_h STR_t STR_h STR_i "\0"
@ -347,6 +348,8 @@ the "loose matching" rules that Unicode advises and Perl uses. */
#define STRING_mymr0 STR_m STR_y STR_m STR_r "\0"
#define STRING_n0 STR_n "\0"
#define STRING_nabataean0 STR_n STR_a STR_b STR_a STR_t STR_a STR_e STR_a STR_n "\0"
#define STRING_nagm0 STR_n STR_a STR_g STR_m "\0"
#define STRING_nagmundari0 STR_n STR_a STR_g STR_m STR_u STR_n STR_d STR_a STR_r STR_i "\0"
#define STRING_nand0 STR_n STR_a STR_n STR_d "\0"
#define STRING_nandinagari0 STR_n STR_a STR_n STR_d STR_i STR_n STR_a STR_g STR_a STR_r STR_i "\0"
#define STRING_narb0 STR_n STR_a STR_r STR_b "\0"
@ -753,6 +756,7 @@ const char PRIV(utt_names)[] =
STRING_kana0
STRING_kannada0
STRING_katakana0
STRING_kawi0
STRING_kayahli0
STRING_khar0
STRING_kharoshthi0
@ -835,6 +839,8 @@ const char PRIV(utt_names)[] =
STRING_mymr0
STRING_n0
STRING_nabataean0
STRING_nagm0
STRING_nagmundari0
STRING_nand0
STRING_nandinagari0
STRING_narb0
@ -1241,280 +1247,283 @@ const ucp_type_table PRIV(utt)[] = {
{ 1665, PT_SCX, ucp_Katakana },
{ 1670, PT_SCX, ucp_Kannada },
{ 1678, PT_SCX, ucp_Katakana },
{ 1687, PT_SCX, ucp_Kayah_Li },
{ 1695, PT_SC, ucp_Kharoshthi },
{ 1687, PT_SC, ucp_Kawi },
{ 1692, PT_SCX, ucp_Kayah_Li },
{ 1700, PT_SC, ucp_Kharoshthi },
{ 1711, PT_SC, ucp_Khitan_Small_Script },
{ 1729, PT_SC, ucp_Khmer },
{ 1735, PT_SC, ucp_Khmer },
{ 1740, PT_SCX, ucp_Khojki },
{ 1705, PT_SC, ucp_Kharoshthi },
{ 1716, PT_SC, ucp_Khitan_Small_Script },
{ 1734, PT_SC, ucp_Khmer },
{ 1740, PT_SC, ucp_Khmer },
{ 1745, PT_SCX, ucp_Khojki },
{ 1752, PT_SCX, ucp_Khudawadi },
{ 1762, PT_SC, ucp_Khitan_Small_Script },
{ 1767, PT_SCX, ucp_Kannada },
{ 1772, PT_SCX, ucp_Kaithi },
{ 1777, PT_GC, ucp_L },
{ 1779, PT_LAMP, 0 },
{ 1782, PT_SC, ucp_Tai_Tham },
{ 1787, PT_SC, ucp_Lao },
{ 1791, PT_SC, ucp_Lao },
{ 1796, PT_SCX, ucp_Latin },
{ 1802, PT_SCX, ucp_Latin },
{ 1807, PT_LAMP, 0 },
{ 1810, PT_SC, ucp_Lepcha },
{ 1750, PT_SCX, ucp_Khojki },
{ 1757, PT_SCX, ucp_Khudawadi },
{ 1767, PT_SC, ucp_Khitan_Small_Script },
{ 1772, PT_SCX, ucp_Kannada },
{ 1777, PT_SCX, ucp_Kaithi },
{ 1782, PT_GC, ucp_L },
{ 1784, PT_LAMP, 0 },
{ 1787, PT_SC, ucp_Tai_Tham },
{ 1792, PT_SC, ucp_Lao },
{ 1796, PT_SC, ucp_Lao },
{ 1801, PT_SCX, ucp_Latin },
{ 1807, PT_SCX, ucp_Latin },
{ 1812, PT_LAMP, 0 },
{ 1815, PT_SC, ucp_Lepcha },
{ 1822, PT_SCX, ucp_Limbu },
{ 1820, PT_SC, ucp_Lepcha },
{ 1827, PT_SCX, ucp_Limbu },
{ 1833, PT_SCX, ucp_Linear_A },
{ 1838, PT_SCX, ucp_Linear_B },
{ 1843, PT_SCX, ucp_Linear_A },
{ 1851, PT_SCX, ucp_Linear_B },
{ 1859, PT_SC, ucp_Lisu },
{ 1864, PT_PC, ucp_Ll },
{ 1867, PT_PC, ucp_Lm },
{ 1870, PT_PC, ucp_Lo },
{ 1873, PT_BOOL, ucp_Logical_Order_Exception },
{ 1877, PT_BOOL, ucp_Logical_Order_Exception },
{ 1899, PT_BOOL, ucp_Lowercase },
{ 1905, PT_BOOL, ucp_Lowercase },
{ 1915, PT_PC, ucp_Lt },
{ 1918, PT_PC, ucp_Lu },
{ 1921, PT_SC, ucp_Lycian },
{ 1832, PT_SCX, ucp_Limbu },
{ 1838, PT_SCX, ucp_Linear_A },
{ 1843, PT_SCX, ucp_Linear_B },
{ 1848, PT_SCX, ucp_Linear_A },
{ 1856, PT_SCX, ucp_Linear_B },
{ 1864, PT_SC, ucp_Lisu },
{ 1869, PT_PC, ucp_Ll },
{ 1872, PT_PC, ucp_Lm },
{ 1875, PT_PC, ucp_Lo },
{ 1878, PT_BOOL, ucp_Logical_Order_Exception },
{ 1882, PT_BOOL, ucp_Logical_Order_Exception },
{ 1904, PT_BOOL, ucp_Lowercase },
{ 1910, PT_BOOL, ucp_Lowercase },
{ 1920, PT_PC, ucp_Lt },
{ 1923, PT_PC, ucp_Lu },
{ 1926, PT_SC, ucp_Lycian },
{ 1933, PT_SC, ucp_Lydian },
{ 1931, PT_SC, ucp_Lycian },
{ 1938, PT_SC, ucp_Lydian },
{ 1945, PT_GC, ucp_M },
{ 1947, PT_SCX, ucp_Mahajani },
{ 1956, PT_SCX, ucp_Mahajani },
{ 1961, PT_SC, ucp_Makasar },
{ 1943, PT_SC, ucp_Lydian },
{ 1950, PT_GC, ucp_M },
{ 1952, PT_SCX, ucp_Mahajani },
{ 1961, PT_SCX, ucp_Mahajani },
{ 1966, PT_SC, ucp_Makasar },
{ 1974, PT_SCX, ucp_Malayalam },
{ 1984, PT_SCX, ucp_Mandaic },
{ 1971, PT_SC, ucp_Makasar },
{ 1979, PT_SCX, ucp_Malayalam },
{ 1989, PT_SCX, ucp_Mandaic },
{ 1997, PT_SCX, ucp_Manichaean },
{ 1994, PT_SCX, ucp_Mandaic },
{ 2002, PT_SCX, ucp_Manichaean },
{ 2013, PT_SC, ucp_Marchen },
{ 2007, PT_SCX, ucp_Manichaean },
{ 2018, PT_SC, ucp_Marchen },
{ 2026, PT_SCX, ucp_Masaram_Gondi },
{ 2039, PT_BOOL, ucp_Math },
{ 2044, PT_PC, ucp_Mc },
{ 2047, PT_PC, ucp_Me },
{ 2050, PT_SC, ucp_Medefaidrin },
{ 2062, PT_SC, ucp_Medefaidrin },
{ 2067, PT_SC, ucp_Meetei_Mayek },
{ 2079, PT_SC, ucp_Mende_Kikakui },
{ 2023, PT_SC, ucp_Marchen },
{ 2031, PT_SCX, ucp_Masaram_Gondi },
{ 2044, PT_BOOL, ucp_Math },
{ 2049, PT_PC, ucp_Mc },
{ 2052, PT_PC, ucp_Me },
{ 2055, PT_SC, ucp_Medefaidrin },
{ 2067, PT_SC, ucp_Medefaidrin },
{ 2072, PT_SC, ucp_Meetei_Mayek },
{ 2084, PT_SC, ucp_Mende_Kikakui },
{ 2097, PT_SC, ucp_Meroitic_Cursive },
{ 2102, PT_SC, ucp_Meroitic_Hieroglyphs },
{ 2107, PT_SC, ucp_Meroitic_Cursive },
{ 2123, PT_SC, ucp_Meroitic_Hieroglyphs },
{ 2143, PT_SC, ucp_Miao },
{ 2148, PT_SCX, ucp_Malayalam },
{ 2153, PT_PC, ucp_Mn },
{ 2156, PT_SCX, ucp_Modi },
{ 2161, PT_SCX, ucp_Mongolian },
{ 2089, PT_SC, ucp_Mende_Kikakui },
{ 2102, PT_SC, ucp_Meroitic_Cursive },
{ 2107, PT_SC, ucp_Meroitic_Hieroglyphs },
{ 2112, PT_SC, ucp_Meroitic_Cursive },
{ 2128, PT_SC, ucp_Meroitic_Hieroglyphs },
{ 2148, PT_SC, ucp_Miao },
{ 2153, PT_SCX, ucp_Malayalam },
{ 2158, PT_PC, ucp_Mn },
{ 2161, PT_SCX, ucp_Modi },
{ 2166, PT_SCX, ucp_Mongolian },
{ 2176, PT_SC, ucp_Mro },
{ 2180, PT_SC, ucp_Mro },
{ 2185, PT_SC, ucp_Meetei_Mayek },
{ 2190, PT_SCX, ucp_Multani },
{ 2171, PT_SCX, ucp_Mongolian },
{ 2181, PT_SC, ucp_Mro },
{ 2185, PT_SC, ucp_Mro },
{ 2190, PT_SC, ucp_Meetei_Mayek },
{ 2195, PT_SCX, ucp_Multani },
{ 2203, PT_SCX, ucp_Myanmar },
{ 2211, PT_SCX, ucp_Myanmar },
{ 2216, PT_GC, ucp_N },
{ 2218, PT_SC, ucp_Nabataean },
{ 2228, PT_SCX, ucp_Nandinagari },
{ 2233, PT_SCX, ucp_Nandinagari },
{ 2245, PT_SC, ucp_Old_North_Arabian },
{ 2250, PT_SC, ucp_Nabataean },
{ 2255, PT_BOOL, ucp_Noncharacter_Code_Point },
{ 2261, PT_PC, ucp_Nd },
{ 2264, PT_SC, ucp_Newa },
{ 2269, PT_SC, ucp_New_Tai_Lue },
{ 2279, PT_SCX, ucp_Nko },
{ 2283, PT_SCX, ucp_Nko },
{ 2288, PT_PC, ucp_Nl },
{ 2291, PT_PC, ucp_No },
{ 2294, PT_BOOL, ucp_Noncharacter_Code_Point },
{ 2316, PT_SC, ucp_Nushu },
{ 2321, PT_SC, ucp_Nushu },
{ 2327, PT_SC, ucp_Nyiakeng_Puachue_Hmong },
{ 2348, PT_SC, ucp_Ogham },
{ 2353, PT_SC, ucp_Ogham },
{ 2359, PT_SC, ucp_Ol_Chiki },
{ 2367, PT_SC, ucp_Ol_Chiki },
{ 2372, PT_SC, ucp_Old_Hungarian },
{ 2385, PT_SC, ucp_Old_Italic },
{ 2395, PT_SC, ucp_Old_North_Arabian },
{ 2411, PT_SCX, ucp_Old_Permic },
{ 2421, PT_SC, ucp_Old_Persian },
{ 2432, PT_SC, ucp_Old_Sogdian },
{ 2443, PT_SC, ucp_Old_South_Arabian },
{ 2459, PT_SC, ucp_Old_Turkic },
{ 2469, PT_SCX, ucp_Old_Uyghur },
{ 2479, PT_SCX, ucp_Oriya },
{ 2485, PT_SC, ucp_Old_Turkic },
{ 2490, PT_SCX, ucp_Oriya },
{ 2495, PT_SC, ucp_Osage },
{ 2501, PT_SC, ucp_Osage },
{ 2506, PT_SC, ucp_Osmanya },
{ 2511, PT_SC, ucp_Osmanya },
{ 2519, PT_SCX, ucp_Old_Uyghur },
{ 2524, PT_GC, ucp_P },
{ 2526, PT_SC, ucp_Pahawh_Hmong },
{ 2538, PT_SC, ucp_Palmyrene },
{ 2543, PT_SC, ucp_Palmyrene },
{ 2553, PT_BOOL, ucp_Pattern_Syntax },
{ 2560, PT_BOOL, ucp_Pattern_Syntax },
{ 2574, PT_BOOL, ucp_Pattern_White_Space },
{ 2592, PT_BOOL, ucp_Pattern_White_Space },
{ 2598, PT_SC, ucp_Pau_Cin_Hau },
{ 2603, PT_SC, ucp_Pau_Cin_Hau },
{ 2613, PT_PC, ucp_Pc },
{ 2616, PT_BOOL, ucp_Prepended_Concatenation_Mark },
{ 2620, PT_PC, ucp_Pd },
{ 2623, PT_PC, ucp_Pe },
{ 2626, PT_SCX, ucp_Old_Permic },
{ 2631, PT_PC, ucp_Pf },
{ 2634, PT_SCX, ucp_Phags_Pa },
{ 2639, PT_SCX, ucp_Phags_Pa },
{ 2647, PT_SC, ucp_Inscriptional_Pahlavi },
{ 2652, PT_SCX, ucp_Psalter_Pahlavi },
{ 2657, PT_SC, ucp_Phoenician },
{ 2662, PT_SC, ucp_Phoenician },
{ 2673, PT_PC, ucp_Pi },
{ 2676, PT_SC, ucp_Miao },
{ 2681, PT_PC, ucp_Po },
{ 2684, PT_BOOL, ucp_Prepended_Concatenation_Mark },
{ 2711, PT_SC, ucp_Inscriptional_Parthian },
{ 2716, PT_PC, ucp_Ps },
{ 2719, PT_SCX, ucp_Psalter_Pahlavi },
{ 2734, PT_SCX, ucp_Coptic },
{ 2739, PT_SC, ucp_Inherited },
{ 2744, PT_BOOL, ucp_Quotation_Mark },
{ 2750, PT_BOOL, ucp_Quotation_Mark },
{ 2764, PT_BOOL, ucp_Radical },
{ 2772, PT_BOOL, ucp_Regional_Indicator },
{ 2790, PT_SC, ucp_Rejang },
{ 2797, PT_BOOL, ucp_Regional_Indicator },
{ 2800, PT_SC, ucp_Rejang },
{ 2805, PT_SCX, ucp_Hanifi_Rohingya },
{ 2810, PT_SC, ucp_Runic },
{ 2816, PT_SC, ucp_Runic },
{ 2821, PT_GC, ucp_S },
{ 2823, PT_SC, ucp_Samaritan },
{ 2833, PT_SC, ucp_Samaritan },
{ 2838, PT_SC, ucp_Old_South_Arabian },
{ 2843, PT_SC, ucp_Saurashtra },
{ 2848, PT_SC, ucp_Saurashtra },
{ 2859, PT_PC, ucp_Sc },
{ 2862, PT_BOOL, ucp_Soft_Dotted },
{ 2865, PT_BOOL, ucp_Sentence_Terminal },
{ 2882, PT_SC, ucp_SignWriting },
{ 2887, PT_SCX, ucp_Sharada },
{ 2895, PT_SC, ucp_Shavian },
{ 2903, PT_SC, ucp_Shavian },
{ 2200, PT_SCX, ucp_Multani },
{ 2208, PT_SCX, ucp_Myanmar },
{ 2216, PT_SCX, ucp_Myanmar },
{ 2221, PT_GC, ucp_N },
{ 2223, PT_SC, ucp_Nabataean },
{ 2233, PT_SC, ucp_Nag_Mundari },
{ 2238, PT_SC, ucp_Nag_Mundari },
{ 2249, PT_SCX, ucp_Nandinagari },
{ 2254, PT_SCX, ucp_Nandinagari },
{ 2266, PT_SC, ucp_Old_North_Arabian },
{ 2271, PT_SC, ucp_Nabataean },
{ 2276, PT_BOOL, ucp_Noncharacter_Code_Point },
{ 2282, PT_PC, ucp_Nd },
{ 2285, PT_SC, ucp_Newa },
{ 2290, PT_SC, ucp_New_Tai_Lue },
{ 2300, PT_SCX, ucp_Nko },
{ 2304, PT_SCX, ucp_Nko },
{ 2309, PT_PC, ucp_Nl },
{ 2312, PT_PC, ucp_No },
{ 2315, PT_BOOL, ucp_Noncharacter_Code_Point },
{ 2337, PT_SC, ucp_Nushu },
{ 2342, PT_SC, ucp_Nushu },
{ 2348, PT_SC, ucp_Nyiakeng_Puachue_Hmong },
{ 2369, PT_SC, ucp_Ogham },
{ 2374, PT_SC, ucp_Ogham },
{ 2380, PT_SC, ucp_Ol_Chiki },
{ 2388, PT_SC, ucp_Ol_Chiki },
{ 2393, PT_SC, ucp_Old_Hungarian },
{ 2406, PT_SC, ucp_Old_Italic },
{ 2416, PT_SC, ucp_Old_North_Arabian },
{ 2432, PT_SCX, ucp_Old_Permic },
{ 2442, PT_SC, ucp_Old_Persian },
{ 2453, PT_SC, ucp_Old_Sogdian },
{ 2464, PT_SC, ucp_Old_South_Arabian },
{ 2480, PT_SC, ucp_Old_Turkic },
{ 2490, PT_SCX, ucp_Old_Uyghur },
{ 2500, PT_SCX, ucp_Oriya },
{ 2506, PT_SC, ucp_Old_Turkic },
{ 2511, PT_SCX, ucp_Oriya },
{ 2516, PT_SC, ucp_Osage },
{ 2522, PT_SC, ucp_Osage },
{ 2527, PT_SC, ucp_Osmanya },
{ 2532, PT_SC, ucp_Osmanya },
{ 2540, PT_SCX, ucp_Old_Uyghur },
{ 2545, PT_GC, ucp_P },
{ 2547, PT_SC, ucp_Pahawh_Hmong },
{ 2559, PT_SC, ucp_Palmyrene },
{ 2564, PT_SC, ucp_Palmyrene },
{ 2574, PT_BOOL, ucp_Pattern_Syntax },
{ 2581, PT_BOOL, ucp_Pattern_Syntax },
{ 2595, PT_BOOL, ucp_Pattern_White_Space },
{ 2613, PT_BOOL, ucp_Pattern_White_Space },
{ 2619, PT_SC, ucp_Pau_Cin_Hau },
{ 2624, PT_SC, ucp_Pau_Cin_Hau },
{ 2634, PT_PC, ucp_Pc },
{ 2637, PT_BOOL, ucp_Prepended_Concatenation_Mark },
{ 2641, PT_PC, ucp_Pd },
{ 2644, PT_PC, ucp_Pe },
{ 2647, PT_SCX, ucp_Old_Permic },
{ 2652, PT_PC, ucp_Pf },
{ 2655, PT_SCX, ucp_Phags_Pa },
{ 2660, PT_SCX, ucp_Phags_Pa },
{ 2668, PT_SC, ucp_Inscriptional_Pahlavi },
{ 2673, PT_SCX, ucp_Psalter_Pahlavi },
{ 2678, PT_SC, ucp_Phoenician },
{ 2683, PT_SC, ucp_Phoenician },
{ 2694, PT_PC, ucp_Pi },
{ 2697, PT_SC, ucp_Miao },
{ 2702, PT_PC, ucp_Po },
{ 2705, PT_BOOL, ucp_Prepended_Concatenation_Mark },
{ 2732, PT_SC, ucp_Inscriptional_Parthian },
{ 2737, PT_PC, ucp_Ps },
{ 2740, PT_SCX, ucp_Psalter_Pahlavi },
{ 2755, PT_SCX, ucp_Coptic },
{ 2760, PT_SC, ucp_Inherited },
{ 2765, PT_BOOL, ucp_Quotation_Mark },
{ 2771, PT_BOOL, ucp_Quotation_Mark },
{ 2785, PT_BOOL, ucp_Radical },
{ 2793, PT_BOOL, ucp_Regional_Indicator },
{ 2811, PT_SC, ucp_Rejang },
{ 2818, PT_BOOL, ucp_Regional_Indicator },
{ 2821, PT_SC, ucp_Rejang },
{ 2826, PT_SCX, ucp_Hanifi_Rohingya },
{ 2831, PT_SC, ucp_Runic },
{ 2837, PT_SC, ucp_Runic },
{ 2842, PT_GC, ucp_S },
{ 2844, PT_SC, ucp_Samaritan },
{ 2854, PT_SC, ucp_Samaritan },
{ 2859, PT_SC, ucp_Old_South_Arabian },
{ 2864, PT_SC, ucp_Saurashtra },
{ 2869, PT_SC, ucp_Saurashtra },
{ 2880, PT_PC, ucp_Sc },
{ 2883, PT_BOOL, ucp_Soft_Dotted },
{ 2886, PT_BOOL, ucp_Sentence_Terminal },
{ 2903, PT_SC, ucp_SignWriting },
{ 2908, PT_SCX, ucp_Sharada },
{ 2913, PT_SC, ucp_Siddham },
{ 2918, PT_SC, ucp_Siddham },
{ 2926, PT_SC, ucp_SignWriting },
{ 2938, PT_SCX, ucp_Khudawadi },
{ 2943, PT_SCX, ucp_Sinhala },
{ 2948, PT_SCX, ucp_Sinhala },
{ 2956, PT_PC, ucp_Sk },
{ 2959, PT_PC, ucp_Sm },
{ 2962, PT_PC, ucp_So },
{ 2965, PT_BOOL, ucp_Soft_Dotted },
{ 2976, PT_SCX, ucp_Sogdian },
{ 2981, PT_SCX, ucp_Sogdian },
{ 2989, PT_SC, ucp_Old_Sogdian },
{ 2994, PT_SC, ucp_Sora_Sompeng },
{ 2999, PT_SC, ucp_Sora_Sompeng },
{ 3011, PT_SC, ucp_Soyombo },
{ 3016, PT_SC, ucp_Soyombo },
{ 3024, PT_BOOL, ucp_White_Space },
{ 3030, PT_BOOL, ucp_Sentence_Terminal },
{ 3036, PT_SC, ucp_Sundanese },
{ 3041, PT_SC, ucp_Sundanese },
{ 3051, PT_SCX, ucp_Syloti_Nagri },
{ 3056, PT_SCX, ucp_Syloti_Nagri },
{ 3068, PT_SCX, ucp_Syriac },
{ 3073, PT_SCX, ucp_Syriac },
{ 3080, PT_SCX, ucp_Tagalog },
{ 3088, PT_SCX, ucp_Tagbanwa },
{ 3093, PT_SCX, ucp_Tagbanwa },
{ 3102, PT_SCX, ucp_Tai_Le },
{ 3108, PT_SC, ucp_Tai_Tham },
{ 3116, PT_SC, ucp_Tai_Viet },
{ 3124, PT_SCX, ucp_Takri },
{ 3129, PT_SCX, ucp_Takri },
{ 3135, PT_SCX, ucp_Tai_Le },
{ 3140, PT_SC, ucp_New_Tai_Lue },
{ 3145, PT_SCX, ucp_Tamil },
{ 3151, PT_SCX, ucp_Tamil },
{ 3156, PT_SC, ucp_Tangut },
{ 3161, PT_SC, ucp_Tangsa },
{ 3168, PT_SC, ucp_Tangut },
{ 3175, PT_SC, ucp_Tai_Viet },
{ 3180, PT_SCX, ucp_Telugu },
{ 3185, PT_SCX, ucp_Telugu },
{ 3192, PT_BOOL, ucp_Terminal_Punctuation },
{ 3197, PT_BOOL, ucp_Terminal_Punctuation },
{ 3217, PT_SC, ucp_Tifinagh },
{ 3222, PT_SCX, ucp_Tagalog },
{ 3227, PT_SCX, ucp_Thaana },
{ 3232, PT_SCX, ucp_Thaana },
{ 3239, PT_SC, ucp_Thai },
{ 3244, PT_SC, ucp_Tibetan },
{ 3252, PT_SC, ucp_Tibetan },
{ 3257, PT_SC, ucp_Tifinagh },
{ 3266, PT_SCX, ucp_Tirhuta },
{ 3271, PT_SCX, ucp_Tirhuta },
{ 3279, PT_SC, ucp_Tangsa },
{ 3284, PT_SC, ucp_Toto },
{ 3289, PT_SC, ucp_Ugaritic },
{ 3294, PT_SC, ucp_Ugaritic },
{ 3303, PT_BOOL, ucp_Unified_Ideograph },
{ 3309, PT_BOOL, ucp_Unified_Ideograph },
{ 3326, PT_SC, ucp_Unknown },
{ 3334, PT_BOOL, ucp_Uppercase },
{ 3340, PT_BOOL, ucp_Uppercase },
{ 3350, PT_SC, ucp_Vai },
{ 3354, PT_SC, ucp_Vai },
{ 3359, PT_BOOL, ucp_Variation_Selector },
{ 3377, PT_SC, ucp_Vithkuqi },
{ 3382, PT_SC, ucp_Vithkuqi },
{ 3391, PT_BOOL, ucp_Variation_Selector },
{ 3394, PT_SC, ucp_Wancho },
{ 3401, PT_SC, ucp_Warang_Citi },
{ 3406, PT_SC, ucp_Warang_Citi },
{ 3417, PT_SC, ucp_Wancho },
{ 3422, PT_BOOL, ucp_White_Space },
{ 3433, PT_BOOL, ucp_White_Space },
{ 3440, PT_ALNUM, 0 },
{ 3444, PT_BOOL, ucp_XID_Continue },
{ 3449, PT_BOOL, ucp_XID_Continue },
{ 3461, PT_BOOL, ucp_XID_Start },
{ 3466, PT_BOOL, ucp_XID_Start },
{ 3475, PT_SC, ucp_Old_Persian },
{ 3480, PT_PXSPACE, 0 },
{ 3484, PT_SPACE, 0 },
{ 3488, PT_SC, ucp_Cuneiform },
{ 3493, PT_UCNC, 0 },
{ 3497, PT_WORD, 0 },
{ 3501, PT_SCX, ucp_Yezidi },
{ 3506, PT_SCX, ucp_Yezidi },
{ 3513, PT_SCX, ucp_Yi },
{ 3516, PT_SCX, ucp_Yi },
{ 3521, PT_GC, ucp_Z },
{ 3523, PT_SC, ucp_Zanabazar_Square },
{ 3539, PT_SC, ucp_Zanabazar_Square },
{ 3544, PT_SC, ucp_Inherited },
{ 3549, PT_PC, ucp_Zl },
{ 3552, PT_PC, ucp_Zp },
{ 3555, PT_PC, ucp_Zs },
{ 3558, PT_SC, ucp_Common },
{ 3563, PT_SC, ucp_Unknown }
{ 2916, PT_SC, ucp_Shavian },
{ 2924, PT_SC, ucp_Shavian },
{ 2929, PT_SCX, ucp_Sharada },
{ 2934, PT_SC, ucp_Siddham },
{ 2939, PT_SC, ucp_Siddham },
{ 2947, PT_SC, ucp_SignWriting },
{ 2959, PT_SCX, ucp_Khudawadi },
{ 2964, PT_SCX, ucp_Sinhala },
{ 2969, PT_SCX, ucp_Sinhala },
{ 2977, PT_PC, ucp_Sk },
{ 2980, PT_PC, ucp_Sm },
{ 2983, PT_PC, ucp_So },
{ 2986, PT_BOOL, ucp_Soft_Dotted },
{ 2997, PT_SCX, ucp_Sogdian },
{ 3002, PT_SCX, ucp_Sogdian },
{ 3010, PT_SC, ucp_Old_Sogdian },
{ 3015, PT_SC, ucp_Sora_Sompeng },
{ 3020, PT_SC, ucp_Sora_Sompeng },
{ 3032, PT_SC, ucp_Soyombo },
{ 3037, PT_SC, ucp_Soyombo },
{ 3045, PT_BOOL, ucp_White_Space },
{ 3051, PT_BOOL, ucp_Sentence_Terminal },
{ 3057, PT_SC, ucp_Sundanese },
{ 3062, PT_SC, ucp_Sundanese },
{ 3072, PT_SCX, ucp_Syloti_Nagri },
{ 3077, PT_SCX, ucp_Syloti_Nagri },
{ 3089, PT_SCX, ucp_Syriac },
{ 3094, PT_SCX, ucp_Syriac },
{ 3101, PT_SCX, ucp_Tagalog },
{ 3109, PT_SCX, ucp_Tagbanwa },
{ 3114, PT_SCX, ucp_Tagbanwa },
{ 3123, PT_SCX, ucp_Tai_Le },
{ 3129, PT_SC, ucp_Tai_Tham },
{ 3137, PT_SC, ucp_Tai_Viet },
{ 3145, PT_SCX, ucp_Takri },
{ 3150, PT_SCX, ucp_Takri },
{ 3156, PT_SCX, ucp_Tai_Le },
{ 3161, PT_SC, ucp_New_Tai_Lue },
{ 3166, PT_SCX, ucp_Tamil },
{ 3172, PT_SCX, ucp_Tamil },
{ 3177, PT_SC, ucp_Tangut },
{ 3182, PT_SC, ucp_Tangsa },
{ 3189, PT_SC, ucp_Tangut },
{ 3196, PT_SC, ucp_Tai_Viet },
{ 3201, PT_SCX, ucp_Telugu },
{ 3206, PT_SCX, ucp_Telugu },
{ 3213, PT_BOOL, ucp_Terminal_Punctuation },
{ 3218, PT_BOOL, ucp_Terminal_Punctuation },
{ 3238, PT_SC, ucp_Tifinagh },
{ 3243, PT_SCX, ucp_Tagalog },
{ 3248, PT_SCX, ucp_Thaana },
{ 3253, PT_SCX, ucp_Thaana },
{ 3260, PT_SC, ucp_Thai },
{ 3265, PT_SC, ucp_Tibetan },
{ 3273, PT_SC, ucp_Tibetan },
{ 3278, PT_SC, ucp_Tifinagh },
{ 3287, PT_SCX, ucp_Tirhuta },
{ 3292, PT_SCX, ucp_Tirhuta },
{ 3300, PT_SC, ucp_Tangsa },
{ 3305, PT_SC, ucp_Toto },
{ 3310, PT_SC, ucp_Ugaritic },
{ 3315, PT_SC, ucp_Ugaritic },
{ 3324, PT_BOOL, ucp_Unified_Ideograph },
{ 3330, PT_BOOL, ucp_Unified_Ideograph },
{ 3347, PT_SC, ucp_Unknown },
{ 3355, PT_BOOL, ucp_Uppercase },
{ 3361, PT_BOOL, ucp_Uppercase },
{ 3371, PT_SC, ucp_Vai },
{ 3375, PT_SC, ucp_Vai },
{ 3380, PT_BOOL, ucp_Variation_Selector },
{ 3398, PT_SC, ucp_Vithkuqi },
{ 3403, PT_SC, ucp_Vithkuqi },
{ 3412, PT_BOOL, ucp_Variation_Selector },
{ 3415, PT_SC, ucp_Wancho },
{ 3422, PT_SC, ucp_Warang_Citi },
{ 3427, PT_SC, ucp_Warang_Citi },
{ 3438, PT_SC, ucp_Wancho },
{ 3443, PT_BOOL, ucp_White_Space },
{ 3454, PT_BOOL, ucp_White_Space },
{ 3461, PT_ALNUM, 0 },
{ 3465, PT_BOOL, ucp_XID_Continue },
{ 3470, PT_BOOL, ucp_XID_Continue },
{ 3482, PT_BOOL, ucp_XID_Start },
{ 3487, PT_BOOL, ucp_XID_Start },
{ 3496, PT_SC, ucp_Old_Persian },
{ 3501, PT_PXSPACE, 0 },
{ 3505, PT_SPACE, 0 },
{ 3509, PT_SC, ucp_Cuneiform },
{ 3514, PT_UCNC, 0 },
{ 3518, PT_WORD, 0 },
{ 3522, PT_SCX, ucp_Yezidi },
{ 3527, PT_SCX, ucp_Yezidi },
{ 3534, PT_SCX, ucp_Yi },
{ 3537, PT_SCX, ucp_Yi },
{ 3542, PT_GC, ucp_Z },
{ 3544, PT_SC, ucp_Zanabazar_Square },
{ 3560, PT_SC, ucp_Zanabazar_Square },
{ 3565, PT_SC, ucp_Inherited },
{ 3570, PT_PC, ucp_Zl },
{ 3573, PT_PC, ucp_Zp },
{ 3576, PT_PC, ucp_Zs },
{ 3579, PT_SC, ucp_Common },
{ 3584, PT_SC, ucp_Unknown }
};
const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);

View File

@ -169,7 +169,7 @@ for (p = string; length > 0; p++)
if (((d = *(++p)) & 0xc0) != 0x80)
{
*erroroffset = (int)(p - string) - 1;
*erroroffset = (PCRE2_SIZE)(p - string) - 1;
return PCRE2_ERROR_UTF8_ERR6;
}
@ -184,7 +184,7 @@ for (p = string; length > 0; p++)
case 1: if ((c & 0x3e) == 0)
{
*erroroffset = (int)(p - string) - 1;
*erroroffset = (PCRE2_SIZE)(p - string) - 1;
return PCRE2_ERROR_UTF8_ERR15;
}
break;
@ -196,17 +196,17 @@ for (p = string; length > 0; p++)
case 2:
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
*erroroffset = (int)(p - string) - 2;
*erroroffset = (PCRE2_SIZE)(p - string) - 2;
return PCRE2_ERROR_UTF8_ERR7;
}
if (c == 0xe0 && (d & 0x20) == 0)
{
*erroroffset = (int)(p - string) - 2;
*erroroffset = (PCRE2_SIZE)(p - string) - 2;
return PCRE2_ERROR_UTF8_ERR16;
}
if (c == 0xed && d >= 0xa0)
{
*erroroffset = (int)(p - string) - 2;
*erroroffset = (PCRE2_SIZE)(p - string) - 2;
return PCRE2_ERROR_UTF8_ERR14;
}
break;
@ -218,22 +218,22 @@ for (p = string; length > 0; p++)
case 3:
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
*erroroffset = (int)(p - string) - 2;
*erroroffset = (PCRE2_SIZE)(p - string) - 2;
return PCRE2_ERROR_UTF8_ERR7;
}
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
{
*erroroffset = (int)(p - string) - 3;
*erroroffset = (PCRE2_SIZE)(p - string) - 3;
return PCRE2_ERROR_UTF8_ERR8;
}
if (c == 0xf0 && (d & 0x30) == 0)
{
*erroroffset = (int)(p - string) - 3;
*erroroffset = (PCRE2_SIZE)(p - string) - 3;
return PCRE2_ERROR_UTF8_ERR17;
}
if (c > 0xf4 || (c == 0xf4 && d > 0x8f))
{
*erroroffset = (int)(p - string) - 3;
*erroroffset = (PCRE2_SIZE)(p - string) - 3;
return PCRE2_ERROR_UTF8_ERR13;
}
break;
@ -249,22 +249,22 @@ for (p = string; length > 0; p++)
case 4:
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
*erroroffset = (int)(p - string) - 2;
*erroroffset = (PCRE2_SIZE)(p - string) - 2;
return PCRE2_ERROR_UTF8_ERR7;
}
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
{
*erroroffset = (int)(p - string) - 3;
*erroroffset = (PCRE2_SIZE)(p - string) - 3;
return PCRE2_ERROR_UTF8_ERR8;
}
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
{
*erroroffset = (int)(p - string) - 4;
*erroroffset = (PCRE2_SIZE)(p - string) - 4;
return PCRE2_ERROR_UTF8_ERR9;
}
if (c == 0xf8 && (d & 0x38) == 0)
{
*erroroffset = (int)(p - string) - 4;
*erroroffset = (PCRE2_SIZE)(p - string) - 4;
return PCRE2_ERROR_UTF8_ERR18;
}
break;
@ -275,27 +275,27 @@ for (p = string; length > 0; p++)
case 5:
if ((*(++p) & 0xc0) != 0x80) /* Third byte */
{
*erroroffset = (int)(p - string) - 2;
*erroroffset = (PCRE2_SIZE)(p - string) - 2;
return PCRE2_ERROR_UTF8_ERR7;
}
if ((*(++p) & 0xc0) != 0x80) /* Fourth byte */
{
*erroroffset = (int)(p - string) - 3;
*erroroffset = (PCRE2_SIZE)(p - string) - 3;
return PCRE2_ERROR_UTF8_ERR8;
}
if ((*(++p) & 0xc0) != 0x80) /* Fifth byte */
{
*erroroffset = (int)(p - string) - 4;
*erroroffset = (PCRE2_SIZE)(p - string) - 4;
return PCRE2_ERROR_UTF8_ERR9;
}
if ((*(++p) & 0xc0) != 0x80) /* Sixth byte */
{
*erroroffset = (int)(p - string) - 5;
*erroroffset = (PCRE2_SIZE)(p - string) - 5;
return PCRE2_ERROR_UTF8_ERR10;
}
if (c == 0xfc && (d & 0x3c) == 0)
{
*erroroffset = (int)(p - string) - 5;
*erroroffset = (PCRE2_SIZE)(p - string) - 5;
return PCRE2_ERROR_UTF8_ERR19;
}
break;
@ -307,7 +307,7 @@ for (p = string; length > 0; p++)
if (ab > 3)
{
*erroroffset = (int)(p - string) - ab;
*erroroffset = (PCRE2_SIZE)(p - string) - ab;
return (ab == 4)? PCRE2_ERROR_UTF8_ERR11 : PCRE2_ERROR_UTF8_ERR12;
}
}
@ -338,21 +338,21 @@ for (p = string; length > 0; p++)
/* High surrogate. Must be a followed by a low surrogate. */
if (length == 0)
{
*erroroffset = p - string;
*erroroffset = (PCRE2_SIZE)(p - string);
return PCRE2_ERROR_UTF16_ERR1;
}
p++;
length--;
if ((*p & 0xfc00) != 0xdc00)
{
*erroroffset = p - string - 1;
*erroroffset = (PCRE2_SIZE)(p - string) - 1;
return PCRE2_ERROR_UTF16_ERR2;
}
}
else
{
/* Isolated low surrogate. Always an error. */
*erroroffset = p - string;
*erroroffset = (PCRE2_SIZE)(p - string);
return PCRE2_ERROR_UTF16_ERR3;
}
}
@ -377,14 +377,14 @@ for (p = string; length > 0; length--, p++)
/* Normal UTF-32 code point. Neither high nor low surrogate. */
if (c > 0x10ffffu)
{
*erroroffset = p - string;
*erroroffset = (PCRE2_SIZE)(p - string);
return PCRE2_ERROR_UTF32_ERR2;
}
}
else
{
/* A surrogate */
*erroroffset = p - string;
*erroroffset = (PCRE2_SIZE)(p - string);
return PCRE2_ERROR_UTF32_ERR1;
}
}

View File

@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2022 University of Cambridge
New API code Copyright (c) 2016-2023 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -129,6 +129,7 @@ while ((t = *data++) != XCL_END)
#ifdef SUPPORT_UNICODE
else /* XCL_PROP & XCL_NOTPROP */
{
int chartype;
const ucd_record *prop = GET_UCD(c);
BOOL isprop = t == XCL_PROP;
BOOL ok;
@ -140,8 +141,9 @@ while ((t = *data++) != XCL_END)
break;
case PT_LAMP:
if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
prop->chartype == ucp_Lt) == isprop) return !negated;
chartype = prop->chartype;
if ((chartype == ucp_Lu || chartype == ucp_Ll ||
chartype == ucp_Lt) == isprop) return !negated;
break;
case PT_GC:
@ -164,8 +166,9 @@ while ((t = *data++) != XCL_END)
break;
case PT_ALNUM:
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
PRIV(ucp_gentype)[prop->chartype] == ucp_N) == isprop)
chartype = prop->chartype;
if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
PRIV(ucp_gentype)[chartype] == ucp_N) == isprop)
return !negated;
break;
@ -190,9 +193,10 @@ while ((t = *data++) != XCL_END)
break;
case PT_WORD:
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
== isprop)
chartype = prop->chartype;
if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
PRIV(ucp_gentype)[chartype] == ucp_N ||
chartype == ucp_Mn || chartype == ucp_Pc) == isprop)
return !negated;
break;
@ -234,9 +238,10 @@ while ((t = *data++) != XCL_END)
*/
case PT_PXGRAPH:
if ((PRIV(ucp_gentype)[prop->chartype] != ucp_Z &&
(PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
(prop->chartype == ucp_Cf &&
chartype = prop->chartype;
if ((PRIV(ucp_gentype)[chartype] != ucp_Z &&
(PRIV(ucp_gentype)[chartype] != ucp_C ||
(chartype == ucp_Cf &&
c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069))
)) == isprop)
return !negated;
@ -246,10 +251,11 @@ while ((t = *data++) != XCL_END)
not Zl and not Zp, and U+180E. */
case PT_PXPRINT:
if ((prop->chartype != ucp_Zl &&
prop->chartype != ucp_Zp &&
(PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
(prop->chartype == ucp_Cf &&
chartype = prop->chartype;
if ((chartype != ucp_Zl &&
chartype != ucp_Zp &&
(PRIV(ucp_gentype)[chartype] != ucp_C ||
(chartype == ucp_Cf &&
c != 0x061c && (c < 0x2066 || c > 0x2069))
)) == isprop)
return !negated;
@ -260,8 +266,21 @@ while ((t = *data++) != XCL_END)
compatibility (these are $+<=>^`|~). */
case PT_PXPUNCT:
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P ||
(c < 128 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop)
chartype = prop->chartype;
if ((PRIV(ucp_gentype)[chartype] == ucp_P ||
(c < 128 && PRIV(ucp_gentype)[chartype] == ucp_S)) == isprop)
return !negated;
break;
/* Perl has two sets of hex digits */
case PT_PXXDIGIT:
if (((c >= CHAR_0 && c <= CHAR_9) ||
(c >= CHAR_A && c <= CHAR_F) ||
(c >= CHAR_a && c <= CHAR_f) ||
(c >= 0xff10 && c <= 0xff19) || /* Fullwidth digits */
(c >= 0xff21 && c <= 0xff26) || /* Fullwidth letters */
(c >= 0xff41 && c <= 0xff46)) == isprop)
return !negated;
break;