GH #1586: Upgrade bundled PCRE to 8.40

This commit is contained in:
Guenter Obiltschnig 2017-02-11 12:04:36 +01:00
parent f8a0bbff1b
commit 7d91a4bc94
14 changed files with 6596 additions and 4713 deletions

View File

@ -40,7 +40,7 @@ public:
// Implementation note: the following definitions must be kept
// in sync with those from ucp.h (PCRE).
enum CharacterCategory
/// Unicode 5.0 character categories.
/// Unicode character categories.
{
UCP_OTHER,
UCP_LETTER,
@ -52,7 +52,7 @@ public:
};
enum CharacterType
/// Unicode 5.0 character types.
/// Unicode character types.
{
UCP_CONTROL,
UCP_FORMAT,
@ -87,7 +87,7 @@ public:
};
enum Script
/// Unicode 5.0 scripts.
/// Unicode 7.0 script identifiers.
{
UCP_ARABIC,
UCP_ARMENIAN,
@ -150,11 +150,13 @@ public:
UCP_TIFINAGH,
UCP_UGARITIC,
UCP_YI,
// Unicode 5.0
UCP_BALINESE,
UCP_CUNEIFORM,
UCP_NKO,
UCP_PHAGS_PA,
UCP_PHOENICIAN,
// Unicode 5.1
UCP_CARIAN,
UCP_CHAM,
UCP_KAYAH_LI,
@ -165,7 +167,59 @@ public:
UCP_REJANG,
UCP_SAURASHTRA,
UCP_SUNDANESE,
UCP_VAI
UCP_VAI,
// Unicode 5.2
UCP_AVESTAN,
UCP_BAMUM,
UCP_EGYPTIAN_HIEROGLYPHS,
UCP_IMPERIAL_ARAMAIC,
UCP_INSCRIPTIONAL_PAHLAVI,
UCP_INSCRIPTIONAL_PARTHIAN,
UCP_JAVANESE,
UCP_KAITHI,
UCP_LISU,
UCP_MEETEI_MAYEK,
UCP_OLD_SOUTH_ARABIAN,
UCP_OLD_TURKIC,
UCP_SAMARITAN,
UCP_TAI_THAM,
UCP_TAI_VIET,
// Unicode 6.0
UCP_BATAK,
UCP_BRAHMI,
UCP_MANDAIC,
// Unicode 6.1
UCP_CHAKMA,
UCP_MEROITIC_CURSIVE,
UCP_MEROITIC_HIEROGLYPHS,
UCP_MIAO,
UCP_SHARADA,
UCP_SORA_SOMPENG,
UCP_TAKRI,
// Unicode 7.0
UCP_BASSA_VAH,
UCP_CAUCASIAN_ALBANIAN,
UCP_DUPLOYAN,
UCP_ELBASAN,
UCP_GRANTHA,
UCP_KHOJKI,
UCP_KHUDAWADI,
UCP_LINEAR_A,
UCP_MAHAJANI,
UCP_MANICHAEAN,
UCP_MENDE_KIKAKUI,
UCP_MODI,
UCP_MRO,
UCP_NABATAEAN,
UCP_OLD_NORTH_ARABIAN,
UCP_OLD_PERMIC,
UCP_PAHAWH_HMONG,
UCP_PALMYRENE,
UCP_PSALTER_PAHLAVI,
UCP_PAU_CIN_HAU,
UCP_SIDDHAM,
UCP_TIRHUTA,
UCP_WARANG_CITI
};
enum

View File

@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
/* The current PCRE version information. */
#define PCRE_MAJOR 8
#define PCRE_MINOR 35
#define PCRE_MINOR 40
#define PCRE_PRERELEASE
#define PCRE_DATE 2014-04-04
#define PCRE_DATE 2017-01-11
/* When an application links to a PCRE DLL in Windows, the symbols that are
imported have to be identified as such. When building PCRE, the appropriate

File diff suppressed because it is too large Load Diff

View File

@ -283,7 +283,7 @@ sure both macros are undefined; an emulation function will then be used. */
#define PACKAGE_NAME "PCRE"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "PCRE 8.35"
#define PACKAGE_STRING "PCRE 8.40"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "pcre"
@ -292,7 +292,7 @@ sure both macros are undefined; an emulation function will then be used. */
#define PACKAGE_URL ""
/* Define to the version of this package. */
#define PACKAGE_VERSION "8.35"
#define PACKAGE_VERSION "8.40"
/* The value of PARENS_NEST_LIMIT specifies the maximum depth of nested
parentheses (of any kind) in a pattern. This limits the amount of system
@ -394,7 +394,7 @@ sure both macros are undefined; an emulation function will then be used. */
/* #undef SUPPORT_VALGRIND */
/* Version number of package */
#define VERSION "8.35"
#define VERSION "8.40"
/* Define to empty if `const' does not conform to ANSI C. */
/* #undef const */

View File

@ -2735,9 +2735,10 @@ for (;;)
condcode == OP_DNRREF)
return PCRE_ERROR_DFA_UCOND;
/* The DEFINE condition is always false */
/* The DEFINE condition is always false, and the assertion (?!) is
converted to OP_FAIL. */
if (condcode == OP_DEF)
if (condcode == OP_DEF || condcode == OP_FAIL)
{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
/* The only supported version of OP_RREF is for the value RREF_ANY,
@ -3241,7 +3242,7 @@ md->callout_data = NULL;
if (extra_data != NULL)
{
unsigned int flags = extra_data->flags;
unsigned long int flags = extra_data->flags;
if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
study = (const pcre_study_data *)extra_data->study_data;
if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;

View File

@ -1137,88 +1137,81 @@ for (;;)
printf("\n");
#endif
if (offset < md->offset_max)
if (offset >= md->offset_max) goto POSSESSIVE_NON_CAPTURE;
matched_once = FALSE;
code_offset = (int)(ecode - md->start_code);
save_offset1 = md->offset_vector[offset];
save_offset2 = md->offset_vector[offset+1];
save_offset3 = md->offset_vector[md->offset_end - number];
save_capture_last = md->capture_last;
DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
/* Each time round the loop, save the current subject position for use
when the group matches. For MATCH_MATCH, the group has matched, so we
restart it with a new subject starting position, remembering that we had
at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
usual. If we haven't matched any alternatives in any iteration, check to
see if a previous iteration matched. If so, the group has matched;
continue from afterwards. Otherwise it has failed; restore the previous
capture values before returning NOMATCH. */
for (;;)
{
matched_once = FALSE;
code_offset = (int)(ecode - md->start_code);
save_offset1 = md->offset_vector[offset];
save_offset2 = md->offset_vector[offset+1];
save_offset3 = md->offset_vector[md->offset_end - number];
save_capture_last = md->capture_last;
DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
/* Each time round the loop, save the current subject position for use
when the group matches. For MATCH_MATCH, the group has matched, so we
restart it with a new subject starting position, remembering that we had
at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
usual. If we haven't matched any alternatives in any iteration, check to
see if a previous iteration matched. If so, the group has matched;
continue from afterwards. Otherwise it has failed; restore the previous
capture values before returning NOMATCH. */
for (;;)
md->offset_vector[md->offset_end - number] =
(int)(eptr - md->start_subject);
if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
eptrb, RM63);
if (rrc == MATCH_KETRPOS)
{
md->offset_vector[md->offset_end - number] =
(int)(eptr - md->start_subject);
if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
eptrb, RM63);
if (rrc == MATCH_KETRPOS)
offset_top = md->end_offset_top;
ecode = md->start_code + code_offset;
save_capture_last = md->capture_last;
matched_once = TRUE;
mstart = md->start_match_ptr; /* In case \K changed it */
if (eptr == md->end_match_ptr) /* Matched an empty string */
{
offset_top = md->end_offset_top;
eptr = md->end_match_ptr;
ecode = md->start_code + code_offset;
save_capture_last = md->capture_last;
matched_once = TRUE;
mstart = md->start_match_ptr; /* In case \K changed it */
continue;
do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
break;
}
/* See comment in the code for capturing groups above about handling
THEN. */
if (rrc == MATCH_THEN)
{
next = ecode + GET(ecode,1);
if (md->start_match_ptr < next &&
(*ecode == OP_ALT || *next == OP_ALT))
rrc = MATCH_NOMATCH;
}
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
md->capture_last = save_capture_last;
ecode += GET(ecode, 1);
if (*ecode != OP_ALT) break;
eptr = md->end_match_ptr;
continue;
}
if (!matched_once)
/* See comment in the code for capturing groups above about handling
THEN. */
if (rrc == MATCH_THEN)
{
md->offset_vector[offset] = save_offset1;
md->offset_vector[offset+1] = save_offset2;
md->offset_vector[md->offset_end - number] = save_offset3;
next = ecode + GET(ecode,1);
if (md->start_match_ptr < next &&
(*ecode == OP_ALT || *next == OP_ALT))
rrc = MATCH_NOMATCH;
}
if (allow_zero || matched_once)
{
ecode += 1 + LINK_SIZE;
break;
}
RRETURN(MATCH_NOMATCH);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
md->capture_last = save_capture_last;
ecode += GET(ecode, 1);
if (*ecode != OP_ALT) break;
}
/* FALL THROUGH ... Insufficient room for saving captured contents. Treat
as a non-capturing bracket. */
if (!matched_once)
{
md->offset_vector[offset] = save_offset1;
md->offset_vector[offset+1] = save_offset2;
md->offset_vector[md->offset_end - number] = save_offset3;
}
/* VVVVVVVVVVVVVVVVVVVVVVVVV */
/* VVVVVVVVVVVVVVVVVVVVVVVVV */
if (allow_zero || matched_once)
{
ecode += 1 + LINK_SIZE;
break;
}
DPRINTF(("insufficient capture room: treat as non-capturing\n"));
/* VVVVVVVVVVVVVVVVVVVVVVVVV */
/* VVVVVVVVVVVVVVVVVVVVVVVVV */
RRETURN(MATCH_NOMATCH);
/* Non-capturing possessive bracket with unlimited repeat. We come here
from BRAZERO with allow_zero = TRUE. The code is similar to the above,
@ -1242,10 +1235,15 @@ for (;;)
if (rrc == MATCH_KETRPOS)
{
offset_top = md->end_offset_top;
eptr = md->end_match_ptr;
ecode = md->start_code + code_offset;
matched_once = TRUE;
mstart = md->start_match_ptr; /* In case \K reset it */
if (eptr == md->end_match_ptr) /* Matched an empty string */
{
do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
break;
}
eptr = md->end_match_ptr;
continue;
}
@ -1379,6 +1377,7 @@ for (;;)
break;
case OP_DEF: /* DEFINE - always false */
case OP_FAIL: /* From optimized (?!) condition */
break;
/* The condition is an assertion. Call match() to evaluate it - setting
@ -1395,8 +1394,11 @@ for (;;)
condition = TRUE;
/* Advance ecode past the assertion to the start of the first branch,
but adjust it so that the general choosing code below works. */
but adjust it so that the general choosing code below works. If the
assertion has a quantifier that allows zero repeats we must skip over
the BRAZERO. This is a lunatic thing to do, but somebody did! */
if (*ecode == OP_BRAZERO) ecode++;
ecode += GET(ecode, 1);
while (*ecode == OP_ALT) ecode += GET(ecode, 1);
ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
@ -1465,7 +1467,18 @@ for (;;)
md->offset_vector[offset] =
md->offset_vector[md->offset_end - number];
md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
if (offset_top <= offset) offset_top = offset + 2;
/* If this group is at or above the current highwater mark, ensure that
any groups between the current high water mark and this group are marked
unset and then update the high water mark. */
if (offset >= offset_top)
{
register int *iptr = md->offset_vector + offset_top;
register int *iend = md->offset_vector + offset;
while (iptr < iend) *iptr++ = -1;
offset_top = offset + 2;
}
}
ecode += 1 + IMM2_SIZE;
break;
@ -1817,7 +1830,11 @@ for (;;)
are defined in a range that can be tested for. */
if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
{
if (new_recursive.offset_save != stacksave)
(PUBL(free))(new_recursive.offset_save);
RRETURN(MATCH_NOMATCH);
}
/* Any return code other than NOMATCH is an error. */
@ -1980,6 +1997,19 @@ for (;;)
}
}
/* OP_KETRPOS is a possessive repeating ket. Remember the current position,
and return the MATCH_KETRPOS. This makes it possible to do the repeats one
at a time from the outer level, thus saving stack. This must precede the
empty string test - in this case that test is done at the outer level. */
if (*ecode == OP_KETRPOS)
{
md->start_match_ptr = mstart; /* In case \K reset it */
md->end_match_ptr = eptr;
md->end_offset_top = offset_top;
RRETURN(MATCH_KETRPOS);
}
/* For an ordinary non-repeating ket, just continue at this level. This
also happens for a repeating ket if no characters were matched in the
group. This is the forcible breaking of infinite loops as implemented in
@ -2002,18 +2032,6 @@ for (;;)
break;
}
/* OP_KETRPOS is a possessive repeating ket. Remember the current position,
and return the MATCH_KETRPOS. This makes it possible to do the repeats one
at a time from the outer level, thus saving stack. */
if (*ecode == OP_KETRPOS)
{
md->start_match_ptr = mstart; /* In case \K reset it */
md->end_match_ptr = eptr;
md->end_offset_top = offset_top;
RRETURN(MATCH_KETRPOS);
}
/* The normal repeating kets try the rest of the pattern or restart from
the preceding bracket, in the appropriate order. In the second case, we can
use tail recursion to avoid using another stack frame, unless we have an
@ -3466,7 +3484,7 @@ for (;;)
if (possessive) continue; /* No backtracking */
for(;;)
{
if (eptr == pp) goto TAIL_RECURSE;
if (eptr <= pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
#ifdef SUPPORT_UCP
@ -3887,7 +3905,7 @@ for (;;)
if (possessive) continue; /* No backtracking */
for(;;)
{
if (eptr == pp) goto TAIL_RECURSE;
if (eptr <= pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
@ -4022,7 +4040,7 @@ for (;;)
if (possessive) continue; /* No backtracking */
for(;;)
{
if (eptr == pp) goto TAIL_RECURSE;
if (eptr <= pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
@ -5593,7 +5611,7 @@ for (;;)
if (possessive) continue; /* No backtracking */
for(;;)
{
if (eptr == pp) goto TAIL_RECURSE;
if (eptr <= pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
@ -5635,12 +5653,17 @@ for (;;)
if (possessive) continue; /* No backtracking */
/* We use <= pp rather than == pp to detect the start of the run while
backtracking because the use of \C in UTF mode can cause BACKCHAR to
move back past pp. This is just palliative; the use of \C in UTF mode
is fraught with danger. */
for(;;)
{
int lgb, rgb;
PCRE_PUCHAR fptr;
if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
@ -5658,7 +5681,7 @@ for (;;)
for (;;)
{
if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
fptr = eptr - 1;
if (!utf) c = *fptr; else
{
@ -5682,54 +5705,25 @@ for (;;)
switch(ctype)
{
case OP_ANY:
if (max < INT_MAX)
for (i = min; i < max; i++)
{
for (i = min; i < max; i++)
if (eptr >= md->end_subject)
{
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
break;
}
if (IS_NEWLINE(eptr)) break;
if (md->partial != 0 && /* Take care with CRLF partial */
eptr + 1 >= md->end_subject &&
NLBLOCK->nltype == NLTYPE_FIXED &&
NLBLOCK->nllen == 2 &&
UCHAR21(eptr) == NLBLOCK->nl[0])
{
md->hitend = TRUE;
if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
}
eptr++;
ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
SCHECK_PARTIAL();
break;
}
}
/* Handle unlimited UTF-8 repeat */
else
{
for (i = min; i < max; i++)
if (IS_NEWLINE(eptr)) break;
if (md->partial != 0 && /* Take care with CRLF partial */
eptr + 1 >= md->end_subject &&
NLBLOCK->nltype == NLTYPE_FIXED &&
NLBLOCK->nllen == 2 &&
UCHAR21(eptr) == NLBLOCK->nl[0])
{
if (eptr >= md->end_subject)
{
SCHECK_PARTIAL();
break;
}
if (IS_NEWLINE(eptr)) break;
if (md->partial != 0 && /* Take care with CRLF partial */
eptr + 1 >= md->end_subject &&
NLBLOCK->nltype == NLTYPE_FIXED &&
NLBLOCK->nllen == 2 &&
UCHAR21(eptr) == NLBLOCK->nl[0])
{
md->hitend = TRUE;
if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
}
eptr++;
ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
md->hitend = TRUE;
if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
}
eptr++;
ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
}
break;
@ -5937,7 +5931,7 @@ for (;;)
if (possessive) continue; /* No backtracking */
for(;;)
{
if (eptr == pp) goto TAIL_RECURSE;
if (eptr <= pp) goto TAIL_RECURSE;
RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
eptr--;
@ -6520,7 +6514,7 @@ tables = re->tables;
if (extra_data != NULL)
{
register unsigned int flags = extra_data->flags;
unsigned long int flags = extra_data->flags;
if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
study = (const pcre_study_data *)extra_data->study_data;
if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
@ -6692,7 +6686,8 @@ if (md->offset_vector != NULL)
register int *iend = iptr - re->top_bracket;
if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
while (--iptr >= iend) *iptr = -1;
md->offset_vector[0] = md->offset_vector[1] = -1;
if (offsetcount > 0) md->offset_vector[0] = -1;
if (offsetcount > 1) md->offset_vector[1] = -1;
}
/* Set up the first character to match, if available. The first_char value is

View File

@ -247,6 +247,7 @@ Arguments:
code the compiled regex
stringname the name of the capturing substring
ovector the vector of matched substrings
stringcount number of captured substrings
Returns: the number of the first that is set,
or the number of the last one if none are set,
@ -255,13 +256,16 @@ Returns: the number of the first that is set,
#if defined COMPILE_PCRE8
static int
get_first_set(const pcre *code, const char *stringname, int *ovector)
get_first_set(const pcre *code, const char *stringname, int *ovector,
int stringcount)
#elif defined COMPILE_PCRE16
static int
get_first_set(const pcre16 *code, PCRE_SPTR16 stringname, int *ovector)
get_first_set(const pcre16 *code, PCRE_SPTR16 stringname, int *ovector,
int stringcount)
#elif defined COMPILE_PCRE32
static int
get_first_set(const pcre32 *code, PCRE_SPTR32 stringname, int *ovector)
get_first_set(const pcre32 *code, PCRE_SPTR32 stringname, int *ovector,
int stringcount)
#endif
{
const REAL_PCRE *re = (const REAL_PCRE *)code;
@ -292,7 +296,7 @@ if (entrysize <= 0) return entrysize;
for (entry = (pcre_uchar *)first; entry <= (pcre_uchar *)last; entry += entrysize)
{
int n = GET2(entry, 0);
if (ovector[n*2] >= 0) return n;
if (n < stringcount && ovector[n*2] >= 0) return n;
}
return GET2(entry, 0);
}
@ -399,7 +403,7 @@ pcre32_copy_named_substring(const pcre32 *code, PCRE_SPTR32 subject,
PCRE_UCHAR32 *buffer, int size)
#endif
{
int n = get_first_set(code, stringname, ovector);
int n = get_first_set(code, stringname, ovector, stringcount);
if (n <= 0) return n;
#if defined COMPILE_PCRE8
return pcre_copy_substring(subject, ovector, stringcount, n, buffer, size);
@ -454,7 +458,10 @@ pcre_uchar **stringlist;
pcre_uchar *p;
for (i = 0; i < double_count; i += 2)
size += sizeof(pcre_uchar *) + IN_UCHARS(ovector[i+1] - ovector[i] + 1);
{
size += sizeof(pcre_uchar *) + IN_UCHARS(1);
if (ovector[i+1] > ovector[i]) size += IN_UCHARS(ovector[i+1] - ovector[i]);
}
stringlist = (pcre_uchar **)(PUBL(malloc))(size);
if (stringlist == NULL) return PCRE_ERROR_NOMEMORY;
@ -470,7 +477,7 @@ p = (pcre_uchar *)(stringlist + stringcount + 1);
for (i = 0; i < double_count; i += 2)
{
int len = ovector[i+1] - ovector[i];
int len = (ovector[i+1] > ovector[i])? (ovector[i+1] - ovector[i]) : 0;
memcpy(p, subject + ovector[i], IN_UCHARS(len));
*stringlist++ = p;
p += len;
@ -616,7 +623,7 @@ pcre32_get_named_substring(const pcre32 *code, PCRE_SPTR32 subject,
PCRE_SPTR32 *stringptr)
#endif
{
int n = get_first_set(code, stringname, ovector);
int n = get_first_set(code, stringname, ovector, stringcount);
if (n <= 0) return n;
#if defined COMPILE_PCRE8
return pcre_get_substring(subject, ovector, stringcount, n, stringptr);

View File

@ -7,7 +7,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2014 University of Cambridge
Copyright (c) 1997-2016 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -229,9 +229,9 @@ stdint.h is available, include it; it may define INT64_MAX. Systems that do not
have stdint.h (e.g. Solaris) may have inttypes.h. The macro int64_t may be set
by "configure". */
#if HAVE_STDINT_H
#if defined HAVE_STDINT_H
#include <stdint.h>
#elif HAVE_INTTYPES_H
#elif defined HAVE_INTTYPES_H
#include <inttypes.h>
#endif
@ -275,7 +275,7 @@ pcre.h(.in) and disable (comment out) this message. */
typedef pcre_uint16 pcre_uchar;
#define UCHAR_SHIFT (1)
#define IN_UCHARS(x) ((x) << UCHAR_SHIFT)
#define IN_UCHARS(x) ((x) * 2)
#define MAX_255(c) ((c) <= 255u)
#define TABLE_GET(c, table, default) (MAX_255(c)? ((table)[c]):(default))
@ -283,7 +283,7 @@ typedef pcre_uint16 pcre_uchar;
typedef pcre_uint32 pcre_uchar;
#define UCHAR_SHIFT (2)
#define IN_UCHARS(x) ((x) << UCHAR_SHIFT)
#define IN_UCHARS(x) ((x) * 4)
#define MAX_255(c) ((c) <= 255u)
#define TABLE_GET(c, table, default) (MAX_255(c)? ((table)[c]):(default))
@ -984,7 +984,7 @@ other. NOTE: The values also appear in pcre_jit_compile.c. */
#ifndef EBCDIC
#define HSPACE_LIST \
CHAR_HT, CHAR_SPACE, 0xa0, \
CHAR_HT, CHAR_SPACE, CHAR_NBSP, \
0x1680, 0x180e, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, \
0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202f, 0x205f, 0x3000, \
NOTACHAR
@ -1010,7 +1010,7 @@ other. NOTE: The values also appear in pcre_jit_compile.c. */
#define HSPACE_BYTE_CASES \
case CHAR_HT: \
case CHAR_SPACE: \
case 0xa0 /* NBSP */
case CHAR_NBSP
#define HSPACE_CASES \
HSPACE_BYTE_CASES: \
@ -1037,11 +1037,12 @@ other. NOTE: The values also appear in pcre_jit_compile.c. */
/* ------ EBCDIC environments ------ */
#else
#define HSPACE_LIST CHAR_HT, CHAR_SPACE
#define HSPACE_LIST CHAR_HT, CHAR_SPACE, CHAR_NBSP, NOTACHAR
#define HSPACE_BYTE_CASES \
case CHAR_HT: \
case CHAR_SPACE
case CHAR_SPACE: \
case CHAR_NBSP
#define HSPACE_CASES HSPACE_BYTE_CASES
@ -1215,6 +1216,7 @@ same code point. */
#define CHAR_ESC '\047'
#define CHAR_DEL '\007'
#define CHAR_NBSP '\x41'
#define STR_ESC "\047"
#define STR_DEL "\007"
@ -1229,6 +1231,7 @@ a positive value. */
#define CHAR_NEL ((unsigned char)'\x85')
#define CHAR_ESC '\033'
#define CHAR_DEL '\177'
#define CHAR_NBSP ((unsigned char)'\xa0')
#define STR_LF "\n"
#define STR_NL STR_LF
@ -1606,6 +1609,7 @@ only. */
#define CHAR_VERTICAL_LINE '\174'
#define CHAR_RIGHT_CURLY_BRACKET '\175'
#define CHAR_TILDE '\176'
#define CHAR_NBSP ((unsigned char)'\xa0')
#define STR_HT "\011"
#define STR_VT "\013"
@ -1762,6 +1766,10 @@ only. */
/* Escape items that are just an encoding of a particular data value. */
#ifndef ESC_a
#define ESC_a CHAR_BEL
#endif
#ifndef ESC_e
#define ESC_e CHAR_ESC
#endif
@ -2281,7 +2289,7 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69,
ERR70, ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79,
ERR80, ERR81, ERR82, ERR83, ERR84, ERR85, ERRCOUNT };
ERR80, ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERRCOUNT };
/* JIT compiling modes. The function list is indexed by them. */
@ -2446,6 +2454,8 @@ typedef struct compile_data {
BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */
BOOL check_lookbehind; /* Lookbehinds need later checking */
BOOL dupnames; /* Duplicate names exist */
BOOL dupgroups; /* Duplicate groups exist: (?| found */
BOOL iscondassert; /* Next assert is a condition */
int nltype; /* Newline type */
int nllen; /* Newline string length */
pcre_uchar nl[4]; /* Newline string when fixed length */
@ -2459,6 +2469,13 @@ typedef struct branch_chain {
pcre_uchar *current_branch;
} branch_chain;
/* Structure for mutual recursion detection. */
typedef struct recurse_check {
struct recurse_check *prev;
const pcre_uchar *group;
} recurse_check;
/* Structure for items in a linked list that represents an explicit recursive
call within the pattern; used by pcre_exec(). */

File diff suppressed because it is too large Load Diff

View File

@ -67,7 +67,8 @@ Arguments:
code pointer to start of group (the bracket)
startcode pointer to start of the whole pattern's code
options the compiling options
int RECURSE depth
recurses chain of recurse_check to catch mutual recursion
countptr pointer to call count (to catch over complexity)
Returns: the minimum length
-1 if \C in UTF-8 mode or (*ACCEPT) was encountered
@ -77,15 +78,19 @@ Returns: the minimum length
static int
find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
const pcre_uchar *startcode, int options, int recurse_depth)
const pcre_uchar *startcode, int options, recurse_check *recurses,
int *countptr)
{
int length = -1;
/* PCRE_UTF16 has the same value as PCRE_UTF8. */
BOOL utf = (options & PCRE_UTF8) != 0;
BOOL had_recurse = FALSE;
recurse_check this_recurse;
register int branchlength = 0;
register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
if ((*countptr)++ > 1000) return -1; /* too complex */
if (*code == OP_CBRA || *code == OP_SCBRA ||
*code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
@ -127,7 +132,7 @@ for (;;)
case OP_SBRAPOS:
case OP_ONCE:
case OP_ONCE_NC:
d = find_minlength(re, cc, startcode, options, recurse_depth);
d = find_minlength(re, cc, startcode, options, recurses, countptr);
if (d < 0) return d;
branchlength += d;
do cc += GET(cc, 1); while (*cc == OP_ALT);
@ -390,7 +395,7 @@ for (;;)
ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
if (cs == NULL) return -2;
do ce += GET(ce, 1); while (*ce == OP_ALT);
if (cc > cs && cc < ce)
if (cc > cs && cc < ce) /* Simple recursion */
{
d = 0;
had_recurse = TRUE;
@ -398,8 +403,23 @@ for (;;)
}
else
{
int dd = find_minlength(re, cs, startcode, options, recurse_depth);
if (dd < d) d = dd;
recurse_check *r = recurses;
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
if (r != NULL) /* Mutual recursion */
{
d = 0;
had_recurse = TRUE;
break;
}
else
{
int dd;
this_recurse.prev = recurses;
this_recurse.group = cs;
dd = find_minlength(re, cs, startcode, options, &this_recurse,
countptr);
if (dd < d) d = dd;
}
}
slot += re->name_entry_size;
}
@ -415,14 +435,27 @@ for (;;)
ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
if (cs == NULL) return -2;
do ce += GET(ce, 1); while (*ce == OP_ALT);
if (cc > cs && cc < ce)
if (cc > cs && cc < ce) /* Simple recursion */
{
d = 0;
had_recurse = TRUE;
}
else
{
d = find_minlength(re, cs, startcode, options, recurse_depth);
recurse_check *r = recurses;
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
if (r != NULL) /* Mutual recursion */
{
d = 0;
had_recurse = TRUE;
}
else
{
this_recurse.prev = recurses;
this_recurse.group = cs;
d = find_minlength(re, cs, startcode, options, &this_recurse,
countptr);
}
}
}
else d = 0;
@ -471,12 +504,21 @@ for (;;)
case OP_RECURSE:
cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
do ce += GET(ce, 1); while (*ce == OP_ALT);
if ((cc > cs && cc < ce) || recurse_depth > 10)
if (cc > cs && cc < ce) /* Simple recursion */
had_recurse = TRUE;
else
{
branchlength += find_minlength(re, cs, startcode, options,
recurse_depth + 1);
recurse_check *r = recurses;
for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
if (r != NULL) /* Mutual recursion */
had_recurse = TRUE;
else
{
this_recurse.prev = recurses;
this_recurse.group = cs;
branchlength += find_minlength(re, cs, startcode, options,
&this_recurse, countptr);
}
}
cc += 1 + LINK_SIZE;
break;
@ -860,7 +902,6 @@ do
case OP_NOTUPTOI:
case OP_NOT_HSPACE:
case OP_NOT_VSPACE:
case OP_PROP:
case OP_PRUNE:
case OP_PRUNE_ARG:
case OP_RECURSE:
@ -878,6 +919,31 @@ do
case OP_THEN_ARG:
return SSB_FAIL;
/* A "real" property test implies no starting bits, but the fake property
PT_CLIST identifies a list of characters. These lists are short, as they
are used for characters with more than one "other case", so there is no
point in recognizing them for OP_NOTPROP. */
case OP_PROP:
if (tcode[1] != PT_CLIST) return SSB_FAIL;
{
const pcre_uint32 *p = PRIV(ucd_caseless_sets) + tcode[2];
while ((c = *p++) < NOTACHAR)
{
#if defined SUPPORT_UTF && defined COMPILE_PCRE8
if (utf)
{
pcre_uchar buff[6];
(void)PRIV(ord2utf)(c, buff);
c = buff[0];
}
#endif
if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
}
}
try_next = FALSE;
break;
/* We can ignore word boundary tests. */
case OP_WORD_BOUNDARY:
@ -1103,24 +1169,17 @@ do
try_next = FALSE;
break;
/* The cbit_space table has vertical tab as whitespace; we have to
ensure it is set as not whitespace. Luckily, the code value is the same
(0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate bit. */
/* The cbit_space table has vertical tab as whitespace; we no longer
have to play fancy tricks because Perl added VT to its whitespace at
release 5.18. PCRE added it at release 8.34. */
case OP_NOT_WHITESPACE:
set_nottype_bits(start_bits, cbit_space, table_limit, cd);
start_bits[1] |= 0x08;
try_next = FALSE;
break;
/* The cbit_space table has vertical tab as whitespace; we have to not
set it from the table. Luckily, the code value is the same (0x0b) in
ASCII and EBCDIC, so we can just adjust the appropriate bit. */
case OP_WHITESPACE:
c = start_bits[1]; /* Save in case it was already set */
set_type_bits(start_bits, cbit_space, table_limit, cd);
start_bits[1] = (start_bits[1] & ~0x08) | c;
try_next = FALSE;
break;
@ -1309,7 +1368,7 @@ do
for (c = 0; c < 16; c++) start_bits[c] |= map[c];
for (c = 128; c < 256; c++)
{
if ((map[c/8] && (1 << (c&7))) != 0)
if ((map[c/8] & (1 << (c&7))) != 0)
{
int d = (c >> 6) | 0xc0; /* Set bit for this starter */
start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
@ -1397,6 +1456,7 @@ pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
#endif
{
int min;
int count = 0;
BOOL bits_set = FALSE;
pcre_uint8 start_bits[32];
PUBL(extra) *extra = NULL;
@ -1483,7 +1543,7 @@ if ((re->options & PCRE_ANCHORED) == 0 &&
/* Find the minimum length of subject string. */
switch(min = find_minlength(re, code, code, re->options, 0))
switch(min = find_minlength(re, code, code, re->options, NULL, &count))
{
case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
case -3: *errorptr = "internal error: opcode not recognized"; return NULL;

View File

@ -209,6 +209,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Avestan0 STR_A STR_v STR_e STR_s STR_t STR_a STR_n "\0"
#define STRING_Balinese0 STR_B STR_a STR_l STR_i STR_n STR_e STR_s STR_e "\0"
#define STRING_Bamum0 STR_B STR_a STR_m STR_u STR_m "\0"
#define STRING_Bassa_Vah0 STR_B STR_a STR_s STR_s STR_a STR_UNDERSCORE STR_V STR_a STR_h "\0"
#define STRING_Batak0 STR_B STR_a STR_t STR_a STR_k "\0"
#define STRING_Bengali0 STR_B STR_e STR_n STR_g STR_a STR_l STR_i "\0"
#define STRING_Bopomofo0 STR_B STR_o STR_p STR_o STR_m STR_o STR_f STR_o "\0"
@ -219,6 +220,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_C0 STR_C "\0"
#define STRING_Canadian_Aboriginal0 STR_C STR_a STR_n STR_a STR_d STR_i STR_a STR_n STR_UNDERSCORE STR_A STR_b STR_o STR_r STR_i STR_g STR_i STR_n STR_a STR_l "\0"
#define STRING_Carian0 STR_C STR_a STR_r STR_i STR_a STR_n "\0"
#define STRING_Caucasian_Albanian0 STR_C STR_a STR_u STR_c STR_a STR_s STR_i STR_a STR_n STR_UNDERSCORE STR_A STR_l STR_b STR_a STR_n STR_i STR_a STR_n "\0"
#define STRING_Cc0 STR_C STR_c "\0"
#define STRING_Cf0 STR_C STR_f "\0"
#define STRING_Chakma0 STR_C STR_h STR_a STR_k STR_m STR_a "\0"
@ -234,11 +236,14 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Cyrillic0 STR_C STR_y STR_r STR_i STR_l STR_l STR_i STR_c "\0"
#define STRING_Deseret0 STR_D STR_e STR_s STR_e STR_r STR_e STR_t "\0"
#define STRING_Devanagari0 STR_D STR_e STR_v STR_a STR_n STR_a STR_g STR_a STR_r STR_i "\0"
#define STRING_Duployan0 STR_D STR_u STR_p STR_l STR_o STR_y STR_a STR_n "\0"
#define STRING_Egyptian_Hieroglyphs0 STR_E STR_g STR_y STR_p STR_t STR_i STR_a STR_n STR_UNDERSCORE STR_H STR_i STR_e STR_r STR_o STR_g STR_l STR_y STR_p STR_h STR_s "\0"
#define STRING_Elbasan0 STR_E STR_l STR_b STR_a STR_s STR_a STR_n "\0"
#define STRING_Ethiopic0 STR_E STR_t STR_h STR_i STR_o STR_p STR_i STR_c "\0"
#define STRING_Georgian0 STR_G STR_e STR_o STR_r STR_g STR_i STR_a STR_n "\0"
#define STRING_Glagolitic0 STR_G STR_l STR_a STR_g STR_o STR_l STR_i STR_t STR_i STR_c "\0"
#define STRING_Gothic0 STR_G STR_o STR_t STR_h STR_i STR_c "\0"
#define STRING_Grantha0 STR_G STR_r STR_a STR_n STR_t STR_h STR_a "\0"
#define STRING_Greek0 STR_G STR_r STR_e STR_e STR_k "\0"
#define STRING_Gujarati0 STR_G STR_u STR_j STR_a STR_r STR_a STR_t STR_i "\0"
#define STRING_Gurmukhi0 STR_G STR_u STR_r STR_m STR_u STR_k STR_h STR_i "\0"
@ -258,12 +263,15 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Kayah_Li0 STR_K STR_a STR_y STR_a STR_h STR_UNDERSCORE STR_L STR_i "\0"
#define STRING_Kharoshthi0 STR_K STR_h STR_a STR_r STR_o STR_s STR_h STR_t STR_h STR_i "\0"
#define STRING_Khmer0 STR_K STR_h STR_m STR_e STR_r "\0"
#define STRING_Khojki0 STR_K STR_h STR_o STR_j STR_k STR_i "\0"
#define STRING_Khudawadi0 STR_K STR_h STR_u STR_d STR_a STR_w STR_a STR_d STR_i "\0"
#define STRING_L0 STR_L "\0"
#define STRING_L_AMPERSAND0 STR_L STR_AMPERSAND "\0"
#define STRING_Lao0 STR_L STR_a STR_o "\0"
#define STRING_Latin0 STR_L STR_a STR_t STR_i STR_n "\0"
#define STRING_Lepcha0 STR_L STR_e STR_p STR_c STR_h STR_a "\0"
#define STRING_Limbu0 STR_L STR_i STR_m STR_b STR_u "\0"
#define STRING_Linear_A0 STR_L STR_i STR_n STR_e STR_a STR_r STR_UNDERSCORE STR_A "\0"
#define STRING_Linear_B0 STR_L STR_i STR_n STR_e STR_a STR_r STR_UNDERSCORE STR_B "\0"
#define STRING_Lisu0 STR_L STR_i STR_s STR_u "\0"
#define STRING_Ll0 STR_L STR_l "\0"
@ -274,18 +282,24 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Lycian0 STR_L STR_y STR_c STR_i STR_a STR_n "\0"
#define STRING_Lydian0 STR_L STR_y STR_d STR_i STR_a STR_n "\0"
#define STRING_M0 STR_M "\0"
#define STRING_Mahajani0 STR_M STR_a STR_h STR_a STR_j STR_a STR_n STR_i "\0"
#define STRING_Malayalam0 STR_M STR_a STR_l STR_a STR_y STR_a STR_l STR_a STR_m "\0"
#define STRING_Mandaic0 STR_M STR_a STR_n STR_d STR_a STR_i STR_c "\0"
#define STRING_Manichaean0 STR_M STR_a STR_n STR_i STR_c STR_h STR_a STR_e STR_a STR_n "\0"
#define STRING_Mc0 STR_M STR_c "\0"
#define STRING_Me0 STR_M STR_e "\0"
#define STRING_Meetei_Mayek0 STR_M STR_e STR_e STR_t STR_e STR_i STR_UNDERSCORE STR_M STR_a STR_y STR_e STR_k "\0"
#define STRING_Mende_Kikakui0 STR_M STR_e STR_n STR_d STR_e STR_UNDERSCORE STR_K STR_i STR_k STR_a STR_k STR_u STR_i "\0"
#define STRING_Meroitic_Cursive0 STR_M STR_e STR_r STR_o STR_i STR_t STR_i STR_c STR_UNDERSCORE STR_C STR_u STR_r STR_s STR_i STR_v STR_e "\0"
#define STRING_Meroitic_Hieroglyphs0 STR_M STR_e STR_r STR_o STR_i STR_t STR_i STR_c STR_UNDERSCORE STR_H STR_i STR_e STR_r STR_o STR_g STR_l STR_y STR_p STR_h STR_s "\0"
#define STRING_Miao0 STR_M STR_i STR_a STR_o "\0"
#define STRING_Mn0 STR_M STR_n "\0"
#define STRING_Modi0 STR_M STR_o STR_d STR_i "\0"
#define STRING_Mongolian0 STR_M STR_o STR_n STR_g STR_o STR_l STR_i STR_a STR_n "\0"
#define STRING_Mro0 STR_M STR_r STR_o "\0"
#define STRING_Myanmar0 STR_M STR_y STR_a STR_n STR_m STR_a STR_r "\0"
#define STRING_N0 STR_N "\0"
#define STRING_Nabataean0 STR_N STR_a STR_b STR_a STR_t STR_a STR_e STR_a STR_n "\0"
#define STRING_Nd0 STR_N STR_d "\0"
#define STRING_New_Tai_Lue0 STR_N STR_e STR_w STR_UNDERSCORE STR_T STR_a STR_i STR_UNDERSCORE STR_L STR_u STR_e "\0"
#define STRING_Nko0 STR_N STR_k STR_o "\0"
@ -294,12 +308,17 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Ogham0 STR_O STR_g STR_h STR_a STR_m "\0"
#define STRING_Ol_Chiki0 STR_O STR_l STR_UNDERSCORE STR_C STR_h STR_i STR_k STR_i "\0"
#define STRING_Old_Italic0 STR_O STR_l STR_d STR_UNDERSCORE STR_I STR_t STR_a STR_l STR_i STR_c "\0"
#define STRING_Old_North_Arabian0 STR_O STR_l STR_d STR_UNDERSCORE STR_N STR_o STR_r STR_t STR_h STR_UNDERSCORE STR_A STR_r STR_a STR_b STR_i STR_a STR_n "\0"
#define STRING_Old_Permic0 STR_O STR_l STR_d STR_UNDERSCORE STR_P STR_e STR_r STR_m STR_i STR_c "\0"
#define STRING_Old_Persian0 STR_O STR_l STR_d STR_UNDERSCORE STR_P STR_e STR_r STR_s STR_i STR_a STR_n "\0"
#define STRING_Old_South_Arabian0 STR_O STR_l STR_d STR_UNDERSCORE STR_S STR_o STR_u STR_t STR_h STR_UNDERSCORE STR_A STR_r STR_a STR_b STR_i STR_a STR_n "\0"
#define STRING_Old_Turkic0 STR_O STR_l STR_d STR_UNDERSCORE STR_T STR_u STR_r STR_k STR_i STR_c "\0"
#define STRING_Oriya0 STR_O STR_r STR_i STR_y STR_a "\0"
#define STRING_Osmanya0 STR_O STR_s STR_m STR_a STR_n STR_y STR_a "\0"
#define STRING_P0 STR_P "\0"
#define STRING_Pahawh_Hmong0 STR_P STR_a STR_h STR_a STR_w STR_h STR_UNDERSCORE STR_H STR_m STR_o STR_n STR_g "\0"
#define STRING_Palmyrene0 STR_P STR_a STR_l STR_m STR_y STR_r STR_e STR_n STR_e "\0"
#define STRING_Pau_Cin_Hau0 STR_P STR_a STR_u STR_UNDERSCORE STR_C STR_i STR_n STR_UNDERSCORE STR_H STR_a STR_u "\0"
#define STRING_Pc0 STR_P STR_c "\0"
#define STRING_Pd0 STR_P STR_d "\0"
#define STRING_Pe0 STR_P STR_e "\0"
@ -309,6 +328,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Pi0 STR_P STR_i "\0"
#define STRING_Po0 STR_P STR_o "\0"
#define STRING_Ps0 STR_P STR_s "\0"
#define STRING_Psalter_Pahlavi0 STR_P STR_s STR_a STR_l STR_t STR_e STR_r STR_UNDERSCORE STR_P STR_a STR_h STR_l STR_a STR_v STR_i "\0"
#define STRING_Rejang0 STR_R STR_e STR_j STR_a STR_n STR_g "\0"
#define STRING_Runic0 STR_R STR_u STR_n STR_i STR_c "\0"
#define STRING_S0 STR_S "\0"
@ -317,6 +337,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Sc0 STR_S STR_c "\0"
#define STRING_Sharada0 STR_S STR_h STR_a STR_r STR_a STR_d STR_a "\0"
#define STRING_Shavian0 STR_S STR_h STR_a STR_v STR_i STR_a STR_n "\0"
#define STRING_Siddham0 STR_S STR_i STR_d STR_d STR_h STR_a STR_m "\0"
#define STRING_Sinhala0 STR_S STR_i STR_n STR_h STR_a STR_l STR_a "\0"
#define STRING_Sk0 STR_S STR_k "\0"
#define STRING_Sm0 STR_S STR_m "\0"
@ -337,8 +358,10 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Thai0 STR_T STR_h STR_a STR_i "\0"
#define STRING_Tibetan0 STR_T STR_i STR_b STR_e STR_t STR_a STR_n "\0"
#define STRING_Tifinagh0 STR_T STR_i STR_f STR_i STR_n STR_a STR_g STR_h "\0"
#define STRING_Tirhuta0 STR_T STR_i STR_r STR_h STR_u STR_t STR_a "\0"
#define STRING_Ugaritic0 STR_U STR_g STR_a STR_r STR_i STR_t STR_i STR_c "\0"
#define STRING_Vai0 STR_V STR_a STR_i "\0"
#define STRING_Warang_Citi0 STR_W STR_a STR_r STR_a STR_n STR_g STR_UNDERSCORE STR_C STR_i STR_t STR_i "\0"
#define STRING_Xan0 STR_X STR_a STR_n "\0"
#define STRING_Xps0 STR_X STR_p STR_s "\0"
#define STRING_Xsp0 STR_X STR_s STR_p "\0"
@ -357,6 +380,7 @@ const char PRIV(utt_names)[] =
STRING_Avestan0
STRING_Balinese0
STRING_Bamum0
STRING_Bassa_Vah0
STRING_Batak0
STRING_Bengali0
STRING_Bopomofo0
@ -367,6 +391,7 @@ const char PRIV(utt_names)[] =
STRING_C0
STRING_Canadian_Aboriginal0
STRING_Carian0
STRING_Caucasian_Albanian0
STRING_Cc0
STRING_Cf0
STRING_Chakma0
@ -382,11 +407,14 @@ const char PRIV(utt_names)[] =
STRING_Cyrillic0
STRING_Deseret0
STRING_Devanagari0
STRING_Duployan0
STRING_Egyptian_Hieroglyphs0
STRING_Elbasan0
STRING_Ethiopic0
STRING_Georgian0
STRING_Glagolitic0
STRING_Gothic0
STRING_Grantha0
STRING_Greek0
STRING_Gujarati0
STRING_Gurmukhi0
@ -406,12 +434,15 @@ const char PRIV(utt_names)[] =
STRING_Kayah_Li0
STRING_Kharoshthi0
STRING_Khmer0
STRING_Khojki0
STRING_Khudawadi0
STRING_L0
STRING_L_AMPERSAND0
STRING_Lao0
STRING_Latin0
STRING_Lepcha0
STRING_Limbu0
STRING_Linear_A0
STRING_Linear_B0
STRING_Lisu0
STRING_Ll0
@ -422,18 +453,24 @@ const char PRIV(utt_names)[] =
STRING_Lycian0
STRING_Lydian0
STRING_M0
STRING_Mahajani0
STRING_Malayalam0
STRING_Mandaic0
STRING_Manichaean0
STRING_Mc0
STRING_Me0
STRING_Meetei_Mayek0
STRING_Mende_Kikakui0
STRING_Meroitic_Cursive0
STRING_Meroitic_Hieroglyphs0
STRING_Miao0
STRING_Mn0
STRING_Modi0
STRING_Mongolian0
STRING_Mro0
STRING_Myanmar0
STRING_N0
STRING_Nabataean0
STRING_Nd0
STRING_New_Tai_Lue0
STRING_Nko0
@ -442,12 +479,17 @@ const char PRIV(utt_names)[] =
STRING_Ogham0
STRING_Ol_Chiki0
STRING_Old_Italic0
STRING_Old_North_Arabian0
STRING_Old_Permic0
STRING_Old_Persian0
STRING_Old_South_Arabian0
STRING_Old_Turkic0
STRING_Oriya0
STRING_Osmanya0
STRING_P0
STRING_Pahawh_Hmong0
STRING_Palmyrene0
STRING_Pau_Cin_Hau0
STRING_Pc0
STRING_Pd0
STRING_Pe0
@ -457,6 +499,7 @@ const char PRIV(utt_names)[] =
STRING_Pi0
STRING_Po0
STRING_Ps0
STRING_Psalter_Pahlavi0
STRING_Rejang0
STRING_Runic0
STRING_S0
@ -465,6 +508,7 @@ const char PRIV(utt_names)[] =
STRING_Sc0
STRING_Sharada0
STRING_Shavian0
STRING_Siddham0
STRING_Sinhala0
STRING_Sk0
STRING_Sm0
@ -485,8 +529,10 @@ const char PRIV(utt_names)[] =
STRING_Thai0
STRING_Tibetan0
STRING_Tifinagh0
STRING_Tirhuta0
STRING_Ugaritic0
STRING_Vai0
STRING_Warang_Citi0
STRING_Xan0
STRING_Xps0
STRING_Xsp0
@ -505,146 +551,169 @@ const ucp_type_table PRIV(utt)[] = {
{ 20, PT_SC, ucp_Avestan },
{ 28, PT_SC, ucp_Balinese },
{ 37, PT_SC, ucp_Bamum },
{ 43, PT_SC, ucp_Batak },
{ 49, PT_SC, ucp_Bengali },
{ 57, PT_SC, ucp_Bopomofo },
{ 66, PT_SC, ucp_Brahmi },
{ 73, PT_SC, ucp_Braille },
{ 81, PT_SC, ucp_Buginese },
{ 90, PT_SC, ucp_Buhid },
{ 96, PT_GC, ucp_C },
{ 98, PT_SC, ucp_Canadian_Aboriginal },
{ 118, PT_SC, ucp_Carian },
{ 125, PT_PC, ucp_Cc },
{ 128, PT_PC, ucp_Cf },
{ 131, PT_SC, ucp_Chakma },
{ 138, PT_SC, ucp_Cham },
{ 143, PT_SC, ucp_Cherokee },
{ 152, PT_PC, ucp_Cn },
{ 155, PT_PC, ucp_Co },
{ 158, PT_SC, ucp_Common },
{ 165, PT_SC, ucp_Coptic },
{ 172, PT_PC, ucp_Cs },
{ 175, PT_SC, ucp_Cuneiform },
{ 185, PT_SC, ucp_Cypriot },
{ 193, PT_SC, ucp_Cyrillic },
{ 202, PT_SC, ucp_Deseret },
{ 210, PT_SC, ucp_Devanagari },
{ 221, PT_SC, ucp_Egyptian_Hieroglyphs },
{ 242, PT_SC, ucp_Ethiopic },
{ 251, PT_SC, ucp_Georgian },
{ 260, PT_SC, ucp_Glagolitic },
{ 271, PT_SC, ucp_Gothic },
{ 278, PT_SC, ucp_Greek },
{ 284, PT_SC, ucp_Gujarati },
{ 293, PT_SC, ucp_Gurmukhi },
{ 302, PT_SC, ucp_Han },
{ 306, PT_SC, ucp_Hangul },
{ 313, PT_SC, ucp_Hanunoo },
{ 321, PT_SC, ucp_Hebrew },
{ 328, PT_SC, ucp_Hiragana },
{ 337, PT_SC, ucp_Imperial_Aramaic },
{ 354, PT_SC, ucp_Inherited },
{ 364, PT_SC, ucp_Inscriptional_Pahlavi },
{ 386, PT_SC, ucp_Inscriptional_Parthian },
{ 409, PT_SC, ucp_Javanese },
{ 418, PT_SC, ucp_Kaithi },
{ 425, PT_SC, ucp_Kannada },
{ 433, PT_SC, ucp_Katakana },
{ 442, PT_SC, ucp_Kayah_Li },
{ 451, PT_SC, ucp_Kharoshthi },
{ 462, PT_SC, ucp_Khmer },
{ 468, PT_GC, ucp_L },
{ 470, PT_LAMP, 0 },
{ 473, PT_SC, ucp_Lao },
{ 477, PT_SC, ucp_Latin },
{ 483, PT_SC, ucp_Lepcha },
{ 490, PT_SC, ucp_Limbu },
{ 496, PT_SC, ucp_Linear_B },
{ 505, PT_SC, ucp_Lisu },
{ 510, PT_PC, ucp_Ll },
{ 513, PT_PC, ucp_Lm },
{ 516, PT_PC, ucp_Lo },
{ 519, PT_PC, ucp_Lt },
{ 522, PT_PC, ucp_Lu },
{ 525, PT_SC, ucp_Lycian },
{ 532, PT_SC, ucp_Lydian },
{ 539, PT_GC, ucp_M },
{ 541, PT_SC, ucp_Malayalam },
{ 551, PT_SC, ucp_Mandaic },
{ 559, PT_PC, ucp_Mc },
{ 562, PT_PC, ucp_Me },
{ 565, PT_SC, ucp_Meetei_Mayek },
{ 578, PT_SC, ucp_Meroitic_Cursive },
{ 595, PT_SC, ucp_Meroitic_Hieroglyphs },
{ 616, PT_SC, ucp_Miao },
{ 621, PT_PC, ucp_Mn },
{ 624, PT_SC, ucp_Mongolian },
{ 634, PT_SC, ucp_Myanmar },
{ 642, PT_GC, ucp_N },
{ 644, PT_PC, ucp_Nd },
{ 647, PT_SC, ucp_New_Tai_Lue },
{ 659, PT_SC, ucp_Nko },
{ 663, PT_PC, ucp_Nl },
{ 666, PT_PC, ucp_No },
{ 669, PT_SC, ucp_Ogham },
{ 675, PT_SC, ucp_Ol_Chiki },
{ 684, PT_SC, ucp_Old_Italic },
{ 695, PT_SC, ucp_Old_Persian },
{ 707, PT_SC, ucp_Old_South_Arabian },
{ 725, PT_SC, ucp_Old_Turkic },
{ 736, PT_SC, ucp_Oriya },
{ 742, PT_SC, ucp_Osmanya },
{ 750, PT_GC, ucp_P },
{ 752, PT_PC, ucp_Pc },
{ 755, PT_PC, ucp_Pd },
{ 758, PT_PC, ucp_Pe },
{ 761, PT_PC, ucp_Pf },
{ 764, PT_SC, ucp_Phags_Pa },
{ 773, PT_SC, ucp_Phoenician },
{ 784, PT_PC, ucp_Pi },
{ 787, PT_PC, ucp_Po },
{ 790, PT_PC, ucp_Ps },
{ 793, PT_SC, ucp_Rejang },
{ 800, PT_SC, ucp_Runic },
{ 806, PT_GC, ucp_S },
{ 808, PT_SC, ucp_Samaritan },
{ 818, PT_SC, ucp_Saurashtra },
{ 829, PT_PC, ucp_Sc },
{ 832, PT_SC, ucp_Sharada },
{ 840, PT_SC, ucp_Shavian },
{ 848, PT_SC, ucp_Sinhala },
{ 856, PT_PC, ucp_Sk },
{ 859, PT_PC, ucp_Sm },
{ 862, PT_PC, ucp_So },
{ 865, PT_SC, ucp_Sora_Sompeng },
{ 878, PT_SC, ucp_Sundanese },
{ 888, PT_SC, ucp_Syloti_Nagri },
{ 901, PT_SC, ucp_Syriac },
{ 908, PT_SC, ucp_Tagalog },
{ 916, PT_SC, ucp_Tagbanwa },
{ 925, PT_SC, ucp_Tai_Le },
{ 932, PT_SC, ucp_Tai_Tham },
{ 941, PT_SC, ucp_Tai_Viet },
{ 950, PT_SC, ucp_Takri },
{ 956, PT_SC, ucp_Tamil },
{ 962, PT_SC, ucp_Telugu },
{ 969, PT_SC, ucp_Thaana },
{ 976, PT_SC, ucp_Thai },
{ 981, PT_SC, ucp_Tibetan },
{ 989, PT_SC, ucp_Tifinagh },
{ 998, PT_SC, ucp_Ugaritic },
{ 1007, PT_SC, ucp_Vai },
{ 1011, PT_ALNUM, 0 },
{ 1015, PT_PXSPACE, 0 },
{ 1019, PT_SPACE, 0 },
{ 1023, PT_UCNC, 0 },
{ 1027, PT_WORD, 0 },
{ 1031, PT_SC, ucp_Yi },
{ 1034, PT_GC, ucp_Z },
{ 1036, PT_PC, ucp_Zl },
{ 1039, PT_PC, ucp_Zp },
{ 1042, PT_PC, ucp_Zs }
{ 43, PT_SC, ucp_Bassa_Vah },
{ 53, PT_SC, ucp_Batak },
{ 59, PT_SC, ucp_Bengali },
{ 67, PT_SC, ucp_Bopomofo },
{ 76, PT_SC, ucp_Brahmi },
{ 83, PT_SC, ucp_Braille },
{ 91, PT_SC, ucp_Buginese },
{ 100, PT_SC, ucp_Buhid },
{ 106, PT_GC, ucp_C },
{ 108, PT_SC, ucp_Canadian_Aboriginal },
{ 128, PT_SC, ucp_Carian },
{ 135, PT_SC, ucp_Caucasian_Albanian },
{ 154, PT_PC, ucp_Cc },
{ 157, PT_PC, ucp_Cf },
{ 160, PT_SC, ucp_Chakma },
{ 167, PT_SC, ucp_Cham },
{ 172, PT_SC, ucp_Cherokee },
{ 181, PT_PC, ucp_Cn },
{ 184, PT_PC, ucp_Co },
{ 187, PT_SC, ucp_Common },
{ 194, PT_SC, ucp_Coptic },
{ 201, PT_PC, ucp_Cs },
{ 204, PT_SC, ucp_Cuneiform },
{ 214, PT_SC, ucp_Cypriot },
{ 222, PT_SC, ucp_Cyrillic },
{ 231, PT_SC, ucp_Deseret },
{ 239, PT_SC, ucp_Devanagari },
{ 250, PT_SC, ucp_Duployan },
{ 259, PT_SC, ucp_Egyptian_Hieroglyphs },
{ 280, PT_SC, ucp_Elbasan },
{ 288, PT_SC, ucp_Ethiopic },
{ 297, PT_SC, ucp_Georgian },
{ 306, PT_SC, ucp_Glagolitic },
{ 317, PT_SC, ucp_Gothic },
{ 324, PT_SC, ucp_Grantha },
{ 332, PT_SC, ucp_Greek },
{ 338, PT_SC, ucp_Gujarati },
{ 347, PT_SC, ucp_Gurmukhi },
{ 356, PT_SC, ucp_Han },
{ 360, PT_SC, ucp_Hangul },
{ 367, PT_SC, ucp_Hanunoo },
{ 375, PT_SC, ucp_Hebrew },
{ 382, PT_SC, ucp_Hiragana },
{ 391, PT_SC, ucp_Imperial_Aramaic },
{ 408, PT_SC, ucp_Inherited },
{ 418, PT_SC, ucp_Inscriptional_Pahlavi },
{ 440, PT_SC, ucp_Inscriptional_Parthian },
{ 463, PT_SC, ucp_Javanese },
{ 472, PT_SC, ucp_Kaithi },
{ 479, PT_SC, ucp_Kannada },
{ 487, PT_SC, ucp_Katakana },
{ 496, PT_SC, ucp_Kayah_Li },
{ 505, PT_SC, ucp_Kharoshthi },
{ 516, PT_SC, ucp_Khmer },
{ 522, PT_SC, ucp_Khojki },
{ 529, PT_SC, ucp_Khudawadi },
{ 539, PT_GC, ucp_L },
{ 541, PT_LAMP, 0 },
{ 544, PT_SC, ucp_Lao },
{ 548, PT_SC, ucp_Latin },
{ 554, PT_SC, ucp_Lepcha },
{ 561, PT_SC, ucp_Limbu },
{ 567, PT_SC, ucp_Linear_A },
{ 576, PT_SC, ucp_Linear_B },
{ 585, PT_SC, ucp_Lisu },
{ 590, PT_PC, ucp_Ll },
{ 593, PT_PC, ucp_Lm },
{ 596, PT_PC, ucp_Lo },
{ 599, PT_PC, ucp_Lt },
{ 602, PT_PC, ucp_Lu },
{ 605, PT_SC, ucp_Lycian },
{ 612, PT_SC, ucp_Lydian },
{ 619, PT_GC, ucp_M },
{ 621, PT_SC, ucp_Mahajani },
{ 630, PT_SC, ucp_Malayalam },
{ 640, PT_SC, ucp_Mandaic },
{ 648, PT_SC, ucp_Manichaean },
{ 659, PT_PC, ucp_Mc },
{ 662, PT_PC, ucp_Me },
{ 665, PT_SC, ucp_Meetei_Mayek },
{ 678, PT_SC, ucp_Mende_Kikakui },
{ 692, PT_SC, ucp_Meroitic_Cursive },
{ 709, PT_SC, ucp_Meroitic_Hieroglyphs },
{ 730, PT_SC, ucp_Miao },
{ 735, PT_PC, ucp_Mn },
{ 738, PT_SC, ucp_Modi },
{ 743, PT_SC, ucp_Mongolian },
{ 753, PT_SC, ucp_Mro },
{ 757, PT_SC, ucp_Myanmar },
{ 765, PT_GC, ucp_N },
{ 767, PT_SC, ucp_Nabataean },
{ 777, PT_PC, ucp_Nd },
{ 780, PT_SC, ucp_New_Tai_Lue },
{ 792, PT_SC, ucp_Nko },
{ 796, PT_PC, ucp_Nl },
{ 799, PT_PC, ucp_No },
{ 802, PT_SC, ucp_Ogham },
{ 808, PT_SC, ucp_Ol_Chiki },
{ 817, PT_SC, ucp_Old_Italic },
{ 828, PT_SC, ucp_Old_North_Arabian },
{ 846, PT_SC, ucp_Old_Permic },
{ 857, PT_SC, ucp_Old_Persian },
{ 869, PT_SC, ucp_Old_South_Arabian },
{ 887, PT_SC, ucp_Old_Turkic },
{ 898, PT_SC, ucp_Oriya },
{ 904, PT_SC, ucp_Osmanya },
{ 912, PT_GC, ucp_P },
{ 914, PT_SC, ucp_Pahawh_Hmong },
{ 927, PT_SC, ucp_Palmyrene },
{ 937, PT_SC, ucp_Pau_Cin_Hau },
{ 949, PT_PC, ucp_Pc },
{ 952, PT_PC, ucp_Pd },
{ 955, PT_PC, ucp_Pe },
{ 958, PT_PC, ucp_Pf },
{ 961, PT_SC, ucp_Phags_Pa },
{ 970, PT_SC, ucp_Phoenician },
{ 981, PT_PC, ucp_Pi },
{ 984, PT_PC, ucp_Po },
{ 987, PT_PC, ucp_Ps },
{ 990, PT_SC, ucp_Psalter_Pahlavi },
{ 1006, PT_SC, ucp_Rejang },
{ 1013, PT_SC, ucp_Runic },
{ 1019, PT_GC, ucp_S },
{ 1021, PT_SC, ucp_Samaritan },
{ 1031, PT_SC, ucp_Saurashtra },
{ 1042, PT_PC, ucp_Sc },
{ 1045, PT_SC, ucp_Sharada },
{ 1053, PT_SC, ucp_Shavian },
{ 1061, PT_SC, ucp_Siddham },
{ 1069, PT_SC, ucp_Sinhala },
{ 1077, PT_PC, ucp_Sk },
{ 1080, PT_PC, ucp_Sm },
{ 1083, PT_PC, ucp_So },
{ 1086, PT_SC, ucp_Sora_Sompeng },
{ 1099, PT_SC, ucp_Sundanese },
{ 1109, PT_SC, ucp_Syloti_Nagri },
{ 1122, PT_SC, ucp_Syriac },
{ 1129, PT_SC, ucp_Tagalog },
{ 1137, PT_SC, ucp_Tagbanwa },
{ 1146, PT_SC, ucp_Tai_Le },
{ 1153, PT_SC, ucp_Tai_Tham },
{ 1162, PT_SC, ucp_Tai_Viet },
{ 1171, PT_SC, ucp_Takri },
{ 1177, PT_SC, ucp_Tamil },
{ 1183, PT_SC, ucp_Telugu },
{ 1190, PT_SC, ucp_Thaana },
{ 1197, PT_SC, ucp_Thai },
{ 1202, PT_SC, ucp_Tibetan },
{ 1210, PT_SC, ucp_Tifinagh },
{ 1219, PT_SC, ucp_Tirhuta },
{ 1227, PT_SC, ucp_Ugaritic },
{ 1236, PT_SC, ucp_Vai },
{ 1240, PT_SC, ucp_Warang_Citi },
{ 1252, PT_ALNUM, 0 },
{ 1256, PT_PXSPACE, 0 },
{ 1260, PT_SPACE, 0 },
{ 1264, PT_UCNC, 0 },
{ 1268, PT_WORD, 0 },
{ 1272, PT_SC, ucp_Yi },
{ 1275, PT_GC, ucp_Z },
{ 1277, PT_PC, ucp_Zl },
{ 1280, PT_PC, ucp_Zp },
{ 1283, PT_PC, ucp_Zs }
};
const int PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);

File diff suppressed because it is too large Load Diff

View File

@ -242,7 +242,7 @@ while ((t = *data++) != XCL_END)
case PT_PXPUNCT:
if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P ||
(c < 256 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop)
(c < 128 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop)
return !negated;
break;

View File

@ -192,7 +192,31 @@ enum {
ucp_Miao,
ucp_Sharada,
ucp_Sora_Sompeng,
ucp_Takri
ucp_Takri,
/* New for Unicode 7.0.0: */
ucp_Bassa_Vah,
ucp_Caucasian_Albanian,
ucp_Duployan,
ucp_Elbasan,
ucp_Grantha,
ucp_Khojki,
ucp_Khudawadi,
ucp_Linear_A,
ucp_Mahajani,
ucp_Manichaean,
ucp_Mende_Kikakui,
ucp_Modi,
ucp_Mro,
ucp_Nabataean,
ucp_Old_North_Arabian,
ucp_Old_Permic,
ucp_Pahawh_Hmong,
ucp_Palmyrene,
ucp_Psalter_Pahlavi,
ucp_Pau_Cin_Hau,
ucp_Siddham,
ucp_Tirhuta,
ucp_Warang_Citi
};
#endif