PCRE 8.35.0 Update

- Updated to PCRE 8.35 (latest, 2014-04-04).
- Updated the build files: added the new files and removed those that have changed.
- Configured like the previous configuration.
2025-10-27 02:53:10 +01:00 · 2014-10-03 23:17:24 +10:00
parent 30e26b0d96
commit 010f7a5370
54 changed files with 33954 additions and 7932 deletions
--- a/Foundation/src/pcre_xclass.c
+++ b/Foundation/src/pcre_xclass.c
@@ -6,7 +6,7 @@
 and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
-           Copyright (c) 1997-2008 University of Cambridge
+           Copyright (c) 1997-2013 University of Cambridge

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -39,11 +39,9 @@ POSSIBILITY OF SUCH DAMAGE.


 /* This module contains an internal function that is used to match an extended
-class (one that contains characters whose values are > 255). It is used by both
-pcre_exec() and pcre_def_exec(). */
+class. It is used by both pcre_exec() and pcre_dfa_exec(). */

-
-#include "pcre_config.h"
+#include "config.h"
 #include "pcre_internal.h"


@@ -52,7 +50,7 @@ pcre_exec() and pcre_def_exec(). */
 *************************************************/

 /* This function is called to match a character against an extended class that
-might contain values > 255.
+might contain values > 255 and/or Unicode properties.

 Arguments:
  c           the character
@@ -62,68 +60,190 @@ Returns:      TRUE if character matches, else FALSE
 */

 BOOL
-_pcre_xclass(int c, const uschar *data)
+PRIV(xclass)(pcre_uint32 c, const pcre_uchar *data, BOOL utf)
 {
-int t;
+pcre_uchar t;
 BOOL negated = (*data & XCL_NOT) != 0;

+(void)utf;
+#ifdef COMPILE_PCRE8
+/* In 8 bit mode, this must always be TRUE. Help the compiler to know that. */
+utf = TRUE;
+#endif
+
 /* Character values < 256 are matched against a bitmap, if one is present. If
 not, we still carry on, because there may be ranges that start below 256 in the
 additional data. */

 if (c < 256)
  {
-  if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
-    return !negated;   /* char found */
+  if ((*data & XCL_HASPROP) == 0)
+    {
+    if ((*data & XCL_MAP) == 0) return negated;
+    return (((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0;
+    }
+  if ((*data & XCL_MAP) != 0 &&
+    (((pcre_uint8 *)(data + 1))[c/8] & (1 << (c&7))) != 0)
+    return !negated; /* char found */
  }

 /* First skip the bit map if present. Then match against the list of Unicode
 properties or large chars or ranges that end with a large char. We won't ever
 encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */

-if ((*data++ & XCL_MAP) != 0) data += 32;
+if ((*data++ & XCL_MAP) != 0) data += 32 / sizeof(pcre_uchar);

 while ((t = *data++) != XCL_END)
  {
-  int x, y;
+  pcre_uint32 x, y;
  if (t == XCL_SINGLE)
    {
-    GETCHARINC(x, data);
+#ifdef SUPPORT_UTF
+    if (utf)
+      {
+      GETCHARINC(x, data); /* macro generates multiple statements */
+      }
+    else
+#endif
+      x = *data++;
    if (c == x) return !negated;
    }
  else if (t == XCL_RANGE)
    {
-    GETCHARINC(x, data);
-    GETCHARINC(y, data);
+#ifdef SUPPORT_UTF
+    if (utf)
+      {
+      GETCHARINC(x, data); /* macro generates multiple statements */
+      GETCHARINC(y, data); /* macro generates multiple statements */
+      }
+    else
+#endif
+      {
+      x = *data++;
+      y = *data++;
+      }
    if (c >= x && c <= y) return !negated;
    }

 #ifdef SUPPORT_UCP
  else  /* XCL_PROP & XCL_NOTPROP */
    {
-    const ucd_record * prop = GET_UCD(c);
+    const ucd_record *prop = GET_UCD(c);
+    BOOL isprop = t == XCL_PROP;

    switch(*data)
      {
      case PT_ANY:
-      if (t == XCL_PROP) return !negated;
+      if (isprop) return !negated;
      break;

      case PT_LAMP:
-      if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt) ==
-          (t == XCL_PROP)) return !negated;
+      if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
+           prop->chartype == ucp_Lt) == isprop) return !negated;
      break;

      case PT_GC:
-      if ((data[1] == _pcre_ucp_gentype[prop->chartype]) == (t == XCL_PROP)) return !negated;
+      if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == isprop)
+        return !negated;
      break;

      case PT_PC:
-      if ((data[1] == prop->chartype) == (t == XCL_PROP)) return !negated;
+      if ((data[1] == prop->chartype) == isprop) return !negated;
      break;

      case PT_SC:
-      if ((data[1] == prop->script) == (t == XCL_PROP)) return !negated;
+      if ((data[1] == prop->script) == isprop) return !negated;
+      break;
+
+      case PT_ALNUM:
+      if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
+           PRIV(ucp_gentype)[prop->chartype] == ucp_N) == isprop)
+        return !negated;
+      break;
+
+      /* Perl space used to exclude VT, but from Perl 5.18 it is included,
+      which means that Perl space and POSIX space are now identical. PCRE
+      was changed at release 8.34. */
+
+      case PT_SPACE:    /* Perl space */
+      case PT_PXSPACE:  /* POSIX space */
+      switch(c)
+        {
+        HSPACE_CASES:
+        VSPACE_CASES:
+        if (isprop) return !negated;
+        break;
+
+        default:
+        if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop)
+          return !negated;
+        break;
+        }
+      break;
+
+      case PT_WORD:
+      if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
+           PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
+             == isprop)
+        return !negated;
+      break;
+
+      case PT_UCNC:
+      if (c < 0xa0)
+        {
+        if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
+             c == CHAR_GRAVE_ACCENT) == isprop)
+          return !negated;
+        }
+      else
+        {
+        if ((c < 0xd800 || c > 0xdfff) == isprop)
+          return !negated;
+        }
+      break;
+
+      /* The following three properties can occur only in an XCLASS, as there
+      is no \p or \P coding for them. */
+
+      /* Graphic character. Implement this as not Z (space or separator) and
+      not C (other), except for Cf (format) with a few exceptions. This seems
+      to be what Perl does. The exceptional characters are:
+
+      U+061C           Arabic Letter Mark
+      U+180E           Mongolian Vowel Separator
+      U+2066 - U+2069  Various "isolate"s
+      */
+
+      case PT_PXGRAPH:
+      if ((PRIV(ucp_gentype)[prop->chartype] != ucp_Z &&
+            (PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
+              (prop->chartype == ucp_Cf &&
+                c != 0x061c && c != 0x180e && (c < 0x2066 || c > 0x2069))
+         )) == isprop)
+        return !negated;
+      break;
+
+      /* Printable character: same as graphic, with the addition of Zs, i.e.
+      not Zl and not Zp, and U+180E. */
+
+      case PT_PXPRINT:
+      if ((prop->chartype != ucp_Zl &&
+           prop->chartype != ucp_Zp &&
+            (PRIV(ucp_gentype)[prop->chartype] != ucp_C ||
+              (prop->chartype == ucp_Cf &&
+                c != 0x061c && (c < 0x2066 || c > 0x2069))
+         )) == isprop)
+        return !negated;
+      break;
+
+      /* Punctuation: all Unicode punctuation, plus ASCII characters that
+      Unicode treats as symbols rather than punctuation, for Perl
+      compatibility (these are $+<=>^`|~). */
+
+      case PT_PXPUNCT:
+      if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P ||
+            (c < 256 && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop)
+        return !negated;
      break;

      /* This should never occur, but compilers may mutter if there is no