mirror of
https://github.com/pocoproject/poco.git
synced 2025-10-16 18:56:52 +02:00
upgraded to PCRE 7.8
This commit is contained in:
@@ -1712,7 +1712,7 @@
|
||||
RelativePath=".\src\pcre_try_flipped.c">
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\src\pcre_ucp_searchfuncs.c">
|
||||
RelativePath=".\src\pcre_ucd.c">
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\src\pcre_valid_utf8.c">
|
||||
@@ -1750,12 +1750,6 @@
|
||||
<File
|
||||
RelativePath=".\src\ucp.h">
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\src\ucpinternal.h">
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\src\ucptable.h">
|
||||
</File>
|
||||
</Filter>
|
||||
</Filter>
|
||||
<Filter
|
||||
|
@@ -2193,14 +2193,6 @@
|
||||
RelativePath=".\src\ucp.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\src\ucpinternal.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\src\ucptable.h"
|
||||
>
|
||||
</File>
|
||||
</Filter>
|
||||
<Filter
|
||||
Name="Source Files"
|
||||
@@ -2266,7 +2258,7 @@
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\src\pcre_ucp_searchfuncs.c"
|
||||
RelativePath=".\src\pcre_ucd.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
|
@@ -2184,14 +2184,6 @@
|
||||
RelativePath=".\src\ucp.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\src\ucpinternal.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\src\ucptable.h"
|
||||
>
|
||||
</File>
|
||||
</Filter>
|
||||
<Filter
|
||||
Name="Source Files"
|
||||
@@ -2257,7 +2249,7 @@
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\src\pcre_ucp_searchfuncs.c"
|
||||
RelativePath=".\src\pcre_ucd.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
|
@@ -1,7 +1,7 @@
|
||||
//
|
||||
// Unicode.h
|
||||
//
|
||||
// $Id: //poco/svn/Foundation/include/Poco/Unicode.h#2 $
|
||||
// $Id: //poco/1.3/Foundation/include/Poco/Unicode.h#1 $
|
||||
//
|
||||
// Library: Foundation
|
||||
// Package: Text
|
||||
@@ -174,7 +174,18 @@ public:
|
||||
UCP_CUNEIFORM,
|
||||
UCP_NKO,
|
||||
UCP_PHAGS_PA,
|
||||
UCP_PHOENICIAN
|
||||
UCP_PHOENICIAN,
|
||||
UCP_CARIAN,
|
||||
UCP_CHAM,
|
||||
UCP_KAYAH_LI,
|
||||
UCP_LEPCHA,
|
||||
UCP_LYCIAN,
|
||||
UCP_LYDIAN,
|
||||
UCP_OL_CHIKI,
|
||||
UCP_REJANG,
|
||||
UCP_SAURASHTRA,
|
||||
UCP_SUNDANESE,
|
||||
UCP_VAI
|
||||
};
|
||||
|
||||
struct CharacterProperties
|
||||
|
@@ -1,7 +1,7 @@
|
||||
//
|
||||
// Unicode.cpp
|
||||
//
|
||||
// $Id: //poco/svn/Foundation/src/Unicode.cpp#2 $
|
||||
// $Id: //poco/1.3/Foundation/src/Unicode.cpp#2 $
|
||||
//
|
||||
// Library: Foundation
|
||||
// Package: Text
|
||||
@@ -39,6 +39,7 @@
|
||||
|
||||
extern "C"
|
||||
{
|
||||
#include "pcre_config.h"
|
||||
#include "pcre_internal.h"
|
||||
}
|
||||
|
||||
@@ -48,12 +49,10 @@ namespace Poco {
|
||||
|
||||
void Unicode::properties(int ch, CharacterProperties& props)
|
||||
{
|
||||
int type;
|
||||
int script;
|
||||
int category = _pcre_ucp_findprop(static_cast<unsigned>(ch), &type, &script);
|
||||
props.category = static_cast<CharacterCategory>(category);
|
||||
props.type = static_cast<CharacterType>(type);
|
||||
props.script = static_cast<Script>(script);
|
||||
const ucd_record* ucd = GET_UCD(ch);
|
||||
props.category = static_cast<CharacterCategory>(_pcre_ucp_gentype[ucd->chartype]);
|
||||
props.type = static_cast<CharacterType>(ucd->chartype);
|
||||
props.script = static_cast<Script>(ucd->script);
|
||||
}
|
||||
|
||||
|
||||
@@ -76,7 +75,7 @@ bool Unicode::isUpper(int ch)
|
||||
int Unicode::toLower(int ch)
|
||||
{
|
||||
if (isUpper(ch))
|
||||
return static_cast<int>(_pcre_ucp_othercase(static_cast<unsigned>(ch)));
|
||||
return static_cast<int>(UCD_OTHERCASE(static_cast<unsigned>(ch)));
|
||||
else
|
||||
return ch;
|
||||
}
|
||||
@@ -85,7 +84,7 @@ int Unicode::toLower(int ch)
|
||||
int Unicode::toUpper(int ch)
|
||||
{
|
||||
if (isLower(ch))
|
||||
return static_cast<int>(_pcre_ucp_othercase(static_cast<unsigned>(ch)));
|
||||
return static_cast<int>(UCD_OTHERCASE(static_cast<unsigned>(ch)));
|
||||
else
|
||||
return ch;
|
||||
}
|
||||
|
@@ -5,7 +5,7 @@
|
||||
/* This is the public header file for the PCRE library, to be #included by
|
||||
applications that call the PCRE functions.
|
||||
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -42,19 +42,25 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
/* The current PCRE version information. */
|
||||
|
||||
#define PCRE_MAJOR 7
|
||||
#define PCRE_MINOR 1
|
||||
#define PCRE_MINOR 8
|
||||
#define PCRE_PRERELEASE
|
||||
#define PCRE_DATE 2007-04-24
|
||||
#define PCRE_DATE 2008-09-05
|
||||
|
||||
/* When an application links to a PCRE DLL in Windows, the symbols that are
|
||||
imported have to be identified as such. When building PCRE, the appropriate
|
||||
export setting is defined in pcre_internal.h, which includes this file. So we
|
||||
don't change an existing definition of PCRE_EXP_DECL. */
|
||||
don't change existing definitions of PCRE_EXP_DECL and PCRECPP_EXP_DECL. */
|
||||
|
||||
#ifndef PCRE_EXP_DECL
|
||||
# ifdef _WIN32
|
||||
# ifndef PCRE_STATIC
|
||||
# define PCRE_EXP_DECL extern __declspec(dllimport)
|
||||
#if defined(_WIN32) && !defined(PCRE_STATIC)
|
||||
# ifndef PCRE_EXP_DECL
|
||||
# define PCRE_EXP_DECL extern __declspec(dllimport)
|
||||
# endif
|
||||
# ifdef __cplusplus
|
||||
# ifndef PCRECPP_EXP_DECL
|
||||
# define PCRECPP_EXP_DECL extern __declspec(dllimport)
|
||||
# endif
|
||||
# ifndef PCRECPP_EXP_DEFN
|
||||
# define PCRECPP_EXP_DEFN __declspec(dllimport)
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
@@ -63,9 +69,18 @@ don't change an existing definition of PCRE_EXP_DECL. */
|
||||
|
||||
#ifndef PCRE_EXP_DECL
|
||||
# ifdef __cplusplus
|
||||
# define PCRE_EXP_DECL extern "C"
|
||||
# define PCRE_EXP_DECL extern "C"
|
||||
# else
|
||||
# define PCRE_EXP_DECL extern
|
||||
# define PCRE_EXP_DECL extern
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
# ifndef PCRECPP_EXP_DECL
|
||||
# define PCRECPP_EXP_DECL extern
|
||||
# endif
|
||||
# ifndef PCRECPP_EXP_DEFN
|
||||
# define PCRECPP_EXP_DEFN
|
||||
# endif
|
||||
#endif
|
||||
|
||||
@@ -107,6 +122,9 @@ extern "C" {
|
||||
#define PCRE_NEWLINE_CRLF 0x00300000
|
||||
#define PCRE_NEWLINE_ANY 0x00400000
|
||||
#define PCRE_NEWLINE_ANYCRLF 0x00500000
|
||||
#define PCRE_BSR_ANYCRLF 0x00800000
|
||||
#define PCRE_BSR_UNICODE 0x01000000
|
||||
#define PCRE_JAVASCRIPT_COMPAT 0x02000000
|
||||
|
||||
/* Exec-time and get/set-time error codes */
|
||||
|
||||
@@ -132,7 +150,7 @@ extern "C" {
|
||||
#define PCRE_ERROR_DFA_WSSIZE (-19)
|
||||
#define PCRE_ERROR_DFA_RECURSE (-20)
|
||||
#define PCRE_ERROR_RECURSIONLIMIT (-21)
|
||||
#define PCRE_ERROR_NULLWSLIMIT (-22)
|
||||
#define PCRE_ERROR_NULLWSLIMIT (-22) /* No longer actually used */
|
||||
#define PCRE_ERROR_BADNEWLINE (-23)
|
||||
|
||||
/* Request types for pcre_fullinfo() */
|
||||
@@ -150,6 +168,9 @@ extern "C" {
|
||||
#define PCRE_INFO_NAMETABLE 9
|
||||
#define PCRE_INFO_STUDYSIZE 10
|
||||
#define PCRE_INFO_DEFAULT_TABLES 11
|
||||
#define PCRE_INFO_OKPARTIAL 12
|
||||
#define PCRE_INFO_JCHANGED 13
|
||||
#define PCRE_INFO_HASCRORLF 14
|
||||
|
||||
/* Request types for pcre_config(). Do not re-arrange, in order to remain
|
||||
compatible. */
|
||||
@@ -162,6 +183,7 @@ compatible. */
|
||||
#define PCRE_CONFIG_STACKRECURSE 5
|
||||
#define PCRE_CONFIG_UNICODE_PROPERTIES 6
|
||||
#define PCRE_CONFIG_MATCH_LIMIT_RECURSION 7
|
||||
#define PCRE_CONFIG_BSR 8
|
||||
|
||||
/* Bit flags for the pcre_extra structure. Do not re-arrange or redefine
|
||||
these bits, just add new ones on the end, in order to remain compatible. */
|
||||
|
@@ -14,12 +14,13 @@ example ISO-8859-1. When dftables is run, it creates these tables in the
|
||||
current locale. If PCRE is configured with --enable-rebuild-chartables, this
|
||||
happens automatically.
|
||||
|
||||
The following #include is present because without it gcc 4.x may remove the
|
||||
The following #includes are present because without them gcc 4.x may remove the
|
||||
array definition from the final binary if PCRE is built into a static library
|
||||
and dead code stripping is activated. This leads to link errors. Pulling in the
|
||||
header ensures that the array gets flagged as "someone outside this compilation
|
||||
unit might reference this" and so it will always be supplied to the linker. */
|
||||
|
||||
#include "pcre_config.h"
|
||||
#include "pcre_internal.h"
|
||||
|
||||
const unsigned char _pcre_default_tables[] = {
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -2,11 +2,11 @@
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/*PCRE is a library of functions to support regular expressions whose syntax
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -42,6 +42,7 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
information about a compiled pattern. */
|
||||
|
||||
|
||||
#include "pcre_config.h"
|
||||
#include "pcre_internal.h"
|
||||
|
||||
|
||||
@@ -61,7 +62,7 @@ Arguments:
|
||||
Returns: 0 if data returned, negative on error
|
||||
*/
|
||||
|
||||
PCRE_EXP_DEFN int
|
||||
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
||||
pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
|
||||
void *where)
|
||||
{
|
||||
@@ -106,8 +107,8 @@ switch (what)
|
||||
|
||||
case PCRE_INFO_FIRSTBYTE:
|
||||
*((int *)where) =
|
||||
((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
|
||||
((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
|
||||
((re->flags & PCRE_FIRSTSET) != 0)? re->first_byte :
|
||||
((re->flags & PCRE_STARTLINE) != 0)? -1 : -2;
|
||||
break;
|
||||
|
||||
/* Make sure we pass back the pointer to the bit vector in the external
|
||||
@@ -121,7 +122,7 @@ switch (what)
|
||||
|
||||
case PCRE_INFO_LASTLITERAL:
|
||||
*((int *)where) =
|
||||
((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
|
||||
((re->flags & PCRE_REQCHSET) != 0)? re->req_byte : -1;
|
||||
break;
|
||||
|
||||
case PCRE_INFO_NAMEENTRYSIZE:
|
||||
@@ -140,6 +141,18 @@ switch (what)
|
||||
*((const uschar **)where) = (const uschar *)(_pcre_default_tables);
|
||||
break;
|
||||
|
||||
case PCRE_INFO_OKPARTIAL:
|
||||
*((int *)where) = (re->flags & PCRE_NOPARTIAL) == 0;
|
||||
break;
|
||||
|
||||
case PCRE_INFO_JCHANGED:
|
||||
*((int *)where) = (re->flags & PCRE_JCHANGED) != 0;
|
||||
break;
|
||||
|
||||
case PCRE_INFO_HASCRORLF:
|
||||
*((int *)where) = (re->flags & PCRE_HASCRORLF) != 0;
|
||||
break;
|
||||
|
||||
default: return PCRE_ERROR_BADOPTION;
|
||||
}
|
||||
|
||||
|
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -46,6 +46,7 @@ indirection. These values can be changed by the caller, but are shared between
|
||||
all threads. However, when compiling for Virtual Pascal, things are done
|
||||
differently, and global variables are not used (see pcre.in). */
|
||||
|
||||
#include "pcre_config.h"
|
||||
#include "pcre_internal.h"
|
||||
|
||||
#ifndef VPCOMPAT
|
||||
|
@@ -7,7 +7,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -67,10 +67,6 @@ be absolutely sure we get our version. */
|
||||
#endif
|
||||
|
||||
|
||||
/* Get the definitions provided by running "configure" */
|
||||
|
||||
#include "pcre_config.h"
|
||||
|
||||
/* Standard C headers plus the external interface definition. The only time
|
||||
setjmp and stdarg are used is when NO_RECURSE is set. */
|
||||
|
||||
@@ -112,7 +108,7 @@ PCRE_EXP_DATA_DEFN only if they are not already set. */
|
||||
|
||||
#ifndef PCRE_EXP_DECL
|
||||
# ifdef _WIN32
|
||||
# ifdef DLL_EXPORT
|
||||
# ifndef PCRE_STATIC
|
||||
# define PCRE_EXP_DECL extern __declspec(dllexport)
|
||||
# define PCRE_EXP_DEFN __declspec(dllexport)
|
||||
# define PCRE_EXP_DATA_DEFN __declspec(dllexport)
|
||||
@@ -121,7 +117,6 @@ PCRE_EXP_DATA_DEFN only if they are not already set. */
|
||||
# define PCRE_EXP_DEFN
|
||||
# define PCRE_EXP_DATA_DEFN
|
||||
# endif
|
||||
#
|
||||
# else
|
||||
# ifdef __cplusplus
|
||||
# define PCRE_EXP_DECL extern "C"
|
||||
@@ -137,6 +132,20 @@ PCRE_EXP_DATA_DEFN only if they are not already set. */
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* When compiling with the MSVC compiler, it is sometimes necessary to include
|
||||
a "calling convention" before exported function names. (This is secondhand
|
||||
information; I know nothing about MSVC myself). For example, something like
|
||||
|
||||
void __cdecl function(....)
|
||||
|
||||
might be needed. In order to make this easy, all the exported functions have
|
||||
PCRE_CALL_CONVENTION just before their names. It is rarely needed; if not
|
||||
set, we ensure here that it has no effect. */
|
||||
|
||||
#ifndef PCRE_CALL_CONVENTION
|
||||
#define PCRE_CALL_CONVENTION
|
||||
#endif
|
||||
|
||||
/* We need to have types that specify unsigned 16-bit and 32-bit integers. We
|
||||
cannot determine these outside the compilation (e.g. by running a program as
|
||||
part of "configure") because PCRE is often cross-compiled for use on other
|
||||
@@ -145,16 +154,20 @@ preprocessor time in standard C environments. */
|
||||
|
||||
#if USHRT_MAX == 65535
|
||||
typedef unsigned short pcre_uint16;
|
||||
typedef short pcre_int16;
|
||||
#elif UINT_MAX == 65535
|
||||
typedef unsigned int pcre_uint16;
|
||||
typedef int pcre_int16;
|
||||
#else
|
||||
#error Cannot determine a type for 16-bit unsigned integers
|
||||
#endif
|
||||
|
||||
#if UINT_MAX == 4294967295
|
||||
typedef unsigned int pcre_uint32;
|
||||
typedef int pcre_int32;
|
||||
#elif ULONG_MAX == 4294967295
|
||||
typedef unsigned long int pcre_uint32;
|
||||
typedef long int pcre_int32;
|
||||
#else
|
||||
#error Cannot determine a type for 32-bit unsigned integers
|
||||
#endif
|
||||
@@ -363,7 +376,9 @@ capturing parenthesis numbers in back references. */
|
||||
|
||||
/* When UTF-8 encoding is being used, a character is no longer just a single
|
||||
byte. The macros for character handling generate simple sequences when used in
|
||||
byte-mode, and more complicated ones for UTF-8 characters. */
|
||||
byte-mode, and more complicated ones for UTF-8 characters. BACKCHAR should
|
||||
never be called in byte mode. To make sure it can never even appear when UTF-8
|
||||
support is omitted, we don't even define it. */
|
||||
|
||||
#ifndef SUPPORT_UTF8
|
||||
#define GETCHAR(c, eptr) c = *eptr;
|
||||
@@ -371,7 +386,7 @@ byte-mode, and more complicated ones for UTF-8 characters. */
|
||||
#define GETCHARINC(c, eptr) c = *eptr++;
|
||||
#define GETCHARINCTEST(c, eptr) c = *eptr++;
|
||||
#define GETCHARLEN(c, eptr, len) c = *eptr;
|
||||
#define BACKCHAR(eptr)
|
||||
/* #define BACKCHAR(eptr) */
|
||||
|
||||
#else /* SUPPORT_UTF8 */
|
||||
|
||||
@@ -464,9 +479,10 @@ if there are extra bytes. This is called when we know we are in UTF-8 mode. */
|
||||
}
|
||||
|
||||
/* If the pointer is not at the start of a character, move it back until
|
||||
it is. Called only in UTF-8 mode. */
|
||||
it is. This is called only in UTF-8 mode - we don't put a test within the macro
|
||||
because almost all calls are already within a block of UTF-8 only code. */
|
||||
|
||||
#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
|
||||
#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--
|
||||
|
||||
#endif
|
||||
|
||||
@@ -483,17 +499,16 @@ Standard C system should have one. */
|
||||
|
||||
#define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
|
||||
|
||||
/* Private options flags start at the most significant end of the four bytes.
|
||||
The public options defined in pcre.h start at the least significant end. Make
|
||||
sure they don't overlap! The bits are getting a bit scarce now -- when we run
|
||||
out, there is a dummy word in the structure that could be used for the private
|
||||
bits. */
|
||||
/* Private flags containing information about the compiled regex. They used to
|
||||
live at the top end of the options word, but that got almost full, so now they
|
||||
are in a 16-bit flags word. */
|
||||
|
||||
#define PCRE_NOPARTIAL 0x80000000 /* can't use partial with this regex */
|
||||
#define PCRE_FIRSTSET 0x40000000 /* first_byte is set */
|
||||
#define PCRE_REQCHSET 0x20000000 /* req_byte is set */
|
||||
#define PCRE_STARTLINE 0x10000000 /* start after \n for multiline */
|
||||
#define PCRE_JCHANGED 0x08000000 /* j option changes within regex */
|
||||
#define PCRE_NOPARTIAL 0x0001 /* can't use partial with this regex */
|
||||
#define PCRE_FIRSTSET 0x0002 /* first_byte is set */
|
||||
#define PCRE_REQCHSET 0x0004 /* req_byte is set */
|
||||
#define PCRE_STARTLINE 0x0008 /* start after \n for multiline */
|
||||
#define PCRE_JCHANGED 0x0010 /* j option used in regex */
|
||||
#define PCRE_HASCRORLF 0x0020 /* explicit \r or \n in pattern */
|
||||
|
||||
/* Options for the "extra" block produced by pcre_study(). */
|
||||
|
||||
@@ -509,15 +524,17 @@ time, run time, or study time, respectively. */
|
||||
(PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
|
||||
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
|
||||
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
|
||||
PCRE_DUPNAMES|PCRE_NEWLINE_BITS)
|
||||
PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
|
||||
PCRE_JAVASCRIPT_COMPAT)
|
||||
|
||||
#define PUBLIC_EXEC_OPTIONS \
|
||||
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
|
||||
PCRE_PARTIAL|PCRE_NEWLINE_BITS)
|
||||
PCRE_PARTIAL|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)
|
||||
|
||||
#define PUBLIC_DFA_EXEC_OPTIONS \
|
||||
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
|
||||
PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS)
|
||||
PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS| \
|
||||
PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)
|
||||
|
||||
#define PUBLIC_STUDY_OPTIONS 0 /* None defined */
|
||||
|
||||
@@ -542,12 +559,15 @@ variable-length repeat, or a anything other than literal characters. */
|
||||
#define REQ_CASELESS 0x0100 /* indicates caselessness */
|
||||
#define REQ_VARY 0x0200 /* reqbyte followed non-literal item */
|
||||
|
||||
/* Miscellaneous definitions */
|
||||
/* Miscellaneous definitions. The #ifndef is to pacify compiler warnings in
|
||||
environments where these macros are defined elsewhere. */
|
||||
|
||||
#ifndef FALSE
|
||||
typedef int BOOL;
|
||||
|
||||
#define FALSE 0
|
||||
#define TRUE 1
|
||||
#endif
|
||||
|
||||
/* Escape items that are just an encoding of a particular data value. */
|
||||
|
||||
@@ -598,26 +618,25 @@ contain UTF-8 characters with values greater than 255. */
|
||||
value such as \n. They must have non-zero values, as check_escape() returns
|
||||
their negation. Also, they must appear in the same order as in the opcode
|
||||
definitions below, up to ESC_z. There's a dummy for OP_ANY because it
|
||||
corresponds to "." rather than an escape sequence. The final one must be
|
||||
ESC_REF as subsequent values are used for backreferences (\1, \2, \3, etc).
|
||||
There are two tests in the code for an escape greater than ESC_b and less than
|
||||
ESC_Z to detect the types that may be repeated. These are the types that
|
||||
consume characters. If any new escapes are put in between that don't consume a
|
||||
character, that code will have to change. */
|
||||
corresponds to "." rather than an escape sequence, and another for OP_ALLANY
|
||||
(which is used for [^] in JavaScript compatibility mode).
|
||||
|
||||
enum { ESC_A = 1, ESC_G, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W,
|
||||
ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_R, ESC_X, ESC_Z, ESC_z,
|
||||
ESC_E, ESC_Q, ESC_k, ESC_REF };
|
||||
The final escape must be ESC_REF as subsequent values are used for
|
||||
backreferences (\1, \2, \3, etc). There are two tests in the code for an escape
|
||||
greater than ESC_b and less than ESC_Z to detect the types that may be
|
||||
repeated. These are the types that consume characters. If any new escapes are
|
||||
put in between that don't consume a character, that code will have to change.
|
||||
*/
|
||||
|
||||
enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
|
||||
ESC_W, ESC_w, ESC_dum1, ESC_dum2, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
|
||||
ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_g, ESC_k,
|
||||
ESC_REF };
|
||||
|
||||
|
||||
/* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
|
||||
that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
|
||||
/* Opcode table: Starting from 1 (i.e. after OP_END), the values up to
|
||||
OP_EOD must correspond in order to the list of escapes immediately above.
|
||||
|
||||
To keep stored, compiled patterns compatible, new opcodes should be added
|
||||
immediately before OP_BRA, where (since release 7.0) a gap is left for this
|
||||
purpose.
|
||||
|
||||
*** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions
|
||||
that follow must also be updated to match. There is also a table called
|
||||
"coptable" in pcre_dfa_exec.c that must be updated. */
|
||||
@@ -629,133 +648,155 @@ enum {
|
||||
|
||||
OP_SOD, /* 1 Start of data: \A */
|
||||
OP_SOM, /* 2 Start of match (subject + offset): \G */
|
||||
OP_NOT_WORD_BOUNDARY, /* 3 \B */
|
||||
OP_WORD_BOUNDARY, /* 4 \b */
|
||||
OP_NOT_DIGIT, /* 5 \D */
|
||||
OP_DIGIT, /* 6 \d */
|
||||
OP_NOT_WHITESPACE, /* 7 \S */
|
||||
OP_WHITESPACE, /* 8 \s */
|
||||
OP_NOT_WORDCHAR, /* 9 \W */
|
||||
OP_WORDCHAR, /* 10 \w */
|
||||
OP_ANY, /* 11 Match any character */
|
||||
OP_ANYBYTE, /* 12 Match any byte (\C); different to OP_ANY for UTF-8 */
|
||||
OP_NOTPROP, /* 13 \P (not Unicode property) */
|
||||
OP_PROP, /* 14 \p (Unicode property) */
|
||||
OP_ANYNL, /* 15 \R (any newline sequence) */
|
||||
OP_EXTUNI, /* 16 \X (extended Unicode sequence */
|
||||
OP_EODN, /* 17 End of data or \n at end of data: \Z. */
|
||||
OP_EOD, /* 18 End of data: \z */
|
||||
OP_SET_SOM, /* 3 Set start of match (\K) */
|
||||
OP_NOT_WORD_BOUNDARY, /* 4 \B */
|
||||
OP_WORD_BOUNDARY, /* 5 \b */
|
||||
OP_NOT_DIGIT, /* 6 \D */
|
||||
OP_DIGIT, /* 7 \d */
|
||||
OP_NOT_WHITESPACE, /* 8 \S */
|
||||
OP_WHITESPACE, /* 9 \s */
|
||||
OP_NOT_WORDCHAR, /* 10 \W */
|
||||
OP_WORDCHAR, /* 11 \w */
|
||||
OP_ANY, /* 12 Match any character (subject to DOTALL) */
|
||||
OP_ALLANY, /* 13 Match any character (not subject to DOTALL) */
|
||||
OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
|
||||
OP_NOTPROP, /* 15 \P (not Unicode property) */
|
||||
OP_PROP, /* 16 \p (Unicode property) */
|
||||
OP_ANYNL, /* 17 \R (any newline sequence) */
|
||||
OP_NOT_HSPACE, /* 18 \H (not horizontal whitespace) */
|
||||
OP_HSPACE, /* 19 \h (horizontal whitespace) */
|
||||
OP_NOT_VSPACE, /* 20 \V (not vertical whitespace) */
|
||||
OP_VSPACE, /* 21 \v (vertical whitespace) */
|
||||
OP_EXTUNI, /* 22 \X (extended Unicode sequence) */
|
||||
OP_EODN, /* 23 End of data or \n at end of data: \Z. */
|
||||
OP_EOD, /* 24 End of data: \z */
|
||||
|
||||
OP_OPT, /* 19 Set runtime options */
|
||||
OP_CIRC, /* 20 Start of line - varies with multiline switch */
|
||||
OP_DOLL, /* 21 End of line - varies with multiline switch */
|
||||
OP_CHAR, /* 22 Match one character, casefully */
|
||||
OP_CHARNC, /* 23 Match one character, caselessly */
|
||||
OP_NOT, /* 24 Match one character, not the following one */
|
||||
OP_OPT, /* 25 Set runtime options */
|
||||
OP_CIRC, /* 26 Start of line - varies with multiline switch */
|
||||
OP_DOLL, /* 27 End of line - varies with multiline switch */
|
||||
OP_CHAR, /* 28 Match one character, casefully */
|
||||
OP_CHARNC, /* 29 Match one character, caselessly */
|
||||
OP_NOT, /* 30 Match one character, not the following one */
|
||||
|
||||
OP_STAR, /* 25 The maximizing and minimizing versions of */
|
||||
OP_MINSTAR, /* 26 these six opcodes must come in pairs, with */
|
||||
OP_PLUS, /* 27 the minimizing one second. */
|
||||
OP_MINPLUS, /* 28 This first set applies to single characters.*/
|
||||
OP_QUERY, /* 29 */
|
||||
OP_MINQUERY, /* 30 */
|
||||
OP_STAR, /* 31 The maximizing and minimizing versions of */
|
||||
OP_MINSTAR, /* 32 these six opcodes must come in pairs, with */
|
||||
OP_PLUS, /* 33 the minimizing one second. */
|
||||
OP_MINPLUS, /* 34 This first set applies to single characters.*/
|
||||
OP_QUERY, /* 35 */
|
||||
OP_MINQUERY, /* 36 */
|
||||
|
||||
OP_UPTO, /* 31 From 0 to n matches */
|
||||
OP_MINUPTO, /* 32 */
|
||||
OP_EXACT, /* 33 Exactly n matches */
|
||||
OP_UPTO, /* 37 From 0 to n matches */
|
||||
OP_MINUPTO, /* 38 */
|
||||
OP_EXACT, /* 39 Exactly n matches */
|
||||
|
||||
OP_POSSTAR, /* 34 Possessified star */
|
||||
OP_POSPLUS, /* 35 Possessified plus */
|
||||
OP_POSQUERY, /* 36 Posesssified query */
|
||||
OP_POSUPTO, /* 37 Possessified upto */
|
||||
OP_POSSTAR, /* 40 Possessified star */
|
||||
OP_POSPLUS, /* 41 Possessified plus */
|
||||
OP_POSQUERY, /* 42 Possessified query */
|
||||
OP_POSUPTO, /* 43 Possessified upto */
|
||||
|
||||
OP_NOTSTAR, /* 38 The maximizing and minimizing versions of */
|
||||
OP_NOTMINSTAR, /* 39 these six opcodes must come in pairs, with */
|
||||
OP_NOTPLUS, /* 40 the minimizing one second. They must be in */
|
||||
OP_NOTMINPLUS, /* 41 exactly the same order as those above. */
|
||||
OP_NOTQUERY, /* 42 This set applies to "not" single characters. */
|
||||
OP_NOTMINQUERY, /* 43 */
|
||||
OP_NOTSTAR, /* 44 The maximizing and minimizing versions of */
|
||||
OP_NOTMINSTAR, /* 45 these six opcodes must come in pairs, with */
|
||||
OP_NOTPLUS, /* 46 the minimizing one second. They must be in */
|
||||
OP_NOTMINPLUS, /* 47 exactly the same order as those above. */
|
||||
OP_NOTQUERY, /* 48 This set applies to "not" single characters. */
|
||||
OP_NOTMINQUERY, /* 49 */
|
||||
|
||||
OP_NOTUPTO, /* 44 From 0 to n matches */
|
||||
OP_NOTMINUPTO, /* 45 */
|
||||
OP_NOTEXACT, /* 46 Exactly n matches */
|
||||
OP_NOTUPTO, /* 50 From 0 to n matches */
|
||||
OP_NOTMINUPTO, /* 51 */
|
||||
OP_NOTEXACT, /* 52 Exactly n matches */
|
||||
|
||||
OP_NOTPOSSTAR, /* 47 Possessified versions */
|
||||
OP_NOTPOSPLUS, /* 48 */
|
||||
OP_NOTPOSQUERY, /* 49 */
|
||||
OP_NOTPOSUPTO, /* 50 */
|
||||
OP_NOTPOSSTAR, /* 53 Possessified versions */
|
||||
OP_NOTPOSPLUS, /* 54 */
|
||||
OP_NOTPOSQUERY, /* 55 */
|
||||
OP_NOTPOSUPTO, /* 56 */
|
||||
|
||||
OP_TYPESTAR, /* 51 The maximizing and minimizing versions of */
|
||||
OP_TYPEMINSTAR, /* 52 these six opcodes must come in pairs, with */
|
||||
OP_TYPEPLUS, /* 53 the minimizing one second. These codes must */
|
||||
OP_TYPEMINPLUS, /* 54 be in exactly the same order as those above. */
|
||||
OP_TYPEQUERY, /* 55 This set applies to character types such as \d */
|
||||
OP_TYPEMINQUERY, /* 56 */
|
||||
OP_TYPESTAR, /* 57 The maximizing and minimizing versions of */
|
||||
OP_TYPEMINSTAR, /* 58 these six opcodes must come in pairs, with */
|
||||
OP_TYPEPLUS, /* 59 the minimizing one second. These codes must */
|
||||
OP_TYPEMINPLUS, /* 60 be in exactly the same order as those above. */
|
||||
OP_TYPEQUERY, /* 61 This set applies to character types such as \d */
|
||||
OP_TYPEMINQUERY, /* 62 */
|
||||
|
||||
OP_TYPEUPTO, /* 57 From 0 to n matches */
|
||||
OP_TYPEMINUPTO, /* 58 */
|
||||
OP_TYPEEXACT, /* 59 Exactly n matches */
|
||||
OP_TYPEUPTO, /* 63 From 0 to n matches */
|
||||
OP_TYPEMINUPTO, /* 64 */
|
||||
OP_TYPEEXACT, /* 65 Exactly n matches */
|
||||
|
||||
OP_TYPEPOSSTAR, /* 60 Possessified versions */
|
||||
OP_TYPEPOSPLUS, /* 61 */
|
||||
OP_TYPEPOSQUERY, /* 62 */
|
||||
OP_TYPEPOSUPTO, /* 63 */
|
||||
OP_TYPEPOSSTAR, /* 66 Possessified versions */
|
||||
OP_TYPEPOSPLUS, /* 67 */
|
||||
OP_TYPEPOSQUERY, /* 68 */
|
||||
OP_TYPEPOSUPTO, /* 69 */
|
||||
|
||||
OP_CRSTAR, /* 64 The maximizing and minimizing versions of */
|
||||
OP_CRMINSTAR, /* 65 all these opcodes must come in pairs, with */
|
||||
OP_CRPLUS, /* 66 the minimizing one second. These codes must */
|
||||
OP_CRMINPLUS, /* 67 be in exactly the same order as those above. */
|
||||
OP_CRQUERY, /* 68 These are for character classes and back refs */
|
||||
OP_CRMINQUERY, /* 69 */
|
||||
OP_CRRANGE, /* 70 These are different to the three sets above. */
|
||||
OP_CRMINRANGE, /* 71 */
|
||||
OP_CRSTAR, /* 70 The maximizing and minimizing versions of */
|
||||
OP_CRMINSTAR, /* 71 all these opcodes must come in pairs, with */
|
||||
OP_CRPLUS, /* 72 the minimizing one second. These codes must */
|
||||
OP_CRMINPLUS, /* 73 be in exactly the same order as those above. */
|
||||
OP_CRQUERY, /* 74 These are for character classes and back refs */
|
||||
OP_CRMINQUERY, /* 75 */
|
||||
OP_CRRANGE, /* 76 These are different to the three sets above. */
|
||||
OP_CRMINRANGE, /* 77 */
|
||||
|
||||
OP_CLASS, /* 72 Match a character class, chars < 256 only */
|
||||
OP_NCLASS, /* 73 Same, but the bitmap was created from a negative
|
||||
OP_CLASS, /* 78 Match a character class, chars < 256 only */
|
||||
OP_NCLASS, /* 79 Same, but the bitmap was created from a negative
|
||||
class - the difference is relevant only when a UTF-8
|
||||
character > 255 is encountered. */
|
||||
|
||||
OP_XCLASS, /* 74 Extended class for handling UTF-8 chars within the
|
||||
OP_XCLASS, /* 80 Extended class for handling UTF-8 chars within the
|
||||
class. This does both positive and negative. */
|
||||
|
||||
OP_REF, /* 75 Match a back reference */
|
||||
OP_RECURSE, /* 76 Match a numbered subpattern (possibly recursive) */
|
||||
OP_CALLOUT, /* 77 Call out to external function if provided */
|
||||
OP_REF, /* 81 Match a back reference */
|
||||
OP_RECURSE, /* 82 Match a numbered subpattern (possibly recursive) */
|
||||
OP_CALLOUT, /* 83 Call out to external function if provided */
|
||||
|
||||
OP_ALT, /* 78 Start of alternation */
|
||||
OP_KET, /* 79 End of group that doesn't have an unbounded repeat */
|
||||
OP_KETRMAX, /* 80 These two must remain together and in this */
|
||||
OP_KETRMIN, /* 81 order. They are for groups that repeat for ever. */
|
||||
OP_ALT, /* 84 Start of alternation */
|
||||
OP_KET, /* 85 End of group that doesn't have an unbounded repeat */
|
||||
OP_KETRMAX, /* 86 These two must remain together and in this */
|
||||
OP_KETRMIN, /* 87 order. They are for groups that repeat for ever. */
|
||||
|
||||
/* The assertions must come before BRA, CBRA, ONCE, and COND.*/
|
||||
|
||||
OP_ASSERT, /* 82 Positive lookahead */
|
||||
OP_ASSERT_NOT, /* 83 Negative lookahead */
|
||||
OP_ASSERTBACK, /* 84 Positive lookbehind */
|
||||
OP_ASSERTBACK_NOT, /* 85 Negative lookbehind */
|
||||
OP_REVERSE, /* 86 Move pointer back - used in lookbehind assertions */
|
||||
OP_ASSERT, /* 88 Positive lookahead */
|
||||
OP_ASSERT_NOT, /* 89 Negative lookahead */
|
||||
OP_ASSERTBACK, /* 90 Positive lookbehind */
|
||||
OP_ASSERTBACK_NOT, /* 91 Negative lookbehind */
|
||||
OP_REVERSE, /* 92 Move pointer back - used in lookbehind assertions */
|
||||
|
||||
/* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first,
|
||||
as there's a test for >= ONCE for a subpattern that isn't an assertion. */
|
||||
|
||||
OP_ONCE, /* 87 Atomic group */
|
||||
OP_BRA, /* 88 Start of non-capturing bracket */
|
||||
OP_CBRA, /* 89 Start of capturing bracket */
|
||||
OP_COND, /* 90 Conditional group */
|
||||
OP_ONCE, /* 93 Atomic group */
|
||||
OP_BRA, /* 94 Start of non-capturing bracket */
|
||||
OP_CBRA, /* 95 Start of capturing bracket */
|
||||
OP_COND, /* 96 Conditional group */
|
||||
|
||||
/* These three must follow the previous three, in the same order. There's a
|
||||
check for >= SBRA to distinguish the two sets. */
|
||||
|
||||
OP_SBRA, /* 91 Start of non-capturing bracket, check empty */
|
||||
OP_SCBRA, /* 92 Start of capturing bracket, check empty */
|
||||
OP_SCOND, /* 93 Conditional group, check empty */
|
||||
OP_SBRA, /* 97 Start of non-capturing bracket, check empty */
|
||||
OP_SCBRA, /* 98 Start of capturing bracket, check empty */
|
||||
OP_SCOND, /* 99 Conditional group, check empty */
|
||||
|
||||
OP_CREF, /* 94 Used to hold a capture number as condition */
|
||||
OP_RREF, /* 95 Used to hold a recursion number as condition */
|
||||
OP_DEF, /* 96 The DEFINE condition */
|
||||
OP_CREF, /* 100 Used to hold a capture number as condition */
|
||||
OP_RREF, /* 101 Used to hold a recursion number as condition */
|
||||
OP_DEF, /* 102 The DEFINE condition */
|
||||
|
||||
OP_BRAZERO, /* 97 These two must remain together and in this */
|
||||
OP_BRAMINZERO /* 98 order. */
|
||||
OP_BRAZERO, /* 103 These two must remain together and in this */
|
||||
OP_BRAMINZERO, /* 104 order. */
|
||||
|
||||
/* These are backtracking control verbs */
|
||||
|
||||
OP_PRUNE, /* 105 */
|
||||
OP_SKIP, /* 106 */
|
||||
OP_THEN, /* 107 */
|
||||
OP_COMMIT, /* 108 */
|
||||
|
||||
/* These are forced failure and success verbs */
|
||||
|
||||
OP_FAIL, /* 109 */
|
||||
OP_ACCEPT, /* 110 */
|
||||
|
||||
/* This is used to skip a subpattern with a {0} quantifier */
|
||||
|
||||
OP_SKIPZERO /* 111 */
|
||||
};
|
||||
|
||||
|
||||
@@ -763,10 +804,10 @@ enum {
|
||||
for debugging. The macro is referenced only in pcre_printint.c. */
|
||||
|
||||
#define OP_NAME_LIST \
|
||||
"End", "\\A", "\\G", "\\B", "\\b", "\\D", "\\d", \
|
||||
"\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \
|
||||
"notprop", "prop", "anynl", "extuni", \
|
||||
"\\Z", "\\z", \
|
||||
"End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", \
|
||||
"\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", \
|
||||
"notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \
|
||||
"extuni", "\\Z", "\\z", \
|
||||
"Opt", "^", "$", "char", "charnc", "not", \
|
||||
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", \
|
||||
"*+","++", "?+", "{", \
|
||||
@@ -778,8 +819,10 @@ for debugging. The macro is referenced only in pcre_printint.c. */
|
||||
"class", "nclass", "xclass", "Ref", "Recurse", "Callout", \
|
||||
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \
|
||||
"AssertB", "AssertB not", "Reverse", \
|
||||
"Once", "Bra 0", "Bra", "Cond", "SBra 0", "SBra", "SCond", \
|
||||
"Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero"
|
||||
"Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \
|
||||
"Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero", \
|
||||
"*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
|
||||
"Skip zero"
|
||||
|
||||
|
||||
/* This macro defines the length of fixed length operations in the compiled
|
||||
@@ -793,9 +836,11 @@ in UTF-8 mode. The code that uses this table must know about such things. */
|
||||
|
||||
#define OP_LENGTHS \
|
||||
1, /* End */ \
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */ \
|
||||
1, 1, /* Any, Anybyte */ \
|
||||
3, 3, 1, 1, /* NOTPROP, PROP, EXTUNI, ANYNL */ \
|
||||
1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \
|
||||
1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \
|
||||
1, 1, 1, /* Any, AllAny, Anybyte */ \
|
||||
3, 3, 1, /* NOTPROP, PROP, \R */ \
|
||||
1, 1, 1, 1, 1, /* \H, \h, \V, \v, EXTUNI */ \
|
||||
1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
|
||||
2, /* Char - the minimum length */ \
|
||||
2, /* Charnc - the minimum length */ \
|
||||
@@ -841,6 +886,8 @@ in UTF-8 mode. The code that uses this table must know about such things. */
|
||||
3, /* RREF */ \
|
||||
1, /* DEF */ \
|
||||
1, 1, /* BRAZERO, BRAMINZERO */ \
|
||||
1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \
|
||||
1, 1, 1 /* FAIL, ACCEPT, SKIPZERO */
|
||||
|
||||
|
||||
/* A magic value for OP_RREF to indicate the "any recursion" condition. */
|
||||
@@ -855,7 +902,8 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
|
||||
ERR20, ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29,
|
||||
ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
|
||||
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
|
||||
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57 };
|
||||
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
|
||||
ERR60, ERR61, ERR62, ERR63, ERR64 };
|
||||
|
||||
/* The real format of the start of the pcre block; the index of names and the
|
||||
code vector run on as long as necessary after the end. We store an explicit
|
||||
@@ -877,9 +925,9 @@ NOTE NOTE NOTE:
|
||||
typedef struct real_pcre {
|
||||
pcre_uint32 magic_number;
|
||||
pcre_uint32 size; /* Total that was malloced */
|
||||
pcre_uint32 options;
|
||||
pcre_uint32 dummy1; /* For future use, maybe */
|
||||
|
||||
pcre_uint32 options; /* Public options */
|
||||
pcre_uint16 flags; /* Private flags */
|
||||
pcre_uint16 dummy1; /* For future use */
|
||||
pcre_uint16 top_bracket;
|
||||
pcre_uint16 top_backref;
|
||||
pcre_uint16 first_byte;
|
||||
@@ -918,12 +966,14 @@ typedef struct compile_data {
|
||||
uschar *name_table; /* The name/number table */
|
||||
int names_found; /* Number of entries so far */
|
||||
int name_entry_size; /* Size of each entry */
|
||||
int bracount; /* Count of capturing parens */
|
||||
int bracount; /* Count of capturing parens as we compile */
|
||||
int final_bracount; /* Saved value after first pass */
|
||||
int top_backref; /* Maximum back reference */
|
||||
unsigned int backref_map; /* Bitmap of low back refs */
|
||||
int external_options; /* External (initial) options */
|
||||
int external_flags; /* External flag bits to be set */
|
||||
int req_varyopt; /* "After variable item" flag for reqbyte */
|
||||
BOOL nopartial; /* Set TRUE if partial won't work */
|
||||
BOOL had_accept; /* (*ACCEPT) encountered */
|
||||
int nltype; /* Newline type */
|
||||
int nllen; /* Newline string length */
|
||||
uschar nl[4]; /* Newline string when fixed length */
|
||||
@@ -944,21 +994,11 @@ typedef struct recursion_info {
|
||||
struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
|
||||
int group_num; /* Number of group that was called */
|
||||
const uschar *after_call; /* "Return value": points after the call in the expr */
|
||||
USPTR save_start; /* Old value of md->start_match */
|
||||
USPTR save_start; /* Old value of mstart */
|
||||
int *offset_save; /* Pointer to start of saved offsets */
|
||||
int saved_max; /* Number of saved offsets */
|
||||
} recursion_info;
|
||||
|
||||
/* When compiling in a mode that doesn't use recursive calls to match(),
|
||||
a structure is used to remember local variables on the heap. It is defined in
|
||||
pcre_exec.c, close to the match() function, so that it is easy to keep it in
|
||||
step with any changes of local variable. However, the pointer to the current
|
||||
frame must be saved in some "static" place over a longjmp(). We declare the
|
||||
structure here so that we can put a pointer in the match_data structure. NOTE:
|
||||
This isn't used for a "normal" compilation of pcre. */
|
||||
|
||||
struct heapframe;
|
||||
|
||||
/* Structure for building a chain of data for holding the values of the subject
|
||||
pointer at the start of each subpattern, so as to detect when an empty string
|
||||
has been matched by a subpattern - to break infinite loops. */
|
||||
@@ -988,14 +1028,16 @@ typedef struct match_data {
|
||||
BOOL notbol; /* NOTBOL flag */
|
||||
BOOL noteol; /* NOTEOL flag */
|
||||
BOOL utf8; /* UTF8 flag */
|
||||
BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */
|
||||
BOOL endonly; /* Dollar not before final \n */
|
||||
BOOL notempty; /* Empty string match not wanted */
|
||||
BOOL partial; /* PARTIAL flag */
|
||||
BOOL hitend; /* Hit the end of the subject at some point */
|
||||
BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */
|
||||
const uschar *start_code; /* For use when recursing */
|
||||
USPTR start_subject; /* Start of the subject string */
|
||||
USPTR end_subject; /* End of the subject string */
|
||||
USPTR start_match; /* Start of this match attempt */
|
||||
USPTR start_match_ptr; /* Start of matched string */
|
||||
USPTR end_match_ptr; /* Subject position at end match */
|
||||
int end_offset_top; /* Highwater mark at end of match */
|
||||
int capture_last; /* Most recent capture number */
|
||||
@@ -1004,7 +1046,6 @@ typedef struct match_data {
|
||||
int eptrn; /* Next free eptrblock */
|
||||
recursion_info *recursive; /* Linked list of recursion data */
|
||||
void *callout_data; /* To pass back to callouts */
|
||||
struct heapframe *thisframe; /* Used only when compiling for no recursion */
|
||||
} match_data;
|
||||
|
||||
/* A similar structure is used for the same purpose by the DFA matching
|
||||
@@ -1029,7 +1070,7 @@ typedef struct dfa_match_data {
|
||||
#define ctype_letter 0x02
|
||||
#define ctype_digit 0x04
|
||||
#define ctype_xdigit 0x08
|
||||
#define ctype_word 0x10 /* alphameric or '_' */
|
||||
#define ctype_word 0x10 /* alphanumeric or '_' */
|
||||
#define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */
|
||||
|
||||
/* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
|
||||
@@ -1057,10 +1098,12 @@ total length. */
|
||||
#define tables_length (ctypes_offset + 256)
|
||||
|
||||
/* Layout of the UCP type table that translates property names into types and
|
||||
codes. */
|
||||
codes. Each entry used to point directly to a name, but to reduce the number of
|
||||
relocations in shared libraries, it now has an offset into a single string
|
||||
instead. */
|
||||
|
||||
typedef struct {
|
||||
const char *name;
|
||||
pcre_uint16 name_offset;
|
||||
pcre_uint16 type;
|
||||
pcre_uint16 value;
|
||||
} ucp_type_table;
|
||||
@@ -1078,6 +1121,7 @@ extern const uschar _pcre_utf8_table4[];
|
||||
|
||||
extern const int _pcre_utf8_table1_size;
|
||||
|
||||
extern const char _pcre_utt_names[];
|
||||
extern const ucp_type_table _pcre_utt[];
|
||||
extern const int _pcre_utt_size;
|
||||
|
||||
@@ -1095,13 +1139,38 @@ extern BOOL _pcre_is_newline(const uschar *, int, const uschar *,
|
||||
extern int _pcre_ord2utf8(int, uschar *);
|
||||
extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
|
||||
const pcre_study_data *, pcre_study_data *);
|
||||
extern int _pcre_ucp_findprop(const unsigned int, int *, int *);
|
||||
extern unsigned int _pcre_ucp_othercase(const unsigned int);
|
||||
extern int _pcre_valid_utf8(const uschar *, int);
|
||||
extern BOOL _pcre_was_newline(const uschar *, int, const uschar *,
|
||||
int *, BOOL);
|
||||
extern BOOL _pcre_xclass(int, const uschar *);
|
||||
|
||||
|
||||
/* Unicode character database (UCD) */
|
||||
|
||||
/* One entry of per-character property data. GET_UCD(ch) yields a pointer to
the entry that applies to character ch; the UCD_xxx macros read its fields. */
typedef struct {
|
||||
uschar script; /* Script value, as returned by UCD_SCRIPT() */
|
||||
uschar chartype; /* Particular type; index into _pcre_ucp_gentype[] */
|
||||
pcre_int32 other_case; /* Signed offset that UCD_OTHERCASE() adds to ch */
|
||||
} ucd_record;
|
||||
|
||||
/* Tables used by GET_UCD() and UCD_CATEGORY(). The lookup is two-stage:
stage1 is indexed by ch / UCD_BLOCK_SIZE, its value selects a block of
UCD_BLOCK_SIZE entries in stage2, and the stage2 entry is the index of the
character's record in _pcre_ucd_records[]. */
extern const ucd_record _pcre_ucd_records[]; /* The property records */
|
||||
extern const uschar _pcre_ucd_stage1[]; /* Block number for each ch / UCD_BLOCK_SIZE */
|
||||
extern const pcre_uint16 _pcre_ucd_stage2[]; /* Record index for each position in a block */
|
||||
extern const int _pcre_ucp_gentype[]; /* Indexed by chartype in UCD_CATEGORY() */
|
||||
|
||||
|
||||
/* UCD access macros */
|
||||
|
||||
#define UCD_BLOCK_SIZE 128
|
||||
#define GET_UCD(ch) (_pcre_ucd_records + \
|
||||
_pcre_ucd_stage2[_pcre_ucd_stage1[(ch) / UCD_BLOCK_SIZE] * \
|
||||
UCD_BLOCK_SIZE + ch % UCD_BLOCK_SIZE])
|
||||
|
||||
#define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype
|
||||
#define UCD_SCRIPT(ch) GET_UCD(ch)->script
|
||||
#define UCD_CATEGORY(ch) _pcre_ucp_gentype[UCD_CHARTYPE(ch)]
|
||||
#define UCD_OTHERCASE(ch) (ch + GET_UCD(ch)->other_case)
|
||||
|
||||
#endif
|
||||
|
||||
/* End of pcre_internal.h */
|
||||
|
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -45,7 +45,8 @@ compilation of dftables.c, in which case the macro DFTABLES is defined. */
|
||||
|
||||
|
||||
#ifndef DFTABLES
|
||||
#include "pcre_internal.h"
|
||||
# include "pcre_config.h"
|
||||
# include "pcre_internal.h"
|
||||
#endif
|
||||
|
||||
|
||||
|
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -47,6 +47,7 @@ and NLTYPE_ANY. The full list of Unicode newline characters is taken from
|
||||
http://unicode.org/unicode/reports/tr18/. */
|
||||
|
||||
|
||||
#include "pcre_config.h"
|
||||
#include "pcre_internal.h"
|
||||
|
||||
|
||||
@@ -124,12 +125,16 @@ _pcre_was_newline(const uschar *ptr, int type, const uschar *startptr,
|
||||
{
|
||||
int c;
|
||||
ptr--;
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8)
|
||||
{
|
||||
BACKCHAR(ptr);
|
||||
GETCHAR(c, ptr);
|
||||
}
|
||||
else c = *ptr;
|
||||
#else /* no UTF-8 support */
|
||||
c = *ptr;
|
||||
#endif /* SUPPORT_UTF8 */
|
||||
|
||||
if (type == NLTYPE_ANYCRLF) switch(c)
|
||||
{
|
||||
|
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -41,7 +41,7 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
/* This file contains a private PCRE function that converts an ordinal
|
||||
character value into a UTF8 string. */
|
||||
|
||||
|
||||
#include "pcre_config.h"
|
||||
#include "pcre_internal.h"
|
||||
|
||||
|
||||
@@ -75,8 +75,10 @@ for (j = i; j > 0; j--)
|
||||
*buffer = _pcre_utf8_table2[i] | cvalue;
|
||||
return i + 1;
|
||||
#else
|
||||
return 0; /* Keep compiler happy; this function won't ever be */
|
||||
#endif /* called when SUPPORT_UTF8 is not defined. */
|
||||
(void)(cvalue); /* Keep compiler happy; this function won't ever be */
|
||||
(void)(buffer); /* called when SUPPORT_UTF8 is not defined. */
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* End of pcre_ord2utf8.c */
|
||||
|
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -42,6 +42,7 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
supporting functions. */
|
||||
|
||||
|
||||
#include "pcre_config.h"
|
||||
#include "pcre_internal.h"
|
||||
|
||||
|
||||
@@ -213,6 +214,14 @@ do
|
||||
tcode += 1 + LINK_SIZE;
|
||||
break;
|
||||
|
||||
/* SKIPZERO skips the bracket. */
|
||||
|
||||
case OP_SKIPZERO:
|
||||
tcode++;
|
||||
do tcode += GET(tcode,1); while (*tcode == OP_ALT);
|
||||
tcode += 1 + LINK_SIZE;
|
||||
break;
|
||||
|
||||
/* Single-char * or ? sets the bit and tries the next item */
|
||||
|
||||
case OP_STAR:
|
||||
@@ -337,6 +346,7 @@ do
|
||||
switch(tcode[1])
|
||||
{
|
||||
case OP_ANY:
|
||||
case OP_ALLANY:
|
||||
return SSB_FAIL;
|
||||
|
||||
case OP_NOT_DIGIT:
|
||||
@@ -491,7 +501,7 @@ Returns: pointer to a pcre_extra block, with study_data filled in and the
|
||||
NULL on error or if no optimization possible
|
||||
*/
|
||||
|
||||
PCRE_EXP_DEFN pcre_extra *
|
||||
PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
|
||||
pcre_study(const pcre *external_re, int options, const char **errorptr)
|
||||
{
|
||||
uschar start_bits[32];
|
||||
@@ -523,7 +533,8 @@ code = (uschar *)re + re->name_table_offset +
|
||||
a multiline pattern that matches only at "line starts", no further processing
|
||||
at present. */
|
||||
|
||||
if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
|
||||
if ((re->options & PCRE_ANCHORED) != 0 ||
|
||||
(re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
|
||||
return NULL;
|
||||
|
||||
/* Set the character tables in the block that is passed around */
|
||||
|
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -44,6 +44,7 @@ uses macros to change their names from _pcre_xxx to xxxx, thereby avoiding name
|
||||
clashes with the library. */
|
||||
|
||||
|
||||
#include "pcre_config.h"
|
||||
#include "pcre_internal.h"
|
||||
|
||||
|
||||
@@ -83,115 +84,266 @@ const uschar _pcre_utf8_table4[] = {
|
||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
||||
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
|
||||
|
||||
/* This table translates Unicode property names into type and code values. It
|
||||
is searched by binary chop, so must be in collating sequence of name. */
|
||||
/* Table to translate from particular type value to the general value. */
|
||||
|
||||
/* Indexed by a particular type value (the chartype field that UCD_CATEGORY()
reads from a character's record); each entry is the corresponding general
type. The comment on each row names the particular types it covers. */
const int _pcre_ucp_gentype[] = {
|
||||
ucp_C, ucp_C, ucp_C, ucp_C, ucp_C, /* Cc, Cf, Cn, Co, Cs */
|
||||
ucp_L, ucp_L, ucp_L, ucp_L, ucp_L, /* Ll, Lu, Lm, Lo, Lt */
|
||||
ucp_M, ucp_M, ucp_M, /* Mc, Me, Mn */
|
||||
ucp_N, ucp_N, ucp_N, /* Nd, Nl, No */
|
||||
ucp_P, ucp_P, ucp_P, ucp_P, ucp_P, /* Pc, Pd, Pe, Pf, Pi */
|
||||
ucp_P, ucp_P, /* Ps, Po */
|
||||
ucp_S, ucp_S, ucp_S, ucp_S, /* Sc, Sk, Sm, So */
|
||||
ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */
|
||||
};
|
||||
|
||||
/* The pcre_utt[] table below translates Unicode property names into type and
|
||||
code values. It is searched by binary chop, so must be in collating sequence of
|
||||
name. Originally, the table contained pointers to the name strings in the first
|
||||
field of each entry. However, that leads to a large number of relocations when
|
||||
a shared library is dynamically loaded. A significant reduction is made by
|
||||
putting all the names into a single, large string and then using offsets in the
|
||||
table itself. Maintenance is more error-prone, but frequent changes to this
|
||||
data are unlikely.
|
||||
|
||||
July 2008: There is now a script called maint/GenerateUtt.py which can be used
|
||||
to generate this data instead of maintaining it entirely by hand. */
|
||||
|
||||
const char _pcre_utt_names[] =
|
||||
"Any\0"
|
||||
"Arabic\0"
|
||||
"Armenian\0"
|
||||
"Balinese\0"
|
||||
"Bengali\0"
|
||||
"Bopomofo\0"
|
||||
"Braille\0"
|
||||
"Buginese\0"
|
||||
"Buhid\0"
|
||||
"C\0"
|
||||
"Canadian_Aboriginal\0"
|
||||
"Carian\0"
|
||||
"Cc\0"
|
||||
"Cf\0"
|
||||
"Cham\0"
|
||||
"Cherokee\0"
|
||||
"Cn\0"
|
||||
"Co\0"
|
||||
"Common\0"
|
||||
"Coptic\0"
|
||||
"Cs\0"
|
||||
"Cuneiform\0"
|
||||
"Cypriot\0"
|
||||
"Cyrillic\0"
|
||||
"Deseret\0"
|
||||
"Devanagari\0"
|
||||
"Ethiopic\0"
|
||||
"Georgian\0"
|
||||
"Glagolitic\0"
|
||||
"Gothic\0"
|
||||
"Greek\0"
|
||||
"Gujarati\0"
|
||||
"Gurmukhi\0"
|
||||
"Han\0"
|
||||
"Hangul\0"
|
||||
"Hanunoo\0"
|
||||
"Hebrew\0"
|
||||
"Hiragana\0"
|
||||
"Inherited\0"
|
||||
"Kannada\0"
|
||||
"Katakana\0"
|
||||
"Kayah_Li\0"
|
||||
"Kharoshthi\0"
|
||||
"Khmer\0"
|
||||
"L\0"
|
||||
"L&\0"
|
||||
"Lao\0"
|
||||
"Latin\0"
|
||||
"Lepcha\0"
|
||||
"Limbu\0"
|
||||
"Linear_B\0"
|
||||
"Ll\0"
|
||||
"Lm\0"
|
||||
"Lo\0"
|
||||
"Lt\0"
|
||||
"Lu\0"
|
||||
"Lycian\0"
|
||||
"Lydian\0"
|
||||
"M\0"
|
||||
"Malayalam\0"
|
||||
"Mc\0"
|
||||
"Me\0"
|
||||
"Mn\0"
|
||||
"Mongolian\0"
|
||||
"Myanmar\0"
|
||||
"N\0"
|
||||
"Nd\0"
|
||||
"New_Tai_Lue\0"
|
||||
"Nko\0"
|
||||
"Nl\0"
|
||||
"No\0"
|
||||
"Ogham\0"
|
||||
"Ol_Chiki\0"
|
||||
"Old_Italic\0"
|
||||
"Old_Persian\0"
|
||||
"Oriya\0"
|
||||
"Osmanya\0"
|
||||
"P\0"
|
||||
"Pc\0"
|
||||
"Pd\0"
|
||||
"Pe\0"
|
||||
"Pf\0"
|
||||
"Phags_Pa\0"
|
||||
"Phoenician\0"
|
||||
"Pi\0"
|
||||
"Po\0"
|
||||
"Ps\0"
|
||||
"Rejang\0"
|
||||
"Runic\0"
|
||||
"S\0"
|
||||
"Saurashtra\0"
|
||||
"Sc\0"
|
||||
"Shavian\0"
|
||||
"Sinhala\0"
|
||||
"Sk\0"
|
||||
"Sm\0"
|
||||
"So\0"
|
||||
"Sundanese\0"
|
||||
"Syloti_Nagri\0"
|
||||
"Syriac\0"
|
||||
"Tagalog\0"
|
||||
"Tagbanwa\0"
|
||||
"Tai_Le\0"
|
||||
"Tamil\0"
|
||||
"Telugu\0"
|
||||
"Thaana\0"
|
||||
"Thai\0"
|
||||
"Tibetan\0"
|
||||
"Tifinagh\0"
|
||||
"Ugaritic\0"
|
||||
"Vai\0"
|
||||
"Yi\0"
|
||||
"Z\0"
|
||||
"Zl\0"
|
||||
"Zp\0"
|
||||
"Zs\0";
|
||||
|
||||
const ucp_type_table _pcre_utt[] = {
|
||||
{ "Any", PT_ANY, 0 },
|
||||
{ "Arabic", PT_SC, ucp_Arabic },
|
||||
{ "Armenian", PT_SC, ucp_Armenian },
|
||||
{ "Balinese", PT_SC, ucp_Balinese },
|
||||
{ "Bengali", PT_SC, ucp_Bengali },
|
||||
{ "Bopomofo", PT_SC, ucp_Bopomofo },
|
||||
{ "Braille", PT_SC, ucp_Braille },
|
||||
{ "Buginese", PT_SC, ucp_Buginese },
|
||||
{ "Buhid", PT_SC, ucp_Buhid },
|
||||
{ "C", PT_GC, ucp_C },
|
||||
{ "Canadian_Aboriginal", PT_SC, ucp_Canadian_Aboriginal },
|
||||
{ "Cc", PT_PC, ucp_Cc },
|
||||
{ "Cf", PT_PC, ucp_Cf },
|
||||
{ "Cherokee", PT_SC, ucp_Cherokee },
|
||||
{ "Cn", PT_PC, ucp_Cn },
|
||||
{ "Co", PT_PC, ucp_Co },
|
||||
{ "Common", PT_SC, ucp_Common },
|
||||
{ "Coptic", PT_SC, ucp_Coptic },
|
||||
{ "Cs", PT_PC, ucp_Cs },
|
||||
{ "Cuneiform", PT_SC, ucp_Cuneiform },
|
||||
{ "Cypriot", PT_SC, ucp_Cypriot },
|
||||
{ "Cyrillic", PT_SC, ucp_Cyrillic },
|
||||
{ "Deseret", PT_SC, ucp_Deseret },
|
||||
{ "Devanagari", PT_SC, ucp_Devanagari },
|
||||
{ "Ethiopic", PT_SC, ucp_Ethiopic },
|
||||
{ "Georgian", PT_SC, ucp_Georgian },
|
||||
{ "Glagolitic", PT_SC, ucp_Glagolitic },
|
||||
{ "Gothic", PT_SC, ucp_Gothic },
|
||||
{ "Greek", PT_SC, ucp_Greek },
|
||||
{ "Gujarati", PT_SC, ucp_Gujarati },
|
||||
{ "Gurmukhi", PT_SC, ucp_Gurmukhi },
|
||||
{ "Han", PT_SC, ucp_Han },
|
||||
{ "Hangul", PT_SC, ucp_Hangul },
|
||||
{ "Hanunoo", PT_SC, ucp_Hanunoo },
|
||||
{ "Hebrew", PT_SC, ucp_Hebrew },
|
||||
{ "Hiragana", PT_SC, ucp_Hiragana },
|
||||
{ "Inherited", PT_SC, ucp_Inherited },
|
||||
{ "Kannada", PT_SC, ucp_Kannada },
|
||||
{ "Katakana", PT_SC, ucp_Katakana },
|
||||
{ "Kharoshthi", PT_SC, ucp_Kharoshthi },
|
||||
{ "Khmer", PT_SC, ucp_Khmer },
|
||||
{ "L", PT_GC, ucp_L },
|
||||
{ "L&", PT_LAMP, 0 },
|
||||
{ "Lao", PT_SC, ucp_Lao },
|
||||
{ "Latin", PT_SC, ucp_Latin },
|
||||
{ "Limbu", PT_SC, ucp_Limbu },
|
||||
{ "Linear_B", PT_SC, ucp_Linear_B },
|
||||
{ "Ll", PT_PC, ucp_Ll },
|
||||
{ "Lm", PT_PC, ucp_Lm },
|
||||
{ "Lo", PT_PC, ucp_Lo },
|
||||
{ "Lt", PT_PC, ucp_Lt },
|
||||
{ "Lu", PT_PC, ucp_Lu },
|
||||
{ "M", PT_GC, ucp_M },
|
||||
{ "Malayalam", PT_SC, ucp_Malayalam },
|
||||
{ "Mc", PT_PC, ucp_Mc },
|
||||
{ "Me", PT_PC, ucp_Me },
|
||||
{ "Mn", PT_PC, ucp_Mn },
|
||||
{ "Mongolian", PT_SC, ucp_Mongolian },
|
||||
{ "Myanmar", PT_SC, ucp_Myanmar },
|
||||
{ "N", PT_GC, ucp_N },
|
||||
{ "Nd", PT_PC, ucp_Nd },
|
||||
{ "New_Tai_Lue", PT_SC, ucp_New_Tai_Lue },
|
||||
{ "Nko", PT_SC, ucp_Nko },
|
||||
{ "Nl", PT_PC, ucp_Nl },
|
||||
{ "No", PT_PC, ucp_No },
|
||||
{ "Ogham", PT_SC, ucp_Ogham },
|
||||
{ "Old_Italic", PT_SC, ucp_Old_Italic },
|
||||
{ "Old_Persian", PT_SC, ucp_Old_Persian },
|
||||
{ "Oriya", PT_SC, ucp_Oriya },
|
||||
{ "Osmanya", PT_SC, ucp_Osmanya },
|
||||
{ "P", PT_GC, ucp_P },
|
||||
{ "Pc", PT_PC, ucp_Pc },
|
||||
{ "Pd", PT_PC, ucp_Pd },
|
||||
{ "Pe", PT_PC, ucp_Pe },
|
||||
{ "Pf", PT_PC, ucp_Pf },
|
||||
{ "Phags_Pa", PT_SC, ucp_Phags_Pa },
|
||||
{ "Phoenician", PT_SC, ucp_Phoenician },
|
||||
{ "Pi", PT_PC, ucp_Pi },
|
||||
{ "Po", PT_PC, ucp_Po },
|
||||
{ "Ps", PT_PC, ucp_Ps },
|
||||
{ "Runic", PT_SC, ucp_Runic },
|
||||
{ "S", PT_GC, ucp_S },
|
||||
{ "Sc", PT_PC, ucp_Sc },
|
||||
{ "Shavian", PT_SC, ucp_Shavian },
|
||||
{ "Sinhala", PT_SC, ucp_Sinhala },
|
||||
{ "Sk", PT_PC, ucp_Sk },
|
||||
{ "Sm", PT_PC, ucp_Sm },
|
||||
{ "So", PT_PC, ucp_So },
|
||||
{ "Syloti_Nagri", PT_SC, ucp_Syloti_Nagri },
|
||||
{ "Syriac", PT_SC, ucp_Syriac },
|
||||
{ "Tagalog", PT_SC, ucp_Tagalog },
|
||||
{ "Tagbanwa", PT_SC, ucp_Tagbanwa },
|
||||
{ "Tai_Le", PT_SC, ucp_Tai_Le },
|
||||
{ "Tamil", PT_SC, ucp_Tamil },
|
||||
{ "Telugu", PT_SC, ucp_Telugu },
|
||||
{ "Thaana", PT_SC, ucp_Thaana },
|
||||
{ "Thai", PT_SC, ucp_Thai },
|
||||
{ "Tibetan", PT_SC, ucp_Tibetan },
|
||||
{ "Tifinagh", PT_SC, ucp_Tifinagh },
|
||||
{ "Ugaritic", PT_SC, ucp_Ugaritic },
|
||||
{ "Yi", PT_SC, ucp_Yi },
|
||||
{ "Z", PT_GC, ucp_Z },
|
||||
{ "Zl", PT_PC, ucp_Zl },
|
||||
{ "Zp", PT_PC, ucp_Zp },
|
||||
{ "Zs", PT_PC, ucp_Zs }
|
||||
{ 0, PT_ANY, 0 },
|
||||
{ 4, PT_SC, ucp_Arabic },
|
||||
{ 11, PT_SC, ucp_Armenian },
|
||||
{ 20, PT_SC, ucp_Balinese },
|
||||
{ 29, PT_SC, ucp_Bengali },
|
||||
{ 37, PT_SC, ucp_Bopomofo },
|
||||
{ 46, PT_SC, ucp_Braille },
|
||||
{ 54, PT_SC, ucp_Buginese },
|
||||
{ 63, PT_SC, ucp_Buhid },
|
||||
{ 69, PT_GC, ucp_C },
|
||||
{ 71, PT_SC, ucp_Canadian_Aboriginal },
|
||||
{ 91, PT_SC, ucp_Carian },
|
||||
{ 98, PT_PC, ucp_Cc },
|
||||
{ 101, PT_PC, ucp_Cf },
|
||||
{ 104, PT_SC, ucp_Cham },
|
||||
{ 109, PT_SC, ucp_Cherokee },
|
||||
{ 118, PT_PC, ucp_Cn },
|
||||
{ 121, PT_PC, ucp_Co },
|
||||
{ 124, PT_SC, ucp_Common },
|
||||
{ 131, PT_SC, ucp_Coptic },
|
||||
{ 138, PT_PC, ucp_Cs },
|
||||
{ 141, PT_SC, ucp_Cuneiform },
|
||||
{ 151, PT_SC, ucp_Cypriot },
|
||||
{ 159, PT_SC, ucp_Cyrillic },
|
||||
{ 168, PT_SC, ucp_Deseret },
|
||||
{ 176, PT_SC, ucp_Devanagari },
|
||||
{ 187, PT_SC, ucp_Ethiopic },
|
||||
{ 196, PT_SC, ucp_Georgian },
|
||||
{ 205, PT_SC, ucp_Glagolitic },
|
||||
{ 216, PT_SC, ucp_Gothic },
|
||||
{ 223, PT_SC, ucp_Greek },
|
||||
{ 229, PT_SC, ucp_Gujarati },
|
||||
{ 238, PT_SC, ucp_Gurmukhi },
|
||||
{ 247, PT_SC, ucp_Han },
|
||||
{ 251, PT_SC, ucp_Hangul },
|
||||
{ 258, PT_SC, ucp_Hanunoo },
|
||||
{ 266, PT_SC, ucp_Hebrew },
|
||||
{ 273, PT_SC, ucp_Hiragana },
|
||||
{ 282, PT_SC, ucp_Inherited },
|
||||
{ 292, PT_SC, ucp_Kannada },
|
||||
{ 300, PT_SC, ucp_Katakana },
|
||||
{ 309, PT_SC, ucp_Kayah_Li },
|
||||
{ 318, PT_SC, ucp_Kharoshthi },
|
||||
{ 329, PT_SC, ucp_Khmer },
|
||||
{ 335, PT_GC, ucp_L },
|
||||
{ 337, PT_LAMP, 0 },
|
||||
{ 340, PT_SC, ucp_Lao },
|
||||
{ 344, PT_SC, ucp_Latin },
|
||||
{ 350, PT_SC, ucp_Lepcha },
|
||||
{ 357, PT_SC, ucp_Limbu },
|
||||
{ 363, PT_SC, ucp_Linear_B },
|
||||
{ 372, PT_PC, ucp_Ll },
|
||||
{ 375, PT_PC, ucp_Lm },
|
||||
{ 378, PT_PC, ucp_Lo },
|
||||
{ 381, PT_PC, ucp_Lt },
|
||||
{ 384, PT_PC, ucp_Lu },
|
||||
{ 387, PT_SC, ucp_Lycian },
|
||||
{ 394, PT_SC, ucp_Lydian },
|
||||
{ 401, PT_GC, ucp_M },
|
||||
{ 403, PT_SC, ucp_Malayalam },
|
||||
{ 413, PT_PC, ucp_Mc },
|
||||
{ 416, PT_PC, ucp_Me },
|
||||
{ 419, PT_PC, ucp_Mn },
|
||||
{ 422, PT_SC, ucp_Mongolian },
|
||||
{ 432, PT_SC, ucp_Myanmar },
|
||||
{ 440, PT_GC, ucp_N },
|
||||
{ 442, PT_PC, ucp_Nd },
|
||||
{ 445, PT_SC, ucp_New_Tai_Lue },
|
||||
{ 457, PT_SC, ucp_Nko },
|
||||
{ 461, PT_PC, ucp_Nl },
|
||||
{ 464, PT_PC, ucp_No },
|
||||
{ 467, PT_SC, ucp_Ogham },
|
||||
{ 473, PT_SC, ucp_Ol_Chiki },
|
||||
{ 482, PT_SC, ucp_Old_Italic },
|
||||
{ 493, PT_SC, ucp_Old_Persian },
|
||||
{ 505, PT_SC, ucp_Oriya },
|
||||
{ 511, PT_SC, ucp_Osmanya },
|
||||
{ 519, PT_GC, ucp_P },
|
||||
{ 521, PT_PC, ucp_Pc },
|
||||
{ 524, PT_PC, ucp_Pd },
|
||||
{ 527, PT_PC, ucp_Pe },
|
||||
{ 530, PT_PC, ucp_Pf },
|
||||
{ 533, PT_SC, ucp_Phags_Pa },
|
||||
{ 542, PT_SC, ucp_Phoenician },
|
||||
{ 553, PT_PC, ucp_Pi },
|
||||
{ 556, PT_PC, ucp_Po },
|
||||
{ 559, PT_PC, ucp_Ps },
|
||||
{ 562, PT_SC, ucp_Rejang },
|
||||
{ 569, PT_SC, ucp_Runic },
|
||||
{ 575, PT_GC, ucp_S },
|
||||
{ 577, PT_SC, ucp_Saurashtra },
|
||||
{ 588, PT_PC, ucp_Sc },
|
||||
{ 591, PT_SC, ucp_Shavian },
|
||||
{ 599, PT_SC, ucp_Sinhala },
|
||||
{ 607, PT_PC, ucp_Sk },
|
||||
{ 610, PT_PC, ucp_Sm },
|
||||
{ 613, PT_PC, ucp_So },
|
||||
{ 616, PT_SC, ucp_Sundanese },
|
||||
{ 626, PT_SC, ucp_Syloti_Nagri },
|
||||
{ 639, PT_SC, ucp_Syriac },
|
||||
{ 646, PT_SC, ucp_Tagalog },
|
||||
{ 654, PT_SC, ucp_Tagbanwa },
|
||||
{ 663, PT_SC, ucp_Tai_Le },
|
||||
{ 670, PT_SC, ucp_Tamil },
|
||||
{ 676, PT_SC, ucp_Telugu },
|
||||
{ 683, PT_SC, ucp_Thaana },
|
||||
{ 690, PT_SC, ucp_Thai },
|
||||
{ 695, PT_SC, ucp_Tibetan },
|
||||
{ 703, PT_SC, ucp_Tifinagh },
|
||||
{ 712, PT_SC, ucp_Ugaritic },
|
||||
{ 721, PT_SC, ucp_Vai },
|
||||
{ 725, PT_SC, ucp_Yi },
|
||||
{ 728, PT_GC, ucp_Z },
|
||||
{ 730, PT_PC, ucp_Zl },
|
||||
{ 733, PT_PC, ucp_Zp },
|
||||
{ 736, PT_PC, ucp_Zs }
|
||||
};
|
||||
|
||||
const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table);
|
||||
|
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -43,6 +43,7 @@ see if it was compiled with the opposite endianness. If so, it uses an
|
||||
auxiliary local function to flip the appropriate bytes. */
|
||||
|
||||
|
||||
#include "pcre_config.h"
|
||||
#include "pcre_internal.h"
|
||||
|
||||
|
||||
@@ -104,6 +105,7 @@ if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER)
|
||||
*internal_re = *re; /* To copy other fields */
|
||||
internal_re->size = byteflip(re->size, sizeof(re->size));
|
||||
internal_re->options = byteflip(re->options, sizeof(re->options));
|
||||
internal_re->flags = (pcre_uint16)byteflip(re->flags, sizeof(re->flags));
|
||||
internal_re->top_bracket =
|
||||
(pcre_uint16)byteflip(re->top_bracket, sizeof(re->top_bracket));
|
||||
internal_re->top_backref =
|
||||
|
2608
Foundation/src/pcre_ucd.c
Normal file
2608
Foundation/src/pcre_ucd.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,175 +0,0 @@
|
||||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
|
||||
/* This module contains code for searching the table of Unicode character
|
||||
properties. */
|
||||
|
||||
#include "pcre_internal.h"
|
||||
|
||||
#include "ucp.h" /* Category definitions */
|
||||
#include "ucpinternal.h" /* Internal table details */
|
||||
#include "ucptable.h" /* The table itself */
|
||||
|
||||
|
||||
/* Table to translate from particular type value to the general value. */
|
||||
|
||||
static const int ucp_gentype[] = {
|
||||
ucp_C, ucp_C, ucp_C, ucp_C, ucp_C, /* Cc, Cf, Cn, Co, Cs */
|
||||
ucp_L, ucp_L, ucp_L, ucp_L, ucp_L, /* Ll, Lu, Lm, Lo, Lt */
|
||||
ucp_M, ucp_M, ucp_M, /* Mc, Me, Mn */
|
||||
ucp_N, ucp_N, ucp_N, /* Nd, Nl, No */
|
||||
ucp_P, ucp_P, ucp_P, ucp_P, ucp_P, /* Pc, Pd, Pe, Pf, Pi */
|
||||
ucp_P, ucp_P, /* Ps, Po */
|
||||
ucp_S, ucp_S, ucp_S, ucp_S, /* Sc, Sk, Sm, So */
|
||||
ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Search table and return type *
|
||||
*************************************************/
|
||||
|
||||
/* Three values are returned: the category is ucp_C, ucp_L, etc. The detailed
|
||||
character type is ucp_Lu, ucp_Nd, etc. The script is ucp_Latin, etc.
|
||||
|
||||
Arguments:
|
||||
c the character value
|
||||
type_ptr the detailed character type is returned here
|
||||
script_ptr the script is returned here
|
||||
|
||||
Returns: the character type category
|
||||
*/
|
||||
|
||||
int
|
||||
_pcre_ucp_findprop(const unsigned int c, int *type_ptr, int *script_ptr)
|
||||
{
|
||||
int bot = 0;
|
||||
int top = sizeof(ucp_table)/sizeof(cnode);
|
||||
int mid;
|
||||
|
||||
/* The table is searched using a binary chop. You might think that using
|
||||
intermediate variables to hold some of the common expressions would speed
|
||||
things up, but tests with gcc 3.4.4 on Linux showed that, on the contrary, it
|
||||
makes things a lot slower. */
|
||||
|
||||
for (;;)
|
||||
{
|
||||
if (top <= bot)
|
||||
{
|
||||
*type_ptr = ucp_Cn;
|
||||
*script_ptr = ucp_Common;
|
||||
return ucp_C;
|
||||
}
|
||||
mid = (bot + top) >> 1;
|
||||
if (c == (ucp_table[mid].f0 & f0_charmask)) break;
|
||||
if (c < (ucp_table[mid].f0 & f0_charmask)) top = mid;
|
||||
else
|
||||
{
|
||||
if ((ucp_table[mid].f0 & f0_rangeflag) != 0 &&
|
||||
c <= (ucp_table[mid].f0 & f0_charmask) +
|
||||
(ucp_table[mid].f1 & f1_rangemask)) break;
|
||||
bot = mid + 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Found an entry in the table. Set the script and detailed type values, and
|
||||
return the general type. */
|
||||
|
||||
*script_ptr = (ucp_table[mid].f0 & f0_scriptmask) >> f0_scriptshift;
|
||||
*type_ptr = (ucp_table[mid].f1 & f1_typemask) >> f1_typeshift;
|
||||
|
||||
return ucp_gentype[*type_ptr];
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Search table and return other case *
|
||||
*************************************************/
|
||||
|
||||
/* If the given character is a letter, and there is another case for the
|
||||
letter, return the other case. Otherwise, return -1.
|
||||
|
||||
Arguments:
|
||||
c the character value
|
||||
|
||||
Returns: the other case or NOTACHAR if none
|
||||
*/
|
||||
|
||||
unsigned int
|
||||
_pcre_ucp_othercase(const unsigned int c)
|
||||
{
|
||||
int bot = 0;
|
||||
int top = sizeof(ucp_table)/sizeof(cnode);
|
||||
int mid, offset;
|
||||
|
||||
/* The table is searched using a binary chop. You might think that using
|
||||
intermediate variables to hold some of the common expressions would speed
|
||||
things up, but tests with gcc 3.4.4 on Linux showed that, on the contrary, it
|
||||
makes things a lot slower. */
|
||||
|
||||
for (;;)
|
||||
{
|
||||
if (top <= bot) return -1;
|
||||
mid = (bot + top) >> 1;
|
||||
if (c == (ucp_table[mid].f0 & f0_charmask)) break;
|
||||
if (c < (ucp_table[mid].f0 & f0_charmask)) top = mid;
|
||||
else
|
||||
{
|
||||
if ((ucp_table[mid].f0 & f0_rangeflag) != 0 &&
|
||||
c <= (ucp_table[mid].f0 & f0_charmask) +
|
||||
(ucp_table[mid].f1 & f1_rangemask)) break;
|
||||
bot = mid + 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Found an entry in the table. Return NOTACHAR for a range entry. Otherwise
|
||||
return the other case if there is one, else NOTACHAR. */
|
||||
|
||||
if ((ucp_table[mid].f0 & f0_rangeflag) != 0) return NOTACHAR;
|
||||
|
||||
offset = ucp_table[mid].f1 & f1_casemask;
|
||||
if ((offset & f1_caseneg) != 0) offset |= f1_caseneg;
|
||||
return (offset == 0)? NOTACHAR : c + offset;
|
||||
}
|
||||
|
||||
|
||||
/* End of pcre_ucp_searchfuncs.c */
|
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -42,6 +42,7 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
strings. */
|
||||
|
||||
|
||||
#include "pcre_config.h"
|
||||
#include "pcre_internal.h"
|
||||
|
||||
|
||||
@@ -55,6 +56,13 @@ that subsequent code can assume it is dealing with a valid string. The check
|
||||
can be turned off for maximum performance, but the consequences of supplying
|
||||
an invalid string are then undefined.
|
||||
|
||||
Originally, this function checked according to RFC 2279, allowing for values in
|
||||
the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in
|
||||
the canonical format. Once somebody had pointed out RFC 3629 to me (it
|
||||
obsoletes 2279), additional restrictions were applied. The values are now
|
||||
limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
|
||||
subrange 0xd800 to 0xdfff is excluded.
|
||||
|
||||
Arguments:
|
||||
string points to the string
|
||||
length length of string, or -1 if the string is zero-terminated
|
||||
@@ -81,31 +89,48 @@ for (p = string; length-- > 0; p++)
|
||||
register int c = *p;
|
||||
if (c < 128) continue;
|
||||
if (c < 0xc0) return p - string;
|
||||
ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
|
||||
if (length < ab) return p - string;
|
||||
ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
|
||||
if (length < ab || ab > 3) return p - string;
|
||||
length -= ab;
|
||||
|
||||
/* Check top bits in the second byte */
|
||||
if ((*(++p) & 0xc0) != 0x80) return p - string;
|
||||
|
||||
/* Check for overlong sequences for each different length */
|
||||
/* Check for overlong sequences for each different length, and for the
|
||||
excluded range 0xd800 to 0xdfff. */
|
||||
|
||||
switch (ab)
|
||||
{
|
||||
/* Check for xx00 000x */
|
||||
/* Check for xx00 000x (overlong sequence) */
|
||||
|
||||
case 1:
|
||||
if ((c & 0x3e) == 0) return p - string;
|
||||
continue; /* We know there aren't any more bytes to check */
|
||||
|
||||
/* Check for 1110 0000, xx0x xxxx */
|
||||
/* Check for 1110 0000, xx0x xxxx (overlong sequence) or
|
||||
1110 1101, 101x xxxx (0xd800 - 0xdfff) */
|
||||
|
||||
case 2:
|
||||
if (c == 0xe0 && (*p & 0x20) == 0) return p - string;
|
||||
if ((c == 0xe0 && (*p & 0x20) == 0) ||
|
||||
(c == 0xed && *p >= 0xa0))
|
||||
return p - string;
|
||||
break;
|
||||
|
||||
/* Check for 1111 0000, xx00 xxxx */
|
||||
/* Check for 1111 0000, xx00 xxxx (overlong sequence) or
|
||||
greater than 0x0010ffff (f4 8f bf bf) */
|
||||
|
||||
case 3:
|
||||
if (c == 0xf0 && (*p & 0x30) == 0) return p - string;
|
||||
if ((c == 0xf0 && (*p & 0x30) == 0) ||
|
||||
(c > 0xf4 ) ||
|
||||
(c == 0xf4 && *p > 0x8f))
|
||||
return p - string;
|
||||
break;
|
||||
|
||||
#if 0
|
||||
/* These cases can no longer occur, as we restrict to a maximum of four
|
||||
bytes nowadays. Leave the code here in case we ever want to add an option
|
||||
for longer sequences. */
|
||||
|
||||
/* Check for 1111 1000, xx00 0xxx */
|
||||
case 4:
|
||||
if (c == 0xf8 && (*p & 0x38) == 0) return p - string;
|
||||
@@ -116,6 +141,8 @@ for (p = string; length-- > 0; p++)
|
||||
if (c == 0xfe || c == 0xff ||
|
||||
(c == 0xfc && (*p & 0x3c) == 0)) return p - string;
|
||||
break;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
/* Check for valid bytes after the 2nd, if any; all must start 10 */
|
||||
@@ -124,6 +151,9 @@ for (p = string; length-- > 0; p++)
|
||||
if ((*(++p) & 0xc0) != 0x80) return p - string;
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)(string); /* Keep picky compilers happy */
|
||||
(void)(length);
|
||||
#endif
|
||||
|
||||
return -1;
|
||||
|
@@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2007 University of Cambridge
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -43,6 +43,7 @@ class (one that contains characters whose values are > 255). It is used by both
|
||||
pcre_exec() and pcre_dfa_exec(). */
|
||||
|
||||
|
||||
#include "pcre_config.h"
|
||||
#include "pcre_internal.h"
|
||||
|
||||
|
||||
@@ -100,8 +101,7 @@ while ((t = *data++) != XCL_END)
|
||||
#ifdef SUPPORT_UCP
|
||||
else /* XCL_PROP & XCL_NOTPROP */
|
||||
{
|
||||
int chartype, script;
|
||||
int category = _pcre_ucp_findprop(c, &chartype, &script);
|
||||
const ucd_record * prop = GET_UCD(c);
|
||||
|
||||
switch(*data)
|
||||
{
|
||||
@@ -110,20 +110,20 @@ while ((t = *data++) != XCL_END)
|
||||
break;
|
||||
|
||||
case PT_LAMP:
|
||||
if ((chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt) ==
|
||||
if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt) ==
|
||||
(t == XCL_PROP)) return !negated;
|
||||
break;
|
||||
|
||||
case PT_GC:
|
||||
if ((data[1] == category) == (t == XCL_PROP)) return !negated;
|
||||
if ((data[1] == _pcre_ucp_gentype[prop->chartype]) == (t == XCL_PROP)) return !negated;
|
||||
break;
|
||||
|
||||
case PT_PC:
|
||||
if ((data[1] == chartype) == (t == XCL_PROP)) return !negated;
|
||||
if ((data[1] == prop->chartype) == (t == XCL_PROP)) return !negated;
|
||||
break;
|
||||
|
||||
case PT_SC:
|
||||
if ((data[1] == script) == (t == XCL_PROP)) return !negated;
|
||||
if ((data[1] == prop->script) == (t == XCL_PROP)) return !negated;
|
||||
break;
|
||||
|
||||
/* This should never occur, but compilers may mutter if there is no
|
||||
|
@@ -121,11 +121,24 @@ enum {
|
||||
ucp_Tifinagh,
|
||||
ucp_Ugaritic,
|
||||
ucp_Yi,
|
||||
ucp_Balinese, /* New for Unicode 5.0.0 */
|
||||
ucp_Cuneiform, /* New for Unicode 5.0.0 */
|
||||
ucp_Nko, /* New for Unicode 5.0.0 */
|
||||
ucp_Phags_Pa, /* New for Unicode 5.0.0 */
|
||||
ucp_Phoenician /* New for Unicode 5.0.0 */
|
||||
/* New for Unicode 5.0: */
|
||||
ucp_Balinese,
|
||||
ucp_Cuneiform,
|
||||
ucp_Nko,
|
||||
ucp_Phags_Pa,
|
||||
ucp_Phoenician,
|
||||
/* New for Unicode 5.1: */
|
||||
ucp_Carian,
|
||||
ucp_Cham,
|
||||
ucp_Kayah_Li,
|
||||
ucp_Lepcha,
|
||||
ucp_Lycian,
|
||||
ucp_Lydian,
|
||||
ucp_Ol_Chiki,
|
||||
ucp_Rejang,
|
||||
ucp_Saurashtra,
|
||||
ucp_Sundanese,
|
||||
ucp_Vai
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@@ -1,92 +0,0 @@
|
||||
/*************************************************
|
||||
* Unicode Property Table handler *
|
||||
*************************************************/
|
||||
|
||||
#ifndef _UCPINTERNAL_H
|
||||
#define _UCPINTERNAL_H
|
||||
|
||||
/* Internal header file defining the layout of the bits in each pair of 32-bit
|
||||
words that form a data item in the table. */
|
||||
|
||||
typedef struct cnode {
|
||||
pcre_uint32 f0;
|
||||
pcre_uint32 f1;
|
||||
} cnode;
|
||||
|
||||
/* Things for the f0 field */

#define f0_scriptmask   0xff000000  /* Mask for script field */
#define f0_scriptshift          24  /* Shift for script value */

/* The range flag is the single 0x00800000 bit. It must not share any bit with
f0_charmask, otherwise a single-character entry with a large code point would
be mistaken for a range entry. */

#define f0_rangeflag    0x00800000  /* Flag for a range item */
#define f0_charmask     0x001fffff  /* Mask for code point value */

/* Things for the f1 field */

#define f1_typemask     0xfc000000  /* Mask for char type field */
#define f1_typeshift            26  /* Shift for the type field */
#define f1_rangemask    0x0000ffff  /* Mask for a range offset */
#define f1_casemask     0x0000ffff  /* Mask for a case offset */

/* Bit 15 (the top bit of the 16-bit case offset) together with all higher
bits: used both to test whether the offset is negative and to extend it. */

#define f1_caseneg      0xffff8000  /* Bits for negation */
|
||||
|
||||
/* The data consists of a vector of structures of type cnode. The two unsigned
|
||||
32-bit integers are used as follows:
|
||||
|
||||
(f0) (1) The most significant byte holds the script number. The numbers are
|
||||
defined by the enum in ucp.h.
|
||||
|
||||
(2) The 0x00800000 bit is set if this entry defines a range of characters.
|
||||
It is not set if this entry defines a single character
|
||||
|
||||
(3) The 0x00600000 bits are spare.
|
||||
|
||||
(4) The 0x001fffff bits contain the code point. No Unicode code point will
|
||||
ever be greater than 0x0010ffff, so this should be OK for ever.
|
||||
|
||||
(f1) (1) The 0xfc000000 bits contain the character type number. The numbers are
|
||||
defined by an enum in ucp.h.
|
||||
|
||||
(2) The 0x03ff0000 bits are spare.
|
||||
|
||||
(3) The 0x0000ffff bits contain EITHER the unsigned offset to the top of
|
||||
range if this entry defines a range, OR the *signed* offset to the
|
||||
character's "other case" partner if this entry defines a single
|
||||
character. There is no partner if the value is zero.
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
| script (8) |.|.|.| codepoint (21) || type (6) |.|.| spare (8) | offset (16) |
|
||||
-------------------------------------------------------------------------------
|
||||
| | | | |
|
||||
| | |-> spare | |-> spare
|
||||
| | |
|
||||
| |-> spare |-> spare
|
||||
|
|
||||
|-> range flag
|
||||
|
||||
The upper/lower casing information is set only for characters that come in
|
||||
pairs. The non-one-to-one mappings in the Unicode data are ignored.
|
||||
|
||||
When searching the data, proceed as follows:
|
||||
|
||||
(1) Set up for a binary chop search.
|
||||
|
||||
(2) If the top is not greater than the bottom, the character is not in the
|
||||
table. Its type must therefore be "Cn" ("Undefined").
|
||||
|
||||
(3) Find the middle vector element.
|
||||
|
||||
(4) Extract the code point and compare. If equal, we are done.
|
||||
|
||||
(5) If the test character is smaller, set the top to the current point, and
|
||||
goto (2).
|
||||
|
||||
(6) If the current entry defines a range, compute the last character by adding
|
||||
the offset, and see if the test character is within the range. If it is,
|
||||
we are done.
|
||||
|
||||
(7) Otherwise, set the bottom to one element past the current point and goto
|
||||
(2).
|
||||
*/
|
||||
|
||||
#endif /* _UCPINTERNAL_H */
|
||||
|
||||
/* End of ucpinternal.h */
|
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user