[webkit-changes] cvs commit: JavaScriptCore/pcre pcre-config.h
pcre.h pcre_compile.c pcre_internal.h pcre_valid_utf8.c
Darin
darin at opensource.apple.com
Mon Aug 15 09:09:55 PDT 2005
darin 05/08/15 09:09:55
Modified: pcre Tag: pcre-6-1-branch pcre-config.h pcre.h
pcre_compile.c pcre_internal.h
Removed: pcre Tag: pcre-6-1-branch pcre_valid_utf8.c
Log:
A few first steps porting the new PCRE 6.1 to KJS and to UTF-16.
Revision Changes Path
No revision
No revision
1.1.84.1 +97 -3 JavaScriptCore/pcre/pcre-config.h
Index: pcre-config.h
===================================================================
RCS file: /cvs/root/JavaScriptCore/pcre/pcre-config.h,v
retrieving revision 1.1
retrieving revision 1.1.84.1
diff -u -r1.1 -r1.1.84.1
--- pcre-config.h 4 Dec 2002 21:57:20 -0000 1.1
+++ pcre-config.h 15 Aug 2005 16:09:54 -0000 1.1.84.1
@@ -1,5 +1,99 @@
-#define HAVE_MEMMOVE 1
-#define HAVE_BCOPY 1
+/* On Unix systems config.in is converted by configure into config.h. PCRE is
+written in Standard C, but there are a few non-standard things it can cope
+with, allowing it to run on SunOS4 and other "close to standard" systems.
+
+On a non-Unix system you should just copy this file into config.h, and set up
+the macros the way you need them. You should normally change the definitions of
+HAVE_STRERROR and HAVE_MEMMOVE to 1. Unfortunately, because of the way autoconf
+works, these cannot be made the defaults. If your system has bcopy() and not
+memmove(), change the definition of HAVE_BCOPY instead of HAVE_MEMMOVE. If your
+system has neither bcopy() nor memmove(), leave them both as 0; an emulation
+function will be used. */
+
+/* If you are compiling for a system that uses EBCDIC instead of ASCII
+character codes, define this macro as 1. On systems that can use "configure",
+this can be done via --enable-ebcdic. */
+
+#define EBCDIC 0
+
+/* If you are compiling for a system that needs some magic to be inserted
+before the definition of an exported function, define this macro to contain the
+relevant magic. It appears at the start of every exported function. */
+
+#define EXPORT
+
+/* Define to empty if the "const" keyword does not work. */
+
+#undef const
+
+/* Define to "unsigned" if <stddef.h> doesn't define size_t. */
+
+#undef size_t
+
+/* The following two definitions are mainly for the benefit of SunOS4, which
+doesn't have the strerror() or memmove() functions that should be present in
+all Standard C libraries. The macros HAVE_STRERROR and HAVE_MEMMOVE should
+normally be defined with the value 1 for other systems, but unfortunately we
+can't make this the default because "configure" files generated by autoconf
+will only change 0 to 1; they won't change 1 to 0 if the functions are not
+found. */
+
#define HAVE_STRERROR 1
-#define NEWLINE 10
+#define HAVE_MEMMOVE 1
+
+/* There are some non-Unix systems that don't even have bcopy(). If this macro
+is false, an emulation is used. If HAVE_MEMMOVE is set to 1, the value of
+HAVE_BCOPY is not relevant. */
+
+#define HAVE_BCOPY 1
+
+/* The value of NEWLINE determines the newline character. The default is to
+leave it up to the compiler, but some sites want to force a particular value.
+On Unix systems, "configure" can be used to override this default. */
+
+#define NEWLINE '\n'
+
+/* The value of LINK_SIZE determines the number of bytes used to store
+links as offsets within the compiled regex. The default is 2, which allows for
+compiled patterns up to 64K long. This covers the vast majority of cases.
+However, PCRE can also be compiled to use 3 or 4 bytes instead. This allows for
+longer patterns in extreme cases. On Unix systems, "configure" can be used to
+override this default. */
+
+#define LINK_SIZE 2
+
+/* The value of MATCH_LIMIT determines the default number of times the match()
+function can be called during a single execution of pcre_exec(). (There is a
+runtime method of setting a different limit.) The limit exists in order to
+catch runaway regular expressions that take for ever to determine that they do
+not match. The default is set very large so that it does not accidentally catch
+legitimate cases. On Unix systems, "configure" can be used to override this
+default. */
+
+#define MATCH_LIMIT 10000000
+
+/* When calling PCRE via the POSIX interface, additional working storage is
+required for holding the pointers to capturing substrings because PCRE requires
+three integers per substring, whereas the POSIX interface provides only two. If
+the number of expected substrings is small, the wrapper function uses space on
+the stack, because this is faster than using malloc() for each call. The
+threshold above which the stack is no longer used is defined by POSIX_MALLOC_
+THRESHOLD. On Unix systems, "configure" can be used to override this default.
+*/
+
+#define POSIX_MALLOC_THRESHOLD 10
+
+/* PCRE uses recursive function calls to handle backtracking while matching.
+This can sometimes be a problem on systems that have stacks of limited size.
+Define NO_RECURSE to get a version that doesn't use recursion in the match()
+function; instead it creates its own stack by steam using pcre_recurse_malloc
+to get memory. For more detail, see comments and other stuff just above the
+match() function. On Unix systems, "configure" can be used to set this in the
+Makefile (use --disable-stack-for-recursion). */
+
+/* #define NO_RECURSE */
+
+/* End */
+
+#define SUPPORT_UCP 1
#define SUPPORT_UTF8 1
1.5.2.2 +52 -22 JavaScriptCore/pcre/pcre.h
Index: pcre.h
===================================================================
RCS file: /cvs/root/JavaScriptCore/pcre/pcre.h,v
retrieving revision 1.5.2.1
retrieving revision 1.5.2.2
diff -u -r1.5.2.1 -r1.5.2.2
--- pcre.h 12 Aug 2005 22:23:49 -0000 1.5.2.1
+++ pcre.h 15 Aug 2005 16:09:54 -0000 1.5.2.2
@@ -6,6 +6,7 @@
"configure" into pcre.h.
Copyright (c) 1997-2005 University of Cambridge
+ Copyright (c) 2004, 2005 Apple Computer, Inc.
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -39,12 +40,35 @@
#ifndef _PCRE_H
#define _PCRE_H
-/* The file pcre.h is build by "configure". Do not edit it; instead
-make changes to pcre.in. */
+#define pcre_callout kjs_pcre_callout
+#define pcre_compile kjs_pcre_compile
+#define pcre_compile2 kjs_pcre_compile2
+#define pcre_config kjs_pcre_config
+#define pcre_copy_named_substring kjs_pcre_copy_named_substring
+#define pcre_copy_substring kjs_pcre_copy_substring
+#define pcre_dfa_exec kjs_pcre_dfa_exec
+#define pcre_exec kjs_pcre_exec
+#define pcre_free kjs_pcre_free
+#define pcre_free_substring kjs_pcre_free_substring
+#define pcre_free_substring_list kjs_pcre_free_substring_list
+#define pcre_fullinfo kjs_pcre_fullinfo
+#define pcre_get_named_substring kjs_pcre_get_named_substring
+#define pcre_get_substring kjs_pcre_get_substring
+#define pcre_get_substring_list kjs_pcre_get_substring_list
+#define pcre_info kjs_pcre_info
+#define pcre_maketables kjs_pcre_maketables
+#define pcre_malloc kjs_pcre_malloc
+#define pcre_refcount kjs_pcre_refcount
+#define pcre_stack_free kjs_pcre_stack_free
+#define pcre_stack_malloc kjs_pcre_stack_malloc
+#define pcre_study kjs_pcre_study
+#define pcre_version kjs_pcre_version
+
+#define PCRE_MAJOR 6
+#define PCRE_MINOR 1
+#define PCRE_DATE 21-Jun-2005
-#define PCRE_MAJOR @PCRE_MAJOR@
-#define PCRE_MINOR @PCRE_MINOR@
-#define PCRE_DATE @PCRE_DATE@
+#define PCRE_UTF16 1
/* Win32 uses DLL by default; it needs special stuff for exported functions. */
@@ -161,6 +185,12 @@
/* Types */
+#if PCRE_UTF16
+typedef unsigned short pcre_char;
+#else
+typedef char pcre_char;
+#endif
+
struct real_pcre; /* declaration; the definition is private */
typedef struct real_pcre pcre;
@@ -186,7 +216,7 @@
/* ------------------------ Version 0 ------------------------------- */
int callout_number; /* Number compiled into pattern */
int *offset_vector; /* The offset vector */
- const char *subject; /* The subject being matched */
+ const pcre_char *subject; /* The subject being matched */
int subject_length; /* The length of the subject */
int start_match; /* Offset to start of this match attempt */
int current_position; /* Where we currently are in the subject */
@@ -221,30 +251,30 @@
/* Exported PCRE functions */
-PCRE_DATA_SCOPE pcre *pcre_compile(const char *, int, const char **, int *,
+PCRE_DATA_SCOPE pcre *pcre_compile(const pcre_char *, int, const char **, int *,
const unsigned char *);
-PCRE_DATA_SCOPE pcre *pcre_compile2(const char *, int, int *, const char **,
+PCRE_DATA_SCOPE pcre *pcre_compile2(const pcre_char *, int, int *, const char **,
int *, const unsigned char *);
PCRE_DATA_SCOPE int pcre_config(int, void *);
-PCRE_DATA_SCOPE int pcre_copy_named_substring(const pcre *, const char *,
- int *, int, const char *, char *, int);
-PCRE_DATA_SCOPE int pcre_copy_substring(const char *, int *, int, int, char *,
+PCRE_DATA_SCOPE int pcre_copy_named_substring(const pcre *, const pcre_char *,
+ int *, int, const pcre_char *, pcre_char *, int);
+PCRE_DATA_SCOPE int pcre_copy_substring(const pcre_char *, int *, int, int, pcre_char *,
int);
PCRE_DATA_SCOPE int pcre_dfa_exec(const pcre *, const pcre_extra *,
- const char *, int, int, int, int *, int , int *, int);
-PCRE_DATA_SCOPE int pcre_exec(const pcre *, const pcre_extra *, const char *,
+ const pcre_char *, int, int, int, int *, int , int *, int);
+PCRE_DATA_SCOPE int pcre_exec(const pcre *, const pcre_extra *, const pcre_char *,
int, int, int, int *, int);
-PCRE_DATA_SCOPE void pcre_free_substring(const char *);
-PCRE_DATA_SCOPE void pcre_free_substring_list(const char **);
+PCRE_DATA_SCOPE void pcre_free_substring(const pcre_char *);
+PCRE_DATA_SCOPE void pcre_free_substring_list(const pcre_char **);
PCRE_DATA_SCOPE int pcre_fullinfo(const pcre *, const pcre_extra *, int,
void *);
-PCRE_DATA_SCOPE int pcre_get_named_substring(const pcre *, const char *,
- int *, int, const char *, const char **);
-PCRE_DATA_SCOPE int pcre_get_stringnumber(const pcre *, const char *);
-PCRE_DATA_SCOPE int pcre_get_substring(const char *, int *, int, int,
- const char **);
-PCRE_DATA_SCOPE int pcre_get_substring_list(const char *, int *, int,
- const char ***);
+PCRE_DATA_SCOPE int pcre_get_named_substring(const pcre *, const pcre_char *,
+ int *, int, const pcre_char *, const pcre_char **);
+PCRE_DATA_SCOPE int pcre_get_stringnumber(const pcre *, const pcre_char *);
+PCRE_DATA_SCOPE int pcre_get_substring(const pcre_char *, int *, int, int,
+ const pcre_char **);
+PCRE_DATA_SCOPE int pcre_get_substring_list(const pcre_char *, int *, int,
+ const pcre_char ***);
PCRE_DATA_SCOPE int pcre_info(const pcre *, int *, int *);
PCRE_DATA_SCOPE const unsigned char *pcre_maketables(void);
PCRE_DATA_SCOPE int pcre_refcount(pcre *, int);
1.1.2.1 +86 -48 JavaScriptCore/pcre/Attic/pcre_compile.c
Index: pcre_compile.c
===================================================================
RCS file: /cvs/root/JavaScriptCore/pcre/Attic/pcre_compile.c,v
retrieving revision 1.1
retrieving revision 1.1.2.1
diff -u -r1.1 -r1.1.2.1
--- pcre_compile.c 12 Aug 2005 22:13:26 -0000 1.1
+++ pcre_compile.c 15 Aug 2005 16:09:54 -0000 1.1.2.1
@@ -7,6 +7,7 @@
Written by Philip Hazel
Copyright (c) 1997-2005 University of Cambridge
+ Copyright (c) 2004, 2005 Apple Computer, Inc.
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -110,7 +111,7 @@
terminated by a zero length entry. The first three must be alpha, upper, lower,
as this is assumed for handling case independence. */
-static const char *const posix_names[] = {
+static const char * const posix_names[] = { /* const pointers to const text */
"alpha", "lower", "upper",
"alnum", "ascii", "blank", "cntrl", "digit", "graph",
"print", "punct", "space", "word", "xdigit" };
@@ -143,7 +144,7 @@
/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. */
-static const char *error_texts[] = {
+static const char * const error_texts[] = {
"no error",
"\\ at end of pattern",
"\\c at end of pattern",
@@ -196,7 +197,11 @@
"unrecognized character after (?P",
"syntax error after (?P",
"two named groups have the same name",
+#if PCRE_UTF16
+ "invalid UTF-16 string",
+#else
"invalid UTF-8 string",
+#endif
/* 45 */
"support for \\P, \\p, and \\X has not been compiled",
"malformed \\P or \\p sequence",
@@ -220,6 +225,7 @@
Then we can use ctype_digit and ctype_xdigit in the code. */
+/* FIXME: handle chars > 255 when looking in this table */
#if !EBCDIC /* This is the "normal" case, for ASCII systems */
static const unsigned char digitab[] =
{
@@ -331,7 +337,7 @@
/* Definition to allow mutual recursion */
static BOOL
- compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
+ compile_regex(int, int, int *, uschar **, const ichar **, int *, BOOL, int,
int *, int *, branch_chain *, compile_data *);
@@ -359,10 +365,10 @@
*/
static int
-check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
+check_escape(const ichar **ptrptr, int *errorcodeptr, int bracount,
int options, BOOL isclass)
{
-const uschar *ptr = *ptrptr;
+const ichar *ptr = *ptrptr;
int c, i;
/* If backslash is at the end of the pattern, it's an error. */
@@ -387,7 +393,7 @@
else
{
- const uschar *oldptr;
+ const ichar *oldptr;
switch (c)
{
/* A number of Perl escapes are not handled by PCRE. We give an explicit
@@ -458,7 +464,7 @@
#ifdef SUPPORT_UTF8
if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
{
- const uschar *pt = ptr + 2;
+ const ichar *pt = ptr + 2;
register int count = 0;
c = 0;
while ((digitab[*pt] & ctype_xdigit) != 0)
@@ -475,7 +481,7 @@
}
if (*pt == '}')
{
- if (c < 0 || count > 8) *errorcodeptr = ERR34;
+ if (c < 0 || count > 8 || (c >= 0xd800 && c <= 0xdbff) || (c >= 0xfdd0 && c <= 0xfdef) || c == 0xfffe || c == 0xffff || c > 0x10FFFF) *errorcodeptr = ERR34;
ptr = pt;
break;
}
@@ -566,10 +572,10 @@
*/
static int
-get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)
+get_ucp(const ichar **ptrptr, BOOL *negptr, int *errorcodeptr)
{
int c, i, bot, top;
-const uschar *ptr = *ptrptr;
+const ichar *ptr = *ptrptr;
char name[4];
c = *(++ptr);
@@ -656,7 +662,7 @@
*/
static BOOL
-is_counted_repeat(const uschar *p)
+is_counted_repeat(const ichar *p)
{
if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
while ((digitab[*p] & ctype_digit) != 0) p++;
@@ -692,8 +698,8 @@
current ptr on error, with errorcodeptr set non-zero
*/
-static const uschar *
-read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
+static const ichar *
+read_repeat_counts(const ichar *p, int *minp, int *maxp, int *errorcodeptr)
{
int min = 0;
int max = -1;
@@ -889,7 +895,7 @@
#ifdef SUPPORT_UTF8
if ((options & PCRE_UTF8) != 0)
{
- while ((*cc & 0xc0) == 0x80) cc++;
+ while (ISMIDCHAR(*cc)) cc++;
}
#endif
break;
@@ -1037,7 +1043,7 @@
case OP_MINPLUS:
case OP_QUERY:
case OP_MINQUERY:
- while ((*code & 0xc0) == 0x80) code++;
+ while (ISMIDCHAR(*code)) code++;
break;
/* XCLASS is used for classes that cannot be represented just by a bit
@@ -1109,7 +1115,7 @@
case OP_MINPLUS:
case OP_QUERY:
case OP_MINQUERY:
- while ((*code & 0xc0) == 0x80) code++;
+ while (ISMIDCHAR(*code)) code++;
break;
/* XCLASS is used for classes that cannot be represented just by a bit
@@ -1260,7 +1266,7 @@
case OP_MINQUERY:
case OP_UPTO:
case OP_MINUPTO:
- if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
+ if (utf8) while (ISMIDCHAR(code[2])) code++;
break;
#endif
}
@@ -1321,7 +1327,7 @@
*/
static BOOL
-check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
+check_posix_syntax(const ichar *ptr, const ichar **endptr, compile_data *cd)
{
int terminator; /* Don't combine these lines; the Solaris cc */
terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
@@ -1335,8 +1341,24 @@
return FALSE;
}
+#if PCRE_UTF16
+static inline BOOL strequal(const ichar *str1, int len, const char *str2)
+{
+ int i;
+ for (i = 0; i < len; i++)
+ if (str1[i] != str2[i])
+ return FALSE;
+ return TRUE;
+}
+/* STREQUAL(str1, len, str2): compare len units of pattern text with a char string. */
+#define STREQUAL(str1, len, str2) strequal((str1), (len), (str2))
+#else
+/* 8-bit build: same test via strncmp(), which also stops at a NUL. */
+#define STREQUAL(str1, len, str2) (strncmp((const char *)(str1), (str2), (len)) == 0)
+/* Both forms expand the macro's own str1 parameter, not a caller-scope name. */
+#endif
/*************************************************
* Check POSIX class name *
@@ -1353,13 +1375,13 @@
*/
static int
-check_posix_name(const uschar *ptr, int len)
+check_posix_name(const ichar *ptr, int len)
{
register int yield = 0;
while (posix_name_lengths[yield] != 0)
{
if (len == posix_name_lengths[yield] &&
- strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
+ STREQUAL(ptr, len, posix_names[yield])) return yield;
yield++;
}
return -1;
@@ -1419,7 +1441,7 @@
*/
static uschar *
-auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
+auto_callout(uschar *code, const ichar *ptr, compile_data *cd)
{
*code++ = OP_CALLOUT;
*code++ = 255;
@@ -1447,7 +1469,7 @@
*/
static void
-complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
+complete_callout(uschar *previous_callout, const ichar *ptr, compile_data *cd)
{
int length = ptr - cd->start_pattern - GET(previous_callout, 2);
PUT(previous_callout, 2 + LINK_SIZE, length);
@@ -1531,7 +1553,7 @@
static BOOL
compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
- const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
+ const ichar **ptrptr, int *errorcodeptr, int *firstbyteptr,
int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
{
int repeat_type, op_type;
@@ -1549,8 +1571,8 @@
uschar *tempcode;
BOOL inescq = FALSE;
BOOL groupsetfirstbyte = FALSE;
-const uschar *ptr = *ptrptr;
-const uschar *tempptr;
+const ichar *ptr = *ptrptr;
+const ichar *tempptr;
uschar *previous = NULL;
uschar *previous_callout = NULL;
uschar classbits[32];
@@ -1844,7 +1866,7 @@
posix_class *= 3;
for (i = 0; i < 3; i++)
{
- BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
+ BOOL blankclass = STREQUAL(ptr, 5, "blank");
int taboffset = posix_class_maps[posix_class + i];
if (taboffset < 0) break;
if (local_negate)
@@ -1983,7 +2005,7 @@
if (d == '\\')
{
- const uschar *oldptr = ptr;
+ const ichar *oldptr = ptr;
d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
/* \b is backslash; \X is literal X; any other special means the '-'
@@ -2362,7 +2384,7 @@
if (utf8 && (code[-1] & 0x80) != 0)
{
uschar *lastchar = code - 1;
- while((*lastchar & 0xc0) == 0x80) lastchar--;
+ while(ISMIDCHAR(*lastchar)) lastchar--;
c = code - lastchar; /* Length of UTF-8 character */
memcpy(utf8_char, lastchar, c); /* Save the char */
c |= 0x80; /* Flag c as a length */
@@ -2881,7 +2903,7 @@
{
int i, namelen;
uschar *slot = cd->name_table;
- const uschar *name; /* Don't amalgamate; some compilers */
+ const ichar *name; /* Don't amalgamate; some compilers */
name = ++ptr; /* grumble at autoincrement in declaration */
while (*ptr++ != '>');
@@ -2919,7 +2941,7 @@
{
int i, namelen;
int type = *ptr++;
- const uschar *name = ptr;
+ const ichar *name = ptr;
uschar *slot = cd->name_table;
while (*ptr != ')') ptr++;
@@ -2927,7 +2949,7 @@
for (i = 0; i < cd->names_found; i++)
{
- if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
+ if (STREQUAL(name, namelen, (char *)slot+2)) break;
slot += cd->name_entry_size;
}
if (i >= cd->names_found)
@@ -3315,9 +3337,9 @@
mcbuffer[0] = c;
#ifdef SUPPORT_UTF8
- if (utf8 && (c & 0xc0) == 0xc0)
+ if (utf8 && ISMBSTARTCHAR(c))
{
- while ((ptr[1] & 0xc0) == 0x80)
+ while (ISMIDCHAR(ptr[1]))
mcbuffer[mclength++] = *(++ptr);
}
#endif
@@ -3409,10 +3431,10 @@
static BOOL
compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
- const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
+ const ichar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
{
-const uschar *ptr = *ptrptr;
+const ichar *ptr = *ptrptr;
uschar *code = *codeptr;
uschar *last_branch = code;
uschar *start_bracket = code;
@@ -3829,7 +3851,7 @@
*/
EXPORT pcre *
-pcre_compile(const char *pattern, int options, const char **errorptr,
+pcre_compile(const pcre_char *pattern, int options, const char **errorptr,
int *erroroffset, const unsigned char *tables)
{
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
@@ -3837,7 +3859,7 @@
EXPORT pcre *
-pcre_compile2(const char *pattern, int options, int *errorcodeptr,
+pcre_compile2(const pcre_char *pattern, int options, int *errorcodeptr,
const char **errorptr, int *erroroffset, const unsigned char *tables)
{
real_pcre *re;
@@ -3860,7 +3882,7 @@
size_t size;
uschar *code;
const uschar *codestart;
-const uschar *ptr;
+const ichar *ptr;
compile_data compile_block;
int brastack[BRASTACK_SIZE];
uschar bralenstack[BRASTACK_SIZE];
@@ -3888,16 +3910,24 @@
*erroroffset = 0;
+/* Always set the UTF-8 flag if we're compiled for UTF-16; saves on ifdefs. */
+
+#if PCRE_UTF16
+options |= PCRE_UTF8;
+#endif
+
/* Can't support UTF8 unless PCRE has been compiled to include the code. */
#ifdef SUPPORT_UTF8
utf8 = (options & PCRE_UTF8) != 0;
+#if !PCRE_UTF16
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
- (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
+ (*erroroffset = _pcre_valid_utf8((ichar *)pattern, -1)) >= 0)
{
errorcode = ERR44;
goto PCRE_EARLY_ERROR_RETURN;
}
+#endif
#else
if ((options & PCRE_UTF8) != 0)
{
@@ -3940,7 +3970,7 @@
for any counted white space if an "extended" flag setting appears late in the
pattern. We can't be so clever for #-comments. */
-ptr = (const uschar *)(pattern - 1);
+ptr = (const ichar *)(pattern - 1);
while ((c = *(++ptr)) != 0)
{
int min, max;
@@ -3995,6 +4025,13 @@
{
length += 2; /* For a one-byte character */
+#if PCRE_UTF16
+ if (IS_LEADING_SURROGATE(c))
+ {
+ length++;
+ lastitemlength++;
+ }
+#else
#ifdef SUPPORT_UTF8
if (utf8 && c > 127)
{
@@ -4005,6 +4042,7 @@
lastitemlength += i;
}
#endif
+#endif
continue;
}
@@ -4249,7 +4287,7 @@
d = -1;
if (ptr[1] == '-')
{
- uschar const *hyptr = ptr++;
+ ichar const *hyptr = ptr++;
if (ptr[1] == '\\')
{
ptr++;
@@ -4499,7 +4537,7 @@
ptr += 3;
if (*ptr == '<')
{
- const uschar *p; /* Don't amalgamate; some compilers */
+ const ichar *p; /* Don't amalgamate; some compilers */
p = ++ptr; /* grumble at autoincrement in declaration */
while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
if (*ptr != '>')
@@ -4819,9 +4857,9 @@
/* In UTF-8 mode, check for additional bytes. */
#ifdef SUPPORT_UTF8
- if (utf8 && (c & 0xc0) == 0xc0)
+ if (utf8 && ISMBSTARTCHAR(c))
{
- while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
+ while (ISMIDCHAR(ptr[1])) /* Can't flow over the end */
{ /* because the end is marked */
lastitemlength++; /* by a zero byte. */
length++;
@@ -4881,7 +4919,7 @@
compile_block.name_table = (uschar *)re + re->name_table_offset;
codestart = compile_block.name_table + re->name_entry_size * re->name_count;
compile_block.start_code = codestart;
-compile_block.start_pattern = (const uschar *)pattern;
+compile_block.start_pattern = (const ichar *)pattern;
compile_block.req_varyopt = 0;
compile_block.nopartial = FALSE;
@@ -4889,7 +4927,7 @@
error, errorcode will be set non-zero, so we don't need to look at the result
of the function here. */
-ptr = (const uschar *)pattern;
+ptr = (const ichar *)pattern;
code = (uschar *)codestart;
*code = OP_BRA;
bracount = 0;
@@ -4924,7 +4962,7 @@
{
(pcre_free)(re);
PCRE_ERROR_RETURN:
- *erroroffset = ptr - (const uschar *)pattern;
+ *erroroffset = ptr - (const ichar *)pattern;
PCRE_EARLY_ERROR_RETURN:
*errorptr = error_texts[errorcode];
if (errorcodeptr != NULL) *errorcodeptr = errorcode;
@@ -5022,7 +5060,7 @@
{
(pcre_free)(re);
*errorptr = error_texts[ERR23];
- *erroroffset = ptr - (uschar *)pattern;
+ *erroroffset = ptr - (ichar *)pattern;
if (errorcodeptr != NULL) *errorcodeptr = ERR23;
return NULL;
}
1.1.2.1 +85 -12 JavaScriptCore/pcre/Attic/pcre_internal.h
Index: pcre_internal.h
===================================================================
RCS file: /cvs/root/JavaScriptCore/pcre/Attic/pcre_internal.h,v
retrieving revision 1.1
retrieving revision 1.1.2.1
diff -u -r1.1 -r1.1.2.1
--- pcre_internal.h 12 Aug 2005 22:13:27 -0000 1.1
+++ pcre_internal.h 15 Aug 2005 16:09:54 -0000 1.1.2.1
@@ -8,6 +8,7 @@
Written by Philip Hazel
Copyright (c) 1997-2005 University of Cambridge
+ Copyright (c) 2004, 2005 Apple Computer, Inc.
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -42,6 +43,21 @@
modules, but which are not relevant to the exported API. This includes some
functions whose names all begin with "_pcre_". */
+#define _pcre_OP_lengths kjs_pcre_OP_lengths
+#define _pcre_default_tables kjs_pcre_default_tables
+#define _pcre_ord2utf8 kjs_pcre_ord2utf8
+#define _pcre_printint kjs_pcre_printint
+#define _pcre_try_flipped kjs_pcre_try_flipped
+#define _pcre_ucp_findchar kjs_pcre_ucp_findchar
+#define _pcre_utf8_table1 kjs_pcre_utf8_table1
+#define _pcre_utf8_table1_size kjs_pcre_utf8_table1_size
+#define _pcre_utf8_table2 kjs_pcre_utf8_table2
+#define _pcre_utf8_table3 kjs_pcre_utf8_table3
+#define _pcre_utf8_table4 kjs_pcre_utf8_table4
+#define _pcre_utt kjs_pcre_utt
+#define _pcre_utt_size kjs_pcre_utt_size
+#define _pcre_valid_utf8 kjs_pcre_valid_utf8
+#define _pcre_xclass kjs_pcre_xclass
/* Define DEBUG to get debugging output on stdout. */
@@ -63,7 +79,7 @@
/* Get the definitions provided by running "configure" */
-#include "config.h"
+#include "pcre-config.h"
/* Standard C headers plus the external interface definition. The only time
setjmp and stdarg are used is when NO_RECURSE is set. */
@@ -103,6 +119,10 @@
#error Cannot determine a type for 32-bit unsigned integers
#endif
+/* Include the public PCRE header */
+
+#include "pcre.h"
+
/* All character handling must be done as unsigned characters. Otherwise there
are problems with top-bit-set characters and functions such as isspace().
However, we leave the interface to the outside world as char *, because that
@@ -112,9 +132,12 @@
typedef unsigned char uschar;
-/* Include the public PCRE header */
-
-#include "pcre.h"
+/* Use ichar to mean "internal character" for always-unsigned version of pcre_char. */
+#if PCRE_UTF16
+typedef pcre_char ichar;
+#else
+typedef unsigned char ichar;
+#endif
/* Include the (copy of) the public ucp header, changing the external name into
a private one. This does no harm, even if we aren't compiling UCP support. */
@@ -249,6 +272,47 @@
#else /* SUPPORT_UTF8 */
+#if PCRE_UTF16
+
+#define LEAD_OFFSET (0xd800 - (0x10000 >> 10))
+#define SURROGATE_OFFSET (0x10000 - (0xd800 << 10) - 0xdc00)
+
+#define IS_LEADING_SURROGATE(c) (((c) & ~0x3ff) == 0xd800)
+#define IS_TRAILING_SURROGATE(c) (((c) & ~0x3ff) == 0xdc00)
+
+#define DECODE_SURROGATE_PAIR(l, t) (((l) << 10) + (t) + SURROGATE_OFFSET)
+#define LEADING_SURROGATE(c) (LEAD_OFFSET + ((c) >> 10))
+#define TRAILING_SURROGATE(c) (0xdc00 + ((c) & 0x3FF))
+
+#define GETCHAR(c, eptr) \
+ c = eptr[0]; \
+ if (IS_LEADING_SURROGATE(c)) \
+ c = DECODE_SURROGATE_PAIR(c, eptr[1])
+
+#define GETCHARTEST(c, eptr) GETCHAR(c, eptr)
+
+#define GETCHARINC(c, eptr) \
+ c = *eptr++; \
+ if (IS_LEADING_SURROGATE(c)) \
+ c = DECODE_SURROGATE_PAIR(c, *eptr++)
+
+#define GETCHARINCTEST(c, eptr) GETCHARINC(c, eptr)
+
+#define GETCHARLEN(c, eptr, len) \
+ c = eptr[0]; \
+ if (!IS_LEADING_SURROGATE(c)) \
+ len = 1; \
+ else \
+ { \
+ c = DECODE_SURROGATE_PAIR(c, eptr[1]); \
+ len = 2; \
+ }
+
+#define ISMBSTARTCHAR(c) IS_LEADING_SURROGATE(c)
+#define ISMIDCHAR(c) IS_TRAILING_SURROGATE(c)
+
+#else
+
/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */
@@ -337,10 +401,20 @@
len += gcaa; \
}
+/* Return 1 if at the start of a multibyte character. */
+
+#define ISMBSTARTCHAR(c) (((c) & 0xc0) == 0xc0)
+
+/* Return 1 if not the start of a character. */
+
+#define ISMIDCHAR(c) (((c) & 0xc0) == 0x80)
+
+#endif
+
/* If the pointer is not at the start of a character, move it back until
it is. Called only in UTF-8 mode. */
-#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
+#define BACKCHAR(eptr) while(ISMIDCHAR(*eptr)) eptr--;
#endif
@@ -803,10 +877,10 @@
BOOL partial; /* PARTIAL flag */
BOOL hitend; /* Hit the end of the subject at some point */
const uschar *start_code; /* For use when recursing */
- const uschar *start_subject; /* Start of the subject string */
- const uschar *end_subject; /* End of the subject string */
- const uschar *start_match; /* Start of this match attempt */
- const uschar *end_match_ptr; /* Subject position at end match */
+ const ichar *start_subject; /* Start of the subject string */
+ const ichar *end_subject; /* End of the subject string */
+ const ichar *start_match; /* Start of this match attempt */
+ const ichar *end_match_ptr; /* Subject position at end match */
int end_offset_top; /* Highwater mark at end of match */
int capture_last; /* Most recent capture number */
int start_offset; /* The start offset value */
@@ -820,8 +894,8 @@
typedef struct dfa_match_data {
const uschar *start_code; /* Start of the compiled pattern */
- const uschar *start_subject; /* Start of the subject string */
- const uschar *end_subject; /* End of subject string */
+ const ichar *start_subject; /* Start of the subject string */
+ const ichar *end_subject; /* End of subject string */
const uschar *tables; /* Character tables */
int moptions; /* Match options */
int poptions; /* Pattern options */
@@ -889,7 +963,6 @@
extern const uschar _pcre_OP_lengths[];
-
/* Internal shared functions. These are functions that are used by more than
one of the exported public functions. They have to be "external" in the C
sense, but are not part of the PCRE public API. */
More information about the webkit-changes
mailing list