[webkit-changes] cvs commit: JavaScriptCore/pcre pcre-config.h pcre.h pcre_compile.c pcre_internal.h pcre_valid_utf8.c

Mon Aug 15 09:09:55 PDT 2005

darin       05/08/15 09:09:55

  Modified:    pcre     Tag: pcre-6-1-branch pcre-config.h pcre.h
                        pcre_compile.c pcre_internal.h
  Removed:     pcre     Tag: pcre-6-1-branch pcre_valid_utf8.c
  Log:
  A few first steps porting the new PCRE 6.1 to KJS and to UTF-16.

  Revision  Changes    Path
  No                   revision

  No                   revision

  1.1.84.1  +97 -3     JavaScriptCore/pcre/pcre-config.h

  Index: pcre-config.h
  ===================================================================
  RCS file: /cvs/root/JavaScriptCore/pcre/pcre-config.h,v
  retrieving revision 1.1
  retrieving revision 1.1.84.1
  diff -u -r1.1 -r1.1.84.1
  --- pcre-config.h	4 Dec 2002 21:57:20 -0000	1.1
  +++ pcre-config.h	15 Aug 2005 16:09:54 -0000	1.1.84.1
  @@ -1,5 +1,99 @@
  -#define HAVE_MEMMOVE 1
  -#define HAVE_BCOPY 1
  +/* On Unix systems config.in is converted by configure into config.h. PCRE is
  +written in Standard C, but there are a few non-standard things it can cope
  +with, allowing it to run on SunOS4 and other "close to standard" systems.
  +
  +On a non-Unix system you should just copy this file into config.h, and set up
  +the macros the way you need them. You should normally change the definitions of
  +HAVE_STRERROR and HAVE_MEMMOVE to 1. Unfortunately, because of the way autoconf
  +works, these cannot be made the defaults. If your system has bcopy() and not
  +memmove(), change the definition of HAVE_BCOPY instead of HAVE_MEMMOVE. If your
  +system has neither bcopy() nor memmove(), leave them both as 0; an emulation
  +function will be used. */
  +
  +/* If you are compiling for a system that uses EBCDIC instead of ASCII
  +character codes, define this macro as 1. On systems that can use "configure",
  +this can be done via --enable-ebcdic. */
  +
  +#define EBCDIC 0
  +
  +/* If you are compiling for a system that needs some magic to be inserted
  +before the definition of an exported function, define this macro to contain the
  +relevant magic. It appears at the start of every exported function. */
  +
  +#define EXPORT
  +
  +/* Define to empty if the "const" keyword does not work. */
  +
  +#undef const
  +
  +/* Define to "unsigned" if <stddef.h> doesn't define size_t. */
  +
  +#undef size_t
  +
  +/* The following two definitions are mainly for the benefit of SunOS4, which
  +doesn't have the strerror() or memmove() functions that should be present in
  +all Standard C libraries. The macros HAVE_STRERROR and HAVE_MEMMOVE should
  +normally be defined with the value 1 for other systems, but unfortunately we
  +can't make this the default because "configure" files generated by autoconf
  +will only change 0 to 1; they won't change 1 to 0 if the functions are not
  +found. */
  +
   #define HAVE_STRERROR 1
  -#define NEWLINE 10
  +#define HAVE_MEMMOVE  1
  +
  +/* There are some non-Unix systems that don't even have bcopy(). If this macro
  +is false, an emulation is used. If HAVE_MEMMOVE is set to 1, the value of
  +HAVE_BCOPY is not relevant. */
  +
  +#define HAVE_BCOPY    1
  +
  +/* The value of NEWLINE determines the newline character. The default is to
  +leave it up to the compiler, but some sites want to force a particular value.
  +On Unix systems, "configure" can be used to override this default. */
  +
  +#define NEWLINE '\n'
  +
  +/* The value of LINK_SIZE determines the number of bytes used to store
  +links as offsets within the compiled regex. The default is 2, which allows for
  +compiled patterns up to 64K long. This covers the vast majority of cases.
  +However, PCRE can also be compiled to use 3 or 4 bytes instead. This allows for
  +longer patterns in extreme cases. On Unix systems, "configure" can be used to
  +override this default. */
  +
  +#define LINK_SIZE   2
  +
  +/* The value of MATCH_LIMIT determines the default number of times the match()
  +function can be called during a single execution of pcre_exec(). (There is a
  +runtime method of setting a different limit.) The limit exists in order to
  +catch runaway regular expressions that take for ever to determine that they do
  +not match. The default is set very large so that it does not accidentally catch
  +legitimate cases. On Unix systems, "configure" can be used to override this
  +default. */
  +
  +#define MATCH_LIMIT 10000000
  +
  +/* When calling PCRE via the POSIX interface, additional working storage is
  +required for holding the pointers to capturing substrings because PCRE requires
  +three integers per substring, whereas the POSIX interface provides only two. If
  +the number of expected substrings is small, the wrapper function uses space on
  +the stack, because this is faster than using malloc() for each call. The
  +threshold above which the stack is no longer used is defined by
  +POSIX_MALLOC_THRESHOLD. On Unix systems, "configure" can be used to override
  +this default. */
  +
  +#define POSIX_MALLOC_THRESHOLD 10
  +
  +/* PCRE uses recursive function calls to handle backtracking while matching.
  +This can sometimes be a problem on systems that have stacks of limited size.
  +Define NO_RECURSE to get a version that doesn't use recursion in the match()
  +function; instead it creates its own stack by steam using pcre_recurse_malloc
  +to get memory. For more detail, see comments and other stuff just above the
  +match() function. On Unix systems, "configure" can be used to set this in the
  +Makefile (use --disable-stack-for-recursion). */
  +
  +/* #define NO_RECURSE */
  +
  +/* End */
  +
  +#define SUPPORT_UCP 1
   #define SUPPORT_UTF8 1

  1.5.2.2   +52 -22    JavaScriptCore/pcre/pcre.h

  Index: pcre.h
  ===================================================================
  RCS file: /cvs/root/JavaScriptCore/pcre/pcre.h,v
  retrieving revision 1.5.2.1
  retrieving revision 1.5.2.2
  diff -u -r1.5.2.1 -r1.5.2.2
  --- pcre.h	12 Aug 2005 22:23:49 -0000	1.5.2.1
  +++ pcre.h	15 Aug 2005 16:09:54 -0000	1.5.2.2
  @@ -6,6 +6,7 @@
   "configure" into pcre.h.

              Copyright (c) 1997-2005 University of Cambridge
  +           Copyright (c) 2004, 2005 Apple Computer, Inc.

   -----------------------------------------------------------------------------
   Redistribution and use in source and binary forms, with or without
  @@ -39,12 +40,35 @@
   #ifndef _PCRE_H
   #define _PCRE_H

  -/* The file pcre.h is build by "configure". Do not edit it; instead
  -make changes to pcre.in. */
  +#define pcre_callout kjs_pcre_callout
  +#define pcre_compile kjs_pcre_compile
  +#define pcre_compile2 kjs_pcre_compile2
  +#define pcre_config kjs_pcre_config
  +#define pcre_copy_named_substring kjs_pcre_copy_named_substring
  +#define pcre_copy_substring kjs_pcre_copy_substring
  +#define pcre_dfa_exec kjs_pcre_dfa_exec
  +#define pcre_exec kjs_pcre_exec
  +#define pcre_free kjs_pcre_free
  +#define pcre_free_substring kjs_pcre_free_substring
  +#define pcre_free_substring_list kjs_pcre_free_substring_list
  +#define pcre_fullinfo kjs_pcre_fullinfo
  +#define pcre_get_named_substring kjs_pcre_get_named_substring
  +#define pcre_get_substring kjs_pcre_get_substring
  +#define pcre_get_substring_list kjs_pcre_get_substring_list
  +#define pcre_info kjs_pcre_info
  +#define pcre_maketables kjs_pcre_maketables
  +#define pcre_malloc kjs_pcre_malloc
  +#define pcre_refcount kjs_pcre_refcount
  +#define pcre_stack_free kjs_pcre_stack_free
  +#define pcre_stack_malloc kjs_pcre_stack_malloc
  +#define pcre_study kjs_pcre_study
  +#define pcre_version kjs_pcre_version
  +
  +#define PCRE_MAJOR          6
  +#define PCRE_MINOR          1
  +#define PCRE_DATE           21-Jun-2005

  -#define PCRE_MAJOR          @PCRE_MAJOR@
  -#define PCRE_MINOR          @PCRE_MINOR@
  -#define PCRE_DATE           @PCRE_DATE@
  +#define PCRE_UTF16          1

   /* Win32 uses DLL by default; it needs special stuff for exported functions. */

  @@ -161,6 +185,12 @@

   /* Types */

  +#if PCRE_UTF16
  +typedef unsigned short pcre_char;
  +#else
  +typedef char pcre_char;
  +#endif
  +
   struct real_pcre;                 /* declaration; the definition is private  */
   typedef struct real_pcre pcre;

  @@ -186,7 +216,7 @@
     /* ------------------------ Version 0 ------------------------------- */
     int          callout_number;    /* Number compiled into pattern */
     int         *offset_vector;     /* The offset vector */
  -  const char  *subject;           /* The subject being matched */
  +  const pcre_char  *subject;      /* The subject being matched */
     int          subject_length;    /* The length of the subject */
     int          start_match;       /* Offset to start of this match attempt */
     int          current_position;  /* Where we currently are in the subject */
  @@ -221,30 +251,30 @@

   /* Exported PCRE functions */

  -PCRE_DATA_SCOPE pcre *pcre_compile(const char *, int, const char **, int *,
  +PCRE_DATA_SCOPE pcre *pcre_compile(const pcre_char *, int, const char **, int *,
                     const unsigned char *);
  -PCRE_DATA_SCOPE pcre *pcre_compile2(const char *, int, int *, const char **,
  +PCRE_DATA_SCOPE pcre *pcre_compile2(const pcre_char *, int, int *, const char **,
                     int *, const unsigned char *);
   PCRE_DATA_SCOPE int  pcre_config(int, void *);
  -PCRE_DATA_SCOPE int  pcre_copy_named_substring(const pcre *, const char *,
  -                  int *, int, const char *, char *, int);
  -PCRE_DATA_SCOPE int  pcre_copy_substring(const char *, int *, int, int, char *,
  +PCRE_DATA_SCOPE int  pcre_copy_named_substring(const pcre *, const pcre_char *,
  +                  int *, int, const pcre_char *, pcre_char *, int);
  +PCRE_DATA_SCOPE int  pcre_copy_substring(const pcre_char *, int *, int, int, pcre_char *,
                     int);
   PCRE_DATA_SCOPE int  pcre_dfa_exec(const pcre *, const pcre_extra *,
  -                  const char *, int, int, int, int *, int , int *, int);
  -PCRE_DATA_SCOPE int  pcre_exec(const pcre *, const pcre_extra *, const char *,
  +                  const pcre_char *, int, int, int, int *, int , int *, int);
  +PCRE_DATA_SCOPE int  pcre_exec(const pcre *, const pcre_extra *, const pcre_char *,
                      int, int, int, int *, int);
  -PCRE_DATA_SCOPE void pcre_free_substring(const char *);
  -PCRE_DATA_SCOPE void pcre_free_substring_list(const char **);
  +PCRE_DATA_SCOPE void pcre_free_substring(const pcre_char *);
  +PCRE_DATA_SCOPE void pcre_free_substring_list(const pcre_char **);
   PCRE_DATA_SCOPE int  pcre_fullinfo(const pcre *, const pcre_extra *, int,
                     void *);
  -PCRE_DATA_SCOPE int  pcre_get_named_substring(const pcre *, const char *,
  -                  int *, int, const char *, const char **);
  -PCRE_DATA_SCOPE int  pcre_get_stringnumber(const pcre *, const char *);
  -PCRE_DATA_SCOPE int  pcre_get_substring(const char *, int *, int, int,
  -                  const char **);
  -PCRE_DATA_SCOPE int  pcre_get_substring_list(const char *, int *, int,
  -                  const char ***);
  +PCRE_DATA_SCOPE int  pcre_get_named_substring(const pcre *, const pcre_char *,
  +                  int *, int, const pcre_char *, const pcre_char **);
  +PCRE_DATA_SCOPE int  pcre_get_stringnumber(const pcre *, const pcre_char *);
  +PCRE_DATA_SCOPE int  pcre_get_substring(const pcre_char *, int *, int, int,
  +                  const pcre_char **);
  +PCRE_DATA_SCOPE int  pcre_get_substring_list(const pcre_char *, int *, int,
  +                  const pcre_char ***);
   PCRE_DATA_SCOPE int  pcre_info(const pcre *, int *, int *);
   PCRE_DATA_SCOPE const unsigned char *pcre_maketables(void);
   PCRE_DATA_SCOPE int  pcre_refcount(pcre *, int);

  1.1.2.1   +86 -48    JavaScriptCore/pcre/Attic/pcre_compile.c

  Index: pcre_compile.c
  ===================================================================
  RCS file: /cvs/root/JavaScriptCore/pcre/Attic/pcre_compile.c,v
  retrieving revision 1.1
  retrieving revision 1.1.2.1
  diff -u -r1.1 -r1.1.2.1
  --- pcre_compile.c	12 Aug 2005 22:13:26 -0000	1.1
  +++ pcre_compile.c	15 Aug 2005 16:09:54 -0000	1.1.2.1
  @@ -7,6 +7,7 @@

                          Written by Philip Hazel
              Copyright (c) 1997-2005 University of Cambridge
  +           Copyright (c) 2004, 2005 Apple Computer, Inc.

   -----------------------------------------------------------------------------
   Redistribution and use in source and binary forms, with or without
  @@ -110,7 +111,7 @@
   terminated by a zero length entry. The first three must be alpha, upper, lower,
   as this is assumed for handling case independence. */

  -static const char *const posix_names[] = {
  +static const char * const posix_names[] = {  /* const pointers to const strings */
     "alpha", "lower", "upper",
     "alnum", "ascii", "blank", "cntrl", "digit", "graph",
     "print", "punct", "space", "word",  "xdigit" };
  @@ -143,7 +144,7 @@
   /* The texts of compile-time error messages. These are "char *" because they
   are passed to the outside world. */

  -static const char *error_texts[] = {
  +static const char * const error_texts[] = {
     "no error",
     "\\ at end of pattern",
     "\\c at end of pattern",
  @@ -196,7 +197,11 @@
     "unrecognized character after (?P",
     "syntax error after (?P",
     "two named groups have the same name",
  +#if PCRE_UTF16
  +  "invalid UTF-16 string",
  +#else
     "invalid UTF-8 string",
  +#endif
     /* 45 */
     "support for \\P, \\p, and \\X has not been compiled",
     "malformed \\P or \\p sequence",
  @@ -220,6 +225,7 @@

   Then we can use ctype_digit and ctype_xdigit in the code. */

  +/* FIXME: handle chars > 255 when looking in this table */
   #if !EBCDIC    /* This is the "normal" case, for ASCII systems */
   static const unsigned char digitab[] =
     {
  @@ -331,7 +337,7 @@
   /* Definition to allow mutual recursion */

   static BOOL
  -  compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
  +  compile_regex(int, int, int *, uschar **, const ichar **, int *, BOOL, int,
       int *, int *, branch_chain *, compile_data *);

  @@ -359,10 +365,10 @@
   */

   static int
  -check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
  +check_escape(const ichar **ptrptr, int *errorcodeptr, int bracount,
     int options, BOOL isclass)
   {
  -const uschar *ptr = *ptrptr;
  +const ichar *ptr = *ptrptr;
   int c, i;

   /* If backslash is at the end of the pattern, it's an error. */
  @@ -387,7 +393,7 @@

   else
     {
  -  const uschar *oldptr;
  +  const ichar *oldptr;
     switch (c)
       {
       /* A number of Perl escapes are not handled by PCRE. We give an explicit
  @@ -458,7 +464,7 @@
   #ifdef SUPPORT_UTF8
       if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
         {
  -      const uschar *pt = ptr + 2;
  +      const ichar *pt = ptr + 2;
         register int count = 0;
         c = 0;
         while ((digitab[*pt] & ctype_xdigit) != 0)
  @@ -475,7 +481,7 @@
           }
         if (*pt == '}')
           {
  -        if (c < 0 || count > 8) *errorcodeptr = ERR34;
  +        if (c < 0 || count > 8 || (c >= 0xd800 && c <= 0xdbff) || (c >= 0xfdd0 && c <= 0xfdef) || c == 0xfffe || c == 0xffff || c > 0x10FFFF) *errorcodeptr = ERR34;
           ptr = pt;
           break;
           }
  @@ -566,10 +572,10 @@
   */

   static int
  -get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)
  +get_ucp(const ichar **ptrptr, BOOL *negptr, int *errorcodeptr)
   {
   int c, i, bot, top;
  -const uschar *ptr = *ptrptr;
  +const ichar *ptr = *ptrptr;
   char name[4];

   c = *(++ptr);
  @@ -656,7 +662,7 @@
   */

   static BOOL
  -is_counted_repeat(const uschar *p)
  +is_counted_repeat(const ichar *p)
   {
   if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
   while ((digitab[*p] & ctype_digit) != 0) p++;
  @@ -692,8 +698,8 @@
                    current ptr on error, with errorcodeptr set non-zero
   */

  -static const uschar *
  -read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
  +static const ichar *
  +read_repeat_counts(const ichar *p, int *minp, int *maxp, int *errorcodeptr)
   {
   int min = 0;
   int max = -1;
  @@ -889,7 +895,7 @@
   #ifdef SUPPORT_UTF8
       if ((options & PCRE_UTF8) != 0)
         {
  -      while ((*cc & 0xc0) == 0x80) cc++;
  +      while (ISMIDCHAR(*cc)) cc++;
         }
   #endif
       break;
  @@ -1037,7 +1043,7 @@
         case OP_MINPLUS:
         case OP_QUERY:
         case OP_MINQUERY:
  -      while ((*code & 0xc0) == 0x80) code++;
  +      while (ISMIDCHAR(*code)) code++;
         break;

         /* XCLASS is used for classes that cannot be represented just by a bit
  @@ -1109,7 +1115,7 @@
         case OP_MINPLUS:
         case OP_QUERY:
         case OP_MINQUERY:
  -      while ((*code & 0xc0) == 0x80) code++;
  +      while (ISMIDCHAR(*code)) code++;
         break;

         /* XCLASS is used for classes that cannot be represented just by a bit
  @@ -1260,7 +1266,7 @@
       case OP_MINQUERY:
       case OP_UPTO:
       case OP_MINUPTO:
  -    if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
  +    if (utf8) while (ISMIDCHAR(code[2])) code++;
       break;
   #endif
       }
  @@ -1321,7 +1327,7 @@
   */

   static BOOL
  -check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
  +check_posix_syntax(const ichar *ptr, const ichar **endptr, compile_data *cd)
   {
   int terminator;          /* Don't combine these lines; the Solaris cc */
   terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
  @@ -1335,8 +1341,24 @@
   return FALSE;
   }

  +#if PCRE_UTF16  /* ichar is wider than char here, so strncmp() cannot be used */

  +static inline BOOL strequal(const ichar *str1, int len, const char *str2)
  +{  /* TRUE iff the first len units of str1 equal the first len chars of str2 */
  +  int i;
  +  for (i = 0; i < len; i++)  /* no NUL check: len must be valid for both strings */
  +    if (str1[i] != str2[i])
  +      return FALSE;
  +  return TRUE;
  +}
  +
  +#define STREQUAL(str1, len, str2) strequal((str1), (len), (str2))

  +#else
  +
  +#define STREQUAL(str1, len, str2) (strncmp((const char *)(str1), (str2), (len)) == 0)
  +
  +#endif

   /*************************************************
   *          Check POSIX class name                *
  @@ -1353,13 +1375,13 @@
   */

   static int
  -check_posix_name(const uschar *ptr, int len)
  +check_posix_name(const ichar *ptr, int len)
   {
   register int yield = 0;
   while (posix_name_lengths[yield] != 0)
     {
     if (len == posix_name_lengths[yield] &&
  -    strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
  +    STREQUAL(ptr, len, posix_names[yield])) return yield;
     yield++;
     }
   return -1;
  @@ -1419,7 +1441,7 @@
   */

   static uschar *
  -auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
  +auto_callout(uschar *code, const ichar *ptr, compile_data *cd)
   {
   *code++ = OP_CALLOUT;
   *code++ = 255;
  @@ -1447,7 +1469,7 @@
   */

   static void
  -complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
  +complete_callout(uschar *previous_callout, const ichar *ptr, compile_data *cd)
   {
   int length = ptr - cd->start_pattern - GET(previous_callout, 2);
   PUT(previous_callout, 2 + LINK_SIZE, length);
  @@ -1531,7 +1553,7 @@

   static BOOL
   compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
  -  const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
  +  const ichar **ptrptr, int *errorcodeptr, int *firstbyteptr,
     int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
   {
   int repeat_type, op_type;
  @@ -1549,8 +1571,8 @@
   uschar *tempcode;
   BOOL inescq = FALSE;
   BOOL groupsetfirstbyte = FALSE;
  -const uschar *ptr = *ptrptr;
  -const uschar *tempptr;
  +const ichar *ptr = *ptrptr;
  +const ichar *tempptr;
   uschar *previous = NULL;
   uschar *previous_callout = NULL;
   uschar classbits[32];
  @@ -1844,7 +1866,7 @@
           posix_class *= 3;
           for (i = 0; i < 3; i++)
             {
  -          BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
  +          BOOL blankclass = STREQUAL(ptr, 5, "blank");
             int taboffset = posix_class_maps[posix_class + i];
             if (taboffset < 0) break;
             if (local_negate)
  @@ -1983,7 +2005,7 @@

           if (d == '\\')
             {
  -          const uschar *oldptr = ptr;
  +          const ichar *oldptr = ptr;
             d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);

             /* \b is backslash; \X is literal X; any other special means the '-'
  @@ -2362,7 +2384,7 @@
         if (utf8 && (code[-1] & 0x80) != 0)
           {
           uschar *lastchar = code - 1;
  -        while((*lastchar & 0xc0) == 0x80) lastchar--;
  +        while(ISMIDCHAR(*lastchar)) lastchar--;
           c = code - lastchar;            /* Length of UTF-8 character */
           memcpy(utf8_char, lastchar, c); /* Save the char */
           c |= 0x80;                      /* Flag c as a length */
  @@ -2881,7 +2903,7 @@
             {
             int i, namelen;
             uschar *slot = cd->name_table;
  -          const uschar *name;     /* Don't amalgamate; some compilers */
  +          const ichar *name;      /* Don't amalgamate; some compilers */
             name = ++ptr;           /* grumble at autoincrement in declaration */

             while (*ptr++ != '>');
  @@ -2919,7 +2941,7 @@
             {
             int i, namelen;
             int type = *ptr++;
  -          const uschar *name = ptr;
  +          const ichar *name = ptr;
             uschar *slot = cd->name_table;

             while (*ptr != ')') ptr++;
  @@ -2927,7 +2949,7 @@

             for (i = 0; i < cd->names_found; i++)
               {
  -            if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
  +            if (STREQUAL(name, namelen, (char *)slot+2)) break;
               slot += cd->name_entry_size;
               }
             if (i >= cd->names_found)
  @@ -3315,9 +3337,9 @@
       mcbuffer[0] = c;

   #ifdef SUPPORT_UTF8
  -    if (utf8 && (c & 0xc0) == 0xc0)
  +    if (utf8 && ISMBSTARTCHAR(c))
         {
  -      while ((ptr[1] & 0xc0) == 0x80)
  +      while (ISMIDCHAR(ptr[1]))
           mcbuffer[mclength++] = *(++ptr);
         }
   #endif
  @@ -3409,10 +3431,10 @@

   static BOOL
   compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
  -  const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
  +  const ichar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
     int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
   {
  -const uschar *ptr = *ptrptr;
  +const ichar *ptr = *ptrptr;
   uschar *code = *codeptr;
   uschar *last_branch = code;
   uschar *start_bracket = code;
  @@ -3829,7 +3851,7 @@
   */

   EXPORT pcre *
  -pcre_compile(const char *pattern, int options, const char **errorptr,
  +pcre_compile(const pcre_char *pattern, int options, const char **errorptr,
     int *erroroffset, const unsigned char *tables)
   {
   return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
  @@ -3837,7 +3859,7 @@

   EXPORT pcre *
  -pcre_compile2(const char *pattern, int options, int *errorcodeptr,
  +pcre_compile2(const pcre_char *pattern, int options, int *errorcodeptr,
     const char **errorptr, int *erroroffset, const unsigned char *tables)
   {
   real_pcre *re;
  @@ -3860,7 +3882,7 @@
   size_t size;
   uschar *code;
   const uschar *codestart;
  -const uschar *ptr;
  +const ichar *ptr;
   compile_data compile_block;
   int brastack[BRASTACK_SIZE];
   uschar bralenstack[BRASTACK_SIZE];
  @@ -3888,16 +3910,24 @@

   *erroroffset = 0;

  +/* Always set the UTF-8 flag if we're compiled for UTF-16; saves on ifdefs. */
  +
  +#if PCRE_UTF16
  +options |= PCRE_UTF8;
  +#endif
  +
   /* Can't support UTF8 unless PCRE has been compiled to include the code. */

   #ifdef SUPPORT_UTF8
   utf8 = (options & PCRE_UTF8) != 0;
  +#if !PCRE_UTF16
   if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
  -     (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
  +     (*erroroffset = _pcre_valid_utf8((ichar *)pattern, -1)) >= 0)
     {
     errorcode = ERR44;
     goto PCRE_EARLY_ERROR_RETURN;
     }
  +#endif
   #else
   if ((options & PCRE_UTF8) != 0)
     {
  @@ -3940,7 +3970,7 @@
   for any counted white space if an "extended" flag setting appears late in the
   pattern. We can't be so clever for #-comments. */

  -ptr = (const uschar *)(pattern - 1);
  +ptr = (const ichar *)(pattern - 1);
   while ((c = *(++ptr)) != 0)
     {
     int min, max;
  @@ -3995,6 +4025,13 @@
         {
         length += 2;          /* For a one-byte character */

  +#if PCRE_UTF16
  +      if (IS_LEADING_SURROGATE(c))
  +        {
  +        length++;
  +        lastitemlength++;
  +        }
  +#else
   #ifdef SUPPORT_UTF8
         if (utf8 && c > 127)
           {
  @@ -4005,6 +4042,7 @@
           lastitemlength += i;
           }
   #endif
  +#endif

         continue;
         }
  @@ -4249,7 +4287,7 @@
           d = -1;
           if (ptr[1] == '-')
             {
  -          uschar const *hyptr = ptr++;
  +          ichar const *hyptr = ptr++;
             if (ptr[1] == '\\')
               {
               ptr++;
  @@ -4499,7 +4537,7 @@
           ptr += 3;
           if (*ptr == '<')
             {
  -          const uschar *p;    /* Don't amalgamate; some compilers */
  +          const ichar *p;    /* Don't amalgamate; some compilers */
             p = ++ptr;          /* grumble at autoincrement in declaration */
             while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
             if (*ptr != '>')
  @@ -4819,9 +4857,9 @@
       /* In UTF-8 mode, check for additional bytes. */

   #ifdef SUPPORT_UTF8
  -    if (utf8 && (c & 0xc0) == 0xc0)
  +    if (utf8 && ISMBSTARTCHAR(c))
         {
  -      while ((ptr[1] & 0xc0) == 0x80)         /* Can't flow over the end */
  +      while (ISMIDCHAR(ptr[1]))               /* Can't flow over the end */
           {                                     /* because the end is marked */
           lastitemlength++;                     /* by a zero byte. */
           length++;
  @@ -4881,7 +4919,7 @@
   compile_block.name_table = (uschar *)re + re->name_table_offset;
   codestart = compile_block.name_table + re->name_entry_size * re->name_count;
   compile_block.start_code = codestart;
  -compile_block.start_pattern = (const uschar *)pattern;
  +compile_block.start_pattern = (const ichar *)pattern;
   compile_block.req_varyopt = 0;
   compile_block.nopartial = FALSE;

  @@ -4889,7 +4927,7 @@
   error, errorcode will be set non-zero, so we don't need to look at the result
   of the function here. */

  -ptr = (const uschar *)pattern;
  +ptr = (const ichar *)pattern;
   code = (uschar *)codestart;
   *code = OP_BRA;
   bracount = 0;
  @@ -4924,7 +4962,7 @@
     {
     (pcre_free)(re);
     PCRE_ERROR_RETURN:
  -  *erroroffset = ptr - (const uschar *)pattern;
  +  *erroroffset = ptr - (const ichar *)pattern;
     PCRE_EARLY_ERROR_RETURN:
     *errorptr = error_texts[errorcode];
     if (errorcodeptr != NULL) *errorcodeptr = errorcode;
  @@ -5022,7 +5060,7 @@
     {
     (pcre_free)(re);
     *errorptr = error_texts[ERR23];
  -  *erroroffset = ptr - (uschar *)pattern;
  +  *erroroffset = ptr - (ichar *)pattern;
     if (errorcodeptr != NULL) *errorcodeptr = ERR23;
     return NULL;
     }

  1.1.2.1   +85 -12    JavaScriptCore/pcre/Attic/pcre_internal.h

  Index: pcre_internal.h
  ===================================================================
  RCS file: /cvs/root/JavaScriptCore/pcre/Attic/pcre_internal.h,v
  retrieving revision 1.1
  retrieving revision 1.1.2.1
  diff -u -r1.1 -r1.1.2.1
  --- pcre_internal.h	12 Aug 2005 22:13:27 -0000	1.1
  +++ pcre_internal.h	15 Aug 2005 16:09:54 -0000	1.1.2.1
  @@ -8,6 +8,7 @@

                          Written by Philip Hazel
              Copyright (c) 1997-2005 University of Cambridge
  +           Copyright (c) 2004, 2005 Apple Computer, Inc.

   -----------------------------------------------------------------------------
   Redistribution and use in source and binary forms, with or without
  @@ -42,6 +43,21 @@
   modules, but which are not relevant to the exported API. This includes some
   functions whose names all begin with "_pcre_". */

  +#define _pcre_OP_lengths kjs_pcre_OP_lengths
  +#define _pcre_default_tables kjs_pcre_default_tables
  +#define _pcre_ord2utf8 kjs_pcre_ord2utf8
  +#define _pcre_printint kjs_pcre_printint
  +#define _pcre_try_flipped kjs_pcre_try_flipped
  +#define _pcre_ucp_findchar kjs_pcre_ucp_findchar
  +#define _pcre_utf8_table1 kjs_pcre_utf8_table1
  +#define _pcre_utf8_table1_size  kjs_pcre_utf8_table1_size
  +#define _pcre_utf8_table2 kjs_pcre_utf8_table2
  +#define _pcre_utf8_table3 kjs_pcre_utf8_table3
  +#define _pcre_utf8_table4 kjs_pcre_utf8_table4
  +#define _pcre_utt kjs_pcre_utt
  +#define _pcre_utt_size kjs_pcre_utt_size
  +#define _pcre_valid_utf8 kjs_pcre_valid_utf8
  +#define _pcre_xclass kjs_pcre_xclass

   /* Define DEBUG to get debugging output on stdout. */

  @@ -63,7 +79,7 @@

   /* Get the definitions provided by running "configure" */

  -#include "config.h"
  +#include "pcre-config.h"

   /* Standard C headers plus the external interface definition. The only time
   setjmp and stdarg are used is when NO_RECURSE is set. */
  @@ -103,6 +119,10 @@
     #error Cannot determine a type for 32-bit unsigned integers
   #endif

  +/* Include the public PCRE header */
  +
  +#include "pcre.h"
  +
   /* All character handling must be done as unsigned characters. Otherwise there
   are problems with top-bit-set characters and functions such as isspace().
   However, we leave the interface to the outside world as char *, because that
  @@ -112,9 +132,12 @@

   typedef unsigned char uschar;

  -/* Include the public PCRE header */
  -
  -#include "pcre.h"
  +/* Use ichar to mean "internal character" for always-unsigned version of pcre_char. */
  +#if PCRE_UTF16
  +typedef pcre_char ichar;
  +#else
  +typedef unsigned char ichar;
  +#endif

   /* Include the (copy of) the public ucp header, changing the external name into
   a private one. This does no harm, even if we aren't compiling UCP support. */
  @@ -249,6 +272,47 @@

   #else   /* SUPPORT_UTF8 */

  +#if PCRE_UTF16
  +
  +#define LEAD_OFFSET (0xd800 - (0x10000 >> 10))
  +#define SURROGATE_OFFSET (0x10000 - (0xd800 << 10) - 0xdc00)
  +
  +#define IS_LEADING_SURROGATE(c) (((c) & ~0x3ff) == 0xd800)
  +#define IS_TRAILING_SURROGATE(c) (((c) & ~0x3ff) == 0xdc00)
  +
  +#define DECODE_SURROGATE_PAIR(l, t) (((l) << 10) + (t) + SURROGATE_OFFSET)
  +#define LEADING_SURROGATE(c) (LEAD_OFFSET + ((c) >> 10))
  +#define TRAILING_SURROGATE(c) (0xdc00 + ((c) & 0x3FF))
  +
  +#define GETCHAR(c, eptr) \
  +  c = eptr[0]; \
  +  if (IS_LEADING_SURROGATE(c)) \
  +    c = DECODE_SURROGATE_PAIR(c, eptr[1])
  +
  +#define GETCHARTEST(c, eptr) GETCHAR(c, eptr)
  +
  +#define GETCHARINC(c, eptr) \
  +  c = *eptr++; \
  +  if (IS_LEADING_SURROGATE(c)) \
  +    c = DECODE_SURROGATE_PAIR(c, *eptr++)
  +
  +#define GETCHARINCTEST(c, eptr) GETCHARINC(c, eptr)
  +
  +#define GETCHARLEN(c, eptr, len) \
  +  c = eptr[0]; \
  +  if (!IS_LEADING_SURROGATE(c)) \
  +    len = 1; \
  +  else \
  +    { \
  +    c = DECODE_SURROGATE_PAIR(c, eptr[1]); \
  +    len = 2; \
  +    }
  +
  +#define ISMBSTARTCHAR(c) IS_LEADING_SURROGATE(c)
  +#define ISMIDCHAR(c) IS_TRAILING_SURROGATE(c)
  +
  +#else
  +
   /* Get the next UTF-8 character, not advancing the pointer. This is called when
   we know we are in UTF-8 mode. */

  @@ -337,10 +401,20 @@
       len += gcaa; \
       }

  +/* Return 1 if at the start of a multibyte character. */
  +
  +#define ISMBSTARTCHAR(c) (((c) & 0xc0) == 0xc0)
  +
  +/* Return 1 if not the start of a character. */
  +
  +#define ISMIDCHAR(c) (((c) & 0xc0) == 0x80)
  +
  +#endif
  +
   /* If the pointer is not at the start of a character, move it back until
   it is. Called only in UTF-8 mode. */

  -#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
  +#define BACKCHAR(eptr) while(ISMIDCHAR(*eptr)) eptr--;

   #endif

  @@ -803,10 +877,10 @@
     BOOL   partial;               /* PARTIAL flag */
     BOOL   hitend;                /* Hit the end of the subject at some point */
     const uschar *start_code;     /* For use when recursing */
  -  const uschar *start_subject;  /* Start of the subject string */
  -  const uschar *end_subject;    /* End of the subject string */
  -  const uschar *start_match;    /* Start of this match attempt */
  -  const uschar *end_match_ptr;  /* Subject position at end match */
  +  const ichar *start_subject;   /* Start of the subject string */
  +  const ichar *end_subject;     /* End of the subject string */
  +  const ichar *start_match;     /* Start of this match attempt */
  +  const ichar *end_match_ptr;   /* Subject position at end match */
     int    end_offset_top;        /* Highwater mark at end of match */
     int    capture_last;          /* Most recent capture number */
     int    start_offset;          /* The start offset value */
  @@ -820,8 +894,8 @@

   typedef struct dfa_match_data {
     const uschar *start_code;     /* Start of the compiled pattern */
  -  const uschar *start_subject;  /* Start of the subject string */
  -  const uschar *end_subject;    /* End of subject string */
  +  const ichar *start_subject;   /* Start of the subject string */
  +  const ichar *end_subject;     /* End of subject string */
     const uschar *tables;         /* Character tables */
     int   moptions;               /* Match options */
     int   poptions;               /* Pattern options */
  @@ -889,7 +963,6 @@

   extern const uschar _pcre_OP_lengths[];

  -
   /* Internal shared functions. These are functions that are used by more than
   one of the exported public functions. They have to be "external" in the C
   sense, but are not part of the PCRE public API. */