[webkit-changes] cvs commit: JavaScriptCore/pcre pcre_compile.c pcre_exec.c pcre_internal.h

Mon Aug 15 17:17:18 PDT 2005

darin       05/08/15 17:17:18

  Modified:    pcre     Tag: pcre-6-1-branch pcre_compile.c pcre_exec.c
                        pcre_internal.h
  Log:
  More UTF-16 mojo. Are we done?

  Revision  Changes    Path
  No                   revision

  No                   revision

  1.1.2.4   +56 -32    JavaScriptCore/pcre/Attic/pcre_compile.c

  Index: pcre_compile.c
  ===================================================================
  RCS file: /cvs/root/JavaScriptCore/pcre/Attic/pcre_compile.c,v
  retrieving revision 1.1.2.3
  retrieving revision 1.1.2.4
  diff -u -r1.1.2.3 -r1.1.2.4
  --- pcre_compile.c	15 Aug 2005 22:43:55 -0000	1.1.2.3
  +++ pcre_compile.c	16 Aug 2005 00:17:16 -0000	1.1.2.4
  @@ -46,6 +46,16 @@
   #include "pcre_internal.h"

  +// WARNING: These macros evaluate their parameters more than once.
  +#if PCRE_UTF16
  +#define CTYPES(cd, x) ((x) <= 255 ? (cd)->ctypes[(x)] : 0)
  +#define DIGITAB(x) ((x) <= 255 ? digitab[(x)] : 0)
  +#else
  +#define CTYPES(cd, x) cd->ctypes[(x)]
  +#define DIGITAB(x) digitab[(x)]
  +#endif
  +
  +
   /*************************************************
   *      Code parameters and static tables         *
   *************************************************/
  @@ -225,7 +235,6 @@

   Then we can use ctype_digit and ctype_xdigit in the code. */

  -// FIXME: handle chars > 255 when looking in this table
   #if !EBCDIC    /* This is the "normal" case, for ASCII systems */
   static const unsigned char digitab[] =
     {
  @@ -426,7 +435,7 @@
         {
         oldptr = ptr;
         c -= '0';
  -      while ((digitab[ptr[1]] & ctype_digit) != 0)
  +      while ((DIGITAB(ptr[1]) & ctype_digit) != 0)
           c = c * 10 + *(++ptr) - '0';
         if (c < 10 || c <= bracount)
           {
  @@ -467,7 +476,7 @@
         const pcre_uchar *pt = ptr + 2;
         register int count = 0;
         c = 0;
  -      while ((digitab[*pt] & ctype_xdigit) != 0)
  +      while ((DIGITAB(*pt) & ctype_xdigit) != 0)
           {
           int cc = *pt++;
           count++;
  @@ -493,7 +502,7 @@
       /* Read just a single hex char */

       c = 0;
  -    while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
  +    while (i++ < 2 && (DIGITAB(ptr[1]) & ctype_xdigit) != 0)
         {
         int cc;                               /* Some compilers don't like ++ */
         cc = *(++ptr);                        /* in initializers */
  @@ -598,6 +607,7 @@
       c = *(++ptr);
       if (c == 0) goto ERROR_RETURN;
       if (c == '}') break;
  +    if (c > 127) goto ERROR_RETURN;
       name[i] = c;
       }
     if (c !='}')   /* Try to distinguish error cases */
  @@ -612,6 +622,7 @@

   else
     {
  +  if (c > 127) goto ERROR_RETURN;
     name[0] = c;
     name[1] = 0;
     }
  @@ -664,15 +675,17 @@
   static BOOL
   is_counted_repeat(const pcre_uchar *p)
   {
  -if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
  -while ((digitab[*p] & ctype_digit) != 0) p++;
  +if ((DIGITAB(*p) & ctype_digit) == 0) return FALSE;
  +p++;
  +while ((DIGITAB(*p) & ctype_digit) != 0) p++;
   if (*p == '}') return TRUE;

   if (*p++ != ',') return FALSE;
   if (*p == '}') return TRUE;

  -if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
  -while ((digitab[*p] & ctype_digit) != 0) p++;
  +if ((DIGITAB(*p) & ctype_digit) == 0) return FALSE;
  +p++;
  +while ((DIGITAB(*p) & ctype_digit) != 0) p++;

   return (*p == '}');
   }
  @@ -704,14 +717,14 @@
   int min = 0;
   int max = -1;

  -while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
  +while ((DIGITAB(*p) & ctype_digit) != 0) min = min * 10 + *p++ - '0';

   if (*p == '}') max = min; else
     {
     if (*(++p) != '}')
       {
       max = 0;
  -    while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
  +    while((DIGITAB(*p) & ctype_digit) != 0) max = max * 10 + *p++ - '0';
       if (max < min)
         {
         *errorcodeptr = ERR4;
  @@ -895,7 +908,7 @@
   #ifdef SUPPORT_UTF8
       if ((options & PCRE_UTF8) != 0)
         {
  -      while (ISMIDCHAR(*cc)) cc++;
  +      while ((*cc & 0xc0) == 0x80) cc++;
         }
   #endif
       break;
  @@ -1043,7 +1056,7 @@
         case OP_MINPLUS:
         case OP_QUERY:
         case OP_MINQUERY:
  -      while (ISMIDCHAR(*code)) code++;
  +      while ((*code & 0xc0) == 0x80) code++;
         break;

         /* XCLASS is used for classes that cannot be represented just by a bit
  @@ -1115,7 +1128,7 @@
         case OP_MINPLUS:
         case OP_QUERY:
         case OP_MINQUERY:
  -      while (ISMIDCHAR(*code)) code++;
  +      while ((*code & 0xc0) == 0x80) code++;
         break;

         /* XCLASS is used for classes that cannot be represented just by a bit
  @@ -1266,7 +1279,7 @@
       case OP_MINQUERY:
       case OP_UPTO:
       case OP_MINUPTO:
  -    if (utf8) while (ISMIDCHAR(code[2])) code++;
  +    if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
       break;
   #endif
       }
  @@ -1332,7 +1345,7 @@
   int terminator;          /* Don't combine these lines; the Solaris cc */
   terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
   if (*(++ptr) == '^') ptr++;
  -while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
  +while ((CTYPES(cd, *ptr) & ctype_letter) != 0) ptr++;
   if (*ptr == terminator && ptr[1] == ']')
     {
     *endptr = ptr;
  @@ -1674,7 +1687,7 @@

     if ((options & PCRE_EXTENDED) != 0)
       {
  -    if ((cd->ctypes[c] & ctype_space) != 0) continue;
  +    if ((CTYPES(cd, c) & ctype_space) != 0) continue;
       if (c == '#')
         {
         /* The space before the ; is to avoid a warning on a silly compiler
  @@ -2384,7 +2397,7 @@
         if (utf8 && (code[-1] & 0x80) != 0)
           {
           uschar *lastchar = code - 1;
  -        while(ISMIDCHAR(*lastchar)) lastchar--;
  +        while((*lastchar & 0xc0) == 0x80) lastchar--;
           c = code - lastchar;            /* Length of UTF-8 character */
           memcpy(utf8_char, lastchar, c); /* Save the char */
           c |= 0x80;                      /* Flag c as a length */
  @@ -2828,7 +2841,7 @@
           if a digit follows ( then there will just be digits until ) because
           the syntax was checked in the first pass. */

  -        else if ((digitab[ptr[1]] && ctype_digit) != 0)
  +        else if ((DIGITAB(ptr[1]) && ctype_digit) != 0)
             {
             int condref;                 /* Don't amalgamate; some compilers */
             condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */
  @@ -2883,8 +2896,12 @@
           *code++ = OP_CALLOUT;     /* Already checked that the terminating */
             {                       /* closing parenthesis is present. */
             int n = 0;
  -          while ((digitab[*(++ptr)] & ctype_digit) != 0)
  +          ++ptr;
  +          while ((DIGITAB(*ptr) & ctype_digit) != 0)
  +            {
               n = n * 10 + *ptr - '0';
  +            ++ptr;
  +            }
             if (n > 255)
               {
               *errorcodeptr = ERR38;
  @@ -2986,7 +3003,7 @@
             {
             const uschar *called;
             recno = 0;
  -          while((digitab[*ptr] & ctype_digit) != 0)
  +          while((DIGITAB(*ptr) & ctype_digit) != 0)
               recno = recno * 10 + *ptr++ - '0';

             /* Come here from code above that handles a named recursion */
  @@ -3337,9 +3354,9 @@
       mcbuffer[0] = c;

   #ifdef SUPPORT_UTF8
  -    if (utf8 && ISMBSTARTCHAR(c))
  +    if (utf8 && (c & 0xc0) == 0xc0)
         {
  -      while (ISMIDCHAR(ptr[1]))
  +      while ((ptr[1] & 0xc0) == 0x80)
           mcbuffer[mclength++] = *(++ptr);
         }
   #endif
  @@ -3990,7 +4007,7 @@

     if ((options & PCRE_EXTENDED) != 0)
       {
  -    if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
  +    if ((CTYPES(&compile_block, c) & ctype_space) != 0) continue;
       if (c == '#')
         {
         /* The space before the ; is to avoid a warning on a silly compiler
  @@ -4496,7 +4513,11 @@
           case '5': case '6': case '7': case '8': case '9':
           ptr += 2;
           if (c != 'R')
  -          while ((digitab[*(++ptr)] & ctype_digit) != 0);
  +          {
  +          ++ptr;
  +          while ((DIGITAB(*ptr) & ctype_digit) != 0)
  +            ++ptr;
  +          }
           if (*ptr != ')')
             {
             errorcode = ERR29;
  @@ -4521,8 +4542,9 @@
           follow (default is zero). */

           case 'C':
  -        ptr += 2;
  -        while ((digitab[*(++ptr)] & ctype_digit) != 0);
  +        ptr += 3;
  +        while ((DIGITAB(*ptr) & ctype_digit) != 0)
  +          ++ptr;
           if (*ptr != ')')
             {
             errorcode = ERR39;
  @@ -4539,7 +4561,7 @@
             {
             const pcre_uchar *p;    /* Don't amalgamate; some compilers */
             p = ++ptr;          /* grumble at autoincrement in declaration */
  -          while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
  +          while ((CTYPES(&compile_block, *ptr) & ctype_word) != 0) ptr++;
             if (*ptr != '>')
               {
               errorcode = ERR42;
  @@ -4552,7 +4574,9 @@

           if (*ptr == '=' || *ptr == '>')
             {
  -          while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
  +          ++ptr;
  +          while ((CTYPES(&compile_block, *ptr) & ctype_word) != 0)
  +            ++ptr;
             if (*ptr != ')')
               {
               errorcode = ERR42;
  @@ -4589,11 +4613,11 @@
             ptr += 4;
             length += 3;
             }
  -        else if ((digitab[ptr[3]] & ctype_digit) != 0)
  +        else if ((DIGITAB(ptr[3]) & ctype_digit) != 0)
             {
             ptr += 4;
             length += 3;
  -          while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
  +          while ((DIGITAB(*ptr) & ctype_digit) != 0) ptr++;
             if (*ptr != ')')
               {
               errorcode = ERR26;
  @@ -4857,9 +4881,9 @@
       /* In UTF-8 mode, check for additional bytes. */

   #ifdef SUPPORT_UTF8
  -    if (utf8 && ISMBSTARTCHAR(c))
  +    if (utf8 && (c & 0xc0) == 0xc0)
         {
  -      while (ISMIDCHAR(ptr[1]))               /* Can't flow over the end */
  +      while ((ptr[1] & 0xc0) == 0x80)         /* Can't flow over the end */
           {                                     /* because the end is marked */
           lastitemlength++;                     /* by a zero byte. */
           length++;

  1.1.2.3   +124 -9    JavaScriptCore/pcre/Attic/pcre_exec.c

  Index: pcre_exec.c
  ===================================================================
  RCS file: /cvs/root/JavaScriptCore/pcre/Attic/pcre_exec.c,v
  retrieving revision 1.1.2.2
  retrieving revision 1.1.2.3
  diff -u -r1.1.2.2 -r1.1.2.3
  --- pcre_exec.c	15 Aug 2005 22:43:55 -0000	1.1.2.2
  +++ pcre_exec.c	16 Aug 2005 00:17:17 -0000	1.1.2.3
  @@ -103,8 +103,14 @@
   {
   int c;
   if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
  -while (length-- > 0)
  -  if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
  +while (length-- > 0) {
  +  if (isprint(c = *(p++))) printf("%c", c);
  +#if PCRE_UTF16
  +  else if (c < 256) printf("\\x%02x", c);
  +  else printf("\\x{%x}", c);
  +#else
  +  else printf("\\x%02x", c);
  +#endif
   }
   #endif

  @@ -452,15 +458,17 @@
   #define fc c

  +#if !PCRE_UTF16
   #ifdef SUPPORT_UTF8                /* Many of these variables are used ony */
   const uschar *charptr;             /* small blocks of the code. My normal  */
   #endif                             /* style of coding would have declared  */
  +#endif
   const uschar *callpat;             /* them within each of those blocks.    */
   const uschar *data;                /* However, in order to accommodate the */
   const uschar *next;                /* version of this code that uses an    */
  -const pcre_uchar *pp;                   /* external "stack" implemented on the  */
  +const pcre_uchar *pp;              /* external "stack" implemented on the  */
   const uschar *prev;                /* heap, it is easier to declare them   */
  -const pcre_uchar *saved_eptr;           /* all here, so the declarations can    */
  +const pcre_uchar *saved_eptr;      /* all here, so the declarations can    */
                                      /* be cut out in a block. The only      */
   recursion_info new_recursive;      /* declarations within blocks below are */
                                      /* for variables that do not have to    */
  @@ -1214,7 +1222,7 @@
           if (eptr == md->start_subject) prev_is_word = FALSE; else
             {
             const pcre_uchar *lastptr = eptr - 1;
  -          while((*lastptr & 0xc0) == 0x80) lastptr--;
  +          while(ISMIDCHAR(*lastptr)) lastptr--;
             GETCHAR(c, lastptr);
             prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
             }
  @@ -1252,7 +1260,7 @@
       if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
   #ifdef SUPPORT_UTF8
       if (utf8)
  -      while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
  +      while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++;
   #endif
       ecode++;
       break;
  @@ -1900,6 +1908,112 @@

       REPEATCHAR:
   #ifdef SUPPORT_UTF8
  +#if PCRE_UTF16
  +      length = 1;
  +      GETUTF8CHARLEN(fc, ecode, length);
  +      int utf16Length = fc > 0xFFFF ? 2 : 1;
  +      if (min * utf16Length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
  +      ecode += length;
  +
  +      if (utf16Length == 1)
  +        {
  +#ifdef SUPPORT_UCP
  +        int othercase;
  +        int chartype;
  +        if ((ims & PCRE_CASELESS) == 0 || ucp_findchar(fc, &chartype, &othercase) < 0)
  +          othercase = -1; /* Guaranteed to not match any character */
  +#endif  /* SUPPORT_UCP */
  +
  +        for (i = 1; i <= min; i++)
  +          {
  +          if (*eptr != fc && *eptr != othercase) RRETURN(MATCH_NOMATCH);
  +          ++eptr;
  +          }
  +
  +        if (min == max) continue;
  +
  +        if (minimize)
  +          {
  +          for (fi = min;; fi++)
  +            {
  +            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
  +            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
  +            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
  +            if (*eptr != fc && *eptr != othercase) RRETURN(MATCH_NOMATCH);
  +            ++eptr;
  +            }
  +          /* Control never gets here */
  +          }
  +        else
  +          {
  +          pp = eptr;
  +          for (i = min; i < max; i++)
  +            {
  +            if (eptr > md->end_subject - length) break;
  +            if (*eptr != fc && *eptr != othercase) break;
  +            ++eptr;
  +            }
  +          while (eptr >= pp)
  +           {
  +           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
  +           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
  +           --eptr;
  +           }
  +          RRETURN(MATCH_NOMATCH);
  +          }
  +        /* Control never gets here */
  +        }
  +      else
  +        {
  +        /* No case on surrogate pairs, so no need to bother with "othercase". */
  +
  +        for (i = 1; i <= min; i++)
  +          {
  +          int nc;
  +          GETCHAR(nc, eptr);
  +          if (nc != fc) RRETURN(MATCH_NOMATCH);
  +          eptr += 2;
  +          }
  +
  +        if (min == max) continue;
  +
  +        if (minimize)
  +          {
  +          for (fi = min;; fi++)
  +            {
  +            int nc;
  +            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
  +            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
  +            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
  +            GETCHAR(nc, eptr);
  +            if (*eptr != fc) RRETURN(MATCH_NOMATCH);
  +            eptr += 2;
  +            }
  +          /* Control never gets here */
  +          }
  +        else
  +          {
  +          pp = eptr;
  +          for (i = min; i < max; i++)
  +            {
  +            int nc;
  +            if (eptr > md->end_subject - length) break;
  +            GETCHAR(nc, eptr);
  +            if (*eptr != fc) break;
  +            eptr += 2;
  +            }
  +          while (eptr >= pp)
  +           {
  +           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
  +           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
  +           eptr -= 2;
  +           }
  +          RRETURN(MATCH_NOMATCH);
  +          }
  +          /* Control never gets here */
  +        }
  +        /* Control never gets here */
  +#else
       if (utf8)
         {
         length = 1;
  @@ -1987,6 +2101,7 @@
         value of fc will always be < 128. */
         }
       else
  +#endif
   #endif  /* SUPPORT_UTF8 */

       /* When not in UTF-8 mode, load a single-byte character. */
  @@ -2483,7 +2598,7 @@
             if (eptr >= md->end_subject ||
                (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
               RRETURN(MATCH_NOMATCH);
  -          while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
  +          while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++;
             }
           break;

  @@ -2517,7 +2632,7 @@
             if (eptr >= md->end_subject ||
                (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
               RRETURN(MATCH_NOMATCH);
  -          while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
  +          while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++;
             }
           break;

  @@ -2537,7 +2652,7 @@
             if (eptr >= md->end_subject ||
                (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
               RRETURN(MATCH_NOMATCH);
  -          while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
  +          while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++;
             }
           break;

  1.1.2.4   +20 -23    JavaScriptCore/pcre/Attic/pcre_internal.h

  Index: pcre_internal.h
  ===================================================================
  RCS file: /cvs/root/JavaScriptCore/pcre/Attic/pcre_internal.h,v
  retrieving revision 1.1.2.3
  retrieving revision 1.1.2.4
  diff -u -r1.1.2.3 -r1.1.2.4
  --- pcre_internal.h	15 Aug 2005 22:43:56 -0000	1.1.2.3
  +++ pcre_internal.h	16 Aug 2005 00:17:17 -0000	1.1.2.4
  @@ -272,6 +272,25 @@

   #else   /* SUPPORT_UTF8 */

  +/* Get the next UTF-8 character, not advancing the pointer, incrementing length
  +if there are extra bytes. This is called when we know we are in UTF-8 mode. */
  +
  +#define GETUTF8CHARLEN(c, eptr, len) \
  +  c = *eptr; \
  +  if ((c & 0xc0) == 0xc0) \
  +    { \
  +    int gcii; \
  +    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
  +    int gcss = 6*gcaa; \
  +    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
  +    for (gcii = 1; gcii <= gcaa; gcii++) \
  +      { \
  +      gcss -= 6; \
  +      c |= (eptr[gcii] & 0x3f) << gcss; \
  +      } \
  +    len += gcaa; \
  +    }
  +
   #if PCRE_UTF16

   #define LEAD_OFFSET (0xd800 - (0x10000 >> 10))
  @@ -306,7 +325,6 @@
       ++len; \
       }

  -#define ISMBSTARTCHAR(c) IS_LEADING_SURROGATE(c)
   #define ISMIDCHAR(c) IS_TRAILING_SURROGATE(c)

   #else
  @@ -380,28 +398,7 @@
         } \
       }

  -/* Get the next UTF-8 character, not advancing the pointer, incrementing length
  -if there are extra bytes. This is called when we know we are in UTF-8 mode. */
  -
  -#define GETCHARLEN(c, eptr, len) \
  -  c = *eptr; \
  -  if ((c & 0xc0) == 0xc0) \
  -    { \
  -    int gcii; \
  -    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
  -    int gcss = 6*gcaa; \
  -    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
  -    for (gcii = 1; gcii <= gcaa; gcii++) \
  -      { \
  -      gcss -= 6; \
  -      c |= (eptr[gcii] & 0x3f) << gcss; \
  -      } \
  -    len += gcaa; \
  -    }
  -
  -/* Return 1 if at the start of a multibyte character. */
  -
  -#define ISMBSTARTCHAR(c) (((c) & 0xc0) == 0xc0)
  +#define GETCHARLEN(c, eptr) GETUTF8CHARLEN(c, eptr)

   /* Return 1 if not the start of a character. */