[webkit-changes] cvs commit: JavaScriptCore/pcre pcre_compile.c pcre_exec.c pcre_internal.h
Darin
darin at opensource.apple.com
Mon Aug 15 17:17:18 PDT 2005
darin 05/08/15 17:17:18
Modified: pcre Tag: pcre-6-1-branch pcre_compile.c pcre_exec.c
pcre_internal.h
Log:
More UTF-16 mojo. Are we done?
Revision Changes Path
No revision
No revision
1.1.2.4 +56 -32 JavaScriptCore/pcre/Attic/pcre_compile.c
Index: pcre_compile.c
===================================================================
RCS file: /cvs/root/JavaScriptCore/pcre/Attic/pcre_compile.c,v
retrieving revision 1.1.2.3
retrieving revision 1.1.2.4
diff -u -r1.1.2.3 -r1.1.2.4
--- pcre_compile.c 15 Aug 2005 22:43:55 -0000 1.1.2.3
+++ pcre_compile.c 16 Aug 2005 00:17:16 -0000 1.1.2.4
@@ -46,6 +46,16 @@
#include "pcre_internal.h"
+// WARNING: These macros evaluate their parameters more than once.
+#if PCRE_UTF16
+#define CTYPES(cd, x) ((x) <= 255 ? (cd)->ctypes[(x)] : 0)
+#define DIGITAB(x) ((x) <= 255 ? digitab[(x)] : 0)
+#else
+#define CTYPES(cd, x) cd->ctypes[(x)]
+#define DIGITAB(x) digitab[(x)]
+#endif
+
+
/*************************************************
* Code parameters and static tables *
*************************************************/
@@ -225,7 +235,6 @@
Then we can use ctype_digit and ctype_xdigit in the code. */
-// FIXME: handle chars > 255 when looking in this table
#if !EBCDIC /* This is the "normal" case, for ASCII systems */
static const unsigned char digitab[] =
{
@@ -426,7 +435,7 @@
{
oldptr = ptr;
c -= '0';
- while ((digitab[ptr[1]] & ctype_digit) != 0)
+ while ((DIGITAB(ptr[1]) & ctype_digit) != 0)
c = c * 10 + *(++ptr) - '0';
if (c < 10 || c <= bracount)
{
@@ -467,7 +476,7 @@
const pcre_uchar *pt = ptr + 2;
register int count = 0;
c = 0;
- while ((digitab[*pt] & ctype_xdigit) != 0)
+ while ((DIGITAB(*pt) & ctype_xdigit) != 0)
{
int cc = *pt++;
count++;
@@ -493,7 +502,7 @@
/* Read just a single hex char */
c = 0;
- while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
+ while (i++ < 2 && (DIGITAB(ptr[1]) & ctype_xdigit) != 0)
{
int cc; /* Some compilers don't like ++ */
cc = *(++ptr); /* in initializers */
@@ -598,6 +607,7 @@
c = *(++ptr);
if (c == 0) goto ERROR_RETURN;
if (c == '}') break;
+ if (c > 127) goto ERROR_RETURN;
name[i] = c;
}
if (c !='}') /* Try to distinguish error cases */
@@ -612,6 +622,7 @@
else
{
+ if (c > 127) goto ERROR_RETURN;
name[0] = c;
name[1] = 0;
}
@@ -664,15 +675,17 @@
static BOOL
is_counted_repeat(const pcre_uchar *p)
{
-if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
-while ((digitab[*p] & ctype_digit) != 0) p++;
+if ((DIGITAB(*p) & ctype_digit) == 0) return FALSE;
+p++;
+while ((DIGITAB(*p) & ctype_digit) != 0) p++;
if (*p == '}') return TRUE;
if (*p++ != ',') return FALSE;
if (*p == '}') return TRUE;
-if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
-while ((digitab[*p] & ctype_digit) != 0) p++;
+if ((DIGITAB(*p) & ctype_digit) == 0) return FALSE;
+p++;
+while ((DIGITAB(*p) & ctype_digit) != 0) p++;
return (*p == '}');
}
@@ -704,14 +717,14 @@
int min = 0;
int max = -1;
-while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
+while ((DIGITAB(*p) & ctype_digit) != 0) min = min * 10 + *p++ - '0';
if (*p == '}') max = min; else
{
if (*(++p) != '}')
{
max = 0;
- while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
+ while((DIGITAB(*p) & ctype_digit) != 0) max = max * 10 + *p++ - '0';
if (max < min)
{
*errorcodeptr = ERR4;
@@ -895,7 +908,7 @@
#ifdef SUPPORT_UTF8
if ((options & PCRE_UTF8) != 0)
{
- while (ISMIDCHAR(*cc)) cc++;
+ while ((*cc & 0xc0) == 0x80) cc++;
}
#endif
break;
@@ -1043,7 +1056,7 @@
case OP_MINPLUS:
case OP_QUERY:
case OP_MINQUERY:
- while (ISMIDCHAR(*code)) code++;
+ while ((*code & 0xc0) == 0x80) code++;
break;
/* XCLASS is used for classes that cannot be represented just by a bit
@@ -1115,7 +1128,7 @@
case OP_MINPLUS:
case OP_QUERY:
case OP_MINQUERY:
- while (ISMIDCHAR(*code)) code++;
+ while ((*code & 0xc0) == 0x80) code++;
break;
/* XCLASS is used for classes that cannot be represented just by a bit
@@ -1266,7 +1279,7 @@
case OP_MINQUERY:
case OP_UPTO:
case OP_MINUPTO:
- if (utf8) while (ISMIDCHAR(code[2])) code++;
+ if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
break;
#endif
}
@@ -1332,7 +1345,7 @@
int terminator; /* Don't combine these lines; the Solaris cc */
terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
if (*(++ptr) == '^') ptr++;
-while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
+while ((CTYPES(cd, *ptr) & ctype_letter) != 0) ptr++;
if (*ptr == terminator && ptr[1] == ']')
{
*endptr = ptr;
@@ -1674,7 +1687,7 @@
if ((options & PCRE_EXTENDED) != 0)
{
- if ((cd->ctypes[c] & ctype_space) != 0) continue;
+ if ((CTYPES(cd, c) & ctype_space) != 0) continue;
if (c == '#')
{
/* The space before the ; is to avoid a warning on a silly compiler
@@ -2384,7 +2397,7 @@
if (utf8 && (code[-1] & 0x80) != 0)
{
uschar *lastchar = code - 1;
- while(ISMIDCHAR(*lastchar)) lastchar--;
+ while((*lastchar & 0xc0) == 0x80) lastchar--;
c = code - lastchar; /* Length of UTF-8 character */
memcpy(utf8_char, lastchar, c); /* Save the char */
c |= 0x80; /* Flag c as a length */
@@ -2828,7 +2841,7 @@
if a digit follows ( then there will just be digits until ) because
the syntax was checked in the first pass. */
- else if ((digitab[ptr[1]] && ctype_digit) != 0)
+ else if ((DIGITAB(ptr[1]) && ctype_digit) != 0)
{
int condref; /* Don't amalgamate; some compilers */
condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
@@ -2883,8 +2896,12 @@
*code++ = OP_CALLOUT; /* Already checked that the terminating */
{ /* closing parenthesis is present. */
int n = 0;
- while ((digitab[*(++ptr)] & ctype_digit) != 0)
+ ++ptr;
+ while ((DIGITAB(*ptr) & ctype_digit) != 0)
+ {
n = n * 10 + *ptr - '0';
+ ++ptr;
+ }
if (n > 255)
{
*errorcodeptr = ERR38;
@@ -2986,7 +3003,7 @@
{
const uschar *called;
recno = 0;
- while((digitab[*ptr] & ctype_digit) != 0)
+ while((DIGITAB(*ptr) & ctype_digit) != 0)
recno = recno * 10 + *ptr++ - '0';
/* Come here from code above that handles a named recursion */
@@ -3337,9 +3354,9 @@
mcbuffer[0] = c;
#ifdef SUPPORT_UTF8
- if (utf8 && ISMBSTARTCHAR(c))
+ if (utf8 && (c & 0xc0) == 0xc0)
{
- while (ISMIDCHAR(ptr[1]))
+ while ((ptr[1] & 0xc0) == 0x80)
mcbuffer[mclength++] = *(++ptr);
}
#endif
@@ -3990,7 +4007,7 @@
if ((options & PCRE_EXTENDED) != 0)
{
- if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
+ if ((CTYPES(&compile_block, c) & ctype_space) != 0) continue;
if (c == '#')
{
/* The space before the ; is to avoid a warning on a silly compiler
@@ -4496,7 +4513,11 @@
case '5': case '6': case '7': case '8': case '9':
ptr += 2;
if (c != 'R')
- while ((digitab[*(++ptr)] & ctype_digit) != 0);
+ {
+ ++ptr;
+ while ((DIGITAB(*ptr) & ctype_digit) != 0)
+ ++ptr;
+ }
if (*ptr != ')')
{
errorcode = ERR29;
@@ -4521,8 +4542,9 @@
follow (default is zero). */
case 'C':
- ptr += 2;
- while ((digitab[*(++ptr)] & ctype_digit) != 0);
+ ptr += 3;
+ while ((DIGITAB(*ptr) & ctype_digit) != 0)
+ ++ptr;
if (*ptr != ')')
{
errorcode = ERR39;
@@ -4539,7 +4561,7 @@
{
const pcre_uchar *p; /* Don't amalgamate; some compilers */
p = ++ptr; /* grumble at autoincrement in declaration */
- while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
+ while ((CTYPES(&compile_block, *ptr) & ctype_word) != 0) ptr++;
if (*ptr != '>')
{
errorcode = ERR42;
@@ -4552,7 +4574,9 @@
if (*ptr == '=' || *ptr == '>')
{
- while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
+ ++ptr;
+ while ((CTYPES(&compile_block, *ptr) & ctype_word) != 0)
+ ++ptr;
if (*ptr != ')')
{
errorcode = ERR42;
@@ -4589,11 +4613,11 @@
ptr += 4;
length += 3;
}
- else if ((digitab[ptr[3]] & ctype_digit) != 0)
+ else if ((DIGITAB(ptr[3]) & ctype_digit) != 0)
{
ptr += 4;
length += 3;
- while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
+ while ((DIGITAB(*ptr) & ctype_digit) != 0) ptr++;
if (*ptr != ')')
{
errorcode = ERR26;
@@ -4857,9 +4881,9 @@
/* In UTF-8 mode, check for additional bytes. */
#ifdef SUPPORT_UTF8
- if (utf8 && ISMBSTARTCHAR(c))
+ if (utf8 && (c & 0xc0) == 0xc0)
{
- while (ISMIDCHAR(ptr[1])) /* Can't flow over the end */
+ while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
{ /* because the end is marked */
lastitemlength++; /* by a zero byte. */
length++;
1.1.2.3 +124 -9 JavaScriptCore/pcre/Attic/pcre_exec.c
Index: pcre_exec.c
===================================================================
RCS file: /cvs/root/JavaScriptCore/pcre/Attic/pcre_exec.c,v
retrieving revision 1.1.2.2
retrieving revision 1.1.2.3
diff -u -r1.1.2.2 -r1.1.2.3
--- pcre_exec.c 15 Aug 2005 22:43:55 -0000 1.1.2.2
+++ pcre_exec.c 16 Aug 2005 00:17:17 -0000 1.1.2.3
@@ -103,8 +103,14 @@
{
int c;
if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
-while (length-- > 0)
- if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
+while (length-- > 0) {
+ if (isprint(c = *(p++))) printf("%c", c);
+#if PCRE_UTF16
+ else if (c < 256) printf("\\x%02x", c);
+ else printf("\\x{%x}", c);
+#else
+ else printf("\\x%02x", c);
+#endif
}
#endif
@@ -452,15 +458,17 @@
#define fc c
+#if !PCRE_UTF16
#ifdef SUPPORT_UTF8 /* Many of these variables are used ony */
const uschar *charptr; /* small blocks of the code. My normal */
#endif /* style of coding would have declared */
+#endif
const uschar *callpat; /* them within each of those blocks. */
const uschar *data; /* However, in order to accommodate the */
const uschar *next; /* version of this code that uses an */
-const pcre_uchar *pp; /* external "stack" implemented on the */
+const pcre_uchar *pp; /* external "stack" implemented on the */
const uschar *prev; /* heap, it is easier to declare them */
-const pcre_uchar *saved_eptr; /* all here, so the declarations can */
+const pcre_uchar *saved_eptr; /* all here, so the declarations can */
/* be cut out in a block. The only */
recursion_info new_recursive; /* declarations within blocks below are */
/* for variables that do not have to */
@@ -1214,7 +1222,7 @@
if (eptr == md->start_subject) prev_is_word = FALSE; else
{
const pcre_uchar *lastptr = eptr - 1;
- while((*lastptr & 0xc0) == 0x80) lastptr--;
+ while(ISMIDCHAR(*lastptr)) lastptr--;
GETCHAR(c, lastptr);
prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
}
@@ -1252,7 +1260,7 @@
if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
#ifdef SUPPORT_UTF8
if (utf8)
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++;
#endif
ecode++;
break;
@@ -1900,6 +1908,112 @@
REPEATCHAR:
#ifdef SUPPORT_UTF8
+#if PCRE_UTF16
+ length = 1;
+ GETUTF8CHARLEN(fc, ecode, length);
+ int utf16Length = fc > 0xFFFF ? 2 : 1;
+ if (min * utf16Length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
+ ecode += length;
+
+ if (utf16Length == 1)
+ {
+#ifdef SUPPORT_UCP
+ int othercase;
+ int chartype;
+ if ((ims & PCRE_CASELESS) == 0 || ucp_findchar(fc, &chartype, &othercase) < 0)
+ othercase = -1; /* Guaranteed to not match any character */
+#endif /* SUPPORT_UCP */
+
+ for (i = 1; i <= min; i++)
+ {
+ if (*eptr != fc && *eptr != othercase) RRETURN(MATCH_NOMATCH);
+ ++eptr;
+ }
+
+ if (min == max) continue;
+
+ if (minimize)
+ {
+ for (fi = min;; fi++)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ if (*eptr != fc && *eptr != othercase) RRETURN(MATCH_NOMATCH);
+ ++eptr;
+ }
+ /* Control never gets here */
+ }
+ else
+ {
+ pp = eptr;
+ for (i = min; i < max; i++)
+ {
+ if (eptr > md->end_subject - length) break;
+ if (*eptr != fc && *eptr != othercase) break;
+ ++eptr;
+ }
+ while (eptr >= pp)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ --eptr;
+ }
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+ }
+ else
+ {
+ /* No case on surrogate pairs, so no need to bother with "othercase". */
+
+ for (i = 1; i <= min; i++)
+ {
+ int nc;
+ GETCHAR(nc, eptr);
+ if (nc != fc) RRETURN(MATCH_NOMATCH);
+ eptr += 2;
+ }
+
+ if (min == max) continue;
+
+ if (minimize)
+ {
+ for (fi = min;; fi++)
+ {
+ int nc;
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHAR(nc, eptr);
+ if (*eptr != fc) RRETURN(MATCH_NOMATCH);
+ eptr += 2;
+ }
+ /* Control never gets here */
+ }
+ else
+ {
+ pp = eptr;
+ for (i = min; i < max; i++)
+ {
+ int nc;
+ if (eptr > md->end_subject - length) break;
+ GETCHAR(nc, eptr);
+ if (*eptr != fc) break;
+ eptr += 2;
+ }
+ while (eptr >= pp)
+ {
+ RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
+ if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+ eptr -= 2;
+ }
+ RRETURN(MATCH_NOMATCH);
+ }
+ /* Control never gets here */
+ }
+ /* Control never gets here */
+#else
if (utf8)
{
length = 1;
@@ -1987,6 +2101,7 @@
value of fc will always be < 128. */
}
else
+#endif
#endif /* SUPPORT_UTF8 */
/* When not in UTF-8 mode, load a single-byte character. */
@@ -2483,7 +2598,7 @@
if (eptr >= md->end_subject ||
(*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
RRETURN(MATCH_NOMATCH);
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++;
}
break;
@@ -2517,7 +2632,7 @@
if (eptr >= md->end_subject ||
(*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
RRETURN(MATCH_NOMATCH);
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++;
}
break;
@@ -2537,7 +2652,7 @@
if (eptr >= md->end_subject ||
(*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
RRETURN(MATCH_NOMATCH);
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+ while (eptr < md->end_subject && ISMIDCHAR(*eptr)) eptr++;
}
break;
1.1.2.4 +20 -23 JavaScriptCore/pcre/Attic/pcre_internal.h
Index: pcre_internal.h
===================================================================
RCS file: /cvs/root/JavaScriptCore/pcre/Attic/pcre_internal.h,v
retrieving revision 1.1.2.3
retrieving revision 1.1.2.4
diff -u -r1.1.2.3 -r1.1.2.4
--- pcre_internal.h 15 Aug 2005 22:43:56 -0000 1.1.2.3
+++ pcre_internal.h 16 Aug 2005 00:17:17 -0000 1.1.2.4
@@ -272,6 +272,25 @@
#else /* SUPPORT_UTF8 */
+/* Get the next UTF-8 character, not advancing the pointer, incrementing length
+if there are extra bytes. This is called when we know we are in UTF-8 mode. */
+
+#define GETUTF8CHARLEN(c, eptr, len) \
+ c = *eptr; \
+ if ((c & 0xc0) == 0xc0) \
+ { \
+ int gcii; \
+ int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
+ int gcss = 6*gcaa; \
+ c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
+ for (gcii = 1; gcii <= gcaa; gcii++) \
+ { \
+ gcss -= 6; \
+ c |= (eptr[gcii] & 0x3f) << gcss; \
+ } \
+ len += gcaa; \
+ }
+
#if PCRE_UTF16
#define LEAD_OFFSET (0xd800 - (0x10000 >> 10))
@@ -306,7 +325,6 @@
++len; \
}
-#define ISMBSTARTCHAR(c) IS_LEADING_SURROGATE(c)
#define ISMIDCHAR(c) IS_TRAILING_SURROGATE(c)
#else
@@ -380,28 +398,7 @@
} \
}
-/* Get the next UTF-8 character, not advancing the pointer, incrementing length
-if there are extra bytes. This is called when we know we are in UTF-8 mode. */
-
-#define GETCHARLEN(c, eptr, len) \
- c = *eptr; \
- if ((c & 0xc0) == 0xc0) \
- { \
- int gcii; \
- int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
- int gcss = 6*gcaa; \
- c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
- for (gcii = 1; gcii <= gcaa; gcii++) \
- { \
- gcss -= 6; \
- c |= (eptr[gcii] & 0x3f) << gcss; \
- } \
- len += gcaa; \
- }
-
-/* Return 1 if at the start of a multibyte character. */
-
-#define ISMBSTARTCHAR(c) (((c) & 0xc0) == 0xc0)
+#define GETCHARLEN(c, eptr) GETUTF8CHARLEN(c, eptr)
/* Return 1 if not the start of a character. */
More information about the webkit-changes mailing list