[webkit-changes] cvs commit: WebCore/khtml/html htmltokenizer.cpp

Sat Aug 6 09:27:33 PDT 2005

darin       05/08/06 09:27:33

  Modified:    .        ChangeLog
               khtml/html htmltokenizer.cpp
  Log:
          Reviewed by Dave Hyatt.
  
          - made a small improvement to how Windows Latin-1 characters are handled in the tokenizer
  
          * khtml/html/htmltokenizer.cpp:
          (khtml::fixUpChar): Changed to use an array and a quick check to determine if a character
          is one of the ones that needs to be mapped. This retains most of the speedup gained from
          not doing anything when the character doesn't need to be fixed up.
          (khtml::HTMLTokenizer::parseSpecial): Get rid of the local check since the check in fixUpChar
          is sufficient now.
          (khtml::HTMLTokenizer::parseText): Ditto.
          (khtml::HTMLTokenizer::parseEntity): Ditto.
          (khtml::HTMLTokenizer::parseTag): Ditto.
          (khtml::HTMLTokenizer::write): Ditto.
  
  Revision  Changes    Path
  1.4531    +17 -0     WebCore/ChangeLog
  
  Index: ChangeLog
  ===================================================================
  RCS file: /cvs/root/WebCore/ChangeLog,v
  retrieving revision 1.4530
  retrieving revision 1.4531
  diff -u -r1.4530 -r1.4531
  --- ChangeLog	6 Aug 2005 08:49:10 -0000	1.4530
  +++ ChangeLog	6 Aug 2005 16:27:25 -0000	1.4531
  @@ -1,3 +1,20 @@
  +2005-08-06  Darin Adler  <darin at apple.com>
  +
  +        Reviewed by Dave Hyatt.
  +
  +        - made a small improvement to how Windows Latin-1 characters are handled in the tokenizer
  +
  +        * khtml/html/htmltokenizer.cpp:
  +        (khtml::fixUpChar): Changed to use an array and a quick check to determine if a character
  +        is one of the ones that needs to be mapped. This retains most of the speedup gained from
  +        not doing anything when the character doesn't need to be fixed up.
  +        (khtml::HTMLTokenizer::parseSpecial): Get rid of the local check since the check in fixUpChar
  +        is sufficient now.
  +        (khtml::HTMLTokenizer::parseText): Ditto.
  +        (khtml::HTMLTokenizer::parseEntity): Ditto.
  +        (khtml::HTMLTokenizer::parseTag): Ditto.
  +        (khtml::HTMLTokenizer::write): Ditto.
  +
   2005-08-06  Eric Seidel  <eseidel at apple.com>
   
           Reviewed by darin.
  
  
  
  1.104     +25 -117   WebCore/khtml/html/htmltokenizer.cpp
  
  Index: htmltokenizer.cpp
  ===================================================================
  RCS file: /cvs/root/WebCore/khtml/html/htmltokenizer.cpp,v
  retrieving revision 1.103
  retrieving revision 1.104
  diff -u -r1.103 -r1.104
  --- htmltokenizer.cpp	30 Jul 2005 02:33:18 -0000	1.103
  +++ htmltokenizer.cpp	6 Aug 2005 16:27:32 -0000	1.104
  @@ -110,116 +110,31 @@
   //
   // There may be better equivalents
   
  -#if APPLE_CHANGES
  -
  -// Note that we have more Unicode characters than Qt, so we use the
  -// official mapping table from the Unicode 2.0 standard here instead of
  -// one with hacks to avoid certain Unicode characters. Also, we don't
  -// need the unrelated hacks to avoid Unicode characters that are in the
  -// original version.
  -
   // We need this for entities at least. For non-entity text, we could
   // handle this in the text codec.
   
   // To cover non-entity text, I think this function would need to be called
  -// in more places. There seem to be many places that don't call fixUpChar.
  +// in more places. There seem to be some places that don't call fixUpChar.
   
  -inline void fixUpChar(QChar& c) {
  -    switch (c.unicode()) {
  -        case 0x0080: c = 0x20AC; break;
  -        case 0x0081: break;
  -        case 0x0082: c = 0x201A; break;
  -        case 0x0083: c = 0x0192; break;
  -        case 0x0084: c = 0x201E; break;
  -        case 0x0085: c = 0x2026; break;
  -        case 0x0086: c = 0x2020; break;
  -        case 0x0087: c = 0x2021; break;
  -        case 0x0088: c = 0x02C6; break;
  -        case 0x0089: c = 0x2030; break;
  -        case 0x008A: c = 0x0160; break;
  -        case 0x008B: c = 0x2039; break;
  -        case 0x008C: c = 0x0152; break;
  -        case 0x008D: break;
  -        case 0x008E: c = 0x017D; break;
  -        case 0x008F: break;
  -        case 0x0090: break;
  -        case 0x0091: c = 0x2018; break;
  -        case 0x0092: c = 0x2019; break;
  -        case 0x0093: c = 0x201C; break;
  -        case 0x0094: c = 0x201D; break;
  -        case 0x0095: c = 0x2022; break;
  -        case 0x0096: c = 0x2013; break;
  -        case 0x0097: c = 0x2014; break;
  -        case 0x0098: c = 0x02DC; break;
  -        case 0x0099: c = 0x2122; break;
  -        case 0x009A: c = 0x0161; break;
  -        case 0x009B: c = 0x203A; break;
  -        case 0x009C: c = 0x0153; break;
  -        case 0x009D: break;
  -        case 0x009E: c = 0x017E; break;
  -        case 0x009F: c = 0x0178; break;
  -    }
  -}
  -
  -#else // APPLE_CHANGES
  -
  -#define fixUpChar(x) \
  -            if (!(x).row() ) { \
  -                switch ((x).cell()) \
  -                { \
  -                /* ALL of these should be changed to Unicode SOON */ \
  -                case 0x80: (x) = 0x20ac; break; \
  -                case 0x82: (x) = ',';    break; \
  -                case 0x83: (x) = 0x0192; break; \
  -                case 0x84: (x) = '"';    break; \
  -                case 0x85: (x) = 0x2026; break; \
  -                case 0x86: (x) = 0x2020; break; \
  -                case 0x87: (x) = 0x2021; break; \
  -                case 0x88: (x) = 0x02C6; break; \
  -                case 0x89: (x) = 0x2030; break; \
  -                case 0x8A: (x) = 0x0160; break; \
  -                case 0x8b: (x) = '<';    break; \
  -                case 0x8C: (x) = 0x0152; break; \
  -\
  -                case 0x8E: (x) = 0x017D; break; \
  -\
  -\
  -                case 0x91: (x) = '\'';   break; \
  -                case 0x92: (x) = '\'';   break; \
  -                case 0x93: (x) = '"';    break; \
  -                case 0x94: (x) = '"';    break; \
  -                case 0x95: (x) = '*';    break; \
  -                case 0x96: (x) = '-';    break; \
  -                case 0x97: (x) = '-';    break; \
  -                case 0x98: (x) = '~';    break; \
  -                case 0x99: (x) = 0x2122; break; \
  -                case 0x9A: (x) = 0x0161; break; \
  -                case 0x9b: (x) = '>';    break; \
  -                case 0x9C: (x) = 0x0153; break; \
  -\
  -                case 0x9E: (x) = 0x017E; break; \
  -                case 0x9F: (x) = 0x0178; break; \
  -                /* This one should die */ \
  -                case 0xb7: (x) = '*';    break; \
  -                default: break; \
  -                } \
  -            } \
  -            else { \
  -                /* These should all die sooner rather than later */ \
  -                switch( (x).unicode() ) { \
  -                case 0x2013: (x) = '-'; break; \
  -                case 0x2014: (x) = '-'; break; \
  -                case 0x2018: (x) = '\''; break; \
  -                case 0x2019: (x) = '\''; break; \
  -                case 0x201c: (x) = '"'; break; \
  -                case 0x201d: (x) = '"'; break; \
  -                case 0x2022: (x) = '*'; break; \
  -                case 0x2122: (x) = 0x2122; break; \
  -                default: break; \
  -                } \
  -            }
  +static const ushort windowsLatin1ExtensionArray[32] = {
  +    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
  +    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
  +    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
  +    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178  // 98-9F
  +};
  +
  +static ushort mapChar(ushort c)
  +{
  +    assert(c >= 0x80 && c <= 0x9F);
  +    return windowsLatin1ExtensionArray[c - 0x80];
  +}
   
  -#endif // APPLE_CHANGES
  +static inline void fixUpChar(QChar &c)
  +{
  +    ushort code = c.unicode();
  +    if ((code & ~0x1F) == 0x0080)
  +        c = mapChar(code);
  +}
   
   inline bool tagMatch(const char *s1, const QChar *s2, uint length)
   {
  @@ -487,9 +402,7 @@
               scriptCodeSize = scriptCodeDest-scriptCode;
           }
           else {
  -            scriptCode[scriptCodeSize] = *src;
  -            if (src->unicode() >= 0x0080)
  -                fixUpChar(scriptCode[scriptCodeSize]);
  +            fixUpChar(scriptCode[scriptCodeSize] = *src);
               ++scriptCodeSize;
               ++src;
           }
  @@ -796,8 +709,7 @@
           }
           else {
               *dest = *src;
  -            if (src->unicode() >= 0x0080)
  -                fixUpChar(*dest);
  +            fixUpChar(*dest);
               ++dest;
               ++src;
           }
  @@ -925,8 +837,7 @@
   
                   if (EntityUnicodeValue <= 0xFFFF) {
                       QChar c(EntityUnicodeValue);
  -                    if (c.unicode() >= 0x0080)
  -                        fixUpChar(c);
  +                    fixUpChar(c);
                       checkBuffer();
                       src.push(c);
                   } else {
  @@ -1240,8 +1151,7 @@
                       }
                   }
                   *dest = *src;
  -                if (dest->unicode() >= 0x0080)
  -                    fixUpChar(*dest);
  +                fixUpChar(*dest);
                   ++dest;
                   ++src;
               }
  @@ -1277,8 +1187,7 @@
                   }
   
                   *dest = *src;
  -                if (dest->unicode() >= 0x0080)
  -                    fixUpChar(*dest);
  +                fixUpChar(*dest);
                   ++dest;
                   ++src;
               }
  @@ -1731,8 +1640,7 @@
                       currToken.complexText = true;
   #endif
               *dest = *src;
  -            if (dest->unicode() >= 0x0080)
  -                fixUpChar( *dest );
  +            fixUpChar(*dest);
               ++dest;
               ++src;
           }