[webkit-changes] cvs commit: WebCore/layout-tests/fast/encoding invalid-UTF-8-expected.txt invalid-UTF-8.html

Fri Jun 17 09:41:06 PDT 2005

darin       05/06/17 09:41:05

  Modified:    .        ChangeLog
               kwq      KWQTextCodec.mm
  Added:       layout-tests/fast/encoding invalid-UTF-8-expected.txt
                        invalid-UTF-8.html
  Log:
          Reviewed by John.

          - fixed http://bugzilla.opendarwin.org/show_bug.cgi?id=3556
            black diamond question mark shown for invalid UTF-8 sequences

          Test cases added:
          * layout-tests/fast/encoding/invalid-UTF-8-expected.txt: Added.
          * layout-tests/fast/encoding/invalid-UTF-8.html: Added.

          * kwq/KWQTextCodec.mm:
          (unwanted): Added. Returns true for BOM, replacement, and null characters.
          (KWQTextDecoder::appendOmittingUnwanted): Renamed from appendOmittingNullsAndBOMs and
          changed to use the new "unwanted" function, which causes it to omit replacement characters.
          (KWQTextDecoder::convertUsingTEC): Call append function by its new name.

  Revision  Changes    Path
  1.4268    +17 -0     WebCore/ChangeLog

  Index: ChangeLog
  ===================================================================
  RCS file: /cvs/root/WebCore/ChangeLog,v
  retrieving revision 1.4267
  retrieving revision 1.4268
  diff -u -r1.4267 -r1.4268
  --- ChangeLog	17 Jun 2005 02:13:07 -0000	1.4267
  +++ ChangeLog	17 Jun 2005 16:41:01 -0000	1.4268
  @@ -1,3 +1,20 @@
  +2005-06-17  Darin Adler  <darin at apple.com>
  +
  +        Reviewed by John.
  +
  +        - fixed http://bugzilla.opendarwin.org/show_bug.cgi?id=3556
  +          black diamond question mark shown for invalid UTF-8 sequences
  +
  +        Test cases added:
  +        * layout-tests/fast/encoding/invalid-UTF-8-expected.txt: Added.
  +        * layout-tests/fast/encoding/invalid-UTF-8.html: Added.
  +
  +        * kwq/KWQTextCodec.mm:
  +        (unwanted): Added. Returns true for BOM, replacement, and null characters.
  +        (KWQTextDecoder::appendOmittingUnwanted): Renamed from appendOmittingNullsAndBOMs and
  +        changed to use the new "unwanted" function, which causes it to omit replacement characters.
  +        (KWQTextDecoder::convertUsingTEC): Call append function by its new name.
  +
   2005-06-16  Justin Garcia  <justin.garcia at apple.com>

   	Added a few debugging methods to highlight Selections, VisiblePositions and Nodes in the DOM tree.

  1.50      +23 -6     WebCore/kwq/KWQTextCodec.mm

  Index: KWQTextCodec.mm
  ===================================================================
  RCS file: /cvs/root/WebCore/kwq/KWQTextCodec.mm,v
  retrieving revision 1.49
  retrieving revision 1.50
  diff -u -r1.49 -r1.50
  --- KWQTextCodec.mm	14 Dec 2004 00:10:18 -0000	1.49
  +++ KWQTextCodec.mm	17 Jun 2005 16:41:05 -0000	1.50
  @@ -28,6 +28,7 @@
   #import "KWQAssertions.h"
   #import "KWQCharsets.h"

  +const UniChar replacementCharacter = 0xFFFD;
   const UniChar BOM = 0xFEFF;

   class KWQTextDecoder : public QTextDecoder {
  @@ -48,7 +49,7 @@
       OSStatus createTECConverter();
       OSStatus convertOneChunkUsingTEC(const unsigned char *inputBuffer, int inputBufferLength, int &inputLength,
           void *outputBuffer, int outputBufferLength, int &outputLength);
  -    static void appendOmittingNullsAndBOMs(QString &s, const UniChar *characters, int byteCount);
  +    static void appendOmittingUnwanted(QString &s, const UniChar *characters, int byteCount);

       KWQTextDecoder(const KWQTextDecoder &);
       KWQTextDecoder &operator=(const KWQTextDecoder &);
  @@ -356,14 +357,30 @@
       return noErr;
   }

  -void KWQTextDecoder::appendOmittingNullsAndBOMs(QString &s, const UniChar *characters, int byteCount)
  +// We strip NUL characters because other browsers (at least WinIE) do.
  +// We strip replacement characters because the TEC converter for UTF-8 converts
  +// invalid sequences into replacement characters, but other browsers discard them.
  +// We strip BOM characters because they can show up both at the start of content
  +// and inside content, and we never want them to end up in the decoded text.
  +static inline bool unwanted(UniChar c)
  +{
  +    switch (c) {
  +        case 0:
  +        case replacementCharacter:
  +        case BOM:
  +            return true;
  +        default:
  +            return false;
  +    }
  +}
  +
  +void KWQTextDecoder::appendOmittingUnwanted(QString &s, const UniChar *characters, int byteCount)
   {
       ASSERT(byteCount % sizeof(UniChar) == 0);
       int start = 0;
       int characterCount = byteCount / sizeof(UniChar);
       for (int i = 0; i != characterCount; ++i) {
  -        UniChar c = characters[i];
  -        if (c == 0 || c == BOM) {
  +        if (unwanted(characters[i])) {
               if (start != i) {
                   s.append(reinterpret_cast<const QChar *>(&characters[start]), i - start);
               }
  @@ -498,7 +515,7 @@
                   return QString();
           }

  -        appendOmittingNullsAndBOMs(result, buffer, bytesWritten);
  +        appendOmittingUnwanted(result, buffer, bytesWritten);

           bufferWasFull = status == kTECOutputBufferFullStatus;
       }
  @@ -506,7 +523,7 @@
       if (flush) {
           unsigned long bytesWritten = 0;
           TECFlushText(_converter, reinterpret_cast<unsigned char *>(buffer), sizeof(buffer), &bytesWritten);
  -        appendOmittingNullsAndBOMs(result, buffer, bytesWritten);
  +        appendOmittingUnwanted(result, buffer, bytesWritten);
       }

       // Workaround for a bug in the Text Encoding Converter (see bug 3225472).

  1.1                  WebCore/layout-tests/fast/encoding/invalid-UTF-8-expected.txt

  Index: invalid-UTF-8-expected.txt
  ===================================================================
  layer at (0,0) size 800x600
    RenderCanvas at (0,0) size 800x600
  layer at (0,0) size 800x600
    RenderBlock {HTML} at (0,0) size 800x600
      RenderBody {BODY} at (8,8) size 784x576
        RenderBlock {P} at (0,0) size 784x18
          RenderText {TEXT} at (0,0) size 721x18
            text run at (0,0) width 721: "This tests the rendering of invalid UTF-8 sequences. The way other browsers handle these is to omit them entirely."
        RenderBlock {P} at (0,34) size 784x36
          RenderText {TEXT} at (0,0) size 764x36
            text run at (0,0) width 764: "The text before should show the word \"everywhere\" right next to the word \"including\" and the word \"cube\" right next to"
            text run at (0,18) width 396: "the word \"showing\" without any visible characters in between."
        RenderBlock {HR} at (0,86) size 784x2 [border: (1px inset #000000)]
        RenderBlock {P} at (0,104) size 784x18
          RenderText {TEXT} at (0,0) size 346x18
            text run at (0,0) width 346: "everywhereincluding a 120-screen cubeshowing sports"

  1.1                  WebCore/layout-tests/fast/encoding/invalid-UTF-8.html

Index: invalid-UTF-8.html
 ===================================================================
 <html>
 <head>
 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
 </head>
 <body>
 <p>This tests the rendering of invalid UTF-8 sequences. The way other browsers handle these is to omit them entirely.</p>
 <p>The text before should show the word "everywhere" right next to the word "including" and the word "cube" right next to the
 word "showing" without any visible characters in between.</p>
 <hr>
 <p>everywhere—including a 120-screen cube—showing sports</p>
 </body>
 </html>