Merge branch 'surrogates'

2017-02-13 08:49:06 -05:00 · 2017-02-13 08:49:06 -05:00 · b7c84b1b57
parent 73bf561645 23c4686b0f
commit b7c84b1b57
7 changed files with 212 additions and 2 deletions
--- a/README/MESSAGES.md
+++ b/README/MESSAGES.md
@ -0,0 +1,41 @@
+# Message System
+
+Tidy has quite a complex warning/error messaging system. This document is all about adding a **new** warning or error message to **libTidy**.
+
+First assign the message a key value. This is done in `message.h`, in one of the two enumerations that are listed there.
+
+ 1. `tidyErrorCodes` - starts with the value `CODES_TIDY_ERROR_FIRST = 200`, and it must be first. 
+ 
+ 2. `tidyMessagesMisc` - starts with the value `ACCESS_URL = 2048`, so at present the values in `tidyErrorCodes` above must not exceed this.
+ 
+ 3. For the sake of completeness, there's also a third enum present in `access.h` called `accessErrorCodes`; you should only ever be concerned about this if you are working on new strings for Tidy's accessibility module.
+ 
+If your message is something that will appear in the error list, then its key should be defined in the `tidyErrorCodes` enum, unless you are adding errors to the accessibility module (see point 3, above). If you are adding strings that are _not_ intended for the error list, then they belong in `tidyMessagesMisc`. These are strings that are typically output with Tidy's CLI.
+ 
+All enum values are only ever used by name within **libTidy** (and incidentally, should only ever be used by name in your client applications; never trust the value!), so feel free to enter new strings wherever they make the most sense. There are already existing categories (marked by comments), or feel free to create a new category if that's best.
+
+Because some clients retrieve error information via `libTidy`’s callback mechanism, it's also important to update `language.c:tidyErrorFilterKeysStruct[]` if your new messages are intended for the error list.
+
+
+## Step 1
+
+So in this case I want to add 3 warning messages: `BAD_SURROGATE_PAIR`, `BAD_SURROGATE_TAIL`, and `BAD_SURROGATE_LEAD`. Because these are error messages, they belong in the `tidyErrorCodes` enum, and they fit nicely into the "character encoding errors" category, just before the **last** entry, `CODES_TIDY_ERROR_LAST`.
+
+
+## Step 2
+
+Because the new messages are error codes, update the `tidyErrorFilterKeysStruct` in `language.c` with the same key values, and with string representations thereof. You should put them in the same logical order as you inserted them into the `tidyErrorCodes` enum.
+
+Note that at some point when all of the error enums are merged (probably Tidy 5.5) this kludge won't have to be used and we can have a nice, single enum exported to clients.
+
+## Step 3
+
+The next step is adding a `format` string to `language_en.h`. This string may later be translated into the various supported languages, but at present it is important that the translated strings in the other language files, like `language_fr.h`, `language_es.h`, etc., keep their format specifiers in the same order.
+
+Where to add this seems a bit of a mess, but in general things are grouped by where they're used in `libTidy`, and often in alphabetical order within those groups. Here I've added them relative to where they were placed in the other enums and structs.
+
+Depending on which of the output routines you use (consult `message.c`) you may be able to use parameters such as `%u` and `%s` in your format strings. The available data is currently limited to the available message output routines, but perhaps generalizing this in order to make more data available will be a nice focus of Tidy 5.5. Please don't use `printf` for message output within **libTidy**.
+
+In this case I want the messages to show the code point(s) in hex, so I need to add support for that as well. **(jim --??)**
+
+eof;
--- a/src/language.c
+++ b/src/language.c
@ -328,6 +328,9 @@ static const tidyErrorFilterKeyItem tidyErrorFilterKeysStruct[] = {
    { "ENCODING_MISMATCH",                             ENCODING_MISMATCH                             },
    { "INVALID_URI",                                   INVALID_URI                                   },
    { "INVALID_NCR",                                   INVALID_NCR                                   },
+    { "BAD_SURROGATE_PAIR",                            BAD_SURROGATE_PAIR                            },
+    { "BAD_SURROGATE_TAIL",                            BAD_SURROGATE_TAIL                            },
+    { "BAD_SURROGATE_LEAD",                            BAD_SURROGATE_LEAD                            },
    { "CODES_TIDY_ERROR_LAST",                         CODES_TIDY_ERROR_LAST                         },
 #if SUPPORT_ACCESSIBILITY_CHECKS
    /* This blocks of codes comes from `accessErrorCodes` enum. */
--- a/src/language_en.h
+++ b/src/language_en.h
@ -336,7 +336,10 @@ static languageDefinition language_en = { whichPluralForm_en, {
    { INVALID_UTF8,                 0,   "%s invalid UTF-8 bytes (char. code %s)"                                  }, /* Error */
    { INVALID_UTF16,                0,   "%s invalid UTF-16 surrogate pair (char. code %s)"                        }, /* Error */
    { INVALID_NCR,                  0,   "%s invalid numeric character reference %s"                               }, /* Error */
-    
+    { BAD_SURROGATE_PAIR,           0,   "Have out-of-range surrogate pair U+%04X:U+%04X, replaced with U+FFFD value."}, /* warning */
+    { BAD_SURROGATE_TAIL,           0,   "Leading (High) surrogate pair U+%04X, with no trailing (Low) entity, replaced with U+FFFD." }, /* warning */
+    { BAD_SURROGATE_LEAD,           0,   "Trailing (Low) surrogate pair U+%04X, with no leading (High) entity, replaced with U+FFFD." }, /* warning */
+
    /* ReportEntityError */
    { MISSING_SEMICOLON,            0,   "entity \"%s\" doesn't end in ';'"                                        }, /* Warning in HTML, Error in XML/XHTML */
    { MISSING_SEMICOLON_NCR,        0,   "numeric character reference \"%s\" doesn't end in ';'"                   }, /* Warning in HTML, Error in XML/XHTML */
--- a/src/lexer.c
+++ b/src/lexer.c
@ -1033,6 +1033,121 @@ static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer )
    lexer->columns = doc->docIn->curcol;
 }

+/*
+    Issue #483
+    Have detected the first of a surrogate pair...
+    Try to find, decode the second...
+    Already have '&' start...
+*/
+
+typedef enum {     /* outcome reported by GetSurrogatePair() */
+    SP_ok,         /* pair completed; combined value stored through *pch */
+    SP_failed,     /* pair read but combined value rejected; U+FFFD stored through *pch */
+    SP_error       /* no pair formed; *pch left unchanged */
+}SPStatus;
+
+/*
+    Reads the text following an already-consumed '&' and tries to use it as
+    the second half of a surrogate pair whose first half is passed in *pch.
+
+    Only numeric character references qualify: '#' followed by decimal
+    digits, or '#x' followed by hex digits ('#X' too when isXml is false),
+    terminated by ';'.
+
+    Returns:
+      SP_ok     - *pch holds the combined character.
+      SP_failed - both halves were consumed, but the combined value was
+                  rejected by IsValidCombinedChar; *pch is set to U+FFFD
+                  and BAD_SURROGATE_PAIR is reported with both halves.
+      SP_error  - no usable second half (also: no lexer, or allocation
+                  failure); *pch is untouched and every character read
+                  here is pushed back. The '&' itself is not pushed back
+                  by this function.
+*/
+static SPStatus GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
+{
+    Lexer* lexer = doc->lexer;
+    uint bufSize = 32;
+    uint c, offset = 0;
+    uint digitsAt;               /* index in buf where the digits start */
+    uint tail = 0;               /* value of the second reference */
+    uint combined;
+    tmbstr buf = 0;
+    tmbstr grown;
+    SPStatus status = SP_error;  /* assume failed */
+    int isHex = 0;               /* assume decimal */
+    int accept;
+    uint fch = *pch;
+
+    if (!lexer)
+        return status;
+    buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize);
+    if (!buf)
+        return status;
+
+    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
+    {
+        if (c == ';')
+        {
+            break;  /* reached end of entity */
+        }
+
+        /* Decide whether c may extend the reference before storing it. */
+        if (offset == 0)
+        {
+            accept = (c == '#');    /* must be a numeric reference */
+        }
+        else if (offset == 1 && ((c == 'x') || (!isXml && c == 'X')))
+        {
+            isHex = 1;
+            accept = 1;
+        }
+        else if (isHex)
+        {
+            accept = IsDigitHex(c) ? 1 : 0;
+        }
+        else
+        {
+            accept = TY_(IsDigit)(c) ? 1 : 0;
+        }
+
+        if (!accept)
+        {
+            /* Push the offending character back immediately and at full
+               width: it may be non-ASCII and would be mangled if it were
+               round-tripped through the char buffer. */
+            TY_(UngetChar)(c, doc->docIn);
+            break;
+        }
+
+        if ((offset + 2) > bufSize)
+        {
+            /* Grow through a temporary so the characters collected so far
+               can still be pushed back if the allocation fails.
+               NOTE(review): assumes TidyRealloc, like realloc(), leaves
+               the old block valid on failure - confirm. */
+            grown = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize * 2);
+            if (!grown)
+            {
+                TY_(UngetChar)(c, doc->docIn);
+                break;
+            }
+            buf = grown;
+            bufSize *= 2;
+        }
+        buf[offset++] = c;  /* only '#', 'x'/'X' and digits get here */
+    }
+
+    if (c == ';')
+    {
+        buf[offset] = 0;
+        digitsAt = isHex ? 2 : 1;
+
+        /* Require at least one digit and a successful conversion, so that
+           "&;", "&#;" and "&#x;" never yield an indeterminate value or a
+           read beyond the terminator. */
+        if (offset > digitsAt &&
+            sscanf(buf + digitsAt, isHex ? "%x" : "%u", &tail) == 1 &&
+            TY_(IsHighSurrogate)(tail))
+        {
+            combined = TY_(CombineSurrogatePair)(tail, fch);
+            if (TY_(IsValidCombinedChar)(combined))
+            {
+                *pch = combined;    /* return combined pair value */
+                status = SP_ok;     /* full success - pair used */
+            }
+            else
+            {
+                status = SP_failed; /* is one of the 32 out-of-range pairs */
+                *pch = 0xFFFD;      /* return substitute character */
+                /* report the two surrogates, not the combined value */
+                TY_(ReportSurrogateError)(doc, BAD_SURROGATE_PAIR, fch, tail);
+            }
+        }
+    }
+
+    if (status == SP_error)
+    {
+        /* Error condition - put back everything read, last char first */
+        if (c == ';') /* the terminator was never added to the buffer */
+            TY_(UngetChar)(c, doc->docIn);
+        while (offset > 0)
+        {
+            c = buf[--offset];
+            TY_(UngetChar)(c, doc->docIn);
+        }
+    }
+
+    TidyFree(lexer->allocator, buf);
+
+    return status;
+}
+
 /*
  No longer attempts to insert missing ';' for unknown
  enitities unless one was present already, since this
@ -1159,6 +1274,42 @@ static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode )
        found = TY_(EntityInfo)( lexer->lexbuf+start, isXml, &ch, &entver );
    }

+    /* Issue #483 - Deal with 'surrogate pairs' */
+    /* TODO: Maybe warning/error, like found a leading surrogate
+       but no following surrogate! Maybe should avoid outputting
+       invalid utf-8 for this entity - maybe substitute?  */
+    if (!preserveEntities && found && TY_(IsLowSurrogate)(ch))
+    {
+        uint c1;
+        if ((c1 = TY_(ReadChar)(doc->docIn)) == '&')
+        {
+            SPStatus status;
+            /* Have a following entity, 
+               so there is a chance of having a valid surrogate pair */
+            c1 = ch;    /* keep first value, in case of error */
+            status = GetSurrogatePair(doc, isXml, &ch);
+            if (status == SP_error)
+            {
+                TY_(ReportSurrogateError)(doc, BAD_SURROGATE_TAIL, c1, 0); /* SP WARNING: - using substitute character */
+                TY_(UngetChar)('&', doc->docIn);  /* otherwise put it back */
+            }
+        }
+        else
+        {
+            /* put this non-entity lead char back */
+            TY_(UngetChar)(c1, doc->docIn);
+            /* Have leading surrogate pair, with no tail */
+            TY_(ReportSurrogateError)(doc, BAD_SURROGATE_TAIL, ch, 0); /* SP WARNING: - using substitute character */
+            ch = 0xFFFD;
+        }
+    } 
+    else if (!preserveEntities && found && TY_(IsHighSurrogate)(ch))
+    {
+        /* Have trailing surrogate pair, with no lead */
+        TY_(ReportSurrogateError)(doc, BAD_SURROGATE_LEAD, ch, 0); /* SP WARNING: - using substitute character */
+        ch = 0xFFFD;
+    }
+
    /* deal with unrecognized or invalid entities */
    /* #433012 - fix by Randy Waki 17 Feb 01 */
    /* report invalid NCR's - Terry Teague 01 Sep 01 */
--- a/src/message.c
+++ b/src/message.c
@ -530,6 +530,13 @@ void TY_(ReportEntityError)( TidyDocImpl* doc, uint code, ctmbstr entity,
        messageLexer( doc, TidyWarning, code, fmt, entityname );
 }

+/* Report a surrogate-pair problem as a TidyWarning through messageLexer().
+   `code` selects the localized format string; `c1` and `c2` are passed on
+   as its arguments. Nothing is emitted when no string is found for `code`. */
+void TY_(ReportSurrogateError)(TidyDocImpl* doc, uint code, uint c1, uint c2)
+{
+    ctmbstr format = tidyLocalizedString(code);
+
+    if (!format)
+        return;
+
+    messageLexer(doc, TidyWarning, code, format, c1, c2);
+}
+
 void TY_(ReportAttrError)(TidyDocImpl* doc, Node *node, AttVal *av, uint code)
 {
    char const *name = "NULL", *value = "NULL";
--- a/src/message.h
+++ b/src/message.h
@ -46,6 +46,7 @@ void TY_(ReportEncodingError)(TidyDocImpl* doc, uint code, uint c, Bool discarde
 void TY_(ReportEntityError)( TidyDocImpl* doc, uint code, ctmbstr entity, int c );
 void TY_(ReportAttrError)( TidyDocImpl* doc, Node* node, AttVal* av, uint code );
 void TY_(ReportMissingAttr)( TidyDocImpl* doc, Node* node, ctmbstr name );
+void TY_(ReportSurrogateError)(TidyDocImpl* doc, uint code, uint c1, uint c2);

 #if SUPPORT_ACCESSIBILITY_CHECKS

@ -186,6 +187,10 @@ typedef enum {
    INVALID_URI,
    INVALID_NCR,

+    BAD_SURROGATE_PAIR,
+    BAD_SURROGATE_TAIL,
+    BAD_SURROGATE_LEAD,
+
    /* This MUST be present and last. */
    CODES_TIDY_ERROR_LAST
 } tidyErrorCodes;
--- a/version.txt
+++ b/version.txt
@ -1,2 +1,2 @@
-5.3.16
+5.3.17
 2017.02.12