diff --git a/src/language.c b/src/language.c index c67d794..be17365 100644 --- a/src/language.c +++ b/src/language.c @@ -328,6 +328,9 @@ static const tidyErrorFilterKeyItem tidyErrorFilterKeysStruct[] = { { "ENCODING_MISMATCH", ENCODING_MISMATCH }, { "INVALID_URI", INVALID_URI }, { "INVALID_NCR", INVALID_NCR }, + { "BAD_SURROGATE_PAIR", BAD_SURROGATE_PAIR }, + { "BAD_SURROGATE_TAIL", BAD_SURROGATE_TAIL }, + { "BAD_SURROGATE_LEAD", BAD_SURROGATE_LEAD }, { "CODES_TIDY_ERROR_LAST", CODES_TIDY_ERROR_LAST }, #if SUPPORT_ACCESSIBILITY_CHECKS /* This blocks of codes comes from `accessErrorCodes` enum. */ diff --git a/src/language_en.h b/src/language_en.h index 7a8d1de..b048c73 100644 --- a/src/language_en.h +++ b/src/language_en.h @@ -336,7 +336,10 @@ static languageDefinition language_en = { whichPluralForm_en, { { INVALID_UTF8, 0, "%s invalid UTF-8 bytes (char. code %s)" }, /* Error */ { INVALID_UTF16, 0, "%s invalid UTF-16 surrogate pair (char. code %s)" }, /* Error */ { INVALID_NCR, 0, "%s invalid numeric character reference %s" }, /* Error */ - + { BAD_SURROGATE_PAIR, 0, "Have out-of-range surrogate pair U+%s:U+%s, replaced with 2 U+FFFD values."}, /* warning */ + { BAD_SURROGATE_TAIL, 0, "Leading (High) surrogate pair U+%s, with no trailing (Low) entity, replaced with U+FFFD." }, /* warning */ + { BAD_SURROGATE_LEAD, 0, "Trailing (Low) surrogate pair U+%s, with no leading (High) entity, replaced with U+FFFD." }, /* warning */ + /* ReportEntityError */ { MISSING_SEMICOLON, 0, "entity \"%s\" doesn't end in ';'" }, /* Warning in HTML, Error in XML/XHTML */ { MISSING_SEMICOLON_NCR, 0, "numeric character reference \"%s\" doesn't end in ';'" }, /* Warning in HTML, Error in XML/XHTML */ diff --git a/src/lexer.c b/src/lexer.c index 49cb649..e3fa267 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -1034,25 +1034,33 @@ static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer ) } /* + Issue #483 Have detected the first of a surrogate pair... Try to find, decode the second... Already have '&' start... */ -static Bool GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch) + +typedef enum { + SP_ok, + SP_failed, + SP_error +}SPStatus; + +static SPStatus GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch) { Lexer* lexer = doc->lexer; uint bufSize = 32; uint c, ch, offset = 0; tmbstr buf = 0; - Bool success = no; /* assume failed */ + SPStatus status = SP_error; /* assume failed */ int type = 0; /* assume numeric */ uint fch = *pch; int i; /* has to be signed due to for i >= 0 */ if (!lexer) - return no; + return status; buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize); if (!buf) - return no; + return status; while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream ) { if (c == ';') @@ -1107,12 +1115,21 @@ static Bool GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch) if (TY_(IsValidCombinedChar)(ch)) { *pch = ch; /* return combined pair value */ - success = yes; + status = SP_ok; /* full success - pair used */ + } + else + { + status = SP_failed; /* is one of the 32 out-of-range pairs */ + *pch = 0xFFFD; /* return substitute character */ + /* SP WARNING: - BAD_SURROGATE_PAIR */ + fprintf(stderr, "Warning: Have out-of-range surrogate pair U+%04X:U+%04X, replaced with U+FFFD value.\n", fch, ch); } } } - if (!success) + + if (status == SP_error) { + /* Error condition - can only put back all the chars */ if (c == ';') /* if last, not added to buffer */ TY_(UngetChar)(c, doc->docIn); if (buf && offset) @@ -1129,7 +1146,7 @@ static Bool GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch) if (buf) TidyFree(lexer->allocator, buf); - return success; + return status; } /* @@ -1265,20 +1282,36 @@ static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode ) if (!preserveEntities && found && TY_(IsLowSurrogate)(ch)) { uint c1; - if ((c1 = TY_(ReadChar)(doc->docIn)) == '&') + if ((c1 = TY_(ReadChar)(doc->docIn)) == '&') { - /* have a following entity */ - if (!GetSurrogatePair(doc, isXml, &ch)) + SPStatus status; + /* Have a following entity, + so there is a chance of having a valid surrogate pair */ + c1 = ch; /* keep first value, in case of error */ + status = GetSurrogatePair(doc, isXml, &ch); + if (status == SP_error) { - TY_(UngetChar)(c1, doc->docIn); /* otherwise put it back */ + /* SP WARNING: BAD_SURROGATE_TAIL - use substitute character */ + fprintf(stderr, "Warning: Leading(High) surrogate pair U+%04X, with no trailing(Low) entity, replaced with U+FFFD.\n", c1); + TY_(UngetChar)('&', doc->docIn); /* otherwise put it back */ } } - else + else { - /* otherwise put it back */ + /* put this non-entity lead char back */ TY_(UngetChar)(c1, doc->docIn); + /* Have leading surrogate pair, with no tail */ + /* SP WARNING: BAD_SURROGATE_TAIL - use substitute character */ + fprintf(stderr, "Warning: Leading(High) surrogate pair U+%04X, with no trailing(Low) entity, replaced with U+FFFD.\n", ch); + ch = 0xFFFD; } - + } + else if (!preserveEntities && found && TY_(IsHighSurrogate)(ch)) + { + /* Have trailing surrogate pair, with no lead */ + /* SP WARNING: - BAD_SURROGATE_LEAD - - use substitute character */ + fprintf(stderr, "Warning: Trailing (Low) surrogate pair U+%04X, with no leading (High) entity, replaced with U+FFFD.\n", ch); + ch = 0xFFFD; } /* deal with unrecognized or invalid entities */ diff --git a/src/message.h b/src/message.h index 7c49499..a1fc129 100644 --- a/src/message.h +++ b/src/message.h @@ -186,6 +186,10 @@ typedef enum { INVALID_URI, INVALID_NCR, + BAD_SURROGATE_PAIR, + BAD_SURROGATE_TAIL, + BAD_SURROGATE_LEAD, + /* This MUST be present and last. */ CODES_TIDY_ERROR_LAST } tidyErrorCodes;