Issue #483 - First cut dealing with 'surrogate pairs'.

Only deals with a successful case.

TODO: Maybe add a warning/error if the trailing surrogate not found, and
maybe consider substituting to avoid invalid utf-8 output.
This commit is contained in:
Geoff McLane 2017-02-01 13:46:25 +01:00
parent 10fd44d101
commit 259d330780
2 changed files with 122 additions and 2 deletions

View file

@ -1033,6 +1033,103 @@ static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer )
lexer->columns = doc->docIn->curcol;
}
/*
Have detected the first of a surrogate pair...
Try to find, decode the second...
Already have '&' start...
*/
static Bool GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
{
Lexer* lexer = doc->lexer;
uint bufSize = 32;
uint c, i, ch, offset = 0;
tmbstr buf = 0;
Bool success = no; /* assume failed */
int type = 0; /* assume numeric */
uint fch = *pch;
if (!lexer)
return no;
buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize);
if (!buf)
return no;
while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream )
{
if (c == ';')
{
break; /* reached end of entity */
}
if ((offset + 2) > bufSize)
{
bufSize *= 2;
buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize);
if (!buf)
{
break;
}
}
buf[offset++] = c; /* add char to buffer */
if (offset == 1)
{
if (c != '#') /* is a numeric entity */
break;
}
else if (offset == 2 && ((c == 'x') || (!isXml && c == 'X')))
{
type = 1; /* set hex digits */
}
else
{
if (type) /* if hex digits */
{
if (!IsDigitHex(c))
break;
}
else /* if numeric */
{
if (!TY_(IsDigit)(c))
break;
}
}
}
if (c == ';')
{
buf[offset] = 0;
if (type)
sscanf(buf + 2, "%x", &ch);
else
sscanf(buf + 1, "%d", &ch);
if (TY_(IsHighSurrogate)(ch))
{
ch = TY_(CombineSurrogatePair)(ch, fch);
if (TY_(IsValidCombinedChar)(ch))
{
*pch = ch; /* return combined pair value */
success = yes;
}
}
}
if (!success)
{
if (ch == ';')
TY_(UngetChar)(ch, doc->docIn);
if (buf)
{
for (i = 0; i < offset; i++)
{
c = buf[i];
TY_(UngetChar)(c, doc->docIn);
}
}
}
if (buf)
TidyFree(lexer->allocator, buf);
return success;
}
/*
No longer attempts to insert missing ';' for unknown
enitities unless one was present already, since this
@ -1159,6 +1256,29 @@ static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode )
found = TY_(EntityInfo)( lexer->lexbuf+start, isXml, &ch, &entver );
}
/* Issue #483 - Deal with 'surrogate pairs' */
/* TODO: Maybe warning/error, like found a leading surrogate
but no following surrogate! Maybe should avoid outputting
invalid utf-8 for this entity - maybe substitute? */
if (!preserveEntities && found && TY_(IsLowSurrogate)(ch))
{
uint c1;
if ((c1 = TY_(ReadChar)(doc->docIn)) == '&')
{
/* have a following entity */
if (!GetSurrogatePair(doc, isXml, &ch))
{
TY_(UngetChar)(c1, doc->docIn); /* otherwise put it back */
}
}
else
{
/* otherwise put it back */
TY_(UngetChar)(c1, doc->docIn);
}
}
/* deal with unrecognized or invalid entities */
/* #433012 - fix by Randy Waki 17 Feb 01 */
/* report invalid NCR's - Terry Teague 01 Sep 01 */

View file

@ -1,2 +1,2 @@
5.3.15
2017.01.29
5.3.16I483
2017.02.01