Issue #483 - First cut dealing with 'surrogate pairs'.
Only deals with the successful case. TODO: Maybe add a warning/error if the trailing surrogate is not found, and maybe consider substituting a replacement character to avoid invalid UTF-8 output.
This commit is contained in:
parent
10fd44d101
commit
259d330780
120
src/lexer.c
120
src/lexer.c
|
@ -1033,6 +1033,103 @@ static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer )
|
|||
lexer->columns = doc->docIn->curcol;
|
||||
}
|
||||
|
||||
/*
|
||||
Have detected the first of a surrogate pair...
|
||||
Try to find, decode the second...
|
||||
Already have '&' start...
|
||||
*/
|
||||
static Bool GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
|
||||
{
|
||||
Lexer* lexer = doc->lexer;
|
||||
uint bufSize = 32;
|
||||
uint c, i, ch, offset = 0;
|
||||
tmbstr buf = 0;
|
||||
Bool success = no; /* assume failed */
|
||||
int type = 0; /* assume numeric */
|
||||
uint fch = *pch;
|
||||
if (!lexer)
|
||||
return no;
|
||||
buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize);
|
||||
if (!buf)
|
||||
return no;
|
||||
while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream )
|
||||
{
|
||||
if (c == ';')
|
||||
{
|
||||
break; /* reached end of entity */
|
||||
}
|
||||
if ((offset + 2) > bufSize)
|
||||
{
|
||||
bufSize *= 2;
|
||||
buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize);
|
||||
if (!buf)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
buf[offset++] = c; /* add char to buffer */
|
||||
if (offset == 1)
|
||||
{
|
||||
if (c != '#') /* is a numeric entity */
|
||||
break;
|
||||
}
|
||||
else if (offset == 2 && ((c == 'x') || (!isXml && c == 'X')))
|
||||
{
|
||||
type = 1; /* set hex digits */
|
||||
}
|
||||
else
|
||||
{
|
||||
if (type) /* if hex digits */
|
||||
{
|
||||
if (!IsDigitHex(c))
|
||||
break;
|
||||
}
|
||||
else /* if numeric */
|
||||
{
|
||||
if (!TY_(IsDigit)(c))
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (c == ';')
|
||||
{
|
||||
buf[offset] = 0;
|
||||
if (type)
|
||||
sscanf(buf + 2, "%x", &ch);
|
||||
else
|
||||
sscanf(buf + 1, "%d", &ch);
|
||||
|
||||
if (TY_(IsHighSurrogate)(ch))
|
||||
{
|
||||
ch = TY_(CombineSurrogatePair)(ch, fch);
|
||||
if (TY_(IsValidCombinedChar)(ch))
|
||||
{
|
||||
*pch = ch; /* return combined pair value */
|
||||
success = yes;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!success)
|
||||
{
|
||||
if (ch == ';')
|
||||
TY_(UngetChar)(ch, doc->docIn);
|
||||
if (buf)
|
||||
{
|
||||
for (i = 0; i < offset; i++)
|
||||
{
|
||||
c = buf[i];
|
||||
TY_(UngetChar)(c, doc->docIn);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (buf)
|
||||
TidyFree(lexer->allocator, buf);
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
/*
|
||||
No longer attempts to insert missing ';' for unknown
|
||||
entities unless one was present already, since this
|
||||
|
@ -1159,6 +1256,29 @@ static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode )
|
|||
found = TY_(EntityInfo)( lexer->lexbuf+start, isXml, &ch, &entver );
|
||||
}
|
||||
|
||||
/* Issue #483 - Deal with 'surrogate pairs' */
|
||||
/* TODO: Maybe warning/error, like found a leading surrogate
|
||||
but no following surrogate! Maybe should avoid outputting
|
||||
invalid utf-8 for this entity - maybe substitute? */
|
||||
if (!preserveEntities && found && TY_(IsLowSurrogate)(ch))
|
||||
{
|
||||
uint c1;
|
||||
if ((c1 = TY_(ReadChar)(doc->docIn)) == '&')
|
||||
{
|
||||
/* have a following entity */
|
||||
if (!GetSurrogatePair(doc, isXml, &ch))
|
||||
{
|
||||
TY_(UngetChar)(c1, doc->docIn); /* otherwise put it back */
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* otherwise put it back */
|
||||
TY_(UngetChar)(c1, doc->docIn);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* deal with unrecognized or invalid entities */
|
||||
/* #433012 - fix by Randy Waki 17 Feb 01 */
|
||||
/* report invalid NCR's - Terry Teague 01 Sep 01 */
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
5.3.15
|
||||
2017.01.29
|
||||
5.3.16I483
|
||||
2017.02.01
|
||||
|
|
Loading…
Reference in a new issue