Attempt to address issue #352. This patch correctly address the specific issues

in #352, but I'm worried that there's some over-reach here.

Currently only implemented as a warning, with no switch to turn it off, which
maintains current behavior other than the warning.

In general, we're treating any string as a complete URL, rather than breaking
URL's into component parts. Thus the `IsURLCodePoint()` check includes a few
other generic characters that strictly speaking aren't valid codepoints, but
are valid as escape characters and delimiters.

When addressing #338, I ran into a similar situation in not having a built-in
method to separate path components (although a simple generalized solution was
good enough in that case).

Thus without introducing a new structure and functions to deconstruct a URL
into scheme, authority, path, parameters, etc., some variation of this patch
will have to be used to address #352.
This commit is contained in:
Jim Derry 2017-05-06 18:54:42 -04:00
parent fd2400d55b
commit fd77312175
4 changed files with 68 additions and 4 deletions

View file

@ -171,6 +171,7 @@ extern "C" {
FN(ESCAPED_ILLEGAL_URI) \
FN(FIXED_BACKSLASH) \
FN(ID_NAME_MISMATCH) \
FN(ILLEGAL_URI_CODEPOINT) \
FN(ILLEGAL_URI_REFERENCE) \
FN(INSERTING_AUTO_ATTRIBUTE) \
FN(INVALID_ATTRIBUTE) \

View file

@ -1476,15 +1476,64 @@ static void CheckLowerCaseAttrValue( TidyDocImpl* doc, Node *node, AttVal *attva
/* methods for checking value of a specific attribute */
static Bool IsURLCodePoint( ctmbstr p, uint *increment )
{
uint c;
*increment = TY_(GetUTF8)( p, &c ) + 1;
return isalnum( c ) ||
c == '%' || /* not a valid codepoint, but an escape sequence */
c == '#' || /* not a valid codepoint, but a delimiter */
c == '!' ||
c == '$' ||
c == '&' ||
c == '\'' ||
c == '(' ||
c == ')' ||
c == '*' ||
c == '+' ||
c == ',' ||
c == '-' ||
c == '.' ||
c == '/' ||
c == ':' ||
c == ';' ||
c == '=' ||
c == '?' ||
c == '@' ||
c == '_' ||
c == '~' ||
(c >= 0x00A0 && c <= 0xD7FF) ||
(c >= 0xE000 && c <= 0xFDCF) ||
(c >= 0xFDF0 && c <= 0xFFEF) ||
(c >= 0x10000 && c <= 0x1FFFD) ||
(c >= 0x20000 && c <= 0x2FFFD) ||
(c >= 0x30000 && c <= 0x3FFFD) ||
(c >= 0x40000 && c <= 0x4FFFD) ||
(c >= 0x50000 && c <= 0x5FFFD) ||
(c >= 0x60000 && c <= 0x6FFFD) ||
(c >= 0x70000 && c <= 0x7FFFD) ||
(c >= 0x80000 && c <= 0x8FFFD) ||
(c >= 0x90000 && c <= 0x9FFFD) ||
(c >= 0xA0000 && c <= 0xAFFFD) ||
(c >= 0xB0000 && c <= 0xBFFFD) ||
(c >= 0xC0000 && c <= 0xCFFFD) ||
(c >= 0xD0000 && c <= 0xDFFFD) ||
(c >= 0xE0000 && c <= 0xEFFFD) ||
(c >= 0xF0000 && c <= 0xFFFFD) ||
(c >= 0x100000 && c <= 0x10FFFD);
}
void TY_(CheckUrl)( TidyDocImpl* doc, Node *node, AttVal *attval)
{
tmbchar c;
tmbchar c;
tmbstr dest, p;
uint escape_count = 0, backslash_count = 0;
uint escape_count = 0, backslash_count = 0, bad_codepoint_count = 0;
uint i, pos = 0;
uint len;
uint increment;
Bool isJavascript = no;
if (!AttrHasValue(attval))
{
TY_(ReportAttrError)( doc, node, attval, MISSING_ATTR_VALUE);
@ -1492,7 +1541,7 @@ void TY_(CheckUrl)( TidyDocImpl* doc, Node *node, AttVal *attval)
}
p = attval->value;
isJavascript =
TY_(tmbstrncmp)(p,"javascript:",sizeof("javascript:")-1)==0;
@ -1508,6 +1557,14 @@ void TY_(CheckUrl)( TidyDocImpl* doc, Node *node, AttVal *attval)
++escape_count;
}
while ( *p != 0 )
{
if ( !IsURLCodePoint( p, &increment ) )
++bad_codepoint_count;
p = p + increment;
}
p = attval->value;
if ( cfgBool(doc, TidyFixUri) && escape_count )
{
Bool hadnonspace = no;
@ -1557,6 +1614,10 @@ void TY_(CheckUrl)( TidyDocImpl* doc, Node *node, AttVal *attval)
doc->badChars |= BC_INVALID_URI;
}
if ( bad_codepoint_count )
{
TY_(ReportAttrError)( doc, node, attval, ILLEGAL_URI_CODEPOINT );
}
}
/* RFC 2396, section 4.2 states:

View file

@ -1821,6 +1821,7 @@ static languageDefinition language_en = { whichPluralForm_en, {
{ ESCAPED_ILLEGAL_URI, 0, "%s escaping malformed URI reference" }, /* ReportAttrError */
{ FIXED_BACKSLASH, 0, "%s converting backslash in URI to slash" }, /* ReportAttrError */
{ ID_NAME_MISMATCH, 0, "%s id and name attribute value mismatch" }, /* ReportAttrError */
{ ILLEGAL_URI_CODEPOINT, 0, "%s illegal characters found in URI" }, /* ReportAttrError */
{ ILLEGAL_URI_REFERENCE, 0, "%s improperly escaped URI reference" }, /* ReportAttrError */
{ INSERTING_AUTO_ATTRIBUTE, 0, "%s inserting \"%s\" attribute using value \"%s\"" }, /* ReportAttrError */
{ INVALID_ATTRIBUTE, 0, "%s attribute name \"%s\" (value=\"%s\") is invalid" }, /* ReportAttrError */

View file

@ -525,6 +525,7 @@ void TY_(ReportAttrError)(TidyDocImpl* doc, Node *node, AttVal *av, uint code)
case ID_NAME_MISMATCH:
case BACKSLASH_IN_URI:
case FIXED_BACKSLASH:
case ILLEGAL_URI_CODEPOINT:
case ILLEGAL_URI_REFERENCE:
case ESCAPED_ILLEGAL_URI:
case NEWLINE_IN_URI: