Attempt to address issue #352. This patch correctly addresses the specific issues

in #352, but I'm worried that there's some over-reach here. It is currently implemented only as a warning, with no switch to turn it off, so current behavior is unchanged apart from the new warning. In general, we're treating any string as a complete URL, rather than breaking URLs into their component parts. Thus the `IsURLCodePoint()` check includes a few other generic characters that, strictly speaking, aren't valid code points but are valid as escape characters and delimiters. When addressing #338, I ran into a similar situation: there was no built-in method to separate path components (although a simple generalized solution was good enough in that case). Thus, without introducing a new structure and functions to deconstruct a URL into scheme, authority, path, parameters, etc., some variation of this patch will have to be used to address #352.
2017-05-06 18:54:42 -04:00 · 2017-05-06 18:54:42 -04:00 · fd77312175
parent fd2400d55b
commit fd77312175
4 changed files with 68 additions and 4 deletions
--- a/include/tidyenum.h
+++ b/include/tidyenum.h
@ -171,6 +171,7 @@ extern "C" {
    FN(ESCAPED_ILLEGAL_URI)           \
    FN(FIXED_BACKSLASH)               \
    FN(ID_NAME_MISMATCH)              \
+    FN(ILLEGAL_URI_CODEPOINT)         \
    FN(ILLEGAL_URI_REFERENCE)         \
    FN(INSERTING_AUTO_ATTRIBUTE)      \
    FN(INVALID_ATTRIBUTE)             \
--- a/src/attrs.c
+++ b/src/attrs.c
@ -1476,15 +1476,64 @@ static void CheckLowerCaseAttrValue( TidyDocImpl* doc, Node *node, AttVal *attva

 /* methods for checking value of a specific attribute */

+/* Reports whether the character starting at p is accepted in a URL.
+ *
+ * p          points at the first byte of the character to examine.
+ * increment  receives the value returned by TY_(GetUTF8) plus one; the
+ *            caller adds it to p to step to the next character.
+ *
+ * Returns non-zero when the decoded character is an ASCII letter or
+ * digit, one of the punctuation characters listed below, or falls in
+ * one of the listed non-ASCII ranges. '%' and '#' are accepted because
+ * whole URL strings are checked here, not individual URL components.
+ */
+static Bool IsURLCodePoint( ctmbstr p, uint *increment )
+{
+    uint c;
+    *increment = TY_(GetUTF8)( p, &c ) + 1;
+
+    /* isalnum() is only defined for unsigned char values and EOF, and
+       c may be any decoded code point, so limit the call to ASCII.
+       Non-ASCII characters are covered by the explicit ranges below. */
+    return ( c < 0x80 && isalnum( (int)c ) ) ||
+        c == '%' ||    /* not a valid codepoint, but an escape sequence */
+        c == '#' ||    /* not a valid codepoint, but a delimiter */
+        c == '!' ||
+        c == '$' ||
+        c == '&' ||
+        c == '\'' ||
+        c == '(' ||
+        c == ')' ||
+        c == '*' ||
+        c == '+' ||
+        c == ',' ||
+        c == '-' ||
+        c == '.' ||
+        c == '/' ||
+        c == ':' ||
+        c == ';' ||
+        c == '=' ||
+        c == '?' ||
+        c == '@' ||
+        c == '_' ||
+        c == '~' ||
+        (c >= 0x00A0 && c <= 0xD7FF) ||
+        (c >= 0xE000 && c <= 0xFDCF) ||
+        (c >= 0xFDF0 && c <= 0xFFEF) ||
+        (c >= 0x10000 && c <= 0x1FFFD) ||
+        (c >= 0x20000 && c <= 0x2FFFD) ||
+        (c >= 0x30000 && c <= 0x3FFFD) ||
+        (c >= 0x40000 && c <= 0x4FFFD) ||
+        (c >= 0x50000 && c <= 0x5FFFD) ||
+        (c >= 0x60000 && c <= 0x6FFFD) ||
+        (c >= 0x70000 && c <= 0x7FFFD) ||
+        (c >= 0x80000 && c <= 0x8FFFD) ||
+        (c >= 0x90000 && c <= 0x9FFFD) ||
+        (c >= 0xA0000 && c <= 0xAFFFD) ||
+        (c >= 0xB0000 && c <= 0xBFFFD) ||
+        (c >= 0xC0000 && c <= 0xCFFFD) ||
+        (c >= 0xD0000 && c <= 0xDFFFD) ||
+        (c >= 0xE0000 && c <= 0xEFFFD) ||
+        (c >= 0xF0000 && c <= 0xFFFFD) ||
+        (c >= 0x100000 && c <= 0x10FFFD);
+}
+
 void TY_(CheckUrl)( TidyDocImpl* doc, Node *node, AttVal *attval)
 {
-    tmbchar c; 
+    tmbchar c;
    tmbstr dest, p;
-    uint escape_count = 0, backslash_count = 0;
+    uint escape_count = 0, backslash_count = 0, bad_codepoint_count = 0;
    uint i, pos = 0;
    uint len;
+    uint increment;
    Bool isJavascript = no;
-    
+
    if (!AttrHasValue(attval))
    {
        TY_(ReportAttrError)( doc, node, attval, MISSING_ATTR_VALUE);
@ -1492,7 +1541,7 @@ void TY_(CheckUrl)( TidyDocImpl* doc, Node *node, AttVal *attval)
    }

    p = attval->value;
-    
+
    isJavascript =
        TY_(tmbstrncmp)(p,"javascript:",sizeof("javascript:")-1)==0;

@ -1508,6 +1557,14 @@ void TY_(CheckUrl)( TidyDocImpl* doc, Node *node, AttVal *attval)
            ++escape_count;
    }

+    while ( *p != 0 )
+    {
+        if ( !IsURLCodePoint( p, &increment ) )
+            ++bad_codepoint_count;
+         p = p + increment;
+    }
+    p = attval->value;
+
    if ( cfgBool(doc, TidyFixUri) && escape_count )
    {
        Bool hadnonspace = no;
@ -1557,6 +1614,10 @@ void TY_(CheckUrl)( TidyDocImpl* doc, Node *node, AttVal *attval)

        doc->badChars |= BC_INVALID_URI;
    }
+    if ( bad_codepoint_count )
+    {
+        TY_(ReportAttrError)( doc, node, attval, ILLEGAL_URI_CODEPOINT );
+    }
 }

 /* RFC 2396, section 4.2 states:
--- a/src/language_en.h
+++ b/src/language_en.h
@ -1821,6 +1821,7 @@ static languageDefinition language_en = { whichPluralForm_en, {
    { ESCAPED_ILLEGAL_URI,          0,   "%s escaping malformed URI reference"                                     }, /* ReportAttrError */
    { FIXED_BACKSLASH,              0,   "%s converting backslash in URI to slash"                                 }, /* ReportAttrError */
    { ID_NAME_MISMATCH,             0,   "%s id and name attribute value mismatch"                                 }, /* ReportAttrError */
+    { ILLEGAL_URI_CODEPOINT,        0,   "%s illegal characters found in URI"                                      }, /* ReportAttrError */
    { ILLEGAL_URI_REFERENCE,        0,   "%s improperly escaped URI reference"                                     }, /* ReportAttrError */
    { INSERTING_AUTO_ATTRIBUTE,     0,   "%s inserting \"%s\" attribute using value \"%s\""                        }, /* ReportAttrError */
    { INVALID_ATTRIBUTE,            0,   "%s attribute name \"%s\" (value=\"%s\") is invalid"                      }, /* ReportAttrError */
--- a/src/message.c
+++ b/src/message.c
@ -525,6 +525,7 @@ void TY_(ReportAttrError)(TidyDocImpl* doc, Node *node, AttVal *av, uint code)
        case ID_NAME_MISMATCH:
        case BACKSLASH_IN_URI:
        case FIXED_BACKSLASH:
+        case ILLEGAL_URI_CODEPOINT:
        case ILLEGAL_URI_REFERENCE:
        case ESCAPED_ILLEGAL_URI:
        case NEWLINE_IN_URI: