diff --git a/include/tidyenum.h b/include/tidyenum.h
index 313b1a3..f7cb3c7 100644
--- a/include/tidyenum.h
+++ b/include/tidyenum.h
@@ -171,6 +171,7 @@ extern "C" {
     FN(ESCAPED_ILLEGAL_URI) \
     FN(FIXED_BACKSLASH) \
     FN(ID_NAME_MISMATCH) \
+    FN(ILLEGAL_URI_CODEPOINT) \
     FN(ILLEGAL_URI_REFERENCE) \
     FN(INSERTING_AUTO_ATTRIBUTE) \
     FN(INVALID_ATTRIBUTE) \
diff --git a/src/attrs.c b/src/attrs.c
index 467a814..cbcb6a9 100644
--- a/src/attrs.c
+++ b/src/attrs.c
@@ -1476,15 +1476,76 @@ static void CheckLowerCaseAttrValue( TidyDocImpl* doc, Node *node, AttVal *attva
 
 /* methods for checking value of a specific attribute */
 
+/* Decodes the character starting at p and reports whether it is
+** acceptable inside a URL.  *increment receives the GetUTF8 result
+** plus one; the caller adds it to its pointer to reach the next
+** character.  '%' and '#' are accepted as well: they are not URL
+** code points themselves but introduce escapes and fragments.
+*/
+static Bool IsURLCodePoint( ctmbstr p, uint *increment )
+{
+    uint c;
+    *increment = TY_(GetUTF8)( p, &c ) + 1;
+
+    /* The ASCII alphanumerics are tested by range rather than with
+    ** isalnum(): c is a decoded code point that can be far larger than
+    ** UCHAR_MAX, and handing such a value to a <ctype.h> classifier is
+    ** undefined behaviour (its answer is also locale dependent).
+    */
+    return (c >= '0' && c <= '9') ||
+            (c >= 'A' && c <= 'Z') ||
+            (c >= 'a' && c <= 'z') ||
+            c == '%' ||    /* not a valid codepoint, but an escape sequence */
+            c == '#' ||    /* not a valid codepoint, but a delimiter */
+            c == '!' ||
+            c == '$' ||
+            c == '&' ||
+            c == '\'' ||
+            c == '(' ||
+            c == ')' ||
+            c == '*' ||
+            c == '+' ||
+            c == ',' ||
+            c == '-' ||
+            c == '.' ||
+            c == '/' ||
+            c == ':' ||
+            c == ';' ||
+            c == '=' ||
+            c == '?' ||
+            c == '@' ||
+            c == '_' ||
+            c == '~' ||
+            (c >= 0x00A0 && c <= 0xD7FF) ||
+            (c >= 0xE000 && c <= 0xFDCF) ||
+            (c >= 0xFDF0 && c <= 0xFFEF) ||
+            (c >= 0x10000 && c <= 0x1FFFD) ||
+            (c >= 0x20000 && c <= 0x2FFFD) ||
+            (c >= 0x30000 && c <= 0x3FFFD) ||
+            (c >= 0x40000 && c <= 0x4FFFD) ||
+            (c >= 0x50000 && c <= 0x5FFFD) ||
+            (c >= 0x60000 && c <= 0x6FFFD) ||
+            (c >= 0x70000 && c <= 0x7FFFD) ||
+            (c >= 0x80000 && c <= 0x8FFFD) ||
+            (c >= 0x90000 && c <= 0x9FFFD) ||
+            (c >= 0xA0000 && c <= 0xAFFFD) ||
+            (c >= 0xB0000 && c <= 0xBFFFD) ||
+            (c >= 0xC0000 && c <= 0xCFFFD) ||
+            (c >= 0xD0000 && c <= 0xDFFFD) ||
+            (c >= 0xE0000 && c <= 0xEFFFD) ||
+            (c >= 0xF0000 && c <= 0xFFFFD) ||
+            (c >= 0x100000 && c <= 0x10FFFD);
+}
+
 void TY_(CheckUrl)( TidyDocImpl* doc, Node *node, AttVal *attval)
 {
-    tmbchar c; 
+    tmbchar c;
     tmbstr dest, p;
-    uint escape_count = 0, backslash_count = 0;
+    uint escape_count = 0, backslash_count = 0, bad_codepoint_count = 0;
     uint i, pos = 0;
     uint len;
+    uint increment;
     Bool isJavascript = no;
-    
+
     if (!AttrHasValue(attval))
     {
         TY_(ReportAttrError)( doc, node, attval, MISSING_ATTR_VALUE);
@@ -1492,7 +1553,7 @@ void TY_(CheckUrl)( TidyDocImpl* doc, Node *node, AttVal *attval)
     }
 
     p = attval->value;
-    
+
     isJavascript =
         TY_(tmbstrncmp)(p,"javascript:",sizeof("javascript:")-1)==0;
 
@@ -1508,6 +1569,14 @@ void TY_(CheckUrl)( TidyDocImpl* doc, Node *node, AttVal *attval)
             ++escape_count;
     }
 
+    while ( *p != 0 )
+    {
+        if ( !IsURLCodePoint( p, &increment ) )
+            ++bad_codepoint_count;
+        p = p + increment;
+    }
+    p = attval->value;
+
     if ( cfgBool(doc, TidyFixUri) && escape_count )
     {
         Bool hadnonspace = no;
@@ -1557,6 +1626,10 @@ void TY_(CheckUrl)( TidyDocImpl* doc, Node *node, AttVal *attval)
 
         doc->badChars |= BC_INVALID_URI;
     }
+    if ( bad_codepoint_count )
+    {
+        TY_(ReportAttrError)( doc, node, attval, ILLEGAL_URI_CODEPOINT );
+    }
 }
 
 /* RFC 2396, section 4.2 states:
diff --git a/src/language_en.h b/src/language_en.h
index 24b8aa1..cabc39b 100644
--- a/src/language_en.h
+++ b/src/language_en.h
@@ -1821,6 +1821,7 @@ static languageDefinition language_en = { whichPluralForm_en, {
     { ESCAPED_ILLEGAL_URI, 0, "%s escaping malformed URI reference" }, /* ReportAttrError */
     { FIXED_BACKSLASH, 0, "%s converting backslash in URI to slash" }, /* ReportAttrError */
     { ID_NAME_MISMATCH, 0, "%s id and name attribute value mismatch" }, /* ReportAttrError */
+    { ILLEGAL_URI_CODEPOINT, 0, "%s illegal characters found in URI" }, /* ReportAttrError */
     { ILLEGAL_URI_REFERENCE, 0, "%s improperly escaped URI reference" }, /* ReportAttrError */
     { INSERTING_AUTO_ATTRIBUTE, 0, "%s inserting \"%s\" attribute using value \"%s\"" }, /* ReportAttrError */
     { INVALID_ATTRIBUTE, 0, "%s attribute name \"%s\" (value=\"%s\") is invalid" }, /* ReportAttrError */
diff --git a/src/message.c b/src/message.c
index 2c5587e..48c300f 100755
--- a/src/message.c
+++ b/src/message.c
@@ -525,6 +525,7 @@ void TY_(ReportAttrError)(TidyDocImpl* doc, Node *node, AttVal *av, uint code)
     case ID_NAME_MISMATCH:
     case BACKSLASH_IN_URI:
     case FIXED_BACKSLASH:
+    case ILLEGAL_URI_CODEPOINT:
     case ILLEGAL_URI_REFERENCE:
     case ESCAPED_ILLEGAL_URI:
     case NEWLINE_IN_URI: