Eventually complete a 2007 fix

This commit is contained in:
Geoff McLane 2015-09-16 13:17:50 +02:00
parent cd8dca2d4c
commit d541405a2a
7 changed files with 104 additions and 2 deletions

View file

@ -205,6 +205,7 @@ typedef enum
TidyMergeSpans, /**< Merge multiple SPANs */
TidyAnchorAsName, /**< Define anchors as name attributes */
TidyPPrintTabs, /**< Indent using tabs instead of spaces */
TidySkipQuotes, /**< Skip quotes and comments in script CDATA */
N_TIDY_OPTIONS /**< Must be last */
} TidyOptionId;

View file

@ -321,6 +321,7 @@ static const TidyOptionImpl option_defs[] =
{ TidyMergeSpans, MU, "merge-spans", IN, TidyAutoState, ParseAutoBool, autoBoolPicks },
{ TidyAnchorAsName, MU, "anchor-as-name", BL, yes, ParseBool, boolPicks },
{ TidyPPrintTabs, PP, "indent-with-tabs", BL, no, ParseTabs, boolPicks }, /* 20150515 - Issue #108 */
{ TidySkipQuotes, MU, "skip-quotes", BL, no, ParseBool, boolPicks }, /* 1642186 - Issue #65 */
{ N_TIDY_OPTIONS, XX, NULL, XY, 0, NULL, NULL }
};

View file

@ -1947,6 +1947,73 @@ static Bool ExpectsContent(Node *node)
return yes;
}
/*\
 * Issue #65 - also see http://tidy.sf.net/issue/1642186
 * Parser too greedy over <script> blocks
 *
 * Scan the text collected so far in the lexer buffer, from
 * lexer->txtstart up to (not including) lexer->lexsize, and report
 * whether the scan ends inside a construct in which tags must not
 * be recognised.
 *
 * Returns
 *   yes = the end of the scanned text lies inside a quoted string
 *         (single or double quotes), a line comment (two slashes,
 *         up to end of line) or a slash-star block comment
 *   no  = not in any of the above.
 *
 * This is to avoid tidy finding tags in quoted or comment text.
 *
 * Scanning rules:
 *  - a backslash escapes the character that follows it, so an
 *    escaped quote does not delimit a string, while a quote that
 *    follows an escaped backslash does;
 *  - inside a string only the same quote character that opened it
 *    closes it; the other quote type is plain text;
 *  - the star of a block comment opener is never reused as the
 *    star of its terminator, and the slash of a terminator is
 *    never reused to open another comment.
 *
 * Controlled by option --skip-quotes yes|no, enum as
 * TidySkipQuotes, off by default.
\*/
static Bool IsInQuotesorComment( Lexer * lexer )
{
    unsigned int i;
    Bool inq = no;          /* inside a quoted string */
    Bool toeol = no;        /* inside a line comment, until END OF LINE */
    Bool toec = no;         /* inside a block comment, until END OF COMMENT */
    Bool esc = no;          /* previous char was an unescaped backslash */
    unsigned char prev = 0; /* previous char, used for 2-char markers */
    unsigned char quot = 0; /* quote char that opened the current string */
    unsigned char c;

    for ( i = lexer->txtstart; i < lexer->lexsize; i++ )
    {
        c = lexer->lexbuf[i];
        if ( toeol )
        {
            /* continue until END OF LINE */
            if ( c == '\n' )
                toeol = no;
        }
        else if ( toec )
        {
            /* continue until END OF COMMENT */
            if ( ( c == '/' ) && ( prev == '*' ) )
            {
                toec = no;
                c = 0; /* closing '/' must not pair with the next char */
            }
        }
        else if ( esc )
        {
            /* this character is escaped: it can not delimit a string */
            esc = no;
        }
        else if ( c == '\\' )
        {
            esc = yes;
        }
        else if ( ( c == '"' ) || ( c == '\'' ) )
        {
            if ( !inq )
            {
                inq = yes;
                quot = c; /* keep type of start quote - single or double */
            }
            else if ( c == quot )
            {
                inq = no;
            }
            /* else: the other quote type inside a string is plain text */
        }
        else if ( !inq && ( prev == '/' ) )
        {
            if ( c == '/' )
            {
                toeol = yes; /* set in comment, until END OF LINE */
            }
            else if ( c == '*' )
            {
                toec = yes; /* set until END OF COMMENT */
                c = 0;      /* opener's '*' must not help close the comment */
            }
        }
        prev = c;
    }
    return ( inq || toeol || toec ) ? yes : no;
}
/*
create a text node for the contents of
a CDATA element like style or script
@ -1971,6 +2038,7 @@ static Node *GetCDATA( TidyDocImpl* doc, Node *container )
Bool matches = no;
uint c;
Bool hasSrc = TY_(AttrGetById)(container, TidyAttr_SRC) != NULL;
Bool skipquotes = cfgBool(doc, TidySkipQuotes); /* #65 - get CONFIG option */
SetLexerLocus( doc, lexer );
lexer->waswhite = no;
@ -1991,6 +2059,13 @@ static Node *GetCDATA( TidyDocImpl* doc, Node *container )
continue;
}
/*\
* Issue #65 - sf 1642186 - if the option is enabled, try to skip over
* "...", '...', // ...\n, and C/C++ style block comments
\*/
if ( skipquotes && IsInQuotesorComment(lexer) )
continue;
c = TY_(ReadChar)(doc->docIn);
if (TY_(IsLetter)(c))

View file

@ -925,6 +925,10 @@ static const TidyOptionDoc option_docs[] =
"Note TidyTabSize controls converting input tabs to spaces. Set to zero "
"to retain input tabs. "
},
{TidySkipQuotes,
"This option specifies that Tidy should skip quotes, and comments "
"when parsing script data. "
},
{N_TIDY_OPTIONS,
NULL
}

View file

@ -0,0 +1,7 @@
skip-quotes: yes
indent: auto
tidy-mark: no
clean: yes
logical-emphasis: yes
indent-attributes: yes
show-info: no

View file

@ -0,0 +1,14 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Issue #65 - Parser too greedy over script blocks</title>
<script>
/* the <script */
var m1 = "\"<script \"";
var m2 = '<script '; // <script
</script>
</head>
<body>
</body>
</html>

View file

@ -1,2 +1,2 @@
5.1.9
2015.09.10
5.1.10
2015.09.16