HTML Tidy now parses HTML non-recursively.
Instead of recursive calls for each nested level of HTML, the next level is pushed to a stack on the heap, and returned to the main loop. This prevents stack overflow at _n_ depth (where _n_ is operating-system dependent). It's probably still possible to use all of the heap memory, but Tidy's allocators already fail gracefully in this circumstance. Please report any regressions of your own HTML! NOTE: the XML parser is not affected, and is probably still highly recursive.
This commit is contained in:
parent
b6f7e43842
commit
91f29ea7b8
4
regression_testing/cases/dev-cases/case-001.conf
Executable file
4
regression_testing/cases/dev-cases/case-001.conf
Executable file
|
@ -0,0 +1,4 @@
|
|||
# Config for test case.
|
||||
tidy-mark: no
|
||||
indent: yes
|
||||
wrap: 999
|
26
regression_testing/cases/dev-cases/case-001@0.html
Executable file
26
regression_testing/cases/dev-cases/case-001@0.html
Executable file
|
@ -0,0 +1,26 @@
|
|||
<!--
|
||||
This test case represents HTML…
|
||||
-->
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>This is a title</title>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div>
|
||||
<p>This is the first paragraph.</p>
|
||||
<p>Now now, second paragraph?</p>
|
||||
<div>
|
||||
<p>I'm nested in a div.</p>
|
||||
<ul>
|
||||
<li>List item one.
|
||||
<li>List item two. There isn't a third. Hahaha.</li>
|
||||
</ul>
|
||||
<p>Because, you know, lists should have a minimum of three items.</p>
|
||||
</div>
|
||||
<p>Penultimate paragraphs are sometimes the best.</p>
|
||||
</div>
|
||||
<p>Don't Cray; Buy Amiga!</p>
|
||||
</body>
|
||||
</html>
|
4
regression_testing/cases/dev-cases/case-002.conf
Executable file
4
regression_testing/cases/dev-cases/case-002.conf
Executable file
|
@ -0,0 +1,4 @@
|
|||
# Config for test case.
|
||||
tidy-mark: no
|
||||
indent: yes
|
||||
wrap: 999
|
33
regression_testing/cases/dev-cases/case-002@1.html
Executable file
33
regression_testing/cases/dev-cases/case-002@1.html
Executable file
|
@ -0,0 +1,33 @@
|
|||
<!--
|
||||
This test case tests the datalist element and the datalist parser.
|
||||
Oddly, there's not an existing test case that has the datalist element.
|
||||
-->
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>This is a title</title>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<label for="ice-cream-choice">Choose a flavor:</label>
|
||||
<input list="ice-cream-flavors" id="ice-cream-choice" name="ice-cream-choice" />
|
||||
|
||||
<datalist id="ice-cream-flavors">
|
||||
<option value="Chocolate">
|
||||
<option value="Coconut">
|
||||
<option value="Mint">
|
||||
<option value="Strawberry">
|
||||
<option value="Vanilla">
|
||||
</datalist>
|
||||
|
||||
<label for="myBrowser">Choose a browser from this list:</label>
|
||||
<input list="browsers" id="myBrowser" name="myBrowser" />
|
||||
<datalist id="browsers">
|
||||
<option value="Chrome">
|
||||
<option value="Firefox">
|
||||
<option value="Internet Explorer">
|
||||
<option value="Opera">
|
||||
<option value="Safari">
|
||||
<option value="Microsoft Edge">
|
||||
</body>
|
||||
</html>
|
4
regression_testing/cases/dev-cases/case-003.conf
Executable file
4
regression_testing/cases/dev-cases/case-003.conf
Executable file
|
@ -0,0 +1,4 @@
|
|||
# Config for test case.
|
||||
tidy-mark: no
|
||||
indent: yes
|
||||
wrap: 999
|
27
regression_testing/cases/dev-cases/case-003@1.html
Normal file
27
regression_testing/cases/dev-cases/case-003@1.html
Normal file
|
@ -0,0 +1,27 @@
|
|||
<!--
|
||||
This test case tests the definition list element and parser.
|
||||
-->
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>case-003</title></head>
|
||||
<body>
|
||||
|
||||
<dl>
|
||||
<dd>
|
||||
<div>
|
||||
<table summary="">
|
||||
<tr>
|
||||
<center>
|
||||
<td>What is up?</td>
|
||||
</tr>
|
||||
</table>
|
||||
</div>
|
||||
<dd>
|
||||
</dd>
|
||||
<center>Hello</center>
|
||||
</dl>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
|
||||
|
4
regression_testing/cases/dev-cases/case-004.conf
Executable file
4
regression_testing/cases/dev-cases/case-004.conf
Executable file
|
@ -0,0 +1,4 @@
|
|||
# Config for test case.
|
||||
tidy-mark: no
|
||||
indent: yes
|
||||
wrap: 999
|
41
regression_testing/cases/dev-cases/case-004@1.html
Normal file
41
regression_testing/cases/dev-cases/case-004@1.html
Normal file
|
@ -0,0 +1,41 @@
|
|||
<!--
|
||||
This test case tests the optgroup element and parser.
|
||||
-->
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>case-004</title></head>
|
||||
<body>
|
||||
|
||||
<label for="dino-select">Choose a dinosaur:</label>
|
||||
<select id="dino-select">
|
||||
<optgroup label="Theropods">
|
||||
<option>Tyrannosaurus</option>
|
||||
<option>Velociraptor</option>
|
||||
<option>Deinonychus</option>
|
||||
</optgroup>
|
||||
<optgroup label="Sauropods">
|
||||
<option>Diplodocus</option>
|
||||
<option>Saltasaurus</option>
|
||||
<option>Apatosaurus</option>
|
||||
</optgroup>
|
||||
</select>
|
||||
|
||||
<optgroup label="Body Parts">
|
||||
<option>Claws</option>
|
||||
<option>Teeth</option>
|
||||
<option>Tail Spikes</option>
|
||||
</optgroup>
|
||||
|
||||
<optgroup label="Movies">
|
||||
<optgroup label="Scifi">
|
||||
<option>Jurassic Park</option>
|
||||
</optgroup>
|
||||
<option>The Good Dinosaur</option>
|
||||
<option>The Land Before Time</option>
|
||||
</optgroup>
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
|
||||
|
41
regression_testing/cases/dev-expects/case-001.html
Normal file
41
regression_testing/cases/dev-expects/case-001.html
Normal file
|
@ -0,0 +1,41 @@
|
|||
<!--
|
||||
This test case represents HTML…
|
||||
-->
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>
|
||||
This is a title
|
||||
</title>
|
||||
</head>
|
||||
<body>
|
||||
<div>
|
||||
<p>
|
||||
This is the first paragraph.
|
||||
</p>
|
||||
<p>
|
||||
Now now, second paragraph?
|
||||
</p>
|
||||
<div>
|
||||
<p>
|
||||
I'm nested in a div.
|
||||
</p>
|
||||
<ul>
|
||||
<li>List item one.
|
||||
</li>
|
||||
<li>List item two. There isn't a third. Hahaha.
|
||||
</li>
|
||||
</ul>
|
||||
<p>
|
||||
Because, you know, lists should have a minimum of three items.
|
||||
</p>
|
||||
</div>
|
||||
<p>
|
||||
Penultimate paragraphs are sometimes the best.
|
||||
</p>
|
||||
</div>
|
||||
<p>
|
||||
Don't Cray; Buy Amiga!
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
14
regression_testing/cases/dev-expects/case-001.txt
Normal file
14
regression_testing/cases/dev-expects/case-001.txt
Normal file
|
@ -0,0 +1,14 @@
|
|||
line 17 column 13 - Info: missing optional end tag </li>
|
||||
Info: Document content looks like HTML5
|
||||
No warnings or errors were found.
|
||||
|
||||
About HTML Tidy: https://github.com/htacg/tidy-html5
|
||||
Bug reports and comments: https://github.com/htacg/tidy-html5/issues
|
||||
Official mailing list: https://lists.w3.org/Archives/Public/public-htacg/
|
||||
Latest HTML specification: https://html.spec.whatwg.org/multipage/
|
||||
Validate your HTML documents: https://validator.w3.org/nu/
|
||||
Lobby your company to join the W3C: https://www.w3.org/Consortium
|
||||
|
||||
Do you speak a language other than English, or a different variant of
|
||||
English? Consider helping us to localize HTML Tidy. For details please see
|
||||
https://github.com/htacg/tidy-html5/blob/master/README/LOCALIZE.md
|
39
regression_testing/cases/dev-expects/case-002.html
Normal file
39
regression_testing/cases/dev-expects/case-002.html
Normal file
|
@ -0,0 +1,39 @@
|
|||
<!--
|
||||
This test case tests the datalist element and the datalist parser.
|
||||
Oddly, there's not an existing test case that has the datalist element.
|
||||
-->
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>
|
||||
This is a title
|
||||
</title>
|
||||
</head>
|
||||
<body>
|
||||
<label for="ice-cream-choice">Choose a flavor:</label> <input list="ice-cream-flavors" id="ice-cream-choice" name="ice-cream-choice"> <datalist id="ice-cream-flavors">
|
||||
<option value="Chocolate">
|
||||
</option>
|
||||
<option value="Coconut">
|
||||
</option>
|
||||
<option value="Mint">
|
||||
</option>
|
||||
<option value="Strawberry">
|
||||
</option>
|
||||
<option value="Vanilla">
|
||||
</option>
|
||||
</datalist> <label for="myBrowser">Choose a browser from this list:</label> <input list="browsers" id="myBrowser" name="myBrowser"> <datalist id="browsers">
|
||||
<option value="Chrome">
|
||||
</option>
|
||||
<option value="Firefox">
|
||||
</option>
|
||||
<option value="Internet Explorer">
|
||||
</option>
|
||||
<option value="Opera">
|
||||
</option>
|
||||
<option value="Safari">
|
||||
</option>
|
||||
<option value="Microsoft Edge">
|
||||
</option>
|
||||
</datalist>
|
||||
</body>
|
||||
</html>
|
16
regression_testing/cases/dev-expects/case-002.txt
Normal file
16
regression_testing/cases/dev-expects/case-002.txt
Normal file
|
@ -0,0 +1,16 @@
|
|||
line 32 column 1 - Warning: discarding unexpected </body>
|
||||
line 33 column 1 - Warning: discarding unexpected </html>
|
||||
line 25 column 5 - Warning: missing </datalist>
|
||||
Info: Document content looks like HTML5
|
||||
Tidy found 3 warnings and 0 errors!
|
||||
|
||||
About HTML Tidy: https://github.com/htacg/tidy-html5
|
||||
Bug reports and comments: https://github.com/htacg/tidy-html5/issues
|
||||
Official mailing list: https://lists.w3.org/Archives/Public/public-htacg/
|
||||
Latest HTML specification: https://html.spec.whatwg.org/multipage/
|
||||
Validate your HTML documents: https://validator.w3.org/nu/
|
||||
Lobby your company to join the W3C: https://www.w3.org/Consortium
|
||||
|
||||
Do you speak a language other than English, or a different variant of
|
||||
English? Consider helping us to localize HTML Tidy. For details please see
|
||||
https://github.com/htacg/tidy-html5/blob/master/README/LOCALIZE.md
|
30
regression_testing/cases/dev-expects/case-003.html
Normal file
30
regression_testing/cases/dev-expects/case-003.html
Normal file
|
@ -0,0 +1,30 @@
|
|||
<!--
|
||||
This test case tests the definition list element and parser.
|
||||
-->
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>
|
||||
case-003
|
||||
</title>
|
||||
</head>
|
||||
<body>
|
||||
<dl>
|
||||
<dd>
|
||||
<div>
|
||||
<table summary="">
|
||||
<tr>
|
||||
<td>
|
||||
What is up?
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</div>
|
||||
</dd>
|
||||
<dd></dd>
|
||||
</dl>
|
||||
<center>
|
||||
Hello
|
||||
</center>
|
||||
</body>
|
||||
</html>
|
26
regression_testing/cases/dev-expects/case-003.txt
Normal file
26
regression_testing/cases/dev-expects/case-003.txt
Normal file
|
@ -0,0 +1,26 @@
|
|||
line 14 column 7 - Warning: <center> isn't allowed in <tr> elements
|
||||
line 13 column 5 - Info: <tr> previously mentioned
|
||||
line 14 column 7 - Warning: missing </center> before <td>
|
||||
line 10 column 3 - Info: missing optional end tag </dd>
|
||||
line 12 column 5 - Warning: The summary attribute on the <table> element is obsolete in HTML5
|
||||
line 14 column 7 - Warning: trimming empty <center>
|
||||
line 21 column 3 - Warning: <center> element removed from HTML5
|
||||
line 12 column 5 - Warning: <table> attribute "summary" not allowed for HTML5
|
||||
Info: Document content looks like HTML5
|
||||
Tidy found 6 warnings and 0 errors!
|
||||
|
||||
One or more empty elements were present in the source document but
|
||||
dropped on output. If these elements are necessary or you don't want
|
||||
this behavior, then consider setting the option "drop-empty-elements"
|
||||
to no.
|
||||
|
||||
About HTML Tidy: https://github.com/htacg/tidy-html5
|
||||
Bug reports and comments: https://github.com/htacg/tidy-html5/issues
|
||||
Official mailing list: https://lists.w3.org/Archives/Public/public-htacg/
|
||||
Latest HTML specification: https://html.spec.whatwg.org/multipage/
|
||||
Validate your HTML documents: https://validator.w3.org/nu/
|
||||
Lobby your company to join the W3C: https://www.w3.org/Consortium
|
||||
|
||||
Do you speak a language other than English, or a different variant of
|
||||
English? Consider helping us to localize HTML Tidy. For details please see
|
||||
https://github.com/htacg/tidy-html5/blob/master/README/LOCALIZE.md
|
61
regression_testing/cases/dev-expects/case-004.html
Normal file
61
regression_testing/cases/dev-expects/case-004.html
Normal file
|
@ -0,0 +1,61 @@
|
|||
<!--
|
||||
This test case tests the optgroup element and parser.
|
||||
-->
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>
|
||||
case-004
|
||||
</title>
|
||||
</head>
|
||||
<body>
|
||||
<label for="dino-select">Choose a dinosaur:</label> <select id="dino-select">
|
||||
<optgroup label="Theropods">
|
||||
<option>
|
||||
Tyrannosaurus
|
||||
</option>
|
||||
<option>
|
||||
Velociraptor
|
||||
</option>
|
||||
<option>
|
||||
Deinonychus
|
||||
</option>
|
||||
</optgroup>
|
||||
<optgroup label="Sauropods">
|
||||
<option>
|
||||
Diplodocus
|
||||
</option>
|
||||
<option>
|
||||
Saltasaurus
|
||||
</option>
|
||||
<option>
|
||||
Apatosaurus
|
||||
</option>
|
||||
</optgroup>
|
||||
</select>
|
||||
<optgroup label="Body Parts">
|
||||
<option>
|
||||
Claws
|
||||
</option>
|
||||
<option>
|
||||
Teeth
|
||||
</option>
|
||||
<option>
|
||||
Tail Spikes
|
||||
</option>
|
||||
</optgroup>
|
||||
<optgroup label="Movies">
|
||||
<optgroup label="Scifi">
|
||||
<option>
|
||||
Jurassic Park
|
||||
</option>
|
||||
</optgroup>
|
||||
<option>
|
||||
The Good Dinosaur
|
||||
</option>
|
||||
<option>
|
||||
The Land Before Time
|
||||
</option>
|
||||
</optgroup>
|
||||
</body>
|
||||
</html>
|
14
regression_testing/cases/dev-expects/case-004.txt
Normal file
14
regression_testing/cases/dev-expects/case-004.txt
Normal file
|
@ -0,0 +1,14 @@
|
|||
line 30 column 5 - Warning: <optgroup> can't be nested
|
||||
Info: Document content looks like HTML5
|
||||
Tidy found 1 warning and 0 errors!
|
||||
|
||||
About HTML Tidy: https://github.com/htacg/tidy-html5
|
||||
Bug reports and comments: https://github.com/htacg/tidy-html5/issues
|
||||
Official mailing list: https://lists.w3.org/Archives/Public/public-htacg/
|
||||
Latest HTML specification: https://html.spec.whatwg.org/multipage/
|
||||
Validate your HTML documents: https://validator.w3.org/nu/
|
||||
Lobby your company to join the W3C: https://www.w3.org/Consortium
|
||||
|
||||
Do you speak a language other than English, or a different variant of
|
||||
English? Consider helping us to localize HTML Tidy. For details please see
|
||||
https://github.com/htacg/tidy-html5/blob/master/README/LOCALIZE.md
|
15
regression_testing/cases/special-cases/README.txt
Normal file
15
regression_testing/cases/special-cases/README.txt
Normal file
|
@ -0,0 +1,15 @@
|
|||
About this test suite:
|
||||
======================
|
||||
|
||||
These test files represent the standard regression testing that must be
|
||||
performed prior to committing changes to Tidy's code. In some circumstances
|
||||
results are platform specific and these notices will be displayed in the
|
||||
testing results.
|
||||
|
||||
This test set is intended to collect tests that we **don't** want to run
|
||||
automatically, either because they take a long time to complete, or their
|
||||
expectations files are much too large for keeping the repository a reasonable
|
||||
size.
|
||||
|
||||
Because there's no matching -expects directory, these cases will NOT be included
|
||||
in the default testing service.
|
4
regression_testing/cases/special-cases/case-evil.conf
Executable file
4
regression_testing/cases/special-cases/case-evil.conf
Executable file
|
@ -0,0 +1,4 @@
|
|||
# Config for test case.
|
||||
tidy-mark: no
|
||||
indent: yes
|
||||
wrap: 999
|
6
regression_testing/cases/special-cases/case-evil@1.html
Normal file
6
regression_testing/cases/special-cases/case-evil@1.html
Normal file
File diff suppressed because one or more lines are too long
7234
src/parser.c
7234
src/parser.c
File diff suppressed because it is too large
Load diff
33
src/parser.h
33
src/parser.h
|
@ -41,30 +41,6 @@
|
|||
******************************************************************************/
|
||||
|
||||
|
||||
/**
|
||||
* The parsers keep track of their states with the states defined here, and
|
||||
* use these symbols when pushing to the stack so that they can later recreate
|
||||
* their environments when re-entered.
|
||||
*/
|
||||
typedef enum {
|
||||
/* Universal states. */
|
||||
STATE_INITIAL, /**< This is the initial state for every parser. */
|
||||
STATE_COMPLETE, /**< Complete! */
|
||||
STATE_PARSE_TAG,
|
||||
STATE_PARSE_TAG_DONE,
|
||||
/* ParseHTML states. */
|
||||
STATE_PRE_HEAD, /**< In this state, we've not detected head yet. */
|
||||
STATE_PRE_BODY, /**< In this state, we'll consider frames vs. body. */
|
||||
STATE_PARSE_BODY, /**< In this state, we can parse the body. */
|
||||
STATE_PARSE_HEAD, /**< In this state, we will setup head for parsing. */
|
||||
STATE_PARSE_HEAD_DONE, /**< Resume here after parsing head. */
|
||||
STATE_PARSE_NOFRAMES, /**< In this state, we can parse noframes content. */
|
||||
STATE_PARSE_NOFRAMES_DONE, /**< In this state, we can restore more state. */
|
||||
STATE_PARSE_FRAMESET, /**< In this state, we will parse frameset content. */
|
||||
STATE_PARSE_FRAMESET_DONE, /**< We need to cleanup some things after parsing frameset. */
|
||||
} parserState;
|
||||
|
||||
|
||||
/**
|
||||
* This typedef represents the state of a parser when it enters and exits.
|
||||
* When the parser needs to finish work on the way back up the stack, it will
|
||||
|
@ -75,10 +51,12 @@ typedef struct _TidyParserMemory
|
|||
{
|
||||
Parser *identity; /**< Which parser pushed this record? */
|
||||
Node *original_node; /**< Originally provided node at entry. */
|
||||
Node *reentry_node; /**< A node a parser might want to save. */
|
||||
GetTokenMode reentry_mode; /**< The mode to use for the next node. */
|
||||
parserState reentry_state; /**< State to set during re-entry. */
|
||||
Node *reentry_node; /**< The node with which to re-enter. */
|
||||
GetTokenMode reentry_mode; /**< The token mode to use when re-entering. */
|
||||
int reentry_state; /**< State to set during re-entry. Defined locally in each parser. */
|
||||
GetTokenMode mode; /**< The caller will peek at this value to get the correct mode. */
|
||||
Bool register_b_1; /**< Local variable storage. */
|
||||
Bool register_b_2; /**< Local variable storage. */
|
||||
} TidyParserMemory;
|
||||
|
||||
|
||||
|
@ -89,7 +67,6 @@ typedef struct _TidyParserMemory
|
|||
typedef struct _TidyParserStack
|
||||
{
|
||||
TidyParserMemory* content; /**< A state record. */
|
||||
TidyAllocator* allocator; /**< The allocator used for creating. */
|
||||
uint size; /**< Current size of the stack. */
|
||||
int top; /**< Top of the stack. */
|
||||
} TidyParserStack;
|
||||
|
|
|
@ -67,7 +67,7 @@ typedef enum
|
|||
** @param popStack A flag indicating that we are re-entering this parser, and
|
||||
** it should restore a state from the stack.
|
||||
*/
|
||||
typedef Node* (Parser)( TidyDocImpl* doc, Node *node, GetTokenMode mode, Bool popStack );
|
||||
typedef Node* (Parser)( TidyDocImpl* doc, Node *node, GetTokenMode mode );
|
||||
|
||||
|
||||
/** This typedef describes a function to be used to check the attributes
|
||||
|
|
Loading…
Reference in a new issue