From c6e0ccce1f65dc4d36fea17e5b8fdff43b9fd416 Mon Sep 17 00:00:00 2001 From: Geoff McLane Date: Sat, 21 Nov 2020 16:47:58 +0100 Subject: [PATCH] Is. #896 - make 'bare' docs match code (#898) * Is. #896 - make 'bare' docs match code * Is. #487 #462 add warn msg and do not get stuck until eof The warning message could perhaps be better worded, and maybe there should be another msg when a '>' is encountered while looking for a ']' in an MS Word section, and perhaps the section should be discarded... And perhaps it should be an error, to force the user to fix... But the fix is good as it is, and these issues can be dealt with later... And this fix is piggybacked on this PR, but it is likewise related to 'word-2000' option... --- include/tidyenum.h | 2 +- src/clean.c | 3 +-- src/language_en.h | 6 +++--- src/lexer.c | 7 ++++++- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/include/tidyenum.h b/include/tidyenum.h index 7b35fbb..e34d1ca 100644 --- a/include/tidyenum.h +++ b/include/tidyenum.h @@ -610,7 +610,7 @@ typedef enum TidyLiteralAttribs, /**< If true attributes may use newlines */ TidyLogicalEmphasis, /**< Replace i by em and b by strong */ TidyLowerLiterals, /**< Folds known attribute values to lower case */ - TidyMakeBare, /**< Make bare HTML: remove Microsoft cruft */ + TidyMakeBare, /**< Replace smart quotes, em dashes, etc with ASCII */ TidyMakeClean, /**< Replace presentational clutter by style rules */ TidyMark, /**< Add meta element indicating tidied doc */ TidyMergeDivs, /**< Merge multiple DIVs */ diff --git a/src/clean.c b/src/clean.c index dc6cac1..6602ff9 100644 --- a/src/clean.c +++ b/src/clean.c @@ -1893,8 +1893,7 @@ void TY_(CleanWord2000)( TidyDocImpl* doc, Node *node) if ( nodeIsHTML(node) ) { /* check that it's a Word 2000 document */ - if ( !TY_(GetAttrByName)(node, "xmlns:o") && - !cfgBool(doc, TidyMakeBare) ) + if ( !TY_(IsWord2000) (doc) ) /* Is. 
#896 */ return; /* Output proprietary attributes to maintain errout compatability diff --git a/src/language_en.h b/src/language_en.h index 8d0eb7a..eab5567 100644 --- a/src/language_en.h +++ b/src/language_en.h @@ -786,9 +786,9 @@ static languageDefinition language_en = { whichPluralForm_en, { - The strings "Tidy" and "HTML Tidy" are the program name and must not be translated. */ TidyMakeBare, 0, - "This option specifies if Tidy should strip Microsoft specific HTML " - "from Word 2000 documents, and output spaces rather than non-breaking " - "spaces where they exist in the input. " + "This option specifies if Tidy should replace smart quotes and em dashes with " + "ASCII, and output spaces rather than non-breaking " + "spaces, where they exist in the input. " }, {/* Important notes for translators: - Use only , , , , and diff --git a/src/lexer.c b/src/lexer.c index ef70e13..49b74f5 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -2777,6 +2777,7 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode ) } + TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT_DROPPING); /* Is. #487 */ /* else swallow characters up to and including next '>' */ while ((c = TY_(ReadChar)(doc->docIn)) != '>') @@ -3340,7 +3341,11 @@ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode ) } } - if (c != ']') + if (c == '>') + { + /* Is. #462 - reached '>' before ']' */ + TY_(UngetChar)(c, doc->docIn); + } else if (c != ']') continue; /* now look for '>' */