From 26e7d9d4b04d07a5d42d549078be834c5e8be461 Mon Sep 17 00:00:00 2001 From: Jim Derry Date: Thu, 31 Dec 2015 13:57:34 +0800 Subject: [PATCH] Fixes Mac OS X encoding issues and harmonizes output across platforms. Previously Tidy produced different output based on the compilation target, NOT based on the file encoding and specified options. Every platform behaved identically except Mac OS. Now unless the encoding is specifically set to a Mac file type, all encoding assumptions are the same across platforms. --- include/tidyplatform.h | 10 ---------- src/lexer.c | 6 ++---- src/streamio.c | 22 ++++++---------------- src/streamio.h | 6 ------ 4 files changed, 8 insertions(+), 36 deletions(-) diff --git a/include/tidyplatform.h b/include/tidyplatform.h index ea999c5..939a74b 100644 --- a/include/tidyplatform.h +++ b/include/tidyplatform.h @@ -92,9 +92,6 @@ extern "C" { #define MAC_OS #define FILENAMES_CASE_SENSITIVE 0 #define strcasecmp strcmp -#ifndef DFLT_REPL_CHARENC -#define DFLT_REPL_CHARENC MACROMAN -#endif #endif /* Convenience defines for BSD like platforms */ @@ -362,13 +359,6 @@ extern "C" { #include <unistd.h> /* needed for unlink on some Unix systems */ #endif -/* This can be set at compile time. Usually Windows, -** except for Macintosh builds. -*/ -#ifndef DFLT_REPL_CHARENC -#define DFLT_REPL_CHARENC WIN1252 -#endif - /* By default, use case-sensitive filename comparison. */ #ifndef FILENAMES_CASE_SENSITIVE diff --git a/src/lexer.c b/src/lexer.c index 909211c..ace9452 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -1037,10 +1037,8 @@ static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode ) uint c1 = 0; int replaceMode = DISCARDED_CHAR; - if ( TY_(ReplacementCharEncoding) == WIN1252 ) - c1 = TY_(DecodeWin1252)( ch ); - else if ( TY_(ReplacementCharEncoding) == MACROMAN ) - c1 = TY_(DecodeMacRoman)( ch ); + /* Always assume Win1252 in this circumstance. 
*/ + c1 = TY_(DecodeWin1252)( ch ); if ( c1 ) replaceMode = REPLACED_CHAR; diff --git a/src/streamio.c b/src/streamio.c index 548f3d7..9742fe2 100644 --- a/src/streamio.c +++ b/src/streamio.c @@ -464,10 +464,7 @@ uint TY_(ReadChar)( StreamIn *in ) uint c1 = 0, replMode = DISCARDED_CHAR; Bool isVendorChar = ( in->encoding == WIN1252 || in->encoding == MACROMAN ); - Bool isWinChar = ( in->encoding == WIN1252 || - TY_(ReplacementCharEncoding) == WIN1252 ); - Bool isMacChar = ( in->encoding == MACROMAN || - TY_(ReplacementCharEncoding) == MACROMAN ); + Bool isMacChar = ( in->encoding == MACROMAN ); /* set error position just before offending character */ if (in->doc->lexer) @@ -476,10 +473,10 @@ uint TY_(ReadChar)( StreamIn *in ) in->doc->lexer->columns = in->curcol; } - if ( isWinChar ) - c1 = TY_(DecodeWin1252)( c ); - else if ( isMacChar ) - c1 = TY_(DecodeMacRoman)( c ); + if ( isMacChar ) + c1 = TY_(DecodeMacRoman)( c ); + else + c1 = TY_(DecodeWin1252)( c ); if ( c1 ) replMode = REPLACED_CHAR; @@ -748,14 +745,7 @@ void TY_(WriteChar)( uint c, StreamOut* out ) ** Miscellaneous / Helpers ****************************/ -/* char encoding used when replacing illegal SGML chars, -** regardless of specified encoding. Set at compile time -** to either Windows or Mac. -*/ -const int TY_(ReplacementCharEncoding) = DFLT_REPL_CHARENC; - - -/* Mapping for Windows Western character set CP 1252 +/* Mapping for Windows Western character set CP 1252 ** (chars 128-159/U+0080-U+009F) to Unicode. */ static const uint Win2Unicode[32] = diff --git a/src/streamio.h b/src/streamio.h index 6e2d4b6..696cea9 100644 --- a/src/streamio.h +++ b/src/streamio.h @@ -181,12 +181,6 @@ int TY_(GetCharEncodingFromOptName)(ctmbstr charenc); #endif -/* char encoding used when replacing illegal SGML chars, -** regardless of specified encoding. Set at compile time -** to either Windows or Mac. 
-*/ -extern const int TY_(ReplacementCharEncoding); - /* Function for conversion from Windows-1252 to Unicode */ uint TY_(DecodeWin1252)(uint c);