217 lines
4.8 KiB
C
217 lines
4.8 KiB
C
#ifndef __STREAMIO_H__
|
|
#define __STREAMIO_H__
|
|
|
|
/* streamio.h -- handles character stream I/O

  (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
  See tidy.h for the copyright notice.

  Wrapper around Tidy input source and output sink
  that calls appropriate interfaces, and applies
  necessary char encoding transformations: to/from
  ISO-10646 and/or UTF-8.

*/
|
|
|
|
#include "forward.h"
|
|
#include "buffio.h"
|
|
#include "fileio.h"
|
|
|
|
#ifdef __cplusplus
|
|
extern "C"
|
|
{
|
|
#endif
|
|
/* Kind of endpoint a stream is attached to.
**
** Stored in the `iotype` member of both struct _StreamIn and
** struct _StreamOut.
**
** NOTE(review): the per-value meanings below are inferred from the
** enumerator names and from the File/Buffer/User constructor prototypes
** in this header -- confirm against streamio.c.
*/
typedef enum
{
    FileIO,     /* presumably backed by a stdio FILE*            */
    BufferIO,   /* presumably backed by a TidyBuffer             */
    UserIO      /* presumably backed by caller-supplied callbacks */
} IOType;
|
|
|
|
/* states for ISO 2022

   A document in an ISO-2022 based encoding uses some ESC sequences called
   "designators" to switch character sets.  The designators defined and
   used in ISO-2022-JP are:

     "ESC" + "(" + ?        for ISO646 variants

     "ESC" + "$" + ?        and
     "ESC" + "$" + "(" + ?  for multibyte character sets

   NOTE(review): the state names appear to abbreviate the bytes of the
   designator consumed so far (ESC, then D for "$", P for "(") -- confirm
   against the state machine in streamio.c.
*/
typedef enum
{
    FSM_ASCII,
    FSM_ESC,
    FSM_ESCD,
    FSM_ESCDP,
    FSM_ESCP,
    FSM_NONASCII
} ISO2022State;
|
|
|
|
/************************
** Source
************************/
|
|
|
|
/* Fixed sizes used by the input stream.
**
** LASTPOS_SIZE is the element count of the `lastcols` array in
** struct _StreamIn.
** NOTE(review): CHARBUF_SIZE is presumably the initial capacity of the
** `charbuf` push-back buffer -- confirm against streamio.c.
*/
enum
{
    CHARBUF_SIZE = 5,
    LASTPOS_SIZE = 64
};
|
|
|
|
/* non-raw input is cleaned up*/
|
|
struct _StreamIn
|
|
{
|
|
ISO2022State state; /* FSM for ISO2022 */
|
|
Bool pushed;
|
|
TidyAllocator *allocator;
|
|
tchar* charbuf;
|
|
uint bufpos;
|
|
uint bufsize;
|
|
int tabs;
|
|
int lastcols[LASTPOS_SIZE];
|
|
unsigned short curlastpos; /* current last position in lastcols */
|
|
unsigned short firstlastpos; /* first valid last position in lastcols */
|
|
int curcol;
|
|
int curline;
|
|
int encoding;
|
|
IOType iotype;
|
|
|
|
TidyInputSource source;
|
|
|
|
#ifdef TIDY_WIN32_MLANG_SUPPORT
|
|
void* mlang;
|
|
#endif
|
|
|
|
#ifdef TIDY_STORE_ORIGINAL_TEXT
|
|
tmbstr otextbuf;
|
|
size_t otextsize;
|
|
uint otextlen;
|
|
#endif
|
|
|
|
/* Pointer back to document for error reporting */
|
|
TidyDocImpl* doc;
|
|
};
|
|
|
|
StreamIn* TY_(initStreamIn)( TidyDocImpl* doc, int encoding );
|
|
void TY_(freeStreamIn)(StreamIn* in);
|
|
|
|
StreamIn* TY_(FileInput)( TidyDocImpl* doc, FILE* fp, int encoding );
|
|
StreamIn* TY_(BufferInput)( TidyDocImpl* doc, TidyBuffer* content, int encoding );
|
|
StreamIn* TY_(UserInput)( TidyDocImpl* doc, TidyInputSource* source, int encoding );
|
|
|
|
int TY_(ReadBOMEncoding)(StreamIn *in);
|
|
uint TY_(ReadChar)( StreamIn* in );
|
|
void TY_(UngetChar)( uint c, StreamIn* in );
|
|
Bool TY_(IsEOF)( StreamIn* in );
|
|
|
|
|
|
/************************
** Sink
************************/
|
|
|
|
struct _StreamOut
|
|
{
|
|
int encoding;
|
|
ISO2022State state; /* for ISO 2022 */
|
|
uint nl;
|
|
|
|
#ifdef TIDY_WIN32_MLANG_SUPPORT
|
|
void* mlang;
|
|
#endif
|
|
|
|
IOType iotype;
|
|
TidyOutputSink sink;
|
|
};
|
|
|
|
StreamOut* TY_(FileOutput)( TidyDocImpl *doc, FILE* fp, int encoding, uint newln );
|
|
StreamOut* TY_(BufferOutput)( TidyDocImpl *doc, TidyBuffer* buf, int encoding, uint newln );
|
|
StreamOut* TY_(UserOutput)( TidyDocImpl *doc, TidyOutputSink* sink, int encoding, uint newln );
|
|
|
|
StreamOut* TY_(StdErrOutput)(void);
|
|
/* StreamOut* StdOutOutput(void); */
|
|
void TY_(ReleaseStreamOut)( TidyDocImpl *doc, StreamOut* out );
|
|
|
|
void TY_(WriteChar)( uint c, StreamOut* out );
|
|
void TY_(outBOM)( StreamOut *out );
|
|
|
|
/* Conversions between Tidy encoding ids and their names.
** NOTE(review): the behaviour for an unknown id or name is not visible
** in this header (presumably NULL / a negative value) -- confirm in
** streamio.c.
*/
ctmbstr TY_(GetEncodingNameFromTidyId)(uint id);
ctmbstr TY_(GetEncodingOptNameFromTidyId)(uint id);
int TY_(GetCharEncodingFromOptName)(ctmbstr charenc);
|
|
|
|
/************************
** Misc
************************/
|
|
|
|
/* character encodings
**
** Ids 0-8 are always defined.  The UTF-16 ids exist only when
** SUPPORT_UTF16_ENCODINGS is non-zero, and the ids given to BIG5 and
** SHIFTJIS depend on whether the UTF-16 ids are present, so that the
** numbering stays contiguous.
*/
#define RAW         0
#define ASCII       1
#define LATIN0      2
#define LATIN1      3
#define UTF8        4
#define ISO2022     5
#define MACROMAN    6
#define WIN1252     7
#define IBM858      8

#if SUPPORT_UTF16_ENCODINGS
#define UTF16LE     9
#define UTF16BE     10
#define UTF16       11
#endif

/* Note that Big5 and SHIFTJIS are not converted to ISO 10646 codepoints
** (i.e., to Unicode) before being recoded into UTF-8. This may be
** confusing: usually UTF-8 implies ISO10646 codepoints.
*/
#if SUPPORT_ASIAN_ENCODINGS
#if SUPPORT_UTF16_ENCODINGS
#define BIG5        12
#define SHIFTJIS    13
#else
#define BIG5        9
#define SHIFTJIS    10
#endif
#endif

#ifdef TIDY_WIN32_MLANG_SUPPORT
/* hack: windows code page numbers start at 37 */
#define WIN32MLANG  36
#endif
|
|
|
|
|
|
/* char encoding used when replacing illegal SGML chars,
** regardless of specified encoding.  Set at compile time
** to either Windows or Mac.
**
** Declared here only; the definition lives in a single .c file.
*/
extern const int TY_(ReplacementCharEncoding);
|
|
|
|
/* Function for conversion from Windows-1252 to Unicode */
|
|
uint TY_(DecodeWin1252)(uint c);
|
|
|
|
/* Function to convert from MacRoman to Unicode */
|
|
uint TY_(DecodeMacRoman)(uint c);
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
|
|
/* Use numeric constants as opposed to escape chars (\r, \n)
** to avoid conflict with Mac compilers that may re-define these.
*/
#define CR    0xD
#define LF    0xA
|
|
|
|
/* Default newline setting for the platform being compiled for:
** CR on classic Mac OS, CRLF on Windows and OS/2, LF everywhere else.
** The TidyCR / TidyCRLF / TidyLF constants are defined outside this
** header.
*/
#if defined(MAC_OS_CLASSIC)
#define DEFAULT_NL_CONFIG TidyCR
#elif defined(_WIN32) || defined(OS2_OS)
#define DEFAULT_NL_CONFIG TidyCRLF
#else
#define DEFAULT_NL_CONFIG TidyLF
#endif
|
|
|
|
|
|
#endif /* __STREAMIO_H__ */
|