tidy-html5/src/streamio.h

217 lines
4.8 KiB
C
Raw Normal View History

2011-11-17 02:44:16 +00:00
#ifndef __STREAMIO_H__
#define __STREAMIO_H__
/* streamio.h -- handles character stream I/O
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
Wrapper around Tidy input source and output sink
that calls appropriate interfaces, and applies
necessary char encoding transformations: to/from
ISO-10646 and/or UTF-8.
*/
#include "forward.h"
#include "buffio.h"
#include "fileio.h"
#ifdef __cplusplus
extern "C"
{
#endif
typedef enum
{
FileIO,
BufferIO,
UserIO
} IOType;
/* states for ISO 2022
A document in ISO-2022 based encoding uses some ESC sequences called
"designator" to switch character sets. The designators defined and
used in ISO-2022-JP are:
"ESC" + "(" + ? for ISO646 variants
"ESC" + "$" + ? and
"ESC" + "$" + "(" + ? for multibyte character sets
*/
typedef enum
{
FSM_ASCII,
FSM_ESC,
FSM_ESCD,
FSM_ESCDP,
FSM_ESCP,
FSM_NONASCII
} ISO2022State;
/************************
** Source
************************/
enum
{
CHARBUF_SIZE=5,
LASTPOS_SIZE=64
};
/* non-raw input is cleaned up*/
struct _StreamIn
{
ISO2022State state; /* FSM for ISO2022 */
Bool pushed;
TidyAllocator *allocator;
tchar* charbuf;
uint bufpos;
uint bufsize;
int tabs;
int lastcols[LASTPOS_SIZE];
unsigned short curlastpos; /* current last position in lastcols */
unsigned short firstlastpos; /* first valid last position in lastcols */
int curcol;
int curline;
int encoding;
IOType iotype;
TidyInputSource source;
#ifdef TIDY_WIN32_MLANG_SUPPORT
void* mlang;
#endif
#ifdef TIDY_STORE_ORIGINAL_TEXT
tmbstr otextbuf;
size_t otextsize;
uint otextlen;
#endif
/* Pointer back to document for error reporting */
TidyDocImpl* doc;
};
StreamIn* TY_(initStreamIn)( TidyDocImpl* doc, int encoding );
void TY_(freeStreamIn)(StreamIn* in);
StreamIn* TY_(FileInput)( TidyDocImpl* doc, FILE* fp, int encoding );
StreamIn* TY_(BufferInput)( TidyDocImpl* doc, TidyBuffer* content, int encoding );
StreamIn* TY_(UserInput)( TidyDocImpl* doc, TidyInputSource* source, int encoding );
int TY_(ReadBOMEncoding)(StreamIn *in);
uint TY_(ReadChar)( StreamIn* in );
void TY_(UngetChar)( uint c, StreamIn* in );
Bool TY_(IsEOF)( StreamIn* in );
/************************
** Sink
************************/
struct _StreamOut
{
int encoding;
ISO2022State state; /* for ISO 2022 */
uint nl;
#ifdef TIDY_WIN32_MLANG_SUPPORT
void* mlang;
#endif
IOType iotype;
TidyOutputSink sink;
};
StreamOut* TY_(FileOutput)( TidyDocImpl *doc, FILE* fp, int encoding, uint newln );
StreamOut* TY_(BufferOutput)( TidyDocImpl *doc, TidyBuffer* buf, int encoding, uint newln );
StreamOut* TY_(UserOutput)( TidyDocImpl *doc, TidyOutputSink* sink, int encoding, uint newln );
StreamOut* TY_(StdErrOutput)(void);
/* StreamOut* StdOutOutput(void); */
void TY_(ReleaseStreamOut)( TidyDocImpl *doc, StreamOut* out );
void TY_(WriteChar)( uint c, StreamOut* out );
void TY_(outBOM)( StreamOut *out );
ctmbstr TY_(GetEncodingNameFromTidyId)(uint id);
ctmbstr TY_(GetEncodingOptNameFromTidyId)(uint id);
int TY_(GetCharEncodingFromOptName)(ctmbstr charenc);
/************************
** Misc
************************/
/* character encodings
*/
#define RAW 0
#define ASCII 1
#define LATIN0 2
#define LATIN1 3
#define UTF8 4
#define ISO2022 5
#define MACROMAN 6
#define WIN1252 7
#define IBM858 8
#if SUPPORT_UTF16_ENCODINGS
#define UTF16LE 9
#define UTF16BE 10
#define UTF16 11
#endif
/* Note that Big5 and SHIFTJIS are not converted to ISO 10646 codepoints
** (i.e., to Unicode) before being recoded into UTF-8. This may be
** confusing: usually UTF-8 implies ISO10646 codepoints.
*/
#if SUPPORT_ASIAN_ENCODINGS
#if SUPPORT_UTF16_ENCODINGS
#define BIG5 12
#define SHIFTJIS 13
#else
#define BIG5 9
#define SHIFTJIS 10
#endif
#endif
#ifdef TIDY_WIN32_MLANG_SUPPORT
/* hack: windows code page numbers start at 37 */
#define WIN32MLANG 36
#endif
/* char encoding used when replacing illegal SGML chars,
** regardless of specified encoding. Set at compile time
** to either Windows or Mac.
*/
extern const int TY_(ReplacementCharEncoding);
/* Function for conversion from Windows-1252 to Unicode */
uint TY_(DecodeWin1252)(uint c);
/* Function to convert from MacRoman to Unicode */
uint TY_(DecodeMacRoman)(uint c);
#ifdef __cplusplus
}
#endif
/* Use numeric constants as opposed to escape chars (\r, \n)
** to avoid conflict Mac compilers that may re-define these.
*/
#define CR 0xD
#define LF 0xA
#if defined(MAC_OS_CLASSIC)
#define DEFAULT_NL_CONFIG TidyCR
#elif defined(_WIN32) || defined(OS2_OS)
#define DEFAULT_NL_CONFIG TidyCRLF
#else
#define DEFAULT_NL_CONFIG TidyLF
#endif
#endif /* __STREAMIO_H__ */