tidy-html5/src/iconvtc.c

105 lines
2.6 KiB
C
Raw Normal View History

2011-11-17 02:44:16 +00:00
/* iconvtc.c -- Interface to iconv transcoding routines
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "tidy.h"
#include "forward.h"
#include "streamio.h"
#ifdef TIDY_ICONV_SUPPORT
#include <iconv.h>
/* maximum number of bytes for a single character */
#define TC_INBUFSIZE 16
/* size in bytes of the output buffer that receives one converted character */
#define TC_OUTBUFSIZE 16
/* Initialise the iconv input transcoder.
** Stub implementation: performs no work and always reports `no`.
*/
Bool IconvInitInputTranscoder(void)
{
    Bool initialised = no;

    return initialised;
}
/* Shut down the iconv input transcoder.
** Stub implementation: there is nothing to release.
*/
void IconvUninitInputTranscoder(void)
{
    /* intentionally empty */
}
/* Decode one character from an iconv-backed input stream.
**
** firstByte  - the first byte of the character, already read by the caller.
** in         - input stream; in->iconvptr must hold an open iconv descriptor
**              and in->source supplies any further bytes that are needed.
** bytesRead  - receives the number of input bytes consumed for this
**              character, including firstByte. It is written on every
**              return path.
**
** Returns the decoded code point, or EndOfStream when the source runs dry
** in the middle of a character or when no complete character could be
** decoded from TC_INBUFSIZE bytes.
**
** The code point is assembled from the first four output octets in
** little-endian order, i.e. the descriptor is expected to convert to
** UTF-32LE.
*/
int IconvGetChar(byte firstByte, StreamIn * in, uint * bytesRead)
{
    iconv_t cd;
    TidyInputSource * source;
    char inbuf[TC_INBUFSIZE] = { 0 };
    char outbuf[TC_OUTBUFSIZE] = { 0 };
    size_t inbufsize = 0;

    assert( in != NULL );
    assert( bytesRead != NULL );
    assert( in->iconvptr != 0 );

    cd = (iconv_t)in->iconvptr;
    source = &in->source;

    inbuf[inbufsize++] = (char)firstByte;

    for (;;)
    {
        /* iconv advances the pointers and decrements the counters, so */
        /* they are reset to cover the whole buffer on every attempt.  */
        char * outbufptr = (char*)outbuf;
        char * inbufptr  = (char*)inbuf;
        size_t readNow   = inbufsize;
        size_t writeNow  = TC_OUTBUFSIZE;
        size_t result    = 0;
        int iconv_errno  = 0;
        int nextByte     = EndOfStream;

        result = iconv(cd, (const char**)&inbufptr, &readNow, (char**)&outbufptr, &writeNow);
        iconv_errno = errno; /* save before anything else can clobber it */

        if (result != (size_t)(-1))
        {
            unsigned int c;

            /* create codepoint from UTF-32LE octets; the arithmetic is */
            /* done unsigned so that the top octet can be shifted into  */
            /* bits 24..31 without overflowing a signed int             */
            c  = (unsigned int)(unsigned char)outbuf[0];
            c |= (unsigned int)(unsigned char)outbuf[1] <<  8;
            c |= (unsigned int)(unsigned char)outbuf[2] << 16;
            c |= (unsigned int)(unsigned char)outbuf[3] << 24;

            /* set number of read bytes */
            *bytesRead = (uint)inbufsize;

            return (int)c;
        }

        assert( iconv_errno != EILSEQ ); /* broken multibyte sequence */
        assert( iconv_errno != E2BIG );  /* not enough memory */
        assert( iconv_errno == EINVAL ); /* incomplete sequence */
        (void)iconv_errno;               /* unused when asserts are compiled out */

        /* the buffer is full and still does not hold a whole character */
        if (inbufsize >= TC_INBUFSIZE)
            break;

        /* we need more bytes */
        nextByte = source->getByte(source->sourceData);

        if (nextByte == EndOfStream)
        {
            /* todo: error message for broken stream? */
            *bytesRead = (uint)inbufsize;
            return EndOfStream;
        }

        inbuf[inbufsize++] = (char)nextByte;
    }

    /* No full character found after reading TC_INBUFSIZE bytes, */
    /* give up to read this stream, it's obviously unreadable.   */
    /* todo: error message for broken stream? */
    *bytesRead = (uint)inbufsize;
    return EndOfStream;
}
#endif /* TIDY_ICONV_SUPPORT */