//============================================================================== // intextstream.cpp // Generic input text stream. // // Notice // Copyright ©2008-2011 by David R. Tribble, all rights reserved. // Permission is granted to any person or entity except those designated // by the United States Department of State as a terrorist, or terrorist // government or agency, to use and distribute this source code provided // that the original copyright notice remains present and unaltered. //============================================================================== // Identification static char REV[] = "@(#)drt/src/lib/intextstream.cpp $Revision: 1.3 $$Date: 2011/04/30 00:27:55 $"; // Includes #include #define sys_ctype_h #include #define sys_errno_h #include #define sys_iso646_h #include #define sys_stdio_h #include #define sys_string_h #if _WIN32 #define WINDOWS_LEAN_AND_MEAN #include #include #endif #include "intextstream.hpp" // Manifest constants #define BUF_SIZE (16 * 1024) // I/O vbuffer size //------------------------------------------------------------------------------ // InTextStream::~InTextStream() // Destructor. //------------------------------------------------------------------------------ /*virtual*/ InTextStream::~InTextStream() { #if InTextStream_VS != 201 #error class InTextStream has changed #endif // De-initialize close(); } //------------------------------------------------------------------------------ // InTextStream::InTextStream() // Constructor. //------------------------------------------------------------------------------ InTextStream::InTextStream(): TextStream(), m_get(NULL), m_ungetCh(CH_EOF) { #if InTextStream_VS != 201 #error class InTextStream has changed #endif // Initialize } //------------------------------------------------------------------------------ // InTextStream::openPrep() // Prepare to open an input stream. //------------------------------------------------------------------------------ bool InTextStream::openPrep(enum FileType ftype, enum EolnType eoln) { #if InTextStream_VS/100 != 2 #error class InTextStream has changed #endif // Sanity check if (m_fp != NULL) return false; // Check the file type switch (ftype) { case FTYPE_7NONE: case FTYPE_7MARK: case FTYPE_7EVEN: case FTYPE_7ODD: m_get = &get_7bit; break; case FTYPE_8BIT: m_get = &get_8bit; break; case FTYPE_UTF8: m_get = &get_utf8; break; case FTYPE_UTF16: m_get = &get_utf16; break; case FTYPE_UTF16_R: m_get = &get_utf16r; break; case FTYPE_UTF32: m_get = &get_utf32; break; case FTYPE_UTF32_R: m_get = &get_utf32r; break; case FTYPE_24BIT: m_get = &get_24bit; break; case FTYPE_24BIT_R: m_get = &get_24bitr; break; default: return false; } // Check the newline type switch (eoln) { case EOLN_NONE: case EOLN_CR: case EOLN_CRLF: case EOLN_LF: case EOLN_NEL: case EOLN_ANY: break; default: return false; } // Done return true; } //------------------------------------------------------------------------------ // InTextStream::open() // Open an existing stream as an input stream. //------------------------------------------------------------------------------ /*virtual*/ bool InTextStream::open(FILE *fp, enum FileType ftype, enum EolnType eoln) { #if InTextStream_VS/100 != 2 #error class InTextStream has changed #endif // Sanity check if (fp == NULL) return false; // Prepare to open the stream if (not openPrep(ftype, eoln)) return false; // Open the stream #if _WIN32 ::setmode(::fileno(fp), O_BINARY); #endif #ifdef IS_NOT_USED___ delete[] m_vbuf; m_vbuf = new char[BUF_SIZE]; if (::setvbuf(fp, m_vbuf, _IOFBF, BUF_SIZE) != 0) { delete[] m_vbuf; m_vbuf = NULL; } #endif m_fp = fp; m_mode = MODE_READ; m_ftype = ftype; m_eoln = eoln; return true; } //------------------------------------------------------------------------------ // InTextStream::open() // Open a file as an input stream. //------------------------------------------------------------------------------ /*virtual*/ bool InTextStream::open(const char *fname, enum FileType ftype, enum EolnType eoln) { #if InTextStream_VS/100 != 2 #error class InTextStream has changed #endif FILE * fp; // Sanity check if (fname == NULL or fname[0] == '\0') return false; // Prepare to open the stream if (not openPrep(ftype, eoln)) return false; // Open a file m_errno = 0; fp = ::fopen(fname, "rb"); if (fp == NULL) { m_errno = errno; return false; } // Success delete[] m_vbuf; m_vbuf = new char[BUF_SIZE]; if (::setvbuf(fp, m_vbuf, _IOFBF, BUF_SIZE) != 0) { delete[] m_vbuf; m_vbuf = NULL; } m_fp = fp; m_mode = MODE_READ; m_ftype = ftype; m_eoln = eoln; m_ungetCh = CH_EOF; return true; } //------------------------------------------------------------------------------ // InTextStream::close() // Close the input stream. //------------------------------------------------------------------------------ /*virtual*/ bool InTextStream::close() { #if InTextStream_VS/100 != 2 #error class InTextStream has changed #endif FILE * fp; bool rc = true; // Sanity check if (m_fp == NULL) return true; // Close the stream fp = m_fp; m_fp = NULL; if (fp != stdin) { m_errno = 0; if (::fclose(fp) < 0) { m_errno = errno; rc = false; } else { delete[] m_vbuf; m_vbuf = NULL; } } m_mode = MODE_NONE; return rc; } //------------------------------------------------------------------------------ // InTextStream::read() // Read a character from the input stream. // // Returns // A Unicode character code in the range [0x0000,0xFFFF], or // CH_EOLN if a newline sequence was read, or // CH_EOF if the end of the stream (end of file) was reached. //------------------------------------------------------------------------------ int InTextStream::read() { #if InTextStream_VS/100 != 2 #error class InTextStream has changed #endif bool eoln_cr; bool eoln_lf; bool eoln_crlf; bool eoln_nel; // Set up eoln_cr = (m_eoln & EOLN_CR) != 0; eoln_lf = (m_eoln & EOLN_LF) != 0; eoln_crlf = (m_eoln & EOLN_CRLF) != 0; eoln_nel = (m_eoln & EOLN_NEL) != 0; // Read the next character or newline sequence for (;;) { int ch; int rc; // Read the next input char if (m_ungetCh != CH_EOF) { ch = m_ungetCh; m_ungetCh = CH_EOF; } else ch = (*m_get)(m_fp); rc = ch; // Check for an end of line sequence if (ch < 0) { // End of file rc = CH_EOF; } else if (ch == CH_CR) { if (eoln_crlf) { // Check for CR+LF sequence ch = (*m_get)(m_fp); if (ch == CH_LF) { // CR+LF, end of line rc = CH_EOLN; } else { // CR only m_ungetCh = ch; if (eoln_cr) rc = CH_EOLN; } } else if (eoln_cr) { // CR, end of line rc = CH_EOLN; } } else if (ch == CH_LF and eoln_lf) { // LF, end of line rc = CH_EOLN; } else if (ch == CH_NEL and eoln_nel) { // NEL, end of line rc = CH_EOLN; } else if (ch == CH_BOM_R) { // BOM (byte-reversed), reverse the input byte order if (m_get == &get_utf16) { m_get = &get_utf16r; continue; } else if (m_get == &get_utf16r) { m_get = &get_utf16; continue; } else if (m_get == &get_utf32) { m_get = &get_utf32r; continue; } else if (m_get == &get_utf32r) { m_get = &get_utf32; continue; } else if (m_get == &get_24bit) { m_get = &get_24bitr; continue; } else if (m_get == &get_24bitr) { m_get = &get_24bit; continue; } } else if (ch == CH_BOM) { // BOM, ignore continue; } return rc; } } //------------------------------------------------------------------------------ // InTextStream::read() // Read characters from the input stream. // // Returns // Number of characters written into 'buf', or -1 if the end of the stream // (end of file) was reached. // Note that newline sequences are written into 'buf' as CH_EOLN codes. //------------------------------------------------------------------------------ int InTextStream::read(int buf[], int len) { #if InTextStream_VS/100 != 2 #error class InTextStream has changed #endif int cnt; bool eoln_cr; bool eoln_lf; bool eoln_crlf; bool eoln_nel; // Sanity checks if (buf == NULL) return -1; if (m_fp == NULL) return -1; // Set up eoln_cr = (m_eoln & EOLN_CR) != 0; eoln_lf = (m_eoln & EOLN_LF) != 0; eoln_crlf = (m_eoln & EOLN_CRLF) != 0; eoln_nel = (m_eoln & EOLN_NEL) != 0; // Read characters from the input stream cnt = 0; while (cnt < len) { int ch; int rc; // Read the next input char if (m_ungetCh != CH_EOF) { ch = m_ungetCh; m_ungetCh = CH_EOF; } else ch = (*m_get)(m_fp); rc = ch; // Check for an end of line sequence if (ch < 0) { // End of file return (cnt == 0 ? -1 : cnt); } else if (ch == CH_CR and eoln_cr) { if (eoln_crlf) { // Check for CR+LF sequence ch = (*m_get)(m_fp); if (ch == CH_LF) { // CR+LF, end of line buf[cnt++] = CH_EOLN; } else { // CR only m_ungetCh = ch; if (eoln_cr) buf[cnt++] = CH_EOLN; } } else if (eoln_cr) { // CR, end of line buf[cnt++] = CH_EOLN; } } else if (ch == CH_LF and eoln_lf) { // LF, end of line buf[cnt++] = CH_EOLN; } else if (ch == CH_NEL and eoln_nel) { // NEL, end of line buf[cnt++] = CH_EOLN; } else if (ch == CH_BOM_R) { // BOM (byte-reversed), reverse the input byte order if (m_get == &get_utf16) { m_get = &get_utf16r; continue; } else if (m_get == &get_utf16r) { m_get = &get_utf16; continue; } else if (m_get == &get_utf32) { m_get = &get_utf32r; continue; } else if (m_get == &get_utf32r) { m_get = &get_utf32; continue; } else if (m_get == &get_24bit) { m_get = &get_24bitr; continue; } else if (m_get == &get_24bitr) { m_get = &get_24bit; continue; } } else if (ch == CH_BOM) { // BOM, ignore continue; } } return cnt; } //------------------------------------------------------------------------------ // InTextStream::get_7bit() // Read a 7-bit ASCII with parity character from the (input) stream. //------------------------------------------------------------------------------ /*static*/ int InTextStream::get_7bit(FILE *in) { #if InTextStream_VS/100 != 2 #error class InTextStream has changed #endif int ch; // Read a character ch = getc(in); if (ch == EOF) return -1; return ch & 0x7F; } //------------------------------------------------------------------------------ // InTextStream::get_8bit() // Read an 8-bit (ASCII or ISO 8859-x) character from the (input) stream. //------------------------------------------------------------------------------ /*static*/ int InTextStream::get_8bit(FILE *in) { #if InTextStream_VS/100 != 2 #error class InTextStream has changed #endif int ch; // Read a character ch = getc(in); if (ch == EOF) return -1; return ch; } //------------------------------------------------------------------------------ // InTextStream::get_utf8() // Read a UTF-8 character from the (input) stream. //------------------------------------------------------------------------------ /*static*/ int InTextStream::get_utf8(FILE *in) { #if InTextStream_VS/100 != 2 #error class InTextStream has changed #endif int ch1, ch2; // Read a UTF-8 character ch1 = getc(in); if (ch1 == EOF) return -1; if (ch1 < 0x80) { // 1-byte encoding for [0x0000,0x007F] // as <0xxxxxxx>, 7 bits in 1 byte return ch1; } else if ((ch1 & 0xE0) == 0xC0) { // 2-byte encoding for [0x0080,0x07FF] // as <110xxxxx,10xxxxxx>, 11 bits in 2 bytes ch1 &= 0x1F; ch2 = getc(in); if (ch2 == EOF) return ch1; ch1 = (ch1 << 6) | (ch2 & 0x3F); return ch1; } else if ((ch1 & 0xF0) == 0xE0) { // 3-byte encoding for [0x0800,0xFFFF] // as <1110xxxx,10xxxxxx,10xxxxxx>, 16 bits in 3 bytes ch1 &= 0x0F; ch2 = getc(in); if (ch2 == EOF) return ch1; ch1 = (ch1 << 6) + (ch2 & 0x3F); ch2 = getc(in); if (ch2 == EOF) return ch1; ch1 = (ch1 << 6) | (ch2 & 0x3F); return ch1; } else { // 4-byte encoding for [0x00010000,0x0010FFFF] // as <11110xxx,10xxxxxx,10xxxxxx,10xxxxxx,[...]>, 21+ bits in 4+ bytes ch1 &= 0x07; while (ch2 = getc(in), ch2 != EOF and (ch2 & 0xC0) == 0x80) ch1 = (ch1 << 6) + (ch2 & 0x3F); if (ch1 < 0) ch1 &= (-1U >> 1); // Note: works only on 2's-complement CPUs return ch1; } } //------------------------------------------------------------------------------ // InTextStream::get_utf16() // Read a UTF-16 character from the (input) stream. //------------------------------------------------------------------------------ /*static*/ int InTextStream::get_utf16(FILE *in) { #if InTextStream_VS/100 != 2 #error class InTextStream has changed #endif int lo, hi; // Read a character hi = getc(in); if (hi == EOF) return -1; lo = getc(in); if (lo == EOF) return -1; return (hi << 8) + lo; } //------------------------------------------------------------------------------ // InTextStream::get_utf16r() // Read a UTF-16 little-endian character from the (input) stream. //------------------------------------------------------------------------------ /*static*/ int InTextStream::get_utf16r(FILE *in) { #if InTextStream_VS/100 != 2 #error class InTextStream has changed #endif int lo, hi; // Read a character lo = getc(in); if (lo == EOF) return -1; hi = getc(in); if (hi == EOF) return -1; return (hi << 8) + lo; } //------------------------------------------------------------------------------ // InTextStream::get_utf32() // Read a UTF-32 character from the (input) stream. // Note that the returned code is in the range [0x00000000,0x7FFFFFFF], // which may require the truncation of the hit (MSB) bits of the character. //------------------------------------------------------------------------------ /*static*/ int InTextStream::get_utf32(FILE *in) { #if InTextStream_VS/100 != 2 #error class InTextStream has changed #endif int ch, lo; // Read a character ch = getc(in); if (ch == EOF) return -1; lo = getc(in); if (lo == EOF) return -1; ch = (ch << 8) + lo; lo = getc(in); if (lo == EOF) return -1; ch = (ch << 8) + lo; lo = getc(in); if (lo == EOF) return -1; ch = (ch << 8) + lo; return ch & 0x7FFFFFFF; } //------------------------------------------------------------------------------ // InTextStream::get_utf32r() // Read a UTF-32 little-endian character from the (input) stream. // Note that the returned code is in the range [0x00000000,0x7FFFFFFF], // which may require the truncation of the hit (MSB) bits of the character. //------------------------------------------------------------------------------ /*static*/ int InTextStream::get_utf32r(FILE *in) { #if InTextStream_VS/100 != 2 #error class InTextStream has changed #endif int ch, hi; // Read a character ch = getc(in); if (ch == EOF) return -1; hi = getc(in); if (hi == EOF) return -1; ch = (hi << 8*1) + ch; hi = getc(in); if (hi == EOF) return -1; ch = (hi << 8*2) + ch; hi = getc(in); if (hi == EOF) return -1; ch = (hi << 8*3) + ch; return ch & 0x7FFFFFFF; } //------------------------------------------------------------------------------ // InTextStream::get_24bit() // Read a 24-bit character from the (input) stream. //------------------------------------------------------------------------------ /*static*/ int InTextStream::get_24bit(FILE *in) { #if InTextStream_VS/100 != 2 #error class InTextStream has changed #endif int ch, lo; // Read a character ch = getc(in); if (ch == EOF) return -1; lo = getc(in); if (lo == EOF) return -1; ch = (ch << 8) + lo; lo = getc(in); if (lo == EOF) return -1; ch = (ch << 8) + lo; return ch; } //------------------------------------------------------------------------------ // InTextStream::get_24bitr() // Read a 24-bit little-endian character from the (input) stream. //------------------------------------------------------------------------------ /*static*/ int InTextStream::get_24bitr(FILE *in) { #if InTextStream_VS/100 != 2 #error class InTextStream has changed #endif int ch, hi; // Read a character ch = getc(in); if (ch == EOF) return -1; hi = getc(in); if (hi == EOF) return -1; ch = (hi << 8*1) + ch; hi = getc(in); if (hi == EOF) return -1; ch = (hi << 8*2) + ch; return ch; } // End intextstream.cpp