//============================================================================== // crlf.cpp // Convert newlines in text files. // Handles 8-bit (ASCII, ISO 8859-1) text files, 7-bit ASCII with parity, // and UTF-8, UTF-16, and UTF-32 Unicode files. // Handles CR, LF, CR+LF, NEL, Any, and None newline sequence modes. // // Usage // crlf [-option...] file... // // Bugs // The '-c', '-ch', and '-cu' (show control characters) options do not // operate well with the '-iw' and '-ow' (line width) options. The best // approach is to execute the program in two passes, e.g.: // crlf -c foo.txt | crlf -ow 80 - // // Notice // Copyright ©2008-2011 by David R. Tribble, all rights reserved. // Permission is granted to any person or entity except those designated // by the United States Department of State as a terrorist, or terrorist // government or agency, to use and distribute this source code provided // that the original copyright notice remains present and unaltered. //============================================================================== // Identification #define PROG "crlf" #ifndef VERS #define VERS "1.4" #endif #define DATE "2011-04-29" static char REV[] = "@(#)drt/src/cmd/crlf.cpp $Revision: 1.14 $$Date: 2011/04/30 01:00:46 $"; static char BUILT[] = "@(#)" "Built: " __DATE__ " " __TIME__; static char COPYRIGHT[] = "@(#)" "Copyright ©2008-2011 by David R. Tribble, all rights reserved."; // System definitions #if defined(_WIN32) #define OS_WIN32 1 #elif defined(unix) || defined(_unix) || defined(__unix) || defined(__unix__) #define OS_UNIX 1 #elif defined(_MAC) || defined(MACOS) || defined(_MACOS) #define OS_MACOS 1 #else #error Target operating system is unknown #endif // Includes #include #define sys_errno_h #include #define sys_iso646_h #include #define sys_stdlib_h #include #define sys_string_h #include "textstream.hpp" #include "intextstream.hpp" #include "outtextstream.hpp" // System-dependent constants #if OS_WIN32 #define OS_EOLN TextStream::EOLN_CRLF #define NL_MODE "CR+LF" #define DFL_CRLF " (default)" #define DFL_CR "" #define DFL_LF "" #elif OS_UNIX #define OS_EOLN TextStream::EOLN_LF #define NL_MODE "LF" #define DFL_CRLF "" #define DFL_CR "" #define DFL_LF " (default)" #elif OS_MACOS #define OS_EOLN TextStream::EOLN_CR #define NL_MODE "CR" #define DFL_CRLF "" #define DFL_CR " (default)" #define DFL_LF "" #else #define OS_EOLN TextStream::EOLN_CRLF #define NL_MODE "CR+LF" #define DFL_CRLF " (default)" #define DFL_CR "" #define DFL_LF "" #endif //------------------------------------------------------------------------------ // class Program // Program to convert newlines in text files. //------------------------------------------------------------------------------ #define Program_VS 105 // Class version class Program { // ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ // Constants public: enum { RC_OKAY = 0, // Success RC_READ = 1, // Can't read input file RC_WRITE = 2, // Can't write output file RC_USAGE = 255 // Bad program usage }; private: static const char *const s_usageMsg[]; // Usage messages // ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ // Static functions public: static void usage(); // Show a usage message & punt // ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ // Functions public: /**/ ~Program() { } // Destructor /**/ Program(); // Constructor int main(int argc, const char *const *argv); // Run this program private: /**/ Program(const Program &o); const Program & operator =(const Program &o); void convert(InTextStream *in, OutTextStream *out) const; // Convert a file // ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ // Variables private: const char * m_inName; // Input filename const char * m_outName; // Output filename enum TextStream::FileType m_inType; // Input file type enum TextStream::FileType m_outType; // Output file type enum TextStream::EolnType m_inEoln; // Input newline type enum TextStream::EolnType m_outEoln; // Output newline type int m_inWidth; // Max input line width int m_outWidth; // Max output line width int m_nlSpacing; // Output line spacing int m_inTabSize; // Input tab width bool m_ctrlZ; // Stop at Ctrl-Z char bool m_ignNulls; // Remove NUL chars bool m_ignCtrls; // Remove control chars bool m_ignNonNLs; // Remove non-newline ctrl chars bool m_bom; // Output starts with BOM bool m_endNL; // Output ends with newline bool m_showCtrls; // Output ctrl chars as escapes bool m_showUnic; // Output ctrl chars as \u codes bool m_showHTML; // Output ctrl chars as HTML bool m_pad; // Pad output w/ spaces }; //------------------------------------------------------------------------------ // Program::Program() // Constructor. //------------------------------------------------------------------------------ Program::Program(): m_inName(NULL), m_outName("-"), m_inType(TextStream::FTYPE_8BIT), m_outType(TextStream::FTYPE_8BIT), m_inEoln(TextStream::EOLN_ANY), m_outEoln(OS_EOLN), m_inWidth(0), m_outWidth(0), m_nlSpacing(1), m_inTabSize(0), m_ctrlZ(false), m_ignNulls(false), m_ignCtrls(false), m_ignNonNLs(false), m_bom(false), m_endNL(true), m_showCtrls(false), m_showUnic(false), m_showHTML(false), m_pad(false) { #if Program_VS != 105 #error class Program has changed #endif // Initialize } //------------------------------------------------------------------------------ // Program::convert() // Convert a file, translating newline sequences. //------------------------------------------------------------------------------ void Program::convert(InTextStream *in, OutTextStream *out) const { #if Program_VS/100 != 1 #error class Program has changed #endif int inColNo = 1; int outColNo = 1; int inSpaces = 0; int lastCh = TextStream::CH_EOLN; // Write a leading BOM if necessary if (m_bom) out->write(TextStream::CH_BOM); // Convert the file contents for (;;) { int ch; bool isCtrl = false; // Read the next char from the input stream if (inSpaces > 0) { inSpaces--; ch = TextStream::CH_SP; } else { ch = in->read(); if (ch == TextStream::CH_EOF) break; } // Check for ignorable chars if (ch == TextStream::CH_NUL and m_ignNulls) { // Ignore this char continue; } else if (ch == TextStream::CH_SUB and m_ctrlZ) { // Ctrl+Z (SUB), stop reading input break; } else if (ch == TextStream::CH_HT and m_inTabSize > 0) { // Convert a tab (HT) into one or more spaces inSpaces = m_inTabSize - ((inColNo - 1) % m_inTabSize); continue; } else if (m_ignNonNLs and (ch == TextStream::CH_CR or ch == TextStream::CH_LF)) { // Ignore this char continue; } else if ((ch >= 0x0000 and ch < 0x0020 or (ch >= 0x007F and ch <= 0x00A0))) { isCtrl = true; if (m_ignCtrls) { // Ignore this char continue; } } // Handle end of line { bool breakLn = false; // Check for end of line on input if (ch == TextStream::CH_EOLN) { inColNo = 0; breakLn = true; } else if (inColNo > m_inWidth and m_inWidth > 0) { inColNo = 1; breakLn = true; } // Check for end of line on output if (outColNo > m_outWidth and m_outWidth > 0) { // Output line width exceeded breakLn = true; } if (breakLn) { // Write newline(s) for (int n = 0; n < m_nlSpacing; n++) { // Pad the output line if necessary if (m_pad and m_outWidth > 0) { while (outColNo++ <= m_outWidth) out->write(TextStream::CH_SP); } // Write a newline out->writeln(); outColNo = 1; } } } // Write the input char to the output stream inColNo++; if (ch == TextStream::CH_EOLN) { // Do nothing } else if (isCtrl and m_showCtrls) { char buf[20]; const char * m = buf; // Show control char as an escape sequence if (m_showUnic) { if (ch <= 0xFFFF) ::sprintf(buf, "\\u%04X", ch); else ::sprintf(buf, "\\U%08X", ch); } else if (m_showHTML) { if (ch <= 0x00FF) ::sprintf(buf, "&#%d;", ch); else if (ch <= 0xFFFF) ::sprintf(buf, "X;", ch); else if (ch <= 0x00FFFFFF) ::sprintf(buf, "X;", ch); else ::sprintf(buf, "X;", ch); } else { switch (ch) { case TextStream::CH_BEL: m = "\\a"; break; case TextStream::CH_BS: m = "\\b"; break; case TextStream::CH_FF: m = "\\f"; break; case TextStream::CH_LF: m = "\\n"; break; case TextStream::CH_CR: m = "\\r"; break; case TextStream::CH_HT: m = "\\t"; break; case TextStream::CH_VT: m = "\\v"; break; case TextStream::CH_SUB: m = "\\z"; break; case TextStream::CH_ESC: m = "\\e"; break; case TextStream::CH_DEL: m = "\\d"; break; case TextStream::CH_NBSP: m = "\\s"; break; case '\\': m = "\\\\"; break; default: ::sprintf(buf, "\\x%02X", ch); break; } } out->write(m); outColNo += ::strlen(m); } else if (m_showHTML) { const char * m = NULL; // Show special char as an HTML entity switch (ch) { case '&': m = "&"; break; case '<': m = "<"; break; case '>': m = ">"; break; case '"': m = """; break; case '\'': m = "'"; break; default: break; } if (m != NULL) { out->write(m); outColNo += ::strlen(m); } else if (ch != TextStream::CH_EOLN) { out->write(ch); outColNo++; } } else { // Write a regular char out->write(ch); outColNo++; } lastCh = ch; } // Handle the last text line if (lastCh != TextStream::CH_EOLN and m_endNL) { // Force the output to end with newline(s) for (int n = 0; n < m_nlSpacing; n++) { // Pad the output line if necessary if (m_pad and m_outWidth > 0) { while (outColNo++ <= m_outWidth) out->write(TextStream::CH_SP); } // Write a newline out->writeln(); outColNo = 1; } } } //------------------------------------------------------------------------------ // Program::s_usageMsg[] // Program usage messages. // // See // Program::usage() //------------------------------------------------------------------------------ /*static*/ const char *const Program::s_usageMsg[] = { "[" PROG ", " VERS " " DATE "] (david.tribble.com)", "", "Convert newlines in text files.", "", "Handles 8-bit ASCII and ISO 8859-1 text files, 7-bit ASCII with parity, " "and", "UTF-8, UTF-16, and UTF-32 Unicode files. Handles CR, LF, CR/LF, and NEL", "newline sequences.", "", "Usage: " PROG " [-option...] file...", "", "Input options:", " -inone Input newlines are not converted, but read as is" " (implies -end)", " -iany Input newlines are CR, LF, or CR+LF (default)", " -icrlf Input newlines are CR+LF (0D 0A)", " -icr Input newlines are CR (0D)", " -ilf Input newlines are LF (0A)", " -inel Input newlines are NEL (85)", " -i8 Input characters are 8-bit ASCII or ISO 8859-1 (default)", " -i7 Input characters are 7-bit ASCII, parity ignored", " -iutf8 Input characters are UTF-8", " -iutf16 Input characters are UTF-16 big-endian", " -iutf16r Input characters are UTF-16 little-endian", " -iutf32 Input characters are UTF-32 big-endian", " -iutf32r Input characters are UTF-32 little-endian", " -i24 Input characters are 24-bit big-endian (non-standard)", " -i24r Input characters are 24-bit little-endian (non-standard)", " -iw num Maximum input line width (0 = no maximum)", " -nonl Ignore extraneous non-newline control characters (CR, LF)", " -nul Ignore null (NUL) control characters", " -ctl Ignore all control characters", " -t num Convert tabs to spaces (default is 0, no conversion)", " -z Input ends at the first Ctrl-Z (SUB) character", "", "Output options:", " -o file Output file (default is standard output)", " -onone No newlines are written to the output", " -ocrlf Output newlines are CR+LF (0D 0A)" DFL_CRLF, " -ocr Output newlines are CR (0D)" DFL_CR, " -olf Output newlines are LF (0A)" DFL_LF, " -onel Output newlines are NEL (85)", " -o8 Output characters are 8-bit ASCII or ISO 8859-1 (default)", " -o7 Output characters are 7-bit ASCII no (space) parity", " -o7e Output characters are 7-bit ASCII even parity", " -o7m Output characters are 7-bit ASCII mark parity", " -o7o Output characters are 7-bit ASCII odd parity", #ifdef is_not_supported___ " -ot num Output spaces as tabs (default is 0, no conversion)", #endif " -outf8 Output characters are Unicode UTF-8", " -outf16 Output characters are Unicode UTF-16 big-endian", " -outf16r Output characters are Unicode UTF-16 little-endian", " -outf32 Output characters are Unicode UTF-32 big-endian", " -outf32r Output characters are Unicode UTF-32 little-endian", " -o24 Output characters are 24-bit big-endian (non-standard)", " -o24r Output characters are 24-bit little-endian (non-standard)", " -opad Pad output lines with spaces (requires -ow)", " -ow num Maximum output line width (0 = no maximum)", " -bom Output begins with a Unicode byte order mark (BOM)", " -c Show control characters as C escape sequences", " -ch Show special characters as HTML entities", " -cu Show control characters as \\u escape sequences", " -end Do not append a missing newline to the end of the output", " -sp2 Output lines are double-spaced", " -sp3 Output lines are triple-spaced", "", "An input filename of \"-\" indicates standard input.", "An output filename of \"-\" indicates standard output.", "The default newline output mode is the native operating system mode (" NL_MODE ").", "Selecting only option '-inone' results in no character translation of the" " input", "files.", NULL }; //------------------------------------------------------------------------------ // Program::usage() // Display a usage message. // // See // Program::main() // Program::s_usageMsg[] //------------------------------------------------------------------------------ void Program::usage() { #if Program_VS/100 != 1 #error class Program has changed #endif // Display a program usage message for (int i = 0; s_usageMsg[i] != NULL; i++) ::printf("%s\n", s_usageMsg[i]); // Punt ::exit(RC_USAGE); } //------------------------------------------------------------------------------ // Program::main() // Execute the program. //------------------------------------------------------------------------------ int Program::main(int argc, const char *const *argv) { #if Program_VS != 105 #error class Program has changed #endif OutTextStream * out = NULL; int rc = RC_OKAY; int i; // Parse option args for (i = 1; i < argc and argv[i][0] == '-' and argv[i][1] != '\0'; i++) { if (::strcmp(argv[i], "--") == 0) { i++; break; } else if (::strcmp(argv[i], "-o") == 0) { if (++i >= argc) usage(); m_outName = argv[i]; } else if (::strcmp(argv[i], "-iw") == 0) { if (++i >= argc) usage(); m_inWidth = ::atoi(argv[i]); } else if (::strcmp(argv[i], "-ow") == 0) { if (++i >= argc) usage(); m_outWidth = ::atoi(argv[i]); } else if (::strcmp(argv[i], "-inone") == 0) { m_inEoln = TextStream::EOLN_NONE; m_endNL = false; } else if (::strcmp(argv[i], "-iany") == 0) m_inEoln = TextStream::EOLN_ANY; else if (::strcmp(argv[i], "-icr") == 0) m_inEoln = TextStream::EOLN_CR; else if (::strcmp(argv[i], "-ilf") == 0) m_inEoln = TextStream::EOLN_LF; else if (::strcmp(argv[i], "-icrlf") == 0) m_inEoln = TextStream::EOLN_CRLF; else if (::strcmp(argv[i], "-inel") == 0) m_inEoln = TextStream::EOLN_NEL; else if (::strcmp(argv[i], "-i8") == 0) m_inType = TextStream::FTYPE_8BIT; else if (::strcmp(argv[i], "-i7") == 0) m_inType = TextStream::FTYPE_7NONE; else if (::strcmp(argv[i], "-iutf8") == 0) m_inType = TextStream::FTYPE_UTF8; else if (::strcmp(argv[i], "-iutf16") == 0 or ::strcmp(argv[i], "-iutf16be") == 0) m_inType = TextStream::FTYPE_UTF16; else if (::strcmp(argv[i], "-iutf16r") == 0 or ::strcmp(argv[i], "-iutf16le") == 0) m_inType = TextStream::FTYPE_UTF16_R; else if (::strcmp(argv[i], "-iutf32") == 0 or ::strcmp(argv[i], "-iutf32be") == 0) m_inType = TextStream::FTYPE_UTF32; else if (::strcmp(argv[i], "-iutf32r") == 0 or ::strcmp(argv[i], "-iutf32le") == 0) m_inType = TextStream::FTYPE_UTF32_R; else if (::strcmp(argv[i], "-i24") == 0 or ::strcmp(argv[i], "-i24be") == 0) m_inType = TextStream::FTYPE_24BIT; else if (::strcmp(argv[i], "-i24r") == 0 or ::strcmp(argv[i], "-i24le") == 0) m_inType = TextStream::FTYPE_24BIT_R; else if (::strcmp(argv[i], "-nonl") == 0) m_ignNonNLs = true; else if (::strcmp(argv[i], "-nul") == 0) m_ignNulls = true; else if (::strcmp(argv[i], "-ctl") == 0) m_ignCtrls = true; else if (::strcmp(argv[i], "-c") == 0) m_showCtrls = true; else if (::strcmp(argv[i], "-ch") == 0) m_showHTML = m_showCtrls = true; else if (::strcmp(argv[i], "-cu") == 0) m_showUnic = m_showCtrls = true; else if (::strcmp(argv[i], "-t") == 0) { if (++i >= argc) usage(); m_inTabSize = ::atoi(argv[i]); } else if (::strcmp(argv[i], "-z") == 0) m_ctrlZ = true; else if (::strcmp(argv[i], "-bom") == 0) m_bom = true; else if (::strcmp(argv[i], "-end") == 0) m_endNL = false; else if (::strcmp(argv[i], "-sp2") == 0) m_nlSpacing = 2; else if (::strcmp(argv[i], "-sp3") == 0) m_nlSpacing = 3; else if (::strcmp(argv[i], "-onone") == 0) m_outEoln = TextStream::EOLN_NONE; else if (::strcmp(argv[i], "-ocr") == 0) m_outEoln = TextStream::EOLN_CR; else if (::strcmp(argv[i], "-olf") == 0) m_outEoln = TextStream::EOLN_LF; else if (::strcmp(argv[i], "-ocrlf") == 0) m_outEoln = TextStream::EOLN_CRLF; else if (::strcmp(argv[i], "-onel") == 0) m_outEoln = TextStream::EOLN_NEL; else if (::strcmp(argv[i], "-o8") == 0) m_outType = TextStream::FTYPE_8BIT; else if (::strcmp(argv[i], "-o7") == 0 or ::strcmp(argv[i], "-o7n") == 0) m_outType = TextStream::FTYPE_7NONE; else if (::strcmp(argv[i], "-o7m") == 0) m_outType = TextStream::FTYPE_7MARK; else if (::strcmp(argv[i], "-o7e") == 0) m_outType = TextStream::FTYPE_7EVEN; else if (::strcmp(argv[i], "-o7o") == 0) m_outType = TextStream::FTYPE_7ODD; #ifdef is_not_supported___ else if (::strcmp(argv[i], "-ot") == 0) { if (++i >= argc) usage(); m_outTabSize = ::atoi(argv[i]); } #endif else if (::strcmp(argv[i], "-outf8") == 0) m_outType = TextStream::FTYPE_UTF8; else if (::strcmp(argv[i], "-outf16") == 0) m_outType = TextStream::FTYPE_UTF16; else if (::strcmp(argv[i], "-outf16r") == 0) m_outType = TextStream::FTYPE_UTF16_R; else if (::strcmp(argv[i], "-outf32") == 0) m_outType = TextStream::FTYPE_UTF32; else if (::strcmp(argv[i], "-outf32r") == 0) m_outType = TextStream::FTYPE_UTF32_R; else if (::strcmp(argv[i], "-o24") == 0) m_outType = TextStream::FTYPE_24BIT; else if (::strcmp(argv[i], "-o24r") == 0) m_outType = TextStream::FTYPE_24BIT_R; else if (::strcmp(argv[i], "-opad") == 0) m_pad = true; else usage(); } // Check usage if (i >= argc) usage(); // Open the output stream out = new OutTextStream(); if (out == NULL) { ::fprintf(stderr, "Can't allocate an output stream\n"); ::fflush(stderr); return RC_WRITE; } if (::strcmp(m_outName, "-") == 0) { if (not out->open(stdout, m_outType, m_outEoln)) { ::fprintf(stderr, "Can't write to standard output; %s\n", strerror(errno)); ::fflush(stderr); return RC_WRITE; } } else { if (not out->open(m_outName, m_outType, m_outEoln)) { ::fprintf(stderr, "Can't write to: %s; %s\n", m_outName, strerror(errno)); ::fflush(stderr); return RC_WRITE; } } // Convert the named files for ( ; i < argc; i++) { InTextStream * in = NULL; // Open a text file stream m_inName = argv[i]; in = new InTextStream(); if (in == NULL) { ::fprintf(stderr, "Can't allocate an input stream: %s; %s\n", m_inName, strerror(errno)); ::fflush(stderr); rc = RC_READ; goto Done; } if (::strcmp(m_inName, "-") == 0) { if (not in->open(stdin, m_inType, m_inEoln)) { ::fprintf(stderr, "Can't read from standard input; %s\n", strerror(errno)); ::fflush(stderr); rc = RC_READ; goto Done; } } else { if (not in->open(m_inName, m_inType, m_inEoln)) { ::fprintf(stderr, "Can't read: %s; %s\n", m_inName, strerror(errno)); ::fflush(stderr); rc = RC_READ; goto Done; } } // Convert the input file convert(in, out); Done: out->flush(); if (in != NULL) { in->close(); delete in; in = NULL; } } // Done ::fflush(stdout); out->close(); return rc; } //------------------------------------------------------------------------------ // ::main() // Execute this program. // // See // Program::main() //------------------------------------------------------------------------------ int main(int argc, char **argv) { Program pgm; return pgm.main(argc, (const char *const *) argv); } // End crlf.cpp