//============================================================================= // drt/sys/schar4.cpp // Unicode character classes. // // These classes provide a representation of Unicode characters. // // History // 0.01, 1999-08-18, David R Tribble. // First cut. // // Copyright ©1999, by David R. Tribble, all rights reserved. // See "drt/sys/copyr.txt" for more information. //----------------------------------------------------------------------------- // Identification static const char id[] = "@(#)drt/sys/schar4.cpp 0.01"; // System includes #include #define drt_std_assert_h 1 #include #define drt_std_ctype_h 1 // Special includes #include "sdefs.hpp" // Local includes #include "sdebug.hpp" #include "schar.hpp" // Local wrappers #include "slib1.hpp" drt_namespace_begin //----------------------------------------------------------------------------- // Class member functions //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- // DrtChar::toUTF_8() // Write this character into string 's' as one or more octets. // // Notes // The binary representation of the character's integer value is simply // spread across the octets and the number of high bits set in the leading // byte announces the number of bytes in the multibyte sequence: // // Bytes | Bits | Representation // ------+------+---------------- // 1 | 7 | 0vvvvvvv // 2 | 11 | 110vvvvv 10vvvvvv // 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv // 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv // 5 | 26 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv // 6 | 31 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv // // Up to six octets are shown for full 31-bit Unicode character codes, but // since we're only dealing with 16-bit codes, 's' will be filled with no // more than three octets. // // Returns // The number of octets written into string 's', which will be in the // range [1,3], or -1 on error. // // Caveats // Since this Unicode character contains only 16 bits, there will never be // more than three octets placed into string 's'. //----------------------------------------------------------------------------- int DrtChar::toUTF_8(char *s) const { #if DrtChar_VS/100 != 1 #error DrtChar_VS has changed #endif //DrtTrace dbg(s_grp, "toUTF_8", this); // Validate this object validate(); // Check args if (s == null) return (-1); // Decompose this character into one or more octets if (m_ch < 0x0080) { s[0] = m_ch; return (1); } else if (m_ch < 0x0800) { s[0] = 0xC0 | (m_ch >> 6); s[1] = 0x80 | (m_ch & 0x3F); return (2); } else // (m_ch < 0x10000) { s[0] = 0xE0 | (m_ch >> 12); s[1] = 0x80 | (m_ch >> 6 & 0x3F); s[2] = 0x80 | (m_ch & 0x3F); return (3); } } //----------------------------------------------------------------------------- // DrtChar::fromUTF_8() // Read one or more octets from string 's' to compose this Unicode // character code. // // Notes // (See DrtChar::toUTF_8().) // // Returns // The number of octets read from string 's', which will be in the range // [1,3], or -1 on error. // // Caveats // No attempt is made to handle UTF-8 encodings of Unicode characters with // more than 16 bits, i.e., more than three octets. //----------------------------------------------------------------------------- int DrtChar::fromUTF_8(const char *s) { #if DrtChar_VS/100 != 1 #error DrtChar_VS has changed #endif //DrtTrace dbg(s_grp, "fromUTF_8", this); // Validate this object validate(); // Check args if (s == null) return (-1); // Compose this character from one or more octets const unsigned char * c = reinterpret_cast(const unsigned char *, s); if (c[0] < 0x80) { m_ch = c[0]; return (1); } else if (c[0] < 0xE0) { m_ch = (c[0] & 0x1F) << 5; m_ch |= (c[1] & 0x3F); return (2); } else // if (c[0] < 0xF0) { m_ch = (c[0] & 0x0F) << 12; m_ch |= (c[1] & 0x3F) << 6; m_ch |= (c[2] & 0x3F); return (3); } } drt_namespace_end // End schar4.cpp