/* * Character Set Handling * (C) 1999-2007 Jack Lloyd * * Botan is released under the Simplified BSD License (see license.txt) */ #include #include #include #include namespace Botan { namespace { void append_utf8_for(std::string& s, uint32_t c) { if(c >= 0xD800 && c < 0xE000) throw Decoding_Error("Invalid Unicode character"); if(c <= 0x7F) { const uint8_t b0 = static_cast(c); s.push_back(static_cast(b0)); } else if(c <= 0x7FF) { const uint8_t b0 = 0xC0 | static_cast(c >> 6); const uint8_t b1 = 0x80 | static_cast(c & 0x3F); s.push_back(static_cast(b0)); s.push_back(static_cast(b1)); } else if(c <= 0xFFFF) { const uint8_t b0 = 0xE0 | static_cast(c >> 12); const uint8_t b1 = 0x80 | static_cast((c >> 6) & 0x3F); const uint8_t b2 = 0x80 | static_cast(c & 0x3F); s.push_back(static_cast(b0)); s.push_back(static_cast(b1)); s.push_back(static_cast(b2)); } else if(c <= 0x10FFFF) { const uint8_t b0 = 0xF0 | static_cast(c >> 18); const uint8_t b1 = 0x80 | static_cast((c >> 12) & 0x3F); const uint8_t b2 = 0x80 | static_cast((c >> 6) & 0x3F); const uint8_t b3 = 0x80 | static_cast(c & 0x3F); s.push_back(static_cast(b0)); s.push_back(static_cast(b1)); s.push_back(static_cast(b2)); s.push_back(static_cast(b3)); } else throw Decoding_Error("Invalid Unicode character"); } } std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len) { if(len % 2 != 0) throw Decoding_Error("Invalid length for UCS-2 string"); const size_t chars = len / 2; std::string s; for(size_t i = 0; i != chars; ++i) { const uint16_t c = load_be(ucs2, i); append_utf8_for(s, c); } return s; } std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len) { if(len % 4 != 0) throw Decoding_Error("Invalid length for UCS-4 string"); const size_t chars = len / 4; std::string s; for(size_t i = 0; i != chars; ++i) { const uint32_t c = load_be(ucs4, i); append_utf8_for(s, c); } return s; } /* * Convert from UTF-8 to ISO 8859-1 */ std::string utf8_to_latin1(const std::string& utf8) { std::string iso8859; size_t position = 0; while(position != utf8.size()) { const uint8_t c1 = static_cast(utf8[position++]); if(c1 <= 0x7F) { iso8859 += static_cast(c1); } else if(c1 >= 0xC0 && c1 <= 0xC7) { if(position == utf8.size()) throw Decoding_Error("UTF-8: sequence truncated"); const uint8_t c2 = static_cast(utf8[position++]); const uint8_t iso_char = ((c1 & 0x07) << 6) | (c2 & 0x3F); if(iso_char <= 0x7F) throw Decoding_Error("UTF-8: sequence longer than needed"); iso8859 += static_cast(iso_char); } else throw Decoding_Error("UTF-8: Unicode chars not in Latin1 used"); } return iso8859; } /* * Convert from UCS-2 to ISO 8859-1 */ std::string ucs2_to_latin1(const std::string& ucs2) { if(ucs2.size() % 2 == 1) throw Decoding_Error("UCS-2 string has an odd number of bytes"); std::string latin1; for(size_t i = 0; i != ucs2.size(); i += 2) { const uint8_t c1 = ucs2[i]; const uint8_t c2 = ucs2[i+1]; if(c1 != 0) throw Decoding_Error("UCS-2 has non-Latin1 characters"); latin1 += static_cast(c2); } return latin1; } /* * Convert from ISO 8859-1 to UTF-8 */ std::string latin1_to_utf8(const std::string& iso8859) { std::string utf8; for(size_t i = 0; i != iso8859.size(); ++i) { const uint8_t c = static_cast(iso8859[i]); if(c <= 0x7F) utf8 += static_cast(c); else { utf8 += static_cast((0xC0 | (c >> 6))); utf8 += static_cast((0x80 | (c & 0x3F))); } } return utf8; } namespace Charset { /* * Check if a character represents a digit */ bool is_digit(char c) { if(c == '0' || c == '1' || c == '2' || c == '3' || c == '4' || c == '5' || c == '6' || c == '7' || c == '8' || c == '9') return true; return false; } /* * Check if a character represents whitespace */ bool is_space(char c) { if(c == ' ' || c == '\t' || c == '\n' || c == '\r') return true; return false; } /* * Convert a character to a digit */ uint8_t char2digit(char c) { switch(c) { case '0': return 0; case '1': return 1; case '2': return 2; case '3': return 3; case '4': return 4; case '5': return 5; case '6': return 6; case '7': return 7; case '8': return 8; case '9': return 9; } throw Invalid_Argument("char2digit: Input is not a digit character"); } /* * Convert a digit to a character */ char digit2char(uint8_t b) { switch(b) { case 0: return '0'; case 1: return '1'; case 2: return '2'; case 3: return '3'; case 4: return '4'; case 5: return '5'; case 6: return '6'; case 7: return '7'; case 8: return '8'; case 9: return '9'; } throw Invalid_Argument("digit2char: Input is not a digit"); } /* * Case-insensitive character comparison */ bool caseless_cmp(char a, char b) { return (std::tolower(static_cast(a)) == std::tolower(static_cast(b))); } } }