diff options
author | Jack Lloyd <[email protected]> | 2017-11-09 12:19:28 -0500 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2017-11-09 12:19:28 -0500 |
commit | 2aea9e3b2d6d9a3ad1f5e3c76e7e0d99c0872122 (patch) | |
tree | cf9d6e0763def177e4452119e6603dfc2ff49b9b /src/lib/utils | |
parent | 7e3cfd7eb4cc9165c9c53c81c3613c23db433cd7 (diff) |
Add UCS-2 and UCS-4 to UTF-8 conversion functions
Crosschecked by fuzzing and comparing with iconv
Needed in #1250
Diffstat (limited to 'src/lib/utils')
-rw-r--r-- | src/lib/utils/charset.cpp | 81 | ||||
-rw-r--r-- | src/lib/utils/charset.h | 16 |
2 files changed, 97 insertions, 0 deletions
diff --git a/src/lib/utils/charset.cpp b/src/lib/utils/charset.cpp
index 546e4e74d..dadee8f78 100644
--- a/src/lib/utils/charset.cpp
+++ b/src/lib/utils/charset.cpp
@@ -7,10 +7,91 @@
 
 #include <botan/charset.h>
 #include <botan/exceptn.h>
+#include <botan/loadstor.h>
 #include <cctype>
 
 namespace Botan {
 
+namespace {
+
+void append_utf8_for(std::string& s, uint32_t c)
+   {
+   if(c >= 0xD800 && c < 0xE000)
+      throw Decoding_Error("Invalid Unicode character");
+
+   if(c <= 0x7F)
+      {
+      const uint8_t b0 = static_cast<uint8_t>(c);
+      s.push_back(static_cast<char>(b0));
+      }
+   else if(c <= 0x7FF)
+      {
+      const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6);
+      const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F);
+      s.push_back(static_cast<char>(b0));
+      s.push_back(static_cast<char>(b1));
+      }
+   else if(c <= 0xFFFF)
+      {
+      const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12);
+      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
+      const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F);
+      s.push_back(static_cast<char>(b0));
+      s.push_back(static_cast<char>(b1));
+      s.push_back(static_cast<char>(b2));
+      }
+   else if(c <= 0x10FFFF)
+      {
+      const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18);
+      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F);
+      const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
+      const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F);
+      s.push_back(static_cast<char>(b0));
+      s.push_back(static_cast<char>(b1));
+      s.push_back(static_cast<char>(b2));
+      s.push_back(static_cast<char>(b3));
+      }
+   else
+      throw Decoding_Error("Invalid Unicode character");
+
+   }
+
+}
+
+std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len)
+   {
+   if(len % 2 != 0)
+      throw Decoding_Error("Invalid length for UCS-2 string");
+
+   const size_t chars = len / 2;
+
+   std::string s;
+   for(size_t i = 0; i != chars; ++i)
+      {
+      const uint16_t c = load_be<uint16_t>(ucs2, i);
+      append_utf8_for(s, c);
+      }
+
+   return s;
+   }
+
+std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len)
+   {
+   if(len % 4 != 0)
+      throw Decoding_Error("Invalid length for UCS-4 string");
+
+   const size_t chars = len / 4;
+
+   std::string s;
+   for(size_t i = 0; i != chars; ++i)
+      {
+      const uint32_t c = load_be<uint32_t>(ucs4, i);
+      append_utf8_for(s, c);
+      }
+
+   return s;
+   }
+
 namespace Charset {
 
 namespace {
diff --git a/src/lib/utils/charset.h b/src/lib/utils/charset.h
index 528ab908b..3f2ff9912 100644
--- a/src/lib/utils/charset.h
+++ b/src/lib/utils/charset.h
@@ -23,6 +23,22 @@ enum Character_Set {
    LATIN1_CHARSET
 };
 
+/**
+* Convert a sequence of UCS-2 (big endian) characters to a UTF-8 string
+* This is used for ASN.1 BMPString type
+* @param ucs2 the sequence of UCS-2 characters
+* @param len length of ucs2 in bytes, must be a multiple of 2
+*/
+std::string BOTAN_UNSTABLE_API ucs2_to_utf8(const uint8_t ucs2[], size_t len);
+
+/**
+* Convert a sequence of UCS-4 (big endian) characters to a UTF-8 string
+* This is used for ASN.1 UniversalString type
+* @param ucs4 the sequence of UCS-4 characters
+* @param len length of ucs4 in bytes, must be a multiple of 4
+*/
+std::string BOTAN_UNSTABLE_API ucs4_to_utf8(const uint8_t ucs4[], size_t len);
+
 namespace Charset {
 
 /*