diff options
author | Jack Lloyd <[email protected]> | 2017-11-09 12:19:28 -0500 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2017-11-09 12:19:28 -0500 |
commit | 2aea9e3b2d6d9a3ad1f5e3c76e7e0d99c0872122 (patch) | |
tree | cf9d6e0763def177e4452119e6603dfc2ff49b9b | |
parent | 7e3cfd7eb4cc9165c9c53c81c3613c23db433cd7 (diff) |
Add UCS-2 and UCS-4 to UTF-8 conversion functions.
Cross-checked by fuzzing and by comparing the output with iconv.
Needed for #1250.
-rw-r--r-- | src/lib/utils/charset.cpp | 81 |
-rw-r--r-- | src/lib/utils/charset.h | 16 |
-rw-r--r-- | src/tests/data/charset.vec | 20 |
-rw-r--r-- | src/tests/test_utils.cpp | 19 |
4 files changed, 131 insertions, 5 deletions
diff --git a/src/lib/utils/charset.cpp b/src/lib/utils/charset.cpp index 546e4e74d..dadee8f78 100644 --- a/src/lib/utils/charset.cpp +++ b/src/lib/utils/charset.cpp @@ -7,10 +7,91 @@ #include <botan/charset.h> #include <botan/exceptn.h> +#include <botan/loadstor.h> #include <cctype> namespace Botan { +namespace { + +void append_utf8_for(std::string& s, uint32_t c) + { + if(c >= 0xD800 && c < 0xE000) + throw Decoding_Error("Invalid Unicode character"); + + if(c <= 0x7F) + { + const uint8_t b0 = static_cast<uint8_t>(c); + s.push_back(static_cast<char>(b0)); + } + else if(c <= 0x7FF) + { + const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6); + const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F); + s.push_back(static_cast<char>(b0)); + s.push_back(static_cast<char>(b1)); + } + else if(c <= 0xFFFF) + { + const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12); + const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F); + const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F); + s.push_back(static_cast<char>(b0)); + s.push_back(static_cast<char>(b1)); + s.push_back(static_cast<char>(b2)); + } + else if(c <= 0x10FFFF) + { + const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18); + const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F); + const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F); + const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F); + s.push_back(static_cast<char>(b0)); + s.push_back(static_cast<char>(b1)); + s.push_back(static_cast<char>(b2)); + s.push_back(static_cast<char>(b3)); + } + else + throw Decoding_Error("Invalid Unicode character"); + + } + +} + +std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len) + { + if(len % 2 != 0) + throw Decoding_Error("Invalid length for UCS-2 string"); + + const size_t chars = len / 2; + + std::string s; + for(size_t i = 0; i != chars; ++i) + { + const uint16_t c = load_be<uint16_t>(ucs2, i); + append_utf8_for(s, c); + } + + return s; + } + +std::string ucs4_to_utf8(const 
uint8_t ucs4[], size_t len) + { + if(len % 4 != 0) + throw Decoding_Error("Invalid length for UCS-4 string"); + + const size_t chars = len / 4; + + std::string s; + for(size_t i = 0; i != chars; ++i) + { + const uint32_t c = load_be<uint32_t>(ucs4, i); + append_utf8_for(s, c); + } + + return s; + } + namespace Charset { namespace { diff --git a/src/lib/utils/charset.h b/src/lib/utils/charset.h index 528ab908b..3f2ff9912 100644 --- a/src/lib/utils/charset.h +++ b/src/lib/utils/charset.h @@ -23,6 +23,22 @@ enum Character_Set { LATIN1_CHARSET }; +/** +* Convert a sequence of UCS-2 (big endian) characters to a UTF-8 string +* This is used for ASN.1 BMPString type +* @param ucs2 the sequence of UCS-2 characters +* @param len length of ucs2 in bytes, must be a multiple of 2 +*/ +std::string BOTAN_UNSTABLE_API ucs2_to_utf8(const uint8_t ucs2[], size_t len); + +/** +* Convert a sequence of UCS-4 (big endian) characters to a UTF-8 string +* This is used for ASN.1 UniversalString type +* @param ucs4 the sequence of UCS-4 characters +* @param len length of ucs4 in bytes, must be a multiple of 4 +*/ +std::string BOTAN_UNSTABLE_API ucs4_to_utf8(const uint8_t ucs4[], size_t len); + namespace Charset { /* diff --git a/src/tests/data/charset.vec b/src/tests/data/charset.vec index dd64ac6e3..6f12be8c2 100644 --- a/src/tests/data/charset.vec +++ b/src/tests/data/charset.vec @@ -1,3 +1,21 @@ +[UCS2-UTF8] +In = 0042006F00740061006E +Out = 426F74616E + +# Nonsense, converted with iconv +In = 03B404960556096710751827FFF0 +Out = CEB4D296D596E0A5A7E181B5E1A0A7EFBFB0 + +In = B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B246B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B204 +Out = EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8986EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8884 + +In = 0A0000000000000000000000000030000000000000001D1D1D1D01000000000000001D1D1D1D00000000000000000000 +Out = 
E0A880000000000000E38080000000E1B49DE1B49DC480000000E1B49DE1B49D0000000000 + +[UCS4-UTF8] +In = 0000004800000065000000690000007A000000F60000006C00000072000000FC000000630000006B00000073000000740000006F000000DF000000610000006200000064000000E40000006D0000007000000066000000750000006E00000067 +Out = 4865697AC3B66C72C3BC636B73746FC39F616264C3A46D7066756E67 + [UTF16-LATIN1] # Botan @@ -38,4 +56,4 @@ Out = 4865697AC3B66C72C3BC636B73746FC39F616264C3A46D7066756E67 # ÿ@Ðé¿ã!ð In = FF40D0E9BFE321F0 -Out = C3BF40C390C3A9C2BFC3A321C3B0
\ No newline at end of file +Out = C3BF40C390C3A9C2BFC3A321C3B0 diff --git a/src/tests/test_utils.cpp b/src/tests/test_utils.cpp index 57cd3208c..da2d25d5e 100644 --- a/src/tests/test_utils.cpp +++ b/src/tests/test_utils.cpp @@ -409,22 +409,33 @@ class Charset_Tests final : public Text_Based_Test const std::vector<uint8_t> in = get_req_bin(vars, "In"); const std::vector<uint8_t> expected = get_req_bin(vars, "Out"); + const std::string in_str(in.begin(), in.end()); + std::string converted; - if(type == "UTF16-LATIN1") + + if(type == "UCS2-UTF8") + { + converted = Botan::ucs2_to_utf8(in.data(), in.size()); + } + else if(type == "UCS4-UTF8") + { + converted = Botan::ucs4_to_utf8(in.data(), in.size()); + } + else if(type == "UTF16-LATIN1") { - converted = Botan::Charset::transcode(std::string(in.begin(), in.end()), + converted = Botan::Charset::transcode(in_str, Botan::Character_Set::LATIN1_CHARSET, Botan::Character_Set::UCS2_CHARSET); } else if(type == "UTF8-LATIN1") { - converted = Botan::Charset::transcode(std::string(in.begin(), in.end()), + converted = Botan::Charset::transcode(in_str, Botan::Character_Set::LATIN1_CHARSET, Botan::Character_Set::UTF8_CHARSET); } else if(type == "LATIN1-UTF8") { - converted = Botan::Charset::transcode(std::string(in.begin(), in.end()), + converted = Botan::Charset::transcode(in_str, Botan::Character_Set::UTF8_CHARSET, Botan::Character_Set::LATIN1_CHARSET); } |