From 2aea9e3b2d6d9a3ad1f5e3c76e7e0d99c0872122 Mon Sep 17 00:00:00 2001
From: Jack Lloyd
Date: Thu, 9 Nov 2017 12:19:28 -0500
Subject: Add UCS-2 and UCS-4 to UTF-8 conversion functions

Crosschecked by fuzzing and comparing with iconv

Needed in #1250
---
 src/lib/utils/charset.cpp | 81 +++++++++++++++++++++++++++++++++++++++++++++++
 src/lib/utils/charset.h   | 16 ++++++++++
 2 files changed, 97 insertions(+)

(limited to 'src/lib/utils')

diff --git a/src/lib/utils/charset.cpp b/src/lib/utils/charset.cpp
index 546e4e74d..dadee8f78 100644
--- a/src/lib/utils/charset.cpp
+++ b/src/lib/utils/charset.cpp
@@ -7,10 +7,91 @@
 
 #include <botan/charset.h>
 #include <botan/exceptn.h>
+#include <botan/loadstor.h>
 #include <cctype>
 
 namespace Botan {
 
+namespace {
+
+void append_utf8_for(std::string& s, uint32_t c)
+   {
+   if(c >= 0xD800 && c < 0xE000)
+      throw Decoding_Error("Invalid Unicode character");
+
+   if(c <= 0x7F)
+      {
+      const uint8_t b0 = static_cast<uint8_t>(c);
+      s.push_back(static_cast<char>(b0));
+      }
+   else if(c <= 0x7FF)
+      {
+      const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6);
+      const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F);
+      s.push_back(static_cast<char>(b0));
+      s.push_back(static_cast<char>(b1));
+      }
+   else if(c <= 0xFFFF)
+      {
+      const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12);
+      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
+      const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F);
+      s.push_back(static_cast<char>(b0));
+      s.push_back(static_cast<char>(b1));
+      s.push_back(static_cast<char>(b2));
+      }
+   else if(c <= 0x10FFFF)
+      {
+      const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18);
+      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F);
+      const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
+      const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F);
+      s.push_back(static_cast<char>(b0));
+      s.push_back(static_cast<char>(b1));
+      s.push_back(static_cast<char>(b2));
+      s.push_back(static_cast<char>(b3));
+      }
+   else
+      throw Decoding_Error("Invalid Unicode character");
+
+   }
+
+}
+
+std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len)
+   {
+   if(len % 2 != 0)
+      throw Decoding_Error("Invalid length for UCS-2 string");
+
+   const size_t chars = len / 2;
+
+   std::string s;
+   for(size_t i = 0; i != chars; ++i)
+      {
+      const uint16_t c = load_be<uint16_t>(ucs2, i);
+      append_utf8_for(s, c);
+      }
+
+   return s;
+   }
+
+std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len)
+   {
+   if(len % 4 != 0)
+      throw Decoding_Error("Invalid length for UCS-4 string");
+
+   const size_t chars = len / 4;
+
+   std::string s;
+   for(size_t i = 0; i != chars; ++i)
+      {
+      const uint32_t c = load_be<uint32_t>(ucs4, i);
+      append_utf8_for(s, c);
+      }
+
+   return s;
+   }
+
 namespace Charset {
 
 namespace {
diff --git a/src/lib/utils/charset.h b/src/lib/utils/charset.h
index 528ab908b..3f2ff9912 100644
--- a/src/lib/utils/charset.h
+++ b/src/lib/utils/charset.h
@@ -23,6 +23,22 @@ enum Character_Set {
    LATIN1_CHARSET
 };
 
+/**
+* Convert a sequence of UCS-2 (big endian) characters to a UTF-8 string
+* This is used for ASN.1 BMPString type
+* @param ucs2 the sequence of UCS-2 characters
+* @param len length of ucs2 in bytes, must be a multiple of 2
+*/
+std::string BOTAN_UNSTABLE_API ucs2_to_utf8(const uint8_t ucs2[], size_t len);
+
+/**
+* Convert a sequence of UCS-4 (big endian) characters to a UTF-8 string
+* This is used for ASN.1 UniversalString type
+* @param ucs4 the sequence of UCS-4 characters
+* @param len length of ucs4 in bytes, must be a multiple of 4
+*/
+std::string BOTAN_UNSTABLE_API ucs4_to_utf8(const uint8_t ucs4[], size_t len);
+
 namespace Charset {
 
 /*
-- 
cgit v1.2.3