aboutsummaryrefslogtreecommitdiffstats
path: root/src/lib/utils
diff options
context:
space:
mode:
author Jack Lloyd <[email protected]> 2017-11-09 12:19:28 -0500
committer Jack Lloyd <[email protected]> 2017-11-09 12:19:28 -0500
commit 2aea9e3b2d6d9a3ad1f5e3c76e7e0d99c0872122 (patch)
tree cf9d6e0763def177e4452119e6603dfc2ff49b9b /src/lib/utils
parent 7e3cfd7eb4cc9165c9c53c81c3613c23db433cd7 (diff)
Add UCS-2 and UCS-4 to UTF-8 conversion functions
Crosschecked by fuzzing and comparing with iconv. Needed in #1250.
Diffstat (limited to 'src/lib/utils')
-rw-r--r-- src/lib/utils/charset.cpp 81
-rw-r--r-- src/lib/utils/charset.h 16
2 files changed, 97 insertions, 0 deletions
diff --git a/src/lib/utils/charset.cpp b/src/lib/utils/charset.cpp
index 546e4e74d..dadee8f78 100644
--- a/src/lib/utils/charset.cpp
+++ b/src/lib/utils/charset.cpp
@@ -7,10 +7,91 @@
#include <botan/charset.h>
#include <botan/exceptn.h>
+#include <botan/loadstor.h>
#include <cctype>
namespace Botan {
+namespace {
+
+/*
+* Append the UTF-8 encoding of the code point c to the string s
+*
+* Throws Decoding_Error if c is a surrogate (0xD800 through 0xDFFF)
+* or is larger than 0x10FFFF; nothing is appended in that case.
+*/
+void append_utf8_for(std::string& s, uint32_t c)
+   {
+   const bool is_surrogate = (c >= 0xD800 && c <= 0xDFFF);
+
+   if(is_surrogate || c > 0x10FFFF)
+      throw Decoding_Error("Invalid Unicode character");
+
+   // Select the marker bits of the leading byte and the number of
+   // continuation bytes; the defaults cover the 4 byte encoding
+   uint8_t lead_marker = 0xF0;
+   size_t trailing = 3;
+
+   if(c < 0x80)
+      {
+      lead_marker = 0x00;
+      trailing = 0;
+      }
+   else if(c < 0x800)
+      {
+      lead_marker = 0xC0;
+      trailing = 1;
+      }
+   else if(c < 0x10000)
+      {
+      lead_marker = 0xE0;
+      trailing = 2;
+      }
+
+   // The leading byte holds whatever bits sit above the 6 bit groups
+   const uint8_t lead = static_cast<uint8_t>(lead_marker | (c >> (6 * trailing)));
+   s.push_back(static_cast<char>(lead));
+
+   // Each continuation byte holds 6 bits, most significant group first
+   for(size_t remaining = trailing; remaining > 0; --remaining)
+      {
+      const uint32_t group = (c >> (6 * (remaining - 1))) & 0x3F;
+      s.push_back(static_cast<char>(static_cast<uint8_t>(0x80 | group)));
+      }
+   }
+
+}
+
+/*
+* Convert big endian UCS-2 input of len bytes to a UTF-8 string
+*
+* Throws Decoding_Error if len is odd, or if append_utf8_for
+* rejects one of the characters.
+*/
+std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len)
+   {
+   if((len & 1) != 0)
+      throw Decoding_Error("Invalid length for UCS-2 string");
+
+   std::string utf8;
+
+   // Every character occupies exactly two input bytes
+   const size_t count = len >> 1;
+   size_t pos = 0;
+
+   while(pos < count)
+      {
+      const uint16_t code_unit = load_be<uint16_t>(ucs2, pos);
+      append_utf8_for(utf8, code_unit);
+      ++pos;
+      }
+
+   return utf8;
+   }
+
+/*
+* Convert big endian UCS-4 input of len bytes to a UTF-8 string
+*
+* Throws Decoding_Error if len is not a multiple of 4, or if
+* append_utf8_for rejects one of the characters.
+*/
+std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len)
+   {
+   if((len & 3) != 0)
+      throw Decoding_Error("Invalid length for UCS-4 string");
+
+   std::string utf8;
+
+   // Every character occupies exactly four input bytes
+   const size_t count = len >> 2;
+   size_t pos = 0;
+
+   while(pos < count)
+      {
+      const uint32_t code_point = load_be<uint32_t>(ucs4, pos);
+      append_utf8_for(utf8, code_point);
+      ++pos;
+      }
+
+   return utf8;
+   }
+
namespace Charset {
namespace {
diff --git a/src/lib/utils/charset.h b/src/lib/utils/charset.h
index 528ab908b..3f2ff9912 100644
--- a/src/lib/utils/charset.h
+++ b/src/lib/utils/charset.h
@@ -23,6 +23,22 @@ enum Character_Set {
LATIN1_CHARSET
};
+/**
+* Convert a sequence of UCS-2 (big endian) characters to a UTF-8 string
+* This is used for ASN.1 BMPString type
+* @param ucs2 the sequence of UCS-2 characters
+* @param len length of ucs2 in bytes, must be a multiple of 2
+* @return the UTF-8 encoding of the input characters, in input order
+* @throws Decoding_Error if len is not a multiple of 2, or if any
+*         character is a surrogate (0xD800 through 0xDFFF)
+*/
+std::string BOTAN_UNSTABLE_API ucs2_to_utf8(const uint8_t ucs2[], size_t len);
+
+/**
+* Convert a sequence of UCS-4 (big endian) characters to a UTF-8 string
+* This is used for ASN.1 UniversalString type
+* @param ucs4 the sequence of UCS-4 characters
+* @param len length of ucs4 in bytes, must be a multiple of 4
+* @return the UTF-8 encoding of the input characters, in input order
+* @throws Decoding_Error if len is not a multiple of 4, or if any
+*         character is a surrogate (0xD800 through 0xDFFF) or is
+*         larger than 0x10FFFF
+*/
+std::string BOTAN_UNSTABLE_API ucs4_to_utf8(const uint8_t ucs4[], size_t len);
+
namespace Charset {
/*