aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJack Lloyd <[email protected]>2017-11-09 12:19:28 -0500
committerJack Lloyd <[email protected]>2017-11-09 12:19:28 -0500
commit2aea9e3b2d6d9a3ad1f5e3c76e7e0d99c0872122 (patch)
treecf9d6e0763def177e4452119e6603dfc2ff49b9b
parent7e3cfd7eb4cc9165c9c53c81c3613c23db433cd7 (diff)
Add UCS-2 and UCS-4 to UTF-8 conversion functions
Crosschecked by fuzzing and comparing with iconv. Needed in #1250.
-rw-r--r--src/lib/utils/charset.cpp81
-rw-r--r--src/lib/utils/charset.h16
-rw-r--r--src/tests/data/charset.vec20
-rw-r--r--src/tests/test_utils.cpp19
4 files changed, 131 insertions, 5 deletions
diff --git a/src/lib/utils/charset.cpp b/src/lib/utils/charset.cpp
index 546e4e74d..dadee8f78 100644
--- a/src/lib/utils/charset.cpp
+++ b/src/lib/utils/charset.cpp
@@ -7,10 +7,91 @@
#include <botan/charset.h>
#include <botan/exceptn.h>
+#include <botan/loadstor.h>
#include <cctype>
namespace Botan {
+namespace {
+/**
+* Append the UTF-8 encoding of a single code point to a string.
+*
+* The number of bytes appended depends on the value of c:
+* 1 byte for c <= 0x7F, 2 bytes for c <= 0x7FF, 3 bytes for c <= 0xFFFF
+* and 4 bytes for c <= 0x10FFFF.
+*
+* @param s the string the encoded bytes are appended to
+* @param c the code point to encode
+* @throws Decoding_Error if c is in the range 0xD800..0xDFFF or is
+*         greater than 0x10FFFF
+*/
+void append_utf8_for(std::string& s, uint32_t c)
+ {
+ if(c >= 0xD800 && c < 0xE000) // UTF-16 surrogate range; rejected before any encoding
+ throw Decoding_Error("Invalid Unicode character");
+ // Single byte: the value is emitted unchanged
+ if(c <= 0x7F)
+ {
+ const uint8_t b0 = static_cast<uint8_t>(c);
+ s.push_back(static_cast<char>(b0));
+ }
+ else if(c <= 0x7FF) // two bytes: 0xC0 lead byte plus one 0x80 continuation byte
+ {
+ const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6);
+ const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F); // low 6 bits
+ s.push_back(static_cast<char>(b0));
+ s.push_back(static_cast<char>(b1));
+ }
+ else if(c <= 0xFFFF) // three bytes: 0xE0 lead byte plus two continuation bytes
+ {
+ const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12);
+ const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
+ const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F);
+ s.push_back(static_cast<char>(b0));
+ s.push_back(static_cast<char>(b1));
+ s.push_back(static_cast<char>(b2));
+ }
+ else if(c <= 0x10FFFF) // four bytes: 0xF0 lead byte plus three continuation bytes
+ {
+ const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18);
+ const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F);
+ const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
+ const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F);
+ s.push_back(static_cast<char>(b0));
+ s.push_back(static_cast<char>(b1));
+ s.push_back(static_cast<char>(b2));
+ s.push_back(static_cast<char>(b3));
+ }
+ else // anything above 0x10FFFF cannot be encoded
+ throw Decoding_Error("Invalid Unicode character");
+
+ }
+
+}
+/**
+* Convert a buffer of 2-byte UCS-2 characters to a UTF-8 string.
+*
+* @param ucs2 the input bytes
+* @param len length of ucs2 in bytes
+* @return the UTF-8 encoding of all len/2 characters, in input order
+* @throws Decoding_Error if len is odd, or if a character is rejected
+*         by append_utf8_for (values in 0xD800..0xDFFF)
+*/
+std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len)
+ {
+ if(len % 2 != 0) // a trailing partial character is an error, not truncated
+ throw Decoding_Error("Invalid length for UCS-2 string");
+
+ const size_t chars = len / 2; // number of 2-byte characters
+
+ std::string s;
+ for(size_t i = 0; i != chars; ++i)
+ {
+ // NOTE(review): load_be is presumably indexed in 16-bit units and reads big-endian; confirm in loadstor.h
+ const uint16_t c = load_be<uint16_t>(ucs2, i);
+ append_utf8_for(s, c);
+ }
+
+ return s;
+ }
+/**
+* Convert a buffer of 4-byte UCS-4 characters to a UTF-8 string.
+*
+* @param ucs4 the input bytes
+* @param len length of ucs4 in bytes
+* @return the UTF-8 encoding of all len/4 characters, in input order
+* @throws Decoding_Error if len is not a multiple of 4, or if a character
+*         is rejected by append_utf8_for (values in 0xD800..0xDFFF or
+*         greater than 0x10FFFF)
+*/
+std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len)
+ {
+ if(len % 4 != 0) // a trailing partial character is an error, not truncated
+ throw Decoding_Error("Invalid length for UCS-4 string");
+
+ const size_t chars = len / 4; // number of 4-byte characters
+
+ std::string s;
+ for(size_t i = 0; i != chars; ++i)
+ {
+ // NOTE(review): load_be is presumably indexed in 32-bit units and reads big-endian; confirm in loadstor.h
+ const uint32_t c = load_be<uint32_t>(ucs4, i);
+ append_utf8_for(s, c);
+ }
+
+ return s;
+ }
+
namespace Charset {
namespace {
diff --git a/src/lib/utils/charset.h b/src/lib/utils/charset.h
index 528ab908b..3f2ff9912 100644
--- a/src/lib/utils/charset.h
+++ b/src/lib/utils/charset.h
@@ -23,6 +23,22 @@ enum Character_Set {
LATIN1_CHARSET
};
+/**
+* Convert a sequence of UCS-2 (big endian) characters to a UTF-8 string
+* This is used for ASN.1 BMPString type
+* @param ucs2 the sequence of UCS-2 characters
+* @param len length of ucs2 in bytes, must be a multiple of 2
+* @return the UTF-8 encoding of the input characters
+* @throws Decoding_Error if len is not a multiple of 2 or if the input
+*         contains a value in the range 0xD800..0xDFFF
+*/
+std::string BOTAN_UNSTABLE_API ucs2_to_utf8(const uint8_t ucs2[], size_t len);
+
+/**
+* Convert a sequence of UCS-4 (big endian) characters to a UTF-8 string
+* This is used for ASN.1 UniversalString type
+* @param ucs4 the sequence of UCS-4 characters
+* @param len length of ucs4 in bytes, must be a multiple of 4
+* @return the UTF-8 encoding of the input characters
+* @throws Decoding_Error if len is not a multiple of 4 or if the input
+*         contains a value in the range 0xD800..0xDFFF or above 0x10FFFF
+*/
+std::string BOTAN_UNSTABLE_API ucs4_to_utf8(const uint8_t ucs4[], size_t len);
+
namespace Charset {
/*
diff --git a/src/tests/data/charset.vec b/src/tests/data/charset.vec
index dd64ac6e3..6f12be8c2 100644
--- a/src/tests/data/charset.vec
+++ b/src/tests/data/charset.vec
@@ -1,3 +1,21 @@
+[UCS2-UTF8]
+In = 0042006F00740061006E
+Out = 426F74616E
+
+# Nonsense, converted with iconv
+In = 03B404960556096710751827FFF0
+Out = CEB4D296D596E0A5A7E181B5E1A0A7EFBFB0
+
+In = B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B246B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B204
+Out = EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8986EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8884
+
+In = 0A0000000000000000000000000030000000000000001D1D1D1D01000000000000001D1D1D1D00000000000000000000
+Out = E0A880000000000000E38080000000E1B49DE1B49DC480000000E1B49DE1B49D0000000000
+
+[UCS4-UTF8]
+In = 0000004800000065000000690000007A000000F60000006C00000072000000FC000000630000006B00000073000000740000006F000000DF000000610000006200000064000000E40000006D0000007000000066000000750000006E00000067
+Out = 4865697AC3B66C72C3BC636B73746FC39F616264C3A46D7066756E67
+
[UTF16-LATIN1]
# Botan
@@ -38,4 +56,4 @@ Out = 4865697AC3B66C72C3BC636B73746FC39F616264C3A46D7066756E67
# ÿ@Ðé¿ã!ð
In = FF40D0E9BFE321F0
-Out = C3BF40C390C3A9C2BFC3A321C3B0 \ No newline at end of file
+Out = C3BF40C390C3A9C2BFC3A321C3B0
diff --git a/src/tests/test_utils.cpp b/src/tests/test_utils.cpp
index 57cd3208c..da2d25d5e 100644
--- a/src/tests/test_utils.cpp
+++ b/src/tests/test_utils.cpp
@@ -409,22 +409,33 @@ class Charset_Tests final : public Text_Based_Test
const std::vector<uint8_t> in = get_req_bin(vars, "In");
const std::vector<uint8_t> expected = get_req_bin(vars, "Out");
+ const std::string in_str(in.begin(), in.end());
+
std::string converted;
- if(type == "UTF16-LATIN1")
+
+ if(type == "UCS2-UTF8")
+ {
+ converted = Botan::ucs2_to_utf8(in.data(), in.size());
+ }
+ else if(type == "UCS4-UTF8")
+ {
+ converted = Botan::ucs4_to_utf8(in.data(), in.size());
+ }
+ else if(type == "UTF16-LATIN1")
{
- converted = Botan::Charset::transcode(std::string(in.begin(), in.end()),
+ converted = Botan::Charset::transcode(in_str,
Botan::Character_Set::LATIN1_CHARSET,
Botan::Character_Set::UCS2_CHARSET);
}
else if(type == "UTF8-LATIN1")
{
- converted = Botan::Charset::transcode(std::string(in.begin(), in.end()),
+ converted = Botan::Charset::transcode(in_str,
Botan::Character_Set::LATIN1_CHARSET,
Botan::Character_Set::UTF8_CHARSET);
}
else if(type == "LATIN1-UTF8")
{
- converted = Botan::Charset::transcode(std::string(in.begin(), in.end()),
+ converted = Botan::Charset::transcode(in_str,
Botan::Character_Set::UTF8_CHARSET,
Botan::Character_Set::LATIN1_CHARSET);
}