Add UCS-2 and UCS-4 to UTF-8 conversion functions

Crosschecked by fuzzing and comparing with iconv. Needed in #1250.
author: Jack Lloyd <[email protected]> 2017-11-09 12:19:28 -0500
committer: Jack Lloyd <[email protected]> 2017-11-09 12:19:28 -0500
commit: 2aea9e3b2d6d9a3ad1f5e3c76e7e0d99c0872122 (patch)
tree: cf9d6e0763def177e4452119e6603dfc2ff49b9b /src
parent: 7e3cfd7eb4cc9165c9c53c81c3613c23db433cd7 (diff)
4 files changed, 131 insertions, 5 deletions
diff --git a/src/lib/utils/charset.cpp b/src/lib/utils/charset.cpp
index 546e4e74d..dadee8f78 100644
--- a/src/lib/utils/charset.cpp
+++ b/src/lib/utils/charset.cpp
@@ -7,10 +7,91 @@
 
 #include <botan/charset.h>
 #include <botan/exceptn.h>
+#include <botan/loadstor.h>
 #include <cctype>
 
 namespace Botan {
 
+namespace {
+
+void append_utf8_for(std::string& s, uint32_t c) // Appends the UTF-8 encoding of code point c to s; throws Decoding_Error if c cannot be encoded.
+   {
+   if(c >= 0xD800 && c < 0xE000) // values in 0xD800..0xDFFF (the surrogate range) are rejected up front
+      throw Decoding_Error("Invalid Unicode character");
+
+   if(c <= 0x7F) // 1-byte form: 0xxxxxxx
+      {
+      const uint8_t b0 = static_cast<uint8_t>(c);
+      s.push_back(static_cast<char>(b0));
+      }
+   else if(c <= 0x7FF) // 2-byte form: 110xxxxx 10xxxxxx
+      {
+      const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6); // top 5 payload bits
+      const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F); // low 6 payload bits
+      s.push_back(static_cast<char>(b0));
+      s.push_back(static_cast<char>(b1));
+      }
+   else if(c <= 0xFFFF) // 3-byte form: 1110xxxx 10xxxxxx 10xxxxxx
+      {
+      const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12); // top 4 payload bits
+      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
+      const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F);
+      s.push_back(static_cast<char>(b0));
+      s.push_back(static_cast<char>(b1));
+      s.push_back(static_cast<char>(b2));
+      }
+   else if(c <= 0x10FFFF) // 4-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      {
+      const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18); // top 3 payload bits
+      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F);
+      const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);
+      const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F);
+      s.push_back(static_cast<char>(b0));
+      s.push_back(static_cast<char>(b1));
+      s.push_back(static_cast<char>(b2));
+      s.push_back(static_cast<char>(b3));
+      }
+   else // values above 0x10FFFF are rejected
+      throw Decoding_Error("Invalid Unicode character");
+
+   }
+
+}
+
+std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len) // Converts len bytes of UCS-2 input into a UTF-8 std::string.
+   {
+   if(len % 2 != 0) // each character occupies two bytes, so an odd byte count is rejected
+      throw Decoding_Error("Invalid length for UCS-2 string");
+
+   const size_t chars = len / 2; // number of 16-bit units to convert
+
+   std::string s;
+   for(size_t i = 0; i != chars; ++i)
+      {
+      const uint16_t c = load_be<uint16_t>(ucs2, i); // load_be is defined outside this view; presumably reads the i-th big-endian 16-bit word - confirm
+      append_utf8_for(s, c); // throws Decoding_Error for values in 0xD800..0xDFFF
+      }
+
+   return s;
+   }
+
+std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len) // Converts len bytes of UCS-4 input into a UTF-8 std::string.
+   {
+   if(len % 4 != 0) // each character occupies four bytes, so other byte counts are rejected
+      throw Decoding_Error("Invalid length for UCS-4 string");
+
+   const size_t chars = len / 4; // number of 32-bit units to convert
+
+   std::string s;
+   for(size_t i = 0; i != chars; ++i)
+      {
+      const uint32_t c = load_be<uint32_t>(ucs4, i); // load_be is defined outside this view; presumably reads the i-th big-endian 32-bit word - confirm
+      append_utf8_for(s, c); // throws Decoding_Error for values in 0xD800..0xDFFF or above 0x10FFFF
+      }
+
+   return s;
+   }
+
 namespace Charset {
 
 namespace {
diff --git a/src/lib/utils/charset.h b/src/lib/utils/charset.h
index 528ab908b..3f2ff9912 100644
--- a/src/lib/utils/charset.h
+++ b/src/lib/utils/charset.h
@@ -23,6 +23,22 @@ enum Character_Set {
    LATIN1_CHARSET
 };
 
+/**
+* Convert a sequence of UCS-2 (big endian) characters to a UTF-8 string
+* This is used for ASN.1 BMPString type
+* @param ucs2 the sequence of UCS-2 characters
+* @param len length of ucs2 in bytes; must be a multiple of 2, otherwise Decoding_Error is thrown
+*/
+std::string BOTAN_UNSTABLE_API ucs2_to_utf8(const uint8_t ucs2[], size_t len);
+
+/**
+* Convert a sequence of UCS-4 (big endian) characters to a UTF-8 string
+* This is used for ASN.1 UniversalString type
+* @param ucs4 the sequence of UCS-4 characters
+* @param len length of ucs4 in bytes; must be a multiple of 4, otherwise Decoding_Error is thrown
+*/
+std::string BOTAN_UNSTABLE_API ucs4_to_utf8(const uint8_t ucs4[], size_t len);
+
 namespace Charset {
 
 /*
diff --git a/src/tests/data/charset.vec b/src/tests/data/charset.vec
index dd64ac6e3..6f12be8c2 100644
--- a/src/tests/data/charset.vec
+++ b/src/tests/data/charset.vec
@@ -1,3 +1,21 @@
+[UCS2-UTF8]
+In = 0042006F00740061006E
+Out = 426F74616E
+
+# Nonsense, converted with iconv
+In = 03B404960556096710751827FFF0
+Out = CEB4D296D596E0A5A7E181B5E1A0A7EFBFB0
+
+In = B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B246B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B2B204
+Out = EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8986EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8AB2EB8884
+
+In = 0A0000000000000000000000000030000000000000001D1D1D1D01000000000000001D1D1D1D00000000000000000000
+Out = E0A880000000000000E38080000000E1B49DE1B49DC480000000E1B49DE1B49D0000000000
+
+[UCS4-UTF8]
+In = 0000004800000065000000690000007A000000F60000006C00000072000000FC000000630000006B00000073000000740000006F000000DF000000610000006200000064000000E40000006D0000007000000066000000750000006E00000067
+Out = 4865697AC3B66C72C3BC636B73746FC39F616264C3A46D7066756E67
+
 [UTF16-LATIN1]
 
 # Botan
@@ -38,4 +56,4 @@ Out = 4865697AC3B66C72C3BC636B73746FC39F616264C3A46D7066756E67
 
 # ÿ@Ðé¿ã!ð
 In = FF40D0E9BFE321F0
-Out = C3BF40C390C3A9C2BFC3A321C3B0
-\ No newline at end of file
+Out = C3BF40C390C3A9C2BFC3A321C3B0
diff --git a/src/tests/test_utils.cpp b/src/tests/test_utils.cpp
index 57cd3208c..da2d25d5e 100644
--- a/src/tests/test_utils.cpp
+++ b/src/tests/test_utils.cpp
@@ -409,22 +409,33 @@ class Charset_Tests final : public Text_Based_Test
          const std::vector<uint8_t> in = get_req_bin(vars, "In");
          const std::vector<uint8_t> expected = get_req_bin(vars, "Out");
 
+         const std::string in_str(in.begin(), in.end());
+
          std::string converted;
-         if(type == "UTF16-LATIN1")
+
+         if(type == "UCS2-UTF8")
+            {
+            converted = Botan::ucs2_to_utf8(in.data(), in.size());
+            }
+         else if(type == "UCS4-UTF8")
+            {
+            converted = Botan::ucs4_to_utf8(in.data(), in.size());
+            }
+         else if(type == "UTF16-LATIN1")
             {
-            converted = Botan::Charset::transcode(std::string(in.begin(), in.end()),
+            converted = Botan::Charset::transcode(in_str,
                                                   Botan::Character_Set::LATIN1_CHARSET,
                                                   Botan::Character_Set::UCS2_CHARSET);
             }
          else if(type == "UTF8-LATIN1")
             {
-            converted = Botan::Charset::transcode(std::string(in.begin(), in.end()),
+            converted = Botan::Charset::transcode(in_str,
                                                   Botan::Character_Set::LATIN1_CHARSET,
                                                   Botan::Character_Set::UTF8_CHARSET);
             }
          else if(type == "LATIN1-UTF8")
             {
-            converted = Botan::Charset::transcode(std::string(in.begin(), in.end()),
+            converted = Botan::Charset::transcode(in_str,
                                                   Botan::Character_Set::UTF8_CHARSET,
                                                   Botan::Character_Set::LATIN1_CHARSET);
             }
author	Jack Lloyd <[email protected]>	2017-11-09 12:19:28 -0500
committer	Jack Lloyd <[email protected]>	2017-11-09 12:19:28 -0500
commit	2aea9e3b2d6d9a3ad1f5e3c76e7e0d99c0872122 (patch)
tree	cf9d6e0763def177e4452119e6603dfc2ff49b9b /src
parent	7e3cfd7eb4cc9165c9c53c81c3613c23db433cd7 (diff)