src/def_char.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121

/*************************************************
* Default Character Set Handling Source File     *
* (C) 1999-2007 The Botan Project                *
*************************************************/

#include <botan/def_char.h>
#include <botan/exceptn.h>
#include <botan/parsing.h>

namespace Botan {

namespace {

/*************************************************
* Convert from UCS-2 to ISO 8859-1               *
*************************************************/
std::string ucs2_to_latin1(const std::string& ucs2)
   {
   if(ucs2.size() % 2 == 1)
      throw Decoding_Error("UCS-2 string has an odd number of bytes");

   std::string latin1;

   for(u32bit j = 0; j != ucs2.size(); j += 2)
      {
      const byte c1 = ucs2[j];
      const byte c2 = ucs2[j+1];

      if(c1 != 0)
         throw Decoding_Error("UCS-2 has non-Latin1 characters");

      latin1 += (char)c2;
      }

   return latin1;
   }

/*************************************************
* Convert from UTF-8 to ISO 8859-1               *
*************************************************/
std::string utf8_to_latin1(const std::string& utf8)
   {
   std::string iso8859;

   u32bit position = 0;
   while(position != utf8.size())
      {
      const byte c1 = (byte)utf8[position++];

      if(c1 <= 0x7F)
         iso8859 += (char)c1;
      else if(c1 >= 0xC0 && c1 <= 0xC7)
         {
         if(position == utf8.size())
            throw Decoding_Error("UTF-8: sequence truncated");

         const byte c2 = (byte)utf8[position++];
         const byte iso_char = ((c1 & 0x07) << 6) | (c2 & 0x3F);

         if(iso_char <= 0x7F)
            throw Decoding_Error("UTF-8: sequence longer than needed");

         iso8859 += (char)iso_char;
         }
      else
         throw Decoding_Error("UTF-8: Unicode chars not in Latin1 used");
      }

   return iso8859;
   }

/*************************************************
* Convert from ISO 8859-1 to UTF-8               *
*************************************************/
std::string latin1_to_utf8(const std::string& iso8859)
   {
   std::string utf8;
   for(u32bit j = 0; j != iso8859.size(); ++j)
      {
      const byte c = (byte)iso8859[j];

      if(c <= 0x7F)
         utf8 += (char)c;
      else
         {
         utf8 += (char)(0xC0 | (c >> 6));
         utf8 += (char)(0x80 | (c & 0x3F));
         }
      }
   return utf8;
   }

}

/*************************************************
* Transcode between character sets               *
*************************************************/
std::string Default_Charset_Transcoder::transcode(const std::string& str,
                                                  Character_Set to,
                                                  Character_Set from) const
   {
   if(to == LOCAL_CHARSET)
      to = LATIN1_CHARSET;
   if(from == LOCAL_CHARSET)
      from = LATIN1_CHARSET;

   if(to == from)
      return str;

   if(from == LATIN1_CHARSET && to == UTF8_CHARSET)
      return latin1_to_utf8(str);
   if(from == UTF8_CHARSET && to == LATIN1_CHARSET)
      return utf8_to_latin1(str);
   if(from == UCS2_CHARSET && to == LATIN1_CHARSET)
      return ucs2_to_latin1(str);

   throw Invalid_Argument("Unknown transcoding operation from " +
                          to_string(from) + " to " + to_string(to));
   }

}