diff options
Diffstat (limited to 'LibOVR/Src/Kernel/OVR_UTF8Util.cpp')
-rw-r--r-- | LibOVR/Src/Kernel/OVR_UTF8Util.cpp | 556 |
1 files changed, 0 insertions, 556 deletions
diff --git a/LibOVR/Src/Kernel/OVR_UTF8Util.cpp b/LibOVR/Src/Kernel/OVR_UTF8Util.cpp deleted file mode 100644 index 68e58ea..0000000 --- a/LibOVR/Src/Kernel/OVR_UTF8Util.cpp +++ /dev/null @@ -1,556 +0,0 @@ -/************************************************************************** - -Filename : OVR_UTF8Util.cpp -Content : UTF8 Unicode character encoding/decoding support -Created : September 19, 2012 -Notes : -Notes : Much useful info at "UTF-8 and Unicode FAQ" - http://www.cl.cam.ac.uk/~mgk25/unicode.html - -Copyright : Copyright 2014 Oculus VR, LLC All Rights reserved. - -Licensed under the Oculus VR Rift SDK License Version 3.2 (the "License"); -you may not use the Oculus VR Rift SDK except in compliance with the License, -which is provided at the time of installation or download, or which -otherwise accompanies this software in either electronic or hard copy form. - -You may obtain a copy of the License at - -http://www.oculusvr.com/licenses/LICENSE-3.2 - -Unless required by applicable law or agreed to in writing, the Oculus VR SDK -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -************************************************************************************/ - -#include "OVR_UTF8Util.h" - -namespace OVR { namespace UTF8Util { - -intptr_t OVR_STDCALL GetLength(const char* buf, intptr_t buflen) -{ - const char* p = buf; - intptr_t length = 0; - - if (buflen != -1) - { - while (p - buf < buflen) - { - // We should be able to have ASStrings with 0 in the middle. - UTF8Util::DecodeNextChar_Advance0(&p); - length++; - } - } - else - { - while (UTF8Util::DecodeNextChar_Advance0(&p)) - length++; - } - - return length; -} - -uint32_t OVR_STDCALL GetCharAt(intptr_t index, const char* putf8str, intptr_t length) -{ - const char* buf = putf8str; - uint32_t c = 0; - - if (length != -1) - { - while (buf - putf8str < length) - { - c = UTF8Util::DecodeNextChar_Advance0(&buf); - if (index == 0) - return c; - index--; - } - - return c; - } - - do - { - c = UTF8Util::DecodeNextChar_Advance0(&buf); - index--; - - if (c == 0) - { - // We've hit the end of the string; don't go further. - OVR_ASSERT(index == 0); - return c; - } - } while (index >= 0); - - return c; -} - -intptr_t OVR_STDCALL GetByteIndex(intptr_t index, const char *putf8str, intptr_t length) -{ - const char* buf = putf8str; - - if (length != -1) - { - while ((buf - putf8str) < length && index > 0) - { - UTF8Util::DecodeNextChar_Advance0(&buf); - index--; - } - - return buf-putf8str; - } - - while (index > 0) - { - uint32_t c = UTF8Util::DecodeNextChar_Advance0(&buf); - index--; - - if (c == 0) - return buf-putf8str; - }; - - return buf-putf8str; -} - -int OVR_STDCALL GetEncodeCharSize(uint32_t ucs_character) -{ - if (ucs_character <= 0x7F) - return 1; - else if (ucs_character <= 0x7FF) - return 2; - else if (ucs_character <= 0xFFFF) - return 3; - else if (ucs_character <= 0x1FFFFF) - return 4; - else if (ucs_character <= 0x3FFFFFF) - return 5; - else if (ucs_character <= 0x7FFFFFFF) - return 6; - else - return 0; -} - -uint32_t OVR_STDCALL DecodeNextChar_Advance0(const char** putf8Buffer) -{ - uint32_t uc; - char c; - - // Security considerations: - // - // Changed, this is now only the case for DecodeNextChar: - // - If we hit a zero byte, we want to return 0 without stepping - // the buffer pointer past the 0. th - // - // If we hit an "overlong sequence"; i.e. a character encoded - // in a longer multibyte string than is necessary, then we - // need to discard the character. This is so attackers can't - // disguise dangerous characters or character sequences -- - // there is only one valid encoding for each character. - // - // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE, - // 0xFFFF } then we ignore them; they are not valid in UTF-8. - - // This isn't actually an invalid character; it's a valid char that - // looks like an inverted question mark. -#define INVALID_CHAR 0x0FFFD - -#define FIRST_BYTE(mask, shift) \ - uc = (c & (mask)) << (shift); - -#define NEXT_BYTE(shift) \ - c = **putf8Buffer; \ - if (c == 0) return 0; /* end of buffer, do not advance */ \ - if ((c & 0xC0) != 0x80) return INVALID_CHAR; /* standard check */ \ - (*putf8Buffer)++; \ - uc |= (c & 0x3F) << shift; - - c = **putf8Buffer; - (*putf8Buffer)++; - if (c == 0) - return 0; // End of buffer. - - if ((c & 0x80) == 0) return (uint32_t) c; // Conventional 7-bit ASCII. - - // Multi-byte sequences. - if ((c & 0xE0) == 0xC0) - { - // Two-byte sequence. - FIRST_BYTE(0x1F, 6); - NEXT_BYTE(0); - if (uc < 0x80) return INVALID_CHAR; // overlong - return uc; - } - else if ((c & 0xF0) == 0xE0) - { - // Three-byte sequence. - FIRST_BYTE(0x0F, 12); - NEXT_BYTE(6); - NEXT_BYTE(0); - if (uc < 0x800) return INVALID_CHAR; // overlong - // Not valid ISO 10646, but Flash requires these to work - // see AS3 test e15_5_3_2_3 for String.fromCharCode().charCodeAt(0) - // if (uc >= 0x0D800 && uc <= 0x0DFFF) return INVALID_CHAR; - // if (uc == 0x0FFFE || uc == 0x0FFFF) return INVALID_CHAR; // not valid ISO 10646 - return uc; - } - else if ((c & 0xF8) == 0xF0) - { - // Four-byte sequence. - FIRST_BYTE(0x07, 18); - NEXT_BYTE(12); - NEXT_BYTE(6); - NEXT_BYTE(0); - if (uc < 0x010000) return INVALID_CHAR; // overlong - return uc; - } - else if ((c & 0xFC) == 0xF8) - { - // Five-byte sequence. - FIRST_BYTE(0x03, 24); - NEXT_BYTE(18); - NEXT_BYTE(12); - NEXT_BYTE(6); - NEXT_BYTE(0); - if (uc < 0x0200000) return INVALID_CHAR; // overlong - return uc; - } - else if ((c & 0xFE) == 0xFC) - { - // Six-byte sequence. - FIRST_BYTE(0x01, 30); - NEXT_BYTE(24); - NEXT_BYTE(18); - NEXT_BYTE(12); - NEXT_BYTE(6); - NEXT_BYTE(0); - if (uc < 0x04000000) return INVALID_CHAR; // overlong - return uc; - } - else - { - // Invalid. - return INVALID_CHAR; - } -} - - -void OVR_STDCALL EncodeChar(char* pbuffer, intptr_t* pindex, uint32_t ucs_character) -{ - if (ucs_character <= 0x7F) - { - // Plain single-byte ASCII. - pbuffer[(*pindex)++] = (char) ucs_character; - } - else if (ucs_character <= 0x7FF) - { - // Two bytes. - pbuffer[(*pindex)++] = 0xC0 | (char)(ucs_character >> 6); - pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); - } - else if (ucs_character <= 0xFFFF) - { - // Three bytes. - pbuffer[(*pindex)++] = 0xE0 | (char)(ucs_character >> 12); - pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); - pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); - } - else if (ucs_character <= 0x1FFFFF) - { - // Four bytes. - pbuffer[(*pindex)++] = 0xF0 | (char)(ucs_character >> 18); - pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F); - pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); - pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); - } - else if (ucs_character <= 0x3FFFFFF) - { - // Five bytes. - pbuffer[(*pindex)++] = 0xF8 | (char)(ucs_character >> 24); - pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F); - pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F); - pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); - pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); - } - else if (ucs_character <= 0x7FFFFFFF) - { - // Six bytes. - pbuffer[(*pindex)++] = 0xFC | (char)(ucs_character >> 30); - pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 24) & 0x3F); - pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F); - pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F); - pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); - pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); - } - else - { - // Invalid char; don't encode anything. - } -} - -intptr_t OVR_STDCALL GetEncodeStringSize(const wchar_t* pchar, intptr_t length) -{ - intptr_t len = 0; - if (length != -1) - for (int i = 0; i < length; i++) - { - len += GetEncodeCharSize(pchar[i]); - } - else - for (int i = 0;; i++) - { - if (pchar[i] == 0) - return len; - len += GetEncodeCharSize(pchar[i]); - } - return len; -} - -void OVR_STDCALL EncodeString(char *pbuff, const wchar_t* pchar, intptr_t length) -{ - intptr_t ofs = 0; - if (length != -1) - { - for (int i = 0; i < length; i++) - { - EncodeChar(pbuff, &ofs, pchar[i]); - } - } - else - { - for (int i = 0;; i++) - { - if (pchar[i] == 0) - break; - EncodeChar(pbuff, &ofs, pchar[i]); - } - } - pbuff[ofs] = 0; -} - -size_t OVR_STDCALL DecodeString(wchar_t *pbuff, const char* putf8str, intptr_t bytesLen) -{ - wchar_t *pbegin = pbuff; - if (bytesLen == -1) - { - while (1) - { - uint32_t ch = DecodeNextChar_Advance0(&putf8str); - if (ch == 0) - break; - else if (ch >= 0xFFFF) - ch = 0xFFFD; - *pbuff++ = wchar_t(ch); - } - } - else - { - const char* p = putf8str; - while ((p - putf8str) < bytesLen) - { - uint32_t ch = DecodeNextChar_Advance0(&p); - if (ch >= 0xFFFF) - ch = 0xFFFD; - *pbuff++ = wchar_t(ch); - } - } - - *pbuff = 0; - return pbuff - pbegin; -} - - -#ifdef UTF8_UNIT_TEST - -// Compile this test case with something like: -// -// gcc utf8.cpp -g -I.. -DUTF8_UNIT_TEST -lstdc++ -o utf8_test -// -// or -// -// cl utf8.cpp -Zi -Od -DUTF8_UNIT_TEST -I.. -// -// If possible, try running the test program with the first arg -// pointing at the file: -// -// http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt -// -// and examine the results by eye to make sure they are acceptable to -// you. - - -#include "base/utility.h" -#include <stdio.h> - - -bool check_equal(const char* utf8_in, const uint32_t* ucs_in) -{ - for (;;) - { - uint32_t next_ucs = *ucs_in++; - uint32_t next_ucs_from_utf8 = utf8::decode_next_unicode_character(&utf8_in); - if (next_ucs != next_ucs_from_utf8) - { - return false; - } - if (next_ucs == 0) - { - OVR_ASSERT(next_ucs_from_utf8 == 0); - break; - } - } - - return true; -} - - -void log_ascii(const char* line) -{ - for (;;) - { - unsigned char c = (unsigned char) *line++; - if (c == 0) - { - // End of line. - return; - } - else if (c != '\n' - && (c < 32 || c > 127)) - { - // Non-printable as plain ASCII. - printf("<0x%02X>", (int) c); - } - else - { - printf("%c", c); - } - } -} - - -void log_ucs(const uint32_t* line) -{ - for (;;) - { - uint32_t uc = *line++; - if (uc == 0) - { - // End of line. - return; - } - else if (uc != '\n' - && (uc < 32 || uc > 127)) - { - // Non-printable as plain ASCII. - printf("<U-%04X>", uc); - } - else - { - printf("%c", (char) uc); - } - } -} - - -// Simple canned test. -int main(int argc, const char* argv[]) -{ - { - const char* test8 = "Ignacio CastaƱo"; - const uint32_t test32[] = - { - 0x49, 0x67, 0x6E, 0x61, 0x63, - 0x69, 0x6F, 0x20, 0x43, 0x61, - 0x73, 0x74, 0x61, 0xF1, 0x6F, - 0x00 - }; - - OVR_ASSERT(check_equal(test8, test32)); - } - - // If user passed an arg, try reading the file as UTF-8 encoded text. - if (argc > 1) - { - const char* filename = argv[1]; - FILE* fp = fopen(filename, "rb"); - if (fp == NULL) - { - printf("Can't open file '%s'\n", filename); - return 1; - } - - // Read lines from the file, encode/decode them, and highlight discrepancies. - const int LINE_SIZE = 200; // max line size - char line_buffer_utf8[LINE_SIZE]; - char reencoded_utf8[6 * LINE_SIZE]; - uint32_t line_buffer_ucs[LINE_SIZE]; - - int byte_counter = 0; - for (;;) - { - int c = fgetc(fp); - if (c == EOF) - { - // Done. - break; - } - line_buffer_utf8[byte_counter++] = c; - if (c == '\n' || byte_counter >= LINE_SIZE - 2) - { - // End of line. Process the line. - line_buffer_utf8[byte_counter++] = 0; // terminate. - - // Decode into UCS. - const char* p = line_buffer_utf8; - uint32_t* q = line_buffer_ucs; - for (;;) - { - uint32_t uc = UTF8Util::DecodeNextChar(&p); - *q++ = uc; - - OVR_ASSERT(q < line_buffer_ucs + LINE_SIZE); - OVR_ASSERT(p < line_buffer_utf8 + LINE_SIZE); - - if (uc == 0) break; - } - - // Encode back into UTF-8. - q = line_buffer_ucs; - int index = 0; - for (;;) - { - uint32_t uc = *q++; - OVR_ASSERT(index < LINE_SIZE * 6 - 6); - int last_index = index; - UTF8Util::EncodeChar(reencoded_utf8, &index, uc); - OVR_ASSERT(index <= last_index + 6); - if (uc == 0) break; - } - - // This can be useful for debugging. -#if 0 - // Show the UCS and the re-encoded UTF-8. - log_ucs(line_buffer_ucs); - log_ascii(reencoded_utf8); -#endif // 0 - - OVR_ASSERT(check_equal(line_buffer_utf8, line_buffer_ucs)); - OVR_ASSERT(check_equal(reencoded_utf8, line_buffer_ucs)); - - // Start next line. - byte_counter = 0; - } - } - - fclose(fp); - } - - return 0; -} - - -#endif // UTF8_UNIT_TEST - -}} // namespace UTF8Util::OVR - |