python: Rework bytes/unicode string handling

In both Python 2 and 3, opening a file without specifying the mode will open it for reading in text mode ('r'). On Python 2, the read() method of a file object opened in mode 'r' will return byte strings, while on Python 3 it will return unicode strings. Explicitly specifying the binary mode ('rb') then decoding the byte string means we always handle unicode strings on both Python 2 and 3. Which in turns means all re.match(line) will return unicode strings as well. If we also make expandCString return unicode strings, we don't need the call to the unicode() constructor any more. We were using the ugettext() method because it always returns unicode strings in Python 2, contrarily to the gettext() one which returns byte strings. The ugettext() method doesn't exist on Python 3, so we must use the right method on each version of Python. The last hurdles are that Python 3 doesn't let us concatenate unicode and byte strings directly, and that Python 2's stdout wants encoded byte strings while Python 3's want unicode strings. With these changes, the script gives the same output on both Python 2 and 3. Signed-off-by: Mathieu Bridon <[email protected]> Reviewed-by: Dylan Baker <[email protected]>
author: Mathieu Bridon <[email protected]> 2018-08-10 23:17:08 +0200
committer: Dylan Baker <[email protected]> 2018-08-10 15:14:48 -0700
commit: bd27203f4d808763ac24ac94eb677cacf3e7cb99 (patch)
tree: d152e27647c13f2dc953d9fcd3f500b465077e3e
parent: 15ac05fd45afb0d85f1806a77fc9ec47f4949f01 (diff)
1 files changed, 31 insertions, 10 deletions
diff --git a/src/util/xmlpool/gen_xmlpool.py b/src/util/xmlpool/gen_xmlpool.py
index b0db183854a..327709c7f8d 100644
--- a/src/util/xmlpool/gen_xmlpool.py
+++ b/src/util/xmlpool/gen_xmlpool.py
@@ -13,6 +13,12 @@ import sys
 import gettext
 import re
 
+
+if sys.version_info < (3, 0):
+    gettext_method = 'ugettext'
+else:
+    gettext_method = 'gettext'
+
 # Path to t_options.h
 template_header_path = sys.argv[1]
 
@@ -60,7 +66,7 @@ def expandCString (s):
     octa = False
     num = 0
     digits = 0
-    r = ''
+    r = u''
     while i < len(s):
         if not escape:
             if s[i] == '\\':
@@ -128,16 +134,29 @@ def expandMatches (matches, translations, end=None):
         if len(matches) == 1 and i < len(translations) and \
                not matches[0].expand (r'\7').endswith('\\'):
             suffix = ' \\'
-        # Expand the description line. Need to use ugettext in order to allow
-        # non-ascii unicode chars in the original English descriptions.
-        text = escapeCString (trans.ugettext (unicode (expandCString (
-            matches[0].expand (r'\5')), "utf-8"))).encode("utf-8")
-        print(matches[0].expand (r'\1' + lang + r'\3"' + text + r'"\7') + suffix)
+        text = escapeCString (getattr(trans, gettext_method) (expandCString (
+            matches[0].expand (r'\5'))))
+        text = (matches[0].expand (r'\1' + lang + r'\3"' + text + r'"\7') + suffix)
+
+        # In Python 2, stdout expects encoded byte strings, or else it will
+        # encode them with the ascii 'codec'
+        if sys.version_info.major == 2:
+            text = text.encode('utf-8')
+
+        print(text)
+
         # Expand any subsequent enum lines
         for match in matches[1:]:
-            text = escapeCString (trans.ugettext (unicode (expandCString (
-                match.expand (r'\3')), "utf-8"))).encode("utf-8")
-            print(match.expand (r'\1"' + text + r'"\5'))
+            text = escapeCString (getattr(trans, gettext_method) (expandCString (
+                match.expand (r'\3'))))
+            text = match.expand (r'\1"' + text + r'"\5')
+
+            # In Python 2, stdout expects encoded byte strings, or else it will
+            # encode them with the ascii 'codec'
+            if sys.version_info.major == 2:
+                text = text.encode('utf-8')
+
+            print(text)
 
         # Expand description end
         if end:
@@ -168,9 +187,11 @@ print("/***********************************************************************\
 
 # Process the options template and generate options.h with all
 # translations.
-template = open (template_header_path, "r")
+template = open (template_header_path, "rb")
 descMatches = []
 for line in template:
+    line = line.decode('utf-8')
+
     if len(descMatches) > 0:
         matchENUM     = reENUM    .match (line)
         matchDESC_END = reDESC_END.match (line)
author	Mathieu Bridon <[email protected]>	2018-08-10 23:17:08 +0200
committer	Dylan Baker <[email protected]>	2018-08-10 15:14:48 -0700
commit	bd27203f4d808763ac24ac94eb677cacf3e7cb99 (patch)
tree	d152e27647c13f2dc953d9fcd3f500b465077e3e
parent	15ac05fd45afb0d85f1806a77fc9ec47f4949f01 (diff)