Add an extra layer of encoding recovery when sending data to sessions, as per #800.
This commit is contained in:
parent
0a75fbf64e
commit
8b6891428f
1 changed files with 57 additions and 2 deletions
|
|
@ -1,3 +1,4 @@
|
||||||
|
# -*- encoding: utf-8 -*-
|
||||||
"""
|
"""
|
||||||
General helper functions that don't fit neatly under any given category.
|
General helper functions that don't fit neatly under any given category.
|
||||||
|
|
||||||
|
|
@ -503,6 +504,56 @@ def dbref_to_obj(inp, objclass, raise_errors=True):
|
||||||
dbid_to_obj = dbref_to_obj
|
dbid_to_obj = dbref_to_obj
|
||||||
|
|
||||||
|
|
||||||
|
# some direct translations for the latinify
|
||||||
|
_UNICODE_MAP = {"EM DASH": "-", "FIGURE DASH": "-", "EN DASH": "-", "HORIZONTAL BAR": "-",
|
||||||
|
"HORIZONTAL ELLIPSIS": "...", "RIGHT SINGLE QUOTATION MARK": "'"}
|
||||||
|
|
||||||
|
def latinify(unicode_string, default='?', pure_ascii=False):
|
||||||
|
"""
|
||||||
|
Convert a unicode string to "safe" ascii/latin-1 characters.
|
||||||
|
This is used as a last resort when normal decoding does not work.
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
unicode_string (unicode): A string to convert to an ascii
|
||||||
|
or latin-1 string.
|
||||||
|
default (str, optional): Characters resisting mapping will be replaced
|
||||||
|
with this character or string.
|
||||||
|
Notes:
|
||||||
|
This is inspired by the gist by Ricardo Murri:
|
||||||
|
https://gist.github.com/riccardomurri/3c3ccec30f037be174d3
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from unicodedata import name
|
||||||
|
|
||||||
|
converted = []
|
||||||
|
for unich in iter(unicode_string):
|
||||||
|
try:
|
||||||
|
ch = unich.decode('ascii')
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
# deduce a latin letter equivalent from the Unicode data
|
||||||
|
# point name; e.g., since `name(u'á') == 'LATIN SMALL
|
||||||
|
# LETTER A WITH ACUTE'` translate `á` to `a`. However, in
|
||||||
|
# some cases the unicode name is still "LATIN LETTER"
|
||||||
|
# although no direct equivalent in the Latin alphabeth
|
||||||
|
# exists (e.g., Þ, "LATIN CAPITAL LETTER THORN") -- we can
|
||||||
|
# avoid these cases by checking that the letter name is
|
||||||
|
# composed of one letter only.
|
||||||
|
# We also supply some direct-translations for some particular
|
||||||
|
# common cases.
|
||||||
|
what = name(unich)
|
||||||
|
if what in _UNICODE_MAP:
|
||||||
|
ch = _UNICODE_MAP[what]
|
||||||
|
else:
|
||||||
|
what = what.split()
|
||||||
|
if what[0] == 'LATIN' and what[2] == 'LETTER' and len(what[3]) == 1:
|
||||||
|
ch = what[3].lower() if what[1] == 'SMALL' else what[3].upper()
|
||||||
|
else:
|
||||||
|
ch = default
|
||||||
|
converted.append(chr(ord(ch)))
|
||||||
|
return ''.join(converted)
|
||||||
|
|
||||||
|
|
||||||
def to_unicode(obj, encoding='utf-8', force_string=False):
|
def to_unicode(obj, encoding='utf-8', force_string=False):
|
||||||
"""
|
"""
|
||||||
This decodes a suitable object to the unicode format.
|
This decodes a suitable object to the unicode format.
|
||||||
|
|
@ -570,7 +621,6 @@ def to_str(obj, encoding='utf-8', force_string=False):
|
||||||
conversion of objects to strings.
|
conversion of objects to strings.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if force_string and not isinstance(obj, basestring):
|
if force_string and not isinstance(obj, basestring):
|
||||||
# some sort of other object. Try to
|
# some sort of other object. Try to
|
||||||
# convert it to a string representation.
|
# convert it to a string representation.
|
||||||
|
|
@ -590,7 +640,12 @@ def to_str(obj, encoding='utf-8', force_string=False):
|
||||||
return obj
|
return obj
|
||||||
except UnicodeEncodeError:
|
except UnicodeEncodeError:
|
||||||
pass
|
pass
|
||||||
raise Exception("Error: Unicode could not encode unicode string '%s'(%s) to a bytestring. " % (obj, encoding))
|
|
||||||
|
# if we get to this point we have not found any way to convert this string. Try to parse it manually,
|
||||||
|
try:
|
||||||
|
return latinify(obj, '?')
|
||||||
|
except Exception, err:
|
||||||
|
raise Exception("%s, Error: Unicode could not encode unicode string '%s'(%s) to a bytestring. " % (err, obj, encoding))
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue