Added a 'regexable' mode for ANSIString, fixed a few bugs with it.

Refactored with metaclass and added comments. Resolves #481, Resolves #480
2014-02-15 18:41:55 -06:00 · 2014-02-15 18:41:55 -06:00 · a9ad82d005
commit a9ad82d005
parent 638483fa66
1 changed files with 395 additions and 200 deletions
--- a/src/utils/ansi.py
+++ b/src/utils/ansi.py
@ -174,11 +174,11 @@ class ANSIParser(object):
        strip_ansi flag instead removes all ansi markup.
        """
-        if hasattr(string, 'raw_string'):
+        if hasattr(string, '_raw_string'):
            if strip_ansi:
-                return string.clean_string
+                return string.clean()
            else:
-                return string.raw_string
+                return string.raw()
        if not string:
            return ''
        self.do_xterm256 = xterm256
@ -322,6 +322,10 @@ def group(lst, n):
 def _spacing_preflight(func):
    """
    This wrapper function is used to do some preflight checks on functions used
    for padding ANSIStrings.
    """
    def wrapped(self, width, fillchar=None):
        if fillchar is None:
            fillchar = " "
@ -336,195 +340,14 @@ def _spacing_preflight(func):
    return wrapped
 class ANSIString(unicode):
    """
    String-like object that is aware of ANSI codes.
    This isn't especially efficient, as it doesn't really have an
    understanding of what the codes mean in order to eliminate
    redundant characters, but a proper parser would have to be written for
    that.
    Take note of the instructions at the bottom of the module, which modify
    this class.
    """
    def __new__(cls, *args, **kwargs):
        """
        When creating a new ANSIString, you may use a custom parser that has
        the same attributes as the standard one, and you may declare the
        string to be handled as already decoded. It is important not to double
        decode strings, as escapes can only be respected once.
        """
        string = to_str(args[0], force_string=True)
        if not isinstance(string, basestring):
            string = str(string)
        parser = kwargs.get('parser', ANSI_PARSER)
        # Input that already carries a raw_string attribute is treated as
        # decoded, even when the caller did not pass decoded=True.
        decoded = kwargs.get('decoded', False) or hasattr(string, 'raw_string')
        if not decoded:
            string = parser.parse_ansi(string)
        return super(ANSIString, cls).__new__(ANSIString, string, 'utf-8')
    def __repr__(self):
        """
        Return a constructor-style representation built from raw_string.
        """
        return "ANSIString(%s, decoded=True)" % repr(self.raw_string)
    def __init__(self, *args, **kwargs):
        """
        Store the parser, the raw (escape-containing) and clean
        (escape-stripped) unicode forms, and the two index tables produced by
        _get_indexes().
        """
        self.parser = kwargs.pop('parser', ANSI_PARSER)
        super(ANSIString, self).__init__(*args, **kwargs)
        self.raw_string = unicode(self)
        self.clean_string = unicode(self.parser.parse_ansi(
            self.raw_string, strip_ansi=True), 'utf-8')
        self._code_indexes, self._char_indexes = self._get_indexes()
    def __len__(self):
        """
        Length of the clean string, i.e. escape characters are not counted.
        """
        return len(self.clean_string)
    def __add__(self, other):
        """
        Concatenate on the raw strings and mark the result as decoded so it
        is not parsed again.
        """
        if not isinstance(other, basestring):
            return NotImplemented
        return ANSIString(self.raw_string + getattr(
            other, 'raw_string', other), decoded=True)
    def __radd__(self, other):
        """
        Reflected concatenation; mirrors __add__ with the operands swapped.
        """
        if not isinstance(other, basestring):
            return NotImplemented
        return ANSIString(getattr(
            other, 'raw_string', other) + self.raw_string, decoded=True)
    def __getslice__(self, i, j):
        """
        Forward simple [i:j] slices to __getitem__ as a slice object.
        """
        return self.__getitem__(slice(i, j))
    def _slice(self, item):
        """
        Build the ANSIString for a slice object. The slice is applied to the
        _char_indexes table, and any code characters lying between the
        selected raw indexes are copied into the result as well.
        """
        slice_indexes = self._char_indexes[item]
        if not slice_indexes:
            return ANSIString('')
        try:
            # The first character is fetched through __getitem__, which also
            # replays the codes that precede it.
            string = self[item.start].raw_string
        except IndexError:
            return ANSIString('')
        last_mark = slice_indexes[0]
        for i in slice_indexes[1:]:
            # Copy code characters found between the previous and current
            # selected character.
            for index in range(last_mark, i):
                if index in self._code_indexes:
                    string += self.raw_string[index]
            last_mark = i
            try:
                string += self.raw_string[i]
            except IndexError:
                pass
        return ANSIString(string, decoded=True)
    def __getitem__(self, item):
        """
        Return the readable character at position `item` (or delegate to
        _slice for slice objects), prefixed with every code character that
        occurs before it in the raw string.
        """
        if isinstance(item, slice):
            return self._slice(item)
        try:
            # Translate the clean-string position into a raw-string index.
            item = self._char_indexes[item]
        except IndexError:
            raise IndexError("ANSIString index out of range.")
        clean = self.raw_string[item]
        result = ''
        for index in range(0, item + 1):
            if index in self._code_indexes:
                result += self.raw_string[index]
        return ANSIString(result + clean, decoded=True)
    def rsplit(self, sep=None, maxsplit=None):
        """
        Same as split(), but delegating with reverse=True.
        """
        return self.split(sep, maxsplit, reverse=True)
    def split(self, sep=None, maxsplit=None, reverse=False):
        """
        Split the clean string, then slice this object at the matching
        offsets so each piece is returned as an ANSIString.
        """
        if hasattr(sep, 'clean_string'):
            sep = sep.clean_string
        args = [sep]
        if maxsplit is not None:
            args.append(maxsplit)
        if reverse:
            parent_result = self.clean_string.rsplit(*args)
        else:
            parent_result = self.clean_string.split(*args)
        current_index = 0
        result = []
        for section in parent_result:
            result.append(self[current_index:current_index + len(section)])
            # NOTE(review): with the default sep=None, len(sep) raises
            # TypeError here.
            current_index += (len(section)) + len(sep)
        return result
    def partition(self, sep, reverse=False):
        """
        Partition (or rpartition when reverse is true) the clean string and
        return the three pieces as slices of this object.
        """
        if hasattr(sep, 'clean_string'):
            sep = sep.clean_string
        if reverse:
            parent_result = self.clean_string.rpartition(sep)
        else:
            parent_result = self.clean_string.partition(sep)
        current_index = 0
        result = tuple()
        for section in parent_result:
            result += (self[current_index:current_index + len(section)],)
            current_index += len(section)
        return result
    def _get_indexes(self):
        """
        Build and return (code_indexes, char_indexes): the raw-string indexes
        covered by matches of parser.ansi_regex, and the indexes treated as
        readable characters.
        NOTE(review): the end-of-string handling below looks fragile when the
        string ends with an escape sequence or with exactly one readable
        character after the last escape -- verify against group().
        """
        matches = [
            (match.start(), match.end())
            for match in self.parser.ansi_regex.finditer(self.raw_string)]
        code_indexes = []
        # These are all the indexes which hold code characters.
        for start, end in matches:
            code_indexes.extend(range(start, end))
        if not code_indexes:
            # Plain string, no ANSI codes.
            return code_indexes, range(0, len(self.raw_string))
        flat_ranges = []
        # We need to get the ones between them, but the code might start at
        # the beginning, and there might be codes at the end.
        for tup in matches:
            flat_ranges.extend(tup)
        # Is the beginning of the string a code character?
        if flat_ranges[0] == 0:
            flat_ranges.pop(0)
        else:
            flat_ranges.insert(0, 0)
        # How about the end?
        end_index = (len(self.raw_string) - 1)
        if flat_ranges[-1] == end_index:
            flat_ranges.pop()
        else:
            flat_ranges.append(end_index)
        char_indexes = []
        for start, end in list(group(flat_ranges, 2)):
            char_indexes.extend(range(start, end))
        # The end character will be left off if it's a normal character. Fix
        # that here.
        if end_index in flat_ranges:
            char_indexes.append(end_index)
        return code_indexes, char_indexes
    @_spacing_preflight
    def center(self, width, fillchar, difference):
        """
        Pad both sides with fillchar; an odd leftover goes on the right.
        `difference` is supplied by the _spacing_preflight wrapper.
        """
        remainder = difference % 2
        # Relies on int / int yielding an int (Python 2 division).
        difference /= 2
        spacing = difference * fillchar
        result = spacing + self + spacing + (remainder * fillchar)
        return result
    @_spacing_preflight
    def ljust(self, width, fillchar, difference):
        """
        Append `difference` copies of fillchar to the right of the text.
        """
        return self + (difference * fillchar)
    @_spacing_preflight
    def rjust(self, width, fillchar, difference):
        """
        Prepend `difference` copies of fillchar to the left of the text.
        """
        return (difference * fillchar) + self
 def _query_super(func_name):
    """
    Have the string class handle this with the cleaned string instead of
    ANSIString.
    """
-    def query_func(self, *args, **kwargs):
+    def wrapped(self, *args, **kwargs):
-        return getattr(self.clean_string, func_name)(*args, **kwargs)
+        return getattr(self.clean(), func_name)(*args, **kwargs)
-    return query_func
+    return wrapped
 def _on_raw(func_name):
@ -536,7 +359,7 @@ def _on_raw(func_name):
        try:
            string = args.pop(0)
            if hasattr(string, 'raw_string'):
-                args.insert(0, string.raw_string)
+                args.insert(0, string.raw())
            else:
                args.insert(0, string)
        except IndexError:
@ -566,16 +389,388 @@ def _transform(func_name):
        return ANSIString(''.join(to_string), decoded=True)
    return wrapped
 class ANSIMeta(type):
    """
    Metaclass that installs generated wrapper methods on the class it builds.

    Many methods on ANSIString are only light wrappers around the unicode
    base class, so they are attached here during class construction instead
    of being written out one by one.
    """
    def __init__(cls, *args, **kwargs):
        """
        Attach every generated wrapper to `cls`, then let `type` finish
        initialising the class.
        """
        # Each row pairs a wrapper factory with the method names it builds.
        wrapper_table = (
            (_query_super, (
                'count', 'startswith', 'endswith', 'find', 'index', 'isalnum',
                'isalpha', 'isdigit', 'islower', 'isspace', 'istitle',
                'isupper', 'rfind', 'rindex', '__len__')),
            (_on_raw, (
                '__mul__', '__mod__', 'expandtabs', '__rmul__', 'join',
                'decode', 'replace', 'format')),
            (_transform, (
                'capitalize', 'translate', 'lower', 'upper', 'swapcase')),
        )
        for make_wrapper, method_names in wrapper_table:
            for method_name in method_names:
                setattr(cls, method_name, make_wrapper(method_name))
        super(ANSIMeta, cls).__init__(*args, **kwargs)
-for func_name in [
+
-        'count', 'startswith', 'endswith', 'find', 'index', 'isalnum',
+class ANSIString(unicode):
-        'isalpha', 'isdigit', 'islower', 'isspace', 'istitle', 'isupper',
+    """
-        'rfind', 'rindex']:
+    String-like object that is aware of ANSI codes.
-    setattr(ANSIString, func_name, _query_super(func_name))
+
-for func_name in [
+    This isn't especially efficient, as it doesn't really have an
-        '__mul__', '__mod__', 'expandtabs', '__rmul__', 'join',
+    understanding of what the codes mean in order to eliminate
-        'decode', 'replace', 'format']:
+    redundant characters. This could be made as an enhancement to ANSI_PARSER.
-    setattr(ANSIString, func_name, _on_raw(func_name))
+
-for func_name in [
+    If one is going to use ANSIString, one should generally avoid converting
-        'capitalize', 'translate', 'lower', 'upper', 'swapcase']:
+    away from it until one is about to send information on the wire. This is
-    setattr(ANSIString, func_name, _transform(func_name))
+    because escape sequences in the string may otherwise already be decoded,
    and taken literally the second time around.
    Please refer to the Metaclass, ANSIMeta, which is used to apply wrappers
    for several of the methods that need not be defined directly here.
    """
    __metaclass__ = ANSIMeta
    def __new__(cls, *args, **kwargs):
        """
        Create the underlying unicode object for a new ANSIString.

        When creating a new ANSIString, you may use a custom parser that has
        the same attributes as the standard one, and you may declare the
        string to be handled as already decoded. It is important not to double
        decode strings, as escapes can only be respected once. An argument
        that is itself an ANSIString (it carries `_raw_string`) is therefore
        always treated as already decoded.

        Keyword arguments read here:
            parser: object providing parse_ansi(); defaults to ANSI_PARSER.
            decoded: if true, the text is not passed through the parser.
            regexable: if true, __getitem__ works on the raw string.

        If the regexable flag is set, using __getitem__, such as when getting
        an index or slicing, will return the result from the raw string. If
        this flag is set False, it will intelligently skip ANSI escapes.

        ANSIString('{rHello{g, W{yorld', regexable=True)[0] will return the
        first byte of the escape sequence before 'Hello', while
        ANSIString('{rHello{g, W{yorld')[0] will return a red 'H'.

        When a regexable ANSIString is sliced, the result is returned as a
        non-regexable ANSI String. This ensures that usage of regexable
        ANSIStrings is an explicit choice.

        Why all this complication with the regexable flag?

        The reason is that while we are able to subclass the unicode object in
        Python, the byte representation of the string in memory cannot be
        changed and still exists under the hood. This doesn't matter for things
        coded in pure Python, but since Regexes need to be mindful of
        performance, the module that handles them operates directly on the
        memory representation of the string in order to do matching. It is thus
        completely unaware of our customizations to the class. Interestingly,
        however, while the re module does its matching on the raw string, it
        slices the string using the object's methods. This means that running
        a regex on an ANSIString would return matches at bogus indexes, since
        the __getitem__ method of ANSIString skips ANSI escape sequences, which
        were part of the raw data regex was matching against.

        So, if you need to use regex on an ANSIString, make sure you get it in
        regexable mode first, and be ready to deal with a few edge cases.
        """
        source = args[0]
        string = to_str(source, force_string=True)
        if not isinstance(string, basestring):
            string = str(string)
        parser = kwargs.get('parser', ANSI_PARSER)
        regexable = kwargs.get('regexable', False)
        # ANSIStrings expose their decoded text as `_raw_string` (the old
        # public `raw_string` attribute no longer exists). The check is made
        # on the original argument as well, in case to_str() hands back a
        # plain string that has lost the attribute.
        decoded = (kwargs.get('decoded', False)
                   or hasattr(source, '_raw_string')
                   or hasattr(string, '_raw_string'))
        if not decoded:
            string = parser.parse_ansi(string)
        if isinstance(string, unicode):
            # Already unicode: passing an encoding would raise TypeError.
            string = super(ANSIString, cls).__new__(ANSIString, string)
        else:
            string = super(ANSIString, cls).__new__(ANSIString, string, 'utf-8')
        string._regexable = regexable
        return string
    def __repr__(self):
        """
        Let's make the repr the command that would actually be used to
        construct this object, for convenience and reference.
        """
        if self._regexable:
            reg = ', regexable=True'
        else:
            reg = ''
        return "ANSIString(%s, decoded=True%s)" % (repr(self._raw_string), reg)
    def __init__(self, *args, **kwargs):
        """
        Set up the internal state every ANSIString carries.

        parser:
            The ANSI parser in use. Evennia's standard parser can be replaced
            with one of your own syntax, as long as it implements the same
            interface.
        _raw_string:
            The text including its ANSI escapes, as a unicode object rather
            than an encoded byte string. ANSIString is unicode based so that
            the index of each character stays meaningful even when a
            character would need more than one byte once encoded. Use
            .encode() when the encoded form is needed for the wire.
        _clean_string:
            A unicode object devoid of all ANSI escapes.
        _code_indexes, _char_indexes:
            Lookup tables telling which raw-string positions belong to ANSI
            escapes and which hold readable text.
        """
        self.parser = kwargs.pop('parser', ANSI_PARSER)
        super(ANSIString, self).__init__(*args, **kwargs)
        raw_text = unicode(self)
        self._raw_string = raw_text
        stripped = self.parser.parse_ansi(raw_text, strip_ansi=True)
        self._clean_string = unicode(stripped, 'utf-8')
        index_tables = self._get_indexes()
        self._code_indexes, self._char_indexes = index_tables
    def __add__(self, other):
        """
        Concatenate `other` onto the end of this string.

        We have to be careful when adding two strings not to reprocess things
        that don't need to be reprocessed, lest we end up with escapes being
        interpreted literally. The raw strings are therefore joined directly
        and the result is flagged as already decoded.

        Returns NotImplemented for operands that are not strings.
        """
        if not isinstance(other, basestring):
            return NotImplemented
        # ANSIStrings store their escape-carrying text in `_raw_string`;
        # plain strings have no such attribute and are used as they are.
        other_raw = getattr(other, '_raw_string', other)
        return ANSIString(self._raw_string + other_raw, decoded=True)
    def __radd__(self, other):
        """
        Concatenate this string onto the end of `other` (reflected add).

        As with __add__, the raw strings are joined directly and the result
        is flagged as already decoded so no escape is parsed twice.

        Returns NotImplemented for operands that are not strings.
        """
        if not isinstance(other, basestring):
            return NotImplemented
        # ANSIStrings store their escape-carrying text in `_raw_string`;
        # plain strings have no such attribute and are used as they are.
        other_raw = getattr(other, '_raw_string', other)
        return ANSIString(other_raw + self._raw_string, decoded=True)
    def __getslice__(self, i, j):
        """
        This function is deprecated, so we just make it call the proper
        function.
        """
        return self.__getitem__(slice(i, j))
    def _slice(self, slc):
        """
        Return the ANSIString selected by the slice object `slc`.

        Slices have to be handled specially. Not only are they able to specify
        a start and end with [x:y], but they can also specify an interval with
        [x:y:z]. As a result, not only do we have to track the ANSI escapes
        that have played before the start of the slice, we must also replay
        any that fall in these intervals, should they exist.

        Thankfully, slicing the _char_indexes table gives us the actual
        indexes that need slicing in the raw string. We can check between
        those indexes to figure out what escape characters need to be
        replayed.

        The first character is located through the sliced table rather than
        through slc.start, so slices without an explicit start (such as
        [::2]) and slices whose start lies before the beginning of the string
        behave like they do on a plain string.
        """
        slice_indexes = self._char_indexes[slc]
        if not slice_indexes:
            return ANSIString('')
        raw = self._raw_string
        # Set membership keeps the replay checks cheap on long strings.
        code_lookup = set(self._code_indexes)
        first = slice_indexes[0]
        # Replay every escape that precedes the first selected character.
        pieces = [raw[index] for index in range(0, first)
                  if index in code_lookup]
        pieces.append(raw[first])
        last_mark = first
        # Check between the slice intervals for escape sequences.
        for i in slice_indexes[1:]:
            pieces.extend(raw[index] for index in range(last_mark, i)
                          if index in code_lookup)
            pieces.append(raw[i])
            last_mark = i
        return ANSIString(''.join(pieces), decoded=True)
    def __getitem__(self, item):
        """
        Entry point for both indexing and slicing.

        A regexable ANSIString answers straight from the raw string, without
        any escape skipping (the reasons are laid out in the __new__
        docstring). Otherwise slice objects are passed on to _slice(), and an
        integer position yields that readable character preceded by every
        escape character that comes before it in the raw string.

        Raises IndexError when the position is outside the readable text.
        """
        raw = self._raw_string
        if self._regexable:
            return ANSIString(raw[item], decoded=True)
        if isinstance(item, slice):
            # Slices must be handled specially.
            return self._slice(item)
        try:
            raw_position = self._char_indexes[item]
        except IndexError:
            raise IndexError("ANSIString index out of range.")
        character = raw[raw_position]
        # Replay all escape sequences previous to the requested character.
        replayed = ''.join(
            raw[position] for position in range(0, raw_position + 1)
            if position in self._code_indexes)
        return ANSIString(replayed + character, decoded=True)
    def rsplit(self, sep=None, maxsplit=None):
        """
        Like split, but from the end of the string, rather than the beginning.

        This simply calls split() with the same `sep` and `maxsplit` and
        reverse=True, and returns whatever split() returns.
        """
        return self.split(sep, maxsplit, reverse=True)
    def split(self, sep=None, maxsplit=None, reverse=False):
        """
        Split in a manner similar to the standard string split method.

        First, we split the clean string. Then we work out where each section
        of that result starts and ends, and slice this object at those
        positions so that any escapes that would have occurred before a
        section are replayed in it.

        sep:
            Separator; an ANSIString separator is matched by its clean text.
            None (the default) splits on runs of whitespace.
        maxsplit:
            Maximum number of splits, or None for no limit.
        reverse:
            If true, split from the end of the string (as rsplit does).

        Returns a list of slices of this object.
        """
        if hasattr(sep, '_clean_string'):
            sep = sep.clean()
        args = [sep]
        if maxsplit is not None:
            args.append(maxsplit)
        clean = self._clean_string
        if reverse:
            parent_result = clean.rsplit(*args)
        else:
            parent_result = clean.split(*args)
        current_index = 0
        result = []
        for section in parent_result:
            if sep is None:
                # Whitespace runs have no fixed width, so look up where this
                # section really begins instead of assuming it follows
                # directly after the previous one.
                current_index = clean.find(section, current_index)
            section_end = current_index + len(section)
            result.append(self[current_index:section_end])
            current_index = section_end
            if sep is not None:
                # An explicit separator always has the same width.
                current_index += len(sep)
        return result
    def clean(self):
        """
        Return a unicode object without the ANSI escapes.

        This is the `_clean_string` attribute stored on the instance.
        """
        return self._clean_string
    def raw(self):
        """
        Return a unicode object with the ANSI escapes.

        This is the `_raw_string` attribute stored on the instance.
        """
        return self._raw_string
    def is_regexable(self):
        """
        State whether or not this ANSIString is a 'regexable' ANSIString.

        Regexable ANSIStrings return indexes from _raw_string when sliced.
        The value returned is the `_regexable` attribute of the instance.
        """
        return self._regexable
    def regexable(self):
        """
        Return the regexable version of this ANSIString.

        A new ANSIString is constructed from this one with decoded=True and
        regexable=True; this object itself is not modified.
        """
        return ANSIString(self, decoded=True, regexable=True)
    def non_regexable(self):
        """
        Return the non-regexable version of this ANSIString.

        A new ANSIString is constructed from this one with decoded=True and
        no regexable flag; this object itself is not modified.
        """
        return ANSIString(self, decoded=True)
    def partition(self, sep, reverse=False):
        """
        Similar to split, but always produces a tuple with three items:

        1. The part before the separator.
        2. The separator itself.
        3. The part after.

        The clean string is partitioned (from the right when `reverse` is
        true) and each piece is then cut out of this object by position, so
        that every piece keeps its colouring.
        """
        if hasattr(sep, '_clean_string'):
            sep = sep.clean()
        clean = self._clean_string
        divide = clean.rpartition if reverse else clean.partition
        pieces = []
        offset = 0
        for chunk in divide(sep):
            stop = offset + len(chunk)
            pieces.append(self[offset:stop])
            offset = stop
        return tuple(pieces)
    def _get_indexes(self):
        """
        Two tables need to be made, one which contains the indexes of all
        readable characters, and one which contains the indexes of all ANSI
        escapes. It's important to remember that ANSI escapes require more
        that one character at a time, though no readable character needs more
        than one character, since the unicode base class abstracts that away
        from us. However, several readable characters can be placed in a row.
        We must use regexes here to figure out where all the escape sequences
        are hiding in the string. Then we use the ranges of their starts and
        ends to create a final, comprehensive list of all indexes which are
        dedicated to code, and all dedicated to text.
        It's possible that only one of these tables is actually needed, the
        other assumed to be what isn't in the first.
        """
        matches = [
            (match.start(), match.end())
            for match in self.parser.ansi_regex.finditer(self._raw_string)]
        code_indexes = []
        # These are all the indexes which hold code characters.
        for start, end in matches:
            code_indexes.extend(range(start, end))
        if not code_indexes:
            # Plain string, no ANSI codes.
            return code_indexes, range(0, len(self._raw_string))
        flat_ranges = []
        # We need to get the ones between them, but the code might start at
        # the beginning, and there might be codes at the end.
        for tup in matches:
            flat_ranges.extend(tup)
        # Is the beginning of the string a code character?
        if flat_ranges[0] == 0:
            flat_ranges.pop(0)
        else:
            flat_ranges.insert(0, 0)
        # How about the end?
        end_index = (len(self._raw_string) - 1)
        if flat_ranges[-1] == end_index:
            flat_ranges.pop()
        else:
            flat_ranges.append(end_index)
        char_indexes = []
        for start, end in list(group(flat_ranges, 2)):
            char_indexes.extend(range(start, end))
        # The end character will be left off if it's a normal character. Fix
        # that here.
        if end_index in flat_ranges:
            char_indexes.append(end_index)
        return code_indexes, char_indexes
    @_spacing_preflight
    def center(self, width, fillchar, difference):
        """
        Center some text with some padding on both sides.

        `difference` is the number of fill characters to distribute; it is
        provided by the _spacing_preflight wrapper rather than by the caller.
        When it is odd, the extra fill character goes on the right.
        """
        # divmod keeps the padding count an integer regardless of the
        # division rules in effect, so it can always multiply a string.
        half, remainder = divmod(difference, 2)
        spacing = half * fillchar
        return spacing + self + spacing + (remainder * fillchar)
    @_spacing_preflight
    def ljust(self, width, fillchar, difference):
        """
        Left justify the text by appending `difference` fill characters.
        """
        padding = difference * fillchar
        return self + padding
    @_spacing_preflight
    def rjust(self, width, fillchar, difference):
        """
        Right justify the text by prepending `difference` fill characters.
        """
        padding = difference * fillchar
        return padding + self