Merge pull request #1 from SaladDais/speed_up_notation_parse

bennettgoble · web-flow · commit eea1b9aab097 · 2022-11-17T07:24:28.000-08:00
Speed up parsing notation LLSD
diff --git a/llsd/base.py b/llsd/base.py
@@ -141,6 +141,19 @@ def B(fmt):
             return fmt
 
 
+class PY3SemanticBytes(BytesType):
+    """Wrapper to make `buffer[n]` return an integer like in Py3"""
+    __slots__ = []
+
+    def __getitem__(self, item):
+        ret = super(PY3SemanticBytes, self).__getitem__(item)
+        # `buffer[n]` should return an integer, but slice syntax like
+        # `buffer[n:n+1]` should still return a `Bytes` object as before.
+        if is_integer(item):
+            return ord(ret)
+        return ret
+
+
 def is_integer(o):
     """ portable test if an object is like an int """
     return isinstance(o, IntTypes)
@@ -321,19 +334,6 @@ def _to_python(node):
     return NODE_HANDLERS[node.tag](node)
 
 
-def _hex_as_nybble(hex):
-    "Accepts a single hex character and returns a nybble."
-    if (hex >= b'0') and (hex <= b'9'):
-        return ord(hex) - ord(b'0')
-    elif (hex >= b'a') and (hex <=b'f'):
-        return 10 + ord(hex) - ord(b'a')
-    elif (hex >= b'A') and (hex <=b'F'):
-        return 10 + ord(hex) - ord(b'A')
-    else:
-        raise LLSDParseError('Invalid hex character: %s' % hex)
-
-
-
 class LLSDBaseFormatter(object):
     """
     This base class cannot be instantiated on its own: it assumes a subclass
@@ -366,13 +366,22 @@ def __init__(self):
         }
 
 
+_X_ORD = ord(b'x')
+_BACKSLASH_ORD = ord(b'\\')
+_DECODE_BUFF_ALLOC_SIZE = 1024
+
+
 class LLSDBaseParser(object):
     """
     Utility methods useful for parser subclasses.
     """
+    __slots__ = ['_buffer', '_index', '_decode_buff']
+
     def __init__(self):
         self._buffer = b''
-        self._index  = 0
+        self._index = 0
+        # Scratch space for decoding delimited strings
+        self._decode_buff = bytearray(_DECODE_BUFF_ALLOC_SIZE)
 
     def _error(self, message, offset=0):
         try:
@@ -399,53 +408,85 @@ def _getc(self, num=1):
 
     # map char following escape char to corresponding character
     _escaped = {
-        b'a': b'\a',
-        b'b': b'\b',
-        b'f': b'\f',
-        b'n': b'\n',
-        b'r': b'\r',
-        b't': b'\t',
-        b'v': b'\v',
+        ord(b'a'): ord(b'\a'),
+        ord(b'b'): ord(b'\b'),
+        ord(b'f'): ord(b'\f'),
+        ord(b'n'): ord(b'\n'),
+        ord(b'r'): ord(b'\r'),
+        ord(b't'): ord(b'\t'),
+        ord(b'v'): ord(b'\v'),
     }
 
     def _parse_string_delim(self, delim):
         "Parse a delimited string."
-        parts = bytearray()
-        found_escape = False
-        found_hex = False
-        found_digit = False
-        byte = 0
+        insert_idx = 0
+        delim_ord = ord(delim)
+        # Reuse the parser's preallocated scratch buffer for the decoded
+        # string output to avoid allocs in the hot loop.
+        decode_buff = self._decode_buff
+        # Cache these in locals, otherwise we have to perform a lookup on
+        # `self` in the hot loop.
+        buff = self._buffer
+        read_idx = self._index
+        cc = 0
         while True:
-            cc = self._getc()
-            if found_escape:
-                if found_hex:
-                    if found_digit:
-                        found_escape = False
-                        found_hex = False
-                        found_digit = False
-                        byte <<= 4
-                        byte |= _hex_as_nybble(cc)
-                        parts.append(byte)
-                        byte = 0
+            try:
+                cc = buff[read_idx]
+                read_idx += 1
+
+                if cc == _BACKSLASH_ORD:
+                    # Backslash, figure out if this is an \xNN hex escape or
+                    # something like \t
+                    cc = buff[read_idx]
+                    read_idx += 1
+                    if cc == _X_ORD:
+                        # It's a hex escape. char is the value of the two
+                        # following hex nybbles. This slice may result in
+                        # a short read (0 or 1 bytes): an empty slice makes
+                        # `int()` raise `ValueError` below, while a 1-byte
+                        # slice that parses leaves `read_idx` past the end,
+                        # causing an `IndexError` on the next loop iteration.
+                        hex_bytes = buff[read_idx:read_idx + 2]
+                        read_idx += 2
+                        try:
+                            # int() parses hex `bytes` directly (no decode
+                            # needed). NOTE: it also accepts sign/whitespace.
+                            cc = int(hex_bytes, 16)
+                        except ValueError as e:
+                            # One of the hex characters was likely invalid.
+                            # Wrap the ValueError so that we can provide a
+                            # byte offset in the error.
+                            self._index = read_idx
+                            self._error(e, offset=-2)
                     else:
-                        found_digit = True
-                        byte = _hex_as_nybble(cc)
-                elif cc == b'x':
-                    found_hex = True
-                else:
-                    found_escape = False
-                    # escape char preceding anything other than the chars in
-                    # _escaped just results in that same char without the
-                    # escape char
-                    parts.extend(self._escaped.get(cc, cc))
-            elif cc == b'\\':
-                found_escape = True
-            elif cc == delim:
-                break
-            else:
-                parts.extend(cc)
+                        # escape char preceding anything other than the chars
+                        # in _escaped just results in that same char without
+                        # the escape char
+                        cc = self._escaped.get(cc, cc)
+                elif cc == delim_ord:
+                    break
+            except IndexError:
+                # We can be reasonably sure that any IndexErrors inside here
+                # were caused by an out-of-bounds `buff[read_idx]`.
+                self._index = read_idx
+                self._error("Trying to read past end of buffer")
+
+            try:
+                decode_buff[insert_idx] = cc
+            except IndexError:
+                # Oops, that overflowed the decoding buffer, make a
+                # new expanded buffer containing the existing contents.
+                decode_buff = bytearray(decode_buff)
+                decode_buff.extend(b"\x00" * _DECODE_BUFF_ALLOC_SIZE)
+                decode_buff[insert_idx] = cc
+
+            insert_idx += 1
+
+        # Sync our local read index with the canonical one
+        self._index = read_idx
         try:
-            return parts.decode('utf-8')
+            # Slice off only what we used of the working decode buffer
+            return decode_buff[:insert_idx].decode('utf-8')
         except UnicodeDecodeError as exc:
             self._error(exc)
 
@@ -457,4 +498,4 @@ def starts_with(startstr, something):
         pos = something.tell()
         s = something.read(len(startstr))
         something.seek(pos, os.SEEK_SET)
-        return (s == startstr)
+        return (s == startstr)
diff --git a/llsd/serde_binary.py b/llsd/serde_binary.py
@@ -4,7 +4,7 @@
 import uuid
 
 from llsd.base import (_LLSD, LLSDBaseParser, LLSDSerializationError, _str_to_bytes, binary, is_integer, is_string,
-                       starts_with, uri)
+                       starts_with, uri, PY2, is_bytes, PY3SemanticBytes)
 
 
 class LLSDBinaryParser(LLSDBaseParser):
@@ -13,6 +13,8 @@ class LLSDBinaryParser(LLSDBaseParser):
 
     See http://wiki.secondlife.com/wiki/LLSD#Binary_Serialization
     """
+    __slots__ = ['_dispatch', '_keep_binary']
+
     def __init__(self):
         super(LLSDBinaryParser, self).__init__()
         # One way of dispatching based on the next character we see would be a
@@ -61,6 +63,10 @@ def parse(self, buffer, ignore_binary = False):
         :param ignore_binary: parser throws away data in llsd binary nodes.
         :returns: returns a python object.
         """
+        if PY2 and is_bytes(buffer):
+            # We need to wrap this in a helper class so that individual element
+            # access works the same as in PY3
+            buffer = PY3SemanticBytes(buffer)
         self._buffer = buffer
         self._index = 0
         self._keep_binary = not ignore_binary
diff --git a/llsd/serde_notation.py b/llsd/serde_notation.py
@@ -4,7 +4,7 @@
 import uuid
 
 from llsd.base import (_LLSD, B, LLSDBaseFormatter, LLSDBaseParser, LLSDParseError, LLSDSerializationError, UnicodeType,
-                       _format_datestr, _parse_datestr, _str_to_bytes, binary, uri)
+                       _format_datestr, _parse_datestr, _str_to_bytes, binary, uri, PY2, is_bytes, PY3SemanticBytes)
 
 _int_regex = re.compile(br"[-+]?\d+")
 _real_regex = re.compile(br"[-+]?(?:(\d+(\.\d*)?|\d*\.\d+)([eE][-+]?\d+)?)|[-+]?inf|[-+]?nan")
@@ -86,6 +86,11 @@ def parse(self, buffer, ignore_binary = False):
         if buffer == b"":
             return False
 
+        if PY2 and is_bytes(buffer):
+            # We need to wrap this in a helper class so that individual element
+            # access works the same as in PY3
+            buffer = PY3SemanticBytes(buffer)
+
         self._buffer = buffer
         self._index = 0
         return self._parse()
@@ -328,6 +333,8 @@ class LLSDNotationFormatter(LLSDBaseFormatter):
 
     See http://wiki.secondlife.com/wiki/LLSD#Notation_Serialization
     """
+    __slots__ = []
+
     def LLSD(self, v):
         return self._generate(v.thing)
     def UNDEF(self, v):
diff --git a/llsd/serde_xml.py b/llsd/serde_xml.py
@@ -36,6 +36,7 @@ class LLSDXMLFormatter(LLSDBaseFormatter):
     module level format_xml is the most convenient interface to this
     functionality.
     """
+    __slots__ = []
 
     def _elt(self, name, contents=None):
         "Serialize a single element."
diff --git a/tests/llsd_test.py b/tests/llsd_test.py
@@ -507,6 +507,21 @@ def testParseNotationIncorrectMIME(self):
         except llsd.LLSDParseError:
             pass
 
+    def testParseNotationUnterminatedString(self):
+        """
+        Test with an unterminated delimited string
+        """
+        self.assertRaises(llsd.LLSDParseError, self.llsd.parse, b"'foo")
+
+    def testParseNotationHexEscapeNoChars(self):
+        self.assertRaises(llsd.LLSDParseError, self.llsd.parse, b"'\\x")
+
+    def testParseNotationHalfTruncatedHex(self):
+        self.assertRaises(llsd.LLSDParseError, self.llsd.parse, b"'\\xf")
+
+    def testParseNotationInvalidHex(self):
+        self.assertRaises(llsd.LLSDParseError, self.llsd.parse, b"'\\xzz'")
+
 
 class LLSDBinaryUnitTest(unittest.TestCase):
     """