Skip to content

Commit eea1b9a

Browse files
authored
Merge pull request #1 from SaladDais/speed_up_notation_parse
Speed up parsing notation LLSD
2 parents f590a0b + 2325efb commit eea1b9a

File tree

5 files changed

+127
-57
lines changed

5 files changed

+127
-57
lines changed

llsd/base.py

Lines changed: 96 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,19 @@ def B(fmt):
141141
return fmt
142142

143143

144+
class PY3SemanticBytes(BytesType):
145+
"""Wrapper to make `buffer[n]` return an integer like in Py3"""
146+
__slots__ = []
147+
148+
def __getitem__(self, item):
149+
ret = super(PY3SemanticBytes, self).__getitem__(item)
150+
# `buffer[n]` should return an integer, but slice syntax like
151+
# `buffer[n:n+1]` should still return a `Bytes` object as before.
152+
if is_integer(item):
153+
return ord(ret)
154+
return ret
155+
156+
144157
def is_integer(o):
145158
""" portable test if an object is like an int """
146159
return isinstance(o, IntTypes)
@@ -321,19 +334,6 @@ def _to_python(node):
321334
return NODE_HANDLERS[node.tag](node)
322335

323336

324-
def _hex_as_nybble(hex):
325-
"Accepts a single hex character and returns a nybble."
326-
if (hex >= b'0') and (hex <= b'9'):
327-
return ord(hex) - ord(b'0')
328-
elif (hex >= b'a') and (hex <=b'f'):
329-
return 10 + ord(hex) - ord(b'a')
330-
elif (hex >= b'A') and (hex <=b'F'):
331-
return 10 + ord(hex) - ord(b'A')
332-
else:
333-
raise LLSDParseError('Invalid hex character: %s' % hex)
334-
335-
336-
337337
class LLSDBaseFormatter(object):
338338
"""
339339
This base class cannot be instantiated on its own: it assumes a subclass
@@ -366,13 +366,22 @@ def __init__(self):
366366
}
367367

368368

369+
_X_ORD = ord(b'x')
370+
_BACKSLASH_ORD = ord(b'\\')
371+
_DECODE_BUFF_ALLOC_SIZE = 1024
372+
373+
369374
class LLSDBaseParser(object):
370375
"""
371376
Utility methods useful for parser subclasses.
372377
"""
378+
__slots__ = ['_buffer', '_index', '_decode_buff']
379+
373380
def __init__(self):
374381
self._buffer = b''
375-
self._index = 0
382+
self._index = 0
383+
# Scratch space for decoding delimited strings
384+
self._decode_buff = bytearray(_DECODE_BUFF_ALLOC_SIZE)
376385

377386
def _error(self, message, offset=0):
378387
try:
@@ -399,53 +408,85 @@ def _getc(self, num=1):
399408

400409
# map char following escape char to corresponding character
401410
_escaped = {
402-
b'a': b'\a',
403-
b'b': b'\b',
404-
b'f': b'\f',
405-
b'n': b'\n',
406-
b'r': b'\r',
407-
b't': b'\t',
408-
b'v': b'\v',
411+
ord(b'a'): ord(b'\a'),
412+
ord(b'b'): ord(b'\b'),
413+
ord(b'f'): ord(b'\f'),
414+
ord(b'n'): ord(b'\n'),
415+
ord(b'r'): ord(b'\r'),
416+
ord(b't'): ord(b'\t'),
417+
ord(b'v'): ord(b'\v'),
409418
}
410419

411420
def _parse_string_delim(self, delim):
412421
"Parse a delimited string."
413-
parts = bytearray()
414-
found_escape = False
415-
found_hex = False
416-
found_digit = False
417-
byte = 0
422+
insert_idx = 0
423+
delim_ord = ord(delim)
424+
# Reuse the preallocated working buffer for the decoded string
425+
# output to avoid allocs in the hot loop.
426+
decode_buff = self._decode_buff
427+
# Cache these in locals, otherwise we have to perform a lookup on
428+
# `self` in the hot loop.
429+
buff = self._buffer
430+
read_idx = self._index
431+
cc = 0
418432
while True:
419-
cc = self._getc()
420-
if found_escape:
421-
if found_hex:
422-
if found_digit:
423-
found_escape = False
424-
found_hex = False
425-
found_digit = False
426-
byte <<= 4
427-
byte |= _hex_as_nybble(cc)
428-
parts.append(byte)
429-
byte = 0
433+
try:
434+
cc = buff[read_idx]
435+
read_idx += 1
436+
437+
if cc == _BACKSLASH_ORD:
438+
# Backslash, figure out if this is an \xNN hex escape or
439+
# something like \t
440+
cc = buff[read_idx]
441+
read_idx += 1
442+
if cc == _X_ORD:
443+
# It's a hex escape. char is the value of the two
444+
# following hex nybbles. This slice may result in
445+
# a short read (0 or 1 bytes). In the first case,
445+
# `int()` raises a `ValueError`; in the second, an
446+
# `IndexError` is raised by `buff[read_idx]` on the
447+
# next iteration of the loop.
449+
hex_bytes = buff[read_idx:read_idx + 2]
450+
read_idx += 2
451+
try:
452+
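# int() can parse a `bytes` containing hex directly;
453+
# no explicit `bytes.decode("ascii")` is required.
# NOTE(review): int() also accepts a sign or surrounding whitespace, so an escape like `\x-1` is not rejected here.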
454+
cc = int(hex_bytes, 16)
455+
except ValueError as e:
456+
# One of the hex characters was likely invalid.
457+
# Wrap the ValueError so that we can provide a
458+
# byte offset in the error.
459+
self._index = read_idx
460+
self._error(e, offset=-2)
430461
else:
431-
found_digit = True
432-
byte = _hex_as_nybble(cc)
433-
elif cc == b'x':
434-
found_hex = True
435-
else:
436-
found_escape = False
437-
# escape char preceding anything other than the chars in
438-
# _escaped just results in that same char without the
439-
# escape char
440-
parts.extend(self._escaped.get(cc, cc))
441-
elif cc == b'\\':
442-
found_escape = True
443-
elif cc == delim:
444-
break
445-
else:
446-
parts.extend(cc)
462+
# escape char preceding anything other than the chars
463+
# in _escaped just results in that same char without
464+
# the escape char
465+
cc = self._escaped.get(cc, cc)
466+
elif cc == delim_ord:
467+
break
468+
except IndexError:
469+
# We can be reasonably sure that any IndexErrors inside here
470+
# were caused by an out-of-bounds `buff[read_idx]`.
471+
self._index = read_idx
472+
self._error("Trying to read past end of buffer")
473+
474+
try:
475+
decode_buff[insert_idx] = cc
476+
except IndexError:
477+
# Oops, that overflowed the decoding buffer. Make a
478+
# new, expanded buffer containing the existing contents.
479+
decode_buff = bytearray(decode_buff)
480+
decode_buff.extend(b"\x00" * _DECODE_BUFF_ALLOC_SIZE)
481+
decode_buff[insert_idx] = cc
482+
483+
insert_idx += 1
484+
485+
# Sync our local read index with the canonical one
486+
self._index = read_idx
447487
try:
448-
return parts.decode('utf-8')
488+
# Slice off only what we used of the working decode buffer
489+
return decode_buff[:insert_idx].decode('utf-8')
449490
except UnicodeDecodeError as exc:
450491
self._error(exc)
451492

@@ -457,4 +498,4 @@ def starts_with(startstr, something):
457498
pos = something.tell()
458499
s = something.read(len(startstr))
459500
something.seek(pos, os.SEEK_SET)
460-
return (s == startstr)
501+
return (s == startstr)

llsd/serde_binary.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import uuid
55

66
from llsd.base import (_LLSD, LLSDBaseParser, LLSDSerializationError, _str_to_bytes, binary, is_integer, is_string,
7-
starts_with, uri)
7+
starts_with, uri, PY2, is_bytes, PY3SemanticBytes)
88

99

1010
class LLSDBinaryParser(LLSDBaseParser):
@@ -13,6 +13,8 @@ class LLSDBinaryParser(LLSDBaseParser):
1313
1414
See http://wiki.secondlife.com/wiki/LLSD#Binary_Serialization
1515
"""
16+
__slots__ = ['_dispatch', '_keep_binary']
17+
1618
def __init__(self):
1719
super(LLSDBinaryParser, self).__init__()
1820
# One way of dispatching based on the next character we see would be a
@@ -61,6 +63,10 @@ def parse(self, buffer, ignore_binary = False):
6163
:param ignore_binary: parser throws away data in llsd binary nodes.
6264
:returns: returns a python object.
6365
"""
66+
if PY2 and is_bytes(buffer):
67+
# We need to wrap this in a helper class so that individual element
68+
# access works the same as in PY3
69+
buffer = PY3SemanticBytes(buffer)
6470
self._buffer = buffer
6571
self._index = 0
6672
self._keep_binary = not ignore_binary

llsd/serde_notation.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import uuid
55

66
from llsd.base import (_LLSD, B, LLSDBaseFormatter, LLSDBaseParser, LLSDParseError, LLSDSerializationError, UnicodeType,
7-
_format_datestr, _parse_datestr, _str_to_bytes, binary, uri)
7+
_format_datestr, _parse_datestr, _str_to_bytes, binary, uri, PY2, is_bytes, PY3SemanticBytes)
88

99
_int_regex = re.compile(br"[-+]?\d+")
1010
_real_regex = re.compile(br"[-+]?(?:(\d+(\.\d*)?|\d*\.\d+)([eE][-+]?\d+)?)|[-+]?inf|[-+]?nan")
@@ -86,6 +86,11 @@ def parse(self, buffer, ignore_binary = False):
8686
if buffer == b"":
8787
return False
8888

89+
if PY2 and is_bytes(buffer):
90+
# We need to wrap this in a helper class so that individual element
91+
# access works the same as in PY3
92+
buffer = PY3SemanticBytes(buffer)
93+
8994
self._buffer = buffer
9095
self._index = 0
9196
return self._parse()
@@ -328,6 +333,8 @@ class LLSDNotationFormatter(LLSDBaseFormatter):
328333
329334
See http://wiki.secondlife.com/wiki/LLSD#Notation_Serialization
330335
"""
336+
__slots__ = []
337+
331338
def LLSD(self, v):
332339
return self._generate(v.thing)
333340
def UNDEF(self, v):

llsd/serde_xml.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ class LLSDXMLFormatter(LLSDBaseFormatter):
3636
module level format_xml is the most convenient interface to this
3737
functionality.
3838
"""
39+
__slots__ = []
3940

4041
def _elt(self, name, contents=None):
4142
"Serialize a single element."

tests/llsd_test.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,21 @@ def testParseNotationIncorrectMIME(self):
507507
except llsd.LLSDParseError:
508508
pass
509509

510+
def testParseNotationUnterminatedString(self):
511+
"""
512+
Test with an unterminated delimited string
513+
"""
514+
self.assertRaises(llsd.LLSDParseError, self.llsd.parse, b"'foo")
515+
516+
def testParseNotationHexEscapeNoChars(self):
517+
self.assertRaises(llsd.LLSDParseError, self.llsd.parse, b"'\\x")
518+
519+
def testParseNotationHalfTruncatedHex(self):
520+
self.assertRaises(llsd.LLSDParseError, self.llsd.parse, b"'\\xf")
521+
522+
def testParseNotationInvalidHex(self):
523+
self.assertRaises(llsd.LLSDParseError, self.llsd.parse, b"'\\xzz'")
524+
510525

511526
class LLSDBinaryUnitTest(unittest.TestCase):
512527
"""

0 commit comments

Comments
 (0)