@@ -141,6 +141,19 @@ def B(fmt):
141141 return fmt
142142
143143
class PY3SemanticBytes(BytesType):
    """`BytesType` subclass whose integer indexing yields an int, as in Py3.

    Any other kind of subscript (for example a slice) returns whatever the
    base class returns, unchanged.
    """
    __slots__ = []

    def __getitem__(self, item):
        fetched = super(PY3SemanticBytes, self).__getitem__(item)
        # Slice syntax like `buffer[n:n+1]` passes through untouched; only a
        # plain integer index is converted to the ordinal of the fetched byte.
        return ord(fetched) if is_integer(item) else fetched
155+
156+
def is_integer(o):
    """Portable check: return True when `o` is an instance of `IntTypes`."""
    int_like = isinstance(o, IntTypes)
    return int_like
@@ -321,19 +334,6 @@ def _to_python(node):
321334 return NODE_HANDLERS [node .tag ](node )
322335
323336
324- def _hex_as_nybble (hex ):
325- "Accepts a single hex character and returns a nybble."
326- if (hex >= b'0' ) and (hex <= b'9' ):
327- return ord (hex ) - ord (b'0' )
328- elif (hex >= b'a' ) and (hex <= b'f' ):
329- return 10 + ord (hex ) - ord (b'a' )
330- elif (hex >= b'A' ) and (hex <= b'F' ):
331- return 10 + ord (hex ) - ord (b'A' )
332- else :
333- raise LLSDParseError ('Invalid hex character: %s' % hex )
334-
335-
336-
337337class LLSDBaseFormatter (object ):
338338 """
339339 This base class cannot be instantiated on its own: it assumes a subclass
@@ -366,13 +366,22 @@ def __init__(self):
366366 }
367367
368368
369+ _X_ORD = ord (b'x' )
370+ _BACKSLASH_ORD = ord (b'\\ ' )
371+ _DECODE_BUFF_ALLOC_SIZE = 1024
372+
373+
369374class LLSDBaseParser (object ):
370375 """
371376 Utility methods useful for parser subclasses.
372377 """
378+ __slots__ = ['_buffer' , '_index' , '_decode_buff' ]
379+
def __init__(self):
    """Start with an empty input buffer and the read index at zero."""
    # Scratch space for decoding delimited strings
    self._decode_buff = bytearray(_DECODE_BUFF_ALLOC_SIZE)
    self._index = 0
    self._buffer = b''
376385
377386 def _error (self , message , offset = 0 ):
378387 try :
@@ -399,53 +408,85 @@ def _getc(self, num=1):
399408
400409 # map char following escape char to corresponding character
401410 _escaped = {
402- b'a' : b'\a ' ,
403- b'b' : b'\b ' ,
404- b'f' : b'\f ' ,
405- b'n' : b'\n ' ,
406- b'r' : b'\r ' ,
407- b't' : b'\t ' ,
408- b'v' : b'\v ' ,
411+ ord ( b'a' ): ord ( b'\a ' ) ,
412+ ord ( b'b' ): ord ( b'\b ' ) ,
413+ ord ( b'f' ): ord ( b'\f ' ) ,
414+ ord ( b'n' ): ord ( b'\n ' ) ,
415+ ord ( b'r' ): ord ( b'\r ' ) ,
416+ ord ( b't' ): ord ( b'\t ' ) ,
417+ ord ( b'v' ): ord ( b'\v ' ) ,
409418 }
410419
def _parse_string_delim(self, delim):
    """Parse a delimited string.

    Reads ``self._buffer`` from ``self._index`` up to the first unescaped
    ``delim`` byte, resolving backslash escapes on the way, and leaves
    ``self._index`` just past that delimiter.

    :param delim: the single byte that terminates the string.
    :returns: the unescaped content decoded as UTF-8.

    Malformed input (the buffer ending before the delimiter, a bad ``\\xNN``
    escape, or content that is not valid UTF-8) is reported through
    ``self._error``.
    """
    insert_idx = 0
    delim_ord = ord(delim)
    # Start in the preallocated scratch buffer so that typical strings need
    # no allocation inside the hot loop.
    decode_buff = self._decode_buff
    # Cache these in locals, otherwise we have to perform a lookup on
    # `self` in the hot loop.
    buff = self._buffer
    read_idx = self._index
    escaped = self._escaped
    cc = 0
    while True:
        try:
            # NOTE(review): the integer comparisons below assume indexing the
            # buffer yields an int (Py3 `bytes` semantics) -- confirm callers.
            cc = buff[read_idx]
            read_idx += 1

            if cc == _BACKSLASH_ORD:
                # Backslash, figure out if this is an \xNN hex escape or
                # something like \t
                cc = buff[read_idx]
                read_idx += 1
                if cc == _X_ORD:
                    # It's a hex escape: the byte value is given by the two
                    # following hex nybbles. Slicing never raises, so a
                    # short read at the end of the buffer shows up as a
                    # slice shorter than two bytes.
                    hex_bytes = buff[read_idx:read_idx + 2]
                    read_idx += 2
                    try:
                        # int() on its own also accepts signs and
                        # surrounding whitespace ("+f", " f", "-1") and
                        # a lone digit, none of which are valid escapes,
                        # so insist on exactly two alphanumeric bytes
                        # first. int() then rejects non-hex letters.
                        if len(hex_bytes) != 2 or not hex_bytes.isalnum():
                            raise ValueError(
                                "invalid \\x escape: %r" % (hex_bytes,))
                        cc = int(hex_bytes, 16)
                    except ValueError as e:
                        # Wrap the ValueError so that we can provide a
                        # byte offset in the error.
                        self._index = read_idx
                        self._error(e, offset=-2)
                else:
                    # escape char preceding anything other than the chars
                    # in _escaped just results in that same char without
                    # the escape char
                    cc = escaped.get(cc, cc)
            elif cc == delim_ord:
                break
        except IndexError:
            # We can be reasonably sure that any IndexErrors inside here
            # were caused by an out-of-bounds `buff[read_idx]`.
            self._index = read_idx
            self._error("Trying to read past end of buffer")

        try:
            decode_buff[insert_idx] = cc
        except IndexError:
            # The decoding buffer is full. Copy it once so the shared
            # scratch buffer on `self` keeps its original size, then grow
            # the private copy in place on any later overflow instead of
            # re-copying everything each time.
            if decode_buff is self._decode_buff:
                decode_buff = bytearray(decode_buff)
            decode_buff.extend(bytearray(_DECODE_BUFF_ALLOC_SIZE))
            decode_buff[insert_idx] = cc

        insert_idx += 1

    # Sync our local read index with the canonical one
    self._index = read_idx
    try:
        # Slice off only what we used of the working decode buffer
        return decode_buff[:insert_idx].decode('utf-8')
    except UnicodeDecodeError as exc:
        self._error(exc)
@@ -457,4 +498,4 @@ def starts_with(startstr, something):
457498 pos = something .tell ()
458499 s = something .read (len (startstr ))
459500 something .seek (pos , os .SEEK_SET )
460- return (s == startstr )
501+ return (s == startstr )
0 commit comments