@@ -2,6 +2,7 @@
 import base64
 import binascii
 import datetime
+import io
 import os
 import re
 import sys
@@ -24,6 +25,10 @@
 BINARY_MIME_TYPE = 'application/llsd+binary'
 NOTATION_MIME_TYPE = 'application/llsd+notation'
 
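+# Header sentinels that may prefix serialized LLSD in each encoding. (They are
+# presumably meant to be matched with matchseq()/starts_with() below, which
+# treat each space in the pattern as "zero or more whitespace" and compare the
+# remaining bytes case-insensitively.)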
+XML_HEADER = b'<? llsd/xml ?>'
+BINARY_HEADER = b'<? llsd/binary ?>'
+NOTATION_HEADER = b'<? llsd/notation ?>'
+
 ALL_CHARS = str(bytearray(range(256))) if PY2 else bytes(range(256))
 
 
@@ -78,12 +83,6 @@ class LLSDSerializationError(TypeError):
 except NameError:
     UnicodeType = str
 
-# can't just check for NameError: 'bytes' is defined in both Python 2 and 3
-if PY2:
-    BytesType = str
-else:
-    BytesType = bytes
-
 try:
     b'%s' % (b'yes',)
 except TypeError:
@@ -141,19 +140,6 @@ def B(fmt):
     return fmt
 
 
-class PY3SemanticBytes(BytesType):
-    """Wrapper to make `buffer[n]` return an integer like in Py3"""
-    __slots__ = []
-
-    def __getitem__(self, item):
-        ret = super(PY3SemanticBytes, self).__getitem__(item)
-        # `buffer[n]` should return an integer, but slice syntax like
-        # `buffer[n:n+1]` should still return a `Bytes` object as before.
-        if is_integer(item):
-            return ord(ret)
-        return ret
-
-
 def is_integer(o):
     """ portable test if an object is like an int """
     return isinstance(o, IntTypes)
@@ -169,11 +155,6 @@ def is_string(o):
     return isinstance(o, StringTypes)
 
 
-def is_bytes(o):
-    """ portable check if an object is an immutable byte array """
-    return isinstance(o, BytesType)
-
-
 #date: d"YYYY-MM-DDTHH:MM:SS.FFFFFFZ"
 _date_regex = re.compile(r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T"
                          r"(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})"
@@ -375,36 +356,136 @@ class LLSDBaseParser(object):
     """
     Utility methods useful for parser subclasses.
     """
-    __slots__ = ['_buffer', '_index', '_decode_buff']
+    __slots__ = ['_stream', '_decode_buff']
 
-    def __init__(self):
-        self._buffer = b''
-        self._index = 0
+    def __init__(self, something=b''):
+        self._reset(something)
         # Scratch space for decoding delimited strings
         self._decode_buff = bytearray(_DECODE_BUFF_ALLOC_SIZE)
 
-    def _error(self, message, offset=0):
+    def _reset(self, something):
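+        # Whatever we are passed -- bytes, another LLSDBaseParser, or a
+        # readable binary stream -- end up with a peek()-capable stream in
+        # self._stream.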
+        if isinstance(something, LLSDBaseParser):
+            # When passed an existing LLSDBaseParser (subclass) instance, just
+            # borrow its existing _stream.
+            self._stream = something._stream
+        elif isinstance(something, bytes):
+            # Wrap an incoming bytes string into a stream. If the passed bytes
+            # string is so large that the overhead of copying it into a
+            # BytesIO is significant, advise caller to pass a stream instead.
+            # BytesIO has no peek() method, so wrap it in BufferedReader.
+            self._stream = io.BufferedReader(io.BytesIO(something))
+        elif hasattr(something, 'peek'):
+            # 'something' is already a buffered stream, use directly
+            self._stream = something
+        else:
+            # 'something' isn't buffered, wrap in BufferedReader
+            # (let BufferedReader handle the problem of passing an
+            # inappropriate object)
+            self._stream = io.BufferedReader(something)
+
+    def starts_with(self, pattern):
+        """
+        Like matchseq(), except that starts_with() doesn't consume what it
+        matches: it always resets our input stream to its previous position.
+        """
+        oldpos = self._stream.tell()
         try:
-            byte = self._buffer[self._index + offset]
-        except IndexError:
-            byte = None
-        raise LLSDParseError("%s at byte %d: %s" % (message, self._index + offset, byte))
+            return self.matchseq(pattern)
+        finally:
+            self._stream.seek(oldpos)
 
-    def _peek(self, num=1):
+    def matchseq(self, pattern):
+        """
+        Match bytes object 'pattern' after skipping arbitrary leading
+        whitespace. After successfully matching 'pattern', skip trailing
+        whitespace as well.
+
+        'pattern' is NOT a regular expression, but a bytes string in which
+        each space character matches zero or more whitespace characters in the
+        stream. Non-space characters are matched case-insensitively.
+
+        If 'pattern' matches, return True and leave our input stream advanced
+        past the last byte examined.
+
+        If 'pattern' does not match, return False and reset our input stream
+        to its previous read position.
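+
+        For example, matchseq(b'<? llsd/xml ?>') matches input starting with
+        b'<?llsd/xml?>' or b'  <?  LLSD/XML  ?>': each space in the pattern
+        matches zero or more whitespace bytes, and the remaining bytes compare
+        case-insensitively.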
+        """
+        oldpos = self._stream.tell()
+        for chunk in pattern.split():
+            # skip leading space before this chunk
+            c = self._next_nonblank()
+            # if we hit EOF, no match
+            if not c:
+                self._stream.seek(oldpos)
+                return False
+            # not EOF: try to match non-empty chunk,
+            # not forgetting that 'c' is a lookahead byte
+            # (split() never produces a zero-length chunk)
+            maybe = c + self._stream.read(len(chunk)-1)
+            if maybe.lower() != chunk.lower():
+                # mismatch, reset
+                self._stream.seek(oldpos)
+                return False
+            # so far so good, back for next chunk
+
+        # here we've matched every chunk, with the read pointer just at the end
+        # of the last matched chunk -- skip trailing space
+        if self._next_nonblank():
+            # back up one character, i.e. put back the nonblank
+            self._stream.seek(-1, io.SEEK_CUR)
+        # success!
+        return True
+
+    def remainder(self):
+        # return a stream object representing the parse input (from last
+        # _reset() call), whose read position is set past scanned input
+        return self._stream
+
+    def _next_nonblank(self):
+        # we directly call read() rather than getc() because our caller is
+        # prepared to handle empty string, meaning EOF
+        # (YES we want the walrus operator)
+        c = self._stream.read(1)
+        while c.isspace():
+            c = self._stream.read(1)
+        return c
+
+    def _getc(self, num=1):
+        got = self._stream.read(num)
+        if len(got) < num:
+            self._error("Trying to read past end of stream")
+        return got
+
+    def _peek(self, num=1, full=True):
+        # full=True means error if we can't peek ahead num bytes
         if num < 0:
             # There aren't many ways this can happen. The likeliest is that
             # we've just read garbage length bytes from a binary input string.
             # We happen to know that lengths are encoded as 4 bytes, so back
             # off by 4 bytes to try to point the user at the right spot.
             self._error("Invalid length field %d" % num, -4)
-        if self._index + num > len(self._buffer):
-            self._error("Trying to read past end of buffer")
-        return self._buffer[self._index:self._index + num]
 
-    def _getc(self, num=1):
-        chars = self._peek(num)
-        self._index += num
-        return chars
+        got = self._stream.peek(num)
+        if full and len(got) < num:
+            # Going right to this error is a little iffy:
+            # BufferedReader.peek() does not promise to return the requested
+            # length, but does not clarify the conditions under which it
+            # returns fewer bytes. If this is an actual problem, we could loop
+            # until we have the requested length or EOF -- but the loop must
+            # explicitly seek() past already-peeked data, then reset after.
+            # https://docs.python.org/3/library/io.html#io.BufferedReader.peek
+            self._error("Trying to peek past end of stream")
+
+        # Interestingly, peek() can also return MORE than requested -- but for
+        # our purposes (e.g. ord(peek(1))) it's important to constrain it.
+        return got[:num]
+
+    def _error(self, message, offset=0):
+        oldpos = self._stream.tell()
+        # 'offset' is relative to current pos
+        self._stream.seek(offset, io.SEEK_CUR)
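+        # _peek(1, full=False) below shows the byte at the error position (or
+        # b'' at EOF) without risking a recursive _error() on a short peek.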
+        raise LLSDParseError("%s at byte %d: %r" %
+                             (message, oldpos + offset, self._peek(1, full=False)))
 
     # map char following escape char to corresponding character
     _escaped = {
@@ -424,30 +505,26 @@ def _parse_string_delim(self, delim):
         # Preallocate a working buffer for the decoded string output
         # to avoid allocs in the hot loop.
         decode_buff = self._decode_buff
-        # Cache these in locals, otherwise we have to perform a lookup on
+        # Cache this in locals, otherwise we have to perform a lookup on
         # `self` in the hot loop.
-        buff = self._buffer
-        read_idx = self._index
+        getc = self._getc
         cc = 0
         while True:
             try:
-                cc = buff[read_idx]
-                read_idx += 1
+                cc = ord(getc())
 
                 if cc == _BACKSLASH_ORD:
                     # Backslash, figure out if this is an \xNN hex escape or
                     # something like \t
-                    cc = buff[read_idx]
-                    read_idx += 1
+                    cc = ord(getc())
                     if cc == _X_ORD:
                         # It's a hex escape. char is the value of the two
                         # following hex nybbles. This slice may result in
                         # a short read (0 or 1 bytes), but either a
                         # `ValueError` will be triggered by the first case,
                         # and the second will cause an `IndexError` on the
                         # next iteration of the loop.
-                        hex_bytes = buff[read_idx:read_idx + 2]
-                        read_idx += 2
+                        hex_bytes = getc(2)
                         try:
                             # int() can parse a `bytes` containing hex,
                             # no explicit `bytes.decode("ascii")` required.
@@ -456,7 +533,6 @@ def _parse_string_delim(self, delim):
                             # One of the hex characters was likely invalid.
                             # Wrap the ValueError so that we can provide a
                             # byte offset in the error.
-                            self._index = read_idx
                             self._error(e, offset=-2)
                     else:
                         # escape char preceding anything other than the chars
@@ -468,7 +544,6 @@ def _parse_string_delim(self, delim):
             except IndexError:
                 # We can be reasonably sure that any IndexErrors inside here
                 # were caused by an out-of-bounds `buff[read_idx]`.
-                self._index = read_idx
                 self._error("Trying to read past end of buffer")
 
             try:
@@ -483,19 +558,8 @@ def _parse_string_delim(self, delim):
             insert_idx += 1
 
         # Sync our local read index with the canonical one
-        self._index = read_idx
         try:
             # Slice off only what we used of the working decode buffer
             return decode_buff[:insert_idx].decode('utf-8')
         except UnicodeDecodeError as exc:
             self._error(exc)
-
-
-def starts_with(startstr, something):
-    if hasattr(something, 'startswith'):
-        return something.startswith(startstr)
-    else:
-        pos = something.tell()
-        s = something.read(len(startstr))
-        something.seek(pos, os.SEEK_SET)
-        return (s == startstr)
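A minimal usage sketch of the new stream-oriented base class (illustrative
only: it assumes the module path llsd.base and uses just the names shown in
this diff; real parsing is done by the parser subclasses, not shown here):

    from llsd.base import LLSDBaseParser, XML_HEADER

    # bytes input is wrapped internally in BufferedReader(BytesIO(...))
    parser = LLSDBaseParser(b'<? llsd/xml ?> <llsd>...</llsd>')

    parser.starts_with(XML_HEADER)   # True; read position is restored
    parser.matchseq(XML_HEADER)      # True; consumes header plus whitespace
    rest = parser.remainder()        # stream now positioned at b'<llsd>...'
    rest.read(6)                     # b'<llsd>'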