Skip to content

Commit db460d6

Browse files
Merge pull request #4 from secondlife/sl-18330
SL-18330: Support parsing direct from stream; improve C++ compatibility.
2 parents 10992b7 + cbb89ca commit db460d6

File tree

7 files changed

+310
-152
lines changed

7 files changed

+310
-152
lines changed

llsd/__init__.py

Lines changed: 36 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,40 +7,55 @@
77
88
http://wiki.secondlife.com/wiki/LLSD
99
"""
10-
from llsd.base import (_LLSD, BINARY_MIME_TYPE, NOTATION_MIME_TYPE, XML_MIME_TYPE, LLSDParseError,
11-
LLSDSerializationError, LongType, UnicodeType, binary, starts_with, undef, uri)
12-
from llsd.serde_binary import LLSDBinaryParser, format_binary, parse_binary
13-
from llsd.serde_notation import LLSDNotationFormatter, LLSDNotationParser, format_notation, parse_notation
14-
from llsd.serde_xml import LLSDXMLFormatter, LLSDXMLPrettyFormatter, format_pretty_xml, format_xml, parse_xml
10+
from llsd.base import (_LLSD, BINARY_MIME_TYPE, NOTATION_MIME_TYPE, XML_MIME_TYPE,
11+
BINARY_HEADER, NOTATION_HEADER, XML_HEADER,
12+
LLSDBaseParser, LLSDParseError, LLSDSerializationError,
13+
LongType, UnicodeType, binary, undef, uri)
14+
from llsd.serde_binary import LLSDBinaryParser, format_binary, parse_binary, parse_binary_nohdr
15+
from llsd.serde_notation import LLSDNotationFormatter, LLSDNotationParser, format_notation, parse_notation, parse_notation_nohdr
16+
from llsd.serde_xml import LLSDXMLFormatter, LLSDXMLPrettyFormatter, format_pretty_xml, format_xml, parse_xml, parse_xml_nohdr
1517

1618

1719
def parse(something, mime_type = None):
1820
"""
1921
This is the basic public interface for parsing llsd.
2022
21-
:param something: The data to parse. This is expected to be bytes, not strings
23+
:param something: The data to parse. This is expected to be either bytes
24+
(not a text string) or a byte stream.
2225
:param mime_type: The mime_type of the data if it is known.
2326
:returns: Returns a python object.
2427
2528
Python 3 Note: when reading LLSD from a file, use open()'s 'rb' mode explicitly
2629
"""
27-
if mime_type in (XML_MIME_TYPE, 'application/llsd'):
28-
return parse_xml(something)
29-
elif mime_type == BINARY_MIME_TYPE:
30-
return parse_binary(something)
31-
elif mime_type == NOTATION_MIME_TYPE:
32-
return parse_notation(something)
33-
#elif content_type == 'application/json':
34-
# return parse_notation(something)
3530
try:
36-
something = something.lstrip() #remove any pre-trailing whitespace
37-
if starts_with(b'<?llsd/binary?>', something):
38-
return parse_binary(something)
39-
# This should be better.
40-
elif starts_with(b'<', something):
41-
return parse_xml(something)
31+
if mime_type:
32+
# explicit mime_type -- 'something' may or may not also have a header
33+
for mime_types, parser in (
34+
({XML_MIME_TYPE, 'application/llsd'}, parse_xml),
35+
({BINARY_MIME_TYPE}, parse_binary),
36+
({NOTATION_MIME_TYPE}, parse_notation),
37+
## ({'application/json'}, parse_notation),
38+
):
39+
if mime_type.lower() in mime_types:
40+
return parser(something)
41+
42+
# no recognized mime type, look for header
43+
baseparser = LLSDBaseParser(something)
44+
for pattern, parser in (
45+
(BINARY_HEADER, parse_binary_nohdr),
46+
(NOTATION_HEADER, parse_notation_nohdr),
47+
(XML_HEADER, parse_xml_nohdr),
48+
):
49+
if baseparser.matchseq(pattern):
50+
# we already saw the header, don't check again
51+
return parser(baseparser)
52+
53+
# no recognized header -- does content resemble XML?
54+
if baseparser.starts_with(b'<'):
55+
return parse_xml_nohdr(baseparser)
4256
else:
43-
return parse_notation(something)
57+
return parse_notation_nohdr(baseparser)
58+
4459
except KeyError as e:
4560
raise LLSDParseError('LLSD could not be parsed: %s' % (e,))
4661
except TypeError as e:

llsd/base.py

Lines changed: 127 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import base64
33
import binascii
44
import datetime
5+
import io
56
import os
67
import re
78
import sys
@@ -24,6 +25,10 @@
2425
BINARY_MIME_TYPE = 'application/llsd+binary'
2526
NOTATION_MIME_TYPE = 'application/llsd+notation'
2627

28+
XML_HEADER = b'<? llsd/xml ?>'
29+
BINARY_HEADER = b'<? llsd/binary ?>'
30+
NOTATION_HEADER = b'<? llsd/notation ?>'
31+
2732
ALL_CHARS = str(bytearray(range(256))) if PY2 else bytes(range(256))
2833

2934

@@ -78,12 +83,6 @@ class LLSDSerializationError(TypeError):
7883
except NameError:
7984
UnicodeType = str
8085

81-
# can't just check for NameError: 'bytes' is defined in both Python 2 and 3
82-
if PY2:
83-
BytesType = str
84-
else:
85-
BytesType = bytes
86-
8786
try:
8887
b'%s' % (b'yes',)
8988
except TypeError:
@@ -141,19 +140,6 @@ def B(fmt):
141140
return fmt
142141

143142

144-
class PY3SemanticBytes(BytesType):
145-
"""Wrapper to make `buffer[n]` return an integer like in Py3"""
146-
__slots__ = []
147-
148-
def __getitem__(self, item):
149-
ret = super(PY3SemanticBytes, self).__getitem__(item)
150-
# `buffer[n]` should return an integer, but slice syntax like
151-
# `buffer[n:n+1]` should still return a `Bytes` object as before.
152-
if is_integer(item):
153-
return ord(ret)
154-
return ret
155-
156-
157143
def is_integer(o):
158144
""" portable test if an object is like an int """
159145
return isinstance(o, IntTypes)
@@ -169,11 +155,6 @@ def is_string(o):
169155
return isinstance(o, StringTypes)
170156

171157

172-
def is_bytes(o):
173-
""" portable check if an object is an immutable byte array """
174-
return isinstance(o, BytesType)
175-
176-
177158
#date: d"YYYY-MM-DDTHH:MM:SS.FFFFFFZ"
178159
_date_regex = re.compile(r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T"
179160
r"(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})"
@@ -375,36 +356,136 @@ class LLSDBaseParser(object):
375356
"""
376357
Utility methods useful for parser subclasses.
377358
"""
378-
__slots__ = ['_buffer', '_index', '_decode_buff']
359+
__slots__ = ['_stream', '_decode_buff']
379360

380-
def __init__(self):
381-
self._buffer = b''
382-
self._index = 0
361+
def __init__(self, something=b''):
362+
self._reset(something)
383363
# Scratch space for decoding delimited strings
384364
self._decode_buff = bytearray(_DECODE_BUFF_ALLOC_SIZE)
385365

386-
def _error(self, message, offset=0):
366+
def _reset(self, something):
367+
if isinstance(something, LLSDBaseParser):
368+
# When passed an existing LLSDBaseParser (subclass) instance, just
369+
# borrow its existing _stream.
370+
self._stream = something._stream
371+
elif isinstance(something, bytes):
372+
# Wrap an incoming bytes string into a stream. If the passed bytes
373+
# string is so large that the overhead of copying it into a
374+
# BytesIO is significant, advise caller to pass a stream instead.
375+
# BytesIO has no peek() method, so wrap it in BufferedReader.
376+
self._stream = io.BufferedReader(io.BytesIO(something))
377+
elif hasattr(something, 'peek'):
378+
# 'something' is already a buffered stream, use directly
379+
self._stream = something
380+
else:
381+
# 'something' isn't buffered, wrap in BufferedReader
382+
# (let BufferedReader handle the problem of passing an
383+
# inappropriate object)
384+
self._stream = io.BufferedReader(something)
385+
386+
def starts_with(self, pattern):
387+
"""
388+
Like matchseq(), except that starts_with() doesn't consume what it
389+
matches: it always resets our input stream to its previous position.
390+
"""
391+
oldpos = self._stream.tell()
387392
try:
388-
byte = self._buffer[self._index+offset]
389-
except IndexError:
390-
byte = None
391-
raise LLSDParseError("%s at byte %d: %s" % (message, self._index+offset, byte))
393+
return self.matchseq(pattern)
394+
finally:
395+
self._stream.seek(oldpos)
392396

393-
def _peek(self, num=1):
397+
def matchseq(self, pattern):
398+
"""
399+
Match bytes object 'pattern' after skipping arbitrary leading
400+
whitespace. After successfully matching 'pattern', skip trailing
401+
whitespace as well.
402+
403+
'pattern' is NOT a regular expression, but a bytes string in which
404+
each space character matches zero or more whitespace characters in the
405+
stream. Non-space characters are matched case-insensitively.
406+
407+
If 'pattern' matches, return True and leave our input stream advanced
408+
past the last byte examined.
409+
410+
If 'pattern' does not match, return False and reset our input stream
411+
to its previous read position.
412+
"""
413+
oldpos = self._stream.tell()
414+
for chunk in pattern.split():
415+
# skip leading space before this chunk
416+
c = self._next_nonblank()
417+
# if we hit EOF, no match
418+
if not c:
419+
self._stream.seek(oldpos)
420+
return False
421+
# not EOF: try to match non-empty chunk,
422+
# not forgetting that 'c' is a lookahead byte
423+
# (split() never produces a zero-length chunk)
424+
maybe = c + self._stream.read(len(chunk)-1)
425+
if maybe.lower() != chunk.lower():
426+
# mismatch, reset
427+
self._stream.seek(oldpos)
428+
return False
429+
# so far so good, back for next chunk
430+
431+
# here we've matched every chunk, with the read pointer just at the end of
432+
# the last matched chunk -- skip trailing space
433+
if self._next_nonblank():
434+
# back up one character, i.e. put back the nonblank
435+
self._stream.seek(-1, io.SEEK_CUR)
436+
# success!
437+
return True
438+
439+
def remainder(self):
440+
# return a stream object representing the parse input (from last
441+
# _reset() call), whose read position is set past scanned input
442+
return self._stream
443+
444+
def _next_nonblank(self):
445+
# we directly call read() rather than _getc() because our caller is
446+
# prepared to handle empty string, meaning EOF
447+
# (YES we want the walrus operator)
448+
c = self._stream.read(1)
449+
while c.isspace():
450+
c = self._stream.read(1)
451+
return c
452+
453+
def _getc(self, num=1):
454+
got = self._stream.read(num)
455+
if len(got) < num:
456+
self._error("Trying to read past end of stream")
457+
return got
458+
459+
def _peek(self, num=1, full=True):
460+
# full=True means error if we can't peek ahead num bytes
394461
if num < 0:
395462
# There aren't many ways this can happen. The likeliest is that
396463
# we've just read garbage length bytes from a binary input string.
397464
# We happen to know that lengths are encoded as 4 bytes, so back
398465
# off by 4 bytes to try to point the user at the right spot.
399466
self._error("Invalid length field %d" % num, -4)
400-
if self._index + num > len(self._buffer):
401-
self._error("Trying to read past end of buffer")
402-
return self._buffer[self._index:self._index + num]
403467

404-
def _getc(self, num=1):
405-
chars = self._peek(num)
406-
self._index += num
407-
return chars
468+
got = self._stream.peek(num)
469+
if full and len(got) < num:
470+
# Going right to this error is a little iffy:
471+
# BufferedReader.peek() does not promise to return the requested
472+
# length, but does not clarify the conditions under which it
473+
# returns fewer bytes. If this is an actual problem, we could loop
474+
# until we have the requested length or EOF -- but the loop must
475+
# explicitly seek() past already-peeked data, then reset after.
476+
# https://docs.python.org/3/library/io.html#io.BufferedReader.peek
477+
self._error("Trying to peek past end of stream")
478+
479+
# Interestingly, peek() can also return MORE than requested -- but for
480+
# our purposes (e.g. ord(peek(1))) it's important to constrain it.
481+
return got[:num]
482+
483+
def _error(self, message, offset=0):
484+
oldpos = self._stream.tell()
485+
# 'offset' is relative to current pos
486+
self._stream.seek(offset, io.SEEK_CUR)
487+
raise LLSDParseError("%s at byte %d: %r" %
488+
(message, oldpos+offset, self._peek(1, full=False)))
408489

409490
# map char following escape char to corresponding character
410491
_escaped = {
@@ -424,30 +505,26 @@ def _parse_string_delim(self, delim):
424505
# Preallocate a working buffer for the decoded string output
425506
# to avoid allocs in the hot loop.
426507
decode_buff = self._decode_buff
427-
# Cache these in locals, otherwise we have to perform a lookup on
508+
# Cache this in locals, otherwise we have to perform a lookup on
428509
# `self` in the hot loop.
429-
buff = self._buffer
430-
read_idx = self._index
510+
getc = self._getc
431511
cc = 0
432512
while True:
433513
try:
434-
cc = buff[read_idx]
435-
read_idx += 1
514+
cc = ord(getc())
436515

437516
if cc == _BACKSLASH_ORD:
438517
# Backslash, figure out if this is an \xNN hex escape or
439518
# something like \t
440-
cc = buff[read_idx]
441-
read_idx += 1
519+
cc = ord(getc())
442520
if cc == _X_ORD:
443521
# It's a hex escape. char is the value of the two
444522
# following hex nybbles. A short read (0 or 1 bytes)
445523
# never reaches the int() conversion below: _getc(2)
446524
# reports it through _error(), which raises
447525
# LLSDParseError ("Trying to read past end of
448526
# stream").
449-
hex_bytes = buff[read_idx:read_idx + 2]
450-
read_idx += 2
527+
hex_bytes = getc(2)
451528
try:
452529
# int() can parse a `bytes` containing hex,
453530
# no explicit `bytes.decode("ascii")` required.
@@ -456,7 +533,6 @@ def _parse_string_delim(self, delim):
456533
# One of the hex characters was likely invalid.
457534
# Wrap the ValueError so that we can provide a
458535
# byte offset in the error.
459-
self._index = read_idx
460536
self._error(e, offset=-2)
461537
else:
462538
# escape char preceding anything other than the chars
@@ -468,7 +544,6 @@ def _parse_string_delim(self, delim):
468544
except IndexError:
469545
# We can be reasonably sure that any IndexErrors inside here
470546
# were caused by an out-of-bounds `buff[read_idx]`.
471-
self._index = read_idx
472547
self._error("Trying to read past end of buffer")
473548

474549
try:
@@ -483,19 +558,8 @@ def _parse_string_delim(self, delim):
483558
insert_idx += 1
484559

485560
# Sync our local read index with the canonical one
486-
self._index = read_idx
487561
try:
488562
# Slice off only what we used of the working decode buffer
489563
return decode_buff[:insert_idx].decode('utf-8')
490564
except UnicodeDecodeError as exc:
491565
self._error(exc)
492-
493-
494-
def starts_with(startstr, something):
495-
if hasattr(something, 'startswith'):
496-
return something.startswith(startstr)
497-
else:
498-
pos = something.tell()
499-
s = something.read(len(startstr))
500-
something.seek(pos, os.SEEK_SET)
501-
return (s == startstr)

0 commit comments

Comments
 (0)