Skip to content

Commit 3a2106e

Browse files
committed
SL-19707 - use python3 translate functionality for escaping strings
1 parent bcb4202 commit 3a2106e

File tree

2 files changed

+94
-63
lines changed

2 files changed

+94
-63
lines changed

llsd/serde_xml.py

Lines changed: 90 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,24 @@
55

66
from llsd.base import (_LLSD, ALL_CHARS, LLSDBaseParser, LLSDBaseFormatter, XML_HEADER,
77
LLSDParseError, LLSDSerializationError, UnicodeType,
8-
_format_datestr, _str_to_bytes, _to_python, is_unicode)
8+
_format_datestr, _str_to_bytes, _to_python, is_unicode, PY2)
99
from llsd.fastest_elementtree import ElementTreeError, fromstring, parse as _parse
1010

1111
INVALID_XML_BYTES = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c'\
1212
b'\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18'\
1313
b'\x19\x1a\x1b\x1c\x1d\x1e\x1f'
14+
15+
XML_ESC_TRANS = {}
16+
if not PY2:
17+
XML_ESC_TRANS = str.maketrans({'&': '&amp;',
18+
'<':'&lt;',
19+
'>':'&gt;',
20+
u'\uffff':None, # cannot be parsed
21+
u'\ufffe':None}) # cannot be parsed
22+
23+
for x in INVALID_XML_BYTES:
24+
XML_ESC_TRANS[x] = None
25+
1426
INVALID_XML_RE = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')
1527

1628

@@ -25,6 +37,24 @@ def remove_invalid_xml_bytes(b):
2537
# unit tests)
2638
return INVALID_XML_RE.sub('', b)
2739

40+
def xml_esc(v):
41+
"Escape string or unicode object v for xml output"
42+
43+
# Use is_unicode() instead of is_string() because in python 2, str is
44+
# bytes, not unicode, and should not be "encode()"d. Attempts to
45+
# encode("utf-8") a bytes type will result in an implicit
46+
# decode("ascii") that will throw a UnicodeDecodeError if the string
47+
# contains non-ascii characters.
48+
if is_unicode(v):
49+
# we need to drop these invalid characters because they
50+
# cannot be parsed (and encode() doesn't drop them for us)
51+
v = v.replace(u'\uffff', u'')
52+
v = v.replace(u'\ufffe', u'')
53+
v = v.encode('utf-8')
54+
v = remove_invalid_xml_bytes(v)
55+
return v.replace(b'&',b'&amp;').replace(b'<',b'&lt;').replace(b'>',b'&gt;')
56+
57+
2858

2959
class LLSDXMLFormatter(LLSDBaseFormatter):
3060
"""
@@ -38,47 +68,41 @@ class LLSDXMLFormatter(LLSDBaseFormatter):
3868
interface to this functionality.
3969
"""
4070

41-
def xml_esc(self, v):
42-
"Escape string or unicode object v for xml output"
43-
44-
# Use is_unicode() instead of is_string() because in python 2, str is
45-
# bytes, not unicode, and should not be "encode()"d. Attempts to
46-
# encode("utf-8") a bytes type will result in an implicit
47-
# decode("ascii") that will throw a UnicodeDecodeError if the string
48-
# contains non-ascii characters.
49-
if is_unicode(v):
50-
# we need to drop these invalid characters because they
51-
# cannot be parsed (and encode() doesn't drop them for us)
52-
v = v.replace(u'\uffff', u'')
53-
v = v.replace(u'\ufffe', u'')
54-
v = v.encode('utf-8')
55-
v = remove_invalid_xml_bytes(v)
56-
return v.replace(b'&',b'&amp;').replace(b'<',b'&lt;').replace(b'>',b'&gt;')
57-
71+
def __init__(self, indent_atom = None):
72+
"Construct a pretty serializer."
73+
# Call the super class constructor so that we have the type map
74+
super(LLSDXMLFormatter, self).__init__()
75+
self.py2 = PY2
76+
5877
def _LLSD(self, v):
5978
raise LLSDSerializationError("We should never end up here")
6079
def _UNDEF(self, _v):
6180
return b'<undef/>'
6281
def _BOOLEAN(self, v):
6382
if v:
6483
return b'<boolean>true</boolean>'
65-
else:
66-
return b'<boolean>false</boolean>'
84+
return b'<boolean>false</boolean>'
6785
def _INTEGER(self, v):
68-
return b'<integer>' + str(v).encode() + b'</integer>'
86+
return b'<integer>' + str(v).encode('utf-8') + b'</integer>'
6987
def _REAL(self, v):
70-
return b'<real>' + str(v).encode() + b'</real>'
88+
return b'<real>' + str(v).encode('utf-8') + b'</real>'
7189
def _UUID(self, v):
7290
if v.int == 0:
7391
return b'<uuid/>'
7492
else:
75-
return b'<uuid>' + str(v).encode() + b'</uuid>'
93+
return b'<uuid>' + str(v).encode('utf-8') + b'</uuid>'
7694
def _BINARY(self, v):
7795
return b'<binary>' + base64.b64encode(v).strip() + b'</binary>'
7896
def _STRING(self, v):
79-
return b'<string>' + self.xml_esc(v) + b'</string>'
97+
if self.py2:
98+
return b'<string>' + _str_to_bytes(xml_esc(v)) + b'</string>'
99+
else:
100+
return b'<string>' + v.translate(XML_ESC_TRANS).encode('utf-8') + b'</string>'
80101
def _URI(self, v):
81-
return b'<uri>' + self.xml_esc(v) + b'</uri>'
102+
if self.py2:
103+
return b'<uri>' + _str_to_bytes(xml_esc(v)) + b'</uri>'
104+
else:
105+
return b'<uri>' + UnicodeType(v).translate(XML_ESC_TRANS).encode('utf-8') + b'</uri>'
82106
def _DATE(self, v):
83107
return b'<date>' + _format_datestr(v) + b'</date>'
84108
def _ARRAY(self, v):
@@ -97,28 +121,38 @@ def _write(self, something):
97121

98122
iter_stack = [(iter([something]), b"", None)]
99123
while True:
100-
cur_iter, iter_type, iterable = iter_stack[-1]
124+
cur_iter, iter_type, iterable_obj = iter_stack[-1]
101125
try:
102126
item = next(cur_iter)
103127
if iter_type == b"map":
104-
self.stream.write(b'<key>' + self.xml_esc(UnicodeType(item)) + b'</key>')
105-
item = iterable[item]
128+
129+
if self.py2:
130+
self.stream.write(b'<key>' +
131+
_str_to_bytes(xml_esc(UnicodeType(item))) +
132+
b'</key>')
133+
else:
134+
# fair performance improvement by explicitly doing the
135+
# translate for py3 instead of calling xml_esc
136+
self.stream.write(b'<key>' +
137+
UnicodeType(item).translate(XML_ESC_TRANS).encode('utf-8') +
138+
b'</key>')
139+
item = iterable_obj[item]
106140
if isinstance(item, _LLSD):
107141
item = item.thing
108-
t = type(item)
109-
if not t in self.type_map:
142+
item_type = type(item)
143+
if not item_type in self.type_map:
110144
raise LLSDSerializationError(
111-
"Cannot serialize unknown type: %s (%s)" % (t, item))
112-
tf = self.type_map[t]
145+
"Cannot serialize unknown type: %s (%s)" % (item_type, item))
146+
tfunction = self.type_map[item_type]
113147

114-
if tf == self._MAP:
148+
if tfunction == self._MAP:
115149
self.stream.write(b'<map>')
116150
iter_stack.append((iter(list(item)), b"map", item))
117-
elif tf == self._ARRAY:
151+
elif tfunction == self._ARRAY:
118152
self.stream.write(b'<array>')
119153
iter_stack.append((iter(item), b"array", None))
120154
else:
121-
self.stream.write(tf(item))
155+
self.stream.write(tfunction(item))
122156
except StopIteration:
123157
self.stream.write(b'</' + iter_type + b'>')
124158
iter_stack.pop()
@@ -156,14 +190,6 @@ def _indent(self):
156190
"Write an indentation based on the atom and indentation level."
157191
self.stream.writelines([self._indent_atom] * self._indent_level)
158192

159-
def _ARRAY(self, v):
160-
"Recursively format an array with pretty turned on."
161-
raise LLSDSerializationError("We should never end up here")
162-
163-
def _MAP(self, v):
164-
"Recursively format a map with pretty turned on."
165-
raise LLSDSerializationError("We should never end up here")
166-
167193
def _write(self, something):
168194
"""
169195
Serialize a python object to self.stream as application/llsd+xml.
@@ -178,36 +204,44 @@ def _write(self, something):
178204
self.stream.write(b'<?xml version="1.0" ?>\n'
179205
b'<llsd>\n')
180206

181-
iter_stack = [(iter([something]), b"")]
207+
iter_stack = [(iter([something]), b"", None)]
182208
while True:
183-
cur_iter, iter_type = iter_stack[-1]
209+
cur_iter, iter_type, iterable_obj = iter_stack[-1]
184210
try:
185211
item = next(cur_iter)
186212
if iter_type == b"map":
187213
self._indent()
188-
self.stream.write(b'<key>' + _str_to_bytes(self.xml_esc(UnicodeType(item[0]))) + b'</key>\n')
189-
item = item[1]
214+
if self.py2:
215+
self.stream.write(b'<key>' +
216+
_str_to_bytes(xml_esc(UnicodeType(item))) +
217+
b'</key>')
218+
else:
219+
# calling translate directly is a bit faster
220+
self.stream.write(b'<key>' +
221+
UnicodeType(item).translate(XML_ESC_TRANS).encode('utf-8') +
222+
b'</key>\n')
223+
item = iterable_obj[item]
190224
if isinstance(item, _LLSD):
191225
item = item.thing
192-
t = type(item)
193-
if not t in self.type_map:
226+
item_type = type(item)
227+
if not item_type in self.type_map:
194228
raise LLSDSerializationError(
195-
"Cannot serialize unknown type: %s (%s)" % (t, item))
196-
tf = self.type_map[t]
229+
"Cannot serialize unknown type: %s (%s)" % (item_type, item))
230+
tfunction = self.type_map[item_type]
197231

198-
if tf == self._MAP:
232+
if tfunction == self._MAP:
199233
self._indent()
200234
self.stream.write(b'<map>\n')
201235
self._indent_level += 1
202-
iter_stack.append((iter(item.items()), b"map"))
203-
elif tf == self._ARRAY:
236+
iter_stack.append((iter(list(item)), b"map", item))
237+
elif tfunction == self._ARRAY:
204238
self._indent()
205239
self.stream.write(b'<array>\n')
206240
self._indent_level += 1
207-
iter_stack.append((iter(item), b"array"))
241+
iter_stack.append((iter(item), b"array", None))
208242
else:
209243
self._indent()
210-
self.stream.write(tf(item))
244+
self.stream.write(tfunction(item))
211245
self.stream.write(b'\n')
212246
except StopIteration:
213247
self._indent_level -= 1

tests/bench.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@ def binary_stream():
8282
yield f
8383

8484
def build_deep_xml():
85-
8685
deep_data = {}
8786
curr_data = deep_data
8887
for i in range(250):
@@ -96,16 +95,14 @@ def build_deep_xml():
9695
_deep_bench_data = build_deep_xml()
9796

9897
def build_wide_xml():
98+
9999
wide_xml = b"""
100100
<?xml version="1.0" encoding="UTF-8"?><llsd><map><key>wide_array</key><array>"
101101
"""
102-
102+
wide_data = {}
103103
for i in range(100000):
104-
wide_xml += b"""
105-
<real>5000</real>"""
106-
wide_xml += b"</array></map></llsd>"
107-
108-
return llsd.parse_xml(wide_xml)
104+
wide_data["item"+str(i)] = {"item1":2.345, "item2": [1,2,3], "item3": "string", "item4":{"subitem": llsd.uri("http://foo.bar.com")}}
105+
return wide_data
109106
_wide_bench_data = build_wide_xml()
110107

111108
def bench_stream(parse, stream):

0 commit comments

Comments
 (0)