55
66from llsd .base import (_LLSD , ALL_CHARS , LLSDBaseParser , LLSDBaseFormatter , XML_HEADER ,
77 LLSDParseError , LLSDSerializationError , UnicodeType ,
8- _format_datestr , _str_to_bytes , _to_python , is_unicode )
8+ _format_datestr , _str_to_bytes , _to_python , is_unicode , PY2 )
99from llsd .fastest_elementtree import ElementTreeError , fromstring , parse as _parse
1010
# Control bytes below 0x20, excluding tab (0x09), newline (0x0a) and
# carriage return (0x0d); these are stripped from XML output.
INVALID_XML_BYTES = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c' \
                    b'\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18' \
                    b'\x19\x1a\x1b\x1c\x1d\x1e\x1f'

# Translation table for str.translate() on Python 3: escapes the XML markup
# characters and deletes (maps to None) characters that cannot be parsed.
# Left as an empty dict on Python 2, where the xml_esc() function is used
# instead.
XML_ESC_TRANS = {}
if not PY2:
    XML_ESC_TRANS = str.maketrans({'&': '&amp;',
                                   '<': '&lt;',
                                   '>': '&gt;',
                                   u'\uffff': None,   # cannot be parsed
                                   u'\ufffe': None})  # cannot be parsed

    # Iterating a bytes object on Python 3 yields ints, which are exactly
    # the ordinal keys str.translate() looks up.
    for x in INVALID_XML_BYTES:
        XML_ESC_TRANS[x] = None
25+
1426INVALID_XML_RE = re .compile (r'[\x00-\x08\x0b\x0c\x0e-\x1f]' )
1527
1628
@@ -25,6 +37,24 @@ def remove_invalid_xml_bytes(b):
2537 # unit tests)
2638 return INVALID_XML_RE .sub ('' , b )
2739
def xml_esc(v):
    """Escape string or unicode object v for xml output.

    Unicode input has U+FFFF and U+FFFE removed and is encoded to UTF-8;
    the invalid XML control bytes are then stripped and the markup
    characters ``&``, ``<`` and ``>`` are replaced by their entities.

    :param v: a unicode object, or a byte string.
    :returns: the escaped value as bytes.
    """

    # Use is_unicode() instead of is_string() because in python 2, str is
    # bytes, not unicode, and should not be "encode()"d. Attempts to
    # encode("utf-8") a bytes type will result in an implicit
    # decode("ascii") that will throw a UnicodeDecodeError if the string
    # contains non-ascii characters.
    if is_unicode(v):
        # we need to drop these invalid characters because they
        # cannot be parsed (and encode() doesn't drop them for us)
        v = v.replace(u'\uffff', u'')
        v = v.replace(u'\ufffe', u'')
        v = v.encode('utf-8')
    v = remove_invalid_xml_bytes(v)
    # '&' must be escaped first so the entities produced for '<' and '>'
    # are not escaped a second time.
    return v.replace(b'&', b'&amp;').replace(b'<', b'&lt;').replace(b'>', b'&gt;')
56+
57+
2858
2959class LLSDXMLFormatter (LLSDBaseFormatter ):
3060 """
@@ -38,47 +68,41 @@ class LLSDXMLFormatter(LLSDBaseFormatter):
3868 interface to this functionality.
3969 """
4070
41- def xml_esc (self , v ):
42- "Escape string or unicode object v for xml output"
43-
44- # Use is_unicode() instead of is_string() because in python 2, str is
45- # bytes, not unicode, and should not be "encode()"d. Attempts to
46- # encode("utf-8") a bytes type will result in an implicit
47- # decode("ascii") that will throw a UnicodeDecodeError if the string
48- # contains non-ascii characters.
49- if is_unicode (v ):
50- # we need to drop these invalid characters because they
51- # cannot be parsed (and encode() doesn't drop them for us)
52- v = v .replace (u'\uffff ' , u'' )
53- v = v .replace (u'\ufffe ' , u'' )
54- v = v .encode ('utf-8' )
55- v = remove_invalid_xml_bytes (v )
56- return v .replace (b'&' ,b'&' ).replace (b'<' ,b'<' ).replace (b'>' ,b'>' )
57-
def __init__(self, indent_atom=None):
    """Construct an XML serializer.

    :param indent_atom: not used by this constructor.
        NOTE(review): presumably accepted so the signature matches a
        pretty-printing formatter -- confirm against callers.
    """
    # Call the super class constructor so that we have the type map
    super(LLSDXMLFormatter, self).__init__()
    # Keep the interpreter flag on the instance; the formatting methods
    # branch on self.py2 to pick the escaping strategy.
    self.py2 = PY2
76+
def _LLSD(self, v):
    """Handler for ``_LLSD`` wrapper objects; always raises.

    ``_write`` replaces a wrapper with its ``.thing`` before looking up a
    handler, so this method is not expected to be called.

    :raises LLSDSerializationError: always.
    """
    raise LLSDSerializationError("We should never end up here")
6079 def _UNDEF (self , _v ):
6180 return b'<undef/>'
6281 def _BOOLEAN (self , v ):
6382 if v :
6483 return b'<boolean>true</boolean>'
65- else :
66- return b'<boolean>false</boolean>'
84+ return b'<boolean>false</boolean>'
6785 def _INTEGER (self , v ):
68- return b'<integer>' + str (v ).encode () + b'</integer>'
86+ return b'<integer>' + str (v ).encode ('utf-8' ) + b'</integer>'
6987 def _REAL (self , v ):
70- return b'<real>' + str (v ).encode () + b'</real>'
88+ return b'<real>' + str (v ).encode ('utf-8' ) + b'</real>'
7189 def _UUID (self , v ):
7290 if v .int == 0 :
7391 return b'<uuid/>'
7492 else :
75- return b'<uuid>' + str (v ).encode () + b'</uuid>'
93+ return b'<uuid>' + str (v ).encode ('utf-8' ) + b'</uuid>'
7694 def _BINARY (self , v ):
7795 return b'<binary>' + base64 .b64encode (v ).strip () + b'</binary>'
def _STRING(self, v):
    """Return the <string> element containing the XML-escaped form of v.

    On Python 2 the value is escaped by xml_esc(); on Python 3 it is
    escaped with the XML_ESC_TRANS translation table and then encoded as
    UTF-8.
    """
    if self.py2:
        return b'<string>' + _str_to_bytes(xml_esc(v)) + b'</string>'
    # v.translate() with a dict table requires a text string here;
    # NOTE(review): assumes the type map only routes str values to this
    # handler on Python 3 -- confirm.
    return b'<string>' + v.translate(XML_ESC_TRANS).encode('utf-8') + b'</string>'
def _URI(self, v):
    """Return the <uri> element containing the XML-escaped form of v.

    On Python 2 the value is escaped by xml_esc(); on Python 3 it is
    converted with UnicodeType(), escaped with the XML_ESC_TRANS
    translation table and then encoded as UTF-8.
    """
    if self.py2:
        return b'<uri>' + _str_to_bytes(xml_esc(v)) + b'</uri>'
    return b'<uri>' + UnicodeType(v).translate(XML_ESC_TRANS).encode('utf-8') + b'</uri>'
def _DATE(self, v):
    "Return the <date> element containing v as formatted by _format_datestr()."
    return b'<date>' + _format_datestr(v) + b'</date>'
84108 def _ARRAY (self , v ):
@@ -97,28 +121,38 @@ def _write(self, something):
97121
98122 iter_stack = [(iter ([something ]), b"" , None )]
99123 while True :
100- cur_iter , iter_type , iterable = iter_stack [- 1 ]
124+ cur_iter , iter_type , iterable_obj = iter_stack [- 1 ]
101125 try :
102126 item = next (cur_iter )
103127 if iter_type == b"map" :
104- self .stream .write (b'<key>' + self .xml_esc (UnicodeType (item )) + b'</key>' )
105- item = iterable [item ]
128+
129+ if self .py2 :
130+ self .stream .write (b'<key>' +
131+ _str_to_bytes (xml_esc (UnicodeType (item ))) +
132+ b'</key>' )
133+ else :
134+ # fair performance improvement by explicitly doing the
135+ # translate for py3 instead of calling xml_esc
136+ self .stream .write (b'<key>' +
137+ UnicodeType (item ).translate (XML_ESC_TRANS ).encode ('utf-8' ) +
138+ b'</key>' )
139+ item = iterable_obj [item ]
106140 if isinstance (item , _LLSD ):
107141 item = item .thing
108- t = type (item )
109- if not t in self .type_map :
142+ item_type = type (item )
143+ if not item_type in self .type_map :
110144 raise LLSDSerializationError (
111- "Cannot serialize unknown type: %s (%s)" % (t , item ))
112- tf = self .type_map [t ]
145+ "Cannot serialize unknown type: %s (%s)" % (item_type , item ))
146+ tfunction = self .type_map [item_type ]
113147
114- if tf == self ._MAP :
148+ if tfunction == self ._MAP :
115149 self .stream .write (b'<map>' )
116150 iter_stack .append ((iter (list (item )), b"map" , item ))
117- elif tf == self ._ARRAY :
151+ elif tfunction == self ._ARRAY :
118152 self .stream .write (b'<array>' )
119153 iter_stack .append ((iter (item ), b"array" , None ))
120154 else :
121- self .stream .write (tf (item ))
155+ self .stream .write (tfunction (item ))
122156 except StopIteration :
123157 self .stream .write (b'</' + iter_type + b'>' )
124158 iter_stack .pop ()
@@ -156,14 +190,6 @@ def _indent(self):
156190 "Write an indentation based on the atom and indentation level."
157191 self .stream .writelines ([self ._indent_atom ] * self ._indent_level )
158192
159- def _ARRAY (self , v ):
160- "Recursively format an array with pretty turned on."
161- raise LLSDSerializationError ("We should never end up here" )
162-
163- def _MAP (self , v ):
164- "Recursively format a map with pretty turned on."
165- raise LLSDSerializationError ("We should never end up here" )
166-
167193 def _write (self , something ):
168194 """
169195 Serialize a python object to self.stream as application/llsd+xml.
@@ -178,36 +204,44 @@ def _write(self, something):
178204 self .stream .write (b'<?xml version="1.0" ?>\n '
179205 b'<llsd>\n ' )
180206
181- iter_stack = [(iter ([something ]), b"" )]
207+ iter_stack = [(iter ([something ]), b"" , None )]
182208 while True :
183- cur_iter , iter_type = iter_stack [- 1 ]
209+ cur_iter , iter_type , iterable_obj = iter_stack [- 1 ]
184210 try :
185211 item = next (cur_iter )
186212 if iter_type == b"map" :
187213 self ._indent ()
188- self .stream .write (b'<key>' + _str_to_bytes (self .xml_esc (UnicodeType (item [0 ]))) + b'</key>\n ' )
189- item = item [1 ]
214+ if self .py2 :
215+ self .stream .write (b'<key>' +
216+ _str_to_bytes (xml_esc (UnicodeType (item ))) +
217+ b'</key>' )
218+ else :
219+ # calling translate directly is a bit faster
220+ self .stream .write (b'<key>' +
221+ UnicodeType (item ).translate (XML_ESC_TRANS ).encode ('utf-8' ) +
222+ b'</key>\n ' )
223+ item = iterable_obj [item ]
190224 if isinstance (item , _LLSD ):
191225 item = item .thing
192- t = type (item )
193- if not t in self .type_map :
226+ item_type = type (item )
227+ if not item_type in self .type_map :
194228 raise LLSDSerializationError (
195- "Cannot serialize unknown type: %s (%s)" % (t , item ))
196- tf = self .type_map [t ]
229+ "Cannot serialize unknown type: %s (%s)" % (item_type , item ))
230+ tfunction = self .type_map [item_type ]
197231
198- if tf == self ._MAP :
232+ if tfunction == self ._MAP :
199233 self ._indent ()
200234 self .stream .write (b'<map>\n ' )
201235 self ._indent_level += 1
202- iter_stack .append ((iter (item . items ( )), b"map" ))
203- elif tf == self ._ARRAY :
236+ iter_stack .append ((iter (list ( item )), b"map" , item ))
237+ elif tfunction == self ._ARRAY :
204238 self ._indent ()
205239 self .stream .write (b'<array>\n ' )
206240 self ._indent_level += 1
207- iter_stack .append ((iter (item ), b"array" ))
241+ iter_stack .append ((iter (item ), b"array" , None ))
208242 else :
209243 self ._indent ()
210- self .stream .write (tf (item ))
244+ self .stream .write (tfunction (item ))
211245 self .stream .write (b'\n ' )
212246 except StopIteration :
213247 self ._indent_level -= 1
0 commit comments