@@ -182,7 +182,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
182182 preserve_dtypes = preserve_dtypes ,
183183 columns = columns ,
184184 order_categoricals = order_categoricals ,
185- chunksize = chunksize , encoding = encoding )
185+ chunksize = chunksize )
186186
187187 if iterator or chunksize :
188188 data = reader
@@ -838,15 +838,8 @@ def get_base_missing_value(cls, dtype):
838838
839839
840840class StataParser (object ):
841- _default_encoding = 'latin-1'
842841
843- def __init__ (self , encoding ):
844- if encoding is not None :
845- if encoding not in VALID_ENCODINGS :
846- raise ValueError ('Unknown encoding. Only latin-1 and ascii '
847- 'supported.' )
848-
849- self ._encoding = encoding
842+ def __init__ (self ):
850843
851844 # type code.
852845 # --------------------
@@ -964,8 +957,8 @@ def __init__(self, path_or_buf, convert_dates=True,
964957 convert_categoricals = True , index_col = None ,
965958 convert_missing = False , preserve_dtypes = True ,
966959 columns = None , order_categoricals = True ,
967- encoding = 'latin-1' , chunksize = None ):
968- super (StataReader , self ).__init__ (encoding )
960+ encoding = None , chunksize = None ):
961+ super (StataReader , self ).__init__ ()
969962 self .col_sizes = ()
970963
971964 # Arguments to the reader (can be temporarily overridden in
@@ -977,10 +970,6 @@ def __init__(self, path_or_buf, convert_dates=True,
977970 self ._preserve_dtypes = preserve_dtypes
978971 self ._columns = columns
979972 self ._order_categoricals = order_categoricals
980- if encoding is not None :
981- if encoding not in VALID_ENCODINGS :
982- raise ValueError ('Unknown encoding. Only latin-1 and ascii '
983- 'supported.' )
984973 self ._encoding = encoding
985974 self ._chunksize = chunksize
986975
@@ -998,18 +987,13 @@ def __init__(self, path_or_buf, convert_dates=True,
998987 path_or_buf = _stringify_path (path_or_buf )
999988 if isinstance (path_or_buf , str ):
1000989 path_or_buf , encoding , _ , should_close = get_filepath_or_buffer (
1001- path_or_buf , encoding = self ._default_encoding
1002- )
990+ path_or_buf )
1003991
1004992 if isinstance (path_or_buf , (str , text_type , bytes )):
1005993 self .path_or_buf = open (path_or_buf , 'rb' )
1006994 else :
1007995 # Copy to BytesIO, and ensure no encoding
1008996 contents = path_or_buf .read ()
1009- try :
1010- contents = contents .encode (self ._default_encoding )
1011- except :
1012- pass
1013997 self .path_or_buf = BytesIO (contents )
1014998
1015999 self ._read_header ()
@@ -1030,6 +1014,15 @@ def close(self):
10301014 except IOError :
10311015 pass
10321016
1017+ def _set_encoding (self ):
1018+ """
1019+ Set self._encoding from format_version ('latin-1' below 118, else 'utf-8'), replacing any prior value
1020+ """
1021+ if self .format_version < 118 :
1022+ self ._encoding = 'latin-1'
1023+ else :
1024+ self ._encoding = 'utf-8'
1025+
10331026 def _read_header (self ):
10341027 first_char = self .path_or_buf .read (1 )
10351028 if struct .unpack ('c' , first_char )[0 ] == b'<' :
@@ -1049,6 +1042,7 @@ def _read_new_header(self, first_char):
10491042 self .format_version = int (self .path_or_buf .read (3 ))
10501043 if self .format_version not in [117 , 118 ]:
10511044 raise ValueError (_version_error )
1045+ self ._set_encoding ()
10521046 self .path_or_buf .read (21 ) # </release><byteorder>
10531047 self .byteorder = self .path_or_buf .read (3 ) == b'MSF' and '>' or '<'
10541048 self .path_or_buf .read (15 ) # </byteorder><K>
@@ -1235,6 +1229,7 @@ def _read_old_header(self, first_char):
12351229 self .format_version = struct .unpack ('b' , first_char )[0 ]
12361230 if self .format_version not in [104 , 105 , 108 , 111 , 113 , 114 , 115 ]:
12371231 raise ValueError (_version_error )
1232+ self ._set_encoding ()
12381233 self .byteorder = struct .unpack ('b' , self .path_or_buf .read (1 ))[
12391234 0 ] == 0x1 and '>' or '<'
12401235 self .filetype = struct .unpack ('b' , self .path_or_buf .read (1 ))[0 ]
@@ -1338,16 +1333,9 @@ def _decode(self, s):
13381333 return s .decode ('utf-8' )
13391334
13401335 def _null_terminate (self , s ):
1341- if compat .PY3 or self ._encoding is not None :
1342- # have bytes not strings, so must decode
1343- s = s .partition (b"\0 " )[0 ]
1344- return s .decode (self ._encoding or self ._default_encoding )
1345- else :
1346- null_byte = "\0 "
1347- try :
1348- return s .lstrip (null_byte )[:s .index (null_byte )]
1349- except :
1350- return s
1336+ # s is bytes, not str: cut at the null terminator, then decode with self._encoding
1337+ s = s .partition (b"\0 " )[0 ]
1338+ return s .decode (self ._encoding )
13511339
13521340 def _read_value_labels (self ):
13531341 if self ._value_labels_read :
@@ -1433,10 +1421,7 @@ def _read_strls(self):
14331421 self .path_or_buf .read (4 ))[0 ]
14341422 va = self .path_or_buf .read (length )
14351423 if typ == 130 :
1436- encoding = 'utf-8'
1437- if self .format_version == 117 :
1438- encoding = self ._encoding or self ._default_encoding
1439- va = va [0 :- 1 ].decode (encoding )
1424+ va = va [0 :- 1 ].decode (self ._encoding )
14401425 # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
14411426 self .GSO [str (v_o )] = va
14421427
@@ -1980,9 +1965,14 @@ class StataWriter(StataParser):
19801965 def __init__ (self , fname , data , convert_dates = None , write_index = True ,
19811966 encoding = "latin-1" , byteorder = None , time_stamp = None ,
19821967 data_label = None , variable_labels = None ):
1983- super (StataWriter , self ).__init__ (encoding )
1968+ super (StataWriter , self ).__init__ ()
19841969 self ._convert_dates = {} if convert_dates is None else convert_dates
19851970 self ._write_index = write_index
1971+ if encoding is not None :
1972+ if encoding not in VALID_ENCODINGS :
1973+ raise ValueError ('Unknown encoding. Only latin-1 and ascii '
1974+ 'supported.' )
1975+ self ._encoding = encoding
19861976 self ._time_stamp = time_stamp
19871977 self ._data_label = data_label
19881978 self ._variable_labels = variable_labels
0 commit comments