2727 uses_params ,
2828 uses_relative ,
2929)
30+ import warnings
3031import zipfile
3132
3233from pandas ._typing import (
3334 CompressionDict ,
3435 CompressionOptions ,
36+ EncodingVar ,
37+ FileOrBuffer ,
3538 FilePathOrBuffer ,
39+ IOargs ,
40+ ModeVar ,
3641 StorageOptions ,
3742)
3843from pandas .compat import _get_lzma_file , _import_lzma
@@ -69,9 +74,7 @@ def is_url(url) -> bool:
6974 return parse_url (url ).scheme in _VALID_URLS
7075
7176
72- def _expand_user (
73- filepath_or_buffer : FilePathOrBuffer [AnyStr ],
74- ) -> FilePathOrBuffer [AnyStr ]:
77+ def _expand_user (filepath_or_buffer : FileOrBuffer [AnyStr ]) -> FileOrBuffer [AnyStr ]:
7578 """
7679 Return the argument with an initial component of ~ or ~user
7780 replaced by that user's home directory.
@@ -101,7 +104,7 @@ def validate_header_arg(header) -> None:
101104
102105def stringify_path (
103106 filepath_or_buffer : FilePathOrBuffer [AnyStr ],
104- ) -> FilePathOrBuffer [AnyStr ]:
107+ ) -> FileOrBuffer [AnyStr ]:
105108 """
106109 Attempt to convert a path-like object to a string.
107110
@@ -134,9 +137,9 @@ def stringify_path(
134137 # "__fspath__" [union-attr]
135138 # error: Item "IO[bytes]" of "Union[str, Path, IO[bytes]]" has no
136139 # attribute "__fspath__" [union-attr]
137- return filepath_or_buffer .__fspath__ () # type: ignore[union-attr]
140+ filepath_or_buffer = filepath_or_buffer .__fspath__ () # type: ignore[union-attr]
138141 elif isinstance (filepath_or_buffer , pathlib .Path ):
139- return str (filepath_or_buffer )
142+ filepath_or_buffer = str (filepath_or_buffer )
140143 return _expand_user (filepath_or_buffer )
141144
142145
@@ -162,13 +165,13 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool:
162165 )
163166
164167
165- def get_filepath_or_buffer (
168+ def get_filepath_or_buffer ( # type: ignore[assignment]
166169 filepath_or_buffer : FilePathOrBuffer ,
167- encoding : Optional [ str ] = None ,
170+ encoding : EncodingVar = None ,
168171 compression : CompressionOptions = None ,
169- mode : Optional [ str ] = None ,
172+ mode : ModeVar = None ,
170173 storage_options : StorageOptions = None ,
171- ):
174+ ) -> IOargs [ ModeVar , EncodingVar ] :
172175 """
173176 If the filepath_or_buffer is a url, translate and return the buffer.
174177 Otherwise passthrough.
@@ -191,14 +194,35 @@ def get_filepath_or_buffer(
191194
192195 .. versionadded:: 1.2.0
193196
194- Returns
195- -------
196- Tuple[FilePathOrBuffer, str, CompressionOptions, bool]
197- Tuple containing the filepath or buffer, the encoding, the compression
198- and should_close.
197+ .. versionchanged:: 1.2.0
198+
199+ Returns the dataclass IOargs.
199200 """
200201 filepath_or_buffer = stringify_path (filepath_or_buffer )
201202
203+ # bz2 and xz do not write the byte order mark for utf-16 and utf-32;
204+ # emit a UnicodeWarning when writing such files
205+ compression_method = infer_compression (
206+ filepath_or_buffer , get_compression_method (compression )[0 ]
207+ )
208+ if (
209+ mode
210+ and "w" in mode
211+ and compression_method in ["bz2" , "xz" ]
212+ and encoding in ["utf-16" , "utf-32" ]
213+ ):
214+ warnings .warn (
215+ f"{ compression } will not write the byte order mark for { encoding } " ,
216+ UnicodeWarning ,
217+ )
218+
219+ # Use binary mode when converting path-like objects to file-like objects (fsspec)
220+ # except when text mode is explicitly requested. The original mode is returned if
221+ # fsspec is not used.
222+ fsspec_mode = mode or "rb"
223+ if "t" not in fsspec_mode and "b" not in fsspec_mode :
224+ fsspec_mode += "b"
225+
202226 if isinstance (filepath_or_buffer , str ) and is_url (filepath_or_buffer ):
203227 # TODO: fsspec can also handle HTTP via requests, but leaving this unchanged
204228 if storage_options :
@@ -212,7 +236,13 @@ def get_filepath_or_buffer(
212236 compression = "gzip"
213237 reader = BytesIO (req .read ())
214238 req .close ()
215- return reader , encoding , compression , True
239+ return IOargs (
240+ filepath_or_buffer = reader ,
241+ encoding = encoding ,
242+ compression = compression ,
243+ should_close = True ,
244+ mode = fsspec_mode ,
245+ )
216246
217247 if is_fsspec_url (filepath_or_buffer ):
218248 assert isinstance (
@@ -244,7 +274,7 @@ def get_filepath_or_buffer(
244274
245275 try :
246276 file_obj = fsspec .open (
247- filepath_or_buffer , mode = mode or "rb" , ** (storage_options or {})
277+ filepath_or_buffer , mode = fsspec_mode , ** (storage_options or {})
248278 ).open ()
249279 # GH 34626 Reads from Public Buckets without Credentials needs anon=True
250280 except tuple (err_types_to_retry_with_anon ):
@@ -255,23 +285,41 @@ def get_filepath_or_buffer(
255285 storage_options = dict (storage_options )
256286 storage_options ["anon" ] = True
257287 file_obj = fsspec .open (
258- filepath_or_buffer , mode = mode or "rb" , ** (storage_options or {})
288+ filepath_or_buffer , mode = fsspec_mode , ** (storage_options or {})
259289 ).open ()
260290
261- return file_obj , encoding , compression , True
291+ return IOargs (
292+ filepath_or_buffer = file_obj ,
293+ encoding = encoding ,
294+ compression = compression ,
295+ should_close = True ,
296+ mode = fsspec_mode ,
297+ )
262298 elif storage_options :
263299 raise ValueError (
264300 "storage_options passed with file object or non-fsspec file path"
265301 )
266302
267303 if isinstance (filepath_or_buffer , (str , bytes , mmap .mmap )):
268- return _expand_user (filepath_or_buffer ), None , compression , False
304+ return IOargs (
305+ filepath_or_buffer = _expand_user (filepath_or_buffer ),
306+ encoding = encoding ,
307+ compression = compression ,
308+ should_close = False ,
309+ mode = mode ,
310+ )
269311
270312 if not is_file_like (filepath_or_buffer ):
271313 msg = f"Invalid file path or buffer object type: { type (filepath_or_buffer )} "
272314 raise ValueError (msg )
273315
274- return filepath_or_buffer , None , compression , False
316+ return IOargs (
317+ filepath_or_buffer = filepath_or_buffer ,
318+ encoding = encoding ,
319+ compression = compression ,
320+ should_close = False ,
321+ mode = mode ,
322+ )
275323
276324
277325def file_path_to_url (path : str ) -> str :
@@ -452,6 +500,15 @@ def get_handle(
452500 need_text_wrapping = (BufferedIOBase , RawIOBase , S3File )
453501 except ImportError :
454502 need_text_wrapping = (BufferedIOBase , RawIOBase )
503+ # fsspec is an optional dependency. If it is available, add its file-object
504+ # class to the list of classes that need text wrapping. If fsspec is too old and is
505+ # needed, get_filepath_or_buffer would already have thrown an exception.
506+ try :
507+ from fsspec .spec import AbstractFileSystem
508+
509+ need_text_wrapping = (* need_text_wrapping , AbstractFileSystem )
510+ except ImportError :
511+ pass
455512
456513 handles : List [Union [IO , _MMapWrapper ]] = list ()
457514 f = path_or_buf
@@ -583,12 +640,15 @@ def __init__(
583640 self .archive_name = archive_name
584641 kwargs_zip : Dict [str , Any ] = {"compression" : zipfile .ZIP_DEFLATED }
585642 kwargs_zip .update (kwargs )
586- super ().__init__ (file , mode , ** kwargs_zip )
643+ super ().__init__ (file , mode , ** kwargs_zip ) # type: ignore[arg-type]
587644
def write(self, data):
    """
    Store ``data`` in the archive as a single member.

    The member name is the first non-empty value among
    ``self.archive_name``, ``self.filename`` and the literal ``"zip"``.

    Parameters
    ----------
    data : str or bytes
        Content handed unchanged to ``writestr`` of the parent class.
    """
    # ZipFile needs a non-empty string: treat both None and "" as missing
    # so an empty name can never reach writestr.
    archive_name = self.archive_name or self.filename or "zip"
    super().writestr(archive_name, data)
593653
594654 @property
0 commit comments