Skip to content

Commit a3bd444

Browse files
committed
DEPR: infer bytes to bytes[pyarrow]
1 parent 985e7af commit a3bd444

File tree

22 files changed

+460
-99
lines changed

22 files changed

+460
-99
lines changed

pandas/_libs/lib.pyx

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1272,6 +1272,7 @@ cdef class Seen:
12721272
bint interval_ # seen_interval
12731273
bint time_
12741274
bint date_
1275+
bint bytes_
12751276

12761277
def __cinit__(self, bint coerce_numeric=False):
12771278
"""
@@ -1300,6 +1301,7 @@ cdef class Seen:
13001301
self.interval_ = False
13011302
self.time_ = False
13021303
self.date_ = False
1304+
self.bytes_ = False
13031305
self.coerce_numeric = coerce_numeric
13041306

13051307
cdef bint check_uint64_conflict(self) except -1:
@@ -2595,6 +2597,12 @@ def maybe_convert_objects(ndarray[object] objects,
25952597
else:
25962598
seen.object_ = True
25972599
break
2600+
elif isinstance(val, bytes):
2601+
if convert_non_numeric:
2602+
seen.bytes_ = True
2603+
else:
2604+
seen.object_ = True
2605+
break
25982606
elif PyTime_Check(val):
25992607
if convert_non_numeric and val.tzinfo is None:
26002608
seen.time_ = True
@@ -2605,8 +2613,37 @@ def maybe_convert_objects(ndarray[object] objects,
26052613
seen.object_ = True
26062614
break
26072615

2608-
# we try to coerce datetime w/tz but must all have the same tz
2609-
if seen.datetimetz_:
2616+
if seen.bytes_:
2617+
if is_bytes_array(objects):
2618+
opt = get_option("future.infer_bytes")
2619+
if opt is True:
2620+
import pyarrow as pa
2621+
2622+
from pandas.core.dtypes.dtypes import ArrowDtype
2623+
2624+
obj = pa.array(objects)
2625+
dtype = ArrowDtype(obj.type)
2626+
return dtype.construct_array_type()(obj)
2627+
elif opt is False:
2628+
# explicitly set to keep the old behavior and avoid the warning
2629+
pass
2630+
else:
2631+
from pandas.util._exceptions import find_stack_level
2632+
warnings.warn(
2633+
"Pandas type inference with a sequence of `bytes` "
2634+
"objects is deprecated. In a future version, this will give "
2635+
"bytes[pyarrow] dtype, which will require pyarrow to be "
2636+
"installed. To opt in to the new behavior immediately set "
2637+
"`pd.set_option('future.infer_bytes', True)`. To keep the "
2638+
"old behavior pass `dtype=object`.",
2639+
FutureWarning,
2640+
stacklevel=find_stack_level(),
2641+
)
2642+
2643+
seen.object_ = True
2644+
2645+
elif seen.datetimetz_:
2646+
# we try to coerce datetime w/tz but must all have the same tz
26102647
if is_datetime_with_singletz_array(objects):
26112648
from pandas import DatetimeIndex
26122649

pandas/core/config_init.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -892,6 +892,14 @@ def register_converter_cb(key) -> None:
892892

893893

894894
with cf.config_prefix("future"):
895+
cf.register_option(
896+
"future.infer_bytes",
897+
None,
898+
"Whether to infer sequence of bytes objects as pyarrow bytes "
899+
"dtype, which will be the default in pandas 3.0 "
900+
"(at which point this option will be deprecated).",
901+
validator=is_one_of_factory([True, False, None]),
902+
)
895903
cf.register_option(
896904
"future.infer_time",
897905
None,

pandas/core/construction.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,30 @@ def array(
415415
stacklevel=find_stack_level(),
416416
)
417417

418+
elif inferred_dtype == "bytes":
419+
opt = get_option("future.infer_bytes")
420+
421+
if opt is True:
422+
import pyarrow as pa
423+
424+
obj = pa.array(data)
425+
dtype = ArrowDtype(obj.type)
426+
return dtype.construct_array_type()(obj)
427+
elif opt is False:
428+
# explicitly set to keep the old behavior and avoid the warning
429+
pass
430+
else:
431+
warnings.warn(
432+
"Pandas type inference with a sequence of `bytes` "
433+
"objects is deprecated. In a future version, this will give "
434+
"bytes[pyarrow] dtype, which will require pyarrow to be "
435+
"installed. To opt in to the new behavior immediately set "
436+
"`pd.set_option('future.infer_bytes', True)`. To keep the "
437+
"old behavior pass `dtype=object`.",
438+
FutureWarning,
439+
stacklevel=find_stack_level(),
440+
)
441+
418442
# Pandas overrides NumPy for
419443
# 1. datetime64[ns,us,ms,s]
420444
# 2. timedelta64[ns,us,ms,s]

pandas/core/dtypes/cast.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -860,7 +860,25 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
860860
import pyarrow as pa
861861

862862
pa_dtype = pa.date32()
863+
dtype = ArrowDtype(pa_dtype)
864+
865+
elif isinstance(val, bytes):
866+
opt = get_option("future.infer_bytes")
867+
if opt is None:
868+
warnings.warn(
869+
"Pandas type inference with a `bytes` "
870+
"object is deprecated. In a future version, this will give "
871+
"bytes[pyarrow] dtype, which will require pyarrow to be "
872+
"installed. To opt in to the new behavior immediately set "
873+
"`pd.set_option('future.infer_bytes', True)`. To keep the "
874+
"old behavior pass `dtype=object`.",
875+
FutureWarning,
876+
stacklevel=find_stack_level(),
877+
)
878+
elif opt is True:
879+
import pyarrow as pa
863880

881+
pa_dtype = pa.binary()
864882
dtype = ArrowDtype(pa_dtype)
865883

866884
elif is_bool(val):

pandas/core/strings/accessor.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1967,7 +1967,13 @@ def decode(self, encoding, errors: str = "strict"):
19671967
f = lambda x: decoder(x, errors)[0]
19681968
arr = self._data.array
19691969
# assert isinstance(arr, (StringArray,))
1970-
result = arr._str_map(f)
1970+
1971+
if isinstance(arr.dtype, ArrowDtype):
1972+
# TODO: is there a performant way to do this?
1973+
res_values = arr.map(f)
1974+
result = type(arr)._from_sequence(res_values)
1975+
else:
1976+
result = arr._str_map(f)
19711977
return self._wrap_result(result)
19721978

19731979
@forbid_nonstring_types(["bytes"])

pandas/io/pytables.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5066,7 +5066,16 @@ def _unconvert_string_array(
50665066
dtype = f"U{itemsize}"
50675067

50685068
if isinstance(data[0], bytes):
5069-
data = Series(data, copy=False).str.decode(encoding, errors=errors)._values
5069+
with warnings.catch_warnings():
5070+
# Silence the deprecation warning about inferring bytes as bytes[pyarrow] dtype
5071+
# TODO: try to avoid this altogether
5072+
warnings.filterwarnings("ignore", category=FutureWarning)
5073+
5074+
data = (
5075+
Series(data, copy=False).str.decode(encoding, errors=errors)._values
5076+
).astype(object, copy=False)
5077+
# TODO: if we have pyarrow str instead of object here to begin
5078+
# with, can we avoid object dtype cast here?
50705079
else:
50715080
data = data.astype(dtype, copy=False).astype(object, copy=False)
50725081

pandas/io/stata.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2910,7 +2910,13 @@ def _prepare_data(self) -> np.recarray:
29102910
for i, col in enumerate(data):
29112911
typ = typlist[i]
29122912
if typ <= self._max_string_length:
2913-
data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,))
2913+
with warnings.catch_warnings():
2914+
# deprecated behavior with sequence of bytes, will infer
2915+
# to bytes[pyarrow]
2916+
# TODO: can we avoid this altogether?
2917+
warnings.filterwarnings("ignore", category=FutureWarning)
2918+
2919+
data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,))
29142920
stype = f"S{typ}"
29152921
dtypes[col] = stype
29162922
data[col] = data[col].astype(stype)

pandas/tests/dtypes/cast/test_infer_dtype.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,14 @@ def test_infer_dtype_from_scalar_errors():
163163
],
164164
)
165165
def test_infer_dtype_from_scalar(value, expected):
166-
dtype, _ = infer_dtype_from_scalar(value)
166+
msg = "type inference with a `bytes` object is deprecated"
167+
warn = None
168+
if isinstance(value, bytes):
169+
warn = FutureWarning
170+
171+
with tm.assert_produces_warning(warn, match=msg):
172+
dtype, _ = infer_dtype_from_scalar(value)
173+
167174
assert is_dtype_equal(dtype, expected)
168175

169176
with pytest.raises(TypeError, match="must be list-like"):

pandas/tests/dtypes/cast/test_promote.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,13 @@ def test_maybe_promote_any_with_bytes(any_numpy_dtype):
311311
# output is not a generic bytes, but corresponds to expected_dtype
312312
exp_val_for_scalar = np.array([fill_value], dtype=expected_dtype)[0]
313313

314-
_check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar)
314+
msg = "type inference with a `bytes` object"
315+
warn = None
316+
if any_numpy_dtype in ["timedelta64[ns]", "datetime64[ns]"]:
317+
warn = FutureWarning
318+
319+
with tm.assert_produces_warning(warn, match=msg):
320+
_check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar)
315321

316322

317323
def test_maybe_promote_datetime64_with_any(datetime64_dtype, any_numpy_dtype):
@@ -330,7 +336,13 @@ def test_maybe_promote_datetime64_with_any(datetime64_dtype, any_numpy_dtype):
330336
expected_dtype = np.dtype(object)
331337
exp_val_for_scalar = fill_value
332338

333-
_check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar)
339+
msg = "type inference with a `bytes` object is deprecated"
340+
warn = None
341+
if any_numpy_dtype is bytes and datetime64_dtype == "datetime64[ns]":
342+
warn = FutureWarning
343+
344+
with tm.assert_produces_warning(warn, match=msg):
345+
_check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar)
334346

335347

336348
@pytest.mark.parametrize(
@@ -413,7 +425,13 @@ def test_maybe_promote_timedelta64_with_any(timedelta64_dtype, any_numpy_dtype):
413425
expected_dtype = np.dtype(object)
414426
exp_val_for_scalar = fill_value
415427

416-
_check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar)
428+
msg = "type inference with a `bytes` object is deprecated"
429+
warn = None
430+
if any_numpy_dtype is bytes and timedelta64_dtype == "timedelta64[ns]":
431+
warn = FutureWarning
432+
433+
with tm.assert_produces_warning(warn, match=msg):
434+
_check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar)
417435

418436

419437
@pytest.mark.parametrize(

pandas/tests/extension/test_arrow.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -741,6 +741,9 @@ def test_stack(self, data, columns):
741741
warn_msg = (
742742
"Pandas type inference with a sequence of `datetime.date` objects"
743743
)
744+
if pa.types.is_binary(pa_dtype):
745+
warn = FutureWarning
746+
warn_msg = "Pandas type inference with a sequence of `bytes` objects"
744747

745748
with tm.assert_produces_warning(warn, match=warn_msg, check_stacklevel=False):
746749
super().test_stack(data, columns)
@@ -815,6 +818,9 @@ def test_hash_pandas_object_works(self, data, as_frame):
815818
# TODO(#48964) This warning will be avoided by implementing
816819
# ArrowExtensionArray.hash_pandas_object
817820
warn = FutureWarning
821+
elif pa.types.is_binary(pa_dtype):
822+
warn_msg = "Pandas type inference with a sequence of `bytes`"
823+
warn = FutureWarning
818824

819825
with tm.assert_produces_warning(warn, match=warn_msg, check_stacklevel=False):
820826
super().test_hash_pandas_object_works(data, as_frame)

0 commit comments

Comments
 (0)