File tree Expand file tree Collapse file tree 4 files changed +20
-3
lines changed Expand file tree Collapse file tree 4 files changed +20
-3
lines changed Original file line number Diff line number Diff line change @@ -39,11 +39,15 @@ We are collecting feedback on this decision `here <https://github.com/pandas-dev
3939Avoid NumPy object dtype for strings by default
4040^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4141
42- Previously, all strings were stored in columns with NumPy object dtype.
42+ Previously, all strings were stored in columns with NumPy object dtype by default.
4343This release introduces an option ``future.infer_string`` that infers all
4444strings as PyArrow backed strings with dtype ``"string[pyarrow_numpy]"`` instead.
4545This is a new string dtype implementation that follows NumPy semantics in comparison
4646operations and will return ``np.nan`` as the missing value indicator.
47+ Setting the option will also infer the dtype ``"string"`` as a :class:`StringDtype` with
48+ storage set to ``"pyarrow_numpy"``, ignoring the value behind the option
49+ ``mode.string_storage``.
50+
4751This option only works if PyArrow is installed. PyArrow backed strings have a
4852significantly reduced memory footprint and provide a big performance improvement
4953compared to NumPy object (:issue:`54430`).
Original file line number Diff line number Diff line change @@ -112,7 +112,11 @@ def na_value(self) -> libmissing.NAType | float: # type: ignore[override]
112112
113113 def __init__ (self , storage = None ) -> None :
114114 if storage is None :
115- storage = get_option ("mode.string_storage" )
115+ infer_string = get_option ("future.infer_string" )
116+ if infer_string :
117+ storage = "pyarrow_numpy"
118+ else :
119+ storage = get_option ("mode.string_storage" )
116120 if storage not in {"python" , "pyarrow" , "pyarrow_numpy" }:
117121 raise ValueError (
118122 f"Storage must be 'python' or 'pyarrow'. Got { storage } instead."
Original file line number Diff line number Diff line change @@ -492,7 +492,8 @@ def use_inf_as_na_cb(key) -> None:
492492
493493string_storage_doc = """
494494: string
495- The default storage for StringDtype.
495+ The default storage for StringDtype. This option is ignored if
496+ ``future.infer_string`` is set to True.
496497"""
497498
498499with cf .config_prefix ("mode" ):
Original file line number Diff line number Diff line change @@ -2115,6 +2115,14 @@ def test_series_string_inference_array_string_dtype(self):
21152115 ser = Series (np .array (["a" , "b" ]))
21162116 tm .assert_series_equal (ser , expected )
21172117
2118+ def test_series_string_inference_storage_definition (self ):
2119+ # GH#54793
2120+ pytest .importorskip ("pyarrow" )
2121+ expected = Series (["a" , "b" ], dtype = "string[pyarrow_numpy]" )
2122+ with pd .option_context ("future.infer_string" , True ):
2123+ result = Series (["a" , "b" ], dtype = "string" )
2124+ tm .assert_series_equal (result , expected )
2125+
21182126
21192127class TestSeriesConstructorIndexCoercion :
21202128 def test_series_constructor_datetimelike_index_coercion (self ):
You can’t perform that action at this time.
0 commit comments