File tree Expand file tree Collapse file tree 4 files changed +20
-3
lines changed Expand file tree Collapse file tree 4 files changed +20
-3
lines changed Original file line number Diff line number Diff line change @@ -39,11 +39,15 @@ We are collecting feedback on this decision `here <https://github.com/pandas-dev
3939Avoid NumPy object dtype for strings by default
4040^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4141
42- Previously, all strings were stored in columns with NumPy object dtype.
42+ Previously, all strings were stored in columns with NumPy object dtype by default.
4343This release introduces an option ``future.infer_string`` that infers all
4444strings as PyArrow backed strings with dtype ``"string[pyarrow_numpy]"`` instead.
4545This is a new string dtype implementation that follows NumPy semantics in comparison
4646operations and will return ``np.nan`` as the missing value indicator.
47+ Setting the option will also infer the dtype ``"string"`` as a :class:`StringDtype` with
48+ storage set to ``"pyarrow_numpy"``, ignoring the value of the option
49+ ``mode.string_storage``.
50+
4751This option only works if PyArrow is installed. PyArrow backed strings have a
4852significantly reduced memory footprint and provide a big performance improvement
4953compared to NumPy object (:issue:`54430`).
Original file line number Diff line number Diff line change @@ -115,7 +115,11 @@ def na_value(self) -> libmissing.NAType | float: # type: ignore[override]
115115
116116 def __init__ (self , storage = None ) -> None :
117117 if storage is None :
118- storage = get_option ("mode.string_storage" )
118+ infer_string = get_option ("future.infer_string" )
119+ if infer_string :
120+ storage = "pyarrow_numpy"
121+ else :
122+ storage = get_option ("mode.string_storage" )
119123 if storage not in {"python" , "pyarrow" , "pyarrow_numpy" }:
120124 raise ValueError (
121125 f"Storage must be 'python' or 'pyarrow'. Got { storage } instead."
Original file line number Diff line number Diff line change @@ -493,7 +493,8 @@ def use_inf_as_na_cb(key) -> None:
493493
494494string_storage_doc = """
495495: string
496- The default storage for StringDtype.
496+ The default storage for StringDtype. This option is ignored if
497+ ``future.infer_string`` is set to True.
497498"""
498499
499500with cf .config_prefix ("mode" ):
Original file line number Diff line number Diff line change @@ -2115,6 +2115,14 @@ def test_series_string_inference_array_string_dtype(self):
21152115 ser = Series (np .array (["a" , "b" ]))
21162116 tm .assert_series_equal (ser , expected )
21172117
2118+ def test_series_string_inference_storage_definition (self ):
2119+ # GH#54793
2120+ pytest .importorskip ("pyarrow" )
2121+ expected = Series (["a" , "b" ], dtype = "string[pyarrow_numpy]" )
2122+ with pd .option_context ("future.infer_string" , True ):
2123+ result = Series (["a" , "b" ], dtype = "string" )
2124+ tm .assert_series_equal (result , expected )
2125+
21182126
21192127class TestSeriesConstructorIndexCoercion :
21202128 def test_series_constructor_datetimelike_index_coercion (self ):
You can’t perform that action at this time.
0 commit comments