Skip to content

Commit 165a970

Browse files
Fix scan operations for string columns (rapidsai#20460)
Closes rapidsai#20444. This PR enables scan operations for the `string` type and fixes a bug where improper storage types were being propagated in `to_pandas`. Authors: GALI PREM SAGAR (https://github.com/galipremsagar). Approvers: Matthew Murray (https://github.com/Matt711). URL: rapidsai#20460
1 parent 03bf586 commit 165a970

File tree

3 files changed: 39 additions and 48 deletions.

python/cudf/cudf/core/column/string.py

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,11 @@
1717

1818
import cudf
1919
from cudf.api.types import is_scalar
20+
from cudf.core._compat import PANDAS_GE_230
2021
from cudf.core._internals import binaryop
2122
from cudf.core.buffer import Buffer, acquire_spill_lock
2223
from cudf.core.column.column import ColumnBase, as_column, column_empty
24+
from cudf.core.mixins import Scannable
2325
from cudf.errors import MixedTypeError
2426
from cudf.utils.dtypes import (
2527
CUDF_STRING_DTYPE,
@@ -77,7 +79,7 @@ def plc_flags_from_re_flags(
7779
return plc_flags
7880

7981

80-
class StringColumn(ColumnBase):
82+
class StringColumn(ColumnBase, Scannable):
8183
"""
8284
Implements operations for Columns of String type
8385
@@ -115,6 +117,10 @@ class StringColumn(ColumnBase):
115117
"__truediv__",
116118
"__floordiv__",
117119
}
120+
_VALID_SCANS = {
121+
"cummin",
122+
"cummax",
123+
}
118124

119125
def __init__(
120126
self,
@@ -327,6 +333,11 @@ def _with_type_metadata(self: Self, dtype: Dtype) -> Self:
327333
self._dtype = dtype
328334
return self
329335

336+
def _scan(self, op: str):
337+
return self.scan(op.replace("cum", ""), True)._with_type_metadata(
338+
self.dtype
339+
)
340+
330341
def as_numerical_column(self, dtype: np.dtype) -> NumericalColumn:
331342
if dtype.kind == "b":
332343
result = self.count_characters() > np.int8(0)
@@ -473,11 +484,20 @@ def to_pandas(
473484
if (
474485
cudf.get_option("mode.pandas_compatible")
475486
and isinstance(self.dtype, pd.StringDtype)
476-
and "pyarrow" in self.dtype.storage
487+
and self.dtype.storage in ["pyarrow", "python"]
477488
):
478-
pandas_array = self.dtype.__from_arrow__(
479-
self.to_arrow().cast(pa.large_string())
480-
)
489+
if self.dtype.storage == "pyarrow":
490+
pandas_array = self.dtype.__from_arrow__(
491+
self.to_arrow().cast(pa.large_string())
492+
)
493+
elif self.dtype.na_value is np.nan and PANDAS_GE_230:
494+
pandas_array = pd.core.arrays.string_.StringArrayNumpySemantics._from_sequence(
495+
self.to_arrow()
496+
)
497+
else:
498+
return super().to_pandas(
499+
nullable=nullable, arrow_type=arrow_type
500+
)
481501
return pd.Index(pandas_array, copy=False)
482502
return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
483503

python/cudf/cudf/pandas/scripts/conftest-patch.py

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -960,7 +960,6 @@ def pytest_unconfigure(config):
960960
"tests/arrays/string_/test_string.py::test_min_max[string=string[pyarrow]-False-min]",
961961
"tests/arrays/string_/test_string.py::test_min_max[string=string[python]-False-max]",
962962
"tests/arrays/string_/test_string.py::test_min_max[string=string[python]-False-min]",
963-
"tests/arrays/string_/test_string.py::test_numpy_array_ufunc[string=str[python]-Series]",
964963
"tests/arrays/string_/test_string.py::test_repr[pyarrow_numpy]",
965964
"tests/arrays/string_/test_string.py::test_repr[string=str[pyarrow]]",
966965
"tests/arrays/string_/test_string.py::test_repr[string=str[python]]",
@@ -3655,10 +3654,6 @@ def pytest_unconfigure(config):
36553654
"tests/extension/test_string.py::TestStringArray::test_unary_ufunc_dunder_equivalence[pyarrow-True-positive]",
36563655
"tests/extension/test_string.py::TestStringArray::test_unary_ufunc_dunder_equivalence[pyarrow_numpy-False-positive]",
36573656
"tests/extension/test_string.py::TestStringArray::test_unary_ufunc_dunder_equivalence[pyarrow_numpy-True-positive]",
3658-
"tests/extension/test_string.py::TestStringArray::test_unique[string=str[python]-False-<lambda>-Series]",
3659-
"tests/extension/test_string.py::TestStringArray::test_unique[string=str[python]-False-unique-Series]",
3660-
"tests/extension/test_string.py::TestStringArray::test_unique[string=str[python]-True-<lambda>-Series]",
3661-
"tests/extension/test_string.py::TestStringArray::test_unique[string=str[python]-True-unique-Series]",
36623657
"tests/extension/test_string.py::TestStringArray::test_unstack[pyarrow-False-frame-index1]",
36633658
"tests/extension/test_string.py::TestStringArray::test_unstack[pyarrow-False-frame-index2]",
36643659
"tests/extension/test_string.py::TestStringArray::test_unstack[pyarrow-False-frame-index3]",
@@ -3703,20 +3698,12 @@ def pytest_unconfigure(config):
37033698
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[pyarrow]-True-frame-index3]",
37043699
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[pyarrow]-True-series-index2]",
37053700
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[pyarrow]-True-series-index3]",
3706-
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-False-frame-index0]",
3707-
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-False-frame-index1]",
37083701
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-False-frame-index2]",
37093702
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-False-frame-index3]",
3710-
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-False-series-index0]",
3711-
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-False-series-index1]",
37123703
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-False-series-index2]",
37133704
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-False-series-index3]",
3714-
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-frame-index0]",
3715-
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-frame-index1]",
37163705
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-frame-index2]",
37173706
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-frame-index3]",
3718-
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-series-index0]",
3719-
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-series-index1]",
37203707
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-series-index2]",
37213708
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-series-index3]",
37223709
"tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-False-frame-index2]",
@@ -9246,36 +9233,6 @@ def pytest_unconfigure(config):
92469233
"tests/series/test_constructors.py::TestSeriesConstructors::test_series_string_inference_scalar",
92479234
"tests/series/test_constructors.py::TestSeriesConstructors::test_series_string_inference_storage_definition",
92489235
"tests/series/test_constructors.py::TestSeriesConstructors::test_series_string_with_na_inference[None]",
9249-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data0-cumsum-True-expected_data0]",
9250-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data1-cumsum-False-expected_data1]",
9251-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data10-cummin-True-expected_data10]",
9252-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data11-cummin-False-expected_data11]",
9253-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data12-cummin-True-expected_data12]",
9254-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data13-cummin-False-expected_data13]",
9255-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data14-cummin-True-expected_data14]",
9256-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data15-cummin-False-expected_data15]",
9257-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data16-cummin-True-expected_data16]",
9258-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data17-cummin-False-expected_data17]",
9259-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data18-cummin-True-expected_data18]",
9260-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data19-cummin-False-expected_data19]",
9261-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data2-cumsum-True-expected_data2]",
9262-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data20-cummax-True-expected_data20]",
9263-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data21-cummax-False-expected_data21]",
9264-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data22-cummax-True-expected_data22]",
9265-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data23-cummax-False-expected_data23]",
9266-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data24-cummax-True-expected_data24]",
9267-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data25-cummax-False-expected_data25]",
9268-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data26-cummax-True-expected_data26]",
9269-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data27-cummax-False-expected_data27]",
9270-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data28-cummax-True-expected_data28]",
9271-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data29-cummax-False-expected_data29]",
9272-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data3-cumsum-False-expected_data3]",
9273-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data4-cumsum-True-expected_data4]",
9274-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data5-cumsum-False-expected_data5]",
9275-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data6-cumsum-True-expected_data6]",
9276-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data7-cumsum-False-expected_data7]",
9277-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data8-cumsum-True-expected_data8]",
9278-
"tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data9-cumsum-False-expected_data9]",
92799236
"tests/series/test_formats.py::TestCategoricalRepr::test_categorical_series_repr_datetime",
92809237
"tests/series/test_formats.py::TestCategoricalRepr::test_categorical_series_repr_datetime_ordered",
92819238
"tests/series/test_formats.py::TestCategoricalRepr::test_categorical_series_repr_timedelta",

python/cudf/cudf/tests/series/methods/test_cumulative_methods.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,3 +123,17 @@ def test_scan_boolean(method):
123123
expect = getattr(s.to_pandas(), method)()
124124

125125
assert_eq(expect, got)
126+
127+
128+
def test_cummin_cummax_strings():
129+
data = ["dog", "cat", "zebra", "ant", "bat"]
130+
gser = cudf.Series(data)
131+
pser = pd.Series(data)
132+
133+
got_min = gser.cummin()
134+
expected_min = pser.cummin()
135+
assert_eq(got_min, expected_min)
136+
137+
got_max = gser.cummax()
138+
expected_max = pser.cummax()
139+
assert_eq(got_max, expected_max)

0 commit comments

Comments
 (0)