Fix scan operations for string columns (rapidsai#20460)

galipremsagar · web-flow · commit 165a9705fa8b · 2025-11-03T16:44:51.000Z
Closes: rapidsai#20444. This PR enables scan operations (`cummin` and `cummax`) for the `string` type, and fixes a bug where improper storage types were being propagated in `to_pandas`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Murray (https://github.com/Matt711) URL: rapidsai#20460
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
@@ -17,9 +17,11 @@
 
 import cudf
 from cudf.api.types import is_scalar
+from cudf.core._compat import PANDAS_GE_230
 from cudf.core._internals import binaryop
 from cudf.core.buffer import Buffer, acquire_spill_lock
 from cudf.core.column.column import ColumnBase, as_column, column_empty
+from cudf.core.mixins import Scannable
 from cudf.errors import MixedTypeError
 from cudf.utils.dtypes import (
     CUDF_STRING_DTYPE,
@@ -77,7 +79,7 @@ def plc_flags_from_re_flags(
     return plc_flags
 
 
-class StringColumn(ColumnBase):
+class StringColumn(ColumnBase, Scannable):
     """
     Implements operations for Columns of String type
 
@@ -115,6 +117,10 @@ class StringColumn(ColumnBase):
         "__truediv__",
         "__floordiv__",
     }
+    _VALID_SCANS = {
+        "cummin",
+        "cummax",
+    }
 
     def __init__(
         self,
@@ -327,6 +333,11 @@ def _with_type_metadata(self: Self, dtype: Dtype) -> Self:
             self._dtype = dtype
         return self
 
+    def _scan(self, op: str):
+        # Drop the "cum" prefix ("cummin" -> "min") before calling self.scan.
+        scanned = self.scan(op.replace("cum", ""), True)
+        return scanned._with_type_metadata(self.dtype)
+
     def as_numerical_column(self, dtype: np.dtype) -> NumericalColumn:
         if dtype.kind == "b":
             result = self.count_characters() > np.int8(0)
@@ -473,11 +484,20 @@ def to_pandas(
         if (
             cudf.get_option("mode.pandas_compatible")
             and isinstance(self.dtype, pd.StringDtype)
-            and "pyarrow" in self.dtype.storage
+            and self.dtype.storage in ["pyarrow", "python"]
         ):
-            pandas_array = self.dtype.__from_arrow__(
-                self.to_arrow().cast(pa.large_string())
-            )
+            if self.dtype.storage == "pyarrow":
+                pandas_array = self.dtype.__from_arrow__(
+                    self.to_arrow().cast(pa.large_string())
+                )
+            elif self.dtype.na_value is np.nan and PANDAS_GE_230:
+                pandas_array = pd.core.arrays.string_.StringArrayNumpySemantics._from_sequence(
+                    self.to_arrow()
+                )
+            else:
+                return super().to_pandas(
+                    nullable=nullable, arrow_type=arrow_type
+                )
             return pd.Index(pandas_array, copy=False)
         return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
 
diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py
@@ -960,7 +960,6 @@ def pytest_unconfigure(config):
     "tests/arrays/string_/test_string.py::test_min_max[string=string[pyarrow]-False-min]",
     "tests/arrays/string_/test_string.py::test_min_max[string=string[python]-False-max]",
     "tests/arrays/string_/test_string.py::test_min_max[string=string[python]-False-min]",
-    "tests/arrays/string_/test_string.py::test_numpy_array_ufunc[string=str[python]-Series]",
     "tests/arrays/string_/test_string.py::test_repr[pyarrow_numpy]",
     "tests/arrays/string_/test_string.py::test_repr[string=str[pyarrow]]",
     "tests/arrays/string_/test_string.py::test_repr[string=str[python]]",
@@ -3655,10 +3654,6 @@ def pytest_unconfigure(config):
     "tests/extension/test_string.py::TestStringArray::test_unary_ufunc_dunder_equivalence[pyarrow-True-positive]",
     "tests/extension/test_string.py::TestStringArray::test_unary_ufunc_dunder_equivalence[pyarrow_numpy-False-positive]",
     "tests/extension/test_string.py::TestStringArray::test_unary_ufunc_dunder_equivalence[pyarrow_numpy-True-positive]",
-    "tests/extension/test_string.py::TestStringArray::test_unique[string=str[python]-False-<lambda>-Series]",
-    "tests/extension/test_string.py::TestStringArray::test_unique[string=str[python]-False-unique-Series]",
-    "tests/extension/test_string.py::TestStringArray::test_unique[string=str[python]-True-<lambda>-Series]",
-    "tests/extension/test_string.py::TestStringArray::test_unique[string=str[python]-True-unique-Series]",
     "tests/extension/test_string.py::TestStringArray::test_unstack[pyarrow-False-frame-index1]",
     "tests/extension/test_string.py::TestStringArray::test_unstack[pyarrow-False-frame-index2]",
     "tests/extension/test_string.py::TestStringArray::test_unstack[pyarrow-False-frame-index3]",
@@ -3703,20 +3698,12 @@ def pytest_unconfigure(config):
     "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[pyarrow]-True-frame-index3]",
     "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[pyarrow]-True-series-index2]",
     "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[pyarrow]-True-series-index3]",
-    "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-False-frame-index0]",
-    "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-False-frame-index1]",
     "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-False-frame-index2]",
     "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-False-frame-index3]",
-    "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-False-series-index0]",
-    "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-False-series-index1]",
     "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-False-series-index2]",
     "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-False-series-index3]",
-    "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-frame-index0]",
-    "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-frame-index1]",
     "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-frame-index2]",
     "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-frame-index3]",
-    "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-series-index0]",
-    "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-series-index1]",
     "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-series-index2]",
     "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-series-index3]",
     "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-False-frame-index2]",
@@ -9246,36 +9233,6 @@ def pytest_unconfigure(config):
     "tests/series/test_constructors.py::TestSeriesConstructors::test_series_string_inference_scalar",
     "tests/series/test_constructors.py::TestSeriesConstructors::test_series_string_inference_storage_definition",
     "tests/series/test_constructors.py::TestSeriesConstructors::test_series_string_with_na_inference[None]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data0-cumsum-True-expected_data0]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data1-cumsum-False-expected_data1]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data10-cummin-True-expected_data10]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data11-cummin-False-expected_data11]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data12-cummin-True-expected_data12]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data13-cummin-False-expected_data13]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data14-cummin-True-expected_data14]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data15-cummin-False-expected_data15]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data16-cummin-True-expected_data16]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data17-cummin-False-expected_data17]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data18-cummin-True-expected_data18]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data19-cummin-False-expected_data19]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data2-cumsum-True-expected_data2]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data20-cummax-True-expected_data20]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data21-cummax-False-expected_data21]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data22-cummax-True-expected_data22]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data23-cummax-False-expected_data23]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data24-cummax-True-expected_data24]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data25-cummax-False-expected_data25]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data26-cummax-True-expected_data26]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data27-cummax-False-expected_data27]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data28-cummax-True-expected_data28]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data29-cummax-False-expected_data29]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data3-cumsum-False-expected_data3]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data4-cumsum-True-expected_data4]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data5-cumsum-False-expected_data5]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data6-cumsum-True-expected_data6]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data7-cumsum-False-expected_data7]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data8-cumsum-True-expected_data8]",
-    "tests/series/test_cumulative.py::TestSeriesCumulativeOps::test_cum_methods_ea_strings[string=str[python]-data9-cumsum-False-expected_data9]",
     "tests/series/test_formats.py::TestCategoricalRepr::test_categorical_series_repr_datetime",
     "tests/series/test_formats.py::TestCategoricalRepr::test_categorical_series_repr_datetime_ordered",
     "tests/series/test_formats.py::TestCategoricalRepr::test_categorical_series_repr_timedelta",
diff --git a/python/cudf/cudf/tests/series/methods/test_cumulative_methods.py b/python/cudf/cudf/tests/series/methods/test_cumulative_methods.py
@@ -123,3 +123,17 @@ def test_scan_boolean(method):
     expect = getattr(s.to_pandas(), method)()
 
     assert_eq(expect, got)
+
+
+def test_cummin_cummax_strings():
+    """Check string cummin/cummax on cudf against the pandas result."""
+    words = ["dog", "cat", "zebra", "ant", "bat"]
+    gpu_series = cudf.Series(words)
+    cpu_series = pd.Series(words)
+
+    # cummin is checked first, then cummax; each cudf result is
+    # compared with the same method applied to the pandas Series.
+    for method in ("cummin", "cummax"):
+        got = getattr(gpu_series, method)()
+        expected = getattr(cpu_series, method)()
+        assert_eq(got, expected)