[SPARK-40510][PS] Implement ddof in Series.cov

zhengruifeng · zhengruifeng · commit 1476a9f96e58 · 2022-09-22T17:51:58.000+08:00
### What changes were proposed in this pull request? Implement `ddof` in `Series.cov` by switching to `SF.covar`. ### Why are the changes needed? For API coverage. ### Does this PR introduce _any_ user-facing change? Yes, `ddof` is now supported: ``` >>> s1 = ps.Series([0.90010907, 0.13484424, 0.62036035]) >>> s2 = ps.Series([0.12528585, 0.26962463, 0.51111198]) >>> with ps.option_context("compute.ops_on_diff_frames", True): ... s1.cov(s2) -0.016857... >>> with ps.option_context("compute.ops_on_diff_frames", True): ... s1.cov(s2, ddof=2) -0.033715... ``` ### How was this patch tested? Added unit tests. Closes #37953 from zhengruifeng/ps_ser_cov. Authored-by: Ruifeng Zheng <ruifengz@apache.org> Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
@@ -9012,7 +9012,7 @@ def cov(self, min_periods: Optional[int] = None, ddof: int = 1) -> "DataFrame":
             Minimum number of observations required per pair of columns
             to have a valid result.
         ddof : int, default 1
-            Delta degrees of freedom.  The divisor used in calculations
+            Delta degrees of freedom. The divisor used in calculations
             is ``N - ddof``, where ``N`` represents the number of elements.
 
             .. versionadded:: 3.4.0
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
@@ -987,7 +987,7 @@ def between(self, left: Any, right: Any, inclusive: Union[bool, str] = "both") -
 
         return lmask & rmask
 
-    def cov(self, other: "Series", min_periods: Optional[int] = None) -> float:
+    def cov(self, other: "Series", min_periods: Optional[int] = None, ddof: int = 1) -> float:
         """
         Compute covariance with Series, excluding missing values.
 
@@ -999,6 +999,11 @@ def cov(self, other: "Series", min_periods: Optional[int] = None) -> float:
             Series with which to compute the covariance.
         min_periods : int, optional
             Minimum number of observations needed to have a valid result.
+        ddof : int, default 1
+            Delta degrees of freedom. The divisor used in calculations
+            is ``N - ddof``, where ``N`` represents the number of elements.
+
+            .. versionadded:: 3.4.0
 
         Returns
         -------
@@ -1008,19 +1013,23 @@ def cov(self, other: "Series", min_periods: Optional[int] = None) -> float:
         Examples
         --------
         >>> from pyspark.pandas.config import set_option, reset_option
-        >>> set_option("compute.ops_on_diff_frames", True)
         >>> s1 = ps.Series([0.90010907, 0.13484424, 0.62036035])
         >>> s2 = ps.Series([0.12528585, 0.26962463, 0.51111198])
-        >>> s1.cov(s2)
-        -0.016857626527158744
-        >>> reset_option("compute.ops_on_diff_frames")
+        >>> with ps.option_context("compute.ops_on_diff_frames", True):
+        ...     s1.cov(s2)
+        -0.016857...
+        >>> with ps.option_context("compute.ops_on_diff_frames", True):
+        ...     s1.cov(s2, ddof=2)
+        -0.033715...
         """
         if not isinstance(other, Series):
             raise TypeError("unsupported type: %s" % type(other))
         if not np.issubdtype(self.dtype, np.number):  # type: ignore[arg-type]
             raise TypeError("unsupported dtype: %s" % self.dtype)
         if not np.issubdtype(other.dtype, np.number):  # type: ignore[arg-type]
             raise TypeError("unsupported dtype: %s" % other.dtype)
+        if not isinstance(ddof, int):
+            raise TypeError("ddof must be integer")
 
         min_periods = 1 if min_periods is None else min_periods
 
@@ -1035,7 +1044,8 @@ def cov(self, other: "Series", min_periods: Optional[int] = None) -> float:
         if len(sdf.head(min_periods)) < min_periods:
             return np.nan
         else:
-            return sdf.select(F.covar_samp(*sdf.columns)).head(1)[0][0]
+            sdf = sdf.select(SF.covar(F.col(sdf.columns[0]), F.col(sdf.columns[1]), ddof))
+            return sdf.head(1)[0][0]
 
     # TODO: NaN and None when ``arg`` is an empty dict
     # TODO: Support ps.Series ``arg``
diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py
@@ -3236,6 +3236,8 @@ def test_cov(self):
             psdf["s1"].cov(psdf["s2"])
         with self.assertRaisesRegex(TypeError, "unsupported dtype: object"):
             psdf["s2"].cov(psdf["s1"])
+        with self.assertRaisesRegex(TypeError, "ddof must be integer"):
+            psdf["s2"].cov(psdf["s2"], ddof="ddof")
 
         pdf = pd.DataFrame(
             {
@@ -3258,17 +3260,32 @@ def test_cov(self):
     def _test_cov(self, pdf):
         psdf = ps.from_pandas(pdf)
 
-        pcov = pdf["s1"].cov(pdf["s2"])
-        pscov = psdf["s1"].cov(psdf["s2"])
-        self.assert_eq(pcov, pscov, almost=True)
+        self.assert_eq(pdf["s1"].cov(pdf["s2"]), psdf["s1"].cov(psdf["s2"]), almost=True)
+        self.assert_eq(
+            pdf["s1"].cov(pdf["s2"], ddof=2), psdf["s1"].cov(psdf["s2"], ddof=2), almost=True
+        )
 
-        pcov = pdf["s1"].cov(pdf["s2"], min_periods=3)
-        pscov = psdf["s1"].cov(psdf["s2"], min_periods=3)
-        self.assert_eq(pcov, pscov, almost=True)
+        self.assert_eq(
+            pdf["s1"].cov(pdf["s2"], min_periods=3),
+            psdf["s1"].cov(psdf["s2"], min_periods=3),
+            almost=True,
+        )
+        self.assert_eq(
+            pdf["s1"].cov(pdf["s2"], min_periods=3, ddof=-1),
+            psdf["s1"].cov(psdf["s2"], min_periods=3, ddof=-1),
+            almost=True,
+        )
 
-        pcov = pdf["s1"].cov(pdf["s2"], min_periods=4)
-        pscov = psdf["s1"].cov(psdf["s2"], min_periods=4)
-        self.assert_eq(pcov, pscov, almost=True)
+        self.assert_eq(
+            pdf["s1"].cov(pdf["s2"], min_periods=4),
+            psdf["s1"].cov(psdf["s2"], min_periods=4),
+            almost=True,
+        )
+        self.assert_eq(
+            pdf["s1"].cov(pdf["s2"], min_periods=4, ddof=3),
+            psdf["s1"].cov(psdf["s2"], min_periods=4, ddof=3),
+            almost=True,
+        )
 
     def test_eq(self):
         pser = pd.Series([1, 2, 3, 4, 5, 6], name="x")