Skip to content

Commit 1476a9f

Browse files
committed
[SPARK-40510][PS] Implement ddof in Series.cov
### What changes were proposed in this pull request?
Implement `ddof` in `Series.cov` by switching to `SF.covar`.

### Why are the changes needed?
For API coverage.

### Does this PR introduce _any_ user-facing change?
Yes, `ddof` is now supported:
```
>>> s1 = ps.Series([0.90010907, 0.13484424, 0.62036035])
>>> s2 = ps.Series([0.12528585, 0.26962463, 0.51111198])
>>> with ps.option_context("compute.ops_on_diff_frames", True):
...     s1.cov(s2)
-0.016857...
>>> with ps.option_context("compute.ops_on_diff_frames", True):
...     s1.cov(s2, ddof=2)
-0.033715...
```

### How was this patch tested?
Added unit tests.

Closes #37953 from zhengruifeng/ps_ser_cov.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
1 parent e5b4b32 commit 1476a9f

File tree

3 files changed

+43
-16
lines changed

3 files changed

+43
-16
lines changed

python/pyspark/pandas/frame.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9012,7 +9012,7 @@ def cov(self, min_periods: Optional[int] = None, ddof: int = 1) -> "DataFrame":
90129012
Minimum number of observations required per pair of columns
90139013
to have a valid result.
90149014
ddof : int, default 1
9015-
Delta degrees of freedom. The divisor used in calculations
9015+
Delta degrees of freedom. The divisor used in calculations
90169016
is ``N - ddof``, where ``N`` represents the number of elements.
90179017
90189018
.. versionadded:: 3.4.0

python/pyspark/pandas/series.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -987,7 +987,7 @@ def between(self, left: Any, right: Any, inclusive: Union[bool, str] = "both") -
987987

988988
return lmask & rmask
989989

990-
def cov(self, other: "Series", min_periods: Optional[int] = None) -> float:
990+
def cov(self, other: "Series", min_periods: Optional[int] = None, ddof: int = 1) -> float:
991991
"""
992992
Compute covariance with Series, excluding missing values.
993993
@@ -999,6 +999,11 @@ def cov(self, other: "Series", min_periods: Optional[int] = None) -> float:
999999
Series with which to compute the covariance.
10001000
min_periods : int, optional
10011001
Minimum number of observations needed to have a valid result.
1002+
ddof : int, default 1
1003+
Delta degrees of freedom. The divisor used in calculations
1004+
is ``N - ddof``, where ``N`` represents the number of elements.
1005+
1006+
.. versionadded:: 3.4.0
10021007
10031008
Returns
10041009
-------
@@ -1008,19 +1013,23 @@ def cov(self, other: "Series", min_periods: Optional[int] = None) -> float:
10081013
Examples
10091014
--------
10101015
>>> from pyspark.pandas.config import set_option, reset_option
1011-
>>> set_option("compute.ops_on_diff_frames", True)
10121016
>>> s1 = ps.Series([0.90010907, 0.13484424, 0.62036035])
10131017
>>> s2 = ps.Series([0.12528585, 0.26962463, 0.51111198])
1014-
>>> s1.cov(s2)
1015-
-0.016857626527158744
1016-
>>> reset_option("compute.ops_on_diff_frames")
1018+
>>> with ps.option_context("compute.ops_on_diff_frames", True):
1019+
... s1.cov(s2)
1020+
-0.016857...
1021+
>>> with ps.option_context("compute.ops_on_diff_frames", True):
1022+
... s1.cov(s2, ddof=2)
1023+
-0.033715...
10171024
"""
10181025
if not isinstance(other, Series):
10191026
raise TypeError("unsupported type: %s" % type(other))
10201027
if not np.issubdtype(self.dtype, np.number): # type: ignore[arg-type]
10211028
raise TypeError("unsupported dtype: %s" % self.dtype)
10221029
if not np.issubdtype(other.dtype, np.number): # type: ignore[arg-type]
10231030
raise TypeError("unsupported dtype: %s" % other.dtype)
1031+
if not isinstance(ddof, int):
1032+
raise TypeError("ddof must be integer")
10241033

10251034
min_periods = 1 if min_periods is None else min_periods
10261035

@@ -1035,7 +1044,8 @@ def cov(self, other: "Series", min_periods: Optional[int] = None) -> float:
10351044
if len(sdf.head(min_periods)) < min_periods:
10361045
return np.nan
10371046
else:
1038-
return sdf.select(F.covar_samp(*sdf.columns)).head(1)[0][0]
1047+
sdf = sdf.select(SF.covar(F.col(sdf.columns[0]), F.col(sdf.columns[1]), ddof))
1048+
return sdf.head(1)[0][0]
10391049

10401050
# TODO: NaN and None when ``arg`` is an empty dict
10411051
# TODO: Support ps.Series ``arg``

python/pyspark/pandas/tests/test_series.py

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3236,6 +3236,8 @@ def test_cov(self):
32363236
psdf["s1"].cov(psdf["s2"])
32373237
with self.assertRaisesRegex(TypeError, "unsupported dtype: object"):
32383238
psdf["s2"].cov(psdf["s1"])
3239+
with self.assertRaisesRegex(TypeError, "ddof must be integer"):
3240+
psdf["s2"].cov(psdf["s2"], ddof="ddof")
32393241

32403242
pdf = pd.DataFrame(
32413243
{
@@ -3258,17 +3260,32 @@ def test_cov(self):
32583260
def _test_cov(self, pdf):
32593261
psdf = ps.from_pandas(pdf)
32603262

3261-
pcov = pdf["s1"].cov(pdf["s2"])
3262-
pscov = psdf["s1"].cov(psdf["s2"])
3263-
self.assert_eq(pcov, pscov, almost=True)
3263+
self.assert_eq(pdf["s1"].cov(pdf["s2"]), psdf["s1"].cov(psdf["s2"]), almost=True)
3264+
self.assert_eq(
3265+
pdf["s1"].cov(pdf["s2"], ddof=2), psdf["s1"].cov(psdf["s2"], ddof=2), almost=True
3266+
)
32643267

3265-
pcov = pdf["s1"].cov(pdf["s2"], min_periods=3)
3266-
pscov = psdf["s1"].cov(psdf["s2"], min_periods=3)
3267-
self.assert_eq(pcov, pscov, almost=True)
3268+
self.assert_eq(
3269+
pdf["s1"].cov(pdf["s2"], min_periods=3),
3270+
psdf["s1"].cov(psdf["s2"], min_periods=3),
3271+
almost=True,
3272+
)
3273+
self.assert_eq(
3274+
pdf["s1"].cov(pdf["s2"], min_periods=3, ddof=-1),
3275+
psdf["s1"].cov(psdf["s2"], min_periods=3, ddof=-1),
3276+
almost=True,
3277+
)
32683278

3269-
pcov = pdf["s1"].cov(pdf["s2"], min_periods=4)
3270-
pscov = psdf["s1"].cov(psdf["s2"], min_periods=4)
3271-
self.assert_eq(pcov, pscov, almost=True)
3279+
self.assert_eq(
3280+
pdf["s1"].cov(pdf["s2"], min_periods=4),
3281+
psdf["s1"].cov(psdf["s2"], min_periods=4),
3282+
almost=True,
3283+
)
3284+
self.assert_eq(
3285+
pdf["s1"].cov(pdf["s2"], min_periods=4, ddof=3),
3286+
psdf["s1"].cov(psdf["s2"], min_periods=4, ddof=3),
3287+
almost=True,
3288+
)
32723289

32733290
def test_eq(self):
32743291
pser = pd.Series([1, 2, 3, 4, 5, 6], name="x")

0 commit comments

Comments
 (0)