Skip to content

Commit 1f82a46

Browse files
SaladRaider authored and devin-petersohn committed
[DataFrame] Implements df.update (#1997)
* Working on fixing update
* Fixing update implementation
* Adding test
* Addressing comments
1 parent 12da021 commit 1f82a46

File tree

2 files changed

+61
-21
lines changed

2 files changed

+61
-21
lines changed

python/ray/dataframe/dataframe.py

Lines changed: 47 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -4415,9 +4415,34 @@ def unstack(self, level=-1, fill_value=None):
44154415

44164416
def update(self, other, join='left', overwrite=True, filter_func=None,
44174417
raise_conflict=False):
4418-
raise NotImplementedError(
4419-
"To contribute to Pandas on Ray, please visit "
4420-
"github.com/ray-project/ray.")
4418+
"""Modify DataFrame in place using non-NA values from other.
4419+
4420+
Args:
4421+
other: DataFrame, or object coercible into a DataFrame
4422+
join: {'left'}, default 'left'
4423+
overwrite: If True then overwrite values for common keys in frame
4424+
filter_func: Can choose to replace values other than NA.
4425+
raise_conflict: If True, will raise an error if the DataFrame and
4426+
other both contain data in the same place.
4427+
4428+
Returns:
4429+
None
4430+
"""
4431+
if raise_conflict:
4432+
raise NotImplementedError(
4433+
"raise_conflict parameter not yet supported. "
4434+
"To contribute to Pandas on Ray, please visit "
4435+
"github.com/ray-project/ray.")
4436+
4437+
if not isinstance(other, DataFrame):
4438+
other = DataFrame(other)
4439+
4440+
def update_helper(x, y):
4441+
x.update(y, join, overwrite, filter_func, False)
4442+
return x
4443+
4444+
self._inter_df_op_helper(update_helper, other, join, 0, None,
4445+
inplace=True)
44214446

44224447
def var(self, axis=None, skipna=None, level=None, ddof=1,
44234448
numeric_only=None, **kwargs):
@@ -4971,36 +4996,40 @@ def _operator_helper(self, func, other, axis, level, *args):
49714996
if isinstance(other, DataFrame):
49724997
return self._inter_df_op_helper(
49734998
lambda x, y: func(x, y, axis, level, *args),
4974-
other, axis, level)
4999+
other, "outer", axis, level)
49755000
else:
49765001
return self._single_df_op_helper(
49775002
lambda df: func(df, other, axis, level, *args),
49785003
other, axis, level)
49795004

4980-
def _inter_df_op_helper(self, func, other, axis, level):
5005+
def _inter_df_op_helper(self, func, other, how, axis, level,
5006+
inplace=False):
49815007
if level is not None:
49825008
raise NotImplementedError("Mutlilevel index not yet supported "
49835009
"in Pandas on Ray")
49845010
axis = pd.DataFrame()._get_axis_number(axis)
49855011

4986-
# Adding two DataFrames causes an outer join.
4987-
if isinstance(other, DataFrame):
4988-
new_column_index = self.columns.join(other.columns, how="outer")
4989-
new_index = self.index.join(other.index, how="outer")
4990-
copartitions = self._copartition(other, new_index)
4991-
4992-
new_blocks = \
4993-
np.array([_co_op_helper._submit(
4994-
args=tuple([func, self.columns, other.columns,
4995-
len(part[0]), None] +
4996-
np.concatenate(part).tolist()),
4997-
num_return_vals=len(part[0]))
4998-
for part in copartitions])
5012+
new_column_index = self.columns.join(other.columns, how=how)
5013+
new_index = self.index.join(other.index, how=how)
5014+
copartitions = self._copartition(other, new_index)
49995015

5016+
new_blocks = \
5017+
np.array([_co_op_helper._submit(
5018+
args=tuple([func, self.columns, other.columns,
5019+
len(part[0]), None] +
5020+
np.concatenate(part).tolist()),
5021+
num_return_vals=len(part[0]))
5022+
for part in copartitions])
5023+
5024+
if not inplace:
50005025
# TODO join the Index Metadata objects together for performance.
50015026
return DataFrame(block_partitions=new_blocks,
50025027
columns=new_column_index,
50035028
index=new_index)
5029+
else:
5030+
self._update_inplace(block_partitions=new_blocks,
5031+
columns=new_column_index,
5032+
index=new_index)
50045033

50055034
def _single_df_op_helper(self, func, other, axis, level):
50065035
if level is not None:

python/ray/dataframe/test/test_dataframe.py

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -3030,10 +3030,21 @@ def test_unstack():
30303030

30313031

30323032
def test_update():
3033-
ray_df = create_test_dataframe()
3033+
df = rdf.DataFrame([[1.5, np.nan, 3.],
3034+
[1.5, np.nan, 3.],
3035+
[1.5, np.nan, 3],
3036+
[1.5, np.nan, 3]])
30343037

3035-
with pytest.raises(NotImplementedError):
3036-
ray_df.update(None)
3038+
other = rdf.DataFrame([[3.6, 2., np.nan],
3039+
[np.nan, np.nan, 7]], index=[1, 3])
3040+
3041+
df.update(other)
3042+
3043+
expected = rdf.DataFrame([[1.5, np.nan, 3],
3044+
[3.6, 2, 3],
3045+
[1.5, np.nan, 3],
3046+
[1.5, np.nan, 7.]])
3047+
assert ray_df_equals(df, expected)
30373048

30383049

30393050
@pytest.fixture

0 commit comments

Comments
 (0)