Skip to content

Commit 4030356

Browse files
kunalgosar and devin-petersohn
authored and committed
[DataFrame] Implements filter and dropna (#1959)
* implement filter
* begin implementation of dropna
* implement dropna
* docs and tests
* resolving comments
* resolving merge
* add error checking to dropna
* fix update inplace call
* Implement multiple axis for dropna (#13)
  * Implement multiple axis for dropna
  * Add multiple axis dropna test
  * Fix using dummy_frame in dropna
  * Clean up dropna multiple axis tests
  * remove unnecessary axis modification
  * Clean up dropna tests
* resolve comments
* fix lint
1 parent 22d4950 commit 4030356

File tree

3 files changed

+248
-11
lines changed

3 files changed

+248
-11
lines changed

python/ray/dataframe/dataframe.py

Lines changed: 133 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from pandas._libs import lib
1010
from pandas.core.dtypes.cast import maybe_upcast_putmask
1111
from pandas import compat
12-
from pandas.compat import lzip, string_types, cPickle as pkl
12+
from pandas.compat import lzip, to_str, string_types, cPickle as pkl
1313
import pandas.core.common as com
1414
from pandas.core.dtypes.common import (
1515
is_bool_dtype,
@@ -756,7 +756,8 @@ def transpose(self, *args, **kwargs):
756756

757757
T = property(transpose)
758758

759-
def dropna(self, axis=0, how='any', thresh=None, subset=None,
           inplace=False):
    """Create a new DataFrame from the removed NA values from this one.

    Args:
        axis (int, str, or iterable of those): Axis (or axes) along
            which to drop labels containing NA values.
        how ('any' or 'all'): Drop a label when any/all of its values
            are NA.
        thresh (int): Keep only labels with at least this many non-NA
            values; overrides `how` when given.
        subset: Labels on the opposite axis that restrict the NA check.
        inplace (bool): Whether to mutate this DataFrame in place.

    Returns:
        If inplace is set to True, returns None, otherwise returns a new
        DataFrame with the dropna applied.
    """
    inplace = validate_bool_kwarg(inplace, "inplace")

    if is_list_like(axis):
        # Multiple axes requested: normalize each one, then apply
        # dropna once per axis in sequence.
        normalized_axes = [pd.DataFrame()._get_axis_number(ax)
                           for ax in axis]

        # TODO(kunalgosar): this builds an intermediate dataframe,
        # which does unnecessary computation
        intermediate = self
        for single_axis in normalized_axes:
            intermediate = intermediate.dropna(
                axis=single_axis, how=how, thresh=thresh, subset=subset)
        if not inplace:
            return intermediate

        self._update_inplace(
            block_partitions=intermediate._block_partitions,
            columns=intermediate.columns,
            index=intermediate.index)
        return None

    axis = pd.DataFrame()._get_axis_number(axis)

    if how is not None and how not in ['any', 'all']:
        raise ValueError('invalid how option: %s' % how)
    if how is None and thresh is None:
        raise TypeError('must specify how or thresh')

    if subset is not None:
        # Restrict the check to labels on the opposite axis, keeping
        # the order in which they appear in this DataFrame.
        wanted = set(subset)
        opposite = self.index if axis == 1 else self.columns
        subset = [label for label in opposite if label in wanted]

    def dropna_helper(df):
        # Run pandas dropna on a single partition and also return the
        # labels that survived so global positions can be rebuilt.
        surviving = df.dropna(axis=axis, how=how, thresh=thresh,
                              subset=subset, inplace=False)

        if axis == 1:
            kept_labels = surviving.columns
            surviving.columns = pd.RangeIndex(0, len(surviving.columns))
        else:
            kept_labels = surviving.index
            surviving.reset_index(drop=True, inplace=True)

        return surviving, kept_labels

    partitions = self._col_partitions if axis == 1 \
        else self._row_partitions
    submitted = [_deploy_func._submit(args=(dropna_helper, part),
                                      num_return_vals=2)
                 for part in partitions]
    new_parts, kept_per_part = [list(t) for t in zip(*submitted)]

    metadata = self._col_metadata if axis == 1 else self._row_metadata
    global_positions = [metadata.get_global_indices(i, vals)
                        for i, vals in enumerate(ray.get(kept_per_part))]

    # This flattens the 2d array to 1d
    flat_positions = [pos for chunk in global_positions for pos in chunk]

    if axis == 1:
        new_cols = self.columns[flat_positions]

        if not inplace:
            return DataFrame(col_partitions=new_parts,
                             columns=new_cols,
                             index=self.index)

        self._update_inplace(col_partitions=new_parts,
                             columns=new_cols,
                             index=self.index)
    else:
        new_rows = self.index[flat_positions]

        if not inplace:
            return DataFrame(row_partitions=new_parts,
                             index=new_rows,
                             columns=self.columns)

        self._update_inplace(row_partitions=new_parts,
                             index=new_rows,
                             columns=self.columns)

    return None
778866

779867
def add(self, other, axis='columns', level=None, fill_value=None):
780868
"""Add this DataFrame to another or a scalar/list.
@@ -1797,9 +1885,45 @@ def fillna(self, value=None, method=None, axis=None, inplace=False,
17971885
return new_obj
17981886

17991887
def filter(self, items=None, like=None, regex=None, axis=None):
    """Subset rows or columns based on their labels.

    Args:
        items (list): list of labels to subset
        like (string): retain labels where `arg in label == True`
        regex (string): retain labels matching regex input
        axis: axis to filter on

    Returns:
        A new dataframe with the filter applied.
    """
    num_keywords = com._count_not_none(items, like, regex)
    if num_keywords > 1:
        raise TypeError('Keyword arguments `items`, `like`, or `regex` '
                        'are mutually exclusive')
    if num_keywords == 0:
        raise TypeError('Must pass either `items`, `like`, or `regex`')

    if axis is None:
        # Columns are the default info axis for dataframes.
        axis = 'columns'

    axis = pd.DataFrame()._get_axis_number(axis)
    labels = self.columns if axis else self.index

    if items is not None:
        keep_mask = labels.isin(items)
    elif like is not None:
        keep_mask = labels.map(
            lambda label: like in to_str(label)).tolist()
    else:
        pattern = re.compile(regex)
        keep_mask = labels.map(
            lambda label: pattern.search(to_str(label)) is not None
        ).tolist()

    if not axis:
        return self[keep_mask]
    return self[self.columns[keep_mask]]
18031927

18041928
def first(self, offset):
18051929
raise NotImplementedError(
@@ -3990,7 +4114,9 @@ def _getitem_array(self, key):
39904114
index=index)
39914115
else:
39924116
columns = self._col_metadata[key].index
3993-
indices_for_rows = [col for col in self.col if col in set(columns)]
4117+
indices_for_rows = \
4118+
[i for i, item in enumerate(self.columns)
4119+
if item in set(columns)]
39944120

39954121
new_parts = [_deploy_func.remote(
39964122
lambda df: df.__getitem__(indices_for_rows),

python/ray/dataframe/index_metadata.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,13 @@ def insert(self, key, loc=None, partition=None,
271271
# Return inserted coordinate for callee
272272
return coord_to_insert
273273

274+
def get_global_indices(self, partition, index_within_partition_list):
    """Translate partition-local positions into global positions.

    Args:
        partition (int): Index of the partition the positions belong to.
        index_within_partition_list: Iterable of integer positions that
            are local to that partition.

    Returns:
        A list of the same positions shifted by the combined length of
        all partitions preceding `partition`.
    """
    # The global offset is the total length of every earlier partition.
    offset = sum(self._lengths[:partition])
    return [offset + i for i in index_within_partition_list]
280+
274281
def squeeze(self, partition, index_within_partition):
275282
"""Prepare a single coordinate for removal by "squeezing" the
276283
subsequent coordinates "up" one index within that partition. To be used

python/ray/dataframe/test/test_dataframe.py

Lines changed: 108 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,11 @@ def test_int_dataframe():
190190
'col3',
191191
'col4']
192192

193+
filter_by = {'items': ['col1', 'col5'],
194+
'regex': '4$|3$',
195+
'like': 'col'}
196+
197+
test_filter(ray_df, pandas_df, filter_by)
193198
test_roundtrip(ray_df, pandas_df)
194199
test_index(ray_df, pandas_df)
195200
test_size(ray_df, pandas_df)
@@ -348,6 +353,11 @@ def test_float_dataframe():
348353
'col3',
349354
'col4']
350355

356+
filter_by = {'items': ['col1', 'col5'],
357+
'regex': '4$|3$',
358+
'like': 'col'}
359+
360+
test_filter(ray_df, pandas_df, filter_by)
351361
test_roundtrip(ray_df, pandas_df)
352362
test_index(ray_df, pandas_df)
353363
test_size(ray_df, pandas_df)
@@ -506,6 +516,11 @@ def test_mixed_dtype_dataframe():
506516
'col3',
507517
'col4']
508518

519+
filter_by = {'items': ['col1', 'col5'],
520+
'regex': '4$|3$',
521+
'like': 'col'}
522+
523+
test_filter(ray_df, pandas_df, filter_by)
509524
test_roundtrip(ray_df, pandas_df)
510525
test_index(ray_df, pandas_df)
511526
test_size(ray_df, pandas_df)
@@ -664,6 +679,11 @@ def test_nan_dataframe():
664679
'col3',
665680
'col4']
666681

682+
filter_by = {'items': ['col1', 'col5'],
683+
'regex': '4$|3$',
684+
'like': 'col'}
685+
686+
test_filter(ray_df, pandas_df, filter_by)
667687
test_roundtrip(ray_df, pandas_df)
668688
test_index(ray_df, pandas_df)
669689
test_size(ray_df, pandas_df)
@@ -798,6 +818,23 @@ def test_nan_dataframe():
798818
test_transform(ray_df, pandas_df)
799819

800820

821+
def test_dense_nan_df():
    # Same NA-heavy values for both frames; pandas copies input data,
    # so sharing the literal is safe.
    data = [[np.nan, 2, np.nan, 0],
            [3, 4, np.nan, 1],
            [np.nan, np.nan, np.nan, 5]]
    ray_df = rdf.DataFrame(data, columns=list('ABCD'))
    pd_df = pd.DataFrame(data, columns=list('ABCD'))

    test_dropna(ray_df, pd_df)
    test_dropna_inplace(ray_df, pd_df)
    test_dropna_multiple_axes(ray_df, pd_df)
    test_dropna_multiple_axes_inplace(ray_df, pd_df)
837+
801838
@pytest.fixture
802839
def test_inter_df_math(op, simple=False):
803840
ray_df = rdf.DataFrame({"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7],
@@ -1252,6 +1289,68 @@ def test_drop_duplicates():
12521289
ray_df.drop_duplicates()
12531290

12541291

1292+
@pytest.fixture
def test_dropna(ray_df, pd_df):
    # Compare Ray vs. pandas dropna for each kwargs combination, in the
    # same order the original assertions ran.
    for kwargs in ({'axis': 1, 'how': 'all'},
                   {'axis': 1, 'how': 'any'},
                   {'axis': 0, 'how': 'all'},
                   {'thresh': 2}):
        assert ray_df_equals_pandas(ray_df.dropna(**kwargs),
                                    pd_df.dropna(**kwargs))
1306+
1307+
@pytest.fixture
def test_dropna_inplace(ray_df, pd_df):
    # Work on copies so the caller's frames are untouched; apply the
    # same in-place drops to both and compare after each one.
    ray_df = ray_df.copy()
    pd_df = pd_df.copy()

    for kwargs in ({'thresh': 2}, {'axis': 1, 'how': 'any'}):
        ray_df.dropna(inplace=True, **kwargs)
        pd_df.dropna(inplace=True, **kwargs)
        assert ray_df_equals_pandas(ray_df, pd_df)
1321+
1322+
1323+
@pytest.fixture
def test_dropna_multiple_axes(ray_df, pd_df):
    # Multiple axes may be passed as a list or a tuple; both forms must
    # match pandas.
    for axes in ([0, 1], (0, 1)):
        assert ray_df_equals_pandas(
            ray_df.dropna(how='all', axis=axes),
            pd_df.dropna(how='all', axis=axes)
        )
1333+
1334+
1335+
@pytest.fixture
def test_dropna_multiple_axes_inplace(ray_df, pd_df):
    # Fresh copies per axis spelling (list and tuple) so each in-place
    # drop starts from the original frames.
    for axes in ([0, 1], (0, 1)):
        ray_copy = ray_df.copy()
        pd_copy = pd_df.copy()

        ray_copy.dropna(how='all', axis=axes, inplace=True)
        pd_copy.dropna(how='all', axis=axes, inplace=True)

        assert ray_df_equals_pandas(ray_copy, pd_copy)
1352+
1353+
12551354
def test_duplicated():
12561355
ray_df = create_test_dataframe()
12571356

@@ -1747,11 +1846,16 @@ def test_fillna_datetime_columns(num_partitions=2):
17471846
"""
17481847

17491848

1750-
@pytest.fixture
def test_filter(ray_df, pandas_df, by):
    """Check DataFrame.filter against pandas for items/regex/like kwargs.

    Args:
        ray_df: the Ray DataFrame under test.
        pandas_df: the equivalent pandas DataFrame.
        by (dict): filter arguments keyed by 'items', 'regex', 'like'.
    """
    # BUG FIX: the comparison results were previously discarded, so the
    # test could never fail — assert them.
    assert ray_df_equals_pandas(ray_df.filter(items=by['items']),
                                pandas_df.filter(items=by['items']))

    assert ray_df_equals_pandas(ray_df.filter(regex=by['regex']),
                                pandas_df.filter(regex=by['regex']))

    assert ray_df_equals_pandas(ray_df.filter(like=by['like']),
                                pandas_df.filter(like=by['like']))
17551859

17561860

17571861
def test_first():

0 commit comments

Comments
 (0)