Skip to content

Commit f73b58c

Browse files
authored
Remove ancillary methods. (#1027)
1 parent 2a30329 commit f73b58c

File tree

8 files changed: 171 additions and 770 deletions.

8 files changed: 171 additions and 770 deletions.

docs/reference/catalog_dataframe.rst

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,7 @@ Dataframe Methods
66
.. autosummary::
77
:toctree: api/
88

9-
Catalog.assign
10-
Catalog.dropna
119
Catalog.reduce
12-
Catalog.sort_nested_values
1310
Catalog.map_partitions
1411
Catalog.to_hats
1512
Catalog.compute

docs/tutorials/pre_executed/nestedframe.ipynb

Lines changed: 166 additions & 254 deletions
Large diffs are not rendered by default.

src/lsdb/catalog/catalog.py

Lines changed: 2 additions & 155 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import warnings
44
from pathlib import Path
5-
from typing import Any, Callable, Iterable, Literal, Type
5+
from typing import Any, Callable, Iterable, Type
66

77
import dask.dataframe as dd
88
import hats as hc
@@ -11,9 +11,7 @@
1111
from hats.catalog.catalog_collection import CatalogCollection
1212
from hats.catalog.healpix_dataset.healpix_dataset import HealpixDataset as HCHealpixDataset
1313
from hats.catalog.index.index_catalog import IndexCatalog as HCIndexCatalog
14-
from pandas._libs import lib
15-
from pandas._typing import AnyAll, Axis, IndexLabel, Renamer
16-
from pandas.api.extensions import no_default
14+
from pandas._typing import Renamer
1715
from typing_extensions import Self
1816
from upath import UPath
1917

@@ -146,33 +144,6 @@ def rename(self, columns: Renamer) -> Catalog:
146144
catalog.margin = self.margin.rename(columns)
147145
return catalog
148146

149-
def assign(self, **kwargs) -> Catalog:
150-
"""Assigns new columns to a catalog
151-
152-
Args:
153-
**kwargs: Arguments to pass to the assign method. This dictionary
154-
should contain the column names as keys and either a
155-
function or a 1-D Dask array as their corresponding value.
156-
157-
Returns:
158-
The catalog containing both the old columns and the newly created columns
159-
160-
Examples:
161-
Create a new column using a function::
162-
163-
catalog = Catalog(...)
164-
catalog = catalog.assign(new_col=lambda df: df['existing_col'] * 2)
165-
166-
Add a column from a 1-D Dask array::
167-
168-
import dask.array as da
169-
new_data = da.arange(...)
170-
catalog = catalog.assign(new_col=new_data)
171-
"""
172-
self._check_unloaded_columns(list(kwargs.keys()))
173-
ddf = self._ddf.assign(**kwargs)
174-
return self._create_updated_dataset(ddf=ddf)
175-
176147
def crossmatch(
177148
self,
178149
other: Catalog,
@@ -951,81 +922,6 @@ def nest_lists(
951922
)
952923
return catalog
953924

954-
def dropna(
955-
self,
956-
*,
957-
axis: Axis = 0,
958-
how: AnyAll | lib.NoDefault = no_default,
959-
thresh: int | lib.NoDefault = no_default,
960-
on_nested: bool = False,
961-
subset: IndexLabel | None = None,
962-
ignore_index: bool = False,
963-
) -> Catalog:
964-
"""Remove missing values for one layer of nested columns in the catalog.
965-
966-
Parameters
967-
----------
968-
axis : {0 or 'index', 1 or 'columns'}, default 0
969-
Determine if rows or columns which contain missing values are
970-
removed.
971-
972-
* 0, or 'index' : Drop rows which contain missing values.
973-
* 1, or 'columns' : Drop columns which contain missing value.
974-
975-
Only a single axis is allowed.
976-
977-
how : {'any', 'all'}, default 'any'
978-
Determine if row or column is removed from catalog, when we have
979-
at least one NA or all NA.
980-
981-
* 'any' : If any NA values are present, drop that row or column.
982-
* 'all' : If all values are NA, drop that row or column.
983-
thresh : int, optional
984-
Require that many non-NA values. Cannot be combined with how.
985-
on_nested : str or bool, optional
986-
If not False, applies the call to the nested dataframe in the
987-
column with label equal to the provided string. If specified,
988-
the nested dataframe should align with any columns given in
989-
`subset`.
990-
subset : column label or sequence of labels, optional
991-
Labels along other axis to consider, e.g. if you are dropping rows
992-
these would be a list of columns to include.
993-
994-
Access nested columns using `nested_df.nested_col` (where
995-
`nested_df` refers to a particular nested dataframe and
996-
`nested_col` is a column of that nested dataframe).
997-
ignore_index : bool, default ``False``
998-
If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
999-
1000-
Returns
1001-
-------
1002-
Catalog
1003-
Catalog with NA entries dropped from it.
1004-
1005-
Notes
1006-
-----
1007-
Operations that target a particular nested structure return a dataframe
1008-
with rows of that particular nested structure affected.
1009-
1010-
Values for `on_nested` and `subset` should be consistent in pointing
1011-
to a single layer, multi-layer operations are not supported at this
1012-
time.
1013-
"""
1014-
self._check_unloaded_columns(subset)
1015-
catalog = super().dropna(
1016-
axis=axis, how=how, thresh=thresh, on_nested=on_nested, subset=subset, ignore_index=ignore_index
1017-
)
1018-
if self.margin is not None:
1019-
catalog.margin = self.margin.dropna(
1020-
axis=axis,
1021-
how=how,
1022-
thresh=thresh,
1023-
on_nested=on_nested,
1024-
subset=subset,
1025-
ignore_index=ignore_index,
1026-
)
1027-
return catalog
1028-
1029925
def reduce(self, func, *args, meta=None, append_columns=False, infer_nesting=True, **kwargs) -> Catalog:
1030926
"""
1031927
Takes a function and applies it to each top-level row of the Catalog.
@@ -1092,55 +988,6 @@ def reduce(self, func, *args, meta=None, append_columns=False, infer_nesting=Tru
1092988
)
1093989
return catalog
1094990

1095-
def sort_nested_values(
1096-
self,
1097-
by: str | list[str],
1098-
ascending: bool | list[bool] = True,
1099-
na_position: Literal["first"] | Literal["last"] = "last",
1100-
ignore_index: bool | None = False,
1101-
**options,
1102-
) -> Catalog:
1103-
# pylint: disable=duplicate-code
1104-
"""Sort nested columns for each row in the catalog.
1105-
1106-
Note that this does NOT sort rows, only nested values within rows.
1107-
1108-
Args:
1109-
by: str or list[str]
1110-
Column(s) to sort by.
1111-
ascending: bool or list[bool], optional
1112-
Sort ascending vs. descending. Defaults to True. Specify list for
1113-
multiple sort orders. If this is a list of bools, must match the
1114-
length of the `by`.
1115-
na_position: {‘last’, ‘first’}, optional
1116-
Puts NaNs at the beginning if ‘first’, puts NaN at the end if
1117-
‘last’. Defaults to ‘last’.
1118-
ignore_index: bool, optional
1119-
If True, the resulting axis will be labeled 0, 1, …, n - 1.
1120-
Defaults to False.
1121-
**options: keyword arguments, optional
1122-
Additional options to pass to the sorting function.
1123-
1124-
Returns:
1125-
A new catalog where the specified nested columns are sorted.
1126-
"""
1127-
catalog = super().sort_nested_values(
1128-
by=by,
1129-
ascending=ascending,
1130-
na_position=na_position,
1131-
ignore_index=ignore_index,
1132-
**options,
1133-
)
1134-
if self.margin is not None:
1135-
catalog.margin = self.margin.sort_nested_values(
1136-
by=by,
1137-
ascending=ascending,
1138-
na_position=na_position,
1139-
ignore_index=ignore_index,
1140-
**options,
1141-
)
1142-
return catalog
1143-
1144991
def to_hats(
1145992
self,
1146993
base_catalog_path: str | Path | UPath,

src/lsdb/catalog/dataset/healpix_dataset.py

Lines changed: 2 additions & 135 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import warnings
66
from collections.abc import Sequence
77
from pathlib import Path
8-
from typing import Callable, Iterable, Literal, Type, cast
8+
from typing import Callable, Iterable, Type
99

1010
import astropy
1111
import dask
@@ -22,12 +22,9 @@
2222
from hats.inspection.visualize_catalog import get_fov_moc_from_wcs, initialize_wcs_axes
2323
from hats.pixel_math import HealpixPixel
2424
from hats.pixel_math.healpix_pixel_function import get_pixel_argsort
25-
from hats.pixel_math.spatial_index import SPATIAL_INDEX_COLUMN
2625
from matplotlib.figure import Figure
2726
from mocpy import MOC
28-
from pandas._libs import lib
29-
from pandas._typing import AnyAll, Axis, IndexLabel, Renamer
30-
from pandas.api.extensions import no_default
27+
from pandas._typing import Renamer
3128
from typing_extensions import Self
3229
from upath import UPath
3330

@@ -855,91 +852,6 @@ def write_catalog(
855852
**kwargs,
856853
)
857854

858-
def dropna(
859-
self,
860-
*,
861-
axis: Axis = 0,
862-
how: AnyAll | lib.NoDefault = no_default,
863-
thresh: int | lib.NoDefault = no_default,
864-
on_nested: bool = False,
865-
subset: IndexLabel | None = None,
866-
ignore_index: bool = False,
867-
) -> Self: # type: ignore[name-defined] # noqa: F821:
868-
"""
869-
Remove missing values for one layer of nested columns in the catalog.
870-
871-
Parameters
872-
----------
873-
axis : {0 or 'index', 1 or 'columns'}, default 0
874-
Determine if rows or columns which contain missing values are
875-
removed.
876-
877-
* 0, or 'index' : Drop rows which contain missing values.
878-
* 1, or 'columns' : Drop columns which contain missing value.
879-
880-
Only a single axis is allowed.
881-
882-
how : {'any', 'all'}, default 'any'
883-
Determine if row or column is removed from catalog, when we have
884-
at least one NA or all NA.
885-
886-
* 'any' : If any NA values are present, drop that row or column.
887-
* 'all' : If all values are NA, drop that row or column.
888-
thresh : int, optional
889-
Require that many non-NA values. Cannot be combined with how.
890-
on_nested : str or bool, optional
891-
If not False, applies the call to the nested dataframe in the
892-
column with label equal to the provided string. If specified,
893-
the nested dataframe should align with any columns given in
894-
`subset`.
895-
subset : column label or sequence of labels, optional
896-
Labels along other axis to consider, e.g. if you are dropping rows
897-
these would be a list of columns to include.
898-
899-
Access nested columns using `nested_df.nested_col` (where
900-
`nested_df` refers to a particular nested dataframe and
901-
`nested_col` is a column of that nested dataframe).
902-
ignore_index : bool, default ``False``
903-
If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
904-
905-
.. versionadded:: 2.0.0
906-
907-
Returns
908-
-------
909-
Catalog
910-
Catalog with NA entries dropped from it.
911-
912-
Notes
913-
-----
914-
Operations that target a particular nested structure return a dataframe
915-
with rows of that particular nested structure affected.
916-
917-
Values for `on_nested` and `subset` should be consistent in pointing
918-
to a single layer, multi-layer operations are not supported at this
919-
time.
920-
"""
921-
922-
def drop_na_part(df: npd.NestedFrame):
923-
if df.index.name == SPATIAL_INDEX_COLUMN:
924-
df = df.reset_index()
925-
df = cast(
926-
npd.NestedFrame,
927-
df.dropna(
928-
axis=axis,
929-
how=how,
930-
thresh=thresh,
931-
on_nested=on_nested,
932-
subset=subset,
933-
ignore_index=ignore_index,
934-
),
935-
)
936-
if SPATIAL_INDEX_COLUMN in df.columns:
937-
df = df.set_index(SPATIAL_INDEX_COLUMN)
938-
return df
939-
940-
ndf = self._ddf.map_partitions(drop_na_part, meta=self._ddf._meta)
941-
return self._create_updated_dataset(ddf=ndf)
942-
943855
def nest_lists(
944856
self,
945857
base_columns: list[str] | None = None,
@@ -1170,48 +1082,3 @@ def plot_points(
11701082
fig=fig,
11711083
**kwargs,
11721084
)
1173-
1174-
def sort_nested_values(
1175-
self,
1176-
by: str | list[str],
1177-
ascending: bool | list[bool] = True,
1178-
na_position: Literal["first"] | Literal["last"] = "last",
1179-
ignore_index: bool | None = False,
1180-
**options,
1181-
) -> Self:
1182-
"""Sort nested columns for each row in the catalog.
1183-
1184-
Args:
1185-
by: str or list[str]
1186-
Column(s) to sort by.
1187-
ascending: bool or list[bool], optional
1188-
Sort ascending vs. descending. Defaults to True. Specify list for
1189-
multiple sort orders. If this is a list of bools, must match the
1190-
length of the by.
1191-
na_position: {‘last’, ‘first’}, optional
1192-
Puts NaNs at the beginning if ‘first’, puts NaN at the end if
1193-
‘last’. Defaults to ‘last’.
1194-
ignore_index: bool, optional
1195-
If True, the resulting axis will be labeled 0, 1, …, n - 1.
1196-
Defaults to False.
1197-
**options: keyword arguments, optional
1198-
Additional options to pass to the sorting function.
1199-
1200-
Returns:
1201-
A new catalog where the specified nested columns are sorted.
1202-
"""
1203-
if isinstance(by, str):
1204-
by = [by]
1205-
self._check_unloaded_columns(by)
1206-
# Check "by" columns for hierarchical references
1207-
for col in by:
1208-
if not self._ddf._is_known_hierarchical_column(col):
1209-
raise ValueError(f"{col} not found in nested columns")
1210-
ndf = self._ddf.sort_values(
1211-
by=by,
1212-
ascending=ascending,
1213-
na_position=na_position,
1214-
ignore_index=ignore_index,
1215-
**options,
1216-
)
1217-
return self._create_updated_dataset(ddf=ndf)

0 commit comments

Comments
 (0)