diff --git a/docs/reference/catalog_dataframe.rst b/docs/reference/catalog_dataframe.rst index 0ee7597a5..a062a2bc1 100644 --- a/docs/reference/catalog_dataframe.rst +++ b/docs/reference/catalog_dataframe.rst @@ -6,10 +6,7 @@ Dataframe Methods .. autosummary:: :toctree: api/ - Catalog.assign - Catalog.dropna Catalog.reduce - Catalog.sort_nested_values Catalog.map_partitions Catalog.to_hats Catalog.compute diff --git a/docs/tutorials/pre_executed/nestedframe.ipynb b/docs/tutorials/pre_executed/nestedframe.ipynb index d620c1db6..d89e96a93 100644 --- a/docs/tutorials/pre_executed/nestedframe.ipynb +++ b/docs/tutorials/pre_executed/nestedframe.ipynb @@ -60,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 1, "id": "b2ff58f8", "metadata": {}, "outputs": [ @@ -69,27 +69,27 @@ "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 idradecablightcurveidradecablightcurve
12342.16693140.9503810.7203240.372520\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
12342.16693140.9503810.7203240.372520\n", " \n", " \n", " \n", @@ -115,13 +115,13 @@ "
t
422112.259323-70.8882480.1467561.077633\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
422112.259323-70.8882480.1467561.077633\n", " \n", " \n", " \n", @@ -147,13 +147,13 @@ "
t
024184.255785-8.8209460.4170220.184677\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
024184.255785-8.8209460.4170220.184677\n", " \n", " \n", " \n", @@ -179,13 +179,13 @@ "
t
23651.897461-10.4630700.0001140.691121\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
23651.897461-10.4630700.0001140.691121\n", " \n", " \n", " \n", @@ -211,13 +211,13 @@ "
t
346341.5138015.6923780.3023330.793535\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
346341.5138015.6923780.3023330.793535\n", " \n", " \n", " \n", @@ -262,7 +262,7 @@ "3 [{t: 17.562349, flux: 41.417927, band: 'g', fl... " ] }, - "execution_count": 100, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -288,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 2, "id": "f35f6a8a", "metadata": {}, "outputs": [ @@ -408,7 +408,7 @@ "9 1.067251 62.336012 r 3.116801" ] }, - "execution_count": 102, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -429,7 +429,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 3, "id": "4e060bea", "metadata": {}, "outputs": [ @@ -491,7 +491,7 @@ "Expr=RenameFrame(frame=Repartition(frame=MapPartitions(NestedFrame), new_partitions=1), columns={'nested': 'lightcurve'})" ] }, - "execution_count": 107, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -510,7 +510,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 4, "id": "95974178", "metadata": {}, "outputs": [ @@ -520,7 +520,7 @@ "['lightcurve']" ] }, - "execution_count": 108, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -549,7 +549,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 5, "id": "8a5d87d8", "metadata": {}, "outputs": [ @@ -564,7 +564,7 @@ "Name: flux, Length: 50, dtype: double[pyarrow]" ] }, - "execution_count": 109, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -584,7 +584,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 6, "id": "59e6167e", "metadata": {}, "outputs": [ @@ -593,27 +593,27 @@ "text/html": [ "\n", - "
t
\n", + "
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 idradecablightcurveidradecablightcurve
12342.16693140.9503810.7203240.372520\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
12342.16693140.9503810.7203240.372520\n", " \n", " \n", " \n", @@ -639,13 +639,13 @@ "
t
422112.259323-70.8882480.1467561.077633\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
422112.259323-70.8882480.1467561.077633\n", " \n", " \n", " \n", @@ -671,13 +671,13 @@ "
t
024184.255785-8.8209460.4170220.184677\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
024184.255785-8.8209460.4170220.184677\n", " \n", " \n", " \n", @@ -703,13 +703,13 @@ "
t
23651.897461-10.4630700.0001140.691121\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
23651.897461-10.4630700.0001140.691121\n", " \n", " \n", " \n", @@ -735,13 +735,13 @@ "
t
346341.5138015.6923780.3023330.793535\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
346341.5138015.6923780.3023330.793535\n", " \n", " \n", " \n", @@ -786,7 +786,7 @@ "3 [{t: 17.562349, flux: 41.417927, band: 'g', fl... " ] }, - "execution_count": 110, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -805,7 +805,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 7, "id": "57034f44", "metadata": {}, "outputs": [ @@ -814,36 +814,36 @@ "text/html": [ "\n", - "
t
\n", + "
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 idradecablightcurveidradecablightcurve
12342.16693140.9503810.7203240.372520None
422112.259323-70.8882480.1467561.077633\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
12342.16693140.9503810.7203240.372520None
422112.259323-70.8882480.1467561.077633\n", " \n", " \n", " \n", @@ -869,22 +869,22 @@ "
t
024184.255785-8.8209460.4170220.184677None024184.255785-8.8209460.4170220.184677None
23651.897461-10.4630700.0001140.691121\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
23651.897461-10.4630700.0001140.691121\n", " \n", " \n", " \n", @@ -910,13 +910,13 @@ "
t
346341.5138015.6923780.3023330.793535None346341.5138015.6923780.3023330.793535None
\n", @@ -938,7 +938,7 @@ "3 None " ] }, - "execution_count": 111, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -958,109 +958,21 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": null, "id": "926ba6f3", "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 idradecablightcurve
422112.259323-70.8882480.1467561.077633\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
tfluxbandflux_err
13.99516799.732285g4.986614
+0 rows.........
23651.897461-10.4630700.0001140.691121\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
tfluxbandflux_err
16.69251396.484005g4.8242
+0 rows.........
\n", - "2 rows x 6 columns" - ], - "text/plain": [ - " id ra dec a b \\\n", - "4 22 112.259323 -70.888248 0.146756 1.077633 \n", - "2 36 51.897461 -10.463070 0.000114 0.691121 \n", - "\n", - " lightcurve \n", - "4 [{t: 13.995167, flux: 99.732285, band: 'g', fl... \n", - "2 [{t: 16.692513, flux: 96.484005, band: 'g', fl... " - ] - }, - "execution_count": 112, - "metadata": {}, - "output_type": "execute_result" + "ename": "AttributeError", + "evalue": "'NestedFrame' object has no attribute 'map_partitions'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)", + "\u001b[32m/tmp/ipykernel_257475/2928617375.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m nf_highflux.map_partitions(\u001b[38;5;28;01mlambda\u001b[39;00m nf: nf.dropna(subset=\u001b[33m\"lightcurve\"\u001b[39m))\n", + "\u001b[32m~/.virtualenvs/sep/lib/python3.12/site-packages/pandas/core/generic.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m(self, name)\u001b[39m\n\u001b[32m 6314\u001b[39m \u001b[38;5;28;01mand\u001b[39;00m name \u001b[38;5;28;01mnot\u001b[39;00m \u001b[38;5;28;01min\u001b[39;00m self._accessors\n\u001b[32m 6315\u001b[39m \u001b[38;5;28;01mand\u001b[39;00m self._info_axis._can_hold_identifiers_and_holds_name(name)\n\u001b[32m 6316\u001b[39m ):\n\u001b[32m 6317\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m self[name]\n\u001b[32m-> \u001b[39m\u001b[32m6318\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m object.__getattribute__(self, name)\n", + "\u001b[31mAttributeError\u001b[39m: 'NestedFrame' object has no attribute 'map_partitions'" + ] } ], "source": [ @@ -1081,7 +993,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": null, "id": "2a5c1914", "metadata": {}, "outputs": [ @@ -1152,7 +1064,7 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": null, "id": "b39105b0", "metadata": {}, "outputs": [ @@ -1379,7 +1291,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": null, "id": "9e466113", "metadata": {}, "outputs": [ @@ -1472,7 +1384,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": null, "id": "1fcaea43", "metadata": {}, "outputs": [ @@ -1501,7 +1413,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": null, "id": "96262092", "metadata": {}, "outputs": [ @@ -1530,7 +1442,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": null, "id": "3fdd4d9c", "metadata": {}, "outputs": [ @@ -1626,7 +1538,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": null, "id": "d77e4f97", "metadata": {}, "outputs": [ @@ -1752,7 +1664,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": null, "id": "064bbfe0", "metadata": {}, "outputs": [ @@ -1831,7 +1743,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": null, "id": "abd01a79", "metadata": {}, "outputs": [ @@ -2011,7 +1923,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": null, "id": "90eb46ac", "metadata": {}, "outputs": [ @@ -2108,7 +2020,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": null, "id": "44545efc", "metadata": {}, "outputs": [ @@ -2257,7 +2169,7 @@ ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "sep", "language": "python", "name": "python3" }, @@ -2271,7 +2183,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.2" + "version": "3.12.3" } }, "nbformat": 4, diff --git a/src/lsdb/catalog/catalog.py b/src/lsdb/catalog/catalog.py index 5a3cc0029..f46e03141 100644 --- a/src/lsdb/catalog/catalog.py +++ b/src/lsdb/catalog/catalog.py @@ -2,7 +2,7 @@ import warnings from pathlib import Path -from typing import Any, Callable, Iterable, Literal, Type +from typing import Any, Callable, Iterable, Type import dask.dataframe as dd import hats as hc @@ -11,9 +11,7 @@ from hats.catalog.catalog_collection import CatalogCollection from hats.catalog.healpix_dataset.healpix_dataset import HealpixDataset as HCHealpixDataset from hats.catalog.index.index_catalog import IndexCatalog as HCIndexCatalog -from pandas._libs import lib -from pandas._typing import AnyAll, Axis, IndexLabel, Renamer -from pandas.api.extensions import no_default +from pandas._typing import Renamer from typing_extensions import Self from upath import UPath @@ -146,33 +144,6 @@ def rename(self, columns: Renamer) -> Catalog: catalog.margin = self.margin.rename(columns) return catalog - def assign(self, **kwargs) -> Catalog: - """Assigns new columns to a catalog - - Args: - **kwargs: Arguments to pass to the assign method. This dictionary - should contain the column names as keys and either a - function or a 1-D Dask array as their corresponding value. - - Returns: - The catalog containing both the old columns and the newly created columns - - Examples: - Create a new column using a function:: - - catalog = Catalog(...) - catalog = catalog.assign(new_col=lambda df: df['existing_col'] * 2) - - Add a column from a 1-D Dask array:: - - import dask.array as da - new_data = da.arange(...) - catalog = catalog.assign(new_col=new_data) - """ - self._check_unloaded_columns(list(kwargs.keys())) - ddf = self._ddf.assign(**kwargs) - return self._create_updated_dataset(ddf=ddf) - def crossmatch( self, other: Catalog, @@ -951,81 +922,6 @@ def nest_lists( ) return catalog - def dropna( - self, - *, - axis: Axis = 0, - how: AnyAll | lib.NoDefault = no_default, - thresh: int | lib.NoDefault = no_default, - on_nested: bool = False, - subset: IndexLabel | None = None, - ignore_index: bool = False, - ) -> Catalog: - """Remove missing values for one layer of nested columns in the catalog. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'}, default 0 - Determine if rows or columns which contain missing values are - removed. - - * 0, or 'index' : Drop rows which contain missing values. - * 1, or 'columns' : Drop columns which contain missing value. - - Only a single axis is allowed. - - how : {'any', 'all'}, default 'any' - Determine if row or column is removed from catalog, when we have - at least one NA or all NA. - - * 'any' : If any NA values are present, drop that row or column. - * 'all' : If all values are NA, drop that row or column. - thresh : int, optional - Require that many non-NA values. Cannot be combined with how. - on_nested : str or bool, optional - If not False, applies the call to the nested dataframe in the - column with label equal to the provided string. If specified, - the nested dataframe should align with any columns given in - `subset`. - subset : column label or sequence of labels, optional - Labels along other axis to consider, e.g. if you are dropping rows - these would be a list of columns to include. - - Access nested columns using `nested_df.nested_col` (where - `nested_df` refers to a particular nested dataframe and - `nested_col` is a column of that nested dataframe). - ignore_index : bool, default ``False`` - If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. - - Returns - ------- - Catalog - Catalog with NA entries dropped from it. - - Notes - ----- - Operations that target a particular nested structure return a dataframe - with rows of that particular nested structure affected. - - Values for `on_nested` and `subset` should be consistent in pointing - to a single layer, multi-layer operations are not supported at this - time. - """ - self._check_unloaded_columns(subset) - catalog = super().dropna( - axis=axis, how=how, thresh=thresh, on_nested=on_nested, subset=subset, ignore_index=ignore_index - ) - if self.margin is not None: - catalog.margin = self.margin.dropna( - axis=axis, - how=how, - thresh=thresh, - on_nested=on_nested, - subset=subset, - ignore_index=ignore_index, - ) - return catalog - def reduce(self, func, *args, meta=None, append_columns=False, infer_nesting=True, **kwargs) -> Catalog: """ Takes a function and applies it to each top-level row of the Catalog. @@ -1092,55 +988,6 @@ def reduce(self, func, *args, meta=None, append_columns=False, infer_nesting=Tru ) return catalog - def sort_nested_values( - self, - by: str | list[str], - ascending: bool | list[bool] = True, - na_position: Literal["first"] | Literal["last"] = "last", - ignore_index: bool | None = False, - **options, - ) -> Catalog: - # pylint: disable=duplicate-code - """Sort nested columns for each row in the catalog. - - Note that this does NOT sort rows, only nested values within rows. - - Args: - by: str or list[str] - Column(s) to sort by. - ascending: bool or list[bool], optional - Sort ascending vs. descending. Defaults to True. Specify list for - multiple sort orders. If this is a list of bools, must match the - length of the `by`. - na_position: {‘last’, ‘first’}, optional - Puts NaNs at the beginning if ‘first’, puts NaN at the end if - ‘last’. Defaults to ‘last’. - ignore_index: bool, optional - If True, the resulting axis will be labeled 0, 1, …, n - 1. - Defaults to False. - **options: keyword arguments, optional - Additional options to pass to the sorting function. - - Returns: - A new catalog where the specified nested columns are sorted. - """ - catalog = super().sort_nested_values( - by=by, - ascending=ascending, - na_position=na_position, - ignore_index=ignore_index, - **options, - ) - if self.margin is not None: - catalog.margin = self.margin.sort_nested_values( - by=by, - ascending=ascending, - na_position=na_position, - ignore_index=ignore_index, - **options, - ) - return catalog - def to_hats( self, base_catalog_path: str | Path | UPath, diff --git a/src/lsdb/catalog/dataset/healpix_dataset.py b/src/lsdb/catalog/dataset/healpix_dataset.py index e721cbcca..bb107d366 100644 --- a/src/lsdb/catalog/dataset/healpix_dataset.py +++ b/src/lsdb/catalog/dataset/healpix_dataset.py @@ -5,7 +5,7 @@ import warnings from collections.abc import Sequence from pathlib import Path -from typing import Callable, Iterable, Literal, Type, cast +from typing import Callable, Iterable, Type import astropy import dask @@ -22,12 +22,9 @@ from hats.inspection.visualize_catalog import get_fov_moc_from_wcs, initialize_wcs_axes from hats.pixel_math import HealpixPixel from hats.pixel_math.healpix_pixel_function import get_pixel_argsort -from hats.pixel_math.spatial_index import SPATIAL_INDEX_COLUMN from matplotlib.figure import Figure from mocpy import MOC -from pandas._libs import lib -from pandas._typing import AnyAll, Axis, IndexLabel, Renamer -from pandas.api.extensions import no_default +from pandas._typing import Renamer from typing_extensions import Self from upath import UPath @@ -855,91 +852,6 @@ def write_catalog( **kwargs, ) - def dropna( - self, - *, - axis: Axis = 0, - how: AnyAll | lib.NoDefault = no_default, - thresh: int | lib.NoDefault = no_default, - on_nested: bool = False, - subset: IndexLabel | None = None, - ignore_index: bool = False, - ) -> Self: # type: ignore[name-defined] # noqa: F821: - """ - Remove missing values for one layer of nested columns in the catalog. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'}, default 0 - Determine if rows or columns which contain missing values are - removed. - - * 0, or 'index' : Drop rows which contain missing values. - * 1, or 'columns' : Drop columns which contain missing value. - - Only a single axis is allowed. - - how : {'any', 'all'}, default 'any' - Determine if row or column is removed from catalog, when we have - at least one NA or all NA. - - * 'any' : If any NA values are present, drop that row or column. - * 'all' : If all values are NA, drop that row or column. - thresh : int, optional - Require that many non-NA values. Cannot be combined with how. - on_nested : str or bool, optional - If not False, applies the call to the nested dataframe in the - column with label equal to the provided string. If specified, - the nested dataframe should align with any columns given in - `subset`. - subset : column label or sequence of labels, optional - Labels along other axis to consider, e.g. if you are dropping rows - these would be a list of columns to include. - - Access nested columns using `nested_df.nested_col` (where - `nested_df` refers to a particular nested dataframe and - `nested_col` is a column of that nested dataframe). - ignore_index : bool, default ``False`` - If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. - - .. versionadded:: 2.0.0 - - Returns - ------- - Catalog - Catalog with NA entries dropped from it. - - Notes - ----- - Operations that target a particular nested structure return a dataframe - with rows of that particular nested structure affected. - - Values for `on_nested` and `subset` should be consistent in pointing - to a single layer, multi-layer operations are not supported at this - time. - """ - - def drop_na_part(df: npd.NestedFrame): - if df.index.name == SPATIAL_INDEX_COLUMN: - df = df.reset_index() - df = cast( - npd.NestedFrame, - df.dropna( - axis=axis, - how=how, - thresh=thresh, - on_nested=on_nested, - subset=subset, - ignore_index=ignore_index, - ), - ) - if SPATIAL_INDEX_COLUMN in df.columns: - df = df.set_index(SPATIAL_INDEX_COLUMN) - return df - - ndf = self._ddf.map_partitions(drop_na_part, meta=self._ddf._meta) - return self._create_updated_dataset(ddf=ndf) - def nest_lists( self, base_columns: list[str] | None = None, @@ -1170,48 +1082,3 @@ def plot_points( fig=fig, **kwargs, ) - - def sort_nested_values( - self, - by: str | list[str], - ascending: bool | list[bool] = True, - na_position: Literal["first"] | Literal["last"] = "last", - ignore_index: bool | None = False, - **options, - ) -> Self: - """Sort nested columns for each row in the catalog. - - Args: - by: str or list[str] - Column(s) to sort by. - ascending: bool or list[bool], optional - Sort ascending vs. descending. Defaults to True. Specify list for - multiple sort orders. If this is a list of bools, must match the - length of the by. - na_position: {‘last’, ‘first’}, optional - Puts NaNs at the beginning if ‘first’, puts NaN at the end if - ‘last’. Defaults to ‘last’. - ignore_index: bool, optional - If True, the resulting axis will be labeled 0, 1, …, n - 1. - Defaults to False. - **options: keyword arguments, optional - Additional options to pass to the sorting function. - - Returns: - A new catalog where the specified nested columns are sorted. - """ - if isinstance(by, str): - by = [by] - self._check_unloaded_columns(by) - # Check "by" columns for hierarchical references - for col in by: - if not self._ddf._is_known_hierarchical_column(col): - raise ValueError(f"{col} not found in nested columns") - ndf = self._ddf.sort_values( - by=by, - ascending=ascending, - na_position=na_position, - ignore_index=ignore_index, - **options, - ) - return self._create_updated_dataset(ddf=ndf) diff --git a/src/lsdb/nested/core.py b/src/lsdb/nested/core.py index d86ba7c36..ef9f85879 100644 --- a/src/lsdb/nested/core.py +++ b/src/lsdb/nested/core.py @@ -14,9 +14,6 @@ from dask.dataframe.dask_expr._expr import no_default as dsk_no_default from nested_pandas.series.dtype import NestedDtype from nested_pandas.series.packer import pack, pack_flat, pack_lists -from pandas._libs import lib -from pandas._typing import Axis, IndexLabel -from pandas.api.extensions import no_default from typing_extensions import Self # need this for the base _Frame class @@ -551,89 +548,6 @@ def query(self, expr) -> Self: # type: ignore # noqa: F821: # pylint: disable=u lambda x: npd.NestedFrame(x).query(expr), meta=self._meta ) # pylint: disable=protected-access - # pylint: disable=arguments-differ - def dropna( - self, - *, - axis: Axis = 0, - how: str | lib.NoDefault = no_default, - thresh: int | lib.NoDefault = no_default, - on_nested: bool = False, - subset: IndexLabel | None = None, - inplace: bool = False, - ignore_index: bool = False, - ) -> Self: # type: ignore[name-defined] # noqa: F821: # pylint: disable=undefined-variable - """ - Remove missing values for one layer of the NestedFrame. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'}, default 0 - Determine if rows or columns which contain missing values are - removed. - - * 0, or 'index' : Drop rows which contain missing values. - * 1, or 'columns' : Drop columns which contain missing value. - - Only a single axis is allowed. - - how : {'any', 'all'}, default 'any' - Determine if row or column is removed from DataFrame, when we have - at least one NA or all NA. - - * 'any' : If any NA values are present, drop that row or column. - * 'all' : If all values are NA, drop that row or column. - thresh : int, optional - Require that many non-NA values. Cannot be combined with how. - on_nested : str or bool, optional - If not False, applies the call to the nested dataframe in the - column with label equal to the provided string. If specified, - the nested dataframe should align with any columns given in - `subset`. - subset : column label or sequence of labels, optional - Labels along other axis to consider, e.g. if you are dropping rows - these would be a list of columns to include. - - Access nested columns using `nested_df.nested_col` (where - `nested_df` refers to a particular nested dataframe and - `nested_col` is a column of that nested dataframe). - inplace : bool, default False - Whether to modify the DataFrame rather than creating a new one. - ignore_index : bool, default ``False`` - If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. - - .. versionadded:: 2.0.0 - - Returns - ------- - DataFrame or None - DataFrame with NA entries dropped from it or None if ``inplace=True``. - - Notes - ----- - Operations that target a particular nested structure return a dataframe - with rows of that particular nested structure affected. - - Values for `on_nested` and `subset` should be consistent in pointing - to a single layer, multi-layer operations are not supported at this - time. - """ - # propagate meta, assumes row-based operation - return self.map_partitions( - lambda x: npd.NestedFrame(x).dropna( - axis=axis, - how=how, - thresh=thresh, - on_nested=on_nested, - subset=subset, - inplace=inplace, - ignore_index=ignore_index, - ), - meta=self._meta, # pylint: disable=protected-access - ) - - # NOTE: This is wrapped as a much more restrictive sort_nested_values - # function in lsdb.catalog def sort_values( self, by: str | list[str], diff --git a/tests/lsdb/catalog/test_catalog.py b/tests/lsdb/catalog/test_catalog.py index f38e981d5..5c30c7f68 100644 --- a/tests/lsdb/catalog/test_catalog.py +++ b/tests/lsdb/catalog/test_catalog.py @@ -2,7 +2,6 @@ from pathlib import Path import astropy.units as u -import dask.array as da import dask.dataframe as dd import hats as hc import hats.pixel_math.healpix_shim as hp @@ -325,58 +324,6 @@ def test_rename_with_dict(small_sky_xmatch_with_margin): assert renamed_catalog.margin.columns[i] == f"{col}_{i}" -def test_assign_no_arguments(small_sky_order1_catalog): - result_catalog = small_sky_order1_catalog.assign() - pd.testing.assert_frame_equal(result_catalog._ddf.compute(), small_sky_order1_catalog._ddf.compute()) - assert isinstance(result_catalog._ddf, nd.NestedFrame) - - -def test_assign_with_callable(small_sky_order1_catalog): - kwargs = {"squared_ra_err": lambda x: x["ra_error"] ** 2} - result_catalog = small_sky_order1_catalog.assign(**kwargs) - expected_ddf = small_sky_order1_catalog._ddf.copy() - expected_ddf["squared_ra_err"] = expected_ddf["ra_error"] ** 2 - pd.testing.assert_frame_equal(result_catalog.compute(), expected_ddf.compute()) - assert isinstance(result_catalog._ddf, nd.NestedFrame) - - -def test_assign_with_series(small_sky_order1_catalog): - # The series is created from the original dataframe because indices must match - squared_ra_err = small_sky_order1_catalog._ddf["ra_error"].map(lambda x: x**2) - kwargs = {"new_column": squared_ra_err} - result_catalog = small_sky_order1_catalog.assign(**kwargs) - expected_ddf = small_sky_order1_catalog._ddf.copy() - expected_ddf["new_column"] = squared_ra_err - pd.testing.assert_frame_equal(result_catalog.compute(), expected_ddf.compute()) - assert isinstance(result_catalog._ddf, nd.NestedFrame) - - -def test_assign_with_multiple_columns(small_sky_order1_catalog): - # These series are created from the original dataframe because indices must match - squared_ra_err = small_sky_order1_catalog._ddf["ra_error"].map(lambda x: x**2) - squared_dec_err = small_sky_order1_catalog._ddf["dec_error"].map(lambda x: x**2) - kwargs = { - "squared_ra_err": squared_ra_err, - "squared_dec_err": squared_dec_err, - } - result_catalog = small_sky_order1_catalog.assign(**kwargs) - expected_ddf = small_sky_order1_catalog._ddf.copy() - expected_ddf["squared_ra_err"] = squared_ra_err - expected_ddf["squared_dec_err"] = squared_dec_err - pd.testing.assert_frame_equal(result_catalog.compute(), expected_ddf.compute()) - - -def test_assign_with_invalid_arguments(small_sky_order1_catalog): - with pytest.raises(TypeError, match="Column assignment doesn't support type"): - small_sky_order1_catalog.assign(new_column=[1, 2, 3]) - with pytest.raises(ValueError, match="Array assignment only supports 1-D arrays"): - small_sky_order1_catalog.assign(new_column=da.ones((10, 10))) - with pytest.raises(ValueError, match="Number of partitions do not match"): - chunks = small_sky_order1_catalog._ddf.npartitions + 1 - array = da.random.random(size=10, chunks=chunks) - small_sky_order1_catalog.assign(new_column=array) - - def test_read_hats(small_sky_catalog, tmp_path): new_catalog_name = "small_sky" base_catalog_path = Path(tmp_path) / new_catalog_name @@ -707,8 +654,6 @@ def test_filtered_catalog_has_undetermined_len(small_sky_order1_catalog, small_s len(small_sky_order1_catalog.id_search(values={"id": 900}, index_catalogs={"id": catalog_index})) with pytest.raises(ValueError, match="undetermined"): len(small_sky_order1_catalog.pixel_search([(0, 11)])) - with pytest.raises(ValueError, match="undetermined"): - len(small_sky_order1_catalog.dropna()) @pytest.mark.sphgeom diff --git a/tests/lsdb/catalog/test_nested.py b/tests/lsdb/catalog/test_nested.py index 49aeb5173..31b87b4a0 100644 --- a/tests/lsdb/catalog/test_nested.py +++ b/tests/lsdb/catalog/test_nested.py @@ -7,43 +7,13 @@ import lsdb import lsdb.nested as nd -from lsdb import Catalog, MarginCatalog +from lsdb import Catalog def test_nested_columns_property(small_sky_with_nested_sources): assert list(small_sky_with_nested_sources.nested_columns) == ["sources"] -def test_dropna(small_sky_with_nested_sources): - filtered_cat = small_sky_with_nested_sources.query("sources.mag < 15.1") - drop_na_cat = filtered_cat.dropna() - assert isinstance(drop_na_cat, Catalog) - assert isinstance(drop_na_cat._ddf, nd.NestedFrame) - drop_na_compute = drop_na_cat.compute() - assert isinstance(drop_na_compute, npd.NestedFrame) - filtered_compute = filtered_cat.compute() - assert len(drop_na_compute) < len(filtered_compute) - pd.testing.assert_frame_equal(drop_na_compute, filtered_compute.dropna()) - - -def test_dropna_on_nested(small_sky_with_nested_sources): - def add_na_values_nested(df): - """replaces the first source_ra value in each nested df with NaN""" - for i in range(len(df)): - first_ra_value = df.iloc[i]["sources"].iloc[0]["source_ra"] - df["sources"].array[i] = df["sources"].array[i].replace(first_ra_value, np.nan) - return df - - filtered_cat = small_sky_with_nested_sources.map_partitions(add_na_values_nested) - drop_na_cat = filtered_cat.dropna(on_nested="sources") - assert isinstance(drop_na_cat, Catalog) - assert isinstance(drop_na_cat._ddf, nd.NestedFrame) - drop_na_sources_compute = drop_na_cat["sources"].compute() - filtered_sources_compute = filtered_cat["sources"].compute() - assert len(drop_na_sources_compute) == len(filtered_sources_compute) - assert sum(map(len, drop_na_sources_compute)) < sum(map(len, filtered_sources_compute)) - - def test_nest_lists(small_sky_with_nested_sources): """Test the behavior of catalog.nest_lists""" cat_ndf = small_sky_with_nested_sources._ddf.map_partitions( @@ -227,40 +197,6 @@ def mean_mag(ra, dec, mag): assert list(res_false.columns) == ["new_nested.ra_mag", "new_nested.dec_mag"] -def test_sort_nested_values(small_sky_with_nested_sources): - # Sorting on nested "mjd" source column, in descending order - sorted_nested = small_sky_with_nested_sources.sort_nested_values(by="sources.mjd", ascending=False) - assert isinstance(sorted_nested, Catalog) - unsorted_source = small_sky_with_nested_sources["sources"].compute() - sorted_source = sorted_nested["sources"].compute() - for i in range(len(unsorted_source)): - expected_mjd = sorted(unsorted_source.iloc[i]["mjd"], reverse=True) - assert expected_mjd == sorted_source.iloc[i]["mjd"].values.tolist() - expected_schema = small_sky_with_nested_sources.hc_structure.schema - assert expected_schema.equals(sorted_nested.hc_structure.schema) - - -def test_sort_nested_values_with_margin(small_sky_with_nested_sources_with_margin): - # Sorting values in nested column also sorts catalog margin - sorted_nested = small_sky_with_nested_sources_with_margin.sort_nested_values( - by="sources.mjd", ascending=False - ) - assert isinstance(sorted_nested, Catalog) - assert isinstance(sorted_nested.margin, MarginCatalog) - unsorted_source = small_sky_with_nested_sources_with_margin.margin["sources"].compute() - sorted_source = sorted_nested.margin["sources"].compute() - for i in range(len(unsorted_source)): - expected_mjd = sorted(unsorted_source.iloc[i]["mjd"], reverse=True) - assert expected_mjd == sorted_source.iloc[i]["mjd"].values.tolist() - expected_schema = small_sky_with_nested_sources_with_margin.margin.hc_structure.schema - assert expected_schema.equals(sorted_nested.margin.hc_structure.schema) - - -def test_sort_nested_values_using_base_column(small_sky_with_nested_sources): - with pytest.raises(ValueError, match="nested columns"): - small_sky_with_nested_sources.sort_nested_values(by="ra") - - def test_serialization_read(small_sky_with_nested_sources): assert isinstance(small_sky_with_nested_sources.dtypes["sources"], NestedDtype) diff --git a/tests/lsdb/nested/test_nestedframe.py b/tests/lsdb/nested/test_nestedframe.py index 27c261e7c..d493c7b7e 100644 --- a/tests/lsdb/nested/test_nestedframe.py +++ b/tests/lsdb/nested/test_nestedframe.py @@ -291,23 +291,6 @@ def test_query_on_nested(test_dataset): assert len(res) == 50 # make sure the base df remains unchanged -def test_dropna(test_dataset_with_nans): - """test the dropna function""" - - nan_free_base = test_dataset_with_nans.dropna(subset=["a"]) - # should just remove one row - assert len(nan_free_base) == len(test_dataset_with_nans) - 1 - - meta = test_dataset_with_nans.loc[0].head(0).nested.nest.to_flat() - - nan_free_nested = test_dataset_with_nans.dropna(subset=["nested.t"]) - - flat_nested_nan_free = nan_free_nested.map_partitions(lambda x: x.nested.nest.to_flat(), meta=meta) - flat_nested = test_dataset_with_nans.map_partitions(lambda x: x.nested.nest.to_flat(), meta=meta) - # should just remove one row - assert len(flat_nested_nan_free) == len(flat_nested) - 1 - - def test_sort_values(test_dataset): """test the sort_values function"""