\n",
- " 1 | \n",
- " 2 | \n",
- " 342.166931 | \n",
- " 40.950381 | \n",
- " 0.720324 | \n",
- " 0.372520 | \n",
- " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 342.166931 | \n",
+ " 40.950381 | \n",
+ " 0.720324 | \n",
+ " 0.372520 | \n",
+ " \n",
" \n",
" \n",
" t | \n",
@@ -115,13 +115,13 @@
" | \n",
" \n",
" \n",
- " 4 | \n",
- " 22 | \n",
- " 112.259323 | \n",
- " -70.888248 | \n",
- " 0.146756 | \n",
- " 1.077633 | \n",
- " \n",
+ " 4 | \n",
+ " 22 | \n",
+ " 112.259323 | \n",
+ " -70.888248 | \n",
+ " 0.146756 | \n",
+ " 1.077633 | \n",
+ " \n",
" \n",
" \n",
" t | \n",
@@ -147,13 +147,13 @@
" | \n",
" \n",
" \n",
- " 0 | \n",
- " 24 | \n",
- " 184.255785 | \n",
- " -8.820946 | \n",
- " 0.417022 | \n",
- " 0.184677 | \n",
- " \n",
+ " 0 | \n",
+ " 24 | \n",
+ " 184.255785 | \n",
+ " -8.820946 | \n",
+ " 0.417022 | \n",
+ " 0.184677 | \n",
+ " \n",
" \n",
" \n",
" t | \n",
@@ -179,13 +179,13 @@
" | \n",
" \n",
" \n",
- " 2 | \n",
- " 36 | \n",
- " 51.897461 | \n",
- " -10.463070 | \n",
- " 0.000114 | \n",
- " 0.691121 | \n",
- " \n",
+ " 2 | \n",
+ " 36 | \n",
+ " 51.897461 | \n",
+ " -10.463070 | \n",
+ " 0.000114 | \n",
+ " 0.691121 | \n",
+ " \n",
" \n",
" \n",
" t | \n",
@@ -211,13 +211,13 @@
" | \n",
" \n",
" \n",
- " 3 | \n",
- " 46 | \n",
- " 341.513801 | \n",
- " 5.692378 | \n",
- " 0.302333 | \n",
- " 0.793535 | \n",
- " \n",
+ " 3 | \n",
+ " 46 | \n",
+ " 341.513801 | \n",
+ " 5.692378 | \n",
+ " 0.302333 | \n",
+ " 0.793535 | \n",
+ " \n",
" \n",
" \n",
" t | \n",
@@ -262,7 +262,7 @@
"3 [{t: 17.562349, flux: 41.417927, band: 'g', fl... "
]
},
- "execution_count": 100,
+ "execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
@@ -288,7 +288,7 @@
},
{
"cell_type": "code",
- "execution_count": 102,
+ "execution_count": 2,
"id": "f35f6a8a",
"metadata": {},
"outputs": [
@@ -408,7 +408,7 @@
"9 1.067251 62.336012 r 3.116801"
]
},
- "execution_count": 102,
+ "execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@@ -429,7 +429,7 @@
},
{
"cell_type": "code",
- "execution_count": 107,
+ "execution_count": 3,
"id": "4e060bea",
"metadata": {},
"outputs": [
@@ -491,7 +491,7 @@
"Expr=RenameFrame(frame=Repartition(frame=MapPartitions(NestedFrame), new_partitions=1), columns={'nested': 'lightcurve'})"
]
},
- "execution_count": 107,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -510,7 +510,7 @@
},
{
"cell_type": "code",
- "execution_count": 108,
+ "execution_count": 4,
"id": "95974178",
"metadata": {},
"outputs": [
@@ -520,7 +520,7 @@
"['lightcurve']"
]
},
- "execution_count": 108,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -549,7 +549,7 @@
},
{
"cell_type": "code",
- "execution_count": 109,
+ "execution_count": 5,
"id": "8a5d87d8",
"metadata": {},
"outputs": [
@@ -564,7 +564,7 @@
"Name: flux, Length: 50, dtype: double[pyarrow]"
]
},
- "execution_count": 109,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -584,7 +584,7 @@
},
{
"cell_type": "code",
- "execution_count": 110,
+ "execution_count": 6,
"id": "59e6167e",
"metadata": {},
"outputs": [
@@ -593,27 +593,27 @@
"text/html": [
"\n",
- "\n",
+ "\n",
" \n",
" \n",
" | \n",
- " id | \n",
- " ra | \n",
- " dec | \n",
- " a | \n",
- " b | \n",
- " lightcurve | \n",
+ " id | \n",
+ " ra | \n",
+ " dec | \n",
+ " a | \n",
+ " b | \n",
+ " lightcurve | \n",
" \n",
" \n",
" \n",
" \n",
- " 1 | \n",
- " 2 | \n",
- " 342.166931 | \n",
- " 40.950381 | \n",
- " 0.720324 | \n",
- " 0.372520 | \n",
- " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 342.166931 | \n",
+ " 40.950381 | \n",
+ " 0.720324 | \n",
+ " 0.372520 | \n",
+ " \n",
" \n",
" \n",
" t | \n",
@@ -639,13 +639,13 @@
" | \n",
" \n",
" \n",
- " 4 | \n",
- " 22 | \n",
- " 112.259323 | \n",
- " -70.888248 | \n",
- " 0.146756 | \n",
- " 1.077633 | \n",
- " \n",
+ " 4 | \n",
+ " 22 | \n",
+ " 112.259323 | \n",
+ " -70.888248 | \n",
+ " 0.146756 | \n",
+ " 1.077633 | \n",
+ " \n",
" \n",
" \n",
" t | \n",
@@ -671,13 +671,13 @@
" | \n",
" \n",
" \n",
- " 0 | \n",
- " 24 | \n",
- " 184.255785 | \n",
- " -8.820946 | \n",
- " 0.417022 | \n",
- " 0.184677 | \n",
- " \n",
+ " 0 | \n",
+ " 24 | \n",
+ " 184.255785 | \n",
+ " -8.820946 | \n",
+ " 0.417022 | \n",
+ " 0.184677 | \n",
+ " \n",
" \n",
" \n",
" t | \n",
@@ -703,13 +703,13 @@
" | \n",
" \n",
" \n",
- " 2 | \n",
- " 36 | \n",
- " 51.897461 | \n",
- " -10.463070 | \n",
- " 0.000114 | \n",
- " 0.691121 | \n",
- " \n",
+ " 2 | \n",
+ " 36 | \n",
+ " 51.897461 | \n",
+ " -10.463070 | \n",
+ " 0.000114 | \n",
+ " 0.691121 | \n",
+ " \n",
" \n",
" \n",
" t | \n",
@@ -735,13 +735,13 @@
" | \n",
" \n",
" \n",
- " 3 | \n",
- " 46 | \n",
- " 341.513801 | \n",
- " 5.692378 | \n",
- " 0.302333 | \n",
- " 0.793535 | \n",
- " \n",
+ " 3 | \n",
+ " 46 | \n",
+ " 341.513801 | \n",
+ " 5.692378 | \n",
+ " 0.302333 | \n",
+ " 0.793535 | \n",
+ " \n",
" \n",
" \n",
" t | \n",
@@ -786,7 +786,7 @@
"3 [{t: 17.562349, flux: 41.417927, band: 'g', fl... "
]
},
- "execution_count": 110,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -805,7 +805,7 @@
},
{
"cell_type": "code",
- "execution_count": 111,
+ "execution_count": 7,
"id": "57034f44",
"metadata": {},
"outputs": [
@@ -814,36 +814,36 @@
"text/html": [
"\n",
- "\n",
+ "\n",
" \n",
" \n",
" | \n",
- " id | \n",
- " ra | \n",
- " dec | \n",
- " a | \n",
- " b | \n",
- " lightcurve | \n",
+ " id | \n",
+ " ra | \n",
+ " dec | \n",
+ " a | \n",
+ " b | \n",
+ " lightcurve | \n",
" \n",
" \n",
" \n",
" \n",
- " 1 | \n",
- " 2 | \n",
- " 342.166931 | \n",
- " 40.950381 | \n",
- " 0.720324 | \n",
- " 0.372520 | \n",
- " None | \n",
- " \n",
- " \n",
- " 4 | \n",
- " 22 | \n",
- " 112.259323 | \n",
- " -70.888248 | \n",
- " 0.146756 | \n",
- " 1.077633 | \n",
- " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 342.166931 | \n",
+ " 40.950381 | \n",
+ " 0.720324 | \n",
+ " 0.372520 | \n",
+ " None | \n",
+ " \n",
+ " \n",
+ " 4 | \n",
+ " 22 | \n",
+ " 112.259323 | \n",
+ " -70.888248 | \n",
+ " 0.146756 | \n",
+ " 1.077633 | \n",
+ " \n",
" \n",
" \n",
" t | \n",
@@ -869,22 +869,22 @@
" | \n",
" \n",
" \n",
- " 0 | \n",
- " 24 | \n",
- " 184.255785 | \n",
- " -8.820946 | \n",
- " 0.417022 | \n",
- " 0.184677 | \n",
- " None | \n",
+ " 0 | \n",
+ " 24 | \n",
+ " 184.255785 | \n",
+ " -8.820946 | \n",
+ " 0.417022 | \n",
+ " 0.184677 | \n",
+ " None | \n",
" \n",
" \n",
- " 2 | \n",
- " 36 | \n",
- " 51.897461 | \n",
- " -10.463070 | \n",
- " 0.000114 | \n",
- " 0.691121 | \n",
- " \n",
+ " 2 | \n",
+ " 36 | \n",
+ " 51.897461 | \n",
+ " -10.463070 | \n",
+ " 0.000114 | \n",
+ " 0.691121 | \n",
+ " \n",
" \n",
" \n",
" t | \n",
@@ -910,13 +910,13 @@
" | \n",
" \n",
" \n",
- " 3 | \n",
- " 46 | \n",
- " 341.513801 | \n",
- " 5.692378 | \n",
- " 0.302333 | \n",
- " 0.793535 | \n",
- " None | \n",
+ " 3 | \n",
+ " 46 | \n",
+ " 341.513801 | \n",
+ " 5.692378 | \n",
+ " 0.302333 | \n",
+ " 0.793535 | \n",
+ " None | \n",
" \n",
" \n",
" \n",
@@ -938,7 +938,7 @@
"3 None "
]
},
- "execution_count": 111,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -958,109 +958,21 @@
},
{
"cell_type": "code",
- "execution_count": 112,
+ "execution_count": null,
"id": "926ba6f3",
"metadata": {},
"outputs": [
{
- "data": {
- "text/html": [
- "\n",
- "\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " ra | \n",
- " dec | \n",
- " a | \n",
- " b | \n",
- " lightcurve | \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 4 | \n",
- " 22 | \n",
- " 112.259323 | \n",
- " -70.888248 | \n",
- " 0.146756 | \n",
- " 1.077633 | \n",
- " \n",
- " \n",
- " \n",
- " t | \n",
- " flux | \n",
- " band | \n",
- " flux_err | \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 13.995167 | \n",
- " 99.732285 | \n",
- " g | \n",
- " 4.986614 | \n",
- " \n",
- " \n",
- " +0 rows | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " \n",
- " \n",
- " | \n",
- " \n",
- " \n",
- " 2 | \n",
- " 36 | \n",
- " 51.897461 | \n",
- " -10.463070 | \n",
- " 0.000114 | \n",
- " 0.691121 | \n",
- " \n",
- " \n",
- " \n",
- " t | \n",
- " flux | \n",
- " band | \n",
- " flux_err | \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 16.692513 | \n",
- " 96.484005 | \n",
- " g | \n",
- " 4.8242 | \n",
- " \n",
- " \n",
- " +0 rows | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " \n",
- " \n",
- " | \n",
- " \n",
- " \n",
- " \n",
- "2 rows x 6 columns"
- ],
- "text/plain": [
- " id ra dec a b \\\n",
- "4 22 112.259323 -70.888248 0.146756 1.077633 \n",
- "2 36 51.897461 -10.463070 0.000114 0.691121 \n",
- "\n",
- " lightcurve \n",
- "4 [{t: 13.995167, flux: 99.732285, band: 'g', fl... \n",
- "2 [{t: 16.692513, flux: 96.484005, band: 'g', fl... "
- ]
- },
- "execution_count": 112,
- "metadata": {},
- "output_type": "execute_result"
+ "ename": "AttributeError",
+ "evalue": "'NestedFrame' object has no attribute 'map_partitions'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+ "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)",
+ "\u001b[32m/tmp/ipykernel_257475/2928617375.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m nf_highflux.map_partitions(\u001b[38;5;28;01mlambda\u001b[39;00m nf: nf.dropna(subset=\u001b[33m\"lightcurve\"\u001b[39m))\n",
+ "\u001b[32m~/.virtualenvs/sep/lib/python3.12/site-packages/pandas/core/generic.py\u001b[39m in \u001b[36m?\u001b[39m\u001b[34m(self, name)\u001b[39m\n\u001b[32m 6314\u001b[39m \u001b[38;5;28;01mand\u001b[39;00m name \u001b[38;5;28;01mnot\u001b[39;00m \u001b[38;5;28;01min\u001b[39;00m self._accessors\n\u001b[32m 6315\u001b[39m \u001b[38;5;28;01mand\u001b[39;00m self._info_axis._can_hold_identifiers_and_holds_name(name)\n\u001b[32m 6316\u001b[39m ):\n\u001b[32m 6317\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m self[name]\n\u001b[32m-> \u001b[39m\u001b[32m6318\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m object.__getattribute__(self, name)\n",
+ "\u001b[31mAttributeError\u001b[39m: 'NestedFrame' object has no attribute 'map_partitions'"
+ ]
}
],
"source": [
@@ -1081,7 +993,7 @@
},
{
"cell_type": "code",
- "execution_count": 113,
+ "execution_count": null,
"id": "2a5c1914",
"metadata": {},
"outputs": [
@@ -1152,7 +1064,7 @@
},
{
"cell_type": "code",
- "execution_count": 124,
+ "execution_count": null,
"id": "b39105b0",
"metadata": {},
"outputs": [
@@ -1379,7 +1291,7 @@
},
{
"cell_type": "code",
- "execution_count": 114,
+ "execution_count": null,
"id": "9e466113",
"metadata": {},
"outputs": [
@@ -1472,7 +1384,7 @@
},
{
"cell_type": "code",
- "execution_count": 116,
+ "execution_count": null,
"id": "1fcaea43",
"metadata": {},
"outputs": [
@@ -1501,7 +1413,7 @@
},
{
"cell_type": "code",
- "execution_count": 118,
+ "execution_count": null,
"id": "96262092",
"metadata": {},
"outputs": [
@@ -1530,7 +1442,7 @@
},
{
"cell_type": "code",
- "execution_count": 119,
+ "execution_count": null,
"id": "3fdd4d9c",
"metadata": {},
"outputs": [
@@ -1626,7 +1538,7 @@
},
{
"cell_type": "code",
- "execution_count": 120,
+ "execution_count": null,
"id": "d77e4f97",
"metadata": {},
"outputs": [
@@ -1752,7 +1664,7 @@
},
{
"cell_type": "code",
- "execution_count": 65,
+ "execution_count": null,
"id": "064bbfe0",
"metadata": {},
"outputs": [
@@ -1831,7 +1743,7 @@
},
{
"cell_type": "code",
- "execution_count": 121,
+ "execution_count": null,
"id": "abd01a79",
"metadata": {},
"outputs": [
@@ -2011,7 +1923,7 @@
},
{
"cell_type": "code",
- "execution_count": 80,
+ "execution_count": null,
"id": "90eb46ac",
"metadata": {},
"outputs": [
@@ -2108,7 +2020,7 @@
},
{
"cell_type": "code",
- "execution_count": 122,
+ "execution_count": null,
"id": "44545efc",
"metadata": {},
"outputs": [
@@ -2257,7 +2169,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": ".venv",
+ "display_name": "sep",
"language": "python",
"name": "python3"
},
@@ -2271,7 +2183,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.13.2"
+ "version": "3.12.3"
}
},
"nbformat": 4,
diff --git a/src/lsdb/catalog/catalog.py b/src/lsdb/catalog/catalog.py
index 5a3cc0029..f46e03141 100644
--- a/src/lsdb/catalog/catalog.py
+++ b/src/lsdb/catalog/catalog.py
@@ -2,7 +2,7 @@
import warnings
from pathlib import Path
-from typing import Any, Callable, Iterable, Literal, Type
+from typing import Any, Callable, Iterable, Type
import dask.dataframe as dd
import hats as hc
@@ -11,9 +11,7 @@
from hats.catalog.catalog_collection import CatalogCollection
from hats.catalog.healpix_dataset.healpix_dataset import HealpixDataset as HCHealpixDataset
from hats.catalog.index.index_catalog import IndexCatalog as HCIndexCatalog
-from pandas._libs import lib
-from pandas._typing import AnyAll, Axis, IndexLabel, Renamer
-from pandas.api.extensions import no_default
+from pandas._typing import Renamer
from typing_extensions import Self
from upath import UPath
@@ -146,33 +144,6 @@ def rename(self, columns: Renamer) -> Catalog:
catalog.margin = self.margin.rename(columns)
return catalog
- def assign(self, **kwargs) -> Catalog:
- """Assigns new columns to a catalog
-
- Args:
- **kwargs: Arguments to pass to the assign method. This dictionary
- should contain the column names as keys and either a
- function or a 1-D Dask array as their corresponding value.
-
- Returns:
- The catalog containing both the old columns and the newly created columns
-
- Examples:
- Create a new column using a function::
-
- catalog = Catalog(...)
- catalog = catalog.assign(new_col=lambda df: df['existing_col'] * 2)
-
- Add a column from a 1-D Dask array::
-
- import dask.array as da
- new_data = da.arange(...)
- catalog = catalog.assign(new_col=new_data)
- """
- self._check_unloaded_columns(list(kwargs.keys()))
- ddf = self._ddf.assign(**kwargs)
- return self._create_updated_dataset(ddf=ddf)
-
def crossmatch(
self,
other: Catalog,
@@ -951,81 +922,6 @@ def nest_lists(
)
return catalog
- def dropna(
- self,
- *,
- axis: Axis = 0,
- how: AnyAll | lib.NoDefault = no_default,
- thresh: int | lib.NoDefault = no_default,
- on_nested: bool = False,
- subset: IndexLabel | None = None,
- ignore_index: bool = False,
- ) -> Catalog:
- """Remove missing values for one layer of nested columns in the catalog.
-
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Determine if rows or columns which contain missing values are
- removed.
-
- * 0, or 'index' : Drop rows which contain missing values.
- * 1, or 'columns' : Drop columns which contain missing value.
-
- Only a single axis is allowed.
-
- how : {'any', 'all'}, default 'any'
- Determine if row or column is removed from catalog, when we have
- at least one NA or all NA.
-
- * 'any' : If any NA values are present, drop that row or column.
- * 'all' : If all values are NA, drop that row or column.
- thresh : int, optional
- Require that many non-NA values. Cannot be combined with how.
- on_nested : str or bool, optional
- If not False, applies the call to the nested dataframe in the
- column with label equal to the provided string. If specified,
- the nested dataframe should align with any columns given in
- `subset`.
- subset : column label or sequence of labels, optional
- Labels along other axis to consider, e.g. if you are dropping rows
- these would be a list of columns to include.
-
- Access nested columns using `nested_df.nested_col` (where
- `nested_df` refers to a particular nested dataframe and
- `nested_col` is a column of that nested dataframe).
- ignore_index : bool, default ``False``
- If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
-
- Returns
- -------
- Catalog
- Catalog with NA entries dropped from it.
-
- Notes
- -----
- Operations that target a particular nested structure return a dataframe
- with rows of that particular nested structure affected.
-
- Values for `on_nested` and `subset` should be consistent in pointing
- to a single layer, multi-layer operations are not supported at this
- time.
- """
- self._check_unloaded_columns(subset)
- catalog = super().dropna(
- axis=axis, how=how, thresh=thresh, on_nested=on_nested, subset=subset, ignore_index=ignore_index
- )
- if self.margin is not None:
- catalog.margin = self.margin.dropna(
- axis=axis,
- how=how,
- thresh=thresh,
- on_nested=on_nested,
- subset=subset,
- ignore_index=ignore_index,
- )
- return catalog
-
def reduce(self, func, *args, meta=None, append_columns=False, infer_nesting=True, **kwargs) -> Catalog:
"""
Takes a function and applies it to each top-level row of the Catalog.
@@ -1092,55 +988,6 @@ def reduce(self, func, *args, meta=None, append_columns=False, infer_nesting=Tru
)
return catalog
- def sort_nested_values(
- self,
- by: str | list[str],
- ascending: bool | list[bool] = True,
- na_position: Literal["first"] | Literal["last"] = "last",
- ignore_index: bool | None = False,
- **options,
- ) -> Catalog:
- # pylint: disable=duplicate-code
- """Sort nested columns for each row in the catalog.
-
- Note that this does NOT sort rows, only nested values within rows.
-
- Args:
- by: str or list[str]
- Column(s) to sort by.
- ascending: bool or list[bool], optional
- Sort ascending vs. descending. Defaults to True. Specify list for
- multiple sort orders. If this is a list of bools, must match the
- length of the `by`.
- na_position: {‘last’, ‘first’}, optional
- Puts NaNs at the beginning if ‘first’, puts NaN at the end if
- ‘last’. Defaults to ‘last’.
- ignore_index: bool, optional
- If True, the resulting axis will be labeled 0, 1, …, n - 1.
- Defaults to False.
- **options: keyword arguments, optional
- Additional options to pass to the sorting function.
-
- Returns:
- A new catalog where the specified nested columns are sorted.
- """
- catalog = super().sort_nested_values(
- by=by,
- ascending=ascending,
- na_position=na_position,
- ignore_index=ignore_index,
- **options,
- )
- if self.margin is not None:
- catalog.margin = self.margin.sort_nested_values(
- by=by,
- ascending=ascending,
- na_position=na_position,
- ignore_index=ignore_index,
- **options,
- )
- return catalog
-
def to_hats(
self,
base_catalog_path: str | Path | UPath,
diff --git a/src/lsdb/catalog/dataset/healpix_dataset.py b/src/lsdb/catalog/dataset/healpix_dataset.py
index e721cbcca..bb107d366 100644
--- a/src/lsdb/catalog/dataset/healpix_dataset.py
+++ b/src/lsdb/catalog/dataset/healpix_dataset.py
@@ -5,7 +5,7 @@
import warnings
from collections.abc import Sequence
from pathlib import Path
-from typing import Callable, Iterable, Literal, Type, cast
+from typing import Callable, Iterable, Type
import astropy
import dask
@@ -22,12 +22,9 @@
from hats.inspection.visualize_catalog import get_fov_moc_from_wcs, initialize_wcs_axes
from hats.pixel_math import HealpixPixel
from hats.pixel_math.healpix_pixel_function import get_pixel_argsort
-from hats.pixel_math.spatial_index import SPATIAL_INDEX_COLUMN
from matplotlib.figure import Figure
from mocpy import MOC
-from pandas._libs import lib
-from pandas._typing import AnyAll, Axis, IndexLabel, Renamer
-from pandas.api.extensions import no_default
+from pandas._typing import Renamer
from typing_extensions import Self
from upath import UPath
@@ -855,91 +852,6 @@ def write_catalog(
**kwargs,
)
- def dropna(
- self,
- *,
- axis: Axis = 0,
- how: AnyAll | lib.NoDefault = no_default,
- thresh: int | lib.NoDefault = no_default,
- on_nested: bool = False,
- subset: IndexLabel | None = None,
- ignore_index: bool = False,
- ) -> Self: # type: ignore[name-defined] # noqa: F821:
- """
- Remove missing values for one layer of nested columns in the catalog.
-
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Determine if rows or columns which contain missing values are
- removed.
-
- * 0, or 'index' : Drop rows which contain missing values.
- * 1, or 'columns' : Drop columns which contain missing value.
-
- Only a single axis is allowed.
-
- how : {'any', 'all'}, default 'any'
- Determine if row or column is removed from catalog, when we have
- at least one NA or all NA.
-
- * 'any' : If any NA values are present, drop that row or column.
- * 'all' : If all values are NA, drop that row or column.
- thresh : int, optional
- Require that many non-NA values. Cannot be combined with how.
- on_nested : str or bool, optional
- If not False, applies the call to the nested dataframe in the
- column with label equal to the provided string. If specified,
- the nested dataframe should align with any columns given in
- `subset`.
- subset : column label or sequence of labels, optional
- Labels along other axis to consider, e.g. if you are dropping rows
- these would be a list of columns to include.
-
- Access nested columns using `nested_df.nested_col` (where
- `nested_df` refers to a particular nested dataframe and
- `nested_col` is a column of that nested dataframe).
- ignore_index : bool, default ``False``
- If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
-
- .. versionadded:: 2.0.0
-
- Returns
- -------
- Catalog
- Catalog with NA entries dropped from it.
-
- Notes
- -----
- Operations that target a particular nested structure return a dataframe
- with rows of that particular nested structure affected.
-
- Values for `on_nested` and `subset` should be consistent in pointing
- to a single layer, multi-layer operations are not supported at this
- time.
- """
-
- def drop_na_part(df: npd.NestedFrame):
- if df.index.name == SPATIAL_INDEX_COLUMN:
- df = df.reset_index()
- df = cast(
- npd.NestedFrame,
- df.dropna(
- axis=axis,
- how=how,
- thresh=thresh,
- on_nested=on_nested,
- subset=subset,
- ignore_index=ignore_index,
- ),
- )
- if SPATIAL_INDEX_COLUMN in df.columns:
- df = df.set_index(SPATIAL_INDEX_COLUMN)
- return df
-
- ndf = self._ddf.map_partitions(drop_na_part, meta=self._ddf._meta)
- return self._create_updated_dataset(ddf=ndf)
-
def nest_lists(
self,
base_columns: list[str] | None = None,
@@ -1170,48 +1082,3 @@ def plot_points(
fig=fig,
**kwargs,
)
-
- def sort_nested_values(
- self,
- by: str | list[str],
- ascending: bool | list[bool] = True,
- na_position: Literal["first"] | Literal["last"] = "last",
- ignore_index: bool | None = False,
- **options,
- ) -> Self:
- """Sort nested columns for each row in the catalog.
-
- Args:
- by: str or list[str]
- Column(s) to sort by.
- ascending: bool or list[bool], optional
- Sort ascending vs. descending. Defaults to True. Specify list for
- multiple sort orders. If this is a list of bools, must match the
- length of the by.
- na_position: {‘last’, ‘first’}, optional
- Puts NaNs at the beginning if ‘first’, puts NaN at the end if
- ‘last’. Defaults to ‘last’.
- ignore_index: bool, optional
- If True, the resulting axis will be labeled 0, 1, …, n - 1.
- Defaults to False.
- **options: keyword arguments, optional
- Additional options to pass to the sorting function.
-
- Returns:
- A new catalog where the specified nested columns are sorted.
- """
- if isinstance(by, str):
- by = [by]
- self._check_unloaded_columns(by)
- # Check "by" columns for hierarchical references
- for col in by:
- if not self._ddf._is_known_hierarchical_column(col):
- raise ValueError(f"{col} not found in nested columns")
- ndf = self._ddf.sort_values(
- by=by,
- ascending=ascending,
- na_position=na_position,
- ignore_index=ignore_index,
- **options,
- )
- return self._create_updated_dataset(ddf=ndf)
diff --git a/src/lsdb/nested/core.py b/src/lsdb/nested/core.py
index d86ba7c36..ef9f85879 100644
--- a/src/lsdb/nested/core.py
+++ b/src/lsdb/nested/core.py
@@ -14,9 +14,6 @@
from dask.dataframe.dask_expr._expr import no_default as dsk_no_default
from nested_pandas.series.dtype import NestedDtype
from nested_pandas.series.packer import pack, pack_flat, pack_lists
-from pandas._libs import lib
-from pandas._typing import Axis, IndexLabel
-from pandas.api.extensions import no_default
from typing_extensions import Self
# need this for the base _Frame class
@@ -551,89 +548,6 @@ def query(self, expr) -> Self: # type: ignore # noqa: F821: # pylint: disable=u
lambda x: npd.NestedFrame(x).query(expr), meta=self._meta
) # pylint: disable=protected-access
- # pylint: disable=arguments-differ
- def dropna(
- self,
- *,
- axis: Axis = 0,
- how: str | lib.NoDefault = no_default,
- thresh: int | lib.NoDefault = no_default,
- on_nested: bool = False,
- subset: IndexLabel | None = None,
- inplace: bool = False,
- ignore_index: bool = False,
- ) -> Self: # type: ignore[name-defined] # noqa: F821: # pylint: disable=undefined-variable
- """
- Remove missing values for one layer of the NestedFrame.
-
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Determine if rows or columns which contain missing values are
- removed.
-
- * 0, or 'index' : Drop rows which contain missing values.
- * 1, or 'columns' : Drop columns which contain missing value.
-
- Only a single axis is allowed.
-
- how : {'any', 'all'}, default 'any'
- Determine if row or column is removed from DataFrame, when we have
- at least one NA or all NA.
-
- * 'any' : If any NA values are present, drop that row or column.
- * 'all' : If all values are NA, drop that row or column.
- thresh : int, optional
- Require that many non-NA values. Cannot be combined with how.
- on_nested : str or bool, optional
- If not False, applies the call to the nested dataframe in the
- column with label equal to the provided string. If specified,
- the nested dataframe should align with any columns given in
- `subset`.
- subset : column label or sequence of labels, optional
- Labels along other axis to consider, e.g. if you are dropping rows
- these would be a list of columns to include.
-
- Access nested columns using `nested_df.nested_col` (where
- `nested_df` refers to a particular nested dataframe and
- `nested_col` is a column of that nested dataframe).
- inplace : bool, default False
- Whether to modify the DataFrame rather than creating a new one.
- ignore_index : bool, default ``False``
- If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
-
- .. versionadded:: 2.0.0
-
- Returns
- -------
- DataFrame or None
- DataFrame with NA entries dropped from it or None if ``inplace=True``.
-
- Notes
- -----
- Operations that target a particular nested structure return a dataframe
- with rows of that particular nested structure affected.
-
- Values for `on_nested` and `subset` should be consistent in pointing
- to a single layer, multi-layer operations are not supported at this
- time.
- """
- # propagate meta, assumes row-based operation
- return self.map_partitions(
- lambda x: npd.NestedFrame(x).dropna(
- axis=axis,
- how=how,
- thresh=thresh,
- on_nested=on_nested,
- subset=subset,
- inplace=inplace,
- ignore_index=ignore_index,
- ),
- meta=self._meta, # pylint: disable=protected-access
- )
-
- # NOTE: This is wrapped as a much more restrictive sort_nested_values
- # function in lsdb.catalog
def sort_values(
self,
by: str | list[str],
diff --git a/tests/lsdb/catalog/test_catalog.py b/tests/lsdb/catalog/test_catalog.py
index f38e981d5..5c30c7f68 100644
--- a/tests/lsdb/catalog/test_catalog.py
+++ b/tests/lsdb/catalog/test_catalog.py
@@ -2,7 +2,6 @@
from pathlib import Path
import astropy.units as u
-import dask.array as da
import dask.dataframe as dd
import hats as hc
import hats.pixel_math.healpix_shim as hp
@@ -325,58 +324,6 @@ def test_rename_with_dict(small_sky_xmatch_with_margin):
assert renamed_catalog.margin.columns[i] == f"{col}_{i}"
-def test_assign_no_arguments(small_sky_order1_catalog):
- result_catalog = small_sky_order1_catalog.assign()
- pd.testing.assert_frame_equal(result_catalog._ddf.compute(), small_sky_order1_catalog._ddf.compute())
- assert isinstance(result_catalog._ddf, nd.NestedFrame)
-
-
-def test_assign_with_callable(small_sky_order1_catalog):
- kwargs = {"squared_ra_err": lambda x: x["ra_error"] ** 2}
- result_catalog = small_sky_order1_catalog.assign(**kwargs)
- expected_ddf = small_sky_order1_catalog._ddf.copy()
- expected_ddf["squared_ra_err"] = expected_ddf["ra_error"] ** 2
- pd.testing.assert_frame_equal(result_catalog.compute(), expected_ddf.compute())
- assert isinstance(result_catalog._ddf, nd.NestedFrame)
-
-
-def test_assign_with_series(small_sky_order1_catalog):
- # The series is created from the original dataframe because indices must match
- squared_ra_err = small_sky_order1_catalog._ddf["ra_error"].map(lambda x: x**2)
- kwargs = {"new_column": squared_ra_err}
- result_catalog = small_sky_order1_catalog.assign(**kwargs)
- expected_ddf = small_sky_order1_catalog._ddf.copy()
- expected_ddf["new_column"] = squared_ra_err
- pd.testing.assert_frame_equal(result_catalog.compute(), expected_ddf.compute())
- assert isinstance(result_catalog._ddf, nd.NestedFrame)
-
-
-def test_assign_with_multiple_columns(small_sky_order1_catalog):
- # These series are created from the original dataframe because indices must match
- squared_ra_err = small_sky_order1_catalog._ddf["ra_error"].map(lambda x: x**2)
- squared_dec_err = small_sky_order1_catalog._ddf["dec_error"].map(lambda x: x**2)
- kwargs = {
- "squared_ra_err": squared_ra_err,
- "squared_dec_err": squared_dec_err,
- }
- result_catalog = small_sky_order1_catalog.assign(**kwargs)
- expected_ddf = small_sky_order1_catalog._ddf.copy()
- expected_ddf["squared_ra_err"] = squared_ra_err
- expected_ddf["squared_dec_err"] = squared_dec_err
- pd.testing.assert_frame_equal(result_catalog.compute(), expected_ddf.compute())
-
-
-def test_assign_with_invalid_arguments(small_sky_order1_catalog):
- with pytest.raises(TypeError, match="Column assignment doesn't support type"):
- small_sky_order1_catalog.assign(new_column=[1, 2, 3])
- with pytest.raises(ValueError, match="Array assignment only supports 1-D arrays"):
- small_sky_order1_catalog.assign(new_column=da.ones((10, 10)))
- with pytest.raises(ValueError, match="Number of partitions do not match"):
- chunks = small_sky_order1_catalog._ddf.npartitions + 1
- array = da.random.random(size=10, chunks=chunks)
- small_sky_order1_catalog.assign(new_column=array)
-
-
def test_read_hats(small_sky_catalog, tmp_path):
new_catalog_name = "small_sky"
base_catalog_path = Path(tmp_path) / new_catalog_name
@@ -707,8 +654,6 @@ def test_filtered_catalog_has_undetermined_len(small_sky_order1_catalog, small_s
len(small_sky_order1_catalog.id_search(values={"id": 900}, index_catalogs={"id": catalog_index}))
with pytest.raises(ValueError, match="undetermined"):
len(small_sky_order1_catalog.pixel_search([(0, 11)]))
- with pytest.raises(ValueError, match="undetermined"):
- len(small_sky_order1_catalog.dropna())
@pytest.mark.sphgeom
diff --git a/tests/lsdb/catalog/test_nested.py b/tests/lsdb/catalog/test_nested.py
index 49aeb5173..31b87b4a0 100644
--- a/tests/lsdb/catalog/test_nested.py
+++ b/tests/lsdb/catalog/test_nested.py
@@ -7,43 +7,13 @@
import lsdb
import lsdb.nested as nd
-from lsdb import Catalog, MarginCatalog
+from lsdb import Catalog
def test_nested_columns_property(small_sky_with_nested_sources):
assert list(small_sky_with_nested_sources.nested_columns) == ["sources"]
-def test_dropna(small_sky_with_nested_sources):
- filtered_cat = small_sky_with_nested_sources.query("sources.mag < 15.1")
- drop_na_cat = filtered_cat.dropna()
- assert isinstance(drop_na_cat, Catalog)
- assert isinstance(drop_na_cat._ddf, nd.NestedFrame)
- drop_na_compute = drop_na_cat.compute()
- assert isinstance(drop_na_compute, npd.NestedFrame)
- filtered_compute = filtered_cat.compute()
- assert len(drop_na_compute) < len(filtered_compute)
- pd.testing.assert_frame_equal(drop_na_compute, filtered_compute.dropna())
-
-
-def test_dropna_on_nested(small_sky_with_nested_sources):
- def add_na_values_nested(df):
- """replaces the first source_ra value in each nested df with NaN"""
- for i in range(len(df)):
- first_ra_value = df.iloc[i]["sources"].iloc[0]["source_ra"]
- df["sources"].array[i] = df["sources"].array[i].replace(first_ra_value, np.nan)
- return df
-
- filtered_cat = small_sky_with_nested_sources.map_partitions(add_na_values_nested)
- drop_na_cat = filtered_cat.dropna(on_nested="sources")
- assert isinstance(drop_na_cat, Catalog)
- assert isinstance(drop_na_cat._ddf, nd.NestedFrame)
- drop_na_sources_compute = drop_na_cat["sources"].compute()
- filtered_sources_compute = filtered_cat["sources"].compute()
- assert len(drop_na_sources_compute) == len(filtered_sources_compute)
- assert sum(map(len, drop_na_sources_compute)) < sum(map(len, filtered_sources_compute))
-
-
def test_nest_lists(small_sky_with_nested_sources):
"""Test the behavior of catalog.nest_lists"""
cat_ndf = small_sky_with_nested_sources._ddf.map_partitions(
@@ -227,40 +197,6 @@ def mean_mag(ra, dec, mag):
assert list(res_false.columns) == ["new_nested.ra_mag", "new_nested.dec_mag"]
-def test_sort_nested_values(small_sky_with_nested_sources):
- # Sorting on nested "mjd" source column, in descending order
- sorted_nested = small_sky_with_nested_sources.sort_nested_values(by="sources.mjd", ascending=False)
- assert isinstance(sorted_nested, Catalog)
- unsorted_source = small_sky_with_nested_sources["sources"].compute()
- sorted_source = sorted_nested["sources"].compute()
- for i in range(len(unsorted_source)):
- expected_mjd = sorted(unsorted_source.iloc[i]["mjd"], reverse=True)
- assert expected_mjd == sorted_source.iloc[i]["mjd"].values.tolist()
- expected_schema = small_sky_with_nested_sources.hc_structure.schema
- assert expected_schema.equals(sorted_nested.hc_structure.schema)
-
-
-def test_sort_nested_values_with_margin(small_sky_with_nested_sources_with_margin):
- # Sorting values in nested column also sorts catalog margin
- sorted_nested = small_sky_with_nested_sources_with_margin.sort_nested_values(
- by="sources.mjd", ascending=False
- )
- assert isinstance(sorted_nested, Catalog)
- assert isinstance(sorted_nested.margin, MarginCatalog)
- unsorted_source = small_sky_with_nested_sources_with_margin.margin["sources"].compute()
- sorted_source = sorted_nested.margin["sources"].compute()
- for i in range(len(unsorted_source)):
- expected_mjd = sorted(unsorted_source.iloc[i]["mjd"], reverse=True)
- assert expected_mjd == sorted_source.iloc[i]["mjd"].values.tolist()
- expected_schema = small_sky_with_nested_sources_with_margin.margin.hc_structure.schema
- assert expected_schema.equals(sorted_nested.margin.hc_structure.schema)
-
-
-def test_sort_nested_values_using_base_column(small_sky_with_nested_sources):
- with pytest.raises(ValueError, match="nested columns"):
- small_sky_with_nested_sources.sort_nested_values(by="ra")
-
-
def test_serialization_read(small_sky_with_nested_sources):
assert isinstance(small_sky_with_nested_sources.dtypes["sources"], NestedDtype)
diff --git a/tests/lsdb/nested/test_nestedframe.py b/tests/lsdb/nested/test_nestedframe.py
index 27c261e7c..d493c7b7e 100644
--- a/tests/lsdb/nested/test_nestedframe.py
+++ b/tests/lsdb/nested/test_nestedframe.py
@@ -291,23 +291,6 @@ def test_query_on_nested(test_dataset):
assert len(res) == 50 # make sure the base df remains unchanged
-def test_dropna(test_dataset_with_nans):
- """test the dropna function"""
-
- nan_free_base = test_dataset_with_nans.dropna(subset=["a"])
- # should just remove one row
- assert len(nan_free_base) == len(test_dataset_with_nans) - 1
-
- meta = test_dataset_with_nans.loc[0].head(0).nested.nest.to_flat()
-
- nan_free_nested = test_dataset_with_nans.dropna(subset=["nested.t"])
-
- flat_nested_nan_free = nan_free_nested.map_partitions(lambda x: x.nested.nest.to_flat(), meta=meta)
- flat_nested = test_dataset_with_nans.map_partitions(lambda x: x.nested.nest.to_flat(), meta=meta)
- # should just remove one row
- assert len(flat_nested_nan_free) == len(flat_nested) - 1
-
-
def test_sort_values(test_dataset):
"""test the sort_values function"""
| | | | | | | | | | | | | |