From c131bf09b60f395440afed4e1fdbaadedeb46726 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 27 Aug 2025 08:27:25 +0530 Subject: [PATCH 01/28] Use typed lists from biocutils for compressed lists --- src/compressed_lists/__init__.py | 8 +- .../{CompressedList.py => base.py} | 2 +- src/compressed_lists/bool_list.py | 99 +++++++++++++++++++ src/compressed_lists/float_list.py | 99 +++++++++++++++++++ ...mpressedIntegerList.py => integer_list.py} | 23 +++-- ...CompressedStringList.py => string_list.py} | 20 +++- tests/test_comp_bool.py | 33 +++++++ tests/test_comp_float.py | 33 +++++++ tests/test_comp_int.py | 54 +++++----- tests/test_comp_str.py | 15 +-- 10 files changed, 336 insertions(+), 50 deletions(-) rename src/compressed_lists/{CompressedList.py => base.py} (99%) create mode 100644 src/compressed_lists/bool_list.py create mode 100644 src/compressed_lists/float_list.py rename src/compressed_lists/{CompressedIntegerList.py => integer_list.py} (75%) rename src/compressed_lists/{CompressedStringList.py => string_list.py} (77%) create mode 100644 tests/test_comp_bool.py create mode 100644 tests/test_comp_float.py diff --git a/src/compressed_lists/__init__.py b/src/compressed_lists/__init__.py index b320c98..5d615de 100644 --- a/src/compressed_lists/__init__.py +++ b/src/compressed_lists/__init__.py @@ -16,6 +16,8 @@ del version, PackageNotFoundError from .partition import Partitioning -from .CompressedList import CompressedList -from .CompressedIntegerList import CompressedIntegerList -from .CompressedStringList import CompressedStringList +from .base import CompressedList +from .integer_list import CompressedIntegerList +from .string_list import CompressedStringList, CompressedCharacterList +from .bool_list import CompressedBooleanList +from .float_list import CompressedFloatList \ No newline at end of file diff --git a/src/compressed_lists/CompressedList.py b/src/compressed_lists/base.py similarity index 99% rename from 
src/compressed_lists/CompressedList.py rename to src/compressed_lists/base.py index 0f629bd..8422756 100644 --- a/src/compressed_lists/CompressedList.py +++ b/src/compressed_lists/base.py @@ -43,7 +43,7 @@ def __init__( Vector-like object containing concatenated elements. partitioning: - Partitioning object defining element boundaries. + Partitioning object defining element boundaries (exclusive). element_type: String identifier for the type of elements. diff --git a/src/compressed_lists/bool_list.py b/src/compressed_lists/bool_list.py new file mode 100644 index 0000000..4b1ac4f --- /dev/null +++ b/src/compressed_lists/bool_list.py @@ -0,0 +1,99 @@ +from typing import List, Optional, Sequence + +from biocutils.BooleanList import BooleanList + +from .base import CompressedList +from .partition import Partitioning + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +class CompressedBooleanList(CompressedList): + """CompressedList implementation for lists of integers.""" + + def __init__( + self, + unlist_data: BooleanList, + partitioning: Partitioning, + element_metadata: dict = None, + metadata: dict = None, + **kwargs, + ): + """Initialize a CompressedIntegerList. + + Args: + unlist_data: + List of booleans. + + partitioning: + Partitioning object defining element boundaries. + + element_metadata: + Optional metadata for elements. + + metadata: + Optional general metadata. + + kwargs: + Additional arguments. + """ + + if not isinstance(unlist_data, BooleanList): + try: + unlist_data = BooleanList(unlist_data) + except Exception as e: + raise TypeError("'unlist_data' must be an `BooleanList`, provided ", type(unlist_data)) from e + + super().__init__( + unlist_data, partitioning, element_type="integer", element_metadata=element_metadata, metadata=metadata + ) + + def _extract_range(self, start: int, end: int) -> BooleanList: + """Extract a range from unlist_data. + + Args: + start: + Start index (inclusive). 
+ + end: + End index (exclusive). + + Returns: + Same type as unlist_data. + """ + return self._unlist_data[start:end] + + @classmethod + def from_list( + cls, lst: List[List[bool]], names: Optional[Sequence[str]] = None, metadata: dict = None + ) -> "CompressedBooleanList": + """ + Create a `CompressedBooleanList` from a list of integer lists. + + Args: + lst: + List of integer lists. + + names: + Optional names for list elements. + + metadata: + Optional metadata. + + Returns: + A new `CompressedBooleanList`. + """ + # Flatten the list + flat_data = [] + for sublist in lst: + flat_data.extend(sublist) + + # Create partitioning + partitioning = Partitioning.from_list(lst, names) + + # Create unlist_data + unlist_data = BooleanList(data=flat_data) + + return cls(unlist_data, partitioning, metadata=metadata) diff --git a/src/compressed_lists/float_list.py b/src/compressed_lists/float_list.py new file mode 100644 index 0000000..2a87d53 --- /dev/null +++ b/src/compressed_lists/float_list.py @@ -0,0 +1,99 @@ +from typing import List, Optional, Sequence + +from biocutils.FloatList import FloatList + +from .base import CompressedList +from .partition import Partitioning + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +class CompressedFloatList(CompressedList): + """CompressedList implementation for lists of integers.""" + + def __init__( + self, + unlist_data: FloatList, + partitioning: Partitioning, + element_metadata: dict = None, + metadata: dict = None, + **kwargs, + ): + """Initialize a CompressedIntegerList. + + Args: + unlist_data: + List of floats. + + partitioning: + Partitioning object defining element boundaries. + + element_metadata: + Optional metadata for elements. + + metadata: + Optional general metadata. + + kwargs: + Additional arguments. 
+ """ + + if not isinstance(unlist_data, FloatList): + try: + unlist_data = FloatList(unlist_data) + except Exception as e: + raise TypeError("'unlist_data' must be an `FloatList`, provided ", type(unlist_data)) from e + + super().__init__( + unlist_data, partitioning, element_type="integer", element_metadata=element_metadata, metadata=metadata + ) + + def _extract_range(self, start: int, end: int) -> FloatList: + """Extract a range from unlist_data. + + Args: + start: + Start index (inclusive). + + end: + End index (exclusive). + + Returns: + Same type as unlist_data. + """ + return self._unlist_data[start:end] + + @classmethod + def from_list( + cls, lst: List[List[bool]], names: Optional[Sequence[str]] = None, metadata: dict = None + ) -> "CompressedFloatList": + """ + Create a `CompressedFloatList` from a list of integer lists. + + Args: + lst: + List of integer lists. + + names: + Optional names for list elements. + + metadata: + Optional metadata. + + Returns: + A new `CompressedFloatList`. 
+ """ + # Flatten the list + flat_data = [] + for sublist in lst: + flat_data.extend(sublist) + + # Create partitioning + partitioning = Partitioning.from_list(lst, names) + + # Create unlist_data + unlist_data = FloatList(data=flat_data) + + return cls(unlist_data, partitioning, metadata=metadata) diff --git a/src/compressed_lists/CompressedIntegerList.py b/src/compressed_lists/integer_list.py similarity index 75% rename from src/compressed_lists/CompressedIntegerList.py rename to src/compressed_lists/integer_list.py index 5d3bae9..678b80b 100644 --- a/src/compressed_lists/CompressedIntegerList.py +++ b/src/compressed_lists/integer_list.py @@ -1,8 +1,8 @@ from typing import List, Optional, Sequence -import numpy as np +from biocutils.IntegerList import IntegerList -from .CompressedList import CompressedList +from .base import CompressedList from .partition import Partitioning __author__ = "Jayaram Kancherla" @@ -15,7 +15,7 @@ class CompressedIntegerList(CompressedList): def __init__( self, - unlist_data: np.ndarray, + unlist_data: IntegerList, partitioning: Partitioning, element_metadata: dict = None, metadata: dict = None, @@ -25,7 +25,7 @@ def __init__( Args: unlist_data: - NumPy array of integers. + List of integers. partitioning: Partitioning object defining element boundaries. @@ -39,11 +39,18 @@ def __init__( kwargs: Additional arguments. """ + + if not isinstance(unlist_data, IntegerList): + try: + unlist_data = IntegerList(unlist_data) + except Exception as e: + raise TypeError("'unlist_data' must be an `IntegerList`, provided ", type(unlist_data)) from e + super().__init__( unlist_data, partitioning, element_type="integer", element_metadata=element_metadata, metadata=metadata ) - def _extract_range(self, start: int, end: int) -> np.ndarray: + def _extract_range(self, start: int, end: int) -> IntegerList: """Extract a range from unlist_data. 
Args: @@ -63,7 +70,7 @@ def from_list( cls, lst: List[List[int]], names: Optional[Sequence[str]] = None, metadata: dict = None ) -> "CompressedIntegerList": """ - Create a CompressedIntegerList from a list of integer lists. + Create a `CompressedIntegerList` from a list of integer lists. Args: lst: @@ -76,7 +83,7 @@ def from_list( Optional metadata. Returns: - A new CompressedIntegerList. + A new `CompressedIntegerList`. """ # Flatten the list flat_data = [] @@ -87,6 +94,6 @@ def from_list( partitioning = Partitioning.from_list(lst, names) # Create unlist_data - unlist_data = np.array(flat_data, dtype=np.int64) + unlist_data = IntegerList(data=flat_data) return cls(unlist_data, partitioning, metadata=metadata) diff --git a/src/compressed_lists/CompressedStringList.py b/src/compressed_lists/string_list.py similarity index 77% rename from src/compressed_lists/CompressedStringList.py rename to src/compressed_lists/string_list.py index 88cce67..863229c 100644 --- a/src/compressed_lists/CompressedStringList.py +++ b/src/compressed_lists/string_list.py @@ -1,6 +1,8 @@ from typing import List, Optional, Sequence -from .CompressedList import CompressedList +from biocutils.StringList import StringList + +from .base import CompressedList from .partition import Partitioning __author__ = "Jayaram Kancherla" @@ -13,7 +15,7 @@ class CompressedStringList(CompressedList): def __init__( self, - unlist_data: List[str], + unlist_data: StringList, partitioning: Partitioning, element_metadata: dict = None, metadata: dict = None, @@ -37,11 +39,17 @@ def __init__( kwargs: Additional arguments. 
""" + if not isinstance(unlist_data, StringList): + try: + unlist_data = StringList(unlist_data) + except Exception as e: + raise TypeError("'unlist_data' must be an `StringList`, provided ", type(unlist_data)) from e + super().__init__( unlist_data, partitioning, element_type="string", element_metadata=element_metadata, metadata=metadata ) - def _extract_range(self, start: int, end: int) -> List[str]: + def _extract_range(self, start: int, end: int) -> StringList: """Extract a range from unlist_data. Args: @@ -83,4 +91,8 @@ def from_list( # Create partitioning partitioning = Partitioning.from_list(lst, names) - return cls(flat_data, partitioning, metadata=metadata) + return cls(StringList(flat_data), partitioning, metadata=metadata) + + +class CompressedCharacterList(CompressedStringList): + pass diff --git a/tests/test_comp_bool.py b/tests/test_comp_bool.py new file mode 100644 index 0000000..077ae35 --- /dev/null +++ b/tests/test_comp_bool.py @@ -0,0 +1,33 @@ +import pytest +from biocutils.BooleanList import BooleanList + +from compressed_lists import CompressedBooleanList + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +@pytest.fixture +def bool_data(): + return [[True, False], [False, True, False], [False]] + + +@pytest.fixture +def bool_list(bool_data): + names = ["fruits1", "fruits2", "fruits3"] + return CompressedBooleanList.from_list(bool_data, names) + + +def test_creation(bool_data): + bool_list = CompressedBooleanList.from_list(bool_data) + + assert len(bool_list) == 3 + assert isinstance(bool_list.unlist_data, BooleanList) + assert list(bool_list.get_unlist_data()) == [True, False, False, True, False, False] + assert list(bool_list.get_element_lengths()) == [2, 3, 1] + + +def test_getitem(bool_list): + assert list(bool_list[0]) == [True, False] + assert list(bool_list["fruits2"]) == [False, True, False] diff --git a/tests/test_comp_float.py b/tests/test_comp_float.py new file mode 100644 index 
0000000..1913508 --- /dev/null +++ b/tests/test_comp_float.py @@ -0,0 +1,33 @@ +import pytest +from biocutils.FloatList import FloatList + +from compressed_lists import CompressedFloatList + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +@pytest.fixture +def float_data(): + return [[1.1, 1.2], [2.1, 2.2, 2.3], [3]] + + +@pytest.fixture +def float_list(float_data): + names = ["fruits1", "fruits2", "fruits3"] + return CompressedFloatList.from_list(float_data, names) + + +def test_creation(float_data): + float_list = CompressedFloatList.from_list(float_data) + + assert len(float_list) == 3 + assert isinstance(float_list.unlist_data, FloatList) + assert list(float_list.get_unlist_data()) == [1.1, 1.2, 2.1, 2.2, 2.3, 3.0] + assert list(float_list.get_element_lengths()) == [2, 3, 1] + + +def test_getitem(float_list): + assert list(float_list[0]) == [1.1, 1.2] + assert list(float_list["fruits2"]) == [2.1, 2.2, 2.3] diff --git a/tests/test_comp_int.py b/tests/test_comp_int.py index 5771031..daac5d5 100644 --- a/tests/test_comp_int.py +++ b/tests/test_comp_int.py @@ -1,5 +1,6 @@ import numpy as np import pytest +from biocutils.IntegerList import IntegerList from compressed_lists import CompressedIntegerList, Partitioning @@ -23,16 +24,16 @@ def test_creation(int_data): int_list = CompressedIntegerList.from_list(int_data) assert len(int_list) == 3 - assert isinstance(int_list.unlist_data, np.ndarray) + assert isinstance(int_list.unlist_data, IntegerList) assert list(int_list.get_unlist_data()) == [1, 2, 3, 4, 5, 6, 7, 8, 9] assert list(int_list.get_element_lengths()) == [3, 2, 4] def test_creation_from_parts(): - int_list = CompressedIntegerList(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]), Partitioning(ends=[3, 5, 9])) + int_list = CompressedIntegerList([1, 2, 3, 4, 5, 6, 7, 8, 9], Partitioning(ends=[3, 5, 9])) assert len(int_list) == 3 - assert isinstance(int_list.unlist_data, np.ndarray) + assert isinstance(int_list.unlist_data, 
IntegerList) assert list(int_list.get_unlist_data()) == [1, 2, 3, 4, 5, 6, 7, 8, 9] assert list(int_list.get_element_lengths()) == [3, 2, 4] @@ -45,7 +46,7 @@ def test_creation_with_names(int_data): def test_validation(): - data = np.array([1, 2, 3, 4, 5]) + data = IntegerList([1, 2, 3, 4, 5]) partitioning = Partitioning([2, 4, 7]) with pytest.raises(ValueError): @@ -53,20 +54,19 @@ def test_validation(): def test_getitem_by_index(int_list): - assert np.allclose(int_list[0], [1, 2, 3]) - assert np.allclose(int_list[1], [4, 5]) - assert np.allclose(int_list[2], [6, 7, 8, 9]) - - assert np.allclose(int_list[-1], [6, 7, 8, 9]) + assert np.allclose(list(int_list[0]), [1, 2, 3]) + assert np.allclose(list(int_list[1]), [4, 5]) + assert np.allclose(list(int_list[2]), [6, 7, 8, 9]) + assert np.allclose(list(int_list[-1]), [6, 7, 8, 9]) with pytest.raises(IndexError): int_list[3] def test_getitem_by_name(int_list): - assert np.allclose(int_list["A"], [1, 2, 3]) - assert np.allclose(int_list["B"], [4, 5]) - assert np.allclose(int_list["C"], [6, 7, 8, 9]) + assert np.allclose(list(int_list["A"]), [1, 2, 3]) + assert np.allclose(list(int_list["B"]), [4, 5]) + assert np.allclose(list(int_list["C"]), [6, 7, 8, 9]) with pytest.raises(KeyError): int_list["D"] @@ -76,8 +76,8 @@ def test_getitem_by_slice(int_list): sliced = int_list[1:3] assert len(sliced) == 2 - assert np.allclose(sliced[0], [4, 5]) - assert np.allclose(sliced[1], [6, 7, 8, 9]) + assert np.allclose(list(sliced[0]), [4, 5]) + assert np.allclose(list(sliced[1]), [6, 7, 8, 9]) assert list(sliced.names) == ["B", "C"] # Empty slice @@ -89,41 +89,41 @@ def test_iteration(int_list, int_data): items = list(int_list) print(items, int_data) for i, lst in enumerate(items): - assert np.allclose(lst, int_data[i]) + assert np.allclose(list(lst), int_data[i]) def test_to_list(int_list, int_data): regular_list = int_list.to_list() for i, lst in enumerate(regular_list): - assert np.allclose(lst, int_data[i]) + assert 
np.allclose(list(lst), int_data[i]) def test_unlist(int_list): unlisted = int_list.unlist() - assert isinstance(unlisted, np.ndarray) + assert isinstance(unlisted, IntegerList) assert np.allclose(list(unlisted), [1, 2, 3, 4, 5, 6, 7, 8, 9]) def test_relist(int_list): - new_data = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90], dtype=np.int64) + new_data = IntegerList([10, 20, 30, 40, 50, 60, 70, 80, 90]) relisted = int_list.relist(new_data) assert len(relisted) == len(int_list) assert list(relisted.get_names()) == list(int_list.names) - assert np.allclose(relisted[0], [10, 20, 30]) - assert np.allclose(relisted[1], [40, 50]) - assert np.allclose(relisted[2], [60, 70, 80, 90]) + assert np.allclose(list(relisted[0]), [10, 20, 30]) + assert np.allclose(list(relisted[1]), [40, 50]) + assert np.allclose(list(relisted[2]), [60, 70, 80, 90]) with pytest.raises(ValueError): - int_list.relist(np.array([1, 2, 3])) + int_list.relist(IntegerList([1, 2, 3])) def test_extract_subset(int_list): subset = int_list.extract_subset([0, 2]) assert len(subset) == 2 - assert np.allclose(subset[0], [1, 2, 3]) - assert np.allclose(subset[1], [6, 7, 8, 9]) + assert np.allclose(list(subset[0]), [1, 2, 3]) + assert np.allclose(list(subset[1]), [6, 7, 8, 9]) assert list(subset.names) == ["A", "C"] with pytest.raises(IndexError): @@ -134,6 +134,6 @@ def test_lapply(int_list): squared = int_list.lapply(lambda x: [i**2 for i in x]) assert len(squared) == len(int_list) - assert np.allclose(squared[0], [1, 4, 9]) - assert np.allclose(squared[1], [16, 25]) - assert np.allclose(squared[2], [36, 49, 64, 81]) + assert np.allclose(list(squared[0]), [1, 4, 9]) + assert np.allclose(list(squared[1]), [16, 25]) + assert np.allclose(list(squared[2]), [36, 49, 64, 81]) diff --git a/tests/test_comp_str.py b/tests/test_comp_str.py index ef54f8c..d65adbd 100644 --- a/tests/test_comp_str.py +++ b/tests/test_comp_str.py @@ -1,4 +1,5 @@ import pytest +from biocutils.StringList import StringList from compressed_lists 
import CompressedStringList @@ -22,20 +23,20 @@ def test_creation(char_data): char_list = CompressedStringList.from_list(char_data) assert len(char_list) == 3 - assert isinstance(char_list.unlist_data, list) - assert char_list.get_unlist_data() == ["apple", "banana", "cherry", "date", "elderberry", "fig"] + assert isinstance(char_list.unlist_data, StringList) + assert list(char_list.get_unlist_data()) == ["apple", "banana", "cherry", "date", "elderberry", "fig"] assert list(char_list.get_element_lengths()) == [2, 3, 1] def test_getitem(char_list): - assert char_list[0] == ["apple", "banana"] - assert char_list["fruits2"] == ["cherry", "date", "elderberry"] + assert list(char_list[0]) == ["apple", "banana"] + assert list(char_list["fruits2"]) == ["cherry", "date", "elderberry"] def test_lapply(char_list): uppercased = char_list.lapply(lambda x: [s.upper() for s in x]) assert len(uppercased) == len(char_list) - assert uppercased[0] == ["APPLE", "BANANA"] - assert uppercased[1] == ["CHERRY", "DATE", "ELDERBERRY"] - assert uppercased[2] == ["FIG"] + assert list(uppercased[0]) == ["APPLE", "BANANA"] + assert list(uppercased[1]) == ["CHERRY", "DATE", "ELDERBERRY"] + assert list(uppercased[2]) == ["FIG"] From 1d01f858431e4b5c28954850d04191e791350b33 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 27 Aug 2025 08:28:17 +0530 Subject: [PATCH 02/28] Update changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 792a416..fb081ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## Version 0.2.0 + +- Switch to typed lists from the biocutils package. + ## Version 0.1.0 - 0.1.1 - Initial implementation of various classes - Partitioning and CompressedLists. 
From c475591c7e644ffa32cb158badfa6c612f31763d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 27 Aug 2025 02:58:59 +0000 Subject: [PATCH 03/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/compressed_lists/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compressed_lists/__init__.py b/src/compressed_lists/__init__.py index 5d615de..baa8bb1 100644 --- a/src/compressed_lists/__init__.py +++ b/src/compressed_lists/__init__.py @@ -20,4 +20,4 @@ from .integer_list import CompressedIntegerList from .string_list import CompressedStringList, CompressedCharacterList from .bool_list import CompressedBooleanList -from .float_list import CompressedFloatList \ No newline at end of file +from .float_list import CompressedFloatList From 4b9ca3d625d743c83dbac79631b98e2fa5b9b1d1 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 27 Aug 2025 08:30:37 +0530 Subject: [PATCH 04/28] fix element type --- src/compressed_lists/bool_list.py | 2 +- src/compressed_lists/float_list.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compressed_lists/bool_list.py b/src/compressed_lists/bool_list.py index 4b1ac4f..deb16ca 100644 --- a/src/compressed_lists/bool_list.py +++ b/src/compressed_lists/bool_list.py @@ -47,7 +47,7 @@ def __init__( raise TypeError("'unlist_data' must be an `BooleanList`, provided ", type(unlist_data)) from e super().__init__( - unlist_data, partitioning, element_type="integer", element_metadata=element_metadata, metadata=metadata + unlist_data, partitioning, element_type="boolean", element_metadata=element_metadata, metadata=metadata ) def _extract_range(self, start: int, end: int) -> BooleanList: diff --git a/src/compressed_lists/float_list.py b/src/compressed_lists/float_list.py index 2a87d53..4ca3c60 100644 --- a/src/compressed_lists/float_list.py +++ 
b/src/compressed_lists/float_list.py @@ -47,7 +47,7 @@ def __init__( raise TypeError("'unlist_data' must be an `FloatList`, provided ", type(unlist_data)) from e super().__init__( - unlist_data, partitioning, element_type="integer", element_metadata=element_metadata, metadata=metadata + unlist_data, partitioning, element_type="float", element_metadata=element_metadata, metadata=metadata ) def _extract_range(self, start: int, end: int) -> FloatList: From 220aad0e840ad1ac71c89fba626b01d8c4fcd93b Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 27 Aug 2025 08:54:53 +0530 Subject: [PATCH 05/28] Compressed NumPy lis --- setup.cfg | 1 + src/compressed_lists/__init__.py | 1 + src/compressed_lists/bool_list.py | 8 +-- src/compressed_lists/float_list.py | 10 +-- src/compressed_lists/numpy_list.py | 100 ++++++++++++++++++++++++++++ tests/test_comp_numpy.py | 103 +++++++++++++++++++++++++++++ 6 files changed, 214 insertions(+), 9 deletions(-) create mode 100644 src/compressed_lists/numpy_list.py create mode 100644 tests/test_comp_numpy.py diff --git a/setup.cfg b/setup.cfg index 01e5b74..72798f5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,6 +50,7 @@ python_requires = >=3.9 install_requires = importlib-metadata; python_version<"3.8" biocutils + numpy [options.packages.find] diff --git a/src/compressed_lists/__init__.py b/src/compressed_lists/__init__.py index baa8bb1..7471663 100644 --- a/src/compressed_lists/__init__.py +++ b/src/compressed_lists/__init__.py @@ -21,3 +21,4 @@ from .string_list import CompressedStringList, CompressedCharacterList from .bool_list import CompressedBooleanList from .float_list import CompressedFloatList +from .numpy_list import CompressedNumpyList \ No newline at end of file diff --git a/src/compressed_lists/bool_list.py b/src/compressed_lists/bool_list.py index deb16ca..2d1e429 100644 --- a/src/compressed_lists/bool_list.py +++ b/src/compressed_lists/bool_list.py @@ -11,7 +11,7 @@ class CompressedBooleanList(CompressedList): - 
"""CompressedList implementation for lists of integers.""" + """CompressedList implementation for lists of booleans.""" def __init__( self, @@ -21,7 +21,7 @@ def __init__( metadata: dict = None, **kwargs, ): - """Initialize a CompressedIntegerList. + """Initialize a CompressedBooleanList. Args: unlist_data: @@ -70,11 +70,11 @@ def from_list( cls, lst: List[List[bool]], names: Optional[Sequence[str]] = None, metadata: dict = None ) -> "CompressedBooleanList": """ - Create a `CompressedBooleanList` from a list of integer lists. + Create a `CompressedBooleanList` from a list of boolean lists. Args: lst: - List of integer lists. + List of boolean lists. names: Optional names for list elements. diff --git a/src/compressed_lists/float_list.py b/src/compressed_lists/float_list.py index 4ca3c60..a694a8a 100644 --- a/src/compressed_lists/float_list.py +++ b/src/compressed_lists/float_list.py @@ -11,7 +11,7 @@ class CompressedFloatList(CompressedList): - """CompressedList implementation for lists of integers.""" + """CompressedList implementation for lists of floats.""" def __init__( self, @@ -21,7 +21,7 @@ def __init__( metadata: dict = None, **kwargs, ): - """Initialize a CompressedIntegerList. + """Initialize a CompressedFloatList. Args: unlist_data: @@ -67,14 +67,14 @@ def _extract_range(self, start: int, end: int) -> FloatList: @classmethod def from_list( - cls, lst: List[List[bool]], names: Optional[Sequence[str]] = None, metadata: dict = None + cls, lst: List[List[float]], names: Optional[Sequence[str]] = None, metadata: dict = None ) -> "CompressedFloatList": """ - Create a `CompressedFloatList` from a list of integer lists. + Create a `CompressedFloatList` from a list of float lists. Args: lst: - List of integer lists. + List of float lists. names: Optional names for list elements. 
diff --git a/src/compressed_lists/numpy_list.py b/src/compressed_lists/numpy_list.py new file mode 100644 index 0000000..018fd7a --- /dev/null +++ b/src/compressed_lists/numpy_list.py @@ -0,0 +1,100 @@ +from typing import List, Optional, Sequence + +import numpy as np +from biocutils.IntegerList import IntegerList + +from .base import CompressedList +from .partition import Partitioning + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +class CompressedNumpyList(CompressedList): + """CompressedList implementation for lists of NumPy vectors.""" + + def __init__( + self, + unlist_data: np.ndarray, + partitioning: Partitioning, + element_metadata: dict = None, + metadata: dict = None, + **kwargs, + ): + """Initialize a CompressedNumpyList. + + Args: + unlist_data: + NumPy vector. + + partitioning: + Partitioning object defining element boundaries. + + element_metadata: + Optional metadata for elements. + + metadata: + Optional general metadata. + + kwargs: + Additional arguments. + """ + + if not isinstance(unlist_data, np.ndarray): + try: + unlist_data = np.array(unlist_data) + except Exception as e: + raise TypeError("'unlist_data' must be an `np.ndarray`, provided ", type(unlist_data)) from e + + print(unlist_data) + super().__init__( + unlist_data, partitioning, element_type="ndarray", element_metadata=element_metadata, metadata=metadata + ) + + def _extract_range(self, start: int, end: int) -> np.ndarray: + """Extract a range from unlist_data. + + Args: + start: + Start index (inclusive). + + end: + End index (exclusive). + + Returns: + Same type as unlist_data. + """ + return self._unlist_data[start:end] + + @classmethod + def from_list( + cls, lst: List[List[np.ndarray]], names: Optional[Sequence[str]] = None, metadata: dict = None + ) -> "CompressedNumpyList": + """ + Create a `CompressedNumpyList` from a list of NumPy vectors. + + Args: + lst: + List of NumPy vectors. 
+ + names: + Optional names for list elements. + + metadata: + Optional metadata. + + Returns: + A new `CompressedNumpyList`. + """ + + # Create partitioning + partitioning = Partitioning.from_list(lst, names) + + # Create unlist_data + if len(lst) == 0: + unlist_data = np.array([]) + else: + unlist_data = np.hstack(lst) + + return cls(unlist_data, partitioning, metadata=metadata) diff --git a/tests/test_comp_numpy.py b/tests/test_comp_numpy.py new file mode 100644 index 0000000..aebffb0 --- /dev/null +++ b/tests/test_comp_numpy.py @@ -0,0 +1,103 @@ +import numpy as np +import pytest + +from compressed_lists import CompressedNumpyList, Partitioning + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +@pytest.fixture +def numpy_data(): + return [np.array([1, 2, 3]), np.array([4, 5]), np.array([6, 7, 8, 9])] + + +@pytest.fixture +def numpy_list(numpy_data): + names = ["A", "B", "C"] + return CompressedNumpyList.from_list(numpy_data, names) + + +def test_creation(numpy_data): + numpy_list = CompressedNumpyList.from_list(numpy_data) + + assert len(numpy_list) == 3 + assert isinstance(numpy_list.unlist_data, np.ndarray) + assert list(numpy_list.get_unlist_data()) == [1, 2, 3, 4, 5, 6, 7, 8, 9] + assert list(numpy_list.get_element_lengths()) == [3, 2, 4] + + +def test_creation_from_parts(): + numpy_list = CompressedNumpyList([1, 2, 3, 4, 5, 6, 7, 8, 9], Partitioning(ends=[3, 5, 9])) + + assert len(numpy_list) == 3 + assert isinstance(numpy_list.unlist_data, np.ndarray) + assert list(numpy_list.get_unlist_data()) == [1, 2, 3, 4, 5, 6, 7, 8, 9] + assert list(numpy_list.get_element_lengths()) == [3, 2, 4] + + +def test_creation_with_names(numpy_data): + names = ["A", "B", "C"] + numpy_list = CompressedNumpyList.from_list(numpy_data, names) + + assert list(numpy_list.names) == names + + +def test_validation(): + data = np.array([1, 2, 3, 4, 5]) + partitioning = Partitioning([2, 4, 7]) + + with pytest.raises(ValueError): + 
CompressedNumpyList(data, partitioning) + + +def test_getitem_by_index(numpy_list): + assert np.allclose(numpy_list[0], [1, 2, 3]) + assert np.allclose(numpy_list[1], [4, 5]) + assert np.allclose(numpy_list[2], [6, 7, 8, 9]) + assert np.allclose(numpy_list[-1], [6, 7, 8, 9]) + + with pytest.raises(IndexError): + numpy_list[3] + + +def test_getitem_by_name(numpy_list): + assert np.allclose(numpy_list["A"], [1, 2, 3]) + assert np.allclose(numpy_list["B"], [4, 5]) + assert np.allclose(numpy_list["C"], [6, 7, 8, 9]) + + with pytest.raises(KeyError): + numpy_list["D"] + + +def test_getitem_by_slice(numpy_list): + sliced = numpy_list[1:3] + + assert len(sliced) == 2 + assert np.allclose(sliced[0], [4, 5]) + assert np.allclose(sliced[1], [6, 7, 8, 9]) + assert list(sliced.names) == ["B", "C"] + + # Empty slice + empty = numpy_list[3:4] + assert len(empty) == 0 + + +def test_iteration(numpy_list, numpy_data): + items = list(numpy_list) + print(items, numpy_data) + for i, lst in enumerate(items): + assert np.allclose(lst, numpy_data[i]) + + +def test_to_list(numpy_list, numpy_data): + regular_list = numpy_list.to_list() + for i, lst in enumerate(regular_list): + assert np.allclose(list(lst), numpy_data[i]) + + +def test_unlist(numpy_list): + unlisted = numpy_list.unlist() + assert isinstance(unlisted, np.ndarray) + assert np.allclose(unlisted, [1, 2, 3, 4, 5, 6, 7, 8, 9]) From e9f5fc4861ea2b38e97128a6105be28eec849439 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 27 Aug 2025 03:25:19 +0000 Subject: [PATCH 06/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/compressed_lists/__init__.py | 2 +- src/compressed_lists/numpy_list.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/compressed_lists/__init__.py b/src/compressed_lists/__init__.py index 7471663..3715ce0 100644 --- a/src/compressed_lists/__init__.py +++ 
b/src/compressed_lists/__init__.py @@ -21,4 +21,4 @@ from .string_list import CompressedStringList, CompressedCharacterList from .bool_list import CompressedBooleanList from .float_list import CompressedFloatList -from .numpy_list import CompressedNumpyList \ No newline at end of file +from .numpy_list import CompressedNumpyList diff --git a/src/compressed_lists/numpy_list.py b/src/compressed_lists/numpy_list.py index 018fd7a..b8e165b 100644 --- a/src/compressed_lists/numpy_list.py +++ b/src/compressed_lists/numpy_list.py @@ -1,7 +1,6 @@ from typing import List, Optional, Sequence import numpy as np -from biocutils.IntegerList import IntegerList from .base import CompressedList from .partition import Partitioning From 16d899d6b744866123cd3f329acfb777f2cfc6de Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Thu, 28 Aug 2025 07:47:35 +0530 Subject: [PATCH 07/28] Base is not an abstract class anymore --- src/compressed_lists/base.py | 35 +++++++++++++------ src/compressed_lists/bool_list.py | 52 +--------------------------- src/compressed_lists/float_list.py | 52 +--------------------------- src/compressed_lists/integer_list.py | 52 +--------------------------- src/compressed_lists/numpy_list.py | 18 +--------- src/compressed_lists/string_list.py | 48 +------------------------ tests/test_comp_custom.py | 23 ++++++------ 7 files changed, 41 insertions(+), 239 deletions(-) diff --git a/src/compressed_lists/base.py b/src/compressed_lists/base.py index 8422756..e3c770f 100644 --- a/src/compressed_lists/base.py +++ b/src/compressed_lists/base.py @@ -1,4 +1,3 @@ -from abc import ABC, abstractmethod from typing import Any, Callable, Iterator, List, Optional, Sequence, Union from warnings import warn @@ -20,7 +19,7 @@ def _validate_data_and_partitions(unlist_data, partition): ) -class CompressedList(ABC): +class CompressedList: """Base class for compressed list objects. 
`CompressedList` stores list elements concatenated in a single vector-like object @@ -31,7 +30,7 @@ def __init__( self, unlist_data: Any, partitioning: Partitioning, - element_type: str = None, + element_type: Any = None, element_metadata: dict = None, metadata: Optional[dict] = None, validate: bool = True, @@ -46,7 +45,7 @@ def __init__( Partitioning object defining element boundaries (exclusive). element_type: - String identifier for the type of elements. + class for the type of elements. element_metadata: Optional metadata for elements. @@ -414,7 +413,7 @@ def __getitem__(self, key: Union[int, str, slice]) -> Any: raise IndexError(f"List index '{key}' out of range.") start, end = self._partitioning.get_partition_range(key) - return self._extract_range(start, end) + return self.extract_range(start, end) # slices elif isinstance(key, slice): @@ -422,7 +421,7 @@ def __getitem__(self, key: Union[int, str, slice]) -> Any: result = [] for i in indices: start, end = self._partitioning.get_partition_range(i) - result.append(self._extract_range(start, end)) + result.append(self.extract_range(start, end)) # Create a new CompressedList from the result return self.__class__.from_list( @@ -436,8 +435,7 @@ def __getitem__(self, key: Union[int, str, slice]) -> Any: ######>> abstract methods <<###### ################################## - @abstractmethod - def _extract_range(self, start: int, end: int) -> Any: + def extract_range(self, start: int, end: int) -> Any: """Extract a range from `unlist_data`. This method must be implemented by subclasses to handle @@ -453,10 +451,14 @@ def _extract_range(self, start: int, end: int) -> Any: Returns: Extracted element. 
""" - pass + try: + return self._unlist_data[start:end] + except Exception as e: + raise NotImplementedError( + "Custom classes should implement their own `extract_range` method for slice operations" + ) from e @classmethod - @abstractmethod def from_list( cls, lst: List[Any], names: Optional[Sequence[str]] = None, metadata: dict = None ) -> "CompressedList[Any]": @@ -478,7 +480,18 @@ def from_list( Returns: A new `CompressedList`. """ - pass + # Flatten the list + flat_data = [] + for sublist in lst: + flat_data.extend(sublist) + + # Create partitioning + partitioning = Partitioning.from_list(lst, names) + + # Create unlist_data + # unlist_data = cls._element_type(data=flat_data) + + return cls(flat_data, partitioning, metadata=metadata) ########################### ######>> coercions <<###### diff --git a/src/compressed_lists/bool_list.py b/src/compressed_lists/bool_list.py index 2d1e429..c948665 100644 --- a/src/compressed_lists/bool_list.py +++ b/src/compressed_lists/bool_list.py @@ -1,5 +1,3 @@ -from typing import List, Optional, Sequence - from biocutils.BooleanList import BooleanList from .base import CompressedList @@ -47,53 +45,5 @@ def __init__( raise TypeError("'unlist_data' must be an `BooleanList`, provided ", type(unlist_data)) from e super().__init__( - unlist_data, partitioning, element_type="boolean", element_metadata=element_metadata, metadata=metadata + unlist_data, partitioning, element_type=BooleanList, element_metadata=element_metadata, metadata=metadata ) - - def _extract_range(self, start: int, end: int) -> BooleanList: - """Extract a range from unlist_data. - - Args: - start: - Start index (inclusive). - - end: - End index (exclusive). - - Returns: - Same type as unlist_data. 
- """ - return self._unlist_data[start:end] - - @classmethod - def from_list( - cls, lst: List[List[bool]], names: Optional[Sequence[str]] = None, metadata: dict = None - ) -> "CompressedBooleanList": - """ - Create a `CompressedBooleanList` from a list of boolean lists. - - Args: - lst: - List of boolean lists. - - names: - Optional names for list elements. - - metadata: - Optional metadata. - - Returns: - A new `CompressedBooleanList`. - """ - # Flatten the list - flat_data = [] - for sublist in lst: - flat_data.extend(sublist) - - # Create partitioning - partitioning = Partitioning.from_list(lst, names) - - # Create unlist_data - unlist_data = BooleanList(data=flat_data) - - return cls(unlist_data, partitioning, metadata=metadata) diff --git a/src/compressed_lists/float_list.py b/src/compressed_lists/float_list.py index a694a8a..908fc56 100644 --- a/src/compressed_lists/float_list.py +++ b/src/compressed_lists/float_list.py @@ -1,5 +1,3 @@ -from typing import List, Optional, Sequence - from biocutils.FloatList import FloatList from .base import CompressedList @@ -47,53 +45,5 @@ def __init__( raise TypeError("'unlist_data' must be an `FloatList`, provided ", type(unlist_data)) from e super().__init__( - unlist_data, partitioning, element_type="float", element_metadata=element_metadata, metadata=metadata + unlist_data, partitioning, element_type=FloatList, element_metadata=element_metadata, metadata=metadata ) - - def _extract_range(self, start: int, end: int) -> FloatList: - """Extract a range from unlist_data. - - Args: - start: - Start index (inclusive). - - end: - End index (exclusive). - - Returns: - Same type as unlist_data. - """ - return self._unlist_data[start:end] - - @classmethod - def from_list( - cls, lst: List[List[float]], names: Optional[Sequence[str]] = None, metadata: dict = None - ) -> "CompressedFloatList": - """ - Create a `CompressedFloatList` from a list of float lists. - - Args: - lst: - List of float lists. 
- - names: - Optional names for list elements. - - metadata: - Optional metadata. - - Returns: - A new `CompressedFloatList`. - """ - # Flatten the list - flat_data = [] - for sublist in lst: - flat_data.extend(sublist) - - # Create partitioning - partitioning = Partitioning.from_list(lst, names) - - # Create unlist_data - unlist_data = FloatList(data=flat_data) - - return cls(unlist_data, partitioning, metadata=metadata) diff --git a/src/compressed_lists/integer_list.py b/src/compressed_lists/integer_list.py index 678b80b..d4f58d1 100644 --- a/src/compressed_lists/integer_list.py +++ b/src/compressed_lists/integer_list.py @@ -1,5 +1,3 @@ -from typing import List, Optional, Sequence - from biocutils.IntegerList import IntegerList from .base import CompressedList @@ -47,53 +45,5 @@ def __init__( raise TypeError("'unlist_data' must be an `IntegerList`, provided ", type(unlist_data)) from e super().__init__( - unlist_data, partitioning, element_type="integer", element_metadata=element_metadata, metadata=metadata + unlist_data, partitioning, element_type=IntegerList, element_metadata=element_metadata, metadata=metadata ) - - def _extract_range(self, start: int, end: int) -> IntegerList: - """Extract a range from unlist_data. - - Args: - start: - Start index (inclusive). - - end: - End index (exclusive). - - Returns: - Same type as unlist_data. - """ - return self._unlist_data[start:end] - - @classmethod - def from_list( - cls, lst: List[List[int]], names: Optional[Sequence[str]] = None, metadata: dict = None - ) -> "CompressedIntegerList": - """ - Create a `CompressedIntegerList` from a list of integer lists. - - Args: - lst: - List of integer lists. - - names: - Optional names for list elements. - - metadata: - Optional metadata. - - Returns: - A new `CompressedIntegerList`. 
- """ - # Flatten the list - flat_data = [] - for sublist in lst: - flat_data.extend(sublist) - - # Create partitioning - partitioning = Partitioning.from_list(lst, names) - - # Create unlist_data - unlist_data = IntegerList(data=flat_data) - - return cls(unlist_data, partitioning, metadata=metadata) diff --git a/src/compressed_lists/numpy_list.py b/src/compressed_lists/numpy_list.py index 018fd7a..a283a4f 100644 --- a/src/compressed_lists/numpy_list.py +++ b/src/compressed_lists/numpy_list.py @@ -1,7 +1,6 @@ from typing import List, Optional, Sequence import numpy as np -from biocutils.IntegerList import IntegerList from .base import CompressedList from .partition import Partitioning @@ -49,24 +48,9 @@ def __init__( print(unlist_data) super().__init__( - unlist_data, partitioning, element_type="ndarray", element_metadata=element_metadata, metadata=metadata + unlist_data, partitioning, element_type=np.array, element_metadata=element_metadata, metadata=metadata ) - def _extract_range(self, start: int, end: int) -> np.ndarray: - """Extract a range from unlist_data. - - Args: - start: - Start index (inclusive). - - end: - End index (exclusive). - - Returns: - Same type as unlist_data. 
- """ - return self._unlist_data[start:end] - @classmethod def from_list( cls, lst: List[List[np.ndarray]], names: Optional[Sequence[str]] = None, metadata: dict = None diff --git a/src/compressed_lists/string_list.py b/src/compressed_lists/string_list.py index 863229c..c1ab643 100644 --- a/src/compressed_lists/string_list.py +++ b/src/compressed_lists/string_list.py @@ -1,5 +1,3 @@ -from typing import List, Optional, Sequence - from biocutils.StringList import StringList from .base import CompressedList @@ -46,53 +44,9 @@ def __init__( raise TypeError("'unlist_data' must be an `StringList`, provided ", type(unlist_data)) from e super().__init__( - unlist_data, partitioning, element_type="string", element_metadata=element_metadata, metadata=metadata + unlist_data, partitioning, element_type=StringList, element_metadata=element_metadata, metadata=metadata ) - def _extract_range(self, start: int, end: int) -> StringList: - """Extract a range from unlist_data. - - Args: - start: - Start index (inclusive). - - end: - End index (exclusive). - - Returns: - List of strings. - """ - return self._unlist_data[start:end] - - @classmethod - def from_list( - cls, lst: List[List[str]], names: Optional[Sequence[str]] = None, metadata: dict = None - ) -> "CompressedStringList": - """Create a `CompressedStringList` from a list of string lists. - - Args: - lst: - List of string lists. - - names: - Optional names for list elements. - - metadata: - Optional metadata. - - Returns: - A new `CompressedStringList`. 
- """ - # Flatten the list - flat_data = [] - for sublist in lst: - flat_data.extend(sublist) - - # Create partitioning - partitioning = Partitioning.from_list(lst, names) - - return cls(StringList(flat_data), partitioning, metadata=metadata) - class CompressedCharacterList(CompressedStringList): pass diff --git a/tests/test_comp_custom.py b/tests/test_comp_custom.py index 1476109..276df4f 100644 --- a/tests/test_comp_custom.py +++ b/tests/test_comp_custom.py @@ -11,11 +11,11 @@ @pytest.fixture -def CompressedFloatList(): - class CompressedFloatList(CompressedList): +def CompressedCustomFloatList(): + class CompressedCustomFloatList(CompressedList): def __init__( self, - unlist_data: np.ndarray, + unlist_data: List[float], partitioning: Partitioning, element_metadata: dict = None, metadata: dict = None, @@ -24,26 +24,27 @@ def __init__( unlist_data, partitioning, element_type="float", element_metadata=element_metadata, metadata=metadata ) - def _extract_range(self, start: int, end: int) -> List[float]: - return self._unlist_data[start:end].tolist() + def extract_range(self, start: int, end: int) -> List[float]: + return self._unlist_data[start:end] @classmethod - def from_list(cls, lst: List[List[float]], names: list = None, metadata: dict = None) -> "CompressedFloatList": + def from_list( + cls, lst: List[List[float]], names: list = None, metadata: dict = None + ) -> "CompressedCustomFloatList": flat_data = [] for sublist in lst: flat_data.extend(sublist) partitioning = Partitioning.from_list(lst, names) - unlist_data = np.array(flat_data, dtype=np.float64) - return cls(unlist_data, partitioning, metadata=metadata) + return cls(flat_data, partitioning, metadata=metadata) - return CompressedFloatList + return CompressedCustomFloatList -def test_custom_class(CompressedFloatList): +def test_custom_class(CompressedCustomFloatList): float_data = [[1.1, 2.2, 3.3], [4.4, 5.5], [6.6, 7.7, 8.8, 9.9]] names = ["X", "Y", "Z"] - float_list = 
CompressedFloatList.from_list(float_data, names) + float_list = CompressedCustomFloatList.from_list(float_data, names) assert len(float_list) == 3 assert float_list._element_type == "float" From be24a2f5808b821f893f750da0d8075456cce439 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 Aug 2025 02:17:55 +0000 Subject: [PATCH 08/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_comp_custom.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_comp_custom.py b/tests/test_comp_custom.py index 276df4f..8284b22 100644 --- a/tests/test_comp_custom.py +++ b/tests/test_comp_custom.py @@ -1,6 +1,5 @@ from typing import List -import numpy as np import pytest from compressed_lists import CompressedList, Partitioning From b0c56acab01d1515ebb05fd78560e5c6b8d36e6e Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Tue, 16 Sep 2025 23:41:15 -0700 Subject: [PATCH 09/28] update test --- tests/test_comp_custom.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_comp_custom.py b/tests/test_comp_custom.py index 8284b22..d6cb817 100644 --- a/tests/test_comp_custom.py +++ b/tests/test_comp_custom.py @@ -55,3 +55,13 @@ def test_custom_class(CompressedCustomFloatList): assert rounded[0] == [1.0, 2.0, 3.0] assert rounded[1] == [4.0, 6.0] assert rounded[2] == [7.0, 8.0, 9.0, 10.0] + + +def test_custom_plain_list(): + list_of_bools = [[True, False], [False, True, False], [False]] + unclassed = CompressedList.from_list(list_of_bools) + + assert unclassed is not None + assert isinstance(unclassed, CompressedList) + assert len(unclassed) == 3 + assert list(unclassed.get_element_lengths()) == [2, 3, 1] \ No newline at end of file From 4c496b93de7c786b78a851f6d8c9e0155e731bc7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 17 Sep 2025 06:41:56 +0000 
Subject: [PATCH 10/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_comp_custom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_comp_custom.py b/tests/test_comp_custom.py index d6cb817..aa4f582 100644 --- a/tests/test_comp_custom.py +++ b/tests/test_comp_custom.py @@ -64,4 +64,4 @@ def test_custom_plain_list(): assert unclassed is not None assert isinstance(unclassed, CompressedList) assert len(unclassed) == 3 - assert list(unclassed.get_element_lengths()) == [2, 3, 1] \ No newline at end of file + assert list(unclassed.get_element_lengths()) == [2, 3, 1] From 7c7f2be205ef68369b48f6805f2c1d4a5f9cc333 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 17 Sep 2025 01:22:09 -0700 Subject: [PATCH 11/28] thinking of split generic --- src/compressed_lists/split_generic.py | 76 +++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 src/compressed_lists/split_generic.py diff --git a/src/compressed_lists/split_generic.py b/src/compressed_lists/split_generic.py new file mode 100644 index 0000000..4488faf --- /dev/null +++ b/src/compressed_lists/split_generic.py @@ -0,0 +1,76 @@ +from functools import singledispatch +from typing import Any, List, Optional, Sequence, Type, Union + +import numpy as np +import pandas as pd +from base import CompressedList + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +@singledispatch +def splitAsCompressedList( + data: Any, names: Optional[List[str]] = None, metadata: Optional[dict] = None +) -> CompressedList: + """Generic function to split data into an appropriate `CompressedList` subclass. + + This function uses single dispatch to automatically choose the right + `CompressedList` subclass based on the data type. Third parties can + register their own types by using the @splitAsCompressedList.register + decorator. 
+ + Args: + data: + The data to split into a `CompressedList`. + + names: + Optional names for the list elements. + + metadata: + Optional metadata for the `CompressedList`. + + Returns: + An appropriate `CompressedList` subclass instance. + """ + raise NotImplementedError(f"No `splitAsCompressedList` implementation for type {type(data)}.") + + +@splitAsCompressedList.register +def _(data: list, names: Optional[Sequence[str]] = None, metadata: Optional[dict] = None) -> CompressedList: + """Handle regular Python lists by inspecting element types.""" + if not data: + raise ValueError("Cannot create `CompressedList` from empty list.") + + first_element = None + for item in data: + if item and len(item) > 0: + first_element = item[0] + break + + if first_element is None: + raise ValueError("All elements are empty, cannot determine type") + + if isinstance(first_element, int): + from integer_list import CompressedIntegerList + + return CompressedIntegerList.from_list(data, names, metadata) + elif isinstance(first_element, str): + from string_list import CompressedCharacterList + + return CompressedCharacterList.from_list(data, names, metadata) + elif isinstance(first_element, float): + from float_list import CompressedFloatList + + return CompressedFloatList.from_list(data, names, metadata) + elif isinstance(first_element, float): + from bool_list import CompressedBooleanList + + return CompressedBooleanList.from_list(data, names, metadata) + elif isinstance(first_element, np.ndarray): + from numpy_list import CompressedNumpyList + + return CompressedNumpyList.from_list(data, names, metadata) + else: + raise NotImplementedError(f"No `CompressedList` implementation for element type {type(first_element)}.") From 8d32fee48debecb3567b405633a7985b693e7e88 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 17 Sep 2025 08:24:04 +0000 Subject: [PATCH 12/28] [pre-commit.ci] auto fixes from pre-commit.com hooks 
for more information, see https://pre-commit.ci --- src/compressed_lists/split_generic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/compressed_lists/split_generic.py b/src/compressed_lists/split_generic.py index 4488faf..388d155 100644 --- a/src/compressed_lists/split_generic.py +++ b/src/compressed_lists/split_generic.py @@ -1,8 +1,7 @@ from functools import singledispatch -from typing import Any, List, Optional, Sequence, Type, Union +from typing import Any, List, Optional, Sequence import numpy as np -import pandas as pd from base import CompressedList __author__ = "Jayaram Kancherla" From b5424d47bbad814e644d43705bf353c9d43b07de Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 17 Sep 2025 22:48:36 -0700 Subject: [PATCH 13/28] implement generic --- src/compressed_lists/bool_list.py | 11 +++++++++++ src/compressed_lists/float_list.py | 11 +++++++++++ src/compressed_lists/integer_list.py | 11 +++++++++++ src/compressed_lists/numpy_list.py | 11 ++++++++--- src/compressed_lists/split_generic.py | 9 +++------ src/compressed_lists/string_list.py | 11 +++++++++++ 6 files changed, 55 insertions(+), 9 deletions(-) diff --git a/src/compressed_lists/bool_list.py b/src/compressed_lists/bool_list.py index c948665..2341cd6 100644 --- a/src/compressed_lists/bool_list.py +++ b/src/compressed_lists/bool_list.py @@ -1,7 +1,10 @@ +from typing import List, Union + from biocutils.BooleanList import BooleanList from .base import CompressedList from .partition import Partitioning +from .split_generic import splitAsCompressedList __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" @@ -47,3 +50,11 @@ def __init__( super().__init__( unlist_data, partitioning, element_type=BooleanList, element_metadata=element_metadata, metadata=metadata ) + + +@splitAsCompressedList.register +def _( + data: Union[List[List[bool]], List[BooleanList]], names: List[str] = None, metadata: dict = None +) -> CompressedBooleanList: + """Handle lists of 
boolean.""" + return CompressedBooleanList.from_list(data, names, metadata) diff --git a/src/compressed_lists/float_list.py b/src/compressed_lists/float_list.py index 908fc56..60ca68d 100644 --- a/src/compressed_lists/float_list.py +++ b/src/compressed_lists/float_list.py @@ -1,7 +1,10 @@ +from typing import List, Union + from biocutils.FloatList import FloatList from .base import CompressedList from .partition import Partitioning +from .split_generic import splitAsCompressedList __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" @@ -47,3 +50,11 @@ def __init__( super().__init__( unlist_data, partitioning, element_type=FloatList, element_metadata=element_metadata, metadata=metadata ) + + +@splitAsCompressedList.register +def _( + data: Union[List[List[float]], List[FloatList]], names: List[str] = None, metadata: dict = None +) -> CompressedFloatList: + """Handle lists of floats.""" + return CompressedFloatList.from_list(data, names, metadata) diff --git a/src/compressed_lists/integer_list.py b/src/compressed_lists/integer_list.py index d4f58d1..750e007 100644 --- a/src/compressed_lists/integer_list.py +++ b/src/compressed_lists/integer_list.py @@ -1,7 +1,10 @@ +from typing import List, Union + from biocutils.IntegerList import IntegerList from .base import CompressedList from .partition import Partitioning +from .split_generic import splitAsCompressedList __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" @@ -47,3 +50,11 @@ def __init__( super().__init__( unlist_data, partitioning, element_type=IntegerList, element_metadata=element_metadata, metadata=metadata ) + + +@splitAsCompressedList.register +def _( + data: Union[List[List[int]], List[IntegerList]], names: List[str] = None, metadata: dict = None +) -> CompressedIntegerList: + """Handle lists of integers.""" + return CompressedIntegerList.from_list(data, names, metadata) diff --git a/src/compressed_lists/numpy_list.py b/src/compressed_lists/numpy_list.py index 
a283a4f..92fd087 100644 --- a/src/compressed_lists/numpy_list.py +++ b/src/compressed_lists/numpy_list.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Sequence +from typing import List, Optional, Sequence, Union import numpy as np @@ -46,14 +46,13 @@ def __init__( except Exception as e: raise TypeError("'unlist_data' must be an `np.ndarray`, provided ", type(unlist_data)) from e - print(unlist_data) super().__init__( unlist_data, partitioning, element_type=np.array, element_metadata=element_metadata, metadata=metadata ) @classmethod def from_list( - cls, lst: List[List[np.ndarray]], names: Optional[Sequence[str]] = None, metadata: dict = None + cls, lst: List[List[np.ndarray]], names: List[str] = None, metadata: dict = None ) -> "CompressedNumpyList": """ Create a `CompressedNumpyList` from a list of NumPy vectors. @@ -82,3 +81,9 @@ def from_list( unlist_data = np.hstack(lst) return cls(unlist_data, partitioning, metadata=metadata) + + +@splitAsCompressedList.register +def _(data: List[np.ndarray], names: List[str] = None, metadata: dict = None) -> CompressedNumpyList: + """Handle lists of numpy vectors.""" + return CompressedNumpyList.from_list(data, names, metadata) diff --git a/src/compressed_lists/split_generic.py b/src/compressed_lists/split_generic.py index 4488faf..af08848 100644 --- a/src/compressed_lists/split_generic.py +++ b/src/compressed_lists/split_generic.py @@ -1,8 +1,7 @@ from functools import singledispatch -from typing import Any, List, Optional, Sequence, Type, Union +from typing import Any, List, Optional, Sequence import numpy as np -import pandas as pd from base import CompressedList __author__ = "Jayaram Kancherla" @@ -11,9 +10,7 @@ @singledispatch -def splitAsCompressedList( - data: Any, names: Optional[List[str]] = None, metadata: Optional[dict] = None -) -> CompressedList: +def splitAsCompressedList(data: Any, names: List[str] = None, metadata: dict = None) -> CompressedList: """Generic function to split data into an appropriate 
`CompressedList` subclass. This function uses single dispatch to automatically choose the right @@ -38,7 +35,7 @@ def splitAsCompressedList( @splitAsCompressedList.register -def _(data: list, names: Optional[Sequence[str]] = None, metadata: Optional[dict] = None) -> CompressedList: +def _(data: list, names: List[str] = None, metadata: dict = None) -> CompressedList: """Handle regular Python lists by inspecting element types.""" if not data: raise ValueError("Cannot create `CompressedList` from empty list.") diff --git a/src/compressed_lists/string_list.py b/src/compressed_lists/string_list.py index c1ab643..1607b83 100644 --- a/src/compressed_lists/string_list.py +++ b/src/compressed_lists/string_list.py @@ -1,7 +1,10 @@ +from typing import List, Union + from biocutils.StringList import StringList from .base import CompressedList from .partition import Partitioning +from .split_generic import splitAsCompressedList __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" @@ -50,3 +53,11 @@ def __init__( class CompressedCharacterList(CompressedStringList): pass + + +@splitAsCompressedList.register +def _( + data: Union[List[List[str]], List[StringList]], names: List[str] = None, metadata: dict = None +) -> CompressedStringList: + """Handle lists of strings.""" + return CompressedStringList.from_list(data, names, metadata) From f59bdfebfaefd07eb15c37df2501d5a99d650abb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 18 Sep 2025 05:48:59 +0000 Subject: [PATCH 14/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/compressed_lists/numpy_list.py | 2 +- src/compressed_lists/split_generic.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compressed_lists/numpy_list.py b/src/compressed_lists/numpy_list.py index 92fd087..6b30336 100644 --- a/src/compressed_lists/numpy_list.py +++ 
b/src/compressed_lists/numpy_list.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Sequence, Union +from typing import List import numpy as np diff --git a/src/compressed_lists/split_generic.py b/src/compressed_lists/split_generic.py index af08848..7b6503d 100644 --- a/src/compressed_lists/split_generic.py +++ b/src/compressed_lists/split_generic.py @@ -1,5 +1,5 @@ from functools import singledispatch -from typing import Any, List, Optional, Sequence +from typing import Any, List import numpy as np from base import CompressedList From 1265d6c40375d31bf8ac4a6bae45842501c7e9d7 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 17 Sep 2025 23:05:31 -0700 Subject: [PATCH 15/28] fix import --- src/compressed_lists/split_generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compressed_lists/split_generic.py b/src/compressed_lists/split_generic.py index 7b6503d..27a4a0c 100644 --- a/src/compressed_lists/split_generic.py +++ b/src/compressed_lists/split_generic.py @@ -2,7 +2,7 @@ from typing import Any, List import numpy as np -from base import CompressedList +from .base import CompressedList __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" From 7cb79715cd83f33edbac0a6d259137f946ab31f2 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Tue, 7 Oct 2025 11:55:33 -0700 Subject: [PATCH 16/28] adding the generic and other changes to support initialization --- src/compressed_lists/base.py | 41 +++++---- src/compressed_lists/bool_list.py | 52 +++++++++-- src/compressed_lists/float_list.py | 50 +++++++++-- src/compressed_lists/integer_list.py | 51 +++++++++-- src/compressed_lists/numpy_list.py | 59 +++++++++++-- src/compressed_lists/partition.py | 16 ++-- src/compressed_lists/split_generic.py | 120 +++++++++++++++++--------- src/compressed_lists/string_list.py | 52 +++++++++-- 8 files changed, 342 insertions(+), 99 deletions(-) diff --git a/src/compressed_lists/base.py b/src/compressed_lists/base.py index 
e3c770f..f4253ec 100644 --- a/src/compressed_lists/base.py +++ b/src/compressed_lists/base.py @@ -31,7 +31,7 @@ def __init__( unlist_data: Any, partitioning: Partitioning, element_type: Any = None, - element_metadata: dict = None, + element_metadata: Optional[dict] = None, metadata: Optional[dict] = None, validate: bool = True, ): @@ -65,7 +65,7 @@ class for the type of elements. if validate: _validate_data_and_partitions(self._unlist_data, self._partitioning) - def _define_output(self, in_place: bool = False) -> "Partitioning": + def _define_output(self, in_place: bool = False) -> "CompressedList": if in_place is True: return self else: @@ -207,7 +207,7 @@ def get_names(self) -> Optional[ut.NamedList]: """Get the names of list elements.""" return self._partitioning.get_names() - def set_names(self, names: Sequence[str], in_place: bool = False) -> "CompressedList": + def set_names(self, names: List[str], in_place: bool = False) -> "CompressedList": """Set the names of list elements. names: @@ -401,7 +401,7 @@ def __getitem__(self, key: Union[int, str, slice]) -> Any: """ # string keys (names) if isinstance(key, str): - if key not in self.names: + if key not in list(self.get_names()): raise KeyError(f"No element named '{key}'.") key = list(self.names).index(key) @@ -422,14 +422,14 @@ def __getitem__(self, key: Union[int, str, slice]) -> Any: for i in indices: start, end = self._partitioning.get_partition_range(i) result.append(self.extract_range(start, end)) - - # Create a new CompressedList from the result - return self.__class__.from_list( + + current_class_const = type(self) + return current_class_const.from_list( result, names=[self.names[i] for i in indices] if self.names[0] is not None else None ) else: - raise TypeError("Index must be int, str, or slice.") + raise TypeError("'key' must be int, str, or slice.") ################################## ######>> abstract methods <<###### @@ -460,8 +460,8 @@ def extract_range(self, start: int, end: int) -> Any: 
@classmethod def from_list( - cls, lst: List[Any], names: Optional[Sequence[str]] = None, metadata: dict = None - ) -> "CompressedList[Any]": + cls, lst: List[Any], names: Optional[Sequence[str]] = None, metadata: Optional[dict] = None + ) -> "CompressedList": """Create a CompressedList from a regular list. This method must be implemented by subclasses to handle @@ -519,7 +519,7 @@ def unlist(self, use_names: bool = True) -> Any: """ return self._unlist_data - def relist(self, unlist_data: Any) -> "CompressedList[Any]": + def relist(self, unlist_data: Any) -> "CompressedList": """Create a new `CompressedList` with the same partitioning but different data. Args: @@ -531,7 +531,8 @@ def relist(self, unlist_data: Any) -> "CompressedList[Any]": """ _validate_data_and_partitions(unlist_data, self._partitioning) - return self.__class__( + current_class_const = type(self) + return current_class_const( unlist_data, self._partitioning.copy(), element_type=self._element_type, @@ -539,7 +540,7 @@ def relist(self, unlist_data: Any) -> "CompressedList[Any]": metadata=self._metadata.copy(), ) - def extract_subset(self, indices: Sequence[int]) -> "CompressedList[Any]": + def extract_subset(self, indices: Sequence[int]) -> "CompressedList": """Extract a subset of elements by indices. 
Args: @@ -555,8 +556,8 @@ def extract_subset(self, indices: Sequence[int]) -> "CompressedList[Any]": raise IndexError(f"Index {i} out of range") # Extract element lengths and names - new_lengths = [self.get_element_lengths()[i] for i in indices] - new_names = [self.names[i] for i in indices] if self.names[0] is not None else None + new_lengths = ut.subset_sequence(self.get_element_lengths(), indices) + new_names = ut.subset_sequence(self.names, indices) if self.names is not None else None # Create new partitioning new_partitioning = Partitioning.from_lengths(new_lengths, new_names) @@ -573,8 +574,8 @@ def extract_subset(self, indices: Sequence[int]) -> "CompressedList[Any]": if isinstance(self._unlist_data, np.ndarray): new_data = np.concatenate(new_data) - # Create new compressed list - return self.__class__( + current_class_const = type(self) + return current_class_const( new_data, new_partitioning, element_type=self._element_type, @@ -582,7 +583,7 @@ def extract_subset(self, indices: Sequence[int]) -> "CompressedList[Any]": metadata=self._metadata.copy(), ) - def lapply(self, func: Callable) -> "CompressedList[Any]": + def lapply(self, func: Callable) -> "CompressedList": """Apply a function to each element. Args: @@ -593,4 +594,6 @@ def lapply(self, func: Callable) -> "CompressedList[Any]": A new CompressedList with the results. 
""" result = [func(elem) for elem in self] - return self.__class__.from_list(result, self.names, self._metadata) + + current_class_const = type(self) + return current_class_const.from_list(result, self.names, self._metadata) diff --git a/src/compressed_lists/bool_list.py b/src/compressed_lists/bool_list.py index 2341cd6..980b866 100644 --- a/src/compressed_lists/bool_list.py +++ b/src/compressed_lists/bool_list.py @@ -1,10 +1,11 @@ -from typing import List, Union +from typing import List, Optional, Sequence, Union +import numpy as np from biocutils.BooleanList import BooleanList from .base import CompressedList from .partition import Partitioning -from .split_generic import splitAsCompressedList +from .split_generic import _generic_register_helper, splitAsCompressedList __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" @@ -18,8 +19,8 @@ def __init__( self, unlist_data: BooleanList, partitioning: Partitioning, - element_metadata: dict = None, - metadata: dict = None, + element_metadata: Optional[dict] = None, + metadata: Optional[dict] = None, **kwargs, ): """Initialize a CompressedBooleanList. @@ -51,10 +52,47 @@ def __init__( unlist_data, partitioning, element_type=BooleanList, element_metadata=element_metadata, metadata=metadata ) + @classmethod + def from_partitioned_data( + cls, partitioned_data: List[List], partitioning: Partitioning, metadata: Optional[dict] = None + ) -> "CompressedBooleanList": + """Create `CompressedBooleanList` from already-partitioned data. + + Args: + partitioned_data: + List of `BooleanList`'s, each containing booleans for one partition. + + partitioning: + Partitioning object defining the boundaries. + + metadata: + Optional metadata. + + Returns: + A new `CompressedBooleanList`. 
+ """ + flat_data = [] + for partition in partitioned_data: + flat_data.extend(partition) + + unlist_data = BooleanList(flat_data) + + return cls(unlist_data, partitioning, metadata=metadata) + @splitAsCompressedList.register def _( - data: Union[List[List[bool]], List[BooleanList]], names: List[str] = None, metadata: dict = None + data: BooleanList, + groups_or_partitions: Union[list, Partitioning], + names: Optional[Sequence[str]] = None, + metadata: Optional[dict] = None, ) -> CompressedBooleanList: - """Handle lists of boolean.""" - return CompressedBooleanList.from_list(data, names, metadata) + """Handle lists of booleans.""" + + partitioned_data, groups_or_partitions = _generic_register_helper( + data=data, groups_or_partitions=groups_or_partitions, names=names + ) + + return CompressedBooleanList.from_partitioned_data( + partitioned_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata + ) diff --git a/src/compressed_lists/float_list.py b/src/compressed_lists/float_list.py index 60ca68d..59efcff 100644 --- a/src/compressed_lists/float_list.py +++ b/src/compressed_lists/float_list.py @@ -1,10 +1,11 @@ -from typing import List, Union +from typing import List, Optional, Sequence, Union +import numpy as np from biocutils.FloatList import FloatList from .base import CompressedList from .partition import Partitioning -from .split_generic import splitAsCompressedList +from .split_generic import _generic_register_helper, splitAsCompressedList __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" @@ -18,8 +19,8 @@ def __init__( self, unlist_data: FloatList, partitioning: Partitioning, - element_metadata: dict = None, - metadata: dict = None, + element_metadata: Optional[dict] = None, + metadata: Optional[dict] = None, **kwargs, ): """Initialize a CompressedFloatList. 
@@ -51,10 +52,47 @@ def __init__( unlist_data, partitioning, element_type=FloatList, element_metadata=element_metadata, metadata=metadata ) + @classmethod + def from_partitioned_data( + cls, partitioned_data: List[List], partitioning: Partitioning, metadata: Optional[dict] = None + ) -> "CompressedFloatList": + """Create `CompressedFloatList` from already-partitioned data. + + Args: + partitioned_data: + List of lists, each containing floats for one partition. + + partitioning: + Partitioning object defining the boundaries. + + metadata: + Optional metadata. + + Returns: + A new `CompressedFloatList`. + """ + flat_data = [] + for partition in partitioned_data: + flat_data.extend(partition) + + unlist_data = FloatList(flat_data) + + return cls(unlist_data, partitioning, metadata=metadata) + @splitAsCompressedList.register def _( - data: Union[List[List[float]], List[FloatList]], names: List[str] = None, metadata: dict = None + data: FloatList, + groups_or_partitions: Union[list, Partitioning], + names: Optional[Sequence[str]] = None, + metadata: Optional[dict] = None, ) -> CompressedFloatList: """Handle lists of floats.""" - return CompressedFloatList.from_list(data, names, metadata) + + partitioned_data, groups_or_partitions = _generic_register_helper( + data=data, groups_or_partitions=groups_or_partitions, names=names + ) + + return CompressedFloatList.from_partitioned_data( + partitioned_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata + ) diff --git a/src/compressed_lists/integer_list.py b/src/compressed_lists/integer_list.py index 750e007..adc601d 100644 --- a/src/compressed_lists/integer_list.py +++ b/src/compressed_lists/integer_list.py @@ -1,10 +1,11 @@ -from typing import List, Union +from typing import Optional, Sequence, Union +import numpy as np from biocutils.IntegerList import IntegerList from .base import CompressedList from .partition import Partitioning -from .split_generic import splitAsCompressedList +from .split_generic 
import splitAsCompressedList, _generic_register_helper __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" @@ -18,8 +19,8 @@ def __init__( self, unlist_data: IntegerList, partitioning: Partitioning, - element_metadata: dict = None, - metadata: dict = None, + element_metadata: Optional[dict] = None, + metadata: Optional[dict] = None, **kwargs, ): """Initialize a CompressedIntegerList. @@ -51,10 +52,48 @@ def __init__( unlist_data, partitioning, element_type=IntegerList, element_metadata=element_metadata, metadata=metadata ) + @classmethod + def from_partitioned_data( + cls, partitioned_data: Sequence[IntegerList], partitioning: Partitioning, metadata: Optional[dict] = None + ) -> "CompressedIntegerList": + """Create `CompressedIntegerList` from already-partitioned data. + + Args: + partitioned_data: + List of `IntegerList`'s, each containing integers for one partition. + + partitioning: + Partitioning object defining the boundaries. + + metadata: + Optional metadata. + + Returns: + A new `CompressedIntegerList`. 
+ """ + + flat_data = [] + for partition in partitioned_data: + flat_data.extend(partition) + + unlist_data = IntegerList(flat_data) + + return cls(unlist_data, partitioning, metadata=metadata) + @splitAsCompressedList.register def _( - data: Union[List[List[int]], List[IntegerList]], names: List[str] = None, metadata: dict = None + data: IntegerList, + groups_or_partitions: Union[list, Partitioning], + names: Optional[Sequence[str]] = None, + metadata: Optional[dict] = None, ) -> CompressedIntegerList: """Handle lists of integers.""" - return CompressedIntegerList.from_list(data, names, metadata) + + partitioned_data, groups_or_partitions = _generic_register_helper( + data=data, groups_or_partitions=groups_or_partitions, names=names + ) + + return CompressedIntegerList.from_partitioned_data( + partitioned_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata + ) diff --git a/src/compressed_lists/numpy_list.py b/src/compressed_lists/numpy_list.py index 6b30336..deb52a5 100644 --- a/src/compressed_lists/numpy_list.py +++ b/src/compressed_lists/numpy_list.py @@ -1,9 +1,10 @@ -from typing import List +from typing import List, Optional import numpy as np from .base import CompressedList from .partition import Partitioning +from .split_generic import splitAsCompressedList __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" @@ -17,8 +18,8 @@ def __init__( self, unlist_data: np.ndarray, partitioning: Partitioning, - element_metadata: dict = None, - metadata: dict = None, + element_metadata: Optional[dict] = None, + metadata: Optional[dict] = None, **kwargs, ): """Initialize a CompressedNumpyList. @@ -52,7 +53,7 @@ def __init__( @classmethod def from_list( - cls, lst: List[List[np.ndarray]], names: List[str] = None, metadata: dict = None + cls, lst: List[List[np.ndarray]], names: Optional[List[str]] = None, metadata: Optional[dict] = None ) -> "CompressedNumpyList": """ Create a `CompressedNumpyList` from a list of NumPy vectors. 
@@ -82,8 +83,52 @@ def from_list( return cls(unlist_data, partitioning, metadata=metadata) + @classmethod + def _from_partitioned_data( + cls, partitioned_data: List[List], partitioning: Partitioning, metadata: Optional[dict] = None + ) -> "CompressedNumpyList": + """Create CompressedNumpyList from already-partitioned data. + + Args: + partitioned_data: List of arrays, each containing numpy data for one partition + partitioning: Partitioning object defining the boundaries + metadata: Optional metadata + + Returns: + A new CompressedNumpyList + """ + import numpy as np + + # Concatenate the numpy arrays + if not partitioned_data or not partitioned_data[0]: + unlist_data = np.array([]) + else: + unlist_data = np.concatenate(partitioned_data) + + return cls(unlist_data, partitioning, metadata=metadata) + @splitAsCompressedList.register -def _(data: List[np.ndarray], names: List[str] = None, metadata: dict = None) -> CompressedNumpyList: - """Handle lists of numpy vectors.""" - return CompressedNumpyList.from_list(data, names, metadata) +def _( + data: np.ndarray, + names: Optional[List[str]] = None, + metadata: Optional[dict] = None, + groups: Optional[list] = None, + partitions: Optional[Partitioning] = None, +): + """Handle NumPy arrays.""" + + if groups is not None: + return _splitAsCompressedList_by_groups(data, groups, names, metadata) + elif partitions is not None: + return _splitAsCompressedList_by_partitions(data, partitions, names, metadata) + else: + # Original behavior: convert single array to list of arrays + if data.ndim == 1: + list_data = [data] + else: + list_data = [row for row in data] + + from .numpy_list import CompressedNumpyList + + return CompressedNumpyList.from_list(list_data, names, metadata) diff --git a/src/compressed_lists/partition.py b/src/compressed_lists/partition.py index 154ad5e..6782c65 100644 --- a/src/compressed_lists/partition.py +++ b/src/compressed_lists/partition.py @@ -46,7 +46,7 @@ def __init__(self, ends: Sequence[int], 
names: Optional[Sequence[str]] = None, v self._names = None if names is not None: - self._names = ut.NamedList(names) + self._names = ut.Names(names) if validate: _validate_names(names, len(ends)) @@ -212,11 +212,11 @@ def __getitem__(self, key: Union[int, slice]) -> Union[tuple, List[tuple]]: ######>> names <<##### ###################### - def get_names(self) -> Optional[ut.NamedList]: + def get_names(self) -> Optional[ut.Names]: """Return the names of each partition.""" return self._names - def set_names(self, names: Optional[List[str]], in_place: bool = False) -> "Partitioning": + def set_names(self, names: Optional[Sequence[str]], in_place: bool = False) -> "Partitioning": """Set the names of list elements. Args: @@ -247,7 +247,7 @@ def names(self) -> Optional[ut.Names]: return self.get_names() @names.setter - def names(self, names: Optional[List[str]]): + def names(self, names: Optional[Sequence[str]]): """Alias for :py:meth:`~set_names` with ``in_place = True``. As this mutates the original object, a warning is raised. 
@@ -262,12 +262,12 @@ def names(self, names: Optional[List[str]]): ######>> ends <<##### ##################### - def get_ends(self) -> Optional[ut.NamedList]: + def get_ends(self) -> np.ndarray: """Return the names of each partition.""" return self._ends @property - def ends(self) -> Optional[ut.Names]: + def ends(self) -> np.ndarray: """Alias for :py:attr:`~get_ends`, provided for back-compatibility.""" return self.get_ends() @@ -275,11 +275,11 @@ def ends(self) -> Optional[ut.Names]: ######>> starts <<##### ####################### - def get_starts(self) -> Optional[ut.NamedList]: + def get_starts(self) -> np.ndarray: """Return the starts of each partition.""" return self._starts @property - def starts(self) -> Optional[ut.Names]: + def starts(self) -> np.ndarray: """Alias for :py:attr:`~get_starts`, provided for back-compatibility.""" return self.get_starts() diff --git a/src/compressed_lists/split_generic.py b/src/compressed_lists/split_generic.py index 27a4a0c..97321d6 100644 --- a/src/compressed_lists/split_generic.py +++ b/src/compressed_lists/split_generic.py @@ -1,27 +1,81 @@ +from collections import defaultdict from functools import singledispatch -from typing import Any, List +from typing import Any, List, Optional, Sequence, Tuple, Union import numpy as np + from .base import CompressedList +from .partition import Partitioning __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" __license__ = "MIT" +def groups_to_partition( + data: Any, groups: list, names: Optional[Sequence[str]] = None +) -> Tuple[List[Any], Partitioning]: + """Convert group membership vector to partitioned data and Partitioning object. + + Args: + data: + The data to be split (flat vector-like object). + + groups: + Group membership vector, same length as data. + + names: + Optional names for groups. 
+ + Returns: + Tuple of (partitioned_data_list, partitioning_object) + """ + + if len(data) != len(groups): + raise ValueError(f"Length of data ({len(data)}) must match length of groups ({len(groups)})") + + group_dict = defaultdict(list) + for item, group in zip(data, groups): + group_dict[group].append(item) + + sorted_groups = sorted(group_dict.keys()) + partitioned_data = [group_dict[group] for group in sorted_groups] + + if names is None: + group_names = [str(group) for group in sorted_groups] + else: + if len(names) != len(sorted_groups): + raise ValueError( + f"Length of names ({len(names)}) must match number of unique groups ({len(sorted_groups)})" + ) + group_names = names + + partitioning = Partitioning.from_list(partitioned_data, group_names) + + return partitioned_data, partitioning + + @singledispatch -def splitAsCompressedList(data: Any, names: List[str] = None, metadata: dict = None) -> CompressedList: +def splitAsCompressedList( + data: Any, + groups_or_partitions: Union[list, Partitioning], + names: Optional[Sequence[str]] = None, + metadata: Optional[dict] = None, +) -> CompressedList: """Generic function to split data into an appropriate `CompressedList` subclass. - This function uses single dispatch to automatically choose the right - `CompressedList` subclass based on the data type. Third parties can - register their own types by using the @splitAsCompressedList.register - decorator. + This function can work in two modes: + 1. Group-based splitting where a flat vector is split according to group membership. + 2. Partition-based splitting where a flat vector is split according to explicit partitions. Args: data: The data to split into a `CompressedList`. + groups_or_partitions: + Optional group membership vector (same length as data) or + explicit partitioning object. + names: Optional names for the list elements. 
@@ -31,43 +85,31 @@ def splitAsCompressedList(data: Any, names: List[str] = None, metadata: dict = N Returns: An appropriate `CompressedList` subclass instance. """ - raise NotImplementedError(f"No `splitAsCompressedList` implementation for type {type(data)}.") + element_type = type(data) + raise NotImplementedError(f"No `splitAsCompressedList` dispatcher found for element type {element_type}") -@splitAsCompressedList.register -def _(data: list, names: List[str] = None, metadata: dict = None) -> CompressedList: - """Handle regular Python lists by inspecting element types.""" - if not data: - raise ValueError("Cannot create `CompressedList` from empty list.") +def _generic_register_helper(data, groups_or_partitions, names=None): + if groups_or_partitions is None: + raise ValueError("'groups_or_paritions' cannot be 'None'.") - first_element = None - for item in data: - if item and len(item) > 0: - first_element = item[0] - break - - if first_element is None: - raise ValueError("All elements are empty, cannot determine type") - - if isinstance(first_element, int): - from integer_list import CompressedIntegerList - - return CompressedIntegerList.from_list(data, names, metadata) - elif isinstance(first_element, str): - from string_list import CompressedCharacterList - - return CompressedCharacterList.from_list(data, names, metadata) - elif isinstance(first_element, float): - from float_list import CompressedFloatList + if not data: + raise ValueError("'data' cannot be empty.") - return CompressedFloatList.from_list(data, names, metadata) - elif isinstance(first_element, float): - from bool_list import CompressedBooleanList + if isinstance(groups_or_partitions, (list, np.ndarray)): + partitioned_data, groups_or_partitions = groups_to_partition(data, groups=groups_or_partitions, names=names) - return CompressedBooleanList.from_list(data, names, metadata) - elif isinstance(first_element, np.ndarray): - from numpy_list import CompressedNumpyList + if len(partitioned_data) 
== 0: + raise ValueError("No data after grouping") + elif isinstance(groups_or_partitions, Partitioning): + if names is not None: + groups_or_partitions = groups_or_partitions.set_names(names, in_place=False) - return CompressedNumpyList.from_list(data, names, metadata) + partitioned_data = [] + for i in range(len(groups_or_partitions)): + start, end = groups_or_partitions.get_partition_range(i) + partitioned_data.append(data[start:end]) else: - raise NotImplementedError(f"No `CompressedList` implementation for element type {type(first_element)}.") + raise ValueError("'groups_or_paritions' must be a group vector or a Partition object.") + + return partitioned_data, groups_or_partitions \ No newline at end of file diff --git a/src/compressed_lists/string_list.py b/src/compressed_lists/string_list.py index 1607b83..20f85ce 100644 --- a/src/compressed_lists/string_list.py +++ b/src/compressed_lists/string_list.py @@ -1,10 +1,11 @@ -from typing import List, Union +from typing import List, Optional, Sequence, Union +import numpy as np from biocutils.StringList import StringList from .base import CompressedList from .partition import Partitioning -from .split_generic import splitAsCompressedList +from .split_generic import _generic_register_helper, splitAsCompressedList __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" @@ -18,8 +19,8 @@ def __init__( self, unlist_data: StringList, partitioning: Partitioning, - element_metadata: dict = None, - metadata: dict = None, + element_metadata: Optional[dict] = None, + metadata: Optional[dict] = None, **kwargs, ): """Initialize a CompressedStringList. 
@@ -50,6 +51,33 @@ def __init__( unlist_data, partitioning, element_type=StringList, element_metadata=element_metadata, metadata=metadata ) + @classmethod + def from_partitioned_data( + cls, partitioned_data: List[List], partitioning: Partitioning, metadata: Optional[dict] = None + ) -> "CompressedStringList": + """Create `CompressedStringList` from already-partitioned data. + + Args: + partitioned_data: + List of lists, each containing strings for one partition. + + partitioning: + Partitioning object defining the boundaries. + + metadata: + Optional metadata. + + Returns: + A new `CompressedStringList`. + """ + flat_data = [] + for partition in partitioned_data: + flat_data.extend(partition) + + unlist_data = StringList(flat_data) + + return cls(unlist_data, partitioning, metadata=metadata) + class CompressedCharacterList(CompressedStringList): pass @@ -57,7 +85,17 @@ class CompressedCharacterList(CompressedStringList): @splitAsCompressedList.register def _( - data: Union[List[List[str]], List[StringList]], names: List[str] = None, metadata: dict = None + data: StringList, + groups_or_partitions: Union[list, Partitioning], + names: Optional[Sequence[str]] = None, + metadata: Optional[dict] = None, ) -> CompressedStringList: - """Handle lists of strings.""" - return CompressedStringList.from_list(data, names, metadata) + """Handle lists of floats.""" + + partitioned_data, groups_or_partitions = _generic_register_helper( + data=data, groups_or_partitions=groups_or_partitions, names=names + ) + + return CompressedStringList.from_partitioned_data( + partitioned_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata + ) From e58ff7bf2bde383358c8afd676a7af61148dd0e2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 Oct 2025 18:55:44 +0000 Subject: [PATCH 17/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- 
src/compressed_lists/base.py | 4 ++-- src/compressed_lists/bool_list.py | 1 - src/compressed_lists/float_list.py | 1 - src/compressed_lists/integer_list.py | 5 ++--- src/compressed_lists/split_generic.py | 2 +- src/compressed_lists/string_list.py | 1 - 6 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/compressed_lists/base.py b/src/compressed_lists/base.py index f4253ec..f450059 100644 --- a/src/compressed_lists/base.py +++ b/src/compressed_lists/base.py @@ -422,7 +422,7 @@ def __getitem__(self, key: Union[int, str, slice]) -> Any: for i in indices: start, end = self._partitioning.get_partition_range(i) result.append(self.extract_range(start, end)) - + current_class_const = type(self) return current_class_const.from_list( result, names=[self.names[i] for i in indices] if self.names[0] is not None else None @@ -594,6 +594,6 @@ def lapply(self, func: Callable) -> "CompressedList": A new CompressedList with the results. """ result = [func(elem) for elem in self] - + current_class_const = type(self) return current_class_const.from_list(result, self.names, self._metadata) diff --git a/src/compressed_lists/bool_list.py b/src/compressed_lists/bool_list.py index 980b866..19a0903 100644 --- a/src/compressed_lists/bool_list.py +++ b/src/compressed_lists/bool_list.py @@ -1,6 +1,5 @@ from typing import List, Optional, Sequence, Union -import numpy as np from biocutils.BooleanList import BooleanList from .base import CompressedList diff --git a/src/compressed_lists/float_list.py b/src/compressed_lists/float_list.py index 59efcff..2b56884 100644 --- a/src/compressed_lists/float_list.py +++ b/src/compressed_lists/float_list.py @@ -1,6 +1,5 @@ from typing import List, Optional, Sequence, Union -import numpy as np from biocutils.FloatList import FloatList from .base import CompressedList diff --git a/src/compressed_lists/integer_list.py b/src/compressed_lists/integer_list.py index adc601d..10410a1 100644 --- a/src/compressed_lists/integer_list.py +++ 
b/src/compressed_lists/integer_list.py @@ -1,6 +1,5 @@ from typing import Optional, Sequence, Union -import numpy as np from biocutils.IntegerList import IntegerList from .base import CompressedList @@ -89,11 +88,11 @@ def _( metadata: Optional[dict] = None, ) -> CompressedIntegerList: """Handle lists of integers.""" - + partitioned_data, groups_or_partitions = _generic_register_helper( data=data, groups_or_partitions=groups_or_partitions, names=names ) - + return CompressedIntegerList.from_partitioned_data( partitioned_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata ) diff --git a/src/compressed_lists/split_generic.py b/src/compressed_lists/split_generic.py index 97321d6..bfe9cae 100644 --- a/src/compressed_lists/split_generic.py +++ b/src/compressed_lists/split_generic.py @@ -112,4 +112,4 @@ def _generic_register_helper(data, groups_or_partitions, names=None): else: raise ValueError("'groups_or_paritions' must be a group vector or a Partition object.") - return partitioned_data, groups_or_partitions \ No newline at end of file + return partitioned_data, groups_or_partitions diff --git a/src/compressed_lists/string_list.py b/src/compressed_lists/string_list.py index 20f85ce..b7cbd4d 100644 --- a/src/compressed_lists/string_list.py +++ b/src/compressed_lists/string_list.py @@ -1,6 +1,5 @@ from typing import List, Optional, Sequence, Union -import numpy as np from biocutils.StringList import StringList from .base import CompressedList From 056f32a59693f689082f10d38d1a82943202b72e Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Tue, 7 Oct 2025 22:53:12 -0700 Subject: [PATCH 18/28] cleaning up numpylists --- src/compressed_lists/numpy_list.py | 57 ++++++++++++++---------------- tests/test_comp_numpy.py | 2 +- 2 files changed, 28 insertions(+), 31 deletions(-) diff --git a/src/compressed_lists/numpy_list.py b/src/compressed_lists/numpy_list.py index deb52a5..74becd0 100644 --- a/src/compressed_lists/numpy_list.py +++ 
b/src/compressed_lists/numpy_list.py @@ -1,10 +1,10 @@ -from typing import List, Optional +from typing import List, Optional, Sequence, Union import numpy as np from .base import CompressedList from .partition import Partitioning -from .split_generic import splitAsCompressedList +from .split_generic import _generic_register_helper, splitAsCompressedList __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" @@ -26,7 +26,7 @@ def __init__( Args: unlist_data: - NumPy vector. + List of NumPy vectors. partitioning: Partitioning object defining element boundaries. @@ -43,17 +43,17 @@ def __init__( if not isinstance(unlist_data, np.ndarray): try: - unlist_data = np.array(unlist_data) + unlist_data = np.concatenate(unlist_data) except Exception as e: raise TypeError("'unlist_data' must be an `np.ndarray`, provided ", type(unlist_data)) from e super().__init__( - unlist_data, partitioning, element_type=np.array, element_metadata=element_metadata, metadata=metadata + unlist_data, partitioning, element_type=np.ndarray, element_metadata=element_metadata, metadata=metadata ) @classmethod def from_list( - cls, lst: List[List[np.ndarray]], names: Optional[List[str]] = None, metadata: Optional[dict] = None + cls, lst: List[np.ndarray], names: Optional[Sequence[str]] = None, metadata: Optional[dict] = None ) -> "CompressedNumpyList": """ Create a `CompressedNumpyList` from a list of NumPy vectors. @@ -79,23 +79,28 @@ def from_list( if len(lst) == 0: unlist_data = np.array([]) else: - unlist_data = np.hstack(lst) + unlist_data = np.concatenate(lst) return cls(unlist_data, partitioning, metadata=metadata) @classmethod - def _from_partitioned_data( + def from_partitioned_data( cls, partitioned_data: List[List], partitioning: Partitioning, metadata: Optional[dict] = None ) -> "CompressedNumpyList": - """Create CompressedNumpyList from already-partitioned data. + """Create `CompressedNumpyList` from already-partitioned data. 
Args: - partitioned_data: List of arrays, each containing numpy data for one partition - partitioning: Partitioning object defining the boundaries - metadata: Optional metadata + partitioned_data: + List of arrays, each containing numpy data for one partition. + + partitioning: + Partitioning object defining the boundaries. + + metadata: + Optional metadata. Returns: - A new CompressedNumpyList + A new `CompressedNumpyList`. """ import numpy as np @@ -111,24 +116,16 @@ def _from_partitioned_data( @splitAsCompressedList.register def _( data: np.ndarray, - names: Optional[List[str]] = None, + groups_or_partitions: Union[list, Partitioning], + names: Optional[Sequence[str]] = None, metadata: Optional[dict] = None, - groups: Optional[list] = None, - partitions: Optional[Partitioning] = None, -): +) -> CompressedNumpyList: """Handle NumPy arrays.""" - if groups is not None: - return _splitAsCompressedList_by_groups(data, groups, names, metadata) - elif partitions is not None: - return _splitAsCompressedList_by_partitions(data, partitions, names, metadata) - else: - # Original behavior: convert single array to list of arrays - if data.ndim == 1: - list_data = [data] - else: - list_data = [row for row in data] - - from .numpy_list import CompressedNumpyList + partitioned_data, groups_or_partitions = _generic_register_helper( + data=data, groups_or_partitions=groups_or_partitions, names=names + ) - return CompressedNumpyList.from_list(list_data, names, metadata) + return CompressedNumpyList.from_partitioned_data( + partitioned_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata + ) diff --git a/tests/test_comp_numpy.py b/tests/test_comp_numpy.py index aebffb0..8611591 100644 --- a/tests/test_comp_numpy.py +++ b/tests/test_comp_numpy.py @@ -29,7 +29,7 @@ def test_creation(numpy_data): def test_creation_from_parts(): - numpy_list = CompressedNumpyList([1, 2, 3, 4, 5, 6, 7, 8, 9], Partitioning(ends=[3, 5, 9])) + numpy_list = 
CompressedNumpyList(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]), Partitioning(ends=[3, 5, 9])) assert len(numpy_list) == 3 assert isinstance(numpy_list.unlist_data, np.ndarray) From 00e6c898086681d04bb8ce73a7355c0bc17d98f4 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 8 Oct 2025 11:04:50 -0700 Subject: [PATCH 19/28] more tests for the generic --- src/compressed_lists/__init__.py | 1 + src/compressed_lists/biocframe_list.py | 72 ++++++++++++++++++++++++++ src/compressed_lists/bool_list.py | 14 ++--- src/compressed_lists/float_list.py | 14 ++--- src/compressed_lists/integer_list.py | 11 ++-- src/compressed_lists/numpy_list.py | 8 +-- src/compressed_lists/split_generic.py | 21 ++++---- src/compressed_lists/string_list.py | 14 ++--- tests/test_comp_custom.py | 12 +++-- tests/test_generics.py | 26 ++++++++++ 10 files changed, 149 insertions(+), 44 deletions(-) create mode 100644 src/compressed_lists/biocframe_list.py create mode 100644 tests/test_generics.py diff --git a/src/compressed_lists/__init__.py b/src/compressed_lists/__init__.py index 3715ce0..114defd 100644 --- a/src/compressed_lists/__init__.py +++ b/src/compressed_lists/__init__.py @@ -22,3 +22,4 @@ from .bool_list import CompressedBooleanList from .float_list import CompressedFloatList from .numpy_list import CompressedNumpyList +from .split_generic import splitAsCompressedList \ No newline at end of file diff --git a/src/compressed_lists/biocframe_list.py b/src/compressed_lists/biocframe_list.py new file mode 100644 index 0000000..bc320b0 --- /dev/null +++ b/src/compressed_lists/biocframe_list.py @@ -0,0 +1,72 @@ +from typing import List, Union + +from biocframe import BiocFrame + +from .base import CompressedList +from .partition import Partitioning +from .split_generic import splitAsCompressedList + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +class CompressedBiocFrameList(CompressedList): + """CompressedList for BiocFrames.""" + + def __init__( 
+ self, + unlist_data: BiocFrame, + partitioning: Partitioning, + element_metadata: dict = None, + metadata: dict = None, + **kwargs, + ): + super().__init__( + unlist_data, partitioning, element_type="BiocFrame", element_metadata=element_metadata, metadata=metadata + ) + + @classmethod + def from_list(cls, lst: List[BiocFrame], names: List[str] = None, metadata: dict = None): + partitioning = Partitioning.from_list(lst, names) + return cls(lst, partitioning, metadata=metadata) + + def __getitem__(self, key: Union[int, str, slice]): + """Override to handle column extraction using `splitAsCompressedList`. + + When extracting a column, this will automatically dispatch to the + appropriate `CompressedList` subclass based on the column data type. + """ + if isinstance(key, str): + column_data = [] + for df in self.unlist_data: + if key in df.columns: + column_data.append(df[key].tolist()) + else: + column_data.append([]) + + return splitAsCompressedList(column_data, names=self.names, metadata=self.metadata) + else: + return super().__getitem__(key) + + @classmethod + def _from_partitioned_data(cls, partitioned_data: List, partitioning: Partitioning, metadata: dict = None) -> "CompressedBiocFrameList": + """Create CompressedBiocFrameList from already-partitioned data. 
+ + Args: + partitioned_data: List of BiocFrame objects (already partitioned) + partitioning: Partitioning object defining the boundaries + metadata: Optional metadata + + Returns: + A new CompressedBiocFrameList + """ + # For BiocFrame, the partitioned_data should already be a list of BiocFrame objects + # so we can use it directly as unlist_data + return cls(partitioned_data, partitioning, metadata=metadata) + + +@splitAsCompressedList.register +def _(data: List[BiocFrame], names: List[str] = None, metadata: dict = None) -> CompressedBiocFrameList: + """Handle lists of boolean.""" + return CompressedBiocFrameList.from_list(data, names, metadata) diff --git a/src/compressed_lists/bool_list.py b/src/compressed_lists/bool_list.py index 19a0903..a3aed3f 100644 --- a/src/compressed_lists/bool_list.py +++ b/src/compressed_lists/bool_list.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Sequence, Union +from typing import Optional, Sequence, Union from biocutils.BooleanList import BooleanList @@ -53,7 +53,7 @@ def __init__( @classmethod def from_partitioned_data( - cls, partitioned_data: List[List], partitioning: Partitioning, metadata: Optional[dict] = None + cls, partitioned_data: Sequence[BooleanList], partitioning: Partitioning, metadata: Optional[dict] = None ) -> "CompressedBooleanList": """Create `CompressedBooleanList` from already-partitioned data. @@ -70,11 +70,13 @@ def from_partitioned_data( Returns: A new `CompressedBooleanList`. 
""" - flat_data = [] - for partition in partitioned_data: - flat_data.extend(partition) + unlist_data = partitioned_data + if isinstance(partitioned_data, list): + flat_data = [] + for partition in partitioned_data: + flat_data.extend(partition) - unlist_data = BooleanList(flat_data) + unlist_data = BooleanList(flat_data) return cls(unlist_data, partitioning, metadata=metadata) diff --git a/src/compressed_lists/float_list.py b/src/compressed_lists/float_list.py index 2b56884..6c97c5b 100644 --- a/src/compressed_lists/float_list.py +++ b/src/compressed_lists/float_list.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Sequence, Union +from typing import Optional, Sequence, Union from biocutils.FloatList import FloatList @@ -53,7 +53,7 @@ def __init__( @classmethod def from_partitioned_data( - cls, partitioned_data: List[List], partitioning: Partitioning, metadata: Optional[dict] = None + cls, partitioned_data: Sequence[FloatList], partitioning: Partitioning, metadata: Optional[dict] = None ) -> "CompressedFloatList": """Create `CompressedFloatList` from already-partitioned data. @@ -70,11 +70,13 @@ def from_partitioned_data( Returns: A new `CompressedFloatList`. """ - flat_data = [] - for partition in partitioned_data: - flat_data.extend(partition) + unlist_data = partitioned_data + if isinstance(partitioned_data, list): + flat_data = [] + for partition in partitioned_data: + flat_data.extend(partition) - unlist_data = FloatList(flat_data) + unlist_data = FloatList(flat_data) return cls(unlist_data, partitioning, metadata=metadata) diff --git a/src/compressed_lists/integer_list.py b/src/compressed_lists/integer_list.py index 10410a1..3739d76 100644 --- a/src/compressed_lists/integer_list.py +++ b/src/compressed_lists/integer_list.py @@ -70,12 +70,13 @@ def from_partitioned_data( Returns: A new `CompressedIntegerList`. 
""" + unlist_data = partitioned_data + if isinstance(partitioned_data, list): + flat_data = [] + for partition in partitioned_data: + flat_data.extend(partition) - flat_data = [] - for partition in partitioned_data: - flat_data.extend(partition) - - unlist_data = IntegerList(flat_data) + unlist_data = IntegerList(flat_data) return cls(unlist_data, partitioning, metadata=metadata) diff --git a/src/compressed_lists/numpy_list.py b/src/compressed_lists/numpy_list.py index 74becd0..5f7aeeb 100644 --- a/src/compressed_lists/numpy_list.py +++ b/src/compressed_lists/numpy_list.py @@ -71,11 +71,8 @@ def from_list( Returns: A new `CompressedNumpyList`. """ - - # Create partitioning partitioning = Partitioning.from_list(lst, names) - # Create unlist_data if len(lst) == 0: unlist_data = np.array([]) else: @@ -85,7 +82,7 @@ def from_list( @classmethod def from_partitioned_data( - cls, partitioned_data: List[List], partitioning: Partitioning, metadata: Optional[dict] = None + cls, partitioned_data: Sequence[np.ndarray], partitioning: Partitioning, metadata: Optional[dict] = None ) -> "CompressedNumpyList": """Create `CompressedNumpyList` from already-partitioned data. @@ -102,9 +99,6 @@ def from_partitioned_data( Returns: A new `CompressedNumpyList`. 
""" - import numpy as np - - # Concatenate the numpy arrays if not partitioned_data or not partitioned_data[0]: unlist_data = np.array([]) else: diff --git a/src/compressed_lists/split_generic.py b/src/compressed_lists/split_generic.py index bfe9cae..d63e55e 100644 --- a/src/compressed_lists/split_generic.py +++ b/src/compressed_lists/split_generic.py @@ -96,19 +96,22 @@ def _generic_register_helper(data, groups_or_partitions, names=None): if not data: raise ValueError("'data' cannot be empty.") - if isinstance(groups_or_partitions, (list, np.ndarray)): + if isinstance(groups_or_partitions, Partitioning): + if names is not None: + groups_or_partitions = groups_or_partitions.set_names(names, in_place=False) + + # TODO: probably not necessary to split when groups is a partition object. + # unless ordering matters + # partitioned_data = [] + # for i in range(len(groups_or_partitions)): + # start, end = groups_or_partitions.get_partition_range(i) + # partitioned_data.append(data[start:end]) + partitioned_data = data + elif isinstance(groups_or_partitions, (list, np.ndarray)): partitioned_data, groups_or_partitions = groups_to_partition(data, groups=groups_or_partitions, names=names) if len(partitioned_data) == 0: raise ValueError("No data after grouping") - elif isinstance(groups_or_partitions, Partitioning): - if names is not None: - groups_or_partitions = groups_or_partitions.set_names(names, in_place=False) - - partitioned_data = [] - for i in range(len(groups_or_partitions)): - start, end = groups_or_partitions.get_partition_range(i) - partitioned_data.append(data[start:end]) else: raise ValueError("'groups_or_paritions' must be a group vector or a Partition object.") diff --git a/src/compressed_lists/string_list.py b/src/compressed_lists/string_list.py index b7cbd4d..19fb82e 100644 --- a/src/compressed_lists/string_list.py +++ b/src/compressed_lists/string_list.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Sequence, Union +from typing import Optional, 
Sequence, Union from biocutils.StringList import StringList @@ -52,7 +52,7 @@ def __init__( @classmethod def from_partitioned_data( - cls, partitioned_data: List[List], partitioning: Partitioning, metadata: Optional[dict] = None + cls, partitioned_data: Sequence[StringList], partitioning: Partitioning, metadata: Optional[dict] = None ) -> "CompressedStringList": """Create `CompressedStringList` from already-partitioned data. @@ -69,11 +69,13 @@ def from_partitioned_data( Returns: A new `CompressedStringList`. """ - flat_data = [] - for partition in partitioned_data: - flat_data.extend(partition) + unlist_data = partitioned_data + if isinstance(partitioned_data, list): + flat_data = [] + for partition in partitioned_data: + flat_data.extend(partition) - unlist_data = StringList(flat_data) + unlist_data = StringList(flat_data) return cls(unlist_data, partitioning, metadata=metadata) diff --git a/tests/test_comp_custom.py b/tests/test_comp_custom.py index aa4f582..82bd860 100644 --- a/tests/test_comp_custom.py +++ b/tests/test_comp_custom.py @@ -1,4 +1,4 @@ -from typing import List +from typing import Any, List, Optional, Sequence import pytest @@ -14,10 +14,12 @@ def CompressedCustomFloatList(): class CompressedCustomFloatList(CompressedList): def __init__( self, - unlist_data: List[float], + unlist_data: Any, partitioning: Partitioning, - element_metadata: dict = None, - metadata: dict = None, + element_type: Any = None, + element_metadata: Optional[dict] = None, + metadata: Optional[dict] = None, + validate: bool = True, ): super().__init__( unlist_data, partitioning, element_type="float", element_metadata=element_metadata, metadata=metadata @@ -28,7 +30,7 @@ def extract_range(self, start: int, end: int) -> List[float]: @classmethod def from_list( - cls, lst: List[List[float]], names: list = None, metadata: dict = None + cls, lst: List[Any], names: Optional[Sequence[str]] = None, metadata: Optional[dict] = None ) -> "CompressedCustomFloatList": flat_data = [] for 
sublist in lst: diff --git a/tests/test_generics.py b/tests/test_generics.py new file mode 100644 index 0000000..8ec5b0e --- /dev/null +++ b/tests/test_generics.py @@ -0,0 +1,26 @@ +import biocutils as ut +import numpy as np +import pytest + +from compressed_lists import CompressedFloatList, CompressedIntegerList, Partitioning, splitAsCompressedList + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +def test_groups(): + float_vec = ut.FloatList([1.1, 1.2, 2.1, 2.2, 2.3, 3.0]) + groups = [1, 2, 3, 1, 2, 3] + + clist = splitAsCompressedList(float_vec, groups_or_partitions=groups) + + assert isinstance(clist, CompressedFloatList) + + +def test_partitions(): + int_list = splitAsCompressedList( + ut.IntegerList([1, 2, 3, 4, 5, 6, 7, 8, 9]), groups_or_partitions=Partitioning(ends=[3, 5, 9]) + ) + + assert isinstance(int_list, CompressedIntegerList) From 155e8aa58f3d575bd9768bf88c9046715ed6b85d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Oct 2025 18:05:09 +0000 Subject: [PATCH 20/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/compressed_lists/__init__.py | 2 +- src/compressed_lists/biocframe_list.py | 10 ++++++---- tests/test_generics.py | 2 -- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/compressed_lists/__init__.py b/src/compressed_lists/__init__.py index 114defd..6f94bef 100644 --- a/src/compressed_lists/__init__.py +++ b/src/compressed_lists/__init__.py @@ -22,4 +22,4 @@ from .bool_list import CompressedBooleanList from .float_list import CompressedFloatList from .numpy_list import CompressedNumpyList -from .split_generic import splitAsCompressedList \ No newline at end of file +from .split_generic import splitAsCompressedList diff --git a/src/compressed_lists/biocframe_list.py b/src/compressed_lists/biocframe_list.py index bc320b0..0870d2c 100644 --- 
a/src/compressed_lists/biocframe_list.py +++ b/src/compressed_lists/biocframe_list.py @@ -50,14 +50,16 @@ def __getitem__(self, key: Union[int, str, slice]): return super().__getitem__(key) @classmethod - def _from_partitioned_data(cls, partitioned_data: List, partitioning: Partitioning, metadata: dict = None) -> "CompressedBiocFrameList": + def _from_partitioned_data( + cls, partitioned_data: List, partitioning: Partitioning, metadata: dict = None + ) -> "CompressedBiocFrameList": """Create CompressedBiocFrameList from already-partitioned data. - + Args: partitioned_data: List of BiocFrame objects (already partitioned) - partitioning: Partitioning object defining the boundaries + partitioning: Partitioning object defining the boundaries metadata: Optional metadata - + Returns: A new CompressedBiocFrameList """ diff --git a/tests/test_generics.py b/tests/test_generics.py index 8ec5b0e..ae45e43 100644 --- a/tests/test_generics.py +++ b/tests/test_generics.py @@ -1,6 +1,4 @@ import biocutils as ut -import numpy as np -import pytest from compressed_lists import CompressedFloatList, CompressedIntegerList, Partitioning, splitAsCompressedList From 7c7ecba90f7da07418499cd732cd5387bc41b60b Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 8 Oct 2025 19:25:43 -0700 Subject: [PATCH 21/28] more biocutils methods, getting rid of from_partitioned data --- src/compressed_lists/biocframe_list.py | 107 ++++++++++++++++--------- src/compressed_lists/bool_list.py | 50 +++--------- src/compressed_lists/float_list.py | 50 +++--------- src/compressed_lists/integer_list.py | 52 +++--------- src/compressed_lists/numpy_list.py | 36 ++------- src/compressed_lists/split_generic.py | 5 +- src/compressed_lists/string_list.py | 50 +++--------- 7 files changed, 128 insertions(+), 222 deletions(-) diff --git a/src/compressed_lists/biocframe_list.py b/src/compressed_lists/biocframe_list.py index 0870d2c..ec1c423 100644 --- a/src/compressed_lists/biocframe_list.py +++ 
b/src/compressed_lists/biocframe_list.py @@ -1,10 +1,11 @@ -from typing import List, Union +from typing import List, Optional, Sequence, Union +import biocutils as ut from biocframe import BiocFrame from .base import CompressedList from .partition import Partitioning -from .split_generic import splitAsCompressedList +from .split_generic import _generic_register_helper, splitAsCompressedList __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" @@ -18,57 +19,87 @@ def __init__( self, unlist_data: BiocFrame, partitioning: Partitioning, - element_metadata: dict = None, - metadata: dict = None, + element_metadata: Optional[dict] = None, + metadata: Optional[dict] = None, **kwargs, ): + """Initialize a CompressedBiocFrameList. + + Args: + unlist_data: + BiocFrame object. + + partitioning: + Partitioning object defining element boundaries. + + element_metadata: + Optional metadata for elements. + + metadata: + Optional general metadata. + + kwargs: + Additional arguments. + """ + if not isinstance(unlist_data, BiocFrame): + raise TypeError("'unlist_data' is not a `BiocFrame` object.") + super().__init__( unlist_data, partitioning, element_type="BiocFrame", element_metadata=element_metadata, metadata=metadata ) @classmethod - def from_list(cls, lst: List[BiocFrame], names: List[str] = None, metadata: dict = None): - partitioning = Partitioning.from_list(lst, names) - return cls(lst, partitioning, metadata=metadata) + def from_list( + cls, lst: List[BiocFrame], names: Optional[Sequence[str]] = None, metadata: Optional[dict] = None + ) -> "CompressedBiocFrameList": + """Create a `CompressedBiocFrameList` from a regular list. - def __getitem__(self, key: Union[int, str, slice]): - """Override to handle column extraction using `splitAsCompressedList`. + This concatenates the list of `BiocFrame` objects. + + Args: + lst: + List of `BiocFrame` objects. 
- When extracting a column, this will automatically dispatch to the - appropriate `CompressedList` subclass based on the column data type. + names: + Optional names for list elements. + + metadata: + Optional metadata. + + Returns: + A new `CompressedList`. """ + unlist_data = ut.relaxed_combine_rows(*lst) + partitioning = Partitioning.from_list(lst, names) + return cls(unlist_data, partitioning, metadata=metadata) + + def __getitem__(self, key: Union[int, str, slice]): + """Override to handle column extraction using `splitAsCompressedList`.""" if isinstance(key, str): - column_data = [] - for df in self.unlist_data: - if key in df.columns: - column_data.append(df[key].tolist()) - else: - column_data.append([]) - - return splitAsCompressedList(column_data, names=self.names, metadata=self.metadata) + column_data = self._unlist_data.get_column(key) + return splitAsCompressedList( + column_data, partition=self._partitioning, names=self.names, metadata=self.metadata + ) else: return super().__getitem__(key) - @classmethod - def _from_partitioned_data( - cls, partitioned_data: List, partitioning: Partitioning, metadata: dict = None - ) -> "CompressedBiocFrameList": - """Create CompressedBiocFrameList from already-partitioned data. 
- Args: - partitioned_data: List of BiocFrame objects (already partitioned) - partitioning: Partitioning object defining the boundaries - metadata: Optional metadata +@splitAsCompressedList.register +def _( + data: BiocFrame, + groups_or_partitions: Union[list, Partitioning], + names: Optional[Sequence[str]] = None, + metadata: Optional[dict] = None, +) -> CompressedBiocFrameList: + """Handle lists of BiocFrame objects.""" - Returns: - A new CompressedBiocFrameList - """ - # For BiocFrame, the partitioned_data should already be a list of BiocFrame objects - # so we can use it directly as unlist_data - return cls(partitioned_data, partitioning, metadata=metadata) + partitioned_data, groups_or_partitions = _generic_register_helper( + data=data, groups_or_partitions=groups_or_partitions, names=names + ) + if not isinstance(partitioned_data, BiocFrame): + partitioned_data = ut.relaxed_combine_rows(*partitioned_data) -@splitAsCompressedList.register -def _(data: List[BiocFrame], names: List[str] = None, metadata: dict = None) -> CompressedBiocFrameList: - """Handle lists of boolean.""" - return CompressedBiocFrameList.from_list(data, names, metadata) + return CompressedBiocFrameList( + unlist_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata + ) diff --git a/src/compressed_lists/bool_list.py b/src/compressed_lists/bool_list.py index a3aed3f..8ca8f29 100644 --- a/src/compressed_lists/bool_list.py +++ b/src/compressed_lists/bool_list.py @@ -1,6 +1,7 @@ from typing import Optional, Sequence, Union +from warnings import warn -from biocutils.BooleanList import BooleanList +import biocutils as ut from .base import CompressedList from .partition import Partitioning @@ -16,7 +17,7 @@ class CompressedBooleanList(CompressedList): def __init__( self, - unlist_data: BooleanList, + unlist_data: ut.BooleanList, partitioning: Partitioning, element_metadata: Optional[dict] = None, metadata: Optional[dict] = None, @@ -41,49 +42,21 @@ def __init__( Additional 
arguments. """ - if not isinstance(unlist_data, BooleanList): + if not isinstance(unlist_data, ut.BooleanList): try: - unlist_data = BooleanList(unlist_data) + warn("trying to coerce 'unlist_data' to `BooleanList`..") + unlist_data = ut.BooleanList(unlist_data) except Exception as e: raise TypeError("'unlist_data' must be an `BooleanList`, provided ", type(unlist_data)) from e super().__init__( - unlist_data, partitioning, element_type=BooleanList, element_metadata=element_metadata, metadata=metadata + unlist_data, partitioning, element_type=ut.BooleanList, element_metadata=element_metadata, metadata=metadata ) - @classmethod - def from_partitioned_data( - cls, partitioned_data: Sequence[BooleanList], partitioning: Partitioning, metadata: Optional[dict] = None - ) -> "CompressedBooleanList": - """Create `CompressedBooleanList` from already-partitioned data. - - Args: - partitioned_data: - List of `BooleanList`'s, each containing booleans for one partition. - - partitioning: - Partitioning object defining the boundaries. - - metadata: - Optional metadata. - - Returns: - A new `CompressedBooleanList`. 
- """ - unlist_data = partitioned_data - if isinstance(partitioned_data, list): - flat_data = [] - for partition in partitioned_data: - flat_data.extend(partition) - - unlist_data = BooleanList(flat_data) - - return cls(unlist_data, partitioning, metadata=metadata) - @splitAsCompressedList.register def _( - data: BooleanList, + data: ut.BooleanList, groups_or_partitions: Union[list, Partitioning], names: Optional[Sequence[str]] = None, metadata: Optional[dict] = None, @@ -94,6 +67,7 @@ def _( data=data, groups_or_partitions=groups_or_partitions, names=names ) - return CompressedBooleanList.from_partitioned_data( - partitioned_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata - ) + if not isinstance(partitioned_data, ut.BooleanList): + partitioned_data = ut.combine_sequences(*partitioned_data) + + return CompressedBooleanList(unlist_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata) diff --git a/src/compressed_lists/float_list.py b/src/compressed_lists/float_list.py index 6c97c5b..006f767 100644 --- a/src/compressed_lists/float_list.py +++ b/src/compressed_lists/float_list.py @@ -1,6 +1,7 @@ from typing import Optional, Sequence, Union +from warnings import warn -from biocutils.FloatList import FloatList +import biocutils as ut from .base import CompressedList from .partition import Partitioning @@ -16,7 +17,7 @@ class CompressedFloatList(CompressedList): def __init__( self, - unlist_data: FloatList, + unlist_data: ut.FloatList, partitioning: Partitioning, element_metadata: Optional[dict] = None, metadata: Optional[dict] = None, @@ -41,49 +42,21 @@ def __init__( Additional arguments. 
""" - if not isinstance(unlist_data, FloatList): + if not isinstance(unlist_data, ut.FloatList): try: - unlist_data = FloatList(unlist_data) + warn("trying to coerce 'unlist_data' to `FloatList`..") + unlist_data = ut.FloatList(unlist_data) except Exception as e: raise TypeError("'unlist_data' must be an `FloatList`, provided ", type(unlist_data)) from e super().__init__( - unlist_data, partitioning, element_type=FloatList, element_metadata=element_metadata, metadata=metadata + unlist_data, partitioning, element_type=ut.FloatList, element_metadata=element_metadata, metadata=metadata ) - @classmethod - def from_partitioned_data( - cls, partitioned_data: Sequence[FloatList], partitioning: Partitioning, metadata: Optional[dict] = None - ) -> "CompressedFloatList": - """Create `CompressedFloatList` from already-partitioned data. - - Args: - partitioned_data: - List of lists, each containing floats for one partition. - - partitioning: - Partitioning object defining the boundaries. - - metadata: - Optional metadata. - - Returns: - A new `CompressedFloatList`. 
- """ - unlist_data = partitioned_data - if isinstance(partitioned_data, list): - flat_data = [] - for partition in partitioned_data: - flat_data.extend(partition) - - unlist_data = FloatList(flat_data) - - return cls(unlist_data, partitioning, metadata=metadata) - @splitAsCompressedList.register def _( - data: FloatList, + data: ut.FloatList, groups_or_partitions: Union[list, Partitioning], names: Optional[Sequence[str]] = None, metadata: Optional[dict] = None, @@ -94,6 +67,7 @@ def _( data=data, groups_or_partitions=groups_or_partitions, names=names ) - return CompressedFloatList.from_partitioned_data( - partitioned_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata - ) + if not isinstance(partitioned_data, ut.FloatList): + partitioned_data = ut.combine_sequences(*partitioned_data) + + return CompressedFloatList(unlist_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata) diff --git a/src/compressed_lists/integer_list.py b/src/compressed_lists/integer_list.py index 3739d76..e79a9f1 100644 --- a/src/compressed_lists/integer_list.py +++ b/src/compressed_lists/integer_list.py @@ -1,10 +1,11 @@ from typing import Optional, Sequence, Union +from warnings import warn -from biocutils.IntegerList import IntegerList +import biocutils as ut from .base import CompressedList from .partition import Partitioning -from .split_generic import splitAsCompressedList, _generic_register_helper +from .split_generic import _generic_register_helper, splitAsCompressedList __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" @@ -16,7 +17,7 @@ class CompressedIntegerList(CompressedList): def __init__( self, - unlist_data: IntegerList, + unlist_data: ut.IntegerList, partitioning: Partitioning, element_metadata: Optional[dict] = None, metadata: Optional[dict] = None, @@ -41,49 +42,21 @@ def __init__( Additional arguments. 
""" - if not isinstance(unlist_data, IntegerList): + if not isinstance(unlist_data, ut.IntegerList): try: - unlist_data = IntegerList(unlist_data) + warn("trying to coerce 'unlist_data' to `IntegerList`..") + unlist_data = ut.IntegerList(unlist_data) except Exception as e: raise TypeError("'unlist_data' must be an `IntegerList`, provided ", type(unlist_data)) from e super().__init__( - unlist_data, partitioning, element_type=IntegerList, element_metadata=element_metadata, metadata=metadata + unlist_data, partitioning, element_type=ut.IntegerList, element_metadata=element_metadata, metadata=metadata ) - @classmethod - def from_partitioned_data( - cls, partitioned_data: Sequence[IntegerList], partitioning: Partitioning, metadata: Optional[dict] = None - ) -> "CompressedIntegerList": - """Create `CompressedIntegerList` from already-partitioned data. - - Args: - partitioned_data: - List of `IntegerList`'s, each containing integers for one partition. - - partitioning: - Partitioning object defining the boundaries. - - metadata: - Optional metadata. - - Returns: - A new `CompressedIntegerList`. 
- """ - unlist_data = partitioned_data - if isinstance(partitioned_data, list): - flat_data = [] - for partition in partitioned_data: - flat_data.extend(partition) - - unlist_data = IntegerList(flat_data) - - return cls(unlist_data, partitioning, metadata=metadata) - @splitAsCompressedList.register def _( - data: IntegerList, + data: ut.IntegerList, groups_or_partitions: Union[list, Partitioning], names: Optional[Sequence[str]] = None, metadata: Optional[dict] = None, @@ -94,6 +67,7 @@ def _( data=data, groups_or_partitions=groups_or_partitions, names=names ) - return CompressedIntegerList.from_partitioned_data( - partitioned_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata - ) + if not isinstance(partitioned_data, ut.IntegerList): + partitioned_data = ut.combine_sequences(*partitioned_data) + + return CompressedIntegerList(unlist_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata) diff --git a/src/compressed_lists/numpy_list.py b/src/compressed_lists/numpy_list.py index 5f7aeeb..1e70f2b 100644 --- a/src/compressed_lists/numpy_list.py +++ b/src/compressed_lists/numpy_list.py @@ -1,5 +1,7 @@ from typing import List, Optional, Sequence, Union +from warnings import warn +import biocutils as ut import numpy as np from .base import CompressedList @@ -43,6 +45,7 @@ def __init__( if not isinstance(unlist_data, np.ndarray): try: + warn("trying to concatenate/coerce 'unlist_data' to a `np.ndarray`..") unlist_data = np.concatenate(unlist_data) except Exception as e: raise TypeError("'unlist_data' must be an `np.ndarray`, provided ", type(unlist_data)) from e @@ -80,32 +83,6 @@ def from_list( return cls(unlist_data, partitioning, metadata=metadata) - @classmethod - def from_partitioned_data( - cls, partitioned_data: Sequence[np.ndarray], partitioning: Partitioning, metadata: Optional[dict] = None - ) -> "CompressedNumpyList": - """Create `CompressedNumpyList` from already-partitioned data. 
- - Args: - partitioned_data: - List of arrays, each containing numpy data for one partition. - - partitioning: - Partitioning object defining the boundaries. - - metadata: - Optional metadata. - - Returns: - A new `CompressedNumpyList`. - """ - if not partitioned_data or not partitioned_data[0]: - unlist_data = np.array([]) - else: - unlist_data = np.concatenate(partitioned_data) - - return cls(unlist_data, partitioning, metadata=metadata) - @splitAsCompressedList.register def _( @@ -120,6 +97,7 @@ def _( data=data, groups_or_partitions=groups_or_partitions, names=names ) - return CompressedNumpyList.from_partitioned_data( - partitioned_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata - ) + if not isinstance(partitioned_data, np.ndarray): + partitioned_data = ut.combine_sequences(*partitioned_data) + + return CompressedNumpyList(unlist_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata) diff --git a/src/compressed_lists/split_generic.py b/src/compressed_lists/split_generic.py index d63e55e..78aea5e 100644 --- a/src/compressed_lists/split_generic.py +++ b/src/compressed_lists/split_generic.py @@ -30,7 +30,7 @@ def groups_to_partition( Returns: Tuple of (partitioned_data_list, partitioning_object) """ - + print(data, groups) if len(data) != len(groups): raise ValueError(f"Length of data ({len(data)}) must match length of groups ({len(groups)})") @@ -39,8 +39,9 @@ def groups_to_partition( group_dict[group].append(item) sorted_groups = sorted(group_dict.keys()) + print(sorted_groups) partitioned_data = [group_dict[group] for group in sorted_groups] - + print("pdata", partitioned_data) if names is None: group_names = [str(group) for group in sorted_groups] else: diff --git a/src/compressed_lists/string_list.py b/src/compressed_lists/string_list.py index 19fb82e..f612747 100644 --- a/src/compressed_lists/string_list.py +++ b/src/compressed_lists/string_list.py @@ -1,6 +1,7 @@ from typing import Optional, Sequence, 
Union +from warnings import warn -from biocutils.StringList import StringList +import biocutils as ut from .base import CompressedList from .partition import Partitioning @@ -16,7 +17,7 @@ class CompressedStringList(CompressedList): def __init__( self, - unlist_data: StringList, + unlist_data: ut.StringList, partitioning: Partitioning, element_metadata: Optional[dict] = None, metadata: Optional[dict] = None, @@ -40,45 +41,17 @@ def __init__( kwargs: Additional arguments. """ - if not isinstance(unlist_data, StringList): + if not isinstance(unlist_data, ut.StringList): try: - unlist_data = StringList(unlist_data) + warn("trying to coerce 'unlist_data' to `StringList`..") + unlist_data = ut.StringList(unlist_data) except Exception as e: raise TypeError("'unlist_data' must be an `StringList`, provided ", type(unlist_data)) from e super().__init__( - unlist_data, partitioning, element_type=StringList, element_metadata=element_metadata, metadata=metadata + unlist_data, partitioning, element_type=ut.StringList, element_metadata=element_metadata, metadata=metadata ) - @classmethod - def from_partitioned_data( - cls, partitioned_data: Sequence[StringList], partitioning: Partitioning, metadata: Optional[dict] = None - ) -> "CompressedStringList": - """Create `CompressedStringList` from already-partitioned data. - - Args: - partitioned_data: - List of lists, each containing strings for one partition. - - partitioning: - Partitioning object defining the boundaries. - - metadata: - Optional metadata. - - Returns: - A new `CompressedStringList`. 
- """ - unlist_data = partitioned_data - if isinstance(partitioned_data, list): - flat_data = [] - for partition in partitioned_data: - flat_data.extend(partition) - - unlist_data = StringList(flat_data) - - return cls(unlist_data, partitioning, metadata=metadata) - class CompressedCharacterList(CompressedStringList): pass @@ -86,7 +59,7 @@ class CompressedCharacterList(CompressedStringList): @splitAsCompressedList.register def _( - data: StringList, + data: ut.StringList, groups_or_partitions: Union[list, Partitioning], names: Optional[Sequence[str]] = None, metadata: Optional[dict] = None, @@ -97,6 +70,7 @@ def _( data=data, groups_or_partitions=groups_or_partitions, names=names ) - return CompressedStringList.from_partitioned_data( - partitioned_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata - ) + if not isinstance(partitioned_data, ut.StringList): + partitioned_data = ut.combine_sequences(*partitioned_data) + + return CompressedStringList(unlist_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata) From b799d710aca9de35b1c6f54c8871ba71d2f9fa4e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Oct 2025 02:25:52 +0000 Subject: [PATCH 22/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/compressed_lists/biocframe_list.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/compressed_lists/biocframe_list.py b/src/compressed_lists/biocframe_list.py index ec1c423..c716597 100644 --- a/src/compressed_lists/biocframe_list.py +++ b/src/compressed_lists/biocframe_list.py @@ -100,6 +100,4 @@ def _( if not isinstance(partitioned_data, BiocFrame): partitioned_data = ut.relaxed_combine_rows(*partitioned_data) - return CompressedBiocFrameList( - unlist_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata - ) + return 
CompressedBiocFrameList(unlist_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata) From 7d916a3954e751d208975a1866051a8c6545a2c5 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 8 Oct 2025 19:43:52 -0700 Subject: [PATCH 23/28] tests for compressed biocframe --- setup.cfg | 1 + src/compressed_lists/__init__.py | 3 ++- src/compressed_lists/biocframe_list.py | 23 +++++++++++++++++++ tests/test_comp_biocframe.py | 31 ++++++++++++++++++++++++++ 4 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 tests/test_comp_biocframe.py diff --git a/setup.cfg b/setup.cfg index 72798f5..6cdfdec 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,6 +51,7 @@ install_requires = importlib-metadata; python_version<"3.8" biocutils numpy + biocframe [options.packages.find] diff --git a/src/compressed_lists/__init__.py b/src/compressed_lists/__init__.py index 6f94bef..7d4f677 100644 --- a/src/compressed_lists/__init__.py +++ b/src/compressed_lists/__init__.py @@ -22,4 +22,5 @@ from .bool_list import CompressedBooleanList from .float_list import CompressedFloatList from .numpy_list import CompressedNumpyList -from .split_generic import splitAsCompressedList +from .biocframe_list import CompressedBiocFrameList +from .split_generic import splitAsCompressedList \ No newline at end of file diff --git a/src/compressed_lists/biocframe_list.py b/src/compressed_lists/biocframe_list.py index c716597..4832804 100644 --- a/src/compressed_lists/biocframe_list.py +++ b/src/compressed_lists/biocframe_list.py @@ -83,6 +83,29 @@ def __getitem__(self, key: Union[int, str, slice]): else: return super().__getitem__(key) + def extract_range(self, start: int, end: int) -> BiocFrame: + """Extract a range from `unlist_data`. + + This method must be implemented by subclasses to handle + type-specific extraction from `unlist_data`. + + Args: + start: + Start index (inclusive). + + end: + End index (exclusive). + + Returns: + Extracted element. 
+ """ + try: + return self._unlist_data[start:end, :] + except Exception as e: + raise NotImplementedError( + "Custom classes should implement their own `extract_range` method for slice operations" + ) from e + @splitAsCompressedList.register def _( diff --git a/tests/test_comp_biocframe.py b/tests/test_comp_biocframe.py new file mode 100644 index 0000000..65345b8 --- /dev/null +++ b/tests/test_comp_biocframe.py @@ -0,0 +1,31 @@ +import pytest +from biocframe import BiocFrame + +from compressed_lists import CompressedBiocFrameList, Partitioning + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +@pytest.fixture +def frame_data(): + return BiocFrame( + { + "ensembl": ["ENS00001", "ENS00002", "ENS00003"], + "symbol": ["MAP1A", "BIN1", "ESR1"], + } + ) + + +def test_creation(frame_data): + frame_list = CompressedBiocFrameList(frame_data, partitioning=Partitioning.from_lengths([1, 2])) + + assert isinstance(frame_list, CompressedBiocFrameList) + assert len(frame_list) == 2 + assert isinstance(frame_list.unlist_data, BiocFrame) + assert len(frame_list.get_unlist_data()) == 3 + assert list(frame_list.get_element_lengths()) == [1,2] + print(frame_list._unlist_data) + print(frame_list[0]) + assert frame_list[0].get_column("symbol") == ["MAP1A"] From 06a2a2e866bfce3905117986bdd5ad7f879d2d18 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Oct 2025 02:44:00 +0000 Subject: [PATCH 24/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/compressed_lists/__init__.py | 2 +- tests/test_comp_biocframe.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compressed_lists/__init__.py b/src/compressed_lists/__init__.py index 7d4f677..e9497e6 100644 --- a/src/compressed_lists/__init__.py +++ b/src/compressed_lists/__init__.py @@ -23,4 +23,4 @@ from .float_list import CompressedFloatList 
from .numpy_list import CompressedNumpyList from .biocframe_list import CompressedBiocFrameList -from .split_generic import splitAsCompressedList \ No newline at end of file +from .split_generic import splitAsCompressedList diff --git a/tests/test_comp_biocframe.py b/tests/test_comp_biocframe.py index 65345b8..ff54102 100644 --- a/tests/test_comp_biocframe.py +++ b/tests/test_comp_biocframe.py @@ -25,7 +25,7 @@ def test_creation(frame_data): assert len(frame_list) == 2 assert isinstance(frame_list.unlist_data, BiocFrame) assert len(frame_list.get_unlist_data()) == 3 - assert list(frame_list.get_element_lengths()) == [1,2] + assert list(frame_list.get_element_lengths()) == [1, 2] print(frame_list._unlist_data) print(frame_list[0]) assert frame_list[0].get_column("symbol") == ["MAP1A"] From 04ac7ca097f519ea894500224f2df54df9a8bb46 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 8 Oct 2025 19:46:24 -0700 Subject: [PATCH 25/28] remove prints --- src/compressed_lists/split_generic.py | 4 +--- tests/test_comp_biocframe.py | 2 -- tests/test_comp_int.py | 1 - tests/test_comp_numpy.py | 1 - 4 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/compressed_lists/split_generic.py b/src/compressed_lists/split_generic.py index 78aea5e..1bcd9c8 100644 --- a/src/compressed_lists/split_generic.py +++ b/src/compressed_lists/split_generic.py @@ -30,7 +30,6 @@ def groups_to_partition( Returns: Tuple of (partitioned_data_list, partitioning_object) """ - print(data, groups) if len(data) != len(groups): raise ValueError(f"Length of data ({len(data)}) must match length of groups ({len(groups)})") @@ -39,9 +38,8 @@ def groups_to_partition( group_dict[group].append(item) sorted_groups = sorted(group_dict.keys()) - print(sorted_groups) partitioned_data = [group_dict[group] for group in sorted_groups] - print("pdata", partitioned_data) + if names is None: group_names = [str(group) for group in sorted_groups] else: diff --git a/tests/test_comp_biocframe.py 
b/tests/test_comp_biocframe.py index ff54102..5306206 100644 --- a/tests/test_comp_biocframe.py +++ b/tests/test_comp_biocframe.py @@ -26,6 +26,4 @@ def test_creation(frame_data): assert isinstance(frame_list.unlist_data, BiocFrame) assert len(frame_list.get_unlist_data()) == 3 assert list(frame_list.get_element_lengths()) == [1, 2] - print(frame_list._unlist_data) - print(frame_list[0]) assert frame_list[0].get_column("symbol") == ["MAP1A"] diff --git a/tests/test_comp_int.py b/tests/test_comp_int.py index daac5d5..479bf34 100644 --- a/tests/test_comp_int.py +++ b/tests/test_comp_int.py @@ -87,7 +87,6 @@ def test_getitem_by_slice(int_list): def test_iteration(int_list, int_data): items = list(int_list) - print(items, int_data) for i, lst in enumerate(items): assert np.allclose(list(lst), int_data[i]) diff --git a/tests/test_comp_numpy.py b/tests/test_comp_numpy.py index 8611591..99afb8b 100644 --- a/tests/test_comp_numpy.py +++ b/tests/test_comp_numpy.py @@ -86,7 +86,6 @@ def test_getitem_by_slice(numpy_list): def test_iteration(numpy_list, numpy_data): items = list(numpy_list) - print(items, numpy_data) for i, lst in enumerate(items): assert np.allclose(lst, numpy_data[i]) From fea4835985bface7ff50d11970e57c96f35f7513 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 8 Oct 2025 20:06:52 -0700 Subject: [PATCH 26/28] update readme and docs --- README.md | 14 +++++++------ docs/tutorial.md | 54 +++++++++++++++++++++++++++--------------------- 2 files changed, 38 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 96c0fc1..1c1ce09 100644 --- a/README.md +++ b/README.md @@ -17,9 +17,8 @@ pip install compressed-lists ## Usage - ```py -from compressed_lists import CompressedIntegerList, CompressedStringList +from compressed_lists import CompressedIntegerList, CompressedStringList, Partitioning # Create a CompressedIntegerList int_data = [[1, 2, 3], [4, 5], [6, 7, 8, 9]] @@ -38,9 +37,12 @@ print(squared[0]) # [1, 4, 9] # Convert to a regular 
Python list regular_list = int_list.to_list() -# Create a CompressedStringList -char_data = [["apple", "banana"], ["cherry", "date", "elderberry"], ["fig"]] -char_list = CompressedStringList.from_list(char_data) +# Create a CompressedStringList from lengths +import biocutils as ut +char_data = ut.StringList(["apple", "banana", "cherry", "date", "elderberry", "fig"]) + +char_list = CompressedStringList(char_data, partitioning=Partitioning.from_lengths([2,3,1])) +print(char_list) ``` ### Partitioning @@ -61,7 +63,7 @@ start, end = part[1] # Returns (3, 5) > [!NOTE] > -> Check out the [documentation](https://biocpy.github.io/compressed-lists) for extending CompressedLists to custom data types. +> Check out the [documentation](https://biocpy.github.io/compressed-lists) for available compressed list implementations and extending `CompressedLists` to custom data types. diff --git a/docs/tutorial.md b/docs/tutorial.md index fdd1f0b..7092129 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -7,7 +7,7 @@ kernelspec: # Basic Usage ```{code-cell} -from compressed_lists import CompressedIntegerList, CompressedStringList +from compressed_lists import CompressedIntegerList, CompressedStringList, Partitioning # Create a CompressedIntegerList int_data = [[1, 2, 3], [4, 5], [6, 7, 8, 9]] @@ -15,20 +15,23 @@ names = ["A", "B", "C"] int_list = CompressedIntegerList.from_list(int_data, names) # Access elements -print(int_list[0]) # [1, 2, 3] -print(int_list["B"]) # [4, 5] -print(int_list[1:3]) # Slice of elements +print(int_list[0]) +print(int_list["B"]) +print(int_list[1:3]) # Apply a function to each element squared = int_list.lapply(lambda x: [i**2 for i in x]) -print(squared[0]) # [1, 4, 9] +print(squared[0]) # Convert to a regular Python list regular_list = int_list.to_list() -# Create a CompressedStringList -char_data = [["apple", "banana"], ["cherry", "date", "elderberry"], ["fig"]] -char_list = CompressedStringList.from_list(char_data) +# Create a CompressedStringList 
from lengths +import biocutils as ut +char_data = ut.StringList(["apple", "banana", "cherry", "date", "elderberry", "fig"]) + +char_list = CompressedStringList(char_data, partitioning=Partitioning.from_lengths([2,3,1])) +print(char_list) ``` ## Partitioning @@ -57,7 +60,7 @@ print(start, end) Create a new class that inherits from `CompressedList` with appropriate type annotations: ```python -from typing import List, TypeVar, Generic +from typing import List from compressed_lists import CompressedList, Partitioning import numpy as np @@ -72,31 +75,32 @@ The constructor should initialize the superclass with the appropriate data: ```python def __init__(self, - unlist_data: Any, # Replace with your data type - partitioning: Partitioning, - element_metadata: dict = None, - metadata: dict = None): + unlist_data: Any, # Replace with your data type + partitioning: Partitioning, + element_type: Any = None, + element_metadata: Optional[dict] = None, + metadata: Optional[dict] = None): super().__init__(unlist_data, partitioning, - element_type="custom_type", # Set your element type - element_metadata=element_metadata, - metadata=metadata) + element_type="custom_type", # Set your element type + element_metadata=element_metadata, + metadata=metadata) ``` -## 3. Implement _extract_range Method +## 3. Implement `extract_range` Method This method defines how to extract a range of elements from your unlisted data: ```python -def _extract_range(self, start: int, end: int) -> List[T]: +def extract_range(self, start: int, end: int) -> List[T]: """Extract a range from unlisted data.""" # For example, with numpy arrays: - return self.unlist_data[start:end].tolist() + return self.unlist_data[start:end] # Or for other data types: - # return self.unlist_data[start:end] + # return self.unlist_data[start:end, :] ``` -## 4. Implement from_list Class Method +## 4. 
Implement `from_list` Class Method This factory method creates a new instance from a list: @@ -140,7 +144,7 @@ class CompressedFloatList(CompressedList): element_metadata=element_metadata, metadata=metadata) - def _extract_range(self, start: int, end: int) -> List[float]: + def extract_range(self, start: int, end: int) -> List[float]: return self.unlist_data[start:end].tolist() @classmethod @@ -176,10 +180,10 @@ class MyObject: def __init__(self, value): self.value = value -class CompressedMyObjectList(CompressedList[List[MyObject]]): +class CompressedMyObjectList(CompressedList): # Implementation details... - def _extract_range(self, start: int, end: int) -> List[MyObject]: + def extract_range(self, start: int, end: int) -> List[MyObject]: return self.unlist_data[start:end] @classmethod @@ -187,3 +191,5 @@ class CompressedMyObjectList(CompressedList[List[MyObject]]): # Custom flattening and storage logic # ... ``` + +Check out the `CompressedBiocFrameList` for a complete example of this use case. From 0dd56eda8ef26daa08916a24f323aa4be7fd9a08 Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Wed, 8 Oct 2025 21:21:02 -0700 Subject: [PATCH 27/28] more tests and update changelog --- CHANGELOG.md | 2 +- src/compressed_lists/biocframe_list.py | 2 +- tests/test_comp_biocframe.py | 18 +++++++++++++++++- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fb081ab..ee0980a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ ## Version 0.2.0 -- Switch to typed lists from the biocutils package. +- Major changes to the package; switch to typed lists from the biocutils package.
## Version 0.1.0 - 0.1.1 diff --git a/src/compressed_lists/biocframe_list.py b/src/compressed_lists/biocframe_list.py index 4832804..b06c68d 100644 --- a/src/compressed_lists/biocframe_list.py +++ b/src/compressed_lists/biocframe_list.py @@ -78,7 +78,7 @@ def __getitem__(self, key: Union[int, str, slice]): if isinstance(key, str): column_data = self._unlist_data.get_column(key) return splitAsCompressedList( - column_data, partition=self._partitioning, names=self.names, metadata=self.metadata + column_data, groups_or_partitions=self._partitioning, names=self.names, metadata=self.metadata ) else: return super().__getitem__(key) diff --git a/tests/test_comp_biocframe.py b/tests/test_comp_biocframe.py index 5306206..1d37f28 100644 --- a/tests/test_comp_biocframe.py +++ b/tests/test_comp_biocframe.py @@ -1,7 +1,8 @@ import pytest from biocframe import BiocFrame +import biocutils as ut -from compressed_lists import CompressedBiocFrameList, Partitioning +from compressed_lists import CompressedBiocFrameList, Partitioning, CompressedStringList __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" @@ -27,3 +28,18 @@ def test_creation(frame_data): assert len(frame_list.get_unlist_data()) == 3 assert list(frame_list.get_element_lengths()) == [1, 2] assert frame_list[0].get_column("symbol") == ["MAP1A"] + + +def test_bframe_typed_list_column(): + bframe = BiocFrame( + { + "ensembl": ut.StringList(["ENS00001", "ENS00002", "ENS00003"]), + "symbol": ["MAP1A", "BIN1", "ESR1"], + } + ) + frame_list = CompressedBiocFrameList(bframe, partitioning=Partitioning.from_lengths([1, 2])) + + ens_col = frame_list["ensembl"] + assert isinstance(ens_col, CompressedStringList) + assert len(ens_col) == 2 + From 423ea911be30bf91c87342555bc1a6a5393fb092 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Oct 2025 04:21:11 +0000 Subject: [PATCH 28/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more 
information, see https://pre-commit.ci --- tests/test_comp_biocframe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_comp_biocframe.py b/tests/test_comp_biocframe.py index 1d37f28..64691b6 100644 --- a/tests/test_comp_biocframe.py +++ b/tests/test_comp_biocframe.py @@ -42,4 +42,3 @@ def test_bframe_typed_list_column(): ens_col = frame_list["ensembl"] assert isinstance(ens_col, CompressedStringList) assert len(ens_col) == 2 -