diff --git a/CHANGELOG.md b/CHANGELOG.md index 792a416..ee0980a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## Version 0.2.0 + +- Major changes to the package; switch to typed lists from the biocutils package. + ## Version 0.1.0 - 0.1.1 - Initial implementation of various classes - Partitioning and CompressedLists. diff --git a/README.md b/README.md index 96c0fc1..1c1ce09 100644 --- a/README.md +++ b/README.md @@ -17,9 +17,8 @@ pip install compressed-lists ## Usage - ```py -from compressed_lists import CompressedIntegerList, CompressedStringList +from compressed_lists import CompressedIntegerList, CompressedStringList, Partitioning # Create a CompressedIntegerList int_data = [[1, 2, 3], [4, 5], [6, 7, 8, 9]] @@ -38,9 +37,12 @@ print(squared[0]) # [1, 4, 9] # Convert to a regular Python list regular_list = int_list.to_list() -# Create a CompressedStringList -char_data = [["apple", "banana"], ["cherry", "date", "elderberry"], ["fig"]] -char_list = CompressedStringList.from_list(char_data) +# Create a CompressedStringList from lengths +import biocutils as ut +char_data = ut.StringList(["apple", "banana", "cherry", "date", "elderberry", "fig"]) + +char_list = CompressedStringList(char_data, partitioning=Partitioning.from_lengths([2,3,1])) +print(char_list) ``` ### Partitioning @@ -61,7 +63,7 @@ start, end = part[1] # Returns (3, 5) > [!NOTE] > -> Check out the [documentation](https://biocpy.github.io/compressed-lists) for extending CompressedLists to custom data types. +> Check out the [documentation](https://biocpy.github.io/compressed-lists) for the available compressed list implementations and for extending `CompressedList` to custom data types. 
diff --git a/docs/tutorial.md b/docs/tutorial.md index fdd1f0b..7092129 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -7,7 +7,7 @@ kernelspec: # Basic Usage ```{code-cell} -from compressed_lists import CompressedIntegerList, CompressedStringList +from compressed_lists import CompressedIntegerList, CompressedStringList, Partitioning # Create a CompressedIntegerList int_data = [[1, 2, 3], [4, 5], [6, 7, 8, 9]] @@ -15,20 +15,23 @@ names = ["A", "B", "C"] int_list = CompressedIntegerList.from_list(int_data, names) # Access elements -print(int_list[0]) # [1, 2, 3] -print(int_list["B"]) # [4, 5] -print(int_list[1:3]) # Slice of elements +print(int_list[0]) +print(int_list["B"]) +print(int_list[1:3]) # Apply a function to each element squared = int_list.lapply(lambda x: [i**2 for i in x]) -print(squared[0]) # [1, 4, 9] +print(squared[0]) # Convert to a regular Python list regular_list = int_list.to_list() -# Create a CompressedStringList -char_data = [["apple", "banana"], ["cherry", "date", "elderberry"], ["fig"]] -char_list = CompressedStringList.from_list(char_data) +# Create a CompressedStringList from lengths +import biocutils as ut +char_data = ut.StringList(["apple", "banana", "cherry", "date", "elderberry", "fig"]) + +char_list = CompressedStringList(char_data, partitioning=Partitioning.from_lengths([2,3,1])) +print(char_list) ``` ## Partitioning @@ -57,7 +60,7 @@ print(start, end) Create a new class that inherits from `CompressedList` with appropriate type annotations: ```python -from typing import List, TypeVar, Generic +from typing import List from compressed_lists import CompressedList, Partitioning import numpy as np @@ -72,31 +75,32 @@ The constructor should initialize the superclass with the appropriate data: ```python def __init__(self, - unlist_data: Any, # Replace with your data type - partitioning: Partitioning, - element_metadata: dict = None, - metadata: dict = None): + unlist_data: Any, # Replace with your data type + partitioning: 
Partitioning, + element_type: Any = None, + element_metadata: Optional[dict] = None, + metadata: Optional[dict] = None): super().__init__(unlist_data, partitioning, - element_type="custom_type", # Set your element type - element_metadata=element_metadata, - metadata=metadata) + element_type="custom_type", # Set your element type + element_metadata=element_metadata, + metadata=metadata) ``` -## 3. Implement _extract_range Method +## 3. Implement `extract_range` Method This method defines how to extract a range of elements from your unlisted data: ```python -def _extract_range(self, start: int, end: int) -> List[T]: +def extract_range(self, start: int, end: int) -> List[T]: """Extract a range from unlisted data.""" # For example, with numpy arrays: - return self.unlist_data[start:end].tolist() + return self.unlist_data[start:end] # Or for other data types: - # return self.unlist_data[start:end] + # return self.unlist_data[start:end, :] ``` -## 4. Implement from_list Class Method +## 4. Implement `from_list` Class Method This factory method creates a new instance from a list: @@ -140,7 +144,7 @@ class CompressedFloatList(CompressedList): element_metadata=element_metadata, metadata=metadata) - def _extract_range(self, start: int, end: int) -> List[float]: + def extract_range(self, start: int, end: int) -> List[float]: return self.unlist_data[start:end].tolist() @classmethod @@ -176,10 +180,10 @@ class MyObject: def __init__(self, value): self.value = value -class CompressedMyObjectList(CompressedList[List[MyObject]]): +class CompressedMyObjectList(CompressedList): # Implementation details... - def _extract_range(self, start: int, end: int) -> List[MyObject]: + def extract_range(self, start: int, end: int) -> List[MyObject]: return self.unlist_data[start:end] @classmethod @@ -187,3 +191,5 @@ class CompressedMyObjectList(CompressedList[List[MyObject]]): # Custom flattening and storage logic # ... 
``` + +Check out the `CompressedBiocFrameList` for a complete example of this use case. diff --git a/setup.cfg b/setup.cfg index 01e5b74..6cdfdec 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,6 +50,8 @@ python_requires = >=3.9 install_requires = importlib-metadata; python_version<"3.8" biocutils + numpy + biocframe [options.packages.find] diff --git a/src/compressed_lists/CompressedIntegerList.py b/src/compressed_lists/CompressedIntegerList.py deleted file mode 100644 index 5d3bae9..0000000 --- a/src/compressed_lists/CompressedIntegerList.py +++ /dev/null @@ -1,92 +0,0 @@ -from typing import List, Optional, Sequence - -import numpy as np - -from .CompressedList import CompressedList -from .partition import Partitioning - -__author__ = "Jayaram Kancherla" -__copyright__ = "Jayaram Kancherla" -__license__ = "MIT" - - -class CompressedIntegerList(CompressedList): - """CompressedList implementation for lists of integers.""" - - def __init__( - self, - unlist_data: np.ndarray, - partitioning: Partitioning, - element_metadata: dict = None, - metadata: dict = None, - **kwargs, - ): - """Initialize a CompressedIntegerList. - - Args: - unlist_data: - NumPy array of integers. - - partitioning: - Partitioning object defining element boundaries. - - element_metadata: - Optional metadata for elements. - - metadata: - Optional general metadata. - - kwargs: - Additional arguments. - """ - super().__init__( - unlist_data, partitioning, element_type="integer", element_metadata=element_metadata, metadata=metadata - ) - - def _extract_range(self, start: int, end: int) -> np.ndarray: - """Extract a range from unlist_data. - - Args: - start: - Start index (inclusive). - - end: - End index (exclusive). - - Returns: - Same type as unlist_data. 
- """ - return self._unlist_data[start:end] - - @classmethod - def from_list( - cls, lst: List[List[int]], names: Optional[Sequence[str]] = None, metadata: dict = None - ) -> "CompressedIntegerList": - """ - Create a CompressedIntegerList from a list of integer lists. - - Args: - lst: - List of integer lists. - - names: - Optional names for list elements. - - metadata: - Optional metadata. - - Returns: - A new CompressedIntegerList. - """ - # Flatten the list - flat_data = [] - for sublist in lst: - flat_data.extend(sublist) - - # Create partitioning - partitioning = Partitioning.from_list(lst, names) - - # Create unlist_data - unlist_data = np.array(flat_data, dtype=np.int64) - - return cls(unlist_data, partitioning, metadata=metadata) diff --git a/src/compressed_lists/CompressedStringList.py b/src/compressed_lists/CompressedStringList.py deleted file mode 100644 index 88cce67..0000000 --- a/src/compressed_lists/CompressedStringList.py +++ /dev/null @@ -1,86 +0,0 @@ -from typing import List, Optional, Sequence - -from .CompressedList import CompressedList -from .partition import Partitioning - -__author__ = "Jayaram Kancherla" -__copyright__ = "Jayaram Kancherla" -__license__ = "MIT" - - -class CompressedStringList(CompressedList): - """CompressedList implementation for lists of strings.""" - - def __init__( - self, - unlist_data: List[str], - partitioning: Partitioning, - element_metadata: dict = None, - metadata: dict = None, - **kwargs, - ): - """Initialize a CompressedStringList. - - Args: - unlist_data: - List of strings. - - partitioning: - Partitioning object defining element boundaries. - - element_metadata: - Optional metadata for elements. - - metadata: - Optional general metadata. - - kwargs: - Additional arguments. - """ - super().__init__( - unlist_data, partitioning, element_type="string", element_metadata=element_metadata, metadata=metadata - ) - - def _extract_range(self, start: int, end: int) -> List[str]: - """Extract a range from unlist_data. 
- - Args: - start: - Start index (inclusive). - - end: - End index (exclusive). - - Returns: - List of strings. - """ - return self._unlist_data[start:end] - - @classmethod - def from_list( - cls, lst: List[List[str]], names: Optional[Sequence[str]] = None, metadata: dict = None - ) -> "CompressedStringList": - """Create a `CompressedStringList` from a list of string lists. - - Args: - lst: - List of string lists. - - names: - Optional names for list elements. - - metadata: - Optional metadata. - - Returns: - A new `CompressedStringList`. - """ - # Flatten the list - flat_data = [] - for sublist in lst: - flat_data.extend(sublist) - - # Create partitioning - partitioning = Partitioning.from_list(lst, names) - - return cls(flat_data, partitioning, metadata=metadata) diff --git a/src/compressed_lists/__init__.py b/src/compressed_lists/__init__.py index b320c98..e9497e6 100644 --- a/src/compressed_lists/__init__.py +++ b/src/compressed_lists/__init__.py @@ -16,6 +16,11 @@ del version, PackageNotFoundError from .partition import Partitioning -from .CompressedList import CompressedList -from .CompressedIntegerList import CompressedIntegerList -from .CompressedStringList import CompressedStringList +from .base import CompressedList +from .integer_list import CompressedIntegerList +from .string_list import CompressedStringList, CompressedCharacterList +from .bool_list import CompressedBooleanList +from .float_list import CompressedFloatList +from .numpy_list import CompressedNumpyList +from .biocframe_list import CompressedBiocFrameList +from .split_generic import splitAsCompressedList diff --git a/src/compressed_lists/CompressedList.py b/src/compressed_lists/base.py similarity index 89% rename from src/compressed_lists/CompressedList.py rename to src/compressed_lists/base.py index 0f629bd..f450059 100644 --- a/src/compressed_lists/CompressedList.py +++ b/src/compressed_lists/base.py @@ -1,4 +1,3 @@ -from abc import ABC, abstractmethod from typing import Any, Callable, 
Iterator, List, Optional, Sequence, Union from warnings import warn @@ -20,7 +19,7 @@ def _validate_data_and_partitions(unlist_data, partition): ) -class CompressedList(ABC): +class CompressedList: """Base class for compressed list objects. `CompressedList` stores list elements concatenated in a single vector-like object @@ -31,8 +30,8 @@ def __init__( self, unlist_data: Any, partitioning: Partitioning, - element_type: str = None, - element_metadata: dict = None, + element_type: Any = None, + element_metadata: Optional[dict] = None, metadata: Optional[dict] = None, validate: bool = True, ): @@ -43,10 +42,10 @@ def __init__( Vector-like object containing concatenated elements. partitioning: - Partitioning object defining element boundaries. + Partitioning object defining element boundaries (exclusive). element_type: - String identifier for the type of elements. + class for the type of elements. element_metadata: Optional metadata for elements. @@ -66,7 +65,7 @@ def __init__( if validate: _validate_data_and_partitions(self._unlist_data, self._partitioning) - def _define_output(self, in_place: bool = False) -> "Partitioning": + def _define_output(self, in_place: bool = False) -> "CompressedList": if in_place is True: return self else: @@ -208,7 +207,7 @@ def get_names(self) -> Optional[ut.NamedList]: """Get the names of list elements.""" return self._partitioning.get_names() - def set_names(self, names: Sequence[str], in_place: bool = False) -> "CompressedList": + def set_names(self, names: List[str], in_place: bool = False) -> "CompressedList": """Set the names of list elements. 
names: @@ -402,7 +401,7 @@ def __getitem__(self, key: Union[int, str, slice]) -> Any: """ # string keys (names) if isinstance(key, str): - if key not in self.names: + if key not in list(self.get_names()): raise KeyError(f"No element named '{key}'.") key = list(self.names).index(key) @@ -414,7 +413,7 @@ def __getitem__(self, key: Union[int, str, slice]) -> Any: raise IndexError(f"List index '{key}' out of range.") start, end = self._partitioning.get_partition_range(key) - return self._extract_range(start, end) + return self.extract_range(start, end) # slices elif isinstance(key, slice): @@ -422,22 +421,21 @@ def __getitem__(self, key: Union[int, str, slice]) -> Any: result = [] for i in indices: start, end = self._partitioning.get_partition_range(i) - result.append(self._extract_range(start, end)) + result.append(self.extract_range(start, end)) - # Create a new CompressedList from the result - return self.__class__.from_list( + current_class_const = type(self) + return current_class_const.from_list( result, names=[self.names[i] for i in indices] if self.names[0] is not None else None ) else: - raise TypeError("Index must be int, str, or slice.") + raise TypeError("'key' must be int, str, or slice.") ################################## ######>> abstract methods <<###### ################################## - @abstractmethod - def _extract_range(self, start: int, end: int) -> Any: + def extract_range(self, start: int, end: int) -> Any: """Extract a range from `unlist_data`. This method must be implemented by subclasses to handle @@ -453,13 +451,17 @@ def _extract_range(self, start: int, end: int) -> Any: Returns: Extracted element. 
""" - pass + try: + return self._unlist_data[start:end] + except Exception as e: + raise NotImplementedError( + "Custom classes should implement their own `extract_range` method for slice operations" + ) from e @classmethod - @abstractmethod def from_list( - cls, lst: List[Any], names: Optional[Sequence[str]] = None, metadata: dict = None - ) -> "CompressedList[Any]": + cls, lst: List[Any], names: Optional[Sequence[str]] = None, metadata: Optional[dict] = None + ) -> "CompressedList": """Create a CompressedList from a regular list. This method must be implemented by subclasses to handle @@ -478,7 +480,18 @@ def from_list( Returns: A new `CompressedList`. """ - pass + # Flatten the list + flat_data = [] + for sublist in lst: + flat_data.extend(sublist) + + # Create partitioning + partitioning = Partitioning.from_list(lst, names) + + # Create unlist_data + # unlist_data = cls._element_type(data=flat_data) + + return cls(flat_data, partitioning, metadata=metadata) ########################### ######>> coercions <<###### @@ -506,7 +519,7 @@ def unlist(self, use_names: bool = True) -> Any: """ return self._unlist_data - def relist(self, unlist_data: Any) -> "CompressedList[Any]": + def relist(self, unlist_data: Any) -> "CompressedList": """Create a new `CompressedList` with the same partitioning but different data. Args: @@ -518,7 +531,8 @@ def relist(self, unlist_data: Any) -> "CompressedList[Any]": """ _validate_data_and_partitions(unlist_data, self._partitioning) - return self.__class__( + current_class_const = type(self) + return current_class_const( unlist_data, self._partitioning.copy(), element_type=self._element_type, @@ -526,7 +540,7 @@ def relist(self, unlist_data: Any) -> "CompressedList[Any]": metadata=self._metadata.copy(), ) - def extract_subset(self, indices: Sequence[int]) -> "CompressedList[Any]": + def extract_subset(self, indices: Sequence[int]) -> "CompressedList": """Extract a subset of elements by indices. 
Args: @@ -542,8 +556,8 @@ def extract_subset(self, indices: Sequence[int]) -> "CompressedList[Any]": raise IndexError(f"Index {i} out of range") # Extract element lengths and names - new_lengths = [self.get_element_lengths()[i] for i in indices] - new_names = [self.names[i] for i in indices] if self.names[0] is not None else None + new_lengths = ut.subset_sequence(self.get_element_lengths(), indices) + new_names = ut.subset_sequence(self.names, indices) if self.names is not None else None # Create new partitioning new_partitioning = Partitioning.from_lengths(new_lengths, new_names) @@ -560,8 +574,8 @@ def extract_subset(self, indices: Sequence[int]) -> "CompressedList[Any]": if isinstance(self._unlist_data, np.ndarray): new_data = np.concatenate(new_data) - # Create new compressed list - return self.__class__( + current_class_const = type(self) + return current_class_const( new_data, new_partitioning, element_type=self._element_type, @@ -569,7 +583,7 @@ def extract_subset(self, indices: Sequence[int]) -> "CompressedList[Any]": metadata=self._metadata.copy(), ) - def lapply(self, func: Callable) -> "CompressedList[Any]": + def lapply(self, func: Callable) -> "CompressedList": """Apply a function to each element. Args: @@ -580,4 +594,6 @@ def lapply(self, func: Callable) -> "CompressedList[Any]": A new CompressedList with the results. 
""" result = [func(elem) for elem in self] - return self.__class__.from_list(result, self.names, self._metadata) + + current_class_const = type(self) + return current_class_const.from_list(result, self.names, self._metadata) diff --git a/src/compressed_lists/biocframe_list.py b/src/compressed_lists/biocframe_list.py new file mode 100644 index 0000000..b06c68d --- /dev/null +++ b/src/compressed_lists/biocframe_list.py @@ -0,0 +1,126 @@ +from typing import List, Optional, Sequence, Union + +import biocutils as ut +from biocframe import BiocFrame + +from .base import CompressedList +from .partition import Partitioning +from .split_generic import _generic_register_helper, splitAsCompressedList + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +class CompressedBiocFrameList(CompressedList): + """CompressedList for BiocFrames.""" + + def __init__( + self, + unlist_data: BiocFrame, + partitioning: Partitioning, + element_metadata: Optional[dict] = None, + metadata: Optional[dict] = None, + **kwargs, + ): + """Initialize a CompressedBiocFrameList. + + Args: + unlist_data: + BiocFrame object. + + partitioning: + Partitioning object defining element boundaries. + + element_metadata: + Optional metadata for elements. + + metadata: + Optional general metadata. + + kwargs: + Additional arguments. + """ + if not isinstance(unlist_data, BiocFrame): + raise TypeError("'unlist_data' is not a `BiocFrame` object.") + + super().__init__( + unlist_data, partitioning, element_type="BiocFrame", element_metadata=element_metadata, metadata=metadata + ) + + @classmethod + def from_list( + cls, lst: List[BiocFrame], names: Optional[Sequence[str]] = None, metadata: Optional[dict] = None + ) -> "CompressedBiocFrameList": + """Create a `CompressedBiocFrameList` from a regular list. + + This concatenates the list of `BiocFrame` objects. + + Args: + lst: + List of `BiocFrame` objects. + + names: + Optional names for list elements. 
+ + metadata: + Optional metadata. + + Returns: + A new `CompressedList`. + """ + unlist_data = ut.relaxed_combine_rows(*lst) + partitioning = Partitioning.from_list(lst, names) + return cls(unlist_data, partitioning, metadata=metadata) + + def __getitem__(self, key: Union[int, str, slice]): + """Override to handle column extraction using `splitAsCompressedList`.""" + if isinstance(key, str): + column_data = self._unlist_data.get_column(key) + return splitAsCompressedList( + column_data, groups_or_partitions=self._partitioning, names=self.names, metadata=self.metadata + ) + else: + return super().__getitem__(key) + + def extract_range(self, start: int, end: int) -> BiocFrame: + """Extract a range from `unlist_data`. + + This method must be implemented by subclasses to handle + type-specific extraction from `unlist_data`. + + Args: + start: + Start index (inclusive). + + end: + End index (exclusive). + + Returns: + Extracted element. + """ + try: + return self._unlist_data[start:end, :] + except Exception as e: + raise NotImplementedError( + "Custom classes should implement their own `extract_range` method for slice operations" + ) from e + + +@splitAsCompressedList.register +def _( + data: BiocFrame, + groups_or_partitions: Union[list, Partitioning], + names: Optional[Sequence[str]] = None, + metadata: Optional[dict] = None, +) -> CompressedBiocFrameList: + """Handle lists of BiocFrame objects.""" + + partitioned_data, groups_or_partitions = _generic_register_helper( + data=data, groups_or_partitions=groups_or_partitions, names=names + ) + + if not isinstance(partitioned_data, BiocFrame): + partitioned_data = ut.relaxed_combine_rows(*partitioned_data) + + return CompressedBiocFrameList(unlist_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata) diff --git a/src/compressed_lists/bool_list.py b/src/compressed_lists/bool_list.py new file mode 100644 index 0000000..8ca8f29 --- /dev/null +++ b/src/compressed_lists/bool_list.py @@ -0,0 +1,73 @@ 
+from typing import Optional, Sequence, Union +from warnings import warn + +import biocutils as ut + +from .base import CompressedList +from .partition import Partitioning +from .split_generic import _generic_register_helper, splitAsCompressedList + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +class CompressedBooleanList(CompressedList): + """CompressedList implementation for lists of booleans.""" + + def __init__( + self, + unlist_data: ut.BooleanList, + partitioning: Partitioning, + element_metadata: Optional[dict] = None, + metadata: Optional[dict] = None, + **kwargs, + ): + """Initialize a CompressedBooleanList. + + Args: + unlist_data: + List of booleans. + + partitioning: + Partitioning object defining element boundaries. + + element_metadata: + Optional metadata for elements. + + metadata: + Optional general metadata. + + kwargs: + Additional arguments. + """ + + if not isinstance(unlist_data, ut.BooleanList): + try: + warn("trying to coerce 'unlist_data' to `BooleanList`..") + unlist_data = ut.BooleanList(unlist_data) + except Exception as e: + raise TypeError("'unlist_data' must be an `BooleanList`, provided ", type(unlist_data)) from e + + super().__init__( + unlist_data, partitioning, element_type=ut.BooleanList, element_metadata=element_metadata, metadata=metadata + ) + + +@splitAsCompressedList.register +def _( + data: ut.BooleanList, + groups_or_partitions: Union[list, Partitioning], + names: Optional[Sequence[str]] = None, + metadata: Optional[dict] = None, +) -> CompressedBooleanList: + """Handle lists of booleans.""" + + partitioned_data, groups_or_partitions = _generic_register_helper( + data=data, groups_or_partitions=groups_or_partitions, names=names + ) + + if not isinstance(partitioned_data, ut.BooleanList): + partitioned_data = ut.combine_sequences(*partitioned_data) + + return CompressedBooleanList(unlist_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata) diff 
--git a/src/compressed_lists/float_list.py b/src/compressed_lists/float_list.py new file mode 100644 index 0000000..006f767 --- /dev/null +++ b/src/compressed_lists/float_list.py @@ -0,0 +1,73 @@ +from typing import Optional, Sequence, Union +from warnings import warn + +import biocutils as ut + +from .base import CompressedList +from .partition import Partitioning +from .split_generic import _generic_register_helper, splitAsCompressedList + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +class CompressedFloatList(CompressedList): + """CompressedList implementation for lists of floats.""" + + def __init__( + self, + unlist_data: ut.FloatList, + partitioning: Partitioning, + element_metadata: Optional[dict] = None, + metadata: Optional[dict] = None, + **kwargs, + ): + """Initialize a CompressedFloatList. + + Args: + unlist_data: + List of floats. + + partitioning: + Partitioning object defining element boundaries. + + element_metadata: + Optional metadata for elements. + + metadata: + Optional general metadata. + + kwargs: + Additional arguments. 
+ """ + + if not isinstance(unlist_data, ut.FloatList): + try: + warn("trying to coerce 'unlist_data' to `FloatList`..") + unlist_data = ut.FloatList(unlist_data) + except Exception as e: + raise TypeError("'unlist_data' must be an `FloatList`, provided ", type(unlist_data)) from e + + super().__init__( + unlist_data, partitioning, element_type=ut.FloatList, element_metadata=element_metadata, metadata=metadata + ) + + +@splitAsCompressedList.register +def _( + data: ut.FloatList, + groups_or_partitions: Union[list, Partitioning], + names: Optional[Sequence[str]] = None, + metadata: Optional[dict] = None, +) -> CompressedFloatList: + """Handle lists of floats.""" + + partitioned_data, groups_or_partitions = _generic_register_helper( + data=data, groups_or_partitions=groups_or_partitions, names=names + ) + + if not isinstance(partitioned_data, ut.FloatList): + partitioned_data = ut.combine_sequences(*partitioned_data) + + return CompressedFloatList(unlist_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata) diff --git a/src/compressed_lists/integer_list.py b/src/compressed_lists/integer_list.py new file mode 100644 index 0000000..e79a9f1 --- /dev/null +++ b/src/compressed_lists/integer_list.py @@ -0,0 +1,73 @@ +from typing import Optional, Sequence, Union +from warnings import warn + +import biocutils as ut + +from .base import CompressedList +from .partition import Partitioning +from .split_generic import _generic_register_helper, splitAsCompressedList + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +class CompressedIntegerList(CompressedList): + """CompressedList implementation for lists of integers.""" + + def __init__( + self, + unlist_data: ut.IntegerList, + partitioning: Partitioning, + element_metadata: Optional[dict] = None, + metadata: Optional[dict] = None, + **kwargs, + ): + """Initialize a CompressedIntegerList. + + Args: + unlist_data: + List of integers. 
+ + partitioning: + Partitioning object defining element boundaries. + + element_metadata: + Optional metadata for elements. + + metadata: + Optional general metadata. + + kwargs: + Additional arguments. + """ + + if not isinstance(unlist_data, ut.IntegerList): + try: + warn("trying to coerce 'unlist_data' to `IntegerList`..") + unlist_data = ut.IntegerList(unlist_data) + except Exception as e: + raise TypeError("'unlist_data' must be an `IntegerList`, provided ", type(unlist_data)) from e + + super().__init__( + unlist_data, partitioning, element_type=ut.IntegerList, element_metadata=element_metadata, metadata=metadata + ) + + +@splitAsCompressedList.register +def _( + data: ut.IntegerList, + groups_or_partitions: Union[list, Partitioning], + names: Optional[Sequence[str]] = None, + metadata: Optional[dict] = None, +) -> CompressedIntegerList: + """Handle lists of integers.""" + + partitioned_data, groups_or_partitions = _generic_register_helper( + data=data, groups_or_partitions=groups_or_partitions, names=names + ) + + if not isinstance(partitioned_data, ut.IntegerList): + partitioned_data = ut.combine_sequences(*partitioned_data) + + return CompressedIntegerList(unlist_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata) diff --git a/src/compressed_lists/numpy_list.py b/src/compressed_lists/numpy_list.py new file mode 100644 index 0000000..1e70f2b --- /dev/null +++ b/src/compressed_lists/numpy_list.py @@ -0,0 +1,103 @@ +from typing import List, Optional, Sequence, Union +from warnings import warn + +import biocutils as ut +import numpy as np + +from .base import CompressedList +from .partition import Partitioning +from .split_generic import _generic_register_helper, splitAsCompressedList + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +class CompressedNumpyList(CompressedList): + """CompressedList implementation for lists of NumPy vectors.""" + + def __init__( + self, + unlist_data: 
np.ndarray, + partitioning: Partitioning, + element_metadata: Optional[dict] = None, + metadata: Optional[dict] = None, + **kwargs, + ): + """Initialize a CompressedNumpyList. + + Args: + unlist_data: + List of NumPy vectors. + + partitioning: + Partitioning object defining element boundaries. + + element_metadata: + Optional metadata for elements. + + metadata: + Optional general metadata. + + kwargs: + Additional arguments. + """ + + if not isinstance(unlist_data, np.ndarray): + try: + warn("trying to concatenate/coerce 'unlist_data' to a `np.ndarray`..") + unlist_data = np.concatenate(unlist_data) + except Exception as e: + raise TypeError("'unlist_data' must be an `np.ndarray`, provided ", type(unlist_data)) from e + + super().__init__( + unlist_data, partitioning, element_type=np.ndarray, element_metadata=element_metadata, metadata=metadata + ) + + @classmethod + def from_list( + cls, lst: List[np.ndarray], names: Optional[Sequence[str]] = None, metadata: Optional[dict] = None + ) -> "CompressedNumpyList": + """ + Create a `CompressedNumpyList` from a list of NumPy vectors. + + Args: + lst: + List of NumPy vectors. + + names: + Optional names for list elements. + + metadata: + Optional metadata. + + Returns: + A new `CompressedNumpyList`. 
+ """ + partitioning = Partitioning.from_list(lst, names) + + if len(lst) == 0: + unlist_data = np.array([]) + else: + unlist_data = np.concatenate(lst) + + return cls(unlist_data, partitioning, metadata=metadata) + + +@splitAsCompressedList.register +def _( + data: np.ndarray, + groups_or_partitions: Union[list, Partitioning], + names: Optional[Sequence[str]] = None, + metadata: Optional[dict] = None, +) -> CompressedNumpyList: + """Handle NumPy arrays.""" + + partitioned_data, groups_or_partitions = _generic_register_helper( + data=data, groups_or_partitions=groups_or_partitions, names=names + ) + + if not isinstance(partitioned_data, np.ndarray): + partitioned_data = ut.combine_sequences(*partitioned_data) + + return CompressedNumpyList(unlist_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata) diff --git a/src/compressed_lists/partition.py b/src/compressed_lists/partition.py index 154ad5e..6782c65 100644 --- a/src/compressed_lists/partition.py +++ b/src/compressed_lists/partition.py @@ -46,7 +46,7 @@ def __init__(self, ends: Sequence[int], names: Optional[Sequence[str]] = None, v self._names = None if names is not None: - self._names = ut.NamedList(names) + self._names = ut.Names(names) if validate: _validate_names(names, len(ends)) @@ -212,11 +212,11 @@ def __getitem__(self, key: Union[int, slice]) -> Union[tuple, List[tuple]]: ######>> names <<##### ###################### - def get_names(self) -> Optional[ut.NamedList]: + def get_names(self) -> Optional[ut.Names]: """Return the names of each partition.""" return self._names - def set_names(self, names: Optional[List[str]], in_place: bool = False) -> "Partitioning": + def set_names(self, names: Optional[Sequence[str]], in_place: bool = False) -> "Partitioning": """Set the names of list elements. 
def groups_to_partition(
    data: Any, groups: list, names: Optional[Sequence[str]] = None
) -> Tuple[List[Any], Partitioning]:
    """Bucket a flat vector by group label and describe the result as a partitioning.

    Items are collected per distinct label (keeping their original relative
    order inside each bucket) and the buckets are emitted in sorted label order.

    Args:
        data:
            Flat vector-like object to split.

        groups:
            Group label for every item of ``data``; must have the same length.

        names:
            Optional names, one per distinct group. Defaults to the string
            form of each group label.

    Returns:
        Tuple of ``(partitioned_data_list, partitioning_object)`` where the
        first entry holds one list of items per group.

    Raises:
        ValueError:
            If ``data`` and ``groups`` differ in length, or ``names`` does not
            have one entry per distinct group.
    """
    if len(data) != len(groups):
        raise ValueError(f"Length of data ({len(data)}) must match length of groups ({len(groups)})")

    # Collect the members of every label; insertion keeps within-group order.
    buckets = {}
    for value, label in zip(data, groups):
        buckets.setdefault(label, []).append(value)

    ordered_labels = sorted(buckets)
    partitioned_data = [buckets[label] for label in ordered_labels]

    if names is not None:
        if len(names) != len(ordered_labels):
            raise ValueError(
                f"Length of names ({len(names)}) must match number of unique groups ({len(ordered_labels)})"
            )
        group_names = names
    else:
        group_names = [str(label) for label in ordered_labels]

    return partitioned_data, Partitioning.from_list(partitioned_data, group_names)
+ """ + element_type = type(data) + raise NotImplementedError(f"No `splitAsCompressedList` dispatcher found for element type {element_type}") + + +def _generic_register_helper(data, groups_or_partitions, names=None): + if groups_or_partitions is None: + raise ValueError("'groups_or_paritions' cannot be 'None'.") + + if not data: + raise ValueError("'data' cannot be empty.") + + if isinstance(groups_or_partitions, Partitioning): + if names is not None: + groups_or_partitions = groups_or_partitions.set_names(names, in_place=False) + + # TODO: probably not necessary to split when groups is a partition object. + # unless ordering matters + # partitioned_data = [] + # for i in range(len(groups_or_partitions)): + # start, end = groups_or_partitions.get_partition_range(i) + # partitioned_data.append(data[start:end]) + partitioned_data = data + elif isinstance(groups_or_partitions, (list, np.ndarray)): + partitioned_data, groups_or_partitions = groups_to_partition(data, groups=groups_or_partitions, names=names) + + if len(partitioned_data) == 0: + raise ValueError("No data after grouping") + else: + raise ValueError("'groups_or_paritions' must be a group vector or a Partition object.") + + return partitioned_data, groups_or_partitions diff --git a/src/compressed_lists/string_list.py b/src/compressed_lists/string_list.py new file mode 100644 index 0000000..f612747 --- /dev/null +++ b/src/compressed_lists/string_list.py @@ -0,0 +1,76 @@ +from typing import Optional, Sequence, Union +from warnings import warn + +import biocutils as ut + +from .base import CompressedList +from .partition import Partitioning +from .split_generic import _generic_register_helper, splitAsCompressedList + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +class CompressedStringList(CompressedList): + """CompressedList implementation for lists of strings.""" + + def __init__( + self, + unlist_data: ut.StringList, + partitioning: Partitioning, + 
@splitAsCompressedList.register
def _(
    data: ut.StringList,
    groups_or_partitions: Union[list, Partitioning],
    names: Optional[Sequence[str]] = None,
    metadata: Optional[dict] = None,
) -> CompressedStringList:
    """Split a `StringList` into a `CompressedStringList`.

    Args:
        data:
            Flat `StringList` to split.

        groups_or_partitions:
            Group membership vector (same length as ``data``) or an
            explicit `Partitioning` object.

        names:
            Optional names for the list elements.

        metadata:
            Optional metadata for the resulting list.

    Returns:
        A `CompressedStringList` built from ``data``.
    """
    partitioned_data, groups_or_partitions = _generic_register_helper(
        data=data, groups_or_partitions=groups_or_partitions, names=names
    )

    if not isinstance(partitioned_data, ut.StringList):
        # Group-based splitting yields one sequence per group. Combine them and
        # make the `StringList` coercion explicit here, so the constructor is
        # handed the type it expects instead of relying on its warn-and-coerce
        # fallback.
        partitioned_data = ut.StringList(ut.combine_sequences(*partitioned_data))

    return CompressedStringList(unlist_data=partitioned_data, partitioning=groups_or_partitions, metadata=metadata)
+ { + "ensembl": ["ENS00001", "ENS00002", "ENS00003"], + "symbol": ["MAP1A", "BIN1", "ESR1"], + } + ) + + +def test_creation(frame_data): + frame_list = CompressedBiocFrameList(frame_data, partitioning=Partitioning.from_lengths([1, 2])) + + assert isinstance(frame_list, CompressedBiocFrameList) + assert len(frame_list) == 2 + assert isinstance(frame_list.unlist_data, BiocFrame) + assert len(frame_list.get_unlist_data()) == 3 + assert list(frame_list.get_element_lengths()) == [1, 2] + assert frame_list[0].get_column("symbol") == ["MAP1A"] + + +def test_bframe_typed_list_column(): + bframe = BiocFrame( + { + "ensembl": ut.StringList(["ENS00001", "ENS00002", "ENS00003"]), + "symbol": ["MAP1A", "BIN1", "ESR1"], + } + ) + frame_list = CompressedBiocFrameList(bframe, partitioning=Partitioning.from_lengths([1, 2])) + + ens_col = frame_list["ensembl"] + assert isinstance(ens_col, CompressedStringList) + assert len(ens_col) == 2 diff --git a/tests/test_comp_bool.py b/tests/test_comp_bool.py new file mode 100644 index 0000000..077ae35 --- /dev/null +++ b/tests/test_comp_bool.py @@ -0,0 +1,33 @@ +import pytest +from biocutils.BooleanList import BooleanList + +from compressed_lists import CompressedBooleanList + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +@pytest.fixture +def bool_data(): + return [[True, False], [False, True, False], [False]] + + +@pytest.fixture +def bool_list(bool_data): + names = ["fruits1", "fruits2", "fruits3"] + return CompressedBooleanList.from_list(bool_data, names) + + +def test_creation(bool_data): + bool_list = CompressedBooleanList.from_list(bool_data) + + assert len(bool_list) == 3 + assert isinstance(bool_list.unlist_data, BooleanList) + assert list(bool_list.get_unlist_data()) == [True, False, False, True, False, False] + assert list(bool_list.get_element_lengths()) == [2, 3, 1] + + +def test_getitem(bool_list): + assert list(bool_list[0]) == [True, False] + assert list(bool_list["fruits2"]) 
== [False, True, False] diff --git a/tests/test_comp_custom.py b/tests/test_comp_custom.py index 1476109..82bd860 100644 --- a/tests/test_comp_custom.py +++ b/tests/test_comp_custom.py @@ -1,6 +1,5 @@ -from typing import List +from typing import Any, List, Optional, Sequence -import numpy as np import pytest from compressed_lists import CompressedList, Partitioning @@ -11,39 +10,42 @@ @pytest.fixture -def CompressedFloatList(): - class CompressedFloatList(CompressedList): +def CompressedCustomFloatList(): + class CompressedCustomFloatList(CompressedList): def __init__( self, - unlist_data: np.ndarray, + unlist_data: Any, partitioning: Partitioning, - element_metadata: dict = None, - metadata: dict = None, + element_type: Any = None, + element_metadata: Optional[dict] = None, + metadata: Optional[dict] = None, + validate: bool = True, ): super().__init__( unlist_data, partitioning, element_type="float", element_metadata=element_metadata, metadata=metadata ) - def _extract_range(self, start: int, end: int) -> List[float]: - return self._unlist_data[start:end].tolist() + def extract_range(self, start: int, end: int) -> List[float]: + return self._unlist_data[start:end] @classmethod - def from_list(cls, lst: List[List[float]], names: list = None, metadata: dict = None) -> "CompressedFloatList": + def from_list( + cls, lst: List[Any], names: Optional[Sequence[str]] = None, metadata: Optional[dict] = None + ) -> "CompressedCustomFloatList": flat_data = [] for sublist in lst: flat_data.extend(sublist) partitioning = Partitioning.from_list(lst, names) - unlist_data = np.array(flat_data, dtype=np.float64) - return cls(unlist_data, partitioning, metadata=metadata) + return cls(flat_data, partitioning, metadata=metadata) - return CompressedFloatList + return CompressedCustomFloatList -def test_custom_class(CompressedFloatList): +def test_custom_class(CompressedCustomFloatList): float_data = [[1.1, 2.2, 3.3], [4.4, 5.5], [6.6, 7.7, 8.8, 9.9]] names = ["X", "Y", "Z"] - 
float_list = CompressedFloatList.from_list(float_data, names) + float_list = CompressedCustomFloatList.from_list(float_data, names) assert len(float_list) == 3 assert float_list._element_type == "float" @@ -55,3 +57,13 @@ def test_custom_class(CompressedFloatList): assert rounded[0] == [1.0, 2.0, 3.0] assert rounded[1] == [4.0, 6.0] assert rounded[2] == [7.0, 8.0, 9.0, 10.0] + + +def test_custom_plain_list(): + list_of_bools = [[True, False], [False, True, False], [False]] + unclassed = CompressedList.from_list(list_of_bools) + + assert unclassed is not None + assert isinstance(unclassed, CompressedList) + assert len(unclassed) == 3 + assert list(unclassed.get_element_lengths()) == [2, 3, 1] diff --git a/tests/test_comp_float.py b/tests/test_comp_float.py new file mode 100644 index 0000000..1913508 --- /dev/null +++ b/tests/test_comp_float.py @@ -0,0 +1,33 @@ +import pytest +from biocutils.FloatList import FloatList + +from compressed_lists import CompressedFloatList + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +@pytest.fixture +def float_data(): + return [[1.1, 1.2], [2.1, 2.2, 2.3], [3]] + + +@pytest.fixture +def float_list(float_data): + names = ["fruits1", "fruits2", "fruits3"] + return CompressedFloatList.from_list(float_data, names) + + +def test_creation(float_data): + float_list = CompressedFloatList.from_list(float_data) + + assert len(float_list) == 3 + assert isinstance(float_list.unlist_data, FloatList) + assert list(float_list.get_unlist_data()) == [1.1, 1.2, 2.1, 2.2, 2.3, 3.0] + assert list(float_list.get_element_lengths()) == [2, 3, 1] + + +def test_getitem(float_list): + assert list(float_list[0]) == [1.1, 1.2] + assert list(float_list["fruits2"]) == [2.1, 2.2, 2.3] diff --git a/tests/test_comp_int.py b/tests/test_comp_int.py index 5771031..479bf34 100644 --- a/tests/test_comp_int.py +++ b/tests/test_comp_int.py @@ -1,5 +1,6 @@ import numpy as np import pytest +from biocutils.IntegerList import 
IntegerList from compressed_lists import CompressedIntegerList, Partitioning @@ -23,16 +24,16 @@ def test_creation(int_data): int_list = CompressedIntegerList.from_list(int_data) assert len(int_list) == 3 - assert isinstance(int_list.unlist_data, np.ndarray) + assert isinstance(int_list.unlist_data, IntegerList) assert list(int_list.get_unlist_data()) == [1, 2, 3, 4, 5, 6, 7, 8, 9] assert list(int_list.get_element_lengths()) == [3, 2, 4] def test_creation_from_parts(): - int_list = CompressedIntegerList(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]), Partitioning(ends=[3, 5, 9])) + int_list = CompressedIntegerList([1, 2, 3, 4, 5, 6, 7, 8, 9], Partitioning(ends=[3, 5, 9])) assert len(int_list) == 3 - assert isinstance(int_list.unlist_data, np.ndarray) + assert isinstance(int_list.unlist_data, IntegerList) assert list(int_list.get_unlist_data()) == [1, 2, 3, 4, 5, 6, 7, 8, 9] assert list(int_list.get_element_lengths()) == [3, 2, 4] @@ -45,7 +46,7 @@ def test_creation_with_names(int_data): def test_validation(): - data = np.array([1, 2, 3, 4, 5]) + data = IntegerList([1, 2, 3, 4, 5]) partitioning = Partitioning([2, 4, 7]) with pytest.raises(ValueError): @@ -53,20 +54,19 @@ def test_validation(): def test_getitem_by_index(int_list): - assert np.allclose(int_list[0], [1, 2, 3]) - assert np.allclose(int_list[1], [4, 5]) - assert np.allclose(int_list[2], [6, 7, 8, 9]) - - assert np.allclose(int_list[-1], [6, 7, 8, 9]) + assert np.allclose(list(int_list[0]), [1, 2, 3]) + assert np.allclose(list(int_list[1]), [4, 5]) + assert np.allclose(list(int_list[2]), [6, 7, 8, 9]) + assert np.allclose(list(int_list[-1]), [6, 7, 8, 9]) with pytest.raises(IndexError): int_list[3] def test_getitem_by_name(int_list): - assert np.allclose(int_list["A"], [1, 2, 3]) - assert np.allclose(int_list["B"], [4, 5]) - assert np.allclose(int_list["C"], [6, 7, 8, 9]) + assert np.allclose(list(int_list["A"]), [1, 2, 3]) + assert np.allclose(list(int_list["B"]), [4, 5]) + assert np.allclose(list(int_list["C"]), 
[6, 7, 8, 9]) with pytest.raises(KeyError): int_list["D"] @@ -76,8 +76,8 @@ def test_getitem_by_slice(int_list): sliced = int_list[1:3] assert len(sliced) == 2 - assert np.allclose(sliced[0], [4, 5]) - assert np.allclose(sliced[1], [6, 7, 8, 9]) + assert np.allclose(list(sliced[0]), [4, 5]) + assert np.allclose(list(sliced[1]), [6, 7, 8, 9]) assert list(sliced.names) == ["B", "C"] # Empty slice @@ -87,43 +87,42 @@ def test_getitem_by_slice(int_list): def test_iteration(int_list, int_data): items = list(int_list) - print(items, int_data) for i, lst in enumerate(items): - assert np.allclose(lst, int_data[i]) + assert np.allclose(list(lst), int_data[i]) def test_to_list(int_list, int_data): regular_list = int_list.to_list() for i, lst in enumerate(regular_list): - assert np.allclose(lst, int_data[i]) + assert np.allclose(list(lst), int_data[i]) def test_unlist(int_list): unlisted = int_list.unlist() - assert isinstance(unlisted, np.ndarray) + assert isinstance(unlisted, IntegerList) assert np.allclose(list(unlisted), [1, 2, 3, 4, 5, 6, 7, 8, 9]) def test_relist(int_list): - new_data = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90], dtype=np.int64) + new_data = IntegerList([10, 20, 30, 40, 50, 60, 70, 80, 90]) relisted = int_list.relist(new_data) assert len(relisted) == len(int_list) assert list(relisted.get_names()) == list(int_list.names) - assert np.allclose(relisted[0], [10, 20, 30]) - assert np.allclose(relisted[1], [40, 50]) - assert np.allclose(relisted[2], [60, 70, 80, 90]) + assert np.allclose(list(relisted[0]), [10, 20, 30]) + assert np.allclose(list(relisted[1]), [40, 50]) + assert np.allclose(list(relisted[2]), [60, 70, 80, 90]) with pytest.raises(ValueError): - int_list.relist(np.array([1, 2, 3])) + int_list.relist(IntegerList([1, 2, 3])) def test_extract_subset(int_list): subset = int_list.extract_subset([0, 2]) assert len(subset) == 2 - assert np.allclose(subset[0], [1, 2, 3]) - assert np.allclose(subset[1], [6, 7, 8, 9]) + assert 
np.allclose(list(subset[0]), [1, 2, 3]) + assert np.allclose(list(subset[1]), [6, 7, 8, 9]) assert list(subset.names) == ["A", "C"] with pytest.raises(IndexError): @@ -134,6 +133,6 @@ def test_lapply(int_list): squared = int_list.lapply(lambda x: [i**2 for i in x]) assert len(squared) == len(int_list) - assert np.allclose(squared[0], [1, 4, 9]) - assert np.allclose(squared[1], [16, 25]) - assert np.allclose(squared[2], [36, 49, 64, 81]) + assert np.allclose(list(squared[0]), [1, 4, 9]) + assert np.allclose(list(squared[1]), [16, 25]) + assert np.allclose(list(squared[2]), [36, 49, 64, 81]) diff --git a/tests/test_comp_numpy.py b/tests/test_comp_numpy.py new file mode 100644 index 0000000..99afb8b --- /dev/null +++ b/tests/test_comp_numpy.py @@ -0,0 +1,102 @@ +import numpy as np +import pytest + +from compressed_lists import CompressedNumpyList, Partitioning + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +@pytest.fixture +def numpy_data(): + return [np.array([1, 2, 3]), np.array([4, 5]), np.array([6, 7, 8, 9])] + + +@pytest.fixture +def numpy_list(numpy_data): + names = ["A", "B", "C"] + return CompressedNumpyList.from_list(numpy_data, names) + + +def test_creation(numpy_data): + numpy_list = CompressedNumpyList.from_list(numpy_data) + + assert len(numpy_list) == 3 + assert isinstance(numpy_list.unlist_data, np.ndarray) + assert list(numpy_list.get_unlist_data()) == [1, 2, 3, 4, 5, 6, 7, 8, 9] + assert list(numpy_list.get_element_lengths()) == [3, 2, 4] + + +def test_creation_from_parts(): + numpy_list = CompressedNumpyList(np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]), Partitioning(ends=[3, 5, 9])) + + assert len(numpy_list) == 3 + assert isinstance(numpy_list.unlist_data, np.ndarray) + assert list(numpy_list.get_unlist_data()) == [1, 2, 3, 4, 5, 6, 7, 8, 9] + assert list(numpy_list.get_element_lengths()) == [3, 2, 4] + + +def test_creation_with_names(numpy_data): + names = ["A", "B", "C"] + numpy_list = 
CompressedNumpyList.from_list(numpy_data, names) + + assert list(numpy_list.names) == names + + +def test_validation(): + data = np.array([1, 2, 3, 4, 5]) + partitioning = Partitioning([2, 4, 7]) + + with pytest.raises(ValueError): + CompressedNumpyList(data, partitioning) + + +def test_getitem_by_index(numpy_list): + assert np.allclose(numpy_list[0], [1, 2, 3]) + assert np.allclose(numpy_list[1], [4, 5]) + assert np.allclose(numpy_list[2], [6, 7, 8, 9]) + assert np.allclose(numpy_list[-1], [6, 7, 8, 9]) + + with pytest.raises(IndexError): + numpy_list[3] + + +def test_getitem_by_name(numpy_list): + assert np.allclose(numpy_list["A"], [1, 2, 3]) + assert np.allclose(numpy_list["B"], [4, 5]) + assert np.allclose(numpy_list["C"], [6, 7, 8, 9]) + + with pytest.raises(KeyError): + numpy_list["D"] + + +def test_getitem_by_slice(numpy_list): + sliced = numpy_list[1:3] + + assert len(sliced) == 2 + assert np.allclose(sliced[0], [4, 5]) + assert np.allclose(sliced[1], [6, 7, 8, 9]) + assert list(sliced.names) == ["B", "C"] + + # Empty slice + empty = numpy_list[3:4] + assert len(empty) == 0 + + +def test_iteration(numpy_list, numpy_data): + items = list(numpy_list) + for i, lst in enumerate(items): + assert np.allclose(lst, numpy_data[i]) + + +def test_to_list(numpy_list, numpy_data): + regular_list = numpy_list.to_list() + for i, lst in enumerate(regular_list): + assert np.allclose(list(lst), numpy_data[i]) + + +def test_unlist(numpy_list): + unlisted = numpy_list.unlist() + assert isinstance(unlisted, np.ndarray) + assert np.allclose(unlisted, [1, 2, 3, 4, 5, 6, 7, 8, 9]) diff --git a/tests/test_comp_str.py b/tests/test_comp_str.py index ef54f8c..d65adbd 100644 --- a/tests/test_comp_str.py +++ b/tests/test_comp_str.py @@ -1,4 +1,5 @@ import pytest +from biocutils.StringList import StringList from compressed_lists import CompressedStringList @@ -22,20 +23,20 @@ def test_creation(char_data): char_list = CompressedStringList.from_list(char_data) assert len(char_list) == 3 - 
assert isinstance(char_list.unlist_data, list) - assert char_list.get_unlist_data() == ["apple", "banana", "cherry", "date", "elderberry", "fig"] + assert isinstance(char_list.unlist_data, StringList) + assert list(char_list.get_unlist_data()) == ["apple", "banana", "cherry", "date", "elderberry", "fig"] assert list(char_list.get_element_lengths()) == [2, 3, 1] def test_getitem(char_list): - assert char_list[0] == ["apple", "banana"] - assert char_list["fruits2"] == ["cherry", "date", "elderberry"] + assert list(char_list[0]) == ["apple", "banana"] + assert list(char_list["fruits2"]) == ["cherry", "date", "elderberry"] def test_lapply(char_list): uppercased = char_list.lapply(lambda x: [s.upper() for s in x]) assert len(uppercased) == len(char_list) - assert uppercased[0] == ["APPLE", "BANANA"] - assert uppercased[1] == ["CHERRY", "DATE", "ELDERBERRY"] - assert uppercased[2] == ["FIG"] + assert list(uppercased[0]) == ["APPLE", "BANANA"] + assert list(uppercased[1]) == ["CHERRY", "DATE", "ELDERBERRY"] + assert list(uppercased[2]) == ["FIG"] diff --git a/tests/test_generics.py b/tests/test_generics.py new file mode 100644 index 0000000..ae45e43 --- /dev/null +++ b/tests/test_generics.py @@ -0,0 +1,24 @@ +import biocutils as ut + +from compressed_lists import CompressedFloatList, CompressedIntegerList, Partitioning, splitAsCompressedList + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +def test_groups(): + float_vec = ut.FloatList([1.1, 1.2, 2.1, 2.2, 2.3, 3.0]) + groups = [1, 2, 3, 1, 2, 3] + + clist = splitAsCompressedList(float_vec, groups_or_partitions=groups) + + assert isinstance(clist, CompressedFloatList) + + +def test_partitions(): + int_list = splitAsCompressedList( + ut.IntegerList([1, 2, 3, 4, 5, 6, 7, 8, 9]), groups_or_partitions=Partitioning(ends=[3, 5, 9]) + ) + + assert isinstance(int_list, CompressedIntegerList)