From 0dcf01eab606bd8325e84a81147637f74d11725c Mon Sep 17 00:00:00 2001 From: George Panchuk Date: Tue, 9 Apr 2024 14:51:48 +0200 Subject: [PATCH 1/2] fix: remove scipy, read csr matrix manually --- dataset_reader/base_reader.py | 6 +- dataset_reader/sparse_reader.py | 99 ++++++++++++++++++++------------- pyproject.toml | 4 +- 3 files changed, 63 insertions(+), 46 deletions(-) diff --git a/dataset_reader/base_reader.py b/dataset_reader/base_reader.py index 3329ecbb..f8410476 100644 --- a/dataset_reader/base_reader.py +++ b/dataset_reader/base_reader.py @@ -1,13 +1,11 @@ from dataclasses import dataclass from typing import Iterator, List, Optional -import numpy as np - @dataclass class SparseVector: - indices: np.array - values: np.array + indices: List[int] + values: List[float] @dataclass diff --git a/dataset_reader/sparse_reader.py b/dataset_reader/sparse_reader.py index e7b15708..3fae63ac 100644 --- a/dataset_reader/sparse_reader.py +++ b/dataset_reader/sparse_reader.py @@ -1,48 +1,56 @@ import os -from typing import Iterator +from pathlib import Path +from typing import Iterator, Tuple, Union, List import numpy as np -from scipy.sparse import csr_matrix from dataset_reader.base_reader import BaseReader, Query, Record, SparseVector -# credit: code extracted from neuIPS 2023 benchmarks +def read_sparse_matrix_fields(filename: Union[Path, str]) -> Tuple[np.array, np.array, np.array]: + """Read the fields of a CSR matrix without instantiating it""" -def read_sparse_matrix_fields(fname): - """read the fields of a CSR matrix without instantiating it""" - with open(fname, "rb") as f: + with open(filename, "rb") as f: sizes = np.fromfile(f, dtype="int64", count=3) - nrow, ncol, nnz = sizes - indptr = np.fromfile(f, dtype="int64", count=nrow + 1) - assert nnz == indptr[-1] - indices = np.fromfile(f, dtype="int32", count=nnz) - assert np.all(indices >= 0) and np.all(indices < ncol) - data = np.fromfile(f, dtype="float32", count=nnz) - return data, indices, indptr, ncol - - -def read_sparse_matrix(fname) -> Iterator[SparseVector]: - """read a CSR matrix in spmat format""" - data, indices, indptr, ncol = read_sparse_matrix_fields(fname) - # Need scipy csr_matrix to parse spmat format and easily take out rows - csr_mat = csr_matrix((data, indices, indptr), shape=(len(indptr) - 1, ncol)) - num_vectors = csr_mat.shape[0] - - for i in range(num_vectors): - indices = csr_mat[i].indices.tolist() - values = csr_mat[i].data.tolist() - yield SparseVector(indices=indices, values=values) - - -def knn_result_read(fname): - n, d = map(int, np.fromfile(fname, dtype="uint32", count=2)) - assert os.stat(fname).st_size == 8 + n * d * (4 + 4) - f = open(fname, "rb") - f.seek(4 + 4) - ids = np.fromfile(f, dtype="int32", count=n * d).reshape(n, d) - scores = np.fromfile(f, dtype="float32", count=n * d).reshape(n, d) - f.close() + n_row, n_col, n_non_zero = sizes + index_pointer = np.fromfile(f, dtype="int64", count=n_row + 1) + assert n_non_zero == index_pointer[-1] + columns = np.fromfile(f, dtype="int32", count=n_non_zero) + assert np.all(columns >= 0) and np.all(columns < n_col) + values = np.fromfile(f, dtype="float32", count=n_non_zero) + return values, columns, index_pointer + + +def csr_to_sparse_vectors(values: List[float], columns: List[int], index_pointer: List[int]) -> Iterator[SparseVector]: + num_rows = len(index_pointer) - 1 + + for i in range(num_rows): + start = index_pointer[i] + end = index_pointer[i + 1] + row_values, row_indices = [], [] + for j in range(start, end): + row_values.append(values[j]) + row_indices.append(columns[j]) + yield SparseVector(indices=row_indices, values=row_values) + + +def read_csr_matrix(filename: Union[Path, str]) -> Iterator[SparseVector]: + """Read a CSR matrix in spmat format""" + values, columns, index_pointer = read_sparse_matrix_fields(filename) + values = values.tolist() + columns = columns.tolist() + index_pointer = index_pointer.tolist() + + yield from csr_to_sparse_vectors(values, columns, index_pointer) + + +def knn_result_read(filename: Union[Path, str]) -> Tuple[List[List[int]], List[List[float]]]: + n, d = map(int, np.fromfile(filename, dtype="uint32", count=2)) + assert os.stat(filename).st_size == 8 + n * d * (4 + 4) + with open(filename, "rb") as f: + f.seek(4 + 4) + ids = np.fromfile(f, dtype="int32", count=n * d).reshape(n, d).tolist() + scores = np.fromfile(f, dtype="float32", count=n * d).reshape(n, d).tolist() return ids, scores @@ -53,7 +61,7 @@ def __init__(self, path, normalize=False): def read_queries(self) -> Iterator[Query]: queries_path = self.path / "queries.csr" - X = read_sparse_matrix(queries_path) + X = read_csr_matrix(queries_path) gt_path = self.path / "results.gt" gt_indices, _ = knn_result_read(gt_path) @@ -63,12 +71,25 @@ def read_queries(self) -> Iterator[Query]: vector=None, sparse_vector=sparse_vector, meta_conditions=None, - expected_result=gt_indices[i].tolist(), + expected_result=gt_indices[i], ) def read_data(self) -> Iterator[Record]: data_path = self.path / "data.csr" - X = read_sparse_matrix(data_path) + X = read_csr_matrix(data_path) for i, sparse_vector in enumerate(X): yield Record(id=i, vector=None, sparse_vector=sparse_vector, metadata=None) + + +if __name__ == '__main__': + vals = [1, 3, 2, 3, 6, 4, 5] + cols = [0, 2, 2, 1, 3, 0, 2] + pointers = [0, 2, 3, 5, 7] + vecs = [vec for vec in csr_to_sparse_vectors(vals, cols, pointers)] + + assert vecs[0] == SparseVector(indices=[0, 2], values=[1, 3]) + assert vecs[1] == SparseVector(indices=[2], values=[2]) + assert vecs[2] == SparseVector(indices=[1, 3], values=[3, 6]) + assert vecs[3] == SparseVector(indices=[0, 2], values=[4, 5]) + diff --git a/pyproject.toml b/pyproject.toml index 13abee77..cc6ac42d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "vector-db-benchmark" version = "0.1.0" description = "" -authors = ["Kacper Ɓukawski "] +authors = ["Qdrant Team "] [tool.poetry.dependencies] python = ">=3.9,<3.12" @@ -20,8 +20,6 @@ opensearch-py = "^2.3.2" tqdm = "^4.66.1" psycopg = {extras = ["binary"], version = "^3.1.17"} pgvector = "^0.2.4" -scipy = "^1.12.0" - [tool.poetry.dev-dependencies] pre-commit = "^2.20.0" From 33492b1cb974ba31f9968a4b4b4fcc09206ec368 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 9 Apr 2024 12:53:12 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dataset_reader/sparse_reader.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/dataset_reader/sparse_reader.py b/dataset_reader/sparse_reader.py index 3fae63ac..fb2af5d9 100644 --- a/dataset_reader/sparse_reader.py +++ b/dataset_reader/sparse_reader.py @@ -1,13 +1,15 @@ import os from pathlib import Path -from typing import Iterator, Tuple, Union, List +from typing import Iterator, List, Tuple, Union import numpy as np from dataset_reader.base_reader import BaseReader, Query, Record, SparseVector -def read_sparse_matrix_fields(filename: Union[Path, str]) -> Tuple[np.array, np.array, np.array]: +def read_sparse_matrix_fields( + filename: Union[Path, str] +) -> Tuple[np.array, np.array, np.array]: """Read the fields of a CSR matrix without instantiating it""" with open(filename, "rb") as f: @@ -21,7 +23,9 @@ def read_sparse_matrix_fields(filename: Union[Path, str]) -> Tuple[np.array, np. return values, columns, index_pointer -def csr_to_sparse_vectors(values: List[float], columns: List[int], index_pointer: List[int]) -> Iterator[SparseVector]: +def csr_to_sparse_vectors( + values: List[float], columns: List[int], index_pointer: List[int] +) -> Iterator[SparseVector]: num_rows = len(index_pointer) - 1 for i in range(num_rows): @@ -44,7 +48,9 @@ def read_csr_matrix(filename: Union[Path, str]) -> Iterator[SparseVector]: yield from csr_to_sparse_vectors(values, columns, index_pointer) -def knn_result_read(filename: Union[Path, str]) -> Tuple[List[List[int]], List[List[float]]]: +def knn_result_read( + filename: Union[Path, str] +) -> Tuple[List[List[int]], List[List[float]]]: n, d = map(int, np.fromfile(filename, dtype="uint32", count=2)) assert os.stat(filename).st_size == 8 + n * d * (4 + 4) with open(filename, "rb") as f: @@ -82,7 +88,7 @@ def read_data(self) -> Iterator[Record]: yield Record(id=i, vector=None, sparse_vector=sparse_vector, metadata=None) -if __name__ == '__main__': +if __name__ == "__main__": vals = [1, 3, 2, 3, 6, 4, 5] cols = [0, 2, 2, 1, 3, 0, 2] pointers = [0, 2, 3, 5, 7] @@ -92,4 +98,3 @@ def read_data(self) -> Iterator[Record]: assert vecs[1] == SparseVector(indices=[2], values=[2]) assert vecs[2] == SparseVector(indices=[1, 3], values=[3, 6]) assert vecs[3] == SparseVector(indices=[0, 2], values=[4, 5]) -