diff --git a/dataset_reader/base_reader.py b/dataset_reader/base_reader.py
index 3329ecbb..f8410476 100644
--- a/dataset_reader/base_reader.py
+++ b/dataset_reader/base_reader.py
@@ -1,13 +1,11 @@
 from dataclasses import dataclass
 from typing import Iterator, List, Optional
 
-import numpy as np
-
 
 @dataclass
 class SparseVector:
-    indices: np.array
-    values: np.array
+    indices: List[int]
+    values: List[float]
 
 
 @dataclass
diff --git a/dataset_reader/sparse_reader.py b/dataset_reader/sparse_reader.py
index e7b15708..fb2af5d9 100644
--- a/dataset_reader/sparse_reader.py
+++ b/dataset_reader/sparse_reader.py
@@ -1,48 +1,62 @@
 import os
-from typing import Iterator
+from pathlib import Path
+from typing import Iterator, List, Tuple, Union
 
 import numpy as np
-from scipy.sparse import csr_matrix
 
 from dataset_reader.base_reader import BaseReader, Query, Record, SparseVector
 
-# credit: code extracted from neuIPS 2023 benchmarks
 
+def read_sparse_matrix_fields(
+    filename: Union[Path, str]
+) -> Tuple[np.array, np.array, np.array]:
+    """Read the fields of a CSR matrix without instantiating it"""
 
-def read_sparse_matrix_fields(fname):
-    """read the fields of a CSR matrix without instantiating it"""
-    with open(fname, "rb") as f:
+    with open(filename, "rb") as f:
         sizes = np.fromfile(f, dtype="int64", count=3)
-        nrow, ncol, nnz = sizes
-        indptr = np.fromfile(f, dtype="int64", count=nrow + 1)
-        assert nnz == indptr[-1]
-        indices = np.fromfile(f, dtype="int32", count=nnz)
-        assert np.all(indices >= 0) and np.all(indices < ncol)
-        data = np.fromfile(f, dtype="float32", count=nnz)
-        return data, indices, indptr, ncol
-
-
-def read_sparse_matrix(fname) -> Iterator[SparseVector]:
-    """read a CSR matrix in spmat format"""
-    data, indices, indptr, ncol = read_sparse_matrix_fields(fname)
-    # Need scipy csr_matrix to parse spmat format and easily take out rows
-    csr_mat = csr_matrix((data, indices, indptr), shape=(len(indptr) - 1, ncol))
-    num_vectors = csr_mat.shape[0]
-
-    for i in range(num_vectors):
-        indices = csr_mat[i].indices.tolist()
-        values = csr_mat[i].data.tolist()
-        yield SparseVector(indices=indices, values=values)
-
-
-def knn_result_read(fname):
-    n, d = map(int, np.fromfile(fname, dtype="uint32", count=2))
-    assert os.stat(fname).st_size == 8 + n * d * (4 + 4)
-    f = open(fname, "rb")
-    f.seek(4 + 4)
-    ids = np.fromfile(f, dtype="int32", count=n * d).reshape(n, d)
-    scores = np.fromfile(f, dtype="float32", count=n * d).reshape(n, d)
-    f.close()
+        n_row, n_col, n_non_zero = sizes
+        index_pointer = np.fromfile(f, dtype="int64", count=n_row + 1)
+        assert n_non_zero == index_pointer[-1]
+        columns = np.fromfile(f, dtype="int32", count=n_non_zero)
+        assert np.all(columns >= 0) and np.all(columns < n_col)
+        values = np.fromfile(f, dtype="float32", count=n_non_zero)
+    return values, columns, index_pointer
+
+
+def csr_to_sparse_vectors(
+    values: List[float], columns: List[int], index_pointer: List[int]
+) -> Iterator[SparseVector]:
+    num_rows = len(index_pointer) - 1
+
+    for i in range(num_rows):
+        start = index_pointer[i]
+        end = index_pointer[i + 1]
+        row_values, row_indices = [], []
+        for j in range(start, end):
+            row_values.append(values[j])
+            row_indices.append(columns[j])
+        yield SparseVector(indices=row_indices, values=row_values)
+
+
+def read_csr_matrix(filename: Union[Path, str]) -> Iterator[SparseVector]:
+    """Read a CSR matrix in spmat format"""
+    values, columns, index_pointer = read_sparse_matrix_fields(filename)
+    values = values.tolist()
+    columns = columns.tolist()
+    index_pointer = index_pointer.tolist()
+
+    yield from csr_to_sparse_vectors(values, columns, index_pointer)
+
+
+def knn_result_read(
+    filename: Union[Path, str]
+) -> Tuple[List[List[int]], List[List[float]]]:
+    n, d = map(int, np.fromfile(filename, dtype="uint32", count=2))
+    assert os.stat(filename).st_size == 8 + n * d * (4 + 4)
+    with open(filename, "rb") as f:
+        f.seek(4 + 4)
+        ids = np.fromfile(f, dtype="int32", count=n * d).reshape(n, d).tolist()
+        scores = np.fromfile(f, dtype="float32", count=n * d).reshape(n, d).tolist()
     return ids, scores
 
 
@@ -53,7 +67,7 @@ def __init__(self, path, normalize=False):
 
     def read_queries(self) -> Iterator[Query]:
         queries_path = self.path / "queries.csr"
-        X = read_sparse_matrix(queries_path)
+        X = read_csr_matrix(queries_path)
 
         gt_path = self.path / "results.gt"
         gt_indices, _ = knn_result_read(gt_path)
@@ -63,12 +77,24 @@ def read_queries(self) -> Iterator[Query]:
                 vector=None,
                 sparse_vector=sparse_vector,
                 meta_conditions=None,
-                expected_result=gt_indices[i].tolist(),
+                expected_result=gt_indices[i],
             )
 
     def read_data(self) -> Iterator[Record]:
         data_path = self.path / "data.csr"
-        X = read_sparse_matrix(data_path)
+        X = read_csr_matrix(data_path)
 
         for i, sparse_vector in enumerate(X):
             yield Record(id=i, vector=None, sparse_vector=sparse_vector, metadata=None)
+
+
+if __name__ == "__main__":
+    vals = [1, 3, 2, 3, 6, 4, 5]
+    cols = [0, 2, 2, 1, 3, 0, 2]
+    pointers = [0, 2, 3, 5, 7]
+    vecs = [vec for vec in csr_to_sparse_vectors(vals, cols, pointers)]
+
+    assert vecs[0] == SparseVector(indices=[0, 2], values=[1, 3])
+    assert vecs[1] == SparseVector(indices=[2], values=[2])
+    assert vecs[2] == SparseVector(indices=[1, 3], values=[3, 6])
+    assert vecs[3] == SparseVector(indices=[0, 2], values=[4, 5])
diff --git a/pyproject.toml b/pyproject.toml
index 13abee77..cc6ac42d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,7 +2,7 @@
 name = "vector-db-benchmark"
 version = "0.1.0"
 description = ""
-authors = ["Kacper Ɓukawski "]
+authors = ["Qdrant Team "]
 
 [tool.poetry.dependencies]
 python = ">=3.9,<3.12"
@@ -20,8 +20,6 @@
 opensearch-py = "^2.3.2"
 tqdm = "^4.66.1"
 psycopg = {extras = ["binary"], version = "^3.1.17"}
 pgvector = "^0.2.4"
-scipy = "^1.12.0"
-
 [tool.poetry.dev-dependencies]
 pre-commit = "^2.20.0"