6 changes: 2 additions & 4 deletions dataset_reader/base_reader.py
@@ -1,13 +1,11 @@
 from dataclasses import dataclass
 from typing import Iterator, List, Optional
 
-import numpy as np
-
 
 @dataclass
 class SparseVector:
-    indices: np.array
-    values: np.array
+    indices: List[int]
+    values: List[float]
 
 
 @dataclass
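
For illustration, a minimal sketch of how the updated dataclass is constructed after this change; the field names come from the diff above, while the concrete numbers are made up:

from dataset_reader.base_reader import SparseVector

# With this change, callers pass plain Python lists instead of numpy arrays;
# indices and values are parallel lists of equal length.
vec = SparseVector(indices=[0, 2, 5], values=[0.5, 1.25, 3.0])
assert len(vec.indices) == len(vec.values)
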
104 changes: 65 additions & 39 deletions dataset_reader/sparse_reader.py
@@ -1,48 +1,62 @@
 import os
-from typing import Iterator
+from pathlib import Path
+from typing import Iterator, List, Tuple, Union
 
 import numpy as np
-from scipy.sparse import csr_matrix
 
 from dataset_reader.base_reader import BaseReader, Query, Record, SparseVector
 
 # credit: code extracted from NeurIPS 2023 benchmarks
 
 
-def read_sparse_matrix_fields(fname):
-    """read the fields of a CSR matrix without instantiating it"""
-    with open(fname, "rb") as f:
+def read_sparse_matrix_fields(
+    filename: Union[Path, str]
+) -> Tuple[np.array, np.array, np.array]:
+    """Read the fields of a CSR matrix without instantiating it"""
+    with open(filename, "rb") as f:
         sizes = np.fromfile(f, dtype="int64", count=3)
-        nrow, ncol, nnz = sizes
-        indptr = np.fromfile(f, dtype="int64", count=nrow + 1)
-        assert nnz == indptr[-1]
-        indices = np.fromfile(f, dtype="int32", count=nnz)
-        assert np.all(indices >= 0) and np.all(indices < ncol)
-        data = np.fromfile(f, dtype="float32", count=nnz)
-        return data, indices, indptr, ncol
-
-
-def read_sparse_matrix(fname) -> Iterator[SparseVector]:
-    """read a CSR matrix in spmat format"""
-    data, indices, indptr, ncol = read_sparse_matrix_fields(fname)
-    # Need scipy csr_matrix to parse spmat format and easily take out rows
-    csr_mat = csr_matrix((data, indices, indptr), shape=(len(indptr) - 1, ncol))
-    num_vectors = csr_mat.shape[0]
-
-    for i in range(num_vectors):
-        indices = csr_mat[i].indices.tolist()
-        values = csr_mat[i].data.tolist()
-        yield SparseVector(indices=indices, values=values)
-
-
-def knn_result_read(fname):
-    n, d = map(int, np.fromfile(fname, dtype="uint32", count=2))
-    assert os.stat(fname).st_size == 8 + n * d * (4 + 4)
-    f = open(fname, "rb")
-    f.seek(4 + 4)
-    ids = np.fromfile(f, dtype="int32", count=n * d).reshape(n, d)
-    scores = np.fromfile(f, dtype="float32", count=n * d).reshape(n, d)
-    f.close()
+        n_row, n_col, n_non_zero = sizes
+        index_pointer = np.fromfile(f, dtype="int64", count=n_row + 1)
+        assert n_non_zero == index_pointer[-1]
+        columns = np.fromfile(f, dtype="int32", count=n_non_zero)
+        assert np.all(columns >= 0) and np.all(columns < n_col)
+        values = np.fromfile(f, dtype="float32", count=n_non_zero)
+        return values, columns, index_pointer
+
+
+def csr_to_sparse_vectors(
+    values: List[float], columns: List[int], index_pointer: List[int]
+) -> Iterator[SparseVector]:
+    num_rows = len(index_pointer) - 1
+
+    for i in range(num_rows):
+        start = index_pointer[i]
+        end = index_pointer[i + 1]
+        row_values, row_indices = [], []
+        for j in range(start, end):
+            row_values.append(values[j])
+            row_indices.append(columns[j])
+        yield SparseVector(indices=row_indices, values=row_values)
+
+
+def read_csr_matrix(filename: Union[Path, str]) -> Iterator[SparseVector]:
+    """Read a CSR matrix in spmat format"""
+    values, columns, index_pointer = read_sparse_matrix_fields(filename)
+    values = values.tolist()
+    columns = columns.tolist()
+    index_pointer = index_pointer.tolist()
+
+    yield from csr_to_sparse_vectors(values, columns, index_pointer)
+
+
+def knn_result_read(
+    filename: Union[Path, str]
+) -> Tuple[List[List[int]], List[List[float]]]:
+    n, d = map(int, np.fromfile(filename, dtype="uint32", count=2))
+    assert os.stat(filename).st_size == 8 + n * d * (4 + 4)
+    with open(filename, "rb") as f:
+        f.seek(4 + 4)
+        ids = np.fromfile(f, dtype="int32", count=n * d).reshape(n, d).tolist()
+        scores = np.fromfile(f, dtype="float32", count=n * d).reshape(n, d).tolist()
     return ids, scores
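
As a side note, the spmat layout the reader above expects (an int64 header of rows, columns and non-zero count, then the int64 index pointer, int32 column indices and float32 values) can be exercised with a tiny hand-written file. This is only a sketch under that assumption; the file name and matrix are illustrative:

import numpy as np

from dataset_reader.base_reader import SparseVector
from dataset_reader.sparse_reader import read_csr_matrix

# Write a 2x4 CSR matrix with 3 non-zero entries in the field order read above.
index_pointer = np.array([0, 2, 3], dtype="int64")
columns = np.array([0, 3, 1], dtype="int32")
values = np.array([1.0, 2.5, 4.0], dtype="float32")

with open("tiny.csr", "wb") as f:  # illustrative file name
    np.array([2, 4, 3], dtype="int64").tofile(f)  # n_row, n_col, n_non_zero
    index_pointer.tofile(f)
    columns.tofile(f)
    values.tofile(f)

vectors = list(read_csr_matrix("tiny.csr"))
assert vectors[0] == SparseVector(indices=[0, 3], values=[1.0, 2.5])
assert vectors[1] == SparseVector(indices=[1], values=[4.0])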


@@ -53,7 +67,7 @@ def __init__(self, path, normalize=False):
 
     def read_queries(self) -> Iterator[Query]:
         queries_path = self.path / "queries.csr"
-        X = read_sparse_matrix(queries_path)
+        X = read_csr_matrix(queries_path)
 
         gt_path = self.path / "results.gt"
         gt_indices, _ = knn_result_read(gt_path)
@@ -63,12 +77,24 @@ def read_queries(self) -> Iterator[Query]:
                 vector=None,
                 sparse_vector=sparse_vector,
                 meta_conditions=None,
-                expected_result=gt_indices[i].tolist(),
+                expected_result=gt_indices[i],
             )
 
     def read_data(self) -> Iterator[Record]:
         data_path = self.path / "data.csr"
-        X = read_sparse_matrix(data_path)
+        X = read_csr_matrix(data_path)
 
         for i, sparse_vector in enumerate(X):
             yield Record(id=i, vector=None, sparse_vector=sparse_vector, metadata=None)
+
+
+if __name__ == "__main__":
+    vals = [1, 3, 2, 3, 6, 4, 5]
+    cols = [0, 2, 2, 1, 3, 0, 2]
+    pointers = [0, 2, 3, 5, 7]
+    vecs = [vec for vec in csr_to_sparse_vectors(vals, cols, pointers)]
+
+    assert vecs[0] == SparseVector(indices=[0, 2], values=[1, 3])
+    assert vecs[1] == SparseVector(indices=[2], values=[2])
+    assert vecs[2] == SparseVector(indices=[1, 3], values=[3, 6])
+    assert vecs[3] == SparseVector(indices=[0, 2], values=[4, 5])
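
The ground-truth file consumed by knn_result_read follows a similarly simple layout: a uint32 header with n and d, then n * d int32 ids, then n * d float32 scores, matching the 8 + n * d * (4 + 4) size assertion. A minimal sketch under that assumption, with made-up file name and numbers:

import numpy as np

from dataset_reader.sparse_reader import knn_result_read

n, d = 2, 3
ids = np.array([[7, 1, 4], [2, 9, 0]], dtype="int32")
scores = np.array([[0.9, 0.5, 0.1], [0.8, 0.4, 0.2]], dtype="float32")

with open("tiny.gt", "wb") as f:  # illustrative file name
    np.array([n, d], dtype="uint32").tofile(f)  # header: queries, neighbours per query
    ids.tofile(f)
    scores.tofile(f)

read_ids, read_scores = knn_result_read("tiny.gt")
assert read_ids == [[7, 1, 4], [2, 9, 0]]
assert len(read_scores) == n
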
4 changes: 1 addition & 3 deletions pyproject.toml
@@ -2,7 +2,7 @@
 name = "vector-db-benchmark"
 version = "0.1.0"
 description = ""
-authors = ["Kacper Łukawski <kacper.lukawski@qdrant.com>"]
+authors = ["Qdrant Team <info@qdrant.tech>"]
 
 [tool.poetry.dependencies]
 python = ">=3.9,<3.12"
@@ -20,8 +20,6 @@ opensearch-py = "^2.3.2"
 tqdm = "^4.66.1"
 psycopg = {extras = ["binary"], version = "^3.1.17"}
 pgvector = "^0.2.4"
-scipy = "^1.12.0"
-
 
 [tool.poetry.dev-dependencies]
 pre-commit = "^2.20.0"