diff --git a/src/svsbench/generate_ground_truth.py b/src/svsbench/generate_ground_truth.py
index f0682ee..0571360 100644
--- a/src/svsbench/generate_ground_truth.py
+++ b/src/svsbench/generate_ground_truth.py
@@ -23,6 +23,11 @@ def _read_args(argv: list[str] | None = None) -> argparse.Namespace:
     parser.add_argument("--vecs_file", help="Vectors *vecs file", type=Path)
     parser.add_argument("--query_file", help="Query vectors file", type=Path)
     parser.add_argument("--out_file", help="Output file", type=Path)
+    parser.add_argument(
+        "--query_out_file",
+        help="Output file for the query vectors generated when"
+        " --num_query_vectors is given",
+        type=Path,
+    )
     parser.add_argument(
         "--distance",
         help="Distance",
@@ -33,6 +38,14 @@
         "-k", help="Number of neighbors", type=int, default=100
     )
     parser.add_argument("--num_vectors", help="Number of vectors", type=int)
+    parser.add_argument(
+        "--num_query_vectors",
+        help="Number of query vectors to generate."
+        " If given, the query vectors are shuffled."
+        " If greater than the number of vectors in the query file,"
+        " the query vectors are shuffled and repeated as needed.",
+        type=int,
+    )
     parser.add_argument(
         "--shuffle", help="Shuffle order of vectors", action="store_true"
     )
@@ -54,8 +67,10 @@ def main(argv: str | None = None) -> None:
         k=args.k,
         num_threads=args.max_threads,
         out_file=args.out_file,
+        query_out_path=args.query_out_file,
         shuffle=args.shuffle,
         seed=args.seed,
+        num_query_vectors=args.num_query_vectors,
     )
 
 
@@ -64,31 +79,72 @@ def generate_ground_truth(
     vecs_path: Path,
     query_file: Path,
     distance: svs.DistanceType,
-    num_vectors: int | None,
+    num_vectors: int | None = None,
     k: int = 100,
     num_threads: int = 1,
     out_file: Path | None = None,
+    query_out_path: Path | None = None,
     shuffle: bool = False,
     seed: int = 42,
+    num_query_vectors: int | None = None,
 ) -> None:
-    if out_file is None:
-        out_file = utils.ground_truth_path(
-            vecs_path, query_file, distance, num_vectors, seed if shuffle else None,
+    if out_file is not None and out_file.suffix != ".ivecs":
+        raise SystemExit("Error: --out_file must end in .ivecs")
+    if (
+        query_out_path is not None
+        and query_out_path.suffix != query_file.suffix
+    ):
+        raise SystemExit(
+            "Error: --query_out_file must have the same suffix as --query_file"
         )
-    else:
-        if out_file.suffix != ".ivecs":
-            raise SystemExit("Error: --out_file must end in .ivecs")
-    out_file = str(out_file)
     queries = svs.read_vecs(str(query_file))
     vectors = svs.read_vecs(str(vecs_path))
-    if num_vectors is None:
-        num_vectors = vectors.shape[0]
+    # If num_vectors is None or larger than the number of vectors,
+    # slicing will return the whole array.
     vectors = vectors[:num_vectors]
     if shuffle:
-        vectors = vectors[np.random.default_rng(seed).permutation(num_vectors)]
+        np.random.default_rng(seed).shuffle(vectors)
     index = svs.Flat(vectors, distance=distance, num_threads=num_threads)
     idxs, _ = index.search(queries, k)
-    svs.write_vecs(idxs.astype(np.uint32), out_file)
+    if num_query_vectors is not None:
+        queries_all = np.empty_like(
+            queries, shape=(num_query_vectors, queries.shape[1])
+        )
+        ground_truth_all = np.empty_like(
+            idxs, shape=(num_query_vectors, idxs.shape[1])
+        )
+        rng = np.random.default_rng(seed)
+        cursor = 0
+        while cursor < num_query_vectors:
+            permutation = rng.permutation(len(queries))
+            batch_size = min(num_query_vectors - cursor, len(queries))
+            queries_all[cursor : cursor + batch_size] = queries[
+                permutation[:batch_size]
+            ]
+            ground_truth_all[cursor : cursor + batch_size] = idxs[
+                permutation[:batch_size]
+            ]
+            cursor += batch_size
+        if query_out_path is None:
+            query_out_path = (
+                query_file.parent
+                / f"{query_file.stem}-{num_query_vectors}_{seed}"
+                f"{query_file.suffix}"
+            )
+        svs.write_vecs(queries_all, str(query_out_path))
+        queries_path = query_out_path
+    else:
+        queries_path = query_file
+        ground_truth_all = idxs
+    if out_file is None:
+        out_file = utils.ground_truth_path(
+            vecs_path,
+            queries_path,
+            distance,
+            num_vectors,
+            seed if shuffle else None,
+        )
+    svs.write_vecs(ground_truth_all.astype(np.uint32), str(out_file))
     logger.info({"ground_truth_saved": out_file})
diff --git a/tests/conftest.py b/tests/conftest.py
index 0bab4ee..ce230eb 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -15,19 +15,23 @@
     8: svs.LeanVecKind.lvq8,
 }
 
+RANDOM_VECTORS_SHAPE: Final = (1000, 100)
+NUM_RANDOM_QUERY_VECTORS: Final = 100
+GROUND_TRUTH_K: Final = 100
+
 
 def random_array(dtype: np.dtype) -> np.ndarray:
     rng = np.random.default_rng(42)
     if np.dtype(dtype).kind == "i":
         iinfo = np.iinfo(dtype)
-        return rng.integers(iinfo.min, iinfo.max, (1000, 100), dtype=dtype)
+        return rng.integers(
+            iinfo.min, iinfo.max, RANDOM_VECTORS_SHAPE, dtype=dtype
+        )
     else:
-        return rng.random((1000, 100)).astype(dtype)
+        return rng.random(RANDOM_VECTORS_SHAPE).astype(dtype)
 
 
-@pytest.fixture(
-    scope="session", params=consts.SUFFIX_TO_SVS_TYPE.keys()
-)
+@pytest.fixture(scope="session", params=consts.SUFFIX_TO_SVS_TYPE.keys())
 def tmp_vecs(request, tmp_path_factory):
     suffix = request.param
     vecs_path = tmp_path_factory.mktemp("vecs") / ("random" + suffix)
@@ -134,7 +138,10 @@ def index_dir_with_svs_type_and_dynamic(request, tmp_path_factory):
 def query_path(tmp_path_factory) -> Path:
     path = tmp_path_factory.mktemp("query") / "query.fvecs"
     svs.write_vecs(
-        np.random.default_rng(42).random((100, 100)).astype(np.float32), path
+        np.random.default_rng(42)
+        .random((NUM_RANDOM_QUERY_VECTORS, RANDOM_VECTORS_SHAPE[1]))
+        .astype(np.float32),
+        path,
     )
     return path
 
@@ -162,7 +169,7 @@ def ground_truth_path(
     )
     vectors = np.load(index_dir / "data.npy")
     index = svs.Flat(vectors, distance=distance, num_threads=num_threads)
-    idxs, _ = index.search(svs.read_vecs(str(query_path)), 100)
+    idxs, _ = index.search(svs.read_vecs(str(query_path)), GROUND_TRUTH_K)
     ground_truth_path = (
         tmp_path_factory.mktemp("ground_truth")
         / f"ground_truth_{index_svs_type}.ivecs"
diff --git a/tests/test_generate_ground_truth.py b/tests/test_generate_ground_truth.py
new file mode 100644
index 0000000..74d929b
--- /dev/null
+++ b/tests/test_generate_ground_truth.py
@@ -0,0 +1,134 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+import functools
+
+import conftest
+import pytest
+import svs
+
+from svsbench.generate_ground_truth import generate_ground_truth, main
+
+
+def test_generate_ground_truth_no_shuffle(
+    tmp_vecs, query_path, distance, num_threads, tmp_path_factory
+):
+    if tmp_vecs.suffix == ".hvecs":
+        pytest.xfail("Not implemented")
+    out_file = tmp_path_factory.mktemp("output") / "ground_truth.ivecs"
+    k = 10
+    generate_ground_truth(
+        vecs_path=tmp_vecs,
+        query_file=query_path,
+        distance=distance,
+        num_vectors=None,
+        k=k,
+        num_threads=num_threads,
+        out_file=out_file,
+        query_out_path=None,
+        shuffle=False,
+        seed=42,
+    )
+    assert out_file.is_file()
+    gt = svs.read_vecs(str(out_file))
+    assert gt.shape == (conftest.NUM_RANDOM_QUERY_VECTORS, k), (
+        "Expected (num_queries, k) shape"
+    )
+
+
+def test_generate_ground_truth_shuffle(
+    tmp_vecs, query_path, distance, num_threads, tmp_path_factory
+):
+    if tmp_vecs.suffix == ".hvecs":
+        pytest.xfail("Not implemented")
+    out_file = (
+        tmp_path_factory.mktemp("output") / "ground_truth_shuffled.ivecs"
+    )
+    k = 5
+    generate_ground_truth(
+        vecs_path=tmp_vecs,
+        query_file=query_path,
+        distance=distance,
+        num_vectors=500,
+        k=k,
+        num_threads=num_threads,
+        out_file=out_file,
+        shuffle=True,
+        seed=2,
+    )
+    assert out_file.is_file()
+    gt = svs.read_vecs(str(out_file))
+    assert gt.shape == (conftest.NUM_RANDOM_QUERY_VECTORS, k)
+
+
+def test_generate_ground_truth_num_query_vectors(
+    tmp_vecs, query_path, distance, num_threads, tmp_path_factory
+):
+    k = 7
+    if tmp_vecs.suffix == ".hvecs":
+        pytest.xfail("Not implemented")
+    out_dir = tmp_path_factory.mktemp("output")
+    out_file = out_dir / "ground_truth_subqueries.ivecs"
+    query_out_path = out_dir / "queries_out.fvecs"
+    generate_ground_truth_partial = functools.partial(
+        generate_ground_truth,
+        vecs_path=tmp_vecs,
+        query_file=query_path,
+        distance=distance,
+        k=k,
+        num_threads=num_threads,
+        out_file=out_file,
+        query_out_path=query_out_path,
+        shuffle=True,
+    )
+    for num_query_vectors in [20, 200]:
+        generate_ground_truth_partial(num_query_vectors=num_query_vectors)
+        gt = svs.read_vecs(str(out_file))
+        new_queries = svs.read_vecs(str(query_out_path))
+        assert gt.shape == (num_query_vectors, k)
+        assert new_queries.shape == (
+            num_query_vectors,
+            conftest.RANDOM_VECTORS_SHAPE[1],
+        )
+
+
+def test_generate_ground_truth_main(
+    tmp_vecs, query_path, num_threads, tmp_path_factory
+):
+    if tmp_vecs.suffix == ".hvecs":
+        pytest.xfail("Not implemented")
+    out_dir = tmp_path_factory.mktemp("cli")
+    out_file = out_dir / "gt.ivecs"
+    query_out_path = out_dir / "queries_out.fvecs"
+    k = 8
+    num_query_vectors = 150
+
+    argv = [
+        "--vecs_file",
+        str(tmp_vecs),
+        "--query_file",
+        str(query_path),
+        "--out_file",
+        str(out_file),
+        "--distance",
+        "mip",
+        "-k",
+        str(k),
+        "--max_threads",
+        str(num_threads),
+        "--seed",
+        "42",
+        "--num_query_vectors",
+        str(num_query_vectors),
+        "--query_out_file",
+        str(query_out_path),
+    ]
+
+    main(argv)
+
+    gt = svs.read_vecs(str(out_file))
+    new_queries = svs.read_vecs(str(query_out_path))
+    assert gt.shape == (num_query_vectors, k)
+    assert new_queries.shape == (
+        num_query_vectors,
+        conftest.RANDOM_VECTORS_SHAPE[1],
+    )
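Usage sketch: the snippet below mirrors test_generate_ground_truth_main and drives the CLI entry point directly with the new --num_query_vectors and --query_out_file options. The file paths and the thread count are hypothetical placeholders; the vector and query files are assumed to already exist in *vecs format, and only flags that appear in the patch or its tests are used.

# Minimal sketch (hypothetical paths): sample 10000 queries, with
# repetition, from queries.fvecs, then write the sampled queries and
# their exact nearest-neighbor ground truth.
from svsbench.generate_ground_truth import main

main(
    [
        "--vecs_file", "vectors.fvecs",
        "--query_file", "queries.fvecs",
        "--out_file", "ground_truth.ivecs",
        # Must share the suffix of --query_file (checked by the new code).
        "--query_out_file", "queries-10000_42.fvecs",
        "--distance", "mip",
        "-k", "100",
        "--max_threads", "8",
        "--seed", "42",
        "--num_query_vectors", "10000",
    ]
)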