diff --git a/examples/cpp/shared/CMakeLists.txt b/examples/cpp/shared/CMakeLists.txt
index 6f70c68b..6e7cd123 100644
--- a/examples/cpp/shared/CMakeLists.txt
+++ b/examples/cpp/shared/CMakeLists.txt
@@ -47,3 +47,4 @@ endfunction()
 create_example_executable(shared shared.cpp)
 create_example_executable(example_vamana_with_compression_lvq example_vamana_with_compression_lvq.cpp)
 create_example_executable(example_vamana_with_compression example_vamana_with_compression.cpp)
+create_example_executable(example_vamana_with_compression_dynamic example_vamana_with_compression_dynamic.cpp)
diff --git a/examples/cpp/shared/README.md b/examples/cpp/shared/README.md
index 43f4378b..4dc96605 100644
--- a/examples/cpp/shared/README.md
+++ b/examples/cpp/shared/README.md
@@ -17,6 +17,7 @@
 These examples utilize LVQ and LeanVec interfaces which are available when linking to a SVS shared/static library, which are published with [releases](https://github.com/intel/ScalableVectorSearch/releases). Note that these examples will _not_ run after building the open source codebase without the shared/static library. These examples include:
 - [example_vamana_with_compression.cpp](./example_vamana_with_compression.cpp): Demonstrates building, searching, saving, and reloading an index with a LeanVec-compressed dataset.
 - [example_vamana_with_compression_lvq.cpp](./example_vamana_with_compression_lvq.cpp): Demonstrates building, searching, saving, and reloading an index with a LVQ-compressed dataset.
+- [example_vamana_with_compression_dynamic.cpp](./example_vamana_with_compression_dynamic.cpp): Demonstrates building, searching, saving, and reloading a dynamic index (which allows vector insertions and deletions over time) with a LeanVec-compressed dataset.
 
 See [CMakeLists.txt](./CMakeLists.txt) for details on linking to the SVS shared library and follow the commands below to compile and use the SVS shared library to run shared.cpp example:
diff --git a/examples/cpp/shared/example_vamana_with_compression_dynamic.cpp b/examples/cpp/shared/example_vamana_with_compression_dynamic.cpp
new file mode 100644
index 00000000..ce71ccd4
--- /dev/null
+++ b/examples/cpp/shared/example_vamana_with_compression_dynamic.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2025 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// SVS
+#include "svs/core/recall.h"
+#include "svs/extensions/flat/leanvec.h"
+#include "svs/extensions/flat/lvq.h"
+#include "svs/extensions/vamana/leanvec.h"
+#include "svs/extensions/vamana/lvq.h"
+#include "svs/orchestrators/dynamic_vamana.h"
+#include "svs/orchestrators/exhaustive.h"
+#include "svs/orchestrators/vamana.h"
+
+// stl
+#include <cstddef>
+#include <cstdint>
+#include <filesystem>
+#include <vector>
+
+// Alias for a blocked LeanVec dataset that supports resize/compact operations.
+// The primary level uses 4-bit LVQ, the secondary level 8-bit LVQ, both the reduced and
+// full dimensionalities are determined at runtime (svs::Dynamic), and blocked storage
+// lets the dataset grow and shrink as vectors are added and deleted.
+using BlockedLean = svs::leanvec::LeanDataset<
+    svs::leanvec::UsingLVQ<4>,
+    svs::leanvec::UsingLVQ<8>,
+    svs::Dynamic,
+    svs::Dynamic,
+    svs::data::Blocked<svs::lib::Allocator<std::byte>>>;
+
+int main() {
+    // STEP 1: Compress the data with LeanVec, reducing the dimensionality to leanvec_dim
+    // and using 4 and 8 bits for the primary and secondary levels respectively.
+    //! [Compress data]
+    const size_t num_threads = 4;
+    size_t padding = 32;
+    size_t leanvec_dim = 64;
+    auto threadpool = svs::threads::as_threadpool(num_threads);
+    auto loaded =
+        svs::VectorDataLoader<float>(std::filesystem::path(SVS_DATA_DIR) / "data_f32.svs")
+            .load();
+    auto data = BlockedLean::reduce(
+        loaded,
+        std::nullopt,
+        threadpool,
+        padding,
+        svs::lib::MaybeStatic<svs::Dynamic>(leanvec_dim)
+    );
+    //! [Compress data]
+
+    // STEP 2: Build a dynamic Vamana index with the initial set of vectors.
+    //! [Index Build]
+    auto parameters = svs::index::vamana::VamanaBuildParameters{};
+
+    // Create external ID labels for the build set.
+    std::vector<size_t> ids_build(loaded.size());
+    for (size_t i = 0; i < loaded.size(); ++i) {
+        ids_build[i] = i;
+    }
+
+    svs::DynamicVamana index = svs::DynamicVamana::build<float>(
+        parameters,
+        data,
+        svs::lib::as_span(ids_build),
+        svs::distance::DistanceL2(),
+        num_threads
+    );
+    //! [Index Build]
+
+    // STEP 3: Add and delete vectors as needed.
+    //! [Delete vectors]
+    size_t num_to_delete = 100;
+    std::vector<size_t> ids_delete(num_to_delete);
+    for (size_t i = 0; i < ids_delete.size(); ++i) {
+        ids_delete[i] = i;
+    }
+
+    fmt::print("Deleting {} vectors.\n", ids_delete.size());
+
+    index.delete_points(ids_delete);
+    //! [Delete vectors]
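+
+    // Deleted entries are removed lazily; a sketch of the optional cleanup step is shown
+    // below (commented out, assuming the consolidate/compact methods of
+    // svs::DynamicVamana are available in the linked SVS release):
+    // index.consolidate(); // Remove soft-deleted entries from the graph.
+    // index.compact();     // Shrink the blocked dataset to reclaim memory.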
+
+    //! [Add vectors]
+    // Add the deleted vectors back in.
+    auto points =
+        svs::data::SimpleData<float>(ids_delete.size(), loaded.dimensions());
+
+    size_t i = 0;
+    for (const auto& j : ids_delete) {
+        points.set_datum(i, loaded.get_datum(j));
+        ++i;
+    }
+    auto points_const_view = points.cview();
+
+    fmt::print("Adding {} vectors.\n", ids_delete.size());
+
+    index.add_points(points_const_view, svs::lib::as_span(ids_delete), num_threads);
+    //! [Add vectors]
+
+    // STEP 4: Search the index.
+    //! [Perform Queries]
+    const size_t search_window_size = 50;
+    const size_t n_neighbors = 10;
+    index.set_search_window_size(search_window_size);
+
+    auto queries =
+        svs::load_data<float>(std::filesystem::path(SVS_DATA_DIR) / "queries_f32.fvecs");
+    auto results = index.search(queries, n_neighbors);
+    //! [Perform Queries]
+
+    //! [Recall]
+    auto groundtruth = svs::load_data<uint32_t>(
+        std::filesystem::path(SVS_DATA_DIR) / "groundtruth_euclidean.ivecs"
+    );
+    double recall = svs::k_recall_at_n(groundtruth, results, n_neighbors, n_neighbors);
+
+    fmt::print("Recall@{} = {:.4f}\n", n_neighbors, recall);
+    fmt::print(
+        "Note that recall is low because this example is using a dummy random dataset.\n"
+    );
+    //! [Recall]
+
+    // STEP 5: Save and reload the index.
+    //! [Saving Loading]
+    index.save("config", "graph", "data");
+    index = svs::DynamicVamana::assemble<float>(
+        "config",
+        svs::GraphLoader("graph"),
+        svs::lib::load_from_disk<BlockedLean>("data", padding),
+        svs::distance::DistanceL2(),
+        num_threads
+    );
+    //! [Saving Loading]
+    index.set_search_window_size(search_window_size);
+
+    // Re-run the queries on the reloaded index before recomputing recall.
+    results = index.search(queries, n_neighbors);
+    recall = svs::k_recall_at_n(groundtruth, results, n_neighbors, n_neighbors);
+
+    fmt::print("Recall@{} after saving and reloading = {:.4f}\n", n_neighbors, recall);
+
+    return 0;
+}
diff --git a/examples/python/vamana_with_compression_lvq.py b/examples/python/vamana_with_compression_lvq.py
new file mode 100644
index 00000000..47fc47b4
--- /dev/null
+++ b/examples/python/vamana_with_compression_lvq.py
@@ -0,0 +1,144 @@
+# Copyright 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Import `unittest` to allow for automated testing.
+import unittest
+
+# [imports]
+import os
+import shutil
+import svs
+# [imports]
+
+DEBUG_MODE = False
+def assert_equal(lhs, rhs, message: str = "", epsilon = 0.05):
+    if DEBUG_MODE:
+        print(f"{message}: {lhs} == {rhs}")
+    else:
+        assert lhs < rhs + epsilon, message
+        assert lhs > rhs - epsilon, message
+
+test_data_dir = None
+
+def run():
+    global test_data_dir
+
+    # [generate-dataset]
+    # Create a test dataset.
+    # This will create a directory "example_data_vamana" and populate it with three
+    # entries:
+    #   - data.fvecs: The test dataset.
+    #   - queries.fvecs: The test queries.
+    #   - groundtruth.ivecs: The groundtruth.
+    test_data_dir = "./example_data_vamana"
+    svs.generate_test_dataset(
+        1000,                           # Create 1000 vectors in the dataset.
+        100,                            # Generate 100 query vectors.
+        256,                            # Set the vector dimensionality to 256.
+        test_data_dir,                  # The directory where results will be generated.
+        data_seed = 1234,               # Random number seed for reproducibility.
+        query_seed = 5678,              # Random number seed for reproducibility.
+        num_threads = 4,                # Number of threads to use.
+        distance = svs.DistanceType.L2, # The distance type to use.
+    )
+    # [generate-dataset]
+
+    # [create-loader]
+    # We are going to construct an LVQ-compressed dataset on-the-fly from uncompressed
+    # data. First, we construct a loader for the uncompressed data.
+    uncompressed_loader = svs.VectorDataLoader(
+        os.path.join(test_data_dir, "data.fvecs"),
+        svs.DataType.float32
+    )
+
+    # Next, we construct an LVQLoader configured to use LVQ compression with 4 bits for
+    # the primary quantization and 8 bits for the residual quantization.
+    B1 = 4  # Number of bits for the first-level LVQ quantization.
+    B2 = 8  # Number of bits for the residual quantization.
+    compressed_loader = svs.LVQLoader(
+        uncompressed_loader,
+        primary = B1,
+        residual = B2,
+    )
+    # [create-loader]
+
+    # An index can be constructed directly from the LVQ-compressed dataset.
+    # [build-parameters]
+    parameters = svs.VamanaBuildParameters(
+        graph_max_degree = 64,
+        window_size = 128,
+    )
+    # [build-parameters]
+
+    # [build-index]
+    index = svs.Vamana.build(
+        parameters,
+        compressed_loader,
+        svs.DistanceType.L2,
+        num_threads = 4,
+    )
+    # [build-index]
+
+    # Load the queries, then set the search window size of the index and perform the
+    # queries.
+    # [perform-queries]
+    n_neighbors = 10
+    index.search_window_size = 20
+    index.num_threads = 4
+
+    queries = svs.read_vecs(os.path.join(test_data_dir, "queries.fvecs"))
+    I, D = index.search(queries, n_neighbors)
+    # [perform-queries]
+
+    # Compare with the groundtruth.
+    # [recall]
+    groundtruth = svs.read_vecs(os.path.join(test_data_dir, "groundtruth.ivecs"))
+    recall = svs.k_recall_at(groundtruth, I, n_neighbors, n_neighbors)
+    print(f"Recall = {recall}")
+    # [recall]
+    assert_equal(recall, 0.953)
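+
+    # Optional: the search window size trades accuracy for speed. A short sweep
+    # (illustrative values, not tuned) shows how recall responds.
+    for sws in (10, 20, 40):
+        index.search_window_size = sws
+        I_sweep, D_sweep = index.search(queries, n_neighbors)
+        sweep_recall = svs.k_recall_at(groundtruth, I_sweep, n_neighbors, n_neighbors)
+        print(f"search_window_size = {sws}, recall = {sweep_recall}")
+
+    # Restore the search window size used above.
+    index.search_window_size = 20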
+
+    # Finally, we can save the index and reload it from a previously saved set of files.
+    # [saving-loading]
+    index.save(
+        os.path.join(test_data_dir, "example_config"),
+        os.path.join(test_data_dir, "example_graph"),
+        os.path.join(test_data_dir, "example_data"),
+    )
+
+    index = svs.Vamana(
+        os.path.join(test_data_dir, "example_config"),
+        os.path.join(test_data_dir, "example_graph"),
+        os.path.join(test_data_dir, "example_data"),
+        svs.DistanceType.L2,
+        num_threads = 4,
+    )
+    # [saving-loading]
+
+
+#####
+##### Main Executable
+#####
+
+if __name__ == "__main__":
+    run()
+
+#####
+##### As a unit test.
+#####
+
+class VamanaExampleTestCase(unittest.TestCase):
+    def tearDown(self):
+        if test_data_dir is not None:
+            print(f"Removing temporary directory {test_data_dir}")
+            shutil.rmtree(test_data_dir)
+
+    def test_all(self):
+        run()