datastax · jshook · Aug 20, 2025 · Aug 20, 2025 · Aug 20, 2025 · Aug 20, 2025
@@ -59,21 +59,21 @@ jobs:
       - name: Test Panama Support (JDK ${{ matrix.jdk }})
         if: matrix.jdk == '20'
         run: >-
-          mvn -B -Pjdk20 -pl jvector-tests test -am test
+          mvn -X -B -Pjdk20 -pl jvector-tests test -am test
           -DTest_RequireSpecificVectorizationProvider=PanamaVectorizationProvider
 
       - name: Verify native-access vector support (JDK ${{ matrix.jdk }})
         if: matrix.jdk == '24'
         run: >-
-          mvn -B -Punix-amd64-profile -pl jvector-tests -am test
+          mvn -X -B -Punix-amd64-profile -pl jvector-tests -am test
           -DTest_RequireSpecificVectorizationProvider=NativeVectorizationProvider
           -Dsurefire.failIfNoSpecifiedTests=false
           -Dtest=TestVectorizationProvider
 
       - name: Compile, run tests and package (JDK ${{ matrix.jdk }})
         if: matrix.jdk == '24'
         run: >-
-          mvn -B -Punix-amd64-profile -pl jvector-tests -am test
+          mvn -X -B -Punix-amd64-profile -pl jvector-tests -am test
           -DTest_RequireSpecificVectorizationProvider=NativeVectorizationProvider
 
       - name: Test Summary for (ISA:${{ matrix.isa}},JDK${{ matrix.jdk }})

@@ -34,6 +34,16 @@ hdf5/
 ### aider
 .aider*
 
+### claude
+.claude/**
+
+### junie
+.junie/**
+
 # JMH generated files
 dependency-reduced-pom.xml
 results.csv
+
+# Local testing files
+local/**
+
@@ -13,6 +13,7 @@
     <name>JVector Examples</name>
     <properties>
         <awssdk.version>2.21.10</awssdk.version>
+        <vectordata.version>0.1.8</vectordata.version>
     </properties>
     <build>
         <plugins>
@@ -24,6 +25,35 @@
                     <workingDirectory>${project.parent.basedir}</workingDirectory>
                 </configuration>
             </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-assembly-plugin</artifactId>
+                <version>3.7.1</version>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>single</goal>
+                        </goals>
+                    </execution>
+                </executions>
+                <configuration>
+                    <archive>
+                        <manifest>
+                            <addClasspath>true</addClasspath>
+                            <mainClass>io.github.jbellis.jvector.example.testrig.commands.TestRig_CMD</mainClass>
+                        </manifest>
+                    </archive>
+                    <descriptorRefs>
+                        <!--                  <ref>assembly.xml</ref>-->
+                        <descriptorRef>jar-with-dependencies</descriptorRef>
+                        <!--            <descriptorRef>assembly</descriptorRef>-->
+                    </descriptorRefs>
+                </configuration>
+            </plugin>
+
+
+
         </plugins>
     </build>
     <dependencies>
@@ -79,6 +109,22 @@
                 </exclusion>
             </exclusions>
         </dependency>
+        <dependency>
+            <groupId>io.nosqlbench</groupId>
+            <artifactId>vectordata</artifactId>
+            <version>${vectordata.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>io.nosqlbench</groupId>
+            <artifactId>nbvectors</artifactId>
+            <version>${vectordata.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>info.picocli</groupId>
+            <artifactId>picocli</artifactId>
+            <version>4.7.6</version>
+        </dependency>
+
         <dependency>
             <groupId>com.kohlschutter.junixsocket</groupId>
             <artifactId>junixsocket-core</artifactId>
@@ -89,6 +135,10 @@
     <profiles>
         <profile>
             <id>jdk11</id>
+            <properties>
+                <maven.compiler.source>11</maven.compiler.source>
+                <maven.compiler.target>11</maven.compiler.target>
+            </properties>
             <build>
                 <plugins>
                     <plugin>
@@ -215,6 +265,9 @@
             <activation>
                 <activeByDefault>true</activeByDefault>
             </activation>
+            <properties>
+                <maven.compiler.target>22</maven.compiler.target>
+            </properties>
             <dependencies>
                 <dependency>
                     <groupId>io.github.jbellis</groupId>

@@ -19,16 +19,21 @@
 import io.github.jbellis.jvector.example.util.CompressorParameters;
 import io.github.jbellis.jvector.example.util.CompressorParameters.PQParameters;
 import io.github.jbellis.jvector.example.util.DataSet;
-import io.github.jbellis.jvector.example.util.DataSetLoader;
+import io.github.jbellis.jvector.example.util.DataSetSource;
 import io.github.jbellis.jvector.example.yaml.DatasetCollection;
 import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
 import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
+import io.nosqlbench.nbdatatools.api.concurrent.ProgressIndicator;
+import io.nosqlbench.vectordata.discovery.TestDataSources;
+import io.nosqlbench.vectordata.discovery.TestDataView;
+import io.nosqlbench.vectordata.downloader.Catalog;
+import io.nosqlbench.vectordata.downloader.DatasetEntry;
+import io.nosqlbench.vectordata.spec.datasets.types.DatasetView;
+import org.jetbrains.annotations.NotNull;
 
 import java.io.IOException;
-import java.util.Arrays;
-import java.util.EnumSet;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
+import java.util.concurrent.CompletableFuture;
 import java.util.function.Function;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
@@ -57,7 +62,7 @@ public static void main(String[] args) throws IOException {
         List<Function<DataSet, CompressorParameters>> buildCompression = Arrays.asList(
                 ds -> new PQParameters(ds.getDimension() / 8,
                         256,
-                        ds.similarityFunction == VectorSimilarityFunction.EUCLIDEAN,
+                        ds.getSimilarityFunction() == VectorSimilarityFunction.EUCLIDEAN,
                         UNWEIGHTED),
                 __ -> CompressorParameters.NONE
         );
@@ -66,7 +71,7 @@ public static void main(String[] args) throws IOException {
                 // ds -> new CompressorParameters.BQParameters(),
                 ds -> new PQParameters(ds.getDimension() / 8,
                         256,
-                        ds.similarityFunction == VectorSimilarityFunction.EUCLIDEAN,
+                        ds.getSimilarityFunction() == VectorSimilarityFunction.EUCLIDEAN,
                         UNWEIGHTED)
         );
         List<EnumSet<FeatureId>> featureSets = Arrays.asList(
@@ -85,13 +90,62 @@ public static void main(String[] args) throws IOException {
     }
 
+    /**
+     * Runs {@code Grid.runAll} for every dataset in the collection whose name matches {@code pattern}.
+     * <p>
+     * Each name is resolved through the default dataset source combined with a catalog-backed
+     * streaming source; the grid arguments are handed to {@code Grid.runAll} unchanged.
+     *
+     * @param pattern regex applied with {@code find()} to each name in the dataset collection
+     * @throws IOException presumably from loading the collection or running the grid -- confirm
+     * @throws IllegalArgumentException if a selected dataset name is not resolved by any source
+     */
     private static void execute(Pattern pattern, List<Function<DataSet, CompressorParameters>> buildCompression, List<EnumSet<FeatureId>> featureSets, List<Function<DataSet, CompressorParameters>> compressionGrid, List<Integer> mGrid, List<Integer> efConstructionGrid, List<Float> neighborOverflowGrid, List<Boolean> addHierarchyGrid, List<Boolean> refineFinalGraphGrid, Map<Integer, List<Double>> topKGrid, List<Boolean> usePruningGrid) throws IOException {
+        // NOTE(review): Java does not expand "~" itself; presumably TestDataSources resolves it -- confirm.
+        TestDataSources testDataSources = new TestDataSources().configure().addOptionalCatalogs("~/.config/jvector/catalogs.yaml");
+        Catalog testDataCatalog = testDataSources.catalog();
+        DataSetSource dsSource = DataSetSource.DEFAULT.and(loadStreamingDataSource(testDataCatalog));
+
         var datasetCollection = DatasetCollection.load();
         var datasetNames = datasetCollection.getAll().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList());
+
         System.out.println("Executing the following datasets: " + datasetNames);
 
         for (var datasetName : datasetNames) {
-            DataSet ds = DataSetLoader.loadDataSet(datasetName);
+            DataSet ds = dsSource.apply(datasetName)
+                    .orElseThrow(() -> new IllegalArgumentException("Unknown dataset: " + datasetName));
             Grid.runAll(ds, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid);
         }
     }
+
+    /**
+     * Builds a {@link DataSetSource} that resolves dataset names through the given catalog.
+     * <p>
+     * The returned source yields an empty {@link Optional} when the catalog has no match. On a match
+     * it selects the profile named after the dataset, requests prebuffering of the base vectors, and
+     * returns the view wrapped in a {@code TestDataViewWrapper}.
+     *
+     * @param catalog catalog queried with {@code matchOne} for each requested name
+     * @return a source backed by {@code catalog}
+     */
+    @NotNull
+    private static DataSetSource loadStreamingDataSource(Catalog catalog) {
+        return name -> {
+            Optional<DatasetEntry> dsentryOption = catalog.matchOne(name);
+            if (dsentryOption.isEmpty()) {
+                return Optional.empty();
+            }
+            TestDataView tdv = dsentryOption.orElseThrow().select().profile(name);
+            System.out.println("prebuffering dataset (assumed performance oriented testing)");
+            // Only the base vectors are prebuffered. NOTE(review): the future is not joined in this method;
+            // unless monitorProgress blocks, prebuffering may still be running on return -- confirm.
+            CompletableFuture<Void> statusFuture = tdv.getBaseVectors().orElseThrow().prebuffer();
+            if (statusFuture instanceof ProgressIndicator<?>) {
+                ((ProgressIndicator<?>) statusFuture).monitorProgress(1000);
+            }
+
+            TestDataViewWrapper tdw = new TestDataViewWrapper(tdv);
+            System.out.println("Loaded " + tdw.getName() + " from streaming source.");
+            return Optional.of(tdw);
+        };
+    }
 }
@@ -18,6 +18,7 @@
 
 import io.github.jbellis.jvector.example.util.DataSet;
 import io.github.jbellis.jvector.example.util.DataSetLoader;
+import io.github.jbellis.jvector.example.util.DataSetSource;
 import io.github.jbellis.jvector.example.yaml.DatasetCollection;
 import io.github.jbellis.jvector.example.yaml.MultiConfig;
 
@@ -45,6 +46,7 @@ public static void main(String[] args) throws IOException {
         var pattern = Pattern.compile(regex);
 
         var datasetCollection = DatasetCollection.load();
+        DataSetSource datasetSource = DataSetLoader.DEFAULT;
         var datasetNames = datasetCollection.getAll().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList());
 
         List<MultiConfig> allConfigs = new ArrayList<>();
@@ -53,7 +55,11 @@ public static void main(String[] args) throws IOException {
             System.out.println("Executing the following datasets: " + datasetNames);
 
             for (var datasetName : datasetNames) {
-                DataSet ds = DataSetLoader.loadDataSet(datasetName);
+                String finalDatasetName = datasetName;
+                DataSet ds = datasetSource.apply(datasetName)
+                    .orElseThrow(() -> new IllegalArgumentException(
+                        "Unknown dataset: " + finalDatasetName));
+                // DataSet ds = DataSetLoader.loadDataSet(datasetName);
 
                 if (datasetName.endsWith(".hdf5")) {
                     datasetName = datasetName.substring(0, datasetName.length() - ".hdf5".length());
@@ -73,15 +79,22 @@ public static void main(String[] args) throws IOException {
             }
         }
 
-        for (var config : allConfigs) {
-            String datasetName = config.dataset;
+        for (var datasetName : datasetNames) {
+            String finalDatasetName = datasetName;
+            DataSet ds = datasetSource.apply(datasetName)
+                    .orElseThrow(() -> new IllegalArgumentException(
+                            "Unknown dataset: " + finalDatasetName));
+            // DataSet ds = DataSetLoader.loadDataSet(datasetName);
 
-            DataSet ds = DataSetLoader.loadDataSet(datasetName);
+            if (datasetName.endsWith(".hdf5")) {
+                datasetName = datasetName.substring(0, datasetName.length() - ".hdf5".length());
+            }
+            MultiConfig config = MultiConfig.getDefaultConfig(datasetName);
 
             Grid.runAll(ds, config.construction.outDegree, config.construction.efConstruction,
                     config.construction.neighborOverflow, config.construction.addHierarchy, config.construction.refineFinalGraph,
                     config.construction.getFeatureSets(), config.construction.getCompressorParameters(),
-                    config.search.getCompressorParameters(), config.search.topKOverquery, config.search.useSearchPruning, config.search.benchmarks);
+                    config.search.getCompressorParameters(), config.search.topKOverquery, config.search.useSearchPruning);
         }
     }
 }
@@ -170,7 +170,7 @@ static void runOneGraph(List<? extends Set<FeatureId>> featureSets,
                 } else {
                     long start = System.nanoTime();
                     cv = compressor.encodeAll(ds.getBaseRavv());
-                    System.out.format("%s encoded %d vectors [%.2f MB] in %.2fs%n", compressor, ds.baseVectors.size(), (cv.ramBytesUsed() / 1024f / 1024f), (System.nanoTime() - start) / 1_000_000_000.0);
+                    System.out.format("%s encoded %d vectors [%.2f MB] in %.2fs%n", compressor, ds.getBaseVectors().size(), (cv.ramBytesUsed() / 1024f / 1024f), (System.nanoTime() - start) / 1_000_000_000.0);
                 }
 
                 indexes.forEach((features, index) -> {
@@ -206,7 +206,7 @@ private static Map<Set<FeatureId>, GraphIndex> buildOnDisk(List<? extends Set<Fe
         var floatVectors = ds.getBaseRavv();
 
         var pq = (PQVectors) buildCompressor.encodeAll(floatVectors);
-        var bsp = BuildScoreProvider.pqBuildScoreProvider(ds.similarityFunction, pq);
+        var bsp = BuildScoreProvider.pqBuildScoreProvider(ds.getSimilarityFunction(), pq);
         GraphIndexBuilder builder = new GraphIndexBuilder(bsp, floatVectors.dimension(), M, efConstruction, neighborOverflow, 1.2f, addHierarchy, refineFinalGraph);
 
         // use the inline vectors index as the score provider for graph construction
@@ -343,7 +343,7 @@ private static Map<Set<FeatureId>, GraphIndex> buildInMemory(List<? extends Set<
         var floatVectors = ds.getBaseRavv();
         Map<Set<FeatureId>, GraphIndex> indexes = new HashMap<>();
         long start;
-        var bsp = BuildScoreProvider.randomAccessScoreProvider(floatVectors, ds.similarityFunction);
+        var bsp = BuildScoreProvider.randomAccessScoreProvider(floatVectors, ds.getSimilarityFunction());
         GraphIndexBuilder builder = new GraphIndexBuilder(bsp,
                                                           floatVectors.dimension(),
                                                           M,
@@ -567,17 +567,17 @@ public static class ConfiguredSystem implements AutoCloseable {
         public SearchScoreProvider scoreProviderFor(VectorFloat<?> queryVector, GraphIndex.View view) {
             // if we're not compressing then just use the exact score function
             if (cv == null) {
-                return DefaultSearchScoreProvider.exact(queryVector, ds.similarityFunction, ds.getBaseRavv());
+                return DefaultSearchScoreProvider.exact(queryVector, ds.getSimilarityFunction(), ds.getBaseRavv());
             }
 
             var scoringView = (GraphIndex.ScoringView) view;
             ScoreFunction.ApproximateScoreFunction asf;
             if (features.contains(FeatureId.FUSED_ADC)) {
-                asf = scoringView.approximateScoreFunctionFor(queryVector, ds.similarityFunction);
+                asf = scoringView.approximateScoreFunctionFor(queryVector, ds.getSimilarityFunction());
             } else {
-                asf = cv.precomputedScoreFunctionFor(queryVector, ds.similarityFunction);
+                asf = cv.precomputedScoreFunctionFor(queryVector, ds.getSimilarityFunction());
             }
-            var rr = scoringView.rerankerFor(queryVector, ds.similarityFunction);
+            var rr = scoringView.rerankerFor(queryVector, ds.getSimilarityFunction());
             return new DefaultSearchScoreProvider(asf, rr);
         }