Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,21 +59,21 @@ jobs:
- name: Test Panama Support (JDK ${{ matrix.jdk }})
if: matrix.jdk == '20'
run: >-
mvn -B -Pjdk20 -pl jvector-tests test -am test
mvn -X -B -Pjdk20 -pl jvector-tests test -am test
-DTest_RequireSpecificVectorizationProvider=PanamaVectorizationProvider

- name: Verify native-access vector support (JDK ${{ matrix.jdk }})
if: matrix.jdk == '24'
run: >-
mvn -B -Punix-amd64-profile -pl jvector-tests -am test
mvn -X -B -Punix-amd64-profile -pl jvector-tests -am test
-DTest_RequireSpecificVectorizationProvider=NativeVectorizationProvider
-Dsurefire.failIfNoSpecifiedTests=false
-Dtest=TestVectorizationProvider

- name: Compile, run tests and package (JDK ${{ matrix.jdk }})
if: matrix.jdk == '24'
run: >-
mvn -B -Punix-amd64-profile -pl jvector-tests -am test
mvn -X -B -Punix-amd64-profile -pl jvector-tests -am test
-DTest_RequireSpecificVectorizationProvider=NativeVectorizationProvider

- name: Test Summary for (ISA:${{ matrix.isa}},JDK${{ matrix.jdk }})
Expand Down
10 changes: 10 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,16 @@ hdf5/
### aider
.aider*

### claude
.claude/**

### junie
.junie/**

# JMH generated files
dependency-reduced-pom.xml
results.csv

# Local testing files
local/**

53 changes: 53 additions & 0 deletions jvector-examples/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
<name>JVector Examples</name>
<properties>
<awssdk.version>2.21.10</awssdk.version>
<vectordata.version>0.1.8</vectordata.version>
</properties>
<build>
<plugins>
Expand All @@ -24,6 +25,35 @@
<workingDirectory>${project.parent.basedir}</workingDirectory>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.7.1</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<mainClass>io.github.jbellis.jvector.example.testrig.commands.TestRig_CMD</mainClass>
</manifest>
</archive>
<descriptorRefs>
<!-- <ref>assembly.xml</ref>-->
<descriptorRef>jar-with-dependencies</descriptorRef>
<!-- <descriptorRef>assembly</descriptorRef>-->
</descriptorRefs>
</configuration>
</plugin>



</plugins>
</build>
<dependencies>
Expand Down Expand Up @@ -79,6 +109,22 @@
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>io.nosqlbench</groupId>
<artifactId>vectordata</artifactId>
<version>${vectordata.version}</version>
</dependency>
<dependency>
<groupId>io.nosqlbench</groupId>
<artifactId>nbvectors</artifactId>
<version>${vectordata.version}</version>
</dependency>
<dependency>
<groupId>info.picocli</groupId>
<artifactId>picocli</artifactId>
<version>4.7.6</version>
</dependency>

<dependency>
<groupId>com.kohlschutter.junixsocket</groupId>
<artifactId>junixsocket-core</artifactId>
Expand All @@ -89,6 +135,10 @@
<profiles>
<profile>
<id>jdk11</id>
<properties>
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
</properties>
<build>
<plugins>
<plugin>
Expand Down Expand Up @@ -215,6 +265,9 @@
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<properties>
<maven.compiler.target>22</maven.compiler.target>
</properties>
<dependencies>
<dependency>
<groupId>io.github.jbellis</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,21 @@
import io.github.jbellis.jvector.example.util.CompressorParameters;
import io.github.jbellis.jvector.example.util.CompressorParameters.PQParameters;
import io.github.jbellis.jvector.example.util.DataSet;
import io.github.jbellis.jvector.example.util.DataSetLoader;
import io.github.jbellis.jvector.example.util.DataSetSource;
import io.github.jbellis.jvector.example.yaml.DatasetCollection;
import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
import io.nosqlbench.nbdatatools.api.concurrent.ProgressIndicator;
import io.nosqlbench.vectordata.discovery.TestDataSources;
import io.nosqlbench.vectordata.discovery.TestDataView;
import io.nosqlbench.vectordata.downloader.Catalog;
import io.nosqlbench.vectordata.downloader.DatasetEntry;
import io.nosqlbench.vectordata.spec.datasets.types.DatasetView;
import org.jetbrains.annotations.NotNull;

import java.io.IOException;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.concurrent.CompletableFuture;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -57,7 +62,7 @@ public static void main(String[] args) throws IOException {
List<Function<DataSet, CompressorParameters>> buildCompression = Arrays.asList(
ds -> new PQParameters(ds.getDimension() / 8,
256,
ds.similarityFunction == VectorSimilarityFunction.EUCLIDEAN,
ds.getSimilarityFunction() == VectorSimilarityFunction.EUCLIDEAN,
UNWEIGHTED),
__ -> CompressorParameters.NONE
);
Expand All @@ -66,7 +71,7 @@ public static void main(String[] args) throws IOException {
// ds -> new CompressorParameters.BQParameters(),
ds -> new PQParameters(ds.getDimension() / 8,
256,
ds.similarityFunction == VectorSimilarityFunction.EUCLIDEAN,
ds.getSimilarityFunction() == VectorSimilarityFunction.EUCLIDEAN,
UNWEIGHTED)
);
List<EnumSet<FeatureId>> featureSets = Arrays.asList(
Expand All @@ -85,13 +90,42 @@ public static void main(String[] args) throws IOException {
}

/**
 * Runs the benchmark grid over every configured dataset whose name matches {@code pattern}.
 *
 * <p>Dataset names come from {@link DatasetCollection#load()}. Each matching name is resolved
 * through a single {@link DataSetSource} built by chaining {@code DataSetSource.DEFAULT} with the
 * catalog-backed streaming source (presumably consulted in that order; confirm against
 * {@code DataSetSource.and}). The resolved dataset is then handed to {@code Grid.runAll} together
 * with the supplied parameter grids.
 *
 * @param pattern               regex used with {@code find()} to select dataset names
 * @param buildCompression      compressor factories used while building the graph
 * @param featureSets           on-disk feature combinations to evaluate
 * @param compressionGrid       compressor factories used while searching
 * @param mGrid                 values passed to {@code Grid.runAll} as the M grid
 * @param efConstructionGrid    values passed to {@code Grid.runAll} as the efConstruction grid
 * @param neighborOverflowGrid  values passed to {@code Grid.runAll} as the neighbor-overflow grid
 * @param addHierarchyGrid      values passed to {@code Grid.runAll} as the add-hierarchy grid
 * @param refineFinalGraphGrid  values passed to {@code Grid.runAll} as the refine-final-graph grid
 * @param topKGrid              values passed to {@code Grid.runAll} as the topK grid
 * @param usePruningGrid        values passed to {@code Grid.runAll} as the use-pruning grid
 * @throws IOException              declared for the loading and benchmark calls made here
 * @throws IllegalArgumentException if a selected dataset name is unknown to every source
 */
private static void execute(Pattern pattern,
                            List<Function<DataSet, CompressorParameters>> buildCompression,
                            List<EnumSet<FeatureId>> featureSets,
                            List<Function<DataSet, CompressorParameters>> compressionGrid,
                            List<Integer> mGrid,
                            List<Integer> efConstructionGrid,
                            List<Float> neighborOverflowGrid,
                            List<Boolean> addHierarchyGrid,
                            List<Boolean> refineFinalGraphGrid,
                            Map<Integer, List<Double>> topKGrid,
                            List<Boolean> usePruningGrid) throws IOException {
    TestDataSources testDataSources = new TestDataSources()
            .configure()
            .addOptionalCatalogs("~/.config/jvector/catalogs.yaml");
    Catalog testDataCatalog = testDataSources.catalog();
    DataSetSource dsSource = DataSetSource.DEFAULT.and(loadStreamingDataSource(testDataCatalog));

    var datasetCollection = DatasetCollection.load();
    var datasetNames = datasetCollection.getAll().stream()
            .filter(dn -> pattern.matcher(dn).find())
            .collect(Collectors.toList());

    System.out.println("Executing the following datasets: " + datasetNames);

    for (var datasetName : datasetNames) {
        // Resolve through the chained source only; the direct DataSetLoader call it replaced
        // must not remain alongside it (it would redeclare ds and bypass the catalog source).
        DataSet ds = dsSource.apply(datasetName)
                .orElseThrow(() -> new IllegalArgumentException("Unknown dataset: " + datasetName));
        Grid.runAll(ds, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid,
                refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid,
                usePruningGrid);
    }
}

/**
 * Builds a {@link DataSetSource} that resolves dataset names against {@code catalog}.
 *
 * <p>For a given name the returned source:
 * <ol>
 *   <li>returns {@code Optional.empty()} when {@code catalog.matchOne(name)} finds nothing, so a
 *       chained source can try elsewhere;</li>
 *   <li>otherwise selects the profile with the same name as the dataset, starts prebuffering the
 *       base vectors, and returns the view wrapped in a {@code TestDataViewWrapper}.</li>
 * </ol>
 *
 * @param catalog catalog the dataset names are looked up in
 * @return a source backed by {@code catalog}
 * @throws java.util.NoSuchElementException (from the returned source) if a matched dataset has
 *         no base vectors
 */
@NotNull
private static DataSetSource loadStreamingDataSource(Catalog catalog) {
    return name -> {
        Optional<DatasetEntry> entry = catalog.matchOne(name);
        if (entry.isEmpty()) {
            return Optional.empty();
        }
        // Presence was checked just above, so a plain get() is safe; no second
        // "unknown dataset" failure path is needed here.
        TestDataView tdv = entry.get().select().profile(name);

        System.out.println("prebuffering dataset (assumed performance oriented testing)");
        CompletableFuture<Void> statusFuture = tdv.getBaseVectors().orElseThrow().prebuffer();
        if (statusFuture instanceof ProgressIndicator<?>) {
            ((ProgressIndicator<?>) statusFuture).monitorProgress(1000);
        }
        // NOTE(review): statusFuture is never joined in this method. Unless monitorProgress
        // blocks until completion, the dataset may be handed out while still buffering - confirm.

        // Only the base vectors are prebuffered; the other views are left disabled:
        // tdv.getQueryVectors().orElseThrow().prebuffer();
        // tdv.getNeighborIndices().orElseThrow().prebuffer();
        // tdv.getNeighborDistances().map(DatasetView::prebuffer);

        TestDataViewWrapper tdw = new TestDataViewWrapper(tdv);
        System.out.println("Loaded " + tdw.getName() + " from streaming source.");
        return Optional.of(tdw);
    };
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import io.github.jbellis.jvector.example.util.DataSet;
import io.github.jbellis.jvector.example.util.DataSetLoader;
import io.github.jbellis.jvector.example.util.DataSetSource;
import io.github.jbellis.jvector.example.yaml.DatasetCollection;
import io.github.jbellis.jvector.example.yaml.MultiConfig;

Expand Down Expand Up @@ -45,6 +46,7 @@ public static void main(String[] args) throws IOException {
var pattern = Pattern.compile(regex);

var datasetCollection = DatasetCollection.load();
DataSetSource datasetSource = DataSetLoader.DEFAULT;
var datasetNames = datasetCollection.getAll().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList());

List<MultiConfig> allConfigs = new ArrayList<>();
Expand All @@ -53,7 +55,11 @@ public static void main(String[] args) throws IOException {
System.out.println("Executing the following datasets: " + datasetNames);

for (var datasetName : datasetNames) {
DataSet ds = DataSetLoader.loadDataSet(datasetName);
String finalDatasetName = datasetName;
DataSet ds = datasetSource.apply(datasetName)
.orElseThrow(() -> new IllegalArgumentException(
"Unknown dataset: " + finalDatasetName));
// DataSet ds = DataSetLoader.loadDataSet(datasetName);

if (datasetName.endsWith(".hdf5")) {
datasetName = datasetName.substring(0, datasetName.length() - ".hdf5".length());
Expand All @@ -73,15 +79,22 @@ public static void main(String[] args) throws IOException {
}
}

for (var config : allConfigs) {
String datasetName = config.dataset;
for (var datasetName : datasetNames) {
String finalDatasetName = datasetName;
DataSet ds = datasetSource.apply(datasetName)
.orElseThrow(() -> new IllegalArgumentException(
"Unknown dataset: " + finalDatasetName));
// DataSet ds = DataSetLoader.loadDataSet(datasetName);

DataSet ds = DataSetLoader.loadDataSet(datasetName);
if (datasetName.endsWith(".hdf5")) {
datasetName = datasetName.substring(0, datasetName.length() - ".hdf5".length());
}
MultiConfig config = MultiConfig.getDefaultConfig(datasetName);

Grid.runAll(ds, config.construction.outDegree, config.construction.efConstruction,
config.construction.neighborOverflow, config.construction.addHierarchy, config.construction.refineFinalGraph,
config.construction.getFeatureSets(), config.construction.getCompressorParameters(),
config.search.getCompressorParameters(), config.search.topKOverquery, config.search.useSearchPruning, config.search.benchmarks);
config.search.getCompressorParameters(), config.search.topKOverquery, config.search.useSearchPruning);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ static void runOneGraph(List<? extends Set<FeatureId>> featureSets,
} else {
long start = System.nanoTime();
cv = compressor.encodeAll(ds.getBaseRavv());
System.out.format("%s encoded %d vectors [%.2f MB] in %.2fs%n", compressor, ds.baseVectors.size(), (cv.ramBytesUsed() / 1024f / 1024f), (System.nanoTime() - start) / 1_000_000_000.0);
System.out.format("%s encoded %d vectors [%.2f MB] in %.2fs%n", compressor, ds.getBaseVectors().size(), (cv.ramBytesUsed() / 1024f / 1024f), (System.nanoTime() - start) / 1_000_000_000.0);
}

indexes.forEach((features, index) -> {
Expand Down Expand Up @@ -206,7 +206,7 @@ private static Map<Set<FeatureId>, GraphIndex> buildOnDisk(List<? extends Set<Fe
var floatVectors = ds.getBaseRavv();

var pq = (PQVectors) buildCompressor.encodeAll(floatVectors);
var bsp = BuildScoreProvider.pqBuildScoreProvider(ds.similarityFunction, pq);
var bsp = BuildScoreProvider.pqBuildScoreProvider(ds.getSimilarityFunction(), pq);
GraphIndexBuilder builder = new GraphIndexBuilder(bsp, floatVectors.dimension(), M, efConstruction, neighborOverflow, 1.2f, addHierarchy, refineFinalGraph);

// use the inline vectors index as the score provider for graph construction
Expand Down Expand Up @@ -343,7 +343,7 @@ private static Map<Set<FeatureId>, GraphIndex> buildInMemory(List<? extends Set<
var floatVectors = ds.getBaseRavv();
Map<Set<FeatureId>, GraphIndex> indexes = new HashMap<>();
long start;
var bsp = BuildScoreProvider.randomAccessScoreProvider(floatVectors, ds.similarityFunction);
var bsp = BuildScoreProvider.randomAccessScoreProvider(floatVectors, ds.getSimilarityFunction());
GraphIndexBuilder builder = new GraphIndexBuilder(bsp,
floatVectors.dimension(),
M,
Expand Down Expand Up @@ -567,17 +567,17 @@ public static class ConfiguredSystem implements AutoCloseable {
/**
 * Creates the score provider used to search {@code view} for {@code queryVector}.
 *
 * <p>When no compressed vectors are configured ({@code cv == null}) scoring is exact against the
 * dataset's base vectors. Otherwise an approximate score function is paired with a reranker
 * obtained from the scoring view: the approximate function comes from the view itself when the
 * feature set contains {@link FeatureId#FUSED_ADC}, and from the compressed vectors otherwise.
 *
 * @param queryVector the query to score against
 * @param view        graph view being searched; cast to {@code GraphIndex.ScoringView} when
 *                    compressed vectors are in use
 * @return the score provider for this query
 */
public SearchScoreProvider scoreProviderFor(VectorFloat<?> queryVector, GraphIndex.View view) {
    // Read the similarity function through the accessor once; the direct field reads it
    // replaced must not remain next to it (duplicate returns / declarations).
    var similarityFunction = ds.getSimilarityFunction();

    // if we're not compressing then just use the exact score function
    if (cv == null) {
        return DefaultSearchScoreProvider.exact(queryVector, similarityFunction, ds.getBaseRavv());
    }

    var scoringView = (GraphIndex.ScoringView) view;
    ScoreFunction.ApproximateScoreFunction asf;
    if (features.contains(FeatureId.FUSED_ADC)) {
        asf = scoringView.approximateScoreFunctionFor(queryVector, similarityFunction);
    } else {
        asf = cv.precomputedScoreFunctionFor(queryVector, similarityFunction);
    }
    var rr = scoringView.rerankerFor(queryVector, similarityFunction);
    return new DefaultSearchScoreProvider(asf, rr);
}

Expand Down
Loading
Loading