diff --git a/lucene/demo/src/java/org/apache/lucene/demo/IndexFiles.java b/lucene/demo/src/java/org/apache/lucene/demo/IndexFiles.java
index 1e23c5d9b84..71a63839cee 100644
--- a/lucene/demo/src/java/org/apache/lucene/demo/IndexFiles.java
+++ b/lucene/demo/src/java/org/apache/lucene/demo/IndexFiles.java
@@ -47,6 +47,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.IOUtils;
/**
* Index all text files under a directory.
@@ -55,17 +56,18 @@ import org.apache.lucene.store.FSDirectory;
* command-line arguments for usage information.
*/
public class IndexFiles implements AutoCloseable {
+ static final String KNN_DICT = "knn-dict";
// Calculates embedding vectors for KnnVector search
private final DemoEmbeddings demoEmbeddings;
private final KnnVectorDict vectorDict;
- private IndexFiles(Path vectorDictPath) throws IOException {
- if (vectorDictPath != null) {
- vectorDict = new KnnVectorDict(vectorDictPath);
+ private IndexFiles(KnnVectorDict vectorDict) throws IOException {
+ if (vectorDict != null) {
+ this.vectorDict = vectorDict;
demoEmbeddings = new DemoEmbeddings(vectorDict);
} else {
- vectorDict = null;
+ this.vectorDict = null;
demoEmbeddings = null;
}
}
@@ -80,7 +82,7 @@ public class IndexFiles implements AutoCloseable {
+ "IF DICT_PATH contains a KnnVector dictionary, the index will also support KnnVector search";
String indexPath = "index";
String docsPath = null;
- Path vectorDictPath = null;
+ String vectorDictSource = null;
boolean create = true;
for (int i = 0; i < args.length; i++) {
switch (args[i]) {
@@ -91,7 +93,7 @@ public class IndexFiles implements AutoCloseable {
docsPath = args[++i];
break;
case "-knn_dict":
- vectorDictPath = Paths.get(args[++i]);
+ vectorDictSource = args[++i];
break;
case "-update":
create = false;
@@ -142,8 +144,16 @@ public class IndexFiles implements AutoCloseable {
//
// iwc.setRAMBufferSizeMB(256.0);
+ KnnVectorDict vectorDictInstance = null;
+ long vectorDictSize = 0;
+ if (vectorDictSource != null) {
+ KnnVectorDict.build(Paths.get(vectorDictSource), dir, KNN_DICT);
+ vectorDictInstance = new KnnVectorDict(dir, KNN_DICT);
+ vectorDictSize = vectorDictInstance.ramBytesUsed();
+ }
+
try (IndexWriter writer = new IndexWriter(dir, iwc);
- IndexFiles indexFiles = new IndexFiles(vectorDictPath)) {
+ IndexFiles indexFiles = new IndexFiles(vectorDictInstance)) {
indexFiles.indexDocs(writer, docDir);
// NOTE: if you want to maximize search performance,
@@ -153,6 +163,8 @@ public class IndexFiles implements AutoCloseable {
// you're done adding documents to it):
//
// writer.forceMerge(1);
+ } finally {
+ IOUtils.close(vectorDictInstance);
}
Date end = new Date();
@@ -163,6 +175,10 @@ public class IndexFiles implements AutoCloseable {
+ " documents in "
+ (end.getTime() - start.getTime())
+ " milliseconds");
+ if (reader.numDocs() > 100 && vectorDictSize < 1_000_000) {
+ throw new RuntimeException(
+ "Are you (ab)using the toy vector dictionary? See the package javadocs to understand why you got this exception.");
+ }
}
} catch (IOException e) {
System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
@@ -263,8 +279,6 @@ public class IndexFiles implements AutoCloseable {
@Override
public void close() throws IOException {
- if (vectorDict != null) {
- vectorDict.close();
- }
+ IOUtils.close(vectorDict);
}
}
diff --git a/lucene/demo/src/java/org/apache/lucene/demo/SearchFiles.java b/lucene/demo/src/java/org/apache/lucene/demo/SearchFiles.java
index eeaaa95176e..e6195c9a801 100644
--- a/lucene/demo/src/java/org/apache/lucene/demo/SearchFiles.java
+++ b/lucene/demo/src/java/org/apache/lucene/demo/SearchFiles.java
@@ -31,7 +31,6 @@ import org.apache.lucene.demo.knn.DemoEmbeddings;
import org.apache.lucene.demo.knn.KnnVectorDict;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
@@ -103,12 +102,12 @@ public class SearchFiles {
}
}
- IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
+ DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
IndexSearcher searcher = new IndexSearcher(reader);
Analyzer analyzer = new StandardAnalyzer();
KnnVectorDict vectorDict = null;
if (knnVectors > 0) {
- vectorDict = new KnnVectorDict(Paths.get(index).resolve("knn-dict"));
+ vectorDict = new KnnVectorDict(reader.directory(), IndexFiles.KNN_DICT);
}
BufferedReader in;
if (queries != null) {
diff --git a/lucene/demo/src/java/org/apache/lucene/demo/knn/KnnVectorDict.java b/lucene/demo/src/java/org/apache/lucene/demo/knn/KnnVectorDict.java
index 1601ae7b4c2..116fea0daf7 100644
--- a/lucene/demo/src/java/org/apache/lucene/demo/knn/KnnVectorDict.java
+++ b/lucene/demo/src/java/org/apache/lucene/demo/knn/KnnVectorDict.java
@@ -17,17 +17,19 @@
package org.apache.lucene.demo.knn;
import java.io.BufferedReader;
-import java.io.DataOutputStream;
+import java.io.Closeable;
import java.io.IOException;
-import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.FloatBuffer;
-import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.regex.Pattern;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.VectorUtil;
@@ -40,32 +42,29 @@ import org.apache.lucene.util.fst.Util;
* Manages a map from token to numeric vector for use with KnnVector indexing and search. The map is
* stored as an FST: token-to-ordinal plus a dense binary file holding the vectors.
*/
-public class KnnVectorDict implements AutoCloseable {
+public class KnnVectorDict implements Closeable {
private final FST
In addition to indexing and searching text, IndexFiles and SearchFiles can also index and search + numeric vectors derived from that text, known as "embeddings." This demo code uses pre-computed embeddings + provided by the GloVe project, which are in the public + domain. The dictionary here is a tiny subset of the full GloVe dataset. It includes only the words that occur + in the toy data set, and is definitely not ready for production use! If you use this code to create + a vector index for a larger document set, the indexer will throw an exception because + a more complete set of embeddings is needed to get reasonable results. +
+