LUCENE-10016: Added KnnVector index/query support to demo

Michael Sokolov 2021-08-18 08:13:59 -04:00 committed by GitHub
parent 4213f9d3cd
commit a37844aedd
16 changed files with 2819 additions and 51 deletions

View File

@ -343,6 +343,7 @@ configure(project(":lucene")) {
"elegant-icon-font-*",
"ant-*",
"ivy-*",
"pddl-10.txt",
]
}
}

View File

@ -93,6 +93,10 @@ allprojects {
exclude "src/**/en-test-lemmas.dict"
break
case ":lucene:demo":
exclude "src/**/knn-token-vectors"
break
case ":lucene:test-framework":
exclude "src/**/europarl.lines.txt.seek"
break

View File: lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java

@ -43,9 +43,9 @@ public enum VectorSimilarityFunction {
};
/**
-   * If true, the scores associated with vector comparisons are in reverse order; that is, lower
-   * scores represent more similar vectors. Otherwise, if false, higher scores represent more
-   * similar vectors.
+   * If true, the scores associated with vector comparisons are nonnegative and in reverse order;
+   * that is, lower scores represent more similar vectors. Otherwise, if false, higher scores
+   * represent more similar vectors, and scores may be negative or positive.
*/
public final boolean reversed;
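
As a concrete reading of the updated javadoc, here is a minimal sketch (not part of the commit). It assumes EUCLIDEAN, whose compare returns squared L2 distance at this point in the codebase, so it is a reversed function and the lower score is the better one:

```java
import org.apache.lucene.index.VectorSimilarityFunction;

public class ReversedScoreSketch {
  public static void main(String[] args) {
    float[] a = {1f, 0f};
    float[] b = {0.9f, 0.1f};
    float[] c = {-1f, 0f};
    VectorSimilarityFunction f = VectorSimilarityFunction.EUCLIDEAN; // reversed == true
    float ab = f.compare(a, b); // small squared distance: very similar
    float ac = f.compare(a, c); // large squared distance: dissimilar
    // For a reversed function the *lower* score wins; otherwise the higher one does.
    boolean aCloserToB = f.reversed ? ab < ac : ab > ac;
    System.out.println(aCloserToB); // true
  }
}
```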

View File: lucene/core/src/java/org/apache/lucene/util/VectorUtil.java

@ -117,17 +117,44 @@ public final class VectorUtil {
* thrown for zero vectors.
*/
public static void l2normalize(float[] v) {
l2normalize(v, true);
}
/**
* Modifies the argument to be unit length, dividing by its l2-norm.
*
* @param v the vector to normalize
* @param throwOnZero whether to throw an exception when <code>v</code> has all zeros
* @throws IllegalArgumentException when the vector is all zero and throwOnZero is true
*/
public static void l2normalize(float[] v, boolean throwOnZero) {
double squareSum = 0.0f;
int dim = v.length;
for (float x : v) {
squareSum += x * x;
}
if (squareSum == 0) {
-      throw new IllegalArgumentException("Cannot normalize a zero-length vector");
+      if (throwOnZero) {
+        throw new IllegalArgumentException("Cannot normalize a zero-length vector");
+      } else {
+        return;
+      }
}
double length = Math.sqrt(squareSum);
for (int i = 0; i < dim; i++) {
v[i] /= length;
}
}
/**
* Adds the second argument to the first
*
* @param u the destination
* @param v the vector to add to the destination
*/
public static void add(float[] u, float[] v) {
for (int i = 0; i < u.length; i++) {
u[i] += v[i];
}
}
}
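
The no-throw variant matters for the demo because input that matches no dictionary token leaves an all-zero sum, which must survive normalization. A minimal sketch (not part of the commit) of the two helpers together:

```java
import java.util.Arrays;
import org.apache.lucene.util.VectorUtil;

public class VectorUtilSketch {
  public static void main(String[] args) {
    float[] sum = new float[2];
    // With throwOnZero == false an all-zero vector is left unchanged
    // instead of triggering an IllegalArgumentException:
    VectorUtil.l2normalize(sum, false);
    System.out.println(Arrays.toString(sum)); // [0.0, 0.0]

    VectorUtil.add(sum, new float[] {3f, 0f}); // accumulate into the first argument
    VectorUtil.add(sum, new float[] {0f, 4f});
    VectorUtil.l2normalize(sum); // divides by the L2 norm, 5.0
    System.out.println(Arrays.toString(sum)); // [0.6, 0.8]
  }
}
```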

View File: lucene/demo/build.gradle

@ -26,6 +26,6 @@ dependencies {
implementation project(':lucene:analysis:common')
implementation project(':lucene:queryparser')
implementation project(':lucene:expressions')
testImplementation project(':lucene:test-framework')
}

View File: lucene/demo/src/java/org/apache/lucene/demo/IndexFiles.java

@ -30,15 +30,21 @@ import java.nio.file.attribute.BasicFileAttributes;
import java.util.Date;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.demo.knn.DemoEmbeddings;
import org.apache.lucene.demo.knn.KnnVectorDict;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.KnnVectorField;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
@ -50,27 +56,48 @@ import org.apache.lucene.store.FSDirectory;
*/
public class IndexFiles {
-  private IndexFiles() {}
+  // Calculates embedding vectors for KnnVector search
+  private final DemoEmbeddings demoEmbeddings;
+  private IndexFiles(Path vectorDictPath) throws IOException {
+    if (vectorDictPath != null) {
+      demoEmbeddings = new DemoEmbeddings(new KnnVectorDict(vectorDictPath));
+    } else {
+      demoEmbeddings = null;
+    }
+  }
/** Index all text files under a directory. */
-  public static void main(String[] args) {
+  public static void main(String[] args) throws Exception {
    String usage =
        "java org.apache.lucene.demo.IndexFiles"
-            + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
+            + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update] [-knn_dict DICT_PATH]\n\n"
            + "This indexes the documents in DOCS_PATH, creating a Lucene index"
-            + "in INDEX_PATH that can be searched with SearchFiles";
+            + "in INDEX_PATH that can be searched with SearchFiles\n"
+            + "IF DICT_PATH contains a KnnVector dictionary, the index will also support KnnVector search";
String indexPath = "index";
String docsPath = null;
Path vectorDictPath = null;
boolean create = true;
for (int i = 0; i < args.length; i++) {
if ("-index".equals(args[i])) {
indexPath = args[i + 1];
i++;
} else if ("-docs".equals(args[i])) {
docsPath = args[i + 1];
i++;
} else if ("-update".equals(args[i])) {
create = false;
switch (args[i]) {
case "-index":
indexPath = args[++i];
break;
case "-docs":
docsPath = args[++i];
break;
case "-knn_dict":
vectorDictPath = Paths.get(args[++i]);
break;
case "-update":
create = false;
break;
case "-create":
create = true;
break;
default:
throw new IllegalArgumentException("unknown parameter " + args[i]);
}
}
@ -113,7 +140,8 @@ public class IndexFiles {
// iwc.setRAMBufferSizeMB(256.0);
IndexWriter writer = new IndexWriter(dir, iwc);
-      indexDocs(writer, docDir);
+      IndexFiles indexFiles = new IndexFiles(vectorDictPath);
+      indexFiles.indexDocs(writer, docDir);
// NOTE: if you want to maximize search performance,
// you can optionally call forceMerge here. This can be
@ -126,8 +154,14 @@ public class IndexFiles {
writer.close();
Date end = new Date();
-      System.out.println(end.getTime() - start.getTime() + " total milliseconds");
+      try (IndexReader reader = DirectoryReader.open(dir)) {
+        System.out.println(
+            "Indexed "
+                + reader.numDocs()
+                + " documents in "
+                + (end.getTime() - start.getTime())
+                + " milliseconds");
+      }
} catch (IOException e) {
System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
}
@ -147,7 +181,7 @@ public class IndexFiles {
* @param path The file to index, or the directory to recurse into to find files to index
* @throws IOException If there is a low-level I/O error
*/
-  static void indexDocs(final IndexWriter writer, Path path) throws IOException {
+  void indexDocs(final IndexWriter writer, Path path) throws IOException {
if (Files.isDirectory(path)) {
Files.walkFileTree(
path,
@ -160,6 +194,7 @@ public class IndexFiles {
} catch (
@SuppressWarnings("unused")
IOException ignore) {
ignore.printStackTrace(System.err);
// don't index files that can't be read.
}
return FileVisitResult.CONTINUE;
@ -171,7 +206,7 @@ public class IndexFiles {
}
/** Indexes a single document */
-  static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
+  void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
try (InputStream stream = Files.newInputStream(file)) {
// make a new, empty document
Document doc = new Document();
@ -201,6 +236,16 @@ public class IndexFiles {
"contents",
new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));
if (demoEmbeddings != null) {
try (InputStream in = Files.newInputStream(file)) {
float[] vector =
demoEmbeddings.computeEmbedding(
new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8)));
doc.add(
new KnnVectorField("contents-vector", vector, VectorSimilarityFunction.DOT_PRODUCT));
}
}
if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
// New index, so we just add the document (no old document can be there):
System.out.println("adding " + file);

View File: lucene/demo/src/java/org/apache/lucene/demo/SearchFiles.java

@ -22,15 +22,24 @@ import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.demo.knn.DemoEmbeddings;
import org.apache.lucene.demo.knn.KnnVectorDict;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.KnnVectorQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
@ -43,7 +52,7 @@ public class SearchFiles {
/** Simple command-line based search demo. */
public static void main(String[] args) throws Exception {
String usage =
"Usage:\tjava org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-query string] [-raw] [-paging hitsPerPage]\n\nSee http://lucene.apache.org/core/4_1_0/demo/ for details.";
"Usage:\tjava org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-query string] [-raw] [-paging hitsPerPage] [-knn_vector knnHits]\n\nSee http://lucene.apache.org/core/9_0_0/demo/ for details.";
if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
System.out.println(usage);
System.exit(0);
@ -54,42 +63,54 @@ public class SearchFiles {
String queries = null;
int repeat = 0;
boolean raw = false;
int knnVectors = 0;
String queryString = null;
int hitsPerPage = 10;
for (int i = 0; i < args.length; i++) {
if ("-index".equals(args[i])) {
index = args[i + 1];
i++;
} else if ("-field".equals(args[i])) {
field = args[i + 1];
i++;
} else if ("-queries".equals(args[i])) {
queries = args[i + 1];
i++;
} else if ("-query".equals(args[i])) {
queryString = args[i + 1];
i++;
} else if ("-repeat".equals(args[i])) {
repeat = Integer.parseInt(args[i + 1]);
i++;
} else if ("-raw".equals(args[i])) {
raw = true;
} else if ("-paging".equals(args[i])) {
hitsPerPage = Integer.parseInt(args[i + 1]);
if (hitsPerPage <= 0) {
System.err.println("There must be at least 1 hit per page.");
switch (args[i]) {
case "-index":
index = args[++i];
break;
case "-field":
field = args[++i];
break;
case "-queries":
queries = args[++i];
break;
case "-query":
queryString = args[++i];
break;
case "-repeat":
repeat = Integer.parseInt(args[++i]);
break;
case "-raw":
raw = true;
break;
case "-paging":
hitsPerPage = Integer.parseInt(args[++i]);
if (hitsPerPage <= 0) {
System.err.println("There must be at least 1 hit per page.");
System.exit(1);
}
break;
case "-knn_vector":
knnVectors = Integer.parseInt(args[++i]);
break;
default:
System.err.println("Unknown argument: " + args[i]);
System.exit(1);
}
i++;
}
}
IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
IndexSearcher searcher = new IndexSearcher(reader);
Analyzer analyzer = new StandardAnalyzer();
-    BufferedReader in = null;
+    KnnVectorDict vectorDict = null;
+    if (knnVectors > 0) {
+      vectorDict = new KnnVectorDict(Paths.get(index).resolve("knn-dict"));
+    }
+    BufferedReader in;
if (queries != null) {
in = Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8);
} else {
@ -113,6 +134,9 @@ public class SearchFiles {
}
Query query = parser.parse(line);
if (knnVectors > 0) {
query = addSemanticQuery(query, vectorDict, knnVectors);
}
System.out.println("Searching for: " + query.toString(field));
if (repeat > 0) { // repeat & time as benchmark
@ -242,4 +266,55 @@ public class SearchFiles {
}
}
}
private static Query addSemanticQuery(Query query, KnnVectorDict vectorDict, int k)
throws IOException {
StringBuilder semanticQueryText = new StringBuilder();
QueryFieldTermExtractor termExtractor = new QueryFieldTermExtractor("contents");
query.visit(termExtractor);
for (String term : termExtractor.terms) {
semanticQueryText.append(term).append(' ');
}
if (semanticQueryText.length() > 0) {
KnnVectorQuery knnQuery =
new KnnVectorQuery(
"contents-vector",
new DemoEmbeddings(vectorDict).computeEmbedding(semanticQueryText.toString()),
k);
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.add(query, BooleanClause.Occur.SHOULD);
builder.add(knnQuery, BooleanClause.Occur.SHOULD);
return builder.build();
}
return query;
}
private static class QueryFieldTermExtractor extends QueryVisitor {
private final String field;
private final List<String> terms = new ArrayList<>();
QueryFieldTermExtractor(String field) {
this.field = field;
}
@Override
public boolean acceptField(String field) {
return field.equals(this.field);
}
@Override
public void consumeTerms(Query query, Term... terms) {
for (Term term : terms) {
this.terms.add(term.text());
}
}
@Override
public QueryVisitor getSubVisitor(BooleanClause.Occur occur, Query parent) {
if (occur == BooleanClause.Occur.MUST_NOT) {
return QueryVisitor.EMPTY_VISITOR;
}
return this;
}
}
}
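
Stripped of plumbing, addSemanticQuery builds a hybrid query: the parsed query and a KnnVectorQuery are OR'ed together, so a document can score by keyword match, vector similarity, or both. A sketch under those assumptions (not part of the commit; field names follow the demo, and the query vector would come from DemoEmbeddings):

```java
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.KnnVectorQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

public class HybridQuerySketch {
  static Query hybrid(String token, float[] queryVector, int k) {
    return new BooleanQuery.Builder()
        // keyword clause: scores documents containing the token
        .add(new TermQuery(new Term("contents", token)), BooleanClause.Occur.SHOULD)
        // semantic clause: scores the k nearest neighbors of the query vector
        .add(new KnnVectorQuery("contents-vector", queryVector, k), BooleanClause.Occur.SHOULD)
        .build();
  }
}
```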

View File: lucene/demo/src/java/org/apache/lucene/demo/knn/DemoEmbeddings.java

@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.demo.knn;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
* This class provides {@link #computeEmbedding(String)} and {@link #computeEmbedding(Reader)} for
* calculating "semantic" embedding vectors for textual input.
*/
public class DemoEmbeddings {
private final Analyzer analyzer;
/**
* Sole constructor
*
* @param vectorDict a token to vector dictionary
*/
public DemoEmbeddings(KnnVectorDict vectorDict) {
analyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new StandardTokenizer();
TokenStream output =
new KnnVectorDictFilter(new LowerCaseFilter(tokenizer), vectorDict);
return new TokenStreamComponents(tokenizer, output);
}
};
}
/**
* Tokenize and lower-case the input, look up the tokens in the dictionary, and sum the token
* vectors. Unrecognized tokens are ignored. The resulting vector is normalized to unit length.
*
* @param input the input to analyze
* @return the KnnVector for the input
*/
public float[] computeEmbedding(String input) throws IOException {
return computeEmbedding(new StringReader(input));
}
/**
* Tokenize and lower-case the input, look up the tokens in the dictionary, and sum the token
* vectors. Unrecognized tokens are ignored. The resulting vector is normalized to unit length.
*
* @param input the input to analyze
* @return the KnnVector for the input
*/
public float[] computeEmbedding(Reader input) throws IOException {
try (TokenStream tokens = analyzer.tokenStream("dummyField", input)) {
tokens.reset();
while (tokens.incrementToken()) {}
tokens.end();
return ((KnnVectorDictFilter) tokens).getResult();
}
}
}
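
Typical use of the class looks like this sketch (not part of the commit; the dictionary path is hypothetical and would have been produced by KnnVectorDict.build):

```java
import java.nio.file.Paths;
import org.apache.lucene.demo.knn.DemoEmbeddings;
import org.apache.lucene.demo.knn.KnnVectorDict;

public class EmbeddingSketch {
  public static void main(String[] args) throws Exception {
    try (KnnVectorDict dict = new KnnVectorDict(Paths.get("knn-dict"))) {
      DemoEmbeddings embeddings = new DemoEmbeddings(dict);
      float[] vector = embeddings.computeEmbedding("lucene indexes text");
      System.out.println(vector.length); // == dict.getDimension()
    }
  }
}
```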

View File: lucene/demo/src/java/org/apache/lucene/demo/knn/KnnVectorDict.java

@ -0,0 +1,211 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.demo.knn;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.FloatBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.regex.Pattern;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.VectorUtil;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
/**
* Manages a map from token to numeric vector for use with KnnVector indexing and search. The map is
* stored as an FST: token-to-ordinal plus a dense binary file holding the vectors.
*/
public class KnnVectorDict implements AutoCloseable {
private final FST<Long> fst;
private final FileChannel vectors;
private final ByteBuffer vbuffer;
private final int dimension;
/**
* Sole constructor
*
* @param knnDictPath the base path name of the files that will store the KnnVectorDict. The file
* with extension '.bin' holds the vectors and the '.fst' maps tokens to offsets in the '.bin'
* file.
*/
public KnnVectorDict(Path knnDictPath) throws IOException {
String dictName = knnDictPath.getFileName().toString();
Path fstPath = knnDictPath.resolveSibling(dictName + ".fst");
Path binPath = knnDictPath.resolveSibling(dictName + ".bin");
fst = FST.read(fstPath, PositiveIntOutputs.getSingleton());
vectors = FileChannel.open(binPath);
long size = vectors.size();
if (size > Integer.MAX_VALUE) {
throw new IllegalArgumentException("vector file is too large: " + size + " bytes");
}
vbuffer = vectors.map(FileChannel.MapMode.READ_ONLY, 0, size);
dimension = vbuffer.getInt((int) (size - Integer.BYTES));
if ((size - Integer.BYTES) % (dimension * Float.BYTES) != 0) {
throw new IllegalStateException(
"vector file size " + size + " is not consonant with the vector dimension " + dimension);
}
}
/**
* Get the vector corresponding to the given token. NOTE: the returned array is shared and its
* contents will be overwritten by subsequent calls. The caller is responsible to copy the data as
* needed.
*
* @param token the token to look up
* @param output the array in which to write the corresponding vector. Its length must be {@link
* #getDimension()} * {@link Float#BYTES}. It will be filled with zeros if the token is not
* present in the dictionary.
* @throws IllegalArgumentException if the output array is incorrectly sized
* @throws IOException if there is a problem reading the dictionary
*/
public void get(BytesRef token, byte[] output) throws IOException {
if (output.length != dimension * Float.BYTES) {
throw new IllegalArgumentException(
"the output array must be of length "
+ (dimension * Float.BYTES)
+ ", got "
+ output.length);
}
Long ord = Util.get(fst, token);
if (ord == null) {
Arrays.fill(output, (byte) 0);
} else {
vbuffer.position((int) (ord * dimension * Float.BYTES));
vbuffer.get(output);
}
}
/**
* Get the dimension of the vectors returned by this.
*
* @return the vector dimension
*/
public int getDimension() {
return dimension;
}
@Override
public void close() throws IOException {
vectors.close();
}
/**
* Convert from a GloVe-formatted dictionary file to a KnnVectorDict file pair.
*
* @param gloveInput the path to the input dictionary. The dictionary is delimited by newlines,
* and each line is space-delimited. The first column has the token, and the remaining columns
* are the vector components, as text. The dictionary must be sorted by its leading tokens
* (considered as bytes).
* @param dictOutput a dictionary path prefix. The output will be two files, named by appending
* '.fst' and '.bin' to this path.
*/
public static void build(Path gloveInput, Path dictOutput) throws IOException {
new Builder().build(gloveInput, dictOutput);
}
private static class Builder {
private static final Pattern SPACE_RE = Pattern.compile(" ");
private final IntsRefBuilder intsRefBuilder = new IntsRefBuilder();
private final FSTCompiler<Long> fstCompiler =
new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());
private float[] scratch;
private ByteBuffer byteBuffer;
private long ordinal = 1;
private int numFields;
void build(Path gloveInput, Path dictOutput) throws IOException {
String dictName = dictOutput.getFileName().toString();
Path fstPath = dictOutput.resolveSibling(dictName + ".fst");
Path binPath = dictOutput.resolveSibling(dictName + ".bin");
try (BufferedReader in = Files.newBufferedReader(gloveInput);
OutputStream binOut = Files.newOutputStream(binPath);
DataOutputStream binDataOut = new DataOutputStream(binOut)) {
writeFirstLine(in, binOut);
while (true) {
if (addOneLine(in, binOut) == false) {
break;
}
}
fstCompiler.compile().save(fstPath);
binDataOut.writeInt(numFields - 1);
}
}
private void writeFirstLine(BufferedReader in, OutputStream out) throws IOException {
String[] fields = readOneLine(in);
if (fields == null) {
return;
}
numFields = fields.length;
byteBuffer =
ByteBuffer.allocate((numFields - 1) * Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
scratch = new float[numFields - 1];
writeVector(fields, out);
}
private String[] readOneLine(BufferedReader in) throws IOException {
String line = in.readLine();
if (line == null) {
return null;
}
return SPACE_RE.split(line, 0);
}
private boolean addOneLine(BufferedReader in, OutputStream out) throws IOException {
String[] fields = readOneLine(in);
if (fields == null) {
return false;
}
if (fields.length != numFields) {
throw new IllegalStateException(
"different field count at line "
+ ordinal
+ " got "
+ fields.length
+ " when expecting "
+ numFields);
}
fstCompiler.add(Util.toIntsRef(new BytesRef(fields[0]), intsRefBuilder), ordinal++);
writeVector(fields, out);
return true;
}
private void writeVector(String[] fields, OutputStream out) throws IOException {
byteBuffer.position(0);
FloatBuffer floatBuffer = byteBuffer.asFloatBuffer();
for (int i = 1; i < fields.length; i++) {
scratch[i - 1] = Float.parseFloat(fields[i]);
}
VectorUtil.l2normalize(scratch);
floatBuffer.put(scratch);
out.write(byteBuffer.array());
}
}
}
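
End to end, the builder turns GloVe-style text (token, then space-separated components, lines sorted by token bytes) into the '.fst'/'.bin' pair the constructor reads back. A sketch with a tiny three-dimensional dictionary (not part of the commit); note that, as the builder is written, FST ordinals start at 1 with the second input line, so the first line chiefly establishes the field count:

```java
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import org.apache.lucene.demo.knn.KnnVectorDict;
import org.apache.lucene.util.BytesRef;

public class BuildDictSketch {
  public static void main(String[] args) throws Exception {
    Path dir = Files.createTempDirectory("knn");
    Path glove = dir.resolve("glove.txt"); // GloVe text format, sorted by token bytes
    Files.write(glove, List.of(
        "aardvark 0.9 0.1 0.2",
        "cat 0.1 0.2 0.3",
        "dog 0.4 0.5 0.6"));

    Path dict = dir.resolve("dict"); // writes dict.fst and dict.bin
    KnnVectorDict.build(glove, dict);

    try (KnnVectorDict d = new KnnVectorDict(dict)) {
      System.out.println(d.getDimension()); // 3
      byte[] vector = new byte[d.getDimension() * Float.BYTES];
      d.get(new BytesRef("cat"), vector);   // little-endian floats, l2-normalized
      d.get(new BytesRef("zebra"), vector); // unknown token: filled with zeros
    }
  }
}
```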

View File: lucene/demo/src/java/org/apache/lucene/demo/knn/KnnVectorDictFilter.java

@ -0,0 +1,91 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.demo.knn;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.FloatBuffer;
import java.util.Arrays;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.VectorUtil;
/**
* Looks up each token in a dictionary, and sums the token vectors. Unrecognized tokens are
* ignored. The resulting vector is normalized to unit length.
*/
public final class KnnVectorDictFilter extends TokenFilter {
private final KnnVectorDict dict;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final float[] scratchFloats;
private final float[] result;
private final byte[] scratchBytes;
private final FloatBuffer scratchBuffer;
/**
* sole constructor
*
* @param input the input token stream to filter.
* @param dict a token to vector dictionary, used to look up the token vectors.
*/
public KnnVectorDictFilter(TokenStream input, KnnVectorDict dict) {
super(input);
this.dict = dict;
result = new float[dict.getDimension()];
scratchBytes = new byte[dict.getDimension() * Float.BYTES];
scratchBuffer = ByteBuffer.wrap(scratchBytes).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer();
scratchFloats = new float[dict.getDimension()];
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken() == false) {
return false;
}
BytesRef term = new BytesRef(termAtt.toString());
dict.get(term, scratchBytes);
scratchBuffer.position(0);
scratchBuffer.get(scratchFloats);
VectorUtil.add(result, scratchFloats);
return true;
}
@Override
public void reset() throws IOException {
super.reset();
Arrays.fill(result, 0);
}
@Override
public void end() throws IOException {
super.end();
VectorUtil.l2normalize(result, false);
}
/**
* Get the vector computed from the input
*
* @return the resultant sum of the vectors of each term.
*/
public float[] getResult() {
return result;
}
}
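
Driving the filter by hand makes the lifecycle explicit: reset() zeroes the shared result, each incrementToken() adds one token's vector, and end() l2-normalizes without throwing on an all-zero sum. A sketch (not part of the commit) mirroring what DemoEmbeddings.computeEmbedding does:

```java
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.demo.knn.KnnVectorDict;
import org.apache.lucene.demo.knn.KnnVectorDictFilter;

public class FilterLifecycleSketch {
  static float[] embed(KnnVectorDict dict, String text) throws IOException {
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(text));
    try (KnnVectorDictFilter filter =
        new KnnVectorDictFilter(new LowerCaseFilter(tokenizer), dict)) {
      filter.reset();
      while (filter.incrementToken()) {
        // each recognized token's vector is summed into the shared result
      }
      filter.end();
      return filter.getResult();
    }
  }
}
```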

View File: lucene/demo/src/java/org/apache/lucene/demo/knn/package-info.java

@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* KnnVector example code.
*
* <p>The vector dictionary used in the demo is taken from the GloVe project hosted at
* https://nlp.stanford.edu/projects/glove, whose data is in the public domain, as described by
* http://opendatacommons.org/licenses/pddl/1.0, available in the Lucene distribution as
* lucene/licenses/pddl-10.txt.
*/
package org.apache.lucene.demo.knn;

View File: lucene/demo/src/test/org/apache/lucene/demo/TestDemo.java

@ -20,6 +20,7 @@ import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.nio.charset.Charset;
import java.nio.file.Path;
import org.apache.lucene.demo.knn.KnnVectorDict;
import org.apache.lucene.util.LuceneTestCase;
public class TestDemo extends LuceneTestCase {
@ -28,12 +29,13 @@ public class TestDemo extends LuceneTestCase {
PrintStream outSave = System.out;
try {
ByteArrayOutputStream bytes = new ByteArrayOutputStream();
-      PrintStream fakeSystemOut = new PrintStream(bytes, false, Charset.defaultCharset().name());
+      PrintStream fakeSystemOut = new PrintStream(bytes, false, Charset.defaultCharset());
System.setOut(fakeSystemOut);
-      SearchFiles.main(new String[] {"-query", query, "-index", indexPath.toString()});
+      SearchFiles.main(
+          new String[] {"-query", query, "-index", indexPath.toString(), "-paging", "20"});
fakeSystemOut.flush();
String output =
-          bytes.toString(Charset.defaultCharset().name()); // intentionally use default encoding
+          bytes.toString(Charset.defaultCharset()); // intentionally use default encoding
assertTrue(
"output=" + output, output.contains(expectedHitCount + " total matching documents"));
} finally {
@ -53,4 +55,66 @@ public class TestDemo extends LuceneTestCase {
testOneSearch(indexDir, "derivative", 8);
testOneSearch(indexDir, "license", 13);
}
private void testVectorSearch(Path indexPath, String query, int expectedHitCount)
throws Exception {
testVectorSearch(indexPath, query, expectedHitCount, expectedHitCount);
}
private void testVectorSearch(
Path indexPath, String query, int expectedMinHitCount, int expectedMaxHitCount)
throws Exception {
PrintStream outSave = System.out;
try {
ByteArrayOutputStream bytes = new ByteArrayOutputStream();
PrintStream fakeSystemOut = new PrintStream(bytes, false, Charset.defaultCharset());
System.setOut(fakeSystemOut);
SearchFiles.main(
new String[] {
"-query", query, "-index", indexPath.toString(), "-knn_vector", "1", "-paging", "20"
});
fakeSystemOut.flush();
String output =
bytes.toString(Charset.defaultCharset()); // intentionally use default encoding
int offset = output.indexOf(" total matching documents");
int hitCount =
Integer.parseInt(output.substring(output.lastIndexOf('\n', offset) + 1, offset));
assertTrue(
"unexpected hit count " + hitCount + " for query: " + query,
hitCount >= expectedMinHitCount && hitCount <= expectedMaxHitCount);
} finally {
System.setOut(outSave);
}
}
public void testKnnVectorSearch() throws Exception {
Path dir = getDataPath("test-files/docs");
Path indexDir = createTempDir("ContribDemoTest");
Path dictPath = indexDir.resolve("knn-dict");
Path vectorDictSource = getDataPath("test-files/knn-dict").resolve("knn-token-vectors");
KnnVectorDict.build(vectorDictSource, dictPath);
IndexFiles.main(
new String[] {
"-create",
"-docs",
dir.toString(),
"-index",
indexDir.toString(),
"-knn_dict",
dictPath.toString()
});
// We add a single semantic hit by passing the "-knn_vector 1" argument to SearchFiles. The
// term-based matches are usually also the best semantic matches and overlap, but sometimes due
// to randomness in the vector search algorithm, it picks a different top hit.
testVectorSearch(indexDir, "apache", 3, 4);
testVectorSearch(indexDir, "gnu", 6, 7);
testVectorSearch(indexDir, "derivative", 8, 9);
testVectorSearch(indexDir, "patent", 9, 10);
testVectorSearch(indexDir, "license", 13, 14);
// this matched 0 by token; semantic matching always adds one
testVectorSearch(indexDir, "lucene", 1);
}
}

View File: lucene/demo/src/test/org/apache/lucene/demo/knn/TestDemoEmbeddings.java

@ -0,0 +1,65 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.demo.knn;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.file.Path;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.VectorUtil;
public class TestDemoEmbeddings extends LuceneTestCase {
public void testComputeEmbedding() throws IOException {
Path testVectors = getDataPath("../test-files/knn-dict").resolve("knn-token-vectors");
Path dictPath = createTempDir("knn-demo").resolve("dict");
KnnVectorDict.build(testVectors, dictPath);
try (KnnVectorDict dict = new KnnVectorDict(dictPath)) {
DemoEmbeddings demoEmbeddings = new DemoEmbeddings(dict);
// test garbage
float[] garbageVector =
demoEmbeddings.computeEmbedding("garbagethathasneverbeen seeneverinlife");
assertEquals(50, garbageVector.length);
assertArrayEquals(new float[50], garbageVector, 0);
// test space
assertArrayEquals(new float[50], demoEmbeddings.computeEmbedding(" "), 0);
// test some real words that are in the dictionary and some that are not
float[] realVector = demoEmbeddings.computeEmbedding("the real fact");
assertEquals(50, realVector.length);
float[] the = getTermVector(dict, "the");
assertArrayEquals(new float[50], getTermVector(dict, "real"), 0);
float[] fact = getTermVector(dict, "fact");
VectorUtil.add(the, fact);
VectorUtil.l2normalize(the);
assertArrayEquals(the, realVector, 0);
}
}
private float[] getTermVector(KnnVectorDict dict, String term) throws IOException {
byte[] bytes = new byte[200];
dict.get(new BytesRef(term), bytes);
float[] vector = new float[50];
ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer().get(vector);
return vector;
}
}

View File: lucene/demo/src/test/org/apache/lucene/demo/knn/TestKnnVectorDict.java

@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.demo.knn;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Arrays;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
public class TestKnnVectorDict extends LuceneTestCase {
public void testBuild() throws IOException {
Path testVectors = getDataPath("../test-files/knn-dict").resolve("knn-token-vectors");
Path dictPath = createTempDir("knn-demo").resolve("dict");
KnnVectorDict.build(testVectors, dictPath);
try (KnnVectorDict dict = new KnnVectorDict(dictPath)) {
assertEquals(50, dict.getDimension());
byte[] vector = new byte[dict.getDimension() * Float.BYTES];
// not found token has zero vector
dict.get(new BytesRef("never saw this token"), vector);
assertArrayEquals(new byte[200], vector);
// found token has nonzero vector
dict.get(new BytesRef("the"), vector);
assertFalse(Arrays.equals(new byte[200], vector));
// incorrect dimension for output buffer
expectThrows(
IllegalArgumentException.class, () -> dict.get(new BytesRef("the"), new byte[10]));
}
}
}

File diff suppressed because it is too large

View File: lucene/licenses/pddl-10.txt (new file, 208 lines)

@ -0,0 +1,208 @@
Public Domain Dedication and License (PDDL)
Preamble
The Open Data Commons Public Domain Dedication and Licence is a document intended to allow you to freely share, modify, and use this work for any purpose and without any restrictions. This licence is intended for use on databases or their contents (“data”), either together or individually.
Many databases are covered by copyright. Some jurisdictions, mainly in Europe, have specific special rights that cover databases called the “sui generis” database right. Both of these sets of rights, as well as other legal rights used to protect databases and data, can create uncertainty or practical difficulty for those wishing to share databases and their underlying data but retain a limited amount of rights under a “some rights reserved” approach to licensing as outlined in the Science Commons Protocol for Implementing Open Access Data. As a result, this waiver and licence tries to the fullest extent possible to eliminate or fully license any rights that cover this database and data. Any Community Norms or similar statements of use of the database or data do not form a part of this document, and do not act as a contract for access or other terms of use for the database or data.
The position of the recipient of the work
Because this document places the database and its contents in or as close as possible within the public domain, there are no restrictions or requirements placed on the recipient by this document. Recipients may use this work commercially, use technical protection measures, combine this data or database with other databases or data, and share their changes and additions or keep them secret. It is not a requirement that recipients provide further users with a copy of this licence or attribute the original creator of the data or database as a source. The goal is to eliminate restrictions held by the original creator of the data and database on the use of it by others.
The position of the dedicator of the work
Copyright law, as with most other law under the banner of “intellectual property”, is inherently national law. This means that there exists several differences in how copyright and other IP rights can be relinquished, waived or licensed in the many legal jurisdictions of the world. This is despite much harmonisation of minimum levels of protection. The internet and other communication technologies span these many disparate legal jurisdictions and thus pose special difficulties for a document relinquishing and waiving intellectual property rights, including copyright and database rights, for use by the global community. Because of this feature of intellectual property law, this document first relinquishes the rights and waives the relevant rights and claims. It then goes on to license these same rights for jurisdictions or areas of law that may make it difficult to relinquish or waive rights or claims.
The purpose of this document is to enable rightsholders to place their work into the public domain. Unlike licences for free and open source software, free cultural works, or open content licences, rightsholders will not be able to “dual license” their work by releasing the same work under different licences. This is because they have allowed anyone to use the work in whatever way they choose. Rightsholders therefore can't re-license it under copyright or database rights on different terms because they have nothing left to license. Doing so creates truly accessible data to build rich applications and advance the progress of science and the arts.
This document can cover either or both of the database and its contents (the data). Because databases can have a wide variety of content – not just factual data – rightsholders should use the Open Data Commons Public Domain Dedication & Licence for an entire database and its contents only if everything can be placed under the terms of this document. Because even factual data can sometimes have intellectual property rights, rightsholders should use this licence to cover both the database and its factual data when making material available under this document; even if it is likely that the data would not be covered by copyright or database rights.
Rightsholders can also use this document to cover any copyright or database rights claims over only a database, and leave the contents to be covered by other licences or documents. They can do this because this document refers to the “Work”, which can be either or both the database and its contents. As a result, rightsholders need to clearly state what they are dedicating under this document when they dedicate it.
Just like any licence or other document dealing with intellectual property, rightsholders should be aware that one can only license what one owns. Please ensure that the rights have been cleared to make this material available under this document.
This document permanently and irrevocably makes the Work available to the public for any use of any kind, and it should not be used unless the rightsholder is prepared for this to happen.
Part I: Introduction
The Rightsholder (the Person holding rights or claims over the Work) agrees as follows:
1.0 Definitions of Capitalised Words
“Copyright” Includes rights under copyright and under neighbouring rights and similarly related sets of rights under the law of the relevant jurisdiction under Section 6.4.
“Data” The contents of the Database, which includes the information, independent works, or other material collected into the Database offered under the terms of this Document.
“Database” A collection of Data arranged in a systematic or methodical way and individually accessible by electronic or other means offered under the terms of this Document.
“Database Right” Means rights over Data resulting from the Chapter III (“sui generis”) rights in the Database Directive (Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases) and any future updates as well as any similar rights available in the relevant jurisdiction under Section 6.4.
“Document” means this relinquishment and waiver of rights and claims and back up licence agreement.
“Person” Means a natural or legal person or a body of persons corporate or incorporate.
“Use” As a verb, means doing any act that is restricted by Copyright or Database Rights whether in the original medium or any other; and includes modifying the Work as may be technically necessary to use it in a different mode or format. This includes the right to sublicense the Work.
“Work” Means either or both of the Database and Data offered under the terms of this Document.
“You” the Person acquiring rights under the licence elements of this Document.
Words in the singular include the plural and vice versa.
2.0 What this document covers
2.1. Legal effect of this Document. This Document is:
a. A dedication to the public domain and waiver of Copyright and Database Rights over the Work; and
b. A licence of Copyright and Database Rights over the Work in jurisdictions that do not allow for relinquishment or waiver.
2.2. Legal rights covered.
a. Copyright. Any copyright or neighbouring rights in the Work. Copyright law varies between jurisdictions, but is likely to cover: the Database model or schema, which is the structure, arrangement, and organisation of the Database, and can also include the Database tables and table indexes; the data entry and output sheets; and the Field names of Data stored in the Database. Copyright may also cover the Data depending on the jurisdiction and type of Data; and
b. Database Rights. Database Rights only extend to the extraction and re-utilisation of the whole or a substantial part of the Data. Database Rights can apply even when there is no copyright over the Database. Database Rights can also apply when the Data is removed from the Database and is selected and arranged in a way that would not infringe any applicable copyright.
2.2 Rights not covered.
a. This Document does not apply to computer programs used in the making or operation of the Database;
b. This Document does not cover any patents over the Data or the Database. Please see Section 4.2 later in this Document for further details; and
c. This Document does not cover any trade marks associated with the Database. Please see Section 4.3 later in this Document for further details.
Users of this Database are cautioned that they may have to clear other rights or consult other licences.
2.3 Facts are free. The Rightsholder takes the position that factual information is not covered by Copyright. This Document however covers the Work in jurisdictions that may protect the factual information in the Work by Copyright, and to cover any information protected by Copyright that is contained in the Work.
Part II: Dedication to the public domain
3.0 Dedication, waiver, and licence of Copyright and Database Rights
3.1 Dedication of Copyright and Database Rights to the public domain. The Rightsholder by using this Document, dedicates the Work to the public domain for the benefit of the public and relinquishes all rights in Copyright and Database Rights over the Work.
a. The Rightsholder realises that once these rights are relinquished, that the Rightsholder has no further rights in Copyright and Database Rights over the Work, and that the Work is free and open for others to Use.
b. The Rightsholder intends for their relinquishment to cover all present and future rights in the Work under Copyright and Database Rights, whether they are vested or contingent rights, and that this relinquishment of rights covers all their heirs and successors.
The above relinquishment of rights applies worldwide and includes media and formats now known or created in the future.
3.2 Waiver of rights and claims in Copyright and Database Rights when Section 3.1 dedication inapplicable. If the dedication in Section 3.1 does not apply in the relevant jurisdiction under Section 6.4, the Rightsholder waives any rights and claims that the Rightsholder may have or acquire in the future over the Work in:
a. Copyright; and
b. Database Rights.
To the extent possible in the relevant jurisdiction, the above waiver of rights and claims applies worldwide and includes media and formats now known or created in the future. The Rightsholder agrees not to assert the above rights and waives the right to enforce them over the Work.
3.3 Licence of Copyright and Database Rights when Sections 3.1 and 3.2 inapplicable. If the dedication and waiver in Sections 3.1 and 3.2 does not apply in the relevant jurisdiction under Section 6.4, the Rightsholder and You agree as follows:
a. The Licensor grants to You a worldwide, royalty-free, non-exclusive, licence to Use the Work for the duration of any applicable Copyright and Database Rights. These rights explicitly include commercial use, and do not exclude any field of endeavour. To the extent possible in the relevant jurisdiction, these rights may be exercised in all media and formats whether now known or created in the future.
3.4 Moral rights. This section covers moral rights, including the right to be identified as the author of the Work or to object to treatment that would otherwise prejudice the author's honour and reputation, or any other derogatory treatment:
a. For jurisdictions allowing waiver of moral rights, Licensor waives all moral rights that Licensor may have in the Work to the fullest extent possible by the law of the relevant jurisdiction under Section 6.4;
b. If waiver of moral rights under Section 3.4 a in the relevant jurisdiction is not possible, Licensor agrees not to assert any moral rights over the Work and waives all claims in moral rights to the fullest extent possible by the law of the relevant jurisdiction under Section 6.4; and
c. For jurisdictions not allowing waiver or an agreement not to assert moral rights under Section 3.4 a and b, the author may retain their moral rights over the copyrighted aspects of the Work.
Please note that some jurisdictions do not allow for the waiver of moral rights, and so moral rights may still subsist over the work in some jurisdictions.
4.0 Relationship to other rights
4.1 No other contractual conditions. The Rightsholder makes this Work available to You without any other contractual obligations, either express or implied. Any Community Norms statement associated with the Work is not a contract and does not form part of this Document.
4.2 Relationship to patents. This Document does not grant You a licence for any patents that the Rightsholder may own. Users of this Database are cautioned that they may have to clear other rights or consult other licences.
4.3 Relationship to trade marks. This Document does not grant You a licence for any trade marks that the Rightsholder may own or that the Rightsholder may use to cover the Work. Users of this Database are cautioned that they may have to clear other rights or consult other licences.
Part III: General provisions
5.0 Warranties, disclaimer, and limitation of liability
5.1 The Work is provided by the Rightsholder “as is” and without any warranty of any kind, either express or implied, whether of title, of accuracy or completeness, of the presence or absence of errors, of fitness for purpose, or otherwise. Some jurisdictions do not allow the exclusion of implied warranties, so this exclusion may not apply to You.
5.2 Subject to any liability that may not be excluded or limited by law, the Rightsholder is not liable for, and expressly excludes, all liability for loss or damage however and whenever caused to anyone by any use under this Document, whether by You or by anyone else, and whether caused by any fault on the part of the Rightsholder or not. This exclusion of liability includes, but is not limited to, any special, incidental, consequential, punitive, or exemplary damages. This exclusion applies even if the Rightsholder has been advised of the possibility of such damages.
5.3 If liability may not be excluded by law, it is limited to actual and direct financial loss to the extent it is caused by proved negligence on the part of the Rightsholder.
6.0 General
6.1 If any provision of this Document is held to be invalid or unenforceable, that must not affect the validity or enforceability of the remainder of the terms of this Document.
6.2 This Document is the entire agreement between the parties with respect to the Work covered here. It replaces any earlier understandings, agreements or representations with respect to the Work not specified here.
6.3 This Document does not affect any rights that You or anyone else may independently have under any applicable law to make any use of this Work, including (for jurisdictions where this Document is a licence) fair dealing, fair use, database exceptions, or any other legally recognised limitation or exception to infringement of copyright or other applicable laws.
6.4 This Document takes effect in the relevant jurisdiction in which the Document terms are sought to be enforced. If the rights waived or granted under applicable law in the relevant jurisdiction includes additional rights not waived or granted under this Document, these additional rights are included in this Document in order to meet the intent of this Document.