LUCENE-7093: Added point values support to the memory index

This commit is contained in:
Martijn van Groningen 2016-03-15 12:32:46 +01:00
parent 82c06190a3
commit c1dfeb8ef8
4 changed files with 389 additions and 17 deletions

View File

@ -209,6 +209,9 @@ Other
* LUCENE-7091: Add doc values support to MemoryIndex
(Martijn van Groningen, David Smiley)
* LUCENE-7093: Add point values support to MemoryIndex
(Martijn van Groningen, Mike McCandless)
======================= Lucene 5.5.0 =======================
New Features

View File

@ -258,7 +258,8 @@ public class MemoryIndex {
throw new IllegalArgumentException("analyzer must not be null");
TokenStream stream = analyzer.tokenStream(fieldName, text);
addField(fieldName, stream, 1.0f, analyzer.getPositionIncrementGap(fieldName), analyzer.getOffsetGap(fieldName), DocValuesType.NONE, null);
addField(fieldName, stream, 1.0f, analyzer.getPositionIncrementGap(fieldName), analyzer.getOffsetGap(fieldName),
DocValuesType.NONE, null, 0, 0, null);
}
/**
@ -377,10 +378,6 @@ public class MemoryIndex {
* structures are not supported by MemoryIndex
*/
public void addField(IndexableField field, Analyzer analyzer, float boost) {
if (field.fieldType().pointDimensionCount() != 0) {
throw new IllegalArgumentException("MemoryIndex does not support Points");
}
int offsetGap;
TokenStream tokenStream;
int positionIncrementGap;
@ -412,7 +409,12 @@ public class MemoryIndex {
default:
throw new UnsupportedOperationException("unknown doc values type [" + docValuesType + "]");
}
addField(field.name(), tokenStream, boost, positionIncrementGap, offsetGap, docValuesType, docValuesValue);
BytesRef pointValue = null;
if (field.fieldType().pointDimensionCount() > 0) {
pointValue = field.binaryValue();
}
addField(field.name(), tokenStream, boost, positionIncrementGap, offsetGap, docValuesType, docValuesValue,
field.fieldType().pointDimensionCount(), field.fieldType().pointNumBytes(), pointValue);
}
/**
@ -481,11 +483,12 @@ public class MemoryIndex {
* @see org.apache.lucene.document.Field#setBoost(float)
*/
public void addField(String fieldName, TokenStream tokenStream, float boost, int positionIncrementGap, int offsetGap) {
addField(fieldName, tokenStream, boost, positionIncrementGap, offsetGap, DocValuesType.NONE, null);
addField(fieldName, tokenStream, boost, positionIncrementGap, offsetGap, DocValuesType.NONE, null, 0, 0, null);
}
private void addField(String fieldName, TokenStream tokenStream, float boost, int positionIncrementGap, int offsetGap,
DocValuesType docValuesType, Object docValuesValue) {
DocValuesType docValuesType, Object docValuesValue, int pointDimensionCount, int pointNumBytes,
BytesRef pointValue) {
if (frozen) {
throw new IllegalArgumentException("Cannot call addField() when MemoryIndex is frozen");
@ -503,7 +506,9 @@ public class MemoryIndex {
FieldInfo fieldInfo = new FieldInfo(fieldName, fields.size(), true, false, storePayloads, indexOptions, docValuesType, -1, Collections.emptyMap(), 0, 0);
fields.put(fieldName, info = new Info(fieldInfo, byteBlockPool));
}
if (pointDimensionCount > 0) {
storePointValues(info, pointDimensionCount, pointNumBytes, pointValue);
}
if (docValuesType != DocValuesType.NONE) {
storeDocValues(info, docValuesType, docValuesValue);
}
@ -512,6 +517,15 @@ public class MemoryIndex {
}
}
private void storePointValues(Info info, int pointDimensionCount, int pointNumBytes, BytesRef pointValue) {
info.fieldInfo.setPointDimensions(pointDimensionCount, pointNumBytes);
if (info.pointValues == null) {
info.pointValues = new BytesRef[4];
}
info.pointValues = ArrayUtil.grow(info.pointValues, info.pointValuesCount + 1);
info.pointValues[info.pointValuesCount++] = BytesRef.deepCopyOf(pointValue);
}
private void storeDocValues(Info info, DocValuesType docValuesType, Object docValuesValue) {
String fieldName = info.fieldInfo.name;
DocValuesType existingDocValuesType = info.fieldInfo.getDocValuesType();
@ -829,7 +843,15 @@ public class MemoryIndex {
private NumericDocValuesProducer numericProducer;
private boolean preparedDocValues;
private boolean preparedDocValuesAndPointValues;
private BytesRef[] pointValues;
private byte[] minPackedValue;
private byte[] maxPackedValue;
private int pointValuesCount;
private Info(FieldInfo fieldInfo, ByteBlockPool byteBlockPool) {
this.fieldInfo = fieldInfo;
@ -841,7 +863,7 @@ public class MemoryIndex {
void freeze() {
sortTerms();
prepareDocValues();
prepareDocValuesAndPointValues();
getNormDocValues();
}
@ -859,8 +881,8 @@ public class MemoryIndex {
}
}
void prepareDocValues() {
if (preparedDocValues == false) {
void prepareDocValuesAndPointValues() {
if (preparedDocValuesAndPointValues == false) {
DocValuesType dvType = fieldInfo.getDocValuesType();
if (dvType == DocValuesType.NUMERIC || dvType == DocValuesType.SORTED_NUMERIC) {
numericProducer.prepareForUsage();
@ -868,7 +890,30 @@ public class MemoryIndex {
if (dvType == DocValuesType.BINARY || dvType == DocValuesType.SORTED || dvType == DocValuesType.SORTED_SET) {
binaryProducer.prepareForUsage();
}
preparedDocValues = true;
if (pointValues != null) {
assert pointValues[0].bytes.length == pointValues[0].length : "BytesRef should wrap a precise byte[], BytesRef.deepCopyOf() should take care of this";
final int numDimensions = fieldInfo.getPointDimensionCount();
final int numBytesPerDimension = fieldInfo.getPointNumBytes();
minPackedValue = pointValues[0].bytes.clone();
maxPackedValue = pointValues[0].bytes.clone();
for (int i = 0; i < pointValuesCount; i++) {
BytesRef pointValue = pointValues[i];
assert pointValue.bytes.length == pointValue.length : "BytesRef should wrap a precise byte[], BytesRef.deepCopyOf() should take care of this";
for (int dim = 0; dim < numDimensions; ++dim) {
int offset = dim * numBytesPerDimension;
if (StringHelper.compare(numBytesPerDimension, pointValue.bytes, offset, minPackedValue, offset) < 0) {
System.arraycopy(pointValue.bytes, offset, minPackedValue, offset, numBytesPerDimension);
}
if (StringHelper.compare(numBytesPerDimension, pointValue.bytes, offset, maxPackedValue, offset) > 0) {
System.arraycopy(pointValue.bytes, offset, maxPackedValue, offset, numBytesPerDimension);
}
}
}
}
preparedDocValuesAndPointValues = true;
}
}
@ -977,11 +1022,22 @@ public class MemoryIndex {
* required by the Lucene IndexReader contracts.
*/
private final class MemoryIndexReader extends LeafReader {
private final PointValues pointValues;
private MemoryIndexReader() {
super(); // avoid as much superclass baggage as possible
boolean hasPointValues = false;
for (Info info : fields.values()) {
info.prepareDocValues();
info.prepareDocValuesAndPointValues();
if (info.pointValues != null) {
hasPointValues = true;
}
}
if (hasPointValues) {
pointValues = new MemoryIndexPointValues();
} else {
pointValues = null;
}
}
@ -1111,7 +1167,7 @@ public class MemoryIndex {
@Override
public PointValues getPointValues() {
return null;
return pointValues;
}
@Override
@ -1412,6 +1468,101 @@ public class MemoryIndex {
return 1;
}
}
private class MemoryIndexPointValues extends PointValues {
@Override
public void intersect(String fieldName, IntersectVisitor visitor) throws IOException {
Info info = fields.get(fieldName);
if (info == null) {
return;
}
BytesRef[] values = info.pointValues;
if (values == null) {
return;
}
visitor.grow(info.pointValuesCount);
for (int i = 0; i < info.pointValuesCount; i++) {
visitor.visit(0, values[i].bytes);
}
}
@Override
public byte[] getMinPackedValue(String fieldName) throws IOException {
Info info = fields.get(fieldName);
if (info == null) {
return null;
}
BytesRef[] values = info.pointValues;
if (values != null) {
return info.minPackedValue;
} else {
return null;
}
}
@Override
public byte[] getMaxPackedValue(String fieldName) throws IOException {
Info info = fields.get(fieldName);
if (info == null) {
return null;
}
BytesRef[] values = info.pointValues;
if (values != null) {
return info.maxPackedValue;
} else {
return null;
}
}
@Override
public int getNumDimensions(String fieldName) throws IOException {
Info info = fields.get(fieldName);
if (info == null){
return 0;
}
return info.fieldInfo.getPointDimensionCount();
}
@Override
public int getBytesPerDimension(String fieldName) throws IOException {
Info info = fields.get(fieldName);
if (info == null){
return 0;
}
return info.fieldInfo.getPointNumBytes();
}
@Override
public long size(String fieldName) {
Info info = fields.get(fieldName);
if (info == null) {
return 0;
}
BytesRef[] values = info.pointValues;
if (values != null) {
return info.pointValuesCount;
} else {
return 0;
}
}
@Override
public int getDocCount(String fieldName) {
Info info = fields.get(fieldName);
if (info == null) {
return 0;
}
BytesRef[] values = info.pointValues;
if (values != null) {
return 1;
} else {
return 0;
}
}
}
@Override
public Fields getTermVectors(int docID) {

View File

@ -17,13 +17,28 @@
package org.apache.lucene.index.memory;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.IntStream;
import java.util.stream.LongStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockPayloadAnalyzer;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.BinaryPoint;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoublePoint;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.FloatPoint;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedNumericDocValuesField;
@ -31,9 +46,12 @@ import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
@ -44,7 +62,9 @@ import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.util.BytesRef;
@ -310,4 +330,144 @@ public class TestMemoryIndex extends LuceneTestCase {
assertEquals("quick brown fox", binaryDocValues.get(0).utf8ToString());
}
public void testPointValues() throws Exception {
List<Function<Long, IndexableField>> fieldFunctions = Arrays.asList(
(t) -> new IntPoint("number", t.intValue()),
(t) -> new LongPoint("number", t),
(t) -> new FloatPoint("number", t.floatValue()),
(t) -> new DoublePoint("number", t.doubleValue())
);
List<Function<Long, Query>> exactQueryFunctions = Arrays.asList(
(t) -> IntPoint.newExactQuery("number", t.intValue()),
(t) -> LongPoint.newExactQuery("number", t),
(t) -> FloatPoint.newExactQuery("number", t.floatValue()),
(t) -> DoublePoint.newExactQuery("number", t.doubleValue())
);
List<Function<long[], Query>> setQueryFunctions = Arrays.asList(
(t) -> IntPoint.newSetQuery("number", LongStream.of(t).mapToInt(value -> (int) value).toArray()),
(t) -> LongPoint.newSetQuery("number", t),
(t) -> FloatPoint.newSetQuery("number", Arrays.asList(LongStream.of(t).mapToObj(value -> (float) value).toArray(Float[]::new))),
(t) -> DoublePoint.newSetQuery("number", LongStream.of(t).mapToDouble(value -> (double) value).toArray())
);
List<BiFunction<Long, Long, Query>> rangeQueryFunctions = Arrays.asList(
(t, u) -> IntPoint.newRangeQuery("number", t.intValue(), u.intValue()),
(t, u) -> LongPoint.newRangeQuery("number", t, u),
(t, u) -> FloatPoint.newRangeQuery("number", t.floatValue(), u.floatValue()),
(t, u) -> DoublePoint.newRangeQuery("number", t.doubleValue(), u.doubleValue())
);
for (int i = 0; i < fieldFunctions.size(); i++) {
Function<Long, IndexableField> fieldFunction = fieldFunctions.get(i);
Function<Long, Query> exactQueryFunction = exactQueryFunctions.get(i);
Function<long[], Query> setQueryFunction = setQueryFunctions.get(i);
BiFunction<Long, Long, Query> rangeQueryFunction = rangeQueryFunctions.get(i);
Document doc = new Document();
for (int number = 1; number < 32; number += 2) {
doc.add(fieldFunction.apply((long) number));
}
MemoryIndex mi = MemoryIndex.fromDocument(doc, analyzer);
IndexSearcher indexSearcher = mi.createSearcher();
Query query = exactQueryFunction.apply(5L);
assertEquals(1, indexSearcher.count(query));
query = exactQueryFunction.apply(4L);
assertEquals(0, indexSearcher.count(query));
query = setQueryFunction.apply(new long[]{3L, 9L, 19L});
assertEquals(1, indexSearcher.count(query));
query = setQueryFunction.apply(new long[]{2L, 8L, 13L});
assertEquals(1, indexSearcher.count(query));
query = setQueryFunction.apply(new long[]{2L, 8L, 16L});
assertEquals(0, indexSearcher.count(query));
query = rangeQueryFunction.apply(2L, 16L);
assertEquals(1, indexSearcher.count(query));
query = rangeQueryFunction.apply(24L, 48L);
assertEquals(1, indexSearcher.count(query));
query = rangeQueryFunction.apply(48L, 68L);
assertEquals(0, indexSearcher.count(query));
}
}
public void testPointValuesDoNotAffectBoostPositionsOrOffset() throws Exception {
MemoryIndex mi = new MemoryIndex(true, true);
mi.addField(new TextField("text", "quick brown fox", Field.Store.NO), analyzer, 5f);
mi.addField(new BinaryPoint("text", "quick".getBytes(StandardCharsets.UTF_8)), analyzer, 5f);
mi.addField(new BinaryPoint("text", "brown".getBytes(StandardCharsets.UTF_8)), analyzer, 5f);
LeafReader leafReader = mi.createSearcher().getIndexReader().leaves().get(0).reader();
TermsEnum tenum = leafReader.terms("text").iterator();
assertEquals("brown", tenum.next().utf8ToString());
PostingsEnum penum = tenum.postings(null, PostingsEnum.OFFSETS);
assertEquals(0, penum.nextDoc());
assertEquals(1, penum.freq());
assertEquals(1, penum.nextPosition());
assertEquals(6, penum.startOffset());
assertEquals(11, penum.endOffset());
assertEquals("fox", tenum.next().utf8ToString());
penum = tenum.postings(penum, PostingsEnum.OFFSETS);
assertEquals(0, penum.nextDoc());
assertEquals(1, penum.freq());
assertEquals(2, penum.nextPosition());
assertEquals(12, penum.startOffset());
assertEquals(15, penum.endOffset());
assertEquals("quick", tenum.next().utf8ToString());
penum = tenum.postings(penum, PostingsEnum.OFFSETS);
assertEquals(0, penum.nextDoc());
assertEquals(1, penum.freq());
assertEquals(0, penum.nextPosition());
assertEquals(0, penum.startOffset());
assertEquals(5, penum.endOffset());
IndexSearcher indexSearcher = mi.createSearcher();
assertEquals(1, indexSearcher.count(BinaryPoint.newExactQuery("text", "quick".getBytes(StandardCharsets.UTF_8))));
assertEquals(1, indexSearcher.count(BinaryPoint.newExactQuery("text", "brown".getBytes(StandardCharsets.UTF_8))));
assertEquals(0, indexSearcher.count(BinaryPoint.newExactQuery("text", "jumps".getBytes(StandardCharsets.UTF_8))));
}
public void test2DPoints() throws Exception {
Document doc = new Document();
doc.add(new IntPoint("ints", 0, -100));
doc.add(new IntPoint("ints", 20, 20));
doc.add(new IntPoint("ints", 100, -100));
doc.add(new LongPoint("longs", 0L, -100L));
doc.add(new LongPoint("longs", 20L, 20L));
doc.add(new LongPoint("longs", 100L, -100L));
doc.add(new FloatPoint("floats", 0F, -100F));
doc.add(new FloatPoint("floats", 20F, 20F));
doc.add(new FloatPoint("floats", 100F, -100F));
doc.add(new DoublePoint("doubles", 0D, -100D));
doc.add(new DoublePoint("doubles", 20D, 20D));
doc.add(new DoublePoint("doubles", 100D, -100D));
MemoryIndex mi = MemoryIndex.fromDocument(doc, analyzer);
IndexSearcher s = mi.createSearcher();
assertEquals(1, s.count(IntPoint.newRangeQuery("ints", new int[] {10, 10}, new int[] {30, 30})));
assertEquals(1, s.count(LongPoint.newRangeQuery("longs", new long[] {10L, 10L}, new long[] {30L, 30L})));
assertEquals(1, s.count(FloatPoint.newRangeQuery("floats", new float[] {10F, 10F}, new float[] {30F, 30F})));
assertEquals(1, s.count(DoublePoint.newRangeQuery("doubles", new double[] {10D, 10D}, new double[] {30D, 30D})));
}
public void testIndexingPointsAndDocValues() throws Exception {
FieldType type = new FieldType();
type.setDimensions(1, 4);
type.setDocValuesType(DocValuesType.BINARY);
type.freeze();
Document doc = new Document();
byte[] packedPoint = "term".getBytes(StandardCharsets.UTF_8);
doc.add(new BinaryPoint("field", packedPoint, type));
MemoryIndex mi = MemoryIndex.fromDocument(doc, analyzer);
LeafReader leafReader = mi.createSearcher().getIndexReader().leaves().get(0).reader();
assertEquals(1, leafReader.getPointValues().size("field"));
assertArrayEquals(packedPoint, leafReader.getPointValues().getMinPackedValue("field"));
assertArrayEquals(packedPoint, leafReader.getPointValues().getMaxPackedValue("field"));
assertEquals("term", leafReader.getBinaryDocValues("field").get(0).utf8ToString());
}
}

View File

@ -21,8 +21,12 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.function.IntSupplier;
import java.util.function.Supplier;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@ -37,9 +41,13 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoublePoint;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.FloatPoint;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.LegacyLongField;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedNumericDocValuesField;
@ -70,6 +78,7 @@ import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
@ -568,6 +577,55 @@ public class TestMemoryIndexAgainstRAMDir extends BaseTokenStreamTestCase {
dir.close();
}
public void testPointValuesMemoryIndexVsNormalIndex() throws Exception {
int size = atLeast(12);
List<Integer> randomValues = new ArrayList<>();
Document doc = new Document();
for (Integer randomInteger : random().ints(size).toArray()) {
doc.add(new IntPoint("int", randomInteger));
randomValues.add(randomInteger);
doc.add(new LongPoint("long", randomInteger));
doc.add(new FloatPoint("float", randomInteger));
doc.add(new DoublePoint("double", randomInteger));
}
MockAnalyzer mockAnalyzer = new MockAnalyzer(random());
MemoryIndex memoryIndex = MemoryIndex.fromDocument(doc, mockAnalyzer);
IndexSearcher memoryIndexSearcher = memoryIndex.createSearcher();
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(random(), mockAnalyzer));
writer.addDocument(doc);
writer.close();
IndexReader controlIndexReader = DirectoryReader.open(dir);
IndexSearcher controlIndexSearcher = new IndexSearcher(controlIndexReader);
Supplier<Integer> valueSupplier = () -> randomValues.get(random().nextInt(randomValues.size()));
Query[] queries = new Query[] {
IntPoint.newExactQuery("int", valueSupplier.get()),
LongPoint.newExactQuery("long", valueSupplier.get()),
FloatPoint.newExactQuery("float", valueSupplier.get()),
DoublePoint.newExactQuery("double", valueSupplier.get()),
IntPoint.newSetQuery("int", valueSupplier.get(), valueSupplier.get()),
LongPoint.newSetQuery("long", valueSupplier.get(), valueSupplier.get()),
FloatPoint.newSetQuery("float", valueSupplier.get(), valueSupplier.get()),
DoublePoint.newSetQuery("double", valueSupplier.get(), valueSupplier.get()),
IntPoint.newRangeQuery("int", valueSupplier.get(), valueSupplier.get()),
LongPoint.newRangeQuery("long", valueSupplier.get(), valueSupplier.get()),
FloatPoint.newRangeQuery("float", valueSupplier.get(), valueSupplier.get()),
DoublePoint.newRangeQuery("double", valueSupplier.get(), valueSupplier.get())
};
for (Query query : queries) {
assertEquals(controlIndexSearcher.count(query), controlIndexSearcher.count(query));
}
memoryIndexSearcher.getIndexReader().close();
controlIndexReader.close();
dir.close();
}
public void testDuellMemIndex() throws IOException {
LineFileDocs lineFileDocs = new LineFileDocs(random());
int numDocs = atLeast(10);