From 013abeaa5b7b1a817f5e94016fccaee93fafd732 Mon Sep 17 00:00:00 2001
From: Mike McCandless <mikemccand@apache.org>
Date: Tue, 23 Feb 2016 16:34:39 -0500
Subject: [PATCH] a rough start

---
 .../apache/lucene/index/PrefixCodedTerms.java |   2 +-
 .../lucene/index/TermsHashPerField.java       |   3 +
 .../apache/lucene/search/PointInSetQuery.java | 341 ++++++++++++++++++
 .../lucene/search/TestPointQueries.java       |  35 ++
 4 files changed, 380 insertions(+), 1 deletion(-)
 create mode 100644 lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java

diff --git a/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java b/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java
index 87a9ae2c06b..3dca3dba927 100644
--- a/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java
+++ b/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java
@@ -67,7 +67,7 @@ public class PrefixCodedTerms implements Accountable {
       add(term.field(), term.bytes());
     }
 
-    /** add a term */
+    /** add a term. This fully consumes the incoming {@link BytesRef}. */
     public void add(String field, BytesRef bytes) {
       assert lastTerm.equals(new Term("")) || new Term(field, bytes).compareTo(lastTerm) > 0;
 
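(Not part of the patch.) The javadoc fix above is worth spelling out: because Builder.add fully consumes, i.e. copies, the incoming bytes, a caller may reuse a single scratch BytesRef across add calls, which is exactly what the new PointInSetQuery below does. A minimal sketch of that pattern, with invented field name and values:

    // Reuse one scratch BytesRef across add() calls; terms must arrive sorted.
    PrefixCodedTerms.Builder b = new PrefixCodedTerms.Builder();
    BytesRef scratch = new BytesRef(new byte[Integer.BYTES]);
    scratch.length = Integer.BYTES;
    for (int v : new int[] {17, 42, 97}) {      // already in sorted order
      IntPoint.encodeDimension(v, scratch.bytes, 0);
      b.add("field", scratch);                  // copies scratch's bytes
    }
    PrefixCodedTerms sorted = b.finish();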
diff --git a/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java b/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
index 3275a4e6811..b06295c9ae9 100644
--- a/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
+++ b/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
@@ -93,7 +93,10 @@ abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
 
   /** Collapse the hash table and sort in-place; also sets
    * this.sortedTermIDs to the results */
   public int[] sortPostings() {
+    long t0 = System.nanoTime();
     sortedTermIDs = bytesHash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+    // nocommit
+    System.out.println("MKM: field " + fieldInfo.name + " has " + bytesHash.size() + " unique terms " + ((System.nanoTime()-t0)/1000000.0) + " msec to sort");
     return sortedTermIDs;
   }
diff --git a/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java
new file mode 100644
index 00000000000..4403e456c32
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java
@@ -0,0 +1,341 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Objects;
+
+import org.apache.lucene.document.IntPoint;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PointValues.IntersectVisitor;
+import org.apache.lucene.index.PointValues.Relation;
+import org.apache.lucene.index.PointValues;
+import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
+import org.apache.lucene.index.PrefixCodedTerms;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.BytesRefIterator;
+import org.apache.lucene.util.DocIdSetBuilder;
+import org.apache.lucene.util.NumericUtils;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.StringHelper;
+
+/** Finds all documents whose point value, previously indexed with e.g. {@link org.apache.lucene.document.LongPoint}, is contained in the
+ *  specified set */
+
+// nocommit make abstract
+public class PointInSetQuery extends Query {
+  // A little bit overkill for us, since all of our "terms" are always in the same field:
+  final PrefixCodedTerms sortedPackedPoints;
+  final int sortedPackedPointsHashCode;
+  final String field;
+  final int numDims;
+  final int bytesPerDim;
+
+  /** {@code packedPoints} must already be sorted! */
+  protected PointInSetQuery(String field, int numDims, int bytesPerDim, BytesRefIterator packedPoints) throws IOException {
+    this.field = field;
+    // nocommit validate these:
+    this.bytesPerDim = bytesPerDim;
+    this.numDims = numDims;
+
+    // In the 1D case this works well (the more points, the more common prefixes they share, typically), but in
+    // the > 1 D case, where we are only looking at the first dimension's prefix bytes, it can at worst not hurt:
+    PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
+    BytesRefBuilder previous = null;
+    BytesRef current;
+    while ((current = packedPoints.next()) != null) {
+      if (current.length != numDims * bytesPerDim) {
+        throw new IllegalArgumentException("packed point length should be " + (numDims * bytesPerDim) + " but got " + current.length + "; field=\"" + field + "\", numDims=" + numDims + " bytesPerDim=" + bytesPerDim);
+      }
+      if (previous == null) {
+        previous = new BytesRefBuilder();
+      } else if (previous.get().equals(current)) {
+        continue; // deduplicate
+      }
+      builder.add(field, current);
+      previous.copyBytes(current);
+    }
+    sortedPackedPoints = builder.finish();
+    sortedPackedPointsHashCode = sortedPackedPoints.hashCode();
+  }
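+
+  // NOTE: like PrefixCodedTerms.Builder.add above, this constructor fully consumes each
+  // BytesRef before pulling the next one, so the incoming iterator may safely reuse a
+  // single scratch instance; newIntSet below relies on exactly that: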
+
+  /** Use in the 1D case when you indexed 1D int values using {@link org.apache.lucene.document.IntPoint} */
+  public static PointInSetQuery newIntSet(String field, int... valuesIn) {
+
+    // Don't unexpectedly change the user's incoming array:
+    int[] values = valuesIn.clone();
+
+    Arrays.sort(values);
+
+    final BytesRef value = new BytesRef(new byte[Integer.BYTES]);
+    value.length = Integer.BYTES;
+
+    try {
+      return new PointInSetQuery(field, 1, Integer.BYTES,
+          new BytesRefIterator() {
+
+            int upto;
+
+            @Override
+            public BytesRef next() {
+              if (upto == values.length) {
+                return null;
+              } else {
+                IntPoint.encodeDimension(values[upto], value.bytes, 0);
+                upto++;
+                return value;
+              }
+            }
+          });
+    } catch (IOException bogus) {
+      // Should never happen ;)
+      throw new RuntimeException(bogus);
+    }
+  }
+
+  @Override
+  public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
+
+    // We don't use RandomAccessWeight here: it's no good to approximate with "match all docs".
+    // This is an inverted structure and should be used in the first pass:
+
+    return new ConstantScoreWeight(this) {
+
+      @Override
+      public Scorer scorer(LeafReaderContext context) throws IOException {
+        LeafReader reader = context.reader();
+        PointValues values = reader.getPointValues();
+        if (values == null) {
+          // No docs in this segment indexed any points
+          return null;
+        }
+        FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
+        if (fieldInfo == null) {
+          // No docs in this segment indexed this field at all
+          return null;
+        }
+        if (fieldInfo.getPointDimensionCount() != numDims) {
+          throw new IllegalArgumentException("field=\"" + field + "\" was indexed with numDims=" + fieldInfo.getPointDimensionCount() + " but this query has numDims=" + numDims);
+        }
+        if (bytesPerDim != fieldInfo.getPointNumBytes()) {
+          throw new IllegalArgumentException("field=\"" + field + "\" was indexed with bytesPerDim=" + fieldInfo.getPointNumBytes() + " but this query has bytesPerDim=" + bytesPerDim);
+        }
+        int bytesPerDim = fieldInfo.getPointNumBytes();
+
+        DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc());
+
+        int[] hitCount = new int[1];
+        final TermIterator iterator = sortedPackedPoints.iterator();
+        byte[] pointBytes = new byte[bytesPerDim * numDims];
+
+        if (numDims == 1) {
+
+          final BytesRef scratch = new BytesRef();
+          scratch.length = bytesPerDim;
+
+          // Optimize this common case, effectively doing a merge sort of the indexed values vs the queried set:
+          values.intersect(field,
+              new IntersectVisitor() {
+
+                private BytesRef nextQueryPoint = iterator.next();
+
+                @Override
+                public void grow(int count) {
+                  result.grow(count);
+                }
+
+                @Override
+                public void visit(int docID) {
+                  hitCount[0]++;
+                  result.add(docID);
+                }
+
+                @Override
+                public void visit(int docID, byte[] packedValue) {
+                  scratch.bytes = packedValue;
+                  while (nextQueryPoint != null) {
+                    int cmp = nextQueryPoint.compareTo(scratch);
+                    if (cmp == 0) {
+                      // Query point equals index point, so collect and return
+                      hitCount[0]++;
+                      result.add(docID);
+                      break;
+                    } else if (cmp < 0) {
+                      // Query point is before index point, so we move to next query point
+                      nextQueryPoint = iterator.next();
+                    } else {
+                      // Query point is after index point, so we don't collect and we return:
+                      break;
+                    }
+                  }
+                }
+
+                @Override
+                public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+
+                  while (nextQueryPoint != null) {
+                    scratch.bytes = minPackedValue;
+                    int cmpMin = nextQueryPoint.compareTo(scratch);
+                    if (cmpMin < 0) {
+                      // query point is before the start of this cell
+                      nextQueryPoint = iterator.next();
+                      continue;
+                    }
+                    scratch.bytes = maxPackedValue;
+                    int cmpMax = nextQueryPoint.compareTo(scratch);
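+                    // Three outcomes remain: the query point sorts past this cell's max
+                    // (cell outside), the cell's min and max both equal the point (cell
+                    // inside), or the point lies within the cell's range (cell crosses):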
+                    if (cmpMax > 0) {
+                      // query point is after the end of this cell
+                      return Relation.CELL_OUTSIDE_QUERY;
+                    }
+
+                    if (cmpMin == 0 && cmpMax == 0) {
+                      // NOTE: we only hit this if we are on a cell whose min and max values are exactly equal to our point,
+                      // which can easily happen if many (> 1024) docs share this one value
+                      return Relation.CELL_INSIDE_QUERY;
+                    } else {
+                      return Relation.CELL_CROSSES_QUERY;
+                    }
+                  }
+
+                  // We exhausted all points in the query:
+                  return Relation.CELL_OUTSIDE_QUERY;
+                }
+              });
+        } else {
+          for (BytesRef point = iterator.next(); point != null; point = iterator.next()) {
+            // nocommit make sure a test tests this:
+            assert point.length == pointBytes.length;
+            System.arraycopy(point.bytes, point.offset, pointBytes, 0, pointBytes.length);
+
+            final BytesRef finalPoint = point;
+
+            values.intersect(field,
+                // nocommit don't make new instance of this for each point?
+                new IntersectVisitor() {
+
+                  @Override
+                  public void grow(int count) {
+                    result.grow(count);
+                  }
+
+                  @Override
+                  public void visit(int docID) {
+                    hitCount[0]++;
+                    result.add(docID);
+                  }
+
+                  @Override
+                  public void visit(int docID, byte[] packedValue) {
+                    assert packedValue.length == finalPoint.length;
+                    if (Arrays.equals(packedValue, pointBytes)) {
+                      // The point for this doc matches the point we are querying on
+                      hitCount[0]++;
+                      result.add(docID);
+                    }
+                  }
+
+                  @Override
+                  public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+
+                    boolean crosses = false;
+
+                    for(int dim=0;dim<numDims;dim++) {
+                      int offset = dim*bytesPerDim;
+
+                      int cmpMin = StringHelper.compare(bytesPerDim, minPackedValue, offset, pointBytes, offset);
+                      if (cmpMin > 0) {
+                        return Relation.CELL_OUTSIDE_QUERY;
+                      }
+
+                      int cmpMax = StringHelper.compare(bytesPerDim, maxPackedValue, offset, pointBytes, offset);
+                      if (cmpMax < 0) {
+                        return Relation.CELL_OUTSIDE_QUERY;
+                      }
+
+                      if (cmpMin != 0 || cmpMax != 0) {
+                        crosses = true;
+                      }
+                    }
+
+                    if (crosses) {
+                      return Relation.CELL_CROSSES_QUERY;
+                    } else {
+                      // nocommit make sure tests hit this case:
+                      // NOTE: we only hit this if we are on a cell whose min and max values are exactly equal to our point,
+                      // which can easily happen if many docs share this one value
+                      return Relation.CELL_INSIDE_QUERY;
+                    }
+                  }
+                });
+          }
+        }
+
+        // NOTE: hitCount[0] will be an over-estimate in the multi-valued case
+        return new ConstantScoreScorer(this, score(), result.build(hitCount[0]).iterator());
+      }
+    };
+  }
+
+  @Override
+  public int hashCode() {
+    int hash = super.hashCode();
+    hash += sortedPackedPointsHashCode^0x14fa55fb;
+    hash += numDims^0x14fa55fb;
+    hash += bytesPerDim^0x14fa55fb;
+    return hash;
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (super.equals(other)) {
+      final PointInSetQuery q = (PointInSetQuery) other;
+      return q.numDims == numDims &&
+        q.bytesPerDim == bytesPerDim &&
+        q.sortedPackedPointsHashCode == sortedPackedPointsHashCode &&
+        q.sortedPackedPoints.equals(sortedPackedPoints);
+    }
+
+    return false;
+  }
+
+  @Override
+  public String toString(String field) {
+    final StringBuilder sb = new StringBuilder();
+    sb.append(getClass().getSimpleName());
+    sb.append(':');
+    if (this.field.equals(field) == false) {
+      sb.append("field=");
+      sb.append(this.field);
+      sb.append(':');
+    }
+
+    TermIterator iterator = sortedPackedPoints.iterator();
+    for (BytesRef point = iterator.next(); point != null; point = iterator.next()) {
+      sb.append(' ');
+      // nocommit fix me to convert back to the numbers/etc.:
+      sb.append(point);
+    }
+
+    return sb.toString();
+  }
+}
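(Not part of the patch.) For orientation between the two files, here is roughly how the new query is used end to end. The directory path, field name, and values are invented; the usual org.apache.lucene.document/index/search/store imports plus java.nio.file.Paths are assumed, as is an enclosing method that throws IOException:

    // Index a few 1D int points, then count the docs whose value is in a set:
    try (Directory dir = FSDirectory.open(Paths.get("/tmp/points-demo"));
         IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) {
      for (int v : new int[] {17, 42, 97}) {
        Document doc = new Document();
        doc.add(new IntPoint("int", v));   // 1D point, 4 bytes per dimension
        w.addDocument(doc);
      }
      try (IndexReader r = DirectoryReader.open(w)) {
        IndexSearcher s = new IndexSearcher(r);
        int hits = s.count(PointInSetQuery.newIntSet("int", 17, 97));  // expect 2
      }
    }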
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
index cfab5fbea09..ea432f7ab2d 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestPointQueries.java
@@ -1114,4 +1114,39 @@ public class TestPointQueries extends LuceneTestCase {
 
                             new boolean[] { false, true }).toString());
   }
+
+  // nocommit fix existing randomized tests to sometimes randomly use PointInSet instead
+
+  // nocommit need 2D test too
+
+  public void testBasicPointInSetQuery() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig();
+    iwc.setCodec(getCodec());
+    IndexWriter w = new IndexWriter(dir, iwc);
+
+    Document doc = new Document();
+    doc.add(new IntPoint("int", 17));
+    w.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new IntPoint("int", 42));
+    w.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new IntPoint("int", 97));
+    w.addDocument(doc);
+
+    IndexReader r = DirectoryReader.open(w);
+    IndexSearcher s = newSearcher(r);
+    assertEquals(0, s.count(PointInSetQuery.newIntSet("int", 16)));
+    assertEquals(1, s.count(PointInSetQuery.newIntSet("int", 17)));
+    assertEquals(3, s.count(PointInSetQuery.newIntSet("int", 17, 97, 42)));
+    assertEquals(3, s.count(PointInSetQuery.newIntSet("int", -7, 17, 42, 97)));
+    assertEquals(3, s.count(PointInSetQuery.newIntSet("int", 17, 20, 42, 97)));
+    assertEquals(3, s.count(PointInSetQuery.newIntSet("int", 17, 105, 42, 97)));
+    w.close();
+    r.close();
+    dir.close();
+  }
 }
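(Not part of the patch.) The multi-dimensional branch has no factory method yet (hence the "nocommit need 2D test too" above), so exercising it means going through the protected constructor, which an anonymous subclass may invoke from any package. A hypothetical 2D sketch, assuming the packed points are supplied pre-sorted in packed-byte order as the constructor requires; the "xy" field and the values are invented, and IOException handling is omitted:

    // Two 2D int points, each packed as 8 bytes: [x bytes][y bytes].
    int[][] queryPoints = {{1, 2}, {3, 4}};   // already sorted by packed bytes
    BytesRef scratch = new BytesRef(new byte[2 * Integer.BYTES]);
    scratch.length = 2 * Integer.BYTES;
    Query q = new PointInSetQuery("xy", 2, Integer.BYTES,
        new BytesRefIterator() {
          int upto;
          @Override
          public BytesRef next() {
            if (upto == queryPoints.length) {
              return null;
            }
            IntPoint.encodeDimension(queryPoints[upto][0], scratch.bytes, 0);
            IntPoint.encodeDimension(queryPoints[upto][1], scratch.bytes, Integer.BYTES);
            upto++;
            return scratch;   // reuse is safe: each ref is fully consumed
          }
        }) {};   // anonymous subclass: grants access to the protected constructor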