a rough start

Mike McCandless 2016-02-23 16:34:39 -05:00
parent 18bb8caede
commit 013abeaa5b
4 changed files with 380 additions and 1 deletion


@@ -67,7 +67,7 @@ public class PrefixCodedTerms implements Accountable {
add(term.field(), term.bytes());
}
/** add a term */
/** add a term. This fully consumes the incoming {@link BytesRef}. */
public void add(String field, BytesRef bytes) {
assert lastTerm.equals(new Term("")) || new Term(field, bytes).compareTo(lastTerm) > 0;
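The new javadoc makes the contract explicit: add() fully consumes the bytes it is handed, so a caller may reuse its scratch BytesRef between calls (the new PointInSetQuery below relies on this). A minimal usage sketch of the builder/iterator pair, with an illustrative field name and terms:

PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
// Terms must arrive in sorted order; the assert in add() enforces this:
builder.add("contents", new BytesRef("apple"));
builder.add("contents", new BytesRef("banana"));
builder.add("contents", new BytesRef("cherry"));
PrefixCodedTerms terms = builder.finish();
PrefixCodedTerms.TermIterator iterator = terms.iterator();
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
  // each term is decoded on the fly from the prefix-coded byte stream
}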


@@ -93,7 +93,10 @@ abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
/** Collapse the hash table and sort in-place; also sets
* this.sortedTermIDs to the results */
public int[] sortPostings() {
long t0 = System.nanoTime();
sortedTermIDs = bytesHash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
// nocommit
System.out.println("MKM: field " + fieldInfo.name + " has " + bytesHash.size() + " unique terms " + ((System.nanoTime()-t0)/1000000.0) + " msec to sort");
return sortedTermIDs;
}


@@ -0,0 +1,341 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.Arrays;
import java.util.Objects;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
import org.apache.lucene.index.PrefixCodedTerms;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.DocIdSetBuilder;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
/** Finds all documents whose point value, previously indexed with e.g. {@link org.apache.lucene.document.LongPoint}, is contained in the
* specified set */
// nocommit make abstract
public class PointInSetQuery extends Query {
// A little bit overkill for us, since all of our "terms" are always in the same field:
final PrefixCodedTerms sortedPackedPoints;
final int sortedPackedPointsHashCode;
final String field;
final int numDims;
final int bytesPerDim;
/** {@code packedPoints} must already be sorted! */
protected PointInSetQuery(String field, int numDims, int bytesPerDim, BytesRefIterator packedPoints) throws IOException {
this.field = field;
// nocommit validate these:
this.bytesPerDim = bytesPerDim;
this.numDims = numDims;
// In the 1D case this works well (the more points, the more common prefixes they typically share), but in
// the >1D case, where only the first dimension's leading bytes tend to be shared, at worst it doesn't hurt:
PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
BytesRefBuilder previous = null;
BytesRef current;
while ((current = packedPoints.next()) != null) {
if (current.length != numDims * bytesPerDim) {
throw new IllegalArgumentException("packed point length should be " + (numDims * bytesPerDim) + " but got " + current.length + "; field=\"" + field + "\", numDims=" + numDims + " bytesPerDim=" + bytesPerDim);
}
if (previous == null) {
previous = new BytesRefBuilder();
} else if (previous.get().equals(current)) {
continue; // deduplicate
}
builder.add(field, current);
previous.copyBytes(current);
}
sortedPackedPoints = builder.finish();
sortedPackedPointsHashCode = sortedPackedPoints.hashCode();
}
/** Use in the 1D case when you indexed 1D int values using {@link org.apache.lucene.document.IntPoint} */
public static PointInSetQuery newIntSet(String field, int... valuesIn) {
// Don't unexpectedly change the user's incoming array:
int[] values = valuesIn.clone();
Arrays.sort(values);
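// NOTE: a single scratch BytesRef is re-encoded and handed out for every value below; this is safe
// because PrefixCodedTerms.Builder#add fully consumes the incoming bytes (see the javadoc change above):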
final BytesRef value = new BytesRef(new byte[Integer.BYTES]);
value.length = Integer.BYTES;
try {
return new PointInSetQuery(field, 1, Integer.BYTES,
new BytesRefIterator() {
int upto;
@Override
public BytesRef next() {
if (upto == values.length) {
return null;
} else {
IntPoint.encodeDimension(values[upto], value.bytes, 0);
upto++;
return value;
}
}
});
} catch (IOException bogus) {
// Should never happen ;)
throw new RuntimeException(bogus);
}
}
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
// We don't use RandomAccessWeight here: it's no good to approximate with "match all docs".
// This is an inverted structure and should be used in the first pass:
return new ConstantScoreWeight(this) {
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
LeafReader reader = context.reader();
PointValues values = reader.getPointValues();
if (values == null) {
// No docs in this segment indexed any points
return null;
}
FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
if (fieldInfo == null) {
// No docs in this segment indexed this field at all
return null;
}
if (fieldInfo.getPointDimensionCount() != numDims) {
throw new IllegalArgumentException("field=\"" + field + "\" was indexed with numDims=" + fieldInfo.getPointDimensionCount() + " but this query has numDims=" + numDims);
}
if (bytesPerDim != fieldInfo.getPointNumBytes()) {
throw new IllegalArgumentException("field=\"" + field + "\" was indexed with bytesPerDim=" + fieldInfo.getPointNumBytes() + " but this query has bytesPerDim=" + bytesPerDim);
}
int bytesPerDim = fieldInfo.getPointNumBytes();
DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc());
int[] hitCount = new int[1];
final TermIterator iterator = sortedPackedPoints.iterator();
byte[] pointBytes = new byte[bytesPerDim * numDims];
if (numDims == 1) {
final BytesRef scratch = new BytesRef();
scratch.length = bytesPerDim;
// Optimize this common case, effectively doing a merge sort of the indexed values vs the queried set:
values.intersect(field,
new IntersectVisitor() {
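// Head of the sorted query set that could still match; both the query points and the
// 1D index values arrive in sorted order, so the two sides advance together in one merge-like pass: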
private BytesRef nextQueryPoint = iterator.next();
@Override
public void grow(int count) {
result.grow(count);
}
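// Called when compare() returned CELL_INSIDE_QUERY: every doc in this cell matches.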
@Override
public void visit(int docID) {
hitCount[0]++;
result.add(docID);
}
@Override
public void visit(int docID, byte[] packedValue) {
scratch.bytes = packedValue;
while (nextQueryPoint != null) {
int cmp = nextQueryPoint.compareTo(scratch);
if (cmp == 0) {
// Query point equals index point, so collect and return
hitCount[0]++;
result.add(docID);
break;
} else if (cmp < 0) {
// Query point is before index point, so we move to next query point
nextQueryPoint = iterator.next();
} else {
// Query point is after index point, so we don't collect and we return:
break;
}
}
}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
while (nextQueryPoint != null) {
scratch.bytes = minPackedValue;
int cmpMin = nextQueryPoint.compareTo(scratch);
if (cmpMin < 0) {
// query point is before the start of this cell
nextQueryPoint = iterator.next();
continue;
}
scratch.bytes = maxPackedValue;
int cmpMax = nextQueryPoint.compareTo(scratch);
if (cmpMax > 0) {
// query point is after the end of this cell
return Relation.CELL_OUTSIDE_QUERY;
}
if (cmpMin == 0 && cmpMax == 0) {
// NOTE: we only hit this if we are on a cell whose min and max values are exactly equal to our point,
// which can easily happen if many (> 1024) docs share this one value
return Relation.CELL_INSIDE_QUERY;
} else {
return Relation.CELL_CROSSES_QUERY;
}
}
// We exhausted all points in the query:
return Relation.CELL_OUTSIDE_QUERY;
}
});
} else {
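// The >1D case: run one intersect per query point, collecting only docs whose packed value
// matches that point exactly in every dimension.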
for (BytesRef point = iterator.next(); point != null; point = iterator.next()) {
// nocommit make sure a test tests this:
assert point.length == pointBytes.length;
System.arraycopy(point.bytes, point.offset, pointBytes, 0, pointBytes.length);
final BytesRef finalPoint = point;
values.intersect(field,
// nocommit don't make new instance of this for each point?
new IntersectVisitor() {
@Override
public void grow(int count) {
result.grow(count);
}
@Override
public void visit(int docID) {
hitCount[0]++;
result.add(docID);
}
@Override
public void visit(int docID, byte[] packedValue) {
assert packedValue.length == finalPoint.length;
if (Arrays.equals(packedValue, pointBytes)) {
// The point for this doc matches the point we are querying on
hitCount[0]++;
result.add(docID);
}
}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
boolean crosses = false;
for(int dim=0;dim<numDims;dim++) {
int offset = dim*bytesPerDim;
int cmpMin = StringHelper.compare(bytesPerDim, minPackedValue, offset, pointBytes, offset);
if (cmpMin > 0) {
return Relation.CELL_OUTSIDE_QUERY;
}
int cmpMax = StringHelper.compare(bytesPerDim, maxPackedValue, offset, pointBytes, offset);
if (cmpMax < 0) {
return Relation.CELL_OUTSIDE_QUERY;
}
if (cmpMin != 0 || cmpMax != 0) {
crosses = true;
}
}
if (crosses) {
return Relation.CELL_CROSSES_QUERY;
} else {
// nocommit make sure tests hit this case:
// NOTE: we only hit this if we are on a cell whose min and max values are exactly equal to our point,
// which can easily happen if many docs share this one value
return Relation.CELL_INSIDE_QUERY;
}
}
});
}
}
// NOTE: hitCount[0] will be an over-estimate in the multi-valued case
return new ConstantScoreScorer(this, score(), result.build(hitCount[0]).iterator());
}
};
}
@Override
public int hashCode() {
int hash = super.hashCode();
hash += sortedPackedPointsHashCode^0x14fa55fb;
hash += numDims^0x14fa55fb;
hash += bytesPerDim^0x14fa55fb;
return hash;
}
@Override
public boolean equals(Object other) {
if (super.equals(other)) {
final PointInSetQuery q = (PointInSetQuery) other;
return q.numDims == numDims &&
q.bytesPerDim == bytesPerDim &&
q.sortedPackedPointsHashCode == sortedPackedPointsHashCode &&
q.sortedPackedPoints.equals(sortedPackedPoints);
}
return false;
}
@Override
public String toString(String field) {
final StringBuilder sb = new StringBuilder();
sb.append(getClass().getSimpleName());
sb.append(':');
if (this.field.equals(field) == false) {
sb.append("field=");
sb.append(this.field);
sb.append(':');
}
TermIterator iterator = sortedPackedPoints.iterator();
for (BytesRef point = iterator.next(); point != null; point = iterator.next()) {
sb.append(' ');
// nocommit fix me to convert back to the numbers/etc.:
sb.append(point);
}
return sb.toString();
}
}
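Since the class is slated to become abstract (per the nocommit above) and newIntSet is the only factory so far, here is a hypothetical sketch of how another factory inside this class could feed the protected constructor pre-sorted packed points for a 2D int field. The method name, the java.util.TreeSet-based sorting, and the (x, y) pair encoding are illustrative assumptions, not part of this commit:

/** Hypothetical factory (illustration only): match docs whose 2D int point equals one of the given (x, y) pairs. */
public static PointInSetQuery newIntPairSet(String field, int[][] pairs) {
  // Pack each pair, then sort (and deduplicate) by the full packed bytes, the order the builder requires:
  java.util.TreeSet<BytesRef> packed = new java.util.TreeSet<>();
  for (int[] pair : pairs) {
    byte[] bytes = new byte[2 * Integer.BYTES];
    IntPoint.encodeDimension(pair[0], bytes, 0);
    IntPoint.encodeDimension(pair[1], bytes, Integer.BYTES);
    packed.add(new BytesRef(bytes));
  }
  final java.util.Iterator<BytesRef> it = packed.iterator();
  try {
    return new PointInSetQuery(field, 2, Integer.BYTES,
        new BytesRefIterator() {
          @Override
          public BytesRef next() {
            return it.hasNext() ? it.next() : null;
          }
        });
  } catch (IOException bogus) {
    // Should never happen: this iterator does no I/O
    throw new RuntimeException(bogus);
  }
}

The existing >1D path above would then run one intersect per pair, collecting only docs whose packed value matches that pair exactly in both dimensions.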


@@ -1114,4 +1114,39 @@ public class TestPointQueries extends LuceneTestCase {
new boolean[] { false, true }).toString());
}
// nocommit fix existing randomized tests to sometimes randomly use PointInSet instead
// nocommit need 2D test too
public void testBasicPointInSetQuery() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig();
iwc.setCodec(getCodec());
IndexWriter w = new IndexWriter(dir, iwc);
Document doc = new Document();
doc.add(new IntPoint("int", 17));
w.addDocument(doc);
doc = new Document();
doc.add(new IntPoint("int", 42));
w.addDocument(doc);
doc = new Document();
doc.add(new IntPoint("int", 97));
w.addDocument(doc);
IndexReader r = DirectoryReader.open(w);
IndexSearcher s = newSearcher(r);
assertEquals(0, s.count(PointInSetQuery.newIntSet("int", 16)));
assertEquals(1, s.count(PointInSetQuery.newIntSet("int", 17)));
assertEquals(3, s.count(PointInSetQuery.newIntSet("int", 17, 97, 42)));
assertEquals(3, s.count(PointInSetQuery.newIntSet("int", -7, 17, 42, 97)));
assertEquals(3, s.count(PointInSetQuery.newIntSet("int", 17, 20, 42, 97)));
assertEquals(3, s.count(PointInSetQuery.newIntSet("int", 17, 105, 42, 97)));
w.close();
r.close();
dir.close();
}
}