mirror of https://github.com/apache/lucene.git
a rough start
This commit is contained in:
parent 18bb8caede
commit 013abeaa5b
@@ -67,7 +67,7 @@ public class PrefixCodedTerms implements Accountable {
       add(term.field(), term.bytes());
     }
 
-    /** add a term */
+    /** add a term. This fully consumes the incoming {@link BytesRef}. */
     public void add(String field, BytesRef bytes) {
       assert lastTerm.equals(new Term("")) || new Term(field, bytes).compareTo(lastTerm) > 0;
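For context, the Builder pattern this javadoc documents (and which the new PointInSetQuery below relies on) is: add terms in sorted order, then freeze with finish(). A minimal sketch, not part of the commit; the field name and term bytes are illustrative, and the usual imports (org.apache.lucene.index.PrefixCodedTerms, org.apache.lucene.util.BytesRef) are assumed:

    PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
    builder.add("id", new BytesRef("apple"));    // terms must arrive in sorted order (enforced by the assert above)
    builder.add("id", new BytesRef("banana"));
    PrefixCodedTerms terms = builder.finish();   // freeze into the compact prefix-coded form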
@@ -93,7 +93,10 @@ abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
   /** Collapse the hash table and sort in-place; also sets
    * this.sortedTermIDs to the results */
   public int[] sortPostings() {
+    long t0 = System.nanoTime();
     sortedTermIDs = bytesHash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+    // nocommit
+    System.out.println("MKM: field " + fieldInfo.name + " has " + bytesHash.size() + " unique terms " + ((System.nanoTime()-t0)/1000000.0) + " msec to sort");
     return sortedTermIDs;
   }
@@ -0,0 +1,341 @@ new file: org/apache/lucene/search/PointInSetQuery.java
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search;

import java.io.IOException;
import java.util.Arrays;
import java.util.Objects;

import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
import org.apache.lucene.index.PrefixCodedTerms;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.DocIdSetBuilder;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;

/** Finds all documents whose point value, previously indexed with e.g. {@link org.apache.lucene.document.LongPoint}, is contained in the
 *  specified set */

// nocommit make abstract

public class PointInSetQuery extends Query {
  // A little bit overkill for us, since all of our "terms" are always in the same field:
  final PrefixCodedTerms sortedPackedPoints;
  final int sortedPackedPointsHashCode;
  final String field;
  final int numDims;
  final int bytesPerDim;

  /** {@code packedPoints} must already be sorted! */
  protected PointInSetQuery(String field, int numDims, int bytesPerDim, BytesRefIterator packedPoints) throws IOException {
    this.field = field;
    // nocommit validate these:
    this.bytesPerDim = bytesPerDim;
    this.numDims = numDims;

    // In the 1D case this works well (the more points, the more common prefixes they share, typically), but in
    // the > 1 D case, where we are only looking at the first dimension's prefix bytes, it can at worst not hurt:
    PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
    BytesRefBuilder previous = null;
    BytesRef current;
    while ((current = packedPoints.next()) != null) {
      if (current.length != numDims * bytesPerDim) {
        throw new IllegalArgumentException("packed point length should be " + (numDims * bytesPerDim) + " but got " + current.length + "; field=\"" + field + "\", numDims=" + numDims + " bytesPerDim=" + bytesPerDim);
      }
      if (previous == null) {
        previous = new BytesRefBuilder();
      } else if (previous.get().equals(current)) {
        continue; // deduplicate
      }
      builder.add(field, current);
      previous.copyBytes(current);
    }
    sortedPackedPoints = builder.finish();
    sortedPackedPointsHashCode = sortedPackedPoints.hashCode();
  }

  /** Use in the 1D case when you indexed 1D int values using {@link org.apache.lucene.document.IntPoint} */
  public static PointInSetQuery newIntSet(String field, int... valuesIn) {

    // Don't unexpectedly change the user's incoming array:
    int[] values = valuesIn.clone();

    Arrays.sort(values);

    final BytesRef value = new BytesRef(new byte[Integer.BYTES]);
    value.length = Integer.BYTES;

    try {
      return new PointInSetQuery(field, 1, Integer.BYTES,
                                 new BytesRefIterator() {

                                   int upto;

                                   @Override
                                   public BytesRef next() {
                                     if (upto == values.length) {
                                       return null;
                                     } else {
                                       IntPoint.encodeDimension(values[upto], value.bytes, 0);
                                       upto++;
                                       return value;
                                     }
                                   }
                                 });
    } catch (IOException bogus) {
      // Should never happen ;)
      throw new RuntimeException(bogus);
    }
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {

    // We don't use RandomAccessWeight here: it's no good to approximate with "match all docs".
    // This is an inverted structure and should be used in the first pass:

    return new ConstantScoreWeight(this) {

      @Override
      public Scorer scorer(LeafReaderContext context) throws IOException {
        LeafReader reader = context.reader();
        PointValues values = reader.getPointValues();
        if (values == null) {
          // No docs in this segment indexed any points
          return null;
        }
        FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
        if (fieldInfo == null) {
          // No docs in this segment indexed this field at all
          return null;
        }
        if (fieldInfo.getPointDimensionCount() != numDims) {
          throw new IllegalArgumentException("field=\"" + field + "\" was indexed with numDims=" + fieldInfo.getPointDimensionCount() + " but this query has numDims=" + numDims);
        }
        if (bytesPerDim != fieldInfo.getPointNumBytes()) {
          throw new IllegalArgumentException("field=\"" + field + "\" was indexed with bytesPerDim=" + fieldInfo.getPointNumBytes() + " but this query has bytesPerDim=" + bytesPerDim);
        }
        int bytesPerDim = fieldInfo.getPointNumBytes();

        DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc());

        int[] hitCount = new int[1];
        final TermIterator iterator = sortedPackedPoints.iterator();
        byte[] pointBytes = new byte[bytesPerDim * numDims];

        if (numDims == 1) {

          final BytesRef scratch = new BytesRef();
          scratch.length = bytesPerDim;

          // Optimize this common case, effectively doing a merge sort of the indexed values vs the queried set:
          values.intersect(field,
            new IntersectVisitor() {

              private BytesRef nextQueryPoint = iterator.next();

              @Override
              public void grow(int count) {
                result.grow(count);
              }

              @Override
              public void visit(int docID) {
                hitCount[0]++;
                result.add(docID);
              }

              @Override
              public void visit(int docID, byte[] packedValue) {
                scratch.bytes = packedValue;
                while (nextQueryPoint != null) {
                  int cmp = nextQueryPoint.compareTo(scratch);
                  if (cmp == 0) {
                    // Query point equals index point, so collect and return
                    hitCount[0]++;
                    result.add(docID);
                    break;
                  } else if (cmp < 0) {
                    // Query point is before index point, so we move to next query point
                    nextQueryPoint = iterator.next();
                  } else {
                    // Query point is after index point, so we don't collect and we return:
                    break;
                  }
                }
              }

              @Override
              public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {

                while (nextQueryPoint != null) {
                  scratch.bytes = minPackedValue;
                  int cmpMin = nextQueryPoint.compareTo(scratch);
                  if (cmpMin < 0) {
                    // query point is before the start of this cell
                    nextQueryPoint = iterator.next();
                    continue;
                  }
                  scratch.bytes = maxPackedValue;
                  int cmpMax = nextQueryPoint.compareTo(scratch);
                  if (cmpMax > 0) {
                    // query point is after the end of this cell
                    return Relation.CELL_OUTSIDE_QUERY;
                  }

                  if (cmpMin == 0 && cmpMax == 0) {
                    // NOTE: we only hit this if we are on a cell whose min and max values are exactly equal to our point,
                    // which can easily happen if many (> 1024) docs share this one value
                    return Relation.CELL_INSIDE_QUERY;
                  } else {
                    return Relation.CELL_CROSSES_QUERY;
                  }
                }

                // We exhausted all points in the query:
                return Relation.CELL_OUTSIDE_QUERY;
              }
            });
        } else {
          for (BytesRef point = iterator.next(); point != null; point = iterator.next()) {
            // nocommit make sure a test tests this:
            assert point.length == pointBytes.length;
            System.arraycopy(point.bytes, point.offset, pointBytes, 0, pointBytes.length);

            final BytesRef finalPoint = point;

            values.intersect(field,
              // nocommit don't make new instance of this for each point?
              new IntersectVisitor() {

                @Override
                public void grow(int count) {
                  result.grow(count);
                }

                @Override
                public void visit(int docID) {
                  hitCount[0]++;
                  result.add(docID);
                }

                @Override
                public void visit(int docID, byte[] packedValue) {
                  assert packedValue.length == finalPoint.length;
                  if (Arrays.equals(packedValue, pointBytes)) {
                    // The point for this doc matches the point we are querying on
                    hitCount[0]++;
                    result.add(docID);
                  }
                }

                @Override
                public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {

                  boolean crosses = false;

                  for(int dim=0;dim<numDims;dim++) {
                    int offset = dim*bytesPerDim;

                    int cmpMin = StringHelper.compare(bytesPerDim, minPackedValue, offset, pointBytes, offset);
                    if (cmpMin > 0) {
                      return Relation.CELL_OUTSIDE_QUERY;
                    }

                    int cmpMax = StringHelper.compare(bytesPerDim, maxPackedValue, offset, pointBytes, offset);
                    if (cmpMax < 0) {
                      return Relation.CELL_OUTSIDE_QUERY;
                    }

                    if (cmpMin != 0 || cmpMax != 0) {
                      crosses = true;
                    }
                  }

                  if (crosses) {
                    return Relation.CELL_CROSSES_QUERY;
                  } else {
                    // nocommit make sure tests hit this case:
                    // NOTE: we only hit this if we are on a cell whose min and max values are exactly equal to our point,
                    // which can easily happen if many docs share this one value
                    return Relation.CELL_INSIDE_QUERY;
                  }
                }
              });
          }
        }

        // NOTE: hitCount[0] will be over-estimate in multi-valued case
        return new ConstantScoreScorer(this, score(), result.build(hitCount[0]).iterator());
      }
    };
  }

  @Override
  public int hashCode() {
    int hash = super.hashCode();
    hash += sortedPackedPointsHashCode^0x14fa55fb;
    hash += numDims^0x14fa55fb;
    hash += bytesPerDim^0x14fa55fb;
    return hash;
  }

  @Override
  public boolean equals(Object other) {
    if (super.equals(other)) {
      final PointInSetQuery q = (PointInSetQuery) other;
      return q.numDims == numDims &&
             q.bytesPerDim == bytesPerDim &&
             q.sortedPackedPointsHashCode == sortedPackedPointsHashCode &&
             q.sortedPackedPoints.equals(sortedPackedPoints);
    }

    return false;
  }

  @Override
  public String toString(String field) {
    final StringBuilder sb = new StringBuilder();
    sb.append(getClass().getSimpleName());
    sb.append(':');
    if (this.field.equals(field) == false) {
      sb.append("field=");
      sb.append(this.field);
      sb.append(':');
    }

    TermIterator iterator = sortedPackedPoints.iterator();
    for (BytesRef point = iterator.next(); point != null; point = iterator.next()) {
      sb.append(' ');
      // nocommit fix me to convert back to the numbers/etc.:
      sb.append(point);
    }

    return sb.toString();
  }
}
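The numDims == 1 branch of the scorer above is effectively a two-pointer merge: the query points (already sorted in sortedPackedPoints) are walked in lockstep with the values the tree visits in sorted order, and a doc is collected only when the two sides are equal. A standalone sketch of that idea in plain Java, not part of the commit, using illustrative values and no Lucene types:

    class PointInSetMergeSketch {                  // hypothetical demo class
      public static void main(String[] args) {
        int[] queryPoints = {17, 42, 97};          // sorted query set
        int[] indexedValues = {3, 17, 17, 42, 98}; // values in the order the tree visits them (sorted in 1D)
        int q = 0;
        for (int value : indexedValues) {
          while (q < queryPoints.length && queryPoints[q] < value) {
            q++;                                   // query point is before this indexed value: advance (cmp < 0)
          }
          if (q == queryPoints.length) {
            break;                                 // query set exhausted
          }
          if (queryPoints[q] == value) {
            System.out.println("collect doc with value " + value);  // cmp == 0: collect
          }
          // otherwise the query point is after this indexed value: keep it and move to the next value (cmp > 0)
        }
      }
    }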
@@ -1114,4 +1114,39 @@ public class TestPointQueries extends LuceneTestCase {
                                     new boolean[] { false, true }).toString());
   }
 
+  // nocommit fix existing randomized tests to sometimes randomly use PointInSet instead
+
+  // nocommit need 2D test too
+
+  public void testBasicPointInSetQuery() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig();
+    iwc.setCodec(getCodec());
+    IndexWriter w = new IndexWriter(dir, iwc);
+
+    Document doc = new Document();
+    doc.add(new IntPoint("int", 17));
+    w.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new IntPoint("int", 42));
+    w.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new IntPoint("int", 97));
+    w.addDocument(doc);
+
+    IndexReader r = DirectoryReader.open(w);
+    IndexSearcher s = newSearcher(r);
+    assertEquals(0, s.count(PointInSetQuery.newIntSet("int", 16)));
+    assertEquals(1, s.count(PointInSetQuery.newIntSet("int", 17)));
+    assertEquals(3, s.count(PointInSetQuery.newIntSet("int", 17, 97, 42)));
+    assertEquals(3, s.count(PointInSetQuery.newIntSet("int", -7, 17, 42, 97)));
+    assertEquals(3, s.count(PointInSetQuery.newIntSet("int", 17, 20, 42, 97)));
+    assertEquals(3, s.count(PointInSetQuery.newIntSet("int", 17, 105, 42, 97)));
+    w.close();
+    r.close();
+    dir.close();
+  }
 }