mirror of https://github.com/apache/lucene.git
LUCENE-6350: TermsQuery is now compressed with PrefixCodedTerms.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1678164 13f79535-47bb-0310-9956-ffa450edef68
parent 2cec2feec8
commit b7c48ea313

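LUCENE-6350 replaces TermsQuery's hand-rolled term storage (one large byte[] plus parallel offset arrays) with org.apache.lucene.index.PrefixCodedTerms, which delta-encodes sorted terms so neighbors share prefixes. A minimal usage sketch, assuming the 5.x-era API this commit targets (field and value names are illustrative):

    import java.util.Arrays;
    import java.util.List;

    import org.apache.lucene.index.Term;
    import org.apache.lucene.queries.TermsQuery;
    import org.apache.lucene.search.Query;

    public class TermsQueryUsage {
      public static void main(String[] args) {
        // Duplicates and multiple fields are allowed; the constructor sorts,
        // deduplicates, and prefix-codes the terms into one compact buffer.
        List<Term> terms = Arrays.asList(
            new Term("color", "blue"),
            new Term("color", "blue"),    // duplicate, dropped during construction
            new Term("shape", "square"));
        Query q = new TermsQuery(terms);
        System.out.println(q);            // color:blue shape:square
      }
    }

The diff below touches the changelog, PrefixCodedTerms and its iterator, RAMFile (which gains value-based equality so the prefix-coded buffers can be compared), TermsQuery itself, and its tests.
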
@@ -123,6 +123,9 @@ Optimizations
 * LUCENE-6330: BooleanScorer (used for top-level disjunctions) does not decode
   norms when not necessary anymore. (Adrien Grand)
 
+* LUCENE-6350: TermsQuery is now compressed with PrefixCodedTerms.
+  (Robert Muir, Mike McCandless, Adrien Grand)
+
 Bug Fixes
 
 * LUCENE-6378: Fix all RuntimeExceptions to throw the underlying root cause.

@@ -19,8 +19,6 @@ package org.apache.lucene.index;
 
 import org.apache.lucene.util.BytesRefIterator;
 
-// TODO: maybe TermsFilter could use this?
-
 /** Iterates over terms in across multiple fields. The caller must
  *  check {@link #field} after each {@link #next} to see if the field
  *  changed, but {@code ==} can be used since the iterator

@@ -18,6 +18,7 @@ package org.apache.lucene.index;
  */
 
 import java.io.IOException;
+import java.util.Objects;
 
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.RAMFile;

@@ -29,14 +30,14 @@ import org.apache.lucene.util.BytesRefBuilder;
 
 /**
  * Prefix codes term instances (prefixes are shared)
- * @lucene.experimental
+ * @lucene.internal
  */
-class PrefixCodedTerms implements Accountable {
+public class PrefixCodedTerms implements Accountable {
   final RAMFile buffer;
   private long delGen;
 
   private PrefixCodedTerms(RAMFile buffer) {
-    this.buffer = buffer;
+    this.buffer = Objects.requireNonNull(buffer);
   }
 
   @Override

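The javadoc above is all the class says about its encoding, so for orientation: with terms in sorted order, each entry stores how many leading bytes it shares with its predecessor plus only the remaining suffix bytes. A self-contained illustration of that idea (not this class's actual byte layout):

    import java.nio.charset.StandardCharsets;

    public class PrefixCodeIdea {
      // length of the common prefix of two byte strings
      static int sharedPrefix(byte[] a, byte[] b) {
        int i = 0;
        while (i < a.length && i < b.length && a[i] == b[i]) {
          i++;
        }
        return i;
      }

      public static void main(String[] args) {
        // Sorted neighbors tend to share long prefixes, so only suffixes are stored.
        byte[] prev = "lucene-6330".getBytes(StandardCharsets.UTF_8);
        byte[] curr = "lucene-6350".getBytes(StandardCharsets.UTF_8);
        int p = sharedPrefix(prev, curr);
        System.out.println(p + " shared bytes, " + (curr.length - p) + " suffix bytes");
        // prints: 9 shared bytes, 2 suffix bytes
      }
    }
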
@@ -56,6 +57,9 @@ class PrefixCodedTerms implements Accountable {
     private Term lastTerm = new Term("");
     private BytesRefBuilder lastTermBytes = new BytesRefBuilder();
 
+    /** Sole constructor. */
+    public Builder() {}
+
     /** add a term */
     public void add(Term term) {
       assert lastTerm.equals(new Term("")) || term.compareTo(lastTerm) > 0;

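The assert above is the Builder's whole contract: terms must arrive in ascending (field, then bytes) order, each at most once. A sketch of driving it directly, mirroring what the new TermsQuery constructor does further down:

    import org.apache.lucene.index.PrefixCodedTerms;
    import org.apache.lucene.index.Term;

    public class BuilderSketch {
      public static void main(String[] args) {
        PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
        builder.add(new Term("author", "adrien"));
        builder.add(new Term("author", "mike"));  // ascending within a field
        builder.add(new Term("title", "lucene")); // fields ascending as well
        PrefixCodedTerms terms = builder.finish();
        System.out.println(terms.ramBytesUsed() + " bytes buffered");
      }
    }
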
@@ -104,6 +108,7 @@ class PrefixCodedTerms implements Accountable {
     }
   }
 
+  /** An iterator over the list of terms stored in a {@link PrefixCodedTerms}. */
   public static class TermIterator extends FieldTermIterator {
     final IndexInput input;
     final BytesRefBuilder builder = new BytesRefBuilder();

@@ -112,7 +117,7 @@ class PrefixCodedTerms implements Accountable {
     final long delGen;
     String field = "";
 
-    public TermIterator(long delGen, RAMFile buffer) {
+    private TermIterator(long delGen, RAMFile buffer) {
       try {
         input = new RAMInputStream("MergedPrefixCodedTermsIterator", buffer);
       } catch (IOException e) {

@@ -162,7 +167,24 @@ class PrefixCodedTerms implements Accountable {
     }
   }
 
+  /** Return an iterator over the terms stored in this {@link PrefixCodedTerms}. */
   public TermIterator iterator() {
     return new TermIterator(delGen, buffer);
   }
 
+  @Override
+  public int hashCode() {
+    int h = buffer.hashCode();
+    h = 31 * h + (int) (delGen ^ (delGen >>> 32));
+    return h;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) return true;
+    if (obj == null) return false;
+    if (getClass() != obj.getClass()) return false;
+    PrefixCodedTerms other = (PrefixCodedTerms) obj;
+    return buffer.equals(other.buffer) && delGen == other.delGen;
+  }
 }

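The (int) (delGen ^ (delGen >>> 32)) term above is the usual fold of a long into a 32-bit hash (the same value java.lang.Long's hashCode produces), so both halves of the deletion generation affect the result:

    public class LongFold {
      public static void main(String[] args) {
        long delGen = 0x0123_4567_89AB_CDEFL;
        int folded = (int) (delGen ^ (delGen >>> 32));
        // matches Long's own hash, and differs if either 32-bit half changes
        System.out.println(folded == Long.valueOf(delGen).hashCode()); // true
      }
    }
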
@@ -18,6 +18,7 @@ package org.apache.lucene.store;
  */
 
 import java.util.ArrayList;
+import java.util.Arrays;
 
 import org.apache.lucene.util.Accountable;
 

@@ -25,7 +26,7 @@ import org.apache.lucene.util.Accountable;
  * Represents a file in RAM as a list of byte[] buffers.
  * @lucene.internal */
 public class RAMFile implements Accountable {
-  protected ArrayList<byte[]> buffers = new ArrayList<>();
+  protected final ArrayList<byte[]> buffers = new ArrayList<>();
   long length;
   RAMDirectory directory;
   protected long sizeInBytes;

@@ -86,4 +87,31 @@ public class RAMFile implements Accountable {
   public String toString() {
     return getClass().getSimpleName() + "(length=" + length + ")";
   }
 
+  @Override
+  public int hashCode() {
+    int h = (int) (length ^ (length >>> 32));
+    for (byte[] block : buffers) {
+      h = 31 * h + Arrays.hashCode(block);
+    }
+    return h;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) return true;
+    if (obj == null) return false;
+    if (getClass() != obj.getClass()) return false;
+    RAMFile other = (RAMFile) obj;
+    if (length != other.length) return false;
+    if (buffers.size() != other.buffers.size()) {
+      return false;
+    }
+    for (int i = 0; i < buffers.size(); i++) {
+      if (!Arrays.equals(buffers.get(i), other.buffers.get(i))) {
+        return false;
+      }
+    }
+    return true;
+  }
 }

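RAMFile previously had only identity semantics; the methods above give it content equality, comparing the length and then every block. That is what lets PrefixCodedTerms, and in turn TermsQuery, compare by value, and it is linear in file size, which is why TermsQuery screens with a cached hash first (see its equals below). The block-by-block pattern in isolation:

    import java.util.Arrays;

    public class BlockEquality {
      public static void main(String[] args) {
        byte[][] a = {{1, 2}, {3}};
        byte[][] b = {{1, 2}, {3}};
        boolean eq = a.length == b.length;
        for (int i = 0; eq && i < a.length; i++) {
          eq = Arrays.equals(a[i], b[i]);  // deep per-block comparison
        }
        System.out.println(eq);  // true; a.equals(b) would be false (identity)
      }
    }
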
@@ -18,11 +18,9 @@ package org.apache.lucene.queries;
  */
 
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
 

@@ -30,13 +28,15 @@ import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.PrefixCodedTerms;
+import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.ConstantScoreWeight;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.Scorer;

@@ -73,38 +73,35 @@ public class TermsQuery extends Query implements Accountable {
 
   private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(TermsQuery.class);
 
-  /*
-   * this class is often used for large number of terms in a single field.
-   * to optimize for this case and to be filter-cache friendly we
-   * serialize all terms into a single byte array and store offsets
-   * in a parallel array to keep the # of object constant and speed up
-   * equals / hashcode.
-   *
-   * This adds quite a bit of complexity but allows large term queries to
-   * be efficient for GC and cache-lookups
-   */
-  private final int[] offsets;
-  private final byte[] termsBytes;
-  private final TermsAndField[] termsAndFields;
-  private final int hashCode; // cached hashcode for fast cache lookups, not including the boost
+  private final PrefixCodedTerms termData;
+  private final int termDataHashCode; // cached hashcode of termData
+
+  private static Term[] toTermArray(String field, List<BytesRef> termBytes) {
+    Term[] array = new Term[termBytes.size()];
+    int i = 0;
+    for (BytesRef t : termBytes) {
+      array[i++] = new Term(field, t);
+    }
+    return array;
+  }
 
   /**
    * Creates a new {@link TermsQuery} from the given list. The list
    * can contain duplicate terms and multiple fields.
    */
   public TermsQuery(final List<Term> terms) {
-    this(new FieldAndTermEnum() {
-      // we need to sort for deduplication and to have a common cache key
-      final Iterator<Term> iter = sort(terms).iterator();
-      @Override
-      public BytesRef next() {
-        if (iter.hasNext()) {
-          Term next = iter.next();
-          field = next.field();
-          return next.bytes();
-        }
-        return null;
-      }}, terms.size());
+    Term[] sortedTerms = terms.toArray(new Term[terms.size()]);
+    ArrayUtil.timSort(sortedTerms);
+    PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
+    Term previous = null;
+    for (Term term : sortedTerms) {
+      if (term.equals(previous) == false) {
+        builder.add(term);
+      }
+      previous = term;
+    }
+    termData = builder.finish();
+    termDataHashCode = termData.hashCode();
   }
 
   /**

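The new constructor deduplicates by sorting and then skipping equal neighbors, which needs no auxiliary set; ArrayUtil.timSort is also cheap on the mostly-sorted inputs callers often pass. The same sort-then-skip pattern on plain strings:

    import java.util.Arrays;

    public class SortDedup {
      public static void main(String[] args) {
        String[] xs = {"baz", "bar", "baz", "bar"};
        Arrays.sort(xs);              // stand-in for ArrayUtil.timSort
        String previous = null;
        for (String x : xs) {
          if (!x.equals(previous)) {
            System.out.println(x);    // bar, baz: each distinct value once
          }
          previous = x;
        }
      }
    }
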
@@ -112,17 +109,7 @@ public class TermsQuery extends Query implements Accountable {
    * a single field.
    */
   public TermsQuery(final String field, final List<BytesRef> terms) {
-    this(new FieldAndTermEnum(field) {
-      // we need to sort for deduplication and to have a common cache key
-      final Iterator<BytesRef> iter = sort(terms).iterator();
-      @Override
-      public BytesRef next() {
-        if (iter.hasNext()) {
-          return iter.next();
-        }
-        return null;
-      }
-    }, terms.size());
+    this(toTermArray(field, terms));
   }
 
   /**

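After this change the single-field constructor is a thin adapter: it wraps each BytesRef in a Term and reuses the canonical term path, so every constructor produces the same sorted, deduplicated term data. For example (field and values illustrative):

    import java.util.Arrays;
    import java.util.List;

    import org.apache.lucene.index.Term;
    import org.apache.lucene.queries.TermsQuery;
    import org.apache.lucene.util.BytesRef;

    public class SingleFieldCtor {
      public static void main(String[] args) {
        List<BytesRef> ids = Arrays.asList(new BytesRef("3"), new BytesRef("1"));
        TermsQuery byBytes = new TermsQuery("id", ids);
        TermsQuery byTerms = new TermsQuery(new Term("id", "1"), new Term("id", "3"));
        System.out.println(byBytes.equals(byTerms)); // true: same canonical term data
      }
    }
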
@@ -142,106 +129,37 @@ public class TermsQuery extends Query implements Accountable {
     this(Arrays.asList(terms));
   }
 
-  private TermsQuery(FieldAndTermEnum iter, int length) {
-    // TODO: maybe use oal.index.PrefixCodedTerms instead?
-    // If number of terms is more than a few hundred it
-    // should be a win
-
-    // TODO: we also pack terms in FieldCache/DocValues
-    // ... maybe we can refactor to share that code
-
-    // TODO: yet another option is to build the union of the terms in
-    // an automaton an call intersect on the termsenum if the density is high
-
-    int hash = 9;
-    byte[] serializedTerms = new byte[0];
-    this.offsets = new int[length+1];
-    int lastEndOffset = 0;
-    int index = 0;
-    ArrayList<TermsAndField> termsAndFields = new ArrayList<>();
-    TermsAndField lastTermsAndField = null;
-    BytesRef previousTerm = null;
-    String previousField = null;
-    BytesRef currentTerm;
-    String currentField;
-    while((currentTerm = iter.next()) != null) {
-      currentField = iter.field();
-      if (currentField == null) {
-        throw new IllegalArgumentException("Field must not be null");
-      }
-      if (previousField != null) {
-        // deduplicate
-        if (previousField.equals(currentField)) {
-          if (previousTerm.bytesEquals(currentTerm)){
-            continue;
-          }
-        } else {
-          final int start = lastTermsAndField == null ? 0 : lastTermsAndField.end;
-          lastTermsAndField = new TermsAndField(start, index, previousField);
-          termsAndFields.add(lastTermsAndField);
-        }
-      }
-      hash = 31 * hash + currentField.hashCode();
-      hash = 31 * hash + currentTerm.hashCode();
-      if (serializedTerms.length < lastEndOffset+currentTerm.length) {
-        serializedTerms = ArrayUtil.grow(serializedTerms, lastEndOffset+currentTerm.length);
-      }
-      System.arraycopy(currentTerm.bytes, currentTerm.offset, serializedTerms, lastEndOffset, currentTerm.length);
-      offsets[index] = lastEndOffset;
-      lastEndOffset += currentTerm.length;
-      index++;
-      previousTerm = currentTerm;
-      previousField = currentField;
-    }
-    offsets[index] = lastEndOffset;
-    final int start = lastTermsAndField == null ? 0 : lastTermsAndField.end;
-    lastTermsAndField = new TermsAndField(start, index, previousField);
-    termsAndFields.add(lastTermsAndField);
-    this.termsBytes = ArrayUtil.shrink(serializedTerms, lastEndOffset);
-    this.termsAndFields = termsAndFields.toArray(new TermsAndField[termsAndFields.size()]);
-    this.hashCode = hash;
-  }
-
   @Override
   public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
     if (!super.equals(obj)) {
       return false;
     }
     TermsQuery that = (TermsQuery) obj;
-    // first check the fields before even comparing the bytes
-    if (that.hashCode == hashCode && getBoost() == that.getBoost() && Arrays.equals(termsAndFields, that.termsAndFields)) {
-      int lastOffset = termsAndFields[termsAndFields.length - 1].end;
-      // compare offsets since we sort they must be identical
-      if (ArrayUtil.equals(offsets, 0, that.offsets, 0, lastOffset + 1)) {
-        // straight byte comparison since we sort they must be identical
-        return ArrayUtil.equals(termsBytes, 0, that.termsBytes, 0, offsets[lastOffset]);
-      }
-    }
-    return false;
+    // termData might be heavy to compare so check the hash code first
+    return termDataHashCode == that.termDataHashCode
+        && termData.equals(that.termData);
   }
 
   @Override
   public int hashCode() {
-    return super.hashCode() ^ this.hashCode;
+    return 31 * super.hashCode() + termDataHashCode;
   }
 
   @Override
   public String toString(String defaultField) {
     StringBuilder builder = new StringBuilder();
-    BytesRef spare = new BytesRef(termsBytes);
     boolean first = true;
-    for (int i = 0; i < termsAndFields.length; i++) {
-      TermsAndField current = termsAndFields[i];
-      for (int j = current.start; j < current.end; j++) {
-        spare.offset = offsets[j];
-        spare.length = offsets[j+1] - offsets[j];
-        if (!first) {
-          builder.append(' ');
-        }
-        first = false;
-        builder.append(current.field).append(':');
-        builder.append(spare.utf8ToString());
-      }
+    TermIterator iterator = termData.iterator();
+    for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
+      if (!first) {
+        builder.append(' ');
+      }
+      first = false;
+      builder.append(iterator.field()).append(':');
+      builder.append(term.utf8ToString());
     }
     builder.append(ToStringUtils.boost(getBoost()));

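The rewritten equals uses a screen-then-verify idiom: the cached 32-bit hash rejects nearly every mismatch before the potentially large buffer comparison runs, and hashCode itself stays O(1). The shape of the idiom outside Lucene:

    import java.util.Arrays;

    public class ScreenThenVerify {
      private final byte[] data;
      private final int cachedHash;   // computed once at construction

      ScreenThenVerify(byte[] data) {
        this.data = data;
        this.cachedHash = Arrays.hashCode(data);
      }

      @Override
      public int hashCode() {
        return cachedHash;
      }

      @Override
      public boolean equals(Object obj) {
        if (!(obj instanceof ScreenThenVerify)) return false;
        ScreenThenVerify that = (ScreenThenVerify) obj;
        return cachedHash == that.cachedHash      // cheap screen
            && Arrays.equals(data, that.data);    // expensive verify
      }
    }
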
@@ -250,41 +168,7 @@ public class TermsQuery extends Query implements Accountable {
 
   @Override
   public long ramBytesUsed() {
-    return BASE_RAM_BYTES_USED
-        + RamUsageEstimator.sizeOf(termsAndFields)
-        + RamUsageEstimator.sizeOf(termsBytes)
-        + RamUsageEstimator.sizeOf(offsets);
-  }
-
-  @Override
-  public Collection<Accountable> getChildResources() {
-    return Collections.emptyList();
-  }
-
-  private static final class TermsAndField implements Accountable {
-
-    private static final long BASE_RAM_BYTES_USED =
-        RamUsageEstimator.shallowSizeOfInstance(TermsAndField.class)
-            + RamUsageEstimator.shallowSizeOfInstance(String.class)
-            + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER; // header of the array held by the String
-
-    final int start;
-    final int end;
-    final String field;
-
-    TermsAndField(int start, int end, String field) {
-      super();
-      this.start = start;
-      this.end = end;
-      this.field = field;
-    }
-
-    @Override
-    public long ramBytesUsed() {
-      // this is an approximation since we don't actually know how strings store
-      // their data, which can be JVM-dependent
-      return BASE_RAM_BYTES_USED + field.length() * RamUsageEstimator.NUM_BYTES_CHAR;
-    }
+    return BASE_RAM_BYTES_USED + termData.ramBytesUsed();
   }
 
   @Override

@@ -293,63 +177,8 @@ public class TermsQuery extends Query implements Accountable {
   }
 
   @Override
-  public int hashCode() {
-    final int prime = 31;
-    int result = 1;
-    result = prime * result + ((field == null) ? 0 : field.hashCode());
-    result = prime * result + end;
-    result = prime * result + start;
-    return result;
-  }
-
-  @Override
-  public boolean equals(Object obj) {
-    if (this == obj) return true;
-    if (obj == null) return false;
-    if (getClass() != obj.getClass()) return false;
-    TermsAndField other = (TermsAndField) obj;
-    if (field == null) {
-      if (other.field != null) return false;
-    } else if (!field.equals(other.field)) return false;
-    if (end != other.end) return false;
-    if (start != other.start) return false;
-    return true;
-  }
-
-  }
-
-  private static abstract class FieldAndTermEnum {
-    protected String field;
-
-    public abstract BytesRef next();
-
-    public FieldAndTermEnum() {}
-
-    public FieldAndTermEnum(String field) { this.field = field; }
-
-    public String field() {
-      return field;
-    }
-  }
-
-  /*
-   * simple utility that returns the in-place sorted list
-   */
-  private static <T extends Comparable<? super T>> List<T> sort(List<T> toSort) {
-    if (toSort.isEmpty()) {
-      throw new IllegalArgumentException("no terms provided");
-    }
-    Collections.sort(toSort);
-    return toSort;
-  }
-
-  @Override
-  public Weight createWeight(IndexSearcher searcher, boolean needsScores)
-      throws IOException {
-    return new Weight(this) {
-
-      private float queryNorm;
-      private float queryWeight;
+  public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
+    return new ConstantScoreWeight(this) {
 
       @Override
       public void extractTerms(Set<Term> terms) {

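Switching the anonymous Weight to ConstantScoreWeight deletes the queryNorm/queryWeight bookkeeping and the hand-written explain: for constant-scoring queries the base class supplies getValueForNormalization, normalize, and explain, and hands the final boosted score into scorer. The resulting skeleton, as the API looks in this snapshot:

    Weight w = new ConstantScoreWeight(query) {
      @Override
      public void extractTerms(Set<Term> terms) {
        // expose matched terms if needed
      }

      @Override
      public Scorer scorer(LeafReaderContext context, Bits acceptDocs, float score)
          throws IOException {
        // build a DocIdSetIterator and wrap it in a Scorer whose score()
        // returns the constant passed in
        return null; // sketch only
      }
    };
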
@@ -360,63 +189,46 @@ public class TermsQuery extends Query implements Accountable {
       }
 
       @Override
-      public float getValueForNormalization() throws IOException {
-        queryWeight = getBoost();
-        return queryWeight * queryWeight;
-      }
-
-      @Override
-      public void normalize(float norm, float topLevelBoost) {
-        queryNorm = norm * topLevelBoost;
-        queryWeight *= queryNorm;
-      }
-
-      @Override
-      public Explanation explain(LeafReaderContext context, int doc) throws IOException {
-        final Scorer s = scorer(context, context.reader().getLiveDocs());
-        final boolean exists = (s != null && s.advance(doc) == doc);
-
-        if (exists) {
-          return Explanation.match(queryWeight, TermsQuery.this.toString() + ", product of:",
-              Explanation.match(getBoost(), "boost"), Explanation.match(queryNorm, "queryNorm"));
-        } else {
-          return Explanation.noMatch(TermsQuery.this.toString() + " doesn't match id " + doc);
-        }
-      }
-
-      @Override
-      public Scorer scorer(LeafReaderContext context, Bits acceptDocs) throws IOException {
+      public Scorer scorer(LeafReaderContext context, Bits acceptDocs, float score) throws IOException {
         final LeafReader reader = context.reader();
         BitDocIdSet.Builder builder = new BitDocIdSet.Builder(reader.maxDoc());
         final Fields fields = reader.fields();
-        final BytesRef spare = new BytesRef(termsBytes);
+        String lastField = null;
         Terms terms = null;
         TermsEnum termsEnum = null;
        PostingsEnum docs = null;
-        for (TermsAndField termsAndField : termsAndFields) {
-          if ((terms = fields.terms(termsAndField.field)) != null) {
-            termsEnum = terms.iterator(); // this won't return null
-            for (int i = termsAndField.start; i < termsAndField.end; i++) {
-              spare.offset = offsets[i];
-              spare.length = offsets[i+1] - offsets[i];
-              if (termsEnum.seekExact(spare)) {
-                docs = termsEnum.postings(acceptDocs, docs, PostingsEnum.NONE); // no freq since we don't need them
-                builder.or(docs);
-              }
-            }
-          }
-        }
+        TermIterator iterator = termData.iterator();
+        for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
+          String field = iterator.field();
+          // comparing references is fine here
+          if (field != lastField) {
+            terms = fields.terms(field);
+            if (terms == null) {
+              termsEnum = null;
+            } else {
+              termsEnum = terms.iterator();
+            }
+            lastField = field;
+          }
+          if (termsEnum != null && termsEnum.seekExact(term)) {
+            docs = termsEnum.postings(acceptDocs, docs, PostingsEnum.NONE);
+            builder.or(docs);
+          }
+        }
         BitDocIdSet result = builder.build();
         if (result == null) {
           return null;
         }
 
         final DocIdSetIterator disi = result.iterator();
+        if (disi == null) {
+          return null;
+        }
+
         return new Scorer(this) {
 
           @Override
           public float score() throws IOException {
-            return queryWeight;
+            return score;
           }
 
           @Override

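The new scorer walks terms in sorted order, so all terms of a field are adjacent: the reference comparison on the interned field string spots field changes, each field's TermsEnum is obtained once, and every matching term's postings are OR'ed into one bitset. The core of that loop reduced to a single term, a sketch assuming a LeafReader is at hand:

    import java.io.IOException;

    import org.apache.lucene.index.LeafReader;
    import org.apache.lucene.index.PostingsEnum;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.search.DocIdSetIterator;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.FixedBitSet;

    public class MatchOneTerm {
      static FixedBitSet match(LeafReader reader, String field, BytesRef term)
          throws IOException {
        FixedBitSet bits = new FixedBitSet(reader.maxDoc());
        Terms terms = reader.fields().terms(field);
        if (terms != null) {
          TermsEnum termsEnum = terms.iterator();
          if (termsEnum.seekExact(term)) {
            // NONE: we only need doc ids, not frequencies
            PostingsEnum postings = termsEnum.postings(null, null, PostingsEnum.NONE);
            for (int doc = postings.nextDoc();
                 doc != DocIdSetIterator.NO_MORE_DOCS;
                 doc = postings.nextDoc()) {
              bits.set(doc);   // OR this term's documents into the accumulator
            }
          }
        }
        return bits;
      }
    }
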
@@ -37,6 +37,7 @@ import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.ConstantScoreQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryUtils;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;

@@ -185,30 +186,6 @@ public class TermsQueryTest extends LuceneTestCase {
     assertFalse(left.equals(right));
   }
 
-  public void testNoTerms() {
-    List<Term> emptyTerms = Collections.emptyList();
-    List<BytesRef> emptyBytesRef = Collections.emptyList();
-    try {
-      new TermsQuery(emptyTerms);
-      fail("must fail - no terms!");
-    } catch (IllegalArgumentException e) {}
-
-    try {
-      new TermsQuery(emptyTerms.toArray(new Term[0]));
-      fail("must fail - no terms!");
-    } catch (IllegalArgumentException e) {}
-
-    try {
-      new TermsQuery(null, emptyBytesRef.toArray(new BytesRef[0]));
-      fail("must fail - no terms!");
-    } catch (IllegalArgumentException e) {}
-
-    try {
-      new TermsQuery(null, emptyBytesRef);
-      fail("must fail - no terms!");
-    } catch (IllegalArgumentException e) {}
-  }
-
   public void testToString() {
     TermsQuery termsQuery = new TermsQuery(new Term("field1", "a"),
         new Term("field1", "b"),

@@ -216,6 +193,24 @@ public class TermsQueryTest extends LuceneTestCase {
     assertEquals("field1:a field1:b field1:c", termsQuery.toString());
   }
 
+  public void testDedup() {
+    Query query1 = new TermsQuery(new Term("foo", "bar"));
+    Query query2 = new TermsQuery(new Term("foo", "bar"), new Term("foo", "bar"));
+    QueryUtils.checkEqual(query1, query2);
+  }
+
+  public void testOrderDoesNotMatter() {
+    // order of terms is different
+    Query query1 = new TermsQuery(new Term("foo", "bar"), new Term("foo", "baz"));
+    Query query2 = new TermsQuery(new Term("foo", "baz"), new Term("foo", "bar"));
+    QueryUtils.checkEqual(query1, query2);
+
+    // order of fields is different
+    query1 = new TermsQuery(new Term("foo", "bar"), new Term("bar", "bar"));
+    query2 = new TermsQuery(new Term("bar", "bar"), new Term("foo", "bar"));
+    QueryUtils.checkEqual(query1, query2);
+  }
+
   public void testRamBytesUsed() {
     List<Term> terms = new ArrayList<>();
     final int numTerms = 1000 + random().nextInt(1000);

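QueryUtils.checkEqual asserts equality in both directions and that the hash codes agree, so these two tests pin down exactly the invariants the termData-based equals/hashCode must provide. A complementary negative case in the same style, using the test framework's checkUnequal helper:

    Query q1 = new TermsQuery(new Term("foo", "bar"));
    Query q2 = new TermsQuery(new Term("foo", "baz"));
    QueryUtils.checkUnequal(q1, q2);  // different term data must not compare equal
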
@@ -225,8 +220,7 @@ public class TermsQueryTest extends LuceneTestCase {
     TermsQuery query = new TermsQuery(terms);
     final long actualRamBytesUsed = RamUsageTester.sizeOf(query);
     final long expectedRamBytesUsed = query.ramBytesUsed();
-    // error margin within 1%
-    assertEquals(actualRamBytesUsed, expectedRamBytesUsed, actualRamBytesUsed / 100);
+    // error margin within 5%
+    assertEquals(actualRamBytesUsed, expectedRamBytesUsed, actualRamBytesUsed / 20);
   }
 
 }