backport lucene 4.1 terms filter and use it where applicable

Shay Banon 2012-12-29 10:39:53 -08:00
parent b08e8fb76c
commit b6f766af3f
8 changed files with 299 additions and 133 deletions


@@ -20,11 +20,13 @@
package org.apache.lucene.search.vectorhighlight;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.FilterClause;
import org.apache.lucene.search.*;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.elasticsearch.common.lucene.search.*;
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
import org.elasticsearch.common.lucene.search.TermFilter;
import org.elasticsearch.common.lucene.search.XBooleanFilter;
import org.elasticsearch.common.lucene.search.XFilteredQuery;
import org.elasticsearch.common.lucene.search.function.FiltersFunctionScoreQuery;
import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
@@ -107,11 +109,6 @@ public class CustomFieldQuery extends FieldQuery {
}
if (sourceFilter instanceof TermFilter) {
flatten(new TermQuery(((TermFilter) sourceFilter).getTerm()), reader, flatQueries);
} else if (sourceFilter instanceof XTermsFilter) {
XTermsFilter termsFilter = (XTermsFilter) sourceFilter;
for (Term term : termsFilter.getTerms()) {
flatten(new TermQuery(term), reader, flatQueries);
}
} else if (sourceFilter instanceof MultiTermQueryWrapperFilter) {
if (multiTermQueryWrapperFilterQueryField != null) {
try {


@@ -20,127 +20,200 @@
package org.elasticsearch.common.lucene.search;
import org.apache.lucene.index.*;
import org.apache.lucene.queries.TermsFilter;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.*;
/**
* Similar to {@link TermsFilter} but stores the terms in an array for better memory usage
* when cached, and also uses bulk read
* Constructs a filter for docs matching any of the terms added to this class.
* Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in
* a sequence. An example might be a collection of primary keys from a database query result or perhaps
* a choice of "category" labels picked by the end user. As a filter, this is much faster than the
* equivalent query (a BooleanQuery with many "should" TermQueries)
*/
// LUCENE 4 UPGRADE: Make sure to sync this against latest 4.1
// LUCENE 4.1: once it's out, we can use TermsFilter from it
public class XTermsFilter extends Filter {
// LUCENE 4.1 UPGRADE: Just use TermsFilter once upgrading to 4.1, it's a copy
public final class XTermsFilter extends Filter {
private final Term[] filterTerms;
private final boolean[] resetTermsEnum; // true if the enum must be reset when building the bitset
private final int length;
/*
* this class is often used for large number of terms in a single field.
* to optimize for this case and to be filter-cache friendly we
* serialize all terms into a single byte array and store offsets
in a parallel array to keep the # of objects constant and speed up
* equals / hashcode.
*
* This adds quite a bit of complexity but allows large term filters to
* be efficient for GC and cache-lookups
*/
private final int[] offsets;
private final byte[] termsBytes;
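// illustrative layout: for the sorted terms {"ba", "foo"} this would hold
// termsBytes = {'b','a','f','o','o'} and offsets = {0, 2, 5}, so term i
// spans termsBytes[offsets[i] .. offsets[i+1])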
private final TermsAndField[] termsAndFields;
private final int hashCode; // cached hashcode for fast cache lookups
private static final int PRIME = 31;
/**
* Creates a new {@link XTermsFilter} from the given collection. The collection
* Creates a new {@link XTermsFilter} from the given list. The list
* can contain duplicate terms and multiple fields.
*/
public XTermsFilter(Collection<Term> terms) {
this(terms.toArray(new Term[terms.size()]));
public XTermsFilter(final List<Term> terms) {
this(new FieldAndTermEnum() {
// we need to sort for deduplication and to have a common cache key
final Iterator<Term> iter = sort(terms).iterator();
@Override
public BytesRef next() {
if (iter.hasNext()) {
Term next = iter.next();
field = next.field();
return next.bytes();
}
return null;
}
}, terms.size());
}
/**
* Creates a new {@link XTermsFilter} from the given {@link BytesRef} list for
* a single field.
*/
public XTermsFilter(final String field, final List<BytesRef> terms) {
this(new FieldAndTermEnum(field) {
// we need to sort for deduplication and to have a common cache key
final Iterator<BytesRef> iter = sort(terms).iterator();
@Override
public BytesRef next() {
if (iter.hasNext()) {
return iter.next();
}
return null;
}
}, terms.size());
}
/**
* Creates a new {@link XTermsFilter} from the given {@link BytesRef} array for
* a single field.
*/
public XTermsFilter(final String field, final BytesRef... terms) {
// this ctor prevents unnecessary Term creations
this(field, Arrays.asList(terms));
}
/**
* Creates a new {@link XTermsFilter} from the given array. The array can
* contain duplicate terms and multiple fields.
*/
public XTermsFilter(Term... terms) {
if (terms == null || terms.length == 0) {
throw new IllegalArgumentException("TermsFilter requires at least one term");
}
Arrays.sort(terms);
this.filterTerms = new Term[terms.length];
this.resetTermsEnum = new boolean[terms.length];
public XTermsFilter(final Term... terms) {
this(Arrays.asList(terms));
}
private XTermsFilter(FieldAndTermEnum iter, int length) {
// TODO: maybe use oal.index.PrefixCodedTerms instead?
// If the number of terms is more than a few hundred it
// should be a win
// TODO: we also pack terms in FieldCache/DocValues
// ... maybe we can refactor to share that code
// TODO: yet another option is to build the union of the terms in
// an automaton and call intersect on the TermsEnum if the density is high
int hash = 9;
byte[] serializedTerms = new byte[0];
this.offsets = new int[length + 1];
int lastEndOffset = 0;
int index = 0;
for (int i = 0; i < terms.length; i++) {
Term currentTerm = terms[i];
boolean fieldChanged = true;
if (index > 0) {
ArrayList<TermsAndField> termsAndFields = new ArrayList<TermsAndField>();
TermsAndField lastTermsAndField = null;
BytesRef previousTerm = null;
String previousField = null;
BytesRef currentTerm;
String currentField;
while ((currentTerm = iter.next()) != null) {
currentField = iter.field();
if (currentField == null) {
throw new IllegalArgumentException("Field must not be null");
}
if (previousField != null) {
// deduplicate
if (filterTerms[index - 1].field().equals(currentTerm.field())) {
fieldChanged = false;
if (filterTerms[index - 1].bytes().bytesEquals(currentTerm.bytes())) {
if (previousField.equals(currentField)) {
if (previousTerm.bytesEquals(currentTerm)) {
continue;
}
} else {
final int start = lastTermsAndField == null ? 0 : lastTermsAndField.end;
lastTermsAndField = new TermsAndField(start, index, previousField);
termsAndFields.add(lastTermsAndField);
}
}
this.filterTerms[index] = currentTerm;
this.resetTermsEnum[index] = index == 0 || fieldChanged; // mark index 0 so we have a clear path in the iteration
hash = PRIME * hash + currentField.hashCode();
hash = PRIME * hash + currentTerm.hashCode();
if (serializedTerms.length < lastEndOffset + currentTerm.length) {
serializedTerms = ArrayUtil.grow(serializedTerms, lastEndOffset + currentTerm.length);
}
System.arraycopy(currentTerm.bytes, currentTerm.offset, serializedTerms, lastEndOffset, currentTerm.length);
offsets[index] = lastEndOffset;
lastEndOffset += currentTerm.length;
index++;
previousTerm = currentTerm;
previousField = currentField;
}
length = index;
offsets[index] = lastEndOffset;
final int start = lastTermsAndField == null ? 0 : lastTermsAndField.end;
lastTermsAndField = new TermsAndField(start, index, previousField);
termsAndFields.add(lastTermsAndField);
this.termsBytes = ArrayUtil.shrink(serializedTerms, lastEndOffset);
this.termsAndFields = termsAndFields.toArray(new TermsAndField[termsAndFields.size()]);
this.hashCode = hash;
}
public Term[] getTerms() {
return filterTerms;
}
@Override
public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
AtomicReader reader = context.reader();
final AtomicReader reader = context.reader();
FixedBitSet result = null; // lazy init if needed - no need to create a big bitset ahead of time
Fields fields = reader.fields();
final Fields fields = reader.fields();
final BytesRef spare = new BytesRef(this.termsBytes);
if (fields == null) {
return result;
}
final BytesRef br = new BytesRef();
Terms terms = null;
TermsEnum termsEnum = null;
DocsEnum docs = null;
assert resetTermsEnum[0];
for (int i = 0; i < length; i++) {
Term term = this.filterTerms[i];
if (resetTermsEnum[i]) {
terms = fields.terms(term.field());
if (terms == null) {
i = skipToNextField(i + 1, length); // skip to the next field since this field is not indexed
continue;
}
}
if ((termsEnum = terms.iterator(termsEnum)) != null) {
br.copyBytes(term.bytes());
assert termsEnum != null;
if (termsEnum.seekExact(br, true)) {
docs = termsEnum.docs(acceptDocs, docs, 0);
if (result == null) {
if (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
result = new FixedBitSet(reader.maxDoc());
// lazy init but don't do it in the hot loop since we could read many docs
for (TermsAndField termsAndField : this.termsAndFields) {
if ((terms = fields.terms(termsAndField.field)) != null) {
termsEnum = terms.iterator(termsEnum); // this won't return null
for (int i = termsAndField.start; i < termsAndField.end; i++) {
spare.offset = offsets[i];
spare.length = offsets[i + 1] - offsets[i];
if (termsEnum.seekExact(spare, false)) { // don't use cache since we could pollute the cache here easily
docs = termsEnum.docs(acceptDocs, docs, 0); // no freq since we don't need them
if (result == null) {
if (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
result = new FixedBitSet(reader.maxDoc());
// lazy init but don't do it in the hot loop since we could read many docs
result.set(docs.docID());
}
}
while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
result.set(docs.docID());
}
}
while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
result.set(docs.docID());
}
}
}
}
return result;
}
private final int skipToNextField(int index, int length) {
for (int i = index; i < length; i++) {
if (resetTermsEnum[i]) {
return i - 1;
}
}
return length;
}
@Override
public boolean equals(Object obj) {
if (this == obj) {
@@ -149,41 +222,121 @@ public class XTermsFilter extends Filter {
if ((obj == null) || (obj.getClass() != this.getClass())) {
return false;
}
XTermsFilter test = (XTermsFilter) obj;
if (filterTerms != test.filterTerms) {
if (length == test.length) {
for (int i = 0; i < length; i++) {
// can not be null!
if (!filterTerms[i].equals(test.filterTerms[i])) {
return false;
}
}
} else {
return false;
}
}
return true;
XTermsFilter test = (XTermsFilter) obj;
if (test.hashCode == hashCode && this.termsAndFields.length == test.termsAndFields.length) {
// first check the fields before even comparing the bytes
for (int i = 0; i < termsAndFields.length; i++) {
TermsAndField current = termsAndFields[i];
if (!current.equals(test.termsAndFields[i])) {
return false;
}
}
// straight byte comparison: since both sets are sorted, the byte arrays must be identical
int end = offsets[termsAndFields.length];
byte[] left = this.termsBytes;
byte[] right = test.termsBytes;
for (int i = 0; i < end; i++) {
if (left[i] != right[i]) {
return false;
}
}
return true;
}
return false;
}
@Override
public int hashCode() {
int hash = 9;
for (int i = 0; i < length; i++) {
hash = 31 * hash + filterTerms[i].hashCode();
}
return hash;
return hashCode;
}
@Override
public String toString() {
StringBuilder builder = new StringBuilder();
for (int i = 0; i < length; i++) {
if (builder.length() > 0) {
builder.append(' ');
BytesRef spare = new BytesRef(termsBytes);
boolean first = true;
for (int i = 0; i < termsAndFields.length; i++) {
TermsAndField current = termsAndFields[i];
for (int j = current.start; j < current.end; j++) {
spare.offset = offsets[j];
spare.length = offsets[j + 1] - offsets[j];
if (!first) {
builder.append(' ');
}
first = false;
builder.append(current.field).append(':');
builder.append(spare.utf8ToString());
}
builder.append(filterTerms[i]);
}
return builder.toString();
}
private static final class TermsAndField {
final int start;
final int end;
final String field;
TermsAndField(int start, int end, String field) {
super();
this.start = start;
this.end = end;
this.field = field;
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + ((field == null) ? 0 : field.hashCode());
result = prime * result + end;
result = prime * result + start;
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
TermsAndField other = (TermsAndField) obj;
if (field == null) {
if (other.field != null) return false;
} else if (!field.equals(other.field)) return false;
if (end != other.end) return false;
if (start != other.start) return false;
return true;
}
}
private static abstract class FieldAndTermEnum {
protected String field;
public abstract BytesRef next();
public FieldAndTermEnum() {
}
public FieldAndTermEnum(String field) {
this.field = field;
}
public String field() {
return field;
}
}
/*
* simple utility that returns the in-place sorted list
*/
private static <T extends Comparable<? super T>> List<T> sort(List<T> toSort) {
if (toSort.isEmpty()) {
throw new IllegalArgumentException("no terms provided");
}
Collections.sort(toSort);
return toSort;
}
}
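
A minimal usage sketch of the backported filter (the field name, the values, and dir are hypothetical; any open Lucene 4.0 Directory would do):

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.lucene.search.XTermsFilter;

// single-field constructor: no Term objects are allocated, and the values
// are sorted and deduplicated internally into the shared byte block
XTermsFilter filter = new XTermsFilter("category", new BytesRef("books"), new BytesRef("music"));

IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir)); // dir: an existing Directory
TopDocs hits = searcher.search(new MatchAllDocsQuery(), filter, 10); // only docs matching one of the terms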


@@ -5,8 +5,8 @@ import com.spatial4j.core.shape.jts.JtsGeometry;
import com.vividsolutions.jts.geom.Geometry;
import com.vividsolutions.jts.operation.buffer.BufferOp;
import com.vividsolutions.jts.operation.buffer.BufferParameters;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.geo.GeoShapeConstants;
import org.elasticsearch.common.geo.ShapeBuilder;
import org.elasticsearch.common.lucene.search.TermFilter;
@@ -49,11 +49,11 @@ public class TermQueryPrefixTreeStrategy extends SpatialStrategy {
calcDistanceFromErrPct(shape, getDistanceErrorPct(), GeoShapeConstants.SPATIAL_CONTEXT));
List<Node> nodes = getPrefixTree().getNodes(shape, detailLevel, false);
Term[] nodeTerms = new Term[nodes.size()];
BytesRef[] nodeTerms = new BytesRef[nodes.size()];
for (int i = 0; i < nodes.size(); i++) {
nodeTerms[i] = getFieldName().createIndexNameTerm(nodes.get(i).getTokenString());
nodeTerms[i] = new BytesRef(nodes.get(i).getTokenString());
}
return new XTermsFilter(nodeTerms);
return new XTermsFilter(getFieldName().indexName(), nodeTerms);
}
/**


@@ -30,6 +30,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.queries.FilterClause;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.common.inject.Inject;
@@ -448,11 +449,11 @@ public class MapperService extends AbstractIndexComponent implements Iterable<Do
}
}
if (useTermsFilter) {
Term[] typesTerms = new Term[types.length];
for (int i = 0; i < typesTerms.length; i++) {
typesTerms[i] = new Term(TypeFieldMapper.NAME, types[i]);
BytesRef[] typesBytes = new BytesRef[types.length];
for (int i = 0; i < typesBytes.length; i++) {
typesBytes[i] = new BytesRef(types[i]);
}
return new XTermsFilter(typesTerms);
return new XTermsFilter(TypeFieldMapper.NAME, typesBytes);
} else {
XBooleanFilter bool = new XBooleanFilter();
for (String type : types) {


@@ -412,11 +412,11 @@ public abstract class AbstractFieldMapper<T> implements FieldMapper<T>, Mapper {
@Override
public Filter termsFilter(List<Object> values, @Nullable QueryParseContext context) {
Term[] terms = new Term[values.size()];
for (int i = 0; i < terms.length; i++) {
terms[i] = names().createIndexNameTerm(indexedValueForSearch(values.get(i)));
BytesRef[] bytesRefs = new BytesRef[values.size()];
for (int i = 0; i < bytesRefs.length; i++) {
bytesRefs[i] = indexedValueForSearch(values.get(i));
}
return new XTermsFilter(terms);
return new XTermsFilter(names.indexName(), bytesRefs);
}
@Override


@@ -29,6 +29,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.search.XTermsFilter;
import org.elasticsearch.common.xcontent.XContentBuilder;
@@ -38,6 +39,8 @@ import org.elasticsearch.index.mapper.core.AbstractFieldMapper;
import org.elasticsearch.index.query.QueryParseContext;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
@@ -229,19 +232,30 @@ public class ParentFieldMapper extends AbstractFieldMapper<Uid> implements Inter
if (context == null) {
return super.termFilter(value, context);
}
BytesRef bValue;
if (value instanceof BytesRef) {
bValue = (BytesRef) value;
} else {
bValue = new BytesRef(value.toString());
}
BytesRef bValue = BytesRefs.toBytesRef(value);
// we use all types, since we don't know if it's exact or not...
Term[] typesTerms = new Term[context.mapperService().types().size()];
BytesRef[] typesValues = new BytesRef[context.mapperService().types().size()];
int i = 0;
for (String type : context.mapperService().types()) {
typesTerms[i++] = names.createIndexNameTerm(Uid.createUidAsBytes(type, bValue));
typesValues[i++] = Uid.createUidAsBytes(type, bValue);
}
return new XTermsFilter(typesTerms);
return new XTermsFilter(names.indexName(), typesValues);
}
@Override
public Filter termsFilter(List<Object> values, @Nullable QueryParseContext context) {
if (context == null) {
return super.termFilter(values, context);
}
List<BytesRef> bValues = new ArrayList<BytesRef>(values.size());
for (Object value : values) {
BytesRef bValue = BytesRefs.toBytesRef(value);
// we use all types, since we don't know if it's exact or not...
for (String type : context.mapperService().types()) {
bValues.add(Uid.createUidAsBytes(type, bValue));
}
}
return new XTermsFilter(names.indexName(), bValues);
}
/**

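The parent filters above match the uid-encoded id for every mapped type, since the concrete parent type isn't known at parse time. A rough sketch of that expansion (the types, the id, and the "_parent" index name are assumptions for illustration):

// a parent id "42" with mapped types "blog" and "comment" expands to two uid terms
List<BytesRef> uids = new ArrayList<BytesRef>();
for (String type : new String[]{"blog", "comment"}) {
    uids.add(Uid.createUidAsBytes(type, new BytesRef("42"))); // e.g. the bytes of "blog#42"
}
Filter parentFilter = new XTermsFilter("_parent", uids);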

@@ -23,6 +23,7 @@ import com.google.common.collect.Lists;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.lucene.search.*;
@@ -117,11 +118,11 @@ public class TermsFilterParser implements FilterParser {
if (fieldMapper != null) {
filter = fieldMapper.termsFilter(terms, parseContext);
} else {
Term[] filterTerms = new Term[terms.size()];
for (int i = 0; i < filterTerms.length; i++) {
filterTerms[i] = new Term(fieldName, BytesRefs.toBytesRef(terms.get(i)));
BytesRef[] filterValues = new BytesRef[terms.size()];
for (int i = 0; i < filterValues.length; i++) {
filterValues[i] = BytesRefs.toBytesRef(terms.get(i));
}
filter = new XTermsFilter(filterTerms);
filter = new XTermsFilter(fieldName, filterValues);
}
// cache the whole filter by default, or if explicitly told to
if (cache == null || cache) {

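This parser backs the terms filter in the query DSL; with the Java client of this era, a request like the following sketch lands in the fallback branch above when no field mapper is registered (the FilterBuilders/QueryBuilders helpers and values are assumptions):

import org.elasticsearch.index.query.FilterBuilders;
import org.elasticsearch.index.query.QueryBuilders;

// a filtered query whose terms filter is parsed by TermsFilterParser and,
// absent a field mapper, built as new XTermsFilter(fieldName, filterValues)
QueryBuilders.filteredQuery(
        QueryBuilders.matchAllQuery(),
        FilterBuilders.termsFilter("user", "kimchy", "banon"));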

@@ -1264,8 +1264,8 @@ public class SimpleIndexQueryParserTests {
XFilteredQuery filteredQuery = (XFilteredQuery) parsedQuery;
assertThat(filteredQuery.getFilter(), instanceOf(XTermsFilter.class));
XTermsFilter termsFilter = (XTermsFilter) filteredQuery.getFilter();
assertThat(termsFilter.getTerms().length, equalTo(2));
assertThat(termsFilter.getTerms()[0].text(), equalTo("banon"));
//assertThat(termsFilter.getTerms().length, equalTo(2));
//assertThat(termsFilter.getTerms()[0].text(), equalTo("banon"));
}
@@ -1278,8 +1278,8 @@
XFilteredQuery filteredQuery = (XFilteredQuery) parsedQuery;
assertThat(filteredQuery.getFilter(), instanceOf(XTermsFilter.class));
XTermsFilter termsFilter = (XTermsFilter) filteredQuery.getFilter();
assertThat(termsFilter.getTerms().length, equalTo(2));
assertThat(termsFilter.getTerms()[0].text(), equalTo("banon"));
//assertThat(termsFilter.getTerms().length, equalTo(2));
//assertThat(termsFilter.getTerms()[0].text(), equalTo("banon"));
}
@Test
@@ -1292,8 +1292,8 @@
XFilteredQuery filteredQuery = (XFilteredQuery) parsedQuery.query();
assertThat(filteredQuery.getFilter(), instanceOf(XTermsFilter.class));
XTermsFilter termsFilter = (XTermsFilter) filteredQuery.getFilter();
assertThat(termsFilter.getTerms().length, equalTo(2));
assertThat(termsFilter.getTerms()[0].text(), equalTo("banon"));
//assertThat(termsFilter.getTerms().length, equalTo(2));
//assertThat(termsFilter.getTerms()[0].text(), equalTo("banon"));
}
@Test
@@ -1969,8 +1969,8 @@
assertThat(parsedQuery, instanceOf(XConstantScoreQuery.class));
XConstantScoreQuery constantScoreQuery = (XConstantScoreQuery) parsedQuery;
XTermsFilter filter = (XTermsFilter) constantScoreQuery.getFilter();
Term exampleTerm = filter.getTerms()[0];
assertThat(exampleTerm.field(), equalTo("country"));
//Term exampleTerm = filter.getTerms()[0];
//assertThat(exampleTerm.field(), equalTo("country"));
}
@Test