lucene 4: cleanup terms/uid filter

Shay Banon 2012-11-02 10:25:56 +01:00
parent 79368bb221
commit 0660e20c47
4 changed files with 114 additions and 123 deletions

DeletionAwareConstantScoreQuery.java

@@ -28,6 +28,7 @@ import org.elasticsearch.common.lucene.search.NotDeletedFilter;
 // So it can basically be cached safely even with a reader that changes deletions but remains with the same cache key
 // See more: https://issues.apache.org/jira/browse/LUCENE-2468
 // TODO Lucene 4.0 won't need this, since live docs are "and'ed" while scoring
+// LUCENE 4 UPGRADE: we probably don't need this anymore, because of acceptDocs
 public class DeletionAwareConstantScoreQuery extends ConstantScoreQuery {
     private final Filter actualFilter;
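The new comment refers to Lucene 4's Filter#getDocIdSet(AtomicReaderContext, Bits) contract: the searcher passes the reader's live docs in as acceptDocs, so deletions are excluded without a wrapper query. A minimal sketch of that contract (illustrative only, not code from this commit; the class name is made up):

import java.io.IOException;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;

// Hypothetical filter matching every live doc: in Lucene 4 a filter no longer
// re-checks deletions itself, it simply honors the acceptDocs it is given.
public class MatchAllLiveDocsFilter extends Filter {

    @Override
    public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
        // acceptDocs is typically context.reader().getLiveDocs(); any doc it
        // rejects (e.g. a deleted doc) must not be set in the returned set.
        int maxDoc = context.reader().maxDoc();
        FixedBitSet bits = new FixedBitSet(maxDoc);
        for (int doc = 0; doc < maxDoc; doc++) {
            if (acceptDocs == null || acceptDocs.get(doc)) {
                bits.set(doc);
            }
        }
        return bits;
    }
}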

XTermsFilter.java

@@ -19,13 +19,7 @@
 package org.apache.lucene.search;

-import org.apache.lucene.index.AtomicReader;
-import org.apache.lucene.index.AtomicReaderContext;
-import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.*;
 import org.apache.lucene.queries.TermsFilter;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
@@ -39,51 +33,52 @@ import java.util.Collection;
  * Similar to {@link TermsFilter} but stores the terms in an array for better memory usage
  * when cached, and also uses bulk read
  */
-// LUCENE MONITOR: Against TermsFilter - this is now identical to TermsFilter once 4.1 is released
+// LUCENE 4 UPGRADE: Make sure to sync this against latest 4.1
+// LUCENE 4.1: once it's out, we can use TermsFilter from it
 public class XTermsFilter extends Filter {

     private final Term[] filterTerms;
     private final boolean[] resetTermsEnum; // true if the enum must be reset when building the bitset
     private final int length;

     /**
      * Creates a new {@link XTermsFilter} from the given collection. The collection
      * can contain duplicate terms and multiple fields.
      */
     public XTermsFilter(Collection<Term> terms) {
         this(terms.toArray(new Term[terms.size()]));
     }

     /**
      * Creates a new {@link XTermsFilter} from the given array. The array can
      * contain duplicate terms and multiple fields.
      */
     public XTermsFilter(Term... terms) {
         if (terms == null || terms.length == 0) {
             throw new IllegalArgumentException("TermsFilter requires at least one term");
         }
         Arrays.sort(terms);
         this.filterTerms = new Term[terms.length];
         this.resetTermsEnum = new boolean[terms.length];
         int index = 0;
         for (int i = 0; i < terms.length; i++) {
             Term currentTerm = terms[i];
             boolean fieldChanged = true;
             if (index > 0) {
                 // deduplicate
                 if (filterTerms[index - 1].field().equals(currentTerm.field())) {
                     fieldChanged = false;
                     if (filterTerms[index - 1].bytes().bytesEquals(currentTerm.bytes())) {
                         continue;
                     }
                 }
             }
             this.filterTerms[index] = currentTerm;
             this.resetTermsEnum[index] = index == 0 || fieldChanged; // mark index 0 so we have a clear path in the iteration
             index++;
         }
         length = index;
     }

     public Term[] getTerms() {
@@ -92,100 +87,100 @@ public class XTermsFilter extends Filter {
     @Override
     public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
         AtomicReader reader = context.reader();
         FixedBitSet result = null; // lazy init if needed - no need to create a big bitset ahead of time
         Fields fields = reader.fields();
         if (fields == null) {
             return result;
         }
         final BytesRef br = new BytesRef();
         Terms terms = null;
         TermsEnum termsEnum = null;
         DocsEnum docs = null;
         assert resetTermsEnum[0];
         for (int i = 0; i < length; i++) {
             Term term = this.filterTerms[i];
             if (resetTermsEnum[i]) {
                 terms = fields.terms(term.field());
                 if (terms == null) {
                     i = skipToNextField(i + 1, length); // skip to the next field since this field is not indexed
                     continue;
                 }
             }
             if ((termsEnum = terms.iterator(termsEnum)) != null) {
                 br.copyBytes(term.bytes());
                 assert termsEnum != null;
                 if (termsEnum.seekExact(br, true)) {
                     docs = termsEnum.docs(acceptDocs, docs, 0);
                     if (result == null) {
                         if (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                             result = new FixedBitSet(reader.maxDoc());
                             // lazy init but don't do it in the hot loop since we could read many docs
                             result.set(docs.docID());
                         }
                     }
                     while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                         result.set(docs.docID());
                     }
                 }
             }
         }
         return result;
     }
     private final int skipToNextField(int index, int length) {
         for (int i = index; i < length; i++) {
             if (resetTermsEnum[i]) {
                 return i - 1;
             }
         }
         return length;
     }
     @Override
     public boolean equals(Object obj) {
         if (this == obj) {
             return true;
         }
         if ((obj == null) || (obj.getClass() != this.getClass())) {
             return false;
         }
         XTermsFilter test = (XTermsFilter) obj;
         if (filterTerms != test.filterTerms) {
             if (length == test.length) {
                 for (int i = 0; i < length; i++) {
                     // can not be null!
                     if (!filterTerms[i].equals(test.filterTerms[i])) {
                         return false;
                     }
                 }
             } else {
                 return false;
             }
         }
         return true;
     }
     @Override
     public int hashCode() {
         int hash = 9;
         for (int i = 0; i < length; i++) {
             hash = 31 * hash + filterTerms[i].hashCode();
         }
         return hash;
     }
     @Override
     public String toString() {
         StringBuilder builder = new StringBuilder();
         for (int i = 0; i < length; i++) {
             if (builder.length() > 0) {
                 builder.append(' ');
             }
             builder.append(filterTerms[i]);
         }
         return builder.toString();
     }
 }
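For orientation, a hedged usage sketch of XTermsFilter as cleaned up above (field names and values are made up; this is not part of the commit). The constructor sorts and deduplicates the terms, and getDocIdSet then seeks each term with one TermsEnum per field, OR-ing matches into a lazily allocated FixedBitSet:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.XTermsFilter;

public class XTermsFilterExample {

    public static void main(String[] args) {
        // Terms are sorted (by field, then bytes) and duplicates are dropped,
        // which lets getDocIdSet reuse a single TermsEnum per field.
        XTermsFilter filter = new XTermsFilter(
                new Term("user", "kimchy"),
                new Term("user", "kimchy"), // duplicate, dropped by the constructor
                new Term("tag", "lucene"));

        System.out.println(filter); // prints: tag:lucene user:kimchy

        // Typical use: wrap in a ConstantScoreQuery and hand it to an IndexSearcher.
        ConstantScoreQuery query = new ConstantScoreQuery(filter);
    }
}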

IdFieldMapper.java

@@ -166,7 +166,7 @@ public class IdFieldMapper extends AbstractFieldMapper<String> implements Intern
         if (indexed() || context == null) {
             return super.fieldQuery(value, context);
         }
-        UidFilter filter = new UidFilter(context.queryTypes(), ImmutableList.of(value), context.indexCache().bloomCache());
+        UidFilter filter = new UidFilter(context.queryTypes(), ImmutableList.of(value));
         // no need for constant score filter, since we don't cache the filter, and it always takes deletes into account
         return new ConstantScoreQuery(filter);
     }
@@ -176,7 +176,7 @@ public class IdFieldMapper extends AbstractFieldMapper<String> implements Intern
         if (indexed() || context == null) {
             return super.fieldFilter(value, context);
         }
-        return new UidFilter(context.queryTypes(), ImmutableList.of(value), context.indexCache().bloomCache());
+        return new UidFilter(context.queryTypes(), ImmutableList.of(value));
     }

     @Override
@@ -280,7 +280,7 @@ public class IdFieldMapper extends AbstractFieldMapper<String> implements Intern
     public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
         // if all are defaults, no sense to write it at all
         if (fieldType.stored() == Defaults.ID_FIELD_TYPE.stored() &&
                 fieldType.indexed() == Defaults.ID_FIELD_TYPE.indexed() && path == Defaults.PATH) {
             return builder;
         }
         builder.startObject(CONTENT_TYPE);
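The only functional change in IdFieldMapper is that both UidFilter call sites drop the BloomCache argument. Side by side, taken from the hunks above, with hedged commentary:

// Before: a BloomCache was handed in, presumably so segments that cannot
// contain the id could be skipped without a term lookup.
new UidFilter(context.queryTypes(), ImmutableList.of(value), context.indexCache().bloomCache());

// After: the bloom cache is gone; the filter seeks the _uid terms directly.
new UidFilter(context.queryTypes(), ImmutableList.of(value));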

UidFilter.java

@@ -19,11 +19,7 @@
 package org.elasticsearch.index.search;

-import org.apache.lucene.index.AtomicReader;
-import org.apache.lucene.index.AtomicReaderContext;
-import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.*;
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.Filter;
 import org.apache.lucene.util.Bits;
@@ -36,9 +32,11 @@ import java.util.Arrays;
 import java.util.Collection;
 import java.util.List;

+// LUCENE 4 UPGRADE: we can potentially use TermsFilter here, specifically, now when we don't do bloom filter, batching, and with optimization on single field terms
 public class UidFilter extends Filter {

     final Term[] uids;

     public UidFilter(Collection<String> types, List<String> ids) {
         this.uids = new Term[types.size() * ids.size()];
         int i = 0;
@@ -60,7 +58,6 @@ public class UidFilter extends Filter {
     // - If we have a single id, we can create a SingleIdDocIdSet to save on mem
     // - We can use sorted int array DocIdSet to reserve memory compared to OpenBitSet in some cases
     @Override
-    // LUCENE 4 UPGRADE: this filter does respect acceptDocs maybe we need to change this
     public DocIdSet getDocIdSet(AtomicReaderContext ctx, Bits acceptedDocs) throws IOException {
         FixedBitSet set = null;
         final AtomicReader reader = ctx.reader();
@@ -71,8 +68,6 @@ public class UidFilter extends Filter {
             docsEnum = termsEnum.docs(acceptedDocs, docsEnum, 0);
             int doc;
             while ((doc = docsEnum.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
-                // no need for batching, its on the UID, there will be only
-                // one doc
                 if (set == null) {
                     set = new FixedBitSet(reader.maxDoc());
                 }
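As the new class-level comment notes, with bloom filters and batching gone UidFilter is essentially a terms filter over the _uid field, so it could eventually be replaced by a plain terms filter. A hedged sketch of that equivalence (the "_uid" field name and the "type#id" encoding follow Elasticsearch convention, but treat the exact strings here as assumptions; this is not code from the commit):

import java.util.Arrays;
import java.util.List;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.XTermsFilter;

public class UidAsTermsFilterSketch {

    public static void main(String[] args) {
        List<String> types = Arrays.asList("tweet", "user");
        List<String> ids = Arrays.asList("1");

        // One term per (type, id) pair, mirroring UidFilter's constructor;
        // Elasticsearch encodes a uid as "type#id" in the _uid field.
        Term[] uids = new Term[types.size() * ids.size()];
        int i = 0;
        for (String type : types) {
            for (String id : ids) {
                uids[i++] = new Term("_uid", type + "#" + id);
            }
        }

        // The same cross-product of terms could feed the terms filter above,
        // which is the replacement the comment anticipates.
        XTermsFilter filter = new XTermsFilter(uids);
    }
}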