LUCENE-6470: Improve efficiency of TermsQuery constructors.

This commit is contained in:
Robert Muir 2016-01-26 03:38:19 -05:00
parent 0ff8d11367
commit a3b2ad334c
3 changed files with 48 additions and 27 deletions

View File

@ -193,6 +193,8 @@ Optimizations
* LUCENE-6940: MUST_NOT clauses execute faster, especially when they are sparse.
(Adrien Grand)
* LUCENE-6470: Improve efficiency of TermsQuery constructors. (Robert Muir)
Bug Fixes
* LUCENE-6976: BytesRefTermAttributeImpl.copyTo NPE'ed if BytesRef was null.

View File

@ -66,22 +66,27 @@ public class PrefixCodedTerms implements Accountable {
/** add a term */
public void add(Term term) {
assert lastTerm.equals(new Term("")) || term.compareTo(lastTerm) > 0;
add(term.field(), term.bytes());
}
/** add a term */
public void add(String field, BytesRef bytes) {
assert lastTerm.equals(new Term("")) || new Term(field, bytes).compareTo(lastTerm) > 0;
try {
int prefix = sharedPrefix(lastTerm.bytes, term.bytes);
int suffix = term.bytes.length - prefix;
if (term.field.equals(lastTerm.field)) {
int prefix = sharedPrefix(lastTerm.bytes, bytes);
int suffix = bytes.length - prefix;
if (field.equals(lastTerm.field)) {
output.writeVInt(prefix << 1);
} else {
output.writeVInt(prefix << 1 | 1);
output.writeString(term.field);
output.writeString(field);
}
output.writeVInt(suffix);
output.writeBytes(term.bytes.bytes, term.bytes.offset + prefix, suffix);
lastTermBytes.copyBytes(term.bytes);
output.writeBytes(bytes.bytes, bytes.offset + prefix, suffix);
lastTermBytes.copyBytes(bytes);
lastTerm.bytes = lastTermBytes.get();
lastTerm.field = term.field;
lastTerm.field = field;
size += 1;
} catch (IOException e) {
throw new RuntimeException(e);

View File

@ -25,6 +25,7 @@ import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
@ -55,6 +56,7 @@ import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.DocIdSetBuilder;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.RamUsageEstimator;
/**
@ -86,22 +88,17 @@ public class TermsQuery extends Query implements Accountable {
private final PrefixCodedTerms termData;
private final int termDataHashCode; // cached hashcode of termData
private static Term[] toTermArray(String field, List<BytesRef> termBytes) {
Term[] array = new Term[termBytes.size()];
int i = 0;
for (BytesRef t : termBytes) {
array[i++] = new Term(field, t);
}
return array;
}
/**
* Creates a new {@link TermsQuery} from the given list. The list
* Creates a new {@link TermsQuery} from the given collection. It
* can contain duplicate terms and multiple fields.
*/
public TermsQuery(final List<Term> terms) {
public TermsQuery(Collection<Term> terms) {
Term[] sortedTerms = terms.toArray(new Term[terms.size()]);
ArrayUtil.timSort(sortedTerms);
// already sorted if we are a SortedSet with natural order
boolean sorted = terms instanceof SortedSet && ((SortedSet<Term>)terms).comparator() == null;
if (!sorted) {
ArrayUtil.timSort(sortedTerms);
}
PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
Term previous = null;
for (Term term : sortedTerms) {
@ -113,21 +110,38 @@ public class TermsQuery extends Query implements Accountable {
termData = builder.finish();
termDataHashCode = termData.hashCode();
}
/**
* Creates a new {@link TermsQuery} from the given {@link BytesRef} list for
* a single field.
* Creates a new {@link TermsQuery} from the given collection for
* a single field. It can contain duplicate terms.
*/
public TermsQuery(final String field, final List<BytesRef> terms) {
this(toTermArray(field, terms));
public TermsQuery(String field, Collection<BytesRef> terms) {
BytesRef[] sortedTerms = terms.toArray(new BytesRef[terms.size()]);
// already sorted if we are a SortedSet with natural order
boolean sorted = terms instanceof SortedSet && ((SortedSet<BytesRef>)terms).comparator() == null;
if (!sorted) {
ArrayUtil.timSort(sortedTerms);
}
PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
BytesRefBuilder previous = null;
for (BytesRef term : sortedTerms) {
if (previous == null) {
previous = new BytesRefBuilder();
} else if (previous.get().equals(term)) {
continue; // deduplicate
}
builder.add(field, term);
previous.copyBytes(term);
}
termData = builder.finish();
termDataHashCode = termData.hashCode();
}
/**
* Creates a new {@link TermsQuery} from the given {@link BytesRef} array for
* a single field.
*/
public TermsQuery(final String field, final BytesRef...terms) {
// this ctor prevents unnecessary Term creations
public TermsQuery(String field, BytesRef...terms) {
this(field, Arrays.asList(terms));
}