LUCENE-2694: Make MTQ rewrite + weight/scorer single pass

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1058328 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2011-01-12 21:38:51 +00:00
parent 3df2f89d6c
commit eacfb5d636
35 changed files with 756 additions and 229 deletions

View File

@ -360,6 +360,11 @@ Optimizations
when the current term is less than the next sub-range's lower end.
(Uwe Schindler, Mike McCandless)
* LUCENE-2694: Optimize MultiTermQuery to be single pass for Term lookups.
MultiTermQuery now stores TermState per leaf reader during rewrite to re-
seek the term dictionary in TermQuery / TermWeight.
(Simon Willnauer, Mike McCandless, Robert Muir)
Documentation
* LUCENE-2579: Fix oal.search's package.html description of abstract

View File

@ -18,10 +18,15 @@ package org.apache.lucene.store.instantiated;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.OrdTermState;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.codecs.PrefixCodedTermState;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
@ -90,10 +95,6 @@ public class InstantiatedTermsEnum extends TermsEnum {
}
}
@Override
public void cacheCurrentTerm() {
}
@Override
public BytesRef term() {
return br;
@ -129,5 +130,18 @@ public class InstantiatedTermsEnum extends TermsEnum {
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public TermState termState() throws IOException {
  // Capture the enum's current position as an ordinal relative to the
  // field's first term, so a later seek can restore it without re-scanning.
  final OrdTermState ordState = new OrdTermState();
  ordState.ord = upto - start;
  return ordState;
}
@Override
public SeekStatus seek(BytesRef term, TermState state) throws IOException {
  assert state != null && state instanceof OrdTermState;
  // An ordinal uniquely identifies a term in this enum, so delegating to the
  // ord-based seek is sufficient to restore the captured position.
  final OrdTermState ordState = (OrdTermState) state;
  return seek(ordState.ord);
}
}

View File

@ -39,6 +39,8 @@ import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.OrdTermState;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.FieldsEnum;
@ -883,10 +885,6 @@ public class MemoryIndex implements Serializable {
return br;
}
@Override
public void cacheCurrentTerm() {
}
@Override
public long ord() {
return termUpto;
@ -917,6 +915,19 @@ public class MemoryIndex implements Serializable {
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public SeekStatus seek(BytesRef term, TermState state) throws IOException {
  // Consistency fix: sibling enums (e.g. InstantiatedTermsEnum) assert the
  // concrete state type before casting; the original only checked non-null,
  // so an incompatible state surfaced as a bare ClassCastException even with
  // assertions enabled.
  assert state != null && state instanceof OrdTermState : "incompatible TermState";
  // MemoryIndex is a single in-memory segment, so the ord fully positions us.
  return this.seek(((OrdTermState) state).ord);
}
@Override
public TermState termState() throws IOException {
  // Snapshot the current term pointer as an ord-based state.
  final OrdTermState snapshot = new OrdTermState();
  snapshot.ord = termUpto;
  return snapshot;
}
}
private class MemoryDocsEnum extends DocsEnum {

View File

@ -372,7 +372,6 @@ class BufferedDeletes {
Query query = entry.getKey();
int limit = entry.getValue().intValue();
Weight weight = query.weight(searcher);
Scorer scorer = weight.scorer(readerContext, true, false);
if (scorer != null) {
while(true) {

View File

@ -130,11 +130,6 @@ public class FilterIndexReader extends IndexReader {
return in.seek(text, useCache);
}
@Override
public void cacheCurrentTerm() throws IOException {
in.cacheCurrentTerm();
}
@Override
public SeekStatus seek(long ord) throws IOException {
return in.seek(ord);
@ -174,6 +169,16 @@ public class FilterIndexReader extends IndexReader {
public Comparator<BytesRef> getComparator() throws IOException {
return in.getComparator();
}
@Override
public SeekStatus seek(BytesRef term, TermState state) throws IOException {
  // Pure delegation: forward TermState-based seeks to the wrapped enum.
  return in.seek(term, state);
}
@Override
public TermState termState() throws IOException {
  // Pure delegation: the wrapped enum owns the positioning state.
  return in.termState();
}
}
/** Base class for filtering {@link DocsEnum} implementations. */

View File

@ -1071,6 +1071,47 @@ public abstract class IndexReader implements Cloneable,Closeable {
}
}
/**
 * Returns a {@link DocsEnum} for the specified field positioned via the
 * supplied {@link TermState}. This may return null if the field or the term
 * does not exist, or the {@link TermState} is invalid for the underlying
 * implementation.
 */
public DocsEnum termDocsEnum(Bits skipDocs, String field, BytesRef term, TermState state) throws IOException {
  assert state != null;
  assert field != null;
  final Fields fields = fields();
  if (fields != null) {
    final Terms terms = fields.terms(field);
    if (terms != null) {
      return terms.docs(skipDocs, term, state, null);
    }
  }
  return null;
}
/**
 * Returns a {@link DocsAndPositionsEnum} for the specified field positioned
 * via the supplied {@link TermState}. This may return null if the field or
 * the term does not exist, the {@link TermState} is invalid for the
 * underlying implementation, or positions were not stored for this term.
 */
public DocsAndPositionsEnum termPositionsEnum(Bits skipDocs, String field, BytesRef term, TermState state) throws IOException {
  assert state != null;
  assert field != null;
  final Fields fields = fields();
  if (fields != null) {
    final Terms terms = fields.terms(field);
    if (terms != null) {
      return terms.docsAndPositions(skipDocs, term, state, null);
    }
  }
  return null;
}
/** Deletes the document numbered <code>docNum</code>. Once a document is
* deleted it will not appear in TermDocs or TermPositions enumerations.
* Attempts to read its field with the {@link #document}

View File

@ -19,7 +19,6 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.document.Document;

View File

@ -90,13 +90,6 @@ public final class MultiTermsEnum extends TermsEnum {
return current;
}
@Override
public void cacheCurrentTerm() throws IOException {
for(int i=0;i<numTop;i++) {
top[i].terms.cacheCurrentTerm();
}
}
@Override
public Comparator<BytesRef> getComparator() {
return termComp;

View File

@ -0,0 +1,33 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * An ordinal based {@link TermState}
 *
 * @lucene.experimental
 */
public class OrdTermState extends TermState {
  // Absolute ordinal of the term within its enum.
  public long ord;

  @Override
  public void copyFrom(TermState other) {
    assert other instanceof OrdTermState : "can not copy from " + other.getClass().getName();
    final OrdTermState source = (OrdTermState) other;
    this.ord = source.ord;
  }
}

View File

@ -0,0 +1,47 @@
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Encapsulates all required internal state to position the associated
 * {@link TermsEnum} without re-seeking.
 *
 * @see TermsEnum#seek(org.apache.lucene.util.BytesRef, TermState)
 * @see TermsEnum#termState()
 * @lucene.experimental
 */
public abstract class TermState implements Cloneable {

  /**
   * Copies the content of the given {@link TermState} to this instance
   *
   * @param other the TermState to copy
   */
  public abstract void copyFrom(TermState other);

  @Override
  public Object clone() {
    try {
      return super.clone();
    } catch (CloneNotSupportedException e) {
      // Cannot happen: this class implements Cloneable.
      throw new RuntimeException(e);
    }
  }
}

View File

@ -80,11 +80,57 @@ public abstract class Terms {
}
}
/**
 * Expert: Get {@link DocsEnum} for the specified {@link TermState}.
 * This method may return <code>null</code> if the term does not exist.
 *
 * @see TermsEnum#termState()
 * @see TermsEnum#seek(BytesRef, TermState) */
public DocsEnum docs(Bits skipDocs, BytesRef term, TermState termState, DocsEnum reuse) throws IOException {
  final TermsEnum termsEnum = getThreadTermsEnum();
  final TermsEnum.SeekStatus status = termsEnum.seek(term, termState);
  if (status != TermsEnum.SeekStatus.FOUND) {
    return null;
  }
  return termsEnum.docs(skipDocs, reuse);
}
/**
 * Expert: Get {@link DocsAndPositionsEnum} for the specified {@link TermState}.
 * This method may return <code>null</code> if the term does not exist, or
 * positions were not indexed.
 *
 * @see TermsEnum#termState()
 * @see TermsEnum#seek(BytesRef, TermState) */
public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, BytesRef term, TermState termState, DocsAndPositionsEnum reuse) throws IOException {
  final TermsEnum termsEnum = getThreadTermsEnum();
  if (termsEnum.seek(term, termState) == TermsEnum.SeekStatus.FOUND) {
    return termsEnum.docsAndPositions(skipDocs, reuse);
  } else {
    return null;
  }
}
/** Returns the number of unique terms in this field; optional operation —
 *  implementations that do not track this count throw
 *  {@link UnsupportedOperationException}. */
public long getUniqueTermCount() throws IOException {
  throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()");
}
protected TermsEnum getThreadTermsEnum() throws IOException {
/**
* Returns a thread-private {@link TermsEnum} instance. Obtaining
* {@link TermsEnum} from this method might be more efficient than using
* {@link #iterator()} directly since this method doesn't necessarily create a
* new {@link TermsEnum} instance.
* <p>
* NOTE: {@link TermsEnum} instances obtained from this method must not be
* shared across threads. The enum should only be used within a local context
* where other threads can't access it.
*
* @return a thread-private {@link TermsEnum} instance
* @throws IOException
* if an IOException occurs
* @lucene.internal
*/
public TermsEnum getThreadTermsEnum() throws IOException {
TermsEnum termsEnum = threadEnums.get();
if (termsEnum == null) {
termsEnum = iterator();

View File

@ -74,6 +74,33 @@ public abstract class TermsEnum {
* #seek(BytesRef)}. */
public abstract SeekStatus seek(long ord) throws IOException;
/**
 * Expert: Seeks a specific position by {@link TermState} previously obtained
 * from {@link #termState()}. Callers should maintain the {@link TermState} to
 * use this method. Low-level implementations may position the TermsEnum
 * without re-seeking the term dictionary.
 * <p>
 * Seeking by {@link TermState} should only be used iff the enum the state was
 * obtained from and the enum the state is used for seeking are obtained from
 * the same {@link IndexReader}, otherwise a {@link #seek(BytesRef, TermState)}
 * call can leave the enum in an undefined state.
 * <p>
 * NOTE: Using this method with an incompatible {@link TermState} might leave
 * this {@link TermsEnum} in undefined state. On a segment level
 * {@link TermState} instances are compatible only iff the source and the
 * target {@link TermsEnum} operate on the same field. If operating on segment
 * level, TermState instances must not be used across segments.
 * <p>
 * NOTE: A seek by {@link TermState} might not restore the
 * {@link AttributeSource}'s state. {@link AttributeSource} states must be
 * maintained separately if this method is used.
 * @param term the term the TermState corresponds to
 * @param state the {@link TermState}
 * */
public SeekStatus seek(BytesRef term, TermState state) throws IOException {
  // Default implementation ignores the state and performs a regular
  // (re-seeking) term lookup; codecs override this for single-pass behavior.
  return seek(term);
}
/** Increments the enumeration to the next element.
* Returns the resulting term, or null if the end was
* hit. The returned BytesRef may be re-used across calls
@ -116,6 +143,25 @@ public abstract class TermsEnum {
* the postings by this codec. */
public abstract DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException;
/**
 * Expert: Returns the TermsEnums internal state to position the TermsEnum
 * without re-seeking the term dictionary.
 * <p>
 * NOTE: A seek by {@link TermState} might not capture the
 * {@link AttributeSource}'s state. Callers must maintain the
 * {@link AttributeSource} states separately
 *
 * @see TermState
 * @see #seek(BytesRef, TermState)
 */
public TermState termState() throws IOException {
  // Default: an empty, stateless TermState for enums that cannot be
  // positioned by state; seek(term, state) then falls back to a plain
  // seek(term) lookup.
  return new TermState() {
    @Override
    public void copyFrom(TermState other) {
      // intentionally empty — there is no state to copy
    }
  };
}
/** Return the {@link BytesRef} Comparator used to sort
* terms provided by the iterator. This may return
* null if there are no terms. Callers may invoke this
@ -123,10 +169,6 @@ public abstract class TermsEnum {
* instance & reuse it. */
public abstract Comparator<BytesRef> getComparator() throws IOException;
/** Optional optimization hint: informs the codec that the
* current term is likely to be re-seek'd-to soon. */
public abstract void cacheCurrentTerm() throws IOException;
/** An empty TermsEnum for quickly returning an empty instance e.g.
* in {@link org.apache.lucene.search.MultiTermQuery}
* <p><em>Please note:</em> This enum should be unmodifiable,
@ -141,9 +183,6 @@ public abstract class TermsEnum {
@Override
public SeekStatus seek(long ord) { return SeekStatus.END; }
@Override
public void cacheCurrentTerm() {}
@Override
public BytesRef term() {
throw new IllegalStateException("this method should never be called");
@ -183,5 +222,15 @@ public abstract class TermsEnum {
public synchronized AttributeSource attributes() {
return super.attributes();
}
@Override
public TermState termState() throws IOException {
  // EMPTY enum has no position; callers must never ask for its state.
  throw new IllegalStateException("this method should never be called");
}
@Override
public SeekStatus seek(BytesRef term, TermState state) throws IOException {
  // EMPTY enum cannot be positioned; a state-based seek is a caller bug.
  throw new IllegalStateException("this method should never be called");
}
};
}

View File

@ -42,17 +42,17 @@ public abstract class PostingsReaderBase implements Closeable {
public abstract void init(IndexInput termsIn) throws IOException;
/** Return a newly created empty TermState */
public abstract TermState newTermState() throws IOException;
public abstract PrefixCodedTermState newTermState() throws IOException;
public abstract void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState state, boolean isIndexTerm) throws IOException;
public abstract void readTerm(IndexInput termsIn, FieldInfo fieldInfo, PrefixCodedTermState state, boolean isIndexTerm) throws IOException;
/** Must fully consume state, since after this call that
* TermState may be reused. */
public abstract DocsEnum docs(FieldInfo fieldInfo, TermState state, Bits skipDocs, DocsEnum reuse) throws IOException;
public abstract DocsEnum docs(FieldInfo fieldInfo, PrefixCodedTermState state, Bits skipDocs, DocsEnum reuse) throws IOException;
/** Must fully consume state, since after this call that
* TermState may be reused. */
public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState state, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException;
public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, PrefixCodedTermState state, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException;
public abstract void close() throws IOException;
}

View File

@ -1,5 +1,4 @@
package org.apache.lucene.index.codecs;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -17,40 +16,30 @@ package org.apache.lucene.index.codecs;
* limitations under the License.
*/
import org.apache.lucene.index.DocsEnum; // for javadocs
import org.apache.lucene.index.codecs.standard.StandardPostingsReader; // javadocs
import org.apache.lucene.index.OrdTermState;
import org.apache.lucene.index.TermState;
/**
* Holds all state required for {@link StandardPostingsReader}
* Holds all state required for {@link PostingsReaderBase}
* to produce a {@link DocsEnum} without re-seeking the
* terms dict.
* @lucene.experimental
*/
public class TermState implements Cloneable {
public long ord; // ord for this term
public long filePointer; // fp into the terms dict primary file (_X.tis)
public class PrefixCodedTermState extends OrdTermState {
public int docFreq; // how many docs have this term
public long filePointer; // fp into the terms dict primary file (_X.tis)
public void copyFrom(TermState other) {
ord = other.ord;
@Override
public void copyFrom(TermState _other) {
assert _other instanceof PrefixCodedTermState : "can not copy from " + _other.getClass().getName();
PrefixCodedTermState other = (PrefixCodedTermState) _other;
super.copyFrom(_other);
filePointer = other.filePointer;
docFreq = other.docFreq;
}
@Override
public Object clone() {
try {
return super.clone();
} catch (CloneNotSupportedException cnse) {
// should not happen
throw new RuntimeException(cnse);
}
public String toString() {
return super.toString() + "[ord=" + ord + ", tis.filePointer=" + filePointer + "]";
}
@Override
public String toString() {
return "tis.fp=" + filePointer + " docFreq=" + docFreq + " ord=" + ord;
}
}

View File

@ -31,6 +31,7 @@ import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
@ -68,7 +69,7 @@ public class PrefixCodedTermsReader extends FieldsProducer {
private final Comparator<BytesRef> termComp;
// Caches the most recently looked-up field + terms:
private final DoubleBarrelLRUCache<FieldAndTerm,TermState> termsCache;
private final DoubleBarrelLRUCache<FieldAndTerm,PrefixCodedTermState> termsCache;
// Reads the terms index
private TermsIndexReaderBase indexReader;
@ -84,11 +85,6 @@ public class PrefixCodedTermsReader extends FieldsProducer {
public FieldAndTerm() {
}
public FieldAndTerm(String field, BytesRef term) {
this.field = field;
this.term = new BytesRef(term);
}
public FieldAndTerm(FieldAndTerm other) {
field = other.field;
term = new BytesRef(other.term);
@ -116,7 +112,7 @@ public class PrefixCodedTermsReader extends FieldsProducer {
throws IOException {
this.postingsReader = postingsReader;
termsCache = new DoubleBarrelLRUCache<FieldAndTerm,TermState>(termsCacheSize);
termsCache = new DoubleBarrelLRUCache<FieldAndTerm,PrefixCodedTermState>(termsCacheSize);
this.termComp = termComp;
@ -278,10 +274,10 @@ public class PrefixCodedTermsReader extends FieldsProducer {
}
// Iterates through terms in this field, not supporting ord()
private class SegmentTermsEnum extends TermsEnum {
private final class SegmentTermsEnum extends TermsEnum {
private final IndexInput in;
private final DeltaBytesReader bytesReader;
private final TermState state;
private final PrefixCodedTermState state;
private boolean seekPending;
private final FieldAndTerm fieldTerm = new FieldAndTerm();
private final TermsIndexReaderBase.FieldIndexEnum indexEnum;
@ -307,14 +303,6 @@ public class PrefixCodedTermsReader extends FieldsProducer {
return termComp;
}
@Override
public void cacheCurrentTerm() {
TermState stateCopy = (TermState) state.clone();
stateCopy.filePointer = in.getFilePointer();
termsCache.put(new FieldAndTerm(fieldInfo.name, bytesReader.term),
stateCopy);
}
// called only from assert
private boolean first;
private int indexTermCount;
@ -342,7 +330,7 @@ public class PrefixCodedTermsReader extends FieldsProducer {
* is found, SeekStatus.NOT_FOUND if a different term
* was found, SeekStatus.END if we hit EOF */
@Override
public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
public SeekStatus seek(final BytesRef term, final boolean useCache) throws IOException {
if (indexEnum == null) {
throw new IllegalStateException("terms index was not loaded");
@ -357,9 +345,8 @@ public class PrefixCodedTermsReader extends FieldsProducer {
cachedState = termsCache.get(fieldTerm);
if (cachedState != null) {
state.copyFrom(cachedState);
seekPending = true;
setTermState(term, state);
positioned = false;
bytesReader.term.copy(term);
//System.out.println(" cached!");
return SeekStatus.FOUND;
}
@ -439,12 +426,7 @@ public class PrefixCodedTermsReader extends FieldsProducer {
if (cmp == 0) {
// Done!
if (useCache) {
// Store in cache
FieldAndTerm entryKey = new FieldAndTerm(fieldTerm);
cachedState = (TermState) state.clone();
// this is fp after current term
cachedState.filePointer = in.getFilePointer();
termsCache.put(entryKey, cachedState);
cacheTerm(fieldTerm);
}
return SeekStatus.FOUND;
@ -464,6 +446,23 @@ public class PrefixCodedTermsReader extends FieldsProducer {
return SeekStatus.END;
}
// Positions this enum from a previously captured TermState without touching
// the terms dictionary; the actual file re-seek happens lazily because
// seekPending is set.
private final void setTermState(BytesRef term, final TermState termState) {
  assert termState != null && termState instanceof PrefixCodedTermState;
  state.copyFrom(termState);
  seekPending = true;
  bytesReader.term.copy(term);
}
// Puts a clone of the current term's state into the terms cache so a later
// seek to the same term can skip the dictionary scan entirely.
private final void cacheTerm(FieldAndTerm other) {
  // Store in cache
  final FieldAndTerm entryKey = new FieldAndTerm(other);
  final PrefixCodedTermState cachedState = (PrefixCodedTermState) state.clone();
  // this is fp after current term
  cachedState.filePointer = in.getFilePointer();
  termsCache.put(entryKey, cachedState);
}
}
@Override
public BytesRef term() {
return bytesReader.term;
@ -498,7 +497,9 @@ public class PrefixCodedTermsReader extends FieldsProducer {
postingsReader.readTerm(in,
fieldInfo, state,
isIndexTerm);
if (doOrd) {
state.ord++;
}
positioned = true;
//System.out.println("te.next term=" + bytesReader.term.utf8ToString());
@ -512,7 +513,7 @@ public class PrefixCodedTermsReader extends FieldsProducer {
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse);
final DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse);
assert docsEnum != null;
return docsEnum;
}
@ -526,6 +527,23 @@ public class PrefixCodedTermsReader extends FieldsProducer {
}
}
@Override
public SeekStatus seek(BytesRef term, TermState otherState) throws IOException {
  // State must be the exact type this codec produced, and its ord must be
  // in range for this segment — otherwise the enum would be corrupted.
  assert otherState != null && otherState instanceof PrefixCodedTermState;
  assert otherState.getClass() == this.state.getClass() : "Illegal TermState type " + otherState.getClass();
  assert ((PrefixCodedTermState)otherState).ord < numTerms;
  setTermState(term, otherState);
  // positioned=false: the underlying input is not at this term yet; the
  // pending seek (set by setTermState) restores it on demand.
  positioned = false;
  return SeekStatus.FOUND;
}
@Override
public TermState termState() throws IOException {
  // Clone so the caller's snapshot is not mutated by subsequent next() calls.
  final PrefixCodedTermState newTermState = (PrefixCodedTermState) state.clone();
  // Record the dict file pointer just after the current term, so a later
  // state-based seek can resume reading from exactly this spot.
  newTermState.filePointer = in.getFilePointer();
  return newTermState;
}
@Override
public SeekStatus seek(long ord) throws IOException {
@ -562,7 +580,6 @@ public class PrefixCodedTermsReader extends FieldsProducer {
return SeekStatus.FOUND;
}
@Override
public long ord() {
if (!doOrd) {
throw new UnsupportedOperationException();

View File

@ -33,6 +33,7 @@ import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.CompoundFileReader;
@ -741,11 +742,6 @@ public class PreFlexFields extends FieldsProducer {
}
}
@Override
public void cacheCurrentTerm() throws IOException {
getTermsDict().cacheCurrentTerm(termEnum);
}
@Override
public SeekStatus seek(long ord) throws IOException {
throw new UnsupportedOperationException();

View File

@ -22,8 +22,9 @@ import java.io.IOException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.codecs.PostingsReaderBase;
import org.apache.lucene.index.codecs.TermState;
import org.apache.lucene.index.codecs.PrefixCodedTermState;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
@ -56,10 +57,10 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
wrappedPostingsReader.init(termsIn);
}
private static class PulsingTermState extends TermState {
private static class PulsingTermState extends PrefixCodedTermState {
private byte[] postings;
private int postingsSize; // -1 if this term was not inlined
private TermState wrappedTermState;
private PrefixCodedTermState wrappedTermState;
private boolean pendingIndexTerm;
@Override
@ -71,7 +72,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
System.arraycopy(postings, 0, clone.postings, 0, postingsSize);
} else {
assert wrappedTermState != null;
clone.wrappedTermState = (TermState) wrappedTermState.clone();
clone.wrappedTermState = (PrefixCodedTermState) wrappedTermState.clone();
}
return clone;
}
@ -102,15 +103,14 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
}
@Override
public TermState newTermState() throws IOException {
public PrefixCodedTermState newTermState() throws IOException {
PulsingTermState state = new PulsingTermState();
state.wrappedTermState = wrappedPostingsReader.newTermState();
return state;
}
@Override
public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState _termState, boolean isIndexTerm) throws IOException {
public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, PrefixCodedTermState _termState, boolean isIndexTerm) throws IOException {
PulsingTermState termState = (PulsingTermState) _termState;
termState.pendingIndexTerm |= isIndexTerm;
@ -137,7 +137,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
// TODO: we could actually reuse, by having TL that
// holds the last wrapped reuse, and vice-versa
@Override
public DocsEnum docs(FieldInfo field, TermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException {
public DocsEnum docs(FieldInfo field, PrefixCodedTermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException {
PulsingTermState termState = (PulsingTermState) _termState;
if (termState.postingsSize != -1) {
PulsingDocsEnum postings;
@ -162,7 +162,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
// TODO: -- not great that we can't always reuse
@Override
public DocsAndPositionsEnum docsAndPositions(FieldInfo field, TermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
public DocsAndPositionsEnum docsAndPositions(FieldInfo field, PrefixCodedTermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
if (field.omitTermFreqAndPositions) {
return null;
}

View File

@ -25,8 +25,9 @@ import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.codecs.PostingsReaderBase;
import org.apache.lucene.index.codecs.TermState;
import org.apache.lucene.index.codecs.PrefixCodedTermState;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
@ -129,12 +130,13 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
}
}
private static class SepTermState extends TermState {
private static final class SepTermState extends PrefixCodedTermState {
// We store only the seek point to the docs file because
// the rest of the info (freqIndex, posIndex, etc.) is
// stored in the docs file:
IntIndexInput.Index docIndex;
@Override
public Object clone() {
SepTermState other = (SepTermState) super.clone();
other.docIndex = (IntIndexInput.Index) docIndex.clone();
@ -154,19 +156,19 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
}
@Override
public TermState newTermState() throws IOException {
public PrefixCodedTermState newTermState() throws IOException {
final SepTermState state = new SepTermState();
state.docIndex = docIn.index();
return state;
}
@Override
public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState termState, boolean isIndexTerm) throws IOException {
public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, PrefixCodedTermState termState, boolean isIndexTerm) throws IOException {
((SepTermState) termState).docIndex.read(termsIn, isIndexTerm);
}
@Override
public DocsEnum docs(FieldInfo fieldInfo, TermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException {
public DocsEnum docs(FieldInfo fieldInfo, PrefixCodedTermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException {
final SepTermState termState = (SepTermState) _termState;
SepDocsEnum docsEnum;
if (reuse == null || !(reuse instanceof SepDocsEnum)) {
@ -185,7 +187,7 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
}
@Override
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, PrefixCodedTermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
assert !fieldInfo.omitTermFreqAndPositions;
final SepTermState termState = (SepTermState) _termState;
SepDocsAndPositionsEnum postingsEnum;

View File

@ -21,6 +21,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
@ -151,10 +152,6 @@ class SimpleTextFieldsReader extends FieldsProducer {
}
}
@Override
public void cacheCurrentTerm() {
}
@Override
public BytesRef next() throws IOException {
assert !ended;
@ -439,7 +436,6 @@ class SimpleTextFieldsReader extends FieldsProducer {
}
private class SimpleTextTerms extends Terms {
private final String field;
private final long termsStart;
private final boolean omitTF;
private FST<PairOutputs.Pair<Long,Long>> fst;
@ -447,7 +443,6 @@ class SimpleTextFieldsReader extends FieldsProducer {
private final BytesRef scratch = new BytesRef(10);
public SimpleTextTerms(String field, long termsStart) throws IOException {
this.field = StringHelper.intern(field);
this.termsStart = termsStart;
omitTF = fieldInfos.fieldInfo(field).omitTermFreqAndPositions;
loadTerms();

View File

@ -26,8 +26,9 @@ import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.codecs.PostingsReaderBase;
import org.apache.lucene.index.codecs.TermState;
import org.apache.lucene.index.codecs.PrefixCodedTermState;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
@ -83,20 +84,20 @@ public class StandardPostingsReader extends PostingsReaderBase {
}
// Must keep final because we do non-standard clone
private final static class DocTermState extends TermState {
private final static class StandardTermState extends PrefixCodedTermState {
long freqOffset;
long proxOffset;
int skipOffset;
public Object clone() {
DocTermState other = new DocTermState();
StandardTermState other = new StandardTermState();
other.copyFrom(this);
return other;
}
public void copyFrom(TermState _other) {
super.copyFrom(_other);
DocTermState other = (DocTermState) _other;
StandardTermState other = (StandardTermState) _other;
freqOffset = other.freqOffset;
proxOffset = other.proxOffset;
skipOffset = other.skipOffset;
@ -108,8 +109,8 @@ public class StandardPostingsReader extends PostingsReaderBase {
}
@Override
public TermState newTermState() {
return new DocTermState();
public PrefixCodedTermState newTermState() {
return new StandardTermState();
}
@Override
@ -126,10 +127,9 @@ public class StandardPostingsReader extends PostingsReaderBase {
}
@Override
public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState termState, boolean isIndexTerm)
public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, PrefixCodedTermState termState, boolean isIndexTerm)
throws IOException {
final DocTermState docTermState = (DocTermState) termState;
final StandardTermState docTermState = (StandardTermState) termState;
if (isIndexTerm) {
docTermState.freqOffset = termsIn.readVLong();
@ -153,7 +153,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
}
@Override
public DocsEnum docs(FieldInfo fieldInfo, TermState termState, Bits skipDocs, DocsEnum reuse) throws IOException {
public DocsEnum docs(FieldInfo fieldInfo, PrefixCodedTermState termState, Bits skipDocs, DocsEnum reuse) throws IOException {
SegmentDocsEnum docsEnum;
if (reuse == null || !(reuse instanceof SegmentDocsEnum)) {
docsEnum = new SegmentDocsEnum(freqIn);
@ -166,11 +166,11 @@ public class StandardPostingsReader extends PostingsReaderBase {
docsEnum = new SegmentDocsEnum(freqIn);
}
}
return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs);
return docsEnum.reset(fieldInfo, (StandardTermState) termState, skipDocs);
}
@Override
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, PrefixCodedTermState termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
if (fieldInfo.omitTermFreqAndPositions) {
return null;
}
@ -189,7 +189,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn);
}
}
return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs);
return docsEnum.reset(fieldInfo, (StandardTermState) termState, skipDocs);
} else {
SegmentDocsAndPositionsEnum docsEnum;
if (reuse == null || !(reuse instanceof SegmentDocsAndPositionsEnum)) {
@ -203,7 +203,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
docsEnum = new SegmentDocsAndPositionsEnum(freqIn, proxIn);
}
}
return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs);
return docsEnum.reset(fieldInfo, (StandardTermState) termState, skipDocs);
}
}
@ -233,7 +233,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
this.freqIn = (IndexInput) freqIn.clone();
}
public SegmentDocsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs) throws IOException {
public SegmentDocsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits skipDocs) throws IOException {
omitTF = fieldInfo.omitTermFreqAndPositions;
if (omitTF) {
freq = 1;
@ -407,7 +407,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
this.proxIn = (IndexInput) proxIn.clone();
}
public SegmentDocsAndPositionsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs) throws IOException {
public SegmentDocsAndPositionsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits skipDocs) throws IOException {
assert !fieldInfo.omitTermFreqAndPositions;
assert !fieldInfo.storePayloads;
@ -594,7 +594,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
this.proxIn = (IndexInput) proxIn.clone();
}
public SegmentDocsAndPositionsAndPayloadsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs) throws IOException {
public SegmentDocsAndPositionsAndPayloadsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits skipDocs) throws IOException {
assert !fieldInfo.omitTermFreqAndPositions;
assert fieldInfo.storePayloads;
if (payload == null) {

View File

@ -21,9 +21,15 @@ import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.PerReaderTermState;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
@ -71,8 +77,8 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
}
@Override
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/) {
topLevel.add(new TermQuery(term, docFreq), BooleanClause.Occur.SHOULD);
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/, PerReaderTermState states) {
topLevel.add(new TermQuery(term, states), BooleanClause.Occur.SHOULD);
}
@Override
@ -98,9 +104,10 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
final BytesRefHash pendingTerms = col.pendingTerms;
final int sort[] = pendingTerms.sort(col.termsEnum.getComparator());
for(int i = 0; i < size; i++) {
final int pos = sort[i];
// docFreq is not used for constant score here, we pass 1
// to explicitely set a fake value, so it's not calculated
addClause(bq, placeholderTerm.createTerm(pendingTerms.get(sort[i], new BytesRef())), 1, 1.0f);
addClause(bq, placeholderTerm.createTerm(pendingTerms.get(pos, new BytesRef())), 1, 1.0f, col.array.termState[pos]);
}
// Strip scores
final Query result = new ConstantScoreQuery(bq);
@ -123,12 +130,21 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
@Override
public boolean collect(BytesRef bytes) throws IOException {
pendingTerms.add(bytes);
int pos = pendingTerms.add(bytes);
docVisitCount += termsEnum.docFreq();
if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
hasCutOff = true;
return false;
}
final TermState termState = termsEnum.termState();
assert termState != null;
if (pos < 0) {
pos = (-pos)-1;
array.termState[pos].register(termState, readerContext.ord, termsEnum.docFreq());
} else {
array.termState[pos] = new PerReaderTermState(topReaderContext, termState, readerContext.ord, termsEnum.docFreq());
}
return true;
}
@ -137,7 +153,8 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
TermsEnum termsEnum;
final int docCountCutoff, termCountLimit;
final BytesRefHash pendingTerms = new BytesRefHash();
final TermStateByteStart array = new TermStateByteStart(16);
final BytesRefHash pendingTerms = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectAllocator()), 16, array);
}
@Override
@ -166,4 +183,40 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
return true;
}
/** Special implementation of BytesStartArray that keeps parallel arrays for {@link PerReaderTermState} */
static final class TermStateByteStart extends DirectBytesStartArray {
PerReaderTermState[] termState;
public TermStateByteStart(int initSize) {
super(initSize);
}
@Override
public int[] init() {
final int[] ord = super.init();
termState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
assert termState.length >= ord.length;
return ord;
}
@Override
public int[] grow() {
final int[] ord = super.grow();
if (termState.length < ord.length) {
PerReaderTermState[] tmpTermState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(termState, 0, tmpTermState, 0, termState.length);
termState = tmpTermState;
}
assert termState.length >= ord.length;
return ord;
}
@Override
public int[] clear() {
termState = null;
return super.clear();
}
}
}

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
@ -156,9 +157,21 @@ public abstract class FilteredTermsEnum extends TermsEnum {
return tenum.docsAndPositions(bits, reuse);
}
/** This enum does not support seeking!
* @throws UnsupportedOperationException
*/
@Override
public void cacheCurrentTerm() throws IOException {
tenum.cacheCurrentTerm();
public SeekStatus seek(BytesRef term, TermState state) throws IOException {
throw new UnsupportedOperationException(getClass().getName()+" does not support seeking");
}
/**
* Returns the filtered enums term state
*/
@Override
public TermState termState() throws IOException {
assert tenum != null;
return tenum.termState();
}
@SuppressWarnings("fallthrough")

View File

@ -20,6 +20,7 @@ package org.apache.lucene.search;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
@ -244,11 +245,6 @@ public final class FuzzyTermsEnum extends TermsEnum {
return actualEnum.docFreq();
}
@Override
public void cacheCurrentTerm() throws IOException {
actualEnum.cacheCurrentTerm();
}
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
return actualEnum.docs(skipDocs, reuse);
@ -260,6 +256,15 @@ public final class FuzzyTermsEnum extends TermsEnum {
return actualEnum.docsAndPositions(skipDocs, reuse);
}
public SeekStatus seek(BytesRef term, TermState state) throws IOException {
return actualEnum.seek(term, state);
}
@Override
public TermState termState() throws IOException {
return actualEnum.termState();
}
@Override
public Comparator<BytesRef> getComparator() throws IOException {
return actualEnum.getComparator();

View File

@ -26,6 +26,7 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.PerReaderTermState;
/**
* An abstract {@link Query} that matches documents
@ -159,8 +160,8 @@ public abstract class MultiTermQuery extends Query {
}
@Override
protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost) {
final TermQuery tq = new TermQuery(term, docCount);
protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost, PerReaderTermState states) {
final TermQuery tq = new TermQuery(term, states);
tq.setBoost(boost);
topLevel.add(tq, BooleanClause.Occur.SHOULD);
}
@ -200,8 +201,8 @@ public abstract class MultiTermQuery extends Query {
}
@Override
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost) {
final Query q = new ConstantScoreQuery(new TermQuery(term, docFreq));
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost, PerReaderTermState states) {
final Query q = new ConstantScoreQuery(new TermQuery(term, states));
q.setBoost(boost);
topLevel.add(q, BooleanClause.Occur.SHOULD);
}

View File

@ -20,6 +20,7 @@ package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
@ -27,6 +28,7 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.PerReaderTermState;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
@ -53,8 +55,9 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
}
@Override
protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost) {
final TermQuery tq = new TermQuery(term, docCount);
protected void addClause(BooleanQuery topLevel, Term term, int docCount,
float boost, PerReaderTermState states) {
final TermQuery tq = new TermQuery(term, states);
tq.setBoost(boost);
topLevel.add(tq, BooleanClause.Occur.SHOULD);
}
@ -114,13 +117,13 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
final int size = col.terms.size();
if (size > 0) {
final int sort[] = col.terms.sort(col.termsEnum.getComparator());
final int[] docFreq = col.array.docFreq;
final float[] boost = col.array.boost;
final PerReaderTermState[] termStates = col.array.termState;
for (int i = 0; i < size; i++) {
final int pos = sort[i];
final Term term = placeholderTerm.createTerm(col.terms.get(pos, new BytesRef()));
assert reader.docFreq(term) == docFreq[pos];
addClause(result, term, docFreq[pos], query.getBoost() * boost[pos]);
assert reader.docFreq(term) == termStates[pos].docFreq();
addClause(result, term, termStates[pos].docFreq(), query.getBoost() * boost[pos], termStates[pos]);
}
}
query.incTotalNumberOfTerms(size);
@ -143,15 +146,17 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
@Override
public boolean collect(BytesRef bytes) throws IOException {
final int e = terms.add(bytes);
final TermState state = termsEnum.termState();
assert state != null;
if (e < 0 ) {
// duplicate term: update docFreq
final int pos = (-e)-1;
array.docFreq[pos] += termsEnum.docFreq();
array.termState[pos].register(state, readerContext.ord, termsEnum.docFreq());
assert array.boost[pos] == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums";
} else {
// new entry: we populate the entry initially
array.docFreq[e] = termsEnum.docFreq();
array.boost[e] = boostAtt.getBoost();
array.termState[e] = new PerReaderTermState(topReaderContext, state, readerContext.ord, termsEnum.docFreq());
ScoringRewrite.this.checkMaxClauseCount(terms.size());
}
return true;
@ -160,8 +165,8 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
/** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */
static final class TermFreqBoostByteStart extends DirectBytesStartArray {
int[] docFreq;
float[] boost;
PerReaderTermState[] termState;
public TermFreqBoostByteStart(int initSize) {
super(initSize);
@ -171,24 +176,28 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
public int[] init() {
final int[] ord = super.init();
boost = new float[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_FLOAT)];
docFreq = new int[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_INT)];
assert boost.length >= ord.length && docFreq.length >= ord.length;
termState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
assert termState.length >= ord.length && boost.length >= ord.length;
return ord;
}
@Override
public int[] grow() {
final int[] ord = super.grow();
docFreq = ArrayUtil.grow(docFreq, ord.length);
boost = ArrayUtil.grow(boost, ord.length);
assert boost.length >= ord.length && docFreq.length >= ord.length;
if (termState.length < ord.length) {
PerReaderTermState[] tmpTermState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(termState, 0, tmpTermState, 0, termState.length);
termState = tmpTermState;
}
assert termState.length >= ord.length && boost.length >= ord.length;
return ord;
}
@Override
public int[] clear() {
boost = null;
docFreq = null;
termState = null;
return super.clear();
}

View File

@ -18,8 +18,6 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Comparator;
import org.apache.lucene.index.Fields;
@ -27,25 +25,33 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PerReaderTermState;
import org.apache.lucene.util.ReaderUtil;
abstract class TermCollectingRewrite<Q extends Query> extends MultiTermQuery.RewriteMethod {
/** Return a suitable top-level Query for holding all expanded terms. */
protected abstract Q getTopLevelQuery() throws IOException;
/** Add a MultiTermQuery term to the top-level query */
protected abstract void addClause(Q topLevel, Term term, int docCount, float boost) throws IOException;
protected final void addClause(Q topLevel, Term term, int docCount, float boost) throws IOException {
addClause(topLevel, term, docCount, boost, null);
}
protected abstract void addClause(Q topLevel, Term term, int docCount, float boost, PerReaderTermState states) throws IOException;
protected final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {
final List<IndexReader> subReaders = new ArrayList<IndexReader>();
ReaderUtil.gatherSubReaders(subReaders, reader);
ReaderContext topReaderContext = reader.getTopReaderContext();
Comparator<BytesRef> lastTermComp = null;
for (IndexReader r : subReaders) {
final Fields fields = r.fields();
final AtomicReaderContext[] leaves = ReaderUtil.leaves(topReaderContext);
for (AtomicReaderContext context : leaves) {
final Fields fields = context.reader.fields();
if (fields == null) {
// reader has no fields
continue;
@ -68,11 +74,10 @@ abstract class TermCollectingRewrite<Q extends Query> extends MultiTermQuery.Rew
if (lastTermComp != null && newTermComp != null && newTermComp != lastTermComp)
throw new RuntimeException("term comparator should not change between segments: "+lastTermComp+" != "+newTermComp);
lastTermComp = newTermComp;
collector.setReaderContext(topReaderContext, context);
collector.setNextEnum(termsEnum);
BytesRef bytes;
while ((bytes = termsEnum.next()) != null) {
termsEnum.cacheCurrentTerm();
if (!collector.collect(bytes))
return; // interrupt whole term collection, so also don't iterate other subReaders
}
@ -80,6 +85,14 @@ abstract class TermCollectingRewrite<Q extends Query> extends MultiTermQuery.Rew
}
protected static abstract class TermCollector {
protected AtomicReaderContext readerContext;
protected ReaderContext topReaderContext;
public void setReaderContext(ReaderContext topReaderContext, AtomicReaderContext readerContext) {
this.readerContext = readerContext;
this.topReaderContext = topReaderContext;
}
/** attributes used for communication with the enum */
public final AttributeSource attributes = new AttributeSource();

View File

@ -22,10 +22,14 @@ import java.util.Set;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PerReaderTermState;
import org.apache.lucene.util.ToStringUtils;
/** A Query that matches documents containing a term.
@ -33,20 +37,22 @@ import org.apache.lucene.util.ToStringUtils;
*/
public class TermQuery extends Query {
private final Term term;
private final int docFreq;
private int docFreq;
private transient PerReaderTermState perReaderTermState;
private class TermWeight extends Weight {
private final Similarity similarity;
private float value;
private float idf;
private final float idf;
private float queryNorm;
private float queryWeight;
private IDFExplanation idfExp;
private transient ReaderContext weightContext; // only set if -ea for assert in scorer()
private final IDFExplanation idfExp;
private transient PerReaderTermState termStates;
public TermWeight(IndexSearcher searcher)
public TermWeight(IndexSearcher searcher, PerReaderTermState termStates, int docFreq)
throws IOException {
assert setWeightContext(searcher);
assert termStates != null : "PerReaderTermState must not be null";
this.termStates = termStates;
this.similarity = getSimilarity(searcher);
if (docFreq != -1) {
idfExp = similarity.idfExplain(term, searcher, docFreq);
@ -80,30 +86,33 @@ public class TermQuery extends Query {
@Override
public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, boolean topScorer) throws IOException {
assert assertTopReaderContext(context);
final String field = term.field();
final IndexReader reader = context.reader;
DocsEnum docs = reader.termDocsEnum(reader.getDeletedDocs(),
term.field(),
term.bytes());
if (docs == null) {
assert assertTopReaderContext(termStates, context) : "The top-reader used to create Weight is not the same as the current reader's top-reader";
final TermState state = termStates
.get(context.ord);
if (state == null) { // term is not present in that reader
assert termNotInReader(reader, field, term.bytes()) : "no termstate found but term exists in reader";
return null;
}
return new TermScorer(this, docs, similarity, reader.norms(term.field()));
final DocsEnum docs = reader.termDocsEnum(reader.getDeletedDocs(), field, term.bytes(), state);
assert docs != null;
return new TermScorer(this, docs, similarity, context.reader.norms(field));
}
private boolean assertTopReaderContext(ReaderContext context) {
while (context.parent != null) {
private boolean termNotInReader(IndexReader reader, String field, BytesRef bytes) throws IOException {
// only called from assert
final Terms terms = reader.terms(field);
return terms == null || terms.docFreq(bytes) == 0;
}
private boolean assertTopReaderContext(PerReaderTermState state, ReaderContext context) {
while(context.parent != null) {
context = context.parent;
}
return weightContext == context;
return state.topReaderContext == context;
}
private boolean setWeightContext(IndexSearcher searcher) {
weightContext = searcher.getTopReaderContext();
return true;
}
@Override
public Explanation explain(AtomicReaderContext context, int doc)
@ -157,7 +166,7 @@ public class TermQuery extends Query {
fieldExpl.addDetail(expl);
Explanation fieldNormExpl = new Explanation();
byte[] fieldNorms = reader.norms(field);
final byte[] fieldNorms = reader.norms(field);
float fieldNorm =
fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f;
fieldNormExpl.setValue(fieldNorm);
@ -193,6 +202,17 @@ public class TermQuery extends Query {
public TermQuery(Term t, int docFreq) {
term = t;
this.docFreq = docFreq;
perReaderTermState = null;
}
/** Expert: constructs a TermQuery that will use the
* provided docFreq instead of looking up the docFreq
* against the searcher. */
public TermQuery(Term t, PerReaderTermState states) {
assert states != null;
term = t;
docFreq = states.docFreq();
perReaderTermState = states;
}
/** Returns the term of this query. */
@ -200,7 +220,21 @@ public class TermQuery extends Query {
@Override
public Weight createWeight(IndexSearcher searcher) throws IOException {
return new TermWeight(searcher);
final ReaderContext context = searcher.getTopReaderContext();
final int weightDocFreq;
final PerReaderTermState termState;
if (perReaderTermState == null || perReaderTermState.topReaderContext != context) {
// make TermQuery single-pass if we don't have a PRTS or if the context differs!
termState = PerReaderTermState.build(context, term, true); // cache term lookups!
// we must not ignore the given docFreq - if set use the given value
weightDocFreq = docFreq == -1 ? termState.docFreq() : docFreq;
} else {
// PRTS was pre-build for this IS
termState = this.perReaderTermState;
weightDocFreq = docFreq;
}
return new TermWeight(searcher, termState, weightDocFreq);
}
@Override

View File

@ -25,9 +25,11 @@ import java.util.Comparator;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PerReaderTermState;
/**
* Base rewrite method for collecting only the top terms
@ -78,12 +80,12 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
this.termComp = termsEnum.getComparator();
// lazy init the initial ScoreTerm because comparator is not known on ctor:
if (st == null)
st = new ScoreTerm(this.termComp);
st = new ScoreTerm(this.termComp, new PerReaderTermState(topReaderContext));
boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
}
@Override
public boolean collect(BytesRef bytes) {
public boolean collect(BytesRef bytes) throws IOException {
final float boost = boostAtt.getBoost();
// ignore uncompetetive hits
if (stQueue.size() == maxSize) {
@ -94,23 +96,27 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
return true;
}
ScoreTerm t = visitedTerms.get(bytes);
final TermState state = termsEnum.termState();
assert state != null;
if (t != null) {
// if the term is already in the PQ, only update docFreq of term in PQ
t.docFreq += termsEnum.docFreq();
assert t.boost == boost : "boost should be equal in all segment TermsEnums";
t.termState.register(state, readerContext.ord, termsEnum.docFreq());
} else {
// add new entry in PQ, we must clone the term, else it may get overwritten!
st.bytes.copy(bytes);
st.boost = boost;
st.docFreq = termsEnum.docFreq();
visitedTerms.put(st.bytes, st);
assert st.termState.docFreq() == 0;
st.termState.register(state, readerContext.ord, termsEnum.docFreq());
stQueue.offer(st);
// possibly drop entries from queue
if (stQueue.size() > maxSize) {
st = stQueue.poll();
visitedTerms.remove(st.bytes);
st.termState.clear(); // reset the termstate!
} else {
st = new ScoreTerm(termComp);
st = new ScoreTerm(termComp, new PerReaderTermState(topReaderContext));
}
assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
// set maxBoostAtt with values to help FuzzyTermsEnum to optimize
@ -120,6 +126,7 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
maxBoostAtt.setCompetitiveTerm(t.bytes);
}
}
return true;
}
});
@ -130,8 +137,8 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
ArrayUtil.quickSort(scoreTerms, scoreTermSortByTermComp);
for (final ScoreTerm st : scoreTerms) {
final Term term = placeholderTerm.createTerm(st.bytes);
assert reader.docFreq(term) == st.docFreq;
addClause(q, term, st.docFreq, query.getBoost() * st.boost); // add to query
assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs " + st.termState.docFreq();
addClause(q, term, st.termState.docFreq(), query.getBoost() * st.boost, st.termState); // add to query
}
query.incTotalNumberOfTerms(scoreTerms.length);
return q;
@ -147,7 +154,7 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
final TopTermsRewrite other = (TopTermsRewrite) obj;
final TopTermsRewrite<?> other = (TopTermsRewrite<?>) obj;
if (size != other.size) return false;
return true;
}
@ -163,13 +170,12 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
static final class ScoreTerm implements Comparable<ScoreTerm> {
public final Comparator<BytesRef> termComp;
public final BytesRef bytes = new BytesRef();
public float boost;
public int docFreq;
public ScoreTerm(Comparator<BytesRef> termComp) {
public final PerReaderTermState termState;
public ScoreTerm(Comparator<BytesRef> termComp, PerReaderTermState termState) {
this.termComp = termComp;
this.termState = termState;
}
public int compareTo(ScoreTerm other) {

View File

@ -24,8 +24,11 @@ import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.OrdTermState;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.codecs.PrefixCodedTermState;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache.DocTermsIndex;
import org.apache.lucene.util.ArrayUtil;
@ -303,11 +306,6 @@ public class DocTermsIndexCreator extends EntryCreatorWithOptions<DocTermsIndex>
return term;
}
@Override
public void cacheCurrentTerm() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public BytesRef term() throws IOException {
return term;
@ -337,6 +335,19 @@ public class DocTermsIndexCreator extends EntryCreatorWithOptions<DocTermsIndex>
public Comparator<BytesRef> getComparator() throws IOException {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public SeekStatus seek(BytesRef term, TermState state) throws IOException {
assert state != null && state instanceof OrdTermState;
return this.seek(((OrdTermState)state).ord);
}
@Override
public TermState termState() throws IOException {
OrdTermState state = new OrdTermState();
state.ord = currentOrd;
return state;
}
}
}
}

View File

@ -26,6 +26,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopTermsRewrite;
import org.apache.lucene.search.ScoringRewrite;
import org.apache.lucene.search.BooleanClause.Occur; // javadocs only
import org.apache.lucene.util.PerReaderTermState;
/**
* Wraps any {@link MultiTermQuery} as a {@link SpanQuery},
@ -153,7 +154,7 @@ public class SpanMultiTermQueryWrapper<Q extends MultiTermQuery> extends SpanQue
}
@Override
protected void addClause(SpanOrQuery topLevel, Term term, int docCount, float boost) {
protected void addClause(SpanOrQuery topLevel, Term term, int docCount, float boost, PerReaderTermState states) {
final SpanTermQuery q = new SpanTermQuery(term);
q.setBoost(boost);
topLevel.addClause(q);
@ -202,7 +203,7 @@ public class SpanMultiTermQueryWrapper<Q extends MultiTermQuery> extends SpanQue
}
@Override
protected void addClause(SpanOrQuery topLevel, Term term, int docFreq, float boost) {
protected void addClause(SpanOrQuery topLevel, Term term, int docFreq, float boost, PerReaderTermState states) {
final SpanTermQuery q = new SpanTermQuery(term);
q.setBoost(boost);
topLevel.addClause(q);

View File

@ -0,0 +1,148 @@
package org.apache.lucene.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.TermsEnum.SeekStatus;
/**
* Maintains a {@link IndexReader} {@link TermState} view over
* {@link IndexReader} instances containing a single term. The
* {@link PerReaderTermState} doesn't track if the given {@link TermState}
* objects are valid, neither if the {@link TermState} instances refer to the
* same terms in the associated readers.
*
* @lucene.experimental
*/
/**
 * Maintains a per-leaf-reader {@link TermState} view for a single term over a
 * top-level {@link IndexReader}'s leaves. Each leaf's state is stored under the
 * leaf's ordinal. The {@link PerReaderTermState} does not verify that the
 * registered {@link TermState} instances are still valid, nor that they refer
 * to the same term across the associated readers.
 *
 * @lucene.experimental
 */
public final class PerReaderTermState {
  public final ReaderContext topReaderContext; // for asserting!
  private final TermState[] states; // indexed by leaf ord; null = term absent in that leaf
  private int docFreq; // sum of docFreq over all registered leaves

  /**
   * Creates an empty {@link PerReaderTermState} sized for the leaves of the
   * given top-level {@link ReaderContext}.
   */
  public PerReaderTermState(ReaderContext context) {
    assert context != null && context.isTopLevel;
    topReaderContext = context;
    docFreq = 0;
    // A context without leaves (atomic reader) still gets one slot.
    final int numLeaves = context.leaves() == null ? 1 : context.leaves().length;
    states = new TermState[numLeaves];
  }

  /**
   * Creates a {@link PerReaderTermState} pre-populated with a single
   * {@link TermState} for the leaf with the given ordinal.
   */
  public PerReaderTermState(ReaderContext context, TermState state, int ord, int docFreq) {
    this(context);
    register(state, ord, docFreq);
  }

  /**
   * Builds a {@link PerReaderTermState} from a top-level {@link ReaderContext}
   * and the given {@link Term}: the term is looked up in every leaf reader, and
   * each leaf containing it is registered under its ordinal.
   * <p>
   * Note: the given context must be a top-level context.
   *
   * @param cache whether the term lookup should be cached by the terms enum
   */
  public static PerReaderTermState build(ReaderContext context, Term term, boolean cache)
      throws IOException {
    assert context != null && context.isTopLevel;
    final PerReaderTermState result = new PerReaderTermState(context);
    final String field = term.field();
    final BytesRef bytes = term.bytes();
    for (final AtomicReaderContext leaf : ReaderUtil.leaves(context)) {
      final Fields fields = leaf.reader.fields();
      if (fields == null) {
        continue; // leaf has no postings at all
      }
      final Terms terms = fields.terms(field);
      if (terms == null) {
        continue; // field absent in this leaf
      }
      final TermsEnum termsEnum = terms.getThreadTermsEnum(); // thread-private don't share!
      if (SeekStatus.FOUND == termsEnum.seek(bytes, cache)) {
        result.register(termsEnum.termState(), leaf.ord, termsEnum.docFreq());
      }
    }
    return result;
  }

  /**
   * Clears this {@link PerReaderTermState}: removes all registered
   * {@link TermState}s and resets the accumulated document frequency.
   */
  public void clear() {
    docFreq = 0;
    Arrays.fill(states, null);
  }

  /**
   * Registers a {@link TermState} under a leaf ordinal and adds the leaf's
   * document frequency to the accumulated total. The ordinal should be derived
   * from a {@link ReaderContext}'s leaf ord; each ordinal may be registered at
   * most once.
   */
  public void register(TermState state, final int ord, final int docFreq) {
    assert state != null : "state must not be null";
    assert ord >= 0 && ord < states.length;
    assert states[ord] == null : "state for ord: " + ord
        + " already registered";
    this.docFreq += docFreq;
    states[ord] = state;
  }

  /**
   * Returns the {@link TermState} registered for the given leaf ordinal, or
   * <code>null</code> if none was registered for that leaf.
   *
   * @param ord the reader's leaf ordinal to get the {@link TermState} for
   * @return the registered {@link TermState}, or <code>null</code>
   */
  public TermState get(int ord) {
    assert ord >= 0 && ord < states.length;
    return states[ord];
  }

  /**
   * Returns the accumulated document frequency of all {@link TermState}
   * instances passed to {@link #register(TermState, int, int)}.
   *
   * @return the accumulated document frequency across all registered leaves
   */
  public int docFreq() {
    return docFreq;
  }
}

View File

@ -18,6 +18,7 @@ package org.apache.lucene;
*/
import org.apache.lucene.util.*;
import org.apache.lucene.util.Bits;
import org.apache.lucene.index.*;
import org.apache.lucene.document.*;
import org.apache.lucene.search.*;
@ -329,10 +330,6 @@ public class TestExternalCodecs extends LuceneTestCase {
return ramField.termToDocs.get(current).docs.size();
}
@Override
public void cacheCurrentTerm() {
}
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
return new RAMDocsEnum(ramField.termToDocs.get(current), skipDocs);

View File

@ -213,8 +213,6 @@ public class QueryUtils {
}
}
/** alternate scorer skipTo(),skipTo(),next(),next(),skipTo(),skipTo(), etc
* and ensure a hitcollector receives same docs and scores
*/

View File

@ -40,6 +40,7 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.OrdTermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.codecs.CodecProvider;

View File

@ -1000,10 +1000,6 @@ class NumberedTermsEnum extends TermsEnum {
return tenum.docFreq();
}
@Override
public void cacheCurrentTerm() {
throw new UnsupportedOperationException();
}
public BytesRef skipTo(BytesRef target) throws IOException {