mirror of https://github.com/apache/lucene.git
LUCENE-2694: Make MTQ rewrite + weight/scorer single pass
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1058328 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
3df2f89d6c
commit
eacfb5d636
|
@ -359,6 +359,11 @@ Optimizations
|
|||
not seek backwards when a sub-range has no terms. It now only seeks
|
||||
when the current term is less than the next sub-range's lower end.
|
||||
(Uwe Schindler, Mike McCandless)
|
||||
|
||||
* LUCENE-2694: Optimize MultiTermQuery to be single pass for Term lookups.
|
||||
MultiTermQuery now stores TermState per leaf reader during rewrite to re-
|
||||
seek the term dictionary in TermQuery / TermWeight.
|
||||
(Simon Willnauer, Mike McCandless, Robert Muir)
|
||||
|
||||
Documentation
|
||||
|
||||
|
|
|
@ -18,10 +18,15 @@ package org.apache.lucene.store.instantiated;
|
|||
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.index.OrdTermState;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.codecs.PrefixCodedTermState;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
|
||||
|
@ -90,10 +95,6 @@ public class InstantiatedTermsEnum extends TermsEnum {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef term() {
|
||||
return br;
|
||||
|
@ -129,5 +130,18 @@ public class InstantiatedTermsEnum extends TermsEnum {
|
|||
public Comparator<BytesRef> getComparator() {
|
||||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public TermState termState() throws IOException {
|
||||
final OrdTermState state = new OrdTermState();
|
||||
state.ord = upto - start;
|
||||
return state;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SeekStatus seek(BytesRef term, TermState state) throws IOException {
|
||||
assert state != null && state instanceof OrdTermState;
|
||||
return seek(((OrdTermState)state).ord); // just use the ord for simplicity
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -39,6 +39,8 @@ import org.apache.lucene.document.FieldSelector;
|
|||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.OrdTermState;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.FieldsEnum;
|
||||
|
@ -883,10 +885,6 @@ public class MemoryIndex implements Serializable {
|
|||
return br;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ord() {
|
||||
return termUpto;
|
||||
|
@ -917,8 +915,21 @@ public class MemoryIndex implements Serializable {
|
|||
public Comparator<BytesRef> getComparator() {
|
||||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public SeekStatus seek(BytesRef term, TermState state) throws IOException {
|
||||
assert state != null;
|
||||
return this.seek(((OrdTermState)state).ord);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TermState termState() throws IOException {
|
||||
OrdTermState ts = new OrdTermState();
|
||||
ts.ord = termUpto;
|
||||
return ts;
|
||||
}
|
||||
}
|
||||
|
||||
private class MemoryDocsEnum extends DocsEnum {
|
||||
private ArrayIntList positions;
|
||||
private boolean hasNext;
|
||||
|
|
|
@ -372,7 +372,6 @@ class BufferedDeletes {
|
|||
Query query = entry.getKey();
|
||||
int limit = entry.getValue().intValue();
|
||||
Weight weight = query.weight(searcher);
|
||||
|
||||
Scorer scorer = weight.scorer(readerContext, true, false);
|
||||
if (scorer != null) {
|
||||
while(true) {
|
||||
|
|
|
@ -130,11 +130,6 @@ public class FilterIndexReader extends IndexReader {
|
|||
return in.seek(text, useCache);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() throws IOException {
|
||||
in.cacheCurrentTerm();
|
||||
}
|
||||
|
||||
@Override
|
||||
public SeekStatus seek(long ord) throws IOException {
|
||||
return in.seek(ord);
|
||||
|
@ -174,6 +169,16 @@ public class FilterIndexReader extends IndexReader {
|
|||
public Comparator<BytesRef> getComparator() throws IOException {
|
||||
return in.getComparator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public SeekStatus seek(BytesRef term, TermState state) throws IOException {
|
||||
return in.seek(term, state);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TermState termState() throws IOException {
|
||||
return in.termState();
|
||||
}
|
||||
}
|
||||
|
||||
/** Base class for filtering {@link DocsEnum} implementations. */
|
||||
|
|
|
@ -1070,6 +1070,47 @@ public abstract class IndexReader implements Cloneable,Closeable {
|
|||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns {@link DocsEnum} for the specified field and
|
||||
* {@link TermState}. This may return null, if either the field or the term
|
||||
* does not exist or the {@link TermState} is invalid for the underlying
|
||||
* implementation.*/
|
||||
public DocsEnum termDocsEnum(Bits skipDocs, String field, BytesRef term, TermState state) throws IOException {
|
||||
assert state != null;
|
||||
assert field != null;
|
||||
final Fields fields = fields();
|
||||
if (fields == null) {
|
||||
return null;
|
||||
}
|
||||
final Terms terms = fields.terms(field);
|
||||
if (terms != null) {
|
||||
return terms.docs(skipDocs, term, state, null);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns {@link DocsAndPositionsEnum} for the specified field and
|
||||
* {@link TermState}. This may return null, if either the field or the term
|
||||
* does not exist, the {@link TermState} is invalid for the underlying
|
||||
* implementation, or positions were not stored for this term.*/
|
||||
public DocsAndPositionsEnum termPositionsEnum(Bits skipDocs, String field, BytesRef term, TermState state) throws IOException {
|
||||
assert state != null;
|
||||
assert field != null;
|
||||
final Fields fields = fields();
|
||||
if (fields == null) {
|
||||
return null;
|
||||
}
|
||||
final Terms terms = fields.terms(field);
|
||||
if (terms != null) {
|
||||
return terms.docsAndPositions(skipDocs, term, state, null);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Deletes the document numbered <code>docNum</code>. Once a document is
|
||||
* deleted it will not appear in TermDocs or TermPositions enumerations.
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.index;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
|
|
|
@ -90,13 +90,6 @@ public final class MultiTermsEnum extends TermsEnum {
|
|||
return current;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() throws IOException {
|
||||
for(int i=0;i<numTop;i++) {
|
||||
top[i].terms.cacheCurrentTerm();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
return termComp;
|
||||
|
|
|
@ -0,0 +1,33 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* An ordinal based {@link TermState}
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class OrdTermState extends TermState {
|
||||
public long ord;
|
||||
|
||||
@Override
|
||||
public void copyFrom(TermState other) {
|
||||
assert other instanceof OrdTermState : "can not copy from " + other.getClass().getName();
|
||||
this.ord = ((OrdTermState) other).ord;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,47 @@
|
|||
package org.apache.lucene.index;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Encapsulates all required internal state to position the associated
|
||||
* {@link TermsEnum} without re-seeking.
|
||||
*
|
||||
* @see TermsEnum#seek(org.apache.lucene.util.BytesRef, TermState)
|
||||
* @see TermsEnum#termState()
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class TermState implements Cloneable {
|
||||
|
||||
/**
|
||||
* Copies the content of the given {@link TermState} to this instance
|
||||
*
|
||||
* @param other
|
||||
* the TermState to copy
|
||||
*/
|
||||
public abstract void copyFrom(TermState other);
|
||||
|
||||
@Override
|
||||
public Object clone() {
|
||||
try {
|
||||
return super.clone();
|
||||
} catch (CloneNotSupportedException cnse) {
|
||||
// should not happen
|
||||
throw new RuntimeException(cnse);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -80,11 +80,57 @@ public abstract class Terms {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Expert: Get {@link DocsEnum} for the specified {@link TermState}.
|
||||
* This method may return <code>null</code> if the term does not exist.
|
||||
*
|
||||
* @see TermsEnum#termState()
|
||||
* @see TermsEnum#seek(BytesRef, TermState) */
|
||||
public DocsEnum docs(Bits skipDocs, BytesRef term, TermState termState, DocsEnum reuse) throws IOException {
|
||||
final TermsEnum termsEnum = getThreadTermsEnum();
|
||||
if (termsEnum.seek(term, termState) == TermsEnum.SeekStatus.FOUND) {
|
||||
return termsEnum.docs(skipDocs, reuse);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get {@link DocsAndPositionsEnum} for the specified {@link TermState}. This
|
||||
* method may return <code>null</code> if the term does not exist, or positions were
|
||||
* not indexed.
|
||||
*
|
||||
* @see TermsEnum#termState()
|
||||
* @see TermsEnum#seek(BytesRef, TermState) */
|
||||
public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, BytesRef term, TermState termState, DocsAndPositionsEnum reuse) throws IOException {
|
||||
final TermsEnum termsEnum = getThreadTermsEnum();
|
||||
if (termsEnum.seek(term, termState) == TermsEnum.SeekStatus.FOUND) {
|
||||
return termsEnum.docsAndPositions(skipDocs, reuse);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public long getUniqueTermCount() throws IOException {
|
||||
throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()");
|
||||
}
|
||||
|
||||
protected TermsEnum getThreadTermsEnum() throws IOException {
|
||||
/**
|
||||
* Returns a thread-private {@link TermsEnum} instance. Obtaining
|
||||
* {@link TermsEnum} from this method might be more efficient than using
|
||||
* {@link #iterator()} directly since this method doesn't necessarily create a
|
||||
* new {@link TermsEnum} instance.
|
||||
* <p>
|
||||
* NOTE: {@link TermsEnum} instances obtained from this method must not be
|
||||
* shared across threads. The enum should only be used within a local context
|
||||
* where other threads can't access it.
|
||||
*
|
||||
* @return a thread-private {@link TermsEnum} instance
|
||||
* @throws IOException
|
||||
* if an IOException occurs
|
||||
* @lucene.internal
|
||||
*/
|
||||
public TermsEnum getThreadTermsEnum() throws IOException {
|
||||
TermsEnum termsEnum = threadEnums.get();
|
||||
if (termsEnum == null) {
|
||||
termsEnum = iterator();
|
||||
|
|
|
@ -73,7 +73,34 @@ public abstract class TermsEnum {
|
|||
* may be before or after the current ord. See {@link
|
||||
* #seek(BytesRef)}. */
|
||||
public abstract SeekStatus seek(long ord) throws IOException;
|
||||
|
||||
|
||||
/**
|
||||
* Expert: Seeks a specific position by {@link TermState} previously obtained
|
||||
* from {@link #termState()}. Callers should maintain the {@link TermState} to
|
||||
* use this method. Low-level implementations may position the TermsEnum
|
||||
* without re-seeking the term dictionary.
|
||||
* <p>
|
||||
* Seeking by {@link TermState} should only be used iff the enum the state was
|
||||
* obtained from and the enum the state is used for seeking are obtained from
|
||||
* the same {@link IndexReader}, otherwise a {@link #seek(BytesRef, TermState)} call can
|
||||
* leave the enum in an undefined state.
|
||||
* <p>
|
||||
* NOTE: Using this method with an incompatible {@link TermState} might leave
|
||||
* this {@link TermsEnum} in an undefined state. On a segment level
|
||||
* {@link TermState} instances are compatible only iff the source and the
|
||||
* target {@link TermsEnum} operate on the same field. If operating on segment
|
||||
* level, TermState instances must not be used across segments.
|
||||
* <p>
|
||||
* NOTE: A seek by {@link TermState} might not restore the
|
||||
* {@link AttributeSource}'s state. {@link AttributeSource} states must be
|
||||
* maintained separately if this method is used.
|
||||
* @param term the term the TermState corresponds to
|
||||
* @param state the {@link TermState}
|
||||
* */
|
||||
public SeekStatus seek(BytesRef term, TermState state) throws IOException {
|
||||
return seek(term);
|
||||
}
|
||||
|
||||
/** Increments the enumeration to the next element.
|
||||
* Returns the resulting term, or null if the end was
|
||||
* hit. The returned BytesRef may be re-used across calls
|
||||
|
@ -98,7 +125,7 @@ public abstract class TermsEnum {
|
|||
* first time, after next() returns null or seek returns
|
||||
* {@link SeekStatus#END}.*/
|
||||
public abstract int docFreq();
|
||||
|
||||
|
||||
/** Get {@link DocsEnum} for the current term. Do not
|
||||
* call this before calling {@link #next} or {@link
|
||||
* #seek} for the first time. This method will not
|
||||
|
@ -116,6 +143,25 @@ public abstract class TermsEnum {
|
|||
* the postings by this codec. */
|
||||
public abstract DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException;
|
||||
|
||||
/**
|
||||
* Expert: Returns the TermsEnum's internal state to position the TermsEnum
|
||||
* without re-seeking the term dictionary.
|
||||
* <p>
|
||||
* NOTE: The returned {@link TermState} might not capture the
|
||||
* {@link AttributeSource}'s state. Callers must maintain the
|
||||
* {@link AttributeSource} states separately.
|
||||
*
|
||||
* @see TermState
|
||||
* @see #seek(BytesRef, TermState)
|
||||
*/
|
||||
public TermState termState() throws IOException {
|
||||
return new TermState() {
|
||||
@Override
|
||||
public void copyFrom(TermState other) {
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/** Return the {@link BytesRef} Comparator used to sort
|
||||
* terms provided by the iterator. This may return
|
||||
* null if there are no terms. Callers may invoke this
|
||||
|
@ -123,10 +169,6 @@ public abstract class TermsEnum {
|
|||
* instance & reuse it. */
|
||||
public abstract Comparator<BytesRef> getComparator() throws IOException;
|
||||
|
||||
/** Optional optimization hint: informs the codec that the
|
||||
* current term is likely to be re-seek'd-to soon. */
|
||||
public abstract void cacheCurrentTerm() throws IOException;
|
||||
|
||||
/** An empty TermsEnum for quickly returning an empty instance e.g.
|
||||
* in {@link org.apache.lucene.search.MultiTermQuery}
|
||||
* <p><em>Please note:</em> This enum should be unmodifiable,
|
||||
|
@ -141,9 +183,6 @@ public abstract class TermsEnum {
|
|||
@Override
|
||||
public SeekStatus seek(long ord) { return SeekStatus.END; }
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() {}
|
||||
|
||||
@Override
|
||||
public BytesRef term() {
|
||||
throw new IllegalStateException("this method should never be called");
|
||||
|
@ -183,5 +222,15 @@ public abstract class TermsEnum {
|
|||
public synchronized AttributeSource attributes() {
|
||||
return super.attributes();
|
||||
}
|
||||
|
||||
@Override
|
||||
public TermState termState() throws IOException {
|
||||
throw new IllegalStateException("this method should never be called");
|
||||
}
|
||||
|
||||
@Override
|
||||
public SeekStatus seek(BytesRef term, TermState state) throws IOException {
|
||||
throw new IllegalStateException("this method should never be called");
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
@ -42,17 +42,17 @@ public abstract class PostingsReaderBase implements Closeable {
|
|||
public abstract void init(IndexInput termsIn) throws IOException;
|
||||
|
||||
/** Return a newly created empty TermState */
|
||||
public abstract TermState newTermState() throws IOException;
|
||||
public abstract PrefixCodedTermState newTermState() throws IOException;
|
||||
|
||||
public abstract void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState state, boolean isIndexTerm) throws IOException;
|
||||
public abstract void readTerm(IndexInput termsIn, FieldInfo fieldInfo, PrefixCodedTermState state, boolean isIndexTerm) throws IOException;
|
||||
|
||||
/** Must fully consume state, since after this call that
|
||||
* TermState may be reused. */
|
||||
public abstract DocsEnum docs(FieldInfo fieldInfo, TermState state, Bits skipDocs, DocsEnum reuse) throws IOException;
|
||||
public abstract DocsEnum docs(FieldInfo fieldInfo, PrefixCodedTermState state, Bits skipDocs, DocsEnum reuse) throws IOException;
|
||||
|
||||
/** Must fully consume state, since after this call that
|
||||
* TermState may be reused. */
|
||||
public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState state, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException;
|
||||
public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, PrefixCodedTermState state, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException;
|
||||
|
||||
public abstract void close() throws IOException;
|
||||
}
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
package org.apache.lucene.index.codecs;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -17,40 +16,30 @@ package org.apache.lucene.index.codecs;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.DocsEnum; // for javadocs
|
||||
|
||||
import org.apache.lucene.index.codecs.standard.StandardPostingsReader; // javadocs
|
||||
import org.apache.lucene.index.OrdTermState;
|
||||
import org.apache.lucene.index.TermState;
|
||||
|
||||
/**
|
||||
* Holds all state required for {@link StandardPostingsReader}
|
||||
* Holds all state required for {@link PostingsReaderBase}
|
||||
* to produce a {@link DocsEnum} without re-seeking the
|
||||
* terms dict.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class PrefixCodedTermState extends OrdTermState {
|
||||
public int docFreq; // how many docs have this term
|
||||
public long filePointer; // fp into the terms dict primary file (_X.tis)
|
||||
|
||||
public class TermState implements Cloneable {
|
||||
public long ord; // ord for this term
|
||||
public long filePointer; // fp into the terms dict primary file (_X.tis)
|
||||
public int docFreq; // how many docs have this term
|
||||
|
||||
public void copyFrom(TermState other) {
|
||||
ord = other.ord;
|
||||
@Override
|
||||
public void copyFrom(TermState _other) {
|
||||
assert _other instanceof PrefixCodedTermState : "can not copy from " + _other.getClass().getName();
|
||||
PrefixCodedTermState other = (PrefixCodedTermState) _other;
|
||||
super.copyFrom(_other);
|
||||
filePointer = other.filePointer;
|
||||
docFreq = other.docFreq;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object clone() {
|
||||
try {
|
||||
return super.clone();
|
||||
} catch (CloneNotSupportedException cnse) {
|
||||
// should not happen
|
||||
throw new RuntimeException(cnse);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "tis.fp=" + filePointer + " docFreq=" + docFreq + " ord=" + ord;
|
||||
return super.toString() + "[ord=" + ord + ", tis.filePointer=" + filePointer + "]";
|
||||
}
|
||||
|
||||
}
|
|
@ -31,6 +31,7 @@ import org.apache.lucene.index.FieldInfos;
|
|||
import org.apache.lucene.index.FieldsEnum;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentInfo;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
@ -68,7 +69,7 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
|||
private final Comparator<BytesRef> termComp;
|
||||
|
||||
// Caches the most recently looked-up field + terms:
|
||||
private final DoubleBarrelLRUCache<FieldAndTerm,TermState> termsCache;
|
||||
private final DoubleBarrelLRUCache<FieldAndTerm,PrefixCodedTermState> termsCache;
|
||||
|
||||
// Reads the terms index
|
||||
private TermsIndexReaderBase indexReader;
|
||||
|
@ -84,11 +85,6 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
|||
public FieldAndTerm() {
|
||||
}
|
||||
|
||||
public FieldAndTerm(String field, BytesRef term) {
|
||||
this.field = field;
|
||||
this.term = new BytesRef(term);
|
||||
}
|
||||
|
||||
public FieldAndTerm(FieldAndTerm other) {
|
||||
field = other.field;
|
||||
term = new BytesRef(other.term);
|
||||
|
@ -116,7 +112,7 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
|||
throws IOException {
|
||||
|
||||
this.postingsReader = postingsReader;
|
||||
termsCache = new DoubleBarrelLRUCache<FieldAndTerm,TermState>(termsCacheSize);
|
||||
termsCache = new DoubleBarrelLRUCache<FieldAndTerm,PrefixCodedTermState>(termsCacheSize);
|
||||
|
||||
this.termComp = termComp;
|
||||
|
||||
|
@ -278,10 +274,10 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
|||
}
|
||||
|
||||
// Iterates through terms in this field, not supporting ord()
|
||||
private class SegmentTermsEnum extends TermsEnum {
|
||||
private final class SegmentTermsEnum extends TermsEnum {
|
||||
private final IndexInput in;
|
||||
private final DeltaBytesReader bytesReader;
|
||||
private final TermState state;
|
||||
private final PrefixCodedTermState state;
|
||||
private boolean seekPending;
|
||||
private final FieldAndTerm fieldTerm = new FieldAndTerm();
|
||||
private final TermsIndexReaderBase.FieldIndexEnum indexEnum;
|
||||
|
@ -307,14 +303,6 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
|||
return termComp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() {
|
||||
TermState stateCopy = (TermState) state.clone();
|
||||
stateCopy.filePointer = in.getFilePointer();
|
||||
termsCache.put(new FieldAndTerm(fieldInfo.name, bytesReader.term),
|
||||
stateCopy);
|
||||
}
|
||||
|
||||
// called only from assert
|
||||
private boolean first;
|
||||
private int indexTermCount;
|
||||
|
@ -342,7 +330,7 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
|||
* is found, SeekStatus.NOT_FOUND if a different term
|
||||
* was found, SeekStatus.END if we hit EOF */
|
||||
@Override
|
||||
public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
|
||||
public SeekStatus seek(final BytesRef term, final boolean useCache) throws IOException {
|
||||
|
||||
if (indexEnum == null) {
|
||||
throw new IllegalStateException("terms index was not loaded");
|
||||
|
@ -357,9 +345,8 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
|||
cachedState = termsCache.get(fieldTerm);
|
||||
if (cachedState != null) {
|
||||
state.copyFrom(cachedState);
|
||||
seekPending = true;
|
||||
setTermState(term, state);
|
||||
positioned = false;
|
||||
bytesReader.term.copy(term);
|
||||
//System.out.println(" cached!");
|
||||
return SeekStatus.FOUND;
|
||||
}
|
||||
|
@ -439,12 +426,7 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
|||
if (cmp == 0) {
|
||||
// Done!
|
||||
if (useCache) {
|
||||
// Store in cache
|
||||
FieldAndTerm entryKey = new FieldAndTerm(fieldTerm);
|
||||
cachedState = (TermState) state.clone();
|
||||
// this is fp after current term
|
||||
cachedState.filePointer = in.getFilePointer();
|
||||
termsCache.put(entryKey, cachedState);
|
||||
cacheTerm(fieldTerm);
|
||||
}
|
||||
|
||||
return SeekStatus.FOUND;
|
||||
|
@ -464,6 +446,23 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
|||
return SeekStatus.END;
|
||||
}
|
||||
|
||||
private final void setTermState(BytesRef term, final TermState termState) {
|
||||
assert termState != null && termState instanceof PrefixCodedTermState;
|
||||
state.copyFrom(termState);
|
||||
seekPending = true;
|
||||
bytesReader.term.copy(term);
|
||||
}
|
||||
|
||||
private final void cacheTerm(FieldAndTerm other) {
|
||||
// Store in cache
|
||||
final FieldAndTerm entryKey = new FieldAndTerm(other);
|
||||
final PrefixCodedTermState cachedState = (PrefixCodedTermState) state.clone();
|
||||
// this is fp after current term
|
||||
cachedState.filePointer = in.getFilePointer();
|
||||
termsCache.put(entryKey, cachedState);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BytesRef term() {
|
||||
return bytesReader.term;
|
||||
|
@ -498,7 +497,9 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
|||
postingsReader.readTerm(in,
|
||||
fieldInfo, state,
|
||||
isIndexTerm);
|
||||
state.ord++;
|
||||
if (doOrd) {
|
||||
state.ord++;
|
||||
}
|
||||
positioned = true;
|
||||
|
||||
//System.out.println("te.next term=" + bytesReader.term.utf8ToString());
|
||||
|
@ -512,7 +513,7 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
|||
|
||||
@Override
|
||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||
DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse);
|
||||
final DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse);
|
||||
assert docsEnum != null;
|
||||
return docsEnum;
|
||||
}
|
||||
|
@ -526,6 +527,23 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public SeekStatus seek(BytesRef term, TermState otherState) throws IOException {
|
||||
assert otherState != null && otherState instanceof PrefixCodedTermState;
|
||||
assert otherState.getClass() == this.state.getClass() : "Illegal TermState type " + otherState.getClass();
|
||||
assert ((PrefixCodedTermState)otherState).ord < numTerms;
|
||||
setTermState(term, otherState);
|
||||
positioned = false;
|
||||
return SeekStatus.FOUND;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TermState termState() throws IOException {
|
||||
final PrefixCodedTermState newTermState = (PrefixCodedTermState) state.clone();
|
||||
newTermState.filePointer = in.getFilePointer();
|
||||
return newTermState;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SeekStatus seek(long ord) throws IOException {
|
||||
|
||||
|
@ -562,7 +580,6 @@ public class PrefixCodedTermsReader extends FieldsProducer {
|
|||
return SeekStatus.FOUND;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long ord() {
|
||||
if (!doOrd) {
|
||||
throw new UnsupportedOperationException();
|
||||
|
|
|
@ -33,6 +33,7 @@ import org.apache.lucene.index.FieldsEnum;
|
|||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentInfo;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.CompoundFileReader;
|
||||
|
@ -741,11 +742,6 @@ public class PreFlexFields extends FieldsProducer {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() throws IOException {
|
||||
getTermsDict().cacheCurrentTerm(termEnum);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SeekStatus seek(long ord) throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
|
|
|
@ -22,8 +22,9 @@ import java.io.IOException;
|
|||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.codecs.PostingsReaderBase;
|
||||
import org.apache.lucene.index.codecs.TermState;
|
||||
import org.apache.lucene.index.codecs.PrefixCodedTermState;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
@ -56,10 +57,10 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
|
|||
wrappedPostingsReader.init(termsIn);
|
||||
}
|
||||
|
||||
private static class PulsingTermState extends TermState {
|
||||
private static class PulsingTermState extends PrefixCodedTermState {
|
||||
private byte[] postings;
|
||||
private int postingsSize; // -1 if this term was not inlined
|
||||
private TermState wrappedTermState;
|
||||
private PrefixCodedTermState wrappedTermState;
|
||||
private boolean pendingIndexTerm;
|
||||
|
||||
@Override
|
||||
|
@ -71,7 +72,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
|
|||
System.arraycopy(postings, 0, clone.postings, 0, postingsSize);
|
||||
} else {
|
||||
assert wrappedTermState != null;
|
||||
clone.wrappedTermState = (TermState) wrappedTermState.clone();
|
||||
clone.wrappedTermState = (PrefixCodedTermState) wrappedTermState.clone();
|
||||
}
|
||||
return clone;
|
||||
}
|
||||
|
@ -102,15 +103,14 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public TermState newTermState() throws IOException {
|
||||
public PrefixCodedTermState newTermState() throws IOException {
|
||||
PulsingTermState state = new PulsingTermState();
|
||||
state.wrappedTermState = wrappedPostingsReader.newTermState();
|
||||
return state;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState _termState, boolean isIndexTerm) throws IOException {
|
||||
|
||||
public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, PrefixCodedTermState _termState, boolean isIndexTerm) throws IOException {
|
||||
PulsingTermState termState = (PulsingTermState) _termState;
|
||||
|
||||
termState.pendingIndexTerm |= isIndexTerm;
|
||||
|
@ -137,7 +137,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
|
|||
// TODO: we could actually reuse, by having TL that
|
||||
// holds the last wrapped reuse, and vice-versa
|
||||
@Override
|
||||
public DocsEnum docs(FieldInfo field, TermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||
public DocsEnum docs(FieldInfo field, PrefixCodedTermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||
PulsingTermState termState = (PulsingTermState) _termState;
|
||||
if (termState.postingsSize != -1) {
|
||||
PulsingDocsEnum postings;
|
||||
|
@ -162,7 +162,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
|
|||
|
||||
// TODO: -- not great that we can't always reuse
|
||||
@Override
|
||||
public DocsAndPositionsEnum docsAndPositions(FieldInfo field, TermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
|
||||
public DocsAndPositionsEnum docsAndPositions(FieldInfo field, PrefixCodedTermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
|
||||
if (field.omitTermFreqAndPositions) {
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -25,8 +25,9 @@ import org.apache.lucene.index.DocsAndPositionsEnum;
|
|||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentInfo;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.codecs.PostingsReaderBase;
|
||||
import org.apache.lucene.index.codecs.TermState;
|
||||
import org.apache.lucene.index.codecs.PrefixCodedTermState;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.Bits;
|
||||
|
@ -129,12 +130,13 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
|
|||
}
|
||||
}
|
||||
|
||||
private static class SepTermState extends TermState {
|
||||
private static final class SepTermState extends PrefixCodedTermState {
|
||||
// We store only the seek point to the docs file because
|
||||
// the rest of the info (freqIndex, posIndex, etc.) is
|
||||
// stored in the docs file:
|
||||
IntIndexInput.Index docIndex;
|
||||
|
||||
|
||||
@Override
|
||||
public Object clone() {
|
||||
SepTermState other = (SepTermState) super.clone();
|
||||
other.docIndex = (IntIndexInput.Index) docIndex.clone();
|
||||
|
@ -154,19 +156,19 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public TermState newTermState() throws IOException {
|
||||
public PrefixCodedTermState newTermState() throws IOException {
|
||||
final SepTermState state = new SepTermState();
|
||||
state.docIndex = docIn.index();
|
||||
return state;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState termState, boolean isIndexTerm) throws IOException {
|
||||
public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, PrefixCodedTermState termState, boolean isIndexTerm) throws IOException {
|
||||
((SepTermState) termState).docIndex.read(termsIn, isIndexTerm);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocsEnum docs(FieldInfo fieldInfo, TermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||
public DocsEnum docs(FieldInfo fieldInfo, PrefixCodedTermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||
final SepTermState termState = (SepTermState) _termState;
|
||||
SepDocsEnum docsEnum;
|
||||
if (reuse == null || !(reuse instanceof SepDocsEnum)) {
|
||||
|
@ -185,7 +187,7 @@ public class SepPostingsReaderImpl extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
|
||||
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, PrefixCodedTermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
|
||||
assert !fieldInfo.omitTermFreqAndPositions;
|
||||
final SepTermState termState = (SepTermState) _termState;
|
||||
SepDocsAndPositionsEnum postingsEnum;
|
||||
|
|
|
@ -21,6 +21,7 @@ import org.apache.lucene.util.BytesRef;
|
|||
import org.apache.lucene.index.codecs.FieldsProducer;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.FieldsEnum;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
|
@ -151,10 +152,6 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef next() throws IOException {
|
||||
assert !ended;
|
||||
|
@ -214,7 +211,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
}
|
||||
return docsAndPositionsEnum.reset(docsStart, skipDocs);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
|
@ -439,7 +436,6 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
}
|
||||
|
||||
private class SimpleTextTerms extends Terms {
|
||||
private final String field;
|
||||
private final long termsStart;
|
||||
private final boolean omitTF;
|
||||
private FST<PairOutputs.Pair<Long,Long>> fst;
|
||||
|
@ -447,7 +443,6 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
private final BytesRef scratch = new BytesRef(10);
|
||||
|
||||
public SimpleTextTerms(String field, long termsStart) throws IOException {
|
||||
this.field = StringHelper.intern(field);
|
||||
this.termsStart = termsStart;
|
||||
omitTF = fieldInfos.fieldInfo(field).omitTermFreqAndPositions;
|
||||
loadTerms();
|
||||
|
|
|
@ -26,8 +26,9 @@ import org.apache.lucene.index.FieldInfo;
|
|||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.codecs.PostingsReaderBase;
|
||||
import org.apache.lucene.index.codecs.TermState;
|
||||
import org.apache.lucene.index.codecs.PrefixCodedTermState;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@ -83,20 +84,20 @@ public class StandardPostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
// Must keep final because we do non-standard clone
|
||||
private final static class DocTermState extends TermState {
|
||||
private final static class StandardTermState extends PrefixCodedTermState {
|
||||
long freqOffset;
|
||||
long proxOffset;
|
||||
int skipOffset;
|
||||
|
||||
public Object clone() {
|
||||
DocTermState other = new DocTermState();
|
||||
StandardTermState other = new StandardTermState();
|
||||
other.copyFrom(this);
|
||||
return other;
|
||||
}
|
||||
|
||||
public void copyFrom(TermState _other) {
|
||||
super.copyFrom(_other);
|
||||
DocTermState other = (DocTermState) _other;
|
||||
StandardTermState other = (StandardTermState) _other;
|
||||
freqOffset = other.freqOffset;
|
||||
proxOffset = other.proxOffset;
|
||||
skipOffset = other.skipOffset;
|
||||
|
@ -108,8 +109,8 @@ public class StandardPostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public TermState newTermState() {
|
||||
return new DocTermState();
|
||||
public PrefixCodedTermState newTermState() {
|
||||
return new StandardTermState();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -126,10 +127,9 @@ public class StandardPostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState termState, boolean isIndexTerm)
|
||||
public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, PrefixCodedTermState termState, boolean isIndexTerm)
|
||||
throws IOException {
|
||||
|
||||
final DocTermState docTermState = (DocTermState) termState;
|
||||
final StandardTermState docTermState = (StandardTermState) termState;
|
||||
|
||||
if (isIndexTerm) {
|
||||
docTermState.freqOffset = termsIn.readVLong();
|
||||
|
@ -153,7 +153,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
@Override
|
||||
public DocsEnum docs(FieldInfo fieldInfo, TermState termState, Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||
public DocsEnum docs(FieldInfo fieldInfo, PrefixCodedTermState termState, Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||
SegmentDocsEnum docsEnum;
|
||||
if (reuse == null || !(reuse instanceof SegmentDocsEnum)) {
|
||||
docsEnum = new SegmentDocsEnum(freqIn);
|
||||
|
@ -166,11 +166,11 @@ public class StandardPostingsReader extends PostingsReaderBase {
|
|||
docsEnum = new SegmentDocsEnum(freqIn);
|
||||
}
|
||||
}
|
||||
return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs);
|
||||
return docsEnum.reset(fieldInfo, (StandardTermState) termState, skipDocs);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
|
||||
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, PrefixCodedTermState termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
|
||||
if (fieldInfo.omitTermFreqAndPositions) {
|
||||
return null;
|
||||
}
|
||||
|
@ -189,7 +189,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
|
|||
docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn);
|
||||
}
|
||||
}
|
||||
return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs);
|
||||
return docsEnum.reset(fieldInfo, (StandardTermState) termState, skipDocs);
|
||||
} else {
|
||||
SegmentDocsAndPositionsEnum docsEnum;
|
||||
if (reuse == null || !(reuse instanceof SegmentDocsAndPositionsEnum)) {
|
||||
|
@ -203,7 +203,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
|
|||
docsEnum = new SegmentDocsAndPositionsEnum(freqIn, proxIn);
|
||||
}
|
||||
}
|
||||
return docsEnum.reset(fieldInfo, (DocTermState) termState, skipDocs);
|
||||
return docsEnum.reset(fieldInfo, (StandardTermState) termState, skipDocs);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -233,7 +233,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
|
|||
this.freqIn = (IndexInput) freqIn.clone();
|
||||
}
|
||||
|
||||
public SegmentDocsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs) throws IOException {
|
||||
public SegmentDocsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits skipDocs) throws IOException {
|
||||
omitTF = fieldInfo.omitTermFreqAndPositions;
|
||||
if (omitTF) {
|
||||
freq = 1;
|
||||
|
@ -407,7 +407,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
|
|||
this.proxIn = (IndexInput) proxIn.clone();
|
||||
}
|
||||
|
||||
public SegmentDocsAndPositionsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs) throws IOException {
|
||||
public SegmentDocsAndPositionsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits skipDocs) throws IOException {
|
||||
assert !fieldInfo.omitTermFreqAndPositions;
|
||||
assert !fieldInfo.storePayloads;
|
||||
|
||||
|
@ -594,7 +594,7 @@ public class StandardPostingsReader extends PostingsReaderBase {
|
|||
this.proxIn = (IndexInput) proxIn.clone();
|
||||
}
|
||||
|
||||
public SegmentDocsAndPositionsAndPayloadsEnum reset(FieldInfo fieldInfo, DocTermState termState, Bits skipDocs) throws IOException {
|
||||
public SegmentDocsAndPositionsAndPayloadsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits skipDocs) throws IOException {
|
||||
assert !fieldInfo.omitTermFreqAndPositions;
|
||||
assert fieldInfo.storePayloads;
|
||||
if (payload == null) {
|
||||
|
|
|
@ -21,9 +21,15 @@ import java.io.IOException;
|
|||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.ByteBlockPool;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefHash;
|
||||
import org.apache.lucene.util.PerReaderTermState;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
|
||||
|
||||
class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
|
||||
|
||||
|
@ -71,8 +77,8 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/) {
|
||||
topLevel.add(new TermQuery(term, docFreq), BooleanClause.Occur.SHOULD);
|
||||
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/, PerReaderTermState states) {
|
||||
topLevel.add(new TermQuery(term, states), BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -98,9 +104,10 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
|
|||
final BytesRefHash pendingTerms = col.pendingTerms;
|
||||
final int sort[] = pendingTerms.sort(col.termsEnum.getComparator());
|
||||
for(int i = 0; i < size; i++) {
|
||||
final int pos = sort[i];
|
||||
// docFreq is not used for constant score here, we pass 1
|
||||
// to explicitly set a fake value, so it's not calculated
|
||||
addClause(bq, placeholderTerm.createTerm(pendingTerms.get(sort[i], new BytesRef())), 1, 1.0f);
|
||||
addClause(bq, placeholderTerm.createTerm(pendingTerms.get(pos, new BytesRef())), 1, 1.0f, col.array.termState[pos]);
|
||||
}
|
||||
// Strip scores
|
||||
final Query result = new ConstantScoreQuery(bq);
|
||||
|
@ -123,12 +130,21 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
|
|||
|
||||
@Override
|
||||
public boolean collect(BytesRef bytes) throws IOException {
|
||||
pendingTerms.add(bytes);
|
||||
int pos = pendingTerms.add(bytes);
|
||||
docVisitCount += termsEnum.docFreq();
|
||||
if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
|
||||
hasCutOff = true;
|
||||
return false;
|
||||
}
|
||||
|
||||
final TermState termState = termsEnum.termState();
|
||||
assert termState != null;
|
||||
if (pos < 0) {
|
||||
pos = (-pos)-1;
|
||||
array.termState[pos].register(termState, readerContext.ord, termsEnum.docFreq());
|
||||
} else {
|
||||
array.termState[pos] = new PerReaderTermState(topReaderContext, termState, readerContext.ord, termsEnum.docFreq());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -137,7 +153,8 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
|
|||
TermsEnum termsEnum;
|
||||
|
||||
final int docCountCutoff, termCountLimit;
|
||||
final BytesRefHash pendingTerms = new BytesRefHash();
|
||||
final TermStateByteStart array = new TermStateByteStart(16);
|
||||
final BytesRefHash pendingTerms = new BytesRefHash(new ByteBlockPool(new ByteBlockPool.DirectAllocator()), 16, array);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -166,4 +183,40 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
|
|||
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Special implementation of BytesStartArray that keeps parallel arrays for {@link PerReaderTermState} */
|
||||
static final class TermStateByteStart extends DirectBytesStartArray {
|
||||
PerReaderTermState[] termState;
|
||||
|
||||
public TermStateByteStart(int initSize) {
|
||||
super(initSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] init() {
|
||||
final int[] ord = super.init();
|
||||
termState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||
assert termState.length >= ord.length;
|
||||
return ord;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] grow() {
|
||||
final int[] ord = super.grow();
|
||||
if (termState.length < ord.length) {
|
||||
PerReaderTermState[] tmpTermState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||
System.arraycopy(termState, 0, tmpTermState, 0, termState.length);
|
||||
termState = tmpTermState;
|
||||
}
|
||||
assert termState.length >= ord.length;
|
||||
return ord;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] clear() {
|
||||
termState = null;
|
||||
return super.clear();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
|||
import java.util.Comparator;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
|
@ -155,12 +156,24 @@ public abstract class FilteredTermsEnum extends TermsEnum {
|
|||
public DocsAndPositionsEnum docsAndPositions(Bits bits, DocsAndPositionsEnum reuse) throws IOException {
|
||||
return tenum.docsAndPositions(bits, reuse);
|
||||
}
|
||||
|
||||
|
||||
/** This enum does not support seeking!
|
||||
* @throws UnsupportedOperationException
|
||||
*/
|
||||
@Override
|
||||
public void cacheCurrentTerm() throws IOException {
|
||||
tenum.cacheCurrentTerm();
|
||||
public SeekStatus seek(BytesRef term, TermState state) throws IOException {
|
||||
throw new UnsupportedOperationException(getClass().getName()+" does not support seeking");
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the filtered enums term state
|
||||
*/
|
||||
@Override
|
||||
public TermState termState() throws IOException {
|
||||
assert tenum != null;
|
||||
return tenum.termState();
|
||||
}
|
||||
|
||||
@SuppressWarnings("fallthrough")
|
||||
@Override
|
||||
public BytesRef next() throws IOException {
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.search;
|
|||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.util.Attribute;
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
|
@ -244,11 +245,6 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
|||
return actualEnum.docFreq();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() throws IOException {
|
||||
actualEnum.cacheCurrentTerm();
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
|
||||
return actualEnum.docs(skipDocs, reuse);
|
||||
|
@ -260,6 +256,15 @@ public final class FuzzyTermsEnum extends TermsEnum {
|
|||
return actualEnum.docsAndPositions(skipDocs, reuse);
|
||||
}
|
||||
|
||||
public SeekStatus seek(BytesRef term, TermState state) throws IOException {
|
||||
return actualEnum.seek(term, state);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TermState termState() throws IOException {
|
||||
return actualEnum.termState();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() throws IOException {
|
||||
return actualEnum.getComparator();
|
||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.lucene.index.Terms;
|
|||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.queryParser.QueryParser;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.PerReaderTermState;
|
||||
|
||||
/**
|
||||
* An abstract {@link Query} that matches documents
|
||||
|
@ -159,8 +160,8 @@ public abstract class MultiTermQuery extends Query {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost) {
|
||||
final TermQuery tq = new TermQuery(term, docCount);
|
||||
protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost, PerReaderTermState states) {
|
||||
final TermQuery tq = new TermQuery(term, states);
|
||||
tq.setBoost(boost);
|
||||
topLevel.add(tq, BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
|
@ -200,8 +201,8 @@ public abstract class MultiTermQuery extends Query {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost) {
|
||||
final Query q = new ConstantScoreQuery(new TermQuery(term, docFreq));
|
||||
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost, PerReaderTermState states) {
|
||||
final Query q = new ConstantScoreQuery(new TermQuery(term, states));
|
||||
q.setBoost(boost);
|
||||
topLevel.add(q, BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.search;
|
|||
import java.io.IOException;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.MultiTermQuery.RewriteMethod;
|
||||
|
||||
|
@ -27,6 +28,7 @@ import org.apache.lucene.util.ArrayUtil;
|
|||
import org.apache.lucene.util.ByteBlockPool;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.BytesRefHash;
|
||||
import org.apache.lucene.util.PerReaderTermState;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
|
||||
|
||||
|
@ -53,8 +55,9 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost) {
|
||||
final TermQuery tq = new TermQuery(term, docCount);
|
||||
protected void addClause(BooleanQuery topLevel, Term term, int docCount,
|
||||
float boost, PerReaderTermState states) {
|
||||
final TermQuery tq = new TermQuery(term, states);
|
||||
tq.setBoost(boost);
|
||||
topLevel.add(tq, BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
|
@ -114,13 +117,13 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
|
|||
final int size = col.terms.size();
|
||||
if (size > 0) {
|
||||
final int sort[] = col.terms.sort(col.termsEnum.getComparator());
|
||||
final int[] docFreq = col.array.docFreq;
|
||||
final float[] boost = col.array.boost;
|
||||
final PerReaderTermState[] termStates = col.array.termState;
|
||||
for (int i = 0; i < size; i++) {
|
||||
final int pos = sort[i];
|
||||
final Term term = placeholderTerm.createTerm(col.terms.get(pos, new BytesRef()));
|
||||
assert reader.docFreq(term) == docFreq[pos];
|
||||
addClause(result, term, docFreq[pos], query.getBoost() * boost[pos]);
|
||||
assert reader.docFreq(term) == termStates[pos].docFreq();
|
||||
addClause(result, term, termStates[pos].docFreq(), query.getBoost() * boost[pos], termStates[pos]);
|
||||
}
|
||||
}
|
||||
query.incTotalNumberOfTerms(size);
|
||||
|
@ -143,15 +146,17 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
|
|||
@Override
|
||||
public boolean collect(BytesRef bytes) throws IOException {
|
||||
final int e = terms.add(bytes);
|
||||
final TermState state = termsEnum.termState();
|
||||
assert state != null;
|
||||
if (e < 0 ) {
|
||||
// duplicate term: update docFreq
|
||||
final int pos = (-e)-1;
|
||||
array.docFreq[pos] += termsEnum.docFreq();
|
||||
array.termState[pos].register(state, readerContext.ord, termsEnum.docFreq());
|
||||
assert array.boost[pos] == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums";
|
||||
} else {
|
||||
// new entry: we populate the entry initially
|
||||
array.docFreq[e] = termsEnum.docFreq();
|
||||
array.boost[e] = boostAtt.getBoost();
|
||||
array.termState[e] = new PerReaderTermState(topReaderContext, state, readerContext.ord, termsEnum.docFreq());
|
||||
ScoringRewrite.this.checkMaxClauseCount(terms.size());
|
||||
}
|
||||
return true;
|
||||
|
@ -160,8 +165,8 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
|
|||
|
||||
/** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */
|
||||
static final class TermFreqBoostByteStart extends DirectBytesStartArray {
|
||||
int[] docFreq;
|
||||
float[] boost;
|
||||
PerReaderTermState[] termState;
|
||||
|
||||
public TermFreqBoostByteStart(int initSize) {
|
||||
super(initSize);
|
||||
|
@ -171,24 +176,28 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
|
|||
public int[] init() {
|
||||
final int[] ord = super.init();
|
||||
boost = new float[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_FLOAT)];
|
||||
docFreq = new int[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_INT)];
|
||||
assert boost.length >= ord.length && docFreq.length >= ord.length;
|
||||
termState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||
assert termState.length >= ord.length && boost.length >= ord.length;
|
||||
return ord;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] grow() {
|
||||
final int[] ord = super.grow();
|
||||
docFreq = ArrayUtil.grow(docFreq, ord.length);
|
||||
boost = ArrayUtil.grow(boost, ord.length);
|
||||
assert boost.length >= ord.length && docFreq.length >= ord.length;
|
||||
if (termState.length < ord.length) {
|
||||
PerReaderTermState[] tmpTermState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
|
||||
System.arraycopy(termState, 0, tmpTermState, 0, termState.length);
|
||||
termState = tmpTermState;
|
||||
}
|
||||
assert termState.length >= ord.length && boost.length >= ord.length;
|
||||
return ord;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] clear() {
|
||||
boost = null;
|
||||
docFreq = null;
|
||||
termState = null;
|
||||
return super.clear();
|
||||
}
|
||||
|
||||
|
|
|
@ -18,8 +18,6 @@ package org.apache.lucene.search;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Comparator;
|
||||
|
||||
import org.apache.lucene.index.Fields;
|
||||
|
@ -27,25 +25,33 @@ import org.apache.lucene.index.IndexReader;
|
|||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.IndexReader.ReaderContext;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.PerReaderTermState;
|
||||
import org.apache.lucene.util.ReaderUtil;
|
||||
|
||||
abstract class TermCollectingRewrite<Q extends Query> extends MultiTermQuery.RewriteMethod {
|
||||
|
||||
|
||||
/** Return a suitable top-level Query for holding all expanded terms. */
|
||||
protected abstract Q getTopLevelQuery() throws IOException;
|
||||
|
||||
/** Add a MultiTermQuery term to the top-level query */
|
||||
protected abstract void addClause(Q topLevel, Term term, int docCount, float boost) throws IOException;
|
||||
protected final void addClause(Q topLevel, Term term, int docCount, float boost) throws IOException {
|
||||
addClause(topLevel, term, docCount, boost, null);
|
||||
}
|
||||
|
||||
/**
 * Add a MultiTermQuery term to the top-level query.
 *
 * @param topLevel the query that receives the clause for {@code term}
 * @param term the expanded term to add
 * @param docCount document count supplied by the caller for the term (visible
 *        callers pass either the collected docFreq or a placeholder of 1)
 * @param boost boost to apply to the added clause; implementations may ignore it
 * @param states term states collected for {@code term} during rewrite; is
 *        {@code null} when invoked through the four-argument overload
 */
protected abstract void addClause(Q topLevel, Term term, int docCount, float boost, PerReaderTermState states) throws IOException;
|
||||
|
||||
|
||||
protected final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {
|
||||
final List<IndexReader> subReaders = new ArrayList<IndexReader>();
|
||||
ReaderUtil.gatherSubReaders(subReaders, reader);
|
||||
ReaderContext topReaderContext = reader.getTopReaderContext();
|
||||
Comparator<BytesRef> lastTermComp = null;
|
||||
|
||||
for (IndexReader r : subReaders) {
|
||||
final Fields fields = r.fields();
|
||||
final AtomicReaderContext[] leaves = ReaderUtil.leaves(topReaderContext);
|
||||
for (AtomicReaderContext context : leaves) {
|
||||
final Fields fields = context.reader.fields();
|
||||
if (fields == null) {
|
||||
// reader has no fields
|
||||
continue;
|
||||
|
@ -68,11 +74,10 @@ abstract class TermCollectingRewrite<Q extends Query> extends MultiTermQuery.Rew
|
|||
if (lastTermComp != null && newTermComp != null && newTermComp != lastTermComp)
|
||||
throw new RuntimeException("term comparator should not change between segments: "+lastTermComp+" != "+newTermComp);
|
||||
lastTermComp = newTermComp;
|
||||
|
||||
collector.setReaderContext(topReaderContext, context);
|
||||
collector.setNextEnum(termsEnum);
|
||||
BytesRef bytes;
|
||||
while ((bytes = termsEnum.next()) != null) {
|
||||
termsEnum.cacheCurrentTerm();
|
||||
if (!collector.collect(bytes))
|
||||
return; // interrupt whole term collection, so also don't iterate other subReaders
|
||||
}
|
||||
|
@ -80,6 +85,14 @@ abstract class TermCollectingRewrite<Q extends Query> extends MultiTermQuery.Rew
|
|||
}
|
||||
|
||||
protected static abstract class TermCollector {
|
||||
|
||||
protected AtomicReaderContext readerContext;
|
||||
protected ReaderContext topReaderContext;
|
||||
|
||||
public void setReaderContext(ReaderContext topReaderContext, AtomicReaderContext readerContext) {
|
||||
this.readerContext = readerContext;
|
||||
this.topReaderContext = topReaderContext;
|
||||
}
|
||||
/** attributes used for communication with the enum */
|
||||
public final AttributeSource attributes = new AttributeSource();
|
||||
|
||||
|
|
|
@ -22,10 +22,14 @@ import java.util.Set;
|
|||
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.IndexReader.ReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.Explanation.IDFExplanation;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.PerReaderTermState;
|
||||
import org.apache.lucene.util.ToStringUtils;
|
||||
|
||||
/** A Query that matches documents containing a term.
|
||||
|
@ -33,20 +37,22 @@ import org.apache.lucene.util.ToStringUtils;
|
|||
*/
|
||||
public class TermQuery extends Query {
|
||||
private final Term term;
|
||||
private final int docFreq;
|
||||
private int docFreq;
|
||||
private transient PerReaderTermState perReaderTermState;
|
||||
|
||||
private class TermWeight extends Weight {
|
||||
private final Similarity similarity;
|
||||
private float value;
|
||||
private float idf;
|
||||
private final float idf;
|
||||
private float queryNorm;
|
||||
private float queryWeight;
|
||||
private IDFExplanation idfExp;
|
||||
private transient ReaderContext weightContext; // only set if -ea for assert in scorer()
|
||||
private final IDFExplanation idfExp;
|
||||
private transient PerReaderTermState termStates;
|
||||
|
||||
public TermWeight(IndexSearcher searcher)
|
||||
public TermWeight(IndexSearcher searcher, PerReaderTermState termStates, int docFreq)
|
||||
throws IOException {
|
||||
assert setWeightContext(searcher);
|
||||
assert termStates != null : "PerReaderTermState must not be null";
|
||||
this.termStates = termStates;
|
||||
this.similarity = getSimilarity(searcher);
|
||||
if (docFreq != -1) {
|
||||
idfExp = similarity.idfExplain(term, searcher, docFreq);
|
||||
|
@ -80,31 +86,34 @@ public class TermQuery extends Query {
|
|||
|
||||
@Override
|
||||
public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, boolean topScorer) throws IOException {
|
||||
assert assertTopReaderContext(context);
|
||||
final String field = term.field();
|
||||
final IndexReader reader = context.reader;
|
||||
DocsEnum docs = reader.termDocsEnum(reader.getDeletedDocs(),
|
||||
term.field(),
|
||||
term.bytes());
|
||||
|
||||
if (docs == null) {
|
||||
assert assertTopReaderContext(termStates, context) : "The top-reader used to create Weight is not the same as the current reader's top-reader";
|
||||
final TermState state = termStates
|
||||
.get(context.ord);
|
||||
if (state == null) { // term is not present in that reader
|
||||
assert termNotInReader(reader, field, term.bytes()) : "no termstate found but term exists in reader";
|
||||
return null;
|
||||
}
|
||||
|
||||
return new TermScorer(this, docs, similarity, reader.norms(term.field()));
|
||||
final DocsEnum docs = reader.termDocsEnum(reader.getDeletedDocs(), field, term.bytes(), state);
|
||||
assert docs != null;
|
||||
return new TermScorer(this, docs, similarity, context.reader.norms(field));
|
||||
}
|
||||
|
||||
private boolean assertTopReaderContext(ReaderContext context) {
|
||||
while (context.parent != null) {
|
||||
private boolean termNotInReader(IndexReader reader, String field, BytesRef bytes) throws IOException {
|
||||
// only called from assert
|
||||
final Terms terms = reader.terms(field);
|
||||
return terms == null || terms.docFreq(bytes) == 0;
|
||||
}
|
||||
|
||||
private boolean assertTopReaderContext(PerReaderTermState state, ReaderContext context) {
|
||||
while(context.parent != null) {
|
||||
context = context.parent;
|
||||
}
|
||||
return weightContext == context;
|
||||
return state.topReaderContext == context;
|
||||
}
|
||||
|
||||
private boolean setWeightContext(IndexSearcher searcher) {
|
||||
weightContext = searcher.getTopReaderContext();
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Explanation explain(AtomicReaderContext context, int doc)
|
||||
throws IOException {
|
||||
|
@ -157,7 +166,7 @@ public class TermQuery extends Query {
|
|||
fieldExpl.addDetail(expl);
|
||||
|
||||
Explanation fieldNormExpl = new Explanation();
|
||||
byte[] fieldNorms = reader.norms(field);
|
||||
final byte[] fieldNorms = reader.norms(field);
|
||||
float fieldNorm =
|
||||
fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f;
|
||||
fieldNormExpl.setValue(fieldNorm);
|
||||
|
@ -193,6 +202,17 @@ public class TermQuery extends Query {
|
|||
public TermQuery(Term t, int docFreq) {
|
||||
term = t;
|
||||
this.docFreq = docFreq;
|
||||
perReaderTermState = null;
|
||||
}
|
||||
|
||||
/** Expert: constructs a TermQuery that will use the
|
||||
* docFreq accumulated in the provided {@link PerReaderTermState}
|
||||
* instead of looking up the docFreq against the searcher. */
|
||||
public TermQuery(Term t, PerReaderTermState states) {
|
||||
assert states != null;
|
||||
term = t;
|
||||
docFreq = states.docFreq();
|
||||
perReaderTermState = states;
|
||||
}
|
||||
|
||||
/** Returns the term of this query. */
|
||||
|
@ -200,7 +220,21 @@ public class TermQuery extends Query {
|
|||
|
||||
@Override
|
||||
public Weight createWeight(IndexSearcher searcher) throws IOException {
|
||||
return new TermWeight(searcher);
|
||||
final ReaderContext context = searcher.getTopReaderContext();
|
||||
final int weightDocFreq;
|
||||
final PerReaderTermState termState;
|
||||
if (perReaderTermState == null || perReaderTermState.topReaderContext != context) {
|
||||
// make TermQuery single-pass if we don't have a PRTS or if the context differs!
|
||||
termState = PerReaderTermState.build(context, term, true); // cache term lookups!
|
||||
// we must not ignore the given docFreq - if set use the given value
|
||||
weightDocFreq = docFreq == -1 ? termState.docFreq() : docFreq;
|
||||
} else {
|
||||
// PRTS was pre-built for this IndexSearcher
|
||||
termState = this.perReaderTermState;
|
||||
weightDocFreq = docFreq;
|
||||
}
|
||||
|
||||
return new TermWeight(searcher, termState, weightDocFreq);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -25,9 +25,11 @@ import java.util.Comparator;
|
|||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.PerReaderTermState;
|
||||
|
||||
/**
|
||||
* Base rewrite method for collecting only the top terms
|
||||
|
@ -78,12 +80,12 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
|
|||
this.termComp = termsEnum.getComparator();
|
||||
// lazy init the initial ScoreTerm because comparator is not known on ctor:
|
||||
if (st == null)
|
||||
st = new ScoreTerm(this.termComp);
|
||||
st = new ScoreTerm(this.termComp, new PerReaderTermState(topReaderContext));
|
||||
boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean collect(BytesRef bytes) {
|
||||
public boolean collect(BytesRef bytes) throws IOException {
|
||||
final float boost = boostAtt.getBoost();
|
||||
// ignore uncompetitive hits
|
||||
if (stQueue.size() == maxSize) {
|
||||
|
@ -94,23 +96,27 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
|
|||
return true;
|
||||
}
|
||||
ScoreTerm t = visitedTerms.get(bytes);
|
||||
final TermState state = termsEnum.termState();
|
||||
assert state != null;
|
||||
if (t != null) {
|
||||
// if the term is already in the PQ, only update docFreq of term in PQ
|
||||
t.docFreq += termsEnum.docFreq();
|
||||
assert t.boost == boost : "boost should be equal in all segment TermsEnums";
|
||||
t.termState.register(state, readerContext.ord, termsEnum.docFreq());
|
||||
} else {
|
||||
// add new entry in PQ, we must clone the term, else it may get overwritten!
|
||||
st.bytes.copy(bytes);
|
||||
st.boost = boost;
|
||||
st.docFreq = termsEnum.docFreq();
|
||||
visitedTerms.put(st.bytes, st);
|
||||
assert st.termState.docFreq() == 0;
|
||||
st.termState.register(state, readerContext.ord, termsEnum.docFreq());
|
||||
stQueue.offer(st);
|
||||
// possibly drop entries from queue
|
||||
if (stQueue.size() > maxSize) {
|
||||
st = stQueue.poll();
|
||||
visitedTerms.remove(st.bytes);
|
||||
st.termState.clear(); // reset the termstate!
|
||||
} else {
|
||||
st = new ScoreTerm(termComp);
|
||||
st = new ScoreTerm(termComp, new PerReaderTermState(topReaderContext));
|
||||
}
|
||||
assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
|
||||
// set maxBoostAtt with values to help FuzzyTermsEnum to optimize
|
||||
|
@ -120,6 +126,7 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
|
|||
maxBoostAtt.setCompetitiveTerm(t.bytes);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
});
|
||||
|
@ -130,8 +137,8 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
|
|||
ArrayUtil.quickSort(scoreTerms, scoreTermSortByTermComp);
|
||||
for (final ScoreTerm st : scoreTerms) {
|
||||
final Term term = placeholderTerm.createTerm(st.bytes);
|
||||
assert reader.docFreq(term) == st.docFreq;
|
||||
addClause(q, term, st.docFreq, query.getBoost() * st.boost); // add to query
|
||||
assert reader.docFreq(term) == st.termState.docFreq() : "reader DF is " + reader.docFreq(term) + " vs " + st.termState.docFreq();
|
||||
addClause(q, term, st.termState.docFreq(), query.getBoost() * st.boost, st.termState); // add to query
|
||||
}
|
||||
query.incTotalNumberOfTerms(scoreTerms.length);
|
||||
return q;
|
||||
|
@ -147,7 +154,7 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
|
|||
if (this == obj) return true;
|
||||
if (obj == null) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
final TopTermsRewrite other = (TopTermsRewrite) obj;
|
||||
final TopTermsRewrite<?> other = (TopTermsRewrite<?>) obj;
|
||||
if (size != other.size) return false;
|
||||
return true;
|
||||
}
|
||||
|
@ -163,13 +170,12 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
|
|||
|
||||
static final class ScoreTerm implements Comparable<ScoreTerm> {
|
||||
public final Comparator<BytesRef> termComp;
|
||||
|
||||
public final BytesRef bytes = new BytesRef();
|
||||
public float boost;
|
||||
public int docFreq;
|
||||
|
||||
public ScoreTerm(Comparator<BytesRef> termComp) {
|
||||
public final PerReaderTermState termState;
|
||||
public ScoreTerm(Comparator<BytesRef> termComp, PerReaderTermState termState) {
|
||||
this.termComp = termComp;
|
||||
this.termState = termState;
|
||||
}
|
||||
|
||||
public int compareTo(ScoreTerm other) {
|
||||
|
|
|
@ -24,8 +24,11 @@ import org.apache.lucene.index.DocsAndPositionsEnum;
|
|||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.OrdTermState;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.codecs.PrefixCodedTermState;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.FieldCache.DocTermsIndex;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
@ -303,11 +306,6 @@ public class DocTermsIndexCreator extends EntryCreatorWithOptions<DocTermsIndex>
|
|||
return term;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef term() throws IOException {
|
||||
return term;
|
||||
|
@ -337,6 +335,19 @@ public class DocTermsIndexCreator extends EntryCreatorWithOptions<DocTermsIndex>
|
|||
public Comparator<BytesRef> getComparator() throws IOException {
|
||||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public SeekStatus seek(BytesRef term, TermState state) throws IOException {
|
||||
assert state != null && state instanceof OrdTermState;
|
||||
return this.seek(((OrdTermState)state).ord);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TermState termState() throws IOException {
|
||||
OrdTermState state = new OrdTermState();
|
||||
state.ord = currentOrd;
|
||||
return state;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.lucene.search.Query;
|
|||
import org.apache.lucene.search.TopTermsRewrite;
|
||||
import org.apache.lucene.search.ScoringRewrite;
|
||||
import org.apache.lucene.search.BooleanClause.Occur; // javadocs only
|
||||
import org.apache.lucene.util.PerReaderTermState;
|
||||
|
||||
/**
|
||||
* Wraps any {@link MultiTermQuery} as a {@link SpanQuery},
|
||||
|
@ -153,7 +154,7 @@ public class SpanMultiTermQueryWrapper<Q extends MultiTermQuery> extends SpanQue
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void addClause(SpanOrQuery topLevel, Term term, int docCount, float boost) {
|
||||
protected void addClause(SpanOrQuery topLevel, Term term, int docCount, float boost, PerReaderTermState states) {
|
||||
final SpanTermQuery q = new SpanTermQuery(term);
|
||||
q.setBoost(boost);
|
||||
topLevel.addClause(q);
|
||||
|
@ -202,7 +203,7 @@ public class SpanMultiTermQueryWrapper<Q extends MultiTermQuery> extends SpanQue
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void addClause(SpanOrQuery topLevel, Term term, int docFreq, float boost) {
|
||||
protected void addClause(SpanOrQuery topLevel, Term term, int docFreq, float boost, PerReaderTermState states) {
|
||||
final SpanTermQuery q = new SpanTermQuery(term);
|
||||
q.setBoost(boost);
|
||||
topLevel.addClause(q);
|
||||
|
|
|
@ -0,0 +1,148 @@
|
|||
package org.apache.lucene.util;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.IndexReader.ReaderContext;
|
||||
import org.apache.lucene.index.TermsEnum.SeekStatus;
|
||||
|
||||
/**
|
||||
* Maintains a {@link IndexReader} {@link TermState} view over
|
||||
* {@link IndexReader} instances containing a single term. The
|
||||
* {@link PerReaderTermState} doesn't track if the given {@link TermState}
|
||||
* objects are valid, nor whether the {@link TermState} instances refer to the
|
||||
* same terms in the associated readers.
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class PerReaderTermState {
|
||||
public final ReaderContext topReaderContext; // for asserting!
|
||||
private final TermState[] states;
|
||||
private int docFreq;
|
||||
|
||||
/**
|
||||
* Creates an empty {@link PerReaderTermState} from a {@link ReaderContext}
|
||||
*/
|
||||
public PerReaderTermState(ReaderContext context) {
|
||||
assert context != null && context.isTopLevel;
|
||||
topReaderContext = context;
|
||||
docFreq = 0;
|
||||
final int len;
|
||||
if (context.leaves() == null) {
|
||||
len = 1;
|
||||
} else {
|
||||
len = context.leaves().length;
|
||||
}
|
||||
states = new TermState[len];
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link PerReaderTermState} with an initial {@link TermState},
|
||||
* {@link IndexReader} pair.
|
||||
*/
|
||||
public PerReaderTermState(ReaderContext context, TermState state, int ord, int docFreq) {
|
||||
this(context);
|
||||
register(state, ord, docFreq);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link PerReaderTermState} from a top-level {@link ReaderContext} and the
|
||||
* given {@link Term}. This method will lookup the given term in all context's leaf readers
|
||||
* and register each of the readers containing the term in the returned {@link PerReaderTermState}
|
||||
* using the leaf reader's ordinal.
|
||||
* <p>
|
||||
* Note: the given context must be a top-level context.
|
||||
*/
|
||||
public static PerReaderTermState build(ReaderContext context, Term term, boolean cache)
|
||||
throws IOException {
|
||||
assert context != null && context.isTopLevel;
|
||||
final String field = term.field();
|
||||
final BytesRef bytes = term.bytes();
|
||||
final PerReaderTermState perReaderTermState = new PerReaderTermState(context);
|
||||
final AtomicReaderContext[] leaves = ReaderUtil.leaves(context);
|
||||
for (int i = 0; i < leaves.length; i++) {
|
||||
final Fields fields = leaves[i].reader.fields();
|
||||
if (fields != null) {
|
||||
final Terms terms = fields.terms(field);
|
||||
if (terms != null) {
|
||||
final TermsEnum termsEnum = terms.getThreadTermsEnum(); // thread-private don't share!
|
||||
if (SeekStatus.FOUND == termsEnum.seek(bytes, cache)) {
|
||||
final TermState termState = termsEnum.termState();
|
||||
perReaderTermState.register(termState, leaves[i].ord, termsEnum.docFreq());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return perReaderTermState;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clears the {@link PerReaderTermState} internal state and removes all
|
||||
* registered {@link TermState}s
|
||||
*/
|
||||
public void clear() {
|
||||
docFreq = 0;
|
||||
Arrays.fill(states, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Registers and associates a {@link TermState} with a leaf ordinal. The leaf ordinal
|
||||
* should be derived from a {@link ReaderContext}'s leaf ord.
|
||||
*/
|
||||
public void register(TermState state, final int ord, final int docFreq) {
|
||||
assert state != null : "state must not be null";
|
||||
assert ord >= 0 && ord < states.length;
|
||||
assert states[ord] == null : "state for ord: " + ord
|
||||
+ " already registered";
|
||||
this.docFreq += docFreq;
|
||||
states[ord] = state;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the {@link TermState} for a leaf ordinal or <code>null</code> if no
|
||||
* {@link TermState} for the ordinal was registered.
|
||||
*
|
||||
* @param ord
|
||||
* the reader's leaf ordinal to get the {@link TermState} for.
|
||||
* @return the {@link TermState} for the given reader's ord or <code>null</code> if no
|
||||
* {@link TermState} for the reader was registered
|
||||
*/
|
||||
public TermState get(int ord) {
|
||||
assert ord >= 0 && ord < states.length;
|
||||
return states[ord];
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the accumulated document frequency of all {@link TermState}
|
||||
* instances passed to {@link #register(TermState, int, int)}.
|
||||
* @return the accumulated document frequency of all {@link TermState}
|
||||
* instances passed to {@link #register(TermState, int, int)}.
|
||||
*/
|
||||
public int docFreq() {
|
||||
return docFreq;
|
||||
}
|
||||
}
|
|
@ -18,6 +18,7 @@ package org.apache.lucene;
|
|||
*/
|
||||
|
||||
import org.apache.lucene.util.*;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.document.*;
|
||||
import org.apache.lucene.search.*;
|
||||
|
@ -329,10 +330,6 @@ public class TestExternalCodecs extends LuceneTestCase {
|
|||
return ramField.termToDocs.get(current).docs.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) {
|
||||
return new RAMDocsEnum(ramField.termToDocs.get(current), skipDocs);
|
||||
|
|
|
@ -212,8 +212,6 @@ public class QueryUtils {
|
|||
throw e2;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/** alternate scorer skipTo(),skipTo(),next(),next(),skipTo(),skipTo(), etc
|
||||
* and ensure a hitcollector receives same docs and scores
|
||||
|
|
|
@ -40,6 +40,7 @@ import org.apache.lucene.index.IndexReader;
|
|||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.OrdTermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.codecs.CodecProvider;
|
||||
|
|
|
@ -1000,10 +1000,6 @@ class NumberedTermsEnum extends TermsEnum {
|
|||
return tenum.docFreq();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cacheCurrentTerm() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
public BytesRef skipTo(BytesRef target) throws IOException {
|
||||
|
||||
|
|
Loading…
Reference in New Issue