mirror of https://github.com/apache/lucene.git
LUCENE-2922: optimize the scan-within-block step of BlockTermsReader.seek
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1071564 13f79535-47bb-0310-9956-ffa450edef68
commit 3f8c9b5cfc
parent 0d57f3b786
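This patch rewrites the scan step of BlockTermsReader.seek: once the terms index has positioned the enum on the correct block, the target is checked once against the block's shared term prefix and the rest of the comparison runs directly over the length-prefixed suffix bytes in the block's byte[], copying bytes into the BytesRef term only when the scan stops. It also drops the pluggable term Comparator from BlockTermsReader/BlockTermsWriter (and from the codecs that construct them), so terms are always compared in UTF-8/Unicode order. The standalone sketch below illustrates only the scan-within-block idea, under simplified assumptions (single-byte suffix lengths, plain arrays instead of Lucene's readers, no terms cache); every name in it is hypothetical and it is not code from this commit.

public class ScanWithinBlockSketch {

  /**
   * Finds the index of the first term in a block that is >= target,
   * or numTerms if every term in the block is smaller.
   *
   * @param prefix   bytes shared by every term in the block
   * @param suffixes concatenated [length][suffix bytes] entries (one-byte length here)
   * @param numTerms number of terms stored in the block
   * @param target   full term being sought, including the prefix
   */
  static int scan(byte[] prefix, byte[] suffixes, int numTerms, byte[] target) {
    // Compare the target against the shared prefix once. If it diverges,
    // every term in the block compares the same way and no scan is needed.
    int common = Math.min(prefix.length, target.length);
    for (int i = 0; i < common; i++) {
      int cmp = (prefix[i] & 0xFF) - (target[i] & 0xFF);
      if (cmp < 0) return numTerms; // whole block sorts before the target
      if (cmp > 0) return 0;        // whole block sorts after the target
    }
    if (target.length <= prefix.length) return 0;

    int pos = 0; // read position inside the suffix buffer
    for (int term = 0; term < numTerms; term++) {
      int suffixLen = suffixes[pos++] & 0xFF;
      int termLen = prefix.length + suffixLen;
      // Compare suffix bytes in place against the tail of the target;
      // nothing is copied while we are still scanning.
      int cmp = 0;
      int n = Math.min(suffixLen, target.length - prefix.length);
      for (int i = 0; i < n && cmp == 0; i++) {
        cmp = (suffixes[pos + i] & 0xFF) - (target[prefix.length + i] & 0xFF);
      }
      if (cmp == 0) cmp = termLen - target.length;
      if (cmp >= 0) return term; // exact match or first term past the target
      pos += suffixLen;          // current term is still before the target
    }
    return numTerms; // target is after every term in this block
  }

  public static void main(String[] args) {
    byte[] prefix = "app".getBytes();
    // Block terms: "apple", "apply", "apps" (sorted, sharing the prefix "app").
    byte[] suffixes = {2, 'l', 'e', 2, 'l', 'y', 1, 's'};
    System.out.println(scan(prefix, suffixes, 3, "apply".getBytes())); // 1
    System.out.println(scan(prefix, suffixes, 3, "appz".getBytes()));  // 3
  }
}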
@@ -71,7 +71,7 @@ public class AppendingCodec extends Codec {
}
success = false;
try {
FieldsConsumer ret = new AppendingTermsDictWriter(indexWriter, state, docsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
FieldsConsumer ret = new AppendingTermsDictWriter(indexWriter, state, docsWriter);
success = true;
return ret;
} finally {
@@ -111,7 +111,6 @@ public class AppendingCodec extends Codec {
state.dir, state.fieldInfos, state.segmentInfo.name,
docsReader,
state.readBufferSize,
BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE,
state.codecId);
success = true;
@@ -18,7 +18,6 @@ package org.apache.lucene.index.codecs.appending;
*/
import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.codecs.PostingsReaderBase;
@@ -27,7 +26,6 @@ import org.apache.lucene.index.codecs.BlockTermsWriter;
import org.apache.lucene.index.codecs.TermsIndexReaderBase;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
public class AppendingTermsDictReader extends BlockTermsReader {
@@ -35,9 +33,9 @@ public class AppendingTermsDictReader extends BlockTermsReader {
public AppendingTermsDictReader(TermsIndexReaderBase indexReader,
Directory dir, FieldInfos fieldInfos, String segment,
PostingsReaderBase postingsReader, int readBufferSize,
Comparator<BytesRef> termComp, int termsCacheSize, String codecId) throws IOException {
int termsCacheSize, String codecId) throws IOException {
super(indexReader, dir, fieldInfos, segment, postingsReader, readBufferSize,
termComp, termsCacheSize, codecId);
termsCacheSize, codecId);
}
@Override
@@ -18,23 +18,21 @@ package org.apache.lucene.index.codecs.appending;
*/
import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.codecs.PostingsWriterBase;
import org.apache.lucene.index.codecs.BlockTermsWriter;
import org.apache.lucene.index.codecs.TermsIndexWriterBase;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
public class AppendingTermsDictWriter extends BlockTermsWriter {
final static String CODEC_NAME = "APPENDING_TERMS_DICT";
public AppendingTermsDictWriter(TermsIndexWriterBase indexWriter,
SegmentWriteState state, PostingsWriterBase postingsWriter,
Comparator<BytesRef> termComp) throws IOException {
super(indexWriter, state, postingsWriter, termComp);
SegmentWriteState state, PostingsWriterBase postingsWriter)
throws IOException {
super(indexWriter, state, postingsWriter);
}
@Override
@@ -30,4 +30,9 @@ public class OrdTermState extends TermState {
assert other instanceof OrdTermState : "can not copy from " + other.getClass().getName();
this.ord = ((OrdTermState) other).ord;
}
@Override
public String toString() {
return "OrdTermState ord=" + ord;
}
}
@@ -44,4 +44,9 @@ public abstract class TermState implements Cloneable {
throw new RuntimeException(cnse);
}
}
@Override
public String toString() {
return "TermState";
}
}
@@ -51,6 +51,6 @@ public class BlockTermState extends OrdTermState {
@Override
public String toString() {
return super.toString() + "ord=" + ord + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + " termCount=" + termCount + " blockFP=" + blockFilePointer;
return "ord=" + ord + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + " termCount=" + termCount + " blockFP=" + blockFilePointer;
}
}
@@ -66,9 +66,6 @@ public class BlockTermsReader extends FieldsProducer {
private final TreeMap<String,FieldReader> fields = new TreeMap<String,FieldReader>();
// Comparator that orders our terms
private final Comparator<BytesRef> termComp;
// Caches the most recently looked-up field + terms:
private final DoubleBarrelLRUCache<FieldAndTerm,BlockTermState> termsCache;
@@ -111,13 +108,12 @@ public class BlockTermsReader extends FieldsProducer {
//private String segment;
public BlockTermsReader(TermsIndexReaderBase indexReader, Directory dir, FieldInfos fieldInfos, String segment, PostingsReaderBase postingsReader, int readBufferSize,
Comparator<BytesRef> termComp, int termsCacheSize, String codecId)
int termsCacheSize, String codecId)
throws IOException {
this.postingsReader = postingsReader;
termsCache = new DoubleBarrelLRUCache<FieldAndTerm,BlockTermState>(termsCacheSize);
this.termComp = termComp;
//this.segment = segment;
in = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, BlockTermsWriter.TERMS_EXTENSION),
readBufferSize);
@@ -260,7 +256,7 @@ public class BlockTermsReader extends FieldsProducer {
@Override
public Comparator<BytesRef> getComparator() {
return termComp;
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
@@ -342,9 +338,15 @@ public class BlockTermsReader extends FieldsProducer {
@Override
public Comparator<BytesRef> getComparator() {
return termComp;
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
// TODO: we may want an alternate mode here which is
// "if you are about to return NOT_FOUND I won't use
// the terms data from that"; eg FuzzyTermsEnum will
// (usually) just immediately call seek again if we
// return NOT_FOUND so it's a waste for us to fill in
// the term that was actually NOT_FOUND
@Override
public SeekStatus seek(final BytesRef target, final boolean useCache) throws IOException {
@@ -352,13 +354,13 @@ public class BlockTermsReader extends FieldsProducer {
throw new IllegalStateException("terms index was not loaded");
}
//System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " useCache=" + useCache + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this);
/*
System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " useCache=" + useCache + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this);
if (didIndexNext) {
if (nextIndexTerm == null) {
//System.out.println(" nextIndexTerm=null");
System.out.println(" nextIndexTerm=null");
} else {
//System.out.println(" nextIndexTerm=" + nextIndexTerm.utf8ToString());
System.out.println(" nextIndexTerm=" + nextIndexTerm.utf8ToString());
}
}
*/
@@ -386,7 +388,7 @@ public class BlockTermsReader extends FieldsProducer {
// is after current term but before next index term:
if (indexIsCurrent) {
final int cmp = termComp.compare(term, target);
final int cmp = BytesRef.getUTF8SortedAsUnicodeComparator().compare(term, target);
if (cmp == 0) {
// Already at the requested term
@@ -404,7 +406,7 @@ public class BlockTermsReader extends FieldsProducer {
didIndexNext = true;
}
if (nextIndexTerm == null || termComp.compare(target, nextIndexTerm) < 0) {
if (nextIndexTerm == null || BytesRef.getUTF8SortedAsUnicodeComparator().compare(target, nextIndexTerm) < 0) {
// Optimization: requested term is within the
// same term block we are now in; skip seeking
// (but do scanning):
@@ -434,49 +436,176 @@ public class BlockTermsReader extends FieldsProducer {
state.ord = indexEnum.ord()-1;
}
// NOTE: the first _next() after an index seek is
// a bit wasteful, since it redundantly reads some
// suffix bytes into the buffer. We could avoid storing
// those bytes in the primary file, but then when
// next()ing over an index term we'd have to
// special case it:
term.copy(indexEnum.term());
//System.out.println(" seek: term=" + term.utf8ToString());
} else {
////System.out.println(" skip seek");
//System.out.println(" skip seek");
if (state.termCount == state.blockTermCount && !nextBlock()) {
indexIsCurrent = false;
return SeekStatus.END;
}
}
seekPending = false;
// Now scan:
while (_next() != null) {
final int cmp = termComp.compare(term, target);
if (cmp == 0) {
// Match!
int common = 0;
// Scan within block. We could do this by calling
// _next() and testing the resulting term, but this
// is wasteful. Instead, we first confirm the
// target matches the common prefix of this block,
// and then we scan the term bytes directly from the
// termSuffixesreader's byte[], saving a copy into
// the BytesRef term per term. Only when we return
// do we then copy the bytes into the term.
while(true) {
// First, see if target term matches common prefix
// in this block:
if (common < termBlockPrefix) {
final int cmp = (term.bytes[common]&0xFF) - (target.bytes[target.offset + common]&0xFF);
if (cmp < 0) {
// TODO: maybe we should store common prefix
// in block header? (instead of relying on
// last term of previous block)
// Target's prefix is after the common block
// prefix, so term cannot be in this block
// but it could be in next block. We
// must scan to end-of-block to set common
// prefix for next block:
if (state.termCount < state.blockTermCount) {
while(state.termCount < state.blockTermCount-1) {
state.termCount++;
state.ord++;
termSuffixesReader.skipBytes(termSuffixesReader.readVInt());
}
final int suffix = termSuffixesReader.readVInt();
term.length = termBlockPrefix + suffix;
if (term.bytes.length < term.length) {
term.grow(term.length);
}
termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix);
}
state.ord++;
if (!nextBlock()) {
indexIsCurrent = false;
return SeekStatus.END;
}
common = 0;
} else if (cmp > 0) {
// Target's prefix is before the common prefix
// of this block, so we position to start of
// block and return NOT_FOUND:
assert state.termCount == 0;
final int suffix = termSuffixesReader.readVInt();
term.length = termBlockPrefix + suffix;
if (term.bytes.length < term.length) {
term.grow(term.length);
}
termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix);
return SeekStatus.NOT_FOUND;
} else {
common++;
}
continue;
}
// Test every term in this block
while (true) {
state.termCount++;
state.ord++;
final int suffix = termSuffixesReader.readVInt();
// We know the prefix matches, so just compare the new suffix:
final int termLen = termBlockPrefix + suffix;
int bytePos = termSuffixesReader.getPosition();
boolean next = false;
final int limit = target.offset + (termLen < target.length ? termLen : target.length);
int targetPos = target.offset + termBlockPrefix;
while(targetPos < limit) {
final int cmp = (termSuffixes[bytePos++]&0xFF) - (target.bytes[targetPos++]&0xFF);
if (cmp < 0) {
// Current term is still before the target;
// keep scanning
next = true;
break;
} else if (cmp > 0) {
// Done! Current term is after target. Stop
// here, fill in real term, return NOT_FOUND.
term.length = termBlockPrefix + suffix;
if (term.bytes.length < term.length) {
term.grow(term.length);
}
termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix);
//System.out.println(" NOT_FOUND");
return SeekStatus.NOT_FOUND;
}
}
if (!next && target.length <= termLen) {
term.length = termBlockPrefix + suffix;
if (term.bytes.length < term.length) {
term.grow(term.length);
}
termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix);
if (target.length == termLen) {
// Done! Exact match. Stop here, fill in
// real term, return FOUND.
//System.out.println(" FOUND");
if (useCache) {
// Store in cache
decodeMetaData();
//System.out.println(" cache! state=" + state);
termsCache.put(new FieldAndTerm(fieldTerm), (BlockTermState) state.clone());
}
//System.out.println(" FOUND");
return SeekStatus.FOUND;
} else if (cmp > 0) {
//System.out.println(" NOT_FOUND term=" + term.utf8ToString());
} else {
//System.out.println(" NOT_FOUND");
return SeekStatus.NOT_FOUND;
}
}
if (state.termCount == state.blockTermCount) {
// Must pre-fill term for next block's common prefix
term.length = termBlockPrefix + suffix;
if (term.bytes.length < term.length) {
term.grow(term.length);
}
termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix);
break;
} else {
termSuffixesReader.skipBytes(suffix);
}
}
// The purpose of the terms dict index is to seek
// the enum to the closest index term before the
// term we are looking for. So, we should never
// cross another index term (besides the first
// one) while we are scanning:
assert indexIsCurrent;
}
indexIsCurrent = false;
assert indexIsCurrent;
if (!nextBlock()) {
//System.out.println(" END");
indexIsCurrent = false;
return SeekStatus.END;
}
common = 0;
}
}
@Override
public BytesRef next() throws IOException {
@@ -515,13 +644,11 @@ public class BlockTermsReader extends FieldsProducer {
decode all metadata up to the current term. */
private BytesRef _next() throws IOException {
//System.out.println("BTR._next seg=" + segment + " this=" + this + " termCount=" + state.termCount + " (vs " + state.blockTermCount + ")");
if (state.termCount == state.blockTermCount) {
if (!nextBlock()) {
if (state.termCount == state.blockTermCount && !nextBlock()) {
//System.out.println(" eof");
indexIsCurrent = false;
return null;
}
}
// TODO: cutover to something better for these ints! simple64?
final int suffix = termSuffixesReader.readVInt();
@@ -689,7 +816,7 @@ public class BlockTermsReader extends FieldsProducer {
}
//System.out.println(" termSuffixes len=" + len);
in.readBytes(termSuffixes, 0, len);
termSuffixesReader.reset(termSuffixes);
termSuffixesReader.reset(termSuffixes, 0, len);
// docFreq, totalTermFreq
len = in.readVInt();
@@ -698,7 +825,7 @@ public class BlockTermsReader extends FieldsProducer {
}
//System.out.println(" freq bytes len=" + len);
in.readBytes(docFreqBytes, 0, len);
freqReader.reset(docFreqBytes);
freqReader.reset(docFreqBytes, 0, len);
metaDataUpto = 0;
state.termCount = 0;
@@ -717,23 +844,32 @@ public class BlockTermsReader extends FieldsProducer {
if (!seekPending) {
// lazily catch up on metadata decode:
final int limit = state.termCount;
// We must set/incr state.termCount because
// postings impl can look at this
state.termCount = metaDataUpto;
// TODO: better API would be "jump straight to term=N"???
while (metaDataUpto < limit) {
//System.out.println(" decode");
//System.out.println(" decode mdUpto=" + metaDataUpto);
// TODO: we could make "tiers" of metadata, ie,
// decode docFreq/totalTF but don't decode postings
// metadata; this way caller could get
// docFreq/totalTF w/o paying decode cost for
// postings
// TODO: if docFreq were bulk decoded we could
// just skipN here:
state.docFreq = freqReader.readVInt();
//System.out.println(" dF=" + state.docFreq);
if (!fieldInfo.omitTermFreqAndPositions) {
state.totalTermFreq = state.docFreq + freqReader.readVLong();
//System.out.println(" totTF=" + state.totalTermFreq);
}
postingsReader.nextTerm(fieldInfo, state);
metaDataUpto++;
state.termCount++;
}
} else {
//} else {
//System.out.println(" skip! seekPending");
}
}
@@ -63,24 +63,23 @@ public class BlockTermsWriter extends FieldsConsumer {
FieldInfo currentField;
private final TermsIndexWriterBase termsIndexWriter;
private final List<TermsWriter> fields = new ArrayList<TermsWriter>();
private final Comparator<BytesRef> termComp;
private final String segment;
//private final String segment;
public BlockTermsWriter(
TermsIndexWriterBase termsIndexWriter,
SegmentWriteState state,
PostingsWriterBase postingsWriter,
Comparator<BytesRef> termComp) throws IOException
PostingsWriterBase postingsWriter)
throws IOException
{
final String termsFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, TERMS_EXTENSION);
this.termsIndexWriter = termsIndexWriter;
this.termComp = termComp;
out = state.directory.createOutput(termsFileName);
fieldInfos = state.fieldInfos;
writeHeader(out);
currentField = null;
this.postingsWriter = postingsWriter;
segment = state.segmentName;
//segment = state.segmentName;
//System.out.println("BTW.init seg=" + state.segmentName);
@@ -161,7 +160,6 @@ public class BlockTermsWriter extends FieldsConsumer {
private long numTerms;
private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
long sumTotalTermFreq;
private final BytesRef lastTerm = new BytesRef();
private TermEntry[] pendingTerms;
@@ -185,12 +183,12 @@ public class BlockTermsWriter extends FieldsConsumer {
@Override
public Comparator<BytesRef> getComparator() {
return termComp;
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public PostingsConsumer startTerm(BytesRef text) throws IOException {
//System.out.println("BTW.startTerm seg=" + segment + " term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text);
//System.out.println("BTW.startTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment);
postingsWriter.startTerm();
return postingsWriter;
}
@@ -201,7 +199,7 @@ public class BlockTermsWriter extends FieldsConsumer {
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
assert stats.docFreq > 0;
//System.out.println("BTW.finishTerm seg=" + segment + " term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " df=" + stats.docFreq);
//System.out.println("BTW.finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment + " df=" + stats.docFreq);
final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, stats);
@@ -213,6 +211,7 @@ public class BlockTermsWriter extends FieldsConsumer {
flushBlock();
}
fieldIndexWriter.add(text, stats, out.getFilePointer());
//System.out.println(" index term!");
}
if (pendingTerms.length == pendingCount) {
@@ -265,7 +264,7 @@ public class BlockTermsWriter extends FieldsConsumer {
private final RAMOutputStream bytesWriter = new RAMOutputStream();
private void flushBlock() throws IOException {
//System.out.println("BTW.flushBlock pendingCount=" + pendingCount);
//System.out.println("BTW.flushBlock seg=" + segment + " pendingCount=" + pendingCount + " fp=" + out.getFilePointer());
// First pass: compute common prefix for all terms
// in the block, against term before first term in
@@ -89,7 +89,7 @@ public class PulsingCodec extends Codec {
// Terms dict
success = false;
try {
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, pulsingWriter);
success = true;
return ret;
} finally {
@@ -136,7 +136,6 @@ public class PulsingCodec extends Codec {
state.dir, state.fieldInfos, state.segmentInfo.name,
pulsingReader,
state.readBufferSize,
BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE,
state.codecId);
success = true;
@@ -144,7 +144,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
//System.out.println(" count=" + count + " threshold=" + maxPositions);
if (count <= maxPositions) {
//System.out.println(" inlined");
//System.out.println(" inlined pos=" + termState.inlinedBytesReader.getPosition());
// Inlined into terms dict -- just read the byte[] blob in,
// but don't decode it now (we only decode when a DocsEnum
@@ -23,7 +23,6 @@ import java.util.Set;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.FieldsProducer;
@@ -66,7 +65,7 @@ public class StandardCodec extends Codec {
success = false;
try {
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUnicodeComparator());
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, docs);
success = true;
return ret;
} finally {
@@ -109,7 +108,6 @@ public class StandardCodec extends Codec {
state.segmentInfo.name,
postings,
state.readBufferSize,
BytesRef.getUTF8SortedAsUnicodeComparator(),
TERMS_CACHE_SIZE,
state.codecId);
success = true;
@@ -126,7 +126,7 @@ public class MockFixedIntBlockCodec extends Codec {
success = false;
try {
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter);
success = true;
return ret;
} finally {
@@ -170,7 +170,6 @@ public class MockFixedIntBlockCodec extends Codec {
state.segmentInfo.name,
postingsReader,
state.readBufferSize,
BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE,
state.codecId);
success = true;
@@ -150,7 +150,7 @@ public class MockVariableIntBlockCodec extends Codec {
success = false;
try {
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter);
success = true;
return ret;
} finally {
@@ -195,7 +195,6 @@ public class MockVariableIntBlockCodec extends Codec {
state.segmentInfo.name,
postingsReader,
state.readBufferSize,
BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE,
state.codecId);
success = true;
@@ -205,7 +205,7 @@ public class MockRandomCodec extends Codec {
success = false;
try {
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter);
success = true;
return ret;
} finally {
@@ -306,7 +306,6 @@ public class MockRandomCodec extends Codec {
state.segmentInfo.name,
postingsReader,
state.readBufferSize,
BytesRef.getUTF8SortedAsUnicodeComparator(),
termsCacheSize,
state.codecId);
success = true;
@@ -70,7 +70,7 @@ public class MockSepCodec extends Codec {
success = false;
try {
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter);
success = true;
return ret;
} finally {
@@ -114,7 +114,6 @@ public class MockSepCodec extends Codec {
state.segmentInfo.name,
postingsReader,
state.readBufferSize,
BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE,
state.codecId);
success = true;
@@ -496,139 +496,13 @@ public class TestExternalCodecs extends LuceneTestCase {
}
}
public static class MyCodecs extends CodecProvider {
MyCodecs() {
Codec ram = new RAMOnlyCodec();
register(ram);
setDefaultFieldCodec(ram.name);
}
}
// copied from PulsingCodec, just changing the terms
// comparator
private static class PulsingReverseTermsCodec extends Codec {
public PulsingReverseTermsCodec() {
name = "PulsingReverseTerms";
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase docsWriter = new StandardPostingsWriter(state);
// Terms that have <= freqCutoff number of docs are
// "pulsed" (inlined):
final int freqCutoff = 1;
PostingsWriterBase pulsingWriter = new PulsingPostingsWriterImpl(freqCutoff, docsWriter);
// Terms dict index
TermsIndexWriterBase indexWriter;
boolean success = false;
try {
indexWriter = new FixedGapTermsIndexWriter(state) {
// We sort in reverse unicode order, so, we must
// disable the suffix-stripping opto that
// FixedGapTermsIndexWriter does by default!
@Override
protected int indexedTermPrefixLength(BytesRef priorTerm, BytesRef indexedTerm) {
return indexedTerm.length;
}
};
success = true;
} finally {
if (!success) {
pulsingWriter.close();
}
}
// Terms dict
success = false;
try {
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, pulsingWriter, reverseUnicodeComparator);
success = true;
return ret;
} finally {
if (!success) {
try {
pulsingWriter.close();
} finally {
indexWriter.close();
}
}
}
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase docsReader = new StandardPostingsReader(state.dir, state.segmentInfo, state.readBufferSize, state.codecId);
PostingsReaderBase pulsingReader = new PulsingPostingsReaderImpl(docsReader);
// Terms dict index reader
TermsIndexReaderBase indexReader;
boolean success = false;
try {
indexReader = new FixedGapTermsIndexReader(state.dir,
state.fieldInfos,
state.segmentInfo.name,
state.termsIndexDivisor,
reverseUnicodeComparator,
state.codecId);
success = true;
} finally {
if (!success) {
pulsingReader.close();
}
}
// Terms dict reader
success = false;
try {
FieldsProducer ret = new BlockTermsReader(indexReader,
state.dir,
state.fieldInfos,
state.segmentInfo.name,
pulsingReader,
state.readBufferSize,
reverseUnicodeComparator,
StandardCodec.TERMS_CACHE_SIZE,
state.codecId);
success = true;
return ret;
} finally {
if (!success) {
try {
pulsingReader.close();
} finally {
indexReader.close();
}
}
}
}
@Override
public void files(Directory dir, SegmentInfo segmentInfo, String codecId, Set<String> files) throws IOException {
StandardPostingsReader.files(dir, segmentInfo, codecId, files);
BlockTermsReader.files(dir, segmentInfo, codecId, files);
FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files);
}
@Override
public void getExtensions(Set<String> extensions) {
StandardCodec.getStandardExtensions(extensions);
}
}
// tests storing "id" and "field2" fields as pulsing codec,
// whose term sort is backwards unicode code point, and
// storing "field1" as a custom entirely-in-RAM codec
public void testPerFieldCodec() throws Exception {
CodecProvider provider = new MyCodecs();
Codec pulsing = new PulsingReverseTermsCodec();
provider.register(pulsing);
CodecProvider provider = new CoreCodecProvider();
provider.register(new RAMOnlyCodec());
provider.setDefaultFieldCodec("RamOnly");
final int NUM_DOCS = 173;
MockDirectoryWrapper dir = newDirectory();
@@ -645,11 +519,11 @@ public class TestExternalCodecs extends LuceneTestCase {
doc.add(newField("field1", "this field uses the standard codec as the test", Field.Store.NO, Field.Index.ANALYZED));
// uses pulsing codec:
Field field2 = newField("field2", "this field uses the pulsing codec as the test", Field.Store.NO, Field.Index.ANALYZED);
provider.setFieldCodec(field2.name(), pulsing.name);
provider.setFieldCodec(field2.name(), "Pulsing");
doc.add(field2);
Field idField = newField("id", "", Field.Store.NO, Field.Index.NOT_ANALYZED);
provider.setFieldCodec(idField.name(), pulsing.name);
provider.setFieldCodec(idField.name(), "Pulsing");
doc.add(idField);
for(int i=0;i<NUM_DOCS;i++) {
@@ -659,16 +533,13 @@ public class TestExternalCodecs extends LuceneTestCase {
w.commit();
}
}
if (VERBOSE) {
System.out.println("TEST: now delete id=77");
}
w.deleteDocuments(new Term("id", "77"));
IndexReader r = IndexReader.open(w, true);
IndexReader[] subs = r.getSequentialSubReaders();
// test each segment
for(int i=0;i<subs.length;i++) {
testTermsOrder(subs[i]);
}
// test each multi-reader
testTermsOrder(r);
assertEquals(NUM_DOCS-1, r.numDocs());
IndexSearcher s = newSearcher(r);
@@ -689,7 +560,6 @@ public class TestExternalCodecs extends LuceneTestCase {
assertEquals(0, s.search(new TermQuery(new Term("id", "77")), 1).totalHits);
assertEquals(0, s.search(new TermQuery(new Term("id", "44")), 1).totalHits);
testTermsOrder(r);
r.close();
s.close();
@@ -697,25 +567,4 @@ public class TestExternalCodecs extends LuceneTestCase {
dir.close();
}
private void testTermsOrder(IndexReader r) throws Exception {
// Verify sort order matches what my comparator said:
BytesRef lastBytesRef = null;
TermsEnum terms = MultiFields.getFields(r).terms("id").iterator();
//System.out.println("id terms:");
while(true) {
BytesRef t = terms.next();
if (t == null) {
break;
}
//System.out.println(" " + t);
if (lastBytesRef == null) {
lastBytesRef = new BytesRef(t);
} else {
assertTrue("terms in wrong order last=" + lastBytesRef.utf8ToString() + " current=" + t.utf8ToString(), reverseUnicodeComparator.compare(lastBytesRef, t) < 0);
lastBytesRef.copy(t);
}
}
}
}
@@ -2784,6 +2784,9 @@ public class TestIndexWriter extends LuceneTestCase {
final String id = ""+i;
idField.setValue(id);
docs.put(id, doc);
if (VERBOSE) {
System.out.println("TEST: add doc id=" + id);
}
for(int field: fieldIDs) {
final String s;
@@ -2802,7 +2805,7 @@ public class TestIndexWriter extends LuceneTestCase {
if (rand.nextInt(5) == 3 && i > 0) {
final String delID = ""+rand.nextInt(i);
if (VERBOSE) {
System.out.println("TEST: delete doc " + delID);
System.out.println("TEST: delete doc id=" + delID);
}
w.deleteDocuments(new Term("id", delID));
docs.remove(delID);
@@ -2825,6 +2828,9 @@ public class TestIndexWriter extends LuceneTestCase {
for(int iter=0;iter<1000*RANDOM_MULTIPLIER;iter++) {
String testID = idsList[rand.nextInt(idsList.length)];
if (VERBOSE) {
System.out.println("TEST: test id=" + testID);
}
TopDocs hits = s.search(new TermQuery(new Term("id", testID)), 1);
assertEquals(1, hits.totalHits);
Document doc = r.document(hits.scoreDocs[0].doc);
@@ -124,7 +124,11 @@ public class TestIndexWriterReader extends LuceneTestCase {
} else {
iwc.setMergePolicy(NoMergePolicy.COMPOUND_FILES);
}
if (VERBOSE) {
System.out.println("TEST: make index");
}
IndexWriter writer = new IndexWriter(dir1, iwc);
writer.setInfoStream(VERBOSE ? System.out : null);
// create the index
createIndexNoClose(!optimize, "index1", writer);
@@ -146,6 +150,9 @@ public class TestIndexWriterReader extends LuceneTestCase {
IndexReader r2 = writer.getReader();
assertTrue(r2.isCurrent());
assertEquals(0, count(new Term("id", id10), r2));
if (VERBOSE) {
System.out.println("TEST: verify id");
}
assertEquals(1, count(new Term("id", Integer.toString(8000)), r2));
r1.close();
@@ -79,6 +79,9 @@ public class TestAutomatonQuery extends LuceneTestCase {
}
private int automatonQueryNrHits(AutomatonQuery query) throws IOException {
if (VERBOSE) {
System.out.println("TEST: run aq=" + query);
}
return searcher.search(query, 5).totalHits;
}