LUCENE-2922: optimize the scan-within-block step of BlockTermsReader.seek

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1071564 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2011-02-17 10:31:59 +00:00
parent 0d57f3b786
commit 3f8c9b5cfc
19 changed files with 246 additions and 248 deletions
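
For readers skimming the per-file diffs below: the core change replaces the old in-block scan, which called _next() for every term and re-compared the full term bytes against the target, with a scan that first checks the target against the block's shared prefix and then walks only the raw suffix bytes, copying a term into the returned BytesRef just once, on the way out. Below is a minimal, self-contained sketch of that idea; it is illustrative only, and the names BlockScanSketch, seekInBlock, blockPrefix, and suffixes are made up for the example rather than taken from the committed code.

// Toy model of the scan-within-block idea (not the committed code): terms in a
// block share a common prefix and are stored as sorted per-term suffix bytes.
class BlockScanSketch {

  /** Returns the index of the first term >= target within the block,
   *  or -1 if every term in the block sorts before the target. */
  static int seekInBlock(byte[] blockPrefix, byte[][] suffixes, byte[] target) {
    // Step 1: compare the target against the block's shared prefix once,
    // instead of re-comparing those leading bytes for every term.
    final int common = Math.min(blockPrefix.length, target.length);
    for (int i = 0; i < common; i++) {
      final int cmp = (blockPrefix[i] & 0xFF) - (target[i] & 0xFF);
      if (cmp > 0) {
        return 0;    // the whole block sorts after the target
      } else if (cmp < 0) {
        return -1;   // the whole block sorts before the target; caller moves on
      }
    }
    if (target.length <= blockPrefix.length) {
      return 0;      // target is a prefix of (or equal to) the shared prefix
    }

    // Step 2: the prefix matches, so scan only the per-term suffix bytes.
    for (int i = 0; i < suffixes.length; i++) {
      if (compareSuffix(suffixes[i], target, blockPrefix.length) >= 0) {
        return i;    // first term >= target; caller decides FOUND vs NOT_FOUND
      }
    }
    return -1;       // target is after the last term in this block
  }

  // Unsigned byte comparison of a stored suffix against target[prefixLen..].
  private static int compareSuffix(byte[] suffix, byte[] target, int prefixLen) {
    final int targetSuffixLen = target.length - prefixLen;
    final int limit = Math.min(suffix.length, targetSuffixLen);
    for (int i = 0; i < limit; i++) {
      final int cmp = (suffix[i] & 0xFF) - (target[prefixLen + i] & 0xFF);
      if (cmp != 0) {
        return cmp;
      }
    }
    return suffix.length - targetSuffixLen;
  }
}

The committed BlockTermsReader.seek additionally advances to the next block when the current one is exhausted, pre-fills the term with the block's last entry so the next block's shared prefix can be computed, and only then reports FOUND, NOT_FOUND, or END.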

AppendingCodec.java

@@ -71,7 +71,7 @@ public class AppendingCodec extends Codec {
     }
     success = false;
     try {
-      FieldsConsumer ret = new AppendingTermsDictWriter(indexWriter, state, docsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
+      FieldsConsumer ret = new AppendingTermsDictWriter(indexWriter, state, docsWriter);
       success = true;
       return ret;
     } finally {
@@ -111,7 +111,6 @@ public class AppendingCodec extends Codec {
                                                 state.dir, state.fieldInfos, state.segmentInfo.name,
                                                 docsReader,
                                                 state.readBufferSize,
-                                                BytesRef.getUTF8SortedAsUnicodeComparator(),
                                                 StandardCodec.TERMS_CACHE_SIZE,
                                                 state.codecId);
       success = true;

AppendingTermsDictReader.java

@@ -18,7 +18,6 @@ package org.apache.lucene.index.codecs.appending;
  */
 
 import java.io.IOException;
-import java.util.Comparator;
 
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.codecs.PostingsReaderBase;
@@ -27,7 +26,6 @@ import org.apache.lucene.index.codecs.BlockTermsWriter;
 import org.apache.lucene.index.codecs.TermsIndexReaderBase;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CodecUtil;
 
 public class AppendingTermsDictReader extends BlockTermsReader {
@@ -35,9 +33,9 @@ public class AppendingTermsDictReader extends BlockTermsReader {
   public AppendingTermsDictReader(TermsIndexReaderBase indexReader,
           Directory dir, FieldInfos fieldInfos, String segment,
           PostingsReaderBase postingsReader, int readBufferSize,
-          Comparator<BytesRef> termComp, int termsCacheSize, String codecId) throws IOException {
+          int termsCacheSize, String codecId) throws IOException {
     super(indexReader, dir, fieldInfos, segment, postingsReader, readBufferSize,
-          termComp, termsCacheSize, codecId);
+          termsCacheSize, codecId);
   }
 
   @Override

AppendingTermsDictWriter.java

@@ -18,23 +18,21 @@ package org.apache.lucene.index.codecs.appending;
  */
 
 import java.io.IOException;
-import java.util.Comparator;
 
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.index.codecs.PostingsWriterBase;
 import org.apache.lucene.index.codecs.BlockTermsWriter;
 import org.apache.lucene.index.codecs.TermsIndexWriterBase;
 import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CodecUtil;
 
 public class AppendingTermsDictWriter extends BlockTermsWriter {
   final static String CODEC_NAME = "APPENDING_TERMS_DICT";
 
   public AppendingTermsDictWriter(TermsIndexWriterBase indexWriter,
-      SegmentWriteState state, PostingsWriterBase postingsWriter,
-      Comparator<BytesRef> termComp) throws IOException {
-    super(indexWriter, state, postingsWriter, termComp);
+      SegmentWriteState state, PostingsWriterBase postingsWriter)
+      throws IOException {
+    super(indexWriter, state, postingsWriter);
   }
 
   @Override

OrdTermState.java

@@ -30,4 +30,9 @@ public class OrdTermState extends TermState {
     assert other instanceof OrdTermState : "can not copy from " + other.getClass().getName();
     this.ord = ((OrdTermState) other).ord;
   }
+
+  @Override
+  public String toString() {
+    return "OrdTermState ord=" + ord;
+  }
 }

TermState.java

@@ -44,4 +44,9 @@ public abstract class TermState implements Cloneable {
       throw new RuntimeException(cnse);
     }
   }
+
+  @Override
+  public String toString() {
+    return "TermState";
+  }
 }

BlockTermState.java

@@ -51,6 +51,6 @@ public class BlockTermState extends OrdTermState {
 
   @Override
   public String toString() {
-    return super.toString() + "ord=" + ord + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + " termCount=" + termCount + " blockFP=" + blockFilePointer;
+    return "ord=" + ord + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + " termCount=" + termCount + " blockFP=" + blockFilePointer;
   }
 }

BlockTermsReader.java

@@ -66,9 +66,6 @@ public class BlockTermsReader extends FieldsProducer {
   private final TreeMap<String,FieldReader> fields = new TreeMap<String,FieldReader>();
 
-  // Comparator that orders our terms
-  private final Comparator<BytesRef> termComp;
-
   // Caches the most recently looked-up field + terms:
   private final DoubleBarrelLRUCache<FieldAndTerm,BlockTermState> termsCache;
 
@@ -111,13 +108,12 @@ public class BlockTermsReader extends FieldsProducer {
   //private String segment;
 
   public BlockTermsReader(TermsIndexReaderBase indexReader, Directory dir, FieldInfos fieldInfos, String segment, PostingsReaderBase postingsReader, int readBufferSize,
-                          Comparator<BytesRef> termComp, int termsCacheSize, String codecId)
+                          int termsCacheSize, String codecId)
     throws IOException {
 
     this.postingsReader = postingsReader;
     termsCache = new DoubleBarrelLRUCache<FieldAndTerm,BlockTermState>(termsCacheSize);
-    this.termComp = termComp;
 
     //this.segment = segment;
     in = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, BlockTermsWriter.TERMS_EXTENSION),
                        readBufferSize);
@@ -260,7 +256,7 @@ public class BlockTermsReader extends FieldsProducer {
 
     @Override
     public Comparator<BytesRef> getComparator() {
-      return termComp;
+      return BytesRef.getUTF8SortedAsUnicodeComparator();
     }
 
     @Override
@@ -342,23 +338,29 @@ public class BlockTermsReader extends FieldsProducer {
       @Override
       public Comparator<BytesRef> getComparator() {
-        return termComp;
+        return BytesRef.getUTF8SortedAsUnicodeComparator();
       }
 
+      // TODO: we may want an alternate mode here which is
+      // "if you are about to return NOT_FOUND I won't use
+      // the terms data from that"; eg FuzzyTermsEnum will
+      // (usually) just immediately call seek again if we
+      // return NOT_FOUND so it's a waste for us to fill in
+      // the term that was actually NOT_FOUND
      @Override
      public SeekStatus seek(final BytesRef target, final boolean useCache) throws IOException {
 
        if (indexEnum == null) {
          throw new IllegalStateException("terms index was not loaded");
        }
 
-        //System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " useCache=" + useCache + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this);
        /*
+        System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " useCache=" + useCache + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this);
        if (didIndexNext) {
          if (nextIndexTerm == null) {
-            //System.out.println(" nextIndexTerm=null");
+            System.out.println(" nextIndexTerm=null");
          } else {
-            //System.out.println(" nextIndexTerm=" + nextIndexTerm.utf8ToString());
+            System.out.println(" nextIndexTerm=" + nextIndexTerm.utf8ToString());
          }
        }
        */
@@ -386,7 +388,7 @@ public class BlockTermsReader extends FieldsProducer {
        // is after current term but before next index term:
        if (indexIsCurrent) {
-          final int cmp = termComp.compare(term, target);
+          final int cmp = BytesRef.getUTF8SortedAsUnicodeComparator().compare(term, target);
          if (cmp == 0) {
            // Already at the requested term
@@ -404,7 +406,7 @@ public class BlockTermsReader extends FieldsProducer {
            didIndexNext = true;
          }
 
-          if (nextIndexTerm == null || termComp.compare(target, nextIndexTerm) < 0) {
+          if (nextIndexTerm == null || BytesRef.getUTF8SortedAsUnicodeComparator().compare(target, nextIndexTerm) < 0) {
            // Optimization: requested term is within the
            // same term block we are now in; skip seeking
            // (but do scanning):
@@ -434,48 +436,175 @@ public class BlockTermsReader extends FieldsProducer {
            state.ord = indexEnum.ord()-1;
          }
 
+          // NOTE: the first _next() after an index seek is
+          // a bit wasteful, since it redundantly reads some
+          // suffix bytes into the buffer. We could avoid storing
+          // those bytes in the primary file, but then when
+          // next()ing over an index term we'd have to
+          // special case it:
          term.copy(indexEnum.term());
          //System.out.println(" seek: term=" + term.utf8ToString());
        } else {
-          ////System.out.println(" skip seek");
+          //System.out.println(" skip seek");
+          if (state.termCount == state.blockTermCount && !nextBlock()) {
+            indexIsCurrent = false;
+            return SeekStatus.END;
+          }
        }
 
        seekPending = false;
 
-        // Now scan:
-        while (_next() != null) {
-          final int cmp = termComp.compare(term, target);
-          if (cmp == 0) {
-            // Match!
-            if (useCache) {
-              // Store in cache
-              decodeMetaData();
-              termsCache.put(new FieldAndTerm(fieldTerm), (BlockTermState) state.clone());
-            }
-            //System.out.println(" FOUND");
-            return SeekStatus.FOUND;
-          } else if (cmp > 0) {
-            //System.out.println(" NOT_FOUND term=" + term.utf8ToString());
-            return SeekStatus.NOT_FOUND;
-          }
-
-          // The purpose of the terms dict index is to seek
-          // the enum to the closest index term before the
-          // term we are looking for. So, we should never
-          // cross another index term (besides the first
-          // one) while we are scanning:
-          assert indexIsCurrent;
-        }
-
-        indexIsCurrent = false;
-        //System.out.println(" END");
-        return SeekStatus.END;
+        int common = 0;
+
+        // Scan within block. We could do this by calling
+        // _next() and testing the resulting term, but this
+        // is wasteful. Instead, we first confirm the
+        // target matches the common prefix of this block,
+        // and then we scan the term bytes directly from the
+        // termSuffixesReader's byte[], saving a copy into
+        // the BytesRef term per term. Only when we return
+        // do we then copy the bytes into the term.
+        while(true) {
+
+          // First, see if target term matches common prefix
+          // in this block:
+          if (common < termBlockPrefix) {
+            final int cmp = (term.bytes[common]&0xFF) - (target.bytes[target.offset + common]&0xFF);
+            if (cmp < 0) {
+
+              // TODO: maybe we should store common prefix
+              // in block header? (instead of relying on
+              // last term of previous block)
+
+              // Target's prefix is after the common block
+              // prefix, so term cannot be in this block
+              // but it could be in next block. We
+              // must scan to end-of-block to set common
+              // prefix for next block:
+              if (state.termCount < state.blockTermCount) {
+                while(state.termCount < state.blockTermCount-1) {
+                  state.termCount++;
+                  state.ord++;
+                  termSuffixesReader.skipBytes(termSuffixesReader.readVInt());
+                }
+                final int suffix = termSuffixesReader.readVInt();
+                term.length = termBlockPrefix + suffix;
+                if (term.bytes.length < term.length) {
+                  term.grow(term.length);
+                }
+                termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix);
+              }
+              state.ord++;
+
+              if (!nextBlock()) {
+                indexIsCurrent = false;
+                return SeekStatus.END;
+              }
+              common = 0;
+            } else if (cmp > 0) {
+              // Target's prefix is before the common prefix
+              // of this block, so we position to start of
+              // block and return NOT_FOUND:
+              assert state.termCount == 0;
+
+              final int suffix = termSuffixesReader.readVInt();
+              term.length = termBlockPrefix + suffix;
+              if (term.bytes.length < term.length) {
+                term.grow(term.length);
+              }
+              termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix);
+              return SeekStatus.NOT_FOUND;
+            } else {
+              common++;
+            }
+
+            continue;
+          }
+
+          // Test every term in this block
+          while (true) {
+            state.termCount++;
+            state.ord++;
+
+            final int suffix = termSuffixesReader.readVInt();
+
+            // We know the prefix matches, so just compare the new suffix:
+            final int termLen = termBlockPrefix + suffix;
+            int bytePos = termSuffixesReader.getPosition();
+
+            boolean next = false;
+            final int limit = target.offset + (termLen < target.length ? termLen : target.length);
+            int targetPos = target.offset + termBlockPrefix;
+            while(targetPos < limit) {
+              final int cmp = (termSuffixes[bytePos++]&0xFF) - (target.bytes[targetPos++]&0xFF);
+              if (cmp < 0) {
+                // Current term is still before the target;
+                // keep scanning
+                next = true;
+                break;
+              } else if (cmp > 0) {
+                // Done! Current term is after target. Stop
+                // here, fill in real term, return NOT_FOUND.
+                term.length = termBlockPrefix + suffix;
+                if (term.bytes.length < term.length) {
+                  term.grow(term.length);
+                }
+                termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix);
+                //System.out.println(" NOT_FOUND");
+                return SeekStatus.NOT_FOUND;
+              }
+            }
+
+            if (!next && target.length <= termLen) {
+              term.length = termBlockPrefix + suffix;
+              if (term.bytes.length < term.length) {
+                term.grow(term.length);
+              }
+              termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix);
+
+              if (target.length == termLen) {
+                // Done! Exact match. Stop here, fill in
+                // real term, return FOUND.
+                //System.out.println(" FOUND");
+
+                if (useCache) {
+                  // Store in cache
+                  decodeMetaData();
+                  //System.out.println(" cache! state=" + state);
+                  termsCache.put(new FieldAndTerm(fieldTerm), (BlockTermState) state.clone());
+                }
+
+                return SeekStatus.FOUND;
+              } else {
+                //System.out.println(" NOT_FOUND");
+                return SeekStatus.NOT_FOUND;
+              }
+            }
+
+            if (state.termCount == state.blockTermCount) {
+              // Must pre-fill term for next block's common prefix
+              term.length = termBlockPrefix + suffix;
+              if (term.bytes.length < term.length) {
+                term.grow(term.length);
+              }
+              termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix);
+              break;
+            } else {
+              termSuffixesReader.skipBytes(suffix);
+            }
+          }
+
+          // The purpose of the terms dict index is to seek
+          // the enum to the closest index term before the
+          // term we are looking for. So, we should never
+          // cross another index term (besides the first
+          // one) while we are scanning:
+          assert indexIsCurrent;
+
+          if (!nextBlock()) {
+            //System.out.println(" END");
+            indexIsCurrent = false;
+            return SeekStatus.END;
+          }
+          common = 0;
+        }
      }
 
      @Override
@@ -515,12 +644,10 @@ public class BlockTermsReader extends FieldsProducer {
         decode all metadata up to the current term. */
      private BytesRef _next() throws IOException {
        //System.out.println("BTR._next seg=" + segment + " this=" + this + " termCount=" + state.termCount + " (vs " + state.blockTermCount + ")");
-        if (state.termCount == state.blockTermCount) {
-          if (!nextBlock()) {
-            //System.out.println(" eof");
-            indexIsCurrent = false;
-            return null;
-          }
+        if (state.termCount == state.blockTermCount && !nextBlock()) {
+          //System.out.println(" eof");
+          indexIsCurrent = false;
+          return null;
        }
 
        // TODO: cutover to something better for these ints! simple64?
@@ -689,7 +816,7 @@ public class BlockTermsReader extends FieldsProducer {
        }
        //System.out.println(" termSuffixes len=" + len);
        in.readBytes(termSuffixes, 0, len);
-        termSuffixesReader.reset(termSuffixes);
+        termSuffixesReader.reset(termSuffixes, 0, len);
 
        // docFreq, totalTermFreq
        len = in.readVInt();
@@ -698,7 +825,7 @@ public class BlockTermsReader extends FieldsProducer {
        }
        //System.out.println(" freq bytes len=" + len);
        in.readBytes(docFreqBytes, 0, len);
-        freqReader.reset(docFreqBytes);
+        freqReader.reset(docFreqBytes, 0, len);
 
        metaDataUpto = 0;
        state.termCount = 0;
@@ -717,23 +844,32 @@ public class BlockTermsReader extends FieldsProducer {
        if (!seekPending) {
          // lazily catch up on metadata decode:
          final int limit = state.termCount;
+          // We must set/incr state.termCount because
+          // postings impl can look at this
          state.termCount = metaDataUpto;
+          // TODO: better API would be "jump straight to term=N"???
          while (metaDataUpto < limit) {
-            //System.out.println(" decode");
+            //System.out.println(" decode mdUpto=" + metaDataUpto);
            // TODO: we could make "tiers" of metadata, ie,
            // decode docFreq/totalTF but don't decode postings
            // metadata; this way caller could get
            // docFreq/totalTF w/o paying decode cost for
            // postings
+
+            // TODO: if docFreq were bulk decoded we could
+            // just skipN here:
            state.docFreq = freqReader.readVInt();
+            //System.out.println(" dF=" + state.docFreq);
            if (!fieldInfo.omitTermFreqAndPositions) {
              state.totalTermFreq = state.docFreq + freqReader.readVLong();
+              //System.out.println(" totTF=" + state.totalTermFreq);
            }
            postingsReader.nextTerm(fieldInfo, state);
            metaDataUpto++;
            state.termCount++;
          }
-        } else {
+        //} else {
          //System.out.println(" skip! seekPending");
        }
      }

BlockTermsWriter.java

@@ -63,24 +63,23 @@ public class BlockTermsWriter extends FieldsConsumer {
   FieldInfo currentField;
   private final TermsIndexWriterBase termsIndexWriter;
   private final List<TermsWriter> fields = new ArrayList<TermsWriter>();
-  private final Comparator<BytesRef> termComp;
-  private final String segment;
+  //private final String segment;
 
   public BlockTermsWriter(
       TermsIndexWriterBase termsIndexWriter,
       SegmentWriteState state,
-      PostingsWriterBase postingsWriter,
-      Comparator<BytesRef> termComp) throws IOException
+      PostingsWriterBase postingsWriter)
+      throws IOException
   {
     final String termsFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, TERMS_EXTENSION);
     this.termsIndexWriter = termsIndexWriter;
-    this.termComp = termComp;
     out = state.directory.createOutput(termsFileName);
     fieldInfos = state.fieldInfos;
     writeHeader(out);
     currentField = null;
     this.postingsWriter = postingsWriter;
-    segment = state.segmentName;
+    //segment = state.segmentName;
 
     //System.out.println("BTW.init seg=" + state.segmentName);
@@ -161,7 +160,6 @@ public class BlockTermsWriter extends FieldsConsumer {
     private long numTerms;
     private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
     long sumTotalTermFreq;
-    private final BytesRef lastTerm = new BytesRef();
 
     private TermEntry[] pendingTerms;
@@ -185,12 +183,12 @@ public class BlockTermsWriter extends FieldsConsumer {
     @Override
     public Comparator<BytesRef> getComparator() {
-      return termComp;
+      return BytesRef.getUTF8SortedAsUnicodeComparator();
     }
 
     @Override
     public PostingsConsumer startTerm(BytesRef text) throws IOException {
-      //System.out.println("BTW.startTerm seg=" + segment + " term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text);
+      //System.out.println("BTW.startTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment);
       postingsWriter.startTerm();
       return postingsWriter;
     }
@@ -201,7 +199,7 @@ public class BlockTermsWriter extends FieldsConsumer {
     public void finishTerm(BytesRef text, TermStats stats) throws IOException {
       assert stats.docFreq > 0;
-      //System.out.println("BTW.finishTerm seg=" + segment + " term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " df=" + stats.docFreq);
+      //System.out.println("BTW.finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment + " df=" + stats.docFreq);
 
       final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, stats);
@@ -213,6 +211,7 @@ public class BlockTermsWriter extends FieldsConsumer {
           flushBlock();
         }
         fieldIndexWriter.add(text, stats, out.getFilePointer());
+        //System.out.println(" index term!");
       }
 
       if (pendingTerms.length == pendingCount) {
@@ -265,7 +264,7 @@ public class BlockTermsWriter extends FieldsConsumer {
     private final RAMOutputStream bytesWriter = new RAMOutputStream();
 
     private void flushBlock() throws IOException {
-      //System.out.println("BTW.flushBlock pendingCount=" + pendingCount);
+      //System.out.println("BTW.flushBlock seg=" + segment + " pendingCount=" + pendingCount + " fp=" + out.getFilePointer());
 
       // First pass: compute common prefix for all terms
       // in the block, against term before first term in

PulsingCodec.java

@@ -89,7 +89,7 @@ public class PulsingCodec extends Codec {
     // Terms dict
     success = false;
     try {
-      FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
+      FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, pulsingWriter);
       success = true;
       return ret;
     } finally {
@@ -136,7 +136,6 @@ public class PulsingCodec extends Codec {
                                                 state.dir, state.fieldInfos, state.segmentInfo.name,
                                                 pulsingReader,
                                                 state.readBufferSize,
-                                                BytesRef.getUTF8SortedAsUnicodeComparator(),
                                                 StandardCodec.TERMS_CACHE_SIZE,
                                                 state.codecId);
       success = true;

PulsingPostingsReaderImpl.java

@@ -144,7 +144,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
       //System.out.println(" count=" + count + " threshold=" + maxPositions);
 
       if (count <= maxPositions) {
-        //System.out.println(" inlined");
+        //System.out.println(" inlined pos=" + termState.inlinedBytesReader.getPosition());
 
         // Inlined into terms dict -- just read the byte[] blob in,
         // but don't decode it now (we only decode when a DocsEnum

StandardCodec.java

@@ -23,7 +23,6 @@ import java.util.Set;
 import org.apache.lucene.index.SegmentInfo;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.index.SegmentReadState;
-import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.index.codecs.Codec;
 import org.apache.lucene.index.codecs.FieldsConsumer;
 import org.apache.lucene.index.codecs.FieldsProducer;
@@ -66,7 +65,7 @@ public class StandardCodec extends Codec {
     success = false;
     try {
-      FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUnicodeComparator());
+      FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, docs);
       success = true;
       return ret;
     } finally {
@@ -109,7 +108,6 @@ public class StandardCodec extends Codec {
                                                 state.segmentInfo.name,
                                                 postings,
                                                 state.readBufferSize,
-                                                BytesRef.getUTF8SortedAsUnicodeComparator(),
                                                 TERMS_CACHE_SIZE,
                                                 state.codecId);
     success = true;

MockFixedIntBlockCodec.java

@@ -126,7 +126,7 @@ public class MockFixedIntBlockCodec extends Codec {
     success = false;
     try {
-      FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
+      FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter);
       success = true;
       return ret;
     } finally {
@@ -170,7 +170,6 @@ public class MockFixedIntBlockCodec extends Codec {
                                                 state.segmentInfo.name,
                                                 postingsReader,
                                                 state.readBufferSize,
-                                                BytesRef.getUTF8SortedAsUnicodeComparator(),
                                                 StandardCodec.TERMS_CACHE_SIZE,
                                                 state.codecId);
     success = true;

MockVariableIntBlockCodec.java

@@ -150,7 +150,7 @@ public class MockVariableIntBlockCodec extends Codec {
     success = false;
     try {
-      FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
+      FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter);
       success = true;
       return ret;
     } finally {
@@ -195,7 +195,6 @@ public class MockVariableIntBlockCodec extends Codec {
                                                 state.segmentInfo.name,
                                                 postingsReader,
                                                 state.readBufferSize,
-                                                BytesRef.getUTF8SortedAsUnicodeComparator(),
                                                 StandardCodec.TERMS_CACHE_SIZE,
                                                 state.codecId);
     success = true;

MockRandomCodec.java

@@ -205,7 +205,7 @@ public class MockRandomCodec extends Codec {
     success = false;
     try {
-      FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
+      FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter);
       success = true;
       return ret;
     } finally {
@@ -306,7 +306,6 @@ public class MockRandomCodec extends Codec {
                                                 state.segmentInfo.name,
                                                 postingsReader,
                                                 state.readBufferSize,
-                                                BytesRef.getUTF8SortedAsUnicodeComparator(),
                                                 termsCacheSize,
                                                 state.codecId);
     success = true;

MockSepCodec.java

@@ -70,7 +70,7 @@ public class MockSepCodec extends Codec {
     success = false;
     try {
-      FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
+      FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter);
       success = true;
       return ret;
     } finally {
@@ -114,7 +114,6 @@ public class MockSepCodec extends Codec {
                                                 state.segmentInfo.name,
                                                 postingsReader,
                                                 state.readBufferSize,
-                                                BytesRef.getUTF8SortedAsUnicodeComparator(),
                                                 StandardCodec.TERMS_CACHE_SIZE,
                                                 state.codecId);
     success = true;

TestExternalCodecs.java

@@ -496,139 +496,13 @@ public class TestExternalCodecs extends LuceneTestCase {
     }
   }
 
-  public static class MyCodecs extends CodecProvider {
-    MyCodecs() {
-      Codec ram = new RAMOnlyCodec();
-      register(ram);
-      setDefaultFieldCodec(ram.name);
-    }
-  }
-
-  // copied from PulsingCodec, just changing the terms
-  // comparator
-  private static class PulsingReverseTermsCodec extends Codec {
-
-    public PulsingReverseTermsCodec() {
-      name = "PulsingReverseTerms";
-    }
-
-    @Override
-    public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-      PostingsWriterBase docsWriter = new StandardPostingsWriter(state);
-
-      // Terms that have <= freqCutoff number of docs are
-      // "pulsed" (inlined):
-      final int freqCutoff = 1;
-      PostingsWriterBase pulsingWriter = new PulsingPostingsWriterImpl(freqCutoff, docsWriter);
-
-      // Terms dict index
-      TermsIndexWriterBase indexWriter;
-      boolean success = false;
-      try {
-        indexWriter = new FixedGapTermsIndexWriter(state) {
-          // We sort in reverse unicode order, so, we must
-          // disable the suffix-stripping opto that
-          // FixedGapTermsIndexWriter does by default!
-          @Override
-          protected int indexedTermPrefixLength(BytesRef priorTerm, BytesRef indexedTerm) {
-            return indexedTerm.length;
-          }
-        };
-        success = true;
-      } finally {
-        if (!success) {
-          pulsingWriter.close();
-        }
-      }
-
-      // Terms dict
-      success = false;
-      try {
-        FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, pulsingWriter, reverseUnicodeComparator);
-        success = true;
-        return ret;
-      } finally {
-        if (!success) {
-          try {
-            pulsingWriter.close();
-          } finally {
-            indexWriter.close();
-          }
-        }
-      }
-    }
-
-    @Override
-    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-      PostingsReaderBase docsReader = new StandardPostingsReader(state.dir, state.segmentInfo, state.readBufferSize, state.codecId);
-      PostingsReaderBase pulsingReader = new PulsingPostingsReaderImpl(docsReader);
-
-      // Terms dict index reader
-      TermsIndexReaderBase indexReader;
-
-      boolean success = false;
-      try {
-        indexReader = new FixedGapTermsIndexReader(state.dir,
-                                                   state.fieldInfos,
-                                                   state.segmentInfo.name,
-                                                   state.termsIndexDivisor,
-                                                   reverseUnicodeComparator,
-                                                   state.codecId);
-        success = true;
-      } finally {
-        if (!success) {
-          pulsingReader.close();
-        }
-      }
-
-      // Terms dict reader
-      success = false;
-      try {
-        FieldsProducer ret = new BlockTermsReader(indexReader,
-                                                  state.dir,
-                                                  state.fieldInfos,
-                                                  state.segmentInfo.name,
-                                                  pulsingReader,
-                                                  state.readBufferSize,
-                                                  reverseUnicodeComparator,
-                                                  StandardCodec.TERMS_CACHE_SIZE,
-                                                  state.codecId);
-        success = true;
-        return ret;
-      } finally {
-        if (!success) {
-          try {
-            pulsingReader.close();
-          } finally {
-            indexReader.close();
-          }
-        }
-      }
-    }
-
-    @Override
-    public void files(Directory dir, SegmentInfo segmentInfo, String codecId, Set<String> files) throws IOException {
-      StandardPostingsReader.files(dir, segmentInfo, codecId, files);
-      BlockTermsReader.files(dir, segmentInfo, codecId, files);
-      FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files);
-    }
-
-    @Override
-    public void getExtensions(Set<String> extensions) {
-      StandardCodec.getStandardExtensions(extensions);
-    }
-  }
-
   // tests storing "id" and "field2" fields as pulsing codec,
   // whose term sort is backwards unicode code point, and
   // storing "field1" as a custom entirely-in-RAM codec
   public void testPerFieldCodec() throws Exception {
-    CodecProvider provider = new MyCodecs();
-    Codec pulsing = new PulsingReverseTermsCodec();
-    provider.register(pulsing);
+    CodecProvider provider = new CoreCodecProvider();
+    provider.register(new RAMOnlyCodec());
+    provider.setDefaultFieldCodec("RamOnly");
 
     final int NUM_DOCS = 173;
     MockDirectoryWrapper dir = newDirectory();
@@ -645,11 +519,11 @@ public class TestExternalCodecs extends LuceneTestCase {
     doc.add(newField("field1", "this field uses the standard codec as the test", Field.Store.NO, Field.Index.ANALYZED));
 
     // uses pulsing codec:
     Field field2 = newField("field2", "this field uses the pulsing codec as the test", Field.Store.NO, Field.Index.ANALYZED);
-    provider.setFieldCodec(field2.name(), pulsing.name);
+    provider.setFieldCodec(field2.name(), "Pulsing");
     doc.add(field2);
 
     Field idField = newField("id", "", Field.Store.NO, Field.Index.NOT_ANALYZED);
-    provider.setFieldCodec(idField.name(), pulsing.name);
+    provider.setFieldCodec(idField.name(), "Pulsing");
     doc.add(idField);
 
     for(int i=0;i<NUM_DOCS;i++) {
@@ -659,16 +533,13 @@ public class TestExternalCodecs extends LuceneTestCase {
         w.commit();
       }
     }
+    if (VERBOSE) {
+      System.out.println("TEST: now delete id=77");
+    }
     w.deleteDocuments(new Term("id", "77"));
 
     IndexReader r = IndexReader.open(w, true);
     IndexReader[] subs = r.getSequentialSubReaders();
-    // test each segment
-    for(int i=0;i<subs.length;i++) {
-      testTermsOrder(subs[i]);
-    }
-    // test each multi-reader
-    testTermsOrder(r);
 
     assertEquals(NUM_DOCS-1, r.numDocs());
     IndexSearcher s = newSearcher(r);
@@ -689,7 +560,6 @@ public class TestExternalCodecs extends LuceneTestCase {
     assertEquals(0, s.search(new TermQuery(new Term("id", "77")), 1).totalHits);
     assertEquals(0, s.search(new TermQuery(new Term("id", "44")), 1).totalHits);
-    testTermsOrder(r);
 
     r.close();
     s.close();
@@ -697,25 +567,4 @@ public class TestExternalCodecs extends LuceneTestCase {
     dir.close();
   }
 
-  private void testTermsOrder(IndexReader r) throws Exception {
-
-    // Verify sort order matches what my comparator said:
-    BytesRef lastBytesRef = null;
-    TermsEnum terms = MultiFields.getFields(r).terms("id").iterator();
-    //System.out.println("id terms:");
-    while(true) {
-      BytesRef t = terms.next();
-      if (t == null) {
-        break;
-      }
-      //System.out.println(" " + t);
-      if (lastBytesRef == null) {
-        lastBytesRef = new BytesRef(t);
-      } else {
-        assertTrue("terms in wrong order last=" + lastBytesRef.utf8ToString() + " current=" + t.utf8ToString(), reverseUnicodeComparator.compare(lastBytesRef, t) < 0);
-        lastBytesRef.copy(t);
-      }
-    }
-  }
 }

TestIndexWriter.java

@@ -2784,6 +2784,9 @@ public class TestIndexWriter extends LuceneTestCase {
       final String id = ""+i;
       idField.setValue(id);
       docs.put(id, doc);
+      if (VERBOSE) {
+        System.out.println("TEST: add doc id=" + id);
+      }
 
       for(int field: fieldIDs) {
         final String s;
@@ -2802,7 +2805,7 @@ public class TestIndexWriter extends LuceneTestCase {
       if (rand.nextInt(5) == 3 && i > 0) {
         final String delID = ""+rand.nextInt(i);
         if (VERBOSE) {
-          System.out.println("TEST: delete doc " + delID);
+          System.out.println("TEST: delete doc id=" + delID);
         }
         w.deleteDocuments(new Term("id", delID));
         docs.remove(delID);
@@ -2825,6 +2828,9 @@ public class TestIndexWriter extends LuceneTestCase {
     for(int iter=0;iter<1000*RANDOM_MULTIPLIER;iter++) {
       String testID = idsList[rand.nextInt(idsList.length)];
+      if (VERBOSE) {
+        System.out.println("TEST: test id=" + testID);
+      }
       TopDocs hits = s.search(new TermQuery(new Term("id", testID)), 1);
       assertEquals(1, hits.totalHits);
       Document doc = r.document(hits.scoreDocs[0].doc);

TestIndexWriterReader.java

@@ -124,7 +124,11 @@ public class TestIndexWriterReader extends LuceneTestCase {
     } else {
       iwc.setMergePolicy(NoMergePolicy.COMPOUND_FILES);
     }
+    if (VERBOSE) {
+      System.out.println("TEST: make index");
+    }
     IndexWriter writer = new IndexWriter(dir1, iwc);
+    writer.setInfoStream(VERBOSE ? System.out : null);
 
     // create the index
     createIndexNoClose(!optimize, "index1", writer);
@@ -146,6 +150,9 @@ public class TestIndexWriterReader extends LuceneTestCase {
     IndexReader r2 = writer.getReader();
     assertTrue(r2.isCurrent());
     assertEquals(0, count(new Term("id", id10), r2));
+    if (VERBOSE) {
+      System.out.println("TEST: verify id");
+    }
     assertEquals(1, count(new Term("id", Integer.toString(8000)), r2));
 
     r1.close();

TestAutomatonQuery.java

@@ -79,6 +79,9 @@ public class TestAutomatonQuery extends LuceneTestCase {
   }
 
   private int automatonQueryNrHits(AutomatonQuery query) throws IOException {
+    if (VERBOSE) {
+      System.out.println("TEST: run aq=" + query);
+    }
     return searcher.search(query, 5).totalHits;
   }