LUCENE-2922: optimize the scan-within-block step of BlockTermsReader.seek

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1071564 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2011-02-17 10:31:59 +00:00
parent 0d57f3b786
commit 3f8c9b5cfc
19 changed files with 246 additions and 248 deletions

View File

@@ -71,7 +71,7 @@ public class AppendingCodec extends Codec {
}
success = false;
try {
FieldsConsumer ret = new AppendingTermsDictWriter(indexWriter, state, docsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
FieldsConsumer ret = new AppendingTermsDictWriter(indexWriter, state, docsWriter);
success = true;
return ret;
} finally {
@@ -111,7 +111,6 @@ public class AppendingCodec extends Codec {
state.dir, state.fieldInfos, state.segmentInfo.name,
docsReader,
state.readBufferSize,
BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE,
state.codecId);
success = true;
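
The change above is the pattern for every codec file below: the Comparator<BytesRef> argument disappears, because BlockTermsWriter/BlockTermsReader now assume a single fixed term order instead of a pluggable one. A minimal sketch of that order, not part of the commit (it only exercises the comparator visible in this diff, and assumes the BytesRef(String) constructor of this era):

import java.util.Comparator;
import org.apache.lucene.util.BytesRef;

public class FixedTermOrderSketch {
  public static void main(String[] args) {
    // The one comparator the block terms dict now uses unconditionally:
    Comparator<BytesRef> cmp = BytesRef.getUTF8SortedAsUnicodeComparator();
    // UTF-8 byte order agrees with Unicode code point order:
    System.out.println(cmp.compare(new BytesRef("apple"), new BytesRef("banana")) < 0); // true
  }
}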

View File

@@ -18,7 +18,6 @@ package org.apache.lucene.index.codecs.appending;
*/
import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.codecs.PostingsReaderBase;
@@ -27,7 +26,6 @@ import org.apache.lucene.index.codecs.BlockTermsWriter;
import org.apache.lucene.index.codecs.TermsIndexReaderBase;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
public class AppendingTermsDictReader extends BlockTermsReader {
@@ -35,9 +33,9 @@ public class AppendingTermsDictReader extends BlockTermsReader {
public AppendingTermsDictReader(TermsIndexReaderBase indexReader,
Directory dir, FieldInfos fieldInfos, String segment,
PostingsReaderBase postingsReader, int readBufferSize,
Comparator<BytesRef> termComp, int termsCacheSize, String codecId) throws IOException {
int termsCacheSize, String codecId) throws IOException {
super(indexReader, dir, fieldInfos, segment, postingsReader, readBufferSize,
termComp, termsCacheSize, codecId);
termsCacheSize, codecId);
}
@Override

View File

@@ -18,23 +18,21 @@ package org.apache.lucene.index.codecs.appending;
*/
import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.codecs.PostingsWriterBase;
import org.apache.lucene.index.codecs.BlockTermsWriter;
import org.apache.lucene.index.codecs.TermsIndexWriterBase;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
public class AppendingTermsDictWriter extends BlockTermsWriter {
final static String CODEC_NAME = "APPENDING_TERMS_DICT";
public AppendingTermsDictWriter(TermsIndexWriterBase indexWriter,
SegmentWriteState state, PostingsWriterBase postingsWriter,
Comparator<BytesRef> termComp) throws IOException {
super(indexWriter, state, postingsWriter, termComp);
SegmentWriteState state, PostingsWriterBase postingsWriter)
throws IOException {
super(indexWriter, state, postingsWriter);
}
@Override

View File

@@ -30,4 +30,9 @@ public class OrdTermState extends TermState {
assert other instanceof OrdTermState : "can not copy from " + other.getClass().getName();
this.ord = ((OrdTermState) other).ord;
}
@Override
public String toString() {
return "OrdTermState ord=" + ord;
}
}

View File

@@ -44,4 +44,9 @@ public abstract class TermState implements Cloneable {
throw new RuntimeException(cnse);
}
}
@Override
public String toString() {
return "TermState";
}
}

View File

@@ -51,6 +51,6 @@ public class BlockTermState extends OrdTermState {
@Override
public String toString() {
return super.toString() + "ord=" + ord + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + " termCount=" + termCount + " blockFP=" + blockFilePointer;
return "ord=" + ord + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq + " termCount=" + termCount + " blockFP=" + blockFilePointer;
}
}

View File

@@ -66,9 +66,6 @@ public class BlockTermsReader extends FieldsProducer {
private final TreeMap<String,FieldReader> fields = new TreeMap<String,FieldReader>();
// Comparator that orders our terms
private final Comparator<BytesRef> termComp;
// Caches the most recently looked-up field + terms:
private final DoubleBarrelLRUCache<FieldAndTerm,BlockTermState> termsCache;
@@ -111,13 +108,12 @@ public class BlockTermsReader extends FieldsProducer {
//private String segment;
public BlockTermsReader(TermsIndexReaderBase indexReader, Directory dir, FieldInfos fieldInfos, String segment, PostingsReaderBase postingsReader, int readBufferSize,
Comparator<BytesRef> termComp, int termsCacheSize, String codecId)
int termsCacheSize, String codecId)
throws IOException {
this.postingsReader = postingsReader;
termsCache = new DoubleBarrelLRUCache<FieldAndTerm,BlockTermState>(termsCacheSize);
this.termComp = termComp;
//this.segment = segment;
in = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, BlockTermsWriter.TERMS_EXTENSION),
readBufferSize);
@@ -260,7 +256,7 @@ public class BlockTermsReader extends FieldsProducer {
@Override
public Comparator<BytesRef> getComparator() {
return termComp;
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
@@ -342,9 +338,15 @@ public class BlockTermsReader extends FieldsProducer {
@Override
public Comparator<BytesRef> getComparator() {
return termComp;
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
// TODO: we may want an alternate mode here which is
// "if you are about to return NOT_FOUND I won't use
// the terms data from that"; eg FuzzyTermsEnum will
// (usually) just immediately call seek again if we
// return NOT_FOUND so it's a waste for us to fill in
// the term that was actually NOT_FOUND
@Override
public SeekStatus seek(final BytesRef target, final boolean useCache) throws IOException {
@@ -352,13 +354,13 @@
throw new IllegalStateException("terms index was not loaded");
}
//System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " useCache=" + useCache + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this);
/*
System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" + target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term() + " useCache=" + useCache + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending=" + seekPending + " divisor=" + indexReader.getDivisor() + " this=" + this);
if (didIndexNext) {
if (nextIndexTerm == null) {
//System.out.println(" nextIndexTerm=null");
System.out.println(" nextIndexTerm=null");
} else {
//System.out.println(" nextIndexTerm=" + nextIndexTerm.utf8ToString());
System.out.println(" nextIndexTerm=" + nextIndexTerm.utf8ToString());
}
}
*/
@@ -386,7 +388,7 @@
// is after current term but before next index term:
if (indexIsCurrent) {
final int cmp = termComp.compare(term, target);
final int cmp = BytesRef.getUTF8SortedAsUnicodeComparator().compare(term, target);
if (cmp == 0) {
// Already at the requested term
@@ -404,7 +406,7 @@
didIndexNext = true;
}
if (nextIndexTerm == null || termComp.compare(target, nextIndexTerm) < 0) {
if (nextIndexTerm == null || BytesRef.getUTF8SortedAsUnicodeComparator().compare(target, nextIndexTerm) < 0) {
// Optimization: requested term is within the
// same term block we are now in; skip seeking
// (but do scanning):
@@ -434,49 +436,176 @@
state.ord = indexEnum.ord()-1;
}
// NOTE: the first _next() after an index seek is
// a bit wasteful, since it redundantly reads some
// suffix bytes into the buffer. We could avoid storing
// those bytes in the primary file, but then when
// next()ing over an index term we'd have to
// special case it:
term.copy(indexEnum.term());
//System.out.println(" seek: term=" + term.utf8ToString());
} else {
////System.out.println(" skip seek");
//System.out.println(" skip seek");
if (state.termCount == state.blockTermCount && !nextBlock()) {
indexIsCurrent = false;
return SeekStatus.END;
}
}
seekPending = false;
// Now scan:
while (_next() != null) {
final int cmp = termComp.compare(term, target);
if (cmp == 0) {
// Match!
int common = 0;
// Scan within block. We could do this by calling
// _next() and testing the resulting term, but this
// is wasteful. Instead, we first confirm the
// target matches the common prefix of this block,
// and then we scan the term bytes directly from the
// termSuffixesReader's byte[], saving a copy into
// the BytesRef term per term. Only when we return
// do we then copy the bytes into the term.
while(true) {
// First, see if target term matches common prefix
// in this block:
if (common < termBlockPrefix) {
final int cmp = (term.bytes[common]&0xFF) - (target.bytes[target.offset + common]&0xFF);
if (cmp < 0) {
// TODO: maybe we should store common prefix
// in block header? (instead of relying on
// last term of previous block)
// Target's prefix is after the common block
// prefix, so term cannot be in this block
// but it could be in next block. We
// must scan to end-of-block to set common
// prefix for next block:
if (state.termCount < state.blockTermCount) {
while(state.termCount < state.blockTermCount-1) {
state.termCount++;
state.ord++;
termSuffixesReader.skipBytes(termSuffixesReader.readVInt());
}
final int suffix = termSuffixesReader.readVInt();
term.length = termBlockPrefix + suffix;
if (term.bytes.length < term.length) {
term.grow(term.length);
}
termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix);
}
state.ord++;
if (!nextBlock()) {
indexIsCurrent = false;
return SeekStatus.END;
}
common = 0;
} else if (cmp > 0) {
// Target's prefix is before the common prefix
// of this block, so we position to start of
// block and return NOT_FOUND:
assert state.termCount == 0;
final int suffix = termSuffixesReader.readVInt();
term.length = termBlockPrefix + suffix;
if (term.bytes.length < term.length) {
term.grow(term.length);
}
termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix);
return SeekStatus.NOT_FOUND;
} else {
common++;
}
continue;
}
// Test every term in this block
while (true) {
state.termCount++;
state.ord++;
final int suffix = termSuffixesReader.readVInt();
// We know the prefix matches, so just compare the new suffix:
final int termLen = termBlockPrefix + suffix;
int bytePos = termSuffixesReader.getPosition();
boolean next = false;
final int limit = target.offset + (termLen < target.length ? termLen : target.length);
int targetPos = target.offset + termBlockPrefix;
while(targetPos < limit) {
final int cmp = (termSuffixes[bytePos++]&0xFF) - (target.bytes[targetPos++]&0xFF);
if (cmp < 0) {
// Current term is still before the target;
// keep scanning
next = true;
break;
} else if (cmp > 0) {
// Done! Current term is after target. Stop
// here, fill in real term, return NOT_FOUND.
term.length = termBlockPrefix + suffix;
if (term.bytes.length < term.length) {
term.grow(term.length);
}
termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix);
//System.out.println(" NOT_FOUND");
return SeekStatus.NOT_FOUND;
}
}
if (!next && target.length <= termLen) {
term.length = termBlockPrefix + suffix;
if (term.bytes.length < term.length) {
term.grow(term.length);
}
termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix);
if (target.length == termLen) {
// Done! Exact match. Stop here, fill in
// real term, return FOUND.
//System.out.println(" FOUND");
if (useCache) {
// Store in cache
decodeMetaData();
//System.out.println(" cache! state=" + state);
termsCache.put(new FieldAndTerm(fieldTerm), (BlockTermState) state.clone());
}
//System.out.println(" FOUND");
return SeekStatus.FOUND;
} else if (cmp > 0) {
//System.out.println(" NOT_FOUND term=" + term.utf8ToString());
} else {
//System.out.println(" NOT_FOUND");
return SeekStatus.NOT_FOUND;
}
}
if (state.termCount == state.blockTermCount) {
// Must pre-fill term for next block's common prefix
term.length = termBlockPrefix + suffix;
if (term.bytes.length < term.length) {
term.grow(term.length);
}
termSuffixesReader.readBytes(term.bytes, termBlockPrefix, suffix);
break;
} else {
termSuffixesReader.skipBytes(suffix);
}
}
// The purpose of the terms dict index is to seek
// the enum to the closest index term before the
// term we are looking for. So, we should never
// cross another index term (besides the first
// one) while we are scanning:
assert indexIsCurrent;
}
indexIsCurrent = false;
assert indexIsCurrent;
if (!nextBlock()) {
//System.out.println(" END");
indexIsCurrent = false;
return SeekStatus.END;
}
common = 0;
}
}
@Override
public BytesRef next() throws IOException {
@@ -515,13 +644,11 @@
decode all metadata up to the current term. */
private BytesRef _next() throws IOException {
//System.out.println("BTR._next seg=" + segment + " this=" + this + " termCount=" + state.termCount + " (vs " + state.blockTermCount + ")");
if (state.termCount == state.blockTermCount) {
if (!nextBlock()) {
if (state.termCount == state.blockTermCount && !nextBlock()) {
//System.out.println(" eof");
indexIsCurrent = false;
return null;
}
}
// TODO: cutover to something better for these ints! simple64?
final int suffix = termSuffixesReader.readVInt();
@@ -689,7 +816,7 @@
}
//System.out.println(" termSuffixes len=" + len);
in.readBytes(termSuffixes, 0, len);
termSuffixesReader.reset(termSuffixes);
termSuffixesReader.reset(termSuffixes, 0, len);
// docFreq, totalTermFreq
len = in.readVInt();
@@ -698,7 +825,7 @@
}
//System.out.println(" freq bytes len=" + len);
in.readBytes(docFreqBytes, 0, len);
freqReader.reset(docFreqBytes);
freqReader.reset(docFreqBytes, 0, len);
metaDataUpto = 0;
state.termCount = 0;
@@ -717,23 +844,32 @@
if (!seekPending) {
// lazily catch up on metadata decode:
final int limit = state.termCount;
// We must set/incr state.termCount because
// postings impl can look at this
state.termCount = metaDataUpto;
// TODO: better API would be "jump straight to term=N"???
while (metaDataUpto < limit) {
//System.out.println(" decode");
//System.out.println(" decode mdUpto=" + metaDataUpto);
// TODO: we could make "tiers" of metadata, ie,
// decode docFreq/totalTF but don't decode postings
// metadata; this way caller could get
// docFreq/totalTF w/o paying decode cost for
// postings
// TODO: if docFreq were bulk decoded we could
// just skipN here:
state.docFreq = freqReader.readVInt();
//System.out.println(" dF=" + state.docFreq);
if (!fieldInfo.omitTermFreqAndPositions) {
state.totalTermFreq = state.docFreq + freqReader.readVLong();
//System.out.println(" totTF=" + state.totalTermFreq);
}
postingsReader.nextTerm(fieldInfo, state);
metaDataUpto++;
state.termCount++;
}
} else {
//} else {
//System.out.println(" skip! seekPending");
}
}
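
The large hunk above is the heart of the commit: seek no longer materializes every term via _next() plus a comparator call. It first checks the target against the block's common prefix, then compares raw suffix bytes straight out of the block's byte[], and only copies into the BytesRef term on exit. A self-contained sketch of that comparison order, with hypothetical names and a byte[][] standing in for the real VInt-delimited suffix buffer (not the committed code):

public class BlockScanSketch {
  /**
   * Scan one block for target. All terms in the block share prefix; only
   * their suffix bytes are stored. Returns the ord of an exact match, or
   * (-insertionPoint - 1), as in Arrays.binarySearch, when it is absent.
   */
  static int scan(byte[] target, byte[] prefix, byte[][] suffixes) {
    // 1) The target must extend the block's common prefix, byte for byte:
    for (int i = 0; i < prefix.length; i++) {
      final int cmp = (i < target.length ? target[i] & 0xFF : -1) - (prefix[i] & 0xFF);
      if (cmp < 0) return -1;                   // target sorts before every term here
      if (cmp > 0) return -suffixes.length - 1; // target sorts after every term here
    }
    // 2) Prefix matches: compare only suffix bytes, term by term, in place:
    for (int t = 0; t < suffixes.length; t++) {
      final byte[] suffix = suffixes[t];
      final int termLen = prefix.length + suffix.length;
      final int limit = Math.min(termLen, target.length);
      int cmp = 0;
      for (int i = prefix.length; i < limit && cmp == 0; i++) {
        cmp = (suffix[i - prefix.length] & 0xFF) - (target[i] & 0xFF);
      }
      if (cmp == 0) cmp = termLen - target.length; // equal bytes: shorter sorts first
      if (cmp == 0) return t;                      // FOUND at ord t
      if (cmp > 0) return -t - 1;                  // this term is past target: NOT_FOUND
      // cmp < 0: this term is still before the target; keep scanning
    }
    return -suffixes.length - 1;                   // target is past this block: try the next
  }

  public static void main(String[] args) {
    final byte[] prefix = "ap".getBytes();         // ASCII for simplicity
    final byte[][] suffixes = { "ple".getBytes(), "ply".getBytes(), "ricot".getBytes() };
    System.out.println(scan("apply".getBytes(), prefix, suffixes)); // 1  (FOUND)
    System.out.println(scan("apex".getBytes(), prefix, suffixes));  // -1 (NOT_FOUND)
  }
}

The committed loop additionally tracks state.termCount and state.ord, and pre-fills the term at block boundaries so the next block's common prefix can be reconstructed, but the byte-level comparison order is the same.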

View File

@@ -63,24 +63,23 @@ public class BlockTermsWriter extends FieldsConsumer {
FieldInfo currentField;
private final TermsIndexWriterBase termsIndexWriter;
private final List<TermsWriter> fields = new ArrayList<TermsWriter>();
private final Comparator<BytesRef> termComp;
private final String segment;
//private final String segment;
public BlockTermsWriter(
TermsIndexWriterBase termsIndexWriter,
SegmentWriteState state,
PostingsWriterBase postingsWriter,
Comparator<BytesRef> termComp) throws IOException
PostingsWriterBase postingsWriter)
throws IOException
{
final String termsFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, TERMS_EXTENSION);
this.termsIndexWriter = termsIndexWriter;
this.termComp = termComp;
out = state.directory.createOutput(termsFileName);
fieldInfos = state.fieldInfos;
writeHeader(out);
currentField = null;
this.postingsWriter = postingsWriter;
segment = state.segmentName;
//segment = state.segmentName;
//System.out.println("BTW.init seg=" + state.segmentName);
@@ -161,7 +160,6 @@ public class BlockTermsWriter extends FieldsConsumer {
private long numTerms;
private final TermsIndexWriterBase.FieldWriter fieldIndexWriter;
long sumTotalTermFreq;
private final BytesRef lastTerm = new BytesRef();
private TermEntry[] pendingTerms;
@@ -185,12 +183,12 @@
@Override
public Comparator<BytesRef> getComparator() {
return termComp;
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public PostingsConsumer startTerm(BytesRef text) throws IOException {
//System.out.println("BTW.startTerm seg=" + segment + " term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text);
//System.out.println("BTW.startTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment);
postingsWriter.startTerm();
return postingsWriter;
}
@@ -201,7 +199,7 @@
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
assert stats.docFreq > 0;
//System.out.println("BTW.finishTerm seg=" + segment + " term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " df=" + stats.docFreq);
//System.out.println("BTW.finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment + " df=" + stats.docFreq);
final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, stats);
@@ -213,6 +211,7 @@
flushBlock();
}
fieldIndexWriter.add(text, stats, out.getFilePointer());
//System.out.println(" index term!");
}
if (pendingTerms.length == pendingCount) {
@@ -265,7 +264,7 @@
private final RAMOutputStream bytesWriter = new RAMOutputStream();
private void flushBlock() throws IOException {
//System.out.println("BTW.flushBlock pendingCount=" + pendingCount);
//System.out.println("BTW.flushBlock seg=" + segment + " pendingCount=" + pendingCount + " fp=" + out.getFilePointer());
// First pass: compute common prefix for all terms
// in the block, against term before first term in
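
The comment above, cut off at the hunk boundary, describes flushBlock's first pass: measure how many leading bytes every pending term shares with the term just before the block, so that only suffixes need to be written and the reader's scan can compare suffix bytes alone. A minimal sketch of such a pass, with hypothetical names (not the committed code):

import java.util.Arrays;
import java.util.List;

public class CommonPrefixSketch {
  /** Byte length of the prefix shared by prev and every term in the block. */
  static int commonPrefixLength(byte[] prev, List<byte[]> blockTerms) {
    int prefix = prev.length;
    for (byte[] term : blockTerms) {
      int j = 0;
      final int limit = Math.min(prefix, term.length);
      while (j < limit && prev[j] == term[j]) {
        j++;
      }
      prefix = j; // running minimum: j can never exceed the current prefix
    }
    return prefix;
  }

  public static void main(String[] args) {
    final byte[] prev = "apart".getBytes(); // last term of the previous block
    final List<byte[]> block = Arrays.asList("apple".getBytes(), "apply".getBytes());
    System.out.println(commonPrefixLength(prev, block)); // 2 ("ap")
  }
}

Each term is then written as a VInt suffix length followed by the suffix bytes, which is exactly what the reader's scan loop above consumes.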

View File

@@ -89,7 +89,7 @@ public class PulsingCodec extends Codec {
// Terms dict
success = false;
try {
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, pulsingWriter);
success = true;
return ret;
} finally {
@@ -136,7 +136,6 @@
state.dir, state.fieldInfos, state.segmentInfo.name,
pulsingReader,
state.readBufferSize,
BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE,
state.codecId);
success = true;

View File

@@ -144,7 +144,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase {
//System.out.println(" count=" + count + " threshold=" + maxPositions);
if (count <= maxPositions) {
//System.out.println(" inlined");
//System.out.println(" inlined pos=" + termState.inlinedBytesReader.getPosition());
// Inlined into terms dict -- just read the byte[] blob in,
// but don't decode it now (we only decode when a DocsEnum

View File

@@ -23,7 +23,6 @@ import java.util.Set;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.FieldsProducer;
@@ -66,7 +65,7 @@ public class StandardCodec extends Codec {
success = false;
try {
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUnicodeComparator());
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, docs);
success = true;
return ret;
} finally {
@@ -109,7 +108,6 @@
state.segmentInfo.name,
postings,
state.readBufferSize,
BytesRef.getUTF8SortedAsUnicodeComparator(),
TERMS_CACHE_SIZE,
state.codecId);
success = true;

View File

@@ -126,7 +126,7 @@ public class MockFixedIntBlockCodec extends Codec {
success = false;
try {
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter);
success = true;
return ret;
} finally {
@@ -170,7 +170,6 @@
state.segmentInfo.name,
postingsReader,
state.readBufferSize,
BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE,
state.codecId);
success = true;

View File

@@ -150,7 +150,7 @@ public class MockVariableIntBlockCodec extends Codec {
success = false;
try {
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter);
success = true;
return ret;
} finally {
@@ -195,7 +195,6 @@
state.segmentInfo.name,
postingsReader,
state.readBufferSize,
BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE,
state.codecId);
success = true;

View File

@@ -205,7 +205,7 @@ public class MockRandomCodec extends Codec {
success = false;
try {
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter);
success = true;
return ret;
} finally {
@@ -306,7 +306,6 @@
state.segmentInfo.name,
postingsReader,
state.readBufferSize,
BytesRef.getUTF8SortedAsUnicodeComparator(),
termsCacheSize,
state.codecId);
success = true;

View File

@@ -70,7 +70,7 @@ public class MockSepCodec extends Codec {
success = false;
try {
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, postingsWriter);
success = true;
return ret;
} finally {
@@ -114,7 +114,6 @@
state.segmentInfo.name,
postingsReader,
state.readBufferSize,
BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE,
state.codecId);
success = true;

View File

@@ -496,139 +496,13 @@ public class TestExternalCodecs extends LuceneTestCase {
}
}
public static class MyCodecs extends CodecProvider {
MyCodecs() {
Codec ram = new RAMOnlyCodec();
register(ram);
setDefaultFieldCodec(ram.name);
}
}
// copied from PulsingCodec, just changing the terms
// comparator
private static class PulsingReverseTermsCodec extends Codec {
public PulsingReverseTermsCodec() {
name = "PulsingReverseTerms";
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase docsWriter = new StandardPostingsWriter(state);
// Terms that have <= freqCutoff number of docs are
// "pulsed" (inlined):
final int freqCutoff = 1;
PostingsWriterBase pulsingWriter = new PulsingPostingsWriterImpl(freqCutoff, docsWriter);
// Terms dict index
TermsIndexWriterBase indexWriter;
boolean success = false;
try {
indexWriter = new FixedGapTermsIndexWriter(state) {
// We sort in reverse unicode order, so, we must
// disable the suffix-stripping opto that
// FixedGapTermsIndexWriter does by default!
@Override
protected int indexedTermPrefixLength(BytesRef priorTerm, BytesRef indexedTerm) {
return indexedTerm.length;
}
};
success = true;
} finally {
if (!success) {
pulsingWriter.close();
}
}
// Terms dict
success = false;
try {
FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, pulsingWriter, reverseUnicodeComparator);
success = true;
return ret;
} finally {
if (!success) {
try {
pulsingWriter.close();
} finally {
indexWriter.close();
}
}
}
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase docsReader = new StandardPostingsReader(state.dir, state.segmentInfo, state.readBufferSize, state.codecId);
PostingsReaderBase pulsingReader = new PulsingPostingsReaderImpl(docsReader);
// Terms dict index reader
TermsIndexReaderBase indexReader;
boolean success = false;
try {
indexReader = new FixedGapTermsIndexReader(state.dir,
state.fieldInfos,
state.segmentInfo.name,
state.termsIndexDivisor,
reverseUnicodeComparator,
state.codecId);
success = true;
} finally {
if (!success) {
pulsingReader.close();
}
}
// Terms dict reader
success = false;
try {
FieldsProducer ret = new BlockTermsReader(indexReader,
state.dir,
state.fieldInfos,
state.segmentInfo.name,
pulsingReader,
state.readBufferSize,
reverseUnicodeComparator,
StandardCodec.TERMS_CACHE_SIZE,
state.codecId);
success = true;
return ret;
} finally {
if (!success) {
try {
pulsingReader.close();
} finally {
indexReader.close();
}
}
}
}
@Override
public void files(Directory dir, SegmentInfo segmentInfo, String codecId, Set<String> files) throws IOException {
StandardPostingsReader.files(dir, segmentInfo, codecId, files);
BlockTermsReader.files(dir, segmentInfo, codecId, files);
FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files);
}
@Override
public void getExtensions(Set<String> extensions) {
StandardCodec.getStandardExtensions(extensions);
}
}
// tests storing "id" and "field2" fields as pulsing codec,
// whose term sort is backwards unicode code point, and
// storing "field1" as a custom entirely-in-RAM codec
public void testPerFieldCodec() throws Exception {
CodecProvider provider = new MyCodecs();
Codec pulsing = new PulsingReverseTermsCodec();
provider.register(pulsing);
CodecProvider provider = new CoreCodecProvider();
provider.register(new RAMOnlyCodec());
provider.setDefaultFieldCodec("RamOnly");
final int NUM_DOCS = 173;
MockDirectoryWrapper dir = newDirectory();
@@ -645,11 +519,11 @@ public class TestExternalCodecs extends LuceneTestCase {
doc.add(newField("field1", "this field uses the standard codec as the test", Field.Store.NO, Field.Index.ANALYZED));
// uses pulsing codec:
Field field2 = newField("field2", "this field uses the pulsing codec as the test", Field.Store.NO, Field.Index.ANALYZED);
provider.setFieldCodec(field2.name(), pulsing.name);
provider.setFieldCodec(field2.name(), "Pulsing");
doc.add(field2);
Field idField = newField("id", "", Field.Store.NO, Field.Index.NOT_ANALYZED);
provider.setFieldCodec(idField.name(), pulsing.name);
provider.setFieldCodec(idField.name(), "Pulsing");
doc.add(idField);
for(int i=0;i<NUM_DOCS;i++) {
@@ -659,16 +533,13 @@ public class TestExternalCodecs extends LuceneTestCase {
w.commit();
}
}
if (VERBOSE) {
System.out.println("TEST: now delete id=77");
}
w.deleteDocuments(new Term("id", "77"));
IndexReader r = IndexReader.open(w, true);
IndexReader[] subs = r.getSequentialSubReaders();
// test each segment
for(int i=0;i<subs.length;i++) {
testTermsOrder(subs[i]);
}
// test each multi-reader
testTermsOrder(r);
assertEquals(NUM_DOCS-1, r.numDocs());
IndexSearcher s = newSearcher(r);
@@ -689,7 +560,6 @@ public class TestExternalCodecs extends LuceneTestCase {
assertEquals(0, s.search(new TermQuery(new Term("id", "77")), 1).totalHits);
assertEquals(0, s.search(new TermQuery(new Term("id", "44")), 1).totalHits);
testTermsOrder(r);
r.close();
s.close();
@@ -697,25 +567,4 @@ public class TestExternalCodecs extends LuceneTestCase {
dir.close();
}
private void testTermsOrder(IndexReader r) throws Exception {
// Verify sort order matches what my comparator said:
BytesRef lastBytesRef = null;
TermsEnum terms = MultiFields.getFields(r).terms("id").iterator();
//System.out.println("id terms:");
while(true) {
BytesRef t = terms.next();
if (t == null) {
break;
}
//System.out.println(" " + t);
if (lastBytesRef == null) {
lastBytesRef = new BytesRef(t);
} else {
assertTrue("terms in wrong order last=" + lastBytesRef.utf8ToString() + " current=" + t.utf8ToString(), reverseUnicodeComparator.compare(lastBytesRef, t) < 0);
lastBytesRef.copy(t);
}
}
}
}

View File

@@ -2784,6 +2784,9 @@ public class TestIndexWriter extends LuceneTestCase {
final String id = ""+i;
idField.setValue(id);
docs.put(id, doc);
if (VERBOSE) {
System.out.println("TEST: add doc id=" + id);
}
for(int field: fieldIDs) {
final String s;
@@ -2802,7 +2805,7 @@
if (rand.nextInt(5) == 3 && i > 0) {
final String delID = ""+rand.nextInt(i);
if (VERBOSE) {
System.out.println("TEST: delete doc " + delID);
System.out.println("TEST: delete doc id=" + delID);
}
w.deleteDocuments(new Term("id", delID));
docs.remove(delID);
@@ -2825,6 +2828,9 @@
for(int iter=0;iter<1000*RANDOM_MULTIPLIER;iter++) {
String testID = idsList[rand.nextInt(idsList.length)];
if (VERBOSE) {
System.out.println("TEST: test id=" + testID);
}
TopDocs hits = s.search(new TermQuery(new Term("id", testID)), 1);
assertEquals(1, hits.totalHits);
Document doc = r.document(hits.scoreDocs[0].doc);

View File

@@ -124,7 +124,11 @@ public class TestIndexWriterReader extends LuceneTestCase {
} else {
iwc.setMergePolicy(NoMergePolicy.COMPOUND_FILES);
}
if (VERBOSE) {
System.out.println("TEST: make index");
}
IndexWriter writer = new IndexWriter(dir1, iwc);
writer.setInfoStream(VERBOSE ? System.out : null);
// create the index
createIndexNoClose(!optimize, "index1", writer);
@@ -146,6 +150,9 @@
IndexReader r2 = writer.getReader();
assertTrue(r2.isCurrent());
assertEquals(0, count(new Term("id", id10), r2));
if (VERBOSE) {
System.out.println("TEST: verify id");
}
assertEquals(1, count(new Term("id", Integer.toString(8000)), r2));
r1.close();

View File

@@ -79,6 +79,9 @@ public class TestAutomatonQuery extends LuceneTestCase {
}
private int automatonQueryNrHits(AutomatonQuery query) throws IOException {
if (VERBOSE) {
System.out.println("TEST: run aq=" + query);
}
return searcher.search(query, 5).totalHits;
}