LUCENE-4819: move Sorted(set)DocValuesTermsEnum to codec

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1454968 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-03-11 00:59:10 +00:00
parent a0ef70931c
commit dd0855c4fe
13 changed files with 152 additions and 21 deletions

View File

@ -41,6 +41,11 @@ New Features
once as a keyword and once as an ordinary token allow stemmers to emit
a stemmed version along with the un-stemmed version. (Simon Willnauer)
Optimizations
* LUCENE-4819: Added Sorted[Set]DocValues.termsEnum(), and optimized the
default codec for improved enumeration performance. (Robert Muir)
======================= Lucene 4.2.0 =======================
Changes in backwards compatibility policy

View File

@ -32,9 +32,7 @@ import org.apache.lucene.index.MultiDocValues.OrdinalMap;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedDocValuesTermsEnum;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.SortedSetDocValuesTermsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
@ -269,7 +267,7 @@ public abstract class DocValuesConsumer implements Closeable {
SortedDocValues dv = dvs[sub];
Bits liveDocs = reader.getLiveDocs();
if (liveDocs == null) {
liveTerms[sub] = new SortedDocValuesTermsEnum(dv);
liveTerms[sub] = dv.termsEnum();
} else {
OpenBitSet bitset = new OpenBitSet(dv.getValueCount());
for (int i = 0; i < reader.maxDoc(); i++) {
@ -277,7 +275,7 @@ public abstract class DocValuesConsumer implements Closeable {
bitset.set(dv.getOrd(i));
}
}
liveTerms[sub] = new BitsFilteredTermsEnum(new SortedDocValuesTermsEnum(dv), bitset);
liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
}
}
@ -401,7 +399,7 @@ public abstract class DocValuesConsumer implements Closeable {
SortedSetDocValues dv = dvs[sub];
Bits liveDocs = reader.getLiveDocs();
if (liveDocs == null) {
liveTerms[sub] = new SortedSetDocValuesTermsEnum(dv);
liveTerms[sub] = dv.termsEnum();
} else {
OpenBitSet bitset = new OpenBitSet(dv.getValueCount());
for (int i = 0; i < reader.maxDoc(); i++) {
@ -413,7 +411,7 @@ public abstract class DocValuesConsumer implements Closeable {
}
}
}
liveTerms[sub] = new BitsFilteredTermsEnum(new SortedSetDocValuesTermsEnum(dv), bitset);
liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
}
}

View File

@ -18,6 +18,7 @@ package org.apache.lucene.codecs.lucene42;
*/
import java.io.IOException;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
@ -25,6 +26,8 @@ import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
@ -32,8 +35,10 @@ import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
@ -285,6 +290,11 @@ class Lucene42DocValuesProducer extends DocValuesProducer {
public int getValueCount() {
return (int)entry.numOrds;
}
@Override
public TermsEnum termsEnum() {
// Enumerate terms straight from the FST via FSTTermsEnum rather than using
// the inherited termsEnum() implementation.
return new FSTTermsEnum(fst);
}
};
}
@ -369,6 +379,11 @@ class Lucene42DocValuesProducer extends DocValuesProducer {
public long getValueCount() {
return entry.numOrds;
}
@Override
public TermsEnum termsEnum() {
// Enumerate terms straight from the FST via FSTTermsEnum rather than using
// the inherited termsEnum() implementation.
return new FSTTermsEnum(fst);
}
};
}
@ -396,4 +411,106 @@ class Lucene42DocValuesProducer extends DocValuesProducer {
long offset;
long numOrds;
}
/**
 * Exposes a {@link BytesRefFSTEnum} directly as a {@link TermsEnum}, which
 * avoids a binary-search based {@link #next()}.
 * <p>
 * The FST output of each term is used as its ordinal, so {@link #ord()} and
 * {@link #seekExact(long)} are supported. Postings-related operations
 * ({@link #docFreq()}, {@link #totalTermFreq()}, {@link #docs} and
 * {@link #docsAndPositions}) throw {@link UnsupportedOperationException}.
 */
static class FSTTermsEnum extends TermsEnum {
  /** Underlying FST enumerator; holds the current position of this enum. */
  final BytesRefFSTEnum<Long> in;

  // this is all for the complicated seek(ord)...
  // maybe we should add a FSTEnum that supports this operation?
  final FST<Long> fst;
  final FST.BytesReader bytesReader;
  final Arc<Long> firstArc = new Arc<Long>();
  final Arc<Long> scratchArc = new Arc<Long>();
  final IntsRef scratchInts = new IntsRef();
  final BytesRef scratchBytes = new BytesRef();

  /**
   * Creates an enum over all terms of the given FST.
   *
   * @param fst term dictionary mapping each term to its output (used as ord)
   */
  FSTTermsEnum(FST<Long> fst) {
    this.fst = fst;
    in = new BytesRefFSTEnum<Long>(fst);
    bytesReader = fst.getBytesReader();
  }

  /** Advances to the next term, or returns {@code null} when exhausted. */
  @Override
  public BytesRef next() throws IOException {
    InputOutput<Long> io = in.next();
    return io == null ? null : io.input;
  }

  @Override
  public Comparator<BytesRef> getComparator() {
    return BytesRef.getUTF8SortedAsUnicodeComparator();
  }

  /**
   * Seeks to the smallest term that is greater than or equal to {@code text}.
   *
   * @return {@link SeekStatus#FOUND} if the exact term exists,
   *         {@link SeekStatus#NOT_FOUND} if the enum is positioned on a
   *         larger term, or {@link SeekStatus#END} if no such term exists
   */
  @Override
  public SeekStatus seekCeil(BytesRef text, boolean useCache) throws IOException {
    if (in.seekCeil(text) == null) {
      // No term >= text: the enum is exhausted. This must be reported as END
      // (not NOT_FOUND), otherwise callers would go on to call term() on an
      // unpositioned enum.
      return SeekStatus.END;
    } else if (term().equals(text)) {
      // TODO: add SeekStatus to FSTEnum like in https://issues.apache.org/jira/browse/LUCENE-3729
      // to remove this comparison?
      return SeekStatus.FOUND;
    } else {
      return SeekStatus.NOT_FOUND;
    }
  }

  /** Seeks to exactly {@code text}; returns {@code true} only if it exists. */
  @Override
  public boolean seekExact(BytesRef text, boolean useCache) throws IOException {
    return in.seekExact(text) != null;
  }

  /**
   * Positions the enum on the term whose ordinal is {@code ord}, by first
   * resolving the ordinal back to its term bytes and then seeking to them.
   */
  @Override
  public void seekExact(long ord) throws IOException {
    // TODO: would be better to make this simpler and faster.
    // but we don't want to introduce a bug that corrupts our enum state!
    bytesReader.setPosition(0);
    fst.getFirstArc(firstArc);
    // NOTE(review): assumes ord is an ordinal that is present in the FST;
    // confirm what Util.getByOutput returns for an unknown output.
    IntsRef output = Util.getByOutput(fst, ord, bytesReader, firstArc, scratchArc, scratchInts);
    scratchBytes.bytes = new byte[output.length];
    scratchBytes.offset = 0;
    scratchBytes.length = 0;
    Util.toBytesRef(output, scratchBytes);
    // TODO: we could do this lazily, better to try to push into FSTEnum though?
    in.seekExact(scratchBytes);
  }

  /** Returns the term the enum is currently positioned on. */
  @Override
  public BytesRef term() throws IOException {
    return in.current().input;
  }

  /** Returns the ordinal of the current term, i.e. its FST output. */
  @Override
  public long ord() throws IOException {
    return in.current().output;
  }

  @Override
  public int docFreq() throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public long totalTermFreq() throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
    throw new UnsupportedOperationException();
  }
}
}

View File

@ -217,7 +217,7 @@ public class MultiDocValues {
} else {
TermsEnum enums[] = new TermsEnum[values.length];
for (int i = 0; i < values.length; i++) {
enums[i] = new SortedDocValuesTermsEnum(values[i]);
enums[i] = values[i].termsEnum();
}
OrdinalMap mapping = new OrdinalMap(r.getCoreCacheKey(), enums);
return new MultiSortedDocValues(values, starts, mapping);
@ -261,7 +261,7 @@ public class MultiDocValues {
} else {
TermsEnum enums[] = new TermsEnum[values.length];
for (int i = 0; i < values.length; i++) {
enums[i] = new SortedSetDocValuesTermsEnum(values[i]);
enums[i] = values[i].termsEnum();
}
OrdinalMap mapping = new OrdinalMap(r.getCoreCacheKey(), enums);
return new MultiSortedSetDocValues(values, starts, mapping);

View File

@ -114,4 +114,12 @@ public abstract class SortedDocValues extends BinaryDocValues {
return -(low + 1); // key not found.
}
/**
* Returns a {@link TermsEnum} over the values.
* The enum supports {@link TermsEnum#ord()} and {@link TermsEnum#seekExact(long)}.
* <p>
* This default implementation wraps this instance in a
* {@link SortedDocValuesTermsEnum}; subclasses may override it to return
* a more efficient enum.
*/
public TermsEnum termsEnum() {
return new SortedDocValuesTermsEnum(this);
}
}

View File

@ -26,7 +26,7 @@ import org.apache.lucene.util.BytesRef;
/** Implements a {@link TermsEnum} wrapping a provided
* {@link SortedDocValues}. */
public class SortedDocValuesTermsEnum extends TermsEnum {
class SortedDocValuesTermsEnum extends TermsEnum {
private final SortedDocValues values;
private int currentOrd = -1;
private final BytesRef term = new BytesRef();

View File

@ -117,4 +117,12 @@ public abstract class SortedSetDocValues {
return -(low + 1); // key not found.
}
/**
* Returns a {@link TermsEnum} over the values.
* The enum supports {@link TermsEnum#ord()} and {@link TermsEnum#seekExact(long)}.
* <p>
* This default implementation wraps this instance in a
* {@link SortedSetDocValuesTermsEnum}; subclasses may override it to return
* a more efficient enum.
*/
public TermsEnum termsEnum() {
return new SortedSetDocValuesTermsEnum(this);
}
}

View File

@ -26,7 +26,7 @@ import org.apache.lucene.util.BytesRef;
/** Implements a {@link TermsEnum} wrapping a provided
* {@link SortedSetDocValues}. */
public class SortedSetDocValuesTermsEnum extends TermsEnum {
class SortedSetDocValuesTermsEnum extends TermsEnum {
private final SortedSetDocValues values;
private long currentOrd = -1;
private final BytesRef term = new BytesRef();

View File

@ -23,7 +23,6 @@ import java.util.Comparator;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.SortedSetDocValuesTermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Bits;
@ -98,7 +97,7 @@ public final class DocTermOrdsRewriteMethod extends MultiTermQuery.RewriteMethod
@Override
public TermsEnum iterator(TermsEnum reuse) {
return new SortedSetDocValuesTermsEnum(docTermOrds);
return docTermOrds.termsEnum();
}
@Override

View File

@ -23,7 +23,6 @@ import java.util.Comparator;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedDocValuesTermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Bits;
@ -98,7 +97,7 @@ public final class FieldCacheRewriteMethod extends MultiTermQuery.RewriteMethod
@Override
public TermsEnum iterator(TermsEnum reuse) {
return new SortedDocValuesTermsEnum(fcsi);
return fcsi.termsEnum();
}
@Override

View File

@ -219,7 +219,7 @@ public class TestFieldCache extends LuceneTestCase {
int nTerms = termsIndex.getValueCount();
TermsEnum tenum = new SortedDocValuesTermsEnum(termsIndex);
TermsEnum tenum = termsIndex.termsEnum();
BytesRef val = new BytesRef();
for (int i=0; i<nTerms; i++) {
BytesRef val1 = tenum.next();

View File

@ -24,9 +24,7 @@ import java.util.List;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocTermOrds;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedDocValuesTermsEnum;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.SortedSetDocValuesTermsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.grouping.AbstractGroupFacetCollector;
@ -170,7 +168,7 @@ public abstract class TermGroupFacetCollector extends AbstractGroupFacetCollecto
@Override
protected SegmentResult createSegmentResult() throws IOException {
return new SegmentResult(segmentFacetCounts, segmentTotalCount, new SortedDocValuesTermsEnum(facetFieldTermsIndex), startFacetOrd, endFacetOrd);
return new SegmentResult(segmentFacetCounts, segmentTotalCount, facetFieldTermsIndex.termsEnum(), startFacetOrd, endFacetOrd);
}
private static class SegmentResult extends AbstractGroupFacetCollector.SegmentResult {
@ -289,7 +287,7 @@ public abstract class TermGroupFacetCollector extends AbstractGroupFacetCollecto
if (facetFieldNumTerms == 0) {
facetOrdTermsEnum = null;
} else {
facetOrdTermsEnum = new SortedSetDocValuesTermsEnum(facetFieldDocTermOrds);
facetOrdTermsEnum = facetFieldDocTermOrds.termsEnum();
}
// [facetFieldNumTerms() + 1] for all possible facet values and docs not containing facet field
segmentFacetCounts = new int[facetFieldNumTerms + 1];

View File

@ -23,7 +23,6 @@ import java.util.concurrent.*;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedDocValuesTermsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
@ -155,7 +154,7 @@ class PerSegmentSingleValuedFaceting {
seg.pos = seg.startTermIndex;
}
if (seg.pos < seg.endTermIndex) {
seg.tenum = new SortedDocValuesTermsEnum(seg.si);
seg.tenum = seg.si.termsEnum();
seg.tenum.seekExact(seg.pos);
seg.tempBR = seg.tenum.term();
queue.add(seg);