From b25cc528d9e00a202507a9d6300cf10e4089f648 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 12 Mar 2013 02:17:27 +0000 Subject: [PATCH] LUCENE-4819: move Sorted(set)DocValuesTermsEnum to codec git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1455391 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 10 ++ .../lucene/codecs/DocValuesConsumer.java | 10 +- .../lucene42/Lucene42DocValuesProducer.java | 117 +++++++++++++++ .../apache/lucene/index/MultiDocValues.java | 4 +- .../apache/lucene/index/SortedDocValues.java | 8 + .../index/SortedDocValuesTermsEnum.java | 8 +- .../lucene/index/SortedSetDocValues.java | 8 + .../index/SortedSetDocValuesTermsEnum.java | 8 +- .../search/DocTermOrdsRewriteMethod.java | 3 +- .../search/FieldCacheRewriteMethod.java | 3 +- .../apache/lucene/search/TestFieldCache.java | 2 +- .../term/TermGroupFacetCollector.java | 6 +- .../index/BaseDocValuesFormatTestCase.java | 137 ++++++++++++++++++ .../PerSegmentSingleValuedFaceting.java | 3 +- 14 files changed, 306 insertions(+), 21 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index cf025575a41..ff615c07885 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -55,6 +55,16 @@ New Features query "i " will no longer suggest "Isla de Muerta" for example. (Mike McCandless) +Optimizations + +* LUCENE-4819: Added Sorted[Set]DocValues.termsEnum(), and optimized the + default codec for improved enumeration performance. (Robert Muir) + +Bug Fixes + +* LUCENE-4819: seekExact(BytesRef, boolean) did not work correctly with + Sorted[Set]DocValuesTermsEnum. 
(Robert Muir) + ======================= Lucene 4.2.0 ======================= Changes in backwards compatibility policy diff --git a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java index 1db957b2c60..921b94dff69 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/DocValuesConsumer.java @@ -32,9 +32,7 @@ import org.apache.lucene.index.MultiDocValues.OrdinalMap; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.SortedDocValues; -import org.apache.lucene.index.SortedDocValuesTermsEnum; import org.apache.lucene.index.SortedSetDocValues; -import org.apache.lucene.index.SortedSetDocValuesTermsEnum; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; @@ -269,7 +267,7 @@ public abstract class DocValuesConsumer implements Closeable { SortedDocValues dv = dvs[sub]; Bits liveDocs = reader.getLiveDocs(); if (liveDocs == null) { - liveTerms[sub] = new SortedDocValuesTermsEnum(dv); + liveTerms[sub] = dv.termsEnum(); } else { OpenBitSet bitset = new OpenBitSet(dv.getValueCount()); for (int i = 0; i < reader.maxDoc(); i++) { @@ -277,7 +275,7 @@ public abstract class DocValuesConsumer implements Closeable { bitset.set(dv.getOrd(i)); } } - liveTerms[sub] = new BitsFilteredTermsEnum(new SortedDocValuesTermsEnum(dv), bitset); + liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset); } } @@ -401,7 +399,7 @@ public abstract class DocValuesConsumer implements Closeable { SortedSetDocValues dv = dvs[sub]; Bits liveDocs = reader.getLiveDocs(); if (liveDocs == null) { - liveTerms[sub] = new SortedSetDocValuesTermsEnum(dv); + liveTerms[sub] = dv.termsEnum(); } else { OpenBitSet bitset = new OpenBitSet(dv.getValueCount()); for (int i = 0; i < reader.maxDoc(); i++) { 
@@ -413,7 +411,7 @@ public abstract class DocValuesConsumer implements Closeable { } } } - liveTerms[sub] = new BitsFilteredTermsEnum(new SortedSetDocValuesTermsEnum(dv), bitset); + liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset); } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java index a9463b8e58e..6b13ab499ce 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesProducer.java @@ -18,6 +18,7 @@ package org.apache.lucene.codecs.lucene42; */ import java.io.IOException; +import java.util.Comparator; import java.util.HashMap; import java.util.Map; @@ -25,6 +26,8 @@ import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexFileNames; @@ -32,8 +35,10 @@ import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; @@ -285,6 +290,11 @@ class Lucene42DocValuesProducer extends DocValuesProducer { public int getValueCount() { return (int)entry.numOrds; } + + @Override + public TermsEnum termsEnum() { + return new 
FSTTermsEnum(fst); + } }; } @@ -369,6 +379,11 @@ class Lucene42DocValuesProducer extends DocValuesProducer { public long getValueCount() { return entry.numOrds; } + + @Override + public TermsEnum termsEnum() { + return new FSTTermsEnum(fst); + } }; } @@ -396,4 +411,106 @@ class Lucene42DocValuesProducer extends DocValuesProducer { long offset; long numOrds; } + + // exposes FSTEnum directly as a TermsEnum: avoids binary-search next() + static class FSTTermsEnum extends TermsEnum { + final BytesRefFSTEnum in; + + // this is all for the complicated seek(ord)... + // maybe we should add a FSTEnum that supports this operation? + final FST fst; + final FST.BytesReader bytesReader; + final Arc firstArc = new Arc(); + final Arc scratchArc = new Arc(); + final IntsRef scratchInts = new IntsRef(); + final BytesRef scratchBytes = new BytesRef(); + + FSTTermsEnum(FST fst) { + this.fst = fst; + in = new BytesRefFSTEnum(fst); + bytesReader = fst.getBytesReader(); + } + + @Override + public BytesRef next() throws IOException { + InputOutput io = in.next(); + if (io == null) { + return null; + } else { + return io.input; + } + } + + @Override + public Comparator getComparator() { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + + @Override + public SeekStatus seekCeil(BytesRef text, boolean useCache) throws IOException { + if (in.seekCeil(text) == null) { + return SeekStatus.END; + } else if (term().equals(text)) { + // TODO: add SeekStatus to FSTEnum like in https://issues.apache.org/jira/browse/LUCENE-3729 + // to remove this comparison? + return SeekStatus.FOUND; + } else { + return SeekStatus.NOT_FOUND; + } + } + + @Override + public boolean seekExact(BytesRef text, boolean useCache) throws IOException { + if (in.seekExact(text) == null) { + return false; + } else { + return true; + } + } + + @Override + public void seekExact(long ord) throws IOException { + // TODO: would be better to make this simpler and faster. 
+ // but we don't want to introduce a bug that corrupts our enum state! + bytesReader.setPosition(0); + fst.getFirstArc(firstArc); + IntsRef output = Util.getByOutput(fst, ord, bytesReader, firstArc, scratchArc, scratchInts); + scratchBytes.bytes = new byte[output.length]; + scratchBytes.offset = 0; + scratchBytes.length = 0; + Util.toBytesRef(output, scratchBytes); + // TODO: we could do this lazily, better to try to push into FSTEnum though? + in.seekExact(scratchBytes); + } + + @Override + public BytesRef term() throws IOException { + return in.current().input; + } + + @Override + public long ord() throws IOException { + return in.current().output; + } + + @Override + public int docFreq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long totalTermFreq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException { + throw new UnsupportedOperationException(); + } + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java index 5a9c76333c6..adc749ab6bb 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiDocValues.java @@ -217,7 +217,7 @@ public class MultiDocValues { } else { TermsEnum enums[] = new TermsEnum[values.length]; for (int i = 0; i < values.length; i++) { - enums[i] = new SortedDocValuesTermsEnum(values[i]); + enums[i] = values[i].termsEnum(); } OrdinalMap mapping = new OrdinalMap(r.getCoreCacheKey(), enums); return new MultiSortedDocValues(values, starts, mapping); @@ -261,7 +261,7 @@ public class MultiDocValues { } else { TermsEnum enums[] = new 
TermsEnum[values.length]; for (int i = 0; i < values.length; i++) { - enums[i] = new SortedSetDocValuesTermsEnum(values[i]); + enums[i] = values[i].termsEnum(); } OrdinalMap mapping = new OrdinalMap(r.getCoreCacheKey(), enums); return new MultiSortedSetDocValues(values, starts, mapping); diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java index ebdab6e8268..c7dae5b3dd9 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java @@ -114,4 +114,12 @@ public abstract class SortedDocValues extends BinaryDocValues { return -(low + 1); // key not found. } + + /** + * Returns a {@link TermsEnum} over the values. + * The enum supports {@link TermsEnum#ord()} and {@link TermsEnum#seekExact(long)}. + */ + public TermsEnum termsEnum() { + return new SortedDocValuesTermsEnum(this); + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesTermsEnum.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesTermsEnum.java index 6cb92a1a92f..c30ea86d2ca 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesTermsEnum.java @@ -26,7 +26,7 @@ import org.apache.lucene.util.BytesRef; /** Implements a {@link TermsEnum} wrapping a provided * {@link SortedDocValues}. */ -public class SortedDocValuesTermsEnum extends TermsEnum { +class SortedDocValuesTermsEnum extends TermsEnum { private final SortedDocValues values; private int currentOrd = -1; private final BytesRef term = new BytesRef(); @@ -64,6 +64,12 @@ public class SortedDocValuesTermsEnum extends TermsEnum { public boolean seekExact(BytesRef text, boolean useCache) throws IOException { int ord = values.lookupTerm(text); if (ord >= 0) { + term.offset = 0; + // TODO: is there a cleaner way? 
+ // term.bytes may be pointing to codec-private byte[] + // storage, so we must force new byte[] allocation: + term.bytes = new byte[text.length]; + term.copyBytes(text); currentOrd = ord; return true; } else { diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValues.java b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValues.java index ce10caa940a..77e6e165b0e 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValues.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValues.java @@ -117,4 +117,12 @@ public abstract class SortedSetDocValues { return -(low + 1); // key not found. } + + /** + * Returns a {@link TermsEnum} over the values. + * The enum supports {@link TermsEnum#ord()} and {@link TermsEnum#seekExact(long)}. + */ + public TermsEnum termsEnum() { + return new SortedSetDocValuesTermsEnum(this); + } } diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesTermsEnum.java b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesTermsEnum.java index 589bc103649..a9ceac950bb 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesTermsEnum.java @@ -26,7 +26,7 @@ import org.apache.lucene.util.BytesRef; /** Implements a {@link TermsEnum} wrapping a provided * {@link SortedSetDocValues}. */ -public class SortedSetDocValuesTermsEnum extends TermsEnum { +class SortedSetDocValuesTermsEnum extends TermsEnum { private final SortedSetDocValues values; private long currentOrd = -1; private final BytesRef term = new BytesRef(); @@ -64,6 +64,12 @@ public class SortedSetDocValuesTermsEnum extends TermsEnum { public boolean seekExact(BytesRef text, boolean useCache) throws IOException { long ord = values.lookupTerm(text); if (ord >= 0) { + term.offset = 0; + // TODO: is there a cleaner way? 
+ // term.bytes may be pointing to codec-private byte[] + // storage, so we must force new byte[] allocation: + term.bytes = new byte[text.length]; + term.copyBytes(text); currentOrd = ord; return true; } else { diff --git a/lucene/core/src/java/org/apache/lucene/search/DocTermOrdsRewriteMethod.java b/lucene/core/src/java/org/apache/lucene/search/DocTermOrdsRewriteMethod.java index d25318402e5..f21834d48be 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DocTermOrdsRewriteMethod.java +++ b/lucene/core/src/java/org/apache/lucene/search/DocTermOrdsRewriteMethod.java @@ -23,7 +23,6 @@ import java.util.Comparator; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.SortedSetDocValues; -import org.apache.lucene.index.SortedSetDocValuesTermsEnum; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.Bits; @@ -98,7 +97,7 @@ public final class DocTermOrdsRewriteMethod extends MultiTermQuery.RewriteMethod @Override public TermsEnum iterator(TermsEnum reuse) { - return new SortedSetDocValuesTermsEnum(docTermOrds); + return docTermOrds.termsEnum(); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java b/lucene/core/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java index 84f6092a3db..bcb79239a62 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java +++ b/lucene/core/src/java/org/apache/lucene/search/FieldCacheRewriteMethod.java @@ -23,7 +23,6 @@ import java.util.Comparator; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.SortedDocValues; -import org.apache.lucene.index.SortedDocValuesTermsEnum; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.Bits; @@ -98,7 +97,7 @@ public final class FieldCacheRewriteMethod 
extends MultiTermQuery.RewriteMethod @Override public TermsEnum iterator(TermsEnum reuse) { - return new SortedDocValuesTermsEnum(fcsi); + return fcsi.termsEnum(); } @Override diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java b/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java index b5f3d73f32c..c2ed9a0088a 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java @@ -219,7 +219,7 @@ public class TestFieldCache extends LuceneTestCase { int nTerms = termsIndex.getValueCount(); - TermsEnum tenum = new SortedDocValuesTermsEnum(termsIndex); + TermsEnum tenum = termsIndex.termsEnum(); BytesRef val = new BytesRef(); for (int i=0; i